# Natural Language Toolkit: Tagset Mapping
#
# Copyright (C) 2001-2017 NLTK Project
# Author: Nathan Schneider
#         Steven Bird
# URL:
# For license information, see LICENSE.TXT

"""
Interface for converting POS tags from various treebanks
to the universal tagset of Petrov, Das, & McDonald.

The tagset consists of the following 12 coarse tags:

VERB - verbs (all tenses and modes)
NOUN - nouns (common and proper)
PRON - pronouns
ADJ - adjectives
ADV - adverbs
ADP - adpositions (prepositions and postpositions)
CONJ - conjunctions
DET - determiners
NUM - cardinal numbers
PRT - particles or other function words
X - other: foreign words, typos, abbreviations
. - punctuation

@see: http://arxiv.org/abs/1104.2086 and http://code.google.com/p/universal-pos-tags/

"""

from __future__ import print_function, unicode_literals, division
from collections import defaultdict
from os.path import join

from nltk.data import load

# Resource directory (relative to the NLTK data path) holding the '<fileid>.map' files.
_UNIVERSAL_DATA = "taggers/universal_tagset"

# The only coarse tags a '.map' file is allowed to map to.
_UNIVERSAL_TAGS = ('VERB', 'NOUN', 'PRON', 'ADJ', 'ADV', 'ADP', 'CONJ', 'DET', 'NUM', 'PRT', 'X', '.')

# Cache of loaded mappings, indexed as _MAPPINGS[source][target][source_tag].
# The mapping between tagset T1 and T2 returns 'UNK' if applied to an
# unrecognized tag (mappings to the universal tagset use 'X' instead, see
# _load_universal_map).
_MAPPINGS = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 'UNK')))


def _load_universal_map(fileid):
    """
    Load the '<fileid>.map' resource and cache it as the mapping from the
    tagset ``fileid`` to the universal tagset.

    Each non-blank line of the file must consist of a fine-grained tag and a
    universal tag separated by a single tab.

    The mapping is assembled locally and stored in ``_MAPPINGS`` only after
    the whole file has been parsed, so a failure part-way through never
    leaves a truncated mapping in the cache.

    :param fileid: name of the source tagset, e.g. 'en-ptb'
    :raise AssertionError: if a coarse tag is not one of ``_UNIVERSAL_TAGS``
        or a fine-grained tag is listed more than once
    :raise ValueError: if a non-blank line does not split into exactly two
        tab-separated fields
    """
    contents = load(join(_UNIVERSAL_DATA, fileid + '.map'), format="text")

    # When mapping to the Universal Tagset,
    # map unknown inputs to 'X' not 'UNK'
    universal = defaultdict(lambda: 'X')

    for line in contents.splitlines():
        line = line.strip()
        if not line:
            continue
        fine, coarse = line.split('\t')

        assert coarse in _UNIVERSAL_TAGS, 'Unexpected coarse tag: {}'.format(coarse)
        assert fine not in universal, 'Multiple entries for original tag: {}'.format(fine)

        universal[fine] = coarse

    # Publish only a completely parsed mapping.
    _MAPPINGS[fileid]['universal'] = universal


def tagset_mapping(source, target):
    """
    Retrieve the mapping dictionary between tagsets.

    The returned object is the cached dictionary itself, not a copy.  When
    ``target`` is 'universal' and no mapping for ``source`` is cached yet,
    it is first loaded from the corresponding '.map' resource.

    :param source: name of the source tagset, e.g. 'ru-rnc'
    :param target: name of the target tagset, e.g. 'universal'
    :return: dictionary from tags of ``source`` to tags of ``target``

    >>> tagset_mapping('ru-rnc', 'universal') == {
    ...     '!': '.', 'A': 'ADJ', 'C': 'CONJ', 'AD': 'ADV',
    ...     'NN': 'NOUN', 'VG': 'VERB', 'COMP': 'CONJ', 'NC': 'NUM', 'VP': 'VERB', 'P': 'ADP',
    ...     'IJ': 'X', 'V': 'VERB', 'Z': 'X', 'VI': 'VERB', 'YES_NO_SENT': 'X', 'PTCL': 'PRT'}
    True
    """
    if source not in _MAPPINGS or target not in _MAPPINGS[source]:
        if target == 'universal':
            _load_universal_map(source)
    return _MAPPINGS[source][target]


def map_tag(source, target, source_tag):
    """
    Maps the tag from the source tagset to the target tagset.

    A tag that is missing from the mapping yields the mapping's fallback
    value ('X' for mappings loaded for the universal tagset, 'UNK' for the
    default mappings).  The lookup does not add the unknown tag to the
    cached mapping.

    :param source: name of the source tagset ('wsj' and 'brown' are accepted
        as aliases of 'en-ptb' and 'en-brown' when mapping to 'universal')
    :param target: name of the target tagset
    :param source_tag: the tag to convert
    :return: the corresponding tag in the target tagset

    >>> map_tag('en-ptb', 'universal', 'VBZ')
    'VERB'
    >>> map_tag('en-ptb', 'universal', 'VBP')
    'VERB'
    >>> map_tag('en-ptb', 'universal', '``')
    '.'
    """
    # we need a systematic approach to naming
    if target == 'universal':
        if source == 'wsj':
            source = 'en-ptb'
        if source == 'brown':
            source = 'en-brown'

    mapping = tagset_mapping(source, target)
    if source_tag in mapping:
        return mapping[source_tag]

    # Indexing a defaultdict with a missing key would store that key in the
    # shared cache; call the fallback factory directly instead so the
    # mapping keeps only the tags that were actually defined for it.
    fallback = getattr(mapping, 'default_factory', None)
    if fallback is None:
        raise KeyError(source_tag)
    return fallback()