# encoding: utf-8
# Natural Language Toolkit: Senna POS Tagger
#
# Copyright (C) 2001-2017 NLTK Project
# Author: Rami Al-Rfou'
# URL:
# For license information, see LICENSE.TXT

"""
Senna POS tagger, NER Tagger, Chunk Tagger

The input is:

- path to the directory that contains SENNA executables. If the path is
  incorrect, SennaTagger will automatically search for executable file
  specified in SENNA environment variable
- (optionally) the encoding of the input data (default:utf-8)

Note: Unit tests for this module can be found in test/unit/test_senna.py

    >>> from nltk.tag import SennaTagger
    >>> tagger = SennaTagger('/usr/share/senna-v3.0')
    >>> tagger.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP
    [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow', 'NN'), ('?', '.')]

    >>> from nltk.tag import SennaChunkTagger
    >>> chktagger = SennaChunkTagger('/usr/share/senna-v3.0')
    >>> chktagger.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP
    [('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed', 'I-NP'), ('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'), ('swallow', 'I-NP'), ('?', 'O')]

    >>> from nltk.tag import SennaNERTagger
    >>> nertagger = SennaNERTagger('/usr/share/senna-v3.0')
    >>> nertagger.tag('Shakespeare theatre was in London .'.split()) # doctest: +SKIP
    [('Shakespeare', 'B-PER'), ('theatre', 'O'), ('was', 'O'), ('in', 'O'), ('London', 'B-LOC'), ('.', 'O')]
    >>> nertagger.tag('UN headquarters are in NY , USA .'.split()) # doctest: +SKIP
    [('UN', 'B-ORG'), ('headquarters', 'O'), ('are', 'O'), ('in', 'O'), ('NY', 'B-LOC'), (',', 'O'), ('USA', 'B-LOC'), ('.', 'O')]
"""

from nltk.compat import python_2_unicode_compatible
from nltk.classify import Senna


def _select_annotation(tagged_sents, key):
    """
    Replace every annotation mapping in ``tagged_sents`` with a
    ``(word, tag)`` tuple, where ``tag`` is the value stored under ``key``.

    The nested lists are rewritten in place and the same outer list is
    returned, so callers keep receiving the object produced by
    ``Senna.tag_sents``.

    :param tagged_sents: One list per sentence; each item is a mapping
        holding at least the keys ``'word'`` and ``key``.
    :type tagged_sents: list(list(dict))
    :param key: Name of the annotation to keep, e.g. ``'pos'``.
    :type key: str
    :return: ``tagged_sents``, now containing ``(word, tag)`` tuples.
    :rtype: list(list(tuple(str, str)))
    """
    for sent in tagged_sents:
        # Only existing slots are reassigned (the list length never changes),
        # so mutating while enumerating is safe.
        for j, annotations in enumerate(sent):
            sent[j] = (annotations['word'], annotations[key])
    return tagged_sents


@python_2_unicode_compatible
class SennaTagger(Senna):
    """Part-of-speech tagger: requests the ``'pos'`` operation from Senna."""

    def __init__(self, path, encoding='utf-8'):
        super(SennaTagger, self).__init__(path, ['pos'], encoding)

    def tag_sents(self, sentences):
        """
        Applies the tag method over a list of sentences. This method will
        return for each sentence a list of tuples of (word, tag).
        """
        tagged_sents = super(SennaTagger, self).tag_sents(sentences)
        return _select_annotation(tagged_sents, 'pos')


@python_2_unicode_compatible
class SennaChunkTagger(Senna):
    """Chunk tagger: requests the ``'chk'`` operation from Senna."""

    def __init__(self, path, encoding='utf-8'):
        super(SennaChunkTagger, self).__init__(path, ['chk'], encoding)

    def tag_sents(self, sentences):
        """
        Applies the tag method over a list of sentences. This method will
        return for each sentence a list of tuples of (word, tag).
        """
        tagged_sents = super(SennaChunkTagger, self).tag_sents(sentences)
        return _select_annotation(tagged_sents, 'chk')

    def bio_to_chunks(self, tagged_sent, chunk_type):
        """
        Extracts the chunks in a BIO chunk-tagged sentence.

        A chunk ends when a token without a matching tag is seen, or when
        a matching ``B-`` tag opens the next chunk, so two adjacent chunks
        of the same type are reported separately rather than merged.

            >>> from nltk.tag import SennaChunkTagger
            >>> chktagger = SennaChunkTagger('/usr/share/senna-v3.0')
            >>> sent = 'What is the airspeed of an unladen swallow ?'.split()
            >>> tagged_sent = chktagger.tag(sent) # doctest: +SKIP
            >>> tagged_sent # doctest: +SKIP
            [('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed', 'I-NP'), ('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'), ('swallow', 'I-NP'), ('?', 'O')]
            >>> list(chktagger.bio_to_chunks(tagged_sent, chunk_type='NP')) # doctest: +SKIP
            [('What', '0'), ('the airspeed', '2-3'), ('an unladen swallow', '5-6-7')]

        :param tagged_sent: A list of tuples of word and BIO chunk tag.
        :type tagged_sent: list(tuple)
        :param chunk_type: The chunk tag that users want to extract,
            e.g. 'NP' or 'VP'
        :type chunk_type: str

        :return: An iterable of tuples of chunks that users want to extract
            and their corresponding indices.
        :rtype: iter(tuple(str))
        """
        current_chunk = []
        current_chunk_position = []
        for idx, (word, pos) in enumerate(tagged_sent):
            in_chunk = '-' + chunk_type in pos
            # Flush the open chunk when leaving the chunk type, and also
            # when a new chunk of the same type begins right after it.
            if current_chunk and (not in_chunk or pos.startswith('B-')):
                yield (' '.join(current_chunk),
                       '-'.join(map(str, current_chunk_position)))
                current_chunk = []
                current_chunk_position = []
            if in_chunk:
                # Append the word to the current_chunk.
                current_chunk.append(word)
                current_chunk_position.append(idx)
        if current_chunk:
            # Flush the last chunk.
            yield (' '.join(current_chunk),
                   '-'.join(map(str, current_chunk_position)))


@python_2_unicode_compatible
class SennaNERTagger(Senna):
    """Named-entity tagger: requests the ``'ner'`` operation from Senna."""

    def __init__(self, path, encoding='utf-8'):
        super(SennaNERTagger, self).__init__(path, ['ner'], encoding)

    def tag_sents(self, sentences):
        """
        Applies the tag method over a list of sentences. This method will
        return for each sentence a list of tuples of (word, tag).
        """
        tagged_sents = super(SennaNERTagger, self).tag_sents(sentences)
        return _select_annotation(tagged_sents, 'ner')


# skip doctests if Senna is not installed
def setup_module(module):
    """
    Skip this module's doctests when constructing ``Senna`` with the
    default install path raises ``OSError``.
    """
    from nose import SkipTest

    try:
        # Constructed only to probe for the executable; the instance itself
        # is not needed.
        Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner'])
    except OSError:
        raise SkipTest("Senna executable not found")