# Natural Language Toolkit: Interface to BLLIP Parser
#
# Author: David McClosky
#
# Copyright (C) 2001-2017 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

from __future__ import print_function

from nltk.parse.api import ParserI
from nltk.tree import Tree

# NOTE(review): this string literal follows the imports, so Python does not
# bind it to ``__doc__``. It is kept in its original position so that module
# attributes stay exactly as they were.
"""
Interface for parsing with BLLIP Parser. Requires the Python
bllipparser module. BllipParser objects can be constructed with the
``BllipParser.from_unified_model_dir`` class method or manually using the
``BllipParser`` constructor. The former is generally easier if you have
a BLLIP Parser unified model directory -- a basic model can be obtained
from NLTK's downloader. More unified parsing models can be obtained with
BLLIP Parser's ModelFetcher (run ``python -m bllipparser.ModelFetcher``
or see docs for ``bllipparser.ModelFetcher.download_and_install_model``).

Basic usage::

    # download and install a basic unified parsing model (Wall Street Journal)
    # sudo python -m nltk.downloader bllip_wsj_no_aux

    >>> from nltk.data import find
    >>> model_dir = find('models/bllip_wsj_no_aux').path
    >>> bllip = BllipParser.from_unified_model_dir(model_dir)

    # 1-best parsing
    >>> sentence1 = 'British left waffles on Falklands .'.split()
    >>> top_parse = bllip.parse_one(sentence1)
    >>> print(top_parse)
    (S1
      (S
        (NP (JJ British) (NN left))
        (VP (VBZ waffles) (PP (IN on) (NP (NNP Falklands))))
        (. .)))

    # n-best parsing
    >>> sentence2 = 'Time flies'.split()
    >>> all_parses = bllip.parse_all(sentence2)
    >>> print(len(all_parses))
    50
    >>> print(all_parses[0])
    (S1 (S (NP (NNP Time)) (VP (VBZ flies))))

    # incorporating external tagging constraints (None means unconstrained tag)
    >>> constrained1 = bllip.tagged_parse([('Time', 'VB'), ('flies', 'NNS')])
    >>> print(next(constrained1))
    (S1 (NP (VB Time) (NNS flies)))
    >>> constrained2 = bllip.tagged_parse([('Time', 'NN'), ('flies', None)])
    >>> print(next(constrained2))
    (S1 (NP (NN Time) (VBZ flies)))

References
----------

- Charniak, Eugene. "A maximum-entropy-inspired parser." Proceedings of
  the 1st North American chapter of the Association for Computational
  Linguistics conference. Association for Computational Linguistics,
  2000.

- Charniak, Eugene, and Mark Johnson. "Coarse-to-fine n-best parsing
  and MaxEnt discriminative reranking." Proceedings of the 43rd Annual
  Meeting on Association for Computational Linguistics. Association
  for Computational Linguistics, 2005.

Known issues
------------

Note that BLLIP Parser is not currently threadsafe. Since this module
uses a SWIG interface, it is potentially unsafe to create multiple
``BllipParser`` objects in the same process. BLLIP Parser currently
has issues with non-ASCII text and will raise an error if given any.

See http://pypi.python.org/pypi/bllipparser/ for more information
on BLLIP Parser's Python interface.
"""

__all__ = ['BllipParser']

# this block allows this module to be imported even if bllipparser isn't
# available
try:
    from bllipparser import RerankingParser
    from bllipparser.RerankingParser import get_unified_model_parameters

    def _ensure_bllip_import_or_error():
        """No-op: the bllipparser import succeeded."""
        pass
except ImportError as ie:
    # The exception is captured as a default argument because Python 3
    # unbinds the ``as`` name when the ``except`` clause finishes.
    def _ensure_bllip_import_or_error(ie=ie):
        """Re-raise the import failure recorded when this module loaded."""
        raise ImportError("Couldn't import bllipparser module: %s" % ie)


def _ensure_ascii(words):
    """
    Check that every token in ``words`` contains only ASCII characters.

    Byte strings are checked by decoding and text strings by encoding,
    so the check works for ``str`` on both Python 2 and Python 3 as
    well as for Python 2 ``unicode`` tokens.

    :param words: The tokens to check
    :type words: iter(str)
    :raise ValueError: if a token contains a non-ASCII character; the
        message names the index and value of the first such token.
    """
    for i, word in enumerate(words):
        try:
            if isinstance(word, bytes):
                word.decode('ascii')
            else:
                word.encode('ascii')
        except UnicodeError:
            # UnicodeError covers both UnicodeDecodeError (bytes input)
            # and UnicodeEncodeError (text input).
            raise ValueError("Token %d (%r) is non-ASCII. BLLIP Parser "
                             "currently doesn't support non-ASCII inputs." %
                             (i, word))


def _scored_parse_to_nltk_tree(scored_parse):
    """Build an NLTK ``Tree`` from the string form of ``scored_parse.ptb_parse``."""
    return Tree.fromstring(str(scored_parse.ptb_parse))


class BllipParser(ParserI):
    """
    Interface for parsing with BLLIP Parser. BllipParser objects can be
    constructed with the ``BllipParser.from_unified_model_dir`` class
    method or manually using the ``BllipParser`` constructor.
    """
    def __init__(self, parser_model=None, reranker_features=None,
                 reranker_weights=None, parser_options=None,
                 reranker_options=None):
        """
        Load a BLLIP Parser model from scratch. You'll typically want to
        use the ``from_unified_model_dir()`` class method to construct
        this object.

        The reranker model is only loaded when both ``reranker_features``
        and ``reranker_weights`` are given.

        :param parser_model: Path to parser model directory
        :type parser_model: str

        :param reranker_features: Path to the reranker model's features file
        :type reranker_features: str

        :param reranker_weights: Path to the reranker model's weights file
        :type reranker_weights: str

        :param parser_options: optional dictionary of parser options,
            passed as keyword arguments to
            ``bllipparser.RerankingParser.RerankingParser.load_parser_model()``.
        :type parser_options: dict(str)

        :param reranker_options: optional dictionary of reranker options,
            passed as keyword arguments to
            ``bllipparser.RerankingParser.RerankingParser.load_reranker_model()``.
        :type reranker_options: dict(str)

        :raise ImportError: if the bllipparser module could not be imported
        """
        _ensure_bllip_import_or_error()

        parser_options = parser_options or {}
        reranker_options = reranker_options or {}

        self.rrp = RerankingParser()
        self.rrp.load_parser_model(parser_model, **parser_options)
        if reranker_features and reranker_weights:
            self.rrp.load_reranker_model(features_filename=reranker_features,
                                         weights_filename=reranker_weights,
                                         **reranker_options)

    def parse(self, sentence):
        """
        Use BLLIP Parser to parse a sentence. Takes a sentence as a list
        of words; it will be automatically tagged with this BLLIP Parser
        instance's tagger.

        This method is a generator, so the ASCII check and the parse
        itself only run once the returned iterator is first advanced.

        :return: An iterator that generates parse trees for the sentence
            from most likely to least likely.

        :param sentence: The sentence to be parsed
        :type sentence: list(str)
        :rtype: iter(Tree)
        :raise ValueError: if any word is non-ASCII
        """
        _ensure_ascii(sentence)
        nbest_list = self.rrp.parse(sentence)
        for scored_parse in nbest_list:
            yield _scored_parse_to_nltk_tree(scored_parse)

    def tagged_parse(self, word_and_tag_pairs):
        """
        Use BLLIP to parse a sentence. Takes a sentence as a list of
        (word, tag) tuples; the sentence must have already been tokenized
        and tagged. BLLIP will attempt to use the tags provided but may
        use others if it can't come up with a complete parse subject
        to those constraints. You may also specify a tag as ``None``
        to leave a token's tag unconstrained.

        This method is a generator, so the ASCII check and the parse
        itself only run once the returned iterator is first advanced.

        :return: An iterator that generates parse trees for the sentence
            from most likely to least likely.

        :param word_and_tag_pairs: Input sentence to parse as (word, tag) pairs
        :type word_and_tag_pairs: list(tuple(str, str))
        :rtype: iter(Tree)
        :raise ValueError: if any word is non-ASCII
        """
        words = []
        # maps token index -> required tag; unconstrained tokens are omitted
        tag_map = {}
        for i, (word, tag) in enumerate(word_and_tag_pairs):
            words.append(word)
            if tag is not None:
                tag_map[i] = tag

        _ensure_ascii(words)
        nbest_list = self.rrp.parse_tagged(words, tag_map)
        for scored_parse in nbest_list:
            yield _scored_parse_to_nltk_tree(scored_parse)

    @classmethod
    def from_unified_model_dir(cls, model_dir, parser_options=None,
                               reranker_options=None):
        """
        Create a ``BllipParser`` object from a unified parsing model
        directory. Unified parsing model directories are a standardized
        way of storing BLLIP parser and reranker models together on disk.
        See ``bllipparser.RerankingParser.get_unified_model_parameters()``
        for more information about unified model directories.

        :return: A ``BllipParser`` object using the parser and reranker
            models in the model directory.

        :param model_dir: Path to the unified model directory.
        :type model_dir: str
        :param parser_options: optional dictionary of parser options,
            passed as keyword arguments to
            ``bllipparser.RerankingParser.RerankingParser.load_parser_model()``.
        :type parser_options: dict(str)
        :param reranker_options: optional dictionary of reranker options,
            passed as keyword arguments to
            ``bllipparser.RerankingParser.RerankingParser.load_reranker_model()``.
        :type reranker_options: dict(str)
        :rtype: BllipParser
        """
        (parser_model_dir, reranker_features_filename,
         reranker_weights_filename) = get_unified_model_parameters(model_dir)
        return cls(parser_model_dir, reranker_features_filename,
                   reranker_weights_filename, parser_options,
                   reranker_options)


def demo():
    """This assumes the Python module bllipparser is installed."""

    # download and install a basic unified parsing model (Wall Street Journal)
    # sudo python -m nltk.downloader bllip_wsj_no_aux

    from nltk.data import find
    model_dir = find('models/bllip_wsj_no_aux').path

    print('Loading BLLIP Parsing models...')
    # the easiest way to get started is to use a unified model
    bllip = BllipParser.from_unified_model_dir(model_dir)
    print('Done.')

    sentence1 = 'British left waffles on Falklands .'.split()
    sentence2 = 'I saw the man with the telescope .'.split()
    # this sentence is known to fail under the WSJ parsing model
    fail1 = '# ! ? : -'.split()
    for sentence in (sentence1, sentence2, fail1):
        print('Sentence: %r' % ' '.join(sentence))
        try:
            tree = next(bllip.parse(sentence))
            print(tree)
        except StopIteration:
            print("(parse failed)")

    # n-best parsing demo
    for i, parse in enumerate(bllip.parse(sentence1)):
        print('parse %d:\n%s' % (i, parse))

    # using external POS tag constraints
    print("forcing 'tree' to be 'NN':",
          next(bllip.tagged_parse([('A', None), ('tree', 'NN')])))
    print("forcing 'A' to be 'DT' and 'tree' to be 'NNP':",
          next(bllip.tagged_parse([('A', 'DT'), ('tree', 'NNP')])))
    # constraints don't have to make sense... (though on more complicated
    # sentences, they may cause the parse to fail)
    print("forcing 'A' to be 'NNP':",
          next(bllip.tagged_parse([('A', 'NNP'), ('tree', None)])))


def setup_module(module):
    """Skip this module's tests when the bllipparser module is unavailable."""
    # unittest.SkipTest comes from the standard library, so raising the
    # skip does not itself require a third-party package.
    from unittest import SkipTest

    try:
        _ensure_bllip_import_or_error()
    except ImportError:
        raise SkipTest('doctests from nltk.parse.bllip are skipped because '
                       'the bllipparser module is not installed')