# -*- coding: utf-8 -*- # Natural Language Toolkit: Interface to MaltParser # # Author: Dan Garrette # Contributor: Liling Tan, Mustufain, osamamukhtar11 # # Copyright (C) 2001-2017 NLTK Project # URL: # For license information, see LICENSE.TXT from __future__ import print_function from __future__ import unicode_literals from six import text_type import os import sys import tempfile import subprocess import inspect from nltk.data import ZipFilePathPointer from nltk.internals import find_dir, find_file, find_jars_within_path from nltk.parse.api import ParserI from nltk.parse.dependencygraph import DependencyGraph from nltk.parse.util import taggedsents_to_conll def malt_regex_tagger(): from nltk.tag import RegexpTagger _tagger = RegexpTagger( [(r'\.$','.'), (r'\,$',','), (r'\?$','?'), # fullstop, comma, Qmark (r'\($','('), (r'\)$',')'), # round brackets (r'\[$','['), (r'\]$',']'), # square brackets (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers (r'(The|the|A|a|An|an)$', 'DT'), # articles (r'(He|he|She|she|It|it|I|me|Me|You|you)$', 'PRP'), # pronouns (r'(His|his|Her|her|Its|its)$', 'PRP$'), # possesive (r'(my|Your|your|Yours|yours)$', 'PRP$'), # possesive (r'(on|On|in|In|at|At|since|Since)$', 'IN'),# time prepopsitions (r'(for|For|ago|Ago|before|Before)$', 'IN'),# time prepopsitions (r'(till|Till|until|Until)$', 'IN'), # time prepopsitions (r'(by|By|beside|Beside)$', 'IN'), # space prepopsitions (r'(under|Under|below|Below)$', 'IN'), # space prepopsitions (r'(over|Over|above|Above)$', 'IN'), # space prepopsitions (r'(across|Across|through|Through)$', 'IN'),# space prepopsitions (r'(into|Into|towards|Towards)$', 'IN'), # space prepopsitions (r'(onto|Onto|from|From)$', 'IN'), # space prepopsitions (r'.*able$', 'JJ'), # adjectives (r'.*ness$', 'NN'), # nouns formed from adjectives (r'.*ly$', 'RB'), # adverbs (r'.*s$', 'NNS'), # plural nouns (r'.*ing$', 'VBG'), # gerunds (r'.*ed$', 'VBD'), # past tense verbs (r'.*', 'NN'), # nouns (default) ]) return _tagger.tag def find_maltparser(parser_dirname): """ A module to find MaltParser .jar file and its dependencies. """ if os.path.exists(parser_dirname): # If a full path is given. _malt_dir = parser_dirname else: # Try to find path to maltparser directory in environment variables. _malt_dir = find_dir(parser_dirname, env_vars=('MALT_PARSER',)) # Checks that that the found directory contains all the necessary .jar malt_dependencies = ['','',''] _malt_jars = set(find_jars_within_path(_malt_dir)) _jars = set(os.path.split(jar)[1] for jar in _malt_jars) malt_dependencies = set(['log4j.jar', 'libsvm.jar', 'liblinear-1.8.jar']) assert malt_dependencies.issubset(_jars) assert any(filter(lambda i: i.startswith('maltparser-') and i.endswith('.jar'), _jars)) return list(_malt_jars) def find_malt_model(model_filename): """ A module to find pre-trained MaltParser model. """ if model_filename == None: return 'malt_temp.mco' elif os.path.exists(model_filename): # If a full path is given. return model_filename else: # Try to find path to malt model in environment variables. return find_file(model_filename, env_vars=('MALT_MODEL',), verbose=False) class MaltParser(ParserI): """ A class for dependency parsing with MaltParser. The input is the paths to: - a maltparser directory - (optionally) the path to a pre-trained MaltParser .mco model file - (optionally) the tagger to use for POS tagging before parsing - (optionally) additional Java arguments Example: >>> from nltk.parse import malt >>> # With MALT_PARSER and MALT_MODEL environment set. >>> mp = malt.MaltParser('maltparser-1.7.2', 'engmalt.linear-1.7.mco') # doctest: +SKIP >>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP (shot I (elephant an) (in (pajamas my)) .) >>> # Without MALT_PARSER and MALT_MODEL environment. >>> mp = malt.MaltParser('/home/user/maltparser-1.7.2/', '/home/user/engmalt.linear-1.7.mco') # doctest: +SKIP >>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP (shot I (elephant an) (in (pajamas my)) .) """ def __init__(self, parser_dirname, model_filename=None, tagger=None, additional_java_args=None): """ An interface for parsing with the Malt Parser. :param parser_dirname: The path to the maltparser directory that contains the maltparser-1.x.jar :type parser_dirname: str :param model_filename: The name of the pre-trained model with .mco file extension. If provided, training will not be required. (see http://www.maltparser.org/mco/mco.html and see http://www.patful.com/chalk/node/185) :type model_filename: str :param tagger: The tagger used to POS tag the raw string before formatting to CONLL format. It should behave like `nltk.pos_tag` :type tagger: function :param additional_java_args: This is the additional Java arguments that one can use when calling Maltparser, usually this is the heapsize limits, e.g. `additional_java_args=['-Xmx1024m']` (see http://goo.gl/mpDBvQ) :type additional_java_args: list """ # Find all the necessary jar files for MaltParser. self.malt_jars = find_maltparser(parser_dirname) # Initialize additional java arguments. self.additional_java_args = additional_java_args if \ additional_java_args is not None else [] # Initialize model. self.model = find_malt_model(model_filename) self._trained = self.model != 'malt_temp.mco' # Set the working_dir parameters i.e. `-w` from MaltParser's option. self.working_dir = tempfile.gettempdir() # Initialize POS tagger. self.tagger = tagger if tagger is not None else malt_regex_tagger() def parse_tagged_sents(self, sentences, verbose=False, top_relation_label='null'): """ Use MaltParser to parse multiple POS tagged sentences. Takes multiple sentences where each sentence is a list of (word, tag) tuples. The sentences must have already been tokenized and tagged. :param sentences: Input sentences to parse :type sentence: list(list(tuple(str, str))) :return: iter(iter(``DependencyGraph``)) the dependency graph representation of each sentence """ if not self._trained: raise Exception("Parser has not been trained. Call train() first.") with tempfile.NamedTemporaryFile(prefix='malt_input.conll.', dir=self.working_dir, mode='w', delete=False) as input_file: with tempfile.NamedTemporaryFile(prefix='malt_output.conll.', dir=self.working_dir, mode='w', delete=False) as output_file: # Convert list of sentences to CONLL format. for line in taggedsents_to_conll(sentences): input_file.write(text_type(line)) input_file.close() # Generate command to run maltparser. cmd =self.generate_malt_command(input_file.name, output_file.name, mode="parse") # This is a maltparser quirk, it needs to be run # where the model file is. otherwise it goes into an awkward # missing .jars or strange -w working_dir problem. _current_path = os.getcwd() # Remembers the current path. try: # Change to modelfile path os.chdir(os.path.split(self.model)[0]) except: pass ret = self._execute(cmd, verbose) # Run command. os.chdir(_current_path) # Change back to current path. if ret is not 0: raise Exception("MaltParser parsing (%s) failed with exit " "code %d" % (' '.join(cmd), ret)) # Must return iter(iter(Tree)) with open(output_file.name) as infile: for tree_str in infile.read().split('\n\n'): yield(iter([DependencyGraph(tree_str, top_relation_label=top_relation_label)])) os.remove(input_file.name) os.remove(output_file.name) def parse_sents(self, sentences, verbose=False, top_relation_label='null'): """ Use MaltParser to parse multiple sentences. Takes a list of sentences, where each sentence is a list of words. Each sentence will be automatically tagged with this MaltParser instance's tagger. :param sentences: Input sentences to parse :type sentence: list(list(str)) :return: iter(DependencyGraph) """ tagged_sentences = (self.tagger(sentence) for sentence in sentences) return self.parse_tagged_sents(tagged_sentences, verbose, top_relation_label=top_relation_label) def generate_malt_command(self, inputfilename, outputfilename=None, mode=None): """ This function generates the maltparser command use at the terminal. :param inputfilename: path to the input file :type inputfilename: str :param outputfilename: path to the output file :type outputfilename: str """ cmd = ['java'] cmd+= self.additional_java_args # Adds additional java arguments # Joins classpaths with ";" if on Windows and on Linux/Mac use ":" classpaths_separator = ';' if sys.platform.startswith('win') else ':' cmd+= ['-cp', classpaths_separator.join(self.malt_jars)] # Adds classpaths for jars cmd+= ['org.maltparser.Malt'] # Adds the main function. # Adds the model file. if os.path.exists(self.model): # when parsing cmd+= ['-c', os.path.split(self.model)[-1]] else: # when learning cmd+= ['-c', self.model] cmd+= ['-i', inputfilename] if mode == 'parse': cmd+= ['-o', outputfilename] cmd+= ['-m', mode] # mode use to generate parses. return cmd @staticmethod def _execute(cmd, verbose=False): output = None if verbose else subprocess.PIPE p = subprocess.Popen(cmd, stdout=output, stderr=output) return p.wait() def train(self, depgraphs, verbose=False): """ Train MaltParser from a list of ``DependencyGraph`` objects :param depgraphs: list of ``DependencyGraph`` objects for training input data :type depgraphs: DependencyGraph """ # Write the conll_str to malt_train.conll file in /tmp/ with tempfile.NamedTemporaryFile(prefix='malt_train.conll.', dir=self.working_dir, mode='w', delete=False) as input_file: input_str = ('\n'.join(dg.to_conll(10) for dg in depgraphs)) input_file.write(text_type(input_str)) # Trains the model with the malt_train.conll self.train_from_file(input_file.name, verbose=verbose) # Removes the malt_train.conll once training finishes. os.remove(input_file.name) def train_from_file(self, conll_file, verbose=False): """ Train MaltParser from a file :param conll_file: str for the filename of the training input data :type conll_file: str """ # If conll_file is a ZipFilePathPointer, # then we need to do some extra massaging if isinstance(conll_file, ZipFilePathPointer): with tempfile.NamedTemporaryFile(prefix='malt_train.conll.', dir=self.working_dir, mode='w', delete=False) as input_file: with conll_file.open() as conll_input_file: conll_str = conll_input_file.read() input_file.write(text_type(conll_str)) return self.train_from_file(input_file.name, verbose=verbose) # Generate command to run maltparser. cmd =self.generate_malt_command(conll_file, mode="learn") ret = self._execute(cmd, verbose) if ret != 0: raise Exception("MaltParser training (%s) failed with exit " "code %d" % (' '.join(cmd), ret)) self._trained = True if __name__ == '__main__': ''' A demostration function to show how NLTK users can use the malt parser API. >>> from nltk import pos_tag >>> assert 'MALT_PARSER' in os.environ, str( ... "Please set MALT_PARSER in your global environment, e.g.:\n" ... "$ export MALT_PARSER='/home/user/maltparser-1.7.2/'") >>> >>> assert 'MALT_MODEL' in os.environ, str( ... "Please set MALT_MODEL in your global environment, e.g.:\n" ... "$ export MALT_MODEL='/home/user/engmalt.linear-1.7.mco'") >>> >>> _dg1_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n" ... "2 sees _ VB _ _ 0 ROOT _ _\n" ... "3 a _ DT _ _ 4 SPEC _ _\n" ... "4 dog _ NN _ _ 2 OBJ _ _\n" ... "5 . _ . _ _ 2 PUNCT _ _\n") >>> >>> >>> _dg2_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n" ... "2 walks _ VB _ _ 0 ROOT _ _\n" ... "3 . _ . _ _ 2 PUNCT _ _\n") >>> dg1 = DependencyGraph(_dg1_str) >>> dg2 = DependencyGraph(_dg2_str) >>> # Initialize a MaltParser object >>> parser_dirname = 'maltparser-1.7.2' >>> mp = MaltParser(parser_dirname=parser_dirname) >>> >>> # Trains a model. >>> mp.train([dg1,dg2], verbose=False) >>> sent1 = ['John','sees','Mary', '.'] >>> sent2 = ['John', 'walks', 'a', 'dog', '.'] >>> >>> # Parse a single sentence. >>> parsed_sent1 = mp.parse_one(sent1) >>> parsed_sent2 = mp.parse_one(sent2) >>> print (parsed_sent1.tree()) (sees John Mary .) >>> print (parsed_sent2.tree()) (walks John (dog a) .) >>> >>> # Parsing multiple sentences. >>> sentences = [sent1,sent2] >>> parsed_sents = mp.parse_sents(sentences) >>> print(next(next(parsed_sents)).tree()) (sees John Mary .) >>> print(next(next(parsed_sents)).tree()) (walks John (dog a) .) >>> >>> # Initialize a MaltParser object with an English pre-trained model. >>> parser_dirname = 'maltparser-1.7.2' >>> model_name = 'engmalt.linear-1.7.mco' >>> mp = MaltParser(parser_dirname=parser_dirname, model_filename=model_name, tagger=pos_tag) >>> sent1 = 'I shot an elephant in my pajamas .'.split() >>> sent2 = 'Time flies like banana .'.split() >>> # Parse a single sentence. >>> print(mp.parse_one(sent1).tree()) (shot I (elephant an) (in (pajamas my)) .) # Parsing multiple sentences >>> sentences = [sent1,sent2] >>> parsed_sents = mp.parse_sents(sentences) >>> print(next(next(parsed_sents)).tree()) (shot I (elephant an) (in (pajamas my)) .) >>> print(next(next(parsed_sents)).tree()) (flies Time (like banana) .) ''' import doctest doctest.testmod()