# -*- coding: utf-8 -*-
import math as m

from auxiliaries.dset_proc_aux import *
from kelly import get_kelly_info, get_svalex_info, cefr_scale
from word_pic import get_mutual_info
from auxiliaries.dset_prep_aux import clean_value
from auxiliaries.match_aux import add_keyword_info
#import saldo as s


class SentStatistics():
    """
    Collects statistical information about a Sentence instance
    (see 'ling_unit.py').

    Links to the tag sets referred to in this file:
    - morpho-syntactic tags: http://spraakbanken.gu.se/korp/markup/msdtags.html
    - dependency relations:  http://stp.lingfil.uu.se/~nivre/swedish_treebank/dep.html

    Args:
        sent (instance):   a Sentence instance
        parameters (dict): sentence search parameters passed as a dictionary

    Attributes:
        sent:            see above
        stats:           the object that will be filled with the
                         statistical information
        modal_verb_list: a list of verbs usable as modal (auxiliary) verbs
        params:          see 'parameters' above

    Yields:
        A container for statistical information for a sentence.
    """
    def __init__(self, sent, parameters={}):
        self.sent = sent
        self.stats = {"voc_cefr": {"A1": 0, "A2": 0, "B1": 0, "B2": 0,
                                   "C1": 0, "C2": 0, "?": 0, "-": 0}}
        self.modal_verb_list = [u"kunna", u"måste", u"skola", u"vilja",
                                u"böra", u"få"]
        self.params = parameters

    def get_len_stats(self, t):
        """
        Collects length-based statistics.

        Arg:
            t: a Token instance
        """
        # Long words
        if len(t.word) > 6:
            put_feature_value(self.stats, "long_w", 1)
        # Extra-long words
        if len(t.word) > 13:
            put_feature_value(self.stats, "xlong_w", 1)
        # Total length in characters per token
        put_feature_value_list(self.stats, "tok_len", len(t.word))
        return self.stats

    def get_kelly_stats(self, t, kelly_list):
        """
        Collects information from the Kelly word list: the CEFR level for
        all categories and the frequency only for lexical categories
        (nouns, verbs, adjectives, adverbs). The information is added to
        the 'stats' attribute.

        Args:
            t:          a Token instance
            kelly_list: the loaded Kelly word list (see kelly.py)
        """
        if self.params:  # for sent_match
            v = get_kelly_info(kelly_list, t, self.params.get("target_cefr", ""))
        else:
            v = get_kelly_info(kelly_list, t, self.sent.level)
        #except AttributeError:
        #    v = get_kelly_info(self.kelly_list, t, self.text_level)
        cefr = v[1]
        self.stats["voc_cefr"][cefr] += 1.0
        if t.pos in ["NN", "VB"] and (v[0] == "above"):
            put_feature_value(self.stats, "diff_NNVB", 1.0)
        if t.pos in ["NN", "JJ", "VB", "AB"]:
            freq_kelly = v[2]
            if freq_kelly:
                log_freq_kelly = m.log(freq_kelly)  # following Coh-Metrix
            else:
                log_freq_kelly = 0.0
            put_feature_value_list(self.stats, "voc_freq_kelly", log_freq_kelly)
        # is the token of a suitable CEFR level according to Kelly?
        if v[0] == "above":  # or v == "not in kelly" or v == "no lemma"?
            put_feature_value_list(self.stats, "diff_voc", t.word)
        return self.stats

    def get_svalex_stats(self, t, svalex_list):
        """
        Collects frequency information from the SVALex word list for
        lexical categories.

        Args:
            t:           a Token instance
            svalex_list: the loaded SVALex word list
        """
        if t.pos in ["NN", "VB", "JJ", "AB"]:
            svalex_info = get_svalex_info(svalex_list, t,
                                          self.params.get("target_cefr", "any"))
            svalex_fr, out_of_svalex = svalex_info[0], svalex_info[1]
            put_feature_value_list(self.stats, "svalex_fr", svalex_fr)
            if out_of_svalex:
                put_feature_value_list(self.stats, "out_of_svalex", t.word)
        return self.stats
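    # A minimal sketch of how the Kelly statistics above accumulate, with
    # hypothetical values rather than real word-list entries:
    #
    #   v = ("above", "B2", 1096.6)          # (level match, CEFR level, frequency)
    #   self.stats["voc_cefr"]["B2"] += 1.0  # one more B2-level token
    #   m.log(1096.6)                        # ~= 7.0, Coh-Metrix-style log frequency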
    def get_morpho_synt_stats(self, s, t, i):
        """
        Gathers information based on part-of-speech and morpho-syntactic
        tags.

        Args:
            s: an instance of the Sentence class
            t: an instance of the Token class
            i: the current index of the token in the sentence
        """
        put_feature_value_list(self.stats, "pos_unigr", t.pos)
        # Verbs
        if t.pos == "VB":
            if not self.stats["finite"]:
                if ("INF" not in t.msd) and ("SUP" not in t.msd) \
                        and ("PRF" not in t.msd):
                    # a modal verb alone, without a VG dependent, is not
                    # allowed as the finite verb
                    if t.lemma:
                        if t.lemma[0] in self.modal_verb_list:
                            # 'få', 'ska' sometimes have a non-modal use
                            try:
                                ch_deprel = [tt.deprel for tt in
                                             self.stats["heads"][t.ref]]
                                if "VG" in ch_deprel:
                                    put_feature_value_list(self.stats,
                                                           "finite", 1.0)
                            except KeyError:
                                pass
                        else:
                            put_feature_value_list(self.stats, "finite", 1.0)
                    else:
                        put_feature_value_list(self.stats, "finite", 1.0)
            if t.deprel not in ["VG", "SP"]:  # SP e.g. 'är öppen'
                put_feature_value(self.stats, "main_verb", 1.0)
            if t.lemma:
                # check whether the next word is a verb (to catch also the
                # non-modal use of these verbs)
                if t.lemma[0] in self.modal_verb_list:
                    try:
                        if s.nodes[i+1].deprel == "VG":
                            put_feature_value_list(self.stats,
                                                   "modal_verb", t.word)
                    except IndexError:
                        for w in s.nodes[i:]:
                            if (w.pos == "VB" and w.deprel == "VG"
                                    and w.depheadid == t.ref):
                                put_feature_value_list(self.stats,
                                                       "modal_verb", t.word)
                if "SFO" in t.msd:
                    if t.lemma[0][-1] == "s":  # e.g. 'finns'
                        put_feature_value_list(self.stats, "sverb", t.word)
                    else:
                        put_feature_value(self.stats, "passive", 1.0)
            if "PRT" in t.msd:
                put_feature_value(self.stats, "past_VB", 1.0)
            elif "PRS" in t.msd:
                put_feature_value(self.stats, "pres_VB", 1.0)
            elif "SUP" in t.msd:
                put_feature_value(self.stats, "sup_VB", 1.0)
            if "IMP" in t.msd:
                put_feature_value(self.stats, "imp_VB", 1.0)
            if "KON" in t.msd:
                put_feature_value(self.stats, "konj_VB", 1.0)
        # Participles
        if t.msd[:6] == "PC.PRF":
            put_feature_value(self.stats, "perf_pc", 1.0)
        if t.msd[:6] == "PC.PRS":
            put_feature_value(self.stats, "pres_pc", 1.0)
        # Third person singular pronouns
        if t.word in ["han", "hon", "det", "den"]:
            put_feature_value(self.stats, "PN_3SG", 1.0)
        # Neuter nouns
        if t.pos == "NN" and ("NEU" in t.msd):
            put_feature_value(self.stats, "neu_NN", 1.0)
        # Relative structures (pronouns etc.)
        if t.pos in ["HA", "HD", "HP", "HS"]:
            if s.nodes[-1].word != "?":
                # to exclude the interrogative use of these
                # (but indirect questions are not handled...)
                put_feature_value(self.stats, "rel_str", 1.0)
        return self.stats

    def deprel_stats(self, t, root_ref, verb_args):
        """
        Collects syntactic information based on the length and the
        direction of dependency arcs.

        Args:
            t:         a Token instance
            root_ref:  the position (index) of the ROOT element in the
                       sentence
            verb_args: a dictionary mapping verb positions to their
                       dependents
        """
        put_feature_value_list(self.stats, "deprel_unigr", t.deprel)
        if t.pos == "VB":
            verb_args[t.ref] = []
        if t.depheadid == None or t.depheadid == '':
            put_feature_value_list(self.stats, "dep_len", 0.0)
        else:
            dep_len = int(t.ref) - int(t.depheadid)
            put_feature_value_list(self.stats, "dep_len", abs(dep_len))
            if dep_len < 0:
                put_feature_value(self.stats, "left_arc", 1.0)
            if dep_len > 0:
                put_feature_value(self.stats, "right_arc", 1.0)
            # Collecting verbal arguments
            # (all of them - restrict to pronouns and nouns?)
            if t.depheadid in verb_args:
                verb_args[t.depheadid].append(t)
                self.stats["verb_args"] = verb_args
            if root_ref:
                if t.depheadid == root_ref:
                    root_dep_len = int(root_ref) - int(t.ref)
                    put_feature_value_list(self.stats, "root_dep_len",
                                           abs(root_dep_len))
        return self.stats
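    # Worked example for deprel_stats() with hypothetical positions: a token
    # at ref "5" whose head is at ref "2" yields dep_len = 5 - 2 = 3, so 3 is
    # appended to stats["dep_len"] and "right_arc" is incremented; with the
    # positions swapped, dep_len = -3 and "left_arc" is incremented instead.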
    def get_semantic_stats(self, t):
        """
        Gets semantic information, based only on the number of senses for
        now, since no word-sense disambiguation is used.

        Arg:
            t: a Token instance
        """
        # nr of senses per word
        put_feature_value_list(self.stats, "senses/w", len(t.saldo))
        # nr of senses per noun
        if t.pos == "NN":
            put_feature_value_list(self.stats, "nn_senses/nn", len(t.saldo))
        return self.stats

    def get_sentmatch_stats(self, t, all_tokens, word_pictures):
        """
        Collects information used during sentence matching: keyword
        occurrences, negations, abbreviations, proper names and mutual
        information scores.

        Args:
            t:             a Token instance
            all_tokens:    all Token instances of the sentence
            word_pictures: word picture (co-occurrence) information
        """
        if t.word == self.params["query_w"]:
            put_feature_value(self.stats, "keyword_count", 1.0)  # keyword repetition
        if not t.word.isalpha():
            put_feature_value_list(self.stats, "non_alpha", t.word)
        elif t.word.isalpha() and not t.lemma:
            put_feature_value_list(self.stats, "non_lemmatized", t.word)
        if t.deprel in ["ES", "FS", "SS", "FP", "SP", "VS"]:
            # logical, dummy and other subjects + subject complements
            self.stats["has_subject"] = True
        if t.deprel == "NA":  # negation adverbials
            put_feature_value_list(self.stats, "neg_form", t.word)
        if "AN" in t.msd:
            put_feature_value_list(self.stats, "abbrev", t.word)
        if "PM" in t.msd:
            put_feature_value_list(self.stats, "proper_name", t.word)
        if t.deprel == "ROOT":
            put_feature_value_list(self.stats, "roots", t)
        add_keyword_info(t, self.stats, self.params)
        #put_feature_value_list(self.stats, "senses/w", float(len(t.saldo)))
        mi, used_rel_lemma = get_mutual_info(t, all_tokens, self.stats,
                                             word_pictures)
        put_feature_value_list(self.stats, "used_rel_lemmas", used_rel_lemma)
        put_feature_value_list(self.stats, "MI", mi)
        #put_feature_value_list(self.stats, "MI_lemmas", used_rel_lemma)
        return self.stats

    def get_stats_SWE(self, kelly_list, svalex_list, word_pictures={},
                      use_deprel=True, use_ngrams=False):
        """
        Gathers statistical information from different linguistic levels
        for a sentence, based on information specific to the Korp pipeline
        tags for Swedish (as of June 2015) and the Swedish Kelly word list.

        Args:
            kelly_list:    the loaded Kelly word list (see kelly.py)
            svalex_list:   the loaded SVALex word list
            word_pictures: word picture (co-occurrence) information
            use_deprel:    whether to use dependency relation tags
            use_ngrams:    whether to collect ngrams (uni- and bigrams),
                           see 'get_ngrams()' in 'feat_aux.py'
        """
        s = self.sent
        root_ref = check_root(s)
        #self.stats["diff_voc"] = 0.0
        tokens = []
        verb_args = {}  # {"verb1": {arg1_pos: "PN", arg1_deprel: "SS", etc.}, ...}
        self.stats["finite"] = []
        self.stats["heads"] = {}
        self.stats["keyword"] = {}
        self.stats["has_subject"] = 0
        self.stats["used_rel_lemmas"] = []

        # Map the position ('ref') of each dependency head to its
        # dependents ('ref' is +1 compared to regular indexes, string type)
        for tkn in s.nodes:
            if tkn.deprel == "ROOT":
                put_feature_value_list(self.stats["heads"], tkn.ref, tkn)  # "ROOT"
            else:
                put_feature_value_list(self.stats["heads"], tkn.depheadid, tkn)

        for i, t in enumerate(s.nodes):
            mapped_token = map_Token_to_dict(t)  # just a fix, see dset_proc_aux.py
            tokens.append(mapped_token)
            # get statistics from different linguistic levels
            self.stats = self.get_len_stats(t)
            self.stats = self.get_kelly_stats(t, kelly_list)
            if svalex_list:
                self.stats = self.get_svalex_stats(t, svalex_list)
            self.stats = self.get_semantic_stats(t)
            self.stats = self.get_morpho_synt_stats(s, t, i)
            if self.params:
                self.stats = self.get_sentmatch_stats(t, s.nodes, word_pictures)
            # add lemma unigrams
            lm_ngram = get_lemma_ngrams(s, t, i, "uni")
            if lm_ngram:
                put_feature_value_list(self.stats, "lemma_unigr", lm_ngram)
            if use_ngrams:  # bi- and trigrams
                self.stats = get_ngrams(self.stats, s, t, i)
            if use_deprel:
                self.stats = self.deprel_stats(t, root_ref, verb_args)
            #if self.params:
            #    self.stats = self.get_sentmatch_stats(s, t, i)
            #TODO: get rid of the 'params' argument?

        # fix for a JSON serialization issue with the Token class
        self.stats["tokens"] = s.nodes  # retain a copy of the Token instances
        s.nodes = tokens  # change the Token instances to dicts
        return self.stats
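    # A minimal usage sketch for get_stats_SWE() (hypothetical variable
    # names; assumes a parsed Sentence instance and loaded word lists):
    #
    #   stats = SentStatistics(sentence).get_stats_SWE(kelly_list, svalex_list)
    #   stats["voc_cefr"]   # CEFR tallies, e.g. {"A1": 4.0, "B1": 1.0, ...}
    #   stats["dep_len"]    # one dependency length per token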
    def get_stats(self):
        #TODO: implement a language-independent version based on the
        #universal tag sets (Nivre):
        #- Kelly or other frequency lists?
        #- POS tags (only 12 coarse-grained features)
        #- skip syntactic features or use a Korp-external annotation + mapping?
        pass

    def print_statistics(self):
        print "Statistics for: '" + self.sent.words + "'"
        for k, v in self.stats.iteritems():
            print k + ": \t", v
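
# Hypothetical example of print_statistics() output (illustrative values only):
#
#   Statistics for: 'Han läser en bok .'
#   pos_unigr:  ['PN', 'VB', 'DT', 'NN', 'MAD']
#   dep_len:    [1.0, 0.0, 1.0, 2.0, 3.0]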