# -*- coding: utf-8 -*-
from __future__ import division
import codecs, os
#import classify_level   # WEKA-based
import well_formedness
import context_independence
from auxiliaries.match_aux import *
import ling_complexity as lc


class SentMatch:
    """
    Computes and stores scores describing how well a KWIC (KeyWord In
    Context), i.e. a corpus sentence containing a search keyword, satisfies
    a number of search parameters and criteria.

    Args:
        kwic (instance): a Korp (kwic) sentence mapped to a kwic instance
        stats (instance): a SentStatistics instance, the result of the
            SentStatistics.get_stats_SWE() method
        parameters (dict): search parameters passed as a dictionary
        criteria (dict): the filters and rankers to use

    Attributes:
        kwic (instance): see above
        stats (instance): see above
        sent (instance): a Sentence instance
        params (dict): see above
        criteria (dict): see above
        match (dict): match value tuples per search parameter. The first
            element is a boolean, True if a sentence is bad according to a
            criterion, False otherwise; the second element provides
            further information.
        match_score (int): overall match score
        sent_left (str): tokens to the left of the search keyword (the
            keyword is specified in params)
        sent_right (str): tokens to the right of the search keyword

    Returns:
        An object with an overall match score and detailed information for
        a sentence, based on the input parameters and criteria.
    """
    def __init__(self, kwic, stats, parameters, criteria):
        self.kwic = kwic        # context dependency (CD) experiments
        self.stats = stats
        try:
            self.sent = kwic.sentence   # Sentence instance, but .nodes
                                        # changed to a list of dicts
                                        # (JSON serialization fix)
        except AttributeError:
            self.sent = kwic
        self.params = parameters
        self.criteria = criteria
        self.match = {}
        self.match_score = 0
        self.sent_left = ""
        self.sent_right = ""
        #print self.sent.words.encode("utf-8")
        # Create the left and right sentence context attributes used for
        # exercise item generation.
        split_keyword_context(self)

    def check_wellformedness(self):
        # WELL-FORMEDNESS
        if "root" in self.criteria["well_formedness"]:
            well_formedness.has_root(self)
        if "sent_tokenization" in self.criteria["well_formedness"]:
            well_formedness.check_sent_tokenization(self)
        if "non_alpha" in self.criteria["well_formedness"] or \
           "non_lemmatized" in self.criteria["well_formedness"]:
            thresholds = {"non_alpha": self.params["non_alpha_thr"],
                          "non_lemmatized": self.params["non_lemmatized_thr"]}
            well_formedness.get_bad_lexica_percentage(self, thresholds)
        if "elliptic" in self.criteria["well_formedness"]:
            well_formedness.check_ellipsis(self)
        #TO DO: Discard the sentence if it fails one of the sub-criteria?
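
    # Illustrative sketch (an assumption, not part of the original
    # pipeline): after check_wellformedness() has run, self.match is
    # expected to hold one (value, info) tuple per failed sub-criterion,
    # recorded via the put_feature_value helper from auxiliaries.match_aux,
    # e.g.:
    #
    #     self.match = {"non_alpha": (True, "2 non-alphabetical token(s)"),
    #                   "elliptic": (True, "no finite verb found")}
    #
    # The info strings above are hypothetical; the exact wording comes
    # from the well_formedness helpers.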
    def check_isolability(self, demon_pronouns, anaph_adv):
        # CONTEXT-INDEPENDENCE
        context_indep = False
        time_adv_antecedent = False
        dialogue_answ = [("MID", "IN", "MID"), ("IN", "MID"),
                         ("MID", "AB", "MID")]
        if "roots" in self.stats and "struct_conn" in self.criteria["isolability"]:
            context_indep = context_independence.check_root_POS(
                self, self.stats["roots"])
        if "yn_answer" in self.criteria["isolability"]:
            context_indep = context_independence.is_yn_answer(self,
                                                              dialogue_answ)
        anaphora_types_to_check = [k for k in self.criteria["isolability"]
                                   if k[:5] == "anaph"]
        if anaphora_types_to_check:
            for j, tok in enumerate(self.stats["tokens"]):
                context_indep = context_independence.check_anaphora(
                    tok, j, self, time_adv_antecedent, demon_pronouns,
                    anaph_adv, anaphora_types_to_check)
        if "anaphora-PN" in self.criteria["isolability"]:
            if "anaphora-PN" in self.match:
                unresolved_pn = [word for (bool_val, word)
                                 in self.match["anaphora-PN"]]
            else:
                unresolved_pn = []
            if "resolved?_anaphora-PN" in self.stats:
                pn_with_ant_candidate = [word for bool_val, word in
                                         self.stats["resolved?_anaphora-PN"]]
                adjusted_value = (len(unresolved_pn) -
                                  (len(pn_with_ant_candidate) * 0.5))
                self.match["anaphora-PN"] = (
                    adjusted_value,
                    "unresolved PNs: %s; PNs with antecedent candidates: %s"
                    % (", ".join(unresolved_pn).encode("utf-8"),
                       ", ".join(pn_with_ant_candidate).encode("utf-8")))
            elif unresolved_pn:
                self.match["anaphora-PN"] = (
                    len(unresolved_pn),
                    "unresolved PNs: %s"
                    % ", ".join(unresolved_pn).encode("utf-8"))
        if "anaphora-AB" in self.criteria["isolability"]:
            if "anaphora-AB" in self.match:
                try:
                    self.match["anaphora-AB"] = (
                        len(self.match["anaphora-AB"]),
                        ", ".join([m for (b, m) in
                                   self.match["anaphora-AB"]]).encode("utf-8"))
                except TypeError:
                    print self.match["anaphora-AB"].encode("utf-8")
        #TO DO: Discard the sentence if it fails one of the sub-criteria?

    def check_sensitive_voc(self, sensitive_voc):
        # Sensitive vocabulary (partial PARSNIP)
        # PARSNIP = Politics, Alcohol, Religion, Sex, Narcotics,
        # Isms (e.g. communism or atheism), Pork
        if "sensitive_voc" in self.criteria:
            categories_to_filter = self.params["sensitive_voc_cats"]
            # Select the relevant subset of sensitive words.
            voc_to_filter = []
            for l2 in sensitive_voc:
                if categories_to_filter == ["all"]:
                    voc_to_filter.append(l2[0].decode("utf-8"))
                else:
                    cat = l2[2].split(",")
                    for c in cat:
                        if c.strip(" ").strip("\n") in categories_to_filter:
                            voc_to_filter.append(l2[0].decode("utf-8"))
            # Check whether any token matches any item in the sublist.
            sens_voc_in_sent = []
            for tkn in self.sent:
                if tkn["lemma"]:
                    sensitive_w = []
                    for lm in tkn["lemma"]:
                        if lm in voc_to_filter or \
                           tkn["word"].lower() in voc_to_filter:
                            sensitive_w.append(True)
                        else:
                            sensitive_w.append(False)
                    # TO DO: incorporate Google's list? or another Swedish list?
                    # TO DO: sense-based version
                    if sum(sensitive_w):
                        sens_voc_in_sent.append(tkn["word"])
                elif tkn["word"].lower() in voc_to_filter:
                    sens_voc_in_sent.append(tkn["word"])
            if sens_voc_in_sent:
                put_feature_value(self.match, "sensitive_voc",
                                  (len(sens_voc_in_sent),
                                   ", ".join(sens_voc_in_sent).encode("utf-8")))

    def check_readability(self, classifier, instance):
        # READABLE - machine learning based, CICLING 2015 features
        cefr_scale = {"A1": 1, "A2": 2, "B1": 3, "B2": 4, "C1": 5, "C2": 6}
        pred_cefr = lc.classify(classifier, instance)
        target_cefr = self.params["target_cefr"]
        if pred_cefr != target_cefr:
            level_diff = cefr_scale[pred_cefr] - cefr_scale[target_cefr]
            # level_diff > 0 means the sentence is more difficult than
            # the target level
        else:
            level_diff = 0
        if self.criteria["readability"] == "filter" and level_diff:
            put_feature_value(self.match, "readability",
                              (level_diff, pred_cefr))
        elif self.criteria["readability"] == "ranker":
            put_feature_value(self.match, "readability",
                              (level_diff, pred_cefr))

    #def check_informativity(self):
    #    # INFORMATIVE
    #    if "lex_to_func" in self.criteria["informative"]:
    #        # Ratio of lexical tokens to non-lexical tokens
    #        nb_lexical_words = 0
    #        nb_func_words = 0
    #        for pos in self.stats["pos_unigr"]:
    #            # duplicate from sent_features:
    #            if pos in ["NN", "JJ", "VB", "AB"]:   # keep adverbs?
    #                nb_lexical_words += 1
    #            elif pos not in ["PM", "UO", "MAD", "MID"]:
    #                # proper names, foreign words and punctuation are not
    #                # counted as function words
    #                nb_func_words += 1
    #        if nb_func_words:
    #            lex_func_ratio = (nb_lexical_words / nb_func_words)
    #            diff_lex_func = -(lex_func_ratio - self.params["lex_to_func_thr"])
    #        else:
    #            diff_lex_func = nb_lexical_words    # if no function words
    #        self.match["lex_to_func"] = (diff_lex_func, lex_func_ratio)
    #        # diff_lex_func stands for the extent to which the criterion
    #        # is satisfied; ratios below the threshold (bad ratios) result
    #        # in negative values

    def check_typicality(self):
        # TYPICAL - based on the Mutual Information value from Korp
        # word pictures
        #mi_info = [str(el[0]) + ": " + str(el[1]) for el in
        #           zip(self.stats["used_rel_lemmas"],
        #               self.stats["MI"]) if el[1]]
        mi_info = "summed Lexicographers' Mutual Information score"
        self.match["typicality"] = (sum(self.stats["MI"]), mi_info)
        # TO DO: improve mi_info (add the lemma pair, not only single lemmas)
        # TO DO: valency (use verb_args from stats)

    def check_other_criteria(self, speaking_verbs_list):
        # EXTRA
        if "length" in self.criteria["other_criteria"]:
            # Is the length of the sentence outside the desired range?
            sent_len = int(self.sent.length)
            min_len = self.params["min_len"]
            max_len = self.params["max_len"]
            out_of_len = out_of_length_range(self.sent.length,
                                             min_len, max_len)
            if out_of_len:
                if sent_len < min_len:
                    diff = min_len - sent_len
                else:
                    diff = sent_len - max_len
                info = "%d tokens long" % sent_len
                self.match["length"] = (diff, info)
        # Is there keyword repetition?
        if "repkw" in self.criteria["other_criteria"]:
            if "keyword_count" in self.stats:
                kw_rep = self.stats["keyword_count"] - 1
                if kw_rep:
                    self.match["repkw"] = (kw_rep,
                                           "%d repetition(s)" % kw_rep)
        # Is the keyword's position outside the defined distance range from
        # the chosen target sentence edge (start or end)?
        if "kw_position" in self.criteria["other_criteria"]:
            kw_in_pos = is_keyword_within_position(self.sent,   # self.kwic
                                                   self.params["target_edge"],
                                                   self.params["proportion"])
            if not kw_in_pos:
                self.match["kw_position"] = (
                    not kw_in_pos,
                    "search term not within %d%% of sentence %s"
                    % (self.params["proportion"], self.params["target_edge"]))
        # Is the sentence interrogative?
if "interrogative" in self.criteria["other_criteria"]: if self.sent[-1]["word"] == "?": put_feature_value(self.stats, "interrogative", True) self.match["interrogative"] = (True, "") # Check for direct speech if "direct_speech" in self.criteria["other_criteria"]: speaking_verbs = [l[0].decode("utf-8") for l in speaking_verbs_list] direct_speech_patterns = [("MID", "VB", "PN"), ("MID", "VB", "PM"), ("PAD", "VB", "PN"), ("PAD", "VB", "PM")] #TO DO: handle auxiliaries and modal verbs token_objs = self.stats["tokens"] # list of original Token instances # from Sentence.nodes try: speaking = [] for t in token_objs: if t.lemma: if t.lemma[0] in speaking_verbs: speaking.append(t) if speaking: for vb in speaking: verb_idx = int(vb.ref)-1 try: ds_candidate = (token_objs[verb_idx-1].pos,token_objs[verb_idx].pos, token_objs[verb_idx+1].pos) if ds_candidate in direct_speech_patterns: put_feature_value(self.match, "direct_speech", (True, ds_candidate)) except IndexError: pass except IndexError: pass #Is the sentence negatively formulated? if "neg_form" in self.criteria["other_criteria"]: if "neg_form" in self.stats: neg_form = self.stats["neg_form"] if neg_form: self.match["neg_form"] = (len(neg_form), ", ".join(neg_form).encode("utf-8")) #Are there modal verbs in the sentence? if "modal_verb" in self.criteria["other_criteria"]: if "modal_verb" in self.stats: mv = self.stats["modal_verb"] self.match["modal_verb"] = (len(mv), ", ".join(mv).encode("utf-8")) #Are there participles? if "participle" in self.criteria["other_criteria"]: pc = [t["word"] for t in self.sent if t["pos"] == "PC"] if pc: self.match["participle"] = (len(pc), ", ".join(pc).encode("utf-8")) #Are there s-verbs? if "sverb" in self.criteria["other_criteria"]: if "sverb" in self.stats: sverb = self.stats["sverb"] if sverb: self.match["sverb"] = (len(sverb), ", ".join(sverb).encode("utf-8")) #Are there proper_names? if "proper_name" in self.criteria["other_criteria"]: if "proper_name" in self.stats: prop_name = self.stats["proper_name"] if prop_name: self.match["proper_name"] = (len(prop_name), ", ".join(prop_name).encode("utf-8")) #Are there abbreviations? (change to True/False in the fun above?) if "abbrev" in self.criteria["other_criteria"]: if "abbrev" in self.stats: abbrev = self.stats["abbrev"] if abbrev: self.match["abbrev"] = (len(abbrev), ", ".join(abbrev).encode("utf-8")) # Information from Kelly if "diff_voc_kelly" in self.criteria["other_criteria"]: #Is the % of words above the target CEFR level #above the chosen limit? if "diff_voc" in self.stats: nr_diff_voc = len(self.stats["diff_voc"]) #TO DO: do as below for all criteria if self.criteria["other_criteria"]["diff_voc_kelly"] == "filter": diff_voc_perc = (nr_diff_voc / self.sent.length) * 100 diff_voc = diff_voc_perc > self.params["voc_thr"] if diff_voc: self.match["diff_voc_kelly"] = (diff_voc, ", ".join(self.stats["diff_voc"]).encode("utf-8")) else: self.match["diff_voc_kelly"] = (nr_diff_voc, ", ".join(self.stats["diff_voc"]).encode("utf-8")) #How many words are above the word frequency threshold? 
        # try:
        #     kelly_fr = [float(fr < self.params["kelly_freq_thr"]) for
        #                 fr in self.stats["voc_freq_kelly"]]
        # except KeyError:   # when there are no lexical tokens in the sentence
        #     kelly_fr = [0]
        # self.match["kelly_freq"] = (kelly_fr, "%d token(s)" %
        #                             float(sum(kelly_fr)))
        # Information from SVALex - only lexical tokens are checked
        if "svalex_fr" in self.criteria["other_criteria"]:
            # frequency WITHIN the target CEFR level (not AT the level)
            avg_freq_thr = {"A1": 50, "A2": 55, "B1": 54, "B2": 55, "C1": 58}
            if "svalex_fr" in self.stats:
                if self.criteria["other_criteria"]["svalex_fr"] == "filter":
                    avg_svalex_freq = avg_freq_thr[self.params["target_cefr"]] > \
                        (sum(self.stats["svalex_fr"]) /
                         len(self.stats["svalex_fr"]))
                else:
                    avg_svalex_freq = (sum(self.stats["svalex_fr"]) /
                                       len(self.stats["svalex_fr"]))
                if avg_svalex_freq:
                    #info = ", ".join([str(fr) for fr in self.stats["svalex_fr"]])
                    info = "average frequency (SVALex)"
                    self.match["svalex_fr"] = (avg_svalex_freq, info)
        if "out_of_svalex" in self.criteria["other_criteria"]:
            # Is the word in SVALex (at any level)?
            if "out_of_svalex" in self.stats:
                out_of_svalex = len(self.stats["out_of_svalex"])
                if out_of_svalex:
                    # When ranking, assume that a missing key means 0.
                    self.match["out_of_svalex"] = (
                        out_of_svalex,
                        ", ".join(self.stats["out_of_svalex"]).encode("utf-8"))
        # TO DO: Cognates?

    #def __str__(self):
    #    return "Corpus: %s\nSent ID: %s\nSentence: %s\n" % (
    #        self.kwic.corpus, self.kwic.position, self.sent)

    #to do: visualize detailed match values
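
#Usage sketch (illustrative only): how a SentMatch instance is typically
#driven. The kwic and stats objects come from the Korp pipeline and the
#SentStatistics module and cannot be constructed here, so the sketch is
#kept commented out; the parameter and criteria values are hypothetical
#examples, not the module's required schema.
#
#    parameters = {"target_cefr": "B1", "min_len": 6, "max_len": 20,
#                  "non_alpha_thr": 30, "non_lemmatized_thr": 30,
#                  "target_edge": "start", "proportion": 50, "voc_thr": 20,
#                  "sensitive_voc_cats": ["all"]}
#    criteria = {"well_formedness": ["root", "sent_tokenization", "elliptic"],
#                "isolability": ["struct_conn", "anaphora-PN"],
#                "other_criteria": {"length": "filter", "repkw": "filter"},
#                "readability": "ranker"}
#    sm = SentMatch(kwic, stats, parameters, criteria)
#    sm.check_wellformedness()
#    sm.check_isolability(demon_pronouns, anaph_adv)
#    sm.check_other_criteria(speaking_verbs_list)
#    print(sm.match)    # detailed (value, info) tuples per criterion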