# -*- coding: utf-8 -*- """ Functions for assessing the well-formedness of a sentence. """ from __future__ import division from auxiliaries.dset_proc_aux import * def has_root(sent_match): """Checks wether the sentence has a dependency root. """ if "roots" not in sent_match.stats.keys(): put_feature_value_list(sent_match.match, "no_root", (True, "no dependency root")) def check_sent_start(sent_match): """ Checks sentence beginning for capital letters. """ if sent_match.stats["tokens"][0].word in [u"”", '"', "'", "-", u"–", "("]: if not sent_match.stats["tokens"][1].word[0].isupper(): put_feature_value_list(sent_match.match, "sent_tokenization", (True, "no initial capital")) #return elif not sent_match.stats["tokens"][0].word[0].isupper(): #and not sent_match.stats["tokens"][0].word.isdigit() put_feature_value_list(sent_match.match, "sent_tokenization", (True, "no initial capital")) #return def check_sent_end(sent_match): """ Checks sentence end for major delimiters. """ sent_end = sent_match.stats["tokens"][-1] if sent_end.word in [u"”", '"', "'", ")"]: if sent_match.stats["tokens"][-2].word not in [".", "!", "?"]: #!= "MAD" put_feature_value_list(sent_match.match, "sent_tokenization", (True, "ends with: '%s'" % sent_match.stats["tokens"][-2].word)) elif sent_end.word not in [".", "!", "?"]: #"MAD" and sent_match.stats["tokens"][-1].word not in [":", "..."] put_feature_value_list(sent_match.match, "sent_tokenization", (True, "ends with: '%s'" % sent_end.word)) #ends with period, but second last word is abbreviation: elif sent_end.word == "." and "AN" in sent_match.stats["tokens"][-2].msd.split("."): put_feature_value_list(sent_match.match, "sent_tokenization", (True, "ends with: '%s'" % sent_match.stats["tokens"][-2].word)) def check_sent_tokenization(sent_match): """ Checks whether the sentence is correctly tokenized. """ check_sent_start(sent_match) check_sent_end(sent_match) def get_bad_lexica_percentage(sent_match, thresholds): """ Checks whether the percentage of non-alpha tokens and unrecognized lemmas is within the specified threshold. """ criteria = ["non_alpha", "non_lemmatized"] for criterion in criteria: if criterion == "non_lemmatized": #exclude punctuation marks from non lemmatized items (already counted in non alpha) value = len(sent_match.stats.get(criterion, [])) else: value = len(sent_match.stats.get(criterion, [])) if value > 0: corr_value = 0 # needed for (.) added manually as sentence ending punctuation as # sentence tokenization work-around in Korp annotation lab, # present only in the development data if "".join([t.word for t in sent_match.stats["tokens"]][-3:]) == "(.)": corr_value = 2 try: percentage = (value / len(sent_match.sent.nodes)-corr_value) * 100 except AttributeError: percentage = (value / len(sent_match.sent.sentence.nodes)-corr_value) * 100 if percentage > thresholds[criterion]: crit_str = " ".join(criterion.split("_")) message = "%d %s tokens: %s" % (value,crit_str, ", ".join(sent_match.stats.get(criterion, []))) #nr of tokens per categgory put_feature_value_list(sent_match.match, criterion, (percentage, message)) def check_ellipsis(sent_match): """ Checks whether the sentence is elliptic, i.e. lacks the subject or a finite verb. """ # No subject required with imperative or passive if sent_match.stats.has_key("imp_VB") or sent_match.sent[-1]["word"] == "?": #or sent_match.stats.get("passive", 0.0) has_subject = True elif sent_match.stats["has_subject"]: has_subject = True else: has_subject = False if not sent_match.stats["finite"]: #or not has_subject put_feature_value(sent_match.match, "elliptic", (True, "no finite verb")) #no finite verb elif not has_subject: put_feature_value(sent_match.match, "elliptic", (True, "no subject"))