#!/usr/bin/env python
# -*- coding: utf8 -*-
"""Harvest unambiguous example sentences for lexical units (LUs).

Tab-separated rows with 14 fields are read from standard input.  The fourth
field holds example sentences and the ninth field holds LU lines; both use
";;" as the item separator.  Words annotated as "[LU word]" in an example
are matched against the word forms that SALDO's morphological lexicon lists
for the LUs of the same row.  Every (example, form) pair that points to
exactly one LU is written to standard output, UTF-8 encoded, as the LU, four
"*" placeholder columns and the example with the form in square brackets,
all separated by tabs.  Pairs that point to several LUs are skipped.

The code is written to run on both Python 2 and Python 3.
"""

import sys
import codecs
import re

# NOTE(review): neither ``codecs`` nor ``compound`` is referenced in this
# file; both imports are kept untouched.
import annotate.compound as compound


# Layout of the tab-separated input rows (zero-based field indices).
_NUM_FIELDS = 14
_EXAMPLES_FIELD = 3
_LUS_FIELD = 8
_ITEM_SEPARATOR = ";;"

# Inflection parameters that are skipped for single-word entries.
_SKIPPED_PARAMS = ("c", "ci", "cm", "sms")

# Captures the word inside an "[LU word]" annotation.
_LU_ANNOTATION_RE = re.compile(r'\[LU\s(.+?)\]')
# Matches the opening part of any annotation: "[", the tag and one whitespace.
_ANNOTATION_OPEN_RE = re.compile(r'\[.+?\s')


def read_csv(num_of_fields, tolerates=-1):
    """Yield rows of tab-separated fields read from standard input.

    num_of_fields -- the number of fields every yielded row has.
    tolerates     -- the minimum number of fields a short row must have to be
                     accepted; accepted short rows are padded with empty
                     strings.  The default (-1) means "exactly
                     num_of_fields", i.e. no short rows are accepted.

    Rows with more than num_of_fields fields, or fewer than tolerates, are
    silently dropped.
    """
    if tolerates == -1:
        tolerates = num_of_fields
    for line in sys.stdin:
        # Strip only the line terminator, so a final line without a trailing
        # newline does not lose its last character.
        if line.endswith("\n"):
            line = line[:-1]
        fields = line.split("\t")
        if len(fields) == num_of_fields:
            yield fields
        elif tolerates <= len(fields) < num_of_fields:
            yield fields + [""] * (num_of_fields - len(fields))


def read_saldom(xml='lexikon/saldom/saldom.xml'):
    """Read the (sblex) XML version of SALDO's morphological lexicon.

    xml -- path of the lexicon file (lexikon/saldom/saldom.xml by default).

    Returns a dict with two keys:
      "LEXICON"   -- an empty dict (nothing is stored in it here).
      "ALL_FORMS" -- maps the text of each <saldo> element to the list of
                     distinct word forms (<wf>) of its entry, in the order
                     they first occur.  For entries whose <lem> contains "_"
                     only forms with param "frag" are kept; for all other
                     entries the forms with param "c", "ci", "cm" or "sms"
                     are left out.  Senses without any kept form are absent.
    """
    # cElementTree is the fast parser on Python 2 but was removed in
    # Python 3.9.  The ``as`` form binds only ``cet``, so the ``xml``
    # parameter is not overwritten by the import.
    try:
        import xml.etree.cElementTree as cet
    except ImportError:
        import xml.etree.ElementTree as cet

    allforms = {}
    seen_forms = {}  # sense -> set of forms already stored, for O(1) dedup

    # "start" events are requested only so that the very first event hands
    # us the root element, which is needed for clearing the tree below.
    context = iter(cet.iterparse(xml, events=("start", "end")))
    _, root = next(context)

    for event, elem in context:
        if event != "end":
            continue

        if elem.tag == 'LexicalEntry':
            lem = elem.findtext("lem")
            table = elem.find("table")
            multiword = "_" in lem
            for sense_elem in elem.findall("saldo"):
                saldo = sense_elem.text
                for form in table:
                    param = form.findtext("param")
                    if multiword:
                        keep = param == "frag"
                    else:
                        keep = param not in _SKIPPED_PARAMS
                    if not keep:
                        continue
                    word = form.findtext("wf")
                    seen = seen_forms.setdefault(saldo, set())
                    if word not in seen:
                        seen.add(word)
                        allforms.setdefault(saldo, []).append(word)

        # Done parsing section. Clear tree to save memory
        if elem.tag in ('LexicalEntry', 'frame', 'resFrame'):
            root.clear()

    return {"LEXICON": {}, "ALL_FORMS": allforms}


def _to_text(value):
    """Return value as text, decoding it from UTF-8 if it is a byte string."""
    if isinstance(value, bytes):
        return value.decode("utf-8")
    return value


def _parse_lus(lus_block):
    """Return the LU identifiers listed in a ";;"-separated block of LU lines.

    Each usable line is "<tag> <lu>, <lu>, ..."; the leading tag is ignored
    and lines without a space are skipped.
    """
    lus = []
    for lu_line in lus_block.split(_ITEM_SEPARATOR):
        lu_line = lu_line.strip()
        if " " not in lu_line:
            continue
        lus_raw = lu_line.split(" ", 1)[1]
        lus.extend(lu.strip() for lu in lus_raw.split(","))
    return lus


def _collect_examples(rows, all_forms):
    """Map (example, form) pairs to the list of LUs the form may belong to.

    rows      -- iterable of field lists as produced by read_csv.
    all_forms -- the "ALL_FORMS" mapping returned by read_saldom.
    """
    matches = {}
    for row in rows:
        examples = _to_text(row[_EXAMPLES_FIELD]).split(_ITEM_SEPARATOR)
        lus = _parse_lus(_to_text(row[_LUS_FIELD]))
        for example in examples:
            units = set(_LU_ANNOTATION_RE.findall(example))
            for lu in lus:
                for form in all_forms.get(lu, []):
                    if form in units:
                        matches.setdefault((example, form), []).append(lu)
                        # NOTE(review): only the first matching form of an LU
                        # is recorded per example -- confirm this is the
                        # intended scope of the original ``break``.
                        break
    return matches


def _mark_form(example, form):
    """Return example without its annotations and with form in brackets.

    All "]" characters and every "[<tag> " opener are removed first.  Then
    each occurrence of form that stands as a whole whitespace-delimited token
    is wrapped in square brackets.  Matching whole tokens keeps the form from
    being bracketed inside longer words, which is what the original TODO
    about "[i]n[i]t[i]era" appears to refer to.
    """
    plain = _ANNOTATION_OPEN_RE.sub("", example.replace("]", ""))
    # The form is data, not a pattern: escape it, and insert it through a
    # callable so backslashes in it are not read as group references.
    # Lookarounds leave the neighbouring whitespace unconsumed, so adjacent
    # repeated tokens are all marked.
    token = re.compile(r'(?<!\S)' + re.escape(form) + r'(?!\S)')
    return token.sub(lambda match: "[" + form + "]", plain).strip()


def _write_utf8_line(text):
    """Write text plus a newline to standard output as UTF-8 bytes."""
    # Python 3 exposes the byte stream as sys.stdout.buffer; on Python 2
    # sys.stdout accepts byte strings directly.
    stream = getattr(sys.stdout, "buffer", sys.stdout)
    stream.write(text.encode("utf-8") + b"\n")


def main():
    """Read rows from stdin and print every unambiguous example."""
    saldom = read_saldom()
    rows = read_csv(num_of_fields=_NUM_FIELDS)
    extra_examples = _collect_examples(rows, saldom["ALL_FORMS"])

    for (example, form), lus in extra_examples.items():
        # A pair that maps to more than one LU is ambiguous and is skipped.
        if len(lus) != 1:
            continue
        marked = _mark_form(example, form)
        _write_utf8_line("%s\t*\t*\t*\t*\t%s" % (lus[0], marked))


if __name__ == '__main__':
    main()