#!/usr/bin/env python
# -*- coding: utf8 -*-
"""Convert the constructicon XML source into an LMF document.

Run as a script, this module parses ``../konstruktikon-data/constructicon.xml``,
turns every ``<entry>`` into a :class:`LexicalEntry` holding one
:class:`Sense`, and writes the rendered document to standard output encoded
as UTF-8.

Every output class renders itself through ``__unicode__``; the module is
written so that it can be imported under both Python 2 and Python 3.
"""

import sys
import re
import codecs
from xml.etree.ElementTree import ElementTree
from xml.etree.ElementTree import fromstring

TODO = "todo"

try:
    _text_type = unicode
except NameError:  # Python 3 has no separate ``unicode`` type
    _text_type = str

# NOTE(review): the copy of this file under review had the XML markup missing
# from every string literal (the format strings were empty while still being
# given arguments).  The templates below are a reconstruction: the number and
# order of placeholders follow the code, but the element and attribute names,
# the namespace declaration and the indentation are assumptions -- confirm
# them against the LMF DTD / a known-good output file before relying on them.
_DOCUMENT_HEAD = [
    '<?xml version="1.0" encoding="UTF-8"?>',
    '<!DOCTYPE LexicalResource SYSTEM "DTD_LMF_REV_16.dtd">',
    '<LexicalResource dtdVersion="16" xmlns:karp="http://spraakbanken.gu.se/eng/research/infrastructure/karp/karp">',
    '<GlobalInformation>',
    '  <feat att="languageCoding" val="ISO 639-3"/>',
    '</GlobalInformation>',
    '<Lexicon>',
]
_LANGUAGE_FEAT = '  <feat att="language" val="%s"/>'
_DOCUMENT_TAIL = ['</Lexicon>', '</LexicalResource>']

_ENTRY_OPEN = '<LexicalEntry>'
_ENTRY_OPEN_WITH_ID = '<LexicalEntry id="%s">'
_ENTRY_LEMMA = '<Lemma></Lemma>'
_ENTRY_CLOSE = '</LexicalEntry>'

_FEAT = '<feat att="%s" val="%s"/>'

_SENSE_OPEN = '<Sense id="%s">'
_SENSE_CLOSE = '</Sense>'

_EXAMPLE_WRAPPER = '<karp:example>%s</karp:example>'
_DEFINITION_WRAPPER = '<karp:definition>%s</karp:definition>'
_PART_FREETEXT = '<karp:text n="%s">%s</karp:text>'
_PART_ELEMENT = '<karp:e name="%s" n="%s">%s</karp:e>'

_INTERNAL_ELEMENT = '<karp:int_const_elem type="%s" %s />'
_EXTERNAL_ELEMENT = '<karp:ext_const_elem type="%s" %s />'


def _render(node):
    """Return the text form of *node*.

    Objects providing ``__unicode__`` are rendered through it; anything else
    is converted with the interpreter's text type.
    """
    render = getattr(node, "__unicode__", None)
    if render is not None:
        return render()
    return _text_type(node)


def _render_lines(nodes):
    """Render every node in *nodes* and join the results with newlines."""
    return "\n".join([_render(node) for node in nodes])


def _render_parts(parts):
    """Render a list of example/definition parts into one string.

    Each part is a tuple whose first item is ``"/freetext"``, ``"/leaf"`` or
    ``"/branch"``.  Parts are numbered by their position in *parts*; a branch
    renders its nested part list recursively with its own numbering.  Parts
    with any other marker produce no output but still consume a number.
    """
    out = []
    for position, part in enumerate(parts):
        kind = part[0]
        if kind == "/freetext":
            out.append(_PART_FREETEXT % (position, escapeContent(part[1])))
        elif kind == "/leaf":
            out.append(_PART_ELEMENT % (part[1].get("name", ""), position,
                                        escapeContent(part[2])))
        elif kind == "/branch":
            out.append(_PART_ELEMENT % (part[1].get("name", ""), position,
                                        _render_parts(part[2])))
    return "".join(out)


def _render_attributes(attributes):
    """Render a dict as space-separated ``key="value"`` pairs, values escaped."""
    return " ".join(['%s="%s"' % (key, escape(attributes[key]))
                     for key in attributes])


#############################
#          OUTPUT           #
#############################

class LMF(object):
    """Top-level document: a language code plus a list of lexical entries."""

    def __init__(self, lang):
        self.lang = lang
        self.lexical_entries = []
        self._lexical_entries_set = set()
        self._le_senses = set()

    def add_lexical_entry(self, lexical_entry):
        """Append *lexical_entry* and record its ``pos.wf`` key."""
        self.lexical_entries.append(lexical_entry)
        self._lexical_entries_set.add(
            ".".join([lexical_entry._pos, lexical_entry._wf]))

    def __unicode__(self):
        lines = list(_DOCUMENT_HEAD)
        lines.append(_LANGUAGE_FEAT % self.lang)
        lines.append(_render_lines(self.lexical_entries))
        lines.extend(_DOCUMENT_TAIL)
        return "\n".join(lines)


class LexicalEntry(object):
    """One lexical entry: its senses followed by its own features."""

    def __init__(self):
        self.features = []
        self.senses = []
        self._pos = ""
        self._wf = ""
        self.idattr = ""

    def add_sense(self, sense):
        self.senses.append(sense)

    def add_feature(self, feature):
        self.features.append(feature)

    def __unicode__(self):
        # The id attribute is only written when one has been set.
        if self.idattr:
            opening = _ENTRY_OPEN_WITH_ID % (self.idattr)
        else:
            opening = _ENTRY_OPEN
        return "\n".join([
            opening,
            _ENTRY_LEMMA,
            _render_lines(self.senses),
            _render_lines(self.features),
            _ENTRY_CLOSE])


class Feature(object):
    """An attribute/value pair; the value is escaped when rendered."""

    def __init__(self, att, val):
        self.att = att
        self.val = val

    def __unicode__(self):
        return _FEAT % (self.att, escape(self.val))


class Sense(object):
    """A sense with relations, examples, definitions, elements and features."""

    def __init__(self, sense):
        self.sense = sense
        self.relations = []
        self.sense_examples = []
        self.konst_examples = []
        self.konst_definitions = []
        self.features = []
        self.int_const_elems = []
        self.ext_const_elems = []

    def add_int_const_elem(self, const_elem):
        self.int_const_elems.append(const_elem)

    def add_ext_const_elem(self, const_elem):
        self.ext_const_elems.append(const_elem)

    def add_sense_relation(self, sense_relation):
        self.relations.append(sense_relation)

    def add_sense_example(self, sense_example):
        self.sense_examples.append(sense_example)

    def add_konst_example(self, konst_example):
        self.konst_examples.append(konst_example)

    def add_konst_definition(self, konst_definition):
        self.konst_definitions.append(konst_definition)

    def add_feature(self, feature):
        self.features.append(feature)

    def __unicode__(self):
        # Each group is emitted even when empty, which leaves a blank line.
        return "\n".join([
            _SENSE_OPEN % (self.sense),
            _render_lines(self.relations),
            _render_lines(self.sense_examples),
            _render_lines(self.konst_examples),
            _render_lines(self.konst_definitions),
            _render_lines(self.int_const_elems),
            _render_lines(self.ext_const_elems),
            _render_lines(self.features),
            _SENSE_CLOSE])


class KonstExample(object):
    """An example built from the part list produced by ``handle_example``."""

    def __init__(self, example_parts):
        self.example_parts = example_parts

    def __unicode__(self):
        return _EXAMPLE_WRAPPER % (self.unicodeRec())

    def unicodeRec(self):
        """Return the rendered parts without the surrounding wrapper."""
        return _render_parts(self.example_parts)


class KonstDefinition(object):
    """A definition built from a list of freetext/leaf/branch parts."""

    def __init__(self, definition_parts):
        self.definition_parts = definition_parts

    def __unicode__(self):
        return _DEFINITION_WRAPPER % (self.unicodeRec())

    def unicodeRec(self):
        """Return the rendered parts without the surrounding wrapper."""
        return _render_parts(self.definition_parts)


class KonstInternalConstructionElement(object):
    """An internal construction element: a type plus a dict of attributes."""

    def __init__(self, elemtype, attributes_dict):
        self.attributes = attributes_dict
        self.elemtype = elemtype

    def __unicode__(self):
        return _INTERNAL_ELEMENT % (self.elemtype,
                                    _render_attributes(self.attributes))


class KonstExternalConstructionElement(object):
    """An external construction element: a type plus a dict of attributes."""

    def __init__(self, elemtype, attributes_dict):
        self.attributes = attributes_dict
        self.elemtype = elemtype

    def __unicode__(self):
        return _EXTERNAL_ELEMENT % (self.elemtype,
                                    _render_attributes(self.attributes))


def escape(s):
    """Escape *s* for use inside a double-quoted XML attribute value.

    ``&`` is replaced first so that the entities inserted afterwards are not
    escaped a second time.
    """
    s = s.replace('&', '&amp;')
    s = s.replace("'", '&apos;')
    s = s.replace('<', '&lt;')
    s = s.replace('>', '&gt;')
    s = s.replace('[NI]', '(NI)')  # special for konstruktikon
    s = s.replace('[DNI]', '(DNI)')  # special for konstruktikon
    return s.replace('"', '&quot;')


def escapeContent(s):
    """Escape *s* for use as XML element content (quotes are left alone)."""
    s = s.replace('&', '&amp;')
    s = s.replace('<', '&lt;')
    s = s.replace('>', '&gt;')
    s = s.replace('[NI]', '(NI)')  # special for konstruktikon
    s = s.replace('[DNI]', '(DNI)')  # special for konstruktikon
    return s


#############################
#           INPUT           #
#############################

def handle_example(example):
    """Flatten an example element into a list of part tuples.

    Returns a list containing, in document order:

    * ``("/freetext", text)`` for non-empty text before, between or after
      child elements,
    * ``("/leaf", attrib, text)`` for a child without direct ``<e>`` children
      (``text`` is ``""`` when the child has no text),
    * ``("/branch", attrib, parts)`` for a child with direct ``<e>`` children,
      where ``parts`` is the recursive result for that child.
    """
    example_parts = []
    leading = (example.text or "").strip()
    if leading:
        example_parts.append(("/freetext", leading))
    for part in example:
        if part.findall("e"):
            example_parts.append(
                ("/branch", part.attrib, handle_example(part)))
        else:
            # An empty element has text None; treat it as an empty leaf.
            example_parts.append(
                ("/leaf", part.attrib, (part.text or "").strip()))
        trailing = (part.tail or "").strip()
        if trailing:
            example_parts.append(("/freetext", trailing))
    return example_parts


def take(entry, tagname, resort):
    """Return the stripped text of the first *tagname* child of *entry*.

    *resort* is returned when the child is missing or has no text.
    """
    # Compare with None explicitly: an Element without children is falsy.
    child = entry.find(tagname)
    if child is None or child.text is None:
        return resort
    return child.text.strip()


def list_to_dict(alist):
    """Build a dict from a sequence of items, mapping item[0] to item[1]."""
    return {item[0]: item[1] for item in alist}


def _parse_args(argv):
    """Return ``(mode, language)`` read from the command line *argv*.

    ``mode`` is 1 when the first argument is ``"simplified"`` and 0 otherwise.
    ``language`` is 0 when the second argument is ``"english"`` or absent and
    1 otherwise.  Both default to 0 when no arguments are given.
    """
    mode = 0
    language = 0  # english
    if len(argv) >= 2 and argv[1] == "simplified":
        mode = 1
    if len(argv) >= 3 and argv[2] != "english":
        language = 1
    return mode, language


def _definition_parts(definition):
    """Return the part list for a ``<definition>`` element (or None)."""
    parts = []
    if definition is None:
        return parts
    if definition.text is not None:
        leading = definition.text.strip()
        if leading != "":
            parts.append(("/freetext", leading))
    for atom in definition:
        if atom.tag == TODO:
            # Skips the element together with the text that follows it.
            continue
        if atom.text is not None:
            parts.append(
                ("/leaf", list_to_dict(atom.items()), atom.text.strip()))
        trailing = atom.tail
        if trailing is not None:
            trailing = trailing.strip()
            if trailing != "":
                parts.append(("/freetext", trailing))
    return parts


def _add_text_feature(sense, entry, tagname):
    """Add a feature named *tagname* when the entry has non-empty text for it."""
    value = take(entry, tagname, "")
    if value != "":
        sense.add_feature(Feature(tagname, value))


def _add_cee_and_coll(sense, entry):
    """Add ``cee`` and ``coll`` features; return the set of saldo links found."""
    saldo_links = set()

    cee = take(entry, "cee", "")
    for token in cee.split():
        sense.add_feature(Feature("cee", token))
        # Also add a saldo-link (if its a saldo sense id)
        if token.count(".") == 2:
            saldo_links.add(token)

    coll = take(entry, "coll", "")
    if coll != "":
        # first take all this kind: {apa: bpa}
        coll_parts = re.findall(r'\{.*?\}', coll)
        # then delete them and keep the remaining whitespace-separated units
        coll_parts.extend(re.sub(r'\{.*?\}', "", coll).split())
        for unit in coll_parts:
            sense.add_feature(Feature("coll", unit))
            saldo_links.update(re.findall(r'\w+\.\.\d+', unit, re.U))

    return saldo_links


def _add_construction_elements(container, element_class, add_element):
    """Add a construction element for every ``role``/``cat`` child of *container*.

    Children with any other tag (including ``todo``) are skipped so that no
    empty placeholder ends up in the output.
    """
    if container is None:
        return
    for child in container:
        if child.tag in ("role", "cat"):
            add_element(element_class(child.tag, list_to_dict(child.items())))


def _convert_entry(entry):
    """Build the :class:`LexicalEntry` for one ``<entry>`` element."""
    lexical_entry = LexicalEntry()
    the_id = entry.get("{http://www.w3.org/XML/1998/namespace}id")
    sense = Sense("konstruktikon--" + the_id)
    lexical_entry.add_sense(sense)

    for tagname in ("type", "cat", "inheritance", "evokes"):
        _add_text_feature(sense, entry, tagname)

    def_parts = _definition_parts(entry.find("definition"))
    if def_parts:
        sense.add_konst_definition(KonstDefinition(def_parts))

    _add_text_feature(sense, entry, "structure")

    # Sorted so that the output does not depend on set iteration order.
    for link in sorted(_add_cee_and_coll(sense, entry)):
        lexical_entry.add_feature(Feature("saldoLink", link))

    c_e = entry.find("construction_elements")
    if c_e is not None:
        _add_construction_elements(
            c_e.find("internal"), KonstInternalConstructionElement,
            sense.add_int_const_elem)
        _add_construction_elements(
            c_e.find("external"), KonstExternalConstructionElement,
            sense.add_ext_const_elem)

    examples = entry.find("examples")
    if examples is not None:
        for example in examples:
            if example.tag == TODO:
                continue
            sense.add_konst_example(KonstExample(handle_example(example)))

    _add_text_feature(sense, entry, "comment")
    _add_text_feature(sense, entry, "reference")
    return lexical_entry


def main():
    """Convert the constructicon file and write the result to stdout."""
    tree = ElementTree()
    tree.parse("../konstruktikon-data/constructicon.xml")
    lmf = LMF('swe')
    for entry in tree.findall("entry"):
        lmf.add_lexical_entry(_convert_entry(entry))

    # Write bytes: Python 3 exposes the byte stream as sys.stdout.buffer,
    # Python 2's sys.stdout accepts them directly.
    stream = getattr(sys.stdout, "buffer", sys.stdout)
    stream.write(_render(lmf).encode("utf-8") + b"\n")


if __name__ == '__main__':
    # NOTE(review): mode and language are not read anywhere in this file.
    mode, language = _parse_args(sys.argv)
    main()