#!/usr/bin/env python
# -*- coding: utf8 -*-
import sys
import re
import codecs
from xml.etree.ElementTree import ElementTree
from xml.etree.ElementTree import fromstring
#############################
# OUTPUT #
#############################
def read_saldom(xml='lexikon/saldom/saldom.xml'):
    """Read the (sblex) XML version of SALDO's morphological lexicon.

    Returns a dict mapping (gf, pos) -> [saldo sense id, ...] for every
    LexicalEntry element in the file.  The file is streamed with
    iterparse and the tree is cleared as entries are consumed, so large
    lexicons do not have to fit in memory at once.
    """
    try:
        # cElementTree was removed in Python 3.9; the plain module is
        # C-accelerated on modern interpreters anyway.
        import xml.etree.cElementTree as cet
    except ImportError:
        import xml.etree.ElementTree as cet
    # "start" events are requested only so we can save a reference to the
    # root element (needed for root.clear() below).
    context = iter(cet.iterparse(xml, events=("start", "end")))
    event, root = next(context)  # next() works on Python 2.6+ and 3.x
    possibilities = {}  # in the form { (gf, pos): [saldo..1, saldo..2, ...] }
    for event, elem in context:
        if event == "end":
            if elem.tag == 'LexicalEntry':
                pos = elem.findtext("pos")
                gf = elem.findtext("gf")
                key = (gf, pos)
                if key not in possibilities:
                    possibilities[key] = []
                for sitem in elem.findall("saldo"):
                    possibilities[key].append(sitem.text)
            # Done parsing this section. Clear tree to save memory.
            if elem.tag in ['LexicalEntry', 'frame', 'resFrame']:
                root.clear()
    return possibilities
class LMF:
    """Top-level container for an LMF lexical resource (one Lexicon)."""

    def __init__(self, lang):
        self.lang = lang
        self.lexical_entries = []
        self._lexical_entries_set = set()
        self._le_senses = set()
        self.semantic_predicates = []

    def add_lexical_entry(self, lexical_entry):
        self.lexical_entries.append(lexical_entry)
        # Ugly hack to speed things up a bit (replace with something better in the future)
        self._lexical_entries_set.add(".".join([lexical_entry._pos, lexical_entry._wf]))

    def add_semantic_predicate(self, semantic_predicate):
        self.semantic_predicates.append(semantic_predicate)

    def __unicode__(self):
        # NOTE(review): the XML literals below were reconstructed -- the
        # original markup had been stripped from this source.  Verify the
        # header/DOCTYPE against a known-good LMF dump before relying on it.
        return "\n".join([
            '<?xml version="1.0" encoding="UTF-8"?>',
            '<!DOCTYPE LexicalResource SYSTEM "http://www.lexicalmarkupframework.org/DTD_LMF_REV_16.dtd">',
            '<LexicalResource dtdVersion="16">',
            '<GlobalInformation>',
            ' <feat att="languageCoding" val="ISO 639-3" />',
            '</GlobalInformation>',
            '<Lexicon>',
            ' <feat att="language" val="%s" />' % self.lang,
            "\n".join([unicode(e) for e in self.lexical_entries]),
            "\n".join([unicode(s) for s in self.semantic_predicates]),
            '</Lexicon>',
            '</LexicalResource>'])
class LexicalEntry:
    """One LMF LexicalEntry: a lemma plus its word forms, senses and features."""

    def __init__(self):
        self.features = []
        self.lemma = None
        self.wordforms = []
        self.senses = []
        self._pos = ""
        self._wf = ""
        self.idattr = ""  # optional id attribute for the opening tag

    def add_sense(self, sense):
        self.senses.append(sense)

    def add_feature(self, feature):
        self.features.append(feature)

    def add_feature_unique(self, feature):
        # Skip features already present (same att and val).
        for existing_feature in self.features:
            if existing_feature.att == feature.att and existing_feature.val == feature.val:
                return
        self.add_feature(feature)

    def add_wordform(self, wordform):
        self.wordforms.append(wordform)

    def __unicode__(self):
        # NOTE(review): tags reconstructed -- the original string literals
        # were stripped (the old code did '' % self.idattr, a TypeError).
        if self.idattr:
            le_opening = '<LexicalEntry id="%s">' % (self.idattr)
        else:
            le_opening = '<LexicalEntry>'
        return "\n".join([
            le_opening,
            '\n'.join([unicode(f) for f in self.features]),
            unicode(self.lemma),
            '\n'.join([unicode(w) for w in self.wordforms]),
            '\n'.join([unicode(s) for s in self.senses]),
            '</LexicalEntry>'])
"""
class Lemma:
def __init__(self):
self.features = [] # now including writtenForm and partOfSpeech!
def add_feature(self, feature):
self.features.append(feature)
def add_feature_unique(self, feature):
for existing_feature in self.features:
if(existing_feature.att == feature.att and existing_feature.val == feature.val):
return
self.add_feature(feature)
def __unicode__(self):
if self.features:
return "\n".join(['\n',
'\n'.join([unicode(f) for f in self.features]),
'\n'])
else:
return ''
"""
class Lemma:
    """LMF Lemma element; renders its FormRepresentation children."""

    def __init__(self):
        self.form_representations = []
        self.features = []  # now including writtenForm and partOfSpeech!

    def add_feature(self, feature):
        self.features.append(feature)

    def add_feature_unique(self, feature):
        # Skip features already present (same att and val).
        for existing_feature in self.features:
            if existing_feature.att == feature.att and existing_feature.val == feature.val:
                return
        self.add_feature(feature)

    def add_form_representation(self, form_representation):
        self.form_representations.append(form_representation)

    def __unicode__(self):
        # NOTE(review): tags reconstructed -- original markup was stripped.
        # Only form representations are serialized; bare features are not
        # (this matches the original control flow).
        if self.features or self.form_representations:
            return "\n".join(['<Lemma>',
                              '\n'.join(unicode(fr) for fr in self.form_representations),
                              '</Lemma>'])
        else:
            return ''
class WordForm:
    """LMF WordForm: one inflected form, described entirely by feat children."""

    def __init__(self):
        self.features = []

    def add_feature(self, feature):
        self.features.append(feature)

    def __unicode__(self):
        # NOTE(review): tags reconstructed -- original markup was stripped.
        return "\n".join(['<WordForm>',
                          '\n'.join([unicode(f) for f in self.features]),
                          '</WordForm>'])
class FormRepresentation:
    """LMF FormRepresentation: one orthographic variant of a Lemma."""

    def __init__(self):
        self.features = []

    def add_feature(self, feature):
        self.features.append(feature)

    def add_feature_unique(self, feature):
        # Skip features already present (same att and val).
        for existing_feature in self.features:
            if existing_feature.att == feature.att and existing_feature.val == feature.val:
                return
        self.add_feature(feature)

    def __unicode__(self):
        # NOTE(review): tags reconstructed -- original markup was stripped.
        if self.features:
            return "\n".join(['<FormRepresentation>',
                              '\n'.join([unicode(f) for f in self.features]),
                              '</FormRepresentation>'])
        else:
            return ''
class Feature:
    """A single LMF feat element: an attribute/value pair."""

    def __init__(self, att, val):
        self.att = att
        self.val = val

    def __unicode__(self):
        # NOTE(review): tag reconstructed -- the original literal was
        # stripped ('' % (att, val), a TypeError).  The value is XML-escaped;
        # the attribute name is assumed to be safe.
        return '<feat att="%s" val="%s" />' % (self.att, escape(self.val))
class Sense:
    """LMF Sense, identified by a sense id, with optional child elements."""

    def __init__(self, sense):
        self.sense = sense  # the sense id string
        self.relations = []
        self.predicative_representations = []
        self.sense_examples = []
        self.features = []

    def add_feature(self, feature):
        self.features.append(feature)

    def add_sense_relation(self, sense_relation):
        self.relations.append(sense_relation)

    def add_predicative_representation(self, predicative_representation):
        self.predicative_representations.append(predicative_representation)

    def add_sense_example(self, sense_example):
        self.sense_examples.append(sense_example)

    def __unicode__(self):
        # NOTE(review): tags reconstructed -- original markup was stripped.
        if not self.relations and not self.predicative_representations and not self.sense_examples and not self.features:
            # No children: emit a self-closing element.
            return '<Sense id="%s" />' % (self.sense)
        else:
            return "\n".join(['<Sense id="%s">' % (self.sense),
                              "\n".join([unicode(pre) for pre in self.predicative_representations]),
                              "\n".join([unicode(rel) for rel in self.relations]),
                              "\n".join([unicode(ex) for ex in self.sense_examples]),
                              "\n".join([unicode(f) for f in self.features]),
                              '</Sense>'
                              ])
class SenseRelation:
    """LMF SenseRelation pointing at a target sense, labelled by relation types."""

    def __init__(self, target, relation_types):
        self.target = target
        self.relation_types = relation_types
        self.features = []

    def add_feature(self, feature):
        self.features.append(feature)

    def __unicode__(self):
        # NOTE(review): tags reconstructed -- original markup was stripped;
        # the att name used for relation types is a best guess -- confirm
        # against a known-good LMF dump.
        return "\n".join(['<SenseRelation targets="%s">' % (self.target),
                          '\n'.join(['<feat att="label" val="%s" />' % t for t in self.relation_types]),
                          '\n'.join([unicode(f) for f in self.features]),
                          '</SenseRelation>'
                          ])
class SenseExample:
    """LMF SenseExample: an example string plus descriptive features."""

    def __init__(self, example):
        self.example = example
        self.features = []

    def add_feature(self, feature):
        self.features.append(feature)

    def __unicode__(self):
        # NOTE(review): tags reconstructed -- original markup was stripped;
        # the att name for the example text is a best guess -- confirm.
        return "\n".join([
            '<SenseExample>',
            '<feat att="text" val="%s" />' % (escape(self.example)),
            "\n".join([unicode(f) for f in self.features]),
            '</SenseExample>'
            ])
class SemanticPredicate:
    """LMF SemanticPredicate: id, semantic types, arguments and features."""

    def __init__(self, id, domain, semantic_types):
        self.id = id
        self.semantic_types = semantic_types
        self.semantic_arguments = []
        self.features = []
        # A non-empty domain is stored as an ordinary feature.
        if domain is not None and domain != "":
            self.add_feature(Feature("domain", domain))

    def add_semantic_argument(self, argument):
        self.semantic_arguments.append(argument)

    def add_feature(self, feature):
        self.features.append(feature)

    def generateFeatures(self, att, vals):
        # BUGFIX: was val.unicodeip() -- an artifact of a bad global
        # str->unicode replace mangling str.strip().
        for val in vals:
            self.add_feature(Feature(att, val.strip()))

    def __unicode__(self):
        # NOTE(review): tags reconstructed -- original markup was stripped.
        # The original also built an unused `extras` string here; that dead
        # code has been removed.
        return "\n".join([
            '<SemanticPredicate id="%s">' % (self.id),
            "\n".join(['<feat att="semanticType" val="%s" />' % (st) for st in self.semantic_types]),
            "\n".join([unicode(fe) for fe in self.features]),
            "\n".join([unicode(sa) for sa in self.semantic_arguments]),
            '</SemanticPredicate>'
            ])
class SemanticArgument:
    """LMF SemanticArgument: a semantic role with a core type."""

    def __init__(self, semantic_role, core_type):
        self.semantic_role = semantic_role
        self.core_type = core_type

    def __unicode__(self):
        # NOTE(review): tag reconstructed -- the original literal was
        # stripped ('' % (role, core_type), a TypeError).
        return '<SemanticArgument semanticRole="%s" coreType="%s" />' % (self.semantic_role, self.core_type)
class PredicativeRepresentation:
    """LMF PredicativeRepresentation linking a sense to a semantic predicate."""

    def __init__(self, idref):
        self.idref = idref

    def __unicode__(self):
        # NOTE(review): tag reconstructed -- original markup was stripped.
        # The original format string took idref twice ('' % (idref, idref)),
        # so both attributes point at the same predicate id -- confirm the
        # attribute names against a known-good dump.
        return '<PredicativeRepresentation predicate="%s" correspondences="%s" />' % (self.idref, self.idref)
def escape(s):
    """XML-escape s for use inside a double-quoted attribute value.

    '&' must be replaced first so already-produced entities are not
    double-escaped.  (The original replacements had been stripped to
    no-ops like replace('&', '&') by whatever mangled this file.)
    """
    s = s.replace('&', '&amp;')
    s = s.replace("'", '&apos;')
    s = s.replace('<', '&lt;')
    s = s.replace('>', '&gt;')
    return s.replace('"', '&quot;')
def escapeContent(s):
    """XML-escape s for use as element text content (quotes need no escaping).

    '&' is replaced first so the entities introduced for < and > are not
    double-escaped.
    """
    s = s.replace('&', '&amp;')
    s = s.replace('<', '&lt;')
    s = s.replace('>', '&gt;')
    return s
#############################
# INPUT #
#############################
def take(entry, tagname, resort):
    """Return the stripped text of entry's first <tagname> child, or resort.

    Falls back to resort when the child element is missing or has no text.
    """
    child = entry.find(tagname)
    if child is None:
        return resort
    text = child.text
    if text is None:
        return resort
    # BUGFIX: was text.unicodeip() -- an artifact of a bad global
    # str->unicode replace mangling str.strip().
    return text.strip()
def list_to_dict(alist):
    """Build a dict from the first two items of each element of alist.

    Later elements win on duplicate keys (same as the original loop);
    elements may be longer than 2 -- anything past index 1 is ignored.
    The original shadowed the builtin `dict` with a local name.
    """
    return dict((item[0], item[1]) for item in alist)
# Lexin part-of-speech label -> SALDO pos tag for single-word entries.
posHash = {
    u"subst." : u"nn",
    u"prep." : u"pp",
    u"pron." : u"pn",
    u"förk." : u"nna",
    u"subst. plural" : u"nn",
    u"verb" : u"vb",
    u"subst., ingen böjning" : u"nn",
    u"adj." : u"av",
    u"adv." : u"ab",
    u"interj." : u"in",
    u"räkn." : u"nl",
    u"förled" : u"",
    u"namn" : u"pm",
    u"adj., ingen böjning" : u"av",
    u"subst. bestämd form singular" : u"nn",
    u"konj." : u"kn"
}
# Same mapping but for multi-word entries (SALDO multi-word tags end in "m").
multiPosHash = {
    u"subst." : u"nnm",
    u"prep." : u"ppm",
    u"pron." : u"pnm",
    u"förk." : u"nna",
    u"subst. plural" : u"nnm",
    u"verb" : u"vbm",
    u"subst., ingen böjning" : u"nnm",
    u"adj." : u"avm",
    u"adv." : u"abm",
    u"interj." : u"inm",
    u"räkn." : u"nlm",
    u"förled" : u"",
    u"namn" : u"pmm",
    u"adj., ingen böjning" : u"avm",
    u"subst. bestämd form singular" : u"nnm",
    u"konj." : u"knm"
}
# Lexin inflection form label -> SALDO-style msd string.
# Entries marked "?" were uncertain already in the original source.
lexinInflectionToMsd = {
    u"best.f.sing." : u"sg def nom",
    u"obest.f.pl." : u"pl indef nom",
    u"best.f.pl." : u"pl def nom",
    u"tform" : u"pos indef sg n nom", # ?
    u"aform" : u"pos indef pl nom", # ?
    u"infinitiv" : u"inf aktiv",
    u"imperfekt" : u"pret ind aktiv",
    u"supinum" : u"sup aktiv",
    u"perf.part." : u"",
    u"imperativ" : u"imper"
}
def lexinPosToSaldoPos(inpos, inword):
    """Map a Lexin pos label to a (saldo_pos, extra) pair.

    Multi-word entries (inword contains a space) use the multi-word tag
    set (nnm, vbm, ...).  `extra` carries inflectional information that
    the plain pos tag cannot express, or None.  `pos` is None for
    unknown labels.
    """
    if " " in inword:
        pos = multiPosHash.get(inpos)
    else:
        pos = posHash.get(inpos)
    if inpos == u"subst. plural":
        extra = u"plural"
    elif inpos in (u"subst., ingen böjning", u"adj., ingen böjning"):
        extra = u"oböjl"
    elif inpos == u"subst. bestämd form singular":
        extra = u"sing best"
    elif inpos == u"förled":
        extra = u"förled"
    else:
        extra = None
    return (pos, extra)
if __name__ == '__main__':
    # Convert Lexin's Swedish XML into LMF, linking entries to SALDO senses.
    # NOTE(review): `global` at module level is a no-op; kept as-is.
    global mode
    global language
    global saldom
    # (writtenForm, pos) -> [saldo sense ids]; used for saldoLink features below.
    saldom = read_saldom()
    tree = ElementTree()
    tree.parse("orig_lexin/svenska4/swe_swe.xml")
    lmf = LMF('swe')
    # How many senses have been generated per baseform, to number sense ids.
    usedSensesCount = {}
    entries = tree.findall("Article")
    for entry in entries:
        e = LexicalEntry()
        lexin_lemma = entry.find("Lemma")
        saldo_link_set = set()  # NOTE(review): assigned but never used
        lemma = Lemma()
        e.lemma = lemma
        fr = FormRepresentation()
        lemma.add_form_representation(fr)
        # The raw form may contain '|' compound-boundary markers.
        fr.add_feature(Feature("rawForm", lexin_lemma.get("Value")))
        lexin_pos = lexin_lemma.get("Type")
        pos = None
        if lexin_pos != None:
            fr.add_feature(Feature("lexinPartOfSpeech", lexin_pos))
            # Only the saldo pos tag (first element of the pair) is used here.
            pos = lexinPosToSaldoPos(lexin_pos, lexin_lemma.get("Value"))[0]
            if pos != None:
                fr.add_feature(Feature("partOfSpeech", pos))
        infinitiv = None
        for lexin_inflection in lexin_lemma.findall("Inflection"):
            wordform = WordForm()
            lexin_text = lexin_inflection.text
            wordform.add_feature(Feature("writtenForm", lexin_text))
            lexin_form = lexin_inflection.get("Form")
            if lexin_form != None:
                if lexin_form == "infinitiv":
                    # Remember the infinitive: verbs are looked up by it below.
                    infinitiv = lexin_text
                msd = lexinInflectionToMsd.get(lexin_form,"")
                wordform.add_feature(Feature("msd", msd))
                wordform.add_feature(Feature("lexinForm", lexin_form))
            if lexin_inflection.get("Spec") != None:
                wordform.add_feature(Feature("lexinSpec", lexin_inflection.get("Spec")))
            e.add_wordform(wordform)
        writtenForm = None
        if (pos == "vb" or pos == "vbm") and infinitiv != None:
            # we have to find the infinitive since Lexin is based on present tense
            writtenForm = infinitiv
        else:
            writtenForm = lexin_lemma.get("Value").replace("|", "")
        fr.add_feature(Feature("writtenForm", writtenForm))
        if pos != None and writtenForm != None:
            # Link to every saldo sense with the same written form and pos.
            possible_saldo_senses = saldom.get((writtenForm, pos),[])
            #if len(possible_saldo_senses) == 0:
            #    print writtenForm + " (" + pos + ")"
            for pss in possible_saldo_senses:
                e.add_feature(Feature("saldoLink", pss))
        lexin_phonetic = lexin_lemma.find("Phonetic")
        if lexin_phonetic != None:
            fr.add_feature(Feature("phoneticForm", lexin_phonetic.text))
        if lexin_lemma.get("Hyphenate", "") != "":
            fr.add_feature(Feature("hyphenatedForm", lexin_lemma.get("Hyphenate")))
        if lexin_lemma.get("Rank", "") != "":
            fr.add_feature(Feature("rank", lexin_lemma.get("Rank")))
        if lexin_lemma.get("ID", "") != "":
            fr.add_feature(Feature("lexinID", lexin_lemma.get("ID")))
        if lexin_lemma.get("Variant", "") != "":
            fr.add_feature(Feature("lexinVariant", lexin_lemma.get("Variant")))
        for lexin_reference in lexin_lemma.findall("Reference"):
            reftype = lexin_reference.get("Type")
            if reftype == "see":
                e.add_feature(Feature("see", lexin_reference.get("Value")))
            elif reftype == "compare":
                e.add_feature(Feature("compareWith", lexin_reference.get("Value")))
        for lexin_lexeme in lexin_lemma.findall("Lexeme"):
            baseform = lexin_lemma.get("Value").replace("|", "")
            # Sense ids are numbered per baseform: lexin--<baseform>..<n>
            index = usedSensesCount.get(baseform, 0) + 1
            #if not index.isdigit():
            #    index = "1"
            #if usedSensesCount.get(baseform, 0) >= index:
            #    index = usedSensesCount.get(baseform, 0) + 1
            sense = Sense("lexin--" + baseform + ".." + str(index))
            lexin_lexemeno = lexin_lexeme.get("Lexemeno")
            lexin_variantID = lexin_lemma.get("VariantID","")
            sense.add_feature(Feature("lexinVariantID", lexin_variantID))
            # NOTE(review): raises AttributeError if Lexemeno is absent (None);
            # presumably it is always present in the input -- confirm.
            if not lexin_lexemeno.isdigit():
                lexin_lexemeno = "1"
            sense.add_feature(Feature("lexinLexemeNumber", lexin_lexemeno))
            if baseform in usedSensesCount:
                usedSensesCount[baseform] = usedSensesCount[baseform] + 1
            else:
                usedSensesCount[baseform] = 1
            if lexin_lexeme.get("ID", None) != None:
                sense.add_feature(Feature("lexinID", lexin_lexeme.get("ID")))
            if lexin_lexeme.find("Definition") != None:
                sense.add_feature(Feature("definition", lexin_lexeme.find("Definition").text))
            for lexin_example in lexin_lexeme.findall("Example"):
                sense_example = SenseExample(lexin_example.text)
                sense_example.add_feature(Feature("type", "example"))
                sense.add_sense_example(sense_example)
            for lexin_idiom in lexin_lexeme.findall("Idiom"):
                sense_example = SenseExample(lexin_idiom.text)
                sense_example.add_feature(Feature("type", "idiom"))
                sense.add_sense_example(sense_example)
            for lexin_compound in lexin_lexeme.findall("Compound"):
                sense_example = SenseExample(lexin_compound.text)
                sense_example.add_feature(Feature("type", "compound"))
                sense.add_sense_example(sense_example)
            for lexin_comment in lexin_lexeme.findall("Comment"):
                comment_type = lexin_comment.get("Type")
                if comment_type == "style":
                    sense.add_feature(Feature("usg", lexin_comment.text))
                elif comment_type == "def":
                    sense.add_feature(Feature("desc", lexin_comment.text))
            for lexin_reference in lexin_lexeme.findall("Reference"):
                reftype = lexin_reference.get("Type")
                if reftype == "see":
                    sense.add_feature(Feature("see", lexin_reference.get("Value")))
                elif reftype == "compare":
                    sense.add_feature(Feature("compareWith", lexin_reference.get("Value")))
                elif reftype == "antonym":
                    sense.add_feature(Feature("antonym", lexin_reference.get("Value")))
            for lexin_gramcom in lexin_lexeme.findall("Gramcom"):
                sense.add_feature(Feature("gram", lexin_gramcom.text))
            for lexin_graminfo in lexin_lexeme.findall("Graminfo"):
                sense.add_feature(Feature("gram", lexin_graminfo.text))
            for lexin_theme in lexin_lexeme.findall("Theme"):
                # Tema is a comma-separated list; keep only the part before ':'.
                for theme_string in lexin_theme.get("Tema").split(","):
                    sense.add_feature(Feature("lexinTheme", theme_string.split(":")[0].strip()))
            e.add_sense(sense)
        lmf.add_lexical_entry(e)
    # Python 2 print statement; the whole resource is serialized via __unicode__.
    print unicode(lmf).encode("utf-8")