#!/usr/bin/env python
# -*- coding: utf8 -*-
import sys
import re
import codecs
import os
import glob
from xml.etree.ElementTree import ElementTree
from xml.etree.ElementTree import fromstring
def read_saldom(xml='lexikon/saldom/saldom.xml'):
    """Read the (sblex) XML version of SALDO's morphological lexicon.

    Parses the file incrementally (it is large) and returns a dict
    mapping (gf, pos) -> [saldo sense id, ...], e.g.
    ('hund', 'nn') -> ['hund..1'].
    """
    # Use the plain ElementTree iterparse: xml.etree.cElementTree is
    # deprecated and was removed in Python 3.9; the regular module is
    # C-accelerated where available.
    from xml.etree.ElementTree import iterparse
    # "start" events are needed so we can grab a reference to the root
    # element; clearing it periodically keeps memory use bounded.
    context = iter(iterparse(xml, events=("start", "end")))
    event, root = next(context)  # next() works on Python 2.6+ and 3.x
    possibilities = {}  # { (gf, pos): [saldo..1, saldo..2, ...] }
    for event, elem in context:
        if event == "end":
            if elem.tag == 'LexicalEntry':
                pos = elem.findtext("pos")
                gf = elem.findtext("gf")
                senses = possibilities.setdefault((gf, pos), [])
                for sitem in elem.findall("saldo"):
                    senses.append(sitem.text)
            # Done parsing section. Clear tree to save memory.
            if elem.tag in ['LexicalEntry', 'frame', 'resFrame']:
                root.clear()
    return possibilities
def read_languages():
output = {}
all_languages = [("russian", "orig_lexin/ryska"),
("albanian", "orig_lexin/albanska"),
("arabic", "orig_lexin/arabic"),
("bosnic", "orig_lexin/bosniska"),
("english", "orig_lexin/engelska"),
("finnish", "orig_lexin/finska"),
("greek", "orig_lexin/grekiska"),
("croatian", "orig_lexin/kroatiska"),
("northKurdish", "orig_lexin/nordkurdiska"),
("farsi", "orig_lexin/persiska"),
("serbian", "orig_lexin/serbiska"),
("serbianCyrillic", "orig_lexin/serbiska_kyrillisk"),
("somali", "orig_lexin/somaliska"),
("spanish", "orig_lexin/spanska"),
("southKurdish", "orig_lexin/sydkurdiska"),
("turkish", "orig_lexin/turkiska")] # TODO: add the others as well
wrong_keys = 0
for (lang_name, folder_path) in all_languages:
for infile in glob.glob( os.path.join(folder_path, '*.xml') ):
tree = ElementTree()
tree.parse(infile)
lexin_words = tree.findall("Word")
for lw in lexin_words:
item = {"comment": (), "idioms": [], "synonyms": [], "examples": [], "compounds": [], "trans": None}
#lexin_Value = lw.get("Value", "")
#lexin_Variant = lw.get("Variant", "")
#if ! lexin_Variant.isdigit():
# lexin_Variant = "1"
key = lw.get("VariantID")
if key == None:
wrong_keys += 1
continue
#key = (lexin_Value, lexin_Variant)
baseLang = lw.find("BaseLang")
targetLang = None
if lw.find("TargetLang") != None:
targetLang = lw.find("TargetLang")
if targetLang == None:
print "missing TargetLang"
continue
translation = targetLang.find("Translation")
if translation != None:
item["trans"] = translation.text
comment = targetLang.find("Comment")
if comment != None:
swe_comment = baseLang.find("Comment").text
item["comment"] = (swe_comment, comment.text)
for example in targetLang.findall("Example"):
example_id = example.get("ID")
for swe_example in baseLang.findall("Example"):
if example_id == swe_example.get("ID"):
item["examples"].append((swe_example.text, example.text))
break
for compound in targetLang.findall("Compound"):
compound_id = compound.get("ID")
for swe_compound in baseLang.findall("Compound"):
if compound_id == swe_compound.get("ID"):
item["compounds"].append((swe_compound.text, compound.text))
break
for idiom in targetLang.findall("Idiom"):
idiom_id = idiom.get("ID")
for swe_idiom in baseLang.findall("Idiom"):
if idiom_id == swe_idiom.get("ID"):
item["idioms"].append((swe_idiom.text, idiom.text))
break
for synonym in targetLang.findall("Synonym"):
item["synonyms"].append(synonym.text)
if not key in output:
output[key] = []
output[key].append((lang_name, item))
return output
#############################
# OUTPUT #
#############################
class LMF:
    """Top-level container for the generated LMF document.

    NOTE(review): the XML tag string literals in __unicode__ below appear
    to have been stripped from this copy of the file -- several bare ''
    entries, and a ' ' % self.lang format with no %s placeholder, which
    raises TypeError at runtime. Recover the original literals from
    version control before running.
    """
    def __init__(self, lang):
        self.lang = lang                  # language code for the Lexicon element
        self.lexical_entries = []
        self._lexical_entries_set = set() # fast duplicate lookup ("pos.wf" keys)
        self._le_senses = set()
        self.semantic_predicates = []
    def add_lexical_entry(self, lexical_entry):
        self.lexical_entries.append(lexical_entry)
        # Quick hack to speed things up a bit (replace with something better later)
        self._lexical_entries_set.add(".".join([lexical_entry._pos, lexical_entry._wf]))
    def add_semantic_predicate(self, semantic_predicate):
        self.semantic_predicates.append(semantic_predicate)
    def __unicode__(self):
        # Serialise the whole document: header lines, all lexical entries,
        # then all semantic predicates.
        return "\n".join([
            '',
            '',
            '',
            '',
            ' ',
            '',
            '',
            ' ' % self.lang,
            "\n".join([unicode(e) for e in self.lexical_entries]),
            "\n".join([unicode(s) for s in self.semantic_predicates]),
            '',
            ''])
class LexicalEntry:
    """One dictionary entry: lemma, inflected word forms, senses and
    entry-level features.

    NOTE(review): the XML tag literals in __unicode__ appear stripped in
    this copy ('' % (self.idattr) has no placeholder and raises TypeError
    when idattr is set). The name 'le_unicodeing' also looks like a
    mangled 'le_string' from an automated str->unicode rename.
    """
    def __init__(self):
        self.features = []
        self.lemma = None    # a Lemma instance, assigned by the caller
        self.wordforms = []
        self.senses = []
        self._pos = ""       # cached part of speech, read by LMF's dedup hack
        self._wf = ""        # cached written form, read by LMF's dedup hack
        self.idattr = ""     # optional id attribute for the serialised entry
    def add_sense(self, sense):
        self.senses.append(sense)
    def add_feature(self, feature):
        self.features.append(feature)
    def add_feature_unique(self, feature):
        # Skip insertion when an identical (att, val) feature already exists.
        for existing_feature in self.features:
            if(existing_feature.att == feature.att and existing_feature.val == feature.val):
                return
        self.add_feature(feature)
    def add_wordform(self, wordform):
        self.wordforms.append(wordform)
    def __unicode__(self):
        le_unicodeing = ''
        if(self.idattr):
            le_unicodeing = '' % (self.idattr)
        return "\n".join([
            le_unicodeing,
            '\n'.join([unicode(f) for f in self.features]),
            unicode(self.lemma),
            '\n'.join([unicode(w) for w in self.wordforms]),
            '\n'.join([unicode(s) for s in self.senses]),
            ''])
"""
class Lemma:
def __init__(self):
self.features = [] # now including writtenForm and partOfSpeech!
def add_feature(self, feature):
self.features.append(feature)
def add_feature_unique(self, feature):
for existing_feature in self.features:
if(existing_feature.att == feature.att and existing_feature.val == feature.val):
return
self.add_feature(feature)
def __unicode__(self):
if self.features:
return "\n".join(['\n',
'\n'.join([unicode(f) for f in self.features]),
'\n'])
else:
return ''
"""
class Lemma:
    """The lemma of a lexical entry, holding its form representations.

    Only form_representations are serialised; the features list is kept
    (and still filled by callers) but is no longer emitted -- see the
    commented-out code in __unicode__.

    NOTE(review): the surrounding XML tag literals ('') in __unicode__
    appear to have been stripped from this copy of the file.
    """
    def __init__(self):
        self.form_representations = []
        self.features = []  # now including writtenForm and partOfSpeech!
    def add_feature(self, feature):
        self.features.append(feature)
    def add_feature_unique(self, feature):
        # Skip insertion when an identical (att, val) feature already exists.
        for existing_feature in self.features:
            if(existing_feature.att == feature.att and existing_feature.val == feature.val):
                return
        self.add_feature(feature)
    def add_form_representation(self, form_representation):
        self.form_representations.append(form_representation)
    def __unicode__(self):
        if self.features or self.form_representations:
            return "\n".join(['', '\n'.join(unicode(fr) for fr in self.form_representations),''])
        #return "\n".join(['\n',
        #    '\n'.join([unicode(f) for f in self.features]),
        #    '\n'])
        else:
            return ''
class WordForm:
    """One inflected form of an entry, described purely by its features."""

    def __init__(self):
        self.features = []

    def add_feature(self, feature):
        self.features.append(feature)

    def __unicode__(self):
        # Join all features into one inner block, then wrap it between the
        # (currently empty) opening and closing literals.
        feature_block = '\n'.join(unicode(feat) for feat in self.features)
        return '\n'.join(['', feature_block, ''])
class FormRepresentation:
    """A written/phonetic representation of a lemma, as a feature bundle."""

    def __init__(self):
        self.features = []

    def add_feature(self, feature):
        self.features.append(feature)

    def add_feature_unique(self, feature):
        """Add feature unless an identical (att, val) pair is present."""
        if any(f.att == feature.att and f.val == feature.val
               for f in self.features):
            return
        self.add_feature(feature)

    def __unicode__(self):
        if not self.features:
            return ''
        inner = '\n'.join(unicode(feat) for feat in self.features)
        return '\n'.join(['', inner, ''])
class Feature:
    """A single attribute/value pair (an LMF feat element)."""
    def __init__(self, att, val):
        self.att = att
        self.val = val
    def __unicode__(self):
        # NOTE(review): the format literal appears stripped in this copy
        # ('' has no %s placeholders, so this raises TypeError when
        # called) -- restore the original tag string before running.
        return '' % (self.att, escape(self.val))
class Sense:
    """One sense of a lexical entry, with relations, predicative
    representations, examples and features.

    NOTE(review): the XML tag literals in __unicode__ appear stripped in
    this copy ('' % (self.sense) has no placeholder and raises TypeError
    when called).
    """
    def __init__(self, sense):
        self.sense = sense  # the sense identifier string
        self.relations = []
        self.predicative_representations = []
        self.sense_examples = []
        self.features = []
    def add_feature(self, feature):
        self.features.append(feature)
    def add_sense_relation(self, sense_relation):
        self.relations.append(sense_relation)
    def add_predicative_representation(self, predicative_representation):
        self.predicative_representations.append(predicative_representation)
    def add_sense_example(self, sense_example):
        self.sense_examples.append(sense_example)
    def __unicode__(self):
        # Compact (self-closing) form when the sense has no children.
        if not self.relations and not self.predicative_representations and not self.sense_examples and not self.features:
            return '' % (self.sense)
        else:
            return "\n".join(['' % (self.sense),
                "\n".join([unicode(pre) for pre in self.predicative_representations]),
                "\n".join([unicode(rel) for rel in self.relations]),
                "\n".join([unicode(ex) for ex in self.sense_examples]),
                "\n".join([unicode(f) for f in self.features]),
                ''
                ])
class SenseRelation:
    """A typed relation from one sense to a target sense.

    NOTE(review): the XML tag literals in __unicode__ appear stripped in
    this copy ('' % ... formats have no placeholders and raise TypeError
    when called).
    """
    def __init__(self, target, relation_types):
        self.target = target                  # id of the related sense
        self.relation_types = relation_types  # list of relation type labels
        self.features = []
    def add_feature(self, feature):
        self.features.append(feature)
    def __unicode__(self):
        return "\n".join(['' % (self.target),
            '\n'.join(['' % t for t in self.relation_types]),
            '\n'.join([unicode(f) for f in self.features]),
            ''
            ])
class SenseExample:
    """An example/idiom/compound attached to a sense (the kind is stored
    as a 'type' feature by the caller).

    NOTE(review): the XML tag literals in __unicode__ appear stripped in
    this copy ('' % (...) has no placeholder and raises TypeError when
    called).
    """
    def __init__(self, example):
        self.example = example  # the Swedish example text
        self.features = []
    def add_feature(self, feature):
        self.features.append(feature)
    def __unicode__(self):
        return "\n".join([
            '',
            '' % (escape(self.example)),
            "\n".join([unicode(f) for f in self.features]),
            ''
            ])
class SemanticPredicate:
    """A semantic predicate (frame) with types, arguments and features.

    A non-empty domain is stored as an ordinary 'domain' feature.

    NOTE(review): the XML tag literals in __unicode__ appear stripped in
    this copy ('' and '\\n' %-formats with missing placeholders raise
    TypeError when called) -- restore them before generating output.
    """
    def __init__(self, id, domain, semantic_types):
        self.id = id
        self.semantic_types = semantic_types
        self.semantic_arguments = []
        self.features = []
        if domain != None and domain != "":
            self.add_feature(Feature("domain", domain))
    def add_semantic_argument(self, argument):
        self.semantic_arguments.append(argument)
    def add_feature(self, feature):
        self.features.append(feature)
    def generateFeatures(self, att, vals):
        """Add one feature per value in vals, all sharing attribute att."""
        for val in vals:
            # Bug fix: 'val.unicodeip()' was a mangled 'val.strip()'
            # (collateral damage from a str->unicode rename).
            self.add_feature(Feature(att, val.strip()))
    def __unicode__(self):
        # The original also built an unused 'extras' string from
        # semantic_types here; that dead code has been removed.
        return "\n".join([
            '' % (self.id),
            "\n".join(['\n' % (st) for st in self.semantic_types]),
            "\n".join([unicode(fe) for fe in self.features]),
            "\n".join([unicode(sa) for sa in self.semantic_arguments]),
            ''
            ])
class SemanticArgument:
    """One argument of a semantic predicate: a role and its core type.

    NOTE(review): the format literal in __unicode__ appears stripped in
    this copy ('' with two % arguments raises TypeError when called).
    """
    def __init__(self, semantic_role, core_type):
        self.semantic_role = semantic_role
        self.core_type = core_type
    def __unicode__(self):
        return '' % (self.semantic_role, self.core_type)
class PredicativeRepresentation:
    """Links a sense to a semantic predicate by its id.

    NOTE(review): the format literal in __unicode__ appears stripped in
    this copy ('' with two % arguments raises TypeError when called).
    """
    def __init__(self, idref):
        self.idref = idref  # id of the referenced SemanticPredicate
    def __unicode__(self):
        return '' % (self.idref, self.idref)
def escape(s):
    """XML-escape s for use in attribute values.

    Bug fix: the replacement strings had collapsed to the bare characters
    themselves (e.g. '&' -> '&'), making every replace a no-op; the
    standard XML entities are restored here. '&' must be replaced first
    so the other entities are not double-escaped.
    """
    s = s.replace('&', '&amp;')
    s = s.replace("'", '&apos;')
    s = s.replace('<', '&lt;')
    s = s.replace('>', '&gt;')
    return s.replace('"', '&quot;')
def escapeContent(s):
    """XML-escape s for use in element content (quotes stay literal).

    Bug fix: as in escape(), the entity strings had collapsed to the
    bare characters; the standard entities are restored, with '&'
    replaced first to avoid double-escaping.
    """
    s = s.replace('&', '&amp;')
    s = s.replace('<', '&lt;')
    s = s.replace('>', '&gt;')
    return s
#############################
# INPUT #
#############################
def take(entry, tagname, resort):
    """Return the stripped text of entry's first <tagname> child, or
    resort when the child is missing or has no text.

    Bug fix: 't.unicodeip()' was a mangled 't.strip()' (collateral
    damage from a str->unicode rename); whitespace-stripping restored.
    """
    e2 = entry.find(tagname)
    if e2 is None:
        return resort
    t = e2.text
    if t is None:
        return resort
    return t.strip()
def list_to_dict(alist):
    """Build a dict from the first two elements of each item in alist.

    Later items win on duplicate keys, exactly like the original loop.
    (The original also shadowed the builtin 'dict'; the comprehension
    avoids that.)
    """
    return {item[0]: item[1] for item in alist}
# Lexin part-of-speech label -> SALDO tag, for single-word entries.
posHash = {
    u"subst." : u"nn",
    u"prep." : u"pp",
    u"pron." : u"pn",
    u"förk." : u"nna",
    u"subst. plural" : u"nn",
    u"verb" : u"vb",
    u"subst., ingen böjning" : u"nn",
    u"adj." : u"av",
    u"adv." : u"ab",
    u"interj." : u"in",
    u"räkn." : u"nl",
    u"förled" : u"",
    u"namn" : u"pm",
    u"adj., ingen böjning" : u"av",
    u"subst. bestämd form singular" : u"nn",
    u"konj." : u"kn"
}
# Lexin part-of-speech label -> SALDO multi-word tag (the 'm' variants),
# used when the entry's written form contains a space.
multiPosHash = {
    u"subst." : u"nnm",
    u"prep." : u"ppm",
    u"pron." : u"pnm",
    u"förk." : u"nna",
    u"subst. plural" : u"nnm",
    u"verb" : u"vbm",
    u"subst., ingen böjning" : u"nnm",
    u"adj." : u"avm",
    u"adv." : u"abm",
    u"interj." : u"inm",
    u"räkn." : u"nlm",
    u"förled" : u"",
    u"namn" : u"pmm",
    u"adj., ingen böjning" : u"avm",
    u"subst. bestämd form singular" : u"nnm",
    u"konj." : u"knm"
}
# Lexin inflection form label -> SALDO-style msd string (empty string
# when no mapping is known; '?' marks uncertain mappings).
lexinInflectionToMsd = {
    u"best.f.sing." : u"sg def nom",
    u"obest.f.pl." : u"pl indef nom",
    u"best.f.pl." : u"pl def nom",
    u"tform" : u"pos indef sg n nom", # ?
    u"aform" : u"pos indef pl nom", # ?
    u"infinitiv" : u"inf aktiv",
    u"imperfekt" : u"pret ind aktiv",
    u"supinum" : u"sup aktiv",
    u"perf.part." : u"",
    u"imperativ" : u"imper"
}
def lexinPosToSaldoPos(inpos, inword):
    """Map a Lexin part-of-speech label to a SALDO tag.

    Multi-word entries (inword contains a space) use the multi-word tag
    table. Returns (pos, extra), where pos is the SALDO tag or None for
    unknown labels, and extra is an inflection note (or None) carried by
    some Lexin labels.
    """
    if " " in inword:
        pos = multiPosHash.get(inpos)
    else:
        pos = posHash.get(inpos)
    # Extra inflection information encoded in certain Lexin POS labels;
    # replaces the original chain of 'in [single-item-list]' tests.
    extra_by_label = {
        u"subst. plural": u"plural",
        u"subst., ingen böjning": u"oböjl",
        u"adj., ingen böjning": u"oböjl",
        u"subst. bestämd form singular": u"sing best",
        u"förled": u"förled",
    }
    return (pos, extra_by_label.get(inpos))
if __name__ == '__main__':
    # Build an LMF lexicon for Swedish from Lexin, linking entries to
    # SALDO senses and attaching translations from the other Lexin
    # languages, then print the document to stdout as UTF-8.
    global mode      # NOTE(review): declared but never assigned or read below
    global language  # NOTE(review): declared but never assigned or read below
    global saldom
    saldom = read_saldom()              # (writtenForm, pos) -> saldo sense ids
    extra_languages = read_languages()  # VariantID -> [(language, item), ...]
    tree = ElementTree()
    tree.parse("orig_lexin/svenska4/swe_swe.xml")
    lmf = LMF('swe')
    usedSensesCount = {}                # baseform -> number of senses emitted so far
    entries = tree.findall("Article")
    for entry in entries:
        e = LexicalEntry()
        lexin_lemma = entry.find("Lemma")
        saldo_link_set = set()          # NOTE(review): never used below
        lemma = Lemma()
        e.lemma = lemma
        fr = FormRepresentation()
        lemma.add_form_representation(fr)
        fr.add_feature(Feature("rawForm", lexin_lemma.get("Value")))
        lexin_pos = lexin_lemma.get("Type")
        pos = None
        if lexin_pos != None:
            fr.add_feature(Feature("lexinPartOfSpeech", lexin_pos))
            # Map the Lexin POS label to a SALDO tag (drop the 'extra' part).
            pos = lexinPosToSaldoPos(lexin_pos, lexin_lemma.get("Value"))[0]
            if pos != None:
                fr.add_feature(Feature("partOfSpeech", pos))
        infinitiv = None
        for lexin_inflection in lexin_lemma.findall("Inflection"):
            wordform = WordForm()
            lexin_text = lexin_inflection.text
            wordform.add_feature(Feature("writtenForm", lexin_text))
            lexin_form = lexin_inflection.get("Form")
            if lexin_form != None:
                if lexin_form == "infinitiv":
                    # Remember the infinitive; it becomes the written form
                    # for verbs further down.
                    infinitiv = lexin_text
                msd = lexinInflectionToMsd.get(lexin_form,"")
                wordform.add_feature(Feature("msd", msd))
                wordform.add_feature(Feature("lexinForm", lexin_form))
            if lexin_inflection.get("Spec") != None:
                wordform.add_feature(Feature("lexinSpec", lexin_inflection.get("Spec")))
            e.add_wordform(wordform)
        writtenForm = None
        if (pos == "vb" or pos == "vbm") and infinitiv != None:
            # we have to find the infinitive since Lexin is based on present tense
            writtenForm = infinitiv
        else:
            # '|' marks compound boundaries in Lexin; strip them.
            writtenForm = lexin_lemma.get("Value").replace("|", "")
        fr.add_feature(Feature("writtenForm", writtenForm))
        if pos != None and writtenForm != None:
            # Attach every SALDO sense that matches this form/POS pair.
            possible_saldo_senses = saldom.get((writtenForm, pos),[])
            #if len(possible_saldo_senses) == 0:
            #print writtenForm + " (" + pos + ")"
            for pss in possible_saldo_senses:
                e.add_feature(Feature("saldoLink", pss))
        lexin_phonetic = lexin_lemma.find("Phonetic")
        if lexin_phonetic != None:
            fr.add_feature(Feature("phoneticForm", lexin_phonetic.text))
        # Optional attributes are copied through only when non-empty.
        if lexin_lemma.get("Hyphenate", "") != "":
            fr.add_feature(Feature("hyphenatedForm", lexin_lemma.get("Hyphenate")))
        if lexin_lemma.get("Rank", "") != "":
            fr.add_feature(Feature("rank", lexin_lemma.get("Rank")))
        if lexin_lemma.get("ID", "") != "":
            fr.add_feature(Feature("lexinID", lexin_lemma.get("ID")))
        if lexin_lemma.get("Variant", "") != "":
            fr.add_feature(Feature("lexinVariant", lexin_lemma.get("Variant")))
        for lexin_reference in lexin_lemma.findall("Reference"):
            reftype = lexin_reference.get("Type")
            if reftype == "see":
                e.add_feature(Feature("see", lexin_reference.get("Value")))
            elif reftype == "compare":
                e.add_feature(Feature("compareWith", lexin_reference.get("Value")))
        for lexin_lexeme in lexin_lemma.findall("Lexeme"):
            # Each Lexin lexeme becomes one Sense with an id like
            # 'lexin--<baseform>..<running index>'.
            baseform = lexin_lemma.get("Value").replace("|", "")
            index = usedSensesCount.get(baseform, 0) + 1
            #if not index.isdigit():
            #    index = "1"
            #if usedSensesCount.get(baseform, 0) >= index:
            #    index = usedSensesCount.get(baseform, 0) + 1
            sense = Sense("lexin--" + baseform + ".." + str(index))
            lexin_lexemeno = lexin_lexeme.get("Lexemeno")
            lexin_variantID = lexin_lexeme.get("VariantID","")
            # Translations etc. for this lexeme in the other Lexin languages.
            langdata = extra_languages.get(lexin_variantID, [])
            sense.add_feature(Feature("lexinVariantID", lexin_variantID))
            if not lexin_lexemeno.isdigit():
                lexin_lexemeno = "1"
            sense.add_feature(Feature("lexinLexemeNumber", lexin_lexemeno))
            if baseform in usedSensesCount:
                usedSensesCount[baseform] = usedSensesCount[baseform] + 1
            else:
                usedSensesCount[baseform] = 1
            if lexin_lexeme.get("ID", None) != None:
                sense.add_feature(Feature("lexinID", lexin_lexeme.get("ID")))
            if lexin_lexeme.find("Definition") != None:
                sense.add_feature(Feature("definition", lexin_lexeme.find("Definition").text))
            for lexin_example in lexin_lexeme.findall("Example"):
                sense_example = SenseExample(lexin_example.text)
                sense_example.add_feature(Feature("type", "example"))
                # Attach the translated example from each language whose
                # Swedish side matches this example's text.
                for (lang, item) in langdata:
                    for (swe_ex, ex) in item["examples"]:
                        if ex != None:
                            if swe_ex.strip() == lexin_example.text.strip():
                                sense_example.add_feature(Feature(lang + "Text", ex))
                                break
                sense.add_sense_example(sense_example)
            for lexin_idiom in lexin_lexeme.findall("Idiom"):
                sense_example = SenseExample(lexin_idiom.text)
                sense_example.add_feature(Feature("type", "idiom"))
                if lexin_idiom.find("Definition") != None:
                    sense_example.add_feature(Feature("definition", lexin_idiom.find("Definition").text))
                for (lang, item) in langdata:
                    for (swe_idiom, idiom) in item["idioms"]:
                        if idiom != None:
                            # Drop any parenthesised suffix before comparing.
                            swe_idiom = swe_idiom.split(" (")[0]
                            #print "COMP " + swe_idiom.strip() + " --- " + lexin_idiom.text.strip()
                            if swe_idiom.strip() == lexin_idiom.text.strip():
                                #print "idiom: " + idiom
                                sense_example.add_feature(Feature(lang + "Text", idiom))
                                break
                sense.add_sense_example(sense_example)
            for lexin_compound in lexin_lexeme.findall("Compound"):
                sense_example = SenseExample(lexin_compound.text)
                sense_example.add_feature(Feature("type", "compound"))
                for (lang, item) in langdata:
                    for (swe_comp, comp) in item["compounds"]:
                        if comp != None:
                            #print "COMP " + swe_comp.strip() + " --- " + lexin_compound.text.strip().replace("|","")
                            if swe_comp.strip() == lexin_compound.text.strip().replace("|",""):
                                sense_example.add_feature(Feature(lang + "Text", comp))
                                break
                sense.add_sense_example(sense_example)
            for lexin_comment in lexin_lexeme.findall("Comment"):
                comment_type = lexin_comment.get("Type")
                if comment_type == "style":
                    sense.add_feature(Feature("usg", lexin_comment.text))
                elif comment_type == "def":
                    sense.add_feature(Feature("desc", lexin_comment.text))
            for lexin_reference in lexin_lexeme.findall("Reference"):
                reftype = lexin_reference.get("Type")
                if reftype == "see":
                    sense.add_feature(Feature("see", lexin_reference.get("Value")))
                elif reftype == "compare":
                    sense.add_feature(Feature("compareWith", lexin_reference.get("Value")))
                elif reftype == "antonym":
                    sense.add_feature(Feature("antonym", lexin_reference.get("Value")))
            for lexin_gramcom in lexin_lexeme.findall("Gramcom"):
                sense.add_feature(Feature("gram", lexin_gramcom.text))
            for lexin_graminfo in lexin_lexeme.findall("Graminfo"):
                sense.add_feature(Feature("gram", lexin_graminfo.text))
            for lexin_theme in lexin_lexeme.findall("Theme"):
                # 'Tema' is a comma-separated list; keep only the part
                # before any ':' in each element.
                for theme_string in lexin_theme.get("Tema").split(","):
                    sense.add_feature(Feature("lexinTheme", theme_string.split(":")[0].strip()))
            # Sense-level translations and synonyms from the other languages.
            for (lang, item) in langdata:
                if item["trans"] != None:
                    sense.add_feature(Feature(lang + "Translation", item["trans"]))
                if item["synonyms"] != None:
                    for syn in item["synonyms"]:
                        sense.add_feature(Feature(lang + "Synonyme", syn))
            e.add_sense(sense)
        lmf.add_lexical_entry(e)
    # Python 2 print statement: emit the whole LMF document as UTF-8.
    print unicode(lmf).encode("utf-8")