#!/usr/bin/env python
# -*- coding: utf8 -*-
"""LMF (Lexical Markup Framework) XML export for Swedish lexical resources
(SALDO, Swesaurus, LWT, Parole, SIMPLE, Kelly, WordNet, Dalin, Swedberg, AO).

Records are read from stdin (tab-separated) or from fixed files, wrapped in
the small serialization classes below, and printed as one LMF XML document.

NOTE(review): this file was recovered from a whitespace-mangled copy in which
the XML template literals inside the __str__ methods had been stripped to ''.
The tag strings below were reconstructed from LMF DTD conventions -- verify
the exact attribute names against the original output before relying on them.
"""

import sys
import codecs
import re
from xml.etree.ElementTree import ElementTree
from xml.etree.ElementTree import fromstring

### NAMESPACES: ################ NOT USED RIGHT NOW BUT THE MECHANISM IS (PARTLY) AVAILABLE
ns_saldo_pos = 'saldo_pos'
ns_saldo_sense = 'saldo_sense'
ns_saldo_lemgram = 'saldo_lemgram'
ns_saldo_paradigm = 'saldo_pdgm'
ns_simple_semtype = 'simple_semtype'
ns_kelly_id = 'kelly_id'
ns_lwt_id = 'lwt_id'
ns_simple_class = 'simple_class'
ns_simple_domain = 'simple_domain'
################################


class LMF:
    """Root container: one Lexicon holding LexicalEntry and SemanticPredicate
    objects; str(lmf) renders the whole XML document."""

    def __init__(self, lang):
        self.lang = lang
        self.lexical_entries = []
        self._lexical_entries_set = set()
        self._le_senses = set()  # (sense_id, LexicalEntry) pairs; see search_for_le_with_sense
        self.useNamespace = False
        self.semantic_predicates = []

    def add_lexical_entry(self, lexical_entry):
        self.lexical_entries.append(lexical_entry)
        # An ugly hack to speed lookups up a bit (replace with something better in the future)
        self._lexical_entries_set.add(".".join([lexical_entry._pos, lexical_entry._wf]))

    def add_semantic_predicate(self, semantic_predicate):
        self.semantic_predicates.append(semantic_predicate)

    def __str__(self):
        # NOTE(review): header/tag strings reconstructed -- see module docstring.
        return "\n".join([
            '<?xml version="1.0" encoding="UTF-8"?>',
            '<!DOCTYPE LexicalResource SYSTEM "DTD_LMF_REV_16.dtd">',
            '<LexicalResource dtdVersion="16">',
            '<GlobalInformation>',
            ' <feat att="languageCoding" val="ISO 639-3" />',
            '</GlobalInformation>',
            '<Lexicon>' if not self.useNamespace else '<Lexicon>',
            ' <feat att="language" val="%s" />' % self.lang,
            "\n".join([str(e) for e in self.lexical_entries]),
            "\n".join([str(s) for s in self.semantic_predicates]),
            '</Lexicon>',
            '</LexicalResource>'])


class LexicalEntry:
    """One lexical entry: a Lemma, word forms, senses and SALDO links."""

    def __init__(self):
        self.features = []
        self.lemma = None
        self.wordforms = []
        self.senses = []
        self.saldolinks = []
        self._pos = ""      # used only by LMF._lexical_entries_set hack
        self._wf = ""
        self.idattr = ""    # optional XML id attribute

    def add_sense(self, sense):
        self.senses.append(sense)

    def add_feature(self, feature):
        self.features.append(feature)

    def add_feature_unique(self, feature):
        # Skip the append when an identical (att, val) feature already exists.
        for existing_feature in self.features:
            if existing_feature.att == feature.att and existing_feature.val == feature.val:
                return
        self.add_feature(feature)

    def add_wordform(self, wordform):
        self.wordforms.append(wordform)

    def add_saldoLink(self, saldoLink):
        self.saldolinks.append(saldoLink)

    def __str__(self):
        le_string = '<LexicalEntry>'
        if self.idattr:
            le_string = '<LexicalEntry id="%s">' % (self.idattr)
        # NOTE(review): a lemma of None renders as the string "None", as in
        # the recovered code; all current callers assign a Lemma first.
        return "\n".join([
            le_string,
            '\n'.join([str(f) for f in self.features]),
            str(self.lemma),
            '\n'.join([str(w) for w in self.wordforms]),
            '\n'.join([str(s) for s in self.senses]),
            '\n'.join([str(f) for f in self.saldolinks]),
            '</LexicalEntry>'])


class SaldoLink:
    """A pointer from an entry to a SALDO sense identifier."""

    def __init__(self, saldo_id):
        self.saldo_id = saldo_id

    def __str__(self):
        # NOTE(review): tag reconstructed -- confirm element/attribute names.
        return '<SaldoLink ref="%s" />' % (self.saldo_id)


class Lemma:
    def __init__(self):
        self.form_representations = []
        self.features = []  # now including writtenForm and partOfSpeech!

    def add_feature(self, feature):
        self.features.append(feature)

    def add_feature_unique(self, feature):
        for existing_feature in self.features:
            if existing_feature.att == feature.att and existing_feature.val == feature.val:
                return
        self.add_feature(feature)

    def add_form_representation(self, form_representation):
        self.form_representations.append(form_representation)

    def __str__(self):
        if self.features or self.form_representations:
            # Only form representations are serialized; bare features are not
            # rendered (matches the recovered code, where that path is commented out).
            return "\n".join(['<Lemma>',
                              '\n'.join(str(fr) for fr in self.form_representations),
                              '</Lemma>'])
        else:
            return ''


class WordForm:
    def __init__(self):
        self.features = []
        self.form_representations = []

    def add_feature(self, feature):
        self.features.append(feature)

    def add_form_representation(self, form_representation):
        self.form_representations.append(form_representation)

    def __str__(self):
        return "\n".join(['<WordForm>',
                          '\n'.join(str(fr) for fr in self.form_representations),
                          '\n'.join([str(f) for f in self.features]),
                          '</WordForm>'])


class FormRepresentation:
    def __init__(self):
        self.features = []

    def add_feature(self, feature):
        self.features.append(feature)

    def add_feature_unique(self, feature):
        for existing_feature in self.features:
            if existing_feature.att == feature.att and existing_feature.val == feature.val:
                return
        self.add_feature(feature)

    def __str__(self):
        if self.features:
            return "\n".join(['<FormRepresentation>',
                              '\n'.join([str(f) for f in self.features]),
                              '</FormRepresentation>'])
        else:
            return ''


class Feature:
    """An att/val pair rendered as an LMF <feat /> element (val is XML-escaped)."""

    def __init__(self, att, val):
        self.att = att
        self.val = val

    def __str__(self):
        return '<feat att="%s" val="%s" />' % (self.att, escape(self.val))


class Sense:
    def __init__(self, sense):
        self.sense = sense  # the sense identifier
        self.relations = []
        self.predicative_representations = []
        self.sense_examples = []
        self.features = []

    def add_sense_relation(self, sense_relation):
        self.relations.append(sense_relation)

    def add_predicative_representation(self, predicative_representation):
        self.predicative_representations.append(predicative_representation)

    def add_sense_example(self, sense_example):
        self.sense_examples.append(sense_example)

    def add_feature(self, feature):
        self.features.append(feature)

    def __str__(self):
        # Self-closing element when the sense has no children at all.
        if not self.relations and not self.predicative_representations \
                and not self.sense_examples and not self.features:
            return '<Sense id="%s" />' % (self.sense)
        else:
            return "\n".join(['<Sense id="%s">' % (self.sense),
                              '\n'.join([str(f) for f in self.features]),
                              "\n".join([str(pre) for pre in self.predicative_representations]),
                              "\n".join([str(rel) for rel in self.relations]),
                              "\n".join([str(ex) for ex in self.sense_examples]),
                              '</Sense>'])


class SenseRelation:
    def __init__(self, target, relation_types):
        self.target = target
        self.relation_types = relation_types  # e.g. ['primary'], ['syn']
        self.features = []

    def add_feature(self, feature):
        self.features.append(feature)

    def __str__(self):
        return "\n".join(['<SenseRelation targets="%s">' % (self.target),
                          '\n'.join(['<feat att="label" val="%s" />' % t
                                     for t in self.relation_types]),
                          '\n'.join([str(f) for f in self.features]),
                          '</SenseRelation>'])


class SenseExample:
    def __init__(self, example):
        self.example = example  # already-escaped example text

    def __str__(self):
        return "\n".join(['<SenseExample>',
                          '<feat att="text" val="%s" />' % (self.example),
                          '</SenseExample>'])


class SemanticPredicate:
    def __init__(self, id, domain, semantic_types):
        self.id = id
        self.semantic_types = semantic_types
        self.semantic_arguments = []
        self.features = []
        # A non-empty domain becomes an ordinary feature.
        if domain != None and domain != "":
            self.add_feature(Feature("domain", domain))

    def add_semantic_argument(self, argument):
        self.semantic_arguments.append(argument)

    def add_feature(self, feature):
        self.features.append(feature)

    def generateFeatures(self, att, vals):
        for val in vals:
            self.add_feature(Feature(att, val.strip()))

    def __str__(self):
        # NOTE(review): the recovered source built an unused "extras"
        # accumulator here; it has been dropped.
        return "\n".join(['<SemanticPredicate id="%s">' % (self.id),
                          "\n".join(['<feat att="semanticType" val="%s" />' % (st)
                                     for st in self.semantic_types]),
                          "\n".join([str(fe) for fe in self.features]),
                          "\n".join([str(sa) for sa in self.semantic_arguments]),
                          '</SemanticPredicate>'])


class SemanticArgument:
    def __init__(self, semantic_role, core_type):
        self.semantic_role = semantic_role
        self.core_type = core_type

    def __str__(self):
        return '<SemanticArgument semanticRole="%s" coreType="%s" />' % \
            (self.semantic_role, self.core_type)


class PredicativeRepresentation:
    def __init__(self, idref):
        self.idref = idref

    def __str__(self):
        # The recovered code interpolated idref twice; kept that shape.
        return '<PredicativeRepresentation predicate="%s" correspondences="%s" />' % \
            (self.idref, self.idref)


# HELPER FUNCTIONS -------------------------------------------------------------------------------------

def escape(s):
    """XML-escape the five special characters in s ('&' first, so already
    produced entities are not double-escaped)."""
    # FIX(review): in the recovered copy the replacement strings had been
    # un-escaped (e.g. replace('&', '&')), making this function a no-op.
    s = s.replace('&', '&amp;')
    s = s.replace("'", '&apos;')
    s = s.replace('<', '&lt;')
    s = s.replace('>', '&gt;')
    return s.replace('"', '&quot;')


def read_csv(num_of_fields, tolerates=-1):
    """Yield tab-separated records from stdin.

    Rows with exactly num_of_fields fields are yielded as-is; rows with at
    least `tolerates` (default: num_of_fields) fields are padded with empty
    strings; anything shorter or longer is silently skipped.
    """
    if tolerates == -1:
        tolerates = num_of_fields
    for line in sys.stdin:
        e = line[:-1].split('\t')
        if len(e) == num_of_fields:
            yield e
        elif len(e) >= tolerates and len(e) < num_of_fields:
            f = [""] * (num_of_fields - len(e))
            yield e + f
    return


def read_csv_from_file(path, num_of_fields):
    """Yield the first num_of_fields tab-separated columns of each line in
    the UTF-8 file at `path` (columns re-encoded to UTF-8 byte strings)."""
    with codecs.open(path, encoding='utf-8') as f:
        for line in f:
            e = [x.encode("utf-8") for x in line[:-1].split('\t')]
            yield e[0:num_of_fields]


def give_namespace(namespace, identifier):
    # Namespacing is currently disabled: the identifier is returned unchanged.
    return identifier
    # return '%s:%s' % (namespace, identifier)


def search_for_le_with_sense(lmf, sense):
    """Return the LexicalEntry registered in lmf._le_senses under the given
    sense id, or None when no entry has been registered for it."""
    for (s, le) in lmf._le_senses:
        if s == sense:
            return le
    return None


def dequote(s):
    """Strip one pair of matching single or double quotes from s, if present."""
    # Guard added: the recovered code indexed s[0]/s[-1] unconditionally and
    # crashed on the empty string.
    if len(s) >= 2 and ((s[0] == "'" and s[-1] == "'") or (s[0] == '"' and s[-1] == '"')):
        return s[1:-1]
    else:
        return s


# --- SALDO ------------------------------------------------
# a sense is unique for an entry.
def saldo_data():
    """Read the 7-column SALDO table from stdin.

    Returns (forms, senses): forms maps a sense id to a list of
    (lemgram, gf, pos, paradigm) tuples; senses is a list of
    (sense, primary, secondary) tuples sorted by sense id.
    """
    forms = {}
    senses = set()
    for (saldo, primary, secondary, lemgram, gf, pos, paradigm) in read_csv(num_of_fields=7):
        if saldo in forms:
            forms[saldo].append((lemgram, gf, pos, paradigm))
        else:
            forms[saldo] = [(lemgram, gf, pos, paradigm)]
        senses.add((saldo, primary, secondary))
    return (forms, sorted(list(senses), key=lambda x: x[0]))


def saldo():
    """Render the SALDO resource as an LMF document (string)."""
    lmf = LMF('swe')
    (forms, senses) = saldo_data()
    # Seed the lexicon with the artificial root sense PRIM..1.
    lexical_entry = LexicalEntry()
    sense = Sense(give_namespace(ns_saldo_sense, 'PRIM..1'))
    lexical_entry.lemma = Lemma()
    lexical_entry.add_sense(sense)
    lmf.add_lexical_entry(lexical_entry)
    counter = 0
    for (saldo, primary, secondary) in senses:
        counter += 1
        lexical_entry = LexicalEntry()
        lemma = Lemma()
        # One FormRepresentation per inflection table entry of the sense.
        for (lemgram, gf, pos, paradigm) in forms[saldo]:
            form_representation = FormRepresentation()
            form_representation.add_feature(Feature("writtenForm", gf))
            form_representation.add_feature(Feature("partOfSpeech", give_namespace(ns_saldo_pos, pos)))
            form_representation.add_feature(Feature("lemgram", give_namespace(ns_saldo_lemgram, lemgram)))
            form_representation.add_feature(Feature("paradigm", give_namespace(ns_saldo_paradigm, paradigm)))
            lemma.add_form_representation(form_representation)
        lexical_entry.lemma = lemma
        sense = Sense(give_namespace(ns_saldo_sense, saldo))
        # The primary relation is added unconditionally (even for PRIM..1),
        # as in the recovered code.
        primary_relation = SenseRelation(give_namespace(ns_saldo_sense, primary), ['primary'])
        sense.add_sense_relation(primary_relation)
        if secondary != 'PRIM..1':
            for sec in secondary.split(' '):
                secondary_relation = SenseRelation(give_namespace(ns_saldo_sense, sec), ['secondary'])
                sense.add_sense_relation(secondary_relation)
        lexical_entry.add_sense(sense)
        lmf.add_lexical_entry(lexical_entry)
    return str(lmf)


# (Dead commented-out helpers search_for_lexical_entry / search_for_sense
#  from the recovered source were removed; search_for_le_with_sense is the
#  live replacement.)


# ------- SALDO EXAMPLES -----------------------------------

def saldo_examples_data():
    """Read the 6-column examples table from stdin; return a list of
    (sense, example) pairs, skipping empty and '*' examples."""
    examples = []
    for (saldo, _, _, _, _, example) in read_csv(num_of_fields=6):
        saldo = give_namespace(ns_saldo_sense, saldo.strip())
        example = example.strip()
        if example and example != "*":
            examples.append((saldo, example))
    return examples


def saldo_examples():
    """Render the SALDO example sentences as an LMF document (string)."""
    lmf = LMF('swe')
    added_examples = []
    for (saldo, example) in saldo_examples_data():
        le = search_for_le_with_sense(lmf, saldo)
        if not le:
            le = LexicalEntry()
            le.lemma = Lemma()
            lmf._le_senses.add((saldo, le))
            sense = Sense(saldo)
            le.add_sense(sense)
            lmf.add_lexical_entry(le)
        sense = le.senses[0]
        # We don't want duplicates if we have different sources
        if not (example, saldo) in added_examples:
            added_examples.append((example, saldo))
            sense.add_sense_example(SenseExample(escape(example)))
    return str(lmf)


# ------- SWESAURUS ----------------------------------------

def swesaurus_data():
    return read_csv(num_of_fields=5)


def swesaurus():
    """Render Swesaurus sense relations as an LMF document (string)."""
    lmf = LMF('swe')
    synsets = swesaurus_data()
    for (saldo1, saldo2, type_of, degree, source) in synsets:
        saldo1 = give_namespace(ns_saldo_sense, saldo1.strip())
        saldo2 = give_namespace(ns_saldo_sense, saldo2.strip())
        saldo = [saldo1, saldo2]
        le = [None, None]
        # The relations are not always symmetric, but since a SenseRelation's
        # IDREF has to point at a Sense ID, we have to make sure that there is
        # an inverse relation (y-to-x for each x-to-y) for synonymy.
        n = 2 if type_of == "syn" else 1
        for i in range(n):
            le[i] = search_for_le_with_sense(lmf, saldo[i])
            if le[i] == None:
                le[i] = LexicalEntry()
                le[i].lemma = Lemma()
                lmf._le_senses.add((saldo[i], le[i]))
                lmf.add_lexical_entry(le[i])
                sense = Sense(saldo[i])
                le[i].add_sense(sense)
            else:
                sense = le[i].senses[0]
            sense_relation = SenseRelation(saldo[(i + 1) % 2], [type_of])
            sense_relation.add_feature(Feature("degree", degree))
            sense_relation.add_feature(Feature("source", source))
            sense.add_sense_relation(sense_relation)
    return str(lmf)


# ---------- LWT -------------------------------------------

def lwt_data():
    return read_csv(num_of_fields=5)


def lwt():
    """Render the Loanword Typology list as an LMF document (string)."""
    # LexicalEntry acts more like a semantic entry right now
    lmf = LMF('swe')
    lmf.useNamespace = True
    entries = lwt_data()
    for (s_id, saldo, eng, definition, example) in entries:
        # NOTE(review): eng2 is computed but never used in the recovered
        # code; kept for fidelity.
        eng2 = eng.replace("'", "").replace("?", "")
        if "(" in eng2:
            eng2 = eng2.split("(")[0].strip()
        if "/" in eng2:
            eng2 = eng2.split("/")[0].strip()
        le = LexicalEntry()
        sense = Sense("lwt--" + s_id)
        saldo = saldo.strip()
        for s in saldo.split():
            s = s.strip()
            if s != 'PRIM..1':
                le.add_saldoLink(SaldoLink(s))
                sense.add_feature(Feature("saldoSense", s))
        le.lemma = Lemma()
        le.add_sense(sense)
        lmf.add_lexical_entry(le)
        form_representation = FormRepresentation()
        form_representation.add_feature(Feature("lwtID", s_id))
        form_representation.add_feature(Feature("english", eng))
        if definition and definition != "--":
            form_representation.add_feature(Feature("definition", dequote(definition)))
        if example and example != "--":
            # Maybe the apostrophes around (most of) the examples should be stripped off?
            form_representation.add_feature(Feature("example", dequote(example)))
        le.lemma.add_form_representation(form_representation)
    return str(lmf)


# ---------- PAROLE ----------------------------------------

def parole_data():
    return read_csv(num_of_fields=5)


def parole():
    """Render the Parole lexicon as an LMF document (string)."""
    lmf = LMF('swe')
    lmf.useNamespace = True
    entries = parole_data()
    parole_entries = {}
    for (baseform, saldo, pos, valency, paroleid) in entries:
        if paroleid == "zz":
            paroleid = baseform + "_zz"
        saldo = saldo.strip()
        # sometimes the tabs have become spaces, so this is a fix for that:
        if saldo == "av arbeta_av..1":
            baseform = "arbeta av"
            saldo = "arbeta_av..1"
        elif saldo == "bort arbeta_bort..1":
            baseform = "arbeta bort"
            saldo = "arbeta_bort..1"
        elif saldo == " epilera..1":
            saldo = "epilera..1"
        elif saldo == "frottera _sig..1":
            saldo = "frottera_sig..1"
        if baseform == "gille(s)stuga":
            baseform = "gillestuga"
        if paroleid in parole_entries:
            # Repeated parole id: just collect the extra SALDO sense.
            if saldo != "PRIM..1":
                parole_entries[paroleid]["saldo"].append(saldo)
        else:
            parole_entries[paroleid] = {}
            parole_entries[paroleid]["pos"] = pos
            parole_entries[paroleid]["baseform"] = baseform
            parole_entries[paroleid]["valency"] = valency
            parole_entries[paroleid]["paroleid"] = paroleid
            parole_entries[paroleid]["saldo"] = [saldo] if saldo != "PRIM..1" else []
    for pe in parole_entries:
        le = LexicalEntry()
        lemma = Lemma()
        le.lemma = lemma
        fr = FormRepresentation()
        lemma.add_form_representation(fr)
        s = Sense("parole--" + parole_entries[pe]["paroleid"])
        le.add_sense(s)
        fr.add_feature(Feature("partOfSpeech", parole_entries[pe]["pos"]))
        fr.add_feature(Feature("writtenForm", parole_entries[pe]["baseform"]))
        fr.add_feature(Feature("valency", parole_entries[pe]["valency"]))
        fr.add_feature(Feature("paroleID", parole_entries[pe]["paroleid"]))
        for sid in parole_entries[pe]["saldo"]:
            le.add_saldoLink(SaldoLink(sid))
            s.add_feature(Feature("saldoSense", sid))
        lmf.add_lexical_entry(le)
    return str(lmf)


# ---------- SIMPLE ----------------------------------------

# SIMPLE ontology abbreviation -> long semantic type name.
simple_semantic_types = {'++ext': 'Extensional',
                         '++psy': 'Psychological_property',
                         '++phy': 'Physical_property',
                         '++soc': 'Social_property',
                         '++tem': 'Temporal_property',
                         '++inp': 'Intensifying_property',
                         '++rel': 'Relational_property'}

# SIMPLE argument pattern -> number of arguments.
simple_argmap = {'a_00': '0',
                 'a0': '1',
                 'a0 a1': '2',
                 'a0 a1a': '2',    # markup error?
                 'a0 a': '2',      # markup error?
                 'a0 a1 a2': '3',
                 'a0 a1a2': '3'}   # markup error?
def simple_expand_semantic_type(abbrev):
    """Expand a SIMPLE ontology abbreviation (e.g. '++psy') to its long name
    and strip one leading '+'; unknown values pass through unchanged."""
    abbrev = simple_semantic_types.get(abbrev, abbrev)
    # startswith (instead of abbrev[0]) also tolerates the empty string.
    if abbrev.startswith("+"):
        abbrev = abbrev[1:]
    return abbrev


def simple_data():
    return read_csv(num_of_fields=17)


def simple():
    """Render the SIMPLE lexicon as an LMF document (string)."""
    lmf = LMF('swe')
    lmf.useNamespace = True
    entries = simple_data()
    for (baseform, paroleid, _, ssensen, gldb, bc, ontology, domain, lexiquest,
         gldbex, usynsemu, args, argreal, predfornoun, verbnoun, pos, saldo) in entries:
        le = LexicalEntry()
        newID = "simple--" + paroleid + "-" + ssensen[2:]
        sense = Sense(newID)
        le.add_sense(sense)
        if domain == "g":
            domain = "Gen"
        ## SemanticType (= ontology)
        sense.add_feature(Feature("semanticType", simple_expand_semantic_type(ontology)))
        sense.add_feature(Feature("domain", domain))
        if ssensen[0:2] == "<<":
            sense.add_feature(Feature("simpleSenseNumber", ssensen[2:]))
        sense.add_feature(Feature("GLDB", gldb))  # lemma/sense/nuance
        if gldbex != "-":
            sense.add_feature(Feature("GLDBExample", gldbex))
        ## Basic Concepts
        if bc != "ZZ":
            sense.add_feature(Feature("basicConcept", bc))  # lemma/sense/nuance
        ## LexiQuest
        lexiquest = lexiquest.strip()
        for c in lexiquest.split("@"):
            sense.add_feature(Feature("class", c))
        ## Codes(?): number of links between an usyn construction and
        ## corresponding semu specifications
        usynsemu = usynsemu.strip()
        if usynsemu != "-":
            if usynsemu[0] == "p":
                usynsemu = usynsemu[1:]
            sense.add_feature(Feature("numberOfUsynSemuLinks", usynsemu))
        ## Arguments
        if args != "-":
            args = simple_argmap[args]
            sense.add_feature(Feature("numberOfArguments", args))
        ## Argument realisations:
        argreal = argreal.strip()
        if argreal != "aa_00":
            # for ar in argreal.split("_OR_"):  <-- This can be realised in the future
            sense.add_feature(Feature("argumentRealisation", argreal))
        ## Predicate for noun
        if predfornoun != "-":
            sense.add_feature(Feature("predicate", predfornoun))
        ## Type of verb_noun: l_n (verb nominalisation); l_ag (agent
        ## nominalisation); l_pa (process nominalisation); otherwise l_00
        if verbnoun != "-" and verbnoun != "l_00":
            verbnoun = verbnoun[2:]
            sense.add_feature(Feature("verbalizedNounType", verbnoun))
        lemma = Lemma()
        le.lemma = lemma
        fr = FormRepresentation()
        lemma.add_form_representation(fr)
        fr.add_feature(Feature("partOfSpeech", pos))
        fr.add_feature(Feature("writtenForm", baseform))
        fr.add_feature(Feature("paroleID", paroleid))
        if saldo != "-":
            for s in saldo.split(";"):
                le.add_saldoLink(SaldoLink(s))
                sense.add_feature(Feature("saldoSense", s))
        lmf.add_lexical_entry(le)
    return str(lmf)


# ---------- KELLY -----------------------------------------

# Kelly part-of-speech label -> SALDO part-of-speech tag.
kelly_to_saldo = {'verb': 'vb',
                  'noun': 'nn',
                  'noun-en': 'nn',
                  'noun-ett': 'nn',
                  'noun-en/-ett': 'nn',
                  'adjective': 'av',
                  'numeral': 'nl',
                  'proper name': 'pm',
                  'adverb': 'ab',
                  'aux verb': 'vb',
                  'conj': 'kn',
                  'det': 'pn',
                  'interj': 'in',
                  'particip': 'vb',
                  'particle': 'ab',
                  'prep': 'pp',
                  'pronoun': 'pn',
                  'subj': 'sn'}


def map_kelly_pos_to_saldo(pos):
    """Map a Kelly POS label to the SALDO tag; unknown labels pass through."""
    return kelly_to_saldo.get(pos, pos)


def kelly_data():
    return read_csv(num_of_fields=10, tolerates=9)


def kelly():
    """Render the Kelly word list as an LMF document (string)."""
    lmf = LMF('swe')
    lmf.useNamespace = True
    entries = kelly_data()
    for (id_num, raw, wpm, cefr, source, grammar, baseform, saldo, pos, example) in entries:
        saldo = saldo.strip()
        # A parenthesised suffix in the baseform is split off as extra info.
        if "(" in baseform:
            extrainfo = "(" + baseform.split("(")[1]
            baseform = baseform.split("(")[0].strip()
        else:
            extrainfo = None
        le = LexicalEntry()
        sense = Sense("kelly--" + baseform)
        le.add_sense(sense)
        for s in saldo.split():
            le.add_saldoLink(SaldoLink(s.strip()))
            sense.add_feature(Feature("saldoSense", s))
        lemma = Lemma()
        le.lemma = lemma
        form_representation = FormRepresentation()
        lemma.add_form_representation(form_representation)
        form_representation.add_feature(Feature("writtenForm", baseform))
        if extrainfo:
            form_representation.add_feature(Feature("formInformation", extrainfo))
        form_representation.add_feature(Feature("partOfSpeech", map_kelly_pos_to_saldo(pos)))
        form_representation.add_feature(Feature("kellyPartOfSpeech", pos))
        form_representation.add_feature(Feature("kellyIdentifier", id_num))
        form_representation.add_feature(Feature("raw", raw))
        form_representation.add_feature(Feature("wpm", wpm))
        form_representation.add_feature(Feature("cefr", cefr))
        form_representation.add_feature(Feature("source", source))
        if grammar:  # Maybe both grammar and example should always be there but empty?
            form_representation.add_feature(Feature("grammar", grammar))
        if example:
            # Maybe "e.g. " in the start of the sentences should be deleted?
            form_representation.add_feature(Feature("example", example))
        lmf.add_lexical_entry(le)
    return str(lmf)


# ---------- WORDNET ---------------------------------------
# Wordnet is a little special because it operates on the files
# 'wn3_synsets.txt' and 'wordnet-saldo.txt' already in the directory.

# WordNet POS letter -> SALDO POS tag.
# adjective satellites (s) can also be numerals etc.
saldo_pos_from_wordnet = {"n": "nn",
                          "s": "av",
                          "v": "vb",
                          "r": "ab",
                          "a": "av"}


def wordnet_data():
    """Join 'wordnet-saldo.txt' entries with their 'wn3_synsets.txt' synsets;
    return a dict keyed by synset id."""
    synsets = {}
    with codecs.open('wn3_synsets.txt', encoding='utf-8') as f:
        for line in f:
            e = [x.encode("utf-8") for x in line[:-1].split('\t')]
            synsets[e[0]] = {}
            syn = synsets[e[0]]
            syn["gloss"] = e[2]
            syn["pos"] = e[3]
            # The definition may contain one or more examples, grabbed later!
            # FORM: definition; "[example]"; "[example]" ...
            syn["definition"] = e[4]
    entries = {}
    with codecs.open('wordnet-saldo.txt', encoding='utf-8') as f:
        for line in f:
            e = [x.encode("utf-8") for x in line[:-1].split('\t')]
            if not e[0] in entries:
                entries[e[0]] = {}
            ent = entries[e[0]]
            ent["saldo"] = e[1]
            ent["synset"] = e[0]
            ent["type"] = e[2]
            ent["core"] = e[6]
            ent["freq"] = e[4]
            ent["gloss"] = synsets[e[0]]["gloss"]
            ent["pos"] = synsets[e[0]]["pos"]
            ent["definition"] = synsets[e[0]]["definition"]
    return entries


def wordnet():
    """Render the Swedish WordNet mapping as an LMF document (string)."""
    lmf = LMF('swe')
    lmf.useNamespace = True
    objects = wordnet_data()
    for key in objects:
        item = objects[key]
        le = LexicalEntry()
        lmf.add_lexical_entry(le)
        sense = Sense("wordnet--" + key.replace(":", "_").replace("%", "_"))
        le.add_sense(sense)
        le.add_saldoLink(SaldoLink(item["saldo"]))
        sense.add_feature(Feature("saldoSense", item["saldo"]))
        lemma = Lemma()
        le.lemma = lemma
        fr = FormRepresentation()
        glosses = item["gloss"].split(", ")
        for gl in glosses:
            fr.add_feature(Feature("gloss", gl))
        fr.add_feature(Feature("partOfSpeech", saldo_pos_from_wordnet[item["pos"]]))
        fr.add_feature(Feature("wordnetPartOfSpeech", item["pos"]))
        # First ';'-separated chunk is the definition, the rest are examples.
        def_and_examples = [x.strip() for x in item["definition"].split(";")]
        if len(def_and_examples) >= 1:
            sense.add_feature(Feature("definition", def_and_examples[0]))
        if len(def_and_examples) > 1:
            for examp in def_and_examples[1:]:
                if examp != "":
                    sense.add_feature(Feature("example", dequote(examp)))
        sense.add_feature(Feature("synset", item["synset"]))
        sense.add_feature(Feature("type", item["type"]))
        sense.add_feature(Feature("core", item["core"]))
        sense.add_feature(Feature("frequency", item["freq"]))
        lemma.add_form_representation(fr)
    return str(lmf)


# ---------- CROSS PIVOT -----------------------------------
# Uses the raw material from dalin and fsv to make a cross pivot resource
# allowing searching on for example "brev" to find "bref" etc.

def cp_fsvbase_data():
    return read_csv_from_file("../fsv/fsv.txt", 10)


def cp_dalinbase_data():
    return read_csv_from_file("dalin_saldo.txt", 10)


def crosspivot():
    """Render the Dalin/FSV cross-pivot resource as an LMF document (string)."""
    lmf = LMF('swe')
    pivots = {}  # modern saldo lemgram -> [(category, old lemgram, match)]
    # Data from Dalin
    entries = cp_dalinbase_data()
    for (old_spelling, new_spelling, pos, dalin_gram, dalin_lemgram, le_type,
         pattern, saldo_lemgram, saldo_senses, skos) in entries:
        if saldo_lemgram != "--" and saldo_lemgram != "PRIM..1":
            if saldo_lemgram in pivots:
                pivots[saldo_lemgram].append(("_1800", dalin_lemgram, skos))
            else:
                pivots[saldo_lemgram] = [("_1800", dalin_lemgram, skos)]
    # Data from FSV
    entries = cp_fsvbase_data()
    for (old_spelling, new_spelling, pos, _, fsv_lemgram, le_type,
         pattern, saldo_lemgram, saldo_senses, skos) in entries:
        if saldo_lemgram != "--" and saldo_lemgram != "PRIM..1":
            if " " in saldo_lemgram:
                saldo_lemgrams = saldo_lemgram.split(" ")
            else:
                saldo_lemgrams = [saldo_lemgram]
            for sl in saldo_lemgrams:
                if not sl in pivots:
                    pivots[sl] = []
                pivots[sl].append(("old", fsv_lemgram, skos))
    for pivot in pivots:
        le = LexicalEntry()
        lemma = Lemma()
        le.lemma = lemma
        saldo_fr = FormRepresentation()
        saldo_fr.add_feature(Feature("category", "modern"))
        saldo_fr.add_feature(Feature("lemgram", pivot))
        lemma.add_form_representation(saldo_fr)
        for post in pivots[pivot]:
            fr = FormRepresentation()
            fr.add_feature(Feature("category", post[0]))
            fr.add_feature(Feature("lemgram", post[1]))
            fr.add_feature(Feature("match", post[2]))
            lemma.add_form_representation(fr)
        lmf.add_lexical_entry(le)
    return str(lmf)


# ---------- DALIN BASE MATERIAL ---------------------------

def dalinbase_data():
    return read_csv(num_of_fields=10)


def dalinbase():
    """Render the Dalin base material as an LMF document (string)."""
    lmf = LMF('swe')
    entries = dalinbase_data()
    for (old_spelling, new_spelling, pos, dalin_gram, dalin_lemgram, le_type,
         pattern, saldo_lemgram, saldo_senses, match_type) in entries:
        le = LexicalEntry()
        lemma = Lemma()
        le.lemma = lemma
        fr = FormRepresentation()
        fr.add_feature(Feature("lemgram", dalin_lemgram))
        fr.add_feature(Feature("oldSpelling", old_spelling))
        fr.add_feature(Feature("newSpelling", new_spelling))
        fr.add_feature(Feature("xref", le_type))
        fr.add_feature(Feature("partOfSpeech", pos))
        if pattern != "--":
            fr.add_feature(Feature("paradigm", pattern))
        lemma.add_form_representation(fr)
        lmf.add_lexical_entry(le)
    return str(lmf)


# ----------- SWEDBERG "FAKE" MORPHOLOGY -------------------

def swedbergm():
    """Read a Swedberg LMF XML document from stdin and re-emit it with one
    dummy ("prim") WordForm per written form."""
    total = []
    xml_tree = fromstring(sys.stdin.read())
    for entry in xml_tree.find("Lexicon"):
        lemma = entry.find("Lemma")
        baseforms = []
        lem = None
        pos = "prim"
        if lemma != None:
            freps = lemma.findall("FormRepresentation")
            if freps != None:
                for fr in freps:
                    allfeats = fr.findall("feat")
                    if allfeats != None:
                        for feat in allfeats:
                            if feat.attrib["att"] == "writtenForm":
                                baseforms.append(feat.attrib["val"].encode('utf-8'))
                            elif feat.attrib["att"] == "lemgram":
                                lem = feat.attrib["val"].encode('utf-8')
                            elif feat.attrib["att"] == "partOfSpeech":
                                pos = feat.attrib["val"].encode('utf-8')
        total.append((lem, baseforms, pos))
    lmf = LMF('swe')
    for (lemg, wflist, pos) in total:
        le = LexicalEntry()
        lemma = Lemma()
        le.lemma = lemma
        fr = FormRepresentation()
        lemma.add_form_representation(fr)
        if lemg != None:
            fr.add_feature(Feature("lemgram", lemg))
        # NOTE(review): indentation of this region was ambiguous in the
        # recovered source (a stray 'pass' was dropped); verify that the
        # paradigm feature is meant to be added once per entry.
        for wf in wflist:
            fr.add_feature(Feature("writtenForm", wf))
            wordform = WordForm()
            wordform.add_feature(Feature("writtenForm", wf))
            wordform.add_feature(Feature("msd", "prim"))
            le.add_wordform(wordform)
        if pos != "prim":
            fr.add_feature(Feature("partOfSpeech", pos))
        fr.add_feature(Feature("paradigm", "prim"))
        lmf.add_lexical_entry(le)
    return str(lmf)


# ----------- AKADEMISK ORDLISTA----------------------------

def ao_data():
    return read_csv(num_of_fields=3)


def ao():
    """Render the academic word list (rank order preserved) as an LMF
    document (string)."""
    # Swedish POS name -> SALDO POS tag.
    pos_conversion = {"adverb": "ab",
                      "substantiv": "nn",
                      "adjektiv": "av",
                      "verb": "vb",
                      "preposition": "pp",
                      "konjunktion": "kn",
                      "particip": "av",
                      "frågande/relativt_possesivuttryck": "pn",
                      "partikel": "pp",
                      "possessivuttryck": "pn",
                      "pronomen": "pn",
                      "subjunktion": "sn",
                      }
    lmf = LMF('swe')
    entries = ao_data()
    rank = 1  # input order == frequency rank
    for (ao_lemma, pos, saldo_id) in entries:
        le = LexicalEntry()
        lemma = Lemma()
        le.lemma = lemma
        fr = FormRepresentation()
        lemma.add_form_representation(fr)
        fr.add_feature(Feature("writtenForm", ao_lemma.strip()))
        lemgrams = saldo_id.split("|")
        for l in lemgrams:
            fr.add_feature(Feature("lemgram", l.strip()))
        fr.add_feature(Feature("partOfSpeech", pos_conversion[pos.strip()]))
        fr.add_feature(Feature("nativePartOfSpeech", pos.strip()))
        fr.add_feature(Feature("rank", str(rank)))
        sense = Sense("ao--" + ao_lemma.strip())
        le.add_sense(sense)
        rank += 1
        lmf.add_lexical_entry(le)
    return str(lmf)


# ----------------------------------------------------------

if __name__ == '__main__':
    if len(sys.argv) > 1:
        resource = sys.argv[1]
        # Dispatch on the resource name given on the command line.
        # (The recovered source started a fresh 'if' for 'saldo'; folded
        # into one elif chain -- same behavior, since no name matches twice.)
        if resource == 'saldo':
            print(saldo())
        elif resource == 'saldoe':
            print(saldo_examples())
        elif resource == 'swesaurus':
            print(swesaurus())
        elif resource == 'lwt':
            print(lwt())
        elif resource == 'parole':
            print(parole())
        elif resource == 'simple':
            print(simple())
        elif resource == 'kelly':
            print(kelly())
        elif resource == 'wordnet':
            print(wordnet())
        elif resource == 'crosspivot':
            print(crosspivot())
        elif resource == 'dalinbase':
            print(dalinbase())
        elif resource == 'swedbergm':
            print(swedbergm())
        elif resource == 'ao':
            print(ao())