# -*- coding: utf-8 -*-
'''
Lookup helpers for the Lexin lexicon: noun gender, polysemy and homonymy.

Created on Nov 29, 2016

@author: David
'''
import xml.etree.ElementTree as ET
import re
import sys

lexin_file = "siwoco/LEXIN.xml"

# Maps part-of-speech tag codes (e.g. "NN") to the labels that the ``pos``
# element of a Lexin entry is compared against (e.g. "subst.").
posMapping = {
    "AB": "adv.",
    "ABM": "adv.",
    "NN": "subst.",
    "NNM": "subst.",
    "VB": "verb",
    "VBM": "verb",
    # NOTE(review): lower-case variant kept as in the original table;
    # presumably covers a mis-cased tag in the input data -- confirm.
    "vB": "verb",
    "JJ": "adj.",
    "JJM": "adj.",
    "PP": "prep.",
    "PPM": "prep.",
    "IN": "interj.",
    "INM": "interj.",
    "PM": "namn",
    "PMM": "namn",
    "PN": "pron.",
    "PNM": "pron.",
    "DT": "obestämd artikel",
    "DTM": "obestämd artikel",
    "IE": "infinitivmärke",
    "IEM": "infinitivmärke",
    "HA": "adv.",
    "HAM": "adv.",
    "HD": "adv.",
    "HS": "adv.",
    "HP": "adv.",
    # Was "konk.": a typo that made KN disagree with KNM/SN/SNM ("konj.").
    "KN": "konj.",
    "KNM": "konj.",
    "SN": "konj.",
    "SNM": "konj.",
    "PL": "adv.",
    "PLM": "adv.",
    "RG": "räkn.",
    "RGM": "räkn.",
    "RO": "räkn.",
    "ROM": "räkn.",
    "X": "any",
}

# The lexicon is parsed once, eagerly, when the module is imported.
tree = ET.parse(source=lexin_file)
lemma_entries = tree.findall("lemma-entry")
root = tree.getroot()

# A ``pos`` text of the form "se <lemma>" refers to another entry's POS.
_SEE_REFERENCE = re.compile(r"^se (.+)$")


def _entries_with_form(form):
    """
    Yield each ``lemma-entry`` child of the root that has a ``form`` child
    whose complete text equals *form*.

    Equivalent to the path ``lemma-entry[form='...']`` but compares the text
    directly, so lemmas containing quote characters cannot break the
    path expression.
    """
    for entry in root.findall("lemma-entry"):
        for candidate in entry.findall("form"):
            if "".join(candidate.itertext()) == form:
                yield entry
                break


def _normalize_pos(pos):
    """
    Return the Lexin label for *pos*.

    Values that already are Lexin labels, and tags that cannot be mapped,
    are returned unchanged.
    """
    if pos in posMapping.values():
        return pos
    return __map_pos__(pos) or pos


def gender(lemma, pos):
    """
    Looks up the gender of a noun in Lexin

    The gender is guessed from the shape of the entry's ``inflection`` text
    (split on single spaces).

    Returns:
        1 for "ett", 0 for "en", 2 when seven forms are listed (both "ett"
        and "en"), and -1 when no entry or no usable inflection pattern is
        found.
    """
    # get_entry normalises the part of speech itself.
    entry = get_entry(lemma, pos)
    if (entry == -1):
        return -1

    inflection = entry.find("inflection")
    if (inflection is None or not inflection.text):
        # No inflectional information found.
        return -1
    inflections = inflection.text.split(" ")
    infl_count = len(inflections)

    # Compound lemmas ("a~b") are matched against the abbreviated form "-b".
    if ("~" in lemma):
        lemma = "-" + lemma.split("~")[1]

    # 1. only one form given: ends with t -> ett, else -> en
    if (infl_count == 1):
        if (inflections[0].endswith("t")):
            return 1
        return 0

    # 2. two forms: "ett"/"en" + lemma, or -n -r -> en
    if (infl_count == 2):
        first, second = inflections
        if (second == lemma and first == "ett"):
            return 1
        if (second == lemma and first == "en"):
            return 0
        if (first.endswith("n") and second.endswith("r")):
            return 0
        return -1

    # 3. three forms: -t lemma -n -> ett; -n lemma -a -> en
    if (infl_count == 3):
        first, second, third = inflections
        if (first.endswith("t") and second == lemma and third.endswith("n")):
            return 1
        if (first.endswith("n") and second == lemma and third.endswith("a")):
            return 0
        return -1

    # 7. seven elements -> both ett+en
    if (infl_count == 7):
        return 2
    return -1


def __map_pos__(pos):
    """
    Return the Lexin label for tag *pos*, or "" when the tag is unknown.

    Callers test the result for truthiness, so an unknown tag must not raise.
    """
    return posMapping.get(pos, "")


def get_pos(lemma):
    """
    Return the ``pos`` text of the first entry whose form equals *lemma*.

    Prints a notice when several entries match. Returns "X" when no entry
    with a ``pos`` element is found.
    """
    result = []
    for entry in _entries_with_form(lemma):
        pos_element = entry.find("pos")
        if (pos_element is None):
            continue
        result.append(pos_element.text)
    if (len(result) > 1):
        print("More than one POS found!")
    if (len(result) < 1):
        return "X"
    return result[0]


def get_entry(lemma, pos):
    """
    Retrieves an entry from Lexin

    *pos* may be a tag code from ``posMapping`` or a Lexin label; "any"
    (tag "X") matches every entry with the given form. An entry whose
    ``pos`` text is "se <other lemma>" matches when the referenced lemma's
    part of speech equals *pos*.

    Returns the matching element, or -1 when nothing matches or when an
    entry without a ``pos`` element is encountered. Exits the process via
    ``sys.exit()`` when more than one entry matches.
    """
    pos = _normalize_pos(pos)

    result = []
    for entry in _entries_with_form(lemma):
        if (pos == "any"):
            result.append(entry)
            continue

        pos_element = entry.find("pos")
        if (pos_element is None):
            # Original behaviour: a form match without POS aborts the lookup.
            return -1
        epos = pos_element.text

        if (epos == pos):
            result.append(entry)
        elif (epos):
            # handle POS references to other entries; an empty ``pos``
            # element has no text and can neither match nor refer.
            m = _SEE_REFERENCE.match(epos)
            if (m and get_pos(m.group(1)) == pos):
                result.append(entry)

    if (len(result) > 1):
        print("More than one entry found!")
        sys.exit()
    if (len(result) == 0):
        return -1
    return result[0]


def polysemy(lemma, pos):
    '''
    Returns the degree of polysemy of a lemma and part of speech

    This is the number of ``lexeme`` children of the matching entry, or 0
    when no entry is found.
    '''
    entry = get_entry(lemma, pos)
    if (entry == -1):
        return 0
    lexemes = entry.findall("lexeme")
    return len(lexemes)


def homonymy(lemma):
    """
    Returns the degree of homonymy of a lemma, regardless of part of speech

    Probes the numbered forms "<lemma> 1" .. "<lemma> 5" and counts the
    lookups made up to and including the first miss.

    NOTE(review): because the failed lookup is counted too, k numbered
    entries yield k + 1 for k < 5 but 5 for k == 5. This looks like an
    off-by-one; behaviour is kept as-is until the intended baseline is
    confirmed.
    """
    doh = 0
    for i in range(1, 6):
        e = get_entry(lemma + " " + str(i), "any")
        doh += 1
        if (e == -1):
            break
    return doh


if __name__ == "__main__":
    print(homonymy("akt"))