#!/usr/bin/env python
# -*- coding: utf8 -*-
import sys
import re
import codecs
import os
import glob
from xml.etree.ElementTree import ElementTree
from xml.etree.ElementTree import fromstring
def read_saldom(xml='lexikon/saldom/saldom.xml'):
    """Read the (sblex) XML version of SALDO's morphological lexicon.

    Parses the file incrementally (it is large) and returns a dict
    mapping (gf, pos) -> [saldo sense id, ...], e.g.
    ('hund', 'nn') -> ['hund..1'].
    """
    # Use the plain ElementTree iterparse: xml.etree.cElementTree is
    # deprecated and was removed in Python 3.9; the regular module is
    # C-accelerated where available.
    from xml.etree.ElementTree import iterparse
    # "start" events are needed so we can grab a reference to the root
    # element; clearing it periodically keeps memory use bounded.
    context = iter(iterparse(xml, events=("start", "end")))
    event, root = next(context)  # next() works on Python 2.6+ and 3.x
    possibilities = {}  # { (gf, pos): [saldo..1, saldo..2, ...] }
    for event, elem in context:
        if event == "end":
            if elem.tag == 'LexicalEntry':
                pos = elem.findtext("pos")
                gf = elem.findtext("gf")
                senses = possibilities.setdefault((gf, pos), [])
                for sitem in elem.findall("saldo"):
                    senses.append(sitem.text)
            # Done parsing section. Clear tree to save memory.
            if elem.tag in ['LexicalEntry', 'frame', 'resFrame']:
                root.clear()
    return possibilities
def read_languages():
output = {}
all_languages = [("russian", "orig_lexin/ryska"),
("albanian", "orig_lexin/albanska"),
("arabic", "orig_lexin/arabic"),
("bosnic", "orig_lexin/bosniska"),
("english", "orig_lexin/engelska"),
("finnish", "orig_lexin/finska"),
("greek", "orig_lexin/grekiska"),
("croatian", "orig_lexin/kroatiska"),
("northKurdish", "orig_lexin/nordkurdiska"),
("farsi", "orig_lexin/persiska"),
("serbian", "orig_lexin/serbiska"),
("serbianCyrillic", "orig_lexin/serbiska_kyrillisk"),
("somali", "orig_lexin/somaliska"),
("spanish", "orig_lexin/spanska"),
("southKurdish", "orig_lexin/sydkurdiska"),
("turkish", "orig_lexin/turkiska")] # TODO: add the others as well
wrong_keys = 0
for (lang_name, folder_path) in all_languages:
for infile in glob.glob( os.path.join(folder_path, '*.xml') ):
tree = ElementTree()
tree.parse(infile)
lexin_words = tree.findall("Word")
for lw in lexin_words:
item = {"comment": (), "idioms": [], "synonyms": [], "examples": [], "compounds": [], "trans": None}
#lexin_Value = lw.get("Value", "")
#lexin_Variant = lw.get("Variant", "")
#if ! lexin_Variant.isdigit():
# lexin_Variant = "1"
key = lw.get("VariantID")
if key == None:
wrong_keys += 1
continue
#key = (lexin_Value, lexin_Variant)
baseLang = lw.find("BaseLang")
targetLang = None
if lw.find("TargetLang") != None:
targetLang = lw.find("TargetLang")
if targetLang == None:
print "missing TargetLang"
continue
translation = targetLang.find("Translation")
if translation != None:
item["trans"] = translation.text
comment = targetLang.find("Comment")
if comment != None:
swe_comment = baseLang.find("Comment").text
item["comment"] = (swe_comment, comment.text)
for example in targetLang.findall("Example"):
example_id = example.get("ID")
for swe_example in baseLang.findall("Example"):
if example_id == swe_example.get("ID"):
item["examples"].append((swe_example.text, example.text))
break
for compound in targetLang.findall("Compound"):
compound_id = compound.get("ID")
for swe_compound in baseLang.findall("Compound"):
if compound_id == swe_compound.get("ID"):
item["compounds"].append((swe_compound.text, compound.text))
break
for idiom in targetLang.findall("Idiom"):
idiom_id = idiom.get("ID")
for swe_idiom in baseLang.findall("Idiom"):
if idiom_id == swe_idiom.get("ID"):
item["idioms"].append((swe_idiom.text, idiom.text))
break
for synonym in targetLang.findall("Synonym"):
item["synonyms"].append(synonym.text)
if not key in output:
output[key] = []
output[key].append((lang_name, item))
return output
#############################
# OUTPUT #
#############################
class LMF:
    """Top-level container for the generated LMF document.

    NOTE(review): the XML tag string literals in __unicode__ below appear
    to have been stripped from this copy of the file -- several bare ''
    entries, and a ' ' % self.lang format with no %s placeholder, which
    raises TypeError at runtime. Recover the original literals from
    version control before running.
    """
    def __init__(self, lang):
        self.lang = lang                  # language code for the Lexicon element
        self.lexical_entries = []
        self._lexical_entries_set = set() # fast duplicate lookup ("pos.wf" keys)
        self._le_senses = set()
        self.semantic_predicates = []
    def add_lexical_entry(self, lexical_entry):
        self.lexical_entries.append(lexical_entry)
        # Quick hack to speed things up a bit (replace with something better later)
        self._lexical_entries_set.add(".".join([lexical_entry._pos, lexical_entry._wf]))
    def add_semantic_predicate(self, semantic_predicate):
        self.semantic_predicates.append(semantic_predicate)
    def __unicode__(self):
        # Serialise the whole document: header lines, all lexical entries,
        # then all semantic predicates.
        return "\n".join([
            '',
            '',
            '',
            '',
            ' ',
            '',
            '',
            ' ' % self.lang,
            "\n".join([unicode(e) for e in self.lexical_entries]),
            "\n".join([unicode(s) for s in self.semantic_predicates]),
            '',
            ''])
class LexicalEntry:
    """One dictionary entry: lemma, inflected word forms, senses and
    entry-level features.

    NOTE(review): the XML tag literals in __unicode__ appear stripped in
    this copy ('' % (self.idattr) has no placeholder and raises TypeError
    when idattr is set). The name 'le_unicodeing' also looks like a
    mangled 'le_string' from an automated str->unicode rename.
    """
    def __init__(self):
        self.features = []
        self.lemma = None    # a Lemma instance, assigned by the caller
        self.wordforms = []
        self.senses = []
        self._pos = ""       # cached part of speech, read by LMF's dedup hack
        self._wf = ""        # cached written form, read by LMF's dedup hack
        self.idattr = ""     # optional id attribute for the serialised entry
    def add_sense(self, sense):
        self.senses.append(sense)
    def add_feature(self, feature):
        self.features.append(feature)
    def add_feature_unique(self, feature):
        # Skip insertion when an identical (att, val) feature already exists.
        for existing_feature in self.features:
            if(existing_feature.att == feature.att and existing_feature.val == feature.val):
                return
        self.add_feature(feature)
    def add_wordform(self, wordform):
        self.wordforms.append(wordform)
    def __unicode__(self):
        le_unicodeing = ''
        if(self.idattr):
            le_unicodeing = '' % (self.idattr)
        return "\n".join([
            le_unicodeing,
            '\n'.join([unicode(f) for f in self.features]),
            unicode(self.lemma),
            '\n'.join([unicode(w) for w in self.wordforms]),
            '\n'.join([unicode(s) for s in self.senses]),
            ''])
"""
class Lemma:
def __init__(self):
self.features = [] # now including writtenForm and partOfSpeech!
def add_feature(self, feature):
self.features.append(feature)
def add_feature_unique(self, feature):
for existing_feature in self.features:
if(existing_feature.att == feature.att and existing_feature.val == feature.val):
return
self.add_feature(feature)
def __unicode__(self):
if self.features:
return "\n".join(['\n',
'\n'.join([unicode(f) for f in self.features]),
'\n'])
else:
return ''
"""
class Lemma:
    """The lemma of a lexical entry, holding its form representations.

    Only form_representations are serialised; the features list is kept
    (and still filled by callers) but is no longer emitted -- see the
    commented-out code in __unicode__.

    NOTE(review): the surrounding XML tag literals ('') in __unicode__
    appear to have been stripped from this copy of the file.
    """
    def __init__(self):
        self.form_representations = []
        self.features = []  # now including writtenForm and partOfSpeech!
    def add_feature(self, feature):
        self.features.append(feature)
    def add_feature_unique(self, feature):
        # Skip insertion when an identical (att, val) feature already exists.
        for existing_feature in self.features:
            if(existing_feature.att == feature.att and existing_feature.val == feature.val):
                return
        self.add_feature(feature)
    def add_form_representation(self, form_representation):
        self.form_representations.append(form_representation)
    def __unicode__(self):
        if self.features or self.form_representations:
            return "\n".join(['', '\n'.join(unicode(fr) for fr in self.form_representations),''])
        #return "\n".join(['\n',
        #    '\n'.join([unicode(f) for f in self.features]),
        #    '\n'])
        else:
            return ''
class WordForm:
    """One inflected form of an entry, described purely by its features."""

    def __init__(self):
        self.features = []

    def add_feature(self, feature):
        self.features.append(feature)

    def __unicode__(self):
        # Join all features into one inner block, then wrap it between the
        # (currently empty) opening and closing literals.
        feature_block = '\n'.join(unicode(feat) for feat in self.features)
        return '\n'.join(['', feature_block, ''])
class FormRepresentation:
    """A written/phonetic representation of a lemma, as a feature bundle."""

    def __init__(self):
        self.features = []

    def add_feature(self, feature):
        self.features.append(feature)

    def add_feature_unique(self, feature):
        """Add feature unless an identical (att, val) pair is present."""
        if any(f.att == feature.att and f.val == feature.val
               for f in self.features):
            return
        self.add_feature(feature)

    def __unicode__(self):
        if not self.features:
            return ''
        inner = '\n'.join(unicode(feat) for feat in self.features)
        return '\n'.join(['', inner, ''])
class Feature:
    """A single attribute/value pair (an LMF feat element)."""
    def __init__(self, att, val):
        self.att = att
        self.val = val
    def __unicode__(self):
        # NOTE(review): the format literal appears stripped in this copy
        # ('' has no %s placeholders, so this raises TypeError when
        # called) -- restore the original tag string before running.
        return '' % (self.att, escape(self.val))
class Sense:
    """One sense of a lexical entry, with relations, predicative
    representations, examples and features.

    NOTE(review): the XML tag literals in __unicode__ appear stripped in
    this copy ('' % (self.sense) has no placeholder and raises TypeError
    when called).
    """
    def __init__(self, sense):
        self.sense = sense  # the sense identifier string
        self.relations = []
        self.predicative_representations = []
        self.sense_examples = []
        self.features = []
    def add_feature(self, feature):
        self.features.append(feature)
    def add_sense_relation(self, sense_relation):
        self.relations.append(sense_relation)
    def add_predicative_representation(self, predicative_representation):
        self.predicative_representations.append(predicative_representation)
    def add_sense_example(self, sense_example):
        self.sense_examples.append(sense_example)
    def __unicode__(self):
        # Compact (self-closing) form when the sense has no children.
        if not self.relations and not self.predicative_representations and not self.sense_examples and not self.features:
            return '' % (self.sense)
        else:
            return "\n".join(['' % (self.sense),
                "\n".join([unicode(pre) for pre in self.predicative_representations]),
                "\n".join([unicode(rel) for rel in self.relations]),
                "\n".join([unicode(ex) for ex in self.sense_examples]),
                "\n".join([unicode(f) for f in self.features]),
                ''
                ])
class SenseRelation:
    """A typed relation from one sense to a target sense.

    NOTE(review): the XML tag literals in __unicode__ appear stripped in
    this copy ('' % ... formats have no placeholders and raise TypeError
    when called).
    """
    def __init__(self, target, relation_types):
        self.target = target                  # id of the related sense
        self.relation_types = relation_types  # list of relation type labels
        self.features = []
    def add_feature(self, feature):
        self.features.append(feature)
    def __unicode__(self):
        return "\n".join(['' % (self.target),
            '\n'.join(['' % t for t in self.relation_types]),
            '\n'.join([unicode(f) for f in self.features]),
            ''
            ])
class SenseExample:
    """An example/idiom/compound attached to a sense (the kind is stored
    as a 'type' feature by the caller).

    NOTE(review): the XML tag literals in __unicode__ appear stripped in
    this copy ('' % (...) has no placeholder and raises TypeError when
    called).
    """
    def __init__(self, example):
        self.example = example  # the Swedish example text
        self.features = []
    def add_feature(self, feature):
        self.features.append(feature)
    def __unicode__(self):
        return "\n".join([
            '',
            '' % (escape(self.example)),
            "\n".join([unicode(f) for f in self.features]),
            ''
            ])
class SemanticPredicate:
    """A semantic predicate (frame) with types, arguments and features.

    A non-empty domain is stored as an ordinary 'domain' feature.

    NOTE(review): the XML tag literals in __unicode__ appear stripped in
    this copy ('' and '\\n' %-formats with missing placeholders raise
    TypeError when called) -- restore them before generating output.
    """
    def __init__(self, id, domain, semantic_types):
        self.id = id
        self.semantic_types = semantic_types
        self.semantic_arguments = []
        self.features = []
        if domain != None and domain != "":
            self.add_feature(Feature("domain", domain))
    def add_semantic_argument(self, argument):
        self.semantic_arguments.append(argument)
    def add_feature(self, feature):
        self.features.append(feature)
    def generateFeatures(self, att, vals):
        """Add one feature per value in vals, all sharing attribute att."""
        for val in vals:
            # Bug fix: 'val.unicodeip()' was a mangled 'val.strip()'
            # (collateral damage from a str->unicode rename).
            self.add_feature(Feature(att, val.strip()))
    def __unicode__(self):
        # The original also built an unused 'extras' string from
        # semantic_types here; that dead code has been removed.
        return "\n".join([
            '' % (self.id),
            "\n".join(['\n' % (st) for st in self.semantic_types]),
            "\n".join([unicode(fe) for fe in self.features]),
            "\n".join([unicode(sa) for sa in self.semantic_arguments]),
            ''
            ])
class SemanticArgument:
    """One argument of a semantic predicate: a role and its core type.

    NOTE(review): the format literal in __unicode__ appears stripped in
    this copy ('' with two % arguments raises TypeError when called).
    """
    def __init__(self, semantic_role, core_type):
        self.semantic_role = semantic_role
        self.core_type = core_type
    def __unicode__(self):
        return '' % (self.semantic_role, self.core_type)
class PredicativeRepresentation:
    """Links a sense to a semantic predicate by its id.

    NOTE(review): the format literal in __unicode__ appears stripped in
    this copy ('' with two % arguments raises TypeError when called).
    """
    def __init__(self, idref):
        self.idref = idref  # id of the referenced SemanticPredicate
    def __unicode__(self):
        return '' % (self.idref, self.idref)
def escape(s):
    """XML-escape s for use in attribute values.

    Bug fix: the replacement strings had collapsed to the bare characters
    themselves (e.g. '&' -> '&'), making every replace a no-op; the
    standard XML entities are restored here. '&' must be replaced first
    so the other entities are not double-escaped.
    """
    s = s.replace('&', '&amp;')
    s = s.replace("'", '&apos;')
    s = s.replace('<', '&lt;')
    s = s.replace('>', '&gt;')
    return s.replace('"', '&quot;')
def escapeContent(s):
    """XML-escape s for use in element content (quotes stay literal).

    Bug fix: as in escape(), the entity strings had collapsed to the
    bare characters; the standard entities are restored, with '&'
    replaced first to avoid double-escaping.
    """
    s = s.replace('&', '&amp;')
    s = s.replace('<', '&lt;')
    s = s.replace('>', '&gt;')
    return s
#############################
# INPUT #
#############################
def take(entry, tagname, resort):
    """Return the stripped text of entry's first <tagname> child, or
    resort when the child is missing or has no text.

    Bug fix: 't.unicodeip()' was a mangled 't.strip()' (collateral
    damage from a str->unicode rename); whitespace-stripping restored.
    """
    e2 = entry.find(tagname)
    if e2 is None:
        return resort
    t = e2.text
    if t is None:
        return resort
    return t.strip()
def list_to_dict(alist):
    """Build a dict from the first two elements of each item in alist.

    Later items win on duplicate keys, exactly like the original loop.
    (The original also shadowed the builtin 'dict'; the comprehension
    avoids that.)
    """
    return {item[0]: item[1] for item in alist}
# Lexin part-of-speech label -> SALDO tag, for single-word entries.
posHash = {
    u"subst." : u"nn",
    u"prep." : u"pp",
    u"pron." : u"pn",
    u"förk." : u"nna",
    u"subst. plural" : u"nn",
    u"verb" : u"vb",
    u"subst., ingen böjning" : u"nn",
    u"adj." : u"av",
    u"adv." : u"ab",
    u"interj." : u"in",
    u"räkn." : u"nl",
    u"förled" : u"",
    u"namn" : u"pm",
    u"adj., ingen böjning" : u"av",
    u"subst. bestämd form singular" : u"nn",
    u"konj." : u"kn"
}
# Lexin part-of-speech label -> SALDO multi-word tag (the 'm' variants),
# used when the entry's written form contains a space.
multiPosHash = {
    u"subst." : u"nnm",
    u"prep." : u"ppm",
    u"pron." : u"pnm",
    u"förk." : u"nna",
    u"subst. plural" : u"nnm",
    u"verb" : u"vbm",
    u"subst., ingen böjning" : u"nnm",
    u"adj." : u"avm",
    u"adv." : u"abm",
    u"interj." : u"inm",
    u"räkn." : u"nlm",
    u"förled" : u"",
    u"namn" : u"pmm",
    u"adj., ingen böjning" : u"avm",
    u"subst. bestämd form singular" : u"nnm",
    u"konj." : u"knm"
}
# Lexin inflection form label -> SALDO-style msd string (empty string
# when no mapping is known; '?' marks uncertain mappings).
lexinInflectionToMsd = {
    u"best.f.sing." : u"sg def nom",
    u"obest.f.pl." : u"pl indef nom",
    u"best.f.pl." : u"pl def nom",
    u"tform" : u"pos indef sg n nom", # ?
    u"aform" : u"pos indef pl nom", # ?
    u"infinitiv" : u"inf aktiv",
    u"imperfekt" : u"pret ind aktiv",
    u"supinum" : u"sup aktiv",
    u"perf.part." : u"",
    u"imperativ" : u"imper"
}
def lexinPosToSaldoPos(inpos, inword):
    """Map a Lexin part-of-speech label to a SALDO tag.

    Multi-word entries (inword contains a space) use the multi-word tag
    table. Returns (pos, extra), where pos is the SALDO tag or None for
    unknown labels, and extra is an inflection note (or None) carried by
    some Lexin labels.
    """
    if " " in inword:
        pos = multiPosHash.get(inpos)
    else:
        pos = posHash.get(inpos)
    # Extra inflection information encoded in certain Lexin POS labels;
    # replaces the original chain of 'in [single-item-list]' tests.
    extra_by_label = {
        u"subst. plural": u"plural",
        u"subst., ingen böjning": u"oböjl",
        u"adj., ingen böjning": u"oböjl",
        u"subst. bestämd form singular": u"sing best",
        u"förled": u"förled",
    }
    return (pos, extra_by_label.get(inpos))
if __name__ == '__main__':
    # Build an LMF lexicon for Swedish from Lexin, linking entries to
    # SALDO senses and attaching translations from the other Lexin
    # languages, then print the document to stdout as UTF-8.
    global mode      # NOTE(review): declared but never assigned or read below
    global language  # NOTE(review): declared but never assigned or read below
    global saldom
    saldom = read_saldom()              # (writtenForm, pos) -> saldo sense ids
    extra_languages = read_languages()  # VariantID -> [(language, item), ...]
    tree = ElementTree()
    tree.parse("orig_lexin/svenska4/swe_swe.xml")
    lmf = LMF('swe')
    usedSensesCount = {}                # baseform -> number of senses emitted so far
    entries = tree.findall("Article")
    for entry in entries:
        e = LexicalEntry()
        lexin_lemma = entry.find("Lemma")
        saldo_link_set = set()          # NOTE(review): never used below
        lemma = Lemma()
        e.lemma = lemma
        fr = FormRepresentation()
        lemma.add_form_representation(fr)
        fr.add_feature(Feature("rawForm", lexin_lemma.get("Value")))
        lexin_pos = lexin_lemma.get("Type")
        pos = None
        if lexin_pos != None:
            fr.add_feature(Feature("lexinPartOfSpeech", lexin_pos))
            # Map the Lexin POS label to a SALDO tag (drop the 'extra' part).
            pos = lexinPosToSaldoPos(lexin_pos, lexin_lemma.get("Value"))[0]
            if pos != None:
                fr.add_feature(Feature("partOfSpeech", pos))
        infinitiv = None
        for lexin_inflection in lexin_lemma.findall("Inflection"):
            wordform = WordForm()
            lexin_text = lexin_inflection.text
            wordform.add_feature(Feature("writtenForm", lexin_text))
            lexin_form = lexin_inflection.get("Form")
            if lexin_form != None:
                if lexin_form == "infinitiv":
                    # Remember the infinitive; it becomes the written form
                    # for verbs further down.
                    infinitiv = lexin_text
                msd = lexinInflectionToMsd.get(lexin_form,"")
                wordform.add_feature(Feature("msd", msd))
                wordform.add_feature(Feature("lexinForm", lexin_form))
            if lexin_inflection.get("Spec") != None:
                wordform.add_feature(Feature("lexinSpec", lexin_inflection.get("Spec")))
            e.add_wordform(wordform)
        writtenForm = None
        if (pos == "vb" or pos == "vbm") and infinitiv != None:
            # we have to find the infinitive since Lexin is based on present tense
            writtenForm = infinitiv
        else:
            # '|' marks compound boundaries in Lexin; strip them.
            writtenForm = lexin_lemma.get("Value").replace("|", "")
        fr.add_feature(Feature("writtenForm", writtenForm))
        if pos != None and writtenForm != None:
            # Attach every SALDO sense that matches this form/POS pair.
            possible_saldo_senses = saldom.get((writtenForm, pos),[])
            #if len(possible_saldo_senses) == 0:
            #print writtenForm + " (" + pos + ")"
            for pss in possible_saldo_senses:
                e.add_feature(Feature("saldoLink", pss))
        lexin_phonetic = lexin_lemma.find("Phonetic")
        if lexin_phonetic != None:
            fr.add_feature(Feature("phoneticForm", lexin_phonetic.text))
        # Optional attributes are copied through only when non-empty.
        if lexin_lemma.get("Hyphenate", "") != "":
            fr.add_feature(Feature("hyphenatedForm", lexin_lemma.get("Hyphenate")))
        if lexin_lemma.get("Rank", "") != "":
            fr.add_feature(Feature("rank", lexin_lemma.get("Rank")))
        if lexin_lemma.get("ID", "") != "":
            fr.add_feature(Feature("lexinID", lexin_lemma.get("ID")))
        if lexin_lemma.get("Variant", "") != "":
            fr.add_feature(Feature("lexinVariant", lexin_lemma.get("Variant")))
        for lexin_reference in lexin_lemma.findall("Reference"):
            reftype = lexin_reference.get("Type")
            if reftype == "see":
                e.add_feature(Feature("see", lexin_reference.get("Value")))
            elif reftype == "compare":
                e.add_feature(Feature("compareWith", lexin_reference.get("Value")))
        for lexin_lexeme in lexin_lemma.findall("Lexeme"):
            # Each Lexin lexeme becomes one Sense with an id like
            # 'lexin--<baseform>..<running index>'.
            baseform = lexin_lemma.get("Value").replace("|", "")
            index = usedSensesCount.get(baseform, 0) + 1
            #if not index.isdigit():
            #    index = "1"
            #if usedSensesCount.get(baseform, 0) >= index:
            #    index = usedSensesCount.get(baseform, 0) + 1
            sense = Sense("lexin--" + baseform + ".." + str(index))
            lexin_lexemeno = lexin_lexeme.get("Lexemeno")
            lexin_variantID = lexin_lexeme.get("VariantID","")
            # Translations etc. for this lexeme in the other Lexin languages.
            langdata = extra_languages.get(lexin_variantID, [])
            sense.add_feature(Feature("lexinVariantID", lexin_variantID))
            if not lexin_lexemeno.isdigit():
                lexin_lexemeno = "1"
            sense.add_feature(Feature("lexinLexemeNumber", lexin_lexemeno))
            if baseform in usedSensesCount:
                usedSensesCount[baseform] = usedSensesCount[baseform] + 1
            else:
                usedSensesCount[baseform] = 1
            if lexin_lexeme.get("ID", None) != None:
                sense.add_feature(Feature("lexinID", lexin_lexeme.get("ID")))
            if lexin_lexeme.find("Definition") != None:
                sense.add_feature(Feature("definition", lexin_lexeme.find("Definition").text))
            for lexin_example in lexin_lexeme.findall("Example"):
                sense_example = SenseExample(lexin_example.text)
                sense_example.add_feature(Feature("type", "example"))
                # Attach the translated example from each language whose
                # Swedish side matches this example's text.
                for (lang, item) in langdata:
                    for (swe_ex, ex) in item["examples"]:
                        if ex != None:
                            if swe_ex.strip() == lexin_example.text.strip():
                                sense_example.add_feature(Feature(lang + "Text", ex))
                                break
                sense.add_sense_example(sense_example)
            for lexin_idiom in lexin_lexeme.findall("Idiom"):
                sense_example = SenseExample(lexin_idiom.text)
                sense_example.add_feature(Feature("type", "idiom"))
                if lexin_idiom.find("Definition") != None:
                    sense_example.add_feature(Feature("definition", lexin_idiom.find("Definition").text))
                for (lang, item) in langdata:
                    for (swe_idiom, idiom) in item["idioms"]:
                        if idiom != None:
                            # Drop any parenthesised suffix before comparing.
                            swe_idiom = swe_idiom.split(" (")[0]
                            #print "COMP " + swe_idiom.strip() + " --- " + lexin_idiom.text.strip()
                            if swe_idiom.strip() == lexin_idiom.text.strip():
                                #print "idiom: " + idiom
                                sense_example.add_feature(Feature(lang + "Text", idiom))
                                break
                sense.add_sense_example(sense_example)
            for lexin_compound in lexin_lexeme.findall("Compound"):
                sense_example = SenseExample(lexin_compound.text)
                sense_example.add_feature(Feature("type", "compound"))
                for (lang, item) in langdata:
                    for (swe_comp, comp) in item["compounds"]:
                        if comp != None:
                            #print "COMP " + swe_comp.strip() + " --- " + lexin_compound.text.strip().replace("|","")
                            if swe_comp.strip() == lexin_compound.text.strip().replace("|",""):
                                sense_example.add_feature(Feature(lang + "Text", comp))
                                break
                sense.add_sense_example(sense_example)
            for lexin_comment in lexin_lexeme.findall("Comment"):
                comment_type = lexin_comment.get("Type")
                if comment_type == "style":
                    sense.add_feature(Feature("usg", lexin_comment.text))
                elif comment_type == "def":
                    sense.add_feature(Feature("desc", lexin_comment.text))
            for lexin_reference in lexin_lexeme.findall("Reference"):
                reftype = lexin_reference.get("Type")
                if reftype == "see":
                    sense.add_feature(Feature("see", lexin_reference.get("Value")))
                elif reftype == "compare":
                    sense.add_feature(Feature("compareWith", lexin_reference.get("Value")))
                elif reftype == "antonym":
                    sense.add_feature(Feature("antonym", lexin_reference.get("Value")))
            for lexin_gramcom in lexin_lexeme.findall("Gramcom"):
                sense.add_feature(Feature("gram", lexin_gramcom.text))
            for lexin_graminfo in lexin_lexeme.findall("Graminfo"):
                sense.add_feature(Feature("gram", lexin_graminfo.text))
            for lexin_theme in lexin_lexeme.findall("Theme"):
                # 'Tema' is a comma-separated list; keep only the part
                # before any ':' in each element.
                for theme_string in lexin_theme.get("Tema").split(","):
                    sense.add_feature(Feature("lexinTheme", theme_string.split(":")[0].strip()))
            # Sense-level translations and synonyms from the other languages.
            for (lang, item) in langdata:
                if item["trans"] != None:
                    sense.add_feature(Feature(lang + "Translation", item["trans"]))
                if item["synonyms"] != None:
                    for syn in item["synonyms"]:
                        sense.add_feature(Feature(lang + "Synonyme", syn))
            e.add_sense(sense)
        lmf.add_lexical_entry(e)
    # Python 2 print statement: emit the whole LMF document as UTF-8.
    print unicode(lmf).encode("utf-8")