#!/usr/bin/env python
# -*- coding: utf8 -*-
import sys
import re
import codecs
from xml.etree.ElementTree import ElementTree
from xml.etree.ElementTree import fromstring
from pprint import PrettyPrinter
# Placeholder constant; nothing in the visible part of this file reads it.
TODO = "todo"
#############################
# OUTPUT #
#############################
def recursivelyConvertIndexedStructure(obj):
    """Merge the "e" and "text" children of *obj* into one list ordered by "n".

    *obj* is a dict that may carry an "e" entry and/or a "text" entry, each
    holding either a single child dict or a list of child dicts.  Every child
    is expected to have a numeric "n" key giving its position.

    Returns a list of ``{"t": "e" | "text", "item": child}`` wrappers sorted
    by ``child["n"]``.  Each non-empty child is converted recursively, and
    when that yields anything the result is stored on the child itself under
    the "children" key (the input structure is mutated in place).
    """
    def as_list(value):
        # A lone child may be stored directly instead of inside a list.
        return value if isinstance(value, list) else [value]

    elements = as_list(obj["e"]) if "e" in obj else []
    texts = as_list(obj["text"]) if "text" in obj else []

    res = [{"t": "e", "item": child} for child in elements]
    res += [{"t": "text", "item": child} for child in texts]

    # Sort with a key function instead of a positional cmp function: the
    # ordering is the same (both sorts are stable), but this form also works
    # on Python 3, where list.sort() no longer accepts a comparator.
    res.sort(key=lambda entry: entry["item"]["n"])

    for entry in res:
        if entry["item"]:
            children = recursivelyConvertIndexedStructure(entry["item"])
            if children:
                entry["item"]["children"] = children
    return res
def indexedStructureSort(first, second):
    """cmp-style comparator ordering two wrapped items by their "n" index.

    Returns a negative number, zero or a positive number when *first* sorts
    before, equal to or after *second*.
    """
    left_index = first["item"]["n"]
    right_index = second["item"]["n"]
    return left_index - right_index
def branchToXML(structure):
    """Serialise an ordered branch structure into a single string.

    *structure* is a list of ``{"t": ..., "item": ...}`` wrappers.  Wrappers
    with ``t == "text"`` contribute their stripped "#text" (skipped when it is
    blank); all other wrappers are treated as elements and contribute either
    the serialisation of their "children" or, failing that, their own "#text".

    NOTE(review): every piece is wrapped in empty string literals, and the
    counter ``c`` and the element ``label`` are computed but never used.
    Presumably opening/closing markup that used them has been lost from these
    literals -- confirm against the intended output format.
    """
    out = ""
    c = 0  # running position of the current wrapper; not read anywhere below
    for obj in structure:
        if obj["t"] == "text":
            # Whitespace-only text nodes produce no output at all.
            if obj["item"]["#text"].strip() != "":
                out += '' + obj["item"]["#text"].strip() + '';
        else: # e
            label = obj["item"]["name"]  # element name; currently unused
            if "children" in obj["item"]:
                # Mixed content: recurse into the already ordered children.
                out += '' + branchToXML(obj["item"]["children"]) + '';
            else:
                # Leaf element: emit its text as-is (not stripped).
                if "#text" in obj["item"]:
                    out += '' + obj["item"]["#text"] + '';
        c += 1
    return out
def recReconstruct(arr):
    """Split a list of AST nodes into indexed text nodes and element nodes.

    Each node in *arr* is a dict with a "type" key.  Nodes whose type is
    "text" become ``{"#text": ..., "n": position}``; every other node becomes
    ``{"name": ..., "n": position}`` and has its "children" processed
    recursively.  An element whose children consist of exactly one text node
    gets that text directly as "#text"; otherwise its text children go under
    "text" and its element children under "e" (each only when non-empty).

    Returns a two-item list ``[text_nodes, element_nodes]``.
    """
    text_nodes = []
    element_nodes = []
    for position, node in enumerate(arr):
        if node["type"] == "text":
            text_nodes.append({"#text": node["text"], "n": position})
            continue

        element = {"name": node["name"], "n": position}
        element_nodes.append(element)
        child_texts, child_elements = recReconstruct(node["children"])

        if len(child_texts) == 1 and not child_elements:
            # Simple leaf: collapse the single text child into the element.
            element["#text"] = child_texts[0]["#text"]
            continue
        if child_texts:
            element["text"] = child_texts
        if child_elements:
            element["e"] = child_elements
    return [text_nodes, element_nodes]
def markupStringToBranch(instr):
    """Parse a bracket-markup string into an indexed branch structure.

    The markup uses ``[`` to open an element and ``]label`` to close it: the
    run of characters that directly follows a ``]`` (up to the next space or
    bracket) is the element's name.  Everything else is text.

    *instr* must be a string.  Returns a dict with an optional "text" entry
    (top-level text nodes) and an optional "e" entry (top-level elements), in
    the shape produced by ``recReconstruct``.

    Raises IndexError when a label appears with no open element.

    NOTE(review): only a label closes an element.  A ``]`` that is not
    followed by a label leaves the element open, so later content is nested
    inside it, and elements still open at the end of input are not part of
    the result.  Confirm this is intended.
    """
    TEXT, OPEN, CLOSE, LABEL = 0, 1, 2, 3

    # --- Tokenise -----------------------------------------------------------
    # Walk the string by index rather than popping from the front of a list,
    # which cost O(n) per character.
    tokens = []
    pos = 0
    end = len(instr)
    while pos < end:
        ch = instr[pos]
        pos += 1
        if ch == "[":
            tokens.append((OPEN, ch))
        elif ch == "]":
            tokens.append((CLOSE, ch))
            # The label is glued to the closing bracket.
            start = pos
            while pos < end and instr[pos] not in " []":
                pos += 1
            if pos > start:
                tokens.append((LABEL, instr[start:pos]))
        else:
            # Plain text runs until the next bracket of either kind.
            start = pos - 1
            while pos < end and instr[pos] not in "[]":
                pos += 1
            tokens.append((TEXT, instr[start:pos]))

    # --- Build the AST --------------------------------------------------------
    output = []
    e_stack = []
    for kind, value in tokens:
        if kind == TEXT:
            item = {"type": "text", "name": "", "text": value}
            if e_stack:
                e_stack[-1]["children"].append(item)
            else:
                output.append(item)
        elif kind == OPEN:
            e_stack.append({"type": None, "name": "", "children": []})
        elif kind == LABEL:
            # The label both names and closes the innermost open element.
            element = e_stack.pop()
            element["name"] = value
            if e_stack:
                e_stack[-1]["children"].append(element)
            else:
                output.append(element)
        # CLOSE tokens carry no information of their own.

    # --- Convert to the indexed text/element representation -------------------
    reconstructed = recReconstruct(output)
    outObj = {}
    if len(reconstructed[0]) > 0:
        outObj["text"] = reconstructed[0]
    if len(reconstructed[1]) > 0:
        outObj["e"] = reconstructed[1]
    return outObj
def print_as_new(tree, indent=0):
    """Render a nested token tree as text, one indented line per nesting level.

    *tree* is a dict whose "items" entry is a list of strings and nested tree
    dicts (or is empty/None).  Strings are concatenated as they are; each
    nested tree is rendered recursively one level deeper.  Items of any other
    type are ignored.

    Returns a newline, *indent* tab characters and the rendered content.
    """
    # Accept every text type: the previous exact ``unicode`` check silently
    # dropped byte strings and raised NameError on Python 3.
    try:
        text_types = (basestring,)
    except NameError:
        text_types = (str,)

    pieces = []
    for item in tree["items"] or ():
        if isinstance(item, text_types):
            pieces.append(item)
        elif isinstance(item, dict):
            pieces.append(print_as_new(item, indent + 1))
    return "\n" + "\t" * indent + "".join(pieces)
def print_as_markup(tree, indent=0):
    """Render a nested token tree as bracket markup with trailing role tags.

    *tree* is a dict whose "items" entry is a list of strings and nested tree
    dicts.  Nested trees are rendered recursively and wrapped in ``[`` ``]``.
    Strings at the top level (``indent == 0``) are copied unchanged.  Inside
    brackets, a string whose first word is upper case is read as
    "ROLE text": the role is moved behind the text as `` ~ROLE``.  If such a
    string holds only the role, the role is appended once after all other
    items of that bracket instead.

    Returns the markup string ("" when there are no items).

    NOTE(review): whitespace-only strings inside brackets produce no output,
    and only the last role-only string of a bracket is kept -- confirm both
    are intended.
    """
    # Accept every text type: the previous exact ``unicode`` check silently
    # dropped byte strings and raised NameError on Python 3.
    try:
        text_types = (basestring,)
    except NameError:
        text_types = (str,)

    items = tree["items"]
    if not items:
        return ""

    output = ""
    last = ""  # role-only string waiting to be appended after the loop
    for item in items:
        if isinstance(item, dict):
            output += "[" + print_as_markup(item, indent + 1) + "]"
        elif isinstance(item, text_types):
            if indent == 0:
                output += item
                continue
            words = item.split()
            if not words:
                continue
            if not words[0].isupper():
                # Ordinary text: keep it exactly as written.
                output += item
            elif len(words) <= 1:
                last = item
            else:
                output += " ".join(words[1:]) + " ~" + words[0]
    if last != "":
        output += " ~" + last.strip()
    return output
class LMF:
    """Top-level container: collects lexical entries for one language and
    renders them, one after another, as a single text document."""
    def __init__(self, lang):
        # lang: language identifier interpolated into the rendered document.
        self.lang = lang
        # Entries in insertion order; this is what __unicode__ renders.
        self.lexical_entries = []
        # "<_pos>.<_wf>" keys of the entries added so far (written by
        # add_lexical_entry, never read in the visible code).
        self._lexical_entries_set = set()
        # Not used anywhere in the visible code.
        self._le_senses = set()
    def add_lexical_entry(self, lexical_entry):
        """Append *lexical_entry* and record its "<_pos>.<_wf>" key."""
        self.lexical_entries.append(lexical_entry)
        self._lexical_entries_set.add(".".join([lexical_entry._pos, lexical_entry._wf]))
    def __unicode__(self):
        """Return the whole document: fixed header lines, every lexical
        entry rendered with unicode(), then fixed footer lines, joined by
        newlines.

        NOTE(review): the header/footer literals are empty, and the literal
        formatted with self.lang has no conversion specifier, so for a plain
        string lang the % operator raises TypeError as written.  Presumably
        the template text was lost -- confirm against the intended output
        format.
        """
        return "\n".join([
            '',
            '',
            '',
            '',
            ' ',
            '',
            '',
            ' ' % self.lang,
            "\n".join([unicode(e) for e in self.lexical_entries]),
            '',
            ''])
class LexicalEntry:
    """One lexical entry: a list of senses plus entry-level features."""
    def __init__(self):
        self.features = []
        self.senses = []
        # _pos and _wf are joined into a lookup key by LMF.add_lexical_entry.
        self._pos = ""
        self._wf = ""
        # Optional identifier; only used by __unicode__ when non-empty.
        self.idattr = ""
    def add_sense(self, sense):
        """Append *sense* to this entry."""
        self.senses.append(sense)
    def __unicode__(self):
        """Return the entry rendered as text: an opening line, the senses,
        the features and a closing line, joined by newlines.

        NOTE(review): the literals are empty, and the one formatted with
        self.idattr has no conversion specifier, so a non-empty string
        idattr raises TypeError as written.  Presumably the template text
        was lost -- confirm against the intended output format.
        """
        le_string = ''
        if(self.idattr):
            le_string = '' % (self.idattr)
        return "\n".join([
            le_string,
            '',
            '\n'.join([unicode(s) for s in self.senses]),
            '\n'.join([unicode(f) for f in self.features]),
            ''])
    def add_feature(self, feature):
        """Append *feature* to this entry."""
        self.features.append(feature)
class Feature:
    """An attribute/value pair attached to a sense or a lexical entry."""
    def __init__(self, att, val):
        self.att = att
        self.val = val
    def __unicode__(self):
        """Return the feature rendered as text; the value is passed through
        escape() first.

        NOTE(review): the format literal is empty, so formatting the
        two-item tuple raises TypeError as written.  Presumably the template
        text was lost -- confirm against the intended output format.
        """
        return '' % (self.att, escape(self.val))
class Sense:
    """One sense: an identifier plus relations, examples and features."""
    def __init__(self, sense):
        # sense: identifier interpolated into the rendered opening line.
        self.sense = sense
        self.relations = []
        self.sense_examples = []
        self.konst_examples = []
        self.features = []
    def add_sense_relation(self, sense_relation):
        """Append *sense_relation* to this sense."""
        self.relations.append(sense_relation)
    def add_sense_example(self, sense_example):
        """Append *sense_example* to this sense."""
        self.sense_examples.append(sense_example)
    def add_konst_example(self, konst_example):
        """Append *konst_example* to this sense."""
        self.konst_examples.append(konst_example)
    def add_feature(self, feature):
        """Append *feature* to this sense."""
        self.features.append(feature)
    def __unicode__(self):
        """Return the sense rendered as text: an opening line, then the
        relations, sense examples, konst examples and features (each group
        rendered with unicode() and joined by newlines), then a closing line.

        NOTE(review): the opening literal has no conversion specifier, so
        for a string self.sense the % operator raises TypeError as written.
        Presumably the template text was lost -- confirm against the
        intended output format.
        """
        construction_elements_block = ""  # never read
        return "\n".join(['' % (self.sense),
            "\n".join([unicode(rel) for rel in self.relations]),
            "\n".join([unicode(ex) for ex in self.sense_examples]),
            "\n".join([unicode(ke) for ke in self.konst_examples]),
            '\n'.join([unicode(f) for f in self.features]),
            ''
            ])
class KonstExample:
    """Holds one example; the main script passes in an already serialised
    string."""
    def __init__(self, example_parts):
        self.example_parts = example_parts
    def __unicode__(self):
        """Return example_parts formatted with '%s'."""
        return '%s' % self.example_parts
        # The string literal below follows the return statement and is never
        # evaluated.  Its text is a disabled implementation that rendered
        # example_parts as a list of "/freetext", "/leaf" and "/branch"
        # tuples.
        """ return '%s' % (self.unicodeRec())
def unicodeRec(self):
out = []
c = 0
for p in self.example_parts:
if p[0] == "/freetext":
out.append('%s' % (c, escapeContent(p[1])))
elif p[0] == "/leaf":
out.append('%s' % (p[1].get("name", ""), c, escapeContent(p[2])))
elif p[0] == "/branch":
out.append('%s' % (p[1].get("name", ""), c, KonstExample(p[2]).unicodeRec()))
c += 1
return "".join(out) """
def escape(s):
    """Return *s* with the five XML special characters replaced by entities.

    ``&``, ``'``, ``<``, ``>`` and ``"`` become ``&amp;``, ``&apos;``,
    ``&lt;``, ``&gt;`` and ``&quot;``, which makes the result safe inside a
    quoted attribute value as well as in element content.
    """
    # '&' must be handled first; otherwise the ampersands introduced by the
    # other replacements would be escaped a second time.
    s = s.replace('&', '&amp;')
    s = s.replace("'", '&apos;')
    s = s.replace('<', '&lt;')
    s = s.replace('>', '&gt;')
    return s.replace('"', '&quot;')
def escapeContent(s):
    """Return *s* escaped for use as XML element content.

    Only ``&``, ``<`` and ``>`` are replaced (by ``&amp;``, ``&lt;`` and
    ``&gt;``); quote characters are left alone because they need no escaping
    outside attribute values.
    """
    # '&' must be handled first so the entities added below stay intact.
    s = s.replace('&', '&amp;')
    s = s.replace('<', '&lt;')
    s = s.replace('>', '&gt;')
    return s
#############################
# INPUT #
#############################
def read_csv_file(filepath, num_of_fields, tolerates=-1):
    """Yield the rows of a UTF-8, tab-separated file as lists of fields.

    The first line is treated as a header and skipped.  A row is yielded when
    it has exactly *num_of_fields* fields.  A shorter row is padded with empty
    strings and yielded when it has at least *tolerates* fields; with the
    default ``tolerates=-1`` no short rows are accepted.  All other rows
    (too short, or longer than *num_of_fields*) are skipped silently.

    Raises whatever ``codecs.open`` raises when the file cannot be opened.
    """
    if tolerates == -1:
        tolerates = num_of_fields
    with codecs.open(filepath, encoding='utf-8') as f:
        for line_number, line in enumerate(f):
            if line_number == 0:
                continue  # header line
            # Strip only the line terminator.  Cutting the last character
            # unconditionally lost a real character on a final line without a
            # newline and left a stray '\r' on CRLF files.
            fields = line.rstrip('\r\n').split('\t')
            missing = num_of_fields - len(fields)
            if missing == 0:
                yield fields
            elif missing > 0 and len(fields) >= tolerates:
                yield fields + [""] * missing
#def handle_example(example):
# example_parts = []
# if example.text != None:
# first_text = example.text.strip()
# else:
# first_text = ""
# if first_text != "":
# example_parts.append(("/freetext", first_text))
# for part in example:
# inners = part.findall("e")
# if inners == None or len(inners) == 0:
# example_parts.append(("/leaf", part.attrib, part.text.strip() ))
# else:
# example_parts.append(("/branch", part.attrib, handle_example(part) ))
# thetail = part.tail
# if thetail != None:
# thetail = thetail.strip()
# if thetail != "":
# example_parts.append(("/freetext", thetail))
# return example_parts
def list_to_dict(alist):
    """Build a dict from *alist*, mapping each item's first element to its
    second element.

    Later items win when a key occurs more than once; elements beyond the
    second are ignored.
    """
    return dict((entry[0], entry[1]) for entry in alist)
# Script entry point: read the 14-column tab-separated file "swefn-db.csv",
# build one LexicalEntry with a single Sense per row, and print the whole
# LMF document as UTF-8.
if __name__ == '__main__':
    lmf = LMF('swe')
    for (fr, st, d, fr_example, fe_core_list, fe_noncore_list, fe_cmp_list, fe_cmp_example, lus, lus_new, notes, created_by, createdate, modifdate) in read_csv_file("swefn-db.csv", num_of_fields=14):
        le = LexicalEntry()
        s = Sense(u"swefn--" + fr)
        le.add_sense(s)
        s.add_feature(Feature(u"BFNID", fr))
        if st.strip() != "":
            s.add_feature(Feature(u"semanticType", st))
        if d.strip() != "":
            s.add_feature(Feature(u"domain", d))
        # Maps the short token used in example annotations to the full
        # element name; filled from the core and peripheral lists below and
        # read by repfunc when the examples are rewritten.
        local_fe_table = {"LU" : "LU"}
        # Core elements: comma-separated "Name (token)" items.
        for fe_core in fe_core_list.split(","):
            fe_core = fe_core.strip()
            if fe_core != "":
                fe_parts = fe_core.split()
                if len(fe_parts) == 2:
                    name = fe_parts[0].strip()
                    # Drop the first and last character of the second word
                    # (presumably the surrounding parentheses -- confirm).
                    token = fe_parts[1].strip()[1:-1]
                    local_fe_table[token] = name
                    s.add_feature(Feature(u"coreElement", name))
                elif "(" in fe_core:
                    # Not exactly two words: split on "(" instead and drop
                    # the last character of what follows it.
                    fe_parts = fe_core.split("(")
                    name = fe_parts[0].strip()
                    token = fe_parts[1].strip()[:-1]
                    local_fe_table[token] = name
                    s.add_feature(Feature(u"coreElement", name))
                else:
                    # NOTE(review): this adds the raw string, not a Feature
                    # object, so it is rendered verbatim -- confirm intent.
                    s.add_feature(fe_core.strip())
        # Peripheral elements: same format; note that this loop splits on a
        # single space rather than on any whitespace.
        for fe_per in fe_noncore_list.split(","):
            fe_per = fe_per.strip()
            if fe_per != "":
                fe_parts = fe_per.split(" ")
                if len(fe_parts) == 2:
                    name = fe_parts[0].strip()
                    token = fe_parts[1].strip()[1:-1]
                    local_fe_table[token] = name
                    s.add_feature(Feature(u"peripheralElement", name))
                elif "(" in fe_per:
                    fe_parts = fe_per.split("(")
                    name = fe_parts[0].strip()
                    token = fe_parts[1].strip()[:-1]
                    local_fe_table[token] = name
                    s.add_feature(Feature(u"peripheralElement", name))
                else:
                    # NOTE(review): raw string instead of a Feature, as in
                    # the core-element loop -- confirm intent.
                    s.add_feature(fe_per.strip())
        if fe_cmp_list.strip() != "":
            for compound in fe_cmp_list.split(","):
                compound = compound.strip()
                s.add_feature(Feature(u"compound", compound))
        if fe_cmp_example.strip() != "":
            for compound_example in fe_cmp_example.split(";;"):
                compound_example = compound_example.strip()
                s.add_feature(Feature(u"compoundExample", compound_example))
        # Lexical units: ";;"-separated categories, each optionally prefixed
        # with "<label>:"; only the part after the first colon is used.
        if lus.strip() != "":
            for lu_category in lus.split(";;"):
                if ":" in lu_category:
                    lu_category = lu_category.split(":")[1]
                for lu in lu_category.split(","):
                    lu = lu.strip()
                    if lu != "":
                        # Each unit is recorded on the sense and on the entry.
                        s.add_feature(Feature(u"LU", lu))
                        le.add_feature(Feature(u"saldoLink", lu))
        # Annotated examples: ";;"-separated strings with nested [ ... ]
        # annotations.
        if fr_example.strip() != "":
            for example in fr_example.split(";;"):
                example = escape(example.strip())
                # Put a space between a closing bracket and the punctuation
                # mark that directly follows it.
                example = re.sub(r'](\.|\?|\!|,)', r'] \1', example)
                # Pass 1: split into "[", "]" and text tokens while tracking
                # the bracket depth.
                level = 0
                tokens = []
                current = ""
                for i in range(0, len(example)):
                    char = example[i]
                    if char == "[":
                        if current != "":
                            tokens.append(current)
                            current = ""
                        level += 1
                        tokens.append('[')
                    elif char == "]":
                        if current != "":
                            tokens.append(current)
                            current = ""
                        tokens.append(']')
                        level -= 1
                    else:
                        current += char
                if current != "":
                    tokens.append(current)
                # Only examples whose opening and closing bracket counts
                # match are processed further.
                if level == 0:
                    # Pass 2: build a tree of {"parent", "items"} nodes.
                    root = {"parent" : None, "items" : []}
                    current = root
                    for i in range(0, len(tokens)):
                        token = tokens[i]
                        if token == '[':
                            new_tree = {"parent" : current, "items" : []}
                            current["items"].append(new_tree)
                            current = new_tree
                        elif token == ']':
                            current = current["parent"]
                        else:
                            current["items"].append(token)
                    # NOTE(review): this renders `current`, which is `root`
                    # again only when every "]" follows its "[" -- confirm
                    # that input such as "][" cannot occur.
                    markup = print_as_markup(current)
                    def repfunc(x):
                        # Move the role behind the bracket and expand it to
                        # the full element name when the token is known.
                        return "]" + local_fe_table.get(x.group(1), x.group(1))
                    new_markup = re.sub(r'\s\~(\S*)]', repfunc, markup)
                    s.add_konst_example(KonstExample(branchToXML(recursivelyConvertIndexedStructure(markupStringToBranch(new_markup)))))
                else:
                    pass  # bracket counts differ: the example is skipped silently
        if lus_new.strip() != "":
            for new_lu_category in lus_new.split(";;"):
                if ":" in new_lu_category:
                    new_lu_category = new_lu_category.split(":")[1]
                for new_lu in new_lu_category.split(","):
                    new_lu = new_lu.strip()
                    if new_lu != "":
                        s.add_feature(Feature(u"suggestionForLU", new_lu))
        if notes.strip() != "": # TODO: some notes are separated by ;; -- how should that be handled, analogous to konstruktikon?
            s.add_feature(Feature(u"comment", notes))
        s.add_feature(Feature(u"createdBy", created_by))
        s.add_feature(Feature(u"createdDate", createdate))
        s.add_feature(Feature(u"modifDate", modifdate))
        lmf.add_lexical_entry(le)
    print unicode(lmf).encode("utf-8")
def structify(input):
    """Recursively split *input* around its bracket pairs.

    Uses ``join_parts`` to cut *input* into ``[before, inside, after]`` at the
    first balanced top-level ``[`` ... ``]`` pair, then applies the same
    treatment to every part that still contains a ``[``.

    Returns the nested list, or *input* unchanged when it contains no
    balanced bracket pair.
    """
    parts = join_parts(input)
    if parts is None:
        # No balanced pair found.  Previously this fell through to
        # len(None) and raised TypeError.
        return input
    for position, part in enumerate(parts):
        if "[" in part:
            parts[position] = structify(part)
    return parts
def join_parts(input):
    """Cut *input* at its first balanced top-level bracket pair.

    *input* may be any sliceable sequence whose elements compare equal to
    '[' and ']' (a string or a list of tokens).

    Returns ``[before, inside, after]`` -- three slices of *input* that leave
    out the two brackets themselves -- or None when no such pair is found.
    """
    depth = 0
    open_at = None
    for position, symbol in enumerate(input):
        if symbol == '[':
            if depth == 0:
                open_at = position
            depth += 1
        elif symbol == ']':
            depth -= 1
            if depth == 0 and open_at is not None:
                before = input[:open_at]
                inside = input[open_at + 1:position]
                after = input[position + 1:]
                return [before, inside, after]
    return None
def conv(input):
    """Move a leading one-letter label out of the first bracket pair.

    Finds the first balanced top-level ``[`` ... ``]`` pair in the string
    *input*.  When the first word inside it is a single character, that
    character is treated as a label and moved behind the closing bracket:
    ``"see [A the cat] run"`` becomes ``"see [the cat]A run"``.

    Returns the rewritten string, or None when *input* has no balanced pair.

    NOTE(review): when the first word is longer than one character the
    bracket is emitted empty (its content is dropped) -- confirm this is
    intended.
    """
    depth = 0
    open_at = None
    for position, char in enumerate(input):
        if char == '[':
            if depth == 0:
                open_at = position
            depth += 1
        elif char == ']':
            depth -= 1
            if depth == 0 and open_at is not None:
                words = input[open_at + 1:position].split()
                label = ""
                other = ""
                # Guard against an empty or blank bracket pair, which used to
                # raise IndexError on words[0].
                if words and len(words[0]) == 1:
                    label = words[0]
                    other = " ".join(words[1:])
                return input[:open_at] + "[" + other + "]" + label + input[position + 1:]
    return None