#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import re
import codecs
from xml.etree.ElementTree import ElementTree
from xml.etree.ElementTree import fromstring
# Tag name of placeholder elements; only referenced by the disabled legacy
# code that is kept in string literals further down in this file.
TODO = "todo"
# XML namespaces in "{uri}" form, prepended to tag names when searching the
# tree for namespaced elements.
KONST = "{http://spraakbanken.gu.se/swe/resurs/konstruktikon}"
KARP = "{http://spraakbanken.gu.se/eng/research/infrastructure/karp/karp}"
# Path of the XML file that the script parses when run as a program.
URL = "konstruktikon.xml"
class HTML:
    """Accumulates entries and renders them as one newline-joined document.

    Entries are stored in insertion order and converted with ``"%s"``
    formatting when the document itself is converted to a string.
    """
    def __init__(self):
        # Entries in the order they were added via add_entry().
        self.entries = []
    def __str__(self):
        # NOTE(review): most literals below are empty and one of them spans
        # two physical lines, so the surrounding page markup appears to be
        # missing from this copy of the file -- restore it from the original
        # before relying on this method.
        return "\n".join([
        '',
        '',
        '
',
        'Ett svenskt konstruktikon, utvecklingsversion',
        '',
        '',
        '',
        '',
        "\n".join(["%s" % (e) for e in self.entries]),
        '',
        ''])#.encode('utf-8')
    def add_entry(self, entry):
        """Append *entry* to the list of entries to render."""
        self.entries.append(entry)
class KonstruktikonEntry:
    """One entry of the constructicon together with its string rendering.

    The fields are filled in by the ``__main__`` section of this file;
    ``str(entry)`` produces the rendered form, whose layout depends on the
    module-level globals ``mode`` and ``language``.
    """
    def __init__(self):
        self.typee = ""
        self.cat = ""
        self.inheritance = ""
        self.evokes = ""
        self.definition = None  # an Example once set_definition() is called
        self.structure = ""
        self.cee = []  # list of strings, joined with spaces when rendered
        self.coll = []  # list of strings, joined with spaces when rendered
        self.construction_elements_internal = []  # ConstructionElement objects
        self.construction_elements_external = []  # ConstructionElement objects
        self.examples = []  # Example objects
        # Either an Example (via set_comment) or a plain string; the
        # __main__ section assigns an escaped string directly.
        self.comment = None
        self.status = ""
        self.reference = ""
        self.id = ""  # __str__ splits this on "--" and uses the second part
        self.illustration = ""
    def add_example(self, contents):
        """Wrap *contents* in an Example and append it to the examples."""
        self.examples.append(Example(contents))
    def set_definition(self, contents):
        """Wrap *contents* in an Example and store it as the definition."""
        self.definition = Example(contents)
    def set_comment(self, contents):
        """Wrap *contents* in an Example and store it as the comment."""
        self.comment = Example(contents)
    def add_internal_construction_element(self, contents):
        """Append a ConstructionElement built from *contents* (internal list)."""
        self.construction_elements_internal.append(ConstructionElement(contents))
    def add_external_construction_element(self, contents):
        """Append a ConstructionElement built from *contents* (external list)."""
        self.construction_elements_external.append(ConstructionElement(contents))
    def __str__(self):
        """Render the entry.

        Reads the globals ``mode`` (1 selects the shorter rendering, anything
        else the full one) and ``language`` (0 selects English labels,
        anything else Swedish ones).

        NOTE(review): several format literals below span two physical lines
        and have placeholder counts that do not match their argument tuples
        (e.g. ``'' % (self.id, ...)``), so the markup they contained appears
        to be missing from this copy -- restore it from the original file.
        """
        global mode
        global language
        if self.illustration != "":
            illustration_html = ' - %s' % (self.illustration)
        else:
            illustration_html = ''
        # Row labels in the selected language.
        if language == 0: # english
            l_comment = "comment"
            l_structure = "structure"
            l_examples = "examples"
            l_coll = "common words"
            l_status = "status"
            l_cee = "construction evoking elements"
            l_cat = "category"
            l_evokes = "frame evoked"
        else: # swedish
            l_comment = "kommentar"
            l_structure = "struktur"
            l_examples = "exempel"
            l_coll = "vanliga ord"
            l_status = "status"
            l_cee = "cee"
            l_cat = "kategori"
            l_evokes = "frame evoked"
        if mode == 1:
            # Shorter rendering: definition, structure, common words,
            # examples and comment only.
            examples = ""
            if len(self.examples) != 0:
                #print self.examples[1]
                examples = '%s | |
' % (l_examples, "".join( [ ('%s' % (example)) for example in self.examples]))
            internal = ""
            s = "\n".join([
            # self.id must contain "--", otherwise the [1] lookup raises IndexError.
            '' % (self.id, self.id.split("--")[1], illustration_html) ,
            '' % (self.id),
            'definition | %s |
' % (self.definition),
            cond(l_structure, self.structure),
            cond(l_coll, " ".join(self.coll)),
            examples,
            '%s | %s |
' % (l_comment, self.comment),
            '
'])
            return s#.encode('utf-8')
        else:
            # Full rendering: every populated field, including the internal
            # and external construction elements.
            examples = ""
            if len(self.examples) != 0:
                #print self.examples[1]
                examples = 'examples | |
' % ("".join( [ ('%s' % (example)) for example in self.examples]))
            internal = ""
            if len(self.construction_elements_internal) != 0:
                internal = 'internal construction elements | |
' % ("".join( [ ('%s' % (elem)) for elem in self.construction_elements_internal]))
            external = ""
            if len(self.construction_elements_external) != 0:
                external = 'external construction elements | |
' % ("".join( [ ('%s' % (elem)) for elem in self.construction_elements_external]))
            s = "\n".join([
            # self.id must contain "--", otherwise the [1] lookup raises IndexError.
            '' % (self.id, self.id.split("--")[1], illustration_html) ,
            '' % (self.id),
            cond("type", self.typee),
            cond(l_cat, self.cat),
            cond(l_evokes, self.evokes),
            'definition | %s |
' % (self.definition or ""),
            cond(l_structure, self.structure),
            linked_cond("inheritance", self.inheritance),
            cond(l_cee, " ".join(self.cee)),
            cond(l_coll, " ".join(self.coll)),
            internal,
            external,
            examples,
            cond(l_comment, self.comment),
            cond("reference", self.reference),
            cond("status", self.status),
            '
'])
            return s#.encode('utf-8')
def linked_cond(label, item):
    """Render a labelled row for *item*, or '' when *item* is empty or None.

    Sense references in *item* (text matching ``\\w*\\.\\.\\d+``) are first
    rewritten by ``sensify``.
    """
    if(item != "" and item != None):
        regexp = re.compile(r'\w*\.\.\d+', re.UNICODE)
        item = re.sub(regexp, sensify, item)
        # NOTE(review): the literal below spans two physical lines and has
        # two placeholders for three arguments; the markup that used the
        # second ``item`` appears to be missing from this copy -- restore it
        # from the original file.
        return '%s | %s |
' % (label, item, item)
    else:
        return ''
def cond(label, item):
    """Render a labelled row for *item*, or '' when *item* is empty or None.

    Sense references in *item* (text matching ``\\w*\\.\\.\\d+``) are first
    rewritten by ``sensify``.
    """
    if(item != "" and item != None):
        regexp = re.compile(r'\w*\.\.\d+', re.UNICODE)
        item = re.sub(regexp, sensify, item)
        # NOTE(review): the literal below spans two physical lines, so the
        # row markup appears to be missing from this copy -- restore it from
        # the original file.
        return '%s | %s |
' % (label, item)
    else:
        return ''
def sensify(matchobj):
    """Substitution callback for ``re.sub`` that rewrites a sense reference.

    The matched text is split on ".."; when that yields exactly two pieces
    they are formatted together, otherwise the matched text is returned
    unchanged.
    """
    matched = matchobj.group(0)
    pieces = matched.split("..")
    if len(pieces) != 2:
        return matched
    return '%s%s' % (pieces[0], pieces[1])
def escape(s):
    """Return *s* with the HTML-special characters replaced by entities.

    Replaces ``&``, ``'``, ``<``, ``>`` and ``"``. The ampersand is handled
    first so that the entities produced by the later replacements are not
    escaped a second time.

    NOTE(review): the apostrophe is written as the numeric reference
    ``&#39;``; confirm this matches what downstream consumers expect.
    """
    replacements = (
        ('&', '&amp;'),  # must stay first, see docstring
        ("'", '&#39;'),
        ('<', '&lt;'),
        ('>', '&gt;'),
        ('"', '&quot;'),
    )
    for char, entity in replacements:
        s = s.replace(char, entity)
    return s
class ConstructionElement:
    """A construction element described by (attribute, value) pairs."""

    def __init__(self, contents):
        self.contents = contents # [("name", "Activity"), ("cat", "vb")]

    def __str__(self):
        """Render as ``<name>: att=value att=value ...``.

        The "name" pair supplies the leading label (the last one wins), the
        attribute "aux" is shown as "other", and sense references inside
        values are rewritten with ``sensify``.
        """
        sense_pattern = re.compile(r'\w*\.\.\d+', re.UNICODE)
        label = ""
        rendered = []
        for key, val in self.contents:
            if key == "name":
                label = val
                continue
            if key == "aux":
                key = "other"
            rendered.append(key + "=" + re.sub(sense_pattern, sensify, val))
        return label + ": " + " ".join(rendered)
class Example:
    """Annotated text held as a list of part tuples.

    Parts are ``("/freetext", text)``, ``("/leaf", attrs, text)`` or a
    three-item branch ``(tag, attrs, children)`` whose children use the
    same shapes.
    """

    def __init__(self, contents):
        self.contents = contents

    def __str__(self):
        return self.make_string(self.contents)

    def make_string(self, contents):
        """Render *contents* as one string, pieces separated by spaces.

        Two-item parts are emitted only when tagged "/freetext"; parts of
        any length other than two or three are ignored.
        """
        pieces = []
        for part in contents:
            size = len(part)
            if size == 2:
                if part[0] == "/freetext":
                    pieces.append(escape(part[1]))
                continue
            if size != 3:
                continue
            if part[0] == "/leaf":
                pieces.append('[' + escape(part[2]) + ']' + escape(part[1].get("name", "")) + '')
                continue
            # Anything else with three items is a branch: the opening
            # bracket, each child and the closing bracket are separate pieces.
            pieces.append('[')
            for child in part[2]:
                if len(child) == 2:
                    # Two-item children are free text whatever their tag.
                    pieces.append(escape(child[1]))
                    continue
                if child[0] == "/branch":
                    inner = self.make_string(child[2])
                else:
                    inner = escape(child[2])
                pieces.append('[' + inner + ']' + escape(child[1].get("name", "")) + '')
            pieces.append(']' + escape(part[1].get("name", "")) + '')
        return " ".join(pieces)
#class Constructicon_Element:
#def styleForNode(node):
#############################
# INPUT #
#############################
def structify(s):
    """Apply the structure-notation substitutions to *s* and return it.

    Four passes are run in order: sense references (``word..N``),
    double-quoted spans, ``left_right`` pairs and ``,,suffix`` markers.

    ``re.UNICODE`` is passed as ``flags=``. It used to be passed
    positionally, where ``re.sub`` takes it as ``count``, which silently
    capped every pass at 32 substitutions and never applied the flag.

    NOTE(review): the replacement literals and the guard literals in the
    last two passes are empty in this copy of the file, so those guards can
    never pass and the markup the passes inserted appears to be missing --
    restore the literals from the original file.
    """
    s = re.sub(r'(\S+)\.\.(\d+)', lambda x: x.group(1) + "" + x.group(2) + "", s, flags=re.U)
    s = re.sub(r'"(.+?)"', lambda x: "" + x.group(1) + "", s, flags=re.U)
    s = re.sub(r'([^\s\]\/]+)_([^\s\]\/]+)', lambda x: x.group(1) + '' + x.group(2) + '' if '' not in x.group(0) else x.group(0), s, flags=re.U)
    s = re.sub(r',,([^\s|]+)', lambda x: '' + x.group(1) + '' if '' not in x.group(0) else x.group(0), s, flags=re.U)
    return s
# t = t.replace(/,,([^\s]+)/g, function(f, a) {
# if( f.indexOf("") == -1 ) {
# return '' + a + ''
# } else {
# return f;
# }
# });
"""
def handle_example(example):
example_parts = []
if example.text != None:
first_text = example.text.strip()
else:
first_text = ""
if first_text != "":
example_parts.append(("/freetext", first_text))
for part in example:
inners = part.findall("e")
if inners == None or len(inners) == 0:
example_parts.append(("/leaf", part.attrib, part.text.strip() ))
else:
example_parts.append(("/branch", part.attrib, handle_example(part) ))
thetail = part.tail
if thetail != None:
thetail = thetail.strip()
if thetail != "":
example_parts.append(("/freetext", thetail))
return example_parts
"""
def handle_markup(example):
    """Convert the children of an annotated element into part tuples.

    ``e`` children without children of their own become
    ``("/leaf", attrib, text)``, ``e`` children with children become
    ``("/branch", attrib, parts)`` (built recursively), and ``text``
    children become ``("/freetext", text)``. Both the plain tag names and
    the KARP-namespaced ones are recognised; other children are ignored.
    Text is stripped, and an element without text yields "".
    """
    example_parts = []
    # Iterate the element directly: Element.getchildren() is deprecated and
    # no longer exists in Python 3.9+.
    for part in example:
        tag = part.tag
        if tag == "e" or tag == "text":
            kind = tag
        elif tag == KARP + "e":
            kind = "e"
        elif tag == KARP + "text":
            kind = "text"
        else:
            continue
        if kind == "text":
            # .text is None for an empty element; treat that as "".
            example_parts.append(("/freetext", (part.text or "").strip()))
        elif len(part) == 0:
            example_parts.append(("/leaf", part.attrib, (part.text or "").strip()))
        else:
            example_parts.append(("/branch", part.attrib, handle_markup(part)))
    return example_parts
def take(entry, tagname, resort):
    """Return the stripped text of the first *tagname* child of *entry*.

    Falls back to *resort* when there is no such child or when the child
    has no text.
    """
    child = entry.find(tagname)
    if child is None:
        return resort
    text = child.text
    return resort if text is None else text.strip()
"""
def list_to_dict(alist):
dict = {}
for item in alist:
dict[item[0]] = item[1]
return dict
"""
def dict_to_list(indict):
    """Return the items of *indict* as (key, value) tuples.

    The keys "uid" and "type" are left out; the remaining pairs follow the
    iteration order of *indict*.
    """
    skipped = ("uid", "type")
    return [(key, indict[key]) for key in indict if key not in skipped]
if __name__ == '__main__':
    # Usage: <script> [simplified|<other>] [english|<other>]
    #
    # `mode` and `language` are assigned at module level because
    # KonstruktikonEntry.__str__ reads them as globals.
    #   mode:     1 = simplified rendering, 0 = full rendering
    #   language: 0 = English labels, 1 = Swedish labels
    mode = 1 if len(sys.argv) > 1 and sys.argv[1] == "simplified" else 0
    # The language argument is optional and defaults to English, like the
    # no-argument case; a single-argument invocation used to raise
    # IndexError on sys.argv[2].
    if len(sys.argv) > 2 and sys.argv[2] != "english":
        language = 1
    else:
        language = 0

    tree = ElementTree()
    tree.parse(URL)
    lexicon = tree.find("Lexicon")
    if lexicon is None:
        sys.exit("%s: no Lexicon element found" % URL)

    # feat attributes that are escaped and stored on the entry unchanged,
    # mapped to the name of the entry attribute that receives them.
    simple_feats = {
        "type": "typee",
        "cat": "cat",
        "evokes": "evokes",
        "reference": "reference",
        "comment": "comment",
        "illustration": "illustration",
    }

    html = HTML()
    for le in lexicon.findall("LexicalEntry"):
        sense = le.find("Sense")
        if sense is None:
            # Nothing to render for an entry without a Sense element.
            continue
        skip = False
        e = KonstruktikonEntry()
        e.id = sense.get("id")
        cees = []
        colls = []
        for feat in sense.findall("feat"):
            feat_att = feat.get("att")
            feat_val = feat.get("val")
            if feat_att in simple_feats:
                setattr(e, simple_feats[feat_att], escape(feat_val))
            elif feat_att == "structure":
                e.structure = structify(feat_val)
            elif feat_att == "cee":
                cees.append(escape(feat_val))
            elif feat_att == "coll":
                colls.append(escape(feat_val))
            elif feat_att == "entry_status":
                e.status = escape(feat_val)
                # Entries that are only suggestions are left out of the output.
                if e.status == "Suggestion":
                    skip = True
            # "internal_comment" and any other attribute are deliberately
            # not rendered.
        e.coll = colls
        e.cee = cees

        # Elements are looked up both with and without their namespace,
        # because they are currently not saved back with namespaces. The
        # namespaced spelling is processed first; a plain <definition>
        # replaces a namespaced one when both are present.
        for ns in (KARP, ""):
            for example in sense.findall(ns + "example"):
                e.add_example(handle_markup(example))
            definition = sense.find(ns + "definition")
            if definition is not None:
                e.set_definition(handle_markup(definition))
        for ns in (KONST, ""):
            for int_const_element in sense.findall(ns + "int_const_elem"):
                e.add_internal_construction_element(dict_to_list(int_const_element.attrib))
            for ext_const_element in sense.findall(ns + "ext_const_elem"):
                e.add_external_construction_element(dict_to_list(ext_const_element.attrib))

        if not skip:
            html.add_entry(e)

    a = unicode(html)
    # Same output as the former print statement: the UTF-8 bytes plus a newline.
    sys.stdout.write(a.encode("utf-8") + "\n")