#!/usr/bin/env python
"""Extract (lemgram, written forms) pairs from an LMF-style lexicon on stdin.

The XML document is read from standard input.  For every child element of
the first ``Lexicon`` element, the ``feat`` elements found under
``Lemma/FormRepresentation`` are scanned: ``writtenForm`` values are
collected as base forms and the ``lemgram`` value is recorded.  One
``(lemgram, baseforms)`` tuple is printed per child element.

NOTE: This is unfinished.  It is supposed to be used to build raw material
from the Swedberg resource.
"""

import sys
import codecs
import re
from xml.etree.ElementTree import ElementTree
from xml.etree.ElementTree import fromstring


def extract_entries(xml_text):
    """Return a list of ``(lemgram, baseforms)`` tuples parsed from *xml_text*.

    Args:
        xml_text: The XML document as a string.

    Returns:
        A list with one tuple per child element of the first ``Lexicon``
        element.  ``lemgram`` is the value of the last ``lemgram`` feature
        seen for that entry (``None`` if there is none) and ``baseforms``
        is the list of all ``writtenForm`` values in document order.
        Children without a ``Lemma`` yield ``(None, [])``.  An empty list
        is returned when the document has no ``Lexicon`` element.

    Raises:
        xml.etree.ElementTree.ParseError: If *xml_text* is not well-formed.
        KeyError: If a ``feat`` element lacks an ``att`` attribute, or a
            matching ``feat`` lacks a ``val`` attribute.
    """
    entries = []
    lexicon = fromstring(xml_text).find("Lexicon")
    if lexicon is None:
        # find() returns None when nothing matches; there is nothing to
        # extract, so avoid iterating over None (which raises TypeError).
        return entries

    for entry in lexicon:
        baseforms = []
        lemgram = None
        lemma = entry.find("Lemma")
        if lemma is not None:
            # findall() always returns a list (possibly empty), so no
            # None checks are needed on its results.
            for form_rep in lemma.findall("FormRepresentation"):
                for feat in form_rep.findall("feat"):
                    att = feat.attrib["att"]
                    if att == "writtenForm":
                        baseforms.append(feat.attrib["val"])
                    elif att == "lemgram":
                        lemgram = feat.attrib["val"]
        entries.append((lemgram, baseforms))
    return entries


def main():
    """Read the lexicon XML from stdin and print one tuple per entry."""
    for ent in extract_entries(sys.stdin.read()):
        # Parenthesised single-argument form behaves identically on
        # Python 2 and Python 3.
        print(ent)


if __name__ == "__main__":
    main()