#!/usr/bin/env python
# -*- coding: utf8 -*-
"""Print a tab-separated table of size statistics for LMF lexicon XML files.

For every resource one row is written to standard output with the number of
LexicalEntry elements, lemgrams, senses, base forms, inflected forms and
distinct part-of-speech tags, followed by a final row with the grand totals.

Usage:
    script.py                      # all built-in resources
    script.py saldo,dalin          # a comma-separated selection
    script.py mylex=path/to/my.xml # an explicit name=path pair
"""

import sys
import re
import codecs

try:
    import xml.etree.cElementTree as ElementTree
except ImportError:
    # The cElementTree alias no longer exists in recent Python 3 releases.
    import xml.etree.ElementTree as ElementTree


HEADER = "Resource\tEntries\tLemgrams\tSenses\tBase forms\tInflected forms\tUnique pos tags"

# TODO: Read the resources from the metadata
DEFAULT_RESOURCES = [
    ("saldo", "lmf/saldo/saldo.xml"),
    ("saldom", "lmf/saldom/saldom.xml"),
    ("saldoe", "lmf/saldoe/saldoe.xml"),
    ("lwt", "lmf/lwt/lwt.xml"),
    ("parole+", "lmf/parolelexplus/parolelexplus.xml"),
    ("simple+", "lmf/simpleplus/simpleplus.xml"),
    ("swesaurus", "lmf/swesaurus/swesaurus.xml"),
    ("swefn", "lmf/swefn/swefn.xml"),
    ("konstruktikon", "lmf/konstruktikon/konstruktikon.xml"),
    ("kelly", "lmf/kelly/kelly.xml"),
    ("wordnet-saldo", "lmf/wordnet-saldo/wordnet-saldo.xml"),
    ("diapivot", "lmf/diapivot/diapivot.xml"),
    ("dalin", "lmf/dalin/dalin.xml"),
    ("dalinm", "lmf/dalinm/dalinm.xml"),
    ("dalin-base", "lmf/dalin-base/dalin-base.xml"),
    ("swedberg", "lmf/swedberg/swedberg.xml"),
    ("swedbergm", "lmf/swedbergm/swedbergm.xml"),
    ("fsvm", "lmf/fsvm/fsvm.xml"),
    ("schlyter", "lmf/schlyter/schlyter.xml"),
    ("söderwall", "lmf/soederwall/soederwall.xml"),
    ("söderwall supplement", "lmf/soederwall-supp/soederwall-supp.xml"),
]


def _resolve_resources(argv):
    """Return the list of (name, path) pairs selected by the command line.

    Without an argument every entry of DEFAULT_RESOURCES is used.  Otherwise
    argv[1] is a comma-separated list whose items are resolved as follows:

    * "name=path" uses the given path verbatim;
    * a name found in DEFAULT_RESOURCES uses the path registered there;
    * any other name falls back to "lmf/<name>/<name>.xml", the layout most
      of the built-in resources follow.

    Empty items (for example from a trailing comma) are ignored.
    """
    if len(argv) <= 1:
        return list(DEFAULT_RESOURCES)

    known_paths = dict(DEFAULT_RESOURCES)
    selected = []
    for item in argv[1].split(","):
        if not item:
            continue
        name, has_path, path = item.partition("=")
        if not has_path:
            path = known_paths.get(name, "lmf/%s/%s.xml" % (name, name))
        selected.append((name, path))
    return selected


def _count_resource(resource_path):
    """Count the statistics of one LMF file.

    Returns a tuple (entries, lemgrams, senses, base_forms, inflected_forms,
    postags) where the first five items are integers and postags is the set
    of partOfSpeech values seen in the lemmas of the file.
    """
    num_entries = 0
    num_lemgrams = 0
    num_senses = 0
    num_base_forms = 0
    num_inflected_forms = 0
    postags = set()

    for _event, elem in ElementTree.iterparse(resource_path):
        if elem.tag != "LexicalEntry":
            continue
        num_entries += 1

        # Count base forms, lemgrams and part-of-speech tags of the lemma.
        lemma = elem.find("Lemma")
        if lemma is not None:
            for form_repr in lemma.findall("FormRepresentation"):
                for feat in form_repr:
                    if feat.tag != "feat":
                        continue
                    att = feat.get("att")
                    if att == "writtenForm":
                        num_base_forms += 1
                    elif att == "partOfSpeech":
                        postags.add(feat.get("val"))
                    elif att == "lemgram":
                        num_lemgrams += 1

        # Count inflected forms.  An msd value containing ":" is counted only
        # once per entry for each distinct text before the first ":"; every
        # other msd value is counted each time it occurs.
        # NOTE(review): the pairing of the "else" branch with the ":" test is
        # how this logic was read from the original code -- confirm.
        seen_msd_prefixes = set()
        for word_form in elem.findall("WordForm"):
            for feat in word_form:
                if feat.tag != "feat" or feat.get("att") != "msd":
                    continue
                msd = feat.get("val")
                if ":" in msd:
                    prefix = msd.split(":")[0]
                    if prefix not in seen_msd_prefixes:
                        seen_msd_prefixes.add(prefix)
                        num_inflected_forms += 1
                else:
                    num_inflected_forms += 1

        # Count senses.  Kept as a loop rather than len() so that checks
        # INSIDE the sense tag can be added here later.
        for _sense in elem.findall("Sense"):
            num_senses += 1

        # The entry is fully counted; drop its children so memory use does
        # not grow with the content of every entry in the file.
        elem.clear()

    return (num_entries, num_lemgrams, num_senses, num_base_forms,
            num_inflected_forms, postags)


def _format_row(label, numbers):
    """Return one tab-separated output row: label followed by the numbers."""
    return "\t".join([label] + [str(number) for number in numbers])


def main(argv=None):
    """Write the statistics table for the selected resources to stdout.

    argv defaults to sys.argv; see _resolve_resources for how argv[1] is
    interpreted.  Errors from opening or parsing a resource file are not
    caught and abort the run.
    """
    if argv is None:
        argv = sys.argv
    out = sys.stdout

    totals = [0, 0, 0, 0, 0]
    total_postags = set()

    out.write(HEADER + "\n")
    for resource_name, resource_path in _resolve_resources(argv):
        result = _count_resource(resource_path)
        counts, postags = result[:5], result[5]
        totals = [total + count for total, count in zip(totals, counts)]
        total_postags.update(postags)
        out.write(_format_row(resource_name, list(counts) + [len(postags)]) + "\n")
    out.write(_format_row("Σ", totals + [len(total_postags)]) + "\n")


if __name__ == '__main__':
    main()