#!/usr/bin/env python
# -*- coding: utf8 -*-
"""Generate the ``generated_swefn_newlus`` JavaScript object.

Reads a 14-column tab-separated table from stdin (column 0 is the frame
name, column 9 the lexical-unit block) and the 6-column tab-separated file
``saldoexempel.txt``, then prints a single JavaScript assignment to stdout.
"""

import codecs
import re
import sys

import annotate.compound as compound


def _split_fields(line, num_of_fields, tolerates):
    """Split one tab-separated line into exactly ``num_of_fields`` fields.

    Returns the list of fields, right-padded with empty strings when the line
    has at least ``tolerates`` but fewer than ``num_of_fields`` fields, or
    ``None`` when the line has any other number of fields.
    """
    # Remove only the line terminator: a final line without a trailing
    # newline must keep its last character.
    if line.endswith("\n"):
        line = line[:-1]
    fields = line.split("\t")
    missing = num_of_fields - len(fields)
    if missing == 0:
        return fields
    if missing > 0 and len(fields) >= tolerates:
        return fields + [""] * missing
    return None


def read_csv(num_of_fields, tolerates=-1):
    """Yield the rows of the tab-separated table read from stdin.

    num_of_fields -- number of fields every yielded row has.
    tolerates     -- minimum number of fields a short row may have and still
                     be yielded (padded with ""); -1 means no short rows.

    Rows with any other field count are skipped silently.
    """
    if tolerates == -1:
        tolerates = num_of_fields
    for line in sys.stdin:
        fields = _split_fields(line, num_of_fields, tolerates)
        if fields is not None:
            yield fields


def read_csv_file(filepath, num_of_fields, tolerates=-1):
    """Yield the rows of the UTF-8 tab-separated file at ``filepath``.

    Takes the same ``num_of_fields`` / ``tolerates`` arguments and applies the
    same skipping and padding rules as ``read_csv``.
    """
    if tolerates == -1:
        tolerates = num_of_fields
    with codecs.open(filepath, encoding='utf-8') as infile:
        for line in infile:
            # On Python 2 the decoded line is ``unicode``; re-encode it so the
            # fields are plain ``str`` like the ones read from stdin. On
            # Python 3 the line already is ``str`` and is left alone.
            if not isinstance(line, str):
                line = line.encode("utf-8")
            fields = _split_fields(line, num_of_fields, tolerates)
            if fields is not None:
                yield fields


def jsitem(key, value):
    """Return ``"key" : value`` as a fragment of JavaScript source.

    A list becomes an array of double-quoted strings. A string is wrapped in
    double quotes; if it starts with a double quote, its first and last
    characters are dropped first. No other escaping is performed.
    """
    if isinstance(value, list):
        realvalue = '[' + ", ".join('"' + item + '"' for item in value) + "]"
    else:
        # startswith() instead of value[0]: an empty field must not crash.
        if value.startswith('"'):
            value = value[1:-1]
        realvalue = '"' + value + '"'
    return '"' + key + '" : ' + realvalue


def _collect_frames(rows):
    """Map every lexical unit found in ``rows`` to the frames listing it.

    Column 9 of a row holds ";;"-separated entries of the form
    "<pos> <lu>, <lu>, ..."; entries without a space are ignored.
    Returns a dict ``{lu: {"frame": [frame_name, ...]}}``.
    """
    lus = {}
    for row in rows:
        name = row[0]
        proplus = row[9]
        for lu_line in proplus.split(";;"):
            lu_line = lu_line.strip()
            if " " not in lu_line:
                continue
            _pos, lus_raw = lu_line.split(" ", 1)
            for nlu in lus_raw.split(","):
                nlu = nlu.strip()
                if not nlu:
                    continue
                frames = lus.setdefault(nlu, {"frame": []})["frame"]
                # Test membership in the frame list itself (not in the
                # entry's keys) so a frame is recorded once per unit.
                if name not in frames:
                    frames.append(name)
    return lus


def _add_saldo_info(lus, rows):
    """Add pos/pr/sc/info/example to ``lus`` for rows whose info has a "+".

    NOTE(review): the original indentation was lost. The assignments are
    assumed to belong inside the ``"+" in info`` branch, since otherwise an
    unknown sense would raise KeyError -- confirm against the intended output.
    """
    for sense, prdesc, scdesc, pos, info, example in rows:
        if "+" not in info:
            continue
        # It is a new LU: create the entry unless stdin already supplied it.
        entry = lus.setdefault(sense.strip(), {})
        entry["pos"] = pos
        entry["pr"] = prdesc
        entry["sc"] = scdesc
        # NOTE(review): the replacement text was garbled in the source and is
        # assumed to have been the HTML entity for a double quote -- confirm.
        entry["info"] = info.replace('"', "&quot;")
        entry["example"] = example.strip().replace('"', "&quot;")


def _render(lus):
    """Return the JavaScript assignment defining generated_swefn_newlus."""
    lines = []
    for lu in lus:
        entry = lus[lu]
        items = ", ".join(jsitem(key, entry[key]) for key in entry)
        lines.append('"' + lu + '" : {' + items + '}')
    return 'generated_swefn_newlus = {' + ",\n".join(lines) + '};'


def main():
    """Read SweFN from stdin and saldoexempel.txt, print the JS object."""
    # First open SweFN
    lus = _collect_frames(read_csv(num_of_fields=14))
    # Then open saldoexempel.txt and get the remaining information
    _add_saldo_info(lus, read_csv_file("saldoexempel.txt", num_of_fields=6))
    print(_render(lus))


if __name__ == '__main__':
    main()