'''
Created on Nov 30, 2016

@author: David

Count, per CEFR level, how often each feature value returned by
``pipeline.run`` occurs for the entries of a tab-separated input file
(one ``lemma<TAB>pos<TAB>cefr`` record per line), then print the counts
and write them to a summary file.
'''
import codecs
import sys
from collections import Counter

import compound
import lexin
import pattern
import syllable
import pipeline

cefr_levels = ["A1", "A2", "B1", "B2", "C1"]

file = "f:/parll data/data/svalex-ttm.csv"
outfile = "f:/parll data/data/scowl-summary.csv"

# Placeholder used for the result positions (2, 4 and 6) that have no summary
# of their own.  All three positions share this one dict, so their counts are
# merged and the same dict is reported for each of those positions.
dummy = {}
syllable_summary = {}
unigram_summary = {}
bigram_summary = {}
trigram_summary = {}
suffix_summary = {}
suffix_len_summary = {}
compound_summary = {}
compound_count_summary = {}
gender_summary = {}
polysemy_summary = {}
homonymy_summary = {}
pos_summary = {}

# Position i in this list receives the counts for result[i] of pipeline.run,
# so the order must not be changed independently of the pipeline.
summaries = [
    syllable_summary,
    unigram_summary,
    dummy,
    bigram_summary,
    dummy,
    trigram_summary,
    dummy,
    suffix_summary,
    suffix_len_summary,
    compound_summary,
    compound_count_summary,
    gender_summary,
    polysemy_summary,
    homonymy_summary,
]


def reduce_to_value(something):
    '''Collapse one pipeline result into a single value used as a count key.

    * A list is reduced to its first element; if that element is itself a
      list, to the first element of the inner list.
    * An empty list (outer or inner) is reduced to 0.
    * A ``Counter`` is reduced to 0.
    * Anything else is returned unchanged.
    '''
    if isinstance(something, list):
        if not something:
            return 0
        something = something[0]
        if isinstance(something, list):
            # An empty inner list used to raise IndexError; treat it like an
            # empty outer list instead.
            return something[0] if something else 0
        return something
    if isinstance(something, Counter):
        return 0
    return something


# Every summary maps CEFR level -> {value: count}.
for cefr_level in cefr_levels:
    pos_summary[cefr_level] = {}
    for summary in summaries:
        summary[cefr_level] = {}

# The context manager guarantees the input handle is closed, even on error.
with codecs.open(file, "r", "utf-8") as f:
    for line in f:
        if not line.strip():
            # A blank (e.g. trailing) line cannot be unpacked into 3 fields.
            continue
        lemma, pos, cefr = line.rstrip().split("\t")
        result = pipeline.run(lemma, pos, cefr)

        for i, summary in enumerate(summaries):
            value = reduce_to_value(result[i])
            level_counts = summary[cefr]
            level_counts[value] = level_counts.get(value, 0) + 1

        # Part-of-speech tally: one count per input line.
        # NOTE(review): pos_summary is collected but never written out below;
        # confirm whether it should be reported as well.
        pos_counts = pos_summary[cefr]
        pos_counts[pos] = pos_counts.get(pos, 0) + 1

# Closing the output handle explicitly ensures everything is flushed to disk.
with codecs.open(outfile, "w", "utf-8") as o:
    for i, summary in enumerate(summaries):
        for cefr in cefr_levels:
            record = "{} {} {}".format(i, cefr, summary[cefr])
            print(record)
            # One record per line, matching what is printed to the console.
            o.write(record + "\n")
        o.write("--------------------\n")