"""Build a per-CEFR-level frequency table of padded character trigrams.

Created on Dec 6, 2016

@author: David

Reads tab-separated ``lemma<TAB>pos<TAB>cefr`` rows from ``file`` and writes
one line per trigram to ``out``: the trigram followed by one tab-separated
count per CEFR level (A1, A2, B1, B2, C1).
"""
from pattern import padded_ngrams
import codecs
import sys

# NOTE: ``file`` shadows a Python 2 builtin; the name is kept so that anything
# relying on these module-level names keeps working.
file = "f:/parll data/data/svalex-ttm.csv"
out = "f:/parll data/data/trigram-padded.dist"

# Number of count columns written per trigram (levels A1..C1).
# NOTE(review): C2 rows do not fit this layout and are skipped below; widen
# this constant if C2 data should be counted as a sixth column.
NUM_LEVELS = 5

_CEFR_RANK = {"A1": 1, "A2": 2, "B1": 3, "B2": 4, "C1": 5, "C2": 6}

# Maps trigram -> list of NUM_LEVELS counts, indexed by (CEFR rank - 1).
dist = {}


def cefrmap(cefr):
    """Return the 1-based rank of a CEFR label (A1=1 ... C2=6), or 0 if unknown."""
    return _CEFR_RANK.get(cefr, 0)


with codecs.open(file, "r", "utf-8") as f:
    for lineno, line in enumerate(f, 1):
        row = line.rstrip()
        if not row:
            # Blank lines carry no data; skip them instead of failing the
            # three-way unpack below.
            continue
        lemma, pos, cefr = row.split("\t")

        level = cefrmap(cefr)
        if not 1 <= level <= NUM_LEVELS:
            # Rank 0 (unknown label, e.g. a header row) would otherwise index
            # slot -1 and be miscounted as C1; rank 6 (C2) would overflow the
            # five-slot count list.
            sys.stderr.write(
                "{}:{}: skipping row with unsupported CEFR level {!r}\n".format(
                    file, lineno, cefr
                )
            )
            continue

        # padded_ngrams is a project helper; presumably it yields 3-tuples of
        # string pieces for n=3 -- confirm against its definition.
        for a, b, c in padded_ngrams(lemma, 3):
            gram = a + b + c
            if gram not in dist:
                dist[gram] = [0] * NUM_LEVELS
            dist[gram][level - 1] += 1

print(dist)

# The output is opened only after counting succeeded, and closed (flushed)
# deterministically by the context manager.
with codecs.open(out, "w", "utf-8") as o:
    for key, val in dist.items():
        o.write("{}\t{}\n".format(key, "\t".join(str(v) for v in val)))