'''
Created on Dec 1, 2016

@author: David

Compute TF-IDF weights for a small, fixed set of n-gram frequency
distributions.

Each non-blank line of the input file is one distribution ("document"): the
first and last characters of the line are discarded (presumably an enclosing
pair of braces -- confirm against the producer of the file) and the rest is
read as comma-separated ``ngram:freq`` pairs.  For every n-gram with a
positive weight, a line ``tfidf <weight> <document index>: <ngram>`` is
printed and written to the output file; one blank separator line is emitted
after each document.
'''
import codecs
import math
import sys

file = "f:/parll data/data/pos_dist.txt"
out = "f:/parll data/data/pos_tfidf.txt"

udict_a1 = {}
udict_a2 = {}
udict_b1 = {}
udict_b2 = {}
udict_c1 = {}

# One {ngram: frequency} mapping per document, in input-line order.
udict = [udict_a1, udict_a2, udict_b1, udict_b2, udict_c1]


def load_counts(lines):
    """Fill the dictionaries in ``udict`` from an iterable of text lines.

    Blank lines are skipped and do not consume a document slot.  Empty
    entries (an empty distribution or a trailing comma) are ignored.

    Raises:
        IndexError: if there are more non-blank lines than entries in
            ``udict``.
        ValueError: if an entry has no ``:`` or a non-integer frequency.
    """
    doc_index = 0
    for line in lines:
        text = line.rstrip()
        if not text:
            continue
        if doc_index >= len(udict):
            raise IndexError(
                "input holds more than {} distributions".format(len(udict)))
        counts = udict[doc_index]
        doc_index += 1
        # Drop the single wrapping character at each end of the line.
        for entry in text[1:-1].split(","):
            if not entry.strip():
                continue
            # Split on the last colon so an n-gram may itself contain ':'.
            ngram, freq = entry.rsplit(":", 1)
            counts[ngram.strip()] = int(freq.strip())


def mcount(key):
    """Return the number of dictionaries in ``udict`` that contain ``key``."""
    return sum(1 for counts in udict if key in counts)


def tfidf_rows(index):
    """Return the formatted output rows for document ``udict[index]``.

    The weight is ``tf * log(N / df)`` where ``tf`` is the stored frequency,
    ``N`` is the number of documents and ``df`` is ``mcount(ngram)``.  Only
    strictly positive weights are kept, so n-grams present in every document
    are dropped.
    """
    # float() forces true division on Python 2 as well as Python 3.
    total_docs = float(len(udict))
    rows = []
    for key, tf in udict[index].items():
        tfidf = tf * math.log(total_docs / mcount(key))
        if tfidf > 0:
            rows.append("tfidf {} {}: {}".format(tfidf, index, key))
    return rows


def main():
    """Read the distributions from ``file`` and write the weights to ``out``."""
    with codecs.open(file, "r", "utf-8") as f:
        load_counts(f)
    # The output is opened only after the input parsed successfully, and the
    # context manager guarantees it is flushed and closed.
    with codecs.open(out, "w", "utf-8") as o:
        for index in range(len(udict)):
            for row in tfidf_rows(index):
                print(row)
                o.write(row + "\n")
            o.write("\n")
            print("\n")


if __name__ == "__main__":
    main()