'''
Created on Dec 6, 2016

@author: David

Convert a tab-separated table of raw counts into tf-idf weights.

Each input line has the form ``key<TAB>count<TAB>count...`` with up to
NUM_DOCS integer count columns.  Every count ``tf`` is replaced by
``(1 + ln(tf)) * ln(NUM_DOCS / df)``, where ``df`` is the number of non-zero
counts on that line (a zero count stays 0.0).  Rows whose weights sum to
zero -- no non-zero count at all, or a non-zero count in every column -- are
reported on stdout and left out of the output file.
'''
import codecs
import math
import sys

INPUT_PATH = "f:/parll data/data/trigram-padded.dist"
OUTPUT_PATH = "f:/parll data/data/trigram-padded-tfidf.dist"

# Number of count columns per row; this is the N in idf = ln(N / df).
NUM_DOCS = 5


def tfidf_vector(counts, num_docs=NUM_DOCS):
    '''Return the tf-idf weights for one row of raw counts.

    Args:
        counts: sequence of count fields (strings or ints accepted by
            ``int()``), at most ``num_docs`` long.  Shorter rows are padded
            with zeros.
        num_docs: total number of columns, used as N in the idf term.

    Returns:
        A list of ``num_docs`` floats.  All entries are 0.0 when the row has
        no non-zero count or when every column is non-zero (idf == 0).

    Raises:
        ValueError: if a field is not an integer, if a count is negative, or
            if the row has more than ``num_docs`` count columns.
    '''
    if len(counts) > num_docs:
        raise ValueError(
            "expected at most {} count columns, got {}".format(
                num_docs, len(counts)))

    term_freqs = [int(field) for field in counts]
    term_freqs.extend([0] * (num_docs - len(term_freqs)))

    # Count zeros on the parsed values so that fields such as "00" are
    # recognised as zero as well.
    doc_freq = sum(1 for tf in term_freqs if tf != 0)
    if doc_freq == 0:
        # Nothing occurs anywhere: there is no idf to compute (it would be a
        # division by zero), and every weight is zero anyway.
        return [0.0] * num_docs

    # float() keeps this a true division even under Python 2.
    idf = math.log(float(num_docs) / doc_freq)
    return [(1 + math.log(tf)) * idf if tf != 0 else 0.0
            for tf in term_freqs]


def main(input_path=INPUT_PATH, output_path=OUTPUT_PATH):
    '''Write the tf-idf version of ``input_path`` to ``output_path``.

    Rows whose weights sum to zero are skipped; each skipped key and the
    final number of skipped rows are printed to stdout.

    Returns:
        The number of excluded rows.
    '''
    excluded = 0
    # The input is opened first so a missing input file does not leave an
    # empty output file behind; both handles are closed on any exit path.
    with codecs.open(input_path, "r", "utf-8") as src, \
            codecs.open(output_path, "w", "utf-8") as dst:
        for line in src:
            fields = line.rstrip().split("\t")
            key, counts = fields[0], fields[1:]
            weights = tfidf_vector(counts)
            if sum(weights) > 0:
                dst.write("{}\t{}\n".format(
                    key, "\t".join(str(w) for w in weights)))
            else:
                excluded += 1
                print("Excluding {}".format(key))
    print("excluded {} items".format(excluded))
    return excluded


if __name__ == "__main__":
    main()