import codecs
from collections import defaultdict


class Corpus:
    """Lazily loaded word-frequency table backed by a tab-separated file.

    Each non-empty line of the file must have the form ``word<TAB>count``.
    The file is not opened until the first call to
    ``add_frequency_to_table``.

    Attributes set by ``__init__``:
        freqs: ``defaultdict(int)`` mapping a word to its count.
        name: The file name, or the string ``'None'`` when no file was given.
        read: ``True`` once the file has been loaded (or when there is no
            file to load).
    """

    def __init__(self, filename=None):
        """Create a corpus.

        Args:
            filename: Path of the UTF-8 encoded frequency file, or ``None``
                for a corpus without a backing file.
        """
        self.freqs = defaultdict(int)
        if filename is None:
            # No backing file: flag the corpus as already read so that
            # nothing is ever opened.
            self.name = 'None'
            self.read = True
        else:
            self.name = filename
            self.read = False

    def __read_data(self):
        """Load ``self.name`` into ``self.freqs``.

        Raises:
            ValueError: If a non-empty line does not consist of exactly two
                tab-separated fields, or the count is not an integer.
        """
        with codecs.open(self.name, encoding='utf-8') as f:
            for line in f:
                # Strip only the line terminator. Blindly dropping the last
                # character would truncate the count on a final line that
                # has no trailing newline.
                line = line.rstrip('\r\n')
                if not line:
                    # Tolerate blank lines (e.g. a trailing empty line).
                    continue
                word, count = line.split('\t')
                self.freqs[word] = int(count)

    def add_frequency_to_table(self, words):
        """Pair each word with its corpus frequency.

        The backing file is loaded on the first call.

        Args:
            words: Iterable of strings. An entry may hold several variants
                of a word separated by ``','``; each variant is looked up
                separately. Entries that end with ``'*'`` are skipped.

        Returns:
            A list of ``(word, frequency)`` tuples in input order.
        """
        if not self.read:
            self.__read_data()
            self.read = True
        # NOTE(review): the previous comment said entries that *start* with
        # '*' are skipped, but the code has always tested the end of the
        # entry. The existing behaviour is kept; confirm which is intended.
        variants = [
            variant
            for entry in words
            if not entry.endswith('*')
            for variant in entry.split(',')
        ]
        # ``freqs`` is a defaultdict(int): looking up an unknown word yields
        # 0 and inserts that word into the table.
        return [(variant, self.freqs[variant]) for variant in variants]