#-*- coding: utf-8 -*-
'''
Created on Nov 28, 2016

@author: David

Builds per-word feature rows/vectors from the project helpers imported
below (syllable count, padded bigrams, suffixes, compound parts, lexicon
lookups and topics) together with three reference lists read from the
``siwoco/`` directory at import time.
'''
from syllable import count_syllables
from pattern import padded_ngrams,suffixes
from compound import compounds
from lexin import gender,homonymy,polysemy
from topic import get_topics
import codecs
import numpy as np


def _read_lines(path):
    '''Return the lines of the UTF-8 text file at *path* and close the file.

    The lines are returned exactly as ``readlines()`` yields them, i.e.
    including their line terminators.
    '''
    with codecs.open(path, "r", encoding="utf-8") as handle:
        return handle.readlines()


# Reference lists used to build the 0/1 indicator features.
# NOTE(review): entries keep their trailing newline (readlines()), while
# xxx/xxx2 compare them against "".join(...) of the extracted n-grams and
# compound parts. Unless those helpers also yield a trailing newline the
# indicators can never be 1 -- confirm before stripping, since that would
# change the feature vectors consumed downstream.
bigram_utile = _read_lines("siwoco/bigram-utile.txt")
trigram_utile = _read_lines("siwoco/trigram-utile.txt")
compound_utile = _read_lines("siwoco/compound-utile.txt")

# Maps a CEFR label to its ordinal; any other value maps to 0 (see cefrmap).
_CEFR_LEVELS = {"A1": 1, "A2": 2, "B1": 3, "B2": 4, "C1": 5, "C2": 6}


def xxx2(l,p):
    '''Return one 0/1 flag per entry of *p*, as a list of ints.

    Every element of *l* is joined into a single string with ``"".join``;
    the flag for an entry of *p* is 1 when that entry equals one of the
    joined strings and 0 otherwise. The result has the length and order
    of *p*.
    '''
    # Build the lookup once; a set keeps each membership test O(1).
    present = set("".join(v) for v in l)
    return [1 if v in present else 0 for v in p]


def xxx(l,p):
    '''Return the flags of ``xxx2(l, p)`` as a CSV fragment.

    Each flag is followed by a comma, so the result looks like ``"1,0,0,"``
    and is the empty string when *p* is empty.
    '''
    return "".join("%d," % flag for flag in xxx2(l, p))


def run_vector(word,pos):
    '''Return the features of *word* as a NumPy array of shape ``(1, n)``.

    Order of the values: word length, syllable count, suffix length,
    compound count, gender, dop, doh, one flag per ``bigram_utile`` entry,
    the topic values, one flag per ``compound_utile`` entry.
    '''
    bigrams,syllable_count,suffix_len,compound,compound_count,gen,dop,doh,topic_dist = run(word,pos)
    arr = [len(word),syllable_count,suffix_len,compound_count,gen,dop,doh]
    arr.extend(xxx2(bigrams, bigram_utile))
    arr.extend(topic_dist)
    arr.extend(xxx2(compound, compound_utile))
    return np.array(arr).reshape(1,-1)


def _format_row(word, cefr, features):
    '''Return one CSV line (terminated by a newline) for *word*.

    *features* is the tuple returned by ``run``. Column order: len,
    syllable_count, suffix_len, compound_count, gender, dop, doh, one flag
    per ``bigram_utile`` entry, the topic values, one flag per
    ``compound_utile`` entry, cefr. The part of speech is deliberately not
    a column ("pos is not really helpful" in the original notes), and no
    trigram flags are emitted although ``trigram_utile`` is loaded.
    '''
    bigrams,syllable_count,suffix_len,compound,compound_count,gen,dop,doh,topic_dist = features
    scalars = [len(word),syllable_count,suffix_len,compound_count,gen,dop,doh]
    return (",".join(str(x) for x in scalars) + ","
            + xxx(bigrams, bigram_utile)        # already ends with ","
            + ",".join(str(x) for x in topic_dist) + ","
            + xxx(compound, compound_utile)     # already ends with ","
            + cefr + "\n")


def run_file(file, out=None):
    '''Extract features for every line of the tab-separated file *file*.

    Each line must consist of exactly three tab-separated fields: word,
    part of speech and CEFR label; any other shape raises ``ValueError``
    from the unpacking. The running line index is printed every 1000
    lines as a progress indicator.

    *out* is an optional object with a ``write`` method. When given, one
    CSV row per input line is written to it (see ``_format_row``). When it
    is ``None`` (the default) the rows are built and discarded, which is
    what this function did before the parameter existed.
    '''
    with codecs.open(file, 'r', 'utf-8') as f:
        for count, line in enumerate(f):
            if count % 1000 == 0:
                print(count)
            word,pos,cefr = line.rstrip().split("\t")
            row = _format_row(word, cefr, run(word,pos))
            if out is not None:
                out.write(row)


def cefrmap(cefr):
    '''Map a CEFR label ("A1" .. "C2") to 1 .. 6; return 0 for anything else.'''
    return _CEFR_LEVELS.get(cefr, 0)


def run(word, pos):
    '''Compute the raw features of *word* with part of speech *pos*.

    Returns the tuple ``(bigrams, syllable_count, suffix_len, compound,
    compound_count, gen, dop, doh, topics)`` where

    * ``bigrams`` is ``padded_ngrams(word, 2)``,
    * ``suffix_len`` and ``compound_count`` are the lengths of
      ``suffixes(word)`` and ``compounds(word, pos)``,
    * ``gen`` is ``gender(word, pos)`` when *pos* is ``"NN"`` and the
      placeholder ``-2`` otherwise,
    * ``dop`` / ``doh`` are ``polysemy(word, pos)`` / ``homonymy(word)``,
    * ``topics`` is ``get_topics(word)``.
    '''
    syllable_count = count_syllables(word)
    bigrams = padded_ngrams(word, 2)
    suffix_len = len(suffixes(word))
    compound = compounds(word, pos)
    compound_count = len(compound)
    topics = get_topics(word)
    # Gender is only looked up for "NN"; every other tag gets -2.
    gen = gender(word,pos) if pos == "NN" else -2
    dop = polysemy(word, pos)
    doh = homonymy(word)
    return (bigrams,syllable_count,suffix_len,compound,compound_count,gen,dop,doh,topics)


if __name__ == '__main__':
    run_file("svalex-adj-d.csv")