#-*- coding: utf-8 -*-
'''
Created on Nov 28, 2016

@author: David
'''
from syllable import count_syllables
from pattern import padded_ngrams,suffixes
from compound import compounds
from lexin import gender,homonymy,polysemy
from topic import get_topics

import codecs
import numpy as np

def _read_lines(path):
    """Return every line of the UTF-8 text file at ``path`` as a list.

    The file is closed as soon as it has been read. Lines are returned
    exactly as ``readlines()`` yields them, i.e. line terminators are kept.
    """
    with codecs.open(path, "r", encoding="utf-8") as handle:
        return handle.readlines()


# Lookup lists used to build the binary indicator features.
# NOTE(review): entries keep their trailing line terminator, so an exact
# comparison against a joined n-gram can only succeed for a final line that
# lacks a newline -- confirm whether the entries should be stripped.
bigram_utile = _read_lines("siwoco/bigram-utile.txt")
trigram_utile = _read_lines("siwoco/trigram-utile.txt")
compound_utile = _read_lines("siwoco/compound-utile.txt")

#outfile = "siwoco-svalex-d-topic-bigram-compact.csv"
 
#o = codecs.open(outfile, 'w', 'utf-8')
#o.write("len,pos,syllable_count,"+",".join("uni_"+s for s in alphabet)+","+",".join("bi_"+s for s in bi_alph)+",suffix_len,compound_count,cefr\n")
#o.write("len,syllable_count,suffix_len,compound_count,gender,dop,doh,")
#for i in range(len(bigram_utile)):
#    o.write("bi"+str(i)+",")
#for i in range(len(trigram_utile)):
#    o.write("tr"+str(i)+",")
#for i in range(33):
#    o.write("to"+str(i)+",")
#for i in range(len(compound_utile)):
#    o.write("co"+str(i)+",")
# for i in range(len(suffix_pickle)):
#     o.write("s"+str(i)+",")
#o.write("cefr\n")

def xxx(l,p):
    """Encode which entries of ``p`` occur in ``l`` as a 0/1 CSV fragment.

    Args:
        l: iterable of n-grams; each n-gram is an iterable of strings that
            is concatenated with ``"".join`` before comparison.
        p: iterable of candidate strings, one output column per entry.

    Returns:
        A string with ``"1,"`` for every entry of ``p`` that equals one of
        the joined n-grams and ``"0,"`` otherwise, in the order of ``p``.
        Every flag, including the last, is followed by a comma; the result
        is ``""`` when ``p`` is empty.

    The comparison is exact string equality, so an entry of ``p`` carrying
    extra characters (for example a trailing newline) never matches.
    """
    # A set gives constant-time membership instead of scanning a list for
    # every entry of p.
    present = {"".join(v) for v in l}
    return "".join("1," if v in present else "0," for v in p)

def xxx2(l,p):
    """Encode which entries of ``p`` occur in ``l`` as a list of 0/1 flags.

    Args:
        l: iterable of n-grams; each n-gram is an iterable of strings that
            is concatenated with ``"".join`` before comparison.
        p: iterable of candidate strings, one output flag per entry.

    Returns:
        A list of ints, ``1`` where the entry of ``p`` equals one of the
        joined n-grams and ``0`` otherwise, in the order of ``p``.

    The comparison is exact string equality, so an entry of ``p`` carrying
    extra characters (for example a trailing newline) never matches.
    """
    # A set gives constant-time membership instead of scanning a list for
    # every entry of p.
    present = {"".join(v) for v in l}
    return [1 if v in present else 0 for v in p]

def run_vector(word,pos):
    """Build the feature row for one word as a NumPy array of shape (1, n).

    The row is laid out as: word length, syllable count, suffix length,
    compound count, gender value, the two values returned by ``polysemy``
    and ``homonymy``, one 0/1 flag per entry of ``bigram_utile``, the topic
    values, and one 0/1 flag per entry of ``compound_utile``.
    """
    (bigrams, syllable_count, suffix_len, compound, compound_count,
     gen, dop, doh, topic_dist) = run(word, pos)

    scalar_part = [len(word), syllable_count, suffix_len,
                   compound_count, gen, dop, doh]
    # Operands are evaluated left to right, matching the original
    # sequence of extend() calls.
    features = (scalar_part
                + xxx2(bigrams, bigram_utile)
                + list(topic_dist)
                + xxx2(compound, compound_utile))
    # reshape(1, -1) turns the flat vector into a single-row 2-D array.
    return np.array(features).reshape(1, -1)
        
def run_file(file):
    """Extract features for every entry of a tab-separated word list.

    Each line of ``file`` must hold exactly three tab-separated fields:
    word, part-of-speech tag and CEFR label. For every line the features
    are computed with ``run`` and assembled into one CSV row. The row is
    currently discarded because the code that wrote it to an output file
    is disabled.

    Args:
        file: path of the UTF-8 encoded input file.

    Raises:
        ValueError: if a line does not split into exactly three fields.
    """
    # The context manager closes the input even when a line fails to parse.
    with codecs.open(file, 'r', 'utf-8') as f:
        for count, line in enumerate(f):
            # Progress indicator: one line of output per 1000 entries.
            if count % 1000 == 0:
                print(count)
            word, pos, cefr = line.rstrip().split("\t")
            (bigrams, syllable_count, suffix_len, compound, compound_count,
             gen, dop, doh, topic_dist) = run(word, pos)

            # The POS tag is deliberately left out of the row (the original
            # author noted it was not really helpful).
            scalars = [len(word), syllable_count, suffix_len,
                       compound_count, gen, dop, doh]
            # xxx() already ends each flag with a comma, so no separator is
            # added after the two indicator fragments.
            st = (",".join(str(s) for s in scalars) + ","
                  + xxx(bigrams, bigram_utile)
                  + ",".join([str(x) for x in topic_dist]) + ","
                  + xxx(compound, compound_utile)
                  + cefr + "\n")
            # Output is disabled; re-enable together with the writer `o`:
            #o.write(st)

def cefrmap(cefr):
    """Map a CEFR label to its rank.

    ``"A1"`` .. ``"C2"`` map to 1 .. 6; any other value maps to 0.
    """
    ordered_levels = ("A1", "A2", "B1", "B2", "C1", "C2")
    for rank, label in enumerate(ordered_levels, 1):
        if cefr == label:
            return rank
    return 0


    
def run(word, pos):
    """Compute the raw features of one word.

    Returns:
        A 9-tuple ``(bigrams, syllable_count, suffix_len, compound,
        compound_count, gen, dop, doh, topics)`` where

        * ``bigrams`` is ``padded_ngrams(word, 2)``,
        * ``syllable_count`` is ``count_syllables(word)``,
        * ``suffix_len`` is the length of ``suffixes(word)``,
        * ``compound`` is ``compounds(word, pos)`` and ``compound_count``
          its length,
        * ``gen`` is ``gender(word, pos)`` when ``pos`` is ``"NN"`` and
          ``-2`` for every other tag,
        * ``dop`` and ``doh`` are the results of ``polysemy(word, pos)``
          and ``homonymy(word)`` (presumably degrees of polysemy and
          homonymy -- confirm against the lexin module),
        * ``topics`` is ``get_topics(word)``.
    """
    syllable_count = count_syllables(word)
    bigrams = padded_ngrams(word, 2)
    suffix_len = len(suffixes(word))

    compound = compounds(word, pos)
    compound_count = len(compound)

    topics = get_topics(word)

    # Gender is only looked up for the "NN" tag; -2 marks "not looked up".
    gen = gender(word, pos) if pos == "NN" else -2

    dop = polysemy(word, pos)
    doh = homonymy(word)

    return (bigrams, syllable_count, suffix_len, compound, compound_count,
            gen, dop, doh, topics)

if __name__ == "__main__":
    # Script entry point: process the default input file.
    default_input = "svalex-adj-d.csv"
    run_file(default_input)