import codecs
import evaluate
import os.path
import random
from sklearn.cross_validation import KFold 
from subprocess import Popen, PIPE
import sys

"""
Creates new files of the data, divided into test, dev and train,
based on the value of seed, folds, n_baseforms and the inputfiles.
Writes the abstract paradigms of each language to file.
Tunes the svm configurations.
An index of the files are kept in indexfile.
"""

# TODO this should be set in Makefile
# Sub-directory (relative to '<path>/data/') that create_filename() uses
# when building the names of the generated cross-validation files.
_todir = 'xval/'

def preprocess(seed, folds, n_baseforms, indexfile, inputfiles, tmp):
    # Create a file with only n_baseforms random words
    path = evaluate.set_path()
    langname = '_'.join(os.path.basename(inputfiles['table_file']).split('.')[0].split('_')[-2:])
    filename = create_filename(langname, seed, folds, n_baseforms, path)
    config = inputfiles['config_file']
    indexfile = path+indexfile # the name of the index file 
    if os.path.isfile(filename) and conf_exist(filename, config):
          print langname,'already ok'
          return
    baseforms = [b.strip() for b in codecs.open(inputfiles['baseform_file'],encoding='utf-8').readlines()]
    used_baseforms = collect_and_print_baseforms(baseforms, n_baseforms, seed, filename)


    # Create folds
    print langname, 'using', len(used_baseforms), 'baseforms'
    ss = KFold(len(used_baseforms),n_folds=folds,shuffle=True,random_state=seed)
         
    gold_tables = read_gold(inputfiles['table_file'])
    for i,(train,test) in enumerate(ss):
        print langname, 'fold', i
        txtfile, parafile, devfile, testfile, testtablefile = create_fold_names(i, langname, filename, indexfile, path)
        print '\tcreating paradigms'
        create_para_file(parafile, txtfile, train, used_baseforms, gold_tables, path)
        print '\tprinting data files'
        print_test_file(test, used_baseforms, gold_tables, devfile, testfile, testtablefile)
        add_to_index(langname,indexfile,parafile,testfile,testtablefile,txtfile,filename)
        print '\tconfiguring'

        make_conf(parafile, devfile, testtablefile, parafile, inputfiles, config, tmp)
    print langname,'ok'


def read_gold(gold_file):
    """
    Read the gold inflection tables from gold_file (utf-8).

    Every non-blank line is split on ','; the first field holds the
    '|'-separated forms of one table row and the second field holds the
    baseform. Consecutive lines with the same baseform make up one table.

    Returns a dict mapping each baseform to its table, a list of rows
    where the forms of a row are joined with ','. If a baseform occurs
    in several non-adjacent groups, the last group wins.

    Raises IndexError for a non-blank line without a ','.
    """
    gold_tables = {}
    current_baseform, current_table = '', []
    # The with-statement makes sure the file is closed when reading is done.
    with codecs.open(gold_file, 'r', encoding='utf-8') as gold:
        for line in gold:
            line = line.strip()
            if not line:
                continue
            forms = line.split(',')
            # New table: store the finished one before starting the next
            if forms[1] != current_baseform:
                if current_table:
                    gold_tables[current_baseform] = current_table
                    current_table = []
                current_baseform = forms[1]
            current_table.append(','.join(forms[0].split('|')))
    # Store the table that was still being collected at end of file
    if current_table:
        gold_tables[current_baseform] = current_table
    return gold_tables


def create_fold_names(i, langname, filename, indexfile, path):
    """
    Build the file names belonging to fold number i.

    All names share the stem of filename (its extension removed) followed
    by '-<i>', and are normalised with os.path.normpath. Returned, in
    this order:
        txtfile       training data baseforms
        paradigmfile  abstract paradigms for the training data
        devfile       baseforms for the dev data
        testfile      baseforms for the test data
        testtablefile correct inflections for the test data

    langname, indexfile and path are accepted but not used.
    """
    stem = os.path.splitext(filename)[0]
    fold_prefix = '%s-%d' % (stem, i)

    def fold_file(suffix):
        return os.path.normpath(fold_prefix + suffix)

    return (fold_file('.txt'),
            fold_file('.para'),
            fold_file('-dev.txt'),
            fold_file('-test.txt'),
            fold_file('-test-table.txt'))


def create_filename(langname, seed, folds, n_baseforms, path):
    """
    Return the name of the file holding the selected baseforms:
    '<path>/data/<_todir>/<langname>-<n_baseforms>-<folds>-<seed>.txt'
    """
    basename = '%s-%d-%d-%d.txt' % (langname, n_baseforms, folds, seed)
    return '%s/data/%s/%s' % (path, _todir, basename)


def print_test_file(test_indices, baseforms, tables, devfilename, testfilename, testtablefilename):
    """
    Write the held-out words of a fold to file (utf8).

    The baseforms selected by test_indices are split in two: the first
    half (rounded down) is written to testfilename, the rest to
    devfilename, one baseform per line. The gold tables of all held-out
    baseforms are written to testtablefilename, each row followed by
    ',<baseform>' and the tables separated by an empty line.

    Raises KeyError if a held-out baseform has no entry in tables.
    """
    testdevwords = [baseforms[i] for i in test_indices]
    # Floor division: the result is used as a slice index and must be an
    # integer regardless of how '/' is defined for ints.
    half = len(testdevwords) // 2
    testwords = '\n'.join(testdevwords[:half])
    devwords = '\n'.join(testdevwords[half:])
    testtables = []
    for i in test_indices:
        base = baseforms[i]
        testtables.append('\n'.join(row+','+base for row in tables[base]))
    # Close every file explicitly so the data is flushed to disk before returning.
    with codecs.open(testfilename, 'w', encoding='utf8') as testfile:
        testfile.write(testwords)
    with codecs.open(devfilename, 'w', encoding='utf8') as devfile:
        devfile.write(devwords)
    with codecs.open(testtablefilename, 'w', encoding='utf8') as tablefile:
        tablefile.write('\n\n'.join(testtables))

def add_to_index(langname,indexfile,parafile,testfile,testtablefile,txtfile,allbases):
    """
    Append one entry to indexfile.

    The entry is a newline followed by the tab-separated fields
    langname, parafile, txtfile, testfile, testtablefile, allbases
    (note that this differs from the parameter order).
    """
    entry = '\t'.join((langname, parafile, txtfile, testfile, testtablefile, allbases))
    # Close the file explicitly so the entry is flushed before returning.
    with open(indexfile, 'a') as index:
        index.write('\n%s' % entry)


def create_para_file(paradigmfile, txtfile, indices, baseforms, tables, path):
    """
    This function will call the perl program extract to create
    the abstract paradigms.

    The baseforms selected by indices are written to txtfile and their
    tables to '<paradigmfile without extension>.tmp' (tables separated
    by an empty line). The tmp file is then piped through
    '<path>/src/extract.perl -u' and the output is stored in
    paradigmfile. Anything extract prints on stderr is reported on
    stderr. The tmp file is left in place.

    Raises KeyError if a selected baseform has no entry in tables.
    """
    tmp_file = os.path.splitext(paradigmfile)[0]+'.tmp'
    extract =  path+'/src/extract.perl'

    train_paradigms = ['\n'.join(tables[baseforms[i]]) for i in indices]
    train_bases     = '\n'.join([baseforms[i] for i in indices])
    print >> sys.stderr, 'Writing trainig baseforms to txt file'
    with codecs.open(txtfile, 'w', encoding='utf8') as txt:
        txt.write(train_bases)
    # training data may be to large to pipe it from echo, therefore write tmp file
    print >> sys.stderr, 'Writing trainig paradigms to tmp'
    # The tmp file must be closed (and thereby flushed) before cat reads it.
    with codecs.open(tmp_file, 'w', encoding='utf8') as tmp:
        tmp.write('\n\n'.join(train_paradigms))
    print >> sys.stderr, 'Calling extract: cat',tmp_file,'| extract -u >',paradigmfile
    # Generate a new para file using extract
    with codecs.open(paradigmfile, 'w', encoding='utf8') as fileobj:
        p1 = Popen(["cat",tmp_file], stdout=PIPE)
        p2 = Popen([extract, '-u'], stdin=p1.stdout, stdout=fileobj, stderr=PIPE)
        p1.stdout.close()  # Allow p1 to receive a SIGPIPE if p2 exits.
        errors = p2.communicate()[1]
        p1.wait()  # Reap cat so that no zombie process is left behind.
    if errors: print >> sys.stderr, 'Errors:',errors
    print >> sys.stderr, 'Saving results in',paradigmfile


def collect_and_print_baseforms(baseforms, n, seed, filename):
    """
    Selects a given number of baseforms, using the seed.

    If n is smaller than the number of baseforms, the module-level
    random generator is seeded with seed and n baseforms are sampled;
    otherwise the baseforms list itself is used. The selection is
    written to filename (utf-8), one baseform per line, and returned.
    """
    if n < len(baseforms):
        random.seed(seed)
        use_baseforms = random.sample(baseforms, n)
    else:
        use_baseforms = baseforms
    # Close the file explicitly so the selection is on disk before returning.
    with codecs.open(filename, 'w', encoding='utf-8') as outfile:
        outfile.write('\n'.join(use_baseforms))
    return use_baseforms


def conf_exist(langname, config):
    """
    Search for configuration file entries.

    Opens config, or 'src/'+config when config is not an existing file,
    and returns the stripped lines that contain langname and no '#'.
    The returned list is empty (false) when there is no such line.

    Raises IOError if neither file can be opened.
    """
    conf_path = config if os.path.isfile(config) else 'src/'+config
    # The with-statement closes the configuration file after reading.
    with open(conf_path) as conffile:
        return [line.strip() for line in conffile
                if '#' not in line and langname in line]

def make_conf(paradigmfile, devfile, testtables, langname, inputfiles,config,tmp):
    """
    Tunes the configuration and writes the results to the conf file.

    Hands the paradigm file, the wiki file, the test tables and the dev
    file to classifier_svm_tune.tune under the key langname, asking it
    to append to config.
    """
    import classifier_svm_tune
    tuning_input = (paradigmfile, inputfiles['wiki_file'], testtables, devfile)
    print >> sys.stderr, 'Creating new conf file', config, langname, tuning_input
    input_by_language = {langname: tuning_input}
    classifier_svm_tune.tune([langname], input_by_language,
                             config=config, append=True, tmp=tmp)

# Usage: python preprocess.py seed folds n_baseforms index_file wiki_file table_file baseform_file config_file tmp_dir
# e.g.:  python preprocess.py 41 10 5000 index_file wiki_file inflection_file baseform_file config_file tmp_dir
if __name__ == "__main__":
    # Nine arguments are required; fail with a usage message rather than
    # an IndexError when some are missing.
    if len(sys.argv) < 10:
        sys.exit('Usage: python preprocess.py seed folds n_baseforms index_file '
                 'wiki_file table_file baseform_file config_file tmp_dir')
    seed        = int(sys.argv[1])
    folds       = int(sys.argv[2])
    n_baseforms = int(sys.argv[3])
    indexname   = sys.argv[4]
    inputfiles  = {'wiki_file' : sys.argv[5]
                  ,'table_file' : sys.argv[6]
                  ,'baseform_file' : sys.argv[7]
                  ,'config_file' : sys.argv[8]
                  }
    tmp = sys.argv[9]

    preprocess(seed, folds, n_baseforms, indexname, inputfiles, tmp)