"""
Creates new files of the data, divided into test, dev and train, based on the
value of seed, folds, n_baseforms and the inputfiles.
Writes the abstract paradigms of each language to file.
Tunes the svm configurations.
An index of the files is kept in indexfile.
"""
# print_function keeps the output identical on Python 2 while making every
# print a plain function call.
from __future__ import print_function

import codecs
import os.path
import random
import sys
from subprocess import Popen, PIPE

from sklearn.cross_validation import KFold

import evaluate

# TODO this should be set in Makefile
_todir = 'xval/'


def preprocess(seed, folds, n_baseforms, indexfile, inputfiles, tmp):
    """
    Builds the cross-validation data for one language and tunes its
    configuration.

    seed        -- seed used both for sampling baseforms and for the folds
    folds       -- number of folds
    n_baseforms -- maximum number of baseforms to use
    indexfile   -- name of the index file; appended to the path returned by
                   evaluate.set_path()
    inputfiles  -- dict with the keys 'wiki_file', 'table_file',
                   'baseform_file' and 'config_file'
    tmp         -- passed on unchanged to classifier_svm_tune.tune

    Returns None. Does nothing but print a message if the baseform file
    already exists and conf_exist finds a matching configuration line.
    """
    # Create a file with only n_baseforms random words
    path = evaluate.set_path()
    # The language name is the last two '_'-separated parts of the table
    # file's base name (everything before its first '.').
    table_name = os.path.basename(inputfiles['table_file']).split('.')[0]
    langname = '_'.join(table_name.split('_')[-2:])
    filename = create_filename(langname, seed, folds, n_baseforms, path)
    config = inputfiles['config_file']
    indexfile = path + indexfile  # the name of the index file
    if os.path.isfile(filename) and conf_exist(filename, config):
        print(langname, 'already ok')
        return

    with codecs.open(inputfiles['baseform_file'], encoding='utf-8') as handle:
        baseforms = [b.strip() for b in handle.readlines()]
    used_baseforms = collect_and_print_baseforms(baseforms, n_baseforms,
                                                 seed, filename)

    # Create folds
    print(langname, 'using', len(used_baseforms), 'baseforms')
    ss = KFold(len(used_baseforms), n_folds=folds, shuffle=True,
               random_state=seed)
    gold_tables = read_gold(inputfiles['table_file'])
    for i, (train, test) in enumerate(ss):
        print(langname, 'fold', i)
        txtfile, parafile, devfile, testfile, testtablefile = \
            create_fold_names(i, langname, filename, indexfile, path)
        print('\tcreating paradigms')
        create_para_file(parafile, txtfile, train, used_baseforms,
                         gold_tables, path)
        print('\tprinting data files')
        print_test_file(test, used_baseforms, gold_tables, devfile, testfile,
                        testtablefile)
        add_to_index(langname, indexfile, parafile, testfile, testtablefile,
                     txtfile, filename)
        print('\tconfiguring')
        # NOTE(review): the paradigm file path is passed as make_conf's
        # `langname` argument, so the configuration is keyed by paradigm
        # file rather than by language name -- confirm this is intended.
        make_conf(parafile, devfile, testtablefile, parafile, inputfiles,
                  config, tmp)
    print(langname, 'ok')


def read_gold(gold_file):
    """
    Reads the gold inflection tables from a utf-8 file.

    Every non-blank line is split on ','; field 1 is the baseform and
    field 0 is the row, whose '|' separators are turned into ','.
    Consecutive lines with the same baseform form one table.

    Returns a dict mapping each baseform to its list of rows. If a baseform
    occurs in several non-adjacent groups, the last group wins.
    Raises IndexError for a non-blank line that contains no ','.
    """
    gold_tables = {}
    current_baseform, current_table = '', []
    with codecs.open(gold_file, 'r', encoding='utf-8') as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            forms = line.split(',')
            # New table: store the finished one before starting the next
            if forms[1] != current_baseform:
                if current_table:
                    gold_tables[current_baseform] = current_table
                current_table = []
                current_baseform = forms[1]
            current_table.append(forms[0].replace('|', ','))
    if current_table:
        gold_tables[current_baseform] = current_table
    return gold_tables


def create_fold_names(i, langname, filename, indexfile, path):
    """
    Give names to
    txtfile with the training data baseforms
    paradigmfile with the abstract paradigms for the training data
    testfile with the baseforms for the test data
    devfile with the baseforms for the dev data
    testtable with the correct inflection for the test data

    All names are derived from filename (without its extension) and the
    fold number i; langname, indexfile and path are not used.

    Returns (txtfile, paradigmfile, devfile, testfile, testtablefile).
    """
    stem, _ext = os.path.splitext(filename)
    txtfile = os.path.normpath('%s-%d.txt' % (stem, i))
    paradigmfile = os.path.normpath('%s-%d.para' % (stem, i))
    testfile = os.path.normpath('%s-%d-test.txt' % (stem, i))
    devfile = os.path.normpath('%s-%d-dev.txt' % (stem, i))
    testtablefile = os.path.normpath('%s-%d-test-table.txt' % (stem, i))
    return txtfile, paradigmfile, devfile, testfile, testtablefile


def create_filename(langname, seed, folds, n_baseforms, path):
    """
    Returns the name of the file holding the selected baseforms, built from
    path, _todir, the language name and the three numeric settings.
    """
    return '%s/data/%s/%s-%d-%d-%d.txt' % (path, _todir, langname,
                                           n_baseforms, folds, seed)


def print_test_file(test_indices, baseforms, tables, devfilename,
                    testfilename, testtablefilename):
    """
    Writes the held-out data of one fold.

    The baseforms selected by test_indices are split in two: the first half
    is written to testfilename, the second half to devfilename. The tables
    of all selected baseforms are written to testtablefilename, one row per
    line with the baseform appended, tables separated by a blank line.
    """
    testdevwords = [baseforms[i] for i in test_indices]
    # Floor division: a float index would break the slices under true division
    half = len(testdevwords) // 2
    testwords = '\n'.join(testdevwords[:half])
    devwords = '\n'.join(testdevwords[half:])
    testtables = []
    for i in test_indices:
        base = baseforms[i]
        testtables.append('\n'.join(row + ',' + base for row in tables[base]))
    with codecs.open(testfilename, 'w', encoding='utf8') as handle:
        handle.write(testwords)
    with codecs.open(devfilename, 'w', encoding='utf8') as handle:
        handle.write(devwords)
    with codecs.open(testtablefilename, 'w', encoding='utf8') as handle:
        handle.write('\n\n'.join(testtables))


def add_to_index(langname, indexfile, parafile, testfile, testtablefile,
                 txtfile, allbases):
    """
    Appends one tab-separated entry (preceded by a newline) to the index
    file, in the order: langname, parafile, txtfile, testfile,
    testtablefile, allbases.
    """
    entry = '\t'.join((langname, parafile, txtfile, testfile, testtablefile,
                       allbases))
    with open(indexfile, 'a') as handle:
        handle.write('\n%s' % entry)


def create_para_file(paradigmfile, txtfile, indices, baseforms, tables, path):
    """
    This function will call the perl program extract to create the abstract
    paradigms

    The training baseforms selected by indices are written to txtfile and
    their tables to a '.tmp' file next to paradigmfile. That file is piped
    through '<path>/src/extract.perl -u' and the output is stored in
    paradigmfile. Anything extract writes to stderr is reported on stderr.
    The '.tmp' file is left in place.
    """
    # Generate a new para file using extract
    name, _ext = os.path.splitext(paradigmfile)
    tmp_file = name + '.tmp'
    extract = path + '/src/extract.perl'
    train_paradigms = ['\n'.join(tables[baseforms[i]]) for i in indices]
    train_bases = '\n'.join([baseforms[i] for i in indices])

    print('Writing trainig baseforms to txt file', file=sys.stderr)
    with codecs.open(txtfile, 'w', encoding='utf8') as handle:
        handle.write(train_bases)

    # training data may be too large to pipe it from echo, therefore write
    # tmp file. It must be closed (flushed) before cat reads it.
    print('Writing trainig paradigms to tmp', file=sys.stderr)
    with codecs.open(tmp_file, 'w', encoding='utf8') as handle:
        handle.write('\n\n'.join(train_paradigms))

    print('Calling extract: cat', tmp_file, '| extract -u >', paradigmfile,
          file=sys.stderr)
    with open(paradigmfile, 'wb') as fileobj:
        p1 = Popen(["cat", tmp_file], stdout=PIPE)
        p2 = Popen([extract, '-u'], stdin=p1.stdout, stdout=fileobj,
                   stderr=PIPE)
        p1.stdout.close()  # Allow p1 to receive a SIGPIPE if p2 exits.
        errors = p2.communicate()[1]
        p1.wait()  # reap cat so it does not linger as a zombie
    if errors:
        print('Errors:', errors, file=sys.stderr)
    # Disabled post-processing step, kept for reference:
    # Popen(['sed', 's/,,/,/g','-i',paradigmfile], stderr=PIPE)
    print('Saving results in', paradigmfile, file=sys.stderr)


def collect_and_print_baseforms(baseforms, n, seed, filename):
    """
    Selects a given number of baseforms, using the seed

    If n is smaller than the number of baseforms, n of them are sampled
    after seeding the random module with seed; otherwise all are used.
    The selection is written to filename, one baseform per line, and
    returned.
    """
    if n < len(baseforms):
        random.seed(seed)
        use_baseforms = random.sample(baseforms, n)
    else:
        use_baseforms = baseforms
    with codecs.open(filename, 'w', encoding='utf-8') as handle:
        handle.write('\n'.join(use_baseforms))
    return use_baseforms


def conf_exist(langname, config):
    """
    Search for configuration file

    Reads config, or 'src/' + config when config itself is not a file, and
    returns the stripped lines that contain langname and no '#'. The list is
    empty (false) when there is no such line.
    """
    config_path = config if os.path.isfile(config) else 'src/' + config
    with open(config_path) as conffile:
        return [line.strip() for line in conffile
                if '#' not in line and langname in line]


def make_conf(paradigmfile, devfile, testtables, langname, inputfiles,
              config, tmp):
    """
    Tunes the configuration and writes the results to the conf file

    Calls classifier_svm_tune.tune for langname with the paradigm file, the
    wiki file from inputfiles, the test tables and the dev file, appending
    to config.
    """
    import classifier_svm_tune
    conf_input = (paradigmfile, inputfiles['wiki_file'], testtables, devfile)
    print('Creating new conf file', config, langname, conf_input,
          file=sys.stderr)
    classifier_svm_tune.tune([langname], {langname: conf_input},
                             config=config, append=True, tmp=tmp)


# Usage: python preprocess.py 41 10 5000 index_file wiki_file inflection_file baseform_file config_file tmp_dir
if __name__ == "__main__":
    if len(sys.argv) < 10:
        # Fail with a readable message instead of a bare IndexError
        sys.exit('Usage: python %s seed folds n_baseforms index_file '
                 'wiki_file inflection_file baseform_file config_file '
                 'tmp_dir' % sys.argv[0])
    seed = int(sys.argv[1])
    folds = int(sys.argv[2])
    n_baseforms = int(sys.argv[3])
    indexname = sys.argv[4]
    inputfiles = {
        'wiki_file': sys.argv[5],
        'table_file': sys.argv[6],
        'baseform_file': sys.argv[7],
        'config_file': sys.argv[8],
    }
    tmp = sys.argv[9]
    preprocess(seed, folds, n_baseforms, indexname, inputfiles, tmp)