import evaluate
import codecs
import os

# This script tunes the parameters for classifier_svm using the dev set.
# Parameters are output into classifier_svm.conf using the following format:
#   lang maxlen prefix suffix C
# For example:
#   de_nouns   5 0 1 2.500000
#   de_verbs   7 1 1 5.000000
#   es_verbs   9 0 1 0.050000
#   fi_nounadj 7 0 1 5.000000
#   fi_verbs   9 0 1 5.000000
# maxlen: the maximum length of substrings to use as features
# prefix: whether to use prefix features
# suffix: whether to use suffix features
# C:      penalty parameter for LIBLINEAR
# The file classifier_svm.conf is later read by classifier_svm.py when
# evaluating the test set.


def runsvm(l, maxlen, prefix, C, languages={}, tmp_dir=''):
    # Run one dev-set evaluation for language l with the given parameters.
    # Each languages entry maps a language to
    # (train paradigms, wikipedia stats, inflections csv, dev base forms).
    if not languages:
        languages = {
            'de_nouns': ('../paradigms/de_nouns_train_nodev.para', '../data/wikipedia/stats_de.txt', '../data/wiktionary-morphology-1.1/inflections_de_noun.csv', '../data/wiktionary-morphology-1.1/base_forms_de_noun_dev.txt'),
            'de_verbs': ('../paradigms/de_verbs_train_nodev.para', '../data/wikipedia/stats_de.txt', '../data/wiktionary-morphology-1.1/inflections_de_verb.csv', '../data/wiktionary-morphology-1.1/base_forms_de_verb_dev.txt'),
            'es_verbs': ('../paradigms/es_verbs_train_nodev.para', '../data/wikipedia/stats_es.txt', '../data/wiktionary-morphology-1.1/inflections_es_verb.csv', '../data/wiktionary-morphology-1.1/base_forms_es_verb_dev.txt'),
            'fi_nounadj': ('../paradigms/fi_nounadj_train_nodev.para', '../data/wikipedia/stats_fi.txt', '../data/wiktionary-morphology-1.1/inflections_fi_nounadj.csv', '../data/wiktionary-morphology-1.1/base_forms_fi_nounadj_dev.txt'),
            'fi_verbs': ('../paradigms/fi_verbs_train_nodev.para', '../data/wikipedia/stats_fi.txt', '../data/wiktionary-morphology-1.1/inflections_fi_verb.csv', '../data/wiktionary-morphology-1.1/base_forms_fi_verb_dev.txt')
            }

    # Temporarily write the current parameters to a per-language config file
    # so eval will use them when doing one evaluation on the dev set
    # (suffix features are always enabled, hence the hard-coded 1).
    config_tmp = './%s%sclassifier_svm_tmp.conf' % (tmp_dir, l.split('/')[-1][:5])
    f = open(config_tmp, 'w')
    print >>f, "%s\t%i\t%i\t%i\t%f" % (l, maxlen, prefix, 1, C)
    f.close()

    # Import lazily so the module is only loaded when an evaluation is run
    import classifier_svm
    classifiers = ('classifier_svm', classifier_svm.Classifier())
    classifiers[1].thislang = l
    classifiers[1].config = config_tmp

    # evaluate.eval expects (paradigm file, dev base forms, inflections csv)
    lang_d = {l: [(languages[l][0], languages[l][3], languages[l][2])]}
    with codecs.open(os.devnull, 'w', encoding='utf-8') as tmp:
        x = evaluate.eval([classifiers], [l], tmp, langdir=lang_d, config=config_tmp, verbose=False)
    (tableacc, formacc) = x
    return (tableacc, formacc)


def tune(languages, language_dict={}, append=False, config='config_svm.conf', tmp=''):
    # Grid-search maxlen and prefix (C fixed at 5.0) on the dev set and keep
    # the best-scoring configuration per language.
    bestparams = {}
    for l in languages:
        besttable = 0.0
        bestform = 0.0
        i = 0
        for maxlen in [3, 5, 7, 9]:
            for prefix in [1, 0]:
                C = 5.0
                #for C in [0.01,0.05,0.15,0.25,0.35,0.45,0.55,0.65,0.75,0.85,0.95,1.1,1.5,1.9,2.5,3.0,5.0]:
                i += 1
                (pertable, perform) = runsvm(l, maxlen, prefix, C, languages=language_dict, tmp_dir=tmp)
                if pertable >= besttable and perform >= bestform:
                    bestparams[l] = (maxlen, prefix, C)
                    besttable = pertable
                    bestform = perform
        print '\tTested %d configurations' % i

    f = open(config, 'a' if append else 'w')
    for l in languages:
        print >>f, "%s\t%i\t%i\t%i\t%f" % (l, bestparams[l][0], bestparams[l][1], 1, bestparams[l][2])
    f.close()


if __name__ == "__main__":
    #tune(['de_nouns','de_verbs','es_verbs','fi_nounadj','fi_verbs'])
    tune(['mt_verbs' + str(i) for i in range(0, 10)])
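
# Illustrative only (not called by the tuner): a minimal sketch, under the
# format documented above, of how a classifier_svm.conf line could be parsed
# back into per-language parameters. The name read_svm_config is hypothetical;
# the actual reader lives in classifier_svm.py and may differ.
def read_svm_config(path):
    params = {}
    for line in open(path):
        fields = line.strip().split('\t')
        if len(fields) != 5:
            continue
        lang, maxlen, prefix, suffix, C = fields
        params[lang] = (int(maxlen), int(prefix), int(suffix), float(C))
    return params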