# -*- encoding: UTF-8 -*-
from __future__ import division
import codecs
import corpus as C
import numpy as np
import os.path
from paradigms import Paradigms
import random
import sys

# TODO should we include the corpus code?

"""
This module is responsible for the cross evaluation. The data should be
prepared (using preprocess.py), and the paths to the files needed for each
iteration should be listed in an index file (data/xval/index.txt).
"""

# all existing classifiers
all_classifiers = ["classifier_dummy", "classifier_most_freq", "classifier_most_freq_c_r",
                   "classifier_most_freq_c", "classifier_maxsuff", "classifier_maxsuff_c",
                   "classifier_svm", "classifier_dfa", "classifier_dfa_pair"]


def eval(classifiers, test_languages, out, xval='', path="../", verbose=True,
         config='classifier_svm_true_new5.conf', langdir={}):
    """ Evaluates the specified classifiers on the specified languages """
    if xval:
        index_file = read_index(xval, path)
    else:
        index_file = langdir
    languages = []
    for language in test_languages:
        if verbose:
            print >> out, ''
            print 'lang', language
            print >> out, '[ %s ]' % language
        fold_data = index_file[language]
        # the corpus file should always be in data/wikipedia
        corpus_file = path+'data/wikipedia/stats_'+language.split('_')[0]+'.txt'
        if verbose:
            print >> out, 'Cross validation on %s with %d folds' % (language, len(fold_data))
        best_form_accs = []
        best_table_accs = []
        corpus = C.Corpus(corpus_file)
        paras = [Paradigms(info[0]) for info in fold_data]
        gold_tables = [read_gold(info[2]) for info in fold_data]
        if verbose:
            print 'read gold from', ', '.join([info[2] for info in fold_data])
        for cname, c in classifiers:
            xres = []
            # go through the folds
            for i, (para, test, tables) in enumerate(fold_data):
                p = paras[i]
                if verbose == 2:
                    print >> out, 'Iteration', i,
                    print >> out, 'paradigms', p.paradigmfilename, len(test)
                if "svm" in cname:  # all svms want a config?
                    c.config = path+config
                    if verbose == 2:
                        print 'configuration file:', c.config
                c.thislang = os.path.normpath(para)
                c.out = out
                c.train(p, corpus)
                gold = gold_tables[i]
                res = test_words(c, gold, test, p, out, verbose=verbose)
                xres.append(res)
                if verbose == 2:
                    print_classoutput(res, out)

            print >> out, ''
            print >> out, cname
            oracle_table = np.array([count_match(float("inf"), x['matches'], x['total_words'])
                                     for x in xres])
            match_pos = sum((x['matches'] for x in xres), [])
            total = np.array([x['total_words'] for x in xres])
            oracle_forms = np.array([x['oracle_form']/x['all_forms'] for x in xres])
            forms = np.array([x['all_forms'] for x in xres])
            form_acc = np.array([x['form_acc']/x['all_forms'] for x in xres])
            table_acc = np.array([x['all_correct']/x['total_words'] for x in xres])
            best_form_accs.append((form_acc.mean(), cname))
            best_table_accs.append((table_acc.mean(), cname))
            correct_t = sum((x['all_correct'] for x in xres))
            correct_f = sum((x['form_acc'] for x in xres))
            non_exists = np.array([x['non_exist']/x['all_forms'] for x in xres])
            variants = np.array([x['variants']/x['all_forms'] for x in xres])
            print_classoutput2(total.sum(), forms.sum(), form_acc, table_acc, correct_t,
                               correct_f, match_pos, non_exists, variants, out)

            # The below are the same for all classifiers, to be used in printing later
            total = total.sum()
            oracle_forms = oracle_forms.mean()
            oracle_table = oracle_table.mean()

        print >> out, '\n oracle_table: %.2f%s' % (oracle_table, '%')
        print >> out, ' oracle_form: %.2f%s' % (100*oracle_forms, '%')
        no_paras = sum(len(p.instances) for p in paras[0].paradigms.values())
        print >> out, 'Used %d train tables in each iteration' % no_paras
        print >> out, ''
        languages.append((language, best_form_accs, best_table_accs))

    print >> out, ''
    print >> out, '* Summary\n[table acc.]'
    print >> out, 11*' ',
    for cname, c in classifiers:
        print >> out, '%11s' % '_'.join(cname.split('_')[1:]),
    print >> out, ''
    for lang_res in languages:
        print >> out, ' %-9s' % lang_res[0],
        print >> out, ''.join(('%11.2f%%' % round(100*s, 2) for s, c in lang_res[2]))
    print >> out, '[form acc.]'
    print >> out, 11*' ',
    for cname, c in classifiers:
        print >> out, '%11s' % '_'.join(cname.split('_')[1:]),
    print >> out, ''
    for lang_res in languages:
        print >> out, ' %-9s' % lang_res[0],
        print >> out, ''.join(('%11.2f%%' % round(100*s, 2) for s, c in lang_res[1]))
    return (best_table_accs[0], form_acc)  # Only for svm_conf


def read_index(indexfile, path="../"):
    # lang : (para, train, test, tables, all_bases)
    index_dir = {}
    for line in codecs.open(indexfile).readlines():
        if line.strip():
            xs = line.strip().split('\t')
            index_dir.setdefault(xs[0], []).append((path+xs[1], path+xs[3], path+xs[4]))
    return index_dir


def find_conf(langname, p, c, t, test, config):
    if os.path.isfile(config):
        return config
    else:
        return 'src/'+config


def collect_baseforms(baseforms, n, seed):
    if n < len(baseforms):
        random.seed(seed)
        return random.sample(baseforms, n)
    else:
        return baseforms


def print_classoutput2(total, forms, form_acc, table_acc, correct_t, correct_f,
                       match_pos, non_exist, variants, out):
    print >> out, ' table: %.2f%s (%d of %d)' % (100*table_acc.mean(), '%', correct_t, total),
    print >> out, '\tSd: %.2f. Max %.2f%%. Min %.2f%%' % (table_acc.std()*100, table_acc.max()*100, table_acc.min()*100)
    print >> out, ' form: %.2f%s (%d of %d).' % (100*form_acc.mean(), '%', correct_f, forms),
    print >> out, '\tSd: %.2f. Max %.2f%%. Min %.2f%%' % (form_acc.std()*100, form_acc.max()*100, form_acc.min()*100)
    # non_exist and variants hold per-fold fractions; convert to percentages for printing
    print >> out, ' Empty forms/fold: %.2f%%. Variants/fold: %.2f%%' % (100*non_exist.mean(), 100*variants.mean())
    print >> out, ' 1: %.2f%s' % (count_match(1, match_pos, total), '%'),
    print >> out, '2: %.2f%s' % (count_match(2, match_pos, total), '%'),
    print >> out, '3: %.2f%s' % (count_match(3, match_pos, total), '%'),
    print >> out, '5: %.2f%s' % (count_match(5, match_pos, total), '%'),
    print >> out, '10: %.2f%s' % (count_match(10, match_pos, total), '%')


def print_classoutput(res, out):
    total = res['total_words']
    match_pos = res['matches']
    forms = res['all_forms']
    print >> out, 'words: %d' % total
    print >> out, ' table: %.2f%s (%d of %d)' % (100*(res['all_correct']/float(total)), '%', res['all_correct'], total)
    print >> out, ' form: %.2f%s (%d of %d)' % (100*(res['form_acc']/float(forms)), '%', res['form_acc'], forms)
    print >> out, ' Empty forms: %.2f%%, forms with variants: %.2f%%' % (100*(res['non_exist']/float(forms)), 100*(res['variants']/float(forms)))
    print >> out, ' 1: %.2f%s' % (count_match(1, match_pos, total), '%'),
    print >> out, '2: %.2f%s' % (count_match(2, match_pos, total), '%'),
    print >> out, '3: %.2f%s' % (count_match(3, match_pos, total), '%'),
    print >> out, '5: %.2f%s' % (count_match(5, match_pos, total), '%'),
    print >> out, '10: %.2f%s' % (count_match(10, match_pos, total), '%')


def count_match(pos, match_pos, total):
    # percentage of words whose correct table was found among the first 'pos' suggestions
    return 100*len([x for x in match_pos if x < pos])/total


def test_words(classifier, gold_tables, testfile, paradigmobj, out, verbose=True):
    """ Classifies every word in testfile and scores the suggested inflection
        tables against gold_tables. NB: the parameter order, the initialisations
        and the word-reading loop below are inferred from the call in eval()
        and from the result dictionary returned at the end of the function. """
    all_correct, form_acc, all_forms, total = 0, 0, 0, 0
    oracle_form, non_exist, variants = 0, 0, 0
    matches = []
    for word in codecs.open(testfile, 'r', encoding='utf-8'):
        if verbose == 2:
            print >> out, ''
            print >> out, word
        total += 1
        max_score = 0
        word = word.strip()
        g_table = gold_tables.get(word, '')
        this_forms = len(g_table) - 1  # -1 to avoid counting the Input form
        i = 1
        if not g_table:
            print >> sys.stderr, "Error! Word %s was not present in gold standard" % word  # .encode('utf-8')
        bs = classifier.classify(word)
        if verbose == 2:
            print >> out, 'options', len(bs)
        match_position, found_match = 0, float("inf")
        for b, confidence in bs:
            # Select the best choice from the classifier
            table = paradigmobj.get_table(b, word)
            if table:
                if verbose == 2:
                    print >> out, '\t%d/%d: %s as %s (%.2f)?' % (i, len(bs), word, b, confidence),
                # Pick the first option returned.
                # Different options correspond to different matching strategies
                table = table[0]
                score, forms, ne, var = rate(table, g_table, out, verbose and match_position == 0)
                non_exist += ne
                variants += var
                max_score = max(score, max_score)
                if match_position == 0:
                    all_correct += score == forms
                    form_acc += score
                    # the number of forms depends on whether the classifier finds the
                    # non-existing forms or not
                    this_forms = forms
                if score == forms:
                    found_match = match_position
                    if verbose == 2:
                        print >> out, 'Correct!'
                    break
                match_position += 1  # counts the number of invalid suggestions from the classifier
                if verbose == 2:  # and match_position==0:
                    print >> out, '\tIncorrect: Guessed', b, 'for word', word, 'scoring', score, 'out of', forms
            else:
                if verbose == 2:
                    print >> out, '\t%d/%d: Fail %s = %s (%.2f)' % (i, len(bs), word, b, confidence)
                match_position += 1  # counts the number of invalid suggestions from the classifier
            i += 1
        all_forms += this_forms
        if verbose == 2 and found_match == float("inf"):
            print >> out, '---->', word  # .encode('utf-8')
        elif verbose == 2 and found_match != 0:
            print >> out, '-->', word.encode('utf-8')
        oracle_form += max_score  # add the highest encountered score
        matches.append(found_match)
    return {'all_correct': all_correct,
            'form_acc': form_acc,
            'all_forms': all_forms,
            'matches': matches,
            'total_words': total,
            'oracle_form': oracle_form,
            'non_exist': non_exist,
            'variants': variants}


def rate(table, g_table, out, verbose):
    # TODO explain how scores are given to variant forms
    if not table:
        # Check that the classifier returned a valid guess
        non_exist = sum(1 for f in g_table if f.endswith('*'))
        variants = sum(len(g.split(','))-1 for g in g_table)
        return 0, len(g_table), non_exist, variants
    score = []
    keep = []
    non_exist, variants = 0, 0
    if len(table) != len(g_table):
        print >> out, 'rating unequals', table[0], len(table), g_table[0], len(g_table)
        print >> out, table
        print >> out, g_table
    # remove the first form (the Input slot) of the tables before comparing
    for (guess, gold) in zip(table[1:], g_table[1:]):
        is_nonexist = gold.endswith('*')
        non_exist += int(is_nonexist)
        variants += len(gold.split(','))-1  # if gold splits into more than one form, then n variants are used
        if verbose == 2:
            keep.append((gold, guess, '-' if gold != guess else ""))
        if not is_nonexist or guess != gold:
            # if it is a non-existing form, only negative scores are counted
            if verbose == 2:
                print >> out, 'counting score', is_nonexist, guess != gold
            score.append(guess == gold)
            keep.append((gold, guess, '-' if gold != guess else ""))
    if verbose == 2 and sum(score) != len(score):
        print >> out, '\n\t', '\n\t'.join(' '.join(k) for k in keep)
    # guesses = guess.split(',')
    return sum(score), len(score), non_exist, variants


def read_baseforms(inputfile):
    return [line.strip() for line in codecs.open(inputfile, 'r', encoding='utf-8')]


def read_gold(gold_file):
    current_baseform, current_table = '', []
    gold_tables = {}
    for line in codecs.open(gold_file, 'r', encoding='utf-8'):
        if line.strip():
            forms = line.strip().split(',')
            # New table
            if forms[-1] != current_baseform:
                if current_table:
                    gold_tables[current_baseform] = current_table
                current_table = []
                current_baseform = forms[-1]
                current_table.append(','.join(forms[:-1]))
            # Traversing table
            else:
                current_table.append(','.join(forms[:-1]))
    if current_table:
        gold_tables[current_baseform] = current_table
    return gold_tables


def set_path():
    # If the file is run from Make (..), set the path to other files to '.';
    # if run from src, set the path to other files to '..'
    if os.path.dirname(sys.argv[0]):
        return "./"
    else:
        return "../"


all_languages = ["de_noun", "de_verb", "es_verb", "fi_nounadj", "fi_verb", "mt_verbs",
                 "ca_nouns", "ca_verbs", "en_nouns", "en_verbs", "fr_nouns", "fr_verbs",
                 "gl_nouns", "gl_verbs", "it_nouns", "it_verbs", "pt_nouns", "pt_verbs",
                 "ru_nouns", "ru_verbs"]

usage = """
Usage: python evaluate.py -c classifier1 classifier2 ... classifierN -l en_nouns de_noun -xval indexfile.txt
or, for all languages and all classifiers:
       python evaluate.py -ca -la -xval indexfile.txt
-v[0,1,2]  verbosity
-h         help
"""

if __name__ == "__main__":
    inputs = False
    verbose = True
    find_output, output = False, 'std_output.txt'
    find_xval, xval_file = False, ''
    find_config, config = False, ''
    use_all_classifiers = False
    use_all_languages = False
    classnames, classifiers, languages = [], [], []
    path = set_path()

    # non-elegant argument parsing
    if "-h" in sys.argv:
        print usage
        sys.exit()
    for arg in sys.argv[1:]:
        if arg == "-v0":
            verbose = False
        elif arg == "-v2":
            verbose = 2
        elif arg == "-xval":
            find_xval = True
        elif arg == "-config":
            find_config = True
        elif arg == "-ca":
            use_all_classifiers = True
        elif arg == "-la":
            use_all_languages = True
        elif arg == "-o":
            find_output = True
            inputs = False
        elif arg == "-l":
            inputs = True
        elif find_output:
            output = arg
            find_output = False
        elif find_config:
            config = arg
            find_config = False
        elif find_xval:
            xval_file = arg
            find_xval = False
        elif inputs:
            languages.append(arg)
        else:
            classnames.append(arg)

    if use_all_languages:
        languages = all_languages
    if use_all_classifiers:
        classnames = all_classifiers

    # load the given classifiers
    for c in classnames:
        if c.startswith("classifier_dummy"):
            import classifier_dummy
            classifiers.append((c, classifier_dummy.Classifier()))
        elif c == "classifier_most_freq":
            import classifier_most_freq
            classifiers.append((c, classifier_most_freq.Classifier()))
        #elif c == "classifier_most_freq_c":
        #    import classifier_most_freq_c
        #    classifiers.append((c, classifier_most_freq_c.Classifier()))
        elif c == "classifier_svm":
            import classifier_svm
            classifiers.append((c, classifier_svm.Classifier()))
        #elif c == "classifier_perceptron":
        #    import classifier_perceptron
        #    classifiers.append((c, classifier_perceptron.Classifier()))
        #elif c == "classifier_perceptron_c":
        #    import classifier_perceptron_c
        #    classifiers.append((c, classifier_perceptron_c.Classifier()))
        #elif c == "classifier_ann":
        #    import classifier_ann
        #    classifiers.append((c, classifier_ann.Classifier()))
        elif c == "classifier_maxsuff":
            import classifier_maxsuff
            classifiers.append((c, classifier_maxsuff.Classifier()))
        #elif c == "classifier_maxsuff_c":
        #    import classifier_maxsuff_c
        #    classifiers.append((c, classifier_maxsuff_c.Classifier()))
        #elif c == "classifier_maxsuff_c_r":
        #    import classifier_maxsuff_c_r
        #    classifiers.append((c, classifier_maxsuff_c_r.Classifier()))
        #elif c == "classifier_dfa":
        #    import classifier_dfa
        #    classifiers.append((c, classifier_dfa.Classifier()))
        #elif c == "classifier_dfa_pair":
        #    import classifier_dfa_pair
        #    classifiers.append((c, classifier_dfa_pair.Classifier()))

    if not classifiers:
        print >> sys.stderr, 'No classifiers given'
        sys.exit(1)

    with codecs.open(output, 'w', encoding='utf-8') as out:
        eval(classifiers, languages, out, verbose=verbose, path=path,
             config=config, xval=xval_file)
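
# A minimal sketch of how eval() can also be called without an index file, by
# passing the per-fold files directly via `langdir`. Each fold is a (paradigm
# file, test wordlist, gold tables) triple, mirroring what read_index() builds
# from data/xval/index.txt. The file names below are hypothetical and only
# illustrate the expected layout:
#
#   import classifier_maxsuff
#   folds = {'de_noun': [('data/xval/de_noun_0.para',
#                         'data/xval/de_noun_0.test',
#                         'data/xval/de_noun_0.gold')]}
#   with codecs.open('out.txt', 'w', encoding='utf-8') as out:
#       eval([('classifier_maxsuff', classifier_maxsuff.Classifier())],
#            ['de_noun'], out, langdir=folds, path='../')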