# -*- coding: utf-8 -*- import codecs import sys def testandeval(gold_file,verbose_file,out_file): gold_tables = read_gold(gold_file) possible_tables = parse_verbose(verbose_file) suggestions = [] no_solution = [] max_scores = [] all_correct,pairwise = 0,[] distribution = [] noOfchunks = 10 for word,tables in possible_tables.iteritems(): gold_table = gold_tables[word] max_score = 0 suggested_score = None best_paradigm = ('',[]) for table,paradigm,confidence in tables: n = rate([w for w,c in table],gold_table) if suggested_score is None: # First table = suggested table suggested_score = n correct = n==1 all_correct += correct pairwise.append((confidence,n,correct)) suggested_paradigm = paradigm suggested_table = table suggested_confidence = confidence if n > max_score: max_score = n best_paradigm = (paradigm,table,confidence) if max_score > suggested_score: suggestions.append((word,best_paradigm,max_score,suggested_table,suggested_paradigm,suggested_score,gold_table,suggested_confidence)) if max_score<1: no_solution.append((word,max_score)) max_scores.append(max_score) distribution = [] for i,chunk in enumerate(chunks(sorted(pairwise),noOfchunks)): c_val = sum(c for (conf,p,c) in chunk) p_val = sum(p for (conf,p,c) in chunk) l = float(len(chunk)) distribution.append((noOfchunks-i,100*c_val/l,100*p_val/l,c_val,l)) # Printing printings = [] printings.append('Tested %d words, found %d better paradigms' % (len(possible_tables),len(suggestions))) if no_solution: printings.append('No matching paradigm found for %d words: %s' % (len(no_solution),' '.join(w for w,s in no_solution))) printings.append('Oracle score: %f, %d/%d' % (sum(max_scores)/float(len(max_scores)),sum(s for s in max_scores if s==1),len(max_scores))) printings.append('Correct %f (%d/%d)' % (float(all_correct)/len(pairwise),all_correct,len(pairwise))) printings.append('Top %d: %d%% (%d%%) (%d/%d)' % (distribution[-1])) printings.append('Distribution: ' + '/'.join(str(int(c)) for (_,c,_,_,_) in reversed(distribution))) printings.append('Pairwise %f' % (sum(p for (co,p,c) in pairwise)/float(len(pairwise)))) printings.append('Distribution: ' + '/'.join(str(int(p)) for (_,_,p,_,_) in reversed(distribution))) printings.append('+ means that the form could be improved by picking the suggested paradigm') printings.append('- means that the form would be wrong when using the suggested paradigm') printings.append('x means that the form would be wrong using when using either the guessed or the suggested paradigm') output = ['Correct %.3f (%d/%d)' % (float(all_correct)/len(pairwise),all_correct,len(pairwise)) ,'Pairwise %.3f' % (sum(p for (co,p,c) in pairwise)/float(len(pairwise))) ,'Oracle score: %.3f, %d/%d' % (sum(max_scores)/float(len(max_scores)),sum(s for s in max_scores if s==1),len(max_scores))] for w,(max_p,max_t,max_c),max_s,g_t,g_p,g_s,gold,s_conf in suggestions: printings.append('\n%s: %s %.2f (%s) instead of %s %.2f (%s)' % (w,max_p,max_s,max_c,g_p,g_s,s_conf)) printings.append('Gold %s %s' % (max_p,g_p)) printings.append('\n'.join('\t'.join(pp(x))+'\t'+improvement(x) for x in zip(gold,max_t,g_t))) print ("\n".join(printings)).encode('utf-8') codecs.open(out_file,'w',encoding='utf8').write('\n'.join(output)) def pp((a,b,c)): return a,':'.join(b),':'.join(c) def improvement((gold,new,old)): if gold==new[0]: if new[0]==old[0]: return '' else: return '+' else: if gold==old[0]: return '-' else: return 'x' def read_gold(gold_file): current_baseform,current_table = '',[] gold_tables = {} for line in codecs.open(gold_file,'r',encoding='utf-8'): forms = line.split(',') # New table if forms[1]!=current_baseform: if current_table: gold_tables[current_baseform] = current_table current_table = [] current_baseform = forms[1] current_table.append(forms[0].strip()) # Traversing table else: current_table.append(forms[0].strip()) if current_table: gold_tables[current_baseform] = current_table return gold_tables def parse_verbose(stream): table_dict = {} word = '' our_table = [] table, paradigm, confidence = [], '', 0 for line in stream: if line.strip(): line = line.decode('utf8') if not line.startswith('\t'): if paradigm: table_dict.setdefault(word,[]).append((table,paradigm,confidence)) word = line.split()[0] table,paradigm = [],'' elif line.strip().startswith('p_'): if paradigm: table_dict.setdefault(word,[]).append((table,paradigm,confidence)) paradigm = line.split()[0] confidence = float(line.split()[2]) table = [] else: table.append(line.strip().split(':')) if paradigm: table_dict.setdefault(word,[]).append((table,paradigm,confidence)) return table_dict def rate(table,g_table): score = [] if len(table)!=len(g_table): print 'rating unequals',table[0],len(table),g_table[0],len(g_table) for (guess,gold) in zip(table,g_table): score.append(guess==gold) return sum(score)/float(len(score)) def chunks(l,noOfchunks): n = int(len(l)/float(noOfchunks)) return [l[i:i+n] for i in range(0, len(l), n)] if __name__ == "__main__": print 'Testing',sys.argv[1] testandeval(sys.argv[1],sys.stdin,sys.argv[2])