"""Score ranked paradigm predictions against a reference word list.

Inputs (all UTF-8, tab-separated):

* ``data/saldo/saldo_baseforms.txt`` -- ``word<TAB>paradigm name``; the
  reference ("truth") names for each word.
* ``data/saldo/p_names.txt`` -- ``word<TAB>paradigm name`` for the training
  words.
* ``paradigms/se_nouns_train.para`` -- one paradigm per line; the second
  column holds ``#``-separated instances made of ``,``-separated
  ``var=value`` assignments.
* ``output/sv_c_nouns.txt`` -- ``score<TAB>paradigm word<TAB>...``; the
  predictions to evaluate.

The ``num`` best-scored predictions whose word is not a training word are
printed one per line as ``mark<TAB>rank<TAB>paradigm word<TAB>score<TAB>
detail``, where mark is ``+``/``-`` (right/wrong) or ``+?``/``-?`` when the
verdict rests only on the second ``_``-separated field of the names.  A
cumulative correct/wrong summary per 10% slice of ``num`` follows.

Usage: ``python <script> [num]`` (``num`` defaults to 1000; the same default
is used when the module is imported).
"""
import codecs
from collections import defaultdict
import sys
import math

# Number of cumulative slices reported in the summary (10 -> 10%, 20%, ...).
_NUM_BINS = 10


def _read_rows(path):
    """Yield the tab-separated fields of every non-empty line of *path*."""
    with codecs.open(path, encoding='utf-8') as f:
        for line in f:
            # Strip only the line terminator: slicing off the last character
            # would eat real data when the final line has no newline.
            line = line.rstrip('\r\n')
            if line:
                yield line.split('\t')


def _pick(names):
    """Return a deterministic representative of *names*, or '?' if empty."""
    return min(names) if names else '?'


def _second_fields(names):
    """Return the set of second '_'-separated fields found in *names*."""
    split_names = (name.split('_') for name in names)
    return set(parts[1] for parts in split_names if len(parts) > 1)


def _emit(text):
    """Write *text* and a newline to stdout (UTF-8 bytes under Python 2)."""
    if sys.version_info[0] < 3:
        text = text.encode('utf-8')
    sys.stdout.write(text + '\n')


if __name__ == "__main__" and len(sys.argv) > 1:
    num = int(sys.argv[1])
else:
    num = 1000

truth = defaultdict(set)
for (w, p) in _read_rows('data/saldo/saldo_baseforms.txt'):
    truth[w].add(p)

trainp = defaultdict(set)
for (w, p) in _read_rows('data/saldo/p_names.txt'):
    trainp[w].add(p)

paradigms = defaultdict(set)
for (_, insts) in _read_rows('paradigms/se_nouns_train.para'):
    # Keep the value of every assignment whose variable name starts with '0'.
    xs = [v.split('=')[1]
          for i in insts.split('#')
          for v in i.split(',')
          if v.startswith('0')]
    for x in xs:
        # The paradigm is keyed by its first collected value.  trainp is a
        # defaultdict, so this lookup also registers x as a key of trainp
        # even when p_names.txt does not list it.
        # NOTE(review): the `w in trainp` filter below therefore also skips
        # these words -- presumably intended; confirm.
        paradigms['p_%s' % xs[0]].update(trainp[x])

# Every paradigm name reachable through some training paradigm.
paradigms_in_selection = set()
for s in paradigms.values():
    paradigms_in_selection.update(s)

data = []
for (d, r, _) in _read_rows('output/sv_c_nouns.txt'):
    (p, w) = r.split(' ')
    data.append((float(d), p, w))

count = 0
# bins[n] holds the cumulative correct/wrong counts up to slice n.
bins = {1: {'correct': 0, 'wrong': 0}}
bin_number = 1
for (d, p, w) in sorted(set(data), reverse=True):
    if count == num:
        break
    if w in trainp:
        continue
    count += 1
    # Advance to the slice containing this rank.  Comparing count * 10 with
    # bin_number * num avoids the truncating num / 10, which misaligned the
    # slices for num not divisible by 10 and gave a zero bin size for
    # num < 10.  Skipped slices inherit the running totals.
    while bin_number < _NUM_BINS and count * _NUM_BINS > bin_number * num:
        bin_number += 1
        bins[bin_number] = dict(bins[bin_number - 1])

    predicted = paradigms[p]
    actual = truth[w]
    shared = predicted.intersection(actual)
    known = paradigms_in_selection.intersection(actual)
    if shared:
        # The predicted paradigm carries one of the word's reference names.
        mark, outcome = '+', 'correct'
        detail = " ".join(sorted(shared))
    elif known:
        # The word belongs to a different paradigm that was in the selection.
        mark, outcome = '-', 'wrong'
        detail = _pick(predicted) + " => " + " ".join(sorted(known))
    elif _second_fields(predicted).intersection(_second_fields(actual)):
        # No exact name match, but the second name fields agree.
        mark, outcome = '+?', 'correct'
        detail = _pick(predicted) + " =? " + " ".join(sorted(actual))
    else:
        mark, outcome = '-?', 'wrong'
        detail = _pick(predicted) + " =>? " + " ".join(sorted(actual))
    bins[bin_number][outcome] += 1
    s = '%s\t%d\t%s %s\t%f\t%s' % (mark, count, p, w, d, detail)
    _emit(s)

_emit('')
for n in sorted(bins):
    n_correct = bins[n]['correct']
    n_wrong = bins[n]['wrong']
    total = float(n_correct + n_wrong)
    if total:
        cp, wp = n_correct / total * 100, n_wrong / total * 100
    else:
        # Nothing was evaluated up to this slice; avoid dividing by zero.
        cp = wp = 0.0
    _emit('%s\tC: %d (%s), W: %d (%s)' % (
        str(n * 100 // _NUM_BINS) + '%',
        n_correct, '%.1f' % cp + '%',
        n_wrong, '%.1f' % wp + '%'))