#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Morphological paradigm oracle.

Reads candidate word forms from stdin, matches them against inflection
paradigms (patterns like ``1+"ar"`` where digits are variables and quoted
parts are literal affixes), and ranks the candidate paradigm/table pairs
using corpus frequencies and suffix-overlap heuristics.
"""
import sys
import re
import math
from collections import defaultdict


class Pattern:
    """One inflection-slot pattern, e.g. ``1+"ar"``.

    ``+``-separated components: digit components are variables (captured
    as ``(.+)``), quoted components are literal text.
    """

    def __init__(self, pattern):
        self.raw = pattern
        self.vars = []            # variable names ('0'..'9') in order of appearance
        self.regex = self.__create_regex(pattern)

    def __create_regex(self, pattern):
        """Compile the pattern into an anchored regex, recording variables."""
        r = '^'
        for pc in pattern.split('+'):
            if pc.isdigit():
                self.vars.append(pc)
                r += r'(.+)'
            else:
                # Escape the literal part so regex metacharacters in the
                # paradigm data ('.', '?', ...) are matched literally.
                r += re.escape(pc.replace('"', ''))
        return re.compile(r + '$')

    def match(self, s):
        """Return all variable bindings for *s* as lists of (var, value) pairs."""
        if len(self.vars) == 1:
            # With a single capture group, findall yields bare strings.
            return [list(zip(self.vars, [x])) for x in self.regex.findall(s)]
        return [list(zip(self.vars, x)) for x in self.regex.findall(s)]

    def instantiate(self, var_values):
        """Substitute variable values into the raw pattern and strip markup."""
        new = self.raw
        for (var, val) in var_values.items():
            new = new.replace(var, val)
        return new.replace('+', '').replace('"', '')

    def strlen(self):
        """Length of the constant (non-variable) material in the pattern."""
        return len(self.instantiate({v: '' for v in self.vars}))

    def embeddedness(self):
        """Map each variable to (has material before it, has material after it).

        NOTE(review): the end-position test compares against len(raw)-1,
        which assumes single-character variable names ('0'..'9').
        """
        return {v: (self.raw.find(v) != 0,
                    self.raw.find(v) != len(self.raw) - 1)
                for v in self.vars}


class Corpus:
    """Candidate words plus a word-frequency table, optionally fuzzy-matched."""

    def __init__(self, candidate_words, filename=None, rule_file=None):
        self.fuzzymatching = False
        self.freqs = defaultdict(int)   # word -> corpus frequency (0 if unseen)
        self.words = [w.strip() for w in candidate_words]
        if filename is not None:
            with open(filename, encoding='utf-8') as f:
                for line in f:
                    # rstrip('\n') instead of [:-1]: the last line of a file
                    # may lack a newline, and [:-1] would eat a real character.
                    (w, i) = line.rstrip('\n').split('\t')
                    self.freqs[w] = int(i)
        if rule_file:
            import fuzzy  # project-local; only required when rules are given
            self.fuzzy = fuzzy.Fuzzy(self.freqs, rule_file, 0.2, 3)
            self.fuzzymatching = True

    def add_frequency_to_table(self, words):
        """Pair each word with its frequency (summed over fuzzy variants)."""
        if not self.fuzzymatching:
            return [(w, self.freqs[w]) for w in words]
        res = []
        for w in words:
            variants = self.fuzzy.findvariants(w)
            freq = sum(self.freqs[v] for v in variants)
            res.append(("/".join(set([w] + variants)), freq))
        return res


class Paradigm:
    """One paradigm: a list of Patterns plus its known example instances.

    Input line format: ``patt#patt#...\\tvar=val,var=val#var=val,...``
    """

    def __init__(self, s, corpus):
        self.corpus = corpus
        (p, insts) = s.split('\t')
        self.paradigm = [Pattern(patt.strip()) for patt in p.split('#')]
        self._instances = [dict(kv.split('=') for kv in inst.split(','))
                           for inst in insts.split('#')]
        self.instances = [(self.instantiate_table(d), d)
                          for d in self._instances]
        # Paradigm id is derived from the first instance's 0-form (headword).
        self.id = 'p_' + self.instances[0][1]['0']
        self.var_embeddedness = self.__global_embeddedness()

    def __global_embeddedness(self):
        """OR together each variable's embeddedness over all patterns."""
        es = {}
        for v in self.paradigm[0].vars:  # all patterns share the same vars
            es[v] = (False, False)
            for d in [p.embeddedness() for p in self.paradigm]:
                es[v] = (es[v][0] or d[v][0], es[v][1] or d[v][1])
        return es

    def instantiate_table(self, var_values):
        """Instantiate every pattern and attach corpus frequencies."""
        return self.corpus.add_frequency_to_table(
            [patt.instantiate(var_values) for patt in self.paradigm])

    def instantiate_using_wordform(self, w, only_baseform):
        """All full tables obtainable by fitting *w* into this paradigm."""
        return [self.instantiate_table(m['ms'])
                for m in self.match_patterns(w, only_baseform)]

    def match_patterns(self, w, only_baseform=False):
        """Match *w* against the patterns; return candidate dicts.

        Each candidate has keys 'w' (the word), 'table' (instantiated
        table with frequencies), 'ms' (variable binding), 'p' (self).
        """
        var_bindings = []
        patterns = [self.paradigm[0]] if only_baseform else self.paradigm
        for patt in patterns:
            for x in patt.match(w):
                var_binding = dict(x)
                if var_binding and var_binding not in var_bindings:
                    var_bindings.append(var_binding)
        return [{'w': w, 'table': self.instantiate_table(ms), 'ms': ms, 'p': self}
                for ms in var_bindings]


class Paradigms:
    """All paradigms loaded from a paradigm file."""

    def __init__(self, filename, corpus):
        with open(filename, encoding='utf-8') as f:
            # Skip blank lines: a bare '\n' is truthy but would crash
            # Paradigm.__init__ on the tab split.
            self.paradigms = [Paradigm(p.strip(), corpus)
                              for p in f if p.strip()]

    def match_paradigms(self, w, only_baseform=False):
        """Candidates for *w* across every paradigm."""
        return [c for p in self.paradigms
                for c in p.match_patterns(w, only_baseform)]


class Oracle:
    """Ranks paradigm candidates for each candidate word."""

    def __init__(self, words, paradigms_filename,
                 corpus_filename=None, rules_filename=None):
        self.corpus = Corpus(words, corpus_filename, rules_filename)
        self.paradigms = Paradigms(paradigms_filename, self.corpus)

    def len_common_prefix(self, m, n):
        """Length of the common prefix of sequences *m* and *n*."""
        count = 0
        for (a, b) in zip(m, n):
            if a != b:
                break
            count += 1
        return count

    def ngrams(self, words, n):
        """Character n-gram counts over *words*, space-padded at both ends."""
        counts = {}
        pad = ' ' * (n - 1)
        for word in words:
            padded = pad + word + pad
            for i in range(len(padded) - n + 1):
                gram = padded[i:i + n]
                counts[gram] = counts.get(gram, 0) + 1
        return counts

    def compare_ngrams(self, ing, ng):
        """Number of n-gram types in *ing* that also occur in *ng*.

        BUG FIX: the original tested ``k in ing`` while iterating ``ing``,
        which is always true and just returned len(ing).
        """
        return sum(1 for k in ing if k in ng)

    def ngram_score(self, c, n):
        """Best n-gram overlap between the candidate baseform and any
        of the paradigm's instance baseforms."""
        ibfs = [i[1]['0'] for i in c['p'].instances]    # instance baseforms
        ings = [self.ngrams([ibf], n) for ibf in ibfs]  # their n-grams
        ng = self.ngrams([c['table'][0][0]], n)         # current baseform's n-grams
        return max(self.compare_ngrams(ing, ng) for ing in ings)

    def nc_score(self, c):
        """Suffix-similarity score between the candidate baseform and the
        paradigm's instance headwords, weighted to dominate the distribution
        term in measure()."""
        (baseform, _) = c['table'][0]  # the 0-form we're given
        nc_scores = []
        for (_, inst) in c['p'].instances:
            paradigmhead = inst['0']   # the 0-form in the paradigm instance
            # Common prefix of the reversed strings == matching suffix length.
            nc_scores.append(self.len_common_prefix(baseform[::-1],
                                                    paradigmhead[::-1]))
        maxscore = max(nc_scores)
        # Number of instances achieving the maximal suffix length.
        numhits = sum(1 for x in nc_scores if x == maxscore)
        # Weight by 100 to override the paradigm distribution for sure.
        return 100 * (maxscore + 0.001 * numhits)

    def instance_table_distribution(self, instances):
        """Per-slot relative frequency mass over the instance tables."""
        # Materialize: the zip iterator is consumed twice below.
        inst_freqs = list(zip(*[[n + 1 for (_, n) in table]
                                for (table, _) in instances]))
        total = sum(sum(ns) for ns in inst_freqs)
        return [sum(ns) / float(total) for ns in inst_freqs]

    def c_score(self, c):
        """Log-frequency mass of the candidate's (distinct) table entries."""
        score = 1
        for (w, n) in set(c['table']):
            score += math.log(n + 1)
        return score

    def paradigm_distribution(self, candidates):
        """Pair each candidate with its share of instances among candidates."""
        num_of_instances = sum(len(c['p'].instances) for c in candidates)
        return [(len(c['p'].instances) / float(num_of_instances), c)
                for c in candidates]

    def measure(self, dist, c):
        """Combined ranking score for one candidate."""
        c_score = self.c_score(c)
        nc_score = self.nc_score(c)
        # ng_score = self.ngram_score(c, 3)
        return dist + c_score * nc_score

    def order_candidates(self, candidates):
        """Candidates sorted best-first as (score, (paradigm id, table))."""
        return sorted([(self.measure(dist, c), (c['p'].id, c['table']))
                       for (dist, c) in self.paradigm_distribution(candidates)],
                      reverse=True)

    def go_fish(self, verbose=True, only_baseform=False):
        """Print ranked paradigm guesses for every corpus word."""
        for word in self.corpus.words:
            candidates = self.paradigms.match_paradigms(word, only_baseform)
            ocandidates = self.order_candidates(candidates)
            if not ocandidates:
                continue
            if verbose:
                print('%s\t%d' % (word, self.corpus.freqs[word]))
                for (score, (pid, table)) in ocandidates:
                    print('\t%s %s\t%.2f\n\t\t%s' %
                          (pid, table[0][0], score,
                           "\n\t\t".join('%s:%d' % (w, d)
                                         for (w, d) in table)))
                print()
            else:
                (score, (pid, table)) = ocandidates[0]  # winner
                print('%s\t%s %s\t%s' % (score, pid, word,
                                         "#".join(w for (w, _) in table)))


if __name__ == "__main__":
    # stdin is text (str) in Python 3: no manual UTF-8 decoding needed.
    words = [l.split('\t')[0].strip() for l in sys.stdin]
    n = '-n' in sys.argv   # no corpus file given
    f = '-f' in sys.argv   # fuzzy rule file given
    b = '-b' in sys.argv   # baseform-only matching
    corpus = sys.argv[1] if not n else None
    # bool is an int: rules sit at argv[3] normally, argv[2] under -n.
    rules = sys.argv[3 - n] if f else None
    paradigms = sys.argv[1] if n else sys.argv[2]
    only_base = n or b
    oracle = Oracle(words, paradigms, corpus, rules)
    oracle.go_fish('-v' in sys.argv, only_base)