# -*- coding: utf-8 -*-
import pyximport; pyximport.install()
from hashfilter_malloc import dijkstrafind
from peditdistance import edit_dist
import sys
import time
import itertools
import codecs

# TODO: keep a dictionary so we don't have to recompute everything?


class Fuzzy:
    def __init__(self, lexicon, rules, max_dist, no_variant):
        self.alpha = dict((x, y+1) for y, x in enumerate(u"abcdefghijklmnopqrstuvwxyzåäöæþøçèéêëíîïóôûüÿᛘ∂ "))
        self.hashlexicon = {}
        for word, freq in lexicon.iteritems():
            n_word = self._normalize(word)
            key = sum([self._iso(c) for c in n_word])
            self.hashlexicon.setdefault(key, {}).setdefault(n_word, [])
        self.variants = {}
        self.editmap, self.hashmap = self._mkeditMap(rules)
        self.max_distance = max_dist
        self.no_variant = no_variant
        self.max_nodes = 10000

    def findvariants(self, word):
        """Rule-based spell checking; applies edit distance.
        word is a normalized type.
        Returns [variant].
        """
        word = self._normalize(word)
        if word in self.variants:
            return self.variants[word]
        if len(word) >= 32:  # can't handle too long words
            return []
        else:
            ngrams = self._getngrams(word)
            wordhash = self._hashiso(word)
            wordlen = len(word) - 1
            max_dist = self.max_distance  # min(self.max_distance*(len(word)/3), .2)
            # print 'max_dist', max_dist, len(word)
            if wordhash >= 50014198618560734057L:  # can't handle too big word hashes!
                return []
            changelist = []
            hashmapget = self.hashmap.get
            for (ngram_hash, involvedletters) in itertools.chain(*ngrams):
                changelist.extend((x - ngram_hash, weight, involvedletters)
                                  for (x, weight, pos) in hashmapget(ngram_hash, [])
                                  if self._ok(pos, involvedletters, wordlen))
            # insert-rules
            for (x, weight, pos) in hashmapget(0, []):
                changelist.append((x, weight, 0))
            d_changelist = self._deltaize(changelist)
            res = [variant for (dist, eds, (variant, info))
                   in self._countdijkstrafind(d_changelist, word, wordhash, max_dist)]
            self.variants[word] = res
            return res

    def _iso(self, char):
        return pow(self.alpha.get(char.lower(), 0), 5)

    def _hashiso(self, word):
        return sum(self._iso(c) for c in word)

    def _mkeditMap(self, rules):
        editMap = {}
        hashMap = {}
        maxl = 1
        for line in codecs.open(rules, 'r', encoding='utf8'):
            if not line.strip():
                continue
            line = line.split(u'\t')
            val = int(float(line[2]) * 1000000)
            x, y = line[0], line[1]
            pos = line[3].strip()
            self._update_value(pos, val, (x, y), editMap)
            hashMap.setdefault(self._hashiso(x), []).append((self._hashiso(y), val, pos))
            maxl = max(len(x), len(y), maxl)
        return ((editMap, maxl), hashMap)

    def _update_value(self, pos, val, (rulefrom, ruleto), editmap):
        # '_' is the "no weight yet" placeholder; in Python 2, min('_', val)
        # returns val because ints sort before strings.
        x, y, (s, m, e) = editmap.setdefault((rulefrom, ruleto), (0, 0, ('_', '_', '_')))
        if pos == 'S':
            s = min(s, val)
        elif pos == 'M':
            m = min(m, val)
        elif pos == 'E':
            e = min(e, val)
        editmap.update({(rulefrom, ruleto): (len(rulefrom), len(ruleto), (s, m, e))})

    def _ok(self, pos, involvedletters, w):
        return (pos != 'E' or (1 << w & involvedletters)) and (pos != 'S' or (1 & involvedletters))

    def _getngrams(self, word):
        xs = []
        for j in range(1, self.editmap[1] + 1):
            xs.append([(sum((self._iso(c) for c in word[i:i+j])), (2**j - 1) << i)
                       for i in xrange(len(word) - j + 1)])
        return xs

    def _deltaize(self, rules):
        ret = []
        rules = sorted(rules, key=lambda rule: rule[0])
        if rules:
            curr_h, curr_w, curr_u = rules[0][0], rules[0][1], [rules[0][2]]
            for rule in rules[1:]:
                if rule[0] == curr_h:
                    curr_w = min(curr_w, rule[1])
                    curr_u.append(rule[2])
                else:
                    ret.append((curr_h, curr_w, self._subsumption_filter(curr_u)))
                    curr_h, curr_w, curr_u = rule[0], rule[1], [rule[2]]
            ret.append((curr_h, curr_w, list(set(curr_u))))
            ret = sorted(ret, key=lambda rule: rule[1])
            ret[0] = ret[0] + (0, 0)
            for reti in xrange(1, len(ret)):
                ret[reti] = ret[reti] + (ret[reti-1][0], ret[reti-1][1])
        return ret

    def _subsumption_filter(self, bitmaps):
        bitmaps = sorted(bitmaps)
        rets = []
        for bitmap in bitmaps:
            for ret in rets:
                if (ret | bitmap) == bitmap:
                    break
            else:
                rets.append(bitmap)
        return rets

    def _countdijkstrafind(self, changes, word, wordhash, max_dist):
        # th = int(self.max_distance*1000000)
        th = max_dist * 1000000
        k = self.no_variant
        seen = set([])
        topklist = [(th + 1, 0, ())] * k
        lexget = self.hashlexicon.get
        iters = 0
        for (hash_w, weight, iters) in dijkstrafind(changes, wordhash, len(word), th, self.max_nodes):
            if not (hash_w or weight or iters):  # (memory) error
                break
            if topklist[-1][2] and topklist[-1][0] < weight:
                break
            ok = lexget(hash_w, {})
            for (variantword, info) in ok.iteritems():
                if variantword not in seen:
                    dist, eds = edit_dist(word, variantword, rules=self.editmap[0], n=self.editmap[1])
                    seen.add(variantword)
                    topklist.append((dist, eds, (variantword, info)))
                    topklist = sorted(topklist)[:k]
        return [(x, y, z) for x, y, z in topklist if z]

    def _normalize(self, word):
        return (word.strip().lower()
                    .replace(u'w', u'v')
                    .replace(u'æ', u'ä')
                    .replace(u'ø', u'ö')
                    .replace(u'ß', u'ss')
                    .replace(u'j', u'i'))
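
# Minimal usage sketch (not part of the original module). Assumptions: the
# lexicon is a {word: frequency} dict (as suggested by lexicon.iteritems()
# in __init__), and the rules file is tab-separated lines of
# "from<TAB>to<TAB>weight<TAB>position" with position one of S/M/E, matching
# what _mkeditMap parses. The file name 'rules.tsv' and the example words
# are made up for illustration; no_variant is the top-k size used in
# _countdijkstrafind.
if __name__ == '__main__':
    example_lexicon = {u'katt': 12, u'hatt': 7, u'matta': 3}
    checker = Fuzzy(example_lexicon, 'rules.tsv', max_dist=2, no_variant=5)
    for candidate in checker.findvariants(u'kat'):
        print candidate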