# -*- coding: utf-8 -*-
import pyximport; pyximport.install()
from hashfilter_malloc import dijkstrafind
from peditdistance import edit_dist
import sys
import time
import itertools
import codecs

# TODO: keep a dictionary so we don't have to recompute everything?


class Fuzzy:
    def __init__(self, lexicon, rules, max_dist, no_variant):
        self.alpha = dict((x, y+1) for y, x in enumerate(u"abcdefghijklmnopqrstuvwxyzåäöæþøçèéêëíîïóôûüÿᛘ∂ "))
        self.hashlexicon = {}
        for word, freq in lexicon.iteritems():
            n_word = self._normalize(word)
            key = sum([self._iso(c) for c in n_word])
            self.hashlexicon.setdefault(key, {}).setdefault(n_word, [])
        self.variants = {}
        self.editmap, self.hashmap = self._mkeditMap(rules)
        self.max_distance = max_dist
        self.no_variant = no_variant
        self.max_nodes = 10000

    def findvariants(self, word):
        """Rule-based spell checking; applies edit distance.
        word is a normalized type.
        Returns [variant].
        """
        word = self._normalize(word)
        if word in self.variants:
            return self.variants[word]
        if len(word) >= 32:  # can't handle too long words
            return []
        else:
            ngrams = self._getngrams(word)
            wordhash = self._hashiso(word)
            wordlen = len(word) - 1
            max_dist = self.max_distance  # min(self.max_distance*(len(word)/3), .2)
            # print 'max_dist', max_dist, len(word)
            if wordhash >= 50014198618560734057L:  # can't handle too big word hashes!
                return []
            changelist = []
            hashmapget = self.hashmap.get
            for (ngram_hash, involvedletters) in itertools.chain(*ngrams):
                changelist.extend((x - ngram_hash, weight, involvedletters)
                                  for (x, weight, pos) in hashmapget(ngram_hash, [])
                                  if self._ok(pos, involvedletters, wordlen))
            # insert-rules
            for (x, weight, pos) in hashmapget(0, []):
                changelist.append((x, weight, 0))
            d_changelist = self._deltaize(changelist)
            res = [variant for (dist, eds, (variant, info))
                   in self._countdijkstrafind(d_changelist, word, wordhash, max_dist)]
            self.variants[word] = res
            return res

    def _iso(self, char):
        return pow(self.alpha.get(char.lower(), 0), 5)

    def _hashiso(self, word):
        return sum(self._iso(c) for c in word)

    def _mkeditMap(self, rules):
        editMap = {}
        hashMap = {}
        maxl = 1
        for line in codecs.open(rules, 'r', encoding='utf8'):
            if not line.strip():
                continue
            line = line.split(u'\t')
            val = int(float(line[2]) * 1000000)
            x, y = line[0], line[1]
            pos = line[3].strip()
            self._update_value(pos, val, (x, y), editMap)
            hashMap.setdefault(self._hashiso(x), []).append((self._hashiso(y), val, pos))
            maxl = max(len(x), len(y), maxl)
        return ((editMap, maxl), hashMap)

    def _update_value(self, pos, val, (rulefrom, ruleto), editmap):
        # '_' is the "no weight yet" placeholder; in Python 2, min('_', val)
        # returns val because ints sort before strings.
        x, y, (s, m, e) = editmap.setdefault((rulefrom, ruleto), (0, 0, ('_', '_', '_')))
        if pos == 'S':
            s = min(s, val)
        elif pos == 'M':
            m = min(m, val)
        elif pos == 'E':
            e = min(e, val)
        editmap.update({(rulefrom, ruleto): (len(rulefrom), len(ruleto), (s, m, e))})

    def _ok(self, pos, involvedletters, w):
        return (pos != 'E' or (1 << w & involvedletters)) and (pos != 'S' or (1 & involvedletters))

    def _getngrams(self, word):
        xs = []
        for j in range(1, self.editmap[1] + 1):
            xs.append([(sum((self._iso(c) for c in word[i:i+j])), (2**j - 1) << i)
                       for i in xrange(len(word) - j + 1)])
        return xs

    def _deltaize(self, rules):
        ret = []
        rules = sorted(rules, key=lambda rule: rule[0])
        if rules:
            curr_h, curr_w, curr_u = rules[0][0], rules[0][1], [rules[0][2]]
            for rule in rules[1:]:
                if rule[0] == curr_h:
                    curr_w = min(curr_w, rule[1])
                    curr_u.append(rule[2])
                else:
                    ret.append((curr_h, curr_w, self._subsumption_filter(curr_u)))
                    curr_h, curr_w, curr_u = rule[0], rule[1], [rule[2]]
            ret.append((curr_h, curr_w, list(set(curr_u))))
            ret = sorted(ret, key=lambda rule: rule[1])
            ret[0] = ret[0] + (0, 0)
            for reti in xrange(1, len(ret)):
                ret[reti] = ret[reti] + (ret[reti-1][0], ret[reti-1][1])
        return ret

    def _subsumption_filter(self, bitmaps):
        bitmaps = sorted(bitmaps)
        rets = []
        for bitmap in bitmaps:
            for ret in rets:
                if (ret | bitmap) == bitmap:
                    break
            else:
                rets.append(bitmap)
        return rets

    def _countdijkstrafind(self, changes, word, wordhash, max_dist):
        # th = int(self.max_distance*1000000)
        th = max_dist * 1000000
        k = self.no_variant
        seen = set([])
        topklist = [(th + 1, 0, ())] * k
        lexget = self.hashlexicon.get
        iters = 0
        for (hash_w, weight, iters) in dijkstrafind(changes, wordhash, len(word), th, self.max_nodes):
            if not (hash_w or weight or iters):  # (memory) error
                break
            if topklist[-1][2] and topklist[-1][0] < weight:
                break
            ok = lexget(hash_w, {})
            for (variantword, info) in ok.iteritems():
                if variantword not in seen:
                    dist, eds = edit_dist(word, variantword, rules=self.editmap[0], n=self.editmap[1])
                    seen.add(variantword)
                    topklist.append((dist, eds, (variantword, info)))
                    topklist = sorted(topklist)[:k]
        return [(x, y, z) for x, y, z in topklist if z]

    def _normalize(self, word):
        return (word.strip().lower()
                    .replace(u'w', u'v')
                    .replace(u'æ', u'ä')
                    .replace(u'ø', u'ö')
                    .replace(u'ß', u'ss')
                    .replace(u'j', u'i'))
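
# Minimal usage sketch (not part of the original module). Assumptions: the
# lexicon is a {word: frequency} dict (as suggested by lexicon.iteritems()
# in __init__), and the rules file is tab-separated lines of
# "from<TAB>to<TAB>weight<TAB>position" with position one of S/M/E, matching
# what _mkeditMap parses. The file name 'rules.tsv' and the example words
# are made up for illustration; no_variant is the top-k size used in
# _countdijkstrafind.
if __name__ == '__main__':
    example_lexicon = {u'katt': 12, u'hatt': 7, u'matta': 3}
    checker = Fuzzy(example_lexicon, 'rules.tsv', max_dist=2, no_variant=5)
    for candidate in checker.findvariants(u'kat'):
        print candidate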