# -*- coding: utf-8 -*- # TODO hur mycket ska vokal-vokal, konsonant-konsonant, osv kosta? # i issame & i initiering av matrisen __weights = [0]*2500 __distances = [0]*2500 # TODO probably need to work on this to make it effective def edit_dist(s1,s2,rules,n): """ calculates the distance """ """ string1 , string2, dictionary of rules, maximum n-gram length """ weights = __weights distances = __distances h = len(s1)+3 w = len(s2)+3 #print 'matching',s1,'and',s2 s1 = '^'+s1+'$' s2 = '^'+s2+'$' weights[0] = 0 distances[0] = 0 for i in xrange(1,h): # to exclude letters should cost at least 2 000 000 weights[i] = (i+2)*1000000 distances[i] = i for j in xrange(1,w): weights[50*j] = (j+2)*1000000 distances[50*j] = j for i in xrange(1,h): for j in xrange(1,w): samew,samed = issame(s1[i-1],s2[j-1]) minweight = samew+weights[(i-1)+50*(j-1)] mindist = samed+distances[(i-1)+50*(j-1)] ss1 = s1[:i] ss2 = s2[:j] firstround = True # trying not to do things to many times... positionS = True positionE = False for x in xrange(1,n+1): a = s1[i-x:i] positionS = i-x<2 # in start of word? #print 'positions[0]',i,x,positions[0],a,s1 ###print 'positions[1]',i,h,positions[1],a,s1 positionE = i >= h-2 # in end of word? # delete minweight, mindist = update_if_min(rules.get((a,'')),i,j,minweight,mindist,positionS,positionE,weights,distances) ###print 'distance: delete',a # change for y in xrange(1,n+1): b = s2[j-y:j] ###print 'distance: ',a,' to ',b positionE = i >= h-2 and j >= w-2 # in end of word? minweight, mindist = update_if_min(rules.get((a,b)),i,j,minweight,mindist,positionS,positionE,weights,distances) # insert if firstround: ###print 'distance: insert',b minweight, mindist = update_if_min(rules.get(('',b)),i,j,minweight,mindist,i-x<1,positionE,weights,distances) firstround = False weights[i+50*j] = minweight distances[i+50*j] = mindist return weights[(h-1)+50*(w-1)],distances[(h-1)+50*(w-1)] def update_if_min(val,i,j,minweight,mindist,positionS,positionE,weights,distances): # val = (i,j,(s,m,e)) ###print 'positions',positions w = 0 if val: if positionS: if positionE: w = min(val[2][0],val[2][2]) else: w = val[2][0] else: if positionE: w = val[2][2] else: w = val[2][1] if not w=='_': thisweight = weights[(i-val[0])+50*(j-val[1])]+w if minweight > thisweight: minweight = thisweight mindist = distances[(i-val[0])+50*(j-val[1])]+1 return minweight, mindist def issame(a,b): if a == b: return 0,0 else: return 5000000, 1 #def issame(a,b): # vow = u"aeiouyåäöAEIOUYÅÄÖ"; #/* vokaler*/ # if a==b: # return 0,0 # if a in vow and b not in vow: # return 3000000,1 # return 2000000,1 # same code as in edit_dist, but no call to extra function, everything very inlined: # for (a,b) in product(((s1[:i])[-x:] for x in xrange(1,n+1)) # ,((s2[:j])[-y:] for y in xrange(1,n+1))): # # TODO fortare med två loopar? # val = rules.get((a,b),None) # if val: # thisweight = weights[i-val[0]][j-val[1]]+val[2] # if minweight>thisweight: # minweight = thisweight # mindist = distances[i-val[0]][j-val[1]]+1 # ## TODO delete-loop # for a in [(s1[:i])[-x:] for x in xrange(1,n+1)]: # val = rules.get((a,'_'),None) # if val: # thisweight = weights[i-val[0]][j-val[1]]+val[2] # if minweight>thisweight: # minweight = thisweight # mindist = distances[i-val[0]][j-val[1]]+1 # # # TODO insert-loop # for b in [(s2[:j])[-y:] for y in xrange(1,n+1)]: # val = rules.get(('_',b),None) # if val: # thisweight = weights[i-val[0]][j-val[1]]+val[2] # if minweight>thisweight: # minweight = thisweight # mindist = distances[i-val[0]][j-val[1]]+1