I'm a python n00b and I'd like some suggestions on how to improve the algorithm to improve the performance of this method to compute the Jaro-Winkler distance of two names.
def winklerCompareP(str1, str2):
"""Return approximate string comparator measure (between 0.0 and 1.0)
USAGE:
score = winkler(str1, str2)
ARGUMENTS:
str1 The first string
str2 The second string
DESCRIPTION:
As described in 'An Application of the Fellegi-Sunter Model of
Record Linkage to the 1990 U.S. Decennial Census' by William E. Winkler
and Yves Thibaudeau.
Based on the 'jaro' string comparator, but modifies it according to whether
the first few characters are the same or not.
"""
# Quick check if the strings are the same - - - - - - - - - - - - - - - - - -
#
jaro_winkler_marker_char = chr(1)
if (str1 == str2):
return 1.0
len1 = len(str1)
len2 = len(str2)
halflen = max(len1,len2) / 2 - 1
ass1 = '' # Characters assigned in str1
ass2 = '' # Characters assigned in str2
#ass1 = ''
#ass2 = ''
workstr1 = str1
workstr2 = str2
common1 = 0 # Number of common characters
common2 = 0
#print "'len1', str1[i], start, end, index, ass1, workstr2, common1"
# Analyse the first string - - - - - - - - - - - - - - - - - - - - - - - - -
#
for i in range(len1):
start = max(0,i-halflen)
end = min(i+halflen+1,len2)
index = workstr2.find(str1[i],start,end)
#print 'len1', str1[i], start, end, index, ass1, workstr2, common1
if (index > -1): # Found common character
common1 += 1
#ass1 += str1[i]
ass1 = ass1 + str1[i]
workstr2 = workstr2[:index]+jaro_winkler_marker_char+workstr2[index+1:]
#print "str1 analyse result", ass1, common1
#print "str1 analyse result", ass1, common1
# Analyse the second string - - - - - - - - - - - - - - - - - - - - - - - - -
#
for i in range(len2):
start = max(0,i-halflen)
end = min(i+halflen+1,len1)
index = workstr1.find(str2[i],start,end)
#print 'len2', str2[i], start, end, index, ass1, workstr1, common2
if (index > -1): # Found common character
common2 += 1
#ass2 += str2[i]
ass2 = ass2 + str2[i]
workstr1 = workstr1[:index]+jaro_winkler_marker_char+workstr1[index+1:]
if (common1 != common2):
print('Winkler: Wrong common values for strings "%s" and "%s"' % \
(str1, str2) + ', common1: %i, common2: %i' % (common1, common2) + \
', common should be the same.')
common1 = float(common1+common2) / 2.0 ##### This is just a fix #####
if (common1 == 0):
return 0.0
# Compute number of transpositions - - - - - - - - - - - - - - - - - - - - -
#
transposition = 0
for i in range(len(ass1)):
if (ass1[i] != ass2[i]):
transposition += 1
transposition = transposition / 2.0
# Now compute how many characters are common at beginning - - - - - - - - - -
#
minlen = min(len1,len2)
for same in range(minlen+1):
if (str1[:same] != str2[:same]):
break
same -= 1
if (same > 4):
same = 4
common1 = float(common1)
w = 1./3.*(common1 / float(len1) + common1 / float(len2) + (common1-transposition) / common1)
wn = w + same*0.1 * (1.0 - w)
return wn
Example output
ZIMMERMANN ARMIENTO 0.814583333
ZIMMERMANN ZIMMERMANN 1
ZIMMERMANN CANNONS 0.766666667
CANNONS AKKER 0.8
CANNONS ALDERSON 0.845833333
CANNONS ALLANBY 0.833333333