Grammalecte  Check-in [6ee195e8d3]

Overview
Comment: [graphspell] JaroWinkler: modify variable names
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | graphspell
Files: files | file ages | folders
SHA3-256: 6ee195e8d3e283062d755428846877d8801d50f9f31d8b44bbeda69d4c2bbacf
User & Date: olr on 2021-02-10 08:45:16
Other Links: manifest | tags
Context
2021-02-10
10:14
[fr] faux positifs et ajustements check-in: 4698a4f99c user: olr tags: fr, trunk
08:45
[graphspell] JaroWinkler: modify variable names check-in: 6ee195e8d3 user: olr tags: graphspell, trunk
2021-02-09
16:23
[graphspell] suggestion mechanism: extend search further check-in: d2cc0989db user: olr tags: graphspell, trunk
Changes

Modified graphspell/str_transform.py from [3aad4bb295] to [d580e06cf9].

   110    110                   d[i-1, j-1] + nCost,    # Substitution
   111    111               )
   112    112               if i and j and s1[i] == s2[j-1] and s1[i-1] == s2[j]:
   113    113                   d[i, j] = min(d[i, j], d[i-2, j-2] + nCost)     # Transposition
   114    114       return d[nLen1-1, nLen2-1]
   115    115   
   116    116   
   117         -def distanceJaroWinkler (a, b, boost = .666):
          117  +def distanceJaroWinkler (sWord1, sWord2, fBoost = .666):
   118    118       # https://github.com/thsig/jaro-winkler-JS
   119         -    #if (a == b): return 1.0
   120         -    a_len = len(a)
   121         -    b_len = len(b)
   122         -    nMax = max(a_len, b_len)
   123         -    a_flag = [None for _ in range(nMax)]
   124         -    b_flag = [None for _ in range(nMax)]
   125         -    search_range = (max(a_len, b_len) // 2) - 1
   126         -    minv = min(a_len, b_len)
          119  +    #if (sWord1 == sWord2): return 1.0
          120  +    nLen1 = len(sWord1)
          121  +    nLen2 = len(sWord2)
          122  +    nMax = max(nLen1, nLen2)
          123  +    aFlags1 = [ None for _ in range(nMax) ]
          124  +    aFlags2 = [ None for _ in range(nMax) ]
          125  +    nSearchRange = (max(nLen1, nLen2) // 2) - 1
          126  +    nMinLen = min(nLen1, nLen2)
   127    127   
   128    128       # Looking only within the search range, count and flag the matched pairs.
   129         -    Num_com = 0
   130         -    yl1 = b_len - 1
   131         -    for i in range(a_len):
   132         -        lowlim = i - search_range  if i >= search_range  else 0
   133         -        hilim  = i + search_range  if (i + search_range) <= yl1  else yl1
   134         -        for j in range(lowlim, hilim+1):
   135         -            if b_flag[j] != 1 and a[j:j+1] == b[i:i+1]:
   136         -                a_flag[j] = 1
   137         -                b_flag[i] = 1
   138         -                Num_com += 1
          129  +    nCommon = 0
          130  +    yl1 = nLen2 - 1
          131  +    for i in range(nLen1):
          132  +        nLowLim = i - nSearchRange  if i >= nSearchRange  else 0
          133  +        nHiLim  = i + nSearchRange  if (i + nSearchRange) <= yl1  else yl1
          134  +        for j in range(nLowLim, nHiLim+1):
          135  +            if aFlags2[j] != 1 and sWord1[j:j+1] == sWord2[i:i+1]:
          136  +                aFlags1[j] = 1
          137  +                aFlags2[i] = 1
          138  +                nCommon += 1
   139    139                   break
   140    140   
   141    141       # Return if no characters in common
   142         -    if Num_com == 0:
          142  +    if nCommon == 0:
   143    143           return 0.0
   144    144   
   145    145       # Count the number of transpositions
   146    146       k = 0
   147         -    N_trans = 0
   148         -    for i in range(a_len):
   149         -        if a_flag[i] == 1:
   150         -            for j in range(k, b_len):
   151         -                if b_flag[j] == 1:
          147  +    nTrans = 0
          148  +    for i in range(nLen1):
          149  +        if aFlags1[i] == 1:
          150  +            for j in range(k, nLen2):
          151  +                if aFlags2[j] == 1:
   152    152                       k = j + 1
   153    153                       break
   154         -            if a[i] != b[j]:
   155         -                N_trans += 1
   156         -    N_trans = N_trans // 2
          154  +            if sWord1[i] != sWord2[j]:
          155  +                nTrans += 1
          156  +    nTrans = nTrans // 2
   157    157   
   158    158       # Adjust for similarities in nonmatched characters
   159         -    N_simi = 0
   160         -    if minv > Num_com:
   161         -        for i in range(a_len):
   162         -            if not a_flag[i]:
   163         -                for j in range(b_len):
   164         -                    if not b_flag[j]:
   165         -                        if a[i] in dDistanceBetweenChars and b[j] in dDistanceBetweenChars[a[i]]:
   166         -                            N_simi += dDistanceBetweenChars[a[i]][b[j]]
   167         -                            b_flag[j] = 2
          159  +    nSimi = 0
          160  +    if nMinLen > nCommon:
          161  +        for i in range(nLen1):
          162  +            if not aFlags1[i]:
          163  +                for j in range(nLen2):
          164  +                    if not aFlags2[j]:
          165  +                        if sWord1[i] in dDistanceBetweenChars and sWord2[j] in dDistanceBetweenChars[sWord1[i]]:
          166  +                            nSimi += dDistanceBetweenChars[sWord1[i]][sWord2[j]]
          167  +                            aFlags2[j] = 2
   168    168                               break
   169    169   
   170         -    Num_sim = (N_simi / 10.0) + Num_com
          170  +    fSim = (nSimi / 10.0) + nCommon
   171    171   
   172    172       # Main weight computation
   173         -    weight = Num_sim / a_len + Num_sim / b_len + (Num_com - N_trans) / Num_com
   174         -    weight = weight / 3
          173  +    fWeight = fSim / nLen1 + fSim / nLen2 + (nCommon - nTrans) / nCommon
          174  +    fWeight = fWeight / 3
   175    175   
   176    176       # Continue to boost the weight if the strings are similar
   177         -    if weight > boost:
          177  +    if fWeight > fBoost:
   178    178           # Adjust for having up to the first 4 characters in common
   179         -        j = 4  if minv >= 4  else minv
          179  +        j = 4  if nMinLen >= 4  else nMinLen
   180    180           i = 0
   181         -        while i < j  and a[i] == b[i]:
          181  +        while i < j  and sWord1[i] == sWord2[i]:
   182    182               i += 1
   183    183           if i:
   184         -            weight += i * 0.1 * (1.0 - weight)
          184  +            fWeight += i * 0.1 * (1.0 - fWeight)
   185    185           # Adjust for long strings.
   186    186           # After agreeing beginning chars, at least two more must agree
   187    187           # and the agreeing characters must be more than half of the
   188    188           # remaining characters.
   189         -        if minv > 4  and  Num_com > i + 1  and  2 * Num_com >= minv + i:
   190         -            weight += (1 - weight) * ((Num_com - i - 1) / (a_len * b_len - i*2 + 2))
   191         -    return weight
          189  +        if nMinLen > 4  and  nCommon > i + 1  and  2 * nCommon >= nMinLen + i:
          190  +            fWeight += (1 - fWeight) * ((nCommon - i - 1) / (nLen1 * nLen2 - i*2 + 2))
          191  +    return fWeight
   192    192   
   193    193   
   194    194   def distanceSift4 (s1, s2, nMaxOffset=5):
   195    195       "implementation of general Sift4."
   196    196       # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
   197    197       if not s1:
   198    198           return len(s2)