Grammalecte  Diff

Differences From Artifact [0a316c953c]:

To Artifact [8c9fd715c3]:1
2

3
4
5
6
7
8
9
10
11
12

13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
..
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
...
235
236
237
238
239
240
241

242
243
244
245
246
247
248

# list of similar chars
# useful for suggestion mechanism


import re
import unicodedata


# Translation table expanding the long s and the Latin typographic ligatures
# (Unicode block "Alphabetic Presentation Forms", U+FB00..U+FB06) to plain letters.
# str.maketrans() only accepts dict keys that are ONE character long (anything
# longer raises ValueError at import time), so every ligature is written as its
# own code point. Escapes are used so the keys survive any Unicode normalization
# of this file.
_xTransCharsForSpelling = str.maketrans({
  'ſ': 's',          # U+017F LATIN SMALL LETTER LONG S
  '\ufb03': 'ffi',   # U+FB03 LATIN SMALL LIGATURE FFI
  '\ufb04': 'ffl',   # U+FB04 LATIN SMALL LIGATURE FFL
  '\ufb00': 'ff',    # U+FB00 LATIN SMALL LIGATURE FF
  '\ufb05': 'ft',    # U+FB05 LATIN SMALL LIGATURE LONG S T -- NOTE(review): kept as 'ft'; confirm 'st' is not intended
  '\ufb01': 'fi',    # U+FB01 LATIN SMALL LIGATURE FI
  '\ufb02': 'fl',    # U+FB02 LATIN SMALL LIGATURE FL
  '\ufb06': 'st',    # U+FB06 LATIN SMALL LIGATURE ST
})

def spellingNormalization (sWord):
  "return <sWord> with ligatures expanded to plain letters, then normalized to Unicode NFC"
  return unicodedata.normalize("NFC", sWord.translate(_xTransCharsForSpelling))


# Translation table used to simplify a word: diacritics are dropped, a few
# letters are merged (y -> i, k -> q, w -> v), and digraph letters and ligatures
# are expanded to plain letters.
# str.maketrans() only accepts dict keys that are ONE character long (anything
# longer raises ValueError at import time), so every ligature is written as its
# own code point (U+FB00..U+FB06). Escapes are used so the keys survive any
# Unicode normalization of this file.
_xTransCharsForSimplification = str.maketrans({
  'à': 'a', 'é': 'e', 'î': 'i', 'ô': 'o', 'û': 'u', 'ÿ': 'i', "y": "i",
  'â': 'a', 'è': 'e', 'ï': 'i', 'ö': 'o', 'ù': 'u', 'ŷ': 'i',
  'ä': 'a', 'ê': 'e', 'í': 'i', 'ó': 'o', 'ü': 'u', 'ý': 'i',
  'á': 'a', 'ë': 'e', 'ì': 'i', 'ò': 'o', 'ú': 'u', 'ỳ': 'i',
  'ā': 'a', 'ē': 'e', 'ī': 'i', 'ō': 'o', 'ū': 'u', 'ȳ': 'i',
  'ç': 'c', 'ñ': 'n', 'k': 'q', 'w': 'v',
  'œ': 'oe', 'æ': 'ae',
  'ſ': 's',          # U+017F LATIN SMALL LETTER LONG S
  '\ufb03': 'ffi',   # U+FB03 LATIN SMALL LIGATURE FFI
  '\ufb04': 'ffl',   # U+FB04 LATIN SMALL LIGATURE FFL
  '\ufb00': 'ff',    # U+FB00 LATIN SMALL LIGATURE FF
  '\ufb05': 'ft',    # U+FB05 LATIN SMALL LIGATURE LONG S T -- NOTE(review): kept as 'ft'; confirm 'st' is not intended
  '\ufb01': 'fi',    # U+FB01 LATIN SMALL LIGATURE FI
  '\ufb02': 'fl',    # U+FB02 LATIN SMALL LIGATURE FL
  '\ufb06': 'st',    # U+FB06 LATIN SMALL LIGATURE ST
})

def simplifyWord (sWord):
  "word simplication before calculating distance between words"
  sWord = sWord.lower().translate(_xTransCharsForSimplification)
  sNewWord = ""
  for i, c in enumerate(sWord, 1):
................................................................................
  "Ë": "EeÉéÈèÊêËëĒēŒœ",

  "f": "fF",
  "F": "Ff",

  "g": "gGjJĵĴ",
  "G": "GgJjĴĵ",
  
  "h": "hH",
  "H": "Hh",

  "i": "iIîÎïÏyYíÍìÌīĪÿŸ",
  "I": "IiÎîÏïYyÍíÌìĪīŸÿ",
  "î": "iIîÎïÏyYíÍìÌīĪÿŸ",
  "Î": "IiÎîÏïYyÍíÌìĪīŸÿ",
................................................................................
  "X": ("CC", "CT", "XX"),
  "z": ("ss", "zh"),
  "Z": ("SS", "ZH"),
}


def get1toXReplacement (cPrev, cCur, cNext):
  "return the tuple of multi-char replacements registered for <cCur> in d1toX, or () when <cCur> is a consonant next to another consonant"
  if cCur not in aConsonant:
    # not a consonant: neighbours are irrelevant
    return d1toX.get(cCur, ())
  if cPrev in aConsonant or cNext in aConsonant:
    # consonant adjacent to another consonant: no replacement offered
    return ()
  return d1toX.get(cCur, ())


d2toX = {
  "am": ("an", "en", "em"),
>
|
|
>


>| | >1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
..
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
...
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
"""
List of similar chars
useful for suggestion mechanism
"""

import re
import unicodedata


# Translation table expanding the long s and the Latin typographic ligatures
# (Unicode block "Alphabetic Presentation Forms", U+FB00..U+FB06) to plain letters.
# str.maketrans() only accepts dict keys that are ONE character long (anything
# longer raises ValueError at import time), so every ligature is written as its
# own code point. Escapes are used so the keys survive any Unicode normalization
# of this file.
_xTransCharsForSpelling = str.maketrans({
  'ſ': 's',          # U+017F LATIN SMALL LETTER LONG S
  '\ufb03': 'ffi',   # U+FB03 LATIN SMALL LIGATURE FFI
  '\ufb04': 'ffl',   # U+FB04 LATIN SMALL LIGATURE FFL
  '\ufb00': 'ff',    # U+FB00 LATIN SMALL LIGATURE FF
  '\ufb05': 'ft',    # U+FB05 LATIN SMALL LIGATURE LONG S T -- NOTE(review): kept as 'ft'; confirm 'st' is not intended
  '\ufb01': 'fi',    # U+FB01 LATIN SMALL LIGATURE FI
  '\ufb02': 'fl',    # U+FB02 LATIN SMALL LIGATURE FL
  '\ufb06': 'st',    # U+FB06 LATIN SMALL LIGATURE ST
})

def spellingNormalization (sWord):
  "return <sWord> with ligatures expanded to plain letters, then normalized to Unicode NFC"
  return unicodedata.normalize("NFC", sWord.translate(_xTransCharsForSpelling))


# Translation table used to simplify a word: diacritics are dropped, a few
# letters are merged (y -> i, k -> q, w -> v), and digraph letters and ligatures
# are expanded to plain letters.
# str.maketrans() only accepts dict keys that are ONE character long (anything
# longer raises ValueError at import time), so every ligature is written as its
# own code point (U+FB00..U+FB06). Escapes are used so the keys survive any
# Unicode normalization of this file.
_xTransCharsForSimplification = str.maketrans({
  'à': 'a', 'é': 'e', 'î': 'i', 'ô': 'o', 'û': 'u', 'ÿ': 'i', "y": "i",
  'â': 'a', 'è': 'e', 'ï': 'i', 'ö': 'o', 'ù': 'u', 'ŷ': 'i',
  'ä': 'a', 'ê': 'e', 'í': 'i', 'ó': 'o', 'ü': 'u', 'ý': 'i',
  'á': 'a', 'ë': 'e', 'ì': 'i', 'ò': 'o', 'ú': 'u', 'ỳ': 'i',
  'ā': 'a', 'ē': 'e', 'ī': 'i', 'ō': 'o', 'ū': 'u', 'ȳ': 'i',
  'ç': 'c', 'ñ': 'n', 'k': 'q', 'w': 'v',
  'œ': 'oe', 'æ': 'ae',
  'ſ': 's',          # U+017F LATIN SMALL LETTER LONG S
  '\ufb03': 'ffi',   # U+FB03 LATIN SMALL LIGATURE FFI
  '\ufb04': 'ffl',   # U+FB04 LATIN SMALL LIGATURE FFL
  '\ufb00': 'ff',    # U+FB00 LATIN SMALL LIGATURE FF
  '\ufb05': 'ft',    # U+FB05 LATIN SMALL LIGATURE LONG S T -- NOTE(review): kept as 'ft'; confirm 'st' is not intended
  '\ufb01': 'fi',    # U+FB01 LATIN SMALL LIGATURE FI
  '\ufb02': 'fl',    # U+FB02 LATIN SMALL LIGATURE FL
  '\ufb06': 'st',    # U+FB06 LATIN SMALL LIGATURE ST
})

def simplifyWord (sWord):
  "word simplication before calculating distance between words"
  sWord = sWord.lower().translate(_xTransCharsForSimplification)
  sNewWord = ""
  for i, c in enumerate(sWord, 1):
................................................................................
  "Ë": "EeÉéÈèÊêËëĒēŒœ",

  "f": "fF",
  "F": "Ff",

  "g": "gGjJĵĴ",
  "G": "GgJjĴĵ",

  "h": "hH",
  "H": "Hh",

  "i": "iIîÎïÏyYíÍìÌīĪÿŸ",
  "I": "IiÎîÏïYyÍíÌìĪīŸÿ",
  "î": "iIîÎïÏyYíÍìÌīĪÿŸ",
  "Î": "IiÎîÏïYyÍíÌìĪīŸÿ",
................................................................................
  "X": ("CC", "CT", "XX"),
  "z": ("ss", "zh"),
  "Z": ("SS", "ZH"),
}


def get1toXReplacement (cPrev, cCur, cNext):
  "return the tuple of multi-char replacements registered for <cCur> in d1toX, or () when <cCur> is a consonant next to another consonant"
  if cCur not in aConsonant:
    # not a consonant: neighbours are irrelevant
    return d1toX.get(cCur, ())
  if cPrev in aConsonant or cNext in aConsonant:
    # consonant adjacent to another consonant: no replacement offered
    return ()
  return d1toX.get(cCur, ())


d2toX = {
  "am": ("an", "en", "em"),