Grammalecte  Check-in [3348432a36]

Overview
Comment:[graphspell] normalize characters before spell checking
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | graphspell
Files: files | file ages | folders
SHA3-256: 3348432a36ef9de5c9093131397ae5ebd3d5b151fb108f44812abcbb1920f8e2
User & Date: olr on 2018-02-19 09:07:58
Other Links: manifest | tags
Context
2018-02-19
09:11
[fr] version 0.6.2 check-in: 18027d1022 user: olr tags: fr, trunk
09:07
[graphspell] normalize characters before spell checking check-in: 3348432a36 user: olr tags: graphspell, trunk
07:30
[fr] fichier des affixes: ICONV ? -> s, REP: suppression de cad -> c’est-à-dire check-in: b9474152b2 user: olr tags: fr, trunk
Changes

Modified gc_lang/fr/perf_memo.txt from [cec037999d] to [190717e473].

    19     19   0.5.12      2016.10.14 18:58    4.51895     1.0843      0.772805    0.22387     0.249411    0.261593    0.628802    0.339303    0.0570326   0.00805416  
    20     20   0.5.15      2017.01.22 11:44    4.85204     1.16134     0.770762    0.227874    0.244574    0.253305    0.58831     0.319987    0.0603996   0.00694786  
    21     21   0.5.15      2017.01.22 11:47    4.85593     1.15248     0.762924    0.22744     0.243461    0.254609    0.586741    0.317503    0.0588827   0.00701016  (unicode normalisation NFC)
    22     22   0.5.15      2017.01.31 12:06    4.88227     1.18008     0.782217    0.232617    0.247672    0.257628    0.596903    0.32169     0.0603505   0.00695196  
    23     23   0.5.15      2017.02.05 10:10    4.90222     1.18444     0.786696    0.233413    0.25071     0.260214    0.602112    0.325235    0.0609932   0.00706897  
    24     24   0.5.16      2017.05.12 07:41    4.92201     1.19269     0.80639     0.239147    0.257518    0.266523    0.62111     0.33359     0.0634668   0.00757178  
    25     25   0.6.1       2018.02.12 09:58    5.25924     1.2649      0.878442    0.257465    0.280558    0.293903    0.686887    0.391275    0.0672474   0.00824723  
           26  +0.6.1       2018.02.19 09:06    6.20116     1.44334     1.02936     0.272956    0.311561    0.362367    0.812705    0.419061    0.0773003   0.00845671  (spelling normalization)

Modified graphspell-js/char_player.js from [c9b14a8774] to [c171c18615].

     2      2   // useful for suggestion mechanism
     3      3   
     4      4   ${map}
     5      5   
     6      6   
     7      7   var char_player = {
     8      8   
     9         -    _dTransChars: new Map([
            9  +    _xTransCharsForSpelling: new Map([
           10  +        ['ſ', 's'],  ['ﬃ', 'ffi'],  ['ﬄ', 'ffl'],  ['ﬀ', 'ff'],  ['ﬅ', 'ft'],  ['ﬁ', 'fi'],  ['ﬂ', 'fl'],  ['ﬆ', 'st']
           11  +    ]),
           12  +
           13  +    spellingNormalization: function (sWord) {
           14  +        let sNewWord = "";
           15  +        for (let c of sWord) {
           16  +            sNewWord += this._xTransCharsForSpelling.gl_get(c, c);
           17  +        }
           18  +        return sNewWord.normalize("NFC");
           19  +    },
           20  +
           21  +    _xTransCharsForSimplification: new Map([
    10     22           ['à', 'a'],  ['é', 'e'],  ['î', 'i'],  ['ô', 'o'],  ['û', 'u'],  ['ÿ', 'i'],  ['y', 'i'],
    11     23           ['â', 'a'],  ['è', 'e'],  ['ï', 'i'],  ['ö', 'o'],  ['ù', 'u'],  ['ŷ', 'i'],
    12     24           ['ä', 'a'],  ['ê', 'e'],  ['í', 'i'],  ['ó', 'o'],  ['ü', 'u'],  ['ý', 'i'],
    13     25           ['á', 'a'],  ['ë', 'e'],  ['ì', 'i'],  ['ò', 'o'],  ['ú', 'u'],  ['ỳ', 'i'],
    14     26           ['ā', 'a'],  ['ē', 'e'],  ['ī', 'i'],  ['ō', 'o'],  ['ū', 'u'],  ['ȳ', 'i'],
    15     27           ['ñ', 'n'],  ['k', 'q'],  ['w', 'v'],
    16     28           ['œ', 'oe'], ['æ', 'ae'], 
           29  +        ['ſ', 's'],  ['ﬃ', 'ffi'],  ['ﬄ', 'ffl'],  ['ﬀ', 'ff'],  ['ﬅ', 'ft'],  ['ﬁ', 'fi'],  ['ﬂ', 'fl'],  ['ﬆ', 'st']
    17     30       ]),
    18     31   
    19     32       simplifyWord: function (sWord) {
    20     33           // word simplication before calculating distance between words
    21     34           sWord = sWord.toLowerCase();
    22     35           let sNewWord = "";
    23     36           let i = 1;
    24     37           for (let c of sWord) {
    25         -            let cNew = this._dTransChars.gl_get(c, c);
           38  +            let cNew = this._xTransCharsForSimplification.gl_get(c, c);
    26     39               let cNext = sWord.slice(i, i+1)
    27         -            if (cNew != this._dTransChars.gl_get(cNext, cNext)) {
           40  +            if (cNew != this._xTransCharsForSimplification.gl_get(cNext, cNext)) {
    28     41                   sNewWord += cNew;
    29     42               }
    30     43               i++;
    31     44           }
    32     45           return sNewWord.replace(/eau/g, "o").replace(/au/g, "o").replace(/ai/g, "e").replace(/ei/g, "e").replace(/ph/g, "f");
    33     46       },
    34     47   

Modified graphspell-js/ibdawg.js from [08ad598b63] to [73e27f350e].

   206    206               "sByDic": this.sByDic    // binary word graph
   207    207           };
   208    208           return oJSON;
   209    209       }
   210    210   
   211    211       isValidToken (sToken) {
   212    212           // checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked)
          213  +        sToken = char_player.spellingNormalization(sToken)
   213    214           if (this.isValid(sToken)) {
   214    215               return true;
   215    216           }
   216    217           if (sToken.includes("-")) {
   217    218               if (sToken.gl_count("-") > 4) {
   218    219                   return true;
   219    220               }
................................................................................
   276    277               }
   277    278           }
   278    279           return Boolean(this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask);
   279    280       }
   280    281   
   281    282       getMorph (sWord) {
   282    283           // retrieves morphologies list, different casing allowed
          284  +        sWord = char_player.spellingNormalization(sWord)
   283    285           let l = this.morph(sWord);
   284    286           if (sWord[0].gl_isUpperCase()) {
   285    287               l.push(...this.morph(sWord.toLowerCase()));
   286    288               if (sWord.gl_isUpperCase() && sWord.length > 1) {
   287    289                   l.push(...this.morph(sWord.gl_toCapitalize()));
   288    290               }
   289    291           }
   290    292           return l;
   291    293       }
   292    294   
   293    295       suggest (sWord, nSuggLimit=10) {
   294    296           // returns a array of suggestions for <sWord>
          297  +        sWord = char_player.spellingNormalization(sWord)
   295    298           let sPfx = "";
   296    299           let sSfx = "";
   297    300           [sPfx, sWord, sSfx] = char_player.cut(sWord);
   298    301           let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1);
   299    302           let nMaxDel = Math.floor(sWord.length / 5);
   300    303           let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1);
   301    304           let oSuggResult = new SuggResult(sWord);

Modified graphspell/char_player.py from [82e97eae54] to [e841b9211a].

     1      1   # list of similar chars
     2      2   # useful for suggestion mechanism
     3      3   
     4      4   import re
            5  +import unicodedata
     5      6   
     6      7   
     7         -_xTransChars = str.maketrans({
            8  +_xTransCharsForSpelling = str.maketrans({
            9  +    'ſ': 's',  'ﬃ': 'ffi',  'ﬄ': 'ffl',  'ﬀ': 'ff',  'ﬅ': 'ft',  'ﬁ': 'fi',  'ﬂ': 'fl',  'ﬆ': 'st'
           10  +})
           11  +
           12  +def spellingNormalization (sWord):
           13  +    return unicodedata.normalize("NFC", sWord.translate(_xTransCharsForSpelling))
           14  +
           15  +
           16  +_xTransCharsForSimplification = str.maketrans({
     8     17       'à': 'a',  'é': 'e',  'î': 'i',  'ô': 'o',  'û': 'u',  'ÿ': 'i',  "y": "i",
     9     18       'â': 'a',  'è': 'e',  'ï': 'i',  'ö': 'o',  'ù': 'u',  'ŷ': 'i',
    10     19       'ä': 'a',  'ê': 'e',  'í': 'i',  'ó': 'o',  'ü': 'u',  'ý': 'i',
    11     20       'á': 'a',  'ë': 'e',  'ì': 'i',  'ò': 'o',  'ú': 'u',  'ỳ': 'i',
    12     21       'ā': 'a',  'ē': 'e',  'ī': 'i',  'ō': 'o',  'ū': 'u',  'ȳ': 'i',
    13     22       'ñ': 'n',  'k': 'q',  'w': 'v',
    14     23       'œ': 'oe',  'æ': 'ae', 
           24  +    'ſ': 's',  'ﬃ': 'ffi',  'ﬄ': 'ffl',  'ﬀ': 'ff',  'ﬅ': 'ft',  'ﬁ': 'fi',  'ﬂ': 'fl',  'ﬆ': 'st', 
    15     25   })
    16     26   
    17     27   def simplifyWord (sWord):
    18     28       "word simplication before calculating distance between words"
    19         -    sWord = sWord.lower().translate(_xTransChars)
           29  +    sWord = sWord.lower().translate(_xTransCharsForSimplification)
    20     30       sNewWord = ""
    21     31       for i, c in enumerate(sWord, 1):
    22     32           if c != sWord[i:i+1]:
    23     33               sNewWord += c
    24     34       return sNewWord.replace("eau", "o").replace("au", "o").replace("ai", "e").replace("ei", "e").replace("ph", "f")
    25     35   
    26     36   

Modified graphspell/ibdawg.py from [3bf18d8144] to [c41b426a86].

   214    214                               "sByDic": self.byDic.hex()  if bBinaryDictAsHexString  else [ e  for e in self.byDic ]
   215    215                           }, ensure_ascii=False))
   216    216               if bInJSModule:
   217    217                   hDst.write(";\n\nexports.dictionary = dictionary;\n")
   218    218   
   219    219       def isValidToken (self, sToken):
   220    220           "checks if <sToken> is valid (if there is hyphens in <sToken>, <sToken> is split, each part is checked)"
          221  +        sToken = cp.spellingNormalization(sToken)
   221    222           if self.isValid(sToken):
   222    223               return True
   223    224           if "-" in sToken:
   224    225               if sToken.count("-") > 4:
   225    226                   return True
   226    227               return all(self.isValid(sWord)  for sWord in sToken.split("-"))
   227    228           return False
................................................................................
   256    257               iAddr = self._lookupArcNode(self.dChar[c], iAddr)
   257    258               if iAddr == None:
   258    259                   return False
   259    260           return bool(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask)
   260    261   
   261    262       def getMorph (self, sWord):
   262    263           "retrieves morphologies list, different casing allowed"
          264  +        sWord = cp.spellingNormalization(sWord)
   263    265           l = self.morph(sWord)
   264    266           if sWord[0:1].isupper():
   265    267               l.extend(self.morph(sWord.lower()))
   266    268               if sWord.isupper() and len(sWord) > 1:
   267    269                   l.extend(self.morph(sWord.capitalize()))
   268    270           return l
   269    271   
   270    272       #@timethis
   271    273       def suggest (self, sWord, nSuggLimit=10):
   272    274           "returns a set of suggestions for <sWord>"
          275  +        sWord = cp.spellingNormalization(sWord)
   273    276           sPfx, sWord, sSfx = cp.cut(sWord)
   274    277           nMaxSwitch = max(len(sWord) // 3, 1)
   275    278           nMaxDel = len(sWord) // 5
   276    279           nMaxHardRepl = max((len(sWord) - 5) // 4, 1)
   277    280           oSuggResult = SuggResult(sWord)
   278    281           self._suggest(oSuggResult, sWord, nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl)
   279    282           if sWord.istitle():
................................................................................
   326    329                   self._suggest(oSuggResult, "", nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True) # remove last char and go on
   327    330                   for sRepl in cp.dFinal1.get(sRemain, ()):
   328    331                       self._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)
   329    332   
   330    333       #@timethis
   331    334       def suggest2 (self, sWord, nMaxSugg=10):
   332    335           "returns a set of suggestions for <sWord>"
          336  +        sWord = cp.spellingNormalization(sWord)
   333    337           sPfx, sWord, sSfx = cp.cut(sWord)
   334    338           oSuggResult = SuggResult(sWord)
   335    339           self._suggest2(oSuggResult)
   336    340           aSugg = oSuggResult.getSuggestions()
   337    341           if sSfx or sPfx:
   338    342               # we add what we removed
   339    343               return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg))
................................................................................
   382    386                       aTails.add(sTail + self.dCharVal[nVal])
   383    387                   if n and not aTails:
   384    388                       aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1))
   385    389           return aTails
   386    390   
   387    391       def drawPath (self, sWord, iAddr=0):
   388    392           "show the path taken by <sWord> in the graph"
          393  +        sWord = cp.spellingNormalization(sWord)
   389    394           c1 = sWord[0:1]  if sWord  else " "
   390    395           iPos = -1
   391    396           n = 0
   392    397           print(c1 + ": ", end="")
   393    398           for c2, jAddr in self._getCharArcs(iAddr):
   394    399               print(c2, end="")
   395    400               if c2 == sWord[0:1]: