Grammalecte  Check-in [1329ae8f1c]

Overview
Comment:[core] ibdawg: clean words before damerau-levenshtein comparison
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | core
Files: files | file ages | folders
SHA3-256: 1329ae8f1c84636f6974834cc2bb7383def1d59e208f6573baeea10795b5faa6
User & Date: olr on 2017-10-25 11:37:33
Other Links: manifest | tags
Context
2017-10-25
14:30
[core][fr] ibdawg: char_player > phonème o check-in: 0ad1970e9c user: olr tags: core, fr, trunk
11:37
[core] ibdawg: clean words before damerau-levenshtein comparison check-in: 1329ae8f1c user: olr tags: core, trunk
09:41
[core] ibdawg: suggestion mechanism > split word function check-in: 388e8809cf user: olr tags: core, trunk
Changes

Modified gc_core/js/char_player.js from [c0ed55106f] to [9c8e1eeca8].

     1      1   // list of similar chars
     2      2   // useful for suggestion mechanism
     3      3   
     4      4   ${map}
     5      5   
     6      6   
     7      7   var char_player = {
            8  +
            9  +    _dTransChars: new Map([
           10  +        ['à', 'a'],  ['é', 'e'],  ['î', 'i'],  ['ô', 'o'],  ['û', 'u'],  ['ÿ', 'y'],
           11  +        ['â', 'a'],  ['è', 'e'],  ['ï', 'i'],  ['ö', 'o'],  ['ù', 'u'],  ['ŷ', 'y'],
           12  +        ['ä', 'a'],  ['ê', 'e'],  ['í', 'i'],  ['ó', 'o'],  ['ü', 'u'],  ['ý', 'y'],
           13  +        ['á', 'a'],  ['ë', 'e'],  ['ì', 'i'],  ['ò', 'o'],  ['ú', 'u'],  ['ỳ', 'y'],
           14  +        ['ā', 'a'],  ['ē', 'e'],  ['ī', 'i'],  ['ō', 'o'],  ['ū', 'u'],  ['ȳ', 'y'],
           15  +        ['ñ', 'n'],
           16  +        ['œ', 'oe'], ['æ', 'ae'], 
           17  +    ]),
           18  +
           19  +    cleanWord: function (sWord) {
           20  +        // word simplication before calculating distance between words
           21  +        sWord = sWord.toLowerCase();
           22  +        let sRes = "";
           23  +        for (let c of sWord) {
           24  +            sRes += this._dTransChars.gl_get(c, c);
           25  +        }
           26  +        return sWord;
           27  +    },
     8     28   
     9     29       distanceDamerauLevenshtein: function (s1, s2) {
    10     30           // distance of Damerau-Levenshtein between <s1> and <s2>
    11     31           // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
    12     32           try {
    13     33               let nLen1 = s1.length;
    14     34               let nLen2 = s2.length;
................................................................................
    52     72           'â', 'è', 'ï', 'ö', 'ù', 'ŷ',
    53     73           'ä', 'ê', 'í', 'ó', 'ü', 'ý',
    54     74           'á', 'ë', 'ì', 'ò', 'ú', 'ỳ',
    55     75           'ā', 'ē', 'ī', 'ō', 'ū', 'ȳ',
    56     76           'h', 'œ', 'æ'
    57     77       ]),
    58     78   
    59         -    clearWord: function (sWord) {
           79  +    shrinkWord: function (sWord) {
    60     80           // remove vowels and h
    61     81           let sRes = "";
    62     82           for (let cChar of sWord.slice(1)) {
    63     83               if (!this.aVovels.has(cChar)) {
    64     84                   sRes += cChar;
    65     85               }
    66     86           }

Modified gc_core/js/ibdawg.js from [c871817c8f] to [82209aec2c].

   199    199           if (sWord.gl_isTitle()) {
   200    200               aSugg.gl_update(this._suggest(sWord.toLowerCase(), nMaxDel, nMaxHardRepl));
   201    201           }
   202    202           else if (sWord.gl_isLowerCase()) {
   203    203               aSugg.gl_update(this._suggest(sWord.gl_toCapitalize(), nMaxDel, nMaxHardRepl));
   204    204           }
   205    205           if (aSugg.size == 0) {
   206         -            aSugg.gl_update(this._suggestWithCrushedUselessChars(char_player.clearWord(sWord)));
          206  +            aSugg.gl_update(this._suggestWithCrushedUselessChars(char_player.shrinkWord(sWord)));
   207    207           }
   208    208           // Set to Array
   209    209           aSugg = Array.from(aSugg);
   210    210           aSugg = aSugg.filter((sSugg) => { return !sSugg.endsWith("è") && !sSugg.endsWith("È"); }); // fr language 
   211    211           if (sWord.gl_isTitle()) {
   212    212               aSugg = aSugg.map((sSugg) => { return sSugg.gl_toCapitalize(); });
   213    213           }
   214    214           let dDistTemp = new Map();
   215         -        aSugg.forEach((sSugg) => { dDistTemp.set(sSugg, char_player.distanceDamerauLevenshtein(sWord, sSugg)); });
          215  +        let sCleanWord = char_player.cleanWord(sWord)
          216  +        aSugg.forEach((sSugg) => { dDistTemp.set(sSugg, char_player.distanceDamerauLevenshtein(sCleanWord, char_player.cleanWord(sSugg))); });
   216    217           aSugg = aSugg.sort((sA, sB) => { return dDistTemp.get(sA) - dDistTemp.get(sB); }).slice(0, nMaxSugg);
   217    218           dDistTemp.clear();
   218    219           if (sSfx || sPfx) {
   219    220               // we add what we removed
   220    221               return aSugg.map( (sSugg) => { return sPfx + sSugg + sSfx } );
   221    222           }
   222    223           return aSugg;

Modified gc_core/py/char_player.py from [2ac4c0eb20] to [83d2f45b7c].

     1      1   # list of similar chars
     2      2   # useful for suggestion mechanism
     3      3   
     4      4   import re
     5      5   
            6  +
            7  +_xTransChars = str.maketrans({
            8  +    'à': 'a',  'é': 'e',  'î': 'i',  'ô': 'o',  'û': 'u',  'ÿ': 'y',
            9  +    'â': 'a',  'è': 'e',  'ï': 'i',  'ö': 'o',  'ù': 'u',  'ŷ': 'y',
           10  +    'ä': 'a',  'ê': 'e',  'í': 'i',  'ó': 'o',  'ü': 'u',  'ý': 'y',
           11  +    'á': 'a',  'ë': 'e',  'ì': 'i',  'ò': 'o',  'ú': 'u',  'ỳ': 'y',
           12  +    'ā': 'a',  'ē': 'e',  'ī': 'i',  'ō': 'o',  'ū': 'u',  'ȳ': 'y',
           13  +    'ñ': 'n',
           14  +    'œ': 'oe',  'æ': 'ae', 
           15  +})
           16  +
           17  +def cleanWord (sWord):
           18  +    "word simplification before calculating distance between words"
           19  +    return sWord.lower().translate(_xTransChars)
           20  +
     6     21   
     7     22   def distanceDamerauLevenshtein (s1, s2):
     8     23       "distance of Damerau-Levenshtein between <s1> and <s2>"
     9     24       # https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
    10     25       d = {}
    11     26       nLen1 = len(s1)
    12     27       nLen2 = len(s2)
................................................................................
    41     56   
    42     57   _xTransVovels = str.maketrans(_dVovels)
    43     58   
    44     59   
    45     60   aVovels = frozenset(_dVovels.keys())
    46     61   
    47     62   
    48         -def clearWord (sWord):
           63  +def shrinkWord (sWord):
    49     64       "remove vowels and h"
    50     65       return sWord[0:1].replace("h", "") + sWord[1:].translate(_xTransVovels)
    51     66   
    52     67   
    53     68   # Similar chars
    54     69   
    55     70   d1to1 = {

Modified gc_core/py/ibdawg.py from [e132c3a736] to [f563ae7bdb].

   196    196           if sWord.istitle():
   197    197               aSugg.update(self._suggest(sWord.lower(), nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl))
   198    198               aSugg = set(map(lambda sSugg: sSugg.title(), aSugg))
   199    199           elif sWord.islower():
   200    200               aSugg.update(self._suggest(sWord.title(), nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl))
   201    201           if not aSugg:
   202    202               #print("crush useless chars")
   203         -            aSugg.update(self._suggestWithCrushedUselessChars(cp.clearWord(sWord)))
          203  +            aSugg.update(self._suggestWithCrushedUselessChars(cp.shrinkWord(sWord)))
   204    204           aSugg = cp.filterSugg(aSugg)
   205         -        aSugg = sorted(aSugg, key=lambda sSugg: cp.distanceDamerauLevenshtein(sWord, sSugg))[:nMaxSugg]
          205  +        sCleanWord = cp.cleanWord(sWord)
          206  +        aSugg = sorted(aSugg, key=lambda sSugg: cp.distanceDamerauLevenshtein(sCleanWord, cp.cleanWord(sSugg)))[:nMaxSugg]
   206    207           if sSfx or sPfx:
   207    208               # we add what we removed
   208    209               return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg))
   209    210           return aSugg
   210    211   
   211    212       def _suggest (self, sRemain, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False):
   212    213           "returns a set of suggestions"