Grammalecte  Check-in [3b3a02f4d3]

Overview
Comment:[graphspell][js] suggest optimisation with Jaro-Winkler (thanks to IllusionPerdu)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | graphspell | bdic_opt
Files: files | file ages | folders
SHA3-256: 3b3a02f4d385bf7b9d9c2ef3f992cc0c7cc150c8b7366c74ade85c8154499a54
User & Date: olr on 2020-09-15 13:50:00
Other Links: branch diff | manifest | tags
Context
2020-09-15
14:01
[graphspell][js] remove specific trick in cleanWord() check-in: 6569849b49 user: olr tags: bdic_opt, graphspell
13:50
[graphspell][js] suggest optimisation with Jaro-Winkler (thanks to IllusionPerdu) check-in: 3b3a02f4d3 user: olr tags: bdic_opt, graphspell
2020-09-14
14:38
[graphspell] string comparison: use Jaro-Winkler check-in: efebe44d15 user: olr tags: bdic_opt, graphspell
Changes

Modified graphspell-js/char_player.js from [0602ec129b] to [8dac23cf9b].

     4      4   /* jshint esversion:6 */
     5      5   /* jslint esversion:6 */
     6      6   
     7      7   ${map}
     8      8   
     9      9   
    10     10   var char_player = {
    11         -
           11  +    /*
           12  +        oDistanceBetweenChars:
           13  +            - with Jaro-Winkler, values between 1 and 10
           14  +            - with Damerau-Levenshtein, values / 10 (between 0 and 1: 0.1, 0.2 ... 0.9)
           15  +    */
    12     16       oDistanceBetweenChars: {
    13         -        "a": {},
    14         -        "e": {"é": 0.5},
    15         -        "é": {"e": 0.5},
    16         -        "i": {"y": 0.2},
    17         -        "o": {},
    18         -        "u": {},
    19         -        "y": {"i": 0.3},
    20         -        "b": {"d": 0.8, "h": 0.9},
    21         -        "c": {"ç": 0.1, "k": 0.5, "q": 0.5, "s": 0.5, "x": 0.5, "z": 0.8},
    22         -        "d": {"b": 0.8},
    23         -        "f": {"v": 0.8},
    24         -        "g": {"j": 0.5},
    25         -        "h": {"b": 0.9},
    26         -        "j": {"g": 0.5, "i": 0.9},
    27         -        "k": {"c": 0.5, "q": 0.1, "x": 0.5},
    28         -        "l": {"i": 0.9},
    29         -        "m": {"n": 0.8},
    30         -        "n": {"m": 0.8, "r": 0.9},
    31         -        "p": {"q": 0.9},
    32         -        "q": {"c": 0.5, "k": 0.1, "p": 0.9},
    33         -        "r": {"n": 0.9, "j": 0.9},
    34         -        "s": {"c": 0.5, "ç": 0.1, "x": 0.5, "z": 0.5},
    35         -        "t": {"d": 0.9},
    36         -        "v": {"f": 0.8, "w": 0.1},
    37         -        "w": {"v": 0.1},
    38         -        "x": {"c": 0.5, "k": 0.5, "q": 0.5, "s": 0.5},
    39         -        "z": {"s": 0.5}
           17  +        //"a": {},
           18  +        "e": {"é": 5},
           19  +        //"é": {"e": 5},
           20  +        "i": {"y": 2},
           21  +        //"o": {},
           22  +        //"u": {},
           23  +        "y": {"i": 3},
           24  +        "b": {"d": 8, "h": 9},
           25  +        "c": {"ç": 1, "k": 5, "q": 5, "s": 5, "x": 5, "z": 8},
           26  +        "d": {"b": 8},
           27  +        "f": {"v": 8},
           28  +        "g": {"j": 5},
           29  +        "h": {"b": 9},
           30  +        "j": {"g": 5, "i": 9},
           31  +        "k": {"c": 5, "q": 1, "x": 5},
           32  +        "l": {"i": 9},
           33  +        "m": {"n": 8},
           34  +        "n": {"m": 8, "r": 9},
           35  +        "p": {"q": 9},
           36  +        "q": {"c": 5, "k": 1, "p": 9},
           37  +        "r": {"n": 9, "j": 9},
           38  +        "s": {"c": 5, "ç": 1, "x": 5, "z": 5},
           39  +        "t": {"d": 9},
           40  +        "v": {"f": 8, "w": 1},
           41  +        "w": {"v": 1},
           42  +        "x": {"c": 5, "k": 5, "q": 5, "s": 5},
           43  +        "z": {"s": 5}
    40     44       },
    41     45   
    42     46       distanceBetweenChars: function (c1, c2) {
    43     47           if (c1 == c2) {
    44     48               return 0;
    45     49           }
    46     50           if (this.oDistanceBetweenChars.hasOwnProperty(c1) && this.oDistanceBetweenChars[c1].hasOwnProperty(c2)) {

Modified graphspell-js/ibdawg.js from [1cb5337715] to [2160aa77f7].

    18     18       var char_player = require("./char_player.js");
    19     19   }
    20     20   
    21     21   
    22     22   class SuggResult {
    23     23       // Structure for storing, classifying and filtering suggestions
    24     24   
    25         -    constructor (sWord, nDistLimit=-1) {
           25  +    constructor (sWord, nSuggLimit=10, nDistLimit=-1) {
    26     26           this.sWord = sWord;
    27     27           this.sSimplifiedWord = str_transform.simplifyWord(sWord);
    28     28           this.nDistLimit = (nDistLimit >= 0) ? nDistLimit :  Math.floor(sWord.length / 3) + 1;
    29     29           this.nMinDist = 1000;
    30         -        this.aSugg = new Set();
    31         -        this.dSugg = new Map([ [0, []],  [1, []],  [2, []] ]);
    32         -        this.aAllSugg = new Set();      // all found words even those refused
           30  +        // Temporary sets
           31  +        this.aAllSugg = new Set();  // All suggestions, even the one rejected
           32  +        this.dGoodSugg = new Map(); // Acceptable suggestions
           33  +        this.dBestSugg = new Map(); // Best suggestions
           34  +        // Parameters
           35  +        this.nSuggLimit = nSuggLimit;
           36  +        this.nSuggLimitExt = nSuggLimit + 2;                // we add few entries in case suggestions merge after casing modifications
           37  +        this.nBestSuggLimit = Math.floor(nSuggLimit * 1.5); // n times the requested limit
           38  +        this.nGoodSuggLimit = nSuggLimit * 15;              // n times the requested limit
    33     39       }
    34     40   
    35         -    addSugg (sSugg, nDeep=0) {
           41  +    addSugg (sSugg) {
    36     42           // add a suggestion
    37     43           if (this.aAllSugg.has(sSugg)) {
    38     44               return;
    39     45           }
    40     46           this.aAllSugg.add(sSugg);
    41         -        if (!this.aSugg.has(sSugg)) {
    42         -            //let nDist = Math.floor(str_transform.distanceDamerauLevenshtein(this.sSimplifiedWord, str_transform.simplifyWord(sSugg)));
    43         -            let nDist = Math.floor((1 - str_transform.distanceJaroWinkler(this.sSimplifiedWord, str_transform.simplifyWord(sSugg)))*10);
    44         -            if (nDist <= this.nDistLimit) {
    45         -                if (sSugg.includes(" ")) { // add 1 to distance for split suggestions
    46         -                    nDist += 1;
    47         -                }
    48         -                if (!this.dSugg.has(nDist)) {
    49         -                    this.dSugg.set(nDist, []);
    50         -                }
    51         -                this.dSugg.get(nDist).push(sSugg);
    52         -                this.aSugg.add(sSugg);
    53         -                if (nDist < this.nMinDist) {
    54         -                    this.nMinDist = nDist;
    55         -                }
    56         -                this.nDistLimit = Math.min(this.nDistLimit, this.nMinDist+1);
    57         -            }
    58         -        }
    59         -    }
    60         -
    61         -    getSuggestions (nSuggLimit=10) {
           47  +        // jaro 0->1 1 les chaines sont égale
           48  +        let nDistJaro = 1 - str_transform.distanceJaroWinkler(this.sSimplifiedWord, str_transform.simplifyWord(sSugg));
           49  +        let nDist = Math.floor(nDistJaro * 10);
           50  +        if (nDistJaro < .11) {        // Best suggestions
           51  +            this.dBestSugg.set(sSugg, Math.round(nDistJaro*1000));
           52  +            if (this.dBestSugg.size > this.nBestSuggLimit) {
           53  +                this.nDistLimit = -1; // make suggest() to end search
           54  +            }
           55  +        } else if (nDistJaro < .33) { // Good suggestions
           56  +            this.dGoodSugg.set(sSugg, Math.round(nDistJaro*1000));
           57  +            if (this.dGoodSugg.size > this.nGoodSuggLimit) {
           58  +                this.nDistLimit = -1; // make suggest() to end search
           59  +            }
           60  +        } else {
           61  +            if (nDist < this.nMinDist) {
           62  +                this.nMinDist = nDist;
           63  +            }
           64  +            this.nDistLimit = Math.min(this.nDistLimit, this.nMinDist);
           65  +        }
           66  +        if (nDist <= this.nDistLimit) {
           67  +            if (nDist < this.nMinDist) {
           68  +                this.nMinDist = nDist;
           69  +            }
           70  +            this.nDistLimit = Math.min(this.nDistLimit, this.nMinDist+1);
           71  +        }
           72  +    }
           73  +
           74  +    getSuggestions () {
    62     75           // return a list of suggestions
    63     76           let lRes = [];
    64         -        let bFirstListSorted = false;
    65         -        for (let [nDist, lSugg] of this.dSugg.entries()) {
    66         -            if (nDist > this.nDistLimit) {
    67         -                break;
    68         -            }
    69         -            if (!bFirstListSorted && lSugg.length > 1) {
    70         -                //lRes.sort((a, b) => { return str_transform.distanceDamerauLevenshtein(this.sWord, a) - str_transform.distanceDamerauLevenshtein(this.sWord, b); });
    71         -                lRes.sort((a, b) => { return str_transform.distanceJaroWinkler(this.sWord, b) - str_transform.distanceJaroWinkler(this.sWord, a); });
    72         -                bFirstListSorted = true;
    73         -            }
    74         -            lRes.push(...lSugg);
    75         -            if (lRes.length > nSuggLimit) {
    76         -                break;
           77  +        if (this.dBestSugg.size > 0) {
           78  +            // sort only with simplified words
           79  +            let lResTmp = [...this.dBestSugg.entries()].sort((a, b) => { return a[1] - b[1]; });
           80  +            let nSize = Math.min(this.nSuggLimitExt, lResTmp.length);
           81  +            for (let i=0;  i < nSize;  i++){
           82  +                lRes.push(lResTmp[i][0]);
    77     83               }
    78     84           }
           85  +        if (lRes.length < this.nSuggLimitExt) {
           86  +            // sort with simplified words and original word
           87  +            let lResTmp = [...this.dGoodSugg.entries()].sort((a, b) => {
           88  +                // Low precision to rely more on simplified words
           89  +                let nJaroA = Math.round(str_transform.distanceJaroWinkler(this.sWord, a[0]) * 10);
           90  +                let nJaroB = Math.round(str_transform.distanceJaroWinkler(this.sWord, b[0]) * 10);
           91  +                if (nJaroA == nJaroB) {
           92  +                    return a[1] - b[1];     // warning: both lists are NOT sorted the same way (key: a-b)
           93  +                } else {
           94  +                    return nJaroB - nJaroA; // warning: both lists are NOT sorted the same way (key: b-a)
           95  +                }
           96  +            }).slice(0, this.nSuggLimitExt);
           97  +            let nSize = Math.min(this.nSuggLimitExt, lResTmp.length);
           98  +            for (let i=0;  i < nSize;  i++){
           99  +                lRes.push(lResTmp[i][0]);
          100  +            }
          101  +        }
          102  +        // casing
    79    103           if (this.sWord.gl_isUpperCase()) {
    80    104               lRes = lRes.map((sSugg) => { return sSugg.toUpperCase(); });
    81    105               lRes = [...new Set(lRes)];
    82    106           }
    83    107           else if (this.sWord.slice(0,1).gl_isUpperCase()) {
    84    108               lRes = lRes.map((sSugg) => { return sSugg.slice(0,1).toUpperCase() + sSugg.slice(1); });
    85    109               lRes = [...new Set(lRes)];
    86    110           }
    87         -        return lRes.slice(0, nSuggLimit);
          111  +        return lRes.slice(0, this.nSuggLimit);
    88    112       }
    89    113   
    90    114       reset () {
    91         -        this.aSugg.clear();
    92    115           this.dSugg.clear();
          116  +        this.dGoodSugg.clear();
          117  +        this.dBestSugg.clear();
    93    118       }
    94    119   }
    95    120   
    96    121   
    97    122   class IBDAWG {
    98    123       // INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH
    99    124   
................................................................................
   327    352           if (this.lexicographer) {
   328    353               [sPfx, sWord, sSfx] = this.lexicographer.split(sWord);
   329    354           }
   330    355           let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1);
   331    356           let nMaxDel = Math.floor(sWord.length / 5);
   332    357           let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1);
   333    358           let nMaxJump = Math.max(Math.floor(sWord.length / 4), 1);
   334         -        let oSuggResult = new SuggResult(sWord);
          359  +        let oSuggResult = new SuggResult(sWord, nSuggLimit);
          360  +        let sWord = str_transform.cleanWord(sWord);
   335    361           if (bSplitTrailingNumbers) {
   336    362               this._splitTrailingNumbers(oSuggResult, sWord);
   337    363           }
   338    364           this._splitSuggest(oSuggResult, sWord);
   339    365           this._suggest(oSuggResult, sWord, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump);
   340         -        let aSugg = oSuggResult.getSuggestions(nSuggLimit);
          366  +        let aSugg = oSuggResult.getSuggestions();
   341    367           if (this.lexicographer) {
   342    368               aSugg = this.lexicographer.filterSugg(aSugg);
   343    369           }
   344    370           if (sSfx || sPfx) {
   345    371               // we add what we removed
   346    372               return aSugg.map( (sSugg) => { return sPfx + sSugg + sSfx; } );
   347    373           }

Modified graphspell-js/str_transform.js from [5a573a5745] to [8ec0376c2c].

    62     62               if (c != sWord.slice(i, i+1) || (c == 'e' && sWord.slice(i, i+2) != "ee")) {  // exception for <e> to avoid confusion between crée / créai
    63     63                   sNewWord += c;
    64     64               }
    65     65               i++;
    66     66           }
    67     67           return sNewWord.replace(/eau/g, "o").replace(/au/g, "o").replace(/ai/g, "éi").replace(/ei/g, "é").replace(/ph/g, "f");
    68     68       },
           69  +
           70  +    cleanWord: function (sWord) {
           71  +        // word clean for the user who make commun and preditive error help suggest
           72  +        // remove letters repeated more than 2 times
           73  +        if (sWord.match(/(.)(\1){2,}/igm)){
           74  +            sWord = sWord.replace(/(.*)(.)(.\2)/igm,'$1$2').replace(/(.)(\1)+/igm,'$1$1');
           75  +        }
           76  +        // words ending with -ik -> replace with -ique
           77  +        if (sWord.match(/ik$/ig)){
           78  +            sWord = sWord.replace(/(.*)ik$/ig,'$1ique');
           79  +        }
           80  +        return sWord;
           81  +    },
    69     82   
    70     83       _xTransNumbersToExponent: new Map([
    71     84           ["0", "⁰"], ["1", "¹"], ["2", "²"], ["3", "³"], ["4", "⁴"], ["5", "⁵"], ["6", "⁶"], ["7", "⁷"], ["8", "⁸"], ["9", "⁹"]
    72     85       ]),
    73     86   
    74     87       numbersToExponent: function (sWord) {
    75     88           let sNewWord = "";
................................................................................
   205    218           let adjwt = char_player.oDistanceBetweenChars;
   206    219           if (minv > Num_com) {
   207    220             for (let i = 0; i < a_len; i++) {
   208    221               if (!a_flag[i]) {
   209    222                 for (let j = 0; j < b_len; j++) {
   210    223                   if (!b_flag[j]) {
   211    224                     if (adjwt[a[i]] && adjwt[a[i]][b[j]]) {
   212         -                    N_simi += adjwt[a[i]][b[j]] * 10; // le fois 10 est un ajustement pour que ça fonctionne bien dans cette fonction
          225  +                    N_simi += adjwt[a[i]][b[j]];
   213    226                       b_flag[j] = 2;
   214    227                       break;
   215    228                     }
   216    229                   }
   217    230                 }
   218    231               }
   219    232             }