Grammalecte  Check-in [3b3a02f4d3]

Overview
Comment:[graphspell][js] suggest optimisation with Jaro-Winkler (thanks to IllusionPerdu)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | graphspell | bdic_opt
Files: files | file ages | folders
SHA3-256: 3b3a02f4d385bf7b9d9c2ef3f992cc0c7cc150c8b7366c74ade85c8154499a54
User & Date: olr on 2020-09-15 13:50:00
Other Links: branch diff | manifest | tags
Context
2020-09-15
14:01
[graphspell][js] remove specific trick in cleanWord() check-in: 6569849b49 user: olr tags: bdic_opt, graphspell
13:50
[graphspell][js] suggest optimisation with Jaro-Winkler (thanks to IllusionPerdu) check-in: 3b3a02f4d3 user: olr tags: bdic_opt, graphspell
2020-09-14
14:38
[graphspell] string comparison: use Jaro-Winkler check-in: efebe44d15 user: olr tags: bdic_opt, graphspell
Changes

Modified graphspell-js/char_player.js from [0602ec129b] to [8dac23cf9b].

4
5
6
7
8
9
10
11





12
13
14
15
16
17
18
19
20
21

22
23
24
25
26
27

28
29
30
31
32

33
34

35
36
37
38

39
40
41
42
43
44
45
46
/* jshint esversion:6 */
/* jslint esversion:6 */

${map}


var char_player = {






    oDistanceBetweenChars: {
        "a": {},
        "e": {"é": 0.5},
        "é": {"e": 0.5},
        "i": {"y": 0.2},
        "o": {},
        "u": {},
        "y": {"i": 0.3},
        "b": {"d": 0.8, "h": 0.9},
        "c": {"ç": 0.1, "k": 0.5, "q": 0.5, "s": 0.5, "x": 0.5, "z": 0.8},

        "d": {"b": 0.8},
        "f": {"v": 0.8},
        "g": {"j": 0.5},
        "h": {"b": 0.9},
        "j": {"g": 0.5, "i": 0.9},
        "k": {"c": 0.5, "q": 0.1, "x": 0.5},

        "l": {"i": 0.9},
        "m": {"n": 0.8},
        "n": {"m": 0.8, "r": 0.9},
        "p": {"q": 0.9},
        "q": {"c": 0.5, "k": 0.1, "p": 0.9},

        "r": {"n": 0.9, "j": 0.9},
        "s": {"c": 0.5, "ç": 0.1, "x": 0.5, "z": 0.5},

        "t": {"d": 0.9},
        "v": {"f": 0.8, "w": 0.1},
        "w": {"v": 0.1},
        "x": {"c": 0.5, "k": 0.5, "q": 0.5, "s": 0.5},

        "z": {"s": 0.5}
    },

    distanceBetweenChars: function (c1, c2) {
        if (c1 == c2) {
            return 0;
        }
        if (this.oDistanceBetweenChars.hasOwnProperty(c1) && this.oDistanceBetweenChars[c1].hasOwnProperty(c2)) {







<
>
>
>
>
>

|
|
|
|
|
|
|
|
<
>
|
|
|
|
|
<
>
|
|
|
|
<
>
|
<
>
|
|
|
<
>
|







4
5
6
7
8
9
10

11
12
13
14
15
16
17
18
19
20
21
22
23
24

25
26
27
28
29
30

31
32
33
34
35

36
37

38
39
40
41

42
43
44
45
46
47
48
49
50
/* jshint esversion:6 */
/* jslint esversion:6 */

${map}


var char_player = {

    /*
        oDistanceBetweenChars:
            - with Jaro-Winkler, values between 1 and 10
            - with Damerau-Levenshtein, values / 10 (between 0 and 1: 0.1, 0.2 ... 0.9)
    */
    oDistanceBetweenChars: {
        //"a": {},
        "e": {"é": 5},
        //"é": {"e": 5},
        "i": {"y": 2},
        //"o": {},
        //"u": {},
        "y": {"i": 3},
        "b": {"d": 8, "h": 9},

        "c": {"ç": 1, "k": 5, "q": 5, "s": 5, "x": 5, "z": 8},
        "d": {"b": 8},
        "f": {"v": 8},
        "g": {"j": 5},
        "h": {"b": 9},
        "j": {"g": 5, "i": 9},

        "k": {"c": 5, "q": 1, "x": 5},
        "l": {"i": 9},
        "m": {"n": 8},
        "n": {"m": 8, "r": 9},
        "p": {"q": 9},

        "q": {"c": 5, "k": 1, "p": 9},
        "r": {"n": 9, "j": 9},

        "s": {"c": 5, "ç": 1, "x": 5, "z": 5},
        "t": {"d": 9},
        "v": {"f": 8, "w": 1},
        "w": {"v": 1},

        "x": {"c": 5, "k": 5, "q": 5, "s": 5},
        "z": {"s": 5}
    },

    distanceBetweenChars: function (c1, c2) {
        if (c1 == c2) {
            return 0;
        }
        if (this.oDistanceBetweenChars.hasOwnProperty(c1) && this.oDistanceBetweenChars[c1].hasOwnProperty(c2)) {

Modified graphspell-js/ibdawg.js from [1cb5337715] to [2160aa77f7].

18
19
20
21
22
23
24
25
26
27
28
29

30
31
32






33
34
35
36
37
38
39
40
41
42

43
44
45
46





47
48
49




50
51
52

53
54
55
56
57



58

59
60

61
62
63
64
65
66
67






68
69
70





71
72





73



74
75
76
77
78

79
80
81
82
83
84
85
86
87
88
89
90
91
92

93
94
95
96
97
98
99
...
327
328
329
330
331
332
333
334

335
336
337
338
339
340
341
342
343
344
345
346
347
    var char_player = require("./char_player.js");
}


class SuggResult {
    // Structure for storing, classifying and filtering suggestions

    constructor (sWord, nDistLimit=-1) {
        this.sWord = sWord;
        this.sSimplifiedWord = str_transform.simplifyWord(sWord);
        this.nDistLimit = (nDistLimit >= 0) ? nDistLimit :  Math.floor(sWord.length / 3) + 1;
        this.nMinDist = 1000;

        this.aSugg = new Set();
        this.dSugg = new Map([ [0, []],  [1, []],  [2, []] ]);
        this.aAllSugg = new Set();      // all found words even those refused






    }

    addSugg (sSugg, nDeep=0) {
        // add a suggestion
        if (this.aAllSugg.has(sSugg)) {
            return;
        }
        this.aAllSugg.add(sSugg);
        if (!this.aSugg.has(sSugg)) {
            //let nDist = Math.floor(str_transform.distanceDamerauLevenshtein(this.sSimplifiedWord, str_transform.simplifyWord(sSugg)));

            let nDist = Math.floor((1 - str_transform.distanceJaroWinkler(this.sSimplifiedWord, str_transform.simplifyWord(sSugg)))*10);
            if (nDist <= this.nDistLimit) {
                if (sSugg.includes(" ")) { // add 1 to distance for split suggestions
                    nDist += 1;





                }
                if (!this.dSugg.has(nDist)) {
                    this.dSugg.set(nDist, []);




                }
                this.dSugg.get(nDist).push(sSugg);
                this.aSugg.add(sSugg);

                if (nDist < this.nMinDist) {
                    this.nMinDist = nDist;
                }
                this.nDistLimit = Math.min(this.nDistLimit, this.nMinDist+1);
            }



        }

    }


    getSuggestions (nSuggLimit=10) {
        // return a list of suggestions
        let lRes = [];
        let bFirstListSorted = false;
        for (let [nDist, lSugg] of this.dSugg.entries()) {
            if (nDist > this.nDistLimit) {
                break;






            }
            if (!bFirstListSorted && lSugg.length > 1) {
                //lRes.sort((a, b) => { return str_transform.distanceDamerauLevenshtein(this.sWord, a) - str_transform.distanceDamerauLevenshtein(this.sWord, b); });





                lRes.sort((a, b) => { return str_transform.distanceJaroWinkler(this.sWord, b) - str_transform.distanceJaroWinkler(this.sWord, a); });
                bFirstListSorted = true;





            }



            lRes.push(...lSugg);
            if (lRes.length > nSuggLimit) {
                break;
            }
        }

        if (this.sWord.gl_isUpperCase()) {
            lRes = lRes.map((sSugg) => { return sSugg.toUpperCase(); });
            lRes = [...new Set(lRes)];
        }
        else if (this.sWord.slice(0,1).gl_isUpperCase()) {
            lRes = lRes.map((sSugg) => { return sSugg.slice(0,1).toUpperCase() + sSugg.slice(1); });
            lRes = [...new Set(lRes)];
        }
        return lRes.slice(0, nSuggLimit);
    }

    reset () {
        this.aSugg.clear();
        this.dSugg.clear();

    }
}


class IBDAWG {
    // INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH

................................................................................
        if (this.lexicographer) {
            [sPfx, sWord, sSfx] = this.lexicographer.split(sWord);
        }
        let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1);
        let nMaxDel = Math.floor(sWord.length / 5);
        let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1);
        let nMaxJump = Math.max(Math.floor(sWord.length / 4), 1);
        let oSuggResult = new SuggResult(sWord);

        if (bSplitTrailingNumbers) {
            this._splitTrailingNumbers(oSuggResult, sWord);
        }
        this._splitSuggest(oSuggResult, sWord);
        this._suggest(oSuggResult, sWord, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump);
        let aSugg = oSuggResult.getSuggestions(nSuggLimit);
        if (this.lexicographer) {
            aSugg = this.lexicographer.filterSugg(aSugg);
        }
        if (sSfx || sPfx) {
            // we add what we removed
            return aSugg.map( (sSugg) => { return sPfx + sSugg + sSfx; } );
        }







|




>
|
<
|
>
>
>
>
>
>


|





<
<
>
|
<
<
<
>
>
>
>
>
|
<
<
>
>
>
>
|
<
<
>
|
|
|
|
|
>
>
>
|
>
|
|
>
|


<
<
<
<
>
>
>
>
>
>

<
<
>
>
>
>
>
|
<
>
>
>
>
>
|
>
>
>
|
<
<


>








|



|
|
>







 







|
>





|







18
19
20
21
22
23
24
25
26
27
28
29
30
31

32
33
34
35
36
37
38
39
40
41
42
43
44
45
46


47
48



49
50
51
52
53
54


55
56
57
58
59


60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76




77
78
79
80
81
82
83


84
85
86
87
88
89

90
91
92
93
94
95
96
97
98
99


100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
...
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
    var char_player = require("./char_player.js");
}


class SuggResult {
    // Structure for storing, classifying and filtering suggestions

    constructor (sWord, nSuggLimit=10, nDistLimit=-1) {
        this.sWord = sWord;
        this.sSimplifiedWord = str_transform.simplifyWord(sWord);
        this.nDistLimit = (nDistLimit >= 0) ? nDistLimit :  Math.floor(sWord.length / 3) + 1;
        this.nMinDist = 1000;
        // Temporary sets
        this.aAllSugg = new Set();  // All suggestions, even the one rejected

        this.dGoodSugg = new Map(); // Acceptable suggestions
        this.dBestSugg = new Map(); // Best suggestions
        // Parameters
        this.nSuggLimit = nSuggLimit;
        this.nSuggLimitExt = nSuggLimit + 2;                // we add few entries in case suggestions merge after casing modifications
        this.nBestSuggLimit = Math.floor(nSuggLimit * 1.5); // n times the requested limit
        this.nGoodSuggLimit = nSuggLimit * 15;              // n times the requested limit
    }

    addSugg (sSugg) {
        // add a suggestion
        if (this.aAllSugg.has(sSugg)) {
            return;
        }
        this.aAllSugg.add(sSugg);


        // jaro 0->1 1 les chaines sont égale
        let nDistJaro = 1 - str_transform.distanceJaroWinkler(this.sSimplifiedWord, str_transform.simplifyWord(sSugg));



        let nDist = Math.floor(nDistJaro * 10);
        if (nDistJaro < .11) {        // Best suggestions
            this.dBestSugg.set(sSugg, Math.round(nDistJaro*1000));
            if (this.dBestSugg.size > this.nBestSuggLimit) {
                this.nDistLimit = -1; // make suggest() to end search
            }


        } else if (nDistJaro < .33) { // Good suggestions
            this.dGoodSugg.set(sSugg, Math.round(nDistJaro*1000));
            if (this.dGoodSugg.size > this.nGoodSuggLimit) {
                this.nDistLimit = -1; // make suggest() to end search
            }


        } else {
            if (nDist < this.nMinDist) {
                this.nMinDist = nDist;
            }
            this.nDistLimit = Math.min(this.nDistLimit, this.nMinDist);
        }
        if (nDist <= this.nDistLimit) {
            if (nDist < this.nMinDist) {
                this.nMinDist = nDist;
            }
            this.nDistLimit = Math.min(this.nDistLimit, this.nMinDist+1);
        }
    }

    getSuggestions () {
        // return a list of suggestions
        let lRes = [];




        if (this.dBestSugg.size > 0) {
            // sort only with simplified words
            let lResTmp = [...this.dBestSugg.entries()].sort((a, b) => { return a[1] - b[1]; });
            let nSize = Math.min(this.nSuggLimitExt, lResTmp.length);
            for (let i=0;  i < nSize;  i++){
                lRes.push(lResTmp[i][0]);
            }


        }
        if (lRes.length < this.nSuggLimitExt) {
            // sort with simplified words and original word
            let lResTmp = [...this.dGoodSugg.entries()].sort((a, b) => {
                // Low precision to rely more on simplified words
                let nJaroA = Math.round(str_transform.distanceJaroWinkler(this.sWord, a[0]) * 10);

                let nJaroB = Math.round(str_transform.distanceJaroWinkler(this.sWord, b[0]) * 10);
                if (nJaroA == nJaroB) {
                    return a[1] - b[1];     // warning: both lists are NOT sorted the same way (key: a-b)
                } else {
                    return nJaroB - nJaroA; // warning: both lists are NOT sorted the same way (key: b-a)
                }
            }).slice(0, this.nSuggLimitExt);
            let nSize = Math.min(this.nSuggLimitExt, lResTmp.length);
            for (let i=0;  i < nSize;  i++){
                lRes.push(lResTmp[i][0]);


            }
        }
        // casing
        if (this.sWord.gl_isUpperCase()) {
            lRes = lRes.map((sSugg) => { return sSugg.toUpperCase(); });
            lRes = [...new Set(lRes)];
        }
        else if (this.sWord.slice(0,1).gl_isUpperCase()) {
            lRes = lRes.map((sSugg) => { return sSugg.slice(0,1).toUpperCase() + sSugg.slice(1); });
            lRes = [...new Set(lRes)];
        }
        return lRes.slice(0, this.nSuggLimit);
    }

    reset () {
        this.dSugg.clear();
        this.dGoodSugg.clear();
        this.dBestSugg.clear();
    }
}


class IBDAWG {
    // INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH

................................................................................
        if (this.lexicographer) {
            [sPfx, sWord, sSfx] = this.lexicographer.split(sWord);
        }
        let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1);
        let nMaxDel = Math.floor(sWord.length / 5);
        let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1);
        let nMaxJump = Math.max(Math.floor(sWord.length / 4), 1);
        let oSuggResult = new SuggResult(sWord, nSuggLimit);
        let sWord = str_transform.cleanWord(sWord);
        if (bSplitTrailingNumbers) {
            this._splitTrailingNumbers(oSuggResult, sWord);
        }
        this._splitSuggest(oSuggResult, sWord);
        this._suggest(oSuggResult, sWord, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump);
        let aSugg = oSuggResult.getSuggestions();
        if (this.lexicographer) {
            aSugg = this.lexicographer.filterSugg(aSugg);
        }
        if (sSfx || sPfx) {
            // we add what we removed
            return aSugg.map( (sSugg) => { return sPfx + sSugg + sSfx; } );
        }

Modified graphspell-js/str_transform.js from [5a573a5745] to [8ec0376c2c].

62
63
64
65
66
67
68













69
70
71
72
73
74
75
...
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
            if (c != sWord.slice(i, i+1) || (c == 'e' && sWord.slice(i, i+2) != "ee")) {  // exception for <e> to avoid confusion between crée / créai
                sNewWord += c;
            }
            i++;
        }
        return sNewWord.replace(/eau/g, "o").replace(/au/g, "o").replace(/ai/g, "éi").replace(/ei/g, "é").replace(/ph/g, "f");
    },














    _xTransNumbersToExponent: new Map([
        ["0", "⁰"], ["1", "¹"], ["2", "²"], ["3", "³"], ["4", "⁴"], ["5", "⁵"], ["6", "⁶"], ["7", "⁷"], ["8", "⁸"], ["9", "⁹"]
    ]),

    numbersToExponent: function (sWord) {
        let sNewWord = "";
................................................................................
        let adjwt = char_player.oDistanceBetweenChars;
        if (minv > Num_com) {
          for (let i = 0; i < a_len; i++) {
            if (!a_flag[i]) {
              for (let j = 0; j < b_len; j++) {
                if (!b_flag[j]) {
                  if (adjwt[a[i]] && adjwt[a[i]][b[j]]) {
                    N_simi += adjwt[a[i]][b[j]] * 10; // le fois 10 est un ajustement pour que ça fonctionne bien dans cette fonction
                    b_flag[j] = 2;
                    break;
                  }
                }
              }
            }
          }







>
>
>
>
>
>
>
>
>
>
>
>
>







 







|







62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
...
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
            if (c != sWord.slice(i, i+1) || (c == 'e' && sWord.slice(i, i+2) != "ee")) {  // exception for <e> to avoid confusion between crée / créai
                sNewWord += c;
            }
            i++;
        }
        return sNewWord.replace(/eau/g, "o").replace(/au/g, "o").replace(/ai/g, "éi").replace(/ei/g, "é").replace(/ph/g, "f");
    },

    cleanWord: function (sWord) {
        // word clean for the user who make commun and preditive error help suggest
        // remove letters repeated more than 2 times
        if (sWord.match(/(.)(\1){2,}/igm)){
            sWord = sWord.replace(/(.*)(.)(.\2)/igm,'$1$2').replace(/(.)(\1)+/igm,'$1$1');
        }
        // words ending with -ik -> replace with -ique
        if (sWord.match(/ik$/ig)){
            sWord = sWord.replace(/(.*)ik$/ig,'$1ique');
        }
        return sWord;
    },

    _xTransNumbersToExponent: new Map([
        ["0", "⁰"], ["1", "¹"], ["2", "²"], ["3", "³"], ["4", "⁴"], ["5", "⁵"], ["6", "⁶"], ["7", "⁷"], ["8", "⁸"], ["9", "⁹"]
    ]),

    numbersToExponent: function (sWord) {
        let sNewWord = "";
................................................................................
        let adjwt = char_player.oDistanceBetweenChars;
        if (minv > Num_com) {
          for (let i = 0; i < a_len; i++) {
            if (!a_flag[i]) {
              for (let j = 0; j < b_len; j++) {
                if (!b_flag[j]) {
                  if (adjwt[a[i]] && adjwt[a[i]][b[j]]) {
                    N_simi += adjwt[a[i]][b[j]];
                    b_flag[j] = 2;
                    break;
                  }
                }
              }
            }
          }