Grammalecte  Changes On Branch spellsugg

Changes In Branch spellsugg Excluding Merge-Ins

This is equivalent to a diff from 8ddc15536e to 6c5050fe91

2017-11-21
16:57
[core] better suggestion engine Closed-Leaf check-in: 6c5050fe91 user: olr tags: core, spellsugg
2017-11-09
11:56
[core] ibdawg: suggestion mechanism > reduce 1toX replacements overload (much, much faster) check-in: 767e396f2d user: olr tags: core, spellsugg
2017-11-08
15:57
[fr] màj: écriture dystypographique check-in: 0bd6a29f03 user: olr tags: fr, trunk
2017-11-07
11:56
[core] ibdawg: another suggestion method check-in: 1edae62ad8 user: olr tags: core, spellsugg
2017-11-06
17:21
[core][py] timer for testing check-in: 8ddc15536e user: olr tags: core, trunk
14:21
[core][py] ibdawg: debugging check-in: cbe1422160 user: olr tags: core, trunk

Modified cli.py from [ed7458a4d3] to [68793c6981].

206
207
208
209
210
211
212

213
214
215
216
217
218
219
                        echo("* " + sWord)
                        for sMorph in oDict.getMorph(sWord):
                            echo("  {:<32} {}".format(sMorph, oLexGraphe.formatTags(sMorph)))
            elif sText.startswith("!"):
                for sWord in sText[1:].strip().split():
                    if sWord:
                        echo(" | ".join(oDict.suggest(sWord)))

            elif sText.startswith(">"):
                oDict.drawPath(sText[1:].strip())
            elif sText.startswith("="):
                for sRes in oDict.select(sText[1:].strip()):
                    echo(sRes)
            elif sText.startswith("/+ "):
                gce.setOptions({ opt:True  for opt in sText[3:].strip().split()  if opt in gce.getOptions() })







>







206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
                        echo("* " + sWord)
                        for sMorph in oDict.getMorph(sWord):
                            echo("  {:<32} {}".format(sMorph, oLexGraphe.formatTags(sMorph)))
            elif sText.startswith("!"):
                for sWord in sText[1:].strip().split():
                    if sWord:
                        echo(" | ".join(oDict.suggest(sWord)))
                        #echo(" | ".join(oDict.suggest2(sWord)))
            elif sText.startswith(">"):
                oDict.drawPath(sText[1:].strip())
            elif sText.startswith("="):
                for sRes in oDict.select(sText[1:].strip()):
                    echo(sRes)
            elif sText.startswith("/+ "):
                gce.setOptions({ opt:True  for opt in sText[3:].strip().split()  if opt in gce.getOptions() })

Modified gc_core/js/char_player.js from [f5b2267e7d] to [0547b59e35].

22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
...
223
224
225
226
227
228
229
230
231
232
233


234
235
236
237
238
239
240
241
242
243
244
245
246
...
262
263
264
265
266
267
268







269
270
271
272
273
274
275
        let sRes = "";
        for (let c of sWord) {
            sRes += this._dTransChars.gl_get(c, c);
        }
        return sRes.replace("eau", "o").replace("au", "o");
    },

    distanceDamerauLevenshtein: function (s1, s2) {
        // distance of Damerau-Levenshtein between <s1> and <s2>
        // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
        try {
            let nLen1 = s1.length;
            let nLen2 = s2.length;
            let matrix = [];
            for (let i = 0;  i <= nLen1;  i++) {
                matrix[i] = new Array(nLen2 + 1);
            }
            for (let i = 0;  i <= nLen1;  i++) {
                matrix[i][0] = i;
            }
            for (let j = 0;  j <= nLen2;  j++) {
                matrix[0][j] = j;
            }
            for (let i = 1;  i <= nLen1;  i++) {
                for (let j = 1;  j <= nLen2;  j++) {
                    let nCost = (s1[i] === s2[j]) ? 0 : 1;
                    matrix[i][j] = Math.min(
                        matrix[i-1][j] + 1,         // Deletion
                        matrix[i][j-1] + 1,         // Insertion
                        matrix[i-1][j-1] + nCost    // Substitution
                    );
                    if (i > 1 && j > 1 && s1[i] == s2[j-1] && s1[i-1] == s2[j]) {
                        matrix[i][j] = Math.min(matrix[i][j], matrix[i-2][j-2] + nCost);  // Transposition
                    }
                }
            }
            //console.log(s2 + ": " + matrix[nLen1][nLen2]);
            return matrix[nLen1][nLen2];
        }
        catch (e) {
            helpers.logerror(e);
        }
    },

    showDistance (s1, s2) {
        let s1b = this.cleanWord(s1);
        let s2b = this.cleanWord(s2);
        console.log(`Distance: ${s1} / ${s2} = ${this.distanceDamerauLevenshtein(s1, s2)})`);
        console.log(`Distance: ${s1b} / ${s2b} = ${this.distanceDamerauLevenshtein(s1b, s2b)})`);
    },

    // Method: Remove Useless Chars

    aVovels: new Set([
        'a', 'e', 'i', 'o', 'u', 'y',
        'à', 'é', 'î', 'ô', 'û', 'ÿ',
        'â', 'è', 'ï', 'ö', 'ù', 'ŷ',
        'ä', 'ê', 'í', 'ó', 'ü', 'ý',
        'á', 'ë', 'ì', 'ò', 'ú', 'ỳ',
        'ā', 'ē', 'ī', 'ō', 'ū', 'ȳ',
        'h', 'œ', 'æ'
    ]),

    shrinkWord: function (sWord) {
        // remove vovels and h
        let sRes = "";
        for (let cChar of sWord.slice(1)) {
            if (!this.aVovels.has(cChar)) {
                sRes += cChar;
            }
        }
        return sWord.slice(0, 1).replace("h", "") + sRes;
    },


    // Similar chars

    d1to1: new Map([
        ["1", "liîLIÎ"],
        ["2", "zZ"],
................................................................................
    d1toX: new Map([
        ["æ", ["ae",]],
        ["Æ", ["AE",]],
        ["b", ["bb",]],
        ["B", ["BB",]],
        ["c", ["cc", "ss", "qu", "ch"]],
        ["C", ["CC", "SS", "QU", "CH"]],
        ["ç", ["ss", "cc", "qh", "ch"]],
        ["Ç", ["SS", "CC", "QH", "CH"]],
        ["d", ["dd",]],
        ["D", ["DD",]],


        ["f", ["ff", "ph"]],
        ["F", ["FF", "PH"]],
        ["g", ["gu", "ge", "gg", "gh"]],
        ["G", ["GU", "GE", "GG", "GH"]],
        ["i", ["ii",]],
        ["I", ["II",]],
        ["j", ["jj", "dj"]],
        ["J", ["JJ", "DJ"]],
        ["k", ["qu", "ck", "ch", "cu", "kk", "kh"]],
        ["K", ["QU", "CK", "CH", "CU", "KK", "KH"]],
        ["l", ["ll",]],
        ["L", ["LL",]],
        ["m", ["mm", "mn"]],
................................................................................
        ["t", ["tt", "th"]],
        ["T", ["TT", "TH"]],
        ["x", ["cc", "ct", "xx"]],
        ["X", ["CC", "CT", "XX"]],
        ["z", ["ss", "zh"]],
        ["Z", ["SS", "ZH"]],
    ]),








    d2toX: new Map([
        ["an", ["en",]],
        ["AN", ["EN",]],
        ["au", ["eau", "o", "ô"]],
        ["AU", ["EAU", "O", "Ô"]],
        ["en", ["an",]],







|
|
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







 







<
<


>
>




<
<







 







>
>
>
>
>
>
>







22
23
24
25
26
27
28
29
30
31































































32
33
34
35
36
37
38
...
160
161
162
163
164
165
166


167
168
169
170
171
172
173
174


175
176
177
178
179
180
181
...
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
        let sRes = "";
        for (let c of sWord) {
            sRes += this._dTransChars.gl_get(c, c);
        }
        return sRes.replace("eau", "o").replace("au", "o");
    },

    aVowel: new Set("aáàâäāeéèêëēiíìîïīoóòôöōuúùûüūyýỳŷÿȳœæAÁÀÂÄĀEÉÈÊËĒIÍÌÎÏĪOÓÒÔÖŌUÚÙÛÜŪYÝỲŶŸȲŒÆ"),
    aConsonant: new Set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ"),
    aDouble: new Set("bcçdfjklmnprstzBCÇDFJKLMNPRSTZ"),  // letters that may be used twice successively

































































    // Similar chars

    d1to1: new Map([
        ["1", "liîLIÎ"],
        ["2", "zZ"],
................................................................................
    d1toX: new Map([
        ["æ", ["ae",]],
        ["Æ", ["AE",]],
        ["b", ["bb",]],
        ["B", ["BB",]],
        ["c", ["cc", "ss", "qu", "ch"]],
        ["C", ["CC", "SS", "QU", "CH"]],


        ["d", ["dd",]],
        ["D", ["DD",]],
        ["é", ["ai", "ei"]],
        ["É", ["AI", "EI"]],
        ["f", ["ff", "ph"]],
        ["F", ["FF", "PH"]],
        ["g", ["gu", "ge", "gg", "gh"]],
        ["G", ["GU", "GE", "GG", "GH"]],


        ["j", ["jj", "dj"]],
        ["J", ["JJ", "DJ"]],
        ["k", ["qu", "ck", "ch", "cu", "kk", "kh"]],
        ["K", ["QU", "CK", "CH", "CU", "KK", "KH"]],
        ["l", ["ll",]],
        ["L", ["LL",]],
        ["m", ["mm", "mn"]],
................................................................................
        ["t", ["tt", "th"]],
        ["T", ["TT", "TH"]],
        ["x", ["cc", "ct", "xx"]],
        ["X", ["CC", "CT", "XX"]],
        ["z", ["ss", "zh"]],
        ["Z", ["SS", "ZH"]],
    ]),

    get1toXReplacement: function (cPrev, cCur, cNext) {
        if (this.aConsonant.has(cCur)  &&  (this.aConsonant.has(cPrev)  ||  this.aConsonant.has(cNext))) {
            return [];
        }
        return this.d1toX.gl_get(cCur, []);
    },

    d2toX: new Map([
        ["an", ["en",]],
        ["AN", ["EN",]],
        ["au", ["eau", "o", "ô"]],
        ["AU", ["EAU", "O", "Ô"]],
        ["en", ["an",]],

Modified gc_core/js/ibdawg.js from [c87880e4ad] to [b7bc0fa0c9].

13
14
15
16
17
18
19































































20
21
22
23
24
25
26
...
189
190
191
192
193
194
195

196
197
198


199
200

201
202
203

204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237

238
239







240

241
242
243
244
245
246

247

248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286









287
288
289
290
291
292
293
294


// Don’t remove <string>. Necessary in TB.
${string}
${map}
${set}

































































class IBDAWG {
    // INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH

    constructor (sDicName, sPath="") {
        try {
            let sURL = (sPath !== "") ? sPath + "/" + sDicName : "resource://grammalecte/_dictionaries/"+sDicName;
................................................................................
    }

    suggest (sWord, nMaxSugg=10) {
        // returns a array of suggestions for <sWord>
        let sPfx = "";
        let sSfx = "";
        [sPfx, sWord, sSfx] = char_player.cut(sWord);

        let nMaxDel = Math.floor(sWord.length / 5);
        let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1);
        let aSugg = this._suggest(sWord, nMaxDel, nMaxHardRepl);


        if (sWord.gl_isTitle()) {
            aSugg.gl_update(this._suggest(sWord.toLowerCase(), nMaxDel, nMaxHardRepl));

        }
        else if (sWord.gl_isLowerCase()) {
            aSugg.gl_update(this._suggest(sWord.gl_toCapitalize(), nMaxDel, nMaxHardRepl));

        }
        // Set to Array
        aSugg = Array.from(aSugg);
        aSugg = char_player.filterSugg(aSugg);
        if (sWord.gl_isTitle()) {
            aSugg = aSugg.map((sSugg) => { return sSugg.gl_toCapitalize(); });
        }
        let dDistTemp = new Map();
        let sCleanWord = char_player.cleanWord(sWord);
        aSugg.forEach((sSugg) => { dDistTemp.set(sSugg, char_player.distanceDamerauLevenshtein(sCleanWord, char_player.cleanWord(sSugg))); });
        aSugg = aSugg.sort((sA, sB) => { return dDistTemp.get(sA) - dDistTemp.get(sB); }).slice(0, nMaxSugg);
        dDistTemp.clear();
        if (sSfx || sPfx) {
            // we add what we removed
            return aSugg.map( (sSugg) => { return sPfx + sSugg + sSfx } );
        }
        return aSugg;
    }

    _suggest (sRemain, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=false) {
        // returns a set of suggestions
        // recursive function
        let aSugg = new Set();
        if (sRemain == "") {
            if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) {
                aSugg.add(sNewWord);
            }
            for (let sTail of this._getTails(iAddr)) {
                aSugg.add(sNewWord+sTail);
            }
            return aSugg;
        }
        let cCurrent = sRemain.slice(0, 1);
        for (let [cChar, jAddr] of this._getSimilarArcs(cCurrent, iAddr)) {

            aSugg.gl_update(this._suggest(sRemain.slice(1), nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar));
        }







        if (!bAvoidLoop) { // avoid infinite loop

            if (cCurrent == sRemain.slice(1, 2)) {
                // same char, we remove 1 char without adding 1 to <sNewWord>
                aSugg.gl_update(this._suggest(sRemain.slice(1), nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord));
            }
            else {
                // switching chars

                aSugg.gl_update(this._suggest(sRemain.slice(1, 2)+sRemain.slice(0, 1)+sRemain.slice(2), nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true));

                // delete char
                if (nMaxDel > 0) {
                    aSugg.gl_update(this._suggest(sRemain.slice(1), nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true));
                }
            }
            // Phonetic replacements
            for (let sRepl of char_player.d1toX.gl_get(cCurrent, [])) {
                aSugg.gl_update(this._suggest(sRepl + sRemain.slice(1), nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true));
            }
            for (let sRepl of char_player.d2toX.gl_get(sRemain.slice(0, 2), [])) {
                aSugg.gl_update(this._suggest(sRepl + sRemain.slice(2), nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true));
            }
            // Hard replacements
            if (nDeep > 3 && nMaxHardRepl && sRemain.length >= 2) {
                for (let [nVal, kAddr] of this._getArcs1(iAddr)) {
                    if (this.dCharVal.has(nVal)) {
                        let cChar = this.dCharVal.get(nVal);
                        if (!char_player.d1to1.gl_get(cCurrent, "").includes(cChar)) {
                            aSugg.gl_update(this._suggest(sRemain.slice(1), nMaxDel, nMaxHardRepl-1, nDeep+1, kAddr, sNewWord+cChar, true));
                        }
                    }
                }
            }
            // end of word
            if (sRemain.length == 2) {
                for (let sRepl of char_player.dFinal2.gl_get(sRemain, [])) {
                    aSugg.gl_update(this._suggest(sRepl, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true));
                }
            }
            else if (sRemain.length == 1) {
                aSugg.gl_update(this._suggest("", nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true)); // remove last char and go on
                for (let sRepl of char_player.dFinal1.gl_get(sRemain, [])) {
                    aSugg.gl_update(this._suggest(sRepl, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true));
                }
            }
        }
        return aSugg;
    }










    * _getSimilarArcs (cChar, iAddr) {
        // generator: yield similar char of <cChar> and address of the following node
        for (let c of char_player.d1to1.gl_get(cChar, [cChar])) {
            if (this.dChar.has(c)) {
                let jAddr = this._lookupArcNode(this.dChar.get(c), iAddr);
                if (jAddr) {
                    yield [c, jAddr];
                }







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







>


<
>
>

<
>


<
>

<
<
|
<
<
<
<
<
<
<
<







|


<


|


|

|


|
>
|
|
>
>
>
>
>
>
>

>
|
|
|
|
|
|
>
|
>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<
<
|
|


|




|



|

|



<


>
>
>
>
>
>
>
>
>
|







13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
...
252
253
254
255
256
257
258
259
260
261

262
263
264

265
266
267

268
269


270








271
272
273
274
275
276
277
278
279
280

281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327


328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346

347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365


// Don’t remove <string>. Necessary in TB.
${string}
${map}
${set}


class SuggResult {
    // Structure for storing, classifying and filtering suggestions

    constructor (sWord, nDistLimit=-1) {
        this.sWord = sWord;
        this.sCleanWord = char_player.cleanWord(sWord);
        this.nDistLimit = (nDistLimit >= 0) ? nDistLimit :  Math.floor(sWord.length / 3) + 1;
        this.nMinDist = 1000;
        this.aSugg = new Set();
        this.dSugg = new Map([ [0, []],  [1, []] ]);
    }

    addSugg (sSugg, nDeep=0) {
        // add a suggestion
        if (!this.aSugg.has(sSugg)) {
            let nDist = str_transform.distanceDamerauLevenshtein(this.sCleanWord, char_player.cleanWord(sSugg));
            if (nDist <= this.nDistLimit) {
                if (!this.dSugg.has(nDist)) {
                    this.dSugg.set(nDist, []);
                }
                this.dSugg.get(nDist).push(sSugg);
                this.aSugg.add(sSugg);
                if (nDist < this.nMinDist) {
                    this.nMinDist = nDist;
                }
                this.nDistLimit = Math.min(this.nDistLimit, this.nMinDist+2);
            }
        }
    }

    getSuggestions (nSuggLimit=10, nDistLimit=-1) {
        // return a list of suggestions
        let lRes = [];
        if (this.dSugg.get(0).length) {
            // we sort the better results with the original word
            let dDistTemp = new Map();
            lRes.forEach((sSugg) => { dDistTemp.set(sSugg, str_transform.distanceDamerauLevenshtein(this.sWord, sSugg)); });
            lRes = lRes.sort((sA, sB) => { return dDistTemp.get(sA) - dDistTemp.get(sB); });
            dDistTemp.clear();
        }
        for (let lSugg of this.dSugg.values()) {
            for (let sSugg of lSugg) { lRes.push(sSugg); }
            if (lRes.length > nSuggLimit) {
                break;
            }
        }
        lRes = char_player.filterSugg(lRes);
        if (this.sWord.gl_isTitle()) {
            lRes = lRes.map((sSugg) => { return sSugg.gl_toCapitalize(); });
        }
        else if (this.sWord.gl_isUpperCase()) {
            lRes = lRes.map((sSugg) => { return sSugg.toUpperCase(); });
        }
        return lRes.slice(0, nSuggLimit);
    }

    reset () {
        this.aSugg.clear();
        this.dSugg.clear();
    }
}


class IBDAWG {
    // INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH

    constructor (sDicName, sPath="") {
        try {
            let sURL = (sPath !== "") ? sPath + "/" + sDicName : "resource://grammalecte/_dictionaries/"+sDicName;
................................................................................
    }

    suggest (sWord, nMaxSugg=10) {
        // returns a array of suggestions for <sWord>
        let sPfx = "";
        let sSfx = "";
        [sPfx, sWord, sSfx] = char_player.cut(sWord);
        let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1);
        let nMaxDel = Math.floor(sWord.length / 5);
        let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1);

        let oSuggResult = new SuggResult(sWord);
        this._suggest(oSuggResult, sWord, nMaxSwitch, nMaxDel, nMaxHardRepl);
        if (sWord.gl_isTitle()) {

            this._suggest(oSuggResult, sWord.toLowerCase(), nMaxSwitch, nMaxDel, nMaxHardRepl);
        }
        else if (sWord.gl_isLowerCase()) {

            this._suggest(oSuggResult, sWord.gl_toCapitalize(), nMaxSwitch, nMaxDel, nMaxHardRepl);
        }


        let aSugg = oSuggResult.getSuggestions();








        if (sSfx || sPfx) {
            // we add what we removed
            return aSugg.map( (sSugg) => { return sPfx + sSugg + sSfx } );
        }
        return aSugg;
    }

    _suggest (oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=false) {
        // returns a set of suggestions
        // recursive function

        if (sRemain == "") {
            if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) {
                oSuggResult.addSugg(sNewWord);
            }
            for (let sTail of this._getTails(iAddr)) {
                oSuggResult.addSugg(sNewWord+sTail);
            }
            return;
        }
        let cCurrent = sRemain.slice(0, 1);
        for (let [cChar, jAddr] of this._getCharArcs(iAddr)) {
            if (char_player.d1to1.gl_get(cCurrent, [cCurrent]).includes(cChar)) {
                this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar);
            }
            else if (!bAvoidLoop && nMaxHardRepl) {
                this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl-1, nDeep+1, jAddr, sNewWord+cChar, true);
            }
        }
        /*for (let [cChar, jAddr] of this._getSimilarCharArcs(cCurrent, iAddr)) {
            this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar);
        }*/
        if (!bAvoidLoop) { // avoid infinite loop
            if (sRemain.length > 1) {
                if (cCurrent == sRemain.slice(1, 2)) {
                    // same char, we remove 1 char without adding 1 to <sNewWord>
                    this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord);
                }
                else {
                    // switching chars
                    if (nMaxSwitch > 0) {
                        this._suggest(oSuggResult, sRemain.slice(1, 2)+sRemain.slice(0, 1)+sRemain.slice(2), nMaxSwitch-1, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true);
                    }
                    // delete char
                    if (nMaxDel > 0) {
                        this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true);
                    }
                }
                // Phonetic replacements
                for (let sRepl of char_player.get1toXReplacement(sNewWord.slice(-1), cCurrent, sRemain.slice(1,2))) {
                    this._suggest(oSuggResult, sRepl + sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true);
                }
                for (let sRepl of char_player.d2toX.gl_get(sRemain.slice(0, 2), [])) {
                    this._suggest(oSuggResult, sRepl + sRemain.slice(2), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true);
                }
                // Hard replacements
                /*if (nDeep > 3 && nMaxHardRepl && sRemain.length >= 2) {
                    for (let [cChar, kAddr] of this._getCharArcs(iAddr)) {


                        if (!char_player.d1to1.gl_get(cCurrent, [cCurrent]).includes(cChar)) {
                            this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl-1, nDeep+1, kAddr, sNewWord+cChar, true);
                        }
                    }
                }*/
            }
            // end of word
            if (sRemain.length == 2) {
                for (let sRepl of char_player.dFinal2.gl_get(sRemain, [])) {
                    this._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true);
                }
            }
            else if (sRemain.length == 1) {
                this._suggest(oSuggResult, "", nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true); // remove last char and go on
                for (let sRepl of char_player.dFinal1.gl_get(sRemain, [])) {
                    this._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true);
                }
            }
        }

    }

    * _getCharArcs (iAddr) {
        // generator: yield all chars and addresses from node at address <iAddr>
        for (let [nVal, jAddr] of this._getArcs(iAddr)) {
            if (nVal < this.nChar) {
                yield [this.dCharVal.get(nVal), jAddr];
            }
        }
    }

    * _getSimilarCharArcs (cChar, iAddr) {
        // generator: yield similar char of <cChar> and address of the following node
        for (let c of char_player.d1to1.gl_get(cChar, [cChar])) {
            if (this.dChar.has(c)) {
                let jAddr = this._lookupArcNode(this.dChar.get(c), iAddr);
                if (jAddr) {
                    yield [c, jAddr];
                }

Modified gc_core/js/str_transform.js from [6745507121] to [e70f6ede55].

1
2
3
4
5
6










































7
8
9
10
11
12
13
//// STRING TRANSFORMATION
/*jslint esversion: 6*/

// Note: 48 is the ASCII code for "0"

var str_transform = {










































    getStemFromSuffixCode: function (sFlex, sSfxCode) {
        // Suffix only
        if (sSfxCode == "0") {
            return sFlex;
        }
        return sSfxCode[0] == '0' ? sFlex + sSfxCode.slice(1) : sFlex.slice(0, -(sSfxCode.charCodeAt(0)-48)) + sSfxCode.slice(1);
    },






>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
//// STRING TRANSFORMATION
/*jslint esversion: 6*/

// Note: 48 is the ASCII code for "0"

var str_transform = {

    distanceDamerauLevenshtein: function (s1, s2) {
        // distance of Damerau-Levenshtein between <s1> and <s2>
        // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
        try {
            let nLen1 = s1.length;
            let nLen2 = s2.length;
            let matrix = [];
            for (let i = 0;  i <= nLen1;  i++) {
                matrix[i] = new Array(nLen2 + 1);
            }
            for (let i = 0;  i <= nLen1;  i++) {
                matrix[i][0] = i;
            }
            for (let j = 0;  j <= nLen2;  j++) {
                matrix[0][j] = j;
            }
            for (let i = 1;  i <= nLen1;  i++) {
                for (let j = 1;  j <= nLen2;  j++) {
                    let nCost = (s1[i] === s2[j]) ? 0 : 1;
                    matrix[i][j] = Math.min(
                        matrix[i-1][j] + 1,         // Deletion
                        matrix[i][j-1] + 1,         // Insertion
                        matrix[i-1][j-1] + nCost    // Substitution
                    );
                    if (i > 1 && j > 1 && s1[i] == s2[j-1] && s1[i-1] == s2[j]) {
                        matrix[i][j] = Math.min(matrix[i][j], matrix[i-2][j-2] + nCost);  // Transposition
                    }
                }
            }
            //console.log(s2 + ": " + matrix[nLen1][nLen2]);
            return matrix[nLen1][nLen2];
        }
        catch (e) {
            helpers.logerror(e);
        }
    },

    showDistance (s1, s2) {
        console.log(`Distance: ${s1} / ${s2} = ${this.distanceDamerauLevenshtein(s1, s2)})`);
    },

    getStemFromSuffixCode: function (sFlex, sSfxCode) {
        // Suffix only
        if (sSfxCode == "0") {
            return sFlex;
        }
        return sSfxCode[0] == '0' ? sFlex + sSfxCode.slice(1) : sFlex.slice(0, -(sSfxCode.charCodeAt(0)-48)) + sSfxCode.slice(1);
    },

Modified gc_core/py/char_player.py from [97f69bd70d] to [b0152aab01].

15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
...
201
202
203
204
205
206
207
208
209
210
211


212
213
214
215
216
217
218
219
220
221
222
223
224
...
240
241
242
243
244
245
246







247
248
249
250
251
252
253
})

def cleanWord (sWord):
    "word simplication before calculating distance between words"
    return sWord.lower().translate(_xTransChars).replace("eau", "o").replace("au", "o")


def distanceDamerauLevenshtein (s1, s2):
    "distance of Damerau-Levenshtein between <s1> and <s2>"
    # https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
    d = {}
    nLen1 = len(s1)
    nLen2 = len(s2)
    for i in range(-1, nLen1+1):
        d[i, -1] = i + 1
    for j in range(-1, nLen2+1):
        d[-1, j] = j + 1
    for i in range(nLen1):
        for j in range(nLen2):
            nCost = 0  if s1[i] == s2[j]  else 1
            d[i, j] = min(
                d[i-1, j]   + 1,        # Deletion
                d[i,   j-1] + 1,        # Insertion
                d[i-1, j-1] + nCost,    # Substitution
            )
            if i and j and s1[i] == s2[j-1] and s1[i-1] == s2[j]:
                d[i, j] = min(d[i, j], d[i-2, j-2] + nCost)     # Transposition
    return d[nLen1-1, nLen2-1]


def showDistance (s1, s2):
    s1b = cleanWord(s1);
    s2b = cleanWord(s2);
    print(f"Distance: {s1} / {s2} = {distanceDamerauLevenshtein(s1, s2)}")
    print(f"Distance: {s1b} / {s2b} = {distanceDamerauLevenshtein(s1b, s2b)}")


# Method: Remove Useless Chars

_dVovels = {
    'a': '',  'e': '',  'i': '',  'o': '',  'u': '',  'y': '',
    'à': '',  'é': '',  'î': '',  'ô': '',  'û': '',  'ÿ': '',
    'â': '',  'è': '',  'ï': '',  'ö': '',  'ù': '',  'ŷ': '',
    'ä': '',  'ê': '',  'í': '',  'ó': '',  'ü': '',  'ý': '',
    'á': '',  'ë': '',  'ì': '',  'ò': '',  'ú': '',  'ỳ': '',
    'ā': '',  'ē': '',  'ī': '',  'ō': '',  'ū': '',  'ȳ': '',
    'h': '',  'œ': '',  'æ': ''
 }

_xTransVovels = str.maketrans(_dVovels)


aVovels = frozenset(_dVovels.keys())


def shrinkWord (sWord):
    "remove vovels and h"
    return sWord[0:1].replace("h", "") + sWord[1:].translate(_xTransVovels)


# Similar chars

d1to1 = {
    "1": "liîLIÎ",
    "2": "zZ",
................................................................................
d1toX = {
    "æ": ("ae",),
    "Æ": ("AE",),
    "b": ("bb",),
    "B": ("BB",),
    "c": ("cc", "ss", "qu", "ch"),
    "C": ("CC", "SS", "QU", "CH"),
    "ç": ("ss", "cc", "qh", "ch"),
    "Ç": ("SS", "CC", "QH", "CH"),
    "d": ("dd",),
    "D": ("DD",),


    "f": ("ff", "ph"),
    "F": ("FF", "PH"),
    "g": ("gu", "ge", "gg", "gh"),
    "G": ("GU", "GE", "GG", "GH"),
    "i": ("ii",),
    "I": ("II",),
    "j": ("jj", "dj"),
    "J": ("JJ", "DJ"),
    "k": ("qu", "ck", "ch", "cu", "kk", "kh"),
    "K": ("QU", "CK", "CH", "CU", "KK", "KH"),
    "l": ("ll",),
    "L": ("LL",),
    "m": ("mm", "mn"),
................................................................................
    "t": ("tt", "th"),
    "T": ("TT", "TH"),
    "x": ("cc", "ct", "xx"),
    "X": ("CC", "CT", "XX"),
    "z": ("ss", "zh"),
    "Z": ("SS", "ZH"),
}








d2toX = {
    "an": ("en",),
    "AN": ("EN",),
    "au": ("eau", "o", "ô"),
    "AU": ("EAU", "O", "Ô"),
    "en": ("an",),







|
|
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







 







<
<


>
>




<
<







 







>
>
>
>
>
>
>







15
16
17
18
19
20
21
22
23
24
















































25
26
27
28
29
30
31
...
153
154
155
156
157
158
159


160
161
162
163
164
165
166
167


168
169
170
171
172
173
174
...
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
})

def cleanWord (sWord):
    "word simplication before calculating distance between words"
    return sWord.lower().translate(_xTransChars).replace("eau", "o").replace("au", "o")


aVowel = set("aáàâäāeéèêëēiíìîïīoóòôöōuúùûüūyýỳŷÿȳœæAÁÀÂÄĀEÉÈÊËĒIÍÌÎÏĪOÓÒÔÖŌUÚÙÛÜŪYÝỲŶŸȲŒÆ")
aConsonant = set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ")
aDouble = set("bcçdfjklmnprstzBCÇDFJKLMNPRSTZ")  # letters that may be used twice successively


















































# Similar chars

d1to1 = {
    "1": "liîLIÎ",
    "2": "zZ",
................................................................................
d1toX = {
    "æ": ("ae",),
    "Æ": ("AE",),
    "b": ("bb",),
    "B": ("BB",),
    "c": ("cc", "ss", "qu", "ch"),
    "C": ("CC", "SS", "QU", "CH"),


    "d": ("dd",),
    "D": ("DD",),
    "é": ("ai", "ei"),
    "É": ("AI", "EI"),
    "f": ("ff", "ph"),
    "F": ("FF", "PH"),
    "g": ("gu", "ge", "gg", "gh"),
    "G": ("GU", "GE", "GG", "GH"),


    "j": ("jj", "dj"),
    "J": ("JJ", "DJ"),
    "k": ("qu", "ck", "ch", "cu", "kk", "kh"),
    "K": ("QU", "CK", "CH", "CU", "KK", "KH"),
    "l": ("ll",),
    "L": ("LL",),
    "m": ("mm", "mn"),
................................................................................
    "t": ("tt", "th"),
    "T": ("TT", "TH"),
    "x": ("cc", "ct", "xx"),
    "X": ("CC", "CT", "XX"),
    "z": ("ss", "zh"),
    "Z": ("SS", "ZH"),
}


def get1toXReplacement (cPrev, cCur, cNext):
    if cCur in aConsonant  and  (cPrev in aConsonant  or  cNext in aConsonant):
        return ()
    return d1toX.get(cCur, ())


d2toX = {
    "an": ("en",),
    "AN": ("EN",),
    "au": ("eau", "o", "ô"),
    "AU": ("EAU", "O", "Ô"),
    "en": ("an",),

Modified gc_core/py/ibdawg.py from [a0b40e49a1] to [dcdab2db72].

4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
..
23
24
25
26
27
28
29















































30
31
32
33
34
35
36
...
200
201
202
203
204
205
206

207
208

209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231

232
233
234
235
236





237
238
239

240
241
242
243
244
245


246
247
248
249
250

251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268












269
270



















271
272
273
274
275
276
277







278
279
280
281
282
283
284
...
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
import traceback
import pkgutil
import re
from itertools import chain
from functools import wraps
import time

#import logging
#logging.basicConfig(filename="suggestions.log", level=logging.DEBUG)

from . import str_transform as st
from . import char_player as cp
from .echo import echo


def timethis (func):
................................................................................
        fStart = time.time()
        result = func(*args, **kwargs)
        fEnd = time.time()
        print(func.__name__, fEnd - fStart)
        return result
    return wrapper

















































class IBDAWG:
    """INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH"""

    def __init__ (self, sDicName):
        self.by = pkgutil.get_data(__package__, "_dictionaries/" + sDicName)
        if not self.by:
................................................................................
                l.extend(self.morph(sWord.capitalize()))
        return l

    @timethis
    def suggest (self, sWord, nMaxSugg=10):
        "returns a set of suggestions for <sWord>"
        sPfx, sWord, sSfx = cp.cut(sWord)

        nMaxDel = len(sWord) // 5
        nMaxHardRepl = max((len(sWord) - 5) // 4, 1)

        aSugg = self._suggest(sWord, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl)
        if sWord.istitle():
            aSugg.update(self._suggest(sWord.lower(), nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl))
            aSugg = set(map(lambda sSugg: sSugg.title(), aSugg))
        elif sWord.islower():
            aSugg.update(self._suggest(sWord.title(), nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl))
        aSugg = cp.filterSugg(aSugg)
        sCleanWord = cp.cleanWord(sWord)
        aSugg = sorted(aSugg, key=lambda sSugg: cp.distanceDamerauLevenshtein(sCleanWord, cp.cleanWord(sSugg)))[:nMaxSugg]
        if sSfx or sPfx:
            # we add what we removed
            return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg))
        return aSugg

    def _suggest (self, sRemain, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False):
        "returns a set of suggestions"
        # recursive function
        #logging.info((nDeep * "  ") + sNewWord + ":" + sRemain)
        aSugg = set()
        if not sRemain:
            if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
                #logging.info((nDeep * "  ") + "__" + sNewWord + "__")
                aSugg.add(sNewWord)

            for sTail in self._getTails(iAddr):
                #logging.info((nDeep * "  ") + "__" + sNewWord+sTail + "__")
                aSugg.add(sNewWord+sTail)
            return aSugg
        cCurrent = sRemain[0:1]





        for cChar, jAddr in self._getSimilarArcs(cCurrent, iAddr):
            aSugg.update(self._suggest(sRemain[1:], nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar))
        if not bAvoidLoop: # avoid infinite loop

            if cCurrent == sRemain[1:2]:
                # same char, we remove 1 char without adding 1 to <sNewWord>
                aSugg.update(self._suggest(sRemain[1:], nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord))
            else:
                # switching chars
                aSugg.update(self._suggest(sRemain[1:2]+sRemain[0:1]+sRemain[2:], nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True))


                # delete char
                if nMaxDel > 0:
                    aSugg.update(self._suggest(sRemain[1:], nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True))
            # Phonetic replacements
            for sRepl in cp.d1toX.get(cCurrent, ()):

                aSugg.update(self._suggest(sRepl + sRemain[1:], nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True))
            for sRepl in cp.d2toX.get(sRemain[0:2], ()):
                aSugg.update(self._suggest(sRepl + sRemain[2:], nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True))
            # Hard replacements
            if nDeep > 3 and nMaxHardRepl and len(sRemain) >= 2:
                for nVal, kAddr in self._getArcs1(iAddr):
                    if nVal in self.dCharVal:
                        cChar = self.dCharVal[nVal]
                        if cChar not in cp.d1to1.get(cCurrent, ""):
                            aSugg.update(self._suggest(sRemain[1:], nMaxDel, nMaxHardRepl-1, nDeep+1, kAddr, sNewWord+cChar, True))
            # end of word
            if len(sRemain) == 2:
                for sRepl in cp.dFinal2.get(sRemain, ()):
                    aSugg.update(self._suggest(sRepl, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True))
            elif len(sRemain) == 1:
                aSugg.update(self._suggest("", nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True)) # remove last char and go on
                for sRepl in cp.dFinal1.get(sRemain, ()):
                    aSugg.update(self._suggest(sRepl, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, True))












        return aSugg




















    def _getSimilarArcs (self, cChar, iAddr):
        "generator: yield similar char of <cChar> and address of the following node"
        for c in cp.d1to1.get(cChar, [cChar]):
            if c in self.dChar:
                jAddr = self._lookupArcNode(self.dChar[c], iAddr)
                if jAddr:
                    yield (c, jAddr)








    def _getTails (self, iAddr, sTail="", n=2):
        "return a list of suffixes ending at a distance of <n> from <iAddr>"
        aTails = set()
        for nVal, jAddr in self._getArcs(iAddr):
            if nVal < self.nChar:
                if int.from_bytes(self.byDic[jAddr:jAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
................................................................................
                    aTails.add(sTail + self.dCharVal[nVal])
                if n and not aTails:
                    aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1))
        return aTails

    def drawPath (self, sWord, iAddr=0):
        "show the path taken by <sWord> in the graph"
        cChar = sWord[0:1]  if sWord  else " "
        iPos = -1
        n = 0
        print(cChar + ": ", end="")
        for nVal, jAddr in self._getArcs(iAddr):
            if nVal in self.dCharVal:
                print(self.dCharVal[nVal], end="")
                if self.dCharVal[nVal] == sWord[0:1]:
                    iNextNodeAddr = jAddr
                    iPos = n
                n += 1
        if not sWord:
            return
        if iPos >= 0:
            print("\n   "+ " " * iPos + "|")
            self.drawPath(sWord[1:], iNextNodeAddr)

    def select (self, sPattern=""):







|
|







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







>


>
|

|
<

|
<
|
<





|
<

|
<


<
<
>

<
|
|

>
>
>
>
>
|
|

>
|
|
|
|
|
<
>
>
|
|
|
|
<
>
|
|
|
|
|
|
<
<
|
|



|

|

|
>
>
>
>
>
>
>
>
>
>
>
>


>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|






>
>
>
>
>
>
>







 







|


|
|
|
<
|
|
|
|







4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
..
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
...
247
248
249
250
251
252
253
254
255
256
257
258
259
260

261
262

263

264
265
266
267
268
269

270
271

272
273


274
275

276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292

293
294
295
296
297
298

299
300
301
302
303
304
305


306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
...
370
371
372
373
374
375
376
377
378
379
380
381
382

383
384
385
386
387
388
389
390
391
392
393
import traceback
import pkgutil
import re
from itertools import chain
from functools import wraps
import time

import logging
logging.basicConfig(filename="suggestions.log", level=logging.DEBUG)

from . import str_transform as st
from . import char_player as cp
from .echo import echo


def timethis (func):
................................................................................
        fStart = time.time()
        result = func(*args, **kwargs)
        fEnd = time.time()
        print(func.__name__, fEnd - fStart)
        return result
    return wrapper


class SuggResult:
    """Structure for storing, classifying and filtering suggestions"""

    def __init__ (self, sWord, nDistLimit=-1):
        self.sWord = sWord
        self.sCleanWord = cp.cleanWord(sWord)
        self.nDistLimit = nDistLimit  if nDistLimit >= 0  else  (len(sWord) // 3) + 1
        self.nMinDist = 1000
        self.aSugg = set()
        self.dSugg = { 0: [],  1: [] }

    def addSugg (self, sSugg, nDeep=0):
        "add a suggestion"
        #logging.info((nDeep * "  ") + "__" + sSugg + "__")
        if sSugg not in self.aSugg:
            nDist = st.distanceDamerauLevenshtein(self.sCleanWord, cp.cleanWord(sSugg))
            if nDist <= self.nDistLimit:
                if nDist not in self.dSugg:
                    self.dSugg[nDist] = []
                self.dSugg[nDist].append(sSugg)
                self.aSugg.add(sSugg)
                if nDist < self.nMinDist:
                    self.nMinDist = nDist
                self.nDistLimit = min(self.nDistLimit, self.nMinDist+2)

    def getSuggestions (self, nSuggLimit=10, nDistLimit=-1):
        "return a list of suggestions"
        lRes = []
        if self.dSugg[0]:
            # we sort the better results with the original word
            self.dSugg[0].sort(key=lambda sSugg: st.distanceDamerauLevenshtein(self.sWord, sSugg))
        for lSugg in self.dSugg.values():
            lRes.extend(lSugg)
            if len(lRes) > nSuggLimit:
                break
        lRes = list(cp.filterSugg(lRes))
        if self.sWord.istitle():
            lRes = list(map(lambda sSugg: sSugg.title(), lRes))
        elif self.sWord.isupper():
            lRes = list(map(lambda sSugg: sSugg.upper(), lRes))
        return lRes[:nSuggLimit]

    def reset (self):
        self.aSugg.clear()
        self.dSugg.clear()


class IBDAWG:
    """INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH"""

    def __init__ (self, sDicName):
        self.by = pkgutil.get_data(__package__, "_dictionaries/" + sDicName)
        if not self.by:
................................................................................
                l.extend(self.morph(sWord.capitalize()))
        return l

    @timethis
    def suggest (self, sWord, nMaxSugg=10):
        "returns a set of suggestions for <sWord>"
        sPfx, sWord, sSfx = cp.cut(sWord)
        nMaxSwitch = max(len(sWord) // 3, 1)
        nMaxDel = len(sWord) // 5
        nMaxHardRepl = max((len(sWord) - 5) // 4, 1)
        oSuggResult = SuggResult(sWord)
        self._suggest(oSuggResult, sWord, nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl)
        if sWord.istitle():
            self._suggest(oSuggResult, sWord.lower(), nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl)

        elif sWord.islower():
            self._suggest(oSuggResult, sWord.title(), nMaxSwitch=nMaxSwitch, nMaxDel=nMaxDel, nMaxHardRepl=nMaxHardRepl)

        aSugg = oSuggResult.getSuggestions()

        if sSfx or sPfx:
            # we add what we removed
            return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg))
        return aSugg

    def _suggest (self, oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nDeep=0, iAddr=0, sNewWord="", sAction="", bAvoidLoop=False):

        # recursive function
        #logging.info((nDeep * "  ") + sNewWord + ":" + sRemain + " · " + sAction)

        if not sRemain:
            if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:


                oSuggResult.addSugg(sNewWord, nDeep)
            for sTail in self._getTails(iAddr):

                oSuggResult.addSugg(sNewWord+sTail, nDeep)
            return
        cCurrent = sRemain[0:1]
        for cChar, jAddr in self._getCharArcs(iAddr):
            if cChar in cp.d1to1.get(cCurrent, cCurrent):
                self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar, "*")
            elif not bAvoidLoop and nMaxHardRepl:
                self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl-1, nDeep+1, jAddr, sNewWord+cChar, "[["+cChar+"]]", True)
        #for cChar, jAddr in self._getSimilarCharArcs(cCurrent, iAddr):
        #    self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, jAddr, sNewWord+cChar, "*")
        if not bAvoidLoop: # avoid infinite loop
            if len(sRemain) > 1:
                if cCurrent == sRemain[1:2]:
                    # same char, we remove 1 char without adding 1 to <sNewWord>
                    self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, cCurrent+"/2")
                else:
                    # switching chars

                    if nMaxSwitch:
                        self._suggest(oSuggResult, sRemain[1:2]+sRemain[0:1]+sRemain[2:], nMaxSwitch-1, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, "><",True)
                    # delete char
                    if nMaxDel:
                        self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, "-"+cCurrent, True)
                # Phonetic replacements

                for sRepl in cp.get1toXReplacement(sNewWord[-1:], cCurrent, sRemain[1:2]):
                    self._suggest(oSuggResult, sRepl + sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, cCurrent+">"+sRepl, True)
                for sRepl in cp.d2toX.get(sRemain[0:2], ()):
                    self._suggest(oSuggResult, sRepl + sRemain[2:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, sRemain[0:2]+">"+sRepl, True)
                # Hard replacements
                #if nDeep > 3 and nMaxHardRepl:
                #    for cChar, kAddr in self._getCharArcs(iAddr):


                #        if cChar not in cp.d1to1.get(cCurrent, ""):
                #            self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl-1, nDeep+1, kAddr, sNewWord+cChar, "[["+cChar+"]]", True)
            # end of word
            if len(sRemain) == 2:
                for sRepl in cp.dFinal2.get(sRemain, ()):
                    self._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, sRemain + " >> " + sRepl, True)
            elif len(sRemain) == 1:
                self._suggest(oSuggResult, "", nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, sRemain + " [last char removed] ", True) # remove last char and go on
                for sRepl in cp.dFinal1.get(sRemain, ()):
                    self._suggest(oSuggResult, sRepl, nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, sRemain + " >> " + sRepl, True)
        return

    @timethis
    def suggest2 (self, sWord, nMaxSugg=10):
        "returns a set of suggestions for <sWord>"
        sPfx, sWord, sSfx = cp.cut(sWord)
        oSuggResult = SuggResult(sWord)
        self._suggest2(oSuggResult)
        aSugg = oSuggResult.getSuggestions()
        if sSfx or sPfx:
            # we add what we removed
            return list(map(lambda sSug: sPfx + sSug + sSfx, aSugg))
        return aSugg

    def _suggest2 (self, oSuggResult, nDeep=0, iAddr=0, sNewWord=""):
        # recursive function
        #logging.info((nDeep * "  ") + sNewWord)
        if nDeep >= oSuggResult.nDistLimit:
            sCleanNewWord = cp.cleanWord(sNewWord)
            if st.distanceSift4(oSuggResult.sCleanWord[:len(sCleanNewWord)], sCleanNewWord) > oSuggResult.nDistLimit:
                return
        if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
            oSuggResult.addSugg(sNewWord, nDeep)
        for cChar, jAddr in self._getCharArcsWithPriority(iAddr, oSuggResult.sWord[nDeep:nDeep+1]):
            self._suggest2(oSuggResult, nDeep+1, jAddr, sNewWord+cChar)
        return

    def _getCharArcs (self, iAddr):
        "generator: yield all chars and addresses from node at address <iAddr>"
        for nVal, jAddr in self._getArcs(iAddr):
            if nVal < self.nChar:
                yield (self.dCharVal[nVal], jAddr)

    def _getSimilarCharArcs (self, cChar, iAddr):
        "generator: yield similar char of <cChar> and address of the following node"
        for c in cp.d1to1.get(cChar, [cChar]):
            if c in self.dChar:
                jAddr = self._lookupArcNode(self.dChar[c], iAddr)
                if jAddr:
                    yield (c, jAddr)

    def _getCharArcsWithPriority (self, iAddr, cChar):
        if not cChar:
            yield from self._getCharArcs(iAddr)
        lTuple = list(self._getCharArcs(iAddr))
        lTuple.sort(key=lambda t: 0  if t[0] in cp.d1to1.get(cChar, cChar)  else  1)
        yield from lTuple

    def _getTails (self, iAddr, sTail="", n=2):
        "return a list of suffixes ending at a distance of <n> from <iAddr>"
        aTails = set()
        for nVal, jAddr in self._getArcs(iAddr):
            if nVal < self.nChar:
                if int.from_bytes(self.byDic[jAddr:jAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
................................................................................
                    aTails.add(sTail + self.dCharVal[nVal])
                if n and not aTails:
                    aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1))
        return aTails

    def drawPath (self, sWord, iAddr=0):
        "show the path taken by <sWord> in the graph"
        c1 = sWord[0:1]  if sWord  else " "
        iPos = -1
        n = 0
        print(c1 + ": ", end="")
        for c2, jAddr in self._getCharArcs(iAddr):
            print(c2, end="")

            if c2 == sWord[0:1]:
                iNextNodeAddr = jAddr
                iPos = n
            n += 1
        if not sWord:
            return
        if iPos >= 0:
            print("\n   "+ " " * iPos + "|")
            self.drawPath(sWord[1:], iNextNodeAddr)

    def select (self, sPattern=""):

Modified gc_core/py/str_transform.py from [23bc31f0e4] to [3d25d1270b].

1
2

















































































































3
4
5
6
7
8
9
..
36
37
38
39
40
41
42

43
44
45
46
47
48
49
50

51
52
53
54
55
56
57
..
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!python3


















































































































## No stemming

def noStemming (sFlex, sStem):
    return sStem

def rebuildWord (sFlex, cmd1, cmd2):
    if cmd1 == "_":
................................................................................
        return "0"
    jSfx = 0
    for i in range(min(len(sFlex), len(sStem))):
        if sFlex[i] != sStem[i]:
            break
        jSfx += 1
    return chr(len(sFlex)-jSfx+48) + sStem[jSfx:]  


def changeWordWithSuffixCode (sWord, sSfxCode):
    if sSfxCode == "0":
        return sWord
    return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:]  if sSfxCode[0] != '0'  else sWord + sSfxCode[1:]


# Prefix and suffix

def defineAffixCode (sFlex, sStem):
    """ Returns a string defining how to get stem from flexion. Examples:
            "0" if stem = flexion
            "stem" if no common substring
            "n(pfx)/m(sfx)"
        with n and m: chars with numeric meaning, "0" = 0, "1" = 1, ... ":" = 10, etc. (See ASCII table.) Says how many letters to strip from flexion.
            pfx [optional]: string to add before the flexion 
................................................................................
        n = sFlex.find(sSubs)
        m = len(sFlex) - (len(sSubs)+n)
        sAff = "{}/".format(chr(n+48))  if not sPfx  else "{}{}/".format(chr(n+48), sPfx)
        sAff += chr(m+48)  if not sSfx  else "{}{}".format(chr(m+48), sSfx)
        return sAff
    return sStem

def longestCommonSubstring (s1, s2):
    # http://en.wikipedia.org/wiki/Longest_common_substring_problem
    # http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Longest_common_substring
    M = [ [0]*(1+len(s2)) for i in range(1+len(s1)) ]
    longest, x_longest = 0, 0
    for x in range(1, 1+len(s1)):
        for y in range(1, 1+len(s2)):
            if s1[x-1] == s2[y-1]:
                M[x][y] = M[x-1][y-1] + 1
                if M[x][y] > longest:
                    longest = M[x][y]
                    x_longest = x
            else:
                M[x][y] = 0
    return s1[x_longest-longest : x_longest]

def changeWordWithAffixCode (sWord, sAffCode):
    if sAffCode == "0":
        return sWord
    if '/' not in sAffCode:
        return "# error #"
    sPfxCode, sSfxCode = sAffCode.split('/')
    sWord = sPfxCode[1:] + sWord[(ord(sPfxCode[0])-48):] 
    return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:]  if sSfxCode[0] != '0'  else sWord + sSfxCode[1:]



>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







>








>







 







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
...
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
...
187
188
189
190
191
192
193















194
195
196
197
198
199
200
201
202
203
#!python3


#### DISTANCE CALCULATIONS

def longestCommonSubstring (s1, s2):
    # http://en.wikipedia.org/wiki/Longest_common_substring_problem
    # http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Longest_common_substring
    M = [ [0]*(1+len(s2)) for i in range(1+len(s1)) ]
    longest, x_longest = 0, 0
    for x in range(1, 1+len(s1)):
        for y in range(1, 1+len(s2)):
            if s1[x-1] == s2[y-1]:
                M[x][y] = M[x-1][y-1] + 1
                if M[x][y] > longest:
                    longest = M[x][y]
                    x_longest = x
            else:
                M[x][y] = 0
    return s1[x_longest-longest : x_longest]


def distanceDamerauLevenshtein (s1, s2):
    "distance of Damerau-Levenshtein between <s1> and <s2>"
    # https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
    d = {}
    nLen1 = len(s1)
    nLen2 = len(s2)
    for i in range(-1, nLen1+1):
        d[i, -1] = i + 1
    for j in range(-1, nLen2+1):
        d[-1, j] = j + 1
    for i in range(nLen1):
        for j in range(nLen2):
            nCost = 0  if s1[i] == s2[j]  else 1
            d[i, j] = min(
                d[i-1, j]   + 1,        # Deletion
                d[i,   j-1] + 1,        # Insertion
                d[i-1, j-1] + nCost,    # Substitution
            )
            if i and j and s1[i] == s2[j-1] and s1[i-1] == s2[j]:
                d[i, j] = min(d[i, j], d[i-2, j-2] + nCost)     # Transposition
    return d[nLen1-1, nLen2-1]


def distanceSift4 (s1, s2, nMaxOffset=5):
    "implementation of general Sift4."
    # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
    if not s1:
        return len(s2)
    if not s2:
        return len(s1)
    nLen1, nLen2 = len(s1), len(s2)
    i1, i2 = 0, 0   # Cursors for each string
    nLargestCS = 0  # Largest common substring
    nLocalCS = 0    # Local common substring
    nTrans = 0      # Number of transpositions ('ab' vs 'ba')
    lOffset = []    # Offset pair array, for computing the transpositions
 
    while i1 < nLen1 and i2 < nLen2:
        if s1[i1] == s2[i2]:
            nLocalCS += 1
            # Check if current match is a transposition
            bTrans = False
            i = 0
            while i < len(lOffset):
                t = lOffset[i]
                if i1 <= t[0] or i2 <= t[1]:
                    bTrans = abs(i2-i1) >= abs(t[1] - t[0])
                    if bTrans:
                        nTrans += 1
                    elif not t[2]:
                        t[2] = True
                        nTrans += 1
                    break
                elif i1 > t[1] and i2 > t[0]:
                    del lOffset[i]
                else:
                    i += 1
            lOffset.append([i1, i2, bTrans])
        else:
            nLargestCS += nLocalCS
            nLocalCS = 0
            if i1 != i2:
                i1 = i2 = min(i1, i2)
            for i in range(nMaxOffset):
                if i1 + i >= nLen1 and i2 + i >= nLen2:
                    break
                elif i1 + i < nLen1 and s1[i1+i] == s2[i2]:
                    i1 += i - 1
                    i2 -= 1
                    break
                elif i2 + i < nLen2 and s1[i1] == s2[i2+i]:
                    i2 += i - 1
                    i1 -= 1
                    break
        i1 += 1
        i2 += 1
        if i1 >= nLen1 or i2 >= nLen2:
            nLargestCS += nLocalCS
            nLocalCS = 0
            i1 = i2 = min(i1, i2)
    nLargestCS += nLocalCS
    return round(max(nLen1, nLen2) - nLargestCS + nTrans)


def showDistance (s1, s2):
    print(f"Damerau-Levenshtein: {s1} / {s2} = {distanceDamerauLevenshtein(s1, s2)}")
    print(f"Sift4: {s1} / {s2} = {distanceSift4(s1, s2)}")




#### STEMMING OPERATIONS

## No stemming

def noStemming (sFlex, sStem):
    return sStem

def rebuildWord (sFlex, cmd1, cmd2):
    if cmd1 == "_":
................................................................................
        return "0"
    jSfx = 0
    for i in range(min(len(sFlex), len(sStem))):
        if sFlex[i] != sStem[i]:
            break
        jSfx += 1
    return chr(len(sFlex)-jSfx+48) + sStem[jSfx:]  


def changeWordWithSuffixCode (sWord, sSfxCode):
    if sSfxCode == "0":
        return sWord
    return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:]  if sSfxCode[0] != '0'  else sWord + sSfxCode[1:]


# Prefix and suffix

def defineAffixCode (sFlex, sStem):
    """ Returns a string defining how to get stem from flexion. Examples:
            "0" if stem = flexion
            "stem" if no common substring
            "n(pfx)/m(sfx)"
        with n and m: chars with numeric meaning, "0" = 0, "1" = 1, ... ":" = 10, etc. (See ASCII table.) Says how many letters to strip from flexion.
            pfx [optional]: string to add before the flexion 
................................................................................
        n = sFlex.find(sSubs)
        m = len(sFlex) - (len(sSubs)+n)
        sAff = "{}/".format(chr(n+48))  if not sPfx  else "{}{}/".format(chr(n+48), sPfx)
        sAff += chr(m+48)  if not sSfx  else "{}{}".format(chr(m+48), sSfx)
        return sAff
    return sStem

















def changeWordWithAffixCode (sWord, sAffCode):
    if sAffCode == "0":
        return sWord
    if '/' not in sAffCode:
        return "# error #"
    sPfxCode, sSfxCode = sAffCode.split('/')
    sWord = sPfxCode[1:] + sWord[(ord(sPfxCode[0])-48):] 
    return sWord[:-(ord(sSfxCode[0])-48)] + sSfxCode[1:]  if sSfxCode[0] != '0'  else sWord + sSfxCode[1:]

Modified gc_lang/fr/config.ini from [c76d410d86] to [bd9c32e343].

29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
oxt_identifier = French.linguistic.resources.from.Dicollecte.by.OlivierR

# Firefox
fx_identifier = French-GC@grammalecte.net
fx_name = Grammalecte [fr]

win_fx_dev_path = C:\Program Files\Firefox Developer Edition\firefox.exe
win_fx_nightly_path = C:\Program Files (x86)\Nightly\firefox.exe
linux_fx_dev_path = /usr/bin/firefox
linux_fx_nightly_path = /usr/bin/firefox


# Thunderbird
tb_identifier = French-GC-TB@grammalecte.net
tb_name = Grammalecte [fr]







|







29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
oxt_identifier = French.linguistic.resources.from.Dicollecte.by.OlivierR

# Firefox
fx_identifier = French-GC@grammalecte.net
fx_name = Grammalecte [fr]

win_fx_dev_path = C:\Program Files\Firefox Developer Edition\firefox.exe
win_fx_nightly_path = C:\Program Files\Nightly\firefox.exe
linux_fx_dev_path = /usr/bin/firefox
linux_fx_nightly_path = /usr/bin/firefox


# Thunderbird
tb_identifier = French-GC-TB@grammalecte.net
tb_name = Grammalecte [fr]