Grammalecte  Changes On Branch 3b3a02f4d385bf7b

Changes In Branch bdic_opt Through [3b3a02f4d3] Excluding Merge-Ins

This is equivalent to a diff from 4a19028115 to 3b3a02f4d3

2020-09-15
14:01
[graphspell][js] remove specific trick in cleanWord() check-in: 6569849b49 user: olr tags: bdic_opt, graphspell
13:50
[graphspell][js] suggest optimisation with Jaro-Winkler (thanks to IllusionPerdu) check-in: 3b3a02f4d3 user: olr tags: bdic_opt, graphspell
2020-09-14
14:38
[graphspell] string comparison: use Jaro-Winkler check-in: efebe44d15 user: olr tags: bdic_opt, graphspell
07:55
[fr] ajustements check-in: cd8c458e7b user: olr tags: fr, trunk
2020-09-11
19:20
merge trunk check-in: 43afb8b856 user: olr tags: bdic_opt
19:18
[fr] tests: spellchecker.suggest() check-in: 4a19028115 user: olr tags: fr, trunk
14:21
[fr] faux positif check-in: 86f302f4ef user: olr tags: fr, trunk

Modified gc_lang/fr/modules/tests_modules.py from [1156e862bc] to [8a9c45807b].

    18     18   def timeblock (label, hDst=None):
    19     19       "performance counter (contextmanager)"
    20     20       start = time.perf_counter()
    21     21       try:
    22     22           yield
    23     23       finally:
    24     24           end = time.perf_counter()
    25         -        print('{} : {}'.format(label, end - start))
           25  +        print('{:<20} : {}'.format(label, end - start))
    26     26           if hDst:
    27     27               hDst.write("{:<12.6}".format(end-start))
    28     28   
    29     29   
    30     30   class TestDictionary (unittest.TestCase):
    31     31       "Test du correcteur orthographique"
    32     32   
................................................................................
    47     47               self.assertTrue(self.oDic.isValid(sWord), sWord)
    48     48   
    49     49       def test_isvalid_failed (self):
    50     50           for sWord in ["BranchE", "BRanche", "BRAnCHE", "émilie", "éMILIE", "émiLie"]:
    51     51               self.assertFalse(self.oDic.isValid(sWord), sWord)
    52     52   
    53     53       def test_suggest (self):
    54         -        for sWord in ["déelirranttesss", "vallidasion", "Emilie", "exibission"]:
           54  +        for sWord in ["déelirranttesss", "vallidasion", "Emilie", "exibission", "ditirembique", "jai", "email"]:
    55     55               with timeblock(sWord):
    56     56                   self.assertNotEqual(0, self.oDic.suggest(sWord))
    57     57   
    58     58   
    59     59   class TestConjugation (unittest.TestCase):
    60     60       "Tests des conjugaisons"
    61     61   

Modified gc_lang/fr/perf_memo.text from [6a0d81df00] to [ad156793c1].

    26     26   0.6.2       2018.02.19 19:06    5.51302     1.29359     0.874157    0.260415    0.271596    0.290641    0.684754    0.376905    0.0815201   0.00919633  (spelling normalization)
    27     27   1.0         2018.11.23 10:59    2.88577     0.702486    0.485648    0.139897    0.14079     0.148125    0.348751    0.201061    0.0360297   0.0043535   (x2, with new GC engine)
    28     28   1.1         2019.05.16 09:42    1.50743     0.360923    0.261113    0.0749272   0.0763827   0.0771537   0.180504    0.102942    0.0182762   0.0021925   (×2, but new processor: AMD Ryzen 7 2700X)
    29     29   1.2.1       2019.08.06 20:57    1.42886     0.358425    0.247356    0.0704405   0.0754886   0.0765604   0.177197    0.0988517   0.0188103   0.0020243
    30     30   1.6.0       2020.01.03 20:22    1.38847     0.346214    0.240242    0.0709539   0.0737499   0.0748733   0.176477    0.0969171   0.0187857   0.0025143   (nouveau dictionnaire avec lemmes masculins)
    31     31   1.9.0       2020.04.20 19:57    1.51183     0.369546    0.25681     0.0734314   0.0764396   0.0785668   0.183922    0.103674    0.0185812   0.002099    (NFC normalization)
    32     32   1.9.2       2020.05.12 08:43    1.62465     0.398831    0.273012    0.0810811   0.080937    0.0845885   0.204133    0.114146    0.0212864   0.0029547
    33         -1.12.2      2020.09.09 13:34    1.50568     0.374504    0.233108    0.0798712   0.0804466   0.0769674   0.171519    0.0945132   0.0165344   0.0019474   
    34         -1.12.2      2020.09.09 13:35    1.41094     0.359093    0.236443    0.06968     0.0734418   0.0738087   0.169371    0.0946279   0.0167106   0.0019773   
           33  +1.12.2      2020.09.09 13:34    1.50568     0.374504    0.233108    0.0798712   0.0804466   0.0769674   0.171519    0.0945132   0.0165344   0.0019474
           34  +1.12.2      2020.09.09 13:35    1.41094     0.359093    0.236443    0.06968     0.0734418   0.0738087   0.169371    0.0946279   0.0167106   0.0019773
           35  +1.12.2      2020.09.11 19:16    1.35297     0.330545    0.221731    0.0666998   0.0692539   0.0701707   0.160564    0.0891676   0.015807    0.0045998

Modified gc_lang/fr/webext/gce_worker.js from [c94d68b34b] to [d11e3267b8].

   181    181               oTokenizer = new Tokenizer("fr");
   182    182               if (dOptions !== null) {
   183    183                   if (!(dOptions instanceof Map)) {
   184    184                       dOptions = helpers.objectToMap(dOptions);
   185    185                   }
   186    186                   gc_engine.setOptions(dOptions);
   187    187               }
   188         -            //tests();
          188  +            tests();
   189    189               bInitDone = true;
   190    190           } else {
   191    191               console.log("[Worker] Already initialized…")
   192    192           }
   193    193           // we always retrieve options from the gc_engine, for setOptions filters obsolete options
   194    194           dOptions = helpers.mapToObject(gc_engine.getOptions());
   195    195           postMessage(createResponse("init", dOptions, oInfo, true));
................................................................................
   288    288   function resetOptions (oInfo={}) {
   289    289       gc_engine.resetOptions();
   290    290       let dOptions = helpers.mapToObject(gc_engine.getOptions());
   291    291       postMessage(createResponse("resetOptions", dOptions, oInfo, true));
   292    292   }
   293    293   
   294    294   function tests () {
   295         -    console.log(conj.getConj("devenir", ":E", ":2s"));
          295  +    /*console.log(conj.getConj("devenir", ":E", ":2s"));
   296    296       console.log(mfsp.getMasForm("emmerdeuse", true));
   297    297       console.log(mfsp.getMasForm("pointilleuse", false));
   298    298       console.log(phonet.getSimil("est"));
   299    299       let aRes = gc_engine.parse("Je suit...");
   300    300       for (let oErr of aRes) {
   301    301           console.log(text.getReadableError(oErr));
          302  +    }*/
          303  +    for (let sWord of ["fatiqué", "coeur", "trèèèèèèèèès", "vraaaaiiiimeeeeennnt", "apele", "email", "Co2", "emmppâiiiller", "testt", "apelaion", "exsepttion", "sintaxik", "ebriete", "ennormmement"]) {
          304  +        console.time("Suggestions for " + sWord);
          305  +        for (let aSugg of oSpellChecker.suggest(sWord)) {
          306  +            if (aSugg.length) {
          307  +                console.log(sWord + " -> ", aSugg.join(" "));
          308  +            }
          309  +        }
          310  +        console.timeEnd("Suggestions for " + sWord);
   302    311       }
   303    312   }
   304    313   
   305    314   function textToTest (sText, sCountry, bDebug, bContext, oInfo={}) {
   306    315       if (!gc_engine) {
   307    316           postMessage(createResponse("textToTest", "# Grammar checker not loaded.", oInfo, true));
   308    317           return;

Modified graphspell-js/char_player.js from [0602ec129b] to [8dac23cf9b].

     4      4   /* jshint esversion:6 */
     5      5   /* jslint esversion:6 */
     6      6   
     7      7   ${map}
     8      8   
     9      9   
    10     10   var char_player = {
    11         -
           11  +    /*
           12  +        oDistanceBetweenChars:
           13  +            - with Jaro-Winkler, values between 1 and 10
           14  +            - with Damerau-Levenshtein, values / 10 (between 0 and 1: 0.1, 0.2 ... 0.9)
           15  +    */
    12     16       oDistanceBetweenChars: {
    13         -        "a": {},
    14         -        "e": {"é": 0.5},
    15         -        "é": {"e": 0.5},
    16         -        "i": {"y": 0.2},
    17         -        "o": {},
    18         -        "u": {},
    19         -        "y": {"i": 0.3},
    20         -        "b": {"d": 0.8, "h": 0.9},
    21         -        "c": {"ç": 0.1, "k": 0.5, "q": 0.5, "s": 0.5, "x": 0.5, "z": 0.8},
    22         -        "d": {"b": 0.8},
    23         -        "f": {"v": 0.8},
    24         -        "g": {"j": 0.5},
    25         -        "h": {"b": 0.9},
    26         -        "j": {"g": 0.5, "i": 0.9},
    27         -        "k": {"c": 0.5, "q": 0.1, "x": 0.5},
    28         -        "l": {"i": 0.9},
    29         -        "m": {"n": 0.8},
    30         -        "n": {"m": 0.8, "r": 0.9},
    31         -        "p": {"q": 0.9},
    32         -        "q": {"c": 0.5, "k": 0.1, "p": 0.9},
    33         -        "r": {"n": 0.9, "j": 0.9},
    34         -        "s": {"c": 0.5, "ç": 0.1, "x": 0.5, "z": 0.5},
    35         -        "t": {"d": 0.9},
    36         -        "v": {"f": 0.8, "w": 0.1},
    37         -        "w": {"v": 0.1},
    38         -        "x": {"c": 0.5, "k": 0.5, "q": 0.5, "s": 0.5},
    39         -        "z": {"s": 0.5}
           17  +        //"a": {},
           18  +        "e": {"é": 5},
           19  +        //"é": {"e": 5},
           20  +        "i": {"y": 2},
           21  +        //"o": {},
           22  +        //"u": {},
           23  +        "y": {"i": 3},
           24  +        "b": {"d": 8, "h": 9},
           25  +        "c": {"ç": 1, "k": 5, "q": 5, "s": 5, "x": 5, "z": 8},
           26  +        "d": {"b": 8},
           27  +        "f": {"v": 8},
           28  +        "g": {"j": 5},
           29  +        "h": {"b": 9},
           30  +        "j": {"g": 5, "i": 9},
           31  +        "k": {"c": 5, "q": 1, "x": 5},
           32  +        "l": {"i": 9},
           33  +        "m": {"n": 8},
           34  +        "n": {"m": 8, "r": 9},
           35  +        "p": {"q": 9},
           36  +        "q": {"c": 5, "k": 1, "p": 9},
           37  +        "r": {"n": 9, "j": 9},
           38  +        "s": {"c": 5, "ç": 1, "x": 5, "z": 5},
           39  +        "t": {"d": 9},
           40  +        "v": {"f": 8, "w": 1},
           41  +        "w": {"v": 1},
           42  +        "x": {"c": 5, "k": 5, "q": 5, "s": 5},
           43  +        "z": {"s": 5}
    40     44       },
    41     45   
    42     46       distanceBetweenChars: function (c1, c2) {
    43     47           if (c1 == c2) {
    44     48               return 0;
    45     49           }
    46     50           if (this.oDistanceBetweenChars.hasOwnProperty(c1) && this.oDistanceBetweenChars[c1].hasOwnProperty(c2)) {

Modified graphspell-js/ibdawg.js from [69d7490b82] to [2160aa77f7].

    18     18       var char_player = require("./char_player.js");
    19     19   }
    20     20   
    21     21   
    22     22   class SuggResult {
    23     23       // Structure for storing, classifying and filtering suggestions
    24     24   
    25         -    constructor (sWord, nDistLimit=-1) {
           25  +    constructor (sWord, nSuggLimit=10, nDistLimit=-1) {
    26     26           this.sWord = sWord;
    27     27           this.sSimplifiedWord = str_transform.simplifyWord(sWord);
    28     28           this.nDistLimit = (nDistLimit >= 0) ? nDistLimit :  Math.floor(sWord.length / 3) + 1;
    29     29           this.nMinDist = 1000;
    30         -        this.aSugg = new Set();
    31         -        this.dSugg = new Map([ [0, []],  [1, []],  [2, []] ]);
    32         -        this.aAllSugg = new Set();      // all found words even those refused
           30  +        // Temporary sets
            31  +        this.aAllSugg = new Set();  // All suggestions, even the rejected ones
           32  +        this.dGoodSugg = new Map(); // Acceptable suggestions
           33  +        this.dBestSugg = new Map(); // Best suggestions
           34  +        // Parameters
           35  +        this.nSuggLimit = nSuggLimit;
            36  +        this.nSuggLimitExt = nSuggLimit + 2;                // we add a few entries in case suggestions merge after casing modifications
           37  +        this.nBestSuggLimit = Math.floor(nSuggLimit * 1.5); // n times the requested limit
           38  +        this.nGoodSuggLimit = nSuggLimit * 15;              // n times the requested limit
    33     39       }
    34     40   
    35         -    addSugg (sSugg, nDeep=0) {
           41  +    addSugg (sSugg) {
    36     42           // add a suggestion
    37     43           if (this.aAllSugg.has(sSugg)) {
    38     44               return;
    39     45           }
    40     46           this.aAllSugg.add(sSugg);
    41         -        if (!this.aSugg.has(sSugg)) {
    42         -            let nDist = Math.floor(str_transform.distanceDamerauLevenshtein(this.sSimplifiedWord, str_transform.simplifyWord(sSugg)));
    43         -            if (nDist <= this.nDistLimit) {
    44         -                if (sSugg.includes(" ")) { // add 1 to distance for split suggestions
    45         -                    nDist += 1;
    46         -                }
    47         -                if (!this.dSugg.has(nDist)) {
    48         -                    this.dSugg.set(nDist, []);
    49         -                }
    50         -                this.dSugg.get(nDist).push(sSugg);
    51         -                this.aSugg.add(sSugg);
    52         -                if (nDist < this.nMinDist) {
    53         -                    this.nMinDist = nDist;
    54         -                }
    55         -                this.nDistLimit = Math.min(this.nDistLimit, this.nMinDist+1);
    56         -            }
    57         -        }
    58         -    }
    59         -
    60         -    getSuggestions (nSuggLimit=10) {
            47  +        // Jaro-Winkler score ranges from 0 to 1; 1 means the strings are equal
           48  +        let nDistJaro = 1 - str_transform.distanceJaroWinkler(this.sSimplifiedWord, str_transform.simplifyWord(sSugg));
           49  +        let nDist = Math.floor(nDistJaro * 10);
           50  +        if (nDistJaro < .11) {        // Best suggestions
           51  +            this.dBestSugg.set(sSugg, Math.round(nDistJaro*1000));
           52  +            if (this.dBestSugg.size > this.nBestSuggLimit) {
            53  +                this.nDistLimit = -1; // make suggest() end the search
           54  +            }
           55  +        } else if (nDistJaro < .33) { // Good suggestions
           56  +            this.dGoodSugg.set(sSugg, Math.round(nDistJaro*1000));
           57  +            if (this.dGoodSugg.size > this.nGoodSuggLimit) {
            58  +                this.nDistLimit = -1; // make suggest() end the search
           59  +            }
           60  +        } else {
           61  +            if (nDist < this.nMinDist) {
           62  +                this.nMinDist = nDist;
           63  +            }
           64  +            this.nDistLimit = Math.min(this.nDistLimit, this.nMinDist);
           65  +        }
           66  +        if (nDist <= this.nDistLimit) {
           67  +            if (nDist < this.nMinDist) {
           68  +                this.nMinDist = nDist;
           69  +            }
           70  +            this.nDistLimit = Math.min(this.nDistLimit, this.nMinDist+1);
           71  +        }
           72  +    }
           73  +
           74  +    getSuggestions () {
    61     75           // return a list of suggestions
    62     76           let lRes = [];
    63         -        let bFirstListSorted = false;
    64         -        for (let [nDist, lSugg] of this.dSugg.entries()) {
    65         -            if (nDist > this.nDistLimit) {
    66         -                break;
    67         -            }
    68         -            if (!bFirstListSorted && lSugg.length > 1) {
    69         -                lRes.sort((a, b) => { return str_transform.distanceDamerauLevenshtein(this.sWord, a) - str_transform.distanceDamerauLevenshtein(this.sWord, b); });
    70         -                bFirstListSorted = true;
    71         -            }
    72         -            lRes.push(...lSugg);
    73         -            if (lRes.length > nSuggLimit) {
    74         -                break;
           77  +        if (this.dBestSugg.size > 0) {
           78  +            // sort only with simplified words
           79  +            let lResTmp = [...this.dBestSugg.entries()].sort((a, b) => { return a[1] - b[1]; });
           80  +            let nSize = Math.min(this.nSuggLimitExt, lResTmp.length);
           81  +            for (let i=0;  i < nSize;  i++){
           82  +                lRes.push(lResTmp[i][0]);
    75     83               }
    76     84           }
           85  +        if (lRes.length < this.nSuggLimitExt) {
           86  +            // sort with simplified words and original word
           87  +            let lResTmp = [...this.dGoodSugg.entries()].sort((a, b) => {
           88  +                // Low precision to rely more on simplified words
           89  +                let nJaroA = Math.round(str_transform.distanceJaroWinkler(this.sWord, a[0]) * 10);
           90  +                let nJaroB = Math.round(str_transform.distanceJaroWinkler(this.sWord, b[0]) * 10);
           91  +                if (nJaroA == nJaroB) {
           92  +                    return a[1] - b[1];     // warning: both lists are NOT sorted the same way (key: a-b)
           93  +                } else {
           94  +                    return nJaroB - nJaroA; // warning: both lists are NOT sorted the same way (key: b-a)
           95  +                }
           96  +            }).slice(0, this.nSuggLimitExt);
           97  +            let nSize = Math.min(this.nSuggLimitExt, lResTmp.length);
           98  +            for (let i=0;  i < nSize;  i++){
           99  +                lRes.push(lResTmp[i][0]);
          100  +            }
          101  +        }
          102  +        // casing
    77    103           if (this.sWord.gl_isUpperCase()) {
    78    104               lRes = lRes.map((sSugg) => { return sSugg.toUpperCase(); });
    79    105               lRes = [...new Set(lRes)];
    80    106           }
    81    107           else if (this.sWord.slice(0,1).gl_isUpperCase()) {
    82    108               lRes = lRes.map((sSugg) => { return sSugg.slice(0,1).toUpperCase() + sSugg.slice(1); });
    83    109               lRes = [...new Set(lRes)];
    84    110           }
    85         -        return lRes.slice(0, nSuggLimit);
          111  +        return lRes.slice(0, this.nSuggLimit);
    86    112       }
    87    113   
    88    114       reset () {
    89         -        this.aSugg.clear();
    90    115           this.dSugg.clear();
          116  +        this.dGoodSugg.clear();
          117  +        this.dBestSugg.clear();
    91    118       }
    92    119   }
    93    120   
    94    121   
    95    122   class IBDAWG {
    96    123       // INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH
    97    124   
................................................................................
   120    147           }
   121    148           /*
   122    149               Properties:
   123    150               sName, nCompressionMethod, sHeader, lArcVal, nArcVal, sByDic, sLang, nChar, nBytesArc, nBytesNodeAddress,
   124    151               nEntry, nNode, nArc, nAff, cStemming, nTag, dChar, nBytesOffset,
   125    152           */
   126    153   
   127         -        /*
   128         -            Bug workaround.
   129         -            Mozilla’s JS parser sucks. Can’t read file bigger than 4 Mb!
   130         -            So we convert huge hexadecimal string to list of numbers…
   131         -            https://github.com/mozilla/addons-linter/issues/1361
   132         -        */
   133         -        let lTemp = [];
   134         -        for (let i = 0;  i < this.sByDic.length;  i+=2) {
   135         -            lTemp.push(parseInt(this.sByDic.slice(i, i+2), 16));
   136         -        }
   137         -        this.byDic = lTemp;
   138         -        //this.byDic = new Uint8Array(lTemp);  // not quicker, even slower
   139         -        /* end of bug workaround */
   140         -
   141    154           if (!(this.sHeader.startsWith("/grammalecte-fsa/") || this.sHeader.startsWith("/pyfsa/"))) {
   142    155               throw TypeError("# Error. Not a grammalecte-fsa binary dictionary. Header: " + this.sHeader);
   143    156           }
   144    157           if (!(this.nCompressionMethod == 1 || this.nCompressionMethod == 2 || this.nCompressionMethod == 3)) {
   145    158               throw RangeError("# Error. Unknown dictionary compression method: " + this.nCompressionMethod);
   146    159           }
   147    160           // <dChar> to get the value of an arc, <dCharVal> to get the char of an arc with its value
................................................................................
   153    166               this.funcStemming = str_transform.changeWordWithSuffixCode;
   154    167           } else if (this.cStemming == "A") {
   155    168               this.funcStemming = str_transform.changeWordWithAffixCode;
   156    169           } else {
   157    170               this.funcStemming = str_transform.noStemming;
   158    171           }
   159    172   
          173  +        /*
          174  +            Bug workaround.
          175  +            Mozilla’s JS parser sucks. Can’t read file bigger than 4 Mb!
          176  +            So we convert huge hexadecimal string to list of numbers…
          177  +            https://github.com/mozilla/addons-linter/issues/1361
          178  +        */
          179  +        /*
          180  +            Performance trick:
           181  +            Instead of converting bytes to integers each time we parse the binary dictionary,
          182  +            we do it once, then parse the array
          183  +        */
          184  +        let nAcc = 0;
          185  +        let lBytesBuffer = [];
          186  +        let lTemp = [];
          187  +        let nDivisor = (this.nBytesArc + this.nBytesNodeAddress) / 2;
          188  +        for (let i = 0;  i < this.sByDic.length;  i+=2) {
          189  +            lBytesBuffer.push(parseInt(this.sByDic.slice(i, i+2), 16));
          190  +            if (nAcc == (this.nBytesArc - 1)) {
          191  +                lTemp.push(this._convBytesToInteger(lBytesBuffer));
          192  +                lBytesBuffer = [];
          193  +            }
          194  +            else if (nAcc == (this.nBytesArc + this.nBytesNodeAddress - 1)) {
           195  +                lTemp.push(Math.round(this._convBytesToInteger(lBytesBuffer) / nDivisor));  // Math.round should be useless, BUT with JS who knows what can happen…
          196  +                lBytesBuffer = [];
          197  +                nAcc = -1;
          198  +            }
          199  +            nAcc = nAcc + 1;
          200  +        }
          201  +        this.byDic = lTemp;
          202  +        /* end of bug workaround */
          203  +
   160    204           this._arcMask = (2 ** ((this.nBytesArc * 8) - 3)) - 1;
   161    205           this._finalNodeMask = 1 << ((this.nBytesArc * 8) - 1);
   162    206           this._lastArcMask = 1 << ((this.nBytesArc * 8) - 2);
   163    207   
   164         -
   165         -        // Configuring DAWG functions according to nCompressionMethod
   166         -        switch (this.nCompressionMethod) {
   167         -            case 1:
   168         -                this.morph = this._morph1;
   169         -                this.stem = this._stem1;
   170         -                this._lookupArcNode = this._lookupArcNode1;
   171         -                this._getArcs = this._getArcs1;
   172         -                this._writeNodes = this._writeNodes1;
   173         -                break;
   174         -            case 2:
   175         -                this.morph = this._morph2;
   176         -                this.stem = this._stem2;
   177         -                this._lookupArcNode = this._lookupArcNode2;
   178         -                this._getArcs = this._getArcs2;
   179         -                this._writeNodes = this._writeNodes2;
   180         -                break;
   181         -            case 3:
   182         -                this.morph = this._morph3;
   183         -                this.stem = this._stem3;
   184         -                this._lookupArcNode = this._lookupArcNode3;
   185         -                this._getArcs = this._getArcs3;
   186         -                this._writeNodes = this._writeNodes3;
   187         -                break;
   188         -            default:
   189         -                throw ValueError("# Error: unknown code: " + this.nCompressionMethod);
   190         -        }
   191    208           //console.log(this.getInfo());
   192    209           this.bAcronymValid = true;
   193    210           this.bNumAtLastValid = false;
   194    211   
   195    212           // lexicographer module ?
   196    213           this.lexicographer = null;
   197    214           // JS still sucks: we’ll try importation when importation will be available in Workers. Still waiting...
   198    215           if (self && self.hasOwnProperty("lexgraph_"+this.sLangCode)) { // self is the Worker
   199    216               this.lexicographer = self["lexgraph_"+this.sLangCode];
   200    217           }
   201         -
   202    218       }
   203    219   
   204    220       getInfo () {
   205    221           return  `  Language: ${this.sLangName}   Lang code: ${this.sLangCode}   Dictionary name: ${this.sDicName}\n` +
   206    222                   `  Compression method: ${this.nCompressionMethod}   Date: ${this.sDate}   Stemming: ${this.cStemming}FX\n` +
   207    223                   `  Arcs values:  ${this.nArcVal} = ${this.nChar} characters,  ${this.nAff} affixes,  ${this.nTag} tags\n` +
   208    224                   `  Dictionary: ${this.nEntry} entries,    ${this.nNode} nodes,   ${this.nArc} arcs\n` +
................................................................................
   304    320                   return false;
   305    321               }
   306    322               iAddr = this._lookupArcNode(this.dChar.get(c), iAddr);
   307    323               if (iAddr === null) {
   308    324                   return false;
   309    325               }
   310    326           }
   311         -        return Boolean(this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask);
          327  +        return Boolean(this.byDic[iAddr] & this._finalNodeMask);
   312    328       }
   313    329   
   314    330       getMorph (sWord) {
   315    331           // retrieves morphologies list, different casing allowed
   316    332           if (!sWord) {
   317    333               return [];
   318    334           }
   319    335           sWord = str_transform.spellingNormalization(sWord);
   320         -        let l = this.morph(sWord);
          336  +        let l = this._morph(sWord);
   321    337           if (sWord[0].gl_isUpperCase()) {
   322         -            l.push(...this.morph(sWord.toLowerCase()));
          338  +            l.push(...this._morph(sWord.toLowerCase()));
   323    339               if (sWord.gl_isUpperCase() && sWord.length > 1) {
   324         -                l.push(...this.morph(sWord.gl_toCapitalize()));
          340  +                l.push(...this._morph(sWord.gl_toCapitalize()));
   325    341               }
   326    342           }
   327    343           return l;
   328    344       }
   329    345   
   330    346       suggest (sWord, nSuggLimit=10, bSplitTrailingNumbers=false) {
   331    347           // returns a array of suggestions for <sWord>
................................................................................
   336    352           if (this.lexicographer) {
   337    353               [sPfx, sWord, sSfx] = this.lexicographer.split(sWord);
   338    354           }
   339    355           let nMaxSwitch = Math.max(Math.floor(sWord.length / 3), 1);
   340    356           let nMaxDel = Math.floor(sWord.length / 5);
   341    357           let nMaxHardRepl = Math.max(Math.floor((sWord.length - 5) / 4), 1);
   342    358           let nMaxJump = Math.max(Math.floor(sWord.length / 4), 1);
   343         -        let oSuggResult = new SuggResult(sWord);
          359  +        let oSuggResult = new SuggResult(sWord, nSuggLimit);
          360  +        let sWord = str_transform.cleanWord(sWord);
   344    361           if (bSplitTrailingNumbers) {
   345    362               this._splitTrailingNumbers(oSuggResult, sWord);
   346    363           }
   347    364           this._splitSuggest(oSuggResult, sWord);
   348    365           this._suggest(oSuggResult, sWord, nMaxSwitch, nMaxDel, nMaxHardRepl, nMaxJump);
   349         -        let aSugg = oSuggResult.getSuggestions(nSuggLimit);
          366  +        let aSugg = oSuggResult.getSuggestions();
   350    367           if (this.lexicographer) {
   351    368               aSugg = this.lexicographer.filterSugg(aSugg);
   352    369           }
   353    370           if (sSfx || sPfx) {
   354    371               // we add what we removed
   355    372               return aSugg.map( (sSugg) => { return sPfx + sSugg + sSfx; } );
   356    373           }
................................................................................
   376    393               }
   377    394           }
   378    395       }
   379    396   
   380    397       _suggest (oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nMaxJump=0, nDist=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=false) {
   381    398           // returns a set of suggestions
   382    399           // recursive function
   383         -        if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) {
          400  +        if (this.byDic[iAddr] & this._finalNodeMask) {
   384    401               if (sRemain == "") {
   385    402                   oSuggResult.addSugg(sNewWord);
   386    403                   for (let sTail of this._getTails(iAddr)) {
   387    404                       oSuggResult.addSugg(sNewWord+sTail);
   388    405                   }
   389    406                   return;
   390    407               }
................................................................................
   486    503       }
   487    504   
   488    505       _getTails (iAddr, sTail="", n=2) {
   489    506           // return a list of suffixes ending at a distance of <n> from <iAddr>
   490    507           let aTails = new Set();
   491    508           for (let [nVal, jAddr] of this._getArcs(iAddr)) {
   492    509               if (nVal <= this.nChar) {
   493         -                if (this._convBytesToInteger(this.byDic.slice(jAddr, jAddr+this.nBytesArc)) & this._finalNodeMask) {
          510  +                if (this.byDic[jAddr] & this._finalNodeMask) {
   494    511                       aTails.add(sTail + this.dCharVal.get(nVal));
   495    512                   }
   496    513                   if (n && aTails.size == 0) {
   497    514                       aTails.gl_update(this._getTails(jAddr, sTail+this.dCharVal.get(nVal), n-1));
   498    515                   }
   499    516               }
   500    517           }
   501    518           return aTails;
   502    519       }
   503    520   
   504         -    // morph (sWord) {
   505         -    //     is defined in constructor
   506         -    // }
   507         -
   508    521       getSimilarEntries (sWord, nSuggLimit=10) {
   509    522           // return a list of tuples (similar word, stem, morphology)
   510    523           if (sWord == "") {
   511    524               return [];
   512    525           }
   513    526           let lResult = [];
   514    527           for (let sSimilar of this.suggest(sWord, nSuggLimit)) {
................................................................................
   528    541               zFlexPattern = (sFlexPattern !== "") ? new RegExp(sFlexPattern) : null;
   529    542               zTagsPattern = (sTagsPattern !== "") ? new RegExp(sTagsPattern) : null;
   530    543           }
   531    544           catch (e) {
   532    545               console.log("Error in regex pattern");
   533    546               console.log(e.message);
   534    547           }
   535         -        yield* this._select1(zFlexPattern, zTagsPattern, 0, "");
          548  +        yield* this._select(zFlexPattern, zTagsPattern, 0, "");
   536    549       }
   537    550   
   538         -    // VERSION 1
   539         -
   540         -    * _select1 (zFlexPattern, zTagsPattern, iAddr, sWord) {
          551  +    * _select (zFlexPattern, zTagsPattern, iAddr, sWord) {
   541    552           // recursive generator
   542         -        for (let [nVal, jAddr] of this._getArcs1(iAddr)) {
          553  +        for (let [nVal, jAddr] of this._getArcs(iAddr)) {
   543    554               if (nVal <= this.nChar) {
   544    555                   // simple character
   545         -                yield* this._select1(zFlexPattern, zTagsPattern, jAddr, sWord + this.lArcVal[nVal]);
          556  +                yield* this._select(zFlexPattern, zTagsPattern, jAddr, sWord + this.lArcVal[nVal]);
   546    557               } else {
   547    558                   if (!zFlexPattern || zFlexPattern.test(sWord)) {
   548    559                       let sStem = this.funcStemming(sWord, this.lArcVal[nVal]);
   549         -                    for (let [nMorphVal, _] of this._getArcs1(jAddr)) {
          560  +                    for (let [nMorphVal, _] of this._getArcs(jAddr)) {
   550    561                           if (!zTagsPattern || zTagsPattern.test(this.lArcVal[nMorphVal])) {
   551    562                               yield [sWord, sStem, this.lArcVal[nMorphVal]];
   552    563                           }
   553    564                       }
   554    565                   }
   555    566               }
   556    567           }
   557    568       }
   558    569   
   559         -    _morph1 (sWord) {
          570  +    _morph (sWord) {
   560    571           // returns morphologies of sWord
   561    572           let iAddr = 0;
   562    573           for (let c of sWord) {
   563    574               if (!this.dChar.has(c)) {
   564    575                   return [];
   565    576               }
   566    577               iAddr = this._lookupArcNode(this.dChar.get(c), iAddr);
   567    578               if (iAddr === null) {
   568    579                   return [];
   569    580               }
   570    581           }
   571         -        if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) {
          582  +        if (this.byDic[iAddr] & this._finalNodeMask) {
   572    583               let l = [];
   573    584               let nRawArc = 0;
   574    585               while (!(nRawArc & this._lastArcMask)) {
   575         -                let iEndArcAddr = iAddr + this.nBytesArc;
   576         -                nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr));
          586  +                let iEndArcAddr = iAddr + 1;
          587  +                nRawArc = this.byDic[iAddr];
   577    588                   let nArc = nRawArc & this._arcMask;
   578    589                   if (nArc > this.nChar) {
   579    590                       // This value is not a char, this is a stemming code
   580    591                       let sStem = ">" + this.funcStemming(sWord, this.lArcVal[nArc]);
   581    592                       // Now, we go to the next node and retrieve all following arc values; all of them are tags
   582         -                    let iAddr2 = this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress));
          593  +                    let iAddr2 = this.byDic[iEndArcAddr];
   583    594                       let nRawArc2 = 0;
   584    595                       while (!(nRawArc2 & this._lastArcMask)) {
   585         -                        let iEndArcAddr2 = iAddr2 + this.nBytesArc;
   586         -                        nRawArc2 = this._convBytesToInteger(this.byDic.slice(iAddr2, iEndArcAddr2));
          596  +                        let iEndArcAddr2 = iAddr2 + 1;
          597  +                        nRawArc2 = this.byDic[iAddr2];
   587    598                           l.push(sStem + "/" + this.lArcVal[nRawArc2 & this._arcMask]);
   588         -                        iAddr2 = iEndArcAddr2+this.nBytesNodeAddress;
          599  +                        iAddr2 = iEndArcAddr2 + 1;
   589    600                       }
   590    601                   }
   591         -                iAddr = iEndArcAddr + this.nBytesNodeAddress;
          602  +                iAddr = iEndArcAddr + 1;
   592    603               }
   593    604               return l;
   594    605           }
   595    606           return [];
   596    607       }
   597    608   
   598         -    _stem1 (sWord) {
          609  +    _stem (sWord) {
   599    610           // returns stems list of sWord
   600    611           let iAddr = 0;
   601    612           for (let c of sWord) {
   602    613               if (!this.dChar.has(c)) {
   603    614                   return [];
   604    615               }
   605    616               iAddr = this._lookupArcNode(this.dChar.get(c), iAddr);
   606    617               if (iAddr === null) {
   607    618                   return [];
   608    619               }
   609    620           }
   610         -        if (this._convBytesToInteger(this.byDic.slice(iAddr, iAddr+this.nBytesArc)) & this._finalNodeMask) {
          621  +        if (this.byDic[iAddr] & this._finalNodeMask) {
   611    622               let l = [];
   612    623               let nRawArc = 0;
   613    624               while (!(nRawArc & this._lastArcMask)) {
   614         -                let iEndArcAddr = iAddr + this.nBytesArc;
   615         -                nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr));
          625  +                let iEndArcAddr = iAddr + 1;
          626  +                nRawArc = this.byDic[iAddr];
   616    627                   let nArc = nRawArc & this._arcMask;
   617    628                   if (nArc > this.nChar) {
   618    629                       // This value is not a char, this is a stemming code
   619    630                       l.push(this.funcStemming(sWord, this.lArcVal[nArc]));
   620    631                   }
   621         -                iAddr = iEndArcAddr + this.nBytesNodeAddress;
          632  +                iAddr = iEndArcAddr + 1;
   622    633               }
   623    634               return l;
   624    635           }
   625    636           return [];
   626    637       }
   627    638   
   628         -    _lookupArcNode1 (nVal, iAddr) {
          639  +    _lookupArcNode (nVal, iAddr) {
   629    640           // checks whether nVal is an arc at the node at iAddr; if yes, returns the address of the next node, else null
   630    641           while (true) {
   631         -            let iEndArcAddr = iAddr+this.nBytesArc;
   632         -            let nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr));
          642  +            let iEndArcAddr = iAddr+1;
          643  +            let nRawArc = this.byDic[iAddr];
   633    644               if (nVal == (nRawArc & this._arcMask)) {
   634    645                   // the value we are looking for
   635    646                   // we return the address of the next node
   636         -                return this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress));
          647  +                return this.byDic[iEndArcAddr];
   637    648               }
   638    649               else {
   639    650                   // value not found
   640    651                   if (nRawArc & this._lastArcMask) {
   641    652                       return null;
   642    653                   }
   643         -                iAddr = iEndArcAddr + this.nBytesNodeAddress;
          654  +                iAddr = iEndArcAddr + 1;
   644    655               }
   645    656           }
   646    657       }
   647    658   
   648         -    * _getArcs1 (iAddr) {
          659  +    * _getArcs (iAddr) {
   649    660           // generator: return all arcs at <iAddr> as tuples of (nVal, iAddr)
   650    661           while (true) {
   651         -            let iEndArcAddr = iAddr+this.nBytesArc;
   652         -            let nRawArc = this._convBytesToInteger(this.byDic.slice(iAddr, iEndArcAddr));
   653         -            yield [nRawArc & this._arcMask, this._convBytesToInteger(this.byDic.slice(iEndArcAddr, iEndArcAddr+this.nBytesNodeAddress))];
          662  +            let iEndArcAddr = iAddr+1;
          663  +            let nRawArc = this.byDic[iAddr];
          664  +            yield [nRawArc & this._arcMask, this.byDic[iEndArcAddr]];
   654    665               if (nRawArc & this._lastArcMask) {
   655    666                   break;
   656    667               }
   657         -            iAddr = iEndArcAddr+this.nBytesNodeAddress;
          668  +            iAddr = iEndArcAddr+1;
   658    669           }
   659    670       }
   660         -
   661         -    // VERSION 2
   662         -    _morph2 (sWord) {
   663         -        // to do
   664         -    }
   665         -
   666         -    _stem2 (sWord) {
   667         -        // to do
   668         -    }
   669         -
   670         -    _lookupArcNode2 (nVal, iAddr) {
   671         -        // to do
   672         -    }
   673         -
   674         -
   675         -    // VERSION 3
   676         -    _morph3 (sWord) {
   677         -        // to do
   678         -    }
   679         -
   680         -    _stem3 (sWord) {
   681         -        // to do
   682         -    }
   683         -
   684         -    _lookupArcNode3 (nVal, iAddr) {
   685         -        // to do
   686         -    }
   687    671   }
   688    672   
   689    673   
   690    674   if (typeof(exports) !== 'undefined') {
   691    675       exports.IBDAWG = IBDAWG;
   692    676   }

Modified graphspell-js/str_transform.js from [4c4ee6009e] to [8ec0376c2c].

     7      7   "use strict";
     8      8   
     9      9   
    10     10   if (typeof(process) !== 'undefined') {
    11     11       var char_player = require("./char_player.js");
    12     12   }
    13     13   
           14  +
    14     15   
    15     16   // Note: 48 is the ASCII code for "0"
    16     17   
    17     18   var str_transform = {
    18     19   
    19     20       getNgrams: function (sWord, n=2) {
    20     21           let lNgrams = [];
................................................................................
    61     62               if (c != sWord.slice(i, i+1) || (c == 'e' && sWord.slice(i, i+2) != "ee")) {  // exception for <e> to avoid confusion between crée / créai
    62     63                   sNewWord += c;
    63     64               }
    64     65               i++;
    65     66           }
    66     67           return sNewWord.replace(/eau/g, "o").replace(/au/g, "o").replace(/ai/g, "éi").replace(/ei/g, "é").replace(/ph/g, "f");
    67     68       },
           69  +
           70  +    cleanWord: function (sWord) {
           71  +        // clean the word for users who make common and predictable errors, to help suggestions
           72  +        // remove letters repeated more than 2 times
           73  +        if (sWord.match(/(.)(\1){2,}/igm)){
           74  +            sWord = sWord.replace(/(.*)(.)(.\2)/igm,'$1$2').replace(/(.)(\1)+/igm,'$1$1');
           75  +        }
           76  +        // words ending with -ik -> replace with -ique
           77  +        if (sWord.match(/ik$/ig)){
           78  +            sWord = sWord.replace(/(.*)ik$/ig,'$1ique');
           79  +        }
           80  +        return sWord;
           81  +    },
    68     82   
    69     83       _xTransNumbersToExponent: new Map([
    70     84           ["0", "⁰"], ["1", "¹"], ["2", "²"], ["3", "³"], ["4", "⁴"], ["5", "⁵"], ["6", "⁶"], ["7", "⁷"], ["8", "⁸"], ["9", "⁹"]
    71     85       ]),
    72     86   
    73     87       numbersToExponent: function (sWord) {
    74     88           let sNewWord = "";
................................................................................
   148    162               }
   149    163               return matrix[nLen1][nLen2];
   150    164           }
   151    165           catch (e) {
   152    166               console.error(e);
   153    167           }
   154    168       },
          169  +
          170  +    distanceJaroWinkler: function(a, b, boost = .666) {
          171  +        // https://github.com/thsig/jaro-winkler-JS
          172  +        //if (a == b) { return 1.0; }
          173  +        let a_len = a.length;
          174  +        let b_len = b.length;
          175  +        let a_flag = [];
          176  +        let b_flag = [];
          177  +        let search_range = Math.floor(Math.max(a_len, b_len) / 2) - 1;
          178  +        let minv = Math.min(a_len, b_len);
          179  +
          180  +        // Looking only within the search range, count and flag the matched pairs.
          181  +        let Num_com = 0;
          182  +        let yl1 = b_len - 1;
          183  +        for (let i = 0; i < a_len; i++) {
          184  +          let lowlim = (i >= search_range) ? i - search_range : 0;
          185  +          let hilim  = ((i + search_range) <= yl1) ? (i + search_range) : yl1;
          186  +          for (let j = lowlim; j <= hilim; j++) {
          187  +            if (b_flag[j] !== 1 && a[j] === b[i]) {
          188  +              a_flag[j] = 1;
          189  +              b_flag[i] = 1;
          190  +              Num_com++;
          191  +              break;
          192  +            }
          193  +          }
          194  +        }
          195  +
          196  +        // Return if no characters in common
          197  +        if (Num_com === 0) { return 0.0; }
          198  +
          199  +        // Count the number of transpositions
          200  +        let k = 0;
          201  +        let N_trans = 0;
          202  +        for (let i = 0; i < a_len; i++) {
          203  +          if (a_flag[i] === 1) {
          204  +            let j;
          205  +            for (j = k; j < b_len; j++) {
          206  +              if (b_flag[j] === 1) {
          207  +                k = j + 1;
          208  +                break;
          209  +              }
          210  +            }
          211  +            if (a[i] !== b[j]) { N_trans++; }
          212  +          }
          213  +        }
          214  +        N_trans = Math.floor(N_trans / 2);
          215  +
          216  +        // Adjust for similarities in nonmatched characters
          217  +        let N_simi = 0;
          218  +        let adjwt = char_player.oDistanceBetweenChars;
          219  +        if (minv > Num_com) {
          220  +          for (let i = 0; i < a_len; i++) {
          221  +            if (!a_flag[i]) {
          222  +              for (let j = 0; j < b_len; j++) {
          223  +                if (!b_flag[j]) {
          224  +                  if (adjwt[a[i]] && adjwt[a[i]][b[j]]) {
          225  +                    N_simi += adjwt[a[i]][b[j]];
          226  +                    b_flag[j] = 2;
          227  +                    break;
          228  +                  }
          229  +                }
          230  +              }
          231  +            }
          232  +          }
          233  +        }
          234  +
          235  +        let Num_sim = (N_simi / 10.0) + Num_com;
          236  +
          237  +        // Main weight computation
          238  +        let weight = Num_sim / a_len + Num_sim / b_len + (Num_com - N_trans) / Num_com;
          239  +        weight = weight / 3;
          240  +
          241  +        // Continue to boost the weight if the strings are similar
          242  +        if (weight > boost) {
          243  +          // Adjust for having up to the first 4 characters in common
          244  +          let j = (minv >= 4) ? 4 : minv;
          245  +          let i;
          246  +          for (i = 0; (i < j) && a[i] === b[i]; i++) { }
          247  +          if (i) { weight += i * 0.1 * (1.0 - weight) };
          248  +
          249  +          // Adjust for long strings.
          250  +          // After agreeing beginning chars, at least two more must agree
          251  +          // and the agreeing characters must be more than half of the
          252  +          // remaining characters.
          253  +          if (minv > 4 && Num_com > i + 1 && 2 * Num_com >= minv + i) {
          254  +            weight += (1 - weight) * ((Num_com - i - 1) / (a_len * b_len - i*2 + 2));
          255  +          }
          256  +        }
          257  +
          258  +        return weight;
          259  +    },
   155    260   
   156    261       showDistance (s1, s2) {
   157    262           console.log(`Distance: ${s1} / ${s2} = ${this.distanceDamerauLevenshtein(s1, s2)})`);
   158    263       },
   159    264   
   160    265       // Suffix only
   161    266       defineSuffixCode: function (sFlex, sStem) {

Modified graphspell/ibdawg.py from [d16ed0d683] to [bda5a789eb].

   116    116               else:
   117    117                   raise OSError("# Error. Unknown file type: "+source)
   118    118           else:
   119    119               self._initJSON(source)
   120    120   
   121    121           self.sFileName = source  if isinstance(source, str)  else "[None]"
   122    122   
          123  +        # Performance trick:
          124  +        #     Instead of converting bytes to integers each time we parse the binary dictionary,
          125  +        #     we do it once, then parse the array
          126  +        nAcc = 0
          127  +        byBuffer = b""
          128  +        lTemp = []
          129  +        nDivisor = (self.nBytesArc + self.nBytesNodeAddress) / 2
          130  +        for i in range(0, len(self.byDic)):
          131  +            byBuffer += self.byDic[i:i+1]
          132  +            if nAcc == (self.nBytesArc - 1):
          133  +                lTemp.append(int.from_bytes(byBuffer, byteorder="big"))
          134  +                byBuffer = b""
          135  +            elif nAcc == (self.nBytesArc + self.nBytesNodeAddress - 1):
          136  +                lTemp.append(round(int.from_bytes(byBuffer, byteorder="big") / nDivisor))
          137  +                byBuffer = b""
          138  +                nAcc = -1
          139  +            nAcc = nAcc + 1
          140  +        self.byDic = lTemp;
          141  +
          142  +        # masks
   123    143           self._arcMask = (2 ** ((self.nBytesArc * 8) - 3)) - 1
   124    144           self._finalNodeMask = 1 << ((self.nBytesArc * 8) - 1)
   125    145           self._lastArcMask = 1 << ((self.nBytesArc * 8) - 2)
   126         -        self._addrBitMask = 1 << ((self.nBytesArc * 8) - 3)  # version 2
   127    146   
   128    147           # function to decode the affix/suffix code
   129    148           if self.cStemming == "S":
   130    149               self.funcStemming = st.changeWordWithSuffixCode
   131    150           elif self.cStemming == "A":
   132    151               self.funcStemming = st.changeWordWithAffixCode
   133    152           else:
   134    153               self.funcStemming = st.noStemming
   135    154   
   136         -        # Configuring DAWG functions according to nCompressionMethod
   137         -        if self.nCompressionMethod == 1:
   138         -            self.morph = self._morph1
   139         -            self.stem = self._stem1
   140         -            self._lookupArcNode = self._lookupArcNode1
   141         -            self._getArcs = self._getArcs1
   142         -            self._writeNodes = self._writeNodes1
   143         -        elif self.nCompressionMethod == 2:
   144         -            self.morph = self._morph2
   145         -            self.stem = self._stem2
   146         -            self._lookupArcNode = self._lookupArcNode2
   147         -            self._getArcs = self._getArcs2
   148         -            self._writeNodes = self._writeNodes2
   149         -        elif self.nCompressionMethod == 3:
   150         -            self.morph = self._morph3
   151         -            self.stem = self._stem3
   152         -            self._lookupArcNode = self._lookupArcNode3
   153         -            self._getArcs = self._getArcs3
   154         -            self._writeNodes = self._writeNodes3
   155         -        else:
   156         -            raise ValueError("  # Error: unknown code: {}".format(self.nCompressionMethod))
   157         -
   158    155           self.bAcronymValid = False
   159    156           self.bNumAtLastValid = False
   160    157   
   161    158           # lexicographer module ?
   162    159           self.lexicographer = None
   163    160           try:
   164    161               self.lexicographer = importlib.import_module(".lexgraph_"+self.sLangCode, "grammalecte.graphspell")
................................................................................
   200    197           self.cStemming = l.pop(0)
   201    198           self.nTag = self.nArcVal - self.nChar - self.nAff
   202    199           # <dChar> to get the value of an arc, <dCharVal> to get the char of an arc with its value
   203    200           self.dChar = {}
   204    201           for i in range(1, self.nChar+1):
   205    202               self.dChar[self.lArcVal[i]] = i
   206    203           self.dCharVal = { v: k  for k, v in self.dChar.items() }
   207         -        self.nBytesOffset = 1 # version 3
   208    204   
   209    205       def _initJSON (self, oJSON):
   210    206           "initialize with a JSON text file"
   211    207           self.sByDic = ""  # init to prevent pylint whining
   212    208           self.__dict__.update(oJSON)
   213    209           self.byDic = binascii.unhexlify(self.sByDic)
   214    210           self.dCharVal = { v: k  for k, v in self.dChar.items() }
................................................................................
   244    240                   "nNode": self.nNode,
   245    241                   "nArc": self.nArc,
   246    242                   "nArcVal": self.nArcVal,
   247    243                   "lArcVal": self.lArcVal,
   248    244                   "nCompressionMethod": self.nCompressionMethod,
   249    245                   "nBytesArc": self.nBytesArc,
   250    246                   "nBytesNodeAddress": self.nBytesNodeAddress,
   251         -                "nBytesOffset": self.nBytesOffset,
   252    247                   # JavaScript is a pile of shit, so Mozilla’s JS parser doesn’t like files bigger than 4 Mb!
   253    248                   # So, if necessary, we use a hexadecimal string, which we will convert later in the Firefox extension.
   254    249                   # https://github.com/mozilla/addons-linter/issues/1361
   255    250                   "sByDic": self.byDic.hex()  if bBinaryDictAsHexString  else [ e  for e in self.byDic ],
   256    251                   "l2grams": list(self.a2grams)
   257    252               }, ensure_ascii=False))
   258    253               if bInJSModule:
................................................................................
   296    291           iAddr = 0
   297    292           for c in sWord:
   298    293               if c not in self.dChar:
   299    294                   return False
   300    295               iAddr = self._lookupArcNode(self.dChar[c], iAddr)
   301    296               if iAddr is None:
   302    297                   return False
   303         -        return bool(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask)
          298  +        return bool(self.byDic[iAddr] & self._finalNodeMask)
   304    299   
   305    300       def getMorph (self, sWord):
   306    301           "retrieves morphologies list, different casing allowed"
   307    302           if not sWord:
   308    303               return []
   309    304           sWord = st.spellingNormalization(sWord)
   310         -        l = self.morph(sWord)
          305  +        l = self._morph(sWord)
   311    306           if sWord[0:1].isupper():
   312         -            l.extend(self.morph(sWord.lower()))
          307  +            l.extend(self._morph(sWord.lower()))
   313    308               if sWord.isupper() and len(sWord) > 1:
   314         -                l.extend(self.morph(sWord.capitalize()))
          309  +                l.extend(self._morph(sWord.capitalize()))
   315    310           return l
   316    311   
   317    312       #@timethis
   318    313       def suggest (self, sWord, nSuggLimit=10, bSplitTrailingNumbers=False):
   319    314           "returns a set of suggestions for <sWord>"
   320    315           sWord = sWord.rstrip(".")   # useful for LibreOffice
   321    316           sWord = st.spellingNormalization(sWord)
................................................................................
   352    347                   sWord1, sWord2 = sWord.split(cSplitter, 1)
   353    348                   if self.isValid(sWord1) and self.isValid(sWord2):
   354    349                       oSuggResult.addSugg(sWord1+" "+sWord2)
   355    350   
   356    351       def _suggest (self, oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nMaxJump=0, nDist=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False):
   357    352           # recursive function
   358    353           #logging.info((nDeep * "  ") + sNewWord + ":" + sRemain)
   359         -        if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
          354  +        if self.byDic[iAddr] & self._finalNodeMask:
   360    355               if not sRemain:
   361    356                   oSuggResult.addSugg(sNewWord, nDeep)
   362    357                   for sTail in self._getTails(iAddr):
   363    358                       oSuggResult.addSugg(sNewWord+sTail, nDeep)
   364    359                   return
   365    360               if (len(sNewWord) + len(sRemain) == len(oSuggResult.sWord)) and oSuggResult.sWord.lower().startswith(sNewWord.lower()) and self.isValid(sRemain):
   366    361                   if self.sLangCode == "fr" and sNewWord.lower() in ("l", "d", "n", "m", "t", "s", "c", "j", "qu", "lorsqu", "puisqu", "quoiqu", "jusqu", "quelqu") and sRemain[0:1] in cp.aVowel:
................................................................................
   419    414                   yield (self.dCharVal[nVal], jAddr)
   420    415   
   421    416       def _getTails (self, iAddr, sTail="", n=2):
   422    417           "return a list of suffixes ending at a distance of <n> from <iAddr>"
   423    418           aTails = set()
   424    419           for nVal, jAddr in self._getArcs(iAddr):
   425    420               if nVal <= self.nChar:
   426         -                if int.from_bytes(self.byDic[jAddr:jAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
          421  +                if self.byDic[jAddr] & self._finalNodeMask:
   427    422                       aTails.add(sTail + self.dCharVal[nVal])
   428    423                   if n and not aTails:
   429    424                       aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1))
   430    425           return aTails
   431    426   
   432    427       def drawPath (self, sWord, iAddr=0):
   433    428           "show the path taken by <sWord> in the graph"
................................................................................
   467    462               if sFlexPattern:
   468    463                   zFlexPattern = re.compile(sFlexPattern)
   469    464               if sTagsPattern:
   470    465                   zTagsPattern = re.compile(sTagsPattern)
   471    466           except re.error:
   472    467               print("# Error in regex pattern")
   473    468               traceback.print_exc()
   474         -        yield from self._select1(zFlexPattern, zTagsPattern, 0, "")
          469  +        yield from self._select(zFlexPattern, zTagsPattern, 0, "")
   475    470   
   476         -    # def morph (self, sWord):
   477         -    #     is defined in __init__
   478         -
   479         -    # VERSION 1
   480         -    def _select1 (self, zFlexPattern, zTagsPattern, iAddr, sWord):
          471  +    def _select (self, zFlexPattern, zTagsPattern, iAddr, sWord):
   481    472           # recursive generator
   482         -        for nVal, jAddr in self._getArcs1(iAddr):
          473  +        for nVal, jAddr in self._getArcs(iAddr):
   483    474               if nVal <= self.nChar:
   484    475                   # simple character
   485         -                yield from self._select1(zFlexPattern, zTagsPattern, jAddr, sWord + self.lArcVal[nVal])
          476  +                yield from self._select(zFlexPattern, zTagsPattern, jAddr, sWord + self.lArcVal[nVal])
   486    477               else:
   487    478                   if not zFlexPattern or zFlexPattern.search(sWord):
   488    479                       sStem = self.funcStemming(sWord, self.lArcVal[nVal])
   489         -                    for nMorphVal, _ in self._getArcs1(jAddr):
          480  +                    for nMorphVal, _ in self._getArcs(jAddr):
   490    481                           if not zTagsPattern or zTagsPattern.search(self.lArcVal[nMorphVal]):
   491    482                               yield [sWord, sStem, self.lArcVal[nMorphVal]]
   492    483   
   493         -    def _morph1 (self, sWord):
          484  +    def _morph (self, sWord):
   494    485           "returns morphologies of <sWord>"
   495    486           iAddr = 0
   496    487           for c in sWord:
   497    488               if c not in self.dChar:
   498    489                   return []
   499    490               iAddr = self._lookupArcNode(self.dChar[c], iAddr)
   500    491               if iAddr is None:
   501    492                   return []
   502         -        if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
          493  +        if self.byDic[iAddr] & self._finalNodeMask:
   503    494               l = []
   504    495               nRawArc = 0
   505    496               while not nRawArc & self._lastArcMask:
   506         -                iEndArcAddr = iAddr + self.nBytesArc
   507         -                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
          497  +                iEndArcAddr = iAddr + 1
          498  +                nRawArc = self.byDic[iAddr]
   508    499                   nArc = nRawArc & self._arcMask
   509    500                   if nArc > self.nChar:
   510    501                       # This value is not a char, this is a stemming code
   511    502                       sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc])
   512    503                       # Now , we go to the next node and retrieve all following arcs values, all of them are tags
   513         -                    iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
          504  +                    iAddr2 = self.byDic[iEndArcAddr]
   514    505                       nRawArc2 = 0
   515    506                       while not nRawArc2 & self._lastArcMask:
   516         -                        iEndArcAddr2 = iAddr2 + self.nBytesArc
   517         -                        nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big')
          507  +                        iEndArcAddr2 = iAddr2 + 1
          508  +                        nRawArc2 = self.byDic[iAddr2]
   518    509                           l.append(sStem + "/" + self.lArcVal[nRawArc2 & self._arcMask])
   519         -                        iAddr2 = iEndArcAddr2+self.nBytesNodeAddress
   520         -                iAddr = iEndArcAddr+self.nBytesNodeAddress
          510  +                        iAddr2 = iEndArcAddr2 + 1
          511  +                iAddr = iEndArcAddr + 1
   521    512               return l
   522    513           return []
   523    514   
   524         -    def _stem1 (self, sWord):
          515  +    def _stem (self, sWord):
   525    516           "returns stems list of <sWord>"
   526    517           iAddr = 0
   527    518           for c in sWord:
   528    519               if c not in self.dChar:
   529    520                   return []
   530    521               iAddr = self._lookupArcNode(self.dChar[c], iAddr)
   531    522               if iAddr is None:
   532    523                   return []
   533         -        if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
          524  +        if self.byDic[iAddr] & self._finalNodeMask:
   534    525               l = []
   535    526               nRawArc = 0
   536    527               while not nRawArc & self._lastArcMask:
   537         -                iEndArcAddr = iAddr + self.nBytesArc
   538         -                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
          528  +                iEndArcAddr = iAddr + 1
          529  +                nRawArc = self.byDic[iAddr]
   539    530                   nArc = nRawArc & self._arcMask
   540    531                   if nArc > self.nChar:
   541    532                       # This value is not a char, this is a stemming code
   542    533                       l.append(self.funcStemming(sWord, self.lArcVal[nArc]))
   543         -                iAddr = iEndArcAddr+self.nBytesNodeAddress
          534  +                iAddr = iEndArcAddr + 1
   544    535               return l
   545    536           return []
   546    537   
   547         -    def _lookupArcNode1 (self, nVal, iAddr):
          538  +    def _lookupArcNode (self, nVal, iAddr):
   548    539           "looks if <nVal> is an arc at the node at <iAddr>, if yes, returns address of next node else None"
   549    540           while True:
   550         -            iEndArcAddr = iAddr+self.nBytesArc
   551         -            nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
          541  +            iEndArcAddr = iAddr + 1
          542  +            nRawArc = self.byDic[iAddr]
   552    543               if nVal == (nRawArc & self._arcMask):
   553    544                   # the value we are looking for
   554    545                   # we return the address of the next node
   555         -                return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
          546  +                return self.byDic[iEndArcAddr]
   556    547               # value not found
   557    548               if nRawArc & self._lastArcMask:
   558    549                   return None
   559         -            iAddr = iEndArcAddr+self.nBytesNodeAddress
          550  +            iAddr = iEndArcAddr + 1
   560    551   
   561         -    def _getArcs1 (self, iAddr):
          552  +    def _getArcs (self, iAddr):
   562    553           "generator: return all arcs at <iAddr> as tuples of (nVal, iAddr)"
   563    554           while True:
   564         -            iEndArcAddr = iAddr+self.nBytesArc
   565         -            nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
   566         -            yield nRawArc & self._arcMask, int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
          555  +            iEndArcAddr = iAddr + 1
          556  +            nRawArc = self.byDic[iAddr]
          557  +            yield nRawArc & self._arcMask, self.byDic[iEndArcAddr]
   567    558               if nRawArc & self._lastArcMask:
   568    559                   break
   569         -            iAddr = iEndArcAddr+self.nBytesNodeAddress
          560  +            iAddr = iEndArcAddr + 1
   570    561   
   571         -    def _writeNodes1 (self, spfDest):
          562  +    def _writeNodes (self, spfDest):
   572    563           "for debugging only"
   573    564           print(" > Write binary nodes")
   574    565           with open(spfDest, 'w', 'utf-8', newline="\n") as hDst:
   575    566               iAddr = 0
   576    567               hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr))
   577    568               while iAddr < len(self.byDic):
   578         -                iEndArcAddr = iAddr+self.nBytesArc
   579         -                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
          569  +                iEndArcAddr = iAddr + 1
          570  +                nRawArc = self.byDic[iAddr]
   580    571                   nArc = nRawArc & self._arcMask
   581         -                hDst.write("  {:<20}  {:0>16}  i{:>10}   #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", \
   582         -                                                                            int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], \
   583         -                                                                                           byteorder='big')))
   584         -                iAddr = iEndArcAddr+self.nBytesNodeAddress
   585         -                if (nRawArc & self._lastArcMask) and iAddr < len(self.byDic):
   586         -                    hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr))
   587         -            hDst.close()
   588         -
   589         -    # VERSION 2
   590         -    def _morph2 (self, sWord):
   591         -        "returns morphologies of <sWord>"
   592         -        iAddr = 0
   593         -        for c in sWord:
   594         -            if c not in self.dChar:
   595         -                return []
   596         -            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
   597         -            if iAddr is None:
   598         -                return []
   599         -        if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
   600         -            l = []
   601         -            nRawArc = 0
   602         -            while not nRawArc & self._lastArcMask:
   603         -                iEndArcAddr = iAddr + self.nBytesArc
   604         -                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
   605         -                nArc = nRawArc & self._arcMask
   606         -                if nArc > self.nChar:
   607         -                    # This value is not a char, this is a stemming code
   608         -                    sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc])
   609         -                    # Now , we go to the next node and retrieve all following arcs values, all of them are tags
   610         -                    if not nRawArc & self._addrBitMask:
   611         -                        iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
   612         -                    else:
   613         -                        # we go to the end of the node
   614         -                        iAddr2 = iEndArcAddr
   615         -                        while not nRawArc & self._lastArcMask:
   616         -                            nRawArc = int.from_bytes(self.byDic[iAddr2:iAddr2+self.nBytesArc], byteorder='big')
   617         -                            iAddr2 += self.nBytesArc + self.nBytesNodeAddress
   618         -                    nRawArc2 = 0
   619         -                    while not nRawArc2 & self._lastArcMask:
   620         -                        iEndArcAddr2 = iAddr2 + self.nBytesArc
   621         -                        nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big')
   622         -                        l.append(sStem + "/" + self.lArcVal[nRawArc2 & self._arcMask])
   623         -                        iAddr2 = iEndArcAddr2+self.nBytesNodeAddress  if not nRawArc2 & self._addrBitMask else iEndArcAddr2
   624         -                iAddr = iEndArcAddr+self.nBytesNodeAddress  if not nRawArc & self._addrBitMask  else iEndArcAddr
   625         -            return l
   626         -        return []
   627         -
   628         -    def _stem2 (self, sWord):
   629         -        "returns stems list of <sWord>"
   630         -        iAddr = 0
   631         -        for c in sWord:
   632         -            if c not in self.dChar:
   633         -                return []
   634         -            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
   635         -            if iAddr is None:
   636         -                return []
   637         -        if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
   638         -            l = []
   639         -            nRawArc = 0
   640         -            while not nRawArc & self._lastArcMask:
   641         -                iEndArcAddr = iAddr + self.nBytesArc
   642         -                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
   643         -                nArc = nRawArc & self._arcMask
   644         -                if nArc > self.nChar:
   645         -                    # This value is not a char, this is a stemming code
   646         -                    l.append(self.funcStemming(sWord, self.lArcVal[nArc]))
   647         -                    # Now , we go to the next node
   648         -                    if not nRawArc & self._addrBitMask:
   649         -                        iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
   650         -                    else:
   651         -                        # we go to the end of the node
   652         -                        iAddr2 = iEndArcAddr
   653         -                        while not nRawArc & self._lastArcMask:
   654         -                            nRawArc = int.from_bytes(self.byDic[iAddr2:iAddr2+self.nBytesArc], byteorder='big')
   655         -                            iAddr2 += self.nBytesArc + self.nBytesNodeAddress
   656         -                iAddr = iEndArcAddr+self.nBytesNodeAddress  if not nRawArc & self._addrBitMask  else iEndArcAddr
   657         -            return l
   658         -        return []
   659         -
   660         -    def _lookupArcNode2 (self, nVal, iAddr):
   661         -        "looks if <nVal> is an arc at the node at <iAddr>, if yes, returns address of next node else None"
   662         -        while True:
   663         -            iEndArcAddr = iAddr+self.nBytesArc
   664         -            nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
   665         -            if nVal == (nRawArc & self._arcMask):
   666         -                # the value we are looking for
   667         -                if not nRawArc & self._addrBitMask:
   668         -                    # we return the address of the next node
   669         -                    return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
   670         -                # we go to the end of the node
   671         -                iAddr = iEndArcAddr
   672         -                while not nRawArc & self._lastArcMask:
   673         -                    nRawArc = int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big')
   674         -                    iAddr += self.nBytesArc + self.nBytesNodeAddress  if not nRawArc & self._addrBitMask  else self.nBytesArc
   675         -                return iAddr
   676         -            # value not found
   677         -            if nRawArc & self._lastArcMask:
   678         -                return None
   679         -            iAddr = iEndArcAddr+self.nBytesNodeAddress  if not nRawArc & self._addrBitMask  else iEndArcAddr
   680         -
   681         -    def _writeNodes2 (self, spfDest):
   682         -        "for debugging only"
   683         -        print(" > Write binary nodes")
   684         -        with open(spfDest, 'w', 'utf-8', newline="\n") as hDst:
   685         -            iAddr = 0
   686         -            hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr))
   687         -            while iAddr < len(self.byDic):
   688         -                iEndArcAddr = iAddr+self.nBytesArc
   689         -                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
   690         -                nArc = nRawArc & self._arcMask
   691         -                if not nRawArc & self._addrBitMask:
   692         -                    iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
   693         -                    hDst.write("  {:<20}  {:0>16}  i{:>10}   #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr))
   694         -                    iAddr = iEndArcAddr+self.nBytesNodeAddress
   695         -                else:
   696         -                    hDst.write("  {:<20}  {:0>16}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:]))
   697         -                    iAddr = iEndArcAddr
   698         -                if nRawArc & self._lastArcMask:
   699         -                    hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr))
   700         -            hDst.close()
   701         -
   702         -    # VERSION 3
   703         -    def _morph3 (self, sWord):
   704         -        "returns morphologies of <sWord>"
   705         -        iAddr = 0
   706         -        for c in sWord:
   707         -            if c not in self.dChar:
   708         -                return []
   709         -            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
   710         -            if iAddr is None:
   711         -                return []
   712         -        if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
   713         -            l = []
   714         -            nRawArc = 0
   715         -            iAddrNode = iAddr
   716         -            while not nRawArc & self._lastArcMask:
   717         -                iEndArcAddr = iAddr + self.nBytesArc
   718         -                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
   719         -                nArc = nRawArc & self._arcMask
   720         -                if nArc > self.nChar:
   721         -                    # This value is not a char, this is a stemming code
   722         -                    sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc])
   723         -                    # Now , we go to the next node and retrieve all following arcs values, all of them are tags
   724         -                    if not nRawArc & self._addrBitMask:
   725         -                        iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
   726         -                    else:
   727         -                        iAddr2 = iAddrNode + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big')
   728         -                    nRawArc2 = 0
   729         -                    while not nRawArc2 & self._lastArcMask:
   730         -                        iEndArcAddr2 = iAddr2 + self.nBytesArc
   731         -                        nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big')
   732         -                        l.append(sStem + "/" + self.lArcVal[nRawArc2 & self._arcMask])
   733         -                        iAddr2 = iEndArcAddr2+self.nBytesNodeAddress  if not nRawArc2 & self._addrBitMask  else iEndArcAddr2+self.nBytesOffset
   734         -                iAddr = iEndArcAddr+self.nBytesNodeAddress  if not nRawArc & self._addrBitMask  else iEndArcAddr+self.nBytesOffset
   735         -            return l
   736         -        return []
   737         -
   738         -    def _stem3 (self, sWord):
   739         -        "returns stems list of <sWord>"
   740         -        iAddr = 0
   741         -        for c in sWord:
   742         -            if c not in self.dChar:
   743         -                return []
   744         -            iAddr = self._lookupArcNode(self.dChar[c], iAddr)
   745         -            if iAddr is None:
   746         -                return []
   747         -        if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
   748         -            l = []
   749         -            nRawArc = 0
   750         -            #iAddrNode = iAddr
   751         -            while not nRawArc & self._lastArcMask:
   752         -                iEndArcAddr = iAddr + self.nBytesArc
   753         -                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
   754         -                nArc = nRawArc & self._arcMask
   755         -                if nArc > self.nChar:
   756         -                    # This value is not a char, this is a stemming code
   757         -                    l.append(self.funcStemming(sWord, self.lArcVal[nArc]))
   758         -                iAddr = iEndArcAddr+self.nBytesNodeAddress  if not nRawArc & self._addrBitMask  else iEndArcAddr+self.nBytesOffset
   759         -            return l
   760         -        return []
   761         -
   762         -    def _lookupArcNode3 (self, nVal, iAddr):
   763         -        "looks if <nVal> is an arc at the node at <iAddr>, if yes, returns address of next node else None"
   764         -        iAddrNode = iAddr
   765         -        while True:
   766         -            iEndArcAddr = iAddr+self.nBytesArc
   767         -            nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
   768         -            if nVal == (nRawArc & self._arcMask):
   769         -                # the value we are looking for
   770         -                if not nRawArc & self._addrBitMask:
   771         -                    return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
   772         -                return iAddrNode + int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big')
   773         -            # value not found
   774         -            if nRawArc & self._lastArcMask:
   775         -                return None
   776         -            iAddr = iEndArcAddr+self.nBytesNodeAddress  if not nRawArc & self._addrBitMask  else iEndArcAddr+self.nBytesOffset
   777         -
   778         -    def _writeNodes3 (self, spfDest):
   779         -        "for debugging only"
   780         -        print(" > Write binary nodes")
   781         -        with open(spfDest, 'w', 'utf-8', newline="\n") as hDst:
   782         -            iAddr = 0
   783         -            hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr))
   784         -            while iAddr < len(self.byDic):
   785         -                iEndArcAddr = iAddr+self.nBytesArc
   786         -                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
   787         -                nArc = nRawArc & self._arcMask
   788         -                if not nRawArc & self._addrBitMask:
   789         -                    iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
   790         -                    hDst.write("  {:<20}  {:0>16}  i{:>10}   #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr))
   791         -                    iAddr = iEndArcAddr+self.nBytesNodeAddress
   792         -                else:
   793         -                    iNextNodeAddr = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesOffset], byteorder='big')
   794         -                    hDst.write("  {:<20}  {:0>16}  i{:>10}   +{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", iNextNodeAddr))
   795         -                    iAddr = iEndArcAddr+self.nBytesOffset
   796         -                if nRawArc & self._lastArcMask:
          572  +                hDst.write("  {:<20}  {:0>16}  i{:>10}   #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", self.byDic[iEndArcAddr]))
          573  +                iAddr = iEndArcAddr + 1
          574  +                if (nRawArc & self._lastArcMask) and iAddr < len(self.byDic):
   797    575                       hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr))
   798    576               hDst.close()