Grammalecte  Check-in [ac98115a31]

Overview
Comment:[graphspell][build] dawg builder: new parameters + consistency
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | build | graphspell
Files: files | file ages | folders
SHA3-256: ac98115a3112c7b2725ac9aa0a4e8ef89e420a6de84b38732c368b05ad13a45a
User & Date: olr on 2018-02-11 10:20:42
Other Links: manifest | tags
Context
2018-02-11
16:06
[graphspell][py] ibdawg: initialization from binary file or JSON check-in: 96290e4468 user: olr tags: graphspell, trunk
10:20
[graphspell][build] dawg builder: new parameters + consistency check-in: ac98115a31 user: olr tags: build, graphspell, trunk
09:22
[graphspell] JSON data clarification check-in: 9c995c9f7f user: olr tags: graphspell, trunk
Changes

Modified gc_core/js/lang_core/gc_engine.js from [c5ee3ec605] to [2f5e964b5d].

319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335

    //// Initialization

    load: function (sContext="JavaScript", sPath="") {
        try {
            if (typeof(require) !== 'undefined') {
                var ibdawg = require("resource://grammalecte/graphspell/ibdawg.js");
                _oDict = new ibdawg.IBDAWG("${dic_name}.json");
            } else {
                _oDict = new IBDAWG("${dic_name}.json", sPath);
            }
            _sAppContext = sContext;
            _dOptions = gc_options.getOptions(sContext).gl_shallowCopy();     // duplication necessary, to be able to reset to default
        }
        catch (e) {
            helpers.logerror(e);
        }







|

|







319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335

    //// Initialization

    load: function (sContext="JavaScript", sPath="") {
        try {
            if (typeof(require) !== 'undefined') {
                var ibdawg = require("resource://grammalecte/graphspell/ibdawg.js");
                _oDict = new ibdawg.IBDAWG("${dic_filename}.json");
            } else {
                _oDict = new IBDAWG("${dic_filename}.json", sPath);
            }
            _sAppContext = sContext;
            _dOptions = gc_options.getOptions(sContext).gl_shallowCopy();     // duplication necessary, to be able to reset to default
        }
        catch (e) {
            helpers.logerror(e);
        }

Modified gc_core/py/lang_core/gc_engine.py from [e1c3ad1859] to [9fc11201d4].

288
289
290
291
292
293
294
295
296
297
298
299
300
301
302


def load (sContext="Python"):
    global _oDict
    global _sAppContext
    global _dOptions
    try:
        _oDict = IBDAWG("${dic_name}.bdic")
        _sAppContext = sContext
        _dOptions = dict(gc_options.getOptions(sContext))   # duplication necessary, to be able to reset to default
    except:
        traceback.print_exc()


def setOption (sOpt, bVal):







|







288
289
290
291
292
293
294
295
296
297
298
299
300
301
302


def load (sContext="Python"):
    global _oDict
    global _sAppContext
    global _dOptions
    try:
        _oDict = IBDAWG("${dic_filename}.bdic")
        _sAppContext = sContext
        _dOptions = dict(gc_options.getOptions(sContext))   # duplication necessary, to be able to reset to default
    except:
        traceback.print_exc()


def setOption (sOpt, bVal):

Modified gc_lang/fr/build.py from [9eae4b1757] to [248d1906b2].

75
76
77
78
79
80
81
82
83
84
85
86
87
88
89

def createThunderbirdExtension (sLang, dVars, spLangPack):
    "create extension for Thunderbird"
    print("Building extension for Thunderbird")
    sExtensionName = dVars['tb_identifier'] + "-v" + dVars['version'] + '.xpi'
    spfZip = "_build/" + sExtensionName
    hZip = zipfile.ZipFile(spfZip, mode='w', compression=zipfile.ZIP_DEFLATED)
    _copyGrammalecteJSPackageInZipFile(hZip, spLangPack, dVars['dic_name']+".json")
    for spf in ["LICENSE.txt", "LICENSE.fr.txt"]:
        hZip.write(spf)
    dVars = _createOptionsForThunderbird(dVars)
    helpers.addFolderToZipAndFileFile(hZip, "gc_lang/"+sLang+"/tb", "", dVars, True)
    spDict = "gc_lang/"+sLang+"/xpi/data/dictionaries"
    for sp in os.listdir(spDict):
        if os.path.isdir(spDict+"/"+sp):







|







75
76
77
78
79
80
81
82
83
84
85
86
87
88
89

def createThunderbirdExtension (sLang, dVars, spLangPack):
    "create extension for Thunderbird"
    print("Building extension for Thunderbird")
    sExtensionName = dVars['tb_identifier'] + "-v" + dVars['version'] + '.xpi'
    spfZip = "_build/" + sExtensionName
    hZip = zipfile.ZipFile(spfZip, mode='w', compression=zipfile.ZIP_DEFLATED)
    _copyGrammalecteJSPackageInZipFile(hZip, spLangPack, dVars['dic_filename']+".json")
    for spf in ["LICENSE.txt", "LICENSE.fr.txt"]:
        hZip.write(spf)
    dVars = _createOptionsForThunderbird(dVars)
    helpers.addFolderToZipAndFileFile(hZip, "gc_lang/"+sLang+"/tb", "", dVars, True)
    spDict = "gc_lang/"+sLang+"/xpi/data/dictionaries"
    for sp in os.listdir(spDict):
        if os.path.isdir(spDict+"/"+sp):

Modified gc_lang/fr/config.ini from [874aca7947] to [a48aa6be74].

12
13
14
15
16
17
18


19
20
21
22
23
24
25
26
27
link = http://grammalecte.net
description = Correcteur grammatical pour le français.
extras = README_fr.txt
logo = logo.png

# lexicon source
lexicon_src = lexicons/French.lex


# binary dictionary name
dic_name = fr
# Finite state automaton compression: 1, 2 (experimental) or 3 (experimental)
fsa_method = 1
# stemming method: S for suffixes only, A for prefixes and suffixes
stemming_method = S

# LibreOffice
unopkg = C:/Program Files/LibreOffice 5/program/unopkg.com







>
>

|







12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
link = http://grammalecte.net
description = Correcteur grammatical pour le français.
extras = README_fr.txt
logo = logo.png

# lexicon source
lexicon_src = lexicons/French.lex
# binary dictionary file name
dic_filename = fr
# binary dictionary name
dic_name = French
# Finite state automaton compression: 1, 2 (experimental) or 3 (experimental)
fsa_method = 1
# stemming method: S for suffixes only, A for prefixes and suffixes
stemming_method = S

# LibreOffice
unopkg = C:/Program Files/LibreOffice 5/program/unopkg.com

Modified gc_lang/fr/modules/tests.py from [f1c386c508] to [43d45242b9].

20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
    return s.replace("\u2019", "'").replace("\u2013", "–").replace("\u2014", "—")


class TestDictionary (unittest.TestCase):

    @classmethod
    def setUpClass (cls):
        cls.oDic = IBDAWG("${dic_name}.bdic")

    def test_lookup (self):
        for sWord in ["branche", "Émilie"]:
            self.assertTrue(self.oDic.lookup(sWord), sWord)

    def test_lookup_failed (self):
        for sWord in ["Branche", "BRANCHE", "BranchE", "BRanche", "BRAnCHE", "émilie"]:







|







20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
    return s.replace("\u2019", "'").replace("\u2013", "–").replace("\u2014", "—")


class TestDictionary (unittest.TestCase):

    @classmethod
    def setUpClass (cls):
        cls.oDic = IBDAWG("${dic_filename}.bdic")

    def test_lookup (self):
        for sWord in ["branche", "Émilie"]:
            self.assertTrue(self.oDic.lookup(sWord), sWord)

    def test_lookup_failed (self):
        for sWord in ["Branche", "BRANCHE", "BranchE", "BRanche", "BRAnCHE", "émilie"]:

Modified gc_lang/fr/webext/panel/lex_editor.js from [eea97bdcc4] to [83999992e4].

596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
        oWidgets.setDictData(lEntry.length, oJSON.sDate);
        oWidgets.showElement("export_button");
    },

    build: function (lEntry) {
        oWidgets.showElement("build_progress");
        let xProgressNode = document.getElementById("build_progress");
        let oDAWG = new DAWG(lEntry, "Français - dictionnaire personnel", "S", xProgressNode);
        this.oJSON = oDAWG.createBinary(1);
        this.save();
        oWidgets.hideElement("build_progress");
        oWidgets.showElement("export_button");
    },

    save: function () {







|







596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
        oWidgets.setDictData(lEntry.length, oJSON.sDate);
        oWidgets.showElement("export_button");
    },

    build: function (lEntry) {
        oWidgets.showElement("build_progress");
        let xProgressNode = document.getElementById("build_progress");
        let oDAWG = new DAWG(lEntry, "S", "fr", "Français", "Dictionnaire personnel", xProgressNode);
        this.oJSON = oDAWG.createBinary(1);
        this.save();
        oWidgets.hideElement("build_progress");
        oWidgets.showElement("export_button");
    },

    save: function () {

Modified graphspell-js/dawg.js from [287e75028f] to [4f989404b7].

24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
...
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
        This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)
        We store suffix/affix codes and tags within the graph after the “real” word.
        A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags]
        Each arc is an index in this.lArcVal, where are stored characters, suffix/affix codes for stemming and tags.
        Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final.
    */

    constructor (lEntrySrc, sLangCode, sLangName, sDicName, cStemming, xProgressBarNode=null) {
        console.log("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====");
        let funcStemmingGen = null;
        switch (cStemming.toUpperCase()) {
            case "A":
                funcStemmingGen = str_transform.defineAffixCode; break;
            case "S":
                funcStemmingGen = str_transform.defineSuffixCode; break;
................................................................................
        let oJSON = {
            "sHeader": "/pyfsa/",
            "sLangCode": this.sLangCode,
            "sLangName": this.sLangName,
            "sDicName": this.sDicName,
            "sFileName": "[none]",
            "sDate": this._getDate(),
            "nEntries": this.nEntry,
            "nChar": this.nChar,
            "nAff": this.nAff,
            "nTag": this.nTag,
            "cStemming": this.cStemming,
            "dChar": helpers.mapToObject(this.dChar),
            "nNode": this.nNode,
            "nArc": this.nArc,







|







 







|







24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
...
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
        This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)
        We store suffix/affix codes and tags within the graph after the “real” word.
        A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags]
        Each arc is an index in this.lArcVal, where are stored characters, suffix/affix codes for stemming and tags.
        Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final.
    */

    constructor (lEntrySrc, cStemming, sLangCode, sLangName="", sDicName="", xProgressBarNode=null) {
        console.log("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====");
        let funcStemmingGen = null;
        switch (cStemming.toUpperCase()) {
            case "A":
                funcStemmingGen = str_transform.defineAffixCode; break;
            case "S":
                funcStemmingGen = str_transform.defineSuffixCode; break;
................................................................................
        let oJSON = {
            "sHeader": "/pyfsa/",
            "sLangCode": this.sLangCode,
            "sLangName": this.sLangName,
            "sDicName": this.sDicName,
            "sFileName": "[none]",
            "sDate": this._getDate(),
            "nEntry": this.nEntry,
            "nChar": this.nChar,
            "nAff": this.nAff,
            "nTag": this.nTag,
            "cStemming": this.cStemming,
            "dChar": helpers.mapToObject(this.dChar),
            "nNode": this.nNode,
            "nArc": this.nArc,

Modified graphspell-js/ibdawg.js from [35c6ada92f] to [4ebdc2968e].

99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
...
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
        }
        catch (e) {
            throw Error("# Error. File not found or not loadable.\n" + e.message + "\n");
        }
        /*
            Properties:
            sName, nCompressionMethod, sHeader, lArcVal, nArcVal, sByDic, sLang, nChar, nBytesArc, nBytesNodeAddress,
            nEntries, nNode, nArc, nAff, cStemming, nTag, dChar, nBytesOffset,
        */

        /*
            Bug workaround.
            Mozilla’s JS parser sucks. Can’t read file bigger than 4 Mb!
            So we convert huge hexadecimal string to list of numbers…
            https://github.com/mozilla/addons-linter/issues/1361
................................................................................
        this.bOptNumAtLast = false;
    }

    getInfo () {
        return  `  Language: ${this.sLangName}   Lang code: ${this.sLangCode}   Dictionary name: ${this.sDicName}\n` +
                `  Compression method: ${this.nCompressionMethod}   Date: ${this.sDate}   Stemming: ${this.cStemming}FX\n` +
                `  Arcs values:  ${this.nArcVal} = ${this.nChar} characters,  ${this.nAff} affixes,  ${this.nTag} tags\n` +
                `  Dictionary: ${this.nEntries} entries,    ${this.nNode} nodes,   ${this.nArc} arcs\n` +
                `  Address size: ${this.nBytesNodeAddress} bytes,  Arc size: ${this.nBytesArc} bytes\n`;
    }

    isValidToken (sToken) {
        // checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked)
        if (this.isValid(sToken)) {
            return true;







|







 







|







99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
...
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
        }
        catch (e) {
            throw Error("# Error. File not found or not loadable.\n" + e.message + "\n");
        }
        /*
            Properties:
            sName, nCompressionMethod, sHeader, lArcVal, nArcVal, sByDic, sLang, nChar, nBytesArc, nBytesNodeAddress,
            nEntry, nNode, nArc, nAff, cStemming, nTag, dChar, nBytesOffset,
        */

        /*
            Bug workaround.
            Mozilla’s JS parser sucks. Can’t read file bigger than 4 Mb!
            So we convert huge hexadecimal string to list of numbers…
            https://github.com/mozilla/addons-linter/issues/1361
................................................................................
        this.bOptNumAtLast = false;
    }

    getInfo () {
        return  `  Language: ${this.sLangName}   Lang code: ${this.sLangCode}   Dictionary name: ${this.sDicName}\n` +
                `  Compression method: ${this.nCompressionMethod}   Date: ${this.sDate}   Stemming: ${this.cStemming}FX\n` +
                `  Arcs values:  ${this.nArcVal} = ${this.nChar} characters,  ${this.nAff} affixes,  ${this.nTag} tags\n` +
                `  Dictionary: ${this.nEntry} entries,    ${this.nNode} nodes,   ${this.nArc} arcs\n` +
                `  Address size: ${this.nBytesNodeAddress} bytes,  Arc size: ${this.nBytesArc} bytes\n`;
    }

    isValidToken (sToken) {
        // checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked)
        if (this.isValid(sToken)) {
            return true;

Modified graphspell/dawg.py from [8edab9530f] to [daf4f76e4f].

37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
...
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
    """DIRECT ACYCLIC WORD GRAPH"""
    # This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)
    # We store suffix/affix codes and tags within the graph after the “real” word.
    # A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags]
    # Each arc is an index in self.lArcVal, where are stored characters, suffix/affix codes for stemming and tags.
    # Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final.

    def __init__ (self, spfSrc, sLangCode, sLangName, sDicName, cStemming):
        print("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====")
        cStemming = cStemming.upper()
        if cStemming == "A":
            funcStemmingGen = st.defineAffixCode
        elif cStemming == "S":
            funcStemmingGen = st.defineSuffixCode
        elif cStemming == "N":
................................................................................
            hDst.write(json.dumps({
                            "sHeader": "/pyfsa/",
                            "sLangCode": self.sLangCode,
                            "sLangName": self.sLangName,
                            "sDicName": self.sDicName,
                            "sFileName": self.sFileName,
                            "sDate": str(datetime.datetime.now())[:-7],
                            "nEntries": self.nEntry,
                            "nChar": self.nChar,
                            "nAff": self.nAff,
                            "nTag": self.nTag,
                            "cStemming": self.cStemming,
                            "dChar": self.dChar,
                            "nNode": self.nNode,
                            "nArc": self.nArc,
                            "nArcVal": self.nArcVal,
                            "lArcVal": self.lArcVal,
                            "nCompressionMethod": nCompressionMethod,
                            "nBytesArc": self.nBytesArc,
                            "nBytesNodeAddress": self.nBytesNodeAddress,
                            "nBytesOffset": self.nBytesOffset
                            # JavaScript is a pile of shit, so Mozilla’s JS parser don’t like file bigger than 4 Mb!
                            # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension.
                            # https://github.com/mozilla/addons-linter/issues/1361
                            "sByDic": byDic.hex()  if bBinaryDictAsHexString  else [ e  for e in byDic ],
                        }, ensure_ascii=False))
            if bInJSModule:
                hDst.write(";\n\nexports.dictionary = dictionary;\n")

    def _writeBinary (self, sPathFile, nCompressionMethod):
        """
        Format of the binary indexable dictionary:







|







 







|












|



|







37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
...
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
    """DIRECT ACYCLIC WORD GRAPH"""
    # This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)
    # We store suffix/affix codes and tags within the graph after the “real” word.
    # A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags]
    # Each arc is an index in self.lArcVal, where are stored characters, suffix/affix codes for stemming and tags.
    # Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final.

    def __init__ (self, spfSrc, cStemming, sLangCode, sLangName="", sDicName=""):
        print("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====")
        cStemming = cStemming.upper()
        if cStemming == "A":
            funcStemmingGen = st.defineAffixCode
        elif cStemming == "S":
            funcStemmingGen = st.defineSuffixCode
        elif cStemming == "N":
................................................................................
            hDst.write(json.dumps({
                            "sHeader": "/pyfsa/",
                            "sLangCode": self.sLangCode,
                            "sLangName": self.sLangName,
                            "sDicName": self.sDicName,
                            "sFileName": self.sFileName,
                            "sDate": str(datetime.datetime.now())[:-7],
                            "nEntry": self.nEntry,
                            "nChar": self.nChar,
                            "nAff": self.nAff,
                            "nTag": self.nTag,
                            "cStemming": self.cStemming,
                            "dChar": self.dChar,
                            "nNode": self.nNode,
                            "nArc": self.nArc,
                            "nArcVal": self.nArcVal,
                            "lArcVal": self.lArcVal,
                            "nCompressionMethod": nCompressionMethod,
                            "nBytesArc": self.nBytesArc,
                            "nBytesNodeAddress": self.nBytesNodeAddress,
                            "nBytesOffset": self.nBytesOffset,
                            # JavaScript is a pile of shit, so Mozilla’s JS parser don’t like file bigger than 4 Mb!
                            # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension.
                            # https://github.com/mozilla/addons-linter/issues/1361
                            "sByDic": byDic.hex()  if bBinaryDictAsHexString  else [ e  for e in byDic ]
                        }, ensure_ascii=False))
            if bInJSModule:
                hDst.write(";\n\nexports.dictionary = dictionary;\n")

    def _writeBinary (self, sPathFile, nCompressionMethod):
        """
        Format of the binary indexable dictionary:

Modified graphspell/ibdawg.py from [433de414a7] to [4820a60f14].

74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103


104
105
106
107
108
109
110
111
112
113
114
...
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170



171
172

173

174
175





176





177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195




196
197
198
199
200
201
202
        self.aSugg.clear()
        self.dSugg.clear()


class IBDAWG:
    """INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH"""

    def __init__ (self, sDicName):
        self.by = pkgutil.get_data(__package__, "_dictionaries/" + sDicName)
        if not self.by:
            raise OSError("# Error. File not found or not loadable: "+sDicName)

        if self.by[0:7] != b"/pyfsa/":
            raise TypeError("# Error. Not a pyfsa binary dictionary. Header: {}".format(self.by[0:9]))
        if not(self.by[7:8] == b"1" or self.by[7:8] == b"2" or self.by[7:8] == b"3"):
            raise ValueError("# Error. Unknown dictionary version: {}".format(self.by[7:8]))
        try:
            header, info, values, bdic = self.by.split(b"\0\0\0\0", 3)
        except Exception:
            raise Exception

        self.sName = sDicName
        self.nCompressionMethod = int(self.by[7:8].decode("utf-8"))
        self.sHeader = header.decode("utf-8")
        self.lArcVal = values.decode("utf-8").split("\t")
        self.nArcVal = len(self.lArcVal)
        self.byDic = bdic

        l = info.decode("utf-8").split("/")
        self.sLang = l[0]


        self.nChar = int(l[1])
        self.nBytesArc = int(l[2])
        self.nBytesNodeAddress = int(l[3])
        self.nEntries = int(l[4])
        self.nNode = int(l[5])
        self.nArc = int(l[6])
        self.nAff = int(l[7])
        self.cStemming = l[8]
        if self.cStemming == "S":
            self.funcStemming = st.changeWordWithSuffixCode
        elif self.cStemming == "A":
................................................................................
        self.bOptNumSigle = False
        self.bOptNumAtLast = False

    def getInfo (self):
        return  "  Language: {0.sLangName}   Lang code: {0.sLangCode}   Dictionary name: {0.sDicName}" \
                "  Compression method: {0.nCompressionMethod:>2}   Date: {0.sDate}   Stemming: {0.cStemming}FX\n" \
                "  Arcs values:  {0.nArcVal:>10,} = {0.nChar:>5,} characters,  {0.nAff:>6,} affixes,  {0.nTag:>6,} tags\n" \
                "  Dictionary: {0.nEntries:>12,} entries,    {0.nNode:>11,} nodes,   {0.nArc:>11,} arcs\n" \
                "  Address size: {0.nBytesNodeAddress:>1} bytes,  Arc size: {0.nBytesArc:>1} bytes\n".format(self)

    def writeAsJSObject (self, spfDest, bInJSModule=False, bBinaryDictAsHexString=False):
        "write IBDAWG as a JavaScript object in a JavaScript module"
        import json
        with open(spfDest, "w", encoding="utf-8", newline="\n") as hDst:
            if bInJSModule:
                hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ')
            hDst.write(json.dumps({



                            "sName": self.sName,
                            "nCompressionMethod": self.nCompressionMethod,

                            "sDate": str(datetime.datetime.now())[:-7],

                            "sHeader": self.sHeader,
                            "lArcVal": self.lArcVal,





                            "nArcVal": self.nArcVal,





                            # JavaScript is a pile of shit, so Mozilla’s JS parser don’t like file bigger than 4 Mb!
                            # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension.
                            # https://github.com/mozilla/addons-linter/issues/1361
                            "byDic": self.byDic.hex()  if bBinaryDictAsHexString  else [ e  for e in self.byDic ],
                            "sLang": self.sLang,
                            "nChar": self.nChar,
                            "nBytesArc": self.nBytesArc,
                            "nBytesNodeAddress": self.nBytesNodeAddress,
                            "nEntries": self.nEntries,
                            "nNode": self.nNode,
                            "nArc": self.nArc,
                            "nAff": self.nAff,
                            "cStemming": self.cStemming,
                            "nTag": self.nTag,
                            "dChar": self.dChar,
                            "nBytesOffset": self.nBytesOffset
                        }, ensure_ascii=False))
            if bInJSModule:
                hDst.write(";\n\nexports.dictionary = dictionary;\n")





    def isValidToken (self, sToken):
        "checks if <sToken> is valid (if there is hyphens in <sToken>, <sToken> is split, each part is checked)"
        if self.isValid(sToken):
            return True
        if "-" in sToken:
            if sToken.count("-") > 4:







|
|

|










|







|
>
>



|







 







|









>
>
>
|
<
>

>
|
|
>
>
>
>
>

>
>
>
>
>



|
<
<
<
<
<
<
<
<
<
<
<
<



>
>
>
>







74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
...
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176

177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196












197
198
199
200
201
202
203
204
205
206
207
208
209
210
        self.aSugg.clear()
        self.dSugg.clear()


class IBDAWG:
    """INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH"""

    def __init__ (self, sfDict):
        self.by = pkgutil.get_data(__package__, "_dictionaries/" + sfDict)
        if not self.by:
            raise OSError("# Error. File not found or not loadable: "+sfDict)

        if self.by[0:7] != b"/pyfsa/":
            raise TypeError("# Error. Not a pyfsa binary dictionary. Header: {}".format(self.by[0:9]))
        if not(self.by[7:8] == b"1" or self.by[7:8] == b"2" or self.by[7:8] == b"3"):
            raise ValueError("# Error. Unknown dictionary version: {}".format(self.by[7:8]))
        try:
            header, info, values, bdic = self.by.split(b"\0\0\0\0", 3)
        except Exception:
            raise Exception

        self.sFileName = sfDict
        self.nCompressionMethod = int(self.by[7:8].decode("utf-8"))
        self.sHeader = header.decode("utf-8")
        self.lArcVal = values.decode("utf-8").split("\t")
        self.nArcVal = len(self.lArcVal)
        self.byDic = bdic

        l = info.decode("utf-8").split("/")
        self.sLangCode = "xx"
        self.sLangName = l[0]
        self.sDicName = ""
        self.nChar = int(l[1])
        self.nBytesArc = int(l[2])
        self.nBytesNodeAddress = int(l[3])
        self.nEntry = int(l[4])
        self.nNode = int(l[5])
        self.nArc = int(l[6])
        self.nAff = int(l[7])
        self.cStemming = l[8]
        if self.cStemming == "S":
            self.funcStemming = st.changeWordWithSuffixCode
        elif self.cStemming == "A":
................................................................................
        self.bOptNumSigle = False
        self.bOptNumAtLast = False

    def getInfo (self):
        return  "  Language: {0.sLangName}   Lang code: {0.sLangCode}   Dictionary name: {0.sDicName}" \
                "  Compression method: {0.nCompressionMethod:>2}   Date: {0.sDate}   Stemming: {0.cStemming}FX\n" \
                "  Arcs values:  {0.nArcVal:>10,} = {0.nChar:>5,} characters,  {0.nAff:>6,} affixes,  {0.nTag:>6,} tags\n" \
                "  Dictionary: {0.nEntry:>12,} entries,    {0.nNode:>11,} nodes,   {0.nArc:>11,} arcs\n" \
                "  Address size: {0.nBytesNodeAddress:>1} bytes,  Arc size: {0.nBytesArc:>1} bytes\n".format(self)

    def writeAsJSObject (self, spfDest, bInJSModule=False, bBinaryDictAsHexString=False):
        "write IBDAWG as a JavaScript object in a JavaScript module"
        import json
        with open(spfDest, "w", encoding="utf-8", newline="\n") as hDst:
            if bInJSModule:
                hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ')
            hDst.write(json.dumps({
                            "sHeader": "/pyfsa/",
                            "sLangCode": self.sLangCode,
                            "sLangName": self.sLangName,
                            "sDicName": self.sDicName,

                            "sFileName": self.sFileName,
                            "sDate": str(datetime.datetime.now())[:-7],
                            "nEntry": self.nEntry,
                            "nChar": self.nChar,
                            "nAff": self.nAff,
                            "nTag": self.nTag,
                            "cStemming": self.cStemming,
                            "dChar": self.dChar,
                            "nNode": self.nNode,
                            "nArc": self.nArc,
                            "nArcVal": self.nArcVal,
                            "lArcVal": self.lArcVal,
                            "nCompressionMethod": self.nCompressionMethod,
                            "nBytesArc": self.nBytesArc,
                            "nBytesNodeAddress": self.nBytesNodeAddress,
                            "nBytesOffset": self.nBytesOffset,
                            # JavaScript is a pile of shit, so Mozilla’s JS parser don’t like file bigger than 4 Mb!
                            # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension.
                            # https://github.com/mozilla/addons-linter/issues/1361
                            "sByDic": self.byDic.hex()  if bBinaryDictAsHexString  else [ e  for e in self.byDic ]












                        }, ensure_ascii=False))
            if bInJSModule:
                hDst.write(";\n\nexports.dictionary = dictionary;\n")

                            
                            


    def isValidToken (self, sToken):
        "checks if <sToken> is valid (if there is hyphens in <sToken>, <sToken> is split, each part is checked)"
        if self.isValid(sToken):
            return True
        if "-" in sToken:
            if sToken.count("-") > 4:

Modified lex_build.py from [57c70320f0] to [41cc230f34].

5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26

27
28
29
30
31
32
33
34
35
36
37
import argparse
from distutils import dir_util

import graphspell.dawg as fsa
from graphspell.ibdawg import IBDAWG


def build (spfSrc, sLangName, sDicName, bJSON=False, cStemmingMethod="S", nCompressMethod=1):
    "transform a text lexicon as a binary indexable dictionary"
    oDAWG = fsa.DAWG(spfSrc, sLangName, cStemmingMethod)
    dir_util.mkpath("graphspell/_dictionaries")
    oDAWG.writeInfo("graphspell/_dictionaries/" + sDicName + ".info.txt")
    oDAWG.createBinary("graphspell/_dictionaries/" + sDicName + ".bdic", int(nCompressMethod))
    if bJSON:
        dir_util.mkpath("graphspell-js/_dictionaries")
        oDic = IBDAWG(sDicName + ".bdic")
        oDic.writeAsJSObject("graphspell-js/_dictionaries/" + sDicName + ".json", bBinaryDictAsHexString=True)


def main ():
    xParser = argparse.ArgumentParser()
    xParser.add_argument("src_lexicon", type=str, help="path and file name of the source lexicon")

    xParser.add_argument("lang_name", type=str, help="language name")
    xParser.add_argument("dic_name", type=str, help="dictionary file name (without extension)")
    xParser.add_argument("-js", "--json", help="Build dictionary in JSON", action="store_true")
    xParser.add_argument("-s", "--stemming", help="stemming method: S=suffixes, A=affixes, N=no stemming", type=str, choices=["S", "A", "N"], default="S")
    xParser.add_argument("-c", "--compress", help="compression method: 1, 2 (beta), 3, (beta)", type=int, choices=[1, 2, 3], default=1)
    xArgs = xParser.parse_args()
    build(xArgs.src_lexicon, xArgs.lang_name, xArgs.dic_name, xArgs.json)
    

if __name__ == '__main__':
    main()







|

|

|
|


|
|





>

|




|




5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import argparse
from distutils import dir_util

import graphspell.dawg as fsa
from graphspell.ibdawg import IBDAWG


def build (spfSrc, sLangCode, sLangName, sfDict, bJSON=False, sDicName="", cStemmingMethod="S", nCompressMethod=1):
    "transform a text lexicon as a binary indexable dictionary"
    oDAWG = fsa.DAWG(spfSrc, cStemmingMethod, sLangCode, sLangName, sDicName)
    dir_util.mkpath("graphspell/_dictionaries")
    oDAWG.writeInfo("graphspell/_dictionaries/" + sfDict + ".info.txt")
    oDAWG.createBinary("graphspell/_dictionaries/" + sfDict + ".bdic", int(nCompressMethod))
    if bJSON:
        dir_util.mkpath("graphspell-js/_dictionaries")
        oDic = IBDAWG(sfDict + ".bdic")
        oDic.writeAsJSObject("graphspell-js/_dictionaries/" + sfDict + ".json", bBinaryDictAsHexString=True)


def main ():
    xParser = argparse.ArgumentParser()
    xParser.add_argument("src_lexicon", type=str, help="path and file name of the source lexicon")
    xParser.add_argument("lang_code", type=str, help="language code")
    xParser.add_argument("lang_name", type=str, help="language name")
    xParser.add_argument("dic_filename", type=str, help="dictionary file name (without extension)")
    xParser.add_argument("-js", "--json", help="Build dictionary in JSON", action="store_true")
    xParser.add_argument("-s", "--stemming", help="stemming method: S=suffixes, A=affixes, N=no stemming", type=str, choices=["S", "A", "N"], default="S")
    xParser.add_argument("-c", "--compress", help="compression method: 1, 2 (beta), 3, (beta)", type=int, choices=[1, 2, 3], default=1)
    xArgs = xParser.parse_args()
    build(xArgs.src_lexicon, xArgs.lang_code, xArgs.lang_name, xArgs.dic_filename, xArgs.json)
    

if __name__ == '__main__':
    main()

Modified make.py from [72eca8b4d4] to [b310b82ca7].

74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
...
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
...
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319

320
321
322
323
324
325
326
def createOXT (spLang, dVars, dOxt, spLangPack, bInstall):
    "create extension for Writer"
    print("Building extension for Writer")
    spfZip = "_build/" + dVars['name'] + "-"+ dVars['lang'] +"-v" + dVars['version'] + '.oxt'
    hZip = zipfile.ZipFile(spfZip, mode='w', compression=zipfile.ZIP_DEFLATED)

    # Package and parser
    copyGrammalectePyPackageInZipFile(hZip, spLangPack, dVars['dic_name']+".bdic", "pythonpath/")
    hZip.write("grammalecte-cli.py", "pythonpath/grammalecte-cli.py")

    # Extension files
    hZip.writestr("META-INF/manifest.xml", helpers.fileFile("gc_core/py/oxt/manifest.xml", dVars))
    hZip.writestr("description.xml", helpers.fileFile("gc_core/py/oxt/description.xml", dVars))
    hZip.writestr("Linguistic.xcu", helpers.fileFile("gc_core/py/oxt/Linguistic.xcu", dVars))
    hZip.writestr("Grammalecte.py", helpers.fileFile("gc_core/py/oxt/Grammalecte.py", dVars))
................................................................................
        hDst.write("html = 1\n")


def createPackageZip (sLang, dVars, spLangPack):
    "create server zip"
    spfZip = "_build/" + dVars['name'] + "-"+ dVars['lang'] +"-v" + dVars['version'] + '.zip'
    hZip = zipfile.ZipFile(spfZip, mode='w', compression=zipfile.ZIP_DEFLATED)
    copyGrammalectePyPackageInZipFile(hZip, spLangPack, dVars['dic_name']+".bdic")
    for spf in ["grammalecte-cli.py", "grammalecte-server.py", "bottle.py", \
                "grammalecte-server-options._global.ini", "grammalecte-server-options."+sLang+".ini", \
                "README.txt", "LICENSE.txt", "LICENSE.fr.txt"]:
        hZip.write(spf)
    hZip.writestr("setup.py", helpers.fileFile("gc_lang/fr/setup.py", dVars))


def copyGrammalectePyPackageInZipFile (hZip, spLangPack, sDicName, sAddPath=""):
    for sf in os.listdir("grammalecte"):
        if not os.path.isdir("grammalecte/"+sf):
            hZip.write("grammalecte/"+sf, sAddPath+"grammalecte/"+sf)
    for sf in os.listdir("grammalecte/graphspell"):
        if not os.path.isdir("grammalecte/graphspell/"+sf):
            hZip.write("grammalecte/graphspell/"+sf, sAddPath+"grammalecte/graphspell/"+sf)
    hZip.write("grammalecte/graphspell/_dictionaries/"+sDicName, sAddPath+"grammalecte/graphspell/_dictionaries/"+sDicName)
    for sf in os.listdir(spLangPack):
        if not os.path.isdir(spLangPack+"/"+sf):
            hZip.write(spLangPack+"/"+sf, sAddPath+spLangPack+"/"+sf)


def create (sLang, xConfig, bInstallOXT, bJavaScript):
    oNow = datetime.datetime.now()
................................................................................
        for sf in os.listdir("graphspell-js"):
            if not os.path.isdir("graphspell-js/"+sf):
                file_util.copy_file("graphspell-js/"+sf, "grammalecte-js/graphspell")
                helpers.copyAndFileTemplate("graphspell-js/"+sf, "grammalecte-js/graphspell/"+sf, dVars)


def copyGraphspellDictionary (dVars, bJavaScript=False):
    spfPyDic = "graphspell/_dictionaries/"+dVars['dic_name']+".bdic"
    spfJSDic = "graphspell-js/_dictionaries/"+dVars['dic_name']+".json"
    if not os.path.isfile(spfPyDic) or (bJavaScript and not os.path.isfile(spfJSDic)):
        buildDictionary(dVars, bJavaScript)
    file_util.copy_file(spfPyDic, "grammalecte/graphspell/_dictionaries")
    file_util.copy_file(spfPyDic[:-5]+".info.txt", "grammalecte/graphspell/_dictionaries")
    if bJavaScript:
        file_util.copy_file(spfJSDic, "grammalecte-js/graphspell/_dictionaries")


def buildDictionary (dVars, bJavaScript):
    lex_build.build(dVars['lexicon_src'], dVars['lang_name'], dVars['dic_name'], bJavaScript, dVars['stemming_method'], int(dVars['fsa_method']))



def main ():
    print("Python: " + sys.version)
    xParser = argparse.ArgumentParser()
    xParser.add_argument("lang", type=str, nargs='+', help="lang project to generate (name of folder in /lang)")
    xParser.add_argument("-b", "--build_data", help="launch build_data.py (part 1 and 2)", action="store_true")







|







 







|







|






|







 







|
|









|
>







74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
...
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
...
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
def createOXT (spLang, dVars, dOxt, spLangPack, bInstall):
    "create extension for Writer"
    print("Building extension for Writer")
    spfZip = "_build/" + dVars['name'] + "-"+ dVars['lang'] +"-v" + dVars['version'] + '.oxt'
    hZip = zipfile.ZipFile(spfZip, mode='w', compression=zipfile.ZIP_DEFLATED)

    # Package and parser
    copyGrammalectePyPackageInZipFile(hZip, spLangPack, dVars['dic_filename']+".bdic", "pythonpath/")
    hZip.write("grammalecte-cli.py", "pythonpath/grammalecte-cli.py")

    # Extension files
    hZip.writestr("META-INF/manifest.xml", helpers.fileFile("gc_core/py/oxt/manifest.xml", dVars))
    hZip.writestr("description.xml", helpers.fileFile("gc_core/py/oxt/description.xml", dVars))
    hZip.writestr("Linguistic.xcu", helpers.fileFile("gc_core/py/oxt/Linguistic.xcu", dVars))
    hZip.writestr("Grammalecte.py", helpers.fileFile("gc_core/py/oxt/Grammalecte.py", dVars))
................................................................................
        hDst.write("html = 1\n")


def createPackageZip (sLang, dVars, spLangPack):
    "create server zip"
    spfZip = "_build/" + dVars['name'] + "-"+ dVars['lang'] +"-v" + dVars['version'] + '.zip'
    hZip = zipfile.ZipFile(spfZip, mode='w', compression=zipfile.ZIP_DEFLATED)
    copyGrammalectePyPackageInZipFile(hZip, spLangPack, dVars['dic_filename']+".bdic")
    for spf in ["grammalecte-cli.py", "grammalecte-server.py", "bottle.py", \
                "grammalecte-server-options._global.ini", "grammalecte-server-options."+sLang+".ini", \
                "README.txt", "LICENSE.txt", "LICENSE.fr.txt"]:
        hZip.write(spf)
    hZip.writestr("setup.py", helpers.fileFile("gc_lang/fr/setup.py", dVars))


def copyGrammalectePyPackageInZipFile (hZip, spLangPack, sfDict, sAddPath=""):
    for sf in os.listdir("grammalecte"):
        if not os.path.isdir("grammalecte/"+sf):
            hZip.write("grammalecte/"+sf, sAddPath+"grammalecte/"+sf)
    for sf in os.listdir("grammalecte/graphspell"):
        if not os.path.isdir("grammalecte/graphspell/"+sf):
            hZip.write("grammalecte/graphspell/"+sf, sAddPath+"grammalecte/graphspell/"+sf)
    hZip.write("grammalecte/graphspell/_dictionaries/"+sfDict, sAddPath+"grammalecte/graphspell/_dictionaries/"+sfDict)
    for sf in os.listdir(spLangPack):
        if not os.path.isdir(spLangPack+"/"+sf):
            hZip.write(spLangPack+"/"+sf, sAddPath+spLangPack+"/"+sf)


def create (sLang, xConfig, bInstallOXT, bJavaScript):
    oNow = datetime.datetime.now()
................................................................................
        for sf in os.listdir("graphspell-js"):
            if not os.path.isdir("graphspell-js/"+sf):
                file_util.copy_file("graphspell-js/"+sf, "grammalecte-js/graphspell")
                helpers.copyAndFileTemplate("graphspell-js/"+sf, "grammalecte-js/graphspell/"+sf, dVars)


def copyGraphspellDictionary (dVars, bJavaScript=False):
    spfPyDic = "graphspell/_dictionaries/"+dVars['dic_filename']+".bdic"
    spfJSDic = "graphspell-js/_dictionaries/"+dVars['dic_filename']+".json"
    if not os.path.isfile(spfPyDic) or (bJavaScript and not os.path.isfile(spfJSDic)):
        buildDictionary(dVars, bJavaScript)
    file_util.copy_file(spfPyDic, "grammalecte/graphspell/_dictionaries")
    file_util.copy_file(spfPyDic[:-5]+".info.txt", "grammalecte/graphspell/_dictionaries")
    if bJavaScript:
        file_util.copy_file(spfJSDic, "grammalecte-js/graphspell/_dictionaries")


def buildDictionary (dVars, bJavaScript):
    lex_build.build(dVars['lexicon_src'], dVars['lang'], dVars['lang_name'], dVars['dic_filename'], \
                    bJavaScript, dVars['dic_name'], dVars['stemming_method'], int(dVars['fsa_method']))


def main ():
    print("Python: " + sys.version)
    xParser = argparse.ArgumentParser()
    xParser.add_argument("lang", type=str, nargs='+', help="lang project to generate (name of folder in /lang)")
    xParser.add_argument("-b", "--build_data", help="launch build_data.py (part 1 and 2)", action="store_true")