Grammalecte  Check-in [8333a8bf1b]

Overview
Comment:[graphspell] new header <grammalecte-fsa> for binary dictionaries
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | graphspell | multid
Files: files | file ages | folders
SHA3-256: 8333a8bf1bb8a85bc0b361a59d2fe3c81874c2aef94c4879717f49044168f011
User & Date: olr on 2018-04-01 08:16:11
Other Links: branch diff | manifest | tags
Context
2018-04-01
09:31
[fr][bug] conj: test if tTags exists check-in: a89587a82c user: olr tags: fr, multid
08:16
[graphspell] new header <grammalecte-fsa> for binary dictionaries check-in: 8333a8bf1b user: olr tags: graphspell, multid
08:01
[fx] main panel: dictionaries check-in: 24a9f4dab6 user: olr tags: fx, multid
Changes

Modified graphspell-js/dawg.js from [e2cd530970] to [3711bc314d].

372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
        if (nCompressionMethod == 1) {
            sByDic = this.oRoot.convToBytes1(this.nBytesArc, this.nBytesNodeAddress);
            for (let oNode of this.dMinimizedNodes.values()) {
                sByDic += oNode.convToBytes1(this.nBytesArc, this.nBytesNodeAddress);
            }
        }
        let oJSON = {
            "sHeader": "/pyfsa/",
            "sLangCode": this.sLangCode,
            "sLangName": this.sLangName,
            "sDicName": this.sDicName,
            "sFileName": "[none]",
            "sDate": this._getDate(),
            "nEntry": this.nEntry,
            "nChar": this.nChar,







|







372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
        if (nCompressionMethod == 1) {
            sByDic = this.oRoot.convToBytes1(this.nBytesArc, this.nBytesNodeAddress);
            for (let oNode of this.dMinimizedNodes.values()) {
                sByDic += oNode.convToBytes1(this.nBytesArc, this.nBytesNodeAddress);
            }
        }
        let oJSON = {
            "sHeader": "/grammalecte-fsa/",
            "sLangCode": this.sLangCode,
            "sLangName": this.sLangName,
            "sDicName": this.sDicName,
            "sFileName": "[none]",
            "sDate": this._getDate(),
            "nEntry": this.nEntry,
            "nChar": this.nChar,

Modified graphspell-js/ibdawg.js from [050f6e0036] to [1b3dff6227].

115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
...
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
        for (let i = 0;  i < this.sByDic.length;  i+=2) {
            lTemp.push(parseInt(this.sByDic.slice(i, i+2), 16));
        }
        this.byDic = lTemp;
        //this.byDic = new Uint8Array(lTemp);  // not quicker, even slower
        /* end of bug workaround */

        if (!this.sHeader.startsWith("/pyfsa/")) {
            throw TypeError("# Error. Not a pyfsa binary dictionary. Header: " + this.sHeader);
        }
        if (!(this.nCompressionMethod == 1 || this.nCompressionMethod == 2 || this.nCompressionMethod == 3)) {
            throw RangeError("# Error. Unknown dictionary compression method: " + this.nCompressionMethod);
        }
        // <dChar> to get the value of an arc, <dCharVal> to get the char of an arc with its value
        this.dChar = helpers.objectToMap(this.dChar);
        this.dCharVal = this.dChar.gl_reverse();
................................................................................
                `  Arcs values:  ${this.nArcVal} = ${this.nChar} characters,  ${this.nAff} affixes,  ${this.nTag} tags\n` +
                `  Dictionary: ${this.nEntry} entries,    ${this.nNode} nodes,   ${this.nArc} arcs\n` +
                `  Address size: ${this.nBytesNodeAddress} bytes,  Arc size: ${this.nBytesArc} bytes\n`;
    }

    getJSON () {
        let oJSON = {
            "sHeader": "/pyfsa/",
            "sLangCode": this.sLangCode,
            "sLangName": this.sLangName,
            "sDicName": this.sDicName,
            "sFileName": this.sFileName,
            "sDate": this.sDate,
            "nEntry": this.nEntry,
            "nChar": this.nChar,







|
|







 







|







115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
...
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
        for (let i = 0;  i < this.sByDic.length;  i+=2) {
            lTemp.push(parseInt(this.sByDic.slice(i, i+2), 16));
        }
        this.byDic = lTemp;
        //this.byDic = new Uint8Array(lTemp);  // not quicker, even slower
        /* end of bug workaround */

        if (!this.sHeader.startsWith("/grammalecte-fsa/")) {
            throw TypeError("# Error. Not a grammalecte-fsa binary dictionary. Header: " + this.sHeader);
        }
        if (!(this.nCompressionMethod == 1 || this.nCompressionMethod == 2 || this.nCompressionMethod == 3)) {
            throw RangeError("# Error. Unknown dictionary compression method: " + this.nCompressionMethod);
        }
        // <dChar> to get the value of an arc, <dCharVal> to get the char of an arc with its value
        this.dChar = helpers.objectToMap(this.dChar);
        this.dCharVal = this.dChar.gl_reverse();
................................................................................
                `  Arcs values:  ${this.nArcVal} = ${this.nChar} characters,  ${this.nAff} affixes,  ${this.nTag} tags\n` +
                `  Dictionary: ${this.nEntry} entries,    ${this.nNode} nodes,   ${this.nArc} arcs\n` +
                `  Address size: ${this.nBytesNodeAddress} bytes,  Arc size: ${this.nBytesArc} bytes\n`;
    }

    getJSON () {
        let oJSON = {
            "sHeader": "/grammalecte-fsa/",
            "sLangCode": this.sLangCode,
            "sLangName": this.sLangName,
            "sDicName": this.sDicName,
            "sFileName": this.sFileName,
            "sDate": this.sDate,
            "nEntry": this.nEntry,
            "nChar": this.nChar,

Modified graphspell/dawg.py from [7cd1b4dc56] to [63684196d2].

396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
...
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
...
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
            for oNode in self.lSortedNodes:
                byDic += oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)
        elif nCompressionMethod == 3:
            byDic = self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)
            for oNode in self.lSortedNodes:
                byDic += oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)
        return {
            "sHeader": "/pyfsa/",
            "sLangCode": self.sLangCode,
            "sLangName": self.sLangName,
            "sDicName": self.sDicName,
            "sFileName": self.sFileName,
            "sDate": self._getDate(),
            "nEntry": self.nEntry,
            "nChar": self.nChar,
................................................................................

    def writeBinary (self, sPathFile, nCompressionMethod, bDebug=False):
        """
        Format of the binary indexable dictionary:
        Each section is separated with 4 bytes of \0
        
        - Section Header:
            /pyfsa/[compression method]
                * compression method is an ASCII string
        
        - Section Informations:
            /[lang code]
            /[lang name]
            /[dictionary name]
            /[date creation]
................................................................................
                  See DawgNode.convToBytes() for details.
        """
        self._calculateBinary(nCompressionMethod)
        if not sPathFile.endswith(".bdic"):
            sPathFile += "."+str(nCompressionMethod)+".bdic"
        with open(sPathFile, 'wb') as hDst:
            # header
            hDst.write("/pyfsa/{}/".format(nCompressionMethod).encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # infos
            sInfo = "{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//".format(self.sLangCode, self.sLangName, self.sDicName, self._getDate(), \
                                                                              self.nChar, self.nBytesArc, self.nBytesNodeAddress, \
                                                                              self.nEntry, self.nNode, self.nArc, self.nAff, self.cStemming)
            hDst.write(sInfo.encode("utf-8"))
            hDst.write(b"\0\0\0\0")







|







 







|







 







|







396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
...
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
...
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
            for oNode in self.lSortedNodes:
                byDic += oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress)
        elif nCompressionMethod == 3:
            byDic = self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)
            for oNode in self.lSortedNodes:
                byDic += oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset)
        return {
            "sHeader": "/grammalecte-fsa/",
            "sLangCode": self.sLangCode,
            "sLangName": self.sLangName,
            "sDicName": self.sDicName,
            "sFileName": self.sFileName,
            "sDate": self._getDate(),
            "nEntry": self.nEntry,
            "nChar": self.nChar,
................................................................................

    def writeBinary (self, sPathFile, nCompressionMethod, bDebug=False):
        """
        Format of the binary indexable dictionary:
        Each section is separated with 4 bytes of \0
        
        - Section Header:
            /grammalecte-fsa/[compression method]
                * compression method is an ASCII string
        
        - Section Informations:
            /[lang code]
            /[lang name]
            /[dictionary name]
            /[date creation]
................................................................................
                  See DawgNode.convToBytes() for details.
        """
        self._calculateBinary(nCompressionMethod)
        if not sPathFile.endswith(".bdic"):
            sPathFile += "."+str(nCompressionMethod)+".bdic"
        with open(sPathFile, 'wb') as hDst:
            # header
            hDst.write("/grammalecte-fsa/{}/".format(nCompressionMethod).encode("utf-8"))
            hDst.write(b"\0\0\0\0")
            # infos
            sInfo = "{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//{}//".format(self.sLangCode, self.sLangName, self.sDicName, self._getDate(), \
                                                                              self.nChar, self.nBytesArc, self.nBytesNodeAddress, \
                                                                              self.nEntry, self.nNode, self.nArc, self.nAff, self.cStemming)
            hDst.write(sInfo.encode("utf-8"))
            hDst.write(b"\0\0\0\0")

Modified graphspell/ibdawg.py from [b794aeec36] to [f523996e8f].

132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
...
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
            raise ValueError("  # Error: unknown code: {}".format(self.nCompressionMethod))

        self.bOptNumSigle = False
        self.bOptNumAtLast = False

    def _initBinary (self):
        "initialize with binary structure file"
        if self.by[0:7] != b"/pyfsa/":
            raise TypeError("# Error. Not a pyfsa binary dictionary. Header: {}".format(self.by[0:9]))
        if not(self.by[7:8] == b"1" or self.by[7:8] == b"2" or self.by[7:8] == b"3"):
            raise ValueError("# Error. Unknown dictionary version: {}".format(self.by[7:8]))
        try:
            header, info, values, bdic = self.by.split(b"\0\0\0\0", 3)
        except Exception:
            raise Exception
        
        self.nCompressionMethod = int(self.by[7:8].decode("utf-8"))
        self.sHeader = header.decode("utf-8")
        self.lArcVal = values.decode("utf-8").split("\t")
        self.nArcVal = len(self.lArcVal)
        self.byDic = bdic

        l = info.decode("utf-8").split("//")
        self.sLangCode = l.pop(0)
................................................................................

    def writeAsJSObject (self, spfDest, bInJSModule=False, bBinaryDictAsHexString=False):
        "write IBDAWG as a JavaScript object in a JavaScript module"
        with open(spfDest, "w", encoding="utf-8", newline="\n") as hDst:
            if bInJSModule:
                hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ')
            hDst.write(json.dumps({
                            "sHeader": "/pyfsa/",
                            "sLangCode": self.sLangCode,
                            "sLangName": self.sLangName,
                            "sDicName": self.sDicName,
                            "sFileName": self.sFileName,
                            "sDate": self.sDate,
                            "nEntry": self.nEntry,
                            "nChar": self.nChar,







|
|
|
|





|







 







|







132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
...
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
            raise ValueError("  # Error: unknown code: {}".format(self.nCompressionMethod))

        self.bOptNumSigle = False
        self.bOptNumAtLast = False

    def _initBinary (self):
        "initialize with binary structure file"
        if self.by[0:17] != b"/grammalecte-fsa/":
            raise TypeError("# Error. Not a grammalecte-fsa binary dictionary. Header: {}".format(self.by[0:9]))
        if not(self.by[17:18] == b"1" or self.by[17:18] == b"2" or self.by[17:18] == b"3"):
            raise ValueError("# Error. Unknown dictionary version: {}".format(self.by[17:18]))
        try:
            header, info, values, bdic = self.by.split(b"\0\0\0\0", 3)
        except Exception:
            raise Exception
        
        self.nCompressionMethod = int(self.by[17:18].decode("utf-8"))
        self.sHeader = header.decode("utf-8")
        self.lArcVal = values.decode("utf-8").split("\t")
        self.nArcVal = len(self.lArcVal)
        self.byDic = bdic

        l = info.decode("utf-8").split("//")
        self.sLangCode = l.pop(0)
................................................................................

    def writeAsJSObject (self, spfDest, bInJSModule=False, bBinaryDictAsHexString=False):
        "write IBDAWG as a JavaScript object in a JavaScript module"
        with open(spfDest, "w", encoding="utf-8", newline="\n") as hDst:
            if bInJSModule:
                hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ')
            hDst.write(json.dumps({
                            "sHeader": "/grammalecte-fsa/",
                            "sLangCode": self.sLangCode,
                            "sLangName": self.sLangName,
                            "sDicName": self.sDicName,
                            "sFileName": self.sFileName,
                            "sDate": self.sDate,
                            "nEntry": self.nEntry,
                            "nChar": self.nChar,