Grammalecte  Check-in [866ec22f7d]

Overview
Comment:[graphspell][py] ibdawg: remove binary dict support
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | graphspell | dict2
Files: files | file ages | folders
SHA3-256: 866ec22f7d45c0d8a1688402451a3057e28589c24fe070a78c5567d19bbdbe80
User & Date: olr on 2020-11-04 12:21:12
Other Links: branch diff | manifest | tags
Context
2020-11-04
12:37
[graphspell][py] remove duplicate method check-in: a3980c3ca4 user: olr tags: dict2, graphspell
12:21
[graphspell][py] ibdawg: remove binary dict support check-in: 866ec22f7d user: olr tags: dict2, graphspell
12:02
[graphspell] ibdawg: code cleaning, remove old code, useless compression versions check-in: 86250e8e6c user: olr tags: dict2, graphspell
Changes

Modified graphspell/ibdawg.py from [2cf8f6a51c] to [0da0287637].

110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
...
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227


class IBDAWG:
    """INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH"""

    def __init__ (self, source):
        if isinstance(source, str):
            self.by = pkgutil.get_data(__package__, "_dictionaries/" + source)
            if not self.by:
                raise OSError("# Error. File not found or not loadable: "+source)

            if source.endswith(".bdic"):
                self._initBinary()
            elif source.endswith(".json"):
                self._initJSON(json.loads(self.by.decode("utf-8")))     #json.loads(self.by)    # In Python 3.6, can read directly binary strings
            else:
                raise OSError("# Error. Unknown file type: "+source)
        else:
            self._initJSON(source)

        self.sFileName = source  if isinstance(source, str)  else "[None]"

        # Performance trick:
        #     Instead of converting bytes to integers each times we parse the binary dictionary,
        #     we do it once, then parse the array
        nAcc = 0
        byBuffer = b""
        self.lByDic = []
................................................................................
        # lexicographer module ?
        self.lexicographer = None
        try:
            self.lexicographer = importlib.import_module(".lexgraph_"+self.sLangCode, "grammalecte.graphspell")
        except ImportError:
            print("# No module <graphspell.lexgraph_"+self.sLangCode+".py>")


    def _initBinary (self):
        "initialize with binary structure file"
        if self.by[0:17] != b"/grammalecte-fsa/":
            raise TypeError("# Error. Not a grammalecte-fsa binary dictionary. Header: {}".format(self.by[0:9]))
        if not(self.by[17:18] == b"1" or self.by[17:18] == b"2" or self.by[17:18] == b"3"):
            raise ValueError("# Error. Unknown dictionary version: {}".format(self.by[17:18]))
        try:
            byHeader, byInfo, byValues, by2grams, byDic = self.by.split(b"\0\0\0\0", 4)
        except Exception:
            raise Exception

        self.nCompressionMethod = int(self.by[17:18].decode("utf-8"))
        self.sHeader = byHeader.decode("utf-8")
        self.lArcVal = byValues.decode("utf-8").split("\t")
        self.nArcVal = len(self.lArcVal)
        self.byDic = byDic
        self.a2grams = set(by2grams.decode("utf-8").split("\t"))

        l = byInfo.decode("utf-8").split("//")
        self.sLangCode = l.pop(0)
        self.sLangName = l.pop(0)
        self.sDicName = l.pop(0)
        self.sDescription = l.pop(0)
        self.sDate = l.pop(0)
        self.nChar = int(l.pop(0))
        self.nBytesArc = int(l.pop(0))
        self.nBytesNodeAddress = int(l.pop(0))
        self.nEntry = int(l.pop(0))
        self.nNode = int(l.pop(0))
        self.nArc = int(l.pop(0))
        self.nAff = int(l.pop(0))
        self.cStemming = l.pop(0)
        self.nTag = self.nArcVal - self.nChar - self.nAff
        # <dChar> to get the value of an arc, <dCharVal> to get the char of an arc with its value
        self.dChar = {}
        for i in range(1, self.nChar+1):
            self.dChar[self.lArcVal[i]] = i
        self.dCharVal = { v: k  for k, v in self.dChar.items() }

    def _initJSON (self, oJSON):
        "initialize with a JSON text file"
        self.sByDic = ""  # init to prevent pylint whining
        self.__dict__.update(oJSON)
        self.byDic = binascii.unhexlify(self.sByDic)
        self.dCharVal = { v: k  for k, v in self.dChar.items() }
        self.a2grams = set(getattr(self, 'l2grams'))  if hasattr(self, 'l2grams')  else None

    def getInfo (self):
        "return string about the IBDAWG"
        return  "  Language: {0.sLangName}   Lang code: {0.sLangCode}   Dictionary name: {0.sDicName}" \
                "  Compression method: {0.nCompressionMethod:>2}   Date: {0.sDate}   Stemming: {0.cStemming}FX\n" \
                "  Arcs values:  {0.nArcVal:>10,} = {0.nChar:>5,} characters,  {0.nAff:>6,} affixes,  {0.nTag:>6,} tags\n" \
                "  Dictionary: {0.nEntry:>12,} entries,    {0.nNode:>11,} nodes,   {0.nArc:>11,} arcs\n" \
                "  Address size: {0.nBytesNodeAddress:>1} bytes,  Arc size: {0.nBytesArc:>1} bytes\n".format(self)







|
|
|
|
|
|
|
|
|
|
|
|
|
|







 







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
...
166
167
168
169
170
171
172
















































173
174
175
176
177
178
179


class IBDAWG:
    """INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH"""

    def __init__ (self, source):
        if isinstance(source, str):
            by = pkgutil.get_data(__package__, "_dictionaries/" + source)
            if not by:
                raise OSError("# Error. File not found or not loadable: "+source)
            self.sFileName = source
            oData = json.loads(by.decode("utf-8"))     #json.loads(by)    # In Python 3.6, can read directly binary strings
        else:
            self.sFileName = "[None]"
            oData = source

        self.sByDic = ""  # init to prevent pylint whining
        self.__dict__.update(oData)
        self.byDic = binascii.unhexlify(self.sByDic)
        self.dCharVal = { v: k  for k, v in self.dChar.items() }
        self.a2grams = set(getattr(self, 'l2grams'))  if hasattr(self, 'l2grams')  else None

        # Performance trick:
        #     Instead of converting bytes to integers each times we parse the binary dictionary,
        #     we do it once, then parse the array
        nAcc = 0
        byBuffer = b""
        self.lByDic = []
................................................................................
        # lexicographer module ?
        self.lexicographer = None
        try:
            self.lexicographer = importlib.import_module(".lexgraph_"+self.sLangCode, "grammalecte.graphspell")
        except ImportError:
            print("# No module <graphspell.lexgraph_"+self.sLangCode+".py>")

















































    def getInfo (self):
        "return string about the IBDAWG"
        return  "  Language: {0.sLangName}   Lang code: {0.sLangCode}   Dictionary name: {0.sDicName}" \
                "  Compression method: {0.nCompressionMethod:>2}   Date: {0.sDate}   Stemming: {0.cStemming}FX\n" \
                "  Arcs values:  {0.nArcVal:>10,} = {0.nChar:>5,} characters,  {0.nAff:>6,} affixes,  {0.nTag:>6,} tags\n" \
                "  Dictionary: {0.nEntry:>12,} entries,    {0.nNode:>11,} nodes,   {0.nArc:>11,} arcs\n" \
                "  Address size: {0.nBytesNodeAddress:>1} bytes,  Arc size: {0.nBytesArc:>1} bytes\n".format(self)