Grammalecte  Check-in [ae767aaff5]

Overview
Comment:[graphspell][py] ibdawg optimization: precalculate bytes in binary dictionary
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | graphspell | bdic_opt
Files: files | file ages | folders
SHA3-256: ae767aaff504b1b6ff830c62014977875f6db31a8bed7f656ee18f61e0915ecb
User & Date: olr on 2020-09-11 17:22:31
Other Links: branch diff | manifest | tags
Context
2020-09-11
19:20
merge trunk check-in: 43afb8b856 user: olr tags: bdic_opt
17:22
[graphspell][py] ibdawg optimization: precalculate bytes in binary dictionary check-in: ae767aaff5 user: olr tags: bdic_opt, graphspell
15:53
[graphspell][js] ibdawg optimization: precalculate bytes in binary dictionary check-in: 443f28094b user: olr tags: bdic_opt, graphspell
Changes

Modified gc_lang/fr/perf_memo.text from [6a0d81df00] to [ad156793c1].

    26     26   0.6.2       2018.02.19 19:06    5.51302     1.29359     0.874157    0.260415    0.271596    0.290641    0.684754    0.376905    0.0815201   0.00919633  (spelling normalization)
    27     27   1.0         2018.11.23 10:59    2.88577     0.702486    0.485648    0.139897    0.14079     0.148125    0.348751    0.201061    0.0360297   0.0043535   (x2, with new GC engine)
    28     28   1.1         2019.05.16 09:42    1.50743     0.360923    0.261113    0.0749272   0.0763827   0.0771537   0.180504    0.102942    0.0182762   0.0021925   (×2, but new processor: AMD Ryzen 7 2700X)
    29     29   1.2.1       2019.08.06 20:57    1.42886     0.358425    0.247356    0.0704405   0.0754886   0.0765604   0.177197    0.0988517   0.0188103   0.0020243
    30     30   1.6.0       2020.01.03 20:22    1.38847     0.346214    0.240242    0.0709539   0.0737499   0.0748733   0.176477    0.0969171   0.0187857   0.0025143   (nouveau dictionnaire avec lemmes masculins)
    31     31   1.9.0       2020.04.20 19:57    1.51183     0.369546    0.25681     0.0734314   0.0764396   0.0785668   0.183922    0.103674    0.0185812   0.002099    (NFC normalization)
    32     32   1.9.2       2020.05.12 08:43    1.62465     0.398831    0.273012    0.0810811   0.080937    0.0845885   0.204133    0.114146    0.0212864   0.0029547
    33         -1.12.2      2020.09.09 13:34    1.50568     0.374504    0.233108    0.0798712   0.0804466   0.0769674   0.171519    0.0945132   0.0165344   0.0019474   
    34         -1.12.2      2020.09.09 13:35    1.41094     0.359093    0.236443    0.06968     0.0734418   0.0738087   0.169371    0.0946279   0.0167106   0.0019773   
           33  +1.12.2      2020.09.09 13:34    1.50568     0.374504    0.233108    0.0798712   0.0804466   0.0769674   0.171519    0.0945132   0.0165344   0.0019474
           34  +1.12.2      2020.09.09 13:35    1.41094     0.359093    0.236443    0.06968     0.0734418   0.0738087   0.169371    0.0946279   0.0167106   0.0019773
           35  +1.12.2      2020.09.11 19:16    1.35297     0.330545    0.221731    0.0666998   0.0692539   0.0701707   0.160564    0.0891676   0.015807    0.0045998

Modified graphspell/ibdawg.py from [d16ed0d683] to [0fe5cbd03f].

   116    116               else:
   117    117                   raise OSError("# Error. Unknown file type: "+source)
   118    118           else:
   119    119               self._initJSON(source)
   120    120   
   121    121           self.sFileName = source  if isinstance(source, str)  else "[None]"
   122    122   
          123  +        # Performance trick:
          124  +        #     Instead of converting bytes to integers each times we parse the binary dictionary,
          125  +        #     we do it once, then parse the array
          126  +        nAcc = 0
          127  +        byBuffer = b""
          128  +        lTemp = []
          129  +        nDivisor = (self.nBytesArc + self.nBytesNodeAddress) / 2
          130  +        for i in range(0, len(self.byDic)):
          131  +            byBuffer += self.byDic[i:i+1]
          132  +            if nAcc == (self.nBytesArc - 1):
          133  +                lTemp.append(int.from_bytes(byBuffer, byteorder="big"))
          134  +                byBuffer = b""
          135  +            elif nAcc == (self.nBytesArc + self.nBytesNodeAddress - 1):
          136  +                lTemp.append(round(int.from_bytes(byBuffer, byteorder="big") / nDivisor))
          137  +                byBuffer = b""
          138  +                nAcc = -1
          139  +            nAcc = nAcc + 1
          140  +        self.byDic = lTemp;
          141  +
          142  +        # masks
   123    143           self._arcMask = (2 ** ((self.nBytesArc * 8) - 3)) - 1
   124    144           self._finalNodeMask = 1 << ((self.nBytesArc * 8) - 1)
   125    145           self._lastArcMask = 1 << ((self.nBytesArc * 8) - 2)
   126    146           self._addrBitMask = 1 << ((self.nBytesArc * 8) - 3)  # version 2
   127    147   
   128    148           # function to decode the affix/suffix code
   129    149           if self.cStemming == "S":
................................................................................
   296    316           iAddr = 0
   297    317           for c in sWord:
   298    318               if c not in self.dChar:
   299    319                   return False
   300    320               iAddr = self._lookupArcNode(self.dChar[c], iAddr)
   301    321               if iAddr is None:
   302    322                   return False
   303         -        return bool(int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask)
          323  +        return bool(self.byDic[iAddr] & self._finalNodeMask)
   304    324   
   305    325       def getMorph (self, sWord):
   306    326           "retrieves morphologies list, different casing allowed"
   307    327           if not sWord:
   308    328               return []
   309    329           sWord = st.spellingNormalization(sWord)
   310    330           l = self.morph(sWord)
................................................................................
   352    372                   sWord1, sWord2 = sWord.split(cSplitter, 1)
   353    373                   if self.isValid(sWord1) and self.isValid(sWord2):
   354    374                       oSuggResult.addSugg(sWord1+" "+sWord2)
   355    375   
   356    376       def _suggest (self, oSuggResult, sRemain, nMaxSwitch=0, nMaxDel=0, nMaxHardRepl=0, nMaxJump=0, nDist=0, nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False):
   357    377           # recursive function
   358    378           #logging.info((nDeep * "  ") + sNewWord + ":" + sRemain)
   359         -        if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
          379  +        if self.byDic[iAddr] & self._finalNodeMask:
   360    380               if not sRemain:
   361    381                   oSuggResult.addSugg(sNewWord, nDeep)
   362    382                   for sTail in self._getTails(iAddr):
   363    383                       oSuggResult.addSugg(sNewWord+sTail, nDeep)
   364    384                   return
   365    385               if (len(sNewWord) + len(sRemain) == len(oSuggResult.sWord)) and oSuggResult.sWord.lower().startswith(sNewWord.lower()) and self.isValid(sRemain):
   366    386                   if self.sLangCode == "fr" and sNewWord.lower() in ("l", "d", "n", "m", "t", "s", "c", "j", "qu", "lorsqu", "puisqu", "quoiqu", "jusqu", "quelqu") and sRemain[0:1] in cp.aVowel:
................................................................................
   419    439                   yield (self.dCharVal[nVal], jAddr)
   420    440   
   421    441       def _getTails (self, iAddr, sTail="", n=2):
   422    442           "return a list of suffixes ending at a distance of <n> from <iAddr>"
   423    443           aTails = set()
   424    444           for nVal, jAddr in self._getArcs(iAddr):
   425    445               if nVal <= self.nChar:
   426         -                if int.from_bytes(self.byDic[jAddr:jAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
          446  +                if self.byDic[jAddr] & self._finalNodeMask:
   427    447                       aTails.add(sTail + self.dCharVal[nVal])
   428    448                   if n and not aTails:
   429    449                       aTails.update(self._getTails(jAddr, sTail+self.dCharVal[nVal], n-1))
   430    450           return aTails
   431    451   
   432    452       def drawPath (self, sWord, iAddr=0):
   433    453           "show the path taken by <sWord> in the graph"
................................................................................
   495    515           iAddr = 0
   496    516           for c in sWord:
   497    517               if c not in self.dChar:
   498    518                   return []
   499    519               iAddr = self._lookupArcNode(self.dChar[c], iAddr)
   500    520               if iAddr is None:
   501    521                   return []
   502         -        if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
          522  +        if self.byDic[iAddr] & self._finalNodeMask:
   503    523               l = []
   504    524               nRawArc = 0
   505    525               while not nRawArc & self._lastArcMask:
   506         -                iEndArcAddr = iAddr + self.nBytesArc
   507         -                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
          526  +                iEndArcAddr = iAddr + 1
          527  +                nRawArc = self.byDic[iAddr]
   508    528                   nArc = nRawArc & self._arcMask
   509    529                   if nArc > self.nChar:
   510    530                       # This value is not a char, this is a stemming code
   511    531                       sStem = ">" + self.funcStemming(sWord, self.lArcVal[nArc])
   512    532                       # Now, we go to the next node and retrieve all following arc values, all of them are tags
   513         -                    iAddr2 = int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
          533  +                    iAddr2 = self.byDic[iEndArcAddr]
   514    534                       nRawArc2 = 0
   515    535                       while not nRawArc2 & self._lastArcMask:
   516         -                        iEndArcAddr2 = iAddr2 + self.nBytesArc
   517         -                        nRawArc2 = int.from_bytes(self.byDic[iAddr2:iEndArcAddr2], byteorder='big')
          536  +                        iEndArcAddr2 = iAddr2 + 1
          537  +                        nRawArc2 = self.byDic[iAddr2]
   518    538                           l.append(sStem + "/" + self.lArcVal[nRawArc2 & self._arcMask])
   519         -                        iAddr2 = iEndArcAddr2+self.nBytesNodeAddress
   520         -                iAddr = iEndArcAddr+self.nBytesNodeAddress
          539  +                        iAddr2 = iEndArcAddr2 + 1
          540  +                iAddr = iEndArcAddr + 1
   521    541               return l
   522    542           return []
   523    543   
   524    544       def _stem1 (self, sWord):
   525    545           "returns stems list of <sWord>"
   526    546           iAddr = 0
   527    547           for c in sWord:
   528    548               if c not in self.dChar:
   529    549                   return []
   530    550               iAddr = self._lookupArcNode(self.dChar[c], iAddr)
   531    551               if iAddr is None:
   532    552                   return []
   533         -        if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
          553  +        if self.byDic[iAddr] & self._finalNodeMask:
   534    554               l = []
   535    555               nRawArc = 0
   536    556               while not nRawArc & self._lastArcMask:
   537         -                iEndArcAddr = iAddr + self.nBytesArc
   538         -                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
          557  +                iEndArcAddr = iAddr + 1
          558  +                nRawArc = self.byDic[iAddr]
   539    559                   nArc = nRawArc & self._arcMask
   540    560                   if nArc > self.nChar:
   541    561                       # This value is not a char, this is a stemming code
   542    562                       l.append(self.funcStemming(sWord, self.lArcVal[nArc]))
   543         -                iAddr = iEndArcAddr+self.nBytesNodeAddress
          563  +                iAddr = iEndArcAddr + 1
   544    564               return l
   545    565           return []
   546    566   
   547    567       def _lookupArcNode1 (self, nVal, iAddr):
   548    568           "looks if <nVal> is an arc at the node at <iAddr>, if yes, returns address of next node else None"
   549    569           while True:
   550         -            iEndArcAddr = iAddr+self.nBytesArc
   551         -            nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
          570  +            iEndArcAddr = iAddr + 1
          571  +            nRawArc = self.byDic[iAddr]
   552    572               if nVal == (nRawArc & self._arcMask):
   553    573                   # the value we are looking for
   554    574                   # we return the address of the next node
   555         -                return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
          575  +                return self.byDic[iEndArcAddr]
   556    576               # value not found
   557    577               if nRawArc & self._lastArcMask:
   558    578                   return None
   559         -            iAddr = iEndArcAddr+self.nBytesNodeAddress
          579  +            iAddr = iEndArcAddr + 1
   560    580   
   561    581       def _getArcs1 (self, iAddr):
   562    582           "generator: return all arcs at <iAddr> as tuples of (nVal, iAddr)"
   563    583           while True:
   564         -            iEndArcAddr = iAddr+self.nBytesArc
   565         -            nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
   566         -            yield nRawArc & self._arcMask, int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
          584  +            iEndArcAddr = iAddr + 1
          585  +            nRawArc = self.byDic[iAddr]
          586  +            yield nRawArc & self._arcMask, self.byDic[iEndArcAddr]
   567    587               if nRawArc & self._lastArcMask:
   568    588                   break
   569         -            iAddr = iEndArcAddr+self.nBytesNodeAddress
          589  +            iAddr = iEndArcAddr + 1
   570    590   
   571    591       def _writeNodes1 (self, spfDest):
   572    592           "for debugging only"
   573    593           print(" > Write binary nodes")
   574    594           with open(spfDest, 'w', 'utf-8', newline="\n") as hDst:
   575    595               iAddr = 0
   576    596               hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr))
   577    597               while iAddr < len(self.byDic):
   578         -                iEndArcAddr = iAddr+self.nBytesArc
   579         -                nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
          598  +                iEndArcAddr = iAddr + 1
          599  +                nRawArc = self.byDic[iAddr]
   580    600                   nArc = nRawArc & self._arcMask
   581         -                hDst.write("  {:<20}  {:0>16}  i{:>10}   #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", \
   582         -                                                                            int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], \
   583         -                                                                                           byteorder='big')))
   584         -                iAddr = iEndArcAddr+self.nBytesNodeAddress
          601  +                hDst.write("  {:<20}  {:0>16}  i{:>10}   #{:_>10}\n".format(self.lArcVal[nArc], bin(nRawArc)[2:], "?", self.byDic[iEndArcAddr]))
          602  +                iAddr = iEndArcAddr + 1
   585    603                   if (nRawArc & self._lastArcMask) and iAddr < len(self.byDic):
   586    604                       hDst.write("\ni{:_>10} -- #{:_>10}\n".format("?", iAddr))
   587    605               hDst.close()
   588    606   
   589    607       # VERSION 2
   590    608       def _morph2 (self, sWord):
   591    609           "returns morphologies of <sWord>"