Grammalecte  Check-in [fe361b4e8d]

Overview
Comment:[core] code cleaning
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | core
Files: files | file ages | folders
SHA3-256: fe361b4e8d6378fe8c08482252cb7d6c157a1c26b9ccdedf0bba76c5d9f95b41
User & Date: olr on 2017-06-29 19:40:12
Other Links: manifest | tags
Context
2017-06-30
08:10
[fr] faux positifs check-in: c39ac2a56a user: olr tags: fr, trunk
2017-06-29
19:40
[core] code cleaning check-in: fe361b4e8d user: olr tags: core, trunk
13:20
[core] dawg: attempt to speed up the dictionary lookup by reordering arcs (pointless ATM) check-in: b038740434 user: olr tags: core, trunk
Changes

Modified gc_core/py/dawg.py from [d8c1a249fb] to [eb4c8506bd].

   145    145           #    for iKey, nOcc in sorted(dValOccur.items(), key=lambda t: t[1], reverse=True):
   146    146           #        hFreqDst.write("{}: {}\n".format(lVal[iKey], nOcc))
   147    147           #    hFreqDst.close()
   148    148           
   149    149           self.sFile = spfSrc
   150    150           self.sLang = sLangName
   151    151           self.nEntry = len(lWord)
   152         -        self.previousWord = []
          152  +        self.aPreviousEntry = []
   153    153           DawgNode.resetNextId()
   154         -        self.root = DawgNode()
   155         -        self.uncheckedNodes = []  # list of nodes that have not been checked for duplication.
   156         -        self.minimizedNodes = {}  # list of unique nodes that have been checked for duplication.
   157         -        self.sortedNodes = []     # version 2 and 3
          154  +        self.oRoot = DawgNode()
          155  +        self.lUncheckedNodes = []  # list of nodes that have not been checked for duplication.
          156  +        self.lMinimizedNodes = {}  # list of unique nodes that have been checked for duplication.
          157  +        self.lSortedNodes = []     # version 2 and 3
   158    158           self.nNode = 0
   159    159           self.nArc = 0
   160    160           self.dChar = dChar
   161    161           self.nChar = len(dChar)
   162    162           self.nAff = nAff
   163    163           self.lArcVal = lVal
   164    164           self.nArcVal = len(lVal)
................................................................................
   170    170               self.funcStemming = st.changeWordWithSuffixCode
   171    171           else:
   172    172               self.funcStemming = st.noStemming
   173    173           
   174    174           # build
   175    175           lWord.sort()
   176    176           oProgBar = ProgressBar(0, len(lWord))
   177         -        for word in lWord:
   178         -            self.insert(word)
          177  +        for aEntry in lWord:
          178  +            self.insert(aEntry)
   179    179               oProgBar.increment(1)
   180    180           oProgBar.done()
   181    181           self.finish()
   182    182           self.countNodes()
   183    183           self.countArcs()
   184    184           self.sortNodes()
   185    185           self.sortNodeArcs(dValOccur)
   186         -        #self.sortNodeArcs2 (self.root, "")
          186  +        #self.sortNodeArcs2 (self.oRoot, "")
   187    187           self.displayInfo()
   188    188   
   189    189       # BUILD DAWG
   190         -    def insert (self, word):
   191         -        if word < self.previousWord:
          190  +    def insert (self, aEntry):
          191  +        if aEntry < self.aPreviousEntry:
   192    192               sys.exit("# Error: Words must be inserted in alphabetical order.")
   193    193           
   194    194           # find common prefix between word and previous word
   195         -        commonPrefix = 0
   196         -        for i in range(min(len(word), len(self.previousWord))):
   197         -            if word[i] != self.previousWord[i]:
          195  +        nCommonPrefix = 0
          196  +        for i in range(min(len(aEntry), len(self.aPreviousEntry))):
          197  +            if aEntry[i] != self.aPreviousEntry[i]:
   198    198                   break
   199         -            commonPrefix += 1
          199  +            nCommonPrefix += 1
   200    200   
   201         -        # Check the uncheckedNodes for redundant nodes, proceeding from last
          201  +        # Check the lUncheckedNodes for redundant nodes, proceeding from last
   202    202           # one down to the common prefix size. Then truncate the list at that point.
   203         -        self._minimize(commonPrefix)
          203  +        self._minimize(nCommonPrefix)
   204    204   
   205    205           # add the suffix, starting from the correct node mid-way through the graph
   206         -        if len(self.uncheckedNodes) == 0:
   207         -            oNode = self.root
          206  +        if len(self.lUncheckedNodes) == 0:
          207  +            oNode = self.oRoot
   208    208           else:
   209         -            oNode = self.uncheckedNodes[-1][2]
          209  +            oNode = self.lUncheckedNodes[-1][2]
   210    210   
   211         -        iChar = commonPrefix
   212         -        for c in word[commonPrefix:]:
          211  +        iChar = nCommonPrefix
          212  +        for c in aEntry[nCommonPrefix:]:
   213    213               oNextNode = DawgNode()
   214    214               oNode.arcs[c] = oNextNode
   215         -            self.uncheckedNodes.append((oNode, c, oNextNode))
   216         -            if iChar == (len(word) - 2): 
          215  +            self.lUncheckedNodes.append((oNode, c, oNextNode))
          216  +            if iChar == (len(aEntry) - 2): 
   217    217                   oNode.final = True
   218    218               iChar += 1
   219    219               oNode = oNextNode
   220    220           oNode.final = True
   221         -        self.previousWord = word
          221  +        self.aPreviousEntry = aEntry
   222    222   
   223    223       def finish (self):
   224    224           "minimize unchecked nodes"
   225    225           self._minimize(0)
   226    226   
   227    227       def _minimize (self, downTo):
   228    228           # proceed from the leaf up to a certain point
   229         -        for i in range( len(self.uncheckedNodes)-1, downTo-1, -1 ):
   230         -            (parent, char, child) = self.uncheckedNodes[i]
   231         -            if child in self.minimizedNodes:
          229  +        for i in range( len(self.lUncheckedNodes)-1, downTo-1, -1 ):
          230  +            oNode, char, oChildNode = self.lUncheckedNodes[i]
          231  +            if oChildNode in self.lMinimizedNodes:
   232    232                   # replace the child with the previously encountered one
   233         -                parent.arcs[char] = self.minimizedNodes[child]
          233  +                oNode.arcs[char] = self.lMinimizedNodes[oChildNode]
   234    234               else:
   235    235                   # add the state to the minimized nodes.
   236         -                self.minimizedNodes[child] = child
   237         -            self.uncheckedNodes.pop()
          236  +                self.lMinimizedNodes[oChildNode] = oChildNode
          237  +            self.lUncheckedNodes.pop()
   238    238   
   239    239       def countNodes (self):
   240         -        self.nNode = len(self.minimizedNodes)
          240  +        self.nNode = len(self.lMinimizedNodes)
   241    241   
   242    242       def countArcs (self):
   243    243           self.nArc = 0
   244         -        for node in self.minimizedNodes:
   245         -            self.nArc += len(node.arcs)
          244  +        for oNode in self.lMinimizedNodes:
          245  +            self.nArc += len(oNode.arcs)
   246    246       
   247    247       def sortNodeArcs (self, dValOccur):
   248    248           print(" > Sort node arcs")
   249         -        self.root.sortArcs(dValOccur)
   250         -        for oNode in self.minimizedNodes:
          249  +        self.oRoot.sortArcs(dValOccur)
          250  +        for oNode in self.lMinimizedNodes:
   251    251               oNode.sortArcs(dValOccur)
   252    252       
   253    253       def sortNodeArcs2 (self, oNode, cPrevious=""):
   254    254           # recursive function
   255         -        dCharOccur = getCharOrderForPreviousChar(cPrevious)
          255  +        dCharOccur = getCharOrderAfterChar(cPrevious)
   256    256           if dCharOccur:
   257    257               oNode.sortArcs2(dCharOccur, self.lArcVal)
   258    258           for nArcVal, oNextNode in oNode.arcs.items():
   259    259               self.sortNodeArcs2(oNextNode, self.lArcVal[nArcVal])
   260    260   
   261    261       def sortNodes (self):
   262    262           print(" > Sort nodes")
   263         -        for oNode in self.root.arcs.values():
          263  +        for oNode in self.oRoot.arcs.values():
   264    264               self._parseNodes(oNode)
   265    265       
   266    266       def _parseNodes (self, oNode):
   267    267           # Warning: recursive method
   268    268           if oNode.pos > 0:
   269    269               return
   270    270           oNode.setPos()
   271         -        self.sortedNodes.append(oNode)
          271  +        self.lSortedNodes.append(oNode)
   272    272           for oNextNode in oNode.arcs.values():
   273    273                self._parseNodes(oNextNode)
   274    274           
   275    275       def lookup (self, sWord):
   276         -        oNode = self.root
          276  +        oNode = self.oRoot
   277    277           for c in sWord:
   278    278               if self.dChar.get(c, '') not in oNode.arcs:
   279    279                   return False
   280    280               oNode = oNode.arcs[self.dChar[c]]
   281    281           return oNode.final
   282    282   
   283    283       def morph (self, sWord):
   284         -        oNode = self.root
          284  +        oNode = self.oRoot
   285    285           for c in sWord:
   286    286               if self.dChar.get(c, '') not in oNode.arcs:
   287    287                   return ''
   288    288               oNode = oNode.arcs[self.dChar[c]]
   289    289           if oNode.final:
   290    290               s = "* "
   291    291               for arc in oNode.arcs:
................................................................................
   306    306           print(" * {:<12} {:>16,}".format("Arc values:", self.nArcVal))
   307    307           print(" * {:<12} {:>16,}".format("Nodes:", self.nNode))
   308    308           print(" * {:<12} {:>16,}".format("Arcs:", self.nArc))
   309    309           print(" * {:<12} {:>16}".format("Stemming:", self.cStemming + "FX"))
   310    310   
   311    311       def getArcStats (self):
   312    312           d = {}
   313         -        for oNode in self.minimizedNodes:
          313  +        for oNode in self.lMinimizedNodes:
   314    314               n = len(oNode.arcs)
   315    315               d[n] = d.get(n, 0) + 1
   316    316           s = " * Nodes:\n"
   317    317           for n in d:
   318    318               s = s + " {:>9} nodes have {:>3} arcs\n".format(d[n], n)
   319    319           return s
   320    320   
................................................................................
   327    327                   hDst.write(" {:>6}. {}\n".format(i, s))
   328    328               hDst.close()
   329    329   
   330    330       # BINARY CONVERSION
   331    331       def createBinary (self, sPathFile, nMethod, bDebug=False):
   332    332           print(" > Write DAWG as an indexable binary dictionary [method: %d]" % nMethod)
   333    333           if nMethod == 1:
   334         -            self.nBytesArc = ( ( (self.nArcVal).bit_length() + 2 ) // 8 ) + 1   # We add 2 bits. See DawgNode.convToBytes1()
          334  +            self.nBytesArc = ( (self.nArcVal.bit_length() + 2) // 8 ) + 1   # We add 2 bits. See DawgNode.convToBytes1()
   335    335               self._calcNumBytesNodeAddress()
   336    336               self._calcNodesAddress1()
   337    337           elif nMethod == 2:
   338         -            self.nBytesArc = ( ( (self.nArcVal).bit_length() + 3 ) // 8 ) + 1   # We add 3 bits. See DawgNode.convToBytes2()
          338  +            self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1   # We add 3 bits. See DawgNode.convToBytes2()
   339    339               self._calcNumBytesNodeAddress()
   340    340               self._calcNodesAddress2()
   341    341           elif nMethod == 3:
   342         -            self.nBytesArc = ( ( (self.nArcVal).bit_length() + 3 ) // 8 ) + 1   # We add 3 bits. See DawgNode.convToBytes3()
          342  +            self.nBytesArc = ( (self.nArcVal.bit_length() + 3) // 8 ) + 1   # We add 3 bits. See DawgNode.convToBytes3()
   343    343               self.nBytesOffset = 1
   344    344               self.nMaxOffset = (2 ** (self.nBytesOffset * 8)) - 1
   345    345               self._calcNumBytesNodeAddress()
   346    346               self._calcNodesAddress3()
   347    347           else:
   348    348               print(" # Error: unknown compression method")
   349    349           print("   Arc values (chars, affixes and tags): {}  ->  {} bytes".format( self.nArcVal, len("\t".join(self.lArcVal).encode("utf-8")) ))
................................................................................
   358    358           "how many bytes needed to store all nodes/arcs in the binary dictionary"
   359    359           self.nBytesNodeAddress = 1
   360    360           while ((self.nBytesArc + self.nBytesNodeAddress) * self.nArc) > (2 ** (self.nBytesNodeAddress * 8)):
   361    361               self.nBytesNodeAddress += 1
   362    362   
   363    363       def _calcNodesAddress1 (self):
   364    364           nBytesNode = self.nBytesArc + self.nBytesNodeAddress
   365         -        iAddr = len(self.root.arcs) * nBytesNode
   366         -        for oNode in self.minimizedNodes:
          365  +        iAddr = len(self.oRoot.arcs) * nBytesNode
          366  +        for oNode in self.lMinimizedNodes:
   367    367               oNode.addr = iAddr
   368    368               iAddr += max(len(oNode.arcs), 1) * nBytesNode
   369    369   
   370    370       def _calcNodesAddress2 (self):
   371    371           nBytesNode = self.nBytesArc + self.nBytesNodeAddress
   372         -        iAddr = len(self.root.arcs) * nBytesNode
   373         -        for oNode in self.sortedNodes:
          372  +        iAddr = len(self.oRoot.arcs) * nBytesNode
          373  +        for oNode in self.lSortedNodes:
   374    374               oNode.addr = iAddr
   375    375               iAddr += max(len(oNode.arcs), 1) * nBytesNode
   376    376               for oNextNode in oNode.arcs.values():
   377    377                   if (oNode.pos + 1) == oNextNode.pos:
   378    378                       iAddr -= self.nBytesNodeAddress
   379    379                       #break
   380    380   
   381    381       def _calcNodesAddress3 (self):
   382    382           nBytesNode = self.nBytesArc + self.nBytesNodeAddress
   383    383           # theorical nodes size if only addresses and no offset
   384         -        self.root.size = len(self.root.arcs) * nBytesNode
   385         -        for oNode in self.sortedNodes:
          384  +        self.oRoot.size = len(self.oRoot.arcs) * nBytesNode
          385  +        for oNode in self.lSortedNodes:
   386    386               oNode.size = max(len(oNode.arcs), 1) * nBytesNode
   387    387           # rewind and calculate dropdown from the end, several times
   388    388           nDiff = self.nBytesNodeAddress - self.nBytesOffset
   389    389           bEnd = False
   390    390           while not bEnd:
   391    391               bEnd = True
   392    392               # recalculate addresses
   393         -            iAddr = self.root.size
   394         -            for oNode in self.sortedNodes:
          393  +            iAddr = self.oRoot.size
          394  +            for oNode in self.lSortedNodes:
   395    395                   oNode.addr = iAddr
   396    396                   iAddr += oNode.size
   397    397               # rewind and calculate dropdown from the end, several times
   398    398               for i in range(self.nNode-1, -1, -1):
   399         -                nSize = max(len(self.sortedNodes[i].arcs), 1) * nBytesNode
   400         -                for oNextNode in self.sortedNodes[i].arcs.values():
   401         -                    if 1 < (oNextNode.addr - self.sortedNodes[i].addr) < self.nMaxOffset:
          399  +                nSize = max(len(self.lSortedNodes[i].arcs), 1) * nBytesNode
          400  +                for oNextNode in self.lSortedNodes[i].arcs.values():
          401  +                    if 1 < (oNextNode.addr - self.lSortedNodes[i].addr) < self.nMaxOffset:
   402    402                           nSize -= nDiff
   403         -                if self.sortedNodes[i].size != nSize:
   404         -                    self.sortedNodes[i].size = nSize
          403  +                if self.lSortedNodes[i].size != nSize:
          404  +                    self.lSortedNodes[i].size = nSize
   405    405                       bEnd = False
   406    406   
   407    407       def _writeBinary (self, sPathFile, nMethod):
   408    408           """
   409    409           Format of the binary indexable dictionary:
   410    410           Each section is separated with 4 bytes of \0
   411    411           
................................................................................
   446    446                                                              self.nEntry, self.nNode, self.nArc, self.nAff, self.cStemming).encode("utf-8"))
   447    447               hDst.write(b"\0\0\0\0")
   448    448               # lArcVal
   449    449               hDst.write("\t".join(self.lArcVal).encode("utf-8"))
   450    450               hDst.write(b"\0\0\0\0")
   451    451               # DAWG: nodes / arcs
   452    452               if nMethod == 1:
   453         -                hDst.write(self.root.convToBytes1(self.nBytesArc, self.nBytesNodeAddress))
   454         -                for oNode in self.minimizedNodes:
          453  +                hDst.write(self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress))
          454  +                for oNode in self.lMinimizedNodes:
   455    455                       hDst.write(oNode.convToBytes1(self.nBytesArc, self.nBytesNodeAddress))
   456    456               elif nMethod == 2:
   457         -                hDst.write(self.root.convToBytes2(self.nBytesArc, self.nBytesNodeAddress))
   458         -                for oNode in self.sortedNodes:
          457  +                hDst.write(self.oRoot.convToBytes2(self.nBytesArc, self.nBytesNodeAddress))
          458  +                for oNode in self.lSortedNodes:
   459    459                       hDst.write(oNode.convToBytes2(self.nBytesArc, self.nBytesNodeAddress))
   460    460               elif nMethod == 3:
   461         -                hDst.write(self.root.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset))
   462         -                for oNode in self.sortedNodes:
          461  +                hDst.write(self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset))
          462  +                for oNode in self.lSortedNodes:
   463    463                       hDst.write(oNode.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset))
   464    464               hDst.close()
   465    465   
   466    466       def _writeNodes (self, sPathFile, nMethod):
   467    467           "for debugging only"
   468    468           print(" > Write nodes")
   469    469           with open(sPathFile+".nodes."+str(nMethod)+".txt", 'w', encoding='utf-8', newline="\n") as hDst:
   470    470               if nMethod == 1:
   471         -                hDst.write(self.root.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n")
   472         -                #hDst.write( ''.join( [ "%02X " %  z  for z in self.root.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) ] ).strip() )
   473         -                for oNode in self.minimizedNodes:
          471  +                hDst.write(self.oRoot.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n")
          472  +                #hDst.write( ''.join( [ "%02X " %  z  for z in self.oRoot.convToBytes1(self.nBytesArc, self.nBytesNodeAddress) ] ).strip() )
          473  +                for oNode in self.lMinimizedNodes:
   474    474                       hDst.write(oNode.getTxtRepr1(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n")
   475    475               if nMethod == 2:
   476         -                hDst.write(self.root.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n")
   477         -                for oNode in self.sortedNodes:
          476  +                hDst.write(self.oRoot.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n")
          477  +                for oNode in self.lSortedNodes:
   478    478                       hDst.write(oNode.getTxtRepr2(self.nBytesArc, self.nBytesNodeAddress, self.lArcVal)+"\n")
   479    479               if nMethod == 3:
   480         -                hDst.write(self.root.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n")
   481         -                #hDst.write( ''.join( [ "%02X " %  z  for z in self.root.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) ] ).strip() )
   482         -                for oNode in self.sortedNodes:
          480  +                hDst.write(self.oRoot.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n")
          481  +                #hDst.write( ''.join( [ "%02X " %  z  for z in self.oRoot.convToBytes3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset) ] ).strip() )
          482  +                for oNode in self.lSortedNodes:
   483    483                       hDst.write(oNode.getTxtRepr3(self.nBytesArc, self.nBytesNodeAddress, self.nBytesOffset, self.lArcVal)+"\n")
   484    484               hDst.close()
   485    485       
   486    486       def writeResults (self, sPathFile):
   487    487           bFileExits = os.path.isfile("_lexicons.res.txt")
   488    488           with open("_lexicons.res.txt", "a", encoding='utf-8', newline="\n") as hDst:
   489    489               sFormat1 = "{:<12} {:>12} {:>5} {:>8} {:>8} {:>6} {:>8} {:>9} {:>9} {:>15} {:>12} {:>12}\n"
................................................................................
   762    762       for cChar in sWord:
   763    763           if cPrevious not in _dCharOrder:
   764    764               _dCharOrder[cPrevious] = {}
   765    765           _dCharOrder[cPrevious][cChar] = _dCharOrder[cPrevious].get(cChar, 0) + 1
   766    766           cPrevious = cChar
   767    767   
   768    768   
   769         -def getCharOrderForPreviousChar (cChar):
          769  +def getCharOrderAfterChar (cChar):
   770    770       return _dCharOrder.get(cChar, None)
   771    771   
   772    772   
   773    773   def displayCharOrder ():
   774    774       for key, value in _dCharOrder.items():
   775    775           print("[" + key + "]: ", ", ".join([ c+":"+str(n)  for c, n  in  sorted(value.items(), key=lambda t: t[1], reverse=True) ]))