Grammalecte  Check-in [b038740434]

Overview
Comment:[core] dawg: attempt to speed up the dictionary lookup by reordering arcs (pointless ATM)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | core
Files: files | file ages | folders
SHA3-256: b03874043469e152d3aefca8b7936b5aa50ddb6bf545d9c5bfd648547afaa47c
User & Date: olr on 2017-06-29 13:20:59
Other Links: manifest | tags
Context
2017-06-29
19:40
[core] code cleaning check-in: fe361b4e8d user: olr tags: core, trunk
13:20
[core] dawg: attempt to speed up the dictionary lookup by reordering arcs (pointless ATM) check-in: b038740434 user: olr tags: core, trunk
04:16
[core] ibdawg: add next node when drawing path of a word check-in: 94f5da9f4a user: olr tags: core, trunk
Changes

Modified gc_core/py/dawg.py from [ddd6fe1cc6] to [d8c1a249fb].

    12     12   import sys
    13     13   import os
    14     14   import collections
    15     15   
    16     16   from . import str_transform as st
    17     17   from .progressbar import ProgressBar
    18     18   
           19  +
    19     20   
    20     21   def readFile (spf):
    21     22       print(" < Read lexicon: " + spf)
    22     23       if os.path.isfile(spf):
    23     24           with open(spf, "r", encoding="utf-8") as hSrc:
    24     25               for sLine in hSrc:
    25     26                   sLine = sLine.strip()
................................................................................
   101    102           lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {}
   102    103           lAff  = [];   dAff  = {}; nAff  = 0; dAffOccur = {}
   103    104           lTag  = [];   dTag  = {}; nTag  = 0; dTagOccur = {}
   104    105           nErr = 0
   105    106           
   106    107           # read lexicon
   107    108           for sFlex, sStem, sTag in getElemsFromFile(spfSrc):
          109  +            addWordToCharDict(sFlex)
   108    110               # chars
   109    111               for c in sFlex:
   110    112                   if c not in dChar:
   111    113                       dChar[c] = nChar
   112    114                       lChar.append(c)
   113    115                       nChar += 1
   114    116                   dCharOccur[c] = dCharOccur.get(c, 0) + 1
................................................................................
   177    179               oProgBar.increment(1)
   178    180           oProgBar.done()
   179    181           self.finish()
   180    182           self.countNodes()
   181    183           self.countArcs()
   182    184           self.sortNodes()
   183    185           self.sortNodeArcs(dValOccur)
          186  +        #self.sortNodeArcs2 (self.root, "")
   184    187           self.displayInfo()
   185    188   
   186    189       # BUILD DAWG
   187    190       def insert (self, word):
   188    191           if word < self.previousWord:
   189    192               sys.exit("# Error: Words must be inserted in alphabetical order.")
   190    193           
................................................................................
   243    246       
   244    247       def sortNodeArcs (self, dValOccur):
   245    248           print(" > Sort node arcs")
   246    249           self.root.sortArcs(dValOccur)
   247    250           for oNode in self.minimizedNodes:
   248    251               oNode.sortArcs(dValOccur)
   249    252       
          253  +    def sortNodeArcs2 (self, oNode, cPrevious=""):
          254  +        # recursive function
          255  +        dCharOccur = getCharOrderForPreviousChar(cPrevious)
          256  +        if dCharOccur:
          257  +            oNode.sortArcs2(dCharOccur, self.lArcVal)
          258  +        for nArcVal, oNextNode in oNode.arcs.items():
          259  +            self.sortNodeArcs2(oNextNode, self.lArcVal[nArcVal])
          260  +
   250    261       def sortNodes (self):
   251    262           print(" > Sort nodes")
   252    263           for oNode in self.root.arcs.values():
   253    264               self._parseNodes(oNode)
   254    265       
   255    266       def _parseNodes (self, oNode):
   256    267           # Warning: recursive method
................................................................................
   525    536   
   526    537       def __eq__ (self, other):
   527    538           # Used as a key in a python dictionary.
   528    539           # Nodes are equivalent if they have identical arcs, and each identical arc leads to identical states.
   529    540           return self.__str__() == other.__str__()
   530    541   
   531    542       def sortArcs (self, dValOccur):
   532         -        self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur[t[0]], reverse=True))
          543  +        self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(t[0], 0), reverse=True))
          544  +
          545  +    def sortArcs2 (self, dValOccur, lArcVal):
          546  +        self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(lArcVal[t[0]], 0), reverse=True))
   533    547   
   534    548       # VERSION 1 =====================================================================================================
   535    549       def convToBytes1 (self, nBytesArc, nBytesNodeAddress):
   536    550           """
   537    551           Node scheme:
   538    552           - Arc length is defined by nBytesArc
   539    553           - Address length is defined by nBytesNodeAddress
................................................................................
   728    742                   val = val | nFinalArcMask
   729    743               if 1 < (self.arcs[arc].addr - self.addr) < nMaxOffset and self.i != 0:
   730    744                   val = val | nNextNodeMask
   731    745                   s += "  {:<20}  {:0>16}  i{:_>10}   +{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr - self.addr)
   732    746               else:
   733    747                   s += "  {:<20}  {:0>16}  i{:_>10}   #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr)
   734    748           return s
          749  +
          750  +
          751  +
          752  +# Another attempt to sort node arcs
          753  +
          754  +_dCharOrder = {
          755  +    # key: previous char, value: dictionary of chars {c: nValue}
          756  +    "": {}
          757  +}
          758  +
          759  +
          760  +def addWordToCharDict (sWord):
          761  +    cPrevious = ""
          762  +    for cChar in sWord:
          763  +        if cPrevious not in _dCharOrder:
          764  +            _dCharOrder[cPrevious] = {}
          765  +        _dCharOrder[cPrevious][cChar] = _dCharOrder[cPrevious].get(cChar, 0) + 1
          766  +        cPrevious = cChar
          767  +
          768  +
          769  +def getCharOrderForPreviousChar (cChar):
          770  +    return _dCharOrder.get(cChar, None)
          771  +
          772  +
          773  +def displayCharOrder ():
          774  +    for key, value in _dCharOrder.items():
          775  +        print("[" + key + "]: ", ", ".join([ c+":"+str(n)  for c, n  in  sorted(value.items(), key=lambda t: t[1], reverse=True) ]))