Grammalecte  Check-in [b038740434]

Overview
Comment:[core] dawg: attempt to speed up the dictionary lookup by reordering arcs (pointless ATM)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | core
Files: files | file ages | folders
SHA3-256: b03874043469e152d3aefca8b7936b5aa50ddb6bf545d9c5bfd648547afaa47c
User & Date: olr on 2017-06-29 13:20:59
Other Links: manifest | tags
Context
2017-06-29
19:40
[core] code cleaning check-in: fe361b4e8d user: olr tags: core, trunk
13:20
[core] dawg: attempt to speed up the dictionary lookup by reordering arcs (pointless ATM) check-in: b038740434 user: olr tags: core, trunk
04:16
[core] ibdawg: add next node when drawing path of a word check-in: 94f5da9f4a user: olr tags: core, trunk
Changes

Modified gc_core/py/dawg.py from [ddd6fe1cc6] to [d8c1a249fb].

12
13
14
15
16
17
18

19
20
21
22
23
24
25
...
101
102
103
104
105
106
107

108
109
110
111
112
113
114
...
177
178
179
180
181
182
183

184
185
186
187
188
189
190
...
243
244
245
246
247
248
249








250
251
252
253
254
255
256
...
525
526
527
528
529
530
531
532



533
534
535
536
537
538
539
...
728
729
730
731
732
733
734



























import sys
import os
import collections

from . import str_transform as st
from .progressbar import ProgressBar



def readFile (spf):
    print(" < Read lexicon: " + spf)
    if os.path.isfile(spf):
        with open(spf, "r", encoding="utf-8") as hSrc:
            for sLine in hSrc:
                sLine = sLine.strip()
................................................................................
        lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {}
        lAff  = [];   dAff  = {}; nAff  = 0; dAffOccur = {}
        lTag  = [];   dTag  = {}; nTag  = 0; dTagOccur = {}
        nErr = 0
        
        # read lexicon
        for sFlex, sStem, sTag in getElemsFromFile(spfSrc):

            # chars
            for c in sFlex:
                if c not in dChar:
                    dChar[c] = nChar
                    lChar.append(c)
                    nChar += 1
                dCharOccur[c] = dCharOccur.get(c, 0) + 1
................................................................................
            oProgBar.increment(1)
        oProgBar.done()
        self.finish()
        self.countNodes()
        self.countArcs()
        self.sortNodes()
        self.sortNodeArcs(dValOccur)

        self.displayInfo()

    # BUILD DAWG
    def insert (self, word):
        if word < self.previousWord:
            sys.exit("# Error: Words must be inserted in alphabetical order.")
        
................................................................................
    
    def sortNodeArcs (self, dValOccur):
        print(" > Sort node arcs")
        self.root.sortArcs(dValOccur)
        for oNode in self.minimizedNodes:
            oNode.sortArcs(dValOccur)
    








    def sortNodes (self):
        print(" > Sort nodes")
        for oNode in self.root.arcs.values():
            self._parseNodes(oNode)
    
    def _parseNodes (self, oNode):
        # Warning: recursive method
................................................................................

    def __eq__ (self, other):
        # Used as a key in a python dictionary.
        # Nodes are equivalent if they have identical arcs, and each identical arc leads to identical states.
        return self.__str__() == other.__str__()

    def sortArcs (self, dValOccur):
        self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur[t[0]], reverse=True))




    # VERSION 1 =====================================================================================================
    def convToBytes1 (self, nBytesArc, nBytesNodeAddress):
        """
        Node scheme:
        - Arc length is defined by nBytesArc
        - Address length is defined by nBytesNodeAddress
................................................................................
                val = val | nFinalArcMask
            if 1 < (self.arcs[arc].addr - self.addr) < nMaxOffset and self.i != 0:
                val = val | nNextNodeMask
                s += "  {:<20}  {:0>16}  i{:_>10}   +{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr - self.addr)
            else:
                s += "  {:<20}  {:0>16}  i{:_>10}   #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr)
        return s


































>







 







>







 







>







 







>
>
>
>
>
>
>
>







 







|
>
>
>







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
...
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
...
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
...
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
...
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
...
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
import sys
import os
import collections

from . import str_transform as st
from .progressbar import ProgressBar



def readFile (spf):
    print(" < Read lexicon: " + spf)
    if os.path.isfile(spf):
        with open(spf, "r", encoding="utf-8") as hSrc:
            for sLine in hSrc:
                sLine = sLine.strip()
................................................................................
        lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {}
        lAff  = [];   dAff  = {}; nAff  = 0; dAffOccur = {}
        lTag  = [];   dTag  = {}; nTag  = 0; dTagOccur = {}
        nErr = 0
        
        # read lexicon
        for sFlex, sStem, sTag in getElemsFromFile(spfSrc):
            addWordToCharDict(sFlex)
            # chars
            for c in sFlex:
                if c not in dChar:
                    dChar[c] = nChar
                    lChar.append(c)
                    nChar += 1
                dCharOccur[c] = dCharOccur.get(c, 0) + 1
................................................................................
            oProgBar.increment(1)
        oProgBar.done()
        self.finish()
        self.countNodes()
        self.countArcs()
        self.sortNodes()
        self.sortNodeArcs(dValOccur)
        #self.sortNodeArcs2 (self.root, "")
        self.displayInfo()

    # BUILD DAWG
    def insert (self, word):
        if word < self.previousWord:
            sys.exit("# Error: Words must be inserted in alphabetical order.")
        
................................................................................
    
    def sortNodeArcs (self, dValOccur):
        print(" > Sort node arcs")
        self.root.sortArcs(dValOccur)
        for oNode in self.minimizedNodes:
            oNode.sortArcs(dValOccur)
    
    def sortNodeArcs2 (self, oNode, cPrevious=""):
        # recursive function
        dCharOccur = getCharOrderForPreviousChar(cPrevious)
        if dCharOccur:
            oNode.sortArcs2(dCharOccur, self.lArcVal)
        for nArcVal, oNextNode in oNode.arcs.items():
            self.sortNodeArcs2(oNextNode, self.lArcVal[nArcVal])

    def sortNodes (self):
        print(" > Sort nodes")
        for oNode in self.root.arcs.values():
            self._parseNodes(oNode)
    
    def _parseNodes (self, oNode):
        # Warning: recursive method
................................................................................

    def __eq__ (self, other):
        # Used as a key in a python dictionary.
        # Nodes are equivalent if they have identical arcs, and each identical arc leads to identical states.
        return self.__str__() == other.__str__()

    def sortArcs (self, dValOccur):
        self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(t[0], 0), reverse=True))

    def sortArcs2 (self, dValOccur, lArcVal):
        self.arcs = collections.OrderedDict(sorted(self.arcs.items(), key=lambda t: dValOccur.get(lArcVal[t[0]], 0), reverse=True))

    # VERSION 1 =====================================================================================================
    def convToBytes1 (self, nBytesArc, nBytesNodeAddress):
        """
        Node scheme:
        - Arc length is defined by nBytesArc
        - Address length is defined by nBytesNodeAddress
................................................................................
                val = val | nFinalArcMask
            if 1 < (self.arcs[arc].addr - self.addr) < nMaxOffset and self.i != 0:
                val = val | nNextNodeMask
                s += "  {:<20}  {:0>16}  i{:_>10}   +{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr - self.addr)
            else:
                s += "  {:<20}  {:0>16}  i{:_>10}   #{:_>10}\n".format(lVal[arc], bin(val)[2:], self.arcs[arc].i, self.arcs[arc].addr)
        return s



# Another attempt to sort node arcs

_dCharOrder = {
    # key: previous char, value: dictionary of chars {c: nValue}
    "": {}
}


def addWordToCharDict (sWord):
    cPrevious = ""
    for cChar in sWord:
        if cPrevious not in _dCharOrder:
            _dCharOrder[cPrevious] = {}
        _dCharOrder[cPrevious][cChar] = _dCharOrder[cPrevious].get(cChar, 0) + 1
        cPrevious = cChar


def getCharOrderForPreviousChar (cChar):
    return _dCharOrder.get(cChar, None)


def displayCharOrder ():
    for key, value in _dCharOrder.items():
        print("[" + key + "]: ", ", ".join([ c+":"+str(n)  for c, n  in  sorted(value.items(), key=lambda t: t[1], reverse=True) ]))