Grammalecte  Check-in [c65e578338]

Overview
Comment:[graphspell][py] dawg: ability to build lexicon directly from a list of tuples
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | graphspell | multid
Files: files | file ages | folders
SHA3-256: c65e57833800132efb1e40769607685a14c8faba956fd46d25c8e902ead14408
User & Date: olr on 2018-02-27 18:07:44
Original Comment: [graphspell][py] ability to build lexicon directly from a list of tuples
Other Links: branch diff | manifest | tags
Context
2018-02-27
20:50
[graphspell][py] dawg: API modifications + add function to get dictionary as JSON check-in: 8a0391b163 user: olr tags: graphspell, multid
18:07
[graphspell][py] dawg: ability to build lexicon directly from a list of tuples check-in: c65e578338 user: olr tags: graphspell, multid
17:23
[lo] update: lexicon editor check-in: 1d5fe44fe8 user: olr tags: lo, multid
Changes

Modified graphspell/dawg.py from [96443fe4a2] to [059d031769].

    23     23   def readFile (spf):
    24     24       print(" < Read lexicon: " + spf)
    25     25       if os.path.isfile(spf):
    26     26           with open(spf, "r", encoding="utf-8") as hSrc:
    27     27               for sLine in hSrc:
    28     28                   sLine = sLine.strip()
    29     29                   if sLine and not sLine.startswith("#"):
    30         -                    yield sLine
           30  +                    yield sLine.split("\t")
    31     31       else:
    32     32           raise OSError("# Error. File not found or not loadable: " + spf)
    33     33   
    34     34   
    35     35   
    36     36   class DAWG:
    37     37       """DIRECT ACYCLIC WORD GRAPH"""
    38     38       # This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)
    39     39       # We store suffix/affix codes and tags within the graph after the “real” word.
    40     40       # A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags]
    41     41       # Each arc is an index into self.lArcVal, which stores characters, suffix/affix codes for stemming, and tags.
    42     42       # Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final.
    43     43   
    44         -    def __init__ (self, spfSrc, cStemming, sLangCode, sLangName="", sDicName=""):
           44  +    def __init__ (self, src, cStemming, sLangCode, sLangName="", sDicName=""):
    45     45           print("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====")
    46     46           cStemming = cStemming.upper()
    47     47           if cStemming == "A":
    48     48               funcStemmingGen = st.defineAffixCode
    49     49           elif cStemming == "S":
    50     50               funcStemmingGen = st.defineSuffixCode
    51     51           elif cStemming == "N":
................................................................................
    54     54               raise ValueError("# Error. Unknown stemming code: {}".format(cStemming))
    55     55   
    56     56           lEntry = []
    57     57           lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {}
    58     58           lAff  = [];   dAff  = {}; nAff  = 0; dAffOccur = {}
    59     59           lTag  = [];   dTag  = {}; nTag  = 0; dTagOccur = {}
    60     60           nErr = 0
    61         -        
           61  +
    62     62           # read lexicon
    63         -        for sLine in readFile(spfSrc):
    64         -            sFlex, sStem, sTag = sLine.split("\t")
           63  +        if type(src) is str:
           64  +            iterable = readFile(src)
           65  +        else:
           66  +            iterable = src
           67  +        for sFlex, sStem, sTag in iterable:
    65     68               addWordToCharDict(sFlex)
    66     69               # chars
    67     70               for c in sFlex:
    68     71                   if c not in dChar:
    69     72                       dChar[c] = nChar
    70     73                       lChar.append(c)
    71     74                       nChar += 1
................................................................................
    93     96           lWord = [ [dChar[c] for c in sFlex] + [iAff+nChar] + [iTag+nChar+nAff]  for sFlex, iAff, iTag in lEntry ]
    94     97           lEntry = None
    95     98           
    96     99           # Dictionary of arc value occurrences, used to sort the arcs of each node
    97    100           dValOccur = dict( [ (dChar[c], dCharOccur[c])  for c in dChar ] \
    98    101                           + [ (dAff[aff]+nChar, dAffOccur[aff]) for aff in dAff ] \
    99    102                           + [ (dTag[tag]+nChar+nAff, dTagOccur[tag]) for tag in dTag ] )
   100         -        #with open(spfSrc[:-8]+".valuesfreq.txt", 'w', encoding='utf-8') as hFreqDst:  # DEBUG
   101         -        #    for iKey, nOcc in sorted(dValOccur.items(), key=lambda t: t[1], reverse=True):
   102         -        #        hFreqDst.write("{}: {}\n".format(lVal[iKey], nOcc))
   103         -        #    hFreqDst.close()
   104    103           
   105         -        self.sFileName = spfSrc
          104  +        self.sFileName = src  if type(src) is str  else "[None]"
   106    105           self.sLangCode = sLangCode
   107    106           self.sLangName = sLangName
   108    107           self.sDicName = sDicName
   109    108           self.nEntry = len(lWord)
   110    109           self.aPreviousEntry = []
   111    110           DawgNode.resetNextId()
   112    111           self.oRoot = DawgNode()