Grammalecte  Check-in [e5f3698eb4]

Overview
Comment:[core] dawg: compressed lexicon
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | build | new_feature
Files: files | file ages | folders
SHA3-256: e5f3698eb421d38fb5422cc0f98de270c6476f13ce949d9b0768009bbc530fe2
User & Date: olr on 2017-06-23 17:11:10
Other Links: manifest | tags
Context
2017-06-23
17:25
[core] str_transform: change functions names check-in: 766f20e23c user: olr tags: core, trunk
17:11
[core] dawg: compressed lexicon check-in: e5f3698eb4 user: olr tags: build, new_feature, trunk
14:43
[core] dawg: accept personal lexicon check-in: 3916c538b5 user: olr tags: core, new_feature, trunk
Changes

Modified gc_core/py/dawg.py from [a30caaeab0] to [7e6ed7295c].

    14     14   import collections
    15     15   
    16     16   from . import str_transform as st
    17     17   from .progressbar import ProgressBar
    18     18   
    19     19   
    20     20   def readFile (spf):
           21  +    print("Read lexicon: " + spf)
    21     22       if os.path.isfile(spf):
    22     23           with open(spf, "r", encoding="utf-8") as hSrc:
    23     24               for sLine in hSrc:
    24     25                   sLine = sLine.strip()
    25     26                   if sLine and not sLine.startswith("#"):
    26     27                       yield sLine
    27     28       else:
    28     29           raise OSError("# Error. File not found or not loadable: " + spf)
    29     30   
    30     31   
    31         -def getElemsFromFile (spf, bCompressedDic=False):
           32  +def getElemsFromFile (spf):
           33  +    "returns tuple of (flexion, stem, tags) from lexicon file"
    32     34       nErr = 0
    33         -    if not bCompressedDic:
           35  +    if not spf.endswith(".clex"):
    34     36           for sLine in readFile(spf):
    35     37               try:
    36     38                   sFlex, sStem, sTag = sLine.split("\t")
    37     39                   yield (sFlex, sStem, sTag)
    38     40               except:
    39     41                   nErr += 1
    40     42       else:
    41         -        sTag = ":_" # neutral tag
           43  +        sTag = "_" # neutral tag
           44  +        sTag2 = ""
    42     45           for sLine in readFile(spf):
    43     46               if sLine.startswith("[") and sLine.endswith("]"):
    44         -                sTag = sLine[1:-1]
    45         -                continue
           47  +                # tag line
           48  +                if "-->" in sLine:
           49  +                    try:
           50  +                        sTag, sSfxCode, sTag2 = sLine[1:-1].split(" --> ")
           51  +                    except:
           52  +                        nErr += 1
           53  +                        continue
           54  +                    sTag = sTag.strip()
           55  +                    sSfxCode = sSfxCode.strip()
           56  +                    sTag2 = sTag2.strip()
           57  +                else:
           58  +                    sTag = sLine[1:-1]
           59  +                    sTag2 = ""
    46     60               else:
           61  +                # entry line
    47     62                   if "\t" in sLine:
    48     63                       if sLine.count("\t") > 1:
    49     64                           nErr += 1
    50     65                           continue
    51     66                       sFlex, sStem = sLine.split("\t")
    52     67                   else:
    53     68                       sFlex = sStem = sLine
           69  +                #print(sFlex, sStem, sTag)
    54     70                   yield (sFlex, sStem, sTag)
           71  +                if sTag2:
           72  +                    sFlex2 = st.getStemFromSuffixCode(sFlex, sSfxCode)
           73  +                    #print(sFlex2, sStem, sTag2)
           74  +                    yield (sFlex2, sStem, sTag2)
    55     75       if nErr:
    56     76           print(" # Lines ignored: {:>10}".format(nErr))
    57     77   
    58     78   
    59     79   
    60     80   class DAWG:
    61     81       """DIRECTED ACYCLIC WORD GRAPH"""
    62     82       # This code is inspired by Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)
    63     83       # We store suffix/affix codes and tags within the graph after the “real” word.
    64     84       # A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags]
    65     85       # Each arc is an index in self.lArcVal, which stores the characters, the suffix/affix codes for stemming, and the tags.
    66     86       # Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final.
    67     87   
    68         -    def __init__ (self, spfSrc, sLangName, cStemming, bCompressedDic=False):
           88  +    def __init__ (self, spfSrc, sLangName, cStemming):
    69     89           print("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====")
    70     90           cStemming = cStemming.upper()
    71     91           if cStemming == "A":
    72     92               funcStemmingGen = st.defineAffixCode
    73     93           elif cStemming == "S":
    74     94               funcStemmingGen = st.defineSuffixCode
    75     95           elif cStemming == "N":
................................................................................
    80    100           lEntry = []
    81    101           lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {}
    82    102           lAff  = [];   dAff  = {}; nAff  = 0; dAffOccur = {}
    83    103           lTag  = [];   dTag  = {}; nTag  = 0; dTagOccur = {}
    84    104           nErr = 0
    85    105           
    86    106           # read lexicon
    87         -        for sFlex, sStem, sTag in getElemsFromFile(spfSrc, bCompressedDic):
          107  +        for sFlex, sStem, sTag in getElemsFromFile(spfSrc):
    88    108               # chars
    89    109               for c in sFlex:
    90    110                   if c not in dChar:
    91    111                       dChar[c] = nChar
    92    112                       lChar.append(c)
    93    113                       nChar += 1
    94    114                   dCharOccur[c] = dCharOccur.get(c, 0) + 1