Grammalecte Check-in [3916c538b5]

Overview
Comment: [core] dawg: accept personal lexicon
SHA3-256: 3916c538b58d03f2b51613bc833cdae99a4e314baa760e9df06d5009a8468214
User & Date: olr on 2017-06-23 14:43:25
Context
2017-06-23
17:11  [core] dawg: compressed lexicon (check-in: e5f3698eb4, user: olr, tags: build, new_feature, trunk)
14:43  [core] dawg: accept personal lexicon (check-in: 3916c538b5, user: olr, tags: core, new_feature, trunk)
13:19  [build] lex_build.py: main() + options (check-in: e091821b50, user: olr, tags: build, trunk)
Changes

Modified gc_core/py/dawg.py from [a9c487538f] to [a30caaeab0].

Old version (file lines 12-91 and 111-125):

import sys
import os
import collections

from . import str_transform as st
from .progressbar import ProgressBar


class DAWG:
    """DIRECT ACYCLIC WORD GRAPH"""
    # This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)
    # We store suffix/affix codes and tags within the graph after the “real” word.
    # A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags]
    # Each arc is an index in self.lArcVal, where are stored characters, suffix/affix codes for stemming and tags.
    # Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final.

    def __init__ (self, spfSrc, sLangName, cStemming):
        print("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====")
        cStemming = cStemming.upper()
        if cStemming == "A":
            funcStemmingGen = st.defineAffixCode
        elif cStemming == "S":
            funcStemmingGen = st.defineSuffixCode
        elif cStemming == "N":
            funcStemmingGen = st.noStemming
        else:
            print("# Error code: {}".format(cStemming))
            exit()

        lEntry = []
        lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {}
        lAff  = [];   dAff  = {}; nAff  = 0; dAffOccur = {}
        lTag  = [];   dTag  = {}; nTag  = 0; dTagOccur = {}
        nErr = 0
        
        # read lexicon
        with open(spfSrc, 'r', encoding='utf-8') as hSrc:
            print(" > Reading lexicon: " + spfSrc + " ...")
            for line in hSrc:
                line = line.strip()
                if not (line.startswith('#') or line == ''):
                    try:
                        flex, stem, tag = line.split("\t")
                    except:
                        nErr += 1
                        continue

                    # chars
                    for c in flex:
                        if c not in dChar:
                            dChar[c] = nChar
                            lChar.append(c)
                            nChar += 1
                        dCharOccur[c] = dCharOccur.get(c, 0) + 1
                    # affixes to find stem from flexion
                    aff = funcStemmingGen(flex, stem)
                    if aff not in dAff:
                        dAff[aff] = nAff
                        lAff.append(aff)
                        nAff += 1
                    dAffOccur[aff] = dAffOccur.get(aff, 0) + 1
                    # tags
                    if tag not in dTag:
                        dTag[tag] = nTag
                        lTag.append(tag)
                        nTag += 1
                    dTagOccur[tag] = dTagOccur.get(tag, 0) + 1
                    lEntry.append((flex, dAff[aff], dTag[tag]))
            hSrc.close()
        if nErr:
            print(" # Lines ignored: {:>10}".format(nErr))
        if not(lEntry):
            print(" # Empty lexicon")
            exit()
        
        # Preparing DAWG
        print(" > Preparing list of words")
        lVal = lChar + lAff + lTag
        lWord = [ [dChar[c] for c in sFlex] + [iAff+nChar] + [iTag+nChar+nAff]  for sFlex, iAff, iTag in lEntry ]
        lEntry = None
        
........ (lines 92-110 unchanged) ........
        self.nArc = 0
        self.dChar = dChar
        self.nChar = len(dChar)
        self.nAff = nAff
        self.lArcVal = lVal
        self.nArcVal = len(lVal)
        self.nTag = self.nArcVal - self.nChar - nAff
        self.cStemming = cStemming.upper()
        if cStemming == "A":
            self.funcStemming = st.getStemFromAffixCode
        elif cStemming == "S":    
            self.funcStemming = st.getStemFromSuffixCode
        else:
            self.funcStemming = st.noStemming


New version (file lines 12-117 and 137-151):

import sys
import os
import collections

from . import str_transform as st
from .progressbar import ProgressBar


def readFile (spf):
    if os.path.isfile(spf):
        with open(spf, "r", encoding="utf-8") as hSrc:
            for sLine in hSrc:
                sLine = sLine.strip()
                if sLine and not sLine.startswith("#"):
                    yield sLine
    else:
        raise OSError("# Error. File not found or not loadable: " + spf)


def getElemsFromFile (spf, bCompressedDic=False):
    nErr = 0
    if not bCompressedDic:
        for sLine in readFile(spf):
            try:
                sFlex, sStem, sTag = sLine.split("\t")
                yield (sFlex, sStem, sTag)
            except:
                nErr += 1
    else:
        sTag = ":_" # neutral tag
        for sLine in readFile(spf):
            if sLine.startswith("[") and sLine.endswith("]"):
                sTag = sLine[1:-1]
                continue
            else:
                if "\t" in sLine:
                    if sLine.count("\t") > 1:
                        nErr += 1
                        continue
                    sFlex, sStem = sLine.split("\t")
                else:
                    sFlex = sStem = sLine
                yield (sFlex, sStem, sTag)
    if nErr:
        print(" # Lines ignored: {:>10}".format(nErr))



class DAWG:
    """DIRECT ACYCLIC WORD GRAPH"""
    # This code is inspired from Steve Hanov’s DAWG, 2011. (http://stevehanov.ca/blog/index.php?id=115)
    # We store suffix/affix codes and tags within the graph after the “real” word.
    # A word is a list of numbers [ c1, c2, c3 . . . cN, iAffix, iTags]
    # Each arc is an index in self.lArcVal, where are stored characters, suffix/affix codes for stemming and tags.
    # Important: As usual, the last node (after ‘iTags’) is tagged final, AND the node after ‘cN’ is ALSO tagged final.
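    # Illustrative example (made-up values, not from the check-in): if the flexion
    # "chats" has stem "chat", its stored word is
    #   [ dChar["c"], dChar["h"], dChar["a"], dChar["t"], dChar["s"], iAff+nChar, iTag+nChar+nAff ]
    # where iAff indexes the affix/suffix code in lAff and iTag indexes the tag in lTag.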

    def __init__ (self, spfSrc, sLangName, cStemming, bCompressedDic=False):
        print("===== Direct Acyclic Word Graph - Minimal Acyclic Finite State Automaton =====")
        cStemming = cStemming.upper()
        if cStemming == "A":
            funcStemmingGen = st.defineAffixCode
        elif cStemming == "S":
            funcStemmingGen = st.defineSuffixCode
        elif cStemming == "N":
            funcStemmingGen = st.noStemming
        else:
            raise ValueError("# Error. Unknown stemming code: {}".format(cStemming))


        lEntry = []
        lChar = ['']; dChar = {}; nChar = 1; dCharOccur = {}
        lAff  = [];   dAff  = {}; nAff  = 0; dAffOccur = {}
        lTag  = [];   dTag  = {}; nTag  = 0; dTagOccur = {}
        nErr = 0
        
        # read lexicon
        for sFlex, sStem, sTag in getElemsFromFile(spfSrc, bCompressedDic):
            # chars
            for c in sFlex:
                if c not in dChar:
                    dChar[c] = nChar
                    lChar.append(c)
                    nChar += 1
                dCharOccur[c] = dCharOccur.get(c, 0) + 1
            # affixes to find stem from flexion
            aff = funcStemmingGen(sFlex, sStem)
            if aff not in dAff:
                dAff[aff] = nAff
                lAff.append(aff)
                nAff += 1
            dAffOccur[aff] = dAffOccur.get(aff, 0) + 1
            # tags
            if sTag not in dTag:
                dTag[sTag] = nTag
                lTag.append(sTag)
                nTag += 1
            dTagOccur[sTag] = dTagOccur.get(sTag, 0) + 1
            lEntry.append((sFlex, dAff[aff], dTag[sTag]))

        if not lEntry:
            raise ValueError("# Error. Empty lexicon")

        
        # Preparing DAWG
        print(" > Preparing list of words")
        lVal = lChar + lAff + lTag
        lWord = [ [dChar[c] for c in sFlex] + [iAff+nChar] + [iTag+nChar+nAff]  for sFlex, iAff, iTag in lEntry ]
        lEntry = None
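        # Note (added comment): lVal concatenates the three value lists, in order:
        # lChar (characters, with "" at index 0), then lAff (affix/suffix codes),
        # then lTag (tags); hence the iAff+nChar and iTag+nChar+nAff offsets above.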
        
........ (lines 118-136 unchanged) ........
        self.nArc = 0
        self.dChar = dChar
        self.nChar = len(dChar)
        self.nAff = nAff
        self.lArcVal = lVal
        self.nArcVal = len(lVal)
        self.nTag = self.nArcVal - self.nChar - nAff
        self.cStemming = cStemming
        if cStemming == "A":
            self.funcStemming = st.getStemFromAffixCode
        elif cStemming == "S":    
            self.funcStemming = st.getStemFromSuffixCode
        else:
            self.funcStemming = st.noStemming
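
A minimal usage sketch of the new option (not from the check-in; the file name and language label are placeholders), assuming a personal lexicon in the "compressed" layout described above:

# hypothetical call: build the graph from a personal lexicon, using
# suffix-code stemming ("S") and the compressed/personal-lexicon reader
oDAWG = DAWG("my_personal_lexicon.txt", "French", "S", bCompressedDic=True)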