Grammalecte  Check-in [69affb5433]

Overview
Comment:[build][fr] include lemmas of words that are also verbal forms
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | fr | build
Files: files | file ages | folders
SHA3-256: 69affb5433f7262264ef8fb1b67470638826884c1311ec51cf0ed03f3a579ceb
User & Date: olr on 2021-02-15 13:46:17
Other Links: manifest | tags
Context
2021-02-15
16:32
[core][fr] fix text formatter check-in: f069a117e4 user: olr tags: core, fr, trunk
13:46
[build][fr] include lemmas of words that are also verbal forms check-in: 69affb5433 user: olr tags: build, fr, trunk
13:32
[fr] +1 test check-in: 160407dd67 user: olr tags: fr, trunk
Changes

Modified gc_lang/fr/build_data.py from [94a1ff7b31] to [5e658c4d18].

88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
...
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140

141

142
143
144
145
146
147
148
149
150
151
152
153
...
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
...
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387

    # read lexicon
    nStop = 0
    for n, sLine in enumerate(readFile(sp+"/data/dictConj.txt")):
        nTab = sLine.count("\t")
        if nTab == 1:
            # new entry
            sLemma, sVinfo = sLine.split("\t")
            dConj = {   ":P": { ":P": "" },
                        ":Q": { ":m:s": "", ":f:s": "", ":m:p": "", ":f:p": "" },
                        ":Ip": { ":1s": "", ":2s": "", ":3s": "", ":1p": "", ":2p": "", ":3p": "", ":1ś": "" },
                        ":Iq": { ":1s": "", ":2s": "", ":3s": "", ":1p": "", ":2p": "", ":3p": "" },
                        ":Is": { ":1s": "", ":2s": "", ":3s": "", ":1p": "", ":2p": "", ":3p": "" },
                        ":If": { ":1s": "", ":2s": "", ":3s": "", ":1p": "", ":2p": "", ":3p": "" },
                        ":K":  { ":1s": "", ":2s": "", ":3s": "", ":1p": "", ":2p": "", ":3p": "" },
................................................................................
                        ":E":  { ":2s": "", ":1p": "", ":2p": "" }
                    }
            if sVinfo not in lVinfo:
                dVinfo[sVinfo] = nVinfo
                lVinfo.append(sVinfo)
                nVinfo += 1
            # looking for names derivating from verb
            for sMorph in oDict.getMorph(sLemma):
                if ":N" in sMorph:
                    dVerbNames[sLemma] = { sLemma }
                    break
        elif nTab == 2:
            # flexion
            _, sTag, sFlex = sLine.split("\t")
            if sTag.count(" ") == 0:
                if sTag == "ppre":
                    dConj[":P"][":P"] = defineSuffixCode(sLemma, sFlex)
            else:
                try:
                    mode, g = sTag.split(maxsplit=1)
                    mode = dTrad[mode]
                    g = dTrad[g]
                    if dConj[mode][g] == "":
                        dConj[mode][g] = defineSuffixCode(sLemma, sFlex)
                    else:
                        # comment gérer les autres graphies ?
                        pass
                except:
                    echo(sLemma, " - ", sTag, " - non géré: ", mode, " / ", g)
            # looking for names derivating from verb
            for sMorph in oDict.getMorph(sFlex):
                if ":N" in sMorph:
                    if sLemma not in dVerbNames:
                        dVerbNames[sLemma] = { sFlex }
                    else:
                        dVerbNames[sLemma].add(sFlex)

                    break

        elif sLine == "$":
            # we store the dictionary of rules for this lemma
            if dConj[":Ip"][":1ś"] == "2è":
                dConj[":Ip"][":1ś"] = "2é"
            elif sLemma == "pouvoir":
                dConj[":Ip"][":1ś"] = "6uis"
            lConjTags = []
            for sTense in [":P", ":Q", ":Ip", ":Iq", ":Is", ":If", ":K", ":Sp", ":Sq", ":E"]:
                bFound = False
                for i, d in enumerate(dPatternList[sTense]):
                    if dConj[sTense] == d:
                        bFound = True
................................................................................
                    lConjTags.append(len(dPatternList[sTense]))
                    dPatternList[sTense].append(dConj[sTense])
            tConjTags = tuple(lConjTags)
            if tConjTags not in lTags:
                dTags[tConjTags] = nTags
                lTags.append(tConjTags)
                nTags += 1
            dVerb[sLemma] = (dVinfo[sVinfo], dTags[tConjTags])
        else:
            print("# Error - unknown line", n)

    for sLemma, aNames in dVerbNames.items():
        dVerbNames[sLemma] = tuple(aNames)  # convert set to tuple

    ## write file for Python
    sCode = "## generated data (do not edit)\n\n" + \
            "# Informations about verbs\n" + \
            "lVtyp = " + str(lVinfo) + "\n\n" + \
            "# indexes of tenses in _dPatternConj\n" + \
            "lTags = " + str(lTags) + "\n\n" + \
................................................................................
        ## write file for JavaScript
        sCode = "{\n" + \
                '    "dWord": ' + json.dumps(dWord, ensure_ascii=False) + ",\n" + \
                '    "lSet": ' + json.dumps(lSet, ensure_ascii=False) + ",\n" + \
                '    "dMorph": ' + json.dumps(dMorph, ensure_ascii=False) + "\n}"
        open(sp+"/modules-js/phonet_data.json", "w", encoding="utf-8", newline="\n").write(sCode)


def makeLocutions (sp, bJS=False):
    """Compile the lists of locutions into a nested token graph.

    Every line read from the locution files under <sp>/data/ is split on a
    single tab into a locution and its tag. The locution is tokenized and
    each token becomes one level of nested dictionaries; the tag is stored
    under the key "_:_" at the node reached by the last token.

    The graph is written as Python source to <sp>/modules/locutions_data.py
    and, when bJS is true, as JSON to <sp>/modules-js/locutions_data.json.

    A line that does not contain exactly one tab makes the unpacking of
    split() raise ValueError.
    """
    print("> Locutions ", end="")
    print("(Python et JavaScript)"  if bJS  else "(Python seulement)")
    dLocGraph = {}
    oTokenizer = tkz.Tokenizer("fr")
    for sLine in itertools.chain(readFile(sp+"/data/locutions_adverbiales.txt"),
                                 readFile(sp+"/data/locutions_prépositives.txt"),
                                 readFile(sp+"/data/locutions_conjonctives.txt"),
                                 readFile(sp+"/data/locutions_pronominales.txt"),
                                 readFile(sp+"/data/locutions_adjectivales.txt"),
                                 readFile(sp+"/data/locutions_interjectives.txt"),
                                 readFile(sp+"/data/locutions_nominales.txt"),
                                 readFile(sp+"/data/locutions_verbales.txt")):
        # walk down from the root of the graph, creating missing nodes
        dCur = dLocGraph
        sLoc, sTag = sLine.split("\t")
        for oToken in oTokenizer.genTokens(sLoc.strip()):
            dCur = dCur.setdefault(oToken["sValue"], {})
        # end of the locution: attach its tag to the final node
        dCur["_:_"] = sTag

    sCode = "# generated data (do not edit)\n\n" + \
            "dLocutions = " + str(dLocGraph) + "\n"
    # context managers guarantee the files are flushed and closed, even on error
    with open(sp+"/modules/locutions_data.py", "w", encoding="utf-8", newline="\n") as hDst:
        hDst.write(sCode)
    if bJS:
        with open(sp+"/modules-js/locutions_data.json", "w", encoding="utf-8", newline="\n") as hDst:
            hDst.write(json.dumps(dLocGraph, ensure_ascii=False))


def before (spLaunch, dVars, bJS=False):
    """Print a banner, then build the Hunspell dictionaries.

    Calls makeDictionaries() with spLaunch and dVars['oxt_version'].
    bJS is accepted but not used here.
    NOTE(review): presumably invoked by the build system before the main
    compilation step -- confirm against the caller.
    """
    print("========== Build Hunspell dictionaries ==========")
    makeDictionaries(spLaunch, dVars['oxt_version'])


def after (spLaunch, dVars, bJS=False):
    """Print a banner, then generate the French data files.

    Runs makeMfsp(), makeConj() and makePhonetTable() in that order, each
    with spLaunch and bJS. dVars is accepted but not used here.
    NOTE(review): presumably invoked by the build system after the main
    compilation step -- confirm against the caller.
    """
    print("========== Build French data ==========")
    makeMfsp(spLaunch, bJS)
    makeConj(spLaunch, bJS)
    makePhonetTable(spLaunch, bJS)
    # compilation of locutions is currently disabled
    #makeLocutions(spLaunch, bJS)







|







 







|

|






|






|




|



|
|
<
|
>
|
>




|







 







|



|
|







 







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<











<
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
...
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138

139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
...
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
...
340
341
342
343
344
345
346






























347
348
349
350
351
352
353
354
355
356
357


    # read lexicon
    nStop = 0
    for n, sLine in enumerate(readFile(sp+"/data/dictConj.txt")):
        nTab = sLine.count("\t")
        if nTab == 1:
            # new entry
            sInfi, sVinfo = sLine.split("\t")
            dConj = {   ":P": { ":P": "" },
                        ":Q": { ":m:s": "", ":f:s": "", ":m:p": "", ":f:p": "" },
                        ":Ip": { ":1s": "", ":2s": "", ":3s": "", ":1p": "", ":2p": "", ":3p": "", ":1ś": "" },
                        ":Iq": { ":1s": "", ":2s": "", ":3s": "", ":1p": "", ":2p": "", ":3p": "" },
                        ":Is": { ":1s": "", ":2s": "", ":3s": "", ":1p": "", ":2p": "", ":3p": "" },
                        ":If": { ":1s": "", ":2s": "", ":3s": "", ":1p": "", ":2p": "", ":3p": "" },
                        ":K":  { ":1s": "", ":2s": "", ":3s": "", ":1p": "", ":2p": "", ":3p": "" },
................................................................................
                        ":E":  { ":2s": "", ":1p": "", ":2p": "" }
                    }
            if sVinfo not in lVinfo:
                dVinfo[sVinfo] = nVinfo
                lVinfo.append(sVinfo)
                nVinfo += 1
            # looking for names derivating from verb
            for sMorph in oDict.getMorph(sInfi):
                if ":N" in sMorph:
                    dVerbNames[sInfi] = { sInfi }
                    break
        elif nTab == 2:
            # flexion
            _, sTag, sFlex = sLine.split("\t")
            if sTag.count(" ") == 0:
                if sTag == "ppre":
                    dConj[":P"][":P"] = defineSuffixCode(sInfi, sFlex)
            else:
                try:
                    mode, g = sTag.split(maxsplit=1)
                    mode = dTrad[mode]
                    g = dTrad[g]
                    if dConj[mode][g] == "":
                        dConj[mode][g] = defineSuffixCode(sInfi, sFlex)
                    else:
                        # comment gérer les autres graphies ?
                        pass
                except:
                    echo(sInfi, " - ", sTag, " - non géré: ", mode, " / ", g)
            # looking for names derivating from verb
            for sMorph in oDict.getMorph(sFlex):
                if ":N" in sMorph:
                    if sInfi not in dVerbNames:
                        dVerbNames[sInfi] = set()

                    dVerbNames[sInfi].add(sFlex)
                    sLemma = sMorph[1:sMorph.find("/")]
                    if sFlex != sLemma:
                        dVerbNames[sInfi].add(sLemma)
        elif sLine == "$":
            # we store the dictionary of rules for this lemma
            if dConj[":Ip"][":1ś"] == "2è":
                dConj[":Ip"][":1ś"] = "2é"
            elif sInfi == "pouvoir":
                dConj[":Ip"][":1ś"] = "6uis"
            lConjTags = []
            for sTense in [":P", ":Q", ":Ip", ":Iq", ":Is", ":If", ":K", ":Sp", ":Sq", ":E"]:
                bFound = False
                for i, d in enumerate(dPatternList[sTense]):
                    if dConj[sTense] == d:
                        bFound = True
................................................................................
                    lConjTags.append(len(dPatternList[sTense]))
                    dPatternList[sTense].append(dConj[sTense])
            tConjTags = tuple(lConjTags)
            if tConjTags not in lTags:
                dTags[tConjTags] = nTags
                lTags.append(tConjTags)
                nTags += 1
            dVerb[sInfi] = (dVinfo[sVinfo], dTags[tConjTags])
        else:
            print("# Error - unknown line", n)

    for sInfi, aNames in dVerbNames.items():
        dVerbNames[sInfi] = tuple(aNames)  # convert set to tuple

    ## write file for Python
    sCode = "## generated data (do not edit)\n\n" + \
            "# Informations about verbs\n" + \
            "lVtyp = " + str(lVinfo) + "\n\n" + \
            "# indexes of tenses in _dPatternConj\n" + \
            "lTags = " + str(lTags) + "\n\n" + \
................................................................................
        ## write file for JavaScript
        sCode = "{\n" + \
                '    "dWord": ' + json.dumps(dWord, ensure_ascii=False) + ",\n" + \
                '    "lSet": ' + json.dumps(lSet, ensure_ascii=False) + ",\n" + \
                '    "dMorph": ' + json.dumps(dMorph, ensure_ascii=False) + "\n}"
        open(sp+"/modules-js/phonet_data.json", "w", encoding="utf-8", newline="\n").write(sCode)
































def before (spLaunch, dVars, bJS=False):
    """Print a banner, then build the Hunspell dictionaries.

    Calls makeDictionaries() with spLaunch and dVars['oxt_version'].
    bJS is accepted but not used here.
    NOTE(review): presumably invoked by the build system before the main
    compilation step -- confirm against the caller.
    """
    print("========== Build Hunspell dictionaries ==========")
    makeDictionaries(spLaunch, dVars['oxt_version'])


def after (spLaunch, dVars, bJS=False):
    """Print a banner, then generate the French data files.

    Runs makeMfsp(), makeConj() and makePhonetTable() in that order, each
    with spLaunch and bJS. dVars is accepted but not used here.
    NOTE(review): presumably invoked by the build system after the main
    compilation step -- confirm against the caller.
    """
    print("========== Build French data ==========")
    makeMfsp(spLaunch, bJS)
    makeConj(spLaunch, bJS)
    makePhonetTable(spLaunch, bJS)

Modified gc_lang/fr/modules/conj_data.py from [17f0d4c069] to [8bdfafb3cd].

cannot compute difference between binary files

Modified gc_lang/fr/modules/phonet_data.py from [7fb8239e70] to [16ca1dac13].

cannot compute difference between binary files

Modified gc_lang/fr/rules.grx from [b7b290a56f] to [2b9fcab6e4].

8291
8292
8293
8294
8295
8296
8297
8298
8299
8300
8301
8302
8303
8304
8305


__conf_de_vconj__
    [de|d’]  @:[123][sp]¬:[GNA]
        <<- /conf/ not \2.istitle() and not \2.isupper() and not value(\2, "|jure|") and not tag(\2, "eg1mot")
        -2>> =suggSimil(\2, ":[NA]", True)+"|"+suggVerbInfi(\2)                                     && Incohérence avec « \1 » : “\2” est une forme verbale conjuguée.

TEST: il s’agit de {{mette}} en évidence.                                                           ->> mettre|mets|mise|mises|misses|missions
TEST: sa façon de {{nettoyez}} était inefficace.                                                    ->> nettoyer|nettoyant


    [de|d’]  [l’|leur]  @:[123][sp]¬:[GNAQ]
        <<- /conf/ not \3.istitle() and not \3.isupper() -3>> =suggSimil(\3, ":[NA].*:[si]", True)  && Incohérence avec « \1 \2 » : “\3” est une forme verbale conjuguée.

TEST: de l’{{arrivait}}                                                                             ->> arrivée|arrivant







|







8291
8292
8293
8294
8295
8296
8297
8298
8299
8300
8301
8302
8303
8304
8305


__conf_de_vconj__
    [de|d’]  @:[123][sp]¬:[GNA]
        <<- /conf/ not \2.istitle() and not \2.isupper() and not value(\2, "|jure|") and not tag(\2, "eg1mot")
        -2>> =suggSimil(\2, ":[NA]", True)+"|"+suggVerbInfi(\2)                                     && Incohérence avec « \1 » : “\2” est une forme verbale conjuguée.

TEST: il s’agit de {{mette}} en évidence.                                                           ->> mettre|mets|mise|mises|miss|misses|mission|missions
TEST: sa façon de {{nettoyez}} était inefficace.                                                    ->> nettoyer|nettoyant


    [de|d’]  [l’|leur]  @:[123][sp]¬:[GNAQ]
        <<- /conf/ not \3.istitle() and not \3.isupper() -3>> =suggSimil(\3, ":[NA].*:[si]", True)  && Incohérence avec « \1 \2 » : “\3” est une forme verbale conjuguée.

TEST: de l’{{arrivait}}                                                                             ->> arrivée|arrivant