Grammalecte  Check-in [f82c3ce70e]

Overview
Comment:[build][fr] phonet simil: merge sets if a word belongs to several sets
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | fr | build
Files: files | file ages | folders
SHA3-256: f82c3ce70e793ad3e001a615db95560f4e4e1a9df5f4894ba9612cc2ba79c110
User & Date: olr on 2020-04-29 17:16:40
Other Links: manifest | tags
Context
2020-04-29
19:05
[build][fr] build_data.py: fix build for JS check-in: 6bc8dab4c2 user: olr tags: build, fr, trunk
17:16
[build][fr] phonet simil: merge sets if a word belongs to several sets check-in: f82c3ce70e user: olr tags: build, fr, trunk
17:08
[fr] phonet_simil.txt update check-in: 582bf42669 user: olr tags: fr, trunk
Changes

Modified gc_lang/fr/build_data.py from [c910fde1c7] to [ce4f084f4e].

12
13
14
15
16
17
18


19
20
21
22
23
24
25
...
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304



305
306
307



308





309
310



311
312
313
314
315
316
317
318
319
320
321
322
323
324
...
363
364
365
366
367
368
369
370
371
372
import platform

import graphspell.ibdawg as ibdawg
from graphspell.echo import echo
from graphspell.str_transform import defineSuffixCode
import graphspell.tokenizer as tkz




oDict = None


class cd:
    """Context manager for changing the current working directory"""
    def __init__ (self, newPath):
................................................................................
        open(sp+"/modules-js/mfsp_data.json", "w", encoding="utf-8", newline="\n").write(sCode)


def makePhonetTable (sp, bJS=False):
    print("> Correspondances phonétiques ", end="")
    print("(Python et JavaScript)"  if bJS  else "(Python seulement)")

    import gc_lang.fr.modules.conj as conj

    loadDictionary()

    # set of homophonic words
    lSet = []
    for sLine in readFile(sp+"/data/phonet_simil.txt"):
        lWord = sLine.split()
        aMore = set()
        for sWord in lWord:
            if sWord.endswith("er") and conj.isVerb(sWord):
                aMore = aMore.union(conj.getConjSimilInfiV1(sWord))
        lWord.extend(list(aMore))
        lSet.append(sorted(set(lWord)))

    # dictionary of words
    dWord = {}



    for i, aSet in enumerate(lSet):
        for sWord in aSet:
            if oDict.lookup(sWord):



                dWord[sWord] = i  # warning, what if word in several sets?





            else:
                echo("Mot inconnu : " + sWord)



    # dictionary of morphologies
    dMorph = {}
    for sWord in dWord:
        dMorph[sWord] = oDict.getMorph(sWord)

    # write file for Python
    sCode = "# generated data (do not edit)\n\n" + \
            "dWord = " + str(dWord) + "\n\n" + \
            "lSet = " + str(lSet) + "\n\n" + \
            "dMorph = " + str(dMorph) + "\n"
    open(sp+"/modules/phonet_data.py", "w", encoding="utf-8", newline="\n").write(sCode)

    if bJS:
        ## write file for JavaScript
................................................................................
    print("========== Build Hunspell dictionaries ==========")
    makeDictionaries(spLaunch, dVars['oxt_version'])


def after (spLaunch, dVars, bJS=False):
    """Build the French data files by running each generator in turn.

    Prints a banner, then calls makeMfsp, makeConj, makePhonetTable and
    makeLocutions, in that order.

    spLaunch -- forwarded unchanged to every generator; makePhonetTable uses it
                as the base path for the files it reads and writes
    dVars    -- not used in this function
                (NOTE(review): presumably kept for a common hook signature -- confirm)
    bJS      -- forwarded unchanged to every generator; in makePhonetTable it
                selects whether JavaScript data is written as well as Python
    """
    print("========== Build French data ==========")
    makeMfsp(spLaunch, bJS)
    makeConj(spLaunch, bJS)
    makePhonetTable(spLaunch, bJS)
    makeLocutions(spLaunch, bJS)







>
>







 







<
<






<


|
<
|



>
>
>



>
>
>
|
>
>
>
>
>

|
>
>
>






|







 







|
|

12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
...
283
284
285
286
287
288
289


290
291
292
293
294
295

296
297
298

299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
...
375
376
377
378
379
380
381
382
383
384
import platform

import graphspell.ibdawg as ibdawg
from graphspell.echo import echo
from graphspell.str_transform import defineSuffixCode
import graphspell.tokenizer as tkz

import gc_lang.fr.modules.conj as conj


oDict = None


class cd:
    """Context manager for changing the current working directory"""
    def __init__ (self, newPath):
................................................................................
        open(sp+"/modules-js/mfsp_data.json", "w", encoding="utf-8", newline="\n").write(sCode)


def makePhonetTable (sp, bJS=False):
    print("> Correspondances phonétiques ", end="")
    print("(Python et JavaScript)"  if bJS  else "(Python seulement)")



    loadDictionary()

    # set of homophonic words
    lSet = []
    for sLine in readFile(sp+"/data/phonet_simil.txt"):
        lWord = sLine.split()

        for sWord in lWord:
            if sWord.endswith("er") and conj.isVerb(sWord):
                lWord.extend(conj.getConjSimilInfiV1(sWord))

        lSet.append(set(lWord))

    # dictionary of words
    dWord = {}
    aMultiSetWord = set()
    lNewSet = []
    nAppend = 0
    for i, aSet in enumerate(lSet):
        for sWord in aSet:
            if oDict.lookup(sWord):
                if sWord not in dWord:
                    dWord[sWord] = i
                else:
                    # word in several set
                    aMultiSetWord.add(sWord)
                    iSet = dWord[sWord]
                    lNewSet.append(lSet[iSet].union(aSet))
                    dWord[sWord] = len(lSet) + nAppend
                    nAppend += 1
            else:
                echo(f"  Mot inconnu : <{sWord}>")
    lSet.extend(lNewSet)
    print("  Mots appartenant à plusieurs ensembles: ", ", ".join(aMultiSetWord))

    # dictionary of morphologies
    dMorph = {}
    for sWord in dWord:
        dMorph[sWord] = oDict.getMorph(sWord)

    # write file for Python
    sCode = "# generated data built in build_data.py (do not edit)\n\n" + \
            "dWord = " + str(dWord) + "\n\n" + \
            "lSet = " + str(lSet) + "\n\n" + \
            "dMorph = " + str(dMorph) + "\n"
    open(sp+"/modules/phonet_data.py", "w", encoding="utf-8", newline="\n").write(sCode)

    if bJS:
        ## write file for JavaScript
................................................................................
    print("========== Build Hunspell dictionaries ==========")
    makeDictionaries(spLaunch, dVars['oxt_version'])


def after (spLaunch, dVars, bJS=False):
    """Build the French data files by running each generator in turn.

    Prints a banner, then calls makeMfsp, makePhonetTable, makeConj and
    makeLocutions, in that order.

    spLaunch -- forwarded unchanged to every generator; makePhonetTable uses it
                as the base path for the files it reads and writes
    dVars    -- not used in this function
                (NOTE(review): presumably kept for a common hook signature -- confirm)
    bJS      -- forwarded unchanged to every generator; in makePhonetTable it
                selects whether JavaScript data is written as well as Python
    """
    print("========== Build French data ==========")
    makeMfsp(spLaunch, bJS)
    makePhonetTable(spLaunch, bJS)
    makeConj(spLaunch, bJS)
    makeLocutions(spLaunch, bJS)

Modified gc_lang/fr/modules/conj_data.py from [24f905ee0b] to [348618f642].

cannot compute difference between binary files

Modified gc_lang/fr/modules/phonet_data.py from [fcf5178674] to [30d2b2eb01].

cannot compute difference between binary files