Grammalecte  Check-in [ceaa0c6eae]

Overview
Comment: [build][fr] build_data: add locutions_vrac.txt for unchecked locutions
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | fr | build | Lexicographe
Files: files | file ages | folders
SHA3-256: ceaa0c6eaedd327ecd8daa62ac56c8e3b057427c3aaa88f606c9ca86780c9948
User & Date: olr on 2017-11-01 08:22:52
Other Links: branch diff | manifest | tags
Context
2017-11-01
09:20
[fr] data: locutions check-in: 144b46f14c user: olr tags: Lexicographe, fr
08:22
[build][fr] build_data: add locutions_vrac.txt for unchecked locutions check-in: ceaa0c6eae user: olr tags: Lexicographe, build, fr
08:16
[build][fr] build_data: readFile as a generator check-in: 1a082eec4e user: olr tags: Lexicographe, build, fr
Changes

Modified gc_lang/fr/build_data.py from [39bc8d4153] to [732462dec8].

3
4
5
6
7
8
9

10
11
12
13
14
15
16
...
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
# FRENCH DATA BUILDER
#
# by Olivier R.
# License: MPL 2

import json
import os


import grammalecte.ibdawg as ibdawg
from grammalecte.echo import echo
from grammalecte.str_transform import defineSuffixCode
import grammalecte.fr.conj as conj
import grammalecte.tokenizer as tkz

................................................................................

def makeLocutions (sp, bJS=False):
    "compile list of locutions in JSON"
    print("> Locutions ", end="")
    print("(Python et JavaScript)"  if bJS  else "(Python seulement)")
    dLocGraph = {}
    oTokenizer = tkz.Tokenizer("fr")
    for sLine in readFile(sp+"/data/locutions.txt"):
        dCur = dLocGraph
        sLoc, sTag = sLine.split("\t")
        for oToken in oTokenizer.genTokens(sLoc.strip()):
            sWord = oToken["sValue"]
            if sWord not in dCur:
                dCur[sWord] = {}
            dCur = dCur[sWord]







>







 







|







3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
...
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
# FRENCH DATA BUILDER
#
# by Olivier R.
# License: MPL 2

import json
import os
import itertools

import grammalecte.ibdawg as ibdawg
from grammalecte.echo import echo
from grammalecte.str_transform import defineSuffixCode
import grammalecte.fr.conj as conj
import grammalecte.tokenizer as tkz

................................................................................

def makeLocutions (sp, bJS=False):
    "compile list of locutions in JSON"
    print("> Locutions ", end="")
    print("(Python et JavaScript)"  if bJS  else "(Python seulement)")
    dLocGraph = {}
    oTokenizer = tkz.Tokenizer("fr")
    for sLine in itertools.chain(readFile(sp+"/data/locutions.txt"), readFile(sp+"/data/locutions_vrac.txt")):
        dCur = dLocGraph
        sLoc, sTag = sLine.split("\t")
        for oToken in oTokenizer.genTokens(sLoc.strip()):
            sWord = oToken["sValue"]
            if sWord not in dCur:
                dCur[sWord] = {}
            dCur = dCur[sWord]

Added gc_lang/fr/data/locutions_vrac.txt version [a7ffc6f8bf].