Grammalecte  Check-in [ca4c833876]

Overview
Comment:[build][graphspell] multiple main dictionaries
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | build | graphspell
Files: files | file ages | folders
SHA3-256: ca4c8338762a1621b88820f9c82d363db866ebe7962185250d42213d8c541799
User & Date: olr on 2018-05-04 10:29:05
Other Links: manifest | tags
Context
2018-05-04
11:37
[lo] load selected dictionary from saved options check-in: 7c4223d402 user: olr tags: lo, trunk
10:29
[build][graphspell] multiple main dictionaries check-in: ca4c833876 user: olr tags: build, graphspell, trunk
08:16
[graphspell][py] dawg builder: filter entries with regex check-in: 96692bb883 user: olr tags: graphspell, trunk
Changes

Modified gc_lang/fr/config.ini from [c14dc709ed] to [dbf2bf89ee].

12
13
14
15
16
17
18
19
20



21
22
23
24
25
26
27
link = http://grammalecte.net
description = Correcteur grammatical pour le français.
extras = README_fr.txt
logo = logo.png

# main dictionary
lexicon_src = lexicons/French.lex
dic_filename = fr
dic_name = French



# extended dictionary
lexicon_extended_src = lexicons/French.extended.lex
dic_extended_filename = fr.extended
dic_extended_name = Français - dictionnaire étendu
# community dictionary
lexicon_community_src = lexicons/French.community.lex
dic_community_filename = fr.community







|
|
>
>
>







12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
link = http://grammalecte.net
description = Correcteur grammatical pour le français.
extras = README_fr.txt
logo = logo.png

# main dictionary
lexicon_src = lexicons/French.lex
dic_filenames = fr-allvars,fr-classic,fr-reform
dic_name = Français,Français (Classique/Moderne),Français (Réforme 1990)
dic_filter = ,[*CMPX]$,[*RPX]$
dic_default_filename_py = fr-allvars
dic_default_filename_js = fr-allvars
# extended dictionary
lexicon_extended_src = lexicons/French.extended.lex
dic_extended_filename = fr.extended
dic_extended_name = Français - dictionnaire étendu
# community dictionary
lexicon_community_src = lexicons/French.community.lex
dic_community_filename = fr.community

Modified gc_lang/fr/dictionnaire/genfrdic.py from [5f240a0703] to [59732a18e1].

59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
               'shortname': '“Classique”',
               'asciiName': 'fr-classique',
               'mozAsciiName': 'fr-FR-classic',
               'subDicts': '*MCX',
               'mozId': 'fr-dicollecte-classique',
               'description': "Dictionnaire français “Classique”" }

dCLASSIQUEX = { 'name': 'DICTIONNAIRE ORTHOGRAPHIQUE FRANÇAIS “CLASSIQUE ÉTENDU”',
                'shortname': '“Classique étendu”',
                'asciiName': 'fr-classique-ext',
                'mozAsciiName': 'fr-FR-classic-ext',
                'subDicts': '*MCX',
                'mozId': 'fr-dicollecte-classique-ext',
                'description': "Dictionnaire français “Classique étendu”" }

dREFORME1990 = { 'name': 'DICTIONNAIRE ORTHOGRAPHIQUE FRANÇAIS “RÉFORME 1990”',
                 'shortname': '“Réforme 1990”',
                 'asciiName': 'fr-reforme1990',
                 'mozAsciiName': 'fr-FR-reform',
                 'subDicts': '*RX',
                 'mozId': 'fr-dicollecte-reforme1990',
                 'description': "Dictionnaire français “Réforme 1990”" }







<
<
<
<
<
<
<
<







59
60
61
62
63
64
65








66
67
68
69
70
71
72
               'shortname': '“Classique”',
               'asciiName': 'fr-classique',
               'mozAsciiName': 'fr-FR-classic',
               'subDicts': '*MCX',
               'mozId': 'fr-dicollecte-classique',
               'description': "Dictionnaire français “Classique”" }









dREFORME1990 = { 'name': 'DICTIONNAIRE ORTHOGRAPHIQUE FRANÇAIS “RÉFORME 1990”',
                 'shortname': '“Réforme 1990”',
                 'asciiName': 'fr-reforme1990',
                 'mozAsciiName': 'fr-FR-reform',
                 'subDicts': '*RX',
                 'mozId': 'fr-dicollecte-reforme1990',
                 'description': "Dictionnaire français “Réforme 1990”" }

Modified gc_lang/fr/modules/tests.py from [43d45242b9] to [2e6f413e05].

20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
    return s.replace("\u2019", "'").replace("\u2013", "–").replace("\u2014", "—")


class TestDictionary (unittest.TestCase):

    @classmethod
    def setUpClass (cls):
        cls.oDic = IBDAWG("${dic_filename}.bdic")

    def test_lookup (self):
        for sWord in ["branche", "Émilie"]:
            self.assertTrue(self.oDic.lookup(sWord), sWord)

    def test_lookup_failed (self):
        for sWord in ["Branche", "BRANCHE", "BranchE", "BRanche", "BRAnCHE", "émilie"]:







|







20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
    return s.replace("\u2019", "'").replace("\u2013", "–").replace("\u2014", "—")


class TestDictionary (unittest.TestCase):

    @classmethod
    def setUpClass (cls):
        cls.oDic = IBDAWG("${dic_main_filename_py}")

    def test_lookup (self):
        for sWord in ["branche", "Émilie"]:
            self.assertTrue(self.oDic.lookup(sWord), sWord)

    def test_lookup_failed (self):
        for sWord in ["Branche", "BRANCHE", "BranchE", "BRanche", "BRAnCHE", "émilie"]:

Modified gc_lang/fr/oxt/ContextMenu/ContextMenu.py from [512c45de75] to [03a78a32c7].

127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
            if not oSpellChecker:
                xCurCtx = uno.getComponentContext()
                oGC = self.ctx.ServiceManager.createInstanceWithContext("org.openoffice.comp.pyuno.Lightproof.grammalecte", self.ctx)
                if hasattr(oGC, "getSpellChecker"):
                    # https://bugs.documentfoundation.org/show_bug.cgi?id=97790
                    oSpellChecker = oGC.getSpellChecker()
                else:
                    oSpellChecker = SpellChecker("${lang}", "${dic_filename}.bdic")
            if not oLexicographe:
                oLexicographe = lxg.Lexicographe(oSpellChecker)
        except:
            traceback.print_exc()
        
    def execute (self, args):
        if not args:







|







127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
            if not oSpellChecker:
                xCurCtx = uno.getComponentContext()
                oGC = self.ctx.ServiceManager.createInstanceWithContext("org.openoffice.comp.pyuno.Lightproof.grammalecte", self.ctx)
                if hasattr(oGC, "getSpellChecker"):
                    # https://bugs.documentfoundation.org/show_bug.cgi?id=97790
                    oSpellChecker = oGC.getSpellChecker()
                else:
                    oSpellChecker = SpellChecker("${lang}", "fr-allvars.bdic")
            if not oLexicographe:
                oLexicographe = lxg.Lexicographe(oSpellChecker)
        except:
            traceback.print_exc()
        
    def execute (self, args):
        if not args:

Modified gc_lang/fr/oxt/Graphspell.py from [46a0993dea] to [0c5cbde982].

61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
                sPersonalDicJSON = self.xOptionNode.getPropertyValue("personal_dic")
                if sPersonalDicJSON:
                    try:
                        personal_dic = json.loads(sPersonalDicJSON)
                    except:
                        print("Graphspell: wrong personal_dic")
                        traceback.print_exc()
            self.oGraphspell = SpellChecker("fr", "fr.bdic", "", "", personal_dic)
            self.loadHunspell()
            # print("Graphspell: init done")
        except:
            print("Graphspell: init failed")
            traceback.print_exc()
    
    def loadHunspell (self):







|







61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
                sPersonalDicJSON = self.xOptionNode.getPropertyValue("personal_dic")
                if sPersonalDicJSON:
                    try:
                        personal_dic = json.loads(sPersonalDicJSON)
                    except:
                        print("Graphspell: wrong personal_dic")
                        traceback.print_exc()
            self.oGraphspell = SpellChecker("fr", "fr-allvars.bdic", "", "", personal_dic)
            self.loadHunspell()
            # print("Graphspell: init done")
        except:
            print("Graphspell: init failed")
            traceback.print_exc()
    
    def loadHunspell (self):

Modified gc_lang/fr/rules.grx from [6e3d910360] to [a037a65419].

11494
11495
11496
11497
11498
11499
11500
11501
11502
11503
11504
11505
11506
11507
11508
    <<- morphex(\2, ":V", ":(?:G|2p|3p!|[ISK].*:2s)")
    -2>> =suggVerb(@, ":2s")                                 # Conjugaison erronée. Accord avec « \1 ». Le verbe devrait être à la 2ᵉ personne du singulier.

TEST: Tu ne {{ment}} jamais.
TEST: Tu {{a}} mal ?
TEST: Tu ne le lui {{prend}} pas.
TEST: Tu ne m’{{attendra}} pas.
TEST: toi qui n’y {{connait}} rien, ne nous ennuie pas avec tes théories.


## 3sg
__[i]/conj(conj_il)__
    (?<!t’)(il) +({w_1})  @@0,$
    <<- morphex(\2, ":V", ":(?:3s|P|G)") and not (morph(\2, ":[PQ]", False) and morph(word(-1), ":V0.*:3s", False, False))
    -2>> =suggVerb(@, ":3s")                                 # Conjugaison erronée. Accord avec « \1 ». Le verbe devrait être à la 3ᵉ personne du singulier.







|







11494
11495
11496
11497
11498
11499
11500
11501
11502
11503
11504
11505
11506
11507
11508
    <<- morphex(\2, ":V", ":(?:G|2p|3p!|[ISK].*:2s)")
    -2>> =suggVerb(@, ":2s")                                 # Conjugaison erronée. Accord avec « \1 ». Le verbe devrait être à la 2ᵉ personne du singulier.

TEST: Tu ne {{ment}} jamais.
TEST: Tu {{a}} mal ?
TEST: Tu ne le lui {{prend}} pas.
TEST: Tu ne m’{{attendra}} pas.
TEST: toi qui n’y {{connaît}} rien, ne nous ennuie pas avec tes théories.


## 3sg
__[i]/conj(conj_il)__
    (?<!t’)(il) +({w_1})  @@0,$
    <<- morphex(\2, ":V", ":(?:3s|P|G)") and not (morph(\2, ":[PQ]", False) and morph(word(-1), ":V0.*:3s", False, False))
    -2>> =suggVerb(@, ":3s")                                 # Conjugaison erronée. Accord avec « \1 ». Le verbe devrait être à la 3ᵉ personne du singulier.

Modified graphspell/dawg.py from [64364f5bf4] to [eb988983d4].

10
11
12
13
14
15
16


17
18
19
20
21
22
23
..
59
60
61
62
63
64
65

66
67
68
69
70
71
72
..
95
96
97
98
99
100
101

102
103
104
105
106
107
108


import sys
import os
import collections
import json
import time



from . import str_transform as st
from .progressbar import ProgressBar



def readFile (spf):
................................................................................
        lTag  = [];   dTag  = {}; nTag  = 0; dTagOccur = {}
        nErr = 0

        try:
            zFilter = re.compile(sSelectFilterRegex)  if sSelectFilterRegex  else None
        except:
            print(" # Error. Wrong filter regex. Filter ignored.")

            zFilter = None

        # read lexicon
        if type(src) is str:
            iterable = readFile(src)
        else:
            iterable = src
................................................................................
                dTagOccur[sTag] = dTagOccur.get(sTag, 0) + 1
                aEntry.add((sFlex, dAff[sAff], dTag[sTag]))
        if not aEntry:
            raise ValueError("# Error. Empty lexicon")
        
        # Preparing DAWG
        print(" > Preparing list of words")

        lVal = lChar + lAff + lTag
        lWord = [ [dChar[c] for c in sFlex] + [iAff+nChar] + [iTag+nChar+nAff]  for sFlex, iAff, iTag in aEntry ]
        aEntry = None
        
        # Dictionary of arc value occurrences, to sort the arcs of each node
        dValOccur = dict( [ (dChar[c], dCharOccur[c])  for c in dChar ] \
                        + [ (dAff[aff]+nChar, dAffOccur[aff]) for aff in dAff ] \







>
>







 







>







 







>







10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
..
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
..
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112


import sys
import os
import collections
import json
import time
import re
import traceback

from . import str_transform as st
from .progressbar import ProgressBar



def readFile (spf):
................................................................................
        lTag  = [];   dTag  = {}; nTag  = 0; dTagOccur = {}
        nErr = 0

        try:
            zFilter = re.compile(sSelectFilterRegex)  if sSelectFilterRegex  else None
        except:
            print(" # Error. Wrong filter regex. Filter ignored.")
            traceback.print_exc()
            zFilter = None

        # read lexicon
        if type(src) is str:
            iterable = readFile(src)
        else:
            iterable = src
................................................................................
                dTagOccur[sTag] = dTagOccur.get(sTag, 0) + 1
                aEntry.add((sFlex, dAff[sAff], dTag[sTag]))
        if not aEntry:
            raise ValueError("# Error. Empty lexicon")
        
        # Preparing DAWG
        print(" > Preparing list of words")
        print(" Filter: " + (sSelectFilterRegex or "[None]"))
        lVal = lChar + lAff + lTag
        lWord = [ [dChar[c] for c in sFlex] + [iAff+nChar] + [iTag+nChar+nAff]  for sFlex, iAff, iTag in aEntry ]
        aEntry = None
        
        # Dictionary of arc value occurrences, to sort the arcs of each node
        dValOccur = dict( [ (dChar[c], dCharOccur[c])  for c in dChar ] \
                        + [ (dAff[aff]+nChar, dAffOccur[aff]) for aff in dAff ] \

Modified lex_build.py from [2d1c4b9aa4] to [346704203c].

5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
import argparse
from distutils import dir_util

import graphspell.dawg as fsa
from graphspell.ibdawg import IBDAWG


def build (spfSrc, sLangCode, sLangName, sfDict, bJSON=False, sDicName="", cStemmingMethod="S", nCompressMethod=1):
    """Transform a text lexicon into a binary indexable dictionary.

    A DAWG is created from <spfSrc>; <cStemmingMethod>, <sLangCode>,
    <sLangName> and <sDicName> are passed unchanged to its constructor.
    writeInfo() and writeBinary() are then called with the paths
    graphspell/_dictionaries/<sfDict>.info.txt and
    graphspell/_dictionaries/<sfDict>.bdic; <nCompressMethod> is converted
    to int before being given to writeBinary().

    If <bJSON> is true, <sfDict>.bdic is loaded with IBDAWG and
    writeAsJSObject() is called with the path
    graphspell-js/_dictionaries/<sfDict>.json.
    """
    oDAWG = fsa.DAWG(spfSrc, cStemmingMethod, sLangCode, sLangName, sDicName)
    # make sure the destination folder exists before writing into it
    dir_util.mkpath("graphspell/_dictionaries")
    oDAWG.writeInfo("graphspell/_dictionaries/" + sfDict + ".info.txt")
    oDAWG.writeBinary("graphspell/_dictionaries/" + sfDict + ".bdic", int(nCompressMethod))
    if bJSON:
        dir_util.mkpath("graphspell-js/_dictionaries")
        # NOTE(review): only a file name is given here, not a path; IBDAWG presumably
        # looks for it in the graphspell dictionaries folder -- to be confirmed
        oDic = IBDAWG(sfDict + ".bdic")
        oDic.writeAsJSObject("graphspell-js/_dictionaries/" + sfDict + ".json", bBinaryDictAsHexString=True)







|

|







5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
import argparse
from distutils import dir_util

import graphspell.dawg as fsa
from graphspell.ibdawg import IBDAWG


def build (spfSrc, sLangCode, sLangName, sfDict, bJSON=False, sDicName="", sFilter="", cStemmingMethod="S", nCompressMethod=1):
    """Transform a text lexicon into a binary indexable dictionary.

    A DAWG is created from <spfSrc>; <cStemmingMethod>, <sLangCode>,
    <sLangName>, <sDicName> and <sFilter> are passed unchanged to its
    constructor. The info file (<sfDict>.info.txt) and the binary dictionary
    (<sfDict>.bdic) are requested in graphspell/_dictionaries;
    <nCompressMethod> is converted to int for writeBinary().

    If <bJSON> is true, <sfDict>.bdic is loaded with IBDAWG and exported with
    writeAsJSObject() to graphspell-js/_dictionaries/<sfDict>.json.
    """
    oDAWG = fsa.DAWG(spfSrc, cStemmingMethod, sLangCode, sLangName, sDicName, sFilter)

    # Python side: info file + binary dictionary
    dir_util.mkpath("graphspell/_dictionaries")
    spfPyBase = "graphspell/_dictionaries/" + sfDict
    oDAWG.writeInfo(spfPyBase + ".info.txt")
    oDAWG.writeBinary(spfPyBase + ".bdic", int(nCompressMethod))

    if not bJSON:
        return

    # JavaScript side: the freshly written binary dictionary is reloaded and exported as JSON
    dir_util.mkpath("graphspell-js/_dictionaries")
    spfJSDic = "graphspell-js/_dictionaries/" + sfDict + ".json"
    IBDAWG(sfDict + ".bdic").writeAsJSObject(spfJSDic, bBinaryDictAsHexString=True)

Modified make.py from [a8ca755148] to [eb03bf4198].

312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
...
328
329
330
331
332
333
334

335
336


337
338
339
340
341
342
343




344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
    dVars["dic_main_filename_js"] = ""
    dVars["dic_extended_filename_py"] = ""
    dVars["dic_extended_filename_js"] = ""
    dVars["dic_community_filename_py"] = ""
    dVars["dic_community_filename_js"] = ""
    dVars["dic_personal_filename_py"] = ""
    dVars["dic_personal_filename_js"] = ""
    lDict = [ ("main", dVars['dic_filename']) ]
    if bExtendedDict:
        lDict.append(("extended", dVars['dic_extended_filename']))
    if bCommunityDict:
        lDict.append(("community", dVars['dic_community_filename']))
    if bPersonalDict:
        lDict.append(("personal", dVars['dic_personal_filename']))
    for sType, sFileName in lDict:
................................................................................
        spfJSDic = "graphspell-js/_dictionaries/" + sFileName + ".json"
        if not os.path.isfile(spfPyDic) or (bJavaScript and not os.path.isfile(spfJSDic)):
            buildDictionary(dVars, sType, bJavaScript)
        print(spfPyDic)
        file_util.copy_file(spfPyDic, "grammalecte/graphspell/_dictionaries")
        dVars['dic_'+sType+'_filename_py'] = sFileName + '.bdic'
        if bJavaScript:

            file_util.copy_file(spfJSDic, "grammalecte-js/graphspell/_dictionaries")
            dVars['dic_'+sType+'_filename_js'] = sFileName + '.json'




def buildDictionary (dVars, sType, bJavaScript=False):
    """Build the dictionary of kind <sType> by calling lex_build.build().

    <dVars> is the mapping of configuration values. <sType> selects which
    entries are read for the lexicon source, the destination file name and
    the dictionary name:
        "main"       -> lexicon_src, dic_filename, dic_name
        "extended"   -> lexicon_extended_src, dic_extended_filename, dic_extended_name
        "community"  -> lexicon_community_src, dic_community_filename, dic_community_name
        "personal"   -> lexicon_personal_src, dic_personal_filename, dic_personal_name
    The entries lang, lang_name, stemming_method and fsa_method (converted
    to int) are read for every kind. <bJavaScript> is forwarded to
    lex_build.build().

    NOTE: for any other value of <sType>, no branch assigns the local
    variables, so the final call fails on an unassigned name.
    """
    if sType == "main":
        spfLexSrc = dVars['lexicon_src']
        sfDictDst = dVars['dic_filename']
        sDicName = dVars['dic_name']




    elif sType == "extended":
        spfLexSrc = dVars['lexicon_extended_src']
        sfDictDst = dVars['dic_extended_filename']
        sDicName = dVars['dic_extended_name']
    elif sType == "community":
        spfLexSrc = dVars['lexicon_community_src']
        sfDictDst = dVars['dic_community_filename']
        sDicName = dVars['dic_community_name']
    elif sType == "personal":
        spfLexSrc = dVars['lexicon_personal_src']
        sfDictDst = dVars['dic_personal_filename']
        sDicName = dVars['dic_personal_name']
    # single build call shared by all kinds of dictionary
    lex_build.build(spfLexSrc, dVars['lang'], dVars['lang_name'], sfDictDst, bJavaScript, sDicName, dVars['stemming_method'], int(dVars['fsa_method']))



def main ():
    print("Python: " + sys.version)
    xParser = argparse.ArgumentParser()
    xParser.add_argument("lang", type=str, nargs='+', help="lang project to generate (name of folder in /lang)")







|







 







>


>
>





|
|
>
>
>
>
|
|
|
|
|
|
|
|
|
|
|
|
|







312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
...
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
    dVars["dic_main_filename_js"] = ""
    dVars["dic_extended_filename_py"] = ""
    dVars["dic_extended_filename_js"] = ""
    dVars["dic_community_filename_py"] = ""
    dVars["dic_community_filename_js"] = ""
    dVars["dic_personal_filename_py"] = ""
    dVars["dic_personal_filename_js"] = ""
    lDict = [ ("main", s)  for s in dVars['dic_filenames'].split(",") ]
    if bExtendedDict:
        lDict.append(("extended", dVars['dic_extended_filename']))
    if bCommunityDict:
        lDict.append(("community", dVars['dic_community_filename']))
    if bPersonalDict:
        lDict.append(("personal", dVars['dic_personal_filename']))
    for sType, sFileName in lDict:
................................................................................
        spfJSDic = "graphspell-js/_dictionaries/" + sFileName + ".json"
        if not os.path.isfile(spfPyDic) or (bJavaScript and not os.path.isfile(spfJSDic)):
            buildDictionary(dVars, sType, bJavaScript)
        print(spfPyDic)
        file_util.copy_file(spfPyDic, "grammalecte/graphspell/_dictionaries")
        dVars['dic_'+sType+'_filename_py'] = sFileName + '.bdic'
        if bJavaScript:
            print(spfJSDic)
            file_util.copy_file(spfJSDic, "grammalecte-js/graphspell/_dictionaries")
            dVars['dic_'+sType+'_filename_js'] = sFileName + '.json'
    dVars['dic_main_filename_py'] = dVars['dic_default_filename_py'] + ".bdic"
    dVars['dic_main_filename_js'] = dVars['dic_default_filename_js'] + ".json"


def buildDictionary (dVars, sType, bJavaScript=False):
    """Build the dictionaries of kind <sType> by calling lex_build.build().

    <dVars> is the mapping of configuration values; lang, lang_name,
    stemming_method and fsa_method (converted to int) are read for every kind.
    <bJavaScript> is forwarded to lex_build.build().

    <sType> == "main":
        Several dictionaries are built from the single lexicon lexicon_src.
        dic_filenames, dic_name and dic_filter are comma-separated lists read
        in parallel: the n-th file name goes with the n-th name and the n-th
        filter (an empty item means no filter). As the lists are split on
        ",", an item cannot contain a comma itself.
    <sType> == "extended", "community" or "personal":
        One dictionary is built, without filter, from lexicon_<sType>_src,
        dic_<sType>_filename and dic_<sType>_name.

    Raises ValueError if <sType> is unknown, or if dic_name or dic_filter has
    fewer items than dic_filenames.
    """
    if sType == "main":
        spfLexSrc = dVars['lexicon_src']
        l_sfDictDst = dVars['dic_filenames'].split(",")
        l_sDicName = dVars['dic_name'].split(",")
        l_sFilter = dVars['dic_filter'].split(",")
        # zip() stops at the shortest list: without this check, some of the expected
        # dictionaries would silently not be built (surplus names/filters are still ignored)
        if len(l_sDicName) < len(l_sfDictDst) or len(l_sFilter) < len(l_sfDictDst):
            raise ValueError("# Error. dic_name ({}) and dic_filter ({}) need at least as many items as dic_filenames ({})" \
                             .format(len(l_sDicName), len(l_sFilter), len(l_sfDictDst)))
        for sfDictDst, sDicName, sFilter in zip(l_sfDictDst, l_sDicName, l_sFilter):
            lex_build.build(spfLexSrc, dVars['lang'], dVars['lang_name'], sfDictDst, bJavaScript, sDicName, sFilter, dVars['stemming_method'], int(dVars['fsa_method']))
        return
    if sType not in ("extended", "community", "personal"):
        # explicit error instead of a failure on unassigned local variables
        raise ValueError("# Error. Unknown type of dictionary: " + repr(sType))
    spfLexSrc = dVars['lexicon_' + sType + '_src']
    sfDictDst = dVars['dic_' + sType + '_filename']
    sDicName = dVars['dic_' + sType + '_name']
    lex_build.build(spfLexSrc, dVars['lang'], dVars['lang_name'], sfDictDst, bJavaScript, sDicName, "", dVars['stemming_method'], int(dVars['fsa_method']))



def main ():
    print("Python: " + sys.version)
    xParser = argparse.ArgumentParser()
    xParser.add_argument("lang", type=str, nargs='+', help="lang project to generate (name of folder in /lang)")