Grammalecte  Check-in [05fb167483]

Overview
Comment:[build][graphspell][lo] dictionary: drop support for binary file -> use JSON
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | build | lo | graphspell | dict2
Files: files | file ages | folders
SHA3-256: 05fb167483021d19c671911fede3595b5b049d849f4082abcdbe0ad4776ad1e6
User & Date: olr on 2020-11-04 11:37:30
Other Links: branch diff | manifest | tags
Context
2020-11-04
12:02
[graphspell] ibdawg: code cleaning, remove old code, useless compression versions check-in: 86250e8e6c user: olr tags: dict2, graphspell
11:37
[build][graphspell][lo] dictionary: drop support for binary file -> use JSON check-in: 05fb167483 user: olr tags: build, dict2, graphspell, lo
2020-11-03
12:35
[fr] ajustements check-in: 3cffdae3b0 user: olr tags: fr, trunk
Changes

Modified gc_lang/fr/build_data.py from [6e865955c0] to [3d9c0f4ca9].

48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
        raise OSError("# Error. File not found or not loadable: " + spf)


def loadDictionary ():
    global oDict
    if not oDict:
        try:
            oDict = ibdawg.IBDAWG("fr-allvars.bdic")
        except:
            traceback.print_exc()


def makeDictionaries (sp, sVersion):
    with cd(sp+"/dictionnaire"):
        if platform.system() == "Windows":







|







48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
        raise OSError("# Error. File not found or not loadable: " + spf)


def loadDictionary ():
    global oDict
    if not oDict:
        try:
            oDict = ibdawg.IBDAWG("fr-allvars.json")
        except:
            traceback.print_exc()


def makeDictionaries (sp, sVersion):
    with cd(sp+"/dictionnaire"):
        if platform.system() == "Windows":

Modified gc_lang/fr/oxt/ContextMenu/ContextMenu.py from [c17c3b29b1] to [f3255b533a].

131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
            if not oSpellChecker:
                xCurCtx = uno.getComponentContext()
                oGC = self.ctx.ServiceManager.createInstanceWithContext("org.openoffice.comp.pyuno.Lightproof.grammalecte", self.ctx)
                if hasattr(oGC, "getSpellChecker"):
                    # https://bugs.documentfoundation.org/show_bug.cgi?id=97790
                    oSpellChecker = oGC.getSpellChecker()
                else:
                    oSpellChecker = SpellChecker("${lang}", "fr-allvars.bdic")
        except:
            traceback.print_exc()

    def execute (self, args):
        if not args:
            return
        try:







|







131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
            if not oSpellChecker:
                xCurCtx = uno.getComponentContext()
                oGC = self.ctx.ServiceManager.createInstanceWithContext("org.openoffice.comp.pyuno.Lightproof.grammalecte", self.ctx)
                if hasattr(oGC, "getSpellChecker"):
                    # https://bugs.documentfoundation.org/show_bug.cgi?id=97790
                    oSpellChecker = oGC.getSpellChecker()
                else:
                    oSpellChecker = SpellChecker("${lang}", "fr-allvars.json")
        except:
            traceback.print_exc()

    def execute (self, args):
        if not args:
            return
        try:

Modified gc_lang/fr/oxt/DictOptions/LexiconEditor.py from [828f4f365e] to [5ef5214006].

404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
...
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473

    @_waitPointer
    def importDictionary (self):
        spfImported = ""
        try:
            xFilePicker = self.xSvMgr.createInstanceWithContext('com.sun.star.ui.dialogs.FilePicker', self.ctx)  # other possibility: com.sun.star.ui.dialogs.SystemFilePicker
            xFilePicker.initialize([uno.getConstantByName("com.sun.star.ui.dialogs.TemplateDescription.FILEOPEN_SIMPLE")]) # seems useless
            xFilePicker.appendFilter("Supported files", "*.json; *.bdic")
            xFilePicker.setDefaultName("fr.__personal__.json") # useless, doesn’t work
            xFilePicker.setDisplayDirectory("")
            xFilePicker.setMultiSelectionMode(False)
            nResult = xFilePicker.execute()
            if nResult == 1:
                # lFile = xFilePicker.getSelectedFiles()
                lFile = xFilePicker.getFiles()
................................................................................
            self.xDateDic.Label = self.dUI.get("void", "#err")
        MessageBox(self.xDocument, self.dUI.get('save_message', "#err"), self.dUI.get('save_title', "#err"))

    def exportDictionary (self):
        try:
            xFilePicker = self.xSvMgr.createInstanceWithContext('com.sun.star.ui.dialogs.FilePicker', self.ctx)  # other possibility: com.sun.star.ui.dialogs.SystemFilePicker
            xFilePicker.initialize([uno.getConstantByName("com.sun.star.ui.dialogs.TemplateDescription.FILESAVE_SIMPLE")]) # seems useless
            xFilePicker.appendFilter("Supported files", "*.json; *.bdic")
            xFilePicker.setDefaultName("fr.__personal__.json") # useless, doesn’t work
            xFilePicker.setDisplayDirectory("")
            xFilePicker.setMultiSelectionMode(False)
            nResult = xFilePicker.execute()
            if nResult == 1:
                # lFile = xFilePicker.getSelectedFiles()
                lFile = xFilePicker.getFiles()







|







 







|







404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
...
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473

    @_waitPointer
    def importDictionary (self):
        spfImported = ""
        try:
            xFilePicker = self.xSvMgr.createInstanceWithContext('com.sun.star.ui.dialogs.FilePicker', self.ctx)  # other possibility: com.sun.star.ui.dialogs.SystemFilePicker
            xFilePicker.initialize([uno.getConstantByName("com.sun.star.ui.dialogs.TemplateDescription.FILEOPEN_SIMPLE")]) # seems useless
            xFilePicker.appendFilter("Supported files", "*.json")
            xFilePicker.setDefaultName("fr.__personal__.json") # useless, doesn’t work
            xFilePicker.setDisplayDirectory("")
            xFilePicker.setMultiSelectionMode(False)
            nResult = xFilePicker.execute()
            if nResult == 1:
                # lFile = xFilePicker.getSelectedFiles()
                lFile = xFilePicker.getFiles()
................................................................................
            self.xDateDic.Label = self.dUI.get("void", "#err")
        MessageBox(self.xDocument, self.dUI.get('save_message', "#err"), self.dUI.get('save_title', "#err"))

    def exportDictionary (self):
        try:
            xFilePicker = self.xSvMgr.createInstanceWithContext('com.sun.star.ui.dialogs.FilePicker', self.ctx)  # other possibility: com.sun.star.ui.dialogs.SystemFilePicker
            xFilePicker.initialize([uno.getConstantByName("com.sun.star.ui.dialogs.TemplateDescription.FILESAVE_SIMPLE")]) # seems useless
            xFilePicker.appendFilter("Supported files", "*.json")
            xFilePicker.setDefaultName("fr.__personal__.json") # useless, doesn’t work
            xFilePicker.setDisplayDirectory("")
            xFilePicker.setMultiSelectionMode(False)
            nResult = xFilePicker.execute()
            if nResult == 1:
                # lFile = xFilePicker.getSelectedFiles()
                lFile = xFilePicker.getFiles()

Modified gc_lang/fr/oxt/DictOptions/SearchWords.py from [764a885065] to [2c4ada79ef].

182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
            elif xActionEvent.ActionCommand == "Close":
                self.xContainer.endExecute()
        except:
            traceback.print_exc()

    def initSpellChecker (self):
        if not self.oSpellChecker:
            self.oSpellChecker = sc.SpellChecker("fr", "fr-allvars.bdic", "", self.oPersonalDicJSON)

    @_waitPointer
    def searchSimilar (self):
        self.initSpellChecker()
        sWord = self.xWord.Text.strip()
        if sWord:
            xGridDataModel = self.xGridModel.GridDataModel







|







182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
            elif xActionEvent.ActionCommand == "Close":
                self.xContainer.endExecute()
        except:
            traceback.print_exc()

    def initSpellChecker (self):
        if not self.oSpellChecker:
            self.oSpellChecker = sc.SpellChecker("fr", "fr-allvars.json", "", self.oPersonalDicJSON)

    @_waitPointer
    def searchSimilar (self):
        self.initSpellChecker()
        sWord = self.xWord.Text.strip()
        if sWord:
            xGridDataModel = self.xGridModel.GridDataModel

Modified gc_lang/fr/oxt/Graphspell.py from [810dc52bd8] to [76770ac233].

65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
                sPersonalDicJSON = self.xOptionNode.getPropertyValue("personal_dic")
                if sPersonalDicJSON:
                    try:
                        personal_dic = json.loads(sPersonalDicJSON)
                    except:
                        print("Graphspell: wrong personal_dic")
                        traceback.print_exc()
            self.oGraphspell = SpellChecker("fr", "fr-"+sMainDicName+".bdic", "", personal_dic)
            self.loadHunspell()
            # print("Graphspell: init done")
        except:
            print("Graphspell: init failed")
            traceback.print_exc()

    def loadHunspell (self):







|







65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
                sPersonalDicJSON = self.xOptionNode.getPropertyValue("personal_dic")
                if sPersonalDicJSON:
                    try:
                        personal_dic = json.loads(sPersonalDicJSON)
                    except:
                        print("Graphspell: wrong personal_dic")
                        traceback.print_exc()
            self.oGraphspell = SpellChecker("fr", "fr-"+sMainDicName+".json", "", personal_dic)
            self.loadHunspell()
            # print("Graphspell: init done")
        except:
            print("Graphspell: init failed")
            traceback.print_exc()

    def loadHunspell (self):

Modified gc_lang/fr/setup.py from [8a0db0631b] to [955f5741fe].

89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
    #     'test': ['coverage'],
    # },

    # If there are data files included in your packages that need to be
    # installed, specify them here.  If using Python 2.6 or less, then these
    # have to be included in MANIFEST.in as well.
    package_data={
        'grammalecte': ['graphspell/_dictionaries/*.bdic', '*.txt']
    },

    # Although 'package_data' is the preferred approach, in some case you may
    # need to place data files outside of your packages. See:
    # http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files # noqa
    # In this case, 'data_file' will be installed into '<sys.prefix>/my_data'
    # data_files=[('my_data', ['data/data_file'])],







|







89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
    #     'test': ['coverage'],
    # },

    # If there are data files included in your packages that need to be
    # installed, specify them here.  If using Python 2.6 or less, then these
    # have to be included in MANIFEST.in as well.
    package_data={
        'grammalecte': ['graphspell/_dictionaries/*.json', '*.txt']
    },

    # Although 'package_data' is the preferred approach, in some case you may
    # need to place data files outside of your packages. See:
    # http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files # noqa
    # In this case, 'data_file' will be installed into '<sys.prefix>/my_data'
    # data_files=[('my_data', ['data/data_file'])],

Modified graphspell/dawg.py from [b60434a390] to [c083d6a347].

474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
            # Mozilla’s JS parser don’t like file bigger than 4 Mb!
            # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension.
            # https://github.com/mozilla/addons-linter/issues/1361
            "sByDic": byDic.hex()  if bBinaryDictAsHexString  else [ e  for e in byDic ],
            "l2grams": list(self.a2grams)
        }

    def writeAsJSObject (self, spfDst, nCompressionMethod, bInJSModule=False, bBinaryDictAsHexString=True):
        "write a file (JSON or JS module) with all the necessary data"
        if not spfDst.endswith(".json"):
            spfDst += "."+str(nCompressionMethod)+".json"
        with open(spfDst, "w", encoding="utf-8", newline="\n") as hDst:
            if bInJSModule:
                hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ')
            hDst.write( json.dumps(self.getBinaryAsJSON(nCompressionMethod, bBinaryDictAsHexString), ensure_ascii=False) )







|







474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
            # Mozilla’s JS parser don’t like file bigger than 4 Mb!
            # So, if necessary, we use an hexadecimal string, that we will convert later in Firefox’s extension.
            # https://github.com/mozilla/addons-linter/issues/1361
            "sByDic": byDic.hex()  if bBinaryDictAsHexString  else [ e  for e in byDic ],
            "l2grams": list(self.a2grams)
        }

    def writeAsJSObject (self, spfDst, nCompressionMethod=1, bInJSModule=False, bBinaryDictAsHexString=True):
        "write a file (JSON or JS module) with all the necessary data"
        if not spfDst.endswith(".json"):
            spfDst += "."+str(nCompressionMethod)+".json"
        with open(spfDst, "w", encoding="utf-8", newline="\n") as hDst:
            if bInJSModule:
                hDst.write('// JavaScript\n// Generated data (do not edit)\n\n"use strict";\n\nconst dictionary = ')
            hDst.write( json.dumps(self.getBinaryAsJSON(nCompressionMethod, bBinaryDictAsHexString), ensure_ascii=False) )

Modified graphspell/spellchecker.py from [2bdbe76996] to [9b47d651ea].

12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import traceback

from . import ibdawg
from . import tokenizer


dDefaultDictionaries = {
    "fr": "fr-allvars.bdic",
    "en": "en.bdic"
}


class SpellChecker ():
    "SpellChecker: wrapper for the IBDAWG class"

    def __init__ (self, sLangCode, sfMainDic="", sfCommunityDic="", sfPersonalDic=""):







|
|







12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import traceback

from . import ibdawg
from . import tokenizer


dDefaultDictionaries = {
    "fr": "fr-allvars.json",
    "en": "en.json"
}


class SpellChecker ():
    "SpellChecker: wrapper for the IBDAWG class"

    def __init__ (self, sLangCode, sfMainDic="", sfCommunityDic="", sfPersonalDic=""):

Modified lex_build.py from [0d00b07703] to [1b2b5d0ea9].

7
8
9
10
11
12
13
14
15
16
17
18
19

20
21

22
23
24
25
26
27
28
29
30
import argparse
from distutils import dir_util

import graphspell.dawg as fsa
from graphspell.ibdawg import IBDAWG


def build (spfSrc, sLangCode, sLangName, sfDict, bJSON=False, sDicName="", sDescription="", sFilter="", cStemmingMethod="S", nCompressMethod=1):
    "transform a text lexicon as a binary indexable dictionary"
    oDAWG = fsa.DAWG(spfSrc, cStemmingMethod, sLangCode, sLangName, sDicName, sDescription, sFilter)
    dir_util.mkpath("graphspell/_dictionaries")
    oDAWG.writeInfo("graphspell/_dictionaries/" + sfDict + ".info.txt")
    oDAWG.writeBinary("graphspell/_dictionaries/" + sfDict + ".bdic", int(nCompressMethod))

    if bJSON:
        dir_util.mkpath("graphspell-js/_dictionaries")

        oDic = IBDAWG(sfDict + ".bdic")
        oDic.writeAsJSObject("graphspell-js/_dictionaries/" + sfDict + ".json", bBinaryDictAsHexString=True)


def main ():
    "parse args from CLI"
    xParser = argparse.ArgumentParser()
    xParser.add_argument("src_lexicon", type=str, help="path and file name of the source lexicon")
    xParser.add_argument("lang_code", type=str, help="language code")







|



|
|
>
|

>
|
|







7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import argparse
from distutils import dir_util

import graphspell.dawg as fsa
from graphspell.ibdawg import IBDAWG


def build (spfSrc, sLangCode, sLangName, sfDict, bJavaScript=False, sDicName="", sDescription="", sFilter="", cStemmingMethod="S", nCompressMethod=1):
    "transform a text lexicon as a binary indexable dictionary"
    oDAWG = fsa.DAWG(spfSrc, cStemmingMethod, sLangCode, sLangName, sDicName, sDescription, sFilter)
    dir_util.mkpath("graphspell/_dictionaries")
    #oDAWG.writeInfo("graphspell/_dictionaries/" + sfDict + ".info.txt")
    #oDAWG.writeBinary("graphspell/_dictionaries/" + sfDict + ".bdic", int(nCompressMethod))
    oDAWG.writeAsJSObject("graphspell/_dictionaries/" + sfDict + ".json")
    if bJavaScript:
        dir_util.mkpath("graphspell-js/_dictionaries")
        oDAWG.writeAsJSObject("graphspell-js/_dictionaries/" + sfDict + ".json")
        #oDic = IBDAWG(sfDict + ".bdic")
        #oDic.writeAsJSObject("graphspell-js/_dictionaries/" + sfDict + ".json", bBinaryDictAsHexString=True)


def main ():
    "parse args from CLI"
    xParser = argparse.ArgumentParser()
    xParser.add_argument("src_lexicon", type=str, help="path and file name of the source lexicon")
    xParser.add_argument("lang_code", type=str, help="language code")

Modified make.py from [f59af684eb] to [a76be310e9].

313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
    dVars["dic_personal_filename_js"] = ""
    lDict = [ ("main", s)  for s in dVars['dic_filenames'].split(",") ]
    if bCommunityDict:
        lDict.append(("community", dVars['dic_community_filename']))
    if bPersonalDict:
        lDict.append(("personal", dVars['dic_personal_filename']))
    for sType, sFileName in lDict:
        spfPyDic = f"graphspell/_dictionaries/{sFileName}.bdic"
        spfJSDic = f"graphspell-js/_dictionaries/{sFileName}.json"
        if not os.path.isfile(spfPyDic) or (bJavaScript and not os.path.isfile(spfJSDic)):
            buildDictionary(dVars, sType, bJavaScript)
        print("  +", spfPyDic)
        file_util.copy_file(spfPyDic, "grammalecte/graphspell/_dictionaries")
        dVars['dic_'+sType+'_filename_py'] = sFileName + '.bdic'
        if bJavaScript:
            print("  +", spfJSDic)
            file_util.copy_file(spfJSDic, "grammalecte-js/graphspell/_dictionaries")
            dVars['dic_'+sType+'_filename_js'] = sFileName + '.json'
    dVars['dic_main_filename_py'] = dVars['dic_default_filename_py'] + ".bdic"
    dVars['dic_main_filename_js'] = dVars['dic_default_filename_js'] + ".json"


def buildDictionary (dVars, sType, bJavaScript=False):
    "build binary dictionary for Graphspell from lexicons"
    if sType == "main":
        spfLexSrc = dVars['lexicon_src']







|





|




|







313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
    dVars["dic_personal_filename_js"] = ""
    lDict = [ ("main", s)  for s in dVars['dic_filenames'].split(",") ]
    if bCommunityDict:
        lDict.append(("community", dVars['dic_community_filename']))
    if bPersonalDict:
        lDict.append(("personal", dVars['dic_personal_filename']))
    for sType, sFileName in lDict:
        spfPyDic = f"graphspell/_dictionaries/{sFileName}.json"
        spfJSDic = f"graphspell-js/_dictionaries/{sFileName}.json"
        if not os.path.isfile(spfPyDic) or (bJavaScript and not os.path.isfile(spfJSDic)):
            buildDictionary(dVars, sType, bJavaScript)
        print("  +", spfPyDic)
        file_util.copy_file(spfPyDic, "grammalecte/graphspell/_dictionaries")
        dVars['dic_'+sType+'_filename_py'] = sFileName + '.json'
        if bJavaScript:
            print("  +", spfJSDic)
            file_util.copy_file(spfJSDic, "grammalecte-js/graphspell/_dictionaries")
            dVars['dic_'+sType+'_filename_js'] = sFileName + '.json'
    dVars['dic_main_filename_py'] = dVars['dic_default_filename_py'] + ".json"
    dVars['dic_main_filename_js'] = dVars['dic_default_filename_js'] + ".json"


def buildDictionary (dVars, sType, bJavaScript=False):
    "build binary dictionary for Graphspell from lexicons"
    if sType == "main":
        spfLexSrc = dVars['lexicon_src']

Modified reader.py from [66f5eb17ae] to [e2706fc6a2].

3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
..
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36

import os
import sys
import re

import graphspell.ibdawg as ibdawg

oDict = ibdawg.IBDAWG("French.bdic")


def readFile (spf):
    if os.path.isfile(spf):
        with open(spf, "r", encoding="utf-8") as hSrc:
            for sLine in hSrc:
                yield sLine
................................................................................

def listUnknownWords (spf):
    with open(spf+".res.txt", "w", encoding="utf-8") as hDst:
        for sLine in readFile(spfSrc):
            sLine = sLine.strip()
            if sLine:
                for sWord in sLine.split():
                    if not oDict.isValid(sWord): 
                        hDst.write(sWord+"\n")

# --------------------------------------------------------------------------------------------------

def createLexStatFile (spf, dStat):
    dWord = {}
    for i, sLine in enumerate(readFile(spf)):







|







 







|







3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
..
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36

import os
import sys
import re

import graphspell.ibdawg as ibdawg

oDict = ibdawg.IBDAWG("fr-allvars.json")


def readFile (spf):
    if os.path.isfile(spf):
        with open(spf, "r", encoding="utf-8") as hSrc:
            for sLine in hSrc:
                yield sLine
................................................................................

def listUnknownWords (spf):
    with open(spf+".res.txt", "w", encoding="utf-8") as hDst:
        for sLine in readFile(spfSrc):
            sLine = sLine.strip()
            if sLine:
                for sWord in sLine.split():
                    if not oDict.isValid(sWord):
                        hDst.write(sWord+"\n")

# --------------------------------------------------------------------------------------------------

def createLexStatFile (spf, dStat):
    dWord = {}
    for i, sLine in enumerate(readFile(spf)):