Grammalecte  Check-in [7616aa7ef9]

Overview
Comment: [graphspell] spellchecker: add parseParagraph()
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | graphspell
Files: files | file ages | folders
SHA3-256: 7616aa7ef9d3cc203b118f77e0749b0be606a98df6bea82f7a2f7628405e43d4
User & Date: olr on 2018-02-20 08:40:03
Other Links: manifest | tags
Context
2018-02-20
12:06
[fr][tests] Update: Le Horla check-in: bf58e39b3f user: olr tags: fr, trunk
08:40
[graphspell] spellchecker: add parseParagraph() check-in: 7616aa7ef9 user: olr tags: graphspell, trunk
2018-02-19
18:08
[fr] new performance test (better when the processor isn’t converting a video!) check-in: b34690f0d8 user: olr tags: fr, trunk
Changes

Modified gc_lang/fr/webext/gce_worker.js from [c20f81d8f3] to [efd11a103b].

   200    200   }
   201    201   
   202    202   function parseAndSpellcheck (sText, sCountry, bDebug, bContext, dInfo={}) {
   203    203       let i = 0;
   204    204       sText = sText.replace(/­/g, "").normalize("NFC");
   205    205       for (let sParagraph of text.getParagraph(sText)) {
   206    206           let aGrammErr = gc_engine.parse(sParagraph, sCountry, bDebug, bContext);
   207         -        let aSpellErr = oTokenizer.getSpellingErrors(sParagraph, oSpellChecker);
          207  +        let aSpellErr = oSpellChecker.parseParagraph(sParagraph);
   208    208           postMessage(createResponse("parseAndSpellcheck", {sParagraph: sParagraph, iParaNum: i, aGrammErr: aGrammErr, aSpellErr: aSpellErr}, dInfo, false));
   209    209           i += 1;
   210    210       }
   211    211       postMessage(createResponse("parseAndSpellcheck", null, dInfo, true));
   212    212   }
   213    213   
   214    214   function parseAndSpellcheck1 (sParagraph, sCountry, bDebug, bContext, dInfo={}) {
   215    215       sParagraph = sParagraph.replace(/­/g, "").normalize("NFC");
   216    216       let aGrammErr = gc_engine.parse(sParagraph, sCountry, bDebug, bContext);
   217         -    let aSpellErr = oTokenizer.getSpellingErrors(sParagraph, oSpellChecker);
          217  +    let aSpellErr = oSpellChecker.parseParagraph(sParagraph);
   218    218       postMessage(createResponse("parseAndSpellcheck1", {sParagraph: sParagraph, aGrammErr: aGrammErr, aSpellErr: aSpellErr}, dInfo, true));
   219    219   }
   220    220   
   221    221   function getOptions (dInfo={}) {
   222    222       postMessage(createResponse("getOptions", gc_engine.getOptions(), dInfo, true));
   223    223   }
   224    224   

Modified graphspell-js/spellchecker.js from [e878cd2181] to [7b8a526c88].

     9      9   
    10     10   
    11     11   "use strict";
    12     12   
    13     13   
    14     14   if (typeof(require) !== 'undefined') {
    15     15       var ibdawg = require("resource://grammalecte/graphspell/ibdawg.js");
           16  +    var tokenizer = require("resource://grammalecte/graphspell/tokenizer.js");
    16     17   }
    17     18   
    18     19   
    19     20   ${map}
    20     21   
    21     22   
    22     23   const dDefaultDictionaries = new Map([
................................................................................
    32     33           this.sLangCode = sLangCode;
    33     34           if (!mainDic) {
    34     35               mainDic = dDefaultDictionaries.gl_get(sLangCode, "");
    35     36           }
    36     37           this.oMainDic = this._loadDictionary(mainDic, sPath, true);
    37     38           this.oExtendedDic = this._loadDictionary(extentedDic, sPath);
    38     39           this.oPersonalDic = this._loadDictionary(personalDic, sPath);
           40  +        this.oTokenizer = null;
    39     41       }
    40     42   
    41     43       _loadDictionary (dictionary, sPath, bNecessary=false) {
    42     44           // returns an IBDAWG object
    43     45           if (!dictionary) {
    44     46               return null;
    45     47           }
    46     48           try {
    47         -            if (typeof(require) !== 'undefined') {
           49  +            if (typeof(ibdawg) !== 'undefined') {
    48     50                   return new ibdawg.IBDAWG(dictionary);  // dictionary can be a filename or a JSON object
    49     51               } else {
    50     52                   return new IBDAWG(dictionary, sPath);  // dictionary can be a filename or a JSON object
    51     53               }
    52     54           }
    53     55           catch (e) {
    54     56               let sfDictionary = (typeof(dictionary) == "string") ? dictionary : dictionary.sLangName + "/" + dictionary.sFileName;
................................................................................
    56     58                   throw "Error: <" + sfDictionary + "> not loaded. " + e.message;
    57     59               }
    58     60               console.log("Error: <" + sfDictionary + "> not loaded.")
    59     61               console.log(e.message);
    60     62               return null;
    61     63           }
    62     64       }
           65  +
           66  +    loadTokenizer () {
           67  +        if (typeof(tokenizer) !== 'undefined') {
           68  +            this.oTokenizer = new tokenizer.Tokenizer(this.sLangCode);
           69  +        } else {
           70  +            this.oTokenizer = new Tokenizer(this.sLangCode);
           71  +        }
           72  +    }
    63     73   
    64     74       setMainDictionary (dictionary) {
    65     75           // returns true if the dictionary is loaded
    66     76           this.oMainDic = this._loadDictionary(dictionary);
    67     77           return Boolean(this.oMainDic);
    68     78       }
    69     79   
................................................................................
    74     84       }
    75     85   
    76     86       setPersonalDictionary (dictionary) {
    77     87           // returns true if the dictionary is loaded
    78     88           this.oPersonalDic = this._loadDictionary(dictionary);
    79     89           return Boolean(this.oPersonalDic);
    80     90       }
           91  +
           92  +    // parse text functions
           93  +
           94  +    parseParagraph (sText) {
           95  +        if (!this.oTokenizer) {
           96  +            this.loadTokenizer();
           97  +        }
           98  +        let aSpellErr = [];
           99  +        for (let oToken of this.oTokenizer.genTokens(sText)) {
          100  +            if (oToken.sType === 'WORD' && !this.isValidToken(oToken.sValue)) {
          101  +                aSpellErr.push(oToken);
          102  +            }
          103  +        }
          104  +        return aSpellErr;
          105  +    }
    81    106   
    82    107       // IBDAWG functions
    83    108   
    84    109       isValidToken (sToken) {
    85    110           // checks if sToken is valid (if there are hyphens in sToken, sToken is split and each part is checked)
    86    111           if (this.oMainDic.isValidToken(sToken)) {
    87    112               return true;

Modified graphspell-js/tokenizer.js from [c3f0ee8c90] to [bdd895b918].

    83     83                       helpers.logerror(e);
    84     84                   }
    85     85               }
    86     86               i += nCut;
    87     87               sText = sText.slice(nCut);
    88     88           }
    89     89       }
    90         -
    91         -    getSpellingErrors (sText, oSpellChecker) {
    92         -        let aSpellErr = [];
    93         -        for (let oToken of this.genTokens(sText)) {
    94         -            if (oToken.sType === 'WORD' && !oSpellChecker.isValidToken(oToken.sValue)) {
    95         -                aSpellErr.push(oToken);
    96         -            }
    97         -        }
    98         -        return aSpellErr;
    99         -    }
   100     90   }
   101     91   
   102     92   
   103     93   if (typeof(exports) !== 'undefined') {
   104     94       exports.Tokenizer = Tokenizer;
   105     95   }

Modified graphspell/spellchecker.py from [638f8d8cdf] to [b9fb2c7b70].

     7      7   # - the extended dictionary, added by an organization
     8      8   # - the personal dictionary, created by the user for their own convenience
     9      9   
    10     10   
    11     11   import traceback
    12     12   
    13     13   from . import ibdawg
           14  +from . import tokenizer
    14     15   
    15     16   
    16     17   dDefaultDictionaries = {
    17     18       "fr": "fr.bdic",
    18     19       "en": "en.bdic"
    19     20   }
    20     21   
................................................................................
    25     26           "returns True if the main dictionary is loaded"
    26     27           self.sLangCode = sLangCode
    27     28           if not sfMainDic:
    28     29               sfMainDic = dDefaultDictionaries.get(sLangCode, "")
    29     30           self.oMainDic = self._loadDictionary(sfMainDic, True)
    30     31           self.oExtendedDic = self._loadDictionary(sfExtendedDic)
    31     32           self.oPersonalDic = self._loadDictionary(sfPersonalDic)
           33  +        self.oTokenizer = None
    32     34   
    33     35       def _loadDictionary (self, sfDictionary, bNecessary=False):
    34     36           "returns an IBDAWG object"
    35     37           if not sfDictionary:
    36     38               return None
    37     39           try:
    38     40               return ibdawg.IBDAWG(sfDictionary)
................................................................................
    39     41           except Exception as e:
    40     42               if bNecessary:
    41     43                   raise Exception(str(e), "Error: <" + sfDictionary + "> not loaded.")
    42     44               print("Error: <" + sfDictionary + "> not loaded.")
    43     45               traceback.print_exc()
    44     46               return None
    45     47   
           48  +    def loadTokenizer (self):
           49  +        self.oTokenizer = tokenizer.Tokenizer(self.sLangCode)
           50  +
    46     51       def setMainDictionary (self, sfDictionary):
    47     52           "returns True if the dictionary is loaded"
    48     53           self.oMainDic = self._loadDictionary(sfDictionary)
    49     54           return bool(self.oMainDic)
    50     55               
    51     56       def setExtendedDictionary (self, sfDictionary):
    52     57           "returns True if the dictionary is loaded"
................................................................................
    54     59           return bool(self.oExtendedDic)
    55     60   
    56     61       def setPersonalDictionary (self, sfDictionary):
    57     62           "returns True if the dictionary is loaded"
    58     63           self.oPersonalDic = self._loadDictionary(sfDictionary)
    59     64           return bool(self.oPersonalDic)
    60     65   
           66  +    # parse text functions
           67  +
           68  +    def parseParagraph (self, sText, bSpellSugg=False):
           69  +        if not self.oTokenizer:
           70  +            self.loadTokenizer()
           71  +        aSpellErrs = []
           72  +        for dToken in self.oTokenizer.genTokens(sText):
           73  +            if dToken['sType'] == "WORD" and not self.isValidToken(dToken['sValue']):
           74  +                if bSpellSugg:
           75  +                    dToken['aSuggestions'] = []
           76  +                    for lSugg in self.suggest(dToken['sValue']):
           77  +                        dToken['aSuggestions'].extend(lSugg)
           78  +                aSpellErrs.append(dToken)
           79  +        return aSpellErrs
    61     80   
    62     81       # IBDAWG functions
    63     82   
    64     83       def isValidToken (self, sToken):
    65     84           "checks if sToken is valid (if there are hyphens in sToken, sToken is split and each part is checked)"
    66     85           if self.oMainDic.isValidToken(sToken):
    67     86               return True