Grammalecte  Check-in [ed53270796]

Overview
Comment:[core][cli][fx] grammar analysis update (keeping tokens from being deleted)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | cli | core | fx | salxg
Files: files | file ages | folders
SHA3-256: ed532707969ae823b44e3e90389efe77de32479c411fb32538b53be4da872f1c
User & Date: olr on 2020-08-26 12:21:03
Other Links: branch diff | manifest | tags
Context
2020-08-28
11:00
[graphspell] lexicographer: update tags check-in: df1367e6e2 user: olr tags: graphspell, salxg
2020-08-26
12:21
[core][cli][fx] grammar analysis update (keeping tokens from being deleted) check-in: ed53270796 user: olr tags: cli, core, fx, salxg
12:19
[graphspell][js] useless return check-in: be32ffb142 user: olr tags: graphspell, salxg
Changes

Modified gc_core/js/lang_core/gc_engine.js from [c1f3aa973c] to [bd80498a18].

218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
...
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990

991
992
993
994
995
996
997
998
999
....
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
                for (let dToken of this.lTokens) {
                    if (dToken["sType"] != "INFO") {
                        this.dTokenPos.set(dToken["nStart"], dToken);
                    }
                }
                if (bFullInfo) {
                    this.lTokens0 = Array.from(this.lTokens);
                    // the list of tokens is duplicated, to keep all tokens from being deleted when analysis
                }
                this.parseText(this.sSentence, this.sSentence0, false, iStart, sCountry, dOpt, bShowRuleId, bDebug, bContext);
                if (bFullInfo) {
                    for (let oToken of this.lTokens0) {
                        if (oToken["sType"] == "WORD") {
                            oToken["bValidToken"] = gc_engine.oSpellChecker.isValidToken(oToken["sValue"]);
                        }
................................................................................

    rewriteFromTags (bDebug=false) {
        // rewrite the sentence, modify tokens, purge the token list
        if (bDebug) {
            console.log("REWRITE");
        }
        let lNewToken = [];
        let lNewTokens0 = [];
        let nMergeUntil = 0;
        let oMergingToken = null;
        for (let [iToken, oToken] of this.lTokens.entries()) {
            let bKeepToken = true;
            let bKeepToken0 = true;
            if (oToken["sType"] != "INFO") {
                if (nMergeUntil && iToken <= nMergeUntil) {
                    oMergingToken["sValue"] += " ".repeat(oToken["nStart"] - oMergingToken["nEnd"]) + oToken["sValue"];
                    oMergingToken["nEnd"] = oToken["nEnd"];
                    if (bDebug) {
                        console.log("  MERGED TOKEN: " + oMergingToken["sValue"]);
                    }

                    bKeepToken = false;
                    bKeepToken0 = false;
                }
                if (oToken.hasOwnProperty("nMergeUntil")) {
                    if (iToken > nMergeUntil) { // this token is not already merged with a previous token
                        oMergingToken = oToken;
                    }
                    if (oToken["nMergeUntil"] > nMergeUntil) {
                        nMergeUntil = oToken["nMergeUntil"];
................................................................................
                    this.dTokenPos.delete(oToken["nStart"]);
                }
                catch (e) {
                    console.log(this.asString());
                    console.log(oToken);
                }
            }
            if (this.lTokens0 !== null && bKeepToken0) {
                lNewTokens0.push(oToken);
            }
        }
        if (bDebug) {
            console.log("  TEXT REWRITED: " + this.sSentence);
        }
        this.lTokens.length = 0;
        this.lTokens = lNewToken;
        if (this.lTokens0 !== null) {
            this.lTokens0.length = 0;
            this.lTokens0 = lNewTokens0;
        }
    }
};


if (typeof(exports) !== 'undefined') {
    exports.lang = gc_engine.lang;
    exports.locales = gc_engine.locales;







|







 







<




<







>

<







 







<
<
<






<
<
<
<







218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
...
971
972
973
974
975
976
977

978
979
980
981

982
983
984
985
986
987
988
989
990

991
992
993
994
995
996
997
....
1027
1028
1029
1030
1031
1032
1033



1034
1035
1036
1037
1038
1039




1040
1041
1042
1043
1044
1045
1046
                for (let dToken of this.lTokens) {
                    if (dToken["sType"] != "INFO") {
                        this.dTokenPos.set(dToken["nStart"], dToken);
                    }
                }
                if (bFullInfo) {
                    this.lTokens0 = Array.from(this.lTokens);
                    // the list of tokens is duplicated, to keep tokens from being deleted when analysis
                }
                this.parseText(this.sSentence, this.sSentence0, false, iStart, sCountry, dOpt, bShowRuleId, bDebug, bContext);
                if (bFullInfo) {
                    for (let oToken of this.lTokens0) {
                        if (oToken["sType"] == "WORD") {
                            oToken["bValidToken"] = gc_engine.oSpellChecker.isValidToken(oToken["sValue"]);
                        }
................................................................................

    rewriteFromTags (bDebug=false) {
        // rewrite the sentence, modify tokens, purge the token list
        if (bDebug) {
            console.log("REWRITE");
        }
        let lNewToken = [];

        let nMergeUntil = 0;
        let oMergingToken = null;
        for (let [iToken, oToken] of this.lTokens.entries()) {
            let bKeepToken = true;

            if (oToken["sType"] != "INFO") {
                if (nMergeUntil && iToken <= nMergeUntil) {
                    oMergingToken["sValue"] += " ".repeat(oToken["nStart"] - oMergingToken["nEnd"]) + oToken["sValue"];
                    oMergingToken["nEnd"] = oToken["nEnd"];
                    if (bDebug) {
                        console.log("  MERGED TOKEN: " + oMergingToken["sValue"]);
                    }
                    oToken["bMerged"] = true;
                    bKeepToken = false;

                }
                if (oToken.hasOwnProperty("nMergeUntil")) {
                    if (iToken > nMergeUntil) { // this token is not already merged with a previous token
                        oMergingToken = oToken;
                    }
                    if (oToken["nMergeUntil"] > nMergeUntil) {
                        nMergeUntil = oToken["nMergeUntil"];
................................................................................
                    this.dTokenPos.delete(oToken["nStart"]);
                }
                catch (e) {
                    console.log(this.asString());
                    console.log(oToken);
                }
            }



        }
        if (bDebug) {
            console.log("  TEXT REWRITED: " + this.sSentence);
        }
        this.lTokens.length = 0;
        this.lTokens = lNewToken;




    }
};


if (typeof(exports) !== 'undefined') {
    exports.lang = gc_engine.lang;
    exports.locales = gc_engine.locales;

Modified gc_core/py/lang_core/gc_engine.py from [95514cb21c] to [a0179d3d20].

280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
...
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852

853
854
855
856
857
858
859
860
861
...
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
                try:
                    self.sSentence = sText[iStart:iEnd]
                    self.sSentence0 = self.sText0[iStart:iEnd]
                    self.nOffsetWithinParagraph = iStart
                    self.lTokens = list(_oTokenizer.genTokens(self.sSentence, True))
                    self.dTokenPos = { dToken["nStart"]: dToken  for dToken in self.lTokens  if dToken["sType"] != "INFO" }
                    if bFullInfo:
                        self.lTokens0 = list(self.lTokens)  # the list of tokens is duplicated, to keep all tokens from being deleted when analysis
                    self.parseText(self.sSentence, self.sSentence0, False, iStart, sCountry, dOpt, bShowRuleId, bDebug, bContext)
                    if bFullInfo:
                        for dToken in self.lTokens0:
                            if dToken["sType"] == "WORD":
                                dToken["bValidToken"] = _oSpellChecker.isValidToken(dToken["sValue"])
                            if "lMorph" not in dToken:
                                dToken["lMorph"] = _oSpellChecker.getMorph(dToken["sValue"])
................................................................................
            echo("REWRITE")
        lNewTokens = []
        lNewTokens0 = []
        nMergeUntil = 0
        dTokenMerger = {}
        for iToken, dToken in enumerate(self.lTokens):
            bKeepToken = True
            bKeepToken0 = True
            if dToken["sType"] != "INFO":
                if nMergeUntil and iToken <= nMergeUntil:
                    # token to merge
                    dTokenMerger["sValue"] += " " * (dToken["nStart"] - dTokenMerger["nEnd"]) + dToken["sValue"]
                    dTokenMerger["nEnd"] = dToken["nEnd"]
                    if bDebug:
                        echo("  MERGED TOKEN: " + dTokenMerger["sValue"])

                    bKeepToken = False
                    bKeepToken0 = False
                if "nMergeUntil" in dToken:
                    # first token to be merge with
                    if iToken > nMergeUntil: # this token is not to be merged with a previous token
                        dTokenMerger = dToken
                    if dToken["nMergeUntil"] > nMergeUntil:
                        nMergeUntil = dToken["nMergeUntil"]
                    del dToken["nMergeUntil"]
................................................................................
                    del dToken["sNewValue"]
            else:
                try:
                    del self.dTokenPos[dToken["nStart"]]
                except KeyError:
                    echo(self)
                    echo(dToken)
            if self.lTokens0 is not None and bKeepToken0:
                lNewTokens0.append(dToken)
        if bDebug:
            echo("  TEXT REWRITED: " + self.sSentence)
        self.lTokens.clear()
        self.lTokens = lNewTokens
        if self.lTokens0 is not None:
            self.lTokens0.clear()
            self.lTokens0 = lNewTokens0







|







 







<







>

<







 







<
<




<
<
<
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
...
838
839
840
841
842
843
844

845
846
847
848
849
850
851
852
853

854
855
856
857
858
859
860
...
879
880
881
882
883
884
885


886
887
888
889



                try:
                    self.sSentence = sText[iStart:iEnd]
                    self.sSentence0 = self.sText0[iStart:iEnd]
                    self.nOffsetWithinParagraph = iStart
                    self.lTokens = list(_oTokenizer.genTokens(self.sSentence, True))
                    self.dTokenPos = { dToken["nStart"]: dToken  for dToken in self.lTokens  if dToken["sType"] != "INFO" }
                    if bFullInfo:
                        self.lTokens0 = list(self.lTokens)  # the list of tokens is duplicated, to keep tokens from being deleted when analysis
                    self.parseText(self.sSentence, self.sSentence0, False, iStart, sCountry, dOpt, bShowRuleId, bDebug, bContext)
                    if bFullInfo:
                        for dToken in self.lTokens0:
                            if dToken["sType"] == "WORD":
                                dToken["bValidToken"] = _oSpellChecker.isValidToken(dToken["sValue"])
                            if "lMorph" not in dToken:
                                dToken["lMorph"] = _oSpellChecker.getMorph(dToken["sValue"])
................................................................................
            echo("REWRITE")
        lNewTokens = []
        lNewTokens0 = []
        nMergeUntil = 0
        dTokenMerger = {}
        for iToken, dToken in enumerate(self.lTokens):
            bKeepToken = True

            if dToken["sType"] != "INFO":
                if nMergeUntil and iToken <= nMergeUntil:
                    # token to merge
                    dTokenMerger["sValue"] += " " * (dToken["nStart"] - dTokenMerger["nEnd"]) + dToken["sValue"]
                    dTokenMerger["nEnd"] = dToken["nEnd"]
                    if bDebug:
                        echo("  MERGED TOKEN: " + dTokenMerger["sValue"])
                    dToken["bMerged"] = True
                    bKeepToken = False

                if "nMergeUntil" in dToken:
                    # first token to be merge with
                    if iToken > nMergeUntil: # this token is not to be merged with a previous token
                        dTokenMerger = dToken
                    if dToken["nMergeUntil"] > nMergeUntil:
                        nMergeUntil = dToken["nMergeUntil"]
                    del dToken["nMergeUntil"]
................................................................................
                    del dToken["sNewValue"]
            else:
                try:
                    del self.dTokenPos[dToken["nStart"]]
                except KeyError:
                    echo(self)
                    echo(dToken)


        if bDebug:
            echo("  TEXT REWRITED: " + self.sSentence)
        self.lTokens.clear()
        self.lTokens = lNewTokens



Modified gc_lang/fr/webext/content_scripts/panel_gc.js from [c04a1ec88f] to [b1e5b16e34].

578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
                this.nLxgCount += 1;
                if (oSentence.sSentence.trim() !== "") {
                    let xSentenceBlock = oGrammalecte.createNode("div", {className: "grammalecte_lxg_paragraph_sentence_block"});
                    xSentenceBlock.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_list_num", textContent: this.nLxgCount}));
                    xSentenceBlock.appendChild(oGrammalecte.createNode("p", {className: "grammalecte_lxg_paragraph_sentence", textContent: oSentence.sSentence}));
                    let xTokenList = oGrammalecte.createNode("div", {className: "grammalecte_lxg_list_of_tokens"});
                    for (let oToken of oSentence.lTokens) {
                        if (oToken["sType"] != "INFO") {
                            xTokenList.appendChild(this._createTokenBlock2(oToken));
                        }
                    }
                    xSentenceBlock.appendChild(xTokenList);
                    this.xLxgResultZone.appendChild(xSentenceBlock);
                }
            }







|







578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
                this.nLxgCount += 1;
                if (oSentence.sSentence.trim() !== "") {
                    let xSentenceBlock = oGrammalecte.createNode("div", {className: "grammalecte_lxg_paragraph_sentence_block"});
                    xSentenceBlock.appendChild(oGrammalecte.createNode("div", {className: "grammalecte_lxg_list_num", textContent: this.nLxgCount}));
                    xSentenceBlock.appendChild(oGrammalecte.createNode("p", {className: "grammalecte_lxg_paragraph_sentence", textContent: oSentence.sSentence}));
                    let xTokenList = oGrammalecte.createNode("div", {className: "grammalecte_lxg_list_of_tokens"});
                    for (let oToken of oSentence.lTokens) {
                        if (oToken["sType"] != "INFO" && !oToken.hasOwnProperty("bMerged")) {
                            xTokenList.appendChild(this._createTokenBlock2(oToken));
                        }
                    }
                    xSentenceBlock.appendChild(xTokenList);
                    this.xLxgResultZone.appendChild(xSentenceBlock);
                }
            }

Modified grammalecte-cli.py from [906ad28942] to [65f33ab4b5].

342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
                    if xArgs.textformatter:
                        sParagraph = oTextFormatter.formatText(sParagraph)
                    lParagraphErrors, lSentences = oGrammarChecker.gce.parse(sParagraph, bDebug=xArgs.debug, bFullInfo=True)
                    #echo(txt.getReadableErrors(lParagraphErrors, xArgs.width))
                    for dSentence in lSentences:
                        echo("{nStart}:{nEnd}  <{sSentence}>".format(**dSentence))
                        for dToken in dSentence["lTokens"]:
                            if dToken["sType"] == "INFO":
                                continue
                            echo("  {0[nStart]:>3}:{0[nEnd]:<3} {1} {0[sType]:<14} {2} {0[sValue]:<16} {3}".format(dToken, \
                                                                                                        "×" if dToken.get("bToRemove", False) else " ",
                                                                                                        "!" if dToken["sType"] == "WORD" and not dToken.get("bValidToken", False) else " ",
                                                                                                        " ".join(dToken.get("aTags", "")) ) )
                            if "lMorph" in dToken:
                                for sMorph, sLabel in zip(dToken["lMorph"], dToken["aLabels"]):







|







342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
                    if xArgs.textformatter:
                        sParagraph = oTextFormatter.formatText(sParagraph)
                    lParagraphErrors, lSentences = oGrammarChecker.gce.parse(sParagraph, bDebug=xArgs.debug, bFullInfo=True)
                    #echo(txt.getReadableErrors(lParagraphErrors, xArgs.width))
                    for dSentence in lSentences:
                        echo("{nStart}:{nEnd}  <{sSentence}>".format(**dSentence))
                        for dToken in dSentence["lTokens"]:
                            if dToken["sType"] == "INFO" or "bMerged" in dToken:
                                continue
                            echo("  {0[nStart]:>3}:{0[nEnd]:<3} {1} {0[sType]:<14} {2} {0[sValue]:<16} {3}".format(dToken, \
                                                                                                        "×" if dToken.get("bToRemove", False) else " ",
                                                                                                        "!" if dToken["sType"] == "WORD" and not dToken.get("bValidToken", False) else " ",
                                                                                                        " ".join(dToken.get("aTags", "")) ) )
                            if "lMorph" in dToken:
                                for sMorph, sLabel in zip(dToken["lMorph"], dToken["aLabels"]):