Grammalecte  Check-in [c6acf0a935]

Overview
Comment:[core] fix intertwined bugs concerning switches between regex rules and graph rules (the transition is a mess)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | core | rg
Files: files | file ages | folders
SHA3-256: c6acf0a935480c9849275a9867929c7adba7ce2fcdfa9a403398d6fb545b92f0
User & Date: olr on 2018-06-29 15:32:17
Other Links: branch diff | manifest | tags
Context
2018-06-29
17:14
[fr] conversion: regex rules -> graph rules check-in: 2c2013067a user: olr tags: fr, rg
15:32
[core] fix intertwined bugs concerning switches between regex rules and graph rules (the transition is a mess) check-in: c6acf0a935 user: olr tags: core, rg
15:24
[fr] fix few bugs check-in: ad0bbbe88a user: olr tags: fr, rg
Changes

Modified gc_core/py/lang_core/gc_engine.py from [16b07dde1f] to [09d4d7dd58].

156
157
158
159
160
161
162
163
164
165
166
167
168
169

170
171
172
173
174
175
176
177
178
179
180


181
182
183
184
185
186
187
...
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
...
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
...
590
591
592
593
594
595
596
597
598
599
600








601
602
603
604
605
606
607
...
911
912
913
914
915
916
917

918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
...
948
949
950
951
952
953
954

955




956
957
958
959
960
961
962
    bParagraphChange = False
    bSentenceChange = False
    dTokenPos = oSentence.dTokenPos if oSentence else {}
    for sOption, lRuleGroup in _getRules(bParagraph):
        if sOption == "@@@@":
            # graph rules
            if not bParagraph and bSentenceChange:
                oSentence.update(s)
                bSentenceChange = False
            for sGraphName, sLineId in lRuleGroup:
                if bDebug:
                    print("\n>>>> GRAPH:", sGraphName, sLineId)
                bParagraphChange, s = oSentence.parse(dAllGraph[sGraphName], dPriority, sCountry, dOptions, bShowRuleId, bDebug, bContext)
                dErrs.update(oSentence.dError)

        elif not sOption or dOptions.get(sOption, False):
            # regex rules
            for zRegex, bUppercase, sLineId, sRuleId, nPriority, lActions in lRuleGroup:
                if sRuleId not in _aIgnoredRules:
                    for m in zRegex.finditer(s):
                        bCondMemo = None
                        for sFuncCond, cActionType, sWhat, *eAct in lActions:
                            # action in lActions: [ condition, action type, replacement/suggestion/action[, iGroup[, message, URL]] ]
                            try:
                                bCondMemo = not sFuncCond or globals()[sFuncCond](s, sx, m, dTokenPos, sCountry, bCondMemo)
                                if bCondMemo:


                                    if cActionType == "-":
                                        # grammar error
                                        nErrorStart = nOffset + m.start(eAct[0])
                                        if nErrorStart not in dErrs or nPriority > dPriority.get(nErrorStart, -1):
                                            dErrs[nErrorStart] = _createError(s, sx, sWhat, nOffset, m, eAct[0], sLineId, sRuleId, bUppercase, eAct[1], eAct[2], bShowRuleId, sOption, bContext)
                                            dPriority[nErrorStart] = nPriority
                                    elif cActionType == "~":
................................................................................
    return any(zPattern.search(s)  for s in lMorph)


def morphex (dTokenPos, tWord, sPattern, sNegPattern, bNoWord=False):
    "analyse a tuple (position, word), returns True if not sNegPattern in word morphologies and sPattern in word morphologies (disambiguation on)"
    if not tWord:
        return bNoWord

    lMorph = dTokenPos[tWord[0]]["lMorph"]  if tWord[0] in dTokenPos and "lMorph" in dTokenPos[tWord[0]]  else _oSpellChecker.getMorph(tWord[1])
    if not lMorph:
        return False
    if (tWord[1].startswith("noir")):
        print(tWord)
        print(dTokenPos)
        print(lMorph)
    # check negative condition
    zNegPattern = re.compile(sNegPattern)
    if any(zNegPattern.search(s)  for s in lMorph):
        return False
    # search sPattern
    zPattern = re.compile(sPattern)
    return any(zPattern.search(s)  for s in lMorph)
................................................................................
    "Text parser"

    def __init__ (self, sSentence, sSentence0, nOffset):
        self.sSentence = sSentence
        self.sSentence0 = sSentence0
        self.nOffsetWithinParagraph = nOffset
        self.lToken = list(_oTokenizer.genTokens(sSentence, True))
        self.dTokenPos = { dToken["nStart"]: dToken  for dToken in self.lToken }
        self.dTags = {}
        self.dError = {}

    def __str__ (self):
        s = "sentence: " + self.sSentence0 + "\n"
        s += "now:      " + self.sSentence  + "\n"
        for dToken in self.lToken:
................................................................................
            if "lMorph" in dToken:
                s += "\t" + str(dToken["lMorph"])
            s += "\n"
        for nPos, dToken in self.dTokenPos.items():
            s += f"{nPos}\t{dToken}\n"
        return s

    def update (self, sSentence):
        "update <sSentence> and retokenize"
        self.sSentence = sSentence
        self.lToken = list(_oTokenizer.genTokens(sSentence, True))









    def _getNextMatchingNodes (self, dToken, dGraph, dNode, bDebug=False):
        "generator: return nodes where <dToken> “values” match <dNode> arcs"
        # token value
        if dToken["sValue"] in dNode:
            if bDebug:
                print("  MATCH:", dToken["sValue"])
................................................................................
        if bDebug:
            print("REWRITE")
        lNewToken = []
        nMergeUntil = 0
        dTokenMerger = None
        for dToken in self.lToken:
            bKeepToken = True

            if "bImmune" in dToken:
                nErrorStart = self.nOffsetWithinParagraph + dToken["nStart"]
                if nErrorStart in self.dError:
                    if bDebug:
                        print("immunity -> error removed:", self.dError[nErrorStart])
                    del self.dError[nErrorStart]
            if nMergeUntil and dToken["i"] <= nMergeUntil:
                dTokenMerger["sValue"] += " " * (dToken["nStart"] - dTokenMerger["nEnd"]) + dToken["sValue"]
                dTokenMerger["nEnd"] = dToken["nEnd"]
                if bDebug:
                    print("  MERGED TOKEN:", dTokenMerger["sValue"])
                bKeepToken = False
            if "nMergeUntil" in dToken:
                if dToken["i"] > nMergeUntil: # this token is not already merged with a previous token
                    dTokenMerger = dToken
                if dToken["nMergeUntil"] > nMergeUntil:
                    nMergeUntil = dToken["nMergeUntil"]
                del dToken["nMergeUntil"]
            elif "bToRemove" in dToken:
                if bDebug:
                    print("  REMOVED:", dToken["sValue"])
                self.sSentence = self.sSentence[:dToken["nStart"]] + " " * (dToken["nEnd"] - dToken["nStart"]) + self.sSentence[dToken["nEnd"]:]
                bKeepToken = False
            #
            if bKeepToken:
                lNewToken.append(dToken)
                if "sNewValue" in dToken:
                    # rewrite token and sentence
                    if bDebug:
                        print(dToken["sValue"], "->", dToken["sNewValue"])
................................................................................
                    dToken["sRealValue"] = dToken["sValue"]
                    dToken["sValue"] = dToken["sNewValue"]
                    nDiffLen = len(dToken["sRealValue"]) - len(dToken["sNewValue"])
                    sNewRepl = (dToken["sNewValue"] + " " * nDiffLen)  if nDiffLen >= 0  else dToken["sNewValue"][:len(dToken["sRealValue"])]
                    self.sSentence = self.sSentence[:dToken["nStart"]] + sNewRepl + self.sSentence[dToken["nEnd"]:]
                    del dToken["sNewValue"]
            else:

                del self.dTokenPos[dToken["nStart"]]




        if bDebug:
            print("  REWRITED:", self.sSentence)
        self.lToken.clear()
        self.lToken = lNewToken










|






>











>
>







 







<



<
<
<
<







 







|







 







|


|
>
>
>
>
>
>
>
>







 







>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|







 







>
|
>
>
>
>







156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
...
410
411
412
413
414
415
416

417
418
419




420
421
422
423
424
425
426
...
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
...
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
...
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
...
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
    bParagraphChange = False
    bSentenceChange = False
    dTokenPos = oSentence.dTokenPos if oSentence else {}
    for sOption, lRuleGroup in _getRules(bParagraph):
        if sOption == "@@@@":
            # graph rules
            if not bParagraph and bSentenceChange:
                oSentence.update(s, bDebug)
                bSentenceChange = False
            for sGraphName, sLineId in lRuleGroup:
                if bDebug:
                    print("\n>>>> GRAPH:", sGraphName, sLineId)
                bParagraphChange, s = oSentence.parse(dAllGraph[sGraphName], dPriority, sCountry, dOptions, bShowRuleId, bDebug, bContext)
                dErrs.update(oSentence.dError)
                dTokenPos = oSentence.dTokenPos
        elif not sOption or dOptions.get(sOption, False):
            # regex rules
            for zRegex, bUppercase, sLineId, sRuleId, nPriority, lActions in lRuleGroup:
                if sRuleId not in _aIgnoredRules:
                    for m in zRegex.finditer(s):
                        bCondMemo = None
                        for sFuncCond, cActionType, sWhat, *eAct in lActions:
                            # action in lActions: [ condition, action type, replacement/suggestion/action[, iGroup[, message, URL]] ]
                            try:
                                bCondMemo = not sFuncCond or globals()[sFuncCond](s, sx, m, dTokenPos, sCountry, bCondMemo)
                                if bCondMemo:
                                    if bDebug:
                                        print("RULE:", sLineId)
                                    if cActionType == "-":
                                        # grammar error
                                        nErrorStart = nOffset + m.start(eAct[0])
                                        if nErrorStart not in dErrs or nPriority > dPriority.get(nErrorStart, -1):
                                            dErrs[nErrorStart] = _createError(s, sx, sWhat, nOffset, m, eAct[0], sLineId, sRuleId, bUppercase, eAct[1], eAct[2], bShowRuleId, sOption, bContext)
                                            dPriority[nErrorStart] = nPriority
                                    elif cActionType == "~":
................................................................................
    return any(zPattern.search(s)  for s in lMorph)


def morphex (dTokenPos, tWord, sPattern, sNegPattern, bNoWord=False):
    "analyse a tuple (position, word), returns True if not sNegPattern in word morphologies and sPattern in word morphologies (disambiguation on)"
    if not tWord:
        return bNoWord

    lMorph = dTokenPos[tWord[0]]["lMorph"]  if tWord[0] in dTokenPos and "lMorph" in dTokenPos[tWord[0]]  else _oSpellChecker.getMorph(tWord[1])
    if not lMorph:
        return False




    # check negative condition
    zNegPattern = re.compile(sNegPattern)
    if any(zNegPattern.search(s)  for s in lMorph):
        return False
    # search sPattern
    zPattern = re.compile(sPattern)
    return any(zPattern.search(s)  for s in lMorph)
................................................................................
    "Text parser"

    def __init__ (self, sSentence, sSentence0, nOffset):
        self.sSentence = sSentence
        self.sSentence0 = sSentence0
        self.nOffsetWithinParagraph = nOffset
        self.lToken = list(_oTokenizer.genTokens(sSentence, True))
        self.dTokenPos = { dToken["nStart"]: dToken  for dToken in self.lToken  if dToken["sType"] != "INFO" }
        self.dTags = {}
        self.dError = {}

    def __str__ (self):
        s = "sentence: " + self.sSentence0 + "\n"
        s += "now:      " + self.sSentence  + "\n"
        for dToken in self.lToken:
................................................................................
            if "lMorph" in dToken:
                s += "\t" + str(dToken["lMorph"])
            s += "\n"
        for nPos, dToken in self.dTokenPos.items():
            s += f"{nPos}\t{dToken}\n"
        return s

    def update (self, sSentence, bDebug=False):
        "update <sSentence> and retokenize"
        self.sSentence = sSentence
        lNewToken = list(_oTokenizer.genTokens(sSentence, True))
        for dToken in lNewToken:
            if "lMorph" in self.dTokenPos.get(dToken["nStart"], {}):
                dToken["lMorph"] = self.dTokenPos[dToken["nStart"]]["lMorph"]
        self.lToken = lNewToken
        self.dTokenPos = { dToken["nStart"]: dToken  for dToken in self.lToken  if dToken["sType"] != "INFO" }
        if bDebug:
            print("UPDATE:")
            print(self)

    def _getNextMatchingNodes (self, dToken, dGraph, dNode, bDebug=False):
        "generator: return nodes where <dToken> “values” match <dNode> arcs"
        # token value
        if dToken["sValue"] in dNode:
            if bDebug:
                print("  MATCH:", dToken["sValue"])
................................................................................
        if bDebug:
            print("REWRITE")
        lNewToken = []
        nMergeUntil = 0
        dTokenMerger = None
        for dToken in self.lToken:
            bKeepToken = True
            if dToken["sType"] != "INFO":
                if "bImmune" in dToken:
                    nErrorStart = self.nOffsetWithinParagraph + dToken["nStart"]
                    if nErrorStart in self.dError:
                        if bDebug:
                            print("immunity -> error removed:", self.dError[nErrorStart])
                        del self.dError[nErrorStart]
                if nMergeUntil and dToken["i"] <= nMergeUntil:
                    dTokenMerger["sValue"] += " " * (dToken["nStart"] - dTokenMerger["nEnd"]) + dToken["sValue"]
                    dTokenMerger["nEnd"] = dToken["nEnd"]
                    if bDebug:
                        print("  MERGED TOKEN:", dTokenMerger["sValue"])
                    bKeepToken = False
                if "nMergeUntil" in dToken:
                    if dToken["i"] > nMergeUntil: # this token is not already merged with a previous token
                        dTokenMerger = dToken
                    if dToken["nMergeUntil"] > nMergeUntil:
                        nMergeUntil = dToken["nMergeUntil"]
                    del dToken["nMergeUntil"]
                elif "bToRemove" in dToken:
                    if bDebug:
                        print("  REMOVED:", dToken["sValue"])
                    self.sSentence = self.sSentence[:dToken["nStart"]] + " " * (dToken["nEnd"] - dToken["nStart"]) + self.sSentence[dToken["nEnd"]:]
                    bKeepToken = False
            #
            if bKeepToken:
                lNewToken.append(dToken)
                if "sNewValue" in dToken:
                    # rewrite token and sentence
                    if bDebug:
                        print(dToken["sValue"], "->", dToken["sNewValue"])
................................................................................
                    dToken["sRealValue"] = dToken["sValue"]
                    dToken["sValue"] = dToken["sNewValue"]
                    nDiffLen = len(dToken["sRealValue"]) - len(dToken["sNewValue"])
                    sNewRepl = (dToken["sNewValue"] + " " * nDiffLen)  if nDiffLen >= 0  else dToken["sNewValue"][:len(dToken["sRealValue"])]
                    self.sSentence = self.sSentence[:dToken["nStart"]] + sNewRepl + self.sSentence[dToken["nEnd"]:]
                    del dToken["sNewValue"]
            else:
                try:
                    del self.dTokenPos[dToken["nStart"]]
                except:
                    print(self)
                    print(dToken)
                    exit()
        if bDebug:
            print("  REWRITED:", self.sSentence)
        self.lToken.clear()
        self.lToken = lNewToken