Grammalecte  Check-in [1184e8ba6d]

Overview
Comment: [build][core] regex rules now use tokens for disambiguation
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | core | build | rg
Files: files | file ages | folders
SHA3-256: 1184e8ba6d44299b56d10d9e778ca03ef1e154e05c43363c23109a761f9c4d99
User & Date: olr on 2018-06-13 09:26:32
Original Comment: [build][core] merge regex rules now use tokens for disambiguation
Other Links: branch diff | manifest | tags
Context
2018-06-13
17:10 — [fr] conversion regex rules -> graph rules (check-in: edf852434a, user: olr, tags: fr, rg)
09:26 — [build][core] regex rules now use tokens for disambiguation (check-in: 1184e8ba6d, user: olr, tags: build, core, rg) [this check-in]
07:57 — [core] gc engine: dictionary of tokens position for disambiguation (check-in: ff58bafc4d, user: olr, tags: core, rg)
Changes

Modified compile_rules.py from [856372255b] to [a5c1ea137d].

    26     26       s = re.sub(r"isRealStart *\(\)", 'before("^ *$")', s)
    27     27       s = re.sub(r"isStart0 *\(\)", 'before0("^ *$|, *$")', s)
    28     28       s = re.sub(r"isRealStart0 *\(\)", 'before0("^ *$")', s)
    29     29       s = re.sub(r"isEnd *\(\)", 'after("^ *$|^,")', s)
    30     30       s = re.sub(r"isRealEnd *\(\)", 'after("^ *$")', s)
    31     31       s = re.sub(r"isEnd0 *\(\)", 'after0("^ *$|^,")', s)
    32     32       s = re.sub(r"isRealEnd0 *\(\)", 'after0("^ *$")', s)
    33         -    s = re.sub(r"(select|exclude)[(][\\](\d+)", '\\1(dDA, m.start(\\2), m.group(\\2)', s)
    34         -    s = re.sub(r"define[(][\\](\d+)", 'define(dDA, m.start(\\1)', s)
           33  +    s = re.sub(r"(select|exclude)[(][\\](\d+)", '\\1(dTokenPos, m.start(\\2), m.group(\\2)', s)
           34  +    s = re.sub(r"define[(][\\](\d+)", 'define(dTokenPos, m.start(\\1)', s)
    35     35       s = re.sub(r"(morph|morphex|displayInfo)[(][\\](\d+)", '\\1((m.start(\\2), m.group(\\2))', s)
    36         -    s = re.sub(r"(morph|morphex|displayInfo)[(]", '\\1(dDA, ', s)
           36  +    s = re.sub(r"(morph|morphex|displayInfo)[(]", '\\1(dTokenPos, ', s)
    37     37       s = re.sub(r"(sugg\w+|switch\w+)\(@", '\\1(m.group(i[4])', s)
    38     38       s = re.sub(r"word\(\s*1\b", 'nextword1(s, m.end()', s)                                  # word(1)
    39     39       s = re.sub(r"word\(\s*-1\b", 'prevword1(s, m.start()', s)                               # word(-1)
    40     40       s = re.sub(r"word\(\s*(\d)", 'nextword(s, m.end(), \\1', s)                             # word(n)
    41     41       s = re.sub(r"word\(\s*-(\d)", 'prevword(s, m.start(), \\1', s)                          # word(-n)
    42     42       s = re.sub(r"before\(\s*", 'look(s[:m.start()], ', s)                                   # before(s)
    43     43       s = re.sub(r"after\(\s*", 'look(s[m.end():], ', s)                                      # after(s)
    44     44       s = re.sub(r"textarea\(\s*", 'look(s, ', s)                                             # textarea(s)
    45         -    s = re.sub(r"before_chk1\(\s*", 'look_chk1(dDA, s[:m.start()], 0, ', s)                 # before_chk1(s)
    46         -    s = re.sub(r"after_chk1\(\s*", 'look_chk1(dDA, s[m.end():], m.end(), ', s)              # after_chk1(s)
    47         -    s = re.sub(r"textarea_chk1\(\s*", 'look_chk1(dDA, s, 0, ', s)                           # textarea_chk1(s)
           45  +    s = re.sub(r"before_chk1\(\s*", 'look_chk1(dTokenPos, s[:m.start()], 0, ', s)           # before_chk1(s)
           46  +    s = re.sub(r"after_chk1\(\s*", 'look_chk1(dTokenPos, s[m.end():], m.end(), ', s)        # after_chk1(s)
           47  +    s = re.sub(r"textarea_chk1\(\s*", 'look_chk1(dTokenPos, s, 0, ', s)                     # textarea_chk1(s)
    48     48       s = re.sub(r"/0", 'sx[m.start():m.end()]', s)                                           # /0
    49     49       s = re.sub(r"before0\(\s*", 'look(sx[:m.start()], ', s)                                 # before0(s)
    50     50       s = re.sub(r"after0\(\s*", 'look(sx[m.end():], ', s)                                    # after0(s)
    51     51       s = re.sub(r"textarea0\(\s*", 'look(sx, ', s)                                           # textarea0(s)
    52         -    s = re.sub(r"before0_chk1\(\s*", 'look_chk1(dDA, sx[:m.start()], 0, ', s)               # before0_chk1(s)
    53         -    s = re.sub(r"after0_chk1\(\s*", 'look_chk1(dDA, sx[m.end():], m.end(), ', s)            # after0_chk1(s)
    54         -    s = re.sub(r"textarea0_chk1\(\s*", 'look_chk1(dDA, sx, 0, ', s)                         # textarea0_chk1(s)
    55         -    s = re.sub(r"isEndOfNG\(\s*\)", 'isEndOfNG(dDA, s[m.end():], m.end())', s)              # isEndOfNG(s)
    56         -    s = re.sub(r"isNextNotCOD\(\s*\)", 'isNextNotCOD(dDA, s[m.end():], m.end())', s)        # isNextNotCOD(s)
    57         -    s = re.sub(r"isNextVerb\(\s*\)", 'isNextVerb(dDA, s[m.end():], m.end())', s)            # isNextVerb(s)
           52  +    s = re.sub(r"before0_chk1\(\s*", 'look_chk1(dTokenPos, sx[:m.start()], 0, ', s)         # before0_chk1(s)
           53  +    s = re.sub(r"after0_chk1\(\s*", 'look_chk1(dTokenPos, sx[m.end():], m.end(), ', s)      # after0_chk1(s)
           54  +    s = re.sub(r"textarea0_chk1\(\s*", 'look_chk1(dTokenPos, sx, 0, ', s)                   # textarea0_chk1(s)
           55  +    s = re.sub(r"isEndOfNG\(\s*\)", 'isEndOfNG(dTokenPos, s[m.end():], m.end())', s)        # isEndOfNG(s)
           56  +    s = re.sub(r"isNextNotCOD\(\s*\)", 'isNextNotCOD(dTokenPos, s[m.end():], m.end())', s)  # isNextNotCOD(s)
           57  +    s = re.sub(r"isNextVerb\(\s*\)", 'isNextVerb(dTokenPos, s[m.end():], m.end())', s)      # isNextVerb(s)
    58     58       s = re.sub(r"\bspell *[(]", '_oSpellChecker.isValid(', s)
    59     59       s = re.sub(r"[\\](\d+)", 'm.group(\\1)', s)
    60     60       return s
    61     61   
    62     62   
    63     63   def uppercase (s, sLang):
    64     64       "(flag i is not enough): converts regex to uppercase regex: 'foo' becomes '[Ff][Oo][Oo]', but 'Bar' becomes 'B[Aa][Rr]'."
................................................................................
   547    547       # creating file with all functions callable by rules
   548    548       print("  creating callables...")
   549    549       sPyCallables = "# generated code, do not edit\n"
   550    550       sJSCallables = "// generated code, do not edit\nconst oEvalFunc = {\n"
   551    551       for sFuncName, sReturn in lFUNCTIONS:
   552    552           cType = sFuncName[0:1]
   553    553           if cType == "c": # condition
   554         -            sParams = "s, sx, m, dDA, sCountry, bCondMemo"
          554  +            sParams = "s, sx, m, dTokenPos, sCountry, bCondMemo"
   555    555           elif cType == "m": # message
   556    556               sParams = "s, m"
   557    557           elif cType == "s": # suggestion
   558    558               sParams = "s, m"
   559    559           elif cType == "p": # preprocessor
   560    560               sParams = "s, m"
   561    561           elif cType == "d": # disambiguator
   562         -            sParams = "s, m, dDA"
          562  +            sParams = "s, m, dTokenPos"
   563    563           else:
   564    564               print("# Unknown function type in [" + sFuncName + "]")
   565    565               continue
   566    566           sPyCallables += "def {} ({}):\n".format(sFuncName, sParams)
   567    567           sPyCallables += "    return " + sReturn + "\n"
   568    568           sJSCallables += "    {}: function ({})".format(sFuncName, sParams) + " {\n"
   569    569           sJSCallables += "        return " + jsconv.py2js(sReturn) + ";\n"

Modified gc_core/py/lang_core/gc_engine.py from [f48195d439] to [8eb4241441].

   104    104   #### Parsing
   105    105   
   106    106   def parse (sText, sCountry="${country_default}", bDebug=False, dOptions=None, bContext=False):
   107    107       "analyses the paragraph sText and returns list of errors"
   108    108       #sText = unicodedata.normalize("NFC", sText)
   109    109       aErrors = None
   110    110       sRealText = sText
   111         -    dDA = {}        # Disambiguisator. Key = position; value = list of morphologies
   112    111       dPriority = {}  # Key = position; value = priority
   113    112       dOpt = _dOptions  if not dOptions  else dOptions
   114    113       bShowRuleId = option('idrule')
   115    114   
   116    115       # parse paragraph
   117    116       try:
   118         -        sNew, aErrors = _proofread(None, sText, sRealText, 0, True, dDA, dPriority, sCountry, dOpt, bShowRuleId, bDebug, bContext)
          117  +        sNew, aErrors = _proofread(None, sText, sRealText, 0, True, dPriority, sCountry, dOpt, bShowRuleId, bDebug, bContext)
   119    118           if sNew:
   120    119               sText = sNew
   121    120       except:
   122    121           raise
   123    122   
   124    123       # cleanup
   125    124       if " " in sText:
................................................................................
   130    129           sText = sText.replace("'", "’")
   131    130       if "‑" in sText:
   132    131           sText = sText.replace("‑", "-") # nobreakdash
   133    132   
   134    133       # parse sentences
   135    134       for iStart, iEnd in _getSentenceBoundaries(sText):
   136    135           if 4 < (iEnd - iStart) < 2000:
   137         -            dDA.clear()
   138    136               try:
   139    137                   oSentence = TokenSentence(sText[iStart:iEnd], sRealText[iStart:iEnd], iStart)
   140         -                _, errs = _proofread(oSentence, sText[iStart:iEnd], sRealText[iStart:iEnd], iStart, False, dDA, dPriority, sCountry, dOpt, bShowRuleId, bDebug, bContext)
          138  +                _, errs = _proofread(oSentence, sText[iStart:iEnd], sRealText[iStart:iEnd], iStart, False, dPriority, sCountry, dOpt, bShowRuleId, bDebug, bContext)
   141    139                   aErrors.update(errs)
   142    140               except:
   143    141                   raise
   144    142       return aErrors.values() # this is a view (iterable)
   145    143   
   146    144   
   147    145   _zEndOfSentence = re.compile(r'([.?!:;…][ .?!… »”")]*|.$)')
................................................................................
   151    149   def _getSentenceBoundaries (sText):
   152    150       iStart = _zBeginOfParagraph.match(sText).end()
   153    151       for m in _zEndOfSentence.finditer(sText):
   154    152           yield (iStart, m.end())
   155    153           iStart = m.end()
   156    154   
   157    155   
   158         -def _proofread (oSentence, s, sx, nOffset, bParagraph, dDA, dPriority, sCountry, dOptions, bShowRuleId, bDebug, bContext):
          156  +def _proofread (oSentence, s, sx, nOffset, bParagraph, dPriority, sCountry, dOptions, bShowRuleId, bDebug, bContext):
   159    157       dErrs = {}
   160    158       bParagraphChange = False
   161    159       bSentenceChange = False
          160  +    dTokenPos = oSentence.dTokenPos if oSentence else {}
   162    161       for sOption, lRuleGroup in _getRules(bParagraph):
   163    162           if sOption == "@@@@":
   164    163               # graph rules
   165    164               if not bParagraph and bSentenceChange:
   166    165                   oSentence.update(s)
   167    166                   bSentenceChange = False
   168    167               for sGraphName, sLineId in lRuleGroup:
................................................................................
   179    178               for zRegex, bUppercase, sLineId, sRuleId, nPriority, lActions in lRuleGroup:
   180    179                   if sRuleId not in _aIgnoredRules:
   181    180                       for m in zRegex.finditer(s):
   182    181                           bCondMemo = None
   183    182                           for sFuncCond, cActionType, sWhat, *eAct in lActions:
   184    183                               # action in lActions: [ condition, action type, replacement/suggestion/action[, iGroup[, message, URL]] ]
   185    184                               try:
   186         -                                bCondMemo = not sFuncCond or globals()[sFuncCond](s, sx, m, dDA, sCountry, bCondMemo)
          185  +                                bCondMemo = not sFuncCond or globals()[sFuncCond](s, sx, m, dTokenPos, sCountry, bCondMemo)
   187    186                                   if bCondMemo:
   188    187                                       if cActionType == "-":
   189    188                                           # grammar error
   190    189                                           nErrorStart = nOffset + m.start(eAct[0])
   191    190                                           if nErrorStart not in dErrs or nPriority > dPriority[nErrorStart]:
   192    191                                               dErrs[nErrorStart] = _createRegexError(s, sx, sWhat, nOffset, m, eAct[0], sLineId, sRuleId, bUppercase, eAct[1], eAct[2], bShowRuleId, sOption, bContext)
   193    192                                               dPriority[nErrorStart] = nPriority
................................................................................
   196    195                                           s = _rewrite(s, sWhat, eAct[0], m, bUppercase)
   197    196                                           bParagraphChange = True
   198    197                                           bSentenceChange = True
   199    198                                           if bDebug:
   200    199                                               echo("~ " + s + "  -- " + m.group(eAct[0]) + "  # " + sLineId)
   201    200                                       elif cActionType == "=":
   202    201                                           # disambiguation
   203         -                                        globals()[sWhat](s, m, dDA)
   204         -                                        if bDebug:
   205         -                                            echo("= " + m.group(0) + "  # " + sLineId + "\nDA: " + str(dDA))
          202  +                                        if not bParagraph:
          203  +                                            globals()[sWhat](s, m, dTokenPos)
          204  +                                            if bDebug:
          205  +                                                echo("= " + m.group(0) + "  # " + sLineId)
   206    206                                       elif cActionType == ">":
   207    207                                           # we do nothing, this test is just a condition to apply all following actions
   208    208                                           pass
   209    209                                       else:
   210    210                                           echo("# error: unknown action at " + sLineId)
   211    211                                   elif cActionType == ">":
   212    212                                       break
................................................................................
   395    395   #### common functions
   396    396   
   397    397   def option (sOpt):
   398    398       "return True if option sOpt is active"
   399    399       return _dOptions.get(sOpt, False)
   400    400   
   401    401   
   402         -def displayInfo (dDA, tWord):
          402  +def displayInfo (dTokenPos, tWord):
   403    403       "for debugging: retrieve info of word"
   404    404       if not tWord:
   405    405           echo("> nothing to find")
   406    406           return True
   407    407       lMorph = _oSpellChecker.getMorph(tWord[1])
   408    408       if not lMorph:
   409    409           echo("> not in dictionary")
   410    410           return True
   411         -    if tWord[0] in dDA:
   412         -        echo("DA: " + str(dDA[tWord[0]]))
          411  +    if tWord[0] in dTokenPos and "lMorph" in dTokenPos[tWord[0]]:
          412  +        echo("DA: " + str(dTokenPos[tWord[0]]["lMorph"]))
   413    413       echo("FSA: " + str(lMorph))
   414    414       return True
   415    415   
   416    416   
   417         -def morph (dDA, tWord, sPattern, bStrict=True, bNoWord=False):
          417  +def morph (dTokenPos, tWord, sPattern, bStrict=True, bNoWord=False):
   418    418       "analyse a tuple (position, word), return True if sPattern in morphologies (disambiguation on)"
   419    419       if not tWord:
   420    420           return bNoWord
   421         -    lMorph = dDA[tWord[0]]  if tWord[0] in dDA  else _oSpellChecker.getMorph(tWord[1])
          421  +    lMorph = dTokenPos[tWord[0]]["lMorph"]  if tWord[0] in dTokenPos and "lMorph" in dTokenPos[tWord[0]]  else _oSpellChecker.getMorph(tWord[1])
   422    422       if not lMorph:
   423    423           return False
   424    424       p = re.compile(sPattern)
   425    425       if bStrict:
   426    426           return all(p.search(s)  for s in lMorph)
   427    427       return any(p.search(s)  for s in lMorph)
   428    428   
   429    429   
   430         -def morphex (dDA, tWord, sPattern, sNegPattern, bNoWord=False):
          430  +def morphex (dTokenPos, tWord, sPattern, sNegPattern, bNoWord=False):
   431    431       "analyse a tuple (position, word), returns True if not sNegPattern in word morphologies and sPattern in word morphologies (disambiguation on)"
   432    432       if not tWord:
   433    433           return bNoWord
   434         -    lMorph = dDA[tWord[0]]  if tWord[0] in dDA  else _oSpellChecker.getMorph(tWord[1])
          434  +    lMorph = dTokenPos[tWord[0]]["lMorph"]  if tWord[0] in dTokenPos and "lMorph" in dTokenPos[tWord[0]]  else _oSpellChecker.getMorph(tWord[1])
   435    435       if not lMorph:
   436    436           return False
   437    437       # check negative condition
   438    438       np = re.compile(sNegPattern)
   439    439       if any(np.search(s)  for s in lMorph):
   440    440           return False
   441    441       # search sPattern
................................................................................
   513    513       if sNegPattern and re.search(sNegPattern, s):
   514    514           return False
   515    515       if re.search(sPattern, s):
   516    516           return True
   517    517       return False
   518    518   
   519    519   
   520         -def look_chk1 (dDA, s, nOffset, sPattern, sPatternGroup1, sNegPatternGroup1=None):
          520  +def look_chk1 (dTokenPos, s, nOffset, sPattern, sPatternGroup1, sNegPatternGroup1=None):
   521    521       "returns True if s has pattern sPattern and m.group(1) has pattern sPatternGroup1"
   522    522       m = re.search(sPattern, s)
   523    523       if not m:
   524    524           return False
   525    525       try:
   526    526           sWord = m.group(1)
   527    527           nPos = m.start(1) + nOffset
   528    528       except:
   529    529           return False
   530    530       if sNegPatternGroup1:
   531         -        return morphex(dDA, (nPos, sWord), sPatternGroup1, sNegPatternGroup1)
   532         -    return morph(dDA, (nPos, sWord), sPatternGroup1, False)
          531  +        return morphex(dTokenPos, (nPos, sWord), sPatternGroup1, sNegPatternGroup1)
          532  +    return morph(dTokenPos, (nPos, sWord), sPatternGroup1, False)
   533    533   
   534    534   
   535    535   #### Disambiguator
   536    536   
   537         -def select (dDA, nPos, sWord, sPattern, lDefault=None):
          537  +def select (dTokenPos, nPos, sWord, sPattern, lDefault=None):
   538    538       if not sWord:
   539    539           return True
   540         -    if nPos in dDA:
          540  +    if nPos not in dTokenPos:
          541  +        print("Error. There should be a token at this position: ", nPos)
   541    542           return True
   542    543       lMorph = _oSpellChecker.getMorph(sWord)
   543    544       if not lMorph or len(lMorph) == 1:
   544    545           return True
   545    546       lSelect = [ sMorph  for sMorph in lMorph  if re.search(sPattern, sMorph) ]
   546    547       if lSelect:
   547    548           if len(lSelect) != len(lMorph):
   548         -            dDA[nPos] = lSelect
          549  +            dTokenPos[nPos]["lMorph"] = lSelect
   549    550       elif lDefault:
   550         -        dDA[nPos] = lDefault
          551  +        dTokenPos[nPos]["lMorph"] = lDefault
   551    552       return True
   552    553   
   553    554   
   554         -def exclude (dDA, nPos, sWord, sPattern, lDefault=None):
          555  +def exclude (dTokenPos, nPos, sWord, sPattern, lDefault=None):
   555    556       if not sWord:
   556    557           return True
   557         -    if nPos in dDA:
          558  +    if nPos not in dTokenPos:
          559  +        print("Error. There should be a token at this position: ", nPos)
   558    560           return True
   559    561       lMorph = _oSpellChecker.getMorph(sWord)
   560    562       if not lMorph or len(lMorph) == 1:
   561    563           return True
   562    564       lSelect = [ sMorph  for sMorph in lMorph  if not re.search(sPattern, sMorph) ]
   563    565       if lSelect:
   564    566           if len(lSelect) != len(lMorph):
   565         -            dDA[nPos] = lSelect
          567  +            dTokenPos[nPos]["lMorph"] = lSelect
   566    568       elif lDefault:
   567         -        dDA[nPos] = lDefault
          569  +        dTokenPos[nPos]["lMorph"] = lDefault
   568    570       return True
   569    571   
   570    572   
   571         -def define (dDA, nPos, lMorph):
   572         -    dDA[nPos] = lMorph
          573  +def define (dTokenPos, nPos, lMorph):
          574  +    if nPos not in dTokenPos:
          575  +        print("Error. There should be a token at this position: ", nPos)
          576  +        return True
          577  +    dTokenPos[nPos]["lMorph"] = lMorph
   573    578       return True
   574    579   
   575    580   
   576    581   #### GRAMMAR CHECKER PLUGINS
   577    582   
   578    583   ${plugins}
   579    584   

Modified gc_lang/fr/rules.grx from [8fec34a927] to [413f581a35].

   388    388   # URL
   389    389   __<i>(p_URL)__
   390    390       https?://[\w./?&!%=+*"'@$#-]+ <<- ~>> *
   391    391   __<i](p_URL2)__
   392    392       ((?:{w_1}[.])*)({w_2})([.](?:com|net|org|info|fr|ca|be|ch|i[ot]|co[.]uk|tk|es|jp|zh|ru|us|nl|xyz)) @@0,**,$
   393    393       <<- ~1>> *
   394    394       <<- ~2>> =\2.capitalize()
   395         -    <<- =>> define(\2, [":MP:e:i"])
   396    395       <<- ~3>> *
   397    396   
   398    397   # Numéro de chapitre
   399    398   __<i>(p_chapitre)__
   400    399       ^\d+[.][\d.-]* <<- ~>> *
   401    400   
   402    401   # Numéro suivi de plusieurs espaces, considéré comme une numérotation de chapitre
................................................................................
 12396  12395   TEST: dès qu’il le {{voie}}
 12397  12396   TEST: donnant à entendre qu’il avait l’intention de violer Laura dès qu’il en aurait l’occasion
 12398  12397   
 12399  12398   
 12400  12399   # verbe que + subjonctif
 12401  12400   __vmode_qqch_que_subjonctif1__
 12402  12401       [>afin|>avant|>pour|>quoi|>permettre|>falloir|>vouloir|>ordonner|>exiger|>désirer|>préférer|>suffire]  [que|qu’|qu]  @:(?:Os|M)  @:I¬:[GYS]
 12403         -    >douter                                                                                                [que|qu’|qu]  @:(?:Os|M)  @:I¬:(?:[GYSK]|If)
 12404         -        <<- -4>> =suggVerbMode(\4, ":S", \3)                                                         # Après « \1 que », ce verbe devrait être au subjonctif.
        12402  +        <<- /vmode/ -4>> =suggVerbMode(\4, ":S", \3)                                                # Après « \1 que », ce verbe devrait être au subjonctif.
        12403  +
        12404  +    >douter  [que|qu’|qu]  @:(?:Os|M)  @:I¬:(?:[GYSK]|If)
        12405  +        <<- /vmode/ morph(\1, ":V", ":N") -4>> =suggVerbMode(\4, ":S", \3)                          # Après « \1 que », ce verbe devrait être au subjonctif.
 12405  12406   
 12406  12407   TEST: Il suffit qu’il {{court}} plus
 12407  12408   TEST: Je veux qu’il {{finit}} son repas.
 12408  12409   TEST: quoi qu’il en {{conclut}}
 12409  12410   TEST: Je ne veux pas que tu {{es}} des ennuis
 12410  12411   TEST: Avant que tu {{pars}}, je voudrais qu’on discute.
 12411  12412   TEST: Nul doute qu’elle nourrira à brève échéance la haine de demain à notre égard.