Grammalecte  Check-in [eeef098bd9]

Overview
Comment: [build][core] graph calls within regex rules
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | core | build | rg
Files: files | file ages | folders
SHA3-256: eeef098bd9f125077bc432ade4b12eed176ae7dc4d6e139b3987bbfc14bb16b7
User & Date: olr on 2018-06-11 13:21:42
Other Links: branch diff | manifest | tags
Context
2018-06-11
18:20
[misc] SublimeText syntax rules update (check-in: cac524f6db, user: olr, tags: misc, rg)
13:21
[build][core] graph calls within regex rules (check-in: eeef098bd9, user: olr, tags: build, core, rg)
09:26
[build][core] named graphs (check-in: 7e92a17d42, user: olr, tags: build, core, rg)
Changes

Modified compile_rules.py from [394c512707] to [8c11a24e22].

   108    108   
   109    109   
   110    110   def createRule (s, nIdLine, sLang, bParagraph, dOptPriority):
   111    111       "returns rule as list [option name, regex, bCaseInsensitive, identifier, list of actions]"
   112    112       global dJSREGEXES
   113    113       global nRULEWITHOUTNAME
   114    114   
   115         -    #### OPTIONS
   116    115       sLineId = str(nIdLine) + ("p" if bParagraph else "s")
   117    116       sRuleId = sLineId
          117  +
          118  +    #### GRAPH CALL
          119  +    if s.startswith("@@@@"):
          120  +        if bParagraph:
          121  +            print("Error. Graph call can’t be made only after the first pass (sentence by sentence)")
          122  +            exit()
          123  +        return ["@@@@", s[4:], sLineId]
          124  +
          125  +    #### OPTIONS
   118    126       sOption = False         # False or [a-z0-9]+ name
   119    127       nPriority = 4           # Default is 4, value must be between 0 and 9
   120    128       tGroups = None          # code for groups positioning (only useful for JavaScript)
   121    129       cCaseMode = 'i'         # i: case insensitive,  s: case sensitive,  u: uppercasing allowed
   122    130       cWordLimitLeft = '['    # [: word limit, <: no specific limit
   123    131       cWordLimitRight = ']'   # ]: word limit, >: no specific limit
   124    132       m = re.match("^__(?P<borders_and_case>[[<]\\w[]>])(?P<option>/[a-zA-Z0-9]+|)(?P<ruleid>\\(\\w+\\)|)(?P<priority>![0-9]|)__ *", s)
................................................................................
   341    349           print("# Unknown action at line " + sIdAction)
   342    350           return None
   343    351   
   344    352   
   345    353   def _calcRulesStats (lRules):
   346    354       d = {'=':0, '~': 0, '-': 0, '>': 0}
   347    355       for aRule in lRules:
   348         -        for aAction in aRule[6]:
   349         -            d[aAction[1]] = d[aAction[1]] + 1
          356  +        if aRule[0] != "@@@@":
          357  +            for aAction in aRule[6]:
          358  +                d[aAction[1]] = d[aAction[1]] + 1
   350    359       return (d, len(lRules))
   351    360   
   352    361   
   353    362   def displayStats (lParagraphRules, lSentenceRules):
   354    363       print("  {:>18} {:>18} {:>18} {:>18}".format("DISAMBIGUATOR", "TEXT PROCESSOR", "GRAMMAR CHECKING", "REGEX"))
   355    364       d, nRule = _calcRulesStats(lParagraphRules)
   356    365       print("§ {:>10} actions {:>10} actions {:>10} actions  in {:>8} rules".format(d['='], d['~'], d['-'], nRule))
................................................................................
   434    443       # removing comments, zeroing empty lines, creating definitions, storing tests, merging rule lines
   435    444       print("  parsing rules...")
   436    445       global dDEF
   437    446       lLine = []
   438    447       lRuleLine = []
   439    448       lTest = []
   440    449       lOpt = []
   441         -    zBookmark = re.compile("^!!+")
   442         -    zGraphLink = re.compile(r"^@@@@GRAPHLINK>(\w+)@@@@")
   443    450   
   444    451       for i, sLine in enumerate(lRules, 1):
   445    452           if sLine.startswith('#END'):
          453  +            # arbitrary end
   446    454               printBookmark(0, "BREAK BY #END", i)
   447    455               break
   448    456           elif sLine.startswith("#"):
          457  +            # comment
   449    458               pass
   450    459           elif sLine.startswith("@@@@"):
   451         -            m = re.match(r"^@@@@GRAPHLINK>(\w+)@@@@", sLine.strip())
          460  +            # rules graph call
          461  +            m = re.match(r"@@@@GRAPH: *(\w+)@@@@", sLine.strip())
   452    462               if m:
   453    463                   #lRuleLine.append(["@GRAPHLINK", m.group(1)])
   454         -                printBookmark(1, "@GRAPHLINK: " + m.group(1), i)
          464  +                printBookmark(1, "@GRAPH: " + m.group(1), i)
          465  +                lRuleLine.append([i, "@@@@"+m.group(1)])
   455    466           elif sLine.startswith("DEF:"):
          467  +            # definition
   456    468               m = re.match("DEF: +([a-zA-Z_][a-zA-Z_0-9]*) +(.+)$", sLine.strip())
   457    469               if m:
   458    470                   dDEF["{"+m.group(1)+"}"] = m.group(2)
   459    471               else:
   460    472                   print("Error in definition: ", end="")
   461    473                   print(sLine.strip())
   462    474           elif sLine.startswith("TEST:"):
          475  +            # test
   463    476               lTest.append("r{:<7}".format(i) + "  " + sLine[5:].strip())
   464    477           elif sLine.startswith("TODO:"):
          478  +            # todo
   465    479               pass
   466    480           elif sLine.startswith(("OPTGROUP/", "OPTSOFTWARE:", "OPT/", "OPTLANG/", "OPTDEFAULTUILANG:", "OPTLABEL/", "OPTPRIORITY/")):
          481  +            # options
   467    482               lOpt.append(sLine)
   468    483           elif re.match("[  \t]*$", sLine):
          484  +            # empty line
   469    485               pass
   470    486           elif sLine.startswith("!!"):
   471         -            m = zBookmark.search(sLine)
          487  +            # bookmark
          488  +            m = re.match("!!+", sLine)
   472    489               nExMk = len(m.group(0))
   473    490               if sLine[nExMk:].strip():
   474    491                   printBookmark(nExMk-2, sLine[nExMk:].strip(), i)
   475    492           elif sLine.startswith(("    ", "\t")):
   476         -            lRuleLine[len(lRuleLine)-1][1] += " " + sLine.strip()
          493  +            # rule (continuation)
          494  +            lRuleLine[-1][1] += " " + sLine.strip()
   477    495           else:
          496  +            # new rule
   478    497               lRuleLine.append([i, sLine.strip()])
   479    498   
   480    499       # generating options files
   481    500       print("  parsing options...")
   482    501       try:
   483    502           dOptions, dOptPriority = prepareOptions(lOpt)
   484    503       except:

Modified compile_rules_js_convert.py from [5ad87f3f46] to [f2cc9f3e39].

   115    115       if not lNegLookBeforeRegex:
   116    116           lNegLookBeforeRegex = None
   117    117       return (sRegex, lNegLookBeforeRegex)
   118    118   
   119    119   
   120    120   def pyRuleToJS (lRule, dJSREGEXES, sWORDLIMITLEFT):
   121    121       lRuleJS = copy.deepcopy(lRule)
          122  +    # graph rules
          123  +    if lRuleJS[0] == "@@@@":
          124  +        return lRuleJS
   122    125       del lRule[-1] # tGroups positioning codes are useless for Python
   123    126       # error messages
   124    127       for aAction in lRuleJS[6]:
   125    128           if aAction[1] == "-":
   126    129               aAction[2] = aAction[2].replace(" ", " ") # nbsp --> nnbsp
   127    130               aAction[4] = aAction[4].replace("« ", "« ").replace(" »", " »").replace(" :", " :").replace(" :", " :")
   128    131       # js regexes
................................................................................
   130    133       lRuleJS.append(lNegLookBehindRegex)
   131    134       return lRuleJS
   132    135   
   133    136   
   134    137   def writeRulesToJSArray (lRules):
   135    138       sArray = "[\n"
   136    139       for sOption, aRuleGroup in lRules:
   137         -        sArray += '  ["' + sOption + '", [\n'  if sOption  else  "  [false, [\n"
   138         -        for sRegex, bCaseInsensitive, sLineId, sRuleId, nPriority, lActions, aGroups, aNegLookBehindRegex in aRuleGroup:
   139         -            sArray += '    [' + sRegex + ", "
   140         -            sArray += "true, " if bCaseInsensitive  else "false, "
   141         -            sArray += '"' + sLineId + '", '
   142         -            sArray += '"' + sRuleId + '", '
   143         -            sArray += str(nPriority) + ", "
   144         -            sArray += json.dumps(lActions, ensure_ascii=False) + ", "
   145         -            sArray += json.dumps(aGroups, ensure_ascii=False) + ", "
   146         -            sArray += json.dumps(aNegLookBehindRegex, ensure_ascii=False) + "],\n"
   147         -        sArray += "  ]],\n"
          140  +        if sOption != "@@@@":
          141  +            sArray += '  ["' + sOption + '", [\n'  if sOption  else  "  [false, [\n"
          142  +            for sRegex, bCaseInsensitive, sLineId, sRuleId, nPriority, lActions, aGroups, aNegLookBehindRegex in aRuleGroup:
          143  +                sArray += '    [' + sRegex + ", "
          144  +                sArray += "true, " if bCaseInsensitive  else "false, "
          145  +                sArray += '"' + sLineId + '", '
          146  +                sArray += '"' + sRuleId + '", '
          147  +                sArray += str(nPriority) + ", "
          148  +                sArray += json.dumps(lActions, ensure_ascii=False) + ", "
          149  +                sArray += json.dumps(aGroups, ensure_ascii=False) + ", "
          150  +                sArray += json.dumps(aNegLookBehindRegex, ensure_ascii=False) + "],\n"
          151  +            sArray += "  ]],\n"
          152  +        else:
          153  +            sArray += '  ["' + sOption + '", [\n'
          154  +            for sGraphName, sLineId in aRuleGroup:
          155  +                sArray += '    ["' + sGraphName + '", "' + sLineId + '"],\n"'
          156  +            sArray += "  ]],\n"
   148    157       sArray += "]"
   149    158       return sArray
   150    159   
   151    160   
   152    161   def groupsPositioningCodeToList (sGroupsPositioningCode):
   153    162       if not sGroupsPositioningCode:
   154    163           return None
   155    164       return [ int(sCode)  if sCode.isdigit() or (sCode[0:1] == "-" and sCode[1:].isdigit())  else sCode \
   156    165                for sCode in sGroupsPositioningCode.split(",") ]

Modified gc_core/py/lang_core/gc_engine.py from [12b89317cd] to [0da090b219].

    87     87   
    88     88   
    89     89   def _loadRules ():
    90     90       from . import gc_rules
    91     91       global _rules
    92     92       _rules = gc_rules
    93     93       # compile rules regex
    94         -    for lRuleGroup in chain(_rules.lParagraphRules, _rules.lSentenceRules):
    95         -        for rule in lRuleGroup[1]:
    96         -            try:
    97         -                rule[0] = re.compile(rule[0])
    98         -            except:
    99         -                echo("Bad regular expression in # " + str(rule[2]))
   100         -                rule[0] = "(?i)<Grammalecte>"
           94  +    for sOption, lRuleGroup in chain(_rules.lParagraphRules, _rules.lSentenceRules):
           95  +        if sOption != "@@@@":
           96  +            for aRule in lRuleGroup:
           97  +                try:
           98  +                    aRule[0] = re.compile(aRule[0])
           99  +                except:
          100  +                    echo("Bad regular expression in # " + str(aRule[2]))
          101  +                    aRule[0] = "(?i)<Grammalecte>"
   101    102   
   102    103   
   103    104   #### Parsing
   104    105   
   105    106   def parse (sText, sCountry="${country_default}", bDebug=False, dOptions=None, bContext=False):
   106    107       "analyses the paragraph sText and returns list of errors"
   107    108       #sText = unicodedata.normalize("NFC", sText)
................................................................................
   110    111       dDA = {}        # Disambiguisator. Key = position; value = list of morphologies
   111    112       dPriority = {}  # Key = position; value = priority
   112    113       dOpt = _dOptions  if not dOptions  else dOptions
   113    114       bShowRuleId = option('idrule')
   114    115   
   115    116       # parse paragraph
   116    117       try:
   117         -        sNew, aErrors = _proofread(sText, sRealText, 0, True, dDA, dPriority, sCountry, dOpt, bShowRuleId, bDebug, bContext)
          118  +        sNew, aErrors = _proofread(None, sText, sRealText, 0, True, dDA, dPriority, sCountry, dOpt, bShowRuleId, bDebug, bContext)
   118    119           if sNew:
   119    120               sText = sNew
   120    121       except:
   121    122           raise
   122    123   
   123    124       # cleanup
   124    125       if " " in sText:
................................................................................
   131    132           sText = sText.replace("‑", "-") # nobreakdash
   132    133   
   133    134       # parse sentences
   134    135       for iStart, iEnd in _getSentenceBoundaries(sText):
   135    136           if 4 < (iEnd - iStart) < 2000:
   136    137               dDA.clear()
   137    138               try:
   138         -                # regex parser
   139         -                _, errs = _proofread(sText[iStart:iEnd], sRealText[iStart:iEnd], iStart, False, dDA, dPriority, sCountry, dOpt, bShowRuleId, bDebug, bContext)
   140         -                aErrors.update(errs)
   141         -                # token parser
   142    139                   oSentence = TokenSentence(sText[iStart:iEnd], sRealText[iStart:iEnd], iStart)
   143         -                bChange, errs = oSentence.parse(dAllGraph["test_graph"], dPriority, sCountry, dOpt, bShowRuleId, bDebug, bContext)
          140  +                _, errs = _proofread(oSentence, sText[iStart:iEnd], sRealText[iStart:iEnd], iStart, False, dDA, dPriority, sCountry, dOpt, bShowRuleId, bDebug, bContext)
   144    141                   aErrors.update(errs)
   145         -                if bChange:
   146         -                    oSentence.rewrite()
   147         -                    if bDebug:
   148         -                        print("~", oSentence.sSentence)
   149    142               except:
   150    143                   raise
   151    144       return aErrors.values() # this is a view (iterable)
   152    145   
   153    146   
   154    147   _zEndOfSentence = re.compile(r'([.?!:;…][ .?!… »”")]*|.$)')
   155    148   _zBeginOfParagraph = re.compile(r"^\W*")
................................................................................
   158    151   def _getSentenceBoundaries (sText):
   159    152       iStart = _zBeginOfParagraph.match(sText).end()
   160    153       for m in _zEndOfSentence.finditer(sText):
   161    154           yield (iStart, m.end())
   162    155           iStart = m.end()
   163    156   
   164    157   
   165         -def _proofread (s, sx, nOffset, bParagraph, dDA, dPriority, sCountry, dOptions, bShowRuleId, bDebug, bContext):
          158  +def _proofread (oSentence, s, sx, nOffset, bParagraph, dDA, dPriority, sCountry, dOptions, bShowRuleId, bDebug, bContext):
   166    159       dErrs = {}
   167    160       bChange = False
   168    161       for sOption, lRuleGroup in _getRules(bParagraph):
   169         -        if not sOption or dOptions.get(sOption, False):
          162  +        if sOption == "@@@@":
          163  +            # graph rules
          164  +            for sGraphName, sLineId in lRuleGroup:
          165  +                if bDebug:
          166  +                    print(sGraphName, sLineId)
          167  +                bChange, errs = oSentence.parse(dAllGraph[sGraphName], dPriority, sCountry, dOptions, bShowRuleId, bDebug, bContext)
          168  +                dErrs.update(errs)
          169  +                if bChange:
          170  +                    oSentence.rewrite()
          171  +                    if bDebug:
          172  +                        print("~", oSentence.sSentence)
          173  +        elif not sOption or dOptions.get(sOption, False):
          174  +            # regex rules
   170    175               for zRegex, bUppercase, sLineId, sRuleId, nPriority, lActions in lRuleGroup:
   171    176                   if sRuleId not in _aIgnoredRules:
   172    177                       for m in zRegex.finditer(s):
   173    178                           bCondMemo = None
   174    179                           for sFuncCond, cActionType, sWhat, *eAct in lActions:
   175    180                               # action in lActions: [ condition, action type, replacement/suggestion/action[, iGroup[, message, URL]] ]
   176    181                               try:
................................................................................
   323    328       if sFilter:
   324    329           try:
   325    330               zFilter = re.compile(sFilter)
   326    331           except:
   327    332               echo("# Error. List rules: wrong regex.")
   328    333               sFilter = None
   329    334       for sOption, lRuleGroup in chain(_getRules(True), _getRules(False)):
   330         -        for _, _, sLineId, sRuleId, _, _ in lRuleGroup:
   331         -            if not sFilter or zFilter.search(sRuleId):
   332         -                yield (sOption, sLineId, sRuleId)
          335  +        if sOption != "@@@@":
          336  +            for _, _, sLineId, sRuleId, _, _ in lRuleGroup:
          337  +                if not sFilter or zFilter.search(sRuleId):
          338  +                    yield (sOption, sLineId, sRuleId)
   333    339   
   334    340   
   335    341   def displayRules (sFilter=None):
   336    342       echo("List of rules. Filter: << " + str(sFilter) + " >>")
   337    343       for sOption, sLineId, sRuleId in listRules(sFilter):
   338    344           echo("{:<10} {:<10} {}".format(sOption, sLineId, sRuleId))
   339    345   

Modified gc_lang/fr/rules.grx from [bf0a5c739e] to [5101ac8a06].

 12441  12441       -2>> =suggVerbMode(@, ":I", \1)
 12442  12442       # Après « quand » ou « lorsque », le verbe ne s’emploie pas au subjonctif mais à l’indicatif.
 12443  12443   
 12444  12444   TEST: quand elle {{rencontrât}} son créateur
 12445  12445   TEST: lorsqu’il y {{eût}} du grabuge, nous montâmes tous sur le pont.
 12446  12446   
 12447  12447   
        12448  +
        12449  +@@@@GRAPH: test_graph@@@@
        12450  +
        12451  +
 12448  12452   
 12449  12453   !!
 12450  12454   !!
 12451  12455   !!
 12452  12456   !!
 12453  12457   !!
 12454  12458   !!
................................................................................
 16512  16516   TEST: Éliante, cousine de Célimène,
 16513  16517   TEST: Arsinoé, amie de Célimène,
 16514  16518   TEST: Acaste,
 16515  16519   TEST: Clitandre, marquis
 16516  16520   TEST: Basque, valet de Célimène,
 16517  16521   TEST: Un garde de la maréchaussée de France,
 16518  16522   TEST: Dubois, valet d’Alceste.
 16519         -
 16520  16523   TEST: La scène se passe à Paris, dans la maison de Célimène.
 16521         -
 16522  16524   TEST: ACTE I
 16523  16525   TEST: SCÈNE PREMIÈRE. Philinte, Alceste.
 16524  16526   TEST: PHILINTE. Qu’est-ce donc ? Qu’avez-vous ?
 16525  16527   TEST: ALCESTE, assis. Laissez-moi, je vous prie.
 16526  16528   TEST: PHILINTE. Mais encor, dites-moi, quelle bizarrerie…
 16527  16529   TEST: ALCESTE. Laissez-moi là, vous dis-je, et courez vous cacher.
 16528  16530   TEST: PHILINTE. Mais on entend les gens au moins sans se fâcher.