Grammalecte Check-in [bc740f8402]

Overview
Comment: [core][py] gc: use spellchecker storage
SHA3-256: bc740f84029a81939f36fc28be648beeb5457d909b8832660780dcdd1aef238f
User & Date: olr on 2018-05-23 08:47:46
Context
2018-05-23
10:29
[graphspell][js] data memorization check-in: e7244953ec user: olr tags: graphspell, rg
08:47
[core][py] gc: use spellchecker storage check-in: bc740f8402 user: olr tags: core, rg
08:46
[core][py] gc: use spellchecker storage check-in: 445405d362 user: olr tags: core, rg
Changes

Modified gc_core/py/lang_core/gc_engine.py from [72ecd7c680] to [29b43c054f].
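
This check-in removes the module-level _dAnalyses cache from gc_engine.py: the morphology helpers (displayInfo, morph, morphex, analyse, analysex, select, exclude) now call _oSpellChecker.getMorph() directly, load() activates the spell checker's own storage and fetches its tokenizer, the obsolete _storeMorphFromFSA() and stem() helpers are dropped, and the common regexes are rewritten as raw strings. Below is a minimal sketch of the memoization pattern presumably taken over by graphspell, assuming activateStorage() simply turns on a word-to-morphologies cache; apart from getMorph() and activateStorage(), the names are hypothetical and the real SpellChecker implementation may differ.

    # Illustrative sketch only: how a storage-enabled spell checker could
    # memoize morphology lookups, replacing the old _dAnalyses dict.
    class SpellCheckerStorageSketch:

        def __init__ (self, oDictionary):
            self.oDictionary = oDictionary    # assumed to expose getMorph(sWord)
            self.bStorage = False
            self._dMorphologies = {}          # plays the role of the old _dAnalyses

        def activateStorage (self):
            "start caching morphology lookups"
            self.bStorage = True

        def getMorph (self, sWord):
            "return the list of morphologies of sWord, using the cache when active"
            if self.bStorage and sWord in self._dMorphologies:
                return self._dMorphologies[sWord]
            lMorph = self.oDictionary.getMorph(sWord)
            if self.bStorage:
                self._dMorphologies[sWord] = lMorph
            return lMorph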

     8      8   #import unicodedata
     9      9   from itertools import chain
    10     10   
    11     11   from ..graphspell.spellchecker import SpellChecker
    12     12   from ..graphspell.echo import echo
    13     13   from . import gc_options
    14     14   
           15  +from ..graphspell.tokenizer import Tokenizer
           16  +from .gc_rules_graph import dGraph
           17  +
    15     18   
    16     19   __all__ = [ "lang", "locales", "pkg", "name", "version", "author", \
    17     20               "load", "parse", "getSpellChecker", \
    18     21               "setOption", "setOptions", "getOptions", "getDefaultOptions", "getOptionsLabels", "resetOptions", "displayOptions", \
    19     22               "ignoreRule", "resetIgnoreRules", "reactivateRule", "listRules", "displayRules" ]
    20     23   
    21     24   __version__ = "${version}"
................................................................................
    31     34   _rules = None                               # module gc_rules
    32     35   
    33     36   # data
    34     37   _sAppContext = ""                           # what software is running
    35     38   _dOptions = None
    36     39   _aIgnoredRules = set()
    37     40   _oSpellChecker = None
    38         -_dAnalyses = {}                             # cache for data from dictionary
    39         -
           41  +_oTokenizer = None
    40     42   
    41     43   
    42     44   #### Parsing
    43     45   
    44     46   def parse (sText, sCountry="${country_default}", bDebug=False, dOptions=None, bContext=False):
    45     47       "analyses the paragraph sText and returns list of errors"
    46     48       #sText = unicodedata.normalize("NFC", sText)
    47     49       aErrors = None
    48         -    sAlt = sText
           50  +    sRealText = sText
    49     51       dDA = {}        # Disambiguisator. Key = position; value = list of morphologies
    50     52       dPriority = {}  # Key = position; value = priority
    51     53       dOpt = _dOptions  if not dOptions  else dOptions
    52     54   
    53     55       # parse paragraph
    54     56       try:
    55         -        sNew, aErrors = _proofread(sText, sAlt, 0, True, dDA, dPriority, sCountry, dOpt, bDebug, bContext)
           57  +        sNew, aErrors = _proofread(sText, sRealText, 0, True, dDA, dPriority, sCountry, dOpt, bDebug, bContext)
    56     58           if sNew:
    57     59               sText = sNew
    58     60       except:
    59     61           raise
    60     62   
    61     63       # cleanup
    62     64       if " " in sText:
................................................................................
    69     71           sText = sText.replace("‑", "-") # nobreakdash
    70     72   
    71     73       # parse sentences
    72     74       for iStart, iEnd in _getSentenceBoundaries(sText):
    73     75           if 4 < (iEnd - iStart) < 2000:
    74     76               dDA.clear()
    75     77               try:
    76         -                _, errs = _proofread(sText[iStart:iEnd], sAlt[iStart:iEnd], iStart, False, dDA, dPriority, sCountry, dOpt, bDebug, bContext)
           78  +                # regex parser
           79  +                _, errs = _proofread(sText[iStart:iEnd], sRealText[iStart:iEnd], iStart, False, dDA, dPriority, sCountry, dOpt, bDebug, bContext)
    77     80                   aErrors.update(errs)
    78     81               except:
    79     82                   raise
    80     83       return aErrors.values() # this is a view (iterable)
    81     84   
    82     85   
    83     86   def _getSentenceBoundaries (sText):
................................................................................
   287    290       _createError = _createDictError
   288    291   
   289    292   
   290    293   def load (sContext="Python"):
   291    294       global _oSpellChecker
   292    295       global _sAppContext
   293    296       global _dOptions
          297  +    global _oTokenizer
   294    298       try:
   295    299           _oSpellChecker = SpellChecker("${lang}", "${dic_main_filename_py}", "${dic_extended_filename_py}", "${dic_community_filename_py}", "${dic_personal_filename_py}")
   296    300           _sAppContext = sContext
   297    301           _dOptions = dict(gc_options.getOptions(sContext))   # duplication necessary, to be able to reset to default
          302  +        _oTokenizer = _oSpellChecker.getTokenizer()
          303  +        _oSpellChecker.activateStorage()
   298    304       except:
   299    305           traceback.print_exc()
   300    306   
   301    307   
   302    308   def setOption (sOpt, bVal):
   303    309       if sOpt in _dOptions:
   304    310           _dOptions[sOpt] = bVal
................................................................................
   367    373       return os.path.join(os.path.dirname(sys.modules[__name__].__file__), __name__ + ".py")
   368    374   
   369    375   
   370    376   
   371    377   #### common functions
   372    378   
   373    379   # common regexes
   374         -_zEndOfSentence = re.compile('([.?!:;…][ .?!… »”")]*|.$)')
   375         -_zBeginOfParagraph = re.compile("^\W*")
   376         -_zEndOfParagraph = re.compile("\W*$")
   377         -_zNextWord = re.compile(" +(\w[\w-]*)")
   378         -_zPrevWord = re.compile("(\w[\w-]*) +$")
          380  +_zEndOfSentence = re.compile(r'([.?!:;…][ .?!… »”")]*|.$)')
          381  +_zBeginOfParagraph = re.compile(r"^\W*")
          382  +_zEndOfParagraph = re.compile(r"\W*$")
          383  +_zNextWord = re.compile(r" +(\w[\w-]*)")
          384  +_zPrevWord = re.compile(r"(\w[\w-]*) +$")
   379    385   
   380    386   
   381    387   def option (sOpt):
   382    388       "return True if option sOpt is active"
   383    389       return _dOptions.get(sOpt, False)
   384    390   
   385    391   
   386    392   def displayInfo (dDA, tWord):
   387    393       "for debugging: retrieve info of word"
   388    394       if not tWord:
   389    395           echo("> nothing to find")
   390    396           return True
   391         -    if tWord[1] not in _dAnalyses and not _storeMorphFromFSA(tWord[1]):
   392         -        echo("> not in FSA")
          397  +    lMorph = _oSpellChecker.getMorph(tWord[1])
          398  +    if not lMorph:
          399  +        echo("> not in dictionary")
   393    400           return True
   394    401       if tWord[0] in dDA:
   395    402           echo("DA: " + str(dDA[tWord[0]]))
   396         -    echo("FSA: " + str(_dAnalyses[tWord[1]]))
          403  +    echo("FSA: " + str(lMorph))
   397    404       return True
   398    405   
   399         -
   400         -def _storeMorphFromFSA (sWord):
   401         -    "retrieves morphologies list from _oSpellChecker -> _dAnalyses"
   402         -    global _dAnalyses
   403         -    _dAnalyses[sWord] = _oSpellChecker.getMorph(sWord)
   404         -    return True  if _dAnalyses[sWord]  else False
   405         -
   406    406   
   407    407   def morph (dDA, tWord, sPattern, bStrict=True, bNoWord=False):
   408    408       "analyse a tuple (position, word), return True if sPattern in morphologies (disambiguation on)"
   409    409       if not tWord:
   410    410           return bNoWord
   411         -    if tWord[1] not in _dAnalyses and not _storeMorphFromFSA(tWord[1]):
   412         -        return False
   413         -    lMorph = dDA[tWord[0]]  if tWord[0] in dDA  else _dAnalyses[tWord[1]]
          411  +    lMorph = dDA[tWord[0]]  if tWord[0] in dDA  else _oSpellChecker.getMorph(tWord[1])
   414    412       if not lMorph:
   415    413           return False
   416    414       p = re.compile(sPattern)
   417    415       if bStrict:
   418    416           return all(p.search(s)  for s in lMorph)
   419    417       return any(p.search(s)  for s in lMorph)
   420    418   
   421    419   
   422    420   def morphex (dDA, tWord, sPattern, sNegPattern, bNoWord=False):
   423    421       "analyse a tuple (position, word), returns True if not sNegPattern in word morphologies and sPattern in word morphologies (disambiguation on)"
   424    422       if not tWord:
   425    423           return bNoWord
   426         -    if tWord[1] not in _dAnalyses and not _storeMorphFromFSA(tWord[1]):
          424  +    lMorph = dDA[tWord[0]]  if tWord[0] in dDA  else _oSpellChecker.getMorph(tWord[1])
          425  +    if not lMorph:
   427    426           return False
   428         -    lMorph = dDA[tWord[0]]  if tWord[0] in dDA  else _dAnalyses[tWord[1]]
   429    427       # check negative condition
   430    428       np = re.compile(sNegPattern)
   431    429       if any(np.search(s)  for s in lMorph):
   432    430           return False
   433    431       # search sPattern
   434    432       p = re.compile(sPattern)
   435    433       return any(p.search(s)  for s in lMorph)
   436    434   
   437    435   
   438    436   def analyse (sWord, sPattern, bStrict=True):
   439    437       "analyse a word, return True if sPattern in morphologies (disambiguation off)"
   440         -    if sWord not in _dAnalyses and not _storeMorphFromFSA(sWord):
   441         -        return False
   442         -    if not _dAnalyses[sWord]:
          438  +    lMorph = _oSpellChecker.getMorph(sWord)
          439  +    if not lMorph:
   443    440           return False
   444    441       p = re.compile(sPattern)
   445    442       if bStrict:
   446         -        return all(p.search(s)  for s in _dAnalyses[sWord])
   447         -    return any(p.search(s)  for s in _dAnalyses[sWord])
          443  +        return all(p.search(s)  for s in lMorph)
          444  +    return any(p.search(s)  for s in lMorph)
   448    445   
   449    446   
   450    447   def analysex (sWord, sPattern, sNegPattern):
   451    448       "analyse a word, returns True if not sNegPattern in word morphologies and sPattern in word morphologies (disambiguation off)"
   452         -    if sWord not in _dAnalyses and not _storeMorphFromFSA(sWord):
          449  +    lMorph = _oSpellChecker.getMorph(sWord)
          450  +    if not lMorph:
   453    451           return False
   454    452       # check negative condition
   455    453       np = re.compile(sNegPattern)
   456         -    if any(np.search(s)  for s in _dAnalyses[sWord]):
          454  +    if any(np.search(s)  for s in lMorph):
   457    455           return False
   458    456       # search sPattern
   459    457       p = re.compile(sPattern)
   460         -    return any(p.search(s)  for s in _dAnalyses[sWord])
          458  +    return any(p.search(s)  for s in lMorph)
   461    459   
   462    460   
   463         -def stem (sWord):
   464         -    "returns a list of sWord's stems"
   465         -    if not sWord:
   466         -        return []
   467         -    if sWord not in _dAnalyses and not _storeMorphFromFSA(sWord):
   468         -        return []
   469         -    return [ s[1:s.find(" ")]  for s in _dAnalyses[sWord] ]
   470         -
   471    461   
   472    462   ## functions to get text outside pattern scope
   473    463   
   474    464   # warning: check compile_rules.py to understand how it works
   475    465   
   476    466   def nextword (s, iStart, n):
   477    467       "get the nth word of the input string or empty string"
................................................................................
   532    522   #### Disambiguator
   533    523   
   534    524   def select (dDA, nPos, sWord, sPattern, lDefault=None):
   535    525       if not sWord:
   536    526           return True
   537    527       if nPos in dDA:
   538    528           return True
   539         -    if sWord not in _dAnalyses and not _storeMorphFromFSA(sWord):
   540         -        return True
   541         -    if len(_dAnalyses[sWord]) == 1:
          529  +    lMorph = _oSpellChecker.getMorph(sWord)
          530  +    if not lMorph or len(lMorph) == 1:
   542    531           return True
   543         -    lSelect = [ sMorph  for sMorph in _dAnalyses[sWord]  if re.search(sPattern, sMorph) ]
          532  +    lSelect = [ sMorph  for sMorph in lMorph  if re.search(sPattern, sMorph) ]
   544    533       if lSelect:
   545         -        if len(lSelect) != len(_dAnalyses[sWord]):
          534  +        if len(lSelect) != len(lMorph):
   546    535               dDA[nPos] = lSelect
   547         -            #echo("= "+sWord+" "+str(dDA.get(nPos, "null")))
   548    536       elif lDefault:
   549    537           dDA[nPos] = lDefault
   550         -        #echo("= "+sWord+" "+str(dDA.get(nPos, "null")))
   551    538       return True
   552    539   
   553    540   
   554    541   def exclude (dDA, nPos, sWord, sPattern, lDefault=None):
   555    542       if not sWord:
   556    543           return True
   557    544       if nPos in dDA:
   558    545           return True
   559         -    if sWord not in _dAnalyses and not _storeMorphFromFSA(sWord):
   560         -        return True
   561         -    if len(_dAnalyses[sWord]) == 1:
          546  +    lMorph = _oSpellChecker.getMorph(sWord)
          547  +    if not lMorph or len(lMorph) == 1:
   562    548           return True
   563         -    lSelect = [ sMorph  for sMorph in _dAnalyses[sWord]  if not re.search(sPattern, sMorph) ]
          549  +    lSelect = [ sMorph  for sMorph in lMorph  if not re.search(sPattern, sMorph) ]
   564    550       if lSelect:
   565         -        if len(lSelect) != len(_dAnalyses[sWord]):
          551  +        if len(lSelect) != len(lMorph):
   566    552               dDA[nPos] = lSelect
   567         -            #echo("= "+sWord+" "+str(dDA.get(nPos, "null")))
   568    553       elif lDefault:
   569    554           dDA[nPos] = lDefault
   570         -        #echo("= "+sWord+" "+str(dDA.get(nPos, "null")))
   571    555       return True
   572    556   
   573    557   
   574    558   def define (dDA, nPos, lMorph):
   575    559       dDA[nPos] = lMorph
   576         -    #echo("= "+str(nPos)+" "+str(dDA[nPos]))
   577    560       return True
   578    561   
   579    562   
   580    563   #### GRAMMAR CHECKER PLUGINS
   581    564   
   582    565   ${plugins}
   583    566   
          567  +
          568  +#### CALLABLES (generated code)
   584    569   
   585    570   ${callables}