Grammalecte  Check-in [d205a5a601]

Overview
Comment:[graphspell][py] new functions: getLemma() and countWordsOccurrences()
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | graphspell
Files: files | file ages | folders
SHA3-256: d205a5a60172d3c9150901a064507045d27f698085cffce78089999ef11119f8
User & Date: olr on 2018-02-21 19:13:21
Original Comment: [graphspell] new functions: getLemma() and countWordsOccurrences()
Other Links: manifest | tags
Context
2018-02-21
19:14
[graphspell][py] default module import check-in: 31837970bd user: olr tags: graphspell, trunk
19:13
[graphspell][py] new functions: getLemma() and countWordsOccurrences() check-in: d205a5a601 user: olr tags: graphspell, trunk
11:53
[build] new command for future graph rules check-in: c4eb507f6d user: olr tags: build, trunk
Changes

Modified graphspell/spellchecker.py from [b9fb2c7b70] to [dbd02131cc].

    73     73               if dToken['sType'] == "WORD" and not self.isValidToken(dToken['sValue']):
    74     74                   if bSpellSugg:
    75     75                       dToken['aSuggestions'] = []
    76     76                       for lSugg in self.suggest(dToken['sValue']):
    77     77                           dToken['aSuggestions'].extend(lSugg)
    78     78                   aSpellErrs.append(dToken)
    79     79           return aSpellErrs
           80  +
           def countWordsOccurrences (self, sText, bByLemma=False, bOnlyUnknownWords=False, dWord=None):
               """Count the words of <sText> and return the counts as a dictionary.

               Only tokens whose 'sType' is "WORD" are counted.

               sText              text to analyse (passed to the tokenizer)
               bByLemma           if True, count each lemma given by self.getLemma()
                                  instead of the word as written
               bOnlyUnknownWords  if True, count only the words rejected by
                                  self.isValidToken(); bByLemma is then ignored
               dWord              optional dictionary {word: count} to update in place,
                                  so that counts can be accumulated over several texts;
                                  if omitted, a new dictionary is created for this call

               Returns the dictionary of counts (<dWord> itself when it was given).
               """
               if dWord is None:
                   # A new dictionary for each call: a literal {} as default value would be
                   # created once and shared, mixing the counts of unrelated calls.
                   dWord = {}
               if not self.oTokenizer:
                   self.loadTokenizer()
               for dToken in self.oTokenizer.genTokens(sText):
                   if dToken['sType'] != "WORD":
                       continue
                   sWord = dToken['sValue']
                   if bOnlyUnknownWords:
                       # this mode takes precedence over bByLemma
                       if not self.isValidToken(sWord):
                           dWord[sWord] = dWord.get(sWord, 0) + 1
                   elif bByLemma:
                       # a word with several lemmas increments each of them
                       for sLemma in self.getLemma(sWord):
                           dWord[sLemma] = dWord.get(sLemma, 0) + 1
                   else:
                       dWord[sWord] = dWord.get(sWord, 0) + 1
               return dWord
    80     96   
    81     97       # IBDAWG functions
    82     98   
    83     99       def isValidToken (self, sToken):
    84    100           "checks if sToken is valid (if there is hyphens in sToken, sToken is split, each part is checked)"
    85    101           if self.oMainDic.isValidToken(sToken):
    86    102               return True
................................................................................
   115    131           lResult = self.oMainDic.getMorph(sWord)
   116    132           if self.oExtendedDic:
   117    133               lResult.extend(self.oExtendedDic.getMorph(sWord))
   118    134           if self.oPersonalDic:
   119    135               lResult.extend(self.oPersonalDic.getMorph(sWord))
   120    136           return lResult
   121    137   
          138  +    def getLemma (self, sWord):
          139  +        return set([ s[1:s.find(" ")]  for s in self.getMorph(sWord) ])
          140  +
   122    141       def suggest (self, sWord, nSuggLimit=10):
   123    142           "generator: returns 1, 2 or 3 lists of suggestions"
   124    143           yield self.oMainDic.suggest(sWord, nSuggLimit)
   125    144           if self.oExtendedDic:
   126    145               yield self.oExtendedDic.suggest(sWord, nSuggLimit)
   127    146           if self.oPersonalDic:
   128    147               yield self.oPersonalDic.suggest(sWord, nSuggLimit)