Grammalecte  Check-in [be6d99bbdc]

Overview
Comment: [graphspell] tokenizer: add token index and avoid punctuations aggregation
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | graphspell | rg
Files: files | file ages | folders
SHA3-256: be6d99bbdc1bc63d0e212605b25292b4532774ed19abce6bab69e063e1af1e56
User & Date: olr on 2018-05-18 13:11:15
Other Links: branch diff | manifest | tags
Context
2018-05-19
14:06
[build][core] merge actions in key <rules> + code clarification check-in: a59fbc32a0 user: olr tags: build, core, rg
2018-05-18
13:11
[graphspell] tokenizer: add token index and avoid punctuations aggregation check-in: be6d99bbdc user: olr tags: graphspell, rg
2018-05-17
09:09
[build][core] use 1 instead of empty string for specific tags check-in: 1895dda13e user: olr tags: build, core, rg
Changes

Modified graphspell-js/tokenizer.js from [bdd895b918] to [9bd60cca8a].

    14     14   const aTkzPatterns = {
    15     15       // All regexps must start with ^.
    16     16       "default":
    17     17           [
    18     18               [/^[   \t]+/, 'SPACE'],
    19     19               [/^\/(?:~|bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERUNIX'],
    20     20               [/^[a-zA-Z]:\\(?:Program Files(?: \(x86\)|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERWIN'],
    21         -            [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'],
           21  +            [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]/, 'SEPARATOR'],
    22     22               [/^[A-Z][.][A-Z][.](?:[A-Z][.])*/, 'ACRONYM'],
    23     23               [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'],
    24     24               [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'],
    25     25               [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'],
    26     26               [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'],
    27     27               [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],
    28     28               [/^\d\d?h\d\d\b/, 'HOUR'],
................................................................................
    30     30               [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+)*/, 'WORD']
    31     31           ],
    32     32       "fr":
    33     33           [
    34     34               [/^[   \t]+/, 'SPACE'],
    35     35               [/^\/(?:~|bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERUNIX'],
    36     36               [/^[a-zA-Z]:\\(?:Program Files(?: \(x86\)|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERWIN'],
    37         -            [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'],
           37  +            [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]/, 'SEPARATOR'],
    38     38               [/^[A-Z][.][A-Z][.](?:[A-Z][.])*/, 'ACRONYM'],
    39     39               [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'],
    40     40               [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'],
    41     41               [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'],
    42     42               [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'],
    43     43               [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],
    44     44               [/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`]/i, 'ELPFX'],
................................................................................
    58     58               this.sLang = "default";
    59     59           }
    60     60           this.aRules = aTkzPatterns[this.sLang];
    61     61       }
    62     62   
    63     63       * genTokens (sText) {
    64     64           let m;
    65         -        let i = 0;
           65  +        let iNext = 0;
    66     66           while (sText) {
    67         -            let nCut = 1;
           67  +            let iCut = 1;
           68  +            let iToken = 0;
    68     69               for (let [zRegex, sType] of this.aRules) {
    69     70                   try {
    70     71                       if ((m = zRegex.exec(sText)) !== null) {
    71         -                        if (sType == 'SEPARATOR') {
    72         -                            for (let c of m[0]) {
    73         -                                yield { "sType": sType, "sValue": c, "nStart": i, "nEnd": i + m[0].length }
    74         -                            }
    75         -                        } else {
    76         -                            yield { "sType": sType, "sValue": m[0], "nStart": i, "nEnd": i + m[0].length }
    77         -                        }
    78         -                        nCut = m[0].length;
           72  +                        iToken += 1;
           73  +                        yield { "i": iToken, "sType": sType, "sValue": m[0], "nStart": iNext, "nEnd": iNext + m[0].length }
           74  +                        iCut = m[0].length;
    79     75                           break;
    80     76                       }
    81     77                   }
    82     78                   catch (e) {
    83     79                       helpers.logerror(e);
    84     80                   }
    85     81               }
    86         -            i += nCut;
    87         -            sText = sText.slice(nCut);
           82  +            iNext += iCut;
           83  +            sText = sText.slice(iCut);
    88     84           }
    89     85       }
    90     86   }
    91     87   
    92     88   
    93     89   if (typeof(exports) !== 'undefined') {
    94     90       exports.Tokenizer = Tokenizer;
    95     91   }

Modified graphspell/tokenizer.py from [17f452887e] to [b3cbfe75ea].

     3      3   import re
     4      4   
     5      5   _PATTERNS = {
     6      6       "default":
     7      7           (
     8      8               r'(?P<FOLDERUNIX>/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()-]+)*)',
     9      9               r'(?P<FOLDERWIN>[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)',
    10         -            r'(?P<PUNC>[.,?!:;…«»“”"()/·]+)',
           10  +            r'(?P<PUNC>[][,.;:!?…«»“”‘’"(){}/·–—])',
    11     11               r'(?P<ACRONYM>[A-Z][.][A-Z][.](?:[A-Z][.])*)',
    12     12               r'(?P<LINK>(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)',
    13     13               r'(?P<HASHTAG>[#@][\w-]+)',
    14     14               r'(?P<HTML><\w+.*?>|</\w+ *>)',
    15     15               r'(?P<PSEUDOHTML>\[/?\w+\])',
    16     16               r'(?P<HOUR>\d\d?h\d\d\b)',
    17     17               r'(?P<NUM>-?\d+(?:[.,]\d+))',
    18     18               r"(?P<WORD>\w+(?:[’'`-]\w+)*)"
    19     19           ),
    20     20       "fr":
    21     21           (
    22     22               r'(?P<FOLDERUNIX>/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()-]+)*)',
    23     23               r'(?P<FOLDERWIN>[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)',
    24         -            r'(?P<PUNC>[.,?!:;…«»“”"()/·]+)',
           24  +            r'(?P<PUNC>[][,.;:!?…«»“”‘’"(){}/·–—])',
    25     25               r'(?P<ACRONYM>[A-Z][.][A-Z][.](?:[A-Z][.])*)',
    26     26               r'(?P<LINK>(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)',
    27     27               r'(?P<HASHTAG>[#@][\w-]+)',
    28     28               r'(?P<HTML><\w+.*?>|</\w+ *>)',
    29     29               r'(?P<PSEUDOHTML>\[/?\w+\])',
    30     30               r"(?P<ELPFX>(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`])",
    31     31               r'(?P<ORDINAL>\d+(?:er|nd|e|de|ième|ème|eme)\b)',
................................................................................
    41     41       def __init__ (self, sLang):
    42     42           self.sLang = sLang
    43     43           if sLang not in _PATTERNS:
    44     44               self.sLang = "default"
    45     45           self.zToken = re.compile( "(?i)" + '|'.join(sRegex for sRegex in _PATTERNS[sLang]) )
    46     46   
    47     47       def genTokens (self, sText):
    48         -        for m in self.zToken.finditer(sText):
    49         -            yield { "sType": m.lastgroup, "sValue": m.group(), "nStart": m.start(), "nEnd": m.end() }
           48  +        for i, m in enumerate(self.zToken.finditer(sText), 1):
           49  +            yield { "i": i, "sType": m.lastgroup, "sValue": m.group(), "nStart": m.start(), "nEnd": m.end() }