Grammalecte  Check-in [843c0244bc]

Overview
Comment:[core] tokenizer: better regex for URLs and folders
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | core
Files: files | file ages | folders
SHA3-256: 843c0244bc490adb380b21d092fbe14195d154aa6581839d66cebe9b33e24e79
User & Date: olr on 2017-10-26 05:49:28
Other Links: manifest | tags
Context
2017-10-26
07:52
[fx] CSS: min-height for tooltip suggestions (prevent a flat line if there are only spaces) check-in: b6c94cd7cb user: olr tags: fx, trunk
05:49
[core] tokenizer: better regex for URLs and folders check-in: 843c0244bc user: olr tags: core, trunk
2017-10-25
18:34
[core][bug] fix tokenizer for URL check-in: ee7d44a3ee user: olr tags: core, trunk
Changes

Modified gc_core/js/tokenizer.js from [de468b4358] to [9bb6ea03fb].

    12     12   
    13     13   
    14     14   const aTkzPatterns = {
    15     15       // All regexps must start with ^.
    16     16       "default":
    17     17           [
    18     18               [/^[   \t]+/, 'SPACE'],
    19         -            [/^\/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)*/, 'FOLDER'],
    20         -            [/^[a-zA-Z]:\\(?:Program Files(?: \(x86\)|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)*/, 'FOLDER'],
           19  +            [/^\/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDER'],
           20  +            [/^[a-zA-Z]:\\(?:Program Files(?: \(x86\)|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDER'],
    21     21               [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'],
    22         -            [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_]+[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'],
           22  +            [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'],
    23     23               [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'],
    24     24               [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'],
    25     25               [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'],
    26     26               [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],
    27     27               [/^\d\d?h\d\d\b/, 'HOUR'],
    28     28               [/^-?\d+(?:[.,]\d+|)/, 'NUM'],
    29     29               [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+)*/, 'WORD']
    30     30           ],
    31     31       "fr":
    32     32           [
    33     33               [/^[   \t]+/, 'SPACE'],
    34         -            [/^\/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)*/, 'FOLDER'],
    35         -            [/^[a-zA-Z]:\\(?:Program Files(?: \(x86\)|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)*/, 'FOLDER'],
           34  +            [/^\/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDER'],
           35  +            [/^[a-zA-Z]:\\(?:Program Files(?: \(x86\)|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDER'],
    36     36               [/^[,.;:!?…«»“”‘’"(){}\[\]/·–—]+/, 'SEPARATOR'],
    37         -            [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_]+[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'],
           37  +            [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'],
    38     38               [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'],
    39     39               [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'],
    40     40               [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'],
    41     41               [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],
    42     42               [/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`]/i, 'ELPFX'],
    43     43               [/^\d\d?[hm]\d\d\b/, 'HOUR'],
    44     44               [/^\d+(?:er|nd|e|de|ième|ème|eme)s?\b/, 'ORDINAL'],

Modified gc_core/py/tokenizer.py from [829b056f2c] to [5a9c0c9105].

     1      1   # Very simple tokenizer
     2      2   
     3      3   import re
     4      4   
     5      5   _PATTERNS = {
     6      6       "default":
     7      7           (
     8         -            r'(?P<FOLDER1>/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()]+)*)',
     9         -            r'(?P<FOLDER2>[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()]+)*)',
            8  +            r'(?P<FOLDER1>/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()-]+)*)',
            9  +            r'(?P<FOLDER2>[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)',
    10     10               r'(?P<PUNC>[.,?!:;…«»“”"()/·]+)',
    11         -            r'(?P<LINK>(?:https?://|www[.]|\w+[@.]\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)',
           11  +            r'(?P<LINK>(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)',
    12     12               r'(?P<HASHTAG>[#@][\w-]+)',
    13     13               r'(?P<HTML><\w+.*?>|</\w+ *>)',
    14     14               r'(?P<PSEUDOHTML>\[/?\w+\])',
    15     15               r'(?P<HOUR>\d\d?h\d\d\b)',
    16     16               r'(?P<NUM>-?\d+(?:[.,]\d+))',
    17     17               r"(?P<WORD>\w+(?:[’'`-]\w+)*)"
    18     18           ),
    19     19       "fr":
    20     20           (
    21         -            r'(?P<FOLDER1>/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()]+)*)',
    22         -            r'(?P<FOLDER2>[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()]+)*)',
           21  +            r'(?P<FOLDER1>/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()-]+)*)',
           22  +            r'(?P<FOLDER2>[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)',
    23     23               r'(?P<PUNC>[.,?!:;…«»“”"()/·]+)',
    24         -            r'(?P<LINK>(?:https?://|www[.]|\w+[@.]\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)',
           24  +            r'(?P<LINK>(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)',
    25     25               r'(?P<HASHTAG>[#@][\w-]+)',
    26     26               r'(?P<HTML><\w+.*?>|</\w+ *>)',
    27     27               r'(?P<PSEUDOHTML>\[/?\w+\])',
    28     28               r"(?P<ELPFX>(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`])",
    29     29               r'(?P<ORDINAL>\d+(?:er|nd|e|de|ième|ème|eme)\b)',
    30     30               r'(?P<HOUR>\d\d?h\d\d\b)',
    31     31               r'(?P<NUM>-?\d+(?:[.,]\d+|))',