Grammalecte  Check-in [365d3554c7]

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:[graphspell][fr] tokenisation: +signes €$# (faux positif)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | fr | graphspell
Files: files | file ages | folders
SHA3-256:365d3554c78bff6e9e27c69cff0ff29013eddd69347218d6644331f5bc107aa4
User & Date: olr 2019-02-22 11:53:36
Context
2019-02-22
17:10
[fr] faux positif: confusion peut/peu check-in: d6657f0c8b user: olr tags: fr, trunk
11:53
[graphspell][fr] tokenisation: +signes €$# (faux positif) check-in: 365d3554c7 user: olr tags: fr, graphspell, trunk
11:27
[fr] merge last commit (cherrypick) check-in: 90dc1dc3c1 user: olr tags: fr, trunk
Changes

Changes to gc_lang/fr/rules.grx.

 12547  12547           -2>> =suggPlur(\2)                                                                  # Accord de nombre erroné : « \2 » devrait être au pluriel.
 12548  12548   
 12549  12549   TEST: 00 heure, 01 heure
 12550  12550   TEST: il a adopté 1 {{chiens}}.
 12551  12551   TEST: 22 {{heure}}
 12552  12552   TEST: 3 {{heure}}
 12553  12553   TEST: les élèves sont inquiets après une année 2018 compliquée et riche en réformes.
 12554         -TEST: ils gagnent 3000 € maximum.
        12554  +TEST: ils gagneront 300 € maximum.
 12555  12555   
 12556  12556   
 12557  12557   ##  trouver ça/ceci/cela + adj
 12558  12558   __gn_trouver_ça_adj__
 12559  12559       >trouver  [ça|cela|ceci]  @:A.*:(?:f|m:p)¬:(?:G|3[sp]|M)
 12560  12560           <<- /gn/ -3>> =suggMasSing(\3)                                              # Trouver \2 + [adjectif] : l’adjectif s’accorde avec “\2” (au masculin singulier).
 12561  12561   

Changes to graphspell-js/tokenizer.js.

    20     20               [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'],
    21     21               [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'],
    22     22               [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'],
    23     23               [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'],
    24     24               [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],
    25     25               [/^\d\d?h\d\d\b/, 'HOUR'],
    26     26               [/^\d+(?:[.,]\d+|)/, 'NUM'],
    27         -            [/^[%‰+=*/<>⩾⩽-]/, 'SIGN'],
           27  +            [/^[%‰€$+=*/<>⩾⩽#-]/, 'SIGN'],
    28     28               [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-stᴀ-ᶿ_]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-stᴀ-ᶿ_]+)*/, 'WORD']
    29     29           ],
    30     30       "fr":
    31     31           [
    32     32               [/^[   \t]+/, 'SPACE'],
    33     33               [/^\/(?:~|bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERUNIX'],
    34     34               [/^[a-zA-Z]:\\(?:Program Files(?: \(x86\)|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERWIN'],
................................................................................
    39     39               [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'],
    40     40               [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'],
    41     41               [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],
    42     42               [/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`]/i, 'WORD_ELIDED'],
    43     43               [/^\d\d?[hm]\d\d\b/, 'HOUR'],
    44     44               [/^\d+(?:ers?\b|nds?\b|es?\b|des?\b|ièmes?\b|èmes?\b|emes?\b|ᵉʳˢ?|ⁿᵈˢ?|ᵉˢ?|ᵈᵉˢ?)/, 'WORD_ORDINAL'],
    45     45               [/^\d+(?:[.,]\d+|)/, 'NUM'],
    46         -            [/^[%‰+=*/<>⩾⩽-]/, 'SIGN'],
           46  +            [/^[%‰€$+=*/<>⩾⩽#-]/, 'SIGN'],
    47     47               [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-stᴀ-ᶿᵉʳˢⁿᵈ_]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-stᴀ-ᶿᵉʳˢⁿᵈ_]+)*/, 'WORD']
    48     48           ]
    49     49   };
    50     50   
    51     51   
    52     52   class Tokenizer {
    53     53   

Changes to graphspell/tokenizer.py.

    14     14               r'(?P<WORD_ACRONYM>[A-Z][.][A-Z][.](?:[A-Z][.])*)',
    15     15               r'(?P<LINK>(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)',
    16     16               r'(?P<HASHTAG>[#@][\w-]+)',
    17     17               r'(?P<HTML><\w+.*?>|</\w+ *>)',
    18     18               r'(?P<PSEUDOHTML>\[/?\w+\])',
    19     19               r'(?P<HOUR>\d\d?h\d\d\b)',
    20     20               r'(?P<NUM>\d+(?:[.,]\d+))',
    21         -            r'(?P<SIGN>[%‰+=*/<>⩾⩽-])',
           21  +            r'(?P<SIGN>[%‰€$+=*/<>⩾⩽#-])',
    22     22               r"(?P<WORD>\w+(?:[’'`-]\w+)*)"
    23     23           ),
    24     24       "fr":
    25     25           (
    26     26               r'(?P<FOLDERUNIX>/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()-]+)*)',
    27     27               r'(?P<FOLDERWIN>[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)',
    28     28               r'(?P<PUNC>[][,.;:!?…«»“”‘’"(){}·–—])',
................................................................................
    31     31               r'(?P<HASHTAG>[#@][\w-]+)',
    32     32               r'(?P<HTML><\w+.*?>|</\w+ *>)',
    33     33               r'(?P<PSEUDOHTML>\[/?\w+\])',
    34     34               r"(?P<WORD_ELIDED>(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`])",
    35     35               r'(?P<WORD_ORDINAL>\d+(?:ers?|nds?|es?|des?|ièmes?|èmes?|emes?|ᵉʳˢ?|ⁿᵈˢ?|ᵉˢ?|ᵈᵉˢ?)\b)',
    36     36               r'(?P<HOUR>\d\d?h\d\d\b)',
    37     37               r'(?P<NUM>\d+(?:[.,]\d+|))',
    38         -            r'(?P<SIGN>[%‰+=*/<>⩾⩽-])',
           38  +            r'(?P<SIGN>[%‰€$+=*/<>⩾⩽#-])',
    39     39               r"(?P<WORD>\w+(?:[’'`-]\w+)*)"
    40     40           )
    41     41   }
    42     42   
    43     43   
    44     44   class Tokenizer:
    45     45       "Tokenizer: transforms a text in a list of tokens"