Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.
Overview
Comment: | [graphspell][fr] tokenisation: +signes €$# (faux positif) |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk | fr | graphspell |
Files: | files | file ages | folders |
SHA3-256: | 365d3554c78bff6e9e27c69cff0ff290 |
User & Date: | olr 2019-02-22 11:53:36 |
Context
2019-02-22
| Time | Check-in | |
|---|---|---|
17:10 | [fr] faux positif: confusion peut/peu check-in: d6657f0c8b user: olr tags: fr, trunk | |
11:53 | [graphspell][fr] tokenisation: +signes €$# (faux positif) check-in: 365d3554c7 user: olr tags: fr, graphspell, trunk | |
11:27 | [fr] merge last commit (cherrypick) check-in: 90dc1dc3c1 user: olr tags: fr, trunk | |
Changes
Changes to gc_lang/fr/rules.grx.
12547 12547 -2>> =suggPlur(\2) # Accord de nombre erroné : « \2 » devrait être au pluriel. 12548 12548 12549 12549 TEST: 00 heure, 01 heure 12550 12550 TEST: il a adopté 1 {{chiens}}. 12551 12551 TEST: 22 {{heure}} 12552 12552 TEST: 3 {{heure}} 12553 12553 TEST: les élèves sont inquiets après une année 2018 compliquée et riche en réformes. 12554 -TEST: ils gagnent 3000 € maximum. 12554 +TEST: ils gagneront 300 € maximum. 12555 12555 12556 12556 12557 12557 ## trouver ça/ceci/cela + adj 12558 12558 __gn_trouver_ça_adj__ 12559 12559 >trouver [ça|cela|ceci] @:A.*:(?:f|m:p)¬:(?:G|3[sp]|M) 12560 12560 <<- /gn/ -3>> =suggMasSing(\3) # Trouver \2 + [adjectif] : l’adjectif s’accorde avec “\2” (au masculin singulier). 12561 12561
Changes to graphspell-js/tokenizer.js.
20 20 [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'], 21 21 [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'], 22 22 [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'], 23 23 [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'], 24 24 [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'], 25 25 [/^\d\d?h\d\d\b/, 'HOUR'], 26 26 [/^\d+(?:[.,]\d+|)/, 'NUM'], 27 - [/^[%‰+=*/<>⩾⩽-]/, 'SIGN'], 27 + [/^[%‰€$+=*/<>⩾⩽#-]/, 'SIGN'], 28 28 [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-stᴀ-ᶿ_]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-stᴀ-ᶿ_]+)*/, 'WORD'] 29 29 ], 30 30 "fr": 31 31 [ 32 32 [/^[ \t]+/, 'SPACE'], 33 33 [/^\/(?:~|bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERUNIX'], 34 34 [/^[a-zA-Z]:\\(?:Program Files(?: \(x86\)|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERWIN'], ................................................................................ 39 39 [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'], 40 40 [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'], 41 41 [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'], 42 42 [/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`]/i, 'WORD_ELIDED'], 43 43 [/^\d\d?[hm]\d\d\b/, 'HOUR'], 44 44 [/^\d+(?:ers?\b|nds?\b|es?\b|des?\b|ièmes?\b|èmes?\b|emes?\b|ᵉʳˢ?|ⁿᵈˢ?|ᵉˢ?|ᵈᵉˢ?)/, 'WORD_ORDINAL'], 45 45 [/^\d+(?:[.,]\d+|)/, 'NUM'], 46 - [/^[%‰+=*/<>⩾⩽-]/, 'SIGN'], 46 + [/^[%‰€$+=*/<>⩾⩽#-]/, 'SIGN'], 47 47 [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-stᴀ-ᶿᵉʳˢⁿᵈ_]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-stᴀ-ᶿᵉʳˢⁿᵈ_]+)*/, 'WORD'] 48 48 ] 49 49 }; 50 50 51 51 52 52 class Tokenizer { 53 53
Changes to graphspell/tokenizer.py.
14 14 r'(?P<WORD_ACRONYM>[A-Z][.][A-Z][.](?:[A-Z][.])*)', 15 15 r'(?P<LINK>(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)', 16 16 r'(?P<HASHTAG>[#@][\w-]+)', 17 17 r'(?P<HTML><\w+.*?>|</\w+ *>)', 18 18 r'(?P<PSEUDOHTML>\[/?\w+\])', 19 19 r'(?P<HOUR>\d\d?h\d\d\b)', 20 20 r'(?P<NUM>\d+(?:[.,]\d+))', 21 - r'(?P<SIGN>[%‰+=*/<>⩾⩽-])', 21 + r'(?P<SIGN>[%‰€$+=*/<>⩾⩽#-])', 22 22 r"(?P<WORD>\w+(?:[’'`-]\w+)*)" 23 23 ), 24 24 "fr": 25 25 ( 26 26 r'(?P<FOLDERUNIX>/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()-]+)*)', 27 27 r'(?P<FOLDERWIN>[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)', 28 28 r'(?P<PUNC>[][,.;:!?…«»“”‘’"(){}·–—])', ................................................................................ 31 31 r'(?P<HASHTAG>[#@][\w-]+)', 32 32 r'(?P<HTML><\w+.*?>|</\w+ *>)', 33 33 r'(?P<PSEUDOHTML>\[/?\w+\])', 34 34 r"(?P<WORD_ELIDED>(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`])", 35 35 r'(?P<WORD_ORDINAL>\d+(?:ers?|nds?|es?|des?|ièmes?|èmes?|emes?|ᵉʳˢ?|ⁿᵈˢ?|ᵉˢ?|ᵈᵉˢ?)\b)', 36 36 r'(?P<HOUR>\d\d?h\d\d\b)', 37 37 r'(?P<NUM>\d+(?:[.,]\d+|))', 38 - r'(?P<SIGN>[%‰+=*/<>⩾⩽-])', 38 + r'(?P<SIGN>[%‰€$+=*/<>⩾⩽#-])', 39 39 r"(?P<WORD>\w+(?:[’'`-]\w+)*)" 40 40 ) 41 41 } 42 42 43 43 44 44 class Tokenizer: 45 45 "Tokenizer: transforms a text in a list of tokens"