Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | [graphspell][fr] tokenisation: +signes €$# (faux positif) |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk | fr | graphspell |
Files: | files | file ages | folders |
SHA3-256: | 365d3554c78bff6e9e27c69cff0ff290 |
User & Date: | olr 2019-02-22 11:53:36 |
Context
2019-02-22
| ||
17:10 | [fr] faux positif: confusion peut/peu check-in: d6657f0c8b user: olr tags: fr, trunk | |
11:53 | [graphspell][fr] tokenisation: +signes €$# (faux positif) check-in: 365d3554c7 user: olr tags: fr, graphspell, trunk | |
11:27 | [fr] merge last commit (cherrypick) check-in: 90dc1dc3c1 user: olr tags: fr, trunk | |
Changes
Changes to gc_lang/fr/rules.grx.
12547 12548 12549 12550 12551 12552 12553 12554 12555 12556 12557 12558 12559 12560 12561 |
-2>> =suggPlur(\2) # Accord de nombre erroné : « \2 » devrait être au pluriel.
TEST: 00 heure, 01 heure
TEST: il a adopté 1 {{chiens}}.
TEST: 22 {{heure}}
TEST: 3 {{heure}}
TEST: les élèves sont inquiets après une année 2018 compliquée et riche en réformes.
TEST: ils gagnent 3000 € maximum.
## trouver ça/ceci/cela + adj
__gn_trouver_ça_adj__
>trouver [ça|cela|ceci] @:A.*:(?:f|m:p)¬:(?:G|3[sp]|M)
<<- /gn/ -3>> =suggMasSing(\3) # Trouver \2 + [adjectif] : l’adjectif s’accorde avec “\2” (au masculin singulier).
|
| |
12547 12548 12549 12550 12551 12552 12553 12554 12555 12556 12557 12558 12559 12560 12561 |
-2>> =suggPlur(\2) # Accord de nombre erroné : « \2 » devrait être au pluriel.
TEST: 00 heure, 01 heure
TEST: il a adopté 1 {{chiens}}.
TEST: 22 {{heure}}
TEST: 3 {{heure}}
TEST: les élèves sont inquiets après une année 2018 compliquée et riche en réformes.
TEST: ils gagneront 300 € maximum.
## trouver ça/ceci/cela + adj
__gn_trouver_ça_adj__
>trouver [ça|cela|ceci] @:A.*:(?:f|m:p)¬:(?:G|3[sp]|M)
<<- /gn/ -3>> =suggMasSing(\3) # Trouver \2 + [adjectif] : l’adjectif s’accorde avec “\2” (au masculin singulier).
|
Changes to graphspell-js/tokenizer.js.
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
..
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
|
[/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'],
[/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'],
[/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'],
[/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'],
[/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],
[/^\d\d?h\d\d\b/, 'HOUR'],
[/^\d+(?:[.,]\d+|)/, 'NUM'],
[/^[%‰+=*/<>⩾⩽-]/, 'SIGN'],
[/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-stᴀ-ᶿ_]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-stᴀ-ᶿ_]+)*/, 'WORD']
],
"fr":
[
[/^[ \t]+/, 'SPACE'],
[/^\/(?:~|bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERUNIX'],
[/^[a-zA-Z]:\\(?:Program Files(?: \(x86\)|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERWIN'],
................................................................................
[/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'],
[/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'],
[/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],
[/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`]/i, 'WORD_ELIDED'],
[/^\d\d?[hm]\d\d\b/, 'HOUR'],
[/^\d+(?:ers?\b|nds?\b|es?\b|des?\b|ièmes?\b|èmes?\b|emes?\b|ᵉʳˢ?|ⁿᵈˢ?|ᵉˢ?|ᵈᵉˢ?)/, 'WORD_ORDINAL'],
[/^\d+(?:[.,]\d+|)/, 'NUM'],
[/^[%‰+=*/<>⩾⩽-]/, 'SIGN'],
[/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-stᴀ-ᶿᵉʳˢⁿᵈ_]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-stᴀ-ᶿᵉʳˢⁿᵈ_]+)*/, 'WORD']
]
};
class Tokenizer {
|
|
|
|
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
..
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
|
[/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.\/?&!%=+*"'@$#-]+/, 'LINK'], [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_-]+/, 'TAG'], [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'], [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'], [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'], [/^\d\d?h\d\d\b/, 'HOUR'], [/^\d+(?:[.,]\d+|)/, 'NUM'], [/^[%‰€$+=*/<>⩾⩽#-]/, 'SIGN'], [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-stᴀ-ᶿ_]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-stᴀ-ᶿ_]+)*/, 'WORD'] ], "fr": [ [/^[ \t]+/, 'SPACE'], [/^\/(?:~|bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERUNIX'], [/^[a-zA-Z]:\\(?:Program Files(?: \(x86\)|)|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st.()]+)(?:\\[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st_.()-]+)*/, 'FOLDERWIN'], ................................................................................ [/^<[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+.*?>|<\/[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+ *>/, 'HTML'], [/^\[\/?[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-st]+\]/, 'PSEUDOHTML'], [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'], [/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`]/i, 'WORD_ELIDED'], [/^\d\d?[hm]\d\d\b/, 'HOUR'], [/^\d+(?:ers?\b|nds?\b|es?\b|des?\b|ièmes?\b|èmes?\b|emes?\b|ᵉʳˢ?|ⁿᵈˢ?|ᵉˢ?|ᵈᵉˢ?)/, 'WORD_ORDINAL'], [/^\d+(?:[.,]\d+|)/, 'NUM'], [/^[%‰€$+=*/<>⩾⩽#-]/, 'SIGN'], [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-stᴀ-ᶿᵉʳˢⁿᵈ_]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯfi-stᴀ-ᶿᵉʳˢⁿᵈ_]+)*/, 'WORD'] ] }; class Tokenizer { |
Changes to graphspell/tokenizer.py.
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
..
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
|
r'(?P<WORD_ACRONYM>[A-Z][.][A-Z][.](?:[A-Z][.])*)',
r'(?P<LINK>(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)',
r'(?P<HASHTAG>[#@][\w-]+)',
r'(?P<HTML><\w+.*?>|</\w+ *>)',
r'(?P<PSEUDOHTML>\[/?\w+\])',
r'(?P<HOUR>\d\d?h\d\d\b)',
r'(?P<NUM>\d+(?:[.,]\d+))',
r'(?P<SIGN>[%‰+=*/<>⩾⩽-])',
r"(?P<WORD>\w+(?:[’'`-]\w+)*)"
),
"fr":
(
r'(?P<FOLDERUNIX>/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()-]+)*)',
r'(?P<FOLDERWIN>[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)',
r'(?P<PUNC>[][,.;:!?…«»“”‘’"(){}·–—])',
................................................................................
r'(?P<HASHTAG>[#@][\w-]+)',
r'(?P<HTML><\w+.*?>|</\w+ *>)',
r'(?P<PSEUDOHTML>\[/?\w+\])',
r"(?P<WORD_ELIDED>(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`])",
r'(?P<WORD_ORDINAL>\d+(?:ers?|nds?|es?|des?|ièmes?|èmes?|emes?|ᵉʳˢ?|ⁿᵈˢ?|ᵉˢ?|ᵈᵉˢ?)\b)',
r'(?P<HOUR>\d\d?h\d\d\b)',
r'(?P<NUM>\d+(?:[.,]\d+|))',
r'(?P<SIGN>[%‰+=*/<>⩾⩽-])',
r"(?P<WORD>\w+(?:[’'`-]\w+)*)"
)
}
class Tokenizer:
"Tokenizer: transforms a text in a list of tokens"
|
|
|
|
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
..
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
|
r'(?P<WORD_ACRONYM>[A-Z][.][A-Z][.](?:[A-Z][.])*)', r'(?P<LINK>(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)', r'(?P<HASHTAG>[#@][\w-]+)', r'(?P<HTML><\w+.*?>|</\w+ *>)', r'(?P<PSEUDOHTML>\[/?\w+\])', r'(?P<HOUR>\d\d?h\d\d\b)', r'(?P<NUM>\d+(?:[.,]\d+))', r'(?P<SIGN>[%‰€$+=*/<>⩾⩽#-])', r"(?P<WORD>\w+(?:[’'`-]\w+)*)" ), "fr": ( r'(?P<FOLDERUNIX>/(?:bin|boot|dev|etc|home|lib|mnt|opt|root|sbin|tmp|usr|var|Bureau|Documents|Images|Musique|Public|Téléchargements|Vidéos)(?:/[\w.()-]+)*)', r'(?P<FOLDERWIN>[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)', r'(?P<PUNC>[][,.;:!?…«»“”‘’"(){}·–—])', ................................................................................ r'(?P<HASHTAG>[#@][\w-]+)', r'(?P<HTML><\w+.*?>|</\w+ *>)', r'(?P<PSEUDOHTML>\[/?\w+\])', r"(?P<WORD_ELIDED>(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu)['’`])", r'(?P<WORD_ORDINAL>\d+(?:ers?|nds?|es?|des?|ièmes?|èmes?|emes?|ᵉʳˢ?|ⁿᵈˢ?|ᵉˢ?|ᵈᵉˢ?)\b)', r'(?P<HOUR>\d\d?h\d\d\b)', r'(?P<NUM>\d+(?:[.,]\d+|))', r'(?P<SIGN>[%‰€$+=*/<>⩾⩽#-])', r"(?P<WORD>\w+(?:[’'`-]\w+)*)" ) } class Tokenizer: "Tokenizer: transforms a text in a list of tokens" |