Grammalecte  Check-in [b68161b398]

Overview
Comment:[graphspell] tokenizer and suggestion engine: other apostrophes
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | graphspell
Files: files | file ages | folders
SHA3-256: b68161b39849bb8fe242ed6d3c766a3845c248f7b76f4cc227e4b9b7421d4f2a
User & Date: olr on 2020-05-07 10:35:13
Other Links: manifest | tags
Context
2020-05-07
11:33
[fr] ajustements, +nr: iel·s +état +ppas check-in: f117d9d93a user: olr tags: fr, trunk
10:35
[graphspell] tokenizer and suggestion engine: other apostrophes check-in: b68161b398 user: olr tags: graphspell, trunk
10:34
[fr] ajustements: apostrophes check-in: c379fb8706 user: olr tags: fr, trunk
Changes

Modified graphspell-js/char_player.js from [76dbb40bcc] to [3caadd8250].

105
106
107
108
109
110
111













112
113
114
115
116
117
118
    aConsonant: new Set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ"),
    aDouble: new Set("bcdfjklmnprstzBCDFJKLMNPRSTZ"),  // letters that may be used twice successively


    // Similar chars

    d1to1: new Map([













        ["1", "1₁liîLIÎ"],
        ["2", "2₂zZ"],
        ["3", "3₃eéèêEÉÈÊ"],
        ["4", "4₄aàâAÀÂ"],
        ["5", "5₅sgSG"],
        ["6", "6₆bdgBDG"],
        ["7", "7₇ltLT"],







>
>
>
>
>
>
>
>
>
>
>
>
>







105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
    aConsonant: new Set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ"),
    aDouble: new Set("bcdfjklmnprstzBCDFJKLMNPRSTZ"),  // letters that may be used twice successively


    // Similar chars

    d1to1: new Map([
        ["'", "'’"],    // U+0027: apostrophe droite
        ["’", "’"],     // U+2019: apostrophe typographique  (sera utilisée par défaut)
        ["ʼ", "ʼ’"],    // U+02BC: Lettre modificative apostrophe
        ["‘", "‘’"],    // U+2018: guillemet-apostrophe culbuté
        ["‛", "‛’"],    // U+201B: guillemet-virgule supérieur culbuté
        ["´", "´’"],    // U+00B4: accent aigu
        ["`", "`’"],    // U+0060: accent grave
        ["′", "′’"],    // U+2032: prime
        ["‵", "‵’"],    // U+2035: prime réfléchi
        ["՚", "՚’"],    // U+055A: apostrophe arménienne
        ["ꞌ", "ꞌ’"],    // U+A78C: latin minuscule saltillo
        ["Ꞌ", "Ꞌ’"],    // U+A78B: latin majuscule saltillo

        ["1", "1₁liîLIÎ"],
        ["2", "2₂zZ"],
        ["3", "3₃eéèêEÉÈÊ"],
        ["4", "4₄aàâAÀÂ"],
        ["5", "5₅sgSG"],
        ["6", "6₆bdgBDG"],
        ["7", "7₇ltLT"],

Modified graphspell-js/tokenizer.js from [efabea9cdf] to [0e7b889227].

35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
            [/^[,.;:!?…«»“”‘’"(){}\[\]·–—¿¡]/, 'PUNC'],
            [/^[A-Z][.][A-Z][.](?:[A-Z][.])*/, 'WORD_ACRONYM'],
            [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬀ-ﬆ_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬀ-ﬆ_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬀ-ﬆ_.\/?&!%=+*"'@$#-]+/, 'LINK'],
            [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬀ-ﬆ_-]+/, 'TAG'],
            [/^<[a-zA-Z]+.*?>|^<\/[a-zA-Z]+ *>/, 'HTML'],
            [/^\[\/?[a-zA-Z]+\]/, 'PSEUDOHTML'],
            [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],
            [/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu|presqu|quelqu)['’´‘′`ʼ]/i, 'WORD_ELIDED'],
            [/^\d\d?[h:]\d\d(?:[m:]\d\ds?|)\b/, 'HOUR'],
            [/^\d+(?:ers?\b|res?\b|è[rm]es?\b|i[èe][mr]es?\b|de?s?\b|nde?s?\b|ès?\b|es?\b|ᵉʳˢ?|ʳᵉˢ?|ᵈᵉ?ˢ?|ⁿᵈᵉ?ˢ?|ᵉˢ?)/, 'WORD_ORDINAL'],
            [/^\d+(?:[.,]\d+|)/, 'NUM'],
            [/^[&%‰€$+±=*/<>⩾⩽#|×¥£§¢¬÷@-]/, 'SIGN'],
            [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬀ-ﬆᴀ-ᶿ\u0300-\u036fᵉʳˢⁿᵈ_]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬀ-ﬆᴀ-ᶿ\u0300-\u036fᵉʳˢⁿᵈ_]+)*/, 'WORD']
        ]
};







|







35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
            [/^[,.;:!?…«»“”‘’"(){}\[\]·–—¿¡]/, 'PUNC'],
            [/^[A-Z][.][A-Z][.](?:[A-Z][.])*/, 'WORD_ACRONYM'],
            [/^(?:https?:\/\/|www[.]|[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬀ-ﬆ_-]+[@.][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬀ-ﬆ_-]{2,}[@.])[a-zA-Z0-9][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬀ-ﬆ_.\/?&!%=+*"'@$#-]+/, 'LINK'],
            [/^[#@][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬀ-ﬆ_-]+/, 'TAG'],
            [/^<[a-zA-Z]+.*?>|^<\/[a-zA-Z]+ *>/, 'HTML'],
            [/^\[\/?[a-zA-Z]+\]/, 'PSEUDOHTML'],
            [/^&\w+;(?:\w+;|)/, 'HTMLENTITY'],
            [/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu|presqu|quelqu)['’ʼ‘‛´`′‵՚ꞌꞋ]/i, 'WORD_ELIDED'],
            [/^\d\d?[h:]\d\d(?:[m:]\d\ds?|)\b/, 'HOUR'],
            [/^\d+(?:ers?\b|res?\b|è[rm]es?\b|i[èe][mr]es?\b|de?s?\b|nde?s?\b|ès?\b|es?\b|ᵉʳˢ?|ʳᵉˢ?|ᵈᵉ?ˢ?|ⁿᵈᵉ?ˢ?|ᵉˢ?)/, 'WORD_ORDINAL'],
            [/^\d+(?:[.,]\d+|)/, 'NUM'],
            [/^[&%‰€$+±=*/<>⩾⩽#|×¥£§¢¬÷@-]/, 'SIGN'],
            [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬀ-ﬆᴀ-ᶿ\u0300-\u036fᵉʳˢⁿᵈ_]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯﬀ-ﬆᴀ-ᶿ\u0300-\u036fᵉʳˢⁿᵈ_]+)*/, 'WORD']
        ]
};

Modified graphspell/char_player.py from [fff304a6c5] to [fa338bf2f3].

91
92
93
94
95
96
97













98
99
100
101
102
103
104
aConsonant = set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ")
aDouble = set("bcdfjklmnprstzBCDFJKLMNPRSTZ")  # letters that may be used twice successively


# Similar chars

d1to1 = {













    "1": "1₁liîLIÎ",
    "2": "2₂zZ",
    "3": "3₃eéèêEÉÈÊ",
    "4": "4₄aàâAÀÂ",
    "5": "5₅sgSG",
    "6": "6₆bdgBDG",
    "7": "7₇ltLT",







>
>
>
>
>
>
>
>
>
>
>
>
>







91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
aConsonant = set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ")
aDouble = set("bcdfjklmnprstzBCDFJKLMNPRSTZ")  # letters that may be used twice successively


# Similar chars

d1to1 = {
    "'": "'’",  # U+0027: apostrophe droite
    "’": "’",   # U+2019: apostrophe typographique  (sera utilisée par défaut)
    "ʼ": "ʼ’",  # U+02BC: Lettre modificative apostrophe
    "‘": "‘’",  # U+2018: guillemet-apostrophe culbuté
    "‛": "‛’",  # U+201B: guillemet-virgule supérieur culbuté
    "´": "´’",  # U+00B4: accent aigu
    "`": "`’",  # U+0060: accent grave
    "′": "′’",  # U+2032: prime
    "‵": "‵’",  # U+2035: prime réfléchi
    "՚": "՚’",  # U+055A: apostrophe arménienne
    "ꞌ": "ꞌ’",  # U+A78C: latin minuscule saltillo
    "Ꞌ": "Ꞌ’",  # U+A78B: latin majuscule saltillo

    "1": "1₁liîLIÎ",
    "2": "2₂zZ",
    "3": "3₃eéèêEÉÈÊ",
    "4": "4₄aàâAÀÂ",
    "5": "5₅sgSG",
    "6": "6₆bdgBDG",
    "7": "7₇ltLT",

Modified graphspell/tokenizer.py from [81da836011] to [b7228e1a86].

29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
            r'(?P<FOLDERWIN>[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)',
            r'(?P<PUNC>[][,.;:!?…«»“”‘’"(){}·–—¿¡])',
            r'(?P<WORD_ACRONYM>[A-Z][.][A-Z][.](?:[A-Z][.])*)',
            r'(?P<LINK>(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)',
            r'(?P<HASHTAG>[#@][\w-]+)',
            r'(?P<HTML><\w+.*?>|</\w+ *>)',
            r'(?P<PSEUDOHTML>\[/?\w+\])',
            r"(?P<WORD_ELIDED>(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu|presqu|quelqu)['’´‘′`ʼ])",
            r'(?P<WORD_ORDINAL>\d+(?:ers?|res?|è[rm]es?|i[èe][mr]es?|de?s?|nde?s?|ès?|es?|ᵉʳˢ?|ʳᵉˢ?|ᵈᵉ?ˢ?|ⁿᵈᵉ?ˢ?|ᵉˢ?)\b)',
            r'(?P<HOUR>\d\d?[h:]\d\d(?:[m:]\d\ds?|)\b)',
            r'(?P<NUM>\d+(?:[.,]\d+|))',
            r'(?P<SIGN>[&%‰€$+±=*/<>⩾⩽#|×¥£¢§¬÷@-])',
            r"(?P<WORD>[\w\u0300-\u036f]+(?:[’'`-][\w\u0300-\u036f]+)*)"
        )
}







|







29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
            r'(?P<FOLDERWIN>[a-zA-Z]:\\(?:Program Files(?: [(]x86[)]|)|[\w.()]+)(?:\\[\w.()-]+)*)',
            r'(?P<PUNC>[][,.;:!?…«»“”‘’"(){}·–—¿¡])',
            r'(?P<WORD_ACRONYM>[A-Z][.][A-Z][.](?:[A-Z][.])*)',
            r'(?P<LINK>(?:https?://|www[.]|\w+[@.]\w\w+[@.])\w[\w./?&!%=+*"\'@$#-]+)',
            r'(?P<HASHTAG>[#@][\w-]+)',
            r'(?P<HTML><\w+.*?>|</\w+ *>)',
            r'(?P<PSEUDOHTML>\[/?\w+\])',
            r"(?P<WORD_ELIDED>(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu|presqu|quelqu)['’ʼ‘‛´`′‵՚ꞌꞋ])",
            r'(?P<WORD_ORDINAL>\d+(?:ers?|res?|è[rm]es?|i[èe][mr]es?|de?s?|nde?s?|ès?|es?|ᵉʳˢ?|ʳᵉˢ?|ᵈᵉ?ˢ?|ⁿᵈᵉ?ˢ?|ᵉˢ?)\b)',
            r'(?P<HOUR>\d\d?[h:]\d\d(?:[m:]\d\ds?|)\b)',
            r'(?P<NUM>\d+(?:[.,]\d+|))',
            r'(?P<SIGN>[&%‰€$+±=*/<>⩾⩽#|×¥£¢§¬÷@-])',
            r"(?P<WORD>[\w\u0300-\u036f]+(?:[’'`-][\w\u0300-\u036f]+)*)"
        )
}