Grammalecte  Check-in [b3448ac17f]

Overview
Comment: [graphspell][fx] update tokenizer and lexicographer: add symbols and emojis
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | fx | graphspell
Files: files | file ages | folders
SHA3-256: b3448ac17f5057b4c4bfdbd977ef76ad4bb84295fa88d01ca87620931a64fb13
User & Date: olr on 2020-11-30 15:15:42
Other Links: manifest | tags
Context
2020-12-02
07:56
[misc] sublime text syntax update check-in: 02626b166a user: olr tags: misc, trunk
2020-11-30
15:15
[graphspell][fx] update tokenizer and lexicographer: add symbols and emojis check-in: b3448ac17f user: olr tags: fx, graphspell, trunk
11:12
[fr] ajustements check-in: ad1f902c97 user: olr tags: fr, trunk
Changes

Modified gc_lang/fr/webext/content_scripts/panel_lxg.css from [c6d8057ee1] to [ef239ab156].

   146    146   div.grammalecte_lxg_token_COMPLEX {
   147    147       background-color: hsla(60, 50%, 50%, 1);
   148    148   }
   149    149   div.grammalecte_lxg_token_PUNC {
   150    150       background-color: hsla(210, 50%, 50%, 1);
   151    151   }
   152    152   div.grammalecte_lxg_token_SIGN {
          153  +    background-color: hsla(210, 50%, 50%, 1);
          154  +}
          155  +div.grammalecte_lxg_token_SYMBOL,
          156  +div.grammalecte_lxg_token_EMOJI {
   153    157       background-color: hsla(300, 50%, 50%, 1);
   154    158   }
   155    159   div.grammalecte_lxg_token_LINK {
   156    160       background-color: hsla(270, 50%, 50%, 1);
   157    161   }
   158    162   div.grammalecte_lxg_token_HTML,
   159    163   div.grammalecte_lxg_token_PSEUDO_HTML {
   160    164       background-color: hsla(60, 50%, 50%, 1);
   161    165   }

Modified graphspell-js/lexgraph_fr.js from [95daec96bc] to [7a8c8a0a55].

   447    447           let m = null;
   448    448           try {
   449    449               switch (oToken.sType) {
   450    450                   case 'PUNC':
   451    451                   case 'SIGN':
   452    452                       oToken["aLabels"] = [this.dValues.gl_get(oToken["sValue"], "signe de ponctuation divers")];
   453    453                       break;
          454  +                case 'SYMB':
          455  +                    oToken["aLabels"] = ["symbole"];
          456  +                    break;
          457  +                case 'EMOJI':
          458  +                    oToken["aLabels"] = ["émoji"];
          459  +                    break;
   454    460                   case 'NUM':
   455    461                       oToken["aLabels"] = ["nombre"];
   456    462                       break;
   457    463                   case 'LINK':
   458    464                       oToken["aLabels"] = ["hyperlien"];
   459    465                       break;
   460    466                   case 'TAG':

Modified graphspell-js/tokenizer.js from [9c02b80583] to [8e6d24c94a].

    44     44               [/^(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu|presqu|quelqu)['’ʼ‘‛´`′‵՚ꞌꞋ]/i, 'WORDELD'],
    45     45               [/^\d\d?[h:]\d\d(?:[m:]\d\ds?|)\b/, 'HOUR'],
    46     46               [/^\d+(?:ers?\b|res?\b|è[rm]es?\b|i[èe][mr]es?\b|de?s?\b|nde?s?\b|ès?\b|es?\b|ᵉʳˢ?|ʳᵉˢ?|ᵈᵉ?ˢ?|ⁿᵈᵉ?ˢ?|ᵉˢ?)/, 'WORDORD'],
    47     47               [/^\d+(?:[.,]\d+|)/, 'NUM'],
    48     48               [/^[&%‰€$+±=*/<>⩾⩽#|×¥£§¢¬÷@-]/, 'SIGN'],
    49     49               [/^[a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-stᴀ-ᶿ\u0300-\u036fᵉʳˢⁿᵈ]+(?:[’'`-][a-zA-Zà-öÀ-Ö0-9ø-ÿØ-ßĀ-ʯff-stᴀ-ᶿ\u0300-\u036fᵉʳˢⁿᵈ]+)*/, 'WORD'],
    50     50               [/^_+/, 'UNDERSCORE'],
    51         -            [/^\S/, 'OTHER'],
           51  +            [/^[\u2600-\u26ff\u2700-\u27bf\u1f650-\u1f67f\u1f700-\u1f77f\u1f780-\u1f7ff\u1f800-\u1f8ff]/, 'SYMBOL'],
           52  +            [/^[\u1f300-\u1f5ff\u1f600-\u1f64f\u1f680-\u1f6ff\u1f900-\u1f9ff]+/u, "EMOJI"],
           53  +            [/^\S/u, 'OTHER'],
    52     54           ]
    53     55   };
    54     56   
    55     57   
    56     58   class Tokenizer {
    57     59   
    58     60       constructor (sLang) {

Modified graphspell/lexgraph_fr.py from [3b5f42b556] to [8b4d1fbe84].

   439    439   _zImperatifVerb = re.compile("([\\w]+)(-(?:l(?:es?|a)-(?:moi|toi|lui|[nv]ous|leur)|y|en|[mts]['’ʼ‘‛´`′‵՚ꞌꞋ](?:y|en)|les?|la|[mt]oi|leur|lui))$")
   440    440   
   441    441   def setLabelsOnToken (dToken):
   442    442       # Token: .sType, .sValue, .nStart, .nEnd, .lMorph
   443    443       try:
   444    444           if dToken["sType"] == "PUNC" or dToken["sType"] == "SIGN":
   445    445               dToken["aLabels"] = [_dValues.get(dToken["sValue"], "signe de ponctuation divers")]
          446  +        elif dToken["sType"] == 'SYMBOL':
          447  +            dToken["aLabels"] = ["symbole"]
          448  +        elif dToken["sType"] == 'EMOJI':
          449  +            dToken["aLabels"] = ["émoji"]
   446    450           elif dToken["sType"] == 'NUM':
   447    451               dToken["aLabels"] = ["nombre"]
   448    452           elif dToken["sType"] == 'LINK':
   449    453               dToken["aLabels"] = ["hyperlien"]
   450    454           elif dToken["sType"] == 'TAG':
   451    455               dToken["aLabels"] = ["étiquette (hashtag)"]
   452    456           elif dToken["sType"] == 'HTML':

Modified graphspell/tokenizer.py from [84d5574a19] to [1e4c6dee79].

    38     38               r"(?P<WORDELD>(?:l|d|n|m|t|s|j|c|ç|lorsqu|puisqu|jusqu|quoiqu|qu|presqu|quelqu)['’ʼ‘‛´`′‵՚ꞌꞋ])",
    39     39               r'(?P<WORDORD>\d+(?:ers?|res?|è[rm]es?|i[èe][mr]es?|de?s?|nde?s?|ès?|es?|ᵉʳˢ?|ʳᵉˢ?|ᵈᵉ?ˢ?|ⁿᵈᵉ?ˢ?|ᵉˢ?)\b)',
    40     40               r'(?P<HOUR>\d\d?[h:]\d\d(?:[m:]\d\ds?|)\b)',
    41     41               r'(?P<NUM>\d+(?:[.,]\d+|))',
    42     42               r'(?P<SIGN>[&%‰€$+±=*/<>⩾⩽#|×¥£¢§¬÷@-])',
    43     43               r"(?P<WORD>(?:(?!_)[\w\u0300-\u036f])+(?:[’'`-](?:(?!_)[\w\u0300-\u036f])+)*)",        # with combining diacritics
    44     44               r"(?P<UNDERSCORE>_+)",
           45  +            r"(?P<SYMBOL>[\u2600-\u26ff\u2700-\u27bf\U0001f650-\U0001f67f\U0001f700-\U0001f77f\U0001f780-\U0001f7ff\U0001f800-\U0001f8ff])",
           46  +            r"(?P<EMOJI>[\U0001f300-\U0001f5ff\U0001f600-\U0001f64f\U0001f680-\U0001f6ff\U0001f900-\U0001f9ff]+)",
    45     47               r"(?P<OTHER>\S)"
    46     48           )
    47     49   }
    48     50   
    49     51   
    50     52   class Tokenizer:
    51     53       "Tokenizer: transforms a text in a list of tokens"