Grammalecte  Check-in [cee9fdd1aa]

Overview
Comment:[core] ibdawg: suggestion mechanism update
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | core
Files: files | file ages | folders
SHA3-256: cee9fdd1aae4f0a139ab15185b4bc438031c0d9f8c90d32e25f16369b0efc22f
User & Date: olr on 2017-06-25 23:41:22
Other Links: manifest | tags
Context
2017-06-26
06:50
[core] ibdawg: suggestion mechanism update + keyboard chars proximity check-in: 80ebc25208 user: olr tags: core, trunk
2017-06-25
23:41
[core] ibdawg: suggestion mechanism update check-in: cee9fdd1aa user: olr tags: core, trunk
17:52
[fr][bug] codes de positionnement JS manquants check-in: 4985bf4659 user: olr tags: fr, trunk
Changes

Modified cli.py from [cfdf5fb796] to [8ec24a7015].

   203    203                       if sWord:
   204    204                           echo("* " + sWord)
   205    205                           for sMorph in oDict.getMorph(sWord):
   206    206                               echo("  {:<32} {}".format(sMorph, oLexGraphe.formatTags(sMorph)))
   207    207               elif sText.startswith("!"):
   208    208                   for sWord in sText[1:].strip().split():
   209    209                       if sWord:
   210         -                        echo("* suggestions for: " + sWord)
   211         -                        for sSugg in oDict.suggest(sWord):
   212         -                            echo("  > " + sSugg)
          210  +                        echo(" | ".join(oDict.suggest(sWord)))
   213    211               elif sText.startswith("/+ "):
   214    212                   gce.setOptions({ opt:True  for opt in sText[3:].strip().split()  if opt in gce.getOptions() })
   215    213                   echo("done")
   216    214               elif sText.startswith("/- "):
   217    215                   gce.setOptions({ opt:False  for opt in sText[3:].strip().split()  if opt in gce.getOptions() })
   218    216                   echo("done")
   219    217               elif sText.startswith("/-- "):

Modified gc_core/py/char_player.py from [8b0dcb8c7c] to [faa9abdccc].

     1      1   # list of similar chars
     2      2   # useful for suggestion mechanism
     3      3   
     4         -dSimilarChar = {
     5         -	"1": "l",
     6         -
     7         -    "a": "aàâáä",
     8         -    "à": "aàâáä",
     9         -    "â": "aàâáä",
    10         -    "á": "aàâáä",
    11         -    "ä": "aàâáä",
    12         -
    13         -    "c": "cçsśŝ",
    14         -    "ç": "cçsśŝ",
    15         -
    16         -    "e": "eéêèë",
    17         -    "é": "eéêèë",
    18         -    "ê": "eéêèë",
    19         -    "è": "eéêèë",
    20         -    "ë": "eéêèë",
    21         -
    22         -    "g": "j",
    23         -    "j": "g",
    24         -
    25         -    "i": "iîïyíìÿ",
    26         -    "î": "iîïyíìÿ",
    27         -    "ï": "iîïyíìÿ",
    28         -    "í": "iîïyíìÿ",
    29         -    "ì": "iîïyíìÿ",
    30         -
    31         -    "n": "nñńǹ",
    32         -
    33         -    "o": "oôóòö",
    34         -    "ô": "oôóòö",
    35         -    "ó": "oôóòö",
    36         -    "ò": "oôóòö",
    37         -    "ö": "oôóòö",
            4  +
            5  +# Method: Remove Useless Chars
            6  +
            7  +_dUselessChar = {
            8  +    'a': '',  'e': '',  'i': '',  'o': '',  'u': '',  'y': '',
            9  +    'à': '',  'é': '',  'î': '',  'ô': '',  'û': '',  'ÿ': '',
           10  +    'â': '',  'è': '',  'ï': '',  'ö': '',  'ù': '',  'ŷ': '',
           11  +    'ä': '',  'ê': '',  'í': '',  'ó': '',  'ü': '',  'ý': '',
           12  +    'á': '',  'ë': '',  'ì': '',  'ò': '',  'ú': '',  'ỳ': '',
           13  +    'ā': '',  'ē': '',  'ī': '',  'ō': '',  'ū': '',  'ȳ': '',
           14  +    'h': '',  'œ': '',  'æ': ''
           15  + }
           16  +
           17  +_CHARMAP = str.maketrans(_dUselessChar)
           18  +
           19  +aUselessChar = frozenset(_dUselessChar.keys())
           20  +
           21  +def clearWord (sWord):
           22  +    "remove vowels and h"
           23  +    return sWord.translate(_CHARMAP)
           24  +
           25  +
           26  +# Similar chars
           27  +
           28  +d1to1 = {
           29  +    "1": "li",
           30  +    "2": "e",
           31  +    "3": "e",
           32  +    "4": "aà",
           33  +    "5": "ge",
           34  +    "6": "bd",
           35  +    "7": "lt",
           36  +    "8": "b",
           37  +    "9": "gbd",
           38  +
           39  +    "a": "aàâáäæ",
           40  +    "à": "aàâáäæ",
           41  +    "â": "aàâáäæ",
           42  +    "á": "aàâáäæ",
           43  +    "ä": "aàâáäæ",
           44  +
           45  +    "æ": "éa",
           46  +
           47  +    "c": "cçskqśŝ",
           48  +    "ç": "cçskqśŝ",
           49  +
           50  +    "e": "eéèêëœ",
           51  +    "é": "eéèêëœ",
           52  +    "ê": "eéèêëœ",
           53  +    "è": "eéèêëœ",
           54  +    "ë": "eéèêëœ",
           55  +
           56  +    "f": "fv",
           57  +
           58  +    "g": "gjq",
           59  +    
           60  +    "i": "iîïylíìÿ",
           61  +    "î": "iîïylíìÿ",
           62  +    "ï": "iîïylíìÿ",
           63  +    "í": "iîïylíìÿ",
           64  +    "ì": "iîïylíìÿ",
           65  +
           66  +    "j": "jg",
           67  +
           68  +    "k": "kcq",
           69  +
           70  +    "l": "li",
           71  +
           72  +    "n": "nñr",
           73  +
           74  +    "o": "oôóòöœ",
           75  +    "ô": "oôóòöœ",
           76  +    "ó": "oôóòöœ",
           77  +    "ò": "oôóòöœ",
           78  +    "ö": "oôóòöœ",
           79  +
           80  +    "œ": "œoôeéèêë",
           81  +
           82  +    "p": "pb",
           83  +
           84  +    "q": "ckg",
           85  +
           86  +    "r": "rn",
    38     87   
    39     88       "s": "sśŝcç",
    40     89       "ś": "sśŝcç",
    41     90       "ŝ": "sśŝcç",
    42     91   
    43     92       "u": "uûùüú",
    44     93       "û": "uûùüú",
    45     94       "ù": "uûùüú",
    46     95       "ü": "uûùüú",
    47     96       "ú": "uûùüú",
           97  +
           98  +    "v": "vwf",
           99  +
          100  +    "w": "wv",
          101  +
          102  +    "x": "xck",
    48    103   
    49    104       "y": "yÿiîŷýỳ",
    50    105       "ÿ": "yÿiîŷýỳ",
    51    106       "ŷ": "yÿiîŷýỳ",
    52    107       "ý": "yÿiîŷýỳ",
    53    108       "ỳ": "yÿiîŷýỳ",
          109  +
          110  +    "z": "zs",
          111  +}
          112  +
          113  +d1toX = {
          114  +    "æ": ("ae",),
          115  +    "b": ("bb",),
          116  +    "c": ("cc", "ss", "qu", "ch"),
          117  +    "ç": ("ss", "cc", "qh", "ch"),
          118  +    "d": ("dd",),
          119  +    "f": ("ff", "ph"),
          120  +    "g": ("gu", "ge", "gg", "gh"),
          121  +    "i": ("ii",),
          122  +    "j": ("jj", "dj"),
          123  +    "k": ("qu", "ck", "ch", "cu", "kk", "kh"),
          124  +    "l": ("ll",),
          125  +    "m": ("mm", "mn"),
          126  +    "n": ("nn", "nm", "mn"),
          127  +    "o": ("au", "eau", "aut"),
          128  +    "œ": ("oe", "eu"),
          129  +    "p": ("pp", "ph"),
          130  +    "q": ("qu", "ch", "cq", "ck", "kk"),
          131  +    "r": ("rr",),
          132  +    "s": ("ss", "sh"),
          133  +    "t": ("tt", "th"),
          134  +    "x": ("cc", "ct", "xx"),
          135  +    "z": ("ss", "zh")
          136  +}
          137  +
          138  +d2toX = {
          139  +    "an": ("en",),
          140  +    "en": ("an",),
          141  +    "ai": ("ei", "é", "è", "ê", "ë"),
          142  +    "ei": ("ai", "ait", "ais", "é", "è", "ê", "ë"),
          143  +    "ch": ("sh", "c", "ss"),
          144  +    "ct": ("x", "cc"),
          145  +    "oa": ("oi",),
          146  +    "oi": ("oa", "oie"),
          147  +    "qu": ("q", "cq", "ck", "c", "k"),
          148  +}
          149  +
          150  +
          151  +# End of word
          152  +
          153  +dFinal1 = {
          154  +    "a": ("as", "at", "ant"),
          155  +
          156  +    "e": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait", "ent"),
          157  +    "é": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"),
          158  +    "è": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"),
          159  +    "ê": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"),
          160  +    "ë": ("et", "er", "ets", "ée", "ez", "ai", "ais", "ait"),
          161  +
          162  +    "i": ("is", "it", "ie", "in"),
          163  +
          164  +    "n": ("nt", "nd", "ns"),
          165  +
          166  +    "o": ("aut", "ot", "os"),
          167  +    "ô": ("aut", "ot", "os"),
          168  +    "ö": ("aut", "ot", "os"),
          169  +
          170  +    "u": ("ut", "us"),
          171  +}
          172  +
          173  +dFinal2 = {
          174  +    "an": ("ant", "ent"),
          175  +    "en": ("ent", "ant"),
          176  +    "ei": ("ait", "ais"),
          177  +    "on": ("ons", "ont"),
          178  +    "oi": ("ois", "oit", "oix"),
          179  +}
          180  +
          181  +
          182  +# Préfixes
          183  +
          184  +aPfx = ("anti", "contre", "mé", "im", "in", "ir", "par", "pré", "re", "ré", "sans", "sous", "sur")
          185  +
          186  +
          187  +# Keyboards
          188  +
          189  +dBépo = {
          190  +    # on présume que le bépoète est moins susceptible de faire des erreurs de frappe que l’azertyste.
          191  +    # ligne 2
          192  +    "b": "éa",
          193  +    "é": "bpu",
          194  +    "p": "éoi",
          195  +    "o": "pèe",
          196  +    "è": "o",
          197  +    "v": "dt",
          198  +    "d": "vls",
          199  +    "l": "djr",
          200  +    "j": "lzn",
          201  +    "z": "jmw",
          202  +    # ligne 3
          203  +    "a": "ubà",
          204  +    "u": "aiéy",
          205  +    "i": "uepx",
          206  +    "e": "io",
          207  +    "c": "t",
          208  +    "t": "csvq",
          209  +    "s": "trdg",
          210  +    "r": "snlh",
          211  +    "n": "rmjf",
          212  +    "m": "nzç",
          213  +    # ligne 4
          214  +    "à": "yêa",
          215  +    "y": "àxu",
          216  +    "x": "ywi",
          217  +    "w": "z",
          218  +    "k": "c",
          219  +    "q": "gt",
          220  +    "g": "qhs",
          221  +    "h": "gfr",
          222  +    "f": "hçn",
          223  +    "ç": "fm",
          224  +}
          225  +
          226  +dAzerty = {
          227  +    # ligne 1
          228  +    "é": "az",
          229  +    "è": "yu",
          230  +    "ç": "àio",
          231  +    "à": "op",
          232  +    # ligne 2
          233  +    "a": "zéqs",
          234  +    "z": "aesqd",
          235  +    "e": "zrdsf",
          236  +    "r": "etfdg",
          237  +    "t": "rygfh",
          238  +    "y": "tuhgj",
          239  +    "u": "yijhk",
          240  +    "i": "uokjl",
          241  +    "o": "iplkm",
          242  +    "p": "oml",
          243  +    # ligne 3
          244  +    "q": "sawz",
          245  +    "s": "qdzwxe",
          246  +    "d": "sfexcr",
          247  +    "f": "dgrcvt",
          248  +    "g": "fhtvby",
          249  +    "h": "gjybnu",
          250  +    "j": "hkuni",
          251  +    "k": "jlio",
          252  +    "l": "kmop",
          253  +    "m": "lùp",
          254  +    "ù": "m",
          255  +    # ligne 4
          256  +    "w": "xqs",
          257  +    "x": "wcsd",
          258  +    "c": "xvdf",
          259  +    "v": "cbfg",
          260  +    "b": "vngh",
          261  +    "n": "bhj",
    54    262   }

Modified gc_core/py/ibdawg.py from [077d799ad3] to [18fa7e7c19].

     1      1   #!python3
     2      2   # -*- coding: UTF-8 -*-
     3      3   
     4      4   import os
     5      5   import traceback
     6      6   import pkgutil
            7  +from itertools import chain
     7      8   
     8      9   from . import str_transform as st
     9     10   from . import char_player as cp
    10     11   from .echo import echo
    11     12   
           13  +
           14  +def show (nDeep, sText):
           15  +    print(nDeep * "  " + sText)
           16  +
    12     17   
    13     18   class IBDAWG:
    14     19       """INDEXABLE BINARY DIRECT ACYCLIC WORD GRAPH"""
    15     20   
    16     21       def __init__ (self, sDicName):
    17     22           self.by = pkgutil.get_data(__package__, "_dictionaries/" + sDicName)
    18     23           if not self.by:
................................................................................
    47     52           if self.cStemming == "S":
    48     53               self.funcStemming = st.changeWordWithSuffixCode
    49     54           elif self.cStemming == "A":
    50     55               self.funcStemming = st.changeWordWithAffixCode
    51     56           else:
    52     57               self.funcStemming = st.noStemming
    53     58           self.nTag = self.nArcVal - self.nChar - self.nAff
           59  +        # <dChar> to get the value of an arc, <dVal> to get the char of an arc with its value
    54     60           self.dChar = {}
    55     61           for i in range(1, self.nChar):
    56     62               self.dChar[self.lArcVal[i]] = i
           63  +        self.dVal = { v: k  for k, v in self.dChar.items() }
    57     64               
    58     65           self._arcMask = (2 ** ((self.nBytesArc * 8) - 3)) - 1
    59     66           self._finalNodeMask = 1 << ((self.nBytesArc * 8) - 1)
    60     67           self._lastArcMask = 1 << ((self.nBytesArc * 8) - 2)
    61     68           self._addrBitMask = 1 << ((self.nBytesArc * 8) - 3)  # version 2
    62     69   
    63     70           self.nBytesOffset = 1 # version 3
    64     71   
    65     72           # Configuring DAWG functions according to nVersion
    66     73           if self.nVersion == 1:
    67     74               self.morph = self._morph1
    68     75               self.stem = self._stem1
    69     76               self._lookupArcNode = self._lookupArcNode1
           77  +            self._getArcs = self._getArcs1
    70     78               self._writeNodes = self._writeNodes1
    71     79           elif self.nVersion == 2:
    72     80               self.morph = self._morph2
    73     81               self.stem = self._stem2
    74     82               self._lookupArcNode = self._lookupArcNode2
           83  +            self._getArcs = self._getArcs2
    75     84               self._writeNodes = self._writeNodes2
    76     85           elif self.nVersion == 3:
    77     86               self.morph = self._morph3
    78     87               self.stem = self._stem3
           88  +            self._getArcs = self._getArcs3
    79     89               self._lookupArcNode = self._lookupArcNode3
    80     90               self._writeNodes = self._writeNodes3
    81     91           else:
    82     92               raise ValueError("  # Error: unknown code: {}".format(self.nVersion))
    83     93   
    84     94           self.bOptNumSigle = False
    85     95           self.bOptNumAtLast = False
................................................................................
   161    171               if c not in self.dChar:
   162    172                   return False
   163    173               iAddr = self._lookupArcNode(self.dChar[c], iAddr)
   164    174               if iAddr == None:
   165    175                   return False
   166    176           return int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask
   167    177   
   168         -    def suggest (self, sWord, iAddr=0, sNewWord=""):
   169         -        "not finished"
          178  +    def suggest (self, sWord):
          179  +        "returns a set of similar words"
          180  +        # first, we check for similar words
          181  +        return set(self._suggestWithCrushedUselessChars(cp.clearWord(sWord)))
          182  +        lSugg = self._suggest(sWord)
          183  +        if not lSugg:
          184  +            lSugg.extend(self._suggest(sWord[1:]))
          185  +            lSugg.extend(self._suggest(sWord[:-1]))
          186  +            lSugg.extend(self._suggest(sWord[1:-1]))
          187  +            if not lSugg:
          188  +                lSugg.extend(self._suggestWithCrushedUselessChars(cp.clearWord(sWord)))
          189  +        return set(lSugg)
          190  +
          191  +    def _suggest (self, sWord, cPrevious='', nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False):
   170    192           # RECURSIVE FUNCTION
   171    193           if not sWord:
   172    194               if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
          195  +                show(nDeep, "!!! " + sNewWord + " !!!")
   173    196                   return [sNewWord]
   174    197               return []
          198  +        #show(nDeep, "<" + sWord + ">  ===>  " + sNewWord)
   175    199           lSugg = []
   176         -        for cChar, jAddr in self._getSimilarArcs(sWord[0:1], iAddr):
   177         -            lSugg.extend(self.suggest(sWord[1:], jAddr, sNewWord+cChar))
          200  +        cCurrent = sWord[0:1]
          201  +        for cChar, jAddr in self._getSimilarArcs(cCurrent, iAddr):
          202  +            #show(nDeep, cChar)
          203  +            lSugg.extend(self._suggest(sWord[1:], cCurrent, nDeep+1, jAddr, sNewWord+cChar))
          204  +        if not bAvoidLoop: # avoid infinite loop
          205  +            #show(nDeep, ":no loop:")
          206  +            if cPrevious == cCurrent:
          207  +                # same char, we remove 1 char without adding 1 to <sNewWord>
          208  +                lSugg.extend(self._suggest(sWord[1:], cCurrent, nDeep+1, iAddr, sNewWord))
          209  +            for sRepl in cp.d1toX.get(cCurrent, ()):
          210  +                #show(nDeep, sRepl)
          211  +                lSugg.extend(self._suggest(sRepl + sWord[1:], cCurrent, nDeep+1, iAddr, sNewWord, True))
          212  +            if len(sWord) == 1:
          213  +                #show(nDeep, ":end of word:")
          214  +                # end of word
          215  +                for sRepl in cp.dFinal1.get(sWord, ()):
          216  +                    #show(nDeep, sRepl)
          217  +                    lSugg.extend(self._suggest(sRepl, cCurrent, nDeep+1, iAddr, sNewWord, True))
   178    218           return lSugg
   179    219   
   180    220       def _getSimilarArcs (self, cChar, iAddr):
   181    221           "generator: yield similar char of <cChar> and address of the following node"
   182         -        for c in cp.dSimilarChar.get(cChar, [cChar]):
          222  +        for c in cp.d1to1.get(cChar, [cChar]):
          223  +            if c in self.dChar:
          224  +                jAddr = self._lookupArcNode(self.dChar[c], iAddr)
          225  +                if jAddr:
          226  +                    yield (c, jAddr)
          227  +
          228  +    def _suggestWithCrushedUselessChars (self, sWord, cPrevious='', nDeep=0, iAddr=0, sNewWord="", bAvoidLoop=False):
          229  +        if not sWord:
          230  +            if int.from_bytes(self.byDic[iAddr:iAddr+self.nBytesArc], byteorder='big') & self._finalNodeMask:
          231  +                show(nDeep, "!!! " + sNewWord + " !!!")
          232  +                return [sNewWord]
          233  +            return []
          234  +        lSugg = []
          235  +        cCurrent = sWord[0:1]
          236  +        for cChar, jAddr in self._getSimilarArcsAndCrushedChars(cCurrent, iAddr):
          237  +            show(nDeep, cChar)
          238  +            lSugg.extend(self._suggestWithCrushedUselessChars(sWord[1:], cCurrent, nDeep+1, jAddr, sNewWord+cChar))
          239  +        return lSugg
          240  +
          241  +    def _getSimilarArcsAndCrushedChars (self, cChar, iAddr):
          242  +        "generator: yield every arc at <iAddr> whose char is a useless char, then similar chars of <cChar>, each with the address of the following node"
          243  +        for nVal, jAddr in self._getArcs(iAddr):
          244  +            if self.dVal.get(nVal, "") in cp.aUselessChar:
          245  +                yield (self.dVal[nVal], jAddr)
          246  +        for c in cp.d1to1.get(cChar, [cChar]):
   183    247               if c in self.dChar:
   184    248                   jAddr = self._lookupArcNode(self.dChar[c], iAddr)
   185    249                   if jAddr:
   186    250                       yield (c, jAddr)
   187    251   
   188    252       def getMorph (self, sWord):
   189    253           "retrieves morphologies list, different casing allowed"
................................................................................
   263    327                   return int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big')
   264    328               else:
   265    329                   # value not found
   266    330                   if (nRawArc & self._lastArcMask):
   267    331                       return None
   268    332                   iAddr = iEndArcAddr+self.nBytesNodeAddress
   269    333   
          334  +    def _getArcs1 (self, iAddr):
          335  +        "generator: yield all arcs at <iAddr> as tuples of (nVal, jAddr), <jAddr> being the node address stored right after the arc"
          336  +        while True:
          337  +            iEndArcAddr = iAddr+self.nBytesArc
          338  +            nRawArc = int.from_bytes(self.byDic[iAddr:iEndArcAddr], byteorder='big')
          339  +            yield (nRawArc & self._arcMask, int.from_bytes(self.byDic[iEndArcAddr:iEndArcAddr+self.nBytesNodeAddress], byteorder='big'))
          340  +            if (nRawArc & self._lastArcMask):
          341  +                break
          342  +            iAddr = iEndArcAddr+self.nBytesNodeAddress
          343  +
   270    344       def _writeNodes1 (self, spfDest):
   271    345           "for debugging only"
   272    346           print(" > Write binary nodes")
   273    347           with codecs.open(spfDest, 'w', 'utf-8', newline="\n") as hDst:
   274    348               iAddr = 0
   275    349               hDst.write("i{:_>10} -- #{:_>10}\n".format("0", iAddr))
   276    350               while iAddr < len(self.byDic):