Overview
Comment: | [core] merge spellsugg: much faster suggestion engine |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk | core |
Files: | files | file ages | folders |
SHA3-256: | e6e44e506c927b243de9359a882cf37b |
User & Date: | olr on 2017-11-10 16:52:28 |
Original Comment: | [core] marge spellsugg: much faster suggestion engine |
Other Links: | manifest | tags |
Context
2017-11-10
| ||
20:45 | [fr] autres tests pour les guillemets + commentaires check-in: e3c4cc6975 user: olr tags: fr, trunk | |
16:52 | [core] merge spellsugg: much faster suggestion engine check-in: e6e44e506c user: olr tags: core, trunk | |
16:49 | [fr] remove remaining crap check-in: 6ee59d86c1 user: olr tags: fr, trunk | |
2017-11-09
| ||
11:56 | [core] ibdawg: suggestion mechanism > reduce 1toX replacements overload (much, much faster) check-in: 767e396f2d user: olr tags: core, spellsugg | |
Changes
Modified gc_core/js/char_player.js from [09a4bffb3f] to [0547b59e35].
22 23 24 25 26 27 28 29 30 31 32 33 34 35 ... 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 ... 195 196 197 198 199 200 201 202 203 204 205 206 207 208 |
let sRes = ""; for (let c of sWord) { sRes += this._dTransChars.gl_get(c, c); } return sRes.replace("eau", "o").replace("au", "o"); }, // Similar chars d1to1: new Map([ ["1", "liîLIÎ"], ["2", "zZ"], ["3", "eéèêEÉÈÊ"], ................................................................................ d1toX: new Map([ ["æ", ["ae",]], ["Æ", ["AE",]], ["b", ["bb",]], ["B", ["BB",]], ["c", ["cc", "ss", "qu", "ch"]], ["C", ["CC", "SS", "QU", "CH"]], ["ç", ["ss", "cc", "qh", "ch"]], ["Ç", ["SS", "CC", "QH", "CH"]], ["d", ["dd",]], ["D", ["DD",]], ["f", ["ff", "ph"]], ["F", ["FF", "PH"]], ["g", ["gu", "ge", "gg", "gh"]], ["G", ["GU", "GE", "GG", "GH"]], ["i", ["ii",]], ["I", ["II",]], ["j", ["jj", "dj"]], ["J", ["JJ", "DJ"]], ["k", ["qu", "ck", "ch", "cu", "kk", "kh"]], ["K", ["QU", "CK", "CH", "CU", "KK", "KH"]], ["l", ["ll",]], ["L", ["LL",]], ["m", ["mm", "mn"]], ................................................................................ ["t", ["tt", "th"]], ["T", ["TT", "TH"]], ["x", ["cc", "ct", "xx"]], ["X", ["CC", "CT", "XX"]], ["z", ["ss", "zh"]], ["Z", ["SS", "ZH"]], ]), d2toX: new Map([ ["an", ["en",]], ["AN", ["EN",]], ["au", ["eau", "o", "ô"]], ["AU", ["EAU", "O", "Ô"]], ["en", ["an",]], |
> > > > < < > > < < > > > > > > > |
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 ... 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 ... 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 |
let sRes = ""; for (let c of sWord) { sRes += this._dTransChars.gl_get(c, c); } return sRes.replace("eau", "o").replace("au", "o"); }, aVowel: new Set("aáàâäāeéèêëēiíìîïīoóòôöōuúùûüūyýỳŷÿȳœæAÁÀÂÄĀEÉÈÊËĒIÍÌÎÏĪOÓÒÔÖŌUÚÙÛÜŪYÝỲŶŸȲŒÆ"), aConsonant: new Set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ"), aDouble: new Set("bcçdfjklmnprstzBCÇDFJKLMNPRSTZ"), // letters that may be used twice successively // Similar chars d1to1: new Map([ ["1", "liîLIÎ"], ["2", "zZ"], ["3", "eéèêEÉÈÊ"], ................................................................................ d1toX: new Map([ ["æ", ["ae",]], ["Æ", ["AE",]], ["b", ["bb",]], ["B", ["BB",]], ["c", ["cc", "ss", "qu", "ch"]], ["C", ["CC", "SS", "QU", "CH"]], ["d", ["dd",]], ["D", ["DD",]], ["é", ["ai", "ei"]], ["É", ["AI", "EI"]], ["f", ["ff", "ph"]], ["F", ["FF", "PH"]], ["g", ["gu", "ge", "gg", "gh"]], ["G", ["GU", "GE", "GG", "GH"]], ["j", ["jj", "dj"]], ["J", ["JJ", "DJ"]], ["k", ["qu", "ck", "ch", "cu", "kk", "kh"]], ["K", ["QU", "CK", "CH", "CU", "KK", "KH"]], ["l", ["ll",]], ["L", ["LL",]], ["m", ["mm", "mn"]], ................................................................................ ["t", ["tt", "th"]], ["T", ["TT", "TH"]], ["x", ["cc", "ct", "xx"]], ["X", ["CC", "CT", "XX"]], ["z", ["ss", "zh"]], ["Z", ["SS", "ZH"]], ]), get1toXReplacement: function (cPrev, cCur, cNext) { if (this.aConsonant.has(cCur) && (this.aConsonant.has(cPrev) || this.aConsonant.has(cNext))) { return []; } return this.d1toX.gl_get(cCur, []); }, d2toX: new Map([ ["an", ["en",]], ["AN", ["EN",]], ["au", ["eau", "o", "ô"]], ["AU", ["EAU", "O", "Ô"]], ["en", ["an",]], |
Modified gc_core/js/ibdawg.js from [ca747a7a44] to [952ba094d6].
304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 |
}
// delete char
if (nMaxDel > 0) {
this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true);
}
}
// Phonetic replacements
for (let sRepl of char_player.d1toX.gl_get(cCurrent, [])) {
this._suggest(oSuggResult, sRepl + sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true);
}
for (let sRepl of char_player.d2toX.gl_get(sRemain.slice(0, 2), [])) {
this._suggest(oSuggResult, sRepl + sRemain.slice(2), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true);
}
// Hard replacements
if (nDeep > 3 && nMaxHardRepl && sRemain.length >= 2) {
|
| |
304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 |
}
// delete char
if (nMaxDel > 0) {
this._suggest(oSuggResult, sRemain.slice(1), nMaxSwitch, nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true);
}
}
// Phonetic replacements
for (let sRepl of char_player.get1toXReplacement(sNewWord.slice(-1), cCurrent, sRemain.slice(1,2))) {
this._suggest(oSuggResult, sRepl + sRemain.slice(1), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true);
}
for (let sRepl of char_player.d2toX.gl_get(sRemain.slice(0, 2), [])) {
this._suggest(oSuggResult, sRepl + sRemain.slice(2), nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, true);
}
// Hard replacements
if (nDeep > 3 && nMaxHardRepl && sRemain.length >= 2) {
|
Modified gc_core/py/char_player.py from [aea8dd1016] to [b0152aab01].
14 15 16 17 18 19 20 21 22 23 24 25 26 27 ... 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 ... 187 188 189 190 191 192 193 194 195 196 197 198 199 200 |
'œ': 'oe', 'æ': 'ae', }) def cleanWord (sWord): "word simplication before calculating distance between words" return sWord.lower().translate(_xTransChars).replace("eau", "o").replace("au", "o") # Similar chars d1to1 = { "1": "liîLIÎ", "2": "zZ", "3": "eéèêEÉÈÊ", ................................................................................ d1toX = { "æ": ("ae",), "Æ": ("AE",), "b": ("bb",), "B": ("BB",), "c": ("cc", "ss", "qu", "ch"), "C": ("CC", "SS", "QU", "CH"), "ç": ("ss", "cc", "qh", "ch"), "Ç": ("SS", "CC", "QH", "CH"), "d": ("dd",), "D": ("DD",), "f": ("ff", "ph"), "F": ("FF", "PH"), "g": ("gu", "ge", "gg", "gh"), "G": ("GU", "GE", "GG", "GH"), "i": ("ii",), "I": ("II",), "j": ("jj", "dj"), "J": ("JJ", "DJ"), "k": ("qu", "ck", "ch", "cu", "kk", "kh"), "K": ("QU", "CK", "CH", "CU", "KK", "KH"), "l": ("ll",), "L": ("LL",), "m": ("mm", "mn"), ................................................................................ "t": ("tt", "th"), "T": ("TT", "TH"), "x": ("cc", "ct", "xx"), "X": ("CC", "CT", "XX"), "z": ("ss", "zh"), "Z": ("SS", "ZH"), } d2toX = { "an": ("en",), "AN": ("EN",), "au": ("eau", "o", "ô"), "AU": ("EAU", "O", "Ô"), "en": ("an",), |
> > > > > < < > > < < > > > > > > > |
14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 ... 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 ... 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 |
'œ': 'oe', 'æ': 'ae', }) def cleanWord (sWord): "word simplication before calculating distance between words" return sWord.lower().translate(_xTransChars).replace("eau", "o").replace("au", "o") aVowel = set("aáàâäāeéèêëēiíìîïīoóòôöōuúùûüūyýỳŷÿȳœæAÁÀÂÄĀEÉÈÊËĒIÍÌÎÏĪOÓÒÔÖŌUÚÙÛÜŪYÝỲŶŸȲŒÆ") aConsonant = set("bcçdfghjklmnñpqrstvwxzBCÇDFGHJKLMNÑPQRSTVWXZ") aDouble = set("bcçdfjklmnprstzBCÇDFJKLMNPRSTZ") # letters that may be used twice successively # Similar chars d1to1 = { "1": "liîLIÎ", "2": "zZ", "3": "eéèêEÉÈÊ", ................................................................................ d1toX = { "æ": ("ae",), "Æ": ("AE",), "b": ("bb",), "B": ("BB",), "c": ("cc", "ss", "qu", "ch"), "C": ("CC", "SS", "QU", "CH"), "d": ("dd",), "D": ("DD",), "é": ("ai", "ei"), "É": ("AI", "EI"), "f": ("ff", "ph"), "F": ("FF", "PH"), "g": ("gu", "ge", "gg", "gh"), "G": ("GU", "GE", "GG", "GH"), "j": ("jj", "dj"), "J": ("JJ", "DJ"), "k": ("qu", "ck", "ch", "cu", "kk", "kh"), "K": ("QU", "CK", "CH", "CU", "KK", "KH"), "l": ("ll",), "L": ("LL",), "m": ("mm", "mn"), ................................................................................ "t": ("tt", "th"), "T": ("TT", "TH"), "x": ("cc", "ct", "xx"), "X": ("CC", "CT", "XX"), "z": ("ss", "zh"), "Z": ("SS", "ZH"), } def get1toXReplacement (cPrev, cCur, cNext): if cCur in aConsonant and (cPrev in aConsonant or cNext in aConsonant): return () return d1toX.get(cCur, ()) d2toX = { "an": ("en",), "AN": ("EN",), "au": ("eau", "o", "ô"), "AU": ("EAU", "O", "Ô"), "en": ("an",), |
Modified gc_core/py/ibdawg.py from [dc355a8e9b] to [846d5dd677].
286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 |
# switching chars if nMaxSwitch: self._suggest(oSuggResult, sRemain[1:2]+sRemain[0:1]+sRemain[2:], nMaxSwitch-1, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, "><",True) # delete char if nMaxDel: self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, "-"+cCurrent, True) # Phonetic replacements for sRepl in cp.d1toX.get(cCurrent, ()): self._suggest(oSuggResult, sRepl + sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, cCurrent+">"+sRepl, True) for sRepl in cp.d2toX.get(sRemain[0:2], ()): self._suggest(oSuggResult, sRepl + sRemain[2:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, sRemain[0:2]+">"+sRepl, True) # Hard replacements if nDeep > 3 and nMaxHardRepl: for cChar, kAddr in self._getCharArcs(iAddr): if cChar not in cp.d1to1.get(cCurrent, ""): |
| |
286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 |
# switching chars if nMaxSwitch: self._suggest(oSuggResult, sRemain[1:2]+sRemain[0:1]+sRemain[2:], nMaxSwitch-1, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, "><",True) # delete char if nMaxDel: self._suggest(oSuggResult, sRemain[1:], nMaxSwitch, nMaxDel-1, nMaxHardRepl, nDeep+1, iAddr, sNewWord, "-"+cCurrent, True) # Phonetic replacements for sRepl in cp.get1toXReplacement(sNewWord[-1:], cCurrent, sRemain[1:2]): self._suggest(oSuggResult, sRepl + sRemain[1:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, cCurrent+">"+sRepl, True) for sRepl in cp.d2toX.get(sRemain[0:2], ()): self._suggest(oSuggResult, sRepl + sRemain[2:], nMaxSwitch, nMaxDel, nMaxHardRepl, nDeep+1, iAddr, sNewWord, sRemain[0:2]+">"+sRepl, True) # Hard replacements if nDeep > 3 and nMaxHardRepl: for cChar, kAddr in self._getCharArcs(iAddr): if cChar not in cp.d1to1.get(cCurrent, ""): |