Grammalecte  Check-in [90478790e5]

Overview
Comment:[graphspell] experiment: DamerauLevenstein distance modified by function calculating distance between chars
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | graphspell
Files: files | file ages | folders
SHA3-256: 90478790e5f24f1080fba675fedafa45f731c5ec9b4ba52ce24981e5a268b0e8
User & Date: olr on 2020-05-03 09:12:35
Other Links: manifest | tags
Context
2020-05-03
09:21
[graphspell] remove useless code check-in: 2e960183fa user: olr tags: graphspell, trunk
09:12
[graphspell] experiment: DamerauLevenstein distance modified by function calculating distance between chars check-in: 90478790e5 user: olr tags: graphspell, trunk
06:32
[fr] ajustements check-in: 7108ce2dc1 user: olr tags: fr, trunk
Changes

Modified graphspell-js/char_player.js from [60a9fdaff6] to [a12e814d2d].

16
17
18
19
20
21
22
23








































24
25
26
27
28
29
30
31
32
33
34
35
36
    spellingNormalization: function (sWord) {
        let sNewWord = "";
        for (let c of sWord) {
            sNewWord += this._xTransCharsForSpelling.gl_get(c, c);
        }
        return sNewWord.normalize("NFC");
    },









































    _xTransCharsForSimplification: new Map([
        ['à', 'a'],  ['é', 'é'],  ['î', 'i'],  ['ô', 'o'],  ['û', 'u'],  ['ÿ', 'i'],  ['y', 'i'],
        ['â', 'a'],  ['è', 'é'],  ['ï', 'i'],  ['ö', 'o'],  ['ù', 'u'],  ['ŷ', 'i'],
        ['ä', 'a'],  ['ê', 'é'],  ['í', 'i'],  ['ó', 'o'],  ['ü', 'u'],  ['ý', 'i'],
        ['á', 'a'],  ['ë', 'é'],  ['ì', 'i'],  ['ò', 'o'],  ['ú', 'u'],  ['ỳ', 'i'],
        ['ā', 'a'],  ['ē', 'é'],  ['ī', 'i'],  ['ō', 'o'],  ['ū', 'u'],  ['ȳ', 'i'],
        ['ç', 'c'],  ['ñ', 'n'],  ['k', 'q'],  ['w', 'v'],
        ['œ', 'oe'], ['æ', 'ae'],
        ['ſ', 's'],  ['ffi', 'ffi'],  ['ffl', 'ffl'],  ['ff', 'ff'],  ['ſt', 'ft'],  ['fi', 'fi'],  ['fl', 'fl'],  ['st', 'st'],
        ["⁰", "0"], ["¹", "1"], ["²", "2"], ["³", "3"], ["⁴", "4"], ["⁵", "5"], ["⁶", "6"], ["⁷", "7"], ["⁸", "8"], ["⁹", "9"],
        ["₀", "0"], ["₁", "1"], ["₂", "2"], ["₃", "3"], ["₄", "4"], ["₅", "5"], ["₆", "6"], ["₇", "7"], ["₈", "8"], ["₉", "9"]
    ]),









>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

|
|
|
|
|







16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
    spellingNormalization: function (sWord) {
        let sNewWord = "";
        for (let c of sWord) {
            sNewWord += this._xTransCharsForSpelling.gl_get(c, c);
        }
        return sNewWord.normalize("NFC");
    },

    oDistanceBetweenChars: {
        "a": {},
        "e": {"é": 0.5},
        "é": {"e": 0.5},
        "i": {"y": 0.2},
        "o": {},
        "u": {},
        "y": {"i": 0.3},
        "b": {"d": 0.8, "h": 0.9},
        "c": {"ç": 0.1, "k": 0.5, "q": 0.5, "s": 0.5, "x": 0.5, "z": 0.8},
        "d": {"b": 0.8},
        "f": {"v": 0.8},
        "g": {"j": 0.5},
        "h": {"b": 0.9},
        "j": {"g": 0.5, "i": 0.9},
        "k": {"c": 0.5, "q": 0.1, "x": 0.5},
        "l": {"i": 0.9},
        "m": {"n": 0.8},
        "n": {"m": 0.8, "r": 0.9},
        "p": {"q": 0.9},
        "q": {"c": 0.5, "k": 0.1, "p": 0.9},
        "r": {"n": 0.9, "j": 0.9},
        "s": {"c": 0.5, "ç": 0.1, "x": 0.5, "z": 0.5},
        "t": {"d": 0.9},
        "v": {"f": 0.8, "w": 0.1},
        "w": {"v": 0.1},
        "x": {"c": 0.5, "k": 0.5, "q": 0.5, "s": 0.5},
        "z": {"s": 0.5}
    },

    distanceBetweenChars: function (c1, c2) {
        if (c1 == c2) {
            return 0;
        }
        if (this.oDistanceBetweenChars.hasOwnProperty(c1) && this.oDistanceBetweenChars[c1].hasOwnProperty(c2)) {
            return this.oDistanceBetweenChars[c1][c2];
        }
        return 1;
    },

    _xTransCharsForSimplification: new Map([
        ['à', 'a'],  ['é', 'é'],  ['î', 'i'],  ['ô', 'o'],  ['û', 'u'],  ['ÿ', 'y'],
        ['â', 'a'],  ['è', 'é'],  ['ï', 'i'],  ['ö', 'o'],  ['ù', 'u'],  ['ŷ', 'y'],
        ['ä', 'a'],  ['ê', 'é'],  ['í', 'i'],  ['ó', 'o'],  ['ü', 'u'],  ['ý', 'y'],
        ['á', 'a'],  ['ë', 'é'],  ['ì', 'i'],  ['ò', 'o'],  ['ú', 'u'],  ['ỳ', 'y'],
        ['ā', 'a'],  ['ē', 'é'],  ['ī', 'i'],  ['ō', 'o'],  ['ū', 'u'],  ['ȳ', 'y'],
        ['ç', 'c'],  ['ñ', 'n'],  ['k', 'q'],  ['w', 'v'],
        ['œ', 'oe'], ['æ', 'ae'],
        ['ſ', 's'],  ['ffi', 'ffi'],  ['ffl', 'ffl'],  ['ff', 'ff'],  ['ſt', 'ft'],  ['fi', 'fi'],  ['fl', 'fl'],  ['st', 'st'],
        ["⁰", "0"], ["¹", "1"], ["²", "2"], ["³", "3"], ["⁴", "4"], ["⁵", "5"], ["⁶", "6"], ["⁷", "7"], ["⁸", "8"], ["⁹", "9"],
        ["₀", "0"], ["₁", "1"], ["₂", "2"], ["₃", "3"], ["₄", "4"], ["₅", "5"], ["₆", "6"], ["₇", "7"], ["₈", "8"], ["₉", "9"]
    ]),

Modified graphspell-js/str_transform.js from [d8c3ab3e0d] to [b9cc7d5a41].

2
3
4
5
6
7
8






9
10
11
12
13
14
15
..
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78

79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103

/* jshint esversion:6, -W097 */
/* jslint esversion:6 */
/* global exports, console */

"use strict";








// Note: 48 is the ASCII code for "0"

var str_transform = {

    getNgrams: function (sWord, n=2) {
        let lNgrams = [];
................................................................................
                    table[i+1][j+1] = 0;
                }
            }
        }
        return longestCommonSubstring;
    },

    distanceDamerauLevenshtein2: function (s1, s2) {
        // distance of Damerau-Levenshtein between <s1> and <s2>
        // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
        try {
            let nLen1 = s1.length;
            let nLen2 = s2.length;
            let matrix = [];
            for (let i = 0;  i <= nLen1;  i++) {
                matrix[i] = new Array(nLen2 + 1);
            }
            for (let i = 0;  i <= nLen1;  i++) {
                matrix[i][0] = i;
            }
            for (let j = 0;  j <= nLen2;  j++) {
                matrix[0][j] = j;
            }
            for (let i = 1;  i <= nLen1;  i++) {
                for (let j = 1;  j <= nLen2;  j++) {
                    let nCost = (s1[i] === s2[j]) ? 0 : 1;

                    matrix[i][j] = Math.min(
                        matrix[i-1][j] + 1,         // Deletion
                        matrix[i][j-1] + 1,         // Insertion
                        matrix[i-1][j-1] + nCost    // Substitution
                    );
                    if (i > 1 && j > 1 && s1[i] == s2[j-1] && s1[i-1] == s2[j]) {
                        matrix[i][j] = Math.min(matrix[i][j], matrix[i-2][j-2] + nCost);  // Transposition
                    }
                }
            }
            return matrix[nLen1][nLen2];
        }
        catch (e) {
            console.error(e);
        }
    },

    distanceDamerauLevenshtein: function (s1, s2) {
        // distance of Damerau-Levenshtein between <s1> and <s2>
        // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
        try {
            let nLen1 = s1.length;
            let nLen2 = s2.length;
            let INF = nLen1 + nLen2;
            let matrix = [];







>
>
>
>
>
>







 







|






|
|

|


|




|
>










|






|







2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
..
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110

/* jshint esversion:6, -W097 */
/* jslint esversion:6 */
/* global exports, console */

"use strict";

if (typeof(process) !== 'undefined') {
    var char_player = require("./char_player.js");
} else if (typeof(require) !== 'undefined') {
    var char_player = require("resource://grammalecte/graphspell/char_player.js");
}


// Note: 48 is the ASCII code for "0"

var str_transform = {

    getNgrams: function (sWord, n=2) {
        let lNgrams = [];
................................................................................
                    table[i+1][j+1] = 0;
                }
            }
        }
        return longestCommonSubstring;
    },

    distanceDamerauLevenshtein: function (s1, s2) {
        // distance of Damerau-Levenshtein between <s1> and <s2>
        // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
        try {
            let nLen1 = s1.length;
            let nLen2 = s2.length;
            let matrix = [];
            for (let i = 0;  i <= nLen1+1;  i++) {
                matrix[i] = new Array(nLen2 + 2);
            }
            for (let i = 0;  i <= nLen1+1;  i++) {
                matrix[i][0] = i;
            }
            for (let j = 0;  j <= nLen2+1;  j++) {
                matrix[0][j] = j;
            }
            for (let i = 1;  i <= nLen1;  i++) {
                for (let j = 1;  j <= nLen2;  j++) {
                    //let nCost = (s1[i-1] === s2[j-1]) ? 0 : 1;
                    let nCost = char_player.distanceBetweenChars(s1[i-1], s2[j-1]);
                    matrix[i][j] = Math.min(
                        matrix[i-1][j] + 1,         // Deletion
                        matrix[i][j-1] + 1,         // Insertion
                        matrix[i-1][j-1] + nCost    // Substitution
                    );
                    if (i > 1 && j > 1 && s1[i] == s2[j-1] && s1[i-1] == s2[j]) {
                        matrix[i][j] = Math.min(matrix[i][j], matrix[i-2][j-2] + nCost);  // Transposition
                    }
                }
            }
            return Math.floor(matrix[nLen1][nLen2]);
        }
        catch (e) {
            console.error(e);
        }
    },

    distanceDamerauLevenshtein2: function (s1, s2) {
        // distance of Damerau-Levenshtein between <s1> and <s2>
        // https://fr.wikipedia.org/wiki/Distance_de_Damerau-Levenshtein
        try {
            let nLen1 = s1.length;
            let nLen2 = s2.length;
            let INF = nLen1 + nLen2;
            let matrix = [];

Modified graphspell/char_player.py from [d15991830e] to [72875ebb27].

11
12
13
14
15
16
17
18







































19
20
21
22
23
24
25
26
27
28
29
30
31
    'ſ': 's',  'ffi': 'ffi',  'ffl': 'ffl',  'ff': 'ff',  'ſt': 'ft',  'fi': 'fi',  'fl': 'fl',  'st': 'st'
})

def spellingNormalization (sWord):
    "nomalization NFC and removing ligatures"
    return unicodedata.normalize("NFC", sWord.translate(_xTransCharsForSpelling))









































_xTransCharsForSimplification = str.maketrans({
    'à': 'a',  'é': 'é',  'î': 'i',  'ô': 'o',  'û': 'u',  'ÿ': 'i',  "y": "i",
    'â': 'a',  'è': 'é',  'ï': 'i',  'ö': 'o',  'ù': 'u',  'ŷ': 'i',
    'ä': 'a',  'ê': 'é',  'í': 'i',  'ó': 'o',  'ü': 'u',  'ý': 'i',
    'á': 'a',  'ë': 'é',  'ì': 'i',  'ò': 'o',  'ú': 'u',  'ỳ': 'i',
    'ā': 'a',  'ē': 'é',  'ī': 'i',  'ō': 'o',  'ū': 'u',  'ȳ': 'i',
    'ç': 'c',  'ñ': 'n',  'k': 'q',  'w': 'v',
    'œ': 'oe',  'æ': 'ae',
    'ſ': 's',  'ffi': 'ffi',  'ffl': 'ffl',  'ff': 'ff',  'ſt': 'ft',  'fi': 'fi',  'fl': 'fl',  'st': 'st',
    "⁰": "0", "¹": "1", "²": "2", "³": "3", "⁴": "4", "⁵": "5", "⁶": "6", "⁷": "7", "⁸": "8", "⁹": "9",
    "₀": "0", "₁": "1", "₂": "2", "₃": "3", "₄": "4", "₅": "5", "₆": "6", "₇": "7", "₈": "8", "₉": "9"
})









>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

|
|
|
|
|







11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
    'ſ': 's',  'ffi': 'ffi',  'ffl': 'ffl',  'ff': 'ff',  'ſt': 'ft',  'fi': 'fi',  'fl': 'fl',  'st': 'st'
})

def spellingNormalization (sWord):
    "nomalization NFC and removing ligatures"
    return unicodedata.normalize("NFC", sWord.translate(_xTransCharsForSpelling))


dDistanceBetweenChars = {
    "a": {},
    "e": {"é": 0.5},
    "é": {"e": 0.5},
    "i": {"y": 0.2},
    "o": {},
    "u": {},
    "y": {"i": 0.3},
    "b": {"d": 0.8, "h": 0.9},
    "c": {"ç": 0.1, "k": 0.5, "q": 0.5, "s": 0.5, "x": 0.5, "z": 0.8},
    "d": {"b": 0.8},
    "f": {"v": 0.8},
    "g": {"j": 0.5},
    "h": {"b": 0.9},
    "j": {"g": 0.5, "i": 0.9},
    "k": {"c": 0.5, "q": 0.1, "x": 0.5},
    "l": {"i": 0.9},
    "m": {"n": 0.8},
    "n": {"m": 0.8, "r": 0.9},
    "p": {"q": 0.9},
    "q": {"c": 0.5, "k": 0.1, "p": 0.9},
    "r": {"n": 0.9, "j": 0.9},
    "s": {"c": 0.5, "ç": 0.1, "x": 0.5, "z": 0.5},
    "t": {"d": 0.9},
    "v": {"f": 0.8, "w": 0.1},
    "w": {"v": 0.1},
    "x": {"c": 0.5, "k": 0.5, "q": 0.5, "s": 0.5},
    "z": {"s": 0.5}
}


def distanceBetweenChars (c1, c2):
    if c1 == c2:
        return 0
    if c1 not in dDistanceBetweenChars:
        return 1
    return dDistanceBetweenChars[c1].get(c2, 1)


_xTransCharsForSimplification = str.maketrans({
    'à': 'a',  'é': 'é',  'î': 'i',  'ô': 'o',  'û': 'u',  'ÿ': 'y',
    'â': 'a',  'è': 'é',  'ï': 'i',  'ö': 'o',  'ù': 'u',  'ŷ': 'y',
    'ä': 'a',  'ê': 'é',  'í': 'i',  'ó': 'o',  'ü': 'u',  'ý': 'y',
    'á': 'a',  'ë': 'é',  'ì': 'i',  'ò': 'o',  'ú': 'u',  'ỳ': 'y',
    'ā': 'a',  'ē': 'é',  'ī': 'i',  'ō': 'o',  'ū': 'u',  'ȳ': 'y',
    'ç': 'c',  'ñ': 'n',  'k': 'q',  'w': 'v',
    'œ': 'oe',  'æ': 'ae',
    'ſ': 's',  'ffi': 'ffi',  'ffl': 'ffl',  'ff': 'ff',  'ſt': 'ft',  'fi': 'fi',  'fl': 'fl',  'st': 'st',
    "⁰": "0", "¹": "1", "²": "2", "³": "3", "⁴": "4", "⁵": "5", "⁶": "6", "⁷": "7", "⁸": "8", "⁹": "9",
    "₀": "0", "₁": "1", "₂": "2", "₃": "3", "₄": "4", "₅": "5", "₆": "6", "₇": "7", "₈": "8", "₉": "9"
})

Modified graphspell/str_transform.py from [7dcad03ac9] to [452d0bdcef].

1
2
3
4
5



6
7
8
9
10
11
12
..
40
41
42
43
44
45
46
47

48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
"""
Operations on strings:
- calculate distance between two strings
- transform strings with transformation codes
"""




#### Ngrams

def getNgrams (sWord, n=2):
    "return a list of Ngrams strings"
    return [ sWord[i:i+n]  for i in range(len(sWord)-n+1) ]

................................................................................
    nLen2 = len(s2)
    for i in range(-1, nLen1+1):
        d[i, -1] = i + 1
    for j in range(-1, nLen2+1):
        d[-1, j] = j + 1
    for i in range(nLen1):
        for j in range(nLen2):
            nCost = 0  if s1[i] == s2[j]  else 1

            d[i, j] = min(
                d[i-1, j]   + 1,        # Deletion
                d[i,   j-1] + 1,        # Insertion
                d[i-1, j-1] + nCost,    # Substitution
            )
            if i and j and s1[i] == s2[j-1] and s1[i-1] == s2[j]:
                d[i, j] = min(d[i, j], d[i-2, j-2] + nCost)     # Transposition
    return d[nLen1-1, nLen2-1]


def distanceSift4 (s1, s2, nMaxOffset=5):
    "implementation of general Sift4."
    # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
    if not s1:
        return len(s2)





>
>
>







 







|
>







|







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
..
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
"""
Operations on strings:
- calculate distance between two strings
- transform strings with transformation codes
"""

from .char_player import distanceBetweenChars


#### Ngrams

def getNgrams (sWord, n=2):
    "return a list of Ngrams strings"
    return [ sWord[i:i+n]  for i in range(len(sWord)-n+1) ]

................................................................................
    nLen2 = len(s2)
    for i in range(-1, nLen1+1):
        d[i, -1] = i + 1
    for j in range(-1, nLen2+1):
        d[-1, j] = j + 1
    for i in range(nLen1):
        for j in range(nLen2):
            #nCost = 0  if s1[i] == s2[j]  else 1
            nCost = distanceBetweenChars(s1[i], s2[j])
            d[i, j] = min(
                d[i-1, j]   + 1,        # Deletion
                d[i,   j-1] + 1,        # Insertion
                d[i-1, j-1] + nCost,    # Substitution
            )
            if i and j and s1[i] == s2[j-1] and s1[i-1] == s2[j]:
                d[i, j] = min(d[i, j], d[i-2, j-2] + nCost)     # Transposition
    return int(d[nLen1-1, nLen2-1])


def distanceSift4 (s1, s2, nMaxOffset=5):
    "implementation of general Sift4."
    # https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
    if not s1:
        return len(s2)