Grammalecte  Check-in [bf0a1bdd5d]

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:[core] move getSentenceBoundaries from gc_engine to text module
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | core
Files: files | file ages | folders
SHA3-256:bf0a1bdd5dcdb4b2f65922bfa97012a4eb4f1ab5df68bf2a797ee178156b1742
User & Date: olr 2019-05-24 07:42:20
Context
2019-05-24
08:50
[core] sentence splitting: code clarification check-in: 311ccab788 user: olr tags: core, trunk
07:42
[core] move getSentenceBoundaries from gc_engine to text module check-in: bf0a1bdd5d user: olr tags: core, trunk
07:03
[fr] faux positifs et ajustements check-in: 2f921877e7 user: olr tags: fr, trunk
Changes

Changes to gc_core/js/lang_core/gc_engine.js.

159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
...
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
    },

    //// Parsing

    parse: function (sText, sCountry="${country_default}", bDebug=false, dOptions=null, bContext=false) {
        let oText = new TextParser(sText);
        return oText.parse(sCountry, bDebug, dOptions, bContext);
    },

    _zEndOfSentence: new RegExp ('([.?!:;…][   .?!…»«“”"‘’)–—]+(?=[A-ZÉÈÎÔ])|.$)', "g"),
    _zBeginOfParagraph: new RegExp ("^[-  –—.,;?!…]*", "ig"),
    _zEndOfParagraph: new RegExp ("[-  .,;?!…–—]*$", "ig"),

    getSentenceBoundaries: function* (sText) {
        let mBeginOfSentence = this._zBeginOfParagraph.exec(sText);
        let iStart = this._zBeginOfParagraph.lastIndex;
        let m;
        while ((m = this._zEndOfSentence.exec(sText)) !== null) {
            yield [iStart, this._zEndOfSentence.lastIndex];
            iStart = this._zEndOfSentence.lastIndex;
        }
    }
};


class TextParser {

    constructor (sText) {
................................................................................
            this.sText = this.sText.replace(/‑/g, "-"); // nobreakdash
        }
        if (this.sText.includes("@@")) {
            this.sText = this.sText.replace(/@@+/g, "");
        }

        // parse sentence
        for (let [iStart, iEnd] of gc_engine.getSentenceBoundaries(this.sText)) {
            try {
                this.sSentence = this.sText.slice(iStart, iEnd);
                this.sSentence0 = this.sText0.slice(iStart, iEnd);
                this.nOffsetWithinParagraph = iStart;
                this.lToken = Array.from(_oTokenizer.genTokens(this.sSentence, true));
                this.dTokenPos.clear();
                for (let dToken of this.lToken) {







<
<
<
<
<
<
<
<
<
<
<
<
<
<







 







|







159
160
161
162
163
164
165














166
167
168
169
170
171
172
...
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
    },

    //// Parsing

    parse: function (sText, sCountry="${country_default}", bDebug=false, dOptions=null, bContext=false) {
        let oText = new TextParser(sText);
        return oText.parse(sCountry, bDebug, dOptions, bContext);














    }
};


class TextParser {

    constructor (sText) {
................................................................................
            this.sText = this.sText.replace(/‑/g, "-"); // nobreakdash
        }
        if (this.sText.includes("@@")) {
            this.sText = this.sText.replace(/@@+/g, "");
        }

        // parse sentence
        for (let [iStart, iEnd] of text.getSentenceBoundaries(this.sText)) {
            try {
                this.sSentence = this.sText.slice(iStart, iEnd);
                this.sSentence0 = this.sText0.slice(iStart, iEnd);
                this.nOffsetWithinParagraph = iStart;
                this.lToken = Array.from(_oTokenizer.genTokens(this.sSentence, true));
                this.dTokenPos.clear();
                for (let dToken of this.lToken) {

Changes to gc_core/js/text.js.

4
5
6
7
8
9
10















11
12
13
14
15
16
17
/* jslint esversion:6 */
/* global require, exports, console */

"use strict";


var text = {















    getParagraph: function* (sText, sSepParagraph = "\n") {
        // generator: returns paragraphs of text
        let iStart = 0;
        let iEnd = 0;
        sText = sText.replace("\r\n", "\n").replace("\r", "\n");
        while ((iEnd = sText.indexOf(sSepParagraph, iStart)) !== -1) {
            yield sText.slice(iStart, iEnd);







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
/* jslint esversion:6 */
/* global require, exports, console */

"use strict";


var text = {

    _zEndOfSentence: new RegExp ('([.?!:;…][   .?!…»«“”"‘’)–—]+(?=[A-ZÉÈÎÔ])|.$)', "g"),
    _zBeginOfParagraph: new RegExp ("^[-  –—.,;?!…]*", "ig"),

    getSentenceBoundaries: function* (sText) {
        // generator: returns start and end of sentences found in <sText>
        let mBeginOfSentence = this._zBeginOfParagraph.exec(sText);
        let iStart = this._zBeginOfParagraph.lastIndex;
        let m;
        while ((m = this._zEndOfSentence.exec(sText)) !== null) {
            yield [iStart, this._zEndOfSentence.lastIndex];
            iStart = this._zEndOfSentence.lastIndex;
        }
    },

    getParagraph: function* (sText, sSepParagraph = "\n") {
        // generator: returns paragraphs of text
        let iStart = 0;
        let iEnd = 0;
        sText = sText.replace("\r\n", "\n").replace("\r", "\n");
        while ((iEnd = sText.indexOf(sSepParagraph, iStart)) !== -1) {
            yield sText.slice(iStart, iEnd);

Changes to gc_core/py/lang_core/gc_engine.py.

6
7
8
9
10
11
12



13
14
15
16
17
18
19
...
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
...
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
import re
import traceback
#import unicodedata
from itertools import chain

from ..graphspell.spellchecker import SpellChecker
from ..graphspell.echo import echo



from . import gc_options

try:
    # LibreOffice / OpenOffice
    from com.sun.star.linguistic2 import SingleProofreadingError
    from com.sun.star.text.TextMarkupType import PROOFREADING
    from com.sun.star.beans import PropertyValue
................................................................................
def resetOptions ():
    "set options to default values"
    global _dOptions
    _dOptions = getDefaultOptions()


#### Parsing

_zEndOfSentence = re.compile(r'([.?!:;…]\W+(?=[A-ZÉÈÎÔ])|.$)')
_zBeginOfParagraph = re.compile(r"^\W*")
_zEndOfParagraph = re.compile(r"\W*$")

def _getSentenceBoundaries (sText):
    iStart = _zBeginOfParagraph.match(sText).end()
    for m in _zEndOfSentence.finditer(sText):
        yield (iStart, m.end())
        iStart = m.end()


def parse (sText, sCountry="${country_default}", bDebug=False, dOptions=None, bContext=False):
    "init point to analyze a text"
    oText = TextParser(sText)
    return oText.parse(sCountry, bDebug, dOptions, bContext)


................................................................................
            sText = sText.replace("'", "’")
        if "‑" in sText:
            sText = sText.replace("‑", "-") # nobreakdash
        if "@@" in sText:
            sText = re.sub("@@+", "", sText)

        # parse sentences
        for iStart, iEnd in _getSentenceBoundaries(sText):
            if 4 < (iEnd - iStart) < 2000:
                try:
                    self.sSentence = sText[iStart:iEnd]
                    self.sSentence0 = self.sText0[iStart:iEnd]
                    self.nOffsetWithinParagraph = iStart
                    self.lToken = list(_oTokenizer.genTokens(self.sSentence, True))
                    self.dTokenPos = { dToken["nStart"]: dToken  for dToken in self.lToken  if dToken["sType"] != "INFO" }







>
>
>







 







<
<
<
<
<
<
<
<
<
<
<







 







|







6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
...
189
190
191
192
193
194
195











196
197
198
199
200
201
202
...
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
import re
import traceback
#import unicodedata
from itertools import chain

from ..graphspell.spellchecker import SpellChecker
from ..graphspell.echo import echo

from .. import text

from . import gc_options

try:
    # LibreOffice / OpenOffice
    from com.sun.star.linguistic2 import SingleProofreadingError
    from com.sun.star.text.TextMarkupType import PROOFREADING
    from com.sun.star.beans import PropertyValue
................................................................................
def resetOptions ():
    "set options to default values"
    global _dOptions
    _dOptions = getDefaultOptions()


#### Parsing












def parse (sText, sCountry="${country_default}", bDebug=False, dOptions=None, bContext=False):
    "init point to analyze a text"
    oText = TextParser(sText)
    return oText.parse(sCountry, bDebug, dOptions, bContext)


................................................................................
            sText = sText.replace("'", "’")
        if "‑" in sText:
            sText = sText.replace("‑", "-") # nobreakdash
        if "@@" in sText:
            sText = re.sub("@@+", "", sText)

        # parse sentences
        for iStart, iEnd in text.getSentenceBoundaries(sText):
            if 4 < (iEnd - iStart) < 2000:
                try:
                    self.sSentence = sText[iStart:iEnd]
                    self.sSentence0 = self.sText0[iStart:iEnd]
                    self.nOffsetWithinParagraph = iStart
                    self.lToken = list(_oTokenizer.genTokens(self.sSentence, True))
                    self.dTokenPos = { dToken["nStart"]: dToken  for dToken in self.lToken  if dToken["sType"] != "INFO" }

Changes to gc_core/py/text.py.

1
2
3
4
5
6

7
8
9











10
11
12
13
14
15
16
#!python3

"""
Text tools
"""


import textwrap
from itertools import chain













def getParagraph (sText):
    "generator: returns paragraphs of text"
    iStart = 0
    sText = sText.replace("\r\n", "\n").replace("\r", "\n")
    iEnd = sText.find("\n", iStart)
    while iEnd != -1:






>



>
>
>
>
>
>
>
>
>
>
>







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
#!python3

"""
Text tools
"""

import re
import textwrap
from itertools import chain


_zEndOfSentence = re.compile(r'([.?!:;…]\W+(?=[A-ZÉÈÎÔ])|.$)')
_zBeginOfParagraph = re.compile(r"^\W*")

def getSentenceBoundaries (sText):
    "generator: returns start and end of sentences found in <sText>"
    iStart = _zBeginOfParagraph.match(sText).end()
    for m in _zEndOfSentence.finditer(sText):
        yield (iStart, m.end())
        iStart = m.end()


def getParagraph (sText):
    "generator: returns paragraphs of text"
    iStart = 0
    sText = sText.replace("\r\n", "\n").replace("\r", "\n")
    iEnd = sText.find("\n", iStart)
    while iEnd != -1: