Grammalecte  Check-in [2777d8cef6]

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:[core] new regex for sentence splitting, generator of sentences
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | core
Files: files | file ages | folders
SHA3-256:2777d8cef6c55a83e05311347076867a67f3cf007a7f3e7298d41a770d8510f7
User & Date: olr 2019-05-24 12:21:35
Context
2019-05-24
14:12
[core] sentence splitting: code clarification check-in: b52cb827b1 user: olr tags: core, trunk
12:21
[core] new regex for sentence splitting, generator of sentences check-in: 2777d8cef6 user: olr tags: core, trunk
12:20
[fr] commentaire check-in: 8fd1fbf7f3 user: olr tags: fr, trunk
Changes

Changes to gc_core/js/text.js.

5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23







24
25
26
27
28
29
30
/* global require, exports, console */

"use strict";


var text = {

    // End-of-sentence pattern: one terminal punctuation mark (. ? ! : ; …) followed by
    // one or more characters from the second class (spaces, further punctuation, quotes,
    // a closing parenthesis or dashes). The lookahead requires, without consuming it,
    // an uppercase letter (A-Z, É, È, Î, Ô), so a match ends where the next sentence starts.
    // The "g" flag makes exec() resume from lastIndex on each call.
    _zEndOfSentence: new RegExp ('[.?!:;…][   .?!…»«“”"‘’)–—]+(?=[A-ZÉÈÎÔ])', "g"),

    getSentenceBoundaries: function* (sText) {
        // generator: returns start and end of sentences found in <sText>
        let iStart = 0;
        let m;
        while ((m = this._zEndOfSentence.exec(sText)) !== null) {
            yield [iStart, this._zEndOfSentence.lastIndex];
            iStart = this._zEndOfSentence.lastIndex;
        }
        yield [iStart, sText.length];
    },








    getParagraph: function* (sText, sSepParagraph = "\n") {
        // generator: returns paragraphs of text
        let iStart = 0;
        let iEnd = 0;
        sText = sText.replace("\r\n", "\n").replace("\r", "\n");
        while ((iEnd = sText.indexOf(sSepParagraph, iStart)) !== -1) {







|











>
>
>
>
>
>
>







5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
/* global require, exports, console */

"use strict";


var text = {

    // End-of-sentence pattern: a run of terminal punctuation (. ? ! : ; …), then a run of
    // the space characters listed in the second class, then an optional closing quote (» ” ’).
    // The lookahead requires, without consuming it, an optional opening quote (« " “ ‘)
    // followed by an uppercase letter (A-Z, É, È, Î, Ô) or a dash (– —), so a match ends
    // exactly where the next sentence starts.
    // The "g" flag makes exec() resume from lastIndex on each call.
    _zEndOfSentence: new RegExp ('[.?!:;…]+[   ]+[»”’]?(?=[«"“‘]?[A-ZÉÈÎÔ–—])', "g"),

    getSentenceBoundaries: function* (sText) {
        // generator: returns start and end of sentences found in <sText>
        let iStart = 0;
        let m;
        while ((m = this._zEndOfSentence.exec(sText)) !== null) {
            yield [iStart, this._zEndOfSentence.lastIndex];
            iStart = this._zEndOfSentence.lastIndex;
        }
        yield [iStart, sText.length];
    },

    getSentence: function* (sText) {
        // generator: returns sentences found in <sText>
        for (let [iStart, iEnd] of this.getSentenceBoundaries(sText)) {
            yield sText.slice(iStart, iEnd);
        }
    },

    getParagraph: function* (sText, sSepParagraph = "\n") {
        // generator: returns paragraphs of text
        let iStart = 0;
        let iEnd = 0;
        sText = sText.replace("\r\n", "\n").replace("\r", "\n");
        while ((iEnd = sText.indexOf(sSepParagraph, iStart)) !== -1) {

Changes to gc_core/py/text.py.

5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21






22
23
24
25
26
27
28
"""

import re
import textwrap
from itertools import chain


# End-of-sentence pattern: one terminal punctuation mark (. ? ! : ; …) followed by one or
# more non-word characters (\W). The lookahead requires, without consuming it, an uppercase
# letter (A-Z, É, È, Î, Ô), so a match ends exactly where the next sentence starts.
_zEndOfSentence = re.compile(r'[.?!:;…]\W+(?=[A-ZÉÈÎÔ])')

def getSentenceBoundaries (sText):
    "generator: yields a (start, end) pair of offsets for each sentence found in <sText>"
    nBegin = 0
    for nStop in (oMatch.end()  for oMatch in _zEndOfSentence.finditer(sText)):
        # the end of a separator is also the start of the next sentence
        yield (nBegin, nStop)
        nBegin = nStop
    # whatever follows the last separator (the whole text if none matched)
    yield (nBegin, len(sText))








def getParagraph (sText):
    "generator: returns paragraphs of text"
    iStart = 0
    sText = sText.replace("\r\n", "\n").replace("\r", "\n")
    iEnd = sText.find("\n", iStart)
    while iEnd != -1:







|









>
>
>
>
>
>







5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
"""

import re
import textwrap
from itertools import chain


# End-of-sentence pattern: a run of terminal punctuation (. ? ! : ; …), then a run of
# the space characters listed in the second class, then an optional closing quote (» ” ’).
# The lookahead requires, without consuming it, an optional opening quote (« " “ ‘)
# followed by an uppercase letter (A-Z, É, È, Î, Ô) or a dash (– —), so a match ends
# exactly where the next sentence starts.
_zEndOfSentence = re.compile(r'[.?!:;…]+[   ]+[»”’]?(?=[«"“‘]?[A-ZÉÈÎÔ–—])')

def getSentenceBoundaries (sText):
    "generator: yields a (start, end) pair of offsets for each sentence found in <sText>"
    nBegin = 0
    for nStop in (oMatch.end()  for oMatch in _zEndOfSentence.finditer(sText)):
        # the end of a separator is also the start of the next sentence
        yield (nBegin, nStop)
        nBegin = nStop
    # whatever follows the last separator (the whole text if none matched)
    yield (nBegin, len(sText))


def getSentence (sText):
    "generator: yields, in order, each sentence of <sText> as a substring"
    yield from (sText[nFrom:nTo]  for nFrom, nTo in getSentenceBoundaries(sText))


def getParagraph (sText):
    "generator: returns paragraphs of text"
    iStart = 0
    sText = sText.replace("\r\n", "\n").replace("\r", "\n")
    iEnd = sText.find("\n", iStart)
    while iEnd != -1: