Grammalecte  Check-in [311ccab788]

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment: [core] sentence splitting: code clarification
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | core
Files: files | file ages | folders
SHA3-256: 311ccab78875d456cb1047be38072b6556d2c927d802c96c3896d65615cad872
User & Date: olr 2019-05-24 08:50:20
Context
2019-05-24
11:21
[fr] conversion: regex rules -> graph rules check-in: a5b3aff838 user: olr tags: fr, trunk
08:50
[core] sentence splitting: code clarification check-in: 311ccab788 user: olr tags: core, trunk
07:42
[core] move getSentenceBoundaries from gc_engine to text module check-in: bf0a1bdd5d user: olr tags: core, trunk
Changes

Changes to gc_core/js/text.js.

5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23

24
25
26
27
28
29
30
/* global require, exports, console */

"use strict";


var text = {

    _zEndOfSentence: new RegExp ('([.?!:;…][   .?!…»«“”"‘’)–—]+(?=[A-ZÉÈÎÔ])|.$)', "g"),
    _zBeginOfParagraph: new RegExp ("^[-  –—.,;?!…]*", "ig"),

    getSentenceBoundaries: function* (sText) {
        // Generator: yields one [iStart, iEnd] pair of offsets per sentence found in <sText>.
        // Both regexes used here are stored on this object and built with the "g" flag,
        // so their lastIndex cursor is shared state that persists between exec() calls.
        // exec() is called only for its side effect on lastIndex; the match object is never read.
        // NOTE(review): if lastIndex is non-zero when this runs, the ^-anchored pattern
        // may fail to match and reset lastIndex to 0 — confirm the intended behavior.
        let mBeginOfSentence = this._zBeginOfParagraph.exec(sText);
        // Start of the first sentence: the cursor left behind by the exec() call above.
        let iStart = this._zBeginOfParagraph.lastIndex;
        let m;
        // Each successful exec() resumes from the previous lastIndex; a null result ends the scan.
        while ((m = this._zEndOfSentence.exec(sText)) !== null) {
            yield [iStart, this._zEndOfSentence.lastIndex];
            // The next sentence begins where this match ended.
            iStart = this._zEndOfSentence.lastIndex;
        }

    },

    getParagraph: function* (sText, sSepParagraph = "\n") {
        // generator: returns paragraphs of text
        let iStart = 0;
        let iEnd = 0;
        sText = sText.replace("\r\n", "\n").replace("\r", "\n");







|
<



<
|





>







5
6
7
8
9
10
11
12

13
14
15

16
17
18
19
20
21
22
23
24
25
26
27
28
29
/* global require, exports, console */

"use strict";


var text = {

    _zEndOfSentence: new RegExp ('[.?!:;…][   .?!…»«“”"‘’)–—]+(?=[A-ZÉÈÎÔ])', "g"),


    getSentenceBoundaries: function* (sText) {
        // generator: returns start and end of sentences found in <sText>

        let iStart = 0;
        let m;
        while ((m = this._zEndOfSentence.exec(sText)) !== null) {
            yield [iStart, this._zEndOfSentence.lastIndex];
            iStart = this._zEndOfSentence.lastIndex;
        }
        yield [iStart, sText.length];
    },

    getParagraph: function* (sText, sSepParagraph = "\n") {
        // generator: returns paragraphs of text
        let iStart = 0;
        let iEnd = 0;
        sText = sText.replace("\r\n", "\n").replace("\r", "\n");

Changes to gc_core/py/text.py.

5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20

21
22
23
24
25
26
27
"""

import re
import textwrap
from itertools import chain


_zEndOfSentence = re.compile(r'([.?!:;…]\W+(?=[A-ZÉÈÎÔ])|.$)')
_zBeginOfParagraph = re.compile(r"^\W*")

def getSentenceBoundaries (sText):
    """Generator: yield one (start, end) offset pair per sentence found in <sText>.

    The first pair starts after the leading run matched by _zBeginOfParagraph;
    every following pair starts where the previous _zEndOfSentence match ended.
    Nothing is yielded when _zEndOfSentence never matches.
    """
    nFrom = _zBeginOfParagraph.match(sText).end()
    for nTo in map(lambda m: m.end(), _zEndOfSentence.finditer(sText)):
        yield (nFrom, nTo)
        nFrom = nTo



def getParagraph (sText):
    "generator: returns paragraphs of text"
    iStart = 0
    sText = sText.replace("\r\n", "\n").replace("\r", "\n")
    iEnd = sText.find("\n", iStart)







|
<



|



>







5
6
7
8
9
10
11
12

13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
"""

import re
import textwrap
from itertools import chain


_zEndOfSentence = re.compile(r'[.?!:;…]\W+(?=[A-ZÉÈÎÔ])')


def getSentenceBoundaries (sText):
    """Generator: yield one (start, end) offset pair per sentence found in <sText>.

    The pairs are contiguous: the first starts at 0, each boundary is the end of
    a _zEndOfSentence match, and the last pair always ends at len(sText).
    """
    lEnds = [ m.end()  for m in _zEndOfSentence.finditer(sText) ]
    # pair every start (0, then each boundary) with the next boundary (then the text length)
    yield from zip([0] + lEnds, lEnds + [len(sText)])


def getParagraph (sText):
    "generator: returns paragraphs of text"
    iStart = 0
    sText = sText.replace("\r\n", "\n").replace("\r", "\n")
    iEnd = sText.find("\n", iStart)

Changes to gc_lang/fr/rules.grx.

1590
1591
1592
1593
1594
1595
1596



1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608

!!!
!!!
!!! Processeur: épuration des signes inutiles et quelques simplifications                         !!
!!!
!!!




# fin de phrase
__<s>(p_fin_de_phrase)__        [.?!:;…][ .?!… »”")]*$  <<- ~>> *

# Guillemets et exposants
__<s>(p_guillemets_exposants)__ [«»“”"„`¹²³⁴⁵⁶⁷⁸⁹⁰]+ <<- ~>> *

# Chapitres et références
__<s>(p_chapitre_référence)__   [\[({][\dIVXLCDM]+, \d+[\])}]   <js>[\[\(\{][\dIVXLCDM]+, \d+[\]\)\}]</js>   <<- ~>> *

# le, la ou les chose(s)
__[i>(p_le_ou_les)__            l[ea] ou les {w_2}([(]s[)]) @@$ <<- ~1>> s
__[i](p_le_ou_la)__             l(e ou la|a ou le) {w_2} @@1 <<- ~1>> ’







>
>
>

|


|







1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611

!!!
!!!
!!! Processeur: épuration des signes inutiles et quelques simplifications                         !!
!!!
!!!

# début de phrase
__<s>(p_début_de_phrase)__      ^[ .?!:;–—•·… »«‘’“”\"'¿¡-]+  <<- ~>> *

# fin de phrase
__<s>(p_fin_de_phrase)__        [ .?!:;–—•·… »«‘’“”\"'¿¡-]+$  <<- ~>> *

# Guillemets et exposants
__<s>(p_guillemets_exposants)__ [«»“”\"„`¹²³⁴⁵⁶⁷⁸⁹⁰]+ <<- ~>> *

# Chapitres et références
__<s>(p_chapitre_référence)__   [\[({][\dIVXLCDM]+, \d+[\])}]   <js>[\[\(\{][\dIVXLCDM]+, \d+[\]\)\}]</js>   <<- ~>> *

# le, la ou les chose(s)
__[i>(p_le_ou_les)__            l[ea] ou les {w_2}([(]s[)]) @@$ <<- ~1>> s
__[i](p_le_ou_la)__             l(e ou la|a ou le) {w_2} @@1 <<- ~1>> ’