Grammalecte  Check-in [311ccab788]

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment: [core] sentence splitting: code clarification
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | core
Files: files | file ages | folders
SHA3-256: 311ccab78875d456cb1047be38072b6556d2c927d802c96c3896d65615cad872
User & Date: olr 2019-05-24 08:50:20
Context
2019-05-24
11:21
[fr] conversion: regex rules -> graph rules check-in: a5b3aff838 user: olr tags: fr, trunk
08:50
[core] sentence splitting: code clarification check-in: 311ccab788 user: olr tags: core, trunk
07:42
[core] move getSentenceBoundaries from gc_engine to text module check-in: bf0a1bdd5d user: olr tags: core, trunk
Changes

Changes to gc_core/js/text.js.

     5      5   /* global require, exports, console */
     6      6   
     7      7   "use strict";
     8      8   
     9      9   
    10     10   var text = {
    11     11   
    12         -    _zEndOfSentence: new RegExp ('([.?!:;…][   .?!…»«“”"‘’)–—]+(?=[A-ZÉÈÎÔ])|.$)', "g"),
    13         -    _zBeginOfParagraph: new RegExp ("^[-  –—.,;?!…]*", "ig"),
           12  +    _zEndOfSentence: new RegExp ('[.?!:;…][   .?!…»«“”"‘’)–—]+(?=[A-ZÉÈÎÔ])', "g"),
    14     13   
    15     14       getSentenceBoundaries: function* (sText) {
    16     15           // generator: returns start and end of sentences found in <sText>
    17         -        let mBeginOfSentence = this._zBeginOfParagraph.exec(sText);
    18         -        let iStart = this._zBeginOfParagraph.lastIndex;
           16  +        let iStart = 0;
    19     17           let m;
    20     18           while ((m = this._zEndOfSentence.exec(sText)) !== null) {
    21     19               yield [iStart, this._zEndOfSentence.lastIndex];
    22     20               iStart = this._zEndOfSentence.lastIndex;
    23     21           }
           22  +        yield [iStart, sText.length];
    24     23       },
    25     24   
    26     25       getParagraph: function* (sText, sSepParagraph = "\n") {
    27     26           // generator: returns paragraphs of text
    28     27           let iStart = 0;
    29     28           let iEnd = 0;
    30     29           sText = sText.replace("\r\n", "\n").replace("\r", "\n");

Changes to gc_core/py/text.py.

     5      5   """
     6      6   
     7      7   import re
     8      8   import textwrap
     9      9   from itertools import chain
    10     10   
    11     11   
    12         -_zEndOfSentence = re.compile(r'([.?!:;…]\W+(?=[A-ZÉÈÎÔ])|.$)')
    13         -_zBeginOfParagraph = re.compile(r"^\W*")
           12  +_zEndOfSentence = re.compile(r'[.?!:;…]\W+(?=[A-ZÉÈÎÔ])')
    14     13   
    15     14   def getSentenceBoundaries (sText):
    16     15       "generator: returns start and end of sentences found in <sText>"
    17         -    iStart = _zBeginOfParagraph.match(sText).end()
           16  +    iStart = 0
    18     17       for m in _zEndOfSentence.finditer(sText):
    19     18           yield (iStart, m.end())
    20     19           iStart = m.end()
           20  +    yield (iStart, len(sText))
    21     21   
    22     22   
    23     23   def getParagraph (sText):
    24     24       "generator: returns paragraphs of text"
    25     25       iStart = 0
    26     26       sText = sText.replace("\r\n", "\n").replace("\r", "\n")
    27     27       iEnd = sText.find("\n", iStart)

Changes to gc_lang/fr/rules.grx.

  1590   1590   
  1591   1591   !!!
  1592   1592   !!!
  1593   1593   !!! Processeur: épuration des signes inutiles et quelques simplifications                         !!
  1594   1594   !!!
  1595   1595   !!!
  1596   1596   
         1597  +# début de phrase
         1598  +__<s>(p_début_de_phrase)__      ^[ .?!:;–—•·… »«‘’“”\"'¿¡-]+  <<- ~>> *
         1599  +
  1597   1600   # fin de phrase
  1598         -__<s>(p_fin_de_phrase)__        [.?!:;…][ .?!… »”")]*$  <<- ~>> *
         1601  +__<s>(p_fin_de_phrase)__        [ .?!:;–—•·… »«‘’“”\"'¿¡-]+$  <<- ~>> *
  1599   1602   
  1600   1603   # Guillemets et exposants
  1601         -__<s>(p_guillemets_exposants)__ [«»“”"„`¹²³⁴⁵⁶⁷⁸⁹⁰]+ <<- ~>> *
         1604  +__<s>(p_guillemets_exposants)__ [«»“”\"„`¹²³⁴⁵⁶⁷⁸⁹⁰]+ <<- ~>> *
  1602   1605   
  1603   1606   # Chapitres et références
  1604   1607   __<s>(p_chapitre_référence)__   [\[({][\dIVXLCDM]+, \d+[\])}]   <js>[\[\(\{][\dIVXLCDM]+, \d+[\]\)\}]</js>   <<- ~>> *
  1605   1608   
  1606   1609   # le, la ou les chose(s)
  1607   1610   __[i>(p_le_ou_les)__            l[ea] ou les {w_2}([(]s[)]) @@$ <<- ~1>> s
  1608   1611   __[i](p_le_ou_la)__             l(e ou la|a ou le) {w_2} @@1 <<- ~1>> ’