Grammalecte  Check-in [44cbaf2b3e]

Overview
Comment: [fr] genfrdic.py: remove deprecated code
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | fr
Files: files | file ages | folders
SHA3-256: 44cbaf2b3e9a179bdd0044956a7fd9914e230073e65d4463a303934274173363
User & Date: olr on 2020-05-31 07:56:38
Other Links: manifest | tags
Context
2020-05-31
07:57
[fr] gendicfr.py: rename packages check-in: 22198f1f33 user: olr tags: fr, trunk
07:56
[fr] gendicfr.py: remove deprecated code check-in: 44cbaf2b3e user: olr tags: fr, trunk
2020-05-30
18:14
[fr] màj dictionnaire check-in: 5a86f4c00d user: olr tags: fr, trunk
Changes

Modified gc_lang/fr/dictionnaire/genfrdic.py from [29dc50eb92] to [186d4c2e6e].

41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
...
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
....
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
....
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549

# Dictionaries
# dSUBDIC: one-character sub-dictionary code -> sub-dictionary name
# (the 'subDicts' strings of the dictionary descriptions are made of these codes)
dSUBDIC = {
    '*': 'Commun',
    'R': 'Réforme1990',
    'M': 'Moderne',
    'C': 'Classique',
    'A': 'Annexe',
    'P': 'Multimots',
    'X': 'Contributeurs',
}

# Description of the “Moderne” dictionary: display names, ASCII identifiers,
# and the sub-dictionary codes ('subDicts') it is built from.
dMODERNE = {
    'name': 'DICTIONNAIRE ORTHOGRAPHIQUE FRANÇAIS “MODERNE”',
    'shortname': '“Moderne”',
    'asciiName': 'fr-moderne',
    'mozAsciiName': 'fr-FR-modern',
    'subDicts': '*MX',
    'mozId': 'fr-dicollecte-moderne',
    'description': "Dictionnaire français “Moderne”",
}

# Description of the “Classique” dictionary: display names, ASCII identifiers,
# and the sub-dictionary codes ('subDicts') it is built from.
dCLASSIQUE = {
    'name': 'DICTIONNAIRE ORTHOGRAPHIQUE FRANÇAIS “CLASSIQUE”',
    'shortname': '“Classique”',
    'asciiName': 'fr-classique',
    'mozAsciiName': 'fr-FR-classic',
    'subDicts': '*MCX',
    'mozId': 'fr-dicollecte-classique',
    'description': "Dictionnaire français “Classique”",
}
................................................................................
        with open(spBuild+'/dictDecl.txt', 'w', encoding='utf-8', newline="\n") as hDst:
            for oEntry in self.lEntry:
                if re.match("[SXFWIA]", oEntry.flags) and (oEntry.po.startswith("nom") or oEntry.po.startswith("adj")):
                    hDst.write(oEntry.getDeclination())
        if spDestGL:
            echo("   Fichier de déclinaison copié dans Grammalecte...")
            file_util.copy_file(spBuild+'/dictDecl.txt', spDestGL)

    def generateSpellVariants (self, nReq, spBuild):
        """Write the flexions and their deletion variants to a text file.

        nReq is the requested maximum number of deleted characters; it is
        clamped to the range 1..2. The file <spBuild>/dictSpellVariants-<nReq>.txt
        receives, for every distinct item of self.lFlexions, the flexion itself
        followed by one line per variant returned by _generateDeleteVariants.
        """
        nReq = min(max(nReq, 1), 2)
        echo(" * Lexique >> variantes par suppression... n = " + str(nReq))
        spfDst = spBuild + '/dictSpellVariants-' + str(nReq) + '.txt'
        with open(spfDst, 'w', encoding='utf-8', newline="\n") as hDst:
            for oFlex in frozenset(self.lFlexions):
                sFlexion = oFlex.sFlexion
                hDst.write(sFlexion + "\t_\t_\n")
                # the shorter the word, the fewer characters may be deleted
                nLen = len(sFlexion)
                if nLen <= 2:
                    nDelete = 0
                elif nLen <= 5:
                    nDelete = 1
                else:
                    nDelete = nReq
                for sVariant, sCode in self._generateDeleteVariants(sFlexion, sFlexion, nDelete):
                    # every line has two code columns: pad with "_" when the code has only one
                    if "\t" not in sCode:
                        sCode = sCode + "\t_"
                    hDst.write(sVariant + "\t" + sCode + "\n")

    _lTupPhonet = [ ("ph", "f"), ("qu", "k"), ("ss", "c"), ("ss", "ç"), ("ct", "x"),
        ("oe", "œ"), ("ae", "æ"), ("ei", "é"), ("ai", "é"), ("au", "o"), ("eau", "o"),
    ]

    def _generatePhonetVariants (self, s):
        l = []
        for torep, rep in self._lTupPhonet():
            for m in torep.finditer(s):
                l.append( (s[:m.start(0)] + rep + s[m.end(0):], str(m.start(0))+":"+str(m.start(0)+len(rep))+">"+torep) )
        return l

    def _generateDeleteVariants (self, sWord0, sWordCur, n):
        "renvoie une liste de tuples : (forme dégradée de sWord, code de genèse de sWord)"
        # caution: recursive function
        if n == 0:
            return []
        lTup = []
        for i in range(len(sWordCur)):
            sNew = sWordCur[0:i]+sWordCur[i+1:]
            lTup.append( ( sNew, self._generateAddCode(sWord0, sNew) ) )
            lTup += self._generateDeleteVariants(sWord0, sNew, n-1)
        return lTup

    def _generateAddCode (self, sWord, sCrippled):
        "returns addCode to generate sWord from sCrippled"
        sAdd = ""
        for i in range(len(sWord)):
            if sWord[i] != sCrippled[i:i+1]:
                sCrippled = sCrippled[:i] + sWord[i] + sCrippled[i:]
                if sAdd:
                    sAdd += "\t"
                sAdd += str(i)+"+"+sWord[i]
        return sAdd  if sAdd  else "0"



class Entree:
    def __init__ (self, sLine):
        self.lemma = ''
        self.flags = ''
        # champs morphologiques Hunspell
................................................................................

def main ():
    xParser = argparse.ArgumentParser()
    xParser.add_argument("-v", "--verdic", help="set dictionary version, i.e. 5.4", type=str, default="X.Y.z")
    xParser.add_argument("-m", "--mode", help="0: no tags,  1: Hunspell tags (default),  2: All tags", type=int, choices=[0, 1, 2], default=1)
    xParser.add_argument("-u", "--uncompress", help="do not use Hunspell compression", action="store_true")
    xParser.add_argument("-s", "--simplify", help="no virtual lemmas", action="store_true")
    xParser.add_argument("-sv", "--spellvariants", help="generate spell variants", action="store_true")
    xParser.add_argument("-gl", "--grammalecte", help="copy generated files to Grammalecte folders", action="store_true")
    xArgs = xParser.parse_args()

    if xArgs.simplify:
        xArgs.mode = 0
        xArgs.uncompress = True

................................................................................

    ### Lexique
    oFrenchDict.generateFlexions()
    oFrenchDict.calcMetagraphe()
    oFrenchDict.calcMetaphone2()

    #oFrenchDict.createNgrams(spBuild, 3)
    if xArgs.spellvariants:
        oFrenchDict.generateSpellVariants(1, spBuild)

    ### Statistiques
    spfStats = spBuild+'/'+STATS_NAME+xArgs.verdic+'.txt'
    oStatsLex = StatsLex(oFrenchDict)
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_google_ngram_1.txt', 'G', 'Google 1-grams')
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_frwiki.txt', 'W', 'Wikipédia')
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_frwikisource.txt', 'S', 'Wikisource')







<


<
<
<
<
<
<
<
<







 







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







 







<







 







<
<







41
42
43
44
45
46
47

48
49








50
51
52
53
54
55
56
...
634
635
636
637
638
639
640






















































641
642
643
644
645
646
647
....
1437
1438
1439
1440
1441
1442
1443

1444
1445
1446
1447
1448
1449
1450
....
1470
1471
1472
1473
1474
1475
1476


1477
1478
1479
1480
1481
1482
1483

# Dictionaries
# dSUBDIC: one-character sub-dictionary code -> sub-dictionary name
# (the 'subDicts' strings of the dictionary descriptions are made of these codes)
dSUBDIC = {
    '*': 'Commun',
    'R': 'Réforme1990',
    'M': 'Moderne',
    'C': 'Classique',
    'A': 'Annexe',
    'X': 'Contributeurs',
}









# Description of the “Classique” dictionary: display names, ASCII identifiers,
# and the sub-dictionary codes ('subDicts') it is built from.
dCLASSIQUE = {
    'name': 'DICTIONNAIRE ORTHOGRAPHIQUE FRANÇAIS “CLASSIQUE”',
    'shortname': '“Classique”',
    'asciiName': 'fr-classique',
    'mozAsciiName': 'fr-FR-classic',
    'subDicts': '*MCX',
    'mozId': 'fr-dicollecte-classique',
    'description': "Dictionnaire français “Classique”",
}
................................................................................
        with open(spBuild+'/dictDecl.txt', 'w', encoding='utf-8', newline="\n") as hDst:
            for oEntry in self.lEntry:
                if re.match("[SXFWIA]", oEntry.flags) and (oEntry.po.startswith("nom") or oEntry.po.startswith("adj")):
                    hDst.write(oEntry.getDeclination())
        if spDestGL:
            echo("   Fichier de déclinaison copié dans Grammalecte...")
            file_util.copy_file(spBuild+'/dictDecl.txt', spDestGL)
























































class Entree:
    def __init__ (self, sLine):
        self.lemma = ''
        self.flags = ''
        # champs morphologiques Hunspell
................................................................................

def main ():
    xParser = argparse.ArgumentParser()
    xParser.add_argument("-v", "--verdic", help="set dictionary version, i.e. 5.4", type=str, default="X.Y.z")
    xParser.add_argument("-m", "--mode", help="0: no tags,  1: Hunspell tags (default),  2: All tags", type=int, choices=[0, 1, 2], default=1)
    xParser.add_argument("-u", "--uncompress", help="do not use Hunspell compression", action="store_true")
    xParser.add_argument("-s", "--simplify", help="no virtual lemmas", action="store_true")

    xParser.add_argument("-gl", "--grammalecte", help="copy generated files to Grammalecte folders", action="store_true")
    xArgs = xParser.parse_args()

    if xArgs.simplify:
        xArgs.mode = 0
        xArgs.uncompress = True

................................................................................

    ### Lexique
    oFrenchDict.generateFlexions()
    oFrenchDict.calcMetagraphe()
    oFrenchDict.calcMetaphone2()

    #oFrenchDict.createNgrams(spBuild, 3)



    ### Statistiques
    spfStats = spBuild+'/'+STATS_NAME+xArgs.verdic+'.txt'
    oStatsLex = StatsLex(oFrenchDict)
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_google_ngram_1.txt', 'G', 'Google 1-grams')
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_frwiki.txt', 'W', 'Wikipédia')
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_frwikisource.txt', 'S', 'Wikisource')