Grammalecte  Check-in [656e8f3fad]

Overview
Comment:[fr] gendicfr: apostrophe typographique remplacée par apostrophe droite pour les dictionnaires Hunspell
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | fr | rg
Files: files | file ages | folders
SHA3-256: 656e8f3fade04639ca7b3d4568ca2c653e49ebfdaae40b7135d8d6ef811ce973
User & Date: olr on 2018-08-26 09:17:00
Other Links: branch diff | manifest | tags
Context
2018-08-26
09:37
[fr] gendicfr: entrées avec espace non incluses dans les dictionnaires Hunspell check-in: 85dde78b00 user: olr tags: fr, rg
09:17
[fr] gendicfr: apostrophe typographique remplacée par apostrophe droite pour les dictionnaires Hunspell check-in: 656e8f3fad user: olr tags: fr, rg
08:57
[fr] conversion: clarification et corrections de bugs check-in: a12aee51d6 user: olr tags: fr, rg
Changes

Modified gc_lang/fr/dictionnaire/genfrdic.py from [21ee33ebdc] to [1e7a4b144b].

21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
...
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
...
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
...
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
...
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
...
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
...
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
...
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
...
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
...
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
...
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
...
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
...
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
...
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
...
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
...
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
....
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
....
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
....
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
....
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
....
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
....
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
....
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
....
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
....
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
....
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
....
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563

import metagraphe
import metaphone2


# Dictionnaire des caractères pour le tri naturel.
# Ordre souhaitable, mais pose problème pour la recherche, car engendre des égalités de lemmes différents.
# Il faut donc travailler sur un dictionnaire trié *numériquement* et le sauvegarder selon le tri *naturel*             
CHARMAP = str.maketrans({ 'à': 'a',  'À': 'A',  'â': 'a',  'Â': 'A',  'ä': 'a',  'Ä': 'A',  'å': 'a',  'Å': 'A',  'ā': 'a',  'Ā': 'A',
                          'ç': 'c',  'Ç': 'C',
                          'é': 'e',  'É': 'E',  'è': 'e',  'È': 'E',  'ê': 'e',  'Ê': 'E',  'ë': 'e',  'Ë': 'E',  'ē': 'e',  'Ē': 'E',
                          'î': 'i',  'Î': 'I',  'ï': 'i',  'Ï': 'I',  'ī': 'i',  'Ī': 'I',
                          'ñ': 'n',
                          'ô': 'o',  'Ô': 'O',  'ö': 'o',  'Ö': 'O',  'ō': 'o',  'Ō': 'O',
                          'ù': 'u',  'Ù': 'U',  'û': 'u',  'Û': 'U',  'ü': 'u',  'Ü': 'U',  'ū': 'u',  'Ū': 'U',
................................................................................
        # Affixes
        self.sSettings = '' # enregistre tout avant la ligne # END
        self.dFlags = collections.OrderedDict()
        self.bShortenTags = False
        self.dAM = collections.OrderedDict() # étiquettes morphologiques
        self.dAF = collections.OrderedDict() # étiquettes drapeaux
        # Flexions
        self.lFlexions = []           # liste des flexions avec lemme, morphologie et occurrences 
        self.lStatsLex = []
        self.nTotOccurRecognizedWords = 0
        self.aFlexions = None
    
    def readDictionary (self, spf):
        "Lecture du dictionnaire"
        echo('Dictionnaire << [ {} ]'.format(spf), end=' ')
        for sLine in readfile(spf):
            sLine = sLine.strip()
            if not sLine.isdigit() and not sLine.startswith("#"):
                self.lEntry.append(Entree(sLine))
................................................................................
                dAF[oEntry.flags] = dAF.get(oEntry.flags, 0) + 1
            sMorph = oEntry.getMorph(nMode).strip()
            if sMorph:
                dAM[sMorph] = dAM.get(sMorph, 0) + 1

        lAF = sorted(dAF.items(), key = lambda x: (x[1], x[0]), reverse=True)
        lAM = sorted(dAM.items(), key = lambda x: (x[1], x[0]), reverse=True)
        
        with open(spDst, 'a', encoding='utf-8', newline="\n") as hDst:
            hDst.write("\n\nDrapeaux :\n")
            for nAF, elem in enumerate(lAF, 1):
                self.dAF[elem[0]] = str(nAF)
                hDst.write("  > {0[1]:>8} : {0[0]}\n".format(elem))
            hDst.write("\n\nMorphologies :\n")
            for nAM, elem in enumerate(lAM, 1):
................................................................................
            if oEntry.di in dTplVars['subDicts']:
                nEntry += 1
        with open(spDst+'/'+dTplVars['asciiName']+'.dic', 'w', encoding='utf-8', newline="\n") as hDst:
            hDst.write(str(nEntry)+"\n")
            for oEntry in self.lEntry:
                if oEntry.di in dTplVars['subDicts']:
                    hDst.write(oEntry.getEntryLine(self, nMode, bSimplified))
    
    def writeAffixes (self, spDst, dTplVars, nMode, bSimplified):
        "Écrire le fichier des affixes (.aff)"
        echo(' * Dictionnaire >> [ {}.aff ]'.format(dTplVars['asciiName']))
        info = "# This Source Code Form is subject to the terms of the Mozilla Public\n" + \
               "# License, v. 2.0. If a copy of the MPL was not distributed with this\n" + \
               "# file, You can obtain one at http://mozilla.org/MPL/2.0/.\n\n" + \
               "# AFFIXES DU {} v{}\n".format(dTplVars['name'], self.sVersion) + \
               "# par Olivier R. -- licence MPL 2.0\n" + \
               "# Généré le " + time.strftime("%d-%m-%Y à %H:%M") + "\n" \
               "# Pour améliorer le dictionnaire, allez sur http://www.dicollecte.org/\n\n"
               
        with open(spDst+'/'+dTplVars['asciiName']+'.aff', 'w', encoding='utf-8', newline="\n") as hDst:
            hDst.write(info)
            hDst.write(self.sSettings + "\n")
            if self.bShortenTags:
                hDst.write("AM {}\n".format(len(self.dAM)))
                for item in self.dAM.items():
                    hDst.write("AM {}\n".format(item[0]))
................................................................................

    def sortEntriesNatural (self):
        "Replace self.lEntry with a new list sorted with Entree.keyTriNat as key."
        echo(' * Dictionnaire - Tri naturel des entrées...')
        lSorted = list(self.lEntry)
        lSorted.sort(key=Entree.keyTriNat)
        self.lEntry = lSorted

    def sortEntriesNumerical (self):
        "Replace self.lEntry with a new list sorted with Entree.keyTriNum as key."
        echo(' * Dictionnaire - Tri numérique des entrées...')
        lSorted = list(self.lEntry)
        lSorted.sort(key=Entree.keyTriNum)
        self.lEntry = lSorted

    def sortLexiconByFlexion (self):
        "Replace self.lFlexions with a new list sorted with Flexion.keyFlexion as key."
        echo(' * Dictionnaire - tri du lexique (par flexion)...')
        lSorted = list(self.lFlexions)
        lSorted.sort(key=Flexion.keyFlexion)
        self.lFlexions = lSorted

    def sortLexiconByFreq (self):
        echo(' * Dictionnaire - tri du lexique (par fréquence)...')
................................................................................
                d[oFlex.sFlexion] = [oFlex.oEntry]
        for oFlex in self.lFlexions:
            oFlex.lMulti = list(d[oFlex.sFlexion])
            oFlex.nMulti = len(oFlex.lMulti)
        for oFlex in self.lFlexions:
            oFlex.lMulti.remove(oFlex.oEntry)
            oFlex.nMulti -= 1
        
    def setTagsFrom (self, other):
        """Copy tags from the entries of <other> onto the matching entries of this dictionary.

        Two entries match when both their lemma and their flags are equal.
        For each of the first self.nEntry entries of self.lEntry, the entry's
        setTagsFrom() is called once per matching entry of other.lEntry,
        in the order of other.lEntry.
        """
        echo(' * Dictionnaire - copie des tags...')
        # Index the other dictionary once by (lemma, flags) instead of scanning
        # it again for every entry: one pass over each list rather than a
        # full cross-comparison of both lists.
        dOther = {}
        for oEntry in other.lEntry:
            dOther.setdefault((oEntry.lemma, oEntry.flags), []).append(oEntry)
        # The loop bound stays self.nEntry (not len(self.lEntry)) on purpose.
        for i in range(self.nEntry):
            oMine = self.lEntry[i]
            # NOTE(review): assumes setTagsFrom() on an entry does not modify
            # its lemma or flags while tags are being copied -- confirm.
            for oEntry in dOther.get((oMine.lemma, oMine.flags), ()):
                oMine.setTagsFrom(oEntry)

................................................................................
            hDst.write(oStatsLex.getInfo())
            for oFlex in self.lFlexions:
                oFlex.setOccur(oStatsLex.getFlexionOccur(oFlex.sFlexion))
            self.nTotOccurRecognizedWords = 0
            for oFlex in self.lFlexions:
                oFlex.calcOccur()
                self.nTotOccurRecognizedWords += oFlex.nOccur
            
            # Report des occurrences
            echo("   report des occurrences des formes fléchies multiples...")
            hDst.write("Report des occurrences des formes fléchies multiples :\n")
            hDst.write("  Légende :\n")
            hDst.write("    >>   le nombre d’occurrences de la flexion est ramené à la moyenne.\n")
            hDst.write("    +>   le nombre d’occurrences de la flexion est augmenté avec le surplus d’occurrences des flexions ramenées à la moyenne.\n")
            hDst.write("    %>   le nombre d’occurrences de la flexion est pondéré avec le poids de la moyenne de l’entrée.\n\n")

            for oEntry in self.lEntry:
                oEntry.calcOccurFromFlexions()
                oEntry.calcAverageKnownOccurrence()
                oEntry.solveOccurMultipleFlexions(hDst, oStatsLex)
                oEntry.calcOccurFromFlexions()
            
            # Fréquences
            echo("   calcul des fréquences et indices de fréquence...")
            for oFlex in self.lFlexions:
                oFlex.calcFreq(self.nTotOccurRecognizedWords)
            for oEntry in self.lEntry:
                oEntry.calcFreq(self.nTotOccurRecognizedWords)
            
            # Entrées, statistiques
            echo("   statistiques...")
            hDst.write("\n\nNatures grammaticales :\n")
            d = {}
            for oEntry in self.lEntry:
                po = re.sub("(?<=v[0-3])[itnpqrmaezx_]+", "", oEntry.po)
                d[po] = d.get(po, 0) + 1
            for e in sorted(d.items(), key = lambda x: (x[1], x[0]), reverse=True):
                hDst.write(" * {0[1]:<15} : {0[0]}\n".format(e))
            
            hDst.write("\n\nVentilation des entrées par indice de fréquence :\n")
            d1 = {}
            d2 = {}
            for oEntry in self.lEntry:
                d1[oEntry.fq] = d1.get(oEntry.fq, 0) + 1
                d2[oEntry.fq] = d2.get(oEntry.fq, 0) + oEntry.fFreq
            for k in sorted(d1.keys()):
                hDst.write(" * {} : {} entrées ({:.2f} %)  → {:.9f} %\n".format(k, d1[k], (d1[k]*100)/self.nEntry, d2[k]))
                    
            hDst.write("\n\nRépartition des entrées par sous-dictionnaire :\n")
            d = {}
            for oEntry in self.lEntry:
                d[oEntry.di] = d.get(oEntry.di, 0) + 1
            for sKey, nVal in d.items():
                hDst.write(" * {0:<15} : {1} entrées ({2:.2f} %)\n".format(dSUBDIC[sKey], nVal, (nVal*100)/self.nEntry))
            
            # Occurrences des lettres
            echo("   occurrences des lettres...")
            d = {}
            for oFlex in self.lFlexions:
                for c in oFlex.sFlexion:
                    d[c] = d.get(c, 0) + oFlex.nOccur
            nTot = 0
................................................................................
            hDst.write("\n\nNombre de formes fléchies : {}\n".format(len(self.lFlexions)))
            hDst.write("\n\nNombre de graphies : {}\n".format(len(self.aFlexions)))

    def calcMetagraphe (self):
        "Ask every flexion of self.lFlexions to compute its metagraphe."
        echo(" * Lexique - Metagraphe")
        for oFlexion in self.lFlexions:
            oFlexion.calcMetagraphe()
    
    def calcMetaphone2 (self):
        "Ask every flexion of self.lFlexions to compute its metaphone 2 value."
        echo(" * Lexique - Metaphone 2")
        for oFlexion in self.lFlexions:
            oFlexion.calcMetaphone2()
    
    def createNgrams (self, spDest, n):
        echo(" * Lexique - Ngrams " + str(n))
        if n < 2:
            echo("erreur: n = " + str(n))
            return
        dOccur = {} # ngram:n
        dRefW = {} # ngram:set(idx)
................................................................................
        file_util.copy_file('_templates/ooo/french_flag.png', spExt)
        file_util.copy_file('_templates/ooo/french_flag_16.bmp', spExt+'/ui')
        copyTemplate('_templates/ooo', spExt, 'description.xml', dTplVars)
        copyTemplate('_templates/ooo', spExt, 'dictionaries.xcu', dTplVars)
        #file_util.copy_file('_templates/ooo/dictionaries.xcu.tpl.xml', spExt)
        copyTemplate('_templates/ooo', spExt, 'package-description.txt', dTplVars)
        for dVars in lDictVars:
            dicPath = spBuild + '/' + PREFIX_DICT_PATH + self.sVersion 
            file_util.copy_file(dicPath+'/'+dVars['asciiName']+'.dic', spExt+'/dictionaries/'+dVars['asciiName']+'.dic')
            file_util.copy_file(dicPath+'/'+dVars['asciiName']+'.aff', spExt+'/dictionaries/'+dVars['asciiName']+'.aff')
        copyTemplate('orthographe', spExt+'/dictionaries', 'README_dict_fr.txt', dTplVars)
        # thesaurus
        file_util.copy_file('thesaurus/thes_fr.dat', spExt+'/dictionaries')
        file_util.copy_file('thesaurus/thes_fr.idx', spExt+'/dictionaries')
        file_util.copy_file('thesaurus/README_thes_fr.txt', spExt+'/dictionaries')
................................................................................
        file_util.copy_file('césures/README_hyph_fr-2.9.txt', spExt+'/dictionaries')
        # zip
        createZipFiles(spExt, spBuild, sExtensionName + '.oxt')
        # copy to Grammalecte Project
        if spDestGL:
            echo("   extension copiée dans Grammalecte...")
            dir_util.copy_tree(spExt+'/dictionaries', spDestGL)
    
    def createMozillaExtensions (self, spBuild, dTplVars, lDictVars, spDestGL=""):
        # Mozilla extension 1
        echo(" * Dictionnaire >> extension pour Mozilla")
        dTplVars['version'] = self.sVersion
        sExtensionName = EXT_PREFIX_MOZ + self.sVersion
        spExt = spBuild + '/' + sExtensionName
        dir_util.mkpath(spExt+'/dictionaries')
................................................................................
        createZipFiles(spExt, spBuild, sExtensionName + '.xpi')
        # Grammalecte
        if spDestGL:
            echo(" * Dictionnaire >> copie des dicos dans Grammalecte")
            for dVars in lDictVars:
                file_util.copy_file(spDict+'/'+dVars['asciiName']+'.dic', spDestGL+'/'+dVars['mozAsciiName']+"/"+dVars['mozAsciiName']+'.dic')
                file_util.copy_file(spDict+'/'+dVars['asciiName']+'.aff', spDestGL+'/'+dVars['mozAsciiName']+"/"+dVars['mozAsciiName']+'.aff')
    
    def createFileIfqForDB (self, spBuild):
        """Write the entries whose frequency index changed into two files in <spBuild>.

        dictIdxIfq-<version>.diff.txt  receives one "<iD> TAB <fq>" line per changed entry.
        dictIdxIfq-<version>.notes.txt receives one "<lemma>/<flags> TAB <oldFq> > <fq>" line per changed entry.
        An entry is considered changed when its fq differs from its oldFq.
        """
        echo(" * Dictionnaire >> indices de fréquence pour la DB...")
        spfBase = spBuild+'/dictIdxIfq-'+self.sVersion
        with open(spfBase+'.diff.txt', 'w', encoding='utf-8', newline="\n") as hDiff, \
             open(spfBase+'.notes.txt', 'w', encoding='utf-8', newline="\n") as hNotes:
            for oEntry in self.lEntry:
                if oEntry.fq == oEntry.oldFq:
                    # frequency index unchanged: nothing to report
                    continue
                hDiff.write("{0.iD}\t{0.fq}\n".format(oEntry))
                hNotes.write("{0.lemma}/{0.flags}\t{0.oldFq} > {0.fq}\n".format(oEntry))
        
    def createLexiconPackages (self, spBuild, version, oStatsLex, spDestGL=""):
        sLexName = LEX_PREFIX + version
        spLex = spBuild + '/' + sLexName
        dir_util.mkpath(spLex)
        # write Dicollecte lexicon
        self.sortLexiconByFreq()
        self.writeLexicon(spLex + '/' + sLexName + '.txt', version, oStatsLex)
................................................................................
        self.nFlexions = 0
        self.lFlexions = []
        self.sRadical = ''
        self.nOccur = 0
        self.nAKO = -1   # Average known occurrences
        self.fFreq = 0
        self.oldFq = ''
        
        sLine = sLine.rstrip(" \n")
        # commentaire
        if '#' in sLine:
            sLine, comment = sLine.split('#', 1)
            self.comment = comment.strip()
        # éléments de la ligne
        elems = sLine.split()
................................................................................
                else:
                    echo('  ## Champ inconnu: {}  dans  {}/{}'.format(fields[0], self.lemma, self.flags))
            else:
                self.err = self.err + elems[i]
        if self.err:
            echo("\n## Erreur dans le dictionnaire : {}".format(self.err))
            echo("   dans : " + self.lemma)
                
    def __str__ (self):
        "Return '<lemma>/<flags> ' followed by the morphology returned by getMorph(2)."
        sMorph = self.getMorph(2)
        return "{0.lemma}/{0.flags} {1}".format(self, sMorph)

    def check (self):
        sErr = ''
        if self.lemma == '':
            sErr += 'lemme vide'
................................................................................

    def keyTriNat (self):
        "Sort key: (lemma translated through CHARMAP, flags, po)."
        sLemmaKey = self.lemma.translate(CHARMAP)
        return (sLemmaKey, self.flags, self.po)

    def keyTriNum (self):
        "Sort key: (lemma, flags, po), with the lemma left untranslated."
        sLemma, sFlags, sPo = self.lemma, self.flags, self.po
        return (sLemma, sFlags, sPo)

    def getEntryLine (self, oDict, nMode, bSimplified=False):    
        sLine = self.lemma
        if self.flags:
            sLine += '/'
            sLine += self.flags  if not oDict.bShortenTags or bSimplified  else oDict.dAF[self.flags]
        if bSimplified:
            return sLine.replace("()", "") + "\n"
        if nMode > 0:
            sMorph = self.getMorph(nMode)
................................................................................
                    #echo(sFlex + " " + sMorph + ", ")
                    pass
        # Drapeaux dont le lemme féminin doit être remplacé par le masculin dans la gestion des formes fléchies
        if self.flags.startswith(("F.", "F*", "W.", "W*")):
            # recherche de la forme masculine
            for t in lTuples:
                sMorph = self.clean(t[1])
                if sMorph.endswith('mas') or sMorph.endswith('mas sg') or sMorph.endswith('mas inv'): 
                    self.sRadical = t[0]
        else:
            self.sRadical = self.lemma
        # Tag duplicates
        d = {}
        for oFlex in self.lFlexions:
            d[oFlex.sFlexion] = d.get(oFlex.sFlexion, 0) + 1
................................................................................
                                        lFlexions.append( (oRule.add+flex[0], flex[1]+ruleMorph) )
                                else:
                                    lFlexions.append(flexion)
                            else:
                                flexion = (self.lemma.replace(oRule.cut, oRule.add, 1), ruleMorph+morph, oRule.di)
                                if oFlag.bMix:
                                    lFlexPrefix.append(flexion)
                                    for flex in lFlexSuffix: 
                                        lFlexions.append( (flex[0].replace(oRule.cut, oRule.add, 1), flex[1]+ruleMorph) )
                                else:
                                    lFlexions.append(flexion)
                            if oRule.flags != '' and oRule.flags != '**':
                                lFlexions.extend(Entree(flexion[0]+'/'+oRule.flags)._flechir(dFlags, flexion[1], iPR+1))
                else:
                    # cas des suffixes
................................................................................
    def calcOccurFromFlexions (self):
        "Set self.nOccur to the sum of nOccur over self.lFlexions (0 when the list is empty)."
        self.nOccur = sum(oFlex.nOccur  for oFlex in self.lFlexions)

    def calcAverageKnownOccurrence (self):
        """Compute self.nAKO, the average number of known occurrences.

        Only the flexions that have no equivalent in other entries
        (nMulti == 0) are taken into account. The average is rounded up;
        when there is no such flexion, nAKO is set to -1.
        """
        lKnownOccur = [ oFlex.nOccur  for oFlex in self.lFlexions  if oFlex.nMulti == 0 ]
        if lKnownOccur:
            self.nAKO = math.ceil(sum(lKnownOccur) / len(lKnownOccur))
        else:
            self.nAKO = -1
    
    def solveOccurMultipleFlexions (self, hDst, oStatsLex):
        sBlank = "           "
        if self.nAKO >= 0:
            for oFlex in self.lFlexions:
                if oFlex.nMulti > 0 and not oFlex.bBlocked:
                    # on trie les entrées avec AKO et sans AKO
                    lEntWithAKO = []
                    lEntNoAKO = []
                    for oEntry in oFlex.lMulti:
                        if oEntry.nAKO >= 0:
                            lEntWithAKO.append(oEntry)
                        else:
                            lEntNoAKO.append(oEntry)
                    
                    if lEntNoAKO:
                        # on calcule la différence totale occasionnée par du passage des flexions appartenant à des entrées avec AKO au niveau AKO
                        nDiff = (oFlex.nOccur - self.nAKO) * oFlex.nDup
                        for oEntry in lEntWithAKO:
                            for oFlexM in oEntry.lFlexions:
                                if oFlex.sFlexion == oFlexM.sFlexion:
                                    nDiff += oFlexM.nOccur - oEntry.nAKO
................................................................................
                                        oFlexM.setOccurAndBlock(nNewOccur)
                    else:
                        # Toutes les entrées sont avec AKO : on pondère
                        nFlexOccur = oStatsLex.getFlexionOccur(oFlex.sFlexion)
                        nTotAKO = self.nAKO
                        for oEnt in oFlex.lMulti:
                            nTotAKO += oEnt.nAKO
                        
                        hDst.write(" = {0.sFlexion}\n".format(oFlex))
                        hDst.write("       moyennes connues\n")
                        for oFlexD in self.lFlexions:
                            if oFlex.sFlexion == oFlexD.sFlexion:
                                nNewOccur = math.ceil((nFlexOccur * (self.nAKO / nTotAKO)) / oFlexD.nDup)  if nTotAKO  else 0
                                hDst.write(sBlank + "{2:<30} {0.sMorph:<30}  {0.nOccur:>10}  %> {1:>10}\n".format(oFlexD, nNewOccur, self.getShortDescr()))
                                oFlexD.setOccurAndBlock(nNewOccur)
                        for oEntry in oFlex.lMulti:
                            for oFlexM in oEntry.lFlexions:
                                if oFlex.sFlexion == oFlexM.sFlexion:
                                    nNewOccur = math.ceil((nFlexOccur * (oEntry.nAKO / nTotAKO)) / oFlexM.nDup)  if nTotAKO  else 0
                                    hDst.write(sBlank + "{2:<30} {0.sMorph:<30}  {0.nOccur:>10}  %> {1:>10}\n".format(oFlexM, nNewOccur, oEntry.getShortDescr()))
                                    oFlexM.setOccurAndBlock(nNewOccur)
        
    def calcFreq (self, nTot):
        """Compute the frequency of the entry and its frequency index.

        self.fFreq becomes the percentage nOccur*100 / nTot, the previous
        frequency index is saved in self.oldFq, and self.fq is set to
        getIfq(self.fFreq).
        """
        fFreq = (self.nOccur * 100) / nTot
        self.fFreq = fFreq
        # keep the previous index before it is overwritten
        self.oldFq = self.fq
        self.fq = getIfq(fFreq)



................................................................................
        self.nDup    = 0    # duplicates in the same entry
        self.nMulti  = 0    # duplicates with other entries
        self.lMulti  = []   # list of similar flexions
        self.fFreq   = 0
        self.cFq     = ''
        self.metagfx = ''   # métagraphe
        self.metaph2 = ''   # métaphone 2
    
    def setOccur (self, n):
        "Set the number of occurrences (self.nOccur) to <n>."
        self.nOccur = n

    def setOccurAndBlock (self, n):
        "Set the number of occurrences to <n> and raise the bBlocked flag."
        self.bBlocked = True
        self.nOccur = n

    def calcOccur (self):
        "Divide nOccur by (nMulti + 1), then by nDup, and store the result rounded up in nOccur."
        nShared = self.nOccur / (self.nMulti+1)
        self.nOccur = math.ceil(nShared / self.nDup)
    
    def calcFreq (self, nTot):
        "Set self.fFreq to the percentage nOccur*100 / nTot and self.cFq to getIfq(self.fFreq)."
        fFreq = (self.nOccur * 100) / nTot
        self.fFreq = fFreq
        self.cFq = getIfq(fFreq)
    
    def calcMetagraphe (self):
        """Store the metagraphe of the flexion in self.metagfx.

        metagraphe.getMetagraphe() is called with the flexion and its
        morphology; the first returned value is stored alone, or followed by
        a slash and the second value when the second value is not empty.
        """
        t = metagraphe.getMetagraphe(self.sFlexion, self.sMorph)
        sFirst, sSecond = t[0], t[1]
        if sSecond:
            self.metagfx = sFirst+"/"+sSecond
        else:
            self.metagfx = sFirst

    def calcMetaphone2 (self):
        """Store the metaphone 2 value of the flexion in self.metaph2.

        metaphone2.dm() is called with the flexion; the first returned value
        is stored alone, or followed by a slash and the second value when the
        second value is not empty.
        """
        t = metaphone2.dm(self.sFlexion)
        sFirst, sSecond = t[0], t[1]
        if sSecond:
            self.metaph2 = sFirst+"/"+sSecond
        else:
            self.metaph2 = sFirst
................................................................................
        return (self.sFlexion.translate(CHARMAP), self.sMorph)

    def keyFreq (self):
        "Sort key: (100 - fFreq, radical of the owning entry, flexion)."
        fInverted = 100-self.fFreq
        sRadical = self.oEntry.sRadical
        return (fInverted, sRadical, self.sFlexion)

    def keyOcc (self):
        "Sort key: (number of occurrences, radical of the owning entry, flexion)."
        sRadical = self.oEntry.sRadical
        return (self.nOccur, sRadical, self.sFlexion)
        
    def keyIdx (self):
        "Sort key: the iD of the owning entry."
        oOwner = self.oEntry
        return oOwner.iD

    def keyFlexion (self):
        "Sort key: the flexion string itself."
        sKey = self.sFlexion
        return sKey


................................................................................
    def __init__ (self, sFlagType, sFlagName, sMix):
        """Create a flag without any rule.

        sFlagType: bSfx is True only when this equals 'SFX'
        sFlagName: name of the flag
        sMix:      bMix is True only when this equals 'Y'
        """
        self.sFlagName = sFlagName
        self.bSfx = bool(sFlagType == 'SFX')
        self.bMix = bool(sMix == 'Y')
        # rules are added afterwards, nRules counts them
        self.lRules = []
        self.nRules = 0
        self.nOccur = 0
        
    def addAffixRule (self, line):
        "Build an AffixRule from <line>, append it to the flag's rules and increment the rule counter."
        self.lRules.append(AffixRule(line))
        self.nRules += 1

    def getFlag (self, subDicts, oDict, nMode, bSimplified):
................................................................................
        # champs de Dicollecte
        self.lx = ''
        self.di = '*'
        # erreurs
        self.err = ''
        # autres champs
        self.nOccur = 0
        
        sLine = sLine.rstrip(" \n")
        # commentaire
        if '#' in sLine:
            sLine, comment = sLine.split('#', 1)
            self.comment = comment.strip()
        # éléments de la ligne
        elems = sLine.split()
................................................................................
                    self.lx = fields[1]  if self.lx == ''  else self.lx + ' ' + fields[1]
                elif fields[0] == 'di':
                    self.di = fields[1]
                else:
                    echo('Champ inconnu: {}  dans  {}'.format(fields[0], self.sFlagName))
            else:
                echo("  # Erreur affixe : {}".format(line))
    
    def isReplicationRule (self):
        """Tell whether this rule is used for replication of a virtual lemma.

        True when the rule has no flags and either cuts "0" and adds nothing,
        or cuts exactly what it adds.
        """
        if self.flags != "":
            return False
        if self.cut == "0" and self.add == "":
            return True
        return self.cut == self.add

    def getRuleLine (self, oDict, nMode, bSimplified=False):
        sLine = 'SFX'  if self.bSfx  else 'PFX'
        sLine += ' ' + self.sFlagName + ' ' + self.cut + ' '
................................................................................
                sLine = sLine.replace("()", "")
        sLine += ' ' + self.cond
        if not bSimplified and nMode > 0:
            sMorph = self.getMorph(nMode)
            if sMorph:
                sLine += sMorph  if not oDict.bShortenTags or bSimplified  else ' ' + oDict.dAM[sMorph.strip()]
        return sLine + "\n"
    
    def getMorph (self, nMode):
        # morphology for Hunspell
        txt = ''
        if self.po: txt += fieldToHunspell('po', self.po)
        if self.iz: txt += fieldToHunspell('is', self.iz)
        if self.ds: txt += fieldToHunspell('ds', self.ds)
        if self.ts: txt += fieldToHunspell('ts', self.ts)
................................................................................


class StatsLex:
    def __init__ (self, oDict):
        "Prepare an empty list for each distinct flexion of <oDict>.lFlexions, and an empty list of lexicons."
        echo("Lexique statistique")
        dFlexions = {}
        for oFlex in oDict.lFlexions:
            # one distinct list object per flexion
            dFlexions[oFlex.sFlexion] = []
        self.dFlexions = dFlexions
        self.lLex = []
        
    def addLexFromFile (self, sPathFile, cLexID, sLexName):
        if not os.path.isfile(sPathFile):
            echo(' * Lexique statistique - fichier {} introuvable'.format(sPathFile))
            return None
        if len(cLexID) != 1:
            echo(' * Lexique statistique - fichier {} - identifiant incorrect, 1 caractère requis'.format(sPathFile))
            return None
................................................................................
        xArgs.uncompress = True

    echo("Python: " + sys.version)
    echo("Version: " + xArgs.verdic)
    echo("Simplify: " + str(xArgs.simplify))
    echo("Mode: " + str(xArgs.mode))
    echo("Compression: " + str(not(xArgs.uncompress)))
    
    ### création du répertoire
    spBuild = BUILD_PATH + '/' + xArgs.verdic
    dir_util.mkpath(spBuild)
    
    ### Lecture des fichiers et création du dictionnaire
    oFrenchDict = Dictionnaire(xArgs.verdic, "French dictionary")
    for sFile in ['orthographe/FRANCAIS.dic']:
        oFrenchDict.readDictionary(sFile)
    oFrenchDict.readAffixes('orthographe/FRANCAIS_5.aff')
    
    ### Contrôle
    oFrenchDict.sortEntriesNatural()
    oFrenchDict.checkEntries()
    
    ### Lexique
    oFrenchDict.generateFlexions()
    oFrenchDict.calcMetagraphe()
    oFrenchDict.calcMetaphone2()

    #oFrenchDict.createNgrams(spBuild, 3)
    if xArgs.spellvariants:
................................................................................
    oStatsLex = StatsLex(oFrenchDict)
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_google_ngram_1.txt', 'G', 'Google 1-grams')
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_frwiki.txt', 'W', 'Wikipédia')
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_frwikisource.txt', 'S', 'Wikisource')
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_litterature.txt', 'L', 'Littérature')
    oStatsLex.write(spBuild+'/test_lex.txt')
    oFrenchDict.calculateStats(oStatsLex, spfStats)
    
    ### écriture des paquets
    echo("Création des paquets...")

    spLexiconDestGL = "../../../lexicons"  if xArgs.grammalecte  else ""
    spLibreOfficeExtDestGL = "../oxt/Dictionnaires/dictionaries"  if xArgs.grammalecte  else ""
    spMozillaExtDestGL = "../xpi/data/dictionaries"  if xArgs.grammalecte  else ""
    spDataDestGL = "../data"  if xArgs.grammalecte  else ""







|







 







|



|







 







|







 







|










|







 







|







 







|







 







|













|






|









|








|






|







 







|




|







 







|







 







|







 







|








|







 







|







 







|







 







|
|







 







|







 







|







 







|








|













|







 







|













|







 







|









|



|







 







|







 







|







 







|







 







|







 







|







 







|







 







|



|





|



|







 







|







21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
...
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
...
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
...
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
...
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
...
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
...
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
...
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
...
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
...
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
...
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
...
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
...
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
...
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
...
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
...
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
....
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
....
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
....
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
....
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
....
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
....
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
....
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
....
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
....
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
....
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
....
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563

import metagraphe
import metaphone2


# Dictionnaire des caractères pour le tri naturel.
# Ordre souhaitable, mais pose problème pour la recherche, car engendre des égalités de lemmes différents.
# Il faut donc travailler sur un dictionnaire trié *numériquement* et le sauvegarder selon le tri *naturel*
CHARMAP = str.maketrans({ 'à': 'a',  'À': 'A',  'â': 'a',  'Â': 'A',  'ä': 'a',  'Ä': 'A',  'å': 'a',  'Å': 'A',  'ā': 'a',  'Ā': 'A',
                          'ç': 'c',  'Ç': 'C',
                          'é': 'e',  'É': 'E',  'è': 'e',  'È': 'E',  'ê': 'e',  'Ê': 'E',  'ë': 'e',  'Ë': 'E',  'ē': 'e',  'Ē': 'E',
                          'î': 'i',  'Î': 'I',  'ï': 'i',  'Ï': 'I',  'ī': 'i',  'Ī': 'I',
                          'ñ': 'n',
                          'ô': 'o',  'Ô': 'O',  'ö': 'o',  'Ö': 'O',  'ō': 'o',  'Ō': 'O',
                          'ù': 'u',  'Ù': 'U',  'û': 'u',  'Û': 'U',  'ü': 'u',  'Ü': 'U',  'ū': 'u',  'Ū': 'U',
................................................................................
        # Affixes
        self.sSettings = '' # enregistre tout avant la ligne # END
        self.dFlags = collections.OrderedDict()
        self.bShortenTags = False
        self.dAM = collections.OrderedDict() # étiquettes morphologiques
        self.dAF = collections.OrderedDict() # étiquettes drapeaux
        # Flexions
        self.lFlexions = []           # liste des flexions avec lemme, morphologie et occurrences
        self.lStatsLex = []
        self.nTotOccurRecognizedWords = 0
        self.aFlexions = None

    def readDictionary (self, spf):
        "Lecture du dictionnaire"
        echo('Dictionnaire << [ {} ]'.format(spf), end=' ')
        for sLine in readfile(spf):
            sLine = sLine.strip()
            if not sLine.isdigit() and not sLine.startswith("#"):
                self.lEntry.append(Entree(sLine))
................................................................................
                dAF[oEntry.flags] = dAF.get(oEntry.flags, 0) + 1
            sMorph = oEntry.getMorph(nMode).strip()
            if sMorph:
                dAM[sMorph] = dAM.get(sMorph, 0) + 1

        lAF = sorted(dAF.items(), key = lambda x: (x[1], x[0]), reverse=True)
        lAM = sorted(dAM.items(), key = lambda x: (x[1], x[0]), reverse=True)

        with open(spDst, 'a', encoding='utf-8', newline="\n") as hDst:
            hDst.write("\n\nDrapeaux :\n")
            for nAF, elem in enumerate(lAF, 1):
                self.dAF[elem[0]] = str(nAF)
                hDst.write("  > {0[1]:>8} : {0[0]}\n".format(elem))
            hDst.write("\n\nMorphologies :\n")
            for nAM, elem in enumerate(lAM, 1):
................................................................................
            if oEntry.di in dTplVars['subDicts']:
                nEntry += 1
        with open(spDst+'/'+dTplVars['asciiName']+'.dic', 'w', encoding='utf-8', newline="\n") as hDst:
            hDst.write(str(nEntry)+"\n")
            for oEntry in self.lEntry:
                if oEntry.di in dTplVars['subDicts']:
                    hDst.write(oEntry.getEntryLine(self, nMode, bSimplified))

    def writeAffixes (self, spDst, dTplVars, nMode, bSimplified):
        "Écrire le fichier des affixes (.aff)"
        echo(' * Dictionnaire >> [ {}.aff ]'.format(dTplVars['asciiName']))
        info = "# This Source Code Form is subject to the terms of the Mozilla Public\n" + \
               "# License, v. 2.0. If a copy of the MPL was not distributed with this\n" + \
               "# file, You can obtain one at http://mozilla.org/MPL/2.0/.\n\n" + \
               "# AFFIXES DU {} v{}\n".format(dTplVars['name'], self.sVersion) + \
               "# par Olivier R. -- licence MPL 2.0\n" + \
               "# Généré le " + time.strftime("%d-%m-%Y à %H:%M") + "\n" \
               "# Pour améliorer le dictionnaire, allez sur http://www.dicollecte.org/\n\n"

        with open(spDst+'/'+dTplVars['asciiName']+'.aff', 'w', encoding='utf-8', newline="\n") as hDst:
            hDst.write(info)
            hDst.write(self.sSettings + "\n")
            if self.bShortenTags:
                hDst.write("AM {}\n".format(len(self.dAM)))
                for item in self.dAM.items():
                    hDst.write("AM {}\n".format(item[0]))
................................................................................

    def sortEntriesNatural (self):
        "Rebind self.lEntry to a new list of its entries ordered by the Entree.keyTriNat key."
        echo(' * Dictionnaire - Tri naturel des entrées...')
        lSorted = sorted(self.lEntry, key=Entree.keyTriNat)
        self.lEntry = lSorted

    def sortEntriesNumerical (self):
        "Rebind self.lEntry to a new list of its entries ordered by the Entree.keyTriNum key."
        echo(' * Dictionnaire - Tri numérique des entrées...')
        lSorted = sorted(self.lEntry, key=Entree.keyTriNum)
        self.lEntry = lSorted

    def sortLexiconByFlexion (self):
        "Rebind self.lFlexions to a new list of its items ordered by the Flexion.keyFlexion key."
        echo(' * Dictionnaire - tri du lexique (par flexion)...')
        lSorted = sorted(self.lFlexions, key=Flexion.keyFlexion)
        self.lFlexions = lSorted

    def sortLexiconByFreq (self):
        echo(' * Dictionnaire - tri du lexique (par fréquence)...')
................................................................................
                d[oFlex.sFlexion] = [oFlex.oEntry]
        for oFlex in self.lFlexions:
            oFlex.lMulti = list(d[oFlex.sFlexion])
            oFlex.nMulti = len(oFlex.lMulti)
        for oFlex in self.lFlexions:
            oFlex.lMulti.remove(oFlex.oEntry)
            oFlex.nMulti -= 1

    def setTagsFrom (self, other):
        """Copy tags onto local entries from the matching entries of <other>.

        The first self.nEntry items of self.lEntry are visited. For each of them,
        every entry of other.lEntry with the same lemma and the same flags is
        passed to the local entry's setTagsFrom(), in the order of other.lEntry.
        """
        echo(' * Dictionnaire - copie des tags...')
        for i in range(self.nEntry):
            oMine = self.lEntry[i]
            for oTheirs in other.lEntry:
                # skip entries that differ by lemma or by flags
                if oMine.lemma != oTheirs.lemma or oMine.flags != oTheirs.flags:
                    continue
                oMine.setTagsFrom(oTheirs)

................................................................................
            hDst.write(oStatsLex.getInfo())
            for oFlex in self.lFlexions:
                oFlex.setOccur(oStatsLex.getFlexionOccur(oFlex.sFlexion))
            self.nTotOccurRecognizedWords = 0
            for oFlex in self.lFlexions:
                oFlex.calcOccur()
                self.nTotOccurRecognizedWords += oFlex.nOccur

            # Report des occurrences
            echo("   report des occurrences des formes fléchies multiples...")
            hDst.write("Report des occurrences des formes fléchies multiples :\n")
            hDst.write("  Légende :\n")
            hDst.write("    >>   le nombre d’occurrences de la flexion est ramené à la moyenne.\n")
            hDst.write("    +>   le nombre d’occurrences de la flexion est augmenté avec le surplus d’occurrences des flexions ramenées à la moyenne.\n")
            hDst.write("    %>   le nombre d’occurrences de la flexion est pondéré avec le poids de la moyenne de l’entrée.\n\n")

            for oEntry in self.lEntry:
                oEntry.calcOccurFromFlexions()
                oEntry.calcAverageKnownOccurrence()
                oEntry.solveOccurMultipleFlexions(hDst, oStatsLex)
                oEntry.calcOccurFromFlexions()

            # Fréquences
            echo("   calcul des fréquences et indices de fréquence...")
            for oFlex in self.lFlexions:
                oFlex.calcFreq(self.nTotOccurRecognizedWords)
            for oEntry in self.lEntry:
                oEntry.calcFreq(self.nTotOccurRecognizedWords)

            # Entrées, statistiques
            echo("   statistiques...")
            hDst.write("\n\nNatures grammaticales :\n")
            d = {}
            for oEntry in self.lEntry:
                po = re.sub("(?<=v[0-3])[itnpqrmaezx_]+", "", oEntry.po)
                d[po] = d.get(po, 0) + 1
            for e in sorted(d.items(), key = lambda x: (x[1], x[0]), reverse=True):
                hDst.write(" * {0[1]:<15} : {0[0]}\n".format(e))

            hDst.write("\n\nVentilation des entrées par indice de fréquence :\n")
            d1 = {}
            d2 = {}
            for oEntry in self.lEntry:
                d1[oEntry.fq] = d1.get(oEntry.fq, 0) + 1
                d2[oEntry.fq] = d2.get(oEntry.fq, 0) + oEntry.fFreq
            for k in sorted(d1.keys()):
                hDst.write(" * {} : {} entrées ({:.2f} %)  → {:.9f} %\n".format(k, d1[k], (d1[k]*100)/self.nEntry, d2[k]))

            hDst.write("\n\nRépartition des entrées par sous-dictionnaire :\n")
            d = {}
            for oEntry in self.lEntry:
                d[oEntry.di] = d.get(oEntry.di, 0) + 1
            for sKey, nVal in d.items():
                hDst.write(" * {0:<15} : {1} entrées ({2:.2f} %)\n".format(dSUBDIC[sKey], nVal, (nVal*100)/self.nEntry))

            # Occurrences des lettres
            echo("   occurrences des lettres...")
            d = {}
            for oFlex in self.lFlexions:
                for c in oFlex.sFlexion:
                    d[c] = d.get(c, 0) + oFlex.nOccur
            nTot = 0
................................................................................
            hDst.write("\n\nNombre de formes fléchies : {}\n".format(len(self.lFlexions)))
            hDst.write("\n\nNombre de graphies : {}\n".format(len(self.aFlexions)))

    def calcMetagraphe (self):
        "Call calcMetagraphe() on every item of self.lFlexions, in list order."
        echo(" * Lexique - Metagraphe")
        for oFlexion in self.lFlexions:
            oFlexion.calcMetagraphe()

    def calcMetaphone2 (self):
        "Call calcMetaphone2() on every item of self.lFlexions, in list order."
        echo(" * Lexique - Metaphone 2")
        for oFlexion in self.lFlexions:
            oFlexion.calcMetaphone2()

    def createNgrams (self, spDest, n):
        echo(" * Lexique - Ngrams " + str(n))
        if n < 2:
            echo("erreur: n = " + str(n))
            return
        dOccur = {} # ngram:n
        dRefW = {} # ngram:set(idx)
................................................................................
        file_util.copy_file('_templates/ooo/french_flag.png', spExt)
        file_util.copy_file('_templates/ooo/french_flag_16.bmp', spExt+'/ui')
        copyTemplate('_templates/ooo', spExt, 'description.xml', dTplVars)
        copyTemplate('_templates/ooo', spExt, 'dictionaries.xcu', dTplVars)
        #file_util.copy_file('_templates/ooo/dictionaries.xcu.tpl.xml', spExt)
        copyTemplate('_templates/ooo', spExt, 'package-description.txt', dTplVars)
        for dVars in lDictVars:
            dicPath = spBuild + '/' + PREFIX_DICT_PATH + self.sVersion
            file_util.copy_file(dicPath+'/'+dVars['asciiName']+'.dic', spExt+'/dictionaries/'+dVars['asciiName']+'.dic')
            file_util.copy_file(dicPath+'/'+dVars['asciiName']+'.aff', spExt+'/dictionaries/'+dVars['asciiName']+'.aff')
        copyTemplate('orthographe', spExt+'/dictionaries', 'README_dict_fr.txt', dTplVars)
        # thesaurus
        file_util.copy_file('thesaurus/thes_fr.dat', spExt+'/dictionaries')
        file_util.copy_file('thesaurus/thes_fr.idx', spExt+'/dictionaries')
        file_util.copy_file('thesaurus/README_thes_fr.txt', spExt+'/dictionaries')
................................................................................
        file_util.copy_file('césures/README_hyph_fr-2.9.txt', spExt+'/dictionaries')
        # zip
        createZipFiles(spExt, spBuild, sExtensionName + '.oxt')
        # copy to Grammalecte Project
        if spDestGL:
            echo("   extension copiée dans Grammalecte...")
            dir_util.copy_tree(spExt+'/dictionaries', spDestGL)

    def createMozillaExtensions (self, spBuild, dTplVars, lDictVars, spDestGL=""):
        # Mozilla extension 1
        echo(" * Dictionnaire >> extension pour Mozilla")
        dTplVars['version'] = self.sVersion
        sExtensionName = EXT_PREFIX_MOZ + self.sVersion
        spExt = spBuild + '/' + sExtensionName
        dir_util.mkpath(spExt+'/dictionaries')
................................................................................
        createZipFiles(spExt, spBuild, sExtensionName + '.xpi')
        # Grammalecte
        if spDestGL:
            echo(" * Dictionnaire >> copie des dicos dans Grammalecte")
            for dVars in lDictVars:
                file_util.copy_file(spDict+'/'+dVars['asciiName']+'.dic', spDestGL+'/'+dVars['mozAsciiName']+"/"+dVars['mozAsciiName']+'.dic')
                file_util.copy_file(spDict+'/'+dVars['asciiName']+'.aff', spDestGL+'/'+dVars['mozAsciiName']+"/"+dVars['mozAsciiName']+'.aff')

    def createFileIfqForDB (self, spBuild):
        """Write the entries whose frequency index changed into two files under <spBuild>.

        Both files are named after self.sVersion and are (re)created on each call:
        - dictIdxIfq-<version>.diff.txt receives "<iD> TAB <fq>" lines;
        - dictIdxIfq-<version>.notes.txt receives "<lemma>/<flags> TAB <oldFq> > <fq>" lines.
        Entries whose fq equals oldFq are written to neither file.
        """
        echo(" * Dictionnaire >> indices de fréquence pour la DB...")
        spfBase = spBuild+'/dictIdxIfq-'+self.sVersion
        with open(spfBase+'.diff.txt', 'w', encoding='utf-8', newline="\n") as hDiff, \
             open(spfBase+'.notes.txt', 'w', encoding='utf-8', newline="\n") as hNotes:
            for oEntry in self.lEntry:
                if oEntry.fq == oEntry.oldFq:
                    # unchanged frequency index: nothing to report
                    continue
                hDiff.write("{0.iD}\t{0.fq}\n".format(oEntry))
                hNotes.write("{0.lemma}/{0.flags}\t{0.oldFq} > {0.fq}\n".format(oEntry))

    def createLexiconPackages (self, spBuild, version, oStatsLex, spDestGL=""):
        sLexName = LEX_PREFIX + version
        spLex = spBuild + '/' + sLexName
        dir_util.mkpath(spLex)
        # write Dicollecte lexicon
        self.sortLexiconByFreq()
        self.writeLexicon(spLex + '/' + sLexName + '.txt', version, oStatsLex)
................................................................................
        self.nFlexions = 0
        self.lFlexions = []
        self.sRadical = ''
        self.nOccur = 0
        self.nAKO = -1   # Average known occurrences
        self.fFreq = 0
        self.oldFq = ''

        sLine = sLine.rstrip(" \n")
        # commentaire
        if '#' in sLine:
            sLine, comment = sLine.split('#', 1)
            self.comment = comment.strip()
        # éléments de la ligne
        elems = sLine.split()
................................................................................
                else:
                    echo('  ## Champ inconnu: {}  dans  {}/{}'.format(fields[0], self.lemma, self.flags))
            else:
                self.err = self.err + elems[i]
        if self.err:
            echo("\n## Erreur dans le dictionnaire : {}".format(self.err))
            echo("   dans : " + self.lemma)

    def __str__ (self):
        "Return '<lemma>/<flags> <morph>', where <morph> is the result of self.getMorph(2)."
        sMorph = self.getMorph(2)
        return "{}/{} {}".format(self.lemma, self.flags, sMorph)

    def check (self):
        sErr = ''
        if self.lemma == '':
            sErr += 'lemme vide'
................................................................................

    def keyTriNat (self):
        "Return the sort key (lemma translated through CHARMAP, flags, po)."
        sKey = self.lemma.translate(CHARMAP)
        return (sKey, self.flags, self.po)

    def keyTriNum (self):
        "Return the sort key (lemma, flags, po), with the lemma left untranslated."
        sLemma = self.lemma
        sFlags = self.flags
        return (sLemma, sFlags, self.po)

    def getEntryLine (self, oDict, nMode, bSimplified=False):
        sLine = self.lemma.replace("’", "'")
        if self.flags:
            sLine += '/'
            sLine += self.flags  if not oDict.bShortenTags or bSimplified  else oDict.dAF[self.flags]
        if bSimplified:
            return sLine.replace("()", "") + "\n"
        if nMode > 0:
            sMorph = self.getMorph(nMode)
................................................................................
                    #echo(sFlex + " " + sMorph + ", ")
                    pass
        # Drapeaux dont le lemme féminin doit être remplacé par le masculin dans la gestion des formes fléchies
        if self.flags.startswith(("F.", "F*", "W.", "W*")):
            # recherche de la forme masculine
            for t in lTuples:
                sMorph = self.clean(t[1])
                if sMorph.endswith('mas') or sMorph.endswith('mas sg') or sMorph.endswith('mas inv'):
                    self.sRadical = t[0]
        else:
            self.sRadical = self.lemma
        # Tag duplicates
        d = {}
        for oFlex in self.lFlexions:
            d[oFlex.sFlexion] = d.get(oFlex.sFlexion, 0) + 1
................................................................................
                                        lFlexions.append( (oRule.add+flex[0], flex[1]+ruleMorph) )
                                else:
                                    lFlexions.append(flexion)
                            else:
                                flexion = (self.lemma.replace(oRule.cut, oRule.add, 1), ruleMorph+morph, oRule.di)
                                if oFlag.bMix:
                                    lFlexPrefix.append(flexion)
                                    for flex in lFlexSuffix:
                                        lFlexions.append( (flex[0].replace(oRule.cut, oRule.add, 1), flex[1]+ruleMorph) )
                                else:
                                    lFlexions.append(flexion)
                            if oRule.flags != '' and oRule.flags != '**':
                                lFlexions.extend(Entree(flexion[0]+'/'+oRule.flags)._flechir(dFlags, flexion[1], iPR+1))
                else:
                    # cas des suffixes
................................................................................
    def calcOccurFromFlexions (self):
        "Set self.nOccur to the sum of nOccur over self.lFlexions (0 when the list is empty)."
        self.nOccur = sum(oFlex.nOccur  for oFlex in self.lFlexions)

    def calcAverageKnownOccurrence (self):
        """Compute self.nAKO, the average occurrence count of the unshared flexions.

        Only flexions with no equivalent in other entries (nMulti == 0) are
        averaged; the mean is rounded up. When there is no such flexion,
        self.nAKO is set to -1.
        """
        lKnown = [ oFlex.nOccur  for oFlex in self.lFlexions  if oFlex.nMulti == 0 ]
        if lKnown:
            self.nAKO = math.ceil(sum(lKnown) / len(lKnown))
        else:
            self.nAKO = -1

    def solveOccurMultipleFlexions (self, hDst, oStatsLex):
        sBlank = "           "
        if self.nAKO >= 0:
            for oFlex in self.lFlexions:
                if oFlex.nMulti > 0 and not oFlex.bBlocked:
                    # on trie les entrées avec AKO et sans AKO
                    lEntWithAKO = []
                    lEntNoAKO = []
                    for oEntry in oFlex.lMulti:
                        if oEntry.nAKO >= 0:
                            lEntWithAKO.append(oEntry)
                        else:
                            lEntNoAKO.append(oEntry)

                    if lEntNoAKO:
                        # on calcule la différence totale occasionnée par du passage des flexions appartenant à des entrées avec AKO au niveau AKO
                        nDiff = (oFlex.nOccur - self.nAKO) * oFlex.nDup
                        for oEntry in lEntWithAKO:
                            for oFlexM in oEntry.lFlexions:
                                if oFlex.sFlexion == oFlexM.sFlexion:
                                    nDiff += oFlexM.nOccur - oEntry.nAKO
................................................................................
                                        oFlexM.setOccurAndBlock(nNewOccur)
                    else:
                        # Toutes les entrées sont avec AKO : on pondère
                        nFlexOccur = oStatsLex.getFlexionOccur(oFlex.sFlexion)
                        nTotAKO = self.nAKO
                        for oEnt in oFlex.lMulti:
                            nTotAKO += oEnt.nAKO

                        hDst.write(" = {0.sFlexion}\n".format(oFlex))
                        hDst.write("       moyennes connues\n")
                        for oFlexD in self.lFlexions:
                            if oFlex.sFlexion == oFlexD.sFlexion:
                                nNewOccur = math.ceil((nFlexOccur * (self.nAKO / nTotAKO)) / oFlexD.nDup)  if nTotAKO  else 0
                                hDst.write(sBlank + "{2:<30} {0.sMorph:<30}  {0.nOccur:>10}  %> {1:>10}\n".format(oFlexD, nNewOccur, self.getShortDescr()))
                                oFlexD.setOccurAndBlock(nNewOccur)
                        for oEntry in oFlex.lMulti:
                            for oFlexM in oEntry.lFlexions:
                                if oFlex.sFlexion == oFlexM.sFlexion:
                                    nNewOccur = math.ceil((nFlexOccur * (oEntry.nAKO / nTotAKO)) / oFlexM.nDup)  if nTotAKO  else 0
                                    hDst.write(sBlank + "{2:<30} {0.sMorph:<30}  {0.nOccur:>10}  %> {1:>10}\n".format(oFlexM, nNewOccur, oEntry.getShortDescr()))
                                    oFlexM.setOccurAndBlock(nNewOccur)

    def calcFreq (self, nTot):
        """Compute the frequency of the entry as a percentage of <nTot>.

        Stores the percentage in self.fFreq, saves the previous frequency index
        in self.oldFq, then sets self.fq to getIfq(percentage).
        Raises ZeroDivisionError if <nTot> is 0.
        """
        fPercent = (self.nOccur * 100) / nTot
        self.fFreq = fPercent
        self.oldFq = self.fq
        self.fq = getIfq(fPercent)



................................................................................
        self.nDup    = 0    # duplicates in the same entry
        self.nMulti  = 0    # duplicates with other entries
        self.lMulti  = []   # list of similar flexions
        self.fFreq   = 0
        self.cFq     = ''
        self.metagfx = ''   # métagraphe
        self.metaph2 = ''   # métaphone 2

    def setOccur (self, n):
        "Set the occurrence count self.nOccur to <n>."
        self.nOccur = n

    def setOccurAndBlock (self, n):
        "Set the occurrence count self.nOccur to <n> and raise the self.bBlocked flag."
        self.nOccur, self.bBlocked = n, True

    def calcOccur (self):
        """Replace self.nOccur by its share for this flexion, rounded up.

        The count is first divided by (self.nMulti + 1), then by self.nDup.
        Raises ZeroDivisionError if self.nDup is 0.
        """
        fShare = self.nOccur / (self.nMulti+1)
        self.nOccur = math.ceil(fShare / self.nDup)

    def calcFreq (self, nTot):
        """Compute the frequency of the flexion as a percentage of <nTot>.

        Stores the percentage in self.fFreq and getIfq(percentage) in self.cFq.
        Raises ZeroDivisionError if <nTot> is 0.
        """
        fPercent = (self.nOccur * 100) / nTot
        self.fFreq = fPercent
        self.cFq = getIfq(fPercent)

    def calcMetagraphe (self):
        """Store in self.metagfx the result of metagraphe.getMetagraphe() for this flexion.

        The first item of the result is used alone when the second one is falsy;
        otherwise both are joined with "/".
        """
        tMeta = metagraphe.getMetagraphe(self.sFlexion, self.sMorph)
        if tMeta[1]:
            self.metagfx = tMeta[0]+"/"+tMeta[1]
        else:
            self.metagfx = tMeta[0]

    def calcMetaphone2 (self):
        """Store in self.metaph2 the result of metaphone2.dm() for this flexion.

        The first item of the result is used alone when the second one is falsy;
        otherwise both are joined with "/".
        """
        tMeta = metaphone2.dm(self.sFlexion)
        if tMeta[1]:
            self.metaph2 = tMeta[0]+"/"+tMeta[1]
        else:
            self.metaph2 = tMeta[0]
................................................................................
        return (self.sFlexion.translate(CHARMAP), self.sMorph)

    def keyFreq (self):
        "Return the sort key (100 - fFreq, sRadical of the owning entry, sFlexion)."
        fRank = 100-self.fFreq
        sRadical = self.oEntry.sRadical
        return (fRank, sRadical, self.sFlexion)

    def keyOcc (self):
        "Return the sort key (nOccur, sRadical of the owning entry, sFlexion)."
        nOccur = self.nOccur
        sRadical = self.oEntry.sRadical
        return (nOccur, sRadical, self.sFlexion)

    def keyIdx (self):
        "Return the sort key: the iD attribute of the owning entry."
        oOwner = self.oEntry
        return oOwner.iD

    def keyFlexion (self):
        "Return the sort key: the flexion string self.sFlexion."
        return self.sFlexion


................................................................................
    def __init__ (self, sFlagType, sFlagName, sMix):
        """Initialize a flag with no rule and zeroed counters.

        sFlagType: self.bSfx is True only when this equals 'SFX'.
        sFlagName: stored unchanged in self.sFlagName.
        sMix: self.bMix is True only when this equals 'Y'.
        """
        self.sFlagName = sFlagName
        self.bSfx = bool(sFlagType == 'SFX')
        self.bMix = bool(sMix == 'Y')
        self.lRules = []
        self.nRules = 0
        self.nOccur = 0

    def addAffixRule (self, line):
        "Build an AffixRule from <line>, append it to self.lRules and increment self.nRules."
        self.lRules.append(AffixRule(line))
        self.nRules = self.nRules + 1

    def getFlag (self, subDicts, oDict, nMode, bSimplified):
................................................................................
        # champs de Dicollecte
        self.lx = ''
        self.di = '*'
        # erreurs
        self.err = ''
        # autres champs
        self.nOccur = 0

        sLine = sLine.rstrip(" \n")
        # commentaire
        if '#' in sLine:
            sLine, comment = sLine.split('#', 1)
            self.comment = comment.strip()
        # éléments de la ligne
        elems = sLine.split()
................................................................................
                    self.lx = fields[1]  if self.lx == ''  else self.lx + ' ' + fields[1]
                elif fields[0] == 'di':
                    self.di = fields[1]
                else:
                    echo('Champ inconnu: {}  dans  {}'.format(fields[0], self.sFlagName))
            else:
                echo("  # Erreur affixe : {}".format(line))

    def isReplicationRule (self):
        """Tell whether this rule is used for replication of a virtual lemma.

        True when the rule has no flags and either cuts "0" and adds nothing,
        or cuts and adds the same string.
        """
        if self.flags != "":
            return False
        if self.cut == "0" and self.add == "":
            return True
        return self.cut == self.add

    def getRuleLine (self, oDict, nMode, bSimplified=False):
        sLine = 'SFX'  if self.bSfx  else 'PFX'
        sLine += ' ' + self.sFlagName + ' ' + self.cut + ' '
................................................................................
                sLine = sLine.replace("()", "")
        sLine += ' ' + self.cond
        if not bSimplified and nMode > 0:
            sMorph = self.getMorph(nMode)
            if sMorph:
                sLine += sMorph  if not oDict.bShortenTags or bSimplified  else ' ' + oDict.dAM[sMorph.strip()]
        return sLine + "\n"

    def getMorph (self, nMode):
        # morphology for Hunspell
        txt = ''
        if self.po: txt += fieldToHunspell('po', self.po)
        if self.iz: txt += fieldToHunspell('is', self.iz)
        if self.ds: txt += fieldToHunspell('ds', self.ds)
        if self.ts: txt += fieldToHunspell('ts', self.ts)
................................................................................


class StatsLex:
    def __init__ (self, oDict):
        """Prepare the statistics containers from the flexions of <oDict>.

        self.dFlexions maps each distinct sFlexion of oDict.lFlexions to its own
        empty list; self.lLex starts empty.
        """
        echo("Lexique statistique")
        dFlexions = {}
        for oFlex in oDict.lFlexions:
            dFlexions[oFlex.sFlexion] = []
        self.dFlexions = dFlexions
        self.lLex = []

    def addLexFromFile (self, sPathFile, cLexID, sLexName):
        if not os.path.isfile(sPathFile):
            echo(' * Lexique statistique - fichier {} introuvable'.format(sPathFile))
            return None
        if len(cLexID) != 1:
            echo(' * Lexique statistique - fichier {} - identifiant incorrect, 1 caractère requis'.format(sPathFile))
            return None
................................................................................
        xArgs.uncompress = True

    echo("Python: " + sys.version)
    echo("Version: " + xArgs.verdic)
    echo("Simplify: " + str(xArgs.simplify))
    echo("Mode: " + str(xArgs.mode))
    echo("Compression: " + str(not(xArgs.uncompress)))

    ### création du répertoire
    spBuild = BUILD_PATH + '/' + xArgs.verdic
    dir_util.mkpath(spBuild)

    ### Lecture des fichiers et création du dictionnaire
    oFrenchDict = Dictionnaire(xArgs.verdic, "French dictionary")
    for sFile in ['orthographe/FRANCAIS.dic']:
        oFrenchDict.readDictionary(sFile)
    oFrenchDict.readAffixes('orthographe/FRANCAIS_5.aff')

    ### Contrôle
    oFrenchDict.sortEntriesNatural()
    oFrenchDict.checkEntries()

    ### Lexique
    oFrenchDict.generateFlexions()
    oFrenchDict.calcMetagraphe()
    oFrenchDict.calcMetaphone2()

    #oFrenchDict.createNgrams(spBuild, 3)
    if xArgs.spellvariants:
................................................................................
    oStatsLex = StatsLex(oFrenchDict)
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_google_ngram_1.txt', 'G', 'Google 1-grams')
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_frwiki.txt', 'W', 'Wikipédia')
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_frwikisource.txt', 'S', 'Wikisource')
    oStatsLex.addLexFromFile('lexique/corpus_data/stats_litterature.txt', 'L', 'Littérature')
    oStatsLex.write(spBuild+'/test_lex.txt')
    oFrenchDict.calculateStats(oStatsLex, spfStats)

    ### écriture des paquets
    echo("Création des paquets...")

    spLexiconDestGL = "../../../lexicons"  if xArgs.grammalecte  else ""
    spLibreOfficeExtDestGL = "../oxt/Dictionnaires/dictionaries"  if xArgs.grammalecte  else ""
    spMozillaExtDestGL = "../xpi/data/dictionaries"  if xArgs.grammalecte  else ""
    spDataDestGL = "../data"  if xArgs.grammalecte  else ""