Grammalecte  Hex Artifact Content

Artifact 0c60e1da751743178baf4ff4a082d20b624b67ec34c3698efa82119291348cf4:


0000: 23 21 70 79 74 68 6f 6e 33 0a 23 20 4a 75 73 74  #!python3.# Just
0010: 20 61 20 66 69 6c 65 20 66 6f 72 20 6f 6e 65 2d   a file for one-
0020: 73 68 6f 74 20 73 63 72 69 70 74 73 0a 0a 69 6d  shot scripts..im
0030: 70 6f 72 74 20 6f 73 0a 69 6d 70 6f 72 74 20 73  port os.import s
0040: 79 73 0a 69 6d 70 6f 72 74 20 72 65 0a 0a 69 6d  ys.import re..im
0050: 70 6f 72 74 20 67 72 61 6d 6d 61 6c 65 63 74 65  port grammalecte
0060: 2e 69 62 64 61 77 67 20 61 73 20 69 62 64 61 77  .ibdawg as ibdaw
0070: 67 0a 0a 6f 44 69 63 74 20 3d 20 69 62 64 61 77  g..oDict = ibdaw
0080: 67 2e 49 42 44 41 57 47 28 22 46 72 65 6e 63 68  g.IBDAWG("French
0090: 2e 62 64 69 63 22 29 0a 0a 0a 64 65 66 20 72 65  .bdic")...def re
00a0: 61 64 46 69 6c 65 20 28 73 70 66 29 3a 0a 20 20  adFile (spf):.  
00b0: 20 20 69 66 20 6f 73 2e 70 61 74 68 2e 69 73 66    if os.path.isf
00c0: 69 6c 65 28 73 70 66 29 3a 0a 20 20 20 20 20 20  ile(spf):.      
00d0: 20 20 77 69 74 68 20 6f 70 65 6e 28 73 70 66 2c    with open(spf,
00e0: 20 22 72 22 2c 20 65 6e 63 6f 64 69 6e 67 3d 22   "r", encoding="
00f0: 75 74 66 2d 38 22 29 20 61 73 20 68 53 72 63 3a  utf-8") as hSrc:
0100: 0a 20 20 20 20 20 20 20 20 20 20 20 20 66 6f 72  .            for
0110: 20 73 4c 69 6e 65 20 69 6e 20 68 53 72 63 3a 0a   sLine in hSrc:.
0120: 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20                  
0130: 79 69 65 6c 64 20 73 4c 69 6e 65 0a 20 20 20 20  yield sLine.    
0140: 65 6c 73 65 3a 0a 20 20 20 20 20 20 20 20 70 72  else:.        pr
0150: 69 6e 74 28 22 23 20 45 72 72 6f 72 3a 20 66 69  int("# Error: fi
0160: 6c 65 20 6e 6f 74 20 66 6f 75 6e 64 2e 22 29 0a  le not found.").
0170: 0a 23 20 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  .# -------------
0180: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0190: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
01a0: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
01b0: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
01c0: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
01d0: 2d 2d 2d 2d 2d 0a 0a 64 65 66 20 6c 69 73 74 55  -----..def listU
01e0: 6e 6b 6e 6f 77 6e 57 6f 72 64 73 20 28 73 70 66  nknownWords (spf
01f0: 29 3a 0a 20 20 20 20 77 69 74 68 20 6f 70 65 6e  ):.    with open
0200: 28 73 70 66 2b 22 2e 72 65 73 2e 74 78 74 22 2c  (spf+".res.txt",
0210: 20 22 77 22 2c 20 65 6e 63 6f 64 69 6e 67 3d 22   "w", encoding="
0220: 75 74 66 2d 38 22 29 20 61 73 20 68 44 73 74 3a  utf-8") as hDst:
0230: 0a 20 20 20 20 20 20 20 20 66 6f 72 20 73 4c 69  .        for sLi
0240: 6e 65 20 69 6e 20 72 65 61 64 46 69 6c 65 28 73  ne in readFile(s
0250: 70 66 53 72 63 29 3a 0a 20 20 20 20 20 20 20 20  pfSrc):.        
0260: 20 20 20 20 73 4c 69 6e 65 20 3d 20 73 4c 69 6e      sLine = sLin
0270: 65 2e 73 74 72 69 70 28 29 0a 20 20 20 20 20 20  e.strip().      
0280: 20 20 20 20 20 20 69 66 20 73 4c 69 6e 65 3a 0a        if sLine:.
0290: 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20                  
02a0: 66 6f 72 20 73 57 6f 72 64 20 69 6e 20 73 4c 69  for sWord in sLi
02b0: 6e 65 2e 73 70 6c 69 74 28 29 3a 0a 20 20 20 20  ne.split():.    
02c0: 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20                  
02d0: 69 66 20 6e 6f 74 20 6f 44 69 63 74 2e 69 73 56  if not oDict.isV
02e0: 61 6c 69 64 28 73 57 6f 72 64 29 3a 20 0a 20 20  alid(sWord): .  
02f0: 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20                  
0300: 20 20 20 20 20 20 68 44 73 74 2e 77 72 69 74 65        hDst.write
0310: 28 73 57 6f 72 64 2b 22 5c 6e 22 29 0a 0a 23 20  (sWord+"\n")..# 
0320: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0330: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0340: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0350: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0360: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0370: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0380: 2d 2d 0a 0a 64 65 66 20 63 72 65 61 74 65 4c 65  --..def createLe
0390: 78 53 74 61 74 46 69 6c 65 20 28 73 70 66 2c 20  xStatFile (spf, 
03a0: 64 53 74 61 74 29 3a 0a 20 20 20 20 64 57 6f 72  dStat):.    dWor
03b0: 64 20 3d 20 7b 7d 0a 20 20 20 20 66 6f 72 20 69  d = {}.    for i
03c0: 2c 20 73 4c 69 6e 65 20 69 6e 20 65 6e 75 6d 65  , sLine in enume
03d0: 72 61 74 65 28 72 65 61 64 46 69 6c 65 28 73 70  rate(readFile(sp
03e0: 66 29 29 3a 0a 20 20 20 20 20 20 20 20 69 66 20  f)):.        if 
03f0: 6e 6f 74 20 73 4c 69 6e 65 2e 73 74 61 72 74 73  not sLine.starts
0400: 77 69 74 68 28 22 23 22 29 3a 0a 20 20 20 20 20  with("#"):.     
0410: 20 20 20 20 20 20 20 73 57 6f 72 64 20 3d 20 73         sWord = s
0420: 4c 69 6e 65 2e 73 74 72 69 70 28 29 0a 20 20 20  Line.strip().   
0430: 20 20 20 20 20 20 20 20 20 69 66 20 73 57 6f 72           if sWor
0440: 64 20 6e 6f 74 20 69 6e 20 64 57 6f 72 64 3a 0a  d not in dWord:.
0450: 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20                  
0460: 64 57 6f 72 64 5b 73 57 6f 72 64 5d 20 3d 20 64  dWord[sWord] = d
0470: 53 74 61 74 2e 67 65 74 28 73 57 6f 72 64 2c 20  Stat.get(sWord, 
0480: 30 29 0a 20 20 20 20 20 20 20 20 70 72 69 6e 74  0).        print
0490: 28 69 2c 20 65 6e 64 3d 22 5c 72 22 29 0a 0a 20  (i, end="\r").. 
04a0: 20 20 20 77 69 74 68 20 6f 70 65 6e 28 73 70 66     with open(spf
04b0: 2b 22 2e 72 65 73 2e 74 78 74 22 2c 20 22 77 22  +".res.txt", "w"
04c0: 2c 20 65 6e 63 6f 64 69 6e 67 3d 22 75 74 66 2d  , encoding="utf-
04d0: 38 22 29 20 61 73 20 68 44 73 74 3a 0a 20 20 20  8") as hDst:.   
04e0: 20 20 20 20 20 66 6f 72 20 73 57 6f 72 64 2c 20       for sWord, 
04f0: 6e 56 61 6c 20 69 6e 20 73 6f 72 74 65 64 28 64  nVal in sorted(d
0500: 57 6f 72 64 2e 69 74 65 6d 73 28 29 2c 20 6b 65  Word.items(), ke
0510: 79 3d 6c 61 6d 62 64 61 20 78 3a 20 28 78 5b 31  y=lambda x: (x[1
0520: 5d 2c 20 78 5b 30 5d 29 2c 20 72 65 76 65 72 73  ], x[0]), revers
0530: 65 3d 54 72 75 65 29 3a 0a 20 20 20 20 20 20 20  e=True):.       
0540: 20 20 20 20 20 69 66 20 6e 6f 74 20 6f 44 69 63       if not oDic
0550: 74 2e 69 73 56 61 6c 69 64 28 73 57 6f 72 64 29  t.isValid(sWord)
0560: 3a 0a 20 20 20 20 20 20 20 20 20 20 20 20 20 20  :.              
0570: 20 20 68 44 73 74 2e 77 72 69 74 65 28 73 57 6f    hDst.write(sWo
0580: 72 64 20 2b 20 22 20 22 20 2b 20 73 74 72 28 6e  rd + " " + str(n
0590: 56 61 6c 29 20 2b 20 22 5c 6e 22 29 0a 0a 0a 64  Val) + "\n")...d
05a0: 65 66 20 72 65 61 64 53 74 61 74 46 69 6c 65 20  ef readStatFile 
05b0: 28 73 70 66 2c 20 64 53 74 61 74 29 3a 0a 20 20  (spf, dStat):.  
05c0: 20 20 70 72 69 6e 74 28 22 72 65 61 64 20 73 74    print("read st
05d0: 61 74 73 3a 20 22 20 2b 20 73 70 66 29 0a 20 20  ats: " + spf).  
05e0: 20 20 66 6f 72 20 73 4c 69 6e 65 20 69 6e 20 72    for sLine in r
05f0: 65 61 64 46 69 6c 65 28 73 70 66 29 3a 0a 20 20  eadFile(spf):.  
0600: 20 20 20 20 20 20 69 66 20 6e 6f 74 20 73 4c 69        if not sLi
0610: 6e 65 2e 73 74 61 72 74 73 77 69 74 68 28 22 23  ne.startswith("#
0620: 22 29 3a 0a 20 20 20 20 20 20 20 20 20 20 20 20  "):.            
0630: 73 57 6f 72 64 2c 20 73 43 6f 75 6e 74 20 3d 20  sWord, sCount = 
0640: 73 4c 69 6e 65 2e 73 70 6c 69 74 28 29 0a 20 20  sLine.split().  
0650: 20 20 20 20 20 20 20 20 20 20 64 53 74 61 74 5b            dStat[
0660: 73 57 6f 72 64 5d 20 3d 20 64 53 74 61 74 2e 67  sWord] = dStat.g
0670: 65 74 28 73 57 6f 72 64 2c 20 30 29 20 2b 20 69  et(sWord, 0) + i
0680: 6e 74 28 73 43 6f 75 6e 74 29 0a 20 20 20 20 72  nt(sCount).    r
0690: 65 74 75 72 6e 20 64 53 74 61 74 0a 0a 0a 64 65  eturn dStat...de
06a0: 66 20 72 65 61 64 53 74 61 74 46 69 6c 65 73 41  f readStatFilesA
06b0: 6e 64 43 72 65 61 74 65 4c 65 78 69 63 6f 6e 20  ndCreateLexicon 
06c0: 28 29 3a 0a 20 20 20 20 64 53 74 61 74 20 3d 20  ():.    dStat = 
06d0: 7b 7d 0a 20 20 20 20 72 65 61 64 53 74 61 74 46  {}.    readStatF
06e0: 69 6c 65 28 22 73 74 61 74 73 31 2e 74 78 74 22  ile("stats1.txt"
06f0: 2c 20 64 53 74 61 74 29 0a 20 20 20 20 72 65 61  , dStat).    rea
0700: 64 53 74 61 74 46 69 6c 65 28 22 73 74 61 74 73  dStatFile("stats
0710: 32 2e 74 78 74 22 2c 20 64 53 74 61 74 29 0a 20  2.txt", dStat). 
0720: 20 20 20 72 65 61 64 53 74 61 74 46 69 6c 65 28     readStatFile(
0730: 22 73 74 61 74 73 33 2e 74 78 74 22 2c 20 64 53  "stats3.txt", dS
0740: 74 61 74 29 0a 20 20 20 20 63 72 65 61 74 65 4c  tat).    createL
0750: 65 78 53 74 61 74 46 69 6c 65 28 22 70 72 6f 70  exStatFile("prop
0760: 6f 73 69 74 69 6f 6e 73 2e 74 78 74 22 2c 20 64  ositions.txt", d
0770: 53 74 61 74 29 0a 0a 23 20 2d 2d 2d 2d 2d 2d 2d  Stat)..# -------
0780: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0790: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
07a0: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
07b0: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
07c0: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
07d0: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 0a 0a 64 65 66  -----------..def
07e0: 20 69 73 4d 6f 72 65 54 68 61 6e 4f 6e 65 53 65   isMoreThanOneSe
07f0: 74 49 6e 4c 69 73 74 20 28 6c 53 65 74 29 3a 0a  tInList (lSet):.
0800: 20 20 20 20 61 46 69 72 73 74 20 3d 20 6c 53 65      aFirst = lSe
0810: 74 2e 70 6f 70 28 30 29 0a 20 20 20 20 66 6f 72  t.pop(0).    for
0820: 20 61 53 65 74 20 69 6e 20 6c 53 65 74 3a 0a 20   aSet in lSet:. 
0830: 20 20 20 20 20 20 20 69 66 20 61 53 65 74 20 21         if aSet !
0840: 3d 20 61 46 69 72 73 74 3a 0a 20 20 20 20 20 20  = aFirst:.      
0850: 20 20 20 20 20 20 72 65 74 75 72 6e 20 54 72 75        return Tru
0860: 65 0a 20 20 20 20 72 65 74 75 72 6e 20 46 61 6c  e.    return Fal
0870: 73 65 0a 0a 64 65 66 20 66 69 6c 74 65 72 4c 69  se..def filterLi
0880: 6e 65 73 57 69 74 68 57 6f 72 64 73 57 69 74 68  nesWithWordsWith
0890: 44 69 66 66 65 72 65 6e 74 53 74 65 6d 73 20 28  DifferentStems (
08a0: 73 70 66 29 3a 0a 20 20 20 20 77 69 74 68 20 6f  spf):.    with o
08b0: 70 65 6e 28 73 70 66 2b 22 2e 72 65 73 2e 74 78  pen(spf+".res.tx
08c0: 74 22 2c 20 22 77 22 2c 20 65 6e 63 6f 64 69 6e  t", "w", encodin
08d0: 67 3d 22 75 74 66 2d 38 22 29 20 61 73 20 68 44  g="utf-8") as hD
08e0: 73 74 3a 0a 20 20 20 20 20 20 20 20 66 6f 72 20  st:.        for 
08f0: 73 4c 69 6e 65 20 69 6e 20 72 65 61 64 46 69 6c  sLine in readFil
0900: 65 28 73 70 66 29 3a 0a 20 20 20 20 20 20 20 20  e(spf):.        
0910: 20 20 20 20 6c 53 74 65 6d 53 65 74 20 3d 20 5b      lStemSet = [
0920: 20 73 65 74 28 6f 44 69 63 74 2e 73 74 65 6d 28   set(oDict.stem(
0930: 73 57 6f 72 64 29 29 20 20 66 6f 72 20 73 57 6f  sWord))  for sWo
0940: 72 64 20 69 6e 20 73 4c 69 6e 65 2e 73 74 72 69  rd in sLine.stri
0950: 70 28 29 2e 73 70 6c 69 74 28 29 5d 0a 20 20 20  p().split()].   
0960: 20 20 20 20 20 20 20 20 20 69 66 20 69 73 4d 6f           if isMo
0970: 72 65 54 68 61 6e 4f 6e 65 53 65 74 49 6e 4c 69  reThanOneSetInLi
0980: 73 74 28 6c 53 74 65 6d 53 65 74 29 3a 0a 20 20  st(lStemSet):.  
0990: 20 20 20 20 20 20 20 20 20 20 20 20 20 20 68 44                hD
09a0: 73 74 2e 77 72 69 74 65 28 73 4c 69 6e 65 29 0a  st.write(sLine).
09b0: 0a 64 65 66 20 66 69 6c 74 65 72 48 6f 6d 6f 70  .def filterHomop
09c0: 68 6f 6e 69 63 57 6f 72 64 73 20 28 29 3a 0a 20  honicWords ():. 
09d0: 20 20 20 66 69 6c 74 65 72 4c 69 6e 65 73 57 69     filterLinesWi
09e0: 74 68 57 6f 72 64 73 57 69 74 68 44 69 66 66 65  thWordsWithDiffe
09f0: 72 65 6e 74 53 74 65 6d 73 28 22 68 6f 6d 6f 70  rentStems("homop
0a00: 68 6f 6e 65 73 2e 74 78 74 22 29 0a 0a 23 20 2d  hones.txt")..# -
0a10: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0a20: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0a30: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0a40: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0a50: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0a60: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0a70: 2d 0a 0a 69 66 20 5f 5f 6e 61 6d 65 5f 5f 20 3d  -..if __name__ =
0a80: 3d 20 27 5f 5f 6d 61 69 6e 5f 5f 27 20 3a 0a 20  = '__main__' :. 
0a90: 20 20 20 66 69 6c 74 65 72 48 6f 6d 6f 70 68 6f     filterHomopho
0aa0: 6e 69 63 57 6f 72 64 73 28 29                    nicWords()