Grammalecte  Hex Artifact Content

Artifact 66f5eb17ae6acef600fe00f3e36bd52afd722d2e9dffdd0f0b75d1a9b3af862c:


0000: 23 21 70 79 74 68 6f 6e 33 0a 23 20 4a 75 73 74  #!python3.# Just
0010: 20 61 20 66 69 6c 65 20 66 6f 72 20 6f 6e 65 2d   a file for one-
0020: 73 68 6f 74 20 73 63 72 69 70 74 73 0a 0a 69 6d  shot scripts..im
0030: 70 6f 72 74 20 6f 73 0a 69 6d 70 6f 72 74 20 73  port os.import s
0040: 79 73 0a 69 6d 70 6f 72 74 20 72 65 0a 0a 69 6d  ys.import re..im
0050: 70 6f 72 74 20 67 72 61 70 68 73 70 65 6c 6c 2e  port graphspell.
0060: 69 62 64 61 77 67 20 61 73 20 69 62 64 61 77 67  ibdawg as ibdawg
0070: 0a 0a 6f 44 69 63 74 20 3d 20 69 62 64 61 77 67  ..oDict = ibdawg
0080: 2e 49 42 44 41 57 47 28 22 46 72 65 6e 63 68 2e  .IBDAWG("French.
0090: 62 64 69 63 22 29 0a 0a 0a 64 65 66 20 72 65 61  bdic")...def rea
00a0: 64 46 69 6c 65 20 28 73 70 66 29 3a 0a 20 20 20  dFile (spf):.   
00b0: 20 69 66 20 6f 73 2e 70 61 74 68 2e 69 73 66 69   if os.path.isfi
00c0: 6c 65 28 73 70 66 29 3a 0a 20 20 20 20 20 20 20  le(spf):.       
00d0: 20 77 69 74 68 20 6f 70 65 6e 28 73 70 66 2c 20   with open(spf, 
00e0: 22 72 22 2c 20 65 6e 63 6f 64 69 6e 67 3d 22 75  "r", encoding="u
00f0: 74 66 2d 38 22 29 20 61 73 20 68 53 72 63 3a 0a  tf-8") as hSrc:.
0100: 20 20 20 20 20 20 20 20 20 20 20 20 66 6f 72 20              for 
0110: 73 4c 69 6e 65 20 69 6e 20 68 53 72 63 3a 0a 20  sLine in hSrc:. 
0120: 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 79                 y
0130: 69 65 6c 64 20 73 4c 69 6e 65 0a 20 20 20 20 65  ield sLine.    e
0140: 6c 73 65 3a 0a 20 20 20 20 20 20 20 20 70 72 69  lse:.        pri
0150: 6e 74 28 22 23 20 45 72 72 6f 72 3a 20 66 69 6c  nt("# Error: fil
0160: 65 20 6e 6f 74 20 66 6f 75 6e 64 2e 22 29 0a 0a  e not found.")..
0170: 23 20 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  # --------------
0180: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0190: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
01a0: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
01b0: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
01c0: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
01d0: 2d 2d 2d 2d 0a 0a 64 65 66 20 6c 69 73 74 55 6e  ----..def listUn
01e0: 6b 6e 6f 77 6e 57 6f 72 64 73 20 28 73 70 66 29  knownWords (spf)
01f0: 3a 0a 20 20 20 20 77 69 74 68 20 6f 70 65 6e 28  :.    with open(
0200: 73 70 66 2b 22 2e 72 65 73 2e 74 78 74 22 2c 20  spf+".res.txt", 
0210: 22 77 22 2c 20 65 6e 63 6f 64 69 6e 67 3d 22 75  "w", encoding="u
0220: 74 66 2d 38 22 29 20 61 73 20 68 44 73 74 3a 0a  tf-8") as hDst:.
0230: 20 20 20 20 20 20 20 20 66 6f 72 20 73 4c 69 6e          for sLin
0240: 65 20 69 6e 20 72 65 61 64 46 69 6c 65 28 73 70  e in readFile(sp
0250: 66 53 72 63 29 3a 0a 20 20 20 20 20 20 20 20 20  fSrc):.         
0260: 20 20 20 73 4c 69 6e 65 20 3d 20 73 4c 69 6e 65     sLine = sLine
0270: 2e 73 74 72 69 70 28 29 0a 20 20 20 20 20 20 20  .strip().       
0280: 20 20 20 20 20 69 66 20 73 4c 69 6e 65 3a 0a 20       if sLine:. 
0290: 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 66                 f
02a0: 6f 72 20 73 57 6f 72 64 20 69 6e 20 73 4c 69 6e  or sWord in sLin
02b0: 65 2e 73 70 6c 69 74 28 29 3a 0a 20 20 20 20 20  e.split():.     
02c0: 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 69                 i
02d0: 66 20 6e 6f 74 20 6f 44 69 63 74 2e 69 73 56 61  f not oDict.isVa
02e0: 6c 69 64 28 73 57 6f 72 64 29 3a 20 0a 20 20 20  lid(sWord): .   
02f0: 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20                  
0300: 20 20 20 20 20 68 44 73 74 2e 77 72 69 74 65 28       hDst.write(
0310: 73 57 6f 72 64 2b 22 5c 6e 22 29 0a 0a 23 20 2d  sWord+"\n")..# -
0320: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0330: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0340: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0350: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0360: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0370: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0380: 2d 0a 0a 64 65 66 20 63 72 65 61 74 65 4c 65 78  -..def createLex
0390: 53 74 61 74 46 69 6c 65 20 28 73 70 66 2c 20 64  StatFile (spf, d
03a0: 53 74 61 74 29 3a 0a 20 20 20 20 64 57 6f 72 64  Stat):.    dWord
03b0: 20 3d 20 7b 7d 0a 20 20 20 20 66 6f 72 20 69 2c   = {}.    for i,
03c0: 20 73 4c 69 6e 65 20 69 6e 20 65 6e 75 6d 65 72   sLine in enumer
03d0: 61 74 65 28 72 65 61 64 46 69 6c 65 28 73 70 66  ate(readFile(spf
03e0: 29 29 3a 0a 20 20 20 20 20 20 20 20 69 66 20 6e  )):.        if n
03f0: 6f 74 20 73 4c 69 6e 65 2e 73 74 61 72 74 73 77  ot sLine.startsw
0400: 69 74 68 28 22 23 22 29 3a 0a 20 20 20 20 20 20  ith("#"):.      
0410: 20 20 20 20 20 20 73 57 6f 72 64 20 3d 20 73 4c        sWord = sL
0420: 69 6e 65 2e 73 74 72 69 70 28 29 0a 20 20 20 20  ine.strip().    
0430: 20 20 20 20 20 20 20 20 69 66 20 73 57 6f 72 64          if sWord
0440: 20 6e 6f 74 20 69 6e 20 64 57 6f 72 64 3a 0a 20   not in dWord:. 
0450: 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 64                 d
0460: 57 6f 72 64 5b 73 57 6f 72 64 5d 20 3d 20 64 53  Word[sWord] = dS
0470: 74 61 74 2e 67 65 74 28 73 57 6f 72 64 2c 20 30  tat.get(sWord, 0
0480: 29 0a 20 20 20 20 20 20 20 20 70 72 69 6e 74 28  ).        print(
0490: 69 2c 20 65 6e 64 3d 22 5c 72 22 29 0a 0a 20 20  i, end="\r")..  
04a0: 20 20 77 69 74 68 20 6f 70 65 6e 28 73 70 66 2b    with open(spf+
04b0: 22 2e 72 65 73 2e 74 78 74 22 2c 20 22 77 22 2c  ".res.txt", "w",
04c0: 20 65 6e 63 6f 64 69 6e 67 3d 22 75 74 66 2d 38   encoding="utf-8
04d0: 22 29 20 61 73 20 68 44 73 74 3a 0a 20 20 20 20  ") as hDst:.    
04e0: 20 20 20 20 66 6f 72 20 73 57 6f 72 64 2c 20 6e      for sWord, n
04f0: 56 61 6c 20 69 6e 20 73 6f 72 74 65 64 28 64 57  Val in sorted(dW
0500: 6f 72 64 2e 69 74 65 6d 73 28 29 2c 20 6b 65 79  ord.items(), key
0510: 3d 6c 61 6d 62 64 61 20 78 3a 20 28 78 5b 31 5d  =lambda x: (x[1]
0520: 2c 20 78 5b 30 5d 29 2c 20 72 65 76 65 72 73 65  , x[0]), reverse
0530: 3d 54 72 75 65 29 3a 0a 20 20 20 20 20 20 20 20  =True):.        
0540: 20 20 20 20 69 66 20 6e 6f 74 20 6f 44 69 63 74      if not oDict
0550: 2e 69 73 56 61 6c 69 64 28 73 57 6f 72 64 29 3a  .isValid(sWord):
0560: 0a 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20  .               
0570: 20 68 44 73 74 2e 77 72 69 74 65 28 73 57 6f 72   hDst.write(sWor
0580: 64 20 2b 20 22 20 22 20 2b 20 73 74 72 28 6e 56  d + " " + str(nV
0590: 61 6c 29 20 2b 20 22 5c 6e 22 29 0a 0a 0a 64 65  al) + "\n")...de
05a0: 66 20 72 65 61 64 53 74 61 74 46 69 6c 65 20 28  f readStatFile (
05b0: 73 70 66 2c 20 64 53 74 61 74 29 3a 0a 20 20 20  spf, dStat):.   
05c0: 20 70 72 69 6e 74 28 22 72 65 61 64 20 73 74 61   print("read sta
05d0: 74 73 3a 20 22 20 2b 20 73 70 66 29 0a 20 20 20  ts: " + spf).   
05e0: 20 66 6f 72 20 73 4c 69 6e 65 20 69 6e 20 72 65   for sLine in re
05f0: 61 64 46 69 6c 65 28 73 70 66 29 3a 0a 20 20 20  adFile(spf):.   
0600: 20 20 20 20 20 69 66 20 6e 6f 74 20 73 4c 69 6e       if not sLin
0610: 65 2e 73 74 61 72 74 73 77 69 74 68 28 22 23 22  e.startswith("#"
0620: 29 3a 0a 20 20 20 20 20 20 20 20 20 20 20 20 73  ):.            s
0630: 57 6f 72 64 2c 20 73 43 6f 75 6e 74 20 3d 20 73  Word, sCount = s
0640: 4c 69 6e 65 2e 73 70 6c 69 74 28 29 0a 20 20 20  Line.split().   
0650: 20 20 20 20 20 20 20 20 20 64 53 74 61 74 5b 73           dStat[s
0660: 57 6f 72 64 5d 20 3d 20 64 53 74 61 74 2e 67 65  Word] = dStat.ge
0670: 74 28 73 57 6f 72 64 2c 20 30 29 20 2b 20 69 6e  t(sWord, 0) + in
0680: 74 28 73 43 6f 75 6e 74 29 0a 20 20 20 20 72 65  t(sCount).    re
0690: 74 75 72 6e 20 64 53 74 61 74 0a 0a 0a 64 65 66  turn dStat...def
06a0: 20 72 65 61 64 53 74 61 74 46 69 6c 65 73 41 6e   readStatFilesAn
06b0: 64 43 72 65 61 74 65 4c 65 78 69 63 6f 6e 20 28  dCreateLexicon (
06c0: 29 3a 0a 20 20 20 20 64 53 74 61 74 20 3d 20 7b  ):.    dStat = {
06d0: 7d 0a 20 20 20 20 72 65 61 64 53 74 61 74 46 69  }.    readStatFi
06e0: 6c 65 28 22 73 74 61 74 73 31 2e 74 78 74 22 2c  le("stats1.txt",
06f0: 20 64 53 74 61 74 29 0a 20 20 20 20 72 65 61 64   dStat).    read
0700: 53 74 61 74 46 69 6c 65 28 22 73 74 61 74 73 32  StatFile("stats2
0710: 2e 74 78 74 22 2c 20 64 53 74 61 74 29 0a 20 20  .txt", dStat).  
0720: 20 20 72 65 61 64 53 74 61 74 46 69 6c 65 28 22    readStatFile("
0730: 73 74 61 74 73 33 2e 74 78 74 22 2c 20 64 53 74  stats3.txt", dSt
0740: 61 74 29 0a 20 20 20 20 63 72 65 61 74 65 4c 65  at).    createLe
0750: 78 53 74 61 74 46 69 6c 65 28 22 70 72 6f 70 6f  xStatFile("propo
0760: 73 69 74 69 6f 6e 73 2e 74 78 74 22 2c 20 64 53  sitions.txt", dS
0770: 74 61 74 29 0a 0a 23 20 2d 2d 2d 2d 2d 2d 2d 2d  tat)..# --------
0780: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0790: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
07a0: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
07b0: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
07c0: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
07d0: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 0a 0a 64 65 66 20  ----------..def 
07e0: 69 73 4d 6f 72 65 54 68 61 6e 4f 6e 65 53 65 74  isMoreThanOneSet
07f0: 49 6e 4c 69 73 74 20 28 6c 53 65 74 29 3a 0a 20  InList (lSet):. 
0800: 20 20 20 61 46 69 72 73 74 20 3d 20 6c 53 65 74     aFirst = lSet
0810: 2e 70 6f 70 28 30 29 0a 20 20 20 20 66 6f 72 20  .pop(0).    for 
0820: 61 53 65 74 20 69 6e 20 6c 53 65 74 3a 0a 20 20  aSet in lSet:.  
0830: 20 20 20 20 20 20 69 66 20 61 53 65 74 20 21 3d        if aSet !=
0840: 20 61 46 69 72 73 74 3a 0a 20 20 20 20 20 20 20   aFirst:.       
0850: 20 20 20 20 20 72 65 74 75 72 6e 20 54 72 75 65       return True
0860: 0a 20 20 20 20 72 65 74 75 72 6e 20 46 61 6c 73  .    return Fals
0870: 65 0a 0a 64 65 66 20 66 69 6c 74 65 72 4c 69 6e  e..def filterLin
0880: 65 73 57 69 74 68 57 6f 72 64 73 57 69 74 68 44  esWithWordsWithD
0890: 69 66 66 65 72 65 6e 74 53 74 65 6d 73 20 28 73  ifferentStems (s
08a0: 70 66 29 3a 0a 20 20 20 20 77 69 74 68 20 6f 70  pf):.    with op
08b0: 65 6e 28 73 70 66 2b 22 2e 72 65 73 2e 74 78 74  en(spf+".res.txt
08c0: 22 2c 20 22 77 22 2c 20 65 6e 63 6f 64 69 6e 67  ", "w", encoding
08d0: 3d 22 75 74 66 2d 38 22 29 20 61 73 20 68 44 73  ="utf-8") as hDs
08e0: 74 3a 0a 20 20 20 20 20 20 20 20 66 6f 72 20 73  t:.        for s
08f0: 4c 69 6e 65 20 69 6e 20 72 65 61 64 46 69 6c 65  Line in readFile
0900: 28 73 70 66 29 3a 0a 20 20 20 20 20 20 20 20 20  (spf):.         
0910: 20 20 20 6c 53 74 65 6d 53 65 74 20 3d 20 5b 20     lStemSet = [ 
0920: 73 65 74 28 6f 44 69 63 74 2e 73 74 65 6d 28 73  set(oDict.stem(s
0930: 57 6f 72 64 29 29 20 20 66 6f 72 20 73 57 6f 72  Word))  for sWor
0940: 64 20 69 6e 20 73 4c 69 6e 65 2e 73 74 72 69 70  d in sLine.strip
0950: 28 29 2e 73 70 6c 69 74 28 29 5d 0a 20 20 20 20  ().split()].    
0960: 20 20 20 20 20 20 20 20 69 66 20 69 73 4d 6f 72          if isMor
0970: 65 54 68 61 6e 4f 6e 65 53 65 74 49 6e 4c 69 73  eThanOneSetInLis
0980: 74 28 6c 53 74 65 6d 53 65 74 29 3a 0a 20 20 20  t(lStemSet):.   
0990: 20 20 20 20 20 20 20 20 20 20 20 20 20 68 44 73               hDs
09a0: 74 2e 77 72 69 74 65 28 73 4c 69 6e 65 29 0a 0a  t.write(sLine)..
09b0: 64 65 66 20 66 69 6c 74 65 72 48 6f 6d 6f 70 68  def filterHomoph
09c0: 6f 6e 69 63 57 6f 72 64 73 20 28 29 3a 0a 20 20  onicWords ():.  
09d0: 20 20 66 69 6c 74 65 72 4c 69 6e 65 73 57 69 74    filterLinesWit
09e0: 68 57 6f 72 64 73 57 69 74 68 44 69 66 66 65 72  hWordsWithDiffer
09f0: 65 6e 74 53 74 65 6d 73 28 22 68 6f 6d 6f 70 68  entStems("homoph
0a00: 6f 6e 65 73 2e 74 78 74 22 29 0a 0a 23 20 2d 2d  ones.txt")..# --
0a10: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0a20: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0a30: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0a40: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0a50: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0a60: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0a70: 0a 0a 69 66 20 5f 5f 6e 61 6d 65 5f 5f 20 3d 3d  ..if __name__ ==
0a80: 20 27 5f 5f 6d 61 69 6e 5f 5f 27 20 3a 0a 20 20   '__main__' :.  
0a90: 20 20 66 69 6c 74 65 72 48 6f 6d 6f 70 68 6f 6e    filterHomophon
0aa0: 69 63 57 6f 72 64 73 28 29                       icWords()