Grammalecte  Hex Artifact Content

Artifact e2706fc6a240415ca7dabeab83c93a06d13a4cf3174662bc0cd04da31d98125f:


0000: 23 21 70 79 74 68 6f 6e 33 0a 23 20 4a 75 73 74  #!python3.# Just
0010: 20 61 20 66 69 6c 65 20 66 6f 72 20 6f 6e 65 2d   a file for one-
0020: 73 68 6f 74 20 73 63 72 69 70 74 73 0a 0a 69 6d  shot scripts..im
0030: 70 6f 72 74 20 6f 73 0a 69 6d 70 6f 72 74 20 73  port os.import s
0040: 79 73 0a 69 6d 70 6f 72 74 20 72 65 0a 0a 69 6d  ys.import re..im
0050: 70 6f 72 74 20 67 72 61 70 68 73 70 65 6c 6c 2e  port graphspell.
0060: 69 62 64 61 77 67 20 61 73 20 69 62 64 61 77 67  ibdawg as ibdawg
0070: 0a 0a 6f 44 69 63 74 20 3d 20 69 62 64 61 77 67  ..oDict = ibdawg
0080: 2e 49 42 44 41 57 47 28 22 66 72 2d 61 6c 6c 76  .IBDAWG("fr-allv
0090: 61 72 73 2e 6a 73 6f 6e 22 29 0a 0a 0a 64 65 66  ars.json")...def
00a0: 20 72 65 61 64 46 69 6c 65 20 28 73 70 66 29 3a   readFile (spf):
00b0: 0a 20 20 20 20 69 66 20 6f 73 2e 70 61 74 68 2e  .    if os.path.
00c0: 69 73 66 69 6c 65 28 73 70 66 29 3a 0a 20 20 20  isfile(spf):.   
00d0: 20 20 20 20 20 77 69 74 68 20 6f 70 65 6e 28 73       with open(s
00e0: 70 66 2c 20 22 72 22 2c 20 65 6e 63 6f 64 69 6e  pf, "r", encodin
00f0: 67 3d 22 75 74 66 2d 38 22 29 20 61 73 20 68 53  g="utf-8") as hS
0100: 72 63 3a 0a 20 20 20 20 20 20 20 20 20 20 20 20  rc:.            
0110: 66 6f 72 20 73 4c 69 6e 65 20 69 6e 20 68 53 72  for sLine in hSr
0120: 63 3a 0a 20 20 20 20 20 20 20 20 20 20 20 20 20  c:.             
0130: 20 20 20 79 69 65 6c 64 20 73 4c 69 6e 65 0a 20     yield sLine. 
0140: 20 20 20 65 6c 73 65 3a 0a 20 20 20 20 20 20 20     else:.       
0150: 20 70 72 69 6e 74 28 22 23 20 45 72 72 6f 72 3a   print("# Error:
0160: 20 66 69 6c 65 20 6e 6f 74 20 66 6f 75 6e 64 2e   file not found.
0170: 22 29 0a 0a 23 20 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ")..# ----------
0180: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0190: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
01a0: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
01b0: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
01c0: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
01d0: 2d 2d 2d 2d 2d 2d 2d 2d 0a 0a 64 65 66 20 6c 69  --------..def li
01e0: 73 74 55 6e 6b 6e 6f 77 6e 57 6f 72 64 73 20 28  stUnknownWords (
01f0: 73 70 66 29 3a 0a 20 20 20 20 77 69 74 68 20 6f  spf):.    with o
0200: 70 65 6e 28 73 70 66 2b 22 2e 72 65 73 2e 74 78  pen(spf+".res.tx
0210: 74 22 2c 20 22 77 22 2c 20 65 6e 63 6f 64 69 6e  t", "w", encodin
0220: 67 3d 22 75 74 66 2d 38 22 29 20 61 73 20 68 44  g="utf-8") as hD
0230: 73 74 3a 0a 20 20 20 20 20 20 20 20 66 6f 72 20  st:.        for 
0240: 73 4c 69 6e 65 20 69 6e 20 72 65 61 64 46 69 6c  sLine in readFil
0250: 65 28 73 70 66 53 72 63 29 3a 0a 20 20 20 20 20  e(spfSrc):.     
0260: 20 20 20 20 20 20 20 73 4c 69 6e 65 20 3d 20 73         sLine = s
0270: 4c 69 6e 65 2e 73 74 72 69 70 28 29 0a 20 20 20  Line.strip().   
0280: 20 20 20 20 20 20 20 20 20 69 66 20 73 4c 69 6e           if sLin
0290: 65 3a 0a 20 20 20 20 20 20 20 20 20 20 20 20 20  e:.             
02a0: 20 20 20 66 6f 72 20 73 57 6f 72 64 20 69 6e 20     for sWord in 
02b0: 73 4c 69 6e 65 2e 73 70 6c 69 74 28 29 3a 0a 20  sLine.split():. 
02c0: 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20                  
02d0: 20 20 20 69 66 20 6e 6f 74 20 6f 44 69 63 74 2e     if not oDict.
02e0: 69 73 56 61 6c 69 64 28 73 57 6f 72 64 29 3a 0a  isValid(sWord):.
02f0: 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20                  
0300: 20 20 20 20 20 20 20 20 68 44 73 74 2e 77 72 69          hDst.wri
0310: 74 65 28 73 57 6f 72 64 2b 22 5c 6e 22 29 0a 0a  te(sWord+"\n")..
0320: 23 20 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  # --------------
0330: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0340: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0350: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0360: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0370: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0380: 2d 2d 2d 2d 0a 0a 64 65 66 20 63 72 65 61 74 65  ----..def create
0390: 4c 65 78 53 74 61 74 46 69 6c 65 20 28 73 70 66  LexStatFile (spf
03a0: 2c 20 64 53 74 61 74 29 3a 0a 20 20 20 20 64 57  , dStat):.    dW
03b0: 6f 72 64 20 3d 20 7b 7d 0a 20 20 20 20 66 6f 72  ord = {}.    for
03c0: 20 69 2c 20 73 4c 69 6e 65 20 69 6e 20 65 6e 75   i, sLine in enu
03d0: 6d 65 72 61 74 65 28 72 65 61 64 46 69 6c 65 28  merate(readFile(
03e0: 73 70 66 29 29 3a 0a 20 20 20 20 20 20 20 20 69  spf)):.        i
03f0: 66 20 6e 6f 74 20 73 4c 69 6e 65 2e 73 74 61 72  f not sLine.star
0400: 74 73 77 69 74 68 28 22 23 22 29 3a 0a 20 20 20  tswith("#"):.   
0410: 20 20 20 20 20 20 20 20 20 73 57 6f 72 64 20 3d           sWord =
0420: 20 73 4c 69 6e 65 2e 73 74 72 69 70 28 29 0a 20   sLine.strip(). 
0430: 20 20 20 20 20 20 20 20 20 20 20 69 66 20 73 57             if sW
0440: 6f 72 64 20 6e 6f 74 20 69 6e 20 64 57 6f 72 64  ord not in dWord
0450: 3a 0a 20 20 20 20 20 20 20 20 20 20 20 20 20 20  :.              
0460: 20 20 64 57 6f 72 64 5b 73 57 6f 72 64 5d 20 3d    dWord[sWord] =
0470: 20 64 53 74 61 74 2e 67 65 74 28 73 57 6f 72 64   dStat.get(sWord
0480: 2c 20 30 29 0a 20 20 20 20 20 20 20 20 70 72 69  , 0).        pri
0490: 6e 74 28 69 2c 20 65 6e 64 3d 22 5c 72 22 29 0a  nt(i, end="\r").
04a0: 0a 20 20 20 20 77 69 74 68 20 6f 70 65 6e 28 73  .    with open(s
04b0: 70 66 2b 22 2e 72 65 73 2e 74 78 74 22 2c 20 22  pf+".res.txt", "
04c0: 77 22 2c 20 65 6e 63 6f 64 69 6e 67 3d 22 75 74  w", encoding="ut
04d0: 66 2d 38 22 29 20 61 73 20 68 44 73 74 3a 0a 20  f-8") as hDst:. 
04e0: 20 20 20 20 20 20 20 66 6f 72 20 73 57 6f 72 64         for sWord
04f0: 2c 20 6e 56 61 6c 20 69 6e 20 73 6f 72 74 65 64  , nVal in sorted
0500: 28 64 57 6f 72 64 2e 69 74 65 6d 73 28 29 2c 20  (dWord.items(), 
0510: 6b 65 79 3d 6c 61 6d 62 64 61 20 78 3a 20 28 78  key=lambda x: (x
0520: 5b 31 5d 2c 20 78 5b 30 5d 29 2c 20 72 65 76 65  [1], x[0]), reve
0530: 72 73 65 3d 54 72 75 65 29 3a 0a 20 20 20 20 20  rse=True):.     
0540: 20 20 20 20 20 20 20 69 66 20 6e 6f 74 20 6f 44         if not oD
0550: 69 63 74 2e 69 73 56 61 6c 69 64 28 73 57 6f 72  ict.isValid(sWor
0560: 64 29 3a 0a 20 20 20 20 20 20 20 20 20 20 20 20  d):.            
0570: 20 20 20 20 68 44 73 74 2e 77 72 69 74 65 28 73      hDst.write(s
0580: 57 6f 72 64 20 2b 20 22 20 22 20 2b 20 73 74 72  Word + " " + str
0590: 28 6e 56 61 6c 29 20 2b 20 22 5c 6e 22 29 0a 0a  (nVal) + "\n")..
05a0: 0a 64 65 66 20 72 65 61 64 53 74 61 74 46 69 6c  .def readStatFil
05b0: 65 20 28 73 70 66 2c 20 64 53 74 61 74 29 3a 0a  e (spf, dStat):.
05c0: 20 20 20 20 70 72 69 6e 74 28 22 72 65 61 64 20      print("read 
05d0: 73 74 61 74 73 3a 20 22 20 2b 20 73 70 66 29 0a  stats: " + spf).
05e0: 20 20 20 20 66 6f 72 20 73 4c 69 6e 65 20 69 6e      for sLine in
05f0: 20 72 65 61 64 46 69 6c 65 28 73 70 66 29 3a 0a   readFile(spf):.
0600: 20 20 20 20 20 20 20 20 69 66 20 6e 6f 74 20 73          if not s
0610: 4c 69 6e 65 2e 73 74 61 72 74 73 77 69 74 68 28  Line.startswith(
0620: 22 23 22 29 3a 0a 20 20 20 20 20 20 20 20 20 20  "#"):.          
0630: 20 20 73 57 6f 72 64 2c 20 73 43 6f 75 6e 74 20    sWord, sCount 
0640: 3d 20 73 4c 69 6e 65 2e 73 70 6c 69 74 28 29 0a  = sLine.split().
0650: 20 20 20 20 20 20 20 20 20 20 20 20 64 53 74 61              dSta
0660: 74 5b 73 57 6f 72 64 5d 20 3d 20 64 53 74 61 74  t[sWord] = dStat
0670: 2e 67 65 74 28 73 57 6f 72 64 2c 20 30 29 20 2b  .get(sWord, 0) +
0680: 20 69 6e 74 28 73 43 6f 75 6e 74 29 0a 20 20 20   int(sCount).   
0690: 20 72 65 74 75 72 6e 20 64 53 74 61 74 0a 0a 0a   return dStat...
06a0: 64 65 66 20 72 65 61 64 53 74 61 74 46 69 6c 65  def readStatFile
06b0: 73 41 6e 64 43 72 65 61 74 65 4c 65 78 69 63 6f  sAndCreateLexico
06c0: 6e 20 28 29 3a 0a 20 20 20 20 64 53 74 61 74 20  n ():.    dStat 
06d0: 3d 20 7b 7d 0a 20 20 20 20 72 65 61 64 53 74 61  = {}.    readSta
06e0: 74 46 69 6c 65 28 22 73 74 61 74 73 31 2e 74 78  tFile("stats1.tx
06f0: 74 22 2c 20 64 53 74 61 74 29 0a 20 20 20 20 72  t", dStat).    r
0700: 65 61 64 53 74 61 74 46 69 6c 65 28 22 73 74 61  eadStatFile("sta
0710: 74 73 32 2e 74 78 74 22 2c 20 64 53 74 61 74 29  ts2.txt", dStat)
0720: 0a 20 20 20 20 72 65 61 64 53 74 61 74 46 69 6c  .    readStatFil
0730: 65 28 22 73 74 61 74 73 33 2e 74 78 74 22 2c 20  e("stats3.txt", 
0740: 64 53 74 61 74 29 0a 20 20 20 20 63 72 65 61 74  dStat).    creat
0750: 65 4c 65 78 53 74 61 74 46 69 6c 65 28 22 70 72  eLexStatFile("pr
0760: 6f 70 6f 73 69 74 69 6f 6e 73 2e 74 78 74 22 2c  opositions.txt",
0770: 20 64 53 74 61 74 29 0a 0a 23 20 2d 2d 2d 2d 2d   dStat)..# -----
0780: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0790: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
07a0: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
07b0: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
07c0: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
07d0: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 0a 0a 64  -------------..d
07e0: 65 66 20 69 73 4d 6f 72 65 54 68 61 6e 4f 6e 65  ef isMoreThanOne
07f0: 53 65 74 49 6e 4c 69 73 74 20 28 6c 53 65 74 29  SetInList (lSet)
0800: 3a 0a 20 20 20 20 61 46 69 72 73 74 20 3d 20 6c  :.    aFirst = l
0810: 53 65 74 2e 70 6f 70 28 30 29 0a 20 20 20 20 66  Set.pop(0).    f
0820: 6f 72 20 61 53 65 74 20 69 6e 20 6c 53 65 74 3a  or aSet in lSet:
0830: 0a 20 20 20 20 20 20 20 20 69 66 20 61 53 65 74  .        if aSet
0840: 20 21 3d 20 61 46 69 72 73 74 3a 0a 20 20 20 20   != aFirst:.    
0850: 20 20 20 20 20 20 20 20 72 65 74 75 72 6e 20 54          return T
0860: 72 75 65 0a 20 20 20 20 72 65 74 75 72 6e 20 46  rue.    return F
0870: 61 6c 73 65 0a 0a 64 65 66 20 66 69 6c 74 65 72  alse..def filter
0880: 4c 69 6e 65 73 57 69 74 68 57 6f 72 64 73 57 69  LinesWithWordsWi
0890: 74 68 44 69 66 66 65 72 65 6e 74 53 74 65 6d 73  thDifferentStems
08a0: 20 28 73 70 66 29 3a 0a 20 20 20 20 77 69 74 68   (spf):.    with
08b0: 20 6f 70 65 6e 28 73 70 66 2b 22 2e 72 65 73 2e   open(spf+".res.
08c0: 74 78 74 22 2c 20 22 77 22 2c 20 65 6e 63 6f 64  txt", "w", encod
08d0: 69 6e 67 3d 22 75 74 66 2d 38 22 29 20 61 73 20  ing="utf-8") as 
08e0: 68 44 73 74 3a 0a 20 20 20 20 20 20 20 20 66 6f  hDst:.        fo
08f0: 72 20 73 4c 69 6e 65 20 69 6e 20 72 65 61 64 46  r sLine in readF
0900: 69 6c 65 28 73 70 66 29 3a 0a 20 20 20 20 20 20  ile(spf):.      
0910: 20 20 20 20 20 20 6c 53 74 65 6d 53 65 74 20 3d        lStemSet =
0920: 20 5b 20 73 65 74 28 6f 44 69 63 74 2e 73 74 65   [ set(oDict.ste
0930: 6d 28 73 57 6f 72 64 29 29 20 20 66 6f 72 20 73  m(sWord))  for s
0940: 57 6f 72 64 20 69 6e 20 73 4c 69 6e 65 2e 73 74  Word in sLine.st
0950: 72 69 70 28 29 2e 73 70 6c 69 74 28 29 5d 0a 20  rip().split()]. 
0960: 20 20 20 20 20 20 20 20 20 20 20 69 66 20 69 73             if is
0970: 4d 6f 72 65 54 68 61 6e 4f 6e 65 53 65 74 49 6e  MoreThanOneSetIn
0980: 4c 69 73 74 28 6c 53 74 65 6d 53 65 74 29 3a 0a  List(lStemSet):.
0990: 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20                  
09a0: 68 44 73 74 2e 77 72 69 74 65 28 73 4c 69 6e 65  hDst.write(sLine
09b0: 29 0a 0a 64 65 66 20 66 69 6c 74 65 72 48 6f 6d  )..def filterHom
09c0: 6f 70 68 6f 6e 69 63 57 6f 72 64 73 20 28 29 3a  ophonicWords ():
09d0: 0a 20 20 20 20 66 69 6c 74 65 72 4c 69 6e 65 73  .    filterLines
09e0: 57 69 74 68 57 6f 72 64 73 57 69 74 68 44 69 66  WithWordsWithDif
09f0: 66 65 72 65 6e 74 53 74 65 6d 73 28 22 68 6f 6d  ferentStems("hom
0a00: 6f 70 68 6f 6e 65 73 2e 74 78 74 22 29 0a 0a 23  ophones.txt")..#
0a10: 20 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d   ---------------
0a20: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0a30: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0a40: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0a50: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0a60: 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d 2d  ----------------
0a70: 2d 2d 2d 0a 0a 69 66 20 5f 5f 6e 61 6d 65 5f 5f  ---..if __name__
0a80: 20 3d 3d 20 27 5f 5f 6d 61 69 6e 5f 5f 27 20 3a   == '__main__' :
0a90: 0a 20 20 20 20 66 69 6c 74 65 72 48 6f 6d 6f 70  .    filterHomop
0aa0: 68 6f 6e 69 63 57 6f 72 64 73 28 29              honicWords()