homophones

find homophones in French
git clone https://a3nm.net/git/homophones/
Log | Files | Refs

homophones.py (1210B)


      1 #!/usr/bin/python3
      2 
      3 from collections import defaultdict
      4 import sys
      5 
      6 phons = defaultdict(lambda: [])
      7 
      8 blacklist = set()
      9 
     10 defs = defaultdict(lambda: "")
     11 
     12 f = open("tlf_not_exist", 'r')
     13 for l in f.readlines():
     14     blacklist.add(l.lower().strip())
     15 f.close()
     16 
     17 f = open("tlf_defs", 'r')
     18 for l in f.readlines():
     19     h, word = l.lower().strip().split("\t")
     20     defs[word] = h
     21 
     22 for l in sys.stdin.readlines():
     23     ortho, phon, lemme, cgam, genre, nombre, freq = l.strip().split("\t")
     24     ortho = ortho.lower()
     25     lemme = lemme.lower()
     26     if ortho in blacklist:
     27         continue
     28     #if ortho != lemme:
     29         #continue
     30     phons[(phon, cgam)].append((ortho, defs[ortho], cgam, genre, nombre, freq, lemme))
     31 
     32 total_words = 0
     33 total_groups = 0
     34 
     35 for k in sorted(phons, key=(lambda x: min(y[5] for y in phons[x]))):
     36     v = phons[k]
     37     #if len(set([x[6] for x in v])) < 2:
     38     if len(set([x[1] for x in v])) < 2:
     39         continue
     40     if len(set([x[6] for x in v])) < 2:
     41         continue
     42     print(" + ".join(("%s (%s %s %s)" % (w[0], w[2], w[3], w[4]) for w in v)))
     43     #for w in v:
     44         #print(w[0])
     45     total_words += len(v)
     46     total_groups += 1
     47 
     48 print("Total: %d words in %d groups" % (total_words, total_groups))