todo - homophones - find homophones in French

commit 03e442aae8bead9ed958bf83dc9d5be2ce18f308
parent df5a7a4c409f407c64c9aeecd06f0e055989804b
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Mon,  7 May 2018 00:40:11 +0200

todo

Diffstat:
TODO  | 18 ++++++++++++++++++
homophones.py  | 12 +++++++-----

2 files changed, 25 insertions(+), 5 deletions(-)
diff --git a/TODO b/TODO
@@ -0,0 +1,18 @@
+- seulement les formes non fléchies
+  (eg, pas de "appert" vs "appaire", "paonne" vs "panne")
+=> en fait : il faudrait plutôt rejeter les paires où tous les membres ont la
+même forme de base
+
+- seulement au sein d'une même catégorie gram
+  (eg, pas de "althée" vs "haleter")
+
+
+related work:
+
+https://books.google.fr/books?id=-kdCohtEDrEC&printsec=frontcover&hl=fr&source=gbs_ge_summary_r&cad=0#v=onepage&q&f=false
+
+https://www.editions-larousse.fr/livre/dictionnaire-des-homonymes-9782035901101
+
+https://www.persee.fr/doc/psy_0003-5033_1999_num_99_4_28503
+
+
diff --git a/homophones.py b/homophones.py
@@ -20,19 +20,21 @@ for l in f.readlines():
     defs[word] = h
 
 for l in sys.stdin.readlines():
-    ortho, phon, lemme, cgam, genre, nombre, freq = l.lower().strip().split("\t")
+    ortho, phon, lemme, cgam, genre, nombre, freq = l.strip().split("\t")
+    ortho = ortho.lower()
+    lemme = lemme.lower()
     if ortho in blacklist:
         continue
-    if ortho != lemme:
-        continue
-    phons[(phon, cgam)].append((ortho, defs[ortho], cgam, genre, nombre, freq))
+    #if ortho != lemme:
+        #continue
+    phons[(phon, cgam)].append((ortho, defs[ortho], cgam, genre, nombre, freq, lemme))
 
 total_words = 0
 total_groups = 0
 
 for k in sorted(phons, key=(lambda x: max(y[5] for y in phons[x]))):
     v = phons[k]
-    if len(set([x[1] for x in v])) < 2:
+    if len(set([x[6] for x in v])) < 2:
         continue
     print(" + ".join(("%s (%s %s %s)" % (w[0], w[2], w[3], w[4]) for w in v)))
     #for w in v:

	homophones find homophones in French
	git clone https://a3nm.net/git/homophones/
	Log | Files | Refs