homophones

find homophones in French
git clone https://a3nm.net/git/homophones/
Log | Files | Refs

commit df5a7a4c409f407c64c9aeecd06f0e055989804b
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Sun,  6 May 2018 22:54:19 +0200

start importing files

Diffstat:
fix_elision_dicollecte_1.sh | 3+++
homophones.py | 43+++++++++++++++++++++++++++++++++++++++++++
homophones_lexique.sh | 1+
3 files changed, 47 insertions(+), 0 deletions(-)

diff --git a/fix_elision_dicollecte_1.sh b/fix_elision_dicollecte_1.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+sed 's/^\([^ ]* [^ ]*[bcćçdfghjklmnpqrsSśštvwxz]\)e\([bcćçdfghjklmnpqrsSśštvwxz].*\)/\1\2/g'
diff --git a/homophones.py b/homophones.py
@@ -0,0 +1,43 @@
+#!/usr/bin/python3
+
+from collections import defaultdict
+import sys
+
+phons = defaultdict(lambda: [])
+
+blacklist = set()
+
+defs = defaultdict(lambda: "")
+
+f = open("tlf_not_exist", 'r')
+for l in f.readlines():
+    blacklist.add(l.lower().strip())
+f.close()
+
+f = open("tlf_defs", 'r')
+for l in f.readlines():
+    h, word = l.lower().strip().split("\t")
+    defs[word] = h
+
+for l in sys.stdin.readlines():
+    ortho, phon, lemme, cgam, genre, nombre, freq = l.lower().strip().split("\t")
+    if ortho in blacklist:
+        continue
+    if ortho != lemme:
+        continue
+    phons[(phon, cgam)].append((ortho, defs[ortho], cgam, genre, nombre, freq))
+
+total_words = 0
+total_groups = 0
+
+for k in sorted(phons, key=(lambda x: max(y[5] for y in phons[x]))):
+    v = phons[k]
+    if len(set([x[1] for x in v])) < 2:
+        continue
+    print(" + ".join(("%s (%s %s %s)" % (w[0], w[2], w[3], w[4]) for w in v)))
+    #for w in v:
+        #print(w[0])
+    total_words += len(v)
+    total_groups += 1
+
+print("Total: %d words in %d groups" % (total_words, total_groups))
diff --git a/homophones_lexique.sh b/homophones_lexique.sh
@@ -0,0 +1 @@
+ cut -f 1-4,7 lexique | tr ' ' '_' | awk '$1 == $3' | sort | uniq | awk 'cnt[$2 $4]++{if (cnt[$2 $4]==2) print prev[$2 $4]; print} {prev[$2 $4]=$0}' | sort -k2,2 | cut -f1 | sort | uniq | tr '_' ' ' | wc -l > ~/git/homophones_lexique