commit df5a7a4c409f407c64c9aeecd06f0e055989804b
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Sun, 6 May 2018 22:54:19 +0200
start importing files
Diffstat:
3 files changed, 47 insertions(+), 0 deletions(-)
diff --git a/fix_elision_dicollecte_1.sh b/fix_elision_dicollecte_1.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+# Filter stdin: in the second space-separated field of each line, delete one "e" sitting between two consonants of the set below (the last such "e", as the match is greedy).
+sed 's/^\([^ ]* [^ ]*[bcćçdfghjklmnpqrsSśštvwxz]\)e\([bcćçdfghjklmnpqrsSśštvwxz].*\)/\1\2/g'
diff --git a/homophones.py b/homophones.py
@@ -0,0 +1,43 @@
+#!/usr/bin/python3
+"""List homophone groups from a tab-separated lexicon read on stdin.
+
+Rows are kept when ortho equals lemme and ortho is not in tlf_not_exist,
+then grouped by (phon, cgam). A group is printed when its members have at
+least two distinct tlf_defs values, in ascending order of its highest freq.
+"""
+
+from collections import defaultdict
+import sys
+
+phons = defaultdict(list)
+defs = defaultdict(str)
+
+# Spellings to ignore, one per line (compared lower-cased and stripped).
+with open("tlf_not_exist", 'r') as f:
+    blacklist = {line.lower().strip() for line in f}
+
+# Each line is "<h>\t<word>"; words missing from the file map to "".
+with open("tlf_defs", 'r') as f:
+    for line in f:
+        h, word = line.lower().strip().split("\t")
+        defs[word] = h
+
+for line in sys.stdin:
+    ortho, phon, lemme, cgam, genre, nombre, freq = line.lower().strip().split("\t")
+    if ortho in blacklist or ortho != lemme:
+        continue
+    # Parse freq so that groups are ordered numerically rather than as text.
+    phons[(phon, cgam)].append((ortho, defs[ortho], cgam, genre, nombre, float(freq)))
+
+total_words = 0
+total_groups = 0
+
+for k in sorted(phons, key=lambda x: max(y[5] for y in phons[x])):
+    v = phons[k]
+    if len({x[1] for x in v}) < 2:
+        continue
+    print(" + ".join("%s (%s %s %s)" % (w[0], w[2], w[3], w[4]) for w in v))
+    total_words += len(v)
+    total_groups += 1
+
+print("Total: %d words in %d groups" % (total_words, total_groups))
diff --git a/homophones_lexique.sh b/homophones_lexique.sh
@@ -0,0 +1 @@
+ cut -f1-4,7 lexique | tr ' ' '_' | awk '$1 == $3' | sort -u | awk '{k = $2 $4} seen[k]++ {if (seen[k] == 2) print last[k]; print} {last[k] = $0}' | sort -k2,2 | cut -f1 | sort -u | tr '_' ' ' | wc -l > ~/git/homophones_lexique