commit 8ccfb8c5d5843b953c72dfd88add070509dece0c
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Thu, 5 Nov 2015 01:01:07 +0100
import ubac, start 2grams
Diffstat:
2grams.py | | | 47 | +++++++++++++++++++++++++++++++++++++++++++++++ |
2grams.sh | | | 6 | ++++++ |
ubac.py | | | 44 | ++++++++++++++++++++++++++++++++++++++++++++ |
3 files changed, 97 insertions(+), 0 deletions(-)
diff --git a/2grams.py b/2grams.py
@@ -0,0 +1,47 @@
+#!/usr/bin/python3
+
+import sys
+from collections import defaultdict
+import unicodedata
+
+# Read one word per line on stdin and, for every two-character sequence
+# (digram), group the words by the 4-character context around the digram.
+# Digrams that only ever appear in a few contexts are then printed.
+
+# Tunables: list at most mlen example words per context, and only report
+# digrams seen in at most mgram distinct contexts.
+mlen = 5
+mgram = 10
+forbid = ". '-"  # a digram containing any of these characters is ignored
+vowels = "aeiouy"
+
+# grams[digram][context] -> words in which the digram has that context
+grams = defaultdict(lambda: defaultdict(list))
+
+for l in sys.stdin:
+    w = l.strip()
+    # Skip words with no vowel. NFD decomposition puts the base letter first,
+    # so taking [:1] lets accented vowels match the plain ones in `vowels`.
+    if not any(unicodedata.normalize('NFD', c)[:1] in vowels for c in w):
+        continue
+    w2 = "^" + w + "$"
+    for i in range(len(w) - 1):
+        gram = w[i:i+2]
+        if any(c in gram for c in forbid):
+            continue
+        # w2 is w shifted right by one, so w2[i:i+4] is the digram together
+        # with the character (or ^/$ marker) on each side of it.
+        grams[gram][w2[i:i+4]].append(w)
+
+# Report digrams with the fewest distinct contexts first.
+tum = sorted(grams.items(), key=lambda x: len(x[1]))
+for (digram, v) in tum:
+    if len(v) > mgram:
+        continue
+    for k in v:
+        examples = sorted(v[k])
+        # Show the ellipsis only when examples were actually cut off.
+        more = ", ..." if len(examples) > mlen else ""
+        print("%s => %s (%s)" % (digram, k, ', '.join(examples[:mlen]) + more))
+    print("")
+
diff --git a/2grams.sh b/2grams.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+# Run on lexique: keep field 1 (tab-separated) of "$1", drop the first line
+# and entries containing a space, dedupe, and feed the words to 2grams.py.
+cut -f1 "$1" | sed 1d | grep -v " " | sort -u | ./2grams.py
+
diff --git a/ubac.py b/ubac.py
@@ -0,0 +1,44 @@
+#! /usr/bin/python3 -tt
+# by Virgile Andreani and Antoine Amarilli
+
+import sys
+
+if len(sys.argv) not in [2, 3]:
+    # Stop here: with no argument, int(sys.argv[1]) below raises IndexError.
+    sys.exit("usage: ./ubac.py n [beginend] < lexique\n"
+             " to look for n-grams (with or without begin-end markers)\n"
+             " where lexique is a utf8 word list")
+
+n = int(sys.argv[1])  # n-gram length
+beginend = len(sys.argv) == 3  # any second argument turns on ^...$ markers
+words = []  # stripped input words, wrapped in ^...$ when beginend is set
+cngrams = {}  # n-gram (tuple of characters) -> occurrences over all words
+
+
+def ngrams(word):
+    """Return an iterator over the n-grams of word, as tuples of n chars."""
+    return zip(*(word[i:] for i in range(n)))
+
+
+def score(word):
+    """Return the average count of word's n-grams (needs at least one)."""
+    counts = [cngrams[ngram] for ngram in ngrams(word)]
+    return float(sum(counts)) / float(len(counts))
+
+
+for line in sys.stdin:
+    word = line.strip()
+    words.append(("^" + word + "$") if beginend else word)
+    for ngram in ngrams(words[-1]):
+        cngrams[ngram] = cngrams.get(ngram, 0) + 1
+
+# Only score words long enough to contain an n-gram (one extra character is
+# required when the markers are on), then list them by increasing score.
+scores = [(word, score(word)) for word in words
+          if len(word) >= n + (1 if beginend else 0)]
+scores.sort(key=lambda t: t[1])
+
+for (w, s) in scores:
+    # Strip the ^ and $ markers again before printing.
+    print("%8.2f\t%s" % (s, w[1:-1] if beginend else w))
+