import ubac, start 2grams - ubac - search for weird ngrams

commit 8ccfb8c5d5843b953c72dfd88add070509dece0c
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Thu,  5 Nov 2015 01:01:07 +0100

import ubac, start 2grams

Diffstat:
2grams.py  | 47 +++++++++++++++++++++++++++++++++++++++++++++++
2grams.sh  | 6 ++++++
ubac.py  | 44 ++++++++++++++++++++++++++++++++++++++++++++

3 files changed, 97 insertions(+), 0 deletions(-)
diff --git a/2grams.py b/2grams.py
@@ -0,0 +1,47 @@
+#!/usr/bin/python3
+
+import sys
+from collections import defaultdict
+import unicodedata
+
+mlen = 5
+mgram = 10
+forbid = ". '-"
+vowels = "aeiouy"
+
+grams = defaultdict(lambda : defaultdict(list))
+
+for l in sys.stdin.readlines():
+    w = l.strip()
+    has_vowel = False
+    for c in w:
+        for c2 in unicodedata.normalize('NFD', c)[:1]:
+            if c2 in vowels:
+                has_vowel = True
+                break
+        if has_vowel:
+            break
+    if not has_vowel:
+        continue
+    w2 = "^" + w + "$"
+    for i in range(len(w)-1):
+        gram = w[i:i+2]
+        ok = True
+        for c in forbid:
+            if c in gram:
+                ok = False
+                break
+        if not ok:
+            continue
+        ctxgram = w2[i:i+4]
+        d = grams[gram]
+        d[ctxgram].append(w)
+
+tum = sorted(grams.items(), key=(lambda x : len(x[1].keys())))
+for (digram, v) in tum:
+    if len(v.keys()) <= mgram:
+        for k in v.keys():
+            print ("%s => %s (%s)" % (digram, k, (', '.join(sorted(v[k])[:mlen])) +
+            (", ..." if len(v[k]) >= mlen else "")))
+        print("")
+
diff --git a/2grams.sh b/2grams.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+# run on lexique
+
+cut -f1 "$1" | sed 1d | grep -v " " | sort | uniq | ./digraphs.py
+
diff --git a/ubac.py b/ubac.py
@@ -0,0 +1,44 @@
+#! /usr/bin/python3 -tt
+# by Virgile Andreani and Antoine Amarilli
+
+import sys
+
+if len(sys.argv) not in [2, 3]:
+    print("usage: ./ubac.py n [beginend] < lexique")
+    print("    to look for n-grams (with or without begin-end markers)")
+    print("    where lexique is a utf8 word list")
+
+n = int(sys.argv[1])
+words = []
+cngrams = {}
+    
+def ngrams(word):
+    return zip(*(word[i:] for i in range(n)))
+
+def score(word):
+    score = sum(cngrams[ngram] for ngram in ngrams(word))
+    return float(score) / float(len(list(ngrams(word))))
+
+beginend = len(sys.argv) == 3
+
+for word in sys.stdin.readlines():
+    if beginend:
+        words.append("^" + word.strip() + "$")
+    else:
+        words.append(word.strip())
+    for ngram in ngrams(words[-1]):
+        if ngram in cngrams.keys():
+            cngrams[ngram] += 1
+        else:
+            cngrams[ngram] = 1
+
+scores = [(word, score(word)) for word in words
+            if len(word) >= n + (1 if beginend else 0)]
+scores.sort(key=lambda t: t[1])
+
+for (w, s) in scores:
+    if beginend:
+        print("%8.2f\t%s" % (s, w[1:-1]))
+    else:
+        print("%8.2f\t%s" % (s, w))
+

	ubac search for weird ngrams
	git clone https://a3nm.net/git/ubac/
	Log \| Files \| Refs

2grams.py	\|	47	+++++++++++++++++++++++++++++++++++++++++++++++
2grams.sh	\|	6	++++++
ubac.py	\|	44	++++++++++++++++++++++++++++++++++++++++++++