ubac

search for weird ngrams
git clone https://a3nm.net/git/ubac/
Log | Files | Refs

commit 8ccfb8c5d5843b953c72dfd88add070509dece0c
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Thu,  5 Nov 2015 01:01:07 +0100

import ubac, start 2grams

Diffstat:
2grams.py | 47 +++++++++++++++++++++++++++++++++++++++++++++++
2grams.sh |  6 ++++++
ubac.py   | 44 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 97 insertions(+), 0 deletions(-)

#!/usr/bin/python3
"""Search a word list for digrams that occur in few distinct contexts.

This is 2grams.py, re-flowed from the collapsed commit listing.

Input: one word per line on standard input.  Words in which no character
has a base letter (after NFD decomposition) in ``vowels`` are ignored.

For every remaining word, each pair of adjacent characters (digram) that
contains none of the characters in ``forbid`` is recorded together with
its context: the window of up to four characters made of the character
before the digram, the digram itself and the character after it, where
"^" and "$" stand for the beginning and the end of the word.

Output: digrams are listed by increasing number of distinct contexts.
For each digram with at most ``mgram`` contexts, one line per context is
printed as ``digram => context (examples)``, followed by an empty line.
At most ``mlen`` example words (in sorted order) are shown per context.

Companion wrapper added by the same commit (2grams.sh, "run on lexique"):

    #!/bin/bash
    cut -f1 "$1" | sed 1d | grep -v " " | sort | uniq | ./digraphs.py

NOTE(review): the wrapper pipes into ./digraphs.py, but the commit only
adds 2grams.py -- presumably the wrapper should call ./2grams.py; confirm.
"""

import sys
import unicodedata
from collections import defaultdict

mlen = 5            # maximum number of example words shown per context
mgram = 10          # only digrams with at most this many contexts are shown
forbid = ". '-"     # digrams containing any of these characters are skipped
vowels = "aeiouy"   # base letters that make a word eligible


def has_vowel(word):
    """Return True if some character of *word* has a base letter in vowels.

    Each character is NFD-decomposed and only its first code point is
    looked at, so accented vowels count as vowels.  The comparison is
    case-sensitive, exactly as ``vowels`` is written.
    """
    return any(unicodedata.normalize('NFD', c)[0] in vowels for c in word)


def collect(lines):
    """Index the words found in *lines* by digram, then by context.

    Returns a mapping ``digram -> context -> list of words``, with words
    appended in input order.
    """
    grams = defaultdict(lambda: defaultdict(list))
    for line in lines:
        word = line.strip()
        if not has_vowel(word):
            continue
        # Pad with boundary markers so that padded[i:i+4] is the digram
        # word[i:i+2] with one character of context on each side.
        padded = "^" + word + "$"
        for i in range(len(word) - 1):
            gram = word[i:i + 2]
            if any(c in gram for c in forbid):
                continue
            grams[gram][padded[i:i + 4]].append(word)
    return grams


def format_examples(words):
    """Return up to ``mlen`` sorted words, comma-separated.

    ", ..." is appended only when some words were actually left out.
    """
    shown = ', '.join(sorted(words)[:mlen])
    return shown + (", ..." if len(words) > mlen else "")


def report(grams):
    """Yield the output lines for the mapping built by collect()."""
    by_rarity = sorted(grams.items(), key=lambda item: len(item[1]))
    for digram, contexts in by_rarity:
        if len(contexts) > mgram:
            continue
        for context, words in contexts.items():
            yield "%s => %s (%s)" % (digram, context, format_examples(words))
        yield ""


def main():
    """Read the word list from standard input and print the report."""
    for line in report(collect(sys.stdin)):
        print(line)


if __name__ == "__main__":
    main()

# (The same commit also adds ubac.py; its listing follows.)
#! /usr/bin/python3 -tt
# by Virgile Andreani and Antoine Amarilli
"""Rank the words of a word list by how common their n-grams are.

Usage: ./ubac.py n [beginend] < lexique

Every input line is stripped and treated as one word.  When a second
argument is present (whatever its value), each word is wrapped in "^" and
"$" markers before its n-grams are extracted, so n-grams at the beginning
and end of a word are distinguished from the same letters elsewhere.

The score of a word is the mean, over its n-grams, of the number of times
each n-gram occurs in the whole input.  Words are printed in increasing
score order, one per line, as ``score<TAB>word``, so the words made of the
rarest n-grams come first.
"""

import sys
from collections import Counter


def usage():
    """Print the usage message on standard error."""
    print("usage: ./ubac.py n [beginend] < lexique", file=sys.stderr)
    print(" to look for n-grams (with or without begin-end markers)",
          file=sys.stderr)
    print(" where lexique is a utf8 word list", file=sys.stderr)


def ngrams(word, n):
    """Return the n-grams of *word*, in order, as a list of substrings.

    The list is empty when *word* has fewer than *n* characters.
    """
    return [word[i:i + n] for i in range(len(word) - n + 1)]


def count_ngrams(words, n):
    """Return a Counter of every n-gram occurrence over all *words*."""
    return Counter(gram for word in words for gram in ngrams(word, n))


def score(word, counts, n):
    """Return the mean count (taken from *counts*) of the n-grams of *word*.

    Raises ZeroDivisionError if *word* has no n-gram, i.e. is shorter
    than *n*; callers are expected to filter such words out first.
    """
    grams = ngrams(word, n)
    return sum(counts[gram] for gram in grams) / len(grams)


def rank(lines, n, beginend=False):
    """Score the words in *lines* and return (word, score) pairs.

    Pairs are sorted by increasing score; ties keep input order.  With
    *beginend* set, words are scored with "^"/"$" markers attached, but
    the markers are removed from the returned words.
    """
    if beginend:
        words = ["^" + line.strip() + "$" for line in lines]
    else:
        words = [line.strip() for line in lines]
    counts = count_ngrams(words, n)
    # Same length threshold as the original script: it is applied to the
    # marked-up word when markers are in use.
    min_len = n + (1 if beginend else 0)
    scored = [(w, score(w, counts, n)) for w in words if len(w) >= min_len]
    scored.sort(key=lambda pair: pair[1])
    if beginend:
        return [(w[1:-1], s) for (w, s) in scored]
    return scored


def main(argv=None):
    """Command-line entry point; returns the process exit status.

    Returns 1, after printing the usage message, when the argument count
    is wrong or n is not a positive integer.  The original script printed
    the usage but kept running, then crashed on the missing argument.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) not in (2, 3):
        usage()
        return 1
    try:
        n = int(argv[1])
    except ValueError:
        usage()
        return 1
    if n < 1:
        # n <= 0 yields no n-grams at all and would divide by zero.
        usage()
        return 1
    beginend = len(argv) == 3
    for (w, s) in rank(sys.stdin, n, beginend):
        print("%8.2f\t%s" % (s, w))
    return 0


if __name__ == "__main__":
    sys.exit(main())