ubac

search for weird ngrams
git clone https://a3nm.net/git/ubac/
Log | Files | Refs

2grams.py (1115B)


      1 #!/usr/bin/python3
      2 
      3 import sys
      4 from collections import defaultdict
      5 import unicodedata
      6 
      7 mlen = 5
      8 mgram = 10
      9 forbid = ". '-"
     10 vowels = "aeiouy"
     11 
     12 grams = defaultdict(lambda : defaultdict(list))
     13 
     14 for l in sys.stdin.readlines():
     15     w = l.strip()
     16     has_vowel = False
     17     for c in w:
     18         for c2 in unicodedata.normalize('NFD', c)[:1]:
     19             if c2 in vowels:
     20                 has_vowel = True
     21                 break
     22         if has_vowel:
     23             break
     24     if not has_vowel:
     25         continue
     26     w2 = "^" + w + "$"
     27     for i in range(len(w)-1):
     28         gram = w[i:i+2]
     29         ok = True
     30         for c in forbid:
     31             if c in gram:
     32                 ok = False
     33                 break
     34         if not ok:
     35             continue
     36         ctxgram = w2[i:i+4]
     37         d = grams[gram]
     38         d[ctxgram].append(w)
     39 
     40 tum = sorted(grams.items(), key=(lambda x : len(x[1].keys())))
     41 for (digram, v) in tum:
     42     if len(v.keys()) <= mgram:
     43         for k in v.keys():
     44             print ("%s => %s (%s)" % (digram, k, (', '.join(sorted(v[k])[:mlen])) +
     45             (", ..." if len(v[k]) >= mlen else "")))
     46         print("")
     47