2grams.py (1115B)
#!/usr/bin/python3
"""Report digrams that occur in only a few distinct contexts.

Reads one word per line from standard input.  For every pair of adjacent
characters (digram) in a word, the surrounding context is recorded: the
digram plus one character on each side, with "^" and "$" standing in for
the start and end of the word.  Digrams seen in at most ``mgram`` distinct
contexts are printed, fewest contexts first, with up to ``mlen`` example
words per context.
"""

import sys
from collections import defaultdict
import unicodedata

mlen = 5          # maximum number of example words printed per context
mgram = 10        # digrams with more distinct contexts than this are not reported
forbid = ". '-"   # a digram containing any of these characters is ignored
vowels = "aeiouy"  # lowercase only: the vowel test below is case-sensitive


def _has_vowel(word):
    """Return True if any character of *word* has a base letter in ``vowels``.

    Each character is NFD-decomposed and only its first code point is
    examined, so an accented letter such as "é" counts as "e".
    """
    for ch in word:
        decomposed = unicodedata.normalize('NFD', ch)
        if decomposed and decomposed[0] in vowels:
            return True
    return False


def collect_grams(lines):
    """Build the digram -> context -> words index from an iterable of lines.

    Each line is stripped of surrounding whitespace and treated as one word.
    Words without any vowel (see ``_has_vowel``) are skipped entirely.

    Returns a ``defaultdict`` mapping each digram to a ``defaultdict`` that
    maps each 4-character context to the list of words it was found in.
    """
    grams = defaultdict(lambda: defaultdict(list))
    for line in lines:
        word = line.strip()
        if not _has_vowel(word):
            continue
        # With the "^" prefix, padded[i:i+4] is word[i:i+2] plus one
        # neighbouring character (or boundary marker) on each side.
        padded = "^" + word + "$"
        for i in range(len(word) - 1):
            gram = word[i:i + 2]
            if any(c in gram for c in forbid):
                continue
            grams[gram][padded[i:i + 4]].append(word)
    return grams


def format_report(grams):
    """Return the report lines for *grams* (as built by ``collect_grams``).

    Digrams are ordered by their number of distinct contexts, ascending, and
    those with more than ``mgram`` contexts are omitted.  Each context yields
    one line listing at most ``mlen`` example words in sorted order; ", ..."
    is appended only when some words were actually left out.  An empty
    string follows each reported digram as a separator line.
    """
    report = []
    by_context_count = sorted(grams.items(), key=lambda item: len(item[1]))
    for digram, contexts in by_context_count:
        if len(contexts) > mgram:
            continue
        for ctx, words in contexts.items():
            shown = ', '.join(sorted(words)[:mlen])
            # Strictly greater: with exactly mlen words nothing is truncated.
            more = ", ..." if len(words) > mlen else ""
            report.append("%s => %s (%s)" % (digram, ctx, shown + more))
        report.append("")
    return report


def main():
    """Read words from standard input and print the digram report."""
    # Iterating the stream directly avoids loading the whole input at once.
    for line in format_report(collect_grams(sys.stdin)):
        print(line)


if __name__ == "__main__":
    main()