ubac.py (1144B)
#! /usr/bin/python3 -tt
# by Virgile Andreani and Antoine Amarilli
"""Rank the words of a lexicon by the average frequency of their n-grams.

Usage: ./ubac.py n [beginend] < lexique

Every line of standard input is treated as one word (surrounding whitespace
is stripped).  All character n-grams of all words are counted; each word is
then scored with the mean count of its own n-grams and the words are printed
in ascending order of score, one "score<TAB>word" line per word.

When a second argument is present (its value is not inspected), every word
is wrapped in "^" and "$" markers before n-grams are extracted, and the
markers are removed again when printing.
"""

import sys
from collections import Counter


def ngrams(word, n):
    """Return the list of character n-grams of `word` as tuples of length `n`.

    The list is empty when `word` has fewer than `n` characters.
    """
    # Zipping the n successive suffixes of the word yields its sliding
    # windows of width n; zip stops at the shortest suffix.
    return list(zip(*(word[i:] for i in range(n))))


def score(word, counts, n):
    """Return the mean of `counts[ngram]` over the n-grams of `word`.

    `word` must contain at least one n-gram (len(word) >= n >= 1),
    otherwise the division below raises ZeroDivisionError.
    """
    grams = ngrams(word, n)
    return sum(counts[gram] for gram in grams) / len(grams)


def main(argv):
    """Run the script with command line `argv` and return the exit status.

    Reads the word list from sys.stdin and writes the ranking to stdout.
    Returns 1 on a usage error (wrong argument count, or `n` not a
    positive integer) and 0 otherwise.
    """
    if len(argv) not in (2, 3):
        print("usage: ./ubac.py n [beginend] < lexique")
        print(" to look for n-grams (with or without begin-end markers)")
        print(" where lexique is a utf8 word list")
        return 1

    # Reject non-numeric and non-positive sizes up front: with n <= 0 no
    # n-gram exists, so scoring would divide by zero.
    try:
        n = int(argv[1])
    except ValueError:
        n = 0
    if n < 1:
        print("error: n must be a positive integer, got %r" % argv[1],
              file=sys.stderr)
        return 1

    # Any third argument switches the begin/end markers on.
    beginend = len(argv) == 3

    words = []
    counts = Counter()
    for line in sys.stdin:
        word = line.strip()
        if beginend:
            word = "^" + word + "$"
        words.append(word)
        counts.update(ngrams(word, n))

    # Same length filter as the original script; the length includes the
    # markers when they are enabled.
    min_len = n + (1 if beginend else 0)
    scores = [(word, score(word, counts, n))
              for word in words if len(word) >= min_len]
    # list.sort is stable, so equally scored words keep their input order.
    scores.sort(key=lambda pair: pair[1])

    for word, value in scores:
        # Drop the "^" and "$" markers before printing.
        shown = word[1:-1] if beginend else word
        print("%8.2f\t%s" % (value, shown))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))