ubac

search for weird ngrams
git clone https://a3nm.net/git/ubac/
Log | Files | Refs

ubac.py (1144B)


      1 #! /usr/bin/python3 -tt
      2 # by Virgile Andreani and Antoine Amarilli
      3 
      4 import sys
      5 
      6 if len(sys.argv) not in [2, 3]:
      7     print("usage: ./ubac.py n [beginend] < lexique")
      8     print("    to look for n-grams (with or without begin-end markers)")
      9     print("    where lexique is a utf8 word list")
     10     sys.exit(1)
     11 
     12 n = int(sys.argv[1])
     13 words = []
     14 cngrams = {}
     15     
     16 def ngrams(word):
     17     return zip(*(word[i:] for i in range(n)))
     18 
     19 def score(word):
     20     score = sum(cngrams[ngram] for ngram in ngrams(word))
     21     return float(score) / float(len(list(ngrams(word))))
     22 
     23 beginend = len(sys.argv) == 3
     24 
     25 for word in sys.stdin.readlines():
     26     if beginend:
     27         words.append("^" + word.strip() + "$")
     28     else:
     29         words.append(word.strip())
     30     for ngram in ngrams(words[-1]):
     31         if ngram in cngrams.keys():
     32             cngrams[ngram] += 1
     33         else:
     34             cngrams[ngram] = 1
     35 
     36 scores = [(word, score(word)) for word in words
     37             if len(word) >= n + (1 if beginend else 0)]
     38 scores.sort(key=lambda t: t[1])
     39 
     40 for (w, s) in scores:
     41     if beginend:
     42         print("%8.2f\t%s" % (s, w[1:-1]))
     43     else:
     44         print("%8.2f\t%s" % (s, w))
     45