version undocumented ouliplint code - plint - French poetry validator (local mirror of https://gitlab.com/a3nm/plint)

commit fc0c11112689035a966779354650e14db1d72ba3
parent 40215120b44d17498b6f575debe6c78d01160f88
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Wed, 14 Aug 2019 23:53:32 +0200

version undocumented ouliplint code

Diffstat:
ouliplint/nplus7.py  | 327 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
ouliplint/posplay.py  | 38 ++++++++++++++++++++++++++++++++++++++

2 files changed, 365 insertions(+), 0 deletions(-)
diff --git a/ouliplint/nplus7.py b/ouliplint/nplus7.py
@@ -0,0 +1,327 @@
+#!/usr/bin/python3 -uO
+
+"""Undocumented hack to play oulipo's dictionary game with plint"""
+
+import copy
+import localization
+from template import Template
+from rhyme import Rhyme
+import re
+import sys
+import time
+from common import normalize, apostrophes, strip_accents_one, vowels, consonants
+from verse import elision, remove_trivial
+from pos import postag
+sys.path.insert(0, "../drime")
+from query import query
+
+# Column indices into the tab-separated rows of the lexicon file (each row is
+# split on '\t' when the lexicon is loaded below).
+ORTHO = 0
+CGRAM = 3
+GENRE = 4
+NOMBRE = 5
+FREQ = 7
+
+localization.init_locale()
+
+# lexicon categories (CGRAM values) that are candidates for substitution
+cats = ['ADV', 'NOM', 'ADJ']
+# part-of-speech tags (as returned by postag) that allow a substitution
+posses = ['A', 'N', 'ADV']
+# NOTE(review): corr and varcats are only referenced from commented-out code
+# in this file; corr looks like a lexicon-category -> tagger-tag mapping.
+corr = {
+    'ADV': 'ADV',
+    'NOM': 'N',
+    'ADJ': 'A'}
+varcats = ['NOM', 'ADJ']
+# values tried for the GENRE / NOMBRE fields when the lexicon leaves them empty
+genres = ['m', 'f']
+nombres = ['s', 'p']
+# TODO options to favor frequent, nonfrequent words, words from a certain theme
+# TODO verbs
+# minimal FREQ value for an analysis to be recorded in the third component of
+# an mwords entry
+fthresh = 1
+# spellings that are never entered into the index (mwords / idx)
+exclude = ['travers', 'loin', 'ainsi', 'assez', 'guère', 'pas', 'partout', 'ni',
+    'ne', 'là-bas', 'tant', 'est-ce', 'beau', 'fois', 'milieu', 'présent',
+    'peu', 'peur', 'très', 'enfin', 'tous', 'tout', 'toute', 'toutes', 'bien',
+    'peine', 'autre', 'million', 'millier', 'plus', 'seul', 'puis', 'côté',
+    'encore', 'encor', 'plus', 'point', 'quelque']
+# maximal time, in seconds, spent looking for a replacement for one word
+mdur = 5
+
+# first command-line argument: the lexicon file; it is read and closed further
+# down, after the function definitions
+f = open(sys.argv[1], 'r')
+
+# third command-line argument: default rank of the acceptable candidate to
+# pick (a line of input may override it with a trailing integer)
+offset = int(sys.argv[3])
+
+# words: spelling -> list of lexicon rows (lists of fields) with that spelling
+words = {}
+# mwords: list of (spelling, analyses, frequent analyses) in sorted spelling
+# order, where an analysis is a (category, genre, nombre) tuple
+mwords = []
+# idx: spelling -> position of its entry in mwords
+idx = {}
+
+def adj(x, y):
+  """Return the list of values to try for a lexicon field.
+
+  If the field `x` is the empty string, all the values of `y` are returned
+  (the very same object, not a copy); otherwise the only value is `x`.
+  """
+  return [x] if x != '' else y
+
+def cutword(word):
+  """Split a token into a (before, main, after) triple.
+
+  All apostrophe variants listed in `apostrophes` are first replaced by "'".
+  If the token contains an apostrophe, everything up to and including the
+  last one goes into `before`, and only the remainder is cut further.
+
+  Otherwise `main` is the first run of letters (characters which, once
+  stripped of accents and lowercased, are in `vowels` or `consonants`);
+  a hyphen is accepted inside the run but cannot start it. `before` is what
+  precedes the run and `after` is everything that follows it.
+
+  The three parts always concatenate back to the apostrophe-normalized token:
+  letters occurring after the main run has ended (e.g. "bar" in "foo,bar")
+  are kept in `after` instead of being silently dropped.
+  """
+  x = re.sub("[" + apostrophes + "]", "'", word)
+  if "'" in x:
+    head, _, tail = x.rpartition("'")
+    before, main, after = cutword(tail)
+    return head + "'" + before, main, after
+  before = ""
+  main = ""
+  after = ""
+  started = False
+  finished = False
+  for c in x:
+    is_letter = strip_accents_one(c)[0].lower() in (
+        vowels + consonants + ('-' if started else ''))
+    if finished or (started and not is_letter):
+      # the main run is over: whatever follows belongs to the suffix
+      finished = True
+      after += c
+    elif is_letter:
+      started = True
+      main += c
+    else:
+      before += c
+  return before, main, after
+
+def sure(poss):
+  """Tell whether every analysis in `poss` has a category listed in `cats`.
+
+  `poss` is an iterable of (category, genre, nombre) triples; an empty
+  iterable yields True.
+  """
+  return all(cat in cats for (cat, _genre, _nombre) in poss)
+
+def possible(poss, tag):
+  """Tell whether a substitution is plausible for a word tagged `tag`.
+
+  True when at least one (category, genre, nombre) triple of `poss` has its
+  category in `cats` and `tag` is one of `posses`. The category and the tag
+  are not required to correspond to each other.
+  """
+  return any(cat in cats and tag in posses
+      for (cat, _genre, _nombre) in poss)
+
+def ok_extends(w, w2, tag):
+  """Tell whether `w2` may be substituted for `w`.
+
+  `w2` is lowercased and must be in the index. Every analysis of `w` whose
+  category is in `cats` must also be an analysis of `w2`; when `w` is not
+  indexed it is treated as a singular noun of either gender. Finally `w2`
+  must differ from `w`, and the set of values returned by elision() for `w`
+  must be included in the set returned for `w2`.
+
+  `tag` is accepted for symmetry with the other helpers but is not used.
+  """
+  if w in idx:
+    required = mwords[idx[w]][1]
+  else:
+    required = [('NOM', 'm', 's'), ('NOM', 'f', 's')]
+  w2 = w2.lower()
+  if w2 not in idx:
+    return False
+  available = mwords[idx[w2]][1]
+  if any(cat in cats and (cat, a, b) not in available
+      for (cat, a, b) in required):
+    return False
+  return w2 != w and set(elision(w)) <= set(elision(w2))
+
+def valid_word(w, tag):
+  """Tell whether the word `w`, tagged `tag`, is a candidate for replacement.
+
+  The word must be in the index, and either all of its analyses are in the
+  substitutable categories (see sure()) or a substitution is at least
+  plausible for this tag (see possible()).
+  """
+  if w not in idx:
+    return False
+  analyses = mwords[idx[w]][1]
+  return sure(analyses) or possible(analyses, tag)
+
+def change(w, tag):
+  """Generate the possible replacements of `w`, ending with `w` itself.
+
+  The index is scanned circularly, starting from the entry of `w` (or, when
+  `w` is not indexed, from the number of indexed words that sort before it)
+  and wrapping around; each word accepted by ok_extends() is yielded. `w` is
+  always yielded last, which lets callers detect that the scan is over.
+  """
+  if w in idx:
+    start = idx[w]
+  else:
+    start = sum(1 for other in idx if other < w)
+  rotated = mwords[start:] + mwords[:start]
+  for entry in rotated:
+    candidate = entry[0]
+    if ok_extends(w, candidate, tag):
+      yield candidate
+  yield w
+  # p = idx[cat][genre][nombre][w]
+  # n = len(lists[cat][genre][nombre])
+  # return lists[cat][genre][nombre][(p+offset) % n]
+
+  # if w not in words.keys():
+  #   return w
+  # if len(words[w]) > 1:
+  #   return w
+  # entry = words[w][0]
+  # if entry[CGRAM] not in cats:
+  #   return w
+  # cat = entry[CGRAM]
+  # genre = entry[GENRE]
+  # nombre = entry[NOMBRE]
+  # if cat in varcats and (genre not in genres or nombre not in nombres):
+  #   return w
+  # #print(cat, genre, nombre, w)
+  # p = idx[cat][genre][nombre][w]
+  # n = len(lists[cat][genre][nombre])
+  # return lists[cat][genre][nombre][(p+offset) % n]
+
+# Read the lexicon opened above: skip its first (header) line, then group the
+# tab-separated rows by spelling.
+for lineno, l in enumerate(f):
+  if lineno == 0:
+    continue
+  s = l.split('\t')
+  words.setdefault(s[ORTHO], []).append(s)
+f.close()
+
+# second command-line argument: the plint template the lines must follow
+with open(sys.argv[2], 'r') as f:
+  x = f.read()
+template = Template(x)
+template.options['phon_supposed_ok'] = False
+template.reject_errors = True
+
+lwords = sorted(words)
+
+# Build the index in sorted spelling order: for each non-excluded spelling,
+# collect all its (category, genre, nombre) analyses, and separately those
+# that are frequent enough and belong to a substitutable category.
+for w in lwords:
+  if w in exclude:
+    continue
+  poss = set()
+  oposs = set()
+  for entry in words[w]:
+    for cat in entry[CGRAM].split(','):
+      for genre in adj(entry[GENRE], genres):
+        for nombre in adj(entry[NOMBRE], nombres):
+          triple = (cat, genre, nombre)
+          poss.add(triple)
+          if float(entry[FREQ]) >= fthresh and cat in cats:
+            oposs.add(triple)
+  if poss:
+    idx[w] = len(mwords)
+    mwords.append((w, poss, oposs))
+
+
+# for cat in cats:
+#   if cat not in lists.keys():
+#     lists[cat] = {}
+#     idx[cat] = {}
+#   for genre in (genres if cat in varcats else ['']):
+#     if genre not in lists[cat].keys():
+#       lists[cat][genre] = {}
+#       idx[cat][genre] = {}
+#     for nombre in (nombres if cat in varcats else ['']):
+#       if nombre not in lists[cat][genre].keys():
+#         lists[cat][genre][nombre] = []
+#         idx[cat][genre][nombre] = {}
+#       for w in lwords:
+#         if len(words[w]) == 1 and ',' not in words[w][0][CGRAM]:
+#           entry = words[w][0]
+#           if (entry[CGRAM] == cat and entry[GENRE] == genre and entry[NOMBRE] ==
+#               nombre):
+#             if float(entry[FREQ]) > fthresh:
+#               idx[cat][genre][nombre][w] = len(lists[cat][genre][nombre])
+#               lists[cat][genre][nombre].append(w)
+
+# Split on runs of whitespace, keeping the separators (capturing group). The
+# pattern must not match the empty string: since Python 3.7 re.split() also
+# splits on empty matches, so "(\s*)" would cut between every two characters.
+whitespace_regexp = re.compile(r"(\s+)")
+
+# Main loop: read verses on standard input and print each one with its words
+# replaced, from the last word to the first, by other words from the lexicon
+# such that the line still passes the template check.
+while True:
+  l = sys.stdin.readline()
+  if not l:
+    break
+  l = l.strip()
+  if len(l) == 0:
+    print(l)
+    continue
+  s = whitespace_regexp.split(l)
+  # a trailing integer on the line overrides the global offset for this line
+  try:
+    loffset = int(s[-1])
+    s = s[:-1]
+  except ValueError:
+    loffset = offset
+  # the original line must itself be accepted; back() is called after every
+  # trial check (presumably to rewind the template position -- TODO confirm)
+  errors = template.check(' '.join(s))
+  template.back()
+  if errors:
+    print ("PROBLEM with ORIGINAL")
+    print (errors.report())
+    continue
+  # NOTE(review): when a trailing offset was stripped above, s[-1] is the
+  # whitespace separator that preceded it, not the last word -- confirm that
+  # Rhyme copes with this.
+  lw = s[-1]
+  s = remove_trivial(s, (lambda w: re.match(r"^\s*$", w) or
+          len(normalize(w, rm_all=True)) == 0))
+  r = []
+  constraint = template.template[template.position % len(template.template)].constraint
+  rhyme = Rhyme(lw, constraint, template.mergers, template.options)
+  scut = [cutword(wfull) for wfull in s]
+  tags = postag(scut)
+  first = True
+  # r collects the output chunks in reverse order (last word first)
+  for i, (before, ow, after) in reversed(list(enumerate(scut))):
+    w = ow.lower()
+    started = time.time()
+    ok = False
+    tried = 0
+    acceptable = 0
+    # True only while handling the rhyme word, before its first candidate has
+    # been checked; reset for every word so that it is always bound and never
+    # leaks from one word (or one line) to the next
+    was_first = False
+    # the emptiness guard avoids an IndexError on chunks without any letter
+    if valid_word(w, tags[i]) or (len(ow) > 0 and ow[0] == ow[0].upper()
+        and i > 0):
+      if first and len(normalize(w)) > 0:
+        # rhyme word: take candidates from the rhyme dictionary, i.e. the
+        # rhymes sorting after the word, falling back to the lexicon scan
+        first = False
+        was_first = True
+        rr, c, sur = query(w)
+        try:
+          lrhymes = sorted([x['word'] for x in rr['result']] + [w])
+          it = lrhymes
+          wpos = it.index(w)
+          it = it[wpos+1:]
+        except KeyError:
+          it = change(w, tags[i])
+      else:
+        it = change(w, tags[i])
+      for w2 in it:
+        if not (ok_extends(w, w2, tags[i])):
+          continue
+        if time.time() - started > mdur:
+          break #timeout
+        if w2.lower() == w.lower():
+          break
+        tried += 1
+        # candidate line: untouched prefix, the new word, and the suffix
+        # already rewritten
+        line = ' '.join(s[:i] + [before + w2 + after] + list(reversed(r)))
+        if was_first:
+          was_first = False
+          nrhyme = copy.deepcopy(rhyme)
+          nrhyme.feed(w2, constraint)
+          if not nrhyme.satisfied():
+            continue
+        errors = template.check(line, quiet=True)
+        template.back()
+        if not errors:
+          # keep the loffset-th acceptable candidate
+          acceptable += 1
+          if acceptable == loffset:
+            r.append(w2)
+            ok = True
+            break
+    if not ok:
+      r.append(w)
+    # restore the capitalization of the original word
+    if len(w) > 0 and ow[0] == ow[0].upper():
+      r[-1] = r[-1][0].upper() + r[-1][1:]
+    r[-1] = before + r[-1] + after
+  final = ''.join(reversed(r))
+  # this last check is not followed by back()
+  errors = template.check(final)
+  if errors:
+    print ("PROBLEM")
+    print (errors.report())
+    break
+  print (final)
diff --git a/ouliplint/posplay.py b/ouliplint/posplay.py
@@ -0,0 +1,38 @@
+#!/usr/bin/python3
+# -*- coding: utf-8
+
+import os
+from common import normalize
+from nltk.tag.stanford import POSTagger
+from pprint import pprint
+
+def postag(l):
+  """Return the part-of-speech tag of the main word of each triple of `l`.
+
+  `l` is a list of (before, word, after) string triples. The tagger is run
+  once on the flattened token list, in which `word` is always present while
+  `before` and `after` are only included when they are not blank. The result
+  has one tag per triple, in the same order.
+  """
+  tokens = []
+  main_positions = []
+  for (b, w, a) in l:
+    if b.strip():
+      tokens.append(b)
+    # remember where the main word lands in the flattened list
+    main_positions.append(len(tokens))
+    tokens.append(w)
+    if a.strip():
+      tokens.append(a)
+  tagged = st.tag(tokens)
+  return [tagged[pos][1] for pos in main_positions]
+
+# Where NLTK should look for the Java runtime used to run the tagger.
+os.environ['JAVAHOME'] = '/usr/bin'
+# depends on http://nltk.org/nltk3-alpha/ and the Stanford POS tagger
+# st = POSTagger('stanford-postagger-full-2013-11-12/models/english-bidirectional-distsim.tagger', 'stanford-postagger-full-2013-11-12/stanford-postagger.jar')
+# Module-level tagger used by postag(); the model and jar paths are relative
+# to the current working directory.
+st = POSTagger('stanford-postagger-full-2013-11-12/models/french.tagger',
+'stanford-postagger-full-2013-11-12/stanford-postagger.jar', encoding='utf-8')
+# NOTE(review): the two lines below run the tagger on a sample verse and print
+# the result every time this module is executed or imported.
+x = "Rome à qui vient ton bras d' immoler mon amant".split()
+print( st.tag(x))
+#x = "L' autre mime en riant l' infirme qui volait".split()
+#print( st.tag(x))
+#x = "Quelle est la vitesse aérienne d' une hirondelle à vide ?".split()
+#x = "La souffleuse, , , l'hindoue, elle a lentement péché, l' autre l autre l'autre la belle lésine,".split()
+#print( st.tag('What is the airspeed of an unladen swallow ?'.split()))

	plint French poetry validator (local mirror of https://gitlab.com/a3nm/plint)
	git clone https://a3nm.net/git/plint/
	Log \| Files \| Refs \| README

ouliplint/nplus7.py	\|	327	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
ouliplint/posplay.py	\|	38	++++++++++++++++++++++++++++++++++++++