plint

French poetry validator (local mirror of https://gitlab.com/a3nm/plint)
git clone https://a3nm.net/git/plint/
Log | Files | Refs | README

commit fc0c11112689035a966779354650e14db1d72ba3
parent 40215120b44d17498b6f575debe6c78d01160f88
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Wed, 14 Aug 2019 23:53:32 +0200

version undocumented ouliplint code

Diffstat:
ouliplint/nplus7.py | 327+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
ouliplint/posplay.py | 38++++++++++++++++++++++++++++++++++++++
2 files changed, 365 insertions(+), 0 deletions(-)

# ===== ouliplint/nplus7.py (new file in this commit) =====
# NOTE(review): the page this was recovered from had lost all newlines and
# indentation; block nesting below was reconstructed from the statement order
# and should be checked against the repository copy.
#!/usr/bin/python3 -uO

"""Undocumented hack to play oulipo's dictionary game with plint

Usage: nplus7.py LEXICON TEMPLATE OFFSET < verses

LEXICON is a tab-separated word list with one header line, TEMPLATE is a
plint template file, and OFFSET is the default rank of the acceptable
replacement to pick for each word.  Verses are read from standard input, one
per line; a verse may end with an integer that overrides OFFSET for that
verse.  Each verse is printed with its words replaced, or unchanged when no
acceptable replacement is found.
"""

import copy
import localization
from template import Template
from rhyme import Rhyme
import re
import sys
import time
from common import normalize, apostrophes, strip_accents_one, vowels, consonants
from verse import elision, remove_trivial
from pos import postag
# `query` lives in the sibling drime checkout, so the path tweak must come
# before the import
sys.path.insert(0, "../drime")
from query import query

# column indexes in the tab-separated lexicon
# (presumably the Lexique column layout -- TODO confirm)
ORTHO = 0
CGRAM = 3
GENRE = 4
NOMBRE = 5
FREQ = 7

localization.init_locale()

# lexicon categories whose words may be replaced
cats = ['ADV', 'NOM', 'ADJ']
# POS tagger tags considered replaceable
posses = ['A', 'N', 'ADV']
# lexicon category -> POS tagger tag
corr = {
    'ADV': 'ADV',
    'NOM': 'N',
    'ADJ': 'A'}
varcats = ['NOM', 'ADJ']
genres = ['m', 'f']
nombres = ['s', 'p']
# TODO options to favor frequent, nonfrequent words, words from a certain theme
# TODO verbs
# minimal frequency for a reading to be recorded in the "frequent" set
fthresh = 1
# words that are never indexed, hence never replaced nor used as replacement
exclude = ['travers', 'loin', 'ainsi', 'assez', 'guère', 'pas', 'partout', 'ni',
           'ne', 'là-bas', 'tant', 'est-ce', 'beau', 'fois', 'milieu', 'présent',
           'peu', 'peur', 'très', 'enfin', 'tous', 'tout', 'toute', 'toutes', 'bien',
           'peine', 'autre', 'million', 'millier', 'plus', 'seul', 'puis', 'côté',
           'encore', 'encor', 'plus', 'point', 'quelque']
# maximal time (in seconds) spent looking for a replacement of one word
mdur = 5

offset = int(sys.argv[3])

# spelling -> list of lexicon rows (each row is the list of its fields)
words = {}
# list of (spelling, readings, frequent readings), sorted by spelling; a
# reading is a (category, gender, number) triple
mwords = []
# spelling -> position in mwords
idx = {}


def adj(x, y):
    """Return [x], or the list of alternatives y if x is the empty string."""
    if x == '':
        return y
    return [x]


def cutword(word):
    """Split word into a (before, main, after) triple.

    All apostrophe variants are first mapped to "'".  If the word contains an
    apostrophe, everything up to the last one is put in `before` and only the
    remainder is analysed.  Otherwise `main` is the first run of letters
    (hyphens are allowed once it has started), `before` is what precedes it
    and `after` collects the non-letters that follow it.  Letters occurring
    after the end of `main` are dropped.
    """
    x = re.sub("[" + apostrophes + "]", "'", word)
    if "'" in x:
        s = x.split("'")
        before, main, after = cutword(s[-1])
        return "'".join(s[:-1]) + "'" + before, main, after
    before = ""
    main = ""
    after = ""
    started = False
    finished = False
    for c in x:
        allowed = vowels + consonants + ('-' if started else '')
        if strip_accents_one(c)[0].lower() not in allowed:
            if started:
                finished = True
                after = after + c
            else:
                before = before + c
            continue
        if not finished:
            started = True
            main = main + c
    return before, main, after


def sure(poss):
    """Return True if every reading in poss has a replaceable category."""
    return all(cat in cats for (cat, _genre, _nombre) in poss)


def possible(poss, tag):
    """Return True if tag is a replaceable POS tag and some reading in poss
    has a replaceable category."""
    if tag not in posses:
        return False
    return any(cat in cats for (cat, _genre, _nombre) in poss)


def ok_extends(w, w2, tag):
    """Return True if w2 is an acceptable replacement for w.

    w2 (lowercased) must be indexed, must have every replaceable reading of
    w, must differ from w, and elision(w) must be included (as a set) in
    elision(w2).  A word w missing from the index is treated as a singular
    noun of both genders.  tag is currently ignored.
    """
    try:
        p = mwords[idx[w]][1]
    except KeyError:
        p = [('NOM', 'm', 's'), ('NOM', 'f', 's')]
    w2 = w2.lower()
    if w2 not in idx:
        return False
    p2 = mwords[idx[w2]][1]
    for reading in p:
        # the POS tag is not enforced (this used to require corr[cat] == tag)
        if reading[0] in cats and reading not in p2:
            return False
    return w2 != w and set(elision(w)) <= set(elision(w2))


def valid_word(w, tag):
    """Return True if w is indexed and may be replaced given its POS tag."""
    if w not in idx:
        return False
    p = mwords[idx[w]][1]
    return sure(p) or possible(p, tag)


def change(w, tag):
    """Yield the acceptable replacements for w in dictionary order, starting
    at the position of w and wrapping around, then yield w itself as an end
    marker."""
    try:
        i = idx[w]
    except KeyError:
        # w is not indexed: start where it would sit in the sorted index
        i = sum(1 for w2 in idx if w2 < w)
    for (w2, _poss, _oposs) in mwords[i:] + mwords[:i]:
        if ok_extends(w, w2, tag):
            yield w2
    yield w


# load the lexicon, skipping its header line
with open(sys.argv[1], 'r') as f:
    next(f, None)
    for l in f:
        s = l.split('\t')
        words.setdefault(s[ORTHO], []).append(s)

with open(sys.argv[2], 'r') as f:
    template = Template(f.read())
template.options['phon_supposed_ok'] = False
template.reject_errors = True

lwords = sorted(words)

# build the sorted index of candidate words with their possible readings
for w in lwords:
    if w in exclude:
        continue
    poss = set()
    oposs = set()
    for entry in words[w]:
        for cat in entry[CGRAM].split(','):
            # an unspecified gender or number stands for all of them
            for genre in adj(entry[GENRE], genres):
                for nombre in adj(entry[NOMBRE], nombres):
                    poss.add((cat, genre, nombre))
                    if float(entry[FREQ]) >= fthresh and cat in cats:
                        oposs.add((cat, genre, nombre))
    if poss:
        idx[w] = len(mwords)
        mwords.append((w, poss, oposs))

# The separators are captured so that they are kept in the split result.  The
# pattern must not match the empty string: since Python 3.7 re.split() also
# splits on empty matches, which would cut the verse into single characters.
whitespace_regexp = re.compile(r"(\s+)")
blank_regexp = re.compile(r"^\s*$")

# True from the moment the rhyme word of a verse is selected until its first
# candidate has been examined; initialised here so that it is always bound
was_first = False

for l in iter(sys.stdin.readline, ''):
    l = l.strip()
    if len(l) == 0:
        print(l)
        continue
    s = re.split(whitespace_regexp, l)
    # an integer at the end of the verse overrides the default offset
    try:
        loffset = int(s[-1])
        s = s[:-1]
    except ValueError:
        loffset = offset
    # check the original verse, then rewind the template to the same position
    errors = template.check(' '.join(s))
    template.back()
    if errors:
        print("PROBLEM with ORIGINAL")
        print(errors.report())
        continue
    # NOTE(review): when a per-line offset was stripped above, s[-1] is the
    # whitespace separator that preceded it, not the last word -- confirm
    lw = s[-1]
    s = remove_trivial(s, (lambda w: blank_regexp.match(w) or
                           len(normalize(w, rm_all=True)) == 0))
    # replaced words, from the last word of the verse to the first
    r = []
    constraint = template.template[
        template.position % len(template.template)].constraint
    rhyme = Rhyme(lw, constraint, template.mergers, template.options)
    scut = [cutword(wfull) for wfull in s]
    tags = postag(scut)
    first = True
    # go from the end of the verse so that the rhyme word is handled first
    for i, (before, ow, after) in reversed(list(enumerate(scut))):
        w = ow.lower()
        started = time.time()
        ok = False
        acceptable = 0
        # known replaceable words, and capitalized words not at verse start
        replaceable = valid_word(w, tags[i]) or (
            len(ow) > 0 and ow[0] == ow[0].upper() and i > 0)
        if replaceable:
            if first and len(normalize(w)) > 0:
                # rhyme word: take candidates from the rhyme dictionary,
                # starting right after w in sorted order
                first = False
                was_first = True
                rr, _c, _sur = query(w)
                try:
                    lrhymes = sorted([x['word'] for x in rr['result']] + [w])
                    it = lrhymes[lrhymes.index(w) + 1:]
                except KeyError:
                    # no usable rhyme result: fall back on dictionary order
                    # (tags[i], not a fixed index, so one-word verses work)
                    it = change(w, tags[i])
            else:
                it = change(w, tags[i])
            for w2 in it:
                if not ok_extends(w, w2, tags[i]):
                    continue
                if time.time() - started > mdur:
                    break  # timeout
                if w2.lower() == w.lower():
                    break
                line = ' '.join(s[:i] + [before + w2 + after] +
                                list(reversed(r)))
                if was_first:
                    # NOTE(review): the flag is cleared here, so only the
                    # first candidate is checked against the rhyme, and the
                    # flag leaks to the next word if there was no candidate
                    # -- confirm that this is intended
                    was_first = False
                    nrhyme = copy.deepcopy(rhyme)
                    nrhyme.feed(w2, constraint)
                    if not nrhyme.satisfied():
                        continue
                errors = template.check(line, quiet=True)
                template.back()
                if not errors:
                    acceptable += 1
                    if acceptable == loffset:
                        r.append(w2)
                        ok = True
                        break
        if not ok:
            r.append(w)
        # restore the initial capital of the original word
        if len(w) > 0 and ow[0] == ow[0].upper():
            r[-1] = r[-1][0].upper() + r[-1][1:]
        r[-1] = before + r[-1] + after
    final = ''.join(reversed(r))
    # this check is not rewound: it moves the template on to the next verse
    errors = template.check(final)
    if errors:
        print("PROBLEM")
        print(errors.report())
        break
    print(final)


# ===== ouliplint/posplay.py (new file in this commit) =====
#!/usr/bin/python3
# -*- coding: utf-8 -*-

import os
from common import normalize
from nltk.tag.stanford import POSTagger
from pprint import pprint


def postag(l):
    """Return the POS tag of the main part of each (before, main, after)
    triple in l.

    The non-blank parts of all triples are given to the tagger as a single
    token sequence; main parts are always included, even when empty, so that
    each triple gets exactly one tag.
    """
    tokens = []
    main_positions = []
    for (b, w, a) in l:
        for i, x in enumerate([b, w, a]):
            if i == 1:
                main_positions.append(len(tokens))
            if len(x.strip()) > 0 or i == 1:
                tokens.append(x)
    tags = st.tag(tokens)
    return [tags[position][1] for position in main_positions]


os.environ['JAVAHOME'] = '/usr/bin'
# depends on http://nltk.org/nltk3-alpha/ and the Stanford POS tagger
# st = POSTagger('stanford-postagger-full-2013-11-12/models/english-bidirectional-distsim.tagger', 'stanford-postagger-full-2013-11-12/stanford-postagger.jar')
st = POSTagger('stanford-postagger-full-2013-11-12/models/french.tagger',
               'stanford-postagger-full-2013-11-12/stanford-postagger.jar',
               encoding='utf-8')
x = "Rome à qui vient ton bras d' immoler mon amant".split()
print(st.tag(x))
# other sentences used while experimenting:
#x = "L' autre mime en riant l' infirme qui volait".split()
#x = "Quelle est la vitesse aérienne d' une hirondelle à vide ?".split()
#x = "La souffleuse, , , l'hindoue, elle a lentement péché, l' autre l autre l'autre la belle lésine,".split()
#print( st.tag('What is the airspeed of an unladen swallow ?'.split()))