commit 441ba2b353f6fa8bccfa389a9563ae1d989d1788
parent 1306aaa38e4d7f517e57057e509545385b707f30
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Sat, 17 Aug 2019 18:52:32 +0200
migrate ouliplint to plint_extra
Diffstat:
4 files changed, 2 insertions(+), 368 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -7,7 +7,6 @@ haspirater
haspirater/*
irc/*
occurrences
-old/*
*.pyc
contexts/
contexts_old/
diff --git a/TODO b/TODO
@@ -1,7 +1,6 @@
== Ongoing ==
-- update to Lexique version 3.83
-- check again xmllitre
+- ship the occurrence files directly
- turn should_be_accepted into a test
- expand the corpus of classical poetry: more Racine, more other authors
(Boileau, Corneille, Prudhomme, etc.)
@@ -33,6 +32,7 @@ corneille_comedie_des_tuileries https://fr.wikisource.org/wiki/La_Com%C3%A9die_d
- Train diaresesis.json on new works
- check that diaeresis:permissive is indeed more permissive
- check for duplicates in additions.txt
+- check again xmllitre
== Ideas ==
diff --git a/ouliplint/nplus7.py b/ouliplint/nplus7.py
@@ -1,327 +0,0 @@
-#!/usr/bin/python3 -uO
-
-"""Undocumented hack to play oulipo's dictionary game with plint"""
-
-import copy
-import localization
-from template import Template
-from rhyme import Rhyme
-import re
-import sys
-import time
-from common import normalize, apostrophes, strip_accents_one, vowels, consonants
-from verse import elision, remove_trivial
-from pos import postag
-sys.path.insert(0, "../drime")
-from query import query
-
-ORTHO = 0
-CGRAM = 3
-GENRE = 4
-NOMBRE = 5
-FREQ = 7
-
-localization.init_locale()
-
-cats = ['ADV', 'NOM', 'ADJ']
-posses = ['A', 'N', 'ADV']
-corr = {
- 'ADV': 'ADV',
- 'NOM': 'N',
- 'ADJ': 'A'}
-varcats = ['NOM', 'ADJ']
-genres = ['m', 'f']
-nombres = ['s', 'p']
-# TODO options to favor frequent, nonfrequent words, words from a certain theme
-# TODO verbs
-fthresh = 1
-exclude = ['travers', 'loin', 'ainsi', 'assez', 'guère', 'pas', 'partout', 'ni',
- 'ne', 'là-bas', 'tant', 'est-ce', 'beau', 'fois', 'milieu', 'présent',
- 'peu', 'peur', 'très', 'enfin', 'tous', 'tout', 'toute', 'toutes', 'bien',
- 'peine', 'autre', 'million', 'millier', 'plus', 'seul', 'puis', 'côté',
- 'encore', 'encor', 'plus', 'point', 'quelque']
-mdur = 5
-
-f = open(sys.argv[1], 'r')
-
-offset = int(sys.argv[3])
-
-words = {}
-mwords = []
-idx = {}
-
-def adj(x, y):
- if x == '':
- return y
- return [x]
-
-def cutword(word):
- x = re.sub("[" + apostrophes + "]", "'", word)
- if "'" in x:
- s = x.split("'")
- before, main, after = cutword(s[-1])
- return "'".join(s[:-1]) + "'" + before, main, after
- before = ""
- main = ""
- after = ""
- started = False
- finished = False
- for c in x:
- if not strip_accents_one(c)[0].lower() in vowels + consonants + ('-' if
- started else ''):
- if started:
- finished = True
- after = after + c
- continue
- before = before + c
- continue
- if not finished:
- started = True
- main = main + c
- return before, main, after
-
-def sure(poss):
- for (cat, x, y) in poss:
- if cat not in cats:
- return False
- return True
-
-def possible(poss, tag):
- for (cat, x, y) in poss:
- if cat in cats:
- if tag in posses:
- return True
- return False
-
-def ok_extends(w, w2, tag):
- try:
- p = mwords[idx[w]][1]
- except KeyError:
- p = [('NOM', 'm', 's'), ('NOM', 'f', 's')]
- w2 = w2.lower()
- if w2 not in idx.keys():
- return False
- p2 = mwords[idx[w2]][1]
- for (cat, a, b) in p:
- # and corr[cat] == tag
- if cat in cats and (cat, a, b) not in p2:
- return False
- if w2 != w and set(elision(w)) <= set(elision(w2)):
- return True
- return False
-
-def valid_word(w, tag):
- global words, lists, idx
- if w not in idx.keys():
- return False
- p = mwords[idx[w]][1]
- if not sure(p) and not possible(p, tag):
- return False
- return True
-
-def change(w, tag):
- #print(w, sure(p), tag, possible(p, tag))
- try:
- i = idx[w]
- except KeyError:
- i = len([w2 for w2 in idx.keys() if w2 < w])
- for (w2, rare, p2) in mwords[i:] + mwords[:i]:
- if ok_extends(w, w2, tag):
- yield w2
- yield w
- # p = idx[cat][genre][nombre][w]
- # n = len(lists[cat][genre][nombre])
- # return lists[cat][genre][nombre][(p+offset) % n]
-
- # if w not in words.keys():
- # return w
- # if len(words[w]) > 1:
- # return w
- # entry = words[w][0]
- # if entry[CGRAM] not in cats:
- # return w
- # cat = entry[CGRAM]
- # genre = entry[GENRE]
- # nombre = entry[NOMBRE]
- # if cat in varcats and (genre not in genres or nombre not in nombres):
- # return w
- # #print(cat, genre, nombre, w)
- # p = idx[cat][genre][nombre][w]
- # n = len(lists[cat][genre][nombre])
- # return lists[cat][genre][nombre][(p+offset) % n]
-
-first = True
-while True:
- l = f.readline()
- if not l:
- break
- # split header line
- if first:
- first = False
- continue
- s = l.split('\t')
- if s[ORTHO] not in words.keys():
- words[s[ORTHO]] = []
- words[s[ORTHO]].append(s)
-
-f.close()
-f = open(sys.argv[2], 'r')
-x = f.read()
-template = Template(x)
-template.options['phon_supposed_ok'] = False
-f.close()
-template.reject_errors = True
-
-lwords = sorted(list(words.keys()))
-
-for w in lwords:
- if w in exclude:
- continue
- poss = set()
- oposs = set()
- ok = True
- for entry in words[w]:
- for cat in entry[CGRAM].split(','):
- #if cat not in cats:
- #ok = False
- #break
- for genre in adj(entry[GENRE], genres):
- for nombre in adj(entry[NOMBRE], nombres):
- poss.add((cat, genre, nombre))
- if float(entry[FREQ]) >= fthresh and cat in cats:
- oposs.add((cat, genre, nombre))
- if ok and len(poss) >= 1:
- idx[w] = len(mwords)
- mwords.append((w, poss, oposs))
-
-
-# for cat in cats:
-# if cat not in lists.keys():
-# lists[cat] = {}
-# idx[cat] = {}
-# for genre in (genres if cat in varcats else ['']):
-# if genre not in lists[cat].keys():
-# lists[cat][genre] = {}
-# idx[cat][genre] = {}
-# for nombre in (nombres if cat in varcats else ['']):
-# if nombre not in lists[cat][genre].keys():
-# lists[cat][genre][nombre] = []
-# idx[cat][genre][nombre] = {}
-# for w in lwords:
-# if len(words[w]) == 1 and ',' not in words[w][0][CGRAM]:
-# entry = words[w][0]
-# if (entry[CGRAM] == cat and entry[GENRE] == genre and entry[NOMBRE] ==
-# nombre):
-# if float(entry[FREQ]) > fthresh:
-# idx[cat][genre][nombre][w] = len(lists[cat][genre][nombre])
-# lists[cat][genre][nombre].append(w)
-
-whitespace_regexp = re.compile("(\s*)")
-
-while True:
- l = sys.stdin.readline()
- if not l:
- break
- l = l.strip()
- if len(l) == 0:
- print(l)
- continue
- s = re.split(whitespace_regexp, l)
- try:
- loffset = int(s[-1])
- s = s[:-1]
- except ValueError:
- loffset = offset
- #print("before init:", template.position)
- errors = template.check(' '.join(s))
- template.back()
- #print("after init:", template.position)
- if errors:
- print ("PROBLEM with ORIGINAL")
- print (errors.report())
- continue
- lw = s[-1]
- s = remove_trivial(s, (lambda w: re.match("^\s*$", w) or
- len(normalize(w, rm_all=True)) == 0))
- r = []
- #print ("INIT rhyme: ", l)
- constraint = template.template[template.position % len(template.template)].constraint
- rhyme = Rhyme(lw, constraint, template.mergers, template.options)
- scut = [cutword(wfull) for wfull in s]
- #print(scut)
- tags = postag(scut)
- #print(tags)
- #print(scut)
- first = True
- for i, (before, ow, after) in reversed(list(enumerate(scut))):
- #print ("<%s|%s|%s>" % (before, w, after))
- w = ow.lower()
- started = time.time()
- ok = False
- tried = 0
- acceptable = 0
- if valid_word(w, tags[i]) or (ow[0] == ow[0].upper() and i > 0):
- if first and len(normalize(w)) > 0:
- first = False
- was_first = True
- rr, c, sur = query(w)
- try:
- lrhymes = sorted([x['word'] for x in rr['result']] + [w])
- it = lrhymes
- wpos = it.index(w)
- it = it[wpos+1:]
- except KeyError:
- it = change(w, tags[1])
- else:
- it = change(w, tags[i])
- for w2 in it:
- if not (ok_extends(w, w2, tags[i])):
- continue
- if time.time() - started > mdur:
- break #timeout
- if w2.lower() == w.lower():
- break
- tried += 1
- #print (w2, "try:" + ' '.join(r + [w2] + s[i+1:]))
- line = ' '.join(s[:i] + [before + w2 + after] + list(reversed(r)))
- #print ("CONSIDER: " + line)
- if was_first:
- was_first = False
- nrhyme = copy.deepcopy(rhyme)
- #print(lw, rhyme.phon, rhyme.eye)
- nrhyme.feed(w2, constraint)
- #print(normalize(line), nrhyme.phon, nrhyme.eye)
- if not nrhyme.satisfied():
- #print(nrhyme.phon, nrhyme.eye)
- #print ("... NO RHYME")
- continue
- #print ("TRY: " + line)
- #print("before inter:", template.position)
- #print ("check...")
- errors = template.check(line, quiet=True)
- #print ("...done")
- template.back()
- #print("after inter:", template.position)
- if not errors:
- acceptable += 1
- if acceptable == loffset:
- r.append(w2)
- ok = True
- break
- else:
- pass
- #print (errors.report())
- if not ok:
- r.append(w)
- if len(w) > 0 and ow[0] == ow[0].upper():
- r[-1] = r[-1][0].upper() + r[-1][1:]
- r[-1] = before + r[-1] + after
- final = ''.join(reversed(r))
- #print("before final:", template.position)
- errors = template.check(final)
- #print("after final:", template.position)
- if errors:
- print ("PROBLEM")
- print (errors.report())
- break
- print (final)
diff --git a/ouliplint/posplay.py b/ouliplint/posplay.py
@@ -1,38 +0,0 @@
-#!/usr/bin/python3
-# -*- coding: utf-8
-
-import os
-from common import normalize
-from nltk.tag.stanford import POSTagger
-from pprint import pprint
-
-def postag(l):
- l2 = []
- idxes = []
- for (b, w, a) in l:
- for i, x in enumerate([b, w, a]):
- if (i == 1):
- idxes.append(len(l2))
- if (len(x.strip()) > 0) or i == 1:
- l2.append(x)
- tags = st.tag(l2)
- l3 = []
- for idx in idxes:
- l3.append(tags[idx][1])
- #pprint(l)
- #pprint(tags)
- #pprint(l3)
- return l3
-
-os.environ['JAVAHOME'] = '/usr/bin'
-# depends on http://nltk.org/nltk3-alpha/ and stanfond pos tagger
-# st = POSTagger('stanford-postagger-full-2013-11-12/models/english-bidirectional-distsim.tagger', 'stanford-postagger-full-2013-11-12/stanford-postagger.jar')
-st = POSTagger('stanford-postagger-full-2013-11-12/models/french.tagger',
-'stanford-postagger-full-2013-11-12/stanford-postagger.jar', encoding='utf-8')
-x = "Rome à qui vient ton bras d' immoler mon amant".split()
-print( st.tag(x))
-#x = "L' autre mime en riant l' infirme qui volait".split()
-#print( st.tag(x))
-#x = "Quelle est la vitesse aérienne d' une hirondelle à vide ?".split()
-#x = "La souffleuse, , , l'hindoue, elle a lentement péché, l' autre l autre l'autre la belle lésine,".split()
-#print( st.tag('What is the airspeed of an unladen swallow ?'.split()))