commit 8d59d3e20eeee614e80fec970c3500efc05d0f82
parent 38720c219b80d088bf0696bab2d0322de670284a
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Sat, 25 Jun 2011 03:18:43 -0400
all boileau without error
Diffstat:
poetlint.py | 559 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------
1 file changed, 427 insertions(+), 132 deletions(-)
diff --git a/poetlint.py b/poetlint.py
@@ -1,48 +1,96 @@
-#!/usr/bin/python3 -u
+#!/usr/bin/python3 -uO
import re
import sys
import unicodedata
-import aspire
+import haspirater
+import rhyme
+#import cProfile
from pprint import pprint
+#TODO no clear femid env for implicit repeat
+#TODO femid pattern groups (not all the same)
+
+
consonants = "[bcçdfghjklmnpqrstvwxz*-]"
vowels = 'aeiouyϾ'
# TODO -ment at hemistiche
-# TODO diaresis
-# TODO rhymes
-# TODO vers en -es sont masc, pas fém
sure_end_fem = ['es', 'e']
end_fem = sure_end_fem + ['ent']
-count_two = ['aë', 'aï', 'ao', 'éa', 'éi', 'éo', 'éu', 'êa', 'êi',
-'êo', 'êu', 'èa', 'èi', 'èo', 'èu', 'oa', 'oya' , 'ueu', 'euâ', 'éâ',
-'oï', 'aïeu', 'oüoi', 'ouï', 'aïe', 'oè', 'oüé', 'ii', 'uau', 'oé',
-'uï', 'uïe']
-# TODO 'ée' ? ('déesse')
-can_count_two = ['ia', 'ée', 'ieue', 'ieu', 'ua', 'ié', 'iée', 'io', 'iu',
-'iue', 'ue', 'ui', 'ie', 'oue', 'oua', 'oueu', 'ouaie', 'ouai', 'oui', 'iè',
-'oué', 'ué', 'uée', 'uia', 'iai', 'yau', 'uo', 'yo']
+hemistiche_pos = 6
+num_verse = 12
+
def contains_trema(chunk):
    """Tell whether *chunk* contains a letter bearing a trema.

    Returns True when any of 'ä', 'ï', 'ö', 'ü', 'ÿ' occurs in *chunk*,
    False otherwise.
    """
    return any(letter in chunk for letter in ('ä', 'ï', 'ö', 'ü', 'ÿ'))
+
def possible_weights(chunk):
    """Return the list of weights the vowel chunk *chunk* may be given.

    The rules below are tried in order and the first one that applies
    wins.  The result is ``[1]``, ``[2]`` or ``[1, 2]``; ``[99]`` is the
    fallback when no rule applies (see the TODO at the end).
    """
    if len(chunk) == 1:
        return [1]
    # old spelling and weird exceptions
    if chunk == 'ouï':
        return [2]
    if chunk in ('eüi', 'aoû'):
        return [1]
    if contains_trema(chunk):
        return [2]
    # From here on, every accent is dropped except those of 'è' and 'é'.
    chunk = strip_accents(chunk, True)
    # TODO 'ée' ? ('déesse')
    always_one = (
        'ai', 'ou', 'eu', 'ei', 'eau', 'eoi', 'eui', 'au', 'oi',
        'oie', 'œi', 'œu', 'eaie', 'aie', 'oei', 'oeu', 'ea', 'ae', 'eo',
        'eoie', 'oe', 'eai', 'eue', 'aa', 'oo', 'ee', 'ii', 'aii',
        'yeu', 'ye',
    )
    if chunk in always_one:
        return [1]
    always_two = ('oa', 'ea', 'eua', 'ao', 'euo', 'ua', 'uo', 'yo', 'yau')
    if any(part in chunk for part in always_two):
        return [2]
    if chunk == 'ée' or chunk[0] == 'i':
        return [1, 2]
    if chunk[0] == 'u' and strip_accents(chunk[1]) in ('i', 'e'):
        return [1, 2]
    if (chunk[0] == 'o' and chunk[1] == 'u' and len(chunk) >= 3
            and strip_accents(chunk[2]) in ('i', 'e')):
        return [1, 2]
    if 'é' in chunk or 'è' in chunk:
        return [2]
    # only non-accented left

    # TODO hmm
    return [99]
# http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
-def strip_accents(s):
def strip_accents_one(s, with_except):
    """Decompose *s* character by character and return a list of pieces.

    Every character is replaced by the characters of its NFD
    normalization, except that 'è' and 'é' are kept untouched when
    *with_except* is true.  The result is a list of single characters.
    """
    pieces = []
    for char in s:
        if with_except and char in ('è', 'é'):
            # protected letters are passed through undecomposed
            pieces.append(char)
        else:
            pieces.extend(unicodedata.normalize('NFD', char))
    return pieces
+
def strip_accents(s, with_except=False):
    """Return *s* with its combining marks removed.

    The string is first decomposed by strip_accents_one(); characters
    whose Unicode category is 'Mn' are then dropped.  With
    *with_except* set, strip_accents_one() leaves 'è' and 'é' intact.
    """
    decomposed = strip_accents_one(s, with_except)
    kept = [c for c in decomposed if unicodedata.category(c) != 'Mn']
    return ''.join(kept)
def norm_spaces(text):
    """Replace each whitespace run of *text* by a single space.

    A match is one or more whitespace characters, optionally followed by
    dashes and further whitespace, so dashes that directly follow
    whitespace are swallowed as well.  Dashes inside a word are kept.
    """
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    return re.sub(r"\s+-*\s*", ' ', text)
def rm_punct(text):
    """Remove apostrophes from *text* and blank out other punctuation.

    Apostrophes are deleted outright; every remaining character that is
    not a word character, a space or a dash is replaced by a space.
    """
    text = re.sub("'", '', text)
    #TODO rather: keep only good chars
    # Raw string: "\w" in a plain literal is an invalid escape sequence.
    pattern = re.compile(r'[^\w -]', re.UNICODE)
    return pattern.sub(' ', text)
def annotate_aspirated(word):
    """Return *word*, prefixed with '*' when haspirater.lookup() accepts it.

    Words that do not start with 'h' are returned unchanged without any
    lookup.  An empty word is returned unchanged as well (indexing its
    first character used to raise IndexError).
    """
    if not word.startswith('h'):
        return word
    if haspirater.lookup(word):
        return '*' + word
    return word
@@ -59,68 +107,76 @@ def is_vowels(chunk, with_h = False, with_y = True):
def count_vowel_chunks(word):
    """Count the chunks of *word* for which is_vowels() is true."""
    return sum(1 for chunk in word if is_vowels(chunk))
-def possible_weights(chunk):
- if chunk in count_two:
- return [2]
- if chunk in can_count_two:
- return [1,2]
- return [1]
-
-def fit(chunks, left, past):
- if left == 7 and (len(chunks) < 2 or chunks[0] + chunks[1] in
- sure_end_fem):
- # no feminine at hemistiche
- # maybe it's a lone word?
- ok = False
- for i in range(2):
- for j in ' -':
- if j in past[-i]:
- ok = True
- if not ok:
- print ("refuse hemistiche", file=sys.stderr)
- return None
- weights = possible_weights(chunks[0])
- for weight in weights:
- nleft = left - weight
- print("Take %s with weight %d, left %d" % (chunks[0], weight,
- nleft), file=sys.stderr)
- result = maybe_sum([(chunks[0], weight)], skip(chunks[1:], nleft,
- past+[chunks[0]], nleft == 6))
- if result != None:
- return result
- print("FAIL!", file=sys.stderr)
- return None
-
-def maybe_sum(a, b):
- if b == None or a == None:
- return None
def check_spaces(align, pos):
    """Classify what follows position *pos* in the alignment *align*.

    Scanning forward from *pos*:
    - "ok"  if a ' ' chunk is met first,
    - "cut" if a tuple (a weighted chunk) is met first,
    - "bad" if the end of *align* is reached.
    """
    index = pos
    while index < len(align):
        item = align[index]
        if item == ' ':
            return "ok"
        if isinstance(item, tuple):
            return "cut"
        index += 1
    return "bad"
def check_hemistiche(align, pos, hem):
    """Walk *align* from *pos* until *hem* units of weight are consumed.

    Returns a ``(status, pos)`` pair where *pos* is the position reached
    and *status* is one of:
    - "bad": the alignment ended too early,
    - "cut": the weight overshot the boundary (or check_spaces said so),
    - "fem": the last unit before the boundary is an ending listed in
      sure_end_fem that is not a lone word,
    - otherwise the verdict of check_spaces() at the boundary.
    """
    while True:
        if pos >= len(align):
            return ("bad", pos)
        if hem == 0:
            return (check_spaces(align, pos), pos)
        if hem < 0:
            return ("cut", pos)
        current = align[pos]
        if isinstance(current, tuple):
            if hem == 1:
                if pos + 1 >= len(align):
                    # this is weird
                    return ("bad", pos)
                ending = (current[0] + align[pos + 1]).rstrip()
                if ending in sure_end_fem:
                    # no feminine at hemistiche
                    # maybe it's a lone word?
                    lone = any(sep in align[pos - back - 1]
                               for back in range(2)
                               for sep in ' -')
                    if not lone:
                        return ("fem", pos)
            # only weighted chunks consume part of the budget
            hem -= current[1]
        pos += 1
+
def fit(chunks, pos, left):
    """Enumerate the alignments of ``chunks[pos:]`` within a weight budget.

    Returns a list of alignments.  In an alignment, chunks for which
    is_vowels() is false are kept as plain strings and the others become
    ``(chunk, weight)`` tuples.  *left* is the remaining budget: it is
    reduced by each chosen weight, and a branch whose budget is negative
    while chunks remain yields no alignment.
    """
    if pos >= len(chunks):
        return [[]]
    if left < 0:
        return []
    chunk = chunks[pos]
    if not is_vowels(chunk):
        return prepend([chunk], fit(chunks, pos + 1, left))

    remaining = len(chunks) - pos
    weights = None
    if chunk == 'e' and remaining <= 2:
        # special case for endings
        if remaining == 1:
            weights = [0]
        else:
            weights = {'s': [0], 'nt': [0, 1]}.get(chunks[pos + 1])
    if weights is None:
        weights = possible_weights(chunk)

    result = []
    for weight in weights:
        tails = fit(chunks, pos + 1, left - weight)
        result.extend(prepend([(chunk, weight)], tails))
    return result
+
def feminine(align, verse):
    """Tell whether *verse*, aligned as *align*, has a feminine ending.

    True when *verse* ends with one of sure_end_fem, or when it ends
    with 'ent' and the second element of ``align[-2]`` is not 1
    (presumably the weight given to the final 'e' -- verify against
    fit()).  False otherwise.
    """
    if any(verse.endswith(ending) for ending in sure_end_fem):
        return True
    return verse.endswith('ent') and align[-2][1] != 1
def nullify(chunk):
if is_vowels(chunk):
@@ -128,31 +184,58 @@ def nullify(chunk):
else:
return chunk
-def align(result):
- align, feminine = result
- if align == None:
- return "Non."
- l1 = ['F '] if feminine else ["M "]
- l2 = ['12 ']
def align2(result):
    """Format the weight line of an alignment.

    *result* is ``(align, feminine, c, hemi)``.  The line starts with
    *c* centred on 2 columns, 'f' or 'm', '-H', *hemi* centred on 3
    columns and a space.  Each weighted chunk then shows its weight
    centred under the chunk; a ' ' chunk reached when the running
    weight equals hemistiche_pos is drawn as '/', and any other chunk
    as blanks of its own length.
    """
    align, feminine, c, hemi = result
    parts = [
        '{:^2}'.format(str(c)),
        'f' if feminine else 'm',
        '-H',
        '{:^3}'.format(hemi),
        ' ',
    ]
    count = 0
    for x in align:
        if isinstance(x, tuple):
            parts.append('{:^{width}}'.format(str(x[1]), width=len(x[0])))
            count += x[1]
        elif x == ' ' and count == hemistiche_pos:
            parts.append('/')
        else:
            parts.append(' ' * len(x))
    return ''.join(parts)
+
def align1(result, success):
    """Format the text line of an alignment.

    The line starts with '-------- ' when *success* is true and with
    '!!!ERROR ' otherwise, followed by the chunks of ``result[0]``
    (the text of weighted chunks, and the other chunks as they are).
    """
    marker = '-------- ' if success else '!!!ERROR '
    texts = [x[0] if isinstance(x, tuple) else x for x in result[0]]
    return marker + ''.join(texts)
+
def append(ls, l):
    """Return a new list holding ``x + l`` for every element x of *ls*."""
    return [x + l for x in ls]
def prepend(l, ls):
    """Return a new list holding ``l + x`` for every element x of *ls*."""
    return [l + x for x in ls]
+
def normalize(text):
    """Lower-case *text*, clean it and trim it.

    The lower-cased text goes through rm_punct() and then
    norm_spaces(); whitespace is finally stripped from both ends.
    """
    lowered = text.lower()
    cleaned = norm_spaces(rm_punct(lowered))
    return cleaned.rstrip().lstrip()
+
+def parse(text, bound):
+ original_text = normalize(text)
+ text = re.sub("qu", 'q', original_text)
text = re.sub("gue", 'ge', text)
- print(text, file=sys.stderr)
+ text = re.sub("gué", 'gé', text)
+ text = re.sub("guè", 'gè', text)
+ text = re.sub("gua", 'ga', text)
+ #print(text, file=sys.stderr)
words = text.split(' ')
- words = [annotate_aspirated(word) for word in words]
+ words = [annotate_aspirated(word) for word in words if word != '']
pattern = re.compile('('+consonants+'*)', re.UNICODE)
for i in range(len(words)):
words[i] = re.split(pattern, words[i])
@@ -168,7 +251,7 @@ def parse(text):
if a[1] != '':
nwords.append(a[1])
else:
- # TODO ouais c'est foutu là...
+ # TODO very special case :-/
if words[i] == ['p', 'ay', 's']:
nwords.append('y')
words[i] = nwords
@@ -181,43 +264,255 @@ def parse(text):
word.append(' ')
chunks = sum(words, [])[:-1]
- ochunks = list(chunks)
- end = [chunk for chunk in re.split(pattern, end)
- if chunk != '']
- if len(chunks) >= 2 and chunks[-(len(end)+1)] != ' ' and chunks[-(len(end)+2)] != ' ' :
- if end != []:
- # drop end
- end.reverse()
- nend = []
- for x in end:
- if chunks[-1] == x:
- chunks.pop()
- nend.append(nullify(x))
- nend.reverse()
- end = nend
- else:
- try:
- if end[-1] == chunks[-1] and chunks[-1] == 'nt':
- feminine = False # OK this looks like fem but isnt (" cent$")
- except IndexError:
- pass
- end = []
-
- print('/'.join(chunks), file=sys.stderr)
- result = (maybe_sum(skip(chunks, 12, []), end), feminine)
- if result[0] == None and oend == 'ent':
- #super-ugly hack because ending 'ent' sometimes isn't dropped
- return (maybe_sum(skip(ochunks, 12, []), end), False)
- else:
+ return list(map((lambda x : (x, feminine(x, original_text))),
+ fit(chunks, 0, bound)))
+
class Error:
    """Base class of the problems reported about one input line.

    An error is created first and located afterwards with pos(), which
    also builds the "stdin:<line number>: " prefix used by say().
    """

    def __init__(self):
        self.line = None
        self.line_no = None
        self.pattern = None
        self.prefix = None

    def pos(self, line, line_no, pattern):
        """Record the offending line, its number and the pattern it faced."""
        self.line = line
        self.line_no = line_no
        self.pattern = pattern
        self.prefix = "stdin:%d: " % self.line_no

    def say(self, l):
        """Print *l* behind the location prefix (pos() must be called first)."""
        print(self.prefix + l)

    def report(self, s, t=()):
        """Print the message *s*, the line, then each detail line of *t*.

        *t* is any iterable of strings; it defaults to an immutable empty
        tuple so that no mutable object is shared between calls.
        """
        self.say("error: %s" % (s))
        #TODO optional
        self.say("Line is: %s" % (self.line))
        for l in t:
            self.say(" " + l)
+
class ErrorBadRhyme(Error):
    """Common part of the rhyme errors.

    Subclasses provide ``kind`` (a word naming the kind of rhyme
    problem) and ``fmt`` (how to print the expected / inferred values).
    """

    def __init__(self, expected, inferred):
        super().__init__()
        self.expected = expected
        self.inferred = inferred

    def report(self):
        """Print the rhyme error through Error.report()."""
        details = (self.kind, self.pattern.myid, self.fmt(self.expected),
                   self.fmt(self.inferred))
        message = ("Bad rhyme %s for type %s (expected %s, inferred %s)"
                   % details)
        super().report(message)
+
class ErrorBadRhymeGenre(ErrorBadRhyme):
    """Rhyme error about the genre of the line ending."""

    def fmt(self, l):
        """Join the values of *l* with ' or '."""
        values = list(l)
        return ' or '.join(values)

    @property
    def kind(self):
        """Word used for this error in the report message."""
        return "genre"
+
class ErrorBadRhymeSound(ErrorBadRhyme):
    """Rhyme error about the sound of the line ending."""

    def fmt(self, l):
        """Placeholder: the values are not printed yet."""
        #TODO
        placeholder = 'TODO'
        return placeholder

    @property
    def kind(self):
        """Word used for this error in the report message."""
        return "value"
+
class ErrorBadMetric(Error):
    """Metric error carrying the scored alignments that were inferred."""

    def __init__(self, possible):
        super().__init__()
        self.possible = possible

    def align(self, align):
        """Format one scored alignment as a line of weights.

        *align* is ``(score, (chunks, feminine))``.  Each weighted chunk
        shows its weight centred under the chunk; a ' ' chunk reached
        when the running weight is one of the pattern's hemistiches is
        drawn as '/', and any other chunk as blanks.  The score is
        appended in parentheses.
        """
        #TODO include a summary
        #TODO match to real line
        score, (chunks, feminine) = align
        cells = []
        count = 0
        for x in chunks:
            if isinstance(x, tuple):
                cells.append('{:^{width}}'.format(str(x[1]), width=len(x[0])))
                count += x[1]
            elif x == ' ' and count in self.pattern.hemistiches:
                cells.append('/')
            else:
                cells.append(' ' * len(x))
        cells.append(' (%d)' % score)
        return ''.join(cells)

    def report(self):
        """Print the error followed by at most four inferred alignments."""
        num = min(len(self.possible), 4)
        message = ("Bad metric (expected %s, inferred the %d following)" %
                   (self.pattern.metric, num))
        shown = [self.align(candidate) for candidate in self.possible[:num]]
        Error.report(self, message, shown)
+
class Pattern:
    """One entry of a template: a metric plus rhyme identifiers.

    *metric* is a string of integers separated by '/'.  After
    construction, ``length`` is their sum and ``hemistiches`` lists the
    running totals reached before the last part.
    """

    def __init__(self, metric, myid, femid, rhyme):
        self.metric = metric
        self.parse_metric()
        self.myid = myid
        self.femid = femid
        self.rhyme = rhyme

    def parse_metric(self):
        """Compute ``length`` and ``hemistiches`` from ``metric``."""
        parts = [int(piece) for piece in self.metric.split('/')]
        total = 0
        cuts = []
        for part in parts:
            total += part
            cuts.append(total)
        # the last running total is the whole length, not a cut
        self.length = cuts.pop()
        self.hemistiches = cuts
+
class Template:
    """Sequence of verse patterns read from a template, with rhyme state.

    Each template line that is neither empty nor starting with '#'
    describes one Pattern (see parse_template).  get() hands the
    patterns out in order; once the last one has been used, the position
    and the rhyme / genre environments are reset and the template starts
    over.
    """

    def __init__(self, stream):
        """Parse the template from *stream* (any object with readlines())."""
        self.template = []
        # Read from the stream that was passed in, not from a global.
        for line in stream.readlines():
            line = line.strip()
            if line != '' and line[0] != '#':
                self.template.append(self.parse_template(line))
        self.reset_state()
        self.line_no = 0

    def count(self, align):
        """Return the total weight of the weighted chunks of *align*."""
        return sum(x[1] for x in align if isinstance(x, tuple))

    def rate(self, pattern, align):
        """Score the alignment *align* against *pattern*; 0 is a perfect fit.

        *align* is ``(chunks, feminine)``.  The score grows with the
        distance to the expected length and with the number of
        hemistiches that check_hemistiche() does not report as "ok".
        """
        align, fem = align
        c = self.count(align)
        #TODO one pass would be enough
        hemis = []
        pos = 0
        previous = 0
        for h in pattern.hemistiches:
            # each hemistiche is checked relative to the previous one
            r, pos = check_hemistiche(align, pos, h - previous)
            previous = h
            hemis.append(r)
        bad = sum(1 for r in hemis if r != "ok")
        if bad == 0 and c == pattern.length:
            return 0
        # (len(hemis) + 1) keeps a wrong length visible even when the
        # pattern has no hemistiche; failed hemistiches are counted so a
        # misplaced one is never scored as a perfect fit.
        return (len(hemis) + 1) * abs(pattern.length - c) + bad

    def match(self, line):
        """Check *line* against the next pattern of the template.

        Returns ``(errors, pattern)``: the list of Error objects found
        (not yet located with pos()) and the pattern that was used.
        """
        pattern = self.get()
        candidates = parse(line, pattern.length + 2)
        errors = []

        possible = sorted(
            ((self.rate(pattern, x), x) for x in candidates),
            key=lambda scored: scored[0])
        if len(possible) == 0 or possible[0][0] != 0:
            errors.append(ErrorBadMetric(possible))
        if len(possible) == 0:
            # same shape as the normal return: check() unpacks a pair
            return errors, pattern
        # Keep the best-scored alignments.
        # NOTE(review): the first alignment with a worse score is stored
        # before the loop stops, so one extra candidate is kept --
        # confirm whether that is intended.
        best = possible[0][0]
        kept = []
        for (score, x) in possible:
            kept.append((score, x))
            if score != best:
                break
        possible = kept

        normalized = normalize(line)
        if pattern.myid not in self.env:
            self.env[pattern.myid] = rhyme.init_rhyme(normalized,
                                                      pattern.rhyme)
        else:
            self.env[pattern.myid] = rhyme.check_rhyme(
                self.env[pattern.myid], (normalized, pattern.rhyme))
            # NOTE(review): assumed to apply only after check_rhyme
            # (the original nesting is unclear) -- confirm.
            state = self.env[pattern.myid]
            if state[1] is None and len(state[0]) == 0:
                errors.append(ErrorBadRhymeSound(None, None))

        if pattern.femid not in self.femenv:
            # 'M' and 'F' impose a genre; any other id accepts both
            # until the lines seen narrow it down.
            if pattern.femid in ('M', 'F'):
                allowed = {pattern.femid}
            else:
                allowed = {'M', 'F'}
            self.femenv[pattern.femid] = allowed
        old = list(self.femenv[pattern.femid])
        new = list(set('F' if x[1] else 'M' for (score, x) in possible))
        self.femenv[pattern.femid] &= set(new)
        if len(self.femenv[pattern.femid]) == 0:
            errors.append(ErrorBadRhymeGenre(old, new))
            #TODO debug
            errors.append(ErrorBadMetric(possible))

        return errors, pattern

    def parse_template(self, l):
        """Build a Pattern from one template line.

        The line holds space-separated fields: metric, rhyme id, genre
        id and, optionally, integers separated by '|'.  That last list
        defaults to ``[1]`` and is padded with -1 up to three values.
        """
        split = l.split(' ')
        metric = split[0]
        myid = split[1]
        femid = split[2]
        if len(split) >= 4:
            constraint = [int(x) for x in split[3].split('|')]
        else:
            constraint = []
        if len(constraint) == 0:
            constraint.append(1)
        while len(constraint) < 3:
            constraint.append(-1)
        return Pattern(metric, myid, femid, constraint)

    def reset_state(self):
        """Go back to the first pattern and forget rhymes and genres."""
        self.position = 0
        self.env = {}
        self.femenv = {}

    def get(self):
        """Return the next pattern, starting over after the last one.

        Raises IndexError when the template holds no pattern.
        """
        if self.position >= len(self.template):
            self.reset_state()
        result = self.template[self.position]
        self.position += 1
        return result

    def check(self, line):
        """Check one input line and return its located errors.

        Every call counts as one line for the reported line numbers; a
        line that is empty once right-stripped yields no error and does
        not consume a pattern.
        """
        self.line_no += 1
        line = line.rstrip()
        if line == '':
            return []
        errors, pattern = self.match(line)
        for error in errors:
            error.pos(line, self.line_no, pattern)
        return errors
+
+
# The only command-line argument is the path of the template file.
if len(sys.argv) != 2:
    print("Usage: %s TEMPLATE" % sys.argv[0], file=sys.stderr)
    sys.exit(1)

# The template is parsed once at start-up; the context manager closes the
# file even when the parsing raises.
with open(sys.argv[1]) as f:
    template = Template(f)
+
def run():
    """Check every line of standard input and report its errors.

    Lines are read one at a time until readline() returns an empty
    string; each is passed to template.check() and every error returned
    is reported immediately.
    """
    for line in iter(sys.stdin.readline, ''):
        for error in template.check(line):
            error.report()
+
+#cProfile.run('run()', 'poetlint.prof')
+run()