plint

French poetry validator (local mirror of https://gitlab.com/a3nm/plint)
git clone https://a3nm.net/git/plint/
Log | Files | Refs | README

commit cb33de69f07e5a8af717220e72c75b3719e27bdf
parent 0168f74585cbf074d9337696241f92b126016306
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Sun, 10 Jul 2011 22:05:58 -0400

refactoring

Diffstat:
common.py | 2+-
metric.py | 98+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
poetlint.py | 265+------------------------------------------------------------------------------
template.py | 172+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 274 insertions(+), 263 deletions(-)

diff --git a/common.py b/common.py @@ -2,6 +2,7 @@ import unicodedata import re +import haspirater vowels = 'aeiouyœæ' consonants = "[bcçdfghjklmnpqrstvwxz*-]" @@ -10,7 +11,6 @@ consonants = "[bcçdfghjklmnpqrstvwxz*-]" # in some cases but not others... sure_end_fem = ['es', 'e'] - # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string def strip_accents_one(s, with_except): r = [] diff --git a/metric.py b/metric.py @@ -0,0 +1,98 @@ +import re +from common import strip_accents, normalize, is_vowels, consonants, \ + sure_end_fem +from vowels import possible_weights +import haspirater + +def annotate_aspirated(word): + """Annotate aspirated 'h'""" + if word[0] != 'h': + return word + if haspirater.lookup(word): + return '*'+word + else: + return word + +def fit(chunks, pos, left): + if pos >= len(chunks): + return [[]] + if left < 0: + return [] + if (not is_vowels(chunks[pos])): + return [[chunks[pos]] + x for x in fit(chunks, pos+1, left)] + else: + if (pos >= len(chunks) - 2 and chunks[pos] == 'e'): + # special case for endings + if pos == len(chunks) - 1: + weights = [0] + elif chunks[pos+1] == 's': + weights = [0] + elif chunks[pos+1] == 'nt': + weights = [0, 1] + else: + weights = possible_weights(chunks[pos]) + else: + weights = possible_weights(chunks[pos]) + result = [] + for weight in weights: + #print("Take %s with weight %d" % (chunks[pos], weight), file=sys.stderr) + result += [[(chunks[pos], weight)] + x for x in fit(chunks, pos+1, + left - weight)] + return result + +def feminine(align, verse): + for a in sure_end_fem: + if verse.endswith(a): + return True + #pprint(align) + if verse.endswith('ent') and align[-2][1] != 1: + return True + return False + +def parse(text, bound): + """Return possible aligns for text, bound is an upper bound on the + align length to limit cost""" + + original_text = normalize(text) + + # avoid some vowel problems + text = re.sub("qu", 'q', original_text) + text = re.sub("gue", 'ge', text) + text = re.sub("gué", 'gé', text) + text = re.sub("guè", 'gè', text) + text = re.sub("gua", 'ga', text) + + words = text.split(' ') + words = [annotate_aspirated(word) for word in words if word != ''] + + pattern = re.compile('('+consonants+'*)', re.UNICODE) + for i in range(len(words)): + words[i] = re.split(pattern, words[i]) + words[i] = [chunk for chunk in words[i] if chunk != ''] + nwords = [] + for chunk in words[i]: + if 'y' not in chunk or len(chunk) == 1 or chunk[0] == 'y': + nwords.append(chunk) + else: + a = chunk.split('y') + nwords.append(a[0]) + nwords.append('Y') + if a[1] != '': + nwords.append(a[1]) + else: + # very special case :-/ + if words[i] == ['p', 'ay', 's']: + nwords.append('y') + words[i] = nwords + if i > 0: + if sum([1 for chunk in words[i-1] if is_vowels(chunk)]) > 1: + if words[i-1][-1] == 'e' and is_vowels(words[i][0], True): + words[i-1].pop(-1) + words[i-1][-1] = words[i-1][-1]+"'" + for word in words: + word.append(' ') + chunks = sum(words, [])[:-1] + + return list(map((lambda x : (x, feminine(x, original_text))), + fit(chunks, 0, bound))) + diff --git a/poetlint.py b/poetlint.py @@ -2,269 +2,10 @@ import re import sys -import unicodedata -import haspirater import rhyme -import error +import metric +import template from pprint import pprint -from vowels import possible_weights -from common import strip_accents, normalize, is_vowels, consonants, \ - sure_end_fem -from hemistiches import check_hemistiches - -def annotate_aspirated(word): - """Annotate aspirated 'h'""" - if word[0] != 'h': - return word - if haspirater.lookup(word): - return '*'+word - else: - return word - -def fit(chunks, pos, left): - if pos >= len(chunks): - return [[]] - if left < 0: - return [] - if (not is_vowels(chunks[pos])): - return [[chunks[pos]] + x for x in fit(chunks, pos+1, left)] - else: - if (pos >= len(chunks) - 2 and chunks[pos] == 'e'): - # special case for endings - if pos == len(chunks) - 1: - weights = [0] - elif chunks[pos+1] == 's': - weights = [0] - elif chunks[pos+1] == 'nt': - weights = [0, 1] - else: - weights = possible_weights(chunks[pos]) - else: - weights = possible_weights(chunks[pos]) - result = [] - for weight in weights: - #print("Take %s with weight %d" % (chunks[pos], weight), file=sys.stderr) - result += [[(chunks[pos], weight)] + x for x in fit(chunks, pos+1, - left - weight)] - return result - -def feminine(align, verse): - for a in sure_end_fem: - if verse.endswith(a): - return True - #pprint(align) - if verse.endswith('ent') and align[-2][1] != 1: - return True - return False - -def parse(text, bound): - original_text = normalize(text) - text = re.sub("qu", 'q', original_text) - text = re.sub("gue", 'ge', text) - text = re.sub("gué", 'gé', text) - text = re.sub("guè", 'gè', text) - text = re.sub("gua", 'ga', text) - #print(text, file=sys.stderr) - words = text.split(' ') - words = [annotate_aspirated(word) for word in words if word != ''] - pattern = re.compile('('+consonants+'*)', re.UNICODE) - for i in range(len(words)): - words[i] = re.split(pattern, words[i]) - words[i] = [chunk for chunk in words[i] if chunk != ''] - nwords = [] - for chunk in words[i]: - if 'y' not in chunk or len(chunk) == 1 or chunk[0] == 'y': - nwords.append(chunk) - else: - a = chunk.split('y') - nwords.append(a[0]) - nwords.append('Y') - if a[1] != '': - nwords.append(a[1]) - else: - # very special case :-/ - if words[i] == ['p', 'ay', 's']: - nwords.append('y') - words[i] = nwords - if i > 0: - if sum([1 for chunk in words[i-1] if is_vowels(chunk)]) > 1: - if words[i-1][-1] == 'e' and is_vowels(words[i][0], True): - words[i-1].pop(-1) - words[i-1][-1] = words[i-1][-1]+"'" - for word in words: - word.append(' ') - chunks = sum(words, [])[:-1] - - return list(map((lambda x : (x, feminine(x, original_text))), - fit(chunks, 0, bound))) - -class Pattern: - def __init__(self, metric, myid, femid, rhyme): - self.metric = metric - self.parse_metric() - self.myid = myid - self.femid = femid - self.rhyme = rhyme - - def parse_metric(self): - """Parse from a metric description""" - verse = [int(x) for x in self.metric.split('/')] - self.hemistiches = [] - self.length = 0 - for v in verse: - self.length += v - self.hemistiches.append(self.length) - self.length = self.hemistiches.pop() - -class Template: - def __init__(self, stream): - self.template = [] - self.pattern_line_no = 0 - self.load(stream) - self.line_no = 0 - self.position = 0 - self.env = {} - self.femenv = {} - - def load(self, stream): - """Load from a stream""" - for line in f.readlines(): - line = line.strip() - self.pattern_line_no += 1 - if line != '' and line[0] != '#': - self.template.append(self.parse_template(line.lstrip().rstrip())) - - def count(self, align): - #TODO cleanup - return sum([x[1] for x in align if isinstance(x, tuple)]) - - def rate(self, pattern, align): - """Rate align according to pattern""" - align, fem, hemis = align - c = self.count(align) - ok = True - for h in hemis.values(): - if h != "ok": - ok = False - if ok and c == pattern.length: - return 0 - return (len(hemis.keys())*abs(pattern.length - c) - + sum([1 for x in hemis.values() if x != "ok"])) - - def match(self, line): - """Check a line""" - pattern = self.get() - # compute alignments, check hemistiches, sort by score - possible = parse(line, pattern.length + 2) - possible = list(map((lambda p : (p[0], p[1], - check_hemistiches(p[0], pattern.hemistiches))), possible)) - possible = map((lambda x : (self.rate(pattern, x), x)), possible) - possible = sorted(possible, key=(lambda x : x[0])) - - errors = [] - if len(possible) == 0 or possible[0][0] != 0: - errors.append(error.ErrorBadMetric(possible)) - if len(possible) == 0: - return errors, pattern - possible2 = [] - for (score, x) in possible: - possible2.append((score, x)) - if score != possible[0][0]: - break - possible = possible2 - - if pattern.myid not in self.env.keys(): - #print(normalize(line)) - self.env[pattern.myid] = rhyme.init_rhyme(normalize(line), - pattern.rhyme) - #print("nVALUE") - #pprint(self.env[pattern.myid]) - #pprint(self.env[pattern.myid]) - else: - old = list(self.env[pattern.myid]) - self.env[pattern.myid] = rhyme.check_rhyme(self.env[pattern.myid], - (normalize(line), pattern.rhyme)) - #print("nVALUE") - #pprint(self.env[pattern.myid]) - if (self.env[pattern.myid][1] == None and - len(self.env[pattern.myid][0]) == 0): - errors.append(error.ErrorBadRhymeSound(old, None)) - if pattern.femid not in self.femenv.keys(): - if pattern.femid == 'M': - x = set(['M']) - elif pattern.femid == 'F': - x = set(['F']) - else: - x = set(['M', 'F']) - self.femenv[pattern.femid] = x - # TODO this is simplistic and order-dependent - if pattern.femid.swapcase() in self.femenv.keys(): - new = set(['M', 'F']) - self.femenv[pattern.femid.swapcase()] - if len(new) > 0: - self.femenv[pattern.femid] = new - old = list(self.femenv[pattern.femid]) - #pprint(possible) - new = list(set(['F' if x[1] else 'M' for (score, x) in possible])) - self.femenv[pattern.femid] &= set(new) - #print(old) - #print(new) - if len(self.femenv[pattern.femid]) == 0: - errors.append(error.ErrorBadRhymeGenre(old, new)) - - return errors, pattern - - def parse_template(self, l): - """Parse template from a line""" - split = l.split(' ') - metric = split[0] - if len(split) >= 2: - myid = split[1] - else: - myid = str(self.pattern_line_no) - if len(split) >= 3: - femid = split[2] - else: - femid = str(self.pattern_line_no) - if len(split) >= 4: - rhyme = [int(x) for x in split[3].split('|')] - else: - rhyme = [] - if len(rhyme) == 0: - rhyme.append(1) - while len(rhyme) < 3: - rhyme.append(-1) - return Pattern(metric, myid, femid, rhyme) - - def reset_conditional(self, d): - return dict((k, v) for x, v in d.items() if x[-1] == '!') - - def reset_state(self, with_femenv=False): - """Reset our state""" - self.position = 0 - self.env = self.reset_conditional(self.env) - self.femenv = self.reset_conditional(self.femenv) - - def get(self): - """Get next state, resetting if needed""" - if self.position >= len(self.template): - self.reset_state() - result = self.template[self.position] - self.position += 1 - return result - - def check(self, line): - """Check line (wrapper)""" - self.line_no += 1 - line = line.rstrip() - if line == '': - return [] - #possible = [compute(p) for p in possible] - #possible = sorted(possible, key=rate) - errors, pattern = self.match(line) - for error in errors: - error.pos(line, self.line_no, pattern) - return errors - if len(sys.argv) != 2: print("Usage: %s TEMPLATE" % sys.argv[0], file=sys.stderr) @@ -273,7 +14,7 @@ if len(sys.argv) != 2: sys.exit(1) f = open(sys.argv[1]) -template = Template(f) +template = template.Template(f) f.close() def run(): diff --git a/template.py b/template.py @@ -0,0 +1,172 @@ +import error +from metric import parse +from hemistiches import check_hemistiches +import rhyme +from common import normalize + +class Pattern: + def __init__(self, metric, myid, femid, rhyme): + self.metric = metric + self.parse_metric() + self.myid = myid + self.femid = femid + self.rhyme = rhyme + + def parse_metric(self): + """Parse from a metric description""" + verse = [int(x) for x in self.metric.split('/')] + self.hemistiches = [] + self.length = 0 + for v in verse: + self.length += v + self.hemistiches.append(self.length) + self.length = self.hemistiches.pop() + +class Template: + def __init__(self, stream): + self.template = [] + self.pattern_line_no = 0 + self.load(stream) + self.line_no = 0 + self.position = 0 + self.env = {} + self.femenv = {} + + def load(self, stream): + """Load from a stream""" + for line in stream.readlines(): + line = line.strip() + self.pattern_line_no += 1 + if line != '' and line[0] != '#': + self.template.append(self.parse_template(line.lstrip().rstrip())) + + def count(self, align): + """total weight of an align""" + return sum([x[1] for x in align if isinstance(x, tuple)]) + + def rate(self, pattern, align): + """Rate align according to pattern""" + align, fem, hemis = align + c = self.count(align) + ok = True + for h in hemis.values(): + if h != "ok": + ok = False + if ok and c == pattern.length: + return 0 + return (len(hemis.keys())*abs(pattern.length - c) + + sum([1 for x in hemis.values() if x != "ok"])) + + def match(self, line): + """Check a line against current pattern, return errors""" + pattern = self.get() + # compute alignments, check hemistiches, sort by score + possible = parse(line, pattern.length + 2) + possible = list(map((lambda p : (p[0], p[1], + check_hemistiches(p[0], pattern.hemistiches))), possible)) + possible = map((lambda x : (self.rate(pattern, x), x)), possible) + possible = sorted(possible, key=(lambda x : x[0])) + + errors = [] + + # check metric + if len(possible) == 0 or possible[0][0] != 0: + errors.append(error.ErrorBadMetric(possible)) + if len(possible) == 0: + return errors, pattern + # keep the best alignment as hypotheses + possible = [(score, align) for (score, align) in possible + if score == possible[0][0]] + + # rhymes + if pattern.myid not in self.env.keys(): + # initialize the rhyme + self.env[pattern.myid] = rhyme.init_rhyme(normalize(line), + pattern.rhyme) + else: + # update the rhyme + old = list(self.env[pattern.myid]) + self.env[pattern.myid] = rhyme.check_rhyme(self.env[pattern.myid], + (normalize(line), pattern.rhyme)) + # no more possible rhymes, something went wrong + if (self.env[pattern.myid][1] == None and + len(self.env[pattern.myid][0]) == 0): + errors.append(error.ErrorBadRhymeSound(old, None)) + + # rhyme genres + # TODO refactor this + if pattern.femid not in self.femenv.keys(): + if pattern.femid == 'M': + x = set(['M']) + elif pattern.femid == 'F': + x = set(['F']) + else: + x = set(['M', 'F']) + self.femenv[pattern.femid] = x + else: + # TODO this is simplistic and order-dependent + if pattern.femid.swapcase() in self.femenv.keys(): + new = set(['M', 'F']) - self.femenv[pattern.femid.swapcase()] + if len(new) > 0: + self.femenv[pattern.femid] = new + + old = list(self.femenv[pattern.femid]) + new = list(set(['F' if x[1] else 'M' for (score, x) in possible])) + self.femenv[pattern.femid] &= set(new) + if len(self.femenv[pattern.femid]) == 0: + errors.append(error.ErrorBadRhymeGenre(old, new)) + + return errors, pattern + + def parse_template(self, l): + """Parse template from a line""" + split = l.split(' ') + metric = split[0] + if len(split) >= 2: + myid = split[1] + else: + myid = str(self.pattern_line_no) + if len(split) >= 3: + femid = split[2] + else: + femid = str(self.pattern_line_no) + if len(split) >= 4: + rhyme = [int(x) for x in split[3].split('|')] + else: + rhyme = [] + if len(rhyme) == 0: + rhyme.append(1) + while len(rhyme) < 3: + rhyme.append(-1) + return Pattern(metric, myid, femid, rhyme) + + def reset_conditional(self, d): + return dict((k, v) for x, v in d.items() if x[-1] == '!') + + def reset_state(self, with_femenv=False): + """Reset our state, except ids ending with '!'""" + self.position = 0 + self.env = self.reset_conditional(self.env) + self.femenv = self.reset_conditional(self.femenv) + + def get(self): + """Get next state, resetting if needed""" + if self.position >= len(self.template): + self.reset_state() + result = self.template[self.position] + self.position += 1 + return result + + def check(self, line): + """Check line (wrapper)""" + self.line_no += 1 + line = line.rstrip() + if line == '': + return [] + #possible = [compute(p) for p in possible] + #possible = sorted(possible, key=rate) + errors, pattern = self.match(line) + for error in errors: + error.pos(line, self.line_no, pattern) + return errors +