plint

French poetry validator
git clone https://a3nm.net/git/plint/
Log | Files | Refs | README

commit 42bde95087c10d81e146f4a560727f94be824407
parent 8d59d3e20eeee614e80fec970c3500efc05d0f82
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Sat, 25 Jun 2011 12:12:31 -0400

refactoring

Diffstat:
common.py | 43 +++++++++++++++++++++++++++++++++++++++++++
poetlint.py | 98 +++++++------------------------------------------------------------------------
vowels.py | 44 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 95 insertions(+), 90 deletions(-)

diff --git a/common.py b/common.py @@ -0,0 +1,43 @@ +#!/usr/bin/python3 + +import unicodedata +import re + +vowels = 'aeiouyœæ' + +# http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string +def strip_accents_one(s, with_except): + r = [] + for x in s: + if with_except and x in ['è', 'é']: + r.append(x) + else: + r += unicodedata.normalize('NFD', x) + return r + +def strip_accents(s, with_except=False): + return ''.join( + (c for c in strip_accents_one(s, with_except) + if unicodedata.category(c) != 'Mn')) + +def norm_spaces(text): + return re.sub("\s+-*\s*", ' ', text) + +def rm_punct(text): + text = re.sub("'", '', text) + #TODO rather: keep only good chars + pattern = re.compile('[^\w -]', re.UNICODE) + return pattern.sub(' ', text) + +def is_vowels(chunk, with_h = False, with_y = True): + if not with_y and chunk == 'y': + return False + for char in strip_accents(chunk): + if char not in vowels: + if char != 'h' or not with_h: + return False + return True + +def normalize(text): + return norm_spaces(rm_punct(text.lower())).rstrip().lstrip() + diff --git a/poetlint.py b/poetlint.py @@ -5,87 +5,18 @@ import sys import unicodedata import haspirater import rhyme -#import cProfile from pprint import pprint +from vowels import possible_weights +from common import strip_accents, normalize, is_vowels #TODO no clear femid env for implicit repeat #TODO femid pattern groups (not all the same) - consonants = "[bcçdfghjklmnpqrstvwxz*-]" -vowels = 'aeiouyœæ' -# TODO -ment at hemistiche +# Forbidden at the end of a hemistiche. "-ent" would also be forbidden +# in some cases but not others... 
sure_end_fem = ['es', 'e'] -end_fem = sure_end_fem + ['ent'] - -hemistiche_pos = 6 -num_verse = 12 - -def contains_trema(chunk): - for x in ['ä', 'ï', 'ö', 'ü', 'ÿ']: - if x in chunk: - return True - return False - -def possible_weights(chunk): - if len(chunk) == 1: - return [1] - # old spelling and weird exceptions - if chunk in ['ouï']: - return [2] - if chunk in ['eüi', 'aoû']: - return [1] - if contains_trema(chunk): - return [2] - chunk = strip_accents(chunk, True) - # TODO 'ée' ? ('déesse') - if chunk in ['ai', 'ou', 'eu', 'ei', 'eau', 'eoi', 'eui', 'au', 'oi', - 'oie', 'œi', 'œu', 'eaie', 'aie', 'oei', 'oeu', 'ea', 'ae', 'eo', - 'eoie', 'oe', 'eai', 'eue', 'aa', 'oo', 'ee', 'ii', 'aii', - 'yeu', 'ye']: - return [1] - for x in ['oa', 'ea', 'eua', 'ao', 'euo', 'ua', 'uo', 'yo', 'yau']: - if x in chunk: - return [2] - if chunk == 'ée': - return [1, 2] - if chunk[0] == 'i': - return [1, 2] - if chunk[0] == 'u' and (strip_accents(chunk[1]) in ['i', 'e']): - return [1, 2] - if chunk[0] == 'o' and chunk[1] == 'u' and len(chunk) >= 3 and strip_accents(chunk[2]) in ['i', 'e']: - return [1, 2] - if 'é' in chunk or 'è' in chunk: - return [2] - # only non-accented left - - # TODO hmm - return [99] - -# http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string -def strip_accents_one(s, with_except): - r = [] - for x in s: - if with_except and x in ['è', 'é']: - r.append(x) - else: - r += unicodedata.normalize('NFD', x) - return r - -def strip_accents(s, with_except=False): - return ''.join( - (c for c in strip_accents_one(s, with_except) - if unicodedata.category(c) != 'Mn')) - -def norm_spaces(text): - return re.sub("\s+-*\s*", ' ', text) - -def rm_punct(text): - text = re.sub("'", '', text) - #TODO rather: keep only good chars - pattern = re.compile('[^\w -]', re.UNICODE) - return pattern.sub(' ', text) def annotate_aspirated(word): if word[0] != 'h': @@ -95,18 +26,6 @@ def annotate_aspirated(word): else: return word 
-def is_vowels(chunk, with_h = False, with_y = True): - if not with_y and chunk == 'y': - return False - for char in strip_accents(chunk): - if char not in vowels: - if char != 'h' or not with_h: - return False - return True - -def count_vowel_chunks(word): - return sum([1 for chunk in word if is_vowels(chunk)]) - def check_spaces(align, pos): if pos >= len(align): return "bad" @@ -223,9 +142,6 @@ def prepend(l, ls): r.append(l + x) return r -def normalize(text): - return norm_spaces(rm_punct(text.lower())).rstrip().lstrip() - def parse(text, bound): original_text = normalize(text) text = re.sub("qu", 'q', original_text) @@ -256,7 +172,7 @@ def parse(text, bound): nwords.append('y') words[i] = nwords if i > 0: - if count_vowel_chunks(words[i-1]) > 1: + if sum([1 for chunk in words[i-1] if is_vowels(chunk)]) > 1: if words[i-1][-1] == 'e' and is_vowels(words[i][0], True): words[i-1].pop(-1) words[i-1][-1] = words[i-1][-1]+"'" @@ -428,6 +344,7 @@ class Template: pattern.rhyme) #print("nVALUE") #pprint(self.env[pattern.myid]) + #pprint(self.env[pattern.myid]) else: self.env[pattern.myid] = rhyme.check_rhyme(self.env[pattern.myid], (normalize(line), pattern.rhyme)) @@ -453,13 +370,14 @@ class Template: if len(self.femenv[pattern.femid]) == 0: errors.append(ErrorBadRhymeGenre(old, new)) #TODO debug - errors.append(ErrorBadMetric(possible)) + #errors.append(ErrorBadMetric(possible)) return errors, pattern def parse_template(self, l): split = l.split(' ') metric = split[0] + #TODO generate unique ids if need be myid = split[1] femid = split[2] if len(split) >= 4: diff --git a/vowels.py b/vowels.py @@ -0,0 +1,44 @@ +#!/usr/bin/python3 + +from common import strip_accents + +def contains_trema(chunk): + for x in ['ä', 'ï', 'ö', 'ü', 'ÿ']: + if x in chunk: + return True + return False + +def possible_weights(chunk): + if len(chunk) == 1: + return [1] + # old spelling and weird exceptions + if chunk in ['ouï']: + return [2] + if chunk in ['eüi', 'aoû']: + return [1] + if 
contains_trema(chunk): + return [2] + chunk = strip_accents(chunk, True) + # TODO 'ée' ? ('déesse') + if chunk in ['ai', 'ou', 'eu', 'ei', 'eau', 'eoi', 'eui', 'au', 'oi', + 'oie', 'œi', 'œu', 'eaie', 'aie', 'oei', 'oeu', 'ea', 'ae', 'eo', + 'eoie', 'oe', 'eai', 'eue', 'aa', 'oo', 'ee', 'ii', 'aii', + 'yeu', 'ye']: + return [1] + for x in ['oa', 'ea', 'eua', 'ao', 'euo', 'ua', 'uo', 'yo', 'yau']: + if x in chunk: + return [2] + if chunk == 'ée': + return [1, 2] + if chunk[0] == 'i': + return [1, 2] + if chunk[0] == 'u' and (strip_accents(chunk[1]) in ['i', 'e']): + return [1, 2] + if chunk[0] == 'o' and chunk[1] == 'u' and len(chunk) >= 3 and strip_accents(chunk[2]) in ['i', 'e']: + return [1, 2] + if 'é' in chunk or 'è' in chunk: + return [2] + # only non-accented left + + # TODO hmm + return [99]