plint

French poetry validator
git clone https://a3nm.net/git/plint/
Log | Files | Refs | README

commit bed9e45249998d768ebd771a456f8caf7caae334
parent 756cabf98118fc022983d07a828594cf2a76b52d
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Tue, 13 Mar 2012 14:54:57 +0100

cleanup and corrections

Diffstat:
common.py | 58 +++++++++++++++++++++++++++++++++++++++++++++++++---------
metric.py | 10 ++++++----
vowels.py | 9 +++++++--
3 files changed, 62 insertions(+), 15 deletions(-)

diff --git a/common.py b/common.py @@ -1,18 +1,29 @@ #!/usr/bin/python3 +#coding: utf-8 import unicodedata import re -import haspirater vowels = 'aeiouyœæ' -consonants = "[bcçdfghjklmnpqrstvwxz*-]" +consonants = "bcçdfghjklmnpqrstvwxz" + +# a variant of x-sampa such that all french phonemes are one-character +SUBSTS = [ + ('#', 'A~'), + ('$', 'O~'), + (')', 'E~'), + ('(', '9~'), + ] # Forbidden at the end of a hemistiche. "-ent" would also be forbidden # in some cases but not others... sure_end_fem = ['es', 'e', 'ë'] # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string -def strip_accents_one(s, with_except): +def strip_accents_one(s, with_except=False): + """Strip accent from a string + + with_except keeps specifically 'é' and 'è'""" r = [] for x in s: if with_except and x in ['è', 'é']: @@ -27,15 +38,22 @@ def strip_accents(s, with_except=False): if unicodedata.category(c) != 'Mn')) def norm_spaces(text): + """Remove multiple consecutive whitespace""" return re.sub("\s+-*\s*", ' ', text) -def rm_punct(text): - text = re.sub("'", '', text) +def rm_punct(text, with_apostrophe = False): + """Remove punctuation from text""" + if not with_apostrophe: + text = re.sub("'", '', text) #TODO rather: keep only good chars - pattern = re.compile('[^\w -]', re.UNICODE) + pattern = re.compile("[^'\w -]", re.UNICODE) return pattern.sub(' ', text) -def is_vowels(chunk, with_h = False, with_y = True): +def is_vowels(chunk, with_h=False, with_y=True): + """Test if a chunk is vowels + + with_h counts 'h' as vowel, with_y allows 'y'""" + if not with_y and chunk == 'y': return False for char in strip_accents(chunk): @@ -44,6 +62,28 @@ def is_vowels(chunk, with_h = False, with_y = True): return False return True -def normalize(text): - return norm_spaces(rm_punct(text.lower())).rstrip().lstrip() +def is_consonants(chunk): + """Test if a chunk is consonants""" + + for char in strip_accents(chunk): + if char not in consonants: + 
return False + return True + +def normalize(text, with_apostrophe=False): + """Normalize text, ie. lowercase, no useless punctuation or whitespace""" + return norm_spaces(rm_punct(text.lower(), with_apostrophe)).rstrip().lstrip() + +def subst(string, subs): + if len(subs) == 0: + return string + return subst(string.replace(subs[0][0], subs[0][1]), subs[1:]) + +def to_xsampa(s): + """convert our modified format to x-sampa""" + return subst(s, SUBSTS) + +def from_xsampa(s): + """convert x-sampa to our modified format""" + return subst(s, [(x[1], x[0]) for x in SUBSTS]) diff --git a/metric.py b/metric.py @@ -1,6 +1,8 @@ +#!/usr/bin/python +#coding: utf-8 + import re -from common import strip_accents, normalize, is_vowels, consonants, \ - sure_end_fem +from common import normalize, is_vowels, consonants, sure_end_fem from vowels import possible_weights import haspirater @@ -62,7 +64,7 @@ def feminine(align, verse): def parse(text, bound): """Return possible aligns for text, bound is an upper bound on the - align length to limit cost""" + align length to limit running time""" original_text = normalize(text) @@ -77,7 +79,7 @@ def parse(text, bound): words = text.split(' ') words = [annotate_aspirated(word) for word in words if word != ''] - pattern = re.compile('('+consonants+'*)', re.UNICODE) + pattern = re.compile('(['+consonants+'*-]*)', re.UNICODE) # cut each word in chunks of vowels and consonants, with some specific # kludges diff --git a/vowels.py b/vowels.py @@ -1,14 +1,19 @@ #!/usr/bin/python3 +#coding: utf-8 + +"""Compute the number of syllabes taken by a vowel chunk""" from common import strip_accents def contains_trema(chunk): + """Test if a string contains a word with a trema""" for x in ['ä', 'ï', 'ö', 'ü', 'ÿ']: if x in chunk: return True return False def possible_weights(chunk): + """Return the possible number of syllabes taken by a vowel chunk""" if len(chunk) == 1: return [1] # old spelling and weird exceptions @@ -19,7 +24,6 @@ def 
possible_weights(chunk): if contains_trema(chunk): return [2] chunk = strip_accents(chunk, True) - # TODO 'ée' ? ('déesse') if chunk in ['ai', 'ou', 'eu', 'ei', 'eau', 'eoi', 'eui', 'au', 'oi', 'oie', 'œi', 'œu', 'eaie', 'aie', 'oei', 'oeu', 'ea', 'ae', 'eo', 'eoie', 'oe', 'eai', 'eue', 'aa', 'oo', 'ee', 'ii', 'aii', @@ -28,6 +32,7 @@ def possible_weights(chunk): for x in ['oa', 'ea', 'eua', 'ao', 'euo', 'ua', 'uo', 'yo', 'yau']: if x in chunk: return [2] + # beware of "déesse" if chunk == 'ée': return [1, 2] if chunk[0] == 'i': @@ -38,7 +43,7 @@ def possible_weights(chunk): return [1, 2] if 'é' in chunk or 'è' in chunk: return [2] - # only non-accented left # TODO hmm return [1, 2] +