plint

French poetry validator
git clone https://a3nm.net/git/plint/
Log | Files | Refs | README

commit 3e193b33dd7dee79d9db366c317c0c2014a4da6c
parent 0951d1c5ebcafccc63d656615369e0da0c39bcff
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Sat,  8 Dec 2012 22:34:31 +0100

fixes for weird cases

Diffstat:
common.py |  3 ++-
vowels.py | 10 ++++++++--
2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/common.py b/common.py
@@ -5,7 +5,7 @@ import unicodedata
 import re
 
 vowels = 'aeiouyœæ'
-consonants = "bcçdfghjklmnpqrstvwxz'"
+consonants = "bcçdfghjklmnpqrstvwxzñ'"
 legal = vowels + consonants + ' -'
 
 # a variant of x-sampa such that all french phonemes are one-character
@@ -46,6 +46,7 @@ def rm_punct(text):
   """Remove punctuation from text"""
   text = re.sub("’", "'", text) # no weird apostrophes
   text = re.sub("' ", "'", text) # space after apostrophes
+  text = re.sub("'*$", "", text) # apostrophes at end of line
 
   #TODO rather: keep only good chars
   pattern = re.compile("[^'\w -]", re.UNICODE)
diff --git a/vowels.py b/vowels.py
@@ -55,15 +55,21 @@ def possible_weights_approx(chunk):
     return [1, 2] # TODO unsure about that
   if chunk in ['eüi', 'aoû', 'uë']:
     return [1]
+  if chunk in ['aïe', 'oë', 'ouü']:
+    return [1, 2]
   if contains_trema(chunk):
     return [2]
   chunk = strip_accents(chunk, True)
   if chunk in ['ai', 'ou', 'eu', 'ei', 'eau', 'eoi', 'eui', 'au', 'oi',
       'oie', 'œi', 'œu', 'eaie', 'aie', 'oei', 'oeu', 'ea', 'ae', 'eo',
       'eoie', 'oe', 'eai', 'eue', 'aa', 'oo', 'ee', 'ii', 'aii',
-      'yeu', 'ye']:
+      'yeu', 'ye', 'you']:
     return [1]
-  for x in ['oa', 'ea', 'eua', 'ao', 'euo', 'ua', 'uo', 'yo', 'yau']:
+  if chunk == "oua":
+    return [1, 2] # "pouah"
+  if chunk == "ao":
+    return [1, 2] # "paon"
+  for x in ['oa', 'ea', 'eua', 'euo', 'ua', 'uo', 'yo', 'yau']:
     if x in chunk:
       return [2]
   # beware of "déesse"