commit 3e193b33dd7dee79d9db366c317c0c2014a4da6c
parent 0951d1c5ebcafccc63d656615369e0da0c39bcff
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Sat, 8 Dec 2012 22:34:31 +0100
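fixes for weird cases: accept 'ñ' as a consonant, strip apostrophes at end of line, and handle more vowel chunks ('aïe', 'oë', 'ouü', 'you', 'oua', 'ao')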
Diffstat:
2 files changed, 10 insertions(+), 3 deletions(-)
diff --git a/common.py b/common.py
@@ -5,7 +5,7 @@ import unicodedata
import re
vowels = 'aeiouyϾ'
-consonants = "bcçdfghjklmnpqrstvwxz'"
+consonants = "bcçdfghjklmnpqrstvwxzñ'"
legal = vowels + consonants + ' -'
# a variant of x-sampa such that all french phonemes are one-character
@@ -46,6 +46,7 @@ def rm_punct(text):
"""Remove punctuation from text"""
text = re.sub("’", "'", text) # no weird apostrophes
text = re.sub("' ", "'", text) # space after apostrophes
+ text = re.sub("'*$", "", text) # apostrophes at end of line
#TODO rather: keep only good chars
pattern = re.compile("[^'\w -]", re.UNICODE)
diff --git a/vowels.py b/vowels.py
@@ -55,15 +55,21 @@ def possible_weights_approx(chunk):
return [1, 2] # TODO unsure about that
if chunk in ['eüi', 'aoû', 'uë']:
return [1]
+ if chunk in ['aïe', 'oë', 'ouü']:
+ return [1, 2]
if contains_trema(chunk):
return [2]
chunk = strip_accents(chunk, True)
if chunk in ['ai', 'ou', 'eu', 'ei', 'eau', 'eoi', 'eui', 'au', 'oi',
'oie', 'œi', 'œu', 'eaie', 'aie', 'oei', 'oeu', 'ea', 'ae', 'eo',
'eoie', 'oe', 'eai', 'eue', 'aa', 'oo', 'ee', 'ii', 'aii',
- 'yeu', 'ye']:
+ 'yeu', 'ye', 'you']:
return [1]
- for x in ['oa', 'ea', 'eua', 'ao', 'euo', 'ua', 'uo', 'yo', 'yau']:
+ if chunk == "oua":
+ return [1, 2] # "pouah"
+ if chunk == "ao":
+ return [1, 2] # "paon"
+ for x in ['oa', 'ea', 'eua', 'euo', 'ua', 'uo', 'yo', 'yau']:
if x in chunk:
return [2]
# beware of "déesse"