commit bed9e45249998d768ebd771a456f8caf7caae334
parent 756cabf98118fc022983d07a828594cf2a76b52d
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Tue, 13 Mar 2012 14:54:57 +0100
cleanup and corrections
Diffstat:
3 files changed, 62 insertions(+), 15 deletions(-)
diff --git a/common.py b/common.py
@@ -1,18 +1,29 @@
#!/usr/bin/python3
+#coding: utf-8
import unicodedata
import re
-import haspirater
vowels = 'aeiouyϾ'
-consonants = "[bcçdfghjklmnpqrstvwxz*-]"
+consonants = "bcçdfghjklmnpqrstvwxz"
+
+# a variant of x-sampa such that all french phonemes are one-character
+SUBSTS = [
+ ('#', 'A~'),
+ ('$', 'O~'),
+ (')', 'E~'),
+ ('(', '9~'),
+ ]
# Forbidden at the end of a hemistiche. "-ent" would also be forbidden
# in some cases but not others...
sure_end_fem = ['es', 'e', 'ë']
# http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
-def strip_accents_one(s, with_except):
+def strip_accents_one(s, with_except=False):
+ """Strip accents from a string
+
+ with_except keeps specifically 'é' and 'è'"""
r = []
for x in s:
if with_except and x in ['è', 'é']:
@@ -27,15 +38,22 @@ def strip_accents(s, with_except=False):
if unicodedata.category(c) != 'Mn'))
def norm_spaces(text):
+ """Collapse runs of whitespace (and dangling hyphens) into a single space"""
return re.sub("\s+-*\s*", ' ', text)
-def rm_punct(text):
- text = re.sub("'", '', text)
+def rm_punct(text, with_apostrophe = False):
+ """Remove punctuation from text"""
+ if not with_apostrophe:
+ text = re.sub("'", '', text)
#TODO rather: keep only good chars
- pattern = re.compile('[^\w -]', re.UNICODE)
+ pattern = re.compile("[^'\w -]", re.UNICODE)
return pattern.sub(' ', text)
-def is_vowels(chunk, with_h = False, with_y = True):
+def is_vowels(chunk, with_h=False, with_y=True):
+ """Test if a chunk consists only of vowels
+
+ with_h counts 'h' as vowel, with_y allows 'y'"""
+
if not with_y and chunk == 'y':
return False
for char in strip_accents(chunk):
@@ -44,6 +62,28 @@ def is_vowels(chunk, with_h = False, with_y = True):
return False
return True
-def normalize(text):
- return norm_spaces(rm_punct(text.lower())).rstrip().lstrip()
+def is_consonants(chunk):
+ """Test if a chunk consists only of consonants"""
+
+ for char in strip_accents(chunk):
+ if char not in consonants:
+ return False
+ return True
+
+def normalize(text, with_apostrophe=False):
+ """Normalize text, i.e. lowercase, no useless punctuation or whitespace"""
+ return norm_spaces(rm_punct(text.lower(), with_apostrophe)).rstrip().lstrip()
+
+def subst(string, subs):
+ if len(subs) == 0:
+ return string
+ return subst(string.replace(subs[0][0], subs[0][1]), subs[1:])
+
+def to_xsampa(s):
+ """convert our modified format to x-sampa"""
+ return subst(s, SUBSTS)
+
+def from_xsampa(s):
+ """convert x-sampa to our modified format"""
+ return subst(s, [(x[1], x[0]) for x in SUBSTS])
diff --git a/metric.py b/metric.py
@@ -1,6 +1,8 @@
+#!/usr/bin/python
+#coding: utf-8
+
import re
-from common import strip_accents, normalize, is_vowels, consonants, \
- sure_end_fem
+from common import normalize, is_vowels, consonants, sure_end_fem
from vowels import possible_weights
import haspirater
@@ -62,7 +64,7 @@ def feminine(align, verse):
def parse(text, bound):
"""Return possible aligns for text, bound is an upper bound on the
- align length to limit cost"""
+ align length to limit running time"""
original_text = normalize(text)
@@ -77,7 +79,7 @@ def parse(text, bound):
words = text.split(' ')
words = [annotate_aspirated(word) for word in words if word != '']
- pattern = re.compile('('+consonants+'*)', re.UNICODE)
+ pattern = re.compile('(['+consonants+'*-]*)', re.UNICODE)
# cut each word in chunks of vowels and consonants, with some specific
# kludges
diff --git a/vowels.py b/vowels.py
@@ -1,14 +1,19 @@
#!/usr/bin/python3
+#coding: utf-8
+
+"""Compute the number of syllables taken by a vowel chunk"""
from common import strip_accents
def contains_trema(chunk):
+ """Test if a string contains a vowel bearing a trema (diaeresis)"""
for x in ['ä', 'ï', 'ö', 'ü', 'ÿ']:
if x in chunk:
return True
return False
def possible_weights(chunk):
+ """Return the possible numbers of syllables taken by a vowel chunk"""
if len(chunk) == 1:
return [1]
# old spelling and weird exceptions
@@ -19,7 +24,6 @@ def possible_weights(chunk):
if contains_trema(chunk):
return [2]
chunk = strip_accents(chunk, True)
- # TODO 'ée' ? ('déesse')
if chunk in ['ai', 'ou', 'eu', 'ei', 'eau', 'eoi', 'eui', 'au', 'oi',
'oie', 'œi', 'œu', 'eaie', 'aie', 'oei', 'oeu', 'ea', 'ae', 'eo',
'eoie', 'oe', 'eai', 'eue', 'aa', 'oo', 'ee', 'ii', 'aii',
@@ -28,6 +32,7 @@ def possible_weights(chunk):
for x in ['oa', 'ea', 'eua', 'ao', 'euo', 'ua', 'uo', 'yo', 'yau']:
if x in chunk:
return [2]
+ # beware of "déesse"
if chunk == 'ée':
return [1, 2]
if chunk[0] == 'i':
@@ -38,7 +43,7 @@ def possible_weights(chunk):
return [1, 2]
if 'é' in chunk or 'è' in chunk:
return [2]
- # only non-accented left
# TODO hmm
return [1, 2]
+