commit bed9e45249998d768ebd771a456f8caf7caae334
parent 756cabf98118fc022983d07a828594cf2a76b52d
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Tue, 13 Mar 2012 14:54:57 +0100
cleanup and corrections
Diffstat:
3 files changed, 62 insertions(+), 15 deletions(-)
diff --git a/common.py b/common.py
@@ -1,18 +1,29 @@
#!/usr/bin/python3
+#coding: utf-8
import unicodedata
import re
-import haspirater
vowels = 'aeiouyϾ'
-consonants = "[bcçdfghjklmnpqrstvwxz*-]"
+consonants = "bcçdfghjklmnpqrstvwxz"
+
+# a variant of x-sampa such that all french phonemes are one-character
+SUBSTS = [
+ ('#', 'A~'),
+ ('$', 'O~'),
+ (')', 'E~'),
+ ('(', '9~'),
+ ]
# Forbidden at the end of a hemistiche. "-ent" would also be forbidden
# in some cases but not others...
sure_end_fem = ['es', 'e', 'ë']
# http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
-def strip_accents_one(s, with_except):
+def strip_accents_one(s, with_except=False):
+ """Strip accents from a string
+
+ with_except keeps specifically 'é' and 'è'"""
r = []
for x in s:
if with_except and x in ['è', 'é']:
@@ -27,15 +38,22 @@ def strip_accents(s, with_except=False):
if unicodedata.category(c) != 'Mn'))
def norm_spaces(text):
+ """Collapse runs of whitespace (and dangling hyphens) into a single space"""
return re.sub("\s+-*\s*", ' ', text)
-def rm_punct(text):
- text = re.sub("'", '', text)
+def rm_punct(text, with_apostrophe = False):
+ """Remove punctuation from text"""
+ if not with_apostrophe:
+ text = re.sub("'", '', text)
#TODO rather: keep only good chars
- pattern = re.compile('[^\w -]', re.UNICODE)
+ pattern = re.compile("[^'\w -]", re.UNICODE)
return pattern.sub(' ', text)
-def is_vowels(chunk, with_h = False, with_y = True):
+def is_vowels(chunk, with_h=False, with_y=True):
+ """Test if a chunk consists only of vowels
+
+ with_h counts 'h' as vowel, with_y allows 'y'"""
+
if not with_y and chunk == 'y':
return False
for char in strip_accents(chunk):
@@ -44,6 +62,28 @@ def is_vowels(chunk, with_h = False, with_y = True):
return False
return True
-def normalize(text):
- return norm_spaces(rm_punct(text.lower())).rstrip().lstrip()
+def is_consonants(chunk):
+ """Test if a chunk consists only of consonants"""
+
+ for char in strip_accents(chunk):
+ if char not in consonants:
+ return False
+ return True
+
+def normalize(text, with_apostrophe=False):
+ """Normalize text, i.e. lowercase, no useless punctuation or whitespace"""
+ return norm_spaces(rm_punct(text.lower(), with_apostrophe)).rstrip().lstrip()
+
+def subst(string, subs):
+ if len(subs) == 0:
+ return string
+ return subst(string.replace(subs[0][0], subs[0][1]), subs[1:])
+
+def to_xsampa(s):
+ """convert our modified format to x-sampa"""
+ return subst(s, SUBSTS)
+
+def from_xsampa(s):
+ """convert x-sampa to our modified format"""
+ return subst(s, [(x[1], x[0]) for x in SUBSTS])
diff --git a/metric.py b/metric.py
@@ -1,6 +1,8 @@
+#!/usr/bin/python
+#coding: utf-8
+
import re
-from common import strip_accents, normalize, is_vowels, consonants, \
- sure_end_fem
+from common import normalize, is_vowels, consonants, sure_end_fem
from vowels import possible_weights
import haspirater
@@ -62,7 +64,7 @@ def feminine(align, verse):
def parse(text, bound):
"""Return possible aligns for text, bound is an upper bound on the
- align length to limit cost"""
+ align length to limit running time"""
original_text = normalize(text)
@@ -77,7 +79,7 @@ def parse(text, bound):
words = text.split(' ')
words = [annotate_aspirated(word) for word in words if word != '']
- pattern = re.compile('('+consonants+'*)', re.UNICODE)
+ pattern = re.compile('(['+consonants+'*-]*)', re.UNICODE)
# cut each word in chunks of vowels and consonants, with some specific
# kludges
diff --git a/vowels.py b/vowels.py
@@ -1,14 +1,19 @@
#!/usr/bin/python3
+#coding: utf-8
+
+"""Compute the number of syllables taken by a vowel chunk"""
from common import strip_accents
def contains_trema(chunk):
+ """Test if a string contains a vowel bearing a trema (diaeresis)"""
for x in ['ä', 'ï', 'ö', 'ü', 'ÿ']:
if x in chunk:
return True
return False
def possible_weights(chunk):
+ """Return the possible numbers of syllables taken by a vowel chunk"""
if len(chunk) == 1:
return [1]
# old spelling and weird exceptions
@@ -19,7 +24,6 @@ def possible_weights(chunk):
if contains_trema(chunk):
return [2]
chunk = strip_accents(chunk, True)
- # TODO 'ée' ? ('déesse')
if chunk in ['ai', 'ou', 'eu', 'ei', 'eau', 'eoi', 'eui', 'au', 'oi',
'oie', 'œi', 'œu', 'eaie', 'aie', 'oei', 'oeu', 'ea', 'ae', 'eo',
'eoie', 'oe', 'eai', 'eue', 'aa', 'oo', 'ee', 'ii', 'aii',
@@ -28,6 +32,7 @@ def possible_weights(chunk):
for x in ['oa', 'ea', 'eua', 'ao', 'euo', 'ua', 'uo', 'yo', 'yau']:
if x in chunk:
return [2]
+ # beware of "déesse"
if chunk == 'ée':
return [1, 2]
if chunk[0] == 'i':
@@ -38,7 +43,7 @@ def possible_weights(chunk):
return [1, 2]
if 'é' in chunk or 'è' in chunk:
return [2]
- # only non-accented left
# TODO hmm
return [1, 2]
+