commit da866fcbd6d798f5ba5f73bfb224b8f95dde1857
parent 3e6eb41e7c0b2a85b5fd4230cc3801e5d92278b9
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Sat, 21 Sep 2013 11:08:16 +0200
fix handling of apostrophes followed by spaces (strip every space after an apostrophe, merge the split chunks)
Diffstat:
2 files changed, 17 insertions(+), 3 deletions(-)
diff --git a/common.py b/common.py
@@ -6,6 +6,7 @@ import re
vowels = 'aeiouyϾ'
consonants = "bcçdfghjklmnpqrstvwxzñ'"
+apostrophes = "'’"
legal = vowels + consonants + ' -'
# a variant of x-sampa such that all french phonemes are one-character
@@ -44,8 +45,8 @@ def norm_spaces(text):
def rm_punct(text, rm_all=False, rm_apostrophe=False):
"""Remove punctuation from text"""
- text = re.sub("’", "'", text) # no weird apostrophes
- text = re.sub("' ", "'", text) # space after apostrophes
+ text = re.sub("[" + apostrophes + "]", "'", text) # no weird apostrophes
+ text = re.sub("' *", "'", text) # space after apostrophes
if rm_apostrophe:
text = re.sub("'", "", text)
text = re.sub("'*$", "", text) # apostrophes at end of line
diff --git a/verse.py b/verse.py
@@ -1,7 +1,7 @@
#!/usr/bin/python3
import common
-from common import consonants, normalize, is_consonants, is_vowels, sure_end_fem, strip_accents_one
+from common import apostrophes, consonants, normalize, is_consonants, is_vowels, sure_end_fem, strip_accents_one
import re
import vowels
import haspirater
@@ -66,6 +66,19 @@ class Verse:
self.chunks = [[{'original': y, 'text': normalize(y, rm_apostrophe=True)}
for y in x] for x in pre_chunks]
+ # collapse apostrophes: merge each chunk group whose last chunk ends in an apostrophe into the following group
+ self.chunks2 = []
+ acc = []
+ for w in self.chunks:
+ if re.search("[" + apostrophes + "]\s*$", w[-1]['original']):
+ acc += w
+ else:
+ self.chunks2.append(acc + w)
+ acc = []
+ if len(acc) > 0:
+ self.chunks2.append(acc)
+ self.chunks = self.chunks2
+
# check forbidden characters
for w in self.chunks:
for y in w: