commit d3ee60085c5eb8647c2e6c23f1014acc973a0cdf
parent a2dc431e2d39f50220fc9d6687c6a231ebacd0bd
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Sun, 5 Jan 2014 14:42:31 +0100
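fix problem with apostrophes and rhyme: add an rm_apostrophe_end option to rm_punct/normalize (default True) and have Verse.normalized pass False so chunk-final apostrophes are kept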
Diffstat:
2 files changed, 8 insertions(+), 5 deletions(-)
diff --git a/common.py b/common.py
@@ -43,13 +43,14 @@ def norm_spaces(text):
"""Remove multiple consecutive whitespace"""
return re.sub("\s+-*\s*", ' ', text)
-def rm_punct(text, rm_all=False, rm_apostrophe=False):
+def rm_punct(text, rm_all=False, rm_apostrophe=False, rm_apostrophe_end=True):
"""Remove punctuation from text"""
text = re.sub("[" + apostrophes + "]", "'", text) # no weird apostrophes
text = re.sub("' *", "'", text) # space after apostrophes
if rm_apostrophe:
text = re.sub("'", "", text)
- text = re.sub("'*$", "", text) # apostrophes at end of line
+ if rm_apostrophe_end:
+ text = re.sub("'*$", "", text) # apostrophes at end of line
text = re.sub("[‒–—―⁓⸺⸻]", " ", text) # no weird dashes
#TODO rather: keep only good chars
@@ -83,10 +84,12 @@ def is_consonants(chunk):
return False
return True
-def normalize(text, downcase=True, rm_all=False, rm_apostrophe=False, strip=True):
+def normalize(text, downcase=True, rm_all=False, rm_apostrophe=False,
+ rm_apostrophe_end=True, strip=True):
"""Normalize text, ie. lowercase, no useless punctuation or whitespace"""
res = norm_spaces(rm_punct(text.lower() if downcase else text,
- rm_all=rm_all, rm_apostrophe=rm_apostrophe))
+ rm_all=rm_all, rm_apostrophe=rm_apostrophe,
+ rm_apostrophe_end=rm_apostrophe_end))
if strip:
return res.rstrip().lstrip()
else:
diff --git a/verse.py b/verse.py
@@ -71,7 +71,7 @@ class Verse:
@property
def normalized(self):
- return ''.join(normalize(x['original'], strip=False)
+ return ''.join(normalize(x['original'], strip=False, rm_apostrophe_end=False)
if 'text_pron' not in x.keys() else x['text']
for x in self.chunks).lstrip().rstrip()