commit d89bfe7d0c5afcae1fc2e1f69d6f8b632793bffd
parent d55e8594bf5a8c30611cd7435200755c46cab728
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Fri, 27 Apr 2012 18:06:48 +0200
handle typographic apostrophes (’) and spaces after apostrophes
Diffstat:
1 file changed, 3 insertions(+), 0 deletions(-)
diff --git a/common.py b/common.py
@@ -43,8 +43,11 @@ def norm_spaces(text):
def rm_punct(text, with_apostrophe = False):
"""Remove punctuation from text"""
+ text = re.sub("’", "'", text) # no weird apostrophes
+ text = re.sub("' ", "'", text) # space after apostrophes
if not with_apostrophe:
text = re.sub("'", '', text)
+
#TODO rather: keep only good chars
pattern = re.compile("[^'\w -]", re.UNICODE)
text2 = pattern.sub(' ', text)