commit eac921c1c85cc61a8af18348990075021bb29895
parent 36ce1261259dea64a6d8285b4dc94ea116e7c3ee
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Tue, 1 Dec 2020 11:17:23 +0100
fix crash in haspirater lookup on words with bad leading characters
Diffstat:
3 files changed, 19 insertions(+), 6 deletions(-)
diff --git a/plint/chunk.py b/plint/chunk.py
@@ -615,7 +615,8 @@ class Chunk:
self.causes_hiatus = True
elif word[0] == 'h':
result = list(map((lambda s: not s),
- haspirater.lookup(normalize(original_word))))
+ haspirater.lookup(normalize(original_word,
+ rm_all_begin=True))))
if len(result) == 1 and True in result:
self.causes_hiatus = True
@@ -679,7 +680,8 @@ def elision(word, original_word, was_cap):
return [True, False]
# look up in haspirater using the original (but normalized) word
return list(map((lambda s: not s),
- haspirater.lookup(normalize(original_word))))
+ haspirater.lookup(normalize(original_word,
+ rm_all_begin=True))))
if is_vowels(word[0]):
return [True]
return [False]
diff --git a/plint/common.py b/plint/common.py
@@ -45,7 +45,8 @@ def normalize_spaces(text):
return re.sub(r"\s+-*\s*", ' ', text)
-def remove_punctuation(text, rm_all=False, rm_apostrophe=False, rm_apostrophe_end=True):
+def remove_punctuation(text, rm_all=False, rm_apostrophe=False,
+ rm_apostrophe_end=True, rm_all_begin=False):
"""Remove punctuation from text"""
text = re.sub("[" + APOSTROPHES + "]", "'", text) # no weird apostrophes
if rm_apostrophe:
@@ -60,8 +61,13 @@ def remove_punctuation(text, rm_all=False, rm_apostrophe=False, rm_apostrophe_en
# TODO rather: keep only good chars
if not rm_all:
+ if rm_all_begin:
+ pattern = re.compile(r"^[^\w]*", re.UNICODE)
+ text2b = pattern.sub(' ', text)
+ else:
+ text2b = text
pattern = re.compile(r"[^'\w -]", re.UNICODE)
- text2 = pattern.sub(' ', text)
+ text2 = pattern.sub(' ', text2b)
else:
pattern = re.compile(r"[^\w]", re.UNICODE)
text2 = pattern.sub('', text)
@@ -89,11 +95,12 @@ def is_consonants(chunk_text):
def normalize(text, downcase=True, rm_all=False, rm_apostrophe=False,
- rm_apostrophe_end=True, strip=True):
+ rm_apostrophe_end=True, rm_all_begin=False, strip=True):
"""Normalize text, ie. lowercase, no useless punctuation or whitespace"""
res = normalize_spaces(remove_punctuation(text.lower() if downcase else text,
rm_all=rm_all, rm_apostrophe=rm_apostrophe,
- rm_apostrophe_end=rm_apostrophe_end))
+ rm_apostrophe_end=rm_apostrophe_end,
+ rm_all_begin=rm_all_begin))
if strip:
return res.rstrip().lstrip()
else:
diff --git a/plint/tests/test_sanity_check.py b/plint/tests/test_sanity_check.py
@@ -44,6 +44,10 @@ class SanityCheck(unittest.TestCase):
text = "-----"
self.assertEqual(common.normalize(text), "")
+ def testAspiratedJunk(self):
+ text = "---''humain"
+ self.assertEqual(common.normalize(text, rm_all_begin=True), "humain")
+
if __name__ == "__main__":
unittest.main()