fix crash with bad leading characters with haspirater - plint - French poetry validator (local mirror of https://gitlab.com/a3nm/plint)

commit eac921c1c85cc61a8af18348990075021bb29895
parent 36ce1261259dea64a6d8285b4dc94ea116e7c3ee
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Tue,  1 Dec 2020 11:17:23 +0100

fix crash with bad leading characters with haspirater

Diffstat:
plint/chunk.py  | 6 ++++--
plint/common.py  | 15 +++++++++++----
plint/tests/test_sanity_check.py  | 4 ++++

3 files changed, 19 insertions(+), 6 deletions(-)
diff --git a/plint/chunk.py b/plint/chunk.py
@@ -615,7 +615,8 @@ class Chunk:
                 self.causes_hiatus = True
         elif word[0] == 'h':
             result = list(map((lambda s: not s),
-                            haspirater.lookup(normalize(original_word))))
+                            haspirater.lookup(normalize(original_word,
+                                rm_all_begin=True))))
             if len(result) == 1 and True in result:
                 self.causes_hiatus = True
 
@@ -679,7 +680,8 @@ def elision(word, original_word, was_cap):
             return [True, False]
         # look up in haspirater using the original (but normalized) word
         return list(map((lambda s: not s),
-                        haspirater.lookup(normalize(original_word))))
+                        haspirater.lookup(normalize(original_word,
+                            rm_all_begin=True))))
     if is_vowels(word[0]):
         return [True]
     return [False]
diff --git a/plint/common.py b/plint/common.py
@@ -45,7 +45,8 @@ def normalize_spaces(text):
     return re.sub(r"\s+-*\s*", ' ', text)
 
 
-def remove_punctuation(text, rm_all=False, rm_apostrophe=False, rm_apostrophe_end=True):
+def remove_punctuation(text, rm_all=False, rm_apostrophe=False,
+        rm_apostrophe_end=True, rm_all_begin=False):
     """Remove punctuation from text"""
     text = re.sub("[" + APOSTROPHES + "]", "'", text)  # no weird apostrophes
     if rm_apostrophe:
@@ -60,8 +61,13 @@ def remove_punctuation(text, rm_all=False, rm_apostrophe=False, rm_apostrophe_en
 
     # TODO rather: keep only good chars
     if not rm_all:
+        if rm_all_begin:
+            pattern = re.compile(r"^[^\w]*", re.UNICODE)
+            text2b = pattern.sub(' ', text)
+        else:
+            text2b = text
         pattern = re.compile(r"[^'\w -]", re.UNICODE)
-        text2 = pattern.sub(' ', text)
+        text2 = pattern.sub(' ', text2b)
     else:
         pattern = re.compile(r"[^\w]", re.UNICODE)
         text2 = pattern.sub('', text)
@@ -89,11 +95,12 @@ def is_consonants(chunk_text):
 
 
 def normalize(text, downcase=True, rm_all=False, rm_apostrophe=False,
-              rm_apostrophe_end=True, strip=True):
+              rm_apostrophe_end=True, rm_all_begin=False, strip=True):
     """Normalize text, ie. lowercase, no useless punctuation or whitespace"""
     res = normalize_spaces(remove_punctuation(text.lower() if downcase else text,
                                               rm_all=rm_all, rm_apostrophe=rm_apostrophe,
-                                              rm_apostrophe_end=rm_apostrophe_end))
+                                              rm_apostrophe_end=rm_apostrophe_end,
+                                              rm_all_begin=rm_all_begin))
     if strip:
         return res.rstrip().lstrip()
     else:
diff --git a/plint/tests/test_sanity_check.py b/plint/tests/test_sanity_check.py
@@ -44,6 +44,10 @@ class SanityCheck(unittest.TestCase):
         text = "-----"
         self.assertEqual(common.normalize(text), "")
 
+    def testAspiratedJunk(self):
+        text = "---''humain"
+        self.assertEqual(common.normalize(text, rm_all_begin=True), "humain")
+
 
 if __name__ == "__main__":
     unittest.main()

	plint French poetry validator (local mirror of https://gitlab.com/a3nm/plint)
	git clone https://a3nm.net/git/plint/
	Log | Files | Refs | README

plint/chunk.py  | 6 ++++--
plint/common.py  | 15 +++++++++++----
plint/tests/test_sanity_check.py  | 4 ++++