plint

French poetry validator (local mirror of https://gitlab.com/a3nm/plint)
git clone https://a3nm.net/git/plint/
Log | Files | Refs | README

commit eac921c1c85cc61a8af18348990075021bb29895
parent 36ce1261259dea64a6d8285b4dc94ea116e7c3ee
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Tue,  1 Dec 2020 11:17:23 +0100

fix crash with bad leading characters with haspirater

Diffstat:
plint/chunk.py                   |  6 ++++--
plint/common.py                  | 15 +++++++++++----
plint/tests/test_sanity_check.py |  4 ++++
3 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/plint/chunk.py b/plint/chunk.py
@@ -615,7 +615,8 @@ class Chunk:
             self.causes_hiatus = True
         elif word[0] == 'h':
             result = list(map((lambda s: not s),
-                              haspirater.lookup(normalize(original_word))))
+                              haspirater.lookup(normalize(original_word,
+                                  rm_all_begin=True))))
             if len(result) == 1 and True in result:
                 self.causes_hiatus = True
 
@@ -679,7 +680,8 @@ def elision(word, original_word, was_cap):
             return [True, False]
         # look up in haspirater using the original (but normalized) word
         return list(map((lambda s: not s),
-                        haspirater.lookup(normalize(original_word))))
+                        haspirater.lookup(normalize(original_word,
+                            rm_all_begin=True))))
     if is_vowels(word[0]):
         return [True]
     return [False]
diff --git a/plint/common.py b/plint/common.py
@@ -45,7 +45,8 @@ def normalize_spaces(text):
     return re.sub(r"\s+-*\s*", ' ', text)
 
 
-def remove_punctuation(text, rm_all=False, rm_apostrophe=False, rm_apostrophe_end=True):
+def remove_punctuation(text, rm_all=False, rm_apostrophe=False,
+        rm_apostrophe_end=True, rm_all_begin=False):
     """Remove punctuation from text"""
     text = re.sub("[" + APOSTROPHES + "]", "'", text) # no weird apostrophes
     if rm_apostrophe:
@@ -60,8 +61,13 @@ def remove_punctuation(text, rm_all=False, rm_apostrophe=False, rm_apostrophe_en
 
     # TODO rather: keep only good chars
     if not rm_all:
+        if rm_all_begin:
+            pattern = re.compile(r"^[^\w]*", re.UNICODE)
+            text2b = pattern.sub(' ', text)
+        else:
+            text2b = text
         pattern = re.compile(r"[^'\w -]", re.UNICODE)
-        text2 = pattern.sub(' ', text)
+        text2 = pattern.sub(' ', text2b)
     else:
         pattern = re.compile(r"[^\w]", re.UNICODE)
         text2 = pattern.sub('', text)
@@ -89,11 +95,12 @@ def is_consonants(chunk_text):
 
 
 def normalize(text, downcase=True, rm_all=False, rm_apostrophe=False,
-        rm_apostrophe_end=True, strip=True):
+        rm_apostrophe_end=True, rm_all_begin=False, strip=True):
     """Normalize text, ie. lowercase, no useless punctuation or whitespace"""
     res = normalize_spaces(remove_punctuation(text.lower() if downcase else text,
         rm_all=rm_all, rm_apostrophe=rm_apostrophe,
-        rm_apostrophe_end=rm_apostrophe_end))
+        rm_apostrophe_end=rm_apostrophe_end,
+        rm_all_begin=rm_all_begin))
     if strip:
         return res.rstrip().lstrip()
     else:
diff --git a/plint/tests/test_sanity_check.py b/plint/tests/test_sanity_check.py
@@ -44,6 +44,10 @@ class SanityCheck(unittest.TestCase):
         text = "-----"
         self.assertEqual(common.normalize(text), "")
 
+    def testAspiratedJunk(self):
+        text = "---''humain"
+        self.assertEqual(common.normalize(text, rm_all_begin=True), "humain")
+
 
 if __name__ == "__main__":
     unittest.main()