plint

French poetry validator (local mirror of https://gitlab.com/a3nm/plint)
git clone https://a3nm.net/git/plint/

commit bdf7000c91d00282d7bc5595459458abbbbbe4df
parent b308017c164fe7bd94a6be8a2c3993a35db1cee0
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Thu, 15 Aug 2019 15:26:47 +0200

Merge gitlab.com:a3nm/plint

Merge my own commits with Julien's

Diffstat:
.gitignore | 3+++
compare_test_output.py | 12++++++++++++
lexique_comparison/count_syllables_plint.py | 4+++-
plint.py | 110++++++++++++++++++++++++++++++++++++++++----------------------------------------
plint/chunk.py | 595+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
plint/chunks.py | 554+++++++++++--------------------------------------------------------------------
plint/common.py | 7+++++++
plint/error.py | 6+++---
plint/pattern.py | 32++++++++++++++++++++++++++++++++
plint/plint_irc.py | 2+-
plint/plint_web.py | 329++++++++++++++++++++++++++++++++++++++++++-------------------------------------
plint/template.py | 547+++++++++++++++++++++++++++++++++++++------------------------------------------
plint/tests/test_bad_chars.py | 5+++--
plint/tests/test_counts.py | 3++-
plint/tests/test_eliminate.py | 5+++--
plint/tests/test_gender.py | 13+++++++------
plint/tests/test_hiatus.py | 13+++++++------
plint/tests/test_sanity_check.py | 11++++++-----
plint/tests/test_sanity_check2.py | 3++-
plint/verse.py | 32++++++++++++++------------------
plint/vowels.py | 128-------------------------------------------------------------------------------
test.sh | 24++++++++++++++++++++++--
22 files changed, 1280 insertions(+), 1158 deletions(-)

diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,6 @@
 __pycache__/*
+.idea
+Lexique382.tsv
 frhyme
 frhyme/*
 haspirater
@@ -35,3 +37,4 @@
 final_syneresis2.ctx
 coverage
 .coverage
 ouliplint/stanford-postagger-full-2013-11-12/
+test_temp.txt
diff --git a/compare_test_output.py b/compare_test_output.py
@@ -0,0 +1,12 @@
+import sys
+
+file0 = sys.argv[1]
+file1 = sys.argv[2]
+
+with open(file0) as f:
+    content0 = f.read()
+
+with open(file1) as f:
+    content1 = f.read()
+
+print(int(sorted(content0) == sorted(content1)))
diff --git a/lexique_comparison/count_syllables_plint.py b/lexique_comparison/count_syllables_plint.py
@@ -4,12 +4,14 @@ import os
 import sys
 
 # modules are in the parent folder
+import plint.pattern
+
 sys.path.insert(1, os.path.join(sys.path[0], '..'))
 
 from plint import template, verse, rhyme
 
 templateobj = template.Template()
-patternobj = template.Pattern("12")
+patternobj = plint.pattern.Pattern("12")
 
 for l in sys.stdin.readlines():
     w = (l.strip().split("\t"))[0]
diff --git a/plint.py b/plint.py
@@ -5,64 +5,64 @@ import sys
 
 
 def run():
-  ok = True
-  f2 = None
-  nsyl = None
-  offset = 0
-  if len(sys.argv) >= 4:
-    f2 = open(sys.argv[3], 'w')
-  if len(sys.argv) >= 5:
-    nsyl = int(sys.argv[4])
-  if len(sys.argv) == 6:
-    offset = int(sys.argv[5])
-  should_end = False
-  while True:
-    line = sys.stdin.readline()
-    if not line:
-      should_end = True
-      line = ""
-    errors = template.check(line, f2, last=should_end, nsyl=nsyl, offset=offset)
-    if errors:
-      print(errors.report(), file=sys.stderr)
-      ok = False
-    if should_end:
-      break
-  return ok
+    ok = True
+    f2 = None
+    nsyl = None
+    offset = 0
+    if len(sys.argv) >= 4:
+        f2 = open(sys.argv[3], 'w')
+    if len(sys.argv) >= 5:
+        nsyl = int(sys.argv[4])
+    if len(sys.argv) == 6:
+        offset = int(sys.argv[5])
+    should_end = False
+    while True:
+        line = sys.stdin.readline()
+        if not line:
+            should_end = True
+            line = ""
+        errors = template.check(line, f2, last=should_end, n_syllables=nsyl, offset=offset)
+        if errors:
+            print(errors.report(), file=sys.stderr)
+            ok = False
+        if should_end:
+            break
+    return ok
 
 
-if __name__ == '__main__':
-  localization.init_locale()
-  if len(sys.argv) < 2 or len(sys.argv) > 6:
-    print(_("Usage: %s TEMPLATE [DFILE [OCONTEXT [NSYL [OFFSET]]]]") % sys.argv[0],
-          file=sys.stderr)
-    print(_("Check stdin according to TEMPLATE, report errors on stdout"),
-          file=sys.stderr)
-    print(_("For internal use:"),
-          file=sys.stderr)
-    print(_("DFILE is the diaeresis file, OCONTEXT is the context output file"),
-          file=sys.stderr)
-    print(_("NSYL is the assigned weight to the last chunk (diaeresis training)"),
-          file=sys.stderr)
-    print(_("OFFSET is to add after the last chunk (diaeresis training)"),
-          file=sys.stderr)
-    sys.exit(2)
-  template_name = sys.argv[1]
-  if len(sys.argv) > 2:
-    diaeresis_name = sys.argv[2]
-  else:
-    diaeresis_name = "../data/diaeresis.json"
-  diaeresis.set_diaeresis(diaeresis_name)
+if __name__ == '__main__':
+    localization.init_locale()
+    if len(sys.argv) < 2 or len(sys.argv) > 6:
+        print(_("Usage: %s TEMPLATE [DFILE [OCONTEXT [NSYL [OFFSET]]]]") % sys.argv[0],
+              file=sys.stderr)
+        print(_("Check stdin according to TEMPLATE, report errors on stdout"),
+              file=sys.stderr)
+        print(_("For internal use:"),
+              file=sys.stderr)
+        print(_("DFILE is the diaeresis file, OCONTEXT is the context output file"),
+              file=sys.stderr)
+        print(_("NSYL is the assigned weight to the last chunk (diaeresis training)"),
+              file=sys.stderr)
+        print(_("OFFSET is to add after the last chunk (diaeresis training)"),
+              file=sys.stderr)
+        sys.exit(2)
-  f = open(template_name)
-  x = f.read()
-  f.close()
+    template_name = sys.argv[1]
+    if len(sys.argv) > 2:
+        diaeresis_name = sys.argv[2]
+    else:
+        diaeresis_name = "../data/diaeresis.json"
+    diaeresis.set_diaeresis(diaeresis_name)
-  try:
-    template = template.Template(x)
-  except error.TemplateLoadError as e:
-    print("Could not load template %s: %s" % (template_name, e.msg), file=sys.stderr)
-    sys.exit(2)
+    f = open(template_name)
+    x = f.read()
+    f.close()
-  ok = run()
-  sys.exit(0 if ok else 1)
+    try:
+        template = template.Template(x)
+    except error.TemplateLoadError as e:
+        print("Could not load template %s: %s" % (template_name, e.msg), file=sys.stderr)
+        sys.exit(2)
+    ok = run()
+    sys.exit(0 if ok else 1)
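The compare_test_output.py script added above reports whether two files contain
the same multiset of characters, regardless of line order; this is how the
refactored code is validated against reference output (see the test.sh changes
in the diffstat). A minimal usage sketch, assuming both output files exist; the
file names here are made up for illustration:

    import subprocess

    # prints "1" if the two files contain the same characters up to
    # reordering, "0" otherwise
    result = subprocess.run(
        ["python3", "compare_test_output.py", "expected_output.txt", "test_temp.txt"],
        capture_output=True, text=True)
    print("outputs match" if result.stdout.strip() == "1" else "outputs differ")
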
diff --git a/plint/chunk.py b/plint/chunk.py
@@ -0,0 +1,595 @@
+import re
+
+from haspirater import haspirater
+from plint import common, diaeresis, error
+from plint.common import normalize, strip_accents_one, is_consonants, APOSTROPHES, is_vowels, get_consonants_regex, \
+    strip_accents, SURE_END_FEM
+from plint.vowels import contains_trema, intersperse
+
+
+DEFAULT_THRESHOLD = 3
+
+
+class Chunk:
+
+    def __init__(self, word, verse):
+        self.original = word
+        self.text = normalize(word, rm_apostrophe=True)
+        self.hemistiche = None
+        self.error = None
+        self.illegal_str = None
+        self.weights = None
+        self.had_hyphen = None
+        self.text_pron = None
+        self.elision = None
+        self.no_hiatus = None
+        self.elidable = None
+        self.word_end = False
+        # TODO What is a weight without s?
+        self.weight = None
+        self.verse = verse
+
+    def __repr__(self):
+        return "Chunk(" \
+               + "original:" + self.original \
+               + ", text:" + self.text \
+               + ", weights:" + str(self.weights or []) \
+               + ", weight:" + str(self.weight or "") \
+               + ", elidable:" + str(self.elidable or False) \
+               + ", elision:" + str(self.elision or False) \
+               + ", hemistiche:" + str(self.hemistiche) \
+               + ", error:" + str(self.error) \
+               + ", illegal_str:" + str(self.illegal_str) \
+               + ", had_hypher:" + str(self.had_hyphen) \
+               + ", text_pron:" + str(self.text_pron) \
+               + ", no_hiatus:" + str(self.no_hiatus) \
+               + ", word_end:" + str(self.word_end) \
+               + ")" + "\n"
+
+    def copy(self):
+        new_chunk = Chunk(self.original, self.verse)
+        new_chunk.original = self.original
+        new_chunk.text = self.text
+        new_chunk.hemistiche = self.hemistiche
+        new_chunk.error = self.error
+        new_chunk.illegal_str = self.illegal_str
+        new_chunk.weights = self.weights
+        new_chunk.had_hyphen = self.had_hyphen
+        new_chunk.text_pron = self.text_pron
+        new_chunk.elision = self.elision
+        new_chunk.no_hiatus = self.no_hiatus
+        new_chunk.elidable = self.elidable
+        new_chunk.word_end = self.word_end
+        new_chunk.weight = self.weight
+        return new_chunk
+
+    def set_hemistiche(self, hemis):
+        self.hemistiche = hemis
+
+    def check_forbidden_characters(self):
+        es = ""
+        for x in self.text:
+            if not common.remove_punctuation(strip_accents_one(x)[0].lower()) in common.LEGAL:
+                es += 'I'
+                self.error = "illegal"
+            else:
+                es += ' '
+        if self.error is not None and self.error == "illegal":
+            self.illegal_str = es
+
+    def simplify_gu_qu(self, next_chunk):
+        if next_chunk.text.startswith('u'):
+            if self.text.endswith('q'):
+                next_chunk.text = next_chunk.text[1:]
+                if next_chunk.text == '':
+                    self.original += next_chunk.original
+                    next_chunk.original = ''
+            if self.text.endswith('g') and len(next_chunk.text) >= 2:
+                if next_chunk.text[1] in "eéèa":
+                    next_chunk.text = next_chunk.text[1:]
+
+    def elide_inside_words(self, all_next_chunks):
+        if self.text == "e-":
+            self.weights = [0]  # force elision
+        next_chunk = all_next_chunks[0]
+        if self.text == "e" and next_chunk.text.startswith("-h"):
+            # collect what follows until the next hyphen or end
+            flw = next_chunk.original.split('-')[1]
+            for future_chunk in all_next_chunks[1:]:
+                flw += future_chunk.original.split('-')[0]
+                if '-' in future_chunk.original:
+                    break
+            # TODO: not sure if this reconstruction of the original word is bulletproof...
+            if haspirater.lookup(normalize(flw)):
+                self.weights = [0]
+            else:
+                self.weights = [1]
+
+    def remove_leading_and_trailing_crap(self):
+        seen_space = False
+        seen_hyphen = False
+        while len(self.text) > 0 and self.text[0] in ' -':
+            if self.text[0] == ' ':
+                seen_space = True
+            else:
+                seen_hyphen = True
+            self.text = self.text[1:]
+        while len(self.text) > 0 and self.text[-1] in ' -':
+            if self.text[-1] == ' ':
+                seen_space = True
+            else:
+                seen_hyphen = True
+            self.text = self.text[:-1]
+        if seen_hyphen and not seen_space:
+            self.had_hyphen = True
+
+    def is_empty(self):
+        return len(self.text) == 0
+
+    def add_original(self, other_chunk):
+        self.original += other_chunk.original
+
+    def create_acronym(self):
+        new_chunks = []
+        for j, character in enumerate(self.text):
+            try:
+                new_chunk_content = LETTERS[character]
+                # hack: the final 'e's in letters are just to help pronunciation
+                # inference and are only needed at end of word, otherwise they will
+                # mess syllable count up
+                if j < len(self.text) - 1 and new_chunk_content[-1] == 'e':
+                    new_chunk_content = new_chunk_content[:-1]
+            except KeyError:
+                new_chunk_content = character + 'é'
+            new_chunks += [(j, x) for x in re.split(get_consonants_regex(), new_chunk_content)]
+        new_chunks = [x for x in new_chunks if len(x[1]) > 0]
+        new_word = []
+        last_opos = -1
+        for j, (original_position, character) in enumerate(new_chunks):
+            part = ""
+            if j == len(new_chunks) - 1:
+                # don't miss final spaces
+                part = self.original[last_opos + 1:]
+            elif last_opos < original_position:
+                part = self.original[last_opos + 1:original_position + 1]
+                last_opos = original_position
+            # allow or forbid elision because of possible ending '-e' before
+            # forbid hiatus both for this and for preceding
+            # instruct that we must use text for the pronunciation
+            new_chunk = Chunk(part, self.verse)
+            new_chunk.original = part
+            new_chunk.text = character
+            new_chunk.text_pron = True
+            new_chunk.elision = [False, True]
+            new_chunk.no_hiatus = True
+            new_word.append(new_chunk)
+        # propagate information from splithyph
+        new_word[-1].hemistiche = self.hemistiche
+        return new_word
+
+    def check_elidable(self):
+        if self.text == 'e':
+            self.elidable = [True]
+
+    def is_consonants(self):
+        return is_consonants(self.text)
+
+    def ends_with_apostrophe(self):
+        return re.search("[" + APOSTROPHES + "]$", self.original) is not None
+
+    def elide_vowel_problems(self, chunk_group):
+        if self.elision is None:
+            self.elision = elision_wrap(chunk_group)
+
+    def process_y_cases(self, previous_chunk, next_chunk):
+        new_word_from_chunk = []
+        if 'y' not in self.text or len(self.text) == 1 or self.text.startswith("y"):
+            new_word_from_chunk.append(self)
+        else:
+            if previous_chunk is not None and next_chunk is not None:
+                # special cases of "pays", "alcoyle", "abbayes"
+                c_text = self.text
+                p_text = previous_chunk.text
+                n_text = next_chunk.text
+                # TODO Should you force if this condition does not apply?
+                if ((c_text == "ay" and p_text.endswith("p") and n_text.startswith("s"))
+                        or
+                        (c_text == "oy" and p_text.endswith("lc")
+                         and n_text.startswith("l"))
+                        or
+                        (c_text == "aye" and p_text.endswith("bb")
+                         and n_text.startswith("s"))):
+                    # force weight
+                    self.weights = [2]
+                    new_word_from_chunk.append(self)
+                    return new_word_from_chunk
+            must_force = next_chunk is None and previous_chunk is not None and \
+                (self.text == "aye" and previous_chunk.text.endswith("bb"))
+            if must_force:
+                # force weight
+                self.weights = [2]
+                new_word_from_chunk.append(self)
+            else:
+                sub_chunks = re.split(re.compile("(y+)"), self.text)
+                sub_chunks = [x for x in sub_chunks if len(x) > 0]
+                for j, sub_chunk in enumerate(sub_chunks):
+                    lindex = int(j * len(self.original) / len(sub_chunks))
+                    rindex = int((j + 1) * len(self.original) / len(sub_chunks))
+                    part = self.original[lindex:rindex]
+                    new_subchunk_text = 'Y' if 'y' in sub_chunk else sub_chunk
+                    new_subchunk = self.copy()
+                    new_subchunk.original = part
+                    new_subchunk.text = new_subchunk_text
+                    new_word_from_chunk.append(new_subchunk)
+        return new_word_from_chunk
+
+    def is_vowels(self):
+        return is_vowels(self.text)
+
+    def is_dash_elidable(self):
+        # "fais-le" not elidable, but "suis-je" and "est-ce" is
+        return not ('-' in self.text and not self.text.endswith('-j') and not self.text.endswith('-c'))
+
+    def check_elidable_with_next(self, next_chunk):
+        if self.elidable is None:
+            self.elidable = next_chunk.elision
+
+    def is_potentially_ambiguous_hiatus(self):
+        return self.text in ["ie", "ée", "ue"]
+
+    def ends_with_potentially_ambiguous_hiatus(self):
+        return len(self.text) >= 2 and self.text[-2:] in ["ie", "ée", "ue"]
+
+    def check_potentially_ambiguous_plural(self, previous_chunk):
+        if self.text == "s":
+            if previous_chunk.is_potentially_ambiguous_hiatus():
+                previous_chunk.error = "ambiguous"
+                self.error = "ambiguous"
+
+    def check_potentially_ambiguous_with_elision(self, next_chunk):
+        if self.ends_with_potentially_ambiguous_hiatus():
+            if next_chunk.elision is not None or True not in next_chunk.elision:
+                self.error = "ambiguous"
+                next_chunk.error = "ambiguous"
+
+    def check_hiatus(self, previous_chunk, next_chunk, only_two_parts):
+        if previous_chunk is not None:
+            self.check_potentially_ambiguous_plural(previous_chunk)
+        if self.ends_with_potentially_ambiguous_hiatus():
+            if not any(next_chunk.elision or [False]):
+                self.error = "ambiguous"
+                next_chunk.error = "ambiguous"
+
+        # elision concerns words ending with a vowel without a mute 'e'
+        # that have not been marked "no_hiatus"
+        # it also concerns specifically "et"
+        elif (not self.text.endswith('e') and self.no_hiatus is None
+              and (self.is_vowels() or self.text == 'Y')
+              or (only_two_parts and previous_chunk.text == 'e' and self.text == 't')):
+            # it happens if the next word is not marked no_hiatus
+            # and starts with something that causes elision
+            if all(next_chunk.elision) and next_chunk.no_hiatus is None:
+                self.error = "hiatus"
+                next_chunk.error = "hiatus"
+
+    def make_word_end(self):
+        self.word_end = True
+
+    def contains_break(self):
+        return '-' in self.text \
+               or self.word_end or False \
+               or self.had_hyphen or False
+
+    def is_e(self):
+        return self.text == "e"
+
+    def possible_weights_approx(self):
+        """Return the possible number of syllabes taken by a vowel chunk (permissive approximation)"""
+        chunk_text = self.text
+        if len(chunk_text) == 1:
+            return [1]
+        # old spelling and weird exceptions
+        if chunk_text in ['ouï']:
+            return [1, 2]  # TODO unsure about that
+        if chunk_text in ['eüi', 'aoû', 'uë']:
+            return [1]
+        if chunk_text in ['aïe', 'oë', 'ouü']:
+            return [1, 2]
+        if contains_trema(chunk_text):
+            return [2]
+        chunk_text = strip_accents(chunk_text, True)
+        if chunk_text in ['ai', 'ou', 'eu', 'ei', 'eau', 'eoi', 'eui', 'au', 'oi',
+                          'oie', 'œi', 'œu', 'eaie', 'aie', 'oei', 'oeu', 'ea', 'ae', 'eo',
+                          'eoie', 'oe', 'eai', 'eue', 'aa', 'oo', 'ee', 'ii', 'aii',
+                          'yeu', 'ye', 'you']:
+            return [1]
+        if chunk_text == "oua":
+            return [1, 2]  # "pouah"
+        if chunk_text == "ao":
+            return [1, 2]  # "paon"
+        for x in ['oa', 'ea', 'eua', 'euo', 'ua', 'uo', 'yau']:
+            if x in chunk_text:
+                return [2]
+        # beware of "déesse"
+        if chunk_text == 'ée':
+            return [1, 2]
+        if chunk_text[0] == 'i':
+            return [1, 2]
+        if chunk_text[0] == 'u' and (strip_accents(chunk_text[1]) in ['i', 'e']):
+            return [1, 2]
+        if chunk_text[0] == 'o' and chunk_text[1] == 'u' and len(chunk_text) >= 3 and \
+                strip_accents(chunk_text[2]) in ['i', 'e']:
+            return [1, 2]
+        if 'é' in chunk_text or 'è' in chunk_text:
+            return [2]
+        # we can't tell
+        return [1, 2]
+
+    def clear(self):
+        if self.word_end is None or not self.word_end:
+            return self.text
+        return self.text + ' '
+
+    def set_possible_weights_from_context(self, chunks_before, chunks_after, template, threshold):
+        if self.weights is not None:
+            return
+        if len(chunks_after) > 0:
+            next_chunk = chunks_after[0]
+        else:
+            next_chunk = None
+
+        if len(chunks_before) > 0:
+            previous_chunk = chunks_before[-1]
+        else:
+            previous_chunk = None
+
+        if len(chunks_before) > 1:
+            previous_previous_chunk = chunks_before[-2]
+        else:
+            previous_previous_chunk = None
+
+        if ((len(chunks_after) <= 1 and self.is_e())
+                and not (next_chunk is not None and next_chunk.is_vowels())
+                and not (previous_chunk is None or previous_chunk.contains_break())
+                and not (previous_previous_chunk is None or previous_previous_chunk.contains_break())):
+            # special case for verse endings, which can get elided (or not)
+            # but we don't elide lone syllables ("prends-le", etc.)
+
+            if next_chunk is None:
+                self.weights = [0]  # ending 'e' is elided
+            elif next_chunk.text == 's':
+                self.weights = [0]  # ending 'es' is elided
+            elif next_chunk.text == 'nt':
+                # ending 'ent' is sometimes elided, try to use pronunciation
+                # actually, this will have an influence on the rhyme's gender
+                # see feminine
+                possible = []
+                if not self.verse.phon or len(self.verse.phon) == 0:
+                    self.weights = [0, 1]  # do something reasonable without pron
+                else:
+                    for possible_phon in self.verse.phon:
+                        if possible_phon.endswith(')') or possible_phon.endswith('#'):
+                            possible.append(1)
+                        else:
+                            possible.append(0)
+                    self.weights = possible
+            else:
+                self.weights = self.possible_weights(chunks_before, chunks_after, template, threshold)
+        elif (next_chunk is None and self.text == 'e' and
+              previous_chunk is not None and (previous_chunk.text.endswith('-c')
+                                              or previous_chunk.text.endswith('-j')
+                                              or (previous_chunk.text == 'c'
+                                                  and previous_chunk.had_hyphen is not None)
+                                              or (previous_chunk.text == 'j'
+                                                  and previous_chunk.had_hyphen is not None))):
+            self.weights = [0]  # -ce and -je are elided
+        elif next_chunk is None and self.text in ['ie', 'ée']:
+            self.weights = [1]
+        # elide "-ée" and "-ées", but be specific (beware of e.g. "réel")
+        elif (len(chunks_after) <= 1
+              and self.text == 'ée'
+              and (next_chunk is None or chunks_after[-1].text == 's')):
+            self.weights = [1]
+        elif self.elidable is not None:
+            self.weights = [int(not x) for x in self.elidable]
+        else:
+            self.weights = self.possible_weights(chunks_before, chunks_after, template, threshold)
+
+    def possible_weights(self, chunks_before, chunks_after, template, threshold):
+        if template.options['diaeresis'] == "classical":
+            return self.possible_weights_ctx(chunks_before, chunks_after, threshold=threshold)
+        elif template.options['diaeresis'] == "permissive":
+            return self.possible_weights_approx()
+
+    def possible_weights_ctx(self, chunks_before, chunks_after, threshold=None):
+        if not threshold:
+            threshold = DEFAULT_THRESHOLD
+        q = self.make_query(chunks_before, chunks_after)
+        v = diaeresis.diaeresis_finder.lookup(q)
+        if len(v.keys()) == 1 and v[list(v.keys())[0]] > threshold:
+            return [int(list(v.keys())[0])]
+        else:
+            return self.possible_weights_seed()
+
+    def make_query(self, chunks_before, chunks_after):
+        cleaned_before = [chunk.clear() for chunk in chunks_before]
+        cleaned_after = [chunk.clear() for chunk in chunks_after]
+        current_clear = self.clear()
+        if current_clear.endswith(' '):
+            current_clear = current_clear.rstrip()
+        if len(cleaned_after) > 0:
+            cleaned_after[0] = " " + cleaned_after[0]
+        else:
+            cleaned_after.append(' ')
+        ret2 = intersperse(
+            ''.join(cleaned_after),
+            ''.join([x[::-1] for x in cleaned_before[::-1]]))
+        ret = [current_clear] + ret2
+        return ret
+
+    def possible_weights_seed(self):
+        """Return the possible number of syllabes taken by a vowel chunk"""
+        if len(self.text) == 1:
+            return [1]
+        # dioïde, maoïste, taoïste
+        if (self.text[-1] == 'ï' and len(self.text) >= 3 and not
+                self.text[-3:-1] == 'ou'):
+            return [3]
+        # ostéoarthrite
+        if "éoa" in self.text:
+            return [3]
+        # antiaérien; but let's play it safe
+        if "iaé" in self.text:
+            return [2, 3]
+        # giaour, miaou, niaouli
+        if "iaou" in self.text:
+            return [2, 3]
+        # bioélectrique
+        if "ioé" in self.text:
+            return [2, 3]
+        # méiose, nucléion, etc.
+        if "éio" in self.text:
+            return [2, 3]
+        # radioactif, radioamateur, etc.
+        if "ioa" in self.text:
+            return [2, 3]
+        # pléiade
+        if "éio" in self.text:
+            return [2, 3]
+        # pompéien, tarpéien...
+        # in theory the "-ie" should give a diaeresis, so 3 syllabes
+        # let's keep the benefit of the doubt...
+        # => this also gives 3 as a possibility for "obéie"...
+        if "éie" in self.text:
+            return [2, 3]
+        # tolstoïen
+        # same remark
+        if "oïe" in self.text:
+            return [2, 3]
+        # shanghaïen (diaeresis?), but also "aië"
+        if "aïe" in self.text:
+            return [1, 2, 3]
+        if self.text in ['ai', 'ou', 'eu', 'ei', 'eau', 'au', 'oi']:
+            return [1]
+        # we can't tell
+        return [1, 2]
+
+    def set_hemistiche_from_context(self, previous_previous_chunk, previous_chunk, next_chunk):
+        if self.hemistiche is not None:
+            return
+        ending = self.text
+        if not (self.word_end or False) and next_chunk is not None:
+            if not (next_chunk.word_end or False):
+                self.hemistiche = "cut"
+                return
+            ending += next_chunk.text
+        if ending in SURE_END_FEM and previous_previous_chunk is not None and previous_chunk is not None:
+            # check that this isn't a one-syllabe wourd (which is allowed)
+            ok = False
+            try:
+                if '-' in previous_chunk.original or (previous_chunk.word_end or False):
+                    ok = True
+                if '-' in previous_previous_chunk.original or (previous_previous_chunk.word_end or False):
+                    ok = True
+            except IndexError:
+                pass
+            if not ok:
+                # hemistiche ends in feminine
+                if any(self.elidable or [False]):
+                    self.hemistiche = "elid"  # elidable final -e, but only OK if actually elided
+                    return
+                else:
+                    self.hemistiche = "fem"
+                    return
+        self.hemistiche = "ok"
+
+    def normalize(self):
+        if self.text_pron is None:
+            return normalize(self.original, strip=False, rm_apostrophe_end=False)
+        else:
+            return self.text
+
+    def get_original_text(self):
+        return self.original
+
+    def get_errors_set(self, forbidden_ok, hiatus_ok):
+        errors_chunk = set()
+        if self.error is not None:
+            if self.error == "ambiguous" and not forbidden_ok:
+                errors_chunk.add(error.ErrorForbiddenPattern)
+            if self.error == "hiatus" and not hiatus_ok:
+                errors_chunk.add(error.ErrorHiatus)
+            if self.error == "illegal":
+                errors_chunk.add(error.ErrorBadCharacters)
+        return errors_chunk
+
+
+LETTERS = {
+    'f': 'effe',
+    'h': 'ache',
+    'j': 'gi',
+    'k': 'ka',
+    'l': 'elle',
+    'm': 'aime',
+    'n': 'aine',
+    'q': 'cu',
+    'r': 'ère',
+    's': 'esse',
+    'w': 'doublevé',
+    'x': 'ixe',
+    'z': 'zaide'
+}
+
+
+def elision_wrap(chunk_group):
+    first_letter = common.remove_punctuation(chunk_group[0].original.strip())
+    temp = elision(''.join(chunk.text for chunk in chunk_group),
+                   ''.join(chunk.original for chunk in chunk_group),
+                   first_letter == first_letter.upper())
+    return temp
+
+
+def elision(word, original_word, was_cap):
+    if word.startswith('y'):
+        if word == 'y':
+            return [True]
+        if was_cap:
+            if word == 'york':
+                return [False]
+            # Grevisse, Le Bon usage, 14th ed., paragraphs 49-50
+            # depends on whether it's French or foreign...
+            return [True, False]
+        else:
+            exc = ["york", "yeux", "yeuse", "ypérite"]
+            for w in exc:
+                if word.startswith(w):
+                    return [True]
+            # otherwise, no elision
+            return [False]
+    if word in ["oui", "ouis"]:
+        # elision for those words, but beware, no elision for "ouighour"
+        # boileau : "Ont l'esprit mieux tourné que n'a l'homme ? Oui sans doute."
+        # so elision sometimes
+        return [True, False]
+    if word.startswith("ouistiti") or word.startswith("ouagadougou"):
+        return [False]
+    # "un", "une" are non-elided as nouns ("cette une")
+    if word in ["un", "une"]:
+        return [True, False]
+    # "onze" is not elided
+    if word == "onze":
+        return [False]
+    if word.startswith('ulul'):
+        return [False]  # ululement, ululer, etc.
+    if word.startswith('uhlan'):
+        return [False]  # uhlan
+    if word[0] == 'h':
+        if word == "huis":
+            # special case, "huis" is elided but "huis clos" isn't
+            return [True, False]
+        # look up in haspirater using the original (but normalized) word
+        return list(map((lambda s: not s),
+                        haspirater.lookup(normalize(original_word))))
+    if is_vowels(word[0]):
+        return [True]
+    return [False]
diff --git a/plint/chunks.py b/plint/chunks.py
@@ -2,298 +2,18 @@ import re
 import sys
 from pprint import pprint
 
-from haspirater import haspirater
-from plint import common, vowels
-from plint.common import is_vowels, APOSTROPHES, is_consonants, normalize, strip_accents_one, CONSONANTS, SURE_END_FEM
+from plint.chunk import Chunk
+from plint.common import normalize, get_consonants_regex
 from plint.hyphen_splitter import HyphenSplitter
 
 
-class Chunk:
-
-    def __init__(self, word):
-        self.original = word
-        self.text = normalize(word, rm_apostrophe=True)
-        self.hemistiche = None
-        self.error = None
-        self.illegal_str = None
-        self.weights = None
-        self.had_hyphen = None
-        self.text_pron = None
-        self.elision = None
-        self.no_hiatus = None
-        self.elidable = None
-        self.word_end = False
-        # TODO What is a weight without s?
-        self.weight = None
-
-    def __repr__(self):
-        return "Chunk("\
-               + "original:" + self.original\
-               + ", text:" + self.text\
-               + ", weights:" + str(self.weights or [])\
-               + ", weight:" + str(self.weight or "")\
-               + ", elidable:" + str(self.elidable or False)\
-               + ", elision:" + str(self.elision or False)\
-               + ", hemistiche:" + str(self.hemistiche)\
-               + ", error:" + str(self.error)\
-               + ", illegal_str:" + str(self.illegal_str)\
-               + ", had_hypher:" + str(self.had_hyphen)\
-               + ", text_pron:" + str(self.text_pron)\
-               + ", no_hiatus:" + str(self.no_hiatus)\
-               + ", word_end:" + str(self.word_end)\
-               + ")" + "\n"
-
-    def copy(self):
-        new_chunk = Chunk(self.original)
-        new_chunk.original = self.original
-        new_chunk.text = self.text
-        new_chunk.hemistiche = self.hemistiche
-        new_chunk.error = self.error
-        new_chunk.illegal_str = self.illegal_str
-        new_chunk.weights = self.weights
-        new_chunk.had_hyphen = self.had_hyphen
-        new_chunk.text_pron = self.text_pron
-        new_chunk.elision = self.elision
-        new_chunk.no_hiatus = self.no_hiatus
-        new_chunk.elidable = self.elidable
-        new_chunk.word_end = self.word_end
-        new_chunk.weight = self.weight
-        return new_chunk
-
-    def set_hemistiche(self, hemis):
-        self.hemistiche = hemis
-
-    def check_forbidden_characters(self):
-        es = ""
-        for x in self.text:
-            if not common.remove_punctuation(strip_accents_one(x)[0].lower()) in common.LEGAL:
-                es += 'I'
-                self.error = "illegal"
-            else:
-                es += ' '
-        if self.error is not None and self.error == "illegal":
-            self.illegal_str = es
-
-    def simplify_gu_qu(self, next_chunk):
-        if next_chunk.text.startswith('u'):
-            if self.text.endswith('q'):
-                next_chunk.text = next_chunk.text[1:]
-                if next_chunk.text == '':
-                    self.original += next_chunk.original
-                    next_chunk.original = ''
-            if self.text.endswith('g') and len(next_chunk.text) >= 2:
-                if next_chunk.text[1] in "eéèa":
-                    next_chunk.text = next_chunk.text[1:]
-
-    def elide_inside_words(self, all_next_chunks):
-        if self.text == "e-":
-            self.weights = [0]  # force elision
-        next_chunk = all_next_chunks[0]
-        if self.text == "e" and next_chunk.text.startswith("-h"):
-            # collect what follows until the next hyphen or end
-            flw = next_chunk.original.split('-')[1]
-            for future_chunk in all_next_chunks[1:]:
-                flw += future_chunk.original.split('-')[0]
-                if '-' in future_chunk.original:
-                    break
-            # TODO: not sure if this reconstruction of the original word is bulletproof...
-            if haspirater.lookup(normalize(flw)):
-                self.weights = [0]
-            else:
-                self.weights = [1]
-
-    def remove_leading_and_trailing_crap(self):
-        seen_space = False
-        seen_hyphen = False
-        while len(self.text) > 0 and self.text[0] in ' -':
-            if self.text[0] == ' ':
-                seen_space = True
-            else:
-                seen_hyphen = True
-            self.text = self.text[1:]
-        while len(self.text) > 0 and self.text[-1] in ' -':
-            if self.text[-1] == ' ':
-                seen_space = True
-            else:
-                seen_hyphen = True
-            self.text = self.text[:-1]
-        if seen_hyphen and not seen_space:
-            self.had_hyphen = True
-
-    def is_empty(self):
-        return len(self.text) == 0
-
-    def add_original(self, other_chunk):
-        self.original += other_chunk.original
-
-    def create_sigles(self):
-        new_chunks = []
-        for j, character in enumerate(self.text):
-            try:
-                new_chunk_content = LETTERS[character]
-                # hack: the final 'e's in letters are just to help pronunciation
-                # inference and are only needed at end of word, otherwise they will
-                # mess syllable count up
-                if j < len(self.text) - 1 and new_chunk_content[-1] == 'e':
-                    new_chunk_content = new_chunk_content[:-1]
-            except KeyError:
-                new_chunk_content = character + 'é'
-            new_chunks += [(j, x) for x in re.split(get_consonants_regex(), new_chunk_content)]
-        new_chunks = [x for x in new_chunks if len(x[1]) > 0]
-        new_word = []
-        last_opos = -1
-        for j, (original_position, character) in enumerate(new_chunks):
-            part = ""
-            if j == len(new_chunks) - 1:
-                # don't miss final spaces
-                part = self.original[last_opos + 1:]
-            elif last_opos < original_position:
-                part = self.original[last_opos + 1:original_position + 1]
-                last_opos = original_position
-            # allow or forbid elision because of possible ending '-e' before
-            # forbid hiatus both for this and for preceding
-            # instruct that we must use text for the pronunciation
-            new_chunk = Chunk(part)
-            new_chunk.original = part
-            new_chunk.text = character
-            new_chunk.text_pron = True
-            new_chunk.elision = [False, True]
-            new_chunk.no_hiatus = True
-            new_word.append(new_chunk)
-        # propagate information from splithyph
-        new_word[-1].hemistiche = self.hemistiche
-        return new_word
-
-    def check_elidable(self):
-        if self.text == 'e':
-            self.elidable = [True]
-
-    def is_consonants(self):
-        return is_consonants(self.text)
-
-    def ends_with_apostrophe(self):
-        return re.search("[" + APOSTROPHES + "]$", self.original) is not None
-
-    def elide_vowel_problems(self, chunk_group):
-        if self.elision is None:
-            self.elision = elision_wrap(chunk_group)
-
-    def process_y_cases(self, previous_chunk, next_chunk):
-        new_word_from_chunk = []
-        if 'y' not in self.text or len(self.text) == 1 or self.text.startswith("y"):
-            new_word_from_chunk.append(self)
-        else:
-            if previous_chunk is not None and next_chunk is not None:
-                # special cases of "pays", "alcoyle", "abbayes"
-                c_text = self.text
-                p_text = previous_chunk.text
-                n_text = next_chunk.text
-                # TODO Should you force if this condition does not apply?
-                if ((c_text == "ay" and p_text.endswith("p") and n_text.startswith("s"))
-                        or
-                        (c_text == "oy" and p_text.endswith("lc")
-                         and n_text.startswith("l"))
-                        or
-                        (c_text == "aye" and p_text.endswith("bb")
-                         and n_text.startswith("s"))):
-                    # force weight
-                    self.weights = [2]
-                    new_word_from_chunk.append(self)
-                    return new_word_from_chunk
-            must_force = next_chunk is None and previous_chunk is not None and \
-                (self.text == "aye" and previous_chunk.text.endswith("bb"))
-            if must_force:
-                # force weight
-                self.weights = [2]
-                new_word_from_chunk.append(self)
-            else:
-                sub_chunks = re.split(re.compile("(y+)"), self.text)
-                sub_chunks = [x for x in sub_chunks if len(x) > 0]
-                for j, sub_chunk in enumerate(sub_chunks):
-                    lindex = int(j * len(self.original) / len(sub_chunks))
-                    rindex = int((j + 1) * len(self.original) / len(sub_chunks))
-                    part = self.original[lindex:rindex]
-                    new_subchunk_text = 'Y' if 'y' in sub_chunk else sub_chunk
-                    new_subchunk = self.copy()
-                    new_subchunk.original = part
-                    new_subchunk.text = new_subchunk_text
-                    new_word_from_chunk.append(new_subchunk)
-        return new_word_from_chunk
-
-    def is_vowels(self):
-        return is_vowels(self.text)
-
-    def is_dash_elidable(self):
-        # "fais-le" not elidable, but "suis-je" and "est-ce" is
-        return not ('-' in self.text and not self.text.endswith('-j') and not self.text.endswith('-c'))
-
-    def check_elidable_with_next(self, next_chunk):
-        if self.elidable is None:
-            self.elidable = next_chunk.elision
-
-    def is_potentially_ambiguous_hiatus(self):
-        return self.text in ["ie", "ée", "ue"]
-
-    def ends_with_potentially_ambiguous_hiatus(self):
-        return len(self.text) >= 2 and self.text[-2:] in ["ie", "ée", "ue"]
-
-    def check_potentially_ambiguous_plural(self, previous_chunk):
-        if self.text == "s":
-            if previous_chunk.is_potentially_ambiguous_hiatus():
-                previous_chunk.error = "ambiguous"
-                self.error = "ambiguous"
-
-    def check_potentially_ambiguous_with_elision(self, next_chunk):
-        if self.ends_with_potentially_ambiguous_hiatus():
-            if next_chunk.elision is not None or True not in next_chunk.elision:
-                self.error = "ambiguous"
-                next_chunk.error = "ambiguous"
-
-    def check_hiatus(self, previous_chunk, next_chunk, only_two_parts):
-        if previous_chunk is not None:
-            self.check_potentially_ambiguous_plural(previous_chunk)
-        if self.ends_with_potentially_ambiguous_hiatus():
-            if not any(next_chunk.elision or [False]):
-                self.error = "ambiguous"
-                next_chunk.error = "ambiguous"
-
-        # elision concerns words ending with a vowel without a mute 'e'
-        # that have not been marked "no_hiatus"
-        # it also concerns specifically "et"
-        elif (not self.text.endswith('e') and self.no_hiatus is None
-              and (self.is_vowels() or self.text == 'Y')
-              or (only_two_parts and previous_chunk.text == 'e' and self.text == 't')):
-            # it happens if the next word is not marked no_hiatus
-            # and starts with something that causes elision
-            if all(next_chunk.elision) and next_chunk.no_hiatus is None:
-                self.error = "hiatus"
-                next_chunk.error = "hiatus"
-
-    def make_word_end(self):
-        self.word_end = True
-
-    def contains_break(self):
-        return '-' in self.text \
-               or self.word_end or False \
-               or self.had_hyphen or False
-
-    def is_e(self):
-        return self.text == "e"
-
-
-def get_consonants_regex():
-    all_consonants = CONSONANTS + CONSONANTS.upper()
-    consonants_regexp = re.compile('([^' + all_consonants + '*-]+)', re.UNICODE)
-    return consonants_regexp
-
-
 class Chunks:
 
-    def __init__(self, line):
-        self._line = line
+    def __init__(self, verse):
+        # TODO Find a way to remove this dependency
+        self.verse = verse
         self.chunks = []
         self.create_chunks()
-        self.phon = None
         self.separated_chunks = []
 
     def create_chunks(self):
@@ -304,7 +24,7 @@ class Chunks:
         self.elide_inside_words()
         self.remove_leading_and_trailing_crap()
         self.collapse_empty_chunks_from_simplifications()
-        self.create_sigles()
+        self.create_acronym()
         self.elide_vowel_problems()
         self.process_y_cases()
         self.annotate_final_mute_e()
@@ -315,8 +35,8 @@ class Chunks:
 
     def print_new_line_if_changed(self):
         now_line = ''.join(chunk.original for chunk in self.chunks)
-        if now_line != self._line:
-            print("%s became %s" % (self._line, now_line), file=sys.stderr)
+        if now_line != self.verse.input_line:
+            print("%s became %s" % (self.verse.input_line, now_line), file=sys.stderr)
             pprint(self.chunks, stream=sys.stderr)
 
     def merge_chunks_words(self):
@@ -384,12 +104,12 @@ class Chunks:
             future_chunks.append(acc)
         self.separated_chunks = future_chunks
 
-    def create_sigles(self):
+    def create_acronym(self):
        for i, chunk_group in enumerate(self.separated_chunks):
            if len(chunk_group) == 1:
                first_chunk = chunk_group[0]
                if first_chunk.is_consonants():
-                    new_word = first_chunk.create_sigles()
+                    new_word = first_chunk.create_acronym()
                    self.separated_chunks[i] = new_word
                    self.separated_chunks[i][-1].check_elidable()
 
@@ -430,37 +150,25 @@ class Chunks:
 
     def initialize_chunks(self):
         word_bi_tokens = self.get_word_tokens()
-        pre_chunks = self.preprocess_bi_tokens(word_bi_tokens)
+        pre_chunks = pre_process_bi_tokens(word_bi_tokens)
         self.separated_chunks = []
         for (is_end_word, pre_chunk) in pre_chunks:
             if len(pre_chunk) != 0:
-                self.separated_chunks.append([Chunk(word) for word in pre_chunk])
+                self.separated_chunks.append([Chunk(word, self.verse) for word in pre_chunk])
                 if not is_end_word:
                     # word end is a fake word end
                     for chunk in self.separated_chunks[-1]:
                         chunk.set_hemistiche('cut')
 
-    def preprocess_bi_tokens(self, word_bi_tokens):
-        consonants_regexp = get_consonants_regex()
-        pre_chunks = [(b, re.split(consonants_regexp, word)) for (b, word) in word_bi_tokens]
-        pre_chunks = [(b, remove_trivial(x, self.is_empty_word)) for (b, x) in pre_chunks]
-        return pre_chunks
-
     def get_word_tokens(self):
         words = self.split_input_line_by_whitespace()
-        words = remove_trivial(words, self.is_empty_word)
-        word_tokens = self.split_all_hyph(words)
+        words = remove_trivial(words, is_empty_word)
+        word_tokens = split_all_hyphen(words)
         return word_tokens
 
-    def split_all_hyph(self, words):
-        return sum([HyphenSplitter().split(w) for w in words], [])
-
-    def is_empty_word(self, word):
-        return re.match(r"^\s*$", word) or len(normalize(word, rm_all=True)) == 0
-
     def split_input_line_by_whitespace(self):
         whitespace_regexp = re.compile(r"(\s+)")
-        words = re.split(whitespace_regexp, self._line)
+        words = re.split(whitespace_regexp, self.verse.input_line)
         return words
 
     def annotate(self, template, threshold):
@@ -468,187 +176,75 @@ class Chunks:
         for i, chunk in enumerate(self.chunks):
             if not chunk.is_vowels():
                 continue
+
+            chunks_before = self.chunks[:i]
+            chunks_after = self.chunks[i + 1:]
             # for the case of "pays" and related words
-            if chunk.weights is None:
-                chunk.weights = self.possible_weights_context(i, template, threshold)
-            if chunk.hemistiche is None:
-                chunk.hemistiche = self.hemistiche(i)
-        return self.align2str()
+            chunk.set_possible_weights_from_context(chunks_before, chunks_after, template, threshold)
 
-    def possible_weights_context(self, pos, template, threshold):
-        chunk = self.chunks[pos]
-        if pos != len(self.chunks) - 1:
-            next_chunk = self.chunks[pos + 1]
-        else:
-            next_chunk = None
-        if pos > 0:
-            previous_chunk = self.chunks[pos - 1]
-        else:
-            previous_chunk = None
-        if pos > 1:
-            previous_previous_chunk = self.chunks[pos - 2]
-        else:
-            previous_previous_chunk = None
-
-        if ((pos >= len(self.chunks) - 2 and chunk.is_e())
-                and not (next_chunk is not None and next_chunk.is_vowels())
-                and not (previous_chunk is None or previous_chunk.contains_break())
-                and not (previous_previous_chunk is None or previous_previous_chunk.contains_break())):
-            # special case for verse endings, which can get elided (or not)
-            # but we don't elide lone syllables ("prends-le", etc.)
-
-            if next_chunk is None:
-                return [0]  # ending 'e' is elided
-            if next_chunk.text == 's':
-                return [0]  # ending 'es' is elided
-            if next_chunk.text == 'nt':
-                # ending 'ent' is sometimes elided, try to use pronunciation
-                # actually, this will have an influence on the rhyme's gender
-                # see feminine
-                possible = []
-                if not self.phon or len(self.phon) == 0:
-                    return [0, 1]  # do something reasonable without pron
-                for possible_phon in self.phon:
-                    if possible_phon.endswith(')') or possible_phon.endswith('#'):
-                        possible.append(1)
-                    else:
-                        possible.append(0)
-                return possible
-            return self.possible_weights(pos, template, threshold)
-        if (next_chunk is None and chunk.text == 'e' and
-                previous_chunk is not None and (previous_chunk.text.endswith('-c')
-                                                or previous_chunk.text.endswith('-j')
-                                                or (previous_chunk.text == 'c'
-                                                    and previous_chunk.had_hyphen is not None)
-                                                or (previous_chunk.text == 'j'
-                                                    and previous_chunk.had_hyphen is not None))):
-            return [0]  # -ce and -je are elided
-        if next_chunk is None and chunk.text in ['ie', 'ée']:
-            return [1]
-        # elide "-ée" and "-ées", but be specific (beware of e.g. "réel")
-        if (pos >= len(self.chunks) - 2
-                and chunk.text == 'ée'
-                and (next_chunk is None or self.chunks[-1].text == 's')):
-            return [1]
-        if chunk.elidable is not None:
-            return [int(not x) for x in chunk.elidable]
-        return self.possible_weights(pos, template, threshold)
-
-    def possible_weights(self, pos, template, threshold):
-        if template.options['diaeresis'] == "classical":
-            return vowels.possible_weights_ctx(self.chunks, pos, threshold=threshold)
-        elif template.options['diaeresis'] == "permissive":
-            return vowels.possible_weights_approx(self.chunks[pos].text)
-
-    def hemistiche(self, pos):
-        current_chunk = self.chunks[pos]
-        ending = current_chunk.text
-        if not (current_chunk.word_end or False) and pos < len(self.chunks) - 1:
-            if not (self.chunks[pos + 1].word_end or False):
-                return "cut"
-            ending += self.chunks[pos + 1].text
-        if ending in SURE_END_FEM:
-            # check that this isn't a one-syllabe wourd (which is allowed)
-            ok = False
-            try:
-                for i in range(2):
-                    if '-' in self.chunks[pos - i - 1].original or (self.chunks[pos - i - 1].word_end or False):
-                        ok = True
-            except IndexError:
-                pass
-            if not ok:
-                # hemistiche ends in feminine
-                if any(current_chunk.elidable or [False]):
-                    return "elid"  # elidable final -e, but only OK if actually elided
-                else:
-                    return "fem"
-        return "ok"
+            next_chunk = self.chunks[i + 1] if i < len(self.chunks) - 1 else None
+            previous_chunk = self.chunks[i - 1] if i > 0 else None
+            previous_previous_chunk = self.chunks[i - 2] if i > 1 else None
+            chunk.set_hemistiche_from_context(previous_previous_chunk, previous_chunk, next_chunk)
+        return self.align2str()
 
     def align2str(self):
         return ''.join([x.text for x in self.chunks])
 
+    def print_n_syllables(self, n_syllables, offset, output_file):
+        count = 0
+        for i, chunk in enumerate(self.chunks[::-1]):
+            if chunk.weights is not None:
+                if count < offset:
+                    count += 1
+                    continue
+                pos = len(self.chunks) - i - 1
+                considered_chunk = self.chunks[pos]
+                chunks_before = self.chunks[:pos]
+                chunks_after = self.chunks[pos + 1:]
+                print(str(n_syllables) + ' ' + ' '.join(considered_chunk.make_query(chunks_before, chunks_after)),
+                      file=output_file)
+                break
 
-LETTERS = {
-    'f': 'effe',
-    'h': 'ache',
-    'j': 'gi',
-    'k': 'ka',
-    'l': 'elle',
-    'm': 'aime',
-    'n': 'aine',
-    'q': 'cu',
-    'r': 'ère',
-    's': 'esse',
-    'w': 'doublevé',
-    'x': 'ixe',
-    'z': 'zaide'
-}
-
-
-def elision_wrap(chunk_group):
-    first_letter = common.remove_punctuation(chunk_group[0].original.strip())
-    temp = elision(''.join(chunk.text for chunk in chunk_group),
-                   ''.join(chunk.original for chunk in chunk_group),
-                   first_letter == first_letter.upper())
-    return temp
-
-
-def elision(word, original_word, was_cap):
-    if word.startswith('y'):
-        if word == 'y':
-            return [True]
-        if was_cap:
-            if word == 'york':
-                return [False]
-            # Grevisse, Le Bon usage, 14th ed., paragraphs 49-50
-            # depends on whether it's French or foreign...
-            return [True, False]
-        else:
-            exc = ["york", "yeux", "yeuse", "ypérite"]
-            for w in exc:
-                if word.startswith(w):
-                    return [True]
-            # otherwise, no elision
-            return [False]
-    if word in ["oui", "ouis"]:
-        # elision for those words, but beware, no elision for "ouighour"
-        # boileau : "Ont l'esprit mieux tourné que n'a l'homme ? Oui sans doute."
-        # so elision sometimes
-        return [True, False]
-    if word.startswith("ouistiti") or word.startswith("ouagadougou"):
-        return [False]
-    # "un", "une" are non-elided as nouns ("cette une")
-    if word in ["un", "une"]:
-        return [True, False]
-    # "onze" is not elided
-    if word == "onze":
-        return [False]
-    if word.startswith('ulul'):
-        return [False]  # ululement, ululer, etc.
-    if word.startswith('uhlan'):
-        return [False]  # uhlan
-    if word[0] == 'h':
-        if word == "huis":
-            # special case, "huis" is elided but "huis clos" isn't
-            return [True, False]
-        # look up in haspirater using the original (but normalized) word
-        return list(map((lambda s: not s),
-                        haspirater.lookup(normalize(original_word))))
-    if is_vowels(word[0]):
-        return [True]
-    return [False]
-
-
-def remove_trivial(chunks, predicate):
+    def normalized(self):
+        return ''.join(chunk.normalize() for chunk in self.chunks).lstrip().rstrip()
+
+    def get_line(self):
+        return ''.join(chunk.get_original_text() for chunk in self.chunks)
+
+    def get_errors_set(self, forbidden_ok, hiatus_ok):
+        errors = set()
+        for chunk in self.chunks:
+            errors_chunk = chunk.get_errors_set(forbidden_ok, hiatus_ok)
+            errors = errors.union(errors_chunk)
+        return errors
+
+
+def remove_trivial(words, predicate):
     new_chunks = []
-    accu = ""
-    for i, w in enumerate(chunks):
-        if predicate(w):
+    words_accumulation = ""
+    for i, chunk in enumerate(words):
+        if predicate(chunk):
            if len(new_chunks) == 0:
-                accu = accu + w
+                words_accumulation = words_accumulation + chunk
            else:
-                new_chunks[-1] = new_chunks[-1] + w
+                new_chunks[-1] = new_chunks[-1] + chunk
        else:
-            new_chunks.append(accu + w)
-            accu = ""
+            new_chunks.append(words_accumulation + chunk)
+            words_accumulation = ""
    return new_chunks
+
+
+def split_all_hyphen(words):
+    return sum([HyphenSplitter().split(w) for w in words], [])
+
+
+def is_empty_word(word):
+    return re.match(r"^\s*$", word) or len(normalize(word, rm_all=True)) == 0
+
+
+def pre_process_bi_tokens(word_bi_tokens):
+    consonants_regexp = get_consonants_regex()
+    pre_chunks = [(b, re.split(consonants_regexp, word)) for (b, word) in word_bi_tokens]
+    pre_chunks = [(b, remove_trivial(x, is_empty_word)) for (b, x) in pre_chunks]
+    return pre_chunks
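In the new plint/chunks.py, remove_trivial() and is_empty_word() are plain
module-level helpers: whitespace-only tokens are folded into the preceding
chunk instead of producing empty chunks. A quick sketch, assuming plint is on
the Python path:

    from plint.chunks import remove_trivial, is_empty_word

    # the whitespace token is merged into its left neighbour
    print(remove_trivial(["Le", "  ", "chat"], is_empty_word))
    # ['Le  ', 'chat']
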
diff --git a/plint/common.py b/plint/common.py
@@ -122,3 +122,9 @@ def to_xsampa(s):
 def from_xsampa(s):
     """convert x-sampa to our modified format"""
     return subst(s, [(x[1], x[0]) for x in SUBSTS])
+
+
+def get_consonants_regex():
+    all_consonants = CONSONANTS + CONSONANTS.upper()
+    consonants_regexp = re.compile('([^' + all_consonants + '*-]+)', re.UNICODE)
+    return consonants_regexp
\ No newline at end of file
diff --git a/plint/error.py b/plint/error.py
@@ -118,11 +118,11 @@ class ErrorBadRhymeGenre(ErrorBadRhyme):
         return "\"" + result + "\""
 
     def get_id(self, pattern):
-        return pattern.femid
+        return pattern.feminine_id
 
 
 class ErrorBadRhymeObject(ErrorBadRhyme):
     def get_id(self, pattern):
-        return pattern.myid
+        return pattern.my_id
 
 
 class ErrorBadRhymeSound(ErrorBadRhymeObject):
     @property
@@ -157,7 +157,7 @@ class ErrorMultipleWordOccurrence:
     def report(self, pattern):
         return (_("Too many occurrences of word \"%s\" for rhyme %s")
-                % (self.word, pattern.myid))
+                % (self.word, pattern.my_id))
 
 
 class ErrorIncompleteTemplate:
     def report(self, pattern):
diff --git a/plint/pattern.py b/plint/pattern.py
@@ -0,0 +1,31 @@
+from plint import error
+
+
+class Pattern:
+    def __init__(self, metric, my_id="", feminine_id="", constraint=None, hemistiches=None):
+        self.metric = metric
+        self.length = None
+        self.parse_metric()
+        self.my_id = my_id
+        self.feminine_id = feminine_id
+        self.constraint = constraint
+        if hemistiches:
+            self.hemistiches = hemistiches
+
+    def parse_metric(self):
+        """Parse from a metric description"""
+        try:
+            verse = [int(x) for x in self.metric.split('/')]
+            for i in verse:
+                if i < 1:
+                    raise ValueError
+        except ValueError:
+            raise error.TemplateLoadError("Metric description should only contain positive integers")
+        if sum(verse) > 16:
+            raise error.TemplateLoadError("Metric length limit exceeded")
+        self.hemistiches = []
+        self.length = 0
+        for v in verse:
+            self.length += v
+            self.hemistiches.append(self.length)
+        self.length = self.hemistiches.pop()
\ No newline at end of file
diff --git a/plint/plint_irc.py b/plint/plint_irc.py
@@ -75,7 +75,7 @@ def manage(line, descriptor=sys.stdout):
     else:
       lbuf = [l]
     return True
-  errors = template.check(text, quiet=False)
+  errors = template.check(text)
   quiet = False
   if errors:
     print(errors.report())
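The new plint/pattern.py extracts the Pattern class from template.py:
parse_metric() turns a metric description such as "6/6" into hemistiche
positions plus a total length. A usage sketch:

    from plint.pattern import Pattern

    # a 12-syllable alexandrine with the hemistiche after the sixth syllable
    p = Pattern("6/6")
    print(p.length)       # 12
    print(p.hemistiches)  # [6]
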
"plint -- French poetry checker" + if lang == 'fr': + return "plint -- vérification formelle de poèmes" + else: + return "plint -- French poetry checker" + @app.route('/static/tpl/<filename>') def server_static(filename): - return static_file(filename, root="./static/tpl", mimetype="text/plain") + return static_file(filename, root="./static/tpl", mimetype="text/plain") + @app.route('/<lang>/static/img/<filename>') def server_static(filename, lang=None): - return static_file(filename, root="./static/img") + return static_file(filename, root="./static/img") + @app.route('/<lang>/static/tpl/<filename>') def server_static(filename, lang=None): - return static_file(filename, root="./static/tpl", mimetype="text/plain") + return static_file(filename, root="./static/tpl", mimetype="text/plain") + @app.route('/static/<filename>') def server_static(filename): - return static_file(filename, root="./static") + return static_file(filename, root="./static") + @app.route('/<lang>/static/<filename>') def server_static(filename, lang=None): - return static_file(filename, root="./static") + return static_file(filename, root="./static") + @app.route('/') def root(): - redirect('/' + get_locale() + '/') + redirect('/' + get_locale() + '/') + @app.route('/<page>') def paged(page): - redirect('/' + get_locale() + '/' + page) + redirect('/' + get_locale() + '/' + page) + @app.route('/<lang>/') def root(lang): - if lang not in ['fr', 'en']: - return paged(lang) - return env.get_template('index.html').render(title=get_title(lang), - lang=lang, path="") + if lang not in ['fr', 'en']: + return paged(lang) + return env.get_template('index.html').render(title=get_title(lang), + lang=lang, path="") + @app.route('/<lang>/about') def about(lang): - return env.get_template('about.html').render(title=get_title(lang), - lang=lang, path="about") + return env.get_template('about.html').render(title=get_title(lang), + lang=lang, path="about") + MAX_POEM_LEN = 8192 MAX_LINE_LEN = 512 + class TooBigException(Exception): pass + class TooLongLinesException(Exception): pass + def check(poem): - if len(poem) > MAX_POEM_LEN: - raise TooBigException - s = poem.split("\n") - for x in range(len(s)): - if len(s[x]) > MAX_LINE_LEN: - raise TooLongLinesException - s[x] = s[x].strip() - return s + if len(poem) > MAX_POEM_LEN: + raise TooBigException + s = poem.split("\n") + for x in range(len(s)): + if len(s[x]) > MAX_LINE_LEN: + raise TooLongLinesException + s[x] = s[x].strip() + return s + @app.route('/<lang>/checkjs', method='POST') def q(lang): - global throttle - # necessary when serving with lighttpd proxy-core - ip = request.environ.get('HTTP_X_FORWARDED_FOR') - if not ip: - # fallback; this is 127.0.0.1 with proxy-core - ip = request.environ.get('REMOTE_ADDR') - t = time.time() - print("== %s %s ==" % (ip, t)) - response.content_type = 'application/json' - localization.init_locale(lang) - throttle = set(x for x in throttle if t - x[1] < THROTTLE_DELAY) - if ip in (x[0] for x in throttle): + global throttle + # necessary when serving with lighttpd proxy-core + ip = request.environ.get('HTTP_X_FORWARDED_FOR') + if not ip: + # fallback; this is 127.0.0.1 with proxy-core + ip = request.environ.get('REMOTE_ADDR') + t = time.time() + print("== %s %s ==" % (ip, t)) + response.content_type = 'application/json' + localization.init_locale(lang) + throttle = set(x for x in throttle if t - x[1] < THROTTLE_DELAY) + if ip in (x[0] for x in throttle): + if lang == 'fr': + msg = (("Trop de requêtes pour vérifier le poème," + + " veuillez réessayer 
dans %d secondes") % + THROTTLE_DELAY) + else: + msg = (("Too many requests to check poem," + + " please try again in %d seconds") % + THROTTLE_DELAY) + return dumps({'error': msg}) + throttle.add((ip, t)) + poem = re.sub(r'<>&', '', request.forms.get('poem')) + print(poem) + + # default message if lang == 'fr': - msg = (("Trop de requêtes pour vérifier le poème," - + " veuillez réessayer dans %d secondes") % - THROTTLE_DELAY) + msg = "Le poème est vide" else: - msg = (("Too many requests to check poem," - + " please try again in %d seconds") % - THROTTLE_DELAY) - return dumps({'error': msg}) - throttle.add((ip, t)) - poem = re.sub(r'<>&', '', request.forms.get('poem')) - print(poem) - - # default message - if lang == 'fr': - msg = "Le poème est vide" - else: - msg = "Poem is empty" - - try: - poem = check(poem) - except TooBigException: - poem = None - if lang == 'fr': - msg = "Le poème est trop long (maximum %d caractères)" % MAX_POEM_LEN - else: - msg = "Poem is too long (maximum %d characters)" % MAX_POEM_LEN - except TooLongLinesException: - poem = None - if lang == 'fr': - msg = "Certaines lignes du poème sont trop longues (maximum %d caractères)" % MAX_LINE_LEN - else: - msg = "Some lines of the poem are too long (maximum %d characters)" % MAX_LINE_LEN - if not poem or len(poem) == 0 or (len(poem) == 1 and len(poem[0]) == 0): - return dumps({'error': msg}) - templateName = re.sub(r'[^a-z_]', '', request.forms.get('template')) - print(templateName) - if templateName == 'custom': - x = request.forms.get('custom_template') - else: + msg = "Poem is empty" + try: - f = open("static/tpl/" + templateName + ".tpl") - x = f.read() - f.close() - except IOError: - if lang == 'fr': - msg = "Modèle inexistant" - else: - msg = "No such template" - return dumps({'error': msg}) - print(x) - try: - templ = template.Template(x) - except error.TemplateLoadError as e: - if lang == 'fr': - msg = "Erreur à la lecture du modèle : " + e.msg + poem = check(poem) + except TooBigException: + poem = None + if lang == 'fr': + msg = "Le poème est trop long (maximum %d caractères)" % MAX_POEM_LEN + else: + msg = "Poem is too long (maximum %d characters)" % MAX_POEM_LEN + except TooLongLinesException: + poem = None + if lang == 'fr': + msg = "Certaines lignes du poème sont trop longues (maximum %d caractères)" % MAX_LINE_LEN + else: + msg = "Some lines of the poem are too long (maximum %d characters)" % MAX_LINE_LEN + if not poem or len(poem) == 0 or (len(poem) == 1 and len(poem[0]) == 0): + return dumps({'error': msg}) + templateName = re.sub(r'[^a-z_]', '', request.forms.get('template')) + print(templateName) + if templateName == 'custom': + x = request.forms.get('custom_template') else: - msg = "Error when reading template: " + e.msg - return dumps({'error': msg}) - poem.append(None) - r = [] - i = 0 - d = {} - for line in poem: - i += 1 - last = False - if line == None: - line = "" - last = True - errors = templ.check(line, last=last) - if errors: - r.append({ - 'line': line, - 'num': i, - 'errors': sum(errors.lines(short=True), []) - }) - d['result'] = r - return dumps(d) + try: + f = open("static/tpl/" + templateName + ".tpl") + x = f.read() + f.close() + except IOError: + if lang == 'fr': + msg = "Modèle inexistant" + else: + msg = "No such template" + return dumps({'error': msg}) + print(x) + try: + templ = template.Template(x) + except error.TemplateLoadError as e: + if lang == 'fr': + msg = "Erreur à la lecture du modèle : " + e.msg + else: + msg = "Error when reading template: " + e.msg + return 
dumps({'error': msg}) + poem.append(None) + r = [] + i = 0 + d = {} + for line in poem: + i += 1 + last = False + if line == None: + line = "" + last = True + errors = templ.check(line, last=last) + if errors: + r.append({ + 'line': line, + 'num': i, + 'errors': sum(errors.lines(short=True), []) + }) + d['result'] = r + return dumps(d) -if __name__ == '__main__': - run(app, port='5000', server="cherrypy", host="::") +if __name__ == '__main__': + run(app, port='5000', server="cherrypy", host="::") diff --git a/plint/template.py b/plint/template.py @@ -5,305 +5,264 @@ from plint import error, rhyme from plint.common import normalize from plint.nature import nature_count from plint.options import default_options +from plint.pattern import Pattern from plint.verse import Verse -from plint.vowels import make_query - - -class Pattern: - def __init__(self, metric, myid="", femid="", constraint=None, hemistiches=None): - self.metric = metric - self.parse_metric() - self.myid = myid - self.femid = femid - self.constraint = constraint - if hemistiches: - self.hemistiches = hemistiches - - def parse_metric(self): - """Parse from a metric description""" - try: - verse = [int(x) for x in self.metric.split('/')] - for i in verse: - if i < 1: - raise ValueError - except ValueError: - raise error.TemplateLoadError(("Metric description should only contain positive integers")) - if sum(verse) > 16: - raise error.TemplateLoadError(("Metric length limit exceeded")) - self.hemistiches = [] - self.length = 0 - for v in verse: - self.length += v - self.hemistiches.append(self.length) - self.length = self.hemistiches.pop() -class Template: - option_aliases = { - 'fusionner': 'merge', - 'ambiguous_ok': 'forbidden_ok', - 'ambigu_ok': 'forbidden_ok', - 'dierese': 'diaeresis', - 'verifie_occurrences': 'check_occurrences', - 'repetition_ok': 'repeat_ok', - 'incomplet_ok': 'incomplete_ok', - 'phon_supposee_ok': 'phon_supposed_ok', - 'oeil_supposee_ok': 'eye_supposed_ok', - 'oeil_tolerance_ok': 'eye_tolerance_ok', - 'pauvre_oeil_requise': 'poor_eye_required', - 'pauvre_oeil_supposee_ok': 'poor_eye_supposed_ok', - 'pauvre_oeil_vocalique_ok': 'poor_eye_vocalic_ok', + +OPTION_ALIASES = { + 'fusionner': 'merge', + 'ambiguous_ok': 'forbidden_ok', + 'ambigu_ok': 'forbidden_ok', + 'dierese': 'diaeresis', + 'verifie_occurrences': 'check_occurrences', + 'repetition_ok': 'repeat_ok', + 'incomplet_ok': 'incomplete_ok', + 'phon_supposee_ok': 'phon_supposed_ok', + 'oeil_supposee_ok': 'eye_supposed_ok', + 'oeil_tolerance_ok': 'eye_tolerance_ok', + 'pauvre_oeil_requise': 'poor_eye_required', + 'pauvre_oeil_supposee_ok': 'poor_eye_supposed_ok', + 'pauvre_oeil_vocalique_ok': 'poor_eye_vocalic_ok', } - def __init__(self, string=None): - self.template = [] - self.pattern_line_no = 0 - self.options = dict(default_options) - self.mergers = [] - self.overflowed = False - if string != None: - self.load(string) - self.line_no = 0 - self.position = 0 - self.prev = None - self.env = {} - self.femenv = {} - self.occenv = {} - self.reject_errors = False - - def read_option(self, x): - try: - key, value = x.split(':') - except ValueError: - raise error.TemplateLoadError(("Global options must be provided as key-value pairs")) - if key in self.option_aliases.keys(): - key = self.option_aliases[key] - if key == 'merge': - self.mergers.append(value) - elif key == 'diaeresis': - if value == "classique": - value = "classical" - if value not in ["permissive", "classical"]: - raise error.TemplateLoadError(("Bad value for global option %s") % key) - 
self.options['diaeresis'] = value - elif key in self.options.keys(): - self.options[key] = str2bool(value) - else: - raise error.TemplateLoadError(("Unknown global option")) - - def load(self, s): - """Load from a string""" - for line in s.split('\n'): - line = line.strip() - self.pattern_line_no += 1 - if line != '' and line[0] != '#': - if line[0] == '!': - # don't count the '!' in the options, that's why we use [1:] - for option in line.split()[1:]: - self.read_option(option) +class Template: + + def __init__(self, template_string=None): + self.template = [] + self.pattern_line_no = 0 + self.options = dict(default_options) + self.mergers = [] + self.overflowed = False + if template_string is not None: + self.load(template_string) + self.line_no = 0 + self.position = 0 + self.prev = None + self.env = {} + self.feminine_environment = {} + self.occurrence_environment = {} + self.reject_errors = False + + def load(self, template_string): + """Load from a string""" + for line in template_string.split('\n'): + line = line.strip() + self.pattern_line_no += 1 + if len(line) != 0 and line[0] != '#': + if line[0] == '!': + # don't count the '!' in the options, that's why we use [1:] + for option_string in line.split()[1:]: + self.read_option(option_string) + else: + self.template.append(self.parse_line(line.strip())) + if len(self.template) == 0: + raise error.TemplateLoadError("Template is empty") + + def read_option(self, option_string): + try: + key, value = option_string.split(':') + except ValueError: + raise error.TemplateLoadError("Global options must be provided as key-value pairs") + if key in OPTION_ALIASES: + key = OPTION_ALIASES[key] + if key == 'merge': + self.mergers.append(value) + elif key == 'diaeresis': + if value == "classique": + value = "classical" + if value not in ["permissive", "classical"]: + raise error.TemplateLoadError("Bad value for global option %s" % key) + self.options['diaeresis'] = value + elif key in self.options: + self.options[key] = str2bool(value) + else: + raise error.TemplateLoadError("Unknown global option") + + def parse_line(self, line): + """Parse template line from a line""" + split = line.split(' ') + metric = split[0] + if len(split) >= 2: + my_id = split[1] else: - self.template.append(self.parse_line(line.strip())) - if len(self.template) == 0: - raise error.TemplateLoadError(("Template is empty")) - - def match(self, line, ofile=None, quiet=False, last=False, nsyl=None, - offset=0): - """Check a line against current pattern, return errors""" - - was_incomplete = last and not self.beyond - - errors = [] - pattern = self.get() - - line_with_case = normalize(line, downcase=False) - - v = Verse(line, self, pattern) - - if nsyl: - v.annotate() - count = 0 - # only generate a context with the prescribed final weight - # where "final" is the offset-th chunk with a weight from the end - for i, p in enumerate(v.chunks.chunks[::-1]): - if (p.weights is not None): - if count < offset: - count += 1 - continue - print(str(nsyl) + ' ' - + ' '.join(make_query(v.chunks.chunks, len(v.chunks.chunks)-i-1)), file=ofile) - break - return errors, pattern, v - - if last: - if was_incomplete and not self.options['incomplete_ok'] and not self.overflowed: - return [error.ErrorIncompleteTemplate()], pattern, v - return [], pattern, v - - if self.overflowed: - return [error.ErrorOverflowedTemplate()], pattern, v - - rhyme_failed = False - # rhymes - if pattern.myid not in self.env.keys(): - # initialize the rhyme - # last_count is passed later - self.env[pattern.myid] = 
rhyme.Rhyme(v.normalized, - pattern.constraint, self.mergers, self.options) - else: - # update the rhyme - self.env[pattern.myid].feed(v.normalized, pattern.constraint) - if not self.env[pattern.myid].satisfied_phon(): - # no more possible rhymes, something went wrong, check phon - self.env[pattern.myid].rollback() - rhyme_failed = True - errors.append(error.ErrorBadRhymeSound(self.env[pattern.myid], - self.env[pattern.myid].new_rhyme)) - - # occurrences - if self.options['check_occurrences']: - if pattern.myid not in self.occenv.keys(): - self.occenv[pattern.myid] = {} - last_word = re.split(r'[- ]', line_with_case)[-1] - if last_word not in self.occenv[pattern.myid].keys(): - self.occenv[pattern.myid][last_word] = 0 - self.occenv[pattern.myid][last_word] += 1 - if self.occenv[pattern.myid][last_word] > nature_count(last_word): - errors.insert(0, error.ErrorMultipleWordOccurrence(last_word, - self.occenv[pattern.myid][last_word])) - - v.phon = self.env[pattern.myid].phon - v.parse() - - # now that we have parsed, adjust rhyme to reflect last word length - # and check eye - if not rhyme_failed: - self.env[pattern.myid].adjustLastCount(v.last_count()) - if not self.env[pattern.myid].satisfied_eye(): - old_phon = len(self.env[pattern.myid].phon) - self.env[pattern.myid].rollback() - errors.append(error.ErrorBadRhymeEye(self.env[pattern.myid], - self.env[pattern.myid].new_rhyme, old_phon)) - - rhyme_failed = False - - errors = v.problems() + errors - - if ofile: - possible = v.possible - if len(possible) == 1: - for i, p in enumerate(possible[0]): - if (p.weights is not None and len(p.weights) > 1 - and p.weight is not None and p.weight > 0): - print(str(p.weight) + ' ' - + ' '.join(make_query(possible[0], i)), file=ofile) - - # rhyme genres - # inequality constraint - # TODO this is simplistic and order-dependent - if pattern.femid.swapcase() in self.femenv.keys(): - new = set(['M', 'F']) - self.femenv[pattern.femid.swapcase()] - if len(new) > 0: - self.femenv[pattern.femid] = new - if pattern.femid not in self.femenv.keys(): - if pattern.femid == 'M': - x = set(['M']) - elif pattern.femid == 'F': - x = set(['F']) - else: - x = set(['M', 'F']) - self.femenv[pattern.femid] = x - old = list(self.femenv[pattern.femid]) - new = v.genders() - self.femenv[pattern.femid] &= set(new) - if len(self.femenv[pattern.femid]) == 0: - errors.append(error.ErrorBadRhymeGenre(old, new)) - - return errors, pattern, v - - def parse_line(self, line): - """Parse template line from a line""" - split = line.split(' ') - metric = split[0] - if len(split) >= 2: - myid = split[1] - else: - myid = str(self.pattern_line_no) # unique - if len(split) >= 3: - femid = split[2] - else: - femid = str(self.pattern_line_no) # unique - idsplit = myid.split(':') - if len(idsplit) >= 2: - constraint = idsplit[-1].split('|') - if len(constraint) > 0: - constraint[0] = False if constraint[0] in ["no", "non"] else constraint[0] - if len(constraint) > 1: - constraint[1] = int(constraint[1]) - else: - constraint = [] - if len(constraint) == 0: - constraint.append(1) - if len(constraint) < 2: - constraint.append(True) - return Pattern(metric, myid, femid, rhyme.Constraint(*constraint)) - - def reset_conditional(self, d): - return dict((k, v) for k, v in d.items() if len(k) > 0 and k[0] == '!') - - def reset_state(self, with_femenv=False): - """Reset our state, except ids starting with '!'""" - self.position = 0 - self.env = self.reset_conditional(self.env) - self.femenv = self.reset_conditional(self.femenv) - self.occenv = {} # always 
reset - - @property - def beyond(self): - return self.position >= len(self.template) - - def get(self): - """Get next state, resetting if needed""" - self.old_position = self.position - self.old_env = copy.deepcopy(self.env) - self.old_femenv = copy.deepcopy(self.femenv) - self.old_occenv = copy.deepcopy(self.occenv) - if self.beyond: - if not self.options['repeat_ok']: - self.overflowed = True - self.reset_state() - result = self.template[self.position] - self.position += 1 - return result - - def back(self): - """Revert to previous state""" - self.position = self.old_position - self.env = copy.deepcopy(self.old_env) - self.femenv = copy.deepcopy(self.old_femenv) - self.occenv = copy.deepcopy(self.old_occenv) - - def check(self, line, ofile=None, quiet=False, last=False, nsyl=None, - offset=0): - """Check line (wrapper)""" - self.line_no += 1 - line = line.rstrip() - if normalize(line) == '' and not last: - return None - #possible = [compute(p) for p in possible] - #possible = sorted(possible, key=rate) - errors, pattern, verse = self.match(line, ofile, quiet=quiet, last=last, - nsyl=nsyl, offset=offset) - if len(errors) > 0: - if self.reject_errors: - self.back() - self.line_no -= 1 - return error.ErrorCollection(self.line_no, line, pattern, verse, errors) - return None + my_id = str(self.pattern_line_no) # unique + if len(split) >= 3: + feminine_id = split[2] + else: + feminine_id = str(self.pattern_line_no) # unique + id_split = my_id.split(':') + classical = True + n_common_suffix_phones = 1 + if len(id_split) >= 2: + constraint = id_split[-1].split('|') + if len(constraint) > 0: + classical = False if constraint[0] in ["no", "non"] else constraint[0] + if len(constraint) > 1: + n_common_suffix_phones = int(constraint[1]) + else: + constraint = [] + if len(constraint) == 0: + n_common_suffix_phones = 1 + if len(constraint) < 2: + classical = True + return Pattern(metric, my_id, feminine_id, rhyme.Constraint(classical, n_common_suffix_phones)) -def str2bool(x): - if x.lower() in ["yes", "oui", "y", "o", "true", "t", "vrai", "v"]: - return True - if x.lower() in ["no", "non", "n", "false", "faux", "f"]: - return False - raise error.TemplateLoadError(("Bad value in global option")) + def match(self, line, output_file=None, last=False, n_syllables=None, offset=0): + """Check a line against current pattern, return errors""" + + was_incomplete = last and not self.beyond + + errors = [] + pattern = self.get() + line_with_case = normalize(line, downcase=False) + + verse = Verse(line, self, pattern) + + if n_syllables: + verse.print_n_syllables(n_syllables, offset, output_file) + return errors, pattern, verse + + if last: + if was_incomplete and not self.options['incomplete_ok'] and not self.overflowed: + return [error.ErrorIncompleteTemplate()], pattern, verse + return [], pattern, verse + + if self.overflowed: + return [error.ErrorOverflowedTemplate()], pattern, verse + + rhyme_failed = False + # rhymes + if pattern.my_id not in self.env: + # initialize the rhyme + # last_count is passed later + self.env[pattern.my_id] = rhyme.Rhyme(verse.normalized, pattern.constraint, self.mergers, self.options) + else: + # update the rhyme + self.env[pattern.my_id].feed(verse.normalized, pattern.constraint) + if not self.env[pattern.my_id].satisfied_phon(): + # no more possible rhymes, something went wrong, check phon + self.env[pattern.my_id].rollback() + rhyme_failed = True + errors.append(error.ErrorBadRhymeSound(self.env[pattern.my_id], + self.env[pattern.my_id].new_rhyme)) + + # occurrences + if 
self.options['check_occurrences']: + if pattern.my_id not in self.occurrence_environment.keys(): + self.occurrence_environment[pattern.my_id] = {} + last_word = re.split(r'[- ]', line_with_case)[-1] + if last_word not in self.occurrence_environment[pattern.my_id].keys(): + self.occurrence_environment[pattern.my_id][last_word] = 0 + self.occurrence_environment[pattern.my_id][last_word] += 1 + if self.occurrence_environment[pattern.my_id][last_word] > nature_count(last_word): + errors.insert(0, error.ErrorMultipleWordOccurrence(last_word, + self.occurrence_environment[pattern.my_id][last_word])) + + verse.phon = self.env[pattern.my_id].phon + verse.parse() + + # now that we have parsed, adjust rhyme to reflect last word length + # and check eye + if not rhyme_failed: + self.env[pattern.my_id].adjustLastCount(verse.last_count()) + if not self.env[pattern.my_id].satisfied_eye(): + old_phon = len(self.env[pattern.my_id].phon) + self.env[pattern.my_id].rollback() + errors.append(error.ErrorBadRhymeEye(self.env[pattern.my_id], + self.env[pattern.my_id].new_rhyme, old_phon)) + + errors = verse.problems() + errors + + if output_file: + possible = verse.possible + if len(possible) == 1: + for i, chunk in enumerate(possible[0]): + if (chunk.weights is not None and len(chunk.weights) > 1 + and chunk.weight is not None and chunk.weight > 0): + chunks_before = possible[0][:i] + chunks_after = possible[0][i + 1:] + print(str(chunk.weight) + ' ' + + ' '.join(chunk.make_query(chunks_before, chunks_after)), file=output_file) + + # rhyme genres + # inequality constraint + # TODO this is simplistic and order-dependent + if pattern.feminine_id.swapcase() in self.feminine_environment.keys(): + new = {'M', 'F'} - self.feminine_environment[pattern.feminine_id.swapcase()] + if len(new) > 0: + self.feminine_environment[pattern.feminine_id] = new + if pattern.feminine_id not in self.feminine_environment.keys(): + if pattern.feminine_id == 'M': + x = {'M'} + elif pattern.feminine_id == 'F': + x = {'F'} + else: + x = {'M', 'F'} + self.feminine_environment[pattern.feminine_id] = x + old = list(self.feminine_environment[pattern.feminine_id]) + new = verse.genders() + self.feminine_environment[pattern.feminine_id] &= set(new) + if len(self.feminine_environment[pattern.feminine_id]) == 0: + errors.append(error.ErrorBadRhymeGenre(old, new)) + + return errors, pattern, verse + + def reset_conditional(self, d): + return dict((k, v) for k, v in d.items() if len(k) > 0 and k[0] == '!') + + def reset_state(self, with_femenv=False): + """Reset our state, except ids starting with '!'""" + self.position = 0 + self.env = self.reset_conditional(self.env) + self.feminine_environment = self.reset_conditional(self.feminine_environment) + self.occurrence_environment = {} # always reset + + @property + def beyond(self): + return self.position >= len(self.template) + + def get(self): + """Get next state, resetting if needed""" + self.old_position = self.position + self.old_env = copy.deepcopy(self.env) + self.old_femenv = copy.deepcopy(self.feminine_environment) + self.old_occenv = copy.deepcopy(self.occurrence_environment) + if self.beyond: + if not self.options['repeat_ok']: + self.overflowed = True + self.reset_state() + result = self.template[self.position] + self.position += 1 + return result + + def back(self): + """Revert to previous state""" + self.position = self.old_position + self.env = copy.deepcopy(self.old_env) + self.feminine_environment = copy.deepcopy(self.old_femenv) + self.occurrence_environment = 
copy.deepcopy(self.old_occenv) + + def check(self, line, output_file=None, last=False, n_syllables=None, offset=0): + """Check line (wrapper)""" + self.line_no += 1 + line = line.rstrip() + if normalize(line) == '' and not last: + return None + + errors, pattern, verse = self.match(line, output_file, last=last, n_syllables=n_syllables, offset=offset) + if len(errors) > 0: + if self.reject_errors: + self.back() + self.line_no -= 1 + return error.ErrorCollection(self.line_no, line, pattern, verse, errors) + return None + + +def str2bool(x): + if x.lower() in ["yes", "oui", "y", "o", "true", "t", "vrai", "v"]: + return True + if x.lower() in ["no", "non", "n", "false", "faux", "f"]: + return False + raise error.TemplateLoadError(("Bad value in global option")) diff --git a/plint/tests/test_bad_chars.py b/plint/tests/test_bad_chars.py @@ -1,16 +1,17 @@ import unittest +import plint.pattern from plint import verse, template class BadChars(unittest.TestCase): def testBadAlone(self): - v = verse.Verse("42", template.Template(), template.Pattern("12")) + v = verse.Verse("42", template.Template(), plint.pattern.Pattern("12")) v.parse() self.assertFalse(v.valid()) def testBadAndGood(self): - v = verse.Verse("bla h42 blah ", template.Template(), template.Pattern("12")) + v = verse.Verse("bla h42 blah ", template.Template(), plint.pattern.Pattern("12")) v.parse() self.assertFalse(v.valid()) diff --git a/plint/tests/test_counts.py b/plint/tests/test_counts.py @@ -1,12 +1,13 @@ import unittest +import plint.pattern from plint import verse, template class Counts(unittest.TestCase): def runCount(self, text, limit=12, hemistiches=None): - v = verse.Verse(text, template.Template(), template.Pattern(str(limit), hemistiches=hemistiches)) + v = verse.Verse(text, template.Template(), plint.pattern.Pattern(str(limit), hemistiches=hemistiches)) v.parse() return v.possible diff --git a/plint/tests/test_eliminate.py b/plint/tests/test_eliminate.py @@ -1,19 +1,20 @@ import unittest +import plint.pattern from plint import verse, template class Eliminate(unittest.TestCase): def testEliminateOneGue(self): text = "gue" - v = verse.Verse(text, template.Template(), template.Pattern("12")) + v = verse.Verse(text, template.Template(), plint.pattern.Pattern("12")) v.parse() c = ''.join([x.text for x in v.chunks.chunks]) self.assertFalse("gue" in c) def testEliminateGue(self): text = "gue gue GUE ogues longuement la guerre" - v = verse.Verse(text, template.Template(), template.Pattern("12")) + v = verse.Verse(text, template.Template(), plint.pattern.Pattern("12")) v.parse() c = ''.join([x.text for x in v.chunks.chunks]) self.assertFalse("gue" in c) diff --git a/plint/tests/test_gender.py b/plint/tests/test_gender.py @@ -1,12 +1,13 @@ import unittest +import plint.pattern from plint import verse, template class Genders(unittest.TestCase): def testSingleSyllJe(self): text = "Patati patata patatatah où suis-je" - v = verse.Verse(text, template.Template(), template.Pattern("12")) + v = verse.Verse(text, template.Template(), plint.pattern.Pattern("12")) v.parse() gend = v.genders() self.assertTrue(v.valid()) @@ -15,7 +16,7 @@ class Genders(unittest.TestCase): def testSingleSyllJeBis(self): text = "Patati patata patatah la verrai-je" - v = verse.Verse(text, template.Template(), template.Pattern("12")) + v = verse.Verse(text, template.Template(), plint.pattern.Pattern("12")) v.parse() gend = v.genders() self.assertTrue(v.valid()) @@ -24,7 +25,7 @@ class Genders(unittest.TestCase): def testSingleSyllLe(self): text = "Patati patata 
patatata prends-le" - v = verse.Verse(text, template.Template(), template.Pattern("12")) + v = verse.Verse(text, template.Template(), plint.pattern.Pattern("12")) v.parse() gend = v.genders() self.assertTrue(v.valid()) @@ -33,7 +34,7 @@ class Genders(unittest.TestCase): def testSingleSyllCe(self): text = "Patati patata patatata mais qu'est-ce" - v = verse.Verse(text, template.Template(), template.Pattern("12")) + v = verse.Verse(text, template.Template(), plint.pattern.Pattern("12")) v.parse() gend = v.genders() self.assertTrue(v.valid()) @@ -42,7 +43,7 @@ class Genders(unittest.TestCase): def testSingleSyllHyphen(self): text = "Patati patata patata mange-les" - v = verse.Verse(text, template.Template(), template.Pattern("12")) + v = verse.Verse(text, template.Template(), plint.pattern.Pattern("12")) v.parse() gend = v.genders() self.assertTrue(v.valid()) @@ -51,7 +52,7 @@ class Genders(unittest.TestCase): def testSingleSyllNoHyphen(self): text = "Patati patata patata mange les" - v = verse.Verse(text, template.Template(), template.Pattern("12")) + v = verse.Verse(text, template.Template(), plint.pattern.Pattern("12")) v.parse() gend = v.genders() self.assertTrue(v.valid()) diff --git a/plint/tests/test_hiatus.py b/plint/tests/test_hiatus.py @@ -1,36 +1,37 @@ import unittest +import plint.pattern from plint import verse, template class Hiatus(unittest.TestCase): def testBadVowel(self): - v = verse.Verse("patati patata patata arbrisseau", template.Template(), template.Pattern("12")) + v = verse.Verse("patati patata patata arbrisseau", template.Template(), plint.pattern.Pattern("12")) v.parse() self.assertFalse(v.valid()) def testBadUnaspirated(self): - v = verse.Verse("patati patata patata hirondelle", template.Template(), template.Pattern("12")) + v = verse.Verse("patati patata patata hirondelle", template.Template(), plint.pattern.Pattern("12")) v.parse() self.assertFalse(v.valid()) def testGoodAspirated(self): - v = verse.Verse("patati patata patata tata hache", template.Template(), template.Pattern("12")) + v = verse.Verse("patati patata patata tata hache", template.Template(), plint.pattern.Pattern("12")) v.parse() self.assertTrue(v.valid()) def testGoodConsonant(self): - v = verse.Verse("patati patata patatah arbrisseau", template.Template(), template.Pattern("12")) + v = verse.Verse("patati patata patatah arbrisseau", template.Template(), plint.pattern.Pattern("12")) v.parse() self.assertTrue(v.valid()) def testGoodMuteE(self): - v = verse.Verse("patati patata patatue arbrisseau", template.Template(), template.Pattern("12")) + v = verse.Verse("patati patata patatue arbrisseau", template.Template(), plint.pattern.Pattern("12")) v.parse() self.assertTrue(v.valid()) def testBadEt(self): - v = verse.Verse("patati patata patata et avant", template.Template(), template.Pattern("12")) + v = verse.Verse("patati patata patata et avant", template.Template(), plint.pattern.Pattern("12")) v.parse() self.assertFalse(v.valid()) diff --git a/plint/tests/test_sanity_check.py b/plint/tests/test_sanity_check.py @@ -1,5 +1,6 @@ import unittest +import plint.pattern from plint import diaeresis, verse, template, common @@ -7,31 +8,31 @@ class SanityCheck(unittest.TestCase): def testSimple(self): text = "Hello World!! 
This is a test_data" - v = verse.Verse(text, template.Template(), template.Pattern("12")) + v = verse.Verse(text, template.Template(), plint.pattern.Pattern("12")) v.parse() self.assertEqual(text, v.line) def testComplex(self): text = "Aye AYAYE aye gue que geque AYAYAY a prt sncf bbbéé" - v = verse.Verse(text, template.Template(), template.Pattern("12")) + v = verse.Verse(text, template.Template(), plint.pattern.Pattern("12")) v.parse() self.assertEqual(text, v.line) def testLeadingSpace(self): text = " a" - v = verse.Verse(text, template.Template(), template.Pattern("12")) + v = verse.Verse(text, template.Template(), plint.pattern.Pattern("12")) v.parse() self.assertEqual(text, v.line) def testLeadingSpaceHyphenVowel(self): text = " -a" - v = verse.Verse(text, template.Template(), template.Pattern("12")) + v = verse.Verse(text, template.Template(), plint.pattern.Pattern("12")) v.parse() self.assertEqual(text, v.line) def testLeadingSpaceHyphenConsonant(self): text = " -c" - v = verse.Verse(text, template.Template(), template.Pattern("12")) + v = verse.Verse(text, template.Template(), plint.pattern.Pattern("12")) v.parse() self.assertEqual(text, v.line) diff --git a/plint/tests/test_sanity_check2.py b/plint/tests/test_sanity_check2.py @@ -1,12 +1,13 @@ import unittest +import plint.pattern from plint import verse, template class SanityCheck2(unittest.TestCase): def testSimple(self): text = "Patati patata patata tata vies" - v = verse.Verse(text, template.Template(), template.Pattern("12")) + v = verse.Verse(text, template.Template(), plint.pattern.Pattern("12")) v.parse() gend = v.genders() self.assertEqual(1, len(gend)) diff --git a/plint/verse.py b/plint/verse.py @@ -1,7 +1,8 @@ #!/usr/bin/python3 -from plint.chunks import Chunks -from plint.common import normalize, is_vowels, SURE_END_FEM, strip_accents from plint import error, common +from plint.chunks import Chunks +from plint.common import SURE_END_FEM, strip_accents + # the writing is designed to make frhyme succeed # end vowels will be elided @@ -11,13 +12,11 @@ class Verse: @property def line(self): - return ''.join(x.original for x in self.chunks.chunks) + return self.chunks.get_line() @property def normalized(self): - return ''.join(normalize(x.original, strip=False, rm_apostrophe_end=False) - if x.text_pron is None else x.text - for x in self.chunks.chunks).lstrip().rstrip() + return self.chunks.normalized() def __init__(self, input_line, template, pattern, threshold=None): self.template = template @@ -25,8 +24,8 @@ class Verse: self.threshold = threshold self.phon = None self.possible = None - self._line = input_line - self.chunks = Chunks(input_line) + self.input_line = input_line + self.chunks = Chunks(self) self.text = None def annotate(self): @@ -119,7 +118,6 @@ class Verse: def last_count(self): """return min number of syllables for last word""" - tot = 0 for chunk in self.chunks.chunks[::-1]: if chunk.original.endswith(' ') or chunk.original.endswith('-'): @@ -133,18 +131,10 @@ class Verse: return tot def problems(self): + errors = self.chunks.get_errors_set(self.template.options['forbidden_ok'], self.template.options['hiatus_ok']) result = [] - errors = set() if len(self.possible) == 0: result.append(error.ErrorBadMetric()) - for chunk in self.chunks.chunks: - if chunk.error is not None: - if chunk.error == "ambiguous" and not self.template.options['forbidden_ok']: - errors.add(error.ErrorForbiddenPattern) - if chunk.error == "hiatus" and not self.template.options['hiatus_ok']: - errors.add(error.ErrorHiatus) - if 
chunk.error == "illegal": - errors.add(error.ErrorBadCharacters) for k in errors: result.append(k()) return result @@ -160,3 +150,9 @@ class Verse: # try to infer gender even when metric is wrong result.update(set(self.feminine(None))) return result + + def print_n_syllables(self, n_syllables, offset, output_file): + self.annotate() + # only generate a context with the prescribed final weight + # where "final" is the offset-th chunk with a weight from the end + self.chunks.print_n_syllables(n_syllables, offset, output_file) diff --git a/plint/vowels.py b/plint/vowels.py @@ -3,45 +3,6 @@ """Compute the number of syllabes taken by a vowel chunk""" -from plint.common import strip_accents -from plint import diaeresis - -DEFAULT_THRESHOLD = 3 - - -def possible_weights_ctx(chunks, pos, threshold=None): - global DEFAULT_THRESHOLD - if not threshold: - threshold = DEFAULT_THRESHOLD - chunk = chunks[pos] - q = make_query(chunks, pos) - v = diaeresis.diaeresis_finder.lookup(q) - if len(v.keys()) == 1 and v[list(v.keys())[0]] > threshold: - return [int(list(v.keys())[0])] - else: - return possible_weights_seed(chunk) - - -def make_query(chunks, pos): - cleared = [clear(chunk) for chunk in chunks] - if cleared[pos].endswith(' '): - cleared[pos] = cleared[pos].rstrip() - if pos + 1 < len(cleared): - cleared[pos + 1] = " " + cleared[pos + 1] - else: - cleared.append(' ') - ret2 = intersperse( - ''.join(cleared[pos + 1:]), - ''.join([x[::-1] for x in cleared[:pos][::-1]])) - ret = [cleared[pos]] + ret2 - return ret - - -def clear(chunk): - if chunk.word_end == True: - return (chunk.text + ' ') - return chunk.text - def intersperse(left, right): if (len(left) == 0 or left[0] == ' ') and (len(right) == 0 or right[0] == ' '): @@ -53,98 +14,9 @@ def intersperse(left, right): return [left[0], right[0]] + intersperse(left[1:], right[1:]) -def possible_weights_approx(chunk): - """Return the possible number of syllabes taken by a vowel chunk (permissive approximation)""" - if len(chunk) == 1: - return [1] - # old spelling and weird exceptions - if chunk in ['ouï']: - return [1, 2] # TODO unsure about that - if chunk in ['eüi', 'aoû', 'uë']: - return [1] - if chunk in ['aïe', 'oë', 'ouü']: - return [1, 2] - if contains_trema(chunk): - return [2] - chunk = strip_accents(chunk, True) - if chunk in ['ai', 'ou', 'eu', 'ei', 'eau', 'eoi', 'eui', 'au', 'oi', - 'oie', 'œi', 'œu', 'eaie', 'aie', 'oei', 'oeu', 'ea', 'ae', 'eo', - 'eoie', 'oe', 'eai', 'eue', 'aa', 'oo', 'ee', 'ii', 'aii', - 'yeu', 'ye', 'you']: - return [1] - if chunk == "oua": - return [1, 2] # "pouah" - if chunk == "ao": - return [1, 2] # "paon" - for x in ['oa', 'ea', 'eua', 'euo', 'ua', 'uo', 'yau']: - if x in chunk: - return [2] - # beware of "déesse" - if chunk == 'ée': - return [1, 2] - if chunk[0] == 'i': - return [1, 2] - if chunk[0] == 'u' and (strip_accents(chunk[1]) in ['i', 'e']): - return [1, 2] - if chunk[0] == 'o' and chunk[1] == 'u' and len(chunk) >= 3 and strip_accents(chunk[2]) in ['i', 'e']: - return [1, 2] - if 'é' in chunk or 'è' in chunk: - return [2] - # we can't tell - return [1, 2] - - def contains_trema(chunk): """Test if a string contains a word with a trema""" for x in ['ä', 'ë', 'ï', 'ö', 'ü', 'ÿ']: if x in chunk: return True return False - - -def possible_weights_seed(chunk): - """Return the possible number of syllabes taken by a vowel chunk""" - if len(chunk.text) == 1: - return [1] - # dioïde, maoïste, taoïste - if (chunk.text[-1] == 'ï' and len(chunk.text) >= 3 and not - chunk.text[-3:-1] == 'ou'): - return [3] - # 
ostéoarthrite
-    if "éoa" in chunk.text:
-        return [3]
-    # antiaérien; but let's play it safe
-    if "iaé" in chunk.text:
-        return [2, 3]
-    # giaour, miaou, niaouli
-    if "iaou" in chunk.text:
-        return [2, 3]
-    # bioélectrique
-    if "ioé" in chunk.text:
-        return [2, 3]
-    # méiose, nucléion, etc.
-    if "éio" in chunk.text:
-        return [2, 3]
-    # radioactif, radioamateur, etc.
-    if "ioa" in chunk.text:
-        return [2, 3]
-    # pléiade
-    if "éio" in chunk.text:
-        return [2, 3]
-    # pompéien, tarpéien...
-    # in theory the "-ie" should give a diaeresis, so 3 syllabes
-    # let's keep the benefit of the doubt...
-    # => this also gives 3 as a possibility for "obéie"...
-    if "éie" in chunk.text:
-        return [2, 3]
-    # tolstoïen
-    # same remark
-    if "oïe" in chunk.text:
-        return [2, 3]
-    # shanghaïen (diaeresis?), but also "aië"
-    if "aïe" in chunk.text:
-        return [1, 2, 3]
-    if chunk.text in ['ai', 'ou', 'eu', 'ei', 'eau', 'au', 'oi']:
-        return [1]
-    # we can't tell
-    return [1, 2]
diff --git a/test.sh b/test.sh
@@ -5,12 +5,32 @@
 echo "It is normal that some errors occur when running this script" >/dev/stderr
 echo "See test_expected_output.out for the usual errors that are output" >/dev/stderr
 
+
+rm -f test_temp.txt;
+rm -f test_temp_sorted.txt;
+rm -f test_expected_sorted.txt;
+
 for a in plint/test_data/*.tpl; do
     echo "$a"
+    echo "$a" >> test_temp.txt
     if [[ $a == *cyrano_full* ]]
     then
-        ./plint.py $(pwd)/$a ../data/diaeresis_cyrano.json < $(pwd)/${a%.tpl}
+        ./plint.py $(pwd)/$a ../data/diaeresis_cyrano.json < $(pwd)/${a%.tpl} &>> test_temp.txt
     else
-        ./test_one.sh $(basename "${a%.tpl}")
+        ./test_one.sh $(basename "${a%.tpl}") &>> test_temp.txt
     fi
 done
+
+sort test_temp.txt > test_temp_sorted.txt;
+sort test_expected_output.out > test_expected_sorted.txt;
+
+if [ "$(python3 compare_test_output.py test_temp_sorted.txt test_expected_sorted.txt)" -eq 1 ]; then
+    echo "TEST SUCCEEDED";
+else
+    echo "TEST FAILED";
+    diff test_temp_sorted.txt test_expected_sorted.txt
+fi
+
+rm -f test_temp.txt;
+rm -f test_temp_sorted.txt;
+rm -f test_expected_sorted.txt
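
A few self-contained sketches of the pieces reworked above may help when reading the diff. First, the size guard in plint/plint_web.py: check() enforces MAX_POEM_LEN and MAX_LINE_LEN before any metric analysis runs. This is the function from the diff plus a hypothetical usage; the sample inputs are illustrative.

# The size guard from plint/plint_web.py, plus a hypothetical usage;
# the sample strings are illustrative.
MAX_POEM_LEN = 8192
MAX_LINE_LEN = 512


class TooBigException(Exception): pass


class TooLongLinesException(Exception): pass


def check(poem):
    # reject oversize submissions outright, then strip each line
    if len(poem) > MAX_POEM_LEN:
        raise TooBigException
    s = poem.split("\n")
    for x in range(len(s)):
        if len(s[x]) > MAX_LINE_LEN:
            raise TooLongLinesException
        s[x] = s[x].strip()
    return s


print(check("Maître Corbeau, sur un arbre perché,\nTenait en son bec un fromage."))
try:
    check("a" * (MAX_POEM_LEN + 1))
except TooBigException:
    print("rejected: poem exceeds %d characters" % MAX_POEM_LEN)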
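The checkjs handler throttles clients by keeping a global set of (ip, timestamp) pairs and expiring entries older than THROTTLE_DELAY on every request. A minimal standalone sketch of that sliding-window scheme; the delay value and the is_throttled helper name are illustrative, not names from the diff.

import time

THROTTLE_DELAY = 10  # seconds; illustrative, the real constant lives elsewhere in plint_web.py

throttle = set()  # (ip, timestamp) pairs, as in the q() handler above


def is_throttled(ip):  # hypothetical helper name
    """Expire old entries, then tell whether this IP must wait."""
    global throttle
    now = time.time()
    throttle = set(x for x in throttle if now - x[1] < THROTTLE_DELAY)
    if ip in (x[0] for x in throttle):
        return True
    throttle.add((ip, now))
    return False


print(is_throttled("192.0.2.1"))  # False: first request goes through
print(is_throttled("192.0.2.1"))  # True: second request inside the window is rejected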
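Pattern and its parse_metric move out of template.py into the new plint/pattern.py. parse_metric turns a metric description such as "6/6" into a verse length plus hemistich positions. A function-style rendering of the logic visible in the removed template.py lines, with plain ValueError standing in for error.TemplateLoadError:

def parse_metric(metric):
    # e.g. "6/6": two hemistiches of six syllables each
    verse = [int(x) for x in metric.split('/')]
    if any(i < 1 for i in verse):
        raise ValueError("Metric description should only contain positive integers")
    if sum(verse) > 16:
        raise ValueError("Metric length limit exceeded")
    hemistiches = []
    length = 0
    for v in verse:
        length += v
        hemistiches.append(length)
    length = hemistiches.pop()  # the last boundary is the verse end, not a caesura
    return length, hemistiches


print(parse_metric("6/6"))    # (12, [6]): the classical alexandrine
print(parse_metric("4/4/4"))  # (12, [4, 8]): a ternary division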
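The rewritten Template.parse_line names its constraint fields explicitly: a rhyme id such as "A:no|2" carries an optional suffix selecting sound-only rhyme and the number of common suffix phonemes. A reduced rendering of just that suffix parsing; the function name is illustrative, the real method feeds the two values into rhyme.Constraint on a Pattern.

def parse_constraint(my_id):
    # defaults match the diff: classical rhyme, one common suffix phoneme
    id_split = my_id.split(':')
    classical = True
    n_common_suffix_phones = 1
    if len(id_split) >= 2:
        constraint = id_split[-1].split('|')
        if len(constraint) > 0:
            classical = False if constraint[0] in ["no", "non"] else constraint[0]
        if len(constraint) > 1:
            n_common_suffix_phones = int(constraint[1])
    return classical, n_common_suffix_phones


print(parse_constraint("A"))       # (True, 1): default classical rhyme
print(parse_constraint("A:no|2"))  # (False, 2): sound-only, two common phonemes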
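Template option parsing now goes through the module-level OPTION_ALIASES table, which maps the French spellings accepted in template files onto canonical option names before str2bool is applied. A trimmed, self-contained sketch: the alias table here is a subset of the one in plint/template.py, the 'merge' special case is omitted, and error handling is simplified to ValueError.

OPTION_ALIASES = {  # subset of the table in plint/template.py
    'dierese': 'diaeresis',
    'repetition_ok': 'repeat_ok',
}


def str2bool(x):
    if x.lower() in ["yes", "oui", "y", "o", "true", "t", "vrai", "v"]:
        return True
    if x.lower() in ["no", "non", "n", "false", "faux", "f"]:
        return False
    raise ValueError("Bad value in global option")


def read_option(option_string, options):
    # resolve a French alias, then normalize or booleanize the value
    key, value = option_string.split(':')
    key = OPTION_ALIASES.get(key, key)
    if key == 'diaeresis':
        options[key] = 'classical' if value == 'classique' else value
    else:
        options[key] = str2bool(value)


options = {}
read_option('dierese:classique', options)
read_option('repetition_ok:non', options)
print(options)  # {'diaeresis': 'classical', 'repeat_ok': False}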
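Finally, the rhyme-gender bookkeeping in Template.match: genders are tracked per feminine id, and an id paired with its case-swapped twin (say 'a' and 'A') is read as an inequality constraint, so once one of them settles on a gender the other must take the opposite. A reduced sketch of that logic, omitting the special-cased literal 'M' and 'F' ids handled in the diff; the constrain helper is an illustrative name.

feminine_environment = {}


def constrain(feminine_id, observed_genders):
    # an id whose case-swapped twin is already pinned down starts from the complement
    if feminine_id.swapcase() in feminine_environment:
        new = {'M', 'F'} - feminine_environment[feminine_id.swapcase()]
        if len(new) > 0:
            feminine_environment[feminine_id] = new
    if feminine_id not in feminine_environment:
        feminine_environment[feminine_id] = {'M', 'F'}
    feminine_environment[feminine_id] &= set(observed_genders)
    return len(feminine_environment[feminine_id]) > 0  # False is where ErrorBadRhymeGenre fires


print(constrain('a', {'F'}))  # True: 'a' rhymes are feminine from now on
print(constrain('A', {'F'}))  # False: 'A' was forced masculine, a gender error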