commit bdf7000c91d00282d7bc5595459458abbbbbe4df
parent b308017c164fe7bd94a6be8a2c3993a35db1cee0
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Thu, 15 Aug 2019 15:26:47 +0200
Merge gitlab.com:a3nm/plint
Merge my own commits with Julien's
Diffstat:
22 files changed, 1280 insertions(+), 1158 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,6 @@
__pycache__/*
+.idea
+Lexique382.tsv
frhyme
frhyme/*
haspirater
@@ -35,3 +37,4 @@ final_syneresis2.ctx
coverage
.coverage
ouliplint/stanford-postagger-full-2013-11-12/
+test_temp.txt
diff --git a/compare_test_output.py b/compare_test_output.py
@@ -0,0 +1,12 @@
+import sys
+
+file0 = sys.argv[1]
+file1 = sys.argv[2]
+
+with open(file0) as f:
+ content0 = f.read()
+
+with open(file1) as f:
+ content1 = f.read()
+
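+# NB: sorted() sorts the characters of each file, so this prints 1 when
+# the two files are equal up to reordering of their characters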
+print(int(sorted(content0) == sorted(content1)))
diff --git a/lexique_comparison/count_syllables_plint.py b/lexique_comparison/count_syllables_plint.py
@@ -4,12 +4,14 @@ import os
import sys
# modules are in the parent folder
+import plint.pattern
+
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from plint import template, verse, rhyme
templateobj = template.Template()
-patternobj = template.Pattern("12")
+patternobj = plint.pattern.Pattern("12")
for l in sys.stdin.readlines():
w = (l.strip().split("\t"))[0]
diff --git a/plint.py b/plint.py
@@ -5,64 +5,64 @@ import sys
def run():
- ok = True
- f2 = None
- nsyl = None
- offset = 0
- if len(sys.argv) >= 4:
- f2 = open(sys.argv[3], 'w')
- if len(sys.argv) >= 5:
- nsyl = int(sys.argv[4])
- if len(sys.argv) == 6:
- offset = int(sys.argv[5])
- should_end = False
- while True:
- line = sys.stdin.readline()
- if not line:
- should_end = True
- line = ""
- errors = template.check(line, f2, last=should_end, nsyl=nsyl, offset=offset)
- if errors:
- print(errors.report(), file=sys.stderr)
- ok = False
- if should_end:
- break
- return ok
+ ok = True
+ f2 = None
+ nsyl = None
+ offset = 0
+ if len(sys.argv) >= 4:
+ f2 = open(sys.argv[3], 'w')
+ if len(sys.argv) >= 5:
+ nsyl = int(sys.argv[4])
+ if len(sys.argv) == 6:
+ offset = int(sys.argv[5])
+ should_end = False
+ while True:
+ line = sys.stdin.readline()
+ if not line:
+ should_end = True
+ line = ""
+ errors = template.check(line, f2, last=should_end, n_syllables=nsyl, offset=offset)
+ if errors:
+ print(errors.report(), file=sys.stderr)
+ ok = False
+ if should_end:
+ break
+ return ok
-if __name__ == '__main__':
- localization.init_locale()
- if len(sys.argv) < 2 or len(sys.argv) > 6:
- print(_("Usage: %s TEMPLATE [DFILE [OCONTEXT [NSYL [OFFSET]]]]") % sys.argv[0],
- file=sys.stderr)
- print(_("Check stdin according to TEMPLATE, report errors on stdout"),
- file=sys.stderr)
- print(_("For internal use:"),
- file=sys.stderr)
- print(_("DFILE is the diaeresis file, OCONTEXT is the context output file"),
- file=sys.stderr)
- print(_("NSYL is the assigned weight to the last chunk (diaeresis training)"),
- file=sys.stderr)
- print(_("OFFSET is to add after the last chunk (diaeresis training)"),
- file=sys.stderr)
- sys.exit(2)
- template_name = sys.argv[1]
- if len(sys.argv) > 2:
- diaeresis_name = sys.argv[2]
- else:
- diaeresis_name = "../data/diaeresis.json"
- diaeresis.set_diaeresis(diaeresis_name)
+if __name__ == '__main__':
+ localization.init_locale()
+ if len(sys.argv) < 2 or len(sys.argv) > 6:
+ print(_("Usage: %s TEMPLATE [DFILE [OCONTEXT [NSYL [OFFSET]]]]") % sys.argv[0],
+ file=sys.stderr)
+ print(_("Check stdin according to TEMPLATE, report errors on stdout"),
+ file=sys.stderr)
+ print(_("For internal use:"),
+ file=sys.stderr)
+ print(_("DFILE is the diaeresis file, OCONTEXT is the context output file"),
+ file=sys.stderr)
+ print(_("NSYL is the assigned weight to the last chunk (diaeresis training)"),
+ file=sys.stderr)
+ print(_("OFFSET is to add after the last chunk (diaeresis training)"),
+ file=sys.stderr)
+ sys.exit(2)
- f = open(template_name)
- x = f.read()
- f.close()
+ template_name = sys.argv[1]
+ if len(sys.argv) > 2:
+ diaeresis_name = sys.argv[2]
+ else:
+ diaeresis_name = "../data/diaeresis.json"
+ diaeresis.set_diaeresis(diaeresis_name)
- try:
- template = template.Template(x)
- except error.TemplateLoadError as e:
- print("Could not load template %s: %s" % (template_name, e.msg), file=sys.stderr)
- sys.exit(2)
+ f = open(template_name)
+ x = f.read()
+ f.close()
- ok = run()
- sys.exit(0 if ok else 1)
+ try:
+ template = template.Template(x)
+ except error.TemplateLoadError as e:
+ print("Could not load template %s: %s" % (template_name, e.msg), file=sys.stderr)
+ sys.exit(2)
+ ok = run()
+ sys.exit(0 if ok else 1)
diff --git a/plint/chunk.py b/plint/chunk.py
@@ -0,0 +1,595 @@
+import re
+
+from haspirater import haspirater
+from plint import common, diaeresis, error
+from plint.common import normalize, strip_accents_one, is_consonants, APOSTROPHES, is_vowels, get_consonants_regex, \
+ strip_accents, SURE_END_FEM
+from plint.vowels import contains_trema, intersperse
+
+
+DEFAULT_THRESHOLD = 3
+
+
+class Chunk:
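+ """A fragment of a word (typically a run of vowels or of consonants),
+ together with the annotations computed on it while checking a verse
+ (weights, elision, hemistiche, errors...)."""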
+
+ def __init__(self, word, verse):
+ self.original = word
+ self.text = normalize(word, rm_apostrophe=True)
+ self.hemistiche = None
+ self.error = None
+ self.illegal_str = None
+ self.weights = None
+ self.had_hyphen = None
+ self.text_pron = None
+ self.elision = None
+ self.no_hiatus = None
+ self.elidable = None
+ self.word_end = False
+ # TODO What is a weight without s?
+ self.weight = None
+ self.verse = verse
+
+ def __repr__(self):
+ return "Chunk(" \
+ + "original:" + self.original \
+ + ", text:" + self.text \
+ + ", weights:" + str(self.weights or []) \
+ + ", weight:" + str(self.weight or "") \
+ + ", elidable:" + str(self.elidable or False) \
+ + ", elision:" + str(self.elision or False) \
+ + ", hemistiche:" + str(self.hemistiche) \
+ + ", error:" + str(self.error) \
+ + ", illegal_str:" + str(self.illegal_str) \
+ + ", had_hypher:" + str(self.had_hyphen) \
+ + ", text_pron:" + str(self.text_pron) \
+ + ", no_hiatus:" + str(self.no_hiatus) \
+ + ", word_end:" + str(self.word_end) \
+ + ")" + "\n"
+
+ def copy(self):
+ new_chunk = Chunk(self.original, self.verse)
+ new_chunk.original = self.original
+ new_chunk.text = self.text
+ new_chunk.hemistiche = self.hemistiche
+ new_chunk.error = self.error
+ new_chunk.illegal_str = self.illegal_str
+ new_chunk.weights = self.weights
+ new_chunk.had_hyphen = self.had_hyphen
+ new_chunk.text_pron = self.text_pron
+ new_chunk.elision = self.elision
+ new_chunk.no_hiatus = self.no_hiatus
+ new_chunk.elidable = self.elidable
+ new_chunk.word_end = self.word_end
+ new_chunk.weight = self.weight
+ return new_chunk
+
+ def set_hemistiche(self, hemis):
+ self.hemistiche = hemis
+
+ def check_forbidden_characters(self):
+ es = ""
+ for x in self.text:
+ if not common.remove_punctuation(strip_accents_one(x)[0].lower()) in common.LEGAL:
+ es += 'I'
+ self.error = "illegal"
+ else:
+ es += ' '
+ if self.error is not None and self.error == "illegal":
+ self.illegal_str = es
+
+ def simplify_gu_qu(self, next_chunk):
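+ # the 'u' of "qu" and of "gu" before e/é/è/a is silent: drop it from
+ # the following vowel chunk so it does not count as a vowel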
+ if next_chunk.text.startswith('u'):
+ if self.text.endswith('q'):
+ next_chunk.text = next_chunk.text[1:]
+ if next_chunk.text == '':
+ self.original += next_chunk.original
+ next_chunk.original = ''
+ if self.text.endswith('g') and len(next_chunk.text) >= 2:
+ if next_chunk.text[1] in "eéèa":
+ next_chunk.text = next_chunk.text[1:]
+
+ def elide_inside_words(self, all_next_chunks):
+ if self.text == "e-":
+ self.weights = [0] # force elision
+ next_chunk = all_next_chunks[0]
+ if self.text == "e" and next_chunk.text.startswith("-h"):
+ # collect what follows until the next hyphen or end
+ flw = next_chunk.original.split('-')[1]
+ for future_chunk in all_next_chunks[1:]:
+ flw += future_chunk.original.split('-')[0]
+ if '-' in future_chunk.original:
+ break
+ # TODO: not sure if this reconstruction of the original word is bulletproof...
+ if haspirater.lookup(normalize(flw)):
+ self.weights = [0]
+ else:
+ self.weights = [1]
+
+ def remove_leading_and_trailing_crap(self):
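+ # strip leading and trailing spaces and hyphens; remember when a hyphen
+ # was removed with no space around (word-internal hyphen, as in "est-ce")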
+ seen_space = False
+ seen_hyphen = False
+ while len(self.text) > 0 and self.text[0] in ' -':
+ if self.text[0] == ' ':
+ seen_space = True
+ else:
+ seen_hyphen = True
+ self.text = self.text[1:]
+ while len(self.text) > 0 and self.text[-1] in ' -':
+ if self.text[-1] == ' ':
+ seen_space = True
+ else:
+ seen_hyphen = True
+ self.text = self.text[:-1]
+ if seen_hyphen and not seen_space:
+ self.had_hyphen = True
+
+ def is_empty(self):
+ return len(self.text) == 0
+
+ def add_original(self, other_chunk):
+ self.original += other_chunk.original
+
+ def create_acronym(self):
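+ # an all-consonant chunk is read as an acronym: expand it into the
+ # names of its letters, using LETTERS below (e.g. 'f' -> "effe")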
+ new_chunks = []
+ for j, character in enumerate(self.text):
+ try:
+ new_chunk_content = LETTERS[character]
+ # hack: the final 'e's in letters are just to help pronunciation
+ # inference and are only needed at end of word, otherwise they will
+ # mess syllable count up
+ if j < len(self.text) - 1 and new_chunk_content[-1] == 'e':
+ new_chunk_content = new_chunk_content[:-1]
+ except KeyError:
+ new_chunk_content = character + 'é'
+ new_chunks += [(j, x) for x in re.split(get_consonants_regex(), new_chunk_content)]
+ new_chunks = [x for x in new_chunks if len(x[1]) > 0]
+ new_word = []
+ last_opos = -1
+ for j, (original_position, character) in enumerate(new_chunks):
+ part = ""
+ if j == len(new_chunks) - 1:
+ # don't miss final spaces
+ part = self.original[last_opos + 1:]
+ elif last_opos < original_position:
+ part = self.original[last_opos + 1:original_position + 1]
+ last_opos = original_position
+ # allow or forbid elision because of possible ending '-e' before
+ # forbid hiatus both for this and for preceding
+ # instruct that we must use text for the pronunciation
+ new_chunk = Chunk(part, self.verse)
+ new_chunk.original = part
+ new_chunk.text = character
+ new_chunk.text_pron = True
+ new_chunk.elision = [False, True]
+ new_chunk.no_hiatus = True
+ new_word.append(new_chunk)
+ # propagate information from splithyph
+ new_word[-1].hemistiche = self.hemistiche
+ return new_word
+
+ def check_elidable(self):
+ if self.text == 'e':
+ self.elidable = [True]
+
+ def is_consonants(self):
+ return is_consonants(self.text)
+
+ def ends_with_apostrophe(self):
+ return re.search("[" + APOSTROPHES + "]$", self.original) is not None
+
+ def elide_vowel_problems(self, chunk_group):
+ if self.elision is None:
+ self.elision = elision_wrap(chunk_group)
+
+ def process_y_cases(self, previous_chunk, next_chunk):
+ new_word_from_chunk = []
+ if 'y' not in self.text or len(self.text) == 1 or self.text.startswith("y"):
+ new_word_from_chunk.append(self)
+ else:
+ if previous_chunk is not None and next_chunk is not None:
+ # special cases of "pays", "alcoyle", "abbayes"
+ c_text = self.text
+ p_text = previous_chunk.text
+ n_text = next_chunk.text
+ # TODO Should you force if this condition does not apply?
+ if ((c_text == "ay" and p_text.endswith("p") and n_text.startswith("s"))
+ or
+ (c_text == "oy" and p_text.endswith("lc")
+ and n_text.startswith("l"))
+ or
+ (c_text == "aye" and p_text.endswith("bb")
+ and n_text.startswith("s"))):
+ # force weight
+ self.weights = [2]
+ new_word_from_chunk.append(self)
+ return new_word_from_chunk
+ must_force = next_chunk is None and previous_chunk is not None and \
+ (self.text == "aye" and previous_chunk.text.endswith("bb"))
+ if must_force:
+ # force weight
+ self.weights = [2]
+ new_word_from_chunk.append(self)
+ else:
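+ # general case: split around the 'y's and mark them 'Y' so that the
+ # following annotation passes can treat them specially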
+ sub_chunks = re.split(re.compile("(y+)"), self.text)
+ sub_chunks = [x for x in sub_chunks if len(x) > 0]
+ for j, sub_chunk in enumerate(sub_chunks):
+ lindex = int(j * len(self.original) / len(sub_chunks))
+ rindex = int((j + 1) * len(self.original) / len(sub_chunks))
+ part = self.original[lindex:rindex]
+ new_subchunk_text = 'Y' if 'y' in sub_chunk else sub_chunk
+ new_subchunk = self.copy()
+ new_subchunk.original = part
+ new_subchunk.text = new_subchunk_text
+ new_word_from_chunk.append(new_subchunk)
+ return new_word_from_chunk
+
+ def is_vowels(self):
+ return is_vowels(self.text)
+
+ def is_dash_elidable(self):
+ # "fais-le" not elidable, but "suis-je" and "est-ce" is
+ return not ('-' in self.text and not self.text.endswith('-j') and not self.text.endswith('-c'))
+
+ def check_elidable_with_next(self, next_chunk):
+ if self.elidable is None:
+ self.elidable = next_chunk.elision
+
+ def is_potentially_ambiguous_hiatus(self):
+ return self.text in ["ie", "ée", "ue"]
+
+ def ends_with_potentially_ambiguous_hiatus(self):
+ return len(self.text) >= 2 and self.text[-2:] in ["ie", "ée", "ue"]
+
+ def check_potentially_ambiguous_plural(self, previous_chunk):
+ if self.text == "s":
+ if previous_chunk.is_potentially_ambiguous_hiatus():
+ previous_chunk.error = "ambiguous"
+ self.error = "ambiguous"
+
+ def check_potentially_ambiguous_with_elision(self, next_chunk):
+ if self.ends_with_potentially_ambiguous_hiatus():
+ if next_chunk.elision is not None and True not in next_chunk.elision:
+ self.error = "ambiguous"
+ next_chunk.error = "ambiguous"
+
+ def check_hiatus(self, previous_chunk, next_chunk, only_two_parts):
+ if previous_chunk is not None:
+ self.check_potentially_ambiguous_plural(previous_chunk)
+ if self.ends_with_potentially_ambiguous_hiatus():
+ if not any(next_chunk.elision or [False]):
+ self.error = "ambiguous"
+ next_chunk.error = "ambiguous"
+
+ # hiatus concerns words ending with a vowel without a mute 'e'
+ # that have not been marked "no_hiatus"
+ # it also concerns specifically "et"
+ elif (not self.text.endswith('e') and self.no_hiatus is None
+ and (self.is_vowels() or self.text == 'Y')
+ or (only_two_parts and previous_chunk.text == 'e' and self.text == 't')):
+ # it happens if the next word is not marked no_hiatus
+ # and starts with something that causes elision
+ if all(next_chunk.elision) and next_chunk.no_hiatus is None:
+ self.error = "hiatus"
+ next_chunk.error = "hiatus"
+
+ def make_word_end(self):
+ self.word_end = True
+
+ def contains_break(self):
+ return '-' in self.text \
+ or self.word_end or False \
+ or self.had_hyphen or False
+
+ def is_e(self):
+ return self.text == "e"
+
+ def possible_weights_approx(self):
+ """Return the possible number of syllabes taken by a vowel chunk (permissive approximation)"""
+ chunk_text = self.text
+ if len(chunk_text) == 1:
+ return [1]
+ # old spelling and weird exceptions
+ if chunk_text in ['ouï']:
+ return [1, 2] # TODO unsure about that
+ if chunk_text in ['eüi', 'aoû', 'uë']:
+ return [1]
+ if chunk_text in ['aïe', 'oë', 'ouü']:
+ return [1, 2]
+ if contains_trema(chunk_text):
+ return [2]
+ chunk_text = strip_accents(chunk_text, True)
+ if chunk_text in ['ai', 'ou', 'eu', 'ei', 'eau', 'eoi', 'eui', 'au', 'oi',
+ 'oie', 'œi', 'œu', 'eaie', 'aie', 'oei', 'oeu', 'ea', 'ae', 'eo',
+ 'eoie', 'oe', 'eai', 'eue', 'aa', 'oo', 'ee', 'ii', 'aii',
+ 'yeu', 'ye', 'you']:
+ return [1]
+ if chunk_text == "oua":
+ return [1, 2] # "pouah"
+ if chunk_text == "ao":
+ return [1, 2] # "paon"
+ for x in ['oa', 'ea', 'eua', 'euo', 'ua', 'uo', 'yau']:
+ if x in chunk_text:
+ return [2]
+ # beware of "déesse"
+ if chunk_text == 'ée':
+ return [1, 2]
+ if chunk_text[0] == 'i':
+ return [1, 2]
+ if chunk_text[0] == 'u' and (strip_accents(chunk_text[1]) in ['i', 'e']):
+ return [1, 2]
+ if chunk_text[0] == 'o' and chunk_text[1] == 'u' and len(chunk_text) >= 3 and\
+ strip_accents(chunk_text[2]) in ['i', 'e']:
+ return [1, 2]
+ if 'é' in chunk_text or 'è' in chunk_text:
+ return [2]
+ # we can't tell
+ return [1, 2]
+
+ def clear(self):
+ if self.word_end is None or not self.word_end:
+ return self.text
+ return self.text + ' '
+
+ def set_possible_weights_from_context(self, chunks_before, chunks_after, template, threshold):
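+ # decide how many syllables this vowel chunk may count for (its
+ # "weights"), from the neighbouring chunks and the template options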
+ if self.weights is not None:
+ return
+ if len(chunks_after) > 0:
+ next_chunk = chunks_after[0]
+ else:
+ next_chunk = None
+
+ if len(chunks_before) > 0:
+ previous_chunk = chunks_before[-1]
+ else:
+ previous_chunk = None
+
+ if len(chunks_before) > 1:
+ previous_previous_chunk = chunks_before[-2]
+ else:
+ previous_previous_chunk = None
+
+ if ((len(chunks_after) <= 1 and self.is_e())
+ and not (next_chunk is not None and next_chunk.is_vowels())
+ and not (previous_chunk is None or previous_chunk.contains_break())
+ and not (previous_previous_chunk is None or previous_previous_chunk.contains_break())):
+ # special case for verse endings, which can get elided (or not)
+ # but we don't elide lone syllables ("prends-le", etc.)
+
+ if next_chunk is None:
+ self.weights = [0] # ending 'e' is elided
+ elif next_chunk.text == 's':
+ self.weights = [0] # ending 'es' is elided
+ elif next_chunk.text == 'nt':
+ # ending 'ent' is sometimes elided, try to use pronunciation
+ # actually, this will have an influence on the rhyme's gender
+ # see feminine
+ possible = []
+ if not self.verse.phon or len(self.verse.phon) == 0:
+ self.weights = [0, 1] # do something reasonable without pron
+ else:
+ for possible_phon in self.verse.phon:
+ if possible_phon.endswith(')') or possible_phon.endswith('#'):
+ possible.append(1)
+ else:
+ possible.append(0)
+ self.weights = possible
+ else:
+ self.weights = self.possible_weights(chunks_before, chunks_after, template, threshold)
+ elif (next_chunk is None and self.text == 'e' and
+ previous_chunk is not None and (previous_chunk.text.endswith('-c')
+ or previous_chunk.text.endswith('-j')
+ or (previous_chunk.text == 'c'
+ and previous_chunk.had_hyphen is not None)
+ or (previous_chunk.text == 'j'
+ and previous_chunk.had_hyphen is not None))):
+ self.weights = [0] # -ce and -je are elided
+ elif next_chunk is None and self.text in ['ie', 'ée']:
+ self.weights = [1]
+ # elide "-ée" and "-ées", but be specific (beware of e.g. "réel")
+ elif (len(chunks_after) <= 1
+ and self.text == 'ée'
+ and (next_chunk is None or chunks_after[-1].text == 's')):
+ self.weights = [1]
+ elif self.elidable is not None:
+ self.weights = [int(not x) for x in self.elidable]
+ else:
+ self.weights = self.possible_weights(chunks_before, chunks_after, template, threshold)
+
+ def possible_weights(self, chunks_before, chunks_after, template, threshold):
+ if template.options['diaeresis'] == "classical":
+ return self.possible_weights_ctx(chunks_before, chunks_after, threshold=threshold)
+ elif template.options['diaeresis'] == "permissive":
+ return self.possible_weights_approx()
+
+ def possible_weights_ctx(self, chunks_before, chunks_after, threshold=None):
+ if not threshold:
+ threshold = DEFAULT_THRESHOLD
+ q = self.make_query(chunks_before, chunks_after)
+ v = diaeresis.diaeresis_finder.lookup(q)
+ if len(v.keys()) == 1 and v[list(v.keys())[0]] > threshold:
+ return [int(list(v.keys())[0])]
+ else:
+ return self.possible_weights_seed()
+
+ def make_query(self, chunks_before, chunks_after):
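+ # build the context query used to look up this chunk in the diaeresis
+ # data: the chunk itself, then the surrounding text interspersed by
+ # plint.vowels.intersperse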
+ cleaned_before = [chunk.clear() for chunk in chunks_before]
+ cleaned_after = [chunk.clear() for chunk in chunks_after]
+ current_clear = self.clear()
+ if current_clear.endswith(' '):
+ current_clear = current_clear.rstrip()
+ if len(cleaned_after) > 0:
+ cleaned_after[0] = " " + cleaned_after[0]
+ else:
+ cleaned_after.append(' ')
+ ret2 = intersperse(
+ ''.join(cleaned_after),
+ ''.join([x[::-1] for x in cleaned_before[::-1]]))
+ ret = [current_clear] + ret2
+ return ret
+
+ def possible_weights_seed(self):
+ """Return the possible number of syllabes taken by a vowel chunk"""
+ if len(self.text) == 1:
+ return [1]
+ # dioïde, maoïste, taoïste
+ if (self.text[-1] == 'ï' and len(self.text) >= 3 and not
+ self.text[-3:-1] == 'ou'):
+ return [3]
+ # ostéoarthrite
+ if "éoa" in self.text:
+ return [3]
+ # antiaérien; but let's play it safe
+ if "iaé" in self.text:
+ return [2, 3]
+ # giaour, miaou, niaouli
+ if "iaou" in self.text:
+ return [2, 3]
+ # bioélectrique
+ if "ioé" in self.text:
+ return [2, 3]
+ # méiose, nucléion, etc.
+ if "éio" in self.text:
+ return [2, 3]
+ # radioactif, radioamateur, etc.
+ if "ioa" in self.text:
+ return [2, 3]
+ # pléiade
+ if "éio" in self.text:
+ return [2, 3]
+ # pompéien, tarpéien...
+ # in theory the "-ie" should give a diaeresis, so 3 syllabes
+ # let's keep the benefit of the doubt...
+ # => this also gives 3 as a possibility for "obéie"...
+ if "éie" in self.text:
+ return [2, 3]
+ # tolstoïen
+ # same remark
+ if "oïe" in self.text:
+ return [2, 3]
+ # shanghaïen (diaeresis?), but also "aië"
+ if "aïe" in self.text:
+ return [1, 2, 3]
+ if self.text in ['ai', 'ou', 'eu', 'ei', 'eau', 'au', 'oi']:
+ return [1]
+ # we can't tell
+ return [1, 2]
+
+ def set_hemistiche_from_context(self, previous_previous_chunk, previous_chunk, next_chunk):
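+ # tag whether a hemistiche may end on this chunk: "ok", "cut"
+ # (falls inside a word), "fem" (feminine ending) or "elid" (only
+ # acceptable if the final 'e' is actually elided)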
+ if self.hemistiche is not None:
+ return
+ ending = self.text
+ if not (self.word_end or False) and next_chunk is not None:
+ if not (next_chunk.word_end or False):
+ self.hemistiche = "cut"
+ return
+ ending += next_chunk.text
+ if ending in SURE_END_FEM and previous_previous_chunk is not None and previous_chunk is not None:
+ # check that this isn't a one-syllable word (which is allowed)
+ ok = False
+ try:
+ if '-' in previous_chunk.original or (previous_chunk.word_end or False):
+ ok = True
+ if '-' in previous_previous_chunk.original or (previous_previous_chunk.word_end or False):
+ ok = True
+ except IndexError:
+ pass
+ if not ok:
+ # hemistiche ends in feminine
+ if any(self.elidable or [False]):
+ self.hemistiche = "elid" # elidable final -e, but only OK if actually elided
+ return
+ else:
+ self.hemistiche = "fem"
+ return
+ self.hemistiche = "ok"
+
+ def normalize(self):
+ if self.text_pron is None:
+ return normalize(self.original, strip=False, rm_apostrophe_end=False)
+ else:
+ return self.text
+
+ def get_original_text(self):
+ return self.original
+
+ def get_errors_set(self, forbidden_ok, hiatus_ok):
+ errors_chunk = set()
+ if self.error is not None:
+ if self.error == "ambiguous" and not forbidden_ok:
+ errors_chunk.add(error.ErrorForbiddenPattern)
+ if self.error == "hiatus" and not hiatus_ok:
+ errors_chunk.add(error.ErrorHiatus)
+ if self.error == "illegal":
+ errors_chunk.add(error.ErrorBadCharacters)
+ return errors_chunk
+
+
+LETTERS = {
+ 'f': 'effe',
+ 'h': 'ache',
+ 'j': 'gi',
+ 'k': 'ka',
+ 'l': 'elle',
+ 'm': 'aime',
+ 'n': 'aine',
+ 'q': 'cu',
+ 'r': 'ère',
+ 's': 'esse',
+ 'w': 'doublevé',
+ 'x': 'ixe',
+ 'z': 'zaide'
+}
+
+
+def elision_wrap(chunk_group):
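+ # compute the elision behaviour of a whole word given as a chunk group,
+ # noting whether its first letter was capitalized (proper nouns differ)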
+ first_letter = common.remove_punctuation(chunk_group[0].original.strip())
+ temp = elision(''.join(chunk.text for chunk in chunk_group),
+ ''.join(chunk.original for chunk in chunk_group),
+ first_letter == first_letter.upper())
+ return temp
+
+
+def elision(word, original_word, was_cap):
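+ # return the possible elision behaviours before this word: True means a
+ # preceding mute 'e' elides; both values are returned when ambiguous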
+ if word.startswith('y'):
+ if word == 'y':
+ return [True]
+ if was_cap:
+ if word == 'york':
+ return [False]
+ # Grevisse, Le Bon usage, 14th ed., paragraphs 49-50
+ # depends on whether it's French or foreign...
+ return [True, False]
+ else:
+ exc = ["york", "yeux", "yeuse", "ypérite"]
+ for w in exc:
+ if word.startswith(w):
+ return [True]
+ # otherwise, no elision
+ return [False]
+ if word in ["oui", "ouis"]:
+ # elision for those words, but beware, no elision for "ouighour"
+ # Boileau: "Ont l'esprit mieux tourné que n'a l'homme ? Oui sans doute."
+ # so elision sometimes
+ return [True, False]
+ if word.startswith("ouistiti") or word.startswith("ouagadougou"):
+ return [False]
+ # "un", "une" are non-elided as nouns ("cette une")
+ if word in ["un", "une"]:
+ return [True, False]
+ # "onze" is not elided
+ if word == "onze":
+ return [False]
+ if word.startswith('ulul'):
+ return [False] # ululement, ululer, etc.
+ if word.startswith('uhlan'):
+ return [False] # uhlan
+ if word[0] == 'h':
+ if word == "huis":
+ # special case, "huis" is elided but "huis clos" isn't
+ return [True, False]
+ # look up in haspirater using the original (but normalized) word
+ return list(map((lambda s: not s),
+ haspirater.lookup(normalize(original_word))))
+ if is_vowels(word[0]):
+ return [True]
+ return [False]
diff --git a/plint/chunks.py b/plint/chunks.py
@@ -2,298 +2,18 @@ import re
import sys
from pprint import pprint
-from haspirater import haspirater
-from plint import common, vowels
-from plint.common import is_vowels, APOSTROPHES, is_consonants, normalize, strip_accents_one, CONSONANTS, SURE_END_FEM
+from plint.chunk import Chunk
+from plint.common import normalize, get_consonants_regex
from plint.hyphen_splitter import HyphenSplitter
-class Chunk:
-
- def __init__(self, word):
- self.original = word
- self.text = normalize(word, rm_apostrophe=True)
- self.hemistiche = None
- self.error = None
- self.illegal_str = None
- self.weights = None
- self.had_hyphen = None
- self.text_pron = None
- self.elision = None
- self.no_hiatus = None
- self.elidable = None
- self.word_end = False
- # TODO What is a weight without s?
- self.weight = None
-
- def __repr__(self):
- return "Chunk("\
- + "original:" + self.original\
- + ", text:" + self.text\
- + ", weights:" + str(self.weights or [])\
- + ", weight:" + str(self.weight or "")\
- + ", elidable:" + str(self.elidable or False)\
- + ", elision:" + str(self.elision or False)\
- + ", hemistiche:" + str(self.hemistiche)\
- + ", error:" + str(self.error)\
- + ", illegal_str:" + str(self.illegal_str)\
- + ", had_hypher:" + str(self.had_hyphen)\
- + ", text_pron:" + str(self.text_pron)\
- + ", no_hiatus:" + str(self.no_hiatus)\
- + ", word_end:" + str(self.word_end)\
- + ")" + "\n"
-
- def copy(self):
- new_chunk = Chunk(self.original)
- new_chunk.original = self.original
- new_chunk.text = self.text
- new_chunk.hemistiche = self.hemistiche
- new_chunk.error = self.error
- new_chunk.illegal_str = self.illegal_str
- new_chunk.weights = self.weights
- new_chunk.had_hyphen = self.had_hyphen
- new_chunk.text_pron = self.text_pron
- new_chunk.elision = self.elision
- new_chunk.no_hiatus = self.no_hiatus
- new_chunk.elidable = self.elidable
- new_chunk.word_end = self.word_end
- new_chunk.weight = self.weight
- return new_chunk
-
- def set_hemistiche(self, hemis):
- self.hemistiche = hemis
-
- def check_forbidden_characters(self):
- es = ""
- for x in self.text:
- if not common.remove_punctuation(strip_accents_one(x)[0].lower()) in common.LEGAL:
- es += 'I'
- self.error = "illegal"
- else:
- es += ' '
- if self.error is not None and self.error == "illegal":
- self.illegal_str = es
-
- def simplify_gu_qu(self, next_chunk):
- if next_chunk.text.startswith('u'):
- if self.text.endswith('q'):
- next_chunk.text = next_chunk.text[1:]
- if next_chunk.text == '':
- self.original += next_chunk.original
- next_chunk.original = ''
- if self.text.endswith('g') and len(next_chunk.text) >= 2:
- if next_chunk.text[1] in "eéèa":
- next_chunk.text = next_chunk.text[1:]
-
- def elide_inside_words(self, all_next_chunks):
- if self.text == "e-":
- self.weights = [0] # force elision
- next_chunk = all_next_chunks[0]
- if self.text == "e" and next_chunk.text.startswith("-h"):
- # collect what follows until the next hyphen or end
- flw = next_chunk.original.split('-')[1]
- for future_chunk in all_next_chunks[1:]:
- flw += future_chunk.original.split('-')[0]
- if '-' in future_chunk.original:
- break
- # TODO: not sure if this reconstruction of the original word is bulletproof...
- if haspirater.lookup(normalize(flw)):
- self.weights = [0]
- else:
- self.weights = [1]
-
- def remove_leading_and_trailing_crap(self):
- seen_space = False
- seen_hyphen = False
- while len(self.text) > 0 and self.text[0] in ' -':
- if self.text[0] == ' ':
- seen_space = True
- else:
- seen_hyphen = True
- self.text = self.text[1:]
- while len(self.text) > 0 and self.text[-1] in ' -':
- if self.text[-1] == ' ':
- seen_space = True
- else:
- seen_hyphen = True
- self.text = self.text[:-1]
- if seen_hyphen and not seen_space:
- self.had_hyphen = True
-
- def is_empty(self):
- return len(self.text) == 0
-
- def add_original(self, other_chunk):
- self.original += other_chunk.original
-
- def create_sigles(self):
- new_chunks = []
- for j, character in enumerate(self.text):
- try:
- new_chunk_content = LETTERS[character]
- # hack: the final 'e's in letters are just to help pronunciation
- # inference and are only needed at end of word, otherwise they will
- # mess syllable count up
- if j < len(self.text) - 1 and new_chunk_content[-1] == 'e':
- new_chunk_content = new_chunk_content[:-1]
- except KeyError:
- new_chunk_content = character + 'é'
- new_chunks += [(j, x) for x in re.split(get_consonants_regex(), new_chunk_content)]
- new_chunks = [x for x in new_chunks if len(x[1]) > 0]
- new_word = []
- last_opos = -1
- for j, (original_position, character) in enumerate(new_chunks):
- part = ""
- if j == len(new_chunks) - 1:
- # don't miss final spaces
- part = self.original[last_opos + 1:]
- elif last_opos < original_position:
- part = self.original[last_opos + 1:original_position + 1]
- last_opos = original_position
- # allow or forbid elision because of possible ending '-e' before
- # forbid hiatus both for this and for preceding
- # instruct that we must use text for the pronunciation
- new_chunk = Chunk(part)
- new_chunk.original = part
- new_chunk.text = character
- new_chunk.text_pron = True
- new_chunk.elision = [False, True]
- new_chunk.no_hiatus = True
- new_word.append(new_chunk)
- # propagate information from splithyph
- new_word[-1].hemistiche = self.hemistiche
- return new_word
-
- def check_elidable(self):
- if self.text == 'e':
- self.elidable = [True]
-
- def is_consonants(self):
- return is_consonants(self.text)
-
- def ends_with_apostrophe(self):
- return re.search("[" + APOSTROPHES + "]$", self.original) is not None
-
- def elide_vowel_problems(self, chunk_group):
- if self.elision is None:
- self.elision = elision_wrap(chunk_group)
-
- def process_y_cases(self, previous_chunk, next_chunk):
- new_word_from_chunk = []
- if 'y' not in self.text or len(self.text) == 1 or self.text.startswith("y"):
- new_word_from_chunk.append(self)
- else:
- if previous_chunk is not None and next_chunk is not None:
- # special cases of "pays", "alcoyle", "abbayes"
- c_text = self.text
- p_text = previous_chunk.text
- n_text = next_chunk.text
- # TODO Should you force if this condition does not apply?
- if ((c_text == "ay" and p_text.endswith("p") and n_text.startswith("s"))
- or
- (c_text == "oy" and p_text.endswith("lc")
- and n_text.startswith("l"))
- or
- (c_text == "aye" and p_text.endswith("bb")
- and n_text.startswith("s"))):
- # force weight
- self.weights = [2]
- new_word_from_chunk.append(self)
- return new_word_from_chunk
- must_force = next_chunk is None and previous_chunk is not None and \
- (self.text == "aye" and previous_chunk.text.endswith("bb"))
- if must_force:
- # force weight
- self.weights = [2]
- new_word_from_chunk.append(self)
- else:
- sub_chunks = re.split(re.compile("(y+)"), self.text)
- sub_chunks = [x for x in sub_chunks if len(x) > 0]
- for j, sub_chunk in enumerate(sub_chunks):
- lindex = int(j * len(self.original) / len(sub_chunks))
- rindex = int((j + 1) * len(self.original) / len(sub_chunks))
- part = self.original[lindex:rindex]
- new_subchunk_text = 'Y' if 'y' in sub_chunk else sub_chunk
- new_subchunk = self.copy()
- new_subchunk.original = part
- new_subchunk.text = new_subchunk_text
- new_word_from_chunk.append(new_subchunk)
- return new_word_from_chunk
-
- def is_vowels(self):
- return is_vowels(self.text)
-
- def is_dash_elidable(self):
- # "fais-le" not elidable, but "suis-je" and "est-ce" is
- return not ('-' in self.text and not self.text.endswith('-j') and not self.text.endswith('-c'))
-
- def check_elidable_with_next(self, next_chunk):
- if self.elidable is None:
- self.elidable = next_chunk.elision
-
- def is_potentially_ambiguous_hiatus(self):
- return self.text in ["ie", "ée", "ue"]
-
- def ends_with_potentially_ambiguous_hiatus(self):
- return len(self.text) >= 2 and self.text[-2:] in ["ie", "ée", "ue"]
-
- def check_potentially_ambiguous_plural(self, previous_chunk):
- if self.text == "s":
- if previous_chunk.is_potentially_ambiguous_hiatus():
- previous_chunk.error = "ambiguous"
- self.error = "ambiguous"
-
- def check_potentially_ambiguous_with_elision(self, next_chunk):
- if self.ends_with_potentially_ambiguous_hiatus():
- if next_chunk.elision is not None or True not in next_chunk.elision:
- self.error = "ambiguous"
- next_chunk.error = "ambiguous"
-
- def check_hiatus(self, previous_chunk, next_chunk, only_two_parts):
- if previous_chunk is not None:
- self.check_potentially_ambiguous_plural(previous_chunk)
- if self.ends_with_potentially_ambiguous_hiatus():
- if not any(next_chunk.elision or [False]):
- self.error = "ambiguous"
- next_chunk.error = "ambiguous"
-
- # elision concerns words ending with a vowel without a mute 'e'
- # that have not been marked "no_hiatus"
- # it also concerns specifically "et"
- elif (not self.text.endswith('e') and self.no_hiatus is None
- and (self.is_vowels() or self.text == 'Y')
- or (only_two_parts and previous_chunk.text == 'e' and self.text == 't')):
- # it happens if the next word is not marked no_hiatus
- # and starts with something that causes elision
- if all(next_chunk.elision) and next_chunk.no_hiatus is None:
- self.error = "hiatus"
- next_chunk.error = "hiatus"
-
- def make_word_end(self):
- self.word_end = True
-
- def contains_break(self):
- return '-' in self.text \
- or self.word_end or False \
- or self.had_hyphen or False
-
- def is_e(self):
- return self.text == "e"
-
-
-def get_consonants_regex():
- all_consonants = CONSONANTS + CONSONANTS.upper()
- consonants_regexp = re.compile('([^' + all_consonants + '*-]+)', re.UNICODE)
- return consonants_regexp
-
-
class Chunks:
- def __init__(self, line):
- self._line = line
+ def __init__(self, verse):
+ # TODO Find a way to remove this dependency
+ self.verse = verse
self.chunks = []
self.create_chunks()
- self.phon = None
self.separated_chunks = []
def create_chunks(self):
@@ -304,7 +24,7 @@ class Chunks:
self.elide_inside_words()
self.remove_leading_and_trailing_crap()
self.collapse_empty_chunks_from_simplifications()
- self.create_sigles()
+ self.create_acronym()
self.elide_vowel_problems()
self.process_y_cases()
self.annotate_final_mute_e()
@@ -315,8 +35,8 @@ class Chunks:
def print_new_line_if_changed(self):
now_line = ''.join(chunk.original for chunk in self.chunks)
- if now_line != self._line:
- print("%s became %s" % (self._line, now_line), file=sys.stderr)
+ if now_line != self.verse.input_line:
+ print("%s became %s" % (self.verse.input_line, now_line), file=sys.stderr)
pprint(self.chunks, stream=sys.stderr)
def merge_chunks_words(self):
@@ -384,12 +104,12 @@ class Chunks:
future_chunks.append(acc)
self.separated_chunks = future_chunks
- def create_sigles(self):
+ def create_acronym(self):
for i, chunk_group in enumerate(self.separated_chunks):
if len(chunk_group) == 1:
first_chunk = chunk_group[0]
if first_chunk.is_consonants():
- new_word = first_chunk.create_sigles()
+ new_word = first_chunk.create_acronym()
self.separated_chunks[i] = new_word
self.separated_chunks[i][-1].check_elidable()
@@ -430,37 +150,25 @@ class Chunks:
def initialize_chunks(self):
word_bi_tokens = self.get_word_tokens()
- pre_chunks = self.preprocess_bi_tokens(word_bi_tokens)
+ pre_chunks = pre_process_bi_tokens(word_bi_tokens)
self.separated_chunks = []
for (is_end_word, pre_chunk) in pre_chunks:
if len(pre_chunk) != 0:
- self.separated_chunks.append([Chunk(word) for word in pre_chunk])
+ self.separated_chunks.append([Chunk(word, self.verse) for word in pre_chunk])
if not is_end_word:
# word end is a fake word end
for chunk in self.separated_chunks[-1]:
chunk.set_hemistiche('cut')
- def preprocess_bi_tokens(self, word_bi_tokens):
- consonants_regexp = get_consonants_regex()
- pre_chunks = [(b, re.split(consonants_regexp, word)) for (b, word) in word_bi_tokens]
- pre_chunks = [(b, remove_trivial(x, self.is_empty_word)) for (b, x) in pre_chunks]
- return pre_chunks
-
def get_word_tokens(self):
words = self.split_input_line_by_whitespace()
- words = remove_trivial(words, self.is_empty_word)
- word_tokens = self.split_all_hyph(words)
+ words = remove_trivial(words, is_empty_word)
+ word_tokens = split_all_hyphen(words)
return word_tokens
- def split_all_hyph(self, words):
- return sum([HyphenSplitter().split(w) for w in words], [])
-
- def is_empty_word(self, word):
- return re.match(r"^\s*$", word) or len(normalize(word, rm_all=True)) == 0
-
def split_input_line_by_whitespace(self):
whitespace_regexp = re.compile(r"(\s+)")
- words = re.split(whitespace_regexp, self._line)
+ words = re.split(whitespace_regexp, self.verse.input_line)
return words
def annotate(self, template, threshold):
@@ -468,187 +176,75 @@ class Chunks:
for i, chunk in enumerate(self.chunks):
if not chunk.is_vowels():
continue
+
+ chunks_before = self.chunks[:i]
+ chunks_after = self.chunks[i + 1:]
# for the case of "pays" and related words
- if chunk.weights is None:
- chunk.weights = self.possible_weights_context(i, template, threshold)
- if chunk.hemistiche is None:
- chunk.hemistiche = self.hemistiche(i)
- return self.align2str()
+ chunk.set_possible_weights_from_context(chunks_before, chunks_after, template, threshold)
- def possible_weights_context(self, pos, template, threshold):
- chunk = self.chunks[pos]
- if pos != len(self.chunks) - 1:
- next_chunk = self.chunks[pos + 1]
- else:
- next_chunk = None
- if pos > 0:
- previous_chunk = self.chunks[pos - 1]
- else:
- previous_chunk = None
- if pos > 1:
- previous_previous_chunk = self.chunks[pos - 2]
- else:
- previous_previous_chunk = None
-
- if ((pos >= len(self.chunks) - 2 and chunk.is_e())
- and not (next_chunk is not None and next_chunk.is_vowels())
- and not (previous_chunk is None or previous_chunk.contains_break())
- and not (previous_previous_chunk is None or previous_previous_chunk.contains_break())):
- # special case for verse endings, which can get elided (or not)
- # but we don't elide lone syllables ("prends-le", etc.)
-
- if next_chunk is None:
- return [0] # ending 'e' is elided
- if next_chunk.text == 's':
- return [0] # ending 'es' is elided
- if next_chunk.text == 'nt':
- # ending 'ent' is sometimes elided, try to use pronunciation
- # actually, this will have an influence on the rhyme's gender
- # see feminine
- possible = []
- if not self.phon or len(self.phon) == 0:
- return [0, 1] # do something reasonable without pron
- for possible_phon in self.phon:
- if possible_phon.endswith(')') or possible_phon.endswith('#'):
- possible.append(1)
- else:
- possible.append(0)
- return possible
- return self.possible_weights(pos, template, threshold)
- if (next_chunk is None and chunk.text == 'e' and
- previous_chunk is not None and (previous_chunk.text.endswith('-c')
- or previous_chunk.text.endswith('-j')
- or (previous_chunk.text == 'c'
- and previous_chunk.had_hyphen is not None)
- or (previous_chunk.text == 'j'
- and previous_chunk.had_hyphen is not None))):
- return [0] # -ce and -je are elided
- if next_chunk is None and chunk.text in ['ie', 'ée']:
- return [1]
- # elide "-ée" and "-ées", but be specific (beware of e.g. "réel")
- if (pos >= len(self.chunks) - 2
- and chunk.text == 'ée'
- and (next_chunk is None or self.chunks[-1].text == 's')):
- return [1]
- if chunk.elidable is not None:
- return [int(not x) for x in chunk.elidable]
- return self.possible_weights(pos, template, threshold)
-
- def possible_weights(self, pos, template, threshold):
- if template.options['diaeresis'] == "classical":
- return vowels.possible_weights_ctx(self.chunks, pos, threshold=threshold)
- elif template.options['diaeresis'] == "permissive":
- return vowels.possible_weights_approx(self.chunks[pos].text)
-
- def hemistiche(self, pos):
- current_chunk = self.chunks[pos]
- ending = current_chunk.text
- if not (current_chunk.word_end or False) and pos < len(self.chunks) - 1:
- if not (self.chunks[pos + 1].word_end or False):
- return "cut"
- ending += self.chunks[pos + 1].text
- if ending in SURE_END_FEM:
- # check that this isn't a one-syllabe wourd (which is allowed)
- ok = False
- try:
- for i in range(2):
- if '-' in self.chunks[pos - i - 1].original or (self.chunks[pos - i - 1].word_end or False) :
- ok = True
- except IndexError:
- pass
- if not ok:
- # hemistiche ends in feminine
- if any(current_chunk.elidable or [False]):
- return "elid" # elidable final -e, but only OK if actually elided
- else:
- return "fem"
- return "ok"
+ next_chunk = self.chunks[i + 1] if i < len(self.chunks) - 1 else None
+ previous_chunk = self.chunks[i - 1] if i > 0 else None
+ previous_previous_chunk = self.chunks[i - 2] if i > 1 else None
+ chunk.set_hemistiche_from_context(previous_previous_chunk, previous_chunk, next_chunk)
+ return self.align2str()
def align2str(self):
return ''.join([x.text for x in self.chunks])
+ def print_n_syllables(self, n_syllables, offset, output_file):
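+ # used for diaeresis training: print the assigned weight followed by
+ # the context query of the vowel chunk `offset` positions from the end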
+ count = 0
+ for i, chunk in enumerate(self.chunks[::-1]):
+ if chunk.weights is not None:
+ if count < offset:
+ count += 1
+ continue
+ pos = len(self.chunks) - i - 1
+ considered_chunk = self.chunks[pos]
+ chunks_before = self.chunks[:pos]
+ chunks_after = self.chunks[pos + 1:]
+ print(str(n_syllables) + ' ' + ' '.join(considered_chunk.make_query(chunks_before, chunks_after)),
+ file=output_file)
+ break
-LETTERS = {
- 'f': 'effe',
- 'h': 'ache',
- 'j': 'gi',
- 'k': 'ka',
- 'l': 'elle',
- 'm': 'aime',
- 'n': 'aine',
- 'q': 'cu',
- 'r': 'ère',
- 's': 'esse',
- 'w': 'doublevé',
- 'x': 'ixe',
- 'z': 'zaide'
-}
-
-
-def elision_wrap(chunk_group):
- first_letter = common.remove_punctuation(chunk_group[0].original.strip())
- temp = elision(''.join(chunk.text for chunk in chunk_group),
- ''.join(chunk.original for chunk in chunk_group),
- first_letter == first_letter.upper())
- return temp
-
-
-def elision(word, original_word, was_cap):
- if word.startswith('y'):
- if word == 'y':
- return [True]
- if was_cap:
- if word == 'york':
- return [False]
- # Grevisse, Le Bon usage, 14th ed., paragraphs 49-50
- # depends on whether it's French or foreign...
- return [True, False]
- else:
- exc = ["york", "yeux", "yeuse", "ypérite"]
- for w in exc:
- if word.startswith(w):
- return [True]
- # otherwise, no elision
- return [False]
- if word in ["oui", "ouis"]:
- # elision for those words, but beware, no elision for "ouighour"
- # boileau : "Ont l'esprit mieux tourné que n'a l'homme ? Oui sans doute."
- # so elision sometimes
- return [True, False]
- if word.startswith("ouistiti") or word.startswith("ouagadougou"):
- return [False]
- # "un", "une" are non-elided as nouns ("cette une")
- if word in ["un", "une"]:
- return [True, False]
- # "onze" is not elided
- if word == "onze":
- return [False]
- if word.startswith('ulul'):
- return [False] # ululement, ululer, etc.
- if word.startswith('uhlan'):
- return [False] # uhlan
- if word[0] == 'h':
- if word == "huis":
- # special case, "huis" is elided but "huis clos" isn't
- return [True, False]
- # look up in haspirater using the original (but normalized) word
- return list(map((lambda s: not s),
- haspirater.lookup(normalize(original_word))))
- if is_vowels(word[0]):
- return [True]
- return [False]
-
-
-def remove_trivial(chunks, predicate):
+ def normalized(self):
+ return ''.join(chunk.normalize() for chunk in self.chunks).lstrip().rstrip()
+
+ def get_line(self):
+ return ''.join(chunk.get_original_text() for chunk in self.chunks)
+
+ def get_errors_set(self, forbidden_ok, hiatus_ok):
+ errors = set()
+ for chunk in self.chunks:
+ errors_chunk = chunk.get_errors_set(forbidden_ok, hiatus_ok)
+ errors = errors.union(errors_chunk)
+ return errors
+
+
+def remove_trivial(words, predicate):
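+ # glue tokens matching the predicate (e.g. whitespace-only tokens) onto
+ # the neighbouring kept word, so that no token is lost or kept alone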
new_chunks = []
- accu = ""
- for i, w in enumerate(chunks):
- if predicate(w):
+ words_accumulation = ""
+ for i, chunk in enumerate(words):
+ if predicate(chunk):
if len(new_chunks) == 0:
- accu = accu + w
+ words_accumulation = words_accumulation + chunk
else:
- new_chunks[-1] = new_chunks[-1] + w
+ new_chunks[-1] = new_chunks[-1] + chunk
else:
- new_chunks.append(accu + w)
- accu = ""
+ new_chunks.append(words_accumulation + chunk)
+ words_accumulation = ""
return new_chunks
+
+
+def split_all_hyphen(words):
+ return sum([HyphenSplitter().split(w) for w in words], [])
+
+
+def is_empty_word(word):
+ return re.match(r"^\s*$", word) or len(normalize(word, rm_all=True)) == 0
+
+
+def pre_process_bi_tokens(word_bi_tokens):
+ consonants_regexp = get_consonants_regex()
+ pre_chunks = [(b, re.split(consonants_regexp, word)) for (b, word) in word_bi_tokens]
+ pre_chunks = [(b, remove_trivial(x, is_empty_word)) for (b, x) in pre_chunks]
+ return pre_chunks
diff --git a/plint/common.py b/plint/common.py
@@ -122,3 +122,9 @@ def to_xsampa(s):
def from_xsampa(s):
"""convert x-sampa to our modified format"""
return subst(s, [(x[1], x[0]) for x in SUBSTS])
+
+
+def get_consonants_regex():
+ all_consonants = CONSONANTS + CONSONANTS.upper()
+ consonants_regexp = re.compile('([^' + all_consonants + '*-]+)', re.UNICODE)
+ return consonants_regexp
\ No newline at end of file
diff --git a/plint/error.py b/plint/error.py
@@ -118,11 +118,11 @@ class ErrorBadRhymeGenre(ErrorBadRhyme):
return "\"" + result + "\""
def get_id(self, pattern):
- return pattern.femid
+ return pattern.feminine_id
class ErrorBadRhymeObject(ErrorBadRhyme):
def get_id(self, pattern):
- return pattern.myid
+ return pattern.my_id
class ErrorBadRhymeSound(ErrorBadRhymeObject):
@property
@@ -157,7 +157,7 @@ class ErrorMultipleWordOccurrence:
def report(self, pattern):
return (_("Too many occurrences of word \"%s\" for rhyme %s")
- % (self.word, pattern.myid))
+ % (self.word, pattern.my_id))
class ErrorIncompleteTemplate:
def report(self, pattern):
diff --git a/plint/pattern.py b/plint/pattern.py
@@ -0,0 +1,31 @@
+from plint import error
+
+
+class Pattern:
+ def __init__(self, metric, my_id="", feminine_id="", constraint=None, hemistiches=None):
+ self.metric = metric
+ self.length = None
+ self.parse_metric()
+ self.my_id = my_id
+ self.feminine_id = feminine_id
+ self.constraint = constraint
+ if hemistiches:
+ self.hemistiches = hemistiches
+
+ def parse_metric(self):
+ """Parse from a metric description"""
+ try:
+ verse = [int(x) for x in self.metric.split('/')]
+ for i in verse:
+ if i < 1:
+ raise ValueError
+ except ValueError:
+ raise error.TemplateLoadError("Metric description should only contain positive integers")
+ if sum(verse) > 16:
+ raise error.TemplateLoadError("Metric length limit exceeded")
+ self.hemistiches = []
+ self.length = 0
+ for v in verse:
+ self.length += v
+ self.hemistiches.append(self.length)
+ self.length = self.hemistiches.pop()
\ No newline at end of file
diff --git a/plint/plint_irc.py b/plint/plint_irc.py
@@ -75,7 +75,7 @@ def manage(line, descriptor=sys.stdout):
else:
lbuf = [l]
return True
- errors = template.check(text, quiet=False)
+ errors = template.check(text)
quiet = False
if errors:
print(errors.report())
diff --git a/plint/plint_web.py b/plint/plint_web.py
@@ -1,5 +1,5 @@
#!/usr/bin/python3 -Ou
-#encoding: utf8
+# encoding: utf8
from plint import localization, error, template, diaeresis
import re
@@ -10,6 +10,7 @@ import time
env = Environment(loader=PackageLoader('plint_web', 'views'))
+
# force HTTPS usage
# http://bottlepy.org/docs/dev/faq.html#problems-with-reverse-proxies
# because bottle makes absolute redirects
@@ -17,215 +18,235 @@ env = Environment(loader=PackageLoader('plint_web', 'views'))
# even though relative Location: is now allowed
# http://stackoverflow.com/a/25643550
def fix_https(app):
- def fixed_app(environ, start_response):
- environ['wsgi.url_scheme'] = 'https'
- return app(environ, start_response)
- return fixed_app
+ def fixed_app(environ, start_response):
+ environ['wsgi.url_scheme'] = 'https'
+ return app(environ, start_response)
+
+ return fixed_app
+
+
app = Bottle()
app.wsgi = fix_https(app.wsgi)
THROTTLE_DELAY = 2
throttle = set()
+
def best_match(matches, header):
- # inspired by http://www.xml.com/pub/a/2005/06/08/restful.html
+ # inspired by http://www.xml.com/pub/a/2005/06/08/restful.html
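+ # return the first element of `matches` accepted by the Accept-Language
+ # header, honoring q-values; falls back to matches[0]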
+
+ def parse_one(t):
+ parts = t.split(";")
+ d = {}
+ for param in parts[1:]:
+ spl = param.split("=")
+ if (len(spl) != 2):
+ # this should be formatted as key=value
+ # so ignore it
+ continue
+ k, v = spl
+ d[k.strip().lower()] = v.strip()
+ if 'q' not in d.keys():
+ d['q'] = "1"
+ return (parts[0], d)
+
+ parts = []
+ for p in header.split(","):
+ parsed = parse_one(p)
+ try:
+ value = float(parsed[1]['q'])
+ except ValueError:
+ # q value should be a float; set it to 0
+ value = 0
+ parts.append((value, parsed[0].split("-")))
+ for lang in [x[1] for x in sorted(parts, reverse=True)]:
+ for match in matches:
+ if match in lang:
+ return match
+ return matches[0]
- def parse_one(t):
- parts = t.split(";")
- d = {}
- for param in parts[1:]:
- spl = param.split("=")
- if (len(spl) != 2):
- # this should be formatted as key=value
- # so ignore it
- continue
- k, v = spl
- d[k.strip().lower()] = v.strip()
- if 'q' not in d.keys():
- d['q'] = "1"
- return (parts[0], d)
-
- parts = []
- for p in header.split(","):
- parsed = parse_one(p)
- try:
- value = float(parsed[1]['q'])
- except ValueError:
- # q value should be a float; set it to 0
- value = 0
- parts.append((value, parsed[0].split("-")))
- for lang in [x[1] for x in sorted(parts, reverse=True)]:
- for match in matches:
- if match in lang:
- return match
- return matches[0]
def get_locale():
- header = request.headers.get('Accept-Language')
- print(header)
- try:
- return best_match(['fr', 'en'], header)
- except AttributeError:
- return 'en'
+ header = request.headers.get('Accept-Language')
+ print(header)
+ try:
+ return best_match(['fr', 'en'], header)
+ except AttributeError:
+ return 'en'
+
def get_title(lang):
- if lang == 'fr':
- return "plint -- vérification formelle de poèmes"
- else:
- return "plint -- French poetry checker"
+ if lang == 'fr':
+ return "plint -- vérification formelle de poèmes"
+ else:
+ return "plint -- French poetry checker"
+
@app.route('/static/tpl/<filename>')
def server_static(filename):
- return static_file(filename, root="./static/tpl", mimetype="text/plain")
+ return static_file(filename, root="./static/tpl", mimetype="text/plain")
+
@app.route('/<lang>/static/img/<filename>')
def server_static(filename, lang=None):
- return static_file(filename, root="./static/img")
+ return static_file(filename, root="./static/img")
+
@app.route('/<lang>/static/tpl/<filename>')
def server_static(filename, lang=None):
- return static_file(filename, root="./static/tpl", mimetype="text/plain")
+ return static_file(filename, root="./static/tpl", mimetype="text/plain")
+
@app.route('/static/<filename>')
def server_static(filename):
- return static_file(filename, root="./static")
+ return static_file(filename, root="./static")
+
@app.route('/<lang>/static/<filename>')
def server_static(filename, lang=None):
- return static_file(filename, root="./static")
+ return static_file(filename, root="./static")
+
@app.route('/')
def root():
- redirect('/' + get_locale() + '/')
+ redirect('/' + get_locale() + '/')
+
@app.route('/<page>')
def paged(page):
- redirect('/' + get_locale() + '/' + page)
+ redirect('/' + get_locale() + '/' + page)
+
@app.route('/<lang>/')
def root(lang):
- if lang not in ['fr', 'en']:
- return paged(lang)
- return env.get_template('index.html').render(title=get_title(lang),
- lang=lang, path="")
+ if lang not in ['fr', 'en']:
+ return paged(lang)
+ return env.get_template('index.html').render(title=get_title(lang),
+ lang=lang, path="")
+
@app.route('/<lang>/about')
def about(lang):
- return env.get_template('about.html').render(title=get_title(lang),
- lang=lang, path="about")
+ return env.get_template('about.html').render(title=get_title(lang),
+ lang=lang, path="about")
+
MAX_POEM_LEN = 8192
MAX_LINE_LEN = 512
+
class TooBigException(Exception):
pass
+
class TooLongLinesException(Exception):
pass
+
def check(poem):
- if len(poem) > MAX_POEM_LEN:
- raise TooBigException
- s = poem.split("\n")
- for x in range(len(s)):
- if len(s[x]) > MAX_LINE_LEN:
- raise TooLongLinesException
- s[x] = s[x].strip()
- return s
+ if len(poem) > MAX_POEM_LEN:
+ raise TooBigException
+ s = poem.split("\n")
+ for x in range(len(s)):
+ if len(s[x]) > MAX_LINE_LEN:
+ raise TooLongLinesException
+ s[x] = s[x].strip()
+ return s
+
@app.route('/<lang>/checkjs', method='POST')
def q(lang):
- global throttle
- # necessary when serving with lighttpd proxy-core
- ip = request.environ.get('HTTP_X_FORWARDED_FOR')
- if not ip:
- # fallback; this is 127.0.0.1 with proxy-core
- ip = request.environ.get('REMOTE_ADDR')
- t = time.time()
- print("== %s %s ==" % (ip, t))
- response.content_type = 'application/json'
- localization.init_locale(lang)
- throttle = set(x for x in throttle if t - x[1] < THROTTLE_DELAY)
- if ip in (x[0] for x in throttle):
+ global throttle
+ # necessary when serving with lighttpd proxy-core
+ ip = request.environ.get('HTTP_X_FORWARDED_FOR')
+ if not ip:
+ # fallback; this is 127.0.0.1 with proxy-core
+ ip = request.environ.get('REMOTE_ADDR')
+ t = time.time()
+ print("== %s %s ==" % (ip, t))
+ response.content_type = 'application/json'
+ localization.init_locale(lang)
+ throttle = set(x for x in throttle if t - x[1] < THROTTLE_DELAY)
+ if ip in (x[0] for x in throttle):
+ if lang == 'fr':
+ msg = (("Trop de requêtes pour vérifier le poème,"
+ + " veuillez réessayer dans %d secondes") %
+ THROTTLE_DELAY)
+ else:
+ msg = (("Too many requests to check poem,"
+ + " please try again in %d seconds") %
+ THROTTLE_DELAY)
+ return dumps({'error': msg})
+ throttle.add((ip, t))
+ poem = re.sub(r'[<>&]', '', request.forms.get('poem'))
+ print(poem)
+
+ # default message
if lang == 'fr':
- msg = (("Trop de requêtes pour vérifier le poème,"
- + " veuillez réessayer dans %d secondes") %
- THROTTLE_DELAY)
+ msg = "Le poème est vide"
else:
- msg = (("Too many requests to check poem,"
- + " please try again in %d seconds") %
- THROTTLE_DELAY)
- return dumps({'error': msg})
- throttle.add((ip, t))
- poem = re.sub(r'<>&', '', request.forms.get('poem'))
- print(poem)
-
- # default message
- if lang == 'fr':
- msg = "Le poème est vide"
- else:
- msg = "Poem is empty"
-
- try:
- poem = check(poem)
- except TooBigException:
- poem = None
- if lang == 'fr':
- msg = "Le poème est trop long (maximum %d caractères)" % MAX_POEM_LEN
- else:
- msg = "Poem is too long (maximum %d characters)" % MAX_POEM_LEN
- except TooLongLinesException:
- poem = None
- if lang == 'fr':
- msg = "Certaines lignes du poème sont trop longues (maximum %d caractères)" % MAX_LINE_LEN
- else:
- msg = "Some lines of the poem are too long (maximum %d characters)" % MAX_LINE_LEN
- if not poem or len(poem) == 0 or (len(poem) == 1 and len(poem[0]) == 0):
- return dumps({'error': msg})
- templateName = re.sub(r'[^a-z_]', '', request.forms.get('template'))
- print(templateName)
- if templateName == 'custom':
- x = request.forms.get('custom_template')
- else:
+ msg = "Poem is empty"
+
try:
- f = open("static/tpl/" + templateName + ".tpl")
- x = f.read()
- f.close()
- except IOError:
- if lang == 'fr':
- msg = "Modèle inexistant"
- else:
- msg = "No such template"
- return dumps({'error': msg})
- print(x)
- try:
- templ = template.Template(x)
- except error.TemplateLoadError as e:
- if lang == 'fr':
- msg = "Erreur à la lecture du modèle : " + e.msg
+ poem = check(poem)
+ except TooBigException:
+ poem = None
+ if lang == 'fr':
+ msg = "Le poème est trop long (maximum %d caractères)" % MAX_POEM_LEN
+ else:
+ msg = "Poem is too long (maximum %d characters)" % MAX_POEM_LEN
+ except TooLongLinesException:
+ poem = None
+ if lang == 'fr':
+ msg = "Certaines lignes du poème sont trop longues (maximum %d caractères)" % MAX_LINE_LEN
+ else:
+ msg = "Some lines of the poem are too long (maximum %d characters)" % MAX_LINE_LEN
+ if not poem or len(poem) == 0 or (len(poem) == 1 and len(poem[0]) == 0):
+ return dumps({'error': msg})
+ templateName = re.sub(r'[^a-z_]', '', request.forms.get('template'))
+ print(templateName)
+ if templateName == 'custom':
+ x = request.forms.get('custom_template')
else:
- msg = "Error when reading template: " + e.msg
- return dumps({'error': msg})
- poem.append(None)
- r = []
- i = 0
- d = {}
- for line in poem:
- i += 1
- last = False
- if line == None:
- line = ""
- last = True
- errors = templ.check(line, last=last)
- if errors:
- r.append({
- 'line': line,
- 'num': i,
- 'errors': sum(errors.lines(short=True), [])
- })
- d['result'] = r
- return dumps(d)
+ try:
+ f = open("static/tpl/" + templateName + ".tpl")
+ x = f.read()
+ f.close()
+ except IOError:
+ if lang == 'fr':
+ msg = "Modèle inexistant"
+ else:
+ msg = "No such template"
+ return dumps({'error': msg})
+ print(x)
+ try:
+ templ = template.Template(x)
+ except error.TemplateLoadError as e:
+ if lang == 'fr':
+ msg = "Erreur à la lecture du modèle : " + e.msg
+ else:
+ msg = "Error when reading template: " + e.msg
+ return dumps({'error': msg})
+ poem.append(None)
+ r = []
+ i = 0
+ d = {}
+ for line in poem:
+ i += 1
+ last = False
+ if line is None:
+ line = ""
+ last = True
+ errors = templ.check(line, last=last)
+ if errors:
+ r.append({
+ 'line': line,
+ 'num': i,
+ 'errors': sum(errors.lines(short=True), [])
+ })
+ d['result'] = r
+ return dumps(d)
-if __name__ == '__main__':
- run(app, port='5000', server="cherrypy", host="::")
+if __name__ == '__main__':
+ run(app, port='5000', server="cherrypy", host="::")
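The endpoint above reduces to the same loop as the command-line checker: append a None sentinel to the poem, check each line against the template, and collect the short error reports. A minimal offline sketch of that loop, assuming plint is importable; the two-pattern template string and the sample line are illustrative:

    from plint import template

    # two alexandrines rhyming together; "A" ties their rhyme environments
    templ = template.Template("12 A\n12 A")
    poem = ["Je fais souvent ce rêve étrange et pénétrant", None]  # None marks the end
    report = []
    for num, line in enumerate(poem, 1):
        errors = templ.check(line if line is not None else "", last=line is None)
        if errors:
            # lines(short=True) yields per-line error lists; sum() flattens them
            report.append({'num': num, 'errors': sum(errors.lines(short=True), [])})
    print(report)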
diff --git a/plint/template.py b/plint/template.py
@@ -5,305 +5,264 @@ from plint import error, rhyme
from plint.common import normalize
from plint.nature import nature_count
from plint.options import default_options
+from plint.pattern import Pattern
from plint.verse import Verse
-from plint.vowels import make_query
-
-
-class Pattern:
- def __init__(self, metric, myid="", femid="", constraint=None, hemistiches=None):
- self.metric = metric
- self.parse_metric()
- self.myid = myid
- self.femid = femid
- self.constraint = constraint
- if hemistiches:
- self.hemistiches = hemistiches
-
- def parse_metric(self):
- """Parse from a metric description"""
- try:
- verse = [int(x) for x in self.metric.split('/')]
- for i in verse:
- if i < 1:
- raise ValueError
- except ValueError:
- raise error.TemplateLoadError(("Metric description should only contain positive integers"))
- if sum(verse) > 16:
- raise error.TemplateLoadError(("Metric length limit exceeded"))
- self.hemistiches = []
- self.length = 0
- for v in verse:
- self.length += v
- self.hemistiches.append(self.length)
- self.length = self.hemistiches.pop()
-class Template:
- option_aliases = {
- 'fusionner': 'merge',
- 'ambiguous_ok': 'forbidden_ok',
- 'ambigu_ok': 'forbidden_ok',
- 'dierese': 'diaeresis',
- 'verifie_occurrences': 'check_occurrences',
- 'repetition_ok': 'repeat_ok',
- 'incomplet_ok': 'incomplete_ok',
- 'phon_supposee_ok': 'phon_supposed_ok',
- 'oeil_supposee_ok': 'eye_supposed_ok',
- 'oeil_tolerance_ok': 'eye_tolerance_ok',
- 'pauvre_oeil_requise': 'poor_eye_required',
- 'pauvre_oeil_supposee_ok': 'poor_eye_supposed_ok',
- 'pauvre_oeil_vocalique_ok': 'poor_eye_vocalic_ok',
+
+OPTION_ALIASES = {
+ 'fusionner': 'merge',
+ 'ambiguous_ok': 'forbidden_ok',
+ 'ambigu_ok': 'forbidden_ok',
+ 'dierese': 'diaeresis',
+ 'verifie_occurrences': 'check_occurrences',
+ 'repetition_ok': 'repeat_ok',
+ 'incomplet_ok': 'incomplete_ok',
+ 'phon_supposee_ok': 'phon_supposed_ok',
+ 'oeil_supposee_ok': 'eye_supposed_ok',
+ 'oeil_tolerance_ok': 'eye_tolerance_ok',
+ 'pauvre_oeil_requise': 'poor_eye_required',
+ 'pauvre_oeil_supposee_ok': 'poor_eye_supposed_ok',
+ 'pauvre_oeil_vocalique_ok': 'poor_eye_vocalic_ok',
}
- def __init__(self, string=None):
- self.template = []
- self.pattern_line_no = 0
- self.options = dict(default_options)
- self.mergers = []
- self.overflowed = False
- if string != None:
- self.load(string)
- self.line_no = 0
- self.position = 0
- self.prev = None
- self.env = {}
- self.femenv = {}
- self.occenv = {}
- self.reject_errors = False
-
- def read_option(self, x):
- try:
- key, value = x.split(':')
- except ValueError:
- raise error.TemplateLoadError(("Global options must be provided as key-value pairs"))
- if key in self.option_aliases.keys():
- key = self.option_aliases[key]
- if key == 'merge':
- self.mergers.append(value)
- elif key == 'diaeresis':
- if value == "classique":
- value = "classical"
- if value not in ["permissive", "classical"]:
- raise error.TemplateLoadError(("Bad value for global option %s") % key)
- self.options['diaeresis'] = value
- elif key in self.options.keys():
- self.options[key] = str2bool(value)
- else:
- raise error.TemplateLoadError(("Unknown global option"))
-
- def load(self, s):
- """Load from a string"""
- for line in s.split('\n'):
- line = line.strip()
- self.pattern_line_no += 1
- if line != '' and line[0] != '#':
- if line[0] == '!':
- # don't count the '!' in the options, that's why we use [1:]
- for option in line.split()[1:]:
- self.read_option(option)
+class Template:
+
+ def __init__(self, template_string=None):
+ self.template = []
+ self.pattern_line_no = 0
+ self.options = dict(default_options)
+ self.mergers = []
+ self.overflowed = False
+ if template_string is not None:
+ self.load(template_string)
+ self.line_no = 0
+ self.position = 0
+ self.prev = None
+ self.env = {}
+ self.feminine_environment = {}
+ self.occurrence_environment = {}
+ self.reject_errors = False
+
+ def load(self, template_string):
+ """Load from a string"""
+ for line in template_string.split('\n'):
+ line = line.strip()
+ self.pattern_line_no += 1
+ if len(line) != 0 and line[0] != '#':
+ if line[0] == '!':
+ # don't count the '!' in the options, that's why we use [1:]
+ for option_string in line.split()[1:]:
+ self.read_option(option_string)
+ else:
+ self.template.append(self.parse_line(line.strip()))
+ if len(self.template) == 0:
+ raise error.TemplateLoadError("Template is empty")
+
+ def read_option(self, option_string):
+ try:
+ key, value = option_string.split(':')
+ except ValueError:
+ raise error.TemplateLoadError("Global options must be provided as key-value pairs")
+ if key in OPTION_ALIASES:
+ key = OPTION_ALIASES[key]
+ if key == 'merge':
+ self.mergers.append(value)
+ elif key == 'diaeresis':
+ if value == "classique":
+ value = "classical"
+ if value not in ["permissive", "classical"]:
+ raise error.TemplateLoadError("Bad value for global option %s" % key)
+ self.options['diaeresis'] = value
+ elif key in self.options:
+ self.options[key] = str2bool(value)
+ else:
+ raise error.TemplateLoadError("Unknown global option")
+
+ def parse_line(self, line):
+ """Parse template line from a line"""
+ split = line.split(' ')
+ metric = split[0]
+ if len(split) >= 2:
+ my_id = split[1]
else:
- self.template.append(self.parse_line(line.strip()))
- if len(self.template) == 0:
- raise error.TemplateLoadError(("Template is empty"))
-
- def match(self, line, ofile=None, quiet=False, last=False, nsyl=None,
- offset=0):
- """Check a line against current pattern, return errors"""
-
- was_incomplete = last and not self.beyond
-
- errors = []
- pattern = self.get()
-
- line_with_case = normalize(line, downcase=False)
-
- v = Verse(line, self, pattern)
-
- if nsyl:
- v.annotate()
- count = 0
- # only generate a context with the prescribed final weight
- # where "final" is the offset-th chunk with a weight from the end
- for i, p in enumerate(v.chunks.chunks[::-1]):
- if (p.weights is not None):
- if count < offset:
- count += 1
- continue
- print(str(nsyl) + ' '
- + ' '.join(make_query(v.chunks.chunks, len(v.chunks.chunks)-i-1)), file=ofile)
- break
- return errors, pattern, v
-
- if last:
- if was_incomplete and not self.options['incomplete_ok'] and not self.overflowed:
- return [error.ErrorIncompleteTemplate()], pattern, v
- return [], pattern, v
-
- if self.overflowed:
- return [error.ErrorOverflowedTemplate()], pattern, v
-
- rhyme_failed = False
- # rhymes
- if pattern.myid not in self.env.keys():
- # initialize the rhyme
- # last_count is passed later
- self.env[pattern.myid] = rhyme.Rhyme(v.normalized,
- pattern.constraint, self.mergers, self.options)
- else:
- # update the rhyme
- self.env[pattern.myid].feed(v.normalized, pattern.constraint)
- if not self.env[pattern.myid].satisfied_phon():
- # no more possible rhymes, something went wrong, check phon
- self.env[pattern.myid].rollback()
- rhyme_failed = True
- errors.append(error.ErrorBadRhymeSound(self.env[pattern.myid],
- self.env[pattern.myid].new_rhyme))
-
- # occurrences
- if self.options['check_occurrences']:
- if pattern.myid not in self.occenv.keys():
- self.occenv[pattern.myid] = {}
- last_word = re.split(r'[- ]', line_with_case)[-1]
- if last_word not in self.occenv[pattern.myid].keys():
- self.occenv[pattern.myid][last_word] = 0
- self.occenv[pattern.myid][last_word] += 1
- if self.occenv[pattern.myid][last_word] > nature_count(last_word):
- errors.insert(0, error.ErrorMultipleWordOccurrence(last_word,
- self.occenv[pattern.myid][last_word]))
-
- v.phon = self.env[pattern.myid].phon
- v.parse()
-
- # now that we have parsed, adjust rhyme to reflect last word length
- # and check eye
- if not rhyme_failed:
- self.env[pattern.myid].adjustLastCount(v.last_count())
- if not self.env[pattern.myid].satisfied_eye():
- old_phon = len(self.env[pattern.myid].phon)
- self.env[pattern.myid].rollback()
- errors.append(error.ErrorBadRhymeEye(self.env[pattern.myid],
- self.env[pattern.myid].new_rhyme, old_phon))
-
- rhyme_failed = False
-
- errors = v.problems() + errors
-
- if ofile:
- possible = v.possible
- if len(possible) == 1:
- for i, p in enumerate(possible[0]):
- if (p.weights is not None and len(p.weights) > 1
- and p.weight is not None and p.weight > 0):
- print(str(p.weight) + ' '
- + ' '.join(make_query(possible[0], i)), file=ofile)
-
- # rhyme genres
- # inequality constraint
- # TODO this is simplistic and order-dependent
- if pattern.femid.swapcase() in self.femenv.keys():
- new = set(['M', 'F']) - self.femenv[pattern.femid.swapcase()]
- if len(new) > 0:
- self.femenv[pattern.femid] = new
- if pattern.femid not in self.femenv.keys():
- if pattern.femid == 'M':
- x = set(['M'])
- elif pattern.femid == 'F':
- x = set(['F'])
- else:
- x = set(['M', 'F'])
- self.femenv[pattern.femid] = x
- old = list(self.femenv[pattern.femid])
- new = v.genders()
- self.femenv[pattern.femid] &= set(new)
- if len(self.femenv[pattern.femid]) == 0:
- errors.append(error.ErrorBadRhymeGenre(old, new))
-
- return errors, pattern, v
-
- def parse_line(self, line):
- """Parse template line from a line"""
- split = line.split(' ')
- metric = split[0]
- if len(split) >= 2:
- myid = split[1]
- else:
- myid = str(self.pattern_line_no) # unique
- if len(split) >= 3:
- femid = split[2]
- else:
- femid = str(self.pattern_line_no) # unique
- idsplit = myid.split(':')
- if len(idsplit) >= 2:
- constraint = idsplit[-1].split('|')
- if len(constraint) > 0:
- constraint[0] = False if constraint[0] in ["no", "non"] else constraint[0]
- if len(constraint) > 1:
- constraint[1] = int(constraint[1])
- else:
- constraint = []
- if len(constraint) == 0:
- constraint.append(1)
- if len(constraint) < 2:
- constraint.append(True)
- return Pattern(metric, myid, femid, rhyme.Constraint(*constraint))
-
- def reset_conditional(self, d):
- return dict((k, v) for k, v in d.items() if len(k) > 0 and k[0] == '!')
-
- def reset_state(self, with_femenv=False):
- """Reset our state, except ids starting with '!'"""
- self.position = 0
- self.env = self.reset_conditional(self.env)
- self.femenv = self.reset_conditional(self.femenv)
- self.occenv = {} # always reset
-
- @property
- def beyond(self):
- return self.position >= len(self.template)
-
- def get(self):
- """Get next state, resetting if needed"""
- self.old_position = self.position
- self.old_env = copy.deepcopy(self.env)
- self.old_femenv = copy.deepcopy(self.femenv)
- self.old_occenv = copy.deepcopy(self.occenv)
- if self.beyond:
- if not self.options['repeat_ok']:
- self.overflowed = True
- self.reset_state()
- result = self.template[self.position]
- self.position += 1
- return result
-
- def back(self):
- """Revert to previous state"""
- self.position = self.old_position
- self.env = copy.deepcopy(self.old_env)
- self.femenv = copy.deepcopy(self.old_femenv)
- self.occenv = copy.deepcopy(self.old_occenv)
-
- def check(self, line, ofile=None, quiet=False, last=False, nsyl=None,
- offset=0):
- """Check line (wrapper)"""
- self.line_no += 1
- line = line.rstrip()
- if normalize(line) == '' and not last:
- return None
- #possible = [compute(p) for p in possible]
- #possible = sorted(possible, key=rate)
- errors, pattern, verse = self.match(line, ofile, quiet=quiet, last=last,
- nsyl=nsyl, offset=offset)
- if len(errors) > 0:
- if self.reject_errors:
- self.back()
- self.line_no -= 1
- return error.ErrorCollection(self.line_no, line, pattern, verse, errors)
- return None
+ my_id = str(self.pattern_line_no) # unique
+ if len(split) >= 3:
+ feminine_id = split[2]
+ else:
+ feminine_id = str(self.pattern_line_no) # unique
+ id_split = my_id.split(':')
+ classical = True
+ n_common_suffix_phones = 1
+ if len(id_split) >= 2:
+ constraint = id_split[-1].split('|')
+ if len(constraint) > 0:
+ classical = False if constraint[0] in ["no", "non"] else constraint[0]
+ if len(constraint) > 1:
+ n_common_suffix_phones = int(constraint[1])
+ return Pattern(metric, my_id, feminine_id, rhyme.Constraint(classical, n_common_suffix_phones))
-def str2bool(x):
- if x.lower() in ["yes", "oui", "y", "o", "true", "t", "vrai", "v"]:
- return True
- if x.lower() in ["no", "non", "n", "false", "faux", "f"]:
- return False
- raise error.TemplateLoadError(("Bad value in global option"))
+ def match(self, line, output_file=None, last=False, n_syllables=None, offset=0):
+ """Check a line against current pattern, return errors"""
+
+ was_incomplete = last and not self.beyond
+
+ errors = []
+ pattern = self.get()
+ line_with_case = normalize(line, downcase=False)
+
+ verse = Verse(line, self, pattern)
+
+ if n_syllables:
+ verse.print_n_syllables(n_syllables, offset, output_file)
+ return errors, pattern, verse
+
+ if last:
+ if was_incomplete and not self.options['incomplete_ok'] and not self.overflowed:
+ return [error.ErrorIncompleteTemplate()], pattern, verse
+ return [], pattern, verse
+
+ if self.overflowed:
+ return [error.ErrorOverflowedTemplate()], pattern, verse
+
+ rhyme_failed = False
+ # rhymes
+ if pattern.my_id not in self.env:
+ # initialize the rhyme
+ # last_count is passed later
+ self.env[pattern.my_id] = rhyme.Rhyme(verse.normalized, pattern.constraint, self.mergers, self.options)
+ else:
+ # update the rhyme
+ self.env[pattern.my_id].feed(verse.normalized, pattern.constraint)
+ if not self.env[pattern.my_id].satisfied_phon():
+ # no more possible rhymes, something went wrong, check phon
+ self.env[pattern.my_id].rollback()
+ rhyme_failed = True
+ errors.append(error.ErrorBadRhymeSound(self.env[pattern.my_id],
+ self.env[pattern.my_id].new_rhyme))
+
+ # occurrences
+ if self.options['check_occurrences']:
+ if pattern.my_id not in self.occurrence_environment.keys():
+ self.occurrence_environment[pattern.my_id] = {}
+ last_word = re.split(r'[- ]', line_with_case)[-1]
+ if last_word not in self.occurrence_environment[pattern.my_id].keys():
+ self.occurrence_environment[pattern.my_id][last_word] = 0
+ self.occurrence_environment[pattern.my_id][last_word] += 1
+ if self.occurrence_environment[pattern.my_id][last_word] > nature_count(last_word):
+ errors.insert(0, error.ErrorMultipleWordOccurrence(last_word,
+ self.occurrence_environment[pattern.my_id][last_word]))
+
+ verse.phon = self.env[pattern.my_id].phon
+ verse.parse()
+
+ # now that we have parsed, adjust rhyme to reflect last word length
+ # and check eye
+ if not rhyme_failed:
+ self.env[pattern.my_id].adjustLastCount(verse.last_count())
+ if not self.env[pattern.my_id].satisfied_eye():
+ old_phon = len(self.env[pattern.my_id].phon)
+ self.env[pattern.my_id].rollback()
+ errors.append(error.ErrorBadRhymeEye(self.env[pattern.my_id],
+ self.env[pattern.my_id].new_rhyme, old_phon))
+
+ errors = verse.problems() + errors
+
+ if output_file:
+ possible = verse.possible
+ if len(possible) == 1:
+ for i, chunk in enumerate(possible[0]):
+ if (chunk.weights is not None and len(chunk.weights) > 1
+ and chunk.weight is not None and chunk.weight > 0):
+ chunks_before = possible[0][:i]
+ chunks_after = possible[0][i + 1:]
+ print(str(chunk.weight) + ' '
+ + ' '.join(chunk.make_query(chunks_before, chunks_after)), file=output_file)
+
+ # rhyme genres
+ # inequality constraint
+ # TODO this is simplistic and order-dependent
+ if pattern.feminine_id.swapcase() in self.feminine_environment.keys():
+ new = {'M', 'F'} - self.feminine_environment[pattern.feminine_id.swapcase()]
+ if len(new) > 0:
+ self.feminine_environment[pattern.feminine_id] = new
+ if pattern.feminine_id not in self.feminine_environment.keys():
+ if pattern.feminine_id == 'M':
+ x = {'M'}
+ elif pattern.feminine_id == 'F':
+ x = {'F'}
+ else:
+ x = {'M', 'F'}
+ self.feminine_environment[pattern.feminine_id] = x
+ old = list(self.feminine_environment[pattern.feminine_id])
+ new = verse.genders()
+ self.feminine_environment[pattern.feminine_id] &= set(new)
+ if len(self.feminine_environment[pattern.feminine_id]) == 0:
+ errors.append(error.ErrorBadRhymeGenre(old, new))
+
+ return errors, pattern, verse
+
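The genre block above enforces an inequality between paired ids via swapcase: whatever genders one id settles on, its swapcased partner is restricted to the complement. A toy sketch of just that set logic, with a plain dict standing in for feminine_environment and hypothetical observed genders:

    feminine_environment = {}
    for fem_id, observed in [('a', {'F'}), ('A', {'M', 'F'})]:
        other = fem_id.swapcase()
        if other in feminine_environment:
            # the partner id already settled: keep only the complement
            remaining = {'M', 'F'} - feminine_environment[other]
            if remaining:
                feminine_environment[fem_id] = remaining
        feminine_environment.setdefault(fem_id, {'M', 'F'})
        feminine_environment[fem_id] &= observed
    print(feminine_environment)  # {'a': {'F'}, 'A': {'M'}}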
+ def reset_conditional(self, d):
+ return {k: v for k, v in d.items() if k.startswith('!')}
+
+ def reset_state(self, with_femenv=False):
+ """Reset our state, except ids starting with '!'"""
+ self.position = 0
+ self.env = self.reset_conditional(self.env)
+ self.feminine_environment = self.reset_conditional(self.feminine_environment)
+ self.occurrence_environment = {} # always reset
+
+ @property
+ def beyond(self):
+ return self.position >= len(self.template)
+
+ def get(self):
+ """Get next state, resetting if needed"""
+ self.old_position = self.position
+ self.old_env = copy.deepcopy(self.env)
+ self.old_femenv = copy.deepcopy(self.feminine_environment)
+ self.old_occenv = copy.deepcopy(self.occurrence_environment)
+ if self.beyond:
+ if not self.options['repeat_ok']:
+ self.overflowed = True
+ self.reset_state()
+ result = self.template[self.position]
+ self.position += 1
+ return result
+
+ def back(self):
+ """Revert to previous state"""
+ self.position = self.old_position
+ self.env = copy.deepcopy(self.old_env)
+ self.feminine_environment = copy.deepcopy(self.old_femenv)
+ self.occurrence_environment = copy.deepcopy(self.old_occenv)
+
+ def check(self, line, output_file=None, last=False, n_syllables=None, offset=0):
+ """Check line (wrapper)"""
+ self.line_no += 1
+ line = line.rstrip()
+ if normalize(line) == '' and not last:
+ return None
+
+ errors, pattern, verse = self.match(line, output_file, last=last, n_syllables=n_syllables, offset=offset)
+ if len(errors) > 0:
+ if self.reject_errors:
+ self.back()
+ self.line_no -= 1
+ return error.ErrorCollection(self.line_no, line, pattern, verse, errors)
+ return None
+
+
+def str2bool(x):
+ if x.lower() in ["yes", "oui", "y", "o", "true", "t", "vrai", "v"]:
+ return True
+ if x.lower() in ["no", "non", "n", "false", "faux", "f"]:
+ return False
+ raise error.TemplateLoadError("Bad value in global option")
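The template mini-language is unchanged by this refactor: '#' starts a comment, '!' starts a line of global options handled by read_option, and any other line is a pattern "METRIC [ID[:CONSTRAINT]] [GENDER_ID]" handled by parse_line, where a constraint such as A:no|2 turns off the classical-rhyme requirement and asks for two common suffix phones. A small sketch, assuming plint and its default data files are importable:

    from plint import template

    tpl = template.Template(
        "! diaeresis:classical\n"
        "# a quatrain in rimes croisées\n"
        "12 A\n"
        "12 B\n"
        "12 A\n"
        "12 B\n")
    print(len(tpl.template))         # 4 patterns
    print(tpl.options['diaeresis'])  # 'classical'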
diff --git a/plint/tests/test_bad_chars.py b/plint/tests/test_bad_chars.py
@@ -1,16 +1,17 @@
import unittest
+import plint.pattern
from plint import verse, template
class BadChars(unittest.TestCase):
def testBadAlone(self):
- v = verse.Verse("42", template.Template(), template.Pattern("12"))
+ v = verse.Verse("42", template.Template(), plint.pattern.Pattern("12"))
v.parse()
self.assertFalse(v.valid())
def testBadAndGood(self):
- v = verse.Verse("bla h42 blah ", template.Template(), template.Pattern("12"))
+ v = verse.Verse("bla h42 blah ", template.Template(), plint.pattern.Pattern("12"))
v.parse()
self.assertFalse(v.valid())
diff --git a/plint/tests/test_counts.py b/plint/tests/test_counts.py
@@ -1,12 +1,13 @@
import unittest
+import plint.pattern
from plint import verse, template
class Counts(unittest.TestCase):
def runCount(self, text, limit=12, hemistiches=None):
- v = verse.Verse(text, template.Template(), template.Pattern(str(limit), hemistiches=hemistiches))
+ v = verse.Verse(text, template.Template(), plint.pattern.Pattern(str(limit), hemistiches=hemistiches))
v.parse()
return v.possible
diff --git a/plint/tests/test_eliminate.py b/plint/tests/test_eliminate.py
@@ -1,19 +1,20 @@
import unittest
+import plint.pattern
from plint import verse, template
class Eliminate(unittest.TestCase):
def testEliminateOneGue(self):
text = "gue"
- v = verse.Verse(text, template.Template(), template.Pattern("12"))
+ v = verse.Verse(text, template.Template(), plint.pattern.Pattern("12"))
v.parse()
c = ''.join([x.text for x in v.chunks.chunks])
self.assertFalse("gue" in c)
def testEliminateGue(self):
text = "gue gue GUE ogues longuement la guerre"
- v = verse.Verse(text, template.Template(), template.Pattern("12"))
+ v = verse.Verse(text, template.Template(), plint.pattern.Pattern("12"))
v.parse()
c = ''.join([x.text for x in v.chunks.chunks])
self.assertFalse("gue" in c)
diff --git a/plint/tests/test_gender.py b/plint/tests/test_gender.py
@@ -1,12 +1,13 @@
import unittest
+import plint.pattern
from plint import verse, template
class Genders(unittest.TestCase):
def testSingleSyllJe(self):
text = "Patati patata patatatah où suis-je"
- v = verse.Verse(text, template.Template(), template.Pattern("12"))
+ v = verse.Verse(text, template.Template(), plint.pattern.Pattern("12"))
v.parse()
gend = v.genders()
self.assertTrue(v.valid())
@@ -15,7 +16,7 @@ class Genders(unittest.TestCase):
def testSingleSyllJeBis(self):
text = "Patati patata patatah la verrai-je"
- v = verse.Verse(text, template.Template(), template.Pattern("12"))
+ v = verse.Verse(text, template.Template(), plint.pattern.Pattern("12"))
v.parse()
gend = v.genders()
self.assertTrue(v.valid())
@@ -24,7 +25,7 @@ class Genders(unittest.TestCase):
def testSingleSyllLe(self):
text = "Patati patata patatata prends-le"
- v = verse.Verse(text, template.Template(), template.Pattern("12"))
+ v = verse.Verse(text, template.Template(), plint.pattern.Pattern("12"))
v.parse()
gend = v.genders()
self.assertTrue(v.valid())
@@ -33,7 +34,7 @@ class Genders(unittest.TestCase):
def testSingleSyllCe(self):
text = "Patati patata patatata mais qu'est-ce"
- v = verse.Verse(text, template.Template(), template.Pattern("12"))
+ v = verse.Verse(text, template.Template(), plint.pattern.Pattern("12"))
v.parse()
gend = v.genders()
self.assertTrue(v.valid())
@@ -42,7 +43,7 @@ class Genders(unittest.TestCase):
def testSingleSyllHyphen(self):
text = "Patati patata patata mange-les"
- v = verse.Verse(text, template.Template(), template.Pattern("12"))
+ v = verse.Verse(text, template.Template(), plint.pattern.Pattern("12"))
v.parse()
gend = v.genders()
self.assertTrue(v.valid())
@@ -51,7 +52,7 @@ class Genders(unittest.TestCase):
def testSingleSyllNoHyphen(self):
text = "Patati patata patata mange les"
- v = verse.Verse(text, template.Template(), template.Pattern("12"))
+ v = verse.Verse(text, template.Template(), plint.pattern.Pattern("12"))
v.parse()
gend = v.genders()
self.assertTrue(v.valid())
diff --git a/plint/tests/test_hiatus.py b/plint/tests/test_hiatus.py
@@ -1,36 +1,37 @@
import unittest
+import plint.pattern
from plint import verse, template
class Hiatus(unittest.TestCase):
def testBadVowel(self):
- v = verse.Verse("patati patata patata arbrisseau", template.Template(), template.Pattern("12"))
+ v = verse.Verse("patati patata patata arbrisseau", template.Template(), plint.pattern.Pattern("12"))
v.parse()
self.assertFalse(v.valid())
def testBadUnaspirated(self):
- v = verse.Verse("patati patata patata hirondelle", template.Template(), template.Pattern("12"))
+ v = verse.Verse("patati patata patata hirondelle", template.Template(), plint.pattern.Pattern("12"))
v.parse()
self.assertFalse(v.valid())
def testGoodAspirated(self):
- v = verse.Verse("patati patata patata tata hache", template.Template(), template.Pattern("12"))
+ v = verse.Verse("patati patata patata tata hache", template.Template(), plint.pattern.Pattern("12"))
v.parse()
self.assertTrue(v.valid())
def testGoodConsonant(self):
- v = verse.Verse("patati patata patatah arbrisseau", template.Template(), template.Pattern("12"))
+ v = verse.Verse("patati patata patatah arbrisseau", template.Template(), plint.pattern.Pattern("12"))
v.parse()
self.assertTrue(v.valid())
def testGoodMuteE(self):
- v = verse.Verse("patati patata patatue arbrisseau", template.Template(), template.Pattern("12"))
+ v = verse.Verse("patati patata patatue arbrisseau", template.Template(), plint.pattern.Pattern("12"))
v.parse()
self.assertTrue(v.valid())
def testBadEt(self):
- v = verse.Verse("patati patata patata et avant", template.Template(), template.Pattern("12"))
+ v = verse.Verse("patati patata patata et avant", template.Template(), plint.pattern.Pattern("12"))
v.parse()
self.assertFalse(v.valid())
diff --git a/plint/tests/test_sanity_check.py b/plint/tests/test_sanity_check.py
@@ -1,5 +1,6 @@
import unittest
+import plint.pattern
from plint import diaeresis, verse, template, common
@@ -7,31 +8,31 @@ class SanityCheck(unittest.TestCase):
def testSimple(self):
text = "Hello World!! This is a test_data"
- v = verse.Verse(text, template.Template(), template.Pattern("12"))
+ v = verse.Verse(text, template.Template(), plint.pattern.Pattern("12"))
v.parse()
self.assertEqual(text, v.line)
def testComplex(self):
text = "Aye AYAYE aye gue que geque AYAYAY a prt sncf bbbéé"
- v = verse.Verse(text, template.Template(), template.Pattern("12"))
+ v = verse.Verse(text, template.Template(), plint.pattern.Pattern("12"))
v.parse()
self.assertEqual(text, v.line)
def testLeadingSpace(self):
text = " a"
- v = verse.Verse(text, template.Template(), template.Pattern("12"))
+ v = verse.Verse(text, template.Template(), plint.pattern.Pattern("12"))
v.parse()
self.assertEqual(text, v.line)
def testLeadingSpaceHyphenVowel(self):
text = " -a"
- v = verse.Verse(text, template.Template(), template.Pattern("12"))
+ v = verse.Verse(text, template.Template(), plint.pattern.Pattern("12"))
v.parse()
self.assertEqual(text, v.line)
def testLeadingSpaceHyphenConsonant(self):
text = " -c"
- v = verse.Verse(text, template.Template(), template.Pattern("12"))
+ v = verse.Verse(text, template.Template(), plint.pattern.Pattern("12"))
v.parse()
self.assertEqual(text, v.line)
diff --git a/plint/tests/test_sanity_check2.py b/plint/tests/test_sanity_check2.py
@@ -1,12 +1,13 @@
import unittest
+import plint.pattern
from plint import verse, template
class SanityCheck2(unittest.TestCase):
def testSimple(self):
text = "Patati patata patata tata vies"
- v = verse.Verse(text, template.Template(), template.Pattern("12"))
+ v = verse.Verse(text, template.Template(), plint.pattern.Pattern("12"))
v.parse()
gend = v.genders()
self.assertEqual(1, len(gend))
diff --git a/plint/verse.py b/plint/verse.py
@@ -1,7 +1,8 @@
#!/usr/bin/python3
-from plint.chunks import Chunks
-from plint.common import normalize, is_vowels, SURE_END_FEM, strip_accents
from plint import error, common
+from plint.chunks import Chunks
+from plint.common import SURE_END_FEM, strip_accents
+
# the writing is designed to make frhyme succeed
# end vowels will be elided
@@ -11,13 +12,11 @@ class Verse:
@property
def line(self):
- return ''.join(x.original for x in self.chunks.chunks)
+ return self.chunks.get_line()
@property
def normalized(self):
- return ''.join(normalize(x.original, strip=False, rm_apostrophe_end=False)
- if x.text_pron is None else x.text
- for x in self.chunks.chunks).lstrip().rstrip()
+ return self.chunks.normalized()
def __init__(self, input_line, template, pattern, threshold=None):
self.template = template
@@ -25,8 +24,8 @@ class Verse:
self.threshold = threshold
self.phon = None
self.possible = None
- self._line = input_line
- self.chunks = Chunks(input_line)
+ self.input_line = input_line
+ self.chunks = Chunks(self)
self.text = None
def annotate(self):
@@ -119,7 +118,6 @@ class Verse:
def last_count(self):
"""return min number of syllables for last word"""
-
tot = 0
for chunk in self.chunks.chunks[::-1]:
if chunk.original.endswith(' ') or chunk.original.endswith('-'):
@@ -133,18 +131,10 @@ class Verse:
return tot
def problems(self):
+ errors = self.chunks.get_errors_set(self.template.options['forbidden_ok'], self.template.options['hiatus_ok'])
result = []
- errors = set()
if len(self.possible) == 0:
result.append(error.ErrorBadMetric())
- for chunk in self.chunks.chunks:
- if chunk.error is not None:
- if chunk.error == "ambiguous" and not self.template.options['forbidden_ok']:
- errors.add(error.ErrorForbiddenPattern)
- if chunk.error == "hiatus" and not self.template.options['hiatus_ok']:
- errors.add(error.ErrorHiatus)
- if chunk.error == "illegal":
- errors.add(error.ErrorBadCharacters)
for k in errors:
result.append(k())
return result
@@ -160,3 +150,9 @@ class Verse:
# try to infer gender even when metric is wrong
result.update(set(self.feminine(None)))
return result
+
+ def print_n_syllables(self, n_syllables, offset, output_file):
+ self.annotate()
+ # only generate a context with the prescribed final weight
+ # where "final" is the offset-th chunk with a weight from the end
+ self.chunks.print_n_syllables(n_syllables, offset, output_file)
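The public surface of Verse (parse, valid, genders, last_count) is unchanged; chunk bookkeeping now lives in Chunks. A minimal sketch mirroring the unit tests below, assuming the default diaeresis data is available; the sample line is illustrative:

    import plint.pattern
    from plint import verse, template

    v = verse.Verse("Je fais souvent ce rêve étrange et pénétrant",
                    template.Template(), plint.pattern.Pattern("12"))
    v.parse()
    print(v.valid())    # whether a 12-syllable reading exists
    print(v.genders())  # candidate rhyme genders, e.g. {'M'}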
diff --git a/plint/vowels.py b/plint/vowels.py
@@ -3,45 +3,6 @@
"""Compute the number of syllabes taken by a vowel chunk"""
-from plint.common import strip_accents
-from plint import diaeresis
-
-DEFAULT_THRESHOLD = 3
-
-
-def possible_weights_ctx(chunks, pos, threshold=None):
- global DEFAULT_THRESHOLD
- if not threshold:
- threshold = DEFAULT_THRESHOLD
- chunk = chunks[pos]
- q = make_query(chunks, pos)
- v = diaeresis.diaeresis_finder.lookup(q)
- if len(v.keys()) == 1 and v[list(v.keys())[0]] > threshold:
- return [int(list(v.keys())[0])]
- else:
- return possible_weights_seed(chunk)
-
-
-def make_query(chunks, pos):
- cleared = [clear(chunk) for chunk in chunks]
- if cleared[pos].endswith(' '):
- cleared[pos] = cleared[pos].rstrip()
- if pos + 1 < len(cleared):
- cleared[pos + 1] = " " + cleared[pos + 1]
- else:
- cleared.append(' ')
- ret2 = intersperse(
- ''.join(cleared[pos + 1:]),
- ''.join([x[::-1] for x in cleared[:pos][::-1]]))
- ret = [cleared[pos]] + ret2
- return ret
-
-
-def clear(chunk):
- if chunk.word_end == True:
- return (chunk.text + ' ')
- return chunk.text
-
def intersperse(left, right):
if (len(left) == 0 or left[0] == ' ') and (len(right) == 0 or right[0] == ' '):
@@ -53,98 +14,9 @@ def intersperse(left, right):
return [left[0], right[0]] + intersperse(left[1:], right[1:])
-def possible_weights_approx(chunk):
- """Return the possible number of syllabes taken by a vowel chunk (permissive approximation)"""
- if len(chunk) == 1:
- return [1]
- # old spelling and weird exceptions
- if chunk in ['ouï']:
- return [1, 2] # TODO unsure about that
- if chunk in ['eüi', 'aoû', 'uë']:
- return [1]
- if chunk in ['aïe', 'oë', 'ouü']:
- return [1, 2]
- if contains_trema(chunk):
- return [2]
- chunk = strip_accents(chunk, True)
- if chunk in ['ai', 'ou', 'eu', 'ei', 'eau', 'eoi', 'eui', 'au', 'oi',
- 'oie', 'œi', 'œu', 'eaie', 'aie', 'oei', 'oeu', 'ea', 'ae', 'eo',
- 'eoie', 'oe', 'eai', 'eue', 'aa', 'oo', 'ee', 'ii', 'aii',
- 'yeu', 'ye', 'you']:
- return [1]
- if chunk == "oua":
- return [1, 2] # "pouah"
- if chunk == "ao":
- return [1, 2] # "paon"
- for x in ['oa', 'ea', 'eua', 'euo', 'ua', 'uo', 'yau']:
- if x in chunk:
- return [2]
- # beware of "déesse"
- if chunk == 'ée':
- return [1, 2]
- if chunk[0] == 'i':
- return [1, 2]
- if chunk[0] == 'u' and (strip_accents(chunk[1]) in ['i', 'e']):
- return [1, 2]
- if chunk[0] == 'o' and chunk[1] == 'u' and len(chunk) >= 3 and strip_accents(chunk[2]) in ['i', 'e']:
- return [1, 2]
- if 'é' in chunk or 'è' in chunk:
- return [2]
- # we can't tell
- return [1, 2]
-
-
def contains_trema(chunk):
"""Test if a string contains a word with a trema"""
for x in ['ä', 'ë', 'ï', 'ö', 'ü', 'ÿ']:
if x in chunk:
return True
return False
-
-
-def possible_weights_seed(chunk):
- """Return the possible number of syllabes taken by a vowel chunk"""
- if len(chunk.text) == 1:
- return [1]
- # dioïde, maoïste, taoïste
- if (chunk.text[-1] == 'ï' and len(chunk.text) >= 3 and not
- chunk.text[-3:-1] == 'ou'):
- return [3]
- # ostéoarthrite
- if "éoa" in chunk.text:
- return [3]
- # antiaérien; but let's play it safe
- if "iaé" in chunk.text:
- return [2, 3]
- # giaour, miaou, niaouli
- if "iaou" in chunk.text:
- return [2, 3]
- # bioélectrique
- if "ioé" in chunk.text:
- return [2, 3]
- # méiose, nucléion, etc.
- if "éio" in chunk.text:
- return [2, 3]
- # radioactif, radioamateur, etc.
- if "ioa" in chunk.text:
- return [2, 3]
- # pléiade
- if "éio" in chunk.text:
- return [2, 3]
- # pompéien, tarpéien...
- # in theory the "-ie" should give a diaeresis, so 3 syllabes
- # let's keep the benefit of the doubt...
- # => this also gives 3 as a possibility for "obéie"...
- if "éie" in chunk.text:
- return [2, 3]
- # tolstoïen
- # same remark
- if "oïe" in chunk.text:
- return [2, 3]
- # shanghaïen (diaeresis?), but also "aië"
- if "aïe" in chunk.text:
- return [1, 2, 3]
- if chunk.text in ['ai', 'ou', 'eu', 'ei', 'eau', 'au', 'oi']:
- return [1]
- # we can't tell
- return [1, 2]
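Only intersperse and contains_trema remain in plint/vowels.py; make_query and the possible_weights helpers are deleted here, with make_query now living on the chunk objects (the template.py hunk above calls chunk.make_query). A quick check of contains_trema:

    from plint.vowels import contains_trema

    for word in ['haïr', 'Noël', 'aout']:
        print(word, contains_trema(word))  # True, True, False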
diff --git a/test.sh b/test.sh
@@ -5,12 +5,32 @@
echo "It is normal that some errors occur when running this script" >/dev/stderr
echo "See test_expected_output.out for the usual errors that are output" >/dev/stderr
+
+rm -f test_temp.txt;
+rm -f test_temp_sorted.txt;
+rm -f test_expected_sorted.txt;
+
for a in plint/test_data/*.tpl; do
echo "$a"
+ echo "$a" >> test_temp.txt
if [[ $a == *cyrano_full* ]]
then
- ./plint.py $(pwd)/$a ../data/diaeresis_cyrano.json < $(pwd)/${a%.tpl}
+ ./plint.py $(pwd)/$a ../data/diaeresis_cyrano.json < $(pwd)/${a%.tpl} &>> test_temp.txt
else
- ./test_one.sh $(basename "${a%.tpl}")
+ ./test_one.sh $(basename "${a%.tpl}") &>> test_temp.txt
fi
done
+
+sort test_temp.txt > test_temp_sorted.txt;
+sort test_expected_output.out > test_expected_sorted.txt;
+
+# compare_test_output.py prints "1" on a match and "0" otherwise, so test the value, not the line count
+if [ "$(python3 compare_test_output.py test_temp_sorted.txt test_expected_sorted.txt)" -eq 1 ]; then
+ echo "TEST SUCCEEDED";
+else
+ echo "TEST FAILED";
+ diff test_temp_sorted.txt test_expected_sorted.txt
+fi
+
+rm -f test_temp.txt;
+rm -f test_temp_sorted.txt;
+rm -f test_expected_sorted.txt