chunk.py - plint - French poetry validator (local mirror of https://gitlab.com/a3nm/plint)

chunk.py (27660B)
      1 import re
      2 import sys
      3 
      4 from haspirater import haspirater
      5 from plint import common, diaeresis, error
      6 from plint.common import normalize, strip_accents_one, is_consonants, APOSTROPHES, is_vowels, get_consonants_regex, \
      7     strip_accents, SURE_END_FEM
      8 from plint.error import ErrorCollection
      9 from plint.vowels import contains_trema, intersperse
     10 
     11 
# Fallback threshold used by Chunk.possible_weights_ctx when the caller gives
# none: a unique answer from the diaeresis lookup is only trusted when the
# value associated with it is strictly greater than this.
DEFAULT_THRESHOLD = 3
     13 
     14 
     15 class Chunk:
     16 
     17     def __init__(self, word, verse):
     18         self.original = word
     19         self.text = normalize(word, rm_apostrophe=True)
     20         self.hemistiche = None
     21         self.error = None
     22         self.illegal_str = None
     23         self.weights = None
     24         self.had_hyphen = None
     25         self.text_pron = None
     26         self.elision = None
     27         self.no_hiatus = None
     28         self.causes_hiatus = None
     29         self.elidable = None
     30         self.word_end = False
     31 
     32         # self.weight contains the weight attributed to the chunk when fitting
     33         # all chunks of the verse (function fit in chunks.py) to respect the
     34         # metric
     35         self.weight = None
     36 
     37         self.verse = verse
     38 
     39     def __repr__(self):
     40         return "Chunk(" \
     41                + "original:" + self.original \
     42                + ", text:" + self.text \
     43                + ", weights:" + str(self.weights or []) \
     44                + ", weight:" + str(self.weight or "") \
     45                + ", elidable:" + str(self.elidable or False) \
     46                + ", elision:" + str(self.elision or False) \
     47                + ", hemistiche:" + str(self.hemistiche) \
     48                + ", error:" + str(self.error) \
     49                + ", illegal_str:" + str(self.illegal_str) \
     50                + ", had_hypher:" + str(self.had_hyphen) \
     51                + ", text_pron:" + str(self.text_pron) \
     52                + ", no_hiatus:" + str(self.no_hiatus) \
     53                + ", word_end:" + str(self.word_end) \
     54                + ")" + "\n"
     55 
     56     def copy(self):
     57         new_chunk = Chunk(self.original, self.verse)
     58         new_chunk.original = self.original
     59         new_chunk.text = self.text
     60         new_chunk.hemistiche = self.hemistiche
     61         new_chunk.error = self.error
     62         new_chunk.illegal_str = self.illegal_str
     63         new_chunk.weights = self.weights
     64         new_chunk.had_hyphen = self.had_hyphen
     65         new_chunk.text_pron = self.text_pron
     66         new_chunk.elision = self.elision
     67         new_chunk.no_hiatus = self.no_hiatus
     68         new_chunk.elidable = self.elidable
     69         new_chunk.word_end = self.word_end
     70         new_chunk.weight = self.weight
     71         return new_chunk
     72 
     73     def set_hemistiche(self, hemistiche):
     74         # The hemistiche can take the following values
     75         #    ok: correct
     76         #    cut: falls at the middle of a word
     77         #    fem: preceding word ends by a mute e
     78         self.hemistiche = hemistiche
     79 
     80     def check_forbidden_characters(self):
     81         es = ""
     82         for x in self.text:
     83             if not common.remove_punctuation(strip_accents_one(x)[0].lower()) in common.LEGAL:
     84                 es += 'I'
     85                 self.error = "illegal"
     86             else:
     87                 es += ' '
     88         if self.error is not None and self.error == "illegal":
     89             self.illegal_str = es
     90 
     91     def simplify_gu_qu(self, next_chunk):
     92         if next_chunk.text.startswith('u'):
     93             if self.text.endswith('q'):
     94                 next_chunk.text = next_chunk.text[1:]
     95                 if next_chunk.text == '':
     96                     self.original += next_chunk.original
     97                     next_chunk.original = ''
     98             if self.text.endswith('g') and len(next_chunk.text) >= 2:
     99                 if next_chunk.text[1] in "eéèa":
    100                     next_chunk.text = next_chunk.text[1:]
    101 
    102     def elide_inside_words(self, all_next_chunks):
    103         if self.text == "e-":
    104             self.weights = [0]  # force elision
    105         next_chunk = all_next_chunks[0]
    106         if self.text == "e" and next_chunk.text.startswith("-h"):
    107             # collect what follows until the next hyphen or end
    108             flw = next_chunk.original.split('-')[1]
    109             for future_chunk in all_next_chunks[1:]:
    110                 flw += future_chunk.original.split('-')[0]
    111                 if '-' in future_chunk.original:
    112                     break
    113             # TODO: not sure if this reconstruction of the original word is bulletproof...
    114             if haspirater.lookup(normalize(flw)):
    115                 self.weights = [0]
    116             else:
    117                 self.weights = [1]
    118 
    119     def remove_leading_and_trailing_crap(self):
    120         seen_space = False
    121         seen_hyphen = False
    122         while len(self.text) > 0 and self.text[0] in ' -':
    123             if self.text[0] == ' ':
    124                 seen_space = True
    125             else:
    126                 seen_hyphen = True
    127             self.text = self.text[1:]
    128         while len(self.text) > 0 and self.text[-1] in ' -':
    129             if self.text[-1] == ' ':
    130                 seen_space = True
    131             else:
    132                 seen_hyphen = True
    133             self.text = self.text[:-1]
    134         if seen_hyphen and not seen_space:
    135             self.had_hyphen = True
    136 
    137     def is_empty(self):
    138         return len(self.text) == 0
    139 
    140     def add_original(self, other_chunk):
    141         self.original += other_chunk.original
    142 
    143     def create_acronym(self):
    144         new_chunks = []
    145         for j, character in enumerate(self.text):
    146             try:
    147                 new_chunk_content = LETTERS[character]
    148                 # hack: the final 'e's in letters are just to help pronunciation
    149                 # inference and are only needed at end of word, otherwise they will
    150                 # mess syllable count up
    151                 if j < len(self.text) - 1 and new_chunk_content[-1] == 'e':
    152                     new_chunk_content = new_chunk_content[:-1]
    153             except KeyError:
    154                 new_chunk_content = character + 'é'
    155             new_chunks += [(j, x) for x in re.split(get_consonants_regex(), new_chunk_content)]
    156         new_chunks = [x for x in new_chunks if len(x[1]) > 0]
    157         new_word = []
    158         last_opos = -1
    159         for j, (original_position, character) in enumerate(new_chunks):
    160             part = ""
    161             if j == len(new_chunks) - 1:
    162                 # don't miss final spaces
    163                 part = self.original[last_opos + 1:]
    164             elif last_opos < original_position:
    165                 part = self.original[last_opos + 1:original_position + 1]
    166                 last_opos = original_position
    167             # allow or forbid elision because of possible ending '-e' before
    168             # forbid hiatus both for this and for preceding
    169             # instruct that we must use text for the pronunciation
    170             new_chunk = Chunk(part, self.verse)
    171             new_chunk.original = part
    172             new_chunk.text = character
    173             new_chunk.text_pron = True
    174             new_chunk.elision = [False, True]
    175             new_chunk.no_hiatus = True
    176             new_word.append(new_chunk)
    177             # propagate information from splithyph
    178             new_word[-1].hemistiche = self.hemistiche
    179         return new_word
    180 
    181     def check_elidable(self):
    182         if self.text == 'e':
    183             self.elidable = [True]
    184 
    185     def is_consonants(self):
    186         return is_consonants(self.text)
    187 
    188     def ends_with_apostrophe(self):
    189         return re.search("[" + APOSTROPHES + "]$", self.original) is not None
    190 
    191     def elide_vowel_problems(self, chunk_group):
    192         if self.elision is None:
    193             self.elision_wrap(chunk_group)
    194 
    195     def process_y_cases(self, previous_chunk, next_chunk):
    196         new_word_from_chunk = []
    197         if 'y' not in self.text or len(self.text) == 1 or self.text.startswith("y"):
    198             new_word_from_chunk.append(self)
    199         else:
    200             if previous_chunk is not None and next_chunk is not None:
    201                 # special cases of "pays", "alcoyle", "abbayes"
    202                 c_text = self.text
    203                 p_text = previous_chunk.text
    204                 n_text = next_chunk.text
    205                 # TODO Should you force if this condition does not apply?
    206                 if ((c_text == "ay" and p_text.endswith("p") and n_text.startswith("s"))
    207                         or
    208                         (c_text == "oy" and p_text.endswith("lc")
    209                          and n_text.startswith("l"))
    210                         or
    211                         (c_text == "aye" and p_text.endswith("bb")
    212                          and n_text.startswith("s"))):
    213                     # force weight
    214                     self.weights = [2]
    215                     new_word_from_chunk.append(self)
    216                     return new_word_from_chunk
    217             must_force = next_chunk is None and previous_chunk is not None and \
    218                 (self.text == "aye" and previous_chunk.text.endswith("bb"))
    219             if must_force:
    220                 # force weight
    221                 self.weights = [2]
    222                 new_word_from_chunk.append(self)
    223             else:
    224                 sub_chunks = re.split(re.compile("(y+)"), self.text)
    225                 sub_chunks = [x for x in sub_chunks if len(x) > 0]
    226                 for j, sub_chunk in enumerate(sub_chunks):
    227                     lindex = int(j * len(self.original) / len(sub_chunks))
    228                     rindex = int((j + 1) * len(self.original) / len(sub_chunks))
    229                     part = self.original[lindex:rindex]
    230                     new_subchunk_text = 'Y' if 'y' in sub_chunk else sub_chunk
    231                     new_subchunk = self.copy()
    232                     new_subchunk.original = part
    233                     new_subchunk.text = new_subchunk_text
    234                     new_word_from_chunk.append(new_subchunk)
    235         return new_word_from_chunk
    236 
    237     def is_vowels(self):
    238         return is_vowels(self.text)
    239 
    240     def is_dash_elidable(self):
    241         # "fais-le" not elidable, but "suis-je" and "est-ce" is
    242         return not ('-' in self.text and not self.text.endswith('-j') and not self.text.endswith('-c'))
    243 
    244     def check_elidable_with_next(self, next_chunk):
    245         if self.elidable is None:
    246             self.elidable = next_chunk.elision
    247 
    248     def is_potentially_ambiguous_hiatus(self):
    249         return self.text in ["ie", "ée", "ue"]
    250 
    251     def ends_with_potentially_ambiguous_hiatus(self):
    252         return len(self.text) >= 2 and self.text[-2:] in ["ie", "ée", "ue"]
    253 
    254     def check_potentially_ambiguous_plural(self, previous_chunk):
    255         if self.text == "s":
    256             if previous_chunk.is_potentially_ambiguous_hiatus():
    257                 previous_chunk.error = "ambiguous"
    258                 self.error = "ambiguous"
    259 
    260     def check_potentially_ambiguous_with_elision(self, next_chunk):
    261         if self.ends_with_potentially_ambiguous_hiatus():
    262             if next_chunk.elision is not None or True not in next_chunk.elision:
    263                 self.error = "ambiguous"
    264                 next_chunk.error = "ambiguous"
    265 
    266     def check_hiatus(self, previous_chunk, next_chunk, only_two_parts):
    267         if previous_chunk is not None:
    268             self.check_potentially_ambiguous_plural(previous_chunk)
    269         if self.ends_with_potentially_ambiguous_hiatus():
    270             if not any(next_chunk.elision or [False]):
    271                 self.error = "ambiguous"
    272                 next_chunk.error = "ambiguous"
    273 
    274         # elision concerns words ending with a vowel without a mute 'e'
    275         # that have not been marked "no_hiatus"
    276         # it also concerns specifically "et"
    277         elif (not self.text.endswith('e') and self.no_hiatus is None
    278               and (self.is_vowels() or self.text == 'Y')
    279               or (only_two_parts and previous_chunk.text == 'e' and self.text == 't')):
    280             # it happens if the next word is not marked no_hiatus
    281             # and starts with something that causes elision
    282             if next_chunk.causes_hiatus and next_chunk.no_hiatus is None:
    283                 self.error = "hiatus"
    284                 next_chunk.error = "hiatus"
    285 
    286     def make_word_end(self):
    287         self.word_end = True
    288 
    289     def contains_break(self):
    290         return '-' in self.text \
    291                or self.word_end or False \
    292                or self.had_hyphen or False
    293 
    294     def is_e(self):
    295         return self.text == "e"
    296 
    297     def possible_weights_approx(self):
    298         """Return the possible number of syllabes taken by a vowel chunk (permissive approximation)"""
    299         chunk_text = self.text
    300         if len(chunk_text) == 1:
    301             return [1]
    302         # old spelling and weird exceptions
    303         if chunk_text in ['ouï']:
    304             return [1, 2]  # TODO unsure about that
    305         if chunk_text in ['eüi', 'aoû', 'uë']:
    306             return [1]
    307         if chunk_text in ['aïe', 'oë', 'ouü']:
    308             return [1, 2]
    309         if contains_trema(chunk_text):
    310             return [2]
    311         chunk_text = strip_accents(chunk_text, True)
    312         if chunk_text in ['ai', 'ou', 'eu', 'ei', 'eau', 'eoi', 'eui', 'au', 'oi',
    313                           'oie', 'œi', 'œu', 'eaie', 'aie', 'oei', 'oeu', 'ea', 'ae', 'eo',
    314                           'eoie', 'oe', 'eai', 'eue', 'aa', 'oo', 'ee', 'ii', 'aii',
    315                           'yeu', 'ye', 'you']:
    316             return [1]
    317         if chunk_text == "oua":
    318             return [1, 2]  # "pouah"
    319         if chunk_text == "ao":
    320             return [1, 2]  # "paon"
    321         for x in ['oa', 'ea', 'eua', 'euo', 'ua', 'uo', 'yau']:
    322             if x in chunk_text:
    323                 return [2]
    324         # beware of "déesse"
    325         if chunk_text == 'ée':
    326             return [1, 2]
    327         if chunk_text[0] == 'i':
    328             return [1, 2]
    329         if chunk_text[0] == 'u' and (strip_accents(chunk_text[1]) in ['i', 'e']):
    330             return [1, 2]
    331         if chunk_text[0] == 'o' and chunk_text[1] == 'u' and len(chunk_text) >= 3 and\
    332                 strip_accents(chunk_text[2]) in ['i', 'e']:
    333             return [1, 2]
    334         if 'é' in chunk_text or 'è' in chunk_text:
    335             return [2]
    336         # we can't tell
    337         return [1, 2]
    338 
    339     def clear(self):
    340         if self.word_end is None or not self.word_end:
    341             return self.text
    342         return self.text + ' '
    343 
    344     def set_possible_weights_from_context(self, chunks_before, chunks_after, template, threshold):
    345         if self.weights is not None:
    346             return
    347         if len(chunks_after) > 0:
    348             next_chunk = chunks_after[0]
    349         else:
    350             next_chunk = None
    351 
    352         if len(chunks_before) > 0:
    353             previous_chunk = chunks_before[-1]
    354         else:
    355             previous_chunk = None
    356 
    357         if len(chunks_before) > 1:
    358             previous_previous_chunk = chunks_before[-2]
    359         else:
    360             previous_previous_chunk = None
    361 
    362         if ((len(chunks_after) <= 1 and self.is_e())
    363                 and not (next_chunk is not None and next_chunk.is_vowels())
    364                 and not (previous_chunk is None or previous_chunk.contains_break())
    365                 and not (previous_previous_chunk is None or previous_previous_chunk.contains_break())):
    366             # special case for verse endings, which can get elided (or not)
    367             # but we don't elide lone syllables ("prends-le", etc.)
    368 
    369             if next_chunk is None:
    370                 self.weights = [0]  # ending 'e' is elided
    371             elif next_chunk.text == 's':
    372                 self.weights = [0]  # ending 'es' is elided
    373             elif next_chunk.text == 'nt':
    374                 # ending 'ent' is sometimes elided, try to use pronunciation
    375                 # actually, this will have an influence on the rhyme's gender
    376                 # see feminine
    377                 possible = []
    378                 if not self.verse.phon or len(self.verse.phon) == 0:
    379                     self.weights = [0, 1]  # do something reasonable without pron
    380                 else:
    381                     for possible_phon in self.verse.phon:
    382                         if possible_phon.endswith(')') or possible_phon.endswith('#'):
    383                             possible.append(1)
    384                         else:
    385                             possible.append(0)
    386                     self.weights = possible
    387             else:
    388                 self.weights = self.possible_weights(chunks_before, chunks_after, template, threshold)
    389         elif (next_chunk is None and self.text == 'e' and
    390                 previous_chunk is not None and (previous_chunk.text.endswith('-c')
    391                                                 or previous_chunk.text.endswith('-j')
    392                                                 or (previous_chunk.text == 'c'
    393                                                     and previous_chunk.had_hyphen is not None)
    394                                                 or (previous_chunk.text == 'j'
    395                                                     and previous_chunk.had_hyphen is not None))):
    396             self.weights = [0]  # -ce and -je are elided
    397         elif next_chunk is None and self.text in ['ie', 'ée']:
    398             self.weights = [1]
    399         # elide "-ée" and "-ées", but be specific (beware of e.g. "réel")
    400         elif (len(chunks_after) <= 1
    401                 and self.text == 'ée'
    402                 and (next_chunk is None or chunks_after[-1].text == 's')):
    403             self.weights = [1]
    404         elif self.elidable is not None:
    405             self.weights = [int(not x) for x in self.elidable]
    406         else:
    407             self.weights = self.possible_weights(chunks_before, chunks_after, template, threshold)
    408 
    409     def possible_weights(self, chunks_before, chunks_after, template, threshold):
    410         if template.options['diaeresis'] == "classical":
    411             return self.possible_weights_ctx(chunks_before, chunks_after, threshold=threshold)
    412         elif template.options['diaeresis'] == "permissive":
    413             return self.possible_weights_approx()
    414 
    415     def possible_weights_ctx(self, chunks_before, chunks_after, threshold=None):
    416         if not threshold:
    417             threshold = DEFAULT_THRESHOLD
    418         q = self.make_query(chunks_before, chunks_after)
    419         v = diaeresis.diaeresis_finder.lookup(q)
    420         if len(v.keys()) == 1 and v[list(v.keys())[0]] > threshold:
    421             return [int(list(v.keys())[0])]
    422         else:
    423             return self.possible_weights_seed()
    424 
    425     def make_query(self, chunks_before, chunks_after):
    426         cleaned_before = [chunk.clear() for chunk in chunks_before]
    427         cleaned_after = [chunk.clear() for chunk in chunks_after]
    428         current_clear = self.clear()
    429         if current_clear.endswith(' '):
    430             current_clear = current_clear.rstrip()
    431             if len(cleaned_after) > 0:
    432                 cleaned_after[0] = " " + cleaned_after[0]
    433             else:
    434                 cleaned_after.append(' ')
    435         ret2 = intersperse(
    436             ''.join(cleaned_after),
    437             ''.join([x[::-1] for x in cleaned_before[::-1]]))
    438         ret = [current_clear] + ret2
    439         return ret
    440 
    441     def possible_weights_seed(self):
    442         """Return the possible number of syllabes taken by a vowel chunk"""
    443         if len(self.text) == 1:
    444             return [1]
    445         # dioïde, maoïste, taoïste
    446         if (self.text[-1] == 'ï' and len(self.text) >= 3 and not
    447                 self.text[-3:-1] == 'ou'):
    448             return [3]
    449         # ostéoarthrite
    450         if "éoa" in self.text:
    451             return [3]
    452         # antiaérien; but let's play it safe
    453         if "iaé" in self.text:
    454             return [2, 3]
    455         # giaour, miaou, niaouli
    456         if "iaou" in self.text:
    457             return [2, 3]
    458         # bioélectrique
    459         if "ioé" in self.text:
    460             return [2, 3]
    461         # méiose, nucléion, etc.
    462         if "éio" in self.text:
    463             return [2, 3]
    464         # radioactif, radioamateur, etc.
    465         if "ioa" in self.text:
    466             return [2, 3]
    467         # pléiade
    468         if "éio" in self.text:
    469             return [2, 3]
    470         # pompéien, tarpéien...
    471         # in theory the "-ie" should give a diaeresis, so 3 syllabes
    472         # let's keep the benefit of the doubt...
    473         # => this also gives 3 as a possibility for "obéie"...
    474         if "éie" in self.text:
    475             return [2, 3]
    476         # tolstoïen
    477         # same remark
    478         if "oïe" in self.text:
    479             return [2, 3]
    480         # shanghaïen (diaeresis?), but also "aië"
    481         if "aïe" in self.text:
    482             return [1, 2, 3]
    483         if self.text in ['ai', 'ou', 'eu', 'ei', 'eau', 'au', 'oi']:
    484             return [1]
    485         # we can't tell
    486         return [1, 2]
    487 
    488     def set_hemistiche_from_context(self, previous_previous_chunk, previous_chunk, next_chunk):
    489         if self.hemistiche is not None:
    490             return
    491         ending = self.text
    492         if not (self.word_end or False) and next_chunk is not None:
    493             if not (next_chunk.word_end or False):
    494                 self.hemistiche = "cut"
    495                 return
    496             ending += next_chunk.text
    497         if ending in SURE_END_FEM and previous_previous_chunk is not None and previous_chunk is not None:
    498             # check that this isn't a one-syllabe wourd (which is allowed)
    499             ok = False
    500             try:
    501                 if '-' in previous_chunk.original or (previous_chunk.word_end or False):
    502                     ok = True
    503                 if '-' in previous_previous_chunk.original or (previous_previous_chunk.word_end or False):
    504                     ok = True
    505             except IndexError:
    506                 pass
    507             if not ok:
    508                 # hemistiche ends in feminine
    509                 if any(self.elidable or [False]):
    510                     self.hemistiche = "elid"  # elidable final -e, but only OK if actually elided
    511                     return
    512                 else:
    513                     self.hemistiche = "fem"
    514                     return
    515         self.hemistiche = "ok"
    516 
    517     def normalize(self):
    518         if self.text_pron is None:
    519             return normalize(self.original, strip=False, rm_apostrophe_end=False)
    520         else:
    521             return self.text
    522 
    523     def get_original_text(self):
    524         return self.original
    525 
    526     def get_errors_set(self, forbidden_ok, hiatus_ok):
    527         errors_chunk = set()
    528         if self.error is not None:
    529             if self.error == "ambiguous" and not forbidden_ok:
    530                 errors_chunk.add(error.ErrorForbiddenPattern)
    531             if self.error == "hiatus" and not hiatus_ok:
    532                 errors_chunk.add(error.ErrorHiatus)
    533             if self.error == "illegal":
    534                 errors_chunk.add(error.ErrorBadCharacters)
    535         return errors_chunk
    536 
    537     def is_masculine(self):
    538         return (self.had_hyphen or False) or (self.word_end or False)
    539 
    540     def render(self, key, fmt="text"):
    541         if key == 'error' and self.error == 'illegal':
    542             if fmt == "text":
    543                 return self.illegal_str
    544             elif fmt == "json":
    545                 # don't know how to represent the specific characters
    546                 # cleanly in JSON
    547                 return "illegal_characters"
    548             else:
    549                 raise ValueError("bad format")
    550         if key == 'original':
    551             return str(self.original)
    552         elif key == 'weights':
    553             if fmt == "text":
    554                 return '-'.join([str(a) for a in self.weights or []])
    555             elif fmt == "json":
    556                 if self.weights is None:
    557                     return None
    558                 return [a for a in self.weights or []]
    559             else:
    560                 raise ValueError("bad format")
    561         elif key == 'error':
    562             if fmt == "text":
    563                 return ErrorCollection.keys.get(self.error, '') * len(self.original)
    564             elif fmt == "json":
    565                 return self.error or None
    566             else:
    567                 raise ValueError("bad format")
    568         elif key == 'hemis':
    569             if fmt == "text":
    570                 return str(self.hemistiche or "")
    571             elif fmt == "json":
    572                 return self.hemistiche or None
    573             else:
    574                 raise ValueError("bad format")
    575         else:
    576             print(key, file=sys.stderr)
    577             assert False
    578 
    579     def get_normalized_rendering(self, key, keys, fmt="text"):
    580         if fmt == "text":
    581             return ('{:^' + str(self.get_max_render_size(keys)) + '}').format(self.render(key))
    582         elif fmt == "json":
    583             return self.render(key, fmt=fmt)
    584         else:
    585             raise ValueError("bad format")
    586 
    587     def get_min_weight(self):
    588         return min(self.weights or [0])
    589 
    590     def get_max_weight(self):
    591         return max(self.weights or [0])
    592 
    593     def get_max_render_size(self, keys):
    594         return max(len(self.render(key)) for key in keys)
    595 
    596     def print_query(self, chunks_after, chunks_before, output_file):
    597         if (self.weights is not None and len(self.weights) > 1
    598                 and self.weight is not None and self.weight > 0):
    599             print(str(self.weight) + ' ' +
    600                   ' '.join(self.make_query(chunks_before, chunks_after)), file=output_file)
    601 
    602     # set self.elision and self.causes_hiatus
    603     def elision_wrap(self, chunk_group):
    604         first_letter = common.remove_punctuation(chunk_group[0].original.strip())
    605         word = ''.join(chunk.text for chunk in chunk_group)
    606         original_word = ''.join(chunk.original for chunk in chunk_group)
    607         self.elision = elision(word,
    608                        original_word,
    609                        first_letter == first_letter.upper())
    610     
    611         self.causes_hiatus = False
    612         if is_vowels(word[0]):
    613             # "oui, oui" often occurs
    614             if word not in ["oui", "ouis"]:
    615                 self.causes_hiatus = True
    616         elif word[0] == 'h':
    617             result = list(map((lambda s: not s),
    618                             haspirater.lookup(normalize(original_word,
    619                                 rm_all_begin=True))))
    620             if len(result) == 1 and True in result:
    621                 self.causes_hiatus = True
    622 
    623 
    624 
# Spelled-out form of individual letters, used by Chunk.create_acronym to
# turn an acronym into something whose syllables can be counted.  Letters
# missing from this table are spelled as the letter followed by 'é'.  The
# final 'e' of an entry is dropped by create_acronym except for the last
# letter of the acronym.
LETTERS = {
    'f': 'effe',
    'h': 'ache',
    'j': 'gi',
    'k': 'ka',
    'l': 'elle',
    'm': 'aime',
    'n': 'aine',
    'q': 'cu',
    'r': 'ère',
    's': 'esse',
    'w': 'doublevé',
    'x': 'ixe',
    'z': 'zaide'
}
    640 
    641 
    642 
    643 def elision(word, original_word, was_cap):
    644     if word.startswith('y'):
    645         if word == 'y':
    646             return [True]
    647         if was_cap:
    648             if word == 'york':
    649                 return [False]
    650             # Grevisse, Le Bon usage, 14th ed., paragraphs 49-50
    651             # depends on whether it's French or foreign...
    652             return [True, False]
    653         else:
    654             exc = ["york", "yeux", "yeuse", "ypérite"]
    655             for w in exc:
    656                 if word.startswith(w):
    657                     return [True]
    658             # otherwise, no elision
    659             return [False]
    660     if word in ["oui", "ouis"]:
    661         # elision for those words, but beware, no elision for "ouighour"
    662         # boileau : "Ont l'esprit mieux tourné que n'a l'homme ? Oui sans doute."
    663         # so elision sometimes
    664         return [True, False]
    665     if word.startswith("ouistiti") or word.startswith("ouagadougou"):
    666         return [False]
    667     # "un", "une" are non-elided as nouns ("cette une")
    668     if word in ["un", "une"]:
    669         return [True, False]
    670     # "onze" is not elided
    671     if word == "onze":
    672         return [False]
    673     if word.startswith('ulul'):
    674         return [False]  # ululement, ululer, etc.
    675     if word.startswith('uhlan'):
    676         return [False]  # uhlan
    677     if word[0] == 'h':
    678         if word == "huis":
    679             # special case, "huis" is elided but "huis clos" isn't
    680             return [True, False]
    681         # look up in haspirater using the original (but normalized) word
    682         return list(map((lambda s: not s),
    683                         haspirater.lookup(normalize(original_word,
    684                             rm_all_begin=True))))
    685     if is_vowels(word[0]):
    686         return [True]
    687     return [False]
	plint French poetry validator (local mirror of https://gitlab.com/a3nm/plint)
	git clone https://a3nm.net/git/plint/
	Log \| Files \| Refs \| README