chunk.py (27660B)
1 import re 2 import sys 3 4 from haspirater import haspirater 5 from plint import common, diaeresis, error 6 from plint.common import normalize, strip_accents_one, is_consonants, APOSTROPHES, is_vowels, get_consonants_regex, \ 7 strip_accents, SURE_END_FEM 8 from plint.error import ErrorCollection 9 from plint.vowels import contains_trema, intersperse 10 11 12 DEFAULT_THRESHOLD = 3 13 14 15 class Chunk: 16 17 def __init__(self, word, verse): 18 self.original = word 19 self.text = normalize(word, rm_apostrophe=True) 20 self.hemistiche = None 21 self.error = None 22 self.illegal_str = None 23 self.weights = None 24 self.had_hyphen = None 25 self.text_pron = None 26 self.elision = None 27 self.no_hiatus = None 28 self.causes_hiatus = None 29 self.elidable = None 30 self.word_end = False 31 32 # self.weight contains the weight attributed to the chunk when fitting 33 # all chunks of the verse (function fit in chunks.py) to respect the 34 # metric 35 self.weight = None 36 37 self.verse = verse 38 39 def __repr__(self): 40 return "Chunk(" \ 41 + "original:" + self.original \ 42 + ", text:" + self.text \ 43 + ", weights:" + str(self.weights or []) \ 44 + ", weight:" + str(self.weight or "") \ 45 + ", elidable:" + str(self.elidable or False) \ 46 + ", elision:" + str(self.elision or False) \ 47 + ", hemistiche:" + str(self.hemistiche) \ 48 + ", error:" + str(self.error) \ 49 + ", illegal_str:" + str(self.illegal_str) \ 50 + ", had_hypher:" + str(self.had_hyphen) \ 51 + ", text_pron:" + str(self.text_pron) \ 52 + ", no_hiatus:" + str(self.no_hiatus) \ 53 + ", word_end:" + str(self.word_end) \ 54 + ")" + "\n" 55 56 def copy(self): 57 new_chunk = Chunk(self.original, self.verse) 58 new_chunk.original = self.original 59 new_chunk.text = self.text 60 new_chunk.hemistiche = self.hemistiche 61 new_chunk.error = self.error 62 new_chunk.illegal_str = self.illegal_str 63 new_chunk.weights = self.weights 64 new_chunk.had_hyphen = self.had_hyphen 65 new_chunk.text_pron = self.text_pron 66 new_chunk.elision = self.elision 67 new_chunk.no_hiatus = self.no_hiatus 68 new_chunk.elidable = self.elidable 69 new_chunk.word_end = self.word_end 70 new_chunk.weight = self.weight 71 return new_chunk 72 73 def set_hemistiche(self, hemistiche): 74 # The hemistiche can take the following values 75 # ok: correct 76 # cut: falls at the middle of a word 77 # fem: preceding word ends by a mute e 78 self.hemistiche = hemistiche 79 80 def check_forbidden_characters(self): 81 es = "" 82 for x in self.text: 83 if not common.remove_punctuation(strip_accents_one(x)[0].lower()) in common.LEGAL: 84 es += 'I' 85 self.error = "illegal" 86 else: 87 es += ' ' 88 if self.error is not None and self.error == "illegal": 89 self.illegal_str = es 90 91 def simplify_gu_qu(self, next_chunk): 92 if next_chunk.text.startswith('u'): 93 if self.text.endswith('q'): 94 next_chunk.text = next_chunk.text[1:] 95 if next_chunk.text == '': 96 self.original += next_chunk.original 97 next_chunk.original = '' 98 if self.text.endswith('g') and len(next_chunk.text) >= 2: 99 if next_chunk.text[1] in "eéèa": 100 next_chunk.text = next_chunk.text[1:] 101 102 def elide_inside_words(self, all_next_chunks): 103 if self.text == "e-": 104 self.weights = [0] # force elision 105 next_chunk = all_next_chunks[0] 106 if self.text == "e" and next_chunk.text.startswith("-h"): 107 # collect what follows until the next hyphen or end 108 flw = next_chunk.original.split('-')[1] 109 for future_chunk in all_next_chunks[1:]: 110 flw += future_chunk.original.split('-')[0] 111 if '-' in future_chunk.original: 112 break 113 # TODO: not sure if this reconstruction of the original word is bulletproof... 114 if haspirater.lookup(normalize(flw)): 115 self.weights = [0] 116 else: 117 self.weights = [1] 118 119 def remove_leading_and_trailing_crap(self): 120 seen_space = False 121 seen_hyphen = False 122 while len(self.text) > 0 and self.text[0] in ' -': 123 if self.text[0] == ' ': 124 seen_space = True 125 else: 126 seen_hyphen = True 127 self.text = self.text[1:] 128 while len(self.text) > 0 and self.text[-1] in ' -': 129 if self.text[-1] == ' ': 130 seen_space = True 131 else: 132 seen_hyphen = True 133 self.text = self.text[:-1] 134 if seen_hyphen and not seen_space: 135 self.had_hyphen = True 136 137 def is_empty(self): 138 return len(self.text) == 0 139 140 def add_original(self, other_chunk): 141 self.original += other_chunk.original 142 143 def create_acronym(self): 144 new_chunks = [] 145 for j, character in enumerate(self.text): 146 try: 147 new_chunk_content = LETTERS[character] 148 # hack: the final 'e's in letters are just to help pronunciation 149 # inference and are only needed at end of word, otherwise they will 150 # mess syllable count up 151 if j < len(self.text) - 1 and new_chunk_content[-1] == 'e': 152 new_chunk_content = new_chunk_content[:-1] 153 except KeyError: 154 new_chunk_content = character + 'é' 155 new_chunks += [(j, x) for x in re.split(get_consonants_regex(), new_chunk_content)] 156 new_chunks = [x for x in new_chunks if len(x[1]) > 0] 157 new_word = [] 158 last_opos = -1 159 for j, (original_position, character) in enumerate(new_chunks): 160 part = "" 161 if j == len(new_chunks) - 1: 162 # don't miss final spaces 163 part = self.original[last_opos + 1:] 164 elif last_opos < original_position: 165 part = self.original[last_opos + 1:original_position + 1] 166 last_opos = original_position 167 # allow or forbid elision because of possible ending '-e' before 168 # forbid hiatus both for this and for preceding 169 # instruct that we must use text for the pronunciation 170 new_chunk = Chunk(part, self.verse) 171 new_chunk.original = part 172 new_chunk.text = character 173 new_chunk.text_pron = True 174 new_chunk.elision = [False, True] 175 new_chunk.no_hiatus = True 176 new_word.append(new_chunk) 177 # propagate information from splithyph 178 new_word[-1].hemistiche = self.hemistiche 179 return new_word 180 181 def check_elidable(self): 182 if self.text == 'e': 183 self.elidable = [True] 184 185 def is_consonants(self): 186 return is_consonants(self.text) 187 188 def ends_with_apostrophe(self): 189 return re.search("[" + APOSTROPHES + "]$", self.original) is not None 190 191 def elide_vowel_problems(self, chunk_group): 192 if self.elision is None: 193 self.elision_wrap(chunk_group) 194 195 def process_y_cases(self, previous_chunk, next_chunk): 196 new_word_from_chunk = [] 197 if 'y' not in self.text or len(self.text) == 1 or self.text.startswith("y"): 198 new_word_from_chunk.append(self) 199 else: 200 if previous_chunk is not None and next_chunk is not None: 201 # special cases of "pays", "alcoyle", "abbayes" 202 c_text = self.text 203 p_text = previous_chunk.text 204 n_text = next_chunk.text 205 # TODO Should you force if this condition does not apply? 206 if ((c_text == "ay" and p_text.endswith("p") and n_text.startswith("s")) 207 or 208 (c_text == "oy" and p_text.endswith("lc") 209 and n_text.startswith("l")) 210 or 211 (c_text == "aye" and p_text.endswith("bb") 212 and n_text.startswith("s"))): 213 # force weight 214 self.weights = [2] 215 new_word_from_chunk.append(self) 216 return new_word_from_chunk 217 must_force = next_chunk is None and previous_chunk is not None and \ 218 (self.text == "aye" and previous_chunk.text.endswith("bb")) 219 if must_force: 220 # force weight 221 self.weights = [2] 222 new_word_from_chunk.append(self) 223 else: 224 sub_chunks = re.split(re.compile("(y+)"), self.text) 225 sub_chunks = [x for x in sub_chunks if len(x) > 0] 226 for j, sub_chunk in enumerate(sub_chunks): 227 lindex = int(j * len(self.original) / len(sub_chunks)) 228 rindex = int((j + 1) * len(self.original) / len(sub_chunks)) 229 part = self.original[lindex:rindex] 230 new_subchunk_text = 'Y' if 'y' in sub_chunk else sub_chunk 231 new_subchunk = self.copy() 232 new_subchunk.original = part 233 new_subchunk.text = new_subchunk_text 234 new_word_from_chunk.append(new_subchunk) 235 return new_word_from_chunk 236 237 def is_vowels(self): 238 return is_vowels(self.text) 239 240 def is_dash_elidable(self): 241 # "fais-le" not elidable, but "suis-je" and "est-ce" is 242 return not ('-' in self.text and not self.text.endswith('-j') and not self.text.endswith('-c')) 243 244 def check_elidable_with_next(self, next_chunk): 245 if self.elidable is None: 246 self.elidable = next_chunk.elision 247 248 def is_potentially_ambiguous_hiatus(self): 249 return self.text in ["ie", "ée", "ue"] 250 251 def ends_with_potentially_ambiguous_hiatus(self): 252 return len(self.text) >= 2 and self.text[-2:] in ["ie", "ée", "ue"] 253 254 def check_potentially_ambiguous_plural(self, previous_chunk): 255 if self.text == "s": 256 if previous_chunk.is_potentially_ambiguous_hiatus(): 257 previous_chunk.error = "ambiguous" 258 self.error = "ambiguous" 259 260 def check_potentially_ambiguous_with_elision(self, next_chunk): 261 if self.ends_with_potentially_ambiguous_hiatus(): 262 if next_chunk.elision is not None or True not in next_chunk.elision: 263 self.error = "ambiguous" 264 next_chunk.error = "ambiguous" 265 266 def check_hiatus(self, previous_chunk, next_chunk, only_two_parts): 267 if previous_chunk is not None: 268 self.check_potentially_ambiguous_plural(previous_chunk) 269 if self.ends_with_potentially_ambiguous_hiatus(): 270 if not any(next_chunk.elision or [False]): 271 self.error = "ambiguous" 272 next_chunk.error = "ambiguous" 273 274 # elision concerns words ending with a vowel without a mute 'e' 275 # that have not been marked "no_hiatus" 276 # it also concerns specifically "et" 277 elif (not self.text.endswith('e') and self.no_hiatus is None 278 and (self.is_vowels() or self.text == 'Y') 279 or (only_two_parts and previous_chunk.text == 'e' and self.text == 't')): 280 # it happens if the next word is not marked no_hiatus 281 # and starts with something that causes elision 282 if next_chunk.causes_hiatus and next_chunk.no_hiatus is None: 283 self.error = "hiatus" 284 next_chunk.error = "hiatus" 285 286 def make_word_end(self): 287 self.word_end = True 288 289 def contains_break(self): 290 return '-' in self.text \ 291 or self.word_end or False \ 292 or self.had_hyphen or False 293 294 def is_e(self): 295 return self.text == "e" 296 297 def possible_weights_approx(self): 298 """Return the possible number of syllabes taken by a vowel chunk (permissive approximation)""" 299 chunk_text = self.text 300 if len(chunk_text) == 1: 301 return [1] 302 # old spelling and weird exceptions 303 if chunk_text in ['ouï']: 304 return [1, 2] # TODO unsure about that 305 if chunk_text in ['eüi', 'aoû', 'uë']: 306 return [1] 307 if chunk_text in ['aïe', 'oë', 'ouü']: 308 return [1, 2] 309 if contains_trema(chunk_text): 310 return [2] 311 chunk_text = strip_accents(chunk_text, True) 312 if chunk_text in ['ai', 'ou', 'eu', 'ei', 'eau', 'eoi', 'eui', 'au', 'oi', 313 'oie', 'œi', 'œu', 'eaie', 'aie', 'oei', 'oeu', 'ea', 'ae', 'eo', 314 'eoie', 'oe', 'eai', 'eue', 'aa', 'oo', 'ee', 'ii', 'aii', 315 'yeu', 'ye', 'you']: 316 return [1] 317 if chunk_text == "oua": 318 return [1, 2] # "pouah" 319 if chunk_text == "ao": 320 return [1, 2] # "paon" 321 for x in ['oa', 'ea', 'eua', 'euo', 'ua', 'uo', 'yau']: 322 if x in chunk_text: 323 return [2] 324 # beware of "déesse" 325 if chunk_text == 'ée': 326 return [1, 2] 327 if chunk_text[0] == 'i': 328 return [1, 2] 329 if chunk_text[0] == 'u' and (strip_accents(chunk_text[1]) in ['i', 'e']): 330 return [1, 2] 331 if chunk_text[0] == 'o' and chunk_text[1] == 'u' and len(chunk_text) >= 3 and\ 332 strip_accents(chunk_text[2]) in ['i', 'e']: 333 return [1, 2] 334 if 'é' in chunk_text or 'è' in chunk_text: 335 return [2] 336 # we can't tell 337 return [1, 2] 338 339 def clear(self): 340 if self.word_end is None or not self.word_end: 341 return self.text 342 return self.text + ' ' 343 344 def set_possible_weights_from_context(self, chunks_before, chunks_after, template, threshold): 345 if self.weights is not None: 346 return 347 if len(chunks_after) > 0: 348 next_chunk = chunks_after[0] 349 else: 350 next_chunk = None 351 352 if len(chunks_before) > 0: 353 previous_chunk = chunks_before[-1] 354 else: 355 previous_chunk = None 356 357 if len(chunks_before) > 1: 358 previous_previous_chunk = chunks_before[-2] 359 else: 360 previous_previous_chunk = None 361 362 if ((len(chunks_after) <= 1 and self.is_e()) 363 and not (next_chunk is not None and next_chunk.is_vowels()) 364 and not (previous_chunk is None or previous_chunk.contains_break()) 365 and not (previous_previous_chunk is None or previous_previous_chunk.contains_break())): 366 # special case for verse endings, which can get elided (or not) 367 # but we don't elide lone syllables ("prends-le", etc.) 368 369 if next_chunk is None: 370 self.weights = [0] # ending 'e' is elided 371 elif next_chunk.text == 's': 372 self.weights = [0] # ending 'es' is elided 373 elif next_chunk.text == 'nt': 374 # ending 'ent' is sometimes elided, try to use pronunciation 375 # actually, this will have an influence on the rhyme's gender 376 # see feminine 377 possible = [] 378 if not self.verse.phon or len(self.verse.phon) == 0: 379 self.weights = [0, 1] # do something reasonable without pron 380 else: 381 for possible_phon in self.verse.phon: 382 if possible_phon.endswith(')') or possible_phon.endswith('#'): 383 possible.append(1) 384 else: 385 possible.append(0) 386 self.weights = possible 387 else: 388 self.weights = self.possible_weights(chunks_before, chunks_after, template, threshold) 389 elif (next_chunk is None and self.text == 'e' and 390 previous_chunk is not None and (previous_chunk.text.endswith('-c') 391 or previous_chunk.text.endswith('-j') 392 or (previous_chunk.text == 'c' 393 and previous_chunk.had_hyphen is not None) 394 or (previous_chunk.text == 'j' 395 and previous_chunk.had_hyphen is not None))): 396 self.weights = [0] # -ce and -je are elided 397 elif next_chunk is None and self.text in ['ie', 'ée']: 398 self.weights = [1] 399 # elide "-ée" and "-ées", but be specific (beware of e.g. "réel") 400 elif (len(chunks_after) <= 1 401 and self.text == 'ée' 402 and (next_chunk is None or chunks_after[-1].text == 's')): 403 self.weights = [1] 404 elif self.elidable is not None: 405 self.weights = [int(not x) for x in self.elidable] 406 else: 407 self.weights = self.possible_weights(chunks_before, chunks_after, template, threshold) 408 409 def possible_weights(self, chunks_before, chunks_after, template, threshold): 410 if template.options['diaeresis'] == "classical": 411 return self.possible_weights_ctx(chunks_before, chunks_after, threshold=threshold) 412 elif template.options['diaeresis'] == "permissive": 413 return self.possible_weights_approx() 414 415 def possible_weights_ctx(self, chunks_before, chunks_after, threshold=None): 416 if not threshold: 417 threshold = DEFAULT_THRESHOLD 418 q = self.make_query(chunks_before, chunks_after) 419 v = diaeresis.diaeresis_finder.lookup(q) 420 if len(v.keys()) == 1 and v[list(v.keys())[0]] > threshold: 421 return [int(list(v.keys())[0])] 422 else: 423 return self.possible_weights_seed() 424 425 def make_query(self, chunks_before, chunks_after): 426 cleaned_before = [chunk.clear() for chunk in chunks_before] 427 cleaned_after = [chunk.clear() for chunk in chunks_after] 428 current_clear = self.clear() 429 if current_clear.endswith(' '): 430 current_clear = current_clear.rstrip() 431 if len(cleaned_after) > 0: 432 cleaned_after[0] = " " + cleaned_after[0] 433 else: 434 cleaned_after.append(' ') 435 ret2 = intersperse( 436 ''.join(cleaned_after), 437 ''.join([x[::-1] for x in cleaned_before[::-1]])) 438 ret = [current_clear] + ret2 439 return ret 440 441 def possible_weights_seed(self): 442 """Return the possible number of syllabes taken by a vowel chunk""" 443 if len(self.text) == 1: 444 return [1] 445 # dioïde, maoïste, taoïste 446 if (self.text[-1] == 'ï' and len(self.text) >= 3 and not 447 self.text[-3:-1] == 'ou'): 448 return [3] 449 # ostéoarthrite 450 if "éoa" in self.text: 451 return [3] 452 # antiaérien; but let's play it safe 453 if "iaé" in self.text: 454 return [2, 3] 455 # giaour, miaou, niaouli 456 if "iaou" in self.text: 457 return [2, 3] 458 # bioélectrique 459 if "ioé" in self.text: 460 return [2, 3] 461 # méiose, nucléion, etc. 462 if "éio" in self.text: 463 return [2, 3] 464 # radioactif, radioamateur, etc. 465 if "ioa" in self.text: 466 return [2, 3] 467 # pléiade 468 if "éio" in self.text: 469 return [2, 3] 470 # pompéien, tarpéien... 471 # in theory the "-ie" should give a diaeresis, so 3 syllabes 472 # let's keep the benefit of the doubt... 473 # => this also gives 3 as a possibility for "obéie"... 474 if "éie" in self.text: 475 return [2, 3] 476 # tolstoïen 477 # same remark 478 if "oïe" in self.text: 479 return [2, 3] 480 # shanghaïen (diaeresis?), but also "aië" 481 if "aïe" in self.text: 482 return [1, 2, 3] 483 if self.text in ['ai', 'ou', 'eu', 'ei', 'eau', 'au', 'oi']: 484 return [1] 485 # we can't tell 486 return [1, 2] 487 488 def set_hemistiche_from_context(self, previous_previous_chunk, previous_chunk, next_chunk): 489 if self.hemistiche is not None: 490 return 491 ending = self.text 492 if not (self.word_end or False) and next_chunk is not None: 493 if not (next_chunk.word_end or False): 494 self.hemistiche = "cut" 495 return 496 ending += next_chunk.text 497 if ending in SURE_END_FEM and previous_previous_chunk is not None and previous_chunk is not None: 498 # check that this isn't a one-syllabe wourd (which is allowed) 499 ok = False 500 try: 501 if '-' in previous_chunk.original or (previous_chunk.word_end or False): 502 ok = True 503 if '-' in previous_previous_chunk.original or (previous_previous_chunk.word_end or False): 504 ok = True 505 except IndexError: 506 pass 507 if not ok: 508 # hemistiche ends in feminine 509 if any(self.elidable or [False]): 510 self.hemistiche = "elid" # elidable final -e, but only OK if actually elided 511 return 512 else: 513 self.hemistiche = "fem" 514 return 515 self.hemistiche = "ok" 516 517 def normalize(self): 518 if self.text_pron is None: 519 return normalize(self.original, strip=False, rm_apostrophe_end=False) 520 else: 521 return self.text 522 523 def get_original_text(self): 524 return self.original 525 526 def get_errors_set(self, forbidden_ok, hiatus_ok): 527 errors_chunk = set() 528 if self.error is not None: 529 if self.error == "ambiguous" and not forbidden_ok: 530 errors_chunk.add(error.ErrorForbiddenPattern) 531 if self.error == "hiatus" and not hiatus_ok: 532 errors_chunk.add(error.ErrorHiatus) 533 if self.error == "illegal": 534 errors_chunk.add(error.ErrorBadCharacters) 535 return errors_chunk 536 537 def is_masculine(self): 538 return (self.had_hyphen or False) or (self.word_end or False) 539 540 def render(self, key, fmt="text"): 541 if key == 'error' and self.error == 'illegal': 542 if fmt == "text": 543 return self.illegal_str 544 elif fmt == "json": 545 # don't know how to represent the specific characters 546 # cleanly in JSON 547 return "illegal_characters" 548 else: 549 raise ValueError("bad format") 550 if key == 'original': 551 return str(self.original) 552 elif key == 'weights': 553 if fmt == "text": 554 return '-'.join([str(a) for a in self.weights or []]) 555 elif fmt == "json": 556 if self.weights is None: 557 return None 558 return [a for a in self.weights or []] 559 else: 560 raise ValueError("bad format") 561 elif key == 'error': 562 if fmt == "text": 563 return ErrorCollection.keys.get(self.error, '') * len(self.original) 564 elif fmt == "json": 565 return self.error or None 566 else: 567 raise ValueError("bad format") 568 elif key == 'hemis': 569 if fmt == "text": 570 return str(self.hemistiche or "") 571 elif fmt == "json": 572 return self.hemistiche or None 573 else: 574 raise ValueError("bad format") 575 else: 576 print(key, file=sys.stderr) 577 assert False 578 579 def get_normalized_rendering(self, key, keys, fmt="text"): 580 if fmt == "text": 581 return ('{:^' + str(self.get_max_render_size(keys)) + '}').format(self.render(key)) 582 elif fmt == "json": 583 return self.render(key, fmt=fmt) 584 else: 585 raise ValueError("bad format") 586 587 def get_min_weight(self): 588 return min(self.weights or [0]) 589 590 def get_max_weight(self): 591 return max(self.weights or [0]) 592 593 def get_max_render_size(self, keys): 594 return max(len(self.render(key)) for key in keys) 595 596 def print_query(self, chunks_after, chunks_before, output_file): 597 if (self.weights is not None and len(self.weights) > 1 598 and self.weight is not None and self.weight > 0): 599 print(str(self.weight) + ' ' + 600 ' '.join(self.make_query(chunks_before, chunks_after)), file=output_file) 601 602 # set self.elision and self.causes_hiatus 603 def elision_wrap(self, chunk_group): 604 first_letter = common.remove_punctuation(chunk_group[0].original.strip()) 605 word = ''.join(chunk.text for chunk in chunk_group) 606 original_word = ''.join(chunk.original for chunk in chunk_group) 607 self.elision = elision(word, 608 original_word, 609 first_letter == first_letter.upper()) 610 611 self.causes_hiatus = False 612 if is_vowels(word[0]): 613 # "oui, oui" often occurs 614 if word not in ["oui", "ouis"]: 615 self.causes_hiatus = True 616 elif word[0] == 'h': 617 result = list(map((lambda s: not s), 618 haspirater.lookup(normalize(original_word, 619 rm_all_begin=True)))) 620 if len(result) == 1 and True in result: 621 self.causes_hiatus = True 622 623 624 625 LETTERS = { 626 'f': 'effe', 627 'h': 'ache', 628 'j': 'gi', 629 'k': 'ka', 630 'l': 'elle', 631 'm': 'aime', 632 'n': 'aine', 633 'q': 'cu', 634 'r': 'ère', 635 's': 'esse', 636 'w': 'doublevé', 637 'x': 'ixe', 638 'z': 'zaide' 639 } 640 641 642 643 def elision(word, original_word, was_cap): 644 if word.startswith('y'): 645 if word == 'y': 646 return [True] 647 if was_cap: 648 if word == 'york': 649 return [False] 650 # Grevisse, Le Bon usage, 14th ed., paragraphs 49-50 651 # depends on whether it's French or foreign... 652 return [True, False] 653 else: 654 exc = ["york", "yeux", "yeuse", "ypérite"] 655 for w in exc: 656 if word.startswith(w): 657 return [True] 658 # otherwise, no elision 659 return [False] 660 if word in ["oui", "ouis"]: 661 # elision for those words, but beware, no elision for "ouighour" 662 # boileau : "Ont l'esprit mieux tourné que n'a l'homme ? Oui sans doute." 663 # so elision sometimes 664 return [True, False] 665 if word.startswith("ouistiti") or word.startswith("ouagadougou"): 666 return [False] 667 # "un", "une" are non-elided as nouns ("cette une") 668 if word in ["un", "une"]: 669 return [True, False] 670 # "onze" is not elided 671 if word == "onze": 672 return [False] 673 if word.startswith('ulul'): 674 return [False] # ululement, ululer, etc. 675 if word.startswith('uhlan'): 676 return [False] # uhlan 677 if word[0] == 'h': 678 if word == "huis": 679 # special case, "huis" is elided but "huis clos" isn't 680 return [True, False] 681 # look up in haspirater using the original (but normalized) word 682 return list(map((lambda s: not s), 683 haspirater.lookup(normalize(original_word, 684 rm_all_begin=True)))) 685 if is_vowels(word[0]): 686 return [True] 687 return [False]