chunks.py
import re
import sys
from pprint import pprint

from plint import common
from plint.chunk import Chunk
from plint.common import normalize, get_consonants_regex, SURE_END_FEM, strip_accents
from plint.hyphen_splitter import HyphenSplitter


class Chunks:
    """A verse split into Chunk objects (grouped by word) on which the
    metrical analysis is performed."""

    def __init__(self, verse):
        # TODO Find a way to remove this dependency
        self.verse = verse
        self.chunks = []
        self.create_chunks()
        self.separated_chunks = []

    def create_chunks(self):
        self.initialize_chunks()
        self.collapse_apostrophes()
        self.check_forbidden_characters()
        self.simplify_gu_qu()
        self.elide_inside_words()
        self.remove_leading_and_trailing_crap()
        self.collapse_empty_chunks_from_simplifications()
        self.create_acronym()
        self.elide_vowel_problems()
        self.process_y_cases()
        self.annotate_final_mute_e()
        self.annotate_hiatus()
        self.annotate_word_ends()
        self.merge_chunks_words()
        self.print_new_line_if_changed()

    def print_new_line_if_changed(self):
        # debugging aid: report on stderr when preprocessing altered the line
        now_line = ''.join(chunk.original for chunk in self.chunks)
        if now_line != self.verse.input_line:
            print("%s became %s" % (self.verse.input_line, now_line), file=sys.stderr)
            pprint(self.chunks, stream=sys.stderr)

    def merge_chunks_words(self):
        self.chunks = sum(self.separated_chunks, [])

    def annotate_word_ends(self):
        for chunk_group in self.separated_chunks[:-1]:
            chunk_group[-1].make_word_end()

    def annotate_hiatus(self):
        for i, chunk_group in enumerate(self.separated_chunks[:-1]):
            last_chunk = chunk_group[-1]
            next_chunk = self.separated_chunks[i + 1][0]
            if len(chunk_group) >= 2:
                previous_last_chunk = chunk_group[-2]
            else:
                previous_last_chunk = None
            only_two_parts = len(chunk_group) == 2
            last_chunk.check_hiatus(previous_last_chunk, next_chunk, only_two_parts)

    def annotate_final_mute_e(self):
        for i, chunk_group in enumerate(self.separated_chunks[:-1]):
            if chunk_group[-1].is_e():
                n_weight = 0
                for chunk in chunk_group[::-1]:
                    if chunk.is_vowels():
                        n_weight += 1
                    if not chunk.is_dash_elidable():
                        break
                if n_weight == 1:
                    continue
                next_group_first_chunk = self.separated_chunks[i + 1][0]
                chunk_group[-1].check_elidable_with_next(next_group_first_chunk)

    def process_y_cases(self):
        for i, chunk_group in enumerate(self.separated_chunks):
            new_word = []
            for j, chunk in enumerate(chunk_group):
                if j != 0:
                    previous_chunk = chunk_group[j - 1]
                else:
                    previous_chunk = None
                if j != len(chunk_group) - 1:
                    next_chunk = chunk_group[j + 1]
                else:
                    next_chunk = None
                new_word_from_chunk = chunk.process_y_cases(previous_chunk, next_chunk)
                new_word += new_word_from_chunk
            self.separated_chunks[i] = new_word

    def elide_vowel_problems(self):
        for chunk_group in self.separated_chunks:
            chunk_group[0].elide_vowel_problems(chunk_group)

    def collapse_apostrophes(self):
        future_chunks = []
        acc = []
        for chunk_group in self.separated_chunks:
            if chunk_group[-1].ends_with_apostrophe():
                acc += chunk_group
            else:
                future_chunks.append(acc + chunk_group)
                acc = []
        if acc:
            future_chunks.append(acc)
        self.separated_chunks = future_chunks

    def create_acronym(self):
        for i, chunk_group in enumerate(self.separated_chunks):
            if len(chunk_group) == 1:
                first_chunk = chunk_group[0]
                if first_chunk.is_consonants():
                    new_word = first_chunk.create_acronym()
                    self.separated_chunks[i] = new_word
                    self.separated_chunks[i][-1].check_elidable()

    def collapse_empty_chunks_from_simplifications(self):
        for i, chunk_group in enumerate(self.separated_chunks):
            new_chunks = []
            for chunk in chunk_group:
                if not chunk.is_empty():
                    new_chunks.append(chunk)
                else:
                    # propagate the original text
                    # newly empty chunks cannot be the first ones
                    new_chunks[-1].add_original(chunk)
            self.separated_chunks[i] = new_chunks

    def remove_leading_and_trailing_crap(self):
        for chunk_group in self.separated_chunks:
            for chunk in chunk_group:
                chunk.remove_leading_and_trailing_crap()

    def elide_inside_words(self):
        for chunk_group in self.separated_chunks:
            for i, chunk in enumerate(chunk_group[:-1]):
                all_next_chunks = chunk_group[i + 1:]
                chunk.elide_inside_words(all_next_chunks)

    def simplify_gu_qu(self):
        for chunk_group in self.separated_chunks:
            if len(chunk_group) >= 2:
                for i, chunk in enumerate(chunk_group[:-1]):
                    next_chunk = chunk_group[i + 1]
                    chunk.simplify_gu_qu(next_chunk)

    def check_forbidden_characters(self):
        for chunk_group in self.separated_chunks:
            for chunk in chunk_group:
                chunk.check_forbidden_characters()

    def initialize_chunks(self):
        word_bi_tokens = self.get_word_tokens()
        pre_chunks = pre_process_bi_tokens(word_bi_tokens)
        self.separated_chunks = []
        for (is_end_word, pre_chunk) in pre_chunks:
            if len(pre_chunk) != 0:
                self.separated_chunks.append([Chunk(word, self.verse) for word in pre_chunk])
                if not is_end_word:
                    # word end is a fake word end
                    for chunk in self.separated_chunks[-1]:
                        chunk.set_hemistiche('cut')

    def get_word_tokens(self):
        words = self.split_input_line_by_whitespace()
        words = remove_trivial(words, is_empty_word)
        word_tokens = split_all_hyphen(words)
        return word_tokens

    def split_input_line_by_whitespace(self):
        whitespace_regexp = re.compile(r"(\s+)")
        words = re.split(whitespace_regexp, self.verse.input_line)
        return words

    def annotate(self, template, threshold):
        # annotate weights
        for i, chunk in enumerate(self.chunks):
            if not chunk.is_vowels():
                continue

            chunks_before = self.chunks[:i]
            chunks_after = self.chunks[i + 1:]
            # for the case of "pays" and related words
            chunk.set_possible_weights_from_context(chunks_before, chunks_after, template, threshold)

            next_chunk = self.chunks[i + 1] if i < len(self.chunks) - 1 else None
            previous_chunk = self.chunks[i - 1] if i > 0 else None
            previous_previous_chunk = self.chunks[i - 2] if i > 1 else None
            chunk.set_hemistiche_from_context(previous_previous_chunk, previous_chunk, next_chunk)
        return self.align2str()

    def align2str(self):
        return ''.join([x.text for x in self.chunks])

    def print_n_syllables(self, n_syllables, offset, output_file):
        count = 0
        for i, chunk in enumerate(self.chunks[::-1]):
            if chunk.weights is not None:
                if count < offset:
                    count += 1
                    continue
                pos = len(self.chunks) - i - 1
                considered_chunk = self.chunks[pos]
                chunks_before = self.chunks[:pos]
                chunks_after = self.chunks[pos + 1:]
                print(str(n_syllables) + ' ' + ' '.join(considered_chunk.make_query(chunks_before, chunks_after)),
                      file=output_file)
                break

    def normalized(self):
        return ''.join(chunk.normalize() for chunk in self.chunks).strip()

    def get_line(self):
        return ''.join(chunk.get_original_text() for chunk in self.chunks)

    def get_errors_set(self, forbidden_ok, hiatus_ok):
        errors = set()
        for chunk in self.chunks:
            errors_chunk = chunk.get_errors_set(forbidden_ok, hiatus_ok)
            errors = errors.union(errors_chunk)
        return errors

    def get_feminine(self, template, threshold, align=None):
        text = self.annotate(template, threshold)
        for a in SURE_END_FEM:
            if text.endswith(a):
                # if a vowel comes before, it must be feminine
                try:
                    if strip_accents(text[-len(a) - 1]) in common.VOWELS:
                        return ['F']
                except IndexError:
                    # too short
                    if text == "es":
                        return ['M']
                    else:
                        return ['F']
                # check that this isn't a one-syllable word that ends with "es"
                # => must be masculine as '-es' cannot be mute then
                # => except if there is another vowel before ("fées")
                if text.endswith("es") and (len(text) == 2 or strip_accents(text[-3]) not in common.VOWELS):
                    for i in range(4):
                        try:
                            if self.chunks[-i - 1].is_masculine():
                                return ['M']
                        except IndexError:
                            return ['M']
                return ['F']
        if not text.endswith('ent'):
            return ['M']
        # verse ends with 'ent'
        if align:
            if align[-2].weight == 0:
                return ['F']  # mute -ent
            if align[-2].weight > 0 and align[-2].text == 'e':
                return ['M']  # non-mute "-ent" by the choice of metric
        possible = []
        # now, we must check pronunciation:
        # "tient" vs. "lient" for instance, "excellent"...
        for possible_phon in self.verse.phon:
            if possible_phon.endswith(')') or possible_phon.endswith('#'):
                possible.append('M')
            else:
                possible.append('F')
            if possible_phon.endswith('E') and text.endswith('aient'):
                # imparfait and conditionnel are masculine...
                possible.append('M')
        return possible

    def fit(self, hemistiches, pos=0, count=0):
        # enumerate all admissible weight assignments for the chunks from
        # `pos` onwards, given the remaining hemistiche positions
        if count > self.verse.pattern.length:
            return []  # no possibilities
        if len(hemistiches) > 0 and hemistiches[0] < count:
            return []  # missed a hemistiche
        if pos == len(self.chunks):
            if count == self.verse.pattern.length:
                return [[]]  # the empty assignment is the only possibility
            else:
                return []
        chunk = self.chunks[pos]
        result = []
        for weight in (chunk.weights or [0]):
            next_hemistiches = hemistiches
            if (len(hemistiches) > 0 and count + weight == hemistiches[0]
                    and chunk.is_vowels()):
                # need to try to hemistiche
                if chunk.hemistiche == "ok" or (chunk.hemistiche == "elid" and weight == 0):
                    # we hemistiche here
                    next_hemistiches = next_hemistiches[1:]
            current = chunk.copy()
            if current.weights is not None:
                current.weight = weight
            for x in self.fit(next_hemistiches, pos + 1, count + weight):
                result.append([current] + x)
        return result

    def get_last_count(self):
        # minimum weight of the last word of the verse
        tot = 0
        for chunk in self.chunks[::-1]:
            if chunk.original.endswith(' ') or chunk.original.endswith('-'):
                if tot > 0:
                    break
            if chunk.weights is not None:
                tot += min(chunk.weights)
            if ' ' in chunk.original.rstrip() or '-' in chunk.original.rstrip():
                if tot > 0:
                    break
        return tot

    def align_from_keys(self, keys, fmt="text"):
        if fmt == "text":
            lines = {}
            for key in keys:
                lines[key] = ""
            for chunk in self.chunks:
                for key in keys:
                    lines[key] += chunk.get_normalized_rendering(
                        key, keys, fmt=fmt)
            if 'weights' in keys:
                bounds = self.get_weights_bounds()
                bounds = [str(x) for x in bounds]
                lines['weights'] += " (total: " + ('-'.join(bounds)
                                                   if bounds[1] != bounds[0] else bounds[0]) + ")"
")" 323 return ["> " + lines[key] for key in keys if len(lines[key].strip()) > 0] 324 elif fmt == "json": 325 ret = {'chunks': []} 326 for chunk in self.chunks: 327 d = {} 328 for key in keys: 329 v = chunk.get_normalized_rendering( 330 key, keys, fmt=fmt) 331 if v is not None: 332 d[key] = v 333 ret['chunks'].append(d) 334 bounds = self.get_weights_bounds() 335 ret['total_weight'] = { 336 'min': bounds[0], 337 'max': bounds[1]} 338 return ret 339 else: 340 raise ValueError("bad format") 341 342 343 def get_weights_bounds(self): 344 bounds = [0, 0] 345 for chunk in self.chunks: 346 bounds[0] += chunk.get_min_weight() 347 bounds[1] += chunk.get_max_weight() 348 return bounds 349 350 351 def remove_trivial(words, predicate): 352 new_chunks = [] 353 words_accumulation = "" 354 for i, chunk in enumerate(words): 355 if predicate(chunk): 356 if len(new_chunks) == 0: 357 words_accumulation = words_accumulation + chunk 358 else: 359 new_chunks[-1] = new_chunks[-1] + chunk 360 else: 361 new_chunks.append(words_accumulation + chunk) 362 words_accumulation = "" 363 return new_chunks 364 365 366 def split_all_hyphen(words): 367 return sum([HyphenSplitter().split(w) for w in words], []) 368 369 370 def is_empty_word(word): 371 return re.match(r"^\s*$", word) or len(normalize(word, rm_all=True)) == 0 372 373 374 def pre_process_bi_tokens(word_bi_tokens): 375 consonants_regexp = get_consonants_regex() 376 pre_chunks = [(b, re.split(consonants_regexp, word)) for (b, word) in word_bi_tokens] 377 pre_chunks = [(b, remove_trivial(x, is_empty_word)) for (b, x) in pre_chunks] 378 return pre_chunks