plint

French poetry validator (local mirror of https://gitlab.com/a3nm/plint)
git clone https://a3nm.net/git/plint/

chunks.py (14886B)


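"""Split a verse into chunks (groups of vowels and consonants) and annotate
them for metrical analysis: elisions, mute e's, hiatus and hemistiches."""
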
import re
import sys
from pprint import pprint

from plint import common
from plint.chunk import Chunk
from plint.common import normalize, get_consonants_regex, SURE_END_FEM, strip_accents
from plint.hyphen_splitter import HyphenSplitter


class Chunks:

    def __init__(self, verse):
        # TODO Find a way to remove this dependency
        self.verse = verse
        self.chunks = []
        self.create_chunks()
        # create_chunks() works on self.separated_chunks (one list of chunks
        # per word) and flattens the result into self.chunks, so the groups
        # are no longer needed afterwards
        self.separated_chunks = []
    def create_chunks(self):
        # pipeline of passes over self.separated_chunks (one list of chunks
        # per word); the last steps flatten the groups into self.chunks
        self.initialize_chunks()
        self.collapse_apostrophes()
        self.check_forbidden_characters()
        self.simplify_gu_qu()
        self.elide_inside_words()
        self.remove_leading_and_trailing_crap()
        self.collapse_empty_chunks_from_simplifications()
        self.create_acronym()
        self.elide_vowel_problems()
        self.process_y_cases()
        self.annotate_final_mute_e()
        self.annotate_hiatus()
        self.annotate_word_ends()
        self.merge_chunks_words()
        self.print_new_line_if_changed()
    def print_new_line_if_changed(self):
        # debugging aid: report on stderr when the pipeline altered the line
        now_line = ''.join(chunk.original for chunk in self.chunks)
        if now_line != self.verse.input_line:
            print("%s became %s" % (self.verse.input_line, now_line), file=sys.stderr)
            pprint(self.chunks, stream=sys.stderr)
    def merge_chunks_words(self):
        # flatten the per-word chunk groups into a single list of chunks
        self.chunks = sum(self.separated_chunks, [])
    def annotate_word_ends(self):
        # mark the last chunk of every word except the last one
        for chunk_group in self.separated_chunks[:-1]:
            chunk_group[-1].make_word_end()
    def annotate_hiatus(self):
        # check for hiatus between the last chunk of each word and the first
        # chunk of the word that follows it
        for i, chunk_group in enumerate(self.separated_chunks[:-1]):
            last_chunk = chunk_group[-1]
            next_chunk = self.separated_chunks[i + 1][0]
            if len(chunk_group) >= 2:
                previous_last_chunk = chunk_group[-2]
            else:
                previous_last_chunk = None
            only_two_parts = len(chunk_group) == 2
            last_chunk.check_hiatus(previous_last_chunk, next_chunk, only_two_parts)
    def annotate_final_mute_e(self):
        # mark word-final mute e's that may be elided with the next word
        for i, chunk_group in enumerate(self.separated_chunks[:-1]):
            if chunk_group[-1].is_e():
                # count the vowel chunks at the end of the word
                n_weight = 0
                for chunk in chunk_group[::-1]:
                    if chunk.is_vowels():
                        n_weight += 1
                    if not chunk.is_dash_elidable():
                        break
                if n_weight == 1:
                    # the 'e' is the only counted vowel: skip
                    continue
                next_group_first_chunk = self.separated_chunks[i + 1][0]
                chunk_group[-1].check_elidable_with_next(next_group_first_chunk)
    def process_y_cases(self):
        # rewrite chunks containing 'y', which may behave as a vowel or as a
        # consonant depending on its neighbours; a chunk may split in two
        for i, chunk_group in enumerate(self.separated_chunks):
            new_word = []
            for j, chunk in enumerate(chunk_group):
                previous_chunk = chunk_group[j - 1] if j != 0 else None
                next_chunk = chunk_group[j + 1] if j != len(chunk_group) - 1 else None
                new_word += chunk.process_y_cases(previous_chunk, next_chunk)
            self.separated_chunks[i] = new_word
    def elide_vowel_problems(self):
        for chunk_group in self.separated_chunks:
            chunk_group[0].elide_vowel_problems(chunk_group)
    def collapse_apostrophes(self):
        # merge a word ending with an apostrophe (an elision such as "l'" or
        # "qu'") with the word that follows it
        future_chunks = []
        acc = []
        for chunk_group in self.separated_chunks:
            if chunk_group[-1].ends_with_apostrophe():
                acc += chunk_group
            else:
                future_chunks.append(acc + chunk_group)
                acc = []
        if acc:
            future_chunks.append(acc)
        self.separated_chunks = future_chunks
    def create_acronym(self):
        # a single word made only of consonants is treated as an acronym
        for i, chunk_group in enumerate(self.separated_chunks):
            if len(chunk_group) == 1:
                first_chunk = chunk_group[0]
                if first_chunk.is_consonants():
                    new_word = first_chunk.create_acronym()
                    self.separated_chunks[i] = new_word
                    self.separated_chunks[i][-1].check_elidable()
    def collapse_empty_chunks_from_simplifications(self):
        for i, chunk_group in enumerate(self.separated_chunks):
            new_chunks = []
            for chunk in chunk_group:
                if not chunk.is_empty():
                    new_chunks.append(chunk)
                else:
                    # propagate the original text
                    # newly empty chunks cannot be the first ones
                    new_chunks[-1].add_original(chunk)
            self.separated_chunks[i] = new_chunks
    def remove_leading_and_trailing_crap(self):
        for chunk_group in self.separated_chunks:
            for chunk in chunk_group:
                chunk.remove_leading_and_trailing_crap()
    def elide_inside_words(self):
        for chunk_group in self.separated_chunks:
            for i, chunk in enumerate(chunk_group[:-1]):
                all_next_chunks = chunk_group[i + 1:]
                chunk.elide_inside_words(all_next_chunks)
    def simplify_gu_qu(self):
        # in "gu" and "qu" groups the 'u' is usually silent: let each chunk
        # simplify itself with the chunk that follows it
        for chunk_group in self.separated_chunks:
            if len(chunk_group) >= 2:
                for i, chunk in enumerate(chunk_group[:-1]):
                    next_chunk = chunk_group[i + 1]
                    chunk.simplify_gu_qu(next_chunk)
    def check_forbidden_characters(self):
        for chunk_group in self.separated_chunks:
            for chunk in chunk_group:
                chunk.check_forbidden_characters()
    def initialize_chunks(self):
        word_bi_tokens = self.get_word_tokens()
        pre_chunks = pre_process_bi_tokens(word_bi_tokens)
        self.separated_chunks = []
        for (is_end_word, pre_chunk) in pre_chunks:
            if len(pre_chunk) != 0:
                self.separated_chunks.append([Chunk(word, self.verse) for word in pre_chunk])
                if not is_end_word:
                    # this word end is a fake one (from a hyphen split)
                    for chunk in self.separated_chunks[-1]:
                        chunk.set_hemistiche('cut')
    def get_word_tokens(self):
        words = self.split_input_line_by_whitespace()
        words = remove_trivial(words, is_empty_word)
        word_tokens = split_all_hyphen(words)
        return word_tokens

    def split_input_line_by_whitespace(self):
        # split on whitespace, keeping the separators as tokens
        whitespace_regexp = re.compile(r"(\s+)")
        words = re.split(whitespace_regexp, self.verse.input_line)
        return words
    def annotate(self, template, threshold):
        # annotate weights
        for i, chunk in enumerate(self.chunks):
            if not chunk.is_vowels():
                continue

            chunks_before = self.chunks[:i]
            chunks_after = self.chunks[i + 1:]
            # for the case of "pays" and related words
            chunk.set_possible_weights_from_context(chunks_before, chunks_after, template, threshold)

            next_chunk = self.chunks[i + 1] if i < len(self.chunks) - 1 else None
            previous_chunk = self.chunks[i - 1] if i > 0 else None
            previous_previous_chunk = self.chunks[i - 2] if i > 1 else None
            chunk.set_hemistiche_from_context(previous_previous_chunk, previous_chunk, next_chunk)
        return self.align2str()
    def align2str(self):
        return ''.join(x.text for x in self.chunks)
    def print_n_syllables(self, n_syllables, offset, output_file):
        # walk the chunks from the end of the line, skip `offset` weighted
        # chunks, and print a query for the next weighted chunk
        count = 0
        for i, chunk in enumerate(self.chunks[::-1]):
            if chunk.weights is not None:
                if count < offset:
                    count += 1
                    continue
                pos = len(self.chunks) - i - 1
                considered_chunk = self.chunks[pos]
                chunks_before = self.chunks[:pos]
                chunks_after = self.chunks[pos + 1:]
                print(str(n_syllables) + ' ' + ' '.join(considered_chunk.make_query(chunks_before, chunks_after)),
                      file=output_file)
                break
    def normalized(self):
        return ''.join(chunk.normalize() for chunk in self.chunks).strip()

    def get_line(self):
        return ''.join(chunk.get_original_text() for chunk in self.chunks)
    def get_errors_set(self, forbidden_ok, hiatus_ok):
        errors = set()
        for chunk in self.chunks:
            errors_chunk = chunk.get_errors_set(forbidden_ok, hiatus_ok)
            errors = errors.union(errors_chunk)
        return errors
    def get_feminine(self, template, threshold, align=None):
        text = self.annotate(template, threshold)
        for a in SURE_END_FEM:
            if text.endswith(a):
                # if a vowel comes before, it must be feminine
                try:
                    if strip_accents(text[-len(a) - 1]) in common.VOWELS:
                        return ['F']
                except IndexError:
                    # too short
                    if text == "es":
                        return ['M']
                    else:
                        return ['F']
                # check that this isn't a one-syllable word ending in "es"
                # => it must be masculine, as '-es' cannot be mute then
                # => except if there is another vowel before ("fées")
                if text.endswith("es") and (len(text) == 2 or strip_accents(text[-3]) not in common.VOWELS):
                    for i in range(4):
                        try:
                            if self.chunks[-i - 1].is_masculine():
                                return ['M']
                        except IndexError:
                            return ['M']
                return ['F']
        if not text.endswith('ent'):
            return ['M']
        # the verse ends with 'ent'
        if align:
            if align[-2].weight == 0:
                return ['F']  # mute -ent
            if align[-2].weight > 0 and align[-2].text == 'e':
                return ['M']  # non-mute "-ent" by the choice of metric
        possible = []
        # now we must check the pronunciation:
        # "tient" vs. "lient" for instance, or "excellent"...
        for possible_phon in self.verse.phon:
            if possible_phon.endswith(')') or possible_phon.endswith('#'):
                possible.append('M')
            else:
                possible.append('F')
                if possible_phon.endswith('E') and text.endswith('aient'):
                    # imparfait and conditionnel endings are masculine
                    possible.append('M')
        return possible
    def fit(self, hemistiches, pos=0, count=0):
        # recursively enumerate all ways of assigning a weight to each chunk
        # so that the total matches the pattern length and the hemistiches
        # fall at the requested syllable counts
        if count > self.verse.pattern.length:
            return []  # no possibilities
        if len(hemistiches) > 0 and hemistiches[0] < count:
            return []  # missed a hemistiche
        if pos == len(self.chunks):
            if count == self.verse.pattern.length:
                return [[]]  # the empty list is the only possibility
            else:
                return []
        chunk = self.chunks[pos]
        result = []
        for weight in (chunk.weights or [0]):
            next_hemistiches = hemistiches
            if (len(hemistiches) > 0 and count + weight == hemistiches[0] and
                    chunk.is_vowels()):
                # we must try to place a hemistiche here
                if chunk.hemistiche == "ok" or (chunk.hemistiche == "elid" and weight == 0):
                    # we hemistiche here
                    next_hemistiches = next_hemistiches[1:]
            current = chunk.copy()
            if current.weights is not None:
                current.weight = weight
            for x in self.fit(next_hemistiches, pos + 1, count + weight):
                result.append([current] + x)
        return result
    def get_last_count(self):
        # minimal weight of the last word of the line, stopping at the first
        # space or hyphen boundary once something has been counted
        tot = 0
        for chunk in self.chunks[::-1]:
            if chunk.original.endswith(' ') or chunk.original.endswith('-'):
                if tot > 0:
                    break
            if chunk.weights is not None:
                tot += min(chunk.weights)
            if ' ' in chunk.original.rstrip() or '-' in chunk.original.rstrip():
                if tot > 0:
                    break
        return tot
    def align_from_keys(self, keys, fmt="text"):
        if fmt == "text":
            lines = {}
            for key in keys:
                lines[key] = ""
            for chunk in self.chunks:
                for key in keys:
                    lines[key] += chunk.get_normalized_rendering(
                            key, keys, fmt=fmt)
            if 'weights' in keys:
                bounds = self.get_weights_bounds()
                bounds = [str(x) for x in bounds]
                lines['weights'] += " (total: " + ('-'.join(bounds)
                                                   if bounds[1] != bounds[0] else bounds[0]) + ")"
            return ["> " + lines[key] for key in keys if len(lines[key].strip()) > 0]
        elif fmt == "json":
            ret = {'chunks': []}
            for chunk in self.chunks:
                d = {}
                for key in keys:
                    v = chunk.get_normalized_rendering(
                        key, keys, fmt=fmt)
                    if v is not None:
                        d[key] = v
                ret['chunks'].append(d)
            bounds = self.get_weights_bounds()
            ret['total_weight'] = {
                    'min': bounds[0],
                    'max': bounds[1]}
            return ret
        else:
            raise ValueError("bad format")
    def get_weights_bounds(self):
        # sum the per-chunk minimal and maximal weights over the whole line
        bounds = [0, 0]
        for chunk in self.chunks:
            bounds[0] += chunk.get_min_weight()
            bounds[1] += chunk.get_max_weight()
        return bounds


def remove_trivial(words, predicate):
    # drop tokens matched by the predicate, gluing their text onto the
    # previous kept token (or accumulating it into the next kept token
    # when nothing has been kept yet)
    new_chunks = []
    words_accumulation = ""
    for chunk in words:
        if predicate(chunk):
            if len(new_chunks) == 0:
                words_accumulation = words_accumulation + chunk
            else:
                new_chunks[-1] = new_chunks[-1] + chunk
        else:
            new_chunks.append(words_accumulation + chunk)
            words_accumulation = ""
    return new_chunks
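
# For instance, on the whitespace-split tokens of " le chat", the empty and
# blank tokens are folded into their neighbours:
#     remove_trivial(["", " ", "le", " ", "chat"], is_empty_word)
#     => [" le ", "chat"]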


def split_all_hyphen(words):
    # split every word on hyphens and flatten the result
    return sum([HyphenSplitter().split(w) for w in words], [])


def is_empty_word(word):
    return re.match(r"^\s*$", word) or len(normalize(word, rm_all=True)) == 0


def pre_process_bi_tokens(word_bi_tokens):
    # split each (is_end_word, word) bi-token on consonant groups and fold
    # empty fragments into their neighbours
    consonants_regexp = get_consonants_regex()
    pre_chunks = [(b, re.split(consonants_regexp, word)) for (b, word) in word_bi_tokens]
    pre_chunks = [(b, remove_trivial(x, is_empty_word)) for (b, x) in pre_chunks]
    return pre_chunks
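

A minimal usage sketch (not part of the file): the chunking passes above only
read verse.input_line, while verse.phon and verse.pattern are consulted by
get_feminine() and fit(). VerseStub below is a hypothetical stand-in for
plint's real Verse class and may lack attributes that some Chunk methods
expect, so treat it as illustrative only.

    from plint.chunks import Chunks

    class VerseStub:
        # hypothetical stand-in; plint normally builds Verse objects itself
        def __init__(self, line):
            self.input_line = line
            self.phon = []       # pronunciations, only read by get_feminine()
            self.pattern = None  # metrical pattern, only read by fit()

    verse = VerseStub("Je fais souvent ce rêve étrange et pénétrant")
    chunks = Chunks(verse)
    print(chunks.normalized())  # normalized text of the line
    print(chunks.get_line())    # original text, reassembled from the chunks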