plint

French poetry validator (local mirror of https://gitlab.com/a3nm/plint)
git clone https://a3nm.net/git/plint/
Log | Files | Refs | README

hyphen_splitter.py (2367B)


      1 import re
      2 
      3 from plint.common import is_consonants, normalize
      4 
# Matches a run of one or more hyphens. The capturing group matters: when a
# pattern containing a capture group is used for splitting, the matched
# separators are returned too, so each hyphen run is kept as its own item.
HYPHEN_REGEX = re.compile("(-+)")
      6 
      7 
      8 class HyphenSplitter:
      9 
     10     def __init__(self):
     11         self._missed = ""
     12         self.tokens = []
     13         self.just_append = False
     14         self.initialize()
     15 
     16     def initialize(self):
     17         self._missed = ""
     18         self.tokens = []
     19         self.just_append = False
     20 
     21     def split(self, word):
     22         """split hyphen-delimited word parts into separate words if they are only
     23               consonants, so that the sigle code later can deal with them (e.g. "k-way")
     24               annotates parts with boolean indicating if there is a word end afterward"""
     25         self.initialize()
     26         self.complete_tokens(word)
     27         self.process_remaining_missed()
     28         return self.get_tokens_with_last_word_indication()
     29 
     30     def complete_tokens(self, word):
     31         word_split_by_hyphen = re.split(HYPHEN_REGEX, word)
     32         for i, sub_word in enumerate(word_split_by_hyphen):
     33             self.add_subword_to_tokens(sub_word)
     34 
     35     def add_subword_to_tokens(self, sub_word):
     36         if self.just_append:
     37             self.append_to_last_token(sub_word)
     38         elif self.is_separator(sub_word):
     39             self.process_separator(sub_word)
     40         elif is_consonants(normalize(sub_word)):
     41             self.append_with_miss(sub_word)
     42         else:
     43             self.append_with_miss(sub_word)
     44             self.just_append = True
     45 
     46     def get_tokens_with_last_word_indication(self):
     47         return list(zip([False] * (len(self.tokens) - 1) + [True], self.tokens))
     48 
     49     def append_with_miss(self, sub_word):
     50         self.tokens.append(self._missed + sub_word)
     51         self._missed = ""
     52 
     53     def process_remaining_missed(self):
     54         if self._missed:
     55             if self.tokens:
     56                 self.append_to_last_token(self._missed)
     57             else:
     58                 self.tokens = [self._missed]
     59 
     60     @staticmethod
     61     def get_token(before_word_end, word):
     62         return before_word_end, word
     63 
     64     def process_separator(self, sub_word):
     65         if self.tokens:
     66             self.append_to_last_token(sub_word)
     67         else:
     68             self._missed += sub_word
     69 
     70     def append_to_last_token(self, sub_word):
     71         self.tokens[-1] = self.tokens[-1] + sub_word
     72 
     73     @staticmethod
     74     def is_separator(word):
     75         return re.match(r"^-*$", word) or re.match(r"^ *$", word)