hyphen_splitter.py (2367B)
import re

from plint.common import is_consonants, normalize

# The capturing group makes split() keep the hyphen runs in its output, so
# the separators can be re-attached to the neighbouring tokens.
HYPHEN_REGEX = re.compile("(-+)")


class HyphenSplitter:
    """Split a word on hyphen runs into tokens, keeping the hyphens.

    State used while splitting:
      * ``tokens``      -- tokens built so far
      * ``_missed``     -- leading separators seen before any token exists;
                           they are prefixed to the first token created
      * ``just_append`` -- once set, every further piece is glued to the
                           last token instead of starting a new one
    """

    def __init__(self):
        # Attributes are declared here and then reset through initialize(),
        # the same path split() uses before each word.
        self._missed = ""
        self.tokens = []
        self.just_append = False
        self.initialize()

    def initialize(self):
        """Reset the splitter state so a new word can be processed."""
        self._missed = ""
        self.tokens = []
        self.just_append = False

    def split(self, word):
        """Split hyphen-delimited word parts into separate tokens.

        Parts are kept as separate tokens as long as ``is_consonants``
        accepts them (per the original note: so that the later "sigle" code
        can deal with them, e.g. "k-way"); the first part it rejects, and
        everything after it, ends up in a single token.

        Returns a list of ``(is_last, token)`` tuples where ``is_last`` is
        True only for the final token, i.e. the one followed by a word end.
        The list is empty when no token was produced (e.g. empty input).
        """
        self.initialize()
        self.complete_tokens(word)
        self.process_remaining_missed()
        return self.get_tokens_with_last_word_indication()

    def complete_tokens(self, word):
        """Feed every piece of ``word`` (parts and hyphen runs) to the state."""
        for sub_word in HYPHEN_REGEX.split(word):
            self.add_subword_to_tokens(sub_word)

    def add_subword_to_tokens(self, sub_word):
        """Route one piece of the split word to the right handler."""
        if self.just_append:
            # A part rejected by is_consonants was already seen: the rest of
            # the word stays glued to that token.
            self.append_to_last_token(sub_word)
        elif self.is_separator(sub_word):
            self.process_separator(sub_word)
        elif is_consonants(normalize(sub_word)):
            # Accepted part: it gets its own token, and later parts may too.
            self.append_with_miss(sub_word)
        else:
            # First rejected part: start its token, then stop splitting.
            self.append_with_miss(sub_word)
            self.just_append = True

    def get_tokens_with_last_word_indication(self):
        """Return ``(is_last, token)`` tuples; only the final token is True."""
        last_index = len(self.tokens) - 1
        return [
            (index == last_index, token)
            for index, token in enumerate(self.tokens)
        ]

    def append_with_miss(self, sub_word):
        """Start a new token, prefixed with any pending leading separators."""
        self.tokens.append(self._missed + sub_word)
        self._missed = ""

    def process_remaining_missed(self):
        """Flush separators still pending once the whole word was consumed."""
        if not self._missed:
            return
        if self.tokens:
            self.append_to_last_token(self._missed)
        else:
            # The word consisted only of separators: keep it as one token.
            self.tokens = [self._missed]

    @staticmethod
    def get_token(before_word_end, word):
        """Return the ``(before_word_end, word)`` pair unchanged."""
        return before_word_end, word

    def process_separator(self, sub_word):
        """Attach a separator to the last token, or hold it if none exists."""
        if self.tokens:
            self.append_to_last_token(sub_word)
        else:
            self._missed += sub_word

    def append_to_last_token(self, sub_word):
        """Concatenate ``sub_word`` onto the most recent token."""
        self.tokens[-1] = self.tokens[-1] + sub_word

    @staticmethod
    def is_separator(word):
        """Return True if ``word`` is empty, only hyphens, or only spaces."""
        return bool(re.match(r"^-*$", word) or re.match(r"^ *$", word))