common.py (4254B)
#!/usr/bin/python3
# coding: utf-8

"""Shared text helpers: accent stripping, punctuation/whitespace
normalization, vowel/consonant tests and x-sampa substitutions."""

import unicodedata
import re

VOWELS = 'aeiouyœæ'
CONSONANTS = "bcçdfghjklmnpqrstvwxzñĝ'"
APOSTROPHES = "'’`"
LEGAL = VOWELS + CONSONANTS + ' -'

# a variant of x-sampa such that all french phonemes are one-character
SUBSTITUTIONS = [
    ('#', 'A~'),
    ('$', 'O~'),
    (')', 'E~'),
    ('(', '9~'),
]

# Forbidden at the end of a hemistiche. "-ent" would also be forbidden
# in some cases but not others...
SURE_END_FEM = ['es', 'e', 'ë']


# http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
def strip_accents_one(s, with_except=False):
    """Decompose each character of s (NFD) and return the pieces as a list.

    The combining marks are still present in the returned list; it is
    strip_accents() that filters them out.

    with_except keeps specifically 'é' and 'è' in their composed form, so
    that they survive the later filtering.
    """
    r = []
    for x in s:
        if with_except and x in ['è', 'é']:
            r.append(x)
        else:
            # NFD may yield several code points (base letter + marks)
            r.extend(unicodedata.normalize('NFD', x))
    return r


def strip_accents(s, with_except=False):
    """Return s without combining marks (Unicode category 'Mn').

    with_except keeps specifically 'é' and 'è' accented.
    """
    return ''.join(
        c for c in strip_accents_one(s, with_except)
        if unicodedata.category(c) != 'Mn')


def normalize_spaces(text):
    """Remove multiple consecutive whitespace

    Every run of whitespace, optionally followed by dashes and further
    whitespace, is replaced by a single space.
    """
    return re.sub(r"\s+-*\s*", ' ', text)


def remove_punctuation(text, rm_all=False, rm_apostrophe=False,
                       rm_apostrophe_end=True, rm_all_begin=False):
    """Remove punctuation from text

    rm_all removes every non-word character (spaces included) instead of
    replacing punctuation by spaces.
    rm_apostrophe removes all apostrophes.
    rm_apostrophe_end removes the apostrophes at the end of the text.
    rm_all_begin replaces the leading run of non-word characters by one
    space (only used when rm_all is false).
    """
    text = re.sub("[" + APOSTROPHES + "]", "'", text)  # no weird apostrophes
    if rm_apostrophe:
        text = re.sub(r"'", "", text)
    if rm_apostrophe_end:
        text = re.sub(r"'*$", "", text)  # apostrophes at end of line
    text = re.sub(r"[‒–—―⁓⸺⸻]", " ", text)  # no weird dashes
    text = re.sub(r"^--*\s", " ", text)  # no isolated dashes
    text = re.sub(r"--*\s", " ", text)  # no trailing dashes
    text = re.sub(r"^\s*-\s*$", " ", text)  # no lone dash
    text = re.sub(r"^--*$", "", text)  # no only dashes

    # TODO rather: keep only good chars
    if not rm_all:
        if rm_all_begin:
            pattern = re.compile(r"^[^\w]*", re.UNICODE)
            text2b = pattern.sub(' ', text)
        else:
            text2b = text
        # keep apostrophes, word characters, spaces and dashes
        pattern = re.compile(r"[^'\w -]", re.UNICODE)
        text2 = pattern.sub(' ', text2b)
    else:
        pattern = re.compile(r"[^\w]", re.UNICODE)
        text2 = pattern.sub('', text)
    text2 = re.sub(r"\s'*$", " ", text2)  # no lonely apostrophes
    text2 = re.sub(r"^'*$", "", text2)  # not only apostrophes
    return text2


def is_vowels(chunk_text, with_h=False, with_y=True, with_crap=False):
    """Test if a chunk is vowels

    with_h counts 'h' as vowel, with_y allows the chunk 'y' on its own,
    with_crap also accepts '*' and '?'. Accents are ignored. An empty
    chunk is accepted.
    """
    if not with_y and chunk_text == 'y':
        return False
    allowed = VOWELS
    if with_h:
        allowed += 'h'
    if with_crap:
        allowed += '*?'
    return all(char in allowed for char in strip_accents(chunk_text))


def is_consonants(chunk_text):
    """Test if a chunk is consonants

    Accents are ignored. An empty chunk is accepted.
    """
    return all(char in CONSONANTS for char in strip_accents(chunk_text))


def normalize(text, downcase=True, rm_all=False, rm_apostrophe=False,
              rm_apostrophe_end=True, rm_all_begin=False, strip=True):
    """Normalize text, ie. lowercase, no useless punctuation or whitespace

    downcase lowercases the text first; strip removes leading and trailing
    whitespace from the result. The rm_* flags are passed unchanged to
    remove_punctuation().
    """
    source = text.lower() if downcase else text
    res = normalize_spaces(remove_punctuation(
        source,
        rm_all=rm_all, rm_apostrophe=rm_apostrophe,
        rm_apostrophe_end=rm_apostrophe_end,
        rm_all_begin=rm_all_begin))
    return res.strip() if strip else res


def subst(string, subs):
    """Apply the (old, new) replacements of subs to string, in order.

    Each replacement works on the output of the previous one. Implemented
    as a loop so that long substitution lists neither hit the recursion
    limit nor get copied at every step.
    """
    for old, new in subs:
        string = string.replace(old, new)
    return string


def to_xsampa(s):
    """convert our modified format to x-sampa"""
    return subst(s, SUBSTITUTIONS)


def from_xsampa(s):
    """convert x-sampa to our modified format"""
    return subst(s, [(x[1], x[0]) for x in SUBSTITUTIONS])


def get_consonants_regex():
    """Return a compiled regex whose single group captures a run of
    characters that are neither consonants (lower or upper case), nor
    '*', nor '-'."""
    all_consonants = CONSONANTS + CONSONANTS.upper()
    consonants_regexp = re.compile(
        r'([^' + all_consonants + '*-]+)', re.UNICODE)
    return consonants_regexp