common.py (2355B)
#!/usr/bin/env python3
# coding: utf-8
"""Shared text helpers: accent stripping, normalization, phoneme notation."""

import unicodedata
import re

vowels = 'aeiouyœæ'
consonants = "bcçdfghjklmnpqrstvwxz"

# A variant of X-SAMPA such that all French phonemes are one character long.
# Each pair is (single-character form, X-SAMPA form).
SUBSTS = [
    ('#', 'A~'),
    ('$', 'O~'),
    (')', 'E~'),
    ('(', '9~'),
]

# Forbidden at the end of a hemistich. "-ent" would also be forbidden
# in some cases but not others...
sure_end_fem = ['es', 'e', 'ë']

# Accented letters that strip_accents_one() leaves alone on request.
_KEPT_ACCENTS = ('è', 'é')

# Whitespace run, optionally followed by dashes and more whitespace.
_SPACES_RE = re.compile(r"\s+-*\s*")

# Anything that is not an apostrophe, a word character, a space or a dash.
# TODO rather: keep only good chars
_PUNCT_RE = re.compile(r"[^'\w -]")


# http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
def strip_accents_one(s: str, with_except: bool = False) -> list:
    """Decompose every character of s into its NFD form.

    Returns a list of characters in which accented letters are split into
    a base letter followed by its combining marks. The marks themselves
    are not removed here; strip_accents() does that.

    with_except keeps specifically 'é' and 'è' undecomposed (only when
    they appear as single precomposed characters in s).
    """
    chars = []
    for x in s:
        if with_except and x in _KEPT_ACCENTS:
            chars.append(x)
        else:
            # extend() splits the normalized string into single characters
            chars.extend(unicodedata.normalize('NFD', x))
    return chars


def strip_accents(s: str, with_except: bool = False) -> str:
    """Return s without its accents.

    with_except keeps specifically 'é' and 'è'.
    """
    # 'Mn' is the category of the combining marks produced by NFD.
    return ''.join(
        c for c in strip_accents_one(s, with_except)
        if unicodedata.category(c) != 'Mn')


def norm_spaces(text: str) -> str:
    """Collapse each whitespace run into a single space.

    Dashes directly following the whitespace (and any whitespace after
    them) are swallowed as well, so "a -- b" becomes "a b".
    """
    return _SPACES_RE.sub(' ', text)


def rm_punct(text: str, with_apostrophe: bool = False) -> str:
    """Replace punctuation in text by spaces.

    Word characters, spaces and dashes are kept. Apostrophes are deleted
    (not replaced by a space) unless with_apostrophe is true, in which
    case they are kept.
    """
    if not with_apostrophe:
        text = text.replace("'", '')
    return _PUNCT_RE.sub(' ', text)


def is_vowels(chunk: str, with_h: bool = False, with_y: bool = True) -> bool:
    """Test if a chunk is vowels

    Accents are ignored. with_h counts 'h' as vowel; when with_y is false
    the chunk consisting of exactly 'y' is rejected. An empty chunk
    yields True.
    """
    if not with_y and chunk == 'y':
        return False
    return all(
        char in vowels or (with_h and char == 'h')
        for char in strip_accents(chunk))


def is_consonants(chunk: str) -> bool:
    """Test if a chunk is consonants

    Accents (and the cedilla) are ignored. An empty chunk yields True.
    """
    return all(char in consonants for char in strip_accents(chunk))


def normalize(text: str, with_apostrophe: bool = False) -> str:
    """Normalize text, ie. lowercase, no useless punctuation or whitespace"""
    cleaned = rm_punct(text.lower(), with_apostrophe)
    return norm_spaces(cleaned).strip()


def subst(string: str, subs) -> str:
    """Apply the (old, new) replacement pairs of subs to string, in order.

    Each replacement operates on the result of the previous one. An empty
    subs returns string unchanged.
    """
    # Iterative on purpose: no recursion limit and no list re-slicing.
    for old, new in subs:
        string = string.replace(old, new)
    return string


def to_xsampa(s: str) -> str:
    """convert our modified format to x-sampa"""
    return subst(s, SUBSTS)


def from_xsampa(s: str) -> str:
    """convert x-sampa to our modified format"""
    return subst(s, [(xsampa, short) for short, xsampa in SUBSTS])