vowels.py (3242B)
#!/usr/bin/env python3
#coding: utf-8

"""Compute the number of syllables taken by a vowel chunk"""

from common import strip_accents


def clear(x):
    """Return the text of chunk `x`, followed by a space if it ends a word.

    `x` is a mapping with a 'text' entry; the mere presence of a 'wordend'
    key (whatever its value) marks the chunk as the last one of its word.
    """
    return (x['text'] + ' ') if 'wordend' in x else x['text']


def intersperse(a, b):
    """Interleave the leading space-free runs of `a` and `b`.

    Each sequence is only considered up to (and excluding) its first ' ' or
    its end. The result alternates one item of `a`, one item of `b`; once one
    side is exhausted it is padded with "/" until the other side is
    exhausted too.

    >>> intersperse("ab cd", "xyz")
    ['a', 'x', 'b', 'y', '/', 'z']
    """
    def run_length(seq):
        # Length of the prefix of seq that precedes the first space.
        for index, item in enumerate(seq):
            if item == ' ':
                return index
        return len(seq)

    len_a = run_length(a)
    len_b = run_length(b)
    result = []
    # Iterative on purpose: no per-step slicing and no recursion-depth limit.
    for i in range(max(len_a, len_b)):
        result.append(a[i] if i < len_a else "/")
        result.append(b[i] if i < len_b else "/")
    return result


def contains_trema(chunk):
    """Test if a string contains a letter with a trema"""
    return any(x in chunk for x in ['ä', 'ë', 'ï', 'ö', 'ü', 'ÿ'])


default_threshold = 12


def make_query(chunks, pos):
    """Build the lookup query for the chunk at index `pos` of `chunks`.

    The query is a list whose first item is the text of the chunk itself
    (without the trailing word-end space), followed by the interleaving
    (see `intersperse`) of the text that follows the chunk and of the
    reversed text that precedes it, each cut at its first space.
    """
    cleared = [clear(x) for x in chunks]
    if cleared[pos].endswith(' '):
        # The chunk ends a word: move the word boundary to the start of the
        # following context so that the chunk text itself carries no space.
        cleared[pos] = cleared[pos].rstrip()
        if pos + 1 < len(cleared):
            cleared[pos+1] = " " + cleared[pos+1]
        else:
            # The chunk is the last one: there is no following chunk to
            # prefix, so the following context is just the boundary.
            cleared.append(' ')

    following = ''.join(cleared[pos+1:])
    # Preceding context read backwards, i.e. nearest character first.
    preceding = ''.join([x[::-1] for x in cleared[:pos][::-1]])
    return [cleared[pos]] + intersperse(following, preceding)


def possible_weights_ctx(chunks, pos, threshold=None):
    """Return the possible syllable counts of chunk `pos` using its context.

    The context query built by `make_query` is passed to `diaeresis.lookup`.
    If the result has exactly one key and the value stored for it is
    strictly greater than `threshold`, that key (converted to int) is the
    only answer. Otherwise fall back on `possible_weights_seed`.

    A falsy `threshold` (None, but also 0) selects `default_threshold`.
    """
    if not threshold:
        threshold = default_threshold
    # Imported here rather than at module level, as in the original code
    # (presumably to avoid an import cycle -- TODO confirm).
    from diaeresis import lookup
    chunk = chunks[pos]
    q = make_query(chunks, pos)
    v = lookup(q)
    keys = list(v.keys())
    if len(keys) == 1 and v[keys[0]] > threshold:
        return [int(keys[0])]
    return possible_weights_seed(chunk)


def possible_weights_approx(chunk):
    """Return the possible number of syllables taken by a vowel chunk
    (permissive approximation)

    `chunk` is the chunk text (a string). The result is a list of
    candidate syllable counts.
    """
    if len(chunk) == 1:
        return [1]
    # old spelling and weird exceptions
    if chunk in ['ouï']:
        return [1, 2]  # TODO unsure about that
    if chunk in ['eüi', 'aoû', 'uë']:
        return [1]
    if chunk in ['aïe', 'oë', 'ouü']:
        return [1, 2]
    if contains_trema(chunk):
        return [2]
    # NOTE(review): the tests on 'é'/'è' below suggest that the second
    # argument makes strip_accents keep those letters -- confirm in common.
    chunk = strip_accents(chunk, True)
    if chunk in ['ai', 'ou', 'eu', 'ei', 'eau', 'eoi', 'eui', 'au', 'oi',
                 'oie', 'œi', 'œu', 'eaie', 'aie', 'oei', 'oeu', 'ea', 'ae',
                 'eo', 'eoie', 'oe', 'eai', 'eue', 'aa', 'oo', 'ee', 'ii',
                 'aii', 'yeu', 'ye', 'you']:
        return [1]
    if chunk == "oua":
        return [1, 2]  # "pouah"
    if chunk == "ao":
        return [1, 2]  # "paon"
    # Substring test: chunks equal to one of the single-syllable entries
    # above (e.g. 'ea') have already returned and never reach this loop.
    for x in ['oa', 'ea', 'eua', 'euo', 'ua', 'uo', 'yau']:
        if x in chunk:
            return [2]
    # beware of "déesse"
    if chunk == 'ée':
        return [1, 2]
    if chunk[0] == 'i':
        return [1, 2]
    if chunk[0] == 'u' and (strip_accents(chunk[1]) in ['i', 'e']):
        return [1, 2]
    if (chunk[0] == 'o' and chunk[1] == 'u' and len(chunk) >= 3
            and strip_accents(chunk[2]) in ['i', 'e']):
        return [1, 2]
    if 'é' in chunk or 'è' in chunk:
        return [2]

    # we can't tell
    return [1, 2]


def possible_weights_seed(chunk):
    """Return the possible number of syllables taken by a vowel chunk

    Unlike `possible_weights_approx`, `chunk` is the chunk mapping itself:
    only its 'text' entry is read.
    """
    text = chunk['text']
    if len(text) == 1:
        return [1]
    # dioïde, maoïste, taoïste: a final 'ï' after two other letters which
    # are not 'ou'
    if text[-1] == 'ï' and len(text) >= 3 and not text[-3:-1] == 'ou':
        return [3]
    # NOTE: a single-syllable shortcut for 'ai', 'ou', 'eu', 'ei', 'eau',
    # 'au', 'oi' was disabled in the original code and is still disabled.
    # we can't tell
    return [1, 2]