drime

French rhyme dictionary with web and CLI interface
git clone https://a3nm.net/git/drime/
Log | Files | Refs | README

vowels.py (3242B)


      1 #!/usr/bin/env python3
      2 #coding: utf-8
      3 
      4 """Compute the number of syllables taken by a vowel chunk"""
      5 
      6 from common import strip_accents
      7 
      8 def clear(x):
      9   return (x['text'] + ' ') if 'wordend' in x else x['text']
     10 
     11 def intersperse(a, b):
     12   if (len(a) == 0 or a[0] == ' ') and (len(b) == 0 or b[0] == ' '):
     13     return []
     14   if len(a) == 0 or a[0] == ' ':
     15     return ["/", b[0]] + intersperse(a, b[1:])
     16   if len(b) == 0 or b[0] == ' ':
     17     return [a[0], "/"] + intersperse(a[1:], b)
     18   return [a[0], b[0]] + intersperse(a[1:], b[1:])
     19 
     20 def contains_trema(chunk):
     21   """Test if a string contains a word with a trema"""
     22   for x in ['ä', 'ë', 'ï', 'ö', 'ü', 'ÿ']:
     23     if x in chunk:
     24       return True
     25   return False
     26 
# Threshold used by possible_weights_ctx when the caller does not supply one:
# a lone lookup result is only trusted if its associated value is strictly
# greater than this
default_threshold = 12
     28 
     29 def make_query(chunks, pos):
     30   cleared = [clear(x) for x in chunks]
     31   if cleared[pos].endswith(' '):
     32     cleared[pos] = cleared[pos].rstrip()
     33     if pos + 1 <= len(cleared):
     34       cleared[pos+1] = " " + cleared[pos+1]
     35     else:
     36       cleared.append(' ')
     37 
     38   return [cleared[pos]] + intersperse(
     39       ''.join(cleared[pos+1:]),
     40       ''.join([x[::-1] for x in cleared[:pos][::-1]]))
     41 
     42 def possible_weights_ctx(chunks, pos, threshold=None):
     43   global default_threshold
     44   if not threshold:
     45     threshold = default_threshold
     46   from diaeresis import lookup
     47   chunk = chunks[pos]
     48   q = make_query(chunks, pos)
     49   #print (q)
     50   v = lookup(q)
     51   #print (v)
     52   if len(v.keys()) == 1 and v[list(v.keys())[0]] > threshold:
     53     return [int(list(v.keys())[0])]
     54   else:
     55     return possible_weights_seed(chunk)
     56 
     57 def possible_weights_approx(chunk):
     58   """Return the possible number of syllabes taken by a vowel chunk (permissive
     59   approximation)"""
     60   if len(chunk) == 1:
     61     return [1]
     62   # old spelling and weird exceptions
     63   if chunk in ['ouï']:
     64     return [1, 2] # TODO unsure about that
     65   if chunk in ['eüi', 'aoû', 'uë']:
     66     return [1]
     67   if chunk in ['aïe', 'oë', 'ouü']:
     68     return [1, 2]
     69   if contains_trema(chunk):
     70     return [2]
     71   chunk = strip_accents(chunk, True)
     72   if chunk in ['ai', 'ou', 'eu', 'ei', 'eau', 'eoi', 'eui', 'au', 'oi',
     73       'oie', 'œi', 'œu', 'eaie', 'aie', 'oei', 'oeu', 'ea', 'ae', 'eo',
     74       'eoie', 'oe', 'eai', 'eue', 'aa', 'oo', 'ee', 'ii', 'aii',
     75       'yeu', 'ye', 'you']:
     76     return [1]
     77   if chunk == "oua":
     78     return [1, 2] # "pouah"
     79   if chunk == "ao":
     80     return [1, 2] # "paon"
     81   for x in ['oa', 'ea', 'eua', 'euo', 'ua', 'uo', 'yau']:
     82     if x in chunk:
     83       return [2]
     84   # beware of "déesse"
     85   if chunk == 'ée':
     86     return [1, 2]
     87   if chunk[0] == 'i':
     88     return [1, 2]
     89   if chunk[0] == 'u' and (strip_accents(chunk[1]) in ['i', 'e']):
     90     return [1, 2]
     91   if chunk[0] == 'o' and chunk[1] == 'u' and len(chunk) >= 3 and strip_accents(chunk[2]) in ['i', 'e']:
     92     return [1, 2]
     93   if 'é' in chunk or 'è' in chunk:
     94     return [2]
     95 
     96   # we can't tell
     97   return [1, 2]
     98 
     99 def possible_weights_seed(chunk):
    100   """Return the possible number of syllabes taken by a vowel chunk"""
    101   if len(chunk['text']) == 1:
    102     return [1]
    103   # dioïde, maoïste, taoïste
    104   if (chunk['text'][-1] == 'ï' and len(chunk['text']) >= 3 and not
    105       chunk['text'][-3:-1] == 'ou'):
    106     return [3]
    107   #if chunk in ['ai', 'ou', 'eu', 'ei', 'eau', 'au', 'oi']:
    108   #  return [1]
    109   # we can't tell
    110   return [1, 2]
    111