drime

French rhyme dictionary with web and CLI interface
git clone https://a3nm.net/git/drime/
Log | Files | Refs | README

common.py (2350B)


      1 #!/usr/bin/python
      2 #coding: utf-8
      3 
      4 import unicodedata
      5 import re
      6 
      7 vowels = 'aeiouyϾ'
      8 consonants = "bcçdfghjklmnpqrstvwxz"
      9 
     10 # a variant of x-sampa such that all french phonemes are one-character
     11 SUBSTS = [
     12   ('#', 'A~'),
     13   ('$', 'O~'),
     14   (')', 'E~'),
     15   ('(', '9~'),
     16     ]
     17 
     18 # Forbidden at the end of a hemistiche. "-ent" would also be forbidden
     19 # in some cases but not others...
     20 sure_end_fem = ['es', 'e', 'ë']
     21 
     22 # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
     23 def strip_accents_one(s, with_except=False):
     24   """Strip accent from a string
     25   
     26   with_except keeps specifically 'é' and 'è'"""
     27   r = []
     28   for x in s:
     29     if with_except and x in ['è', 'é']:
     30       r.append(x)
     31     else:
     32       r += unicodedata.normalize('NFD', x)
     33   return r
     34 
     35 def strip_accents(s, with_except=False):
     36   return ''.join(
     37       (c for c in strip_accents_one(s, with_except)
     38       if unicodedata.category(c) != 'Mn'))
     39 
     40 def norm_spaces(text):
     41   """Remove multiple consecutive whitespace"""
     42   return re.sub("\s+-*\s*", ' ', text)
     43 
     44 def rm_punct(text, with_apostrophe = False):
     45   """Remove punctuation from text"""
     46   if not with_apostrophe:
     47     text = re.sub("'", '', text)
     48   #TODO rather: keep only good chars
     49   pattern = re.compile("[^'\w -]", re.UNICODE)
     50   return pattern.sub(' ', text)
     51 
     52 def is_vowels(chunk, with_h=False, with_y=True):
     53   """Test if a chunk is vowels
     54 
     55   with_h counts 'h' as vowel, with_y allows 'y'"""
     56 
     57   if not with_y and chunk == 'y':
     58     return False
     59   for char in strip_accents(chunk):
     60     if char not in vowels:
     61       if char != 'h' or not with_h:
     62         return False
     63   return True
     64 
     65 def is_consonants(chunk):
     66   """Test if a chunk is consonants"""
     67 
     68   for char in strip_accents(chunk):
     69     if char not in consonants:
     70       return False
     71   return True
     72 
     73 def normalize(text, with_apostrophe=False):
     74   """Normalize text, ie. lowercase, no useless punctuation or whitespace"""
     75   return norm_spaces(rm_punct(text.lower(), with_apostrophe)).rstrip().lstrip()
     76 
     77 def subst(string, subs):
     78   if len(subs) == 0:
     79     return string
     80   return subst(string.replace(subs[0][0], subs[0][1]), subs[1:])
     81 
     82 def to_xsampa(s):
     83   """convert our modified format to x-sampa"""
     84   return subst(s, SUBSTS)
     85 
     86 def from_xsampa(s):
     87   """convert x-sampa to our modified format"""
     88   return subst(s, [(x[1], x[0]) for x in SUBSTS])
     89