drime

French rhyme dictionary with web and CLI interface
git clone https://a3nm.net/git/drime/
Log | Files | Refs | README

lexique2sql.py (4746B)


      1 #!/usr/bin/env python3
      2 
      3 """Prepare the rhyme database
      4 
      5 Input should have tab-separated fields: word, pronunciation, base word,
      6 grammatical category, frequency. Output is a sequence of SQL statements"""
      7 
      8 # TODO frequencies are off
      9 # TODO "bibliographe" number of syllables?!
     10 
     11 import haspirater
     12 import metric
     13 from common import is_vowels, is_consonants, sure_end_fem
     14 import sys
     15 
     16 def string_type(max_len):
     17   return ('string', {'max_len':max_len})
     18 sql_fields = [
     19   ('word', string_type(100)),
     20   ('phon', string_type(100)),
     21   ('orig', ('string2', {'max_len':100})),
     22   ('freq', ('float', {})),
     23   ('min_nsyl', ('int', {})),
     24   ('max_nsyl', ('int', {})),
     25   ('word_end', string_type(10)),
     26   ('phon_end', string_type(10)),
     27   ('elidable', ('bool', {})),
     28   ('feminine', ('bool', {})),
     29 ]
     30 
     31 seen = {}
     32 
     33 # phonetic vowel sounds
     34 phon_vowels = "()$#289aeEioOuy@"
     35 # not a feminine ending, independently of spelling
     36 phon_non_end_fem = ['#', ')']
     37 
     38 def escape(x):
     39   """Escape for SQL"""
     40   s = []
     41   for a in x:
     42     if a in ["\\", "'"]:
     43       s.append('\\')
     44     s.append(a)
     45   return ''.join(s)
     46 
     47 class Word:
     48   @property
     49   def min_nsyl(self):
     50     return self.nsyl[0]
     51 
     52   @property
     53   def max_nsyl(self):
     54     return self.nsyl[1]
     55 
     56   @property
     57   def word_end(self):
     58     """Compute minimal visual rhyme"""
     59     l = []
     60     w = list(self.word)
     61     count = 0
     62     w.reverse()
     63     for x in w:
     64       if is_vowels(x) or is_consonants(x):
     65         l.append(x)
     66       if is_vowels(x) and count >= 1:
     67         break
     68       count += 1
     69     l.reverse()
     70     return ''.join(l)
     71 
     72   @property
     73   def phon_end(self):
     74     """Compute minimal phonetic rhyme"""
     75     l = []
     76     w = list(self.phon)
     77     w.reverse()
     78     for x in w:
     79       l.append(x)
     80       if x in phon_vowels:
     81         break
     82     l.reverse()
     83     return ''.join(l)
     84 
     85   @property
     86   def elidable(self):
     87     """Can this word cause elision in the previous word?"""
     88     return is_vowels(self.word[0]) or (self.word[0] == 'h' and
     89         not haspirater.lookup(self.word))
     90 
     91   @property
     92   def feminine(self):
     93     """Would this word be a feminine rhyme?"""
     94     # when word ends in -ent, it's hard to tell from writing, so look at phon
     95     # example: "tient" vs. "lient"
     96     def endswith_any(x, ends): return any([x.endswith(end) for end in ends])
     97     return (endswith_any(self.word, sure_end_fem)
     98             or (self.word.endswith('ent')
     99                 and not endswith_any(self.phon, phon_non_end_fem)))
    100 
    101   @property
    102   def ok(self):
    103     # Remove words with no vowels
    104     for x in phon_vowels:
    105       if x in self.phon_end:
    106         return True
    107     return False
    108 
    109   @property
    110   def sql(self):
    111     render = {
    112       'string': lambda s: '"'+escape(s)+'"',
    113       'string2': lambda s:
    114             '"'+escape(','.join([x[0] + '|' + x[1] for x in s]))+'"',
    115       'float': str,
    116       'int': lambda s: str(int(s)),
    117       'bool': lambda s: str(int(s)),
    118     }
    119     def sql_field(field):
    120       (name, (ty, _)) = field
    121       return render[ty](getattr(self, name))
    122     return ('INSERT INTO words VALUES('
    123             + ', '.join([sql_field(f) for f in sql_fields])
    124             + ');')
    125 
    126   def __init__(self, word, phon, base, kind, freq, do_extends=True):
    127     self.word = word
    128     self.phon = phon
    129     # workarounds for lexique
    130     base = base.split(',')[0]
    131     if kind == '':
    132       kind = "MISC"
    133     self.orig = [(kind, base)]
    134     self.freq = float(freq)
    135     self.nsyl = None
    136     if do_extends:
    137       # don't do this in the case where we are calling from lookup
    138       # it's not needed and may cause encoding problems
    139       self.do_extends()
    140 
    141   def align_sum(self, align):
    142     s = 0
    143     for a in align:
    144       #print(a)
    145       if isinstance(a, tuple):
    146         s += a[1]
    147     #print ("DBG for %s: %d" % (self.word, s))
    148     return s
    149 
    150   def do_extends(self):
    151     for align in metric.parse(self.word, 999):
    152       self.extend(self.align_sum(align[0]))
    153 
    154   def extend(self, item):
    155     try:
    156       self.nsyl = [min(self.nsyl[0], item), max(self.nsyl[1], item)]
    157     except TypeError: # first execution
    158       self.nsyl = [item, item]
    159 
    160 def create_table():
    161   def decl(field):
    162     (name, (ty, data)) = field
    163     if ty.startswith('string'):
    164       ty = 'varchar(' + str(data['max_len']) + ')'
    165     return name + ' ' + ty
    166   return ('CREATE TABLE words('
    167           + ', '.join([decl(field) for field in sql_fields])
    168           + ');')
    169 
    170 if __name__ == '__main__':
    171   cache = None
    172 
    173   print (create_table())
    174   while True:
    175     line = sys.stdin.readline()
    176     if not line:
    177       break
    178     l = line.rstrip().split("\t")
    179     w = Word(*l)
    180     k = (w.word, w.phon)
    181     if not w.ok:
    182       continue
    183     if cache and cache.word == w.word and cache.phon == w.phon:
    184       cache.freq += w.freq
    185       cache.orig += [w.orig[0]]
    186     else:
    187       if cache:
    188         print(cache.sql)
    189       cache = w
    190   print(cache.sql)
    191