lexique2sql.py (4746B)
#!/usr/bin/env python3

"""Prepare the rhyme database

Input should have tab-separated fields: word, pronunciation, base word,
grammatical category, frequency. Output is a sequence of SQL statements"""

# TODO frequencies are off
# TODO "bibliographe" number of syllables?!

import haspirater
import metric
from common import is_vowels, is_consonants, sure_end_fem
import sys


def string_type(max_len):
    """Return the type descriptor of a string column of length max_len."""
    return ('string', {'max_len': max_len})


# Columns of the `words` table, in order: (name, (type, type parameters)).
# Each name is also the Word attribute that provides the column value.
sql_fields = [
    ('word', string_type(100)),
    ('phon', string_type(100)),
    ('orig', ('string2', {'max_len': 100})),
    ('freq', ('float', {})),
    ('min_nsyl', ('int', {})),
    ('max_nsyl', ('int', {})),
    ('word_end', string_type(10)),
    ('phon_end', string_type(10)),
    ('elidable', ('bool', {})),
    ('feminine', ('bool', {})),
]

seen = {}

# phonetic vowel sounds
phon_vowels = "()$#289aeEioOuy@"
# not a feminine ending, independently of spelling
phon_non_end_fem = ['#', ')']


def escape(x):
    """Escape for SQL: prefix each backslash and single quote with a backslash.

    NOTE(review): double quotes are left untouched although Word.sql wraps
    string values in double quotes -- confirm against the target database
    that the input can never contain a double quote.
    """
    out = []
    for ch in x:
        if ch in ("\\", "'"):
            out.append('\\')
        out.append(ch)
    return ''.join(out)


class Word:
    """One input entry, exposing every column listed in sql_fields."""

    def __init__(self, word, phon, base, kind, freq, do_extends=True):
        """Build a word from the five input fields.

        word, phon, base, kind and freq are the raw (string) fields of an
        input line; freq is converted with float(). When do_extends is true,
        the syllable count bounds are computed immediately.
        """
        self.word = word
        self.phon = phon
        # workarounds for lexique
        base = base.split(',')[0]
        if kind == '':
            kind = "MISC"
        self.orig = [(kind, base)]
        self.freq = float(freq)
        # [min, max] syllable counts, or None until extend() is first called
        self.nsyl = None
        if do_extends:
            # don't do this in the case where we are calling from lookup
            # it's not needed and may cause encoding problems
            self.do_extends()

    @property
    def min_nsyl(self):
        """Smallest syllable count recorded by extend()."""
        return self.nsyl[0]

    @property
    def max_nsyl(self):
        """Largest syllable count recorded by extend()."""
        return self.nsyl[1]

    @property
    def word_end(self):
        """Compute minimal visual rhyme"""
        kept = []
        count = 0
        # Walk the spelling backwards, keeping vowels and consonants, and
        # stop at the first vowel that is not the very last character.
        # NOTE(review): the counter is assumed to advance on every
        # character, kept or not -- confirm against the upstream file.
        for x in reversed(self.word):
            if is_vowels(x) or is_consonants(x):
                kept.append(x)
            if is_vowels(x) and count >= 1:
                break
            count += 1
        kept.reverse()
        return ''.join(kept)

    @property
    def phon_end(self):
        """Compute minimal phonetic rhyme"""
        kept = []
        # Walk the pronunciation backwards up to and including the last vowel.
        for x in reversed(self.phon):
            kept.append(x)
            if x in phon_vowels:
                break
        kept.reverse()
        return ''.join(kept)

    @property
    def elidable(self):
        """Can this word cause elision in the previous word?"""
        first = self.word[0]
        return is_vowels(first) or (
            first == 'h' and not haspirater.lookup(self.word))

    @property
    def feminine(self):
        """Would this word be a feminine rhyme?"""
        # when word ends in -ent, it's hard to tell from writing, so look at
        # phon; example: "tient" vs. "lient"
        if self.word.endswith(tuple(sure_end_fem)):
            return True
        return (self.word.endswith('ent')
                and not self.phon.endswith(tuple(phon_non_end_fem)))

    @property
    def ok(self):
        """Tell whether the word should be kept (its phon_end has a vowel)."""
        # Remove words with no vowels
        end = self.phon_end
        return any(v in end for v in phon_vowels)

    @property
    def sql(self):
        """Return the INSERT statement for this word.

        Values are rendered in the order of sql_fields, according to the
        type declared there for each column.
        """
        render = {
            'string': lambda s: '"' + escape(s) + '"',
            # list of (kind, base) pairs, rendered as "kind|base,kind|base"
            'string2': lambda s:
                '"' + escape(','.join([x[0] + '|' + x[1] for x in s])) + '"',
            'float': str,
            'int': lambda s: str(int(s)),
            'bool': lambda s: str(int(s)),
        }

        def sql_field(field):
            (name, (ty, _)) = field
            return render[ty](getattr(self, name))

        return ('INSERT INTO words VALUES('
                + ', '.join([sql_field(f) for f in sql_fields])
                + ');')

    def align_sum(self, align):
        """Sum the second component of every tuple item of align."""
        return sum(a[1] for a in align if isinstance(a, tuple))

    def do_extends(self):
        """Feed extend() with the count of each parse of the word."""
        for align in metric.parse(self.word, 999):
            self.extend(self.align_sum(align[0]))

    def extend(self, item):
        """Widen the [min, max] syllable range so that it contains item."""
        if self.nsyl is None:  # first execution
            self.nsyl = [item, item]
        else:
            self.nsyl = [min(self.nsyl[0], item), max(self.nsyl[1], item)]


def create_table():
    """Return the CREATE TABLE statement matching sql_fields."""
    def decl(field):
        (name, (ty, data)) = field
        if ty.startswith('string'):
            ty = 'varchar(' + str(data['max_len']) + ')'
        return name + ' ' + ty

    return ('CREATE TABLE words('
            + ', '.join([decl(field) for field in sql_fields])
            + ');')


def main():
    """Read entries from stdin and print the SQL statements to stdout.

    Consecutive entries with the same spelling and pronunciation are merged
    into one row: frequencies are added and origins are concatenated.
    """
    pending = None

    print(create_table())
    for line in sys.stdin:
        fields = line.rstrip().split("\t")
        w = Word(*fields)
        if not w.ok:
            continue
        if (pending is not None and pending.word == w.word
                and pending.phon == w.phon):
            pending.freq += w.freq
            pending.orig += [w.orig[0]]
        else:
            if pending is not None:
                print(pending.sql)
            pending = w
    # Nothing is pending when the input was empty or had no usable word.
    if pending is not None:
        print(pending.sql)


if __name__ == '__main__':
    main()