query.py (7159B)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Look up words in the ``words`` table that rhyme with a query.

The module can be imported (see :func:`query`) or run as a script::

    query.py QUERY [NSYL [GENDER [CLASSICAL [PAGE]]]]
"""

import sys
import codecs
import operator
from db_mysql import run_query
from common import from_xsampa, to_xsampa
from rhyme import Rhyme, Constraint, escape
from options import default_options
import frhyme
from lexique2sql import Word

PAGESIZE = 500
# max number of phonemes to show when inferring
LIMIT = 5


class BadValues(Exception):
    """Raised when a query argument cannot be parsed."""


def lcs(x, y):
    """Return the length of the longest common suffix of *x* and *y*.

    Returns 0 when either sequence is empty.
    """
    length = 0
    for a, b in zip(reversed(x), reversed(y)):
        if a != b:
            break
        length += 1
    return length


def convert(x):
    """Convert the string 'true' / 'false' (any case) to a bool.

    Raises:
        BadValues: if *x* is neither 'true' nor 'false'.
    """
    booleans = {'true': True, 'false': False}
    try:
        return booleans[x.lower()]
    except KeyError:
        raise BadValues


def query(q, nsyl='', gender=True, classical=True, page=0):
    """Parse the user-supplied arguments and run the rhyme lookup.

    Args:
        q: the query text. If its last space-separated token is enclosed
            in square brackets, the bracket content is passed through
            ``from_xsampa`` and used as the pronunciation; the remaining
            tokens form the word.
        nsyl: syllable-count filter: empty (no filter), ``"N"`` or
            ``"N-M"``. A trailing ``+`` sets the ``elide`` flag passed to
            :func:`do_query`.
        gender: forwarded to :func:`do_query`.
        classical: forwarded to :func:`do_query`.
        page: zero-based page number (anything accepted by ``int``);
            a falsy value means page 0.

    Returns:
        Whatever :func:`do_query` returns for the requested page.

    Raises:
        BadValues: if *q* is empty, or *page* / *nsyl* cannot be parsed.
    """
    if not page:
        page = 0
    else:
        try:
            page = int(page)
        except ValueError:
            raise BadValues
    if not nsyl:
        nsyl = ''
    if not q:
        raise BadValues

    tokens = q.strip().split(' ')
    nsyl = nsyl.strip()
    if tokens[-1].startswith('[') and tokens[-1].endswith(']'):
        phon = from_xsampa(tokens[-1][1:-1])
        tokens = tokens[:-1]
    else:
        phon = None
    word = ' '.join(tokens)
    word = word.replace("œ", "oe").replace("æ", "ae")
    if word == '':
        word = None

    elide = False
    if not nsyl:
        minsyll = None
        maxsyll = None
    else:
        syll = nsyl.split('-')
        if syll[-1].endswith('+'):
            syll[-1] = syll[-1][:-1]
            elide = True
        if len(syll) > 2:
            raise BadValues
        try:
            minsyll = int(syll[0])
            # a single number means an exact count: min == max
            maxsyll = int(syll[-1])
        except ValueError:
            raise BadValues

    return do_query(word, phon, minsyll, maxsyll, elide, gender, classical,
                    page * PAGESIZE, PAGESIZE)


def get_key(x):
    """Build the identification tuple for row *x*.

    Side effect: ``x['orig']`` is replaced by its decoded form (see
    :func:`decode_orig`).

    Returns:
        ``(word, xsampa_phon, rendered_orig, "word [xsampa_phon]")``.
    """
    x['orig'] = decode_orig(x['orig'])
    xsampa = to_xsampa(x['phon'])
    return (x['word'], xsampa, render_orig(x),
            x['word'] + ' [' + xsampa + ']')


def decode_orig(x):
    """Split an ``orig`` string on ``,`` then each part on ``|``.

    Returns a list of lists of strings.
    """
    return [base.split('|') for base in x.split(',')]


def render_orig(row):
    """Render the decoded ``row['orig']`` as a comma-separated string.

    Each entry contributes its first field, followed by its second field
    in parentheses when that field differs from ``row['word']``.
    """
    return ', '.join(
        a[0] + (' (' + a[1] + ')' if row['word'] != a[1] else '')
        for a in row['orig'])


def do_query(word, phon, minsyll, maxsyll, elide, gender, classical, offset, size):
    """Find the entry matching *word* / *phon* and list its rhymes.

    Args:
        word: the word to look up, or None to match on *phon* only.
        phon: the pronunciation to look up, or None to match on *word* only.
        minsyll: minimum syllable count, or None for no lower bound.
        maxsyll: maximum syllable count, or None for no upper bound.
        elide: when true, elidable words may exceed *maxsyll* by one.
        gender: when true, keep only rows whose ``feminine`` value equals
            that of the looked-up entry.
        classical: when true, keep only rows for which the ``Rhyme``
            restriction against the looked-up entry is satisfied.
        offset: index of the first result to return.
        size: maximum number of results to return.

    Returns:
        A tuple ``(data, count, sure)``:

        * when several entries match, or none matches and no word was
          given: ``({'keys': keys}, 0, True)``;
        * otherwise ``({'keys': [key], 'result': rows}, count, sure)``
          where *count* is the number of rhyming rows before
          de-duplication and *sure* is False when the pronunciation had
          to be inferred.
    """
    cursor = run_query('''
        SELECT word, phon, word_end, phon_end, feminine, orig
        FROM words
        WHERE (word = ? OR ?) AND (phon = ? OR ?)
        ORDER BY freq DESC
        ''', (word, word is None, phon, phon is None,))
    constraint = Constraint(1, True)
    x = None
    keys = []
    for x in cursor:
        keys.append(get_key(x))
    sure = True
    if len(keys) > 1:
        return {'keys': keys}, 0, True  # require disambiguation
    if not keys:
        if not word:
            return {'keys': keys}, 0, True  # we need a word to infer anything
        if not phon:
            # infer the pronunciation from the last (up to three) words given
            sure = False
            last_words = word.split(' ')[-3:]
            prons = [frhyme.lookup(escape(w))[0][1] for w in last_words]
            phon = ''.join(prons)[-LIMIT:]
        # now, create a dummy entry for what was provided
        w = Word(word, phon, '', '', '1',
                 do_extends=False)
        x = {'word': w.word, 'phon': w.phon, 'word_end': w.word_end,
             'phon_end': w.phon_end, 'feminine': w.feminine, 'orig': '|'}
        keys.append(get_key(x))
    key = keys[0]
    word = x['word']
    phon = x['phon']
    word_end = x['word_end']
    phon_end = x['phon_end']
    feminine = x['feminine']

    # NOTE: there is no LIMIT/OFFSET in SQL; paging is done in Python below,
    # after filtering, sorting and de-duplication.
    rest = ''' FROM words
        WHERE (phon_end = ? OR word_end = ?)
        AND ((? OR max_nsyl >= ?)
        AND (? OR min_nsyl <= ?
          OR (elidable AND min_nsyl - 1 <= ? AND ?)))
        '''
    args = (phon_end, word_end,
            minsyll is None, minsyll, maxsyll is None, maxsyll, maxsyll, elide,)
    sql = '''
        SELECT word, phon, freq, min_nsyl, max_nsyl, elidable, orig, feminine
        ''' + rest
    cursor = run_query(sql, args)

    result = []
    bword = word.split(' ')[-1]
    for row in cursor:
        if gender and feminine != row['feminine']:
            continue
        if row['word'].endswith('-' + bword):
            continue
        if row['word'] == bword and ',' not in row['orig']:
            continue  # don't display the word if it has only one possible origin
        if classical:
            # restrict() mutates the Rhyme, so a fresh one is needed per row
            rhyme = Rhyme(word, constraint,
                          phon=[phon], mergers=[], options=default_options)
            rhyme.restrict(Rhyme(row['word'], constraint,
                                 phon=[row['phon']], mergers=[],
                                 options=default_options))
            if not rhyme.satisfied():
                continue  # discard words not satisfying the classical check
        row['freq'] = float(row['freq'])
        row['phon_rhyme'] = lcs(phon, row['phon'])
        row['word_rhyme'] = lcs(word, row['word'])
        row['key'] = (
            -row['phon_rhyme'],  # phon_rhyme desc
            -row['word_rhyme'],  # eye_rhyme desc
            -row['freq'],        # frequency desc
            row['word'],         # alphabetical order
        )
        result.append(row)
    result.sort(key=operator.itemgetter('key'))

    # Keep a row only if at least one of its origins (second field of each
    # decoded ``orig`` entry) has not been encountered yet.
    shown = []
    seen = set()
    for row in result:
        row['orig'] = decode_orig(row['orig'])
        bases = [entry[1] for entry in row['orig']]
        is_new = any(base not in seen for base in bases)
        seen.update(bases)
        if is_new:
            row['orig'] = render_orig(row)
            row['phon'] = to_xsampa(row['phon'])
            shown.append(row)
            if len(shown) > offset + size:
                break  # enough rows collected for the requested page
        # NOTE(review): nesting level of this statement was reconstructed
        # from a whitespace-mangled source -- confirm it belongs at loop level.
        seen.add(row['word'])
    count = len(result)
    return {'keys': [key], 'result': shown[offset:offset + size]}, count, sure


if __name__ == '__main__':
    def usage():
        print ("Usage: %s QUERY [NSYL [GENDER [CLASSICAL [PAGE]]]]" % sys.argv[0])

    try:
        # GENDER and CLASSICAL are given as 'true' / 'false'
        for p in [3, 4]:
            if p < len(sys.argv):
                sys.argv[p] = convert(sys.argv[p])
        try:
            # adjust page to number starting at 0
            sys.argv[5] = max(int(sys.argv[5]) - 1, 0)
        except IndexError:
            pass  # page not specified
        r, c, sure = query(*sys.argv[1:])
    except (BadValues, ValueError, TypeError):
        print ("Bad values passed as arguments.")
        usage()
        sys.exit(4)
    if len(r['keys']) == 0:
        print ("No interpretation found for %s" % sys.argv[1])
        sys.exit(1)
    if len(r['keys']) > 1:
        print ("Multiple interpretations found for %s:" % sys.argv[1])
        for k in r['keys']:
            print (" - %s" % k[-1] + ' -- ' + k[2])
        print ("Please rerun with a more specific query")
        sys.exit(2)
    if not sure:
        print ("Warning: word is unknown, pronunciation is inferred, please check")
    result = [["word", "phon", "pr", "wr", "freq", "orig"]] + [
        (x['word'],
         x['phon'],
         str(x['phon_rhyme']),
         str(x['word_rhyme']),
         str(x['freq']),
         x['orig']) for x in r['result']]
    for x in result:
        print ('|'.join(x))
    sys.exit(0)