drime

French rhyme dictionary with web and CLI interfaces
git clone https://a3nm.net/git/drime/
Log | Files | Refs | README

query.py (7158B)


      1 #!/usr/bin/python3 -O
      2 # -*- coding: utf-8 -*-
      3 
      4 import sys
      5 import codecs
      6 import operator
      7 from db_mysql import run_query
      8 from common import from_xsampa, to_xsampa
      9 from rhyme import Rhyme, Constraint, escape
     10 from options import default_options
     11 import frhyme
     12 from lexique2sql import Word
     13 
# number of results per page: query() multiplies the page index by this to
# get the offset, and passes it as the page size to do_query()
PAGESIZE = 500
# max number of phonemes to show when inferring: do_query() keeps only the
# last LIMIT symbols of a pronunciation it had to infer
LIMIT = 5
     17 
     18 class BadValues(Exception):
     19   pass
     20 
     21 def lcs(x, y):
     22   """Longest common suffix"""
     23   i = 1
     24   while x[-i] == y[-i]:
     25     i += 1
     26     if i > len(x) or i > len(y):
     27       break
     28   return i - 1
     29 
     30 def convert(x):
     31   d = {'true': True, 'false': False}
     32   try:
     33     return d[x.lower()]
     34   except KeyError:
     35     raise BadValues
     36 
     37 
     38 def query(q, nsyl='', gender=True, classical=True, page=0):
     39   if not page:
     40     page = 0
     41   else:
     42     try:
     43       page = int(page)
     44     except ValueError:
     45       raise BadValues
     46   if not nsyl:
     47     nsyl = ''
     48   if not q:
     49     raise BadValues
     50   word = q.strip().split(' ')
     51   nsyl = nsyl.strip()
     52   if word[-1].startswith('[') and word[-1].endswith(']'):
     53       phon = from_xsampa(word[-1][1:-1])
     54       word = word[:-1]
     55   else:
     56       phon = None
     57   word = ' '.join(word)
     58   word = word.replace("œ", "oe").replace("æ", "ae")
     59   if word == '':
     60     word = None
     61   elide = False
     62   if len(nsyl) == 0:
     63     minsyll = None
     64     maxsyll = None
     65   else:
     66     syll = nsyl.split('-')
     67     if len(syll[-1]) > 0 and syll[-1][-1] == '+':
     68       syll[-1] = syll[-1][:-1]
     69       elide = True
     70     else:
     71       elide = False
     72     if len(syll) > 2:
     73       raise BadValues
     74     try:
     75       minsyll = int(syll[0])
     76       if len(syll) == 1:
     77         maxsyll = int(syll[0])
     78       else:
     79         maxsyll = int(syll[1])
     80     except ValueError:
     81       raise BadValues
     82 
     83   return do_query(word, phon, minsyll, maxsyll, elide, gender, classical,
     84       page*PAGESIZE, PAGESIZE)
     85 
     86 def get_key(x):
     87   x['orig'] = decode_orig(x['orig'])
     88   return (x['word'], to_xsampa(x['phon']), render_orig(x),
     89         x['word'] + ' [' + to_xsampa(x['phon']) + ']')
     90 
     91 def decode_orig(x):
     92   bases = x.split(',')
     93   for i in range(len(bases)):
     94     bases[i] = bases[i].split('|')
     95   return bases
     96 
     97 def render_orig(row):
     98   return ', '.join(
     99         [a[0] + (' ('+a[1]+')' if row['word'] != a[1] else '')
    100           for a in row['orig']])
    101 
    102 def do_query(word, phon, minsyll, maxsyll, elide, gender, classical, offset, size):
    103   cursor = run_query('''
    104     SELECT word, phon, word_end, phon_end, feminine, orig
    105     FROM words
    106     WHERE (word = ? OR ?) AND (phon = ? OR ?)
    107     ORDER BY freq DESC
    108     ''', (word, word == None, phon, phon == None,))
    109   result = {}
    110   keys = []
    111   constraint = Constraint(1, True)
    112   for x in cursor:
    113     key = get_key(x)
    114     keys.append(key)
    115   sure = True
    116   if len(keys) > 1:
    117     return {'keys': keys}, 0, True # require disambiguation or is empty
    118   if len(keys) == 0:
    119     if not word:
    120       return {'keys': keys}, 0, True # we need a word to infer anything
    121     if not phon:
    122       # infer from what was given
    123       sure = False
    124       s = word.split(' ')[-3:]
    125       prons = [frhyme.lookup(escape(w))[0][1] for w in s]
    126       phon = ''.join(prons)[-LIMIT:]
    127     # now, create a dummy entry for what was provided
    128     w = Word(word, phon, '', '', '1',
    129             do_extends=False)
    130     x = {'word': w.word, 'phon': w.phon, 'word_end': w.word_end, 'phon_end':
    131         w.phon_end, 'feminine': w.feminine, 'orig': '|'}
    132     key = get_key(x)
    133   word = x['word']
    134   phon = x['phon']
    135   word_end = x['word_end']
    136   phon_end = x['phon_end']
    137   feminine = x['feminine']
    138 
    139   rest = ''' FROM words
    140     WHERE (phon_end = ? OR word_end = ?)
    141         AND ((? OR max_nsyl >= ?)
    142         AND (? OR min_nsyl <= ?
    143             OR (elidable AND min_nsyl - 1 <= ? AND ?)))
    144     '''
    145   #limit = '''LIMIT ? OFFSET ?'''
    146 
    147   args = (phon_end, word_end,
    148         minsyll == None, minsyll, maxsyll == None, maxsyll, maxsyll, elide,)
    149   query = '''
    150     SELECT word, phon, freq, min_nsyl, max_nsyl, elidable, orig, feminine
    151       ''' + rest #+ limit
    152   cursor = run_query(query, args) #+ (size, offset,))
    153 
    154   result = []
    155   bword = word.split(' ')[-1]
    156   for row in cursor:
    157     if feminine != row['feminine'] and gender:
    158       continue
    159     if (row['word'].endswith('-'+bword)):
    160       continue
    161     if (row['word'] == bword and row['word'] == bword
    162         and ',' not in row['orig']):
    163       continue # don't display the word if it has only one possible origin
    164     if classical:
    165       rhyme = Rhyme(word, constraint,
    166               phon=[phon], mergers=[], options=default_options)
    167       rhyme.restrict(Rhyme(row['word'], constraint,
    168           phon=[row['phon']], mergers=[], options=default_options))
    169       if not rhyme.satisfied():
    170         continue # discard words following classical rules
    171     row['freq'] = float(row['freq'])
    172     row['phon_rhyme'] = lcs(phon, row['phon'])
    173     row['word_rhyme'] = lcs(word, row['word'])
    174     row['key'] = (
    175         -row['phon_rhyme'], # phon_rhyme desc
    176         -row['word_rhyme'], # eye_rhyme desc
    177         #-len([x for x in row['orig'] if x[1] == row['word']]), # same as base
    178         -row['freq'], # frequency desc
    179         row['word'] # alphabetical order
    180         )
    181 
    182     result.append(row)
    183   result.sort(key=operator.itemgetter('key'))
    184   result2 = []
    185   seen = set()
    186   c = 0
    187   for row in result:
    188     row['orig'] = decode_orig(row['orig'])
    189     ok = False
    190     for i in range(len(row['orig'])):
    191       if row['orig'][i][1] not in seen:
    192         ok = True
    193       seen.add(row['orig'][i][1])
    194     if ok:
    195       row['orig'] = render_orig(row)
    196       row['phon'] = to_xsampa(row['phon'])
    197       result2.append(row)
    198       c += 1
    199       if c > offset + size:
    200         break
    201       seen.add(row['word'])
    202   count = len(result)
    203   result2 = result2[offset:offset+size]
    204 
    205   #cursor = run_query('''
    206     #SELECT count(t2.word)
    207     #''' + rest, args)
    208   #for x in cursor:
    209     #count = x[x.keys()[0]]
    210   return {'keys': [key], 'result': result2}, count, sure
    211 
    212 if __name__ == '__main__':
    213   def usage():
    214     print ("Usage: %s QUERY [NSYL [GENDER [CLASSICAL [PAGE]]]]" % sys.argv[0])
    215   try:
    216     for p in [3, 4]:
    217       if p < len(sys.argv):
    218         sys.argv[p] = convert(sys.argv[p])
    219     try:
    220       # adjust page to number starting at 0
    221       sys.argv[5] = max(int(sys.argv[5]) - 1, 0)
    222     except IndexError:
    223       pass # page not specified
    224     r, c, sure = query(*sys.argv[1:])
    225   except (BadValues, ValueError, TypeError):
    226     print ("Bad values passed as arguments.")
    227     usage()
    228     sys.exit(4)
    229   #except TypeError:
    230     #usage()
    231     #sys.exit(4)
    232   if len(r['keys']) == 0:
    233     print ("No interpretation found for %s" % sys.argv[1])
    234     sys.exit(1)
    235   if len(r['keys']) > 1:
    236     print ("Multiple interpretations found for %s:" % sys.argv[1])
    237     for k in r['keys']:
    238       print ("  - %s" % k[-1] + ' -- ' + k[2])
    239     print ("Please rerun with a more specific query")
    240     sys.exit(2)
    241   if not sure:
    242     print ("Warning: word is unknown, pronunciation is inferred, please check")
    243   result = [["word", "phon", "pr", "wr", "freq", "orig"]] + [
    244       (x['word'],
    245        x['phon'],
    246        str(x['phon_rhyme']),
    247        str(x['word_rhyme']),
    248        str(x['freq']),
    249        x['orig']) for x in r['result']]
    250   #print ("Displaying result for %s" % r['keys'][0][2])
    251   for x in result:
    252     print ('|'.join(x))
    253   sys.exit(0)
    254