drime

French rhyme dictionary with web and CLI interfaces
git clone https://a3nm.net/git/drime/
Log | Files | Refs | README

commit eb9c522929e8eb7784052a2dc4122009c0aa2bac
parent 77221a978a5e49fe5a16d6606559bc91576423e7
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Tue, 27 Dec 2011 00:45:56 +0100

sortof works now

Diffstat:
README | 6+++---
db_mysql.py | 3+--
lexique2sql.py | 17++++++++---------
lexique2sql.sh | 4+++-
query.py | 142+++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
static/main.css | 45++++++++++++++++++++++++++++++++++++++++++---
templates/about.html | 17++++++++++-------
templates/disambig.html | 4++--
templates/error.html | 2+-
templates/notfound.html | 4++--
templates/page.html | 5++---
templates/results.html | 38+++++++++++++++++++++++++++++---------
12 files changed, 205 insertions(+), 82 deletions(-)

diff --git a/README b/README @@ -1,6 +1,4 @@ WARNING -- this code does *not* work yet! -TODO: dos2unix -TODO: placement in rhymes for enlacement? drime - by Antoine Amarilli A French rhyme dictionary @@ -39,7 +37,9 @@ This can take some time: you can monitor progress using the pv utility: To import the output of lexique2sql.sh in a MySQL database (on localhost, database 'drime', as user 'drime', interactive password authentication), run: - cat output.sql | mysql --default-character-set=utf8 -D drime -u drime -p + cat output.sql | + sed 's/varchar([0-9]*)/& collate utf8_bin/g' | + mysql --default-character-set=utf8 -D drime -u drime -p == 4. Using the DB == diff --git a/db_mysql.py b/db_mysql.py @@ -10,8 +10,7 @@ def run_query(r, v): user=config['user'], passwd=config['passwd'], db=config['db'], - cursorclass=MySQLdb.cursors.DictCursor, - use_unicode=True) + cursorclass=MySQLdb.cursors.DictCursor) cursor = db.cursor() cursor.execute(r.replace('?', '%s'), v) return cursor diff --git a/lexique2sql.py b/lexique2sql.py @@ -109,18 +109,16 @@ class Word: @property def sql(self): render = { - 'string': lambda s, w: '"'+escape(s)+'"', - 'string2': lambda s, w: - '"'+escape( - ', '.join([x[0]+(' ('+x[1]+')' if w.word != x[1] else '') - for x in s]))+'"', - 'float': lambda s, w: str(s), - 'int': lambda s, w: str(int(s)), - 'bool': lambda s, w: str(int(s)), + 'string': lambda s: '"'+escape(s)+'"', + 'string2': lambda s: + '"'+escape(','.join([x[0] + '|' + x[1] for x in s]))+'"', + 'float': str, + 'int': lambda s: str(int(s)), + 'bool': lambda s: str(int(s)), } def sql_field(field): (name, (ty, _)) = field - return render[ty](getattr(self, name), self) + return render[ty](getattr(self, name)) return ('INSERT INTO words VALUES(' + ', '.join([sql_field(f) for f in sql_fields]) + ');') @@ -128,6 +126,7 @@ class Word: def __init__(self, word, phon, base, kind, freq): self.word = word self.phon = phon + base = base.split(',')[0] # workaround for lexique self.orig = [(kind, base)] 
self.freq = float(freq) self.nsyl = None diff --git a/lexique2sql.sh b/lexique2sql.sh @@ -3,6 +3,8 @@ cd "$( dirname "$0" )" cat - additions | # add custom exceptions + sort -k1,2 | # sort to aggregate duplicates TODO break ties by frequency cut -f 1-8 | # select relevant fields - awk '{FS=" "; OFS=" "; print $1, $2, $3, $4, ($5+$6)/2 + 100*($7+$8)/2}' | # aggregate frequencies + awk '{FS=" "; OFS=" "; + print $1, $2, $3, $4, ($5+$6)/2 + 100*($7+$8)/2}' | # aggregate frequencies ./lexique2sql.py diff --git a/query.py b/query.py @@ -5,7 +5,10 @@ import operator from db_mysql import run_query from common import from_xsampa, to_xsampa -PAGESIZE=50 +PAGESIZE = 500 + +class BadValues(Exception): + pass def lcs(x, y): """Longest common suffix""" @@ -18,10 +21,14 @@ def lcs(x, y): def query(q, nsyl='', gender=True, page=0): + if not page: + page = 0 + else: + page = int(page) if not nsyl: nsyl = '' if not q: - raise ValueError + raise BadValues word = q.strip().split(' ') nsyl = nsyl.strip() if word[-1].startswith('[') and word[-1].endswith(']'): @@ -44,7 +51,7 @@ def query(q, nsyl='', gender=True, page=0): else: elide = False if len(syll) > 2: - raise ValueError + raise BadValues minsyll = int(syll[0]) if len(syll) == 1: maxsyll = int(syll[0]) @@ -55,13 +62,50 @@ def query(q, nsyl='', gender=True, page=0): page*PAGESIZE, PAGESIZE) pass +def decode_all(x): + for k in x.keys(): + if isinstance(x[k], str): + try: + x[k] = x[k].decode('utf8') + except UnicodeDecodeError: + x[k] = x[k].decode('latin1') + +def get_key(x): + return (x['t1_word'], x['t1_phon'], + x['t1_word'] + ' [' + to_xsampa(x['t1_phon']) + ']') + def do_query(word, phon, minsyll, maxsyll, elide, gender, offset, size): print ((word, phon, minsyll, maxsyll, elide, gender,)) + print ((offset, size,)) cursor = run_query(''' - SELECT t1.freq AS t1_freq, - t1.word AS t1_word, + SELECT t1.word AS t1_word, + t1.phon AS t1_phon + FROM words AS t1 + WHERE (t1.word = ? OR ?) AND (t1.phon = ? OR ?) 
+ ORDER BY t1.freq DESC + ''', (word, word == None, phon, phon == None,)) + result = {} + for x in cursor: + decode_all(x) + result[get_key(x)] = x + if len(result.keys()) > 1 or result == {}: + return result, 0 # require disambiguation or is empty + + rest = ''' FROM words AS t1, words AS t2 + WHERE (t1.phon_end = t2.phon_end OR t1.word_end = t2.word_end) + AND (t1.word = ? OR ?) AND (t1.phon = ? OR ?) + AND ((? OR t2.max_nsyl >= ?) + AND (? OR t2.min_nsyl <= ? + OR (t2.elidable AND t2.min_nsyl - 1 <= ? AND ?))) + ORDER BY t2.freq, t1.phon, t1.word + ''' + limit = '''LIMIT ? OFFSET ?''' + + args = (word, word == None, phon, phon == None, + minsyll == None, minsyll, maxsyll == None, maxsyll, maxsyll, elide,) + query = ''' + SELECT t1.word AS t1_word, t1.phon AS t1_phon, - t1.base AS t1_base, t1.feminine AS t1_feminine, t2.word AS t2_word, t2.phon AS t2_phon, @@ -69,63 +113,81 @@ def do_query(word, phon, minsyll, maxsyll, elide, gender, offset, size): t2.min_nsyl AS t2_min_nsyl, t2.max_nsyl AS t2_max_nsyl, t2.elidable AS t2_elidable, - t2.base AS t2_base, - t2.kind AS t2_kind, + t2.orig AS t2_orig, t2.feminine AS t2_feminine - FROM words AS t1 INNER JOIN words AS t2 ON - (t1.phon_end = t2.phon_end OR t1.word_end = t2.word_end) - WHERE (t1.word = ? OR ?) AND (t1.phon = ? OR ?) - AND ((? OR t2.max_nsyl >= ?) - AND (? OR t2.min_nsyl <= ? - OR (t2.elidable AND t2.min_nsyl - 1 <= ? 
AND ?))) - ORDER BY t1.freq, t1.phon, t1.word - ''', (word, word == None, phon, phon == None, - minsyll == None, minsyll, maxsyll == None, maxsyll, maxsyll, elide,)) + ''' + rest #+ limit + print (query) + cursor = run_query(query, args) #+ (size, offset,)) result = {} - cache = {} - seen = {} for x in cursor: + for k in x.keys(): + if isinstance(x[k], str): + try: + x[k] = x[k].decode('utf8') + except UnicodeDecodeError: + x[k] = x[k].decode('latin1') if x['t1_feminine'] != x['t2_feminine'] and gender: continue - key = (x['t1_word'], x['t1_phon'], - x['t1_word'] + ' [' + to_xsampa(x['t1_phon']) + ']') + key = get_key(x) if key not in result.keys(): result[key] = [] - cache[key] = [] - seen[key] = set() row = dict([ (k[3:], x[k]) for k in x.keys() if k.startswith('t2_')]) - if row['base'] in seen[key]: + for k in row.keys(): + if isinstance(row[k], str): + try: + row[k] = row[k].decode('utf8') + except UnicodeDecodeError: + row[k] = row[k].decode('latin1') + if (row['word'].endswith('-'+x['t1_word'])): continue - seen[key].add(row['base']) + if (row['word'] == x['t1_word'] and row['word'] == x['t1_word'] + and ',' not in row['orig']): + continue # don't display the word if it has only one possible origin row['freq'] = float(row['freq']) row['phon_rhyme'] = lcs(x['t1_phon'], row['phon']) row['word_rhyme'] = lcs(x['t1_word'], row['word']) row['key'] = ( -row['phon_rhyme'], # phon_rhyme desc -row['word_rhyme'], # eye_rhyme desc - row['base'] != row['word'], # same as base + #TODO row['base'] != row['word'], # same as base -row['freq'], # frequency desc row['word'] # alphabetical order ) - row['derivation'] = row['kind'] + ( - ' (' + row['base'] + ')' - if row['base'] != row['word'] - else '') row['phon'] = to_xsampa(row['phon']) - if (row['word'] in [x['t1_word'], x['t1_base']] - and row['phon'] == to_xsampa(x['t1_phon'])): - cache[key].append(row) - else: - result[key].append(row) + result[key].append(row) + + result2 = {} + seen = {} for k in result.keys(): - # only 
display the word itself if multiple derivations are possible - if len(cache[key]) > 1: - result[k] += cache[key] + # TODO only display the word itself if multiple derivations are possible result[k] = sorted(result[k], key=operator.itemgetter('key')) - print(result) - return result + result2[k] = [] + seen[key] = set() + for row in result[k]: + bases = row['orig'].split(',') + ok = False + for i in range(len(bases)): + bases[i] = bases[i].split('|') + if bases[i][1] not in seen[key]: + ok = True + seen[key].add(bases[i][1]) + if ok: + row['orig'] = ', '.join( + [a[0] + (' ('+a[1]+')' if row['word'] != a[1] else '') + for a in bases]) + result2[k].append(row) + seen[key].add(row['word']) + count = len(result2[k]) + result2[k] = result2[k][:PAGESIZE] + + #cursor = run_query(''' + #SELECT count(t2.word) + #''' + rest, args) + #for x in cursor: + #count = x[x.keys()[0]] + return result2, count if __name__ == '__main__': print(query(*sys.argv[1:])) diff --git a/static/main.css b/static/main.css @@ -3,7 +3,8 @@ h1 { float: left; margin: 0; margin-right: 0.3em; - font-size: 140%; + font-size: 100%; + padding: 0.2em; } #body { @@ -44,10 +45,12 @@ label { table, .faketable { width: 100%; + margin: 0; + padding: 0; } .odd { - background: #efe; + background: #dfd; } .help { @@ -59,7 +62,7 @@ table, .faketable { } #query { - width: 20em; + width: 12em; } #gender_label { @@ -87,3 +90,39 @@ table, .faketable { padding: 0.3em; } +#result_header { + background: #dfd; + margin: 0; + padding: 0.3em; +} + +header form { + float: right; +} + +header form input { + height: 100%; +} + +.page { + text-align: center; +} + +/*#col_word { + width: 10%; +} +#col_pron { + width: 10%; +} +#col_derivations { + width: 30%; +} +#col_freq { + width: 15%; +} +#col_phon { + width: 7%; +} +#col_eye { + width: 7%; +}*/ diff --git a/templates/about.html b/templates/about.html @@ -1,21 +1,19 @@ {% extends "page.html" %} {% block body %} -TODO move this to / <p>Welcome to <strong>drime</strong>!</p> <h2 
id="info">Wait, what is this?</h2> -<p>This is drime, <a href="http://a3nm.net">a3nm</a>'s attempt to build a better -French rhyme dictionary. It uses the <a - href="http://www.lexique.org/">Lexique</a> database with some customisations, -and is powered by Python, Sqlite and Flask. You can get the code. TODO -links.</p> +<p>This is <a href="http://a3nm.net">a3nm</a>'s attempt to build a better French +rhyme dictionary. It uses the <a href="http://www.lexique.org/">Lexique</a> +database with some customisations, and is powered by Python, Sqlite and Flask. +You can get the code. TODO links.</p> <h2 id="help">How do I use it?</h2> <p>In the <strong>word</strong> field, enter the word you want to get rhymes for. Inferring pronunciation of unknown words isn't supported (yet), so don't use proper nouns or rare words. You can also provide a pronunciation written between square brackets using <a href="#pron">the right convention</a> to disambiguate if multiple pronunciations are possible. Example: <a - href="/?query=fils%20[fis]">fils [fis]</a>.</p> + href="/query?query=fils%20[fis]">fils [fis]</a>.</p> <p>In the <strong>n_syllables</strong> field, you can specify a number of syllables to limit on. You can either specify an exact number or a range (eg. "1-3"). You can suffix a "+" to indicate that you can accept one syllable more if @@ -44,5 +42,10 @@ pronunciation indicated is a colloquial one and not the one that you would use t versify (for instance, "placement" is reported as "plasmA~" but would be read as "plas@mA~"). 
Heuristics are used to work around this when filtering on the number of syllables.</p> + +<h2>What about the name?</h2> + +<p>It's pronounced [dRim], as a pun with "dream" (the dream rhyme dictionary) +but also because it is short for "dictionnaire de rimes" in French.</p> {% endblock %} diff --git a/templates/disambig.html b/templates/disambig.html @@ -6,11 +6,11 @@ {% for k in keys %} <li class="{{ loop.cycle('odd', 'even') }}"> <a href="?query={{ k[-1] | escape }}&nsyl={{ nsyl }}&gender={{ gender }}"> - {{ k[0] }} [{{ k[1] }}], rhyming with + {{ k[0] }} [{{ k[1] }}]{#, rhyming with {% for v in example[k][:5] %} {{ v.word }} {% endfor %} - ... + ... #} </a> </li> {% endfor %} diff --git a/templates/error.html b/templates/error.html @@ -1,6 +1,6 @@ {% extends "page.html" %} {% block body %} -<p>Invalid values supplied. Need some <a href="about#help">help</a>?</p> +<p>Invalid values supplied. Need some <a href="/#help">help</a>?</p> {% endblock %} diff --git a/templates/notfound.html b/templates/notfound.html @@ -2,7 +2,7 @@ {% block body %} <p>No known word matches this query. Make sure you're using a word from the -dictionary. Remove syllabe count restrictions, if any. Need some <a - href="about#help">help</a>?</p> +dictionary. 
Need some <a + href="/#help">help</a>?</p> {% endblock %} diff --git a/templates/page.html b/templates/page.html @@ -9,7 +9,6 @@ <body> <header> <h1><a href="/">drime</a></h1> - <div id="about"><a href="about">about</a></div> <form method="GET" action="query"> <label class="redundant" for="query">Word</label> <input id="query" name="query" @@ -19,9 +18,9 @@ <input id="nsyl" name="nsyl" placeholder="n_syllables" value="{{ nsyl }}"/> - <label id="gender_label"> + <label id="gender_label"> <input type="checkbox" id="gender" name="gender" - {% if gender %} + {% if (mode != 'query') or gender == 'on' %} checked="{{ gender }}" {% endif %} /> diff --git a/templates/results.html b/templates/results.html @@ -1,15 +1,24 @@ {% extends "page.html" %} {% block body %} -<p>Displaying results for: <strong>{{ keys[0][-1] }}</strong></p> +<div> +<p id="result_header">Displaying +{{ displayed }} result{% if displayed != 1 %}s{% endif %} +{# results +{{ page * pagesize + 1 }} to {{ (page+1) * pagesize }} #} +{% if displayed < count %} +of {{ count }} +total result{% if count != 1 %}s{% endif %} +{% endif %} +for: <strong>{{ keys[0][-1] }}</strong></p> <table> <tr> - <th>word</th> - <th>pron<a href="about#pron" class="help">?</a></th> - <th>phon</th> - <th>eye</th> - <th>freq</th> - <th>derivation</th> + <th id="col_word">word</th> + <th id="col_pron">pron<a href="/#pron" class="help">?</a></th> + <th id="col_phon">phon</th> + <th id="col_eye">eye</th> + <th id="col_freq">freq</th> + <th id="col_deriations">derivations</th> {% for r in result %} <tr class="{{ loop.cycle('odd', 'even') }}"> <td>{{ r.word }}</td> @@ -17,10 +26,21 @@ <td class="num">{{ r.phon_rhyme }}</td> <td class="num">{{ r.word_rhyme }}</td> <td class="num">{{ r.freq }}</td> - <td>{{ r.derivation }}</td> + <td>{{ r.orig }}</td> </tr> {% endfor %} </table> - +</div> +{#<p class="page"> +{% if page > 0 %} +<a href="?query={{ q }}&nsyl={{ nsyl }}&gender={{ gender }}&page={{ + page - 1 }}">{{ page }}</a> &mdash; +{% 
endif %} +{{ page + 1 }} +{% if (page+1) * pagesize < count %} +&mdash; <a href="?query={{ q }}&nsyl={{ nsyl }}&gender={{ gender }}&page={{ + page + 1 }}">{{ page + 2 }}</a> +{% endif %} +</p>#} {% endblock %}