commit eb9c522929e8eb7784052a2dc4122009c0aa2bac
parent 77221a978a5e49fe5a16d6606559bc91576423e7
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Tue, 27 Dec 2011 00:45:56 +0100
sortof works now
Diffstat:
12 files changed, 205 insertions(+), 82 deletions(-)
diff --git a/README b/README
@@ -1,6 +1,4 @@
WARNING -- this code does *not* work yet!
-TODO: dos2unix
-TODO: placement in rhymes for enlacement?
drime - by Antoine Amarilli
A French rhyme dictionary
@@ -39,7 +37,9 @@ This can take some time: you can monitor progress using the pv utility:
To import the output of lexique2sql.sh in a MySQL database (on localhost,
database 'drime', as user 'drime', interactive password authentication), run:
- cat output.sql | mysql --default-character-set=utf8 -D drime -u drime -p
+ cat output.sql |
+ sed 's/varchar([0-9]*)/& collate utf8_bin/g' |
+ mysql --default-character-set=utf8 -D drime -u drime -p
== 4. Using the DB ==
diff --git a/db_mysql.py b/db_mysql.py
@@ -10,8 +10,7 @@ def run_query(r, v):
user=config['user'],
passwd=config['passwd'],
db=config['db'],
- cursorclass=MySQLdb.cursors.DictCursor,
- use_unicode=True)
+ cursorclass=MySQLdb.cursors.DictCursor)
cursor = db.cursor()
cursor.execute(r.replace('?', '%s'), v)
return cursor
diff --git a/lexique2sql.py b/lexique2sql.py
@@ -109,18 +109,16 @@ class Word:
@property
def sql(self):
render = {
- 'string': lambda s, w: '"'+escape(s)+'"',
- 'string2': lambda s, w:
- '"'+escape(
- ', '.join([x[0]+(' ('+x[1]+')' if w.word != x[1] else '')
- for x in s]))+'"',
- 'float': lambda s, w: str(s),
- 'int': lambda s, w: str(int(s)),
- 'bool': lambda s, w: str(int(s)),
+ 'string': lambda s: '"'+escape(s)+'"',
+ 'string2': lambda s:
+ '"'+escape(','.join([x[0] + '|' + x[1] for x in s]))+'"',
+ 'float': str,
+ 'int': lambda s: str(int(s)),
+ 'bool': lambda s: str(int(s)),
}
def sql_field(field):
(name, (ty, _)) = field
- return render[ty](getattr(self, name), self)
+ return render[ty](getattr(self, name))
return ('INSERT INTO words VALUES('
+ ', '.join([sql_field(f) for f in sql_fields])
+ ');')
@@ -128,6 +126,7 @@ class Word:
def __init__(self, word, phon, base, kind, freq):
self.word = word
self.phon = phon
+ base = base.split(',')[0] # workaround for lexique
self.orig = [(kind, base)]
self.freq = float(freq)
self.nsyl = None
diff --git a/lexique2sql.sh b/lexique2sql.sh
@@ -3,6 +3,8 @@
cd "$( dirname "$0" )"
cat - additions | # add custom exceptions
+ sort -k1,2 | # sort to aggregate duplicates TODO break ties by frequency
cut -f 1-8 | # select relevant fields
- awk '{FS=" "; OFS=" "; print $1, $2, $3, $4, ($5+$6)/2 + 100*($7+$8)/2}' | # aggregate frequencies
+ awk '{FS=" "; OFS=" ";
+ print $1, $2, $3, $4, ($5+$6)/2 + 100*($7+$8)/2}' | # aggregate frequencies
./lexique2sql.py
diff --git a/query.py b/query.py
@@ -5,7 +5,10 @@ import operator
from db_mysql import run_query
from common import from_xsampa, to_xsampa
-PAGESIZE=50
+PAGESIZE = 500
+
+class BadValues(Exception):
+ pass
def lcs(x, y):
"""Longest common suffix"""
@@ -18,10 +21,14 @@ def lcs(x, y):
def query(q, nsyl='', gender=True, page=0):
+ if not page:
+ page = 0
+ else:
+ page = int(page)
if not nsyl:
nsyl = ''
if not q:
- raise ValueError
+ raise BadValues
word = q.strip().split(' ')
nsyl = nsyl.strip()
if word[-1].startswith('[') and word[-1].endswith(']'):
@@ -44,7 +51,7 @@ def query(q, nsyl='', gender=True, page=0):
else:
elide = False
if len(syll) > 2:
- raise ValueError
+ raise BadValues
minsyll = int(syll[0])
if len(syll) == 1:
maxsyll = int(syll[0])
@@ -55,13 +62,50 @@ def query(q, nsyl='', gender=True, page=0):
page*PAGESIZE, PAGESIZE)
pass
+def decode_all(x):
+ for k in x.keys():
+ if isinstance(x[k], str):
+ try:
+ x[k] = x[k].decode('utf8')
+ except UnicodeDecodeError:
+ x[k] = x[k].decode('latin1')
+
+def get_key(x):
+ return (x['t1_word'], x['t1_phon'],
+ x['t1_word'] + ' [' + to_xsampa(x['t1_phon']) + ']')
+
def do_query(word, phon, minsyll, maxsyll, elide, gender, offset, size):
print ((word, phon, minsyll, maxsyll, elide, gender,))
+ print ((offset, size,))
cursor = run_query('''
- SELECT t1.freq AS t1_freq,
- t1.word AS t1_word,
+ SELECT t1.word AS t1_word,
+ t1.phon AS t1_phon
+ FROM words AS t1
+ WHERE (t1.word = ? OR ?) AND (t1.phon = ? OR ?)
+ ORDER BY t1.freq DESC
+ ''', (word, word == None, phon, phon == None,))
+ result = {}
+ for x in cursor:
+ decode_all(x)
+ result[get_key(x)] = x
+ if len(result.keys()) > 1 or result == {}:
+ return result, 0 # require disambiguation or is empty
+
+ rest = ''' FROM words AS t1, words AS t2
+ WHERE (t1.phon_end = t2.phon_end OR t1.word_end = t2.word_end)
+ AND (t1.word = ? OR ?) AND (t1.phon = ? OR ?)
+ AND ((? OR t2.max_nsyl >= ?)
+ AND (? OR t2.min_nsyl <= ?
+ OR (t2.elidable AND t2.min_nsyl - 1 <= ? AND ?)))
+ ORDER BY t2.freq, t1.phon, t1.word
+ '''
+ limit = '''LIMIT ? OFFSET ?'''
+
+ args = (word, word == None, phon, phon == None,
+ minsyll == None, minsyll, maxsyll == None, maxsyll, maxsyll, elide,)
+ query = '''
+ SELECT t1.word AS t1_word,
t1.phon AS t1_phon,
- t1.base AS t1_base,
t1.feminine AS t1_feminine,
t2.word AS t2_word,
t2.phon AS t2_phon,
@@ -69,63 +113,81 @@ def do_query(word, phon, minsyll, maxsyll, elide, gender, offset, size):
t2.min_nsyl AS t2_min_nsyl,
t2.max_nsyl AS t2_max_nsyl,
t2.elidable AS t2_elidable,
- t2.base AS t2_base,
- t2.kind AS t2_kind,
+ t2.orig AS t2_orig,
t2.feminine AS t2_feminine
- FROM words AS t1 INNER JOIN words AS t2 ON
- (t1.phon_end = t2.phon_end OR t1.word_end = t2.word_end)
- WHERE (t1.word = ? OR ?) AND (t1.phon = ? OR ?)
- AND ((? OR t2.max_nsyl >= ?)
- AND (? OR t2.min_nsyl <= ?
- OR (t2.elidable AND t2.min_nsyl - 1 <= ? AND ?)))
- ORDER BY t1.freq, t1.phon, t1.word
- ''', (word, word == None, phon, phon == None,
- minsyll == None, minsyll, maxsyll == None, maxsyll, maxsyll, elide,))
+ ''' + rest #+ limit
+ print (query)
+ cursor = run_query(query, args) #+ (size, offset,))
result = {}
- cache = {}
- seen = {}
for x in cursor:
+ for k in x.keys():
+ if isinstance(x[k], str):
+ try:
+ x[k] = x[k].decode('utf8')
+ except UnicodeDecodeError:
+ x[k] = x[k].decode('latin1')
if x['t1_feminine'] != x['t2_feminine'] and gender:
continue
- key = (x['t1_word'], x['t1_phon'],
- x['t1_word'] + ' [' + to_xsampa(x['t1_phon']) + ']')
+ key = get_key(x)
if key not in result.keys():
result[key] = []
- cache[key] = []
- seen[key] = set()
row = dict([
(k[3:], x[k]) for k in x.keys()
if k.startswith('t2_')])
- if row['base'] in seen[key]:
+ for k in row.keys():
+ if isinstance(row[k], str):
+ try:
+ row[k] = row[k].decode('utf8')
+ except UnicodeDecodeError:
+ row[k] = row[k].decode('latin1')
+ if (row['word'].endswith('-'+x['t1_word'])):
continue
- seen[key].add(row['base'])
+ if (row['word'] == x['t1_word'] and row['word'] == x['t1_word']
+ and ',' not in row['orig']):
+ continue # don't display the word if it has only one possible origin
row['freq'] = float(row['freq'])
row['phon_rhyme'] = lcs(x['t1_phon'], row['phon'])
row['word_rhyme'] = lcs(x['t1_word'], row['word'])
row['key'] = (
-row['phon_rhyme'], # phon_rhyme desc
-row['word_rhyme'], # eye_rhyme desc
- row['base'] != row['word'], # same as base
+ #TODO row['base'] != row['word'], # same as base
-row['freq'], # frequency desc
row['word'] # alphabetical order
)
- row['derivation'] = row['kind'] + (
- ' (' + row['base'] + ')'
- if row['base'] != row['word']
- else '')
row['phon'] = to_xsampa(row['phon'])
- if (row['word'] in [x['t1_word'], x['t1_base']]
- and row['phon'] == to_xsampa(x['t1_phon'])):
- cache[key].append(row)
- else:
- result[key].append(row)
+ result[key].append(row)
+
+ result2 = {}
+ seen = {}
for k in result.keys():
- # only display the word itself if multiple derivations are possible
- if len(cache[key]) > 1:
- result[k] += cache[key]
+ # TODO only display the word itself if multiple derivations are possible
result[k] = sorted(result[k], key=operator.itemgetter('key'))
- print(result)
- return result
+ result2[k] = []
+ seen[key] = set()
+ for row in result[k]:
+ bases = row['orig'].split(',')
+ ok = False
+ for i in range(len(bases)):
+ bases[i] = bases[i].split('|')
+ if bases[i][1] not in seen[key]:
+ ok = True
+ seen[key].add(bases[i][1])
+ if ok:
+ row['orig'] = ', '.join(
+ [a[0] + (' ('+a[1]+')' if row['word'] != a[1] else '')
+ for a in bases])
+ result2[k].append(row)
+ seen[key].add(row['word'])
+ count = len(result2[k])
+ result2[k] = result2[k][:PAGESIZE]
+
+ #cursor = run_query('''
+ #SELECT count(t2.word)
+ #''' + rest, args)
+ #for x in cursor:
+ #count = x[x.keys()[0]]
+ return result2, count
if __name__ == '__main__':
print(query(*sys.argv[1:]))
diff --git a/static/main.css b/static/main.css
@@ -3,7 +3,8 @@ h1 {
float: left;
margin: 0;
margin-right: 0.3em;
- font-size: 140%;
+ font-size: 100%;
+ padding: 0.2em;
}
#body {
@@ -44,10 +45,12 @@ label {
table, .faketable {
width: 100%;
+ margin: 0;
+ padding: 0;
}
.odd {
- background: #efe;
+ background: #dfd;
}
.help {
@@ -59,7 +62,7 @@ table, .faketable {
}
#query {
- width: 20em;
+ width: 12em;
}
#gender_label {
@@ -87,3 +90,39 @@ table, .faketable {
padding: 0.3em;
}
+#result_header {
+ background: #dfd;
+ margin: 0;
+ padding: 0.3em;
+}
+
+header form {
+ float: right;
+}
+
+header form input {
+ height: 100%;
+}
+
+.page {
+ text-align: center;
+}
+
+/*#col_word {
+ width: 10%;
+}
+#col_pron {
+ width: 10%;
+}
+#col_derivations {
+ width: 30%;
+}
+#col_freq {
+ width: 15%;
+}
+#col_phon {
+ width: 7%;
+}
+#col_eye {
+ width: 7%;
+}*/
diff --git a/templates/about.html b/templates/about.html
@@ -1,21 +1,19 @@
{% extends "page.html" %}
{% block body %}
-TODO move this to /
<p>Welcome to <strong>drime</strong>!</p>
<h2 id="info">Wait, what is this?</h2>
-<p>This is drime, <a href="http://a3nm.net">a3nm</a>'s attempt to build a better
-French rhyme dictionary. It uses the <a
- href="http://www.lexique.org/">Lexique</a> database with some customisations,
-and is powered by Python, Sqlite and Flask. You can get the code. TODO
-links.</p>
+<p>This is <a href="http://a3nm.net">a3nm</a>'s attempt to build a better French
+rhyme dictionary. It uses the <a href="http://www.lexique.org/">Lexique</a>
+database with some customisations, and is powered by Python, Sqlite and Flask.
+You can get the code. TODO links.</p>
<h2 id="help">How do I use it?</h2>
<p>In the <strong>word</strong> field, enter the word you want to get rhymes
for. Inferring pronunciation of unknown words isn't supported (yet), so don't
use proper nouns or rare words. You can also provide a pronunciation written
between square brackets using <a href="#pron">the right convention</a> to
disambiguate if multiple pronunciations are possible. Example: <a
- href="/?query=fils%20[fis]">fils [fis]</a>.</p>
+ href="/query?query=fils%20[fis]">fils [fis]</a>.</p>
<p>In the <strong>n_syllables</strong> field, you can specify a number of
syllables to limit on. You can either specify an exact number or a range (eg.
"1-3"). You can suffix a "+" to indicate that you can accept one syllable more if
@@ -44,5 +42,10 @@ pronunciation indicated is a colloquial one and not the one that you would use t
versify (for instance, "placement" is reported as "plasmA~" but would be read as
"plas@mA~"). Heuristics are used to work around this when filtering on the
number of syllables.</p>
+
+<h2>What about the name?</h2>
+
+<p>It's pronounced [dRim]: a pun on "dream" (the dream rhyme dictionary), and
+also short for "dictionnaire de rimes" in French.</p>
{% endblock %}
diff --git a/templates/disambig.html b/templates/disambig.html
@@ -6,11 +6,11 @@
{% for k in keys %}
<li class="{{ loop.cycle('odd', 'even') }}">
<a href="?query={{ k[-1] | escape }}&nsyl={{ nsyl }}&gender={{ gender }}">
- {{ k[0] }} [{{ k[1] }}], rhyming with
+ {{ k[0] }} [{{ k[1] }}]{#, rhyming with
{% for v in example[k][:5] %}
{{ v.word }}
{% endfor %}
- ...
+ ... #}
</a>
</li>
{% endfor %}
diff --git a/templates/error.html b/templates/error.html
@@ -1,6 +1,6 @@
{% extends "page.html" %}
{% block body %}
-<p>Invalid values supplied. Need some <a href="about#help">help</a>?</p>
+<p>Invalid values supplied. Need some <a href="/#help">help</a>?</p>
{% endblock %}
diff --git a/templates/notfound.html b/templates/notfound.html
@@ -2,7 +2,7 @@
{% block body %}
<p>No known word matches this query. Make sure you're using a word from the
-dictionary. Remove syllabe count restrictions, if any. Need some <a
- href="about#help">help</a>?</p>
+dictionary. Need some <a
+ href="/#help">help</a>?</p>
{% endblock %}
diff --git a/templates/page.html b/templates/page.html
@@ -9,7 +9,6 @@
<body>
<header>
<h1><a href="/">drime</a></h1>
- <div id="about"><a href="about">about</a></div>
<form method="GET" action="query">
<label class="redundant" for="query">Word</label>
<input id="query" name="query"
@@ -19,9 +18,9 @@
<input id="nsyl" name="nsyl"
placeholder="n_syllables"
value="{{ nsyl }}"/>
- <label id="gender_label">
+ <label id="gender_label">
<input type="checkbox" id="gender" name="gender"
- {% if gender %}
+ {% if (mode != 'query') or gender == 'on' %}
checked="{{ gender }}"
{% endif %}
/>
diff --git a/templates/results.html b/templates/results.html
@@ -1,15 +1,24 @@
{% extends "page.html" %}
{% block body %}
-<p>Displaying results for: <strong>{{ keys[0][-1] }}</strong></p>
+<div>
+<p id="result_header">Displaying
+{{ displayed }} result{% if displayed != 1 %}s{% endif %}
+{# results
+{{ page * pagesize + 1 }} to {{ (page+1) * pagesize }} #}
+{% if displayed < count %}
+of {{ count }}
+total result{% if count != 1 %}s{% endif %}
+{% endif %}
+for: <strong>{{ keys[0][-1] }}</strong></p>
<table>
<tr>
- <th>word</th>
- <th>pron<a href="about#pron" class="help">?</a></th>
- <th>phon</th>
- <th>eye</th>
- <th>freq</th>
- <th>derivation</th>
+ <th id="col_word">word</th>
+ <th id="col_pron">pron<a href="/#pron" class="help">?</a></th>
+ <th id="col_phon">phon</th>
+ <th id="col_eye">eye</th>
+ <th id="col_freq">freq</th>
+ <th id="col_derivations">derivations</th>
{% for r in result %}
<tr class="{{ loop.cycle('odd', 'even') }}">
<td>{{ r.word }}</td>
@@ -17,10 +26,21 @@
<td class="num">{{ r.phon_rhyme }}</td>
<td class="num">{{ r.word_rhyme }}</td>
<td class="num">{{ r.freq }}</td>
- <td>{{ r.derivation }}</td>
+ <td>{{ r.orig }}</td>
</tr>
{% endfor %}
</table>
-
+</div>
+{#<p class="page">
+{% if page > 0 %}
+<a href="?query={{ q }}&nsyl={{ nsyl }}&gender={{ gender }}&page={{
+ page - 1 }}">{{ page }}</a> —
+{% endif %}
+{{ page + 1 }}
+{% if (page+1) * pagesize < count %}
+— <a href="?query={{ q }}&nsyl={{ nsyl }}&gender={{ gender }}&page={{
+ page + 1 }}">{{ page + 2 }}</a>
+{% endif %}
+</p>#}
{% endblock %}