sortof works now - drime - French rhyme dictionary with web and CLI interface

commit eb9c522929e8eb7784052a2dc4122009c0aa2bac
parent 77221a978a5e49fe5a16d6606559bc91576423e7
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Tue, 27 Dec 2011 00:45:56 +0100

sortof works now

Diffstat:
README  | 6 +++---
db_mysql.py  | 3 +--
lexique2sql.py  | 17 ++++++++---------
lexique2sql.sh  | 4 +++-
query.py  | 142 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
static/main.css  | 45 ++++++++++++++++++++++++++++++++++++++++++---
templates/about.html  | 17 ++++++++++-------
templates/disambig.html  | 4 ++--
templates/error.html  | 2 +-
templates/notfound.html  | 4 ++--
templates/page.html  | 5 ++---
templates/results.html  | 38 +++++++++++++++++++++++++++++---------

12 files changed, 205 insertions(+), 82 deletions(-)
diff --git a/README b/README
@@ -1,6 +1,4 @@
 WARNING -- this code does *not* work yet!
-TODO: dos2unix
-TODO: placement in rhymes for enlacement?
 
 drime - by Antoine Amarilli
 A French rhyme dictionary
@@ -39,7 +37,9 @@ This can take some time: you can monitor progress using the pv utility:
 To import the output of lexique2sql.sh in a MySQL database (on localhost,
 database 'drime', as user 'drime', interactive password authentication), run:
 
-  cat output.sql | mysql --default-character-set=utf8 -D drime -u drime -p
+  cat output.sql |
+    sed 's/varchar([0-9]*)/& collate utf8_bin/g' |
+    mysql --default-character-set=utf8 -D drime -u drime -p
 
 == 4. Using the DB ==
 
diff --git a/db_mysql.py b/db_mysql.py
@@ -10,8 +10,7 @@ def run_query(r, v):
       user=config['user'],
       passwd=config['passwd'],
       db=config['db'],
-      cursorclass=MySQLdb.cursors.DictCursor,
-      use_unicode=True)
+      cursorclass=MySQLdb.cursors.DictCursor)
   cursor = db.cursor()
   cursor.execute(r.replace('?', '%s'), v)
   return cursor
diff --git a/lexique2sql.py b/lexique2sql.py
@@ -109,18 +109,16 @@ class Word:
   @property
   def sql(self):
     render = {
-      'string': lambda s, w: '"'+escape(s)+'"',
-      'string2': lambda s, w:
-            '"'+escape(
-              ', '.join([x[0]+(' ('+x[1]+')' if w.word != x[1] else '')
-                for x in s]))+'"',
-      'float': lambda s, w: str(s),
-      'int': lambda s, w: str(int(s)),
-      'bool': lambda s, w: str(int(s)),
+      'string': lambda s: '"'+escape(s)+'"',
+      'string2': lambda s:
+            '"'+escape(','.join([x[0] + '|' + x[1] for x in s]))+'"',
+      'float': str,
+      'int': lambda s: str(int(s)),
+      'bool': lambda s: str(int(s)),
     }
     def sql_field(field):
       (name, (ty, _)) = field
-      return render[ty](getattr(self, name), self)
+      return render[ty](getattr(self, name))
     return ('INSERT INTO words VALUES('
             + ', '.join([sql_field(f) for f in sql_fields])
             + ');')
@@ -128,6 +126,7 @@ class Word:
   def __init__(self, word, phon, base, kind, freq):
     self.word = word
     self.phon = phon
+    base = base.split(',')[0] # workaround for lexique
     self.orig = [(kind, base)]
     self.freq = float(freq)
     self.nsyl = None
diff --git a/lexique2sql.sh b/lexique2sql.sh
@@ -3,6 +3,8 @@
 cd "$( dirname "$0" )"
 
 cat - additions | # add custom exceptions
+  sort -k1,2 | # sort to aggregate duplicates TODO break ties by frequency
   cut -f 1-8 | # select relevant fields
-  awk '{FS="	"; OFS="	"; print $1, $2, $3, $4, ($5+$6)/2 + 100*($7+$8)/2}' | # aggregate frequencies
+  awk '{FS="	"; OFS="	";
+    print $1, $2, $3, $4, ($5+$6)/2 + 100*($7+$8)/2}' | # aggregate frequencies
   ./lexique2sql.py
diff --git a/query.py b/query.py
@@ -5,7 +5,10 @@ import operator
 from db_mysql import run_query
 from common import from_xsampa, to_xsampa
 
-PAGESIZE=50
+PAGESIZE = 500
+
+class BadValues(Exception):
+  pass
 
 def lcs(x, y):
   """Longest common suffix"""
@@ -18,10 +21,14 @@ def lcs(x, y):
 
 
 def query(q, nsyl='', gender=True, page=0):
+  if not page:
+    page = 0
+  else:
+    page = int(page)
   if not nsyl:
     nsyl = ''
   if not q:
-    raise ValueError
+    raise BadValues
   word = q.strip().split(' ')
   nsyl = nsyl.strip()
   if word[-1].startswith('[') and word[-1].endswith(']'):
@@ -44,7 +51,7 @@ def query(q, nsyl='', gender=True, page=0):
     else:
       elide = False
     if len(syll) > 2:
-      raise ValueError
+      raise BadValues
     minsyll = int(syll[0])
     if len(syll) == 1:
       maxsyll = int(syll[0])
@@ -55,13 +62,50 @@ def query(q, nsyl='', gender=True, page=0):
       page*PAGESIZE, PAGESIZE)
 pass
 
+def decode_all(x):
+  for k in x.keys():
+    if isinstance(x[k], str):
+      try:
+        x[k] = x[k].decode('utf8')
+      except UnicodeDecodeError:
+        x[k] = x[k].decode('latin1')
+
+def get_key(x):
+  return (x['t1_word'], x['t1_phon'],
+        x['t1_word'] + ' [' + to_xsampa(x['t1_phon']) + ']')
+
 def do_query(word, phon, minsyll, maxsyll, elide, gender, offset, size):
   print ((word, phon, minsyll, maxsyll, elide, gender,))
+  print ((offset, size,))
   cursor = run_query('''
-    SELECT t1.freq AS t1_freq,
-        t1.word AS t1_word,
+    SELECT t1.word AS t1_word,
+        t1.phon AS t1_phon
+    FROM words AS t1
+    WHERE (t1.word = ? OR ?) AND (t1.phon = ? OR ?)
+    ORDER BY t1.freq DESC
+    ''', (word, word == None, phon, phon == None,))
+  result = {}
+  for x in cursor:
+    decode_all(x)
+    result[get_key(x)] = x
+  if len(result.keys()) > 1 or result == {}:
+    return result, 0 # require disambiguation or is empty
+
+  rest = ''' FROM words AS t1, words AS t2
+    WHERE (t1.phon_end = t2.phon_end OR t1.word_end = t2.word_end)
+        AND (t1.word = ? OR ?) AND (t1.phon = ? OR ?)
+        AND ((? OR t2.max_nsyl >= ?)
+        AND (? OR t2.min_nsyl <= ?
+            OR (t2.elidable AND t2.min_nsyl - 1 <= ? AND ?)))
+    ORDER BY t2.freq, t1.phon, t1.word
+    '''
+  limit = '''LIMIT ? OFFSET ?'''
+
+  args = (word, word == None, phon, phon == None,
+        minsyll == None, minsyll, maxsyll == None, maxsyll, maxsyll, elide,)
+  query = '''
+    SELECT t1.word AS t1_word,
         t1.phon AS t1_phon,
-        t1.base AS t1_base,
         t1.feminine AS t1_feminine,
         t2.word AS t2_word,
         t2.phon AS t2_phon,
@@ -69,63 +113,81 @@ def do_query(word, phon, minsyll, maxsyll, elide, gender, offset, size):
         t2.min_nsyl AS t2_min_nsyl,
         t2.max_nsyl AS t2_max_nsyl,
         t2.elidable AS t2_elidable,
-        t2.base AS t2_base,
-        t2.kind AS t2_kind,
+        t2.orig AS t2_orig,
         t2.feminine AS t2_feminine
-    FROM words AS t1 INNER JOIN words AS t2 ON
-        (t1.phon_end = t2.phon_end OR t1.word_end = t2.word_end)
-    WHERE (t1.word = ? OR ?) AND (t1.phon = ? OR ?)
-        AND ((? OR t2.max_nsyl >= ?)
-        AND (? OR t2.min_nsyl <= ?
-            OR (t2.elidable AND t2.min_nsyl - 1 <= ? AND ?)))
-    ORDER BY t1.freq, t1.phon, t1.word
-    ''', (word, word == None, phon, phon == None,
-        minsyll == None, minsyll, maxsyll == None, maxsyll, maxsyll, elide,))
+      ''' + rest #+ limit
+  print (query)
+  cursor = run_query(query, args) #+ (size, offset,))
   result = {}
-  cache = {}
-  seen = {}
   for x in cursor:
+    for k in x.keys():
+      if isinstance(x[k], str):
+        try:
+          x[k] = x[k].decode('utf8')
+        except UnicodeDecodeError:
+          x[k] = x[k].decode('latin1')
     if x['t1_feminine'] != x['t2_feminine'] and gender:
       continue
-    key = (x['t1_word'], x['t1_phon'],
-        x['t1_word'] + ' [' + to_xsampa(x['t1_phon']) + ']')
+    key = get_key(x)
     if key not in result.keys():
       result[key] = []
-      cache[key] = []
-      seen[key] = set()
     row = dict([
         (k[3:], x[k]) for k in x.keys()
             if k.startswith('t2_')])
-    if row['base'] in seen[key]:
+    for k in row.keys():
+      if isinstance(row[k], str):
+        try:
+          row[k] = row[k].decode('utf8')
+        except UnicodeDecodeError:
+          row[k] = row[k].decode('latin1')
+    if (row['word'].endswith('-'+x['t1_word'])):
       continue
-    seen[key].add(row['base'])
+    if (row['word'] == x['t1_word'] and row['word'] == x['t1_word']
+        and ',' not in row['orig']):
+      continue # don't display the word if it has only one possible origin
     row['freq'] = float(row['freq'])
     row['phon_rhyme'] = lcs(x['t1_phon'], row['phon'])
     row['word_rhyme'] = lcs(x['t1_word'], row['word'])
     row['key'] = (
         -row['phon_rhyme'], # phon_rhyme desc
         -row['word_rhyme'], # eye_rhyme desc
-        row['base'] != row['word'], # same as base
+        #TODO row['base'] != row['word'], # same as base
         -row['freq'], # frequency desc
         row['word'] # alphabetical order
         )
-    row['derivation'] = row['kind'] + (
-          ' (' + row['base'] + ')'
-          if row['base'] != row['word']
-          else '')
     row['phon'] = to_xsampa(row['phon'])
-    if (row['word'] in [x['t1_word'], x['t1_base']]
-        and row['phon'] == to_xsampa(x['t1_phon'])):
-      cache[key].append(row)
-    else:
-      result[key].append(row)
+    result[key].append(row)
+
+  result2 = {}
+  seen = {}
   for k in result.keys():
-    # only display the word itself if multiple derivations are possible
-    if len(cache[key]) > 1:
-      result[k] += cache[key]
+    # TODO only display the word itself if multiple derivations are possible
     result[k] = sorted(result[k], key=operator.itemgetter('key'))
-  print(result)
-  return result
+    result2[k] = []
+    seen[key] = set()
+    for row in result[k]:
+      bases = row['orig'].split(',')
+      ok = False
+      for i in range(len(bases)):
+        bases[i] = bases[i].split('|')
+        if bases[i][1] not in seen[key]:
+          ok = True
+        seen[key].add(bases[i][1])
+      if ok:
+        row['orig'] = ', '.join(
+              [a[0] + (' ('+a[1]+')' if row['word'] != a[1] else '')
+                for a in bases])
+        result2[k].append(row)
+        seen[key].add(row['word'])
+    count = len(result2[k])
+    result2[k] = result2[k][:PAGESIZE]
+
+  #cursor = run_query('''
+    #SELECT count(t2.word)
+    #''' + rest, args)
+  #for x in cursor:
+    #count = x[x.keys()[0]]
+  return result2, count
 
 if __name__ == '__main__':
     print(query(*sys.argv[1:]))
diff --git a/static/main.css b/static/main.css
@@ -3,7 +3,8 @@ h1 {
   float: left;
   margin: 0;
   margin-right: 0.3em;
-  font-size: 140%;
+  font-size: 100%;
+  padding: 0.2em;
 }
 
 #body {
@@ -44,10 +45,12 @@ label {
 
 table, .faketable {
   width: 100%;
+  margin: 0;
+  padding: 0;
 }
 
 .odd {
-  background: #efe;
+  background: #dfd;
 }
 
 .help {
@@ -59,7 +62,7 @@ table, .faketable {
 }
 
 #query {
-  width: 20em;
+  width: 12em;
 }
 
 #gender_label {
@@ -87,3 +90,39 @@ table, .faketable {
   padding: 0.3em;
 }
 
+#result_header {
+  background: #dfd;
+  margin: 0;
+  padding: 0.3em;
+}
+
+header form {
+  float: right;
+}
+
+header form input {
+  height: 100%;
+}
+
+.page {
+  text-align: center;
+}
+
+/*#col_word {
+  width: 10%;
+}
+#col_pron {
+  width: 10%;
+}
+#col_derivations {
+  width: 30%;
+}
+#col_freq {
+  width: 15%;
+}
+#col_phon {
+  width: 7%;
+}
+#col_eye {
+  width: 7%;
+}*/
diff --git a/templates/about.html b/templates/about.html
@@ -1,21 +1,19 @@
 {% extends "page.html" %}
 
 {% block body %}
-TODO move this to /
 <p>Welcome to <strong>drime</strong>!</p>
 <h2 id="info">Wait, what is this?</h2>
-<p>This is drime, <a href="http://a3nm.net">a3nm</a>'s attempt to build a better
-French rhyme dictionary. It uses the <a
-  href="http://www.lexique.org/">Lexique</a> database with some customisations,
-and is powered by Python, Sqlite and Flask. You can get the code. TODO
-links.</p>
+<p>This is <a href="http://a3nm.net">a3nm</a>'s attempt to build a better French
+rhyme dictionary. It uses the <a href="http://www.lexique.org/">Lexique</a>
+database with some customisations, and is powered by Python, Sqlite and Flask.
+You can get the code. TODO links.</p>
 <h2 id="help">How do I use it?</h2>
 <p>In the <strong>word</strong> field, enter the word you want to get rhymes
 for. Inferring pronunciation of unknown words isn't supported (yet), so don't
 use proper nouns or rare words. You can also provide a pronunciation written
 between square brackets using <a href="#pron">the right convention</a> to
 disambiguate if multiple pronunciations are possible. Example: <a
-  href="/?query=fils%20[fis]">fils [fis]</a>.</p>
+  href="/query?query=fils%20[fis]">fils [fis]</a>.</p>
 <p>In the <strong>n_syllables</strong> field, you can specify a number of
 syllables to limit on. You can either specify an exact number or a range (eg.
 "1-3"). You can suffix a "+" to indicate that you can accept one syllable more if
@@ -44,5 +42,10 @@ pronunciation indicated is a colloquial one and not the one that you would use t
 versify (for instance, "placement" is reported as "plasmA~" but would be read as
 "plas@mA~"). Heuristics are used to work around this when filtering on the
 number of syllables.</p>
+
+<h2>What about the name?</h2>
+
+<p>It's pronounced [dRim], as a pun with "dream" (the dream rhyme dictionary)
+but also because it is short for "dictionnaire de rimes" in French.</p>
 {% endblock %}
 
diff --git a/templates/disambig.html b/templates/disambig.html
@@ -6,11 +6,11 @@
 {% for k in keys %}
 <li class="{{ loop.cycle('odd', 'even') }}">
 <a href="?query={{ k[-1] | escape }}&nsyl={{ nsyl }}&gender={{ gender }}">
-    {{ k[0] }} [{{ k[1] }}], rhyming with  
+  {{ k[0] }} [{{ k[1] }}]{#, rhyming with  
       {% for v in example[k][:5] %}
         {{ v.word }}
         {% endfor %}
-        ...
+        ... #}
   </a>
     </li>
 {% endfor %}
diff --git a/templates/error.html b/templates/error.html
@@ -1,6 +1,6 @@
 {% extends "page.html" %}
 
 {% block body %}
-<p>Invalid values supplied. Need some <a href="about#help">help</a>?</p>
+<p>Invalid values supplied. Need some <a href="/#help">help</a>?</p>
 {% endblock %}
 
diff --git a/templates/notfound.html b/templates/notfound.html
@@ -2,7 +2,7 @@
 
 {% block body %}
 <p>No known word matches this query. Make sure you're using a word from the
-dictionary. Remove syllabe count restrictions, if any. Need some <a
-  href="about#help">help</a>?</p>
+dictionary. Need some <a
+  href="/#help">help</a>?</p>
 {% endblock %}
 
diff --git a/templates/page.html b/templates/page.html
@@ -9,7 +9,6 @@
   <body>
     <header>
     <h1><a href="/">drime</a></h1>
-    <div id="about"><a href="about">about</a></div>
     <form method="GET" action="query">
       <label class="redundant" for="query">Word</label>
       <input id="query" name="query"
@@ -19,9 +18,9 @@
       <input id="nsyl" name="nsyl"
         placeholder="n_syllables"
         value="{{ nsyl }}"/>
-      <label id="gender_label">
+        <label id="gender_label">
       <input type="checkbox" id="gender" name="gender"
-        {% if gender %}
+        {% if (mode != 'query') or gender == 'on' %}
         checked="{{ gender }}"
         {% endif %}
         />
diff --git a/templates/results.html b/templates/results.html
@@ -1,15 +1,24 @@
 {% extends "page.html" %}
 
 {% block body %}
-<p>Displaying results for: <strong>{{ keys[0][-1] }}</strong></p>
+<div>
+<p id="result_header">Displaying
+{{ displayed }} result{% if displayed != 1 %}s{% endif %}
+{# results
+{{ page * pagesize + 1 }} to {{ (page+1) * pagesize }} #}
+{% if displayed < count %}
+of {{ count }}
+total result{% if count != 1 %}s{% endif %}
+{% endif %}
+for: <strong>{{ keys[0][-1] }}</strong></p>
 <table>
   <tr>
-    <th>word</th>
-    <th>pron<a href="about#pron" class="help">?</a></th>
-    <th>phon</th>
-    <th>eye</th>
-    <th>freq</th>
-    <th>derivation</th>
+    <th id="col_word">word</th>
+    <th id="col_pron">pron<a href="/#pron" class="help">?</a></th>
+    <th id="col_phon">phon</th>
+    <th id="col_eye">eye</th>
+    <th id="col_freq">freq</th>
+    <th id="col_deriations">derivations</th>
 {% for r in result %}
 <tr class="{{ loop.cycle('odd', 'even') }}">
   <td>{{ r.word }}</td>
@@ -17,10 +26,21 @@
   <td class="num">{{ r.phon_rhyme }}</td>
   <td class="num">{{ r.word_rhyme }}</td>
   <td class="num">{{ r.freq }}</td>
-  <td>{{ r.derivation }}</td>
+  <td>{{ r.orig }}</td>
 </tr>
 {% endfor %}
 </table>
-
+</div>
+{#<p class="page">
+{% if page > 0 %}
+<a href="?query={{ q }}&nsyl={{ nsyl }}&gender={{ gender }}&page={{
+  page - 1 }}">{{ page }}</a> &mdash;
+{% endif %}
+{{ page + 1 }}
+{% if (page+1) * pagesize < count %}
+&mdash; <a href="?query={{ q }}&nsyl={{ nsyl }}&gender={{ gender }}&page={{
+  page + 1 }}">{{ page + 2 }}</a>
+{% endif %}
+</p>#}
 {% endblock %}

	drime French rhyme dictionary with web and CLI interface
	git clone https://a3nm.net/git/drime/
	Log | Files | Refs | README

README	|	6	+++---
db_mysql.py	|	3	+--
lexique2sql.py	|	17	++++++++---------
lexique2sql.sh	|	4	+++-
query.py	|	142	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
static/main.css	|	45	++++++++++++++++++++++++++++++++++++++++++---
templates/about.html	|	17	++++++++++-------
templates/disambig.html	|	4	++--
templates/error.html	|	2	+-
templates/notfound.html	|	4	++--
templates/page.html	|	5	++---
templates/results.html	|	38	+++++++++++++++++++++++++++++---------