infer pronunciation of unknown words - drime - French rhyme dictionary with web and CLI interface

commit e6b424a2503b9d1e8ecbaa3bd0548ce462877208
parent 6b3e556ced2fe137360ad87870a8a0419329f20b
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Sat, 26 Oct 2013 12:57:18 +0200

infer pronunciation of unknown words

Diffstat:
drime.py  | 3 ++-
query.py  | 35 ++++++++++++++++++++++++++++-------

2 files changed, 30 insertions(+), 8 deletions(-)
diff --git a/drime.py b/drime.py
@@ -45,12 +45,13 @@ def q():
   except ValueError:
     d['page'] = 0
   try:
-    r, count = query.query(**d)
+    r, count, sure = query.query(**d)
     d['lang'] = get_locale()
     d['pagesize'] = query.PAGESIZE
     d['mode'] = 'query'
     d['title'] = get_title()
     d['count'] = count
+    d['sure'] = sure
     d['displayed'] = min(d['pagesize'], count)
     d['keys'] = r['keys']
     if len(r['keys']) == 0:
diff --git a/query.py b/query.py
@@ -6,7 +6,9 @@ import codecs
 import operator
 from db_mysql import run_query
 from common import from_xsampa, to_xsampa
-from rhyme import Rhyme, Constraint
+from rhyme import Rhyme, Constraint, escape
+import frhyme
+from lexique2sql import Word
 
 PAGESIZE = 500
 
@@ -113,8 +115,24 @@ def do_query(word, phon, minsyll, maxsyll, elide, gender, classical, offset, siz
     decode_all(x)
     key = get_key(x)
     keys.append(key)
-  if len(keys) > 1 or keys == []:
-    return {'keys': keys}, 0 # require disambiguation or is empty
+  sure = True
+  if len(keys) > 1:
+    return {'keys': keys}, 0, True # require disambiguation or is empty
+  if len(keys) == 0:
+    if not word:
+      return {'keys': keys}, 0, True # we need a word to infer anything
+    if not phon:
+      # infer from what was given
+      sure = False
+      s = word.split(' ')[-3:]
+      prons = [frhyme.lookup(escape(w))[0][1] for w in s]
+      phon = ''.join(prons)
+    # now, create a dummy entry for what was provided
+    w = Word(word.encode('utf-8'), phon.encode('utf-8'), '', '', '1',
+            do_extends=False)
+    x = {'word': w.word, 'phon': w.phon, 'word_end': w.word_end, 'phon_end':
+        w.phon_end, 'feminine': w.feminine, 'orig': '|'}
+    key = get_key(x)
   word = x['word']
   phon = x['phon']
   word_end = x['word_end']
@@ -137,13 +155,14 @@ def do_query(word, phon, minsyll, maxsyll, elide, gender, classical, offset, siz
   cursor = run_query(query, args) #+ (size, offset,))
 
   result = []
+  bword = word.split(' ')[-1]
   for row in cursor:
     decode_all(row)
     if feminine != row['feminine'] and gender:
       continue
-    if (row['word'].endswith('-'+word)):
+    if (row['word'].endswith('-'+bword)):
       continue
-    if (row['word'] == word and row['word'] == word
+    if (row['word'] == bword and row['word'] == bword
         and ',' not in row['orig']):
       continue # don't display the word if it has only one possible origin
     if classical:
@@ -192,7 +211,7 @@ def do_query(word, phon, minsyll, maxsyll, elide, gender, classical, offset, siz
     #''' + rest, args)
   #for x in cursor:
     #count = x[x.keys()[0]]
-  return {'keys': [key], 'result': result2}, count
+  return {'keys': [key], 'result': result2}, count, sure
 
 if __name__ == '__main__':
   # work around encoding issues
@@ -205,7 +224,7 @@ if __name__ == '__main__':
     for p in [3, 4]:
       if p < len(sys.argv):
         sys.argv[p] = convert(sys.argv[p])
-    r, c = query(*sys.argv[1:])
+    r, c, sure = query(*sys.argv[1:])
   except BadValues:
     print ("Bad values passed as arguments.")
     usage()
@@ -222,6 +241,8 @@ if __name__ == '__main__':
       print ("  - %s" % k[-1] + ' -- ' + k[2])
     print ("Please rerun with a more specific query")
     sys.exit(2)
+  if not sure:
+    print ("Warning: word is unknown, pronunciation is inferred, please check")
   result = [["word", "phon", "pr", "wr", "freq", "orig"]] + [
       (x['word'],
        x['phon'],

	drime French rhyme dictionary with web and CLI interface
	git clone https://a3nm.net/git/drime/
	Log \| Files \| Refs \| README