commit e6b424a2503b9d1e8ecbaa3bd0548ce462877208
parent 6b3e556ced2fe137360ad87870a8a0419329f20b
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Sat, 26 Oct 2013 12:57:18 +0200
infer pronunciation of unknown words
Diffstat:
2 files changed, 30 insertions(+), 8 deletions(-)
diff --git a/drime.py b/drime.py
@@ -45,12 +45,13 @@ def q():
except ValueError:
d['page'] = 0
try:
- r, count = query.query(**d)
+ r, count, sure = query.query(**d)
d['lang'] = get_locale()
d['pagesize'] = query.PAGESIZE
d['mode'] = 'query'
d['title'] = get_title()
d['count'] = count
+ d['sure'] = sure
d['displayed'] = min(d['pagesize'], count)
d['keys'] = r['keys']
if len(r['keys']) == 0:
diff --git a/query.py b/query.py
@@ -6,7 +6,9 @@ import codecs
import operator
from db_mysql import run_query
from common import from_xsampa, to_xsampa
-from rhyme import Rhyme, Constraint
+from rhyme import Rhyme, Constraint, escape
+import frhyme
+from lexique2sql import Word
PAGESIZE = 500
@@ -113,8 +115,24 @@ def do_query(word, phon, minsyll, maxsyll, elide, gender, classical, offset, siz
decode_all(x)
key = get_key(x)
keys.append(key)
- if len(keys) > 1 or keys == []:
- return {'keys': keys}, 0 # require disambiguation or is empty
+ sure = True
+ if len(keys) > 1:
+ return {'keys': keys}, 0, True # require disambiguation or is empty
+ if len(keys) == 0:
+ if not word:
+ return {'keys': keys}, 0, True # we need a word to infer anything
+ if not phon:
+ # infer from what was given
+ sure = False
+ s = word.split(' ')[-3:]
+ prons = [frhyme.lookup(escape(w))[0][1] for w in s]
+ phon = ''.join(prons)
+ # now, create a dummy entry for what was provided
+ w = Word(word.encode('utf-8'), phon.encode('utf-8'), '', '', '1',
+ do_extends=False)
+ x = {'word': w.word, 'phon': w.phon, 'word_end': w.word_end, 'phon_end':
+ w.phon_end, 'feminine': w.feminine, 'orig': '|'}
+ key = get_key(x)
word = x['word']
phon = x['phon']
word_end = x['word_end']
@@ -137,13 +155,14 @@ def do_query(word, phon, minsyll, maxsyll, elide, gender, classical, offset, siz
cursor = run_query(query, args) #+ (size, offset,))
result = []
+ bword = word.split(' ')[-1]
for row in cursor:
decode_all(row)
if feminine != row['feminine'] and gender:
continue
- if (row['word'].endswith('-'+word)):
+ if (row['word'].endswith('-'+bword)):
continue
- if (row['word'] == word and row['word'] == word
+ if (row['word'] == bword and row['word'] == bword
and ',' not in row['orig']):
continue # don't display the word if it has only one possible origin
if classical:
@@ -192,7 +211,7 @@ def do_query(word, phon, minsyll, maxsyll, elide, gender, classical, offset, siz
#''' + rest, args)
#for x in cursor:
#count = x[x.keys()[0]]
- return {'keys': [key], 'result': result2}, count
+ return {'keys': [key], 'result': result2}, count, sure
if __name__ == '__main__':
# work around encoding issues
@@ -205,7 +224,7 @@ if __name__ == '__main__':
for p in [3, 4]:
if p < len(sys.argv):
sys.argv[p] = convert(sys.argv[p])
- r, c = query(*sys.argv[1:])
+ r, c, sure = query(*sys.argv[1:])
except BadValues:
print ("Bad values passed as arguments.")
usage()
@@ -222,6 +241,8 @@ if __name__ == '__main__':
print (" - %s" % k[-1] + ' -- ' + k[2])
print ("Please rerun with a more specific query")
sys.exit(2)
+ if not sure:
+ print ("Warning: word is unknown, pronunciation is inferred, please check")
result = [["word", "phon", "pr", "wr", "freq", "orig"]] + [
(x['word'],
x['phon'],