drime

French rhyme dictionary with web and CLI interface
git clone https://a3nm.net/git/drime/
Log | Files | Refs | README

commit 3937aa9cb6899ea053a09a3c2c111319cc37a93e
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Wed, 17 Aug 2011 19:16:05 -0400

initial commit

Diffstat:
manage.py | 190+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
prepare.sh | 6++++++
query.sh | 10++++++++++
reorder.py | 64++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 270 insertions(+), 0 deletions(-)

diff --git a/manage.py b/manage.py
@@ -0,0 +1,190 @@
+#!/usr/bin/python3 -O
+
+# TODO frequencies are off
+
+import haspirater
+import metric
+import sys
+
+seen = {}
+
+vowels = "aàâãeéèêëiîïoôöuùûüy"
+consonants = "bcçdfghjklmnpqrstvwxz"
+
+phon_vowels = "()$#289aeEioOuy@"
+
+sure_end_fem = ['e', 'es', 'ent']
+phon_non_end_fem = ['#', ')']
+
+class Word:
+  @property
+  def elidable(self):
+    return self.word[0] in vowels or (self.word[0] == 'h' and
+        not haspirater.lookup(self.word))
+
+  @property
+  def phon_ending(self):
+    l = []
+    w = list(self.phon)
+    w.reverse()
+    for x in w:
+      l.append(x)
+      if x in phon_vowels:
+        break
+    l.reverse()
+    return ''.join(l)
+
+  @property
+  def ending(self):
+    l = []
+    w = list(self.word)
+    count = 0
+    w.reverse()
+    for x in w:
+      if x in vowels or x in consonants:
+        l.append(x)
+      if x in vowels and count >= 1:
+        break
+      count += 1
+    l.reverse()
+    return ''.join(l)
+
+  @property
+  def feminine(self):
+    for end in sure_end_fem:
+      if self.word.endswith(end):
+        return True
+    if not self.word.endswith('ent'):
+      return False
+    for end in phon_non_end_fem:
+      if self.phon.endswith(end):
+        return False
+    return True
+
+  @property
+  def render(self):
+    fields = [self.word, self.phon, self.base, self.freq, self.nsyl[0],
+        self.nsyl[1], self.ending, self.phon_ending, self.mult,
+        self.elidable, self.feminine, self.redundant]
+    return "\t".join([str(x) for x in fields])
+
+  @property
+  def render_sql(self):
+    fields = [self.word, self.phon, self.base, self.freq, self.nsyl[0],
+        self.nsyl[1], self.mult, self.elidable, self.feminine,
+        self.redundant, self.ending, self.phon_ending, self.redundant]
+    return ('INSERT INTO words VALUES("' + self.word + '", "' +
+        self.phon + '", "' +
+        self.base + '", ' +
+        str(self.freq) + ', ' +
+        str(self.nsyl[0]) + ', ' +
+        str(self.nsyl[1]) + ', "' +
+        self.ending + '", "' +
+        self.phon_ending + '", ' +
+        str(int(self.mult)) + ', ' +
+        str(int(self.elidable)) + ', ' +
+        str(int(self.feminine)) + ', ' +
+        str(int(self.redundant)) + ');')
+
+  @property
+  def ok(self):
+    for x in phon_vowels:
+      if x in self.phon_ending:
+        return True
+    return False
+
+  def __init__(self, word, phon, base, freq, nsyl, mult):
+    self.word = word
+    self.phon = phon
+    self.base = base
+    self.freq = freq
+    self.nsyl = [nsyl, nsyl]
+    self.mult = mult
+    self.redundant = False
+    self.do_extends()
+
+  def align_sum(self, align):
+    s = 0
+    for a in align:
+      #print(a)
+      if isinstance(a, tuple):
+        s += a[1]
+    #print ("DBG for %s: %d" % (self.word, s))
+    return s
+
+  def do_extends(self):
+    for align in metric.parse(self.word, 999):
+      self.extend(self.align_sum(align[0]))
+
+  def extend(self, item):
+    self.nsyl = [min(self.nsyl[0], item),
+        max(self.nsyl[1], item)]
+
+seen = set()
+bases = {}
+phon_seen = {}
+
+def derives(a, b):
+  #print ("SKIP derives %s %s" % (a, b))
+  if a == b:
+    return True
+  if a not in bases.keys():
+    return False
+  for x in bases[a]:
+    #print ("SKIP base is %s" % x)
+    if x != a:
+      if derives(x, b):
+        return True
+  return False
+
+print ("""CREATE TABLE words(word varchar(100), phon varchar(100), base
+varchar(100), freq float, min_nsyl int, max_nsyl int, word_end
+varchar(10), phon_end varchar(10), multiple bool, elidable bool,
+feminine bool, redundant bool);""")
+
+while True:
+  line = sys.stdin.readline()
+  if not line:
+    break
+  l = line.rstrip().split("\t")
+  word = l.pop(0)
+  phon = l.pop(0)
+  base = l.pop(0)
+  freq = ((float(l[0]) + float(l[1]))/2 +
+    100*(float(l[2]) + float(l[3]))/2)
+  l.pop(0)
+  l.pop(0)
+  l.pop(0)
+  l.pop(0)
+  #print ("DBG for %s: %d and %d" % (word, int(l[0]), 1+len([x for x in l[1]
+    #if x == ' ' or x == '-'])))
+  nsyl = max(int(l[0]), 1+len([x for x in l[1] if x == ' ' or x == '-']))
+  l.pop(0)
+  l.pop(0)
+  mult = ',' in l[0]
+  l.pop(0)
+  assert(len(l) == 0)
+  w = Word(word, phon, base, freq, nsyl, mult)
+  key = (word, phon)
+  if key in seen:
+    continue
+  else:
+    seen.add(key)
+  phon_key = (phon, w.feminine)
+  if phon_key not in phon_seen.keys():
+    phon_seen[phon_key] = []
+  if word not in bases.keys():
+    bases[word] = []
+  bases[word].append(base)
+  for candidate in phon_seen[phon_key]:
+    #print("SKIP candidate for %s %s is %s" % (phon_key[0], phon_key[1],
+    # candidate))
+    # TODO replace by common ancestor, and perform tsort
+    if derives(word, candidate):
+      # word derives from a word with the same pronunciation, skip it
+      #print("SKIP ", word)
+      w.redundant = True
+  phon_seen[phon_key].append(word)
+  if w.ok:
+    print(w.render_sql)
+
diff --git a/prepare.sh b/prepare.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+cat Lexique3.txt additions | cut -f 1,2,3,7,8,9,10,24,28,29 |
+  ~/DOCUMENTS/poetlint/rhyme/lexique/lexique_fix.sh |
+  sort -k1,1 |
+  ./manage.py
diff --git a/query.sh b/query.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+cd "$( dirname "$0" )"
+
+sqlite dico.sqlite 'select t1.freq, t1.word, t1.phon, t2.word, t2.phon,
+t2.freq, t2.min_nsyl, t2.max_nsyl, t2.elidable, t2.redundant from words
+as t1 inner join words as t2 on (t1.phon_end = t2.phon_end or
+t1.word_end = t2.word_end) and t1.feminine = t2.feminine where t1.word =
+"'$1'" and (t2.word != t1.word or t2.multiple);' | ./reorder.py
+
diff --git a/reorder.py b/reorder.py
@@ -0,0 +1,64 @@
+#!/usr/bin/python3 -O
+
+import sys
+
+def lcs(x, y):
+  i = 1
+  while x[-i] == y[-i]:
+    i += 1
+    if i > len(x) or i > len(y):
+      break
+  return i - 1
+
+def pad(x, n):
+  return x + ' ' * max(0, n - len(x))
+
+def mp(items, lens, field):
+  return pad(str(items[field]), lens[field])
+
+by_pron = {}
+keys = []
+lines = []
+names = {0: "pour l'œil", 1: "pauvre", 2: "suffisante", 3: "riche"}
+
+def key(l):
+  # frequency of interpretation desc, phonemes desc, eye desc, frequency
+  # desc, alpha
+  #print(l)
+  return (-float(l[0]), -l[10], -l[11], -float(l[5]), l[3])
+
+mx = [0] * 12
+while True:
+  line = sys.stdin.readline()
+  if not line:
+    break
+  l = line.rstrip().split('|')
+  l.append(lcs(l[2], l[4]))
+  l.append(lcs(l[1], l[3]))
+  l[4] = '[' + l[4] + ']'
+  for i in range(len(l)):
+    mx[i] = max(mx[i], len(str(l[i])))
+  lines.append(l)
+
+seen = set()
+
+last2 = None
+last10 = None
+for l in sorted(lines, key=key):
+  if l[2] != last2:
+    last2 = l[2]
+    print ("## For %s [%s], freq %s" % (l[1], l[2], l[0][0:9]))
+  if l[10] != last10:
+    last10 = l[10]
+    # TODO check if vowel is in there
+    print (" -- %d phonemes (%s)" % (l[10], names[min(3, l[10])] if min(3,
+      l[10]) in names.keys() else ''))
+  if l[9] == '1' and l[4] in seen:
+    # skip redundant where the same pronunciation was seen
+    # keep for eye rhyme
+    continue
+  seen.add(l[4])
+  print(mp(l, mx, 3) + ' w' + mp(l, mx, 11) + ' ' +
+      mp(l, mx, 6) + '-' + mp(l, mx, 7) +
+      ('+' if l[5] == '1' else ' ') + ' ' +
+      mp(l, mx, 4) + ' ' + mp(l, mx, 5)[0:9])