initial commit - drime - French rhyme dictionary with web and CLI interface

commit 3937aa9cb6899ea053a09a3c2c111319cc37a93e
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Wed, 17 Aug 2011 19:16:05 -0400

initial commit

Diffstat:
manage.py  | 190 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
prepare.sh  | 6 ++++++
query.sh  | 10 ++++++++++
reorder.py  | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

4 files changed, 270 insertions(+), 0 deletions(-)
diff --git a/manage.py b/manage.py
@@ -0,0 +1,190 @@
+#!/usr/bin/python3 -O
+
+# TODO frequencies are off
+
+import haspirater
+import metric
+import sys
+
+# Letters recognised when extracting the written ending of a word.
+vowels = "aàâãeéèêëiîïoôöuùûüy"
+consonants = "bcçdfghjklmnpqrstvwxz"
+
+# Phoneme symbols counted as vowels in the phonetic transcription.
+phon_vowels = "()$#289aeEioOuy@"
+
+# Written endings tested by Word.feminine.
+sure_end_fem = ['e', 'es', 'ent']
+# Phonetic endings tested by Word.feminine for words written with 'ent'.
+phon_non_end_fem = ['#', ')']
+
+class Word:
+  """One dictionary entry plus the rhyme features derived from it.
+
+  Attributes set by __init__: word (spelling), phon (phonetic
+  transcription), base, freq, nsyl (a [min, max] pair of syllable counts),
+  mult and redundant (initially False).
+  """
+
+  def __init__(self, word, phon, base, freq, nsyl, mult):
+    self.word = word
+    self.phon = phon
+    self.base = base
+    self.freq = freq
+    # Start with min == max == nsyl; do_extends() widens the range.
+    self.nsyl = [nsyl, nsyl]
+    self.mult = mult
+    self.redundant = False
+    self.do_extends()
+
+  @property
+  def elidable(self):
+    """True if the word starts with a vowel, or with an 'h' for which
+    haspirater.lookup() returns a false value."""
+    return self.word[0] in vowels or (self.word[0] == 'h' and
+        not haspirater.lookup(self.word))
+
+  @property
+  def phon_ending(self):
+    """Tail of the transcription starting at its last vowel phoneme (the
+    whole transcription if it contains no vowel phoneme)."""
+    tail = []
+    for x in reversed(self.phon):
+      tail.append(x)
+      if x in phon_vowels:
+        break
+    return ''.join(reversed(tail))
+
+  @property
+  def ending(self):
+    """Written ending: letters from the end of the word back to the first
+    vowel found that is not the very last character.
+
+    Characters that are neither in vowels nor in consonants are dropped
+    from the result but still count as positions.
+    """
+    kept = []
+    for position, x in enumerate(reversed(self.word)):
+      if x in vowels or x in consonants:
+        kept.append(x)
+      if x in vowels and position >= 1:
+        break
+    return ''.join(reversed(kept))
+
+  @property
+  def feminine(self):
+    """True if the word is considered to have a feminine ending.
+
+    Every ending in sure_end_fem except 'ent' decides on spelling alone.
+    A word written with 'ent' is feminine unless its transcription ends
+    with one of phon_non_end_fem.  'ent' is skipped in the first loop
+    because matching it there would make the phonetic test unreachable.
+    """
+    for end in sure_end_fem:
+      if end != 'ent' and self.word.endswith(end):
+        return True
+    if not self.word.endswith('ent'):
+      return False
+    for end in phon_non_end_fem:
+      if self.phon.endswith(end):
+        return False
+    return True
+
+  @property
+  def render(self):
+    """The entry as one tab-separated line."""
+    fields = [self.word, self.phon, self.base, self.freq, self.nsyl[0],
+        self.nsyl[1], self.ending, self.phon_ending, self.mult,
+        self.elidable, self.feminine, self.redundant]
+    return "\t".join([str(x) for x in fields])
+
+  @property
+  def render_sql(self):
+    """The entry as an INSERT statement for the words table.
+
+    Values are emitted in the order word, phon, base, freq, min and max
+    syllable count, written ending, phonetic ending, then mult, elidable,
+    feminine and redundant as 0/1.
+    NOTE(review): text values are embedded between double quotes without
+    escaping, so a value containing '"' would yield invalid SQL.
+    """
+    return ('INSERT INTO words VALUES("' + self.word + '", "'
+        + self.phon + '", "'
+        + self.base + '", '
+        + str(self.freq) + ', '
+        + str(self.nsyl[0]) + ', '
+        + str(self.nsyl[1]) + ', "'
+        + self.ending + '", "'
+        + self.phon_ending + '", '
+        + str(int(self.mult)) + ', '
+        + str(int(self.elidable)) + ', '
+        + str(int(self.feminine)) + ', '
+        + str(int(self.redundant)) + ');')
+
+  @property
+  def ok(self):
+    """True if the phonetic ending contains at least one vowel phoneme."""
+    tail = self.phon_ending
+    return any(x in tail for x in phon_vowels)
+
+  def align_sum(self, align):
+    """Sum the second component of every tuple item of align; items that
+    are not tuples are ignored."""
+    return sum(a[1] for a in align if isinstance(a, tuple))
+
+  def do_extends(self):
+    """Widen nsyl with the total of every alignment that
+    metric.parse(self.word, 999) yields."""
+    for align in metric.parse(self.word, 999):
+      self.extend(self.align_sum(align[0]))
+
+  def extend(self, item):
+    """Widen the [min, max] syllable range so that it includes item."""
+    self.nsyl = [min(self.nsyl[0], item),
+        max(self.nsyl[1], item)]
+
+# (word, phon) pairs already processed.
+seen = set()
+# word -> list of the bases recorded for it.
+bases = {}
+# (phon, feminine) -> list of words seen with that key.
+phon_seen = {}
+
+def derives(a, b):
+  """Return True if b can be reached from a by following bases.
+
+  a trivially derives from itself.  The walk is iterative and remembers
+  the words it has expanded, so a cycle in bases ends the search instead
+  of recursing without bound.
+  """
+  pending = [a]
+  visited = set()
+  while pending:
+    current = pending.pop()
+    if current == b:
+      return True
+    if current in visited:
+      continue
+    visited.add(current)
+    # A word listed as its own base leads nowhere new.
+    pending.extend(x for x in bases.get(current, ()) if x != current)
+  return False
+
+print ("""CREATE TABLE words(word varchar(100), phon varchar(100), base
+varchar(100), freq float, min_nsyl int, max_nsyl int, word_end
+varchar(10), phon_end varchar(10), multiple bool, elidable bool,
+feminine bool, redundant bool);""")
+
+# Each input line holds ten tab-separated fields: word, phon, base, four
+# frequency columns, a syllable count, a syllabified form and a field whose
+# commas mark a multiple entry.  One INSERT statement is printed per
+# distinct (word, phon) pair that has a usable phonetic ending.
+for line in sys.stdin:
+  l = line.rstrip().split("\t")
+  # The interpreter runs with -O, which strips assert statements, so the
+  # field count is checked explicitly.
+  if len(l) != 10:
+    raise ValueError("expected 10 tab-separated fields, got %d: %r"
+        % (len(l), line))
+  word, phon, base = l[0], l[1], l[2]
+  # Mean of the first frequency pair plus 100 times the mean of the second.
+  freq = ((float(l[3]) + float(l[4]))/2 +
+        100*(float(l[5]) + float(l[6]))/2)
+  # Larger of the given count and one more than the number of ' ' and '-'
+  # separators in the syllabified form.
+  nsyl = max(int(l[7]), 1 + sum(1 for x in l[8] if x == ' ' or x == '-'))
+  mult = ',' in l[9]
+  key = (word, phon)
+  if key in seen:
+    continue
+  seen.add(key)
+  # Built only after the duplicate check: the constructor runs metric.parse.
+  w = Word(word, phon, base, freq, nsyl, mult)
+  phon_key = (phon, w.feminine)
+  candidates = phon_seen.setdefault(phon_key, [])
+  bases.setdefault(word, []).append(base)
+  for candidate in candidates:
+    # TODO replace by common ancestor, and perform tsort
+    if derives(word, candidate):
+      # word derives from a word with the same pronunciation
+      w.redundant = True
+      break
+  candidates.append(word)
+  if w.ok:
+    print(w.render_sql)
+
diff --git a/prepare.sh b/prepare.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+# Concatenate Lexique3.txt and additions, keep ten of their tab-separated
+# columns, pass them through lexique_fix.sh, sort on the first field and
+# feed the result to manage.py on standard input.
+# NOTE(review): lexique_fix.sh is referenced by a path under ~/DOCUMENTS,
+# so this only runs where that file exists.
+cat Lexique3.txt additions | cut -f 1,2,3,7,8,9,10,24,28,29 |
+  ~/DOCUMENTS/poetlint/rhyme/lexique/lexique_fix.sh |
+  sort -k1,1 |
+  ./manage.py
diff --git a/query.sh b/query.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# Usage: query.sh WORD
+# Select from dico.sqlite every entry that shares a phonetic or written
+# ending (and the same feminine flag) with WORD, and pipe the rows to
+# reorder.py for display.
+
+# Stop if the script directory cannot be entered: dico.sqlite and
+# reorder.py are looked up relative to it.
+cd "$( dirname "$0" )" || exit 1
+
+# Double every double quote so that WORD cannot end the SQL string it is
+# placed in.  The expansion below is quoted so that the shell neither
+# word-splits nor globs it.
+word=${1//\"/\"\"}
+
+sqlite dico.sqlite 'select t1.freq, t1.word, t1.phon, t2.word, t2.phon,
+t2.freq, t2.min_nsyl, t2.max_nsyl, t2.elidable, t2.redundant from words
+as t1 inner join words as t2 on (t1.phon_end = t2.phon_end or
+t1.word_end = t2.word_end) and t1.feminine = t2.feminine where t1.word =
+"'"$word"'" and (t2.word != t1.word or t2.multiple);' | ./reorder.py
+
diff --git a/reorder.py b/reorder.py
@@ -0,0 +1,64 @@
+#!/usr/bin/python3 -O
+
+import sys
+
+def lcs(x, y):
+  """Return the length of the longest common suffix of x and y.
+
+  Returns 0 when either argument is empty instead of raising IndexError.
+  """
+  n = 0
+  for a, b in zip(reversed(x), reversed(y)):
+    if a != b:
+      break
+    n += 1
+  return n
+
+def pad(x, n):
+  """Right-pad x with spaces to width n; longer strings come back as is."""
+  missing = n - len(x)
+  if missing <= 0:
+    return x
+  return x + ' ' * missing
+
+def mp(items, lens, field):
+  """Render items[field] as text padded to the width lens[field]."""
+  text = str(items[field])
+  width = lens[field]
+  return pad(text, width)
+
+by_pron = {}
+keys = []
+# Rows read from standard input, each extended with two suffix lengths.
+lines = []
+# Label for a rhyme by number of shared phonemes (3 stands for 3 or more).
+names = {0: "pour l'œil", 1: "pauvre", 2: "suffisante", 3: "riche"}
+
+def key(l):
+  """Sort key for a row: field 0 as a float descending, field 10
+  descending, field 11 descending, field 5 as a float descending, then
+  field 3 ascending."""
+  interpretation_freq = float(l[0])
+  candidate_freq = float(l[5])
+  return (-interpretation_freq, -l[10], -l[11], -candidate_freq, l[3])
+
+# Each input row is '|'-separated.  Judging by the select in query.sh the
+# fields are: 0 t1.freq, 1 t1.word, 2 t1.phon, 3 t2.word, 4 t2.phon,
+# 5 t2.freq, 6 t2.min_nsyl, 7 t2.max_nsyl, 8 t2.elidable, 9 t2.redundant.
+# Two fields are appended here: 10 is the common suffix length of the two
+# transcriptions, 11 the common suffix length of the two spellings.
+# mx[i] records the widest rendering of field i, for column alignment.
+mx = [0] * 12
+for line in sys.stdin:
+  l = line.rstrip().split('|')
+  l.append(lcs(l[2], l[4]))
+  l.append(lcs(l[1], l[3]))
+  l[4] = '[' + l[4] + ']'
+  for i, value in enumerate(l):
+    mx[i] = max(mx[i], len(str(value)))
+  lines.append(l)
+
+seen = set()
+
+last2 = None
+last10 = None
+for l in sorted(lines, key=key):
+  if l[2] != last2:
+    last2 = l[2]
+    print ("## For %s [%s], freq %s" % (l[1], l[2], l[0][0:9]))
+    # Each pronunciation section gets its own phoneme-count headers, even
+    # if it starts with the count the previous section ended on.
+    last10 = None
+  if l[10] != last10:
+    last10 = l[10]
+    # TODO check if vowel is in there
+    print ("  -- %d phonemes (%s)" % (l[10], names.get(min(3, l[10]), '')))
+  if l[9] == '1' and l[4] in seen:
+    # skip redundant where the same pronunciation was seen
+    # keep for eye rhyme
+    continue
+  seen.add(l[4])
+  # The '+' marker is driven by the elidable flag (field 8); field 5 is a
+  # frequency and would practically never equal '1'.
+  print(mp(l, mx, 3) + ' w' + mp(l, mx, 11) + '  '
+      + mp(l, mx, 6) + '-' + mp(l, mx, 7)
+      + ('+' if l[8] == '1' else ' ') + ' '
+      + mp(l, mx, 4) + ' ' + mp(l, mx, 5)[0:9])

	drime French rhyme dictionary with web and CLI interface
	git clone https://a3nm.net/git/drime/
	Log \| Files \| Refs \| README

manage.py	\|	190	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
prepare.sh	\|	6	++++++
query.sh	\|	10	++++++++++
reorder.py	\|	64	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++