commit f9d1dff14c55394a180f99e0b179b1a240c86f27
parent bd59f05eddd501b9d6f58ac145f412c693fa7d9e
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Wed, 9 Nov 2011 12:04:42 +0100
change shebang, renamings
Diffstat:
5 files changed, 171 insertions(+), 165 deletions(-)
diff --git a/lexique2sql.py b/lexique2sql.py
@@ -0,0 +1,157 @@
+#!/usr/bin/python3 -O
+
+"""Prepare the rhyme database
+
+Input should have tab-separated fields: word, pronunciation, base word,
+grammatical category, frequency. Output is SQL on stdout: a CREATE TABLE, then INSERTs."""
+
+# TODO frequencies are off
+# TODO "bibliographe" number of syllables?!
+
+import haspirater
+import metric
+from common import is_vowels, is_consonants, sure_end_fem
+import sys
+import _mysql
+
+seen = {}  # NOTE(review): not referenced again in the code visible here
+
+# characters of a pronunciation string that count as vowel sounds
+phon_vowels = "()$#289aeEioOuy@"
+# phonetic endings that rule out a feminine rhyme for words spelled with -ent
+phon_non_end_fem = ['#', ')']
+
+# Emit the table schema first; one INSERT per kept word follows on stdout.
+print ("""CREATE TABLE words(
+ word varchar(100), -- word
+ phon varchar(100), -- pronunciation
+ base varchar(100), -- base word
+ kind varchar(10), -- grammatical category
+ freq float, -- frequency
+ min_nsyl int, -- lower bound on the number of syllabes
+ max_nsyl int, -- upper bound on the number of syllabes
+ word_end varchar(10), -- minimal word-level rhyme
+ phon_end varchar(10), -- minimal phon-level rhyme
+ elidable bool, -- can cause elision
+ feminine bool -- genre of the rhyme
+);""")
+
+class Word:  # one lexicon entry and the rhyme/metric data derived from it
+ @property
+ def elidable(self):
+ """Can this word cause elision in the previous word? (decided on its first letter)"""
+ return is_vowels(self.word[0]) or (self.word[0] == 'h' and
+ not haspirater.lookup(self.word))  # NOTE(review): word[0] raises IndexError on an empty word
+
+ @property
+ def phon_ending(self):
+ """Minimal phonetic rhyme: self.phon from its last vowel sound to the end"""
+ l = []
+ w = list(self.phon)
+ w.reverse()
+ for x in w:  # walk the pronunciation backwards
+ l.append(x)
+ if x in phon_vowels:
+ break  # the last vowel sound has just been included
+ l.reverse()
+ return ''.join(l)  # the whole of self.phon when it contains no vowel sound
+
+ @property
+ def ending(self):
+ """Compute the minimal visual (spelling-level) rhyme, scanning self.word from its end"""
+ l = []
+ w = list(self.word)
+ count = 0
+ w.reverse()
+ for x in w:
+ if is_vowels(x) or is_consonants(x):  # characters that are neither are left out
+ l.append(x)
+ if is_vowels(x) and count >= 1:  # a vowel seen first (count == 0) never stops the scan
+ break
+ count += 1
+ l.reverse()
+ return ''.join(l)
+
+ @property
+ def feminine(self):
+ """Would this word be a feminine rhyme? Decided on spelling, then on pronunciation"""
+ for end in sure_end_fem:
+ if self.word.endswith(end):
+ return True
+ if not self.word.endswith('ent'):
+ return False
+ # The spelling -ent is ambiguous (e.g. "tient" vs. "lient"), so fall back on
+ # the pronunciation: an ending listed in phon_non_end_fem is not feminine.
+ for end in phon_non_end_fem:
+ if self.phon.endswith(end):
+ return False
+ return True
+
+ @property
+ def render_sql(self):  # one INSERT statement, values in CREATE TABLE column order
+ return ('INSERT INTO words VALUES("'
+ + _mysql.escape_string(self.word) + '", "'  # NOTE(review): confirm escape_string returns str (not bytes) under python3
+ + _mysql.escape_string(self.phon) + '", "'
+ + _mysql.escape_string(self.base) + '", "'
+ + _mysql.escape_string(self.kind) + '", '
+ + _mysql.escape_string(str(self.freq)) + ', '
+ + _mysql.escape_string(str(self.nsyl[0])) + ', '  # NOTE(review): nsyl is still None if metric.parse() yielded nothing
+ + _mysql.escape_string(str(self.nsyl[1])) + ', "'
+ + _mysql.escape_string(self.ending) + '", "'
+ + _mysql.escape_string(self.phon_ending) + '", '
+ + _mysql.escape_string(str(int(self.elidable))) + ', '
+ + _mysql.escape_string(str(int(self.feminine))) + ');'
+ )
+
+ @property
+ def ok(self):  # True when the entry should be written to the database
+ # Keep the word only if its phonetic ending contains a vowel sound
+ for x in phon_vowels:
+ if x in self.phon_ending:
+ return True
+ return False
+
+ def __init__(self, word, phon, base, kind, freq):  # the five fields of one input record
+ self.word = word
+ self.phon = phon
+ self.base = base
+ self.kind = kind
+ self.freq = freq
+ self.nsyl = None  # becomes a [min, max] pair once extend() has been called
+ self.redundant = False  # NOTE(review): not read anywhere in the code visible here
+ self.do_extends()  # fill in nsyl from metric.parse()
+
+ def align_sum(self, align):  # add up a[1] over the tuple items of align
+ s = 0
+ for a in align:
+ # only tuple items contribute; anything else is skipped
+ if isinstance(a, tuple):
+ s += a[1]
+ # NOTE(review): a[1] is presumably a syllable count -- confirm against metric
+ return s
+
+ def do_extends(self):  # widen nsyl with every result metric.parse() returns for the word
+ for align in metric.parse(self.word, 999):
+ self.extend(self.align_sum(align[0]))
+
+ def extend(self, item):  # grow the [min, max] range held in nsyl so it includes item
+ if self.nsyl == None:  # first value seen; NOTE(review): "is None" is the idiomatic test
+ self.nsyl = [item, item]
+ else:
+ self.nsyl = [min(self.nsyl[0], item), max(self.nsyl[1], item)]
+
+while True:
+ line = sys.stdin.readline()
+ if not line:
+ break
+ l = line.rstrip().split("\t")
+ word = l.pop(0)
+ phon = l.pop(0)
+ base = l.pop(0)
+ kind = l.pop(0)
+ freq = float(l.pop(0))
+ assert(len(l) == 0)
+ w = Word(word, phon, base, kind, freq)
+ if w.ok:
+ print(w.render_sql)
+
diff --git a/lexique2sql.sh b/lexique2sql.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+# Read the lexicon on stdin and write the rhyme database as SQL on stdout.
+cd "$( dirname "$0" )" || exit 1
+
+cat - additions | # add custom exceptions
+ cut -f 1,2,3,4,7,8,9,10,24,28 | # select relevant fields
+ awk 'BEGIN {FS="\t"; OFS="\t"} {print $1, $2, $3, $4, ($5+$6)/2 + 100*($7+$8)/2}' | # aggregate frequencies (separators set in BEGIN so they apply to the first record too)
+ ./lexique2sql.py
diff --git a/make_db.py b/make_db.py
@@ -1,157 +0,0 @@
-#!/usr/bin/python
-
-"""Prepare the rhyme database
-
-Input should have tab-separated fields: word, pronunciation, base word,
-grammatical category, frequency. Output is a """
-
-# TODO frequencies are off
-# TODO "bibliographe" number of syllables?!
-
-import haspirater
-import metric
-from common import is_vowels, is_consonants, sure_end_fem
-import sys
-import _mysql
-
-seen = {}
-
-# phonetic vowel sounds
-phon_vowels = "()$#289aeEioOuy@"
-# not a feminine ending, independently of spelling
-phon_non_end_fem = ['#', ')']
-
-#
-print ("""CREATE TABLE words(
- word varchar(100), -- word
- phon varchar(100), -- pronunciation
- base varchar(100), -- base word
- kind varchar(10), -- grammatical category
- freq float, -- frequency
- min_nsyl int, -- lower bound on the number of syllabes
- max_nsyl int, -- upper bound on the number of syllabes
- word_end varchar(10), -- minimal word-level rhyme
- phon_end varchar(10), -- minimal phon-level rhyme
- elidable bool, -- can cause elision
- feminine bool -- genre of the rhyme
-);""")
-
-class Word:
- @property
- def elidable(self):
- """Can this word cause elision in the previous word?"""
- return is_vowels(self.word[0]) or (self.word[0] == 'h' and
- not haspirater.lookup(self.word))
-
- @property
- def phon_ending(self):
- """Compute minimal phonetic rhyme"""
- l = []
- w = list(self.phon)
- w.reverse()
- for x in w:
- l.append(x)
- if x in phon_vowels:
- break
- l.reverse()
- return ''.join(l)
-
- @property
- def ending(self):
- """Compute minimal visual rhyme"""
- l = []
- w = list(self.word)
- count = 0
- w.reverse()
- for x in w:
- if is_vowels(x) or is_consonants(x):
- l.append(x)
- if is_vowels(x) and count >= 1:
- break
- count += 1
- l.reverse()
- return ''.join(l)
-
- @property
- def feminine(self):
- """Would this word be a feminine rhyme?"""
- for end in sure_end_fem:
- if self.word.endswith(end):
- return True
- if not self.word.endswith('ent'):
- return False
- # word ends in -ent, it's hard to tell from writing, so look at phon
- # example: "tient" vs. "lient"
- for end in phon_non_end_fem:
- if self.phon.endswith(end):
- return False
- return True
-
- @property
- def render_sql(self):
- return ('INSERT INTO words VALUES("'
- + _mysql.escape_string(self.word) + '", "'
- + _mysql.escape_string(self.phon) + '", "'
- + _mysql.escape_string(self.base) + '", "'
- + _mysql.escape_string(self.kind) + '", '
- + _mysql.escape_string(str(self.freq)) + ', '
- + _mysql.escape_string(str(self.nsyl[0])) + ', '
- + _mysql.escape_string(str(self.nsyl[1])) + ', "'
- + _mysql.escape_string(self.ending) + '", "'
- + _mysql.escape_string(self.phon_ending) + '", '
- + _mysql.escape_string(str(int(self.elidable))) + ', '
- + _mysql.escape_string(str(int(self.feminine))) + ');'
- )
-
- @property
- def ok(self):
- # Remove words with no vowels
- for x in phon_vowels:
- if x in self.phon_ending:
- return True
- return False
-
- def __init__(self, word, phon, base, kind, freq):
- self.word = word
- self.phon = phon
- self.base = base
- self.kind = kind
- self.freq = freq
- self.nsyl = None
- self.redundant = False
- self.do_extends()
-
- def align_sum(self, align):
- s = 0
- for a in align:
- #print(a)
- if isinstance(a, tuple):
- s += a[1]
- #print ("DBG for %s: %d" % (self.word, s))
- return s
-
- def do_extends(self):
- for align in metric.parse(self.word, 999):
- self.extend(self.align_sum(align[0]))
-
- def extend(self, item):
- if self.nsyl == None:
- self.nsyl = [item, item]
- else:
- self.nsyl = [min(self.nsyl[0], item), max(self.nsyl[1], item)]
-
-while True:
- line = sys.stdin.readline()
- if not line:
- break
- l = line.rstrip().split("\t")
- word = l.pop(0)
- phon = l.pop(0)
- base = l.pop(0)
- kind = l.pop(0)
- freq = float(l.pop(0))
- assert(len(l) == 0)
- w = Word(word, phon, base, kind, freq)
- if w.ok:
- print(w.render_sql)
-
diff --git a/make_db.sh b/make_db.sh
@@ -1,8 +0,0 @@
-#!/bin/bash
-
-cd "$( dirname "$0" )"
-
-cat - additions | # add custom exceptions
- cut -f 1,2,3,4,7,8,9,10,24,28 | # select relevant fields
- awk '{FS=" "; OFS=" "; print $1, $2, $3, $4, ($5+$6)/2 + 100*($7+$8)/2}' | # aggregate frequencies
- ./make_db.py
diff --git a/prepare.sh b/prepare.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+# Build db.sqlite from the lexicon read on stdin.
+cd "$( dirname "$0" )" || exit 1
+
+./lexique2sql.sh | sqlite db.sqlite
+