commit 77221a978a5e49fe5a16d6606559bc91576423e7
parent c1acbbd1530fd004a8141deea2705d0c0136a653
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Mon, 26 Dec 2011 23:13:54 +0100
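lexique2sql.py now aggregates all possible derivations: consecutive entries sharing the same word and phon are merged into one row, summing their frequencies and listing every (kind, base) origin in a new 'orig' column that replaces 'base' and 'kind'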
Diffstat:
1 file changed, 25 insertions(+), 13 deletions(-)
diff --git a/lexique2sql.py b/lexique2sql.py
@@ -18,8 +18,7 @@ def string_type(max_len):
sql_fields = [
('word', string_type(100)),
('phon', string_type(100)),
- ('base', string_type(100)),
- ('kind', string_type(10)),
+ ('orig', ('string2', {'max_len':100})),
('freq', ('float', {})),
('min_nsyl', ('int', {})),
('max_nsyl', ('int', {})),
@@ -110,14 +109,18 @@ class Word:
@property
def sql(self):
render = {
- 'string': lambda s: '"'+escape(s)+'"', # no escaping: use parametrized queries!
- 'float': str,
- 'int': lambda s: str(int(s)),
- 'bool': lambda s: str(int(s)),
+ 'string': lambda s, w: '"'+escape(s)+'"',
+ 'string2': lambda s, w:
+ '"'+escape(
+ ', '.join([x[0]+(' ('+x[1]+')' if w.word != x[1] else '')
+ for x in s]))+'"',
+ 'float': lambda s, w: str(s),
+ 'int': lambda s, w: str(int(s)),
+ 'bool': lambda s, w: str(int(s)),
}
def sql_field(field):
(name, (ty, _)) = field
- return render[ty](getattr(self, name))
+ return render[ty](getattr(self, name), self)
return ('INSERT INTO words VALUES('
+ ', '.join([sql_field(f) for f in sql_fields])
+ ');')
@@ -125,8 +128,7 @@ class Word:
def __init__(self, word, phon, base, kind, freq):
self.word = word
self.phon = phon
- self.base = base
- self.kind = kind
+ self.orig = [(kind, base)]
self.freq = float(freq)
self.nsyl = None
self.do_extends()
@@ -153,7 +155,7 @@ class Word:
def create_table():
def decl(field):
(name, (ty, data)) = field
- if ty == 'string':
+ if ty.startswith('string'):
ty = 'varchar(' + str(data['max_len']) + ')'
return name + ' ' + ty
return ('CREATE TABLE words('
@@ -161,14 +163,24 @@ def create_table():
+ ');')
if __name__ == '__main__':
- print (create_table())
+ cache = None
+ print (create_table())
while True:
line = sys.stdin.readline()
if not line:
break
l = line.rstrip().split("\t")
w = Word(*l)
- if w.ok:
- print(w.sql)
+ k = (w.word, w.phon)
+ if not w.ok:
+ continue
+ if cache and cache.word == w.word and cache.phon == w.phon:
+ cache.freq += w.freq
+ cache.orig += [w.orig[0]]
+ else:
+ if cache:
+ print(cache.sql)
+ cache = w
+ print(cache.sql)