commit c8eef9d4def727451f40f4f4e6c67e3b094fb97a
parent bc2a67b0a5c3be54d027370fcccbb4583bc109d7
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Sun, 12 Jun 2011 19:28:53 -0400
it works
Diffstat:
additions | | | 50 | ++++++++++++++++++++------------------------------ |
buildtrie.py | | | 48 | +++++++++++++----------------------------------- |
compresstrie.py | | | 40 | +++++++++++++++++++++++++--------------- |
pron.py | | | 35 | +++++++++++++++++++++++++++++++---- |
4 files changed, 89 insertions(+), 84 deletions(-)
diff --git a/additions b/additions
@@ -1,30 +1,20 @@
-1 heaume
-1 hèlement
-1 hertz
-1 héraut
-1 hit-parade
-1 high-five
-1 hlm
-1 hobby
-1 hongrois
-1 homard
-1 hoquet
-1 hors-piste
-1 hors-bord
-1 huée
-1 hildegarde
-1 hiroshima
-1 heimatlos
-0 haÿ-les-roses
-0 heur
-0 heure
-0 h
-1 haveler
-0 hallucination
-0 hallucine
-0 halène
-0 halèner
-0 hadopisme
-1 hadopi
-0 hellénisme
-0 hiatus
+almanach almana
+dompte d#t
+domptent d#t
+dompterai d#tRE
+dompterait d#tRE
+dompter d#te
+dompteur d#t9R
+dompteurs d#t9R
+dompteuse d#t2z
+dompteuses d#t2z
+domptez d#te
+tabis tabi
+libye libi
+est E
+bœuf b9f
+bœufs b2f
+dis-je iZ
+employ @plwa
+amusemens amyzm@
+parens paR@
diff --git a/buildtrie.py b/buildtrie.py
@@ -6,52 +6,29 @@ representing this mapping"""
import json
import sys
-NUM=4
-
# first item is a dictionnary from values to an int indicating the
# number of occurrences with this prefix having this value
# second item is a dictionnary from letters to descendent nodes
def empty_node():
- return [0, {}]
-def empty_node2():
- return [empty_node(), {}]
-
-trie = empty_node2()
+ return [{}, {}]
-def too_ambiguous(trie):
- values, children = trie
- if values < 2:
- return False
- r = float(sum(sorted([x[0] for x in children.values()])[-NUM:])) / (values - 1)
- return r < .99
+trie = empty_node()
-def insert(trie, key):
+def insert(trie, key, val):
"""Insert val for key in trie"""
values, children = trie
- # increment count for val
- trie[0] += 1
- if trie[1] == None:
- return
- if too_ambiguous(trie):
- trie[1] = None #too ambiguous
- return
+ # create a new value, if needed
+ if len(key) == 0:
+ if val not in values.keys():
+ values[val] = 0
+ # increment count for val
+ values[val] += 1
if len(key) > 0:
# create a new node if needed
if key[0] not in children.keys():
children[key[0]] = empty_node()
# recurse
- return insert(children[key[0]], key[1:])
-
-def insert2(trie, key, val):
- """Insert val for key in trie"""
- values, children = trie
- insert(values, val)
- if len(key) > 0:
- # create a new node if needed
- if key[0] not in children.keys():
- children[key[0]] = empty_node2()
- # recurse
- return insert2(children[key[0]], key[1:], val)
+ return insert(children[key[0]], key[1:], val)
while True:
line = sys.stdin.readline()
@@ -59,13 +36,14 @@ while True:
break
line = line.split('\t')
value = line[1].rstrip()[-4:][::-1]
- word = line[0].lower()[-8:][::-1] if len(line) == 2 else ''
+ word = line[0].lower()[::-1] if len(line) == 2 else ''
#print(word)
#print(value)
# a trailing space is used to mark termination of the word
# this is useful in cases where a prefix of a word is a complete,
# different word with a different value
- insert2(trie, word+' ', value)
+ # two spaces because some data words have multiple spaces
+ insert(trie, word+' ', value)
print(json.dumps(trie))
diff --git a/compresstrie.py b/compresstrie.py
@@ -8,24 +8,34 @@ import sys
trie = json.load(sys.stdin)
-def linear(trie):
- v, c = trie
- if c == None:
- return False
- if len(c.keys()) == 0:
- return True
- elif len(c.keys()) > 1:
- return False
- else:
- return linear(list(c.values())[0])
-
def compress(trie):
"""Compress the trie"""
- if linear(trie[0]):
- # no need for children, there is no more doubt
- trie[1] = {}
+ ref = None
+ num = 0
+ ok = True
+ if trie[0] != {}:
+ if len(trie[0].keys()) > 1:
+ return None
+ ref = list(trie[0].keys())[0]
+ num = trie[0][ref]
for child in trie[1].values():
- compress(child)
+ x = compress(child)
+ if not ok or x == None:
+ ok = False
+ continue
+ r, n = x
+ if ref == None:
+ ref = r
+ if ref != r:
+ ok = False
+ num += n
+ if not ok:
+ return None
+ trie[0] = {}
+ trie[0][ref] = num
+ trie[1] = {}
+ #print(ref, file=sys.stderr)
+ return ref, num
compress(trie)
diff --git a/pron.py b/pron.py
@@ -13,15 +13,42 @@ f = open(os.path.join(os.path.dirname(
trie = json.load(f)
f.close()
+def to_list(d, rev=True):
+ l = []
+ for a in d.keys():
+ if rev:
+ l.append((d[a], a[::-1]))
+ else:
+ l.append((d[a], a))
+ return l
+
+def trie2list(trie):
+ v, c = trie
+ if c == {}:
+ return to_list(v)
+ else:
+ d = {}
+ for child in c.keys():
+ l = trie2list(c[child])
+ for x in l:
+ if x[1] not in d.keys():
+ d[x[1]] = 0
+ d[x[1]] += x[0]
+ return to_list(d, False)
+
def do_lookup(trie, key):
- if len(key) == 0 or (key[0] not in trie[1].keys()):
- print(key)
- return trie[0]
+ if len(key) == 0 or key[0] not in trie[1].keys():
+ return trie2list(trie)
return do_lookup(trie[1][key[0]], key[1:])
+
+def nbest(l, t):
+ l = sorted(l)[-t:]
+ l.reverse()
+ return l
def lookup(key):
"""Return pronunciations for key"""
- return do_lookup(trie, key[::-1] + ' ')
+ return nbest(do_lookup(trie, key[::-1] + ' '), 5)
if __name__ == '__main__':
while True: