commit 24a9dd9c81fd6b437d31fb375bc9266b128bb1e5
parent 43c95dd256fd0cee2a67a0e3e77e29b04e6ba734
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Wed, 2 Jul 2014 19:47:43 +0200
Merge branch 'master' of gitorious.org:haspirater/haspirater
Diffstat:
2 files changed, 52 insertions(+), 3 deletions(-)
diff --git a/leavestrie.py b/leavestrie.py
@@ -8,14 +8,23 @@ import sys
trie = json.load(sys.stdin)
-def leaves(trie, prefix=""):
+def leaves(trie, prefix="", provisional=None):
"""Keep only the most probable values at each node"""
if len(trie[1].keys()) == 0:
assert(len(trie[0].keys()) == 1)
k, v = trie[0].popitem()
- print("%s\t%s" % (k, prefix[::int(sys.argv[1])]))
+ if (k != provisional):
+ # does not agree with provisional decision so far
+ print("%s\t%s" % (k, prefix[::int(sys.argv[1])]))
+ # decided nodes
+ if len(trie) == 3 and trie[2]:
+ if (trie[2] != provisional):
+ # does not agree with provisional decision so far
+ print("%s\t%s" % (trie[2], prefix[::int(sys.argv[1])]))
+ if len(trie) == 3:
+ provisional = trie[2]
for child in trie[1].keys():
- leaves(trie[1][child], prefix + child)
+ leaves(trie[1][child], prefix + child, provisional)
leaves(trie)
diff --git a/uptrie.py b/uptrie.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+
+"""Read json trie in stdin, make internal node decisions and output json dump to
+stdout"""
+
+import itertools
+import operator
+import json
+import sys
+
+trie = json.load(sys.stdin)
+
+def uptrie(trie):
+ """Make internal node decisions if possible"""
+ for child in trie[1].values():
+ uptrie(child)
+ decided_children = [(list(t[0].items())[0][0], t) for t in trie[1].values() if
+ len(t[0].keys()) == 1]
+ dchild_g = {}
+ for (x, y) in decided_children:
+ if x not in dchild_g.keys():
+ dchild_g[x] = []
+ dchild_g[x].append(y)
+ sums = [(x, len(y)) for (x, y) in dchild_g.items()]
+ if len(sums) == 0:
+ return
+ best = max(sums, key=operator.itemgetter(1))
+ if best[1] >= 2:
+ # compress here
+ trie.append(best[0])
+ nchildren = {}
+ for key, child in trie[1].items():
+ if len(child[0].keys()) != 1 or list(child[0].items())[0][0] != best[0]:
+ nchildren[key] = child
+ trie[1] = nchildren
+
+uptrie(trie)
+
+print(json.dumps(trie))
+