haspirater

detect aspirated 'h' in French words
git clone https://a3nm.net/git/haspirater/
Log | Files | Refs | README

commit 24a9dd9c81fd6b437d31fb375bc9266b128bb1e5
parent 43c95dd256fd0cee2a67a0e3e77e29b04e6ba734
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Wed,  2 Jul 2014 19:47:43 +0200

Merge branch 'master' of gitorious.org:haspirater/haspirater

Diffstat:
leavestrie.py | 15++++++++++++---
uptrie.py | 40++++++++++++++++++++++++++++++++++++++++
2 files changed, 52 insertions(+), 3 deletions(-)

diff --git a/leavestrie.py b/leavestrie.py @@ -8,14 +8,23 @@ import sys trie = json.load(sys.stdin) -def leaves(trie, prefix=""): +def leaves(trie, prefix="", provisional=None): """Keep only the most probable values at each node""" if len(trie[1].keys()) == 0: assert(len(trie[0].keys()) == 1) k, v = trie[0].popitem() - print("%s\t%s" % (k, prefix[::int(sys.argv[1])])) + if (k != provisional): + # does not agree with provisional decision so far + print("%s\t%s" % (k, prefix[::int(sys.argv[1])])) + # decided nodes + if len(trie) == 3 and trie[2]: + if (trie[2] != provisional): + # does not agree with provisional decision so far + print("%s\t%s" % (trie[2], prefix[::int(sys.argv[1])])) + if len(trie) == 3: + provisional = trie[2] for child in trie[1].keys(): - leaves(trie[1][child], prefix + child) + leaves(trie[1][child], prefix + child, provisional) leaves(trie) diff --git a/uptrie.py b/uptrie.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 + +"""Read json trie in stdin, make internal node decisions and output json dump to +stdout""" + +import itertools +import operator +import json +import sys + +trie = json.load(sys.stdin) + +def uptrie(trie): + """Make internal node decisions if possible""" + for child in trie[1].values(): + uptrie(child) + decided_children = [(list(t[0].items())[0][0], t) for t in trie[1].values() if + len(t[0].keys()) == 1] + dchild_g = {} + for (x, y) in decided_children: + if x not in dchild_g.keys(): + dchild_g[x] = [] + dchild_g[x].append(y) + sums = [(x, len(y)) for (x, y) in dchild_g.items()] + if len(sums) == 0: + return + best = max(sums, key=operator.itemgetter(1)) + if best[1] >= 2: + # compress here + trie.append(best[0]) + nchildren = {} + for key, child in trie[1].items(): + if len(child[0].keys()) != 1 or list(child[0].items())[0][0] != best[0]: + nchildren[key] = child + trie[1] = nchildren + +uptrie(trie) + +print(json.dumps(trie)) +