haspirater

detect aspirated 'h' in French words (local mirror of https://gitlab.com/a3nm/haspirater)
git clone https://a3nm.net/git/haspirater/
Log | Files | Refs | README | LICENSE

commit ae59e299f0d0b9ce4255d6b07dc718b32edd980d
parent b79b3eed3cf78fd00cda450eb38b60413ff67a9f
Author: a3nm <a3nm@a3nm.net>
Date:   Thu, 15 Aug 2019 21:47:53 +0000

Merge branch 'master' into 'master'

Setup Package

See merge request a3nm/haspirater!1
Diffstat:
.gitignore | 4++++
LICENSE | 18++++++++++++++++++
README | 2+-
buildtrie.py | 43-------------------------------------------
buildtrie_list.py | 22----------------------
compresstrie.py | 22----------------------
haspirater.json | 1-
haspirater.py | 49-------------------------------------------------
haspirater/__init__.py | 1+
haspirater/buildtrie.py | 43+++++++++++++++++++++++++++++++++++++++++++
haspirater/buildtrie_list.py | 22++++++++++++++++++++++
haspirater/compresstrie.py | 22++++++++++++++++++++++
haspirater/haspirater.json | 1+
haspirater/haspirater.py | 49+++++++++++++++++++++++++++++++++++++++++++++++++
haspirater/leavestrie.py | 30++++++++++++++++++++++++++++++
haspirater/majoritytrie.py | 35+++++++++++++++++++++++++++++++++++
haspirater/trie2dot.py | 71+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
haspirater/uptrie.py | 40++++++++++++++++++++++++++++++++++++++++
leavestrie.py | 30------------------------------
majoritytrie.py | 35-----------------------------------
make.sh | 6+++---
setup.py | 20++++++++++++++++++++
trie2dot.py | 71-----------------------------------------------------------------------
uptrie.py | 40----------------------------------------
24 files changed, 360 insertions(+), 317 deletions(-)

diff --git a/.gitignore b/.gitignore @@ -1 +1,5 @@ __pycache__/ +dist/ +haspirater.egg-info/ +build/ + diff --git a/LICENSE b/LICENSE @@ -0,0 +1,18 @@ +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/README b/README @@ -63,7 +63,7 @@ and are not stored as-is; they are just piped later on in the training phase. make.sh produces on stdout the json trie. Thus, you would run something like: - $ cat corpus | ./make.sh exceptions > haspirater.json + $ cat corpus | ./make.sh exceptions > haspirater/haspirater.json == 4. Training details == diff --git a/buildtrie.py b/buildtrie.py @@ -1,43 +0,0 @@ -#!/usr/bin/env python3 - -"""From a list of values (arbitrary) and keys (words), create a trie -representing this mapping""" - -import json -import sys - -# first item is a dictionnary from values to an int indicating the -# number of occurrences with this prefix having this value -# second item is a dictionnary from letters to descendent nodes -def empty_node(): - return [{}, {}] - -def insert(trie, key, val): - """Insert val for key in trie""" - values, children = trie - # create a new value, if needed - if val not in values.keys(): - values[val] = 0 - # increment count for val - values[val] += 1 - if len(key) > 0: - # create a new node if needed - if key[0] not in children.keys(): - children[key[0]] = empty_node() - # recurse - return insert(children[key[0]], key[1:], val) - -if __name__ == '__main__': - trie = empty_node() - - for line in sys.stdin.readlines(): - line = line.split() - value = line[0] - word = line[1].lower() if len(line) == 2 else '' - # a trailing space is used to mark termination of the word - # this is useful in cases where a prefix of a word is a complete, - # different word with a different value - insert(trie, word+' ', value) - - print(json.dumps(trie)) - diff --git a/buildtrie_list.py b/buildtrie_list.py @@ -1,22 +0,0 @@ -#!/usr/bin/env python3 - -"""From a list of values (arbitrary) and keys (words), create a trie -representing this mapping""" - -# this modified version is used by plint -# see https://a3nm.net/git/plint - -import buildtrie -import json -import sys - -trie = buildtrie.empty_node() - -for line in sys.stdin.readlines(): - line = line.split() - value = line[0] - word = line[1:] - buildtrie.insert(trie, word+['-', '-'], value) - -print(json.dumps(trie)) - diff --git a/compresstrie.py b/compresstrie.py @@ -1,22 +0,0 @@ -#!/usr/bin/env python3 - -"""Read json trie in stdin, trim unneeded branches and output json dump -to stdout""" - -import json -import sys - -trie = json.load(sys.stdin) - -def compress(trie): - """Compress the trie""" - if len(trie[0].keys()) <= 1: - # no need for children, there is no more doubt - trie[1] = {} - for child in trie[1].values(): - compress(child) - -compress(trie) - -print(json.dumps(trie)) - diff --git a/haspirater.json b/haspirater.json @@ -1 +0,0 @@ -[["0"], {"\u00e9": [["0"], {"b": [["0"], {"\u00e9": [["0"], {"c": [["1"], {}]}]}], "l": [["0"], {"\u00e9": [["1"], {}], "a": [["1"], {"s": [["0"], {" ": [["0", "1"], {}]}]}], "i": [["0"], {"p": [["1"], {}]}], "e": [["1"], {}], "\u00e8": [["0", "1"], {"n": [["0"], {}], "r": [["1"], {}]}]}], "r": [["1"], {"\u00e9": [["0"], {}], "a": [["1"], {"c": [["0"], {}]}], "o": [["1"], {"\u00ef": [["0"], {}]}], "i": [["0"], {"s": [["1"], {}]}]}], "q": [["1"], {}]}], "\u00f4": [["0"], {"l": [["1"], {}]}], "o": [["0"], {"t": [["1"], {}], "m": [["0"], {"e": [["1"], {}], "a": [["1"], {}], " ": [["1"], {}]}], "o": [["1"], {}], "h": [["1"], {}], " ": [["1"], {}], "y": [["1"], {}], "l": [["1"], {"o": [["0"], {}]}], "u": [["1"], {}], "b": [["1"], {}], "r": [["0"], {"m": [["0", "1"], {"o": [["0"], {}], "i": [["1"], {}]}], "s": [["1"], {}], "n": [["1"], {}], "i": [["0"], {"o": [["1"], {}]}], "d": [["1"], {}]}], "c": [["1"], {}], "s": [["0"], {"a": [["1"], {}]}], "n": [["0"], {"t": [["1"], {}], "s": [["1"], {}], "n": [["0"], {"i": [["1"], {}]}], " ": [["1"], {}], "d": [["1"], {}], "g": [["1"], {}]}], "p": [["1"], {}], "w": [["1"], {}], "g": [["1"], {}], "d": [["1"], {}], "q": [["1"], {}]}], "e": [["0"], {"t": [["1"], {}], "m": [["1"], {}], "s": [["1"], {}], "n": [["1"], {}], "p": [["1"], {}], "a": [["1"], {"u": [["1"], {"t": [["0"], {}]}]}], "i": [["1"], {}], "l": [["0"], {"l": [["0"], {"o": [["1"], {}], "e": [["0"], {"b": [["1"], {}]}]}]}], "r": [["0"], {"c": [["1"], {"u": [["0"], {}]}], "t": [["1"], {}], "m": [["0"], {"i": [["0"], {"t": [["0"], {"i": [["1"], {}]}]}]}], "s": [["1"], {}], "n": [["1"], {}], "p": [["1"], {}]}], "u": [["0"], {"/": [["1"], {}], "s": [["1"], {}], "l": [["1"], {}], "r": [["0"], {"t": [["1"], {}]}]}]}], "y": [["0"], {"a": [["0", "1"], {"c": [["0"], {}], "l": [["1"], {}]}]}], "l": [["1"], {}], "u": [["1"], {"m": [["0"], {"\u00e9": [["1"], {}], "b": [["0"], {"o": [["1"], {}], "l": [["0"], {"e": [["0"], {" ": [["0", "1"], {}], "s": [["0", "1"], {}]}]}]}], "o": [["1"], {"u": [["0"], {}]}], "e": [["0"], {"z": [["1"], {}], "m": [["1"], {}], "n": [["1"], {}], " ": [["1"], {}], "u": [["0"], {"x": [["1"], {}]}], "r": [["1"], {}]}], "a": [["0"], {" ": [["1"], {}], "i": [["0"], {"t": [["1"], {}], "e": [["1"], {}], " ": [["1"], {}], "s": [["1"], {}]}], "n": [["0"], {"t": [["1"], {}]}], "g": [["1"], {}]}], "p": [["1"], {}], " ": [["1"], {}], "i": [["0"], {"o": [["1"], {}]}], "\u00e8": [["1"], {}]}], "\u00ee": [["0"], {}], "l": [["1"], {"u": [["1"], {"b": [["0"], {}]}]}], "i": [["1"], {"s": [["0"], {}], "l": [["0"], {}]}], "d": [["0"], {}]}], "\u00e2": [["1"], {}], "\u00ea": [["1"], {}], "a": [["1"], {"l": [["1"], {"o": [["1"], {"g": [["0"], {}]}], "e": [["1"], {"i": [["0"], {}]}], "l": [["1"], {"a": [["1"], {"l": [["0"], {}]}], "u": [["0"], {}]}], "i": [["0", "1"], {"b": [["1"], {}], "o": [["0"], {}]}], "\u00e8": [["1"], {"n": [["0"], {}]}]}], "b": [["0"], {"a": [["1"], {}], "o": [["1"], {}]}], "\u00ff": [["0"], {}], "r": [["1"], {"m": [["0"], {}]}], "d": [["1"], {"o": [["0", "1"], {"p": [["0", "1"], {"i": [["0", "1"], {" ": [["1"], {}], "s": [["0"], {}]}]}]}], "r": [["0"], {}]}], "v": [["0"], {"e": [["0"], {"s": [["1"], {}], "n": [["1"], {}], " ": [["1"], {}], "l": [["0"], {"\u00e9": [["0"], {"e": [["0"], {" ": [["0", "1"], {}]}]}], "e": [["0"], {"r": [["0", "1"], {}]}]}], "r": [["1"], {}], "u": [["1"], {}]}], "a": [["1"], {}], "r": [["1"], {}], "i": [["1"], {}], "u": [["1"], {}]}]}], " ": [["0", "1"], {}], "i": [["0"], {"\u00e9": [["1"], {}], "t": [["1"], {}], "m": [["1"], {}], "e": [["1"], {"r": [["0"], {" ": [["0", "1"], {}]}]}], "h": [["1"], {}], "b": [["0"], {"o": [["1"], {}]}], "l": [["0", "1"], {"a": [["0"], {"i": [["1"], {}]}], "d": [["0", "1"], {"e": [["0", "1"], {"s": [["0"], {}], "g": [["1"], {}]}]}], "b": [["1"], {}], "o": [["1"], {}], "e": [["1"], {}]}], "g": [["1"], {}], "c": [["1"], {}], "s": [["0"], {"s": [["1"], {}]}], "n": [["1"], {"d": [["0", "1"], {"o": [["0"], {}], "i": [["1"], {}]}]}], "a": [["0", "1"], {"t": [["0", "1"], {"a": [["1"], {}], "u": [["0"], {}]}]}], "p": [["0"], {"h": [["1"], {}], "p": [["0"], {"i": [["1"], {}]}], " ": [["1"], {}]}], " ": [["1"], {}], "d": [["1"], {}], "r": [["1"], {"o": [["1"], {"n": [["0"], {}]}]}], "f": [["1"], {}]}], "\u00e1": [["1"], {}], "\u00e8": [["1"], {"b": [["0"], {}]}]}] diff --git a/haspirater.py b/haspirater.py @@ -1,49 +0,0 @@ -#!/usr/bin/python3 - -"""Determine if a French word starts by an aspirated 'h' or not, by a -lookup in a precompiled trie""" - -import os -import json -import sys - -f = open(os.path.join(os.path.dirname( - os.path.realpath(__file__)), 'haspirater.json')) -trie = json.load(f) -f.close() - -def do_lookup(trie, key): - if len(key) == 0 or (key[0] not in trie[1].keys()): - return trie[0] - return do_lookup(trie[1][key[0]], key[1:]) - -def lookup(key): - """Return True iff key starts with an aspirated 'h'""" - if key == '' or key[0] != 'h': - raise ValueError - return list(map((lambda x: x == "1"), do_lookup(trie, key[1:] + ' '))) - -def wrap_lookup(line): - line = line.lower().lstrip().rstrip() - try: - result = lookup(line) - if True in result and not False in result: - print("%s: aspirated" % line) - elif False in result and not True in result: - print("%s: not aspirated" % line) - else: - print("%s: ambiguous" % line) - except ValueError: - print("%s: no leading 'h'" % line) - -if __name__ == '__main__': - if len(sys.argv) > 1: - for arg in sys.argv[1:]: - wrap_lookup(arg) - else: - while True: - line = sys.stdin.readline() - if not line: - break - wrap_lookup(line) - diff --git a/haspirater/__init__.py b/haspirater/__init__.py @@ -0,0 +1 @@ +from .haspirater import * diff --git a/haspirater/buildtrie.py b/haspirater/buildtrie.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 + +"""From a list of values (arbitrary) and keys (words), create a trie +representing this mapping""" + +import json +import sys + +# first item is a dictionnary from values to an int indicating the +# number of occurrences with this prefix having this value +# second item is a dictionnary from letters to descendent nodes +def empty_node(): + return [{}, {}] + +def insert(trie, key, val): + """Insert val for key in trie""" + values, children = trie + # create a new value, if needed + if val not in values.keys(): + values[val] = 0 + # increment count for val + values[val] += 1 + if len(key) > 0: + # create a new node if needed + if key[0] not in children.keys(): + children[key[0]] = empty_node() + # recurse + return insert(children[key[0]], key[1:], val) + +if __name__ == '__main__': + trie = empty_node() + + for line in sys.stdin.readlines(): + line = line.split() + value = line[0] + word = line[1].lower() if len(line) == 2 else '' + # a trailing space is used to mark termination of the word + # this is useful in cases where a prefix of a word is a complete, + # different word with a different value + insert(trie, word+' ', value) + + print(json.dumps(trie)) + diff --git a/haspirater/buildtrie_list.py b/haspirater/buildtrie_list.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python3 + +"""From a list of values (arbitrary) and keys (words), create a trie +representing this mapping""" + +# this modified version is used by plint +# see https://a3nm.net/git/plint + +import haspirater.buildtrie +import json +import sys + +trie = buildtrie.empty_node() + +for line in sys.stdin.readlines(): + line = line.split() + value = line[0] + word = line[1:] + buildtrie.insert(trie, word+['-', '-'], value) + +print(json.dumps(trie)) + diff --git a/haspirater/compresstrie.py b/haspirater/compresstrie.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python3 + +"""Read json trie in stdin, trim unneeded branches and output json dump +to stdout""" + +import json +import sys + +trie = json.load(sys.stdin) + +def compress(trie): + """Compress the trie""" + if len(trie[0].keys()) <= 1: + # no need for children, there is no more doubt + trie[1] = {} + for child in trie[1].values(): + compress(child) + +compress(trie) + +print(json.dumps(trie)) + diff --git a/haspirater/haspirater.json b/haspirater/haspirater.json @@ -0,0 +1 @@ +[["0"], {"\u00e9": [["0"], {"b": [["0"], {"\u00e9": [["0"], {"c": [["1"], {}]}]}], "l": [["0"], {"\u00e9": [["1"], {}], "a": [["1"], {"s": [["0"], {" ": [["0", "1"], {}]}]}], "i": [["0"], {"p": [["1"], {}]}], "e": [["1"], {}], "\u00e8": [["0", "1"], {"n": [["0"], {}], "r": [["1"], {}]}]}], "r": [["1"], {"\u00e9": [["0"], {}], "a": [["1"], {"c": [["0"], {}]}], "o": [["1"], {"\u00ef": [["0"], {}]}], "i": [["0"], {"s": [["1"], {}]}]}], "q": [["1"], {}]}], "\u00f4": [["0"], {"l": [["1"], {}]}], "o": [["0"], {"t": [["1"], {}], "m": [["0"], {"e": [["1"], {}], "a": [["1"], {}], " ": [["1"], {}]}], "o": [["1"], {}], "h": [["1"], {}], " ": [["1"], {}], "y": [["1"], {}], "l": [["1"], {"o": [["0"], {}]}], "u": [["1"], {}], "b": [["1"], {}], "r": [["0"], {"m": [["0", "1"], {"o": [["0"], {}], "i": [["1"], {}]}], "s": [["1"], {}], "n": [["1"], {}], "i": [["0"], {"o": [["1"], {}]}], "d": [["1"], {}]}], "c": [["1"], {}], "s": [["0"], {"a": [["1"], {}]}], "n": [["0"], {"t": [["1"], {}], "s": [["1"], {}], "n": [["0"], {"i": [["1"], {}]}], " ": [["1"], {}], "d": [["1"], {}], "g": [["1"], {}]}], "p": [["1"], {}], "w": [["1"], {}], "g": [["1"], {}], "d": [["1"], {}], "q": [["1"], {}]}], "e": [["0"], {"t": [["1"], {}], "m": [["1"], {}], "s": [["1"], {}], "n": [["1"], {}], "p": [["1"], {}], "a": [["1"], {"u": [["1"], {"t": [["0"], {}]}]}], "i": [["1"], {}], "l": [["0"], {"l": [["0"], {"o": [["1"], {}], "e": [["0"], {"b": [["1"], {}]}]}]}], "r": [["0"], {"c": [["1"], {"u": [["0"], {}]}], "t": [["1"], {}], "m": [["0"], {"i": [["0"], {"t": [["0"], {"i": [["1"], {}]}]}]}], "s": [["1"], {}], "n": [["1"], {}], "p": [["1"], {}]}], "u": [["0"], {"/": [["1"], {}], "s": [["1"], {}], "l": [["1"], {}], "r": [["0"], {"t": [["1"], {}]}]}]}], "y": [["0"], {"a": [["0", "1"], {"c": [["0"], {}], "l": [["1"], {}]}]}], "l": [["1"], {}], "u": [["1"], {"m": [["0"], {"\u00e9": [["1"], {}], "b": [["0"], {"o": [["1"], {}], "l": [["0"], {"e": [["0"], {" ": [["0", "1"], {}], "s": [["0", "1"], {}]}]}]}], "o": [["1"], {"u": [["0"], {}]}], "e": [["0"], {"z": [["1"], {}], "m": [["1"], {}], "n": [["1"], {}], " ": [["1"], {}], "u": [["0"], {"x": [["1"], {}]}], "r": [["1"], {}]}], "a": [["0"], {" ": [["1"], {}], "i": [["0"], {"t": [["1"], {}], "e": [["1"], {}], " ": [["1"], {}], "s": [["1"], {}]}], "n": [["0"], {"t": [["1"], {}]}], "g": [["1"], {}]}], "p": [["1"], {}], " ": [["1"], {}], "i": [["0"], {"o": [["1"], {}]}], "\u00e8": [["1"], {}]}], "\u00ee": [["0"], {}], "l": [["1"], {"u": [["1"], {"b": [["0"], {}]}]}], "i": [["1"], {"s": [["0"], {}], "l": [["0"], {}]}], "d": [["0"], {}]}], "\u00e2": [["1"], {}], "\u00ea": [["1"], {}], "a": [["1"], {"l": [["1"], {"o": [["1"], {"g": [["0"], {}]}], "e": [["1"], {"i": [["0"], {}]}], "l": [["1"], {"a": [["1"], {"l": [["0"], {}]}], "u": [["0"], {}]}], "i": [["0", "1"], {"b": [["1"], {}], "o": [["0"], {}]}], "\u00e8": [["1"], {"n": [["0"], {}]}]}], "b": [["0"], {"a": [["1"], {}], "o": [["1"], {}]}], "\u00ff": [["0"], {}], "r": [["1"], {"m": [["0"], {}]}], "d": [["1"], {"o": [["0", "1"], {"p": [["0", "1"], {"i": [["0", "1"], {" ": [["1"], {}], "s": [["0"], {}]}]}]}], "r": [["0"], {}]}], "v": [["0"], {"e": [["0"], {"s": [["1"], {}], "n": [["1"], {}], " ": [["1"], {}], "l": [["0"], {"\u00e9": [["0"], {"e": [["0"], {" ": [["0", "1"], {}]}]}], "e": [["0"], {"r": [["0", "1"], {}]}]}], "r": [["1"], {}], "u": [["1"], {}]}], "a": [["1"], {}], "r": [["1"], {}], "i": [["1"], {}], "u": [["1"], {}]}]}], " ": [["0", "1"], {}], "i": [["0"], {"\u00e9": [["1"], {}], "t": [["1"], {}], "m": [["1"], {}], "e": [["1"], {"r": [["0"], {" ": [["0", "1"], {}]}]}], "h": [["1"], {}], "b": [["0"], {"o": [["1"], {}]}], "l": [["0", "1"], {"a": [["0"], {"i": [["1"], {}]}], "d": [["0", "1"], {"e": [["0", "1"], {"s": [["0"], {}], "g": [["1"], {}]}]}], "b": [["1"], {}], "o": [["1"], {}], "e": [["1"], {}]}], "g": [["1"], {}], "c": [["1"], {}], "s": [["0"], {"s": [["1"], {}]}], "n": [["1"], {"d": [["0", "1"], {"o": [["0"], {}], "i": [["1"], {}]}]}], "a": [["0", "1"], {"t": [["0", "1"], {"a": [["1"], {}], "u": [["0"], {}]}]}], "p": [["0"], {"h": [["1"], {}], "p": [["0"], {"i": [["1"], {}]}], " ": [["1"], {}]}], " ": [["1"], {}], "d": [["1"], {}], "r": [["1"], {"o": [["1"], {"n": [["0"], {}]}]}], "f": [["1"], {}]}], "\u00e1": [["1"], {}], "\u00e8": [["1"], {"b": [["0"], {}]}]}] diff --git a/haspirater/haspirater.py b/haspirater/haspirater.py @@ -0,0 +1,49 @@ +#!/usr/bin/python3 + +"""Determine if a French word starts by an aspirated 'h' or not, by a +lookup in a precompiled trie""" + +import os +import json +import sys + +f = open(os.path.join(os.path.dirname( + os.path.realpath(__file__)), 'haspirater.json')) +trie = json.load(f) +f.close() + +def do_lookup(trie, key): + if len(key) == 0 or (key[0] not in trie[1].keys()): + return trie[0] + return do_lookup(trie[1][key[0]], key[1:]) + +def lookup(key): + """Return True iff key starts with an aspirated 'h'""" + if key == '' or key[0] != 'h': + raise ValueError + return list(map((lambda x: x == "1"), do_lookup(trie, key[1:] + ' '))) + +def wrap_lookup(line): + line = line.lower().lstrip().rstrip() + try: + result = lookup(line) + if True in result and not False in result: + print("%s: aspirated" % line) + elif False in result and not True in result: + print("%s: not aspirated" % line) + else: + print("%s: ambiguous" % line) + except ValueError: + print("%s: no leading 'h'" % line) + +if __name__ == '__main__': + if len(sys.argv) > 1: + for arg in sys.argv[1:]: + wrap_lookup(arg) + else: + while True: + line = sys.stdin.readline() + if not line: + break + wrap_lookup(line) + diff --git a/haspirater/leavestrie.py b/haspirater/leavestrie.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 + +"""Read json trie in stdin, produce leaves and values +argv[1] is 1 or -1 to reverse the label sequence or not""" + +import json +import sys + +trie = json.load(sys.stdin) + +def leaves(trie, prefix="", provisional=None): + """Keep only the most probable values at each node""" + if len(trie[1].keys()) == 0: + assert(len(trie[0].keys()) == 1) + k, v = trie[0].popitem() + if (k != provisional): + # does not agree with provisional decision so far + print("%s\t%s" % (k, prefix[::int(sys.argv[1])])) + # decided nodes + if len(trie) == 3 and trie[2]: + if (trie[2] != provisional): + # does not agree with provisional decision so far + print("%s\t%s" % (trie[2], prefix[::int(sys.argv[1])])) + if len(trie) == 3: + provisional = trie[2] + for child in trie[1].keys(): + leaves(trie[1][child], prefix + child, provisional) + +leaves(trie) + diff --git a/haspirater/majoritytrie.py b/haspirater/majoritytrie.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 + +"""Read json trie in stdin, keep majority value at each node, remove +useless leaf nodes and output trie to stdout""" + +import json +import sys + +trie = json.load(sys.stdin) + +def get_majority(d): + """What are the most probable values?""" + mx = max(d.values()) + return [k for k in d.keys() if d[k] == mx] + +def majority(trie): + """Keep only the most probable values at each node""" + if len(trie[1].keys()) == 0: + # keep all options at leaf nodes + trie[0] = list(trie[0].keys()) + else: + trie[0] = get_majority(trie[0]) + useless = [] + for child in trie[1].keys(): + majority(trie[1][child]) + # if it is relabeled to our majority value and is a leaf, drop it + if trie[1][child][0] == trie[0] and trie[1][child][1] == {}: + useless.append(child) + for child in useless: + del(trie[1][child]) + +majority(trie) + +print(json.dumps(trie)) + diff --git a/haspirater/trie2dot.py b/haspirater/trie2dot.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 + +"""Takes json as input with labels [value1, value2] and produces dot, +usage: trie2dot.py prefix value1 value2""" + +import json +import sys +from math import log + +trie = json.load(sys.stdin) + +free_id = 0 + +def cget(d, k): + try: + if k in d.keys(): + return d[k] + else: + return 0 + except AttributeError: + # we have a list, not a dictionary + # this happens after majoritytrie.py + if k in d: + return 1 + else: + return 0 + +def int2strbyte(i): + s = hex(i).split('x')[1] + if len(s) == 1: + return '0' + s + else: + return s + +def fraction2rgb(fraction): + n = int(255*fraction) + return int2strbyte(n)+'00'+int2strbyte(255 - n) + +def total(x): + key, node = x + try: + return sum(node[0].values()) + except AttributeError: + # we have only one value, not a dictionary + return 1 + +def to_dot(trie, prefix=''): + global free_id + + values, children = trie + my_id = free_id + free_id += 1 + count = cget(values, v1) + cget(values, v2) + fraction = cget(values, v2) / count + + print("%d [label=\"%s\",color=\"#%s\",penwidth=%d]" % (my_id, prefix, + fraction2rgb(fraction), 1+int(log(count)))) + + for (key, child) in sorted(children.items(), key=total, reverse=True): + i = to_dot(child, prefix+key) + print("%d -> %d [penwidth=%d]" % (my_id, i, + 1+int(log(total((None, child)))))) + + return my_id + +print("digraph G {\naspect=\"1\"\n") +prefix = sys.argv[1] +v1 = sys.argv[2] +v2 = sys.argv[3] +to_dot(trie, prefix) +print("}") diff --git a/haspirater/uptrie.py b/haspirater/uptrie.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 + +"""Read json trie in stdin, make internal node decisions and output json dump to +stdout""" + +import itertools +import operator +import json +import sys + +trie = json.load(sys.stdin) + +def uptrie(trie): + """Make internal node decisions if possible""" + for child in trie[1].values(): + uptrie(child) + decided_children = [(list(t[0].items())[0][0], t) for t in trie[1].values() if + len(t[0].keys()) == 1] + dchild_g = {} + for (x, y) in decided_children: + if x not in dchild_g.keys(): + dchild_g[x] = [] + dchild_g[x].append(y) + sums = [(x, len(y)) for (x, y) in dchild_g.items()] + if len(sums) == 0: + return + best = max(sums, key=operator.itemgetter(1)) + if best[1] >= 2: + # compress here + trie.append(best[0]) + nchildren = {} + for key, child in trie[1].items(): + if len(child[0].keys()) != 1 or list(child[0].items())[0][0] != best[0]: + nchildren[key] = child + trie[1] = nchildren + +uptrie(trie) + +print(json.dumps(trie)) + diff --git a/leavestrie.py b/leavestrie.py @@ -1,30 +0,0 @@ -#!/usr/bin/env python3 - -"""Read json trie in stdin, produce leaves and values -argv[1] is 1 or -1 to reverse the label sequence or not""" - -import json -import sys - -trie = json.load(sys.stdin) - -def leaves(trie, prefix="", provisional=None): - """Keep only the most probable values at each node""" - if len(trie[1].keys()) == 0: - assert(len(trie[0].keys()) == 1) - k, v = trie[0].popitem() - if (k != provisional): - # does not agree with provisional decision so far - print("%s\t%s" % (k, prefix[::int(sys.argv[1])])) - # decided nodes - if len(trie) == 3 and trie[2]: - if (trie[2] != provisional): - # does not agree with provisional decision so far - print("%s\t%s" % (trie[2], prefix[::int(sys.argv[1])])) - if len(trie) == 3: - provisional = trie[2] - for child in trie[1].keys(): - leaves(trie[1][child], prefix + child, provisional) - -leaves(trie) - diff --git a/majoritytrie.py b/majoritytrie.py @@ -1,35 +0,0 @@ -#!/usr/bin/env python3 - -"""Read json trie in stdin, keep majority value at each node, remove -useless leaf nodes and output trie to stdout""" - -import json -import sys - -trie = json.load(sys.stdin) - -def get_majority(d): - """What are the most probable values?""" - mx = max(d.values()) - return [k for k in d.keys() if d[k] == mx] - -def majority(trie): - """Keep only the most probable values at each node""" - if len(trie[1].keys()) == 0: - # keep all options at leaf nodes - trie[0] = list(trie[0].keys()) - else: - trie[0] = get_majority(trie[0]) - useless = [] - for child in trie[1].keys(): - majority(trie[1][child]) - # if it is relabeled to our majority value and is a leaf, drop it - if trie[1][child][0] == trie[0] and trie[1][child][1] == {}: - useless.append(child) - for child in useless: - del(trie[1][child]) - -majority(trie) - -print(json.dumps(trie)) - diff --git a/make.sh b/make.sh @@ -7,7 +7,7 @@ ./detect.pl | # identify and label occurrences cat - $* | # add in exceptions sed 's/ h/ /' | # we don't keep the useless leading 'h' in the trie - ./buildtrie.py | # prepare the trie - ./compresstrie.py | # compress the trie - ./majoritytrie.py # keep only the most frequent information + ./haspirater/buildtrie.py | # prepare the trie + ./haspirater/compresstrie.py | # compress the trie + ./haspirater/majoritytrie.py # keep only the most frequent information diff --git a/setup.py b/setup.py @@ -0,0 +1,20 @@ +import setuptools + +with open("README", "r") as fh: + long_description = fh.read() + +setuptools.setup( + name='haspirater', + version='0.2', + author="Antoine Amarilli", + author_email="a3nm@a3nm.net", + package_data={'haspirater' :['*json']}, + description="detect aspirated 'h' in French words", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://gitlab.com/a3nm/haspirater", + packages=setuptools.find_packages(), + classifiers=[ + "Programming Language :: Python :: 3", + ], +) diff --git a/trie2dot.py b/trie2dot.py @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 - -"""Takes json as input with labels [value1, value2] and produces dot, -usage: trie2dot.py prefix value1 value2""" - -import json -import sys -from math import log - -trie = json.load(sys.stdin) - -free_id = 0 - -def cget(d, k): - try: - if k in d.keys(): - return d[k] - else: - return 0 - except AttributeError: - # we have a list, not a dictionary - # this happens after majoritytrie.py - if k in d: - return 1 - else: - return 0 - -def int2strbyte(i): - s = hex(i).split('x')[1] - if len(s) == 1: - return '0' + s - else: - return s - -def fraction2rgb(fraction): - n = int(255*fraction) - return int2strbyte(n)+'00'+int2strbyte(255 - n) - -def total(x): - key, node = x - try: - return sum(node[0].values()) - except AttributeError: - # we have only one value, not a dictionary - return 1 - -def to_dot(trie, prefix=''): - global free_id - - values, children = trie - my_id = free_id - free_id += 1 - count = cget(values, v1) + cget(values, v2) - fraction = cget(values, v2) / count - - print("%d [label=\"%s\",color=\"#%s\",penwidth=%d]" % (my_id, prefix, - fraction2rgb(fraction), 1+int(log(count)))) - - for (key, child) in sorted(children.items(), key=total, reverse=True): - i = to_dot(child, prefix+key) - print("%d -> %d [penwidth=%d]" % (my_id, i, - 1+int(log(total((None, child)))))) - - return my_id - -print("digraph G {\naspect=\"1\"\n") -prefix = sys.argv[1] -v1 = sys.argv[2] -v2 = sys.argv[3] -to_dot(trie, prefix) -print("}") diff --git a/uptrie.py b/uptrie.py @@ -1,40 +0,0 @@ -#!/usr/bin/env python3 - -"""Read json trie in stdin, make internal node decisions and output json dump to -stdout""" - -import itertools -import operator -import json -import sys - -trie = json.load(sys.stdin) - -def uptrie(trie): - """Make internal node decisions if possible""" - for child in trie[1].values(): - uptrie(child) - decided_children = [(list(t[0].items())[0][0], t) for t in trie[1].values() if - len(t[0].keys()) == 1] - dchild_g = {} - for (x, y) in decided_children: - if x not in dchild_g.keys(): - dchild_g[x] = [] - dchild_g[x].append(y) - sums = [(x, len(y)) for (x, y) in dchild_g.items()] - if len(sums) == 0: - return - best = max(sums, key=operator.itemgetter(1)) - if best[1] >= 2: - # compress here - trie.append(best[0]) - nchildren = {} - for key, child in trie[1].items(): - if len(child[0].keys()) != 1 or list(child[0].items())[0][0] != best[0]: - nchildren[key] = child - trie[1] = nchildren - -uptrie(trie) - -print(json.dumps(trie)) -