commit ae59e299f0d0b9ce4255d6b07dc718b32edd980d
parent b79b3eed3cf78fd00cda450eb38b60413ff67a9f
Author: a3nm <a3nm@a3nm.net>
Date: Thu, 15 Aug 2019 21:47:53 +0000
Merge branch 'master' into 'master'
Setup Package
See merge request a3nm/haspirater!1
Diffstat:
24 files changed, 360 insertions(+), 317 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -1 +1,5 @@
__pycache__/
+dist/
+haspirater.egg-info/
+build/
+
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,18 @@
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/README b/README
@@ -63,7 +63,7 @@ and are not stored as-is; they are just piped later on in the training
phase. make.sh produces on stdout the json trie. Thus, you would run
something like:
- $ cat corpus | ./make.sh exceptions > haspirater.json
+ $ cat corpus | ./make.sh exceptions > haspirater/haspirater.json
== 4. Training details ==
diff --git a/buildtrie.py b/buildtrie.py
@@ -1,43 +0,0 @@
-#!/usr/bin/env python3
-
-"""From a list of values (arbitrary) and keys (words), create a trie
-representing this mapping"""
-
-import json
-import sys
-
-# first item is a dictionnary from values to an int indicating the
-# number of occurrences with this prefix having this value
-# second item is a dictionnary from letters to descendent nodes
-def empty_node():
- return [{}, {}]
-
-def insert(trie, key, val):
- """Insert val for key in trie"""
- values, children = trie
- # create a new value, if needed
- if val not in values.keys():
- values[val] = 0
- # increment count for val
- values[val] += 1
- if len(key) > 0:
- # create a new node if needed
- if key[0] not in children.keys():
- children[key[0]] = empty_node()
- # recurse
- return insert(children[key[0]], key[1:], val)
-
-if __name__ == '__main__':
- trie = empty_node()
-
- for line in sys.stdin.readlines():
- line = line.split()
- value = line[0]
- word = line[1].lower() if len(line) == 2 else ''
- # a trailing space is used to mark termination of the word
- # this is useful in cases where a prefix of a word is a complete,
- # different word with a different value
- insert(trie, word+' ', value)
-
- print(json.dumps(trie))
-
diff --git a/buildtrie_list.py b/buildtrie_list.py
@@ -1,22 +0,0 @@
-#!/usr/bin/env python3
-
-"""From a list of values (arbitrary) and keys (words), create a trie
-representing this mapping"""
-
-# this modified version is used by plint
-# see https://a3nm.net/git/plint
-
-import buildtrie
-import json
-import sys
-
-trie = buildtrie.empty_node()
-
-for line in sys.stdin.readlines():
- line = line.split()
- value = line[0]
- word = line[1:]
- buildtrie.insert(trie, word+['-', '-'], value)
-
-print(json.dumps(trie))
-
diff --git a/compresstrie.py b/compresstrie.py
@@ -1,22 +0,0 @@
-#!/usr/bin/env python3
-
-"""Read json trie in stdin, trim unneeded branches and output json dump
-to stdout"""
-
-import json
-import sys
-
-trie = json.load(sys.stdin)
-
-def compress(trie):
- """Compress the trie"""
- if len(trie[0].keys()) <= 1:
- # no need for children, there is no more doubt
- trie[1] = {}
- for child in trie[1].values():
- compress(child)
-
-compress(trie)
-
-print(json.dumps(trie))
-
diff --git a/haspirater.json b/haspirater.json
@@ -1 +0,0 @@
-[["0"], {"\u00e9": [["0"], {"b": [["0"], {"\u00e9": [["0"], {"c": [["1"], {}]}]}], "l": [["0"], {"\u00e9": [["1"], {}], "a": [["1"], {"s": [["0"], {" ": [["0", "1"], {}]}]}], "i": [["0"], {"p": [["1"], {}]}], "e": [["1"], {}], "\u00e8": [["0", "1"], {"n": [["0"], {}], "r": [["1"], {}]}]}], "r": [["1"], {"\u00e9": [["0"], {}], "a": [["1"], {"c": [["0"], {}]}], "o": [["1"], {"\u00ef": [["0"], {}]}], "i": [["0"], {"s": [["1"], {}]}]}], "q": [["1"], {}]}], "\u00f4": [["0"], {"l": [["1"], {}]}], "o": [["0"], {"t": [["1"], {}], "m": [["0"], {"e": [["1"], {}], "a": [["1"], {}], " ": [["1"], {}]}], "o": [["1"], {}], "h": [["1"], {}], " ": [["1"], {}], "y": [["1"], {}], "l": [["1"], {"o": [["0"], {}]}], "u": [["1"], {}], "b": [["1"], {}], "r": [["0"], {"m": [["0", "1"], {"o": [["0"], {}], "i": [["1"], {}]}], "s": [["1"], {}], "n": [["1"], {}], "i": [["0"], {"o": [["1"], {}]}], "d": [["1"], {}]}], "c": [["1"], {}], "s": [["0"], {"a": [["1"], {}]}], "n": [["0"], {"t": [["1"], {}], "s": [["1"], {}], "n": [["0"], {"i": [["1"], {}]}], " ": [["1"], {}], "d": [["1"], {}], "g": [["1"], {}]}], "p": [["1"], {}], "w": [["1"], {}], "g": [["1"], {}], "d": [["1"], {}], "q": [["1"], {}]}], "e": [["0"], {"t": [["1"], {}], "m": [["1"], {}], "s": [["1"], {}], "n": [["1"], {}], "p": [["1"], {}], "a": [["1"], {"u": [["1"], {"t": [["0"], {}]}]}], "i": [["1"], {}], "l": [["0"], {"l": [["0"], {"o": [["1"], {}], "e": [["0"], {"b": [["1"], {}]}]}]}], "r": [["0"], {"c": [["1"], {"u": [["0"], {}]}], "t": [["1"], {}], "m": [["0"], {"i": [["0"], {"t": [["0"], {"i": [["1"], {}]}]}]}], "s": [["1"], {}], "n": [["1"], {}], "p": [["1"], {}]}], "u": [["0"], {"/": [["1"], {}], "s": [["1"], {}], "l": [["1"], {}], "r": [["0"], {"t": [["1"], {}]}]}]}], "y": [["0"], {"a": [["0", "1"], {"c": [["0"], {}], "l": [["1"], {}]}]}], "l": [["1"], {}], "u": [["1"], {"m": [["0"], {"\u00e9": [["1"], {}], "b": [["0"], {"o": [["1"], {}], "l": [["0"], {"e": [["0"], {" ": [["0", "1"], {}], "s": [["0", "1"], {}]}]}]}], "o": 
[["1"], {"u": [["0"], {}]}], "e": [["0"], {"z": [["1"], {}], "m": [["1"], {}], "n": [["1"], {}], " ": [["1"], {}], "u": [["0"], {"x": [["1"], {}]}], "r": [["1"], {}]}], "a": [["0"], {" ": [["1"], {}], "i": [["0"], {"t": [["1"], {}], "e": [["1"], {}], " ": [["1"], {}], "s": [["1"], {}]}], "n": [["0"], {"t": [["1"], {}]}], "g": [["1"], {}]}], "p": [["1"], {}], " ": [["1"], {}], "i": [["0"], {"o": [["1"], {}]}], "\u00e8": [["1"], {}]}], "\u00ee": [["0"], {}], "l": [["1"], {"u": [["1"], {"b": [["0"], {}]}]}], "i": [["1"], {"s": [["0"], {}], "l": [["0"], {}]}], "d": [["0"], {}]}], "\u00e2": [["1"], {}], "\u00ea": [["1"], {}], "a": [["1"], {"l": [["1"], {"o": [["1"], {"g": [["0"], {}]}], "e": [["1"], {"i": [["0"], {}]}], "l": [["1"], {"a": [["1"], {"l": [["0"], {}]}], "u": [["0"], {}]}], "i": [["0", "1"], {"b": [["1"], {}], "o": [["0"], {}]}], "\u00e8": [["1"], {"n": [["0"], {}]}]}], "b": [["0"], {"a": [["1"], {}], "o": [["1"], {}]}], "\u00ff": [["0"], {}], "r": [["1"], {"m": [["0"], {}]}], "d": [["1"], {"o": [["0", "1"], {"p": [["0", "1"], {"i": [["0", "1"], {" ": [["1"], {}], "s": [["0"], {}]}]}]}], "r": [["0"], {}]}], "v": [["0"], {"e": [["0"], {"s": [["1"], {}], "n": [["1"], {}], " ": [["1"], {}], "l": [["0"], {"\u00e9": [["0"], {"e": [["0"], {" ": [["0", "1"], {}]}]}], "e": [["0"], {"r": [["0", "1"], {}]}]}], "r": [["1"], {}], "u": [["1"], {}]}], "a": [["1"], {}], "r": [["1"], {}], "i": [["1"], {}], "u": [["1"], {}]}]}], " ": [["0", "1"], {}], "i": [["0"], {"\u00e9": [["1"], {}], "t": [["1"], {}], "m": [["1"], {}], "e": [["1"], {"r": [["0"], {" ": [["0", "1"], {}]}]}], "h": [["1"], {}], "b": [["0"], {"o": [["1"], {}]}], "l": [["0", "1"], {"a": [["0"], {"i": [["1"], {}]}], "d": [["0", "1"], {"e": [["0", "1"], {"s": [["0"], {}], "g": [["1"], {}]}]}], "b": [["1"], {}], "o": [["1"], {}], "e": [["1"], {}]}], "g": [["1"], {}], "c": [["1"], {}], "s": [["0"], {"s": [["1"], {}]}], "n": [["1"], {"d": [["0", "1"], {"o": [["0"], {}], "i": [["1"], {}]}]}], "a": [["0", "1"], 
{"t": [["0", "1"], {"a": [["1"], {}], "u": [["0"], {}]}]}], "p": [["0"], {"h": [["1"], {}], "p": [["0"], {"i": [["1"], {}]}], " ": [["1"], {}]}], " ": [["1"], {}], "d": [["1"], {}], "r": [["1"], {"o": [["1"], {"n": [["0"], {}]}]}], "f": [["1"], {}]}], "\u00e1": [["1"], {}], "\u00e8": [["1"], {"b": [["0"], {}]}]}]
diff --git a/haspirater.py b/haspirater.py
@@ -1,49 +0,0 @@
-#!/usr/bin/python3
-
-"""Determine if a French word starts by an aspirated 'h' or not, by a
-lookup in a precompiled trie"""
-
-import os
-import json
-import sys
-
-f = open(os.path.join(os.path.dirname(
- os.path.realpath(__file__)), 'haspirater.json'))
-trie = json.load(f)
-f.close()
-
-def do_lookup(trie, key):
- if len(key) == 0 or (key[0] not in trie[1].keys()):
- return trie[0]
- return do_lookup(trie[1][key[0]], key[1:])
-
-def lookup(key):
- """Return True iff key starts with an aspirated 'h'"""
- if key == '' or key[0] != 'h':
- raise ValueError
- return list(map((lambda x: x == "1"), do_lookup(trie, key[1:] + ' ')))
-
-def wrap_lookup(line):
- line = line.lower().lstrip().rstrip()
- try:
- result = lookup(line)
- if True in result and not False in result:
- print("%s: aspirated" % line)
- elif False in result and not True in result:
- print("%s: not aspirated" % line)
- else:
- print("%s: ambiguous" % line)
- except ValueError:
- print("%s: no leading 'h'" % line)
-
-if __name__ == '__main__':
- if len(sys.argv) > 1:
- for arg in sys.argv[1:]:
- wrap_lookup(arg)
- else:
- while True:
- line = sys.stdin.readline()
- if not line:
- break
- wrap_lookup(line)
-
diff --git a/haspirater/__init__.py b/haspirater/__init__.py
@@ -0,0 +1 @@
+from .haspirater import *
diff --git a/haspirater/buildtrie.py b/haspirater/buildtrie.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+
+"""From a list of values (arbitrary) and keys (words), create a trie
+representing this mapping"""
+
+import json
+import sys
+
+def empty_node():
+    """Return a fresh, empty trie node: a two-item list.
+
+    The first item is a dictionary from values to an int indicating the
+    number of occurrences with this prefix having this value; the second
+    item is a dictionary from letters to descendant nodes.
+    """
+    values, children = {}, {}
+    return [values, children]
+
+def insert(trie, key, val):
+    """Insert val for key in trie.
+
+    trie is a node as built by empty_node(); key is a sequence of labels
+    (a string, or a list of labels).  The count of val is incremented on
+    the root and on every node along the path spelled by key, and missing
+    nodes are created on the way.  The trie is modified in place and
+    None is returned.
+    """
+    values, children = trie
+    values[val] = values.get(val, 0) + 1
+    # walk down iteratively: recursing on key[1:] would copy the rest of
+    # the key at every level and hit the recursion limit on long keys
+    for label in key:
+        # create a new node if needed
+        if label not in children:
+            children[label] = empty_node()
+        values, children = children[label]
+        values[val] = values.get(val, 0) + 1
+
+if __name__ == '__main__':
+    # read "value word" lines on stdin and dump the resulting trie as json
+    trie = empty_node()
+
+    for raw in sys.stdin.readlines():
+        fields = raw.split()
+        value = fields[0]
+        # anything but exactly two fields is treated as the empty word
+        if len(fields) == 2:
+            word = fields[1].lower()
+        else:
+            word = ''
+        # a trailing space is used to mark termination of the word
+        # this is useful in cases where a prefix of a word is a complete,
+        # different word with a different value
+        insert(trie, word + ' ', value)
+
+    print(json.dumps(trie))
+
diff --git a/haspirater/buildtrie_list.py b/haspirater/buildtrie_list.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+
+"""From a list of values (arbitrary) and keys (words), create a trie
+representing this mapping"""
+
+# this modified version is used by plint
+# see https://a3nm.net/git/plint
+
+import json
+import sys
+
+# bind the submodule under its own name: a plain
+# `import haspirater.buildtrie` only binds `haspirater`, which would leave
+# the bare `buildtrie` used below undefined
+from haspirater import buildtrie
+
+trie = buildtrie.empty_node()
+
+for line in sys.stdin.readlines():
+    line = line.split()
+    value = line[0]
+    # here the key is the list of remaining fields, not a string
+    word = line[1:]
+    # two '-' labels are appended to every key
+    # (presumably as an end-of-key marker -- confirm against plint)
+    buildtrie.insert(trie, word+['-', '-'], value)
+
+print(json.dumps(trie))
+
diff --git a/haspirater/compresstrie.py b/haspirater/compresstrie.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+
+"""Read json trie in stdin, trim unneeded branches and output json dump
+to stdout"""
+
+import json
+import sys
+
+trie = json.load(sys.stdin)
+
+def compress(trie):
+    """Compress the trie in place.
+
+    A node whose value dictionary holds at most one value loses all its
+    children; other nodes are kept and their children are compressed in
+    turn.  Returns None.
+    """
+    if len(trie[0].keys()) <= 1:
+        # no need for children, there is no more doubt
+        trie[1] = {}
+        return
+    for subtrie in trie[1].values():
+        compress(subtrie)
+
+compress(trie)
+
+print(json.dumps(trie))
+
diff --git a/haspirater/haspirater.json b/haspirater/haspirater.json
@@ -0,0 +1 @@
+[["0"], {"\u00e9": [["0"], {"b": [["0"], {"\u00e9": [["0"], {"c": [["1"], {}]}]}], "l": [["0"], {"\u00e9": [["1"], {}], "a": [["1"], {"s": [["0"], {" ": [["0", "1"], {}]}]}], "i": [["0"], {"p": [["1"], {}]}], "e": [["1"], {}], "\u00e8": [["0", "1"], {"n": [["0"], {}], "r": [["1"], {}]}]}], "r": [["1"], {"\u00e9": [["0"], {}], "a": [["1"], {"c": [["0"], {}]}], "o": [["1"], {"\u00ef": [["0"], {}]}], "i": [["0"], {"s": [["1"], {}]}]}], "q": [["1"], {}]}], "\u00f4": [["0"], {"l": [["1"], {}]}], "o": [["0"], {"t": [["1"], {}], "m": [["0"], {"e": [["1"], {}], "a": [["1"], {}], " ": [["1"], {}]}], "o": [["1"], {}], "h": [["1"], {}], " ": [["1"], {}], "y": [["1"], {}], "l": [["1"], {"o": [["0"], {}]}], "u": [["1"], {}], "b": [["1"], {}], "r": [["0"], {"m": [["0", "1"], {"o": [["0"], {}], "i": [["1"], {}]}], "s": [["1"], {}], "n": [["1"], {}], "i": [["0"], {"o": [["1"], {}]}], "d": [["1"], {}]}], "c": [["1"], {}], "s": [["0"], {"a": [["1"], {}]}], "n": [["0"], {"t": [["1"], {}], "s": [["1"], {}], "n": [["0"], {"i": [["1"], {}]}], " ": [["1"], {}], "d": [["1"], {}], "g": [["1"], {}]}], "p": [["1"], {}], "w": [["1"], {}], "g": [["1"], {}], "d": [["1"], {}], "q": [["1"], {}]}], "e": [["0"], {"t": [["1"], {}], "m": [["1"], {}], "s": [["1"], {}], "n": [["1"], {}], "p": [["1"], {}], "a": [["1"], {"u": [["1"], {"t": [["0"], {}]}]}], "i": [["1"], {}], "l": [["0"], {"l": [["0"], {"o": [["1"], {}], "e": [["0"], {"b": [["1"], {}]}]}]}], "r": [["0"], {"c": [["1"], {"u": [["0"], {}]}], "t": [["1"], {}], "m": [["0"], {"i": [["0"], {"t": [["0"], {"i": [["1"], {}]}]}]}], "s": [["1"], {}], "n": [["1"], {}], "p": [["1"], {}]}], "u": [["0"], {"/": [["1"], {}], "s": [["1"], {}], "l": [["1"], {}], "r": [["0"], {"t": [["1"], {}]}]}]}], "y": [["0"], {"a": [["0", "1"], {"c": [["0"], {}], "l": [["1"], {}]}]}], "l": [["1"], {}], "u": [["1"], {"m": [["0"], {"\u00e9": [["1"], {}], "b": [["0"], {"o": [["1"], {}], "l": [["0"], {"e": [["0"], {" ": [["0", "1"], {}], "s": [["0", "1"], {}]}]}]}], "o": 
[["1"], {"u": [["0"], {}]}], "e": [["0"], {"z": [["1"], {}], "m": [["1"], {}], "n": [["1"], {}], " ": [["1"], {}], "u": [["0"], {"x": [["1"], {}]}], "r": [["1"], {}]}], "a": [["0"], {" ": [["1"], {}], "i": [["0"], {"t": [["1"], {}], "e": [["1"], {}], " ": [["1"], {}], "s": [["1"], {}]}], "n": [["0"], {"t": [["1"], {}]}], "g": [["1"], {}]}], "p": [["1"], {}], " ": [["1"], {}], "i": [["0"], {"o": [["1"], {}]}], "\u00e8": [["1"], {}]}], "\u00ee": [["0"], {}], "l": [["1"], {"u": [["1"], {"b": [["0"], {}]}]}], "i": [["1"], {"s": [["0"], {}], "l": [["0"], {}]}], "d": [["0"], {}]}], "\u00e2": [["1"], {}], "\u00ea": [["1"], {}], "a": [["1"], {"l": [["1"], {"o": [["1"], {"g": [["0"], {}]}], "e": [["1"], {"i": [["0"], {}]}], "l": [["1"], {"a": [["1"], {"l": [["0"], {}]}], "u": [["0"], {}]}], "i": [["0", "1"], {"b": [["1"], {}], "o": [["0"], {}]}], "\u00e8": [["1"], {"n": [["0"], {}]}]}], "b": [["0"], {"a": [["1"], {}], "o": [["1"], {}]}], "\u00ff": [["0"], {}], "r": [["1"], {"m": [["0"], {}]}], "d": [["1"], {"o": [["0", "1"], {"p": [["0", "1"], {"i": [["0", "1"], {" ": [["1"], {}], "s": [["0"], {}]}]}]}], "r": [["0"], {}]}], "v": [["0"], {"e": [["0"], {"s": [["1"], {}], "n": [["1"], {}], " ": [["1"], {}], "l": [["0"], {"\u00e9": [["0"], {"e": [["0"], {" ": [["0", "1"], {}]}]}], "e": [["0"], {"r": [["0", "1"], {}]}]}], "r": [["1"], {}], "u": [["1"], {}]}], "a": [["1"], {}], "r": [["1"], {}], "i": [["1"], {}], "u": [["1"], {}]}]}], " ": [["0", "1"], {}], "i": [["0"], {"\u00e9": [["1"], {}], "t": [["1"], {}], "m": [["1"], {}], "e": [["1"], {"r": [["0"], {" ": [["0", "1"], {}]}]}], "h": [["1"], {}], "b": [["0"], {"o": [["1"], {}]}], "l": [["0", "1"], {"a": [["0"], {"i": [["1"], {}]}], "d": [["0", "1"], {"e": [["0", "1"], {"s": [["0"], {}], "g": [["1"], {}]}]}], "b": [["1"], {}], "o": [["1"], {}], "e": [["1"], {}]}], "g": [["1"], {}], "c": [["1"], {}], "s": [["0"], {"s": [["1"], {}]}], "n": [["1"], {"d": [["0", "1"], {"o": [["0"], {}], "i": [["1"], {}]}]}], "a": [["0", "1"], 
{"t": [["0", "1"], {"a": [["1"], {}], "u": [["0"], {}]}]}], "p": [["0"], {"h": [["1"], {}], "p": [["0"], {"i": [["1"], {}]}], " ": [["1"], {}]}], " ": [["1"], {}], "d": [["1"], {}], "r": [["1"], {"o": [["1"], {"n": [["0"], {}]}]}], "f": [["1"], {}]}], "\u00e1": [["1"], {}], "\u00e8": [["1"], {"b": [["0"], {}]}]}]
diff --git a/haspirater/haspirater.py b/haspirater/haspirater.py
@@ -0,0 +1,49 @@
+#!/usr/bin/python3
+
+"""Determine if a French word starts by an aspirated 'h' or not, by a
+lookup in a precompiled trie"""
+
+import os
+import json
+import sys
+
+# load the precompiled trie stored next to this module; the context
+# manager closes the file even if the json cannot be parsed, and the
+# encoding is fixed so loading does not depend on the locale
+with open(os.path.join(os.path.dirname(
+        os.path.realpath(__file__)), 'haspirater.json'),
+        encoding='utf-8') as f:
+    trie = json.load(f)
+
+def do_lookup(trie, key):
+    """Return the values stored at the deepest node matching key.
+
+    Starting at trie, follow the labels of key for as long as a matching
+    child exists, and return the first item (the values) of the node where
+    the walk stops -- either because key is exhausted or because the next
+    label has no child.
+    """
+    node = trie
+    for label in key:
+        children = node[1]
+        if label not in children.keys():
+            break
+        node = children[label]
+    return node[0]
+
+def lookup(key):
+    """Return the aspiration verdicts recorded in the trie for key.
+
+    key must start with a lowercase 'h'.  The result is a list of
+    booleans, one per value stored at the trie node reached for key: True
+    for the value "1" (aspirated), False for any other value.  A list
+    holding both True and False means the trie is ambiguous for this word.
+
+    Raises ValueError if key is empty or does not start with 'h'.
+    """
+    if key == '' or key[0] != 'h':
+        raise ValueError("key must start with 'h'")
+    # the leading 'h' is not looked up in the trie; the trailing space
+    # marks the end of the word
+    return [value == "1" for value in do_lookup(trie, key[1:] + ' ')]
+
+def wrap_lookup(line):
+    """Look up the word in line and print a one-line verdict for it.
+
+    The word is lowercased and stripped of surrounding whitespace first.
+    Prints "aspirated", "not aspirated" or "ambiguous" after the word, or
+    "no leading 'h'" when lookup() rejects it with ValueError.
+    """
+    line = line.lower().strip()
+    try:
+        result = lookup(line)
+    except ValueError:
+        print("%s: no leading 'h'" % line)
+        return
+    has_aspirated = True in result
+    has_mute = False in result
+    if has_aspirated and not has_mute:
+        print("%s: aspirated" % line)
+    elif has_mute and not has_aspirated:
+        print("%s: not aspirated" % line)
+    else:
+        print("%s: ambiguous" % line)
+
+if __name__ == '__main__':
+    words = sys.argv[1:]
+    if not words:
+        # no arguments: read words from stdin, one per line, until EOF
+        # (readline returns '' only at end of file)
+        words = iter(sys.stdin.readline, '')
+    for word in words:
+        wrap_lookup(word)
+
diff --git a/haspirater/leavestrie.py b/haspirater/leavestrie.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+
+"""Read json trie in stdin, produce leaves and values
+argv[1] is 1 or -1 to reverse the label sequence or not"""
+
+import json
+import sys
+
+trie = json.load(sys.stdin)
+
+def leaves(trie, prefix="", provisional=None):
+ """Print "value<TAB>prefix" lines for the trie rooted at this node.
+
+ trie is a node [values, children] with an optional third item; prefix
+ is the label sequence accumulated from the root; provisional is the
+ third item of the closest enclosing node that has one (None at the
+ root).  A line is printed only for a value that differs from
+ provisional.  The printed prefix is sliced with step int(sys.argv[1]):
+ 1 prints it as accumulated, -1 prints it reversed.  The single value of
+ a childless node is popped from its value dictionary.
+ """
+ # childless node: it must hold exactly one value
+ if len(trie[1].keys()) == 0:
+ assert(len(trie[0].keys()) == 1)
+ k, v = trie[0].popitem()
+ if (k != provisional):
+ # does not agree with provisional decision so far
+ print("%s\t%s" % (k, prefix[::int(sys.argv[1])]))
+ # decided nodes
+ # (nodes carrying a non-empty third item; uptrie.py appends such an item)
+ if len(trie) == 3 and trie[2]:
+ if (trie[2] != provisional):
+ # does not agree with provisional decision so far
+ print("%s\t%s" % (trie[2], prefix[::int(sys.argv[1])]))
+ # the third item, when present, becomes the provisional value below here
+ if len(trie) == 3:
+ provisional = trie[2]
+ for child in trie[1].keys():
+ leaves(trie[1][child], prefix + child, provisional)
+
+leaves(trie)
+
diff --git a/haspirater/majoritytrie.py b/haspirater/majoritytrie.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+
+"""Read json trie in stdin, keep majority value at each node, remove
+useless leaf nodes and output trie to stdout"""
+
+import json
+import sys
+
+trie = json.load(sys.stdin)
+
+def get_majority(d):
+    """Return the keys of d that carry the highest count.
+
+    d maps values to counts; every key tied for the maximum is returned,
+    in the iteration order of d.
+    """
+    top = max(d.values())
+    winners = []
+    for key, count in d.items():
+        if count == top:
+            winners.append(key)
+    return winners
+
+def majority(trie):
+    """Replace the value counts of every node by a list of values, in place.
+
+    A childless node keeps all of its values; a node with children keeps
+    only the majority values given by get_majority().  Once a child has
+    been processed, it is dropped if it has no children left and its
+    value list equals the one of its parent.
+    """
+    children = trie[1]
+    if not children:
+        # keep all options at leaf nodes
+        trie[0] = list(trie[0].keys())
+    else:
+        trie[0] = get_majority(trie[0])
+    for child in children.values():
+        majority(child)
+    # if it is relabeled to our majority value and is a leaf, drop it
+    useless = [label for label, child in children.items()
+               if child[0] == trie[0] and child[1] == {}]
+    for label in useless:
+        del children[label]
+
+majority(trie)
+
+print(json.dumps(trie))
+
diff --git a/haspirater/trie2dot.py b/haspirater/trie2dot.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+
+"""Takes json as input with labels [value1, value2] and produces dot,
+usage: trie2dot.py prefix value1 value2"""
+
+import json
+import sys
+from math import log
+
+trie = json.load(sys.stdin)
+
+free_id = 0
+
+def cget(d, k):
+    """Return the count recorded for k in d, or 0 if k is absent.
+
+    d is either a dictionary from values to counts, or a plain list of
+    values, in which case a value that is present counts for 1.
+    """
+    try:
+        known = d.keys()
+    except AttributeError:
+        # we have a list, not a dictionary
+        # this happens after majoritytrie.py
+        return 1 if k in d else 0
+    return d[k] if k in known else 0
+
+def int2strbyte(i):
+    """Return the hexadecimal digits of i, left-padded with '0' to two
+    characters (the '0x' prefix of hex() is dropped)."""
+    digits = hex(i).partition('x')[2]
+    return digits.rjust(2, '0')
+
+def fraction2rgb(fraction):
+    """Return a six-digit hex color for fraction (expected in [0, 1]).
+
+    The first byte is int(255*fraction), the middle byte is always 00 and
+    the last byte is the complement of the first to 255.
+    """
+    first = int(255*fraction)
+    last = 255 - first
+    return int2strbyte(first) + '00' + int2strbyte(last)
+
+def total(x):
+    """Return the number of occurrences recorded at a node.
+
+    x is a (key, node) pair, as yielded by the items() of a children
+    dictionary; the key is ignored.  The result is the sum of the counts
+    of the node, or 1 when the node holds a plain list of values.
+    """
+    _, node = x
+    counts = node[0]
+    try:
+        tallies = counts.values()
+    except AttributeError:
+        # we have only one value, not a dictionary
+        return 1
+    return sum(tallies)
+
+def to_dot(trie, prefix=''):
+    """Print the dot statements for trie and return the id of its root.
+
+    Ids are taken from the global counter free_id.  The node is labeled
+    with prefix and colored by the share of the global value v2 among
+    v1 and v2; its pen width grows with the log of their combined count.
+    Children are visited by decreasing total(), and the edge to a child
+    is printed after the statements of that child's subtree.
+    """
+    global free_id
+
+    values, children = trie
+    my_id = free_id
+    free_id = my_id + 1
+    v2_count = cget(values, v2)
+    count = cget(values, v1) + v2_count
+    fraction = v2_count / count
+    width = 1+int(log(count))
+
+    print("%d [label=\"%s\",color=\"#%s\",penwidth=%d]" % (my_id, prefix,
+        fraction2rgb(fraction), width))
+
+    ordered = sorted(children.items(), key=total, reverse=True)
+    for (key, child) in ordered:
+        child_id = to_dot(child, prefix+key)
+        edge_width = 1+int(log(total((None, child))))
+        print("%d -> %d [penwidth=%d]" % (my_id, child_id, edge_width))
+
+    return my_id
+
+print("digraph G {\naspect=\"1\"\n")
+prefix = sys.argv[1]
+v1 = sys.argv[2]
+v2 = sys.argv[3]
+to_dot(trie, prefix)
+print("}")
diff --git a/haspirater/uptrie.py b/haspirater/uptrie.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+
+"""Read json trie in stdin, make internal node decisions and output json dump to
+stdout"""
+
+import itertools
+import operator
+import json
+import sys
+
+trie = json.load(sys.stdin)
+
+def uptrie(trie):
+ """Make internal node decisions if possible
+
+ Works in place on a node [values, children], children first.  When a
+ decision is made, the decided value is appended to the node as a third
+ item and trie[1] is replaced by a dictionary holding only the children
+ that are not single-valued on that value.
+ """
+ # children are processed before their parent
+ for child in trie[1].values():
+ uptrie(child)
+ # (value, node) for each child whose value dictionary has exactly one key
+ decided_children = [(list(t[0].items())[0][0], t) for t in trie[1].values() if
+ len(t[0].keys()) == 1]
+ # group these children by their value
+ dchild_g = {}
+ for (x, y) in decided_children:
+ if x not in dchild_g.keys():
+ dchild_g[x] = []
+ dchild_g[x].append(y)
+ # (value, number of single-valued children having it)
+ sums = [(x, len(y)) for (x, y) in dchild_g.items()]
+ if len(sums) == 0:
+ return
+ # the value shared by the largest number of single-valued children
+ best = max(sums, key=operator.itemgetter(1))
+ # NOTE(review): indentation is flattened here; the pruning below
+ # presumably belongs entirely to this branch -- confirm
+ if best[1] >= 2:
+ # compress here
+ trie.append(best[0])
+ nchildren = {}
+ for key, child in trie[1].items():
+ # keep the children that are not single-valued on the decided value
+ if len(child[0].keys()) != 1 or list(child[0].items())[0][0] != best[0]:
+ nchildren[key] = child
+ trie[1] = nchildren
+
+uptrie(trie)
+
+print(json.dumps(trie))
+
diff --git a/leavestrie.py b/leavestrie.py
@@ -1,30 +0,0 @@
-#!/usr/bin/env python3
-
-"""Read json trie in stdin, produce leaves and values
-argv[1] is 1 or -1 to reverse the label sequence or not"""
-
-import json
-import sys
-
-trie = json.load(sys.stdin)
-
-def leaves(trie, prefix="", provisional=None):
- """Keep only the most probable values at each node"""
- if len(trie[1].keys()) == 0:
- assert(len(trie[0].keys()) == 1)
- k, v = trie[0].popitem()
- if (k != provisional):
- # does not agree with provisional decision so far
- print("%s\t%s" % (k, prefix[::int(sys.argv[1])]))
- # decided nodes
- if len(trie) == 3 and trie[2]:
- if (trie[2] != provisional):
- # does not agree with provisional decision so far
- print("%s\t%s" % (trie[2], prefix[::int(sys.argv[1])]))
- if len(trie) == 3:
- provisional = trie[2]
- for child in trie[1].keys():
- leaves(trie[1][child], prefix + child, provisional)
-
-leaves(trie)
-
diff --git a/majoritytrie.py b/majoritytrie.py
@@ -1,35 +0,0 @@
-#!/usr/bin/env python3
-
-"""Read json trie in stdin, keep majority value at each node, remove
-useless leaf nodes and output trie to stdout"""
-
-import json
-import sys
-
-trie = json.load(sys.stdin)
-
-def get_majority(d):
- """What are the most probable values?"""
- mx = max(d.values())
- return [k for k in d.keys() if d[k] == mx]
-
-def majority(trie):
- """Keep only the most probable values at each node"""
- if len(trie[1].keys()) == 0:
- # keep all options at leaf nodes
- trie[0] = list(trie[0].keys())
- else:
- trie[0] = get_majority(trie[0])
- useless = []
- for child in trie[1].keys():
- majority(trie[1][child])
- # if it is relabeled to our majority value and is a leaf, drop it
- if trie[1][child][0] == trie[0] and trie[1][child][1] == {}:
- useless.append(child)
- for child in useless:
- del(trie[1][child])
-
-majority(trie)
-
-print(json.dumps(trie))
-
diff --git a/make.sh b/make.sh
@@ -7,7 +7,7 @@
./detect.pl | # identify and label occurrences
cat - $* | # add in exceptions
sed 's/ h/ /' | # we don't keep the useless leading 'h' in the trie
- ./buildtrie.py | # prepare the trie
- ./compresstrie.py | # compress the trie
- ./majoritytrie.py # keep only the most frequent information
+ ./haspirater/buildtrie.py | # prepare the trie
+ ./haspirater/compresstrie.py | # compress the trie
+ ./haspirater/majoritytrie.py # keep only the most frequent information
diff --git a/setup.py b/setup.py
@@ -0,0 +1,20 @@
+import setuptools
+
+# read the README explicitly as UTF-8 so that building the package does
+# not depend on the default encoding of the locale
+with open("README", "r", encoding="utf-8") as fh:
+    long_description = fh.read()
+
+setuptools.setup(
+    name='haspirater',
+    version='0.2',
+    author="Antoine Amarilli",
+    author_email="a3nm@a3nm.net",
+    # ship the json files of the package directory along with the code
+    package_data={'haspirater' :['*json']},
+    description="detect aspirated 'h' in French words",
+    long_description=long_description,
+    # NOTE(review): README uses "== heading ==" titles, which is not
+    # Markdown syntax -- confirm that this content type is the intended one
+    long_description_content_type="text/markdown",
+    url="https://gitlab.com/a3nm/haspirater",
+    packages=setuptools.find_packages(),
+    classifiers=[
+        "Programming Language :: Python :: 3",
+    ],
+)
diff --git a/trie2dot.py b/trie2dot.py
@@ -1,71 +0,0 @@
-#!/usr/bin/env python3
-
-"""Takes json as input with labels [value1, value2] and produces dot,
-usage: trie2dot.py prefix value1 value2"""
-
-import json
-import sys
-from math import log
-
-trie = json.load(sys.stdin)
-
-free_id = 0
-
-def cget(d, k):
- try:
- if k in d.keys():
- return d[k]
- else:
- return 0
- except AttributeError:
- # we have a list, not a dictionary
- # this happens after majoritytrie.py
- if k in d:
- return 1
- else:
- return 0
-
-def int2strbyte(i):
- s = hex(i).split('x')[1]
- if len(s) == 1:
- return '0' + s
- else:
- return s
-
-def fraction2rgb(fraction):
- n = int(255*fraction)
- return int2strbyte(n)+'00'+int2strbyte(255 - n)
-
-def total(x):
- key, node = x
- try:
- return sum(node[0].values())
- except AttributeError:
- # we have only one value, not a dictionary
- return 1
-
-def to_dot(trie, prefix=''):
- global free_id
-
- values, children = trie
- my_id = free_id
- free_id += 1
- count = cget(values, v1) + cget(values, v2)
- fraction = cget(values, v2) / count
-
- print("%d [label=\"%s\",color=\"#%s\",penwidth=%d]" % (my_id, prefix,
- fraction2rgb(fraction), 1+int(log(count))))
-
- for (key, child) in sorted(children.items(), key=total, reverse=True):
- i = to_dot(child, prefix+key)
- print("%d -> %d [penwidth=%d]" % (my_id, i,
- 1+int(log(total((None, child))))))
-
- return my_id
-
-print("digraph G {\naspect=\"1\"\n")
-prefix = sys.argv[1]
-v1 = sys.argv[2]
-v2 = sys.argv[3]
-to_dot(trie, prefix)
-print("}")
diff --git a/uptrie.py b/uptrie.py
@@ -1,40 +0,0 @@
-#!/usr/bin/env python3
-
-"""Read json trie in stdin, make internal node decisions and output json dump to
-stdout"""
-
-import itertools
-import operator
-import json
-import sys
-
-trie = json.load(sys.stdin)
-
-def uptrie(trie):
- """Make internal node decisions if possible"""
- for child in trie[1].values():
- uptrie(child)
- decided_children = [(list(t[0].items())[0][0], t) for t in trie[1].values() if
- len(t[0].keys()) == 1]
- dchild_g = {}
- for (x, y) in decided_children:
- if x not in dchild_g.keys():
- dchild_g[x] = []
- dchild_g[x].append(y)
- sums = [(x, len(y)) for (x, y) in dchild_g.items()]
- if len(sums) == 0:
- return
- best = max(sums, key=operator.itemgetter(1))
- if best[1] >= 2:
- # compress here
- trie.append(best[0])
- nchildren = {}
- for key, child in trie[1].items():
- if len(child[0].keys()) != 1 or list(child[0].items())[0][0] != best[0]:
- nchildren[key] = child
- trie[1] = nchildren
-
-uptrie(trie)
-
-print(json.dumps(trie))
-