Merge branch 'master' into 'master' - haspirater - detect aspirated 'h' in French words (local mirror of https://gitlab.com/a3nm/haspirater)

commit ae59e299f0d0b9ce4255d6b07dc718b32edd980d
parent b79b3eed3cf78fd00cda450eb38b60413ff67a9f
Author: a3nm <a3nm@a3nm.net>
Date:   Thu, 15 Aug 2019 21:47:53 +0000

Merge branch 'master' into 'master'

Setup Package

See merge request a3nm/haspirater!1
Diffstat:
.gitignore  | 4 ++++
LICENSE  | 18 ++++++++++++++++++
README  | 2 +-
buildtrie.py  | 43 -------------------------------------------
buildtrie_list.py  | 22 ----------------------
compresstrie.py  | 22 ----------------------
haspirater.json  | 1 -
haspirater.py  | 49 -------------------------------------------------
haspirater/__init__.py  | 1 +
haspirater/buildtrie.py  | 43 +++++++++++++++++++++++++++++++++++++++++++
haspirater/buildtrie_list.py  | 22 ++++++++++++++++++++++
haspirater/compresstrie.py  | 22 ++++++++++++++++++++++
haspirater/haspirater.json  | 1 +
haspirater/haspirater.py  | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
haspirater/leavestrie.py  | 30 ++++++++++++++++++++++++++++++
haspirater/majoritytrie.py  | 35 +++++++++++++++++++++++++++++++++++
haspirater/trie2dot.py  | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
haspirater/uptrie.py  | 40 ++++++++++++++++++++++++++++++++++++++++
leavestrie.py  | 30 ------------------------------
majoritytrie.py  | 35 -----------------------------------
make.sh  | 6 +++---
setup.py  | 20 ++++++++++++++++++++
trie2dot.py  | 71 -----------------------------------------------------------------------
uptrie.py  | 40 ----------------------------------------

24 files changed, 360 insertions(+), 317 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -1 +1,5 @@
 __pycache__/
+dist/
+haspirater.egg-info/
+build/
+
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,18 @@
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/README b/README
@@ -63,7 +63,7 @@ and are not stored as-is; they are just piped later on in the training
 phase. make.sh produces on stdout the json trie. Thus, you would run
 something like:
 
-  $ cat corpus | ./make.sh exceptions > haspirater.json
+  $ cat corpus | ./make.sh exceptions > haspirater/haspirater.json
 
 == 4. Training details ==
 
diff --git a/buildtrie.py b/buildtrie.py
@@ -1,43 +0,0 @@
-#!/usr/bin/env python3
-
-"""From a list of values (arbitrary) and keys (words), create a trie
-representing this mapping"""
-
-import json
-import sys
-
-# first item is a dictionnary from values to an int indicating the
-# number of occurrences with this prefix having this value
-# second item is a dictionnary from letters to descendent nodes
-def empty_node():
-  return [{}, {}]
-
-def insert(trie, key, val):
-  """Insert val for key in trie"""
-  values, children = trie
-  # create a new value, if needed
-  if val not in values.keys():
-    values[val] = 0
-  # increment count for val
-  values[val] += 1
-  if len(key) > 0:
-    # create a new node if needed
-    if key[0] not in children.keys():
-      children[key[0]] = empty_node()
-    # recurse
-    return insert(children[key[0]], key[1:], val)
-
-if __name__ == '__main__':
-  trie = empty_node()
-
-  for line in sys.stdin.readlines():
-    line = line.split()
-    value = line[0]
-    word = line[1].lower() if len(line) == 2 else ''
-    # a trailing space is used to mark termination of the word
-    # this is useful in cases where a prefix of a word is a complete,
-    # different word with a different value
-    insert(trie, word+' ', value)
-
-  print(json.dumps(trie))
-
diff --git a/buildtrie_list.py b/buildtrie_list.py
@@ -1,22 +0,0 @@
-#!/usr/bin/env python3
-
-"""From a list of values (arbitrary) and keys (words), create a trie
-representing this mapping"""
-
-# this modified version is used by plint
-# see https://a3nm.net/git/plint
-
-import buildtrie
-import json
-import sys
-
-trie = buildtrie.empty_node()
-
-for line in sys.stdin.readlines():
-  line = line.split()
-  value = line[0]
-  word = line[1:]
-  buildtrie.insert(trie, word+['-', '-'], value)
-
-print(json.dumps(trie))
-
diff --git a/compresstrie.py b/compresstrie.py
@@ -1,22 +0,0 @@
-#!/usr/bin/env python3
-
-"""Read json trie in stdin, trim unneeded branches and output json dump
-to stdout"""
-
-import json
-import sys
-
-trie = json.load(sys.stdin)
-
-def compress(trie):
-  """Compress the trie"""
-  if len(trie[0].keys()) <= 1:
-    # no need for children, there is no more doubt
-    trie[1] = {}
-  for child in trie[1].values():
-    compress(child)
-
-compress(trie)
-
-print(json.dumps(trie))
-
diff --git a/haspirater.json b/haspirater.json
@@ -1 +0,0 @@
-[["0"], {"\u00e9": [["0"], {"b": [["0"], {"\u00e9": [["0"], {"c": [["1"], {}]}]}], "l": [["0"], {"\u00e9": [["1"], {}], "a": [["1"], {"s": [["0"], {" ": [["0", "1"], {}]}]}], "i": [["0"], {"p": [["1"], {}]}], "e": [["1"], {}], "\u00e8": [["0", "1"], {"n": [["0"], {}], "r": [["1"], {}]}]}], "r": [["1"], {"\u00e9": [["0"], {}], "a": [["1"], {"c": [["0"], {}]}], "o": [["1"], {"\u00ef": [["0"], {}]}], "i": [["0"], {"s": [["1"], {}]}]}], "q": [["1"], {}]}], "\u00f4": [["0"], {"l": [["1"], {}]}], "o": [["0"], {"t": [["1"], {}], "m": [["0"], {"e": [["1"], {}], "a": [["1"], {}], " ": [["1"], {}]}], "o": [["1"], {}], "h": [["1"], {}], " ": [["1"], {}], "y": [["1"], {}], "l": [["1"], {"o": [["0"], {}]}], "u": [["1"], {}], "b": [["1"], {}], "r": [["0"], {"m": [["0", "1"], {"o": [["0"], {}], "i": [["1"], {}]}], "s": [["1"], {}], "n": [["1"], {}], "i": [["0"], {"o": [["1"], {}]}], "d": [["1"], {}]}], "c": [["1"], {}], "s": [["0"], {"a": [["1"], {}]}], "n": [["0"], {"t": [["1"], {}], "s": [["1"], {}], "n": [["0"], {"i": [["1"], {}]}], " ": [["1"], {}], "d": [["1"], {}], "g": [["1"], {}]}], "p": [["1"], {}], "w": [["1"], {}], "g": [["1"], {}], "d": [["1"], {}], "q": [["1"], {}]}], "e": [["0"], {"t": [["1"], {}], "m": [["1"], {}], "s": [["1"], {}], "n": [["1"], {}], "p": [["1"], {}], "a": [["1"], {"u": [["1"], {"t": [["0"], {}]}]}], "i": [["1"], {}], "l": [["0"], {"l": [["0"], {"o": [["1"], {}], "e": [["0"], {"b": [["1"], {}]}]}]}], "r": [["0"], {"c": [["1"], {"u": [["0"], {}]}], "t": [["1"], {}], "m": [["0"], {"i": [["0"], {"t": [["0"], {"i": [["1"], {}]}]}]}], "s": [["1"], {}], "n": [["1"], {}], "p": [["1"], {}]}], "u": [["0"], {"/": [["1"], {}], "s": [["1"], {}], "l": [["1"], {}], "r": [["0"], {"t": [["1"], {}]}]}]}], "y": [["0"], {"a": [["0", "1"], {"c": [["0"], {}], "l": [["1"], {}]}]}], "l": [["1"], {}], "u": [["1"], {"m": [["0"], {"\u00e9": [["1"], {}], "b": [["0"], {"o": [["1"], {}], "l": [["0"], {"e": [["0"], {" ": [["0", "1"], {}], "s": [["0", "1"], {}]}]}]}], "o": 
[["1"], {"u": [["0"], {}]}], "e": [["0"], {"z": [["1"], {}], "m": [["1"], {}], "n": [["1"], {}], " ": [["1"], {}], "u": [["0"], {"x": [["1"], {}]}], "r": [["1"], {}]}], "a": [["0"], {" ": [["1"], {}], "i": [["0"], {"t": [["1"], {}], "e": [["1"], {}], " ": [["1"], {}], "s": [["1"], {}]}], "n": [["0"], {"t": [["1"], {}]}], "g": [["1"], {}]}], "p": [["1"], {}], " ": [["1"], {}], "i": [["0"], {"o": [["1"], {}]}], "\u00e8": [["1"], {}]}], "\u00ee": [["0"], {}], "l": [["1"], {"u": [["1"], {"b": [["0"], {}]}]}], "i": [["1"], {"s": [["0"], {}], "l": [["0"], {}]}], "d": [["0"], {}]}], "\u00e2": [["1"], {}], "\u00ea": [["1"], {}], "a": [["1"], {"l": [["1"], {"o": [["1"], {"g": [["0"], {}]}], "e": [["1"], {"i": [["0"], {}]}], "l": [["1"], {"a": [["1"], {"l": [["0"], {}]}], "u": [["0"], {}]}], "i": [["0", "1"], {"b": [["1"], {}], "o": [["0"], {}]}], "\u00e8": [["1"], {"n": [["0"], {}]}]}], "b": [["0"], {"a": [["1"], {}], "o": [["1"], {}]}], "\u00ff": [["0"], {}], "r": [["1"], {"m": [["0"], {}]}], "d": [["1"], {"o": [["0", "1"], {"p": [["0", "1"], {"i": [["0", "1"], {" ": [["1"], {}], "s": [["0"], {}]}]}]}], "r": [["0"], {}]}], "v": [["0"], {"e": [["0"], {"s": [["1"], {}], "n": [["1"], {}], " ": [["1"], {}], "l": [["0"], {"\u00e9": [["0"], {"e": [["0"], {" ": [["0", "1"], {}]}]}], "e": [["0"], {"r": [["0", "1"], {}]}]}], "r": [["1"], {}], "u": [["1"], {}]}], "a": [["1"], {}], "r": [["1"], {}], "i": [["1"], {}], "u": [["1"], {}]}]}], " ": [["0", "1"], {}], "i": [["0"], {"\u00e9": [["1"], {}], "t": [["1"], {}], "m": [["1"], {}], "e": [["1"], {"r": [["0"], {" ": [["0", "1"], {}]}]}], "h": [["1"], {}], "b": [["0"], {"o": [["1"], {}]}], "l": [["0", "1"], {"a": [["0"], {"i": [["1"], {}]}], "d": [["0", "1"], {"e": [["0", "1"], {"s": [["0"], {}], "g": [["1"], {}]}]}], "b": [["1"], {}], "o": [["1"], {}], "e": [["1"], {}]}], "g": [["1"], {}], "c": [["1"], {}], "s": [["0"], {"s": [["1"], {}]}], "n": [["1"], {"d": [["0", "1"], {"o": [["0"], {}], "i": [["1"], {}]}]}], "a": [["0", "1"], 
{"t": [["0", "1"], {"a": [["1"], {}], "u": [["0"], {}]}]}], "p": [["0"], {"h": [["1"], {}], "p": [["0"], {"i": [["1"], {}]}], " ": [["1"], {}]}], " ": [["1"], {}], "d": [["1"], {}], "r": [["1"], {"o": [["1"], {"n": [["0"], {}]}]}], "f": [["1"], {}]}], "\u00e1": [["1"], {}], "\u00e8": [["1"], {"b": [["0"], {}]}]}]
diff --git a/haspirater.py b/haspirater.py
@@ -1,49 +0,0 @@
-#!/usr/bin/python3
-
-"""Determine if a French word starts by an aspirated 'h' or not, by a
-lookup in a precompiled trie"""
-
-import os
-import json
-import sys
-
-f = open(os.path.join(os.path.dirname(
-  os.path.realpath(__file__)), 'haspirater.json'))
-trie = json.load(f)
-f.close()
-
-def do_lookup(trie, key):
-  if len(key) == 0 or (key[0] not in trie[1].keys()):
-    return trie[0]
-  return do_lookup(trie[1][key[0]], key[1:])
-
-def lookup(key):
-  """Return True iff key starts with an aspirated 'h'"""
-  if key == '' or key[0] != 'h':
-    raise ValueError
-  return list(map((lambda x: x == "1"), do_lookup(trie, key[1:] + ' ')))
-
-def wrap_lookup(line):
-  line = line.lower().lstrip().rstrip()
-  try:
-    result = lookup(line)
-    if True in result and not False in result:
-      print("%s: aspirated" % line)
-    elif False in result and not True in result:
-      print("%s: not aspirated" % line)
-    else:
-      print("%s: ambiguous" % line)
-  except ValueError:
-    print("%s: no leading 'h'" % line)
-
-if __name__ == '__main__':
-  if len(sys.argv) > 1:
-    for arg in sys.argv[1:]:
-      wrap_lookup(arg)
-  else:
-    while True:
-      line = sys.stdin.readline()
-      if not line:
-        break
-      wrap_lookup(line)
-
diff --git a/haspirater/__init__.py b/haspirater/__init__.py
@@ -0,0 +1 @@
+from .haspirater import *
diff --git a/haspirater/buildtrie.py b/haspirater/buildtrie.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+
+"""From a list of values (arbitrary) and keys (words), create a trie
+representing this mapping"""
+
+import json
+import sys
+
+# first item is a dictionary from values to an int indicating the
+# number of occurrences with this prefix having this value
+# second item is a dictionary from letters to descendant nodes
+def empty_node():
+  return [{}, {}]
+
+def insert(trie, key, val):
+  """Insert val for key in trie"""
+  values, children = trie
+  # create a new value, if needed
+  if val not in values.keys():
+    values[val] = 0
+  # increment count for val
+  values[val] += 1
+  if len(key) > 0:
+    # create a new node if needed
+    if key[0] not in children.keys():
+      children[key[0]] = empty_node()
+    # recurse
+    return insert(children[key[0]], key[1:], val)
+
+if __name__ == '__main__':
+  trie = empty_node()
+
+  for line in sys.stdin.readlines():
+    line = line.split()
+    value = line[0]
+    word = line[1].lower() if len(line) == 2 else ''
+    # a trailing space is used to mark termination of the word
+    # this is useful in cases where a prefix of a word is a complete,
+    # different word with a different value
+    insert(trie, word+' ', value)
+
+  print(json.dumps(trie))
+
diff --git a/haspirater/buildtrie_list.py b/haspirater/buildtrie_list.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+
+"""From a list of values (arbitrary) and keys (words), create a trie
+representing this mapping"""
+
+# this modified version is used by plint
+# see https://a3nm.net/git/plint
+
+from haspirater import buildtrie
+import json
+import sys
+
+trie = buildtrie.empty_node()
+
+for line in sys.stdin.readlines():
+  line = line.split()
+  value = line[0]
+  word = line[1:]
+  buildtrie.insert(trie, word+['-', '-'], value)
+
+print(json.dumps(trie))
+
diff --git a/haspirater/compresstrie.py b/haspirater/compresstrie.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+
+"""Read json trie in stdin, trim unneeded branches and output json dump
+to stdout"""
+
+import json
+import sys
+
+trie = json.load(sys.stdin)
+
+def compress(trie):
+  """Compress the trie"""
+  if len(trie[0].keys()) <= 1:
+    # no need for children, there is no more doubt
+    trie[1] = {}
+  for child in trie[1].values():
+    compress(child)
+
+compress(trie)
+
+print(json.dumps(trie))
+
diff --git a/haspirater/haspirater.json b/haspirater/haspirater.json
@@ -0,0 +1 @@
+[["0"], {"\u00e9": [["0"], {"b": [["0"], {"\u00e9": [["0"], {"c": [["1"], {}]}]}], "l": [["0"], {"\u00e9": [["1"], {}], "a": [["1"], {"s": [["0"], {" ": [["0", "1"], {}]}]}], "i": [["0"], {"p": [["1"], {}]}], "e": [["1"], {}], "\u00e8": [["0", "1"], {"n": [["0"], {}], "r": [["1"], {}]}]}], "r": [["1"], {"\u00e9": [["0"], {}], "a": [["1"], {"c": [["0"], {}]}], "o": [["1"], {"\u00ef": [["0"], {}]}], "i": [["0"], {"s": [["1"], {}]}]}], "q": [["1"], {}]}], "\u00f4": [["0"], {"l": [["1"], {}]}], "o": [["0"], {"t": [["1"], {}], "m": [["0"], {"e": [["1"], {}], "a": [["1"], {}], " ": [["1"], {}]}], "o": [["1"], {}], "h": [["1"], {}], " ": [["1"], {}], "y": [["1"], {}], "l": [["1"], {"o": [["0"], {}]}], "u": [["1"], {}], "b": [["1"], {}], "r": [["0"], {"m": [["0", "1"], {"o": [["0"], {}], "i": [["1"], {}]}], "s": [["1"], {}], "n": [["1"], {}], "i": [["0"], {"o": [["1"], {}]}], "d": [["1"], {}]}], "c": [["1"], {}], "s": [["0"], {"a": [["1"], {}]}], "n": [["0"], {"t": [["1"], {}], "s": [["1"], {}], "n": [["0"], {"i": [["1"], {}]}], " ": [["1"], {}], "d": [["1"], {}], "g": [["1"], {}]}], "p": [["1"], {}], "w": [["1"], {}], "g": [["1"], {}], "d": [["1"], {}], "q": [["1"], {}]}], "e": [["0"], {"t": [["1"], {}], "m": [["1"], {}], "s": [["1"], {}], "n": [["1"], {}], "p": [["1"], {}], "a": [["1"], {"u": [["1"], {"t": [["0"], {}]}]}], "i": [["1"], {}], "l": [["0"], {"l": [["0"], {"o": [["1"], {}], "e": [["0"], {"b": [["1"], {}]}]}]}], "r": [["0"], {"c": [["1"], {"u": [["0"], {}]}], "t": [["1"], {}], "m": [["0"], {"i": [["0"], {"t": [["0"], {"i": [["1"], {}]}]}]}], "s": [["1"], {}], "n": [["1"], {}], "p": [["1"], {}]}], "u": [["0"], {"/": [["1"], {}], "s": [["1"], {}], "l": [["1"], {}], "r": [["0"], {"t": [["1"], {}]}]}]}], "y": [["0"], {"a": [["0", "1"], {"c": [["0"], {}], "l": [["1"], {}]}]}], "l": [["1"], {}], "u": [["1"], {"m": [["0"], {"\u00e9": [["1"], {}], "b": [["0"], {"o": [["1"], {}], "l": [["0"], {"e": [["0"], {" ": [["0", "1"], {}], "s": [["0", "1"], {}]}]}]}], "o": 
[["1"], {"u": [["0"], {}]}], "e": [["0"], {"z": [["1"], {}], "m": [["1"], {}], "n": [["1"], {}], " ": [["1"], {}], "u": [["0"], {"x": [["1"], {}]}], "r": [["1"], {}]}], "a": [["0"], {" ": [["1"], {}], "i": [["0"], {"t": [["1"], {}], "e": [["1"], {}], " ": [["1"], {}], "s": [["1"], {}]}], "n": [["0"], {"t": [["1"], {}]}], "g": [["1"], {}]}], "p": [["1"], {}], " ": [["1"], {}], "i": [["0"], {"o": [["1"], {}]}], "\u00e8": [["1"], {}]}], "\u00ee": [["0"], {}], "l": [["1"], {"u": [["1"], {"b": [["0"], {}]}]}], "i": [["1"], {"s": [["0"], {}], "l": [["0"], {}]}], "d": [["0"], {}]}], "\u00e2": [["1"], {}], "\u00ea": [["1"], {}], "a": [["1"], {"l": [["1"], {"o": [["1"], {"g": [["0"], {}]}], "e": [["1"], {"i": [["0"], {}]}], "l": [["1"], {"a": [["1"], {"l": [["0"], {}]}], "u": [["0"], {}]}], "i": [["0", "1"], {"b": [["1"], {}], "o": [["0"], {}]}], "\u00e8": [["1"], {"n": [["0"], {}]}]}], "b": [["0"], {"a": [["1"], {}], "o": [["1"], {}]}], "\u00ff": [["0"], {}], "r": [["1"], {"m": [["0"], {}]}], "d": [["1"], {"o": [["0", "1"], {"p": [["0", "1"], {"i": [["0", "1"], {" ": [["1"], {}], "s": [["0"], {}]}]}]}], "r": [["0"], {}]}], "v": [["0"], {"e": [["0"], {"s": [["1"], {}], "n": [["1"], {}], " ": [["1"], {}], "l": [["0"], {"\u00e9": [["0"], {"e": [["0"], {" ": [["0", "1"], {}]}]}], "e": [["0"], {"r": [["0", "1"], {}]}]}], "r": [["1"], {}], "u": [["1"], {}]}], "a": [["1"], {}], "r": [["1"], {}], "i": [["1"], {}], "u": [["1"], {}]}]}], " ": [["0", "1"], {}], "i": [["0"], {"\u00e9": [["1"], {}], "t": [["1"], {}], "m": [["1"], {}], "e": [["1"], {"r": [["0"], {" ": [["0", "1"], {}]}]}], "h": [["1"], {}], "b": [["0"], {"o": [["1"], {}]}], "l": [["0", "1"], {"a": [["0"], {"i": [["1"], {}]}], "d": [["0", "1"], {"e": [["0", "1"], {"s": [["0"], {}], "g": [["1"], {}]}]}], "b": [["1"], {}], "o": [["1"], {}], "e": [["1"], {}]}], "g": [["1"], {}], "c": [["1"], {}], "s": [["0"], {"s": [["1"], {}]}], "n": [["1"], {"d": [["0", "1"], {"o": [["0"], {}], "i": [["1"], {}]}]}], "a": [["0", "1"], 
{"t": [["0", "1"], {"a": [["1"], {}], "u": [["0"], {}]}]}], "p": [["0"], {"h": [["1"], {}], "p": [["0"], {"i": [["1"], {}]}], " ": [["1"], {}]}], " ": [["1"], {}], "d": [["1"], {}], "r": [["1"], {"o": [["1"], {"n": [["0"], {}]}]}], "f": [["1"], {}]}], "\u00e1": [["1"], {}], "\u00e8": [["1"], {"b": [["0"], {}]}]}]
diff --git a/haspirater/haspirater.py b/haspirater/haspirater.py
@@ -0,0 +1,49 @@
+#!/usr/bin/python3
+
+"""Determine if a French word starts by an aspirated 'h' or not, by a
+lookup in a precompiled trie"""
+
+import os
+import json
+import sys
+
+with open(os.path.join(os.path.dirname(
+    os.path.realpath(__file__)), 'haspirater.json')) as f:
+  # context manager closes the file even if json.load raises
+  trie = json.load(f)
+
+def do_lookup(trie, key):
+  if len(key) == 0 or (key[0] not in trie[1].keys()):
+    return trie[0]
+  return do_lookup(trie[1][key[0]], key[1:])
+
+def lookup(key):
+  """Return list of bools (True = aspirated) for key; ValueError if no leading 'h'"""
+  if key == '' or key[0] != 'h':
+    raise ValueError
+  return list(map((lambda x: x == "1"), do_lookup(trie, key[1:] + ' ')))
+
+def wrap_lookup(line):
+  line = line.lower().lstrip().rstrip()
+  try:
+    result = lookup(line)
+    if True in result and not False in result:
+      print("%s: aspirated" % line)
+    elif False in result and not True in result:
+      print("%s: not aspirated" % line)
+    else:
+      print("%s: ambiguous" % line)
+  except ValueError:
+    print("%s: no leading 'h'" % line)
+
+if __name__ == '__main__':
+  if len(sys.argv) > 1:
+    for arg in sys.argv[1:]:
+      wrap_lookup(arg)
+  else:
+    while True:
+      line = sys.stdin.readline()
+      if not line:
+        break
+      wrap_lookup(line)
+
diff --git a/haspirater/leavestrie.py b/haspirater/leavestrie.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+
+"""Read json trie in stdin, produce leaves and values
+argv[1] is 1 or -1 to reverse the label sequence or not"""
+
+import json
+import sys
+
+trie = json.load(sys.stdin)
+
+def leaves(trie, prefix="", provisional=None):
+  """Print value and prefix where a node disagrees with its ancestors' decision"""
+  if len(trie[1].keys()) == 0:
+    assert(len(trie[0].keys()) == 1)
+    k, v = trie[0].popitem()
+    if (k != provisional):
+      # does not agree with provisional decision so far
+      print("%s\t%s" % (k, prefix[::int(sys.argv[1])]))
+  # decided nodes
+  if len(trie) == 3 and trie[2]:
+    if (trie[2] != provisional):
+      # does not agree with provisional decision so far
+      print("%s\t%s" % (trie[2], prefix[::int(sys.argv[1])]))
+  if len(trie) == 3:
+    provisional = trie[2]
+  for child in trie[1].keys():
+    leaves(trie[1][child], prefix + child, provisional)
+
+leaves(trie)
+
diff --git a/haspirater/majoritytrie.py b/haspirater/majoritytrie.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+
+"""Read json trie in stdin, keep majority value at each node, remove
+useless leaf nodes and output trie to stdout"""
+
+import json
+import sys
+
+trie = json.load(sys.stdin)
+
+def get_majority(d):
+  """What are the most probable values?"""
+  mx = max(d.values())
+  return [k for k in d.keys() if d[k] == mx]
+
+def majority(trie):
+  """Keep only the most probable values at each node"""
+  if len(trie[1].keys()) == 0:
+    # keep all options at leaf nodes
+    trie[0] = list(trie[0].keys())
+  else:
+    trie[0] = get_majority(trie[0])
+  useless = []
+  for child in trie[1].keys():
+    majority(trie[1][child])
+    # if it is relabeled to our majority value and is a leaf, drop it
+    if trie[1][child][0] == trie[0] and trie[1][child][1] == {}:
+      useless.append(child)
+  for child in useless:
+    del(trie[1][child])
+
+majority(trie)
+
+print(json.dumps(trie))
+
diff --git a/haspirater/trie2dot.py b/haspirater/trie2dot.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+
+"""Takes json as input with labels [value1, value2] and produces dot,
+usage: trie2dot.py prefix value1 value2"""
+
+import json
+import sys
+from math import log
+
+trie = json.load(sys.stdin)
+
+free_id = 0
+
+def cget(d, k):
+  try:
+    if k in d.keys():
+      return d[k]
+    else:
+      return 0
+  except AttributeError:
+    # we have a list, not a dictionary
+    # this happens after majoritytrie.py
+    if k in d:
+      return 1
+    else:
+      return 0
+
+def int2strbyte(i):
+  s = hex(i).split('x')[1]
+  if len(s) == 1:
+    return '0' + s
+  else:
+    return s
+
+def fraction2rgb(fraction):
+  n = int(255*fraction)
+  return int2strbyte(n)+'00'+int2strbyte(255 - n)
+
+def total(x):
+  key, node = x
+  try:
+    return sum(node[0].values())
+  except AttributeError:
+    # we have only one value, not a dictionary
+    return 1
+
+def to_dot(trie, prefix=''):
+  global free_id
+
+  values, children = trie
+  my_id = free_id
+  free_id += 1
+  count = cget(values, v1) + cget(values, v2)
+  fraction = cget(values, v2) / count
+
+  print("%d [label=\"%s\",color=\"#%s\",penwidth=%d]" % (my_id, prefix,
+    fraction2rgb(fraction), 1+int(log(count))))
+
+  for (key, child) in sorted(children.items(), key=total, reverse=True):
+    i = to_dot(child, prefix+key)
+    print("%d -> %d [penwidth=%d]" % (my_id, i,
+      1+int(log(total((None, child))))))
+
+  return my_id
+
+print("digraph G {\naspect=\"1\"\n")
+prefix = sys.argv[1]
+v1 = sys.argv[2]
+v2 = sys.argv[3]
+to_dot(trie, prefix)
+print("}")
diff --git a/haspirater/uptrie.py b/haspirater/uptrie.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+
+"""Read json trie in stdin, make internal node decisions and output json dump to
+stdout"""
+
+import itertools
+import operator
+import json
+import sys
+
+trie = json.load(sys.stdin)
+
+def uptrie(trie):
+  """Make internal node decisions if possible"""
+  for child in trie[1].values():
+    uptrie(child)
+  decided_children = [(list(t[0].items())[0][0], t) for t in trie[1].values() if
+          len(t[0].keys()) == 1]
+  dchild_g = {}
+  for (x, y) in decided_children:
+      if x not in dchild_g.keys():
+          dchild_g[x] = []
+      dchild_g[x].append(y)
+  sums = [(x, len(y)) for (x, y) in dchild_g.items()]
+  if len(sums) == 0:
+    return
+  best = max(sums, key=operator.itemgetter(1))
+  if best[1] >= 2:
+    # compress here
+    trie.append(best[0])
+    nchildren = {}
+    for key, child in trie[1].items():
+      if len(child[0].keys()) != 1 or list(child[0].items())[0][0] != best[0]:
+        nchildren[key] = child
+    trie[1] = nchildren
+
+uptrie(trie)
+
+print(json.dumps(trie))
+
diff --git a/leavestrie.py b/leavestrie.py
@@ -1,30 +0,0 @@
-#!/usr/bin/env python3
-
-"""Read json trie in stdin, produce leaves and values
-argv[1] is 1 or -1 to reverse the label sequence or not"""
-
-import json
-import sys
-
-trie = json.load(sys.stdin)
-
-def leaves(trie, prefix="", provisional=None):
-  """Keep only the most probable values at each node"""
-  if len(trie[1].keys()) == 0:
-    assert(len(trie[0].keys()) == 1)
-    k, v = trie[0].popitem()
-    if (k != provisional):
-      # does not agree with provisional decision so far
-      print("%s\t%s" % (k, prefix[::int(sys.argv[1])]))
-  # decided nodes
-  if len(trie) == 3 and trie[2]:
-    if (trie[2] != provisional):
-      # does not agree with provisional decision so far
-      print("%s\t%s" % (trie[2], prefix[::int(sys.argv[1])]))
-  if len(trie) == 3:
-    provisional = trie[2]
-  for child in trie[1].keys():
-    leaves(trie[1][child], prefix + child, provisional)
-
-leaves(trie)
-
diff --git a/majoritytrie.py b/majoritytrie.py
@@ -1,35 +0,0 @@
-#!/usr/bin/env python3
-
-"""Read json trie in stdin, keep majority value at each node, remove
-useless leaf nodes and output trie to stdout"""
-
-import json
-import sys
-
-trie = json.load(sys.stdin)
-
-def get_majority(d):
-  """What are the most probable values?"""
-  mx = max(d.values())
-  return [k for k in d.keys() if d[k] == mx]
-
-def majority(trie):
-  """Keep only the most probable values at each node"""
-  if len(trie[1].keys()) == 0:
-    # keep all options at leaf nodes
-    trie[0] = list(trie[0].keys())
-  else:
-    trie[0] = get_majority(trie[0])
-  useless = []
-  for child in trie[1].keys():
-    majority(trie[1][child])
-    # if it is relabeled to our majority value and is a leaf, drop it
-    if trie[1][child][0] == trie[0] and trie[1][child][1] == {}:
-      useless.append(child)
-  for child in useless:
-    del(trie[1][child])
-
-majority(trie)
-
-print(json.dumps(trie))
-
diff --git a/make.sh b/make.sh
@@ -7,7 +7,7 @@
   ./detect.pl | # identify and label occurrences
   cat - $* | # add in exceptions
   sed 's/ h/ /' | # we don't keep the useless leading 'h' in the trie
-  ./buildtrie.py  | # prepare the trie
-  ./compresstrie.py | # compress the trie
-  ./majoritytrie.py # keep only the most frequent information
+  ./haspirater/buildtrie.py  | # prepare the trie
+  ./haspirater/compresstrie.py | # compress the trie
+  ./haspirater/majoritytrie.py # keep only the most frequent information
 
diff --git a/setup.py b/setup.py
@@ -0,0 +1,20 @@
+import setuptools
+
+with open("README", "r") as fh:
+    long_description = fh.read()
+
+setuptools.setup(
+    name='haspirater',
+    version='0.2',
+    author="Antoine Amarilli",
+    author_email="a3nm@a3nm.net",
+    package_data={'haspirater' :['*json']},
+    description="detect aspirated 'h' in French words",
+    long_description=long_description,
+    long_description_content_type="text/plain",
+    url="https://gitlab.com/a3nm/haspirater",
+    packages=setuptools.find_packages(),
+    classifiers=[
+        "Programming Language :: Python :: 3",
+    ],
+)
diff --git a/trie2dot.py b/trie2dot.py
@@ -1,71 +0,0 @@
-#!/usr/bin/env python3
-
-"""Takes json as input with labels [value1, value2] and produces dot,
-usage: trie2dot.py prefix value1 value2"""
-
-import json
-import sys
-from math import log
-
-trie = json.load(sys.stdin)
-
-free_id = 0
-
-def cget(d, k):
-  try:
-    if k in d.keys():
-      return d[k]
-    else:
-      return 0
-  except AttributeError:
-    # we have a list, not a dictionary
-    # this happens after majoritytrie.py
-    if k in d:
-      return 1
-    else:
-      return 0
-
-def int2strbyte(i):
-  s = hex(i).split('x')[1]
-  if len(s) == 1:
-    return '0' + s
-  else:
-    return s
-
-def fraction2rgb(fraction):
-  n = int(255*fraction)
-  return int2strbyte(n)+'00'+int2strbyte(255 - n)
-
-def total(x):
-  key, node = x
-  try:
-    return sum(node[0].values())
-  except AttributeError:
-    # we have only one value, not a dictionary
-    return 1
-
-def to_dot(trie, prefix=''):
-  global free_id
-
-  values, children = trie
-  my_id = free_id
-  free_id += 1
-  count = cget(values, v1) + cget(values, v2)
-  fraction = cget(values, v2) / count
-
-  print("%d [label=\"%s\",color=\"#%s\",penwidth=%d]" % (my_id, prefix,
-    fraction2rgb(fraction), 1+int(log(count))))
-
-  for (key, child) in sorted(children.items(), key=total, reverse=True):
-    i = to_dot(child, prefix+key)
-    print("%d -> %d [penwidth=%d]" % (my_id, i,
-      1+int(log(total((None, child))))))
-
-  return my_id
-
-print("digraph G {\naspect=\"1\"\n")
-prefix = sys.argv[1]
-v1 = sys.argv[2]
-v2 = sys.argv[3]
-to_dot(trie, prefix)
-print("}")
diff --git a/uptrie.py b/uptrie.py
@@ -1,40 +0,0 @@
-#!/usr/bin/env python3
-
-"""Read json trie in stdin, make internal node decisions and output json dump to
-stdout"""
-
-import itertools
-import operator
-import json
-import sys
-
-trie = json.load(sys.stdin)
-
-def uptrie(trie):
-  """Make internal node decisions if possible"""
-  for child in trie[1].values():
-    uptrie(child)
-  decided_children = [(list(t[0].items())[0][0], t) for t in trie[1].values() if
-          len(t[0].keys()) == 1]
-  dchild_g = {}
-  for (x, y) in decided_children:
-      if x not in dchild_g.keys():
-          dchild_g[x] = []
-      dchild_g[x].append(y)
-  sums = [(x, len(y)) for (x, y) in dchild_g.items()]
-  if len(sums) == 0:
-    return
-  best = max(sums, key=operator.itemgetter(1))
-  if best[1] >= 2:
-    # compress here
-    trie.append(best[0])
-    nchildren = {}
-    for key, child in trie[1].items():
-      if len(child[0].keys()) != 1 or list(child[0].items())[0][0] != best[0]:
-        nchildren[key] = child
-    trie[1] = nchildren
-
-uptrie(trie)
-
-print(json.dumps(trie))
-

	haspirater detect aspirated 'h' in French words (local mirror of https://gitlab.com/a3nm/haspirater)
	git clone https://a3nm.net/git/haspirater/
	Log \| Files \| Refs \| README \| LICENSE

.gitignore	\|	4	++++
LICENSE	\|	18	++++++++++++++++++
README	\|	2	+-
buildtrie.py	\|	43	-------------------------------------------
buildtrie_list.py	\|	22	----------------------
compresstrie.py	\|	22	----------------------
haspirater.json	\|	1	-
haspirater.py	\|	49	-------------------------------------------------
haspirater/__init__.py	\|	1	+
haspirater/buildtrie.py	\|	43	+++++++++++++++++++++++++++++++++++++++++++
haspirater/buildtrie_list.py	\|	22	++++++++++++++++++++++
haspirater/compresstrie.py	\|	22	++++++++++++++++++++++
haspirater/haspirater.json	\|	1	+
haspirater/haspirater.py	\|	49	+++++++++++++++++++++++++++++++++++++++++++++++++
haspirater/leavestrie.py	\|	30	++++++++++++++++++++++++++++++
haspirater/majoritytrie.py	\|	35	+++++++++++++++++++++++++++++++++++
haspirater/trie2dot.py	\|	71	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
haspirater/uptrie.py	\|	40	++++++++++++++++++++++++++++++++++++++++
leavestrie.py	\|	30	------------------------------
majoritytrie.py	\|	35	-----------------------------------
make.sh	\|	6	+++---
setup.py	\|	20	++++++++++++++++++++
trie2dot.py	\|	71	-----------------------------------------------------------------------
uptrie.py	\|	40	----------------------------------------