commit 6e9af935a279923df039026980c92b979e26947a
parent 5f623dafafdd06a94b2165224cd5a0eba4cc1451
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Fri, 16 Aug 2019 00:03:44 +0200
Merge gitlab.com:a3nm/frhyme
Diffstat:
26 files changed, 393 insertions(+), 344 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,6 @@
frhyme.json
lexique.txt
lexique/Lexique*
+build/
+dist/
+frhyme.egg-info/
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,18 @@
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/README b/README
@@ -55,8 +55,10 @@ should be trained from a pronunciation database. The recommended way to do so is
to use a tweaked Lexique <http://lexique.org> along with a provided bugfix file,
as follows:
+ cd scripts
lexique/lexique_retrieve.sh > lexique.txt
- ./make.sh NPHON lexique.txt additions > frhyme.json
+ ./make.sh NPHON lexique.txt additions > ../frhyme/frhyme.json
+ cd ..
where NPHON is the number of trailing phonemes to keep (suggested value: 4).
Beware, this may take up several hundred megabytes of RAM. The resulting file
diff --git a/additions b/additions
@@ -1,109 +0,0 @@
-almanach almana
-dompte d$t
-domptent d$t
-dompterai d$tRE
-dompterait d$tRE
-dompter d$te
-dompteur d$t9R
-dompteurs d$t9R
-dompteuse d$t2z
-dompteuses d$t2z
-domptez d$te
-tabis tabi
-libye libi
-est E
-bœuf b9f
-bœufs b2
-dis-je diZ
-employ #plwa
-amusemens amyzm#
-parens paR#
-peur p9R
-vapeur vap9R
-moeurs m9R
-mœurs m9R
-tous tu
-Achille aSil
-Achilles aSil
-ignora iJORa
-ignorai iJORE
-ignoraient iJORE
-ignorais iJORE
-ignorait iJORE
-ignorance iJOR#s
-ignorances iJOR#s
-ignorant iJOR#
-ignorante iJOR#t
-ignorantes iJOR#t
-ignorantins iJOR#t)
-ignorants iJOR#
-ignorassent iJORas
-ignore iJOR
-ignorent iJOR
-ignorer iJORe
-donc d$
-pattern patERn
-est-ce Es
-Rouen Rw#
-c'est sE
-l'est lE
--il il
-die di
-'en #
-étais-je etEZ
-lords lOR
-post-scriptum pOstskRipt9m
-Arras aRas
-arras aRas
-laissez-les lEselE
-ruz Ry
-c'est sE
-l'est lE
-m'en m#
-Soize swaz
-Cianán kajnan
-inuit inwit
-inuits inwit
-mindel m)dEl
-mindels m)dEl
-citroën sitROEn
-Citroën sitROEn
-inlay inlE
-inlays inlE
-ber bER
-bers bER
-ehud eud
-Ehud eud
-rubén Ruben
-Rubén Ruben
-Jefferson ZEfERsOn
-ruolz RwOls
-ruolz RyOls
-maremme maREm
-maremmes maREm
-jackpot dZakpOt
-jackpots dZakpOt
-poële pwal
-poëles pwal
-poëlon pwal$
-poëlées pwale
-Terese teReze
-pôvre povR
-pôvres povR
-Jocelyn Zos2l)
-saburre sabyR
-Sylla sila
-m'sieur msj2
-corner kORnER
-bostryche bOstRiS
-bostryches bOstRiS
-abrivent abRiv#
-abrivents abRiv#
-apocyn apOs)
-apocyns apOs)
-Rostand ROst#
-Zürich zyRik
-Dresde dREzd
-zooment zum
-n'es nE
-Créuse kReyz
diff --git a/buildtrie.py b/buildtrie.py
@@ -1,45 +0,0 @@
-#!/usr/bin/python3 -O
-
-"""From a list of values (arbitrary) and keys (words), create a trie
-representing this mapping"""
-
-import json
-import sys
-
-# first item is a dictionnary from values to an int indicating the
-# number of occurrences with this prefix having this value
-# second item is a dictionnary from letters to descendent nodes
-def empty_node():
- return [{}, {}]
-
-trie = empty_node()
-
-def insert(trie, key, val):
- """Insert val for key in trie"""
- values, children = trie
- # create a new value, if needed
- if len(key) == 0:
- if val not in values.keys():
- values[val] = 0
- # increment count for val
- values[val] += 1
- if len(key) > 0:
- # create a new node if needed
- if key[0] not in children.keys():
- children[key[0]] = empty_node()
- # recurse
- return insert(children[key[0]], key[1:], val)
-
-while True:
- line = sys.stdin.readline()
- if not line:
- break
- line = line.strip().split('\t')
- # a trailing space is used to mark termination of the word
- # this is useful in cases where a prefix of a word is a complete,
- # different word with a different value
- # two spaces because some data words have multiple spaces
- insert(trie, line[0]+' ', line[1])
-
-print(json.dumps(trie))
-
diff --git a/compresstrie.py b/compresstrie.py
@@ -1,43 +0,0 @@
-#!/usr/bin/env python3
-
-"""Read json trie in stdin, trim unneeded branches and output json dump
-to stdout"""
-
-import json
-import sys
-
-trie = json.load(sys.stdin)
-
-def compress(trie):
- """Compress the trie"""
- ref = None
- num = 0
- ok = True
- if trie[0] != {}:
- if len(trie[0].keys()) > 1:
- return None
- ref = list(trie[0].keys())[0]
- num = trie[0][ref]
- for child in trie[1].values():
- x = compress(child)
- if not ok or x == None:
- ok = False
- continue
- r, n = x
- if ref == None:
- ref = r
- if ref != r:
- ok = False
- num += n
- if not ok:
- return None
- trie[0] = {}
- trie[0][ref] = num
- trie[1] = {}
- #print(ref, file=sys.stderr)
- return ref, num
-
-compress(trie)
-
-print(json.dumps(trie))
-
diff --git a/frhyme.py b/frhyme.py
@@ -1,68 +0,0 @@
-#!/usr/bin/python3 -O
-
-"""Try to guess the last few phonemes of a French word, by a lookup in a
-precompiled trie"""
-
-import os
-import json
-import sys
-from pprint import pprint
-
-DEFAULT_NBEST=5
-
-f = open(os.path.join(os.path.dirname(
- os.path.realpath(__file__)), 'frhyme.json'))
-trie = json.load(f)
-f.close()
-
-def to_list(d, rev=True):
- return [(d[a], a[::-1] if rev else a) for a in d.keys()]
-
-def trie2list(trie):
- v, c = trie
- if c == {}:
- return to_list(v)
- else:
- d = {}
- for child in c.keys():
- l = trie2list(c[child])
- for x in l:
- if x[1] not in d.keys():
- d[x[1]] = 0
- d[x[1]] += x[0]
- return to_list(d, False)
-
-def add_dict(a, b):
- return dict( [ (n, a.get(n, 0)+b.get(n, 0)) for n in set(a)|set(b) ] )
-
-def do_lookup(trie, key):
- if len(key) == 0 or key[0] not in trie[1].keys():
- return trie2list(trie)
- return do_lookup(trie[1][key[0]], key[1:])
-
-def nbest(l, t):
- l = sorted(l)[-t:]
- l.reverse()
- return l
-
-def lookup(key, n=DEFAULT_NBEST):
- """Return n top pronunciations for key"""
- return nbest(do_lookup(trie, key[::-1] + ' '), n)
-
-def wrap_lookup(line, n):
- pprint(lookup(line.lower().strip(), n))
-
-if __name__ == '__main__':
- n = DEFAULT_NBEST
- if len(sys.argv) >= 2:
- n = int(sys.argv[1])
- if len(sys.argv) > 2:
- for arg in sys.argv[2:]:
- wrap_lookup(arg, n)
- else:
- while True:
- line = sys.stdin.readline()
- if not line:
- break
- wrap_lookup(line, n)
-
diff --git a/frhyme/__init__.py b/frhyme/__init__.py
@@ -0,0 +1 @@
+from .frhyme import *
diff --git a/frhyme/buildtrie.py b/frhyme/buildtrie.py
@@ -0,0 +1,45 @@
+#!/usr/bin/python3 -O
+
+"""From a list of values (arbitrary) and keys (words), create a trie
+representing this mapping"""
+
+import json
+import sys
+
+# a trie node is a two-item list: the first item is a dictionary from
+# values to the number of occurrences with this prefix having this value;
+# the second item is a dictionary from letters to descendant nodes
+def empty_node():
+ return [{}, {}]
+
+trie = empty_node()
+
+def insert(trie, key, val):
+ """Insert val for key in trie"""
+ values, children = trie
+ # create a new value, if needed
+ if len(key) == 0:
+ if val not in values.keys():
+ values[val] = 0
+ # increment count for val
+ values[val] += 1
+ if len(key) > 0:
+ # create a new node if needed
+ if key[0] not in children.keys():
+ children[key[0]] = empty_node()
+ # recurse
+ return insert(children[key[0]], key[1:], val)
+
+while True:
+ line = sys.stdin.readline()
+ if not line:
+ break
+ line = line.strip().split('\t')
+ # a trailing space is used to mark termination of the word
+ # this is useful in cases where a prefix of a word is a complete,
+ # different word with a different value
+ # two spaces because some data words have multiple spaces
+ insert(trie, line[0]+' ', line[1])
+
+print(json.dumps(trie))
+
diff --git a/frhyme/compresstrie.py b/frhyme/compresstrie.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+
+"""Read json trie in stdin, trim unneeded branches and output json dump
+to stdout"""
+
+import json
+import sys
+
+trie = json.load(sys.stdin)
+
+def compress(trie):
+ """Compress the trie"""
+ ref = None
+ num = 0
+ ok = True
+ if trie[0] != {}:
+ if len(trie[0].keys()) > 1:
+ return None
+ ref = list(trie[0].keys())[0]
+ num = trie[0][ref]
+ for child in trie[1].values():
+ x = compress(child)
+ if not ok or x == None:
+ ok = False
+ continue
+ r, n = x
+ if ref == None:
+ ref = r
+ if ref != r:
+ ok = False
+ num += n
+ if not ok:
+ return None
+ trie[0] = {}
+ trie[0][ref] = num
+ trie[1] = {}
+ #print(ref, file=sys.stderr)
+ return ref, num
+
+compress(trie)
+
+print(json.dumps(trie))
+
diff --git a/frhyme/frhyme.py b/frhyme/frhyme.py
@@ -0,0 +1,68 @@
+#!/usr/bin/python3 -O
+
+"""Try to guess the last few phonemes of a French word, by a lookup in a
+precompiled trie"""
+
+import os
+import json
+import sys
+from pprint import pprint
+
+DEFAULT_NBEST=5
+
+f = open(os.path.join(os.path.dirname(
+ os.path.realpath(__file__)), 'frhyme.json'))
+trie = json.load(f)
+f.close()
+
+def to_list(d, rev=True):
+ return [(d[a], a[::-1] if rev else a) for a in d.keys()]
+
+def trie2list(trie):
+ v, c = trie
+ if c == {}:
+ return to_list(v)
+ else:
+ d = {}
+ for child in c.keys():
+ l = trie2list(c[child])
+ for x in l:
+ if x[1] not in d.keys():
+ d[x[1]] = 0
+ d[x[1]] += x[0]
+ return to_list(d, False)
+
+def add_dict(a, b):
+ return dict( [ (n, a.get(n, 0)+b.get(n, 0)) for n in set(a)|set(b) ] )
+
+def do_lookup(trie, key):
+ if len(key) == 0 or key[0] not in trie[1].keys():
+ return trie2list(trie)
+ return do_lookup(trie[1][key[0]], key[1:])
+
+def nbest(l, t):
+ l = sorted(l)[-t:]
+ l.reverse()
+ return l
+
+def lookup(key, n=DEFAULT_NBEST):
+ """Return n top pronunciations for key"""
+ return nbest(do_lookup(trie, key[::-1] + ' '), n)
+
+def wrap_lookup(line, n):
+ pprint(lookup(line.lower().strip(), n))
+
+if __name__ == '__main__':
+ n = DEFAULT_NBEST
+ if len(sys.argv) >= 2:
+ n = int(sys.argv[1])
+ if len(sys.argv) > 2:
+ for arg in sys.argv[2:]:
+ wrap_lookup(arg, n)
+ else:
+ while True:
+ line = sys.stdin.readline()
+ if not line:
+ break
+ wrap_lookup(line, n)
+
diff --git a/lexique/lexique_fix.sh b/lexique/lexique_fix.sh
@@ -1,8 +0,0 @@
-#!/bin/bash
-
-# General fixes for lexique
-
-cd "$( dirname "$0" )"
-
-sed 1d | ./subst.pl
-
diff --git a/lexique/lexique_prepare.sh b/lexique/lexique_prepare.sh
@@ -1,6 +0,0 @@
-#!/bin/bash
-
-# Prepare the Lexique file for use with frhyme
-
-cut -f 1,2 | uniq
-
diff --git a/lexique/lexique_retrieve.sh b/lexique/lexique_retrieve.sh
@@ -1,12 +0,0 @@
-#!/bin/bash
-
-ZIP="Lexique382.zip"
-URL="http://www.lexique.org/databases/Lexique382/$ZIP"
-FILE="Lexique382.tsv"
-
-cd "$( dirname "$0" )"
-
-wget $URL
-unzip -qq $ZIP $FILE
-cat $FILE | ./lexique_fix.sh | ./lexique_prepare.sh
-
diff --git a/lexique/subst.pl b/lexique/subst.pl
@@ -1,38 +0,0 @@
-#!/usr/bin/perl
-
-# This file fixes Lexique's pronunciation info from the home-grown
-# format described in
-# http://www.lexique.org/outils/Manuel_Lexique.htm#_Toc108519023 to a
-# variation of the X-SAMPA standard
-
-
-sub subst {
- my $a = shift;
- # substitutions to apply
- my @s = (
- ['§', '$'],
- ['@', '#'],
- ['1', '('],
- ['5', ')'],
- ['°', '@'],
- ['3', '@'],
- ['H', '8'],
- ['N', 'J'],
- ['G', 'N'],
- );
- foreach my $t (@s) {
- $a =~ s/${$t}[0]/${$t}[1]/g
- }
- return $a;
-}
-
-while (<>) {
- chop;
- if (/^([^\t]*)\t([^\t]*)(.*)$/) {
- my $repl = subst $2;
- print "$1\t$repl$3\n";
- } else {
- die "Cannot process line: $_\n";
- }
-}
-
diff --git a/make.sh b/make.sh
@@ -1,9 +0,0 @@
-#!/bin/bash
-
-NUM=$1
-shift
-
-cat $* | ./truncate.sh $NUM |
- rev | awk --field-separator="\t" '{printf "%s\t%s\n", $2, $1}' |
- ./buildtrie.py | ./compresstrie.py
-
diff --git a/scripts/additions b/scripts/additions
@@ -0,0 +1,109 @@
+almanach almana
+dompte d$t
+domptent d$t
+dompterai d$tRE
+dompterait d$tRE
+dompter d$te
+dompteur d$t9R
+dompteurs d$t9R
+dompteuse d$t2z
+dompteuses d$t2z
+domptez d$te
+tabis tabi
+libye libi
+est E
+bœuf b9f
+bœufs b2
+dis-je diZ
+employ #plwa
+amusemens amyzm#
+parens paR#
+peur p9R
+vapeur vap9R
+moeurs m9R
+mœurs m9R
+tous tu
+Achille aSil
+Achilles aSil
+ignora iJORa
+ignorai iJORE
+ignoraient iJORE
+ignorais iJORE
+ignorait iJORE
+ignorance iJOR#s
+ignorances iJOR#s
+ignorant iJOR#
+ignorante iJOR#t
+ignorantes iJOR#t
+ignorantins iJOR#t)
+ignorants iJOR#
+ignorassent iJORas
+ignore iJOR
+ignorent iJOR
+ignorer iJORe
+donc d$
+pattern patERn
+est-ce Es
+Rouen Rw#
+c'est sE
+l'est lE
+-il il
+die di
+'en #
+étais-je etEZ
+lords lOR
+post-scriptum pOstskRipt9m
+Arras aRas
+arras aRas
+laissez-les lEselE
+ruz Ry
+c'est sE
+l'est lE
+m'en m#
+Soize swaz
+Cianán kajnan
+inuit inwit
+inuits inwit
+mindel m)dEl
+mindels m)dEl
+citroën sitROEn
+Citroën sitROEn
+inlay inlE
+inlays inlE
+ber bER
+bers bER
+ehud eud
+Ehud eud
+rubén Ruben
+Rubén Ruben
+Jefferson ZEfERsOn
+ruolz RwOls
+ruolz RyOls
+maremme maREm
+maremmes maREm
+jackpot dZakpOt
+jackpots dZakpOt
+poële pwal
+poëles pwal
+poëlon pwal$
+poëlées pwale
+Terese teReze
+pôvre povR
+pôvres povR
+Jocelyn Zos2l)
+saburre sabyR
+Sylla sila
+m'sieur msj2
+corner kORnER
+bostryche bOstRiS
+bostryches bOstRiS
+abrivent abRiv#
+abrivents abRiv#
+apocyn apOs)
+apocyns apOs)
+Rostand ROst#
+Zürich zyRik
+Dresde dREzd
+zooment zum
+n'es nE
+Créuse kReyz
diff --git a/scripts/install.sh b/scripts/install.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+echo "Downloading Lexique"
+lexique/lexique_retrieve.sh > lexique.txt
+./make.sh 4 lexique.txt additions > ../frhyme/frhyme.json
diff --git a/scripts/lexique/lexique_fix.sh b/scripts/lexique/lexique_fix.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+# General fixes for lexique
+
+cd "$( dirname "$0" )"
+
+sed 1d | ./subst.pl
+
diff --git a/scripts/lexique/lexique_prepare.sh b/scripts/lexique/lexique_prepare.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+# Prepare the Lexique file for use with frhyme
+
+cut -f 1,2 | uniq
+
diff --git a/scripts/lexique/lexique_retrieve.sh b/scripts/lexique/lexique_retrieve.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+ZIP="Lexique382.zip"
+URL="http://www.lexique.org/databases/Lexique382/$ZIP"
+FILE="Lexique382.tsv"
+
+cd "$( dirname "$0" )"
+
+wget $URL
+unzip -qq $ZIP $FILE
+cat $FILE | ./lexique_fix.sh | ./lexique_prepare.sh
+
diff --git a/scripts/lexique/subst.pl b/scripts/lexique/subst.pl
@@ -0,0 +1,38 @@
+#!/usr/bin/perl
+
+# This script converts Lexique's pronunciation info from the home-grown
+# format described in
+# http://www.lexique.org/outils/Manuel_Lexique.htm#_Toc108519023 to a
+# variation of the X-SAMPA standard
+
+
+sub subst {
+ my $a = shift;
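+ # substitutions to apply, in this order; order matters: '@' must become '#' before '°' and '3' are mapped to '@', and 'N' must become 'J' before 'G' is mapped to 'N'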
+ my @s = (
+ ['§', '$'],
+ ['@', '#'],
+ ['1', '('],
+ ['5', ')'],
+ ['°', '@'],
+ ['3', '@'],
+ ['H', '8'],
+ ['N', 'J'],
+ ['G', 'N'],
+ );
+ foreach my $t (@s) {
+ $a =~ s/${$t}[0]/${$t}[1]/g
+ }
+ return $a;
+}
+
+while (<>) {
+ chop;
+ if (/^([^\t]*)\t([^\t]*)(.*)$/) {
+ my $repl = subst $2;
+ print "$1\t$repl$3\n";
+ } else {
+ die "Cannot process line: $_\n";
+ }
+}
+
diff --git a/scripts/make.sh b/scripts/make.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+NUM=$1
+shift
+
+cat $* | ./truncate.sh $NUM |
+ rev | awk --field-separator="\t" '{printf "%s\t%s\n", $2, $1}' |
+ ../frhyme/buildtrie.py | ../frhyme/compresstrie.py
+
diff --git a/scripts/truncate.sh b/scripts/truncate.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+NUM=${1:?Usage: $0 NUM_PHONEMES}
+awk --field-separator="\t" '{ printf "%s\t%s\n", $1, substr( $2, length($2) - '$NUM' + 1) }'
+
diff --git a/setup.py b/setup.py
@@ -0,0 +1,20 @@
+import setuptools
+
+with open("README", "r") as fh:
+ long_description = fh.read()
+
+setuptools.setup(
+ name='frhyme',
+ version='0.2',
+ author="Antoine Amarilli",
+ author_email="a3nm@a3nm.net",
+ package_data={'frhyme' :['*json']},
+ description="Guess the last phonemes of a French word",
+ long_description=long_description,
+ long_description_content_type="text/markdown",
+ url="https://gitlab.com/a3nm/frhyme",
+ packages=setuptools.find_packages(),
+ classifiers=[
+ "Programming Language :: Python :: 3",
+ ],
+)
diff --git a/truncate.sh b/truncate.sh
@@ -1,5 +0,0 @@
-#!/bin/bash
-
-NUM=${1:?Usage: $0 NUM_PHONEMES}
-awk --field-separator="\t" '{ printf "%s\t%s\n", $1, substr( $2, length($2) - '$NUM' + 1) }'
-