frhyme

guess the last phonemes of a French word (local mirror of https://gitlab.com/a3nm/frhyme)
git clone https://a3nm.net/git/frhyme/
Log | Files | Refs | README | LICENSE

commit 6e9af935a279923df039026980c92b979e26947a
parent 5f623dafafdd06a94b2165224cd5a0eba4cc1451
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Fri, 16 Aug 2019 00:03:44 +0200

Merge gitlab.com:a3nm/frhyme

Diffstat:
.gitignore | 3+++
LICENSE | 18++++++++++++++++++
README | 4+++-
additions | 109-------------------------------------------------------------------------------
buildtrie.py | 45---------------------------------------------
compresstrie.py | 43-------------------------------------------
frhyme.py | 68--------------------------------------------------------------------
frhyme/__init__.py | 1+
frhyme/buildtrie.py | 45+++++++++++++++++++++++++++++++++++++++++++++
frhyme/compresstrie.py | 43+++++++++++++++++++++++++++++++++++++++++++
frhyme/frhyme.py | 68++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
lexique/lexique_fix.sh | 8--------
lexique/lexique_prepare.sh | 6------
lexique/lexique_retrieve.sh | 12------------
lexique/subst.pl | 38--------------------------------------
make.sh | 9---------
scripts/additions | 109+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
scripts/install.sh | 5+++++
scripts/lexique/lexique_fix.sh | 8++++++++
scripts/lexique/lexique_prepare.sh | 6++++++
scripts/lexique/lexique_retrieve.sh | 12++++++++++++
scripts/lexique/subst.pl | 38++++++++++++++++++++++++++++++++++++++
scripts/make.sh | 9+++++++++
scripts/truncate.sh | 5+++++
setup.py | 20++++++++++++++++++++
truncate.sh | 5-----
26 files changed, 393 insertions(+), 344 deletions(-)

diff --git a/.gitignore b/.gitignore @@ -1,3 +1,6 @@ frhyme.json lexique.txt lexique/Lexique* +build/ +dist/ +frhyme.egg-info/ diff --git a/LICENSE b/LICENSE @@ -0,0 +1,18 @@ +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/README b/README @@ -55,8 +55,10 @@ should be trained from a pronunciation database. The recommended way to do so is to use a tweaked Lexique <http://lexique.org> along with a provided bugfix file, as follows: + cd scripts lexique/lexique_retrieve.sh > lexique.txt - ./make.sh NPHON lexique.txt additions > frhyme.json + ./make.sh NPHON lexique.txt additions > ../frhyme/frhyme.json + cd .. where NPHON is the number of trailing phonemes to keep (suggested value: 4). Beware, this may take up several hundred megabytes of RAM. The resulting file diff --git a/additions b/additions @@ -1,109 +0,0 @@ -almanach almana -dompte d$t -domptent d$t -dompterai d$tRE -dompterait d$tRE -dompter d$te -dompteur d$t9R -dompteurs d$t9R -dompteuse d$t2z -dompteuses d$t2z -domptez d$te -tabis tabi -libye libi -est E -bœuf b9f -bœufs b2 -dis-je diZ -employ #plwa -amusemens amyzm# -parens paR# -peur p9R -vapeur vap9R -moeurs m9R -mœurs m9R -tous tu -Achille aSil -Achilles aSil -ignora iJORa -ignorai iJORE -ignoraient iJORE -ignorais iJORE -ignorait iJORE -ignorance iJOR#s -ignorances iJOR#s -ignorant iJOR# -ignorante iJOR#t -ignorantes iJOR#t -ignorantins iJOR#t) -ignorants iJOR# -ignorassent iJORas -ignore iJOR -ignorent iJOR -ignorer iJORe -donc d$ -pattern patERn -est-ce Es -Rouen Rw# -c'est sE -l'est lE --il il -die di -'en # -étais-je etEZ -lords lOR -post-scriptum pOstskRipt9m -Arras aRas -arras aRas -laissez-les lEselE -ruz Ry -c'est sE -l'est lE -m'en m# -Soize swaz -Cianán kajnan -inuit inwit -inuits inwit -mindel m)dEl -mindels m)dEl -citroën sitROEn -Citroën sitROEn -inlay inlE -inlays inlE -ber bER -bers bER -ehud eud -Ehud eud -rubén Ruben -Rubén Ruben -Jefferson ZEfERsOn -ruolz RwOls -ruolz RyOls -maremme maREm -maremmes maREm -jackpot dZakpOt -jackpots dZakpOt -poële pwal -poëles pwal -poëlon pwal$ -poëlées pwale -Terese teReze -pôvre povR -pôvres povR -Jocelyn Zos2l) -saburre sabyR -Sylla sila -m'sieur msj2 -corner kORnER -bostryche bOstRiS -bostryches bOstRiS -abrivent abRiv# -abrivents abRiv# -apocyn apOs) -apocyns apOs) -Rostand ROst# -Zürich zyRik -Dresde dREzd -zooment zum -n'es nE -Créuse kReyz diff --git a/buildtrie.py b/buildtrie.py @@ -1,45 +0,0 @@ -#!/usr/bin/python3 -O - -"""From a list of values (arbitrary) and keys (words), create a trie -representing this mapping""" - -import json -import sys - -# first item is a dictionnary from values to an int indicating the -# number of occurrences with this prefix having this value -# second item is a dictionnary from letters to descendent nodes -def empty_node(): - return [{}, {}] - -trie = empty_node() - -def insert(trie, key, val): - """Insert val for key in trie""" - values, children = trie - # create a new value, if needed - if len(key) == 0: - if val not in values.keys(): - values[val] = 0 - # increment count for val - values[val] += 1 - if len(key) > 0: - # create a new node if needed - if key[0] not in children.keys(): - children[key[0]] = empty_node() - # recurse - return insert(children[key[0]], key[1:], val) - -while True: - line = sys.stdin.readline() - if not line: - break - line = line.strip().split('\t') - # a trailing space is used to mark termination of the word - # this is useful in cases where a prefix of a word is a complete, - # different word with a different value - # two spaces because some data words have multiple spaces - insert(trie, line[0]+' ', line[1]) - -print(json.dumps(trie)) - diff --git a/compresstrie.py b/compresstrie.py @@ -1,43 +0,0 @@ -#!/usr/bin/env python3 - -"""Read json trie in stdin, trim unneeded branches and output json dump -to stdout""" - -import json -import sys - -trie = json.load(sys.stdin) - -def compress(trie): - """Compress the trie""" - ref = None - num = 0 - ok = True - if trie[0] != {}: - if len(trie[0].keys()) > 1: - return None - ref = list(trie[0].keys())[0] - num = trie[0][ref] - for child in trie[1].values(): - x = compress(child) - if not ok or x == None: - ok = False - continue - r, n = x - if ref == None: - ref = r - if ref != r: - ok = False - num += n - if not ok: - return None - trie[0] = {} - trie[0][ref] = num - trie[1] = {} - #print(ref, file=sys.stderr) - return ref, num - -compress(trie) - -print(json.dumps(trie)) - diff --git a/frhyme.py b/frhyme.py @@ -1,68 +0,0 @@ -#!/usr/bin/python3 -O - -"""Try to guess the last few phonemes of a French word, by a lookup in a -precompiled trie""" - -import os -import json -import sys -from pprint import pprint - -DEFAULT_NBEST=5 - -f = open(os.path.join(os.path.dirname( - os.path.realpath(__file__)), 'frhyme.json')) -trie = json.load(f) -f.close() - -def to_list(d, rev=True): - return [(d[a], a[::-1] if rev else a) for a in d.keys()] - -def trie2list(trie): - v, c = trie - if c == {}: - return to_list(v) - else: - d = {} - for child in c.keys(): - l = trie2list(c[child]) - for x in l: - if x[1] not in d.keys(): - d[x[1]] = 0 - d[x[1]] += x[0] - return to_list(d, False) - -def add_dict(a, b): - return dict( [ (n, a.get(n, 0)+b.get(n, 0)) for n in set(a)|set(b) ] ) - -def do_lookup(trie, key): - if len(key) == 0 or key[0] not in trie[1].keys(): - return trie2list(trie) - return do_lookup(trie[1][key[0]], key[1:]) - -def nbest(l, t): - l = sorted(l)[-t:] - l.reverse() - return l - -def lookup(key, n=DEFAULT_NBEST): - """Return n top pronunciations for key""" - return nbest(do_lookup(trie, key[::-1] + ' '), n) - -def wrap_lookup(line, n): - pprint(lookup(line.lower().strip(), n)) - -if __name__ == '__main__': - n = DEFAULT_NBEST - if len(sys.argv) >= 2: - n = int(sys.argv[1]) - if len(sys.argv) > 2: - for arg in sys.argv[2:]: - wrap_lookup(arg, n) - else: - while True: - line = sys.stdin.readline() - if not line: - break - wrap_lookup(line, n) - diff --git a/frhyme/__init__.py b/frhyme/__init__.py @@ -0,0 +1 @@ +from .frhyme import * diff --git a/frhyme/buildtrie.py b/frhyme/buildtrie.py @@ -0,0 +1,45 @@ +#!/usr/bin/python3 -O + +"""From a list of values (arbitrary) and keys (words), create a trie +representing this mapping""" + +import json +import sys + +# first item is a dictionnary from values to an int indicating the +# number of occurrences with this prefix having this value +# second item is a dictionnary from letters to descendent nodes +def empty_node(): + return [{}, {}] + +trie = empty_node() + +def insert(trie, key, val): + """Insert val for key in trie""" + values, children = trie + # create a new value, if needed + if len(key) == 0: + if val not in values.keys(): + values[val] = 0 + # increment count for val + values[val] += 1 + if len(key) > 0: + # create a new node if needed + if key[0] not in children.keys(): + children[key[0]] = empty_node() + # recurse + return insert(children[key[0]], key[1:], val) + +while True: + line = sys.stdin.readline() + if not line: + break + line = line.strip().split('\t') + # a trailing space is used to mark termination of the word + # this is useful in cases where a prefix of a word is a complete, + # different word with a different value + # two spaces because some data words have multiple spaces + insert(trie, line[0]+' ', line[1]) + +print(json.dumps(trie)) + diff --git a/frhyme/compresstrie.py b/frhyme/compresstrie.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 + +"""Read json trie in stdin, trim unneeded branches and output json dump +to stdout""" + +import json +import sys + +trie = json.load(sys.stdin) + +def compress(trie): + """Compress the trie""" + ref = None + num = 0 + ok = True + if trie[0] != {}: + if len(trie[0].keys()) > 1: + return None + ref = list(trie[0].keys())[0] + num = trie[0][ref] + for child in trie[1].values(): + x = compress(child) + if not ok or x == None: + ok = False + continue + r, n = x + if ref == None: + ref = r + if ref != r: + ok = False + num += n + if not ok: + return None + trie[0] = {} + trie[0][ref] = num + trie[1] = {} + #print(ref, file=sys.stderr) + return ref, num + +compress(trie) + +print(json.dumps(trie)) + diff --git a/frhyme/frhyme.py b/frhyme/frhyme.py @@ -0,0 +1,68 @@ +#!/usr/bin/python3 -O + +"""Try to guess the last few phonemes of a French word, by a lookup in a +precompiled trie""" + +import os +import json +import sys +from pprint import pprint + +DEFAULT_NBEST=5 + +f = open(os.path.join(os.path.dirname( + os.path.realpath(__file__)), 'frhyme.json')) +trie = json.load(f) +f.close() + +def to_list(d, rev=True): + return [(d[a], a[::-1] if rev else a) for a in d.keys()] + +def trie2list(trie): + v, c = trie + if c == {}: + return to_list(v) + else: + d = {} + for child in c.keys(): + l = trie2list(c[child]) + for x in l: + if x[1] not in d.keys(): + d[x[1]] = 0 + d[x[1]] += x[0] + return to_list(d, False) + +def add_dict(a, b): + return dict( [ (n, a.get(n, 0)+b.get(n, 0)) for n in set(a)|set(b) ] ) + +def do_lookup(trie, key): + if len(key) == 0 or key[0] not in trie[1].keys(): + return trie2list(trie) + return do_lookup(trie[1][key[0]], key[1:]) + +def nbest(l, t): + l = sorted(l)[-t:] + l.reverse() + return l + +def lookup(key, n=DEFAULT_NBEST): + """Return n top pronunciations for key""" + return nbest(do_lookup(trie, key[::-1] + ' '), n) + +def wrap_lookup(line, n): + pprint(lookup(line.lower().strip(), n)) + +if __name__ == '__main__': + n = DEFAULT_NBEST + if len(sys.argv) >= 2: + n = int(sys.argv[1]) + if len(sys.argv) > 2: + for arg in sys.argv[2:]: + wrap_lookup(arg, n) + else: + while True: + line = sys.stdin.readline() + if not line: + break + wrap_lookup(line, n) + diff --git a/lexique/lexique_fix.sh b/lexique/lexique_fix.sh @@ -1,8 +0,0 @@ -#!/bin/bash - -# General fixes for lexique - -cd "$( dirname "$0" )" - -sed 1d | ./subst.pl - diff --git a/lexique/lexique_prepare.sh b/lexique/lexique_prepare.sh @@ -1,6 +0,0 @@ -#!/bin/bash - -# Prepare the Lexique file for use with frhyme - -cut -f 1,2 | uniq - diff --git a/lexique/lexique_retrieve.sh b/lexique/lexique_retrieve.sh @@ -1,12 +0,0 @@ -#!/bin/bash - -ZIP="Lexique382.zip" -URL="http://www.lexique.org/databases/Lexique382/$ZIP" -FILE="Lexique382.tsv" - -cd "$( dirname "$0" )" - -wget $URL -unzip -qq $ZIP $FILE -cat $FILE | ./lexique_fix.sh | ./lexique_prepare.sh - diff --git a/lexique/subst.pl b/lexique/subst.pl @@ -1,38 +0,0 @@ -#!/usr/bin/perl - -# This file fixes Lexique's pronunciation info from the home-grown -# format described in -# http://www.lexique.org/outils/Manuel_Lexique.htm#_Toc108519023 to a -# variation of the X-SAMPA standard - - -sub subst { - my $a = shift; - # substitutions to apply - my @s = ( - ['§', '$'], - ['@', '#'], - ['1', '('], - ['5', ')'], - ['°', '@'], - ['3', '@'], - ['H', '8'], - ['N', 'J'], - ['G', 'N'], - ); - foreach my $t (@s) { - $a =~ s/${$t}[0]/${$t}[1]/g - } - return $a; -} - -while (<>) { - chop; - if (/^([^\t]*)\t([^\t]*)(.*)$/) { - my $repl = subst $2; - print "$1\t$repl$3\n"; - } else { - die "Cannot process line: $_\n"; - } -} - diff --git a/make.sh b/make.sh @@ -1,9 +0,0 @@ -#!/bin/bash - -NUM=$1 -shift - -cat $* | ./truncate.sh $NUM | - rev | awk --field-separator="\t" '{printf "%s\t%s\n", $2, $1}' | - ./buildtrie.py | ./compresstrie.py - diff --git a/scripts/additions b/scripts/additions @@ -0,0 +1,109 @@ +almanach almana +dompte d$t +domptent d$t +dompterai d$tRE +dompterait d$tRE +dompter d$te +dompteur d$t9R +dompteurs d$t9R +dompteuse d$t2z +dompteuses d$t2z +domptez d$te +tabis tabi +libye libi +est E +bœuf b9f +bœufs b2 +dis-je diZ +employ #plwa +amusemens amyzm# +parens paR# +peur p9R +vapeur vap9R +moeurs m9R +mœurs m9R +tous tu +Achille aSil +Achilles aSil +ignora iJORa +ignorai iJORE +ignoraient iJORE +ignorais iJORE +ignorait iJORE +ignorance iJOR#s +ignorances iJOR#s +ignorant iJOR# +ignorante iJOR#t +ignorantes iJOR#t +ignorantins iJOR#t) +ignorants iJOR# +ignorassent iJORas +ignore iJOR +ignorent iJOR +ignorer iJORe +donc d$ +pattern patERn +est-ce Es +Rouen Rw# +c'est sE +l'est lE +-il il +die di +'en # +étais-je etEZ +lords lOR +post-scriptum pOstskRipt9m +Arras aRas +arras aRas +laissez-les lEselE +ruz Ry +c'est sE +l'est lE +m'en m# +Soize swaz +Cianán kajnan +inuit inwit +inuits inwit +mindel m)dEl +mindels m)dEl +citroën sitROEn +Citroën sitROEn +inlay inlE +inlays inlE +ber bER +bers bER +ehud eud +Ehud eud +rubén Ruben +Rubén Ruben +Jefferson ZEfERsOn +ruolz RwOls +ruolz RyOls +maremme maREm +maremmes maREm +jackpot dZakpOt +jackpots dZakpOt +poële pwal +poëles pwal +poëlon pwal$ +poëlées pwale +Terese teReze +pôvre povR +pôvres povR +Jocelyn Zos2l) +saburre sabyR +Sylla sila +m'sieur msj2 +corner kORnER +bostryche bOstRiS +bostryches bOstRiS +abrivent abRiv# +abrivents abRiv# +apocyn apOs) +apocyns apOs) +Rostand ROst# +Zürich zyRik +Dresde dREzd +zooment zum +n'es nE +Créuse kReyz diff --git a/scripts/install.sh b/scripts/install.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +echo "Downloading Lexique" +lexique/lexique_retrieve.sh > lexique.txt +./make.sh 4 lexique.txt additions > ../frhyme/frhyme.json diff --git a/scripts/lexique/lexique_fix.sh b/scripts/lexique/lexique_fix.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +# General fixes for lexique + +cd "$( dirname "$0" )" + +sed 1d | ./subst.pl + diff --git a/scripts/lexique/lexique_prepare.sh b/scripts/lexique/lexique_prepare.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +# Prepare the Lexique file for use with frhyme + +cut -f 1,2 | uniq + diff --git a/scripts/lexique/lexique_retrieve.sh b/scripts/lexique/lexique_retrieve.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +ZIP="Lexique382.zip" +URL="http://www.lexique.org/databases/Lexique382/$ZIP" +FILE="Lexique382.tsv" + +cd "$( dirname "$0" )" + +wget $URL +unzip -qq $ZIP $FILE +cat $FILE | ./lexique_fix.sh | ./lexique_prepare.sh + diff --git a/scripts/lexique/subst.pl b/scripts/lexique/subst.pl @@ -0,0 +1,38 @@ +#!/usr/bin/perl + +# This file fixes Lexique's pronunciation info from the home-grown +# format described in +# http://www.lexique.org/outils/Manuel_Lexique.htm#_Toc108519023 to a +# variation of the X-SAMPA standard + + +sub subst { + my $a = shift; + # substitutions to apply + my @s = ( + ['§', '$'], + ['@', '#'], + ['1', '('], + ['5', ')'], + ['°', '@'], + ['3', '@'], + ['H', '8'], + ['N', 'J'], + ['G', 'N'], + ); + foreach my $t (@s) { + $a =~ s/${$t}[0]/${$t}[1]/g + } + return $a; +} + +while (<>) { + chop; + if (/^([^\t]*)\t([^\t]*)(.*)$/) { + my $repl = subst $2; + print "$1\t$repl$3\n"; + } else { + die "Cannot process line: $_\n"; + } +} + diff --git a/scripts/make.sh b/scripts/make.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +NUM=$1 +shift + +cat $* | ./truncate.sh $NUM | + rev | awk --field-separator="\t" '{printf "%s\t%s\n", $2, $1}' | + ../frhyme/buildtrie.py | ../frhyme/compresstrie.py + diff --git a/scripts/truncate.sh b/scripts/truncate.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +NUM=${1:?Usage: $0 NUM_PHONEMES} +awk --field-separator="\t" '{ printf "%s\t%s\n", $1, substr( $2, length($2) - '$NUM' + 1) }' + diff --git a/setup.py b/setup.py @@ -0,0 +1,20 @@ +import setuptools + +with open("README", "r") as fh: + long_description = fh.read() + +setuptools.setup( + name='frhyme', + version='0.2', + author="Antoine Amarilli", + author_email="a3nm@a3nm.net", + package_data={'frhyme' :['*json']}, + description="Guess the last phonemes of a French word", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://gitlab.com/a3nm/frhyme", + packages=setuptools.find_packages(), + classifiers=[ + "Programming Language :: Python :: 3", + ], +) diff --git a/truncate.sh b/truncate.sh @@ -1,5 +0,0 @@ -#!/bin/bash - -NUM=${1:?Usage: $0 NUM_PHONEMES} -awk --field-separator="\t" '{ printf "%s\t%s\n", $1, substr( $2, length($2) - '$NUM' + 1) }' -