frhyme

guess the last phonemes of a French word
git clone https://a3nm.net/git/frhyme/
Log | Files | Refs | README

commit f0f352ea98d49111239f3daf143cbf7025a18a40
parent 2d217822c497c984747bf44760318ae27ce0880b
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Tue, 13 Mar 2012 11:57:47 +0100

improve lexique tools

Diffstat:
lexique/lexique_prepare.sh | 4 +---
lexique/lexique_retrieve.sh | 4 ++--
lexique/subst.pl | 28 +++++++++++++++-------------
3 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/lexique/lexique_prepare.sh b/lexique/lexique_prepare.sh
@@ -2,7 +2,5 @@
 # Prepare the Lexique file for use with frhyme
-cd "$( dirname "$0" )"
-
-awk '{print $1, $2}' | iconv -f latin1 -t utf8 | ./subst.pl
+cut -f 1,2 | uniq
diff --git a/lexique/lexique_retrieve.sh b/lexique/lexique_retrieve.sh
@@ -7,6 +7,6 @@ FILE="Lexique371/Bases+Scripts/Lexique3.txt"
 cd "$( dirname "$0" )"
 wget $URL
-unzip $ZIP $FILE
-cat $FILE | ./prepare_lexique.sh
+unzip -qq $ZIP $FILE
+cat $FILE | ./lexique_fix.sh | ./lexique_prepare.sh
diff --git a/lexique/subst.pl b/lexique/subst.pl
@@ -2,23 +2,23 @@
 # This file fixes Lexique's pronunciation info from the home-grown
 # format described in
-# http://www.lexique.org/outils/Manuel_Lexique.htm#_Toc108519023 to the
-# X-SAMPA standard
+# http://www.lexique.org/outils/Manuel_Lexique.htm#_Toc108519023 to a
+# variation of the X-SAMPA standard
 sub subst {
   my $a = shift;
   # substitutions to apply
   my @s = (
-    ["§", "O~"],
-    ["@", "A~"],
-    ["1", "E~"],
-    ["5", "9~"],
-    ["°", "@"],
-    ["3", "@"],
-    ["H", "8"],
-    ["N", "J"],
-    ["G", "N"],
+    ['§', '$'],
+    ['@', '#'],
+    ['1', '('],
+    ['5', ')'],
+    ['°', '@'],
+    ['3', '@'],
+    ['H', '8'],
+    ['N', 'J'],
+    ['G', 'N'],
   );
   foreach my $t (@s) {
     $a =~ s/${$t}[0]/${$t}[1]/g
@@ -28,9 +28,11 @@ sub subst {
 while (<>) {
   chop;
-  if (/^(.*) ([^ ]*)$/) {
+  if (/^([^\t]*)\t([^\t]*)(.*)$/) {
     my $repl = subst $2;
-    print "$1 $repl\n";
+    print "$1\t$repl$3\n";
+  } else {
+    die "Cannot process line: $_\n";
   }
 }