commit f0f352ea98d49111239f3daf143cbf7025a18a40
parent 2d217822c497c984747bf44760318ae27ce0880b
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Tue, 13 Mar 2012 11:57:47 +0100
improve lexique tools
Diffstat:
3 files changed, 18 insertions(+), 18 deletions(-)
diff --git a/lexique/lexique_prepare.sh b/lexique/lexique_prepare.sh
@@ -2,7 +2,5 @@
# Prepare the Lexique file for use with frhyme
-cd "$( dirname "$0" )"
-
-awk '{print $1, $2}' | iconv -f latin1 -t utf8 | ./subst.pl
+cut -f 1,2 | uniq
diff --git a/lexique/lexique_retrieve.sh b/lexique/lexique_retrieve.sh
@@ -7,6 +7,6 @@ FILE="Lexique371/Bases+Scripts/Lexique3.txt"
cd "$( dirname "$0" )"
wget $URL
-unzip $ZIP $FILE
-cat $FILE | ./prepare_lexique.sh
+unzip -qq $ZIP $FILE
+cat $FILE | ./lexique_fix.sh | ./lexique_prepare.sh
diff --git a/lexique/subst.pl b/lexique/subst.pl
@@ -2,23 +2,23 @@
# This file fixes Lexique's pronunciation info from the home-grown
# format described in
-# http://www.lexique.org/outils/Manuel_Lexique.htm#_Toc108519023 to the
-# X-SAMPA standard
+# http://www.lexique.org/outils/Manuel_Lexique.htm#_Toc108519023 to a
+# variation of the X-SAMPA standard
sub subst {
my $a = shift;
# substitutions to apply
my @s = (
- ["§", "O~"],
- ["@", "A~"],
- ["1", "E~"],
- ["5", "9~"],
- ["°", "@"],
- ["3", "@"],
- ["H", "8"],
- ["N", "J"],
- ["G", "N"],
+ ['§', '$'],
+ ['@', '#'],
+ ['1', '('],
+ ['5', ')'],
+ ['°', '@'],
+ ['3', '@'],
+ ['H', '8'],
+ ['N', 'J'],
+ ['G', 'N'],
);
foreach my $t (@s) {
$a =~ s/${$t}[0]/${$t}[1]/g
@@ -28,9 +28,11 @@ sub subst {
while (<>) {
chop;
- if (/^(.*) ([^ ]*)$/) {
+ if (/^([^\t]*)\t([^\t]*)(.*)$/) {
my $repl = subst $2;
- print "$1 $repl\n";
+ print "$1\t$repl$3\n";
+ } else {
+ die "Cannot process line: $_\n";
}
}