commit bd5d041c02f10ff5dad7ae374c8de0c492aa13e9
parent 7b0d4c908e7f20bc44c07a58fc41eb6dc88d66b5
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Sat, 17 Aug 2019 18:25:14 +0200
add the lexique tweaking scripts to this repo
Diffstat:
3 files changed, 52 insertions(+), 3 deletions(-)
diff --git a/README b/README
@@ -9,9 +9,8 @@ file fixes a few errors that I have found in Lexique (and reported to the
authors).
Last, it contains a modification of the file "lexique" to only use ASCII
-characters for field 2 in "lexique_my_format" (other fields are untouched: the
-precise script used is
-<https://gitlab.com/a3nm/frhyme/blob/master/scripts/lexique/lexique_fix.sh>).
+characters for field 2 in "lexique_my_format" (other fields are untouched): the
+precise script is in scripts/lexique_fix.sh.
The license of all these files is the same as that of Lexique, namely, the
license CC BY SA 4.0 (according to the file README-Lexique.txt in the
diff --git a/scripts/lexique_fix.sh b/scripts/lexique_fix.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# General fixes for lexique
+# Author: Antoine Amarilli
+# Public domain
+
+cd "$( dirname "$0" )"
+
+sed 1d | ./subst.pl
+
diff --git a/scripts/subst.pl b/scripts/subst.pl
@@ -0,0 +1,40 @@
+#!/usr/bin/perl
+
+# This script converts Lexique's pronunciation info from the home-grown
+# format described in
+# http://www.lexique.org/outils/Manuel_Lexique.htm#_Toc108519023 to a
+# variation of the X-SAMPA standard
+# Author: Antoine Amarilli
+# Public domain
+
+
+# Ordered substitution rules: [symbol to find, symbol to put in its place].
+# The rules are applied one after the other, so their order matters: '@' is
+# rewritten to '#' before '°' and '3' introduce new '@' characters, and 'N'
+# is rewritten to 'J' before 'G' introduces new 'N' characters.
+# Built once here rather than on every call to subst().
+my @substitutions = (
+    ['§', '$'],
+    ['@', '#'],
+    ['1', '('],
+    ['5', ')'],
+    ['°', '@'],
+    ['3', '@'],
+    ['H', '8'],
+    ['N', 'J'],
+    ['G', 'N'],
+);
+
+# Apply every substitution rule, in table order, to a pronunciation string.
+# Argument: the string to translate (the caller's value is not modified).
+# Returns: the translated copy.
+sub subst {
+    my ($phon) = @_;
+    foreach my $rule (@substitutions) {
+        my ($from, $to) = @{$rule};
+        # \Q...\E keeps $from a literal string even if a future rule
+        # contains a regex metacharacter such as '.', '(' or '$'.
+        $phon =~ s/\Q$from\E/$to/g;
+    }
+    return $phon;
+}
+
+# Read tab-separated records from the files named on the command line (or
+# from standard input when none are given), pass the second field of each
+# record through subst(), and print the record with every other field
+# unchanged. Dies on the first line that contains no tab.
+while (my $line = <>) {
+    # chomp removes only a trailing newline; chop would also eat the last
+    # real character of a final line that lacks a newline terminator.
+    chomp $line;
+    if ($line =~ /^([^\t]*)\t([^\t]*)(.*)$/) {
+        # Copy the captures before calling subst(), which runs its own regexes.
+        my ($first, $phon, $rest) = ($1, $2, $3);
+        my $repl = subst($phon);
+        print "$first\t$repl$rest\n";
+    } else {
+        die "Cannot process line: $line\n";
+    }
+}
+