commit bd5d041c02f10ff5dad7ae374c8de0c492aa13e9
parent 7b0d4c908e7f20bc44c07a58fc41eb6dc88d66b5
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Sat, 17 Aug 2019 18:25:14 +0200
add the lexique tweaking scripts to this repo
Diffstat:
3 files changed, 52 insertions(+), 3 deletions(-)
diff --git a/README b/README
@@ -9,9 +9,8 @@ file fixes a few errors that I have found in Lexique (and reported to the
 authors).
 
 Last, it contains a modification of the file "lexique" to only use ASCII
-characters for field 2 in "lexique_my_format" (other fields are untouched: the
-precise script used is
-<https://gitlab.com/a3nm/frhyme/blob/master/scripts/lexique/lexique_fix.sh>).
+characters for field 2 in "lexique_my_format" (other fields are untouched): the
+precise script is in scripts/lexique_fix.sh.
 
 The license of all these files is the same as that of Lexique, namely, the
 license CC BY SA 4.0 (according to the file README-Lexique.txt in the
diff --git a/scripts/lexique_fix.sh b/scripts/lexique_fix.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# General fixes for lexique
+# Author: Antoine Amarilli
+# Public domain
+
+cd "$( dirname "$0" )"
+
+sed 1d | ./subst.pl
+
diff --git a/scripts/subst.pl b/scripts/subst.pl
@@ -0,0 +1,40 @@
+#!/usr/bin/perl
+
+# This script converts Lexique's pronunciation info from the home-grown
+# format described in
+# http://www.lexique.org/outils/Manuel_Lexique.htm#_Toc108519023 to a
+# variation of the X-SAMPA standard.
+# Author: Antoine Amarilli
+# Public domain
+
+
+# Rewrite one Lexique phonetic string into the X-SAMPA variant.
+# Takes the string as sole argument and returns a converted copy; the
+# pairs of the table below are applied one after the other, in order.
+# There is no "use utf8" here, so '§' and '°' are matched as raw bytes.
+sub subst {
+  my ($phon) = @_;
+  # [from, to] pairs. Order matters: '@' and 'N' must be rewritten before
+  # the later rules for '°', '3' and 'G' introduce new '@' and 'N'.
+  my @rules = (
+    ['§', '$'], ['@', '#'], ['1', '('], ['5', ')'], ['°', '@'],
+    ['3', '@'], ['H', '8'], ['N', 'J'], ['G', 'N'],
+  );
+  foreach my $rule (@rules) {
+    my ($from, $to) = @{$rule};
+    # \Q...\E: match the left-hand side literally, never as a regex
+    $phon =~ s/\Q$from\E/$to/g;
+  }
+  return $phon;
+}
+
+# Filter: rewrite the second tab-separated field of every input line with
+# subst(); field 1 and whatever follows field 2 are copied through as is.
+# Dies on a line that contains no tab.
+while (my $line = <>) {
+  chomp $line;  # not chop: an unterminated last line must keep its last char
+  my ($word, $phon, $rest) = $line =~ /^([^\t]*)\t([^\t]*)(.*)$/
+    or die "Cannot process line: $line\n";
+  print $word . "\t" . subst($phon) . $rest . "\n";
+}
+