commit c869dc9f1cde9aaa8663ecb75a5ba50160bbe7c5
parent 53fa7d503793a57c6fbf44ce9febeacc0f85e3f5
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Sun, 18 Aug 2019 10:34:04 +0200
mv occurrence preparation script to own subfolder
Diffstat:
10 files changed, 67 insertions(+), 65 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,6 @@
__pycache__/*
.idea
-Lexique382.tsv
+Lexique*.tsv
frhyme
frhyme/*
haspirater
@@ -15,6 +15,7 @@ messages.pot
plint/res/*.mo
diaeresis?.json
Lexique*.zip
+Lexique*.zip.*
haspirater
*.pyo
poem
diff --git a/README b/README
@@ -131,7 +131,7 @@ This is how the file data/occurrences is generated
Run:
- ./lexique_occurrences_retrieve.sh > data/occurrences
+ ./prepare_occurrences/lexique_occurrences_retrieve.sh > data/occurrences
== 6. Updating the localization ==
diff --git a/additions_occurrences b/additions_occurrences
@@ -1 +0,0 @@
-chose
diff --git a/lexique_fix.sh b/lexique_fix.sh
@@ -1,8 +0,0 @@
-#!/bin/bash
-
-# General fixes for lexique
-
-cd "$( dirname "$0" )"
-
-sed 1d | ./subst.pl
-
diff --git a/lexique_occurrences_retrieve.sh b/lexique_occurrences_retrieve.sh
@@ -1,16 +0,0 @@
-#!/bin/bash
-
-ZIP="Lexique383.zip"
-URL="http://www.lexique.org/databases/Lexique383/$ZIP"
-FILE="Lexique383.tsv"
-
-cd "$( dirname "$0" )"
-
-wget $URL
-unzip -qq $ZIP $FILE
-cat $FILE | ./lexique_fix.sh | cut -f1 |
- rev | cut -d' ' -f1 | rev |
- cat - additions_occurrences |
- sort | uniq -c |
- awk '{print $2, $1}'
-
diff --git a/prepare_occurrences/additions_occurrences b/prepare_occurrences/additions_occurrences
@@ -0,0 +1 @@
+chose
diff --git a/prepare_occurrences/lexique_fix.sh b/prepare_occurrences/lexique_fix.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+# General fixes for lexique: reads the Lexique TSV on stdin, drops its first
+# line with sed, and pipes the remaining lines through subst.pl to stdout.
+
+# subst.pl is looked up relative to this script, so a failed cd must abort.
+cd "$( dirname "$0" )" || exit 1
+sed 1d | ./subst.pl
+
diff --git a/prepare_occurrences/lexique_occurrences_retrieve.sh b/prepare_occurrences/lexique_occurrences_retrieve.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+# Fetch Lexique 3.83 and print one "word count" line per distinct word on stdout.
+set -o pipefail # a failing stage (e.g. subst.pl dying) must fail the script
+ZIP="Lexique383.zip"
+URL="http://www.lexique.org/databases/Lexique383/$ZIP"
+FILE="Lexique383.tsv"
+DIR="$( cd "$( dirname "$0" )" && pwd )" && cd "$DIR" || exit 1 # relative paths below need the script's directory
+# -N and -o reuse/overwrite files from an earlier run: no "$ZIP.1", no unzip prompt.
+wget -N "$URL" || exit 1
+unzip -qq -o "$ZIP" "$FILE" || exit 1
+# Keep column 1, reduce it to its last space-separated token, add the extras, tally.
+./lexique_fix.sh < "$FILE" | cut -f1 |
+ rev | cut -d' ' -f1 | rev |
+ cat - additions_occurrences |
+ sort | uniq -c |
+ awk '{print $2, $1}'
+
diff --git a/prepare_occurrences/subst.pl b/prepare_occurrences/subst.pl
@@ -0,0 +1,38 @@
+#!/usr/bin/perl
+
+# This script converts Lexique's pronunciation field from the home-grown
+# notation described in
+# http://www.lexique.org/outils/Manuel_Lexique.htm#_Toc108519023 to a
+# variation of the X-SAMPA standard.
+
+
+# Rewrite one Lexique phonetic string into the X-SAMPA variant and return it.
+sub subst {
+  my $phon = shift; # named $phon, not $a: $a is the sort-comparison variable
+  # [from, to] pairs, applied in order. Order matters: '@' must become '#'
+  # before '°' and '3' produce '@', and 'N' -> 'J' must run before 'G' -> 'N'.
+  my @s = (
+    ['§', '$'],
+    ['@', '#'],
+    ['1', '('],
+    ['5', ')'],
+    ['°', '@'],
+    ['3', '@'],
+    ['H', '8'],
+    ['N', 'J'],
+    ['G', 'N'],
+  );
+  $phon =~ s/\Q$_->[0]\E/$_->[1]/g for @s; # \Q..\E: keys match literally
+  return $phon;
+}
+
+# Rewrite the second tab-separated field of each input line with subst().
+while (<>) {
+  chomp; # chop would eat a data character when the last line has no newline
+  # $1 = first field, $2 = second field, $3 = rest of the line (leading tab kept)
+  /^([^\t]*)\t([^\t]*)(.*)$/ or die "Cannot process line: $_\n";
+  my ($head, $phon, $rest) = ($1, $2, $3); # copy before subst() runs more regexes
+  my $repl = subst($phon);
+  print "$head\t$repl$rest\n";
+}
+
diff --git a/subst.pl b/subst.pl
@@ -1,38 +0,0 @@
-#!/usr/bin/perl
-
-# This file fixes Lexique's pronunciation info from the home-grown
-# format described in
-# http://www.lexique.org/outils/Manuel_Lexique.htm#_Toc108519023 to a
-# variation of the X-SAMPA standard
-
-
-sub subst {
- my $a = shift;
- # substitutions to apply
- my @s = (
- ['§', '$'],
- ['@', '#'],
- ['1', '('],
- ['5', ')'],
- ['°', '@'],
- ['3', '@'],
- ['H', '8'],
- ['N', 'J'],
- ['G', 'N'],
- );
- foreach my $t (@s) {
- $a =~ s/${$t}[0]/${$t}[1]/g
- }
- return $a;
-}
-
-while (<>) {
- chop;
- if (/^([^\t]*)\t([^\t]*)(.*)$/) {
- my $repl = subst $2;
- print "$1\t$repl$3\n";
- } else {
- die "Cannot process line: $_\n";
- }
-}
-