commit 02763d08d110e47567aa3ad5dc84ed4b5cb2fd24
parent 055c35f57ee9789a9e717c29e9286450a0496908
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Sun, 18 Aug 2019 14:41:34 +0200
import experimental things from gitlab.com/a3nm/plint
Diffstat:
7 files changed, 155 insertions(+), 0 deletions(-)
diff --git a/README b/README
@@ -20,3 +20,8 @@ something along the lines of:
poem2html/make_poem.sh poem_file > www/poem.html
done
+- littre: comparison of the number of syllables counted by plint to what is
+ indicated in littre
+
+- lexique_comparison: comparison of the number of syllables counted by plint
+ to lexique
diff --git a/lexique_comparison/count_syllables_lexique.py b/lexique_comparison/count_syllables_lexique.py
@@ -0,0 +1,25 @@
+#!/usr/bin/python3
+
+# count the number of syllables of words according to lexique
+
+import sys
+
+vowels = "ae$E2@#u)9ioO(y"
+consonants = "dpgmtRwszlbkZjknfvSNJx8"
+
+for l in sys.stdin.readlines():
+ f = l.strip().split("\t")
+ nsyl = 0
+ for a in f[1]:
+ if a in vowels:
+ nsyl += 1
+ elif a in consonants:
+ pass
+ else:
+ print("unknown phoneme %s" % a, file=sys.stderr)
+ sys.exit(1)
+ # workaround bug in lexique
+ if f[1].endswith("@") and f[0] != "afin de":
+ nsyl -= 1
+ print("%s\t%d" % (f[0], nsyl))
+
diff --git a/lexique_comparison/count_syllables_plint.py b/lexique_comparison/count_syllables_plint.py
@@ -0,0 +1,31 @@
+#!/usr/bin/python3
+
+import os
+import sys
+
+# modules are in the parent folder
+import plint.pattern
+from plint.rhyme import Rhyme
+from plint.template import Template
+from plint.verse import Verse
+
+sys.path.insert(1, os.path.join(sys.path[0], '..'))
+
+template = Template()
+pattern = plint.pattern.Pattern("12")
+
+for l in sys.stdin.readlines():
+ w = (l.strip().split("\t"))[0]
+ verse = Verse(w, template, pattern)
+ rhyme = Rhyme(verse.normalized,
+ pattern.constraint, template.mergers, template.options)
+ verse.phon = rhyme.phon
+ verse.annotate()
+ mx = 0
+ mn = 0
+ for c in verse.chunks:
+ if 'weights' in c.keys():
+ mn += min(c['weights'])
+ mx += max(c['weights'])
+ print("%s\t%d\t%d" % (w, mn, mx))
+
diff --git a/littre/.gitignore b/littre/.gitignore
@@ -0,0 +1,9 @@
+conflicts
+plint_num
+plint_raw_nums
+prons
+prons_normal
+prons_num
+prons_poesie
+prons_special
+prons_vers
diff --git a/littre/compare_plint.py b/littre/compare_plint.py
@@ -0,0 +1,28 @@
+#!/usr/bin/python3
+
+"""compare file from littre and file from plint for disagreements"""
+
+import sys
+
+plint = open(sys.argv[1])
+littre = open(sys.argv[2])
+
+while True:
+ l_plint = plint.readline()
+ if not l_plint:
+ break
+ l_littre = littre.readline()
+ w_plint, p_plint = l_plint.split('%')
+ w_littre, p_littre = l_littre.split('%')
+ p_littre = int(p_littre)
+ assert(w_plint == w_littre)
+ w = w_plint
+ if '-' in p_plint:
+ lo, hi = p_plint.split('-')
+ lo = int(lo)
+ hi = int(hi)
+ else:
+ lo = int(p_plint)
+ hi = lo
+ if not (lo <= p_littre <= hi):
+ print ("%s : %d vs %d-%d" % (w, p_littre, lo, hi))
diff --git a/littre/littre_syll.sh b/littre/littre_syll.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+
+# extract prononciation from xmllittre
+# https://bitbucket.org/Mytskine/xmlittre-data.git
+
+xmlstarlet sel -t -m "//entree" -v "@terme" -v "\"%\"" \
+ -v "entete/prononciation" -n "$1"/*.xml > prons
+cat prons | grep -E "(syllabes en poésie|en poésie,? de)" > prons_poesie
+cat prons | grep -E "(en vers,? de|syllabes en vers)" > prons_vers
+cat additions_poesie additions_vers prons_poesie prons_vers |
+ awk 'BEGIN {FS = "%";} !a[$1]++;' |
+ while read l; do
+ echo "$l" | cut -d '%' -f 1 | cut -d ' ' -f 1 | tr -d '\n'
+ echo -n '%'
+ echo "$l" | cut -d '%' -f 2- | tr ' ' '\n' |
+ sed '
+ s/^une$/1/;
+ s/^deux$/2/;
+ s/^trois$/3/;
+ s/^quatre$/4/;
+ s/^cinq$/5/;
+ s/^cinç$/5/;
+ s/^six$/6/;
+ s/^sept$/7/;
+ s/^disylla.*$/2/;
+ s/^trisylla.*$/3/;
+ ' | grep '[0-9]' | head -1
+ done > prons_special
+
+pv prons |
+ grep -v '%$' |
+ grep -v ' .*%' |
+ awk 'BEGIN {FS = "%";} !a[$1]++;' |
+ while read l; do
+ echo "$l" | cut -d '%' -f 1 | cut -d ' ' -f 1 | tr -d '\n'
+ echo -n '%'
+ echo "$l" | cut -d '%' -f 2- | sed 's/ *- */-/g' | cut -d ' ' -f 1 | tr -d ',' |
+ sed "s/-[^aâàeéêèiîoôuùûäëïöü-]*'//" | tr '-' '\n' | wc -l
+ done > prons_normal
+
+pv prons_special prons_normal |
+ awk 'BEGIN {FS = "%";} !a[$1]++;' |
+ tr -d ',' | sort | grep -v '^%' | sed 's/.*/\L&/' > prons_num
+
+pv prons_num | cut -d '%' -f1 |
+ ../plint.py raw.tpl 2>&1 |
+ grep 'total:' | cut -d ':' -f4 |
+ cut -d ')' -f1 > plint_raw_nums
+
+paste <(cat prons_num| cut -d'%' -f1) plint_raw_nums |
+ tr '\t' '%' | sed 's/ *% */%/' \
+ > plint_num
+
+./compare_plint.py plint_num prons_num > conflicts
+
diff --git a/littre/raw.tpl b/littre/raw.tpl
@@ -0,0 +1,2 @@
+!
+12 A X