commit 73f0f51928dfbccd6857ca0fa2d82c05decd6264
parent 63838ada2f96be79473763307350400fe455ab4c
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Wed, 26 Aug 2015 23:00:10 +0200
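lexique actually provides all that we need: only3.py now splits its '-'-separated chunks and script.sh filters on the value 3, instead of only3.py counting vowel sounds itself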
Diffstat:
3 files changed, 13 insertions(+), 14 deletions(-)
diff --git a/common.py b/common.py
@@ -2,7 +2,7 @@
import unicodedata
-vowels = 'io9@2EeaOy#$u()'
+vowels = 'io9@2EeaOy#$u()§°51'
vowels_script = "aeiouy"
semivowels = 'j8w'
diff --git a/only3.py b/only3.py
@@ -7,22 +7,20 @@ import sys
for l in sys.stdin.readlines():
f = l.split('\t')
vowels_regexp = re.compile('([' + vowels + '])')
- f[1] = f[1].strip()
- parse = re.split(vowels_regexp, f[1])
- parse = [x for x in parse if len(x) > 0]
- # count number of vowel sounds
- s = sum([1 for x in parse if x[0] in vowels])
- if s != 3:
- continue
+ f[-1] = f[-1].strip()
+ if len(f) <= 3:
+ f.append("")
+ chunks = f[1].split('-')
# words cannot start with a vowel
# as last consonant before last vowel will be kept
- if parse[0][0] in vowels:
+ if chunks[0][0] in vowels:
continue
# sound preceding the last vowel sound must be a consonant
- if parse[-1][0] in vowels and parse[-2][0] in vowels:
+ if chunks[-1][-1] in vowels and (len(chunks[-1]) == 1 or chunks[-1][-2] in
+ vowels):
continue
- if (parse[-1][0] not in vowels and parse[-2][0] in vowels
- and parse[-3][0] in vowels):
+ if (chunks[-1][-1] not in vowels and chunks[-1][-2] in vowels
+ and (len(chunks[-1]) == 2 or chunks[-1][-3] in vowels)):
continue
- print("%s\t%s" % (f[0], ' '.join(parse)))
+ print('\t'.join(f))
diff --git a/script.sh b/script.sh
@@ -5,7 +5,8 @@
#cat Lexique371/Bases+Scripts/Lexique3.txt |
# ./lexique_fix.sh| cut -f1,2,4 | grep NOM | uniq > lexique.txt
-pv lexique.txt| ./only3.py > lexique3
+cat ~/documents/lexique/lexique | cut -f1,2,4,23,24,28 | grep NOM | grep '\s3\s' | cut -f1,4,6 | rev | uniq -f 2 | rev > lexique_full
+pv lexique_full | ./only3.py > lexique3
cat lexique3 | ./graph.py > graph
cat graph | ./cycle.py