commit 32529efe519512a2872a244208ba1b6c0b68a48e
parent 89b86df701323f63136bb6a035f424856bee393a
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Tue, 4 Aug 2015 18:21:04 +0200
continue
Diffstat:
common.py | | | 24 | +++++++++++++++++++++++- |
cycle.py | | | 93 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
graph.py | | | 111 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------ |
only3.py | | | 13 | +++++++++++-- |
script.sh | | | 5 | ++++- |
5 files changed, 225 insertions(+), 21 deletions(-)
diff --git a/common.py b/common.py
@@ -1,4 +1,26 @@
#!/usr/bin/python3
-vowels = 'io92EeaOy#$u()'
+import unicodedata
+# Characters counted as vowels when only3.py and graph.py inspect the first
+# character of a chunk (presumably phonetic-transcription symbols -- confirm).
+vowels = 'io9@2EeaOy#$u()'
+# Plain written vowels; cycle.py compares accent-stripped letters to these.
+vowels_script = "aeiouy"
+# Characters treated as semivowels by graph.py and cycle.py.
+semivowels = 'j8w'
+
+def is_pref(u, v):
+ """Return True if u is a prefix of v, False otherwise.
+
+ u and v are sequences (graph.py passes strings).  The empty sequence is
+ a prefix of everything and every sequence is a prefix of itself.
+ """
+ # Slicing never raises: when v is shorter than u the slice is too short
+ # to compare equal, which covers the explicit length guard.
+ return v[:len(u)] == u
+
+def fem(w):
+ """Tell whether the written word w ends in 'e' or 'es'.
+
+ graph.py and cycle.py use this flag to split words into two classes.
+ """
+ return w.endswith(('e', 'es'))
+
+
+# http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
+def rmacc(s):
+ """Strip accents from the string s and return the result.
+
+ s is decomposed with Unicode NFD normalization and every combining
+ mark is dropped, so 'é' becomes 'e'.  Characters that have no canonical
+ decomposition are returned unchanged.  A string made only of combining
+ marks yields the empty string.
+ """
+ decomposed = unicodedata.normalize('NFD', s)
+ return ''.join(c for c in decomposed if not unicodedata.combining(c))
diff --git a/cycle.py b/cycle.py
@@ -0,0 +1,93 @@
+#!/usr/bin/python3
+
+import sys
+from common import fem, vowels_script, rmacc, semivowels
+
+# g[f][t] is the set of words read from stdin for the edge f -> t.
+g = {}
+# Node where dfs starts and where a path must come back to.
+START = "sEl"
+# Length of the longest path printed so far by dfs.
+bestlen = 0
+# Nodes currently being expanded by dfs (added on entry, removed on exit).
+visited = set()
+# fcache[f][t][b] is True when some word w on edge f -> t has fem(w) == b.
+fcache = {}
+
+def print_word(w, prn):
+ """Print one comma-separated line for the word w.
+
+ The line is "w, w, w, s, s" where s is an ending of w computed by
+ walking w from its last letter backwards.  prn is the node label given
+ by print_list; only its first character is looked at, to test whether
+ it is one of the semivowels.
+ """
+ # s collects the letters of w back to front; it is reversed at the end.
+ s = ""
+ # Becomes True once a letter has been accepted as a vowel.
+ vowels = False
+ for i in range(len(w)):
+ p = -(i+1)
+ s += w[p]
+ # Compare on the first character of the accent-processed letter.
+ x = rmacc(w[p])[0]
+ if not vowels:
+ # A vowel letter is not accepted while still inside a final
+ # 'e', 'es', 'gue', 'que', 'gues' or 'ques'.
+ if x in vowels_script and (i > 0 or w[-1] != 'e') and (i > 1 or w[-1] !=
+ 's' or w[-2] != 'e') and (i > 2 or (not w.endswith('gue'))
+ and
+ not (w.endswith('que'))) and (i > 3 or (not
+ w.endswith('gues') and (not w.endswith('ques')))):
+ vowels = True
+ else:
+ # After the vowel: a letter that is neither a vowel nor '-'.
+ if x not in vowels_script and x != '-':
+ # Drop the letter just appended when prn starts with a semivowel.
+ if prn[0] in semivowels:
+ s = s[:-1]
+ break
+ # exception: on an 'h', also take the letter that precedes it in w
+ if w[p] == 'h':
+ s += w[-(i+2)]
+ break
+ # Put the collected letters back in reading order.
+ s = s[::-1]
+ print ("%s, %s, %s, %s, %s" % (w, w, w, s, s))
+
+def print_list(l):
+ """Print one word per edge along the node list l, between two rulers.
+
+ For each consecutive pair of nodes a word is taken from g and printed
+ with print_word.  The first word must have fem(w) False; each later
+ word must have a fem() value different from the previous word's.
+ """
+ # Extend a copy of the list with its second node, so one more edge
+ # (from the last node to l[1]) is printed after the path itself.
+ l = l + [l[1]]
+ print("------------------")
+ # Previously chosen word; empty until the first word is chosen.
+ last = ""
+ for i in range(len(l)-1):
+ ok = False
+ for w in g[l[i]][l[i+1]]:
+ #print(w, fem(w), last)
+ if (len(last) == 0 and fem(w) == False) or (len(last) > 0 and fem(last) != fem(w)):
+ last = w
+ print_word(last, l[i+1])
+ #print(fem(last))
+ ok = True
+ break
+ # No word on this edge satisfied the alternation rule.
+ if not ok:
+ print("ERRROR %s %s" % (l[i], l[i+1]))
+ return
+ print("------------------")
+ print("")
+
+def dfs(l):
+ """Depth-first search over g, extending the node path l.
+
+ l is the list of nodes visited so far, ending with the current node.
+ A path that comes back to START with an odd length greater than 1 is
+ printed with print_list when it is longer than bestlen.
+ """
+ global g
+ global bestlen
+ global visited
+ #print(l)
+ # Back at START after an even number of edges.
+ if (l[-1] == START and len(l) > 1 and len(l) % 2 == 1):
+ if len(l) > bestlen:
+ print_list(l)
+ bestlen = len(l)
+ return
+ # Do not expand a node that is already being expanded.
+ if l[-1] in visited:
+ return
+ v = l[-1]
+ visited.add(v)
+ # Sorted so that successors are tried in a fixed order.
+ for t in sorted(list(g[v].keys())):
+ #if (len(l) % 2 != 0) in [fem(w) for w in g[v][t]]:
+ # Follow v -> t only if it carries a word whose fem() value equals
+ # (len(l) % 2 == 0), i.e. the value wanted at this depth.
+ if fcache[v][t][len(l) % 2 == 0]:
+ dfs(l+[t])
+ visited.remove(v)
+
+
+# Build g and fcache from stdin: each line is "from to word", space-separated.
+for l in sys.stdin.readlines():
+ l = l.strip().split(' ')
+ if l[0] not in g.keys():
+ g[l[0]] = {}
+ fcache[l[0]] = {}
+ if l[1] not in g[l[0]].keys():
+ g[l[0]][l[1]] = set()
+ fcache[l[0]][l[1]] = {True: False, False: False}
+ # Fields after the second are glued back together without a separator.
+ w = ''.join(l[2:])
+ g[l[0]][l[1]].add(w)
+ # Remember that this edge has a word with this fem() value.
+ fcache[l[0]][l[1]][fem(w)] = True
+
+f = START
+
+# Start the search from the single-node path [START].
+dfs([f])
diff --git a/graph.py b/graph.py
@@ -1,33 +1,110 @@
#!/usr/bin/python3
import sys
-from common import vowels
+from common import vowels, is_pref, semivowels, fem
g = {}
+def nedd(myg):
+ """Return the total number of words stored in the graph myg.
+
+ myg maps a source node to a dict that maps a target node to a
+ collection of words; the sizes of all those collections are summed.
+ """
+ return sum(len(words) for targets in myg.values() for words in targets.values())
+
+def tred(myg):
+ """Return a pruned copy of the graph myg (myg itself is not modified).
+
+ myg maps source -> target -> set of words.  A source node is dropped
+ when the checks below fail, and an edge is dropped when its target is
+ not itself a source node of myg.
+ """
+ g2 = {}
+ # Nodes that are the target of at least one word with fem() False / True.
+ tos_m = set()
+ tos_f = set()
+ for f in myg.keys():
+ for t in myg[f].keys():
+ for w in myg[f][t]:
+ if fem(w):
+ tos_f.add(t)
+ else:
+ tos_m.add(t)
+ for f in myg.keys():
+ # Skip a source that no word leads to.
+ if f not in tos_f and f not in tos_m:
+ continue
+ # Does f have an outgoing word with fem() True / False?
+ ok_f = False
+ ok_m = False
+ for t in myg[f].keys():
+ for w in myg[f][t]:
+ if fem(w):
+ ok_f = True
+ else:
+ ok_m = True
+ # Skip f when one fem() class is absent both among its outgoing words
+ # and among the words leading to it.
+ if (not ok_f) and f not in tos_f:
+ continue
+ if (not ok_m) and f not in tos_m:
+ continue
+ for t in myg[f].keys():
+ # Skip edges whose target has no outgoing edges in myg.
+ if t not in myg.keys():
+ continue
+ for w in myg[f][t]:
+ if f not in g2.keys():
+ g2[f] = {}
+ if t not in g2[f].keys():
+ g2[f][t] = set()
+ g2[f][t].add(w)
+ return g2
+
for l in sys.stdin.readlines():
l = l.strip()
w, p = l.split('\t')
p = p.split(' ')
- if p[0][0] in vowels:
- continue
- #f = p[0] + ('' if p[0][0] in vowels else p[1])
for i in range(len(p[2])+1):
f = p[0] + p[1] + p[2][:i]
- t = (p[-2][-1] if p[-1][0] in vowels else (p[-3][-1] + p[-2])) + p[-1]
+ t = ((p[-2][-2:] if p[-2][-1] in semivowels else p[-2][-1]) if p[-1][0] in vowels else ((p[-3][-2:] if p[-3][-1] in
+ semivowels else p[-3][-1]) + p[-2])) + p[-1]
# print ("%s : %s -> %s" % (w, f, t))
if f not in g.keys():
- g[f] = set()
- g[f].add((t, w))
-
-for j in range(100):
- print("-----------------")
- f = list(g.keys())[j]
- print(f)
- for i in range(100):
- t = list(g[f])[0]
- if t[0] not in g.keys():
+ g[f] = {}
+ if t not in g[f].keys():
+ g[f][t] = set()
+ # Among words with the same fem() value on edge f -> t, keep only those
+ # that do not extend another stored word.
+ addit = True
+ # Iterate over a snapshot: removing from a set while iterating over it
+ # raises RuntimeError on the next step of the loop.
+ for ws in list(g[f][t]):
+ # if a stored word is a prefix of w, do not add w
+ v1 = fem(ws)
+ v2 = fem(w)
+ if is_pref(ws, w) and v1 == v2:
+ addit = False
+ break
+ # if w is a prefix of a stored word, remove that stored word
+ if is_pref(w, ws) and v1 == v2:
+ g[f][t].remove(ws)
+ if not addit:
continue
- print("%s -[%s]-> %s" % (f, t[1], t[0]))
- f = t[0]
+ g[f][t].add(w)
+
+# Word count of the full graph, reported on stderr.
+ned = nedd(g)
+print(ned, file=sys.stderr)
+
+# Apply tred repeatedly until the word count stops changing.
+while True:
+ g2 = tred(g)
+ ned2 = nedd(g2)
+ if ned2 == ned:
+ break
+ g = g2
+ ned = ned2
+ print(ned, file=sys.stderr)
+
+
+
+# Write the remaining graph to stdout, one "from to word" line per word.
+for f in g.keys():
+ for t in g[f].keys():
+ for w in g[f][t]:
+ print ("%s %s %s" % (f, t, w))
+
+#
+#for j in range(100):
+# print("-----------------")
+# f = list(g.keys())[j]
+# print(f)
+# for i in range(100):
+# t = list(g[f])[0]
+# if t[0] not in g.keys():
+# continue
+# print("%s -[%s]-> %s" % (f, t[1], t[0]))
+# f = t[0]
diff --git a/only3.py b/only3.py
@@ -11,5 +11,14 @@ for l in sys.stdin.readlines():
parse = re.split(vowels_regexp, f[1])
parse = [x for x in parse if len(x) > 0]
s = sum([1 for x in parse if x[0] in vowels])
- if s == 3:
- print("%s\t%s" % (f[0], ' '.join(parse)))
+ if s != 3:
+ continue
+ if parse[0][0] in vowels:
+ continue
+ if parse[-1][0] in vowels and parse[-2][0] in vowels:
+ continue
+ if (parse[-1][0] not in vowels and parse[-2][0] in vowels
+ and parse[-3][0] in vowels):
+ continue
+ print("%s\t%s" % (f[0], ' '.join(parse)))
+
diff --git a/script.sh b/script.sh
@@ -1,5 +1,8 @@
#!/bin/bash
+#cat Lexique371/Bases+Scripts/Lexique3.txt |
+# ./lexique_fix.sh| cut -f1,2,4 | uniq > lexique.txt
+# only3.py filters lexique.txt into lexique3.
pv lexique.txt| ./only3.py > lexique3
-cat lexique3 | ./graph.py
+# graph.py writes its edge list to the file 'graph', which cycle.py then reads.
+cat lexique3 | ./graph.py > graph
+cat graph | ./cycle.py