commit 320f950c9cc6d5fea94b1538bbf0d10b51ff6a79
parent 73f0f51928dfbccd6857ca0fa2d82c05decd6264
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Thu, 27 Aug 2015 00:41:23 +0200
better
Diffstat:
common.py | 11 ++---------
cycle.py | 61 ++++++++++++++++++++++++++-----------------------------------
graph.py | 97 -------------------------------------------------------------------------------
script.sh | 2 +-
4 files changed, 29 insertions(+), 142 deletions(-)
diff --git a/common.py b/common.py
@@ -2,19 +2,12 @@
import unicodedata
+# starting symbol, must be part of cycle
+start = (False, 'm@')
vowels = 'io9@2EeaOy#$u()§°51'
vowels_script = "aeiouy"
semivowels = 'j8w'
-# TODO replace by startswith
-def is_pref(u, v):
- if len(v) < len(u):
- return False
- for i in range(len(u)):
- if v[i] != u[i]:
- return False
- return True
-
def fem(w):
return ((w.endswith('e') and not w[-2] in "ui")
or (w.endswith('es') and not w[-3] in "ui"))
diff --git a/cycle.py b/cycle.py
@@ -1,12 +1,11 @@
-m!/usr/bin/python3
+#!/usr/bin/python3
import sys
-from common import fem, vowels_script, strip_accents, semivowels
+from common import fem, vowels_script, strip_accents, semivowels, start
+from collections import defaultdict
-# TODO use defaultdict
+g = defaultdict(lambda: defaultdict(lambda: set()))
-g = {}
-START = "sEl"
bestlen = 0
visited = set()
fcache = {}
@@ -35,25 +34,21 @@ def print_word(w, prn):
s += w[-(i+2)]
break
s = s[::-1]
- print ("%s, %s, %s, %s, %s" % (w, w, w, s, s))
+ print ("## %s, %s, %s, %s, %s" % (w, w, w, s, s))
def print_list(l):
l = l + [l[1]]
print("------------------")
- last = ""
for i in range(len(l)-1):
- ok = False
- for w in g[l[i]][l[i+1]]:
- #print(w, fem(w), last)
- if (len(last) == 0 and fem(w) == False) or (len(last) > 0 and fem(last) != fem(w)):
- last = w
- print_word(last, l[i+1])
- #print(fem(last))
- ok = True
- break
- if not ok:
- print("ERRROR %s %s" % (l[i], l[i+1]))
- return
+ w = list(g[l[i]][l[i+1]])[0]
+ #print(w[0])
+ fs = w[1].split('-')
+ if len(w[1]) == 0 or len(fs) < 3:
+ print_word(w[0], l[i+1][1])
+ else:
+ end = ''.join(fs[2:])
+ print (" %s, %s, %s, %s, %s"
+ % (w[0], w[0], w[0], end, end))
print("------------------")
print("")
@@ -61,10 +56,11 @@ def print_list(l):
def dfs(l):
global g
+ global start
global bestlen
global visited
#print(l)
- if (l[-1] == START and len(l) > 1 and len(l) % 2 == 1):
+ if l[-1] == start and len(l) > 1:
if len(l) > bestlen:
print_list(l)
bestlen = len(l)
@@ -73,25 +69,20 @@ def dfs(l):
return
v = l[-1]
visited.add(v)
- for t in sorted(list(g[v].keys())):
+ for t in g[v].keys():
#if (len(l) % 2 != 0) in [fem(w) for w in g[v][t]]:
- if fcache[v][t][len(l) % 2 == 0]:
- dfs(l+[t])
+ #if fcache[v][t][len(l) % 2 == 0]:
+ dfs(l+[t])
visited.remove(v)
for l in sys.stdin.readlines():
- l = l.strip().split(' ')
- if l[0] not in g.keys():
- g[l[0]] = {}
- fcache[l[0]] = {}
- if l[1] not in g[l[0]].keys():
- g[l[0]][l[1]] = set()
- fcache[l[0]][l[1]] = {True: False, False: False}
- w = ''.join(l[2:])
- g[l[0]][l[1]].add(w)
- fcache[l[0]][l[1]][fem(w)] = True
+ l = l.strip().split('\t')
+ if len(l) < 6:
+ l.append('')
+ fr = (l[0] == 'f', l[1])
+ to = (l[2] == 'f', l[3])
+ g[fr][to].add((l[4], l[5]))
-f = START
+dfs([start])
-dfs([f])
diff --git a/graph.py b/graph.py
@@ -1,97 +0,0 @@
-#!/usr/bin/python3
-
-import sys
-from common import vowels, is_pref, semivowels, fem
-from collections import defaultdict
-
-g = defaultdict(lambda: defaultdict(lambda: set()))
-
-def nedd(myg):
- n = 0
- for f in myg.keys():
- for t in myg[f].keys():
- n += len(myg[f][t])
- return n
-
-def tred(myg):
- g2 = defaultdict(lambda: defaultdict(lambda: set()))
- tos_m = set()
- tos_f = set()
- for f in myg.keys():
- for t in myg[f].keys():
- for w in myg[f][t]:
- if fem(w):
- tos_f.add(t)
- else:
- tos_m.add(t)
- for f in myg.keys():
- if f not in tos_f and f not in tos_m:
- continue
- ok_f = False
- ok_m = False
- for t in myg[f].keys():
- for w in myg[f][t]:
- if fem(w):
- ok_f = True
- else:
- ok_m = True
- if (not ok_f) and f not in tos_f:
- continue
- if (not ok_m) and f not in tos_m:
- continue
- for t in myg[f].keys():
- if t not in myg.keys():
- continue
- for w in myg[f][t]:
- g2[f][t].add(w)
- return g2
-
-for l in sys.stdin.readlines():
- l = l.strip()
- w, p = l.split('\t')
- p = p.split(' ')
- for i in range(len(p[2])+1):
- f = p[0] + p[1] + p[2][:i]
- # TODO: this cut is non-optimal: should be "radio/gramme", "géo/graphe",
- # but how to tell?
- t = ((p[-2][-2:] if p[-2][-1] in semivowels else p[-2][-1]) if p[-1][0] in vowels else ((p[-3][-2:] if p[-3][-1] in
- semivowels else p[-3][-1]) + p[-2])) + p[-1]
- # print ("%s : %s -> %s" % (w, f, t))
- addit = True
- for ws in g[f][t]:
- # if one is prefix of us, give up
- v1 = fem(ws)
- v2 = fem(w)
- if is_pref(ws, w) and v1 == v2:
- addit = False
- break
- # if we are pref of it, remove it
- if is_pref(w, ws) and v1 == v2:
- g[f][t].remove(ws)
- if not addit:
- continue
- g[f][t].add(w)
-
-
-# TODO: replace this by a true SCC
-
-ned = nedd(g)
-print(ned, file=sys.stderr)
-
-while True:
- g2 = tred(g)
- ned2 = nedd(g2)
- if ned2 == ned:
- break
- g = g2
- ned = ned2
- print(ned, file=sys.stderr)
-
-
-
-for f in g.keys():
- for t in g[f].keys():
- for w in g[f][t]:
- print ("%s %s %s" % (f, t, w))
-
-
diff --git a/script.sh b/script.sh
@@ -7,7 +7,7 @@
# ./lexique_fix.sh| cut -f1,2,4 | grep NOM | uniq > lexique.txt
cat ~/documents/lexique/lexique | cut -f1,2,4,23,24,28 | grep NOM | grep '\s3\s' | cut -f1,4,6 | rev | uniq -f 2 | rev > lexique_full
pv lexique_full | ./only3.py > lexique3
-cat lexique3 | ./graph.py > graph
+cat lexique3 | ./graph2.py > graph
cat graph | ./cycle.py
# cat <(echo "digraph G {") <(sed 's/\([^ ]*\) \([^ ]*\) \(.*\)/\1 -> \2