continue - songleash - generate chansons en laisse

commit 32529efe519512a2872a244208ba1b6c0b68a48e
parent 89b86df701323f63136bb6a035f424856bee393a
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Tue,  4 Aug 2015 18:21:04 +0200

continue

Diffstat:
common.py  | 24 +++++++++++++++++++++++-
cycle.py  | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
graph.py  | 111 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
only3.py  | 13 +++++++++++--
script.sh  | 5 ++++-

5 files changed, 225 insertions(+), 21 deletions(-)
diff --git a/common.py b/common.py
@@ -1,4 +1,26 @@
 #!/usr/bin/python3
 
-vowels = 'io92EeaOy#$u()'
+import unicodedata
 
+vowels = 'io9@2EeaOy#$u()'
+vowels_script = "aeiouy"
+semivowels = 'j8w'
+
+def is_pref(u, v):
+    if len(v) < len(u):
+        return False
+    for i in range(len(u)):
+        if v[i] != u[i]:
+            return False
+    return True
+
+def fem(w):
+    return w.endswith('e') or w.endswith('es')
+
+
+# http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
+def rmacc(s):
+  """Decompose each character of a string into Unicode NFD form
+  
+  Accents become separate combining marks; they are not removed here"""
+  return ''.join([unicodedata.normalize('NFD', x) for x in s])
diff --git a/cycle.py b/cycle.py
@@ -0,0 +1,93 @@
+#!/usr/bin/python3
+
+import sys
+from common import fem, vowels_script, rmacc, semivowels
+
+g = {}
+START = "sEl"
+bestlen = 0
+visited = set()
+fcache = {}
+
+def print_word(w, prn):
+    s = ""
+    vowels = False
+    for i in range(len(w)):
+        p = -(i+1)
+        s += w[p]
+        x = rmacc(w[p])[0]
+        if not vowels:
+            if x in vowels_script and (i > 0 or w[-1] != 'e') and (i > 1 or w[-1] !=
+                    's' or w[-2] != 'e') and (i > 2 or (not w.endswith('gue'))
+                            and
+                        not (w.endswith('que'))) and (i > 3 or (not
+                            w.endswith('gues') and (not w.endswith('ques')))):
+                vowels = True
+        else:
+            if x not in vowels_script and x != '-':
+                if prn[0] in semivowels:
+                    s = s[:-1]
+                    break
+                # exception: when stopping on an 'h', also keep the letter before it
+                if w[p] == 'h':
+                    s += w[-(i+2)]
+                break
+    s =  s[::-1]
+    print ("%s, %s, %s, %s, %s" % (w, w, w, s, s))
+
+def print_list(l):
+    l = l + [l[1]]
+    print("------------------")
+    last = ""
+    for i in range(len(l)-1):
+        ok = False
+        for w in g[l[i]][l[i+1]]:
+            #print(w, fem(w), last)
+            if (len(last) == 0 and fem(w) == False) or (len(last) > 0 and fem(last) != fem(w)):
+                last = w
+                print_word(last, l[i+1])
+                #print(fem(last))
+                ok = True
+                break
+        if not ok:
+            print("ERRROR %s %s" % (l[i], l[i+1]))
+            return
+    print("------------------")
+    print("")
+
+def dfs(l):
+    global g
+    global bestlen
+    global visited
+    #print(l)
+    if (l[-1] == START and len(l) > 1 and len(l) % 2 == 1):
+        if len(l) > bestlen:
+            print_list(l)
+            bestlen = len(l)
+            return
+    if l[-1] in visited:
+        return
+    v = l[-1]
+    visited.add(v)
+    for t in sorted(list(g[v].keys())):
+        #if (len(l) % 2 != 0) in [fem(w) for w in g[v][t]]:
+        if fcache[v][t][len(l) % 2 == 0]:
+            dfs(l+[t])
+    visited.remove(v)
+
+
+for l in sys.stdin.readlines():
+    l = l.strip().split(' ')
+    if l[0] not in g.keys():
+        g[l[0]] = {}
+        fcache[l[0]] = {}
+    if l[1] not in g[l[0]].keys():
+        g[l[0]][l[1]] = set()
+        fcache[l[0]][l[1]] = {True: False, False: False}
+    w = ''.join(l[2:])
+    g[l[0]][l[1]].add(w)
+    fcache[l[0]][l[1]][fem(w)] = True
+
+f = START
+
+dfs([f])
diff --git a/graph.py b/graph.py
@@ -1,33 +1,110 @@
 #!/usr/bin/python3
 
 import sys
-from common import vowels
+from common import vowels, is_pref, semivowels, fem
 
 g = {}
 
+def nedd(myg):
+    n = 0
+    for f in myg.keys():
+        for t in myg[f].keys():
+            n += len(myg[f][t])
+    return n
+
+def tred(myg):
+    g2 = {}
+    tos_m = set()
+    tos_f = set()
+    for f in myg.keys():
+        for t in myg[f].keys():
+            for w in myg[f][t]:
+                if fem(w):
+                    tos_f.add(t)
+                else:
+                    tos_m.add(t)
+    for f in myg.keys():
+        if f not in tos_f and f not in tos_m:
+            continue
+        ok_f = False
+        ok_m = False
+        for t in myg[f].keys():
+            for w in myg[f][t]:
+                if fem(w):
+                    ok_f = True
+                else:
+                    ok_m = True
+        if (not ok_f) and f not in tos_f:
+            continue
+        if (not ok_m) and f not in tos_m:
+            continue
+        for t in myg[f].keys():
+            if t not in myg.keys():
+                continue
+            for w in myg[f][t]:
+                if f not in g2.keys():
+                    g2[f] = {}
+                if t not in g2[f].keys():
+                    g2[f][t] = set()
+                g2[f][t].add(w)
+    return g2
+
 for l in sys.stdin.readlines():
     l = l.strip()
     w, p = l.split('\t')
     p = p.split(' ')
-    if p[0][0] in vowels:
-        continue
-    #f = p[0] + ('' if p[0][0] in vowels else p[1])
     for i in range(len(p[2])+1):
         f = p[0] + p[1] + p[2][:i]
-        t = (p[-2][-1] if p[-1][0] in vowels else (p[-3][-1] + p[-2])) + p[-1]
+        t = ((p[-2][-2:] if p[-2][-1] in semivowels else p[-2][-1]) if p[-1][0] in vowels else ((p[-3][-2:] if p[-3][-1] in
+            semivowels else p[-3][-1]) + p[-2])) + p[-1]
         # print ("%s : %s -> %s" % (w, f, t))
         if f not in g.keys():
-            g[f] = set()
-        g[f].add((t, w))
-
-for j in range(100):
-    print("-----------------")
-    f = list(g.keys())[j]
-    print(f)
-    for i in range(100):
-        t = list(g[f])[0]
-        if t[0] not in g.keys():
+            g[f] = {}
+        if t not in g[f].keys():
+            g[f][t] = set()
+        addit = True
+        for ws in g[f][t]:
+            # if a stored word with the same fem() value is a prefix of ours, skip ours
+            v1 = fem(ws)
+            v2 = fem(w)
+            if is_pref(ws, w) and v1 == v2:
+                addit = False
+                break
+            # if our word is a prefix of a stored one with the same fem() value, remove that one
+            if is_pref(w, ws) and v1 == v2:
+                g[f][t].remove(ws)
+        if not addit:
             continue
-        print("%s -[%s]-> %s" % (f, t[1], t[0]))
-        f = t[0]
+        g[f][t].add(w)
+
+ned = nedd(g)
+print(ned, file=sys.stderr)
+
+while True:
+    g2 = tred(g)
+    ned2 = nedd(g2)
+    if ned2 == ned:
+        break
+    g = g2
+    ned = ned2
+    print(ned, file=sys.stderr)
+
+
+
+for f in g.keys():
+    for t in g[f].keys():
+        for w in g[f][t]:
+            print ("%s %s %s" % (f, t, w))
+
+#
+#for j in range(100):
+#    print("-----------------")
+#    f = list(g.keys())[j]
+#    print(f)
+#    for i in range(100):
+#        t = list(g[f])[0]
+#        if t[0] not in g.keys():
+#            continue
+#        print("%s -[%s]-> %s" % (f, t[1], t[0]))
+#        f = t[0]
 
diff --git a/only3.py b/only3.py
@@ -11,5 +11,14 @@ for l in sys.stdin.readlines():
     parse = re.split(vowels_regexp, f[1])
     parse = [x for x in parse if len(x) > 0]
     s = sum([1 for x in parse if x[0] in vowels])
-    if s == 3:
-        print("%s\t%s" % (f[0], ' '.join(parse)))
+    if s != 3:
+        continue
+    if parse[0][0] in vowels:
+        continue
+    if parse[-1][0] in vowels and parse[-2][0] in vowels:
+        continue
+    if (parse[-1][0] not in vowels and parse[-2][0] in vowels
+            and parse[-3][0] in vowels):
+        continue
+    print("%s\t%s" % (f[0], ' '.join(parse)))
+
diff --git a/script.sh b/script.sh
@@ -1,5 +1,8 @@
 #!/bin/bash
 
+#cat Lexique371/Bases+Scripts/Lexique3.txt |
+#  ./lexique_fix.sh| cut -f1,2,4 | uniq > lexique.txt
 pv lexique.txt| ./only3.py > lexique3
-cat lexique3 | ./graph.py
+cat lexique3 | ./graph.py > graph
+cat graph | ./cycle.py

	songleash generate chansons en laisse
	git clone https://a3nm.net/git/songleash/
	Log \| Files \| Refs

common.py	\|	24	+++++++++++++++++++++++-
cycle.py	\|	93	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
graph.py	\|	111	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
only3.py	\|	13	+++++++++++--
script.sh	\|	5	++++-