songleash

generate chansons en laisse
git clone https://a3nm.net/git/songleash/
Log | Files | Refs

commit 320f950c9cc6d5fea94b1538bbf0d10b51ff6a79
parent 73f0f51928dfbccd6857ca0fa2d82c05decd6264
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Thu, 27 Aug 2015 00:41:23 +0200

better

Diffstat:
common.py | 11++---------
cycle.py | 61++++++++++++++++++++++++++-----------------------------------
graph.py | 97-------------------------------------------------------------------------------
script.sh | 2+-
4 files changed, 29 insertions(+), 142 deletions(-)

diff --git a/common.py b/common.py @@ -2,19 +2,12 @@ import unicodedata +# starting symbol, must be part of cycle +start = (False, 'm@') vowels = 'io9@2EeaOy#$u()§°51' vowels_script = "aeiouy" semivowels = 'j8w' -# TODO replace by startswith -def is_pref(u, v): - if len(v) < len(u): - return False - for i in range(len(u)): - if v[i] != u[i]: - return False - return True - def fem(w): return ((w.endswith('e') and not w[-2] in "ui") or (w.endswith('es') and not w[-3] in "ui")) diff --git a/cycle.py b/cycle.py @@ -1,12 +1,11 @@ -m!/usr/bin/python3 +#!/usr/bin/python3 import sys -from common import fem, vowels_script, strip_accents, semivowels +from common import fem, vowels_script, strip_accents, semivowels, start +from collections import defaultdict -# TODO use defaultdict +g = defaultdict(lambda: defaultdict(lambda: set())) -g = {} -START = "sEl" bestlen = 0 visited = set() fcache = {} @@ -35,25 +34,21 @@ def print_word(w, prn): s += w[-(i+2)] break s = s[::-1] - print ("%s, %s, %s, %s, %s" % (w, w, w, s, s)) + print ("## %s, %s, %s, %s, %s" % (w, w, w, s, s)) def print_list(l): l = l + [l[1]] print("------------------") - last = "" for i in range(len(l)-1): - ok = False - for w in g[l[i]][l[i+1]]: - #print(w, fem(w), last) - if (len(last) == 0 and fem(w) == False) or (len(last) > 0 and fem(last) != fem(w)): - last = w - print_word(last, l[i+1]) - #print(fem(last)) - ok = True - break - if not ok: - print("ERRROR %s %s" % (l[i], l[i+1])) - return + w = list(g[l[i]][l[i+1]])[0] + #print(w[0]) + fs = w[1].split('-') + if len(w[1]) == 0 or len(fs) < 3: + print_word(w[0], l[i+1][1]) + else: + end = ''.join(fs[2:]) + print (" %s, %s, %s, %s, %s" + % (w[0], w[0], w[0], end, end)) print("------------------") print("") @@ -61,10 +56,11 @@ def print_list(l): def dfs(l): global g + global start global bestlen global visited #print(l) - if (l[-1] == START and len(l) > 1 and len(l) % 2 == 1): + if l[-1] == start and len(l) > 1: if len(l) > bestlen: print_list(l) bestlen = len(l) @@ -73,25 +69,20 @@ def dfs(l): return v = l[-1] visited.add(v) - for t in sorted(list(g[v].keys())): + for t in g[v].keys(): #if (len(l) % 2 != 0) in [fem(w) for w in g[v][t]]: - if fcache[v][t][len(l) % 2 == 0]: - dfs(l+[t]) + #if fcache[v][t][len(l) % 2 == 0]: + dfs(l+[t]) visited.remove(v) for l in sys.stdin.readlines(): - l = l.strip().split(' ') - if l[0] not in g.keys(): - g[l[0]] = {} - fcache[l[0]] = {} - if l[1] not in g[l[0]].keys(): - g[l[0]][l[1]] = set() - fcache[l[0]][l[1]] = {True: False, False: False} - w = ''.join(l[2:]) - g[l[0]][l[1]].add(w) - fcache[l[0]][l[1]][fem(w)] = True + l = l.strip().split('\t') + if len(l) < 6: + l.append('') + fr = (l[0] == 'f', l[1]) + to = (l[2] == 'f', l[3]) + g[fr][to].add((l[4], l[5])) -f = START +dfs([start]) -dfs([f]) diff --git a/graph.py b/graph.py @@ -1,97 +0,0 @@ -#!/usr/bin/python3 - -import sys -from common import vowels, is_pref, semivowels, fem -from collections import defaultdict - -g = defaultdict(lambda: defaultdict(lambda: set())) - -def nedd(myg): - n = 0 - for f in myg.keys(): - for t in myg[f].keys(): - n += len(myg[f][t]) - return n - -def tred(myg): - g2 = defaultdict(lambda: defaultdict(lambda: set())) - tos_m = set() - tos_f = set() - for f in myg.keys(): - for t in myg[f].keys(): - for w in myg[f][t]: - if fem(w): - tos_f.add(t) - else: - tos_m.add(t) - for f in myg.keys(): - if f not in tos_f and f not in tos_m: - continue - ok_f = False - ok_m = False - for t in myg[f].keys(): - for w in myg[f][t]: - if fem(w): - ok_f = True - else: - ok_m = True - if (not ok_f) and f not in tos_f: - continue - if (not ok_m) and f not in tos_m: - continue - for t in myg[f].keys(): - if t not in myg.keys(): - continue - for w in myg[f][t]: - g2[f][t].add(w) - return g2 - -for l in sys.stdin.readlines(): - l = l.strip() - w, p = l.split('\t') - p = p.split(' ') - for i in range(len(p[2])+1): - f = p[0] + p[1] + p[2][:i] - # TODO: this cut is non-optimal: should be "radio/gramme", "géo/graphe", - # but how to tell? - t = ((p[-2][-2:] if p[-2][-1] in semivowels else p[-2][-1]) if p[-1][0] in vowels else ((p[-3][-2:] if p[-3][-1] in - semivowels else p[-3][-1]) + p[-2])) + p[-1] - # print ("%s : %s -> %s" % (w, f, t)) - addit = True - for ws in g[f][t]: - # if one is prefix of us, give up - v1 = fem(ws) - v2 = fem(w) - if is_pref(ws, w) and v1 == v2: - addit = False - break - # if we are pref of it, remove it - if is_pref(w, ws) and v1 == v2: - g[f][t].remove(ws) - if not addit: - continue - g[f][t].add(w) - - -# TODO: replace this by a true SCC - -ned = nedd(g) -print(ned, file=sys.stderr) - -while True: - g2 = tred(g) - ned2 = nedd(g2) - if ned2 == ned: - break - g = g2 - ned = ned2 - print(ned, file=sys.stderr) - - - -for f in g.keys(): - for t in g[f].keys(): - for w in g[f][t]: - print ("%s %s %s" % (f, t, w)) - - diff --git a/script.sh b/script.sh @@ -7,7 +7,7 @@ # ./lexique_fix.sh| cut -f1,2,4 | grep NOM | uniq > lexique.txt cat ~/documents/lexique/lexique | cut -f1,2,4,23,24,28 | grep NOM | grep '\s3\s' | cut -f1,4,6 | rev | uniq -f 2 | rev > lexique_full pv lexique_full | ./only3.py > lexique3 -cat lexique3 | ./graph.py > graph +cat lexique3 | ./graph2.py > graph cat graph | ./cycle.py # cat <(echo "digraph G {") <(sed 's/\([^ ]*\) \([^ ]*\) \(.*\)/\1 -> \2