songleash

generate chansons en laisse
git clone https://a3nm.net/git/songleash/
Log | Files | Refs

commit 32529efe519512a2872a244208ba1b6c0b68a48e
parent 89b86df701323f63136bb6a035f424856bee393a
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Tue,  4 Aug 2015 18:21:04 +0200

continue

Diffstat:
common.py | 24 +++++++++++++++++++++++-
cycle.py | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
graph.py | 111 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
only3.py | 13 +++++++++++--
script.sh | 5 ++++-
5 files changed, 225 insertions(+), 21 deletions(-)

diff --git a/common.py b/common.py @@ -1,4 +1,26 @@ #!/usr/bin/python3 -vowels = 'io92EeaOy#$u()' +import unicodedata +vowels = 'io9@2EeaOy#$u()' +vowels_script = "aeiouy" +semivowels = 'j8w' + +def is_pref(u, v): + if len(v) < len(u): + return False + for i in range(len(u)): + if v[i] != u[i]: + return False + return True + +def fem(w): + return w.endswith('e') or w.endswith('es') + + +# http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string +def rmacc(s): + """Strip accent from a string + + with_except keeps specifically 'é' and 'è'""" + return ''.join([unicodedata.normalize('NFD', x) for x in s]) diff --git a/cycle.py b/cycle.py @@ -0,0 +1,93 @@ +#!/usr/bin/python3 + +import sys +from common import fem, vowels_script, rmacc, semivowels + +g = {} +START = "sEl" +bestlen = 0 +visited = set() +fcache = {} + +def print_word(w, prn): + s = "" + vowels = False + for i in range(len(w)): + p = -(i+1) + s += w[p] + x = rmacc(w[p])[0] + if not vowels: + if x in vowels_script and (i > 0 or w[-1] != 'e') and (i > 1 or w[-1] != + 's' or w[-2] != 'e') and (i > 2 or (not w.endswith('gue')) + and + not (w.endswith('que'))) and (i > 3 or (not + w.endswith('gues') and (not w.endswith('ques')))): + vowels = True + else: + if x not in vowels_script and x != '-': + if prn[0] in semivowels: + s = s[:-1] + break + # exception + if w[p] == 'h': + s += w[-(i+2)] + break + s = s[::-1] + print ("%s, %s, %s, %s, %s" % (w, w, w, s, s)) + +def print_list(l): + l = l + [l[1]] + print("------------------") + last = "" + for i in range(len(l)-1): + ok = False + for w in g[l[i]][l[i+1]]: + #print(w, fem(w), last) + if (len(last) == 0 and fem(w) == False) or (len(last) > 0 and fem(last) != fem(w)): + last = w + print_word(last, l[i+1]) + #print(fem(last)) + ok = True + break + if not ok: + print("ERRROR %s %s" % (l[i], l[i+1])) + return + print("------------------") + print("") + +def dfs(l): + global g + global bestlen + global visited + 
#print(l) + if (l[-1] == START and len(l) > 1 and len(l) % 2 == 1): + if len(l) > bestlen: + print_list(l) + bestlen = len(l) + return + if l[-1] in visited: + return + v = l[-1] + visited.add(v) + for t in sorted(list(g[v].keys())): + #if (len(l) % 2 != 0) in [fem(w) for w in g[v][t]]: + if fcache[v][t][len(l) % 2 == 0]: + dfs(l+[t]) + visited.remove(v) + + +for l in sys.stdin.readlines(): + l = l.strip().split(' ') + if l[0] not in g.keys(): + g[l[0]] = {} + fcache[l[0]] = {} + if l[1] not in g[l[0]].keys(): + g[l[0]][l[1]] = set() + fcache[l[0]][l[1]] = {True: False, False: False} + w = ''.join(l[2:]) + g[l[0]][l[1]].add(w) + fcache[l[0]][l[1]][fem(w)] = True + +f = START + +dfs([f]) diff --git a/graph.py b/graph.py @@ -1,33 +1,110 @@ #!/usr/bin/python3 import sys -from common import vowels +from common import vowels, is_pref, semivowels, fem g = {} +def nedd(myg): + n = 0 + for f in myg.keys(): + for t in myg[f].keys(): + n += len(myg[f][t]) + return n + +def tred(myg): + g2 = {} + tos_m = set() + tos_f = set() + for f in myg.keys(): + for t in myg[f].keys(): + for w in myg[f][t]: + if fem(w): + tos_f.add(t) + else: + tos_m.add(t) + for f in myg.keys(): + if f not in tos_f and f not in tos_m: + continue + ok_f = False + ok_m = False + for t in myg[f].keys(): + for w in myg[f][t]: + if fem(w): + ok_f = True + else: + ok_m = True + if (not ok_f) and f not in tos_f: + continue + if (not ok_m) and f not in tos_m: + continue + for t in myg[f].keys(): + if t not in myg.keys(): + continue + for w in myg[f][t]: + if f not in g2.keys(): + g2[f] = {} + if t not in g2[f].keys(): + g2[f][t] = set() + g2[f][t].add(w) + return g2 + for l in sys.stdin.readlines(): l = l.strip() w, p = l.split('\t') p = p.split(' ') - if p[0][0] in vowels: - continue - #f = p[0] + ('' if p[0][0] in vowels else p[1]) for i in range(len(p[2])+1): f = p[0] + p[1] + p[2][:i] - t = (p[-2][-1] if p[-1][0] in vowels else (p[-3][-1] + p[-2])) + p[-1] + t = ((p[-2][-2:] if p[-2][-1] in semivowels else 
p[-2][-1]) if p[-1][0] in vowels else ((p[-3][-2:] if p[-3][-1] in + semivowels else p[-3][-1]) + p[-2])) + p[-1] # print ("%s : %s -> %s" % (w, f, t)) if f not in g.keys(): - g[f] = set() - g[f].add((t, w)) - -for j in range(100): - print("-----------------") - f = list(g.keys())[j] - print(f) - for i in range(100): - t = list(g[f])[0] - if t[0] not in g.keys(): + g[f] = {} + if t not in g[f].keys(): + g[f][t] = set() + addit = True + for ws in g[f][t]: + # if one is prefix of us, give up + v1 = fem(ws) + v2 = fem(w) + if is_pref(ws, w) and v1 == v2: + addit = False + break + # if we are pref of it, remove it + if is_pref(w, ws) and v1 == v2: + g[f][t].remove(ws) + if not addit: continue - print("%s -[%s]-> %s" % (f, t[1], t[0])) - f = t[0] + g[f][t].add(w) + +ned = nedd(g) +print(ned, file=sys.stderr) + +while True: + g2 = tred(g) + ned2 = nedd(g2) + if ned2 == ned: + break + g = g2 + ned = ned2 + print(ned, file=sys.stderr) + + + +for f in g.keys(): + for t in g[f].keys(): + for w in g[f][t]: + print ("%s %s %s" % (f, t, w)) + +# +#for j in range(100): +# print("-----------------") +# f = list(g.keys())[j] +# print(f) +# for i in range(100): +# t = list(g[f])[0] +# if t[0] not in g.keys(): +# continue +# print("%s -[%s]-> %s" % (f, t[1], t[0])) +# f = t[0] diff --git a/only3.py b/only3.py @@ -11,5 +11,14 @@ for l in sys.stdin.readlines(): parse = re.split(vowels_regexp, f[1]) parse = [x for x in parse if len(x) > 0] s = sum([1 for x in parse if x[0] in vowels]) - if s == 3: - print("%s\t%s" % (f[0], ' '.join(parse))) + if s != 3: + continue + if parse[0][0] in vowels: + continue + if parse[-1][0] in vowels and parse[-2][0] in vowels: + continue + if (parse[-1][0] not in vowels and parse[-2][0] in vowels + and parse[-3][0] in vowels): + continue + print("%s\t%s" % (f[0], ' '.join(parse))) + diff --git a/script.sh b/script.sh @@ -1,5 +1,8 @@ #!/bin/bash +#cat Lexique371/Bases+Scripts/Lexique3.txt | +# ./lexique_fix.sh| cut -f1,2,4 | uniq > lexique.txt pv 
lexique.txt| ./only3.py > lexique3 -cat lexique3 | ./graph.py +cat lexique3 | ./graph.py > graph +cat graph | ./cycle.py