graph2.py (2467B)
#!/usr/bin/python3
"""Build a word graph from stdin and print its useful edges.

Input (stdin): one entry per line, tab-separated:
    word <TAB> pronunciation split in syllables with '-' [<TAB> syl]
The third field is optional and is carried through to the output unchanged
(apart from surrounding whitespace).

Nodes are pairs (feminine flag, phoneme string).  Each word contributes
edges from prefixes of its pronunciation to its last syllable, with the
feminine flag toggled between the two ends.

Output (stdout): one line per edge whose two ends are both reachable from
`start` and able to reach `start`, as six tab-separated columns:
    source gender ('f'/'m'), source phonemes,
    target gender ('f'/'m'), target phonemes, word, syl
"""

import sys
from common import vowels, semivowels, fem, start
from collections import defaultdict, deque

# g[source][target] is the list of (word, syl) pairs labelling that edge,
# in the order the words were read from stdin.
g = defaultdict(lambda: defaultdict(list))


def explore(graph, fr):
    """Return the set of nodes reachable from `fr` in `graph`.

    `graph` maps a node to a mapping whose keys are its successors.  Nodes
    without an entry are treated as having no successors, and `graph` is
    never modified (no entries are created for sink nodes).  `fr` itself is
    always part of the result.
    """
    seen = set()
    queue = deque([fr])
    while queue:
        pos = queue.popleft()
        if pos in seen:
            continue
        seen.add(pos)
        # iterating a mapping yields its keys, i.e. the successors of pos
        queue.extend(graph.get(pos, ()))
    return seen


for line in sys.stdin:
    line = line.strip()
    if not line:
        # tolerate blank lines instead of failing on the unpacking below
        continue
    fields = line.split('\t')
    if len(fields) < 3:
        # the third column is optional
        fields.append('')
    w, p, syl = fields
    syl = syl.strip()
    syllables = p.split('-')
    allp = ''.join(syllables)
    # NOTE(review): fem() comes from `common`; presumably it tells whether
    # the word has a feminine ending -- confirm against its definition.
    fems = fem(w)
    # w allows us to go from its first syllable to its last, toggling fem
    # status
    to = (fems, syllables[-1])
    if allp.startswith(to[1]):
        # loop (e.g., trinitrine), not very pretty: skip the whole word
        continue
    # we can go to something that requires less consonants: every prefix
    # ending between the first vowel (included) and the second vowel
    # (excluded) is a valid source node
    seenvow = False
    for i, phoneme in enumerate(allp):
        if phoneme in vowels:
            if seenvow:
                break
            seenvow = True
        if not seenvow:
            # still in the consonants preceding the first vowel
            continue
        fr = (not fems, allp[:i + 1])
        # NOTE: an earlier version dropped words that extend a word already
        # stored on the same edge; all words are currently kept.
        g[fr][to].append((w, syl))

# compute reverse graph (same edge labels, direction flipped)
rg = defaultdict(dict)
for f, targets in g.items():
    for t, words in targets.items():
        rg[t][f] = words

# only keep what's accessible from start and can lead back to start
access = explore(g, start)
coaccess = explore(rg, start)
useful = access & coaccess
useful.add(start)

for f, targets in g.items():
    if f not in useful:
        continue
    for t, words in targets.items():
        if t not in useful:
            continue
        # only print the first word of the edge, for now (the most frequent
        # one if the input is sorted by decreasing frequency)
        for w in words[:1]:
            print("%s\t%s\t%s\t%s\t%s\t%s" % (
                'f' if f[0] else 'm', f[1],
                'f' if t[0] else 'm', t[1],
                w[0], w[1]))