duire

French missing verbs with prefix forms
git clone https://a3nm.net/git/duire/
Log | Files | Refs

commit 25a879c0f2170ac80c0da862a0a5146b4ed1efa4
parent d698e2137fa0ba51ffd8b54dac0252110794b20c
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Mon, 23 Mar 2015 01:41:21 +0100

continue

Diffstat:
calcpref.py | 33 +++++++++++++++++++++++++++++++++
prefix.py | 49 ++++++++++++++++++++++---------------------------
2 files changed, 55 insertions(+), 27 deletions(-)

diff --git a/calcpref.py b/calcpref.py
@@ -0,0 +1,33 @@
+#!/usr/bin/python3 -O
+# -*- encoding: utf-8
+
+import operator
+import sys
+
+words = set()
+
+for l in sys.stdin.readlines():
+    words.add(l.rstrip())
+
+myprefs = {}
+done = 0
+every = 200
+
+for w in words:
+    if done % every == 0:
+        print("done %d" % done, file=sys.stderr)
+    done += 1
+    for i in range(len(words)):
+        w2 = w[i:]
+        if i == 0 or i == len(words):
+            continue
+        if w2 in words:
+            p = w[:i]
+            if p not in myprefs.keys():
+                myprefs[p] = 0
+            myprefs[p] += 1
+
+sorted_x = sorted(myprefs.items(), key=operator.itemgetter(1))
+for (t, v) in sorted_x:
+    print("%s %s" % (t, v))
+
diff --git a/prefix.py b/prefix.py
@@ -1,45 +1,40 @@
-#!/usr/bin/python3
+#!/usr/bin/python3 -O
 # -*- encoding: utf-8
 
+import operator
 import sys
 
-prefixes = ["dé", "re", "par", "ex", "sous", "sur", "in", "as", "bi", "em",
-"ac", "rec", "di", "su", "en"]
+prefixes = {}
 words = set()
 interesting = {}
 threshold = 2
+maxlen = 2
+keep = 30
+exp = 0.2
 
-for l in sys.stdin.readlines():
+fwords = open(sys.argv[1])
+for l in fwords.readlines():
     words.add(l.rstrip())
+fwords.close()
+fpref = open(sys.argv[2])
+for l in fpref.readlines():
+    t = l.rstrip().split(' ')
+    prefixes[t[0]] = int(t[1])
+fpref.close()
 
 for w in words:
-    for p in prefixes:
+    for p, v in prefixes.items():
         if w.startswith(p):
             w2 = w[len(p):]
+            if w2 in words:
+                continue
             if w2 not in interesting.keys():
                 interesting[w2] = set()
-            interesting[w2].add(w)
+            interesting[w2].add((w, v))
 
-for wi, ws in interesting.items():
-    if wi in words:
-        continue
-    if len(ws) >= threshold:
-        print(wi, ws)
+sortint = sorted(interesting.items(), key=lambda x: -sum((t[1]**exp for t in x[1]), 0))
 
-#myprefs = {}
-#pthresh = 4
-#cons = "bcçdfghjklmnpqrstvwxz"
-# for wi, ws in interesting.items():
-#     if wi in words:
-#         continue
-#     for w2 in words:
-#         if w2.endswith(wi):
-#             mypref = w2[:len(wi)]
-#             if mypref not in myprefs.keys():
-#                 myprefs[mypref] = 0
-#             myprefs[mypref] += 1
-#
-# for mypref, v in myprefs.items():
-#     if v >= pthresh:
-#         print (mypref, v)
+for (wi, ws) in sortint:
+    ws = sorted(ws, key=operator.itemgetter(1), reverse=True)
+    print(wi+":", ' '.join(t[0] for t in ws))
 