frhyme

guess the last phonemes of a French word
git clone https://a3nm.net/git/frhyme/
Log | Files | Refs | README

commit c8eef9d4def727451f40f4f4e6c67e3b094fb97a
parent bc2a67b0a5c3be54d027370fcccbb4583bc109d7
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Sun, 12 Jun 2011 19:28:53 -0400

it works

Diffstat:
additions | 50++++++++++++++++++++------------------------------
buildtrie.py | 48+++++++++++++-----------------------------------
compresstrie.py | 40+++++++++++++++++++++++++---------------
pron.py | 35+++++++++++++++++++++++++++++++----
4 files changed, 89 insertions(+), 84 deletions(-)

diff --git a/additions b/additions @@ -1,30 +1,20 @@ -1 heaume -1 hèlement -1 hertz -1 héraut -1 hit-parade -1 high-five -1 hlm -1 hobby -1 hongrois -1 homard -1 hoquet -1 hors-piste -1 hors-bord -1 huée -1 hildegarde -1 hiroshima -1 heimatlos -0 haÿ-les-roses -0 heur -0 heure -0 h -1 haveler -0 hallucination -0 hallucine -0 halène -0 halèner -0 hadopisme -1 hadopi -0 hellénisme -0 hiatus +almanach almana +dompte d#t +domptent d#t +dompterai d#tRE +dompterait d#tRE +dompter d#te +dompteur d#t9R +dompteurs d#t9R +dompteuse d#t2z +dompteuses d#t2z +domptez d#te +tabis tabi +libye libi +est E +bœuf b9f +bœufs b2f +dis-je iZ +employ @plwa +amusemens amyzm@ +parens paR@ diff --git a/buildtrie.py b/buildtrie.py @@ -6,52 +6,29 @@ representing this mapping""" import json import sys -NUM=4 - # first item is a dictionnary from values to an int indicating the # number of occurrences with this prefix having this value # second item is a dictionnary from letters to descendent nodes def empty_node(): - return [0, {}] -def empty_node2(): - return [empty_node(), {}] - -trie = empty_node2() + return [{}, {}] -def too_ambiguous(trie): - values, children = trie - if values < 2: - return False - r = float(sum(sorted([x[0] for x in children.values()])[-NUM:])) / (values - 1) - return r < .99 +trie = empty_node() -def insert(trie, key): +def insert(trie, key, val): """Insert val for key in trie""" values, children = trie - # increment count for val - trie[0] += 1 - if trie[1] == None: - return - if too_ambiguous(trie): - trie[1] = None #too ambiguous - return + # create a new value, if needed + if len(key) == 0: + if val not in values.keys(): + values[val] = 0 + # increment count for val + values[val] += 1 if len(key) > 0: # create a new node if needed if key[0] not in children.keys(): children[key[0]] = empty_node() # recurse - return insert(children[key[0]], key[1:]) - -def insert2(trie, key, val): - """Insert val for key in trie""" - values, children = trie - insert(values, val) - if len(key) > 0: - # create a new node if needed - if key[0] not in children.keys(): - children[key[0]] = empty_node2() - # recurse - return insert2(children[key[0]], key[1:], val) + return insert(children[key[0]], key[1:], val) while True: line = sys.stdin.readline() @@ -59,13 +36,14 @@ while True: break line = line.split('\t') value = line[1].rstrip()[-4:][::-1] - word = line[0].lower()[-8:][::-1] if len(line) == 2 else '' + word = line[0].lower()[::-1] if len(line) == 2 else '' #print(word) #print(value) # a trailing space is used to mark termination of the word # this is useful in cases where a prefix of a word is a complete, # different word with a different value - insert2(trie, word+' ', value) + # two spaces because some data words have multiple spaces + insert(trie, word+' ', value) print(json.dumps(trie)) diff --git a/compresstrie.py b/compresstrie.py @@ -8,24 +8,34 @@ import sys trie = json.load(sys.stdin) -def linear(trie): - v, c = trie - if c == None: - return False - if len(c.keys()) == 0: - return True - elif len(c.keys()) > 1: - return False - else: - return linear(list(c.values())[0]) - def compress(trie): """Compress the trie""" - if linear(trie[0]): - # no need for children, there is no more doubt - trie[1] = {} + ref = None + num = 0 + ok = True + if trie[0] != {}: + if len(trie[0].keys()) > 1: + return None + ref = list(trie[0].keys())[0] + num = trie[0][ref] for child in trie[1].values(): - compress(child) + x = compress(child) + if not ok or x == None: + ok = False + continue + r, n = x + if ref == None: + ref = r + if ref != r: + ok = False + num += n + if not ok: + return None + trie[0] = {} + trie[0][ref] = num + trie[1] = {} + #print(ref, file=sys.stderr) + return ref, num compress(trie) diff --git a/pron.py b/pron.py @@ -13,15 +13,42 @@ f = open(os.path.join(os.path.dirname( trie = json.load(f) f.close() +def to_list(d, rev=True): + l = [] + for a in d.keys(): + if rev: + l.append((d[a], a[::-1])) + else: + l.append((d[a], a)) + return l + +def trie2list(trie): + v, c = trie + if c == {}: + return to_list(v) + else: + d = {} + for child in c.keys(): + l = trie2list(c[child]) + for x in l: + if x[1] not in d.keys(): + d[x[1]] = 0 + d[x[1]] += x[0] + return to_list(d, False) + def do_lookup(trie, key): - if len(key) == 0 or (key[0] not in trie[1].keys()): - print(key) - return trie[0] + if len(key) == 0 or key[0] not in trie[1].keys(): + return trie2list(trie) return do_lookup(trie[1][key[0]], key[1:]) + +def nbest(l, t): + l = sorted(l)[-t:] + l.reverse() + return l def lookup(key): """Return pronunciations for key""" - return do_lookup(trie, key[::-1] + ' ') + return nbest(do_lookup(trie, key[::-1] + ' '), 5) if __name__ == '__main__': while True: