commit cb33de69f07e5a8af717220e72c75b3719e27bdf
parent 0168f74585cbf074d9337696241f92b126016306
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Sun, 10 Jul 2011 22:05:58 -0400
refactoring
Diffstat:
common.py | | | 2 | +- |
metric.py | | | 98 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
poetlint.py | | | 265 | +------------------------------------------------------------------------------ |
template.py | | | 172 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
4 files changed, 274 insertions(+), 263 deletions(-)
diff --git a/common.py b/common.py
@@ -2,6 +2,7 @@
import unicodedata
import re
+import haspirater
vowels = 'aeiouyϾ'
consonants = "[bcçdfghjklmnpqrstvwxz*-]"
@@ -10,7 +11,6 @@ consonants = "[bcçdfghjklmnpqrstvwxz*-]"
# in some cases but not others...
sure_end_fem = ['es', 'e']
-
# http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
def strip_accents_one(s, with_except):
r = []
diff --git a/metric.py b/metric.py
@@ -0,0 +1,98 @@
+import re
+from common import strip_accents, normalize, is_vowels, consonants, \
+ sure_end_fem
+from vowels import possible_weights
+import haspirater
+
+def annotate_aspirated(word):
+    """Annotate aspirated 'h'.
+
+    Return '*' + word when word starts with 'h' and haspirater.lookup(word)
+    is true; return word unchanged in every other case.
+    """
+    # startswith() rather than word[0], so an empty word cannot raise
+    if word.startswith('h') and haspirater.lookup(word):
+        return '*' + word
+    return word
+
+def fit(chunks, pos, left):
+    """Enumerate the possible aligns of chunks[pos:].
+
+    chunks -- list of string chunks (vowel groups, other groups, ' ')
+    pos    -- index of the first chunk that still has to be processed
+    left   -- remaining weight budget; a branch is abandoned as soon as
+              the budget becomes negative
+
+    Return a list of aligns.  An align is a list in which every chunk
+    accepted by is_vowels() is replaced by a (chunk, weight) tuple and
+    every other chunk is kept as a plain string.
+    """
+    if pos >= len(chunks):
+        # nothing left to process: exactly one (empty) way to finish
+        return [[]]
+    if left < 0:
+        # budget exceeded: no align on this branch
+        return []
+
+    chunk = chunks[pos]
+    if not is_vowels(chunk):
+        return [[chunk] + rest for rest in fit(chunks, pos + 1, left)]
+
+    weights = None
+    if pos >= len(chunks) - 2 and chunk == 'e':
+        # special case for endings: an 'e' which is the last chunk, or
+        # the last but one when followed by 's' or 'nt'
+        if pos == len(chunks) - 1 or chunks[pos + 1] == 's':
+            weights = [0]
+        elif chunks[pos + 1] == 'nt':
+            weights = [0, 1]
+    if weights is None:
+        weights = possible_weights(chunk)
+
+    result = []
+    for weight in weights:
+        result += [[(chunk, weight)] + rest
+                   for rest in fit(chunks, pos + 1, left - weight)]
+    return result
+
+def feminine(align, verse):
+    """Tell whether verse, aligned as align, is considered feminine.
+
+    align -- one align as produced by fit()
+    verse -- the verse text the align was computed from
+
+    Return True when verse ends with one of the suffixes in sure_end_fem,
+    or when it ends with 'ent' and the last but one item of align does not
+    carry weight 1; return False otherwise.
+    """
+    if any(verse.endswith(suffix) for suffix in sure_end_fem):
+        return True
+    # NOTE(review): for an 'ent' ending, align[-2] is presumably the
+    # (chunk, weight) tuple of the final 'e' -- confirm against parse()
+    return verse.endswith('ent') and align[-2][1] != 1
+
+def parse(text, bound):
+    """Return possible aligns for text.
+
+    text  -- the verse to analyse; it is passed through normalize() first
+    bound -- upper bound on the align length, used to limit cost (it is
+             the initial weight budget handed to fit())
+
+    Return a list of (align, fem) tuples, where align comes from fit()
+    and fem is the result of feminine() for that align.
+    """
+
+    original_text = normalize(text)
+
+    # avoid some vowel problems (presumably: keep the 'u' of 'qu' and of
+    # 'gu' + vowel out of the vowel groups); the patterns are literal,
+    # so plain str.replace() is enough
+    text = original_text
+    for old, new in (("qu", 'q'), ("gue", 'ge'), ("gué", 'gé'),
+                     ("guè", 'gè'), ("gua", 'ga')):
+        text = text.replace(old, new)
+
+    words = [annotate_aspirated(word)
+             for word in text.split(' ') if word != '']
+
+    # Split each word on runs of consonants, keeping the runs.  The
+    # quantifier is '+' and not '*': since Python 3.7 re.split() also
+    # splits on empty matches, which would cut every vowel group into
+    # single letters.
+    pattern = re.compile('(' + consonants + '+)', re.UNICODE)
+    for i in range(len(words)):
+        groups = [chunk for chunk in re.split(pattern, words[i])
+                  if chunk != '']
+        nwords = []
+        for chunk in groups:
+            if 'y' not in chunk or len(chunk) == 1 or chunk[0] == 'y':
+                nwords.append(chunk)
+            else:
+                # a 'y' inside a group becomes a separate 'Y' chunk
+                a = chunk.split('y')
+                nwords.append(a[0])
+                nwords.append('Y')
+                if a[1] != '':
+                    nwords.append(a[1])
+                else:
+                    # very special case :-/
+                    if groups == ['p', 'ay', 's']:
+                        nwords.append('y')
+        words[i] = nwords
+        if i > 0:
+            previous = words[i - 1]
+            # when the previous word has more than one vowel chunk, ends
+            # with 'e' and the current word starts with a vowel, drop
+            # that 'e' and mark the preceding chunk with an apostrophe
+            if sum(1 for chunk in previous if is_vowels(chunk)) > 1:
+                if previous[-1] == 'e' and is_vowels(words[i][0], True):
+                    previous.pop(-1)
+                    previous[-1] = previous[-1] + "'"
+
+    # join the words with ' ' chunks (no trailing separator)
+    for word in words:
+        word.append(' ')
+    chunks = sum(words, [])[:-1]
+
+    return [(align, feminine(align, original_text))
+            for align in fit(chunks, 0, bound)]
+
diff --git a/poetlint.py b/poetlint.py
@@ -2,269 +2,10 @@
import re
import sys
-import unicodedata
-import haspirater
import rhyme
-import error
+import metric
+import template
from pprint import pprint
-from vowels import possible_weights
-from common import strip_accents, normalize, is_vowels, consonants, \
- sure_end_fem
-from hemistiches import check_hemistiches
-
-def annotate_aspirated(word):
- """Annotate aspirated 'h'"""
- if word[0] != 'h':
- return word
- if haspirater.lookup(word):
- return '*'+word
- else:
- return word
-
-def fit(chunks, pos, left):
- if pos >= len(chunks):
- return [[]]
- if left < 0:
- return []
- if (not is_vowels(chunks[pos])):
- return [[chunks[pos]] + x for x in fit(chunks, pos+1, left)]
- else:
- if (pos >= len(chunks) - 2 and chunks[pos] == 'e'):
- # special case for endings
- if pos == len(chunks) - 1:
- weights = [0]
- elif chunks[pos+1] == 's':
- weights = [0]
- elif chunks[pos+1] == 'nt':
- weights = [0, 1]
- else:
- weights = possible_weights(chunks[pos])
- else:
- weights = possible_weights(chunks[pos])
- result = []
- for weight in weights:
- #print("Take %s with weight %d" % (chunks[pos], weight), file=sys.stderr)
- result += [[(chunks[pos], weight)] + x for x in fit(chunks, pos+1,
- left - weight)]
- return result
-
-def feminine(align, verse):
- for a in sure_end_fem:
- if verse.endswith(a):
- return True
- #pprint(align)
- if verse.endswith('ent') and align[-2][1] != 1:
- return True
- return False
-
-def parse(text, bound):
- original_text = normalize(text)
- text = re.sub("qu", 'q', original_text)
- text = re.sub("gue", 'ge', text)
- text = re.sub("gué", 'gé', text)
- text = re.sub("guè", 'gè', text)
- text = re.sub("gua", 'ga', text)
- #print(text, file=sys.stderr)
- words = text.split(' ')
- words = [annotate_aspirated(word) for word in words if word != '']
- pattern = re.compile('('+consonants+'*)', re.UNICODE)
- for i in range(len(words)):
- words[i] = re.split(pattern, words[i])
- words[i] = [chunk for chunk in words[i] if chunk != '']
- nwords = []
- for chunk in words[i]:
- if 'y' not in chunk or len(chunk) == 1 or chunk[0] == 'y':
- nwords.append(chunk)
- else:
- a = chunk.split('y')
- nwords.append(a[0])
- nwords.append('Y')
- if a[1] != '':
- nwords.append(a[1])
- else:
- # very special case :-/
- if words[i] == ['p', 'ay', 's']:
- nwords.append('y')
- words[i] = nwords
- if i > 0:
- if sum([1 for chunk in words[i-1] if is_vowels(chunk)]) > 1:
- if words[i-1][-1] == 'e' and is_vowels(words[i][0], True):
- words[i-1].pop(-1)
- words[i-1][-1] = words[i-1][-1]+"'"
- for word in words:
- word.append(' ')
- chunks = sum(words, [])[:-1]
-
- return list(map((lambda x : (x, feminine(x, original_text))),
- fit(chunks, 0, bound)))
-
-class Pattern:
- def __init__(self, metric, myid, femid, rhyme):
- self.metric = metric
- self.parse_metric()
- self.myid = myid
- self.femid = femid
- self.rhyme = rhyme
-
- def parse_metric(self):
- """Parse from a metric description"""
- verse = [int(x) for x in self.metric.split('/')]
- self.hemistiches = []
- self.length = 0
- for v in verse:
- self.length += v
- self.hemistiches.append(self.length)
- self.length = self.hemistiches.pop()
-
-class Template:
- def __init__(self, stream):
- self.template = []
- self.pattern_line_no = 0
- self.load(stream)
- self.line_no = 0
- self.position = 0
- self.env = {}
- self.femenv = {}
-
- def load(self, stream):
- """Load from a stream"""
- for line in f.readlines():
- line = line.strip()
- self.pattern_line_no += 1
- if line != '' and line[0] != '#':
- self.template.append(self.parse_template(line.lstrip().rstrip()))
-
- def count(self, align):
- #TODO cleanup
- return sum([x[1] for x in align if isinstance(x, tuple)])
-
- def rate(self, pattern, align):
- """Rate align according to pattern"""
- align, fem, hemis = align
- c = self.count(align)
- ok = True
- for h in hemis.values():
- if h != "ok":
- ok = False
- if ok and c == pattern.length:
- return 0
- return (len(hemis.keys())*abs(pattern.length - c)
- + sum([1 for x in hemis.values() if x != "ok"]))
-
- def match(self, line):
- """Check a line"""
- pattern = self.get()
- # compute alignments, check hemistiches, sort by score
- possible = parse(line, pattern.length + 2)
- possible = list(map((lambda p : (p[0], p[1],
- check_hemistiches(p[0], pattern.hemistiches))), possible))
- possible = map((lambda x : (self.rate(pattern, x), x)), possible)
- possible = sorted(possible, key=(lambda x : x[0]))
-
- errors = []
- if len(possible) == 0 or possible[0][0] != 0:
- errors.append(error.ErrorBadMetric(possible))
- if len(possible) == 0:
- return errors, pattern
- possible2 = []
- for (score, x) in possible:
- possible2.append((score, x))
- if score != possible[0][0]:
- break
- possible = possible2
-
- if pattern.myid not in self.env.keys():
- #print(normalize(line))
- self.env[pattern.myid] = rhyme.init_rhyme(normalize(line),
- pattern.rhyme)
- #print("nVALUE")
- #pprint(self.env[pattern.myid])
- #pprint(self.env[pattern.myid])
- else:
- old = list(self.env[pattern.myid])
- self.env[pattern.myid] = rhyme.check_rhyme(self.env[pattern.myid],
- (normalize(line), pattern.rhyme))
- #print("nVALUE")
- #pprint(self.env[pattern.myid])
- if (self.env[pattern.myid][1] == None and
- len(self.env[pattern.myid][0]) == 0):
- errors.append(error.ErrorBadRhymeSound(old, None))
- if pattern.femid not in self.femenv.keys():
- if pattern.femid == 'M':
- x = set(['M'])
- elif pattern.femid == 'F':
- x = set(['F'])
- else:
- x = set(['M', 'F'])
- self.femenv[pattern.femid] = x
- # TODO this is simplistic and order-dependent
- if pattern.femid.swapcase() in self.femenv.keys():
- new = set(['M', 'F']) - self.femenv[pattern.femid.swapcase()]
- if len(new) > 0:
- self.femenv[pattern.femid] = new
- old = list(self.femenv[pattern.femid])
- #pprint(possible)
- new = list(set(['F' if x[1] else 'M' for (score, x) in possible]))
- self.femenv[pattern.femid] &= set(new)
- #print(old)
- #print(new)
- if len(self.femenv[pattern.femid]) == 0:
- errors.append(error.ErrorBadRhymeGenre(old, new))
-
- return errors, pattern
-
- def parse_template(self, l):
- """Parse template from a line"""
- split = l.split(' ')
- metric = split[0]
- if len(split) >= 2:
- myid = split[1]
- else:
- myid = str(self.pattern_line_no)
- if len(split) >= 3:
- femid = split[2]
- else:
- femid = str(self.pattern_line_no)
- if len(split) >= 4:
- rhyme = [int(x) for x in split[3].split('|')]
- else:
- rhyme = []
- if len(rhyme) == 0:
- rhyme.append(1)
- while len(rhyme) < 3:
- rhyme.append(-1)
- return Pattern(metric, myid, femid, rhyme)
-
- def reset_conditional(self, d):
- return dict((k, v) for x, v in d.items() if x[-1] == '!')
-
- def reset_state(self, with_femenv=False):
- """Reset our state"""
- self.position = 0
- self.env = self.reset_conditional(self.env)
- self.femenv = self.reset_conditional(self.femenv)
-
- def get(self):
- """Get next state, resetting if needed"""
- if self.position >= len(self.template):
- self.reset_state()
- result = self.template[self.position]
- self.position += 1
- return result
-
- def check(self, line):
- """Check line (wrapper)"""
- self.line_no += 1
- line = line.rstrip()
- if line == '':
- return []
- #possible = [compute(p) for p in possible]
- #possible = sorted(possible, key=rate)
- errors, pattern = self.match(line)
- for error in errors:
- error.pos(line, self.line_no, pattern)
- return errors
-
if len(sys.argv) != 2:
print("Usage: %s TEMPLATE" % sys.argv[0], file=sys.stderr)
@@ -273,7 +14,7 @@ if len(sys.argv) != 2:
sys.exit(1)
f = open(sys.argv[1])
-template = Template(f)
+template = template.Template(f)
f.close()
def run():
diff --git a/template.py b/template.py
@@ -0,0 +1,172 @@
+import error
+from metric import parse
+from hemistiches import check_hemistiches
+import rhyme
+from common import normalize
+
+class Pattern:
+    """One template entry: a metric plus rhyme identifiers.
+
+    metric -- string of integers separated by '/', e.g. "6/6"
+    myid   -- identifier used as key for the rhyme state
+    femid  -- identifier used as key for the rhyme genre state
+    rhyme  -- list of integers forwarded to the rhyme module
+
+    After construction, `hemistiches` holds the cumulative positions of
+    all parts of the metric but the last, and `length` holds the total.
+    """
+
+    def __init__(self, metric, myid, femid, rhyme):
+        self.metric = metric
+        self.parse_metric()
+        self.myid = myid
+        self.femid = femid
+        self.rhyme = rhyme
+
+    def parse_metric(self):
+        """Parse from a metric description.
+
+        Sets self.hemistiches and self.length from self.metric; raises
+        ValueError if a part of the metric is not an integer.
+        """
+        verse = [int(x) for x in self.metric.split('/')]
+        self.hemistiches = []
+        self.length = 0
+        for v in verse:
+            self.length += v
+            self.hemistiches.append(self.length)
+        # the last cumulative position is the verse length itself, not a
+        # hemistich boundary
+        self.length = self.hemistiches.pop()
+
+class Template:
+    """Ordered list of Pattern objects against which lines are checked.
+
+    Patterns are read from a stream, one per line.  Every non-empty line
+    given to check() is matched against the next pattern; when the last
+    pattern has been used, the state is reset and the first pattern is
+    used again.
+    """
+
+    def __init__(self, stream):
+        self.template = []        # Pattern objects, in stream order
+        self.pattern_line_no = 0  # line counter used while loading
+        self.load(stream)
+        self.line_no = 0          # number of lines given to check()
+        self.position = 0         # index of the next pattern to use
+        self.env = {}             # rhyme state, keyed by Pattern.myid
+        self.femenv = {}          # sets of 'M'/'F', keyed by Pattern.femid
+
+    def load(self, stream):
+        """Load from a stream, skipping blank lines and '#' lines"""
+        for line in stream.readlines():
+            line = line.strip()
+            self.pattern_line_no += 1
+            if line != '' and line[0] != '#':
+                self.template.append(self.parse_template(line))
+
+    def count(self, align):
+        """total weight of an align"""
+        # only (chunk, weight) tuples carry a weight; plain strings don't
+        return sum([x[1] for x in align if isinstance(x, tuple)])
+
+    def rate(self, pattern, align):
+        """Rate align according to pattern.
+
+        align is an (align, fem, hemis) triple, hemis being a dict whose
+        values are "ok" for every satisfied hemistich.  Return 0 for a
+        perfect match, a positive penalty otherwise.
+        """
+        align, fem, hemis = align
+        c = self.count(align)
+        bad_hemis = sum([1 for x in hemis.values() if x != "ok"])
+        if bad_hemis == 0 and c == pattern.length:
+            return 0
+        return len(hemis.keys()) * abs(pattern.length - c) + bad_hemis
+
+    def match(self, line):
+        """Check a line against current pattern, return errors.
+
+        Return an (errors, pattern) tuple, where pattern is the pattern
+        the line was checked against.
+        """
+        pattern = self.get()
+        # compute alignments, check hemistiches, sort by score
+        possible = [(align, fem,
+                     check_hemistiches(align, pattern.hemistiches))
+                    for (align, fem) in parse(line, pattern.length + 2)]
+        possible = sorted([(self.rate(pattern, x), x) for x in possible],
+                          key=(lambda x: x[0]))
+
+        errors = []
+
+        # check metric
+        if len(possible) == 0 or possible[0][0] != 0:
+            errors.append(error.ErrorBadMetric(possible))
+        if len(possible) == 0:
+            return errors, pattern
+        # keep the best alignment as hypotheses
+        best = possible[0][0]
+        possible = [(score, align) for (score, align) in possible
+                    if score == best]
+
+        # rhymes
+        if pattern.myid not in self.env:
+            # initialize the rhyme
+            self.env[pattern.myid] = rhyme.init_rhyme(normalize(line),
+                                                      pattern.rhyme)
+        else:
+            # update the rhyme
+            old = list(self.env[pattern.myid])
+            self.env[pattern.myid] = rhyme.check_rhyme(
+                self.env[pattern.myid], (normalize(line), pattern.rhyme))
+            # no more possible rhymes, something went wrong
+            if (self.env[pattern.myid][1] is None and
+                    len(self.env[pattern.myid][0]) == 0):
+                errors.append(error.ErrorBadRhymeSound(old, None))
+
+        # rhyme genres
+        # TODO refactor this
+        if pattern.femid not in self.femenv:
+            if pattern.femid == 'M':
+                x = set(['M'])
+            elif pattern.femid == 'F':
+                x = set(['F'])
+            else:
+                x = set(['M', 'F'])
+            self.femenv[pattern.femid] = x
+        else:
+            # TODO this is simplistic and order-dependent
+            if pattern.femid.swapcase() in self.femenv:
+                new = set(['M', 'F']) - self.femenv[pattern.femid.swapcase()]
+                if len(new) > 0:
+                    self.femenv[pattern.femid] = new
+
+        old = list(self.femenv[pattern.femid])
+        new = list(set(['F' if x[1] else 'M' for (score, x) in possible]))
+        self.femenv[pattern.femid] &= set(new)
+        if len(self.femenv[pattern.femid]) == 0:
+            errors.append(error.ErrorBadRhymeGenre(old, new))
+
+        return errors, pattern
+
+    def parse_template(self, l):
+        """Parse template from a line.
+
+        The line holds up to four whitespace-separated fields: the
+        metric, the rhyme id, the genre id and a '|'-separated list of
+        integers.  Missing ids default to the current pattern line
+        number; the integer list defaults to [1] and is padded with -1
+        up to three items.
+        """
+        # split() without argument, so repeated spaces or tabs do not
+        # produce empty fields
+        fields = l.split()
+        metric = fields[0]
+        default_id = str(self.pattern_line_no)
+        myid = fields[1] if len(fields) >= 2 else default_id
+        femid = fields[2] if len(fields) >= 3 else default_id
+        if len(fields) >= 4:
+            rhyme_spec = [int(x) for x in fields[3].split('|')]
+        else:
+            rhyme_spec = [1]
+        while len(rhyme_spec) < 3:
+            rhyme_spec.append(-1)
+        return Pattern(metric, myid, femid, rhyme_spec)
+
+    def reset_conditional(self, d):
+        """Return a copy of d restricted to the keys ending with '!'"""
+        return dict((k, v) for k, v in d.items() if k.endswith('!'))
+
+    def reset_state(self, with_femenv=False):
+        """Reset our state, except ids ending with '!'
+
+        with_femenv is accepted but not used.
+        """
+        self.position = 0
+        self.env = self.reset_conditional(self.env)
+        self.femenv = self.reset_conditional(self.femenv)
+
+    def get(self):
+        """Get next state, resetting if needed"""
+        if self.position >= len(self.template):
+            self.reset_state()
+        result = self.template[self.position]
+        self.position += 1
+        return result
+
+    def check(self, line):
+        """Check line (wrapper).
+
+        Return the list of errors found for line (empty for a blank
+        line, which does not consume a pattern).
+        """
+        self.line_no += 1
+        line = line.rstrip()
+        if line == '':
+            return []
+        errors, pattern = self.match(line)
+        # 'err', not 'error': do not shadow the imported error module
+        for err in errors:
+            err.pos(line, self.line_no, pattern)
+        return errors
+