works well for boileau - plint - French poetry validator (local mirror of https://gitlab.com/a3nm/plint)

commit 38720c219b80d088bf0696bab2d0322de670284a
parent c9c0595a265efe9dd1f6291cf739e35a145918ef
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Sun, 29 May 2011 16:48:22 -0400

works well for boileau

Diffstat:
poetlint.py  | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------

1 file changed, 62 insertions(+), 29 deletions(-)
diff --git a/poetlint.py b/poetlint.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python3
+#!/usr/bin/python3 -u
 
 import re
 import sys
@@ -6,18 +6,24 @@ import unicodedata
 import aspire
 from pprint import pprint
 
-consonants = "[bcçdfghjklmnpqrstvwxz*]"
+consonants = "[bcçdfghjklmnpqrstvwxz*-]"
 vowels = 'aeiouyœæ'
 
 # TODO -ment at hemistiche
 # TODO diaresis
 # TODO rhymes
+# TODO vers en -es sont masc, pas fém
 sure_end_fem = ['es', 'e']
 end_fem = sure_end_fem + ['ent']
 
-count_two = ['aë', 'aï', 'ao', 'ea', 'éa', 'éi', 'éo', 'éu', 'êa', 'êi',
-'êo', 'êu', 'èa', 'èi', 'èo', 'èu', 'oa', 'ua', 'oya']
-can_count_two = ['ia', 'ieue', 'ié', 'iées', 'io', 'iu', 'iue', 'ue']
+count_two = ['aë', 'aï', 'ao', 'éa', 'éi', 'éo', 'éu', 'êa', 'êi',
+'êo', 'êu', 'èa', 'èi', 'èo', 'èu', 'oa', 'oya' , 'ueu', 'euâ', 'éâ',
+'oï', 'aïeu', 'oüoi', 'ouï', 'aïe', 'oè', 'oüé', 'ii', 'uau', 'oé',
+'uï', 'uïe']
+# TODO 'ée' ? ('déesse')
+can_count_two = ['ia', 'ée', 'ieue', 'ieu', 'ua', 'ié', 'iée', 'io', 'iu',
+'iue', 'ue', 'ui', 'ie', 'oue', 'oua', 'oueu', 'ouaie', 'ouai', 'oui', 'iè',
+'oué', 'ué', 'uée', 'uia', 'iai', 'yau', 'uo', 'yo']
 
 # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
 def strip_accents(s):
@@ -30,7 +36,7 @@ def norm_spaces(text):
 
 def rm_punct(text):
   text = re.sub("'", '', text)
-  pattern = re.compile('[^\w ]', re.UNICODE)
+  pattern = re.compile('[^\w -]', re.UNICODE)
   return pattern.sub(' ', text)
 
 def annotate_aspirated(word):
@@ -60,22 +66,29 @@ def possible_weights(chunk):
     return [1,2]
   return [1]
 
-def fit(chunks, left):
+def fit(chunks, left, past):
   if left == 7 and (len(chunks) < 2 or chunks[0] + chunks[1] in
       sure_end_fem):
     # no feminine at hemistiche
-    print ("refuse hemistiche")
-    return None
+    # maybe it's a lone word?
+    ok = False
+    for i in range(2):
+      for j in ' -':
+        if j in past[-i]:
+          ok = True
+    if not ok:
+      print ("refuse hemistiche", file=sys.stderr)
+      return None
   weights = possible_weights(chunks[0])
   for weight in weights:
     nleft = left - weight
-    #print("Take %s with weight %d, left %d" % (chunks[0], weight,
-      #nleft))
+    print("Take %s with weight %d, left %d" % (chunks[0], weight,
+      nleft), file=sys.stderr)
     result = maybe_sum([(chunks[0], weight)], skip(chunks[1:], nleft,
-      nleft == 6))
+      past+[chunks[0]], nleft == 6))
     if result != None:
       return result
-    #print ("FAIL!")
+    print("FAIL!", file=sys.stderr)
   return None
 
 def maybe_sum(a, b):
@@ -84,24 +97,24 @@ def maybe_sum(a, b):
   else:
     return a + b
   
-def skip(chunks, left, expect_space=False):
+def skip(chunks, left, past, expect_space=False):
   result = []
   chunks = list(chunks)
   if len(chunks) > 0 and not is_vowels(chunks[0]):
-    return maybe_sum([chunks[0]], skip(chunks[1:], left, expect_space
-      and not chunks[0] == ' '))
+    return maybe_sum([chunks[0]], skip(chunks[1:], left, past +
+      [chunks[0]], expect_space and not chunks[0] == ' '))
   if len(chunks) == 0:
     if left == 0:
-      #print("OK")
+      print("OK", file=sys.stderr)
       return []
     else:
-      #print("out of chunks")
+      print("out of chunks", file=sys.stderr)
       return None
   if expect_space:
     # we wanted a space and haven't got it, fail
-    #print("wanted space")
+    print("wanted space", file=sys.stderr)
     return None
-  return fit(chunks, left)
+  return fit(chunks, left, past)
 
 def get_feminine(text):
   for end in end_fem:
@@ -117,6 +130,8 @@ def nullify(chunk):
 
 def align(result):
   align, feminine = result
+  if align == None:
+    return "Non."
   l1 = ['F  '] if feminine else ["M  "]
   l2 = ['12 ']
   for x in align:
@@ -130,9 +145,12 @@ def align(result):
 
 def parse(text):
   text = norm_spaces(rm_punct(text.lower())).rstrip().lstrip()
-  end = get_feminine(text)
-  feminine = end != ''
+  oend = get_feminine(text)
+  feminine = oend != ''
+  end = oend
   text = re.sub("qu", 'q', text)
+  text = re.sub("gue", 'ge', text)
+  print(text, file=sys.stderr)
   words = text.split(' ')
   words = [annotate_aspirated(word) for word in words]
   pattern = re.compile('('+consonants+'*)', re.UNICODE)
@@ -141,13 +159,18 @@ def parse(text):
     words[i] = [chunk for chunk in words[i] if chunk != '']
     nwords = []
     for chunk in words[i]:
-      if 'y' not in chunk or len(chunk) == 1:
+      if 'y' not in chunk or len(chunk) == 1 or chunk[0] == 'y':
         nwords.append(chunk)
       else:
         a = chunk.split('y')
         nwords.append(a[0])
         nwords.append('Y')
-        nwords.append(a[1])
+        if a[1] != '':
+          nwords.append(a[1])
+        else:
+          # TODO ouais c'est foutu là...
+          if words[i] == ['p', 'ay', 's']:
+            nwords.append('y')
     words[i] = nwords
     if i > 0:
       if count_vowel_chunks(words[i-1]) > 1:
@@ -157,26 +180,36 @@ def parse(text):
   for word in words:
     word.append(' ')
   chunks = sum(words, [])[:-1]
-  
+ 
+  ochunks = list(chunks)
   end = [chunk for chunk in re.split(pattern, end)
           if chunk != '']
-  if chunks[-(len(end)+1)] != ' ' and chunks[-(len(end)+2)] != ' ' :
+  if len(chunks) >= 2 and chunks[-(len(end)+1)] != ' ' and chunks[-(len(end)+2)] != ' ' :
     if end != []:
       # drop end
       end.reverse()
       nend = []
       for x in end:
-        #print (chunks[-1])
         if chunks[-1] == x:
           chunks.pop()
           nend.append(nullify(x))
       nend.reverse()
       end = nend
   else:
+    try:
+      if end[-1] == chunks[-1] and chunks[-1] == 'nt':
+        feminine = False # OK this looks like fem but isnt (" cent$")
+    except IndexError:
+      pass
     end = []
 
-  #pprint(chunks)
-  return (maybe_sum(skip(chunks, 12), end), feminine)
+  print('/'.join(chunks), file=sys.stderr)
+  result = (maybe_sum(skip(chunks, 12, []), end), feminine)
+  if result[0] == None and oend == 'ent':
+    #super-ugly hack because ending 'ent' sometimes isn't dropped
+    return (maybe_sum(skip(ochunks, 12, []), end), False)
+  else:
+    return result
 
 while True:
   line = sys.stdin.readline()

	plint French poetry validator (local mirror of https://gitlab.com/a3nm/plint)
	git clone https://a3nm.net/git/plint/
	Log | Files | Refs | README