plint

French poetry validator (local mirror of https://gitlab.com/a3nm/plint)
git clone https://a3nm.net/git/plint/
Log | Files | Refs | README

common.py (4254B)


      1 #!/usr/bin/python3
      2 # coding: utf-8
      3 
      4 import unicodedata
      5 import re
      6 
      7 VOWELS = 'aeiouyϾ'
      8 CONSONANTS = "bcçdfghjklmnpqrstvwxzñĝ'"
      9 APOSTROPHES = "'’`"
     10 LEGAL = VOWELS + CONSONANTS + ' -'
     11 
     12 # a variant of x-sampa such that all french phonemes are one-character
     13 SUBSTITUTIONS = [
     14     ('#', 'A~'),
     15     ('$', 'O~'),
     16     (')', 'E~'),
     17     ('(', '9~'),
     18 ]
     19 
     20 # Forbidden at the end of a hemistiche. "-ent" would also be forbidden
     21 # in some cases but not others...
     22 SURE_END_FEM = ['es', 'e', 'ë']
     23 
     24 
     25 # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
     26 def strip_accents_one(s, with_except=False):
     27     """Strip accent from a string with_except keeps specifically 'é' and 'è'"""
     28     r = []
     29     for x in s:
     30         if with_except and x in ['è', 'é']:
     31             r.append(x)
     32         else:
     33             r += unicodedata.normalize('NFD', x)
     34     return r
     35 
     36 
     37 def strip_accents(s, with_except=False):
     38     return ''.join(
     39         (c for c in strip_accents_one(s, with_except)
     40          if unicodedata.category(c) != 'Mn'))
     41 
     42 
     43 def normalize_spaces(text):
     44     """Remove multiple consecutive whitespace"""
     45     return re.sub(r"\s+-*\s*", ' ', text)
     46 
     47 
     48 def remove_punctuation(text, rm_all=False, rm_apostrophe=False,
     49         rm_apostrophe_end=True, rm_all_begin=False):
     50     """Remove punctuation from text"""
     51     text = re.sub("[" + APOSTROPHES + "]", "'", text)  # no weird apostrophes
     52     if rm_apostrophe:
     53         text = re.sub(r"'", "", text)
     54     if rm_apostrophe_end:
     55         text = re.sub(r"'*$", "", text)  # apostrophes at end of line
     56     text = re.sub(r"[‒–—―⁓⸺⸻]", " ", text)  # no weird dashes
     57     text = re.sub(r"^--*\s", " ", text)  # no isolated dashes
     58     text = re.sub(r"--*\s", " ", text)  # no trailing dashes
     59     text = re.sub(r"^\s*-\s*$", " ", text)  # no lone dash
     60     text = re.sub(r"^--*$", "", text)  # no only dashes
     61 
     62     # TODO rather: keep only good chars
     63     if not rm_all:
     64         if rm_all_begin:
     65             pattern = re.compile(r"^[^\w]*", re.UNICODE)
     66             text2b = pattern.sub(' ', text)
     67         else:
     68             text2b = text
     69         pattern = re.compile(r"[^'\w -]", re.UNICODE)
     70         text2 = pattern.sub(' ', text2b)
     71     else:
     72         pattern = re.compile(r"[^\w]", re.UNICODE)
     73         text2 = pattern.sub('', text)
     74     text2 = re.sub(r"\s'*$", " ", text2)  # no lonely apostrophes
     75     text2 = re.sub(r"^'*$", "", text2)  # not only apostrophes
     76     return text2
     77 
     78 
     79 def is_vowels(chunk_text, with_h=False, with_y=True, with_crap=False):
     80     """Test if a chunk is vowels with_h counts 'h' as vowel, with_y allows 'y'"""
     81     if not with_y and chunk_text == 'y':
     82         return False
     83     for char in strip_accents(chunk_text):
     84         if char not in VOWELS and (char != 'h' or not with_h) and (char not in ['*', '?'] or not with_crap):
     85             return False
     86     return True
     87 
     88 
     89 def is_consonants(chunk_text):
     90     """Test if a chunk is consonants"""
     91     for char in strip_accents(chunk_text):
     92         if char not in CONSONANTS:
     93             return False
     94     return True
     95 
     96 
     97 def normalize(text, downcase=True, rm_all=False, rm_apostrophe=False,
     98               rm_apostrophe_end=True, rm_all_begin=False, strip=True):
     99     """Normalize text, ie. lowercase, no useless punctuation or whitespace"""
    100     res = normalize_spaces(remove_punctuation(text.lower() if downcase else text,
    101                                               rm_all=rm_all, rm_apostrophe=rm_apostrophe,
    102                                               rm_apostrophe_end=rm_apostrophe_end,
    103                                               rm_all_begin=rm_all_begin))
    104     if strip:
    105         return res.rstrip().lstrip()
    106     else:
    107         return res
    108 
    109 
    110 def subst(string, subs):
    111     if len(subs) == 0:
    112         return string
    113     return subst(string.replace(subs[0][0], subs[0][1]), subs[1:])
    114 
    115 
    116 def to_xsampa(s):
    117     """convert our modified format to x-sampa"""
    118     return subst(s, SUBSTITUTIONS)
    119 
    120 
    121 def from_xsampa(s):
    122     """convert x-sampa to our modified format"""
    123     return subst(s, [(x[1], x[0]) for x in SUBSTITUTIONS])
    124 
    125 
    126 def get_consonants_regex():
    127     all_consonants = CONSONANTS + CONSONANTS.upper()
    128     consonants_regexp = re.compile(r'([^' + all_consonants + '*-]+)', re.UNICODE)
    129     return consonants_regexp