republique

helper scripts for www.republique-numerique.fr
git clone https://a3nm.net/git/republique/
Log | Files | Refs | README

common.py (1701B)


      1 #!/bin/python3
      2 
      3 from bs4 import BeautifulSoup
      4 import json
      5 import requests
      6 import time
      7 
      8 HEADERS = { 'User-Agent': 'Mozilla' }
      9 HEADERS_JSON = {
     10         'Accept': "application/json"
     11         }
     12 HEADERS_JSON.update(HEADERS)
     13 
     14 URL = 'https://www.republique-numerique.fr%s'
     15 API_URL = 'https://www.republique-numerique.fr/api/%s'
     16 
     17 def url2res(relurl, res_title=None):
     18     """get identifier of URL"""
     19     # this sucks but I don't know how else to do it
     20     url = URL % relurl
     21     data = requests.get(url, headers=HEADERS)
     22     time.sleep(1)
     23     tree = BeautifulSoup(data.text, 'html.parser')
     24     divs = (tree.find_all('div', id='render-opinion')
     25             + tree.find_all('div', id='render-opinion-version'))
     26     div = divs[0]
     27     opinion = div.get('data-opinion')
     28     version = None
     29     try:
     30         version = div.get('data-version')
     31     except KeyError:
     32         pass
     33     if version:
     34         candidate = 'opinions/%s/versions/%s' % (opinion, version)
     35     else:
     36         candidate = 'opinions/%s' % opinion
     37     if not res_title:
     38         return candidate
     39     # links to arguments have the same href as the ones to the opinions and
     40     # versions themselves, so we need to make sure
     41     check = requests.get(API_URL % candidate, headers=HEADERS_JSON)
     42     time.sleep(1)
     43     check_v = json.loads(check.text)
     44     try:
     45         real_title = check_v['opinion']['title']
     46     except KeyError:
     47         real_title = check_v['version']['title']
     48     # be caution because of broken unicode truncation
     49     res_title = res_title[:-2]
     50     if real_title.startswith(res_title):
     51         return candidate
     52     else:
     53         # the link was probably to an argument and not to the opinion or version
     54         # itself
     55         return None
     56