common.py (1701B)
#!/bin/python3
"""Common constants and helpers for querying republique-numerique.fr."""

import json
import time

from bs4 import BeautifulSoup
import requests

HEADERS = {'User-Agent': 'Mozilla'}
HEADERS_JSON = {
    'Accept': "application/json"
}
HEADERS_JSON.update(HEADERS)

URL = 'https://www.republique-numerique.fr%s'
API_URL = 'https://www.republique-numerique.fr/api/%s'

# Seconds to wait for the server before giving up on a request; without a
# timeout, requests blocks indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30
# Pause (in seconds) observed after every HTTP request.
REQUEST_DELAY = 1


def url2res(relurl, res_title=None):
    """Resolve a site-relative URL to its API resource path.

    The page at ``URL % relurl`` is downloaded and searched for the
    ``<div id="render-opinion">`` element or, failing that, the
    ``<div id="render-opinion-version">`` element.  Its ``data-opinion``
    and ``data-version`` attributes are combined into a path usable with
    ``API_URL``.

    Args:
        relurl: URL path relative to the site root (substituted into ``URL``).
        res_title: Optional, possibly truncated, title the resource is
            expected to have.  When given, the candidate resource is fetched
            from the API and its title compared against this value.

    Returns:
        ``'opinions/<opinion>/versions/<version>'`` when the element carries
        a non-empty ``data-version`` attribute, ``'opinions/<opinion>'``
        otherwise.  ``None`` when ``res_title`` is given and the title
        returned by the API does not start with it.

    Raises:
        IndexError: The page contains neither of the expected elements.
        KeyError: The API answer has no ``opinion`` or ``version`` title.
        requests.exceptions.RequestException: A request failed or timed out.
    """
    # this sucks but I don't know how else to do it
    url = URL % relurl
    page = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    time.sleep(REQUEST_DELAY)
    tree = BeautifulSoup(page.text, 'html.parser')

    # Prefer the opinion container; fall back to the version container.
    div = tree.find('div', id='render-opinion')
    if div is None:
        div = tree.find('div', id='render-opinion-version')
    if div is None:
        # IndexError is kept (it is what indexing the empty result list used
        # to raise), but now it says which page was at fault.
        raise IndexError('no opinion element found at %s' % url)

    opinion = div.get('data-opinion')
    # Tag.get() returns None for a missing attribute, it never raises.
    version = div.get('data-version')
    if version:
        candidate = 'opinions/%s/versions/%s' % (opinion, version)
    else:
        candidate = 'opinions/%s' % opinion
    if not res_title:
        return candidate

    # links to arguments have the same href as the ones to the opinions and
    # versions themselves, so we need to make sure
    check = requests.get(API_URL % candidate, headers=HEADERS_JSON,
                         timeout=REQUEST_TIMEOUT)
    time.sleep(REQUEST_DELAY)
    check_v = json.loads(check.text)
    try:
        real_title = check_v['opinion']['title']
    except KeyError:
        real_title = check_v['version']['title']

    # be cautious because of broken unicode truncation: drop the last two
    # characters of the expected title before comparing prefixes
    res_title = res_title[:-2]
    if real_title.startswith(res_title):
        return candidate
    # the link was probably to an argument and not to the opinion or version
    # itself
    return None