republique

helper scripts for www.republique-numerique.fr
git clone https://a3nm.net/git/republique/

get_propositions.py (1969B)


#!/usr/bin/python3 -u
# Get opinions and modifications of a user

from common import HEADERS, url2res
from bs4 import BeautifulSoup
import requests
import sys
import time

PROPOSITIONS_URL = 'https://www.republique-numerique.fr/profile/%s/opinions'
VERSIONS_URL = 'https://www.republique-numerique.fr/profile/%s/versions'

if __name__ == '__main__':
    # sys.argv[1:] never raises IndexError, so check explicitly for an
    # empty argument list instead of relying on try/except
    users = sys.argv[1:]
    if not users:
        print("Usage: %s USER...\n"
                "Returns all opinions and modifications of each USER" %
                sys.argv[0], file=sys.stderr)
        sys.exit(1)

    for user in users:
        for url in [PROPOSITIONS_URL % user, VERSIONS_URL % user]:
            data = requests.get(url, headers=HEADERS)
            # be nice to the server: pause between requests
            time.sleep(1)
            tree = BeautifulSoup(data.text, 'html.parser')

            seen = set()

            # filter on the class by hand: find_all('div', class_='opinion__data')
            # does not work on older bs4 versions
            for div in tree.find_all('div'):
                # Tag.get() returns None when the attribute is missing
                # (it never raises KeyError)
                c = div.get("class")
                if not c:
                    continue
                if isinstance(c, str):
                    c = c.split()
                if 'opinion__data' not in c:
                    continue
                # keep the first link that points to a consultation
                res_url = None
                for a in div.find_all('a'):
                    v = a.get('href')
                    if not v:
                        continue
                    if v.startswith('/consultations'):
                        res_url = v
                        break

                if res_url is None:
                    print("warning: no consultation link found, skipping entry",
                            file=sys.stderr)
                    continue

                res = url2res(res_url)
                if res in seen:
                    print("warning: duplicate entry for %s" % res,
                            file=sys.stderr)
                    print("this may indicate a problem with the scraping",
                            file=sys.stderr)
                seen.add(res)
                print("%s 1" % res)
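
The script relies on a small local common module that is not shown on this page. The sketch below is only an illustration of what such a module could provide, assuming that HEADERS is an ordinary requests header dictionary and that url2res reduces a relative consultation URL to a short resource identifier; the actual module in the repository may differ.

# common.py -- hypothetical sketch, not the repository's actual module
HEADERS = {
    # identify the scraper; the real headers may differ
    'User-Agent': 'republique helper scripts (https://a3nm.net/git/republique/)',
}

def url2res(url):
    # Reduce a relative consultation URL ('/consultations/...') to a short
    # resource identifier; here we simply keep the last path component.
    return url.rstrip('/').rsplit('/', 1)[-1]

With such a module available, the script is invoked as ./get_propositions.py USER... and prints one "RESOURCE 1" line per opinion or modification found, reporting duplicates on stderr.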