stdump.py (2227B)
#!/usr/bin/python3

"""Dump all opinions, versions, and arguments to TSV.

Usage: stdump.py DUMP_DIR

DUMP_DIR is expected to contain an ``opinions`` and a ``versions``
sub-directory, each holding one JSON document per file.  One tab-separated
row is printed to standard output for every opinion, version, and argument,
with the columns:

    id, author, votes_ok, votes_mitige, votes_nok, votes_total, url, body
"""

import html
from itertools import chain
from json import load, dumps
from os import listdir
from os.path import isfile, join
import sys

# Known profile URL prefixes.  The more specific one must come first because
# the second is a prefix of it.
_PROFILE_PREFIXES = (
    "https://www.republique-numerique.fr/profile/user/",
    "https://www.republique-numerique.fr/profile/",
)


def getAuthor(j):
    """Return the author name taken from the tail of the profile URL.

    Args:
        j: author mapping; its ``['_links']['profile']`` entry is read.

    Returns:
        The part of the profile URL that follows the known prefix.

    Raises:
        ValueError: if the URL starts with neither known prefix.  (An explicit
            raise is used instead of ``assert`` so the check survives
            ``python -O``.)
    """
    profile = j['_links']['profile']
    for prefix in _PROFILE_PREFIXES:
        if profile.startswith(prefix):
            return profile[len(prefix):]
    raise ValueError('unexpected profile URL: %r' % (profile,))


def opinionId(j, i=None):
    """Return the row id of opinion *j*: ``opinion/<id>`` (*i* is unused)."""
    return 'opinion/%d' % (j['id'],)


def argumentId(j, i):
    """Return the row id of argument *j* nested under the parent row id *i*.

    Both values are formatted together so that a ``%`` inside *i* is never
    interpreted as a format directive.
    """
    return '%s/argument/%d' % (i, j['id'])


def versionId(j, i=None):
    """Return the row id of version *j*: ``opinion/<parent id>/version/<id>``.

    *i* is unused; the parent id is read from ``j['parent']['id']``.
    """
    return 'opinion/%d/version/%d' % (j['parent']['id'], j['id'])


def _firstLine(text):
    """Return the first line of *text*, made safe for a TSV field.

    Tabs become single spaces and carriage returns are dropped, so the value
    can never introduce an extra column or a stray line break.
    """
    return text.split('\n', 1)[0].replace('\t', ' ').replace('\r', '')


def parse(j, i, key, idF):
    """Yield one row for item *j*, then rows for its nested arguments.

    Args:
        j: decoded JSON mapping.  If it has a *key* entry, that entry is the
            item; otherwise *j* itself is the item.
        i: row id of the parent item (``None`` for top-level items).
        key: name of the optional wrapper entry (e.g. ``'opinion'``).
        idF: callable ``idF(item, i)`` returning the row id.

    Yields:
        Tuples ``(id, author, votes_ok, votes_mitige, votes_nok, votes_total,
        url, body)``.  ``body`` is the HTML-unescaped first line of the item's
        ``title``, or of its ``body`` when there is no ``title``.

    Raises:
        KeyError: if a required entry is missing from the item.
        ValueError: if the author's profile URL is not recognised.
    """
    if key in j:
        j = j[key]
    nid = idF(j, i)
    author = getAuthor(j['author'])
    try:
        votes = (j['votes_ok'], j['votes_mitige'],
                 j['votes_nok'], j['votes_total'])
    except KeyError:
        # Items carrying only a plain counter: report it as both "ok" and
        # "total" votes.
        votes = (j['votes_count'], 0, 0, j['votes_count'])
    url = j['_links']['show']
    try:
        text = j['title']
    except KeyError:
        text = j['body']
    # Unescape first: entities such as "&#9;" decode to characters that must
    # be sanitised too.
    body = _firstLine(html.unescape(text))
    yield (nid, author) + votes + (url, body)
    if 'arguments' in j:
        for ja in j['arguments']:
            yield from parse(ja, nid, 'argument', argumentId)


def iterfiles(fdir, key, idF):
    """Yield the rows of every JSON file found directly inside *fdir*.

    Files are visited in sorted name order so the output is reproducible, and
    are decoded as UTF-8 regardless of the locale.  Sub-directories are
    skipped, as are documents whose top-level ``code`` entry equals 404.

    Args:
        fdir: directory to scan (not recursive).
        key: wrapper entry name passed on to :func:`parse`.
        idF: row-id function passed on to :func:`parse`.
    """
    for name in sorted(listdir(fdir)):
        path = join(fdir, name)
        if not isfile(path):
            continue
        with open(path, encoding='utf-8') as fh:
            doc = load(fh)
        try:
            if doc['code'] == 404:
                continue
        except KeyError:
            pass
        yield from parse(doc, None, key, idF)


def main(argv):
    """Print the TSV dump for the dump directory named by ``argv[1]``."""
    if len(argv) < 2:
        sys.exit('usage: %s DUMP_DIR' % (argv[0] if argv else 'stdump.py'))
    dump = argv[1]
    rows = chain(
        iterfiles(join(dump, 'opinions'), 'opinion', opinionId),
        iterfiles(join(dump, 'versions'), 'version', versionId),
    )
    for t in rows:
        print("\t".join(str(x) for x in t))


if __name__ == '__main__':
    main(sys.argv)