republique

helper scripts for www.republique-numerique.fr
git clone https://a3nm.net/git/republique/
Log | Files | Refs | README

commit 3920f022ff8117cfb893a7a676c99d3f093a86fc
parent da6b3b320e64b16294994a78dd86d79201fccc7c
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Wed, 14 Oct 2015 21:49:44 +0100

stdump.py

Diffstat:
README | 1+
stdump.py | 72++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 73 insertions(+), 0 deletions(-)

diff --git a/README b/README
@@ -25,6 +25,7 @@ information (in French).
 - dump_all.sh DEST
   dump raw JSON of all opinions and versions to DEST/opinions and DEST/versions
   (also dumps a few ones that don't really exist and must be filtered)
+  You can then pretty-print to TSV with stdump.py DEST
 
 Examples:
 
diff --git a/stdump.py b/stdump.py
@@ -0,0 +1,72 @@
+#!/usr/bin/python3
+
+"""dump all opinions, versions, and arguments to TSV"""
+
+import html
+from itertools import chain
+from json import load, dumps
+from os import listdir
+from os.path import isfile, join
+import sys
+
+def getAuthor(j):
+    pref = "https://www.republique-numerique.fr/profile/user/"
+    s = j['_links']['profile']
+    assert(s.startswith(pref))
+    return s[len(pref):]
+
+def opinionId(j, i=None):
+    return 'opinion/%d' % j['id']
+
+def argumentId(j, i):
+    return (i + '/argument/%d') % j['id']
+
+def versionId(j, i=None):
+    return 'opinion/%d/version/%d' % (j['parent']['id'], j['id'])
+
+def parse(j, i, key, idF):
+    try:
+        j = j[key]
+    except KeyError:
+        pass
+    nid = idF(j, i)
+    author = getAuthor(j['author'])
+    try:
+        votes_ok = j['votes_ok']
+        votes_mitige = j['votes_mitige']
+        votes_nok = j['votes_nok']
+        votes_total = j['votes_total']
+    except KeyError:
+        votes_ok = j['votes_count']
+        votes_mitige = 0
+        votes_nok = 0
+        votes_total = 0
+    url = j['_links']['show']
+    body = html.unescape(j['body']).split('\n')[0]
+    yield (nid, author, votes_ok, votes_mitige, votes_nok, votes_total,
+            url, body)
+    if 'arguments' in j.keys():
+        for ja in j['arguments']:
+            for r in parse(ja, nid, 'argument', argumentId):
+                yield r
+
+def iterfiles(fdir, key, idF):
+    for f in listdir(fdir):
+        if not isfile(join(fdir, f)):
+            continue
+        with open(join(fdir, f)) as fh:
+            j = load(fh)
+            try:
+                if j['code'] == 404:
+                    continue
+            except KeyError:
+                pass
+            for res in parse(j, None, key, idF):
+                yield res
+
+if __name__ == '__main__':
+    dump = sys.argv[1]
+    for t in chain(iterfiles(join(dump, 'opinions'), 'opinion', opinionId),
+            iterfiles(join(dump, 'versions'), 'version', versionId)):
+        print ("\t".join(str(x) for x in t))
+