commit 3920f022ff8117cfb893a7a676c99d3f093a86fc
parent da6b3b320e64b16294994a78dd86d79201fccc7c
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Wed, 14 Oct 2015 21:49:44 +0100
stdump.py
Diffstat:
README | | | 1 | + |
stdump.py | | | 72 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
2 files changed, 73 insertions(+), 0 deletions(-)
diff --git a/README b/README
@@ -25,6 +25,7 @@ information (in French).
- dump_all.sh DEST
dump raw JSON of all opinions and versions to DEST/opinions and DEST/versions
(also dumps a few ones that don't really exist and must be filtered)
+ You can then pretty-print to TSV with stdump.py DEST
Examples:
diff --git a/stdump.py b/stdump.py
@@ -0,0 +1,72 @@
+#!/usr/bin/python3
+
+"""dump all opinions, versions, and arguments to TSV"""
+
+import html
+from itertools import chain
+from json import load, dumps
+from os import listdir
+from os.path import isfile, join
+import sys
+
def getAuthor(j):
    """Return the user name found at the end of an author's profile URL.

    j is the author object; its profile URL is read from
    j['_links']['profile'].

    Raises ValueError if the profile URL does not start with the expected
    republique-numerique.fr profile prefix.
    """
    pref = "https://www.republique-numerique.fr/profile/user/"
    s = j['_links']['profile']
    if not s.startswith(pref):
        # Raise explicitly rather than assert: asserts are removed under
        # `python -O`, which would let a malformed URL be sliced blindly.
        raise ValueError("unexpected profile URL: %r" % (s,))
    return s[len(pref):]
+
def opinionId(j, i=None):
    """Build the identifier 'opinion/<id>' for the opinion object j.

    The second parameter is not used; it is accepted so that all id
    builders can be called as idF(j, i).
    """
    ident = j['id']
    return 'opinion/%d' % (ident,)
+
def argumentId(j, i):
    """Return the identifier of argument j nested under parent identifier i.

    The result is '<i>/argument/<id>' where <id> is j['id'].
    """
    # `%` binds tighter than `+`, so only the literal suffix is used as a
    # format string; a '%' inside the parent identifier is left untouched
    # instead of being interpreted as a format directive.
    return i + '/argument/%d' % j['id']
+
def versionId(j, i=None):
    """Build 'opinion/<parent id>/version/<id>' for the version object j.

    The second parameter is not used; it is accepted so that all id
    builders can be called as idF(j, i).
    """
    parent_id = j['parent']['id']
    own_id = j['id']
    return 'opinion/%d/version/%d' % (parent_id, own_id)
+
def parse(j, i, key, idF):
    """Yield one row for item j, then the rows of its nested arguments.

    j    -- decoded JSON object; if it has an entry `key`, that entry is
            used as the item, otherwise j itself is the item.
    i    -- identifier of the parent item, passed through to idF.
    key  -- name of the optional wrapper entry ('opinion', 'version',
            'argument').
    idF  -- callable idF(item, i) returning the item's identifier.

    Each row is the tuple (id, author, votes_ok, votes_mitige, votes_nok,
    votes_total, url, body), where body is the first line of the
    HTML-unescaped body text.
    """
    try:
        j = j[key]
    except KeyError:
        # Item is not wrapped under `key`; use the object as-is.
        pass
    nid = idF(j, i)
    author = getAuthor(j['author'])
    try:
        votes_ok = j['votes_ok']
        votes_mitige = j['votes_mitige']
        votes_nok = j['votes_nok']
        votes_total = j['votes_total']
    except KeyError:
        # No detailed breakdown available: report the single count as
        # votes_ok and zero for the rest.
        # NOTE(review): votes_total stays 0 here even though votes_ok may
        # be non-zero -- confirm this is what consumers expect.
        votes_ok = j['votes_count']
        votes_mitige = 0
        votes_nok = 0
        votes_total = 0
    url = j['_links']['show']
    first_line = html.unescape(j['body']).split('\n')[0]
    # Rows are joined with tabs by the caller, so a tab inside the body
    # would shift columns; a trailing '\r' (CRLF input) would break the row.
    body = first_line.rstrip('\r').replace('\t', ' ')
    yield (nid, author, votes_ok, votes_mitige, votes_nok, votes_total,
           url, body)
    if 'arguments' in j:
        for ja in j['arguments']:
            # Arguments are identified relative to this item's id.
            yield from parse(ja, nid, 'argument', argumentId)
+
def iterfiles(fdir, key, idF):
    """Yield the rows produced by parse() for every JSON file in fdir.

    fdir -- directory containing one JSON document per regular file;
            sub-directories and other non-files are skipped.
    key  -- wrapper key forwarded to parse().
    idF  -- identifier builder forwarded to parse().

    Documents whose top-level 'code' entry equals 404 are skipped.
    Files are visited in sorted name order so the output is reproducible.
    """
    for f in sorted(listdir(fdir)):
        path = join(fdir, f)
        if not isfile(path):
            continue
        # Decode as UTF-8 explicitly rather than relying on the locale's
        # default encoding.
        with open(path, encoding='utf-8') as fh:
            j = load(fh)
        try:
            if j['code'] == 404:
                continue
        except KeyError:
            # No 'code' entry: a regular document.
            pass
        yield from parse(j, None, key, idF)
+
if __name__ == '__main__':
    # The only argument is the dump directory, which must contain the
    # 'opinions' and 'versions' sub-directories.
    if len(sys.argv) < 2:
        # sys.exit with a string prints it to stderr and exits non-zero,
        # instead of failing with a bare IndexError traceback.
        sys.exit('usage: %s DEST' % sys.argv[0])
    dump = sys.argv[1]
    rows = chain(iterfiles(join(dump, 'opinions'), 'opinion', opinionId),
                 iterfiles(join(dump, 'versions'), 'version', versionId))
    for t in rows:
        # One tab-separated line per row.
        print("\t".join(str(x) for x in t))
+