commit 3f5df619548f4ed5e83ca97a7cdedb47f005aafb
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Sun, 1 Jun 2025 11:35:32 +0200
initial commit
Diffstat:
4 files changed, 183 insertions(+), 0 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,9 @@
+conffiles
+authorsfiles
+authors
+venues.txt
+stats.txt
+notables.txt
+notables.md
+actives.md
+actives.txt
diff --git a/display.sh b/display.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Usage: display.sh NAME_WITH_UNDERSCORES [PREFIX]
+# Prints one Markdown list item for an author: the optional PREFIX, the name
+# in bold, a dblp link, and the links listed in authors/NAME.page.xml.
+N=${1//_/ }     # files in authors/ are named with spaces, callers pass underscores
+N2=${N/ 0001/}  # the displayed name drops the first " 0001"
+printf '%s' "- $2 **$N2** "
+FILE2="authors/$N.page.xml"
+# get_dblp_url reads "$N.raw.xml" from the current directory, so run it in a
+# subshell inside authors/; a failed cd can then never move this script up a level.
+URL=$(cd authors && ../get_dblp_url "$N")
+printf '%s' "[[dblp]]($URL) "
+xmlstarlet sel -T -t -m '/dblpperson/person/url' -c '.' -n < "$FILE2" |
+ grep -vE '^https?://(dl.acm.org|id.loc.gov|isni.org|viaf.org|www.genealogy.ams.org|www.andrej.com|awards.acm.org|www.acm.org|scholar.google.com|zbmath.org|www.scopus.com|d-nb.info|openlibrary.org|ethw.org|id.worldcat.org|www.idref.fr|www.twitter.com|www.researcherid.com|twitter.com|[a-z][a-z].linkedin.|www.ams.org|www.genealogy.math)' |
+ sed 's_https://www.wikidata.org.*_[[wikidata]](&)_;s_https\?://en.wikipedia.org.*_[[wikipedia]](&)_;s_https://orcid.org.*_[[orcid]](&)_;s_^http.*_[[webpage]](&)_' |
+ sort | tr '\n' ' ' | cat - <(echo)
diff --git a/get_dblp_url b/get_dblp_url
@@ -0,0 +1,8 @@
+#!/bin/bash
+# Usage: get_dblp_url NAME -- print the dblp URL for NAME, read from ./NAME.raw.xml.
+Q=${1//\'/\',\"\'\",\'}  # XPath 1.0 cannot escape quotes: splice apostrophes in via concat()
+URL=$(xmlstarlet sel -T -t -m "/result/hits/hit/info[author=concat('$Q','')]" -c url -n < "$1.raw.xml" | head -1)
+if [[ -z "$URL" ]]; then  # no hit with exactly this author name: use the first hit
+ URL=$(xmlstarlet sel -T -t -m /result/hits/hit/info/url -c . -n < "$1.raw.xml" | head -1)
+fi
+echo "$URL"
diff --git a/retrieve.sh b/retrieve.sh
@@ -0,0 +1,150 @@
+#!/bin/bash
+# Builds author statistics for PODS and ICDT from data downloaded from dblp.
+# note: if you change the years, make sure to remove all downloaded files!
+
+FROM_YEAR=1900
+TO_YEAR=2018 # assumed to be >= 2010, and assumed to be <= 2023 (PODS changed to PACMMOD)
+MIN_ACTIVE=2015
+
+mkdir -p authorsfiles conffiles
+
+# Fetch each PODS proceedings file once; a non-empty file counts as already downloaded.
+for d in $(seq 1982 "$TO_YEAR"); do
+  D2=$(echo "$d" | sed 's/^19//' | sed 's/2002/02/')  # URL suffix: 82..99, 02 for 2002, else the year
+  if [[ ! -s "conffiles/conf_PODS_$d.xml" ]]; then
+    # -f: on an HTTP error the file stays empty, so the next run retries it
+    curl -f "https://dblp.uni-trier.de/db/conf/pods/pods${D2}.xml" \
+      > "conffiles/conf_PODS_$d.xml"
+    sleep 1
+  fi
+done
+
+# ICDT: only the listed years are fetched before 2010, then every year up to TO_YEAR.
+for d in 1986 1988 1990 1992 $(seq 1995 2 2009) $(seq 2010 "$TO_YEAR"); do
+  D2=$(echo "$d" | sed 's/^19//')
+  if [[ ! -s "conffiles/conf_ICDT_$d.xml" ]]; then
+    # -f: on an HTTP error the file stays empty, so the next run retries it
+    curl -f "https://dblp.uni-trier.de/db/conf/icdt/icdt${D2}.xml" \
+      > "conffiles/conf_ICDT_$d.xml"
+    sleep 1
+  fi
+done
+
+# Start from an empty consolidated author list on every run.
+: > authorsfiles/c_authors_consolidated.txt
+
+cd conffiles
+
+# Write one authors_<CONF>_<YEAR>.txt per proceedings file whose year lies in
+# [FROM_YEAR, TO_YEAR], and append the same lines to the consolidated list.
+for conf in PODS ICDT; do
+  for xml in conf_${conf}_????.xml; do
+    stem="${xml#conf_}"
+    stem="${stem%.xml}"
+    year="${stem: -4}"
+    if (( year >= FROM_YEAR && year <= TO_YEAR )); then
+      echo "$xml"
+      xmlstarlet sel -T -t -m "//inproceedings/author" \
+        -m . -c '.' -n <"$xml" | tee "authors_${stem}.txt" \
+        >> ../authorsfiles/c_authors_consolidated.txt
+    fi
+  done
+done
+
+cd ..
+
+mkdir -p authors
+
+# Query the dblp author search once per distinct author name (cached in authors/).
+sort authorsfiles/c_authors_consolidated.txt | uniq | while read -r l; do
+  if [[ ! -s "authors/$l.raw.xml" ]]; then
+    echo "$l"
+    # -f: on an HTTP error the file stays empty, so the next run retries it
+    curl -f "https://dblp.org/search/author/api?h=1000&q=${l// /%20}" \
+      > "authors/$l.raw.xml"
+    sleep 1
+  fi
+done
+
+# Download each author's dblp person page next to its search result.
+cd authors || exit 1
+for a in *.raw.xml; do
+  aa=${a%.raw.xml}
+  b=$aa.page.xml
+  if [[ ! -s "$b" ]]; then
+    echo "$a"
+    echo "$aa"
+    # favor exact matches
+    URL=$(../get_dblp_url "$aa")
+    # -f: on an HTTP error the file stays empty, so the next run retries it
+    curl -fL "${URL}.xml" > "$b"
+    sleep 1
+  fi
+done
+cd ..
+
+# Map every listed author name to the name on their dblp person page, writing
+# authorsfiles/authorsn_<CONF>_<YEAR>.txt and aliasing the downloaded files.
+cd conffiles || exit 1
+
+for a in authors_*; do
+  b="authorsn_${a#authors_}"
+  DEST2="${a%.txt}"
+  Y="${DEST2: -4}"
+  if (( Y >= FROM_YEAR && Y <= TO_YEAR )); then
+    while read -r l; do
+      V=$(xmlstarlet sel -T -t -m '/dblpperson' -v @name -n < "../authors/$l.page.xml")
+      # Without a usable person page, keep the listed name instead of emitting
+      # an empty author (which would also create ".page.xml"/".raw.xml" links).
+      [[ -n "$V" ]] || V=$l
+      for ext in page raw; do
+        if [[ ! -e "../authors/${V}.${ext}.xml" ]]; then
+          ln -s "${l}.${ext}.xml" "../authors/${V}.${ext}.xml"
+        fi
+      done
+      echo "$V"
+    done < "$a" > "../authorsfiles/$b"
+  fi
+done
+
+cd ..
+cd authorsfiles
+
+# Emit one "Name_With_Underscores YEAR" line per author and year, then fold
+# them into stats.txt: name, first year, last year, number of distinct years,
+# and last-minus-first year, ordered by that count.
+for listing in authorsn_*; do
+  stem="${listing%.txt}"
+  last4="${stem: -4}"
+  if (( last4 >= FROM_YEAR && last4 <= TO_YEAR )); then
+    year=$(sed 's/\.txt$//;s/^authorsn_...._//' <<< "$listing")
+    tr ' ' '_' < "$listing" | sed "s/\$/ $year/g" | sort | uniq
+  fi
+done |
+  sort | uniq |
+  awk '{
+ n[$1]++;
+ if (mn[$1] == 0 || mn[$1] > $2) { mn[$1] = $2; };
+ if (mx[$1] == 0 || mx[$1] < $2) { mx[$1] = $2; };
+ } END {
+ for (k in mn) { print (k, mn[k], mx[k], n[k]); }
+ } ' | sort | sort -k4,4n | awk '{print $1, $2, $3, $4, $3-$2}' \
+  > ../stats.txt
+cd ..
+# actives: >= 3 distinct years, latest one in [MIN_ACTIVE, TO_YEAR]; notables: >= 10 distinct years.
+awk '$4 >= 3' stats.txt | awk '$3 >= '"$MIN_ACTIVE"' && $3 <= '"$TO_YEAR" > actives.txt
+awk '$4 >= 10' < stats.txt > notables.txt
+# venues.txt: per venue, the number of active authors with a paper there in that year window.
+cut -d' ' -f1 actives.txt | while read l; do
+  xmlstarlet sel -T -t -m '//inproceedings[year>='"$MIN_ACTIVE"' and year<='"$TO_YEAR"']/booktitle' -c '.' -n \
+    < "authors/${l//_/ }.page.xml" |
+    sed 's/ ([0-9]*)//g' | sort | uniq
+done | sort | uniq -c | sort -nr > venues.txt
+
+# actives.md: one display.sh entry per active author, sorted.
+cut -d' ' -f1 actives.txt | while read -r l; do ./display.sh "$l"; done |
+  sort | sed 's/ $//;s/[ ][ ]/ /g' > actives.md  # [ ][ ]: collapse the gap left by the empty prefix
+
+# notables.md: notable authors, most recent last year first, each prefixed with "YEAR:".
+sort -k3,3nr notables.txt | cut -d' ' -f1,3 |
+  while read -r A B; do ./display.sh "$A" "$B:"; done |
+  sed 's/ $//' > notables.md
+