dblp_db_community

Identify members of the database theory community using DBLP
git clone https://a3nm.net/git/dblp_db_community/
Log | Files | Refs

commit 3f5df619548f4ed5e83ca97a7cdedb47f005aafb
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Sun,  1 Jun 2025 11:35:32 +0200

initial commit

Diffstat:
.gitignore | 9+++++++++
display.sh | 16++++++++++++++++
get_dblp_url | 8++++++++
retrieve.sh | 150+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 183 insertions(+), 0 deletions(-)

diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,9 @@
+conffiles
+authorsfiles
+authors
+venues.txt
+stats.txt
+notables.txt
+notables.md
+actives.md
+actives.txt
diff --git a/display.sh b/display.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Usage: ./display.sh Author_Name [LABEL]   (run from the repository root)
+# Prints one Markdown bullet: LABEL, bold name, dblp link, then the person's other URLs.
+N=${1//_/ }     # callers pass underscores; the files under authors/ use spaces
+N2=${N/ 0001/}  # the displayed name drops the first " 0001" it contains
+FILE2="authors/$N.page.xml"
+printf '%s' "- $2 **$N2** "
+# get_dblp_url opens "$N.raw.xml" relative to the current directory, so it is
+# run from authors/; the subshell leaves this script's own directory untouched.
+URL=$(cd authors && ../get_dblp_url "$N")
+printf '%s' "[[dblp]]($URL) "
+xmlstarlet sel -T -t -m '/dblpperson/person/url' -c '.' -n < "$FILE2" |
+  grep -vE '^https?://(dl.acm.org|id.loc.gov|isni.org|viaf.org|www.genealogy.ams.org|www.andrej.com|awards.acm.org|www.acm.org|scholar.google.com|zbmath.org|www.scopus.com|d-nb.info|openlibrary.org|ethw.org|id.worldcat.org|www.idref.fr|www.twitter.com|www.researcherid.com|twitter.com|[a-z][a-z].linkedin.|www.ams.org|www.genealogy.math)' |
+  sed 's_https://www.wikidata.org.*_[[wikidata]](&)_;s_https\?://en.wikipedia.org.*_[[wikipedia]](&)_;s_https://orcid.org.*_[[orcid]](&)_;s_^http.*_[[webpage]](&)_' |
+  sort | tr '\n' ' ' | cat - <(echo)
+
diff --git a/get_dblp_url b/get_dblp_url
@@ -0,0 +1,8 @@
+#!/bin/bash
+# Usage: get_dblp_url NAME -- print the url of the hit for NAME found in ./NAME.raw.xml
+Q="'"; [[ $1 == *"'"* ]] && Q='"'  # XPath 1.0 cannot escape quotes: flip the delimiter
+URL=$(xmlstarlet sel -T -t -m "/result/hits/hit/info[author=${Q}$1${Q}]" -c url -n < "$1.raw.xml" | head -1)
+if [[ -z "$URL" ]]; then  # no hit has exactly this author: take the first hit
+  URL=$(xmlstarlet sel -T -t -m /result/hits/hit/info/url -c . -n < "$1.raw.xml" | head -1)
+fi
+echo "$URL"
diff --git a/retrieve.sh b/retrieve.sh
@@ -0,0 +1,150 @@
+#!/bin/bash
+# Collect PODS/ICDT author statistics from dblp; needs curl and xmlstarlet.
+# note: if you change the years, make sure to remove all downloaded files!
+
+FROM_YEAR=1900
+TO_YEAR=2018 # assumed to be >= 2010, and assumed to be <= 2023 (PODS changed to PACMMOD)
+MIN_ACTIVE=2015
+
+mkdir -p authorsfiles conffiles
+# PODS proceedings, one page per year (19xx and 2002 use a 2-digit year in the URL)
+for d in $(seq 1982 "$TO_YEAR"); do
+  D2=$(echo "$d" | sed 's/^19//' | sed 's/2002/02/')
+  # curl -f saves nothing on an HTTP error, so the -s test retries it next run
+  if [[ ! -s "conffiles/conf_PODS_$d.xml" ]]; then
+    curl -f "https://dblp.uni-trier.de/db/conf/pods/pods${D2}.xml" \
+      > "conffiles/conf_PODS_$d.xml"
+    sleep 1
+  fi
+done
+
+for d in 1986 1988 1990 1992 $(seq 1995 2 2009) $(seq 2010 "$TO_YEAR"); do
+  D2=$(echo "$d" | sed 's/^19//')
+  # same caching rule as in the PODS loop: -f plus -s retries failed downloads
+  if [[ ! -s "conffiles/conf_ICDT_$d.xml" ]]; then
+    curl -f "https://dblp.uni-trier.de/db/conf/icdt/icdt${D2}.xml" \
+      > "conffiles/conf_ICDT_$d.xml"
+    sleep 1
+  fi
+done
+
+rm -f authorsfiles/c_authors_consolidated.txt
+touch authorsfiles/c_authors_consolidated.txt
+
+cd conffiles || exit 1
+# One authors_VENUE_YEAR.txt per proceedings file: one line per paper author.
+for c in PODS ICDT; do
+for f in conf_${c}_????.xml; do
+  DEST1="${f#conf_}"
+  DEST2="${DEST1%.xml}"
+  Y="${DEST2: -4}"
+  # skip proceedings whose year is outside [FROM_YEAR, TO_YEAR]
+  if (( Y >= FROM_YEAR && Y <= TO_YEAR )); then
+    DEST="authors_${DEST2}.txt"
+    echo "$f"
+    xmlstarlet sel -T -t -m "//inproceedings/author" \
+      -m . -c '.' -n <"$f" >"$DEST"
+    cat "$DEST" >> ../authorsfiles/c_authors_consolidated.txt
+  fi
+done
+done
+
+cd .. || exit 1
+
+mkdir -p authors
+# Fetch the dblp author-search result of every distinct name, once.
+sort -u authorsfiles/c_authors_consolidated.txt | while read -r l; do
+  # -G/--data-urlencode make curl percent-encode the whole name, not just its
+  # spaces, so names containing e.g. '&' or '#' cannot corrupt the query string
+  if [[ ! -s "authors/$l.raw.xml" ]]; then
+    echo "$l"
+    curl -f -G --data "h=1000" --data-urlencode "q=$l" "https://dblp.org/search/author/api" \
+      > "authors/$l.raw.xml"
+    sleep 1
+  fi
+done
+
+cd authors || exit 1
+for a in *.raw.xml; do
+  aa=${a%.raw.xml}
+  b=$aa.page.xml
+  # download the person page (dblp URL + ".xml") of each searched name, once
+  if [[ ! -s "$b" ]]; then
+    echo "$a"
+    echo "$aa"
+    # favor exact matches
+    URL=$(../get_dblp_url "$aa")
+    curl -f -L "${URL}.xml" > "$b"
+    sleep 1
+  fi
+done
+
+cd .. || exit 1
+
+cd conffiles || exit 1
+# Rewrite each author list using the name attribute of the person page.
+for a in authors_*; do
+  b="authorsn_${a#authors_}"
+  # the year is the last four characters of the file name without ".txt"
+  DEST2="${a%.txt}"
+  Y="${DEST2: -4}"
+  if (( Y >= FROM_YEAR && Y <= TO_YEAR )); then
+    while read -r l; do
+      V=$(xmlstarlet sel -T -t -m '/dblpperson' -v @name -n < "../authors/$l.page.xml")
+      # an unreadable page gives no name: skip it instead of linking ".page.xml"
+      [[ -n "$V" ]] || { echo "no dblp name for $l" >&2; continue; }
+      # alias the downloaded files under the page's name so later steps find them
+      if [[ ! -e "../authors/${V}.page.xml" ]]; then
+        ln -s "${l}.page.xml" "../authors/${V}.page.xml"
+      fi
+      if [[ ! -e "../authors/${V}.raw.xml" ]]; then
+        ln -s "${l}.raw.xml" "../authors/${V}.raw.xml"
+      fi
+      echo "$V"
+    done < "$a" > "../authorsfiles/$b"
+  fi
+done
+
+cd .. || exit 1
+cd authorsfiles || exit 1
+
+for a in authorsn_*; do
+  # emit "Name_With_Underscores YEAR" for every author of every kept year;
+  # YEAR is the last four characters of the file name, whatever the venue's length
+  DEST2="${a%.txt}"
+  Y="${DEST2: -4}"
+  if (( Y >= FROM_YEAR && Y <= TO_YEAR )); then
+    # underscores keep each name a single whitespace-separated field for awk;
+    # PODS and ICDT both map to the bare year, so a year counts once per author
+    tr ' ' '_' < "$a" | sed "s/\$/ $Y/" | sort -u
+  fi
+  done |
+  sort | uniq |
+  awk '{
+    n[$1]++;
+    if (mn[$1] == 0 || mn[$1] > $2) { mn[$1] = $2; };
+    if (mx[$1] == 0 || mx[$1] < $2) { mx[$1] = $2; };
+  } END {
+    for (k in mn) { print (k, mn[k], mx[k], n[k]); }
+  } ' | sort | sort -k4,4n | awk '{print $1, $2, $3, $4, $3-$2}' \
+  > ../stats.txt
+cd .. || exit 1  # stats.txt columns: name, first year, last year, number of years, span
+awk -v lo="$MIN_ACTIVE" -v hi="$TO_YEAR" '$4 >= 3 && $3 >= lo && $3 <= hi' stats.txt > actives.txt
+awk '$4 >= 10' stats.txt > notables.txt
+# venues.txt: for each venue, how many active authors published there in MIN_ACTIVE..TO_YEAR
+cut -d' ' -f1 actives.txt |
+  while read -r l; do
+    xmlstarlet sel -T -t -m '//inproceedings[year>='"$MIN_ACTIVE"' and year<='"$TO_YEAR"']/booktitle' -c '.' -n \
+      < "authors/${l//_/ }.page.xml" |
+      sed 's/ ([0-9]*)//g' | sort | uniq
+  done | sort | uniq -c | sort -nr > venues.txt
+# display.sh leaves two spaces after "-" when it is given no label: squeeze them
+cut -d' ' -f1 actives.txt | while read -r l; do ./display.sh "$l"; done |
+  sort | sed 's/ $//;s/  / /g' > actives.md
+
+sort -k3,3nr notables.txt | cut -d' ' -f1,3 |
+  while read -r A B; do
+    # A is the author, B the last year they appear in; B labels the bullet
+    ./display.sh "$A" "$B:"
+  done | sed 's/ $//' > notables.md
+