dblp_db_community

Identify members of the database theory community using DBLP
git clone https://a3nm.net/git/dblp_db_community/
Log | Files | Refs

retrieve.sh (3849B)


      1 #!/bin/bash
      2 
      3 # note: if you change the years, make sure to remove all downloaded files!
      4 
      5 FROM_YEAR=1900
      6 TO_YEAR=2018 # assumed to be >= 2010, and assumed to be <= 2023 (PODS changed to PACMMOD)
      7 MIN_ACTIVE=2015
      8 
      9 mkdir -p authorsfiles conffiles
     10 
     11 for d in `seq 1982 $TO_YEAR`; do 
     12   D2=$(echo "$d" | sed 's/^19//' | sed 's/2002/02/')
     13   if [ ! -s conffiles/conf_PODS_$d.xml ]
     14   then
     15     curl "https://dblp.uni-trier.de/db/conf/pods/pods${D2}.xml" \
     16       > conffiles/conf_PODS_$d.xml;
     17     sleep 1
     18   fi
     19 done
     20 
     21 for d in 1986 1988 1990 1992 `seq 1995 2 2009` `seq 2010 $TO_YEAR`; do 
     22   D2=$(echo "$d" | sed 's/^19//')
     23   if [ ! -s conffiles/conf_ICDT_$d.xml ]
     24   then
     25     curl "https://dblp.uni-trier.de/db/conf/icdt/icdt${D2}.xml" \
     26       > conffiles/conf_ICDT_$d.xml;
     27     sleep 1
     28   fi
     29 done
     30 
     31 rm -f authorsfiles/c_authors_consolidated.txt
     32 touch authorsfiles/c_authors_consolidated.txt
     33 
     34 cd conffiles
     35 
     36 for c in PODS ICDT; do
     37 for f in conf_${c}_????.xml; do
     38   DEST1="${f#conf_}"
     39   DEST2="${DEST1%.xml}"
     40   Y="${DEST2: -4}"
     41   if (( Y >= FROM_YEAR && Y <= TO_YEAR ))
     42   then
     43     DEST="authors_${DEST2}.txt"
     44     echo "$f"
     45     xmlstarlet sel -T -t -m "//inproceedings/author" \
     46         -m . -c '.' -n <"$f" >"$DEST"
     47     cat "$DEST" >> ../authorsfiles/c_authors_consolidated.txt
     48   fi
     49 done
     50 done
     51 
     52 cd ..
     53 
     54 mkdir -p authors
     55 
     56 cat authorsfiles/c_authors_consolidated.txt | sort | uniq | while read l; do
     57   ENAME=$(echo "$l" | sed 's/ /%20/g');
     58   if [ ! -s "authors/$l.raw.xml" ]
     59   then
     60     echo "$l"
     61     curl "https://dblp.org/search/author/api?h=1000&q=$ENAME" \
     62       > "authors/$l.raw.xml"
     63     sleep 1
     64   fi
     65 done
     66 
     67 cd authors
     68 for a in *.raw.xml; do
     69   aa=${a%.raw.xml}
     70   b=$aa.page.xml
     71   if [ ! -s "$b" ]
     72   then
     73     echo "$a"
     74     echo "$aa"
     75     # favor exact matches
     76     URL=$(../get_dblp_url "$aa")
     77     curl -L "${URL}.xml" > "$b"
     78     sleep 1
     79   fi
     80 done
     81 
     82 cd ..
     83 
     84 cd conffiles
     85 
     86 for a in authors_*; do
     87   b="authorsn_${a#authors_}"
     88   DEST1="${b#authors_}"
     89   DEST2="${DEST1%.txt}"
     90   Y="${DEST2: -4}"
     91   if (( Y >= FROM_YEAR && Y <= TO_YEAR ))
     92   then
     93     cat $a | while read l; do
     94       V=$(xmlstarlet sel -T -t -m '/dblpperson' -v @name -n < "../authors/$l.page.xml")
     95       if [ ! -e "../authors/${V}.page.xml" ]
     96       then
     97         ln -s "${l}.page.xml" "../authors/${V}.page.xml"
     98       fi
     99       if [ ! -e "../authors/${V}.raw.xml" ]
    100       then
    101         ln -s "${l}.raw.xml" "../authors/${V}.raw.xml"
    102       fi
    103       echo "$V"
    104     done > "../authorsfiles/$b"
    105   fi
    106 done
    107 
    108 cd ..
    109 cd authorsfiles
    110 
    111 for a in authorsn_*; do
    112   b="authorsn_${a#authors_}"
    113   DEST1="${b#authors_}"
    114   DEST2="${DEST1%.txt}"
    115   Y="${DEST2: -4}"
    116   if (( Y >= FROM_YEAR && Y <= TO_YEAR ))
    117   then
    118     b=$(echo $a | sed 's/\.txt$//;s/^authorsn_...._//');
    119     cat "$a" | tr ' ' '_' | sed "s/\$/ $b/g" | sort | uniq;
    120   fi
    121   done |
    122     sort | uniq |
    123     awk '{
    124       n[$1]++;
    125       if (mn[$1] == 0 || mn[$1] > $2) { mn[$1] = $2; };
    126         if (mx[$1] == 0 || mx[$1] < $2) { mx[$1] = $2; };
    127       } END {
    128         for (k in mn) { print (k, mn[k], mx[k], n[k]); }
    129       } ' | sort | sort -k4,4n  | awk '{print $1, $2, $3, $4, $3-$2}' \
    130         > ../stats.txt
    131 cd ..
    132 cat stats.txt | awk '$4 >= 3' | awk '$3 >= '"$MIN_ACTIVE"' && $3 <= '"$TO_YEAR" > actives.txt
    133 cat stats.txt | awk '$4 >= 10' > notables.txt
    134 
    135 cut -d' ' -f1 actives.txt|
    136   while read l; do 
    137     xmlstarlet sel -T -t -m '//inproceedings[year>='"$MIN_ACTIVE"' and year<='"$TO_YEAR"']/booktitle' -c '.' -n \
    138       < "authors/$(echo $l | tr '_' ' ').page.xml" |
    139       sed 's/ ([0-9]*)//g' | sort | uniq
    140   done | sort | uniq -c  | sort -nr > venues.txt
    141 
    142 cat actives.txt| cut -d' ' -f1 | while read l; do ./display.sh $l; done |
    143   sort | sed 's/ $//;s/  / /g'  > actives.md
    144 
    145 sort -k3,3nr notables.txt| cut -d' ' -f1,3 |
    146   while read l; do
    147     A=$(echo $l | cut -d ' ' -f1); B=$(echo $l | cut -d' ' -f2); 
    148     ./display.sh $A "$B:"; 
    149   done | sed 's/ $//' > notables.md
    150