retrieve.sh (3849B)
#!/bin/bash
#
# Fetch the raw data used for the PODS/ICDT author statistics:
#   1. the DBLP proceedings XML of every PODS and ICDT edition
#      (stored in conffiles/),
#   2. the author names listed in those proceedings (one file per edition in
#      conffiles/, plus authorsfiles/c_authors_consolidated.txt),
#   3. for every author, the DBLP author-search result
#      (authors/NAME.raw.xml) and the person page (authors/NAME.page.xml).
# Files that already exist and are non-empty are never downloaded again.
#
# Requires curl, xmlstarlet and an executable ./get_dblp_url in the directory
# the script is run from.

# note: if you change the years, make sure to remove all downloaded files!

FROM_YEAR=1900
TO_YEAR=2018 # assumed to be >= 2010, and assumed to be <= 2023 (PODS changed to PACMMOD)
MIN_ACTIVE=2015

#######################################
# Download into a file, then sleep one second before the next request.
# curl runs with -f so that an HTTP error does not store the server's error
# page; on any failure the destination is left empty, which makes the
# "-s" (non-empty) checks below retry it on the next run.
# Arguments: $1 - destination file
#            remaining arguments - passed to curl (options and URL)
#######################################
download() {
  local dest=$1
  shift
  if ! curl -f "$@" > "$dest"; then
    printf 'warning: download failed, will be retried on next run: %s\n' \
      "$dest" >&2
    : > "$dest" # discard partial content
  fi
  sleep 1
}

mkdir -p authorsfiles conffiles

# PODS: one edition per year since 1982.
for (( d = 1982; d <= TO_YEAR; d++ )); do
  D2=${d#19}       # 19xx editions are requested with a two-digit year ...
  D2=${D2/2002/02} # ... and so is 2002
  dest="conffiles/conf_PODS_$d.xml"
  if [[ ! -s "$dest" ]]; then
    download "$dest" "https://dblp.uni-trier.de/db/conf/pods/pods${D2}.xml"
  fi
done

# ICDT: a fixed list of early editions, every second year from 1995 to 2009,
# then every year from 2010 on.
icdt_years=(1986 1988 1990 1992)
for (( d = 1995; d <= 2009; d += 2 )); do icdt_years+=("$d"); done
for (( d = 2010; d <= TO_YEAR; d++ )); do icdt_years+=("$d"); done

for d in "${icdt_years[@]}"; do
  D2=${d#19}
  dest="conffiles/conf_ICDT_$d.xml"
  if [[ ! -s "$dest" ]]; then
    download "$dest" "https://dblp.uni-trier.de/db/conf/icdt/icdt${D2}.xml"
  fi
done

# Start from an empty consolidated author list on every run.
rm -f authorsfiles/c_authors_consolidated.txt
touch authorsfiles/c_authors_consolidated.txt

cd conffiles || exit 1

# Extract the author names of each edition within [FROM_YEAR, TO_YEAR] into
# authors_<CONF>_<YEAR>.txt and append them to the consolidated list.
for c in PODS ICDT; do
  for f in conf_"${c}"_????.xml; do
    [[ -e "$f" ]] || continue # glob matched nothing
    DEST1="${f#conf_}"
    DEST2="${DEST1%.xml}"
    Y="${DEST2: -4}"
    [[ "$Y" =~ ^[1-9][0-9]{3}$ ]] || continue # not a year: skip
    if (( Y >= FROM_YEAR && Y <= TO_YEAR )); then
      DEST="authors_${DEST2}.txt"
      echo "$f"
      xmlstarlet sel -T -t -m "//inproceedings/author" \
        -m . -c '.' -n < "$f" > "$DEST"
      cat "$DEST" >> ../authorsfiles/c_authors_consolidated.txt
    fi
  done
done

cd .. || exit 1

mkdir -p authors

# Query the DBLP author search once per distinct author name.
sort authorsfiles/c_authors_consolidated.txt | uniq |
  while read -r l; do
    [[ -n "$l" ]] || continue # a blank line would create authors/.raw.xml
    if [[ ! -s "authors/$l.raw.xml" ]]; then
      echo "$l"
      # -G turns the data items into the query string; --data-urlencode
      # percent-encodes the whole name, not only its spaces.
      download "authors/$l.raw.xml" -G --data "h=1000" \
        --data-urlencode "q=$l" "https://dblp.org/search/author/api"
    fi
  done

cd authors || exit 1

# Resolve each search result to a person URL and download that person's page.
for a in *.raw.xml; do
  [[ -e "$a" ]] || continue # glob matched nothing
  aa=${a%.raw.xml}
  b=$aa.page.xml
  if [[ ! -s "$b" ]]; then
    echo "$a"
    echo "$aa"
    # favor exact matches
    URL=$(../get_dblp_url "$aa")
    if [[ -z "$URL" ]]; then
      printf 'warning: no DBLP URL found for "%s"\n' "$aa" >&2
      continue
    fi
    download "$b" -L "${URL}.xml"
  fi
done

cd .. || exit 1

# ---------------------------------------------------------------------------
# Normalize author names and build the statistics and reports.
# Uses FROM_YEAR, TO_YEAR and MIN_ACTIVE set earlier in this script, the
# per-edition author lists conffiles/authors_*.txt and the person pages
# authors/NAME.page.xml. Produces authorsfiles/authorsn_*.txt, stats.txt,
# actives.txt, notables.txt, venues.txt, actives.md and notables.md.
# ---------------------------------------------------------------------------

cd conffiles || exit 1

# Replace every author name by the name attribute of the root <dblpperson>
# element of that author's person page, and make the page and search result
# reachable under that name through symlinks.
for a in authors_*; do
  [[ -e "$a" ]] || continue # glob matched nothing
  b="authorsn_${a#authors_}"
  stem=${a%.txt}
  Y="${stem: -4}"
  [[ "$Y" =~ ^[1-9][0-9]{3}$ ]] || continue # not a year: skip
  if (( Y >= FROM_YEAR && Y <= TO_YEAR )); then
    while read -r l; do
      [[ -n "$l" ]] || continue
      V=""
      if [[ -s "../authors/$l.page.xml" ]]; then
        V=$(xmlstarlet sel -T -t -m '/dblpperson' -v @name -n \
          < "../authors/$l.page.xml")
      fi
      if [[ -z "$V" ]]; then
        # Without this guard an unresolved author would create the symlinks
        # authors/.page.xml and authors/.raw.xml and put a blank line (hence
        # a malformed row) into the statistics.
        printf 'warning: no DBLP person name for "%s", skipped\n' "$l" >&2
        continue
      fi
      if [[ ! -e "../authors/${V}.page.xml" ]]; then
        ln -s "${l}.page.xml" "../authors/${V}.page.xml"
      fi
      if [[ ! -e "../authors/${V}.raw.xml" ]]; then
        ln -s "${l}.raw.xml" "../authors/${V}.raw.xml"
      fi
      echo "$V"
    done < "$a" > "../authorsfiles/$b"
  fi
done

cd .. || exit 1
cd authorsfiles || exit 1

# stats.txt has one line per author, sorted by the fourth column:
#   name (spaces replaced by '_'), first year, last year,
#   number of distinct years, last year - first year
for a in authorsn_*; do
  [[ -e "$a" ]] || continue # glob matched nothing
  stem=${a%.txt}
  Y="${stem: -4}"
  [[ "$Y" =~ ^[1-9][0-9]{3}$ ]] || continue # not a year: skip
  if (( Y >= FROM_YEAR && Y <= TO_YEAR )); then
    # Drop "authorsn_" plus the four-character conference name and "_",
    # which leaves the year; it is appended to every author line.
    b=${stem#authorsn_????_}
    tr ' ' '_' < "$a" | sed "s/\$/ $b/g" | sort | uniq
  fi
done |
  sort | uniq |
  awk '{
    n[$1]++;
    if (mn[$1] == 0 || mn[$1] > $2) { mn[$1] = $2; };
    if (mx[$1] == 0 || mx[$1] < $2) { mx[$1] = $2; };
  } END {
    for (k in mn) { print k, mn[k], mx[k], n[k]; }
  }' |
  sort | sort -k4,4n |
  awk '{print $1, $2, $3, $4, $3-$2}' \
  > ../stats.txt

cd .. || exit 1

# actives: at least 3 distinct years, last one within [MIN_ACTIVE, TO_YEAR].
awk -v first="$MIN_ACTIVE" -v last="$TO_YEAR" \
  '$4 >= 3 && $3 >= first && $3 <= last' stats.txt > actives.txt
# notables: at least 10 distinct years.
awk '$4 >= 10' stats.txt > notables.txt

# venues.txt: for every booktitle of an <inproceedings> entry dated within
# [MIN_ACTIVE, TO_YEAR] on the person pages of the active authors, the number
# of active authors having it, most frequent first. A " (digits)" suffix is
# removed from the booktitle before counting.
cut -d' ' -f1 actives.txt |
  while read -r l; do
    xmlstarlet sel -T -t \
      -m "//inproceedings[year>=$MIN_ACTIVE and year<=$TO_YEAR]/booktitle" \
      -c '.' -n \
      < "authors/${l//_/ }.page.xml" |
      sed 's/ ([0-9]*)//g' | sort | uniq
  done | sort | uniq -c | sort -nr > venues.txt

# NOTE(review): the second sed expression below, s/ / /g, replaces a space by
# a space and so does nothing as written; it may have been meant to collapse
# doubled spaces -- confirm against the script's history before changing it.
cut -d' ' -f1 actives.txt |
  while read -r l; do ./display.sh "$l"; done |
  sort | sed 's/ $//;s/ / /g' > actives.md

# Notables ordered by last year, most recent first; display.sh receives the
# author and "<last year>:" as its two arguments.
sort -k3,3nr notables.txt | cut -d' ' -f1,3 |
  while read -r A B; do
    ./display.sh "$A" "$B:"
  done | sed 's/ $//' > notables.md