#!/usr/bin/env python
"""adddoi: fill in missing DOI and open-access URL fields of a BibTeX file.

Usage: EMAIL=you@example.org adddoi BIBFILE

For every entry without a DOI, the Crossref guest query form is searched by
title and author (or editor) last name.  For every entry without a URL but
with a DOI, Unpaywall is asked for an open-access location.  The result is
written to ``BIBFILE_doi.bib``; the input file is only read.
"""
# source : https://tex.stackexchange.com/a/300474 and https://tex.stackexchange.com/questions/6810/automatically-adding-doi-fields-to-a-hand-made-bibliography
# users: XachaB, thando, JohnM
import sys
import re
from unidecode import unidecode
import bibtexparser
from bibtexparser.bwriter import BibTexWriter
import requests
import urllib
import urllib.parse  # explicit: ``import urllib`` alone does not load the submodule
from time import sleep
import os
import json

# Address sent to Unpaywall as the ``email`` query parameter.  Importing this
# module raises KeyError when the EMAIL environment variable is not set.
EMAIL = os.environ['EMAIL']


# Search for open-access URL from DOI using unpaywall
def searchurl(doi):
    """Ask Unpaywall whether *doi* has an open-access copy.

    Args:
        doi: DOI string; surrounding whitespace is ignored.

    Returns:
        A pair ``(is_oa, url)``:

        * ``(False, None)`` -- closed access, no OA version known.
        * ``(True, None)`` -- the DOI link itself is enough (publisher-hosted
          OA copy, or a DataCite DOI unknown to Unpaywall).
        * ``(True, url)`` -- best OA copy lives in a repository at *url*
          (the PDF link when reported, otherwise the location's generic URL).

    Raises:
        AssertionError: an API answer does not look as expected (DOI
            mismatch, non-DataCite DOI unknown to Unpaywall, unexpected
            host type).
    """
    # Strip once so the request and the sanity checks below use the same DOI.
    doi = doi.strip()
    doi_quote = urllib.parse.quote(doi)
    url = "https://api.unpaywall.org/v2/" + doi_quote
    params = urllib.parse.urlencode({'email': EMAIL})
    sleep(2)  # throttle: wait before every Unpaywall request
    r = requests.get(url + '?' + params)
    data = json.loads(r.text)
    if data.get('HTTP_status_code') == 404:
        # not found, let's check if this is a crossref DOI
        url_crossref = "https://api.crossref.org/works/" + doi_quote + "/agency"
        r2 = requests.get(url_crossref)
        data_crossref = json.loads(r2.text)
        assert data_crossref['message']['DOI'].lower() == doi.lower(), \
            "Crossref answered for a different DOI than %s" % doi
        assert data_crossref['message']['agency']['id'] == 'datacite', \
            "DOI %s is unknown to Unpaywall and is not a DataCite DOI" % doi
        return True, None  # OK, it is a datacite DOI, so probably OA
    assert data['doi'].lower() == doi.lower(), \
        "Unpaywall answered for a different DOI than %s" % doi
    if not data['is_oa']:
        return False, None  # closed-access and no OA version available :(
    location = data['best_oa_location']
    if location['host_type'] == 'publisher':
        return True, None  # publisher version is OA so DOI link suffices
    assert location['host_type'] == 'repository', \
        "unexpected host type %r for DOI %s" % (location['host_type'], doi)
    # The PDF link may be null; fall back to the location's generic URL so a
    # repository copy is not mistaken for "no URL needed".
    return True, location.get('url_for_pdf') or location.get('url')


# TODO: this is just trusting the API, a better approach is in the S2ORC ACL
# paper https://aclanthology.org/2020.acl-main.447.pdf section 2.6
# Search for the DOI given a title; e.g. "computation in Noisy Radio Networks"
# Credit to user13348, slight modifications
# http://tex.stackexchange.com/questions/6810/automatically-adding-doi-fields-to-a-hand-made-bibliography
def searchdoi(title, author):
    """Search the Crossref guest query form for a DOI.

    Args:
        title: article title (plain text).
        author: one author's last name.

    Returns:
        The ``re.Match`` of the first ``doi.org/...`` link found in the
        response (group 1 is the DOI), or None when the page has no such link.
    """
    params = urllib.parse.urlencode({
        "titlesearch": "titlesearch",
        "auth2": author,
        "atitle2": title,
        "multi_hit": "on",
        "article_title_search": "Search",
        "queryType": "author-title",
    })
    headers = {
        "User-Agent": "Mozilla/5.0",
        "Accept": "text/html",
        "Content-Type": "application/x-www-form-urlencoded",
        "Host": "www.crossref.org",
    }
    url = "https://www.crossref.org/guestquery/#bibsearch"
    sleep(2)  # throttle: wait before every Crossref request
    r = requests.post(url, headers=headers, data=params)
    data = r.text

    return re.search(r'doi\.org/([^"^<^>]+)', str(data))


def normalize(string):
    """Normalize strings to ascii, without latex."""
    # get rid of hyperlinks: keep only the link text of \href{url}{text}
    string = re.sub(r'\\href{[^}]*}\s*{\s*([^}]*)\s*}', r"\1", string)
    assert 'href' not in string
    # drop braces, backslashes, quotes and carets
    string = re.sub(r'[{}\\\'"^]', "", string)
    string = re.sub(r"\$.*?\$", "", string)  # better remove all math expressions
    return unidecode(string)


def get_authors(entry):
    """Get a list of authors' or editors' last names.

    Uses the ``author`` field, or ``editor`` when there is no ``author``.

    Raises:
        KeyError: the entry has neither an ``author`` nor an ``editor`` field.
    """
    def get_last_name(authors):
        for author in authors:
            author = author.strip()
            if "," in author:
                # "Last, First"
                yield author.split(",")[0]
            elif len(author.split()) > 1:
                # "First Last"
                yield author.split()[-1]
            else:
                yield author

    try:
        authors = entry["author"]
    except KeyError:
        authors = entry["editor"]

    # Split on the BibTeX separator " and " only.  Splitting on the bare
    # substring "and" would cut names such as "Anderson" or "Sandra" in two.
    authors = re.split(r'\s+and\s+', normalize(authors))
    return list(get_last_name(authors))


def _is_blank(entry, key):
    """Return True when *entry* lacks *key* or holds only whitespace there."""
    return not entry.get(key, "").strip()


def _find_doi(entry):
    """Search a DOI for *entry*, store it in ``entry["doi"]``; return success."""
    try:
        title = normalize(entry["title"])
        authors = get_authors(entry)
    except KeyError:
        # No title, or neither author nor editor: nothing to search with.
        return False
    for author in authors:
        doi_match = searchdoi(title, author)
        if doi_match:
            entry["doi"] = doi_match.group(1)
            return True
    return False


def main():
    """Read the bibliography named on the command line, complete it, write it."""
    if len(sys.argv) < 2:
        sys.exit("usage: {} BIBFILE".format(sys.argv[0]))

    print("Reading Bibliography...")
    with open(sys.argv[1]) as bibtex_file:
        bibliography = bibtexparser.load(bibtex_file)

    print("Looking for Dois...")
    before = 0
    before_url = 0
    new = 0
    new_url = 0
    useless_url = 0
    total = len(bibliography.entries)
    for i, entry in enumerate(bibliography.entries):
        print("\r{i}/{total} entries processed, please wait...".format(i=i,total=total),flush=True,end="")

        # --- DOI ---
        if _is_blank(entry, "doi"):
            if _find_doi(entry):
                new += 1
            else:
                print("(D) no DOI found for %s" % entry['ID'])
        else:
            before += 1

        # --- URL ---
        if not _is_blank(entry, "url"):
            before_url += 1
            continue
        if _is_blank(entry, "doi"):
            continue  # without a DOI there is nothing to look up
        is_oa, url_match = searchurl(entry["doi"])
        if not is_oa:
            print("(U) no URL found for %s" % entry['ID'])
        elif url_match:
            entry["url"] = url_match
            new_url += 1
        else:
            useless_url += 1
    print("")

    template = (
        "We added {new} DOIs and {new_url} URLs !\n"
        "Before: {before}/{total} entries had DOI "
        "and {before_url}/{total} entries had URL\n"
        "Now: {after}/{total} entries have DOI "
        "and {after_url}/{total} entries have URL "
        "plus {useless_url}/{total} entries that do not need URL"
    )

    print(template.format(new=new,new_url=new_url,before=before,after=before+new,before_url=before_url,after_url=before_url+new_url,total=total,useless_url=useless_url))
    outfile = sys.argv[1]+"_doi.bib"
    print("Writing result to ",outfile)
    writer = BibTexWriter()
    writer.indent = '    '  # indent entries with 4 spaces instead of one
    with open(outfile, 'w') as bibfile:
        bibfile.write(writer.write(bibliography))


if __name__ == '__main__':
    main()