adddoi (6223B)
#!/usr/bin/env python
# source: https://tex.stackexchange.com/a/300474 and https://tex.stackexchange.com/questions/6810/automatically-adding-doi-fields-to-a-hand-made-bibliography
# users: XachaB, thando, JohnM
import sys, re
from unidecode import unidecode
import bibtexparser
from bibtexparser.bwriter import BibTexWriter
import requests
import urllib.parse
from time import sleep
import os
import json

EMAIL = os.environ['EMAIL']  # contact address passed to the Unpaywall API

# Search for an open-access URL from a DOI using Unpaywall
def searchurl(doi):
    doi_quote = urllib.parse.quote(doi.strip())
    url = "https://api.unpaywall.org/v2/" + doi_quote
    params = urllib.parse.urlencode({'email': EMAIL})
    sleep(2)  # throttle requests to the API
    r = requests.get(url + '?' + params)
    data = json.loads(r.text)
    if 'HTTP_status_code' in data.keys() and data['HTTP_status_code'] == 404:
        # not found, let's check if this is a Crossref DOI
        url_crossref = "https://api.crossref.org/works/" + doi_quote + "/agency"
        r2 = requests.get(url_crossref)
        data_crossref = json.loads(r2.text)
        assert(data_crossref['message']['DOI'].lower() == doi.lower())
        dapublisher = data_crossref['message']['agency']['id']
        assert(dapublisher in ['datacite', 'crossref'])
        if dapublisher == 'datacite':
            return True, None   # OK, it is a DataCite DOI, so probably OA
        else:
            return False, None  # this is a Crossref DOI not listed in Unpaywall, so fail :(
    assert(data['doi'].lower() == doi.lower())
    if not data['is_oa']:
        return False, None      # closed-access and no OA version available :(
    else:
        if data['best_oa_location']['host_type'] == 'publisher':
            return True, None   # publisher version is OA, so the DOI link suffices
        else:
            assert(data['best_oa_location']['host_type'] == 'repository')
            return True, data['best_oa_location']['url_for_pdf']

# TODO: this is just trusting the API; a better approach is in the S2ORC ACL
# paper https://aclanthology.org/2020.acl-main.447.pdf section 2.6
# Search for the DOI given a title, e.g. "computation in Noisy Radio Networks"
# Credit to user13348, slight modifications
# http://tex.stackexchange.com/questions/6810/automatically-adding-doi-fields-to-a-hand-made-bibliography
def searchdoi(title, author):
    params = urllib.parse.urlencode({"titlesearch": "titlesearch", "auth2": author, "atitle2": title,
                                     "multi_hit": "on", "article_title_search": "Search",
                                     "queryType": "author-title"})
    headers = {"User-Agent": "Mozilla/5.0", "Accept": "text/html",
               "Content-Type": "application/x-www-form-urlencoded", "Host": "www.crossref.org"}
    url = "https://www.crossref.org/guestquery/#bibsearch"
    sleep(2)  # throttle requests to the API
    r = requests.post(url, headers=headers, data=params)
    data = r.text

    return re.search(r'doi\.org/([^"^<^>]+)', str(data))

def normalize(string):
    """Normalize strings to ASCII, without LaTeX."""
    # get rid of hyperlinks
    string = re.sub(r'\\href{[^}]*}\s*{\s*([^}]*)\s*}', r"\1", string)
    assert(not 'href' in string)
    string = re.sub(r'[{}\\\'"^]', "", string)
    string = re.sub(r"\$.*?\$", "", string)  # better remove all math expressions
    return unidecode(string)

def get_authors(entry):
    """Get a list of authors' or editors' last names."""
    def get_last_name(authors):
        for author in authors:
            author = author.strip(" ")
            if "," in author:
                yield author.split(",")[0]
            elif " " in author:
                yield author.split(" ")[-1]
            else:
                yield author

    try:
        authors = entry["author"]
    except KeyError:
        authors = entry["editor"]

    authors = normalize(authors).split("and")
    return list(get_last_name(authors))

if __name__ == '__main__':

    print("Reading Bibliography...")
    with open(sys.argv[1]) as bibtex_file:
        bibliography = bibtexparser.load(bibtex_file)

    print("Looking for DOIs...")
    before = 0
    before_url = 0
    new = 0
    new_url = 0
    useless_url = 0
    total = len(bibliography.entries)
    for i, entry in enumerate(bibliography.entries):
        print("\r{i}/{total} entries processed, please wait...".format(i=i, total=total), flush=True, end="")
        # Fill in a missing DOI by querying Crossref with the title and each author in turn
        if "doi" not in entry.keys() or entry["doi"].isspace():
            title = normalize(entry["title"])
            authors = get_authors(entry)
            for author in authors:
                doi_match = searchdoi(title, author)
                if doi_match:
                    doi = doi_match.groups()[0]
                    entry["doi"] = doi
                    new += 1
                    break
            if 'doi' not in entry.keys():
                print("(D) no DOI found for %s" % entry['ID'])
        else:
            before += 1
        # Fill in a missing URL with an open-access location found via Unpaywall
        if "url" not in entry.keys() or entry["url"].isspace():
            if 'doi' in entry.keys():
                doi = entry["doi"]
                if not doi.isspace():
                    is_oa, url_match = searchurl(doi)
                    if is_oa:
                        if url_match:
                            entry["url"] = url_match
                            new_url += 1
                        else:
                            useless_url += 1
                    else:
                        print("(U) no URL found for %s" % entry['ID'])
        else:
            before_url += 1
    print("")

    template = "We added {new} DOIs and {new_url} URLs!\n"
    template += "Before: {before}/{total} entries had a DOI "
    template += "and {before_url}/{total} entries had a URL\n"
    template += "Now: {after}/{total} entries have a DOI "
    template += "and {after_url}/{total} entries have a URL, "
    template += "plus {useless_url}/{total} entries that do not need a URL"

    print(template.format(new=new, new_url=new_url, before=before, after=before + new,
                          before_url=before_url, after_url=before_url + new_url,
                          total=total, useless_url=useless_url))
    outfile = sys.argv[1] + "_doi.bib"
    print("Writing result to", outfile)
    writer = BibTexWriter()
    writer.indent = '    '  # indent entries with 4 spaces instead of one
    with open(outfile, 'w') as bibfile:
        bibfile.write(writer.write(bibliography))
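
# Minimal usage sketch (the file name and email address below are placeholders):
# the script takes the path to a .bib file as its only argument, reads the
# Unpaywall contact address from the EMAIL environment variable, and writes the
# augmented bibliography next to the input file, e.g.
#
#   EMAIL=you@example.com python adddoi references.bib
#   # -> writes references.bib_doi.bib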