mybin

my ~/bin
git clone https://a3nm.net/git/mybin/
Log | Files | Refs | README

commit 45835f826d85d453af9fa86b1db4f4b34078e63d
parent 52cf2db0d11a91a91a709b63fedef2f89a41af52
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Tue, 18 Jan 2022 20:09:38 +0100

add open-access urls

Diffstat:
adddoi | 129+++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------
1 file changed, 86 insertions(+), 43 deletions(-)

diff --git a/adddoi b/adddoi @@ -5,9 +5,39 @@ import sys, re from unidecode import unidecode import bibtexparser from bibtexparser.bwriter import BibTexWriter -import http.client as httplib import requests import urllib +from time import sleep +import os +import json + +EMAIL = os.environ['EMAIL'] + +# Search for open-access URL from DOI using unpaywall +def searchurl(doi): + doi_quote = urllib.parse.quote(doi.strip()) + url = "https://api.unpaywall.org/v2/" + doi_quote + params = urllib.parse.urlencode({'email': EMAIL}) + sleep(2) + r = requests.get(url + '?' + params) + data = json.loads(r.text) + if 'HTTP_status_code' in data.keys() and data['HTTP_status_code'] == 404: + # not found, let's check if this is a crossref DOI + url_crossref = "https://api.crossref.org/works/" + doi_quote + "/agency" + r2 = requests.get(url_crossref) + data_crossref = json.loads(r2.text) + assert(data_crossref['message']['DOI'].lower() == doi.lower()) + assert(data_crossref['message']['agency']['id'] == 'datacite') + return None # OK, it is a datacite DOI, so probably OA + assert(data['doi'].lower() == doi.lower()) + if not data['is_oa']: + return None # closed-access and no OA version available :( + else: + if data['best_oa_location']['host_type'] == 'publisher': + return None # publisher version is OA so DOI link suffices + else: + assert(data['best_oa_location']['host_type'] == 'repository') + return data['best_oa_location']['url_for_pdf'] # Search for the DOI given a title; e.g. "computation in Noisy Radio Networks" # Credit to user13348, slight modifications @@ -15,23 +45,18 @@ import urllib def searchdoi(title, author): params = urllib.parse.urlencode({"titlesearch":"titlesearch", "auth2" : author, "atitle2" : title, "multi_hit" : "on", "article_title_search" : "Search", "queryType" : "author-title"}) headers = {"User-Agent": "Mozilla/5.0" , "Accept": "text/html", "Content-Type" : "application/x-www-form-urlencoded", "Host" : "www.crossref.org"} - # conn = httplib.HTTPConnection("www.crossref.org:80") # Not working any more, HTTPS required - conn = httplib.HTTPSConnection("www.crossref.org") - conn.request("POST", "/guestquery/", params, headers) - response = conn.getresponse() - #print(response.status, response.reason) - data = response.read() - conn.close() url = "https://www.crossref.org/guestquery/#bibsearch" - + sleep(2) r = requests.post(url, headers=headers, data=params) - data = r.text return re.search(r'doi\.org/([^"^<^>]+)', str(data)) def normalize(string): """Normalize strings to ascii, without latex.""" + # get rid of hyperlinks + string = re.sub(r'\\href{[^}]*}\s*{\s*([^}]*)\s*}',r"\1", string) + assert(not 'href' in string) string = re.sub(r'[{}\\\'"^]',"", string) string = re.sub(r"\$.*?\$","",string) # better remove all math expressions return unidecode(string) @@ -56,41 +81,59 @@ def get_authors(entry): authors = normalize(authors).split("and") return list(get_last_name(authors)) +if __name__ == '__main__': -print("Reading Bibliography...") -with open(sys.argv[1]) as bibtex_file: - bibliography = bibtexparser.load(bibtex_file) + print("Reading Bibliography...") + with open(sys.argv[1]) as bibtex_file: + bibliography = bibtexparser.load(bibtex_file) -print("Looking for Dois...") -before = 0 -new = 0 -total = len(bibliography.entries) -for i,entry in enumerate(bibliography.entries): - print("\r{i}/{total} entries processed, please wait...".format(i=i,total=total),flush=True,end="") - try: - if "doi" not in entry or entry["doi"].isspace(): - title = normalize(entry["title"]) - authors = get_authors(entry) - for author in authors: - doi_match = searchdoi(title,author) - if doi_match: - doi = doi_match.groups()[0] - entry["doi"] = doi - new += 1 - break - else: - before += 1 - except: - pass -print("") + print("Looking for Dois...") + before = 0 + before_url = 0 + new = 0 + new_url = 0 + total = len(bibliography.entries) + for i,entry in enumerate(bibliography.entries): + print("\r{i}/{total} entries processed, please wait...".format(i=i,total=total),flush=True,end="") + try: + if "doi" not in entry or entry["doi"].isspace(): + title = normalize(entry["title"]) + authors = get_authors(entry) + for author in authors: + doi_match = searchdoi(title,author) + if doi_match: + doi = doi_match.groups()[0] + entry["doi"] = doi + new += 1 + break + else: + before += 1 + if "url" not in entry or entry["url"].isspace(): + if 'doi' in entry.keys(): + doi = entry["doi"] + if not doi.isspace(): + url_match = searchurl(doi) + if url_match: + entry["url"] = url + new_url += 1 + break + else: + before_url += 1 + except: + pass + print("") -template="We added {new} DOIs !\nBefore: {before}/{total} entries had DOI\nNow: {after}/{total} entries have DOI" + template="We added {new} DOIs and {new_url} URLs !\n" + template+="Before: {before}/{total} entries had DOI " + template+="and {before_url}/{total} entries had URL\n" + template+="Now: {after}/{total} entries have DOI " + template+="and {after_url}/{total} entries have URL\n" -print(template.format(new=new,before=before,after=before+new,total=total)) -outfile = sys.argv[1]+"_doi.bib" -print("Writing result to ",outfile) -writer = BibTexWriter() -writer.indent = ' ' # indent entries with 4 spaces instead of one -with open(outfile, 'w') as bibfile: - bibfile.write(writer.write(bibliography)) + print(template.format(new=new,before=before,after=before+new,before_url=before_url,after_url=before_url+new_url,total=total)) + outfile = sys.argv[1]+"_doi.bib" + print("Writing result to ",outfile) + writer = BibTexWriter() + writer.indent = ' ' # indent entries with 4 spaces instead of one + with open(outfile, 'w') as bibfile: + bibfile.write(writer.write(bibliography))