commit 45835f826d85d453af9fa86b1db4f4b34078e63d
parent 52cf2db0d11a91a91a709b63fedef2f89a41af52
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Tue, 18 Jan 2022 20:09:38 +0100
add open-access urls
Diffstat:
adddoi | 129 +++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------
1 file changed, 86 insertions(+), 43 deletions(-)
diff --git a/adddoi b/adddoi
@@ -5,9 +5,39 @@ import sys, re
from unidecode import unidecode
import bibtexparser
from bibtexparser.bwriter import BibTexWriter
-import http.client as httplib
import requests
import urllib
+from time import sleep
+import os
+import json
+
+EMAIL = os.environ['EMAIL']
+
+# Search for open-access URL from DOI using unpaywall
+def searchurl(doi):
+ doi_quote = urllib.parse.quote(doi.strip())
+ url = "https://api.unpaywall.org/v2/" + doi_quote
+ params = urllib.parse.urlencode({'email': EMAIL})
+ sleep(2)
+ r = requests.get(url + '?' + params)
+ data = json.loads(r.text)
+ if 'HTTP_status_code' in data.keys() and data['HTTP_status_code'] == 404:
+ # not found, let's check if this is a crossref DOI
+ url_crossref = "https://api.crossref.org/works/" + doi_quote + "/agency"
+ r2 = requests.get(url_crossref)
+ data_crossref = json.loads(r2.text)
+ assert(data_crossref['message']['DOI'].lower() == doi.lower())
+ assert(data_crossref['message']['agency']['id'] == 'datacite')
+ return None # OK, it is a datacite DOI, so probably OA
+ assert(data['doi'].lower() == doi.lower())
+ if not data['is_oa']:
+ return None # closed-access and no OA version available :(
+ else:
+ if data['best_oa_location']['host_type'] == 'publisher':
+ return None # publisher version is OA so DOI link suffices
+ else:
+ assert(data['best_oa_location']['host_type'] == 'repository')
+ return data['best_oa_location']['url_for_pdf']
# Search for the DOI given a title; e.g. "computation in Noisy Radio Networks"
# Credit to user13348, slight modifications
@@ -15,23 +45,18 @@ import urllib
def searchdoi(title, author):
params = urllib.parse.urlencode({"titlesearch":"titlesearch", "auth2" : author, "atitle2" : title, "multi_hit" : "on", "article_title_search" : "Search", "queryType" : "author-title"})
headers = {"User-Agent": "Mozilla/5.0" , "Accept": "text/html", "Content-Type" : "application/x-www-form-urlencoded", "Host" : "www.crossref.org"}
- # conn = httplib.HTTPConnection("www.crossref.org:80") # Not working any more, HTTPS required
- conn = httplib.HTTPSConnection("www.crossref.org")
- conn.request("POST", "/guestquery/", params, headers)
- response = conn.getresponse()
- #print(response.status, response.reason)
- data = response.read()
- conn.close()
url = "https://www.crossref.org/guestquery/#bibsearch"
-
+ sleep(2)
r = requests.post(url, headers=headers, data=params)
-
data = r.text
return re.search(r'doi\.org/([^"^<^>]+)', str(data))
def normalize(string):
"""Normalize strings to ascii, without latex."""
+ # get rid of hyperlinks
+ string = re.sub(r'\\href{[^}]*}\s*{\s*([^}]*)\s*}',r"\1", string)
+ assert(not 'href' in string)
string = re.sub(r'[{}\\\'"^]',"", string)
string = re.sub(r"\$.*?\$","",string) # better remove all math expressions
return unidecode(string)
@@ -56,41 +81,59 @@ def get_authors(entry):
authors = normalize(authors).split("and")
return list(get_last_name(authors))
+if __name__ == '__main__':
-print("Reading Bibliography...")
-with open(sys.argv[1]) as bibtex_file:
- bibliography = bibtexparser.load(bibtex_file)
+ print("Reading Bibliography...")
+ with open(sys.argv[1]) as bibtex_file:
+ bibliography = bibtexparser.load(bibtex_file)
-print("Looking for Dois...")
-before = 0
-new = 0
-total = len(bibliography.entries)
-for i,entry in enumerate(bibliography.entries):
- print("\r{i}/{total} entries processed, please wait...".format(i=i,total=total),flush=True,end="")
- try:
- if "doi" not in entry or entry["doi"].isspace():
- title = normalize(entry["title"])
- authors = get_authors(entry)
- for author in authors:
- doi_match = searchdoi(title,author)
- if doi_match:
- doi = doi_match.groups()[0]
- entry["doi"] = doi
- new += 1
- break
- else:
- before += 1
- except:
- pass
-print("")
+ print("Looking for Dois...")
+ before = 0
+ before_url = 0
+ new = 0
+ new_url = 0
+ total = len(bibliography.entries)
+ for i,entry in enumerate(bibliography.entries):
+ print("\r{i}/{total} entries processed, please wait...".format(i=i,total=total),flush=True,end="")
+ try:
+ if "doi" not in entry or entry["doi"].isspace():
+ title = normalize(entry["title"])
+ authors = get_authors(entry)
+ for author in authors:
+ doi_match = searchdoi(title,author)
+ if doi_match:
+ doi = doi_match.groups()[0]
+ entry["doi"] = doi
+ new += 1
+ break
+ else:
+ before += 1
+ if "url" not in entry or entry["url"].isspace():
+ if 'doi' in entry.keys():
+ doi = entry["doi"]
+ if not doi.isspace():
+ url_match = searchurl(doi)
+ if url_match:
+                            entry["url"] = url_match
+ new_url += 1
+                            # no break here: keep processing the remaining entries
+ else:
+ before_url += 1
+ except:
+ pass
+ print("")
-template="We added {new} DOIs !\nBefore: {before}/{total} entries had DOI\nNow: {after}/{total} entries have DOI"
+ template="We added {new} DOIs and {new_url} URLs !\n"
+ template+="Before: {before}/{total} entries had DOI "
+ template+="and {before_url}/{total} entries had URL\n"
+ template+="Now: {after}/{total} entries have DOI "
+ template+="and {after_url}/{total} entries have URL\n"
-print(template.format(new=new,before=before,after=before+new,total=total))
-outfile = sys.argv[1]+"_doi.bib"
-print("Writing result to ",outfile)
-writer = BibTexWriter()
-writer.indent = ' ' # indent entries with 4 spaces instead of one
-with open(outfile, 'w') as bibfile:
- bibfile.write(writer.write(bibliography))
+ print(template.format(new=new,before=before,after=before+new,before_url=before_url,after_url=before_url+new_url,total=total))
+ outfile = sys.argv[1]+"_doi.bib"
+ print("Writing result to ",outfile)
+ writer = BibTexWriter()
+ writer.indent = ' ' # indent entries with 4 spaces instead of one
+ with open(outfile, 'w') as bibfile:
+ bibfile.write(writer.write(bibliography))