commit 45835f826d85d453af9fa86b1db4f4b34078e63d
parent 52cf2db0d11a91a91a709b63fedef2f89a41af52
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Tue, 18 Jan 2022 20:09:38 +0100
add open-access urls
Diffstat:
adddoi | 129 +++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------
1 file changed, 86 insertions(+), 43 deletions(-)
diff --git a/adddoi b/adddoi
@@ -5,9 +5,39 @@ import sys, re
from unidecode import unidecode
import bibtexparser
from bibtexparser.bwriter import BibTexWriter
-import http.client as httplib
import requests
import urllib
+from time import sleep
+import os
+import json
+
+EMAIL = os.environ['EMAIL']
+
+# Search for open-access URL from DOI using unpaywall
+def searchurl(doi):
+ doi_quote = urllib.parse.quote(doi.strip())
+ url = "https://api.unpaywall.org/v2/" + doi_quote
+ params = urllib.parse.urlencode({'email': EMAIL})
+ sleep(2)
+ r = requests.get(url + '?' + params)
+ data = json.loads(r.text)
+ if 'HTTP_status_code' in data.keys() and data['HTTP_status_code'] == 404:
+ # not found, let's check if this is a crossref DOI
+ url_crossref = "https://api.crossref.org/works/" + doi_quote + "/agency"
+ r2 = requests.get(url_crossref)
+ data_crossref = json.loads(r2.text)
+ assert(data_crossref['message']['DOI'].lower() == doi.lower())
+ assert(data_crossref['message']['agency']['id'] == 'datacite')
+ return None # OK, it is a datacite DOI, so probably OA
+ assert(data['doi'].lower() == doi.lower())
+ if not data['is_oa']:
+ return None # closed-access and no OA version available :(
+ else:
+ if data['best_oa_location']['host_type'] == 'publisher':
+ return None # publisher version is OA so DOI link suffices
+ else:
+ assert(data['best_oa_location']['host_type'] == 'repository')
+ return data['best_oa_location']['url_for_pdf']
# Search for the DOI given a title; e.g. "computation in Noisy Radio Networks"
# Credit to user13348, slight modifications
@@ -15,23 +45,18 @@ import urllib
def searchdoi(title, author):
params = urllib.parse.urlencode({"titlesearch":"titlesearch", "auth2" : author, "atitle2" : title, "multi_hit" : "on", "article_title_search" : "Search", "queryType" : "author-title"})
headers = {"User-Agent": "Mozilla/5.0" , "Accept": "text/html", "Content-Type" : "application/x-www-form-urlencoded", "Host" : "www.crossref.org"}
- # conn = httplib.HTTPConnection("www.crossref.org:80") # Not working any more, HTTPS required
- conn = httplib.HTTPSConnection("www.crossref.org")
- conn.request("POST", "/guestquery/", params, headers)
- response = conn.getresponse()
- #print(response.status, response.reason)
- data = response.read()
- conn.close()
url = "https://www.crossref.org/guestquery/#bibsearch"
-
+ sleep(2)
r = requests.post(url, headers=headers, data=params)
-
data = r.text
return re.search(r'doi\.org/([^"^<^>]+)', str(data))
def normalize(string):
"""Normalize strings to ascii, without latex."""
+ # get rid of hyperlinks
+ string = re.sub(r'\\href{[^}]*}\s*{\s*([^}]*)\s*}',r"\1", string)
+ assert(not 'href' in string)
string = re.sub(r'[{}\\\'"^]',"", string)
string = re.sub(r"\$.*?\$","",string) # better remove all math expressions
return unidecode(string)
@@ -56,41 +81,59 @@ def get_authors(entry):
authors = normalize(authors).split("and")
return list(get_last_name(authors))
+if __name__ == '__main__':
-print("Reading Bibliography...")
-with open(sys.argv[1]) as bibtex_file:
- bibliography = bibtexparser.load(bibtex_file)
+ print("Reading Bibliography...")
+ with open(sys.argv[1]) as bibtex_file:
+ bibliography = bibtexparser.load(bibtex_file)
-print("Looking for Dois...")
-before = 0
-new = 0
-total = len(bibliography.entries)
-for i,entry in enumerate(bibliography.entries):
- print("\r{i}/{total} entries processed, please wait...".format(i=i,total=total),flush=True,end="")
- try:
- if "doi" not in entry or entry["doi"].isspace():
- title = normalize(entry["title"])
- authors = get_authors(entry)
- for author in authors:
- doi_match = searchdoi(title,author)
- if doi_match:
- doi = doi_match.groups()[0]
- entry["doi"] = doi
- new += 1
- break
- else:
- before += 1
- except:
- pass
-print("")
+ print("Looking for Dois...")
+ before = 0
+ before_url = 0
+ new = 0
+ new_url = 0
+ total = len(bibliography.entries)
+ for i,entry in enumerate(bibliography.entries):
+ print("\r{i}/{total} entries processed, please wait...".format(i=i,total=total),flush=True,end="")
+ try:
+ if "doi" not in entry or entry["doi"].isspace():
+ title = normalize(entry["title"])
+ authors = get_authors(entry)
+ for author in authors:
+ doi_match = searchdoi(title,author)
+ if doi_match:
+ doi = doi_match.groups()[0]
+ entry["doi"] = doi
+ new += 1
+ break
+ else:
+ before += 1
+ if "url" not in entry or entry["url"].isspace():
+ if 'doi' in entry.keys():
+ doi = entry["doi"]
+ if not doi.isspace():
+ url_match = searchurl(doi)
+ if url_match:
+                            entry["url"] = url_match
+ new_url += 1
+                            # no break here: keep processing the remaining entries
+ else:
+ before_url += 1
+ except:
+ pass
+ print("")
-template="We added {new} DOIs !\nBefore: {before}/{total} entries had DOI\nNow: {after}/{total} entries have DOI"
+ template="We added {new} DOIs and {new_url} URLs !\n"
+ template+="Before: {before}/{total} entries had DOI "
+ template+="and {before_url}/{total} entries had URL\n"
+ template+="Now: {after}/{total} entries have DOI "
+ template+="and {after_url}/{total} entries have URL\n"
-print(template.format(new=new,before=before,after=before+new,total=total))
-outfile = sys.argv[1]+"_doi.bib"
-print("Writing result to ",outfile)
-writer = BibTexWriter()
-writer.indent = ' ' # indent entries with 4 spaces instead of one
-with open(outfile, 'w') as bibfile:
- bibfile.write(writer.write(bibliography))
+ print(template.format(new=new,before=before,after=before+new,before_url=before_url,after_url=before_url+new_url,total=total))
+ outfile = sys.argv[1]+"_doi.bib"
+ print("Writing result to ",outfile)
+ writer = BibTexWriter()
+ writer.indent = ' ' # indent entries with 4 spaces instead of one
+ with open(outfile, 'w') as bibfile:
+ bibfile.write(writer.write(bibliography))