mybin

my ~/bin
git clone https://a3nm.net/git/mybin/

adddoi (6223B)


#!/usr/bin/env python
# source: https://tex.stackexchange.com/a/300474 and https://tex.stackexchange.com/questions/6810/automatically-adding-doi-fields-to-a-hand-made-bibliography
# users: XachaB, thando, JohnM
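#
# Usage: adddoi FILE.bib
#   e.g.  EMAIL=you@example.com adddoi refs.bib
# Fills in missing doi and url fields and writes the result to FILE.bib_doi.bib.
# The EMAIL environment variable is passed to the unpaywall API.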
import sys, re
from unidecode import unidecode
import bibtexparser
from bibtexparser.bwriter import BibTexWriter
import requests
import urllib.parse
from time import sleep
import os
import json

# email address sent along with unpaywall API requests
EMAIL = os.environ['EMAIL']

# Search for an open-access URL from a DOI using unpaywall
def searchurl(doi):
    """Return (is_oa, url): is_oa says whether an open-access copy seems to
    exist; url is set only when the best open-access copy is a repository PDF
    (i.e., the DOI link itself is not enough)."""
    doi_quote = urllib.parse.quote(doi.strip())
    url = "https://api.unpaywall.org/v2/" + doi_quote
    params = urllib.parse.urlencode({'email': EMAIL})
    sleep(2)  # be gentle with the API
    r = requests.get(url + '?' + params)
    data = json.loads(r.text)
    if 'HTTP_status_code' in data.keys() and data['HTTP_status_code'] == 404:
        # not found in unpaywall, let's check if this is a crossref DOI
        url_crossref = "https://api.crossref.org/works/" + doi_quote + "/agency"
        r2 = requests.get(url_crossref)
        data_crossref = json.loads(r2.text)
        assert(data_crossref['message']['DOI'].lower() == doi.lower())
        dapublisher = data_crossref['message']['agency']['id']
        assert(dapublisher in ['datacite', 'crossref'])
        if dapublisher == 'datacite':
            return True, None  # OK, it is a datacite DOI, so probably OA
        else:
            return False, None  # a crossref DOI not listed in unpaywall, so fail :(
    assert(data['doi'].lower() == doi.lower())
    if not data['is_oa']:
        return False, None  # closed access and no OA version available :(
    else:
        if data['best_oa_location']['host_type'] == 'publisher':
            return True, None  # publisher version is OA, so the DOI link suffices
        else:
            assert(data['best_oa_location']['host_type'] == 'repository')
            return True, data['best_oa_location']['url_for_pdf']

# TODO: this is just trusting the API; a better approach is in the S2ORC ACL
# paper https://aclanthology.org/2020.acl-main.447.pdf section 2.6
# Search for the DOI given a title, e.g. "computation in Noisy Radio Networks"
# Credit to user13348, slight modifications
# http://tex.stackexchange.com/questions/6810/automatically-adding-doi-fields-to-a-hand-made-bibliography
def searchdoi(title, author):
    """Return a regex match whose first group is the DOI, or None if no hit."""
    params = urllib.parse.urlencode({
        "titlesearch": "titlesearch",
        "auth2": author,
        "atitle2": title,
        "multi_hit": "on",
        "article_title_search": "Search",
        "queryType": "author-title",
    })
    headers = {
        "User-Agent": "Mozilla/5.0",
        "Accept": "text/html",
        "Content-Type": "application/x-www-form-urlencoded",
        "Host": "www.crossref.org",
    }
    url = "https://www.crossref.org/guestquery/#bibsearch"
    sleep(2)  # be gentle with the API
    r = requests.post(url, headers=headers, data=params)
    data = r.text

    return re.search(r'doi\.org/([^"^<^>]+)', str(data))  # first doi.org link, if any

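# For instance, normalize(r'\href{http://x}{Fo\"o} is $\alpha$-good')
# should give roughly 'Foo is -good'.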
def normalize(string):
    """Normalize strings to ASCII, without LaTeX."""
    # get rid of hyperlinks
    string = re.sub(r'\\href{[^}]*}\s*{\s*([^}]*)\s*}', r"\1", string)
    assert('href' not in string)
    string = re.sub(r'[{}\\\'"^]', "", string)
    string = re.sub(r"\$.*?\$", "", string)  # better to remove all math expressions
    return unidecode(string)

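# For instance, get_authors({"author": "Doe, Jane and John Q. Smith"})
# should give ['Doe', 'Smith'].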
def get_authors(entry):
    """Get a list of authors' or editors' last names."""
    def get_last_name(authors):
        for author in authors:
            author = author.strip(" ")
            if "," in author:
                yield author.split(",")[0]
            elif " " in author:
                yield author.split(" ")[-1]
            else:
                yield author

    try:
        authors = entry["author"]
    except KeyError:
        authors = entry["editor"]

    # split on the BibTeX "and" separator (surrounded by whitespace, so that
    # names which merely contain "and" are not cut apart)
    authors = re.split(r"\s+and\s+", normalize(authors))
    return list(get_last_name(authors))

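# Main driver: for each entry that lacks a DOI, query crossref with the
# normalized title and each author's last name in turn; for each entry that
# lacks a URL, ask unpaywall whether an open-access copy exists for its DOI.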
if __name__ == '__main__':

    print("Reading bibliography...")
    with open(sys.argv[1]) as bibtex_file:
        bibliography = bibtexparser.load(bibtex_file)

    print("Looking for DOIs...")
    before = 0        # entries that already had a DOI
    before_url = 0    # entries that already had a URL
    new = 0           # DOIs added by this run
    new_url = 0       # URLs added by this run
    useless_url = 0   # entries whose DOI link is already open access
    total = len(bibliography.entries)
    for i, entry in enumerate(bibliography.entries):
        print("\r{i}/{total} entries processed, please wait...".format(
            i=i, total=total), flush=True, end="")
        if "doi" not in entry.keys() or entry["doi"].isspace():
            title = normalize(entry["title"])
            authors = get_authors(entry)
            # try each author in turn until the crossref search returns a hit
            for author in authors:
                doi_match = searchdoi(title, author)
                if doi_match:
                    doi = doi_match.groups()[0]
                    entry["doi"] = doi
                    new += 1
                    break
            if 'doi' not in entry.keys():
                print("(D) no DOI found for %s" % entry['ID'])
        else:
            before += 1
        if "url" not in entry.keys() or entry["url"].isspace():
            if 'doi' in entry.keys():
                doi = entry["doi"]
                if not doi.isspace():
                    is_oa, url_match = searchurl(doi)
                    if is_oa:
                        if url_match:
                            entry["url"] = url_match
                            new_url += 1
                        else:
                            useless_url += 1  # the DOI link itself is OA
                    else:
                        print("(U) no URL found for %s" % entry['ID'])
        else:
            before_url += 1
    print("")

    template = "We added {new} DOIs and {new_url} URLs!\n"
    template += "Before: {before}/{total} entries had a DOI "
    template += "and {before_url}/{total} entries had a URL\n"
    template += "Now: {after}/{total} entries have a DOI "
    template += "and {after_url}/{total} entries have a URL, "
    template += "plus {useless_url}/{total} entries that do not need a URL"

    print(template.format(new=new, new_url=new_url,
                          before=before, after=before + new,
                          before_url=before_url, after_url=before_url + new_url,
                          total=total, useless_url=useless_url))
    outfile = sys.argv[1] + "_doi.bib"
    print("Writing result to", outfile)
    writer = BibTexWriter()
    writer.indent = '    '  # indent entries with 4 spaces instead of one
    with open(outfile, 'w') as bibfile:
        bibfile.write(writer.write(bibliography))