mybin

my ~/bin
git clone https://a3nm.net/git/mybin/

adddoi (6027B)


#!/usr/bin/env python
# source: https://tex.stackexchange.com/a/300474 and https://tex.stackexchange.com/questions/6810/automatically-adding-doi-fields-to-a-hand-made-bibliography
# users: XachaB, thando, JohnM
import sys, re
from unidecode import unidecode
import bibtexparser
from bibtexparser.bwriter import BibTexWriter
import requests
import urllib.parse
from time import sleep
import os
import json

# contact email sent along with unpaywall API requests (see searchurl below)
EMAIL = os.environ['EMAIL']

# Search for an open-access URL for a DOI using unpaywall
def searchurl(doi):
    doi_quote = urllib.parse.quote(doi.strip())
    url = "https://api.unpaywall.org/v2/" + doi_quote
    params = urllib.parse.urlencode({'email': EMAIL})
    sleep(2)
    r = requests.get(url + '?' + params)
    data = json.loads(r.text)
    if 'HTTP_status_code' in data.keys() and data['HTTP_status_code'] == 404:
        # not known to unpaywall: ask Crossref which agency registered the DOI
        url_crossref = "https://api.crossref.org/works/" + doi_quote + "/agency"
        r2 = requests.get(url_crossref)
        data_crossref = json.loads(r2.text)
        assert(data_crossref['message']['DOI'].lower() == doi.lower())
        assert(data_crossref['message']['agency']['id'] == 'datacite')
        return True, None # OK, it is a datacite DOI, so probably OA
    assert(data['doi'].lower() == doi.lower())
    if not data['is_oa']:
        return False, None # closed-access and no OA version available :(
    else:
        if data['best_oa_location']['host_type'] == 'publisher':
            return True, None # publisher version is OA so DOI link suffices
        else:
            assert(data['best_oa_location']['host_type'] == 'repository')
            return True, data['best_oa_location']['url_for_pdf']

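# Illustrative helper (an assumption for illustration only, not used anywhere
# in this script): it shows how the (is_oa, pdf_url) pair returned by
# searchurl() above is meant to be read.
def _describe_oa_status(doi):
    """Summarize a DOI's open-access status as a short string (illustrative)."""
    is_oa, pdf_url = searchurl(doi)
    if not is_oa:
        return "no open-access copy known to unpaywall"
    if pdf_url is None:
        return "the DOI link itself should resolve to an open-access copy"
    return "open-access PDF in a repository: " + str(pdf_url)
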
# TODO: this is just trusting the API; a better approach is in the S2ORC ACL
# paper https://aclanthology.org/2020.acl-main.447.pdf section 2.6
# Search for the DOI given a title, e.g. "computation in Noisy Radio Networks"
# Credit to user13348, slight modifications
# http://tex.stackexchange.com/questions/6810/automatically-adding-doi-fields-to-a-hand-made-bibliography
def searchdoi(title, author):
    params = urllib.parse.urlencode({"titlesearch":"titlesearch", "auth2" : author, "atitle2" : title, "multi_hit" : "on", "article_title_search" : "Search", "queryType" : "author-title"})
    headers = {"User-Agent": "Mozilla/5.0" , "Accept": "text/html", "Content-Type" : "application/x-www-form-urlencoded", "Host" : "www.crossref.org"}
    url = "https://www.crossref.org/guestquery/#bibsearch"
    sleep(2)
    r = requests.post(url, headers=headers, data=params)
    data = r.text

    # extract the first doi.org link from the returned HTML, if any
    return re.search(r'doi\.org/([^"<>]+)', data)

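# Illustrative helper (an assumption, not called by this script): searchdoi()
# above returns a re.Match whose first group is the DOI, or None when the
# guest query page contains no doi.org link; the main loop below tries each
# author last name in turn, and this sketches that pattern in isolation.
def _first_doi(title, last_names):
    """Return the first DOI found for any of the given last names, or None."""
    for name in last_names:
        match = searchdoi(title, name)
        if match:
            return match.groups()[0]
    return None
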
def normalize(string):
    """Normalize strings to ascii, without latex."""
    # get rid of hyperlinks
    string = re.sub(r'\\href{[^}]*}\s*{\s*([^}]*)\s*}',r"\1", string)
    assert(not 'href' in string)
    string = re.sub(r'[{}\\\'"^]',"", string)
    string = re.sub(r"\$.*?\$","",string) # better remove all math expressions
    return unidecode(string)

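# Worked example for normalize() above (comments only; outputs traced by hand
# from the substitutions, so treat them as illustrative):
#   normalize(r'\href{https://example.org}{G\"odel}')  ->  'Godel'
#   normalize(r'Complexity of $\exists$-queries')      ->  'Complexity of -queries'
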
def get_authors(entry):
    """Get a list of authors' or editors' last names."""
    def get_last_name(authors):
        for author in authors:
            author = author.strip(" ")
            if "," in author:
                yield author.split(",")[0]
            elif " " in author:
                yield author.split(" ")[-1]
            else:
                yield author

    try:
        authors = entry["author"]
    except KeyError:
        authors = entry["editor"]

    # split on the BibTeX "and" separator (whitespace-delimited, so names
    # containing "and", like "Alexander", are left intact)
    authors = re.split(r"\s+and\s+", normalize(authors))
    return list(get_last_name(authors))

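# Example for get_authors() above (comments only, illustrative): for an entry
# with
#   author = "Knuth, Donald E. and Graham, Ronald L."
# it returns ['Knuth', 'Graham']; the main loop below then tries these last
# names one at a time against the Crossref title search.
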
if __name__ == '__main__':

    print("Reading Bibliography...")
    with open(sys.argv[1]) as bibtex_file:
        bibliography = bibtexparser.load(bibtex_file)


    print("Looking for DOIs...")
    before = 0
    before_url = 0
    new = 0
    new_url = 0
    useless_url = 0
    total = len(bibliography.entries)
    for i,entry in enumerate(bibliography.entries):
        print("\r{i}/{total} entries processed, please wait...".format(i=i,total=total),flush=True,end="")
        if "doi" not in entry.keys() or entry["doi"].isspace():
            title = normalize(entry["title"])
            authors = get_authors(entry)
            for author in authors:
                doi_match = searchdoi(title,author)
                if doi_match:
                    doi = doi_match.groups()[0]
                    entry["doi"] = doi
                    new += 1
                    break
            if 'doi' not in entry.keys():
                print("(D) no DOI found for %s" % entry['ID'])
        else:
            before += 1
        if "url" not in entry.keys() or entry["url"].isspace():
            if 'doi' in entry.keys():
                doi = entry["doi"]
                if not doi.isspace():
                    is_oa, url_match = searchurl(doi)
                    if is_oa:
                        if url_match:
                            entry["url"] = url_match
                            new_url += 1
                        else:
                            useless_url += 1
                    else:
                        print("(U) no URL found for %s" % entry['ID'])
        else:
            before_url += 1
    print("")

    template="We added {new} DOIs and {new_url} URLs!\n"
    template+="Before: {before}/{total} entries had a DOI "
    template+="and {before_url}/{total} entries had a URL\n"
    template+="Now: {after}/{total} entries have a DOI "
    template+="and {after_url}/{total} entries have a URL, "
    template+="plus {useless_url}/{total} entries that do not need a URL"

    print(template.format(new=new,new_url=new_url,before=before,after=before+new,before_url=before_url,after_url=before_url+new_url,total=total,useless_url=useless_url))
    outfile = sys.argv[1]+"_doi.bib"
    print("Writing result to", outfile)
    writer = BibTexWriter()
    writer.indent = '    '     # indent entries with 4 spaces instead of one
    with open(outfile, 'w') as bibfile:
        bibfile.write(writer.write(bibliography))
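
# Usage sketch (assumption: adddoi is on your PATH and "refs.bib" is a
# placeholder filename, not a file shipped with this repository):
#
#   EMAIL=you@example.org adddoi refs.bib
#
# reads refs.bib, fills in missing doi/url fields where it can, and writes the
# augmented bibliography to refs.bib_doi.bib, leaving the input file untouched.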