mybin

my ~/bin
git clone https://a3nm.net/git/mybin/

commit a99d5d0f70911352fc63b0b7fb666c16d5b7146b
parent e3d7c625d32a498a45b5ca4e429065414e2f62e8
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Tue, 18 Jan 2022 21:25:37 +0100

indicate missing DOIs and URLs

Diffstat:
adddoi | 33+++++++++++++++++++++------------
1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/adddoi b/adddoi
@@ -28,16 +28,16 @@ def searchurl(doi):
         data_crossref = json.loads(r2.text)
         assert(data_crossref['message']['DOI'].lower() == doi.lower())
         assert(data_crossref['message']['agency']['id'] == 'datacite')
-        return None # OK, it is a datacite DOI, so probably OA
+        return True, None # OK, it is a datacite DOI, so probably OA
     assert(data['doi'].lower() == doi.lower())
     if not data['is_oa']:
-        return None # closed-access and no OA version available :(
+        return False, None # closed-access and no OA version available :(
     else:
         if data['best_oa_location']['host_type'] == 'publisher':
-            return None # publisher version is OA so DOI link suffices
+            return True, None # publisher version is OA so DOI link suffices
         else:
             assert(data['best_oa_location']['host_type'] == 'repository')
-            return data['best_oa_location']['url_for_pdf']
+            return True, data['best_oa_location']['url_for_pdf']
 
 # TODO: this is just trusting the API, a better approach is in the S2ORC ACL
 # paper https://aclanthology.org/2020.acl-main.447.pdf section 2.6
@@ -95,10 +95,11 @@ if __name__ == '__main__':
     before_url = 0
     new = 0
     new_url = 0
+    useless_url = 0
     total = len(bibliography.entries)
     for i,entry in enumerate(bibliography.entries):
         print("\r{i}/{total} entries processed, please wait...".format(i=i,total=total),flush=True,end="")
-        if "doi" not in entry or entry["doi"].isspace():
+        if "doi" not in entry.keys() or entry["doi"].isspace():
             title = normalize(entry["title"])
             authors = get_authors(entry)
             for author in authors:
@@ -108,16 +109,23 @@ if __name__ == '__main__':
                     entry["doi"] = doi
                     new += 1
                     break
+            if 'doi' not in entry.keys():
+                print("(D) no DOI found for %s" % entry['ID'])
         else:
             before += 1
-        if "url" not in entry or entry["url"].isspace():
+        if "url" not in entry.keys() or entry["url"].isspace():
             if 'doi' in entry.keys():
                 doi = entry["doi"]
                 if not doi.isspace():
-                    url_match = searchurl(doi)
-                    if url_match:
-                        entry["url"] = url_match
-                        new_url += 1
+                    is_oa, url_match = searchurl(doi)
+                    if is_oa:
+                        if url_match:
+                            entry["url"] = url_match
+                            new_url += 1
+                        else:
+                            useless_url += 1
+                    else:
+                        print("(U) no URL found for %s" % entry['ID'])
         else:
             before_url += 1
     print("")
@@ -126,9 +134,10 @@ if __name__ == '__main__':
     template+="Before: {before}/{total} entries had DOI "
     template+="and {before_url}/{total} entries had URL\n"
     template+="Now: {after}/{total} entries have DOI "
-    template+="and {after_url}/{total} entries have URL\n"
+    template+="and {after_url}/{total} entries have URL "
+    template+="plus {useless_url}/{total} entries that do not need URL"
 
-    print(template.format(new=new,new_url=new_url,before=before,after=before+new,before_url=before_url,after_url=before_url+new_url,total=total))
+    print(template.format(new=new,new_url=new_url,before=before,after=before+new,before_url=before_url,after_url=before_url+new_url,total=total,useless_url=useless_url))
     outfile = sys.argv[1]+"_doi.bib"
     print("Writing result to ",outfile)
     writer = BibTexWriter()
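
The change above switches searchurl() from returning a bare URL (or None) to returning a pair (is_oa, pdf_url), so the caller can distinguish "no URL needed because the DOI is already open access" from "no open-access copy found", and can now report the latter as missing. The following is a minimal, self-contained sketch of that contract, not the code of adddoi itself: the Unpaywall-style endpoint, the placeholder email, the use of the requests library and the example DOI are assumptions; only the JSON fields used (is_oa, best_oa_location, host_type, url_for_pdf) are taken from the diff.

import requests

# Hypothetical sketch of the new two-value contract of searchurl():
# it now returns (is_oa, pdf_url) instead of a bare URL or None.
# The endpoint and email below are assumptions, not taken from adddoi.
UNPAYWALL = "https://api.unpaywall.org/v2/{doi}?email=you@example.com"

def searchurl_sketch(doi):
    data = requests.get(UNPAYWALL.format(doi=doi)).json()
    if not data.get('is_oa'):
        return False, None  # closed access, no OA copy: caller reports "(U) no URL found"
    loc = data['best_oa_location']
    if loc['host_type'] == 'publisher':
        return True, None   # the DOI link itself is open access, no extra URL needed
    return True, loc.get('url_for_pdf')  # repository copy worth adding to the entry

# Caller side, mirroring the updated main loop:
is_oa, url = searchurl_sketch("10.1000/xyz123")  # placeholder DOI
if is_oa and url:
    print("add url =", url)     # would increment new_url
elif is_oa:
    print("entry does not need a URL")  # would increment useless_url
else:
    print("(U) no URL found")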