commit a99d5d0f70911352fc63b0b7fb666c16d5b7146b
parent e3d7c625d32a498a45b5ca4e429065414e2f62e8
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Tue, 18 Jan 2022 21:25:37 +0100
indicate missing DOIs and URLs
Diffstat:
adddoi | 33 +++++++++++++++++++++------------
1 file changed, 21 insertions(+), 12 deletions(-)
diff --git a/adddoi b/adddoi
@@ -28,16 +28,16 @@ def searchurl(doi):
data_crossref = json.loads(r2.text)
assert(data_crossref['message']['DOI'].lower() == doi.lower())
assert(data_crossref['message']['agency']['id'] == 'datacite')
- return None # OK, it is a datacite DOI, so probably OA
+ return True, None # OK, it is a datacite DOI, so probably OA
assert(data['doi'].lower() == doi.lower())
if not data['is_oa']:
- return None # closed-access and no OA version available :(
+ return False, None # closed-access and no OA version available :(
else:
if data['best_oa_location']['host_type'] == 'publisher':
- return None # publisher version is OA so DOI link suffices
+ return True, None # publisher version is OA so DOI link suffices
else:
assert(data['best_oa_location']['host_type'] == 'repository')
- return data['best_oa_location']['url_for_pdf']
+ return True, data['best_oa_location']['url_for_pdf']
# TODO: this is just trusting the API, a better approach is in the S2ORC ACL
# paper https://aclanthology.org/2020.acl-main.447.pdf section 2.6
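The first hunk changes the contract of searchurl: instead of returning only an optional URL, it now returns a pair (is_oa, url), where is_oa says whether any open-access copy exists at all and url is a repository PDF link only when the publisher copy is closed. A minimal standalone sketch of that contract, assuming the Unpaywall REST API v2 (the endpoint, the email parameter and the lack of error handling are assumptions of this sketch; the commit's real function also consults the Crossref agency endpoint to accept DataCite DOIs):

import requests

def searchurl_sketch(doi, email="you@example.org"):
    # Return (is_oa, url_for_pdf); url_for_pdf is only set when the best
    # open-access copy lives in a repository rather than at the publisher.
    r = requests.get("https://api.unpaywall.org/v2/%s?email=%s" % (doi, email))
    data = r.json()
    if not data.get("is_oa"):
        return False, None           # closed access, no OA copy known
    loc = data["best_oa_location"]
    if loc["host_type"] == "publisher":
        return True, None            # the DOI link itself is already OA
    return True, loc["url_for_pdf"]  # repository copy: keep its PDF URL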
@@ -95,10 +95,11 @@ if __name__ == '__main__':
before_url = 0
new = 0
new_url = 0
+ useless_url = 0
total = len(bibliography.entries)
for i,entry in enumerate(bibliography.entries):
print("\r{i}/{total} entries processed, please wait...".format(i=i,total=total),flush=True,end="")
- if "doi" not in entry or entry["doi"].isspace():
+ if "doi" not in entry.keys() or entry["doi"].isspace():
title = normalize(entry["title"])
authors = get_authors(entry)
for author in authors:
@@ -108,16 +109,23 @@ if __name__ == '__main__':
entry["doi"] = doi
new += 1
break
+ if 'doi' not in entry.keys():
+ print("(D) no DOI found for %s" % entry['ID'])
else:
before += 1
- if "url" not in entry or entry["url"].isspace():
+ if "url" not in entry.keys() or entry["url"].isspace():
if 'doi' in entry.keys():
doi = entry["doi"]
if not doi.isspace():
- url_match = searchurl(doi)
- if url_match:
- entry["url"] = url_match
- new_url += 1
+ is_oa, url_match = searchurl(doi)
+ if is_oa:
+ if url_match:
+ entry["url"] = url_match
+ new_url += 1
+ else:
+ useless_url += 1
+ else:
+ print("(U) no URL found for %s" % entry['ID'])
else:
before_url += 1
print("")
@@ -126,9 +134,10 @@ if __name__ == '__main__':
template+="Before: {before}/{total} entries had DOI "
template+="and {before_url}/{total} entries had URL\n"
template+="Now: {after}/{total} entries have DOI "
- template+="and {after_url}/{total} entries have URL\n"
+ template+="and {after_url}/{total} entries have URL "
+ template+="plus {useless_url}/{total} entries that do not need URL"
- print(template.format(new=new,new_url=new_url,before=before,after=before+new,before_url=before_url,after_url=before_url+new_url,total=total))
+ print(template.format(new=new,new_url=new_url,before=before,after=before+new,before_url=before_url,after_url=before_url+new_url,total=total,useless_url=useless_url))
outfile = sys.argv[1]+"_doi.bib"
print("Writing result to ",outfile)
writer = BibTexWriter()
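The unchanged tail of the script serializes the updated bibliography next to the input file. For reference, a self-contained sketch of that last step with bibtexparser 1.x (the BibDatabase contents and the output filename are illustrative; the script derives the name from sys.argv[1]):

from bibtexparser.bibdatabase import BibDatabase
from bibtexparser.bwriter import BibTexWriter

# tiny stand-in for the 'bibliography' object parsed earlier in the script
bibliography = BibDatabase()
bibliography.entries = [{"ENTRYTYPE": "article", "ID": "example2022",
                         "title": "An Example", "doi": "10.1000/example",
                         "url": "https://example.org/paper.pdf"}]

outfile = "refs.bib" + "_doi.bib"
writer = BibTexWriter()
with open(outfile, "w") as f:
    f.write(writer.write(bibliography))   # writer.write returns the .bib text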