RF get-bibliography and make consistent short records on lacking BibTeX
yarikoptic committed Nov 1, 2024
1 parent 27f9115 commit 43ecec1
Showing 1 changed file with 89 additions and 69 deletions.
sandbox/get-bibliography: 89 additions & 69 deletions (158 changes)
@@ -20,13 +20,9 @@ logging.basicConfig(
 lgr = logging.getLogger(__name__)
 
 
-def fetch_dandisets(me, results=None, metadata=None, datacite=None):
-    if results is None:
-        results = {}
-    if datacite:
-        from dandischema.datacite import to_datacite, validate_datacite
-        from jsonschema import ValidationError as JSONValidationError
-        from pydantic import ValidationError
+def fetch_dandisets(me, bibtex_results=None, get_metadata=False):
+    if bibtex_results is None:
+        bibtex_results = {}
     # Construct the query URL
     url = (
         f"https://api.dandiarchive.org/api/dandisets/?draft=false&empty=false"
@@ -61,9 +57,9 @@ def fetch_dandisets(me, results=None, metadata=None, datacite=None):
                 version_id = version["version"]
                 if version_id == "draft":
                     continue
-                if identifier not in results:
-                    results[identifier] = {}
-                if version_id not in results[identifier]:
+                if identifier not in bibtex_results:
+                    bibtex_results[identifier] = {}
+                if version_id not in bibtex_results[identifier]:
                     doi_url = (
                         f"https://doi.org/10.48324/dandi.{identifier}/{version_id}"
                     )
@@ -76,11 +72,15 @@
                             identifier,
                             version_id,
                         )
+                        bibtex_results[identifier][version_id] = (
+                            f"# No valid BibTeX for {identifier}/{version_id}. "
+                            f"Starts with {bibtex.splitlines()[0][:20]}"
+                        )
                     else:
-                        results[identifier][version_id] = bibtex.replace(
+                        bibtex_results[identifier][version_id] = bibtex.replace(
                             "@misc{https://doi.org/10.48324/", "@misc{"
                         )
-            if metadata or datacite:
+            if get_metadata:
                 # fetch metadata record
                 metadata_response = requests.get(
                     f"https://api.dandiarchive.org/api/dandisets/"
@@ -91,61 +91,21 @@
             # The default to be cited -- ATM we do not have DOI for it, use latest version
             v = dandiset["most_recent_published_version"]["version"]
             # TODO: might want to robustify using `re` etc.
-            if v not in results[identifier]:
+            if v not in bibtex_results[identifier]:
                 lgr.error(
                     "Got no record for the most recent version %s of %s", v, identifier
                 )
             else:
-                results[identifier][None] = results[identifier][v].replace(
+                bibtex_results[identifier][None] = bibtex_results[identifier][
+                    v
+                ].replace(
                     f"@misc{{dandi.{identifier}/{v},", f"@misc{{dandi.{identifier},"
                 )
         if not url:
             lgr.info("No further URL, exiting")
             break
 
-    if metadata_records:
-        lgr.info("Got %d metadata records", len(metadata_records))
-        if metadata:
-            with open(metadata, "w") as f:
-                json.dump(metadata_records, f, indent=True)
-        if datacite:
-            meta_errors = defaultdict(list)
-            datacite_errors = defaultdict(list)
-            datacite_records = []
-            for m in metadata_records:
-                try:
-                    datacite_record = to_datacite(m)
-                    try:
-                        validate_datacite(datacite_record)
-                    except JSONValidationError as exc:
-                        error_rec = {
-                            "identifier": m["identifier"],
-                            "version": m["version"],
-                            "message": exc.message,
-                            "path": list(exc.path),
-                            "schema_path": list(exc.schema_path),
-                        }
-                        datacite_errors[m["identifier"]].append(error_rec)
-                except ValidationError as exc:
-                    errors_filtered = []
-                    # filter out the "input" field from errors
-                    for error in exc.errors():
-                        error.pop("input")
-                        error["identifier"] = m["identifier"]
-                        error["version"] = m["version"]
-                        errors_filtered.append(error)
-                    meta_errors[m["identifier"]].extend(errors_filtered)
-
-            Path(datacite).with_suffix(".meta-errors.json").write_text(
-                json.dumps(meta_errors, indent=True)
-            )
-            Path(datacite).with_suffix(".datacite-errors.json").write_text(
-                json.dumps(datacite_errors, indent=True)
-            )
-
-            Path(datacite).write_text(json.dumps(datacite_records, indent=True))
-
-    return results
+    return bibtex_results, metadata_records
 
 
 def main():
@@ -167,38 +127,98 @@ def main():
     parser.add_argument(
         "--results",
         type=str,
-        help="Path to prior results JSON file, or where to save/cache results JSON "
+        help="Path to prior BibTeX results JSON file, or where to save/cache results JSON "
         "(only for bibtex)",
         default=None,
     )
+    parser.add_argument(
+        "--bibtex",
+        type=str,
+        help="Path to the BibTeX file to save the results to. If not provided -- stdout",
+        default=None,
+    )
     # TODO:
     # - add options for requested format (currently bibtex but probably others supported)
     # - add option to do it only for specific datasets. Orthogonal to --me
 
     args = parser.parse_args()
 
-    results = None
+    if args.datacite:
+        from dandischema.datacite import to_datacite, validate_datacite
+        from jsonschema import ValidationError as JSONValidationError
+        from pydantic import ValidationError
+
+    bibtex_results = None
     if args.results and os.path.lexists(args.results):
         with open(args.results, "r") as f:
-            results = json.load(f)
+            bibtex_results = json.load(f)
 
-    results = fetch_dandisets(args.me, results, args.metadata, args.datacite)
+    bibtex_results, metadata_records = fetch_dandisets(
+        args.me, bibtex_results, args.metadata or args.datacite
+    )
 
     if args.results:
         if os.path.lexists(args.results):
             os.unlink(args.results)
         with open(args.results, "w") as f:
-            json.dump(results, f, indent=2)
+            json.dump(bibtex_results, f, indent=2)
         lgr.info("Updated results have been saved to %s", args.results)
 
     # OUTPUT BibTeX
-    for dataset, versions in results.items():
-        print(f"# DANDISET {dataset}")
-        for version, rec in versions.items():
-            if version is None:
-                print("# Take latest as the default")
-            print(rec)
-            print()
+    try:
+        out = open(args.bibtex, "w") if args.bibtex else sys.stdout
+        for dataset, versions in bibtex_results.items():
+            out.write(f"# DANDISET {dataset}\n")
+            for version, rec in versions.items():
+                if version is None:
+                    out.write("# Take latest as the default\n")
+                out.write(f"{rec}\n\n")
+    finally:
+        if args.bibtex:
+            out.close()
+
+    # Metadata and datacite
+    if metadata_records is not None:
+        lgr.info("Got %d metadata records", len(metadata_records))
+        if args.metadata:
+            with open(args.metadata, "w") as f:
+                json.dump(metadata_records, f, indent=True)
+        if args.datacite:
+            meta_errors = defaultdict(list)
+            datacite_errors = defaultdict(list)
+            datacite_records = []
+            for m in metadata_records:
+                try:
+                    datacite_record = to_datacite(m)
+                    try:
+                        validate_datacite(datacite_record)
+                    except JSONValidationError as exc:
+                        error_rec = {
+                            "identifier": m["identifier"],
+                            "version": m["version"],
+                            "message": exc.message,
+                            "path": list(exc.path),
+                            "schema_path": list(exc.schema_path),
+                        }
+                        datacite_errors[m["identifier"]].append(error_rec)
+                except ValidationError as exc:
+                    errors_filtered = []
+                    # filter out the "input" field from errors
+                    for error in exc.errors():
+                        error.pop("input")
+                        error["identifier"] = m["identifier"]
+                        error["version"] = m["version"]
+                        errors_filtered.append(error)
+                    meta_errors[m["identifier"]].extend(errors_filtered)
+
+            datacite_path = Path(args.datacite)
+            datacite_path.with_suffix(".meta-errors.json").write_text(
+                json.dumps(meta_errors, indent=True)
+            )
+            datacite_path.with_suffix(".datacite-errors.json").write_text(
+                json.dumps(datacite_errors, indent=True)
+            )
+            datacite_path.write_text(json.dumps(datacite_records, indent=True))
 
 
 if __name__ == "__main__":

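For context, a minimal sketch of the short-record convention this commit introduces: when the DOI resolver returns something other than BibTeX for a dandiset version, a one-line commented placeholder is stored instead, so every (dandiset, version) pair gets a consistent entry. The actual validity check sits in lines elided between hunks, so the startswith test and the sample values below are assumptions, not the script's exact logic:

    # Hypothetical values; the real check on `bibtex` is elided in the diff above.
    identifier, version_id = "000003", "0.230629.1955"
    bibtex = "<html>Not Found</html>"  # a non-BibTeX response body

    if not bibtex.lstrip().startswith("@"):  # assumed stand-in for the elided check
        # Same placeholder format as the added lines in the diff
        record = (
            f"# No valid BibTeX for {identifier}/{version_id}. "
            f"Starts with {bibtex.splitlines()[0][:20]}"
        )
    else:
        # Same DOI-prefix shortening the diff applies to valid records
        record = bibtex.replace("@misc{https://doi.org/10.48324/", "@misc{")

Either way, record is what ends up in bibtex_results[identifier][version_id], which keeps the BibTeX output loop in main() uniform.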
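Likewise, a hedged sketch of the new calling convention: fetch_dandisets now returns a (bibtex_results, metadata_records) tuple, and metadata is fetched whenever --metadata or --datacite is requested. The argument values below are hypothetical, and the exact format of the --me value is not shown in the diff:

    # Illustrative only: exercising the return contract from the diff above.
    bibtex_results, metadata_records = fetch_dandisets(
        "some-user",          # hypothetical --me value
        bibtex_results=None,  # or a dict loaded from a prior --results JSON cache
        get_metadata=True,    # truthy when --metadata or --datacite was given
    )
    for dataset, versions in bibtex_results.items():
        latest = versions.get(None)  # key None holds the "cite the latest version" record

Note that main() passes args.metadata or args.datacite as get_metadata, i.e. a truthy value rather than a strict bool; fetch_dandisets only ever tests it for truth, so this works as intended.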