Skip to content

Commit

Permalink
Merge pull request #25 from Sage-Bionetworks-Workflows/bgrande/ORCA-1…
Browse files Browse the repository at this point in the history
…67/lazy-staging

[ORCA-167] Stage files lazily
  • Loading branch information
Bruno Grande authored Apr 3, 2023
2 parents 70d919f + cceede9 commit 37c1d40
Show file tree
Hide file tree
Showing 6 changed files with 74 additions and 74 deletions.
4 changes: 3 additions & 1 deletion src/dcqc/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,9 @@ def stage(
if self._local_path is not None:
return self._local_path
else:
destination_str = mkdtemp()
# TODO: This prefix is used by nf-dcqc to easily find the staged file.
# It might be worth using a DCQCTMPDIR to avoid hard-coding this.
destination_str = mkdtemp(prefix="dcqc-staged-")
destination = Path(destination_str)

# By this point, destination is defined (not None)
Expand Down
38 changes: 8 additions & 30 deletions src/dcqc/main.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import sys
from csv import DictWriter
from pathlib import Path
from typing import List, Optional
from typing import List

from typer import Argument, Exit, Option, Typer

Expand All @@ -28,11 +28,8 @@

# Common options
overwrite_opt = Option(False, "--overwrite", "-f", help="Ignore existing files")
required_tests_opt = Option(None, "--required-tests", "-rt", help="Required tests")
skipped_tests_opt = Option(None, "--skipped-tests", "-st", help="Skipped tests")
stage_files_opt = Option(False, "--stage-files", "-sf", help="Stage remote files.")
prt_help = "Update paths to be relative to given directory upon serialization."
paths_relative_to_opt = Option(None, "--paths-relative-to", "-prt", help=prt_help)
required_tests_opt = Option(None, "--required-tests", "-r", help="Required tests")
skipped_tests_opt = Option(None, "--skipped-tests", "-s", help="Skipped tests")


@app.callback()
Expand All @@ -48,12 +45,11 @@ def create_targets(
input_csv: Path = input_path_arg,
output_dir: Path = output_dir_path_arg,
overwrite: bool = overwrite_opt,
stage_files: bool = stage_files_opt,
):
"""Create target JSON files from a targets CSV file"""
output_dir.mkdir(parents=True, exist_ok=True)

parser = CsvParser(input_csv, stage_files)
parser = CsvParser(input_csv)
targets = parser.create_targets()

# Naming the targets by index to ensure no clashes
Expand All @@ -63,24 +59,6 @@ def create_targets(
report.save_many(named_targets, output_dir.as_posix(), overwrite)


@app.command()
def stage_target(
input_json: Path = input_path_arg,
output_json: str = output_arg,
output_dir: Path = output_dir_path_arg,
overwrite: bool = overwrite_opt,
paths_relative_to: Optional[Path] = paths_relative_to_opt,
):
"""Create local file copies from a target JSON file"""
output_dir.mkdir(parents=True, exist_ok=True)

target = JsonParser.parse_object(input_json, Target)
target.stage(output_dir, overwrite)

report = JsonReport(paths_relative_to)
report.save(target, output_json, overwrite)


@app.command()
def create_tests(
input_json: Path = input_path_arg,
Expand All @@ -93,8 +71,8 @@ def create_tests(
output_dir.mkdir(parents=True, exist_ok=True)

# Interpret empty lists from CLI as None (to auto-generate values)
required_tests_maybe = required_tests if len(required_tests) > 0 else None
skipped_tests_maybe = skipped_tests if len(skipped_tests) > 0 else None
required_tests_maybe = required_tests if required_tests else None
skipped_tests_maybe = skipped_tests if skipped_tests else None

target = JsonParser.parse_object(input_json, Target)
suite = SuiteABC.from_target(target, required_tests_maybe, skipped_tests_maybe)
Expand Down Expand Up @@ -150,8 +128,8 @@ def create_suite(
):
"""Create a suite from a set of test JSON files sharing the same target"""
# Interpret empty lists from CLI as None (to auto-generate values)
required_tests_maybe = required_tests if len(required_tests) > 0 else None
skipped_tests_maybe = skipped_tests if len(skipped_tests) > 0 else None
required_tests_maybe = required_tests if required_tests else None
skipped_tests_maybe = skipped_tests if skipped_tests else None

tests = [JsonParser.parse_object(test_json, TestABC) for test_json in input_jsons]
suite = SuiteABC.from_tests(tests, required_tests_maybe, skipped_tests_maybe)
Expand Down
36 changes: 34 additions & 2 deletions src/dcqc/tests/test_abc.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,40 @@ def get_status(self, compute_ok: bool = True) -> TestStatus:
self._status = self.compute_status()
return self._status

def _get_single_target_file(self) -> File:
files = self.target.files
def get_files(self, staged: bool = True) -> list[File]:
"""Get and stage files for target.
Args:
staged: Whether to make sure that the files are staged.
Defaults to True.
Returns:
Staged target files.
"""
files = []
for file in self.target.files:
if staged:
file.stage()
files.append(file)
return files

def get_file(self, staged: bool = True) -> File:
"""Get and stage file for single-file target.
Args:
staged: Whether to make sure that the files are staged.
Defaults to True.
Raises:
ValueError: If the target has multiple files.
Returns:
Staged target file.
"""
files = self.get_files(staged)
if len(files) != 1:
message = "This method only supports single-file targets."
raise ValueError(message)
return files[0]

@classmethod
Expand Down
46 changes: 20 additions & 26 deletions src/dcqc/tests/tests.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import hashlib
import json
from pathlib import Path

from dcqc.file import File
from dcqc.tests.test_abc import ExternalTestMixin, Process, TestABC, TestStatus


Expand All @@ -11,7 +11,7 @@ class FileExtensionTest(TestABC):

def compute_status(self) -> TestStatus:
status = TestStatus.PASS
for file in self.target.files:
for file in self.get_files(staged=False):
file_type = file.get_file_type()
file_extensions = file_type.file_extensions
if not file.name.endswith(file_extensions):
Expand All @@ -26,18 +26,17 @@ class Md5ChecksumTest(TestABC):

def compute_status(self) -> TestStatus:
status = TestStatus.PASS
for file in self.target.files:
for file in self.get_files():
expected_md5 = file.get_metadata("md5_checksum")
actual_md5 = self._compute_md5_checksum(file)
actual_md5 = self._compute_md5_checksum(file.local_path)
if expected_md5 != actual_md5:
status = TestStatus.FAIL
break
return status

def _compute_md5_checksum(self, file: File) -> str:
local_path = file.local_path
def _compute_md5_checksum(self, path: Path) -> str:
hash_md5 = hashlib.md5()
with local_path.open("rb") as infile:
with path.open("rb") as infile:
for chunk in iter(lambda: infile.read(4096), b""):
hash_md5.update(chunk)
actual_md5 = hash_md5.hexdigest()
Expand All @@ -50,16 +49,15 @@ class JsonLoadTest(TestABC):

def compute_status(self) -> TestStatus:
status = TestStatus.PASS
for file in self.target.files:
if not self._can_be_loaded(file):
for file in self.get_files():
if not self._can_be_loaded(file.local_path):
status = TestStatus.FAIL
break
return status

def _can_be_loaded(self, file: File) -> bool:
def _can_be_loaded(self, path: Path) -> bool:
success = True
local_path = file.local_path
with local_path.open("r") as infile:
with path.open("r") as infile:
try:
json.load(infile)
except Exception:
Expand All @@ -73,19 +71,18 @@ class JsonLdLoadTest(TestABC):

def compute_status(self) -> TestStatus:
status = TestStatus.PASS
for file in self.target.files:
if not self._can_be_loaded(file):
for file in self.get_files():
if not self._can_be_loaded(file.local_path):
status = TestStatus.FAIL
break
return status

def _can_be_loaded(self, file: File) -> bool:
def _can_be_loaded(self, path: Path) -> bool:
rdflib = self.import_module("rdflib")
graph = rdflib.Graph()

success = True
local_path = file.local_path
with local_path.open("r") as infile:
with path.open("r") as infile:
try:
graph.parse(infile, format="json-ld")
except Exception:
Expand All @@ -97,9 +94,8 @@ class LibTiffInfoTest(ExternalTestMixin, TestABC):
tier = 2

def generate_process(self) -> Process:
file = self._get_single_target_file()
path = file.local_path.as_posix()
command_args = ["tiffinfo", path]
file = self.get_file()
command_args = ["tiffinfo", file.local_path.as_posix()]
process = Process(
container="quay.io/sagebionetworks/libtiff:2.0",
command_args=command_args,
Expand All @@ -111,14 +107,13 @@ class BioFormatsInfoTest(ExternalTestMixin, TestABC):
tier = 2

def generate_process(self) -> Process:
file = self._get_single_target_file()
path = file.local_path.as_posix()
file = self.get_file()
command_args = [
"/opt/bftools/showinf",
"-nopix",
"-novalid",
"-nocore",
path,
file.local_path.as_posix(),
]
process = Process(
container="quay.io/sagebionetworks/bftools:latest",
Expand All @@ -131,11 +126,10 @@ class OmeXmlSchemaTest(ExternalTestMixin, TestABC):
tier = 2

def generate_process(self) -> Process:
file = self._get_single_target_file()
path = file.local_path.as_posix()
file = self.get_file()
command_args = [
"/opt/bftools/xmlvalid",
path,
file.local_path.as_posix(),
]
process = Process(
container="quay.io/sagebionetworks/bftools:latest",
Expand Down
16 changes: 1 addition & 15 deletions tests/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,27 +55,13 @@ def test_create_targets(get_data, get_output):
assert len(list(output_dir.iterdir())) > 0


def test_stage_target(get_data, get_output):
input_json = get_data("target.json")
output_json = get_output("stage_target/target.staged.json")
output_dir = get_output("stage_target/targets")
output_json.unlink(missing_ok=True)
shutil.rmtree(output_dir, ignore_errors=True)

assert not output_dir.exists()
args = ["stage-target", "-prt", ".", input_json, output_json, output_dir]
result = run_command(args)
check_command_result(result)
assert len(list(output_dir.iterdir())) > 0


def test_create_tests(get_data, get_output):
input_json = get_data("target.json")
output_dir = get_output("create_tests")
shutil.rmtree(output_dir, ignore_errors=True)

assert not output_dir.exists()
args = ["create-tests", "-rt", "Md5ChecksumTest", input_json, output_dir]
args = ["create-tests", "-r", "Md5ChecksumTest", input_json, output_dir]
result = run_command(args)
check_command_result(result)
assert len(list(output_dir.iterdir())) > 0
Expand Down
8 changes: 8 additions & 0 deletions tests/test_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,3 +190,11 @@ def test_that_a_process_can_be_serialized_and_deserialized():
process_dict = process.to_dict()
process_from_dict = Process.from_dict(process_dict)
assert process_dict == process_from_dict.to_dict()


def test_for_an_error_when_getting_one_file_from_multi_file_target(test_files):
file = test_files["good"]
target = Target(file, file)
test = tests.FileExtensionTest(target)
with pytest.raises(ValueError):
test.get_file()

0 comments on commit 37c1d40

Please sign in to comment.