From e5c7b3cffae3539144a7f92fa14e2f2a57d1b0e7 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Mon, 2 Dec 2024 13:15:08 -0800 Subject: [PATCH 1/5] Add cachedir argument and make sure directories exist --- src/toil/cwl/cwltoil.py | 4 ++++ src/toil/options/cwl.py | 9 +++++++++ 2 files changed, 13 insertions(+) diff --git a/src/toil/cwl/cwltoil.py b/src/toil/cwl/cwltoil.py index 3064641519..22c12db560 100644 --- a/src/toil/cwl/cwltoil.py +++ b/src/toil/cwl/cwltoil.py @@ -4261,8 +4261,12 @@ def main(args: Optional[list[str]] = None, stdout: TextIO = sys.stdout) -> int: runtime_context.move_outputs = "leave" runtime_context.rm_tmpdir = False runtime_context.streaming_allowed = not options.disable_streaming + if options.cachedir is not None: + runtime_context.cachedir = os.path.abspath(options.cachedir) if options.mpi_config_file is not None: runtime_context.mpi_config = MpiConfig.load(options.mpi_config_file) + if cwltool.main.check_working_directories(runtime_context) is not None: + return 1 setattr(runtime_context, "bypass_file_store", options.bypass_file_store) if options.bypass_file_store and options.destBucket: # We use the file store to write to buckets, so we can't do this (yet?) diff --git a/src/toil/options/cwl.py b/src/toil/options/cwl.py index 6304e75745..08d0327f5e 100644 --- a/src/toil/options/cwl.py +++ b/src/toil/options/cwl.py @@ -419,3 +419,12 @@ def add_cwl_options(parser: ArgumentParser, suppress: bool = True) -> None: type=str, help=suppress_help or "Specify a cloud bucket endpoint for output files.", ) + parser.add_argument( + "--cachedir", + type=str, + help=suppress_help + or "Directory to cache intermediate workflow outputs to avoid " + "recomputing steps. Can be very helpful in the development and " + "troubleshooting of CWL documents.", + dest="cachedir" + ) From 0cf19c5ffdfac0ec9389cecbda0a34e13932e38f Mon Sep 17 00:00:00 2001 From: stxue1 Date: Fri, 6 Dec 2024 10:59:40 -0800 Subject: [PATCH 2/5] Add a test --- src/toil/test/cwl/cwlTest.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/toil/test/cwl/cwlTest.py b/src/toil/test/cwl/cwlTest.py index 5b8a13733e..cc911f499e 100644 --- a/src/toil/test/cwl/cwlTest.py +++ b/src/toil/test/cwl/cwlTest.py @@ -698,6 +698,29 @@ def path_with_bogus_rev() -> str: except subprocess.CalledProcessError: pass + def test_caching(self) -> None: + log.info("Running CWL Test Cache.") + from toil.cwl import cwltoil + + outDir = self._createTempDir() + cwlDir = os.path.join(self._projectRootPath(), "src", "toil", "test", "cwl") + cmd = [ + "--outdir", + outDir, + "--jobStore", + os.path.join(outDir, "jobStore"), + "--no-container", + "--cachedir", + "cache", + os.path.join(cwlDir, "revsort.cwl"), + os.path.join(cwlDir, "revsort-job.json"), + ] + # Finish the job with a correct PATH + st = StringIO() + cwltoil.main(cmd, stdout=st) + # cwltool hashes certain steps into directories, ensure it exists + assert os.path.exists(os.path.join(cwlDir, "cache", "92f7d79ad270b174174343085c5456c6")) + @needs_aws_s3 def test_streamable(self, extra_args: Optional[list[str]] = None) -> None: """ From 4d33ae5c882c7f53754813656bae041fcb688307 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Wed, 18 Dec 2024 14:28:42 -0800 Subject: [PATCH 3/5] Do a copy when not bypassing filestore and when caching is enabled --- src/toil/cwl/cwltoil.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/toil/cwl/cwltoil.py b/src/toil/cwl/cwltoil.py index 22c12db560..481c78ebf0 100644 --- a/src/toil/cwl/cwltoil.py +++ b/src/toil/cwl/cwltoil.py @@ -4447,7 +4447,12 @@ def main(args: Optional[list[str]] = None, stdout: TextIO = sys.stdout) -> int: if not options.bypass_file_store: # If we're using the file store we need to start moving output # files now. - runtime_context.move_outputs = "move" + # But if caching is enabled we have to leave files in the cache directory, + # so do a copy if so. + if runtime_context.cachedir is not None: + runtime_context.move_outputs = "copy" + else: + runtime_context.move_outputs = "move" # We instantiate an early builder object here to populate indirect # secondaryFile references using cwltool's library because we need From 8284cd21dc81670752875fc8cf819023ce5c149a Mon Sep 17 00:00:00 2001 From: stxue1 Date: Thu, 19 Dec 2024 09:45:55 -0800 Subject: [PATCH 4/5] Fix test --- src/toil/test/cwl/cwlTest.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/toil/test/cwl/cwlTest.py b/src/toil/test/cwl/cwlTest.py index cc911f499e..80a547524a 100644 --- a/src/toil/test/cwl/cwlTest.py +++ b/src/toil/test/cwl/cwlTest.py @@ -717,9 +717,10 @@ def test_caching(self) -> None: ] # Finish the job with a correct PATH st = StringIO() - cwltoil.main(cmd, stdout=st) + ret = cwltoil.main(cmd, stdout=st) + assert ret == 0 # cwltool hashes certain steps into directories, ensure it exists - assert os.path.exists(os.path.join(cwlDir, "cache", "92f7d79ad270b174174343085c5456c6")) + assert os.path.exists(os.path.join(cwlDir, "cache", "9da28e219a61b062824576503f88b863")) @needs_aws_s3 def test_streamable(self, extra_args: Optional[list[str]] = None) -> None: From f6a6e4656b2ea807ee89f60395bc94f2ee894336 Mon Sep 17 00:00:00 2001 From: stxue1 Date: Tue, 7 Jan 2025 16:41:40 -0800 Subject: [PATCH 5/5] Add cache test --- src/toil/test/cwl/cwlTest.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/toil/test/cwl/cwlTest.py b/src/toil/test/cwl/cwlTest.py index 80a547524a..77e1325429 100644 --- a/src/toil/test/cwl/cwlTest.py +++ b/src/toil/test/cwl/cwlTest.py @@ -1222,6 +1222,14 @@ def test_run_conformance_with_caching(self) -> None: junit_file=os.path.join(self.rootDir, "caching-conformance-1.2.junit.xml"), ) + @slow + @pytest.mark.timeout(CONFORMANCE_TEST_TIMEOUT) + def test_run_conformance_with_task_caching(self) -> None: + self.test_run_conformance( + junit_file=os.path.join(self.rootDir, "task-caching-conformance-1.2.junit.xml"), + extra_args=["--cachedir", self._createTempDir("task_cache")] + ) + @slow @pytest.mark.timeout(CONFORMANCE_TEST_TIMEOUT) def test_run_conformance_with_in_place_update(self) -> None: