Skip to content

Commit

Permalink
Add support for merging directories in hybrid packager
Browse files Browse the repository at this point in the history
Signed-off-by: smajumdar <[email protected]>
  • Loading branch information
titu1994 committed Jan 22, 2025
1 parent 11e0d2f commit 3c552aa
Show file tree
Hide file tree
Showing 2 changed files with 103 additions and 10 deletions.
40 changes: 30 additions & 10 deletions src/nemo_run/core/packaging/hybrid.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict
from typing import Dict, List, Union

from invoke.context import Context

Expand All @@ -16,7 +16,7 @@ class HybridPackager(Packager):
the top-level folder under which that packager’s content is placed.
"""

sub_packagers: Dict[str, Packager] = field(default_factory=dict)
sub_packagers: Dict[str, Union[Packager, List[Packager]]] = field(default_factory=dict)

def package(self, path: Path, job_dir: str, name: str) -> str:
final_tar_gz = os.path.join(job_dir, f"{name}.tar.gz")
Expand All @@ -28,18 +28,38 @@ def package(self, path: Path, job_dir: str, name: str) -> str:
ctx = Context()
ctx.run(f"tar -cf {tmp_tar} --files-from /dev/null")

# Defer deletion of temporary files until all subpackagers have been processed
subarchive_list = set([])
tmp_extract_dir_list = set([])

# For each subpackager, run its .package() method and extract to a subfolder
for folder_name, packager in self.sub_packagers.items():
subarchive_path = packager.package(path, job_dir, f"{name}_{folder_name}")
for folder_name, packagers in self.sub_packagers.items():
if not isinstance(packagers, list):
packagers = [packagers]

for packager in packagers:
subarchive_path = packager.package(path, job_dir, f"{name}_{folder_name}")
subarchive_list.add(subarchive_path)

# Create a temp folder, extract subarchive content into it,
# then add that folder to the final tar under the desired subpath
tmp_extract_dir = os.path.join(job_dir, f"__extract_{folder_name}")
tmp_extract_dir_list.add(tmp_extract_dir)
os.makedirs(tmp_extract_dir, exist_ok=True)

# Create a temp folder, extract subarchive content into it,
# then add that folder to the final tar under the desired subpath
tmp_extract_dir = os.path.join(job_dir, f"__extract_{folder_name}")
os.makedirs(tmp_extract_dir, exist_ok=True)
ctx.run(f"tar -xf {subarchive_path} -C {tmp_extract_dir}")

ctx.run(f"tar -xf {subarchive_path} -C {tmp_extract_dir}")
ctx.run(f"tar -rf {tmp_tar} -C {tmp_extract_dir} . --transform='s,^,{folder_name}/,'")
# If a folder name is provided, add the content under that folder
if folder_name != '':
ctx.run(f"tar -rf {tmp_tar} -C {tmp_extract_dir} . --transform='s,^,{folder_name}/,'")
else:
# Otherwise, add the content directly to the root of the tar
ctx.run(f"tar -rf {tmp_tar} -C {tmp_extract_dir} .")

for tmp_extract_dir in tmp_extract_dir_list:
ctx.run(f"rm -rf {tmp_extract_dir}")

for subarchive_path in subarchive_list:
ctx.run(f"rm {subarchive_path}")

# Finally, compress the combined tar
Expand Down
73 changes: 73 additions & 0 deletions test/core/packaging/test_hybrid.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,76 @@ def test_hybrid_packager(mock_subpackager_one, mock_subpackager_two, tmp_path):
os.path.join(extract_dir, "2"),
)
assert not cmp.diff_files


@patch("nemo_run.core.packaging.hybrid.Context", MockContext)
def test_hybrid_packager_root_default(mock_subpackager_one, mock_subpackager_two, tmp_path):
    """Check packaging when one sub-packager is registered under the "" key.

    The first mock sub-packager uses the empty-string folder name and is
    compared against the root of the extracted archive; the second uses
    folder "2" and is compared against that subfolder. Each comparison
    asserts that no file present on both sides differs in content.
    """
    hybrid = HybridPackager(
        sub_packagers={
            "": mock_subpackager_one,
            "2": mock_subpackager_two,
        }
    )
    with tempfile.TemporaryDirectory() as job_dir:
        output_tar = hybrid.package(Path(tmp_path), job_dir, "hybrid_test")
        assert os.path.exists(output_tar)

        # Unpack the combined archive so its layout can be inspected on disk.
        extract_dir = os.path.join(job_dir, "hybrid_extracted")
        os.makedirs(extract_dir, exist_ok=True)
        subprocess.run(["tar", "-xzf", output_tar, "-C", extract_dir], check=True)

        # (sub-packager, directory in the extracted tree to compare against)
        expectations = (
            (mock_subpackager_one, os.path.join(extract_dir, "")),  # archive root
            (mock_subpackager_two, os.path.join(extract_dir, "2")),
        )
        for sub_packager, extracted_path in expectations:
            source_dir = os.path.dirname(sub_packager.package.return_value)
            cmp = filecmp.dircmp(source_dir, extracted_path)
            assert not cmp.diff_files


@patch("nemo_run.core.packaging.hybrid.Context", MockContext)
def test_hybrid_packager_root_combined(mock_subpackager_one, mock_subpackager_two, tmp_path):
    """Check packaging when the "" key holds a list of sub-packagers.

    Both mock sub-packagers are registered together under the empty-string
    folder name and are each compared against the root of the extracted
    archive; the first mock is additionally registered under folder "1" and
    compared against that subfolder. Each comparison asserts that no file
    present on both sides differs in content.
    """
    hybrid = HybridPackager(
        sub_packagers={
            "": [mock_subpackager_one, mock_subpackager_two],
            "1": mock_subpackager_one,
        }
    )
    with tempfile.TemporaryDirectory() as job_dir:
        output_tar = hybrid.package(Path(tmp_path), job_dir, "hybrid_test")
        assert os.path.exists(output_tar)

        # Unpack the combined archive so its layout can be inspected on disk.
        extract_dir = os.path.join(job_dir, "hybrid_extracted")
        os.makedirs(extract_dir, exist_ok=True)
        subprocess.run(["tar", "-xzf", output_tar, "-C", extract_dir], check=True)

        # (sub-packager, directory in the extracted tree to compare against)
        expectations = (
            (mock_subpackager_one, os.path.join(extract_dir, "")),  # archive root
            (mock_subpackager_two, extract_dir),  # archive root
            (mock_subpackager_one, os.path.join(extract_dir, "1")),
        )
        for sub_packager, extracted_path in expectations:
            source_dir = os.path.dirname(sub_packager.package.return_value)
            cmp = filecmp.dircmp(source_dir, extracted_path)
            assert not cmp.diff_files

0 comments on commit 3c552aa

Please sign in to comment.