From 16553681c9bded139df02d8deddfbb2b00d74bd6 Mon Sep 17 00:00:00 2001
From: Raghavendra M Dani
Date: Fri, 25 Oct 2024 16:41:20 -0700
Subject: [PATCH] Fix divide by zero error when pyarrow table size comes out 0 (#368)

* Add a case when in-memory-size is also 0

* Add UTs

* bump version
---
 deltacat/__init__.py                          |  2 +-
 deltacat/compute/resource_estimation/delta.py |  2 +-
 .../compute/resource_estimation/test_delta.py | 37 +++++++++++++++++++
 3 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/deltacat/__init__.py b/deltacat/__init__.py
index 1f19744e..be70e67d 100644
--- a/deltacat/__init__.py
+++ b/deltacat/__init__.py
@@ -44,7 +44,7 @@
 deltacat.logs.configure_deltacat_logger(logging.getLogger(__name__))
 
 
-__version__ = "1.1.26"
+__version__ = "1.1.27"
 
 
 __all__ = [
diff --git a/deltacat/compute/resource_estimation/delta.py b/deltacat/compute/resource_estimation/delta.py
index 97f137a1..62500552 100644
--- a/deltacat/compute/resource_estimation/delta.py
+++ b/deltacat/compute/resource_estimation/delta.py
@@ -188,7 +188,7 @@ def _estimate_resources_required_to_process_delta_using_file_sampling(
         sampled_on_disk_size += delta.manifest.entries[entry_index].meta.content_length
         sampled_num_rows += len(tbl)
 
-    if not sampled_on_disk_size:
+    if not sampled_on_disk_size or not sampled_in_memory_size:
         return EstimatedResources.of(
             memory_bytes=0,
             statistics=Statistics.of(
diff --git a/deltacat/tests/compute/resource_estimation/test_delta.py b/deltacat/tests/compute/resource_estimation/test_delta.py
index 8c6acc58..aeab34c6 100644
--- a/deltacat/tests/compute/resource_estimation/test_delta.py
+++ b/deltacat/tests/compute/resource_estimation/test_delta.py
@@ -437,6 +437,43 @@ def test_delta_manifest_parquet_when_file_sampling(
             == parquet_delta_with_manifest.meta.content_length
         )
 
+    def test_parquet_delta_when_file_sampling_and_arrow_size_zero(
+        self,
+        local_deltacat_storage_kwargs,
+        parquet_delta_with_manifest: Delta,
+        monkeypatch,
+    ):
+        params = EstimateResourcesParams.of(
+            resource_estimation_method=ResourceEstimationMethod.FILE_SAMPLING,
+            max_files_to_sample=2,
+        )
+
+        def mock_func(*args, **kwargs):
+            class MockedValue:
+                nbytes = 0
+
+                def __len__(self):
+                    return 0
+
+            return MockedValue()
+
+        monkeypatch.setattr(ds, "download_delta_manifest_entry", mock_func)
+
+        result = estimate_resources_required_to_process_delta(
+            delta=parquet_delta_with_manifest,
+            operation_type=OperationType.PYARROW_DOWNLOAD,
+            deltacat_storage=ds,
+            deltacat_storage_kwargs=local_deltacat_storage_kwargs,
+            estimate_resources_params=params,
+        )
+
+        assert parquet_delta_with_manifest.manifest is not None
+        assert result.memory_bytes == 0
+        assert (
+            result.statistics.on_disk_size_bytes
+            == parquet_delta_with_manifest.meta.content_length
+        )
+
     def test_delta_manifest_utsv_when_file_sampling(
         self, local_deltacat_storage_kwargs, utsv_delta_with_manifest: Delta
     ):