diff --git a/kedro/framework/cli/catalog.py b/kedro/framework/cli/catalog.py index 417a168a5e..7b8249170f 100644 --- a/kedro/framework/cli/catalog.py +++ b/kedro/framework/cli/catalog.py @@ -16,6 +16,7 @@ from kedro.framework.session import KedroSession from kedro.framework.startup import ProjectMetadata from kedro.io import AbstractDataset +from kedro.io.data_catalog import DataCatalog def _create_session(package_name: str, **kwargs: Any) -> KedroSession: @@ -231,8 +232,8 @@ def resolve_patterns(metadata: ProjectMetadata, env: str) -> None: session = _create_session(metadata.package_name, env=env) context = session.load_context() - data_catalog = context.catalog catalog_config = context.config_loader["catalog"] + data_catalog = DataCatalog.from_config(catalog_config) explicit_datasets = { ds_name: ds_config @@ -264,7 +265,6 @@ def resolve_patterns(metadata: ProjectMetadata, env: str) -> None: ds_config = data_catalog._resolve_config( ds_name, matched_pattern, ds_config_copy ) - explicit_datasets[ds_name] = ds_config secho(yaml.dump(explicit_datasets)) diff --git a/tests/framework/cli/test_catalog.py b/tests/framework/cli/test_catalog.py index 604c086e85..df74cfd015 100644 --- a/tests/framework/cli/test_catalog.py +++ b/tests/framework/cli/test_catalog.py @@ -33,9 +33,29 @@ def fake_catalog_config(): config = { "parquet_{factory_pattern}": { "type": "pandas.ParquetDataset", - "filepath": "test.pq", + "filepath": "data/01_raw/{factory_pattern}.pq", }, - "csv_{factory_pattern}": {"type": "pandas.CSVDataset", "filepath": "test.csv"}, + "csv_{factory_pattern}": { + "type": "pandas.CSVDataset", + "filepath": "data/01_raw/{factory_pattern}.csv", + }, + "csv_test": {"type": "pandas.CSVDataset", "filepath": "test.csv"}, + } + return config + + +@pytest.fixture +def fake_catalog_config_resolved(): + config = { + "parquet_example": { + "type": "pandas.ParquetDataset", + "filepath": "data/01_raw/example.pq", + }, + "csv_example": { + "type": "pandas.CSVDataset", + "filepath": "data/01_raw/example.csv", + }, + "csv_test": {"type": "pandas.CSVDataset", "filepath": "test.csv"}, } return config @@ -68,17 +88,56 @@ def fake_catalog_with_overlapping_factories(): @pytest.fixture -def fake_catalog_config_with_resolvable_dataset(): +def fake_catalog_config_with_factories(fake_metadata): config = { "parquet_{factory_pattern}": { "type": "pandas.ParquetDataset", - "filepath": "test.pq", + "filepath": "data/01_raw/{factory_pattern}.pq", + }, + "csv_{factory_pattern}": { + "type": "pandas.CSVDataset", + "filepath": "data/01_raw/{factory_pattern}.csv", }, - "csv_{factory_pattern}": {"type": "pandas.CSVDataset", "filepath": "test.csv"}, "explicit_ds": {"type": "pandas.CSVDataset", "filepath": "test.csv"}, "{factory_pattern}_ds": { "type": "pandas.ParquetDataset", - "filepath": "test.pq", + "filepath": "data/01_raw/{factory_pattern}_ds.pq", + }, + "partitioned_{factory_pattern}": { + "type": "partitions.PartitionedDataset", + "path": "data/01_raw", + "dataset": "pandas.CSVDataset", + "metadata": { + "my-plugin": { + "path": "data/01_raw", + } + }, + }, + } + return config + + +@pytest.fixture +def fake_catalog_config_with_factories_resolved(): + config = { + "parquet_example": { + "type": "pandas.ParquetDataset", + "filepath": "data/01_raw/example.pq", + }, + "csv_example": { + "type": "pandas.CSVDataset", + "filepath": "data/01_raw/example.csv", + }, + "explicit_ds": {"type": "pandas.CSVDataset", "filepath": "test.csv"}, + "partitioned_example": { + "type": "partitions.PartitionedDataset", + "path": "data/01_raw", + "dataset": "pandas.CSVDataset", + "metadata": { + "my-plugin": { + "path": "data/01_raw", + } + }, }, } return config @@ -231,14 +290,16 @@ def test_list_factory_generated_datasets( ["catalog", "list"], obj=fake_metadata, ) - assert not result.exit_code expected_dict = { f"Datasets in '{PIPELINE_NAME}' pipeline": { "Datasets generated from factories": { "pandas.CSVDataset": ["csv_example"], "pandas.ParquetDataset": ["parquet_example"], - } + }, + "Datasets mentioned in pipeline": { + "CSVDataset": ["csv_test"], + }, } } key = f"Datasets in '{PIPELINE_NAME}' pipeline" @@ -403,192 +464,173 @@ def test_bad_env(self, fake_project_cli, fake_metadata): assert "Unable to instantiate Kedro Catalog" in result.output -@pytest.mark.usefixtures( - "chdir_to_dummy_project", "fake_load_context", "mock_pipelines" -) -def test_rank_catalog_factories( - fake_project_cli, - fake_metadata, - mocker, - fake_load_context, - fake_catalog_with_overlapping_factories, -): - yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML") - mocked_context = fake_load_context.return_value - mocked_context.catalog = DataCatalog.from_config( - fake_catalog_with_overlapping_factories - ) - - result = CliRunner().invoke( - fake_project_cli, ["catalog", "rank"], obj=fake_metadata - ) - assert not result.exit_code - - expected_patterns_sorted = [ - "an_example_{place}_{holder}", - "an_example_{placeholder}", - "an_{example_placeholder}", - "on_{example_placeholder}", - ] - - assert yaml_dump_mock.call_count == 1 - assert yaml_dump_mock.call_args[0][0] == expected_patterns_sorted +@pytest.mark.usefixtures("chdir_to_dummy_project", "fake_load_context") +class TestCatalogFactoryCommands: + @pytest.mark.usefixtures("mock_pipelines") + def test_rank_catalog_factories( + self, + fake_project_cli, + fake_metadata, + mocker, + fake_load_context, + fake_catalog_with_overlapping_factories, + ): + yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML") + mocked_context = fake_load_context.return_value + mocked_context.catalog = DataCatalog.from_config( + fake_catalog_with_overlapping_factories + ) + print("!!!!", mocked_context.catalog._dataset_patterns) + result = CliRunner().invoke( + fake_project_cli, ["catalog", "rank"], obj=fake_metadata + ) + assert not result.exit_code + expected_patterns_sorted = [ + "an_example_{place}_{holder}", + "an_example_{placeholder}", + "an_{example_placeholder}", + "on_{example_placeholder}", + ] -@pytest.mark.usefixtures( - "chdir_to_dummy_project", - "fake_load_context", -) -def test_rank_catalog_factories_with_no_factories( - fake_project_cli, fake_metadata, fake_load_context -): - mocked_context = fake_load_context.return_value - - catalog_datasets = { - "iris_data": CSVDataset(filepath="test.csv"), - "intermediate": MemoryDataset(), - "not_used": CSVDataset(filepath="test2.csv"), - } - mocked_context.catalog = DataCatalog(datasets=catalog_datasets) + assert yaml_dump_mock.call_count == 1 + assert yaml_dump_mock.call_args[0][0] == expected_patterns_sorted - result = CliRunner().invoke( - fake_project_cli, ["catalog", "rank"], obj=fake_metadata - ) + def test_rank_catalog_factories_with_no_factories( + self, fake_project_cli, fake_metadata, fake_load_context + ): + mocked_context = fake_load_context.return_value - assert not result.exit_code - expected_output = "There are no dataset factories in the catalog." - assert expected_output in result.output + catalog_datasets = { + "iris_data": CSVDataset(filepath="test.csv"), + "intermediate": MemoryDataset(), + "not_used": CSVDataset(filepath="test2.csv"), + } + mocked_context.catalog = DataCatalog(datasets=catalog_datasets) + result = CliRunner().invoke( + fake_project_cli, ["catalog", "rank"], obj=fake_metadata + ) -@pytest.mark.usefixtures( - "chdir_to_dummy_project", "fake_load_context", "mock_pipelines" -) -def test_catalog_resolve( - fake_project_cli, - fake_metadata, - fake_load_context, - mocker, - mock_pipelines, - fake_catalog_config, -): - """Test that datasets factories are correctly resolved to the explicit datasets in the pipeline.""" - yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML") - mocked_context = fake_load_context.return_value - mocked_context.catalog = DataCatalog.from_config(fake_catalog_config) - - placeholder_ds = mocked_context.catalog._datasets.keys() - explicit_ds = {"csv_example", "parquet_example"} - - mocker.patch.object( - mock_pipelines[PIPELINE_NAME], - "datasets", - return_value=explicit_ds, - ) + assert not result.exit_code + expected_output = "There are no dataset factories in the catalog." + assert expected_output in result.output - result = CliRunner().invoke( - fake_project_cli, ["catalog", "resolve"], obj=fake_metadata - ) + @pytest.mark.usefixtures("mock_pipelines") + def test_catalog_resolve( + self, + fake_project_cli, + fake_metadata, + fake_load_context, + mocker, + mock_pipelines, + fake_catalog_config, + fake_catalog_config_resolved, + ): + """Test that datasets factories are correctly resolved to the explicit datasets in the pipeline.""" + mocked_context = fake_load_context.return_value + mocked_context.config_loader = {"catalog": fake_catalog_config} + mocked_context.catalog = DataCatalog.from_config(fake_catalog_config) + placeholder_ds = mocked_context.catalog._dataset_patterns.keys() + pipeline_datasets = {"csv_example", "parquet_example", "explicit_dataset"} - assert not result.exit_code - assert yaml_dump_mock.call_count == 1 + mocker.patch.object( + mock_pipelines[PIPELINE_NAME], + "datasets", + return_value=pipeline_datasets, + ) - output = yaml_dump_mock.call_args[0][0] + result = CliRunner().invoke( + fake_project_cli, ["catalog", "resolve"], obj=fake_metadata + ) - for ds in placeholder_ds: - assert ds not in output + assert not result.exit_code + resolved_config = yaml.safe_load(result.output) + assert resolved_config == fake_catalog_config_resolved - for ds in explicit_ds: - assert ds in output + for ds in placeholder_ds: + assert ds not in result.output + @pytest.mark.usefixtures("mock_pipelines") + def test_catalog_resolve_nested_config( + self, + fake_project_cli, + fake_metadata, + fake_load_context, + mocker, + mock_pipelines, + fake_catalog_config_with_factories, + fake_catalog_config_with_factories_resolved, + ): + """Test that explicit catalog entries are not overwritten by factory config.""" + mocked_context = fake_load_context.return_value + mocked_context.project_path = fake_metadata.project_path -@pytest.mark.usefixtures( - "chdir_to_dummy_project", "fake_load_context", "mock_pipelines" -) -def test_no_overwrite( - fake_project_cli, - fake_metadata, - fake_load_context, - mocker, - mock_pipelines, - fake_catalog_config_with_resolvable_dataset, -): - """Test that explicit catalog entries are not overwritten by factory config.""" - yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML") - mocked_context = fake_load_context.return_value - - mocked_context.config_loader = { - "catalog": fake_catalog_config_with_resolvable_dataset - } - mocked_context.catalog = DataCatalog.from_config( - fake_catalog_config_with_resolvable_dataset - ) + mocked_context.config_loader = {"catalog": fake_catalog_config_with_factories} + mocked_context.catalog = DataCatalog.from_config( + fake_catalog_config_with_factories + ) - mocker.patch.object( - mock_pipelines[PIPELINE_NAME], - "datasets", - return_value=mocked_context.catalog._datasets.keys() - | {"csv_example", "parquet_example"}, - ) + mocker.patch.object( + mock_pipelines[PIPELINE_NAME], + "datasets", + return_value=mocked_context.catalog._datasets.keys() + | {"csv_example", "parquet_example", "partitioned_example"}, + ) - result = CliRunner().invoke( - fake_project_cli, ["catalog", "resolve"], obj=fake_metadata - ) + result = CliRunner().invoke( + fake_project_cli, ["catalog", "resolve"], obj=fake_metadata + ) - assert not result.exit_code - assert yaml_dump_mock.call_count == 1 + assert not result.exit_code + resolved_config = yaml.safe_load(result.output) - assert ( - yaml_dump_mock.call_args[0][0]["explicit_ds"] - == fake_catalog_config_with_resolvable_dataset["explicit_ds"] - ) + assert resolved_config == fake_catalog_config_with_factories_resolved + @pytest.mark.usefixtures("mock_pipelines") + def test_no_param_datasets_in_resolve( + self, fake_project_cli, fake_metadata, fake_load_context, mocker, mock_pipelines + ): + yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML") + mocked_context = fake_load_context.return_value -@pytest.mark.usefixtures( - "chdir_to_dummy_project", "fake_load_context", "mock_pipelines" -) -def test_no_param_datasets_in_resolve( - fake_project_cli, fake_metadata, fake_load_context, mocker, mock_pipelines -): - yaml_dump_mock = mocker.patch("yaml.dump", return_value="Result YAML") - mocked_context = fake_load_context.return_value - - catalog_config = { - "iris_data": { - "type": "pandas.CSVDataset", - "filepath": "test.csv", - }, - "intermediate": {"type": "MemoryDataset"}, - } + catalog_config = { + "iris_data": { + "type": "pandas.CSVDataset", + "filepath": "test.csv", + }, + "intermediate": {"type": "MemoryDataset"}, + } - catalog_datasets = { - "iris_data": CSVDataset(filepath="test.csv"), - "intermediate": MemoryDataset(), - "parameters": MemoryDataset(), - "params:data_ratio": MemoryDataset(), - } + catalog_datasets = { + "iris_data": CSVDataset(filepath="test.csv"), + "intermediate": MemoryDataset(), + "parameters": MemoryDataset(), + "params:data_ratio": MemoryDataset(), + } - mocked_context.config_loader = {"catalog": catalog_config} - mocked_context.catalog = DataCatalog(datasets=catalog_datasets) + mocked_context.config_loader = {"catalog": catalog_config} + mocked_context.catalog = DataCatalog(datasets=catalog_datasets) - mocker.patch.object( - mock_pipelines[PIPELINE_NAME], - "datasets", - return_value=catalog_datasets.keys(), - ) + mocker.patch.object( + mock_pipelines[PIPELINE_NAME], + "datasets", + return_value=catalog_datasets.keys(), + ) - result = CliRunner().invoke( - fake_project_cli, - ["catalog", "resolve"], - obj=fake_metadata, - ) + result = CliRunner().invoke( + fake_project_cli, + ["catalog", "resolve"], + obj=fake_metadata, + ) - assert not result.exit_code - assert yaml_dump_mock.call_count == 1 + assert not result.exit_code + assert yaml_dump_mock.call_count == 1 - # 'parameters' and 'params:data_ratio' should not appear in the output - output = yaml_dump_mock.call_args[0][0] + # 'parameters' and 'params:data_ratio' should not appear in the output + output = yaml_dump_mock.call_args[0][0] - assert "parameters" not in output.keys() - assert "params:data_ratio" not in output.keys() - assert "iris_data" in output.keys() - assert "intermediate" in output.keys() + assert "parameters" not in output.keys() + assert "params:data_ratio" not in output.keys() + assert "iris_data" in output.keys() + assert "intermediate" in output.keys()