From 1806821be48f615822ce0c52cde81577a36e4383 Mon Sep 17 00:00:00 2001 From: Guillaume Tauzin <4648633+gtauzin@users.noreply.github.com> Date: Sat, 8 Feb 2025 11:51:24 +0100 Subject: [PATCH 01/15] Enable datasets_from_catalog to return factory-based datasets Signed-off-by: Guillaume Tauzin <4648633+gtauzin@users.noreply.github.com> --- .../20250208_114146_4648633+gtauzin.md | 47 +++++++++++++++++++ vizro-core/docs/pages/explanation/authors.md | 2 +- .../pages/user-guides/kedro-data-catalog.md | 31 ++++++++++-- .../vizro/integrations/kedro/_data_manager.py | 44 +++++++++++++---- .../kedro/fixtures/test_catalog.yaml | 24 +++++++--- .../kedro/test_kedro_data_manager.py | 26 ++++++++-- 6 files changed, 151 insertions(+), 23 deletions(-) create mode 100644 vizro-core/changelog.d/20250208_114146_4648633+gtauzin.md diff --git a/vizro-core/changelog.d/20250208_114146_4648633+gtauzin.md b/vizro-core/changelog.d/20250208_114146_4648633+gtauzin.md new file mode 100644 index 000000000..b1946eb20 --- /dev/null +++ b/vizro-core/changelog.d/20250208_114146_4648633+gtauzin.md @@ -0,0 +1,47 @@ + + + + + + + + +### Fixed + +- Fix a bug where datasets generated by dataset factories would not be returned by `kedro_integration.datasets_from_catalog`. ([#1001](https://github.com/mckinsey/vizro/pull/1001)) + + diff --git a/vizro-core/docs/pages/explanation/authors.md b/vizro-core/docs/pages/explanation/authors.md index 7632f5c10..2f9b4cd5c 100644 --- a/vizro-core/docs/pages/explanation/authors.md +++ b/vizro-core/docs/pages/explanation/authors.md @@ -10,7 +10,7 @@ -[Ann Marie Ward](https://github.com/AnnMarieW), [Anna Xiong](https://github.com/Anna-Xiong), [Annie Wachsmuth](https://github.com/anniecwa), [ataraexia](https://github.com/ataraexia), [axa99](https://github.com/axa99), [Bhavana Sundar](https://github.com/bhavanaeh), [Bo Xu](https://github.com/boxuboxu), [Chiara Pullem](https://github.com/chiara-sophie), [Denis Lebedev](https://github.com/DenisLebedevMcK), [Elena Fridman](https://github.com/EllenWie), [Ferida Mohammed](https://github.com/feridaaa), [Hamza Oza](https://github.com/hamzaoza), [Hansaem Park](https://github.com/sammitako), [Hilary Ivy](https://github.com/hxe00570), [Jasmine Wu](https://github.com/jazwu), [Jenelle Yonkman](https://github.com/yonkmanjl), [Jingjing Guo](https://github.com/jjguo-mck), [Juan Luis Cano Rodríguez](https://github.com/astrojuanlu), [Kee Wen Ng](https://github.com/KeeWenNgQB), [Leon Nallamuthu](https://github.com/leonnallamuthu), [Lydia Pitts](https://github.com/LydiaPitts), [Manuel Konrad](https://github.com/manuelkonrad), [Ned Letcher](https://github.com/ned2), [Nikolaos Tsaousis](https://github.com/tsanikgr), [njmcgrat](https://github.com/njmcgrat), [Oleksandr Serdiuk](https://github.com/oserdiuk-lohika), [Prateek Bajaj](https://github.com/prateekdev552), [Qiuyi Chen](https://github.com/Qiuyi-Chen), [Rashida Kanchwala](https://github.com/rashidakanchwala), [Riley Dou](https://github.com/rilieo), [Rosheen C.](https://github.com/rc678), [Sylvie Zhang](https://github.com/sylviezhang37), and [Upekesha Ngugi](https://github.com/upekesha). +[Ann Marie Ward](https://github.com/AnnMarieW), [Anna Xiong](https://github.com/Anna-Xiong), [Annie Wachsmuth](https://github.com/anniecwa), [ataraexia](https://github.com/ataraexia), [axa99](https://github.com/axa99), [Bhavana Sundar](https://github.com/bhavanaeh), [Bo Xu](https://github.com/boxuboxu), [Chiara Pullem](https://github.com/chiara-sophie), [Denis Lebedev](https://github.com/DenisLebedevMcK), [Elena Fridman](https://github.com/EllenWie), [Ferida Mohammed](https://github.com/feridaaa), [Guillaume Tauzin](https://github.com/gtauzin), [Hamza Oza](https://github.com/hamzaoza), [Hansaem Park](https://github.com/sammitako), [Hilary Ivy](https://github.com/hxe00570), [Jasmine Wu](https://github.com/jazwu), [Jenelle Yonkman](https://github.com/yonkmanjl), [Jingjing Guo](https://github.com/jjguo-mck), [Juan Luis Cano Rodríguez](https://github.com/astrojuanlu), [Kee Wen Ng](https://github.com/KeeWenNgQB), [Leon Nallamuthu](https://github.com/leonnallamuthu), [Lydia Pitts](https://github.com/LydiaPitts), [Manuel Konrad](https://github.com/manuelkonrad), [Ned Letcher](https://github.com/ned2), [Nikolaos Tsaousis](https://github.com/tsanikgr), [njmcgrat](https://github.com/njmcgrat), [Oleksandr Serdiuk](https://github.com/oserdiuk-lohika), [Prateek Bajaj](https://github.com/prateekdev552), [Qiuyi Chen](https://github.com/Qiuyi-Chen), [Rashida Kanchwala](https://github.com/rashidakanchwala), [Riley Dou](https://github.com/rilieo), [Rosheen C.](https://github.com/rc678), [Sylvie Zhang](https://github.com/sylviezhang37), and [Upekesha Ngugi](https://github.com/upekesha). with thanks to Sam Bourton and Kevin Staight for sponsorship, inspiration and guidance, diff --git a/vizro-core/docs/pages/user-guides/kedro-data-catalog.md b/vizro-core/docs/pages/user-guides/kedro-data-catalog.md index ce4b060fa..69279d47e 100644 --- a/vizro-core/docs/pages/user-guides/kedro-data-catalog.md +++ b/vizro-core/docs/pages/user-guides/kedro-data-catalog.md @@ -12,7 +12,7 @@ pip install vizro[kedro] ## Use datasets from the Kedro Data Catalog -`vizro.integrations.kedro` provides functions to help generate and process a [Kedro Data Catalog](https://docs.kedro.org/en/stable/data/index.html). Given a Kedro Data Catalog `catalog`, the general pattern to add datasets into the Vizro data manager is: +`vizro.integrations.kedro` provides functions to help generate and process a [Kedro Data Catalog](https://docs.kedro.org/en/stable/data/index.html). It supports both the original [DataCatalog](https://docs.kedro.org/en/stable/data/data_catalog.html) and the more recently introduced [KedroDataCatalog](https://docs.kedro.org/en/stable/data/index.html#kedrodatacatalog-experimental-feature). Given a Kedro Data Catalog `catalog`, the general pattern to add datasets into the Vizro data manager is: ```python from vizro.integrations import kedro as kedro_integration @@ -23,6 +23,19 @@ for dataset_name, dataset in kedro_integration.datasets_from_catalog(catalog).it data_manager[dataset_name] = dataset ``` +To add datasets that are defined using the [Kedro dataset factory](https://docs.kedro.org/en/stable/data/kedro_dataset_factories.html), `datasets_from_catalog` needs to access the pipelines that use them. + +```python +from vizro.integrations import kedro as kedro_integration +from vizro.managers import data_manager + + +pipeline = pipelines.get("my_pipeline_name") + +for dataset_name, dataset in kedro_integration.datasets_from_catalog(catalog, pipeline=pipeline).items(): + data_manager[dataset_name] = dataset +``` + This imports all datasets of type [`kedro_datasets.pandas`](https://docs.kedro.org/en/stable/kedro_datasets.html) from the Kedro `catalog` into the Vizro `data_manager`. The `catalog` variable may have been created in a number of different ways: @@ -31,6 +44,11 @@ The `catalog` variable may have been created in a number of different ways: 1. [Kedro Jupyter session](https://docs.kedro.org/en/stable/notebooks_and_ipython/kedro_and_notebooks.html). This automatically exposes `catalog`. 1. Data Catalog configuration file (`catalog.yaml`). This can create a `catalog` entirely independently of a Kedro project using [`kedro.io.DataCatalog.from_config`](https://docs.kedro.org/en/stable/kedro.io.DataCatalog.html#kedro.io.DataCatalog.from_config). +Conversely, the `pipelines` variable may have been created the following ways: + +1. Kedro project path. Vizro exposes a helper function `vizro.integrations.kedro.pipelines_from_project` to generate a `pipelines` given the path to a Kedro project. +1. [Kedro Jupyter session](https://docs.kedro.org/en/stable/notebooks_and_ipython/kedro_and_notebooks.html). This automatically exposes `pipelines`. + The full code for these different cases is given below. !!! example "Import a Kedro Data Catalog into the Vizro data manager" @@ -39,10 +57,13 @@ The full code for these different cases is given below. from vizro.integrations import kedro as kedro_integration from vizro.managers import data_manager + project_path = "/path/to/kedro/project" + catalog = kedro_integration.catalog_from_project(project_path) + pipelines = kedro_integration.catalog_from_project(project_path) - catalog = kedro_integration.catalog_from_project("/path/to/kedro/project") + pipeline = pipelines.get("my_pipeline") - for dataset_name, dataset in kedro_integration.datasets_from_catalog(catalog).items(): + for dataset_name, dataset in kedro_integration.datasets_from_catalog(catalog, pipeline=pipeline).items(): data_manager[dataset_name] = dataset ``` @@ -51,7 +72,9 @@ The full code for these different cases is given below. from vizro.managers import data_manager - for dataset_name, dataset in kedro_integration.datasets_from_catalog(catalog).items(): + pipeline = pipelines.get("my_pipeline") + + for dataset_name, dataset in kedro_integration.datasets_from_catalog(catalog, pipeline=pipeline).items(): data_manager[dataset_name] = dataset ``` diff --git a/vizro-core/src/vizro/integrations/kedro/_data_manager.py b/vizro-core/src/vizro/integrations/kedro/_data_manager.py index 28c565a6c..2397a2587 100644 --- a/vizro-core/src/vizro/integrations/kedro/_data_manager.py +++ b/vizro-core/src/vizro/integrations/kedro/_data_manager.py @@ -3,14 +3,15 @@ from kedro.framework.session import KedroSession from kedro.framework.startup import bootstrap_project -from kedro.io import DataCatalog +from kedro.io import CatalogProtocol, KedroDataCatalog +from kedro.pipeline import Pipeline from vizro.managers._data_manager import pd_DataFrameCallable def catalog_from_project( project_path: Union[str, Path], env: Optional[str] = None, extra_params: Optional[dict[str, Any]] = None -) -> DataCatalog: +) -> CatalogProtocol | KedroDataCatalog: bootstrap_project(project_path) with KedroSession.create( project_path=project_path, env=env, save_on_close=False, extra_params=extra_params @@ -18,10 +19,35 @@ def catalog_from_project( return session.load_context().catalog -def datasets_from_catalog(catalog: DataCatalog) -> dict[str, pd_DataFrameCallable]: - datasets = {} - for name in catalog.list(): - dataset = catalog._get_dataset(name, suggest=False) - if "pandas" in dataset.__module__: - datasets[name] = dataset.load - return datasets +def pipelines_from_project(project_path: Union[str, Path]) -> Pipeline: + bootstrap_project(project_path) + from kedro.framework.project import pipelines + + return pipelines + + +def datasets_from_catalog( + catalog: CatalogProtocol | KedroDataCatalog, *, pipeline: Pipeline = None +) -> dict[str, pd_DataFrameCallable]: + # This doesn't include things added to the catalog at run time but that is ok for our purposes. + config_resolver = catalog.config_resolver + kedro_datasets = config_resolver.config.copy() + + if pipeline is not None: + # Go through all dataset names that weren't in catalog and try to resolve them. Those that cannot be + # resolved give an empty dictionary and are ignored. + for dataset_name in set(pipeline.datasets()) - set(kedro_datasets): + if dataset_config := config_resolver.resolve_pattern(dataset_name): + kedro_datasets[dataset_name] = dataset_config + + vizro_data_sources = {} + + for dataset_name, dataset_config in kedro_datasets.items(): + # "type" key always exists because we filtered out patterns that resolve to empty dictionary above. + if "pandas" in dataset_config["type"]: + # TODO: in future update to use lambda: catalog.load(dataset_name) instead of _get_dataset + # but need to check if works with caching. + dataset = catalog._get_dataset(dataset_name, suggest=False) + vizro_data_sources[dataset_name] = dataset.load + + return vizro_data_sources diff --git a/vizro-core/tests/unit/vizro/integrations/kedro/fixtures/test_catalog.yaml b/vizro-core/tests/unit/vizro/integrations/kedro/fixtures/test_catalog.yaml index 8ae6ef952..184d8625e 100644 --- a/vizro-core/tests/unit/vizro/integrations/kedro/fixtures/test_catalog.yaml +++ b/vizro-core/tests/unit/vizro/integrations/kedro/fixtures/test_catalog.yaml @@ -1,7 +1,19 @@ -companies: - type: pandas.JSONDataset - filepath: companies.json +"{pandas_factory}1": + type: pandas.CSVDataset + filepath: ./{pandas_factory}.csv -reviews: - type: pickle.PickleDataset - filepath: reviews.pkl +pandas_excel: + type: pandas.ExcelDataset + filepath: pandas_excel.xlsx + +pandas_parquet: + type: pandas.ParquetDataset + filepath: pandas_parquet.parquet + +polars: + type: polars.CSVDataset + filepath: polars.csv + +not_dataframe: + type: picke.PickleDataset + filepath: pickle.pkl diff --git a/vizro-core/tests/unit/vizro/integrations/kedro/test_kedro_data_manager.py b/vizro-core/tests/unit/vizro/integrations/kedro/test_kedro_data_manager.py index cb6cb796d..fd1b63330 100644 --- a/vizro-core/tests/unit/vizro/integrations/kedro/test_kedro_data_manager.py +++ b/vizro-core/tests/unit/vizro/integrations/kedro/test_kedro_data_manager.py @@ -8,6 +8,7 @@ kedro = pytest.importorskip("kedro") +import kedro.pipeline as kp # noqa: E402 from kedro.io import DataCatalog # noqa: E402 from vizro.integrations.kedro import datasets_from_catalog # noqa: E402 @@ -20,6 +21,25 @@ def catalog_path(): def test_datasets_from_catalog(catalog_path): catalog = DataCatalog.from_config(yaml.safe_load(catalog_path.read_text(encoding="utf-8"))) - assert "companies" in datasets_from_catalog(catalog) - assert isinstance(datasets_from_catalog(catalog), dict) - assert isinstance(datasets_from_catalog(catalog)["companies"], types.MethodType) + + datasets = datasets_from_catalog(catalog) + assert isinstance(datasets, dict) + assert set(datasets) == {"pandas_excel", "pandas_parquet"} + for dataset in datasets.values(): + assert isinstance(dataset, types.MethodType) + + +def test_datasets_from_catalog_with_pipeline(catalog_path): + catalog = DataCatalog.from_config(yaml.safe_load(catalog_path.read_text(encoding="utf-8"))) + pipeline = kp.pipeline( + [ + kp.node( + func=lambda *args: None, + inputs=["pandas_excel", "C1", "polars", "Z", "parameters", "params:z"], + outputs=["pandas_parquet", "not_dataframe"], + ), + ] + ) + + datasets = datasets_from_catalog(catalog, pipeline=pipeline) + assert set(datasets) == {"pandas_excel", "pandas_parquet", "C1"} From 828735a82c4cb2bcc92a4ab82464a013b6f6431a Mon Sep 17 00:00:00 2001 From: Guillaume Tauzin <4648633+gtauzin@users.noreply.github.com> Date: Sat, 8 Feb 2025 12:07:53 +0100 Subject: [PATCH 02/15] Use Union instead of | Signed-off-by: Guillaume Tauzin <4648633+gtauzin@users.noreply.github.com> --- vizro-core/src/vizro/integrations/kedro/_data_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vizro-core/src/vizro/integrations/kedro/_data_manager.py b/vizro-core/src/vizro/integrations/kedro/_data_manager.py index 2397a2587..d03cf7c59 100644 --- a/vizro-core/src/vizro/integrations/kedro/_data_manager.py +++ b/vizro-core/src/vizro/integrations/kedro/_data_manager.py @@ -11,7 +11,7 @@ def catalog_from_project( project_path: Union[str, Path], env: Optional[str] = None, extra_params: Optional[dict[str, Any]] = None -) -> CatalogProtocol | KedroDataCatalog: +) -> Union[CatalogProtocol, KedroDataCatalog]: bootstrap_project(project_path) with KedroSession.create( project_path=project_path, env=env, save_on_close=False, extra_params=extra_params @@ -27,7 +27,7 @@ def pipelines_from_project(project_path: Union[str, Path]) -> Pipeline: def datasets_from_catalog( - catalog: CatalogProtocol | KedroDataCatalog, *, pipeline: Pipeline = None + catalog: Union[CatalogProtocol, KedroDataCatalog], *, pipeline: Pipeline = None ) -> dict[str, pd_DataFrameCallable]: # This doesn't include things added to the catalog at run time but that is ok for our purposes. config_resolver = catalog.config_resolver From 0d475600bbc093715e1cb41e50cdbe5597127431 Mon Sep 17 00:00:00 2001 From: Guillaume Tauzin <4648633+gtauzin@users.noreply.github.com> Date: Mon, 10 Feb 2025 18:06:36 +0100 Subject: [PATCH 03/15] Apply suggestions from code review --- .../20250208_114146_4648633+gtauzin.md | 9 +-- .../pages/user-guides/kedro-data-catalog.md | 70 ++++++++++++++----- .../src/vizro/integrations/kedro/__init__.py | 4 +- .../vizro/integrations/kedro/_data_manager.py | 6 +- .../kedro/fixtures/test_catalog.yaml | 8 +-- .../kedro/test_kedro_data_manager.py | 6 +- 6 files changed, 64 insertions(+), 39 deletions(-) diff --git a/vizro-core/changelog.d/20250208_114146_4648633+gtauzin.md b/vizro-core/changelog.d/20250208_114146_4648633+gtauzin.md index b1946eb20..c358660e7 100644 --- a/vizro-core/changelog.d/20250208_114146_4648633+gtauzin.md +++ b/vizro-core/changelog.d/20250208_114146_4648633+gtauzin.md @@ -16,12 +16,9 @@ Uncomment the section that is right (remove the HTML comment wrapper). - A bullet item for the Removed category with a link to the relevant PR at the end of your entry, e.g. Enable feature XXX. ([#1](https://github.com/mckinsey/vizro/pull/1)) --> - +- Kedro integration function `datasets_from_catalog` can now handle dataset factories. ([#1001](https://github.com/mckinsey/vizro/pull/1001)) -### Fixed - -- Fix a bug where datasets generated by dataset factories would not be returned by `kedro_integration.datasets_from_catalog`. ([#1001](https://github.com/mckinsey/vizro/pull/1001)) - ### Added -- Kedro integration function `datasets_from_catalog` can now handle dataset factories. ([#1001](https://github.com/mckinsey/vizro/pull/1001)) +- Kedro integration function `datasets_from_catalog` can now handle dataset factories for `kedro>=0.19.9`. ([#1001](https://github.com/mckinsey/vizro/pull/1001)) ### Changed -- Bump optional dependency lower bound to `kedro>=0.19.9`. ([#1001](https://github.com/mckinsey/vizro/pull/1001)) +- Bump optional dependency lower bound to `kedro>=0.19.0`. ([#1001](https://github.com/mckinsey/vizro/pull/1001)) ### Added -- Kedro integration function `datasets_from_catalog` can now handle dataset factories for `kedro>=0.19.9`. ([#1001](https://github.com/mckinsey/vizro/pull/1001)) +- Kedro integration function `datasets_from_catalog` can handle dataset factories for `kedro>=0.19.9`. ([#1001](https://github.com/mckinsey/vizro/pull/1001)) ### Changed diff --git a/vizro-core/src/vizro/__init__.py b/vizro-core/src/vizro/__init__.py index 598e75420..0cd833bb7 100644 --- a/vizro-core/src/vizro/__init__.py +++ b/vizro-core/src/vizro/__init__.py @@ -24,8 +24,6 @@ # This would only be the case where you need to test something with serve_locally=False and have changed # assets compared to main. In this case you need to push your assets changes to remote for the CDN to update, # and it might also be necessary to clear the CDN cache: https://www.jsdelivr.com/tools/purge. - - _git_branch = __version__ if not parse(__version__).is_devrelease else "main" BASE_EXTERNAL_URL = f"https://cdn.jsdelivr.net/gh/mckinsey/vizro@{_git_branch}/vizro-core/src/vizro/" # Enables the use of our own Bootstrap theme in a pure Dash app with `external_stylesheets=vizro.bootstrap`. diff --git a/vizro-core/src/vizro/_vizro.py b/vizro-core/src/vizro/_vizro.py index 5c0a7e40b..fc69ef479 100644 --- a/vizro-core/src/vizro/_vizro.py +++ b/vizro-core/src/vizro/_vizro.py @@ -210,7 +210,6 @@ def _make_resource_spec(path: Path) -> _ResourceSpec: # This would only be the case where you need to test something with serve_locally=False and have changed # assets compared to main. In this case you need to push your assets changes to remote for the CDN to update, # and it might also be necessary to clear the CDN cache: https://www.jsdelivr.com/tools/purge. - _git_branch = vizro.__version__ if not parse(vizro.__version__).is_devrelease else "main" BASE_EXTERNAL_URL = f"https://cdn.jsdelivr.net/gh/mckinsey/vizro@{_git_branch}/vizro-core/src/vizro/" diff --git a/vizro-core/tests/unit/vizro/integrations/kedro/fixtures/test_catalog.yaml b/vizro-core/tests/unit/vizro/integrations/kedro/fixtures/test_catalog.yaml index 708b115b3..2e0049ef2 100644 --- a/vizro-core/tests/unit/vizro/integrations/kedro/fixtures/test_catalog.yaml +++ b/vizro-core/tests/unit/vizro/integrations/kedro/fixtures/test_catalog.yaml @@ -1,6 +1,6 @@ "{pandas_factory}#csv": type: pandas.CSVDataset - filepath: ./{pandas_factory}.csv + filepath: "{pandas_factory}.csv" pandas_excel: type: pandas.ExcelDataset diff --git a/vizro-core/tests/unit/vizro/integrations/kedro/test_kedro_data_manager.py b/vizro-core/tests/unit/vizro/integrations/kedro/test_kedro_data_manager.py index 8600b46b6..791f6f22a 100644 --- a/vizro-core/tests/unit/vizro/integrations/kedro/test_kedro_data_manager.py +++ b/vizro-core/tests/unit/vizro/integrations/kedro/test_kedro_data_manager.py @@ -44,9 +44,9 @@ def test_datasets_from_catalog_with_pipeline(catalog): inputs=[ "pandas_excel", "something#csv", + "something_else#csv", "not_dataframe", "not_in_catalog", - "pandas_parquet", "parameters", "params:z", ], @@ -58,7 +58,7 @@ def test_datasets_from_catalog_with_pipeline(catalog): datasets = datasets_from_catalog(catalog, pipeline=pipeline) # Dataset factories only work for kedro>=0.19.9. expected_datasets = ( - {"pandas_excel", "pandas_parquet", "something#csv"} + {"pandas_excel", "pandas_parquet", "something#csv", "something_else#csv"} if parse(version("kedro")) >= parse("0.19.9") else {"pandas_excel", "pandas_parquet"} )