Skip to content

Commit

Permalink
Remove dataset factory discovery (#1688)
Browse files Browse the repository at this point in the history
Dataset Factory Pattern discovery was introduced to discover datasets (mostly tracking datasets used in Experiment Tracking) before populating the Kedro-Viz data repositories.
Because of this discovery step, datasets that users do not have access to either time out or raise exceptions, which causes Kedro-Viz itself to time out or fail.
This PR removes the dataset factory pattern discovery implementation as a temporary fix. As a consequence, users can no longer use dataset factory patterns for Experiment Tracking datasets.
  • Loading branch information
ravi-kumar-pilla authored Dec 19, 2023
1 parent a25cff0 commit 99b84e4
Show file tree
Hide file tree
Showing 5 changed files with 42 additions and 146 deletions.
33 changes: 5 additions & 28 deletions package/kedro_viz/data_access/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,40 +69,17 @@ def set_db_session(self, db_session_class: sessionmaker):
"""Set db session on repositories that need it."""
self.runs.set_db_session(db_session_class)

def resolve_dataset_factory_patterns(
self, catalog: DataCatalog, pipelines: Dict[str, KedroPipeline]
):
"""Resolve dataset factory patterns in data catalog by matching
them against the datasets in the pipelines.
"""
for pipeline in pipelines.values():
if hasattr(pipeline, "data_sets"):
# Support for Kedro 0.18.x
datasets = pipeline.data_sets()
else:
datasets = pipeline.datasets()

for dataset_name in datasets:
try:
catalog.exists(dataset_name)
# pylint: disable=broad-except
except Exception as exc: # pragma: no cover
logger.warning(
"'%s' does not exist. Full exception: %s: %s",
dataset_name,
type(exc).__name__,
exc,
)

def add_catalog(self, catalog: DataCatalog, pipelines: Dict[str, KedroPipeline]):
def add_catalog(self, catalog: DataCatalog):
"""Resolve dataset factory patterns, add the catalog to the CatalogRepository
and relevant tracking datasets to TrackingDatasetRepository.
Args:
catalog: The DataCatalog instance to add.
pipelines: A dictionary which holds project pipelines
"""
self.resolve_dataset_factory_patterns(catalog, pipelines)

# TODO: Implement dataset factory pattern discovery for
# experiment tracking datasets.

self.catalog.set_catalog(catalog)

for dataset_name, dataset in self.catalog.as_dict().items():
Expand Down
2 changes: 1 addition & 1 deletion package/kedro_viz/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def populate_data(
session_class = make_db_session_factory(session_store.location)
data_access_manager.set_db_session(session_class)

data_access_manager.add_catalog(catalog, pipelines)
data_access_manager.add_catalog(catalog)

# add dataset stats before adding pipelines as the data nodes
# need stats information and they are created during add_pipelines
Expand Down
27 changes: 6 additions & 21 deletions package/tests/test_api/test_graphql/test_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,11 +68,8 @@ def test_run_tracking_data_query(
client,
example_tracking_catalog,
data_access_manager_with_runs,
example_pipelines,
):
data_access_manager_with_runs.add_catalog(
example_tracking_catalog, example_pipelines
)
data_access_manager_with_runs.add_catalog(example_tracking_catalog)
example_run_id = example_run_ids[0]

response = client.post(
Expand Down Expand Up @@ -173,15 +170,9 @@ def test_run_tracking_data_query(
assert response.json() == expected_response

def test_metrics_data(
self,
client,
example_tracking_catalog,
data_access_manager_with_runs,
example_pipelines,
self, client, example_tracking_catalog, data_access_manager_with_runs
):
data_access_manager_with_runs.add_catalog(
example_tracking_catalog, example_pipelines
)
data_access_manager_with_runs.add_catalog(example_tracking_catalog)

response = client.post(
"/graphql",
Expand Down Expand Up @@ -295,11 +286,8 @@ def test_graphql_run_tracking_data(
data_access_manager_with_runs,
show_diff,
expected_response,
example_pipelines,
):
data_access_manager_with_runs.add_catalog(
example_multiple_run_tracking_catalog, example_pipelines
)
data_access_manager_with_runs.add_catalog(example_multiple_run_tracking_catalog)

response = client.post(
"/graphql",
Expand Down Expand Up @@ -355,11 +343,9 @@ def test_graphql_run_tracking_data_at_least_one_empty_run(
data_access_manager_with_runs,
show_diff,
expected_response,
example_pipelines,
):
data_access_manager_with_runs.add_catalog(
example_multiple_run_tracking_catalog_at_least_one_empty_run,
example_pipelines,
example_multiple_run_tracking_catalog_at_least_one_empty_run
)

response = client.post(
Expand Down Expand Up @@ -393,10 +379,9 @@ def test_graphql_run_tracking_data_all_empty_runs(
data_access_manager_with_runs,
show_diff,
expected_response,
example_pipelines,
):
data_access_manager_with_runs.add_catalog(
example_multiple_run_tracking_catalog_all_empty_runs, example_pipelines
example_multiple_run_tracking_catalog_all_empty_runs
)

response = client.post(
Expand Down
Loading

0 comments on commit 99b84e4

Please sign in to comment.