From 11989df50cac6736b46a34ec55b8aafe2a5e3cc0 Mon Sep 17 00:00:00 2001
From: Huong Nguyen
Date: Mon, 2 Dec 2024 21:28:12 +0000
Subject: [PATCH 01/16] remove session store
Signed-off-by: Huong Nguyen
---
demo-project/src/demo_project/settings.py | 19 --
.../integrations/kedro/sqlite_store.py | 201 ------------------
package/kedro_viz/server.py | 12 +-
3 files changed, 2 insertions(+), 230 deletions(-)
delete mode 100644 package/kedro_viz/integrations/kedro/sqlite_store.py
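Note for downstream projects: any `settings.py` that opted into the store
carries configuration like the demo-project lines removed below. A minimal
sketch of what becomes obsolete and should be deleted:

    from pathlib import Path

    from kedro_viz.integrations.kedro.sqlite_store import SQLiteStore

    SESSION_STORE_CLASS = SQLiteStore
    SESSION_STORE_ARGS = {"path": str(Path(__file__).parents[2] / "data")}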
diff --git a/demo-project/src/demo_project/settings.py b/demo-project/src/demo_project/settings.py
index 304f7d8c1a..5023ebc1f4 100644
--- a/demo-project/src/demo_project/settings.py
+++ b/demo-project/src/demo_project/settings.py
@@ -2,25 +2,6 @@
# List the installed plugins for which to disable auto-registry
# DISABLE_HOOKS_FOR_PLUGINS = ("kedro-viz",)
-from pathlib import Path
-
-# Define where to store data from a KedroSession. Defaults to BaseSessionStore.
-# from kedro.framework.session.store import ShelveStore
-from kedro_viz.integrations.kedro.sqlite_store import SQLiteStore
-
-SESSION_STORE_CLASS = SQLiteStore
-SESSION_STORE_ARGS = {"path": str(Path(__file__).parents[2] / "data")}
-
-# Setup for collaborative experiment tracking.
-# SESSION_STORE_ARGS = {"path": str(Path(__file__).parents[2] / "data"),
-# "remote_path": "s3://{path-to-session_store}" }
-
-# Define custom context class. Defaults to `KedroContext`
-# CONTEXT_CLASS = KedroContext
-
-# Define the configuration folder. Defaults to `conf`
-# CONF_ROOT = "conf"
-
from kedro.config import OmegaConfigLoader # NOQA
CONFIG_LOADER_CLASS = OmegaConfigLoader
diff --git a/package/kedro_viz/integrations/kedro/sqlite_store.py b/package/kedro_viz/integrations/kedro/sqlite_store.py
deleted file mode 100644
index 8ba1a5ac9e..0000000000
--- a/package/kedro_viz/integrations/kedro/sqlite_store.py
+++ /dev/null
@@ -1,201 +0,0 @@
-"""kedro_viz.intergrations.kedro.sqlite_store is a child of BaseSessionStore
-which stores sessions data in the SQLite database"""
-
-import getpass
-import json
-import logging
-import os
-from pathlib import Path
-from typing import Any, Optional
-
-import fsspec
-from kedro.framework.project import settings
-from kedro.framework.session.store import BaseSessionStore
-from kedro.io.core import get_protocol_and_path
-from sqlalchemy import create_engine, select
-from sqlalchemy.orm import Session
-
-from kedro_viz.constants import VIZ_SESSION_STORE_ARGS
-from kedro_viz.database import make_db_session_factory
-from kedro_viz.launchers.utils import _find_kedro_project
-from kedro_viz.models.experiment_tracking import RunModel
-
-logger = logging.getLogger(__name__)
-
-
-def _get_dbname():
- return os.getenv("KEDRO_SQLITE_STORE_USERNAME", getpass.getuser()) + ".db"
-
-
-def _is_json_serializable(obj: Any):
- try:
- json.dumps(obj)
- return True
- except (TypeError, OverflowError):
- return False
-
-
-def _get_session_path(session_path: str) -> str:
- """Returns the session path by creating its parent directory
- if unavailable.
- """
- session_file_path = Path(session_path)
- session_file_path.parent.mkdir(parents=True, exist_ok=True)
- return str(session_file_path)
-
-
-class SQLiteStore(BaseSessionStore):
- """Stores the session data on the sqlite db."""
-
- def __init__(self, *args, remote_path: Optional[str] = None, **kwargs):
- """Initializes the SQLiteStore object."""
- super().__init__(*args, **kwargs)
- self._db_session_class = make_db_session_factory(self.location)
- self._remote_path = remote_path
-
- if self.remote_location:
- protocol, _ = get_protocol_and_path(self.remote_location)
- self._remote_fs = fsspec.filesystem(protocol)
-
- @property
- def location(self) -> str:
- """Returns location of the sqlite_store database"""
- if "path" not in settings.SESSION_STORE_ARGS:
- kedro_project_path = _find_kedro_project(Path.cwd()) or self._path
- return _get_session_path(
- f"{kedro_project_path}/{VIZ_SESSION_STORE_ARGS['path']}/session_store.db"
- )
-
- return _get_session_path(f"{self._path}/session_store.db")
-
- @property
- def remote_location(self) -> Optional[str]:
- """Returns the remote location of the sqlite_store database on the cloud"""
- return self._remote_path
-
- def _to_json(self) -> str:
- """Returns session_store information in json format after converting PosixPath to string"""
- session_dict = {}
- for key, value in self.data.items():
- if key == "git":
- try:
- import git
-
- branch = git.Repo(search_parent_directories=True).active_branch
- value["branch"] = branch.name
- except ImportError as exc: # pragma: no cover
- logger.warning("%s:%s", exc.__class__.__name__, exc.msg)
- except Exception as exc: # pragma: no cover
- logger.warning("Something went wrong when fetching git metadata.")
- logger.warning(exc)
-
- if _is_json_serializable(value):
- session_dict[key] = value
- else:
- session_dict[key] = str(value)
- return json.dumps(session_dict)
-
- def save(self):
- """Save the session store info on db and uploads it
- to the cloud if a remote cloud path is provided ."""
- with self._db_session_class.begin() as session:
- session.add(RunModel(id=self._session_id, blob=self._to_json()))
- if self.remote_location:
- self._upload()
-
- def _upload(self):
- """Uploads the session store database file to the specified
- remote path on the cloud storage."""
- db_name = _get_dbname()
- logger.debug(
- """Uploading local session store to %s with name
- %s...""",
- self.remote_location,
- db_name,
- )
- try:
- self._remote_fs.put(self.location, f"{self.remote_location}/{db_name}")
- except Exception as exc:
- logger.exception("Upload failed: %s ", exc)
-
- def _download(self):
- """Downloads all the session store database files
- from the specified remote path on the cloud storage
- to your local project.
- """
- try:
- # In theory we should be able to do this as a single operation:
- # self._remote_fs.get(f"{self.remote_location}/*.db", str(Path(self.location).parent))
- # but this does not seem to work correctly - maybe a bug in fsspec. So instead
- # we do it in two steps. Also need to add os.sep so it works with older s3fs version.
- # This is a known bug in s3fs - https://github.com/fsspec/s3fs/issues/717
- remote_dbs = self._remote_fs.glob(f"{self.remote_location}/*.db")
- logger.debug(
- "Downloading %s remote session stores to local...", len(remote_dbs)
- )
- for remote_db in remote_dbs:
- self._remote_fs.get(remote_db, str(Path(self.location).parent) + os.sep)
- except Exception as exc:
- logger.exception("Download failed: %s ", exc)
-
- def _merge(self):
- """Merges all the session store databases stored at the
- specified locations into the user's local session_store.db
-
- Notes:
- - This method uses multiple SQLAlchemy engines to connect to the
- user's session_store.db and to all the other downloaded dbs.
- - It is assumed that all the databases share the same schema.
-        - In Kedro-Viz version 6.2.0, we only merge the runs table, which
-          contains all the experiments.
- """
-
- all_new_runs = []
-
- with self._db_session_class() as session:
- existing_run_ids = session.execute(select(RunModel.id)).scalars().all()
-
- # Look at all databases in the local session store directory
- # that aren't the actual session_store.db itself.
- downloaded_db_locations = set(Path(self.location).parent.glob("*.db")) - {
- Path(self.location)
- }
-
- logger.debug(
- "Checking %s downloaded session stores for new runs...",
- len(downloaded_db_locations),
- )
- for downloaded_db_location in downloaded_db_locations:
- engine = create_engine(f"sqlite:///{downloaded_db_location}")
- with Session(engine) as session:
- query = select(RunModel).where(RunModel.id.not_in(existing_run_ids))
- new_runs = session.execute(query).scalars().all()
-
- existing_run_ids.extend([run.id for run in new_runs])
- all_new_runs.extend(new_runs)
- logger.debug(
- "Found %s new runs in downloaded session store %s",
- len(new_runs),
- downloaded_db_location.name,
- )
-
- if all_new_runs:
- logger.debug("Adding %s new runs to session store...", len(all_new_runs))
- with self._db_session_class.begin() as session:
- for run in all_new_runs:
- session.merge(run)
-
- def sync(self):
- """
- Synchronizes the user's local session_store.db with
- remote session_store.db stored on a cloud storage service.
- """
-
- if self.remote_location:
- self._download()
- # We don't want a failed merge to stop the whole kedro-viz process.
- try:
- self._merge()
- except Exception as exc:
- logger.exception("Merge failed on sync: %s", exc)
- self._upload()
diff --git a/package/kedro_viz/server.py b/package/kedro_viz/server.py
index 8643bec73f..747bbb16d9 100644
--- a/package/kedro_viz/server.py
+++ b/package/kedro_viz/server.py
@@ -4,7 +4,6 @@
from pathlib import Path
from typing import Any, Dict, Optional
-from kedro.framework.session.store import BaseSessionStore
from kedro.io import DataCatalog
from kedro.pipeline import Pipeline
@@ -13,7 +12,6 @@
from kedro_viz.data_access import DataAccessManager, data_access_manager
from kedro_viz.database import make_db_session_factory
from kedro_viz.integrations.kedro import data_loader as kedro_data_loader
-from kedro_viz.integrations.kedro.sqlite_store import SQLiteStore
from kedro_viz.launchers.utils import _check_viz_up, _wait_for, display_cli_message
DEV_PORT = 4142
@@ -23,18 +21,12 @@ def populate_data(
data_access_manager: DataAccessManager,
catalog: DataCatalog,
pipelines: Dict[str, Pipeline],
- session_store: BaseSessionStore,
stats_dict: Dict,
):
"""Populate data repositories. Should be called once on application start
if creating an api app from project.
"""
- if isinstance(session_store, SQLiteStore):
- session_store.sync()
- session_class = make_db_session_factory(session_store.location)
- data_access_manager.set_db_session(session_class)
-
data_access_manager.add_catalog(catalog, pipelines)
# add dataset stats before adding pipelines as the data nodes
@@ -56,7 +48,7 @@ def load_and_populate_data(
"""Loads underlying Kedro project data and populates Kedro Viz Repositories"""
# Loads data from underlying Kedro Project
- catalog, pipelines, session_store, stats_dict = kedro_data_loader.load_data(
+    catalog, pipelines, _, stats_dict = kedro_data_loader.load_data(
path, env, include_hooks, package_name, extra_params, is_lite
)
@@ -67,7 +59,7 @@ def load_and_populate_data(
)
# Creates data repositories which are used by Kedro Viz Backend APIs
- populate_data(data_access_manager, catalog, pipelines, session_store, stats_dict)
+ populate_data(data_access_manager, catalog, pipelines, stats_dict)
def run_server(
From 943699d2c20904244c52798869b05f395a95791a Mon Sep 17 00:00:00 2001
From: Huong Nguyen
Date: Mon, 2 Dec 2024 21:33:29 +0000
Subject: [PATCH 02/16] remove related tests
Signed-off-by: Huong Nguyen
---
package/tests/conftest.py | 5 -
.../test_integrations/test_sqlite_store.py | 390 ------------------
package/tests/test_server.py | 24 --
3 files changed, 419 deletions(-)
delete mode 100644 package/tests/test_integrations/test_sqlite_store.py
diff --git a/package/tests/conftest.py b/package/tests/conftest.py
index ea25e94f7c..2ab1233a21 100644
--- a/package/tests/conftest.py
+++ b/package/tests/conftest.py
@@ -20,7 +20,6 @@
ModularPipelinesRepository,
)
from kedro_viz.integrations.kedro.hooks import DatasetStatsHook
-from kedro_viz.integrations.kedro.sqlite_store import SQLiteStore
from kedro_viz.models.flowchart.node_metadata import DataNodeMetadata
from kedro_viz.models.flowchart.nodes import GraphNode
from kedro_viz.server import populate_data
@@ -43,10 +42,6 @@ def session_store():
yield BaseSessionStore("dummy_path", "dummy_session_id")
-@pytest.fixture
-def sqlite_session_store(tmp_path):
- yield SQLiteStore(tmp_path, "dummy_session_id")
-
@pytest.fixture
def example_stats_dict():
diff --git a/package/tests/test_integrations/test_sqlite_store.py b/package/tests/test_integrations/test_sqlite_store.py
deleted file mode 100644
index 4f0cb6a00b..0000000000
--- a/package/tests/test_integrations/test_sqlite_store.py
+++ /dev/null
@@ -1,390 +0,0 @@
-import json
-import os
-from pathlib import Path
-
-import boto3
-import pytest
-from moto import mock_aws
-from sqlalchemy import create_engine, func, select, text
-from sqlalchemy.orm import sessionmaker
-
-from kedro_viz.database import make_db_session_factory
-from kedro_viz.integrations.kedro.sqlite_store import SQLiteStore, _get_dbname
-from kedro_viz.models.experiment_tracking import Base, RunModel
-
-BUCKET_NAME = "test-bucket"
-
-
-@pytest.fixture
-def parametrize_session_store_args(request):
- """Fixture to parameterize has_session_store_args."""
-
- # This fixture sets a class attribute has_session_store_args
- # based on the parameter passed
- request.cls.has_session_store_args = request.param
-
-
-@pytest.fixture
-def mock_session_store_args(request, mocker, setup_kedro_project):
- """Fixture to mock SESSION_STORE_ARGS and _find_kedro_project."""
-
- # This fixture uses the class attribute has_session_store_args
- # to apply the appropriate mocks.
- if request.cls.has_session_store_args:
- mocker.patch.dict(
- "kedro_viz.integrations.kedro.sqlite_store.settings.SESSION_STORE_ARGS",
- {"path": "some_path"},
- clear=True,
- )
- else:
- mocker.patch(
- "kedro_viz.integrations.kedro.sqlite_store._find_kedro_project",
- return_value=setup_kedro_project,
- )
-
-
-@pytest.fixture
-def store_path(request, tmp_path, setup_kedro_project):
- if request.cls.has_session_store_args:
- return Path(tmp_path)
- session_store_path = Path(tmp_path / setup_kedro_project / ".viz")
- session_store_path.mkdir(parents=True, exist_ok=True)
- return session_store_path
-
-
-@pytest.fixture
-def db_session_class(store_path):
- engine = create_engine(f"sqlite:///{store_path}/session_store.db")
- Base.metadata.create_all(engine)
- Session = sessionmaker(bind=engine)
- return Session
-
-
-@pytest.fixture(scope="class")
-def aws_credentials():
- """Mocked AWS credentials for moto"""
- os.environ["AWS_ACCESS_KEY_ID"] = "testing"
- os.environ["AWS_SECRET_ACCESS_KEY"] = "testing"
- os.environ["AWS_SESSION_TOKEN"] = "testing"
-
-
-@pytest.fixture(scope="class")
-def mocked_s3_bucket(aws_credentials):
- """S3 Mock Client"""
- with mock_aws():
- conn = boto3.client("s3", region_name="us-east-1")
- conn.create_bucket(Bucket=BUCKET_NAME)
- yield conn
-
-
-@pytest.fixture
-def remote_path():
- return f"s3://{BUCKET_NAME}"
-
-
-@pytest.fixture
-def mock_db1(store_path):
- database_loc = str(store_path / "db1.db")
- with make_db_session_factory(database_loc).begin() as session:
- session.add(RunModel(id="1", blob="blob1"))
- yield Path(database_loc)
-
-
-@pytest.fixture
-def mock_db2(store_path):
- database_loc = str(store_path / "db2.db")
- with make_db_session_factory(database_loc).begin() as session:
- session.add(RunModel(id="2", blob="blob2"))
- yield Path(database_loc)
-
-
-@pytest.fixture
-def mock_db3_with_db2_data(store_path):
- database_loc = str(store_path / "db3.db")
- with make_db_session_factory(database_loc).begin() as session:
- session.add(RunModel(id="2", blob="blob2"))
- yield Path(database_loc)
-
-
-def get_files_in_bucket(bucket_name):
- s3 = boto3.client("s3")
- response = s3.list_objects(Bucket=bucket_name)
- files = [obj["Key"] for obj in response.get("Contents", [])]
- return files
-
-
-@pytest.fixture
-def mocked_db_in_s3(mocked_s3_bucket, mock_db1, mock_db2):
- # define the name of the S3 bucket and the database file names
- db1_filename = "db1.db"
- db2_filename = "db2.db"
-
- # upload each mock database file to the mocked S3 bucket
- mocked_s3_bucket.put_object(
- Bucket=BUCKET_NAME, Key=db1_filename, Body=mock_db1.read_bytes()
- )
- mocked_s3_bucket.put_object(
- Bucket=BUCKET_NAME, Key=db2_filename, Body=mock_db2.read_bytes()
- )
-
- return get_files_in_bucket(BUCKET_NAME)
-
-
-@pytest.fixture
-def mocked_db_in_s3_repeated_runs(
- mocked_s3_bucket, mock_db1, mock_db2, mock_db3_with_db2_data
-):
- # define the name of the S3 bucket and the database file names
- db1_filename = "db1.db"
- db2_filename = "db2.db"
- db3_filename = "db3.db"
-
- # upload each mock database file to the mocked S3 bucket
- mocked_s3_bucket.put_object(
- Bucket=BUCKET_NAME, Key=db1_filename, Body=mock_db1.read_bytes()
- )
- mocked_s3_bucket.put_object(
- Bucket=BUCKET_NAME, Key=db2_filename, Body=mock_db2.read_bytes()
- )
- mocked_s3_bucket.put_object(
- Bucket=BUCKET_NAME, Key=db3_filename, Body=mock_db3_with_db2_data.read_bytes()
- )
-
- return get_files_in_bucket(BUCKET_NAME)
-
-
-def session_id():
- i = 0
- while True:
- yield f"session_{i}"
- i += 1
-
-
-def test_get_dbname_with_env_var(mocker):
- mocker.patch.dict(
- os.environ, {"KEDRO_SQLITE_STORE_USERNAME": "env_user_name"}, clear=True
- )
- mocker.patch("getpass.getuser", return_value="computer_user_name")
- dbname = _get_dbname()
- assert dbname == "env_user_name.db"
-
-
-def test_get_dbname_without_env_var(mocker):
- mocker.patch.dict("os.environ", clear=True)
- mocker.patch("getpass.getuser", return_value="computer_user_name")
- dbname = _get_dbname()
- assert dbname == "computer_user_name.db"
-
-
-@pytest.mark.usefixtures("parametrize_session_store_args", "mock_session_store_args")
-@pytest.mark.parametrize("parametrize_session_store_args", [True, False], indirect=True)
-class TestSQLiteStore:
- def test_empty(self, store_path):
- sqlite_store = SQLiteStore(store_path, next(session_id()))
- assert not sqlite_store
- assert sqlite_store.location == str(Path(store_path) / "session_store.db")
-
- def test_save_single_run(self, store_path):
- sqlite_store = SQLiteStore(store_path, next(session_id()))
- sqlite_store.data = {"project_path": store_path, "project_name": "test"}
- sqlite_store.save()
- with sqlite_store._db_session_class() as session:
- query = select(RunModel)
- loaded_runs = session.execute(query).scalars().all()
- assert len(loaded_runs) == 1
- assert json.loads(loaded_runs[0].blob) == {
- "project_path": str(store_path),
- "project_name": "test",
- }
-
- def test_save_multiple_runs(self, store_path):
- session = session_id()
- sqlite_store = SQLiteStore(store_path, next(session))
- sqlite_store.save()
- with sqlite_store._db_session_class() as db_session:
- query = select(func.count()).select_from(RunModel)
- assert db_session.execute(query).scalar() == 1
- # save another session
- sqlite_store2 = SQLiteStore(store_path, next(session))
- sqlite_store2.save()
- with sqlite_store2._db_session_class() as db_session:
- query = select(func.count()).select_from(RunModel)
- assert db_session.execute(query).scalar() == 2
-
- def test_save_run_with_remote_path(self, mocker, store_path, remote_path):
- mocker.patch("fsspec.filesystem")
- sqlite_store = SQLiteStore(
- store_path, next(session_id()), remote_path=remote_path
- )
- sqlite_store.data = {"project_path": store_path, "project_name": "test"}
- mock_upload = mocker.patch.object(sqlite_store, "_upload")
- sqlite_store.save()
- mock_upload.assert_called_once()
-
- def test_save_run_without_remote_path(self, mocker, store_path):
- sqlite_store = SQLiteStore(store_path, next(session_id()))
- sqlite_store.data = {"project_path": store_path, "project_name": "test"}
- mock_upload = mocker.patch.object(sqlite_store, "_upload")
- sqlite_store.save()
- mock_upload.assert_not_called()
-
- def test_update_git_branch(self, store_path, mocker):
- sqlite_store = SQLiteStore(store_path, next(session_id()))
- sqlite_store.data = {
- "project_path": store_path,
- "git": {"commit_sha": "123456"},
- }
- mocker.patch("git.Repo.active_branch").name = "test_branch"
-
- assert sqlite_store._to_json() == json.dumps(
- {
- "project_path": str(store_path),
- "git": {"commit_sha": "123456", "branch": "test_branch"},
- }
- )
-
- def test_upload_to_s3_success(self, mocker, store_path, remote_path):
- mocker.patch("fsspec.filesystem")
- sqlite_store = SQLiteStore(
- store_path, next(session_id()), remote_path=remote_path
- )
- sqlite_store._upload()
- sqlite_store._remote_fs.put.assert_called_once()
-
- def test_upload_to_s3_fail(self, mocker, store_path, remote_path, caplog):
- mocker.patch("fsspec.filesystem")
- sqlite_store = SQLiteStore(
- store_path, next(session_id()), remote_path=remote_path
- )
- sqlite_store._remote_fs.put.side_effect = ConnectionError("Connection error")
- sqlite_store._upload()
- assert "Upload failed: Connection error" in caplog.text
-
- def test_download_from_s3_success(
- self,
- mocker,
- store_path,
- remote_path,
- mocked_db_in_s3,
- ):
- mocker.patch("fsspec.filesystem")
- sqlite_store = SQLiteStore(
- store_path, next(session_id()), remote_path=remote_path
- )
- sqlite_store._remote_fs.glob.return_value = mocked_db_in_s3
- sqlite_store._download()
-
- assert set(file.name for file in Path(store_path).glob("*.db")) == {
- "db1.db",
- "db2.db",
- "session_store.db",
- }
-
- def test_download_from_s3_failure(self, mocker, store_path, remote_path, caplog):
- mocker.patch("fsspec.filesystem")
- sqlite_store = SQLiteStore(
- store_path, next(session_id()), remote_path=remote_path
- )
- sqlite_store._remote_fs.glob.side_effect = ConnectionError("Connection error")
- sqlite_store._download()
- # assert that downloaded dbs are not downloaded
- assert set(file.name for file in Path(store_path).glob("*.db")) == {
- "session_store.db"
- }
- assert "Download failed: Connection error" in caplog.text
-
- def test_merge_databases(
- self,
- mocker,
- store_path,
- remote_path,
- mocked_db_in_s3,
- ):
- mocker.patch("fsspec.filesystem")
- sqlite_store = SQLiteStore(
- store_path, next(session_id()), remote_path=remote_path
- )
- sqlite_store._remote_fs.glob.return_value = mocked_db_in_s3
- sqlite_store._download()
- sqlite_store._merge()
- db_session = sqlite_store._db_session_class
- with db_session() as session:
- assert session.execute(select(RunModel.id)).scalars().all() == ["1", "2"]
-
- def test_merge_databases_with_repeated_runs(
- self,
- mocker,
- store_path,
- remote_path,
- mocked_db_in_s3_repeated_runs,
- ):
- mocker.patch("fsspec.filesystem")
- sqlite_store = SQLiteStore(
- store_path, next(session_id()), remote_path=remote_path
- )
- sqlite_store._remote_fs.glob.return_value = mocked_db_in_s3_repeated_runs
- sqlite_store._download()
- sqlite_store._merge()
- db_session = sqlite_store._db_session_class
- with db_session() as session:
- assert session.execute(select(RunModel.id)).scalars().all() == ["1", "2"]
-
- def test_sync(self, mocker, store_path, remote_path, mocked_db_in_s3):
- mocker.patch("fsspec.filesystem")
- sqlite_store = SQLiteStore(
- store_path, next(session_id()), remote_path=remote_path
- )
- sqlite_store._remote_fs.glob.return_value = mocked_db_in_s3
- mock_download = mocker.patch.object(sqlite_store, "_download")
- mock_merge = mocker.patch.object(sqlite_store, "_merge")
- mock_upload = mocker.patch.object(sqlite_store, "_upload")
- sqlite_store.sync()
- mock_download.assert_called_once()
- mock_merge.assert_called_once()
- mock_upload.assert_called_once()
-
- def test_sync_without_remote_path(self, mocker, store_path):
- mocker.patch("fsspec.filesystem")
- sqlite_store = SQLiteStore(store_path, next(session_id()))
- mock_download = mocker.patch.object(sqlite_store, "_download")
- mock_merge = mocker.patch.object(sqlite_store, "_merge")
- mock_upload = mocker.patch.object(sqlite_store, "_upload")
- sqlite_store.sync()
- mock_download.assert_not_called()
- mock_merge.assert_not_called()
- mock_upload.assert_not_called()
-
- def test_sync_with_merge_error(self, mocker, store_path, remote_path, caplog):
- mocker.patch("fsspec.filesystem")
- sqlite_store = SQLiteStore(
- store_path, next(session_id()), remote_path=remote_path
- )
- mock_download = mocker.patch.object(sqlite_store, "_download")
- mock_merge = mocker.patch.object(
- sqlite_store, "_merge", side_effect=Exception("Merge failed")
- )
- mock_upload = mocker.patch.object(sqlite_store, "_upload")
- sqlite_store.sync()
- mock_download.assert_called_once()
- mock_merge.assert_called_once()
- mock_upload.assert_called_once()
- assert "Merge failed on sync: Merge failed" in caplog.text
-
- def test_make_db_session_factory_with_azure_env_var(self, mocker, tmp_path):
- """Test that WAL mode is enabled when running in an Azure environment."""
- mocker.patch.dict(
- os.environ,
- {
- "AZUREML_ARM_SUBSCRIPTION": "dummy_value",
- "AZUREML_ARM_RESOURCEGROUP": "dummy_value",
- },
- )
- db_location = str(tmp_path / "test_session_store.db")
- session_class = make_db_session_factory(db_location)
-
- # Ensure that the session can be created without issues.
- with session_class() as session:
- assert session is not None
- # Check if the database is using WAL mode by querying the PRAGMA
- result = session.execute(text("PRAGMA journal_mode;")).scalar()
- assert result == "wal"
diff --git a/package/tests/test_server.py b/package/tests/test_server.py
index ca8d19a2c2..1c4d24b03a 100644
--- a/package/tests/test_server.py
+++ b/package/tests/test_server.py
@@ -84,30 +84,6 @@ def test_run_server_from_project(
# an uvicorn server is launched
patched_uvicorn_run.assert_called_once()
- def test_run_server_from_project_with_sqlite_store(
- self,
- patched_create_api_app_from_project,
- patched_data_access_manager,
- patched_uvicorn_run,
- patched_load_data_with_sqlite_session_store,
- example_catalog,
- example_pipelines,
- ):
- run_server()
- # assert that when running server, data are added correctly to the data access manager
- patched_data_access_manager.add_catalog.assert_called_once_with(
- example_catalog, example_pipelines
- )
- patched_data_access_manager.add_pipelines.assert_called_once_with(
- example_pipelines
- )
- patched_data_access_manager.set_db_session.assert_called_once()
-
- # correct api app is created
- patched_create_api_app_from_project.assert_called_once()
-
- # an uvicorn server is launched
- patched_uvicorn_run.assert_called_once()
def test_specific_pipeline(
self,
From 7cd9f79f8aa0d9af74c44f9b39d8fd9cb80313b3 Mon Sep 17 00:00:00 2001
From: Huong Nguyen
Date: Mon, 2 Dec 2024 21:33:47 +0000
Subject: [PATCH 03/16] remove doc about session store
Signed-off-by: Huong Nguyen
---
CONTRIBUTING.md | 8 -
docs/source/experiment_tracking.md | 360 -----------------------------
package/README.md | 27 ---
3 files changed, 395 deletions(-)
delete mode 100644 docs/source/experiment_tracking.md
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 8b8fda0c15..4d5f1b2ea9 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -210,14 +210,6 @@ make run PROJECT_PATH=/new-kedro-project
> **Note**: Once the backend development server is launched at port 4142, the local app will always pull data from that server. To prevent this, you can comment out the proxy setting in `package.json` and restart the dev server at port 4141.
-#### Launch the development server with the `SQLiteSessionStore`
-
-Kedro-Viz provides a `SQLiteSessionStore` that users can use in their project to enable experiment tracking functionality. If you want to use this session store with the development server, make sure you don't use a relative path when specifying the store's location in `settings.py`. For example, `demo-project` specifies the local `data` directory within a project as the session store's location as follows:
-
-```python
-from kedro_viz.integrations.kedro.sqlite_store import SQLiteStore
-SESSION_STORE_ARGS = {"path": str(Path(__file__).parents[2] / "data")}
-```
Owing to this coupling between the project settings and Kedro-Viz, if you wish to execute any Kedro commands on `demo-project` (including `kedro run`), you will need to install the Kedro-Viz Python package. To install your local development version of the package, run:
diff --git a/docs/source/experiment_tracking.md b/docs/source/experiment_tracking.md
deleted file mode 100644
index c06ca5a208..0000000000
--- a/docs/source/experiment_tracking.md
+++ /dev/null
@@ -1,360 +0,0 @@
-# Experiment tracking in Kedro-Viz
-
-```{important}
-Starting from version 8.0.0 of Kedro-Viz, Experiment Tracking is exclusively supported for users with kedro-datasets version 2.1.0 or higher.
-```
-
-Experiment tracking is the process of saving all the metadata related to an experiment each time you run it. It enables you to compare different runs of a machine-learning model as part of the experimentation process.
-
-The metadata you store may include:
-
-* Scripts used for running the experiment
-* Environment configuration files
-* Versions of the data used for training and evaluation
-* Evaluation metrics
-* Model weights
-* Plots and other visualisations
-
-You can use Kedro-Viz experiment tracking to store and access results, and to share them with others for comparison. Storage can be local or remote, such as cloud storage on AWS S3.
-
-The experiment tracking demo enables you to explore the experiment tracking capabilities of Kedro-Viz.
-
-![](./images/experiment-tracking_demo.gif)
-
-## Kedro versions supporting experiment tracking
-Kedro has always supported parameter versioning (as part of your codebase with a version control system like `git`) and Kedro’s dataset versioning capabilities enabled you to [snapshot models, datasets and plots](https://docs.kedro.org/en/stable/data/data_catalog.html#dataset-versioning).
-
-Kedro-Viz version 4.1.1 introduced metadata capture, visualisation, discovery and comparison, enabling you to access, edit and [compare your experiments](#access-run-data-and-compare-runs) and additionally [track how your metrics change over time](#view-and-compare-metrics-data).
-
-Kedro-Viz version 5.0 also supports the [display and comparison of plots, such as Plotly and Matplotlib](./preview_plotly_datasets.md). Support for metric plots (timeseries and parallel coords) was added to Kedro-Viz version 5.2.1.
-
-Kedro-Viz version 6.2 includes support for collaborative experiment tracking using a cloud storage solution. This means that multiple users can store their experiment data in a centralized remote storage, such as AWS S3, and access it through Kedro-Viz.
-
-## When should I use experiment tracking in Kedro?
-
-The choice of experiment tracking tool depends on your use case and choice of complementary tools, such as MLflow and Neptune:
-
-- **Kedro** - If you need experiment tracking, are looking for improved metrics visualisation and want a lightweight tool to work alongside existing functionality in Kedro. Kedro does not support a model registry.
-- **MLflow** - You can combine MLflow with Kedro by using [`kedro-mlflow`](https://kedro-mlflow.readthedocs.io/en/stable/) if you require experiment tracking, model registry and/or model serving capabilities or have access to Managed MLflow within the Databricks ecosystem.
-- **Neptune** - If you require experiment tracking and model registry functionality, improved visualisation of metrics and support for collaborative data science, you may consider [`kedro-neptune`](https://docs.neptune.ai/integrations/kedro/) for your workflow.
-
-{doc}`We support a growing list of integrations`.
-
-## Set up a project
-
-This section describes the steps necessary to set up experiment tracking and access logged metrics, using the {doc}`spaceflights tutorial` with a version of Kedro equal to or higher than 0.18.4, and a version of Kedro-Viz equal to or higher than 5.2.
-
-There are three steps to enable experiment tracking features with Kedro-Viz. We illustrate how to:
-
-- [Set up a session store to capture experiment metadata](#set-up-the-session-store)
-- [Set up experiment tracking datasets to list the metrics to track](#set-up-experiment-tracking-datasets)
-- [Modify your nodes and pipelines to output those metrics](#modify-your-nodes-and-pipelines-to-log-metrics)
-
-### Install Kedro and Kedro-Viz
-To use this tutorial code, you must already have {doc}`installed Kedro` and [Kedro-Viz](./kedro-viz_visualisation.md). You can confirm the versions you have installed by running `kedro info`
-
-```{note}
-The example code uses a version of Kedro-Viz `>6.2.0`.
-```
-
-Create a new project using the spaceflights starter. From the terminal run:
-
-```bash
-kedro new --starter=spaceflights-pandas
-```
-
-Feel free to name your project as you like, but this guide assumes the project is named `Spaceflights`.
-
-### Install the dependencies for the project
-
-Once you have created the project, to run project-specific Kedro commands, you must navigate to the directory in which it has been created:
-
-```bash
-cd spaceflights
-```
-Install the project's dependencies:
-
-```bash
-pip install -r src/requirements.txt
-```
-
-## Set up the session store
-
-In the domain of experiment tracking, each pipeline run is considered a session. A session store records all related metadata for each pipeline run, from logged metrics to other run-related data such as timestamp, `git` username and branch. The session store is a [SQLite](https://www.sqlite.org/index.html) database that is generated during your first pipeline run after it has been set up in your project.
-
-### Local storage
-To set up the session store locally, go to the `src/spaceflights/settings.py` file and add the following:
-
-```python
-from kedro_viz.integrations.kedro.sqlite_store import SQLiteStore
-from pathlib import Path
-
-SESSION_STORE_CLASS = SQLiteStore
-SESSION_STORE_ARGS = {"path": str(Path(__file__).parents[2] / "data")}
-```
-
-This specifies the creation of the `SQLiteStore` under the `data` subfolder, using the `SQLiteStore` setup from your installed Kedro-Viz plugin.
-
-This step is crucial to enable experiment tracking features on Kedro-Viz, as it is the database used to serve all run data to the Kedro-Viz front-end. Once this step is complete, you can either proceed to [set up the tracking datasets](#set-up-experiment-tracking-datasets) or [set up your nodes and pipelines to log metrics](#modify-your-nodes-and-pipelines-to-log-metrics); these two activities are interchangeable, but both should be completed to get a working experiment tracking setup.
-
-```{note}
-Starting from Kedro-Viz 9.2.0, if the user does not provide `SESSION_STORE_ARGS` in the project settings, a default directory `.viz` will be created at the root of your Kedro project and used for `SQLiteStore`.
-```
-
-## Collaborative experiment tracking
-
-```{note}
-To use collaborative experiment tracking, ensure that your installed version of Kedro-Viz is `>=6.2.0`.
-```
-
-For collaborative experiment tracking, Kedro-Viz saves your experiments as SQLite database files on a central cloud storage. To ensure that all users have a unique filename, set up your `KEDRO_SQLITE_STORE_USERNAME` in the environment variables. By default, Kedro-Viz will take your computer user name if this is not specified.
-
-> Note: In Kedro-Viz version 6.2, the only way to set up credentials for accessing your cloud storage is through environment variables.
-
-```bash
-export KEDRO_SQLITE_STORE_USERNAME="your_unique_username"
-
-```
-
-Now specify a remote path in the `SESSION_STORE_ARGS` variable, which links to your cloud storage.
-
-
-```python
-from kedro_viz.integrations.kedro.sqlite_store import SQLiteStore
-from pathlib import Path
-
-SESSION_STORE_CLASS = SQLiteStore
-SESSION_STORE_ARGS = {
- "path": str(Path(__file__).parents[2] / "data"),
- "remote_path": "s3://my-bucket-name/path/to/experiments",
-}
-```
-
-Finally, ensure you have the necessary credentials set up as shown below:
-
-```bash
-export AWS_ACCESS_KEY_ID="your_access_key_id"
-export AWS_SECRET_ACCESS_KEY="your_secret_access_key"
-export AWS_REGION="your_aws_region"
-
-```
-
-## Set up experiment tracking datasets
-
-There are two types of tracking datasets: {py:class}`tracking.MetricsDataset ` and {py:class}`tracking.JSONDataset `. The `tracking.MetricsDataset` should be used for tracking numerical metrics, and the `tracking.JSONDataset` can be used for tracking any other JSON-compatible data like boolean or text-based data.
-
-Set up two datasets to log the columns used in the companies dataset (`companies_columns`) and experiment metrics for the data science pipeline (`metrics`) like the coefficient of determination (`r2 score`), max error (`me`) and mean absolute error (`mae`) by adding the following in the `conf/base/catalog.yml` file:
-
-```yaml
-metrics:
- type: tracking.MetricsDataset
- filepath: data/09_tracking/metrics.json
-
-companies_columns:
- type: tracking.JSONDataset
- filepath: data/09_tracking/companies_columns.json
-```
-
-## Modify your nodes and pipelines to log metrics
-
-Now that you have set up the tracking datasets to log experiment tracking data, next ensure that the data is returned from your nodes.
-
-Set up the data to be logged for the metrics dataset - under `nodes.py` of your `data_science` pipeline (`src/spaceflights/pipelines/data_science/nodes.py`), add three different metrics to your `evaluate_model` function to log `r2_score`, `mae` and `me` and return these 3 metrics as key-value pairs.
-
-The new `evaluate_model` function should look like this:
-
-```python
-from sklearn.metrics import mean_absolute_error, max_error
-
-
-def evaluate_model(
- regressor: LinearRegression, X_test: pd.DataFrame, y_test: pd.Series
-) -> Dict[str, float]:
- """Calculates and logs the coefficient of determination.
-
- Args:
- regressor: Trained model.
- X_test: Testing data of independent features.
- y_test: Testing data for price.
- """
- y_pred = regressor.predict(X_test)
- score = r2_score(y_test, y_pred)
- mae = mean_absolute_error(y_test, y_pred)
- me = max_error(y_test, y_pred)
- logger = logging.getLogger(__name__)
- logger.info("Model has a coefficient R^2 of %.3f on test data.", score)
- return {"r2_score": score, "mae": mae, "max_error": me}
-```
-
-Next, ensure that the dataset is also specified as an output of your `evaluate_model` node. In the `src/spaceflights/pipelines/data_science/pipeline.py` file, specify the `output` of your `evaluate_model` to be the `metrics` dataset. Note that the output dataset must exactly match the name of the tracking dataset specified in the catalog file.
-
-The node of the `evaluate_model` on the pipeline should look like this:
-
-```python
-node(
- func=evaluate_model,
- inputs=["regressor", "X_test", "y_test"],
- name="evaluate_model_node",
- outputs="metrics",
-)
-```
-
-Repeat the same steps to set up the `companies_columns` dataset. For this dataset, log the column that contains the list of companies as outlined in the `companies.csv` file under the `data/01_raw` directory. Modify the `preprocess_companies` node under the `data_processing` pipeline (`src/spaceflights/pipelines/data_processing/nodes.py`) to return the data under a key-value pair, as shown below:
-
-```python
-from typing import Tuple, Dict
-
-
-def preprocess_companies(companies: pd.DataFrame) -> Tuple[pd.DataFrame, Dict]:
- """Preprocesses the data for companies.
-
- Args:
- companies: Raw data.
- Returns:
- Preprocessed data, with `company_rating` converted to a float and
- `iata_approved` converted to boolean.
- """
- companies["iata_approved"] = _is_true(companies["iata_approved"])
- companies["company_rating"] = _parse_percentage(companies["company_rating"])
- return companies, {"columns": companies.columns.tolist(), "data_type": "companies"}
-```
-
-Again, you must ensure that the dataset is also specified as an output on the `pipeline.py` file under the `data_processing` pipeline (`src/spaceflights/pipelines/data_processing/pipeline.py`), as follows:
-
-```python
-node(
- func=preprocess_companies,
- inputs="companies",
- outputs=["preprocessed_companies", "companies_columns"],
- name="preprocess_companies_node",
-)
-```
-
-Having set up both datasets, you can now generate your first set of experiment tracking data!
-
-## Generate the run data
-
-The beauty of native experiment tracking in Kedro is that all tracked data is generated and stored each time you do a Kedro run. Hence, to generate the data, you need only execute:
-
-```bash
-kedro run
-```
-
-After the run completes, under `data/09_tracking`, you can now see two folders, `companies_columns.json` and `metrics.json`. Kedro generates a folder named after each tracked dataset, and every pipeline run adds a timestamped subfolder inside it containing the JSON file of the metrics saved by that run.
-
-You can also see the `session_store.db` generated from your first pipeline run after enabling experiment tracking. It stores all the generated run metadata which, alongside the tracking datasets, is used to expose experiment tracking in Kedro-Viz.
-
-![](./images/experiment-tracking-folder.png)
-
-Execute `kedro run` a few times in a row to generate a larger set of experiment data. You can also play around with setting up different tracking datasets, and check the logged data via the generated JSON data files.
-
-## Access run data and compare runs
-
-Here comes the fun part of accessing your run data on Kedro-Viz. Having generated some run data, execute the following command:
-
-```bash
-kedro viz run
-```
-
-When you open the Kedro-Viz web app, you see an experiment tracking icon on the left-hand side of the screen.
-
-![](./images/experiment-tracking-icon.png)
-
-Click the icon to go to the experiment tracking page (you can also access the page from your browser at `http://127.0.0.1:4141/experiment-tracking`), where you can see the sets of experiment data generated from all previous runs:
-
-![](./images/experiment-tracking-runs-list.png)
-
-You can now access, compare and pin your runs by toggling the `Compare runs` button:
-
-![](./images/experiment-tracking-compare-runs.png)
-
-## View and compare plots
-
-In this section, we illustrate how to compare Matplotlib plots across experimental runs (functionality available since Kedro-Viz version 5.0).
-
-### Update the dependencies
-
-Update the `src/requirements.txt` file in your Kedro project by adding the following dataset to enable Matplotlib for your project:
-
-```text
-kedro-datasets[matplotlib.MatplotlibWriter]~=1.1
-seaborn~=0.12.1
-```
-
-And install the requirements with:
-
-```bash
-pip install -r src/requirements.txt
-```
-
-### Add a plotting node
-
-Add a new node to the `data_processing` nodes (`src/spaceflights/pipelines/data_processing/nodes.py`):
-
-```python
-import matplotlib.pyplot as plt
-import seaborn as sn
-
-
-def create_confusion_matrix(companies: pd.DataFrame):
- actuals = [0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1]
- predicted = [1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1]
- data = {"y_Actual": actuals, "y_Predicted": predicted}
- df = pd.DataFrame(data, columns=["y_Actual", "y_Predicted"])
- confusion_matrix = pd.crosstab(
- df["y_Actual"], df["y_Predicted"], rownames=["Actual"], colnames=["Predicted"]
- )
- sn.heatmap(confusion_matrix, annot=True)
- return plt
-```
-
-And now add this node to the `data_processing` pipeline (`src/spaceflights/pipelines/data_processing/pipeline.py`):
-
-```python
-from .nodes import create_confusion_matrix
-
-node(
- func=create_confusion_matrix,
- inputs="companies",
- outputs="confusion_matrix",
-),
-```
-
-In the catalog (`conf/base/catalog.yml`) add the `confusion_matrix` data definition, making sure to set the versioned flag to `true` within the project catalog to include the plot in experiment tracking:
-
-```yaml
-confusion_matrix:
- type: matplotlib.MatplotlibWriter
- filepath: data/09_tracking/confusion_matrix.png
- versioned: true
-```
-
-After running the pipeline with `kedro run`, the plot is saved and you can see it in the experiment tracking panel when you execute `kedro viz run`. Clicking on a plot expands it. When in comparison view, expanding a plot shows all the plots in that view for side-by-side comparison.
-
-![](./images/experiment-tracking-plots-comparison.png)
-
-![](./images/experiment-tracking-plots-comparison-expanded.png)
-
-## View and compare metrics data
-
-From Kedro-Viz `>=5.2.1` experiment tracking also supports the display and comparison of metrics data through two chart types: time series and parallel coordinates.
-
-Time series displays one metric per graph, showing how the metric value has changed over time.
-
-Parallel coordinates displays all metrics on a single graph, with each vertical line representing one metric with its own scale. The metric values are positioned along those vertical lines and connected across each axis.
-
-When in comparison view, comparing runs highlights your selections on the respective chart types, improving readability even when there are many data points.
-
-```{note}
-The following graphic is taken from the [Kedro-Viz experiment tracking demo](https://demo.kedro.org/) (it is not a visualisation from the example code you created above).
-```
-
-![](./images/experiment-tracking-metrics-comparison.gif)
-
-Additionally, you can monitor the changes to metrics over time from the pipeline visualisation tab which you can access by following the icon on the left-hand side of the screen.
-
-![](./images/pipeline_visualisation_icon.png)
-
-Clicking on any `MetricsDataset` node opens a side panel displaying how the metric value has changed over time:
-
-![](./images/pipeline_show_metrics.gif)
diff --git a/package/README.md b/package/README.md
index dc543fae18..f2d5ab880d 100644
--- a/package/README.md
+++ b/package/README.md
@@ -205,33 +205,6 @@ Options:
-h, --help Show this message and exit.
```
-### Experiment Tracking usage
-
-To enable [experiment tracking](https://docs.kedro.org/en/stable/experiment_tracking/index.html) in Kedro-Viz, you need to add the Kedro-Viz `SQLiteStore` to your Kedro project.
-
-This can be done by adding the below code to `settings.py` in the `src` folder of your Kedro project.
-
-```python
-from kedro_viz.integrations.kedro.sqlite_store import SQLiteStore
-from pathlib import Path
-SESSION_STORE_CLASS = SQLiteStore
-SESSION_STORE_ARGS = {"path": str(Path(__file__).parents[2] / "data")}
-```
-
-Once the above set-up is complete, tracking datasets can be used to track relevant data for Kedro runs. More information on how to use tracking datasets can be found in the [experiment tracking documentation](https://docs.kedro.org/en/stable/experiment_tracking/index.html).
-
-**Notes:**
-
-- Experiment Tracking is only available for Kedro-Viz >= 4.0.2 and Kedro >= 0.17.5
-- Prior to Kedro 0.17.6, when using tracking datasets, you will have to explicitly mark the datasets as `versioned` for them to show up properly in the Kedro-Viz experiment tracking tab. From Kedro 0.17.6 onwards, this is done automatically:
-
-```yaml
-train_evaluation.r2_score_linear_regression:
- type: tracking.MetricsDataset
- filepath: ${base_location}/09_tracking/linear_score.json
- versioned: true
-```
-
### Standalone React component usage
To use Kedro-Viz as a standalone React component, you can follow the example below. However, please note that Kedro-Viz does not support server-side rendering (SSR). If you're using Next.js or another SSR framework, you should be aware of this limitation.
From cc50ab6ab63662c1b055a2f6f601d0451cf9ddf3 Mon Sep 17 00:00:00 2001
From: Huong Nguyen
Date: Tue, 3 Dec 2024 10:01:46 +0000
Subject: [PATCH 04/16] lint formatting
Signed-off-by: Huong Nguyen
---
package/tests/conftest.py | 1 -
package/tests/test_server.py | 1 -
2 files changed, 2 deletions(-)
diff --git a/package/tests/conftest.py b/package/tests/conftest.py
index 2ab1233a21..6582ab985a 100644
--- a/package/tests/conftest.py
+++ b/package/tests/conftest.py
@@ -42,7 +42,6 @@ def session_store():
yield BaseSessionStore("dummy_path", "dummy_session_id")
-
@pytest.fixture
def example_stats_dict():
yield {
diff --git a/package/tests/test_server.py b/package/tests/test_server.py
index 1c4d24b03a..011b05defd 100644
--- a/package/tests/test_server.py
+++ b/package/tests/test_server.py
@@ -84,7 +84,6 @@ def test_run_server_from_project(
# an uvicorn server is launched
patched_uvicorn_run.assert_called_once()
-
def test_specific_pipeline(
self,
patched_data_access_manager,
From d283fca5115b1ef7e269ce350ca47e81fbee95b4 Mon Sep 17 00:00:00 2001
From: Huong Nguyen
Date: Tue, 3 Dec 2024 10:10:16 +0000
Subject: [PATCH 05/16] remove session store from tests
Signed-off-by: Huong Nguyen
---
package/tests/conftest.py | 19 +------------------
1 file changed, 1 insertion(+), 18 deletions(-)
diff --git a/package/tests/conftest.py b/package/tests/conftest.py
index 6582ab985a..a54000748e 100644
--- a/package/tests/conftest.py
+++ b/package/tests/conftest.py
@@ -6,7 +6,6 @@
import pandas as pd
import pytest
from fastapi.testclient import TestClient
-from kedro.framework.session.store import BaseSessionStore
from kedro.io import DataCatalog, MemoryDataset, Version
from kedro.pipeline import Pipeline, node
from kedro.pipeline.modular_pipeline import pipeline
@@ -37,11 +36,6 @@ def data_access_manager():
yield DataAccessManager()
-@pytest.fixture
-def session_store():
- yield BaseSessionStore("dummy_path", "dummy_session_id")
-
-
@pytest.fixture
def example_stats_dict():
yield {
@@ -484,7 +478,6 @@ def example_api(
data_access_manager: DataAccessManager,
example_pipelines: Dict[str, Pipeline],
example_catalog: DataCatalog,
- session_store: BaseSessionStore,
example_stats_dict: Dict,
mocker,
):
@@ -493,7 +486,6 @@ def example_api(
data_access_manager,
example_catalog,
example_pipelines,
- session_store,
example_stats_dict,
)
mocker.patch(
@@ -512,14 +504,11 @@ def example_api_no_default_pipeline(
data_access_manager: DataAccessManager,
example_pipelines: Dict[str, Pipeline],
example_catalog: DataCatalog,
- session_store: BaseSessionStore,
mocker,
):
del example_pipelines["__default__"]
api = apps.create_api_app_from_project(mock.MagicMock())
- populate_data(
- data_access_manager, example_catalog, example_pipelines, session_store, {}
- )
+ populate_data(data_access_manager, example_catalog, example_pipelines, {})
mocker.patch(
"kedro_viz.api.rest.responses.pipelines.data_access_manager",
new=data_access_manager,
@@ -536,7 +525,6 @@ def example_api_for_edge_case_pipelines(
data_access_manager: DataAccessManager,
edge_case_example_pipelines: Dict[str, Pipeline],
example_catalog: DataCatalog,
- session_store: BaseSessionStore,
mocker,
):
api = apps.create_api_app_from_project(mock.MagicMock())
@@ -552,7 +540,6 @@ def example_api_for_edge_case_pipelines(
data_access_manager,
example_catalog,
edge_case_example_pipelines,
- session_store,
{},
)
mocker.patch(
@@ -571,7 +558,6 @@ def example_api_for_pipelines_with_additional_tags(
data_access_manager: DataAccessManager,
example_pipelines_with_additional_tags: Dict[str, Pipeline],
example_catalog: DataCatalog,
- session_store: BaseSessionStore,
mocker,
):
api = apps.create_api_app_from_project(mock.MagicMock())
@@ -587,7 +573,6 @@ def example_api_for_pipelines_with_additional_tags(
data_access_manager,
example_catalog,
example_pipelines_with_additional_tags,
- session_store,
{},
)
mocker.patch(
@@ -606,7 +591,6 @@ def example_transcoded_api(
data_access_manager: DataAccessManager,
example_transcoded_pipelines: Dict[str, Pipeline],
example_transcoded_catalog: DataCatalog,
- session_store: BaseSessionStore,
mocker,
):
api = apps.create_api_app_from_project(mock.MagicMock())
@@ -614,7 +598,6 @@ def example_transcoded_api(
data_access_manager,
example_transcoded_catalog,
example_transcoded_pipelines,
- session_store,
{},
)
mocker.patch(
From e083357a756cbcb0130ff56606189b4b256e9cca Mon Sep 17 00:00:00 2001
From: Huong Nguyen
Date: Tue, 3 Dec 2024 10:19:16 +0000
Subject: [PATCH 06/16] update docs reference
Signed-off-by: Huong Nguyen
---
docs/source/index.md | 4 +---
docs/source/kedro-viz_visualisation.md | 1 -
2 files changed, 1 insertion(+), 4 deletions(-)
diff --git a/docs/source/index.md b/docs/source/index.md
index ea10570cfb..4db6c326b8 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -6,7 +6,7 @@
-Kedro-Viz is an interactive development tool for visualising data science pipelines built with [Kedro](https://github.com/kedro-org/kedro). Kedro-Viz also enables users to view and compare different experiment runs within their Kedro project.
+Kedro-Viz is an interactive development tool for visualising data science pipelines built with [Kedro](https://github.com/kedro-org/kedro).
Kedro-Viz features include:
@@ -18,7 +18,6 @@ Kedro-Viz features include:
🎨 Rich metadata side panel to display parameters, plots, etc.
📊 Support for all types of [Plotly charts](https://plotly.com/javascript/).
♻️ Autoreload on code change.
-🧪 Support for experiment tracking and comparing runs in a Kedro project.
Take a look at the live demo for a preview of Kedro-Viz.
@@ -30,7 +29,6 @@ kedro-viz_visualisation
share_kedro_viz
preview_datasets
slice_a_pipeline
-experiment_tracking
```
```{toctree}
diff --git a/docs/source/kedro-viz_visualisation.md b/docs/source/kedro-viz_visualisation.md
index 66c9cf651d..d4e30fb80f 100644
--- a/docs/source/kedro-viz_visualisation.md
+++ b/docs/source/kedro-viz_visualisation.md
@@ -89,7 +89,6 @@ Some of the known limitations while using `--lite` flag:
* If the datasets are not resolved, they will be defaulted to a custom dataset `UnavailableDataset`.
* The flowchart will not show the layers information for the datasets.
-* Experiment Tracking will not work if the pre-requisite of having kedro-datasets version 2.1.0 and above is not met.
## Automatic visualisation updates
From d8abbd30636c71331768c8f78a36240c1b39b5e5 Mon Sep 17 00:00:00 2001
From: Huong Nguyen
Date: Tue, 3 Dec 2024 10:28:55 +0000
Subject: [PATCH 07/16] update test_server
Signed-off-by: Huong Nguyen
---
package/tests/test_server.py | 20 +-------------------
1 file changed, 1 insertion(+), 19 deletions(-)
diff --git a/package/tests/test_server.py b/package/tests/test_server.py
index 011b05defd..1c0960c1cb 100644
--- a/package/tests/test_server.py
+++ b/package/tests/test_server.py
@@ -31,30 +31,12 @@ def patched_create_api_app_from_file(mocker):
@pytest.fixture(autouse=True)
-def patched_load_data(
- mocker, example_catalog, example_pipelines, session_store, example_stats_dict
-):
+def patched_load_data(mocker, example_catalog, example_pipelines, example_stats_dict):
yield mocker.patch(
"kedro_viz.server.kedro_data_loader.load_data",
return_value=(
example_catalog,
example_pipelines,
- session_store,
- example_stats_dict,
- ),
- )
-
-
-@pytest.fixture
-def patched_load_data_with_sqlite_session_store(
- mocker, example_catalog, example_pipelines, sqlite_session_store, example_stats_dict
-):
- yield mocker.patch(
- "kedro_viz.server.kedro_data_loader.load_data",
- return_value=(
- example_catalog,
- example_pipelines,
- sqlite_session_store,
example_stats_dict,
),
)
From 2fb35765239ee1ddf6e3924d8aad2346e6eff6b7 Mon Sep 17 00:00:00 2001
From: Huong Nguyen
Date: Tue, 3 Dec 2024 12:35:40 +0000
Subject: [PATCH 08/16] remove session store from load_data
Signed-off-by: Huong Nguyen
---
package/kedro_viz/integrations/kedro/data_loader.py | 3 +--
package/kedro_viz/server.py | 2 +-
2 files changed, 2 insertions(+), 3 deletions(-)
diff --git a/package/kedro_viz/integrations/kedro/data_loader.py b/package/kedro_viz/integrations/kedro/data_loader.py
index 6232270368..47f3b98a05 100644
--- a/package/kedro_viz/integrations/kedro/data_loader.py
+++ b/package/kedro_viz/integrations/kedro/data_loader.py
@@ -13,7 +13,6 @@
from kedro import __version__
from kedro.framework.project import configure_project, pipelines
from kedro.framework.session import KedroSession
-from kedro.framework.session.store import BaseSessionStore
from kedro.framework.startup import bootstrap_project
from kedro.io import DataCatalog
from kedro.pipeline import Pipeline
@@ -120,7 +119,7 @@ def load_data(
package_name: Optional[str] = None,
extra_params: Optional[Dict[str, Any]] = None,
is_lite: bool = False,
-) -> Tuple[DataCatalog, Dict[str, Pipeline], BaseSessionStore, Dict]:
+) -> Tuple[DataCatalog, Dict[str, Pipeline], Dict]:
"""Load data from a Kedro project.
Args:
project_path: the path where the Kedro project is located.
diff --git a/package/kedro_viz/server.py b/package/kedro_viz/server.py
index 747bbb16d9..d52378bbc7 100644
--- a/package/kedro_viz/server.py
+++ b/package/kedro_viz/server.py
@@ -48,7 +48,7 @@ def load_and_populate_data(
"""Loads underlying Kedro project data and populates Kedro Viz Repositories"""
# Loads data from underlying Kedro Project
-    catalog, pipelines, _, stats_dict = kedro_data_loader.load_data(
+ catalog, pipelines, stats_dict = kedro_data_loader.load_data(
path, env, include_hooks, package_name, extra_params, is_lite
)
From ab66d10a00ca90d4b2e2c85641cbbb75aa4932e9 Mon Sep 17 00:00:00 2001
From: Huong Nguyen
Date: Tue, 3 Dec 2024 13:09:08 +0000
Subject: [PATCH 09/16] remove make_db_session_factory
Signed-off-by: Huong Nguyen
---
package/kedro_viz/database.py | 18 -------
.../integrations/kedro/data_loader.py | 3 +-
package/kedro_viz/server.py | 1 -
.../tests/test_api/test_graphql/conftest.py | 50 -------------------
4 files changed, 1 insertion(+), 71 deletions(-)
diff --git a/package/kedro_viz/database.py b/package/kedro_viz/database.py
index 5a62e32bcb..b545ddc665 100644
--- a/package/kedro_viz/database.py
+++ b/package/kedro_viz/database.py
@@ -21,21 +21,3 @@ def configure_wal_for_azure(engine):
if is_azure_ml:
with engine.connect() as conn:
conn.execute(text("PRAGMA journal_mode=WAL;"))
-
-
-def make_db_session_factory(session_store_location: str) -> sessionmaker:
- """SQLAlchemy connection to a SQLite DB"""
- database_url = f"sqlite:///{session_store_location}"
- engine = create_engine(database_url, connect_args={"check_same_thread": False})
- # TODO: making db session factory shouldn't depend on models.
- # So want to move the table creation elsewhere ideally.
- # But this means returning engine as well as session class.
-
- # Check if we are running in an Azure ML environment if so enable WAL mode.
- configure_wal_for_azure(engine)
-
- # Create the database tables if they do not exist.
- Base.metadata.create_all(bind=engine)
-
- # Return a session factory bound to the engine.
- return sessionmaker(bind=engine)
diff --git a/package/kedro_viz/integrations/kedro/data_loader.py b/package/kedro_viz/integrations/kedro/data_loader.py
index 47f3b98a05..fc34abcb3a 100644
--- a/package/kedro_viz/integrations/kedro/data_loader.py
+++ b/package/kedro_viz/integrations/kedro/data_loader.py
@@ -133,8 +133,7 @@ def load_data(
configuration.
is_lite: A flag to run Kedro-Viz in lite mode.
Returns:
- A tuple containing the data catalog, pipeline dictionary, session store
- and dataset stats dictionary.
+        A tuple containing the data catalog, pipeline dictionary, and dataset stats dictionary.
"""
if package_name:
configure_project(package_name)
diff --git a/package/kedro_viz/server.py b/package/kedro_viz/server.py
index d52378bbc7..5c6c48b7ce 100644
--- a/package/kedro_viz/server.py
+++ b/package/kedro_viz/server.py
@@ -10,7 +10,6 @@
from kedro_viz.autoreload_file_filter import AutoreloadFileFilter
from kedro_viz.constants import DEFAULT_HOST, DEFAULT_PORT
from kedro_viz.data_access import DataAccessManager, data_access_manager
-from kedro_viz.database import make_db_session_factory
from kedro_viz.integrations.kedro import data_loader as kedro_data_loader
from kedro_viz.launchers.utils import _check_viz_up, _wait_for, display_cli_message
diff --git a/package/tests/test_api/test_graphql/conftest.py b/package/tests/test_api/test_graphql/conftest.py
index fb57f5aa58..78ee0156c0 100644
--- a/package/tests/test_api/test_graphql/conftest.py
+++ b/package/tests/test_api/test_graphql/conftest.py
@@ -7,59 +7,9 @@
from kedro_datasets import matplotlib, pandas, plotly, tracking
from kedro_viz.api.graphql.types import Run
-from kedro_viz.database import make_db_session_factory
from kedro_viz.models.experiment_tracking import RunModel, UserRunDetailsModel
-@pytest.fixture
-def example_run_ids():
- yield ["2021-11-03T18.24.24.379Z", "2021-11-02T18.24.24.379Z"]
-
-
-@pytest.fixture
-def example_db_session(tmp_path):
- session_store_location = Path(tmp_path / "session_store.db")
- session_class = make_db_session_factory(session_store_location)
- yield session_class
-
-
-@pytest.fixture
-def example_db_session_with_runs(example_db_session, example_run_ids):
- with example_db_session.begin() as session:
- for run_id in example_run_ids:
- session_data = {
- "package_name": "testsql",
- "project_path": "/Users/Projects/testsql",
- "session_id": run_id,
- "cli": {
- "args": [],
- "params": {
- "from_inputs": [],
- "to_outputs": [],
- "from_nodes": [],
- "to_nodes": [],
- "node_names": (),
- "runner": None,
- "parallel": False,
- "is_async": False,
- "env": None,
- "tag": (),
- "load_version": {},
- "pipeline": None,
- "config": None,
- "params": {},
- },
- "command_name": "run",
- "command_path": "kedro run",
- },
- }
- run = RunModel(id=run_id, blob=json.dumps(session_data))
- user_run_details = UserRunDetailsModel(run_id=run.id, bookmark=True)
- session.add(run)
- session.add(user_run_details)
- yield example_db_session
-
-
@pytest.fixture
def data_access_manager_with_no_run(data_access_manager, example_db_session, mocker):
data_access_manager.set_db_session(example_db_session)
From 0490962469383db1ff3896308ce17e2e79ddd6ca Mon Sep 17 00:00:00 2001
From: Huong Nguyen
Date: Tue, 3 Dec 2024 15:11:33 +0000
Subject: [PATCH 10/16] remove session store from _load_data_helper
Signed-off-by: Huong Nguyen
---
package/kedro_viz/integrations/kedro/data_loader.py | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/package/kedro_viz/integrations/kedro/data_loader.py b/package/kedro_viz/integrations/kedro/data_loader.py
index fc34abcb3a..ec2d6671bd 100644
--- a/package/kedro_viz/integrations/kedro/data_loader.py
+++ b/package/kedro_viz/integrations/kedro/data_loader.py
@@ -72,8 +72,7 @@ def _load_data_helper(
configuration.
is_lite: A flag to run Kedro-Viz in lite mode.
Returns:
- A tuple containing the data catalog, pipeline dictionary, session store
- and dataset stats dictionary.
+        A tuple containing the data catalog, pipeline dictionary, and dataset stats dictionary.
"""
with KedroSession.create(
@@ -87,7 +86,6 @@ def _load_data_helper(
session._hook_manager = _VizNullPluginManager() # type: ignore
context = session.load_context()
- session_store = session._store
# patch the AbstractDataset class for a custom
# implementation to handle kedro.io.core.DatasetError
@@ -109,7 +107,7 @@ def _load_data_helper(
# Useful for users who have `get_current_session` in their `register_pipelines()`.
pipelines_dict = dict(pipelines)
stats_dict = _get_dataset_stats(project_path)
- return catalog, pipelines_dict, session_store, stats_dict
+ return catalog, pipelines_dict, stats_dict
def load_data(
From 57cba0b126add5a6b8f0ad5a66331143ee782ae8 Mon Sep 17 00:00:00 2001
From: Huong Nguyen
Date: Tue, 3 Dec 2024 16:36:02 +0000
Subject: [PATCH 11/16] remove database.py entirely
Signed-off-by: Huong Nguyen
---
demo-project/src/demo_project/settings.py | 6 ++++++
package/kedro_viz/database.py | 23 -----------------------
2 files changed, 6 insertions(+), 23 deletions(-)
delete mode 100644 package/kedro_viz/database.py
diff --git a/demo-project/src/demo_project/settings.py b/demo-project/src/demo_project/settings.py
index 5023ebc1f4..63533b8eb9 100644
--- a/demo-project/src/demo_project/settings.py
+++ b/demo-project/src/demo_project/settings.py
@@ -2,6 +2,12 @@
# List the installed plugins for which to disable auto-registry
# DISABLE_HOOKS_FOR_PLUGINS = ("kedro-viz",)
+# Define custom context class. Defaults to `KedroContext`
+# CONTEXT_CLASS = KedroContext
+
+# Define the configuration folder. Defaults to `conf`
+# CONF_ROOT = "conf"
+
from kedro.config import OmegaConfigLoader # NOQA
CONFIG_LOADER_CLASS = OmegaConfigLoader
diff --git a/package/kedro_viz/database.py b/package/kedro_viz/database.py
deleted file mode 100644
index b545ddc665..0000000000
--- a/package/kedro_viz/database.py
+++ /dev/null
@@ -1,23 +0,0 @@
-"""Database management layer based on SQLAlchemy"""
-
-import os
-
-from sqlalchemy import create_engine, text
-from sqlalchemy.orm import sessionmaker
-
-from kedro_viz.models.experiment_tracking import Base
-
-
-def configure_wal_for_azure(engine):
- """Applies WAL mode to SQLite if running in an Azure ML environment."""
- is_azure_ml = any(
- var in os.environ
- for var in [
- "AZUREML_ARM_SUBSCRIPTION",
- "AZUREML_ARM_RESOURCEGROUP",
- "AZUREML_RUN_ID",
- ]
- )
- if is_azure_ml:
- with engine.connect() as conn:
- conn.execute(text("PRAGMA journal_mode=WAL;"))
From 3ff2a075ba73e1fb61024e6c65d827cadc34546b Mon Sep 17 00:00:00 2001
From: Huong Nguyen
Date: Wed, 4 Dec 2024 10:10:06 +0000
Subject: [PATCH 12/16] remove test graphql folder
Signed-off-by: Huong Nguyen
---
.../tests/test_api/test_graphql/__init__.py | 0
.../tests/test_api/test_graphql/conftest.py | 196 --------
.../test_api/test_graphql/test_mutations.py | 232 ----------
.../test_api/test_graphql/test_queries.py | 429 ------------------
.../test_api/test_graphql/test_serializers.py | 0
5 files changed, 857 deletions(-)
delete mode 100644 package/tests/test_api/test_graphql/__init__.py
delete mode 100644 package/tests/test_api/test_graphql/conftest.py
delete mode 100644 package/tests/test_api/test_graphql/test_mutations.py
delete mode 100644 package/tests/test_api/test_graphql/test_queries.py
delete mode 100644 package/tests/test_api/test_graphql/test_serializers.py
diff --git a/package/tests/test_api/test_graphql/__init__.py b/package/tests/test_api/test_graphql/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/package/tests/test_api/test_graphql/conftest.py b/package/tests/test_api/test_graphql/conftest.py
deleted file mode 100644
index 78ee0156c0..0000000000
--- a/package/tests/test_api/test_graphql/conftest.py
+++ /dev/null
@@ -1,196 +0,0 @@
-import base64
-import json
-from pathlib import Path
-
-import pytest
-from kedro.io import DataCatalog, Version
-from kedro_datasets import matplotlib, pandas, plotly, tracking
-
-from kedro_viz.api.graphql.types import Run
-from kedro_viz.models.experiment_tracking import RunModel, UserRunDetailsModel
-
-
-@pytest.fixture
-def data_access_manager_with_no_run(data_access_manager, example_db_session, mocker):
- data_access_manager.set_db_session(example_db_session)
- mocker.patch(
- "kedro_viz.api.graphql.schema.data_access_manager", data_access_manager
- )
- yield data_access_manager
-
-
-@pytest.fixture
-def data_access_manager_with_runs(
- data_access_manager, example_db_session_with_runs, mocker
-):
- data_access_manager.set_db_session(example_db_session_with_runs)
- mocker.patch(
- "kedro_viz.api.graphql.schema.data_access_manager", data_access_manager
- )
- yield data_access_manager
-
-
-@pytest.fixture
-def save_version(example_run_ids):
- yield example_run_ids[0]
-
-
-@pytest.fixture
-def example_tracking_catalog(example_run_ids, tmp_path):
- example_run_id = example_run_ids[0]
- metrics_dataset = tracking.MetricsDataset(
- filepath=Path(tmp_path / "test.json").as_posix(),
- version=Version(None, example_run_id),
- )
- metrics_dataset.save({"col1": 1, "col2": 2, "col3": 3})
-
- csv_dataset = pandas.CSVDataset(
- filepath=Path(tmp_path / "metrics.csv").as_posix(),
- version=Version(None, example_run_id),
- )
-
- more_metrics = tracking.MetricsDataset(
- filepath=Path(tmp_path / "metrics.json").as_posix(),
- version=Version(None, example_run_id),
- )
- more_metrics.save({"col4": 4, "col5": 5, "col6": 6})
-
- json_dataset = tracking.JSONDataset(
- filepath=Path(tmp_path / "tracking.json").as_posix(),
- version=Version(None, example_run_id),
- )
- json_dataset.save({"col7": "column_seven", "col2": True, "col3": 3})
-
- plotly_dataset = plotly.JSONDataset(
- filepath=Path(tmp_path / "plotly.json").as_posix(),
- version=Version(None, example_run_id),
- )
-
- class MockPlotlyData:
- data = {
- "data": [
- {
- "x": ["giraffes", "orangutans", "monkeys"],
- "y": [20, 14, 23],
- "type": "bar",
- }
- ]
- }
-
- @classmethod
- def write_json(cls, fs_file, **kwargs):
- json.dump(cls.data, fs_file, **kwargs)
-
- plotly_dataset.save(MockPlotlyData)
-
- matplotlib_dataset = matplotlib.MatplotlibWriter(
- filepath=Path(tmp_path / "matplotlib.png").as_posix(),
- version=Version(None, example_run_id),
- )
-
- class MockMatplotData:
- data = base64.b64decode(
- "iVBORw0KGgoAAAANSUhEUg"
- "AAAAEAAAABCAQAAAC1HAwCAA"
- "AAC0lEQVQYV2NgYAAAAAM"
- "AAWgmWQ0AAAAASUVORK5CYII="
- )
-
- @classmethod
- def savefig(cls, bytes_buffer, **kwargs):
- bytes_buffer.write(cls.data)
-
- matplotlib_dataset.save(MockMatplotData)
-
- catalog = DataCatalog(
- datasets={
- "metrics": metrics_dataset,
- "csv_dataset": csv_dataset,
- "more_metrics": more_metrics,
- "json_tracking": json_dataset,
- "plotly_dataset": plotly_dataset,
- "matplotlib_dataset": matplotlib_dataset,
- }
- )
-
- yield catalog
-
-
-@pytest.fixture
-def example_multiple_run_tracking_catalog(example_run_ids, tmp_path):
- new_metrics_dataset = tracking.MetricsDataset(
- filepath=Path(tmp_path / "test.json").as_posix(),
- version=Version(None, example_run_ids[1]),
- )
- new_metrics_dataset.save({"col1": 1, "col3": 3})
- new_metrics_dataset = tracking.MetricsDataset(
- filepath=Path(tmp_path / "test.json").as_posix(),
- version=Version(None, example_run_ids[0]),
- )
- new_data = {"col1": 3, "col2": 3.23}
- new_metrics_dataset.save(new_data)
- catalog = DataCatalog(
- datasets={
- "new_metrics": new_metrics_dataset,
- }
- )
-
- yield catalog
-
-
-@pytest.fixture
-def example_multiple_run_tracking_catalog_at_least_one_empty_run(
- example_run_ids, tmp_path
-):
- new_metrics_dataset = tracking.MetricsDataset(
- filepath=Path(tmp_path / "test.json").as_posix(),
- version=Version(None, example_run_ids[1]),
- )
- new_metrics_dataset.save({"col1": 1, "col3": 3})
- new_metrics_dataset = tracking.MetricsDataset(
- filepath=Path(tmp_path / "test.json").as_posix(),
- version=Version(None, example_run_ids[0]),
- )
- catalog = DataCatalog(
- datasets={
- "new_metrics": new_metrics_dataset,
- }
- )
-
- yield catalog
-
-
-@pytest.fixture
-def example_multiple_run_tracking_catalog_all_empty_runs(example_run_ids, tmp_path):
- new_metrics_dataset = tracking.MetricsDataset(
- filepath=Path(tmp_path / "test.json").as_posix(),
- version=Version(None, example_run_ids[1]),
- )
- new_metrics_dataset = tracking.MetricsDataset(
- filepath=Path(tmp_path / "test.json").as_posix(),
- version=Version(None, example_run_ids[0]),
- )
- catalog = DataCatalog(
- datasets={
- "new_metrics": new_metrics_dataset,
- }
- )
-
- yield catalog
-
-
-@pytest.fixture
-def example_runs(example_run_ids):
- yield [
- Run(
- id=run_id,
- bookmark=False,
- notes="Hello World",
- title="Hello Kedro",
- author="",
- git_branch="",
- git_sha="",
- run_command="",
- )
- for run_id in example_run_ids
- ]
diff --git a/package/tests/test_api/test_graphql/test_mutations.py b/package/tests/test_api/test_graphql/test_mutations.py
deleted file mode 100644
index 5ff3285386..0000000000
--- a/package/tests/test_api/test_graphql/test_mutations.py
+++ /dev/null
@@ -1,232 +0,0 @@
-import json
-
-import pytest
-
-from kedro_viz.models.experiment_tracking import RunModel
-
-
-@pytest.mark.usefixtures("data_access_manager_with_runs")
-class TestGraphQLMutation:
- @pytest.mark.parametrize(
- "bookmark,notes,title",
- [
- (
- False,
- "new notes",
- "new title",
- ),
- (True, "new notes", "new title"),
- (True, "", ""),
- ],
- )
- def test_update_user_details_success(
- self,
- bookmark,
- notes,
- title,
- client,
- example_run_ids,
- ):
- example_run_id = example_run_ids[0]
- query = f"""
- mutation updateRun {{
- updateRunDetails(
- runId: "{example_run_id}",
- runInput: {{bookmark: {str(bookmark).lower()}, notes: "{notes}", title: "{title}"}}
- ) {{
- __typename
- ... on UpdateRunDetailsSuccess {{
- run {{
- id
- title
- bookmark
- notes
- }}
- }}
- ... on UpdateRunDetailsFailure {{
- id
- errorMessage
- }}
- }}
- }}
- """
- response = client.post("/graphql", json={"query": query})
- assert response.json() == {
- "data": {
- "updateRunDetails": {
- "__typename": "UpdateRunDetailsSuccess",
- "run": {
- "id": example_run_id,
- "bookmark": bookmark,
- "title": title if title != "" else example_run_id,
- "notes": notes,
- },
- }
- }
- }
-
- def test_update_user_details_only_bookmark(
- self,
- client,
- example_run_ids,
- ):
- example_run_id = example_run_ids[0]
- query = f"""
- mutation updateRun {{
- updateRunDetails(runId: "{example_run_id}", runInput: {{bookmark: true}}) {{
- __typename
- ... on UpdateRunDetailsSuccess {{
- run {{
- id
- title
- bookmark
- notes
- }}
- }}
- ... on UpdateRunDetailsFailure {{
- id
- errorMessage
- }}
- }}
- }}
- """
-
- response = client.post("/graphql", json={"query": query})
- assert response.json() == {
- "data": {
- "updateRunDetails": {
- "__typename": "UpdateRunDetailsSuccess",
- "run": {
- "id": example_run_id,
- "bookmark": True,
- "title": example_run_id,
- "notes": "",
- },
- }
- }
- }
-
- def test_update_user_details_should_add_when_no_details_exist(
- self, client, data_access_manager_with_no_run
- ):
- # add a new run
- example_run_id = "test_id"
- run = RunModel(
- id=example_run_id,
- blob=json.dumps(
- {"session_id": example_run_id, "cli": {"command_path": "kedro run"}}
- ),
- )
- data_access_manager_with_no_run.runs.add_run(run)
-
- query = f"""
- mutation updateRun {{
- updateRunDetails(runId: "{example_run_id}", runInput: {{bookmark: true}}) {{
- __typename
- ... on UpdateRunDetailsSuccess {{
- run {{
- id
- title
- bookmark
- notes
- }}
- }}
- ... on UpdateRunDetailsFailure {{
- id
- errorMessage
- }}
- }}
- }}
- """
-
- response = client.post("/graphql", json={"query": query})
- assert response.json() == {
- "data": {
- "updateRunDetails": {
- "__typename": "UpdateRunDetailsSuccess",
- "run": {
- "id": example_run_id,
- "bookmark": True,
- "title": example_run_id,
- "notes": "",
- },
- }
- }
- }
-
- def test_update_user_details_should_update_when_details_exist(
- self, client, example_run_ids
- ):
- example_run_id = example_run_ids[0]
- query = f"""
- mutation updateRun {{
- updateRunDetails(runId: "{example_run_id}", runInput: {{title:"new title", notes: "new notes"}}) {{
- __typename
- ... on UpdateRunDetailsSuccess {{
- run {{
- id
- title
- bookmark
- notes
- }}
- }}
- ... on UpdateRunDetailsFailure {{
- id
- errorMessage
- }}
- }}
- }}
- """
-
- response = client.post("/graphql", json={"query": query})
- assert response.json() == {
- "data": {
- "updateRunDetails": {
- "__typename": "UpdateRunDetailsSuccess",
- "run": {
- "id": example_run_id,
- "bookmark": True,
- "title": "new title",
- "notes": "new notes",
- },
- }
- }
- }
-
- def test_update_user_details_should_fail_when_run_doesnt_exist(self, client):
- response = client.post(
- "/graphql",
- json={
- "query": """
- mutation {
- updateRunDetails(
- runId: "I don't exist",
- runInput: { bookmark: false, title: "Hello Kedro", notes: "There are notes"}
- ) {
- __typename
- ... on UpdateRunDetailsSuccess {
- run {
- id
- title
- notes
- bookmark
- }
- }
- ... on UpdateRunDetailsFailure {
- id
- errorMessage
- }
- }
- }
- """
- },
- )
- assert response.json() == {
- "data": {
- "updateRunDetails": {
- "__typename": "UpdateRunDetailsFailure",
- "id": "I don't exist",
- "errorMessage": "Given run_id: I don't exist doesn't exist",
- }
- }
- }
diff --git a/package/tests/test_api/test_graphql/test_queries.py b/package/tests/test_api/test_graphql/test_queries.py
deleted file mode 100644
index 05dcf6fcda..0000000000
--- a/package/tests/test_api/test_graphql/test_queries.py
+++ /dev/null
@@ -1,429 +0,0 @@
-import json
-
-import pytest
-from packaging.version import parse
-
-from kedro_viz import __version__
-
-
-class TestQueryNoSessionStore:
- def test_graphql_run_list_endpoint(self, client):
- response = client.post("/graphql", json={"query": "{runsList {id bookmark}}"})
- assert response.json() == {"data": {"runsList": []}}
-
- def test_graphql_runs_metadata_endpoint(self, client):
- response = client.post(
- "/graphql",
- json={"query": '{runMetadata(runIds: ["id"]) {id bookmark}}'},
- )
- assert response.json() == {"data": {"runMetadata": []}}
-
-
-@pytest.mark.usefixtures("data_access_manager_with_no_run")
-class TestQueryNoRun:
- def test_graphql_run_list_endpoint(self, client):
- response = client.post("/graphql", json={"query": "{runsList {id bookmark}}"})
- assert response.json() == {"data": {"runsList": []}}
-
- def test_graphql_runs_metadata_endpoint(self, client):
- response = client.post(
- "/graphql",
- json={"query": '{runMetadata(runIds: ["invalid run id"]) {id bookmark}}'},
- )
- assert response.json() == {"data": {"runMetadata": []}}
-
-
-@pytest.mark.usefixtures("data_access_manager_with_runs")
-class TestQueryWithRuns:
- def test_run_list_query(
- self,
- client,
- example_run_ids,
- ):
- response = client.post("/graphql", json={"query": "{runsList {id bookmark}}"})
- assert response.json() == {
- "data": {
- "runsList": [
- {"id": run_id, "bookmark": True} for run_id in example_run_ids
- ]
- }
- }
-
- def test_graphql_runs_metadata_endpoint(self, example_run_ids, client):
- response = client.post(
- "/graphql",
- json={
- "query": f"""{{runMetadata(runIds: ["{ example_run_ids[0] }"]) {{id bookmark}}}}"""
- },
- )
- assert response.json() == {
- "data": {"runMetadata": [{"id": example_run_ids[0], "bookmark": True}]}
- }
-
- def test_run_tracking_data_query(
- self,
- example_run_ids,
- client,
- example_tracking_catalog,
- data_access_manager_with_runs,
- example_pipelines,
- ):
- data_access_manager_with_runs.add_catalog(
- example_tracking_catalog, example_pipelines
- )
- example_run_id = example_run_ids[0]
-
- response = client.post(
- "/graphql",
- json={
- "query": f"""
- {{
- metrics: runTrackingData(runIds:["{example_run_id}"],group:METRIC)
- {{datasetName, datasetType, data}}
- json: runTrackingData(runIds:["{example_run_id}"],group:JSON)
- {{datasetName, datasetType, data}}
- plots: runTrackingData(runIds:["{example_run_id}"],group:PLOT)
- {{datasetName, datasetType, data}}
- }}
- """
- },
- )
-
- expected_response = {
- "data": {
- "metrics": [
- {
- "datasetName": "metrics",
- "datasetType": "tracking.metrics_dataset.MetricsDataset",
- "data": {
- "col1": [{"runId": example_run_id, "value": 1.0}],
- "col2": [{"runId": example_run_id, "value": 2.0}],
- "col3": [{"runId": example_run_id, "value": 3.0}],
- },
- },
- {
- "datasetName": "more_metrics",
- "datasetType": "tracking.metrics_dataset.MetricsDataset",
- "data": {
- "col4": [{"runId": example_run_id, "value": 4.0}],
- "col5": [{"runId": example_run_id, "value": 5.0}],
- "col6": [{"runId": example_run_id, "value": 6.0}],
- },
- },
- ],
- "json": [
- {
- "datasetName": "json_tracking",
- "datasetType": "tracking.json_dataset.JSONDataset",
- "data": {
- "col2": [{"runId": example_run_id, "value": True}],
- "col3": [{"runId": example_run_id, "value": 3}],
- "col7": [
- {
- "runId": example_run_id,
- "value": "column_seven",
- }
- ],
- },
- },
- ],
- "plots": [
- {
- "datasetName": "plotly_dataset",
- "datasetType": "plotly.json_dataset.JSONDataset",
- "data": {
- "plotly.json": [
- {
- "runId": "2021-11-03T18.24.24.379Z",
- "value": {
- "data": [
- {
- "x": [
- "giraffes",
- "orangutans",
- "monkeys",
- ],
- "y": [20, 14, 23],
- "type": "bar",
- }
- ]
- },
- }
- ]
- },
- },
- {
- "datasetName": "matplotlib_dataset",
- "datasetType": "matplotlib.matplotlib_writer.MatplotlibWriter",
- "data": {
- "matplotlib.png": [
- {
- "runId": "2021-11-03T18.24.24.379Z",
- "value": "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVQYV2NgYAAAAAMAAWgmWQ0AAAAASUVORK5CYII=",
- }
- ]
- },
- },
- ],
- }
- }
-
- assert response.json() == expected_response
-
- def test_metrics_data(
- self,
- client,
- example_tracking_catalog,
- data_access_manager_with_runs,
- example_pipelines,
- ):
- data_access_manager_with_runs.add_catalog(
- example_tracking_catalog, example_pipelines
- )
-
- response = client.post(
- "/graphql",
- json={
- "query": "query MyQuery {\n runMetricsData(limit: 3) {\n data\n }\n}\n"
- },
- )
-
- expected = {
- "data": {
- "runMetricsData": {
- "data": {
- "metrics": {
- "metrics.col1": [1.0, None],
- "metrics.col2": [2.0, None],
- "metrics.col3": [3.0, None],
- "more_metrics.col4": [4.0, None],
- "more_metrics.col5": [5.0, None],
- "more_metrics.col6": [6.0, None],
- },
- "runs": {
- "2021-11-02T18.24.24.379Z": [
- None,
- None,
- None,
- None,
- None,
- None,
- ],
- "2021-11-03T18.24.24.379Z": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
- },
- }
- }
- }
- }
-
- assert response.json() == expected
-
- @pytest.mark.parametrize(
- "show_diff,expected_response",
- [
- (
- True,
- {
- "data": {
- "runTrackingData": [
- {
- "datasetName": "new_metrics",
- "datasetType": "tracking.metrics_dataset.MetricsDataset",
- "data": {
- "col1": [
- {
- "runId": "2021-11-03T18.24.24.379Z",
- "value": 3.0,
- },
- {
- "runId": "2021-11-02T18.24.24.379Z",
- "value": 1.0,
- },
- ],
- "col2": [
- {
- "runId": "2021-11-03T18.24.24.379Z",
- "value": 3.23,
- },
- ],
- "col3": [
- {
- "runId": "2021-11-02T18.24.24.379Z",
- "value": 3.0,
- },
- ],
- },
- }
- ]
- }
- },
- ),
- (
- False,
- {
- "data": {
- "runTrackingData": [
- {
- "datasetName": "new_metrics",
- "datasetType": "tracking.metrics_dataset.MetricsDataset",
- "data": {
- "col1": [
- {
- "runId": "2021-11-03T18.24.24.379Z",
- "value": 3.0,
- },
- {
- "runId": "2021-11-02T18.24.24.379Z",
- "value": 1.0,
- },
- ],
- },
- },
- ]
- }
- },
- ),
- ],
- )
- def test_graphql_run_tracking_data(
- self,
- example_run_ids,
- client,
- example_multiple_run_tracking_catalog,
- data_access_manager_with_runs,
- show_diff,
- expected_response,
- example_pipelines,
- ):
- data_access_manager_with_runs.add_catalog(
- example_multiple_run_tracking_catalog, example_pipelines
- )
-
- response = client.post(
- "/graphql",
- json={
- "query": f"""{{runTrackingData
- (group: METRIC runIds:{json.dumps(example_run_ids)}, showDiff: {json.dumps(show_diff)})
- {{datasetName, datasetType, data}}}}"""
- },
- )
- assert response.json() == expected_response
-
- @pytest.mark.parametrize(
- "show_diff,expected_response",
- [
- (
- True,
- {
- "data": {
- "runTrackingData": [
- {
- "datasetName": "new_metrics",
- "datasetType": "tracking.metrics_dataset.MetricsDataset",
- "data": {
- "col1": [
- {
- "runId": "2021-11-02T18.24.24.379Z",
- "value": 1.0,
- },
- ],
- "col3": [
- {
- "runId": "2021-11-02T18.24.24.379Z",
- "value": 3.0,
- },
- ],
- },
- }
- ]
- }
- },
- ),
- (
- False,
- {"data": {"runTrackingData": []}},
- ),
- ],
- )
- def test_graphql_run_tracking_data_at_least_one_empty_run(
- self,
- example_run_ids,
- client,
- example_multiple_run_tracking_catalog_at_least_one_empty_run,
- data_access_manager_with_runs,
- show_diff,
- expected_response,
- example_pipelines,
- ):
- data_access_manager_with_runs.add_catalog(
- example_multiple_run_tracking_catalog_at_least_one_empty_run,
- example_pipelines,
- )
-
- response = client.post(
- "/graphql",
- json={
- "query": f"""{{runTrackingData
- (group: METRIC runIds:{json.dumps(example_run_ids)}, showDiff: {json.dumps(show_diff)})
- {{datasetName, datasetType, data}}}}"""
- },
- )
- assert response.json() == expected_response
-
- @pytest.mark.parametrize(
- "show_diff,expected_response",
- [
- (
- True,
- {"data": {"runTrackingData": []}},
- ),
- (
- False,
- {"data": {"runTrackingData": []}},
- ),
- ],
- )
- def test_graphql_run_tracking_data_all_empty_runs(
- self,
- example_run_ids,
- client,
- example_multiple_run_tracking_catalog_all_empty_runs,
- data_access_manager_with_runs,
- show_diff,
- expected_response,
- example_pipelines,
- ):
- data_access_manager_with_runs.add_catalog(
- example_multiple_run_tracking_catalog_all_empty_runs, example_pipelines
- )
-
- response = client.post(
- "/graphql",
- json={
- "query": f"""{{runTrackingData
- (group: METRIC runIds:{json.dumps(example_run_ids)}, showDiff: {json.dumps(show_diff)})
- {{datasetName, datasetType, data}}}}"""
- },
- )
- assert response.json() == expected_response
-
-
-class TestQueryVersion:
- def test_graphql_version_endpoint(self, client, mocker):
- mocker.patch(
- "kedro_viz.api.graphql.schema.get_latest_version",
- return_value=parse("1.0.0"),
- )
- response = client.post(
- "/graphql",
- json={"query": "{version {installed isOutdated latest}}"},
- )
- assert response.json() == {
- "data": {
- "version": {
- "installed": __version__,
- "isOutdated": False,
- "latest": "1.0.0",
- }
- }
- }
diff --git a/package/tests/test_api/test_graphql/test_serializers.py b/package/tests/test_api/test_graphql/test_serializers.py
deleted file mode 100644
index e69de29bb2..0000000000
From 4ad56fada1928607e46ad2b3778617628ab2a2b5 Mon Sep 17 00:00:00 2001
From: Huong Nguyen
Date: Wed, 4 Dec 2024 11:32:02 +0000
Subject: [PATCH 13/16] remove cy test related to ET tab
Signed-off-by: Huong Nguyen
---
cypress/tests/ui/toolbar/global-toolbar.cy.js | 11 -----------
1 file changed, 11 deletions(-)
diff --git a/cypress/tests/ui/toolbar/global-toolbar.cy.js b/cypress/tests/ui/toolbar/global-toolbar.cy.js
index 64971aa1d7..c64c8ecf78 100644
--- a/cypress/tests/ui/toolbar/global-toolbar.cy.js
+++ b/cypress/tests/ui/toolbar/global-toolbar.cy.js
@@ -20,17 +20,6 @@ describe('Global Toolbar', () => {
cy.get('.details__tabs').should('not.exist');
});
- it('verifies that users can access the experiment tracking page through the experiment tracking button, when in the flowchart view. #TC-2', () => {
- cy.get('[data-test="global-toolbar-experiments-btn"]').click();
- cy.location('pathname').should('eq', '/experiment-tracking');
-
- // should exist
- cy.get('.details__tabs').should('exist');
-
- // should not exist
- cy.get('.pipeline-wrapper').should('not.exist');
- });
-
it('verifies that users can change the theme from light to dark theme, or dark to light theme. #TC-3', () => {
// Alias
cy.get('[data-test*="global-toolbar-theme-btn-"]').as('toggleTheme');
From 7bf73d772199667db7266ddee1fddaf810042744 Mon Sep 17 00:00:00 2001
From: Huong Nguyen
Date: Wed, 4 Dec 2024 11:47:16 +0000
Subject: [PATCH 14/16] Revert changes for .md as it will be done separately
Signed-off-by: Huong Nguyen
---
docs/source/experiment_tracking.md | 360 +++++++++++++++++++++++++
docs/source/index.md | 4 +-
docs/source/kedro-viz_visualisation.md | 3 +-
package/README.md | 29 +-
4 files changed, 393 insertions(+), 3 deletions(-)
create mode 100644 docs/source/experiment_tracking.md
diff --git a/docs/source/experiment_tracking.md b/docs/source/experiment_tracking.md
new file mode 100644
index 0000000000..3a6f42addb
--- /dev/null
+++ b/docs/source/experiment_tracking.md
@@ -0,0 +1,360 @@
+# Experiment tracking in Kedro-Viz
+
+```{important}
+Starting from version 8.0.0 of Kedro-Viz, Experiment Tracking is exclusively supported for users with kedro-datasets version 2.1.0 or higher.
+```
+
+Experiment tracking is the process of saving all the metadata related to an experiment each time you run it. It enables you to compare different runs of a machine-learning model as part of the experimentation process.
+
+The metadata you store may include:
+
+* Scripts used for running the experiment
+* Environment configuration files
+* Versions of the data used for training and evaluation
+* Evaluation metrics
+* Model weights
+* Plots and other visualisations
+
+You can use Kedro-Viz experiment tracking to store and access results, and to share them with others for comparison. Storage can be local or remote, such as cloud storage on AWS S3.
+
+The experiment tracking demo enables you to explore the experiment tracking capabilities of Kedro-Viz.
+
+![](./images/experiment-tracking_demo.gif)
+
+## Kedro versions supporting experiment tracking
+Kedro has always supported parameter versioning (as part of your codebase with a version control system like `git`) and Kedro’s dataset versioning capabilities enable you to [snapshot models, datasets and plots](https://docs.kedro.org/en/stable/data/data_catalog.html#dataset-versioning).
+
+Kedro-Viz version 4.1.1 introduced metadata capture, visualisation, discovery and comparison, enabling you to access, edit and [compare your experiments](#access-run-data-and-compare-runs) and additionally [track how your metrics change over time](#view-and-compare-metrics-data).
+
+Kedro-Viz version 5.0 also supports the [display and comparison of plots, such as Plotly and Matplotlib](./preview_plotly_datasets.md). Support for metric plots (time series and parallel coordinates) was added to Kedro-Viz version 5.2.1.
+
+Kedro-Viz version 6.2 includes support for collaborative experiment tracking using a cloud storage solution. This means that multiple users can store their experiment data in a centralized remote storage, such as AWS S3, and access it through Kedro-Viz.
+
+## When should I use experiment tracking in Kedro?
+
+The choice of experiment tracking tool depends on your use case and choice of complementary tools, such as MLflow and Neptune:
+
+- **Kedro** - If you need experiment tracking, are looking for improved metrics visualisation and want a lightweight tool to work alongside existing functionality in Kedro. Kedro does not support a model registry.
+- **MLflow** - You can combine MLflow with Kedro by using [`kedro-mlflow`](https://kedro-mlflow.readthedocs.io/en/stable/) if you require experiment tracking, model registry and/or model serving capabilities or have access to Managed MLflow within the Databricks ecosystem.
+- **Neptune** - If you require experiment tracking and model registry functionality, improved visualisation of metrics and support for collaborative data science, you may consider [`kedro-neptune`](https://docs.neptune.ai/integrations/kedro/) for your workflow.
+
+{doc}`We support a growing list of integrations`.
+
+## Set up a project
+
+This section describes the steps necessary to set up experiment tracking and access logged metrics, using the {doc}`spaceflights tutorial` with a version of Kedro equal to or higher than 0.18.4, and a version of Kedro-Viz equal to or higher than 5.2.
+
+There are three steps to enable experiment tracking features with Kedro-Viz. We illustrate how to:
+
+- [Set up a session store to capture experiment metadata](#set-up-the-session-store)
+- [Set up experiment tracking datasets to list the metrics to track](#set-up-experiment-tracking-datasets)
+- [Modify your nodes and pipelines to output those metrics](#modify-your-nodes-and-pipelines-to-log-metrics)
+
+### Install Kedro and Kedro-Viz
+To use this tutorial code, you must already have {doc}`installed Kedro` and [Kedro-Viz](./kedro-viz_visualisation.md). You can confirm the versions you have installed by running `kedro info`.
+
+```{note}
+The example code uses a version of Kedro-Viz `>6.2.0`.
+```
+
+Create a new project using the spaceflights starter. From the terminal run:
+
+```bash
+kedro new --starter=spaceflights-pandas
+```
+
+Feel free to name your project as you like, but this guide assumes the project is named `Spaceflights`.
+
+### Install the dependencies for the project
+
+Once you have created the project, to run project-specific Kedro commands, you must navigate to the directory in which it has been created:
+
+```bash
+cd spaceflights
+```
+Install the project's dependencies:
+
+```bash
+pip install -r src/requirements.txt
+```
+
+## Set up the session store
+
+In the domain of experiment tracking, each pipeline run is considered a session. A session store records all related metadata for each pipeline run, from logged metrics to other run-related data such as timestamp, `git` username and branch. The session store is a [SQLite](https://www.sqlite.org/index.html) database that is generated during your first pipeline run after it has been set up in your project.
+
+### Local storage
+To set up the session store locally, go to the `src/spaceflights/settings.py` file and add the following:
+
+```python
+from kedro_viz.integrations.kedro.sqlite_store import SQLiteStore
+from pathlib import Path
+
+SESSION_STORE_CLASS = SQLiteStore
+SESSION_STORE_ARGS = {"path": str(Path(__file__).parents[2] / "data")}
+```
+
+This creates the `SQLiteStore` under the `data` subfolder, using the `SQLiteStore` setup from your installed Kedro-Viz plugin.
+
+This step is crucial to enable experiment tracking features on Kedro-Viz, as it sets up the database used to serve all run data to the Kedro-Viz front-end. Once this step is complete, you can either proceed to [set up the tracking datasets](#set-up-experiment-tracking-datasets) or [set up your nodes and pipelines to log metrics](#modify-your-nodes-and-pipelines-to-log-metrics); these two steps can be done in either order, but both must be completed for a working experiment tracking setup.
+
+```{note}
+Starting from Kedro-Viz 9.2.0, if you do not provide `SESSION_STORE_ARGS` in the project settings, a default directory `.viz` is created at the root of your Kedro project and used for the `SQLiteStore`.
+```
+
+## Collaborative experiment tracking
+
+```{note}
+To use collaborative experiment tracking, ensure that your installed version of Kedro-Viz is `>=6.2.0`.
+```
+
+For collaborative experiment tracking, Kedro-Viz saves your experiments as SQLite database files on a central cloud storage. To ensure that all users have a unique filename, set up your `KEDRO_SQLITE_STORE_USERNAME` in the environment variables. If this is not specified, Kedro-Viz defaults to your computer's user name.
+
+> Note: In Kedro-Viz version 6.2, the only way to set up credentials for accessing your cloud storage is through environment variables.
+
+```bash
+export KEDRO_SQLITE_STORE_USERNAME="your_unique_username"
+```
+
+Now specify a remote path in the `SESSION_STORE_ARGS` variable, which links to your cloud storage.
+
+
+```python
+from kedro_viz.integrations.kedro.sqlite_store import SQLiteStore
+from pathlib import Path
+
+SESSION_STORE_CLASS = SQLiteStore
+SESSION_STORE_ARGS = {
+ "path": str(Path(__file__).parents[2] / "data"),
+ "remote_path": "s3://my-bucket-name/path/to/experiments",
+}
+```
+
+Finally, ensure you have the necessary credentials set up as shown below:
+
+```bash
+export AWS_ACCESS_KEY_ID="your_access_key_id"
+export AWS_SECRET_ACCESS_KEY="your_secret_access_key"
+export AWS_REGION="your_aws_region"
+```
+
+## Set up experiment tracking datasets
+
+There are two types of tracking datasets: {py:class}`tracking.MetricsDataset <kedro_datasets.tracking.MetricsDataset>` and {py:class}`tracking.JSONDataset <kedro_datasets.tracking.JSONDataset>`. The `tracking.MetricsDataset` should be used for tracking numerical metrics, and the `tracking.JSONDataset` can be used for tracking any other JSON-compatible data like boolean or text-based data.
+
+Set up two datasets to log the columns used in the companies dataset (`companies_columns`) and experiment metrics for the data science pipeline (`metrics`) like the coefficient of determination (`r2 score`), max error (`me`) and mean absolute error (`mae`) by adding the following in the `conf/base/catalog.yml` file:
+
+```yaml
+metrics:
+ type: tracking.MetricsDataset
+ filepath: data/09_tracking/metrics.json
+
+companies_columns:
+ type: tracking.JSONDataset
+ filepath: data/09_tracking/companies_columns.json
+```
+
+## Modify your nodes and pipelines to log metrics
+
+Now that you have set up the tracking datasets to log experiment tracking data, next ensure that the data is returned from your nodes.
+
+Set up the data to be logged for the metrics dataset - under `nodes.py` of your `data_science` pipeline (`src/spaceflights/pipelines/data_science/nodes.py`), add three different metrics to your `evaluate_model` function to log `r2_score`, `mae` and `me` and return these 3 metrics as key-value pairs.
+
+The new `evaluate_model` function should look like this:
+
+```python
+import logging
+from typing import Dict
+
+import pandas as pd
+from sklearn.linear_model import LinearRegression
+from sklearn.metrics import max_error, mean_absolute_error, r2_score
+
+
+def evaluate_model(
+ regressor: LinearRegression, X_test: pd.DataFrame, y_test: pd.Series
+) -> Dict[str, float]:
+ """Calculates and logs the coefficient of determination.
+
+ Args:
+ regressor: Trained model.
+ X_test: Testing data of independent features.
+ y_test: Testing data for price.
+ """
+ y_pred = regressor.predict(X_test)
+ score = r2_score(y_test, y_pred)
+ mae = mean_absolute_error(y_test, y_pred)
+ me = max_error(y_test, y_pred)
+ logger = logging.getLogger(__name__)
+ logger.info("Model has a coefficient R^2 of %.3f on test data.", score)
+ return {"r2_score": score, "mae": mae, "max_error": me}
+```
+
+Next, ensure that the dataset is also specified as an output of your `evaluate_model` node. In the `src/spaceflights/pipelines/data_science/pipeline.py` file, specify the `output` of your `evaluate_model` to be the `metrics` dataset. Note that the output dataset must exactly match the name of the tracking dataset specified in the catalog file.
+
+The node of the `evaluate_model` on the pipeline should look like this:
+
+```python
+node(
+ func=evaluate_model,
+ inputs=["regressor", "X_test", "y_test"],
+ name="evaluate_model_node",
+ outputs="metrics",
+)
+```
+
+Repeat the same steps to set up the `companies_columns` dataset. For this dataset, log the list of columns in the `companies.csv` file under the `data/01_raw` directory. Modify the `preprocess_companies` node under the `data_processing` pipeline (`src/spaceflights/pipelines/data_processing/nodes.py`) to return the data under a key-value pair, as shown below:
+
+```python
+from typing import Dict, Tuple
+
+import pandas as pd
+
+
+def preprocess_companies(companies: pd.DataFrame) -> Tuple[pd.DataFrame, Dict]:
+ """Preprocesses the data for companies.
+
+ Args:
+ companies: Raw data.
+ Returns:
+ Preprocessed data, with `company_rating` converted to a float and
+ `iata_approved` converted to boolean.
+ """
+ companies["iata_approved"] = _is_true(companies["iata_approved"])
+ companies["company_rating"] = _parse_percentage(companies["company_rating"])
+ return companies, {"columns": companies.columns.tolist(), "data_type": "companies"}
+```
+
+Again, you must ensure that the dataset is also specified as an output on the `pipeline.py` file under the `data_processing` pipeline (`src/spaceflights/pipelines/data_processing/pipeline.py`), as follows:
+
+```python
+node(
+ func=preprocess_companies,
+ inputs="companies",
+ outputs=["preprocessed_companies", "companies_columns"],
+ name="preprocess_companies_node",
+)
+```
+
+Having set up both datasets, you can now generate your first set of experiment tracking data!
+
+## Generate the run data
+
+The beauty of native experiment tracking in Kedro is that all tracked data is generated and stored each time you do a Kedro run. Hence, to generate the data, you need only execute:
+
+```bash
+kedro run
+```
+
+After the run completes, you can see two folders under `data/09_tracking`: `companies_columns.json` and `metrics.json`. Kedro generates one folder per tracked dataset, named after the dataset. Inside each of these folders, every pipeline run creates a subfolder named with the run's timestamp, containing a JSON file with the metrics saved by that run.
+
+You can also see the `session_store.db` database generated by your first pipeline run after enabling experiment tracking. It stores all the generated run metadata which, alongside the tracking datasets, is used to expose experiment tracking to Kedro-Viz.
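+
+For illustration, after a run the `data` folder might look roughly like this (the timestamp folder name below is made up; yours will differ):
+
+```text
+data
+├── 09_tracking
+│   ├── companies_columns.json
+│   │   └── 2024-01-01T10.00.00.000Z
+│   │       └── companies_columns.json
+│   └── metrics.json
+│       └── 2024-01-01T10.00.00.000Z
+│           └── metrics.json
+└── session_store.db
+```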
+
+![](./images/experiment-tracking-folder.png)
+
+Execute `kedro run` a few times in a row to generate a larger set of experiment data. You can also play around with setting up different tracking datasets, and check the logged data via the generated JSON data files.
+
+## Access run data and compare runs
+
+Here comes the fun part of accessing your run data on Kedro-Viz. Having generated some run data, execute the following command:
+
+```bash
+kedro viz run
+```
+
+When you open the Kedro-Viz web app, you see an experiment tracking icon on the left-hand side of the screen.
+
+![](./images/experiment-tracking-icon.png)
+
+Click the icon to go to the experiment tracking page (you can also access the page from your browser at `http://127.0.0.1:4141/experiment-tracking`), where you can see the sets of experiment data generated from all previous runs:
+
+![](./images/experiment-tracking-runs-list.png)
+
+You can now access, compare and pin your runs by toggling the `Compare runs` button:
+
+![](./images/experiment-tracking-compare-runs.png)
+
+## View and compare plots
+
+In this section, we illustrate how to compare Matplotlib plots across experimental runs (functionality available since Kedro-Viz version 5.0).
+
+### Update the dependencies
+
+Update the `src/requirements.txt` file in your Kedro project by adding the following dependencies to enable Matplotlib for your project:
+
+```text
+kedro-datasets[matplotlib.MatplotlibWriter]~=1.1
+seaborn~=0.12.1
+```
+
+And install the requirements with:
+
+```bash
+pip install -r src/requirements.txt
+```
+
+### Add a plotting node
+
+Add a new node to the `data_processing` nodes (`src/spaceflights/pipelines/data_processing/nodes.py`):
+
+```python
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sn
+
+
+def create_confusion_matrix(companies: pd.DataFrame):
+ actuals = [0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1]
+ predicted = [1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1]
+ data = {"y_Actual": actuals, "y_Predicted": predicted}
+ df = pd.DataFrame(data, columns=["y_Actual", "y_Predicted"])
+ confusion_matrix = pd.crosstab(
+ df["y_Actual"], df["y_Predicted"], rownames=["Actual"], colnames=["Predicted"]
+ )
+ sn.heatmap(confusion_matrix, annot=True)
+ return plt
+```
+
+And now add this node to the `data_processing` pipeline (`src/spaceflights/pipelines/data_processing/pipeline.py`):
+
+```python
+from .nodes import create_confusion_matrix
+
+node(
+ func=create_confusion_matrix,
+ inputs="companies",
+ outputs="confusion_matrix",
+),
+```
+
+In the catalog (`conf/base/catalog.yml`) add the `confusion_matrix` data definition, making sure to set the versioned flag to `true` within the project catalog to include the plot in experiment tracking:
+
+```yaml
+confusion_matrix:
+ type: matplotlib.MatplotlibWriter
+ filepath: data/09_tracking/confusion_matrix.png
+ versioned: true
+```
+
+After running the pipeline with `kedro run`, the plot is saved and you can see it in the experiment tracking panel when you execute `kedro viz run`. Clicking on a plot expands it. When in comparison view, expanding a plot shows all the plots in that view for side-by-side comparison.
+
+![](./images/experiment-tracking-plots-comparison.png)
+
+![](./images/experiment-tracking-plots-comparison-expanded.png)
+
+## View and compare metrics data
+
+From Kedro-Viz `>=5.2.1`, experiment tracking also supports the display and comparison of metrics data through two chart types: time series and parallel coordinates.
+
+Time series displays one metric per graph, showing how the metric value has changed over time.
+
+Parallel coordinates displays all metrics on a single graph, with each vertical line representing one metric with its own scale. The metric values are positioned along those vertical lines and connected across each axis.
+
+When in comparison view, comparing runs highlights your selections on the respective chart types, improving readability even when there are a large number of data points.
+
+```{note}
+The following graphic is taken from the [Kedro-Viz experiment tracking demo](https://demo.kedro.org/) (it is not a visualisation from the example code you created above).
+```
+
+![](./images/experiment-tracking-metrics-comparison.gif)
+
+Additionally, you can monitor the changes to metrics over time from the pipeline visualisation tab which you can access by following the icon on the left-hand side of the screen.
+
+![](./images/pipeline_visualisation_icon.png)
+
+Clicking on any `MetricsDataset` node opens a side panel displaying how the metric value has changed over time:
+
+![](./images/pipeline_show_metrics.gif)
\ No newline at end of file
diff --git a/docs/source/index.md b/docs/source/index.md
index 4db6c326b8..ea10570cfb 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -6,7 +6,7 @@
-Kedro-Viz is an interactive development tool for visualising data science pipelines built with [Kedro](https://github.com/kedro-org/kedro).
+Kedro-Viz is an interactive development tool for visualising data science pipelines built with [Kedro](https://github.com/kedro-org/kedro). Kedro-Viz also enables users to view and compare different experiment runs within their Kedro project.
Kedro-Viz features include:
@@ -18,6 +18,7 @@ Kedro-Viz features include:
🎨 Rich metadata side panel to display parameters, plots, etc.
📊 Support for all types of [Plotly charts](https://plotly.com/javascript/).
♻️ Autoreload on code change.
+🧪 Support for experiment tracking and comparing runs in a Kedro project.
Take a look at the live demo for a preview of Kedro-Viz.
@@ -29,6 +30,7 @@ kedro-viz_visualisation
share_kedro_viz
preview_datasets
slice_a_pipeline
+experiment_tracking
```
```{toctree}
diff --git a/docs/source/kedro-viz_visualisation.md b/docs/source/kedro-viz_visualisation.md
index d4e30fb80f..18379f45d4 100644
--- a/docs/source/kedro-viz_visualisation.md
+++ b/docs/source/kedro-viz_visualisation.md
@@ -89,6 +89,7 @@ Some of the known limitations while using `--lite` flag:
* If the datasets are not resolved, they will be defaulted to a custom dataset `UnavailableDataset`.
* The flowchart will not show the layers information for the datasets.
+* Experiment Tracking will not work if the prerequisite of kedro-datasets version 2.1.0 or above is not met.
## Automatic visualisation updates
@@ -325,4 +326,4 @@ Press `Cmd` + `Shift` + `P` (on macOS) or `Ctrl` + `Shift` + `P` (on Windows/Lin
Type `kedro: Run Kedro Viz` and select the command.
This will launch Kedro-Viz and display your pipeline visually within the extension.
-![Kedro Viz in VSCode](./images/viz-in-vscode.gif)
+![Kedro Viz in VSCode](./images/viz-in-vscode.gif)
\ No newline at end of file
diff --git a/package/README.md b/package/README.md
index f2d5ab880d..781fff39c6 100644
--- a/package/README.md
+++ b/package/README.md
@@ -205,6 +205,33 @@ Options:
-h, --help Show this message and exit.
```
+### Experiment Tracking usage
+
+To enable [experiment tracking](https://docs.kedro.org/en/stable/experiment_tracking/index.html) in Kedro-Viz, you need to add the Kedro-Viz `SQLiteStore` to your Kedro project.
+
+This can be done by adding the below code to `settings.py` in the `src` folder of your Kedro project.
+
+```python
+from pathlib import Path
+
+from kedro_viz.integrations.kedro.sqlite_store import SQLiteStore
+
+SESSION_STORE_CLASS = SQLiteStore
+SESSION_STORE_ARGS = {"path": str(Path(__file__).parents[2] / "data")}
+```
+
+Once the above set-up is complete, tracking datasets can be used to track relevant data for Kedro runs. More information on how to use tracking datasets can be found in the [experiment tracking documentation](https://docs.kedro.org/en/stable/experiment_tracking/index.html).
+
+**Notes:**
+
+- Experiment Tracking is only available for Kedro-Viz >= 4.0.2 and Kedro >= 0.17.5
+- Prior to Kedro 0.17.6, when using tracking datasets you had to explicitly mark the datasets as `versioned` for them to show up properly in the Kedro-Viz experiment tracking tab. From Kedro >= 0.17.6, this is done automatically:
+
+```yaml
+train_evaluation.r2_score_linear_regression:
+ type: tracking.MetricsDataset
+ filepath: ${base_location}/09_tracking/linear_score.json
+ versioned: true
+```
+
### Standalone React component usage
To use Kedro-Viz as a standalone React component, you can follow the example below. However, please note that Kedro-Viz does not support server-side rendering (SSR). If you're using Next.js or another SSR framework, you should be aware of this limitation.
@@ -280,4 +307,4 @@ Kedro-Viz is licensed under the [Apache 2.0](https://github.com/kedro-org/kedro-
## Citation
-If you're an academic, Kedro-Viz can also help you, for example, as a tool to visualise how your publication's pipeline is structured. Find our citation reference on [Zenodo](https://doi.org/10.5281/zenodo.4277218).
+If you're an academic, Kedro-Viz can also help you, for example, as a tool to visualise how your publication's pipeline is structured. Find our citation reference on [Zenodo](https://doi.org/10.5281/zenodo.4277218).
\ No newline at end of file
From ccb3e265a31a1821416d84559b351435390a594b Mon Sep 17 00:00:00 2001
From: Huong Nguyen
Date: Wed, 4 Dec 2024 11:55:39 +0000
Subject: [PATCH 15/16] revert contributing.md
Signed-off-by: Huong Nguyen
---
CONTRIBUTING.md | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 4d5f1b2ea9..8b8fda0c15 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -210,6 +210,14 @@ make run PROJECT_PATH=/new-kedro-project
> **Note**: Once the backend development server is launched at port 4142, the local app will always pull data from that server. To prevent this, you can comment out the proxy setting in `package.json` and restart the dev server at port 4141.
+#### Launch the development server with the `SQLiteSessionStore`
+
+Kedro-Viz provides a `SQLiteSessionStore` that users can use in their project to enable experiment tracking functionality. If you want to use this session store with the development server, make sure you don't use a relative path when specifying the store's location in `settings.py`. For example, `demo-project` specifies the local `data` directory within a project as the session store's location as follows:
+
+```python
+from pathlib import Path
+
+from kedro_viz.integrations.kedro.sqlite_store import SQLiteStore
+
+SESSION_STORE_CLASS = SQLiteStore
+SESSION_STORE_ARGS = {"path": str(Path(__file__).parents[2] / "data")}
+```
Owing to this coupling between the project settings and Kedro-Viz, if you wish to execute any Kedro commands on `demo-project` (including `kedro run`), you will need to install the Kedro-Viz Python package. To install your local development version of the package, run:
From 96488b31d80710163ee011cecd341f873939610e Mon Sep 17 00:00:00 2001
From: Huong Nguyen
Date: Wed, 4 Dec 2024 12:51:13 +0000
Subject: [PATCH 16/16] remove e2e test failed from ET
Signed-off-by: Huong Nguyen
---
cypress/tests/ui/flowchart/flowchart.cy.js | 18 ------------------
1 file changed, 18 deletions(-)
diff --git a/cypress/tests/ui/flowchart/flowchart.cy.js b/cypress/tests/ui/flowchart/flowchart.cy.js
index f3f241dcb3..805a042f6c 100644
--- a/cypress/tests/ui/flowchart/flowchart.cy.js
+++ b/cypress/tests/ui/flowchart/flowchart.cy.js
@@ -173,24 +173,6 @@ describe('Flowchart DAG', () => {
).should('exist');
});
- it('verifies that users can navigate to experiment tracking by clicking on Open in Experiment Tracking button on the metadata panel. #TC-32', () => {
- const nodeToClickText = 'R2 Score';
-
- // Assert before action
- cy.location('pathname').should('not.eq', '/experiment-tracking');
- cy.location('search').should('not.eq', '?view=Metrics');
-
- // Action
- cy.contains('text', nodeToClickText).click({ force: true });
- cy.get('.pipeline-metadata__link').click();
-
- // Assert after action
- cy.location('pathname').should('eq', '/experiment-tracking');
- cy.location('search').should('eq', '?view=Metrics');
- cy.get('.details-mainframe').should('exist');
- cy.get('.details__tabs').should('exist');
- });
-
it('verifies that users see an error message when there are no nodes/pipelines. #TC-33', () => {
// Intercept the network request to mock with a fixture
cy.__interceptRest__('/api/main', 'GET', '/mock/emptyDataset.json');