Skip to content

Commit

Permalink
Flask caching per dataset (#152)
Browse files Browse the repository at this point in the history
  • Loading branch information
lingyielia authored Nov 9, 2023
1 parent d6c2215 commit 5ea1ca8
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 32 deletions.
32 changes: 14 additions & 18 deletions vizro-core/examples/default/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,24 @@ def retrieve_gapminder():
return px.data.gapminder()


# Demo data for the example dashboard.
# NOTE(review): presumably kept as context for static figures — confirm usage.
df_gapminder = px.data.gapminder()
df_gapminder2 = px.data.gapminder()


# Options for configuring per-dataset arguments:
# NOTE(review): removed the stale duplicate registrations of "gapminder" and
# "gapminder2" that preceded this section — registering the same dataset name
# twice raises ValueError("Dataset ... already exists.") in DataManager.__setitem__.
data_manager["gapminder"] = retrieve_gapminder
print("to update per dataset cache config")
# Cache entries for this dataset expire after 600 seconds.
data_manager["gapminder"]._cache_arguments = {"timeout": 600}
print(f"_cache_arguments: {data_manager['gapminder']._cache_arguments}")

data_manager["gapminder2"] = retrieve_gapminder
print("to update per dataset cache config")
data_manager["gapminder2"]._cache_arguments = {
    # timeout=0 -> cache entry never expires (flask-caching convention).
    "timeout": 0,
    # "unless": (lambda: True)
}
print(f"_cache_arguments: {data_manager['gapminder2']._cache_arguments}")

@capture("action")
def delete_memoized_cache(delete_button_id_n_clicks):
"""Delete one memoized cache."""
Expand Down Expand Up @@ -147,18 +158,3 @@ def empty_cache(empty_button_id_n_clicks):
# processes=3,
# dev_tools_hot_reload=False
# )

# Options for configuring per-dataset arguments (design sketches — NOT live code):
# NOTE(review): these were alternative API ideas left as executable statements.
# Running them would fail at import: "iris" would be registered twice
# (ValueError in DataManager.__setitem__), `set_cache` does not exist on
# `_Dataset`, and the stub `VizroDataSet` accepts no constructor arguments.
# Kept here as comments for design discussion only.
#
# 1. Register a lazy callable, then configure its cache via a setter:
# data_manager["iris"] = lambda: pd.DataFrame()
# data_manager["iris"].set_cache(timeout=50)
#
# 2. Wrap the callable and cache config in a dedicated dataset object:
# class VizroDataSet:
#     pass
#
# data_manager["iris"] = VizroDataSet(lambda: pd.DataFrame(), timeout=50)
#
# 3.
64 changes: 50 additions & 14 deletions vizro-core/src/vizro/managers/_data_manager.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""The data manager handles access to all DataFrames used in a Vizro app."""
import logging
import time
from typing import Callable, Dict, Union
from typing import Callable, Dict, Optional, Union

import pandas as pd
from flask_caching import Cache
Expand All @@ -16,6 +16,32 @@
pd_LazyDataFrame = Callable[[], pd.DataFrame]


class _Dataset:
"""A dataset with flask-caching config attached.
Examples:
>>> import plotly.express as px
>>> data_manager["iris"] = pd.DataFrame()
>>> data_manager["iris"]._cache_arguments = {"timeout": 600}
"""
_cache_arguments_default: Dict[str, int] = {}

def __init__(self, data: pd_LazyDataFrame, timeout: Optional[int] = None, unless: Optional[Callable] = None):
"""Initializes a `_Dataset` object.
Args:
data (pd_LazyDataFrame): A callable that returns a pandas DataFrame.
timeout (Optional[int], optional): The timeout in seconds for the cache. Defaults to None.
unless (Optional[Callable], optional): A callable that returns True if the cache should not be used.
Defaults to None.
"""
self.data = data
self._cache_arguments = dict(self._cache_arguments_default)
self._cache_arguments["timeout"] = timeout
self._cache_arguments["unless"] = unless


class DataManager:
"""Object to handle all data for the `vizro` application.
Expand All @@ -25,13 +51,13 @@ class DataManager:
"""

_cache = Cache(config={"CACHE_TYPE": "SimpleCache"})

def __init__(self):
    # NOTE(review): the next assignment is stale diff residue from before
    # `_Dataset` existed; it is immediately overwritten by the
    # `_Dataset`-typed assignment two lines below and should be deleted.
    self.__lazy_data: Dict[DatasetName, pd_LazyDataFrame] = {}
    """Initializes the `DataManager` object."""
    # Registry of lazily-loaded datasets, each wrapped in a `_Dataset`
    # that carries its per-dataset flask-caching arguments.
    self.__lazy_data: Dict[DatasetName, _Dataset] = {}
    # Eagerly-supplied DataFrames; these have no cache configuration.
    self.__original_data: Dict[DatasetName, pd.DataFrame] = {}
    # Maps each component id to the name of the dataset it reads from.
    self.__component_to_original: Dict[ComponentID, DatasetName] = {}
    # Presumably guards against mutation after dashboard build via
    # `_state_modifier` — TODO confirm against the decorator's definition.
    self._frozen_state = False
    # Per-instance cache (moved from class level in this commit so each
    # DataManager gets its own flask-caching Cache).
    self._cache = Cache(config={"CACHE_TYPE": "SimpleCache"})

# happens before dashboard build
@_state_modifier
Expand All @@ -45,14 +71,24 @@ def __setitem__(self, dataset_name: DatasetName, data: Union[pd.DataFrame, pd_La
raise ValueError(f"Dataset {dataset_name} already exists.")

if callable(data):
self.__lazy_data[dataset_name] = data
self.__lazy_data[dataset_name] = _Dataset(data)
elif isinstance(data, pd.DataFrame):
self.__original_data[dataset_name] = data
else:
raise TypeError(
f"Dataset {dataset_name} must be a pandas DataFrame or callable that returns pandas DataFrame."
)

def __getitem__(self, dataset_name: DatasetName) -> _Dataset:
    """Returns the `_Dataset` object associated with `dataset_name`.

    Args:
        dataset_name: Name under which the dataset was registered.

    Returns:
        The `_Dataset` wrapper holding the loader and its cache arguments.

    Raises:
        KeyError: If `dataset_name` was never registered.
        ValueError: If the dataset was supplied as an eager DataFrame
            (no cache configuration is attached to eager data).
    """
    # Guard clauses: each membership is tested exactly once, and every
    # path explicitly returns or raises (no implicit None fall-through).
    if dataset_name in self.__lazy_data:
        return self.__lazy_data[dataset_name]
    if dataset_name in self.__original_data:
        # no cache available
        raise ValueError(f"Dataset {dataset_name} is not lazy.")
    raise KeyError(f"Dataset {dataset_name} does not exist.")

# happens before dashboard build
@_state_modifier
def _add_component(self, component_id: ComponentID, dataset_name: DatasetName):
Expand All @@ -69,19 +105,19 @@ def _add_component(self, component_id: ComponentID, dataset_name: DatasetName):
self.__component_to_original[component_id] = dataset_name

def _load_lazy_data(self, dataset_name: DatasetName) -> pd.DataFrame:
    """Loads the data for `dataset_name` and returns it.

    This function is memoized using flask-caching. If the data is already
    loaded, it will return the cached data.

    NOTE(review): this region contained interleaved old/new diff lines
    (a stale `_cache_dataset_arguments` dict and a duplicate decorator/inner
    body) that made it a syntax error; reconstructed the new implementation.

    Args:
        dataset_name: Name of a lazily-registered dataset.

    Returns:
        The pandas DataFrame produced by the dataset's loader callable.
    """
    # timeout (including 0 -> never expires), unless -> always executes
    # function (like timeout=0.000001) and doesn't update cache.
    # Both are taken per-dataset from the _Dataset's _cache_arguments.
    @self._cache.memoize(**self[dataset_name]._cache_arguments)
    def inner(dataset):
        logger.debug("Loading lazy data: %s", dataset)
        # Artificial delay so cache hits are visibly faster in the demo.
        # NOTE(review): presumably example-only instrumentation — confirm
        # before shipping; remove for production use.
        time.sleep(2.0)
        return self[dataset].data()

    return inner(dataset_name)

def _get_component_data(self, component_id: ComponentID) -> pd.DataFrame:
"""Returns the original data for `component_id`."""
Expand Down

0 comments on commit 5ea1ca8

Please sign in to comment.