Updated docstrings
Signed-off-by: Elena Khaustova <[email protected]>
ElenaKhaustova committed Oct 22, 2024
1 parent 33e314a commit 44d0207
Showing 1 changed file with 157 additions and 9 deletions.
166 changes: 157 additions & 9 deletions kedro/io/kedro_data_catalog.py
@@ -64,10 +64,12 @@ def __init__(
Example:
::
>>> # settings.py
>>> from kedro.io import KedroDataCatalog
>>> from kedro_datasets.pandas import CSVDataset
>>>
>>> DATA_CATALOG_CLASS = KedroDataCatalog
>>> cars = CSVDataset(filepath="cars.csv",
>>> load_args=None,
>>> save_args={"index": False})
>>> catalog = KedroDataCatalog(datasets={'cars': cars})
"""
self._config_resolver = config_resolver or CatalogConfigResolver()
self._datasets = datasets or {}
@@ -102,34 +104,83 @@ def __repr__(self) -> str:
return repr(self._datasets)

def __contains__(self, dataset_name: str) -> bool:
"""Check if an item is in the catalog as a materialised dataset or pattern"""
"""Check if an item is in the catalog as a materialised dataset or pattern."""
return (
dataset_name in self._datasets
or self._config_resolver.match_pattern(dataset_name) is not None
)

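An illustrative sketch of the membership check, assuming an in-memory dataset and no dataset factory patterns registered:

from kedro.io import KedroDataCatalog, MemoryDataset

catalog = KedroDataCatalog(datasets={"cars": MemoryDataset(data=[1, 2, 3])})
assert "cars" in catalog        # materialised dataset
assert "planes" not in catalog  # no dataset and no matching pattern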
def __eq__(self, other) -> bool: # type: ignore[no-untyped-def]
"""Compares two catalogs based on materialised datasets' and datasets' patterns."""
return (self._datasets, self._config_resolver.list_patterns()) == (
other._datasets,
other.config_resolver.list_patterns(),
)

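A hedged sketch of the equality check; it assumes dataset instances compare by identity, so the same instance is shared across both catalogs here:

from kedro.io import KedroDataCatalog, MemoryDataset

cars = MemoryDataset(data=[1, 2, 3])
assert KedroDataCatalog(datasets={"cars": cars}) == KedroDataCatalog(datasets={"cars": cars})
assert KedroDataCatalog(datasets={"cars": cars}) != KedroDataCatalog()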
def keys(self) -> List[str]: # noqa: UP006
"""List all dataset names registered in the catalog."""
return list(self.__iter__())

def values(self) -> List[AbstractDataset]: # noqa: UP006
"""List all datasets registered in the catalog."""
return [self._datasets[key] for key in self]

def items(self) -> List[tuple[str, AbstractDataset]]: # noqa: UP006
"""List all dataset names and datasets registered in the catalog."""
return [(key, self._datasets[key]) for key in self]

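An illustrative sketch of the dict-like views; raw data assigned through ``__setitem__`` is wrapped in a ``MemoryDataset``:

from kedro.io import KedroDataCatalog

catalog = KedroDataCatalog()
catalog["cars"] = [1, 2, 3]
catalog["boats"] = [4, 5, 6]
assert catalog.keys() == ["cars", "boats"]
for name, dataset in catalog.items():
    print(name, type(dataset).__name__)  # prints e.g. "cars MemoryDataset"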
def __iter__(self) -> Iterator[str]:
yield from self._datasets.keys()

def __getitem__(self, ds_name: str) -> AbstractDataset:
"""Get a dataset by name from an internal collection of datasets.
If a dataset is not in the collection but matches any pattern,
it is instantiated and added to the collection first, then returned.
Args:
ds_name: A dataset name.
Returns:
An instance of AbstractDataset.
Raises:
DatasetNotFoundError: When a dataset with the given name
is not in the collection and does not match any patterns.
"""
return self.get_dataset(ds_name)

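An illustrative sketch of indexing into the catalog, assuming ``DatasetNotFoundError`` is importable from ``kedro.io``:

from kedro.io import DatasetNotFoundError, KedroDataCatalog, MemoryDataset

catalog = KedroDataCatalog(datasets={"cars": MemoryDataset(data=[1, 2, 3])})
cars_dataset = catalog["cars"]  # returns the registered MemoryDataset instance
try:
    catalog["planes"]  # neither registered nor matching a pattern
except DatasetNotFoundError:
    pass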
def __setitem__(self, key: str, value: Any) -> None:
Add a dataset to the ``KedroDataCatalog`` using the key as the dataset name and the value as the data.
The value can either be raw data or a Kedro dataset, that is, an instance of a class that inherits from ``AbstractDataset``.
If raw data is provided, it will be automatically wrapped in a ``MemoryDataset`` before being added to the ``KedroDataCatalog``.
Args:
key: A dataset name.
value: Raw data or an instance of a class that inherits from ``AbstractDataset``.
Example:
::
>>> from kedro_datasets.pandas import CSVDataset
>>> import pandas as pd
>>>
>>> df = pd.DataFrame({"col1": [1, 2],
>>> "col2": [4, 5],
>>> "col3": [5, 6]})
>>>
>>> catalog = KedroDataCatalog()
>>> catalog["data_df"] = df
>>>
>>> assert catalog.load("data_df").equals(df)
>>>
>>> csv_dataset = CSVDataset(filepath="test.csv")
>>> csv_dataset.save(df)
>>> catalog["data_csv_dataset"] = csv_dataset
>>>
>>> assert catalog.load("data_csv_dataset").equals(df)
"""
if key in self._datasets:
self._logger.warning("Replacing dataset '%s'", key)
if isinstance(value, AbstractDataset):
@@ -144,7 +195,19 @@ def __len__(self) -> int:
def get(
self, key: str, default: AbstractDataset | None = None
) -> AbstractDataset | None:
"""Get a dataset by name from an internal collection of datasets."""
"""Get a dataset by name from an internal collection of datasets.
If a dataset is not in the collection but matches any pattern,
it is instantiated and added to the collection first, then returned.
Args:
key: A dataset name.
default: Optional default dataset to return if the
requested dataset is not in the catalog.
Returns:
An instance of AbstractDataset.
"""
if key not in self._datasets:
ds_config = self._config_resolver.resolve_pattern(key)
if ds_config:
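A hedged sketch of ``get`` with a fallback, following the documented contract that the default is returned when the requested dataset is not in the catalog:

from kedro.io import KedroDataCatalog, MemoryDataset

catalog = KedroDataCatalog(datasets={"cars": MemoryDataset(data=[1, 2, 3])})
fallback = MemoryDataset(data=[])
assert catalog.get("cars") is not None
assert catalog.get("planes") is None
assert catalog.get("planes", default=fallback) is fallback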
@@ -172,6 +235,69 @@ def from_config(
"""Create a ``KedroDataCatalog`` instance from configuration. This is a
factory method used to provide developers with a way to instantiate
``KedroDataCatalog`` with configuration parsed from configuration files.
Args:
catalog: A dictionary whose keys are the dataset names and
the values are dictionaries with the constructor arguments
for classes implementing ``AbstractDataset``. The dataset
class to be loaded is specified with the key ``type`` and its
fully qualified class name. All ``kedro.io`` datasets can be
specified by their class name only, i.e. their module name
can be omitted.
credentials: A dictionary containing credentials for different
datasets. Use the ``credentials`` key in an ``AbstractDataset``
to refer to the appropriate credentials as shown in the example
below.
load_versions: A mapping between dataset names and versions
to load. Has no effect on datasets without enabled versioning.
save_version: Version string to be used for ``save`` operations
by all datasets with enabled versioning. It must: a) be a
case-insensitive string that conforms with operating system
filename limitations, b) always return the latest version when
sorted in lexicographical order.
Returns:
An instantiated ``KedroDataCatalog`` containing all specified
datasets, created and ready to use.
Raises:
DatasetNotFoundError: When `load_versions` refers to a dataset that doesn't
exist in the catalog.
Example:
::
>>> config = {
>>> "cars": {
>>> "type": "pandas.CSVDataset",
>>> "filepath": "cars.csv",
>>> "save_args": {
>>> "index": False
>>> }
>>> },
>>> "boats": {
>>> "type": "pandas.CSVDataset",
>>> "filepath": "s3://aws-bucket-name/boats.csv",
>>> "credentials": "boats_credentials",
>>> "save_args": {
>>> "index": False
>>> }
>>> }
>>> }
>>>
>>> credentials = {
>>> "boats_credentials": {
>>> "client_kwargs": {
>>> "aws_access_key_id": "<your key id>",
>>> "aws_secret_access_key": "<your secret>"
>>> }
>>> }
>>> }
>>>
>>> catalog = KedroDataCatalog.from_config(config, credentials)
>>>
>>> df = catalog.load("cars")
>>> catalog.save("boats", df)
"""
catalog = catalog or {}
config_resolver = CatalogConfigResolver(catalog, credentials)
@@ -284,10 +410,32 @@ def list(
self, regex_search: str | None = None, regex_flags: int | re.RegexFlag = 0
) -> List[str]: # noqa: UP006
# TODO: rename depending on the solution for https://github.com/kedro-org/kedro/issues/3917
"""
List of all dataset names registered in the catalog.
This can be filtered by providing an optional regular expression
which will only return matching keys.
# TODO: make regex_search mandatory argument as we have catalog.keys() for listing all the datasets.
"""List of all dataset names registered in the catalog.
This can be filtered by providing an optional regular expression which will only return matching keys.
Args:
regex_search: An optional regular expression which can be provided
to limit the datasets returned by a particular pattern.
regex_flags: An optional combination of regex flags.
Returns:
A list of dataset names available which match the `regex_search` criteria (if provided).
All dataset names are returned by default.
Raises:
SyntaxError: When an invalid regex filter is provided.
Example:
::
>>> catalog = KedroDataCatalog()
>>> # get datasets where the substring 'raw' is present
>>> raw_data = catalog.list(regex_search='raw')
>>> # get datasets which start with 'prm' or 'feat'
>>> feat_eng_data = catalog.list(regex_search='^(prm|feat)')
>>> # get datasets which end with 'time_series'
>>> models = catalog.list(regex_search='.+time_series$')
"""
if regex_search is None:
return self.keys()
