From 2e19cd4bd51dd0c470d2aa1dbf4fa0b0ff1d6768 Mon Sep 17 00:00:00 2001 From: Jaap <33715902+Jaapel@users.noreply.github.com> Date: Tue, 3 Sep 2024 14:09:23 +0200 Subject: [PATCH 01/10] update prepare data catalog docs --- docs/assets/example_catalog.yml | 68 +++++ docs/conf.py | 1 + .../guides/advanced_user/data_prepare_cat.rst | 255 ++++++++---------- docs/parse_predefined_catalogs.py | 2 +- pixi.lock | 27 +- pixi.toml | 6 +- 6 files changed, 209 insertions(+), 150 deletions(-) create mode 100644 docs/assets/example_catalog.yml diff --git a/docs/assets/example_catalog.yml b/docs/assets/example_catalog.yml new file mode 100644 index 000000000..80075693a --- /dev/null +++ b/docs/assets/example_catalog.yml @@ -0,0 +1,68 @@ + meta: + roots: + - /linux/path/to/data_root/ + - C:\Windows\path\to\data_root + version: version + name: data_catalog_name + + era5: + data_type: RasterDataset + variants: + - provider: netcdf + uri: meteo/era5_daily/nc_merged/era5_{year}_daily.nc + driver: + name: raster_xarray + options: + chunks: + latitude: 250 + longitude: 240 + time: 30 + combine: by_coords + decode_times: true + parallel: true + - provider: zarr + uri: meteo/era5_daily.zarr + driver: + name: raster_xarray + options: + chunks: auto + metadata: + category: meteo + notes: Extracted from Copernicus Climate Data Store; resampled by Deltares to + daily frequency + paper_doi: 10.1002/qj.3803 + paper_ref: Hersbach et al. (2019) + url: https://doi.org/10.24381/cds.bd0915c6 + version: ERA5 daily data on pressure levels + license: https://cds.climate.copernicus.eu/cdsapp/#!/terms/licence-to-use-copernicus-products + crs: 4326 + temporal_extent: + start: '1950-01-02' + end: '2023-11-30' + spatial_extent: + West: -0.125 + South: -90.125 + East: 359.875 + North: 90.125 + data_adapter: + unit_add: + temp: -273.15 + temp_dew: -273.15 + temp_max: -273.15 + temp_min: -273.15 + unit_mult: + kin: 0.000277778 + kout: 0.000277778 + ssr: 0.000277778 + press_msl: 0.01 + rename: + d2m: temp_dew + msl: press_msl + ssrd: kin + t2m: temp + tisr: kout + tmax: temp_max + tmin: temp_min + tp: precip + u10: wind10_u + v10: wind10_v diff --git a/docs/conf.py b/docs/conf.py index 63834367e..871bacb02 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -158,6 +158,7 @@ def clean_str(s): extensions = [ "sphinx_design", "sphinx.ext.autodoc", + "sphinx.ext.doctest", "sphinx.ext.viewcode", "sphinx.ext.todo", "sphinx.ext.napoleon", diff --git a/docs/guides/advanced_user/data_prepare_cat.rst b/docs/guides/advanced_user/data_prepare_cat.rst index 200fc6cf1..c936f0315 100644 --- a/docs/guides/advanced_user/data_prepare_cat.rst +++ b/docs/guides/advanced_user/data_prepare_cat.rst @@ -5,14 +5,16 @@ Preparing a Data Catalog **Steps in brief:** -1) Have your (local) dataset ready in one of the supported :ref:`raster `, - :ref:`vector ` or :ref:`geospatial time-series ` -2) Create your own :ref:`yaml file ` with a reference to your prepared dataset following - the HydroMT :ref:`data conventions `, see examples below. +1) Have your (local) dataset ready in one of the supported :ref:`raster + `, :ref:`vector ` or :ref:`geospatial time-series + ` +2) Create your own :ref:`yaml file ` with a reference to your prepared + dataset following the HydroMT :ref:`data conventions `, see examples + below. -A detailed description of the yaml file is given below. -For more information see :py:meth:`~hydromt.data_catalog.DataCatalog.from_yml` -and examples per :ref:`data type ` +A detailed description of the yaml file is given below. 
For more information see
:py:meth:`~hydromt.data_catalog.DataCatalog.from_yml` and examples per :ref:`data type
`

.. _data_yaml:

Data catalog yaml file
----------------------

Each data source is added to a data catalog yaml file with a user-defined name.

A blueprint for a dataset called **my_dataset** is shown below.
The ``path``, ``data_type`` and ``driver`` options are required and the ``meta`` option with the shown keys is highly recommended.
The ``rename``, ``nodata``, ``unit_add`` and ``unit_mult`` options are set per variable (or attribute table column in case of a GeoDataFrame).
``driver_kwargs`` contain any options passed to different drivers.

A blueprint for a dataset called **my_dataset** is shown below. The ``uri``,
``data_type`` and ``driver`` options are required and the ``metadata`` option with the
shown keys is highly recommended. The ``rename``, ``nodata``, ``unit_add`` and
``unit_mult`` options are set per variable (or attribute table column in case of a
GeoDataFrame).

.. include:: ../../assets/example_catalog.yml
   :code: yaml

.. testsetup:: *

    from hydromt import DataCatalog

.. testcode:: read_catalog
    :hide:

    catalog = DataCatalog(fallback_lib=None)  # do not read default catalog
    catalog.from_yml("docs/assets/example_catalog.yml")

The yaml file has an *optional* global **metadata** section:

- **roots** (optional): root folders for all the data sources in the yaml file. If not
  provided, the folder where the yaml file is located will be used as root. This is
  used in combination with each data source **uri** argument to avoid repetition. The
  roots listed will be checked in the order they are provided. The first one found to
  exist will be used as the actual root. This should be used for cross-platform and
  cross-machine compatibility only, as can be seen above. Note that in the end only one
  of the roots will be used, so all data should still be located in the same folder
  tree.
- **version** (recommended): data catalog version
- **hydromt_version** (recommended): range of hydromt versions that can read this
  catalog. The format should be according to `PEP 440
  `_.
- **category** (optional): used if all data sources in the catalog belong to the same
  category. Usual categories within HydroMT are *geography*, *topography*,
  *hydrography*, *meteo*, *landuse*, *ocean*, *socio-economic*, *observed data* but the
  user is free to define their own categories.

The following are **required data source arguments**:

- **data_type**: type of input data. Either *RasterDataset*, *GeoDataset*, *Dataset*,
  *GeoDataFrame* or *DataFrame*.
- **driver**: data_type specific :class:`Driver` to read a dataset. If the default
  settings of a driver are sufficient, then a string with the name of the driver is
  enough. Otherwise, a dictionary with the driver class properties can be used. Refer to
  the :class:`Driver` documentation to see which options are available.
- **uri**: URI pointing to where the data can be queried. Relative paths are combined
  with the global ``root`` option of the yaml file (if available) or the directory of
  the yaml file itself. To read multiple files in a single dataset (if supported by the
  driver) a string glob in the form of ``"path/to/my/files/*.nc"`` can be used. 
The filenames can be further specified with ``{variable}``, ``{year}`` and ``{month}``
  keys to limit which files are being read based on the get_data request in the form of
  ``"path/to/my/files/{variable}_{year}_{month}.nc"``. Note that ``month`` is by default
  *not* zero-padded (e.g. January 2012 is stored as
  ``"path/to/my/files/{variable}_2012_1.nc"``). Users can optionally add a formatting
  string to define how the key should be read. For example, in a path written as
  ``"path/to/my/files/{variable}_{year}_{month:02d}.nc"``, the month always has two
  digits and is zero-padded for Jan-Sep (e.g. January 2012 is stored as
  ``"path/to/my/files/{variable}_2012_01.nc"``).

A full list of **optional data source arguments** is given below.

- **driver_kwargs**: pairs of key-value arguments to pass to the driver specific open data method
  (e.g. xr.open_mfdataset for netcdf raster, see the full list below).
  *NOTE*: New with HydroMT v0.7.2 (was called *kwargs* before)
- **filesystem** (optional): specify on what filesystem the data is stored. This is used to select the correct protocol to
  access different filesystems (e.g. local, gcs, s3, http). If not provided the filesystem is inferred from the path.
  See `fsspec `_ for more available protocols.
- **storage_options** (optional): additional arguments to pass to the filesystem protocol, these are protocol specific.
  *NOTE*: New in HydroMT v0.8.1
- **version** (recommended): data source version
  *NOTE*: New in HydroMT v0.8.1
- **provider** (recommended): data source provider
  *NOTE*: New in HydroMT v0.8.1
- **meta** (recommended): additional information on the dataset organized in a sub-list.
  Good metadata includes a *source_url*, *source_license*, *source_version*, *paper_ref*, *paper_doi*, *category*, etc. These are added to the data attributes.
  Usual categories within HydroMT are *geography*, *topography*, *hydrography*, *meteo*, *landuse*, *ocean*, *socio-economic*, *observed data*
  but the user is free to define their own categories.
- **nodata** (required if missing in the data): nodata value of the input data. For Raster- and GeoDatasets this is only used if not inferred from the original input data.
  For GeoDataFrames provided nodata values are converted to nan values.
- **rename**: pairs of variable names in the input data (*old_variable_name*) and the corresponding
  :ref:`HydroMT variable naming conventions ` and :ref:`recognized dimension names ` (*new_variable_name*).
- **unit_add**: add or subtract a value to the input data for unit conversion (e.g. -273.15 for conversion of temperature from Kelvin to Celsius).
- **unit_mult**: multiply the input data by a value for unit conversion (e.g. 1000 for conversion from m to mm of precipitation).
- **attrs** (optional): This argument allows for setting attributes like the unit or long name of variables.
  *NOTE*: New in HydroMT v0.7.2
- **placeholder** (optional): this argument can be used to generate multiple sources with a single entry in the data catalog file. If different files follow a logical
  nomenclature, multiple data sources can be defined by iterating through all possible combinations of the placeholders. The placeholder names should be given in the
  source name and the path, and their values listed under the placeholder argument, as shown in the sketch after this list.
- **variants** (optional): This argument can be used to generate multiple sources with the same name, but from different providers or versions.
  Any keys here are essentially used to extend/overwrite the base arguments.
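
For example, the **placeholder** argument could expand a single catalog entry into one
source per year. The entry below is a minimal, illustrative sketch: the source name,
file paths and years are hypothetical and should be replaced by your own.

.. code-block:: yaml

    ghs_pop_{year}:
      uri: socio_economic/ghs_pop/GHS_POP_{year}.tif  # hypothetical path
      data_type: RasterDataset
      driver:
        name: raster
      placeholders:
        year: [2015, 2020]

This single entry defines two sources, ``ghs_pop_2015`` and ``ghs_pop_2020``, each
reading its own file.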

The following are **optional data source arguments** for *RasterDataset*, *GeoDataFrame*, and *GeoDataset*:

- **crs** (required if missing in the data): EPSG code or WKT string of the reference coordinate system of the data.
  Only used if the crs cannot be inferred from the input data.

The following are **optional data source arguments** for *RasterDataset*:

- **zoom_level** (optional): this argument can be used for *RasterDatasets* that contain multiple zoom levels of different resolution.
  It should contain a list of numeric zoom levels that correspond to the `zoom_level` key in the file path, e.g., ``"path/to/my/files/{zoom_level}/data.tif"``,
  and the corresponding resolution, expressed in the unit of the data crs.
  The *crs* argument is therefore required when using zoom_levels to correctly interpret the unit of the resolution.
  The required zoom level can be requested from HydroMT as argument to the `DataCatalog.get_rasterdataset` method,
  see `Reading tiled raster data with different zoom levels <../_examples/working_with_tiled_raster_data.ipynb>`_.

.. note::

   The **alias** argument will be deprecated and should no longer be used, see
   `github issue for more information `_

.. warning::

   Using cloud data is still experimental and only supported for *DataFrame*, *RasterDataset* and
   *GeoDataset* with *zarr*. *RasterDataset* with the *raster* driver is also possible,
   but in case of multiple files (mosaic) we strongly recommend using a vrt file for speed and computation efficiency.

- **metadata** (recommended): additional information on the dataset. In
  :class:`SourceMetadata` there are many different metadata options available. Some
  metadata properties, like the `crs`, `nodata` or `temporal_extent` and
  `spatial_extent`, can help HydroMT read the data more efficiently. Good metadata
  includes a *source_url*, *source_license*, *source_version*, *paper_ref*, *paper_doi*,
  *category*, etc. These are added to the data attributes. Usual categories within
  HydroMT are *geography*, *topography*, *hydrography*, *meteo*, *landuse*, *ocean*,
  *socio-economic*, *observed data* but the user is free to define their own categories.
- **data_adapter**: the data adapter harmonizes the data, so that within HydroMT there
  are strong conventions on, for example, :ref:`HydroMT variable naming conventions
  ` and :ref:`recognized dimension names `. There are multiple
  different parameters available for each :class:`DataAdapter`.
- **placeholder** (optional): this argument can be used to generate multiple sources
  with a single entry in the data catalog file. If different files follow a logical
  nomenclature, multiple data sources can be defined by iterating through all possible
  combinations of the placeholders. The placeholder names should be given in the source
  name and the path, and their values listed under the placeholder argument.
- **variants** (optional): This argument can be used to generate multiple sources with
  the same name, but from different providers or versions. Any keys here are essentially
  used to extend/overwrite the base arguments.

Data variants
-------------

Data variants are used to define multiple data sources with the same name, but from different providers or versions.
Below, we show an example of a data catalog for a RasterDataset with multiple variants of the same data source (esa_worldcover),
but this works identically for other data types.
Here, the *crs*, *data_type*, *driver* and *filesystem* are common arguments used for all variants.
The variant arguments are used to extend and/or overwrite the common arguments, creating new sources.

Data variants are used to define multiple data sources with the same name, but from
different providers or versions. Below, we show an example of a data catalog for a
RasterDataset with multiple variants of the same data source (esa_worldcover), but this
works identically for other data types. Here, the *metadata*, *data_type* and *driver*
are common arguments used for all variants. The variant arguments are used
to extend and/or overwrite the common arguments, creating new sources.

.. code-block:: yaml

    esa_worldcover:
      metadata:
        crs: 4326
      data_type: RasterDataset
      driver:
        name: raster
        filesystem: local
      variants:
        - provider: local
          version: 2021
          uri: landuse/esa_worldcover_2021/esa-worldcover.vrt
        - provider: local
          version: 2020
          uri: landuse/esa_worldcover/esa-worldcover.vrt
        - provider: aws
          version: 2020
          uri: s3://esa-worldcover/v100/2020/ESA_WorldCover_10m_2020_v100_Map_AWS.vrt
          driver:
            name: raster
            filesystem: s3

To request a specific variant, the variant arguments can be used as keyword arguments to
the `DataCatalog.get_rasterdataset` method, see code below. By default the newest
version from the last provider is returned when requesting a data source without a
specific version or provider. Requesting a specific version from a HydroMT configuration
file is also possible, see :ref:`model_config`.

.. code-block:: python

    from hydromt import DataCatalog
    dc = DataCatalog().from_yml("data_catalog.yml")
    # get the default version. This will return the latest (2020) version from the last
    # provider (aws)
    ds = dc.get_rasterdataset("esa_worldcover")
    # get a 2020 version. This will return the 2020 version from the last provider (aws)
    ds = dc.get_rasterdataset("esa_worldcover", version=2020)
    # get a 2021 version. This will return the 2021 version from the local provider as
    # this version is not available from aws.
ds = dc.get_rasterdataset("esa_worldcover", version=2021) # get the 2020 version from the local provider ds = dc.get_rasterdataset("esa_worldcover", version=2020, provider="local") diff --git a/docs/parse_predefined_catalogs.py b/docs/parse_predefined_catalogs.py index 323e5e695..897075792 100644 --- a/docs/parse_predefined_catalogs.py +++ b/docs/parse_predefined_catalogs.py @@ -147,7 +147,7 @@ def write_predefined_catalogs_to_rst_panels( except OSError as e: print(e) continue - df = data_cat.to_dataframe().sort_index().drop_duplicates("path") + df = data_cat._to_dataframe().sort_index().drop_duplicates("path") df_dict[version] = df.copy() data_cat._sources = {} # reset path = write_nested_dropdown(name, df_dict) diff --git a/pixi.lock b/pixi.lock index c5fbd1161..6f19c1078 100644 --- a/pixi.lock +++ b/pixi.lock @@ -2347,7 +2347,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/nbconvert-core-7.16.4-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/nbconvert-pandoc-7.16.4-hd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/nbformat-5.10.4-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/nbsphinx-0.9.4-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/nbsphinx-0.9.5-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-h59595ed_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/nest-asyncio-1.6.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/linux-64/netcdf4-1.7.1-nompi_py311h25b3b55_101.conda @@ -2836,7 +2836,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/nbconvert-core-7.16.4-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/nbconvert-pandoc-7.16.4-hd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/nbformat-5.10.4-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/nbsphinx-0.9.4-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/nbsphinx-0.9.5-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/nest-asyncio-1.6.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/win-64/netcdf4-1.7.1-nompi_py311hbdc12eb_101.conda - conda: https://conda.anaconda.org/conda-forge/noarch/networkx-3.3-pyhd8ed1ab_1.conda @@ -20700,6 +20700,29 @@ packages: - pkg:pypi/nbsphinx?source=conda-forge-mapping size: 33630 timestamp: 1715074950890 +- kind: conda + name: nbsphinx + version: 0.9.5 + build: pyhd8ed1ab_0 + subdir: noarch + noarch: python + url: https://conda.anaconda.org/conda-forge/noarch/nbsphinx-0.9.5-pyhd8ed1ab_0.conda + sha256: 0fc92fc4e1eab73ce7808b5055c33f319a8949b4ad272fc69ebb96b2f157d5eb + md5: b808b8a0494c5cca76200c73e260a060 + depends: + - docutils + - jinja2 + - nbconvert + - nbformat + - python >=3.6 + - sphinx + - traitlets + license: MIT + license_family: MIT + purls: + - pkg:pypi/nbsphinx?source=conda-forge-mapping + size: 33725 + timestamp: 1723612159088 - kind: conda name: ncurses version: '6.5' diff --git a/pixi.toml b/pixi.toml index 3b797bd7a..0b2fdb2c8 100644 --- a/pixi.toml +++ b/pixi.toml @@ -91,9 +91,9 @@ docker-publish = { depends_on = [ ] } docker-clean = { cmd = ["docker", "system", "prune", "-f"] } -html = { cmd = ["sphinx-build", "-M", "html", "docs", "docs/_build", "-W"] } -docs = { depends_on = ["html"] } -doc = { depends_on = ["html"] } +docs-build = { cmd = ["sphinx-build", "docs", "docs/_build", "--builder", "doctest", "-W"] } +docs 
= { depends_on = ["docs-build"] } # alias +doc = { depends_on = ["docs-build"] } # alias serve = { cmd = ["python", "-m", "http.server", "-d", "docs/_build/html"] } From 6f825955762519dd6be62d1c3c5a09cd3fa04904 Mon Sep 17 00:00:00 2001 From: Jaap <33715902+Jaapel@users.noreply.github.com> Date: Wed, 4 Sep 2024 15:02:09 +0200 Subject: [PATCH 02/10] pixi toml with working doctest --- pixi.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pixi.toml b/pixi.toml index 0b2fdb2c8..af187ff08 100644 --- a/pixi.toml +++ b/pixi.toml @@ -91,7 +91,8 @@ docker-publish = { depends_on = [ ] } docker-clean = { cmd = ["docker", "system", "prune", "-f"] } -docs-build = { cmd = ["sphinx-build", "docs", "docs/_build", "--builder", "doctest", "-W"] } +doctest = { cmd = ["sphinx-build","-M", "doctest", "docs", "docs/_build", "-W"] } +docs-build = { cmd = ["sphinx-build", "-M", "html", "docs", "docs/_build", "-W"], depends_on = ["doctest"] } docs = { depends_on = ["docs-build"] } # alias doc = { depends_on = ["docs-build"] } # alias serve = { cmd = ["python", "-m", "http.server", "-d", "docs/_build/html"] } From 6a005ec5e8cc8dd72c79c1a4d49bca04f6899531 Mon Sep 17 00:00:00 2001 From: Jaap <33715902+Jaapel@users.noreply.github.com> Date: Thu, 5 Sep 2024 11:11:54 +0200 Subject: [PATCH 03/10] update docs --- docs/api/api.rst | 2 + docs/api/data_adapter.rst | 49 +++ docs/api/data_catalog.rst | 245 -------------- docs/api/data_source.rst | 59 ++++ docs/api/drivers.rst | 140 ++++++++ docs/api/uri_resolvers.rst | 32 ++ docs/guides/advanced_user/data_types.rst | 387 +++++++++++------------ 7 files changed, 472 insertions(+), 442 deletions(-) create mode 100644 docs/api/data_adapter.rst create mode 100644 docs/api/uri_resolvers.rst diff --git a/docs/api/api.rst b/docs/api/api.rst index d241d399e..bdde4ea95 100644 --- a/docs/api/api.rst +++ b/docs/api/api.rst @@ -11,6 +11,7 @@ API reference :maxdepth: 2 cli + data_adapter data_catalog data_source drivers @@ -20,3 +21,4 @@ API reference stats plugin utils + uri_resolvers diff --git a/docs/api/data_adapter.rst b/docs/api/data_adapter.rst new file mode 100644 index 000000000..1c32e26a9 --- /dev/null +++ b/docs/api/data_adapter.rst @@ -0,0 +1,49 @@ +.. currentmodule:: hydromt.data_catalog.adapters + +DataAdapter +=========== + +RasterDataset +------------- + +.. autosummary:: + :toctree: ../_generated + + RasterDatasetAdapter + RasterDatasetAdapter.transform + +GeoDataset +---------- + +.. autosummary:: + :toctree: ../_generated + + GeoDatasetAdapter + GeoDatasetAdapter.transform + +GeoDataFrame +------------ + +.. autosummary:: + :toctree: ../_generated + + GeoDataFrameAdapter + GeoDataFrameAdapter.transform + +DataFrame +--------- + +.. autosummary:: + :toctree: ../_generated + + DataFrameAdapter + DataFrameAdapter.transform + +Dataset +------- + +.. autosummary:: + :toctree: ../_generated + + DatasetAdapter + DatasetAdapter.transform diff --git a/docs/api/data_catalog.rst b/docs/api/data_catalog.rst index 794b64944..39fa7c2ea 100644 --- a/docs/api/data_catalog.rst +++ b/docs/api/data_catalog.rst @@ -1,9 +1,5 @@ .. currentmodule:: hydromt.data_catalog -==== -Data -==== - .. _api_data_catalog: Data catalog @@ -63,244 +59,3 @@ Predefined data catalog PredefinedCatalog.get_catalog_file predefined_catalog.create_registry_file - - -DataSource -========== - -General -------- - -.. autosummary:: - :toctree: ../_generated - - sources.DataSource - sources.DataSource.summary - -RasterDataset -------------- - -.. 
autosummary:: - :toctree: ../_generated - - sources.RasterDatasetSource - sources.RasterDatasetSource.read_data - sources.RasterDatasetSource.to_stac_catalog - sources.RasterDatasetSource.get_bbox - sources.RasterDatasetSource.get_time_range - sources.RasterDatasetSource.detect_bbox - sources.RasterDatasetSource.detect_time_range - -GeoDataFrame ------------- - -.. autosummary:: - :toctree: ../_generated - - sources.GeoDataFrameSource - sources.GeoDataFrameSource.read_data - sources.GeoDataFrameSource.to_stac_catalog - sources.GeoDataFrameSource.get_bbox - sources.GeoDataFrameSource.detect_bbox - -DataFrame ---------- - -.. autosummary:: - :toctree: ../_generated - - sources.DataFrameSource - sources.DataFrameSource.read_data - sources.DataFrameSource.to_stac_catalog - -GeoDataset ------------- - -.. autosummary:: - :toctree: ../_generated - - sources.GeoDatasetSource - sources.GeoDatasetSource.read_data - sources.GeoDatasetSource.to_stac_catalog - sources.GeoDatasetSource.get_bbox - sources.GeoDatasetSource.detect_bbox - -URIResolver -================ - -General -------- - -.. autosummary:: - :toctree: ../_generated - - uri_resolvers.URIResolver - uri_resolvers.URIResolver.resolve - -ConventionResolver ------------------- - -.. autosummary:: - :toctree: ../_generated - - uri_resolvers.ConventionResolver - uri_resolvers.ConventionResolver.resolve - -RasterTindexResolver --------------------- -.. autosummary:: - :toctree: ../_generated - - uri_resolvers.RasterTindexResolver - uri_resolvers.RasterTindexResolver.resolve - -Driver -====== - -General -------- - -.. autosummary:: - :toctree: ../_generated - - drivers.base_driver.BaseDriver - -RasterDataset -------------- - -.. autosummary:: - :toctree: ../_generated - - drivers.raster.raster_dataset_driver.RasterDatasetDriver - drivers.raster.raster_dataset_driver.RasterDatasetDriver.read - drivers.raster.raster_dataset_driver.RasterDatasetDriver.write - -RasterDatasetXarrayDriver -------------------------- - -.. autosummary:: - :toctree: ../_generated - - drivers.raster.raster_xarray_driver.RasterDatasetXarrayDriver - drivers.raster.raster_xarray_driver.RasterDatasetXarrayDriver.read - drivers.raster.raster_xarray_driver.RasterDatasetXarrayDriver.write - -RasterioDriver --------------- - -.. autosummary:: - :toctree: ../_generated - - drivers.raster.rasterio_driver.RasterioDriver - drivers.raster.rasterio_driver.RasterioDriver.read - drivers.raster.rasterio_driver.RasterioDriver.write - -GeoDataFrame ------------- - -.. autosummary:: - :toctree: ../_generated - - drivers.geodataframe.geodataframe_driver.GeoDataFrameDriver - drivers.geodataframe.geodataframe_driver.GeoDataFrameDriver.read - drivers.geodataframe.geodataframe_driver.GeoDataFrameDriver.write - -PyogrioDriver -------------- - -.. autosummary:: - :toctree: ../_generated - - drivers.geodataframe.pyogrio_driver.PyogrioDriver - drivers.geodataframe.pyogrio_driver.PyogrioDriver.read - drivers.geodataframe.pyogrio_driver.PyogrioDriver.write - -GeoDataFrameTableDriver ------------------------ - -.. autosummary:: - :toctree: ../_generated - - drivers.geodataframe.table_driver.GeoDataFrameTableDriver - drivers.geodataframe.table_driver.GeoDataFrameTableDriver.read - drivers.geodataframe.table_driver.GeoDataFrameTableDriver.write - -DataFrame ---------- - -.. 
autosummary:: - :toctree: ../_generated - - drivers.dataframe.dataframe_driver.DataFrameDriver - drivers.dataframe.dataframe_driver.DataFrameDriver.read - drivers.dataframe.dataframe_driver.DataFrameDriver.write - -PandasDriver ------------- - -.. autosummary:: - :toctree: ../_generated - - drivers.dataframe.pandas_driver.PandasDriver - drivers.dataframe.pandas_driver.PandasDriver.read - drivers.dataframe.pandas_driver.PandasDriver.write - -GeoDataFrame ------------- - -.. autosummary:: - :toctree: ../_generated - - drivers.geodataset.geodataset_driver.GeoDatasetDriver - drivers.geodataset.geodataset_driver.GeoDatasetDriver.read - drivers.geodataset.geodataset_driver.GeoDatasetDriver.write - -DataAdapter -=========== - -General -------- - -RasterDataset -------------- - -.. autosummary:: - :toctree: ../_generated - - adapters.RasterDatasetAdapter - adapters.RasterDatasetAdapter.transform - -GeoDataset ----------- - -.. autosummary:: - :toctree: ../_generated - - adapters.GeoDatasetAdapter - adapters.GeoDatasetAdapter.transform - -GeoDataFrame ------------- - -.. autosummary:: - :toctree: ../_generated - - adapters.GeoDataFrameAdapter - adapters.GeoDataFrameAdapter.transform - -DataFrame ---------- - -.. autosummary:: - :toctree: ../_generated - - adapters.dataframe.DataFrameAdapter - adapters.dataframe.DataFrameAdapter.transform - -Dataset -------- - -.. autosummary:: - :toctree: ../_generated - - adapters.DatasetAdapter diff --git a/docs/api/data_source.rst b/docs/api/data_source.rst index 15ea2e999..4ce956d5a 100644 --- a/docs/api/data_source.rst +++ b/docs/api/data_source.rst @@ -1,5 +1,64 @@ +.. currentmodule:: hydromt.data_catalog.sources + .. _data_source: ============ Data sources ============ + +General +------- + +.. autosummary:: + :toctree: ../_generated + + DataSource + DataSource.summary + +RasterDataset +------------- + +.. autosummary:: + :toctree: ../_generated + + RasterDatasetSource + RasterDatasetSource.read_data + RasterDatasetSource.to_stac_catalog + RasterDatasetSource.get_bbox + RasterDatasetSource.get_time_range + RasterDatasetSource.detect_bbox + RasterDatasetSource.detect_time_range + +GeoDataFrame +------------ + +.. autosummary:: + :toctree: ../_generated + + GeoDataFrameSource + GeoDataFrameSource.read_data + GeoDataFrameSource.to_stac_catalog + GeoDataFrameSource.get_bbox + GeoDataFrameSource.detect_bbox + +DataFrame +--------- + +.. autosummary:: + :toctree: ../_generated + + DataFrameSource + DataFrameSource.read_data + DataFrameSource.to_stac_catalog + +GeoDataset +------------ + +.. autosummary:: + :toctree: ../_generated + + GeoDatasetSource + GeoDatasetSource.read_data + GeoDatasetSource.to_stac_catalog + GeoDatasetSource.get_bbox + GeoDatasetSource.detect_bbox diff --git a/docs/api/drivers.rst b/docs/api/drivers.rst index 9a696c5fa..26a23f3bc 100644 --- a/docs/api/drivers.rst +++ b/docs/api/drivers.rst @@ -1,5 +1,145 @@ +.. currentmodule:: hydromt.data_catalog.drivers + .. _drivers: ======= Drivers ======= + +Base +---- + +.. autosummary:: + :toctree: ../_generated + + BaseDriver + +RasterDataset +------------- + +.. autosummary:: + :toctree: ../_generated + + RasterDatasetDriver + RasterDatasetDriver.read + RasterDatasetDriver.write + +RasterDatasetXarrayDriver +^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: ../_generated + + RasterDatasetXarrayDriver + RasterDatasetXarrayDriver.read + RasterDatasetXarrayDriver.write + +RasterioDriver +^^^^^^^^^^^^^^ + +.. 
autosummary:: + :toctree: ../_generated + + RasterioDriver + RasterioDriver.read + RasterioDriver.write + +GeoDataFrame +------------ + +.. autosummary:: + :toctree: ../_generated + + GeoDataFrameDriver + GeoDataFrameDriver.read + GeoDataFrameDriver.write + +PyogrioDriver +^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: ../_generated + + PyogrioDriver + PyogrioDriver.read + PyogrioDriver.write + +GeoDataFrameTableDriver +^^^^^^^^^^^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: ../_generated + + GeoDataFrameTableDriver + GeoDataFrameTableDriver.read + GeoDataFrameTableDriver.write + +DataFrame +--------- + +.. autosummary:: + :toctree: ../_generated + + DataFrameDriver + DataFrameDriver.read + DataFrameDriver.write + +PandasDriver +^^^^^^^^^^^^ + +.. autosummary:: + :toctree: ../_generated + + PandasDriver + PandasDriver.read + PandasDriver.write + +GeoDataset +---------- + +.. autosummary:: + :toctree: ../_generated + + GeoDatasetDriver + GeoDatasetDriver.read + GeoDatasetDriver.write + +GeoDatasetXarrayDriver +^^^^^^^^^^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: ../_generated + + GeoDatasetXarrayDriver + GeoDatasetXarrayDriver.read + GeoDatasetXarrayDriver.write + +GeoDatasetVectorDriver +^^^^^^^^^^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: ../_generated + + GeoDatasetVectorDriver + GeoDatasetVectorDriver.read + GeoDatasetVectorDriver.write + +Dataset +------- + +.. autosummary:: + :toctree: ../_generated + + DatasetDriver + DatasetDriver.read + DatasetDriver.write + +DatasetXarrayDriver +^^^^^^^^^^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: ../_generated + + DatasetXarrayDriver + DatasetXarrayDriver.read + DatasetXarrayDriver.write diff --git a/docs/api/uri_resolvers.rst b/docs/api/uri_resolvers.rst new file mode 100644 index 000000000..ca5015d4e --- /dev/null +++ b/docs/api/uri_resolvers.rst @@ -0,0 +1,32 @@ +=========== +URIResolver +=========== + +.. currentmodule:: hydromt.data_catalog.uri_resolvers + +General +------- + +.. autosummary:: + :toctree: ../_generated + + URIResolver + URIResolver.resolve + +ConventionResolver +------------------ + +.. autosummary:: + :toctree: ../_generated + + ConventionResolver + ConventionResolver.resolve + +RasterTindexResolver +-------------------- + +.. autosummary:: + :toctree: ../_generated + + RasterTindexResolver + RasterTindexResolver.resolve diff --git a/docs/guides/advanced_user/data_types.rst b/docs/guides/advanced_user/data_types.rst index 837ac3cd6..4e062c8f5 100644 --- a/docs/guides/advanced_user/data_types.rst +++ b/docs/guides/advanced_user/data_types.rst @@ -1,5 +1,7 @@ .. _data_types: +.. currentmodule:: hydromt.data_catalog.drivers + Supported data types ==================== @@ -11,15 +13,16 @@ HydroMT currently supports the following data types: - :ref:`Dataset `: non-spatial n-dimensional data - :ref:`DataFrame `: 2D tabular data -Internally the RasterDataset, GeoDataset, and Dataset are represented by :py:class:`xarray.Dataset` objects, -the GeoDataFrame by :py:class:`geopandas.GeoDataFrame`, and the DataFrame by -:py:class:`pandas.DataFrame`. We use drivers, typically from third-party packages and sometimes -wrapped in HydroMT functions, to parse many different file formats to this standardized internal -data representation. +Internally the RasterDataset, GeoDataset, and Dataset are represented by +:py:class:`xarray.Dataset` objects, the GeoDataFrame by +:py:class:`geopandas.GeoDataFrame`, and the DataFrame by :py:class:`pandas.DataFrame`. 
+We use drivers, typically from third-party packages and sometimes wrapped in HydroMT +functions, to parse many different file formats to this standardized internal data +representation. .. note:: - Please contact us through the issue list if you would like to add other drivers. + It is also possible to create your own driver. See at :ref:`Custom Driver` .. _dimensions: @@ -44,26 +47,23 @@ Raster data (RasterDataset) .. _raster_formats: .. list-table:: - :widths: 17, 25, 28, 30 + :widths: 17, 25, 30 :header-rows: 1 * - Driver - File formats - - Method - Comments - * - ``raster`` + * - :py:class:`raster ` - GeoTIFF, ArcASCII, VRT, etc. (see `GDAL formats `_) - - :py:meth:`~hydromt.io.open_mfraster` - Based on :py:func:`xarray.open_rasterio` and :py:func:`rasterio.open` - * - ``raster_tindex`` + * - :py:class:`raster ` with the + :py:class:`raster_tindex ` resolver - raster tile index file (see `gdaltindex `_) - - :py:meth:`~hydromt.io.open_raster_from_tindex` - - Options to merge tiles via ``mosaic_kwargs``. - * - ``netcdf`` or ``zarr`` + - Options to merge tiles via `options -> mosaic_kwargs`. + * - :py:class:`raster_xarray ` - NetCDF and Zarr - - :py:func:`xarray.open_mfdataset`, :py:func:`xarray.open_zarr` - - required y and x dimensions_ + - required y and x dimensions .. _GeoTiff: @@ -73,18 +73,21 @@ Raster data (RasterDataset) Single raster files are parsed to a **RasterDataset** based on the **raster** driver. This driver supports 2D raster for which the dimensions are names "x" and "y". A potential third dimension is called "dim0". -The variable name is based on the filename, in this case "GLOBCOVER_200901_200912_300x300m". -The ``chunks`` key-word argument is passed to :py:meth:`~hydromt.io.open_mfraster` +The variable name is based on the filename, in this case `"GLOBCOVER_200901_200912_300x300m"`. +The `chunks` key-word argument is passed to :py:meth:`~hydromt.io.open_mfraster` and allows lazy reading of the data. .. code-block:: yaml globcover: - path: base/landcover/globcover/GLOBCOVER_200901_200912_300x300m.tif + uri: base/landcover/globcover/GLOBCOVER_200901_200912_300x300m.tif data_type: RasterDataset - driver: raster - driver_kwargs: - chunks: {x: 3600, y: 3600} + driver: + name: raster + options: + chunks: + x: 3600 + y: 3600 meta: category: landuse source_url: http://due.esrin.esa.int/page_globcover.php @@ -100,32 +103,36 @@ Multi-variable Virtual Raster Tileset (VRT) Multiple raster layers from different files are parsed using the **raster** driver. Each raster becomes a variable in the resulting RasterDataset based on its filename. The path to multiple files can be set using a sting glob or several keys, -see description of the ``path`` argument in the :ref:`yaml file description `. +see description of the `uri` argument in the :ref:`yaml file description `. Note that the rasters should have identical grids. -Here multiple .vrt files (dir.vrt, bas.vrt, etc.) are combined based on their variable name -into a single dataset with variables flwdir, basins, etc. -Other multiple file raster datasets (e.g. GeoTIFF files) can be read in the same way. -VRT files are useful for large raster datasets which are often tiled and can be combined using +Here multiple .vrt files (dir.vrt, bas.vrt, etc.) are combined based on their variable +name into a single dataset with variables flwdir, basins, etc. Other multiple file +raster datasets (e.g. GeoTIFF files) can be read in the same way. 
VRT files are useful +for large raster datasets which are often tiled and can be combined using `gdalbuildvrt. `_ .. code-block:: yaml merit_hydro: - path: base/merit_hydro/{variable}.vrt + uri: base/merit_hydro/{variable}.vrt data_type: RasterDataset - driver: raster - crs: 4326 - driver_kwargs: - chunks: {x: 6000, y: 6000} - rename: - dir: flwdir - bas: basins - upa: uparea - elv: elevtn - sto: strord - meta: + driver: + name: raster + options: + chunks: + x: 6000 + y: 6000 + data_adapter: + rename: + dir: flwdir + bas: basins + upa: uparea + elv: elevtn + sto: strord + metadata: + crs: 4326 category: topography source_version: 1.0 paper_doi: 10.1029/2019WR024873 @@ -138,7 +145,9 @@ VRT files are useful for large raster datasets which are often tiled and can be Tiled raster dataset ^^^^^^^^^^^^^^^^^^^^ -Tiled index datasets are parsed using the **raster_tindex** driver. +Tiled index datasets are parsed using the +:py:Class:`raster_tindex ` +:py:class:`~hydromt.data_catalog.uri_resolvers.uri_resolver.URIResolver`. This data format is used to combine raster tiles with different CRS projections. A polygon vector file (e.g. GeoPackage) is used to make a tile index with the spatial footprints of each tile. When reading a spatial slice of this data the files with @@ -146,10 +155,11 @@ intersecting footprints will be merged together in the CRS of the most central t Use `gdaltindex `_ to build an excepted tile index file. Here a GeoPackage with the tile index referring to individual GeoTiff raster tiles is used. -The ``mosaic_kwargs`` are passed to :py:meth:`~hydromt.io.open_raster_from_tindex` to -set the resampling ``method``. The name of the column in the tile index attribute table ``tileindex`` -which contains the raster tile file names is set in the ``driver_kwargs`` (to be directly passed as an argument to -:py:meth:`~hydromt.io.open_raster_from_tindex`). +The `mosaic_kwargs` are passed to :py:meth:`~hydromt._io._open_raster_from_tindex` to +set the resampling `method`. The name of the column in the tile index attribute table +`tileindex` which contains the raster tile file names is set in the `driver.options`` +(to be directly passed as an argument to +:py:meth:`~hydromt._io._open_raster_from_tindex`). .. code-block:: yaml @@ -157,12 +167,15 @@ which contains the raster tile file names is set in the ``driver_kwargs`` (to be path: static_data/base/grwl/tindex.gpkg data_type: RasterDataset driver: raster_tindex - nodata: 0 - driver_kwargs: - chunks: {x: 3000, y: 3000} - mosaic_kwargs: {method: nearest} - tileindex: location + options: + chunks: + x: 3000 + y: 3000 + mosaic_kwargs: + method: nearest + tileindex: location meta: + nodata: 0 category: hydrography paper_doi: 10.1126/science.aat0636 paper_ref: Allen and Pavelsky (2018) @@ -172,8 +185,9 @@ which contains the raster tile file names is set in the ``driver_kwargs`` (to be .. NOTE:: - Tiled raster datasets are not read lazily as different tiles have to be merged together based on - their values. For fast access to large raster datasets, other formats might be more suitable. + Tiled raster datasets are not read lazily as different tiles have to be merged + together based on their values. For fast access to large raster datasets, other + formats might be more suitable. .. _NC_raster: @@ -199,54 +213,52 @@ See list of recognized dimensions_ names. 


To read a raster dataset from a multiple file netcdf archive the following data entry
is used, where the `options` are passed to :py:func:`xarray.open_mfdataset`
(or :py:func:`xarray.open_zarr` for zarr data).
In case the CRS cannot be inferred from the netcdf metadata it should be defined with
the `crs` entry in the `metadata` here.
The path to multiple files can be set using a string glob or several keys,
see the description of the `uri` argument in the :ref:`yaml file description `.
In this example additional renaming and unit conversion preprocessing steps are added to
unify the data to match the HydroMT naming and unit :ref:`terminology `.

.. code-block:: yaml

    era5_hourly:
      uri: forcing/ERA5/org/era5_{variable}_{year}_hourly.nc
      data_type: RasterDataset
      driver:
        name: netcdf
        options:
          chunks: {latitude: 125, longitude: 120, time: 50}
          combine: by_coords
          decode_times: true
          parallel: true
      metadata:
        crs: 4326
        category: meteo
        paper_doi: 10.1002/qj.3803
        paper_ref: Hersbach et al. (2019)
        source_license: https://cds.climate.copernicus.eu/cdsapp/#!/terms/licence-to-use-copernicus-products
        source_url: https://doi.org/10.24381/cds.bd0915c6
      data_adapter:
        rename:
          t2m: temp
          tp: precip
        unit_add:
          temp: -273.15
        unit_mult:
          precip: 1000

Preprocess functions when combining multiple files
""""""""""""""""""""""""""""""""""""""""""""""""""

In :py:func:`xarray.open_mfdataset`, xarray allows for a *preprocess* function to be run before merging several
netcdf files together. In HydroMT, some preprocess functions are available and can be passed through the ``driver_kwargs``
options in the same way as any xr.open_mfdataset options. These preprocess functions are:

- **round_latlon**: round x and y dimensions to 5 decimals to avoid merging problems in xarray due to small differences
  in x, y values in the different netcdf files of the same data source.
- **to_datetimeindex**: force parsing the time dimension to a datetime index.
- **remove_duplicates**: remove time duplicates.

In :py:func:`xarray.open_mfdataset`, xarray allows for a **preprocess** function to be
run before merging several netcdf files together. In HydroMT, some preprocess functions
are available and can be passed through the options in the same way as any
xr.open_mfdataset options. These preprocess functions are found in
:py:obj:`hydromt.data_catalog.preprocessing`.

.. _GeoDataFrame:

Vector data (GeoDataFrame)
--------------------------

.. _vector_formats:

.. list-table::
   :widths: 17, 25, 30
   :header-rows: 1

   * - Driver
     - File formats
     - Comments
   * - :py:class:`vector `
     - ESRI Shapefile, GeoPackage, GeoJSON, etc.
     - Point, Line and Polygon geometries. 
Uses :py:func:`pyogrio.read_dataframe` + * - :py:class:`geodataframe_table ` - CSV, XY, PARQUET and EXCEL. - - :py:meth:`~hydromt.io.open_vector` - - Point geometries only. Uses :py:meth:`~hydromt.io.open_vector_from_table` - - + - Point geometries only. Uses :py:meth:`~hydromt._io._open_vector_from_table` .. _GPKG_vector: @@ -290,19 +297,21 @@ columns of the attribute table in case of a GeoDataFrame. .. code-block:: yaml - GDP_world: - path: base/emissions/GDP-countries/World_countries_GDPpcPPP.gpkg - data_type: GeoDataFrame - driver: vector - driver_kwargs: - layer: GDP - rename: - GDP: gdp - unit_mult: - gdp: 0.001 - meta: - category: socio-economic - source_version: 1.0 + GDP_world: + uri: base/emissions/GDP-countries/World_countries_GDPpcPPP.gpkg + data_type: GeoDataFrame + driver: + name: vector + options: + layer: GDP + data_adapter: + rename: + GDP: gdp + unit_mult: + gdp: 0.001 + metadata: + category: socio-economic + source_version: 1.0 .. _textdelimited_vector: @@ -336,28 +345,24 @@ of the GeoDataFrame attribute table. ... As the CRS of the coordinates cannot be inferred from the data it must be set in the -data entry in the yaml file as shown in the example below. The internal data format -is based on the file extension unless the ``driver_kwargs`` ``driver`` option is set. -See :py:meth:`~hydromt.io.open_vector` and :py:func:`~hydromt.io.open_vector_from_table` for more -options. +data entry in the yaml file as shown in the example below. .. code-block:: yaml stations: - path: /path/to/stations.csv + uri: /path/to/stations.csv data_type: GeoDataFrame - driver: vector_table - crs: 4326 - driver_kwargs: - driver: csv + driver: geodataframe_table + metadata: + crs: 4326 .. _binary_vector: -HydroMT also supports reading and writing vector data in binary format. Currently only parquet is -supported, but others could be added if desired. The structure of the files should be the same as -the text format files described above but writing according to the parquet file spec. Since this is -a binary format, not examples are provided, but for example pandas can write the same data structure -to parquet as it can csv. +HydroMT also supports reading and writing vector data in binary format. Currently only +parquet is supported, but others could be added if desired. The structure of the files +should be the same as the text format files described above but writing according to the +parquet file spec. Since this is a binary format, not examples are provided, but for +example pandas can write the same data structure to parquet as it can csv. .. _GeoDataset: @@ -371,20 +376,17 @@ Geospatial point time-series (GeoDataset) .. _geo_formats: .. list-table:: - :widths: 17, 25, 28, 30 + :widths: 17, 25, 30 :header-rows: 1 * - Driver - File formats - - Method - Comments - * - ``vector`` + * - :py:class:`vector ` - Combined point location (e.g. CSV or GeoJSON) and text delimited time-series (e.g. CSV) data. - - :py:meth:`~hydromt.io.open_geodataset` - - Uses :py:meth:`~hydromt.io.open_vector`, :py:meth:`~hydromt.io.open_timeseries_from_table` - * - ``netcdf`` or ``zarr`` + - Uses :py:meth:`~hydromt._io._open_vector`, :py:meth:`~hydromt._io._open_timeseries_from_table` + * - :py:class:`geodataset_xarray ` - NetCDF and Zarr - - :py:func:`xarray.open_mfdataset`, :py:func:`xarray.open_zarr` - required time and index dimensions_ and x- and y coordinates. @@ -411,31 +413,33 @@ on a list of recognized dimensions_ names. 
waterlevel (time, stations) To read a point time-series dataset from a multiple file netcdf archive the following data entry -is used, where the ``driver_kwargs`` are passed to :py:func:`xarray.open_mfdataset` +is used, where the options are passed to :py:func:`xarray.open_mfdataset` (or :py:func:`xarray.open_zarr` for zarr data). In case the CRS cannot be inferred from the netcdf data it is defined here. The path to multiple files can be set using a sting glob or several keys, -see description of the ``path`` argument in the :ref:`yaml file description `. +see description of the `uri` argument in the :ref:`yaml file description `. In this example additional renaming and unit conversion preprocessing steps are added to unify the data to match the HydroMT naming and unit :ref:`terminology `. .. code-block:: yaml gtsmv3_eu_era5: - path: reanalysis-waterlevel-{year}-m{month:02d}.nc + uri: reanalysis-waterlevel-{year}-m{month:02d}.nc data_type: GeoDataset - driver: netcdf - crs: 4326 - driver_kwargs: - chunks: {stations: 100, time: 1500} - combine: by_coords - decode_times: true - parallel: true - rename: - station_x_coordinate: lon - station_y_coordinate: lat - stations: index - meta: + driver: + name: netcdf + options: + chunks: {stations: 100, time: 1500} + combine: by_coords + decode_times: true + parallel: true + data_adapter: + rename: + station_x_coordinate: lon + station_y_coordinate: lat + stations: index + metadata: + crs: 4326 category: ocean paper_doi: 10.24381/cds.8c59054f paper_ref: Copernicus Climate Change Service 2019 @@ -447,31 +451,34 @@ unify the data to match the HydroMT naming and unit :ref:`terminology ` - NetCDF and Zarr - - :py:func:`xarray.open_mfdataset`, :py:func:`xarray.open_zarr` - required time and index dimensions_. .. _NC_timeseries: Netcdf time-series dataset -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^ -NetCDF and zarr timeseries data are parsed to **Dataset** with the **netcdf** and **zarr** drivers. -The resulting dataset is similar to the **GeoDataset** except that it lacks a spatial dimension. +NetCDF and zarr timeseries data are parsed to **Dataset** with the +:py:class:`~dataset.xarray_driver.DatasetXarrayDriver`. +The resulting dataset is similar to the **GeoDataset** except that it lacks a spatial +dimension. .. code-block:: yaml timeseries_dataset: - path: /path/to/timeseries.netcdf + uri: /path/to/timeseries.netcdf data_type: Dataset driver: netcdf - - .. _DataFrame: 2D tabular data (DataFrame) @@ -526,30 +531,15 @@ The resulting dataset is similar to the **GeoDataset** except that it lacks a sp .. _dataframe_formats: .. list-table:: - :widths: 17, 25, 28, 30 + :widths: 17, 25, 30 :header-rows: 1 * - Driver - File formats - - Method - Comments - * - ``csv`` - - Comma-separated files (or using another delimiter) - - :py:func:`pandas.read_csv` - - See :py:func:`pandas.read_csv` for all - * - ``excel`` - - Excel files - - :py:func:`pandas.read_excel` - - If required, provide a sheet name through driver_kwargs - * - ``parquet`` - - Binary encoded columnar data format - - :py:func:`pandas.read_parquet` - - - * - ``fwf`` - - Fixed width delimited text files - - :py:func:`pandas.read_fwf` - - The formatting of these files can either be inferred or defined by the user, both through the driver_kwargs. - + * - :py:class:`csv ` + - any file readable by pandas + - Provide a sheet name or formatting through options .. 
note::

Supported files
^^^^^^^^^^^^^^^

The DataFrameAdapter is quite flexible in supporting different types of tabular data
formats. The driver allows for flexible reading of files: for example, both mapping
tables and time series data are supported. Please note that for time series, the
`options` need to be used to set the correct column for indexing, and the formatting and
parsing of datetime strings. See the relevant pandas function for which arguments can be
used. Also note that the driver is not restricted to comma-separated files, as
the delimiter can be given to the reader through the `options`.

.. code-block:: yaml

    observations:
      uri: data/lulc/globcover_mapping.csv
      data_type: DataFrame
      driver:
        name: csv
        options:
          header: null # null translates to None in Python -> no header
          index_col: 0
          parse_dates: false
      metadata:
        category: parameter_mapping

.. note::

   The yml-parser does not correctly parse `None` arguments. When this is required, the `null` argument should be used instead.
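
As an illustration of the time-series case described above, a semicolon-delimited CSV
file could be catalogued as follows. This is a minimal sketch: the source name, path and
column name are hypothetical, and the `options` map directly to
:py:func:`pandas.read_csv` arguments.

.. code-block:: yaml

    discharge_obs:
      uri: data/observations/discharge_hourly.csv  # hypothetical path
      data_type: DataFrame
      driver:
        name: csv
        options:
          sep: ";"          # non-default delimiter
          index_col: time   # column to use as index
          parse_dates: true # parse the index column as datetimes
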
From 8f7d3d7475a862a1e693511d012a0560b04b3c7d Mon Sep 17 00:00:00 2001 From: Jaap <33715902+Jaapel@users.noreply.github.com> Date: Thu, 5 Sep 2024 11:12:07 +0200 Subject: [PATCH 04/10] update mosaic_kwargs behaviour --- hydromt/data_catalog/drivers/raster/rasterio_driver.py | 7 ++++++- tests/data_catalog/drivers/raster/test_rasterio_driver.py | 8 ++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/hydromt/data_catalog/drivers/raster/rasterio_driver.py b/hydromt/data_catalog/drivers/raster/rasterio_driver.py index 43d3fcb22..bfa4c6289 100644 --- a/hydromt/data_catalog/drivers/raster/rasterio_driver.py +++ b/hydromt/data_catalog/drivers/raster/rasterio_driver.py @@ -56,6 +56,7 @@ def read( {"time_range": time_range}, ) kwargs: Dict[str, Any] = {} + mosaic_kwargs: Dict[str, Any] = self.options.get("mosaic_kwargs", {}) # get source-specific options cache_root: str = str( @@ -78,7 +79,11 @@ def read( uris = uris_cached if mask is not None: - kwargs.update({"mosaic_kwargs": {"mask": mask}}) + mosaic_kwargs.update({"mask": mask}) + + # get mosaic kwargs + if mosaic_kwargs: + kwargs.update({"mosaic_kwargs": mosaic_kwargs}) if np.issubdtype(type(metadata.nodata), np.number): kwargs.update(nodata=metadata.nodata) diff --git a/tests/data_catalog/drivers/raster/test_rasterio_driver.py b/tests/data_catalog/drivers/raster/test_rasterio_driver.py index 1846282a1..c9704de70 100644 --- a/tests/data_catalog/drivers/raster/test_rasterio_driver.py +++ b/tests/data_catalog/drivers/raster/test_rasterio_driver.py @@ -2,6 +2,7 @@ from os.path import join from pathlib import Path from typing import Tuple +from unittest.mock import MagicMock, patch import numpy as np import pytest @@ -63,6 +64,13 @@ def test_sets_nodata(self, rioda: xr.DataArray, tmp_path: Path): ) assert ds["test_sets_nodata"].raster.nodata == 42 + @patch("hydromt.data_catalog.drivers.raster.rasterio_driver.open_mfraster") + def test_sets_mosaic_kwargs(self, fake_open_mfraster: MagicMock): + uris = ["test", "test2"] + mosaic_kwargs = {"mykwarg: 0"} + RasterioDriver(options={"mosaic_kwargs": mosaic_kwargs}).read(uris=uris) + fake_open_mfraster.assert_called_once_with(uris, mosaic_kwargs=mosaic_kwargs) + class TestOpenMFRaster: @pytest.fixture() From 0d3739fab52b6e31b3367f1e7f5e63dd9648980b Mon Sep 17 00:00:00 2001 From: Jaap <33715902+Jaapel@users.noreply.github.com> Date: Thu, 5 Sep 2024 11:12:45 +0200 Subject: [PATCH 05/10] remove data sources header --- docs/guides/advanced_user/data_sources.rst | 4 ---- docs/guides/advanced_user/index.rst | 1 - 2 files changed, 5 deletions(-) delete mode 100644 docs/guides/advanced_user/data_sources.rst diff --git a/docs/guides/advanced_user/data_sources.rst b/docs/guides/advanced_user/data_sources.rst deleted file mode 100644 index 56b3ea200..000000000 --- a/docs/guides/advanced_user/data_sources.rst +++ /dev/null @@ -1,4 +0,0 @@ -.. 
_data_sources: - -Data sources -============ diff --git a/docs/guides/advanced_user/index.rst b/docs/guides/advanced_user/index.rst index b78790f37..28dedd088 100644 --- a/docs/guides/advanced_user/index.rst +++ b/docs/guides/advanced_user/index.rst @@ -5,7 +5,6 @@ Advanced user guide architecture data_prepare_cat - data_sources data_types hydromt_python methods_stats From 24d9fc2a4a2a77f89aa08330d0ab0c654e21932a Mon Sep 17 00:00:00 2001 From: Jaap <33715902+Jaapel@users.noreply.github.com> Date: Thu, 5 Sep 2024 15:02:37 +0200 Subject: [PATCH 06/10] fix missing dataset driver and options in resolvers --- hydromt/data_catalog/data_catalog.py | 31 ++++++------------- hydromt/data_catalog/sources/factory.py | 2 ++ .../uri_resolvers/convention_resolver.py | 3 -- .../uri_resolvers/raster_tindex_resolver.py | 7 ++--- .../uri_resolvers/uri_resolver.py | 4 +-- .../test_raster_tindex_resolver.py | 21 +++++++------ 6 files changed, 25 insertions(+), 43 deletions(-) diff --git a/hydromt/data_catalog/data_catalog.py b/hydromt/data_catalog/data_catalog.py index 2294157d9..bb8103d44 100644 --- a/hydromt/data_catalog/data_catalog.py +++ b/hydromt/data_catalog/data_catalog.py @@ -666,7 +666,8 @@ def from_yml( A yaml data entry is provided below, where all the text between <> should be filled by the user. Multiple data sources of the same data type should be grouped. Currently the following data types are supported: - {'RasterDataset', 'GeoDataset', 'GeoDataFrame'}. See the specific data adapters + {'RasterDataset', 'GeoDataset', 'GeoDataFrame', 'DataFrame', 'Dataset'}. See the + specific data adapters for more information about the required and optional arguments. .. code-block:: yaml @@ -678,22 +679,12 @@ def from_yml( name: sha256: # only if the root is an archive : - path: + uri: data_type: driver: - filesystem: - driver_kwargs: - : - nodata: - : - rename: - : - : - unit_add: - : - unit_mult: - : - meta: + data_adapter: + uri_resolver: + metadata: source_url: source_version: source_licence: @@ -802,13 +793,9 @@ def from_dict( "path": , "data_type": , "driver": , - "filesystem": , - "driver_kwargs": {: }, - "nodata": , - "rename": {: }, - "unit_add": {: }, - "unit_mult": {: }, - "meta": {...}, + "data_adapter": , + "uri_resolver": , + "metadata": {...}, "placeholders": {: }, } : { diff --git a/hydromt/data_catalog/sources/factory.py b/hydromt/data_catalog/sources/factory.py index dd6868212..84bd5d2d4 100644 --- a/hydromt/data_catalog/sources/factory.py +++ b/hydromt/data_catalog/sources/factory.py @@ -6,6 +6,7 @@ from hydromt._typing.type_def import DataType from hydromt.data_catalog.sources import ( DataFrameSource, + DatasetSource, DataSource, GeoDataFrameSource, GeoDatasetSource, @@ -15,6 +16,7 @@ # Map DataType to DataSource, need to add here when implementing a new Type available_sources: Dict[DataType, DataSource] = { "DataFrame": DataFrameSource, + "Dataset": DatasetSource, "RasterDataset": RasterDatasetSource, "GeoDataFrame": GeoDataFrameSource, "GeoDataset": GeoDatasetSource, diff --git a/hydromt/data_catalog/uri_resolvers/convention_resolver.py b/hydromt/data_catalog/uri_resolvers/convention_resolver.py index 687c15c99..1cf66a4c8 100644 --- a/hydromt/data_catalog/uri_resolvers/convention_resolver.py +++ b/hydromt/data_catalog/uri_resolvers/convention_resolver.py @@ -132,7 +132,6 @@ def resolve( variables: Optional[List[str]] = None, metadata: Optional[SourceMetadata] = None, handle_nodata: NoDataStrategy = NoDataStrategy.RAISE, - options: Optional[Dict[str, Any]] = None, ) -> List[str]: 
"""Resolve the placeholders in the URI using naming conventions. @@ -152,8 +151,6 @@ def resolve( DataSource metadata. handle_nodata : NoDataStrategy, optional how to react when no data is found, by default NoDataStrategy.RAISE - options : Optional[Dict[str, Any]], optional - extra options for this resolver, by default None Returns ------- diff --git a/hydromt/data_catalog/uri_resolvers/raster_tindex_resolver.py b/hydromt/data_catalog/uri_resolvers/raster_tindex_resolver.py index 3e6856e65..fe0cae660 100644 --- a/hydromt/data_catalog/uri_resolvers/raster_tindex_resolver.py +++ b/hydromt/data_catalog/uri_resolvers/raster_tindex_resolver.py @@ -3,7 +3,7 @@ from logging import Logger, getLogger from os.path import abspath, dirname, join from pathlib import Path -from typing import Any, Dict, List, Optional, Union +from typing import List, Optional, Union import geopandas as gpd @@ -34,7 +34,6 @@ def resolve( variables: Union[int, tuple[float, str], None] = None, metadata: Optional[SourceMetadata], handle_nodata: NoDataStrategy = NoDataStrategy.RAISE, - options: Optional[Dict[str, Any]] = None, ) -> List[str]: """Resolve URIs of a raster tindex file. @@ -54,8 +53,6 @@ def resolve( DataSource metadata. handle_nodata : NoDataStrategy, optional how to react when no data is found, by default NoDataStrategy.RAISE - options : Optional[Dict[str, Any]], optional - extra options for this resolver, by default None Returns ------- @@ -71,7 +68,7 @@ def resolve( raise ValueError(f"Resolver {self.name} needs a mask") gdf = gpd.read_file(uri) gdf = gdf.iloc[gdf.sindex.query(mask.to_crs(gdf.crs).union_all())] - tileindex: Optional[str] = options.get("tileindex") + tileindex: Optional[str] = self.options.get("tileindex") if tileindex is None: raise ValueError( f"{self.__class__.__name__} needs options specifying 'tileindex'" diff --git a/hydromt/data_catalog/uri_resolvers/uri_resolver.py b/hydromt/data_catalog/uri_resolvers/uri_resolver.py index 7fc791a79..f612307f9 100644 --- a/hydromt/data_catalog/uri_resolvers/uri_resolver.py +++ b/hydromt/data_catalog/uri_resolvers/uri_resolver.py @@ -19,6 +19,7 @@ class URIResolver(AbstractBaseModel, ABC): model_config = ConfigDict(extra="forbid") filesystem: FS = Field(default_factory=LocalFileSystem) + options: Dict[str, Any] = Field(default_factory=dict) @abstractmethod def resolve( @@ -31,7 +32,6 @@ def resolve( zoom_level: Optional[Zoom] = None, metadata: Optional[SourceMetadata] = None, handle_nodata: NoDataStrategy = NoDataStrategy.RAISE, - options: Optional[Dict[str, Any]] = None, ) -> List[str]: """Resolve a single uri to multiple uris. @@ -51,8 +51,6 @@ def resolve( Metadata of DataSource. 
handle_nodata : NoDataStrategy, optional how to react when no data is found, by default NoDataStrategy.RAISE - options : Optional[Dict[str, Any]], optional - extra options for this resolver, by default None Returns ------- diff --git a/tests/data_catalog/uri_resolvers/test_raster_tindex_resolver.py b/tests/data_catalog/uri_resolvers/test_raster_tindex_resolver.py index 4a34b39f0..92178117e 100644 --- a/tests/data_catalog/uri_resolvers/test_raster_tindex_resolver.py +++ b/tests/data_catalog/uri_resolvers/test_raster_tindex_resolver.py @@ -70,12 +70,13 @@ def test_resolves_correctly(self, raster_tindex): geom = gpd.GeoDataFrame(geometry=[box(-78, 0.0005, -65, 4)], crs=4326) metadata = SourceMetadata() options = {"tileindex": "location"} - resolver = RasterTindexResolver(filesystem=AbstractFileSystem()) + resolver = RasterTindexResolver( + filesystem=AbstractFileSystem(), options=options + ) paths = resolver.resolve( uri=raster_tindex, metadata=metadata, mask=geom, - options=options, ) assert len(paths) == 2 assert ( @@ -92,7 +93,6 @@ def test_resolves_correctly(self, raster_tindex): uri=raster_tindex, metadata=metadata, mask=geom, - options=options, ) assert len(paths) == 1 path = str(Path(join(dirname(raster_tindex), "GRWL_mask_V01.01/NA19.tif"))) @@ -110,13 +110,14 @@ def test_raises_no_tileindex(self, raster_tindex): uri=raster_tindex, metadata=metadata, mask=geom, - options={}, ) def test_raises_missing_tileindex(self, raster_tindex): - resolver = RasterTindexResolver(filesystem=AbstractFileSystem()) - metadata = SourceMetadata() options = {"tileindex": "file"} + resolver = RasterTindexResolver( + filesystem=AbstractFileSystem(), options=options + ) + metadata = SourceMetadata() geom = gpd.GeoDataFrame(geometry=[box(-78, 0.0005, -65, 4)], crs=4326) with pytest.raises( IOError, @@ -126,18 +127,18 @@ def test_raises_missing_tileindex(self, raster_tindex): uri=raster_tindex, metadata=metadata, mask=geom, - options=options, ) def test_raises_no_intersecting_files(self, raster_tindex): - resolver = RasterTindexResolver(filesystem=AbstractFileSystem()) - metadata = SourceMetadata() options = {"tileindex": "file"} + resolver = RasterTindexResolver( + filesystem=AbstractFileSystem(), options=options + ) + metadata = SourceMetadata() geom = gpd.GeoDataFrame(geometry=[box(4, 52, 5, 53)], crs=4326) with pytest.raises(NoDataException, match="found no intersecting tiles."): resolver.resolve( uri=raster_tindex, metadata=metadata, mask=geom, - options=options, ) From af6e0ce8885e1f0a488634ec1cb789f4a3fe61cf Mon Sep 17 00:00:00 2001 From: Jaap <33715902+Jaapel@users.noreply.github.com> Date: Thu, 5 Sep 2024 15:03:59 +0200 Subject: [PATCH 07/10] doctest for data_types --- data/catalogs/test_data_catalog.py | 4 +- docs/assets/data_types/csv_dataframe.yml | 11 + docs/assets/data_types/csv_geodataframe.yml | 6 + docs/assets/data_types/csv_geodataset.yml | 9 + docs/assets/data_types/gpkg_geodataframe.yml | 15 + docs/assets/data_types/netcdf_dataset.yml | 4 + docs/assets/data_types/netcdf_geodataset.yml | 22 ++ .../data_types/netcdf_raster_dataset.yml | 26 ++ .../single_variable_geotiff_raster.yml | 15 + .../data_types/tiled_raster_dataset.yml | 23 ++ docs/assets/data_types/vrt_raster_dataset.yml | 24 ++ .../guides/advanced_user/data_prepare_cat.rst | 4 +- docs/guides/advanced_user/data_types.rst | 297 ++++++++---------- 13 files changed, 286 insertions(+), 174 deletions(-) create mode 100644 docs/assets/data_types/csv_dataframe.yml create mode 100644 docs/assets/data_types/csv_geodataframe.yml create mode 
100644 docs/assets/data_types/csv_geodataset.yml
 create mode 100644 docs/assets/data_types/gpkg_geodataframe.yml
 create mode 100644 docs/assets/data_types/netcdf_dataset.yml
 create mode 100644 docs/assets/data_types/netcdf_geodataset.yml
 create mode 100644 docs/assets/data_types/netcdf_raster_dataset.yml
 create mode 100644 docs/assets/data_types/single_variable_geotiff_raster.yml
 create mode 100644 docs/assets/data_types/tiled_raster_dataset.yml
 create mode 100644 docs/assets/data_types/vrt_raster_dataset.yml

diff --git a/data/catalogs/test_data_catalog.py b/data/catalogs/test_data_catalog.py
index 79b0c6db3..4f01147bb 100644
--- a/data/catalogs/test_data_catalog.py
+++ b/data/catalogs/test_data_catalog.py
@@ -75,7 +75,7 @@ def test_data_catalog(args, datacatalog):
     logger.info("Checking paths of data catalog sources")
     for source_name, source in datacatalog.__iter__():
         logger.info(f"Checking paths of {source_name}")
-        if isinstance(source.driver.metadata_resolver, RasterTindexResolver):
+        if isinstance(source.uri_resolver, RasterTindexResolver):
             if not exists(source.full_uri):
                 error_count += 1
                 logger.error(
@@ -84,7 +84,5 @@
             continue
 
         else:
-            paths = source.driver.metadata_resolver.resolve(
-                source.full_uri, source.driver.filesystem
-            )
+            paths = source.uri_resolver.resolve(source.full_uri)
             for path in paths:
diff --git a/docs/assets/data_types/csv_dataframe.yml b/docs/assets/data_types/csv_dataframe.yml
new file mode 100644
index 000000000..532caca01
--- /dev/null
+++ b/docs/assets/data_types/csv_dataframe.yml
@@ -0,0 +1,11 @@
+observations:
+  uri: data/lulc/globcover_mapping.csv
+  data_type: DataFrame
+  driver:
+    name: pandas
+    options:
+      header: null # null translates to None in Python -> no header
+      index_col: 0
+      parse_dates: false
+  metadata:
+    category: parameter_mapping
diff --git a/docs/assets/data_types/csv_geodataframe.yml b/docs/assets/data_types/csv_geodataframe.yml
new file mode 100644
index 000000000..6fa6c1c6c
--- /dev/null
+++ b/docs/assets/data_types/csv_geodataframe.yml
@@ -0,0 +1,6 @@
+stations:
+  uri: /path/to/stations.csv
+  data_type: GeoDataFrame
+  driver: geodataframe_table
+  metadata:
+    crs: 4326
diff --git a/docs/assets/data_types/csv_geodataset.yml b/docs/assets/data_types/csv_geodataset.yml
new file mode 100644
index 000000000..212a9861f
--- /dev/null
+++ b/docs/assets/data_types/csv_geodataset.yml
@@ -0,0 +1,9 @@
+waterlevels_txt:
+  uri: /path/to/stations.csv
+  data_type: GeoDataset
+  driver:
+    name: geodataset_vector
+    options:
+      data_path: /path/to/stations_data.csv
+  metadata:
+    crs: 4326
diff --git a/docs/assets/data_types/gpkg_geodataframe.yml b/docs/assets/data_types/gpkg_geodataframe.yml
new file mode 100644
index 000000000..1cbc6d962
--- /dev/null
+++ b/docs/assets/data_types/gpkg_geodataframe.yml
@@ -0,0 +1,15 @@
+GDP_world:
+  uri: base/emissions/GDP-countries/World_countries_GDPpcPPP.gpkg
+  data_type: GeoDataFrame
+  driver:
+    name: pyogrio
+    options:
+      layer: GDP
+  data_adapter:
+    rename:
+      GDP: gdp
+    unit_mult:
+      gdp: 0.001
+  metadata:
+    category: socio-economic
+    source_version: 1.0
diff --git a/docs/assets/data_types/netcdf_dataset.yml b/docs/assets/data_types/netcdf_dataset.yml
new file mode 100644
index 000000000..8b279d735
--- /dev/null
+++ b/docs/assets/data_types/netcdf_dataset.yml
@@ -0,0 +1,4 @@
+timeseries_dataset:
+  uri: /path/to/timeseries.netcdf
+  data_type: Dataset
+  driver: dataset_xarray
diff --git a/docs/assets/data_types/netcdf_geodataset.yml b/docs/assets/data_types/netcdf_geodataset.yml
new file mode 
100644 index 000000000..ae3a13cb6 --- /dev/null +++ b/docs/assets/data_types/netcdf_geodataset.yml @@ -0,0 +1,22 @@ +gtsmv3_eu_era5: + uri: reanalysis-waterlevel-{year}-m{month:02d}.nc + data_type: GeoDataset + driver: + name: geodataset_xarray + options: + chunks: {stations: 100, time: 1500} + combine: by_coords + decode_times: true + parallel: true + data_adapter: + rename: + station_x_coordinate: lon + station_y_coordinate: lat + stations: index + metadata: + crs: 4326 + category: ocean + paper_doi: 10.24381/cds.8c59054f + paper_ref: Copernicus Climate Change Service 2019 + source_license: https://cds.climate.copernicus.eu/cdsapp/#!/terms/licence-to-use-copernicus-products + source_url: https://cds.climate.copernicus.eu/cdsapp#!/dataset/10.24381/cds.8c59054f?tab=overview diff --git a/docs/assets/data_types/netcdf_raster_dataset.yml b/docs/assets/data_types/netcdf_raster_dataset.yml new file mode 100644 index 000000000..3d3d70832 --- /dev/null +++ b/docs/assets/data_types/netcdf_raster_dataset.yml @@ -0,0 +1,26 @@ + +era5_hourly: + uri: forcing/ERA5/org/era5_{variable}_{year}_hourly.nc + data_type: RasterDataset + driver: + name: raster_xarray + options: + chunks: {latitude: 125, longitude: 120, time: 50} + combine: by_coords + decode_times: true + parallel: true + metadata: + crs: 4326 + category: meteo + paper_doi: 10.1002/qj.3803 + paper_ref: Hersbach et al. (2019) + source_license: https://cds.climate.copernicus.eu/cdsapp/#!/terms/licence-to-use-copernicus-products + source_url: https://doi.org/10.24381/cds.bd0915c6 + data_adapter: + rename: + t2m: temp + tp: precip + unit_add: + temp: -273.15 + unit_mult: + precip: 1000 diff --git a/docs/assets/data_types/single_variable_geotiff_raster.yml b/docs/assets/data_types/single_variable_geotiff_raster.yml new file mode 100644 index 000000000..cbdec0b15 --- /dev/null +++ b/docs/assets/data_types/single_variable_geotiff_raster.yml @@ -0,0 +1,15 @@ +globcover: + uri: base/landcover/globcover/GLOBCOVER_200901_200912_300x300m.tif + data_type: RasterDataset + driver: + name: rasterio + options: + chunks: + x: 3600 + y: 3600 + metadata: + category: landuse + source_url: http://due.esrin.esa.int/page_globcover.php + source_license: CC-BY-3.0 + paper_ref: Arino et al (2012) + paper_doi: 10.1594/PANGAEA.787668 diff --git a/docs/assets/data_types/tiled_raster_dataset.yml b/docs/assets/data_types/tiled_raster_dataset.yml new file mode 100644 index 000000000..866025681 --- /dev/null +++ b/docs/assets/data_types/tiled_raster_dataset.yml @@ -0,0 +1,23 @@ +grwl_mask: + uri: static_data/base/grwl/tindex.gpkg + data_type: RasterDataset + uri_resolver: + name: raster_tindex + options: + tileindex: location + driver: + name: rasterio + options: + chunks: + x: 3000 + y: 3000 + mosaic_kwargs: + method: nearest + metadata: + nodata: 0 + category: hydrography + paper_doi: 10.1126/science.aat0636 + paper_ref: Allen and Pavelsky (2018) + source_license: CC BY 4.0 + source_url: https://doi.org/10.5281/zenodo.1297434 + source_version: 1.01 diff --git a/docs/assets/data_types/vrt_raster_dataset.yml b/docs/assets/data_types/vrt_raster_dataset.yml new file mode 100644 index 000000000..0d3d37f7f --- /dev/null +++ b/docs/assets/data_types/vrt_raster_dataset.yml @@ -0,0 +1,24 @@ +merit_hydro: + uri: base/merit_hydro/{variable}.vrt + data_type: RasterDataset + driver: + name: rasterio + options: + chunks: + x: 6000 + y: 6000 + data_adapter: + rename: + dir: flwdir + bas: basins + upa: uparea + elv: elevtn + sto: strord + metadata: + crs: 4326 + category: topography + 
source_version: 1.0
+  paper_doi: 10.1029/2019WR024873
+  paper_ref: Yamazaki et al. (2019)
+  source_url: http://hydro.iis.u-tokyo.ac.jp/~yamadai/MERIT_Hydro
+  source_license: CC-BY-NC 4.0 or ODbL 1.0
diff --git a/docs/guides/advanced_user/data_prepare_cat.rst b/docs/guides/advanced_user/data_prepare_cat.rst
index c936f0315..31ea0af0e 100644
--- a/docs/guides/advanced_user/data_prepare_cat.rst
+++ b/docs/guides/advanced_user/data_prepare_cat.rst
@@ -29,8 +29,8 @@ shown keys is highly recommended. The ``rename``, ``nodata``, ``unit_add`` and
 ``unit_mult`` options are set per variable (or attribute table column in case of a
 GeoDataFrame).
 
-.. include:: ../../assets/example_catalog.yml
-   :code: yaml
+.. literalinclude:: ../../assets/example_catalog.yml
+   :language: yaml
 
 .. testsetup:: *
 
diff --git a/docs/guides/advanced_user/data_types.rst b/docs/guides/advanced_user/data_types.rst
index 4e062c8f5..f201035d9 100644
--- a/docs/guides/advanced_user/data_types.rst
+++ b/docs/guides/advanced_user/data_types.rst
@@ -77,23 +77,20 @@ The variable name is based on the filename, in this case `"GLOBCOVER_200901_2009
-The `chunks` key-word argument is passed to :py:meth:`~hydromt.io.open_mfraster`
+The `chunks` keyword argument is passed to :py:meth:`~hydromt.io.open_mfraster`
 and allows lazy reading of the data.
 
-.. code-block:: yaml
-
-    globcover:
-      uri: base/landcover/globcover/GLOBCOVER_200901_200912_300x300m.tif
-      data_type: RasterDataset
-      driver:
-        name: raster
-        options:
-          chunks:
-            x: 3600
-            y: 3600
-      meta:
-        category: landuse
-        source_url: http://due.esrin.esa.int/page_globcover.php
-        source_license: CC-BY-3.0
-        paper_ref: Arino et al (2012)
-        paper_doi: 10.1594/PANGAEA.787668
+.. literalinclude:: ../../assets/data_types/single_variable_geotiff_raster.yml
+   :language: yaml
+
+.. testsetup:: *
+
+    from hydromt import DataCatalog
+
+.. testcode:: geotiff
+    :hide:
+
+    catalog_path = "docs/assets/data_types/single_variable_geotiff_raster.yml"
+
+    catalog = DataCatalog(fallback_lib=None)  # do not read default catalog
+    catalog.from_yml(catalog_path)

.. _VRT:

@@ -113,32 +110,16 @@ for large raster datasets which are often tiled and can be combined using
 `gdalbuildvrt. `_
 
-.. code-block:: yaml
-
-    merit_hydro:
-      uri: base/merit_hydro/{variable}.vrt
-      data_type: RasterDataset
-      driver:
-        name: raster
-        options:
-          chunks:
-            x: 6000
-            y: 6000
-      data_adapter:
-        rename:
-          dir: flwdir
-          bas: basins
-          upa: uparea
-          elv: elevtn
-          sto: strord
-      metadata:
-        crs: 4326
-        category: topography
-        source_version: 1.0
-        paper_doi: 10.1029/2019WR024873
-        paper_ref: Dai Yamazaki
-        source_url: http://hydro.iis.u-tokyo.ac.jp/~yamadai/MERIT_Hydro
-        source_license: CC-BY-NC 4.0 or ODbL 1.0
+.. literalinclude:: ../../assets/data_types/vrt_raster_dataset.yml
+   :language: yaml
+
+.. testcode:: geotiff
+    :hide:
+
+    catalog_path = "docs/assets/data_types/vrt_raster_dataset.yml"
+
+    catalog = DataCatalog(fallback_lib=None)  # do not read default catalog
+    catalog.from_yml(catalog_path)

.. _Tile:

@@ -161,27 +142,16 @@ set the resampling `method`. The name of the column in the tile index attribute
 (to be directly passed as an argument to
 :py:meth:`~hydromt._io._open_raster_from_tindex`).

-.. 
code-block:: yaml - - grwl_mask: - path: static_data/base/grwl/tindex.gpkg - data_type: RasterDataset - driver: raster_tindex - options: - chunks: - x: 3000 - y: 3000 - mosaic_kwargs: - method: nearest - tileindex: location - meta: - nodata: 0 - category: hydrography - paper_doi: 10.1126/science.aat0636 - paper_ref: Allen and Pavelsky (2018) - source_license: CC BY 4.0 - source_url: https://doi.org/10.5281/zenodo.1297434 - source_version: 1.01 +.. literalinclude:: ../../assets/data_types/tiled_raster_dataset.yml + :language: yaml + +.. testcode:: geotiff + :hide: + + catalog_path = "docs/assets/data_types/tiled_raster_dataset.yml" + + catalog = DataCatalog(fallback_lib=None) # do not read default catalog + catalog.from_yml(catalog_path) .. NOTE:: @@ -222,34 +192,16 @@ see description of the `uri` argument in the :ref:`yaml file description `. -.. code-block:: yaml - - era5_hourly: - uri: forcing/ERA5/org/era5_{variable}_{year}_hourly.nc - data_type: RasterDataset - driver: - name: netcdf - options: - chunks: {latitude: 125, longitude: 120, time: 50} - combine: by_coords - decode_times: true - parallel: true - metadata: - crs: 4326 - category: meteo - paper_doi: 10.1002/qj.3803 - paper_ref: Hersbach et al. (2019) - source_license: https://cds.climate.copernicus.eu/cdsapp/#!/terms/licence-to-use-copernicus-products - source_url: https://doi.org/10.24381/cds.bd0915c6 - data_adapter: - rename: - t2m: temp - tp: precip - unit_add: - temp: -273.15 - unit_mult: - precip: 1000 +.. literalinclude:: ../../assets/data_types/netcdf_raster_dataset.yml + :language: yaml +.. testcode:: geotiff + :hide: + + catalog_path = "docs/assets/data_types/netcdf_raster_dataset.yml" + + catalog = DataCatalog(fallback_lib=None) # do not read default catalog + catalog.from_yml(catalog_path) Preprocess functions when combining multiple files """""""""""""""""""""""""""""""""""""""""""""""""" @@ -277,7 +229,7 @@ Vector data (GeoDataFrame) * - Driver - File formats - Comments - * - :py:class:`vector ` + * - :py:class:`pyogrio ` - ESRI Shapefile, GeoPackage, GeoJSON, etc. - Point, Line and Polygon geometries. Uses :py:func:`pyogrio.read_dataframe` * - :py:class:`geodataframe_table ` @@ -295,23 +247,20 @@ spatial index for fast filtering of the data based on spatial location. An examp shown below. Note that the rename, ``unit_mult``, ``unit_add`` and ``nodata`` options refer to columns of the attribute table in case of a GeoDataFrame. -.. code-block:: yaml - - GDP_world: - uri: base/emissions/GDP-countries/World_countries_GDPpcPPP.gpkg - data_type: GeoDataFrame - driver: - name: vector - options: - layer: GDP - data_adapter: - rename: - GDP: gdp - unit_mult: - gdp: 0.001 - metadata: - category: socio-economic - source_version: 1.0 +.. literalinclude:: ../../assets/data_types/gpkg_geodataframe.yml + :language: yaml + +.. testsetup:: * + + from hydromt import DataCatalog + +.. testcode:: geotiff + :hide: + + catalog_path = "docs/assets/data_types/gpkg_geodataframe.yml" + + catalog = DataCatalog(fallback_lib=None) # do not read default catalog + catalog.from_yml(catalog_path) .. _textdelimited_vector: @@ -347,14 +296,20 @@ of the GeoDataFrame attribute table. As the CRS of the coordinates cannot be inferred from the data it must be set in the data entry in the yaml file as shown in the example below. -.. code-block:: yaml +.. literalinclude:: ../../assets/data_types/csv_geodataframe.yml + :language: yaml + +.. testsetup:: * + + from hydromt import DataCatalog + +.. 
testcode:: geotiff + :hide: - stations: - uri: /path/to/stations.csv - data_type: GeoDataFrame - driver: geodataframe_table - metadata: - crs: 4326 + catalog_path = "docs/assets/data_types/csv_geodataframe.yml" + + catalog = DataCatalog(fallback_lib=None) # do not read default catalog + catalog.from_yml(catalog_path) .. _binary_vector: @@ -382,7 +337,7 @@ Geospatial point time-series (GeoDataset) * - Driver - File formats - Comments - * - :py:class:`vector ` + * - :py:class:`geodataset_vector ` - Combined point location (e.g. CSV or GeoJSON) and text delimited time-series (e.g. CSV) data. - Uses :py:meth:`~hydromt._io._open_vector`, :py:meth:`~hydromt._io._open_timeseries_from_table` * - :py:class:`geodataset_xarray ` @@ -421,30 +376,20 @@ see description of the `uri` argument in the :ref:`yaml file description `. -.. code-block:: yaml - - gtsmv3_eu_era5: - uri: reanalysis-waterlevel-{year}-m{month:02d}.nc - data_type: GeoDataset - driver: - name: netcdf - options: - chunks: {stations: 100, time: 1500} - combine: by_coords - decode_times: true - parallel: true - data_adapter: - rename: - station_x_coordinate: lon - station_y_coordinate: lat - stations: index - metadata: - crs: 4326 - category: ocean - paper_doi: 10.24381/cds.8c59054f - paper_ref: Copernicus Climate Change Service 2019 - source_license: https://cds.climate.copernicus.eu/cdsapp/#!/terms/licence-to-use-copernicus-products - source_url: https://cds.climate.copernicus.eu/cdsapp#!/dataset/10.24381/cds.8c59054f?tab=overview +.. literalinclude:: ../../assets/data_types/netcdf_geodataset.yml + :language: yaml + +.. testsetup:: * + + from hydromt import DataCatalog + +.. testcode:: geotiff + :hide: + + catalog_path = "docs/assets/data_types/netcdf_geodataset.yml" + + catalog = DataCatalog(fallback_lib=None) # do not read default catalog + catalog.from_yml(catalog_path) .. _CSV_point: @@ -463,17 +408,20 @@ referred to using the `data_path` option. The index of the time-series (in the c header) and point locations must match. For more options see the :py:meth:`~hydromt._io._open_geodataset` method. -.. code-block:: yaml +.. literalinclude:: ../../assets/data_types/csv_geodataset.yml + :language: yaml - waterlevels_txt: - uri: /path/to/stations.csv - data_type: GeoDataset - driver: - name: vector - options: - data_path: /path/to/stations_data.csv - metadata: - crs: 4326 +.. testsetup:: * + + from hydromt import DataCatalog + +.. testcode:: geotiff + :hide: + + catalog_path = "docs/assets/data_types/csv_geodataset.yml" + + catalog = DataCatalog(fallback_lib=None) # do not read default catalog + catalog.from_yml(catalog_path) *Tabulated time series text file* @@ -492,6 +440,7 @@ To read the time stamps the :py:func:`pandas.to_datetime` method is used. NetCDF time-series dataset (Dataset) ------------------------------------ + .. _dataset_formats: .. list-table:: @@ -507,7 +456,6 @@ NetCDF time-series dataset (Dataset) .. _NC_timeseries: - Netcdf time-series dataset ^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -516,12 +464,20 @@ NetCDF and zarr timeseries data are parsed to **Dataset** with the The resulting dataset is similar to the **GeoDataset** except that it lacks a spatial dimension. -.. code-block:: yaml +.. literalinclude:: ../../assets/data_types/netcdf_dataset.yml + :language: yaml + +.. testsetup:: * + + from hydromt import DataCatalog + +.. 
testcode:: geotiff
+    :hide:
+
+    catalog_path = "docs/assets/data_types/netcdf_dataset.yml"
 
-    timeseries_dataset:
-      uri: /path/to/timeseries.netcdf
-      data_type: Dataset
-      driver: netcdf
+
+    catalog = DataCatalog(fallback_lib=None)  # do not read default catalog
+    catalog.from_yml(catalog_path)
 
 .. _DataFrame:
 
@@ -557,19 +513,20 @@ parsing of datetime-strings. See the relevant pandas function for which argument
 used. Also note that the driver is not restricted to comma-separated files, as
 the delimiter can be given to the reader through the `options`.
 
-.. code-block:: yaml
-
-    observations:
-      uri: data/lulc/globcover_mapping.csv
-      data_type: DataFrame
-      driver:
-        name: csv
-        options:
-          header: null # null translates to None in Python -> no header
-          index_col: 0
-          parse_dates: false
-      metadata:
-        category: parameter_mapping
+.. literalinclude:: ../../assets/data_types/csv_dataframe.yml
+   :language: yaml
+
+.. testsetup:: *
+
+    from hydromt import DataCatalog
+
+.. testcode:: geotiff
+    :hide:
+
+    catalog_path = "docs/assets/data_types/csv_dataframe.yml"
+
+    catalog = DataCatalog(fallback_lib=None)  # do not read default catalog
+    catalog.from_yml(catalog_path)
 
-.. note:: The yml-parser does not correctly parses `None` arguments. When this is required, the `null` argument should be used instead.
+.. note:: The YAML parser does not correctly parse `None` arguments. When `None` is required, the `null` argument should be used instead.

From 42851904f5b54aa68f36bf1dabe0e27abd0b3896 Mon Sep 17 00:00:00 2001
From: Jaap <33715902+Jaapel@users.noreply.github.com>
Date: Fri, 6 Sep 2024 11:05:32 +0200
Subject: [PATCH 08/10] fix renamed mock

---
 tests/data_catalog/drivers/raster/test_rasterio_driver.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/data_catalog/drivers/raster/test_rasterio_driver.py b/tests/data_catalog/drivers/raster/test_rasterio_driver.py
index 3617dad76..b656dbd10 100644
--- a/tests/data_catalog/drivers/raster/test_rasterio_driver.py
+++ b/tests/data_catalog/drivers/raster/test_rasterio_driver.py
@@ -64,7 +64,7 @@ def test_sets_nodata(self, rioda: xr.DataArray, tmp_path: Path):
         )
         assert ds["test_sets_nodata"].raster.nodata == 42
 
-    @patch("hydromt.data_catalog.drivers.raster.rasterio_driver.open_mfraster")
+    @patch("hydromt.data_catalog.drivers.raster.rasterio_driver._open_mfraster")
     def test_sets_mosaic_kwargs(self, fake_open_mfraster: MagicMock):

From c23c8405a17eb75660aac04dfe6810bc1096135b Mon Sep 17 00:00:00 2001
From: Jaap <33715902+Jaapel@users.noreply.github.com>
Date: Fri, 6 Sep 2024 13:55:31 +0200
Subject: [PATCH 09/10] all references to _io private module gone

---
 docs/api/api.rst                              |  1 -
 docs/api/data_catalog.rst                     |  1 -
 docs/api/gis.rst                              | 44 ------------------
 docs/api/io.rst                               | 39 ----------------
 docs/conf.py                                  |  4 +-
 docs/guides/advanced_user/data_types.rst      | 15 +++---
 hydromt/_io/readers.py                        |  4 +-
 .../gis/{_raster_merge.py => raster_merge.py} |  4 +-
 8 files changed, 12 insertions(+), 100 deletions(-)
 delete mode 100644 docs/api/io.rst
 rename hydromt/gis/{_raster_merge.py => raster_merge.py} (99%)

diff --git a/docs/api/api.rst b/docs/api/api.rst
index bdde4ea95..cb5cb730e 100644
--- a/docs/api/api.rst
+++ b/docs/api/api.rst
@@ -16,7 +16,6 @@ API reference
     data_source
     drivers
     gis
-    io
     model
     stats
     plugin
diff --git a/docs/api/data_catalog.rst b/docs/api/data_catalog.rst
index 39fa7c2ea..4296f3711 100644
--- a/docs/api/data_catalog.rst
+++ b/docs/api/data_catalog.rst
@@ -16,7 +16,6 @@ General
 
     DataCatalog.sources
     DataCatalog.predefined_catalogs
     DataCatalog.to_dict
-    DataCatalog.to_dataframe
     DataCatalog.to_yml
     DataCatalog.export_data 
DataCatalog.get_source_bbox
diff --git a/docs/api/gis.rst b/docs/api/gis.rst
index 3004ab31e..a733aab02 100644
--- a/docs/api/gis.rst
+++ b/docs/api/gis.rst
@@ -285,47 +285,3 @@ visit the `pyflwdir docs. `_
     flw.outlet_map
     flw.clip_basins
     flw.dem_adjust
-
-.. _gis_utils_api:
-
-GIS utility methods
-===================
-
-Raster
-------
-
-.. autosummary::
-   :toctree: ../_generated
-
-   create_vrt.create_vrt
-   raster_utils.spread2d
-   raster_utils.reggrid_area
-   raster_utils.cellarea
-   raster_utils.cellres
-   raster_utils.meridian_offset
-   raster_utils.affine_to_coords
-   raster_utils.affine_to_meshgrid
-
-Vector
-------
-
-.. autosummary::
-   :toctree: ../_generated
-
-   vector_utils.filter_gdf
-   vector_utils.nearest
-   vector_utils.nearest_merge
-
-
-General
--------
-
-.. autosummary::
-   :toctree: ../_generated
-
-   gis_utils.parse_crs
-   gis_utils.utm_crs
-   gis_utils.bbox_from_file_and_filters
-   gis_utils.parse_geom_bbox_buffer
-   gis_utils.to_geographic_bbox
-   gis_utils.axes_attrs
diff --git a/docs/api/io.rst b/docs/api/io.rst
deleted file mode 100644
index d2b33848f..000000000
--- a/docs/api/io.rst
+++ /dev/null
@@ -1,39 +0,0 @@
-.. currentmodule:: hydromt.io
-
-=======================
-Reading/writing methods
-=======================
-
-.. _open_methods:
-
-Reading methods
-===============
-
-.. autosummary::
-   :toctree: ../_generated
-
-   configread
-   open_geodataset
-   open_mfcsv
-   open_mfraster
-   open_raster
-   open_raster_from_tindex
-   open_timeseries_from_table
-   open_vector
-   open_vector_from_table
-   read_nc
-   read_toml
-   read_yaml
-
-Writing methods
-===============
-
-.. autosummary::
-   :toctree: ../_generated
-
-   netcdf_writer
-   write_nc
-   write_toml
-   write_xy
-   write_yaml
-   zarr_writer
diff --git a/docs/conf.py b/docs/conf.py
index 871bacb02..60cded4fb 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -57,8 +57,8 @@ def write_panel(f, name, content="", level=0, item="dropdown"):
         f.write("\n")
 
 
-def write_nested_dropdown(name, data_cat, note="", categories=[]):
-    df = data_cat.to_dataframe().sort_index().drop_duplicates("uri")
+def write_nested_dropdown(name, data_cat: hydromt.DataCatalog, note="", categories=[]):
+    df = data_cat._to_dataframe().sort_index().drop_duplicates("uri")
     with open(f"_generated/{name}.rst", mode="w") as f:
         write_panel(f, name, note, level=0)
         write_panel(f, "", level=1, item="tab-set")
diff --git a/docs/guides/advanced_user/data_types.rst b/docs/guides/advanced_user/data_types.rst
index f201035d9..fe63a5138 100644
--- a/docs/guides/advanced_user/data_types.rst
+++ b/docs/guides/advanced_user/data_types.rst
@@ -136,11 +136,9 @@ intersecting footprints will be merged together in the CRS of the most central t
-Use `gdaltindex `_ to build an excepted tile index file. Here a GeoPackage with the tile index referring to individual GeoTiff raster tiles is used.
-The `mosaic_kwargs` are passed to :py:meth:`~hydromt._io._open_raster_from_tindex` to
+Use `gdaltindex `_ to build an expected tile index file. Here a GeoPackage with the tile index referring to individual GeoTiff raster tiles is used.
+The `mosaic_kwargs` are passed to :py:meth:`hydromt.gis.merge` to
 set the resampling `method`. The name of the column in the tile index attribute table
-`tileindex` which contains the raster tile file names is set in the `driver.options``
-(to be directly passed as an argument to
-:py:meth:`~hydromt._io._open_raster_from_tindex`).
+`tileindex` which contains the raster tile file names is set in the `driver.options`.
 
 .. literalinclude:: ../../assets/data_types/tiled_raster_dataset.yml
    :language: yaml
@@ -234,7 +232,7 @@ Vector data (GeoDataFrame)
      - Point, Line and Polygon geometries. Uses :py:func:`pyogrio.read_dataframe`
    * - :py:class:`geodataframe_table `
      - CSV, XY, PARQUET and EXCEL.
-     - Point geometries only. 
Uses :py:meth:`~hydromt._io._open_vector_from_table`
+     - Point geometries only.
 
 .. _GPKG_vector:
 
@@ -338,8 +336,9 @@ Geospatial point time-series (GeoDataset)
     - File formats
     - Comments
    * - :py:class:`geodataset_vector `
-     - Combined point location (e.g. CSV or GeoJSON) and text delimited time-series (e.g. CSV) data.
-     - Uses :py:meth:`~hydromt._io._open_vector`, :py:meth:`~hydromt._io._open_timeseries_from_table`
+     - Combined point location (e.g. CSV or GeoJSON) and text delimited time-series
+       (e.g. CSV) data.
+     -
    * - :py:class:`geodataset_xarray `
      - NetCDF and Zarr
      - required time and index dimensions_ and x- and y coordinates.
@@ -405,8 +404,7 @@ earlier examples for GeoDataFrame datasets. Finally, certain binary formats such
-parquet are also supported. In addition a tabulated time-series text file can be passed
+parquet are also supported. In addition, a tabulated time-series text file can be passed
 to be used as a variable of the GeoDataset. This data is added by a second file which is
 referred to using the `data_path` option. The index of the time-series (in the columns
-header) and point locations must match. For more options see the
-:py:meth:`~hydromt._io._open_geodataset` method.
+header) and point locations must match.
 
 .. literalinclude:: ../../assets/data_types/csv_geodataset.yml
    :language: yaml
@@ -425,7 +423,6 @@ header) and point locations must match. For more options see the
 
 *Tabulated time series text file*
 
-This data is read using the :py:meth:`~hydromt._io._open_timeseries_from_table` method.
-To read the time stamps the :py:func:`pandas.to_datetime` method is used.
+To read the time stamps, the :py:func:`pandas.to_datetime` function is used.
 
 .. code-block:: console
diff --git a/hydromt/_io/readers.py b/hydromt/_io/readers.py
index 3b84d0c57..aa964d342 100644
--- a/hydromt/_io/readers.py
+++ b/hydromt/_io/readers.py
@@ -29,8 +29,8 @@
 from hydromt._utils.path import _make_config_paths_abs
 from hydromt._utils.uris import _is_valid_url
 from hydromt.gis import _gis_utils, _vector_utils, raster, vector
-from hydromt.gis._raster_merge import _merge
 from hydromt.gis.raster import GEO_MAP_COORD
+from hydromt.gis.raster_merge import merge
 
 if TYPE_CHECKING:
     from hydromt._validators.model_config import HydromtModelStep
@@ -360,7 +360,7 @@ def _open_mfraster(
             da = da.sortby(concat_dim).transpose(concat_dim, ...)
             da.attrs.update(da_lst[0].attrs)
         else:
-            da = _merge(da_lst, **mosaic_kwargs)  # spatial merge
+            da = merge(da_lst, **mosaic_kwargs)  # spatial merge
         da.attrs.update({"source_file": "; ".join(file_attrs)})
         ds = da.to_dataset()  # dataset for consistency
     else:
diff --git a/hydromt/gis/_raster_merge.py b/hydromt/gis/raster_merge.py
similarity index 99%
rename from hydromt/gis/_raster_merge.py
rename to hydromt/gis/raster_merge.py
index a479e10f6..26782efd7 100644
--- a/hydromt/gis/_raster_merge.py
+++ b/hydromt/gis/raster_merge.py
@@ -6,10 +6,10 @@
 from hydromt.gis.raster import full_from_transform
 
-__all__ = ["_merge"]
+__all__ = ["merge"]
 
 
-def _merge(
+def merge(
     data_arrays,
     dst_crs=None,
     dst_bounds=None,

From 72698feb788bf339871a56b50ccb5fd3b1ca75af Mon Sep 17 00:00:00 2001
From: Jaap <33715902+Jaapel@users.noreply.github.com>
Date: Fri, 6 Sep 2024 14:15:35 +0200
Subject: [PATCH 10/10] add some documentation help

---
 docs/guides/core_dev/documentation.rst | 13 +++++++++++++
 docs/guides/core_dev/index.rst         |  1 +
 2 files changed, 14 insertions(+)
 create mode 100644 docs/guides/core_dev/documentation.rst

diff --git a/docs/guides/core_dev/documentation.rst b/docs/guides/core_dev/documentation.rst
new file mode 100644
index 000000000..bcf9aa327
--- /dev/null
+++ b/docs/guides/core_dev/documentation.rst
@@ -0,0 +1,13 @@
+.. 
_contribute_documentation:
+
+Adding Documentation
+====================
+
+There are a few guidelines to follow when adding new documentation or refactoring
+the existing documentation.
+
+- We use the `numpy docstring format `_.
+- Code examples or example ``yaml`` files should be tested using the Sphinx extension
+  ``doctest``.
+- New APIs should be added to the ``docs/api`` folder. The builtin ``autosummary``
+  and ``toctree`` directives are used to keep track of the documented API pages.
diff --git a/docs/guides/core_dev/index.rst b/docs/guides/core_dev/index.rst
index 86b698fda..60a7918d6 100644
--- a/docs/guides/core_dev/index.rst
+++ b/docs/guides/core_dev/index.rst
@@ -4,4 +4,5 @@ Core developer guide
 .. toctree::

    contributing
+    documentation
    dev_install
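[Editor's note] To make the first guideline above concrete, here is a minimal sketch of a numpy-format docstring; the function, its parameters, and the formula are illustrative only and not part of the HydroMT API:

.. code-block:: python

    import math


    def cell_area(lat: float, resolution: float) -> float:
        """Approximate the area of a square grid cell at a given latitude.

        Parameters
        ----------
        lat : float
            Latitude of the cell centre in degrees.
        resolution : float
            Cell size in degrees.

        Returns
        -------
        float
            Cell area in square kilometres.
        """
        height = 111.32 * resolution  # ~km per degree along a meridian
        width = height * math.cos(math.radians(lat))  # shrinks towards the poles
        return height * width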