diff --git a/data/catalogs/test_data_catalog.py b/data/catalogs/test_data_catalog.py index f691b2a73..77dc4524e 100644 --- a/data/catalogs/test_data_catalog.py +++ b/data/catalogs/test_data_catalog.py @@ -75,7 +75,7 @@ def test_data_catalog(args, datacatalog): logger.info("Checking paths of data catalog sources") for source_name, source in datacatalog.__iter__(): logger.info(f"Checking paths of {source_name}") - if isinstance(source.driver.metadata_resolver, RasterTindexResolver): + if isinstance(source.uri_resolver, RasterTindexResolver): if not exists(source.full_uri): error_count += 1 logger.error( @@ -84,7 +84,7 @@ def test_data_catalog(args, datacatalog): continue else: - paths = source.driver.metadata_resolver.resolve( + paths = source.uri_resolver.resolve( source.full_uri, source.driver.filesystem ) for path in paths: diff --git a/docs/api/api.rst b/docs/api/api.rst index d241d399e..cb5cb730e 100644 --- a/docs/api/api.rst +++ b/docs/api/api.rst @@ -11,12 +11,13 @@ API reference :maxdepth: 2 cli + data_adapter data_catalog data_source drivers gis - io model stats plugin utils + uri_resolvers diff --git a/docs/api/data_adapter.rst b/docs/api/data_adapter.rst new file mode 100644 index 000000000..1c32e26a9 --- /dev/null +++ b/docs/api/data_adapter.rst @@ -0,0 +1,49 @@ +.. currentmodule:: hydromt.data_catalog.adapters + +DataAdapter +=========== + +RasterDataset +------------- + +.. autosummary:: + :toctree: ../_generated + + RasterDatasetAdapter + RasterDatasetAdapter.transform + +GeoDataset +---------- + +.. autosummary:: + :toctree: ../_generated + + GeoDatasetAdapter + GeoDatasetAdapter.transform + +GeoDataFrame +------------ + +.. autosummary:: + :toctree: ../_generated + + GeoDataFrameAdapter + GeoDataFrameAdapter.transform + +DataFrame +--------- + +.. autosummary:: + :toctree: ../_generated + + DataFrameAdapter + DataFrameAdapter.transform + +Dataset +------- + +.. 
autosummary:: + :toctree: ../_generated + + DatasetAdapter + DatasetAdapter.transform diff --git a/docs/api/data_catalog.rst b/docs/api/data_catalog.rst index 794b64944..4296f3711 100644 --- a/docs/api/data_catalog.rst +++ b/docs/api/data_catalog.rst @@ -1,9 +1,5 @@ .. currentmodule:: hydromt.data_catalog -==== -Data -==== - .. _api_data_catalog: Data catalog @@ -20,7 +16,6 @@ General DataCatalog.sources DataCatalog.predefined_catalogs DataCatalog.to_dict - DataCatalog.to_dataframe DataCatalog.to_yml DataCatalog.export_data DataCatalog.get_source_bbox @@ -63,244 +58,3 @@ Predefined data catalog PredefinedCatalog.get_catalog_file predefined_catalog.create_registry_file - - -DataSource -========== - -General -------- - -.. autosummary:: - :toctree: ../_generated - - sources.DataSource - sources.DataSource.summary - -RasterDataset -------------- - -.. autosummary:: - :toctree: ../_generated - - sources.RasterDatasetSource - sources.RasterDatasetSource.read_data - sources.RasterDatasetSource.to_stac_catalog - sources.RasterDatasetSource.get_bbox - sources.RasterDatasetSource.get_time_range - sources.RasterDatasetSource.detect_bbox - sources.RasterDatasetSource.detect_time_range - -GeoDataFrame ------------- - -.. autosummary:: - :toctree: ../_generated - - sources.GeoDataFrameSource - sources.GeoDataFrameSource.read_data - sources.GeoDataFrameSource.to_stac_catalog - sources.GeoDataFrameSource.get_bbox - sources.GeoDataFrameSource.detect_bbox - -DataFrame ---------- - -.. autosummary:: - :toctree: ../_generated - - sources.DataFrameSource - sources.DataFrameSource.read_data - sources.DataFrameSource.to_stac_catalog - -GeoDataset ------------- - -.. autosummary:: - :toctree: ../_generated - - sources.GeoDatasetSource - sources.GeoDatasetSource.read_data - sources.GeoDatasetSource.to_stac_catalog - sources.GeoDatasetSource.get_bbox - sources.GeoDatasetSource.detect_bbox - -URIResolver -================ - -General -------- - -.. 
autosummary:: - :toctree: ../_generated - - uri_resolvers.URIResolver - uri_resolvers.URIResolver.resolve - -ConventionResolver ------------------- - -.. autosummary:: - :toctree: ../_generated - - uri_resolvers.ConventionResolver - uri_resolvers.ConventionResolver.resolve - -RasterTindexResolver --------------------- -.. autosummary:: - :toctree: ../_generated - - uri_resolvers.RasterTindexResolver - uri_resolvers.RasterTindexResolver.resolve - -Driver -====== - -General -------- - -.. autosummary:: - :toctree: ../_generated - - drivers.base_driver.BaseDriver - -RasterDataset -------------- - -.. autosummary:: - :toctree: ../_generated - - drivers.raster.raster_dataset_driver.RasterDatasetDriver - drivers.raster.raster_dataset_driver.RasterDatasetDriver.read - drivers.raster.raster_dataset_driver.RasterDatasetDriver.write - -RasterDatasetXarrayDriver -------------------------- - -.. autosummary:: - :toctree: ../_generated - - drivers.raster.raster_xarray_driver.RasterDatasetXarrayDriver - drivers.raster.raster_xarray_driver.RasterDatasetXarrayDriver.read - drivers.raster.raster_xarray_driver.RasterDatasetXarrayDriver.write - -RasterioDriver --------------- - -.. autosummary:: - :toctree: ../_generated - - drivers.raster.rasterio_driver.RasterioDriver - drivers.raster.rasterio_driver.RasterioDriver.read - drivers.raster.rasterio_driver.RasterioDriver.write - -GeoDataFrame ------------- - -.. autosummary:: - :toctree: ../_generated - - drivers.geodataframe.geodataframe_driver.GeoDataFrameDriver - drivers.geodataframe.geodataframe_driver.GeoDataFrameDriver.read - drivers.geodataframe.geodataframe_driver.GeoDataFrameDriver.write - -PyogrioDriver -------------- - -.. autosummary:: - :toctree: ../_generated - - drivers.geodataframe.pyogrio_driver.PyogrioDriver - drivers.geodataframe.pyogrio_driver.PyogrioDriver.read - drivers.geodataframe.pyogrio_driver.PyogrioDriver.write - -GeoDataFrameTableDriver ------------------------ - -.. 
autosummary:: - :toctree: ../_generated - - drivers.geodataframe.table_driver.GeoDataFrameTableDriver - drivers.geodataframe.table_driver.GeoDataFrameTableDriver.read - drivers.geodataframe.table_driver.GeoDataFrameTableDriver.write - -DataFrame ---------- - -.. autosummary:: - :toctree: ../_generated - - drivers.dataframe.dataframe_driver.DataFrameDriver - drivers.dataframe.dataframe_driver.DataFrameDriver.read - drivers.dataframe.dataframe_driver.DataFrameDriver.write - -PandasDriver ------------- - -.. autosummary:: - :toctree: ../_generated - - drivers.dataframe.pandas_driver.PandasDriver - drivers.dataframe.pandas_driver.PandasDriver.read - drivers.dataframe.pandas_driver.PandasDriver.write - -GeoDataFrame ------------- - -.. autosummary:: - :toctree: ../_generated - - drivers.geodataset.geodataset_driver.GeoDatasetDriver - drivers.geodataset.geodataset_driver.GeoDatasetDriver.read - drivers.geodataset.geodataset_driver.GeoDatasetDriver.write - -DataAdapter -=========== - -General -------- - -RasterDataset -------------- - -.. autosummary:: - :toctree: ../_generated - - adapters.RasterDatasetAdapter - adapters.RasterDatasetAdapter.transform - -GeoDataset ----------- - -.. autosummary:: - :toctree: ../_generated - - adapters.GeoDatasetAdapter - adapters.GeoDatasetAdapter.transform - -GeoDataFrame ------------- - -.. autosummary:: - :toctree: ../_generated - - adapters.GeoDataFrameAdapter - adapters.GeoDataFrameAdapter.transform - -DataFrame ---------- - -.. autosummary:: - :toctree: ../_generated - - adapters.dataframe.DataFrameAdapter - adapters.dataframe.DataFrameAdapter.transform - -Dataset -------- - -.. autosummary:: - :toctree: ../_generated - - adapters.DatasetAdapter diff --git a/docs/api/data_source.rst b/docs/api/data_source.rst index 15ea2e999..4ce956d5a 100644 --- a/docs/api/data_source.rst +++ b/docs/api/data_source.rst @@ -1,5 +1,64 @@ +.. currentmodule:: hydromt.data_catalog.sources + .. 
_data_source: ============ Data sources ============ + +General +------- + +.. autosummary:: + :toctree: ../_generated + + DataSource + DataSource.summary + +RasterDataset +------------- + +.. autosummary:: + :toctree: ../_generated + + RasterDatasetSource + RasterDatasetSource.read_data + RasterDatasetSource.to_stac_catalog + RasterDatasetSource.get_bbox + RasterDatasetSource.get_time_range + RasterDatasetSource.detect_bbox + RasterDatasetSource.detect_time_range + +GeoDataFrame +------------ + +.. autosummary:: + :toctree: ../_generated + + GeoDataFrameSource + GeoDataFrameSource.read_data + GeoDataFrameSource.to_stac_catalog + GeoDataFrameSource.get_bbox + GeoDataFrameSource.detect_bbox + +DataFrame +--------- + +.. autosummary:: + :toctree: ../_generated + + DataFrameSource + DataFrameSource.read_data + DataFrameSource.to_stac_catalog + +GeoDataset +------------ + +.. autosummary:: + :toctree: ../_generated + + GeoDatasetSource + GeoDatasetSource.read_data + GeoDatasetSource.to_stac_catalog + GeoDatasetSource.get_bbox + GeoDatasetSource.detect_bbox diff --git a/docs/api/drivers.rst b/docs/api/drivers.rst index 9a696c5fa..26a23f3bc 100644 --- a/docs/api/drivers.rst +++ b/docs/api/drivers.rst @@ -1,5 +1,145 @@ +.. currentmodule:: hydromt.data_catalog.drivers + .. _drivers: ======= Drivers ======= + +Base +---- + +.. autosummary:: + :toctree: ../_generated + + BaseDriver + +RasterDataset +------------- + +.. autosummary:: + :toctree: ../_generated + + RasterDatasetDriver + RasterDatasetDriver.read + RasterDatasetDriver.write + +RasterDatasetXarrayDriver +^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: ../_generated + + RasterDatasetXarrayDriver + RasterDatasetXarrayDriver.read + RasterDatasetXarrayDriver.write + +RasterioDriver +^^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: ../_generated + + RasterioDriver + RasterioDriver.read + RasterioDriver.write + +GeoDataFrame +------------ + +.. 
autosummary:: + :toctree: ../_generated + + GeoDataFrameDriver + GeoDataFrameDriver.read + GeoDataFrameDriver.write + +PyogrioDriver +^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: ../_generated + + PyogrioDriver + PyogrioDriver.read + PyogrioDriver.write + +GeoDataFrameTableDriver +^^^^^^^^^^^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: ../_generated + + GeoDataFrameTableDriver + GeoDataFrameTableDriver.read + GeoDataFrameTableDriver.write + +DataFrame +--------- + +.. autosummary:: + :toctree: ../_generated + + DataFrameDriver + DataFrameDriver.read + DataFrameDriver.write + +PandasDriver +^^^^^^^^^^^^ + +.. autosummary:: + :toctree: ../_generated + + PandasDriver + PandasDriver.read + PandasDriver.write + +GeoDataset +---------- + +.. autosummary:: + :toctree: ../_generated + + GeoDatasetDriver + GeoDatasetDriver.read + GeoDatasetDriver.write + +GeoDatasetXarrayDriver +^^^^^^^^^^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: ../_generated + + GeoDatasetXarrayDriver + GeoDatasetXarrayDriver.read + GeoDatasetXarrayDriver.write + +GeoDatasetVectorDriver +^^^^^^^^^^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: ../_generated + + GeoDatasetVectorDriver + GeoDatasetVectorDriver.read + GeoDatasetVectorDriver.write + +Dataset +------- + +.. autosummary:: + :toctree: ../_generated + + DatasetDriver + DatasetDriver.read + DatasetDriver.write + +DatasetXarrayDriver +^^^^^^^^^^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: ../_generated + + DatasetXarrayDriver + DatasetXarrayDriver.read + DatasetXarrayDriver.write diff --git a/docs/api/gis.rst b/docs/api/gis.rst index 3004ab31e..a733aab02 100644 --- a/docs/api/gis.rst +++ b/docs/api/gis.rst @@ -285,47 +285,3 @@ visit the `pyflwdir docs. `_ flw.outlet_map flw.clip_basins flw.dem_adjust - -.. _gis_utils_api: - -GIS utility methods -=================== - -Raster ------- - -.. 
autosummary:: - :toctree: ../_generated - - create_vrt.create_vrt - raster_utils.spread2d - raster_utils.reggrid_area - raster_utils.cellarea - raster_utils.cellres - raster_utils.meridian_offset - raster_utils.affine_to_coords - raster_utils.affine_to_meshgrid - -Vector ------- - -.. autosummary:: - :toctree: ../_generated - - vector_utils.filter_gdf - vector_utils.nearest - vector_utils.nearest_merge - - -General -------- - -.. autosummary:: - :toctree: ../_generated - - gis_utils.parse_crs - gis_utils.utm_crs - gis_utils.bbox_from_file_and_filters - gis_utils.parse_geom_bbox_buffer - gis_utils.to_geographic_bbox - gis_utils.axes_attrs diff --git a/docs/api/io.rst b/docs/api/io.rst deleted file mode 100644 index d2b33848f..000000000 --- a/docs/api/io.rst +++ /dev/null @@ -1,39 +0,0 @@ -.. currentmodule:: hydromt.io - -======================= -Reading/writing methods -======================= - -.. _open_methods: - -Reading methods -=============== - -.. autosummary:: - :toctree: ../_generated - - configread - open_geodataset - open_mfcsv - open_mfraster - open_raster - open_raster_from_tindex - open_timeseries_from_table - open_vector - open_vector_from_table - read_nc - read_toml - read_yaml - -Writing methods -=============== - -.. autosummary:: - :toctree: ../_generated - - netcdf_writer - write_nc - write_toml - write_xy - write_yaml - zarr_writer diff --git a/docs/api/uri_resolvers.rst b/docs/api/uri_resolvers.rst new file mode 100644 index 000000000..ca5015d4e --- /dev/null +++ b/docs/api/uri_resolvers.rst @@ -0,0 +1,32 @@ +=========== +URIResolver +=========== + +.. currentmodule:: hydromt.data_catalog.uri_resolvers + +General +------- + +.. autosummary:: + :toctree: ../_generated + + URIResolver + URIResolver.resolve + +ConventionResolver +------------------ + +.. autosummary:: + :toctree: ../_generated + + ConventionResolver + ConventionResolver.resolve + +RasterTindexResolver +-------------------- + +.. 
autosummary:: + :toctree: ../_generated + + RasterTindexResolver + RasterTindexResolver.resolve diff --git a/docs/assets/data_types/csv_dataframe.yml b/docs/assets/data_types/csv_dataframe.yml new file mode 100644 index 000000000..532caca01 --- /dev/null +++ b/docs/assets/data_types/csv_dataframe.yml @@ -0,0 +1,11 @@ +observations: + uri: data/lulc/globcover_mapping.csv + data_type: DataFrame + driver: + name: pandas + options: + header: null # null translates to None in Python -> no header + index_col: 0 + parse_dates: false + metadata: + category: parameter_mapping diff --git a/docs/assets/data_types/csv_geodataframe.yml b/docs/assets/data_types/csv_geodataframe.yml new file mode 100644 index 000000000..6fa6c1c6c --- /dev/null +++ b/docs/assets/data_types/csv_geodataframe.yml @@ -0,0 +1,6 @@ +stations: + uri: /path/to/stations.csv + data_type: GeoDataFrame + driver: geodataframe_table + metadata: + crs: 4326 diff --git a/docs/assets/data_types/csv_geodataset.yml b/docs/assets/data_types/csv_geodataset.yml new file mode 100644 index 000000000..212a9861f --- /dev/null +++ b/docs/assets/data_types/csv_geodataset.yml @@ -0,0 +1,9 @@ +waterlevels_txt: + uri: /path/to/stations.csv + data_type: GeoDataset + driver: + name: geodataset_vector + options: + data_path: /path/to/stations_data.csv + metadata: + crs: 4326 diff --git a/docs/assets/data_types/gpkg_geodataframe.yml b/docs/assets/data_types/gpkg_geodataframe.yml new file mode 100644 index 000000000..1cbc6d962 --- /dev/null +++ b/docs/assets/data_types/gpkg_geodataframe.yml @@ -0,0 +1,15 @@ +GDP_world: + uri: base/emissions/GDP-countries/World_countries_GDPpcPPP.gpkg + data_type: GeoDataFrame + driver: + name: pyogrio + options: + layer: GDP + data_adapter: + rename: + GDP: gdp + unit_mult: + gdp: 0.001 + metadata: + category: socio-economic + source_version: 1.0 diff --git a/docs/assets/data_types/netcdf_dataset.yml b/docs/assets/data_types/netcdf_dataset.yml new file mode 100644 index 000000000..8b279d735 --- 
/dev/null +++ b/docs/assets/data_types/netcdf_dataset.yml @@ -0,0 +1,4 @@ +timeseries_dataset: + uri: /path/to/timeseries.netcdf + data_type: Dataset + driver: dataset_xarray diff --git a/docs/assets/data_types/netcdf_geodataset.yml b/docs/assets/data_types/netcdf_geodataset.yml new file mode 100644 index 000000000..ae3a13cb6 --- /dev/null +++ b/docs/assets/data_types/netcdf_geodataset.yml @@ -0,0 +1,22 @@ +gtsmv3_eu_era5: + uri: reanalysis-waterlevel-{year}-m{month:02d}.nc + data_type: GeoDataset + driver: + name: geodataset_xarray + options: + chunks: {stations: 100, time: 1500} + combine: by_coords + decode_times: true + parallel: true + data_adapter: + rename: + station_x_coordinate: lon + station_y_coordinate: lat + stations: index + metadata: + crs: 4326 + category: ocean + paper_doi: 10.24381/cds.8c59054f + paper_ref: Copernicus Climate Change Service 2019 + source_license: https://cds.climate.copernicus.eu/cdsapp/#!/terms/licence-to-use-copernicus-products + source_url: https://cds.climate.copernicus.eu/cdsapp#!/dataset/10.24381/cds.8c59054f?tab=overview diff --git a/docs/assets/data_types/netcdf_raster_dataset.yml b/docs/assets/data_types/netcdf_raster_dataset.yml new file mode 100644 index 000000000..3d3d70832 --- /dev/null +++ b/docs/assets/data_types/netcdf_raster_dataset.yml @@ -0,0 +1,26 @@ + +era5_hourly: + uri: forcing/ERA5/org/era5_{variable}_{year}_hourly.nc + data_type: RasterDataset + driver: + name: raster_xarray + options: + chunks: {latitude: 125, longitude: 120, time: 50} + combine: by_coords + decode_times: true + parallel: true + metadata: + crs: 4326 + category: meteo + paper_doi: 10.1002/qj.3803 + paper_ref: Hersbach et al. 
(2019) + source_license: https://cds.climate.copernicus.eu/cdsapp/#!/terms/licence-to-use-copernicus-products + source_url: https://doi.org/10.24381/cds.bd0915c6 + data_adapter: + rename: + t2m: temp + tp: precip + unit_add: + temp: -273.15 + unit_mult: + precip: 1000 diff --git a/docs/assets/data_types/single_variable_geotiff_raster.yml b/docs/assets/data_types/single_variable_geotiff_raster.yml new file mode 100644 index 000000000..cbdec0b15 --- /dev/null +++ b/docs/assets/data_types/single_variable_geotiff_raster.yml @@ -0,0 +1,15 @@ +globcover: + uri: base/landcover/globcover/GLOBCOVER_200901_200912_300x300m.tif + data_type: RasterDataset + driver: + name: rasterio + options: + chunks: + x: 3600 + y: 3600 + metadata: + category: landuse + source_url: http://due.esrin.esa.int/page_globcover.php + source_license: CC-BY-3.0 + paper_ref: Arino et al (2012) + paper_doi: 10.1594/PANGAEA.787668 diff --git a/docs/assets/data_types/tiled_raster_dataset.yml b/docs/assets/data_types/tiled_raster_dataset.yml new file mode 100644 index 000000000..866025681 --- /dev/null +++ b/docs/assets/data_types/tiled_raster_dataset.yml @@ -0,0 +1,23 @@ +grwl_mask: + uri: static_data/base/grwl/tindex.gpkg + data_type: RasterDataset + uri_resolver: + name: raster_tindex + options: + tileindex: location + driver: + name: rasterio + options: + chunks: + x: 3000 + y: 3000 + mosaic_kwargs: + method: nearest + metadata: + nodata: 0 + category: hydrography + paper_doi: 10.1126/science.aat0636 + paper_ref: Allen and Pavelsky (2018) + source_license: CC BY 4.0 + source_url: https://doi.org/10.5281/zenodo.1297434 + source_version: 1.01 diff --git a/docs/assets/data_types/vrt_raster_dataset.yml b/docs/assets/data_types/vrt_raster_dataset.yml new file mode 100644 index 000000000..0d3d37f7f --- /dev/null +++ b/docs/assets/data_types/vrt_raster_dataset.yml @@ -0,0 +1,24 @@ +merit_hydro: + uri: base/merit_hydro/{variable}.vrt + data_type: RasterDataset + driver: + name: rasterio + options: + chunks: + 
x: 6000 + y: 6000 + data_adapter: + rename: + dir: flwdir + bas: basins + upa: uparea + elv: elevtn + sto: strord + metadata: + crs: 4326 + category: topography + source_version: 1.0 + paper_doi: 10.1029/2019WR024873 + paper_ref: Dai Yamazaki + source_url: http://hydro.iis.u-tokyo.ac.jp/~yamadai/MERIT_Hydro + source_license: CC-BY-NC 4.0 or ODbL 1.0 diff --git a/docs/conf.py b/docs/conf.py index 871bacb02..60cded4fb 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -57,8 +57,8 @@ def write_panel(f, name, content="", level=0, item="dropdown"): f.write("\n") -def write_nested_dropdown(name, data_cat, note="", categories=[]): - df = data_cat.to_dataframe().sort_index().drop_duplicates("uri") +def write_nested_dropdown(name, data_cat: hydromt.DataCatalog, note="", categories=[]): + df = data_cat._to_dataframe().sort_index().drop_duplicates("uri") with open(f"_generated/{name}.rst", mode="w") as f: write_panel(f, name, note, level=0) write_panel(f, "", level=1, item="tab-set") diff --git a/docs/guides/advanced_user/data_prepare_cat.rst b/docs/guides/advanced_user/data_prepare_cat.rst index c936f0315..31ea0af0e 100644 --- a/docs/guides/advanced_user/data_prepare_cat.rst +++ b/docs/guides/advanced_user/data_prepare_cat.rst @@ -29,8 +29,8 @@ shown keys is highly recommended. The ``rename``, ``nodata``, ``unit_add`` and ``unit_mult`` options are set per variable (or attribute table column in case of a GeoDataFrame). -.. include:: ../../assets/example_catalog.yml - :code: yaml +.. literalinclude:: ../../assets/example_catalog.yml + :language: yaml .. testsetup:: * diff --git a/docs/guides/advanced_user/data_sources.rst b/docs/guides/advanced_user/data_sources.rst deleted file mode 100644 index 56b3ea200..000000000 --- a/docs/guides/advanced_user/data_sources.rst +++ /dev/null @@ -1,4 +0,0 @@ -.. 
_data_sources: - -Data sources -============ diff --git a/docs/guides/advanced_user/data_types.rst b/docs/guides/advanced_user/data_types.rst index 837ac3cd6..fe63a5138 100644 --- a/docs/guides/advanced_user/data_types.rst +++ b/docs/guides/advanced_user/data_types.rst @@ -1,5 +1,7 @@ .. _data_types: +.. currentmodule:: hydromt.data_catalog.drivers + Supported data types ==================== @@ -11,15 +13,16 @@ HydroMT currently supports the following data types: - :ref:`Dataset `: non-spatial n-dimensional data - :ref:`DataFrame `: 2D tabular data -Internally the RasterDataset, GeoDataset, and Dataset are represented by :py:class:`xarray.Dataset` objects, -the GeoDataFrame by :py:class:`geopandas.GeoDataFrame`, and the DataFrame by -:py:class:`pandas.DataFrame`. We use drivers, typically from third-party packages and sometimes -wrapped in HydroMT functions, to parse many different file formats to this standardized internal -data representation. +Internally the RasterDataset, GeoDataset, and Dataset are represented by +:py:class:`xarray.Dataset` objects, the GeoDataFrame by +:py:class:`geopandas.GeoDataFrame`, and the DataFrame by :py:class:`pandas.DataFrame`. +We use drivers, typically from third-party packages and sometimes wrapped in HydroMT +functions, to parse many different file formats to this standardized internal data +representation. .. note:: - Please contact us through the issue list if you would like to add other drivers. + It is also possible to create your own driver. See at :ref:`Custom Driver` .. _dimensions: @@ -44,26 +47,23 @@ Raster data (RasterDataset) .. _raster_formats: .. list-table:: - :widths: 17, 25, 28, 30 + :widths: 17, 25, 30 :header-rows: 1 * - Driver - File formats - - Method - Comments - * - ``raster`` + * - :py:class:`raster ` - GeoTIFF, ArcASCII, VRT, etc. 
(see `GDAL formats `_) - - :py:meth:`~hydromt.io.open_mfraster` - Based on :py:func:`xarray.open_rasterio` and :py:func:`rasterio.open` - * - ``raster_tindex`` + * - :py:class:`raster ` with the + :py:class:`raster_tindex ` resolver - raster tile index file (see `gdaltindex `_) - - :py:meth:`~hydromt.io.open_raster_from_tindex` - - Options to merge tiles via ``mosaic_kwargs``. - * - ``netcdf`` or ``zarr`` + - Options to merge tiles via `options -> mosaic_kwargs`. + * - :py:class:`raster_xarray ` - NetCDF and Zarr - - :py:func:`xarray.open_mfdataset`, :py:func:`xarray.open_zarr` - - required y and x dimensions_ + - required y and x dimensions .. _GeoTiff: @@ -73,24 +73,24 @@ Raster data (RasterDataset) Single raster files are parsed to a **RasterDataset** based on the **raster** driver. This driver supports 2D raster for which the dimensions are names "x" and "y". A potential third dimension is called "dim0". -The variable name is based on the filename, in this case "GLOBCOVER_200901_200912_300x300m". -The ``chunks`` key-word argument is passed to :py:meth:`~hydromt.io.open_mfraster` +The variable name is based on the filename, in this case `"GLOBCOVER_200901_200912_300x300m"`. +The `chunks` key-word argument is passed to :py:meth:`~hydromt.io.open_mfraster` and allows lazy reading of the data. -.. code-block:: yaml - - globcover: - path: base/landcover/globcover/GLOBCOVER_200901_200912_300x300m.tif - data_type: RasterDataset - driver: raster - driver_kwargs: - chunks: {x: 3600, y: 3600} - meta: - category: landuse - source_url: http://due.esrin.esa.int/page_globcover.php - source_license: CC-BY-3.0 - paper_ref: Arino et al (2012) - paper_doi: 10.1594/PANGAEA.787668 +.. literalinclude:: ../../assets/data_types/single_variable_geotiff_raster.yml + :language: yaml + +.. testsetup:: * + + from hydromt import DataCatalog + +.. 
testcode:: geotiff + :hide: + + catalog_path = "docs/assets/data_types/single_variable_geotiff_raster.yml" + + catalog = DataCatalog(fallback_lib=None) # do not read default catalog + catalog.from_yml(catalog_path) .. _VRT: @@ -100,45 +100,35 @@ Multi-variable Virtual Raster Tileset (VRT) Multiple raster layers from different files are parsed using the **raster** driver. Each raster becomes a variable in the resulting RasterDataset based on its filename. The path to multiple files can be set using a sting glob or several keys, -see description of the ``path`` argument in the :ref:`yaml file description `. +see description of the `uri` argument in the :ref:`yaml file description `. Note that the rasters should have identical grids. -Here multiple .vrt files (dir.vrt, bas.vrt, etc.) are combined based on their variable name -into a single dataset with variables flwdir, basins, etc. -Other multiple file raster datasets (e.g. GeoTIFF files) can be read in the same way. -VRT files are useful for large raster datasets which are often tiled and can be combined using +Here multiple .vrt files (dir.vrt, bas.vrt, etc.) are combined based on their variable +name into a single dataset with variables flwdir, basins, etc. Other multiple file +raster datasets (e.g. GeoTIFF files) can be read in the same way. VRT files are useful +for large raster datasets which are often tiled and can be combined using `gdalbuildvrt. `_ -.. code-block:: yaml - - merit_hydro: - path: base/merit_hydro/{variable}.vrt - data_type: RasterDataset - driver: raster - crs: 4326 - driver_kwargs: - chunks: {x: 6000, y: 6000} - rename: - dir: flwdir - bas: basins - upa: uparea - elv: elevtn - sto: strord - meta: - category: topography - source_version: 1.0 - paper_doi: 10.1029/2019WR024873 - paper_ref: Dai Yamazaki - source_url: http://hydro.iis.u-tokyo.ac.jp/~yamadai/MERIT_Hydro - source_license: CC-BY-NC 4.0 or ODbL 1.0 +.. 
literalinclude:: ../../assets/data_types/vrt_raster_dataset.yml + :language: yaml + +.. testcode:: geotiff + :hide: + + catalog_path = "docs/assets/data_types/vrt_raster_dataset.yml" + + catalog = DataCatalog(fallback_lib=None) # do not read default catalog + catalog.from_yml(catalog_path) .. _Tile: Tiled raster dataset ^^^^^^^^^^^^^^^^^^^^ -Tiled index datasets are parsed using the **raster_tindex** driver. +Tiled index datasets are parsed using the +:py:Class:`raster_tindex ` +:py:class:`~hydromt.data_catalog.uri_resolvers.uri_resolver.URIResolver`. This data format is used to combine raster tiles with different CRS projections. A polygon vector file (e.g. GeoPackage) is used to make a tile index with the spatial footprints of each tile. When reading a spatial slice of this data the files with @@ -146,34 +136,26 @@ intersecting footprints will be merged together in the CRS of the most central t Use `gdaltindex `_ to build an excepted tile index file. Here a GeoPackage with the tile index referring to individual GeoTiff raster tiles is used. -The ``mosaic_kwargs`` are passed to :py:meth:`~hydromt.io.open_raster_from_tindex` to -set the resampling ``method``. The name of the column in the tile index attribute table ``tileindex`` -which contains the raster tile file names is set in the ``driver_kwargs`` (to be directly passed as an argument to -:py:meth:`~hydromt.io.open_raster_from_tindex`). - -.. code-block:: yaml - - grwl_mask: - path: static_data/base/grwl/tindex.gpkg - data_type: RasterDataset - driver: raster_tindex - nodata: 0 - driver_kwargs: - chunks: {x: 3000, y: 3000} - mosaic_kwargs: {method: nearest} - tileindex: location - meta: - category: hydrography - paper_doi: 10.1126/science.aat0636 - paper_ref: Allen and Pavelsky (2018) - source_license: CC BY 4.0 - source_url: https://doi.org/10.5281/zenodo.1297434 - source_version: 1.01 +The `mosaic_kwargs` are passed to :py:meth:`hydromt.gis.merge` to +set the resampling `method`. 
The name of the column in the tile index attribute table +`tileindex` which contains the raster tile file names is set in the `driver.options`` + +.. literalinclude:: ../../assets/data_types/tiled_raster_dataset.yml + :language: yaml + +.. testcode:: geotiff + :hide: + + catalog_path = "docs/assets/data_types/tiled_raster_dataset.yml" + + catalog = DataCatalog(fallback_lib=None) # do not read default catalog + catalog.from_yml(catalog_path) .. NOTE:: - Tiled raster datasets are not read lazily as different tiles have to be merged together based on - their values. For fast access to large raster datasets, other formats might be more suitable. + Tiled raster datasets are not read lazily as different tiles have to be merged + together based on their values. For fast access to large raster datasets, other + formats might be more suitable. .. _NC_raster: @@ -199,54 +181,34 @@ See list of recognized dimensions_ names. To read a raster dataset from a multiple file netcdf archive the following data entry -is used, where the ``driver_kwargs`` are passed to :py:func:`xarray.open_mfdataset` +is used, where the `options` are passed to :py:func:`xarray.open_mfdataset` (or :py:func:`xarray.open_zarr` for zarr data). -In case the CRS cannot be inferred from the netcdf data it should be defined with the ``crs`` option here. +In case the CRS cannot be inferred from the netcdf metadata it should be defined with +the `crs` `metadata`` here. The path to multiple files can be set using a sting glob or several keys, -see description of the ``path`` argument in the :ref:`yaml file description `. +see description of the `uri` argument in the :ref:`yaml file description `. In this example additional renaming and unit conversion preprocessing steps are added to unify the data to match the HydroMT naming and unit :ref:`terminology `. -.. 
code-block:: yaml - - era5_hourly: - path: forcing/ERA5/org/era5_{variable}_{year}_hourly.nc - data_type: RasterDataset - driver: netcdf - crs: 4326 - driver_kwargs: - chunks: {latitude: 125, longitude: 120, time: 50} - combine: by_coords - decode_times: true - parallel: true - meta: - category: meteo - paper_doi: 10.1002/qj.3803 - paper_ref: Hersbach et al. (2019) - source_license: https://cds.climate.copernicus.eu/cdsapp/#!/terms/licence-to-use-copernicus-products - source_url: https://doi.org/10.24381/cds.bd0915c6 - rename: - t2m: temp - tp: precip - unit_add: - temp: -273.15 - unit_mult: - precip: 1000 - +.. literalinclude:: ../../assets/data_types/netcdf_raster_dataset.yml + :language: yaml -Preprocess functions when combining multiple files -"""""""""""""""""""""""""""""""""""""""""""""""""" +.. testcode:: geotiff + :hide: -In :py:func:`xarray.open_mfdataset`, xarray allows for a *preprocess* function to be run before merging several -netcdf files together. In hydroMT, some preprocess functions are available and can be passed through the ``driver_kwargs`` -options in the same way as any xr.open_mfdataset options. These preprocess functions are: + catalog_path = "docs/assets/data_types/netcdf_raster_dataset.yml" -- **round_latlon**: round x and y dimensions to 5 decimals to avoid merging problems in xarray due to small differences - in x, y values in the different netcdf files of the same data source. -- **to_datetimeindex**: force parsing the time dimension to a datetime index. -- **remove_duplicates**: remove time duplicates + catalog = DataCatalog(fallback_lib=None) # do not read default catalog + catalog.from_yml(catalog_path) +Preprocess functions when combining multiple files +"""""""""""""""""""""""""""""""""""""""""""""""""" +In :py:func:`xarray.open_mfdataset`, xarray allows for a **preprocess** function to be +run before merging several netcdf files together. 
In hydroMT, some preprocess functions +are available and can be passed through the options in the same way as any +xr.open_mfdataset options. These preprocess functions are found at +:py:obj:`hydromt.data_catalog.preprocessing.py` .. _GeoDataFrame: @@ -259,23 +221,18 @@ Vector data (GeoDataFrame) .. _vector_formats: .. list-table:: - :widths: 17, 25, 28, 30 + :widths: 17, 25, 30 :header-rows: 1 * - Driver - File formats - - Method - Comments - * - ``vector`` + * - :py:class:`pyogrio ` - ESRI Shapefile, GeoPackage, GeoJSON, etc. - - :py:meth:`~hydromt.io.open_vector` - - Point, Line and Polygon geometries. Uses :py:func:`geopandas.read_file` - * - ``vector_table`` + - Point, Line and Polygon geometries. Uses :py:func:`pyogrio.read_dataframe` + * - :py:class:`geodataframe_table ` - CSV, XY, PARQUET and EXCEL. - - :py:meth:`~hydromt.io.open_vector` - - Point geometries only. Uses :py:meth:`~hydromt.io.open_vector_from_table` - - + - Point geometries only. .. _GPKG_vector: @@ -288,21 +245,20 @@ spatial index for fast filtering of the data based on spatial location. An examp shown below. Note that the rename, ``unit_mult``, ``unit_add`` and ``nodata`` options refer to columns of the attribute table in case of a GeoDataFrame. -.. code-block:: yaml - - GDP_world: - path: base/emissions/GDP-countries/World_countries_GDPpcPPP.gpkg - data_type: GeoDataFrame - driver: vector - driver_kwargs: - layer: GDP - rename: - GDP: gdp - unit_mult: - gdp: 0.001 - meta: - category: socio-economic - source_version: 1.0 +.. literalinclude:: ../../assets/data_types/gpkg_geodataframe.yml + :language: yaml + +.. testsetup:: * + + from hydromt import DataCatalog + +.. testcode:: geotiff + :hide: + + catalog_path = "docs/assets/data_types/gpkg_geodataframe.yml" + + catalog = DataCatalog(fallback_lib=None) # do not read default catalog + catalog.from_yml(catalog_path) .. _textdelimited_vector: @@ -336,28 +292,30 @@ of the GeoDataFrame attribute table. ... 
As the CRS of the coordinates cannot be inferred from the data it must be set in the -data entry in the yaml file as shown in the example below. The internal data format -is based on the file extension unless the ``driver_kwargs`` ``driver`` option is set. -See :py:meth:`~hydromt.io.open_vector` and :py:func:`~hydromt.io.open_vector_from_table` for more -options. +data entry in the yaml file as shown in the example below. + +.. literalinclude:: ../../assets/data_types/csv_geodataframe.yml + :language: yaml + +.. testsetup:: * + + from hydromt import DataCatalog -.. code-block:: yaml +.. testcode:: geotiff + :hide: - stations: - path: /path/to/stations.csv - data_type: GeoDataFrame - driver: vector_table - crs: 4326 - driver_kwargs: - driver: csv + catalog_path = "docs/assets/data_types/csv_geodataframe.yml" + + catalog = DataCatalog(fallback_lib=None) # do not read default catalog + catalog.from_yml(catalog_path) .. _binary_vector: -HydroMT also supports reading and writing vector data in binary format. Currently only parquet is -supported, but others could be added if desired. The structure of the files should be the same as -the text format files described above but writing according to the parquet file spec. Since this is -a binary format, not examples are provided, but for example pandas can write the same data structure -to parquet as it can csv. +HydroMT also supports reading and writing vector data in binary format. Currently only +parquet is supported, but others could be added if desired. The structure of the files +should be the same as the text format files described above but written according to the +parquet file spec. Since this is a binary format, no examples are provided, but for +example pandas can write the same data structure to parquet as it can csv. .. _GeoDataset: @@ -371,20 +329,18 @@ Geospatial point time-series (GeoDataset) .. _geo_formats: .. 
list-table:: - :widths: 17, 25, 28, 30 + :widths: 17, 25, 30 :header-rows: 1 * - Driver - File formats - - Method - Comments - * - ``vector`` - - Combined point location (e.g. CSV or GeoJSON) and text delimited time-series (e.g. CSV) data. - - :py:meth:`~hydromt.io.open_geodataset` - - Uses :py:meth:`~hydromt.io.open_vector`, :py:meth:`~hydromt.io.open_timeseries_from_table` - * - ``netcdf`` or ``zarr`` + * - :py:class:`geodataset_vector ` + - Combined point location (e.g. CSV or GeoJSON) and text delimited time-series + (e.g. CSV) data. + - + * - :py:class:`geodataset_xarray ` - NetCDF and Zarr - - :py:func:`xarray.open_mfdataset`, :py:func:`xarray.open_zarr` - required time and index dimensions_ and x- and y coordinates. @@ -411,67 +367,63 @@ on a list of recognized dimensions_ names. waterlevel (time, stations) To read a point time-series dataset from a multiple file netcdf archive the following data entry -is used, where the ``driver_kwargs`` are passed to :py:func:`xarray.open_mfdataset` +is used, where the options are passed to :py:func:`xarray.open_mfdataset` (or :py:func:`xarray.open_zarr` for zarr data). In case the CRS cannot be inferred from the netcdf data it is defined here. The path to multiple files can be set using a sting glob or several keys, -see description of the ``path`` argument in the :ref:`yaml file description `. +see description of the `uri` argument in the :ref:`yaml file description `. In this example additional renaming and unit conversion preprocessing steps are added to unify the data to match the HydroMT naming and unit :ref:`terminology `. -.. 
code-block:: yaml - - gtsmv3_eu_era5: - path: reanalysis-waterlevel-{year}-m{month:02d}.nc - data_type: GeoDataset - driver: netcdf - crs: 4326 - driver_kwargs: - chunks: {stations: 100, time: 1500} - combine: by_coords - decode_times: true - parallel: true - rename: - station_x_coordinate: lon - station_y_coordinate: lat - stations: index - meta: - category: ocean - paper_doi: 10.24381/cds.8c59054f - paper_ref: Copernicus Climate Change Service 2019 - source_license: https://cds.climate.copernicus.eu/cdsapp/#!/terms/licence-to-use-copernicus-products - source_url: https://cds.climate.copernicus.eu/cdsapp#!/dataset/10.24381/cds.8c59054f?tab=overview +.. literalinclude:: ../../assets/data_types/netcdf_geodataset.yml + :language: yaml + +.. testsetup:: * + + from hydromt import DataCatalog + +.. testcode:: geotiff + :hide: + + catalog_path = "docs/assets/data_types/netcdf_geodataset.yml" + + catalog = DataCatalog(fallback_lib=None) # do not read default catalog + catalog.from_yml(catalog_path) .. _CSV_point: CSV point time-series data ^^^^^^^^^^^^^^^^^^^^^^^^^^ -Point time-series data where the geospatial point geometries and time-series are saved in -separate (text) files are parsed to **GeoDataset** using the **vector** driver. -The GeoDataset must at least contain a location index with point geometries which is referred to by the ``path`` argument -The path may refer to both GIS vector data such as GeoJSON with only Point geometries -or tabulated point vector data such as csv files, see earlier examples for GeoDataFrame datasets. -Finally, certain binary formats such as parquet are also supported. -In addition a tabulated time-series text file can be passed to be used as a variable of the GeoDataset. -This data is added by a second file which is referred to using the ``data_path`` key-word argument. -The index of the time-series (in the columns header) and point locations must match. -For more options see the :py:meth:`~hydromt.io.open_geodataset` method. - -.. 
code-block:: yaml - - waterlevels_txt: - path: /path/to/stations.csv - data_type: GeoDataset - driver: vector - crs: 4326 - driver_kwargs: - data_path: /path/to/stations_data.csv +Point time-series data where the geospatial point geometries and time-series are saved +in separate (text) files are parsed to **GeoDataset** using the **vector** driver. The +GeoDataset must at least contain a location index with point geometries which is +referred to by the `uri` argument. The path may refer to both GIS vector data such as +GeoJSON with only Point geometries or tabulated point vector data such as csv files, see +earlier examples for GeoDataFrame datasets. Finally, certain binary formats such as +parquet are also supported. In addition a tabulated time-series text file can be passed +to be used as a variable of the GeoDataset. This data is added by a second file which is +referred to using the `data_path` option. The index of the time-series (in the columns +header) and point locations must match. + +.. literalinclude:: ../../assets/data_types/csv_geodataset.yml + :language: yaml + +.. testsetup:: * + + from hydromt import DataCatalog + +.. testcode:: geotiff + :hide: + + catalog_path = "docs/assets/data_types/csv_geodataset.yml" + + catalog = DataCatalog(fallback_lib=None) # do not read default catalog + catalog.from_yml(catalog_path) *Tabulated time series text file* -This data is read using the :py:meth:`~hydromt.io.open_timeseries_from_table` method. To -read the time stamps the :py:func:`pandas.to_datetime` method is used. +To read the time stamps the :py:func:`pandas.to_datetime` method is used. .. code-block:: console @@ -485,38 +437,44 @@ read the time stamps the :py:func:`pandas.to_datetime` method is used. NetCDF time-series dataset (Dataset) ------------------------------------ + .. _dataset_formats: .. 
list-table:: - :widths: 17, 25, 28, 30 + :widths: 17, 25, 30 :header-rows: 1 * - Driver - File formats - - Method - Comments - * - ``netcdf`` or ``zarr`` + * - :py:Class:`dataset_xarray ` - NetCDF and Zarr - - :py:func:`xarray.open_mfdataset`, :py:func:`xarray.open_zarr` - required time and index dimensions_. .. _NC_timeseries: - Netcdf time-series dataset -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +NetCDF and zarr timeseries data are parsed to **Dataset** with the +:py:class:`~dataset.xarray_driver.DatasetXarrayDriver`. +The resulting dataset is similar to the **GeoDataset** except that it lacks a spatial +dimension. -NetCDF and zarr timeseries data are parsed to **Dataset** with the **netcdf** and **zarr** drivers. -The resulting dataset is similar to the **GeoDataset** except that it lacks a spatial dimension. +.. literalinclude:: ../../assets/data_types/netcdf_dataset.yml + :language: yaml -.. code-block:: yaml +.. testsetup:: * - timeseries_dataset: - path: /path/to/timeseries.netcdf - data_type: Dataset - driver: netcdf + from hydromt import DataCatalog +.. testcode:: geotiff + :hide: + catalog_path = "docs/assets/data_types/netcdf_dataset.yml" + + catalog = DataCatalog(fallback_lib=None) # do not read default catalog + catalog.from_yml(catalog_path) .. _DataFrame: @@ -526,30 +484,15 @@ The resulting dataset is similar to the **GeoDataset** except that it lacks a sp .. _dataframe_formats: .. 
list-table:: - :widths: 17, 25, 28, 30 + :widths: 17, 25, 30 :header-rows: 1 * - Driver - File formats - - Method - Comments - * - ``csv`` - - Comma-separated files (or using another delimiter) - - :py:func:`pandas.read_csv` - - See :py:func:`pandas.read_csv` for all - * - ``excel`` - - Excel files - - :py:func:`pandas.read_excel` - - If required, provide a sheet name through driver_kwargs - * - ``parquet`` - - Binary encoded columnar data format - - :py:func:`pandas.read_parquet` - - - * - ``fwf`` - - Fixed width delimited text files - - :py:func:`pandas.read_fwf` - - The formatting of these files can either be inferred or defined by the user, both through the driver_kwargs. - + * - :py:class:`csv ` + - any file readable by pandas + - Provide a sheet name or formatting through options .. note:: @@ -559,24 +502,28 @@ The resulting dataset is similar to the **GeoDataset** except that it lacks a sp Supported files ^^^^^^^^^^^^^^^ -The DataFrameAdapter is quite flexible in supporting different types of tabular data formats. All drivers allow for flexible reading of -files: for example both mapping tables and time series data are supported. Please note that for timeseries, the driver_kwargs need to be used to -set the correct column for indexing, and formatting and parsing of datetime-strings. See the relevant pandas function for which arguments -can be used. Also note that the **csv** driver is not restricted to comma-separated files, as the delimiter can be given to the reader -through the driver_kwargs. - -.. code-block:: yaml - - observations: - path: data/lulc/globcover_mapping.csv - data_type: DataFrame - driver: csv - meta: - category: parameter_mapping - driver_kwargs: - header: null # null translates to None in Python -> no header - index_col: 0 - parse_dates: false +The DataFrameAdapter is quite flexible in supporting different types of tabular data +formats. 
The driver allows for flexible reading of files: for example both mapping +tables and time series data are supported. Please note that for timeseries, the +`options` need to be used to set the correct column for indexing, and formatting and +parsing of datetime-strings. See the relevant pandas function for which arguments can be +used. Also note that the driver is not restricted to comma-separated files, as +the delimiter can be given to the reader through the `options`. + +.. literalinclude:: ../../assets/data_types/csv_dataframe.yml + :language: yaml + +.. testsetup:: * + + from hydromt import DataCatalog + +.. testcode:: geotiff + :hide: + + catalog_path = "docs/assets/data_types/csv_dataframe.yml" + + catalog = DataCatalog(fallback_lib=None) # do not read default catalog + catalog.from_yml(catalog_path) .. note:: The yml-parser does not correctly parses `None` arguments. When this is required, the `null` argument should be used instead. diff --git a/docs/guides/advanced_user/index.rst b/docs/guides/advanced_user/index.rst index b78790f37..28dedd088 100644 --- a/docs/guides/advanced_user/index.rst +++ b/docs/guides/advanced_user/index.rst @@ -5,7 +5,6 @@ Advanced user guide architecture data_prepare_cat - data_sources data_types hydromt_python methods_stats diff --git a/docs/guides/core_dev/documentation.rst b/docs/guides/core_dev/documentation.rst new file mode 100644 index 000000000..bcf9aa327 --- /dev/null +++ b/docs/guides/core_dev/documentation.rst @@ -0,0 +1,13 @@ +.. _contribute_documentation: + +Adding Documentation +==================== + +There are a few guidelines when adding new documentation, or when refactoring the +current documentation. + +- We use the `numpy docstring format `. +- Code examples or example ``yaml`` files should be tested using the sphinx extension + ``doctest``. +- New APIs should be added to the ``docs/api`` folder. The builtin ``autosummary`` + and ``toctree`` are used to keep track. 
diff --git a/docs/guides/core_dev/index.rst b/docs/guides/core_dev/index.rst index 86b698fda..60a7918d6 100644 --- a/docs/guides/core_dev/index.rst +++ b/docs/guides/core_dev/index.rst @@ -4,4 +4,5 @@ Core developer guide .. toctree:: contributing + documentation dev_install diff --git a/hydromt/_io/readers.py b/hydromt/_io/readers.py index 3b84d0c57..aa964d342 100644 --- a/hydromt/_io/readers.py +++ b/hydromt/_io/readers.py @@ -29,8 +29,8 @@ from hydromt._utils.path import _make_config_paths_abs from hydromt._utils.uris import _is_valid_url from hydromt.gis import _gis_utils, _vector_utils, raster, vector -from hydromt.gis._raster_merge import _merge from hydromt.gis.raster import GEO_MAP_COORD +from hydromt.gis.raster_merge import merge if TYPE_CHECKING: from hydromt._validators.model_config import HydromtModelStep @@ -360,7 +360,7 @@ def _open_mfraster( da = da.sortby(concat_dim).transpose(concat_dim, ...) da.attrs.update(da_lst[0].attrs) else: - da = _merge(da_lst, **mosaic_kwargs) # spatial merge + da = merge(da_lst, **mosaic_kwargs) # spatial merge da.attrs.update({"source_file": "; ".join(file_attrs)}) ds = da.to_dataset() # dataset for consistency else: diff --git a/hydromt/data_catalog/data_catalog.py b/hydromt/data_catalog/data_catalog.py index ae64614cf..2e72215b4 100644 --- a/hydromt/data_catalog/data_catalog.py +++ b/hydromt/data_catalog/data_catalog.py @@ -665,7 +665,8 @@ def from_yml( A yaml data entry is provided below, where all the text between <> should be filled by the user. Multiple data sources of the same data type should be grouped. Currently the following data types are supported: - {'RasterDataset', 'GeoDataset', 'GeoDataFrame'}. See the specific data adapters + {'RasterDataset', 'GeoDataset', 'GeoDataFrame', 'DataFrame', 'Dataset'}. See the + specific data adapters for more information about the required and optional arguments. .. 
code-block:: yaml @@ -677,22 +678,12 @@ def from_yml( name: sha256: # only if the root is an archive : - path: + uri: data_type: driver: - filesystem: - driver_kwargs: - : - nodata: - : - rename: - : - : - unit_add: - : - unit_mult: - : - meta: + data_adapter: + uri_resolver: + metadata: source_url: source_version: source_licence: @@ -801,13 +792,9 @@ def from_dict( "path": , "data_type": , "driver": , - "filesystem": , - "driver_kwargs": {: }, - "nodata": , - "rename": {: }, - "unit_add": {: }, - "unit_mult": {: }, - "meta": {...}, + "data_adapter": , + "uri_resolver": , + "metadata": {...}, "placeholders": {: }, } : { diff --git a/hydromt/data_catalog/drivers/raster/rasterio_driver.py b/hydromt/data_catalog/drivers/raster/rasterio_driver.py index 5e7ed6592..b5e596c82 100644 --- a/hydromt/data_catalog/drivers/raster/rasterio_driver.py +++ b/hydromt/data_catalog/drivers/raster/rasterio_driver.py @@ -56,6 +56,7 @@ def read( {"time_range": time_range}, ) kwargs: Dict[str, Any] = {} + mosaic_kwargs: Dict[str, Any] = self.options.get("mosaic_kwargs", {}) # get source-specific options cache_root: str = str( @@ -78,7 +79,11 @@ def read( uris = uris_cached if mask is not None: - kwargs.update({"mosaic_kwargs": {"mask": mask}}) + mosaic_kwargs.update({"mask": mask}) + + # get mosaic kwargs + if mosaic_kwargs: + kwargs.update({"mosaic_kwargs": mosaic_kwargs}) if np.issubdtype(type(metadata.nodata), np.number): kwargs.update(nodata=metadata.nodata) diff --git a/hydromt/data_catalog/sources/factory.py b/hydromt/data_catalog/sources/factory.py index dd6868212..84bd5d2d4 100644 --- a/hydromt/data_catalog/sources/factory.py +++ b/hydromt/data_catalog/sources/factory.py @@ -6,6 +6,7 @@ from hydromt._typing.type_def import DataType from hydromt.data_catalog.sources import ( DataFrameSource, + DatasetSource, DataSource, GeoDataFrameSource, GeoDatasetSource, @@ -15,6 +16,7 @@ # Map DataType to DataSource, need to add here when implementing a new Type available_sources: 
Dict[DataType, DataSource] = { "DataFrame": DataFrameSource, + "Dataset": DatasetSource, "RasterDataset": RasterDatasetSource, "GeoDataFrame": GeoDataFrameSource, "GeoDataset": GeoDatasetSource, diff --git a/hydromt/data_catalog/uri_resolvers/convention_resolver.py b/hydromt/data_catalog/uri_resolvers/convention_resolver.py index 4171eeb60..b0d2570cb 100644 --- a/hydromt/data_catalog/uri_resolvers/convention_resolver.py +++ b/hydromt/data_catalog/uri_resolvers/convention_resolver.py @@ -87,7 +87,6 @@ def resolve( variables: Optional[List[str]] = None, metadata: Optional[SourceMetadata] = None, handle_nodata: NoDataStrategy = NoDataStrategy.RAISE, - options: Optional[Dict[str, Any]] = None, ) -> List[str]: """Resolve the placeholders in the URI using naming conventions. @@ -107,8 +106,6 @@ def resolve( DataSource metadata. handle_nodata : NoDataStrategy, optional how to react when no data is found, by default NoDataStrategy.RAISE - options : Optional[Dict[str, Any]], optional - extra options for this resolver, by default None Returns ------- diff --git a/hydromt/data_catalog/uri_resolvers/raster_tindex_resolver.py b/hydromt/data_catalog/uri_resolvers/raster_tindex_resolver.py index 3e6856e65..fe0cae660 100644 --- a/hydromt/data_catalog/uri_resolvers/raster_tindex_resolver.py +++ b/hydromt/data_catalog/uri_resolvers/raster_tindex_resolver.py @@ -3,7 +3,7 @@ from logging import Logger, getLogger from os.path import abspath, dirname, join from pathlib import Path -from typing import Any, Dict, List, Optional, Union +from typing import List, Optional, Union import geopandas as gpd @@ -34,7 +34,6 @@ def resolve( variables: Union[int, tuple[float, str], None] = None, metadata: Optional[SourceMetadata], handle_nodata: NoDataStrategy = NoDataStrategy.RAISE, - options: Optional[Dict[str, Any]] = None, ) -> List[str]: """Resolve URIs of a raster tindex file. @@ -54,8 +53,6 @@ def resolve( DataSource metadata. 
handle_nodata : NoDataStrategy, optional how to react when no data is found, by default NoDataStrategy.RAISE - options : Optional[Dict[str, Any]], optional - extra options for this resolver, by default None Returns ------- @@ -71,7 +68,7 @@ def resolve( raise ValueError(f"Resolver {self.name} needs a mask") gdf = gpd.read_file(uri) gdf = gdf.iloc[gdf.sindex.query(mask.to_crs(gdf.crs).union_all())] - tileindex: Optional[str] = options.get("tileindex") + tileindex: Optional[str] = self.options.get("tileindex") if tileindex is None: raise ValueError( f"{self.__class__.__name__} needs options specifying 'tileindex'" diff --git a/hydromt/data_catalog/uri_resolvers/uri_resolver.py b/hydromt/data_catalog/uri_resolvers/uri_resolver.py index f064ef12a..f9eafc9f4 100644 --- a/hydromt/data_catalog/uri_resolvers/uri_resolver.py +++ b/hydromt/data_catalog/uri_resolvers/uri_resolver.py @@ -19,6 +19,7 @@ class URIResolver(AbstractBaseModel, ABC): model_config = ConfigDict(extra="forbid") filesystem: FS = Field(default_factory=LocalFileSystem) + options: Dict[str, Any] = Field(default_factory=dict) @abstractmethod def resolve( @@ -31,7 +32,6 @@ def resolve( zoom_level: Optional[Zoom] = None, metadata: Optional[SourceMetadata] = None, handle_nodata: NoDataStrategy = NoDataStrategy.RAISE, - options: Optional[Dict[str, Any]] = None, ) -> List[str]: """Resolve a single uri to multiple uris. @@ -51,8 +51,6 @@ def resolve( Metadata of DataSource. 
handle_nodata : NoDataStrategy, optional how to react when no data is found, by default NoDataStrategy.RAISE - options : Optional[Dict[str, Any]], optional - extra options for this resolver, by default None Returns ------- diff --git a/hydromt/gis/_raster_merge.py b/hydromt/gis/raster_merge.py similarity index 99% rename from hydromt/gis/_raster_merge.py rename to hydromt/gis/raster_merge.py index a479e10f6..26782efd7 100644 --- a/hydromt/gis/_raster_merge.py +++ b/hydromt/gis/raster_merge.py @@ -6,10 +6,10 @@ from hydromt.gis.raster import full_from_transform -__all__ = ["_merge"] +__all__ = ["merge"] -def _merge( +def merge( data_arrays, dst_crs=None, dst_bounds=None, diff --git a/tests/data_catalog/drivers/raster/test_rasterio_driver.py b/tests/data_catalog/drivers/raster/test_rasterio_driver.py index 2cff2539c..b656dbd10 100644 --- a/tests/data_catalog/drivers/raster/test_rasterio_driver.py +++ b/tests/data_catalog/drivers/raster/test_rasterio_driver.py @@ -2,6 +2,7 @@ from os.path import join from pathlib import Path from typing import Tuple +from unittest.mock import MagicMock, patch import numpy as np import pytest @@ -63,6 +64,13 @@ def test_sets_nodata(self, rioda: xr.DataArray, tmp_path: Path): ) assert ds["test_sets_nodata"].raster.nodata == 42 + @patch("hydromt.data_catalog.drivers.raster.rasterio_driver._open_mfraster") + def test_sets_mosaic_kwargs(self, fake_open_mfraster: MagicMock): + uris = ["test", "test2"] + mosaic_kwargs = {"mykwarg": 0} + RasterioDriver(options={"mosaic_kwargs": mosaic_kwargs}).read(uris=uris) + fake_open_mfraster.assert_called_once_with(uris, mosaic_kwargs=mosaic_kwargs) + class TestOpenMFRaster: @pytest.fixture() diff --git a/tests/data_catalog/uri_resolvers/test_raster_tindex_resolver.py b/tests/data_catalog/uri_resolvers/test_raster_tindex_resolver.py index 4a34b39f0..92178117e 100644 --- a/tests/data_catalog/uri_resolvers/test_raster_tindex_resolver.py +++ b/tests/data_catalog/uri_resolvers/test_raster_tindex_resolver.py 
@@ -70,12 +70,13 @@ def test_resolves_correctly(self, raster_tindex): geom = gpd.GeoDataFrame(geometry=[box(-78, 0.0005, -65, 4)], crs=4326) metadata = SourceMetadata() options = {"tileindex": "location"} - resolver = RasterTindexResolver(filesystem=AbstractFileSystem()) + resolver = RasterTindexResolver( + filesystem=AbstractFileSystem(), options=options + ) paths = resolver.resolve( uri=raster_tindex, metadata=metadata, mask=geom, - options=options, ) assert len(paths) == 2 assert ( @@ -92,7 +93,6 @@ def test_resolves_correctly(self, raster_tindex): uri=raster_tindex, metadata=metadata, mask=geom, - options=options, ) assert len(paths) == 1 path = str(Path(join(dirname(raster_tindex), "GRWL_mask_V01.01/NA19.tif"))) @@ -110,13 +110,14 @@ def test_raises_no_tileindex(self, raster_tindex): uri=raster_tindex, metadata=metadata, mask=geom, - options={}, ) def test_raises_missing_tileindex(self, raster_tindex): - resolver = RasterTindexResolver(filesystem=AbstractFileSystem()) - metadata = SourceMetadata() options = {"tileindex": "file"} + resolver = RasterTindexResolver( + filesystem=AbstractFileSystem(), options=options + ) + metadata = SourceMetadata() geom = gpd.GeoDataFrame(geometry=[box(-78, 0.0005, -65, 4)], crs=4326) with pytest.raises( IOError, @@ -126,18 +127,18 @@ def test_raises_missing_tileindex(self, raster_tindex): uri=raster_tindex, metadata=metadata, mask=geom, - options=options, ) def test_raises_no_intersecting_files(self, raster_tindex): - resolver = RasterTindexResolver(filesystem=AbstractFileSystem()) - metadata = SourceMetadata() options = {"tileindex": "file"} + resolver = RasterTindexResolver( + filesystem=AbstractFileSystem(), options=options + ) + metadata = SourceMetadata() geom = gpd.GeoDataFrame(geometry=[box(4, 52, 5, 53)], crs=4326) with pytest.raises(NoDataException, match="found no intersecting tiles."): resolver.resolve( uri=raster_tindex, metadata=metadata, mask=geom, - options=options, )