From 21898d3728e9b03ede6ee5207b44a94ee8f9cb1e Mon Sep 17 00:00:00 2001 From: Frederik Kratzert Date: Mon, 5 Oct 2020 09:23:31 +0200 Subject: [PATCH 01/15] fix optional input argument --- neuralhydrology/nh_run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neuralhydrology/nh_run.py b/neuralhydrology/nh_run.py index c9c4efc8..7748fb99 100644 --- a/neuralhydrology/nh_run.py +++ b/neuralhydrology/nh_run.py @@ -99,7 +99,7 @@ def continue_run(run_dir: Path, config_file: Path = None, gpu: int = None): start_training(base_config) -def eval_run(run_dir: Path, period: str, epoch: int, gpu: int = None): +def eval_run(run_dir: Path, period: str, epoch: int = None, gpu: int = None): """Start evaluating a trained model. Parameters From bd4fdbdd713c5fe66f891e8ad2229b8a3f7f7bad Mon Sep 17 00:00:00 2001 From: Frederik Kratzert Date: Mon, 5 Oct 2020 12:44:20 +0200 Subject: [PATCH 02/15] add github workflows (#2) * add github workflows * Update .github/workflows/pytest-ci.yml Co-authored-by: Martin Gauch <15731649+gauchm@users.noreply.github.com> Co-authored-by: Martin Gauch <15731649+gauchm@users.noreply.github.com> --- .github/workflows/pytest-ci.yml | 46 +++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 .github/workflows/pytest-ci.yml diff --git a/.github/workflows/pytest-ci.yml b/.github/workflows/pytest-ci.yml new file mode 100644 index 00000000..07612002 --- /dev/null +++ b/.github/workflows/pytest-ci.yml @@ -0,0 +1,46 @@ +# Workflow to run the pytest test suite. 
+ +name: pytest CI + +on: + pull_request: + branches: + - master + push: + branches: + - master + +jobs: + test: + runs-on: ubuntu-latest + + steps: + # Checkout the repository under $GITHUB_WORKSPACE + - uses: actions/checkout@v2 + + # initialize conda + - name: Conda setup + uses: s-weigand/setup-conda@v1 + with: + update-conda: true + python-version: 3.7 + + # cache the conda installation to speedup the CI runs + - uses: actions/cache@v2 + id: cache + with: + path: /usr/share/miniconda/envs/neuralhydrology + key: ${{ runner.os }}-conda-cache-${{ hashFiles('environments/environment_cuda10_2.yml') }} + + # on cache miss, create the env from scratch + - name: Conda environment creation + if: steps.cache.outputs.cache-hit != 'true' + run: | + conda env create -f environments/environment_cuda10_2.yml + source activate neuralhydrology + + # Run the tests + - name: Testing with pytest + run: | + source activate neuralhydrology + pytest --cov=neuralhydrology From 8fa3b278384e9ac7fea53ea42aaac9181f42f854 Mon Sep 17 00:00:00 2001 From: Frederik Kratzert Date: Mon, 5 Oct 2020 12:52:03 +0200 Subject: [PATCH 03/15] automatic datetime coord inference (#1) * automatic datetime coord inference * Update neuralhydrology/data/utils.py Co-authored-by: Martin Gauch <15731649+gauchm@users.noreply.github.com> Co-authored-by: Martin Gauch <15731649+gauchm@users.noreply.github.com> --- neuralhydrology/data/utils.py | 29 +++++ neuralhydrology/evaluation/metrics.py | 39 +++++-- neuralhydrology/evaluation/signatures.py | 130 +++++++++++++---------- 3 files changed, 132 insertions(+), 66 deletions(-) diff --git a/neuralhydrology/data/utils.py b/neuralhydrology/data/utils.py index b0e3b398..9e7e508a 100644 --- a/neuralhydrology/data/utils.py +++ b/neuralhydrology/data/utils.py @@ -4,6 +4,8 @@ import numpy as np import pandas as pd import xarray +from xarray.core.dataarray import DataArray +from xarray.core.dataset import Dataset 
######################################################################################################################## # CAMELS US utility functions # @@ -512,3 +514,30 @@ def infer_frequency(index: Union[pd.DatetimeIndex, np.ndarray]) -> str: if pd.to_timedelta(native_frequency) == pd.to_timedelta(0): raise ValueError('Inferred dataset frequency is zero.') return native_frequency + + +def infer_datetime_coord(xr: Union[DataArray, Dataset]) -> str: + """Checks for coordinate with 'date' in its name and returns the name. + + Parameters + ---------- + xr : Union[DataArray, Dataset] + Array to infer coordinate name of. + + Returns + ------- + str + Name of datetime coordinate name. + + Raises + ------ + RuntimeError + If none or multiple coordinates with 'date' in its name are found. + """ + candidates = [c for c in list(xr.coords) if "date" in c] + if len(candidates) > 1: + raise RuntimeError("Found multiple coordinates with 'date' in its name.") + if not candidates: + raise RuntimeError("Did not find any coordinate with 'date' in its name") + + return candidates[0] diff --git a/neuralhydrology/evaluation/metrics.py b/neuralhydrology/evaluation/metrics.py index 833a83cf..32e62944 100644 --- a/neuralhydrology/evaluation/metrics.py +++ b/neuralhydrology/evaluation/metrics.py @@ -5,6 +5,8 @@ from scipy import stats, signal from xarray.core.dataarray import DataArray +from neuralhydrology.data import utils + def get_available_metrics() -> List[str]: """Get list of available metrics. @@ -513,7 +515,11 @@ def fdc_flv(obs: DataArray, sim: DataArray, l: float = 0.3) -> float: return flv * 100 -def mean_peak_timing(obs: DataArray, sim: DataArray, window: int = None, resolution: str = '1D') -> float: +def mean_peak_timing(obs: DataArray, + sim: DataArray, + window: int = None, + resolution: str = '1D', + datetime_coord: str = None) -> float: """Mean difference in peak flow timing. Uses scipy.find_peaks to find peaks in the observed time series. 
Starting with all observed peaks, those with a @@ -536,6 +542,8 @@ def mean_peak_timing(obs: DataArray, sim: DataArray, window: int = None, resolut for a resolution of '1H' the the window size is 12. resolution : str, optional Temporal resolution of the time series in pandas format, e.g. '1D' for daily and '1H' for hourly. + datetime_coord : str, optional + Name of datetime coordinate. Tried to infer automatically if not specified. Returns @@ -558,6 +566,10 @@ def mean_peak_timing(obs: DataArray, sim: DataArray, window: int = None, resolut # heuristic to get indices of peaks and their corresponding height. peaks, properties = signal.find_peaks(obs.values, distance=100, prominence=np.std(obs.values)) + # infer name of datetime index + if datetime_coord is None: + datetime_coord = utils.infer_datetime_coord(obs) + if window is None: # infer a reasonable window size window = max((0.5 * pd.to_timedelta('1D')) // pd.to_timedelta(resolution), 3) @@ -567,8 +579,8 @@ def mean_peak_timing(obs: DataArray, sim: DataArray, window: int = None, resolut for idx in peaks: # skip peaks at the start and end of the sequence and peaks around missing observations # (NaNs that were removed in obs & sim would result in windows that span too much time). 
- if (idx - window < 0) or (idx + window >= len(obs)) or (pd.date_range(obs[idx - window]['datetime'].values, - obs[idx + window]['datetime'].values, + if (idx - window < 0) or (idx + window >= len(obs)) or (pd.date_range(obs[idx - window][datetime_coord].values, + obs[idx + window][datetime_coord].values, freq=resolution).size != 2 * window + 1): continue @@ -584,7 +596,7 @@ def mean_peak_timing(obs: DataArray, sim: DataArray, window: int = None, resolut peak_obs = obs[idx] # calculate the time difference between the peaks - delta = peak_obs.coords['datetime'] - peak_sim.coords['datetime'] + delta = peak_obs.coords[datetime_coord] - peak_sim.coords[datetime_coord] timing_error = np.abs(delta.values / pd.to_timedelta(resolution)) @@ -593,7 +605,8 @@ def mean_peak_timing(obs: DataArray, sim: DataArray, window: int = None, resolut return np.mean(timing_errors) if len(timing_errors) > 0 else np.nan -def calculate_all_metrics(obs: DataArray, sim: DataArray, resolution: str = "1D") -> Dict[str, float]: +def calculate_all_metrics(obs: DataArray, sim: DataArray, resolution: str = "1D", + datetime_coord: str = None) -> Dict[str, float]: """Calculate all metrics with default values. Parameters @@ -604,7 +617,9 @@ def calculate_all_metrics(obs: DataArray, sim: DataArray, resolution: str = "1D" Simulated time series. resolution : str, optional Temporal resolution of the time series in pandas format, e.g. '1D' for daily and '1H' for hourly. - + datetime_coord : str, optional + Datetime coordinate in the passed DataArray. Tried to infer automatically if not specified. 
+ Returns ------- Dict[str, float] @@ -621,13 +636,17 @@ def calculate_all_metrics(obs: DataArray, sim: DataArray, resolution: str = "1D" "FHV": fdc_fhv(obs, sim), "FMS": fdc_fms(obs, sim), "FLV": fdc_flv(obs, sim), - "Peak-Timing": mean_peak_timing(obs, sim, resolution=resolution) + "Peak-Timing": mean_peak_timing(obs, sim, resolution=resolution, datetime_coord=datetime_coord) } return results -def calculate_metrics(obs: DataArray, sim: DataArray, metrics: List[str], resolution: str = "1D") -> Dict[str, float]: +def calculate_metrics(obs: DataArray, + sim: DataArray, + metrics: List[str], + resolution: str = "1D", + datetime_coord: str = None) -> Dict[str, float]: """Calculate specific metrics with default values. Parameters @@ -640,6 +659,8 @@ def calculate_metrics(obs: DataArray, sim: DataArray, metrics: List[str], resolu List of metric names. resolution : str, optional Temporal resolution of the time series in pandas format, e.g. '1D' for daily and '1H' for hourly. + datetime_coord : str, optional + Datetime coordinate in the passed DataArray. Tried to infer automatically if not specified. 
Returns ------- @@ -673,7 +694,7 @@ def calculate_metrics(obs: DataArray, sim: DataArray, metrics: List[str], resolu elif metric.lower() == "flv": values["FLV"] = fdc_flv(obs, sim) elif metric.lower() == "peak-timing": - values["Peak-Timing"] = mean_peak_timing(obs, sim, resolution=resolution) + values["Peak-Timing"] = mean_peak_timing(obs, sim, resolution=resolution, datetime_coord=datetime_coord) else: raise RuntimeError(f"Unknown metric {metric}") diff --git a/neuralhydrology/evaluation/signatures.py b/neuralhydrology/evaluation/signatures.py index 603b0964..dcc1e726 100644 --- a/neuralhydrology/evaluation/signatures.py +++ b/neuralhydrology/evaluation/signatures.py @@ -8,7 +8,7 @@ from numba import njit from xarray.core.dataarray import DataArray -from neuralhydrology.data.utils import infer_frequency +from neuralhydrology.data import utils def get_available_signatures() -> List[str]: @@ -26,7 +26,7 @@ def get_available_signatures() -> List[str]: return signatures -def calculate_all_signatures(da: DataArray, prcp: DataArray, datetime_coord: str = 'date') -> Dict[str, float]: +def calculate_all_signatures(da: DataArray, prcp: DataArray, datetime_coord: str = None) -> Dict[str, float]: """Calculate all signatures with default values. Parameters @@ -36,34 +36,35 @@ def calculate_all_signatures(da: DataArray, prcp: DataArray, datetime_coord: str prcp : DataArray Array of precipitation values. datetime_coord : str, optional - Datetime coordinate in the passed DataArray. + Datetime coordinate in the passed DataArray. Tried to infer automatically if not specified. Returns ------- Dict[str, float] Dictionary with signature names as keys and signature values as values. 
""" + if datetime_coord is None: + datetime_coord = utils.infer_datetime_coord(da) + results = { - "high_q_freq": high_q_freq(da, coord=datetime_coord), + "high_q_freq": high_q_freq(da, datetime_coord=datetime_coord), "high_q_dur": high_q_dur(da), - "low_q_freq": low_q_freq(da, coord=datetime_coord), + "low_q_freq": low_q_freq(da, datetime_coord=datetime_coord), "low_q_dur": low_q_dur(da), "zero_q_freq": zero_q_freq(da), "q95": q95(da), "q5": q5(da), "q_mean": q_mean(da), - "hfd_mean": hfd_mean(da, coord=datetime_coord), + "hfd_mean": hfd_mean(da, datetime_coord=datetime_coord), "baseflow_index": baseflow_index(da)[0], "slope_fdc": slope_fdc(da), - "stream_elas": stream_elas(da, prcp, coord=datetime_coord), - "runoff_ratio": runoff_ratio(da, prcp, coord=datetime_coord) + "stream_elas": stream_elas(da, prcp, datetime_coord=datetime_coord), + "runoff_ratio": runoff_ratio(da, prcp, datetime_coord=datetime_coord) } return results -def calculate_signatures(da: DataArray, - signatures: List[str], - datetime_coord: str = 'date', +def calculate_signatures(da: DataArray, signatures: List[str], datetime_coord: str = None, prcp: DataArray = None) -> Dict[str, float]: """Calculate the specified signatures with default values. @@ -74,7 +75,7 @@ def calculate_signatures(da: DataArray, signatures : List[str] List of names of the signatures to calculate. datetime_coord : str, optional - Datetime coordinate in the passed DataArray. + Datetime coordinate in the passed DataArray. Tried to infer automatically if not specified. prcp : DataArray, optional Array of precipitation values. Required for signatures 'runoff_ratio' and 'streamflow_elas'. @@ -88,14 +89,17 @@ def calculate_signatures(da: DataArray, ValueError If a passed signature name does not exist. 
""" + if datetime_coord is None: + datetime_coord = utils.infer_datetime_coord(da) + values = {} for signature in signatures: if signature == "high_q_freq": - values["high_q_freq"] = high_q_freq(da, coord=datetime_coord) + values["high_q_freq"] = high_q_freq(da, datetime_coord=datetime_coord) elif signature == "high_q_dur": values["high_q_dur"] = high_q_dur(da) elif signature == "low_q_freq": - values["low_q_freq"] = low_q_freq(da, coord=datetime_coord) + values["low_q_freq"] = low_q_freq(da, datetime_coord=datetime_coord) elif signature == "low_q_dur": values["low_q_dur"] = low_q_dur(da) elif signature == "zero_q_freq": @@ -107,15 +111,15 @@ def calculate_signatures(da: DataArray, elif signature == "q_mean": values["q_mean"] = q_mean(da) elif signature == "hfd_mean": - values["hfd_mean"] = hfd_mean(da, coord=datetime_coord) + values["hfd_mean"] = hfd_mean(da, datetime_coord=datetime_coord) elif signature == "baseflow_index": - values["baseflow_index"] = baseflow_index(da, coord=datetime_coord)[0] + values["baseflow_index"] = baseflow_index(da, datetime_coord=datetime_coord)[0] elif signature == "slope_fdc": values["slope_fdc"] = slope_fdc(da) elif signature == "runoff_ratio": - values["runoff_ratio"] = runoff_ratio(da, prcp, coord=datetime_coord) + values["runoff_ratio"] = runoff_ratio(da, prcp, datetime_coord=datetime_coord) elif signature == "stream_elas": - values["stream_elas"] = stream_elas(da, prcp, coord=datetime_coord) + values["stream_elas"] = stream_elas(da, prcp, datetime_coord=datetime_coord) else: ValueError(f"Unknown signatures {signature}") return values @@ -199,7 +203,6 @@ def low_q_dur(da: DataArray, threshold: float = 0.2) -> float: .. [#] Westerberg, I. K. and McMillan, H. K.: Uncertainty in hydrological signatures. 
Hydrology and Earth System Sciences, 2015, 19, 3951--3968, doi:10.5194/hess-19-3951-2015 """ - mean_flow = float(da.mean()) idx = np.where(da.values < threshold * mean_flow)[0] if len(idx) > 0: @@ -225,14 +228,13 @@ def zero_q_freq(da: DataArray) -> float: float Zero-flow frequency. """ - # number of steps with zero flow n_steps = (da == 0).sum() return float(n_steps / len(da)) -def high_q_freq(da: DataArray, coord: str = 'date', threshold: float = 9.) -> float: +def high_q_freq(da: DataArray, datetime_coord: str = None, threshold: float = 9.) -> float: """Calculate high-flow frequency. Frequency of high-flow events (>`threshold` times the median flow) [#]_, [#]_ (Table 2). @@ -241,8 +243,8 @@ def high_q_freq(da: DataArray, coord: str = 'date', threshold: float = 9.) -> fl ---------- da : DataArray Array of flow values. - coord : str, optional - Datetime coordinate in `da`. + datetime_coord : str, optional + Datetime coordinate in the passed DataArray. Tried to infer automatically if not specified. threshold : float, optional High-flow threshold. Values larger than ``threshold * median`` are considered high flows. @@ -258,10 +260,12 @@ def high_q_freq(da: DataArray, coord: str = 'date', threshold: float = 9.) -> fl .. [#] Westerberg, I. K. and McMillan, H. K.: Uncertainty in hydrological signatures. 
Hydrology and Earth System Sciences, 2015, 19, 3951--3968, doi:10.5194/hess-19-3951-2015 """ + if datetime_coord is None: + datetime_coord = utils.infer_datetime_coord(da) # determine the date of the first January 1st in the data period - first_date = da.coords[coord][0].values.astype('datetime64[s]').astype(datetime) - last_date = da.coords[coord][-1].values.astype('datetime64[s]').astype(datetime) + first_date = da.coords[datetime_coord][0].values.astype('datetime64[s]').astype(datetime) + last_date = da.coords[datetime_coord][-1].values.astype('datetime64[s]').astype(datetime) if first_date == datetime.strptime(f'{first_date.year}-01-01', '%Y-%m-%d'): start_date = first_date @@ -277,7 +281,7 @@ def high_q_freq(da: DataArray, coord: str = 'date', threshold: float = 9.) -> fl hqfs = [] while end_date < last_date: - data = da.sel({coord: slice(start_date, end_date)}) + data = da.sel({datetime_coord: slice(start_date, end_date)}) # number of steps with discharge higher than threshold * median in a one year period n_steps = (data > (threshold * median_flow)).sum() @@ -290,7 +294,7 @@ def high_q_freq(da: DataArray, coord: str = 'date', threshold: float = 9.) -> fl return np.mean(hqfs) -def low_q_freq(da: DataArray, coord: str = 'date', threshold: float = 0.2) -> float: +def low_q_freq(da: DataArray, datetime_coord: str = None, threshold: float = 0.2) -> float: """Calculate Low-flow frequency. Frequency of low-flow events (<`threshold` times the median flow) [#]_, [#]_ (Table 2). @@ -299,8 +303,8 @@ def low_q_freq(da: DataArray, coord: str = 'date', threshold: float = 0.2) -> fl ---------- da : DataArray Array of flow values. - coord : str, optional - Datetime coordinate in `da`. + datetime_coord : str, optional + Datetime coordinate in the passed DataArray. Tried to infer automatically if not specified. threshold : float, optional Low-flow threshold. Values below ``threshold * median`` are considered low flows. 
@@ -316,10 +320,12 @@ def low_q_freq(da: DataArray, coord: str = 'date', threshold: float = 0.2) -> fl .. [#] Westerberg, I. K. and McMillan, H. K.: Uncertainty in hydrological signatures. Hydrology and Earth System Sciences, 2015, 19, 3951--3968, doi:10.5194/hess-19-3951-2015 """ + if datetime_coord is None: + datetime_coord = utils.infer_datetime_coord(da) # determine the date of the first January 1st in the data period - first_date = da.coords[coord][0].values.astype('datetime64[s]').astype(datetime) - last_date = da.coords[coord][-1].values.astype('datetime64[s]').astype(datetime) + first_date = da.coords[datetime_coord][0].values.astype('datetime64[s]').astype(datetime) + last_date = da.coords[datetime_coord][-1].values.astype('datetime64[s]').astype(datetime) if first_date == datetime.strptime(f'{first_date.year}-01-01', '%Y-%m-%d'): start_date = first_date @@ -335,7 +341,7 @@ def low_q_freq(da: DataArray, coord: str = 'date', threshold: float = 0.2) -> fl lqfs = [] while end_date < last_date: - data = da.sel({coord: slice(start_date, end_date)}) + data = da.sel({datetime_coord: slice(start_date, end_date)}) # number of steps with discharge lower than threshold * median in a one year period n_steps = (data < (threshold * mean_flow)).sum() @@ -348,7 +354,7 @@ def low_q_freq(da: DataArray, coord: str = 'date', threshold: float = 0.2) -> fl return np.mean(lqfs) -def hfd_mean(da: DataArray, coord: str = 'date') -> float: +def hfd_mean(da: DataArray, datetime_coord: str = None) -> float: """Calculate mean half-flow duration. Mean half-flow date (step on which the cumulative discharge since October 1st @@ -358,8 +364,8 @@ def hfd_mean(da: DataArray, coord: str = 'date') -> float: ---------- da : DataArray Array of flow values. - coord : str, optional - Datetime coordinate name in `da`. + datetime_coord : str, optional + Datetime coordinate in the passed DataArray. Tried to infer automatically if not specified. 
Returns ------- @@ -371,10 +377,12 @@ def hfd_mean(da: DataArray, coord: str = 'date') -> float: .. [#] Court, A.: Measures of streamflow timing. Journal of Geophysical Research (1896-1977), 1962, 67, 4335--4339, doi:10.1029/JZ067i011p04335 """ + if datetime_coord is None: + datetime_coord = utils.infer_datetime_coord(da) # determine the date of the first October 1st in the data period - first_date = da.coords[coord][0].values.astype('datetime64[s]').astype(datetime) - last_date = da.coords[coord][-1].values.astype('datetime64[s]').astype(datetime) + first_date = da.coords[datetime_coord][0].values.astype('datetime64[s]').astype(datetime) + last_date = da.coords[datetime_coord][-1].values.astype('datetime64[s]').astype(datetime) if first_date > datetime.strptime(f'{first_date.year}-10-01', '%Y-%m-%d'): start_date = datetime.strptime(f'{first_date.year + 1}-10-01', '%Y-%m-%d') @@ -387,7 +395,7 @@ def hfd_mean(da: DataArray, coord: str = 'date') -> float: while end_date < last_date: # compute cumulative sum for the selected period - data = da.sel({coord: slice(start_date, end_date)}) + data = da.sel({datetime_coord: slice(start_date, end_date)}) cs = data.cumsum(skipna=True) # find steps with more cumulative discharge than the half annual sum @@ -417,7 +425,6 @@ def q5(da: DataArray) -> float: float 5th flow quantile. """ - return float(da.quantile(0.05)) @@ -498,7 +505,7 @@ def baseflow_index(da: DataArray, alpha: float = 0.98, warmup: int = 30, n_passes: int = None, - coord: str = 'date') -> Tuple[float, DataArray]: + datetime_coord: str = None) -> Tuple[float, DataArray]: """Calculate baseflow index. Ratio of mean baseflow to mean discharge [#]_. If `da` contains NaN values, the baseflow is calculated for each @@ -515,8 +522,9 @@ def baseflow_index(da: DataArray, n_passes : int, optional Number of passes (alternating forward and backward) to perform. Should be an odd number. 
If None, will use 3 for daily and 9 for hourly data and fail for all other input frequencies. - coord : str, optional - Datetime coordinate in `da`, used to infer the frequency if `n_passes` is None. + datetime_coord : str, optional + Datetime coordinate in the passed DataArray. Tried to infer automatically if not specified. Used to infer the + frequency if `n_passes` is None. Returns ------- @@ -535,9 +543,11 @@ def baseflow_index(da: DataArray, Lyne and Hollick Filter. Australasian Journal of Water Resources, Taylor & Francis, 2013, 17, 25--34, doi:10.7158/13241583.2013.11465417 """ + if datetime_coord is None: + datetime_coord = utils.infer_datetime_coord(da) if n_passes is None: - freq = infer_frequency(da[coord].values) + freq = utils.infer_frequency(da[datetime_coord].values) if freq == '1D': n_passes = 3 elif freq == '1H': @@ -595,7 +605,7 @@ def slope_fdc(da: DataArray, lower_quantile: float = 0.33, upper_quantile: float return value -def runoff_ratio(da: DataArray, prcp: DataArray, coord: str = 'date') -> float: +def runoff_ratio(da: DataArray, prcp: DataArray, datetime_coord: str = None) -> float: """Calculate runoff ratio. Runoff ratio (ratio of mean discharge to mean precipitation) [#]_ (Eq. 2). @@ -606,8 +616,8 @@ def runoff_ratio(da: DataArray, prcp: DataArray, coord: str = 'date') -> float: Array of flow values. prcp : DataArray Array of precipitation values. - coord : str, optional - Datetime dimension name in `da`. + datetime_coord : str, optional + Datetime coordinate in the passed DataArray. Tried to infer automatically if not specified. Returns ------- @@ -620,11 +630,14 @@ def runoff_ratio(da: DataArray, prcp: DataArray, coord: str = 'date') -> float: analysis of hydrologic similarity based on catchment function in the eastern USA. 
Hydrology and Earth System Sciences, 2011, 15, 2895--2911, doi:10.5194/hess-15-2895-2011 """ + if datetime_coord is None: + datetime_coord = utils.infer_datetime_coord(da) + # rename precip coordinate name (to avoid problems with 'index' or 'date') - prcp = prcp.rename({list(prcp.coords.keys())[0]: coord}) + prcp = prcp.rename({list(prcp.coords.keys())[0]: datetime_coord}) # slice prcp to the same time window as the discharge - prcp = prcp.sel({coord: slice(da.coords[coord][0], da.coords[coord][-1])}) + prcp = prcp.sel({datetime_coord: slice(da.coords[datetime_coord][0], da.coords[datetime_coord][-1])}) # calculate runoff ratio value = da.mean() / prcp.mean() @@ -632,7 +645,7 @@ def runoff_ratio(da: DataArray, prcp: DataArray, coord: str = 'date') -> float: return float(value) -def stream_elas(da: DataArray, prcp: DataArray, coord: str = 'date') -> float: +def stream_elas(da: DataArray, prcp: DataArray, datetime_coord: str = None) -> float: """Calculate stream elasticity. Streamflow precipitation elasticity (sensitivity of streamflow to changes in precipitation at @@ -644,8 +657,8 @@ def stream_elas(da: DataArray, prcp: DataArray, coord: str = 'date') -> float: Array of flow values. prcp : DataArray Array of precipitation values. - coord : str, optional - Datetime dimension name in `da`. + datetime_coord : str, optional + Datetime coordinate in the passed DataArray. Tried to infer automatically if not specified. Returns ------- @@ -657,15 +670,18 @@ def stream_elas(da: DataArray, prcp: DataArray, coord: str = 'date') -> float: .. [#] Sankarasubramanian, A., Vogel, R. M., and Limbrunner, J. F.: Climate elasticity of streamflow in the United States. 
Water Resources Research, 2001, 37, 1771--1781, doi:10.1029/2000WR900330 """ + if datetime_coord is None: + datetime_coord = utils.infer_datetime_coord(da) + # rename precip coordinate name (to avoid problems with 'index' or 'date') - prcp = prcp.rename({list(prcp.coords.keys())[0]: coord}) + prcp = prcp.rename({list(prcp.coords.keys())[0]: datetime_coord}) # slice prcp to the same time window as the discharge - prcp = prcp.sel({coord: slice(da.coords[coord][0], da.coords[coord][-1])}) + prcp = prcp.sel({datetime_coord: slice(da.coords[datetime_coord][0], da.coords[datetime_coord][-1])}) # determine the date of the first October 1st in the data period - first_date = da.coords[coord][0].values.astype('datetime64[s]').astype(datetime) - last_date = da.coords[coord][-1].values.astype('datetime64[s]').astype(datetime) + first_date = da.coords[datetime_coord][0].values.astype('datetime64[s]').astype(datetime) + last_date = da.coords[datetime_coord][-1].values.astype('datetime64[s]').astype(datetime) if first_date > datetime.strptime(f'{first_date.year}-10-01', '%Y-%m-%d'): start_date = datetime.strptime(f'{first_date.year + 1}-10-01', '%Y-%m-%d') @@ -685,8 +701,8 @@ def stream_elas(da: DataArray, prcp: DataArray, coord: str = 'date') -> float: values = [] while end_date < last_date: - q = da.sel({coord: slice(start_date, end_date)}) - p = prcp.sel({coord: slice(start_date, end_date)}) + q = da.sel({datetime_coord: slice(start_date, end_date)}) + p = prcp.sel({datetime_coord: slice(start_date, end_date)}) val = (q.mean() - q_mean_total) / (p.mean() - p_mean_total) * (p_mean_total / q_mean_total) values.append(val) From 89580e6ee05988521d2f07a4d2c96c63dd52b922 Mon Sep 17 00:00:00 2001 From: Frederik Kratzert Date: Mon, 5 Oct 2020 14:18:12 +0200 Subject: [PATCH 04/15] Added introduction notebook to docs, some docs updates (#3) * Added introduction notebook * added nbsphinx resources * Apply suggestions from code review Co-authored-by: Martin Gauch 
<15731649+gauchm@users.noreply.github.com> * Update package name, integrated code review comments * Update docs/source/usage/models.rst Co-authored-by: Martin Gauch <15731649+gauchm@users.noreply.github.com> Co-authored-by: Martin Gauch <15731649+gauchm@users.noreply.github.com> --- .gitignore | 4 +- docs/source/api/neuralhydrology.rst | 4 +- docs/source/conf.py | 16 +- docs/source/index.rst | 8 +- docs/source/tutorials/index.rst | 11 + docs/source/tutorials/introduction.nblink | 3 + docs/source/usage/models.rst | 54 +- docs/source/usage/quickstart.rst | 19 +- environments/environment_cpu.yml | 3 +- environments/environment_cuda10_2.yml | 3 +- environments/environment_cuda9_2.yml | 3 +- examples/01-Introduction/1_basin.txt | 1 + examples/01-Introduction/1_basin.yml | 145 ++++ examples/01-Introduction/Introduction.ipynb | 891 ++++++++++++++++++++ 14 files changed, 1124 insertions(+), 41 deletions(-) create mode 100644 docs/source/tutorials/index.rst create mode 100644 docs/source/tutorials/introduction.nblink create mode 100644 examples/01-Introduction/1_basin.txt create mode 100644 examples/01-Introduction/1_basin.yml create mode 100644 examples/01-Introduction/Introduction.ipynb diff --git a/.gitignore b/.gitignore index c1832336..3a8ddfb2 100644 --- a/.gitignore +++ b/.gitignore @@ -12,8 +12,8 @@ dist/* neuralhydrology.egg-info/* .vscode/* .idea/* -runs/* +runs/ configs/* -.ipynb_checkpoints/* +.ipynb_checkpoints/ data/* docs/build/* diff --git a/docs/source/api/neuralhydrology.rst b/docs/source/api/neuralhydrology.rst index 21746964..eddeb281 100644 --- a/docs/source/api/neuralhydrology.rst +++ b/docs/source/api/neuralhydrology.rst @@ -1,5 +1,5 @@ -neuralhydrology -=============== +neuralhydrology API +=================== .. automodule:: neuralhydrology :members: diff --git a/docs/source/conf.py b/docs/source/conf.py index 75568bd9..044745aa 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -10,18 +10,22 @@ # add these directories to sys.path here. 
If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # +import datetime import os import sys sys.path.insert(0, os.path.abspath('.')) sys.path.insert(0, os.path.abspath('../../')) # -- Project information ----------------------------------------------------- +about = {} +with open('../../neuralhydrology/__about__.py', "r") as fp: + exec(fp.read(), about) -project = 'NeuralHydrology' -copyright = '2020, Frederik Kratzert' +project = 'neuralHydrology' +copyright = f'{datetime.datetime.now().year}, Frederik Kratzert' author = 'Frederik Kratzert' # The full version, including alpha/beta/rc tags -release = '0.9.0-beta' +release = about["__version__"] # -- General configuration --------------------------------------------------- @@ -31,7 +35,9 @@ extensions = [ 'sphinx.ext.autodoc', # autodocument 'sphinx.ext.napoleon', # google and numpy doc string support - 'sphinx.ext.mathjax' # latex rendering of equations using MathJax + 'sphinx.ext.mathjax', # latex rendering of equations using MathJax + 'nbsphinx', # for direct embedding of jupyter notebooks into sphinx docs + 'nbsphinx_link' # to be able to include notebooks from outside of the docs folder ] # Add any paths that contain templates here, relative to this directory. @@ -40,7 +46,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = [] +exclude_patterns = ['**.ipynb_checkpoints'] # -- Options for HTML output ------------------------------------------------- diff --git a/docs/source/index.rst b/docs/source/index.rst index c57b6085..0e023dc4 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -3,13 +3,17 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -Welcome to NeuralHydrology's documentation! 
+Welcome to neuralHydrology's documentation! =========================================== +The documentation is still work-in-progress. Stay tuned for a lot of updates during the next days/weeks, as well as +a handful of tutorials. + .. toctree:: - :maxdepth: 5 + :maxdepth: 2 :caption: Contents: usage/quickstart usage/models + tutorials/index api/neuralhydrology diff --git a/docs/source/tutorials/index.rst b/docs/source/tutorials/index.rst new file mode 100644 index 00000000..5f6e8275 --- /dev/null +++ b/docs/source/tutorials/index.rst @@ -0,0 +1,11 @@ +Tutorials +--------- + +We will gradually add more tutorials over the next couple of weeks, highlighting some of +the functionality of this Python package. + +.. toctree:: + :maxdepth: 1 + :caption: Contents: + + introduction diff --git a/docs/source/tutorials/introduction.nblink b/docs/source/tutorials/introduction.nblink new file mode 100644 index 00000000..bd966504 --- /dev/null +++ b/docs/source/tutorials/introduction.nblink @@ -0,0 +1,3 @@ +{ + "path": "../../../examples/01-Introduction/Introduction.ipynb" +} \ No newline at end of file diff --git a/docs/source/usage/models.rst b/docs/source/usage/models.rst index 567b77bb..e1095ad4 100644 --- a/docs/source/usage/models.rst +++ b/docs/source/usage/models.rst @@ -1,28 +1,29 @@ -Models -====== +Modelzoo +======== -The following section gives an overview over all implemented models. +The following section gives an overview of all implemented models. See `Implementing a new model`_ for details +on how to add your own model to the neuralHydrology package. + +BaseModel +--------- +Abstract base class from which all models derive. Do not use this class for model training. CudaLSTM -------- :py:class:`neuralhydrology.modelzoo.cudalstm.CudaLSTM` is a network using the standard PyTorch LSTM implementation. -All features (``x_d``, ``x_s``, ``x_one_hot``) are concatenated and passed at each time step. 
-Initial forget gate bias can be set (in config.yml) and will be set during model initialization. +All features (``x_d``, ``x_s``, ``x_one_hot``) are concatenated and passed to the network at each time step. +The initial forget gate bias can be defined in config.yml (``initial_forget_bias``) and will be set accordingly during +model initialization. EA-LSTM ------- -:py:class:`neuralhydrology.modelzoo.ealstm.EALSTM` is an implementation of the Entity-Aware LSTM, as used -in the 2019 HESS paper. The static features (``x_s`` and/or ``x_one_hot``) are used to compute the input gate -activations, while ``x_d`` is used in all other gates of the network. -Initial forget gate bias can be set, and if ``embedding_hiddens`` is passed, the input gate consists of the so-defined +:py:class:`neuralhydrology.modelzoo.ealstm.EALSTM` is an implementation of the Entity-Aware LSTM, as introduced in +`Kratzert et al. "Towards learning universal, regional, and local hydrological behaviors via machine learning applied to large-sample datasets" <https://doi.org/10.5194/hess-23-5089-2019>`__. +The static features (``x_s`` and/or ``x_one_hot``) are used to compute the input gate activations, while the dynamic +inputs ``x_d`` are used in all other gates of the network. +The initial forget gate bias can be defined in config.yml (``initial_forget_bias``). If ``embedding_hiddens`` is passed, the input gate consists of the so-defined FC network and not a single linear layer. -LSTM ----- -:py:class:`neuralhydrology.modelzoo.lstm.LSTM` is an own LSTM implementation. -Momentarily, the only advantage compared to CudaLSTM is the return of the entire cell state array. -This class will be most likely adapted/changed in the near future to provide much more flexibility for various settings. - EmbCudaLSTM ----------- :py:class:`neuralhydrology.modelzoo.embcudalstm.EmbCudaLSTM` is similar to `CudaLSTM`_, @@ -31,6 +32,26 @@ with the only difference that static inputs (``x_s`` and/or ``x_one_hot``) are p at each time step.
+LSTM +---- +:py:class:`neuralhydrology.modelzoo.lstm.LSTM` is a PyTorch port of the CudaLSTM that returns all gate and state +activations for all time steps. This class is implemented for exploratory reasons. You can use the method +``model.copy_weights()`` to copy the weights of a ``CudaLSTM`` model into an ``LSTM`` model. This allows you to use the fast +CUDA implementation for training, and only use this class for inference with more detailed outputs. + +MultiFreqLSTM +------------- +:py:class:`neuralhydrology.modelzoo.multifreqlstm.MultiFreqLSTM` is a newly proposed model by Gauch et al. (pre-print +published soon). This model allows training on more than one temporal frequency (e.g. daily and hourly inputs) and +returns multi-frequency model predictions accordingly. A more detailed tutorial will follow shortly. + +ODELSTM +------- +:py:class:`neuralhydrology.modelzoo.odelstm.ODELSTM` is a PyTorch implementation of the ODE-LSTM proposed by +`Lechner and Hasani <https://arxiv.org/abs/2006.04418>`_. This model can be used with unevenly sampled inputs and can +be queried to return predictions for any arbitrary time step. + + Implementing a new model ------------------------ The listing below shows the skeleton of a template model you can use to start implementing your own model. @@ -38,7 +59,7 @@ Once you have implemented your model, make sure to modify :py:func:`neuralhydrol Furthermore, make sure to select a *unique* model abbreviation that will be used to specify the model in the config.yml files. -:: +.. code-block:: python from typing import Dict @@ -105,4 +126,3 @@ files. # Implement forward pass here # ############################### pass - diff --git a/docs/source/usage/quickstart.rst b/docs/source/usage/quickstart.rst index 25c396c5..18a018a8 100644 --- a/docs/source/usage/quickstart.rst +++ b/docs/source/usage/quickstart.rst @@ -3,18 +3,17 @@ Quick Start Installation ------------ -The neuralhydrology project is available on PyPI.
-Hence, installation is as easy as:: +For now, download or clone the repository to your local machine and install a local, editable copy. +This is a good idea if you want to edit the ``neuralhydrology`` code (e.g., adding new models or datasets). - pip install neuralhydrology +.. code-block:: -Alternatively, you can clone the repository and install the local, editable copy. This is a good idea if you want to -edit the ``neuralhydrology`` code (e.g., adding new models or datasets).:: - - git clone https://github.com/kratzert/lstm_based_hydrology.git - cd lstm_based_hydrology + git clone https://github.com/neuralhydrology/neuralhydrology.git + cd neuralhydrology pip install -e . +Besides adding the package to your Python environment, it will also add three bash scripts: +``nh-run``, ``nh-run-scheduler`` and ``nh-results-ensemble``. For details, see below. Data ---- @@ -35,7 +34,7 @@ To train a model, prepare a configuration file, then run:: If you want to train multiple models, you can make use of the ``nh-run-scheduler`` command. Place all configs in a folder, then run:: - nh-run-scheduler --config-dir /path/to/config_dir/ --runs-per-gpu X --gpu-ids Y + nh-run-scheduler train --config-dir /path/to/config_dir/ --runs-per-gpu X --gpu-ids Y With X, you can specify how many models should be trained on parallel on a single GPU. With Y, you can specify which GPUs to use for training (use the id as specified in ``nvidia-smi``). @@ -52,7 +51,7 @@ the weights of the last epoch are used. To evaluate all runs in a specific directory you can, similarly to training, run:: - nh-run-scheduler --mode evaluate --run-dir /path/to/config_dir/ --runs-per-gpu X --gpu-ids Y + nh-run-scheduler evaluate --run-dir /path/to/config_dir/ --runs-per-gpu X --gpu-ids Y To merge the predictons of a number of runs (stored in ``$DIR1``, ...)
into one averaged ensemble, diff --git a/environments/environment_cpu.yml b/environments/environment_cpu.yml index cb1e768b..ce9d16af 100644 --- a/environments/environment_cpu.yml +++ b/environments/environment_cpu.yml @@ -27,4 +27,5 @@ dependencies: - pip: - tensorboard - sphinx-rtd-theme - + - nbsphinx + - nbsphinx-link \ No newline at end of file diff --git a/environments/environment_cuda10_2.yml b/environments/environment_cuda10_2.yml index 0b2f5a8f..1e6ab168 100644 --- a/environments/environment_cuda10_2.yml +++ b/environments/environment_cuda10_2.yml @@ -27,4 +27,5 @@ dependencies: - pip: - tensorboard - sphinx-rtd-theme - + - nbsphinx + - nbsphinx-link \ No newline at end of file diff --git a/environments/environment_cuda9_2.yml b/environments/environment_cuda9_2.yml index 5e06c5ff..54020dbd 100644 --- a/environments/environment_cuda9_2.yml +++ b/environments/environment_cuda9_2.yml @@ -27,4 +27,5 @@ dependencies: - pip: - tensorboard - sphinx-rtd-theme - + - nbsphinx + - nbsphinx-link \ No newline at end of file diff --git a/examples/01-Introduction/1_basin.txt b/examples/01-Introduction/1_basin.txt new file mode 100644 index 00000000..94860eb2 --- /dev/null +++ b/examples/01-Introduction/1_basin.txt @@ -0,0 +1 @@ +01022500 diff --git a/examples/01-Introduction/1_basin.yml b/examples/01-Introduction/1_basin.yml new file mode 100644 index 00000000..0b5e3e28 --- /dev/null +++ b/examples/01-Introduction/1_basin.yml @@ -0,0 +1,145 @@ +# --- Experiment configurations -------------------------------------------------------------------- + +# experiment name, used as folder name +experiment_name: test_run + +# files to specify training, validation and test basins (relative to code root or absolute path) +train_basin_file: 1_basin.txt +validation_basin_file: 1_basin.txt +test_basin_file: 1_basin.txt + +# training, validation and test time periods (format = 'dd/mm/yyyy') +train_start_date: '01/10/1999' +train_end_date: '30/09/2008' +validation_start_date: 
'01/10/1980' +validation_end_date: '30/09/1989' +test_start_date: '01/10/1989' +test_end_date: '30/09/1999' + +# which GPU (id) to use [in format of cuda:0, cuda:1 etc, or cpu or None] +device: cuda:0 + +# --- Validation configuration --------------------------------------------------------------------- + +# specify after how many epochs to perform validation +validate_every: 3 + +# specify how many random basins to use for validation +validate_n_random_basins: 1 + +# specify which metrics to calculate during validation (see neuralhydrology.evaluation.metrics) +# this can either be a list or a dictionary. If a dictionary is used, the inner keys must match the name of the +# target_variable specified below. Using dicts allows for different metrics per target variable. +metrics: +- NSE + +# --- Model configuration -------------------------------------------------------------------------- + +# base model type [lstm, ealstm, cudalstm, embcudalstm, multifreqlstm] +# (has to match the if statement in modelzoo/__init__.py) +model: cudalstm + +# prediction head [regression]. Define the head specific parameters below +head: regression + +# ----> Regression settings <---- +output_activation: linear + +# ----> General settings <---- + +# Number of cell states of the LSTM +hidden_size: 20 + +# Initial bias value of the forget gate +initial_forget_bias: 3 + +# Dropout applied to the output of the LSTM +output_dropout: 0.4 + +# --- Training configuration ----------------------------------------------------------------------- + +# specify optimizer [Adam] +optimizer: Adam + +# specify loss [MSE, NSE, RMSE] +loss: MSE + +# specify learning rates to use starting at specific epochs (0 is the initial learning rate) +learning_rate: + 0: 1e-2 + 30: 5e-3 + 40: 1e-3 + +# Mini-batch size +batch_size: 256 + +# Number of training epochs +epochs: 50 + +# If a value, clips the gradients during training to that norm. 
+clip_gradient_norm: 1 + +# Defines which time steps are used to calculate the loss. Can't be larger than seq_length. +# If use_frequencies is used, this needs to be a dict mapping each frequency to a predict_last_n-value, else an int. +predict_last_n: 1 + +# Length of the input sequence +# If use_frequencies is used, this needs to be a dict mapping each frequency to a seq_length, else an int. +seq_length: 365 + +# Number of parallel workers used in the data pipeline +num_workers: 8 + +# Log the training loss every n steps +log_interval: 5 + +# If true, writes logging results into tensorboard file +log_tensorboard: True + +# If a value and greater than 0, logs n random basins as figures during validation +log_n_figures: 1 + +# Save model weights every n epochs +save_weights_every: 1 + +# --- Data configurations -------------------------------------------------------------------------- + +# which data set to use [camels_us, camels_gb, global, hourly_camels_us] +dataset: camels_us + +# Path to data set root +data_dir: /data/Hydrology/CAMELS_US + +# Forcing product [daymet, maurer, maurer_extended, nldas, nldas_extended, nldas_hourly] +# can be either a list of forcings or a single forcing product +forcings: +- maurer_extended +- daymet +- nldas_extended + +dynamic_inputs: +- PRCP(mm/day)_nldas_extended +- SRAD(W/m2)_nldas_extended +- Tmax(C)_nldas_extended +- Tmin(C)_nldas_extended +- Vp(Pa)_nldas_extended +- prcp(mm/day)_maurer_extended +- srad(W/m2)_maurer_extended +- tmax(C)_maurer_extended +- tmin(C)_maurer_extended +- vp(Pa)_maurer_extended +- prcp(mm/day)_daymet +- srad(W/m2)_daymet +- tmax(C)_daymet +- tmin(C)_daymet +- vp(Pa)_daymet + +# which columns to use as target +target_variables: +- QObs(mm/d) + +# clip negative predictions to zero for all variables listed below. Should be a list, even for single variables. 
+clip_target_to_zero: +- QObs(mm/d) + +zero_center_target: True diff --git a/examples/01-Introduction/Introduction.ipynb b/examples/01-Introduction/Introduction.ipynb new file mode 100644 index 00000000..aa2525cf --- /dev/null +++ b/examples/01-Introduction/Introduction.ipynb @@ -0,0 +1,891 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction to neuralHydrology\n", + "The Python package `neuralHydrology` was developed with a strong focus on research. The main application area is hydrology; however, in principle the code can be used with any data. To allow fast iteration of research ideas, we tried to develop the package as modular as possible so that new models, new data sets, new loss functions, new regularizations, new metrics etc. can be integrated with minor effort.\n", + "\n", + "There are two different ways to use this package:\n", + "\n", + "1. From the terminal, making use of some high-level entry points (such as `nh-run` and `nh-run-scheduler`)\n", + "2. From any other Python file or Jupyter Notebook, using neuralHydrology's API\n", + "\n", + "In this tutorial, we will give a very short overview of the two different modes.\n", + "\n", + "Both approaches require a **configuration file**. These are `.yml` files which define the entire run configuration (such as data set, basins, data periods, model specifications, etc.). A full list of config arguments is listed in the [Wiki on GitHub](https://github.com/neuralhydrology/neuralhydrology/wiki/Config-arguments) and we highly recommend checking this page and reading the documentation carefully. There is a lot that you can do with this Python package and we can't cover everything in tutorials.\n", + "\n", + "For every run that you start, a new folder will be created.
This folder is used to store the model and optimizer checkpoints, train data means/stds (needed for scaling during inference), tensorboard log file (can be used to monitor and compare training runs visually), validation results (optionally) and training progress figures (optionally, e.g., model predictions and observations for _n_ random basins). During inference, the evaluation results will also be stored in this directory (e.g., test period results).\n", + "\n", + "\n", + "### TensorBoard logging\n", + "By default, the training progress is logged in TensorBoard files (add `log_tensorboard: False` to the config to disable TensorBoard logging). If you installed a Python environment from one of our environment files, you have TensorBoard already installed. If not, you can install TensorBoard with:\n", + "\n", + "```\n", + "pip install tensorboard\n", + "``` \n", + "\n", + "To start the TensorBoard dashboard, run:\n", + "\n", + "```\n", + "tensorboard --logdir /path/to/run-dir\n", + "```\n", + "\n", + "You can also visualize multiple runs at once if you point the `--logdir` to the parent directory (useful for model intercomparison)\n", + "\n", + "### File logging\n", + "In addition to TensorBoard, you will always find a file called `output.log` in the run directory. This file is a dump of the console output you see during training and evaluation.\n", + "\n", + "\n", + "## Using `neuralHydrology` from the Terminal\n", + "\n", + "### nh-run\n", + "\n", + "\n", + "Given a run configuration file, you can use the bash command `nh-run` to train/evaluate a model. To train a model, use\n", + "\n", + "\n", + "```bash\n", + "nh-run train --config-file path/to/config.yml\n", + "```\n", + "\n", + "to evaluate the model after training, use\n", + "\n", + "```bash\n", + "nh-run evaluate --run-dir path/to/run-directory\n", + "```\n", + "\n", + "### nh-run-scheduler\n", + "\n", + "If you want to train/evaluate multiple models on different GPUs, you can use the `nh-run-scheduler`. 
This tool automatically distributes runs across GPUs and starts a new one whenever one run finishes.\n", + "\n", + "Calling `nh-run-scheduler` in `train` mode will train one model for each `.yml` file in a directory (or its sub-directories).\n", + "\n", + "```bash\n", + "nh-run-scheduler train --directory /path/to/config-dir --runs-per-gpu 2 --gpu-ids 0 1 2 3 \n", + "```\n", + "Use `--runs-per-gpu` to define the number of models that are simultaneously trained on a _single_ GPU (2 in this case) and `--gpu-ids` to define which GPUs will be used (numbers are ids according to nvidia-smi). In this example, 8 models will train simultaneously on 4 different GPUs.\n", + "\n", + "Calling `nh-run-scheduler` in `evaluate` mode will evaluate all models in all run directories in a given root directory.\n", + "\n", + "```bash\n", + "nh-run-scheduler evaluate --directory /path/to/parent-run-dir/ --runs-per-gpu 2 --gpu-ids 0 1 2 3 \n", + "```\n", + "\n", + "## API usage\n", + "\n", + "Besides the command line tools, you can also use the neuralHydrology package just like any other Python package by importing its modules, classes, or functions.\n", + "\n", + "This can be helpful for exploratory studies with trained models, but also if you want to use some of the functions or classes within a different codebase. \n", + "\n", + "Look at the [API Documentation](https://neuralhydrology.readthedocs.io/en/latest/api/neuralhydrology.html) for a full list of functions/classes you could use.\n", + "\n", + "The following example shows how to train and evaluate a model via the API."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pickle\n", + "from pathlib import Path\n", + "\n", + "import matplotlib.pyplot as plt\n", + "from neuralhydrology.evaluation import metrics\n", + "from neuralhydrology.nh_run import start_run, eval_run" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Train a model for a single config file" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2020-10-05 12:52:52,508: Logging to /home/frederik/Projects/neuralhydrology/examples/01-Introduction/runs/test_run_0510_125252/output.log initialized.\n", + "2020-10-05 12:52:52,510: ### Folder structure created at /home/frederik/Projects/neuralhydrology/examples/01-Introduction/runs/test_run_0510_125252\n", + "2020-10-05 12:52:52,510: ### Run configurations for test_run\n", + "2020-10-05 12:52:52,511: experiment_name: test_run\n", + "2020-10-05 12:52:52,512: train_basin_file: 1_basin.txt\n", + "2020-10-05 12:52:52,513: validation_basin_file: 1_basin.txt\n", + "2020-10-05 12:52:52,514: test_basin_file: 1_basin.txt\n", + "2020-10-05 12:52:52,514: train_start_date: 1999-10-01 00:00:00\n", + "2020-10-05 12:52:52,515: train_end_date: 2008-09-30 00:00:00\n", + "2020-10-05 12:52:52,516: validation_start_date: 1980-10-01 00:00:00\n", + "2020-10-05 12:52:52,517: validation_end_date: 1989-09-30 00:00:00\n", + "2020-10-05 12:52:52,517: test_start_date: 1989-10-01 00:00:00\n", + "2020-10-05 12:52:52,518: test_end_date: 1999-09-30 00:00:00\n", + "2020-10-05 12:52:52,519: device: cuda:0\n", + "2020-10-05 12:52:52,520: validate_every: 3\n", + "2020-10-05 12:52:52,520: validate_n_random_basins: 1\n", + "2020-10-05 12:52:52,521: metrics: ['NSE']\n", + "2020-10-05 12:52:52,522: model: cudalstm\n", + "2020-10-05 12:52:52,524: head: regression\n", + "2020-10-05 
12:52:52,524: output_activation: linear\n", + "2020-10-05 12:52:52,525: hidden_size: 20\n", + "2020-10-05 12:52:52,525: initial_forget_bias: 3\n", + "2020-10-05 12:52:52,526: output_dropout: 0.4\n", + "2020-10-05 12:52:52,527: optimizer: Adam\n", + "2020-10-05 12:52:52,527: loss: MSE\n", + "2020-10-05 12:52:52,528: learning_rate: {0: 0.01, 30: 0.005, 40: 0.001}\n", + "2020-10-05 12:52:52,529: batch_size: 256\n", + "2020-10-05 12:52:52,529: epochs: 50\n", + "2020-10-05 12:52:52,530: clip_gradient_norm: 1\n", + "2020-10-05 12:52:52,530: predict_last_n: 1\n", + "2020-10-05 12:52:52,531: seq_length: 365\n", + "2020-10-05 12:52:52,533: num_workers: 8\n", + "2020-10-05 12:52:52,534: log_interval: 5\n", + "2020-10-05 12:52:52,535: log_tensorboard: True\n", + "2020-10-05 12:52:52,536: log_n_figures: 1\n", + "2020-10-05 12:52:52,537: save_weights_every: 1\n", + "2020-10-05 12:52:52,537: dataset: camels_us\n", + "2020-10-05 12:52:52,538: data_dir: /data/Hydrology/CAMELS_US\n", + "2020-10-05 12:52:52,539: forcings: ['maurer_extended', 'daymet', 'nldas_extended']\n", + "2020-10-05 12:52:52,539: dynamic_inputs: ['PRCP(mm/day)_nldas_extended', 'SRAD(W/m2)_nldas_extended', 'Tmax(C)_nldas_extended', 'Tmin(C)_nldas_extended', 'Vp(Pa)_nldas_extended', 'prcp(mm/day)_maurer_extended', 'srad(W/m2)_maurer_extended', 'tmax(C)_maurer_extended', 'tmin(C)_maurer_extended', 'vp(Pa)_maurer_extended', 'prcp(mm/day)_daymet', 'srad(W/m2)_daymet', 'tmax(C)_daymet', 'tmin(C)_daymet', 'vp(Pa)_daymet']\n", + "2020-10-05 12:52:52,540: target_variables: ['QObs(mm/d)']\n", + "2020-10-05 12:52:52,541: clip_target_to_zero: ['QObs(mm/d)']\n", + "2020-10-05 12:52:52,542: zero_center_target: True\n", + "2020-10-05 12:52:52,543: number_of_basins: 1\n", + "2020-10-05 12:52:52,544: run_dir: /home/frederik/Projects/neuralhydrology/examples/01-Introduction/runs/test_run_0510_125252\n", + "2020-10-05 12:52:52,544: train_dir: 
/home/frederik/Projects/neuralhydrology/examples/01-Introduction/runs/test_run_0510_125252/train_data\n", + "2020-10-05 12:52:52,545: img_log_dir: /home/frederik/Projects/neuralhydrology/examples/01-Introduction/runs/test_run_0510_125252/img_log\n", + "2020-10-05 12:52:52,586: ### Device cuda:0 will be used for training\n", + "2020-10-05 12:52:55,028: Loading basin data into xarray data set.\n", + "100%|██████████| 1/1 [00:00<00:00, 5.97it/s]\n", + "2020-10-05 12:52:55,213: Create lookup table and convert to pytorch tensor\n", + "100%|██████████| 1/1 [00:01<00:00, 1.29s/it]\n", + "# Epoch 1: 100%|██████████| 13/13 [00:00<00:00, 21.75it/s, Loss: 0.3369]\n", + "2020-10-05 12:52:57,229: Epoch 1 average loss: 0.40770340997439164\n", + "# Epoch 2: 100%|██████████| 13/13 [00:00<00:00, 21.06it/s, Loss: 0.1233]\n", + "2020-10-05 12:52:57,852: Epoch 2 average loss: 0.2696097624989656\n", + "# Epoch 3: 100%|██████████| 13/13 [00:00<00:00, 20.21it/s, Loss: 0.1440]\n", + "2020-10-05 12:52:58,503: Epoch 3 average loss: 0.2056583693394294\n", + "# Validation: 100%|██████████| 1/1 [00:00<00:00, 1.15it/s]\n", + "2020-10-05 12:52:59,650: -- Median validation metrics:NSE: 0.62918\n", + "# Epoch 4: 100%|██████████| 13/13 [00:00<00:00, 21.25it/s, Loss: 0.1681]\n", + "2020-10-05 12:53:00,265: Epoch 4 average loss: 0.16614325917684114\n", + "# Epoch 5: 100%|██████████| 13/13 [00:00<00:00, 20.18it/s, Loss: 0.0893]\n", + "2020-10-05 12:53:00,916: Epoch 5 average loss: 0.1314379280576339\n", + "# Epoch 6: 100%|██████████| 13/13 [00:00<00:00, 20.25it/s, Loss: 0.1715]\n", + "2020-10-05 12:53:01,565: Epoch 6 average loss: 0.11709093875609912\n", + "# Validation: 100%|██████████| 1/1 [00:00<00:00, 3.77it/s]\n", + "2020-10-05 12:53:02,108: -- Median validation metrics:NSE: 0.70598\n", + "# Epoch 7: 100%|██████████| 13/13 [00:00<00:00, 19.89it/s, Loss: 0.0765]\n", + "2020-10-05 12:53:02,766: Epoch 7 average loss: 0.10338054998562886\n", + "# Epoch 8: 100%|██████████| 13/13 [00:00<00:00, 
19.77it/s, Loss: 0.0803]\n", + "2020-10-05 12:53:03,430: Epoch 8 average loss: 0.09337395773484157\n", + "# Epoch 9: 100%|██████████| 13/13 [00:00<00:00, 18.54it/s, Loss: 0.0631]\n", + "2020-10-05 12:53:04,137: Epoch 9 average loss: 0.09041231240217502\n", + "# Validation: 100%|██████████| 1/1 [00:00<00:00, 3.74it/s]\n", + "2020-10-05 12:53:04,691: -- Median validation metrics:NSE: 0.72472\n", + "# Epoch 10: 100%|██████████| 13/13 [00:01<00:00, 11.72it/s, Loss: 0.1419]\n", + "2020-10-05 12:53:05,804: Epoch 10 average loss: 0.08599556208803104\n", + "# Epoch 11: 100%|██████████| 13/13 [00:00<00:00, 20.62it/s, Loss: 0.1633]\n", + "2020-10-05 12:53:06,442: Epoch 11 average loss: 0.08094931651766483\n", + "# Epoch 12: 100%|██████████| 13/13 [00:00<00:00, 20.40it/s, Loss: 0.0485]\n", + "2020-10-05 12:53:07,085: Epoch 12 average loss: 0.0698587424479998\n", + "# Validation: 100%|██████████| 1/1 [00:00<00:00, 4.31it/s]\n", + "2020-10-05 12:53:07,590: -- Median validation metrics:NSE: 0.77600\n", + "# Epoch 13: 100%|██████████| 13/13 [00:00<00:00, 21.13it/s, Loss: 0.0707]\n", + "2020-10-05 12:53:08,210: Epoch 13 average loss: 0.07356188331659023\n", + "# Epoch 14: 100%|██████████| 13/13 [00:00<00:00, 22.66it/s, Loss: 0.0825]\n", + "2020-10-05 12:53:08,788: Epoch 14 average loss: 0.07214784335631591\n", + "# Epoch 15: 100%|██████████| 13/13 [00:00<00:00, 21.50it/s, Loss: 0.0473]\n", + "2020-10-05 12:53:09,399: Epoch 15 average loss: 0.06782861340504426\n", + "# Validation: 100%|██████████| 1/1 [00:00<00:00, 3.59it/s]\n", + "2020-10-05 12:53:09,974: -- Median validation metrics:NSE: 0.79518\n", + "# Epoch 16: 100%|██████████| 13/13 [00:00<00:00, 22.83it/s, Loss: 0.0356]\n", + "2020-10-05 12:53:10,548: Epoch 16 average loss: 0.06452611088752747\n", + "# Epoch 17: 100%|██████████| 13/13 [00:00<00:00, 18.94it/s, Loss: 0.0343]\n", + "2020-10-05 12:53:11,239: Epoch 17 average loss: 0.06379958213521884\n", + "# Epoch 18: 100%|██████████| 13/13 [00:00<00:00, 21.93it/s, Loss: 
0.0528]\n", + "2020-10-05 12:53:11,838: Epoch 18 average loss: 0.06280213909653518\n", + "# Validation: 100%|██████████| 1/1 [00:00<00:00, 4.13it/s]\n", + "2020-10-05 12:53:12,351: -- Median validation metrics:NSE: 0.80859\n", + "# Epoch 19: 100%|██████████| 13/13 [00:00<00:00, 21.07it/s, Loss: 0.0701]\n", + "2020-10-05 12:53:12,972: Epoch 19 average loss: 0.0555433054956106\n", + "# Epoch 20: 100%|██████████| 13/13 [00:00<00:00, 21.90it/s, Loss: 0.0627]\n", + "2020-10-05 12:53:13,571: Epoch 20 average loss: 0.062321179188214816\n", + "# Epoch 21: 100%|██████████| 13/13 [00:00<00:00, 21.03it/s, Loss: 0.0453]\n", + "2020-10-05 12:53:14,194: Epoch 21 average loss: 0.05481961842339773\n", + "# Validation: 100%|██████████| 1/1 [00:00<00:00, 4.32it/s]\n", + "2020-10-05 12:53:14,698: -- Median validation metrics:NSE: 0.81559\n", + "# Epoch 22: 100%|██████████| 13/13 [00:00<00:00, 20.03it/s, Loss: 0.1060]\n", + "2020-10-05 12:53:15,351: Epoch 22 average loss: 0.06080526944536429\n", + "# Epoch 23: 100%|██████████| 13/13 [00:00<00:00, 20.79it/s, Loss: 0.0510]\n", + "2020-10-05 12:53:15,983: Epoch 23 average loss: 0.0530552279490691\n", + "# Epoch 24: 100%|██████████| 13/13 [00:00<00:00, 21.31it/s, Loss: 0.0569]\n", + "2020-10-05 12:53:16,598: Epoch 24 average loss: 0.05936390132858203\n", + "# Validation: 100%|██████████| 1/1 [00:00<00:00, 2.85it/s]\n", + "2020-10-05 12:53:17,205: -- Median validation metrics:NSE: 0.79431\n", + "# Epoch 25: 100%|██████████| 13/13 [00:00<00:00, 21.29it/s, Loss: 0.0278]\n", + "2020-10-05 12:53:17,821: Epoch 25 average loss: 0.05501310441356439\n", + "# Epoch 26: 100%|██████████| 13/13 [00:00<00:00, 21.44it/s, Loss: 0.0452]\n", + "2020-10-05 12:53:18,432: Epoch 26 average loss: 0.05229091515334753\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# Epoch 27: 100%|██████████| 13/13 [00:00<00:00, 22.24it/s, Loss: 0.0588]\n", + "2020-10-05 12:53:19,021: Epoch 27 average loss: 0.04917494379557096\n", + "# Validation: 
100%|██████████| 1/1 [00:00<00:00, 4.42it/s]\n", + "2020-10-05 12:53:19,514: -- Median validation metrics:NSE: 0.81387\n", + "# Epoch 28: 100%|██████████| 13/13 [00:00<00:00, 20.37it/s, Loss: 0.0417]\n", + "2020-10-05 12:53:20,157: Epoch 28 average loss: 0.046723469805258974\n", + "# Epoch 29: 100%|██████████| 13/13 [00:00<00:00, 19.72it/s, Loss: 0.0399]\n", + "2020-10-05 12:53:20,822: Epoch 29 average loss: 0.04707713339191217\n", + "2020-10-05 12:53:20,826: Setting learning rate to 0.005\n", + "# Epoch 30: 100%|██████████| 13/13 [00:00<00:00, 20.35it/s, Loss: 0.0569]\n", + "2020-10-05 12:53:21,468: Epoch 30 average loss: 0.04919698060705112\n", + "# Validation: 100%|██████████| 1/1 [00:00<00:00, 3.89it/s]\n", + "2020-10-05 12:53:22,026: -- Median validation metrics:NSE: 0.81469\n", + "# Epoch 31: 100%|██████████| 13/13 [00:00<00:00, 18.51it/s, Loss: 0.0412]\n", + "2020-10-05 12:53:22,732: Epoch 31 average loss: 0.046678220136807516\n", + "# Epoch 32: 100%|██████████| 13/13 [00:00<00:00, 19.51it/s, Loss: 0.0385]\n", + "2020-10-05 12:53:23,405: Epoch 32 average loss: 0.04431860951276926\n", + "# Epoch 33: 100%|██████████| 13/13 [00:00<00:00, 18.42it/s, Loss: 0.0381]\n", + "2020-10-05 12:53:24,116: Epoch 33 average loss: 0.04529069263774615\n", + "# Validation: 100%|██████████| 1/1 [00:00<00:00, 3.29it/s]\n", + "2020-10-05 12:53:24,701: -- Median validation metrics:NSE: 0.82519\n", + "# Epoch 34: 100%|██████████| 13/13 [00:00<00:00, 19.21it/s, Loss: 0.0354]\n", + "2020-10-05 12:53:25,382: Epoch 34 average loss: 0.047089104182445086\n", + "# Epoch 35: 100%|██████████| 13/13 [00:00<00:00, 20.25it/s, Loss: 0.0414]\n", + "2020-10-05 12:53:26,030: Epoch 35 average loss: 0.03992394959697357\n", + "# Epoch 36: 100%|██████████| 13/13 [00:00<00:00, 20.30it/s, Loss: 0.0488]\n", + "2020-10-05 12:53:26,678: Epoch 36 average loss: 0.04217390658763739\n", + "# Validation: 100%|██████████| 1/1 [00:00<00:00, 4.24it/s]\n", + "2020-10-05 12:53:27,180: -- Median validation 
metrics:NSE: 0.82312\n", + "# Epoch 37: 100%|██████████| 13/13 [00:00<00:00, 20.71it/s, Loss: 0.0486]\n", + "2020-10-05 12:53:27,811: Epoch 37 average loss: 0.04324197224699534\n", + "# Epoch 38: 100%|██████████| 13/13 [00:00<00:00, 20.47it/s, Loss: 0.0845]\n", + "2020-10-05 12:53:28,451: Epoch 38 average loss: 0.042667378599827104\n", + "# Epoch 39: 100%|██████████| 13/13 [00:00<00:00, 20.77it/s, Loss: 0.0399]\n", + "2020-10-05 12:53:29,082: Epoch 39 average loss: 0.043971521636614434\n", + "# Validation: 100%|██████████| 1/1 [00:00<00:00, 2.85it/s]\n", + "2020-10-05 12:53:29,695: -- Median validation metrics:NSE: 0.81838\n", + "2020-10-05 12:53:29,696: Setting learning rate to 0.001\n", + "# Epoch 40: 100%|██████████| 13/13 [00:00<00:00, 20.51it/s, Loss: 0.0629]\n", + "2020-10-05 12:53:30,334: Epoch 40 average loss: 0.047052909405185625\n", + "# Epoch 41: 100%|██████████| 13/13 [00:00<00:00, 21.16it/s, Loss: 0.0358]\n", + "2020-10-05 12:53:30,955: Epoch 41 average loss: 0.037803018608918555\n", + "# Epoch 42: 100%|██████████| 13/13 [00:00<00:00, 21.40it/s, Loss: 0.0514]\n", + "2020-10-05 12:53:31,568: Epoch 42 average loss: 0.04351525338223347\n", + "# Validation: 100%|██████████| 1/1 [00:00<00:00, 4.34it/s]\n", + "2020-10-05 12:53:32,075: -- Median validation metrics:NSE: 0.82342\n", + "# Epoch 43: 100%|██████████| 13/13 [00:00<00:00, 21.82it/s, Loss: 0.0397]\n", + "2020-10-05 12:53:32,675: Epoch 43 average loss: 0.03911813692404674\n", + "# Epoch 44: 100%|██████████| 13/13 [00:00<00:00, 18.98it/s, Loss: 0.0404]\n", + "2020-10-05 12:53:33,365: Epoch 44 average loss: 0.042462579905986786\n", + "# Epoch 45: 100%|██████████| 13/13 [00:00<00:00, 19.18it/s, Loss: 0.0339]\n", + "2020-10-05 12:53:34,051: Epoch 45 average loss: 0.04013453395320819\n", + "# Validation: 100%|██████████| 1/1 [00:00<00:00, 3.41it/s]\n", + "2020-10-05 12:53:34,639: -- Median validation metrics:NSE: 0.83013\n", + "# Epoch 46: 100%|██████████| 13/13 [00:00<00:00, 18.73it/s, Loss: 0.0397]\n", + 
"2020-10-05 12:53:35,337: Epoch 46 average loss: 0.04252039125332466\n", + "# Epoch 47: 100%|██████████| 13/13 [00:00<00:00, 19.62it/s, Loss: 0.0451]\n", + "2020-10-05 12:53:36,005: Epoch 47 average loss: 0.03530547395348549\n", + "# Epoch 48: 100%|██████████| 13/13 [00:00<00:00, 21.38it/s, Loss: 0.0298]\n", + "2020-10-05 12:53:36,621: Epoch 48 average loss: 0.039933502387541994\n", + "# Validation: 100%|██████████| 1/1 [00:00<00:00, 4.36it/s]\n", + "2020-10-05 12:53:37,112: -- Median validation metrics:NSE: 0.82898\n", + "# Epoch 49: 100%|██████████| 13/13 [00:00<00:00, 21.71it/s, Loss: 0.0408]\n", + "2020-10-05 12:53:37,714: Epoch 49 average loss: 0.04054238203053291\n", + "# Epoch 50: 100%|██████████| 13/13 [00:00<00:00, 21.83it/s, Loss: 0.0302]\n", + "2020-10-05 12:53:38,315: Epoch 50 average loss: 0.04161823354661465\n" + ] + } + ], + "source": [ + "start_run(config_file=Path(\"1_basin.yml\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Evaluate run on test set\n", + "The run directory that needs to be specified for evaluation is printed in the output log above." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2020-10-05 12:54:54,703: Using the model weights from /home/frederik/Projects/neuralhydrology/examples/01-Introduction/runs/test_run_0510_125252/model_epoch050.pt\n", + "# Evaluation: 100%|██████████| 1/1 [00:00<00:00, 1.79it/s]\n", + "2020-10-05 12:54:55,269: Stored results at /home/frederik/Projects/neuralhydrology/examples/01-Introduction/runs/test_run_0510_125252/test/model_epoch050/test_results.p\n" + ] + } + ], + "source": [ + "run_dir = Path(\"/home/frederik/Projects/neuralhydrology/examples/01-Introduction/runs/test_run_0510_125252\")\n", + "eval_run(run_dir=run_dir, period=\"test\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load and inspect model predictions\n", + "Next, we load the results file and compare the model predictions with observations. The results file is always a pickled dictionary with one key per basin (even for a single basin). The next-lower dictionary level is the temporal resolution of the predictions. In this case, we trained a model only on daily data ('1D'). Within the temporal resolution, the next-lower dictionary level contains `xr` (an xarray Dataset that contains observations and predictions), as well as one key for each metric that was specified in the config file."
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['01022500'])" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "with open(run_dir / \"test\" / \"model_epoch050\" / \"test_results.p\", \"rb\") as fp:\n", + " results = pickle.load(fp)\n", + " \n", + "results.keys()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The data variables in the xarray Dataset are named according to the name of the target variables, with suffix `_obs` for the observations and suffix `_sim` for the simulations." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset>\n",
+       "Dimensions:         (date: 3652, time_step: 1)\n",
+       "Coordinates:\n",
+       "  * date            (date) datetime64[ns] 1989-10-01 1989-10-02 ... 1999-09-30\n",
+       "  * time_step       (time_step) timedelta64[ns] 00:00:00\n",
+       "Data variables:\n",
+       "    QObs(mm/d)_obs  (date, time_step) float32 0.6203073 0.5536971 ... 0.9991529\n",
+       "    QObs(mm/d)_sim  (date, time_step) float32 0.6279986 0.6001396 ... 1.966821
" + ], + "text/plain": [ + "\n", + "Dimensions: (date: 3652, time_step: 1)\n", + "Coordinates:\n", + " * date (date) datetime64[ns] 1989-10-01 1989-10-02 ... 1999-09-30\n", + " * time_step (time_step) timedelta64[ns] 00:00:00\n", + "Data variables:\n", + " QObs(mm/d)_obs (date, time_step) float32 0.6203073 0.5536971 ... 0.9991529\n", + " QObs(mm/d)_sim (date, time_step) float32 0.6279986 0.6001396 ... 1.966821" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results['01022500']['1D']['xr']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's plot the model predictions vs. the observations" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'Test period - NSE 0.791')" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# extract observations and simulations\n", + "qobs = results['01022500']['1D']['xr']['QObs(mm/d)_obs']\n", + "qsim = results['01022500']['1D']['xr']['QObs(mm/d)_sim']\n", + "\n", + "fig, ax = plt.subplots(figsize=(16,10))\n", + "ax.plot(qobs['date'], qobs)\n", + "ax.plot(qsim['date'], qsim)\n", + "ax.set_ylabel(\"Discharge (mm/d)\")\n", + "ax.set_title(f\"Test period - NSE {results['01022500']['1D']['NSE']:.3f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we are going to compute all metrics that are implemented in the neuralHydrology package. You will find additional hydrological signatures implemented in `neuralhydrology.evaluation.signatures`." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NSE: 0.791\n", + "MSE: 1.028\n", + "RMSE: 1.014\n", + "KGE: 0.864\n", + "Alpha-NSE: 0.929\n", + "Beta-NSE: 0.036\n", + "Pearson-r: 0.891\n", + "FHV: -8.793\n", + "FMS: -5.994\n", + "FLV: -876.161\n", + "Peak-Timing: 0.087\n" + ] + } + ], + "source": [ + "values = metrics.calculate_all_metrics(qobs.isel(time_step=-1), qsim.isel(time_step=-1))\n", + "for key, val in values.items():\n", + " print(f\"{key}: {val:.3f}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From e39330e5d2d2f950bf8cbf924d32785b8d83e7ee Mon Sep 17 00:00:00 2001 From: Frederik Kratzert Date: Mon, 5 Oct 2020 14:21:56 +0200 Subject: [PATCH 05/15] increment version 
number --- neuralhydrology/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neuralhydrology/__about__.py b/neuralhydrology/__about__.py index ea68199a..3f013dc7 100644 --- a/neuralhydrology/__about__.py +++ b/neuralhydrology/__about__.py @@ -1 +1 @@ -__version__ = "0.9.0-beta" +__version__ = "0.9.1-beta" From 664b4e7d6cfa977a8c157a4a143a7b5e4fdb53ce Mon Sep 17 00:00:00 2001 From: Martin Gauch Date: Mon, 5 Oct 2020 21:55:55 +0200 Subject: [PATCH 06/15] Addresses #6: Faster merging of date and time_step during evaluation --- neuralhydrology/evaluation/tester.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neuralhydrology/evaluation/tester.py b/neuralhydrology/evaluation/tester.py index e001bc71..c6528192 100644 --- a/neuralhydrology/evaluation/tester.py +++ b/neuralhydrology/evaluation/tester.py @@ -258,12 +258,12 @@ def evaluate(self, # stack dates and time_steps so we don't just evaluate every 24H when use_frequencies=[1D, 1H] obs = xr.isel(time_step=slice(-frequency_factor, None)) \ .stack(datetime=['date', 'time_step'])[f"{target_variable}_obs"] - obs['datetime'] = [c[0] + c[1] for c in obs.coords['datetime'].values] + obs['datetime'] = obs.coords['date'] + obs.coords['time_step'] # check if not empty (in case no observations exist in this period if not all(obs.isnull()): sim = xr.isel(time_step=slice(-frequency_factor, None)) \ .stack(datetime=['date', 'time_step'])[f"{target_variable}_sim"] - sim['datetime'] = [c[0] + c[1] for c in sim.coords['datetime'].values] + sim['datetime'] = sim.coords['date'] + sim.coords['time_step'] # clip negative predictions to zero, if variable is listed in config 'clip_target_to_zero' if target_variable in self.cfg.clip_targets_to_zero: From 30021d602449046db6915c51ddda4031dd474be6 Mon Sep 17 00:00:00 2001 From: Martin Gauch Date: Tue, 6 Oct 2020 09:33:18 +0200 Subject: [PATCH 07/15] Rename multifreqlstm to MTS-LSTM --- ...neuralhydrology.modelzoo.multifreqlstm.rst | 6 +- 
docs/source/api/neuralhydrology.modelzoo.rst | 2 +- docs/source/usage/models.rst | 14 ++--- examples/01-Introduction/1_basin.yml | 2 +- examples/config.yml.example | 8 +-- neuralhydrology/modelzoo/__init__.py | 6 +- .../modelzoo/{multifreqlstm.py => mtslstm.py} | 63 ++++++++++++------- neuralhydrology/modelzoo/odelstm.py | 2 +- neuralhydrology/utils/config.py | 12 ++-- test/conftest.py | 18 +++--- test/test_config_runs.py | 26 ++++---- ...ml => multi_timescale_regression.test.yml} | 2 +- 12 files changed, 89 insertions(+), 72 deletions(-) rename neuralhydrology/modelzoo/{multifreqlstm.py => mtslstm.py} (73%) rename test/test_configs/{multifreq_regression.test.yml => multi_timescale_regression.test.yml} (98%) diff --git a/docs/source/api/neuralhydrology.modelzoo.multifreqlstm.rst b/docs/source/api/neuralhydrology.modelzoo.multifreqlstm.rst index 40909071..ff849478 100644 --- a/docs/source/api/neuralhydrology.modelzoo.multifreqlstm.rst +++ b/docs/source/api/neuralhydrology.modelzoo.multifreqlstm.rst @@ -1,7 +1,7 @@ -MultiFreqLSTM -============= +MTSLSTM +======= -.. automodule:: neuralhydrology.modelzoo.multifreqlstm +.. automodule:: neuralhydrology.modelzoo.mtslstm :members: :undoc-members: :show-inheritance: diff --git a/docs/source/api/neuralhydrology.modelzoo.rst b/docs/source/api/neuralhydrology.modelzoo.rst index 66e50c77..77cee575 100644 --- a/docs/source/api/neuralhydrology.modelzoo.rst +++ b/docs/source/api/neuralhydrology.modelzoo.rst @@ -16,6 +16,6 @@ nh.modelzoo neuralhydrology.modelzoo.fc neuralhydrology.modelzoo.head neuralhydrology.modelzoo.lstm - neuralhydrology.modelzoo.multifreqlstm + neuralhydrology.modelzoo.mtslstm neuralhydrology.modelzoo.odelstm neuralhydrology.modelzoo.template diff --git a/docs/source/usage/models.rst b/docs/source/usage/models.rst index e1095ad4..6c137352 100644 --- a/docs/source/usage/models.rst +++ b/docs/source/usage/models.rst @@ -39,14 +39,14 @@ activations for all time steps. 
This class is implemented for exploratory reason ``model.copy_weights()`` to copy the weights of a ``CudaLSTM`` model into an ``LSTM`` model. This allows to use the fast CUDA implementation for training, and only use this class for inference with more detailed outputs. -MultiFreqLSTM -------------- -:py:class:`neuralhydrology.modelzoo.multifreqlstm.MultiFreqLSTM` is a newly proposed model by Gauch et al. (pre-print -published soon). This model allows the training on more than one temporal frequency (e.g. daily and hourly inputs) and -returns multi-frequency model predictions accordingly. A more detailed tutorial will follow shortly. +MTS-LSTM +-------- +:py:class:`neuralhydrology.modelzoo.mtslstm.MTSLSTM` is a newly proposed model by Gauch et al. (pre-print +published soon). This model allows the training on more than one temporal resolution (e.g. daily and hourly inputs) and +returns multi-timescale model predictions accordingly. A more detailed tutorial will follow shortly. -ODELSTM -------- +ODE-LSTM +-------- :py:class:`neuralhydrology.modelzoo.odelstm.ODELSTM` is a PyTorch implementation of the ODE-LSTM proposed by `Lechner and Hasani `_. This model can be used with unevenly sampled inputs and can be queried to return predictions for any arbitrary time step. 
diff --git a/examples/01-Introduction/1_basin.yml b/examples/01-Introduction/1_basin.yml index 0b5e3e28..d7119c1b 100644 --- a/examples/01-Introduction/1_basin.yml +++ b/examples/01-Introduction/1_basin.yml @@ -35,7 +35,7 @@ metrics: # --- Model configuration -------------------------------------------------------------------------- -# base model type [lstm, ealstm, cudalstm, embcudalstm, multifreqlstm] +# base model type [lstm, ealstm, cudalstm, embcudalstm, mtslstm] # (has to match the if statement in modelzoo/__init__.py) model: cudalstm diff --git a/examples/config.yml.example b/examples/config.yml.example index b7b3e770..0d4b5533 100644 --- a/examples/config.yml.example +++ b/examples/config.yml.example @@ -52,7 +52,7 @@ metrics: # --- Model configuration -------------------------------------------------------------------------- -# base model type [lstm, ealstm, cudalstm, embcudalstm, multifreqlstm] +# base model type [lstm, ealstm, cudalstm, embcudalstm, mtslstm] # (has to match the if statement in modelzoo/__init__.py) model: cudalstm @@ -79,13 +79,13 @@ embedding_activation: tanh # dropout applied to embedding network embedding_dropout: 0.0 -# ----> MultiFreqLSTM settings <---- +# ----> MTSLSTM settings <---- # Use an individual LSTM per frequencies (True) vs. use a single shared LSTM for all frequencies (False) -per_frequency_lstm: True +shared_mtslstm: True # how to transfer states from lower to higher frequencies. One of [identity, linear, None]. 
-transfer_multifreq_states: +transfer_mtslstm_states: h: identity c: identity diff --git a/neuralhydrology/modelzoo/__init__.py b/neuralhydrology/modelzoo/__init__.py index 0a30816e..d2b89a5a 100644 --- a/neuralhydrology/modelzoo/__init__.py +++ b/neuralhydrology/modelzoo/__init__.py @@ -5,7 +5,7 @@ from neuralhydrology.modelzoo.embcudalstm import EmbCudaLSTM from neuralhydrology.modelzoo.lstm import LSTM from neuralhydrology.modelzoo.odelstm import ODELSTM -from neuralhydrology.modelzoo.multifreqlstm import MultiFreqLSTM +from neuralhydrology.modelzoo.mtslstm import MTSLSTM from neuralhydrology.utils.config import Config SINGLE_FREQ_MODELS = ["cudalstm", "ealstm", "lstm", "embcudalstm"] @@ -35,8 +35,8 @@ def get_model(cfg: Config) -> nn.Module: model = LSTM(cfg=cfg) elif cfg.model == "embcudalstm": model = EmbCudaLSTM(cfg=cfg) - elif cfg.model == "multifreqlstm": - model = MultiFreqLSTM(cfg=cfg) + elif cfg.model == "mtslstm": + model = MTSLSTM(cfg=cfg) elif cfg.model == "odelstm": model = ODELSTM(cfg=cfg) else: diff --git a/neuralhydrology/modelzoo/multifreqlstm.py b/neuralhydrology/modelzoo/mtslstm.py similarity index 73% rename from neuralhydrology/modelzoo/multifreqlstm.py rename to neuralhydrology/modelzoo/mtslstm.py index fba41871..305dd3b2 100644 --- a/neuralhydrology/modelzoo/multifreqlstm.py +++ b/neuralhydrology/modelzoo/mtslstm.py @@ -13,10 +13,29 @@ LOGGER = logging.getLogger(__name__) -class MultiFreqLSTM(BaseModel): +class MTSLSTM(BaseModel): + """Multi-Timescale LSTM (MTS-LSTM) from Gauch et al. (preprint to be released soon). + + An LSTM architecture that allows simultaneous prediction at multiple timescales within one model. + There are two flavors of this model: MTS-LSTM and sMTS-LSTM (shared MTS-LSTM). The MTS-LSTM processes inputs at + low temporal resolutions up to a point in time. Then, the LSTM splits into one branch for each target timescale. + Each branch processes the inputs at its respective timescale. 
Finally, one prediction head per timescale generates + the predictions for that timescale based on the LSTM output. + Optionally, one can specify: + - a different hidden size for each LSTM branch (use a dict in the ``hidden_size`` config argument) + - different dynamic input variables for each timescale (use a dict in the ``dynamic_inputs`` config argument) + - the strategy to transfer states from the initial shared low-resolution LSTM to the per-timescale + higher-resolution LSTMs. By default, this is a linear transfer layer, but you can specify 'identity' to use an + identity operation or 'None' to turn off any transfer (via the ``transfer_mtslstm_states`` config argument). + + + The sMTS-LSTM variant has the same overall architecture, but the weights of the per-timescale branches (including + the output heads) are shared. + Thus, unlike MTS-LSTM, the sMTS-LSTM cannot use per-timescale hidden sizes or dynamic input variables. + """ def __init__(self, cfg: Config): - super(MultiFreqLSTM, self).__init__(cfg=cfg) + super(MTSLSTM, self).__init__(cfg=cfg) self.lstms = None self.transfer_fcs = None self.heads = None @@ -26,22 +45,22 @@ def __init__(self, cfg: Config): self._frequency_factors = [] self._seq_lengths = cfg.seq_length - self._per_frequency_lstm = self.cfg.per_frequency_lstm # default: a distinct LSTM per frequency - self._transfer_multifreq_states = self.cfg.transfer_multifreq_states # default: linear transfer layer + self._is_shared_mtslstm = self.cfg.shared_mtslstm # default: a distinct LSTM per timescale + self._transfer_mtslstm_states = self.cfg.transfer_mtslstm_states # default: linear transfer layer transfer_modes = [None, "None", "identity", "linear"] - if self._transfer_multifreq_states["h"] not in transfer_modes \ - or self._transfer_multifreq_states["c"] not in transfer_modes: - raise ValueError(f"MultiFreqLSTM supports state transfer modes {transfer_modes}") + if self._transfer_mtslstm_states["h"] not in transfer_modes \ + or 
self._transfer_mtslstm_states["c"] not in transfer_modes: + raise ValueError(f"MTS-LSTM supports state transfer modes {transfer_modes}") if len(cfg.use_frequencies) < 2: - raise ValueError("MultiFreqLSTM expects more than one input frequency") + raise ValueError("MTS-LSTM expects more than one input frequency") self._frequencies = sort_frequencies(cfg.use_frequencies) # start to count the number of inputs input_sizes = len(cfg.camels_attributes + cfg.hydroatlas_attributes + cfg.static_inputs) - # if not per_frequency_lstm, the LSTM gets an additional frequency flag as input. - if not self._per_frequency_lstm: + # if not is_shared_mtslstm, the LSTM gets an additional frequency flag as input. + if not self._is_shared_mtslstm: input_sizes += len(self._frequencies) if cfg.use_basin_id_encoding: @@ -52,8 +71,8 @@ def __init__(self, cfg: Config): if isinstance(cfg.dynamic_inputs, list): input_sizes = {freq: input_sizes + len(cfg.dynamic_inputs) for freq in self._frequencies} else: - if not self._per_frequency_lstm: - raise ValueError(f'Different inputs not allowed if per_frequency_lstm is False.') + if not self._is_shared_mtslstm: + raise ValueError(f'Different inputs not allowed if shared_mtslstm is False.') input_sizes = {freq: input_sizes + len(cfg.dynamic_inputs[freq]) for freq in self._frequencies} if not isinstance(cfg.hidden_size, dict): @@ -62,11 +81,11 @@ def __init__(self, cfg: Config): else: self._hidden_size = cfg.hidden_size - if (not self._per_frequency_lstm - or self._transfer_multifreq_states["h"] == "identity" - or self._transfer_multifreq_states["c"] == "identity") \ + if (not self._is_shared_mtslstm + or self._transfer_mtslstm_states["h"] == "identity" + or self._transfer_mtslstm_states["c"] == "identity") \ and any(size != self._hidden_size[self._frequencies[0]] for size in self._hidden_size.values()): - raise ValueError("All hidden sizes must be equal if per_frequency_lstm=False or state transfer=identity.") + raise ValueError("All hidden sizes must 
be equal if shared_mtslstm=False or state transfer=identity.") # create layer depending on selected frequencies self._init_modules(input_sizes) @@ -83,7 +102,7 @@ def _init_modules(self, input_sizes: Dict[str, int]): for idx, freq in enumerate(self._frequencies): freq_input_size = input_sizes[freq] - if not self._per_frequency_lstm and idx > 0: + if not self._is_shared_mtslstm and idx > 0: self.lstms[freq] = self.lstms[self._frequencies[idx - 1]] # same LSTM for all frequencies. self.heads[freq] = self.heads[self._frequencies[idx - 1]] # same head for all frequencies. else: @@ -92,10 +111,10 @@ def _init_modules(self, input_sizes: Dict[str, int]): if idx < len(self._frequencies) - 1: for state in ["c", "h"]: - if self._transfer_multifreq_states[state] == "linear": + if self._transfer_mtslstm_states[state] == "linear": self.transfer_fcs[f"{state}_{freq}"] = nn.Linear(self._hidden_size[freq], self._hidden_size[self._frequencies[idx + 1]]) - elif self._transfer_multifreq_states[state] == "identity": + elif self._transfer_mtslstm_states[state] == "identity": self.transfer_fcs[f"{state}_{freq}"] = nn.Identity() else: pass @@ -138,7 +157,7 @@ def _prepare_inputs(self, data: Dict[str, torch.Tensor], freq: str) -> torch.Ten else: pass - if not self._per_frequency_lstm: + if not self._is_shared_mtslstm: # add frequency one-hot encoding idx = self._frequencies.index(freq) one_hot_freq = torch.zeros(x_d.shape[0], x_d.shape[1], len(self._frequencies)).to(x_d) @@ -165,9 +184,9 @@ def forward(self, data: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: (h_0_transfer, c_0_transfer)) # project the states through a hidden layer to the dimensions of the next LSTM - if self._transfer_multifreq_states["h"] is not None: + if self._transfer_mtslstm_states["h"] is not None: h_0_transfer = self.transfer_fcs[f"h_{freq}"](h_n_slice1) - if self._transfer_multifreq_states["c"] is not None: + if self._transfer_mtslstm_states["c"] is not None: c_0_transfer = 
self.transfer_fcs[f"c_{freq}"](c_n_slice1) # get predictions of remaining part and concat results diff --git a/neuralhydrology/modelzoo/odelstm.py b/neuralhydrology/modelzoo/odelstm.py index afa31999..f053eaec 100644 --- a/neuralhydrology/modelzoo/odelstm.py +++ b/neuralhydrology/modelzoo/odelstm.py @@ -57,7 +57,7 @@ def __init__(self, cfg: Config): raise ValueError('ODELSTM does not support per-frequency input variables or hidden sizes.') # Note: be aware that frequency_factors and slice_timesteps have a slightly different meaning here vs. in - # multifreqlstm. Here, the frequency_factor is relative to the _lowest_ (not the next-lower) frequency. + # MTSLSTM. Here, the frequency_factor is relative to the _lowest_ (not the next-lower) frequency. # slice_timesteps[freq] is the input step (counting backwards) in the next-*lower* frequency from where on input # data at frequency freq is available. self._frequency_factors = {} diff --git a/neuralhydrology/utils/config.py b/neuralhydrology/utils/config.py index 85abd525..98392af1 100644 --- a/neuralhydrology/utils/config.py +++ b/neuralhydrology/utils/config.py @@ -419,10 +419,6 @@ def per_basin_train_periods_file(self) -> Path: def per_basin_validation_periods_file(self) -> Path: return self._cfg.get("per_basin_validation_periods_file", None) - @property - def per_frequency_lstm(self) -> bool: - return self._cfg.get("per_frequency_lstm", True) - @property def predict_last_n(self) -> Union[int, Dict[str, int]]: return self._get_value_verbose("predict_last_n") @@ -470,6 +466,10 @@ def seed(self, seed: int): def seq_length(self) -> Union[int, Dict[str, int]]: return self._get_value_verbose("seq_length") + @property + def shared_mtslstm(self) -> bool: + return self._cfg.get("shared_mtslstm", True) + @property def static_inputs(self) -> List[str]: return self._as_default_list(self._cfg.get("static_inputs", [])) @@ -540,8 +540,8 @@ def train_start_date(self) -> pd.Timestamp: return self._get_value_verbose("train_start_date") 
@property - def transfer_multifreq_states(self) -> Dict[str, str]: - return self._cfg.get("transfer_multifreq_states", {'h': 'linear', 'c': 'linear'}) + def transfer_mtslstm_states(self) -> Dict[str, str]: + return self._cfg.get("transfer_mtslstm_states", {'h': 'linear', 'c': 'linear'}) @property def umal_extend_batch(self) -> bool: diff --git a/test/conftest.py b/test/conftest.py index feb18906..f204198f 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -11,7 +11,7 @@ def pytest_addoption(parser): parser.addoption('--smoke-test', action='store_true', default=False, - help='Skips some tests for faster execution. Out of the single-frequency ' + help='Skips some tests for faster execution. Out of the single-timescale ' 'models and forcings, only test cudalstm on forcings that include daymet.') @@ -44,13 +44,13 @@ def _get_config(name): @pytest.fixture(params=['lstm', 'ealstm', 'cudalstm', 'embcudalstm']) -def single_freq_model(request) -> str: - """Fixture that provides models that support predicting only a single frequency. +def single_timescale_model(request) -> str: + """Fixture that provides models that support predicting only a single timescale. Returns ------- str - Name of the single-frequency model. + Name of the single-timescale model. """ if request.config.getoption('--smoke-test') and request.param != 'cudalstm': pytest.skip('--smoke-test skips this test.') @@ -62,7 +62,7 @@ def single_freq_model(request) -> str: (['daymet', 'nldas'], ['prcp(mm/day)_daymet', 'tmax(C)_daymet', 'PRCP(mm/day)_nldas', 'Tmax(C)_nldas'])], ids=lambda param: str(param[0])) -def single_freq_forcings(request) -> Dict[str, Union[str, List[str]]]: +def single_timescale_forcings(request) -> Dict[str, Union[str, List[str]]]: """Fixture that provides daily forcings. 
Returns @@ -75,14 +75,14 @@ def single_freq_forcings(request) -> Dict[str, Union[str, List[str]]]: return {'forcings': request.param[0], 'variables': request.param[1]} -@pytest.fixture(params=['multifreqlstm', 'odelstm']) -def multi_freq_model(request) -> str: - """Fixture that provides multifrequency models. +@pytest.fixture(params=['mtslstm', 'odelstm']) +def multi_timescale_model(request) -> str: + """Fixture that provides multi-timescale models. Returns ------- str - Name of the multifrequency model. + Name of the multi-timescale model. """ return request.param diff --git a/test/test_config_runs.py b/test/test_config_runs.py index d2f9b600..6eb760fb 100644 --- a/test/test_config_runs.py +++ b/test/test_config_runs.py @@ -15,28 +15,28 @@ from test import Fixture -def test_daily_regression(get_config: Fixture[Callable[[str], dict]], single_freq_model: Fixture[str], - daily_dataset: Fixture[str], single_freq_forcings: Fixture[str]): +def test_daily_regression(get_config: Fixture[Callable[[str], dict]], single_timescale_model: Fixture[str], + daily_dataset: Fixture[str], single_timescale_forcings: Fixture[str]): """Test regression training and evaluation for daily predictions. Parameters ---------- get_config : Fixture[Callable[[str], dict] Method that returns a run configuration to test. - single_freq_model : Fixture[str] + single_timescale_model : Fixture[str] Model to test. daily_dataset : Fixture[str] Daily dataset to use. - single_freq_forcings : Fixture[str] + single_timescale_forcings : Fixture[str] Daily forcings set to use. 
""" config = get_config('daily_regression') - config.log_only('model', single_freq_model) + config.log_only('model', single_timescale_model) config.log_only('dataset', daily_dataset['dataset']) config.log_only('data_dir', config.data_dir / daily_dataset['dataset']) config.log_only('target_variables', daily_dataset['target']) - config.log_only('forcings', single_freq_forcings['forcings']) - config.log_only('dynamic_inputs', single_freq_forcings['variables']) + config.log_only('forcings', single_timescale_forcings['forcings']) + config.log_only('dynamic_inputs', single_timescale_forcings['variables']) basin = '01022500' test_start_date, test_end_date = _get_test_start_end_dates(config) @@ -88,20 +88,18 @@ def test_daily_regression_additional_features(get_config: Fixture[Callable[[str] assert not pd.isna(results[f'{config.target_variables[0]}_sim']).any() -def test_multifreq_regression(get_config: Fixture[Callable[[str], dict]], multi_freq_model: Fixture[str]): - """Test regression training and evaluation for multifrequency predictions. +def test_multi_timescale_regression(get_config: Fixture[Callable[[str], dict]], multi_timescale_model: Fixture[str]): + """Test regression training and evaluation for multi-timescale predictions. Parameters ---------- get_config : Fixture[Callable[[str], dict] Method that returns a run configuration to test. - multi_freq_model : Fixture[str] + multi_timescale_model : Fixture[str] Model to test. - multi_freq_forcings : Fixture[str] - Forcings set to use. 
""" - config = get_config('multifreq_regression') - config.log_only('model', multi_freq_model) + config = get_config('multi_timescale_regression') + config.log_only('model', multi_timescale_model) basin = '01022500' test_start_date, test_end_date = _get_test_start_end_dates(config) diff --git a/test/test_configs/multifreq_regression.test.yml b/test/test_configs/multi_timescale_regression.test.yml similarity index 98% rename from test/test_configs/multifreq_regression.test.yml rename to test/test_configs/multi_timescale_regression.test.yml index 69aaf8d0..c6fcca57 100644 --- a/test/test_configs/multifreq_regression.test.yml +++ b/test/test_configs/multi_timescale_regression.test.yml @@ -28,7 +28,7 @@ metrics: - Beta-NSE # --- Model configuration -------------------------------------------------------------------------- -model: multifreqlstm +model: mtslstm head: regression output_activation: linear From bbb3cf7000dc03217cadddca01e0cb90870f1cad Mon Sep 17 00:00:00 2001 From: Martin Gauch Date: Tue, 6 Oct 2020 10:16:11 +0200 Subject: [PATCH 08/15] Add docstrings --- neuralhydrology/modelzoo/mtslstm.py | 21 ++++++- neuralhydrology/modelzoo/odelstm.py | 89 ++++++++++++++++++++++------- 2 files changed, 87 insertions(+), 23 deletions(-) diff --git a/neuralhydrology/modelzoo/mtslstm.py b/neuralhydrology/modelzoo/mtslstm.py index 305dd3b2..f16f3566 100644 --- a/neuralhydrology/modelzoo/mtslstm.py +++ b/neuralhydrology/modelzoo/mtslstm.py @@ -32,6 +32,11 @@ class MTSLSTM(BaseModel): The sMTS-LSTM variant has the same overall architecture, but the weights of the per-timescale branches (including the output heads) are shared. Thus, unlike MTS-LSTM, the sMTS-LSTM cannot use per-timescale hidden sizes or dynamic input variables. + + Parameters + ---------- + cfg : Config + The run configuration. 
""" def __init__(self, cfg: Config): @@ -89,7 +94,7 @@ def __init__(self, cfg: Config): # create layer depending on selected frequencies self._init_modules(input_sizes) - self.reset_parameters() + self._reset_parameters() # frequency factors are needed to determine the time step of information transfer self._init_frequency_factors_and_slice_timesteps() @@ -131,7 +136,7 @@ def _init_frequency_factors_and_slice_timesteps(self): slice_timestep = int(self._seq_lengths[self._frequencies[idx + 1]] / self._frequency_factors[idx]) self._slice_timestep[freq] = slice_timestep - def reset_parameters(self): + def _reset_parameters(self): if self.cfg.initial_forget_bias is not None: for freq in self._frequencies: hidden_size = self._hidden_size[freq] @@ -167,6 +172,18 @@ def _prepare_inputs(self, data: Dict[str, torch.Tensor], freq: str) -> torch.Ten return x_d def forward(self, data: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + """Perform a forward pass on the MTS-LSTM model. + + Parameters + ---------- + data : Dict[str, torch.Tensor] + Input data for the forward pass. See the documentation overview of all models for details on the dict keys. + + Returns + ------- + Dict[str, torch.Tensor] + Model predictions for each target timescale. + """ x_d = {freq: self._prepare_inputs(data, freq) for freq in self._frequencies} # initial states for lowest frequencies are set to zeros diff --git a/neuralhydrology/modelzoo/odelstm.py b/neuralhydrology/modelzoo/odelstm.py index f053eaec..b8853670 100644 --- a/neuralhydrology/modelzoo/odelstm.py +++ b/neuralhydrology/modelzoo/odelstm.py @@ -42,6 +42,11 @@ class ODELSTM(BaseModel): 5. lowest-frequency steps to generate predict_last_n lowest-frequency predictions. 6. repeat steps four and five for the next-higher frequency (using the same random-frequency bounds but generating predictions for the next-higher frequency). + + Parameters + ---------- + cfg : Config + The run configuration. 
References ---------- @@ -171,6 +176,18 @@ def _randomize_freq(self, x_d: torch.Tensor, low_frequency: str, high_frequency: return torch.cat(x_d_randomized, dim=0) def forward(self, data: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + """Perform a forward pass on the ODE-LSTM model. + + Parameters + ---------- + data : Dict[str, torch.Tensor] + Input data for the forward pass. See the documentation overview of all models for details on the dict keys. + + Returns + ------- + Dict[str, torch.Tensor] + Model predictions for each target timescale. + """ x_d = {freq: self._prepare_inputs(data, freq) for freq in self._frequencies} @@ -245,19 +262,35 @@ def _run_odelstm(self, input_slice: torch.Tensor, h_0: torch.Tensor, class _ODERNNCell(nn.Module): - """An ODE-RNN cell (Adapted from https://github.com/mlech26l/learning-long-term-irregular-ts). """ + """An ODE-RNN cell (Adapted from https://github.com/mlech26l/learning-long-term-irregular-ts) [#]_. + + Parameters + ---------- + input_size : int + Input dimension + hidden_size : int + Size of the cell's hidden state + num_unfolds : int + Number of steps into which each timestep will be broken down to solve the ODE. + method : {'euler', 'heun', 'rk4'} + Method to use for ODE solving (Euler's method, Heun's method, or Runge-Kutta 4) + + References + ---------- + .. [#] Lechner, M.; Hasani, R.: Learning Long-Term Dependencies in Irregularly-Sampled Time Series. arXiv, 2020, + https://arxiv.org/abs/2006.04418. 
+ """ - def __init__(self, input_size: int, hidden_size: int, num_unfolds: int, method: str, tau=1): + def __init__(self, input_size: int, hidden_size: int, num_unfolds: int, method: str): super(_ODERNNCell, self).__init__() self.method = { - 'euler': self.euler, - 'heun': self.heun, - 'rk4': self.rk4, + 'euler': self._euler, + 'heun': self._heun, + 'rk4': self._rk4, }[method] self.input_size = input_size self.hidden_size = hidden_size self.num_unfolds = num_unfolds - self.tau = tau self.w_ih = nn.Parameter(torch.FloatTensor(hidden_size, input_size)) self.w_hh = nn.Parameter(torch.FloatTensor(hidden_size, hidden_size)) @@ -267,12 +300,29 @@ def __init__(self, input_size: int, hidden_size: int, num_unfolds: int, method: self.reset_parameters() def reset_parameters(self): + """Reset the paramters of the ODERNNCell. """ nn.init.orthogonal_(self.w_hh) nn.init.xavier_uniform_(self.w_ih) nn.init.zeros_(self.bias) nn.init.constant_(self.scale, 1.0) def forward(self, new_hidden_state: torch.Tensor, old_hidden_state: torch.Tensor, elapsed: float) -> torch.Tensor: + """Perform a forward pass on the ODERNNCell. + + Parameters + ---------- + new_hidden_state : torch.Tensor + The current hidden state to be updated by the ODERNNCell. + old_hidden_state : torch.Tensor + The previous hidden state. + elapsed : float + Time elapsed between new and old hidden state. 
+ + Returns + ------- + torch.Tensor + Predicted new hidden state + """ delta_t = elapsed / self.num_unfolds hidden_state = old_hidden_state @@ -280,29 +330,26 @@ def forward(self, new_hidden_state: torch.Tensor, old_hidden_state: torch.Tensor hidden_state = self.method(new_hidden_state, hidden_state, delta_t) return hidden_state - def dfdt(self, inputs: torch.Tensor, hidden_state: torch.Tensor) -> torch.Tensor: + def _dfdt(self, inputs: torch.Tensor, hidden_state: torch.Tensor) -> torch.Tensor: h_in = torch.matmul(inputs, self.w_ih) h_rec = torch.matmul(hidden_state, self.w_hh) dh_in = self.scale * torch.tanh(h_in + h_rec + self.bias) - if self.tau > 0: - dh = dh_in - hidden_state * self.tau - else: - dh = dh_in + dh = dh_in - hidden_state return dh - def euler(self, inputs: torch.Tensor, hidden_state: torch.Tensor, delta_t: float) -> torch.Tensor: - dy = self.dfdt(inputs, hidden_state) + def _euler(self, inputs: torch.Tensor, hidden_state: torch.Tensor, delta_t: float) -> torch.Tensor: + dy = self._dfdt(inputs, hidden_state) return hidden_state + delta_t * dy - def heun(self, inputs: torch.Tensor, hidden_state: torch.Tensor, delta_t: float) -> torch.Tensor: - k1 = self.dfdt(inputs, hidden_state) - k2 = self.dfdt(inputs, hidden_state + delta_t * k1) + def _heun(self, inputs: torch.Tensor, hidden_state: torch.Tensor, delta_t: float) -> torch.Tensor: + k1 = self._dfdt(inputs, hidden_state) + k2 = self._dfdt(inputs, hidden_state + delta_t * k1) return hidden_state + delta_t * 0.5 * (k1 + k2) - def rk4(self, inputs: torch.Tensor, hidden_state: torch.Tensor, delta_t: float) -> torch.Tensor: - k1 = self.dfdt(inputs, hidden_state) - k2 = self.dfdt(inputs, hidden_state + k1 * delta_t * 0.5) - k3 = self.dfdt(inputs, hidden_state + k2 * delta_t * 0.5) - k4 = self.dfdt(inputs, hidden_state + k3 * delta_t) + def _rk4(self, inputs: torch.Tensor, hidden_state: torch.Tensor, delta_t: float) -> torch.Tensor: + k1 = self._dfdt(inputs, hidden_state) + k2 = self._dfdt(inputs, 
hidden_state + k1 * delta_t * 0.5) + k3 = self._dfdt(inputs, hidden_state + k2 * delta_t * 0.5) + k4 = self._dfdt(inputs, hidden_state + k3 * delta_t) return hidden_state + delta_t * (k1 + 2 * k2 + 2 * k3 + k4) / 6.0 From 4ee0ae88e2c9a701d26b2b60f7e912ab38388eaf Mon Sep 17 00:00:00 2001 From: Martin Gauch Date: Tue, 6 Oct 2020 10:24:10 +0200 Subject: [PATCH 09/15] Make reset_parameters private --- neuralhydrology/modelzoo/odelstm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neuralhydrology/modelzoo/odelstm.py b/neuralhydrology/modelzoo/odelstm.py index b8853670..63bd1979 100644 --- a/neuralhydrology/modelzoo/odelstm.py +++ b/neuralhydrology/modelzoo/odelstm.py @@ -297,9 +297,9 @@ def __init__(self, input_size: int, hidden_size: int, num_unfolds: int, method: self.bias = nn.Parameter(torch.FloatTensor(hidden_size)) self.scale = nn.Parameter(torch.FloatTensor(hidden_size)) - self.reset_parameters() + self._reset_parameters() - def reset_parameters(self): + def _reset_parameters(self): """Reset the paramters of the ODERNNCell. 
""" nn.init.orthogonal_(self.w_hh) nn.init.xavier_uniform_(self.w_ih) From c47562a6d5915ce32c89b651828ffaa9ffc14a48 Mon Sep 17 00:00:00 2001 From: Frederik Kratzert Date: Tue, 6 Oct 2020 13:53:29 +0200 Subject: [PATCH 10/15] Docs and restructuring (#12) * doc update and minor restructuring #5 * Update model docstrings * Apply suggestions from code review Co-authored-by: Martin Gauch <15731649+gauchm@users.noreply.github.com> * fix sphinx stuff, add space Co-authored-by: Martin Gauch <15731649+gauchm@users.noreply.github.com> --- ...t => neuralhydrology.modelzoo.mtslstm.rst} | 0 neuralhydrology/data/climateindices.py | 92 ++++++++++++------- neuralhydrology/data/dischargeinput.py | 72 +++++++++++---- neuralhydrology/data/pet.py | 90 +++++++++--------- neuralhydrology/modelzoo/basemodel.py | 45 ++++++++- neuralhydrology/modelzoo/cudalstm.py | 38 +++++++- neuralhydrology/modelzoo/ealstm.py | 45 ++++++++- neuralhydrology/modelzoo/embcudalstm.py | 38 +++++++- neuralhydrology/modelzoo/fc.py | 37 +++++++- neuralhydrology/modelzoo/head.py | 31 ++++++- neuralhydrology/modelzoo/lstm.py | 13 +-- neuralhydrology/modelzoo/mtslstm.py | 10 +- 12 files changed, 391 insertions(+), 120 deletions(-) rename docs/source/api/{neuralhydrology.modelzoo.multifreqlstm.rst => neuralhydrology.modelzoo.mtslstm.rst} (100%) diff --git a/docs/source/api/neuralhydrology.modelzoo.multifreqlstm.rst b/docs/source/api/neuralhydrology.modelzoo.mtslstm.rst similarity index 100% rename from docs/source/api/neuralhydrology.modelzoo.multifreqlstm.rst rename to docs/source/api/neuralhydrology.modelzoo.mtslstm.rst diff --git a/neuralhydrology/data/climateindices.py b/neuralhydrology/data/climateindices.py index 6c6b9c32..d564c818 100644 --- a/neuralhydrology/data/climateindices.py +++ b/neuralhydrology/data/climateindices.py @@ -2,36 +2,65 @@ import pickle import sys from pathlib import Path -from typing import List +from typing import List, Dict import numpy as np +import pandas as pd from numba import njit 
from tqdm import tqdm -from .pet import get_priestley_taylor_pet -from .utils import load_camels_us_attributes, load_camels_us_forcings, load_basin_file +from neuralhydrology.data import pet, utils LOGGER = logging.getLogger(__name__) -def precalculate_dyn_climate_indices(data_dir: Path, basin_file: Path, window_length: int, forcings: str): - basins = load_basin_file(basin_file=basin_file) - camels_attributes = load_camels_us_attributes(data_dir=data_dir, basins=basins) +def calculate_dyn_climate_indices(data_dir: Path, + basins: List[str], + window_length: int, + forcings: str, + output_file: Path = None) -> Dict[str, pd.DataFrame]: + """Calculate dynamic climate indices. + + Compared to the long-term static climate indices included in the CAMELS data set, this function computes the same + climate indices by a moving window approach over the entire data set. That is, for each time step, the climate + indices are re-computed from the last `window_length` time steps. The resulting dictionary of DataFrames can be + used with the `additional_feature_files` argument. + + Parameters + ---------- + data_dir : Path + Path to the CAMELS US directory. + basins : List[str] + List of basin ids. + window_length : int + Look-back period to use to compute the climate indices. + forcings : str + Can be e.g. 'daymet' or 'nldas', etc. Must match the folder names in the 'basin_mean_forcing' directory. + output_file : Path, optional + If specified, stores the resulting dictionary of DataFrames to this location as a pickle dump. + + Returns + ------- + Dict[str, pd.DataFrame] + Dictionary with one time-indexed DataFrame per basin. By definition, the climate indices for a given day in the + DataFrame are computed from the `window_length` previous time steps (including the given day). 
+ """ + camels_attributes = utils.load_camels_us_attributes(data_dir=data_dir, basins=basins) additional_features = {} new_columns = [ 'p_mean_dyn', 'pet_mean_dyn', 'aridity_dyn', 't_mean_dyn', 'frac_snow_dyn', 'high_prec_freq_dyn', 'high_prec_dur_dyn', 'low_prec_freq_dyn', 'low_prec_dur_dyn' ] for basin in tqdm(basins, file=sys.stdout): - df, _ = load_camels_us_forcings(data_dir=data_dir, basin=basin, forcings=forcings) + df, _ = utils.load_camels_us_forcings(data_dir=data_dir, basin=basin, forcings=forcings) lat = camels_attributes.loc[camels_attributes.index == basin, 'gauge_lat'].values elev = camels_attributes.loc[camels_attributes.index == basin, 'elev_mean'].values - df["PET(mm/d)"] = get_priestley_taylor_pet(t_min=df["tmin(C)"].values, - t_max=df["tmax(C)"].values, - s_rad=df["srad(W/m2)"].values, - lat=lat, - elev=elev, - doy=df.index.dayofyear.values) + df["PET(mm/d)"] = pet.get_priestley_taylor_pet(t_min=df["tmin(C)"].values, + t_max=df["tmax(C)"].values, + s_rad=df["srad(W/m2)"].values, + lat=lat, + elev=elev, + doy=df.index.dayofyear.values) for col in new_columns: df[col] = np.nan @@ -41,7 +70,7 @@ def precalculate_dyn_climate_indices(data_dir: Path, basin_file: Path, window_le df['vp(Pa)'].values, df['PET(mm/d)'].values ]).T - new_features = numba_climate_indexes(x, window_length=window_length) + new_features = _numba_climate_indexes(x, window_length=window_length) if np.sum(np.isnan(new_features)) > 0: raise ValueError(f"NaN in new features of basin {basin}") @@ -62,20 +91,16 @@ def precalculate_dyn_climate_indices(data_dir: Path, basin_file: Path, window_le additional_features[basin] = df - filename = f"dyn_climate_indices_{forcings}_{len(basins)}basins_{window_length}lookback.p" - - output_file = Path(__file__).parent.parent.parent / 'data' / filename - - with output_file.open("wb") as fp: - pickle.dump(additional_features, fp) - - LOGGER.info(f"Precalculated features successfully stored at {output_file}") + if output_file is not None: + with 
output_file.open("wb") as fp: + pickle.dump(additional_features, fp) + LOGGER.info(f"Precalculated features successfully stored at {output_file}") return additional_features @njit -def numba_climate_indexes(features: np.ndarray, window_length: int) -> np.ndarray: +def _numba_climate_indexes(features: np.ndarray, window_length: int) -> np.ndarray: n_samples = features.shape[0] new_features = np.zeros((n_samples - 365 + 1, 9)) @@ -94,11 +119,11 @@ def numba_climate_indexes(features: np.ndarray, window_length: int) -> np.ndarra low_prec_freq = precip_days[precip_days[:, 0] < 1].shape[0] / precip_days.shape[0] idx = np.where(x[:, 0] < 1)[0] - groups = split_list(idx) + groups = _split_list(idx) low_prec_dur = np.mean(np.array([len(p) for p in groups])) idx = np.where(x[:, 0] >= 5 * p_mean)[0] - groups = split_list(idx) + groups = _split_list(idx) high_prec_dur = np.mean(np.array([len(p) for p in groups])) new_features[i, 0] = p_mean @@ -115,16 +140,15 @@ def numba_climate_indexes(features: np.ndarray, window_length: int) -> np.ndarra @njit -def split_list(alist: List) -> List: - newlist = [] +def _split_list(a_list: List) -> List: + new_list = [] start = 0 - end = 0 - for index, value in enumerate(alist): - if index < len(alist) - 1: - if alist[index + 1] > value + 1: + for index, value in enumerate(a_list): + if index < len(a_list) - 1: + if a_list[index + 1] > value + 1: end = index + 1 - newlist.append(alist[start:end]) + new_list.append(a_list[start:end]) start = end else: - newlist.append(alist[start:len(alist)]) - return newlist + new_list.append(a_list[start:len(a_list)]) + return new_list diff --git a/neuralhydrology/data/dischargeinput.py b/neuralhydrology/data/dischargeinput.py index b198a5fa..f8b97474 100644 --- a/neuralhydrology/data/dischargeinput.py +++ b/neuralhydrology/data/dischargeinput.py @@ -2,36 +2,72 @@ import pickle import sys from pathlib import Path +from typing import Dict, List +import pandas as pd from tqdm import tqdm -from 
neuralhydrology.data.utils import load_basin_file, load_camels_us_forcings, load_camels_us_discharge +from neuralhydrology.data import utils LOGGER = logging.getLogger(__name__) -def create_discharge_files(data_dir: Path, basin_file: Path, out_dir: Path, shift: int = 1): - - out_file = out_dir / f"discharge_input_shift{shift}.p" - if out_file.is_file(): - raise FileExistsError - - basins = load_basin_file(basin_file) - +def shift_discharge(data_dir: Path, basins: List[str], dataset: str, shift: int = 1, + output_file: Path = None) -> Dict[str, pd.DataFrame]: + """Return shifted discharge data. + + This function returns the shifted discharge data for each basin in `basins`. Useful when training + models with lagged discharge as input. Use the `output_file` argument to store the resulting dictionary as + pickle dump to disk. This pickle file can be used with the config argument `additional_feature_files` to make the + data available as input feature. + For CAMELS GB, the 'discharge_spec' column is used. + + Parameters + ---------- + data_dir : Path + Path to the dataset directory. + basins : List[str] + List of basin ids. + dataset : {'camels_us', 'camels_gb', 'hourly_camels_us'} + Which data set to use. + shift : int, optional + Number of discharge lag in time steps, by default 1. + output_file : Path, optional + If specified, stores the resulting dictionary of DataFrames to this location as a pickle dump. + + Returns + ------- + Dict[str, pd.DataFrame] + Dictionary with one time-indexed DataFrame per basin. The lagged discharge column is named according to the + discharge column name, with '_t-SHIFT' as suffix, where 'SHIFT' corresponds to the argument `shift`. 
+ """ data = {} - for basin in tqdm(basins, file=sys.stdout): - df, area = load_camels_us_forcings(data_dir=data_dir, basin=basin, forcings="daymet") - df["QObs(mm/d)"] = load_camels_us_discharge(data_dir=data_dir, basin=basin, area=area) - - df[f"QObs(t-{shift})"] = df["QObs(mm/d)"].shift(shift) + # load discharge data + if dataset == "camels_us": + df, area = utils.load_camels_us_forcings(data_dir=data_dir, basin=basin, forcings="daymet") + df["QObs(mm/d)"] = utils.load_camels_us_discharge(data_dir=data_dir, basin=basin, area=area) + discharge_col = "QObs(mm/d)" + elif dataset == "camels_gb": + df = utils.load_camels_gb_timeseries(data_dir=data_dir, basin=basin) + discharge_col = "discharge_spec" + elif dataset == "hourly_camels_gb": + df = utils.load_hourly_us_discharge(data_dir=data_dir, basin=basin) + discharge_col = "QObs(mm/h)" - drop_columns = [col for col in df.columns if col != f"QObs(t-{shift})"] + # shift discharge data by `shift` time steps + df[f"{discharge_col}_t-{shift}"] = df[discharge_col].shift(shift) + # remove all columns from data set except shifted discharge + drop_columns = [col for col in df.columns if col != f"{discharge_col}_t-{shift}"] data[basin] = df.drop(labels=drop_columns, axis=1) - with out_file.open("wb") as fp: - pickle.dump(data, fp) + if output_file is not None: + # store pickle dump + with output_file.open("wb") as fp: + pickle.dump(data, fp) + + LOGGER.info(f"Data successfully stored at {output_file}") - LOGGER.info(f"Data successfully stored at {out_file}") + return data diff --git a/neuralhydrology/data/pet.py b/neuralhydrology/data/pet.py index 818b615f..3a721058 100644 --- a/neuralhydrology/data/pet.py +++ b/neuralhydrology/data/pet.py @@ -1,19 +1,14 @@ import numpy as np from numba import njit -LAMBDA = 2.45 # Kept constant, MJkg-1 -ALPHA = 1.26 # Calibrated in CAMELS, here static -STEFAN_BOLTZMAN = 4.903e-09 -PI = np.pi - @njit -def get_priestley_taylor_pet(t_min: np.ndarray, t_max: np.ndarray, s_rad: np.ndarray, lat: 
float, - elev: float, doy: np.ndarray) -> np.ndarray: +def get_priestley_taylor_pet(t_min: np.ndarray, t_max: np.ndarray, s_rad: np.ndarray, lat: float, elev: float, + doy: np.ndarray) -> np.ndarray: """Calculate potential evapotranspiration (PET) as an approximation following the Priestley-Taylor equation. - The ground head flux G is assumed to be 0 at daily time steps (see Newman et al. 2015). The - equation follow FAO-56 (Allen et al. (1998)) + The ground heat flux G is assumed to be 0 at daily time steps (see Newman et al., 2015 [#]_). The + equations follow FAO-56 (Allen et al., 1998 [#]_). Parameters ---------- @@ -34,35 +29,47 @@ def get_priestley_taylor_pet(t_min: np.ndarray, t_max: np.ndarray, s_rad: np.nda ------- np.ndarray Array containing PET estimates in mm/day + + References + ---------- + .. [#] A. J. Newman, M. P. Clark, K. Sampson, A. Wood, L. E. Hay, A. Bock, R. J. Viger, D. Blodgett, + L. Brekke, J. R. Arnold, T. Hopson, and Q. Duan: Development of a large-sample watershed-scale + hydrometeorological dataset for the contiguous USA: dataset characteristics and assessment of regional + variability in hydrologic model performance. Hydrol. Earth Syst. Sci., 19, 209-223, + doi:10.5194/hess-19-209-2015, 2015 + .. [#] Allen, R. G., Pereira, L. S., Raes, D., & Smith, M. (1998). Crop evapotranspiration-Guidelines for computing + crop water requirements-FAO Irrigation and drainage paper 56. Fao, Rome, 300(9), D05109. 
""" - lat = lat * (PI / 180) # degree to rad + lat = lat * (np.pi / 180) # degree to rad # Slope of saturation vapour pressure curve t_mean = 0.5 * (t_min + t_max) - slope_svp = get_slope_svp_curve(t_mean) + slope_svp = _get_slope_svp_curve(t_mean) # incoming netto short-wave radiation s_rad = s_rad * 0.0864 # conversion Wm-2 -> MJm-2day-1 - in_sw_rad = get_net_sw_srad(s_rad) + in_sw_rad = _get_net_sw_srad(s_rad) # outgoginng netto long-wave radiation - sol_dec = get_sol_decl(doy) - sha = get_sunset_hour_angle(lat, sol_dec) - ird = get_ird_earth_sun(doy) - et_rad = get_extraterra_rad(lat, sol_dec, sha, ird) - cs_rad = get_clear_sky_rad(elev, et_rad) - a_vp = get_avp_tmin(t_min) - out_lw_rad = get_net_outgoing_lw_rad(t_min, t_max, s_rad, cs_rad, a_vp) + sol_dec = _get_sol_decl(doy) + sha = _get_sunset_hour_angle(lat, sol_dec) + ird = _get_ird_earth_sun(doy) + et_rad = _get_extraterra_rad(lat, sol_dec, sha, ird) + cs_rad = _get_clear_sky_rad(elev, et_rad) + a_vp = _get_avp_tmin(t_min) + out_lw_rad = _get_net_outgoing_lw_rad(t_min, t_max, s_rad, cs_rad, a_vp) # net radiation - net_rad = get_net_rad(in_sw_rad, out_lw_rad) + net_rad = _get_net_rad(in_sw_rad, out_lw_rad) # gamma - atm_pressure = get_atmos_pressure(elev) - gamma = get_psy_const(atm_pressure) + atm_pressure = _get_atmos_pressure(elev) + gamma = _get_psy_const(atm_pressure) # PET MJm-2day-1 - pet = (ALPHA / LAMBDA) * (slope_svp * net_rad) / (slope_svp + gamma) + alpha = 1.26 # Calibrated in CAMELS, here static + _lambda = 2.45 # Kept constant, MJkg-1 + pet = (alpha / _lambda) * (slope_svp * net_rad) / (slope_svp + gamma) # convert energy to evap pet = pet * 0.408 @@ -71,7 +78,7 @@ def get_priestley_taylor_pet(t_min: np.ndarray, t_max: np.ndarray, s_rad: np.nda @njit -def get_slope_svp_curve(t_mean: np.ndarray) -> np.ndarray: +def _get_slope_svp_curve(t_mean: np.ndarray) -> np.ndarray: """Slope of saturation vapour pressure curve Equation 13 FAO-56 Allen et al. 
(1998) @@ -91,7 +98,7 @@ def get_slope_svp_curve(t_mean: np.ndarray) -> np.ndarray: @njit -def get_net_sw_srad(s_rad: np.ndarray, albedo: float = 0.23) -> np.ndarray: +def _get_net_sw_srad(s_rad: np.ndarray, albedo: float = 0.23) -> np.ndarray: """Calculate net shortwave radiation Equation 38 FAO-56 Allen et al. (1998) @@ -113,7 +120,7 @@ def get_net_sw_srad(s_rad: np.ndarray, albedo: float = 0.23) -> np.ndarray: @njit -def get_sol_decl(doy: np.ndarray) -> np.ndarray: +def _get_sol_decl(doy: np.ndarray) -> np.ndarray: """Get solar declination Equation 24 FAO-56 Allen et al. (1998) @@ -134,7 +141,7 @@ def get_sol_decl(doy: np.ndarray) -> np.ndarray: @njit -def get_sunset_hour_angle(lat: float, sol_dec: np.ndarray) -> np.ndarray: +def _get_sunset_hour_angle(lat: float, sol_dec: np.ndarray) -> np.ndarray: """Sunset hour angle @@ -159,7 +166,7 @@ def get_sunset_hour_angle(lat: float, sol_dec: np.ndarray) -> np.ndarray: @njit -def get_ird_earth_sun(doy: np.ndarray) -> np.ndarray: +def _get_ird_earth_sun(doy: np.ndarray) -> np.ndarray: """Inverse relative distance between Earth and Sun Equation 23 FAO-56 Allen et al. (1998) @@ -174,13 +181,12 @@ def get_ird_earth_sun(doy: np.ndarray) -> np.ndarray: np.ndarray Inverse relative distance between Earth and Sun """ - ird = 1 + 0.033 * np.cos((2 * PI) / 365 * doy) + ird = 1 + 0.033 * np.cos((2 * np.pi) / 365 * doy) return ird @njit -def get_extraterra_rad(lat: float, sol_dec: np.ndarray, sha: np.ndarray, - ird: np.ndarray) -> np.ndarray: +def _get_extraterra_rad(lat: float, sol_dec: np.ndarray, sha: np.ndarray, ird: np.ndarray) -> np.ndarray: """Extraterrestrial Radiation Equation 21 FAO-56 Allen et al. 
(1998) @@ -201,14 +207,14 @@ def get_extraterra_rad(lat: float, sol_dec: np.ndarray, sha: np.ndarray, np.ndarray Extraterrestrial radiation MJm-2day-1 """ - term1 = (24 * 60) / PI * 0.082 * ird + term1 = (24 * 60) / np.pi * 0.082 * ird term2 = sha * np.sin(lat) * np.sin(sol_dec) + np.cos(lat) * np.cos(sol_dec) * np.sin(sha) et_rad = term1 * term2 return et_rad @njit -def get_clear_sky_rad(elev: float, et_rad: np.ndarray) -> np.ndarray: +def _get_clear_sky_rad(elev: float, et_rad: np.ndarray) -> np.ndarray: """Clear sky radiation Equation 37 FAO-56 Allen et al. (1998) @@ -230,7 +236,7 @@ def get_clear_sky_rad(elev: float, et_rad: np.ndarray) -> np.ndarray: @njit -def get_avp_tmin(t_min: np.ndarray) -> np.ndarray: +def _get_avp_tmin(t_min: np.ndarray) -> np.ndarray: """Actual vapor pressure estimated using min temperature Equation 48 FAO-56 Allen et al. (1998) @@ -250,8 +256,8 @@ def get_avp_tmin(t_min: np.ndarray) -> np.ndarray: @njit -def get_net_outgoing_lw_rad(t_min: np.ndarray, t_max: np.ndarray, s_rad: np.ndarray, - cs_rad: np.ndarray, a_vp: np.ndarray) -> np.ndarray: +def _get_net_outgoing_lw_rad(t_min: np.ndarray, t_max: np.ndarray, s_rad: np.ndarray, cs_rad: np.ndarray, + a_vp: np.ndarray) -> np.ndarray: """Net outgoing longwave radiation Expects temperatures in degree and does the conversion in kelvin in the function. @@ -279,12 +285,13 @@ def get_net_outgoing_lw_rad(t_min: np.ndarray, t_max: np.ndarray, s_rad: np.ndar term1 = ((t_max + 273.16)**4 + (t_min + 273.16)**4) / 2 # conversion in K in equation term2 = 0.34 - 0.14 * np.sqrt(a_vp) term3 = 1.35 * s_rad / cs_rad - 0.35 - net_lw = STEFAN_BOLTZMAN * term1 * term2 * term3 + stefan_boltzman = 4.903e-09 + net_lw = stefan_boltzman * term1 * term2 * term3 return net_lw @njit -def get_net_rad(sw_rad: np.ndarray, lw_rad: np.ndarray) -> np.ndarray: +def _get_net_rad(sw_rad: np.ndarray, lw_rad: np.ndarray) -> np.ndarray: """Net radiation Equation 40 FAO-56 Allen et al. 
(1998) @@ -305,7 +312,7 @@ def get_net_rad(sw_rad: np.ndarray, lw_rad: np.ndarray) -> np.ndarray: @njit -def get_atmos_pressure(elev: float) -> float: +def _get_atmos_pressure(elev: float) -> float: """Atmospheric pressure Equation 7 FAO-56 Allen et al. (1998) @@ -325,7 +332,7 @@ def get_atmos_pressure(elev: float) -> float: @njit -def get_psy_const(atm_pressure: float) -> float: +def _get_psy_const(atm_pressure: float) -> float: """Psychometric constant Parameters @@ -342,7 +349,8 @@ def get_psy_const(atm_pressure: float) -> float: @njit -def srad_from_t(et_rad, cs_rad, t_min, t_max, coastal=False): +def _srad_from_t(et_rad, cs_rad, t_min, t_max, coastal=False): + """Estimate solar radiation from temperature""" # equation 50 if coastal: adj = 0.19 diff --git a/neuralhydrology/modelzoo/basemodel.py b/neuralhydrology/modelzoo/basemodel.py index 212112d0..984fe277 100644 --- a/neuralhydrology/modelzoo/basemodel.py +++ b/neuralhydrology/modelzoo/basemodel.py @@ -7,6 +7,18 @@ class BaseModel(nn.Module): + """Abstract base model class, don't use this class for model training. + + Use subclasses of this class for training/evaluating different models, e.g. use `CudaLSTM` for training a standard + LSTM model or `EA-LSTM` for training an Entity-Aware-LSTM. Refer to + `Documentation/Modelzoo `_ for a full list of + available models and how to integrate a new model. + + Parameters + ---------- + cfg : Config + The run configuration. + """ def __init__(self, cfg: Config): super(BaseModel, self).__init__() @@ -15,13 +27,44 @@ def __init__(self, cfg: Config): self.output_size = len(cfg.target_variables) def forward(self, data: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + """Perform a forward pass. + + Parameters + ---------- + data : Dict[str, torch.Tensor] + Dictionary, containing input features as key-value pairs. + + Returns + ------- + Dict[str, torch.Tensor] + Model output and potentially any intermediate states and activations as a dictionary. 
+ """ raise NotImplementedError def sample(self, data: Dict[str, torch.Tensor], n_samples: int) -> torch.Tensor: + """Sample model predictions, e.g., for MC-Dropout. + + This function does `n_samples` forward passes for each sample in the batch. Only useful for models with dropout, + to perform MC-Dropout sampling. Make sure to set the model to train mode before calling this function + (`model.train()`), otherwise dropout won't be active. + + Parameters + ---------- + data : Dict[str, torch.Tensor] + Dictionary, containing input features as key-value pairs. + n_samples : int + Number of samples to generate for each input sample. + + Returns + ------- + torch.Tensor + Sampled model outputs for the `predict_last_n` (config argument) time steps of each sequence. The shape of + the output is ``[batch size, predict_last_n, n_samples]``. + """ predict_last_n = self.cfg.predict_last_n samples = torch.zeros(data['x_d'].shape[0], predict_last_n, n_samples) for i in range(n_samples): prediction = self.forward(data) samples[:, -predict_last_n:, i] = prediction['y_hat'][:, -predict_last_n:, 0] - return samples \ No newline at end of file + return samples diff --git a/neuralhydrology/modelzoo/cudalstm.py b/neuralhydrology/modelzoo/cudalstm.py index 47818f6e..6dfc83f1 100644 --- a/neuralhydrology/modelzoo/cudalstm.py +++ b/neuralhydrology/modelzoo/cudalstm.py @@ -12,6 +12,22 @@ class CudaLSTM(BaseModel): + """LSTM model class, which relies on PyTorch's CUDA LSTM class. + + This class implements the standard LSTM combined with a model head, as specified in the config. All features + (time series and static) are concatenated and passed to the LSTM directly. If you want to embed the static features + prior to the concatenation, use the `EmbCudaLSTM` class. + To control the initial forget gate bias, use the config argument `initial_forget_bias`. 
Often it is useful to set + this value to a positive value at the start of the model training, to keep the forget gate closed and to facilitate + the gradient flow. + The `CudaLSTM` class does only support single timescale predictions. Use `MTSLSTM` to train a model and get + predictions on multiple temporal resolutions at the same time. + + Parameters + ---------- + cfg : Config + The run configuration. + """ def __init__(self, cfg: Config): super(CudaLSTM, self).__init__(cfg=cfg) @@ -32,13 +48,29 @@ def __init__(self, cfg: Config): self.head = get_head(cfg=cfg, n_in=cfg.hidden_size, n_out=self.output_size) - self.reset_parameters() + self._reset_parameters() - def reset_parameters(self): + def _reset_parameters(self): + """Special initialization of certain model weights.""" if self.cfg.initial_forget_bias is not None: self.lstm.bias_hh_l0.data[self.cfg.hidden_size:2 * self.cfg.hidden_size] = self.cfg.initial_forget_bias def forward(self, data: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + """Perform a forward pass on the CudaLSTM model. + + Parameters + ---------- + data : Dict[str, torch.Tensor] + Dictionary, containing input features as key-value pairs. + + Returns + ------- + Dict[str, torch.Tensor] + Model outputs and intermediate states as a dictionary. + - `y_hat`: model predictions of shape [batch size, sequence length, number of target variables]. + - `h_n`: hidden state at the last time step of the sequence of shape [1, batch size, hidden size]. + - `c_n`: cell state at the last time step of the sequence of shape [1, batch size, hidden size]. 
+ """ # transpose to [seq_length, batch_size, n_features] x_d = data['x_d'].transpose(0, 1) @@ -58,7 +90,7 @@ def forward(self, data: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: lstm_output, (h_n, c_n) = self.lstm(input=x_d) - # reshape to [batch_size, seq_length, n_hiddens] + # reshape to [1 , batch_size, n_hiddens] h_n = h_n.transpose(0, 1) c_n = c_n.transpose(0, 1) diff --git a/neuralhydrology/modelzoo/ealstm.py b/neuralhydrology/modelzoo/ealstm.py index 64c0e4fc..dba26e08 100644 --- a/neuralhydrology/modelzoo/ealstm.py +++ b/neuralhydrology/modelzoo/ealstm.py @@ -10,6 +10,28 @@ class EALSTM(BaseModel): + """Entity-Aware LSTM (EA-LSTM) model class. + + This model has been proposed by Kratzert et al. [#]_ as a variant of the standard LSTM. The main difference is that + the input gate of the EA-LSTM is modulated using only the static inputs, while the dynamic (time series) inputs are + used in all other parts of the model (i.e. forget gate, cell update gate and output gate). + To control the initial forget gate bias, use the config argument `initial_forget_bias`. Often it is useful to set + this value to a positive value at the start of the model training, to keep the forget gate closed and to facilitate + the gradient flow. + The `EALSTM` class does only support single timescale predictions. Use `MTSLSTM` to train an LSTM-based model and + get predictions on multiple temporal resolutions at the same time. + + Parameters + ---------- + cfg : Config + The run configuration. + + References + ---------- + .. [#] Kratzert, F., Klotz, D., Shalev, G., Klambauer, G., Hochreiter, S., and Nearing, G.: Towards learning + universal, regional, and local hydrological behaviors via machine learning applied to large-sample datasets, + Hydrol. Earth Syst. Sci., 23, 5089–5110, https://doi.org/10.5194/hess-23-5089-2019, 2019. 
+ """ def __init__(self, cfg: Config): super(EALSTM, self).__init__(cfg=cfg) @@ -36,9 +58,10 @@ def __init__(self, cfg: Config): self.head = get_head(cfg=cfg, n_in=cfg.hidden_size, n_out=self.output_size) # initialize parameters - self.reset_parameters() + self._reset_parameters() - def reset_parameters(self): + def _reset_parameters(self): + """Special initialization of certain model weights.""" nn.init.orthogonal_(self.weight_ih.data) weight_hh_data = torch.eye(self.cfg.hidden_size) @@ -51,7 +74,23 @@ def reset_parameters(self): self.bias.data[:self.cfg.hidden_size] = self.cfg.initial_forget_bias def forward(self, data: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: - + """Perform a forward pass on the EA-LSTM model. + + Parameters + ---------- + data : Dict[str, torch.Tensor] + Dictionary, containing input features as key-value pairs. + + Returns + ------- + Dict[str, torch.Tensor] + Model outputs and intermediate states as a dictionary. + - `y_hat`: model predictions of shape [batch size, sequence length, number of target variables]. + - `h_n`: hidden state at the last time step of the sequence of shape + [batch size, sequence length, number of target variables]. + - `c_n`: cell state at the last time step of the sequence of shape + [batch size, sequence length, number of target variables]. + """ if 'x_s' in data and 'x_one_hot' in data: x_s = torch.cat([data['x_s'], data['x_one_hot']], dim=-1) elif 'x_s' in data: diff --git a/neuralhydrology/modelzoo/embcudalstm.py b/neuralhydrology/modelzoo/embcudalstm.py index e098daf2..38ec694e 100644 --- a/neuralhydrology/modelzoo/embcudalstm.py +++ b/neuralhydrology/modelzoo/embcudalstm.py @@ -10,6 +10,23 @@ class EmbCudaLSTM(BaseModel): + """EmbCudaLSTM model class, which adds embedding networks for static inputs to the standard LSTM. 
+ + This class extends the standard `CudaLSTM` class to preprocess the static inputs by an embedding network, prior + to concatenating those values to the dynamic (time series) inputs. Use the config argument `embedding_hiddens` to + specify the architecture of the fully-connected embedding network. No activation function is applied to the outputs + of the embedding network. + To control the initial forget gate bias, use the config argument `initial_forget_bias`. Often it is useful to set + this value to a positive value at the start of the model training, to keep the forget gate closed and to facilitate + the gradient flow. + The `EmbCudaLSTM` class does only support single timescale predictions. Use `MTSLSTM` to train a model and get + predictions on multiple temporal resolutions at the same time. + + Parameters + ---------- + cfg : Config + The run configuration. + """ def __init__(self, cfg: Config): super(EmbCudaLSTM, self).__init__(cfg=cfg) @@ -27,14 +44,29 @@ def __init__(self, cfg: Config): self.head = get_head(cfg=cfg, n_in=cfg.hidden_size, n_out=self.output_size) - self.reset_parameters() + self._reset_parameters() - def reset_parameters(self): + def _reset_parameters(self): + """Special initialization of certain model weights.""" if self.cfg.initial_forget_bias is not None: self.lstm.bias_hh_l0.data[self.cfg.hidden_size:2 * self.cfg.hidden_size] = self.cfg.initial_forget_bias def forward(self, data: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: - + """Perform a forward pass on the EmbCudaLSTM model. + + Parameters + ---------- + data : Dict[str, torch.Tensor] + Dictionary, containing input features as key-value pairs. + + Returns + ------- + Dict[str, torch.Tensor] + Model outputs and intermediate states as a dictionary. + - `y_hat`: model predictions of shape [batch size, sequence length, number of target variables]. + - `h_n`: hidden state at the last time step of the sequence of shape [1, batch size, hidden size]. 
+ - `c_n`: cell state at the last time step of the sequence of shape [1, batch size, hidden size]. + """ x_d = data['x_d'].transpose(0, 1) if 'x_s' in data and 'x_one_hot' in data: diff --git a/neuralhydrology/modelzoo/fc.py b/neuralhydrology/modelzoo/fc.py index d024b0e4..fb5bb175 100644 --- a/neuralhydrology/modelzoo/fc.py +++ b/neuralhydrology/modelzoo/fc.py @@ -6,6 +6,25 @@ class FC(nn.Module): + """Auxiliary class to build (multi-layer) fully-connected networks. + + This class is used in different places of the codebase to build fully-connected networks. E.g., the `EA-LSTM` and + `EmbCudaLSTM` use this class to create embedding networks for the static inputs. + Use the config argument `embedding_hiddens` to specify the hidden neurons of the embedding network. If only one + number is specified the embedding network consists of a single linear layer that maps the input into the specified + dimension. + Use the config argument `embedding_activation` to specify the activation function for intermediate layers. Currently + supported are 'tanh' and 'sigmoid'. + Use the config argument `embedding_dropout` to specify the dropout rate in intermediate layers. + + Parameters + ---------- + cfg : Config + The run configuration. + input_size : int, optional + Number of input features. If not specified, the number of input features is the sum of all static inputs (i.e., + catchment attributes, one-hot-encoding, etc.) 
+ """ def __init__(self, cfg: Config, input_size: int = None): super(FC, self).__init__() @@ -44,7 +63,7 @@ def __init__(self, cfg: Config, input_size: int = None): layers.append(nn.Linear(input_size, output_size)) self.net = nn.Sequential(*layers) - self.reset_parameters() + self._reset_parameters() def _get_activation(self, name: str) -> nn.Module: if name.lower() == "tanh": @@ -55,7 +74,8 @@ def _get_activation(self, name: str) -> nn.Module: raise NotImplementedError(f"{name} currently not supported as activation in this class") return activation - def reset_parameters(self): + def _reset_parameters(self): + """Special initialization of certain model weights.""" for layer in self.net: if isinstance(layer, nn.modules.linear.Linear): n_in = layer.weight.shape[1] @@ -64,4 +84,17 @@ def reset_parameters(self): nn.init.constant_(layer.bias, val=0) def forward(self, x: torch.Tensor) -> torch.Tensor: + """Perform a forward pass on the FC model. + + Parameters + ---------- + x : torch.Tensor + Input data of shape [any, any, input size] + + Returns + ------- + torch.Tensor + Embedded inputs of shape [any, any, output_size], where 'output_size' is the last number specified in the + `embedding_hiddens` config argument. + """ return self.net(x) diff --git a/neuralhydrology/modelzoo/head.py b/neuralhydrology/modelzoo/head.py index cc112de4..7739a8c3 100644 --- a/neuralhydrology/modelzoo/head.py +++ b/neuralhydrology/modelzoo/head.py @@ -1,4 +1,5 @@ import logging +from typing import Dict import torch import torch.nn as nn @@ -34,13 +35,24 @@ def get_head(cfg: Config, n_in: int, n_out: int) -> nn.Module: class Regression(nn.Module): - """ - Regression head with different output activations. + """Single-layer regression head with different output activations. + + Parameters + ---------- + n_in : int + Number of input neurons. + n_out : int + Number of output neurons. + activation: str, optional + Output activation function. 
Can be specified in the config using the `output_activation` argument. Supported + are {'linear', 'relu', 'softplus'}. If not specified (or an unsupported activation function is specified), will + default to 'linear' activation. """ def __init__(self, n_in: int, n_out: int, activation: str = "linear"): super(Regression, self).__init__() + # TODO: Add multi-layer support layers = [nn.Linear(n_in, n_out)] if activation != "linear": if activation.lower() == "relu": @@ -51,5 +63,16 @@ def __init__(self, n_in: int, n_out: int, activation: str = "linear"): LOGGER.warning(f"## WARNING: Ignored output activation {activation} and used 'linear' instead.") self.net = nn.Sequential(*layers) - def forward(self, x: torch.Tensor): - return {'y_hat': self.net(x)} \ No newline at end of file + def forward(self, x: torch.Tensor) -> Dict[str, torch.Tensor]: + """Perform a forward pass on the Regression head. + + Parameters + ---------- + x : torch.Tensor + + Returns + ------- + Dict[str, torch.Tensor] + Dictionary, containing the model predictions in the 'y_hat' key. + """ + return {'y_hat': self.net(x)} diff --git a/neuralhydrology/modelzoo/lstm.py b/neuralhydrology/modelzoo/lstm.py index edc91d7a..aa478506 100644 --- a/neuralhydrology/modelzoo/lstm.py +++ b/neuralhydrology/modelzoo/lstm.py @@ -17,6 +17,8 @@ class LSTM(BaseModel): The idea of this model is to be able to train an LSTM using the nn.LSTM layer, which uses the optimized CuDNN implementation, and later to copy the weights into this model for a more in-depth network analysis. + Can be used with trained models. For instance, from the `CudaLSTM` or `EmbCudaLSTM` classes, the CuDNN LSTM layer can be + accessed as `model.lstm`, where `model` is a `CudaLSTM` or `EmbCudaLSTM` instance. 
Note: Currently only supports one-layer CuDNN LSTMs @@ -50,11 +52,9 @@ def __init__(self, cfg: Config): self.head = get_head(cfg=cfg, n_in=self._hidden_size, n_out=self.output_size) - def forward(self, - data: Dict[str, torch.Tensor], - h_0: torch.Tensor = None, + def forward(self, data: Dict[str, torch.Tensor], h_0: torch.Tensor = None, c_0: torch.Tensor = None) -> Dict[str, torch.Tensor]: - """Forward pass through the LSTM network + """Perform a forward pass on the LSTM model. Parameters ---------- @@ -141,9 +141,10 @@ def __init__(self, input_size: int, hidden_size: int, initial_forget_bias: float self.b_hh = nn.Parameter(torch.FloatTensor(4 * hidden_size)) self.b_ih = nn.Parameter(torch.FloatTensor(4 * hidden_size)) - self.reset_parameters() + self._reset_parameters() - def reset_parameters(self): + def _reset_parameters(self): + """Special initialization of certain model weights.""" stdv = math.sqrt(3 / self.hidden_size) for weight in self.parameters(): if len(weight.shape) > 1: diff --git a/neuralhydrology/modelzoo/mtslstm.py b/neuralhydrology/modelzoo/mtslstm.py index f16f3566..68e650da 100644 --- a/neuralhydrology/modelzoo/mtslstm.py +++ b/neuralhydrology/modelzoo/mtslstm.py @@ -22,11 +22,11 @@ class MTSLSTM(BaseModel): Each branch processes the inputs at its respective timescale. Finally, one prediction head per timescale generates the predictions for that timescale based on the LSTM output. Optionally, one can specify: - - a different hidden size for each LSTM branch (use a dict in the ``hidden_size`` config argument) - - different dynamic input variables for each timescale (use a dict in the ``dynamic_inputs`` config argument) - - the strategy to transfer states from the initial shared low-resolution LSTM to the per-timescale - higher-resolution LSTMs. By default, this is a linear transfer layer, but you can specify 'identity' to use an - identity operation or 'None' to turn off any transfer (via the ``transfer_mtlstm_states`` config argument). 
+ - a different hidden size for each LSTM branch (use a dict in the ``hidden_size`` config argument) + - different dynamic input variables for each timescale (use a dict in the ``dynamic_inputs`` config argument) + - the strategy to transfer states from the initial shared low-resolution LSTM to the per-timescale + higher-resolution LSTMs. By default, this is a linear transfer layer, but you can specify 'identity' to use an + identity operation or 'None' to turn off any transfer (via the ``transfer_mtlstm_states`` config argument). The sMTS-LSTM variant has the same overall architecture, but the weights of the per-timescale branches (including From 2714f25c7013b5a363e618089a6a3838ba9cd8cc Mon Sep 17 00:00:00 2001 From: Frederik Kratzert Date: Wed, 7 Oct 2020 08:38:52 +0200 Subject: [PATCH 11/15] Data module restructuring (#13) * split data module into datasetzoo and datautils #5 * moved dataset loading funcs to class files, changed attribute sanity check --- .../api/neuralhydrology.data.caravan.rst | 7 - docs/source/api/neuralhydrology.data.rst | 20 - ...euralhydrology.datasetzoo.basedataset.rst} | 2 +- ...> neuralhydrology.datasetzoo.camelsgb.rst} | 2 +- ...> neuralhydrology.datasetzoo.camelsus.rst} | 2 +- ...alhydrology.datasetzoo.hourlycamelsus.rst} | 2 +- .../source/api/neuralhydrology.datasetzoo.rst | 15 + ...ralhydrology.datautils.climateindices.rst} | 2 +- ...ralhydrology.datautils.dischargeinput.rst} | 2 +- ....rst => neuralhydrology.datautils.pet.rst} | 2 +- docs/source/api/neuralhydrology.datautils.rst | 15 + ...st => neuralhydrology.datautils.utils.rst} | 2 +- docs/source/api/neuralhydrology.rst | 3 +- neuralhydrology/data/camelsus.py | 106 ---- neuralhydrology/data/utils.py | 543 ------------------ .../{data => datasetzoo}/__init__.py | 10 +- .../{data => datasetzoo}/basedataset.py | 14 +- .../{data => datasetzoo}/camelsgb.py | 102 +++- neuralhydrology/datasetzoo/camelsus.py | 240 ++++++++ .../{data => datasetzoo}/hourlycamelsus.py | 137 ++++- 
neuralhydrology/datautils/__init__.py | 0 .../{data => datautils}/climateindices.py | 2 +- .../{data => datautils}/dischargeinput.py | 2 +- neuralhydrology/{data => datautils}/pet.py | 0 neuralhydrology/datautils/utils.py | 167 ++++++ neuralhydrology/evaluation/metrics.py | 2 +- neuralhydrology/evaluation/signatures.py | 2 +- neuralhydrology/evaluation/tester.py | 6 +- neuralhydrology/modelzoo/mtslstm.py | 2 +- neuralhydrology/modelzoo/odelstm.py | 2 +- neuralhydrology/training/basetrainer.py | 6 +- neuralhydrology/training/regularization.py | 2 +- neuralhydrology/utils/nh_results_ensemble.py | 2 +- test/test_config_runs.py | 8 +- 34 files changed, 698 insertions(+), 733 deletions(-) delete mode 100644 docs/source/api/neuralhydrology.data.caravan.rst delete mode 100644 docs/source/api/neuralhydrology.data.rst rename docs/source/api/{neuralhydrology.data.basedataset.rst => neuralhydrology.datasetzoo.basedataset.rst} (58%) rename docs/source/api/{neuralhydrology.data.camelsgb.rst => neuralhydrology.datasetzoo.camelsgb.rst} (58%) rename docs/source/api/{neuralhydrology.data.camelsus.rst => neuralhydrology.datasetzoo.camelsus.rst} (58%) rename docs/source/api/{neuralhydrology.data.hourlycamelsus.rst => neuralhydrology.datasetzoo.hourlycamelsus.rst} (59%) create mode 100644 docs/source/api/neuralhydrology.datasetzoo.rst rename docs/source/api/{neuralhydrology.data.climateindices.rst => neuralhydrology.datautils.climateindices.rst} (59%) rename docs/source/api/{neuralhydrology.data.dischargeinput.rst => neuralhydrology.datautils.dischargeinput.rst} (59%) rename docs/source/api/{neuralhydrology.data.pet.rst => neuralhydrology.datautils.pet.rst} (57%) create mode 100644 docs/source/api/neuralhydrology.datautils.rst rename docs/source/api/{neuralhydrology.data.utils.rst => neuralhydrology.datautils.utils.rst} (58%) delete mode 100644 neuralhydrology/data/camelsus.py delete mode 100644 neuralhydrology/data/utils.py rename neuralhydrology/{data => datasetzoo}/__init__.py 
(92%) rename neuralhydrology/{data => datasetzoo}/basedataset.py (98%) rename neuralhydrology/{data => datasetzoo}/camelsgb.py (51%) create mode 100644 neuralhydrology/datasetzoo/camelsus.py rename neuralhydrology/{data => datasetzoo}/hourlycamelsus.py (54%) create mode 100644 neuralhydrology/datautils/__init__.py rename neuralhydrology/{data => datautils}/climateindices.py (99%) rename neuralhydrology/{data => datautils}/dischargeinput.py (98%) rename neuralhydrology/{data => datautils}/pet.py (100%) create mode 100644 neuralhydrology/datautils/utils.py diff --git a/docs/source/api/neuralhydrology.data.caravan.rst b/docs/source/api/neuralhydrology.data.caravan.rst deleted file mode 100644 index ef3bab55..00000000 --- a/docs/source/api/neuralhydrology.data.caravan.rst +++ /dev/null @@ -1,7 +0,0 @@ -Caravan -======= - -.. automodule:: neuralhydrology.data.caravan - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/api/neuralhydrology.data.rst b/docs/source/api/neuralhydrology.data.rst deleted file mode 100644 index 8eed95f4..00000000 --- a/docs/source/api/neuralhydrology.data.rst +++ /dev/null @@ -1,20 +0,0 @@ -nh.data -======= - -.. automodule:: neuralhydrology.data - :members: - :undoc-members: - :show-inheritance: - -.. 
toctree:: - :maxdepth: 4 - - neuralhydrology.data.basedataset - neuralhydrology.data.camelsus - neuralhydrology.data.hourlycamelsus - neuralhydrology.data.camelsgb - neuralhydrology.data.caravan - neuralhydrology.data.climateindices - neuralhydrology.data.dischargeinput - neuralhydrology.data.pet - neuralhydrology.data.utils diff --git a/docs/source/api/neuralhydrology.data.basedataset.rst b/docs/source/api/neuralhydrology.datasetzoo.basedataset.rst similarity index 58% rename from docs/source/api/neuralhydrology.data.basedataset.rst rename to docs/source/api/neuralhydrology.datasetzoo.basedataset.rst index 8aaf33f1..bc60db2a 100644 --- a/docs/source/api/neuralhydrology.data.basedataset.rst +++ b/docs/source/api/neuralhydrology.datasetzoo.basedataset.rst @@ -1,7 +1,7 @@ BaseDataset =========== -.. automodule:: neuralhydrology.data.basedataset +.. automodule:: neuralhydrology.datasetzoo.basedataset :members: :undoc-members: :show-inheritance: \ No newline at end of file diff --git a/docs/source/api/neuralhydrology.data.camelsgb.rst b/docs/source/api/neuralhydrology.datasetzoo.camelsgb.rst similarity index 58% rename from docs/source/api/neuralhydrology.data.camelsgb.rst rename to docs/source/api/neuralhydrology.datasetzoo.camelsgb.rst index dffefa3c..5c2bc844 100644 --- a/docs/source/api/neuralhydrology.data.camelsgb.rst +++ b/docs/source/api/neuralhydrology.datasetzoo.camelsgb.rst @@ -1,7 +1,7 @@ CamelsGB ======== -.. automodule:: neuralhydrology.data.camelsgb +.. 
automodule:: neuralhydrology.datasetzoo.camelsgb :members: :undoc-members: :show-inheritance: \ No newline at end of file diff --git a/docs/source/api/neuralhydrology.data.camelsus.rst b/docs/source/api/neuralhydrology.datasetzoo.camelsus.rst similarity index 58% rename from docs/source/api/neuralhydrology.data.camelsus.rst rename to docs/source/api/neuralhydrology.datasetzoo.camelsus.rst index b3540506..8cd08366 100644 --- a/docs/source/api/neuralhydrology.data.camelsus.rst +++ b/docs/source/api/neuralhydrology.datasetzoo.camelsus.rst @@ -1,7 +1,7 @@ CamelsUS ======== -.. automodule:: neuralhydrology.data.camelsus +.. automodule:: neuralhydrology.datasetzoo.camelsus :members: :undoc-members: :show-inheritance: diff --git a/docs/source/api/neuralhydrology.data.hourlycamelsus.rst b/docs/source/api/neuralhydrology.datasetzoo.hourlycamelsus.rst similarity index 59% rename from docs/source/api/neuralhydrology.data.hourlycamelsus.rst rename to docs/source/api/neuralhydrology.datasetzoo.hourlycamelsus.rst index 7876745f..b1c3a527 100644 --- a/docs/source/api/neuralhydrology.data.hourlycamelsus.rst +++ b/docs/source/api/neuralhydrology.datasetzoo.hourlycamelsus.rst @@ -1,7 +1,7 @@ HourlyCamelsUS ============== -.. automodule:: neuralhydrology.data.hourlycamelsus +.. automodule:: neuralhydrology.datasetzoo.hourlycamelsus :members: :undoc-members: :show-inheritance: \ No newline at end of file diff --git a/docs/source/api/neuralhydrology.datasetzoo.rst b/docs/source/api/neuralhydrology.datasetzoo.rst new file mode 100644 index 00000000..f8f49b58 --- /dev/null +++ b/docs/source/api/neuralhydrology.datasetzoo.rst @@ -0,0 +1,15 @@ +nh.datasetzoo +============= + +.. automodule:: neuralhydrology.datasetzoo + :members: + :undoc-members: + :show-inheritance: + +.. 
toctree:: + :maxdepth: 4 + + neuralhydrology.datasetzoo.basedataset + neuralhydrology.datasetzoo.camelsus + neuralhydrology.datasetzoo.hourlycamelsus + neuralhydrology.datasetzoo.camelsgb diff --git a/docs/source/api/neuralhydrology.data.climateindices.rst b/docs/source/api/neuralhydrology.datautils.climateindices.rst similarity index 59% rename from docs/source/api/neuralhydrology.data.climateindices.rst rename to docs/source/api/neuralhydrology.datautils.climateindices.rst index 253ed06c..2d6dd2cf 100644 --- a/docs/source/api/neuralhydrology.data.climateindices.rst +++ b/docs/source/api/neuralhydrology.datautils.climateindices.rst @@ -1,7 +1,7 @@ climateindices ============== -.. automodule:: neuralhydrology.data.climateindices +.. automodule:: neuralhydrology.datautils.climateindices :members: :undoc-members: :show-inheritance: diff --git a/docs/source/api/neuralhydrology.data.dischargeinput.rst b/docs/source/api/neuralhydrology.datautils.dischargeinput.rst similarity index 59% rename from docs/source/api/neuralhydrology.data.dischargeinput.rst rename to docs/source/api/neuralhydrology.datautils.dischargeinput.rst index 4f470eeb..894ee906 100644 --- a/docs/source/api/neuralhydrology.data.dischargeinput.rst +++ b/docs/source/api/neuralhydrology.datautils.dischargeinput.rst @@ -1,7 +1,7 @@ dischargeinput ============== -.. automodule:: neuralhydrology.data.dischargeinput +.. automodule:: neuralhydrology.datautils.dischargeinput :members: :undoc-members: :show-inheritance: diff --git a/docs/source/api/neuralhydrology.data.pet.rst b/docs/source/api/neuralhydrology.datautils.pet.rst similarity index 57% rename from docs/source/api/neuralhydrology.data.pet.rst rename to docs/source/api/neuralhydrology.datautils.pet.rst index 7c05d758..65cb03f1 100644 --- a/docs/source/api/neuralhydrology.data.pet.rst +++ b/docs/source/api/neuralhydrology.datautils.pet.rst @@ -1,7 +1,7 @@ pet === -.. automodule:: neuralhydrology.data.pet +.. 
automodule:: neuralhydrology.datautils.pet :members: :undoc-members: :show-inheritance: diff --git a/docs/source/api/neuralhydrology.datautils.rst b/docs/source/api/neuralhydrology.datautils.rst new file mode 100644 index 00000000..be25b0b2 --- /dev/null +++ b/docs/source/api/neuralhydrology.datautils.rst @@ -0,0 +1,15 @@ +nh.datautils +============ + +.. automodule:: neuralhydrology.datautils + :members: + :undoc-members: + :show-inheritance: + +.. toctree:: + :maxdepth: 4 + + neuralhydrology.datautils.climateindices + neuralhydrology.datautils.dischargeinput + neuralhydrology.datautils.pet + neuralhydrology.datautils.utils diff --git a/docs/source/api/neuralhydrology.data.utils.rst b/docs/source/api/neuralhydrology.datautils.utils.rst similarity index 58% rename from docs/source/api/neuralhydrology.data.utils.rst rename to docs/source/api/neuralhydrology.datautils.utils.rst index 258676ee..f9a5cfe4 100644 --- a/docs/source/api/neuralhydrology.data.utils.rst +++ b/docs/source/api/neuralhydrology.datautils.utils.rst @@ -1,7 +1,7 @@ utils ===== -.. automodule:: neuralhydrology.data.utils +.. automodule:: neuralhydrology.datautils.utils :members: :undoc-members: :show-inheritance: diff --git a/docs/source/api/neuralhydrology.rst b/docs/source/api/neuralhydrology.rst index eddeb281..111ddea4 100644 --- a/docs/source/api/neuralhydrology.rst +++ b/docs/source/api/neuralhydrology.rst @@ -10,7 +10,8 @@ neuralhydrology API .. 
toctree:: :maxdepth: 4 - neuralhydrology.data + neuralhydrology.datasetzoo + neuralhydrology.datautils neuralhydrology.evaluation neuralhydrology.modelzoo neuralhydrology.training diff --git a/neuralhydrology/data/camelsus.py b/neuralhydrology/data/camelsus.py deleted file mode 100644 index 05a461a6..00000000 --- a/neuralhydrology/data/camelsus.py +++ /dev/null @@ -1,106 +0,0 @@ -from typing import Dict, List, Union - -import numpy as np -import pandas as pd -import xarray - -from neuralhydrology.data.basedataset import BaseDataset -from neuralhydrology.data import utils -from neuralhydrology.utils.config import Config - - -class CamelsUS(BaseDataset): - """Data set class for the CAMELS US data set by [#]_ and [#]_. - - Parameters - ---------- - cfg : Config - The run configuration. - is_train : bool - Defines if the dataset is used for training or evaluating. If True (training), means/stds for each feature - are computed and stored to the run directory. If one-hot encoding is used, the mapping for the one-hot encoding - is created and also stored to disk. If False, a `scaler` input is expected and similarly the `id_to_int` input - if one-hot encoding is used. - period : {'train', 'validation', 'test'} - Defines the period for which the data will be loaded - basin : str, optional - If passed, the data for only this basin will be loaded. Otherwise the basin(s) are read from the appropriate - basin file, corresponding to the `period`. - additional_features : List[Dict[str, pd.DataFrame]], optional - List of dictionaries, mapping from a basin id to a pandas DataFrame. This DataFrame will be added to the data - loaded from the dataset and all columns are available as 'dynamic_inputs', 'static_inputs' and - 'target_variables' - id_to_int : Dict[str, int], optional - If the config argument 'use_basin_id_encoding' is True in the config and period is either 'validation' or - 'test', this input is required. 
It is a dictionary, mapping from basin id to an integer (the one-hot encoding). - scaler : Dict[str, Union[pd.Series, xarray.DataArray]], optional - If period is either 'validation' or 'test', this input is required. It contains the means and standard - deviations for each feature and is stored to the run directory during training (train_data/train_data_scaler.p) - - References - ---------- - .. [#] A. J. Newman, M. P. Clark, K. Sampson, A. Wood, L. E. Hay, A. Bock, R. J. Viger, D. Blodgett, - L. Brekke, J. R. Arnold, T. Hopson, and Q. Duan: Development of a large-sample watershed-scale - hydrometeorological dataset for the contiguous USA: dataset characteristics and assessment of regional - variability in hydrologic model performance. Hydrol. Earth Syst. Sci., 19, 209-223, - doi:10.5194/hess-19-209-2015, 2015 - .. [#] Addor, N., Newman, A. J., Mizukami, N. and Clark, M. P.: The CAMELS data set: catchment attributes and - meteorology for large-sample studies, Hydrol. Earth Syst. Sci., 21, 5293-5313, doi:10.5194/hess-21-5293-2017, - 2017. 
- """ - - def __init__(self, - cfg: Config, - is_train: bool, - period: str, - basin: str = None, - additional_features: List[Dict[str, pd.DataFrame]] = [], - id_to_int: Dict[str, int] = {}, - scaler: Dict[str, Union[pd.Series, xarray.DataArray]] = {}): - super(CamelsUS, self).__init__(cfg=cfg, - is_train=is_train, - period=period, - basin=basin, - additional_features=additional_features, - id_to_int=id_to_int, - scaler=scaler) - - def _load_basin_data(self, basin: str) -> pd.DataFrame: - """Load input and output data from text files.""" - # get forcings - dfs = [] - for forcing in self.cfg.forcings: - df, area = utils.load_camels_us_forcings(self.cfg.data_dir, basin, forcing) - - # rename columns - if len(self.cfg.forcings) > 1: - df = df.rename(columns={col: f"{col}_{forcing}" for col in df.columns}) - dfs.append(df) - df = pd.concat(dfs, axis=1) - - # add discharge - df['QObs(mm/d)'] = utils.load_camels_us_discharge(self.cfg.data_dir, basin, area) - - # replace invalid discharge values by NaNs - qobs_cols = [col for col in df.columns if "qobs" in col.lower()] - for col in qobs_cols: - df.loc[df[col] < 0, col] = np.nan - - return df - - def _load_attributes(self) -> pd.DataFrame: - if self.cfg.camels_attributes: - if self.is_train: - # sanity check attributes for NaN in per-feature standard deviation - utils.attributes_sanity_check(data_dir=self.cfg.data_dir, - attribute_set=self.cfg.dataset, - basins=self.basins, - attribute_list=self.cfg.camels_attributes) - - df = utils.load_camels_us_attributes(self.cfg.data_dir, basins=self.basins) - - # remove all attributes not defined in the config - drop_cols = [c for c in df.columns if c not in self.cfg.camels_attributes] - df = df.drop(drop_cols, axis=1) - - return df diff --git a/neuralhydrology/data/utils.py b/neuralhydrology/data/utils.py deleted file mode 100644 index 9e7e508a..00000000 --- a/neuralhydrology/data/utils.py +++ /dev/null @@ -1,543 +0,0 @@ -from pathlib import Path -from typing import List, Tuple, 
Union - -import numpy as np -import pandas as pd -import xarray -from xarray.core.dataarray import DataArray -from xarray.core.dataset import Dataset - -######################################################################################################################## -# CAMELS US utility functions # -######################################################################################################################## - - -def load_camels_us_attributes(data_dir: Path, basins: List[str] = []) -> pd.DataFrame: - """Load CAMELS US attributes from the dataset provided by [#]_ - - Parameters - ---------- - data_dir : Path - Path to the CAMELS US directory. This folder must contain a 'camels_attributes_v2.0' folder (the original - data set) containing the corresponding txt files for each attribute group. - basins : List[str], optional - If passed, return only attributes for the basins specified in this list. Otherwise, the attributes of all basins - are returned. - - Returns - ------- - pandas.DataFrame - Basin-indexed DataFrame, containing the attributes as columns. - - References - ---------- - .. [#] Addor, N., Newman, A. J., Mizukami, N. and Clark, M. P.: The CAMELS data set: catchment attributes and - meteorology for large-sample studies, Hydrol. Earth Syst. Sci., 21, 5293-5313, doi:10.5194/hess-21-5293-2017, - 2017. 
- """ - attributes_path = Path(data_dir) / 'camels_attributes_v2.0' - - if not attributes_path.exists(): - raise RuntimeError(f"Attribute folder not found at {attributes_path}") - - txt_files = attributes_path.glob('camels_*.txt') - - # Read-in attributes into one big dataframe - dfs = [] - for txt_file in txt_files: - df_temp = pd.read_csv(txt_file, sep=';', header=0, dtype={'gauge_id': str}) - df_temp = df_temp.set_index('gauge_id') - - dfs.append(df_temp) - - df = pd.concat(dfs, axis=1) - # convert huc column to double digit strings - df['huc'] = df['huc_02'].apply(lambda x: str(x).zfill(2)) - df = df.drop('huc_02', axis=1) - - if basins: - # drop rows of basins not contained in the passed list - drop_basins = [b for b in df.index if b not in basins] - df = df.drop(drop_basins, axis=0) - - return df - - -def load_camels_us_forcings(data_dir: Path, basin: str, forcings: str) -> Tuple[pd.DataFrame, int]: - """Load the forcing data for a basin of the CAMELS US data set. - - Parameters - ---------- - data_dir : Path - Path to the CAMELS US directory. This folder must contain a 'basin_mean_forcing' folder containing one - subdirectory for each forcing. The forcing directories have to contain 18 subdirectories (for the 18 HUCS) as in - the original CAMELS data set. In each HUC folder are the forcing files (.txt), starting with the 8-digit basin - id. - basin : str - 8-digit USGS identifier of the basin. - forcings : str - Can be e.g. 'daymet' or 'nldas', etc. Must match the folder names in the 'basin_mean_forcing' directory. - - Returns - ------- - pd.DataFrame - Time-indexed DataFrame, containing the forcing data. - int - Catchment area (m2), specified in the header of the forcing file. 
- """ - forcing_path = data_dir / 'basin_mean_forcing' / forcings - if not forcing_path.is_dir(): - raise OSError(f"{forcing_path} does not exist") - - files = list(forcing_path.glob('**/*_forcing_leap.txt')) - file_path = [f for f in files if f.name[:8] == basin] - if file_path: - file_path = file_path[0] - else: - raise FileNotFoundError(f'No file for Basin {basin} at {file_path}') - - df = pd.read_csv(file_path, sep='\s+', header=3) - df["date"] = pd.to_datetime(df.Year.map(str) + "/" + df.Mnth.map(str) + "/" + df.Day.map(str), format="%Y/%m/%d") - df = df.set_index("date") - - # load area from header - with open(file_path, 'r') as fp: - content = fp.readlines() - area = int(content[2]) - - return df, area - - -def load_camels_us_discharge(data_dir: Path, basin: str, area: int) -> pd.Series: - """Load the discharge data for a basin of the CAMELS US data set. - - Parameters - ---------- - data_dir : Path - Path to the CAMELS US directory. This folder must contain a 'usgs_streamflow' folder with 18 - subdirectories (for the 18 HUCS) as in the original CAMELS data set. In each HUC folder are the discharge files - (.txt), starting with the 8-digit basin id. - basin : str - 8-digit USGS identifier of the basin. - area : int - Catchment area (m2), used to normalize the discharge. 
- - Returns - ------- - pd.Series - Time-index pandas.Series of the discharge values (mm/day) - """ - - discharge_path = data_dir / 'usgs_streamflow' - files = list(discharge_path.glob('**/*_streamflow_qc.txt')) - file_path = [f for f in files if f.name[:8] == basin] - if file_path: - file_path = file_path[0] - else: - raise FileNotFoundError(f'No file for Basin {basin} at {file_path}') - - col_names = ['basin', 'Year', 'Mnth', 'Day', 'QObs', 'flag'] - df = pd.read_csv(file_path, sep='\s+', header=None, names=col_names) - df["date"] = pd.to_datetime(df.Year.map(str) + "/" + df.Mnth.map(str) + "/" + df.Day.map(str), format="%Y/%m/%d") - df = df.set_index("date") - - # normalize discharge from cubic feed per second to mm per day - df.QObs = 28316846.592 * df.QObs * 86400 / (area * 10**6) - - return df.QObs - - -######################################################################################################################## -# HOURLY CAMELS US utility functions # -######################################################################################################################## - - -def load_hourly_us_forcings(data_dir: Path, basin: str, forcings: str) -> pd.DataFrame: - """Load the hourly forcing data for a basin of the CAMELS US data set. - - The hourly forcings are not included in the original data set by Newman et al. (2017). - - Parameters - ---------- - data_dir : Path - Path to the CAMELS US directory. This folder must contain an 'hourly' folder containing one subdirectory - for each forcing, which contains the forcing files (.csv) for each basin. Files have to contain the 8-digit - basin id. - basin : str - 8-digit USGS identifier of the basin. - forcings : str - Must match the folder names in the 'hourly' directory. E.g. 'nldas_hourly' - - Returns - ------- - pd.DataFrame - Time-indexed DataFrame, containing the forcing data. 
- """ - forcing_path = data_dir / 'hourly' / forcings - if not forcing_path.is_dir(): - raise OSError(f"{forcing_path} does not exist") - - files = list(forcing_path.glob('*.csv')) - file_path = [f for f in files if basin in f.stem] - if file_path: - file_path = file_path[0] - else: - raise FileNotFoundError(f'No file for Basin {basin} at {forcing_path}') - - return pd.read_csv(file_path, index_col=['date'], parse_dates=['date']) - - -def load_hourly_us_discharge(data_dir: Path, basin: str) -> pd.DataFrame: - """Load the hourly discharge data for a basin of the CAMELS US data set. - - Parameters - ---------- - data_dir : Path - Path to the CAMELS US directory. This folder must contain a folder called 'hourly' with a subdirectory - 'usgs_streamflow' which contains the discharge files (.csv) for each basin. File names must contain the 8-digit - basin id. - basin : str - 8-digit USGS identifier of the basin. - - Returns - ------- - pd.Series - Time-index Series of the discharge values (mm/hour) - """ - discharge_path = data_dir / 'hourly' / 'usgs_streamflow' - files = list(discharge_path.glob('**/*usgs-hourly.csv')) - file_path = [f for f in files if basin in f.stem] - if file_path: - file_path = file_path[0] - else: - raise FileNotFoundError(f'No file for Basin {basin} at {discharge_path}') - - return pd.read_csv(file_path, index_col=['date'], parse_dates=['date']) - - -def load_hourly_us_stage(data_dir: Path, basin: str) -> pd.Series: - """Load the hourly stage data for a basin of the CAMELS US data set. - - Parameters - ---------- - data_dir : Path - Path to the CAMELS US directory. This folder must contain a folder called 'hourly' with a subdirectory - 'usgs_stage' which contains the stage files (.csv) for each basin. File names must contain the 8-digit basin id. - basin : str - 8-digit USGS identifier of the basin. 
- - Returns - ------- - pd.Series - Time-index Series of the stage values (m) - """ - stage_path = data_dir / 'hourly' / 'usgs_stage' - files = list(stage_path.glob('**/*_utc.csv')) - file_path = [f for f in files if basin in f.stem] - if file_path: - file_path = file_path[0] - else: - raise FileNotFoundError(f'No file for Basin {basin} at {stage_path}') - - df = pd.read_csv(file_path, - sep=',', - index_col=['datetime'], - parse_dates=['datetime'], - usecols=['datetime', 'gauge_height_ft']) - df = df.resample('H').mean() - df["gauge_height_m"] = df["gauge_height_ft"] * 0.3048 - - return df["gauge_height_m"] - - -def load_hourly_us_netcdf(data_dir: Path, forcings: str) -> xarray.Dataset: - """Load hourly forcing and discharge data from preprocessed netCDF file. - - Parameters - ---------- - data_dir : Path - Path to the CAMELS US directory. This folder must contain a folder called 'hourly', containing the netCDF file. - forcings : str - Name of the forcing product. Must match the ending of the netCDF file. E.g. 'nldas_hourly' for - 'usgs-streamflow-nldas_hourly.nc' - - Returns - ------- - xarray.Dataset - Dataset containing the combined discharge and forcing data of all basins (as stored in the netCDF) - """ - netcdf_path = data_dir / 'hourly' / f'usgs-streamflow-{forcings}.nc' - if not netcdf_path.is_file(): - raise FileNotFoundError(f'No NetCDF file for hourly streamflow and {forcings} at {netcdf_path}.') - - return xarray.open_dataset(netcdf_path) - - -######################################################################################################################## -# CAMELS GB utility functions # -######################################################################################################################## - - -def load_camels_gb_attributes(data_dir: Path, basins: List[str] = []) -> pd.DataFrame: - """Load CAMELS GB attributes from the dataset provided by [#]_ - - Parameters - ---------- - data_dir : Path - Path to the CAMELS GB directory. 
This folder must contain an 'attributes' folder containing the corresponding - csv files for each attribute group (ending with _attributes.csv). - basins : List[str], optional - If passed, return only attributes for the basins specified in this list. Otherwise, the attributes of all basins - are returned. - - Returns - ------- - pd.DataFrame - Basin-indexed DataFrame, containing the attributes as columns. - - References - ---------- - .. [#] Coxon, G., Addor, N., Bloomfield, J. P., Freer, J., Fry, M., Hannaford, J., Howden, N. J. K., Lane, R., - Lewis, M., Robinson, E. L., Wagener, T., and Woods, R.: CAMELS-GB: Hydrometeorological time series and landscape - attributes for 671 catchments in Great Britain, Earth Syst. Sci. Data Discuss., - https://doi.org/10.5194/essd-2020-49, in review, 2020. - """ - attributes_path = Path(data_dir) / 'attributes' - - if not attributes_path.exists(): - raise RuntimeError(f"Attribute folder not found at {attributes_path}") - - txt_files = attributes_path.glob('*_attributes.csv') - - # Read-in attributes into one big dataframe - dfs = [] - for txt_file in txt_files: - df_temp = pd.read_csv(txt_file, sep=',', header=0, dtype={'gauge_id': str}) - df_temp = df_temp.set_index('gauge_id') - - dfs.append(df_temp) - - df = pd.concat(dfs, axis=1) - - if basins: - # drop rows of basins not contained in the passed list - drop_basins = [b for b in df.index if b not in basins] - df = df.drop(drop_basins, axis=0) - - return df - - -def load_camels_gb_timeseries(data_dir: Path, basin: str) -> pd.DataFrame: - """Load the time series data for one basin of the CAMELS GB data set. - - Parameters - ---------- - data_dir : Path - Path to the CAMELS GB directory. This folder must contain a folder called 'timeseries' containing the forcing - files for each basin as .csv file. The file names have to start with 'CAMELS_GB_hydromet_timeseries'. - basin : str - Basin identifier number as string. 
- - Returns - ------- - pd.DataFrame - Time-indexed DataFrame, containing the time series data (forcings + discharge) data. - """ - forcing_path = data_dir / 'timeseries' - if not forcing_path.is_dir(): - raise OSError(f"{forcing_path} does not exist") - - files = list(forcing_path.glob('**/CAMELS_GB_hydromet_timeseries*.csv')) - file_path = [f for f in files if f"_{basin}_" in f.name] - if file_path: - file_path = file_path[0] - else: - raise FileNotFoundError(f'No file for Basin {basin} at {file_path}') - - df = pd.read_csv(file_path, sep=',', header=0, dtype={'date': str}) - df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d") - df = df.set_index("date") - - return df - - -######################################################################################################################## -# General utility functions # -######################################################################################################################## - - -def load_hydroatlas_attributes(data_dir: Path, basins: List[str] = []) -> pd.DataFrame: - """Load HydroATLAS attributes into a pandas DataFrame - - Parameters - ---------- - data_dir : Path - Path to the root directory of the dataset. Must contain a folder called 'hydroatlas_attributes' with a file - called `attributes.csv`. The attributes file is expected to have one column called `basin_id`. - basins : List[str], optional - If passed, return only attributes for the basins specified in this list. Otherwise, the attributes of all basins - are returned. - - Returns - ------- - pd.DataFrame - Basin-indexed DataFrame containing the HydroATLAS attributes. 
- """ - attribute_file = data_dir / "hydroatlas_attributes" / "attributes.csv" - if not attribute_file.is_file(): - raise FileNotFoundError(attribute_file) - - df = pd.read_csv(attribute_file, dtype={'basin_id': str}) - df = df.set_index('basin_id') - - if basins: - drop_basins = [b for b in df.index if b not in basins] - df = df.drop(drop_basins, axis=0) - - return df - - -def load_basin_file(basin_file: Path) -> List[str]: - """Load list of basins from text file. - - Parameters - ---------- - basin_file : Path - Path to a basin txt file. File has to contain one basin id per row. - - Returns - ------- - List[str] - List of basin ids as strings. - """ - with basin_file.open('r') as fp: - basins = fp.readlines() - basins = sorted(basin.strip() for basin in basins) - return basins - - -def attributes_sanity_check(data_dir: Path, attribute_set: str, basins: List[str], attribute_list: List[str]): - """Utility function to check if the standard deviation of one (or more) attributes is zero. - - This utility function can be used to check if any attribute has a standard deviation of zero. This would lead to - NaN's, when normalizing the features and thus would lead to NaN's when training the model. The function will raise - a `RuntimeError` if one or more zeros have been detected and will print the list of corresponding attribute names - to the console. - - Parameters - ---------- - data_dir : Path - Path to the root directory of the data set - attribute_set : {'hydroatlas', 'camels_us', 'hourly_camels_us', 'camels_gb'} - Name of the attribute set to check. - basins : - List of basins to consider in the check. - attribute_list : - List of attribute names to consider in the check. - - Raises - ------ - ValueError - For an unknown 'attribute_set' - RuntimeError - If one or more attributes have a standard deviation of zero. 
- """ - if attribute_set == "hydroatlas": - df = load_hydroatlas_attributes(data_dir, basins) - elif attribute_set in ["camels_us", "hourly_camels_us"]: - df = load_camels_us_attributes(data_dir, basins) - elif attribute_set == "camels_gb": - df = load_camels_gb_attributes(data_dir, basins) - else: - raise ValueError(f"Unknown 'attribute_set' {attribute_set}") - drop_cols = [c for c in df.columns if c not in attribute_list] - df = df.drop(drop_cols, axis=1) - attributes = [] - if any(df.std() == 0.0) or any(df.std().isnull()): - for k, v in df.std().iteritems(): - if (v == 0) or (np.isnan(v)): - attributes.append(k) - if attributes: - msg = [ - "The following attributes have a std of zero or NaN, which results in NaN's ", - "when normalizing the features. Remove the attributes from the attribute feature list ", - "and restart the run. \n", f"Attributes: {attributes}" - ] - raise RuntimeError("".join(msg)) - - -def sort_frequencies(frequencies: List[str]) -> List[str]: - """Sort the passed frequencies from low to high frequencies. - - Use `pandas frequency strings - `_ - to define frequencies. Note: The strings need to include values, e.g., '1D' instead of 'D'. - - Parameters - ---------- - frequencies : List[str] - List of pandas frequency identifiers to be sorted. - - Returns - ------- - List[str] - Sorted list of pandas frequency identifiers. - """ - deltas = {freq: pd.to_timedelta(freq) for freq in frequencies} - return sorted(deltas, key=deltas.get)[::-1] - - -def infer_frequency(index: Union[pd.DatetimeIndex, np.ndarray]) -> str: - """Infer the frequency of an index of a pandas DataFrame/Series or xarray DataArray. - - Parameters - ---------- - index : Union[pd.DatetimeIndex, np.ndarray] - DatetimeIndex of a DataFrame/Series or array of datetime values. - - Returns - ------- - str - Frequency of the index as a `pandas frequency string - `_ - - Raises - ------ - ValueError - If the frequency cannot be inferred from the index or is zero. 
- """ - native_frequency = pd.infer_freq(index) - if native_frequency is None: - raise ValueError(f'Cannot infer a legal frequency from dataset: {native_frequency}.') - if native_frequency[0] not in '0123456789': # add a value to the unit so to_timedelta works - native_frequency = f'1{native_frequency}' - if pd.to_timedelta(native_frequency) == pd.to_timedelta(0): - raise ValueError('Inferred dataset frequency is zero.') - return native_frequency - - -def infer_datetime_coord(xr: Union[DataArray, Dataset]) -> str: - """Checks for coordinate with 'date' in its name and returns the name. - - Parameters - ---------- - xr : Union[DataArray, Dataset] - Array to infer coordinate name of. - - Returns - ------- - str - Name of datetime coordinate name. - - Raises - ------ - RuntimeError - If none or multiple coordinates with 'date' in its name are found. - """ - candidates = [c for c in list(xr.coords) if "date" in c] - if len(candidates) > 1: - raise RuntimeError("Found multiple coordinates with 'date' in its name.") - if not candidates: - raise RuntimeError("Did not find any coordinate with 'date' in its name") - - return candidates[0] diff --git a/neuralhydrology/data/__init__.py b/neuralhydrology/datasetzoo/__init__.py similarity index 92% rename from neuralhydrology/data/__init__.py rename to neuralhydrology/datasetzoo/__init__.py index 15f0d181..3ef0c1ea 100644 --- a/neuralhydrology/data/__init__.py +++ b/neuralhydrology/datasetzoo/__init__.py @@ -1,7 +1,7 @@ -from neuralhydrology.data.basedataset import BaseDataset -from neuralhydrology.data.camelsus import CamelsUS -from neuralhydrology.data.camelsgb import CamelsGB -from neuralhydrology.data.hourlycamelsus import HourlyCamelsUS +from neuralhydrology.datasetzoo.basedataset import BaseDataset +from neuralhydrology.datasetzoo.camelsus import CamelsUS +from neuralhydrology.datasetzoo.camelsgb import CamelsGB +from neuralhydrology.datasetzoo.hourlycamelsus import HourlyCamelsUS from neuralhydrology.utils.config import 
Config @@ -45,7 +45,7 @@ def get_dataset(cfg: Config, ------- BaseDataset A new data set instance, depending on the run configuration. - + Raises ------ NotImplementedError diff --git a/neuralhydrology/data/basedataset.py b/neuralhydrology/datasetzoo/basedataset.py similarity index 98% rename from neuralhydrology/data/basedataset.py rename to neuralhydrology/datasetzoo/basedataset.py index 651c7ea9..4be9d3a3 100644 --- a/neuralhydrology/data/basedataset.py +++ b/neuralhydrology/datasetzoo/basedataset.py @@ -13,7 +13,7 @@ from torch.utils.data import Dataset from tqdm import tqdm -from neuralhydrology.data import utils +from neuralhydrology.datautils import utils from neuralhydrology.utils.config import Config LOGGER = logging.getLogger(__name__) @@ -383,18 +383,16 @@ def _create_lookup_table(self, xr: xarray.Dataset): self.num_samples = len(self.lookup_table) def _load_hydroatlas_attributes(self): - if self.is_train: - # sanity check attributes for NaN in per-feature standard deviation - utils.attributes_sanity_check(data_dir=self.cfg.data_dir, - attribute_set="hydroatlas", - basins=self.basins, - attribute_list=self.cfg.hydroatlas_attributes) - df = utils.load_hydroatlas_attributes(self.cfg.data_dir, basins=self.basins) # remove all attributes not defined in the config drop_cols = [c for c in df.columns if c not in self.cfg.hydroatlas_attributes] df = df.drop(drop_cols, axis=1) + + if self.is_train: + # sanity check attributes for NaN in per-feature standard deviation + utils.attributes_sanity_check(df=df) + return df def _load_combined_attributes(self): diff --git a/neuralhydrology/data/camelsgb.py b/neuralhydrology/datasetzoo/camelsgb.py similarity index 51% rename from neuralhydrology/data/camelsgb.py rename to neuralhydrology/datasetzoo/camelsgb.py index 613298d5..7200835e 100644 --- a/neuralhydrology/data/camelsgb.py +++ b/neuralhydrology/datasetzoo/camelsgb.py @@ -1,10 +1,11 @@ +from pathlib import Path from typing import Dict, List, Union import pandas as 
pd import xarray -from neuralhydrology.data.basedataset import BaseDataset -from neuralhydrology.data import utils +from neuralhydrology.datasetzoo.basedataset import BaseDataset +from neuralhydrology.datautils import utils from neuralhydrology.utils.config import Config @@ -62,23 +63,104 @@ def __init__(self, def _load_basin_data(self, basin: str) -> pd.DataFrame: """Load input and output data from text files.""" - df = utils.load_camels_gb_timeseries(data_dir=self.cfg.data_dir, basin=basin) + df = load_camels_gb_timeseries(data_dir=self.cfg.data_dir, basin=basin) return df def _load_attributes(self) -> pd.DataFrame: if self.cfg.camels_attributes: - if self.is_train: - # sanity check attributes for NaN in per-feature standard deviation - utils.attributes_sanity_check(data_dir=self.cfg.data_dir, - attribute_set=self.cfg.dataset, - basins=self.basins, - attribute_list=self.cfg.camels_attributes) - df = utils.load_camels_gb_attributes(self.cfg.data_dir, basins=self.basins) + df = load_camels_gb_attributes(self.cfg.data_dir, basins=self.basins) # remove all attributes not defined in the config drop_cols = [c for c in df.columns if c not in self.cfg.camels_attributes] df = df.drop(drop_cols, axis=1) + if self.is_train: + # sanity check attributes for NaN in per-feature standard deviation + utils.attributes_sanity_check(df=df) + return df + + +def load_camels_gb_attributes(data_dir: Path, basins: List[str] = []) -> pd.DataFrame: + """Load CAMELS GB attributes from the dataset provided by [#]_ + + Parameters + ---------- + data_dir : Path + Path to the CAMELS GB directory. This folder must contain an 'attributes' folder containing the corresponding + csv files for each attribute group (ending with _attributes.csv). + basins : List[str], optional + If passed, return only attributes for the basins specified in this list. Otherwise, the attributes of all basins + are returned. 
+ + Returns + ------- + pd.DataFrame + Basin-indexed DataFrame, containing the attributes as columns. + + References + ---------- + .. [#] Coxon, G., Addor, N., Bloomfield, J. P., Freer, J., Fry, M., Hannaford, J., Howden, N. J. K., Lane, R., + Lewis, M., Robinson, E. L., Wagener, T., and Woods, R.: CAMELS-GB: Hydrometeorological time series and landscape + attributes for 671 catchments in Great Britain, Earth Syst. Sci. Data Discuss., + https://doi.org/10.5194/essd-2020-49, in review, 2020. + """ + attributes_path = Path(data_dir) / 'attributes' + + if not attributes_path.exists(): + raise RuntimeError(f"Attribute folder not found at {attributes_path}") + + txt_files = attributes_path.glob('*_attributes.csv') + + # Read-in attributes into one big dataframe + dfs = [] + for txt_file in txt_files: + df_temp = pd.read_csv(txt_file, sep=',', header=0, dtype={'gauge_id': str}) + df_temp = df_temp.set_index('gauge_id') + + dfs.append(df_temp) + + df = pd.concat(dfs, axis=1) + + if basins: + # drop rows of basins not contained in the passed list + drop_basins = [b for b in df.index if b not in basins] + df = df.drop(drop_basins, axis=0) + + return df + + +def load_camels_gb_timeseries(data_dir: Path, basin: str) -> pd.DataFrame: + """Load the time series data for one basin of the CAMELS GB data set. + + Parameters + ---------- + data_dir : Path + Path to the CAMELS GB directory. This folder must contain a folder called 'timeseries' containing the forcing + files for each basin as .csv file. The file names have to start with 'CAMELS_GB_hydromet_timeseries'. + basin : str + Basin identifier number as string. + + Returns + ------- + pd.DataFrame + Time-indexed DataFrame, containing the time series data (forcings + discharge) data. 
+ """ + forcing_path = data_dir / 'timeseries' + if not forcing_path.is_dir(): + raise OSError(f"{forcing_path} does not exist") + + files = list(forcing_path.glob('**/CAMELS_GB_hydromet_timeseries*.csv')) + file_path = [f for f in files if f"_{basin}_" in f.name] + if file_path: + file_path = file_path[0] + else: + raise FileNotFoundError(f'No file for Basin {basin} at {file_path}') + + df = pd.read_csv(file_path, sep=',', header=0, dtype={'date': str}) + df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d") + df = df.set_index("date") + + return df diff --git a/neuralhydrology/datasetzoo/camelsus.py b/neuralhydrology/datasetzoo/camelsus.py new file mode 100644 index 00000000..44cfe770 --- /dev/null +++ b/neuralhydrology/datasetzoo/camelsus.py @@ -0,0 +1,240 @@ +from pathlib import Path +from typing import Dict, List, Tuple, Union + +import numpy as np +import pandas as pd +import xarray + +from neuralhydrology.datasetzoo.basedataset import BaseDataset +from neuralhydrology.datautils import utils +from neuralhydrology.utils.config import Config + + +class CamelsUS(BaseDataset): + """Data set class for the CAMELS US data set by [#]_ and [#]_. + + Parameters + ---------- + cfg : Config + The run configuration. + is_train : bool + Defines if the dataset is used for training or evaluating. If True (training), means/stds for each feature + are computed and stored to the run directory. If one-hot encoding is used, the mapping for the one-hot encoding + is created and also stored to disk. If False, a `scaler` input is expected and similarly the `id_to_int` input + if one-hot encoding is used. + period : {'train', 'validation', 'test'} + Defines the period for which the data will be loaded + basin : str, optional + If passed, the data for only this basin will be loaded. Otherwise the basin(s) are read from the appropriate + basin file, corresponding to the `period`. 
+ additional_features : List[Dict[str, pd.DataFrame]], optional + List of dictionaries, mapping from a basin id to a pandas DataFrame. This DataFrame will be added to the data + loaded from the dataset and all columns are available as 'dynamic_inputs', 'static_inputs' and + 'target_variables' + id_to_int : Dict[str, int], optional + If the config argument 'use_basin_id_encoding' is True in the config and period is either 'validation' or + 'test', this input is required. It is a dictionary, mapping from basin id to an integer (the one-hot encoding). + scaler : Dict[str, Union[pd.Series, xarray.DataArray]], optional + If period is either 'validation' or 'test', this input is required. It contains the means and standard + deviations for each feature and is stored to the run directory during training (train_data/train_data_scaler.p) + + References + ---------- + .. [#] A. J. Newman, M. P. Clark, K. Sampson, A. Wood, L. E. Hay, A. Bock, R. J. Viger, D. Blodgett, + L. Brekke, J. R. Arnold, T. Hopson, and Q. Duan: Development of a large-sample watershed-scale + hydrometeorological dataset for the contiguous USA: dataset characteristics and assessment of regional + variability in hydrologic model performance. Hydrol. Earth Syst. Sci., 19, 209-223, + doi:10.5194/hess-19-209-2015, 2015 + .. [#] Addor, N., Newman, A. J., Mizukami, N. and Clark, M. P.: The CAMELS data set: catchment attributes and + meteorology for large-sample studies, Hydrol. Earth Syst. Sci., 21, 5293-5313, doi:10.5194/hess-21-5293-2017, + 2017. 
+ """ + + def __init__(self, + cfg: Config, + is_train: bool, + period: str, + basin: str = None, + additional_features: List[Dict[str, pd.DataFrame]] = [], + id_to_int: Dict[str, int] = {}, + scaler: Dict[str, Union[pd.Series, xarray.DataArray]] = {}): + super(CamelsUS, self).__init__(cfg=cfg, + is_train=is_train, + period=period, + basin=basin, + additional_features=additional_features, + id_to_int=id_to_int, + scaler=scaler) + + def _load_basin_data(self, basin: str) -> pd.DataFrame: + """Load input and output data from text files.""" + # get forcings + dfs = [] + for forcing in self.cfg.forcings: + df, area = load_camels_us_forcings(self.cfg.data_dir, basin, forcing) + + # rename columns + if len(self.cfg.forcings) > 1: + df = df.rename(columns={col: f"{col}_{forcing}" for col in df.columns}) + dfs.append(df) + df = pd.concat(dfs, axis=1) + + # add discharge + df['QObs(mm/d)'] = load_camels_us_discharge(self.cfg.data_dir, basin, area) + + # replace invalid discharge values by NaNs + qobs_cols = [col for col in df.columns if "qobs" in col.lower()] + for col in qobs_cols: + df.loc[df[col] < 0, col] = np.nan + + return df + + def _load_attributes(self) -> pd.DataFrame: + if self.cfg.camels_attributes: + + df = load_camels_us_attributes(self.cfg.data_dir, basins=self.basins) + + # remove all attributes not defined in the config + drop_cols = [c for c in df.columns if c not in self.cfg.camels_attributes] + df = df.drop(drop_cols, axis=1) + + if self.is_train: + # sanity check attributes for NaN in per-feature standard deviation + utils.attributes_sanity_check(df=df) + + return df + + +def load_camels_us_attributes(data_dir: Path, basins: List[str] = []) -> pd.DataFrame: + """Load CAMELS US attributes from the dataset provided by [#]_ + + Parameters + ---------- + data_dir : Path + Path to the CAMELS US directory. This folder must contain a 'camels_attributes_v2.0' folder (the original + data set) containing the corresponding txt files for each attribute group. 
+ basins : List[str], optional + If passed, return only attributes for the basins specified in this list. Otherwise, the attributes of all basins + are returned. + + Returns + ------- + pandas.DataFrame + Basin-indexed DataFrame, containing the attributes as columns. + + References + ---------- + .. [#] Addor, N., Newman, A. J., Mizukami, N. and Clark, M. P.: The CAMELS data set: catchment attributes and + meteorology for large-sample studies, Hydrol. Earth Syst. Sci., 21, 5293-5313, doi:10.5194/hess-21-5293-2017, + 2017. + """ + attributes_path = Path(data_dir) / 'camels_attributes_v2.0' + + if not attributes_path.exists(): + raise RuntimeError(f"Attribute folder not found at {attributes_path}") + + txt_files = attributes_path.glob('camels_*.txt') + + # Read-in attributes into one big dataframe + dfs = [] + for txt_file in txt_files: + df_temp = pd.read_csv(txt_file, sep=';', header=0, dtype={'gauge_id': str}) + df_temp = df_temp.set_index('gauge_id') + + dfs.append(df_temp) + + df = pd.concat(dfs, axis=1) + # convert huc column to double digit strings + df['huc'] = df['huc_02'].apply(lambda x: str(x).zfill(2)) + df = df.drop('huc_02', axis=1) + + if basins: + # drop rows of basins not contained in the passed list + drop_basins = [b for b in df.index if b not in basins] + df = df.drop(drop_basins, axis=0) + + return df + + +def load_camels_us_forcings(data_dir: Path, basin: str, forcings: str) -> Tuple[pd.DataFrame, int]: + """Load the forcing data for a basin of the CAMELS US data set. + + Parameters + ---------- + data_dir : Path + Path to the CAMELS US directory. This folder must contain a 'basin_mean_forcing' folder containing one + subdirectory for each forcing. The forcing directories have to contain 18 subdirectories (for the 18 HUCS) as in + the original CAMELS data set. In each HUC folder are the forcing files (.txt), starting with the 8-digit basin + id. + basin : str + 8-digit USGS identifier of the basin. + forcings : str + Can be e.g. 
'daymet' or 'nldas', etc. Must match the folder names in the 'basin_mean_forcing' directory. + + Returns + ------- + pd.DataFrame + Time-indexed DataFrame, containing the forcing data. + int + Catchment area (m2), specified in the header of the forcing file. + """ + forcing_path = data_dir / 'basin_mean_forcing' / forcings + if not forcing_path.is_dir(): + raise OSError(f"{forcing_path} does not exist") + + files = list(forcing_path.glob('**/*_forcing_leap.txt')) + file_path = [f for f in files if f.name[:8] == basin] + if file_path: + file_path = file_path[0] + else: + raise FileNotFoundError(f'No file for Basin {basin} at {file_path}') + + df = pd.read_csv(file_path, sep='\s+', header=3) + df["date"] = pd.to_datetime(df.Year.map(str) + "/" + df.Mnth.map(str) + "/" + df.Day.map(str), format="%Y/%m/%d") + df = df.set_index("date") + + # load area from header + with open(file_path, 'r') as fp: + content = fp.readlines() + area = int(content[2]) + + return df, area + + +def load_camels_us_discharge(data_dir: Path, basin: str, area: int) -> pd.Series: + """Load the discharge data for a basin of the CAMELS US data set. + + Parameters + ---------- + data_dir : Path + Path to the CAMELS US directory. This folder must contain a 'usgs_streamflow' folder with 18 + subdirectories (for the 18 HUCS) as in the original CAMELS data set. In each HUC folder are the discharge files + (.txt), starting with the 8-digit basin id. + basin : str + 8-digit USGS identifier of the basin. + area : int + Catchment area (m2), used to normalize the discharge. 
+ + Returns + ------- + pd.Series + Time-index pandas.Series of the discharge values (mm/day) + """ + + discharge_path = data_dir / 'usgs_streamflow' + files = list(discharge_path.glob('**/*_streamflow_qc.txt')) + file_path = [f for f in files if f.name[:8] == basin] + if file_path: + file_path = file_path[0] + else: + raise FileNotFoundError(f'No file for Basin {basin} at {file_path}') + + col_names = ['basin', 'Year', 'Mnth', 'Day', 'QObs', 'flag'] + df = pd.read_csv(file_path, sep='\s+', header=None, names=col_names) + df["date"] = pd.to_datetime(df.Year.map(str) + "/" + df.Mnth.map(str) + "/" + df.Day.map(str), format="%Y/%m/%d") + df = df.set_index("date") + + # normalize discharge from cubic feed per second to mm per day + df.QObs = 28316846.592 * df.QObs * 86400 / (area * 10**6) + + return df.QObs diff --git a/neuralhydrology/data/hourlycamelsus.py b/neuralhydrology/datasetzoo/hourlycamelsus.py similarity index 54% rename from neuralhydrology/data/hourlycamelsus.py rename to neuralhydrology/datasetzoo/hourlycamelsus.py index c02dcfa8..0cd4d8d5 100644 --- a/neuralhydrology/data/hourlycamelsus.py +++ b/neuralhydrology/datasetzoo/hourlycamelsus.py @@ -1,10 +1,12 @@ import logging import pickle +from pathlib import Path import numpy as np import pandas as pd +import xarray -from neuralhydrology.data import utils, CamelsUS +from neuralhydrology.datasetzoo.camelsus import CamelsUS, load_camels_us_forcings, load_camels_us_attributes from neuralhydrology.utils.config import Config LOGGER = logging.getLogger(__name__) @@ -71,7 +73,7 @@ def _load_basin_data(self, basin: str) -> pd.DataFrame: df = self.load_hourly_data(basin, forcing) else: # load daily CAMELS forcings and upsample to hourly - df, _ = utils.load_camels_us_forcings(self.cfg.data_dir, basin, forcing) + df, _ = load_camels_us_forcings(self.cfg.data_dir, basin, forcing) df = df.resample('1H').ffill() if len(self.cfg.forcings) > 1: # rename columns @@ -86,12 +88,12 @@ def _load_basin_data(self, basin: str) 
-> pd.DataFrame: # add stage, if requested if 'gauge_height_m' in self.cfg.target_variables: - df = df.join(utils.load_hourly_us_stage(self.cfg.data_dir, basin)) + df = df.join(load_hourly_us_stage(self.cfg.data_dir, basin)) df.loc[df['gauge_height_m'] < 0, 'gauge_height_m'] = np.nan # convert discharge to 'synthetic' stage, if requested if 'synthetic_qobs_stage_meters' in self.cfg.target_variables: - attributes = utils.load_camels_us_attributes(data_dir=self.cfg.data_dir, basins=[basin]) + attributes = load_camels_us_attributes(data_dir=self.cfg.data_dir, basins=[basin]) with open(self.cfg.rating_curve_file, 'rb') as f: rating_curves = pickle.load(f) df['synthetic_qobs_stage_meters'] = np.nan @@ -119,7 +121,7 @@ def load_hourly_data(self, basin: str, forcings: str) -> pd.DataFrame: fallback_csv = False try: if self._netcdf_dataset is None: - self._netcdf_dataset = utils.load_hourly_us_netcdf(self.cfg.data_dir, forcings) + self._netcdf_dataset = load_hourly_us_netcdf(self.cfg.data_dir, forcings) df = self._netcdf_dataset.sel(basin=basin).to_dataframe() except FileNotFoundError: fallback_csv = True @@ -130,9 +132,130 @@ def load_hourly_data(self, basin: str, forcings: str) -> pd.DataFrame: fallback_csv = True LOGGER.warning(f'## Warning: NetCDF file does not contain data for {basin}. Trying slower csv files.') if fallback_csv: - df = utils.load_hourly_us_forcings(self.cfg.data_dir, basin, forcings) + df = load_hourly_us_forcings(self.cfg.data_dir, basin, forcings) # add discharge - df = df.join(utils.load_hourly_us_discharge(self.cfg.data_dir, basin)) + df = df.join(load_hourly_us_discharge(self.cfg.data_dir, basin)) return df + + +def load_hourly_us_forcings(data_dir: Path, basin: str, forcings: str) -> pd.DataFrame: + """Load the hourly forcing data for a basin of the CAMELS US data set. + + The hourly forcings are not included in the original data set by Newman et al. (2017). + + Parameters + ---------- + data_dir : Path + Path to the CAMELS US directory. 
This folder must contain an 'hourly' folder containing one subdirectory + for each forcing, which contains the forcing files (.csv) for each basin. Files have to contain the 8-digit + basin id. + basin : str + 8-digit USGS identifier of the basin. + forcings : str + Must match the folder names in the 'hourly' directory. E.g. 'nldas_hourly' + + Returns + ------- + pd.DataFrame + Time-indexed DataFrame, containing the forcing data. + """ + forcing_path = data_dir / 'hourly' / forcings + if not forcing_path.is_dir(): + raise OSError(f"{forcing_path} does not exist") + + files = list(forcing_path.glob('*.csv')) + file_path = [f for f in files if basin in f.stem] + if file_path: + file_path = file_path[0] + else: + raise FileNotFoundError(f'No file for Basin {basin} at {forcing_path}') + + return pd.read_csv(file_path, index_col=['date'], parse_dates=['date']) + + +def load_hourly_us_discharge(data_dir: Path, basin: str) -> pd.DataFrame: + """Load the hourly discharge data for a basin of the CAMELS US data set. + + Parameters + ---------- + data_dir : Path + Path to the CAMELS US directory. This folder must contain a folder called 'hourly' with a subdirectory + 'usgs_streamflow' which contains the discharge files (.csv) for each basin. File names must contain the 8-digit + basin id. + basin : str + 8-digit USGS identifier of the basin. + + Returns + ------- + pd.Series + Time-index Series of the discharge values (mm/hour) + """ + discharge_path = data_dir / 'hourly' / 'usgs_streamflow' + files = list(discharge_path.glob('**/*usgs-hourly.csv')) + file_path = [f for f in files if basin in f.stem] + if file_path: + file_path = file_path[0] + else: + raise FileNotFoundError(f'No file for Basin {basin} at {discharge_path}') + + return pd.read_csv(file_path, index_col=['date'], parse_dates=['date']) + + +def load_hourly_us_stage(data_dir: Path, basin: str) -> pd.Series: + """Load the hourly stage data for a basin of the CAMELS US data set. 
+ + Parameters + ---------- + data_dir : Path + Path to the CAMELS US directory. This folder must contain a folder called 'hourly' with a subdirectory + 'usgs_stage' which contains the stage files (.csv) for each basin. File names must contain the 8-digit basin id. + basin : str + 8-digit USGS identifier of the basin. + + Returns + ------- + pd.Series + Time-index Series of the stage values (m) + """ + stage_path = data_dir / 'hourly' / 'usgs_stage' + files = list(stage_path.glob('**/*_utc.csv')) + file_path = [f for f in files if basin in f.stem] + if file_path: + file_path = file_path[0] + else: + raise FileNotFoundError(f'No file for Basin {basin} at {stage_path}') + + df = pd.read_csv(file_path, + sep=',', + index_col=['datetime'], + parse_dates=['datetime'], + usecols=['datetime', 'gauge_height_ft']) + df = df.resample('H').mean() + df["gauge_height_m"] = df["gauge_height_ft"] * 0.3048 + + return df["gauge_height_m"] + + +def load_hourly_us_netcdf(data_dir: Path, forcings: str) -> xarray.Dataset: + """Load hourly forcing and discharge data from preprocessed netCDF file. + + Parameters + ---------- + data_dir : Path + Path to the CAMELS US directory. This folder must contain a folder called 'hourly', containing the netCDF file. + forcings : str + Name of the forcing product. Must match the ending of the netCDF file. E.g. 
'nldas_hourly' for + 'usgs-streamflow-nldas_hourly.nc' + + Returns + ------- + xarray.Dataset + Dataset containing the combined discharge and forcing data of all basins (as stored in the netCDF) + """ + netcdf_path = data_dir / 'hourly' / f'usgs-streamflow-{forcings}.nc' + if not netcdf_path.is_file(): + raise FileNotFoundError(f'No NetCDF file for hourly streamflow and {forcings} at {netcdf_path}.') + + return xarray.open_dataset(netcdf_path) diff --git a/neuralhydrology/datautils/__init__.py b/neuralhydrology/datautils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/neuralhydrology/data/climateindices.py b/neuralhydrology/datautils/climateindices.py similarity index 99% rename from neuralhydrology/data/climateindices.py rename to neuralhydrology/datautils/climateindices.py index d564c818..05a36b0d 100644 --- a/neuralhydrology/data/climateindices.py +++ b/neuralhydrology/datautils/climateindices.py @@ -9,7 +9,7 @@ from numba import njit from tqdm import tqdm -from neuralhydrology.data import pet, utils +from neuralhydrology.datautils import pet, utils LOGGER = logging.getLogger(__name__) diff --git a/neuralhydrology/data/dischargeinput.py b/neuralhydrology/datautils/dischargeinput.py similarity index 98% rename from neuralhydrology/data/dischargeinput.py rename to neuralhydrology/datautils/dischargeinput.py index f8b97474..e467c995 100644 --- a/neuralhydrology/data/dischargeinput.py +++ b/neuralhydrology/datautils/dischargeinput.py @@ -7,7 +7,7 @@ import pandas as pd from tqdm import tqdm -from neuralhydrology.data import utils +from neuralhydrology.datautils import utils LOGGER = logging.getLogger(__name__) diff --git a/neuralhydrology/data/pet.py b/neuralhydrology/datautils/pet.py similarity index 100% rename from neuralhydrology/data/pet.py rename to neuralhydrology/datautils/pet.py diff --git a/neuralhydrology/datautils/utils.py b/neuralhydrology/datautils/utils.py new file mode 100644 index 00000000..e40134fe --- /dev/null +++ 
b/neuralhydrology/datautils/utils.py @@ -0,0 +1,167 @@ +from pathlib import Path +from typing import List, Union + +import numpy as np +import pandas as pd +from xarray.core.dataarray import DataArray +from xarray.core.dataset import Dataset + + +def load_hydroatlas_attributes(data_dir: Path, basins: List[str] = []) -> pd.DataFrame: + """Load HydroATLAS attributes into a pandas DataFrame + + Parameters + ---------- + data_dir : Path + Path to the root directory of the dataset. Must contain a folder called 'hydroatlas_attributes' with a file + called `attributes.csv`. The attributes file is expected to have one column called `basin_id`. + basins : List[str], optional + If passed, return only attributes for the basins specified in this list. Otherwise, the attributes of all basins + are returned. + + Returns + ------- + pd.DataFrame + Basin-indexed DataFrame containing the HydroATLAS attributes. + """ + attribute_file = data_dir / "hydroatlas_attributes" / "attributes.csv" + if not attribute_file.is_file(): + raise FileNotFoundError(attribute_file) + + df = pd.read_csv(attribute_file, dtype={'basin_id': str}) + df = df.set_index('basin_id') + + if basins: + drop_basins = [b for b in df.index if b not in basins] + df = df.drop(drop_basins, axis=0) + + return df + + +def load_basin_file(basin_file: Path) -> List[str]: + """Load list of basins from text file. + + Parameters + ---------- + basin_file : Path + Path to a basin txt file. File has to contain one basin id per row. + + Returns + ------- + List[str] + List of basin ids as strings. + """ + with basin_file.open('r') as fp: + basins = fp.readlines() + basins = sorted(basin.strip() for basin in basins) + return basins + + +def attributes_sanity_check(df: pd.DataFrame): + """Utility function to check if the standard deviation of one (or more) attributes is zero. + + This utility function can be used to check if any attribute has a standard deviation of zero. 
This would lead to + NaN's, when normalizing the features and thus would lead to NaN's when training the model. The function will raise + a `RuntimeError` if one or more zeros have been detected and will print the list of corresponding attribute names + to the console. + + Parameters + ---------- + df : pd.DataFrame + DataFrame of catchment attributes as columns. + + Raises + ------ + RuntimeError + If one or more attributes have a standard deviation of zero. + """ + # Iterate over attributes and check for NaNs + attributes = [] + if any(df.std() == 0.0) or any(df.std().isnull()): + for k, v in df.std().iteritems(): + if (v == 0) or (np.isnan(v)): + attributes.append(k) + if attributes: + msg = [ + "The following attributes have a std of zero or NaN, which results in NaN's ", + "when normalizing the features. Remove the attributes from the attribute feature list ", + "and restart the run. \n", f"Attributes: {attributes}" + ] + raise RuntimeError("".join(msg)) + + +def sort_frequencies(frequencies: List[str]) -> List[str]: + """Sort the passed frequencies from low to high frequencies. + + Use `pandas frequency strings + `_ + to define frequencies. Note: The strings need to include values, e.g., '1D' instead of 'D'. + + Parameters + ---------- + frequencies : List[str] + List of pandas frequency identifiers to be sorted. + + Returns + ------- + List[str] + Sorted list of pandas frequency identifiers. + """ + deltas = {freq: pd.to_timedelta(freq) for freq in frequencies} + return sorted(deltas, key=deltas.get)[::-1] + + +def infer_frequency(index: Union[pd.DatetimeIndex, np.ndarray]) -> str: + """Infer the frequency of an index of a pandas DataFrame/Series or xarray DataArray. + + Parameters + ---------- + index : Union[pd.DatetimeIndex, np.ndarray] + DatetimeIndex of a DataFrame/Series or array of datetime values. 
+ + Returns + ------- + str + Frequency of the index as a `pandas frequency string + `_ + + Raises + ------ + ValueError + If the frequency cannot be inferred from the index or is zero. + """ + native_frequency = pd.infer_freq(index) + if native_frequency is None: + raise ValueError(f'Cannot infer a legal frequency from dataset: {native_frequency}.') + if native_frequency[0] not in '0123456789': # add a value to the unit so to_timedelta works + native_frequency = f'1{native_frequency}' + if pd.to_timedelta(native_frequency) == pd.to_timedelta(0): + raise ValueError('Inferred dataset frequency is zero.') + return native_frequency + + +def infer_datetime_coord(xr: Union[DataArray, Dataset]) -> str: + """Checks for coordinate with 'date' in its name and returns the name. + + Parameters + ---------- + xr : Union[DataArray, Dataset] + Array to infer coordinate name of. + + Returns + ------- + str + Name of datetime coordinate name. + + Raises + ------ + RuntimeError + If none or multiple coordinates with 'date' in its name are found. 
+ """ + candidates = [c for c in list(xr.coords) if "date" in c] + if len(candidates) > 1: + raise RuntimeError("Found multiple coordinates with 'date' in its name.") + if not candidates: + raise RuntimeError("Did not find any coordinate with 'date' in its name") + + return candidates[0] diff --git a/neuralhydrology/evaluation/metrics.py b/neuralhydrology/evaluation/metrics.py index 32e62944..2a7c9e36 100644 --- a/neuralhydrology/evaluation/metrics.py +++ b/neuralhydrology/evaluation/metrics.py @@ -5,7 +5,7 @@ from scipy import stats, signal from xarray.core.dataarray import DataArray -from neuralhydrology.data import utils +from neuralhydrology.datautils import utils def get_available_metrics() -> List[str]: diff --git a/neuralhydrology/evaluation/signatures.py b/neuralhydrology/evaluation/signatures.py index dcc1e726..b282afee 100644 --- a/neuralhydrology/evaluation/signatures.py +++ b/neuralhydrology/evaluation/signatures.py @@ -8,7 +8,7 @@ from numba import njit from xarray.core.dataarray import DataArray -from neuralhydrology.data import utils +from neuralhydrology.datautils import utils def get_available_signatures() -> List[str]: diff --git a/neuralhydrology/evaluation/tester.py b/neuralhydrology/evaluation/tester.py index c6528192..6e25c505 100644 --- a/neuralhydrology/evaluation/tester.py +++ b/neuralhydrology/evaluation/tester.py @@ -14,9 +14,9 @@ from torch.utils.data import DataLoader from tqdm import tqdm -from neuralhydrology.data import get_dataset -from neuralhydrology.data.basedataset import BaseDataset -from neuralhydrology.data.utils import load_basin_file, sort_frequencies +from neuralhydrology.datasetzoo import get_dataset +from neuralhydrology.datasetzoo.basedataset import BaseDataset +from neuralhydrology.datautils.utils import load_basin_file, sort_frequencies from neuralhydrology.evaluation import plots from neuralhydrology.evaluation.metrics import calculate_metrics from neuralhydrology.modelzoo import get_model diff --git 
a/neuralhydrology/modelzoo/mtslstm.py b/neuralhydrology/modelzoo/mtslstm.py index 68e650da..d2fd9587 100644 --- a/neuralhydrology/modelzoo/mtslstm.py +++ b/neuralhydrology/modelzoo/mtslstm.py @@ -5,7 +5,7 @@ import torch import torch.nn as nn -from neuralhydrology.data.utils import sort_frequencies +from neuralhydrology.datautils.utils import sort_frequencies from neuralhydrology.modelzoo.head import get_head from neuralhydrology.modelzoo.basemodel import BaseModel from neuralhydrology.utils.config import Config diff --git a/neuralhydrology/modelzoo/odelstm.py b/neuralhydrology/modelzoo/odelstm.py index 63bd1979..248dc6be 100644 --- a/neuralhydrology/modelzoo/odelstm.py +++ b/neuralhydrology/modelzoo/odelstm.py @@ -6,7 +6,7 @@ import torch import torch.nn as nn -from neuralhydrology.data.utils import sort_frequencies +from neuralhydrology.datautils.utils import sort_frequencies from neuralhydrology.modelzoo.basemodel import BaseModel from neuralhydrology.modelzoo.head import get_head from neuralhydrology.modelzoo.lstm import _LSTMCell diff --git a/neuralhydrology/training/basetrainer.py b/neuralhydrology/training/basetrainer.py index 6cdb1ee6..053c64c7 100644 --- a/neuralhydrology/training/basetrainer.py +++ b/neuralhydrology/training/basetrainer.py @@ -11,9 +11,9 @@ from tqdm import tqdm import neuralhydrology.training.loss as loss -from neuralhydrology.data import get_dataset -from neuralhydrology.data.basedataset import BaseDataset -from neuralhydrology.data.utils import load_basin_file +from neuralhydrology.datasetzoo import get_dataset +from neuralhydrology.datasetzoo.basedataset import BaseDataset +from neuralhydrology.datautils.utils import load_basin_file from neuralhydrology.evaluation import get_tester from neuralhydrology.evaluation.tester import BaseTester from neuralhydrology.modelzoo import get_model diff --git a/neuralhydrology/training/regularization.py b/neuralhydrology/training/regularization.py index e39485a3..33b7f408 100644 --- 
a/neuralhydrology/training/regularization.py +++ b/neuralhydrology/training/regularization.py @@ -3,7 +3,7 @@ import pandas as pd import torch -from neuralhydrology.data.utils import sort_frequencies +from neuralhydrology.datautils.utils import sort_frequencies from neuralhydrology.utils.config import Config diff --git a/neuralhydrology/utils/nh_results_ensemble.py b/neuralhydrology/utils/nh_results_ensemble.py index 255a0089..7abd79fa 100644 --- a/neuralhydrology/utils/nh_results_ensemble.py +++ b/neuralhydrology/utils/nh_results_ensemble.py @@ -12,7 +12,7 @@ from tqdm import tqdm sys.path.append(str(Path(__file__).parent.parent.parent)) -from neuralhydrology.data.utils import sort_frequencies +from neuralhydrology.datautils.utils import sort_frequencies from neuralhydrology.evaluation.metrics import calculate_metrics from neuralhydrology.utils.config import Config diff --git a/test/test_config_runs.py b/test/test_config_runs.py index 6eb760fb..12c4aeea 100644 --- a/test/test_config_runs.py +++ b/test/test_config_runs.py @@ -8,7 +8,7 @@ import pytest from pytest import approx -from neuralhydrology.data.utils import load_camels_us_forcings, load_camels_us_discharge, load_hourly_us_netcdf +from neuralhydrology.datasetzoo import camelsus, hourlycamelsus from neuralhydrology.evaluation.evaluate import start_evaluation from neuralhydrology.training.train import start_training from neuralhydrology.utils.config import Config @@ -108,7 +108,7 @@ def test_multi_timescale_regression(get_config: Fixture[Callable[[str], dict]], start_evaluation(cfg=config, run_dir=config.run_dir, epoch=1, period='test') results = _get_basin_results(config.run_dir, 1)[basin] - discharge = load_hourly_us_netcdf(config.data_dir, config.forcings[0]) \ + discharge = hourlycamelsus.load_hourly_us_netcdf(config.data_dir, config.forcings[0]) \ .sel(basin=basin, date=slice(test_start_date, test_end_date))['qobs_mm_per_hour'] hourly_results = results['1H']['xr'].to_dataframe().reset_index() @@ -147,7 
+147,7 @@ def _get_basin_results(run_dir: Path, epoch: int) -> Dict: def _get_discharge(config: Config, basin: str) -> pd.Series: if config.dataset == 'camels_us': - _, area = load_camels_us_forcings(config.data_dir, basin, 'daymet') - return load_camels_us_discharge(config.data_dir, basin, area) + _, area = camelsus.load_camels_us_forcings(config.data_dir, basin, 'daymet') + return camelsus.load_camels_us_discharge(config.data_dir, basin, area) else: raise NotImplementedError From ab6f8e8509f3c6b63553683e24b395911b2082dd Mon Sep 17 00:00:00 2001 From: Martin Gauch <15731649+gauchm@users.noreply.github.com> Date: Wed, 7 Oct 2020 13:16:10 +0200 Subject: [PATCH 12/15] Add docs action (#15) Closes #14 --- .github/workflows/docs-ci.yml | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 .github/workflows/docs-ci.yml diff --git a/.github/workflows/docs-ci.yml b/.github/workflows/docs-ci.yml new file mode 100644 index 00000000..d99540e3 --- /dev/null +++ b/.github/workflows/docs-ci.yml @@ -0,0 +1,25 @@ +name: "docs check" +on: + pull_request: + branches: + - master + - public + +jobs: + docs: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + with: + python-version: '3.8' + - name: Install pandoc + run: | + sudo apt-get update -y && sudo apt-get install -y pandoc + - name: Install dependencies + run: | + pip install sphinx sphinx-rtd-theme nbsphinx nbsphinx-link + - name: Build Sphinx docs + working-directory: docs/ + run: | + make html From 9d669add48fa8d8989f2b1d24b7a8f85c3ca4812 Mon Sep 17 00:00:00 2001 From: Frederik Kratzert Date: Wed, 7 Oct 2020 13:23:42 +0200 Subject: [PATCH 13/15] Increment version number --- neuralhydrology/__about__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neuralhydrology/__about__.py b/neuralhydrology/__about__.py index 3f013dc7..bbd0a5bf 100644 --- a/neuralhydrology/__about__.py +++ b/neuralhydrology/__about__.py @@ -1 +1 @@ 
-__version__ = "0.9.1-beta" +__version__ = "0.9.2-beta" From ff58e290bbabd582d117f3a50f3b1c34b022eff1 Mon Sep 17 00:00:00 2001 From: Martin Gauch <15731649+gauchm@users.noreply.github.com> Date: Wed, 7 Oct 2020 16:17:00 +0200 Subject: [PATCH 14/15] MTS-LSTM: fix shared_mtslstm config argument (#17) * MTS-LSTM: fix shared_mtslstm config argument With the change from per_frequency_lstm, the semantics of the config argument got reversed, but we didn't reflect this in the code. * bump version --- neuralhydrology/__about__.py | 2 +- neuralhydrology/modelzoo/mtslstm.py | 16 ++++++++-------- neuralhydrology/utils/config.py | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/neuralhydrology/__about__.py b/neuralhydrology/__about__.py index bbd0a5bf..363306ca 100644 --- a/neuralhydrology/__about__.py +++ b/neuralhydrology/__about__.py @@ -1 +1 @@ -__version__ = "0.9.2-beta" +__version__ = "0.9.3-beta" diff --git a/neuralhydrology/modelzoo/mtslstm.py b/neuralhydrology/modelzoo/mtslstm.py index d2fd9587..80b4829b 100644 --- a/neuralhydrology/modelzoo/mtslstm.py +++ b/neuralhydrology/modelzoo/mtslstm.py @@ -64,8 +64,8 @@ def __init__(self, cfg: Config): # start to count the number of inputs input_sizes = len(cfg.camels_attributes + cfg.hydroatlas_attributes + cfg.static_inputs) - # if not is_shared_mtslstm, the LSTM gets an additional frequency flag as input. - if not self._is_shared_mtslstm: + # if is_shared_mtslstm, the LSTM gets an additional frequency flag as input. 
+ if self._is_shared_mtslstm: input_sizes += len(self._frequencies) if cfg.use_basin_id_encoding: @@ -76,8 +76,8 @@ def __init__(self, cfg: Config): if isinstance(cfg.dynamic_inputs, list): input_sizes = {freq: input_sizes + len(cfg.dynamic_inputs) for freq in self._frequencies} else: - if not self._is_shared_mtslstm: - raise ValueError(f'Different inputs not allowed if shared_mtslstm is False.') + if self._is_shared_mtslstm: + raise ValueError(f'Different inputs not allowed if shared_mtslstm is used.') input_sizes = {freq: input_sizes + len(cfg.dynamic_inputs[freq]) for freq in self._frequencies} if not isinstance(cfg.hidden_size, dict): @@ -86,11 +86,11 @@ def __init__(self, cfg: Config): else: self._hidden_size = cfg.hidden_size - if (not self._is_shared_mtslstm + if (self._is_shared_mtslstm or self._transfer_mtslstm_states["h"] == "identity" or self._transfer_mtslstm_states["c"] == "identity") \ and any(size != self._hidden_size[self._frequencies[0]] for size in self._hidden_size.values()): - raise ValueError("All hidden sizes must be equal if shared_mtslstm=False or state transfer=identity.") + raise ValueError("All hidden sizes must be equal if shared_mtslstm is used or state transfer=identity.") # create layer depending on selected frequencies self._init_modules(input_sizes) @@ -107,7 +107,7 @@ def _init_modules(self, input_sizes: Dict[str, int]): for idx, freq in enumerate(self._frequencies): freq_input_size = input_sizes[freq] - if not self._is_shared_mtslstm and idx > 0: + if self._is_shared_mtslstm and idx > 0: self.lstms[freq] = self.lstms[self._frequencies[idx - 1]] # same LSTM for all frequencies. self.heads[freq] = self.heads[self._frequencies[idx - 1]] # same head for all frequencies. 
else: @@ -162,7 +162,7 @@ def _prepare_inputs(self, data: Dict[str, torch.Tensor], freq: str) -> torch.Ten else: pass - if not self._is_shared_mtslstm: + if self._is_shared_mtslstm: # add frequency one-hot encoding idx = self._frequencies.index(freq) one_hot_freq = torch.zeros(x_d.shape[0], x_d.shape[1], len(self._frequencies)).to(x_d) diff --git a/neuralhydrology/utils/config.py b/neuralhydrology/utils/config.py index 98392af1..591655c7 100644 --- a/neuralhydrology/utils/config.py +++ b/neuralhydrology/utils/config.py @@ -468,7 +468,7 @@ def seq_length(self) -> Union[int, Dict[str, int]]: @property def shared_mtslstm(self) -> bool: - return self._cfg.get("shared_mtslstm", True) + return self._cfg.get("shared_mtslstm", False) @property def static_inputs(self) -> List[str]: From 5e4903a399279d2e1a880ff3abb531cb2c4da783 Mon Sep 17 00:00:00 2001 From: Martin Gauch <15731649+gauchm@users.noreply.github.com> Date: Thu, 8 Oct 2020 13:36:49 +0200 Subject: [PATCH 15/15] Update setup.py module list and metadata (#19) * Update setup.py module list and metadata * add github action to test installation --- .github/workflows/package-install.yml | 20 ++++++++++++++++++++ neuralhydrology/__about__.py | 2 +- setup.py | 16 ++++++++++------ 3 files changed, 31 insertions(+), 7 deletions(-) create mode 100644 .github/workflows/package-install.yml diff --git a/.github/workflows/package-install.yml b/.github/workflows/package-install.yml new file mode 100644 index 00000000..7ac0f84c --- /dev/null +++ b/.github/workflows/package-install.yml @@ -0,0 +1,20 @@ +name: test package installation + +on: + pull_request: + branches: + - master + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + + - uses: actions/setup-python@v2 + with: + python-version: '3.8' + + - name: Try installing the package + run: pip install --no-deps . 
\ No newline at end of file diff --git a/neuralhydrology/__about__.py b/neuralhydrology/__about__.py index 363306ca..885a0dd3 100644 --- a/neuralhydrology/__about__.py +++ b/neuralhydrology/__about__.py @@ -1 +1 @@ -__version__ = "0.9.3-beta" +__version__ = "0.9.3-beta.2" diff --git a/setup.py b/setup.py index 2c89b207..505665e5 100644 --- a/setup.py +++ b/setup.py @@ -16,13 +16,16 @@ setup(name='neuralhydrology', version=about["__version__"], packages=[ - 'neuralhydrology', 'neuralhydrology.data', 'neuralhydrology.utils', 'neuralhydrology.modelzoo', - 'neuralhydrology.training', 'neuralhydrology.evaluation' + 'neuralhydrology', 'neuralhydrology.datasetzoo', 'neuralhydrology.datautils', 'neuralhydrology.utils', + 'neuralhydrology.modelzoo', 'neuralhydrology.training', 'neuralhydrology.evaluation' ], - url='neuralhydrology.readthedocs.io', - license='', - author='Frederik Kratzert', - author_email='f.kratzert@gmail.com', + url='https://neuralhydrology.readthedocs.io', + project_urls={ + 'Documentation': 'https://neuralhydrology.readthedocs.io', + 'Source': 'https://github.com/neuralhydrology/neuralhydrology', + 'Research Blog': 'https://neuralhydrology.github.io/' + }, + author='Frederik Kratzert , Daniel Klotz, Martin Gauch', description='Library for training deep learning models with environmental focus', long_description=long_description, long_description_content_type='text/markdown', @@ -50,5 +53,6 @@ 'Operating System :: OS Independent', 'Topic :: Scientific/Engineering :: Artificial Intelligence', 'Topic :: Scientific/Engineering :: Hydrology', + 'License :: OSI Approved :: BSD License', ], keywords='deep learning hydrology lstm neural network streamflow discharge rainfall-runoff')