diff --git a/example/snow_water_equivalent/LSTM_SWE_data_integration_testing.py b/example/snow_water_equivalent/LSTM_SWE_data_integration_testing.py
new file mode 100644
index 0000000..d5be8e8
--- /dev/null
+++ b/example/snow_water_equivalent/LSTM_SWE_data_integration_testing.py
@@ -0,0 +1,208 @@
+import pickle
+import pandas as pd
+import numpy as np
+import os
+import json
+import random
+import torch
+import sys
+sys.path.append('../../')
+from hydroDL.model import test
+from hydroDL.data import scale
+from hydroDL.master.master import loadModel
+from hydroDL.post import stat
+
+# Fix random seeds for reproducibility
+randomseed = 111111
+random.seed(randomseed)
+torch.manual_seed(randomseed)
+np.random.seed(randomseed)
+torch.cuda.manual_seed(randomseed)
+torch.backends.cudnn.deterministic = True
+torch.backends.cudnn.benchmark = False
+
+traingpuid = 0
+torch.cuda.set_device(traingpuid)
+
+rootDB_s = '/mnt/sdb/yxs275/check_code/KFOLD_inputs/SNOTEL_filter_data_1988/'
+
+DateRange = ['2000-01-01', '2019-12-31']
+testDateRange = ['2016-01-01', '2019-12-31']
+
+var_x_list = ['pr_gridMET', 'tmmn_gridMET', 'tmmx_gridMET', 'srad_gridMET', 'vs_gridMET', 'th_gridMET',
+              'sph_gridMET', 'rmin_gridMET', 'rmax_gridMET']
+
+attributeLst = ['lat', 'mean_elev', 'mean_slope', 'aspect',
+                'dom_land_cover', 'dom_land_cover_frac', 'forest_fraction']
+
+targetLst = ['SWE']
+
+# Data integration lag (in days); can be 30 or 7
+DI_day = 30
+# Data integration variable; can be ['SWE'] or ['snow_frac']
+DI_variable = ['SWE']
+
+### Read data:
+# load forcing and target data
+time_range = pd.date_range(DateRange[0], DateRange[-1], freq='d')
+startyear = time_range[0].year
+endyear = time_range[-1].year
+for year in range(startyear, endyear + 1):
+    for fid, forcing_ in enumerate(var_x_list + DI_variable + targetLst):
+        forcing_data = pd.read_csv(rootDB_s + '/' + str(year) + '/' + forcing_ + '.csv', header=None)
+        forcing_data = np.expand_dims(forcing_data, axis=-1)
+        if fid == 0:
+            xTrain_year = forcing_data
+        else:
+            xTrain_year = np.concatenate((xTrain_year, forcing_data), axis=-1)
+
+    if year == startyear:
+        xTrain = xTrain_year
+    else:
+        xTrain = np.concatenate((xTrain, xTrain_year), axis=1)
+
+# load attributes
+for aid, attribute_ in enumerate(attributeLst):
+    attribute_data = pd.read_csv(rootDB_s + '/const/' + attribute_ + '.csv', header=None)
+    if aid == 0:
+        attribute = attribute_data
+    else:
+        attribute = np.concatenate((attribute, attribute_data), axis=-1)
+
+## Select the testing period
+testing_time = pd.date_range(testDateRange[0], testDateRange[-1], freq='d')
+
+index_start = time_range.get_loc(testing_time[0])
+index_end = time_range.get_loc(testing_time[-1]) + 1
+
+xTrain = xTrain[:, index_start:index_end]
+
+## Calculate the statistics and normalize the data
+stat_dict = {}
+for fid, forcing_item in enumerate(var_x_list + DI_variable + targetLst):
+    stat_dict[forcing_item] = scale.cal_stat(xTrain[:, :, fid])
+
+for aid, attribute_item in enumerate(attributeLst):
+    stat_dict[attribute_item] = scale.cal_stat(attribute[:, aid])
+
+xTrain_norm = scale.trans_norm(
+    xTrain, var_x_list + DI_variable + targetLst, stat_dict, to_norm=True
+)
+xTrain_norm[xTrain_norm != xTrain_norm] = 0  # replace NaN with 0
+
+attribute_norm = scale.trans_norm(attribute, list(attributeLst), stat_dict, to_norm=True)
+attribute_norm[attribute_norm != attribute_norm] = 0  # replace NaN with 0
+
+forcing_train_norm = xTrain_norm[:, :, :len(var_x_list)]
+integrated_var_norm = xTrain_norm[:, :, len(var_x_list):len(var_x_list) + len(DI_variable)]
+target_train_norm = xTrain_norm[:, :, -len(targetLst):]
+
+forcing_train_norm_combined = np.concatenate(
+    (forcing_train_norm[:, DI_day:, :], integrated_var_norm[:, :integrated_var_norm.shape[1] - DI_day, :]), axis=-1)
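+# Note (editor's comment, not in the original script): after this concatenation the forcing for
+# day t is paired with the integrated variable (here SWE) observed DI_day days earlier, so the
+# lagged observation enters as one extra input channel. A commented-out sanity check, with
+# n_site / n_day used purely as illustrative names:
+#   n_site, n_day = forcing_train_norm.shape[0], forcing_train_norm.shape[1]
+#   assert forcing_train_norm_combined.shape == (n_site, n_day - DI_day, len(var_x_list) + len(DI_variable))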
+
+target_train_norm = target_train_norm[:, DI_day:, :]
+
+target = xTrain[:, DI_day:, -len(targetLst):]
+
+## Load model
+rootOut = "/mnt/sdb/yxs275/snow_hydroDL/output/" + '/LSTM_SWE_temp_DI_SWE_30/'
+out = os.path.join(rootOut, "exp_EPOCH600_BS100_RHO365_HS256_trainBuff365")  # output folder to save results
+if not os.path.exists(out):
+    os.mkdir(out)
+with open(out + '/scaler_stat.json') as f:
+    stat_dict = json.load(f)
+
+## Test the model
+testepoch = 600
+model_path = out
+print("Load model from ", model_path)
+testmodel = loadModel(model_path, epoch=testepoch)
+
+testbatch = 200  # len(indexes)
+
+filePathLst = [out + "/SWE_norm.csv"]
+
+testmodel.inittime = 0
+
+test.testModel(
+    testmodel, forcing_train_norm_combined, c=attribute_norm, batchSize=testbatch, filePathLst=filePathLst)
+
+dataPred = pd.read_csv(out + "/SWE_norm.csv", dtype=np.float32, header=None).values
+dataPred = np.expand_dims(dataPred, axis=-1)
+
+## Denormalization
+yPred = scale.trans_norm(
+    dataPred,
+    targetLst,
+    stat_dict,
+    to_norm=False,
+)
+
+## Evaluate the predictions
+evaDict = [stat.statError(yPred[:, :, 0], target[:, :, 0])]
+
+evaDictLst = evaDict
+keyLst = ['NSE', 'RMSE', 'Bias', 'Corr']
+dataBox = list()
+for iS in range(len(keyLst)):
+    statStr = keyLst[iS]
+    temp = list()
+    for k in range(len(evaDictLst)):
+        data = evaDictLst[k][statStr]
+        # data = data[~np.isnan(data)]
+        temp.append(data)
+    dataBox.append(temp)
+
+print("LSTM model for SWE prediction: NSE, RMSE (m), Bias (m), Corr: ",
+      np.nanmedian(dataBox[0][0]),
+      np.nanmedian(dataBox[1][0]), np.nanmedian(dataBox[2][0]), np.nanmedian(dataBox[3][0]))
+
+## Yearly maximum SWE per water year (starting 1 October)
+pred_df = pd.DataFrame(yPred[:, :, 0].transpose(), index=testing_time[DI_day:])
+yearly_max_pred = pred_df.resample('AS-OCT').max()
+
+obs_df = pd.DataFrame(target[:, :, 0].transpose(), index=testing_time[DI_day:])
+yearly_max_obs = obs_df.resample('AS-OCT').max()
+
+yearly_max_pred = yearly_max_pred[(yearly_max_pred.index >= f'{testing_time[0].year}-10-01') & (yearly_max_pred.index < f'{testing_time[-1].year}-09-30')]
+yearly_max_obs = yearly_max_obs[(yearly_max_obs.index >= f'{testing_time[0].year}-10-01') & (yearly_max_obs.index < f'{testing_time[-1].year}-09-30')]
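+# Editor's note (illustrative, not in the original script): 'AS-OCT' bins the series into annual
+# periods anchored at 1 October, i.e. water years, so each row of yearly_max_pred / yearly_max_obs
+# holds the peak SWE of one water year per station. One possible way to summarize peak-SWE skill,
+# sketched here only as a commented example:
+#   peak_bias = (yearly_max_pred - yearly_max_obs).mean()  # mean peak-SWE error per station (m)
+#   print("Median peak SWE bias (m):", np.nanmedian(peak_bias.values))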
diff --git a/hydroDL/data/scale.py b/hydroDL/data/scale.py
+import numpy as np
+
+
+def trans_norm(x, var_lst, stat_dict, *, log_norm_cols=None, to_norm=True) -> np.array:
+    """
+    Normalization or inverse normalization
+
+    There are two normalization formulas:
+
+    .. math:: normalized\_x = (x - mean) / std
+
+    and
+
+    .. math:: normalized\_x = [log_{10}(\sqrt{x} + 0.1) - mean] / std
+
+    The latter is only used for variables in log_norm_cols; mean is the mean value and std is
+    the standard deviation.
+
+    Parameters
+    ----------
+    x
+        data to be normalized or denormalized
+    var_lst
+        the type of variables
+    stat_dict
+        statistics of all variables
+    log_norm_cols
+        which columns use the second normalization method
+    to_norm
+        if true, normalize; else denormalize
+
+    Returns
+    -------
+    np.array
+        normalized or denormalized data
+    """
+    if log_norm_cols is None:
+        log_norm_cols = []
+    if type(var_lst) is str:
+        var_lst = [var_lst]
+    out = np.full(x.shape, np.nan)
+    for k in range(len(var_lst)):
+        var = var_lst[k]
+        stat = stat_dict[var]
+        if to_norm is True:
+            if len(x.shape) == 3:
+                if var in log_norm_cols:
+                    x[:, :, k] = np.log10(np.sqrt(x[:, :, k]) + 0.1)
+                out[:, :, k] = (x[:, :, k] - stat[2]) / stat[3]
+            elif len(x.shape) == 2:
+                if var in log_norm_cols:
+                    x[:, k] = np.log10(np.sqrt(x[:, k]) + 0.1)
+                out[:, k] = (x[:, k] - stat[2]) / stat[3]
+        else:
+            if len(x.shape) == 3:
+                out[:, :, k] = x[:, :, k] * stat[3] + stat[2]
+                if var in log_norm_cols:
+                    out[:, :, k] = (np.power(10, out[:, :, k]) - 0.1) ** 2
+            elif len(x.shape) == 2:
+                out[:, k] = x[:, k] * stat[3] + stat[2]
+                if var in log_norm_cols:
+                    out[:, k] = (np.power(10, out[:, k]) - 0.1) ** 2
+    return out
+
+
+def cal_4_stat_inds(b):
+    """
+    Calculate four statistical indices: the 10th and 90th percentiles, the mean and the standard deviation
+
+    Parameters
+    ----------
+    b
+        input data
+
+    Returns
+    -------
+    list
+        [p10, p90, mean, std]
+    """
+    p10 = np.percentile(b, 10).astype(float)
+    p90 = np.percentile(b, 90).astype(float)
+    mean = np.mean(b).astype(float)
+    std = np.std(b).astype(float)
+    if std < 0.001:
+        std = 1  # guard against division by a (near-)zero std
+    return [p10, p90, mean, std]
+
+
+def cal_stat_gamma(x):
+    """
+    Transform gamma-like time series data toward a normal distribution before computing statistics
+
+    Now only for daily streamflow, precipitation and evapotranspiration;
+    NaN values are ignored.
+
+    Parameters
+    ----------
+    x
+        time series data
+
+    Returns
+    -------
+    list
+        [p10, p90, mean, std]
+    """
+    a = x.flatten()
+    b = a[~np.isnan(a)]  # drop NaN values
+    b = np.log10(
+        np.sqrt(b) + 0.1
+    )  # transformation to reduce the gamma-like skew
+    return cal_4_stat_inds(b)
+
+
+def cal_stat(x: np.array) -> list:
+    """
+    Get statistics of x, excluding NaN values
+
+    Parameters
+    ----------
+    x
+        the array
+
+    Returns
+    -------
+    list
+        [10% quantile, 90% quantile, mean, std]
+    """
+    a = x.flatten()
+    b = a[~np.isnan(a)]
+    if b.size == 0:
+        # if b is empty, fall back to a single 0 value
+        b = np.array([0])
+    return cal_4_stat_inds(b)
diff --git a/hydroDL/post/stat.py b/hydroDL/post/stat.py
index ed586dc..8c0d650 100644
--- a/hydroDL/post/stat.py
+++ b/hydroDL/post/stat.py
@@ -8,6 +8,9 @@ def statError(pred, target):
     ngrid, nt = pred.shape
     # Bias
     Bias = np.nanmean(pred - target, axis=1)
+
+    absBias = np.nanmean(abs(pred - target), axis=1)
+
     # RMSE
     RMSE = np.sqrt(np.nanmean((pred - target)**2, axis=1))
     # ubRMSE
@@ -91,7 +94,7 @@
         R2[k] = 1-SSRes/SST
         NSE[k] = 1-SSRes/SST
 
-    outDict = dict(Bias=Bias, RMSE=RMSE, ubRMSE=ubRMSE, Corr=Corr, CorrSp=CorrSp, R2=R2, NSE=NSE,
+    outDict = dict(Bias=Bias, absBias=absBias, RMSE=RMSE, ubRMSE=ubRMSE, Corr=Corr, CorrSp=CorrSp, R2=R2, NSE=NSE,
                    FLV=PBiaslow, FHV=PBiashigh, PBias=PBias, PBiasother=PBiasother,
                    AFLV=absPBiaslow, AFHV=absPBiashigh, AFMV=absPBiasother, KGE=KGE, KGE12=KGE12,
                    lowRMSE=RMSElow, highRMSE=RMSEhigh, midRMSE=RMSEother)
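# ---------------------------------------------------------------------------
# Editor's usage sketch (not part of the patch above): how cal_stat and
# trans_norm are intended to round-trip. The array shape and the variable
# name "SWE" below are illustrative assumptions only.
#
#   import numpy as np
#   from hydroDL.data import scale
#
#   x = np.random.rand(5, 100, 1)                      # (site, time, variable)
#   stat_dict = {"SWE": scale.cal_stat(x[:, :, 0])}    # [p10, p90, mean, std]
#   x_norm = scale.trans_norm(x.copy(), ["SWE"], stat_dict, to_norm=True)
#   x_back = scale.trans_norm(x_norm, ["SWE"], stat_dict, to_norm=False)
#   assert np.allclose(x, x_back)                      # denormalization inverts normalization
# ---------------------------------------------------------------------------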