diff --git a/METreformat/write_stat_ascii.py b/METreformat/write_stat_ascii.py index 98dcfb32..ae16a5e5 100644 --- a/METreformat/write_stat_ascii.py +++ b/METreformat/write_stat_ascii.py @@ -65,13 +65,16 @@ def __init__(self, parms, logger): sys.exit("*** Error initializing class WriteStatAscii") def write_stat_ascii(self, stat_data: pd.DataFrame, parms: dict) -> pd.DataFrame: - """ For line types: FHO, CTC, CTS, SL1L2, ECNT, MCTS, and VCNT reformat the MET stat files (.stat) to another - ASCII file with stat_name, stat_value, + """ For line types: FHO, CTC, CTS, SL1L2, ECNT, MCTS, VCNT, MPR (line plot), and DMAP (line plot) + reformat the MET stat files (.stat) to another ASCII file with stat_name, stat_value, stat_bcl, stat_bcu, stat_ncl, and stat_ncu columns, converting the original data file from wide form to long form. For TCDiag line type, the MET .tcst stat files (from TC-Pairs) are converted to an ASCII file with the original TC-Pairs columns with the corresponding TC-Diag columns. + !!!!!!!!!! + !!!NOTE!!! + !!!!!!!!!! For line types such as PCT: specific reformatting is required, based on the type of plot that is utilizing that data. @@ -106,7 +109,7 @@ def write_stat_ascii(self, stat_data: pd.DataFrame, parms: dict) -> pd.DataFrame # ---------------------------------- supported_linetypes = [cn.FHO, cn.CNT, cn.VCNT, cn.CTC, cn.CTS, cn.MCTS, cn.SL1L2, cn.ECNT, cn.PCT, - cn.RHIST, cn.TCDIAG, cn.MPR] + cn.RHIST, cn.TCDIAG, cn.MPR, cn.DMAP] # Different formats based on the line types. Most METplotpy plots accept the long format where # all stats are under the stat_name and stat_value columns and the confidence limits under the @@ -218,6 +221,8 @@ def process_by_stat_linetype(self, linetype: str, stat_data: pd.DataFrame, is_ag reformatted with columns corresponding to the linetype's statistics names. 
""" + linetype_data = pd.DataFrame() + # FHO forecast, hit rate, observation rate if linetype == cn.FHO: if is_aggregated: @@ -313,6 +318,10 @@ def process_by_stat_linetype(self, linetype: str, stat_data: pd.DataFrame, is_ag # code in METcalcpy agg_stat.py for MPR. linetype_data: pd.DataFrame = self.process_mpr(stat_data) + elif linetype == cn.DMAP: + # no need to support further formatting for agg_stat. No code in METcalcpy's + # agg_stat.py for DMAP + linetype_data: pd.DataFrame = self.process_dmap(stat_data) else: return None @@ -1766,9 +1775,9 @@ def process_mpr(self, stat_data: pd.DataFrame) -> pd.DataFrame: Retrieve the MPR line type data and reshape it to replace the original columns (based on column number) into stat_name, stat_value, stat_bcl, stat_bcu, stat_ncu, and stat_ncl if the - keep_all_mpr_cols setting is False. + keep_all_cols setting is False. - If keep_all_mpr_cols is set to True, merge the reformatted/reshaped MPR + If keep_all_cols is set to True, merge the reformatted/reshaped MPR data with the original MET output to use the output by both the METplotpy line plot and the METplotpy scatter plot. @@ -1816,14 +1825,11 @@ def process_mpr(self, stat_data: pd.DataFrame) -> pd.DataFrame: # Work on a copy of the mpr_df dataframe to avoid a possible PerformanceWarning # message due to a fragmented dataframe. mpr_df_copy = mpr_df.copy() - # DEBUG REMOVE ME WHEN DONE - mpr_df_copy.to_csv("./mpr_df_orig.txt", sep='\t', index=False) - # DEBUG END mpr_df_copy.insert(loc=0, column='Idx', value=idx) # if reformatting for a scatter plot, only return all the original columns, # maintaining the 'tidy' format provided by the MET tool. - if self.parms['keep_all_mpr_cols'] is True: + if self.parms['keep_all_cols'] is True: return mpr_df_copy # Use pandas 'melt' to reshape the data frame from wide to long shape (i.e. 
def process_dmap(self, stat_data: pd.DataFrame) -> pd.DataFrame:
    """
    Retrieve the DMAP line type data from the MET .stat data and reformat it.

    The output depends on the 'keep_all_cols' setting in self.parms:

    - If keep_all_cols is True, the DMAP rows are returned in the wide
      ('tidy') layout produced by the MET tool, labelled with the
      cn.DMAP_HEADERS column names and with an additional leading 'Idx'
      column that holds each row's index in stat_data.

    - Otherwise, the DMAP-specific statistics columns (cn.LC_DMAP_SPECIFIC)
      are reshaped from wide to long form: the statistic names go under
      the 'stat_name' column and their values under 'stat_value'.  The
      stat_ncl, stat_ncu, stat_bcl and stat_bcu columns are added and
      filled with 'NA'.

    Arguments:
        @param stat_data: The dataframe containing the data from
                          the MET .stat file.

    Returns:
        linetype_data: The dataframe with the reshaped data for the
                       DMAP line type (or the wide DMAP dataframe when
                       keep_all_cols is True).
    """

    # Stat names for this line type, per the MET DMAP line type:
    # TOTAL, FY, OY, FBIAS, BADDELEY, HAUSDORFF, MED_FO, MED_OF, MED_MIN,
    # MED_MAX, FOM_FO, FOM_OF, FOM_MIN, FOM_MAX, FOM_MEAN, ZHU_FO, ZHU_OF,
    # ZHU_MIN, ZHU_MAX, G, GBETA, BETA_VALUE
    # There are no corresponding xyz_bcl, xyz_bcu, xyz_ncl, and xyz_ncu
    # values (where xyz = stat name); those columns are created below and
    # filled with 'NA'.

    # Positions of the columns that belong to the DMAP line type.  The
    # columns are selected by position because they may be unlabelled if
    # other line types were present in the input file.
    dmap_column_positions: List[int] = list(range(cn.NUM_STAT_DMAP_COLS))

    # Keep only the DMAP rows and columns.  Take a copy so the frame is
    # independent of stat_data and is not fragmented by the insert below
    # (avoids a possible PerformanceWarning).
    is_dmap_row = stat_data['line_type'] == cn.DMAP
    dmap_df: pd.DataFrame = stat_data[is_dmap_row].iloc[
        :, dmap_column_positions].copy()

    # Label the columns with the DMAP header names.
    dmap_df.columns = cn.DMAP_HEADERS

    # Preserve the index values from the stat_data dataframe (i.e. the
    # dataframe containing the original data from the MET output file)
    # in a dedicated first column.
    dmap_df.insert(loc=0, column='Idx', value=list(dmap_df.index))

    # If all the original columns were requested, return the data in the
    # 'tidy' format provided by the MET tool.
    if self.parms['keep_all_cols'] is True:
        return dmap_df

    # Use pandas 'melt' to reshape the data frame from wide to long, i.e.
    # collect the DMAP-specific values under the 'stat_value' column, with
    # the name of the originating column under 'stat_name'.
    columns_to_use: List[str] = dmap_df.columns.tolist()
    self.logger.info(f"Columns to use: {columns_to_use} ")

    # Variables to transform from wide to long (i.e. organize into a
    # key-value structure with variables in one column and their
    # corresponding values in another column).  Every DMAP-specific
    # column is transformed.
    variables_to_transform: List[str] = list(cn.LC_DMAP_SPECIFIC)
    # Log the variables that are actually melted.
    self.logger.info(
        f"Variables to transform from wide to long: {variables_to_transform} ")

    # The identifier columns skip 'Idx' (position 0).
    # NOTE(review): the upper bound 27 is hard-coded; confirm it matches
    # the number of identifier columns in cn.DMAP_HEADERS.
    linetype_data: pd.DataFrame = pd.melt(dmap_df,
                                          id_vars=columns_to_use[1:27],
                                          value_vars=variables_to_transform,
                                          var_name='stat_name',
                                          value_name='stat_value',
                                          ignore_index=True)

    # The DMAP line type doesn't have confidence limit values; set the
    # normal and bootstrap confidence limit columns to NA.
    for confidence_column in ('stat_ncl', 'stat_ncu', 'stat_bcl', 'stat_bcu'):
        linetype_data[confidence_column] = 'NA'

    # Clean up the intermediate dataframe.
    del dmap_df
    _ = gc.collect()

    return linetype_data