Skip to content

Commit

Permalink
Add support for more EAMxx variables (#880)
Browse files Browse the repository at this point in the history
Co-authored-by: Tom Vo <[email protected]>
  • Loading branch information
chengzhuzhang and tomvothecoder authored Jan 16, 2025
1 parent 70ecf94 commit d01fd7a
Show file tree
Hide file tree
Showing 19 changed files with 659 additions and 71 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build_workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ jobs:
strategy:
matrix:
python-version: ["3.9", "3.10", "3.11", "3.12"]
container:
container:
image: ghcr.io/e3sm-project/containers-e3sm-diags-test-data:e3sm-diags-test-data-0.0.2
steps:
- id: skip_check
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
"""
This script is used to debug the bottleneck issue in the reference u variable.
"""

# %%
import timeit

import xarray as xr

# Perlmutter
# ----------
# filepaths = [
# "/global/cfs/cdirs/e3sm/diagnostics/observations/Atm/time-series/ERA5/ua_197901_201912.nc"
# ]

# LCRC
# -----
filepaths = [
"/lcrc/group/e3sm/diagnostics/observations/Atm/time-series/ERA5/ua_197901_201912.nc"
]
time_slice = slice("1996-01-15", "1997-01-15", None)

# %%
# Test case 1 - OPEN_MFDATASET() + "ua" dataset (76 GB) + subsetting + `.load()`
# Result: .load() hangs when using `open_mfdataset`
# ------------------------------------------------------------------------------
ds_ua_omfd = xr.open_mfdataset(
filepaths,
decode_times=True,
use_cftime=True,
coords="minimal",
compat="override",
)
ds_ua_omfd_sub = ds_ua_omfd.sel(time=time_slice)

# %%
start_time = timeit.default_timer()
ds_ua_omfd_sub.load()
elapsed = timeit.default_timer() - start_time
print(f"Time taken to load ds_xc_sub: {elapsed} seconds")

# %%
# Test case 2 - OPEN_DATASET() + "ua" dataset (76 GB) + subsetting + `.load()`
# Result: load() works fine when using `open_dataset`
# ------------------------------------------------------------------------------
ds_ua_od = xc.open_dataset(
filepaths[0],
add_bounds=["X", "Y", "T"],
decode_times=True,
use_cftime=True,
# coords="minimal",
# compat="override",
)
ds_ua_od_sub = ds_ua_od.sel(time=time_slice)

# %%
start_time = timeit.default_timer()
ds_ua_od_sub.load()
elapsed = timeit.default_timer() - start_time
print(f"Time taken to load ds_xc_sub: {elapsed} seconds")

# %%
# Test case 3 - OPEN_MFDATASET() + "pr" dataset (2 GB) + subsetting + `.load()`
# Result: ds.load() works fine with pr variable, but not with ua variable
# Notes: pr is 3D variable (time, lat, lon), ua is a 4D variable (time, lat, lon, plev).
# ------------------------------------------------------------------------------
filepaths_pr = [
"/global/cfs/cdirs/e3sm/diagnostics/observations/Atm/time-series/ERA5/pr_197901_201912.nc"
]
ds_pr = xc.open_mfdataset(
filepaths_pr,
add_bounds=["X", "Y", "T"],
decode_times=True,
use_cftime=True,
coords="minimal",
compat="override",
)

# %%
# pr dataset is ~2 GB without subsetting. There is no need to subset.
start_time = timeit.default_timer()
ds_pr.load()
elapsed = timeit.default_timer() - start_time
print(f"Time taken to load ds_xc_sub_0: {elapsed} seconds")
# %%
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
[#]
sets = ["lat_lon"]
case_id = "ERA5"
variables = ["U"]
ref_name = "ERA5"
reference_name = "ERA5 Reanalysis"
seasons = ["ANN", "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "DJF", "MAM", "JJA", "SON"]
plevs = [850.0]
test_colormap = "PiYG_r"
reference_colormap = "PiYG_r"
contour_levels = [-20, -15, -10, -8, -5, -3, -1, 1, 3, 5, 8, 10, 15, 20]
diff_levels = [-8, -6, -5, -4, -3, -2, -1, 1, 2, 3, 4, 5, 6, 8]
regrid_method = "bilinear"
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
"""Run script: EAMxx decadal diagnostics against ERA5 time-series references."""
import os
import sys

from e3sm_diags.parameter.core_parameter import CoreParameter
from e3sm_diags.run import runner

param = CoreParameter()

# Reference: observational time-series data; test: regridded EAMxx output.
param.reference_data_path = (
    "/global/cfs/cdirs/e3sm/diagnostics/observations/Atm/time-series"
)
param.test_data_path = "/global/cfs/cdirs/e3sm/chengzhu/eamxx/post/data/rgr"
param.test_name = "eamxx_decadal"
param.seasons = ["ANN"]
# param.save_netcdf = True

# The reference data are time series, so slice them to a single year.
# Base these years off the years in the filenames.
param.ref_timeseries_input = True
param.ref_start_yr = "1996"
param.ref_end_yr = "1996"

results_prefix = "/global/cfs/cdirs/e3sm/www/cdat-migration-fy24/892-bottleneck"
param.results_dir = os.path.join(results_prefix, "eamxx_decadal_1996_1107_edv3")

# Hand the diagnostics .cfg file to the runner via CLI-style arguments.
diags_cfg = "auxiliary_tools/cdat_regression_testing/892-bottleneck/run_script.cfg"
sys.argv.extend(["--diags", diags_cfg])

runner.sets_to_run = [
    "lat_lon",
    "zonal_mean_xy",
    "zonal_mean_2d",
    "zonal_mean_2d_stratosphere",
    "polar",
    "cosp_histogram",
    "meridional_mean_2d",
    "annual_cycle_zonal_mean",
]

runner.run_diags([param])
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# %%
import timeit

import xarray as xr

# ERA5 "ua" reference time series (1979-2019) on LCRC.
filepaths = [
    "/lcrc/group/e3sm/diagnostics/observations/Atm/time-series/ERA5/ua_197901_201912.nc"
]

# Open lazily with open_mfdataset, then subset to a single year.
ds_sub = xr.open_mfdataset(filepaths).sel(
    time=slice("1996-01-15", "1997-01-15", None)
)

# %%
# Time how long reading the subset "ua" variable into memory takes.
start = timeit.default_timer()
ds_sub.ua.load()
elapsed = timeit.default_timer() - start
print(f"Time taken to load ds_xc_sub: {elapsed} seconds")

# %%
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# %%
# Reproducer for the dask `.load()` bottleneck: build a synthetic ERA5-sized
# 4D dataset, write it to disk, re-open it, and time loading it into memory.
import timeit

import numpy as np
import pandas as pd
import xarray as xr

import dask.array as da

# %%
# Define the dimensions (ERA5-like: 12 time steps, 37 pressure levels,
# 0.25-degree lat/lon grid).
time = 12
plev = 37
lat = 721
lon = 1440

# Create the data arrays using dask (lazy -- nothing is computed yet).
data = da.random.random(size=(time, plev, lat, lon), chunks=(12, 37, 721, 1440)).astype(
    np.float32
)

# Create the coordinates.
times = pd.date_range("2000-01-01", periods=time)
plevs = np.linspace(100000, 10, plev)
lats = np.linspace(-90, 90, lat)
lons = np.linspace(0, 360, lon, endpoint=False)

# Create the dataset and write out to a file.
ds = xr.Dataset(
    {"data": (["time", "plev", "lat", "lon"], data)},
    coords={"time": times, "plev": plevs, "lat": lats, "lon": lons},
)
# %%
ds.to_netcdf("dask_bottleneck.nc")

# %%
# Open the dataset.
ds_open = xr.open_mfdataset("dask_bottleneck.nc")

# %%
# Load the file-backed dataset into memory.
# Fix: the original called `ds.load()`, which computes the in-memory dask
# dataset instead of the file-backed `ds_open` this benchmark is meant to
# measure.
start_time = timeit.default_timer()
ds_open.load()
end_time = timeit.default_timer()

print(f"Time taken to load the dataset: {end_time - start_time} seconds")


# %%
Loading

0 comments on commit d01fd7a

Please sign in to comment.