Commit 1ee4c4a: Merge in upstream

collijk committed May 13, 2024
2 parents: 2f3d168 + e6e252a
Showing 13 changed files with 518 additions and 310 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -78,5 +78,5 @@ venv.bak/
# IDEs
.idea/

# Jupyter Notebook
# Jupyter scratch notebooks
notebooks/
5 changes: 0 additions & 5 deletions .pre-commit-config.yaml
@@ -25,11 +25,6 @@ repos:
# Add --fix, in case you want it to autofix when this hook runs
entry: poetry run ruff check --force-exclude --fix
require_serial: true
- id: ruff
name: ruff
# Add --fix, in case you want it to autofix when this hook runs
entry: poetry run ruff check --force-exclude
require_serial: true
language: system
types: [ python ]
- id: mypy
8 changes: 4 additions & 4 deletions README.md
@@ -21,13 +21,13 @@ Pipelines to downscale ERA5 and CMIP6 data.

Instructions using conda:

1. Clone this repository.

Over ssh:
```sh
git clone git@github.com:ihmeuw/climate-downscale.git
```

Over https:
```sh
git clone https://github.com/ihmeuw/climate-downscale.git
@@ -49,8 +49,8 @@ Instructions using conda:

### Pre-commit

Pre-commit hooks run all the auto-formatting (`ruff format`), linters
(e.g. `ruff` and `mypy`), and other quality checks to make sure the changeset is in
good shape before a commit/push happens.

You can install the hooks with (runs for each commit):
11 changes: 10 additions & 1 deletion pyproject.toml
@@ -68,13 +68,19 @@ cdtask = "climate_downscale.cli:cdtask"

[tool.ruff]
target-version = "py310" # The lowest supported version
exclude = [
"temperature",
]

[tool.ruff.lint]
# By default, enable all the lint rules.
# Add to the ignore list below if you don't want some rules.
# If you need some ignores for certain modules, see tool.ruff.lint.per-file-ignores below.
# For individual ignore cases, prefer inline `# noqa`s within the code.
select = ["ALL"]
exclude = [
"temperature",
]
ignore = [
"COM812", # flake8 missing trailing comma, formatter handles
"ISC001", # Implicit string concatenation
@@ -103,7 +109,7 @@ classmethod-decorators = [
]

[tool.ruff.lint.flake8-tidy-imports]
ban-relative-imports = true
ban-relative-imports = "all"

[tool.pytest.ini_options]
addopts = """\
@@ -125,6 +131,9 @@ exclude_lines = [
# Avoid changing this!
strict = true # See all the enabled flags `mypy --help | grep -A 10 'Strict mode'`
disallow_any_unimported = true
exclude = [
"temperature",
]

# If you need to ignore something for some specific module,
# add overrides for them. Avoid changing the global config!
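For readers unfamiliar with the flake8-tidy-imports setting changed above, here is a minimal sketch (not part of this commit; module names below are hypothetical) of what `ban-relative-imports = "all"` enforces:

```python
# Sketch only -- not from the commit. Shows what ruff's
# [tool.ruff.lint.flake8-tidy-imports] ban-relative-imports = "all" enforces
# (rule TID252). Module names below are hypothetical.

# Flagged only once "all" is set (same-package relative import):
# from .helpers import load_chunk
# Flagged under the default "parents" setting as well (climbs a package level):
# from ..shared import constants

# Preferred style with "all": absolute imports rooted at the package name.
from climate_downscale.cli import cdtask  # entry point declared in pyproject.toml above
```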
4 changes: 2 additions & 2 deletions temperature/README.md
@@ -4,5 +4,5 @@ This repo contains the fundamental code used to downscale ERA5 daily average tem

1. Moving the intermediate processing steps to their own worker scripts
2. Converting all outputs to a compressed file format. Ideally, using compression options found in the xarray library when outputting .nc files.
3. Creating an additional script that addresses problematic outliers in the prediction step. Ideally, this is done within the regional_predictions.py script or directly after it.
4. Include diagnostic scripts used to visualize prediction data.
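Item 2 in the list above refers to xarray's per-variable compression options when writing .nc files. A minimal sketch of that idea (illustrative only; the array, coordinates, and file name are placeholders, not taken from this repo):

```python
# Minimal sketch of xarray's per-variable netCDF compression (illustrative;
# data, coordinates, and file name are placeholders). Requires the netCDF4
# or h5netcdf backend to be installed.
import numpy as np
import xarray as xr

ds = xr.Dataset(
    {"predictions": (("lat", "lon"), np.random.rand(180, 360).astype("float32"))},
    coords={"lat": np.arange(-89.5, 90.5, 1.0), "lon": np.arange(-179.5, 180.5, 1.0)},
)

# The encoding dict is passed through to the backend: zlib + complevel gives
# lossless deflate compression; dtype/scale_factor packing (as in the script
# below) is an additional, lossy option.
ds.to_netcdf(
    "predictions_compressed.nc",
    encoding={"predictions": {"zlib": True, "complevel": 4, "dtype": "float32"}},
)
```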
54 changes: 28 additions & 26 deletions temperature/daily_preds_nc.py
@@ -22,23 +22,23 @@
final_data = pd.DataFrame()

for file in os.listdir(folder):
if file.endswith('.feather'):
if file.endswith(".feather"):
print(file)
# load the data
print("Loading data")
data = pd.read_feather(folder + file)
# drop index column
print("Dropping unnecessary columns")
# data = data.drop(columns='index')
data = data[['latitude_st', 'longitude_st', 'day_of_year', 'predictions']]
data = data[["latitude_st", "longitude_st", "day_of_year", "predictions"]]
# filter to get day
print("Filtering to get day: ", day)
data_day = data[data['day_of_year'] == day]
data_day = data[data["day_of_year"] == day]
# round lat/lon for storage
print("Rounding lat/lon")
data_day['latitude_st'] = data_day['latitude_st'].round(10)
data_day['longitude_st'] = data_day['longitude_st'].round(10)
data_day['predictions'] = data_day['predictions'].round(3)
data_day["latitude_st"] = data_day["latitude_st"].round(10)
data_day["longitude_st"] = data_day["longitude_st"].round(10)
data_day["predictions"] = data_day["predictions"].round(3)
print("Appending data")
# Append the data for the current day to the larger DataFrame
final_data = pd.concat([final_data, data_day])
@@ -53,26 +53,26 @@
date = date + timedelta(days=day_of_year)

# Format the datetime object into a string
date_string = date.strftime('%Y_%m_%d')
date_string = date.strftime("%Y_%m_%d")
print("Here is the date string: ", date_string)

# replace old time columns with date
final_data.rename(columns={'latitude_st': 'lat', 'longitude_st': 'lon'}, inplace=True)
final_data = final_data[['lat', 'lon', 'predictions']]
final_data['date'] = date_string
final_data.rename(columns={"latitude_st": "lat", "longitude_st": "lon"}, inplace=True)
final_data = final_data[["lat", "lon", "predictions"]]
final_data["date"] = date_string

filename = 'predictions_' + date_string
filename = "predictions_" + date_string

print('resetting index')
final_data.set_index(['date', 'lat', 'lon'], inplace=True)
print("resetting index")
final_data.set_index(["date", "lat", "lon"], inplace=True)

# convert the dataframe to an xarray Dataset
print('converting to xarray Dataset')
print("converting to xarray Dataset")
final_data = xr.Dataset.from_dataframe(final_data)

# Calculate the scale factor and add offset
min_value = final_data['predictions'].min(dim=('date', 'lat', 'lon')).item()
max_value = final_data['predictions'].max(dim=('date', 'lat', 'lon')).item()
min_value = final_data["predictions"].min(dim=("date", "lat", "lon")).item()
max_value = final_data["predictions"].max(dim=("date", "lat", "lon")).item()
scale_factor = (max_value - min_value) / (2**16 - 1)
add_offset = min_value

@@ -82,13 +82,15 @@
print("saving file to: ", output_path + filename)

# save the Dataset to a netCDF file
final_data.to_netcdf(output_path + filename + '.nc',
encoding={
'predictions': {
'dtype': 'float32',
'scale_factor': scale_factor,
'add_offset': add_offset,
'_FillValue': -9999,
'zlib': True
}
})
final_data.to_netcdf(
output_path + filename + ".nc",
encoding={
"predictions": {
"dtype": "float32",
"scale_factor": scale_factor,
"add_offset": add_offset,
"_FillValue": -9999,
"zlib": True,
}
},
)
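The `scale_factor`/`add_offset` computed in this script follow the usual CF-style packing convention. A small worked example of the arithmetic (values are made up, not taken from the data):

```python
# Worked example of the packing arithmetic set up above (numbers are made up,
# not taken from the data). Note that the encoding in this script keeps
# dtype float32; packing to 16 bits would normally pair the scale factor with
# an integer dtype such as int16 or uint16.
min_value, max_value = -40.0, 45.0                     # hypothetical prediction range

scale_factor = (max_value - min_value) / (2**16 - 1)   # ~0.0013 units per integer step
add_offset = min_value

def pack(value: float) -> int:
    """Map a physical value onto the 0..65535 integer range."""
    return round((value - add_offset) / scale_factor)

def unpack(stored: int) -> float:
    """Invert the packing: stored * scale_factor + add_offset."""
    return stored * scale_factor + add_offset

print(pack(20.0))          # ~46261
print(unpack(pack(20.0)))  # ~20.0, accurate to within scale_factor / 2
```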
95 changes: 59 additions & 36 deletions temperature/grid_setup/grid_setup_with_elevation_v3.py
@@ -23,7 +23,7 @@
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from joblib import load
@@ -45,25 +45,35 @@
import subprocess

import pyproj
pyproj.datadir.set_data_dir('/ihme/homes/nhashmeh/miniconda3/envs/earth-analytics-python/share/proj')
os.environ['PROJ_LIB'] = '/ihme/homes/nhashmeh/miniconda3/envs/earth-analytics-python/share/proj'

pyproj.datadir.set_data_dir(
"/ihme/homes/nhashmeh/miniconda3/envs/earth-analytics-python/share/proj"
)
os.environ["PROJ_LIB"] = (
"/ihme/homes/nhashmeh/miniconda3/envs/earth-analytics-python/share/proj"
)

# Parse arguments
parser = argparse.ArgumentParser()
parser.add_argument('file', type=str, help='filename')
parser.add_argument("file", type=str, help="filename")

args = parser.parse_args()

chunks_path = "/mnt/share/erf/ERA5/merged_era5_with_stations/grid_setup/chunks_by_latlon_v3/"
chunks_path = (
"/mnt/share/erf/ERA5/merged_era5_with_stations/grid_setup/chunks_by_latlon_v3/"
)

chunk_file = args.file

grid_df = pd.read_feather(chunks_path + chunk_file)

grid_df.drop(columns=['latitude_right', 'longitude_right'], inplace=True) # this should have been dropped earlier...
grid_df.drop(
columns=["latitude_right", "longitude_right"], inplace=True
) # this should have been dropped earlier...

grid_df["geometry"] = grid_df["geometry"].apply(wkt.loads)
grid_gdf = gpd.GeoDataFrame(grid_df, geometry="geometry")

grid_df['geometry'] = grid_df['geometry'].apply(wkt.loads)
grid_gdf = gpd.GeoDataFrame(grid_df, geometry='geometry')

# Function to adjust longitude
def adjust_longitude(geometry):
@@ -72,15 +82,15 @@ def adjust_longitude(geometry):
else:
return geometry


gdf_merge = grid_gdf
gdf_merge['elevation'] = np.nan
gdf_merge["elevation"] = np.nan

merge_counter = 0
success_counter = 0

for file in os.listdir('/mnt/share/erf/SRTM_GL1_srtm/'):

file_ext = file.split('.')[-1]
for file in os.listdir("/mnt/share/erf/SRTM_GL1_srtm/"):
file_ext = file.split(".")[-1]

if file_ext != "tif":
continue
@@ -93,59 +103,72 @@ def adjust_longitude(geometry):
if file_ext != "tif":
continue

if NS == 'S':
NS_num= float(NS_num) * -1
if NS == "S":
NS_num = float(NS_num) * -1

if EW == 'W':
if EW == "W":
EW_num = 360 - float(EW_num)

if (((EW_num < math.floor(gdf_merge['longitude_left'].min())) | (EW_num > math.ceil(gdf_merge['longitude_left'].max()))) | ((NS_num < math.floor(gdf_merge['latitude_left'].min())) | (NS_num > math.ceil(gdf_merge['latitude_left'].max())))):
if (
(EW_num < math.floor(gdf_merge["longitude_left"].min()))
| (EW_num > math.ceil(gdf_merge["longitude_left"].max()))
) | (
(NS_num < math.floor(gdf_merge["latitude_left"].min()))
| (NS_num > math.ceil(gdf_merge["latitude_left"].max()))
):
merge_counter += 1
continue

# rescaling 30 meters to 1 km
downscale_factor = 30/1000
downscale_factor = 30 / 1000

# Load the raster
with reo.open("/mnt/share/erf/SRTM_GL1_srtm/" + file) as src:
# resample data to target shape
data = src.read(
out_shape=(
src.count,
int(src.height * downscale_factor), # multiply height by downscale factor
int(src.width * downscale_factor) # multiply width by downscale factor
int(
src.height * downscale_factor
), # multiply height by downscale factor
int(src.width * downscale_factor), # multiply width by downscale factor
),
resampling=Resampling.bilinear # resampling method
resampling=Resampling.bilinear, # resampling method
)

# scale image transform
transform = src.transform * src.transform.scale(
(src.width / data.shape[-1]),
(src.height / data.shape[-2])
(src.width / data.shape[-1]), (src.height / data.shape[-2])
)

# convert the resampled raster to a geodataframe
results = (
{'properties': {'raster_val': v}, 'geometry': s}
for i, (s, v)
in enumerate(
reo.features.shapes(data[0], transform=transform))) # assumes a single band image
{"properties": {"raster_val": v}, "geometry": s}
for i, (s, v) in enumerate(
reo.features.shapes(data[0], transform=transform)
)
) # assumes a single band image

geodf = gpd.GeoDataFrame.from_features(list(results))

# Apply the function to the geometry column
geodf['geometry'] = geodf['geometry'].apply(adjust_longitude)
geodf["geometry"] = geodf["geometry"].apply(adjust_longitude)

prev_len = len(gdf_merge[~gdf_merge['elevation'].isna()]) # used for debugging
prev_len = len(gdf_merge[~gdf_merge["elevation"].isna()]) # used for debugging

gdf_merge = gpd.sjoin(gdf_merge, geodf, how='left', predicate='intersects')
gdf_merge = gpd.sjoin(gdf_merge, geodf, how="left", predicate="intersects")
# test_merge.rename(columns = {'raster_val' : 'elevation'}, inplace=True)
gdf_merge['elevation'] = gdf_merge['elevation'].combine_first(gdf_merge['raster_val'])
gdf_merge.drop(columns = 'index_right', inplace=True)
gdf_merge.drop(columns = 'raster_val', inplace=True)

gdf_merge["elevation"] = gdf_merge["elevation"].combine_first(
gdf_merge["raster_val"]
)
gdf_merge.drop(columns="index_right", inplace=True)
gdf_merge.drop(columns="raster_val", inplace=True)

if len(gdf_merge[~gdf_merge['elevation'].isna()]) > prev_len:
if len(gdf_merge[~gdf_merge["elevation"].isna()]) > prev_len:
success_counter += 1

gdf_merge.to_feather("/mnt/share/erf/ERA5/merged_era5_with_stations/grid_setup/chunks_with_elevation_v3/" + chunk_file, index=False)
gdf_merge.to_feather(
"/mnt/share/erf/ERA5/merged_era5_with_stations/grid_setup/chunks_with_elevation_v3/"
+ chunk_file,
index=False,
)
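A short illustration (not from the commit) of the resampling arithmetic used above: an SRTM GL1 1-arc-second tile is typically 3601 x 3601 pixels at roughly 30 m spacing, so multiplying the raster shape by 30/1000 targets a ~1 km output grid, and the affine transform is stretched by the inverse ratio so the resampled raster still covers the same tile footprint.

```python
# Illustration of the downscale_factor arithmetic above (tile size assumed to
# be the typical 3601 x 3601 SRTM GL1 1-arc-second tile; nothing is read from disk).
downscale_factor = 30 / 1000          # ~30 m source pixels -> ~1 km target pixels

src_height, src_width = 3601, 3601
out_height = int(src_height * downscale_factor)   # 108 rows
out_width = int(src_width * downscale_factor)     # 108 cols

# The transform scale factors are the inverse ratio, so each output pixel
# spans ~33 source pixels and the tile keeps its 1-degree footprint.
scale_x = src_width / out_width       # ~33.3 source columns per output column
scale_y = src_height / out_height

print(out_height, out_width, round(scale_x, 1), round(scale_y, 1))
```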
(Diffs for the remaining 6 changed files did not load in this view.)

