EO Development (draft pull request) #3

Open · wants to merge 52 commits into base: master

52 commits
36007ea
Merge remote-tracking branch 'origin/development' into development-eo
annajungbluth Apr 25, 2024
0c2b26f
started eo training development
annajungbluth Apr 25, 2024
9993839
started testing training pipeline
annajungbluth Apr 25, 2024
3330469
wip - tested training pipeline
annajungbluth Apr 25, 2024
9071b24
made training pipeline run
annajungbluth Apr 26, 2024
d7f2c9d
moved geo dataset and editors into ITI repo
annajungbluth Apr 26, 2024
b6983b2
added new editor and modified training script
annajungbluth Apr 27, 2024
2879551
modified editor
annajungbluth Apr 27, 2024
12ee80e
tested callbacks
annajungbluth Apr 28, 2024
cdc4c7c
added normalisation steps to training script, and started writing a n…
lillif May 13, 2024
92c26d6
training script for miniset set up, mean std normalisation finished
lillif May 14, 2024
a18e68a
normalisation script finished, attempted training
lillif May 17, 2024
9d592bb
started hydra training file
annajungbluth May 19, 2024
6d24344
added normalization and fixed bugs in training script
annajungbluth May 20, 2024
1ddd8d5
merge with master
annajungbluth Oct 3, 2024
d7b31d0
fixed small merge bugs and added autoroot file
annajungbluth Oct 3, 2024
ad54e8e
Added file with dataset information
annajungbluth Oct 4, 2024
72c3f82
fixed goes metrics file
annajungbluth Oct 18, 2024
4533b53
updated summary files
annajungbluth Oct 20, 2024
2e72ef8
added new normalization routine and started first experiment
annajungbluth Oct 20, 2024
7a79a1e
reduced val data
annajungbluth Oct 20, 2024
d3a4ad4
optimized dataloader to reduce memory consumption
annajungbluth Oct 24, 2024
e2c5675
added normalization files for subset of data
annajungbluth Oct 31, 2024
625852a
debugging constant channels
annajungbluth Oct 31, 2024
8770404
started new experiment
annajungbluth Oct 31, 2024
5b98ee1
added seed to training script
annajungbluth Nov 1, 2024
e586a44
added min max normalizer
annajungbluth Nov 18, 2024
3a9c243
updated center weighted cropping routine for better cropping
annajungbluth Nov 22, 2024
94ea2cd
removed file with large missing band
annajungbluth Nov 25, 2024
943fe17
started miniset experiment
annajungbluth Nov 25, 2024
520415b
removed files with pixel strips missing
annajungbluth Nov 25, 2024
69f5f68
removed files with missing or half missing channels
annajungbluth Nov 25, 2024
e61dca0
added files for miniset and removed files with artifacts
annajungbluth Nov 25, 2024
cc5031e
fixed subser summary file
annajungbluth Nov 27, 2024
2c17c29
updated config
annajungbluth Nov 27, 2024
479bb93
fixed mistake in notebook
annajungbluth Nov 27, 2024
eea208d
added script for goes-to-msg translation
annajungbluth Dec 6, 2024
2a17687
updated miniset
annajungbluth Dec 15, 2024
4b3caf0
added separation of A and B patch size
annajungbluth Dec 15, 2024
dad7444
modified config for miniset experiment
annajungbluth Dec 15, 2024
e6af551
experiment modifications
annajungbluth Dec 15, 2024
b4d3b62
merged with changes adding rotation transform
annajungbluth Dec 15, 2024
1fa9544
modified dataloader to handle no cropping
annajungbluth Dec 17, 2024
5eb95f8
updated config for experimentation with miniset
annajungbluth Dec 17, 2024
76dfecb
fixed problem with normalization and fixed experiment
annajungbluth Dec 19, 2024
4e37c53
added storage dataset
annajungbluth Dec 19, 2024
1bc1c8c
started new experiment for all infrared channels
annajungbluth Dec 21, 2024
f0ef3da
started new experiment with visible channels
annajungbluth Dec 21, 2024
399bfa6
started new experiment with visible channels
annajungbluth Dec 21, 2024
6a6641e
deleted testing code
annajungbluth Dec 21, 2024
5a5482c
updated config for new experiment
annajungbluth Dec 23, 2024
2d35d61
merged configs
annajungbluth Dec 23, 2024
Empty file added .project-root
Empty file.
54 changes: 54 additions & 0 deletions config/example-hydra-config/data.yaml
@@ -0,0 +1,54 @@
A_data:
  A_path: null
  A_train_dataset:
    _target_: iti.data.geo_datasets.GeoDataset # TODO: make specific msg dataset?
    data_dir: null
    editors: null # TODO: hard code in dataset?
    splits_dict:
      train:
        years: [2020]
        months: [10]
        days: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
    load_coords: False
    load_cloudmask: False
  A_val_dataset:
    _target_: iti.data.geo_datasets.GeoDataset # TODO: make specific msg dataset?
    data_dir: null
    editors: null # TODO: hard code in dataset?
    splits_dict:
      train:
        years: [2020]
        months: [10]
        days: [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
    load_coords: False
    load_cloudmask: False
  A_plot_settings: null

B_data:
  B_path: null
  B_train_dataset:
    _target_: iti.data.geo_datasets.GeoDataset # TODO: make specific goes dataset?
    data_dir: null
    editors: null # TODO: hard code in dataset?
    splits_dict:
      train:
        years: [2020]
        months: [10]
        days: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
    load_coords: False
    load_cloudmask: False
  B_val_dataset:
    _target_: iti.data.geo_datasets.GeoDataset # TODO: make specific goes dataset?
    data_dir: null
    editors: null # TODO: hard code in dataset?
    splits_dict:
      train:
        years: [2020]
        months: [10]
        days: [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
    load_coords: False
    load_cloudmask: False
  B_plot_settings: null

num_workers: 4
iterations_per_epoch: 1000
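For context, a minimal sketch of how a Hydra config like this is typically consumed (the `main` wrapper and script layout below are illustrative assumptions, not code from this PR; the null `data_dir` entries would need to be filled in first):

import hydra
from omegaconf import DictConfig

@hydra.main(config_path="config/example-hydra-config", config_name="data", version_base=None)
def main(cfg: DictConfig) -> None:
    # `_target_` names iti.data.geo_datasets.GeoDataset, so instantiate()
    # builds the dataset and forwards data_dir, editors, splits_dict, etc.
    train_a = hydra.utils.instantiate(cfg.A_data.A_train_dataset)
    val_a = hydra.utils.instantiate(cfg.A_data.A_val_dataset)
    print(len(train_a), len(val_a))

if __name__ == "__main__":
    main()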
9 changes: 9 additions & 0 deletions config/example-hydra-config/model.yaml
@@ -0,0 +1,9 @@
model:
  _target_: null
  input_dim_a: 11
  input_dim_b: 16
  upsampling: 0
  discriminator_mode: CHANNELS
  lambda_diversity: 0
  norm: 'none'
  use_batch_statistic: False
1 change: 1 addition & 0 deletions config/example-hydra-config/train.yaml
@@ -0,0 +1 @@
base_dir: /home/freischem/outputs/miniset/
6 changes: 6 additions & 0 deletions config/example-hydra-config/wandb.yaml
@@ -0,0 +1,6 @@
experiment_name: null
tags: null
wandb_entity: null
wandb_project: null
wandb_name: null
wandb_id: null
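These keys map directly onto the standard wandb.init arguments; the snippet below is an assumption about how the training script consumes them, not code from this PR:

import yaml
import wandb

with open("config/example-hydra-config/wandb.yaml") as f:
    cfg = yaml.safe_load(f)

run = wandb.init(
    entity=cfg["wandb_entity"],
    project=cfg["wandb_project"],
    name=cfg["wandb_name"],
    id=cfg["wandb_id"],
    tags=cfg["tags"],
    resume="allow",  # a fixed wandb_id lets an interrupted run resume
)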
28 changes: 28 additions & 0 deletions config/msg_to_goes.yaml
@@ -0,0 +1,28 @@
base_dir: /home/anna.jungbluth/outputs/msg-to-goes/
data:
  A_path: /mnt/disks/eo-data/msg/
  converted_A_path: /home/anna.jungbluth/tmp-data/msg/
  B_path: /mnt/disks/eo-data/goes/
  converted_B_path: /home/anna.jungbluth/tmp-data/goes/
  num_workers: 4
  iterations_per_epoch: 1000
  A_patch_size: (1024, 1024) # Larger patches are saved for accelerated training.
  B_patch_size: (1024, 1024) # Patches are further cropped to (256, 256) before training.
  A_bands: [6.25, 7.35, 8.7, 9.66, 10.8, 12.0, 13.4] # [0.64, 0.81, 1.64, 3.92, 6.25, 7.35, 8.7, 9.66, 10.8, 12.0, 13.4]
  B_bands: [6.17, 6.93, 7.34, 8.44, 9.61, 10.33, 11.19, 12.27, 13.27] # [0.47, 0.64, 0.87, 1.38, 1.61, 2.25, 3.89, 6.17, 6.93, 7.34, 8.44, 9.61, 10.33, 11.19, 12.27, 13.27]
model:
  input_dim_a: 7
  input_dim_b: 9
  upsampling: 0
  discriminator_mode: CHANNELS
  lambda_diversity: 0
  norm: 'in_rs_aff'
  use_batch_statistic: False
logging:
  wandb_entity: itieo
  wandb_project: msg-to-goes
  wandb_name: MSG_to_GOES-infrared-7bands (6.25 - 13.4 um)
training:
  epochs: 200
  limit_train_batches: null
normalization: # TODO: Change to avoid absolute paths
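Note that YAML has no tuple literal, so `A_patch_size: (1024, 1024)` loads as the string "(1024, 1024)". One way a training script might convert it back (the helper below is a hypothetical sketch, not necessarily how this repo does it):

import ast

def parse_patch_size(value):
    """Convert a YAML patch-size entry to a tuple of ints, or None."""
    if value is None:
        return None
    if isinstance(value, str):
        # ast.literal_eval safely parses "(1024, 1024)" into a tuple
        return tuple(ast.literal_eval(value))
    return tuple(value)

assert parse_patch_size("(1024, 1024)") == (1024, 1024)
assert parse_patch_size(None) is None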
29 changes: 29 additions & 0 deletions config/msg_to_goes_miniset.yaml
@@ -0,0 +1,29 @@
base_dir: /home/anna.jungbluth/outputs/msg-to-goes-miniset/
data:
  A_path: /home/anna.jungbluth/data-miniset/msg/
  B_path: /home/anna.jungbluth/data-miniset/goes/
  num_workers: 4
  iterations_per_epoch: 500
  A_patch_size: null # Already cropped to 150 x 150
  B_patch_size: null # Already cropped to 450 x 450
  A_bands: [10.8] # [0.64, 0.81, 1.64, 3.92, 6.25, 7.35, 8.7, 9.66, 10.8, 12.0, 13.4]
  B_bands: [10.33] # [0.47, 0.64, 0.87, 1.38, 1.61, 2.25, 3.89, 6.17, 6.93, 7.34, 8.44, 9.61, 10.33, 11.19, 12.27, 13.27]
model:
  input_dim_a: 1
  input_dim_b: 1
  upsampling: 1 # one upscaling of x3
  discriminator_mode: SINGLE
  lambda_diversity: 0
  norm: 'in_aff'
  use_batch_statistic: False
logging:
  wandb_entity: itieo
  wandb_project: msg-to-goes
  wandb_name: "[miniset]-MSG_to_GOES-infrared-upsampled (10.8 um) ['in_aff' norm]"
training:
  epochs: 200
  limit_train_batches: null
  limit_val_batches: null
normalization: # TODO: Change to avoid absolute paths
  A_norm_dir: /home/anna.jungbluth/InstrumentToInstrument/dataset/tmp/msg_2020_miniset.csv
  B_norm_dir: /home/anna.jungbluth/InstrumentToInstrument/dataset/tmp/goes_2020_miniset.csv
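The band selections must stay in sync with the model input dims (here one 10.8 um MSG channel is translated to one 10.33 um GOES channel). A small hypothetical sanity check, not part of the PR, makes the invariant explicit:

import yaml

# The number of selected bands must match the generator input dims.
with open("config/msg_to_goes_miniset.yaml") as f:
    cfg = yaml.safe_load(f)

assert len(cfg["data"]["A_bands"]) == cfg["model"]["input_dim_a"]
assert len(cfg["data"]["B_bands"]) == cfg["model"]["input_dim_b"]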
8,689 changes: 8,689 additions & 0 deletions dataset/goes_2020_hourly.csv

Large diffs are not rendered by default.

4,187 changes: 4,187 additions & 0 deletions dataset/goes_2020_hourly_subset.csv

Large diffs are not rendered by default.

8,707 changes: 8,707 additions & 0 deletions dataset/msg_2020_hourly.csv

Large diffs are not rendered by default.

4,350 changes: 4,350 additions & 0 deletions dataset/msg_2020_hourly_subset.csv

Large diffs are not rendered by default.

67 changes: 67 additions & 0 deletions dataset/tmp/goes_2020_miniset.csv

Large diffs are not rendered by default.

70 changes: 70 additions & 0 deletions dataset/tmp/msg_2020_miniset.csv

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions itipy/callback.py
@@ -102,7 +102,7 @@ def __init__(self, data, model, plot_settings_A=None, plot_settings_B=None, plot

plot_settings = [*plot_settings_A, *plot_settings_B, *plot_settings_A]

-        super().__init__(data, model, path, plot_id, plot_settings, **kwargs)
+        super().__init__(data, model, plot_id, plot_settings, **kwargs)

def predict(self, x):
x_ab, x_aba = self.model.forwardABA(x)
@@ -138,7 +138,7 @@ def __init__(self, data, model, plot_settings_A=None, plot_settings_B=None, plot

plot_settings = [*plot_settings_B, *plot_settings_A, *plot_settings_B]

-        super().__init__(data, model, path, plot_id, plot_settings, **kwargs)
+        super().__init__(data, model, plot_id, plot_settings, **kwargs)

def predict(self, x):
x_ba, x_bab = self.model.forwardBAB(x)
@@ -169,7 +169,7 @@ def __init__(self, data, model, plot_settings_A=None, plot_settings_B=None, plot

plot_settings = [*plot_settings_A, *plot_settings_B]

-        super().__init__(data, model, path, plot_id, plot_settings, **kwargs)
+        super().__init__(data, model, plot_id, plot_settings, **kwargs)

def predict(self, input_data):
x_ab = self.model.forwardAB(input_data)
142 changes: 142 additions & 0 deletions itipy/data/geo_datasets.py
@@ -0,0 +1,142 @@
from __future__ import annotations
import collections
import collections.abc

# hyper requires the following four aliases to be set manually.
collections.Iterable = collections.abc.Iterable
collections.Mapping = collections.abc.Mapping
collections.MutableSet = collections.abc.MutableSet
collections.MutableMapping = collections.abc.MutableMapping

import logging
import time
import torch
import numpy as np
import xarray as xr
from typing import List, Union, Dict
from loguru import logger

from itipy.data.editor import Editor
from itipy.data.geo_editor import CenterWeightedCropDatasetEditor
from itipy.data.dataset import BaseDataset
from itipy.data.geo_utils import get_split, get_list_filenames, _check_any_constant_channels

class GeoDataset(BaseDataset):
    def __init__(
        self,
        data_dir: List[str],
        splits_dict: Dict,
        editors: List[Editor] = None,
        ext: str = "nc",
        limit: int = None,
        fov_radius: float = 0.6,
        load_coords: bool = True,
        load_cloudmask: bool = True,
        patch_size: tuple[int, int] = (256, 256),
        **kwargs
    ):
        """
        Initialize the GeoDataset class.

        Args:
            data_dir (List[str]): A list of directories containing the data files.
            splits_dict (Dict): A dictionary specifying the train/val splits for the dataset.
            editors (List[Editor], optional): A list of editors for data preprocessing. Defaults to None.
            ext (str, optional): The file extension of the data files. Defaults to "nc".
            limit (int, optional): The maximum number of files to load. Defaults to None.
            fov_radius (float, optional): The radius of the field of view. Defaults to 0.6.
            load_coords (bool, optional): Whether to load the coordinates. Defaults to True.
            load_cloudmask (bool, optional): Whether to load the cloud mask. Defaults to True.
            patch_size (tuple[int, int], optional): The size of the patches to crop. Defaults to (256, 256).
            **kwargs: Additional keyword arguments.
        """
        self.data_dir = data_dir
        self.editors = editors
        self.splits_dict = splits_dict
        self.ext = ext
        self.limit = limit
        self.fov_radius = fov_radius
        self.load_coords = load_coords
        self.load_cloudmask = load_cloudmask
        self.patch_size = patch_size

        self.files = self.get_files()

        self.crop = CenterWeightedCropDatasetEditor(patch_shape=self.patch_size, fov_radius=self.fov_radius)

        super().__init__(
            data=self.files,
            editors=self.editors,
            ext=self.ext,
            limit=self.limit,
            **kwargs
        )

    def get_files(self):
        # Collect filenames from data_dir, then filter them by the split criteria
        files = get_list_filenames(data_path=self.data_dir, ext=self.ext)
        files = get_split(files=files, split_dict=self.splits_dict)
        return files

    def __len__(self):
        return len(self.files)

    def getIndex(self, data_dict, idx):
        # Attempt to apply the editors; log and re-raise on failure
        try:
            return self.convertData(data_dict)
        except Exception as ex:
            logging.error('Unable to convert %s: %s', self.files[idx], ex)
            raise ex

    def __getitem__(self, idx):
        data_dict = {}

        ds: xr.Dataset = xr.load_dataset(self.files[idx], engine="netcdf4")
        if self.patch_size is not None:
            ds, xmin, ymin = self.crop(ds)
        else:
            xmin, ymin = 0, 0  # Set to 0 if no cropping is done

        # Extract radiances; delete the local reference to reduce memory usage
        data = ds.Rad.compute().to_numpy()
        data_dict["data"] = data
        del data

        # Extract wavelengths
        wavelengths = ds.band_wavelength.compute().to_numpy()
        data_dict["wavelengths"] = wavelengths
        del wavelengths

        # Extract coordinates
        if self.load_coords:
            latitude = ds.latitude.compute().to_numpy()
            longitude = ds.longitude.compute().to_numpy()
            coords = np.stack([latitude, longitude], axis=0)
            data_dict["coords"] = coords
            del latitude, longitude, coords

        # Extract cloud mask
        if self.load_cloudmask:
            cloud_mask = ds.cloud_mask.compute().to_numpy()
            data_dict["cloud_mask"] = cloud_mask
            del cloud_mask

        # Delete dataset to reduce memory usage
        del ds

        if self.editors is not None:
            # Apply editors
            data, _ = self.getIndex(data_dict, idx)

            # Warn about constant (zero-variance) channels in the cropped patch
            if np.any(np.nanstd(data, axis=(1, 2)) == 0):
                logger.warning(
                    f"Constant channel in patch (file: {self.files[idx]}, patch x/y: {xmin}/{ymin})"
                )
            return data
        else:
            return data_dict
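A usage sketch of the new dataset class (the path and split values below are assumptions modeled on the example Hydra config, not code from this PR):

from itipy.data.geo_datasets import GeoDataset

dataset = GeoDataset(
    data_dir=["/mnt/disks/eo-data/msg/"],  # hypothetical data directory
    splits_dict={"train": {"years": [2020], "months": [10], "days": list(range(20))}},
    editors=None,  # with no editors, __getitem__ returns the raw data_dict
    load_coords=False,
    load_cloudmask=False,
    patch_size=(256, 256),
)

sample = dataset[0]
print(sample["data"].shape, sample["wavelengths"])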


