diff --git a/.gitignore b/.gitignore index 1969ae4..ee0f8eb 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ __pychache__ .ipynb_checkpoints .gitignore venv +glc23_data \ No newline at end of file diff --git a/config/local.yaml b/config/local.yaml index f677dd4..1885629 100644 --- a/config/local.yaml +++ b/config/local.yaml @@ -1,7 +1,7 @@ -sent_data_path: "/data/jdolli/glc23_data/SatelliteImages/" -bioclim_path: "/shares/wegner.ics.uzh/glc23_data/bioclim+elev/bioclim_elevation_scaled_europe.npy" -dataset_file_path: "/shares/wegner.ics.uzh/glc23_data/Pot_10_to_1000.csv" -cp_dir_path: "/scratch/jdolli/sent-sinr/checkpoints" -logs_dir_path: "/scratch/jdolli/sent-sinr/" -test_data_path: "/shares/wegner.ics.uzh/glc23_data/Presence_Absence_surveys/Presences_Absences_train.csv" +sent_data_path: "glc23_data/SatelliteImages/" +bioclim_path: "glc23_data/bioclim+elev/bioclim_elevation_scaled_europe.npy" +dataset_file_path: "glc23_data/Pot_10_to_1000.csv" +cp_dir_path: "cps/" +logs_dir_path: "logs/" +test_data_path: "glc23_data/Presence_Absence_surveys/Presences_Absences_train.csv" gpu: True \ No newline at end of file diff --git a/dataset.py b/dataset.py index ba11f10..3e3c746 100644 --- a/dataset.py +++ b/dataset.py @@ -16,10 +16,21 @@ def __init__(self, params, dataset_file, predictors, bioclim_path, sent_data_pat # test_data is not used by the dataset itself, but the model needs this object with open(params.local.test_data_path, "r") as f: data_test = pd.read_csv(f, sep=";", header="infer", low_memory=False) - self.test_data = data_test.groupby(["patchID", "dayOfYear", "lon", "lat"]).agg( - {"speciesId": lambda x: list(x)}).reset_index() - self.test_data = {str(entry["lon"]) + "/" + str(entry["lat"]) + "/" + str(entry["dayOfYear"]) + "/" + str( - entry["patchID"]): entry["speciesId"] for idx, entry in self.test_data.iterrows()} + self.test_data = ( + data_test.groupby(["patchID", "dayOfYear", "lon", "lat"]) + .agg({"speciesId": lambda x: list(x)}) + .reset_index() + ) + self.test_data = { + str(entry["lon"]) + + "/" + + str(entry["lat"]) + + "/" + + str(entry["dayOfYear"]) + + "/" + + str(entry["patchID"]): entry["speciesId"] + for idx, entry in self.test_data.iterrows() + } self.predictors = predictors if "sent2" in predictors: @@ -28,14 +39,18 @@ def __init__(self, params, dataset_file, predictors, bioclim_path, sent_data_pat # The raster we are loading is already cropped to Europe and normalized context_feats = np.load(bioclim_path).astype(np.float32) self.raster = torch.from_numpy(context_feats) - self.raster[torch.isnan(self.raster)] = 0.0 # replace with mean value (0 is mean post-normalization) + self.raster[torch.isnan(self.raster)] = ( + 0.0 # replace with mean value (0 is mean post-normalization) + ) self.sent_data_path = sent_data_path - self.transforms = v2.Compose([ - v2.RandomHorizontalFlip(p=0.5), - v2.RandomVerticalFlip(p=0.5), - ]) + self.transforms = v2.Compose( + [ + v2.RandomHorizontalFlip(p=0.5), + v2.RandomVerticalFlip(p=0.5), + ] + ) def __len__(self): return len(self.data) @@ -50,7 +65,12 @@ def _normalize_loc_to_uniform(self, lon, lat): def _encode_loc(self, lon, lat): """Expects lon and lat to be scaled to [-1,1]""" - features = [np.sin(np.pi * lon), np.cos(np.pi * lon), np.sin(np.pi * lat), np.cos(np.pi * lat)] + features = [ + np.sin(np.pi * lon), + np.cos(np.pi * lon), + np.sin(np.pi * lat), + np.cos(np.pi * lat), + ] return np.stack(features, axis=-1) def sample_encoded_locs(self, size): @@ -61,7 +81,9 @@ def sample_encoded_locs(self, size): lat = lat * 2
- 1 loc_enc = torch.tensor(self._encode_loc(lon, lat), dtype=torch.float32) if "env" in self.predictors: - env_enc = bilinear_interpolate(torch.stack([torch.tensor(lon), torch.tensor(lat)], dim=1), self.raster) + env_enc = bilinear_interpolate( + torch.stack([torch.tensor(lon), torch.tensor(lat)], dim=1), self.raster + ) if "loc" in self.predictors: return torch.cat([loc_enc, env_enc], dim=1).type("torch.FloatTensor") else: @@ -82,7 +104,9 @@ def get_env_raster(self, lon, lat): def get_loc_env(self, lon, lat): """Given lon and lat, create the location and environmental embedding.""" lon_norm, lat_norm = self._normalize_loc_to_uniform(lon, lat) - loc_enc = torch.tensor(self._encode_loc(lon_norm, lat_norm), dtype=torch.float32) + loc_enc = torch.tensor( + self._encode_loc(lon_norm, lat_norm), dtype=torch.float32 + ) env_enc = self.get_env_raster(lon, lat).type("torch.FloatTensor") return torch.cat((loc_enc, env_enc.view(20))) @@ -108,8 +132,26 @@ def encode(self, lon, lat): def get_gbif_sent2(self, pid): """Get Sentinel-2 image for patch_id.""" - rgb_path = self.sent_data_path + "rgb/" + str(pid)[-2:] + "/" + str(pid)[-4:-2] + "/" + str(pid) + ".jpeg" - nir_path = self.sent_data_path + "nir/" + str(pid)[-2:] + "/" + str(pid)[-4:-2] + "/" + str(pid) + ".jpeg" + rgb_path = ( + self.sent_data_path + + "rgb/" + + str(pid)[-2:] + + "/" + + str(pid)[-4:-2] + + "/" + + str(pid) + + ".jpeg" + ) + nir_path = ( + self.sent_data_path + + "nir/" + + str(pid)[-2:] + + "/" + + str(pid)[-4:-2] + + "/" + + str(pid) + + ".jpeg" + ) rgb = Image.open(rgb_path) nir = Image.open(nir_path) img = torch.concat([self.to_tensor(rgb), self.to_tensor(nir)], dim=0) / 255 @@ -121,21 +163,39 @@ def __getitem__(self, idx): data_dict = self.data.iloc[idx] lon, lat = tuple(data_dict[["lon", "lat"]].to_numpy()) if "sent2" in self.predictors: - return self.encode(lon, lat), self.get_gbif_sent2(data_dict["patchID"]), torch.tensor( - data_dict["speciesId"]) + return ( + self.encode(lon, lat), + self.get_gbif_sent2(data_dict["patchID"]), + torch.tensor(data_dict["speciesId"]), + ) else: return self.encode(lon, lat), torch.tensor(data_dict["speciesId"]) def create_datasets(params): """Creates dataset and dataloaders from the various files""" - dataset_file = pd.read_csv(params.local.dataset_file_path, sep=";", header='infer', low_memory=False) + dataset_file = pd.read_csv( + params.local.dataset_file_path, sep=";", header="infer", low_memory=False + ) bioclim_path = params.local.bioclim_path - dataset = SINR_DS(params, dataset_file, params.dataset.predictors, sent_data_path=params.local.sent_data_path, - bioclim_path=bioclim_path) + dataset = SINR_DS( + params, + dataset_file, + params.dataset.predictors, + sent_data_path=params.local.sent_data_path, + bioclim_path=bioclim_path, + ) ds_train, ds_val = torch.utils.data.random_split(dataset, [0.9, 0.1]) - train_loader = torch.utils.data.DataLoader(ds_train, shuffle=True, batch_size=params.dataset.batchsize, - num_workers=params.dataset.num_workers) - val_loader = torch.utils.data.DataLoader(ds_val, shuffle=False, batch_size=params.dataset.batchsize, - num_workers=params.dataset.num_workers) + train_loader = torch.utils.data.DataLoader( + ds_train, + shuffle=True, + batch_size=params.dataset.batchsize, + num_workers=params.dataset.num_workers, + ) + val_loader = torch.utils.data.DataLoader( + ds_val, + shuffle=False, + batch_size=params.dataset.batchsize, + num_workers=params.dataset.num_workers, + ) return dataset, train_loader, val_loader diff --git a/embedders.py b/embedders.py 
index 985081e..bba9bfa 100644 --- a/embedders.py +++ b/embedders.py @@ -38,8 +38,7 @@ def __init__(self, layer_removed=1, hidden_dim=128): super().__init__() self.center_crop = torchvision.transforms.functional.center_crop self.layer_removed = layer_removed - layers = [torch.nn.Conv2d(4, 32, 4, 2, 1), - torch.nn.ReLU()] + layers = [torch.nn.Conv2d(4, 32, 4, 2, 1), torch.nn.ReLU()] for i in range(layer_removed): layers.append(torch.nn.Conv2d(32, 32, 3, 1, 1)) layers.append(torch.nn.ReLU()) @@ -71,7 +70,7 @@ def forward(self, tensor): def get_embedder(params): - if params.embedder == "ae_default": + if params.embedder == "cnn_default": return AE_DEFAULT() elif params.embedder.startswith("cnn_si"): return CNN_SMALLERINPUT(int(params.embedder[-1])) diff --git a/main.py b/main.py index e6f56ed..d68ca11 100644 --- a/main.py +++ b/main.py @@ -28,18 +28,30 @@ def get_logger(params, tag=""): name += " val" name += " " + tag - logger = hydra.utils.instantiate({"_target_": "pytorch_lightning.loggers.WandbLogger", - "name": name, - "save_dir": params.local.logs_dir_path, - "project": "sinr_on_glc23"}) + logger = hydra.utils.instantiate( + { + "_target_": "pytorch_lightning.loggers.WandbLogger", + "name": name, + "save_dir": params.local.logs_dir_path, + "project": "sinr_on_glc23", + } + ) return logger -def train_model(params, dataset, train_loader, val_loader, provide_model=None, logger=None, validate=False): +def train_model( + params, + dataset, + train_loader, + val_loader, + provide_model=None, + logger=None, + validate=False, +): """ Instantiates model, defines which epoch to save as checkpoint, and trains """ - torch.set_float32_matmul_precision('medium') + torch.set_float32_matmul_precision("medium") if not provide_model: if params.model == "sinr" or params.model == "log_reg": @@ -54,11 +66,17 @@ def train_model(params, dataset, train_loader, val_loader, provide_model=None, l monitor="val_loss", mode="min", dirpath=params.local.cp_dir_path, - filename=logger._name + "{val_loss:.4f}" + filename=logger._name + "{val_loss:.4f}", + ) + trainer = pl.Trainer( + max_epochs=params.epochs, + accelerator=("gpu" if params.local.gpu else "cpu"), + devices=1, + precision="16-mixed", + logger=logger, + log_every_n_steps=50, + callbacks=[checkpoint_callback], ) - trainer = pl.Trainer(max_epochs=params.epochs, accelerator=("gpu" if params.local.gpu else "cpu"), devices=1, - precision="16-mixed", logger=logger, log_every_n_steps=50, - callbacks=[checkpoint_callback]) if validate: trainer.validate(model=model, dataloaders=[val_loader]) else: @@ -68,22 +86,35 @@ def train_model(params, dataset, train_loader, val_loader, provide_model=None, l def load_cp(params, dataset): """Loads checkpoint.""" if params.model == "sinr" or params.model == "log_reg": - model = SINR.load_from_checkpoint(params.checkpoint, params=params, dataset=dataset) + model = SINR.load_from_checkpoint( + params.checkpoint, params=params, dataset=dataset + ) elif "sat" in params.model: - model = SAT_SINR.load_from_checkpoint(params.checkpoint, params=params, dataset=dataset, - sent2_net=get_embedder(params)) + model = SAT_SINR.load_from_checkpoint( + params.checkpoint, + params=params, + dataset=dataset, + sent2_net=get_embedder(params), + ) return model -@hydra.main(version_base=None, config_path='config', config_name='base_config.yaml') +@hydra.main(version_base=None, config_path="config", config_name="base_config.yaml") def main(params): """main funct.""" dataset, train_loader, val_loader = create_datasets(params) logger = get_logger(params, 
tag=params.tag) if params.checkpoint != "None": model = load_cp(params, dataset) - train_model(params, dataset, train_loader, val_loader, provide_model=model, logger=logger, - validate=params.validate) + train_model( + params, + dataset, + train_loader, + val_loader, + provide_model=model, + logger=logger, + validate=params.validate, + ) else: train_model(params, dataset, train_loader, val_loader, logger=logger) wandb.finish() diff --git a/models.py b/models.py index f7dec3f..1e31632 100644 --- a/models.py +++ b/models.py @@ -14,7 +14,7 @@ def __init__(self, hidden_dim, dropout): torch.nn.ReLU(), torch.nn.Dropout(dropout), torch.nn.Linear(hidden_dim, hidden_dim), - torch.nn.ReLU() + torch.nn.ReLU(), ) def forward(self, x): @@ -45,7 +45,8 @@ def forward(self, x): class SINR(pl.LightningModule): """Base SINR, including metric calculations used in all models. - Also includes the log_reg implementation, replacing the SINR_net with a single layer.""" + Also includes the log_reg implementation, replacing the SINR_net with a single layer. + """ def __init__(self, params, dataset, **kwargs): super().__init__(**kwargs) @@ -61,8 +62,12 @@ def __init__(self, params, dataset, **kwargs): if params.model == "log_reg": self.net = torch.nn.Linear(input_len, 10040) elif params.model == "sinr": - self.net = SINR_Net(input_len, hidden_dim=params.sinr_hidden, dropout=params.dropout, - layers=params.sinr_layers) + self.net = SINR_Net( + input_len, + hidden_dim=params.sinr_hidden, + dropout=params.dropout, + layers=params.sinr_layers, + ) self.dataset = dataset self.test_data = dataset.test_data @@ -70,7 +75,7 @@ def __init__(self, params, dataset, **kwargs): self.max_weighted_roc_auc = 0 self.max_micro_f1 = 0 - self.save_hyperparameters(ignore=['dataset']) + self.save_hyperparameters(ignore=["dataset"]) def forward(self, x): return self.net(x) @@ -78,7 +83,9 @@ def forward(self, x): def apply_model_and_an_full_loss(self, batch, dataset, params): """Get x, sample random background samples, process both through the network, and calculate the loss.""" loc_features, labels = batch - random_loc_features = dataset.sample_encoded_locs(len(loc_features)).to(labels.device) + random_loc_features = dataset.sample_encoded_locs(len(loc_features)).to( + labels.device + ) loc_pred = torch.sigmoid(self(loc_features)) rand_pred = torch.sigmoid(self(random_loc_features)) @@ -92,7 +99,9 @@ def apply_model_and_an_full_loss(self, batch, dataset, params): # Assume all classes at the random background locations to be absent loss_bg = -torch.log((1 - rand_pred) + 1e-5) # For the confirmed occurrences, switch the sign of the predicted probability and upscale with pos_weight - loss_pos[inds, labels] = params.pos_weight * -torch.log(loc_pred[inds, labels] + 1e-5) + loss_pos[inds, labels] = params.pos_weight * -torch.log( + loc_pred[inds, labels] + 1e-5 + ) return loss_pos.mean() + loss_bg.mean() @@ -180,13 +189,14 @@ def validation_step(self, batch, batch_nb): return {"loss": loss, "progress_bar": float(loss_detached)} def configure_optimizers(self): - opt = torch.optim.Adam(self.net.parameters(), lr=self.params.lr, - weight_decay=self.params.l2_dec) + opt = torch.optim.Adam( + self.net.parameters(), lr=self.params.lr, weight_decay=self.params.l2_dec + ) return opt class SAT_SINR(SINR): - """Abstract Sat-Sinr with adapted loss """ + """Abstract Sat-Sinr with adapted loss""" def __init__(self, params, dataset, sent2_net, **kwargs): super().__init__(params, dataset, **kwargs) @@ -194,8 +204,12 @@ def __init__(self, params, dataset, 
sent2_net, **kwargs): self.net.sent2_net = sent2_net self.dataset = dataset # Instantiate another DataLoader from the dataset to serve as background samples - self.re_dl = torch.utils.data.DataLoader(dataset, shuffle=True, batch_size=params.dataset.batchsize, - num_workers=params.dataset.num_workers) + self.re_dl = torch.utils.data.DataLoader( + dataset, + shuffle=True, + batch_size=params.dataset.batchsize, + num_workers=params.dataset.num_workers, + ) # Instantiate iterator from the dataloader self.re_iter = iter(self.re_dl) @@ -211,19 +225,27 @@ def apply_model_and_an_full_loss(self, batch, dataset, params): random_loc_features, random_sent2, _ = next(self.re_iter) rand_pred = torch.sigmoid( - self.net((random_loc_features.to(loc_features.device), random_sent2.to(loc_features.device)))) + self.net( + ( + random_loc_features.to(loc_features.device), + random_sent2.to(loc_features.device), + ) + ) + ) loc_pred = torch.sigmoid(self.net((loc_features, sent2_images))) # Make sure that all have the same length (Avoiding the edge-case of last batch in dl being smaller than rest). - rand_pred = rand_pred[:len(loc_pred)] - loc_pred = loc_pred[:len(rand_pred)] - labels = labels[:len(loc_pred)] + rand_pred = rand_pred[: len(loc_pred)] + loc_pred = loc_pred[: len(rand_pred)] + labels = labels[: len(loc_pred)] inds = torch.arange(len(labels)) loss_pos = -torch.log((1 - loc_pred) + 1e-5) loss_bg = -torch.log((1 - rand_pred) + 1e-5) - loss_pos[inds, labels] = params.pos_weight * -torch.log(loc_pred[inds, labels] + 1e-5) + loss_pos[inds, labels] = params.pos_weight * -torch.log( + loc_pred[inds, labels] + 1e-5 + ) return loss_pos.mean() + loss_bg.mean() @@ -247,7 +269,12 @@ def __init__(self, params, sat_only=False): inp_size += 4 if "env" in params.dataset.predictors: inp_size += 20 - self.net = SINR_Net(inp_size, hidden_dim=params.sinr_hidden, dropout=params.dropout, layers=params.sinr_layers) + self.net = SINR_Net( + inp_size, + hidden_dim=params.sinr_hidden, + dropout=params.dropout, + layers=params.sinr_layers, + ) self.sent2_to_classes = torch.nn.Linear(256, 10040) self.sat_only = sat_only @@ -274,7 +301,7 @@ def __init__(self, hidden_dim, dropout): torch.nn.ReLU(), torch.nn.Dropout(dropout), torch.nn.Linear(hidden_dim, hidden_dim), - torch.nn.ReLU() + torch.nn.ReLU(), ) self.embedder = torch.nn.Linear(256, hidden_dim) # Init embedder weights to zero @@ -295,7 +322,9 @@ def __init__(self, input_len=4, hidden_dim=256, dropout=0.5, layers=4): self.inp_l = torch.nn.Linear(input_len, hidden_dim) self.relu = torch.nn.ReLU() - self.resid_l = torch.nn.Sequential(*[ContextResidLayer(hidden_dim, dropout) for i in range(layers)]) + self.resid_l = torch.nn.Sequential( + *[ContextResidLayer(hidden_dim, dropout) for i in range(layers)] + ) self.classifier = self.net = torch.nn.Linear(hidden_dim, 10040) @@ -320,8 +349,12 @@ def __init__(self, params): inp_size += 4 if "env" in params.dataset.predictors: inp_size += 20 - self.net = Context_SINR_Net(inp_size, hidden_dim=params.sinr_hidden, dropout=params.dropout, - layers=params.sinr_layers) + self.net = Context_SINR_Net( + inp_size, + hidden_dim=params.sinr_hidden, + dropout=params.dropout, + layers=params.sinr_layers, + ) self.predictors = params.dataset.predictors def forward(self, x): @@ -345,8 +378,12 @@ def __init__(self, params, enc_dim=24): inp_size += 4 if "env" in params.dataset.predictors: inp_size += 20 - self.net = SINR_Net(inp_size + enc_dim, hidden_dim=params.sinr_hidden, dropout=params.dropout, - layers=params.sinr_layers) + self.net = SINR_Net( 
+ inp_size + enc_dim, + hidden_dim=params.sinr_hidden, + dropout=params.dropout, + layers=params.sinr_layers, + ) self.sent2_to_input = torch.nn.Linear(256, enc_dim) def forward(self, x): diff --git a/requirements.txt b/requirements.txt index 058f2d5..fb8b528 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,7 @@ wandb pytorch_lightning scikit-learn jupyter -pygbif \ No newline at end of file +pygbif +rasterio +pyproj +rioxarray \ No newline at end of file diff --git a/scripts/create data visuals.ipynb b/scripts/create data visuals.ipynb deleted file mode 100644 index 4c4ac8a..0000000 --- a/scripts/create data visuals.ipynb +++ /dev/null @@ -1,183 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "6be70676-5ba3-4533-ad46-81a21f3f59d5", - "metadata": {}, - "source": [ - "### Visualizing the BIOS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7c28597f-b638-4c4f-a784-b70a11ffd7d4", - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "import torch\n", - "import numpy as np\n", - "import os\n", - "\n", - "DATA_DIR = \"/data/jdolli/\"\n", - "\n", - "bioses = os.listdir(DATA_DIR + \"bioclim+elev\")\n", - "\n", - "for sid in range(20):\n", - " \n", - " #vmin = min(preds[:, sid].min(), 1 - preds[:, sid].max())\n", - " #vmax = max(preds[:, sid].max(), 1 - preds[:, sid].min())\n", - " \n", - " vmin = preds[:, sid].min()\n", - " vmax = preds[:, sid].max()\n", - "\n", - " mask = np.load(os.path.join(DATA_DIR + \"glc23_data/sinr_data/data/masks\", 'ocean_mask_hr.npy'))\n", - " lon_res = mask.shape[1] / 360\n", - " lat_res = mask.shape[0] / 180\n", - " north = int((90-max_lat) * lat_res)\n", - " south = int((90-min_lat) * lat_res)\n", - " west = int((180 + min_lon) * lon_res)\n", - " east = int((180 + max_lon) * lon_res)\n", - " mask = mask[north:south, west:east]\n", - "\n", - " fig, ax = plt.subplots(figsize=(6, 4))\n", - " ax.set_xlim([-10.53904, 34.55792])\n", - " ax.set_ylim([34.56858, 71.18392])\n", - " cmap = plt.cm.plasma\n", - " cmap.set_bad(color='none')\n", - " mask_inds = np.where(mask.reshape(-1) == 1)[0]\n", - " \n", - " im = preds[:, sid]\n", - " print(\"Bios:\", bioses[sid], im.min().item(), im.max().item())\n", - " im = torch.rot90(im.view(RES_LON, RES_LAT))\n", - " im = torch.reshape(im, (RES_LAT * RES_LON, 1))\n", - " im = im[mask_inds]\n", - " \n", - " op_im = np.ones(mask.shape[0] * mask.shape[1]) * np.nan\n", - " op_im[mask_inds] = im.detach().view(len(mask_inds)).numpy()\n", - " op_im = np.ma.masked_invalid(op_im)\n", - " op_im = op_im.reshape(RES_LAT, RES_LON)\n", - " \n", - " TRESHHOLD = 0\n", - " if TRESHHOLD > 0:\n", - " #op_im[op_im > TRESHHOLD] = 1\n", - " op_im[op_im <= TRESHHOLD] = 0\n", - " \n", - " im = ax.imshow(op_im, extent=(-10.53904, 34.55792, 34.56858, 71.18392), vmin=vmin, vmax=vmax, cmap=cmap)\n", - " \n", - " fig.colorbar(im, ax=ax)\n", - " \n", - " plt.show()\n", - " " - ] - }, - { - "cell_type": "markdown", - "id": "4eff0342-fb0c-4f4c-b243-69ff51a127e4", - "metadata": {}, - "source": [ - "# Create distribution graphic" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7af97e58-667c-4cc5-ab2c-47febe16b790", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "from collections import Counter" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1da1adf4-3d03-4d49-a050-76f99917ef43", - "metadata": {}, - "outputs": [], - "source": [ - "DATA_DIR = \"/data/jdolli/\"\n", - 
"#dataset_file = pd.read_csv(DATA_DIR + 'Presences_only_train.csv', sep=\";\", header='infer', low_memory=False)\n", - "dataset_file = pd.read_csv(DATA_DIR + 'Pot_10_to_1000.csv', sep=\";\", header='infer', low_memory=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2dc77140-84b6-4048-9ef8-42c971b3c1c1", - "metadata": {}, - "outputs": [], - "source": [ - "ids = dataset_file[\"speciesId\"].to_numpy()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "48d23400-5478-4dd4-ba8b-08fdeef0d998", - "metadata": {}, - "outputs": [], - "source": [ - "counter = Counter(ids)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "88ef5578-9c4b-431c-aa01-775dcff27595", - "metadata": {}, - "outputs": [], - "source": [ - "amounts = counter.values()\n", - "amounts = list(amounts)\n", - "amounts.sort(reverse=True)\n", - "print(len(amounts))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6ecd3034-312e-47f7-acc7-ccdfa5fcf43e", - "metadata": {}, - "outputs": [], - "source": [ - "fig = plt.figure(figsize = (10, 5))\n", - "plt.bar([i for i in range(len(amounts))], amounts)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d447c0d9-7c28-4bb2-a86d-6de3a4754a95", - "metadata": {}, - "outputs": [], - "source": [ - "fig.savefig(\"./class_distribution.png\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "glc23", - "language": "python", - "name": "glc23" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.16" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/scripts/create prediction visuals.ipynb b/scripts/create prediction visuals.ipynb index a6440e3..67b09e5 100644 --- a/scripts/create prediction visuals.ipynb +++ b/scripts/create prediction visuals.ipynb @@ -37,9 +37,14 @@ "metadata": {}, "outputs": [], "source": [ - "Data_FOLDER = \"/data/jdolli/\"\n", - "#dataset_file = pd.read_csv(Data_FOLDER + 'glc23_data/Pot_10_to_1000.csv', sep=\";\", header='infer', low_memory=False)\n", - "dataset_file = pd.read_csv(Data_FOLDER + 'glc23_data/Pot_10_to_1000_nofrance.csv', sep=\";\", header='infer', low_memory=False)" + "Data_FOLDER = \"glc23_data/\"\n", + "# dataset_file = pd.read_csv(Data_FOLDER + 'Pot_10_to_1000.csv', sep=\";\", header='infer', low_memory=False)\n", + "dataset_file = pd.read_csv(\n", + " Data_FOLDER + \"Pot_10_to_1000_nofrance.csv\",\n", + " sep=\";\",\n", + " header=\"infer\",\n", + " low_memory=False,\n", + ")" ] }, { @@ -70,6 +75,7 @@ "# Code-Snippet to match GLC23 occurrence with gbif occurrence to retrieve the original species\n", "print(dataset_file[dataset_file[\"speciesId\"] == 265].iloc[0])\n", "import pygbif\n", + "\n", "# Replace key with key from previously printed snippet\n", "pygbif.occurrences.get(key=3951621754)" ] @@ -92,7 +98,12 @@ " sinr = True\n", " PREDICTORS = \"loc\"\n", " bioclim_path = Data_FOLDER + \"bioclim+elev/bioclim_elevation_scaled_europe.npy\"\n", - " dataset = SINR_DS(dataset_file, PREDICTORS, sent_data_path = Data_FOLDER + \"glc23_data/SatelliteImages/\", bioclim_path = bioclim_path, use_subm_val=False)\n", + " dataset = SINR_DS(\n", + " dataset_file,\n", + " PREDICTORS,\n", + " sent_data_path=Data_FOLDER + \"SatelliteImages/\",\n", + " bioclim_path=bioclim_path,\n", + " )\n", "\n", " default_params = DefaultParams(sinr)\n", " 
default_params.dataset.predictors = PREDICTORS\n", @@ -143,8 +154,8 @@ " # i is lon\n", " # j is lat\n", " for j in range(RES_LAT):\n", - " lon = i/RES_LON\n", - " lat = j/RES_LAT\n", + " lon = i / RES_LON\n", + " lat = j / RES_LAT\n", " lon = lon * (max_lon - min_lon) + min_lon\n", " lat = lat * (max_lat - min_lat) + min_lat\n", " locs.append(dataset.encode(lon, lat))\n", @@ -157,25 +168,30 @@ " # i is lon\n", " # j is lat\n", " for j in range(RES_LAT):\n", - " lon = i/RES_LON\n", - " lat = j/RES_LAT\n", + " lon = i / RES_LON\n", + " lat = j / RES_LAT\n", " lon = lon * (max_lon - min_lon) + min_lon\n", " lat = lat * (max_lat - min_lat) + min_lat\n", " loc = dataset.encode(lon, lat)\n", " pos = str(lat) + \",\" + str(lon)\n", - " rgb_path = \"/data/jdolli/sentinel_2 2021 Europe/rgb/\" + pos + \".jpeg\"\n", - " nir_path = \"/data/jdolli/sentinel_2 2021 Europe/nir/\" + pos + \".jpeg\"\n", + " # Requires downloading and cropping the corresponding Sentinel-2 images from the Ecodatacube\n", + " rgb_path = \"sentinel_2 2021 Europe/rgb/\" + pos + \".jpeg\"\n", + " nir_path = \"sentinel_2 2021 Europe/nir/\" + pos + \".jpeg\"\n", " try:\n", " rgb = Image.open(rgb_path)\n", " nir = Image.open(nir_path)\n", " to_tensor = torchvision.transforms.PILToTensor()\n", - " sent2 = torch.concat([to_tensor(rgb), to_tensor(nir)], dim=0)/255\n", + " sent2 = torch.concat([to_tensor(rgb), to_tensor(nir)], dim=0) / 255\n", " except:\n", " sent2 = torch.zeros(4, 128, 128)\n", " if sent2.shape != torch.Size([4, 128, 128]):\n", " sent2 = torch.zeros(4, 128, 128)\n", " with torch.no_grad():\n", - " preds.append(model.net((loc.to(\"cuda\"), sent2.to(\"cuda\")), no_sent2=False).detach().to(\"cpu\"))\n", + " preds.append(\n", + " model.net((loc.to(\"cuda\"), sent2.to(\"cuda\")), no_sent2=False)\n", + " .detach()\n", + " .to(\"cpu\")\n", + " )\n", + " preds = torch.stack(preds).sigmoid()\n", " return preds" ] }, @@ -190,7 +206,7 @@ "def print_and_save_res(preds, name, NOCCS=False, FRANCE_ONLY=False, STD=False):\n", "\n", " try:\n", - " os.mkdir(\"./visuals/\"+name)\n", + " os.mkdir(\"./visuals/\" + name)\n", " except:\n", " pass\n", " NUM_SAMPLES = len(to_map)\n", @@ -207,11 +223,14 @@ " lon_occs = occs[\"lon\"].to_numpy()\n", " lat_occs = occs[\"lat\"].to_numpy()\n", "\n", - " mask = np.load(os.path.join(\"/data/jdolli/glc23_data/sinr_data/data/masks\", 'ocean_mask_hr.npy'))\n", + " # Ocean mask can be downloaded from the original SINR repo\n", + " mask = np.load(\n", + " os.path.join(Data_FOLDER + \"sinr_data/data/masks\", \"ocean_mask_hr.npy\")\n", + " )\n", " lon_res = mask.shape[1] / 360\n", " lat_res = mask.shape[0] / 180\n", - " north = int((90-max_lat) * lat_res)\n", - " south = int((90-min_lat) * lat_res)\n", + " north = int((90 - max_lat) * lat_res)\n", + " south = int((90 - min_lat) * lat_res)\n", " west = int((180 + min_lon) * lon_res)\n", " east = int((180 + max_lon) * lon_res)\n", " mask = mask[north:south, west:east]\n", @@ -224,11 +243,18 @@ " ax.set_xlim([-4.807615, 8.238722])\n", " ax.set_ylim([42.325170, 51.235825])\n", " cmap = plt.cm.plasma\n", - " cmap.set_bad(color='none')\n", + " cmap.set_bad(color=\"none\")\n", " mask_inds = np.where(mask.reshape(-1) == 1)[0]\n", "\n", " im = preds[:, to_map[sid]]\n", - " print(\"SpeciesId:\", to_map[sid], \"; Num samples:\", num_samples[sid], im.min().item(), im.max().item())\n", + " print(\n", + " \"SpeciesId:\",\n", + " to_map[sid],\n", + " \"; Num samples:\",\n", + " num_samples[sid],\n", + " im.min().item(),\n", + " im.max().item(),\n", + " )\n", " im = 
torch.rot90(im.view(RES_LON, RES_LAT))\n", " im = torch.reshape(im, (RES_LAT * RES_LON, 1))\n", " im = im[mask_inds]\n", @@ -240,24 +266,43 @@ "\n", " TRESHHOLD = 0\n", " if TRESHHOLD > 0:\n", - " #op_im[op_im > TRESHHOLD] = 1\n", + " # op_im[op_im > TRESHHOLD] = 1\n", " op_im[op_im <= TRESHHOLD] = 0\n", "\n", " if FRANCE_ONLY:\n", - " op_im = op_im[408-186:408-86, 64:209]\n", + " op_im = op_im[408 - 186 : 408 - 86, 64:209]\n", "\n", " if FRANCE_ONLY:\n", - " im = ax.imshow(op_im, extent=(-4.807615, 8.238722, 42.325170, 51.235825), vmin=vmin, vmax=vmax, cmap=cmap)\n", + " im = ax.imshow(\n", + " op_im,\n", + " extent=(-4.807615, 8.238722, 42.325170, 51.235825),\n", + " vmin=vmin,\n", + " vmax=vmax,\n", + " cmap=cmap,\n", + " )\n", " else:\n", - " im = ax.imshow(op_im, extent=(-10.53904, 34.55792, 34.56858, 71.18392), vmin=vmin, vmax=vmax, cmap=cmap)\n", + " im = ax.imshow(\n", + " op_im,\n", + " extent=(-10.53904, 34.55792, 34.56858, 71.18392),\n", + " vmin=vmin,\n", + " vmax=vmax,\n", + " cmap=cmap,\n", + " )\n", " if not NOCCS:\n", " ax.scatter(lon_occs, lat_occs, c=\"lime\", alpha=0.5, s=3)\n", "\n", " if not name == \"only_dist\":\n", " fig.colorbar(im, ax=ax)\n", "\n", - " fig.savefig(\"./visuals/\"+name+\"/\"+str(to_map[sid])+(\"_noccs\" if NOCCS else \"\")+(\"_france\" if FRANCE_ONLY else \"\")\n", - " +(\"_std\" if STD else \"\"))\n", + " fig.savefig(\n", + " \"./visuals/\"\n", + " + name\n", + " + \"/\"\n", + " + str(to_map[sid])\n", + " + (\"_noccs\" if NOCCS else \"\")\n", + " + (\"_france\" if FRANCE_ONLY else \"\")\n", + " + (\"_std\" if STD else \"\")\n", + " )\n", "\n", " plt.show()" ] @@ -269,11 +314,7 @@ "metadata": {}, "outputs": [], "source": [ - "checkpoint_names = [\n", - " \"\",\n", - " \"\",\n", - " \"\"\n", - "]\n", + "checkpoint_names = [\"\", \"\", \"\"]\n", "preds = []\n", "for name in checkpoint_names:\n", " model, sinr, dataset = get_model(name)\n", @@ -300,8 +341,53 @@ "metadata": {}, "outputs": [], "source": [ - "to_map = [265, 268,271,439,751,905,966,1122,1224,1303,1559,1957,2071,2854,3207,3384,3947,4062,4269,4501,5022,5113,5400,5793,6510,6612,6895,6922,7519,7580,\n", - " 7760,8023,8196,8267,8586,8791,8994,9170,9240,9315,9509,9753,9761,9807,9983] # classes present in the first PA sample\n", + "to_map = [\n", + " 265,\n", + " 268,\n", + " 271,\n", + " 439,\n", + " 751,\n", + " 905,\n", + " 966,\n", + " 1122,\n", + " 1224,\n", + " 1303,\n", + " 1559,\n", + " 1957,\n", + " 2071,\n", + " 2854,\n", + " 3207,\n", + " 3384,\n", + " 3947,\n", + " 4062,\n", + " 4269,\n", + " 4501,\n", + " 5022,\n", + " 5113,\n", + " 5400,\n", + " 5793,\n", + " 6510,\n", + " 6612,\n", + " 6895,\n", + " 6922,\n", + " 7519,\n", + " 7580,\n", + " 7760,\n", + " 8023,\n", + " 8196,\n", + " 8267,\n", + " 8586,\n", + " 8791,\n", + " 8994,\n", + " 9170,\n", + " 9240,\n", + " 9315,\n", + " 9509,\n", + " 9753,\n", + " 9761,\n", + " 9807,\n", + " 9983,\n", + "] # classes present in the first PA sample\n", "# to_map = random.sample(c.keys(), NUM_SAMPLES)\n", "to_map = [265]\n", "num_samples = [c[sid] for sid in to_map]\n", diff --git a/scripts/crop and scale bioclim.ipynb b/scripts/crop and scale bioclim.ipynb new file mode 100644 index 0000000..3278890 --- /dev/null +++ b/scripts/crop and scale bioclim.ipynb @@ -0,0 +1,124 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "ExecuteTime": { + "start_time": "2024-05-28T12:39:05.996048Z", + "end_time": "2024-05-28T12:39:06.044610Z" + } + }, + "outputs": [], + "source": [ + "import numpy 
as np\n", + "import os\n", + "import rasterio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# Put all 19 files in this folder along with the elevation-tif\n", + "# Important to also use the 1km elevation file to match the bio-files resolution\n", + "DATA_PATH = \"glc23_data/bioclim+elev\"\n", + "raster_files = os.listdir(DATA_PATH)\n", + "print(raster_files)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "start_time": "2024-05-28T12:42:38.286107Z", + "end_time": "2024-05-28T12:42:38.328338Z" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "rasters = []\n", + "for raster in raster_files:\n", + " file = rasterio.open(DATA_PATH + raster)\n", + " rasters.append(file.read(1))\n", + "rasters = np.stack(rasters, axis=-1)\n", + "print(rasters.shape)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "start_time": "2024-05-28T12:43:38.170598Z", + "end_time": "2024-05-28T12:43:41.171427Z" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "means = np.nanmean(rasters, axis=(0, 1), dtype=np.float64)\n", + "stds = np.nanstd(rasters, axis=(0, 1), dtype=np.float64)\n", + "rasters = (rasters - means) / stds" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "start_time": "2024-05-28T12:43:47.433843Z", + "end_time": "2024-05-28T12:43:52.154686Z" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# These bounds are lon -11 to 35, and lat 34 to 72\n", + "rasters = rasters[2159:6719, 20279:25799]\n", + "np.save(DATA_PATH + \"/bioclim_elevation_scaled_europe.npy\", rasters)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "start_time": "2024-05-28T12:43:58.082297Z", + "end_time": "2024-05-28T12:43:58.084246Z" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [], + "metadata": { + "collapsed": false + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/scripts/crop sentinel2 images from ecodatacube.ipynb b/scripts/crop sentinel2 images from ecodatacube.ipynb new file mode 100644 index 0000000..f7b61f4 --- /dev/null +++ b/scripts/crop sentinel2 images from ecodatacube.ipynb @@ -0,0 +1,134 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "### !!! THIS FILE IS NOT FUNCTIONAL, BUT SHOWCASES THE PROCESS OF IMAGE CREATION FROM THE ECODATACUBE !!!" 
+ ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "import os\n", + "import rioxarray\n", + "import rasterio\n", + "import numpy as np\n", + "from tqdm import tqdm\n", + "from pyproj import Transformer" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "TIFF_DIR = \"sent2_files_2021/\"\n", + "os.listdir(TIFF_DIR)\n", + "\n", + "color = \"rgb\"\n", + "\n", + "red = rioxarray.open_rasterio(\n", + "    TIFF_DIR\n", + "    + \"lcv_\"\n", + "    + \"red\"\n", + "    + \"_sentinel.s2l2a_p50_10m_0..0cm_2021.03.21..2021.12.01_eumap_epsg3035_v1.0.tif\"\n", + ")\n", + "red_rio = rasterio.open(\n", + "    TIFF_DIR\n", + "    + \"lcv_\"\n", + "    + \"red\"\n", + "    + \"_sentinel.s2l2a_p50_10m_0..0cm_2021.03.21..2021.12.01_eumap_epsg3035_v1.0.tif\"\n", + ")\n", + "blue = rioxarray.open_rasterio(\n", + "    TIFF_DIR\n", + "    + \"lcv_\"\n", + "    + \"blue\"\n", + "    + \"_sentinel.s2l2a_p50_10m_0..0cm_2021.03.21..2021.12.01_eumap_epsg3035_v1.0.tif\"\n", + ")\n", + "green = rioxarray.open_rasterio(\n", + "    TIFF_DIR\n", + "    + \"lcv_\"\n", + "    + \"green\"\n", + "    + \"_sentinel.s2l2a_p50_10m_0..0cm_2021.03.21..2021.12.01_eumap_epsg3035_v1.0.tif\"\n", + ")\n", + "# nir = rioxarray.open_rasterio(TIFF_DIR + 'lcv_' + \"nir\" + '_sentinel.s2l2a_p50_10m_0..0cm_2021.03.21..2021.12.01_eumap_epsg3035_v1.0.tif')\n", + "\n", + "tf = Transformer.from_crs(\"epsg:4326\", \"epsg:3035\")\n", + "CLIP = 10000\n", + "\n", + "\n", + "def get_image_pixels_raster(lon, lat, dir_path=\"\"):\n", + "\n", + "    y, x = tf.transform(lat, lon)\n", + "    south, east = red_rio.index(x, y)\n", + "    hs = 64\n", + "    r = red[0, east - hs : east + hs, south - hs : south + hs]\n", + "    g = green[0, east - hs : east + hs, south - hs : south + hs]\n", + "    b = blue[0, east - hs : east + hs, south - hs : south + hs]\n", + "    try:\n", + "        img = np.stack([r, g, b], axis=-1)\n", + "        img[img > CLIP] = CLIP\n", + "        img = (img - img.min(axis=0).min(axis=0)) / (\n", + "            img.max(axis=0).max(axis=0) - img.min(axis=0).min(axis=0)\n", + "        )\n", + "        img = np.power(img, 1 / 2.5)  # gamma correction\n", + "        img = (img * 255).astype(np.uint8)\n", + "    except:\n", + "        pass\n", + "\n", + "    # Save image\n", + "\n", + "\n", + "RES_LON = 502\n", + "RES_LAT = 408\n", + "max_lon = 34.55792\n", + "min_lon = -10.53904\n", + "max_lat = 71.18392\n", + "min_lat = 34.56858\n", + "\n", + "for i in tqdm(range(RES_LON)):\n", + "    # i is lon\n", + "    # j is lat\n", + "    for j in range(RES_LAT):\n", + "        lon = i / RES_LON\n", + "        lat = j / RES_LAT\n", + "        lon = lon * (max_lon - min_lon) + min_lon\n", + "        lat = lat * (max_lat - min_lat) + min_lat\n", + "\n", + "        # get_image_pixels_raster(lon, lat)" + ], + "metadata": { + "collapsed": false + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/scripts/reduce_classes.ipynb b/scripts/reduce classes.ipynb similarity index 76% rename from scripts/reduce_classes.ipynb rename to scripts/reduce classes.ipynb index bab36ab..f8a43ce 100644 --- a/scripts/reduce_classes.ipynb +++ b/scripts/reduce classes.ipynb @@ -27,8 +27,10 @@ "metadata": {}, "outputs": [],
"source": [ - "DATA_FOLDER = '/data/jdolli/glc23_data/'\n", - "ds = pd.read_csv(DATA_FOLDER + 'Presences_only_train.csv', sep=\";\", header='infer', low_memory=False)" + "DATA_FOLDER = \"glc23_data/\"\n", + "ds = pd.read_csv(\n", + " DATA_FOLDER + \"Presences_only_train.csv\", sep=\";\", header=\"infer\", low_memory=False\n", + ")" ] }, { @@ -52,7 +54,9 @@ "source": [ "for species_id in tqdm(c.keys()):\n", " if c[species_id] > 1000:\n", - " ds = ds.drop(ds.loc[ds[\"speciesId\"] == species_id].sample(c[species_id]-1000).index)\n", + " ds = ds.drop(\n", + " ds.loc[ds[\"speciesId\"] == species_id].sample(c[species_id] - 1000).index\n", + " )\n", " elif c[species_id] < 10:\n", " ds = ds.drop(ds.loc[ds[\"speciesId\"] == species_id].index)" ] @@ -76,7 +80,7 @@ "metadata": {}, "outputs": [], "source": [ - "ds.to_csv('/data/jdolli/glc23_data/Pot_10_to_1000.csv', sep=\";\")" + "ds.to_csv(DATA_FOLDER + \"Pot_10_to_1000.csv\", sep=\";\")" ] }, { @@ -86,7 +90,9 @@ "metadata": {}, "outputs": [], "source": [ - "test = pd.read_csv('/data/jdolli/glc23_data/Pot_10_to_1000.csv', sep=\";\", header='infer', low_memory=False)" + "test = pd.read_csv(\n", + " DATA_FOLDER + \"Pot_10_to_1000.csv\", sep=\";\", header=\"infer\", low_memory=False\n", + ")" ] }, { @@ -109,7 +115,7 @@ "metadata": {}, "outputs": [], "source": [ - "#reduce number of samples" + "# reduce number of samples" ] }, { @@ -119,7 +125,9 @@ "metadata": {}, "outputs": [], "source": [ - "ds = pd.read_csv('/data/jdolli/glc23_data/Pot_10_to_1000.csv', sep=\";\", header='infer', low_memory=False)" + "ds = pd.read_csv(\n", + " DATA_FOLDER + \"Pot_10_to_1000.csv\", sep=\";\", header=\"infer\", low_memory=False\n", + ")" ] }, { @@ -130,13 +138,13 @@ "outputs": [], "source": [ "red = 32\n", - "new_len = len(ds)//red\n", + "new_len = len(ds) // red\n", "red_ds = ds.sample(new_len)\n", "ids = red_ds[[\"speciesId\"]].to_numpy().tolist()\n", "ids = [id[0] for id in ids]\n", "c = Counter(ids)\n", "print(len(red_ds), len(c))\n", - "red_ds.to_csv('/data/jdolli/glc23_data/Pot_10_to_1000_red' + str(red) + '.csv', sep=\";\")" + "red_ds.to_csv(DATA_FOLDER + \"Pot_10_to_1000_red\" + str(red) + \".csv\", sep=\";\")" ] }, { @@ -148,14 +156,14 @@ "source": [ "FRANCE_SE = 42.325170, 8.238722\n", "FRANCE_NW = 51.235825, -4.807615\n", - "france = ds[ds[\"lon\"]>-4.807615]\n", - "france = france[france[\"lon\"]<8.238722]\n", - "france = france[france[\"lat\"]>42.325170]\n", - "france = france[france[\"lat\"]<51.235825]\n", + "france = ds[ds[\"lon\"] > -4.807615]\n", + "france = france[france[\"lon\"] < 8.238722]\n", + "france = france[france[\"lat\"] > 42.325170]\n", + "france = france[france[\"lat\"] < 51.235825]\n", "len(france)\n", "ds = ds.drop(france.index)\n", "len(ds)\n", - "ds.to_csv('/data/jdolli/glc23_data/Pot_10_to_1000_nofrance.csv', sep=\";\")" + "ds.to_csv(DATA_FOLDER + \"Pot_10_to_1000_nofrance.csv\", sep=\";\")" ] } ], diff --git a/utils.py b/utils.py index e880d0e..c8c9947 100644 --- a/utils.py +++ b/utils.py @@ -14,14 +14,16 @@ def bilinear_interpolate(loc_ip, data): # map to [0,1], then scale to data size loc = (loc_ip.clone() + 1) / 2.0 - loc[:, 1] = 1 - loc[:, 1] # this is because latitude goes from +90 on top to bottom while + loc[:, 1] = ( + 1 - loc[:, 1] + ) # this is because latitude goes from +90 on top to bottom while # longitude goes from -90 to 90 left to right assert not torch.any(torch.isnan(loc)) # cast locations into pixel space - loc[:, 0] *= (data.shape[1] - 1) - loc[:, 1] *= (data.shape[0] - 1) + loc[:, 0] *= data.shape[1] - 1 + loc[:, 1] 
*= data.shape[0] - 1 loc_int = torch.floor(loc).long() # integer pixel coordinates xx = loc_int[:, 0] @@ -35,8 +37,12 @@ def bilinear_interpolate(loc_ip, data): dx = loc_delta[:, 0].unsqueeze(1) dy = loc_delta[:, 1].unsqueeze(1) - interp_val = data[yy, xx, :] * (1 - dx) * (1 - dy) + data[yy, xx_plus, :] * dx * (1 - dy) + \ - data[yy_plus, xx, :] * (1 - dx) * dy + data[yy_plus, xx_plus, :] * dx * dy + interp_val = ( + data[yy, xx, :] * (1 - dx) * (1 - dy) + + data[yy, xx_plus, :] * dx * (1 - dy) + + data[yy_plus, xx, :] * (1 - dx) * dy + + data[yy_plus, xx_plus, :] * dx * dy + ) return interp_val