Add Fields Of The World field boundary delineation dataset (microsoft…

…#2296) * Initial commit * Formatting changes * Reformatting how self.files is stored * Added pyarrow dependency for reading parquet files * Load splits from existing parquet files * ruff * Adding tests * Adding tests * Ruff * Update torchgeo/datasets/ftw.py Co-authored-by: Adam J. Stewart <[email protected]> * Update torchgeo/datasets/ftw.py Co-authored-by: Adam J. Stewart <[email protected]> * Update torchgeo/datasets/ftw.py Co-authored-by: Adam J. Stewart <[email protected]> * Using einops * Use tuple for valid country list * Add pandas parquet * Added pyarrow req to min dataset * pandas bump * Add paper link * Add a pyarrow importorskip * No minversion * ClassVar not needed --------- Co-authored-by: Adam J. Stewart <[email protected]>
sede-open · Sep 29, 2024 · b2f9936 · b2f9936
1 parent 3e968a9
commit b2f9936
Show file tree

Hide file tree

Showing 10 changed files with 574 additions and 0 deletions.
diff --git a/docs/api/datasets.rst b/docs/api/datasets.rst
@@ -282,6 +282,11 @@ FAIR1M
 
 .. autoclass:: FAIR1M
 
+Fields Of The World
+^^^^^^^^^^^^^^^^^^^
+
+.. autoclass:: FieldsOfTheWorld
+
 FireRisk
 ^^^^^^^^
 

diff --git a/docs/api/datasets/non_geo_datasets.csv b/docs/api/datasets/non_geo_datasets.csv
@@ -15,6 +15,7 @@ Dataset,Task,Source,License,# Samples,# Classes,Size (px),Resolution (m),Bands
 `ETCI2021 Flood Detection`_,S,Sentinel-1,-,"66,810",2,256x256,5--20,SAR
 `EuroSAT`_,C,Sentinel-2,"MIT","27,000",10,64x64,10,MSI
 `FAIR1M`_,OD,Gaofen/Google Earth,"CC-BY-NC-SA-3.0","15,000",37,"1,024x1,024",0.3--0.8,RGB
+`Fields Of The World`_,"S,I",Sentinel-2,"Various","70,795","2,3",256x256,10,MSI
 `FireRisk`_,C,NAIP Aerial,"CC-BY-NC-4.0","91,872",7,"320x320",1,RGB
 `Forest Damage`_,OD,Drone imagery,"CDLA-Permissive-1.0","1,543",4,"1,500x1,500",,RGB
 `GeoNRW`_,S,Aerial,"CC-BY-4.0","7,783",11,"1,000x1,000",1,"RGB, DEM"

diff --git a/pyproject.toml b/pyproject.toml
@@ -87,6 +87,8 @@ datasets = [
     "laspy>=2",
     # opencv-python 4.5.4+ required for Python 3.10 wheels
     "opencv-python>=4.5.4",
+    # pandas 2+ required for parquet extra
+    "pandas[parquet]>=2",
     # pycocotools 2.0.7+ required for wheels
     "pycocotools>=2.0.7",
     # pyvista 0.34.2+ required to avoid ImportError in CI

diff --git a/requirements/datasets.txt b/requirements/datasets.txt
@@ -2,6 +2,7 @@
 h5py==3.12.1
 laspy==2.5.4
 opencv-python==4.10.0.84
+pandas[parquet]==2.2.3
 pycocotools==2.0.8
 pyvista==0.44.1
 scikit-image==0.24.0

diff --git a/requirements/min-reqs.old b/requirements/min-reqs.old
@@ -26,6 +26,7 @@ h5py==3.6.0
 laspy==2.0.0
 opencv-python==4.5.4.58
 pycocotools==2.0.7
+pyarrow==15.0.0  # Remove when we upgrade min version of pandas to `pandas[parquet]>=2`
 pyvista==0.34.2
 scikit-image==0.19.0
 scipy==1.7.2

diff --git a/tests/data/ftw/austria.zip b/tests/data/ftw/austria.zip
diff --git a/tests/data/ftw/data.py b/tests/data/ftw/data.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import hashlib
+import os
+import shutil
+import zipfile
+
+import numpy as np
+import pandas as pd
+import rasterio
+from affine import Affine
+
+np.random.seed(0)
+
+country = 'austria'
+SIZE = 32
+num_samples = {'train': 2, 'val': 2, 'test': 2}
+BASE_PROFILE = {
+    'driver': 'GTiff',
+    'dtype': 'uint16',
+    'nodata': None,
+    'width': SIZE,
+    'height': SIZE,
+    'count': 4,
+    'crs': 'EPSG:4326',
+    'transform': Affine(5.4e-05, 0.0, 0, 0.0, 5.4e-05, 0),
+    'blockxsize': SIZE,
+    'blockysize': SIZE,
+    'tiled': True,
+    'interleave': 'pixel',
+}
+
+
+def create_image(fn: str) -> None:
+    os.makedirs(os.path.dirname(fn), exist_ok=True)
+
+    profile = BASE_PROFILE.copy()
+
+    data = np.random.randint(0, 20000, size=(4, SIZE, SIZE), dtype=np.uint16)
+    with rasterio.open(fn, 'w', **profile) as dst:
+        dst.write(data)
+
+
+def create_mask(fn: str, min_val: int, max_val: int) -> None:
+    os.makedirs(os.path.dirname(fn), exist_ok=True)
+
+    profile = BASE_PROFILE.copy()
+    profile['dtype'] = 'uint8'
+    profile['nodata'] = 0
+    profile['count'] = 1
+
+    data = np.random.randint(min_val, max_val, size=(1, SIZE, SIZE), dtype=np.uint8)
+    with rasterio.open(fn, 'w', **profile) as dst:
+        dst.write(data)
+
+
+if __name__ == '__main__':
+    i = 0
+    cols = {'aoi_id': [], 'split': []}
+    for split, n in num_samples.items():
+        for j in range(n):
+            aoi = f'g_{i}'
+            cols['aoi_id'].append(aoi)
+            cols['split'].append(split)
+
+            create_image(os.path.join(country, 's2_images', 'window_a', f'{aoi}.tif'))
+            create_image(os.path.join(country, 's2_images', 'window_b', f'{aoi}.tif'))
+
+            create_mask(
+                os.path.join(country, 'label_masks', 'semantic_2class', f'{aoi}.tif'),
+                0,
+                1,
+            )
+            create_mask(
+                os.path.join(country, 'label_masks', 'semantic_3class', f'{aoi}.tif'),
+                0,
+                2,
+            )
+            create_mask(
+                os.path.join(country, 'label_masks', 'instance', f'{aoi}.tif'), 0, 100
+            )
+
+            i += 1
+
+    # Create an extra train file to test for missing other files
+    aoi = f'g_{i}'
+    cols['aoi_id'].append(aoi)
+    cols['split'].append(split)
+    create_image(os.path.join(country, 's2_images', 'window_a', f'{aoi}.tif'))
+
+    # Write parquet index
+    df = pd.DataFrame(cols)
+    df.to_parquet(os.path.join(country, f'chips_{country}.parquet'))
+
+    # archive to zip
+    with zipfile.ZipFile(f'{country}.zip', 'w') as zipf:
+        for root, _, files in os.walk(country):
+            for file in files:
+                output_fn = os.path.join(root, file)
+                zipf.write(output_fn, os.path.relpath(output_fn, country))
+
+    shutil.rmtree(country)
+
+    # Compute checksums
+    with open(f'{country}.zip', 'rb') as f:
+        md5 = hashlib.md5(f.read()).hexdigest()
+        print(f'{md5}')
diff --git a/tests/datasets/test_ftw.py b/tests/datasets/test_ftw.py
@@ -0,0 +1,90 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import os
+import shutil
+from itertools import product
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+import pytest
+import torch
+import torch.nn as nn
+from _pytest.fixtures import SubRequest
+from pytest import MonkeyPatch
+from torch.utils.data import ConcatDataset
+
+from torchgeo.datasets import DatasetNotFoundError, FieldsOfTheWorld
+
+pytest.importorskip('pyarrow')
+
+
+class TestFieldsOfTheWorld:
+    @pytest.fixture(
+        params=product(['train', 'val', 'test'], ['2-class', '3-class', 'instance'])
+    )
+    def dataset(
+        self, monkeypatch: MonkeyPatch, tmp_path: Path, request: SubRequest
+    ) -> FieldsOfTheWorld:
+        split, task = request.param
+
+        monkeypatch.setattr(FieldsOfTheWorld, 'valid_countries', ['austria'])
+        monkeypatch.setattr(
+            FieldsOfTheWorld,
+            'country_to_md5',
+            {'austria': '1cf9593c9bdceeaba21bbcb24d35816c'},
+        )
+        base_url = os.path.join('tests', 'data', 'ftw') + '/'
+        monkeypatch.setattr(FieldsOfTheWorld, 'base_url', base_url)
+        root = tmp_path
+        transforms = nn.Identity()
+        return FieldsOfTheWorld(
+            root,
+            split,
+            task,
+            countries='austria',
+            transforms=transforms,
+            download=True,
+            checksum=True,
+        )
+
+    def test_getitem(self, dataset: FieldsOfTheWorld) -> None:
+        x = dataset[0]
+        assert isinstance(x, dict)
+        assert isinstance(x['image'], torch.Tensor)
+        assert isinstance(x['mask'], torch.Tensor)
+
+    def test_len(self, dataset: FieldsOfTheWorld) -> None:
+        assert len(dataset) == 2
+
+    def test_add(self, dataset: FieldsOfTheWorld) -> None:
+        ds = dataset + dataset
+        assert isinstance(ds, ConcatDataset)
+        assert len(ds) == 4
+
+    def test_already_extracted(self, dataset: FieldsOfTheWorld) -> None:
+        FieldsOfTheWorld(root=dataset.root, download=True)
+
+    def test_already_downloaded(self, monkeypatch: MonkeyPatch, tmp_path: Path) -> None:
+        url = os.path.join('tests', 'data', 'ftw', 'austria.zip')
+        root = tmp_path
+        shutil.copy(url, root)
+        FieldsOfTheWorld(root)
+
+    def test_not_downloaded(self, tmp_path: Path) -> None:
+        with pytest.raises(DatasetNotFoundError, match='Dataset not found'):
+            FieldsOfTheWorld(tmp_path)
+
+    def test_invalid_split(self) -> None:
+        with pytest.raises(AssertionError):
+            FieldsOfTheWorld(split='foo')
+
+    def test_plot(self, dataset: FieldsOfTheWorld) -> None:
+        x = dataset[0].copy()
+        dataset.plot(x, suptitle='Test')
+        plt.close()
+        dataset.plot(x, show_titles=False)
+        plt.close()
+        x['prediction'] = x['mask'].clone()
+        dataset.plot(x)
+        plt.close()
diff --git a/torchgeo/datasets/__init__.py b/torchgeo/datasets/__init__.py
@@ -46,6 +46,7 @@
 from .fair1m import FAIR1M
 from .fire_risk import FireRisk
 from .forestdamage import ForestDamage
+from .ftw import FieldsOfTheWorld
 from .gbif import GBIF
 from .geo import (
     GeoDataset,
@@ -217,6 +218,7 @@
     'EuroSATSpatial',
     'EuroSAT100',
     'FAIR1M',
+    'FieldsOfTheWorld',
     'FireRisk',
     'ForestDamage',
     'GeoNRW',