-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy paths1dataset.py
186 lines (147 loc) · 7.4 KB
/
s1dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import os
import torch
from torch.utils.data import Dataset
import gzip
import zipfile
from sh import gunzip
from glob import glob
import pickle
import sentinelhub # this import is necessary for pickle loading
import geopandas as gpd
import numpy as np
import rasterio as rio
from rasterio import features
from tqdm import tqdm
# Human-readable crop type names.
# NOTE(review): presumably CLASSES[i] names the crop whose id is CROP_IDS[i] -- confirm against the label file.
CLASSES = ["Wheat", "Rye", "Barley", "Oats", "Corn", "Oil Seeds", "Root Crops", "Meadows", "Forage Crops"]
# Crop ids expected in the labels' `crop_id` field; the datasets in this file
# return CROP_IDS.index(crop_id) as the class index of a sample.
CROP_IDS = [1, 2, 3, 4, 5, 6, 7, 8, 9]
class S2DatasetV2(Dataset):
    """Field-wise dataset over an unzipped archive holding BANDS.npy and CLP.npy.

    On construction the archive is unzipped (if its folder does not exist yet),
    any ``.gz`` files inside are decompressed, the label polygons are projected
    to the CRS stored in ``bbox.pkl`` and rasterized onto the image grid.

    Each item is the set of pixels covered by one labelled field.

    Args:
        zippath: path to the ``.zip`` archive; the data folder is expected next
            to it under the same name without the ``.zip`` suffix.
        labelgeojson: path to a vector file readable by geopandas that provides
            ``fid`` and ``crop_id`` attributes for every polygon.
        transform: optional callable invoked as ``transform(X, clp)``; its return
            value replaces ``X`` in the returned sample.
        min_area: fields whose projected area is not larger than this value are
            dropped (units are squared CRS units; the log message assumes m2).

    Raises:
        ValueError: if no labelled pixel falls inside the raster.
    """

    def __init__(self, zippath, labelgeojson, transform=None, min_area=1000):
        self.data_transform = transform

        datadir = os.path.dirname(zippath)
        rootpath = zippath.replace(".zip", "")
        if not os.path.isdir(rootpath):
            print(f"unzipping {zippath} to {datadir}")
            with zipfile.ZipFile(zippath, 'r') as zip_ref:
                zip_ref.extractall(datadir)
        else:
            print(f"found folder in {rootpath}, no need to unzip")

        # find all .gz-ipped files and unzip
        for gz in glob(os.path.join(rootpath, "*", "*.gz")) + glob(os.path.join(rootpath, "*.gz")):
            print(f"unzipping {gz}")
            gunzip(gz)

        # SECURITY NOTE: pickle.load can execute arbitrary code; only open
        # archives that come from a trusted source.
        with open(os.path.join(rootpath, "bbox.pkl"), 'rb') as f:
            bbox = pickle.load(f)
        crs = str(bbox.crs)
        minx, miny, maxx, maxy = bbox.min_x, bbox.min_y, bbox.max_x, bbox.max_y

        labels = gpd.read_file(labelgeojson)
        # project to same coordinate reference system (crs) as the imagery
        labels = labels.to_crs(crs)

        mask = labels.geometry.area > min_area
        print(f"ignoring {(~mask).sum()}/{len(mask)} fields with area < {min_area}m2")
        labels = labels.loc[mask]

        self.bands = np.load(os.path.join(rootpath, "data", "BANDS.npy"))
        self.clp = np.load(os.path.join(rootpath, "data", "CLP.npy"))

        # The rasterized masks must have the shape of axes 1 and 2 of `bands`
        # so they can index the data in __getitem__. rasterize() interprets
        # out_shape as (rows, cols), hence axis 1 is the row count (height)
        # and axis 2 the column count (width) for the affine transform too.
        # NOTE(review): assumes a north-up raster stored as (T, rows, cols, C).
        _, height, width, _ = self.bands.shape
        transform = rio.transform.from_bounds(minx, miny, maxx, maxy, width, height)

        self.fid_mask = features.rasterize(zip(labels.geometry, labels.fid), all_touched=True,
                                           transform=transform, out_shape=(height, width))
        # 0 is the rasterize fill value: an all-zero mask means nothing was burned in
        if not self.fid_mask.any():
            raise ValueError(f"vectorized fid mask contains no fields. "
                             f"Does the label geojson {labelgeojson} cover the region defined by {zippath}?")

        self.crop_mask = features.rasterize(zip(labels.geometry, labels.crop_id), all_touched=True,
                                            transform=transform, out_shape=(height, width))
        if not self.crop_mask.any():
            raise ValueError(f"vectorized crop mask contains no fields. "
                             f"Does the label geojson {labelgeojson} cover the region defined by {zippath}?")

        self.labels = labels

    def __len__(self):
        """Return the number of fields kept after the area filter."""
        return len(self.labels)

    def __getitem__(self, item):
        """Return ``(X, class_index, fid)`` for the ``item``-th field.

        ``X`` holds the pixels of ``bands`` covered by the field, arranged as
        (axis 0, axis 3, pixel) of the stored array, or whatever the optional
        transform returns for ``(X, clp)``. ``class_index`` is the position of
        the field's ``crop_id`` in ``CROP_IDS``; a ``ValueError`` is raised by
        ``list.index`` if the id is not listed there.
        """
        feature = self.labels.iloc[item]
        y = feature.crop_id
        fid = feature.fid

        # boolean (rows, cols) mask selecting the pixels burned with this fid
        field_mask = self.fid_mask == fid

        X = self.bands.transpose(0, 3, 1, 2)[:, :, field_mask]
        clp = self.clp.transpose(0, 3, 1, 2)[:, :, field_mask]

        if self.data_transform is not None:
            X = self.data_transform(X, clp)

        return X, CROP_IDS.index(y), fid
class S1Dataset(Dataset):
    """Dataset of field-wise aggregated time series with an on-disk .npz cache.

    The cache path is ``zippath`` with ``.zip`` replaced by ``.npz``. If the
    cache is missing, ``setup(zippath, labelgeojson)`` is run and its result is
    saved there; otherwise the arrays are read back from the cache.

    Args:
        zippath: path to the ``.zip`` archive of the dataset.
        labelgeojson: path to the label file, forwarded to ``setup`` (unused
            when the cache already exists).
        transform: optional callable applied to ``X`` in ``__getitem__``.
    """

    def __init__(self, zippath, labelgeojson, transform=None):
        npzcache = zippath.replace(".zip", ".npz")
        self.data_transform = transform

        if not os.path.exists(npzcache):
            tsdata, fids, crop_ids = setup(zippath, labelgeojson)
            # store arrays so a fresh extraction and a cache hit expose the same types
            self.tsdata = np.asarray(tsdata)
            self.fids = np.asarray(fids)
            self.crop_ids = np.asarray(crop_ids)
            print(f"saving extracted time series with label data to {npzcache}")
            np.savez(npzcache, tsdata=self.tsdata, fids=self.fids, crop_ids=self.crop_ids)
        else:
            # open the archive once and close it again instead of leaking three handles
            with np.load(npzcache) as cache:
                self.tsdata = cache["tsdata"]
                self.fids = cache["fids"]
                self.crop_ids = cache["crop_ids"]

    def __len__(self):
        """Return the number of fields with extracted time series."""
        return len(self.fids)

    def __getitem__(self, item):
        """Return ``(X, class_index, fid)`` for the ``item``-th field.

        ``class_index`` is the position of the field's crop id in ``CROP_IDS``;
        ``list.index`` raises ``ValueError`` if the id is not listed there.
        """
        X = self.tsdata[item]
        y = self.crop_ids[item]
        fid = self.fids[item]

        if self.data_transform is not None:
            X = self.data_transform(X)

        return X, CROP_IDS.index(y), fid
def setup(zippath, labelgeojson):
    """
    Unzip a dataset from Sinergise and aggregate it field-wise.

    The archive is extracted next to ``zippath`` (unless the folder already
    exists), ``.gz`` files inside are decompressed, and the first channel of
    ``data/VV.npy`` and ``data/VH.npy`` is averaged over the pixels of every
    labelled field. Nothing is cached here; the caller decides whether to
    store the result.

    Args:
        zippath: path to the ``.zip`` archive.
        labelgeojson: path to a vector file readable by geopandas that provides
            ``fid`` and ``crop_id`` attributes for every polygon.

    Returns:
        Tuple ``(tsdata, fids, crop_ids)``: ``tsdata`` is an array of shape
        (n_fields, T, 2) where T is the first axis of the VV/VH arrays
        (presumably time) and the last axis is (VV, VH); ``fids`` and
        ``crop_ids`` are lists aligned with the first axis of ``tsdata``.
        Fields that cover no pixel are skipped.

    Raises:
        ValueError: if no labelled pixel falls inside the raster.
    """
    datadir = os.path.dirname(zippath)
    rootpath = zippath.replace(".zip", "")
    if not os.path.isdir(rootpath):
        print(f"unzipping {zippath} to {datadir}")
        with zipfile.ZipFile(zippath, 'r') as zip_ref:
            zip_ref.extractall(datadir)
    else:
        print(f"found folder in {rootpath}, no need to unzip")

    # find all .gz-ipped files and unzip
    for gz in glob(os.path.join(rootpath, "*", "*.gz")) + glob(os.path.join(rootpath, "*.gz")):
        print(f"unzipping {gz}")
        gunzip(gz)

    # SECURITY NOTE: pickle.load can execute arbitrary code; only open
    # archives that come from a trusted source.
    with open(os.path.join(rootpath, "bbox.pkl"), 'rb') as f:
        bbox = pickle.load(f)
    crs = str(bbox.crs)
    minx, miny, maxx, maxy = bbox.min_x, bbox.min_y, bbox.max_x, bbox.max_y

    labels = gpd.read_file(labelgeojson)
    # project to same coordinate reference system (crs) as the imagery
    labels = labels.to_crs(crs)

    vv = np.load(os.path.join(rootpath, "data", "VV.npy"))
    vh = np.load(os.path.join(rootpath, "data", "VH.npy"))
    bands = np.stack([vv[:, :, :, 0], vh[:, :, :, 0]], axis=3)

    # The mask must have the shape of axes 1 and 2 of `bands` to index it below.
    # rasterize() interprets out_shape as (rows, cols), so axis 1 is the row
    # count (height) and axis 2 the column count (width) for the transform too.
    # NOTE(review): assumes a north-up raster stored as (T, rows, cols, C).
    _, height, width, _ = bands.shape
    transform = rio.transform.from_bounds(minx, miny, maxx, maxy, width, height)

    fid_mask = features.rasterize(zip(labels.geometry, labels.fid), all_touched=True,
                                  transform=transform, out_shape=(height, width))
    # 0 is the rasterize fill value: an all-zero mask means nothing was burned in
    if not fid_mask.any():
        raise ValueError(f"vectorized fid mask contains no fields. "
                         f"Does the label geojson {labelgeojson} cover the region defined by {zippath}?")

    # channels-first view, computed once instead of once per field
    bands_cf = bands.transpose(0, 3, 1, 2)

    # one row per fid, so that every fid stays paired with its own crop_id
    # even if the label file lists a fid more than once
    unique_labels = labels.drop_duplicates(subset="fid")

    fids = []
    crop_ids = []
    tsdata = []
    for fid, crop_id in tqdm(zip(unique_labels.fid.values, unique_labels.crop_id.values),
                             total=len(unique_labels), desc="extracting time series"):
        field_mask = fid_mask == fid
        if field_mask.any():
            # average over all pixels of the field
            data = bands_cf[:, :, field_mask].mean(-1)
            tsdata.append(data)
            crop_ids.append(crop_id)
            fids.append(fid)
        else:
            print(f"field {fid} contained no pixels. Is it too small with {labels.loc[labels.fid==fid].geometry.area}m2 ? skipping...")

    tsdata = np.stack(tsdata)
    return tsdata, fids, crop_ids
if __name__ == '__main__':
    # Manual smoke test: build the Sentinel-1 dataset from hard-coded local
    # paths, query its length and fetch the first sample.
    labelgeojson = "/ssd/DENETHOR/crops_test_2019.geojson"
    zippath = "/ssd/DENETHOR/S1/Test/s1_test_des.zip"

    ds = S1Dataset(zippath=zippath, labelgeojson=labelgeojson)
    len(ds)
    first_sample = ds[0]
    X, y, fid = first_sample