create_dataset.py
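"""Build the voxel dataset: download the archive defined in constants.py,
extract it, convert the per-model .mat voxel grids into a single compressed
HDF5 file, and remove the intermediate files."""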
import os
import shutil
import zipfile
import urllib.request

import scipy.io
import h5py
import numpy as np

from constants import DATASET_FN, DATASET_URL, DATASET_HDF, DATA_PATH
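# constants.py is assumed to provide values along these lines (hypothetical
# example values, not necessarily the repository's actual settings):
#
#     DATASET_URL = 'http://example.com/volumetric_data.zip'  # archive to fetch
#     DATASET_FN  = 'volumetric_data.zip'                     # local archive name
#     DATA_PATH   = 'data'                                    # extraction directory
#     DATASET_HDF = 'data/dataset.h5'                         # output HDF5 file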
def download_dataset():
    """Stream the dataset archive from DATASET_URL to DATASET_FN,
    printing download progress as it goes."""
    file_name = DATASET_FN
    u = urllib.request.urlopen(DATASET_URL)
    with open(file_name, 'wb') as f:
        meta = u.info()
        file_size = int(meta['Content-Length'])
        print("Downloading: {0:} {1:5.2f} MB".format(os.path.basename(file_name), file_size / (1024 ** 2)))

        file_size_dl = 0
        block_sz = 8192
        while True:
            buffer = u.read(block_sz)
            if not buffer:
                break
            file_size_dl += len(buffer)
            f.write(buffer)
            # Overwrite the progress line in place using backspace characters.
            status = r"%10d MB [%3.2f%%]" % (file_size_dl / (1024 ** 2), file_size_dl * 100. / file_size)
            status = status + chr(8) * (len(status) + 1)
            print(status, end='')
    print('Download done!')
def extract():
    """Unpack the downloaded archive into DATA_PATH."""
    print('Extracting Files...')
    with zipfile.ZipFile(DATASET_FN) as zf:  # 'zf' avoids shadowing the zip() builtin
        zf.extractall(DATA_PATH)
def convert():
    """Walk DATA_PATH for volumetric .mat files and write each voxel grid
    into the HDF5 file as /<category>/<model_name>, zero-padded to 32^3."""
    print('Converting to NumPy...')
    with h5py.File(DATASET_HDF, 'w') as hdf:
        for dirpath, dnames, fnames in os.walk(DATA_PATH):
            for f in fnames:
                if not f.endswith(".mat"):
                    continue
                fn = os.path.join(dirpath, f)
                split_path = fn.split(os.sep)
                # Only convert per-model voxel files; skip the precomputed
                # feature matrices shipped alongside them.
                if 'volumetric_data' not in split_path or f in ('train_feature.mat', 'test_feature.mat'):
                    continue
                # Paths look like .../volumetric_data/<category>/<res>/<split>/<file>.mat,
                # so the category name sits four components from the end.
                category = split_path[-4]
                # Pad the 30^3 voxel grid with a one-voxel border to get 32^3.
                arr = scipy.io.loadmat(fn)['instance'].astype(np.uint8)
                arrpad = np.zeros((32,) * 3, dtype=np.uint8)
                arrpad[1:-1, 1:-1, 1:-1] = arr
                group = hdf.require_group(category)
                group.create_dataset(os.path.splitext(f)[0], data=arrpad, compression='gzip')
def cleanup():
    """Delete the downloaded archive and the extracted directory."""
    print('Cleaning up...')
    dataset_file = os.path.abspath(DATASET_FN)
    # The extracted directory path is derived by stripping the last 8
    # characters of the archive path, presumably the archive's file suffix.
    dataset_path = dataset_file[:-8]
    if os.path.exists(dataset_file):
        os.remove(dataset_file)
    shutil.rmtree(dataset_path, ignore_errors=True)
if __name__ == '__main__':
    if os.path.exists(DATASET_HDF):
        print('HDF File exists, no need to re-download data')
    else:
        download_dataset()
        extract()
        convert()
        cleanup()
    print('Done!')
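# Minimal sketch of reading the converted data back (category and model
# names depend on the downloaded dataset; 'chair' is a hypothetical example):
#
#     with h5py.File(DATASET_HDF, 'r') as hdf:
#         chairs = hdf['chair']
#         voxels = chairs[next(iter(chairs))][...]
#         print(voxels.shape, voxels.dtype)  # (32, 32, 32) uint8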