data_preprocessing.py
import os
import re
import logging
import pickle
import torch
from dataloader import BreastHistopathologyDataset
DATA_PATH = "./data"
IMAGE_NAMES_FILENAME = "patient_ids.pkl"
logging.basicConfig(level=logging.INFO) # DEBUG, INFO, WARNING, ERROR, CRITICAL
if __name__ == "__main__":
    # Calling the BreastHistopathologyDataset constructor (i.e. __init__) will
    # run all the necessary data preprocessing steps and dump any slow-to-load
    # data into serialized .pkl files; on subsequent runs, initializing new
    # BreastHistopathologyDataset objects will be much faster
    full_dataset = BreastHistopathologyDataset()
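    # A minimal sketch (not the actual implementation) of the .pkl caching
    # pattern described above; the real logic lives inside
    # BreastHistopathologyDataset.__init__, and `scan_images_in` is a
    # hypothetical stand-in for the slow directory scan:
    #
    #     pkl_path = os.path.join(DATA_PATH, IMAGE_NAMES_FILENAME)
    #     if os.path.exists(pkl_path):
    #         with open(pkl_path, "rb") as f:
    #             image_names = pickle.load(f)
    #     else:
    #         image_names = scan_images_in(DATA_PATH)  # slow first-time scan
    #         with open(pkl_path, "wb") as f:
    #             pickle.dump(image_names, f)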
logging.info("Total dataset size: {}".format(len(full_dataset)))
logging.info(full_dataset)
    # Split the dataset 80/20 into train/test; the fixed generator seed makes
    # the split reproducible across runs
    train_size = int(0.8 * len(full_dataset))
    test_size = len(full_dataset) - train_size
    # https://pytorch.org/docs/stable/data.html#torch.utils.data.random_split
    train_dataset, test_dataset = torch.utils.data.random_split(
        full_dataset, [train_size, test_size],
        generator=torch.Generator().manual_seed(6)
    )
logging.info(f"Train dataset size: {len(train_dataset)}")
logging.info(train_dataset)
logging.info(f"Test dataset size: {len(test_dataset)}")
logging.info(test_dataset)
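
    # As a follow-up, the split datasets would typically be wrapped in
    # torch.utils.data.DataLoader objects for batched training/evaluation.
    # A minimal sketch, assuming a batch size of 32 (not part of this script):
    #
    #     train_loader = torch.utils.data.DataLoader(
    #         train_dataset, batch_size=32, shuffle=True
    #     )
    #     test_loader = torch.utils.data.DataLoader(
    #         test_dataset, batch_size=32, shuffle=False
    #     )
    #     for images, labels in train_loader:
    #         ...  # feed batches to a model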