#run_experiment.py
#Copyright (c) 2020 Rachel Lea Ballantyne Draelos
#MIT License
#Permission is hereby granted, free of charge, to any person obtaining a copy
#of this software and associated documentation files (the "Software"), to deal
#in the Software without restriction, including without limitation the rights
#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#copies of the Software, and to permit persons to whom the Software is
#furnished to do so, subject to the following conditions:
#The above copyright notice and this permission notice shall be included in all
#copies or substantial portions of the Software.
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
#SOFTWARE
import os
import timeit
import datetime
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch, torch.nn as nn, torch.nn.functional as F
import torchvision
from torchvision import transforms, models, utils
import evaluate
from load_dataset import custom_datasets
#Set seeds
np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
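#Note: these seeds make weight initialization and numpy-based shuffling
#repeatable. For stricter run-to-run determinism one could additionally set
#torch.backends.cudnn.deterministic = True and
#torch.backends.cudnn.benchmark = False (optional; may slow down training).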
class DukeCTModel(object):
    def __init__(self, descriptor, custom_net, custom_net_args,
                 loss, loss_args, num_epochs, patience, batch_size, device,
                 data_parallel, use_test_set, task, old_params_dir,
                 dataset_class, dataset_args):
        """Variables:
        <descriptor>: string describing the experiment
        <custom_net>: class defining a model
        <custom_net_args>: dictionary where keys correspond to custom net
            input arguments, and values are the desired values
        <loss>: 'bce' for binary cross entropy
        <loss_args>: arguments to pass to the loss function, if any
        <num_epochs>: int for the maximum number of epochs to train
        <patience>: number of epochs for which the validation loss must fail
            to improve in order to cause early stopping
        <batch_size>: int for the number of examples per batch
        <device>: int specifying which device to use, or 'all' for all devices
        <data_parallel>: if True, then parallelize across available GPUs
        <use_test_set>: if True, then run the model on the test set. If False,
            use only the training and validation sets.
        <task>:
            'train_eval': train and evaluate a new model. Evaluation always
                uses the validation set. If <use_test_set> is True, then
                evaluation also includes calculation of test set performance
                for the best validation epoch.
            'predict_on_test': load a trained model and make predictions on
                the test set using that model.
        <old_params_dir>: only needed if <task>=='predict_on_test'. This is
            the path to the parameters that will be loaded into the model.
        <dataset_class>: CT Dataset class for preprocessing the data
        <dataset_args>: arguments for the dataset class specifying how the
            data should be prepared.
        See the usage sketch at the bottom of this file for an example of how
        these arguments fit together."""
        self.descriptor = descriptor
        self.set_up_results_dirs()
        self.custom_net = custom_net
        self.custom_net_args = custom_net_args
        self.loss = loss
        self.loss_args = loss_args
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        print('self.batch_size=',self.batch_size)
        #num_workers is the number of threads to use for data loading:
        #batch_size 1 = num_workers 4, batch_size 2 = num_workers 8,
        #batch_size 4 = num_workers 16, and so on.
        self.num_workers = int(batch_size*4)
        print('self.num_workers=',self.num_workers)
        if self.num_workers == 1:
            print('Warning: Using only one worker will slow down data loading')

        #Set Device and Data Parallelism
        if device in [0,1,2,3]: #i.e. if a GPU number was specified
            self.device = torch.device('cuda:'+str(device))
            print('using device:',str(self.device),'\ndescriptor: ',self.descriptor)
        elif device == 'all':
            self.device = torch.device('cuda')
        else:
            raise ValueError('Invalid device specification: '+str(device))
        self.data_parallel = data_parallel
        if self.data_parallel:
            assert device == 'all' #use all devices when running data parallel

        #Set Task
        self.use_test_set = use_test_set
        self.task = task
        assert self.task in ['train_eval','predict_on_test']
        if self.task == 'predict_on_test':
            #overwrite the params dir that was created in the call to
            #set_up_results_dirs() with the dir you want to load from
            self.params_dir = old_params_dir

        #Data and Labels
        self.CTDatasetClass = dataset_class
        self.dataset_args = dataset_args
        #Get label meanings, a list of descriptive strings (list elements must
        #be strings found in the column headers of the labels file)
        self.set_up_label_meanings(self.dataset_args['label_meanings'])
        if self.task == 'train_eval':
            self.dataset_train = self.CTDatasetClass(setname = 'train', **self.dataset_args)
            self.dataset_valid = self.CTDatasetClass(setname = 'valid', **self.dataset_args)
        if self.use_test_set:
            self.dataset_test = self.CTDatasetClass(setname = 'test', **self.dataset_args)

        #Tracking losses and evaluation results
        self.train_loss = np.zeros((self.num_epochs))
        self.valid_loss = np.zeros((self.num_epochs))
        self.eval_results_valid, self.eval_results_test = evaluate.initialize_evaluation_dfs(self.label_meanings, self.num_epochs)

        #For early stopping
        self.initial_patience = patience
        self.patience_remaining = patience
        self.best_valid_epoch = 0
        self.min_val_loss = np.inf

        #Run everything
        self.run_model()
    ### Methods ###
    def set_up_label_meanings(self, label_meanings):
        if label_meanings == 'all': #get the full list of all available labels
            temp = custom_datasets.read_in_labels(self.dataset_args['label_type_ld'], 'valid')
            self.label_meanings = temp.columns.values.tolist()
        else: #use the label meanings that were passed in
            self.label_meanings = label_meanings
        print('label meanings ('+str(len(self.label_meanings))+' labels total):',self.label_meanings)
    def set_up_results_dirs(self):
        if not os.path.isdir('results'):
            os.mkdir('results')
        self.results_dir = os.path.join('results',datetime.datetime.today().strftime('%Y-%m-%d')+'_'+self.descriptor)
        if not os.path.isdir(self.results_dir):
            os.mkdir(self.results_dir)
        self.params_dir = os.path.join(self.results_dir,'params')
        if not os.path.isdir(self.params_dir):
            os.mkdir(self.params_dir)
        self.backup_dir = os.path.join(self.results_dir,'backup')
        if not os.path.isdir(self.backup_dir):
            os.mkdir(self.backup_dir)
    def run_model(self):
        if self.data_parallel:
            self.model = nn.DataParallel(self.custom_net(**self.custom_net_args)).to(self.device)
        else:
            self.model = self.custom_net(**self.custom_net_args).to(self.device)
        self.sigmoid = torch.nn.Sigmoid()
        self.set_up_loss_function()
        momentum = 0.99
        print('Running with optimizer lr=1e-3, momentum='+str(round(momentum,2))+' and weight_decay=1e-7')
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=1e-3, momentum=momentum, weight_decay=1e-7)
        if self.task == 'train_eval':
            #The training and validation datasets exist only in 'train_eval'
            #mode, so their dataloaders are created here rather than
            #unconditionally (which would fail for 'predict_on_test')
            train_dataloader = DataLoader(self.dataset_train, batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers)
            valid_dataloader = DataLoader(self.dataset_valid, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers)
            for epoch in range(self.num_epochs):
                t0 = timeit.default_timer()
                self.train(train_dataloader, epoch)
                self.valid(valid_dataloader, epoch)
                self.save_evals(epoch)
                if self.patience_remaining <= 0:
                    print('No more patience (',self.initial_patience,') left at epoch',epoch)
                    print('--> Implementing early stopping. Best epoch was:',self.best_valid_epoch)
                    break
                t1 = timeit.default_timer()
                self.back_up_model_every_ten(epoch)
                print('Epoch',epoch,'time:',round((t1 - t0)/60.0,2),'minutes')
        if self.use_test_set:
            self.test(DataLoader(self.dataset_test, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers))
        self.save_final_summary()
    def set_up_loss_function(self):
        if self.loss == 'bce':
            self.loss_func = nn.BCEWithLogitsLoss() #includes application of the sigmoid for numerical stability
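            #Note: for logits z and targets y, nn.BCEWithLogitsLoss()(z, y)
            #computes the same value as nn.BCELoss()(torch.sigmoid(z), y),
            #but uses the log-sum-exp trick internally, so it remains
            #numerically stable for large-magnitude logits.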
    def train(self, dataloader, epoch):
        model = self.model.train()
        epoch_loss, pred_epoch, gr_truth_epoch, volume_accs_epoch = self.iterate_through_batches(model, dataloader, epoch, training=True)
        self.train_loss[epoch] = epoch_loss
        self.plot_roc_and_pr_curves('train', epoch, pred_epoch, gr_truth_epoch)
        print("{:5s} {:<3d} {:11s} {:.3f}".format('Epoch', epoch, 'Train Loss', epoch_loss))
    def valid(self, dataloader, epoch):
        model = self.model.eval()
        with torch.no_grad():
            epoch_loss, pred_epoch, gr_truth_epoch, volume_accs_epoch = self.iterate_through_batches(model, dataloader, epoch, training=False)
        self.valid_loss[epoch] = epoch_loss
        self.eval_results_valid = evaluate.evaluate_all(self.eval_results_valid, epoch,
            self.label_meanings, gr_truth_epoch, pred_epoch)
        self.early_stopping_check(epoch, pred_epoch, gr_truth_epoch, volume_accs_epoch)
        print("{:5s} {:<3d} {:11s} {:.3f}".format('Epoch', epoch, 'Valid Loss', epoch_loss))
    def early_stopping_check(self, epoch, val_pred_epoch, val_gr_truth_epoch, val_volume_accs_epoch):
        """Check whether the criteria for early stopping are met, and update
        the counters accordingly"""
        val_loss = self.valid_loss[epoch]
        if (val_loss < self.min_val_loss) or epoch==0: #then save the parameters
            self.min_val_loss = val_loss
            check_point = {'params': self.model.state_dict(),
                           'optimizer': self.optimizer.state_dict()}
            torch.save(check_point, os.path.join(self.params_dir, self.descriptor))
            self.best_valid_epoch = epoch
            self.patience_remaining = self.initial_patience
            print('model saved, val loss',val_loss)
            self.plot_roc_and_pr_curves('valid', epoch, val_pred_epoch, val_gr_truth_epoch)
            self.save_all_pred_probs('valid', epoch, val_pred_epoch, val_gr_truth_epoch, val_volume_accs_epoch)
        else:
            self.patience_remaining -= 1
    def back_up_model_every_ten(self, epoch):
        """Back up the model parameters every 10 epochs"""
        if epoch % 10 == 0:
            check_point = {'params': self.model.state_dict(),
                           'optimizer': self.optimizer.state_dict()}
            torch.save(check_point, os.path.join(self.backup_dir, self.descriptor+'_ep_'+str(epoch)))
    def test(self, dataloader):
        epoch = self.best_valid_epoch
        if self.data_parallel:
            model = nn.DataParallel(self.custom_net(**self.custom_net_args)).to(self.device).eval()
        else:
            model = self.custom_net(**self.custom_net_args).to(self.device).eval()
        params_path = os.path.join(self.params_dir,self.descriptor)
        print('For test set predictions, loading model params from params_path=',params_path)
        #map_location ensures the checkpoint loads onto the current device
        #even if it was saved from a different one
        check_point = torch.load(params_path, map_location=self.device)
        model.load_state_dict(check_point['params'])
        with torch.no_grad():
            epoch_loss, pred_epoch, gr_truth_epoch, volume_accs_epoch = self.iterate_through_batches(model, dataloader, epoch, training=False)
        self.eval_results_test = evaluate.evaluate_all(self.eval_results_test, epoch,
            self.label_meanings, gr_truth_epoch, pred_epoch)
        self.plot_roc_and_pr_curves('test', epoch, pred_epoch, gr_truth_epoch)
        self.save_all_pred_probs('test', epoch, pred_epoch, gr_truth_epoch, volume_accs_epoch)
        print("{:5s} {:<3d} {:11s} {:.3f}".format('Epoch', epoch, 'Test Loss', epoch_loss))
    def iterate_through_batches(self, model, dataloader, epoch, training):
        epoch_loss = 0
        #Initialize numpy arrays for storing results. examples x labels
        #Do NOT use concatenation, or else you will have memory fragmentation.
        num_examples = len(dataloader.dataset)
        num_labels = len(self.label_meanings)
        pred_epoch = np.zeros([num_examples, num_labels])
        gr_truth_epoch = np.zeros([num_examples, num_labels])
        volume_accs_epoch = np.empty(num_examples, dtype='U32') #need to use U32 to allow strings of length up to 32
        for batch_idx, batch in enumerate(dataloader):
            data, gr_truth = self.move_data_to_device(batch)
            self.optimizer.zero_grad()
            if training:
                out = model(data)
            else:
                with torch.set_grad_enabled(False):
                    out = model(data)
            loss = self.loss_func(out, gr_truth)
            if training:
                loss.backward()
                self.optimizer.step()
            epoch_loss += loss.item()
            torch.cuda.empty_cache()
            #Save predictions and ground truth across batches
            pred = self.sigmoid(out.data).detach().cpu().numpy()
            gr_truth = gr_truth.detach().cpu().numpy()
            start_row = batch_idx*self.batch_size
            stop_row = min(start_row + self.batch_size, num_examples)
            pred_epoch[start_row:stop_row,:] = pred #pred_epoch is e.g. [25355,80] and pred is e.g. [1,80] for a batch size of 1
            gr_truth_epoch[start_row:stop_row,:] = gr_truth #gr_truth_epoch has the same shape as pred_epoch
            volume_accs_epoch[start_row:stop_row] = batch['volume_acc'] #stores the volume accessions in the order they were used
            #Emptying the cache again here is necessary in order to reduce
            #memory usage and avoid an OOM error:
            torch.cuda.empty_cache()
        return epoch_loss, pred_epoch, gr_truth_epoch, volume_accs_epoch
    def move_data_to_device(self, batch):
        """Move the data and the ground truth to the device."""
        #Only the 'single' crop type is supported here
        assert self.dataset_args['crop_type'] == 'single'
        data = batch['data'].to(self.device)
        #Ground truth to device
        gr_truth = batch['gr_truth'].to(self.device)
        return data, gr_truth
    def plot_roc_and_pr_curves(self, setname, epoch, pred_epoch, gr_truth_epoch):
        outdir = os.path.join(self.results_dir,'curves')
        if not os.path.isdir(outdir):
            os.mkdir(outdir)
        evaluate.plot_roc_curve_multi_class(label_meanings=self.label_meanings,
            y_test=gr_truth_epoch, y_score=pred_epoch,
            outdir=outdir, setname=setname, epoch=epoch)
        evaluate.plot_pr_curve_multi_class(label_meanings=self.label_meanings,
            y_test=gr_truth_epoch, y_score=pred_epoch,
            outdir=outdir, setname=setname, epoch=epoch)
    def save_all_pred_probs(self, setname, epoch, pred_epoch, gr_truth_epoch, volume_accs_epoch):
        outdir = os.path.join(self.results_dir,'pred_probs')
        if not os.path.isdir(outdir):
            os.mkdir(outdir)
        (pd.DataFrame(pred_epoch,columns=self.label_meanings,index=volume_accs_epoch.tolist())).to_csv(os.path.join(outdir, setname+'_predprob_ep'+str(epoch)+'.csv'))
        (pd.DataFrame(gr_truth_epoch,columns=self.label_meanings,index=volume_accs_epoch.tolist())).to_csv(os.path.join(outdir, setname+'_grtruth_ep'+str(epoch)+'.csv'))
    def save_evals(self, epoch):
        evaluate.save(self.eval_results_valid, self.results_dir, self.descriptor+'_valid')
        if self.use_test_set:
            evaluate.save(self.eval_results_test, self.results_dir, self.descriptor+'_test')
        evaluate.plot_learning_curves(self.train_loss, self.valid_loss, self.results_dir, self.descriptor)
    def save_final_summary(self):
        evaluate.save_final_summary(self.eval_results_valid, self.best_valid_epoch, 'valid', self.results_dir)
        if self.use_test_set:
            evaluate.save_final_summary(self.eval_results_test, self.best_valid_epoch, 'test', self.results_dir)
        evaluate.clean_up_output_files(self.best_valid_epoch, self.results_dir)
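
#Usage sketch: the class is normally driven from a separate script. The model
#class and several argument values below are hypothetical placeholders for
#illustration; the dataset_args keys 'label_meanings', 'crop_type', and
#'label_type_ld' are the keys this file actually reads, and the real dataset
#class may require additional keys.
#
#if __name__ == '__main__':
#    from models import MyCTNet #hypothetical model class
#    DukeCTModel(descriptor='myctnet_baseline',
#                custom_net=MyCTNet,
#                custom_net_args={'n_outputs':80}, #hypothetical constructor arg
#                loss='bce', loss_args={},
#                num_epochs=100, patience=15,
#                batch_size=1, device=0, data_parallel=False,
#                use_test_set=False, task='train_eval',
#                old_params_dir='', #only used when task=='predict_on_test'
#                dataset_class=custom_datasets.CTDataset, #hypothetical class name
#                dataset_args={'label_meanings':'all',
#                              'crop_type':'single',
#                              'label_type_ld':'disease'}) #'disease' value is hypothetical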