
Commit

code cleaning
marakeby committed Sep 22, 2021
1 parent 5eeb1fd commit d757f15
Showing 6 changed files with 0 additions and 132 deletions.
1 change: 0 additions & 1 deletion model/callbacks_custom.py
@@ -1,6 +1,5 @@
import logging
import warnings

import numpy as np
import pandas as pd
from keras.callbacks import Callback
21 changes: 0 additions & 21 deletions pipeline/crossvalidation_pipeline.py
@@ -11,7 +11,6 @@
from sklearn.model_selection import StratifiedKFold

from data.data_access import Data
# from features_processing.feature_scaler import FeatureScaler
from model.model_factory import get_model
from pipeline.one_split import OneSplitPipeline
from utils.plots import plot_box_plot
@@ -61,7 +60,6 @@ def run(self, n_splits=5):
logging.info('fitting model ...')

for model_param in self.model_params:
# model_name = m['type']
if 'id' in model_param:
model_name = model_param['id']
else:
@@ -77,7 +75,6 @@
scores_df, scores_mean, scores_std = get_mean_variance(scores)
list_model_scores.append(scores_df)
model_names.append(model_name)
# self.save_score(scores_df, scores_mean, scores_std, model_param['type'])
self.save_score(data_params, m_param, scores_df, scores_mean, scores_std, model_name)
logging.info('scores')
logging.info(scores_df)
@@ -113,44 +110,28 @@ def train_predict_crossvalidation(self, model_params, X, y, info, cols, model_na
for train_index, test_index in skf.split(X, y.ravel()):
model = get_model(model_params)
logging.info('fold # ----------------%d---------' % i)
# x_train, x_test = X.iloc[train_index], X.iloc[test_index]
x_train, x_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
info_train = pd.DataFrame(index=info[train_index])
info_test = pd.DataFrame(index=info[test_index])
# info_test = info.iloc[test_index,:].copy()
x_train, x_test = self.preprocess(x_train, x_test)
# feature extraction
logging.info('feature extraction....')
x_train, x_test = self.extract_features(x_train, x_test)

# hack TODO: change this
# scaler = FeatureScaler()
# x_train = scaler.transform(x_train, cols)
# x_test = scaler.transform(x_test, cols)

# if 'fitting_params' in model_params['params']:
# if 'x_to_list' in model_params['params']['fitting_params']:
# if model_params['params']['fitting_params']['x_to_list']:
# x_train = self.get_list(x_train, cols)
# x_test = self.get_list(x_test, cols)

model = model.fit(x_train, y_train)

y_pred_test, y_pred_test_scores = self.predict(model, x_test, y_test)
score_test = self.evaluate(y_test, y_pred_test, y_pred_test_scores)
logging.info('model {} -- Test score {}'.format(model_name, score_test))
self.save_prediction(info_test, y_pred_test, y_pred_test_scores, y_test, i, model_name)

# logging.info('saving results')
# self.save_score(score_test, score_test, score_test, model_name )
if hasattr(model, 'save_model'):
logging.info('saving coef')
save_model(model, model_name + '_' + str(i), self.directory)

if self.save_train:
logging.info('predicting training ...')
# y_pred_train, y_pred_train_scores, score_train, confusion_mtrx = self.predict(model, x_train, y_train)
y_pred_train, y_pred_train_scores = self.predict(model, x_train, y_train)
self.save_prediction(info_train, y_pred_train, y_pred_train_scores, y_train, i, model_name,
training=True)
@@ -178,5 +159,3 @@ def save_score(self, data_params, model_params, scores, scores_mean, scores_std,
'pipeline': self.pipeline_params, 'scores': scores.to_json(),
'scores_mean': scores_mean.to_json(), 'scores_std': scores_std.to_json()},
default_flow_style=False))
#
#
56 changes: 0 additions & 56 deletions pipeline/one_split.py
@@ -10,11 +10,8 @@
import yaml
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix

from data.data_access import Data
from model.model_factory import get_model
# from features_processing import feature_extraction
# from features_processing.feature_scaler import FeatureScaler
from pipeline.pipe_utils import get_model_id, get_coef_from_model, get_balanced
from preprocessing import pre
from utils.evaluate import evalualte_survival, evalualte_classification_binary, evalualte_regression
@@ -44,7 +41,6 @@ def get_model_name(model):
class OneSplitPipeline:
def __init__(self, task, data_params, pre_params, feature_params, model_params, pipeline_params, exp_name):

# self.eval_dataset = eval_dataset
self.task = task
if type(data_params) == list:
self.data_params = data_params
@@ -67,7 +63,6 @@ def __init__(self, task, data_params, pre_params, feature_params, model_params,
self.prapre_saving_dir()

def prapre_saving_dir(self):
# self.directory = self.exp_name + timeStamp
self.directory = self.exp_name
if not exists(self.directory):
makedirs(self.directory)
@@ -106,12 +101,7 @@ def get_list(self, x, cols):
return genes_list

def get_train_test(self, data):
# x_train, x_test, y_train, y_test= data.get_train_test()
x_train, x_test, y_train, y_test, info_train, info_test, columns = data.get_train_test()
# info_train= x_train.index
# info_test= x_test.index
# columns=x_train.columns
# balance data
balance_train = False
balance_test = False
p = self.pipeline_params['params']
@@ -142,7 +132,6 @@ def run(self):
logging.info('loading data....')
data = Data(**data_params)
# get data
# x_train, x_test, y_train, y_test, info_train, info_test, cols = self.get_train_test(data)
x_train, x_validate_, x_test_, y_train, y_validate_, y_test_, info_train, info_validate_, info_test_, cols = data.get_train_validate_test()

logging.info('predicting')
@@ -151,22 +140,10 @@
y_t = y_validate_
info_t = info_validate_
else:
# print type(x_train)
# print type(y_train)
# print type(info_train)
#
# x_train = np.concatenate((x_train , x_validate_))
# y_train = np.concatenate((y_train , y_validate_))
# info_train = info_train.append(info_validate_)

x_t = np.concatenate((x_test_, x_validate_))
y_t = np.concatenate((y_test_, y_validate_))
info_t = info_test_.append(info_validate_)

# x_t = x_test_
# y_t = y_test_
# info_t = info_test_

logging.info('x_train {} y_train {} '.format(x_train.shape, y_train.shape))
logging.info('x_test {} y_test {} '.format(x_t.shape, y_t.shape))

@@ -180,19 +157,15 @@
model = get_model(model_params_)
logging.info('fitting')
logging.info(model_params_)
# model = model.fit(x_train, y_train)
if model_params_['type'] == 'nn' and not self.eval_dataset == 'validation':
model = model.fit(x_train, y_train, x_validate_, y_validate_)
# model = model.fit(x_train, y_train, x_train, y_train)
else:
model = model.fit(x_train, y_train)
# # model_list.append(model)
logging.info('predicting')

model_name = get_model_name(model_params_)
model_name = model_name + '_' + data_id
model_params_['id'] = model_name

logging.info('model id: {}'.format(model_name))
model_list.append((model, model_params_))
y_pred_test, y_pred_test_scores = self.predict(model, x_test, y_t)
@@ -201,7 +174,6 @@
test_scores.append(test_score)
model_names.append(model_name)
logging.info('saving results')
# self.save_score(test_score, model_name)
self.save_score(data_params, model_params_, test_score, model_name)
self.save_prediction(info_t, y_pred_test, y_pred_test_scores, y_t, model_name)
y_test_list.append(y_t)
@@ -228,14 +200,9 @@ def run(self):
self.save_prediction(info_train, y_pred_train, y_pred_train_scores, y_train, model_name,
training=True)

# self.save_coef(model_list, cols)
# auc_fig.savefig(join(self.directory, 'auc_curves'))
# prc_fig.savefig(join(self.directory, 'auprc_curves'))
test_scores = pd.DataFrame(test_scores, index=model_names)
generate_plots(test_scores, self.directory)
self.save_all_scores(test_scores)
# self.plot_coef(model_list)
# self.save_cnf_matrix(cnf_matrix_list, model_names)

if self.task == 'classification_binary':
auc_fig = plt.figure()
@@ -246,8 +213,6 @@ def run(self):
y_pred_test_scores_list, model_names):
plot_roc(auc_fig, y_test, y_pred_test_scores, self.directory, label=model_name)
plot_prc(prc_fig, y_test, y_pred_test_scores, self.directory, label=model_name)
# cnf_matrix = confusion_matrix(y_test, y_pred_test)
# save_confusion_matrix(cnf_matrix, self.directory, model_name)
auc_fig.savefig(join(self.directory, 'auc_curves'))
prc_fig.savefig(join(self.directory, 'auprc_curves'))
return test_scores
@@ -269,13 +234,11 @@ def save_coef(self, model_list, cols):
makedirs(dir_name)

for model, model_params in model_list:
# print model_params
model_name = get_model_id(model_params)
c_ = get_coef_from_model(model)
logging.info('saving coef ')
model_name_col = model_name
if hasattr(model, 'get_named_coef') and c_ is not None:
# print 'save_feature_importance'
file_name = join(dir_name, 'coef_' + model_name)
coef = model.get_named_coef()
if type(coef) == list:
@@ -294,18 +257,13 @@ def save_coef(self, model_list, cols):
file_name = join(dir_name, 'coef.csv')
coef_df.to_csv(file_name)

# Plot normalized confusion matrix
# plt.figure()
# plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
# title='Normalized confusion matrix')

def plot_coef(self, model_list):
for model, model_name in model_list:
plt.figure()
file_name = join(self.directory, 'coef_' + model_name)
for coef in model.coef_:
plt.hist(coef, bins=20)
# plt.hist(model.coef_[1], bins=20)
plt.savefig(file_name)

def save_all_scores(self, scores):
@@ -327,30 +285,17 @@ def save_score(self, data_params, model_params, score, model_name):

with open(file_name, 'w') as yaml_file:
yaml_file.write(
# yaml.dump([self.data_params, self.model_params, self.pre_params, str(score)], default_flow_style=False))
yaml.dump(yml_dict, default_flow_style=False)
)

# with open(file_name, 'w') as yaml_file:
# yaml_file.write(
# yaml.dump([self.data_params, self.model_params, self.pre_params, str(score)], default_flow_style=False))

def predict(self, model, x_test, y_test):
logging.info('predicitng ...')
# if hasattr(model, 'transform'):
# y_pred_test = model.transform(x_test)
# else:
# pass
y_pred_test = model.predict(x_test)
if hasattr(model, 'predict_proba'):
y_pred_test_scores = model.predict_proba(x_test)[:, 1]
else:
y_pred_test_scores = y_pred_test

# y_pred_test_scores = model.predict_proba(x_test)[:,1]
# logging.info('scoring ...')
# score = evalualte(y_test, y_pred_test, y_pred_test_scores)
# cnf_matrix = confusion_matrix(y_test, y_pred_test)
print 'y_pred_test', y_pred_test.shape, y_pred_test_scores.shape
return y_pred_test, y_pred_test_scores

@@ -374,7 +319,6 @@ def extract_features(self, x_train, x_test):

proc = feature_extraction.get_processor(self.features_params)
if proc:
# proc.fit(x_train)
x_train = proc.transform(x_train)
x_test = proc.transform(x_test)

3 changes: 0 additions & 3 deletions preprocessing/pre.py
@@ -3,7 +3,6 @@
import copy
from sklearn import preprocessing as p

__author__ = 'marakeby'

def get_processor(args):
print args
@@ -40,8 +39,6 @@ def get_processor(args):
def remove_outliers(y):
m = np.mean(y)
s = np.std(y)


y2 = copy.deepcopy(y)
s = np.std(y)
n = 4
Empty file removed try_things/__init__.py
Empty file.
51 changes: 0 additions & 51 deletions try_things/load_model_test.py

This file was deleted.
