
Commit

code cleaning
marakeby committed Sep 22, 2021
1 parent 5eeb1fd commit d757f15
Showing 6 changed files with 0 additions and 132 deletions.
1 change: 0 additions & 1 deletion model/callbacks_custom.py
@@ -1,6 +1,5 @@
import logging
import warnings

import numpy as np
import pandas as pd
from keras.callbacks import Callback
21 changes: 0 additions & 21 deletions pipeline/crossvalidation_pipeline.py
@@ -11,7 +11,6 @@
from sklearn.model_selection import StratifiedKFold

from data.data_access import Data
# from features_processing.feature_scaler import FeatureScaler
from model.model_factory import get_model
from pipeline.one_split import OneSplitPipeline
from utils.plots import plot_box_plot
@@ -61,7 +60,6 @@ def run(self, n_splits=5):
logging.info('fitting model ...')

for model_param in self.model_params:
# model_name = m['type']
if 'id' in model_param:
model_name = model_param['id']
else:
@@ -77,7 +75,6 @@
scores_df, scores_mean, scores_std = get_mean_variance(scores)
list_model_scores.append(scores_df)
model_names.append(model_name)
# self.save_score(scores_df, scores_mean, scores_std, model_param['type'])
self.save_score(data_params, m_param, scores_df, scores_mean, scores_std, model_name)
logging.info('scores')
logging.info(scores_df)
@@ -113,44 +110,28 @@ def train_predict_crossvalidation(self, model_params, X, y, info, cols, model_na
for train_index, test_index in skf.split(X, y.ravel()):
model = get_model(model_params)
logging.info('fold # ----------------%d---------' % i)
# x_train, x_test = X.iloc[train_index], X.iloc[test_index]
x_train, x_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
info_train = pd.DataFrame(index=info[train_index])
info_test = pd.DataFrame(index=info[test_index])
# info_test = info.iloc[test_index,:].copy()
x_train, x_test = self.preprocess(x_train, x_test)
# feature extraction
logging.info('feature extraction....')
x_train, x_test = self.extract_features(x_train, x_test)

# hack TODO: change this
# scaler = FeatureScaler()
# x_train = scaler.transform(x_train, cols)
# x_test = scaler.transform(x_test, cols)

# if 'fitting_params' in model_params['params']:
# if 'x_to_list' in model_params['params']['fitting_params']:
# if model_params['params']['fitting_params']['x_to_list']:
# x_train = self.get_list(x_train, cols)
# x_test = self.get_list(x_test, cols)

model = model.fit(x_train, y_train)

y_pred_test, y_pred_test_scores = self.predict(model, x_test, y_test)
score_test = self.evaluate(y_test, y_pred_test, y_pred_test_scores)
logging.info('model {} -- Test score {}'.format(model_name, score_test))
self.save_prediction(info_test, y_pred_test, y_pred_test_scores, y_test, i, model_name)

# logging.info('saving results')
# self.save_score(score_test, score_test, score_test, model_name )
if hasattr(model, 'save_model'):
logging.info('saving coef')
save_model(model, model_name + '_' + str(i), self.directory)

if self.save_train:
logging.info('predicting training ...')
# y_pred_train, y_pred_train_scores, score_train, confusion_mtrx = self.predict(model, x_train, y_train)
y_pred_train, y_pred_train_scores = self.predict(model, x_train, y_train)
self.save_prediction(info_train, y_pred_train, y_pred_train_scores, y_train, i, model_name,
training=True)
@@ -178,5 +159,3 @@ def save_score(self, data_params, model_params, scores, scores_mean, scores_std,
'pipeline': self.pipeline_params, 'scores': scores.to_json(),
'scores_mean': scores_mean.to_json(), 'scores_std': scores_std.to_json()},
default_flow_style=False))
#
#
56 changes: 0 additions & 56 deletions pipeline/one_split.py
@@ -10,11 +10,8 @@
import yaml
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix

from data.data_access import Data
from model.model_factory import get_model
# from features_processing import feature_extraction
# from features_processing.feature_scaler import FeatureScaler
from pipeline.pipe_utils import get_model_id, get_coef_from_model, get_balanced
from preprocessing import pre
from utils.evaluate import evalualte_survival, evalualte_classification_binary, evalualte_regression
@@ -44,7 +41,6 @@ def get_model_name(model):
class OneSplitPipeline:
def __init__(self, task, data_params, pre_params, feature_params, model_params, pipeline_params, exp_name):

# self.eval_dataset = eval_dataset
self.task = task
if type(data_params) == list:
self.data_params = data_params
@@ -67,7 +63,6 @@ def __init__(self, task, data_params, pre_params, feature_params, model_params,
self.prapre_saving_dir()

def prapre_saving_dir(self):
# self.directory = self.exp_name + timeStamp
self.directory = self.exp_name
if not exists(self.directory):
makedirs(self.directory)
@@ -106,12 +101,7 @@ def get_list(self, x, cols):
return genes_list

def get_train_test(self, data):
# x_train, x_test, y_train, y_test= data.get_train_test()
x_train, x_test, y_train, y_test, info_train, info_test, columns = data.get_train_test()
# info_train= x_train.index
# info_test= x_test.index
# columns=x_train.columns
# balance data
balance_train = False
balance_test = False
p = self.pipeline_params['params']
@@ -142,7 +132,6 @@ def run(self):
logging.info('loading data....')
data = Data(**data_params)
# get data
# x_train, x_test, y_train, y_test, info_train, info_test, cols = self.get_train_test(data)
x_train, x_validate_, x_test_, y_train, y_validate_, y_test_, info_train, info_validate_, info_test_, cols = data.get_train_validate_test()

logging.info('predicting')
@@ -151,22 +140,10 @@
y_t = y_validate_
info_t = info_validate_
else:
# print type(x_train)
# print type(y_train)
# print type(info_train)
#
# x_train = np.concatenate((x_train , x_validate_))
# y_train = np.concatenate((y_train , y_validate_))
# info_train = info_train.append(info_validate_)

x_t = np.concatenate((x_test_, x_validate_))
y_t = np.concatenate((y_test_, y_validate_))
info_t = info_test_.append(info_validate_)

# x_t = x_test_
# y_t = y_test_
# info_t = info_test_

logging.info('x_train {} y_train {} '.format(x_train.shape, y_train.shape))
logging.info('x_test {} y_test {} '.format(x_t.shape, y_t.shape))

@@ -180,19 +157,15 @@
model = get_model(model_params_)
logging.info('fitting')
logging.info(model_params_)
# model = model.fit(x_train, y_train)
if model_params_['type'] == 'nn' and not self.eval_dataset == 'validation':
model = model.fit(x_train, y_train, x_validate_, y_validate_)
# model = model.fit(x_train, y_train, x_train, y_train)
else:
model = model.fit(x_train, y_train)
# # model_list.append(model)
logging.info('predicting')

model_name = get_model_name(model_params_)
model_name = model_name + '_' + data_id
model_params_['id'] = model_name

logging.info('model id: {}'.format(model_name))
model_list.append((model, model_params_))
y_pred_test, y_pred_test_scores = self.predict(model, x_test, y_t)
@@ -201,7 +174,6 @@
test_scores.append(test_score)
model_names.append(model_name)
logging.info('saving results')
# self.save_score(test_score, model_name)
self.save_score(data_params, model_params_, test_score, model_name)
self.save_prediction(info_t, y_pred_test, y_pred_test_scores, y_t, model_name)
y_test_list.append(y_t)
@@ -228,14 +200,9 @@ def run(self):
self.save_prediction(info_train, y_pred_train, y_pred_train_scores, y_train, model_name,
training=True)

# self.save_coef(model_list, cols)
# auc_fig.savefig(join(self.directory, 'auc_curves'))
# prc_fig.savefig(join(self.directory, 'auprc_curves'))
test_scores = pd.DataFrame(test_scores, index=model_names)
generate_plots(test_scores, self.directory)
self.save_all_scores(test_scores)
# self.plot_coef(model_list)
# self.save_cnf_matrix(cnf_matrix_list, model_names)

if self.task == 'classification_binary':
auc_fig = plt.figure()
@@ -246,8 +213,6 @@ def run(self):
y_pred_test_scores_list, model_names):
plot_roc(auc_fig, y_test, y_pred_test_scores, self.directory, label=model_name)
plot_prc(prc_fig, y_test, y_pred_test_scores, self.directory, label=model_name)
# cnf_matrix = confusion_matrix(y_test, y_pred_test)
# save_confusion_matrix(cnf_matrix, self.directory, model_name)
auc_fig.savefig(join(self.directory, 'auc_curves'))
prc_fig.savefig(join(self.directory, 'auprc_curves'))
return test_scores
@@ -269,13 +234,11 @@ def save_coef(self, model_list, cols):
makedirs(dir_name)

for model, model_params in model_list:
# print model_params
model_name = get_model_id(model_params)
c_ = get_coef_from_model(model)
logging.info('saving coef ')
model_name_col = model_name
if hasattr(model, 'get_named_coef') and c_ is not None:
# print 'save_feature_importance'
file_name = join(dir_name, 'coef_' + model_name)
coef = model.get_named_coef()
if type(coef) == list:
@@ -294,18 +257,13 @@ def save_coef(self, model_list, cols):
file_name = join(dir_name, 'coef.csv')
coef_df.to_csv(file_name)

# Plot normalized confusion matrix
# plt.figure()
# plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
# title='Normalized confusion matrix')

def plot_coef(self, model_list):
for model, model_name in model_list:
plt.figure()
file_name = join(self.directory, 'coef_' + model_name)
for coef in model.coef_:
plt.hist(coef, bins=20)
# plt.hist(model.coef_[1], bins=20)
plt.savefig(file_name)

def save_all_scores(self, scores):
@@ -327,30 +285,17 @@ def save_score(self, data_params, model_params, score, model_name):

with open(file_name, 'w') as yaml_file:
yaml_file.write(
# yaml.dump([self.data_params, self.model_params, self.pre_params, str(score)], default_flow_style=False))
yaml.dump(yml_dict, default_flow_style=False)
)

# with open(file_name, 'w') as yaml_file:
# yaml_file.write(
# yaml.dump([self.data_params, self.model_params, self.pre_params, str(score)], default_flow_style=False))

def predict(self, model, x_test, y_test):
logging.info('predicitng ...')
# if hasattr(model, 'transform'):
# y_pred_test = model.transform(x_test)
# else:
# pass
y_pred_test = model.predict(x_test)
if hasattr(model, 'predict_proba'):
y_pred_test_scores = model.predict_proba(x_test)[:, 1]
else:
y_pred_test_scores = y_pred_test

# y_pred_test_scores = model.predict_proba(x_test)[:,1]
# logging.info('scoring ...')
# score = evalualte(y_test, y_pred_test, y_pred_test_scores)
# cnf_matrix = confusion_matrix(y_test, y_pred_test)
print 'y_pred_test', y_pred_test.shape, y_pred_test_scores.shape
return y_pred_test, y_pred_test_scores

@@ -374,7 +319,6 @@ def extract_features(self, x_train, x_test):

proc = feature_extraction.get_processor(self.features_params)
if proc:
# proc.fit(x_train)
x_train = proc.transform(x_train)
x_test = proc.transform(x_test)

3 changes: 0 additions & 3 deletions preprocessing/pre.py
@@ -3,7 +3,6 @@
import copy
from sklearn import preprocessing as p

__author__ = 'marakeby'

def get_processor(args):
print args
@@ -40,8 +39,6 @@ def get_processor(args):
def remove_outliers(y):
m = np.mean(y)
s = np.std(y)


y2 = copy.deepcopy(y)
s = np.std(y)
n = 4
Empty file removed try_things/__init__.py
Empty file.
51 changes: 0 additions & 51 deletions try_things/load_model_test.py

This file was deleted.
