-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Nikola Simidjievski
committed
Nov 5, 2019
1 parent
9a19a80
commit 96530c1
Showing
98 changed files
with
43,870 additions
and
0 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
from misc.helpers import get_data | ||
import os | ||
import glob | ||
import pandas as pd | ||
import numpy as np | ||
|
||
# Maps a dataset key to the directory its CSV files are read from.
# The first four entries point at 5-fold stratified splits; 'W' points at the
# data root itself (presumably the "whole" dataset used by DatasetWhole --
# confirm against callers).
type_to_data = dict(
    ER=r"../data/5-fold_ERstratified",
    IC=r"../data/5-fold_ic10stratified",
    PAM=r"../data/5-fold_pam50stratified",
    DR=r"../data/5-fold_DRstratified",
    W=r"../data/",
)
|
||
|
||
class Dataset:
    """Train/test data of one cross-validation fold, loaded from CSV files.

    After construction:
      * ``type``  -- the ``dtype`` key that was looked up in ``type_to_data``
      * ``fold``  -- the fold identifier as passed in
      * ``train`` -- result of ``get_data`` on the fold's ``*train.csv`` file
      * ``test``  -- result of ``get_data`` on the fold's ``*test.csv`` file
    """

    def __init__(self, dtype, fold):
        """Load the train and test files of ``fold`` for dataset ``dtype``.

        Args:
            dtype: key of ``type_to_data`` selecting the base directory.
            fold: fold identifier; the files are looked up in the
                sub-directory ``"fold" + str(fold)``.

        Raises:
            KeyError: if ``dtype`` is not a key of ``type_to_data``.
            FileNotFoundError: if the fold directory has no matching CSV file.
        """
        self.type = dtype
        self.fold = fold
        self.train, self.test = self._get_data(dtype, fold)

    def _get_data(self, dtype, fold):
        """Return ``(train, test)`` as produced by ``get_data``."""
        # str() lets callers pass the fold number as an int as well as a str.
        foldpath = os.path.join(type_to_data[dtype], "fold" + str(fold))
        dev = self._read_last_match(foldpath, "*test.csv")
        train = self._read_last_match(foldpath, "*train.csv")
        return get_data(train), get_data(dev)

    @staticmethod
    def _read_last_match(folder, pattern):
        """Read the last CSV in ``folder`` whose name matches ``pattern``."""
        matches = glob.glob(folder + "/" + pattern)
        if not matches:
            # Fail with the offending location instead of an unbound-variable
            # error further down.
            raise FileNotFoundError(
                "no file matching %r found in %r" % (pattern, folder)
            )
        # When several files match, the last one returned by glob is used;
        # this is the file the earlier read-all-keep-last loop ended up with.
        return pd.read_csv(matches[-1], index_col=None, header=0)
|
||
class DatasetWhole:
    """A complete (unsplit) dataset loaded from a single CSV file.

    After construction:
      * ``type``  -- the ``dtype`` key that was looked up in ``type_to_data``
      * ``train`` -- result of ``get_data`` on the CSV file that was read
    """

    def __init__(self, dtype):
        """Load the CSV found in the directory registered for ``dtype``.

        Args:
            dtype: key of ``type_to_data`` selecting the directory.

        Raises:
            KeyError: if ``dtype`` is not a key of ``type_to_data``.
            FileNotFoundError: if the directory contains no ``*.csv`` file.
        """
        self.type = dtype
        self.train = self._get_data(dtype)

    def _get_data(self, dtype):
        """Return ``get_data`` applied to the directory's CSV file."""
        folder = type_to_data[dtype]
        csv_files = glob.glob(folder + "/*.csv")
        if not csv_files:
            # Fail with the offending location instead of an unbound-variable
            # error at the return statement.
            raise FileNotFoundError("no CSV file found in %r" % (folder,))
        # When the directory holds several CSV files, the last one returned by
        # glob is used; glob's ordering is not guaranteed, so keep a single
        # CSV there if the choice matters.
        train = pd.read_csv(csv_files[-1], index_col=None, header=0)
        return get_data(train)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,130 @@ | ||
import numpy as np | ||
import pandas as pd | ||
import os | ||
|
||
|
||
def to_categorical(data, dtype=None):
    """Encode values as integer codes numbered by order of first appearance.

    The first distinct value seen gets code 0, the next new value code 1, and
    so on; repeated values reuse their code.

    Args:
        data: iterable of hashable values.
        dtype: when equal to ``'ic'`` the values are normalised first: anything
            outside the known label list becomes ``'1'``, and ``'4ER+'`` /
            ``'4ER-'`` are merged into ``'4'``.

    Returns:
        ``np.ndarray`` holding one integer code per input value.
    """
    known_ic_labels = ('1', '2', '3', '4ER+', '4ER-', '5', '6', '7', '8', '9', '10')
    merged_into_four = ('4ER+', '4ER-')

    codes = {}
    encoded = []
    for label in data:
        if dtype == 'ic':
            if label not in known_ic_labels:
                label = '1'
            elif label in merged_into_four:
                label = '4'
        # len(codes) is evaluated before insertion, so a new label receives
        # the next unused code and a known label keeps its existing one.
        encoded.append(codes.setdefault(label, len(codes)))
    return np.array(encoded)
|
||
|
||
def get_data(data,
             complete_data_path=r"../data/MBdata_33CLINwMiss_1KfGE_1KfCNA.csv"):
    """Split one fold's table into label lists and numeric arrays.

    Args:
        data: DataFrame of one fold. It must contain ``METABRIC_ID``, the
            label columns ``iC10``, ``Pam50Subtype``, ``ER_Expr``, ``PR_Expr``,
            ``Her2_Expr`` and ``DR``; columns whose names start with ``GE`` or
            ``CNA`` are taken as the expression / copy-number features.
        complete_data_path: CSV with the clinical columns of every sample.
            The one-hot encoding is fitted on this complete table so each fold
            gets the same columns.

    Returns:
        dict with
          * ``ic``, ``pam50``, ``er``, ``pr``, ``her2`` -- raw label lists
          * ``icnp``, ``pam50np``, ``ernp``, ``prnp``, ``her2np``, ``drnp`` --
            integer codes from ``to_categorical``
          * ``rnanp`` -- ``GE*`` columns as float32
          * ``cnanp`` -- ``CNA*`` columns as float32, shifted by +2 and divided
            by 4 (maps -2..2 onto 0..1; presumably the range of the CNA
            calls -- confirm)
          * ``clin``  -- one-hot clinical features as float32, one row per fold
            sample found in the complete table, in fold order
    """
    d = {}
    clin_fold = data[["METABRIC_ID"]]

    rna = data[[col for col in data if col.startswith('GE')]]
    cna = data[[col for col in data if col.startswith('CNA')]]

    d['ic'] = list(data['iC10'].values)
    d['pam50'] = list(data['Pam50Subtype'].values)
    d['er'] = list(data['ER_Expr'].values)
    d['pr'] = list(data['PR_Expr'].values)
    d['her2'] = list(data['Her2_Expr'].values)
    # Holds the raw DR labels only until they are encoded below.
    d['drnp'] = list(data['DR'].values)

    d['rnanp'] = rna.astype(np.float32).values
    d['cnanp'] = ((cna.astype(np.float32).values + 2.0) / 4.0)
    d['icnp'] = to_categorical(d['ic'], dtype='ic')
    d['pam50np'] = to_categorical(d['pam50'])
    d['ernp'] = to_categorical(d['er'])
    d['prnp'] = to_categorical(d['pr'])
    d['her2np'] = to_categorical(d['her2'])
    d['drnp'] = to_categorical(d['drnp'])

    # Preprocessing for clinical data to match the current pipeline: every
    # clinical column is one-hot encoded (numeric ones after binning).
    complete_data = pd.read_csv(complete_data_path, index_col=None, header=0)
    clin_transformed = _encode_clinical(complete_data)

    # Now create the fold data by selecting from the complete transformed
    # clinical data.
    fold_ids = clin_fold["METABRIC_ID"].tolist()
    clin_transformed = _rows_in_fold_order(clin_transformed, fold_ids)
    clin_transformed = clin_transformed.drop(columns='METABRIC_ID')

    d['clin'] = clin_transformed.astype(np.float32).values
    return d


def _encode_clinical(complete_data):
    """Return ``METABRIC_ID`` plus one-hot columns for the clinical fields."""

    def one_hot(values, prefix):
        # dummy_na=True adds an explicit indicator column for missing values.
        return pd.get_dummies(values, prefix=prefix, dummy_na=True)

    # Numeric columns binned into equal-width ranges, then one-hot encoded.
    # The age bins are taken from Age_At_Diagnosis; they used to be computed
    # from NPI, which made "aad" a second copy of the NPI information.
    aad = one_hot(
        pd.cut(complete_data["Age_At_Diagnosis"], 10,
               labels=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
        "aad",
    )
    npi = one_hot(
        pd.cut(complete_data["NPI"], 6, labels=[1, 2, 3, 4, 5, 6]),
        "npi",
    )
    # Size is not binned: every distinct value becomes its own column.
    size = one_hot(complete_data["Size"], "size")

    # Categorical and ordinal columns straight to one-hot dummies.
    btl = one_hot(complete_data["Breast_Tumour_Laterality"], "btl")
    ims = one_hot(complete_data["Inferred_Menopausal_State"], "ims")
    lnp = one_hot(complete_data["Lymph_Nodes_Positive"], "lnp")
    grade = one_hot(complete_data["Grade"], "grade")
    hist = one_hot(complete_data["Histological_Type"], "hist")
    cellularity = one_hot(complete_data["Cellularity"], "cellularity")
    ct = one_hot(complete_data["CT"], "ct")
    ht = one_hot(complete_data["HT"], "ht")
    rt = one_hot(complete_data["RT"], "rt")

    # NOTE(review): `size` is listed twice, so its columns are duplicated in
    # the output. This is kept deliberately: dropping one copy would change
    # the width of d['clin'], which downstream input sizes may rely on.
    return pd.concat(
        [complete_data[["METABRIC_ID"]], aad, npi, size, btl, ims, lnp, grade,
         size, hist, cellularity, ct, ht, rt],
        axis=1,
    )


def _rows_in_fold_order(clin, fold_ids):
    """Return the rows of ``clin`` whose id is in ``fold_ids``, in fold order."""
    in_fold = clin.loc[clin['METABRIC_ID'].isin(fold_ids)]

    # Position of each id's first occurrence in the fold.
    rank = {}
    for position, sample_id in enumerate(fold_ids):
        rank.setdefault(sample_id, position)

    # A plain isin() filter leaves rows in complete-table order, which only
    # lines up with the fold's RNA/CNA/label rows if both files happen to be
    # sorted alike. A stable sort by fold position guarantees the alignment
    # and is a no-op when the orders already agree.
    ranks = in_fold['METABRIC_ID'].map(rank)
    return in_fold.iloc[np.argsort(ranks.values, kind='stable')]
|
||
def normalizeRNA(*args):
    """Min-max scale each column to the range 0..1.

    Args:
        *args: one array, or a train array followed by a test array. With two
            arrays the column minima and maxima are computed over both stacked
            together, so the two parts share one scale. Arrays after the
            second are ignored.

    Returns:
        The scaled array for a single input, or the tuple
        ``(scaled_train, scaled_test)`` for two inputs.

    Raises:
        IndexError: if called without arguments.
    """

    def _min_max(values):
        low = values.min(axis=0)
        span = values.max(axis=0) - low
        # A constant column has span 0; dividing by 1 instead yields 0 for it
        # rather than NaN from 0/0.
        safe_span = np.where(span == 0, np.ones_like(span), span)
        return (values - low) / safe_span

    if len(args) > 1:
        n_train = args[0].shape[0]
        scaled = _min_max(np.concatenate((args[0], args[1]), axis=0))
        return scaled[:n_train], scaled[n_train:]
    return _min_max(args[0])
|
||
|
||
def save_embedding(savedir, savefile, *args):
    """Write one or two embedding arrays to ``savedir/savefile`` with ``np.savez``.

    Args:
        savedir: directory of the output file.
        savefile: file name of the archive.
        *args: the train embedding, optionally followed by the test embedding.
            They are stored under the keys ``emb_train`` and ``emb_test``;
            arrays after the second are ignored.
    """
    target = os.path.join(savedir, savefile)
    arrays = {'emb_train': args[0]}
    if len(args) > 1:
        arrays['emb_test'] = args[1]
    np.savez(target, **arrays)
|
||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,146 @@ | ||
from keras import backend as K | ||
from keras import optimizers | ||
from keras.layers import BatchNormalization as BN, Concatenate, Dense, Input, Lambda | ||
from keras.models import Model | ||
import os | ||
from cb_code.models.common import sse, bce, sampling | ||
|
||
|
||
class HVAE:
    """Keras variational autoencoder that can be built in one of three layouts.

    ``type`` selects the layout used by :meth:`build_model`:

    * ``'CNA'`` -- input width ``args.cna_input_size``, reconstruction ``bce``
    * ``'RNA'`` -- input width ``args.rna_input_size``, reconstruction ``sse``
    * ``'H'``   -- input width ``args.ds``, reconstruction ``sse``

    ``args`` must provide ``ds``, ``act``, ``epochs``, ``bs`` and
    ``save_model``, plus ``cna_input_size`` / ``rna_input_size`` / ``ls`` for
    the layout that reads them.

    NOTE(review): ``sse``, ``bce`` and ``sampling`` come from
    ``cb_code.models.common``; presumably sum of squared errors, binary
    cross-entropy and the sampling step of the latent layer -- confirm there.
    """

    def __init__(self, args, type):
        self.args = args
        self.type = type
        self.vae = None
        self.encoder = None
        # Set by the build_* methods and read by build_model.
        self.z_mean = None
        self.z_log_sigma = None
        self.reconstruction_loss = None

    def build_model(self):
        """Build the network for ``self.type``, attach the loss and compile.

        Raises:
            ValueError: if ``self.type`` is not ``'CNA'``, ``'RNA'`` or ``'H'``.
        """
        if self.type == 'CNA':
            self.build_cna()
        elif self.type == 'RNA':
            self.build_rna()
        elif self.type == 'H':
            self.build_merged()
        else:
            raise ValueError('Unrecognised HVAE network type')

        # Define the loss: reconstruction term plus the KL term
        # -0.5 * sum(1 + s - mean^2 - exp(s)), with s = z_log_sigma.
        # NOTE(review): this is the closed-form KL to a unit Gaussian if
        # z_log_sigma holds the log-variance -- confirm against `sampling`.
        kl_loss = 1 + self.z_log_sigma - K.square(self.z_mean) - K.exp(self.z_log_sigma)
        kl_loss = K.sum(kl_loss, axis=-1)
        kl_loss *= -0.5
        vae_loss = K.mean(self.reconstruction_loss + 1.0 * kl_loss)
        self.vae.add_loss(vae_loss)

        adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, amsgrad=False)
        # The loss was attached with add_loss, so compile only gets the optimizer.
        self.vae.compile(optimizer=adam)
        self.vae.summary()

    def build_cna(self):
        """Build the ``'CNA'`` layout: hidden width ``ds``, latent ``ds // 2``."""
        self._build_vae(self.args.cna_input_size, self.args.ds,
                        self.args.ds // 2, bce, 'vae_cna')

    def build_rna(self):
        """Build the ``'RNA'`` layout: hidden width ``ds``, latent ``ds // 2``."""
        self._build_vae(self.args.rna_input_size, self.args.ds,
                        self.args.ds // 2, sse, 'vae_rna')

    def build_merged(self):
        """Build the ``'H'`` layout: input ``ds``, hidden ``ds // 2``, latent ``ls``."""
        self._build_vae(self.args.ds, self.args.ds // 2,
                        self.args.ls, sse, 'vae_merged')

    def _build_vae(self, input_size, hidden_size, latent_size, loss_fn, vae_name):
        """Create encoder, decoder and end-to-end model for one layout.

        Sets ``self.z_mean``, ``self.z_log_sigma``, ``self.encoder``,
        ``self.vae`` and ``self.reconstruction_loss``. Layers are created in
        the same order and with the same names for every layout.
        """
        activation = self.args.act

        # Build the encoder network
        # ------------ Input -----------------
        inp = Input(shape=(input_size,))

        # ------------ Hidden Layer -----------------
        x = Dense(hidden_size, activation=activation)(inp)
        x = BN()(x)

        # ------------ Embedding Layer --------------
        self.z_mean = Dense(latent_size, name='z_mean')(x)
        self.z_log_sigma = Dense(latent_size, name='z_log_sigma')(x)
        z = Lambda(sampling, output_shape=(latent_size,), name='z')([self.z_mean, self.z_log_sigma])

        self.encoder = Model(inp, [self.z_mean, self.z_log_sigma, z], name='encoder')
        self.encoder.summary()

        # Build the decoder network
        # ------------ Dense out -----------------
        latent_inputs = Input(shape=(latent_size,), name='z_sampling')
        x = Dense(hidden_size, activation=activation)(latent_inputs)
        x = BN()(x)

        # ------------ Out -----------------------
        # No activation is passed, so the reconstruction layer is linear.
        reconstruction = Dense(input_size)(x)

        decoder = Model(latent_inputs, reconstruction, name='decoder')
        decoder.summary()

        # Index 2 of the encoder outputs is the sampled z.
        output = decoder(self.encoder(inp)[2])
        self.vae = Model(inp, output, name=vae_name)
        self.reconstruction_loss = loss_fn(inp, output)

    def train(self, train, test):
        """Fit the model on ``train``, validating on ``test``.

        When ``args.save_model`` is truthy the weights are written to
        ``./models/vae_x_mlp.h5`` afterwards.
        """
        # No targets are passed: the loss was attached with add_loss.
        self.vae.fit(train, epochs=self.args.epochs, batch_size=self.args.bs, shuffle=True,
                     validation_data=(test, None))
        if self.args.save_model:
            weights_path = './models/vae_x_mlp.h5'
            # Create the directory first so a finished training run is not
            # lost to a missing output folder.
            os.makedirs(os.path.dirname(weights_path), exist_ok=True)
            self.vae.save_weights(weights_path)

    def predict(self, inp):
        """Return the encoder's ``z_mean`` output for ``inp``."""
        return self.encoder.predict(inp, batch_size=self.args.bs)[0]
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Oops, something went wrong.