Skip to content

Commit

Permalink
initial version
Browse files Browse the repository at this point in the history
  • Loading branch information
Nikola Simidjievski committed Nov 5, 2019
1 parent 9a19a80 commit 96530c1
Show file tree
Hide file tree
Showing 98 changed files with 43,870 additions and 0 deletions.
525 changes: 525 additions & 0 deletions code/analyse_representations.py

Large diffs are not rendered by default.

Empty file added code/misc/__init__.py
Empty file.
Binary file added code/misc/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file added code/misc/__pycache__/__init__.cpython-37.pyc
Binary file not shown.
Binary file added code/misc/__pycache__/dataset.cpython-36.pyc
Binary file not shown.
Binary file added code/misc/__pycache__/dataset.cpython-37.pyc
Binary file not shown.
Binary file added code/misc/__pycache__/datasetWhole.cpython-36.pyc
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added code/misc/__pycache__/helpers.cpython-36.pyc
Binary file not shown.
Binary file added code/misc/__pycache__/helpers.cpython-37.pyc
Binary file not shown.
Binary file not shown.
Binary file not shown.
45 changes: 45 additions & 0 deletions code/misc/dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from misc.helpers import get_data
import os
import glob
import pandas as pd
import numpy as np

# Maps a dataset-type key to the directory holding its CSV files.
# The paths are relative, so they resolve against the current working
# directory of whichever script imports this module.
# The first four entries point at directories that contain one "fold<k>"
# sub-directory per cross-validation fold (see Dataset._get_data).
# NOTE(review): 'W' presumably stands for the whole, unsplit dataset used by
# DatasetWhole -- confirm against the callers.
type_to_data = {
'ER': r"../data/5-fold_ERstratified",
'IC': r"../data/5-fold_ic10stratified",
'PAM': r"../data/5-fold_pam50stratified",
'DR': r"../data/5-fold_DRstratified",
'W': r"../data/",

}


class Dataset:
    """Train/test data for one cross-validation fold of a dataset type.

    Attributes:
        type: The dataset-type key that was passed in (a key of
            ``type_to_data``).
        fold: The fold identifier that was passed in.
        train: Result of ``get_data`` applied to the fold's ``*train.csv``.
        test: Result of ``get_data`` applied to the fold's ``*test.csv``.
    """

    def __init__(self, dtype, fold):
        self.type = dtype
        self.fold = fold
        self.train, self.test = self._get_data(dtype, fold)

    def _get_data(self, dtype, fold):
        """Load and preprocess the train and test CSV files of one fold.

        Args:
            dtype: Key of ``type_to_data`` selecting the base directory.
            fold: Fold identifier; appended to ``"fold"`` to form the
                sub-directory name. May be a string or an int.

        Returns:
            A ``(train, test)`` tuple of ``get_data`` results.

        Raises:
            KeyError: If ``dtype`` is not a key of ``type_to_data``.
            FileNotFoundError: If the fold directory has no matching
                ``*train.csv`` or ``*test.csv`` file.
        """
        # str() lets callers pass the fold number as an int; the original
        # string concatenation raised TypeError in that case.
        foldpath = os.path.join(type_to_data[dtype], "fold" + str(fold))
        dev = self._read_last_match(foldpath + "/*test.csv")
        train = self._read_last_match(foldpath + "/*train.csv")
        return get_data(train), get_data(dev)

    @staticmethod
    def _read_last_match(pattern):
        """Read the last CSV file matching the glob *pattern*."""
        matches = glob.glob(pattern)
        if not matches:
            # Previously an empty match list fell through the loop and
            # surfaced later as an unrelated UnboundLocalError.
            raise FileNotFoundError(
                "No CSV file matches pattern: {}".format(pattern))
        # When several files match, the last one returned by glob is used,
        # which is what the original read-in-a-loop code ended up keeping.
        return pd.read_csv(matches[-1], index_col=None, header=0)

class DatasetWhole:
    """A complete (unsplit) dataset loaded from a single directory.

    Attributes:
        type: The dataset-type key that was passed in (a key of
            ``type_to_data``).
        train: Result of ``get_data`` applied to the CSV file found in the
            directory mapped to ``dtype``.
    """

    def __init__(self, dtype):
        self.type = dtype
        self.train = self._get_data(dtype)

    def _get_data(self, dtype):
        """Load and preprocess the CSV file of the given dataset type.

        Args:
            dtype: Key of ``type_to_data`` selecting the directory.

        Returns:
            The ``get_data`` result for the loaded table.

        Raises:
            KeyError: If ``dtype`` is not a key of ``type_to_data``.
            FileNotFoundError: If the directory contains no ``*.csv`` file.
        """
        pattern = type_to_data[dtype] + "/*.csv"
        matches = glob.glob(pattern)
        if not matches:
            # Previously an empty match list fell through the loop and
            # surfaced as an unrelated UnboundLocalError.
            raise FileNotFoundError(
                "No CSV file matches pattern: {}".format(pattern))
        # When several files match, the last one returned by glob is used,
        # which is what the original read-in-a-loop code ended up keeping.
        train = pd.read_csv(matches[-1], index_col=None, header=0)
        return get_data(train)
130 changes: 130 additions & 0 deletions code/misc/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import numpy as np
import pandas as pd
import os


def to_categorical(data, dtype=None):
    """Encode the values of *data* as consecutive integer codes.

    Codes are handed out in order of first appearance (the first distinct
    value becomes 0, the next one 1, and so on), so the mapping depends on
    the order of *data* and is not shared between separate calls.

    Args:
        data: Iterable of hashable labels.
        dtype: When equal to ``'ic'``, labels are normalised first: anything
            outside the known iC10 label set is replaced by ``'1'``, and
            ``'4ER+'`` / ``'4ER-'`` are merged into ``'4'``.

    Returns:
        A NumPy array holding one integer code per element of *data*.
    """
    known_ic = ['1', '2', '3', '4ER+', '4ER-', '5', '6', '7', '8', '9', '10']
    merged_ic = ['4ER+', '4ER-']

    codes = {}
    encoded = []
    for label in data:
        if dtype == 'ic':
            if label not in known_ic:
                label = '1'
            if label in merged_ic:
                label = '4'
        # setdefault assigns the next free code only to unseen labels.
        encoded.append(codes.setdefault(label, len(codes)))
    return np.array(encoded)


def _one_hot_clinical(complete_data):
    """One-hot encode the clinical columns of the complete data table.

    Every clinical variable is turned into dummy columns with an extra NaN
    indicator column; age at diagnosis and NPI are first cut into 10 and 6
    equal-width bins respectively.

    Returns a DataFrame with one row per row of *complete_data*, sharing its
    index.
    """
    def dummies(column, prefix):
        return pd.get_dummies(complete_data[column], prefix=prefix, dummy_na=True)

    # Numerical variables binned to arbitrary equal-width ranges.
    # The age bins are built from "Age_At_Diagnosis"; the original code
    # mistakenly reused the "NPI" column here.
    aad = pd.get_dummies(
        pd.cut(complete_data["Age_At_Diagnosis"], 10,
               labels=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
        prefix="aad", dummy_na=True)
    npi = pd.get_dummies(
        pd.cut(complete_data["NPI"], 6, labels=[1, 2, 3, 4, 5, 6]),
        prefix="npi", dummy_na=True)
    # Size is not binned: one dummy column per distinct value.
    size = dummies("Size", "size")

    # Categorical and ordinal variables.
    btl = dummies("Breast_Tumour_Laterality", "btl")
    ims = dummies("Inferred_Menopausal_State", "ims")
    lnp = dummies("Lymph_Nodes_Positive", "lnp")
    grade = dummies("Grade", "grade")
    hist = dummies("Histological_Type", "hist")
    cellularity = dummies("Cellularity", "cellularity")
    ct = dummies("CT", "ct")
    ht = dummies("HT", "ht")
    rt = dummies("RT", "rt")

    # NOTE(review): `size` appears twice, exactly as in the original column
    # layout. It is kept so the clinical feature width that downstream
    # models were configured for does not change -- confirm whether the
    # duplication is intended.
    return pd.concat(
        [aad, npi, size, btl, ims, lnp, grade, size, hist, cellularity, ct, ht, rt],
        axis=1)


def get_data(data, complete_data_path=r"../data/MBdata_33CLINwMiss_1KfGE_1KfCNA.csv"):
    """Convert a fold table into the dictionary of arrays used for training.

    Args:
        data: DataFrame of one fold. It must contain the columns
            ``METABRIC_ID``, ``iC10``, ``Pam50Subtype``, ``ER_Expr``,
            ``PR_Expr``, ``Her2_Expr`` and ``DR``; columns whose names start
            with ``GE`` / ``CNA`` are taken as the expression / copy-number
            features.
        complete_data_path: CSV file of the complete dataset. The clinical
            one-hot encoding is computed on this file so that every fold
            gets the same dummy columns. Defaults to the path that was
            previously hard-coded.

    Returns:
        A dict with
        * raw label lists: ``ic``, ``pam50``, ``er``, ``pr``, ``her2``, ``dr``;
        * integer-coded labels (see ``to_categorical``): ``icnp``,
          ``pam50np``, ``ernp``, ``prnp``, ``her2np``, ``drnp``;
        * ``rnanp``: float32 array of the ``GE*`` columns;
        * ``cnanp``: float32 array of the ``CNA*`` columns, shifted and
          scaled as ``(x + 2) / 4``;
        * ``clin``: float32 array of one-hot clinical features for the
          samples of this fold, in the row order of *data*.
    """
    d = {}

    rna = data[[col for col in data if col.startswith('GE')]]
    cna = data[[col for col in data if col.startswith('CNA')]]

    d['ic'] = list(data['iC10'].values)
    d['pam50'] = list(data['Pam50Subtype'].values)
    d['er'] = list(data['ER_Expr'].values)
    d['pr'] = list(data['PR_Expr'].values)
    d['her2'] = list(data['Her2_Expr'].values)
    # The raw DR labels used to be stored under 'drnp' and then overwritten;
    # they are now kept under 'dr', mirroring the other label pairs.
    d['dr'] = list(data['DR'].values)

    d['rnanp'] = rna.astype(np.float32).values
    # NOTE(review): presumably CNA calls lie in [-2, 2], which this maps to
    # [0, 1] -- confirm against the data.
    d['cnanp'] = ((cna.astype(np.float32).values + 2.0) / 4.0)
    d['icnp'] = to_categorical(d['ic'], dtype='ic')
    d['pam50np'] = to_categorical(d['pam50'])
    d['ernp'] = to_categorical(d['er'])
    d['prnp'] = to_categorical(d['pr'])
    d['her2np'] = to_categorical(d['her2'])
    d['drnp'] = to_categorical(d['dr'])

    # Clinical data: the one-hot encoding is derived from the complete
    # dataset (so the set of dummy columns is identical for every fold) and
    # then restricted to the samples of this fold.
    complete_data = pd.read_csv(complete_data_path, index_col=None, header=0)
    clin_all = _one_hot_clinical(complete_data)

    fold_ids = data["METABRIC_ID"].tolist()
    all_ids = complete_data["METABRIC_ID"]
    in_fold = all_ids.isin(fold_ids)

    # Boolean selection returns rows in the order of the complete table.
    # Re-order them to follow the fold so that row i of 'clin' describes the
    # same sample as row i of 'rnanp' / 'cnanp'. The sort is stable
    # (mergesort), so nothing changes when the two orders already agree.
    position = {sample_id: pos for pos, sample_id in enumerate(fold_ids)}
    rank = all_ids[in_fold].map(position).values
    order = np.argsort(rank, kind="mergesort")
    clin_fold = clin_all.loc[in_fold].iloc[order]

    d['clin'] = clin_fold.astype(np.float32).values
    return d

def normalizeRNA(*args):
    """Min-max scale each feature (column) to the range [0, 1].

    Args:
        *args: One array, or two arrays with the same number of columns.
            With two arrays the minimum and maximum are computed over both
            of them stacked together, so both are scaled with the same
            statistics. Arguments beyond the second are ignored.

    Returns:
        The scaled array when one array is given; otherwise a tuple
        ``(first_scaled, second_scaled)``.

    A feature that is constant over the data has a zero range; it is mapped
    to 0 instead of producing NaN through a 0/0 division.
    """
    def min_max(values):
        low = values.min(axis=0)
        span = values.max(axis=0) - low
        # Replace zero ranges by one so constant features scale to 0.
        # ones_like keeps the dtype of `span` (e.g. float32 stays float32).
        span = np.where(span == 0, np.ones_like(span), span)
        return (values - low) / span

    if len(args) > 1:
        first = np.asarray(args[0])
        second = np.asarray(args[1])
        scaled = min_max(np.concatenate((first, second), axis=0))
        n_first = first.shape[0]
        return scaled[:n_first], scaled[n_first:]
    return min_max(np.asarray(args[0]))


def save_embedding(savedir, savefile, *args):
    """Write one or two embedding arrays to an ``.npz`` archive.

    Args:
        savedir: Directory the archive is written to.
        savefile: File name of the archive inside *savedir*.
        *args: The first array is stored under the key ``emb_train``; a
            second array, when given, is stored under ``emb_test``.
    """
    target = os.path.join(savedir, savefile)
    arrays = {'emb_train': args[0]}
    if len(args) > 1:
        arrays['emb_test'] = args[1]
    np.savez(target, **arrays)





146 changes: 146 additions & 0 deletions code/models/.ipynb_checkpoints/hvae-checkpoint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
from keras import backend as K
from keras import optimizers
from keras.layers import BatchNormalization as BN, Concatenate, Dense, Input, Lambda
from keras.models import Model
import os
from cb_code.models.common import sse, bce, sampling


class HVAE:
    """Variational autoencoder used for one stage of a hierarchical VAE.

    Depending on ``type`` the network is built for the CNA input (``'CNA'``,
    BCE reconstruction loss), the RNA input (``'RNA'``, SSE reconstruction
    loss) or for an input of width ``args.ds`` (``'H'``, SSE reconstruction
    loss).

    Attributes read from ``args``: ``cna_input_size``, ``rna_input_size``,
    ``ds``, ``ls``, ``act``, ``epochs``, ``bs`` and ``save_model``.
    """

    def __init__(self, args, type):
        self.args = args
        self.type = type
        self.vae = None
        self.encoder = None

    def build_model(self):
        """Build the network selected by ``self.type`` and compile it.

        Raises:
            ValueError: If ``self.type`` is not ``'CNA'``, ``'RNA'`` or ``'H'``.
        """
        dispatch = (
            ('CNA', self.build_cna),
            ('RNA', self.build_rna),
            ('H', self.build_merged),
        )
        for kind, builder in dispatch:
            if self.type == kind:
                builder()
                break
        else:
            raise ValueError('Unrecognised HVAE network type')

        # Total loss: reconstruction term plus the KL term, averaged.
        kl_terms = 1 + self.z_log_sigma - K.square(self.z_mean) - K.exp(self.z_log_sigma)
        kl_loss = K.sum(kl_terms, axis=-1) * -0.5
        vae_loss = K.mean(self.reconstruction_loss + 1.0 * kl_loss)
        self.vae.add_loss(vae_loss)

        adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, amsgrad=False)
        self.vae.compile(optimizer=adam)
        self.vae.summary()

    def _build_network(self, input_dim, hidden_dim, latent_dim, vae_name, loss_fn):
        """Create encoder, decoder and full VAE for the given layer widths.

        Sets ``self.z_mean``, ``self.z_log_sigma``, ``self.encoder``,
        ``self.vae`` and ``self.reconstruction_loss``.
        """
        # ---- Encoder: input -> dense + batch norm -> (z_mean, z_log_sigma, z)
        inp = Input(shape=(input_dim,))
        hidden = Dense(hidden_dim, activation=self.args.act)(inp)
        hidden = BN()(hidden)

        self.z_mean = Dense(latent_dim, name='z_mean')(hidden)
        self.z_log_sigma = Dense(latent_dim, name='z_log_sigma')(hidden)
        z = Lambda(sampling, output_shape=(latent_dim,), name='z')([self.z_mean, self.z_log_sigma])

        self.encoder = Model(inp, [self.z_mean, self.z_log_sigma, z], name='encoder')
        self.encoder.summary()

        # ---- Decoder: latent sample -> dense + batch norm -> linear output
        latent_inputs = Input(shape=(latent_dim,), name='z_sampling')
        decoded = Dense(hidden_dim, activation=self.args.act)(latent_inputs)
        decoded = BN()(decoded)
        reconstruction = Dense(input_dim, )(decoded)

        decoder = Model(latent_inputs, reconstruction, name='decoder')
        decoder.summary()

        # ---- Full model: the decoder consumes the sampled z (encoder output 2)
        output = decoder(self.encoder(inp)[2])
        self.vae = Model(inp, output, name=vae_name)
        self.reconstruction_loss = loss_fn(inp, output)

    def build_cna(self):
        """Build the VAE for the CNA input (hidden ``ds``, latent ``ds // 2``)."""
        self._build_network(self.args.cna_input_size, self.args.ds,
                            self.args.ds // 2, 'vae_cna', bce)

    def build_rna(self):
        """Build the VAE for the RNA input (hidden ``ds``, latent ``ds // 2``)."""
        self._build_network(self.args.rna_input_size, self.args.ds,
                            self.args.ds // 2, 'vae_rna', sse)

    def build_merged(self):
        """Build the VAE over a ``ds``-wide input (hidden ``ds // 2``, latent ``ls``)."""
        self._build_network(self.args.ds, self.args.ds // 2,
                            self.args.ls, 'vae_merged', sse)

    def train(self, train, test):
        """Fit the VAE on *train*, validating on *test*.

        The weights are saved afterwards when ``args.save_model`` is truthy.
        """
        self.vae.fit(
            train,
            epochs=self.args.epochs,
            batch_size=self.args.bs,
            shuffle=True,
            validation_data=(test, None),
        )
        if self.args.save_model:
            self.vae.save_weights('./models/vae_x_mlp.h5')

    def predict(self, inp):
        """Return the encoder's first output (``z_mean``) for *inp*."""
        encoder_outputs = self.encoder.predict(inp, batch_size=self.args.bs)
        return encoder_outputs[0]
Empty file added code/models/__init__.py
Empty file.
Binary file added code/models/__pycache__/TCxvae.cpython-36.pyc
Binary file not shown.
Binary file added code/models/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file added code/models/__pycache__/__init__.cpython-37.pyc
Binary file not shown.
Binary file added code/models/__pycache__/cncvae.cpython-36.pyc
Binary file not shown.
Binary file added code/models/__pycache__/cncvae.cpython-37.pyc
Binary file not shown.
Binary file added code/models/__pycache__/common.cpython-36.pyc
Binary file not shown.
Binary file added code/models/__pycache__/common.cpython-37.pyc
Binary file not shown.
Binary file added code/models/__pycache__/deepXvae.cpython-36.pyc
Binary file not shown.
Binary file added code/models/__pycache__/deepXvaeX.cpython-36.pyc
Binary file not shown.
Binary file not shown.
Binary file added code/models/__pycache__/hvae.cpython-36.pyc
Binary file not shown.
Binary file added code/models/__pycache__/hvaeSameLS.cpython-36.pyc
Binary file not shown.
Binary file added code/models/__pycache__/mlp.cpython-36.pyc
Binary file not shown.
Binary file added code/models/__pycache__/mmvae.cpython-36.pyc
Binary file not shown.
Binary file added code/models/__pycache__/single.cpython-36.pyc
Binary file not shown.
Binary file added code/models/__pycache__/twoS_XVAE.cpython-36.pyc
Binary file not shown.
Binary file added code/models/__pycache__/xvae.cpython-36.pyc
Binary file not shown.
Binary file added code/models/__pycache__/xvae.cpython-37.pyc
Binary file not shown.
Loading

0 comments on commit 96530c1

Please sign in to comment.