initial version

CancerAI-CL · Nov 5, 2019 · 96530c1 · 96530c1
1 parent 9a19a80
commit 96530c1
Show file tree

Hide file tree

Showing 98 changed files with 43,870 additions and 0 deletions.
diff --git a/code/analyse_representations.py b/code/analyse_representations.py
diff --git a/code/misc/__init__.py b/code/misc/__init__.py
diff --git a/code/misc/__pycache__/__init__.cpython-36.pyc b/code/misc/__pycache__/__init__.cpython-36.pyc
diff --git a/code/misc/__pycache__/__init__.cpython-37.pyc b/code/misc/__pycache__/__init__.cpython-37.pyc
diff --git a/code/misc/__pycache__/dataset.cpython-36.pyc b/code/misc/__pycache__/dataset.cpython-36.pyc
diff --git a/code/misc/__pycache__/dataset.cpython-37.pyc b/code/misc/__pycache__/dataset.cpython-37.pyc
diff --git a/code/misc/__pycache__/datasetWhole.cpython-36.pyc b/code/misc/__pycache__/datasetWhole.cpython-36.pyc
diff --git a/code/misc/__pycache__/datasetWholeTemp.cpython-36.pyc b/code/misc/__pycache__/datasetWholeTemp.cpython-36.pyc
diff --git a/code/misc/__pycache__/dataset_scale.cpython-36.pyc b/code/misc/__pycache__/dataset_scale.cpython-36.pyc
diff --git a/code/misc/__pycache__/helpers.cpython-36.pyc b/code/misc/__pycache__/helpers.cpython-36.pyc
diff --git a/code/misc/__pycache__/helpers.cpython-37.pyc b/code/misc/__pycache__/helpers.cpython-37.pyc
diff --git a/code/misc/__pycache__/helpers_scale.cpython-36.pyc b/code/misc/__pycache__/helpers_scale.cpython-36.pyc
diff --git a/code/misc/__pycache__/helpers_scaleTemp.cpython-36.pyc b/code/misc/__pycache__/helpers_scaleTemp.cpython-36.pyc
diff --git a/code/misc/dataset.py b/code/misc/dataset.py
@@ -0,0 +1,45 @@
+from misc.helpers import get_data
+import os
+import glob
+import pandas as pd
+import numpy as np
+
+# Directories (relative to the working directory) that hold the CSV inputs,
+# keyed by the dataset type code passed to Dataset / DatasetWhole.
+# The "5-fold_*" entries are expected to contain "fold<k>" sub-directories;
+# 'W' is the data root itself (presumably the whole, un-folded dataset).
+type_to_data = dict(
+    ER="../data/5-fold_ERstratified",
+    IC="../data/5-fold_ic10stratified",
+    PAM="../data/5-fold_pam50stratified",
+    DR="../data/5-fold_DRstratified",
+    W="../data/",
+)
+
+
+class Dataset:
+    """Train/test data of a single cross-validation fold.
+
+    Attributes:
+        type: the dataset type code (a key of ``type_to_data``).
+        fold: the fold identifier that was requested.
+        train, test: the ``get_data`` results for the fold's ``*train.csv``
+            and ``*test.csv`` files.
+    """
+
+    def __init__(self, dtype, fold):
+        self.type = dtype
+        self.fold = fold
+        self.train, self.test = self._get_data(dtype, fold)
+
+    @staticmethod
+    def _read_last_csv(pattern):
+        """Read the last CSV file matching ``pattern`` into a DataFrame.
+
+        Raises:
+            FileNotFoundError: if no file matches the pattern.
+        """
+        matches = glob.glob(pattern)
+        if not matches:
+            raise FileNotFoundError("no CSV file matches " + repr(pattern))
+        # When several files match, the last one in glob order wins, as
+        # before; the earlier ones are no longer parsed and thrown away.
+        return pd.read_csv(matches[-1], index_col=None, header=0)
+
+    def _get_data(self, dtype, fold):
+        """Load the fold's CSVs and return ``(train, test)`` as ``get_data`` dicts.
+
+        Args:
+            dtype: key of ``type_to_data`` selecting the fold directory root.
+            fold: fold identifier; converted with ``str`` so both ``1`` and
+                ``"1"`` resolve to the ``fold1`` sub-directory.
+
+        Raises:
+            KeyError: if ``dtype`` is not a key of ``type_to_data``.
+            FileNotFoundError: if the train or test CSV is missing.
+        """
+        foldpath = os.path.join(type_to_data[dtype], "fold" + str(fold))
+        dev = self._read_last_csv(foldpath + "/*test.csv")
+        train = self._read_last_csv(foldpath + "/*train.csv")
+        return get_data(train), get_data(dev)
+
+class DatasetWhole:
+    """Dataset loaded from a single CSV, without a train/test split.
+
+    Attributes:
+        type: the dataset type code (a key of ``type_to_data``).
+        train: the ``get_data`` result for the CSV found in that directory.
+    """
+
+    def __init__(self, dtype):
+        self.type = dtype
+        self.train = self._get_data(dtype)
+
+    def _get_data(self, dtype):
+        """Load the CSV under ``type_to_data[dtype]`` and run ``get_data`` on it.
+
+        Raises:
+            KeyError: if ``dtype`` is not a key of ``type_to_data``.
+            FileNotFoundError: if the directory contains no ``*.csv`` file
+                (previously this surfaced as an UnboundLocalError).
+        """
+        foldpath = type_to_data[dtype]
+        pattern = foldpath + "/*.csv"
+        matches = glob.glob(pattern)
+        if not matches:
+            raise FileNotFoundError("no CSV file matches " + repr(pattern))
+        # When several files match, the last one in glob order wins, as
+        # before; the earlier ones are no longer parsed and thrown away.
+        train = pd.read_csv(matches[-1], index_col=None, header=0)
+        return get_data(train)
diff --git a/code/misc/helpers.py b/code/misc/helpers.py
@@ -0,0 +1,130 @@
+import numpy as np
+import pandas as pd
+import os
+
+
+def to_categorical(data, dtype=None):
+    """Encode values as integer codes assigned in order of first appearance.
+
+    Args:
+        data: iterable of hashable labels.
+        dtype: when equal to ``'ic'``, labels are first normalised: anything
+            outside the known label list becomes ``'1'`` and the two
+            ``'4ER+'`` / ``'4ER-'`` labels are merged into ``'4'``.
+
+    Returns:
+        ``np.ndarray`` with one integer code per input element. The codes
+        depend on the order of ``data``: the first distinct label gets 0,
+        the next new label gets 1, and so on.
+    """
+    known_ic = ('1', '2', '3', '4ER+', '4ER-', '5', '6', '7', '8', '9', '10')
+    merged_ic = ('4ER+', '4ER-')
+
+    codes = {}
+    encoded = []
+    for raw in data:
+        label = raw
+        if dtype == 'ic':
+            if label not in known_ic:
+                label = '1'
+            elif label in merged_ic:
+                label = '4'
+        # A new label receives the next free code, which is the dict's size.
+        encoded.append(codes.setdefault(label, len(codes)))
+    return np.array(encoded)
+
+
+def get_data(data, complete_data_path=r"../data/MBdata_33CLINwMiss_1KfGE_1KfCNA.csv"):
+    """Split one fold's DataFrame into label lists and model-ready arrays.
+
+    Args:
+        data: DataFrame of one fold. Must contain ``METABRIC_ID``, ``iC10``,
+            ``Pam50Subtype``, ``ER_Expr``, ``PR_Expr``, ``Her2_Expr`` and
+            ``DR`` columns, plus feature columns whose names start with
+            ``GE`` and ``CNA``.
+        complete_data_path: CSV holding the clinical columns of the complete
+            cohort. The one-hot encoding is fitted on this file so that every
+            fold gets the same clinical feature columns.
+
+    Returns:
+        dict with:
+          * ``'ic'``, ``'pam50'``, ``'er'``, ``'pr'``, ``'her2'``: raw label lists
+          * ``'icnp'``, ``'pam50np'``, ``'ernp'``, ``'prnp'``, ``'her2np'``,
+            ``'drnp'``: integer codes from ``to_categorical``
+          * ``'rnanp'``: float32 matrix of the ``GE*`` columns
+          * ``'cnanp'``: float32 matrix of the ``CNA*`` columns, shifted by +2
+            and divided by 4
+          * ``'clin'``: float32 one-hot clinical matrix, one row per row of
+            ``data`` and in the same order
+
+    Raises:
+        KeyError: if a required column is missing, or (on recent pandas) if a
+            ``METABRIC_ID`` of ``data`` is absent from the complete-data file.
+    """
+
+    def one_hot(series, prefix):
+        # Missing values get their own indicator column.
+        return pd.get_dummies(series, prefix=prefix, dummy_na=True)
+
+    d = {}
+
+    rna = data[[col for col in data if col.startswith('GE')]]
+    cna = data[[col for col in data if col.startswith('CNA')]]
+
+    d['ic'] = list(data['iC10'].values)
+    d['pam50'] = list(data['Pam50Subtype'].values)
+    d['er'] = list(data['ER_Expr'].values)
+    d['pr'] = list(data['PR_Expr'].values)
+    d['her2'] = list(data['Her2_Expr'].values)
+    # Unlike the other labels, the raw DR list is not kept under its own key:
+    # 'drnp' is replaced by its categorical codes below.
+    d['drnp'] = list(data['DR'].values)
+
+    d['rnanp'] = rna.astype(np.float32).values
+    # presumably CNA calls lie in [-2, 2], so this lands in [0, 1] -- TODO confirm
+    d['cnanp'] = ((cna.astype(np.float32).values + 2.0) / 4.0)
+    d['icnp'] = to_categorical(d['ic'], dtype='ic')
+    d['pam50np'] = to_categorical(d['pam50'])
+    d['ernp'] = to_categorical(d['er'])
+    d['prnp'] = to_categorical(d['pr'])
+    d['her2np'] = to_categorical(d['her2'])
+    d['drnp'] = to_categorical(d['drnp'])
+
+    # ---- Preprocessing of the clinical data to match the current pipeline ----
+    #
+    # Clinical columns, quick descriptions:
+    #   Age_At_Diagnosis           truly numeric
+    #   Breast_Tumour_Laterality   categorical "L, R" (3 unique)
+    #   NPI                        truly numeric
+    #   Inferred_Menopausal_State  categorical "Pre, Post" (3 unique)
+    #   Lymph_Nodes_Positive       ordinal ints 0-24
+    #   Grade                      ordinal strings 1-3 + "?"
+    #   Size                       truly numeric
+    #   Histological_Type          categorical strings (9 unique)
+    #   Cellularity                categorical strings (4 unique)
+    #   Breast_Surgery             categorical strings (3 unique)
+    #   CT / HT / RT               categorical strings (9 unique each)
+    #
+    # Everything is turned into one-hot indicators (the original author notes
+    # this was chosen so the block can be trained with a BCE loss). The
+    # encoding is computed on the complete cohort, then the fold's rows are
+    # selected from it.
+    complete_data = pd.read_csv(complete_data_path, index_col=None, header=0)
+
+    metabric_id = complete_data[["METABRIC_ID"]]
+
+    # Numeric columns binned into equal-width ranges, then one-hot encoded.
+    # The age bins are cut from Age_At_Diagnosis; the previous code cut them
+    # from NPI, so the "aad_*" columns were a second copy of NPI.
+    aad = one_hot(pd.cut(complete_data["Age_At_Diagnosis"], 10,
+                         labels=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), "aad")
+    npi = one_hot(pd.cut(complete_data["NPI"], 6, labels=[1, 2, 3, 4, 5, 6]), "npi")
+    # Size is NOT binned: every distinct value becomes its own column.
+    size = one_hot(complete_data["Size"], "size")
+
+    # Categorical and ordinal columns to one-hot dummies.
+    btl = one_hot(complete_data["Breast_Tumour_Laterality"], "btl")
+    ims = one_hot(complete_data["Inferred_Menopausal_State"], "ims")
+    lnp = one_hot(complete_data["Lymph_Nodes_Positive"], "lnp")
+    grade = one_hot(complete_data["Grade"], "grade")
+    hist = one_hot(complete_data["Histological_Type"], "hist")
+    cellularity = one_hot(complete_data["Cellularity"], "cellularity")
+    ct = one_hot(complete_data["CT"], "ct")
+    ht = one_hot(complete_data["HT"], "ht")
+    rt = one_hot(complete_data["RT"], "rt")
+
+    # NOTE(review): `size` appears twice in this list, so its columns are
+    # duplicated in the output. Kept as-is because removing it changes the
+    # width of d['clin'], which downstream model input sizes may rely on.
+    clin_transformed = pd.concat(
+        [metabric_id, aad, npi, size, btl, ims, lnp, grade, size,
+         hist, cellularity, ct, ht, rt],
+        axis=1)
+
+    # Select the fold's rows *in the fold's own order*. A boolean `isin` mask
+    # would keep the complete file's order instead and silently misalign
+    # d['clin'] with d['rnanp'] / d['cnanp'] whenever the two orders differ.
+    fold_ids = data["METABRIC_ID"].tolist()
+    clin_transformed = clin_transformed.set_index("METABRIC_ID").loc[fold_ids]
+
+    d['clin'] = clin_transformed.astype(np.float32).values
+    return d
+
+def normalizeRNA(*args):
+    """Min-max scale each column to the range [0, 1].
+
+    Args:
+        *args: one array, or two arrays (train, test) with the same number
+            of columns. Arguments beyond the second are ignored.
+
+    Returns:
+        With one argument, the scaled array. With two or more, a
+        ``(train, test)`` tuple scaled with the column minima and maxima of
+        the two arrays stacked together, so both parts share one scale.
+
+    Columns whose values are all equal are mapped to 0 instead of producing
+    NaN through a 0/0 division.
+    """
+    def _min_max(matrix):
+        lowest = matrix.min(axis=0)
+        span = matrix.max(axis=0) - lowest
+        # A constant column has span 0; divide by 1 so it scales to 0.
+        span = np.where(span == 0, 1, span)
+        return (matrix - lowest) / span
+
+    if len(args) > 1:
+        n_train = args[0].shape[0]
+        scaled = _min_max(np.concatenate((args[0], args[1]), axis=0))
+        return scaled[:n_train], scaled[n_train:]
+    return _min_max(args[0])
+
+
+def save_embedding(savedir,savefile, *args):
+    """Store one or two embedding arrays in a NumPy ``.npz`` archive.
+
+    Args:
+        savedir: directory to write into.
+        savefile: archive file name inside ``savedir``.
+        *args: the train embedding, optionally followed by the test
+            embedding. Arguments beyond the second are ignored.
+
+    The archive holds the first array under ``emb_train`` and, when given,
+    the second under ``emb_test``.
+    """
+    target = os.path.join(savedir, savefile)
+    arrays = {'emb_train': args[0]}
+    if len(args) > 1:
+        arrays['emb_test'] = args[1]
+    np.savez(target, **arrays)
+
+
+
+
+
diff --git a/code/models/.ipynb_checkpoints/hvae-checkpoint.py b/code/models/.ipynb_checkpoints/hvae-checkpoint.py
@@ -0,0 +1,146 @@
+from keras import backend as K
+from keras import optimizers
+from keras.layers import BatchNormalization as BN, Concatenate, Dense, Input, Lambda
+from keras.models import Model
+import os
+from cb_code.models.common import sse, bce, sampling
+
+
+class HVAE:
+    """Variational autoencoder with three selectable configurations.
+
+    ``type`` selects which network ``build_model`` creates:
+      * ``'CNA'``: input ``args.cna_input_size``, hidden ``args.ds``,
+        latent ``args.ds // 2``, reconstruction loss ``bce``
+      * ``'RNA'``: input ``args.rna_input_size``, hidden ``args.ds``,
+        latent ``args.ds // 2``, reconstruction loss ``sse``
+      * ``'H'``: input ``args.ds``, hidden ``args.ds // 2``,
+        latent ``args.ls``, reconstruction loss ``sse``
+
+    ``args`` must also provide ``act``, ``epochs``, ``bs`` and ``save_model``.
+    """
+
+    def __init__(self, args, type):
+        self.args = args
+        self.type = type
+        self.vae = None
+        self.encoder = None
+        # Set by the build_* methods; declared here so the attributes exist
+        # before build_model() has been called.
+        self.z_mean = None
+        self.z_log_sigma = None
+        self.reconstruction_loss = None
+
+    def build_model(self):
+        """Build the network selected by ``self.type`` and compile it.
+
+        Raises:
+            ValueError: if ``self.type`` is not ``'CNA'``, ``'RNA'`` or ``'H'``.
+        """
+        if self.type == 'CNA':
+            self.build_cna()
+        elif self.type == 'RNA':
+            self.build_rna()
+        elif self.type == 'H':
+            self.build_merged()
+        else:
+            raise ValueError('Unrecognised HVAE network type')
+
+        # Define the loss
+        # -0.5 * sum(1 + s - mu^2 - exp(s)): this has the form of the KL
+        # divergence to a standard normal if z_log_sigma holds a log-variance
+        # (despite its name) -- TODO confirm against `sampling`.
+        kl_loss = 1 + self.z_log_sigma - K.square(self.z_mean) - K.exp(self.z_log_sigma)
+        kl_loss = K.sum(kl_loss, axis=-1)
+        kl_loss *= -0.5
+        vae_loss = K.mean(self.reconstruction_loss + 1.0 * kl_loss)
+        # The loss is attached to the model itself, so compile() gets no
+        # `loss` argument and fit() needs no targets.
+        self.vae.add_loss(vae_loss)
+
+        adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, amsgrad=False)
+        self.vae.compile(optimizer=adam)
+        self.vae.summary()
+
+    def _build_vae(self, input_size, hidden_size, latent_size, reconstruction_fn, name):
+        """Create encoder, decoder and the end-to-end VAE for one configuration.
+
+        Sets ``self.z_mean``, ``self.z_log_sigma``, ``self.encoder``,
+        ``self.vae`` and ``self.reconstruction_loss``. Layers are created in
+        the same order, with the same names, as the three original builders.
+
+        Args:
+            input_size: width of the input and of the reconstruction.
+            hidden_size: width of the single hidden layer on each side.
+            latent_size: width of the latent layers.
+            reconstruction_fn: callable ``(inp, output) -> loss tensor``.
+            name: name given to the end-to-end Keras model.
+        """
+        # Build the encoder network
+        # ------------ Input -----------------
+        inp = Input(shape=(input_size,))
+
+        # ------------ Hidden Layer -----------------
+        x = Dense(hidden_size, activation=self.args.act)(inp)
+        x = BN()(x)
+
+        # ------------ Embedding Layer --------------
+        self.z_mean = Dense(latent_size, name='z_mean')(x)
+        self.z_log_sigma = Dense(latent_size, name='z_log_sigma')(x)
+        z = Lambda(sampling, output_shape=(latent_size,), name='z')([self.z_mean, self.z_log_sigma])
+
+        self.encoder = Model(inp, [self.z_mean, self.z_log_sigma, z], name='encoder')
+        self.encoder.summary()
+
+        # Build the decoder network
+        # ------------ Dense out -----------------
+        latent_inputs = Input(shape=(latent_size,), name='z_sampling')
+        x = latent_inputs
+        x = Dense(hidden_size, activation=self.args.act)(x)
+        x = BN()(x)
+
+        # ------------ Out -----------------------
+        reconstruction = Dense(input_size, )(x)
+
+        decoder = Model(latent_inputs, reconstruction, name='decoder')
+        decoder.summary()
+
+        # The decoder consumes the third encoder output, the sampled z.
+        output = decoder(self.encoder(inp)[2])
+        self.vae = Model(inp, output, name=name)
+        self.reconstruction_loss = reconstruction_fn(inp, output)
+
+    def build_cna(self):
+        """Build the CNA network (reconstruction loss: ``bce``)."""
+        self._build_vae(self.args.cna_input_size, self.args.ds, self.args.ds // 2,
+                        bce, 'vae_cna')
+
+    def build_rna(self):
+        """Build the RNA network (reconstruction loss: ``sse``)."""
+        self._build_vae(self.args.rna_input_size, self.args.ds, self.args.ds // 2,
+                        sse, 'vae_rna')
+
+    def build_merged(self):
+        """Build the merged network over ``args.ds``-wide inputs (loss: ``sse``)."""
+        self._build_vae(self.args.ds, self.args.ds // 2, self.args.ls,
+                        sse, 'vae_merged')
+
+    def train(self, train, test):
+        """Fit the VAE on ``train``, validating on ``test``.
+
+        When ``args.save_model`` is truthy the weights are written to the
+        fixed path ``./models/vae_x_mlp.h5``, whichever network type was built.
+        """
+        self.vae.fit(train, epochs=self.args.epochs, batch_size=self.args.bs, shuffle=True,
+                     validation_data=(test, None))
+        if self.args.save_model:
+            self.vae.save_weights('./models/vae_x_mlp.h5')
+
+    def predict(self, inp):
+        """Return the encoder's first output (``z_mean``) for ``inp``."""
+        return self.encoder.predict(inp, batch_size=self.args.bs)[0]
diff --git a/code/models/__init__.py b/code/models/__init__.py
diff --git a/code/models/__pycache__/TCxvae.cpython-36.pyc b/code/models/__pycache__/TCxvae.cpython-36.pyc
diff --git a/code/models/__pycache__/__init__.cpython-36.pyc b/code/models/__pycache__/__init__.cpython-36.pyc
diff --git a/code/models/__pycache__/__init__.cpython-37.pyc b/code/models/__pycache__/__init__.cpython-37.pyc
diff --git a/code/models/__pycache__/cncvae.cpython-36.pyc b/code/models/__pycache__/cncvae.cpython-36.pyc
diff --git a/code/models/__pycache__/cncvae.cpython-37.pyc b/code/models/__pycache__/cncvae.cpython-37.pyc
diff --git a/code/models/__pycache__/common.cpython-36.pyc b/code/models/__pycache__/common.cpython-36.pyc
diff --git a/code/models/__pycache__/common.cpython-37.pyc b/code/models/__pycache__/common.cpython-37.pyc
diff --git a/code/models/__pycache__/deepXvae.cpython-36.pyc b/code/models/__pycache__/deepXvae.cpython-36.pyc
diff --git a/code/models/__pycache__/deepXvaeX.cpython-36.pyc b/code/models/__pycache__/deepXvaeX.cpython-36.pyc
diff --git a/code/models/__pycache__/deep_common.cpython-36.pyc b/code/models/__pycache__/deep_common.cpython-36.pyc
diff --git a/code/models/__pycache__/hvae.cpython-36.pyc b/code/models/__pycache__/hvae.cpython-36.pyc
diff --git a/code/models/__pycache__/hvaeSameLS.cpython-36.pyc b/code/models/__pycache__/hvaeSameLS.cpython-36.pyc
diff --git a/code/models/__pycache__/mlp.cpython-36.pyc b/code/models/__pycache__/mlp.cpython-36.pyc
diff --git a/code/models/__pycache__/mmvae.cpython-36.pyc b/code/models/__pycache__/mmvae.cpython-36.pyc
diff --git a/code/models/__pycache__/single.cpython-36.pyc b/code/models/__pycache__/single.cpython-36.pyc
diff --git a/code/models/__pycache__/twoS_XVAE.cpython-36.pyc b/code/models/__pycache__/twoS_XVAE.cpython-36.pyc
diff --git a/code/models/__pycache__/xvae.cpython-36.pyc b/code/models/__pycache__/xvae.cpython-36.pyc
diff --git a/code/models/__pycache__/xvae.cpython-37.pyc b/code/models/__pycache__/xvae.cpython-37.pyc