Merge pull request #96 from BIMSBbioinfo/covariates
Allow the user to pass clinical variables as covariates to the supervisor MLPs
borauyar authored Jan 16, 2025
2 parents b92e815 + 41959d7 commit f351624
Showing 7 changed files with 274 additions and 30 deletions.
16 changes: 16 additions & 0 deletions flexynesis/__main__.py
@@ -63,6 +63,9 @@ def main():
help="(Optional if survival variables are not set to None)."
"Which variables in 'clin.csv' to use for predictions, comma-separated if multiple",
type = str, default = None)
parser.add_argument("--covariates",
help="Which variables in 'clin.csv' to be used as feature covariates, comma-separated if multiple",
type = str, default = None)
parser.add_argument("--surv_event_var", help="Which column in 'clin.csv' to use as event/status indicator for survival modeling", type = str, default = None)
parser.add_argument("--surv_time_var", help="Which column in 'clin.csv' to use as time/duration indicator for survival modeling", type = str, default = None)
parser.add_argument('--config_path', type=str, default=None, help='Optional path to an external hyperparameter configuration file in YAML format.')
@@ -206,9 +209,22 @@ def main():
# Set concatenate to True to use early fusion, otherwise it will run intermediate fusion
# Currently, GNNs will only work in early fusion mode, but requires the data to be not concatenated
concatenate = args.fusion_type == 'early' and args.model_class != 'GNN'

# handle covariates
if args.covariates:
if args.model_class == 'GNN': # Covariates not yet supported for GNNs
warning_message = "\n".join([
"\n\n!!! Covariates are currently not supported for GNN models, they will be ignored. !!!\n\n"
])
warnings.warn(warning_message)
time.sleep(3)
covariates = None
else:
covariates = args.covariates.strip().split(',')

data_importer = flexynesis.DataImporter(path = args.data_path,
data_types = datatypes,
covariates = covariates,
concatenate = concatenate,
log_transform = args.log_transform == 'True',
variance_threshold = args.variance_threshold/100,
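For reference, the --covariates value is split on commas into a plain Python list before being handed to the DataImporter. A minimal sketch of the parsing behaviour (the column names are placeholders, not ones taken from the repository):

covariates = "age,sex,tumor_stage".strip().split(',')
print(covariates)  # ['age', 'sex', 'tumor_stage']
# Only the outer string is stripped, so "age, sex" would yield ['age', ' sex'];
# pass the value without spaces around the commas so names match clin.csv exactly.
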
60 changes: 34 additions & 26 deletions flexynesis/data.py
@@ -14,6 +14,7 @@

from sklearn.preprocessing import OrdinalEncoder, StandardScaler, MinMaxScaler, PowerTransformer
from .feature_selection import filter_by_laplacian
from .utils import get_variable_types, create_covariate_matrix

from itertools import chain

@@ -87,7 +88,7 @@ class DataImporter:
Encodes categorical labels in the annotation dataframe.
"""

def __init__(self, path, data_types, processed_dir="processed", log_transform = False, concatenate = False, restrict_to_features = None, min_features=None,
def __init__(self, path, data_types, covariates = None, processed_dir="processed", log_transform = False, concatenate = False, restrict_to_features = None, min_features=None,
top_percentile=20, correlation_threshold = 0.9, variance_threshold=0.01, na_threshold=0.1, downsample=0):
self.path = path
self.data_types = data_types
@@ -106,7 +107,8 @@ def __init__(self, path, data_types, processed_dir="processed", log_transform =
# initialize data transformers
self.transformers = None
self.downsample = downsample

self.covariates = covariates

# read user-specified feature list to restrict the analysis to that
self.restrict_to_features = restrict_to_features
self.get_user_features()
@@ -164,9 +166,6 @@ def import_data(self):
# harmonize feature sets in train/test
train_dat, test_dat = self.harmonize(train_dat, test_dat)

train_feature_ann = {}
test_feature_ann = {}

# log_transform
if self.log_transform:
print("[INFO] transforming data to log scale")
@@ -177,13 +176,19 @@
# learned from training data to apply on test data (see fit = False)
train_dat = self.normalize_data(train_dat, scaler_type="standard", fit=True)
test_dat = self.normalize_data(test_dat, scaler_type="standard", fit=False)


# if covariates are defined, create a covariate matrix and add to the dictionary of data matrices
if self.covariates:
print("[INFO] Attempting to create a covariate matrix for the covariates:",self.covariates)
train_dat['covariates'] = create_covariate_matrix(self.covariates, get_variable_types(train_ann), train_ann)
test_dat['covariates'] = create_covariate_matrix(self.covariates, get_variable_types(test_ann), test_ann)
# harmonize again to match the covariate features
train_dat, test_dat = self.harmonize(train_dat, test_dat)

# encode the variable annotations, convert data matrices and annotations pytorch datasets
training_dataset = self.get_torch_dataset(train_dat, train_ann, train_samples, train_feature_ann)
testing_dataset = self.get_torch_dataset(test_dat, test_ann, test_samples, test_feature_ann)
training_dataset = self.get_torch_dataset(train_dat, train_ann, train_samples)
testing_dataset = self.get_torch_dataset(test_dat, test_ann, test_samples)

# NOTE: Exporting to the disk happens in get_torch_dataset, so the concatenate doesn't work.
# TODO: Find better way for early integration, or move it to get_torch_dataset. Otherwise it will be ignored.
# for early fusion, concatenate all data matrices and feature lists
if self.concatenate:
training_dataset.dat = {'all': torch.cat([training_dataset.dat[x] for x in training_dataset.dat.keys()], dim = 1)}
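The create_covariate_matrix helper lives in flexynesis/utils.py and is not part of this diff. Judging from how it is called above (covariate names, the variable types of the clinical table, and the annotation DataFrame go in; a features-by-samples matrix comes out so that harmonize() can treat it like any omics layer), a rough sketch could look as follows. This is an assumption for illustration, not the actual implementation:

import pandas as pd

def create_covariate_matrix_sketch(covariates, variable_types, ann):
    """Hypothetical stand-in for flexynesis.utils.create_covariate_matrix.
    Assumes ann is a samples-by-columns clinical DataFrame: numerical covariates
    are passed through, categorical ones are one-hot encoded, and the result is
    transposed so that features are rows, matching the omics data matrices."""
    blocks = []
    for var in covariates:
        if variable_types.get(var) == 'numerical':
            blocks.append(ann[[var]].astype(float))
        else:
            blocks.append(pd.get_dummies(ann[var], prefix=var, dtype=float))
    return pd.concat(blocks, axis=1).T
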
@@ -359,11 +364,13 @@ def select_features(self, dat):

def harmonize(self, dat1, dat2):
print("\n[INFO] ----------------- Harmonizing Data Sets ----------------- ")
# common data layers
common_layers = dat1.keys() & dat2.keys()
# Get common features
common_features = {x: dat1[x].index.intersection(dat2[x].index) for x in self.data_types}
common_features = {x: dat1[x].index.intersection(dat2[x].index) for x in common_layers}
# Subset both datasets to only include common features
dat1 = {x: dat1[x].loc[common_features[x]] for x in dat1.keys()}
dat2 = {x: dat2[x].loc[common_features[x]] for x in dat2.keys()}
dat1 = {x: dat1[x].loc[common_features[x]] for x in common_layers}
dat2 = {x: dat2[x].loc[common_features[x]] for x in common_layers}
print("\n[INFO] ----------------- Finished Harmonizing ----------------- ")

return dat1, dat2
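
The switch from self.data_types to common_layers matters because the data dictionaries may now carry a 'covariates' entry that is not listed among the omics data types. A toy illustration with made-up layers and features:

import pandas as pd

dat1 = {'gex': pd.DataFrame([[1.0, 2.0]], index=['TP53'], columns=['s1', 's2']),
        'covariates': pd.DataFrame([[0.0, 1.0]], index=['sex_M'], columns=['s1', 's2'])}
dat2 = {'gex': pd.DataFrame([[3.0]], index=['TP53'], columns=['s3']),
        'covariates': pd.DataFrame([[1.0]], index=['sex_M'], columns=['s3'])}

common_layers = dat1.keys() & dat2.keys()
common_features = {x: dat1[x].index.intersection(dat2[x].index) for x in common_layers}
dat1 = {x: dat1[x].loc[common_features[x]] for x in common_layers}
dat2 = {x: dat2[x].loc[common_features[x]] for x in common_layers}
print(sorted(common_layers))  # ['covariates', 'gex']
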
@@ -391,7 +398,7 @@ def normalize_data(self, data, scaler_type="standard", fit=True):
for x in data.keys()}
return normalized_data

def get_torch_dataset(self, dat, ann, samples, feature_ann):
def get_torch_dataset(self, dat, ann, samples):

features = {x: dat[x].index for x in dat.keys()}
dat = {x: torch.from_numpy(np.array(dat[x].T)).float() for x in dat.keys()}
@@ -431,6 +438,7 @@ def encode_column(series):
variable_types.update({col: 'numerical' for col in df.select_dtypes(exclude=['object', 'category']).columns})

return df_encoded, variable_types, label_mappings


def validate_input_data(self, train_dat, test_dat):
print("\n[INFO] ----------------- Checking for problems with the input data ----------------- ")
@@ -536,21 +544,21 @@ def __len__ (self):
return len(self.samples)

def subset(self, indices):
"""Create a new dataset object containing only the specified indices.
"""Create a new dataset object containing only the specified indices.
Args:
indices (list of int): The indices of the samples to include in the subset.
Args:
indices (list of int): The indices of the samples to include in the subset.
Returns:
MultiOmicDataset: A new dataset object with the same structure but only containing the selected samples.
"""
subset_dat = {x: self.dat[x][indices] for x in self.dat.keys()}
subset_ann = {x: self.ann[x][indices] for x in self.ann.keys()}
subset_samples = [self.samples[idx] for idx in indices]
Returns:
MultiOmicDataset: A new dataset object with the same structure but only containing the selected samples.
"""
subset_dat = {x: self.dat[x][indices] for x in self.dat.keys()}
subset_ann = {x: self.ann[x][indices] for x in self.ann.keys()}
subset_samples = [self.samples[idx] for idx in indices]

# Create a new dataset object
return MultiOmicDataset(subset_dat, subset_ann, self.variable_types, self.features,
subset_samples, self.label_mappings, self.feature_ann)
# Create a new dataset object
return MultiOmicDataset(subset_dat, subset_ann, self.variable_types, self.features,
subset_samples, self.label_mappings, self.feature_ann)

def get_feature_subset(self, feature_df):
"""Get a subset of data matrices corresponding to specified features and concatenate them into a pandas DataFrame.
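Taken together, the data.py changes mean that requesting covariates simply adds one more layer to the imported datasets. A hedged usage sketch — the path, data types, and covariate names are placeholders, and it assumes import_data() returns the train/test datasets as it does elsewhere in flexynesis:

import flexynesis

importer = flexynesis.DataImporter(path='dataset1',
                                   data_types=['gex', 'cnv'],
                                   covariates=['age', 'sex'],
                                   concatenate=False)
train_ds, test_ds = importer.import_data()
print(train_ds.dat.keys())  # expected: dict_keys(['gex', 'cnv', 'covariates'])
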
2 changes: 1 addition & 1 deletion flexynesis/feature_selection.py
@@ -199,7 +199,7 @@ def filter_by_laplacian(X, layer, k=5, t=None, topN=100, correlation_threshold=0
- The process may select additional features beyond `topN` before correlation filtering to ensure
that the best candidates are considered. The final number of features, however, is pruned to `topN`.
"""
print("[INFO] Implementing feature selection using laplacian score for layer:",layer,"with ",X.shape[1],"features")
print("[INFO] Implementing feature selection using laplacian score for layer:",layer,"with ",X.shape[1],"features"," and ",X.shape[0], " samples ")

feature_log = pd.DataFrame({'feature': X.columns, 'laplacian_score': np.nan})
# only apply filtering if topN < n_features
4 changes: 3 additions & 1 deletion flexynesis/main.py
@@ -140,9 +140,11 @@ def get_batch_space(self, min_size = 32, max_size = 128):

def setup_trainer(self, params, current_step, total_steps, full_train = False):
# Configure callbacks and trainer for the current fold
mycallbacks = [self.progress_bar]
mycallbacks = []
if self.plot_losses:
mycallbacks.append(LiveLossPlot(hyperparams=params, current_step=current_step, total_steps=total_steps))
else:
mycallbacks.append(self.progress_bar)
# when training on a full dataset; no cross-validation or no validation splits;
# we don't do early stopping
early_stop_callback = None
3 changes: 2 additions & 1 deletion flexynesis/modules.py
@@ -118,7 +118,8 @@ def __init__(self, input_dim, hidden_dim, output_dim):
hidden_dim (int, optional): The size of the hidden layer. Default is 32.
output_dim (int): The output dimension. Set to 1 for regression tasks, and > 1 for classification tasks.
"""
super(MLP, self).__init__()
super().__init__()
hidden_dim = max(hidden_dim, 2) # make sure there are at least 2 units
self.layer_1 = nn.Linear(input_dim, hidden_dim)
self.layer_out = nn.Linear(hidden_dim, output_dim) if output_dim > 1 else nn.Linear(hidden_dim, 1, bias=False)
self.relu = nn.ReLU()
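A quick way to see the effect of the new clamp (assuming the remainder of the MLP definition is unchanged and flexynesis is installed):

from flexynesis.modules import MLP

mlp = MLP(input_dim=8, hidden_dim=1, output_dim=3)  # hidden_dim is raised to 2
print(mlp.layer_1)  # Linear(in_features=8, out_features=2, bias=True)
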
[Diffs for the remaining 2 changed files are not shown.]
