diff --git a/main.py b/main.py index 1c7868c0a..2569e6238 100644 --- a/main.py +++ b/main.py @@ -1,6 +1,6 @@ from scripts import main_con as m import os - +import pickle p = r"/data/rainville/HYPNOSIS_ASL_ANALYSIS/CBF_normalized" conf_dir = ( @@ -10,25 +10,31 @@ #p = r'/home/dsutterlin/projects/test_data/ASL_RS_hypnosis/CBF_4D_normalized' #conf_dir = False #pwd_main = r"/home/dsutterlin/projects/resting_state_hypnosis/resting_state_hypnosis" - +''' data, fcdict = m.con_matrix( p, pwd_main=pwd_main, conf_dir=conf_dir, save_base= os.path.join(pwd_main, 'debug'), - save_folder="difumo64_correl_negw", + save_folder="difumo64_correl_noProc", atlas_name="difumo64", sphere_coord = [(54, -28, 26)], connectivity_measure="correlation", - n_sub = 3, + n_sub = 10, verbose=True, remove_ROI_maps = [8,14,43], # based on masker report, outside brain or no interest remove_subjects = ['APM_07_H1', 'APM_11_H1', 'APM_22_H2'] # based on rainville et al., 2019 and .xlsx file ) -graphs = m.connectome_analyses(data, fcdict, n_iter = 100) +graphs = m.connectome_analyses(data, fcdict, bootstrap = 10) +''' +with open(os.path.join(pwd_main,'debug', 'difumo64_correl_noProc', 'data.pkl'),'rb') as f: + data = pickle.load(f) + +with open(os.path.join(pwd_main,'debug', 'difumo64_correl_noProc', 'graphs_dict.pkl'),'rb') as f: + graphs = pickle.load(f) -cv_results = m.prediction_analyses(data, graphs, save_to, n_permut = 100, verbose = False) +cv_results = m.prediction_analyses(data, graphs, n_permut = 10, verbose = False) ''' import matplotlib.pyplot as plt diff --git a/scripts/main_con.py b/scripts/main_con.py index 1ce54c48b..27d9ce3d5 100644 --- a/scripts/main_con.py +++ b/scripts/main_con.py @@ -5,6 +5,7 @@ import copy import glob import numpy as np +import pandas as pd import nibabel as nib from scipy.stats import pearsonr from nilearn.maskers import ( @@ -221,12 +222,15 @@ def con_matrix( save_to_plot = os.path.join(save_base, save_folder, 'plots') if os.path.exists(save_to_plot) is False: os.mkdir(save_to_plot) - + if os.path.exists(os.path.join(save_to, 'reports')) is False: os.mkdir(os.path.join(save_to, 'reports')) voxel_masker.generate_report().save_as_html(os.path.join(save_to, 'reports', 'voxelMasker_report.html')) masker.generate_report(displayed_maps='all').save_as_html(os.path.join(save_to,'reports','mapsMasker_report.html')) + data.atlas_labels = atlas_labels + data.save_to = save_to + with open(os.path.join(save_to, "dict_connectomes.pkl"), "wb") as f: pickle.dump(fcdict, f) with open(os.path.join(save_to, 'data.pkl'), 'wb') as f: @@ -239,13 +243,13 @@ def con_matrix( if os.path.exists(os.path.join(save_to,'NBS_txtData')) is False: os.mkdir(os.path.join(save_to,'NBS_txtData')) func.export_txt_NBS(os.path.join(save_to,'NBS_txtData'), atlas, atlas_labels, fcdict['pre_connectomes'],fcdict["post_connectomes"], fcdict["diff_weight_connectomes"], data.subjects) - + print('---PREPARING AND SAVING PLOTS---') # Plotting connectomes and weights distribution for cond, matrix_list in zip(fcdict['conditions'],[ fcdict["pre_connectomes"],fcdict['post_connectomes'], fcdict["diff_weight_connectomes"]]): plot_func.dist_mean_edges(cond, matrix_list, save_to_plot) - + # Replication of Rainville et al., 2019 automaticity ~ rCBF in parietal Operculum(supramarg. gyrus in difumo64) Y = data.phenotype vd = 'Abs_diff_automaticity' @@ -258,14 +262,11 @@ def con_matrix( plot_func.visu_correl(auto, tvalues, save_to_plot, vd_name = vd, vi_name = 'T test change in PO', title = 'Automaticity score vs mean ttest') print('---DONE with connectivity matrices and plots---') - data.atlas_labels = atlas_labels - data.save_to = save_to - return data, fcdict -def connectome_analyses(data, fcdict, bootstrap = 10 ): +def connectome_analyses(data, fcdict, bootstrap = 1000 ): subjects = data.subjects atlas_labels = data.atlas_labels @@ -330,7 +331,7 @@ def connectome_analyses(data, fcdict, bootstrap = 10 ): return graphs -def prediction_analyses(data, graphs, n_permut = 5000, test_size = 0.20, pca = '90%', verbose = False): +def prediction_analyses(data, graphs, n_permut = 5000, test_size = 0.20, pca = 0.90, verbose = False): # Models to test # - Yi ~ Strenghts, Clustering, EigenCentrality, LocalEfficiency # Yi ~ PO multivariate node metrics @@ -343,39 +344,39 @@ def prediction_analyses(data, graphs, n_permut = 5000, test_size = 0.20, pca = ' "Mental_relax_absChange", "Abs_diff_automaticity", ] - pred_node_metrics = ['strength','strengthnorm', 'eigenCent', 'localEfficiency'] + pred_node_metrics = ['strength','strengthnorm', 'eigenCent'] #'localEfficiency'] #Xmat_names = ["Degree", "Closeness", "Betweenness", "Clustering", "Edge_weights"] cv_results = dict(pre= dict(), post=dict(), change=dict()) cv_results['phenotype'] = Y single_ROI_reg = ['Supramarginal gyrus', 'Anterior Cingulate Cortex', 'Cingulate gyrus mid-anterior',' Cingulate cortex posterior'] # PO, + subjects = data.subjects save_to = data.save_to atlas_labels = data.atlas_labels - + print(r'---CROSS-VAL REGRESSION [Yi ~ Node] METRICS---') - - + # 1) Extracts metric column for each node (1 x Nodes) and stack them/sub --> (N sub x Nodes) ~ Yi # 2) Compute CV regression for Yi variable for node_metric in pred_node_metrics: feat_mat = pd.DataFrame(np.array([sub_mat[node_metric] for sub_mat in graphs['change_nodes']]), columns = atlas_labels, index = subjects) - cv_results['change'][node_metric] = graphsCV.regression_cv(feat_mat, Y, target_col, n_permut = n_permut, test_size = test_size, pca=pca) + cv_results['change'][node_metric] = graphsCV.regression_cv(feat_mat, Y, target_col, pred_metric_name = node_metric, n_permut = n_permut, test_size = test_size, pca=pca) for node_metric in pred_node_metrics: feat_mat = pd.DataFrame(np.array([sub_mat[node_metric] for sub_mat in graphs['post_nodes_metrics']]), columns = atlas_labels, index = subjects) - cv_results['post'][node_metric] = graphsCV.regression_cv(feat_mat, Y, target_col, n_permut = n_permut, test_size = test_size, pca=pca) + cv_results['post'][node_metric] = graphsCV.regression_cv(feat_mat, Y, target_col, pred_metric_name = node_metric, n_permut = n_permut, test_size = test_size, pca=pca) # Edge based prediction for post and change edge_feat_mat = graphsCV.edge_attributes2df(cv_results['post'], edge_metric = 'weight') - cv_results['post']['edge_weight'] = graphsCV.regression_cv(edge_feat_mat, Y, target_col, n_permut = n_permut, test_size = test_size, pca=pca) + cv_results['post']['edge_weight'] = graphsCV.regression_cv(edge_feat_mat, Y, target_col, pred_metric_name = 'weight', n_permut = n_permut, test_size = test_size, pca=pca) edge_feat_mat = graphsCV.edge_attributes2df(cv_results['change'], edge_metric = 'weight') - cv_results['change']['edge_weight'] = graphsCV.regression_cv(edge_feat_mat, Y, target_col, n_permut = n_permut, test_size = test_size, pca=pca) + cv_results['change']['edge_weight'] = graphsCV.regression_cv(edge_feat_mat, Y, target_col, pred_metric_name = 'weight', n_permut = n_permut, test_size = test_size, pca=pca) # ROI specific multivariate (multi-node metric) prediction for roi in single_ROI_reg: # compute N sub x M metrics df for prediction of Yi roi_feat_mat = pd.DataFrame(np.array([sub_mat.loc[roi,:] for sub_mat in graphs['change_nodes']]), columns = pred_node_metrics, index = subjects) - cv_results['change'][roi] = graphsCV.regression_cv(roi_feat_mat, Y, target_col) - + cv_results['change'][roi] = graphsCV.regression_cv(roi_feat_mat, Y, target_col, pred_metric_name = roi) + # Save reg results with open(os.path.join(save_to, "cv_results.pkl"), "wb") as f: pickle.dump(cv_results, f) diff --git a/src/graphs_regressionCV.py b/src/graphs_regressionCV.py index 8ac5b8c6c..ccdac8d93 100644 --- a/src/graphs_regressionCV.py +++ b/src/graphs_regressionCV.py @@ -4,7 +4,7 @@ import glob as glob import pickle import pandas as pd -import bct.algorithms as bct +import bct.algorithms as bct from sklearn.model_selection import permutation_test_score from scipy.stats import pearsonr from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error @@ -136,7 +136,7 @@ def edge_attributes2df(Gs_dict, edge_metric = 'weight', subjects = False): ''' if subjects is False: subjects = Gs_dict.keys() - assert len(subjects) == Gs_dict.keys(), 'Subjects names and Gs_dict items do not match' + #assert len(subjects) == Gs_dict.keys(), 'Subjects names and Gs_dict items do not match' tuple_ls = [G.edges().data(edge_metric) for G in Gs_dict.values()] # (node 1, node 2, value) edges_dfs = [] # to append sub i x pairs of single row dfs @@ -224,32 +224,33 @@ def hqs_algorithm(e, cov, var, N, seed, return_correlation=False): else: return C - - resulting_covariance_matrix = hqs_algorithm(e, v, edash, N) - print("Covariance Matrix:") - print(resulting_covariance_matrix) + # Old code can remove if run success /11 march 24 - resulting_correlation_matrix = hqs_algorithm(e, v, edash, N, return_correlation=True) - print("\nCorrelation Matrix:") - print(resulting_correlation_matrix) + #resulting_covariance_matrix = hqs_algorithm(e, v, edash, N) + #print("Covariance Matrix:") + #print(resulting_covariance_matrix) + + #resulting_correlation_matrix = hqs_algorithm(e, v, edash, N, return_correlation=True) + #print("\nCorrelation Matrix:") + #print(resulting_correlation_matrix)# # ---Feature matrices based on subjects' graphs---# - subject_names = list(Gs.keys()) - node_names = list(Gs[subject_names[0]].nodes()) + #subject_names = list(Gs.keys()) + #node_names = list(Gs[subject_names[0]].nodes())# # Reorganize data structure from lists of vectors to pd (N x nodes array) for each dict key - for metric in list(metric_dict.keys()): # Excluding nodes and communities - tmp_data = metric_dict[metric] - if metric != 'communities' and metric != 'nodes': - metric_dict[metric] = pd.DataFrame( - np.array(tmp_data), columns=labels, index=subjects - ) - breakpoint() # check data structure for each nodes ( clustering ?) - #maybe save as a df would be easier to compute 2samp t tests - breakpoint()#store the yeo7 id of ROI tocompute network metrics/subjects - - return metric_dict # dict w/ keys = metrics and values = list of list vectors (one list per subject) + #for metric in list(metric_dict.keys()): # Excluding nodes and communities + # tmp_data = metric_dict[metric] + # if metric != 'communities' and metric != 'nodes': + # metric_dict[metric] = pd.DataFrame( + # np.array(tmp_data), columns=labels, index=subjects + # ) + ##breakpoint() # check data structure for each nodes ( clustering ?) + ##maybe save as a df would be easier to compute 2samp t tests + ##breakpoint()#store the yeo7 id of ROI tocompute network metrics/subjects + + #return metric_dict # dict w/ keys = metrics and values = list of list vectors (one list per subject) def rand_graphs(dict_graphs, subjects, n_permut): ''' @@ -260,14 +261,14 @@ def rand_graphs(dict_graphs, subjects, n_permut): for i, sub in enumerate(subjects): perms = [] seeds = np.random.randint(0, 1000000, n_permut) - + return perm_graphs def node_metric_diff(post_df, pre_df, subjects): change_dfs = [] - for i, sub in range(len(subjects)): + for i, sub in enumerate(subjects): assert list(post_df[i].columns) == list(pre_df[i].columns) temp_df = post_df[i].copy(deep=True) @@ -285,7 +286,7 @@ def bootstrap_pvals_df(df_metric_ls, dict_dfs_permut, subjects, mult_comp = 'fdr for i, sub in enumerate(subjects): df_metric = df_metric_ls[i] rand_dist_3d = np.stack(dict_dfs_permut[sub], axis=2) # stack list of 2d rand dfs on 3d dim - assert df_metric.shape == rand_dist.shape[0:2] + assert df_metric.shape == rand_dist_3d.shape[0:2] #mean_val = np.mean(rand_dist, axis=2) #td_val = np.std(rand_dist, axis=2) @@ -297,7 +298,7 @@ def bootstrap_pvals_df(df_metric_ls, dict_dfs_permut, subjects, mult_comp = 'fdr p_df[i, j] = np.mean(np.abs(rand_dist_3d[i, j, :]) >= np.abs(cell_value)) # Two-tailed test p_values_df = pd.DataFrame(p_df, index=df_metric.index, columns=df_metric.columns) - + if mult_comp == 'fdr_bh': for column in p_values_df.columns: p_values = p_values_df[column] @@ -307,7 +308,7 @@ def bootstrap_pvals_df(df_metric_ls, dict_dfs_permut, subjects, mult_comp = 'fdr if mult_comp == 'fdr_bn': print('FDR correction applied to change metric p-values') - + return pval_dfs_ls @@ -336,7 +337,7 @@ def compute_permutation( n_components=0.80, n_permutations=5000, scoring="r2", - random_seed=40 + rdm_seed=40 ): """ Compute the permutation test for a specified metric (r2 by default) @@ -372,7 +373,7 @@ def compute_permutation( scoring=scoring, cv=cv, n_permutations=n_permutations, - random_state=random_seed, + random_state=rdm_seed, ) return score, perm_scores, pvalue @@ -386,7 +387,7 @@ def compute_permutation( from scipy.stats import pearsonr import numpy as np -def regression_cv(features_matrix, Y, target_columns, rdm_seed=40, n_permut = 5000, test_size = 0.2, pca='80%'): +def regression_cv(features_matrix, Y, target_columns, pred_metric_name = 'X_pred', rdm_seed=40, n_permut = 5000, test_size = 0.2, pca='80%'): """ Compute the regression with cross-validation for each graph metric @@ -397,10 +398,10 @@ def regression_cv(features_matrix, Y, target_columns, rdm_seed=40, n_permut = 50 metrics_names : list of strings from the graph metrics but written in a way that will figure in the plots outputs. """ - mean_metrics = [] - out_dict = dict() - + #out_dict = dict() + result_Yi = dict() + prePipeline = Pipeline([("scaler", StandardScaler())]) feat_mat = prePipeline.fit_transform(features_matrix) pipeline = Pipeline([("pca", PCA(n_components=pca)), ("reg", Ridge(alpha=1.0))]) @@ -408,7 +409,7 @@ def regression_cv(features_matrix, Y, target_columns, rdm_seed=40, n_permut = 50 cv = ShuffleSplit(n_splits=10, test_size=test_size, random_state=rdm_seed) for target_column in target_columns: - print(f"--- {target_column} ---") + print(f"[CV-regression] : __{target_column} ~ -{pred_metric_name}__") y_preds = [] y_tests = [] y = Y[target_column].values @@ -420,6 +421,7 @@ def regression_cv(features_matrix, Y, target_columns, rdm_seed=40, n_permut = 50 rmse_scores = [] n_components = [] all_coefficients = [] + all_corr_coefficients = [] for train_index, test_index in cv.split(feat_mat): # Split the data into train and test sets based on the current fold @@ -441,11 +443,11 @@ def regression_cv(features_matrix, Y, target_columns, rdm_seed=40, n_permut = 50 rmse = np.sqrt(mse) cov_x = np.cov( pipeline.named_steps["pca"] - .transform(X_test) + .transform(X_train) .transpose() .astype(np.float64) ) - cov_y = np.cov(y_test.transpose().astype(np.float64)) + cov_y = np.cov(y_train.transpose().astype(np.float64)) # Append metrics to the respective lists pearson_r_scores.append(pearson_r) @@ -464,14 +466,18 @@ def regression_cv(features_matrix, Y, target_columns, rdm_seed=40, n_permut = 50 # print('cov_x type and shape', type(cov_x), cov_x.shape) # print('cov y shape', cov_y.shape) # print('coeff transposed shape', coefficients.transpose().shape) - # correction from Eqn 6 (Haufe et al., 2014) corr_coeffs = np.matmul(cov_x, coefficients.transpose()) * (1 / cov_y) - all_coefficients.append( + all_corr_coefficients.append( pipeline.named_steps["pca"].inverse_transform( corr_coeffs.transpose() ) ) + all_coefficients.append( + pipeline.named_steps["pca"].inverse_transform( + coefficients.transpose() + ) + ) # Permutation tests r2score, _, r2p_value = compute_permutation( @@ -509,7 +515,7 @@ def regression_cv(features_matrix, Y, target_columns, rdm_seed=40, n_permut = 50 # avg_z_score = np.mean(np.array(all_coefficients), axis=0) / np.std(all_coefficients, axis=0) # print(f"Average z-score = {avg_z_score} std = {np.std(all_coefficients, axis=0)}") # Plot - plot_title = f"{metrics_name} based CV-prediction of {target_column}" + plot_title = f"{pred_metric_name} based CV-prediction of {target_column}" # reg_plot_performance( # y_tests, # y_preds, @@ -536,9 +542,10 @@ def regression_cv(features_matrix, Y, target_columns, rdm_seed=40, n_permut = 50 "rmse_p_value": rmse_p_value, "y_preds": y_preds, "y_tests": y_tests, - "corr_coeffs": all_coefficients, + "coeffs" : all_coefficients, + "corr_coeffs": all_corr_coefficients, } - out_dict[metrics_name] = result_Yi + # out_dict[metrics_name] = result_Yi - return out_dict + return result_Yi