# genetic_data_scheetz.py
from __future__ import division
import csv
import numpy as np
from scipy.stats import pearsonr
from sklearn.preprocessing import scale
import additional_functions as af
import general_functions as gf
import graphical_code as gc
"""
Code for the analysis of the scheetz genetic dataset formed by 120 rats and 31000 genes
"""
# LOAD THE DATASET #####################################################################################################
"""
The dataset was originally in log-scale, but the data loaded here is in the original scale.
"""
x = np.loadtxt("data/x.txt", delimiter="\t", skiprows=1)
y = np.loadtxt("data/y.txt", delimiter="\t", skiprows=1)
with open("data/x_names.csv") as f:
    reader = csv.reader(f)
    next(reader)  # skip header
    x_names = np.asarray([r[0][1:] for r in reader])
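# Sanity check on the loaded data: the docstring above mentions 120 rats and
# around 31,000 genes, so x should be roughly 120 x 31000 and y of length 120
print("x shape: {}. y shape: {}".format(x.shape, y.shape))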
# SELECT THE GENES WITH LARGE CORRELATION WITH THE RESPONSE
cor = np.zeros(x.shape[1])
for i in range(len(cor)):
    cor[i] = pearsonr(x[:, i], y)[0]
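# A vectorized alternative to the loop above (a sketch, assuming x is a 2-D
# numpy array and y a 1-D array): the Pearson correlation of each column of x
# with y is the dot product of the centered columns with centered y, divided
# by the product of their norms
x_c = x - x.mean(axis=0)
y_c = y - y.mean()
cor_vec = x_c.T.dot(y_c) / (np.sqrt((x_c ** 2).sum(axis=0)) * np.sqrt((y_c ** 2).sum()))
# cor_vec agrees with cor up to floating point error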
min_cor = 0.5
large_cor_index = np.where(np.abs(cor) > min_cor)[0]
x = x[:, large_cor_index]
x_names = x_names[large_cor_index]
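# Quick check of the screening step: the result file names further below
# mention p=3734, which suggests 3734 genes survive this correlation filter
print("Genes retained after correlation screening: {}".format(x.shape[1]))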
x = scale(x)
y = scale(y)
# OBTAIN A GROUP INDEX FOR THE GENES
index = af.pca_group_creator(x)
gc.index_boxplot(index)
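# A quick look at the grouping produced by pca_group_creator (assuming index
# is a 1-D array with one group label per gene)
print("Number of gene groups: {}".format(len(np.unique(index))))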
########################################################################################################################
"""
Analysis using function automatic_analyzer and solving an asgl_pct and a pls_pct model
"""
model = 'qr'
intercept = True
tol = 1e-4
parallel = True
lambda1 = (10.0**np.arange(-3, 1.01, 0.2)).tolist()
alpha = (np.r_[np.arange(0.0, 0.3, 0.03), np.arange(0.3, 0.7, 0.1), np.arange(0.7, 0.99, 0.03), np.array([1])]).tolist()
tau = 0.5
l_power_weight = np.arange(0, 1.21, 0.2).tolist()
gl_power_weight = np.arange(0, 1.21, 0.2).tolist()
var_pct = 0.8
train_size = 80
validate_size = 20
error_type = 'QRE'
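# Size of the hyper-parameter grids to be explored (computed directly from the
# lists defined above)
print("lambda1 grid: {} values. alpha grid: {} values. Power weight grids: {} values each.".format(len(lambda1), len(alpha), len(l_power_weight)))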
# Automatic analyzer
n_repetitions = 1
folder = 'simulation_results/'
model_selection_param = dict(train_size=train_size, validate_size=validate_size, error_type=error_type)
asgl_model = dict(model='qr', penalization='asgl', intercept=intercept, tol=tol, lambda1=lambda1, alpha=alpha, tau=tau, parallel=parallel)
asgl_wc_pca_d = dict(l_power_weight=l_power_weight, gl_power_weight=gl_power_weight, var_pct=var_pct, weight_technique='pca_d')
asgl_wc_pls_d = dict(l_power_weight=l_power_weight, gl_power_weight=gl_power_weight, var_pct=var_pct, weight_technique='pls_d')
model_param = [dict(model_solver=asgl_model, weight_calculator=asgl_wc_pca_d),
dict(model_solver=asgl_model, weight_calculator=asgl_wc_pls_d)]
dataset = dict(x=x, y=y, index=index)
results = gf.automatic_analyzer(dataset=dataset, model_selection_param=model_selection_param, model_param=model_param, n_repetitions=n_repetitions, folder=folder)
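# Based on how the results object is indexed later in this script, each entry
# of results['results'] appears to hold one fitted configuration with keys
# such as 'model_solver', 'weight_calculator', 'summary' and 'extra_metrics'.
# A quick (hypothetical) listing of the fitted configurations:
for res in results['results']:
    print(res['model_solver']['penalization'], res.get('weight_calculator', {}).get('weight_technique', ''))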
########################################################################################################################
"""
Processing the results obtained
"""
route = 'D:/Documentos_2/GoogleDrive/Doctorado/Tesis/Codigo/Python/Quantile_regression/simulation_results/'
fname_tau5 = '2019-05-07--10-31-51--real_dataset--n=120--p=3734--standardized--cols--tau5.json'
fname_tau3 = '2019-05-11--00-03-53--real_dataset--n=120--p=3734--standardized--cols--tau3.json'
fname_tau7 = '2019-05-15--17-30-16--real_dataset--n=120--p=3734--standardized--cols--tau7.json'
results5 = af.simulation_results_to_tables(results=route + fname_tau5, from_file=True, table_format='row_models')
results3 = af.simulation_results_to_tables(results=route + fname_tau3, from_file=True, table_format='row_models')
results7 = af.simulation_results_to_tables(results=route + fname_tau7, from_file=True, table_format='row_models')
gc.boxplot_creator_by_metric(results=results, interesting_metrics=['final_error', 'non_zero_pred_beta'], figsize=(25, 10), sorting=['lasso', 'sgl', 'asgl_pca_d', 'asgl_pls_d'])
significance = gc.variables_probability_heatmap(results=results7, sorting=['asgl_pca_d', 'asgl_pls_d', 'sgl', 'lasso'])
model_names = significance['model_names']
probability_of_significance = significance['probability_of_significance']
global_significance_variables = significance['global_significance_variables']
# List of significant genes
asgl_pca_d_index = np.where(np.asarray(model_names) == 'asgl_pca_d')[0][0]
mean_number_significant = int(round(results['results'][asgl_pca_d_index]['summary']['non_zero_pred_beta'][0]))
significant_genes_index = global_significance_variables[0:mean_number_significant]
significant_genes_names = x_names[significant_genes_index]
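# Quick report of the selection, using only variables defined above
print("Number of significant genes (asgl_pca_d): {}".format(mean_number_significant))
print(significant_genes_names)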
# Number of genes above a threshold
threshold = 0.5
for i in range(len(model_names)):
    tmp_prob = probability_of_significance[i, :]
    num_genes = len(np.where(tmp_prob >= threshold)[0])
    print("Model: {}. Threshold: {}. Number of genes: {}".format(model_names[i], threshold, num_genes))
    print(-np.sort(-tmp_prob)[0:30])  # 30 largest selection probabilities, in decreasing order
########################################################################################################################
"""
Analysis of different quantiles
"""
def probability_of_significance_calculator(list_results, model='asgl_pls_d', n_var=3734):
    """
    For the given model, computes the probability that each of the n_var genes
    is selected as significant across the repetitions stored in every element
    of list_results (one element per quantile level tau)
    """
    response_models = []
    general_probability_of_significance = np.zeros((len(list_results), n_var))
    for i, results in enumerate(list_results):
        # Rebuild the model names stored in this results object
        stored_model_names = []
        for j in range(len(results['results'])):
            model_name = results['results'][j]['model_solver']['penalization']
            if 'asgl' in results['results'][j]['model_solver']['penalization']:
                model_name += '_' + results['results'][j]['weight_calculator']['weight_technique']
            stored_model_names.append(str(model_name))
        model_index = np.where(np.asarray(stored_model_names) == model)[0][0]
        response_models.append(stored_model_names[model_index] + '_tau' + str(results['results'][0]['model_solver']['tau']))
        # Count how many times each gene is selected, then divide by the number
        # of repetitions to obtain a selection probability
        for index_significant_variables in results['results'][model_index]['extra_metrics']['index_non_zero_pred_beta']:
            general_probability_of_significance[i, index_significant_variables] += 1.0
        general_probability_of_significance[i, :] /= len(results['results'][model_index]['extra_metrics']['index_non_zero_pred_beta'])
    response = dict(models=response_models, probability_of_significance=general_probability_of_significance)
    return response
list_results = (results5, results3, results7)
# Collect the probabilities of each model for different taus
lasso_significance = probability_of_significance_calculator(list_results, model='lasso', n_var=3734)
sgl_significance = probability_of_significance_calculator(list_results, model='sgl', n_var=3734)
asgl_pca_significance = probability_of_significance_calculator(list_results, model='asgl_pca_d', n_var=3734)
asgl_pls_significance = probability_of_significance_calculator(list_results, model='asgl_pls_d', n_var=3734)
model_significance = [dict(model='lasso', significance_dict=lasso_significance),
dict(model='sgl', significance_dict=sgl_significance),
dict(model='asgl_pca_d', significance_dict=asgl_pca_significance),
dict(model='asgl_pls_d', significance_dict=asgl_pls_significance)]
threshold = 0.5
intersection = dict()
for elt in model_significance:
    tmp_results = []
    p = elt['significance_dict']['probability_of_significance']
    n_models = p.shape[0]
    for j in range(p.shape[1]):
        # A gene enters the intersection only if it is selected with
        # probability above the threshold at every quantile level
        if np.sum(p[:, j] > threshold) == n_models:
            tmp_results.append(j)
    intersection[elt['model']] = tmp_results
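# Summary of the intersection across quantile levels: how many genes each
# model selects with probability above the threshold at every tau
for model_key, gene_idx in intersection.items():
    print("Model: {}. Genes significant at every tau: {}".format(model_key, len(gene_idx)))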
########################################################################################################################