-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathExample.py
371 lines (316 loc) · 13.4 KB
/
Example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
# ---
# jupyter:
# jupytext:
# formats: ipynb,py
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.9.1
# kernelspec:
# display_name: Python [conda env:sophie] *
# language: python
# name: conda-env-sophie-py
# ---
# # Template
#
# This notebook allows users to find common and specific genes in their experiment of interest using an *existing* VAE model and selecting a template experiment that is included in the training compendium.
#
# This notebook will generate a `generic_gene_summary_<experiment id>.tsv` file that contains z-scores per gene that indicates how specific a gene is the experiment in question.
# %load_ext autoreload
# %load_ext rpy2.ipython
# %autoreload 2
import os
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from ponyo import utils, simulate_expression_data
from sophie import (
process,
stats,
ranking,
)
# ## Inputs
#
# User needs to fill in the [config file](config_new_experiment.tsv) following the instructions provided in the [readme]](README.md)
# +
# Read in config variables
config_filename = "config_example.tsv"
params = utils.read_config(config_filename)
# +
# Load config params
# Root directory containing analysis subdirectories and scripts
base_dir = params["base_dir"]
# Local directory to store intermediate files
local_dir = params["local_dir"]
# Normalized compendium filename
normalized_compendium_filename = params["normalized_compendium_filename"]
# Metadata file that maps experiment ids to sample ids
metadata_filename = params["metadata_filename"]
# Delimiter used in metadata file
metadata_delimiter = params["metadata_delimiter"]
# Column header corresponding to experiment id and sample id
metadata_experiment_colname = params["metadata_experiment_colname"]
metadata_sample_colname = params["metadata_sample_colname"]
# ID for template experiment to be selected
project_id = params["project_id"]
# Number of simulated experiments to generate
num_simulated = params["num_simulated"]
# Directory that simulated experiments will be written to
# This directory is created by https://github.com/greenelab/ponyo/blob/master/ponyo/utils.py
simulated_data_dir = params["simulated_data_dir"]
# Directory containing the existing trained VAE model
vae_model_dir = params["vae_model_dir"]
# Size of the latent dimension
latent_dim = params["latent_dim"]
# Scaler transform used to scale compendium data into 0-1 range for training
scaler_transform_filename = params["scaler_transform_filename"]
# Which DE method to use
# We recommend that if data is RNA-seq then use DESeq2 ("deseq")
# If data is microarray then use Limma ("limma")
de_method = params["DE_method"]
# If using DE-seq, setting this parameter will
# remove genes below a certain threshold
count_threshold = params["count_threshold"]
# Metadata file that specifies which samples to keep for DE analysis (Optional)
# By default, a two-condition differential expression analysis is supported (case vs control).
# However, some experiments included more than 2 conditions and so these "extra" samples
# should not considered in the downstream differential expression analysis.
template_process_samples_filename = params["template_process_samples_filename"]
# Metadata file that specifies sample grouping for DE analysis
template_DE_grouping_filename = params["template_DE_grouping_filename"]
# Statistic to use to rank genes or pathways by
# Choices are "log2FoldChange" if using DESeq or "log2FC"
# if using limma as the `de_method`
col_to_rank_genes = params["rank_genes_by"]
# +
# Files generated by this notebook
# File storing normalized template experiment
normalized_template_filename = params["normalized_template_filename"]
# File storing un-normalized template experiment for us in downstream DE analysis
raw_template_filename = params["raw_template_filename"]
# File storing processed template experiment,
# after samples have been selected for comparison in DE analysis
processed_template_filename = params["processed_template_filename"]
# Output summary file
output_filename = params["output_filename"]
# -
# ## Setup directories
utils.setup_dir(config_filename)
# ## Get unnormalized template experiment
process.fetch_template_experiment(
normalized_compendium_filename,
metadata_filename,
metadata_delimiter,
metadata_experiment_colname,
project_id,
metadata_sample_colname,
scaler_transform_filename,
raw_template_filename,
normalized_template_filename
)
# ## Simulate data
# Run simulation
simulate_expression_data.shift_template_experiment(
normalized_compendium_filename,
vae_model_dir,
latent_dim,
scaler_transform_filename,
metadata_filename,
metadata_delimiter,
metadata_experiment_colname,
metadata_sample_colname,
project_id,
local_dir,
simulated_data_dir,
num_simulated)
# ## Process template and simulated experiments
#
# * Remove samples not required for comparison
# * Make sure ordering of samples matches metadata for proper comparison
# * Make sure values are cast as integers if using DESeq
# * Filter lowly expressed genes if using DESeq
# +
# Update simulated dir
if not os.path.exists(template_process_samples_filename):
template_process_samples_filename = None
if de_method == "deseq":
# Process template data
stats.process_samples_for_DESeq(
raw_template_filename,
template_DE_grouping_filename,
processed_template_filename,
count_threshold,
template_process_samples_filename,
)
# Process simulated data
for i in range(num_simulated):
simulated_filename = os.path.join(
simulated_data_dir,
f"selected_simulated_data_{project_id}_{i}.tsv",
)
out_simulated_filename = os.path.join(
simulated_data_dir,
f"selected_simulated_data_{project_id}_{i}_processed.tsv",
)
stats.process_samples_for_DESeq(
simulated_filename,
template_DE_grouping_filename,
out_simulated_filename,
count_threshold,
template_process_samples_filename,
)
else:
stats.process_samples_for_limma(
raw_template_filename,
template_DE_grouping_filename,
processed_template_filename,
template_process_samples_filename,
)
for i in range(num_simulated):
simulated_filename = os.path.join(
simulated_data_dir,
f"selected_simulated_data_{project_id}_{i}.tsv",
)
stats.process_samples_for_limma(
simulated_filename,
template_DE_grouping_filename,
None,
template_process_samples_filename,
)
# -
# ## Differential expression analysis
# Create subdirectory: "<local_dir>/DE_stats/"
os.makedirs(os.path.join(local_dir, "DE_stats"), exist_ok=True)
# + magic_args="-i template_DE_grouping_filename -i project_id -i processed_template_filename -i local_dir -i base_dir -i de_method" language="R"
#
# source(paste0(base_dir, '/sophie/DE_analysis.R'))
#
# # File created: "<local_dir>/DE_stats/DE_stats_template_data_<project_id>_real.txt"
# if (de_method == "deseq"){
# get_DE_stats_DESeq(
# template_DE_grouping_filename,
# project_id,
# processed_template_filename,
# "template",
# local_dir,
# "real"
# )
# }
# else{
# get_DE_stats_limma(
# template_DE_grouping_filename,
# project_id,
# processed_template_filename,
# "template",
# local_dir,
# "real"
# )
# }
# + magic_args="-i template_DE_grouping_filename -i project_id -i base_dir -i simulated_data_dir -i num_simulated -i de_method" language="R"
#
# source(paste0(base_dir, '/sophie/DE_analysis.R'))
#
# # Files created: "<local_dir>/DE_stats/DE_stats_simulated_data_<project_id>_<n>.txt"
# for (i in 0:(num_simulated-1)){
# simulated_data_filename <- paste(
# simulated_data_dir,
# "/selected_simulated_data_",
# project_id,
# "_",
# i,
# "_processed.tsv",
# sep = ""
# )
# if (de_method == "deseq"){
# get_DE_stats_DESeq(
# template_DE_grouping_filename,
# project_id,
# simulated_data_filename,
# "simulated",
# local_dir,
# i
# )
# }
# else {
# get_DE_stats_limma(
# template_DE_grouping_filename,
# project_id,
# simulated_data_filename,
# "simulated",
# local_dir,
# i
# )
# }
# }
# -
# ## Rank genes
#
# Genes are ranked by their "generic-ness" - how frequently these genes are changed across the simulated experiments using user-specific test statistic provided in the `col_to_rank_genes` params (i.e. log2 fold change).
# +
analysis_type = "DE"
template_DE_stats_filename = os.path.join(
local_dir, "DE_stats", f"DE_stats_template_data_{project_id}_real.txt"
)
# Added
if de_method == "deseq":
logFC_name = "log2FoldChange"
pvalue_name = "padj"
else:
logFC_name = "logFC"
pvalue_name = "adj.P.Val"
template_DE_stats, simulated_DE_summary_stats = ranking.process_and_rank_genes_pathways(
template_DE_stats_filename,
local_dir,
num_simulated,
project_id,
analysis_type,
col_to_rank_genes,
logFC_name,
pvalue_name,
)
# -
# ## Summary table
#
# * Gene ID: Gene identifier (hgnc symbols for human data or PA number for *P. aeruginosa* data)
# * (Real): Statistics for template experiment
# * (Simulated): Statistics across simulated experiments
# * Number of experiments: Number of simulated experiments
# * Z-score: High z-score indicates that gene is more changed in template compared to the null set of simulated experiments (high z-score = highly specific to template experiment). These z-scores are true standard scores using mean and standard deviation. The calculation for the z-score for a given gene is
#
# $$
# \frac{\text{log}_2 \text{fold change of the gene in the template experiment} - mean(\text{log}_2 \text{fold change of the gene in simulated experiments)}}{variance(\text{log}_2 \text{fold change of the gene in simulated experiments)}}
# $$
#
# The range of this z-score will vary depending on the number of simulated experiments, so the number of simulated experiments should be held constant if the user is performing multiple SOPHIE runs or if they're comparing to previous SOPHIE runs performed by someone else.
#
# * Percentile (simulated): percentile rank of the median(abs(log fold change)). So its the median absolute change for that gene across the 25 simulated experiments that is then converted to a percentile rank from 0 - 100. Where a higher percentile indicates that the gene was highly changed frequently and would suggest that the gene is more commonly DE.
# * Percent DE (simulated): the fraction of the simulated experiments in which that gene was found to be DE using (log fold change > 1 and adjusted p-value < 0.05). _Note:_ you may find that many genes have a 0 fraction. This is because there is some compression that happens when pushing data through the VAE so the variance of the simulated experiments is lower compared to the real experiment. We are aware of this limitation in the VAE and are looking at how to improve the variance and biological signal captured by the VAE, however we were still able to demonstrate that for now the VAE is able to simulate realistic looking biological experiments in our previous [paper](https://academic.oup.com/gigascience/article/9/11/giaa117/5952607).
#
#
# **Note:**
# * If using DESeq, genes with NaN in only the `Adj P-value (Real)` column are those genes flagged because of the `cooksCutoff` parameter. The cook's distance as a diagnostic to tell if a single sample has a count which has a disproportionate impact on the log fold change and p-values. These genes are flagged with an NA in the pvalue and padj columns of the result table.
#
# * If using DESeq with count threshold, some genes may not be present in all simulated experiments (i.e. the `Number of experiments (simulated)` will not equal the number of simulated experiments you specified in the beginning. This pre-filtering will lead to some genes found in few simulated experiments and so the background/null set for that gene is not robust. Thus, the user should sort by both z-score and number of experiments to identify specific expressed genes.
#
# * If using DESeq without count threshold, some genes may still not be present in all simulated experiments (i.e. the `Number of experiments (simulated)` will not equal the number of simulated experiments you specified in the beginning. If the gene is 0 expressed across all samples and thus automatically given an NA in `log fold change, adjusted p-value` columns. Thus, the user should sort by both z-score and number of experiments to identify specific expressed genes.
#
# For more information you can read [DESeq FAQs](https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#pvaluesNA)
# +
# Get summary table
summary_gene_ranks = ranking.generate_summary_table(
template_DE_stats_filename,
template_DE_stats,
simulated_DE_summary_stats,
col_to_rank_genes,
local_dir,
"gene",
params,
)
summary_gene_ranks.sort_values(by="Z score", ascending=False).head(10)
# -
summary_gene_ranks.isna().any()
summary_gene_ranks[summary_gene_ranks.isna().any(axis=1)]
# Save
summary_gene_ranks.to_csv(output_filename, sep="\t")