-
Notifications
You must be signed in to change notification settings - Fork 43
/
Copy pathTCGA_DEGs_clin_subcategories.Rmd
432 lines (377 loc) · 19 KB
/
TCGA_DEGs_clin_subcategories.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
---
title: Differential expression analysis of TCGA data between clinical subgroups
author: "Mikhail Dozmorov"
date: "`r Sys.Date()`"
output:
pdf_document:
toc: no
html_document:
theme: united
toc: no
csl: styles.ref/genomebiology.csl
bibliography: data.TCGA/TCGA.bib
editor_options:
chunk_output_type: console
---
```{r setup, echo=FALSE, message=FALSE, warning=FALSE}
# Set up the environment
library(knitr)
opts_chunk$set(cache.path='cache/', fig.path='img/', cache=F, tidy=T, fig.keep='high', echo=F, dpi=100, warnings=F, message=F, comment=NA, warning=F, results='as.is') #out.width=700,
library(pander)
panderOptions('table.split.table', Inf)
set.seed(1)
library(dplyr)
options(stringsAsFactors = FALSE)
```
```{r results='hide'}
library(TCGA2STAT)
library(dplyr)
library(knitr)
library(ggplot2)
library(reshape2)
library(gridExtra)
library(limma)
library(openxlsx)
library(MDmisc)
library(org.Hs.eg.db)
# devtools::install_github("mdozmorov/enrichR")
library(enrichR)
library(pheatmap)
```
```{r}
# A function to load TCGA data, from remote repository, or a local R object
load_data <- function(disease = cancer, data.type = data.type, type = type, data_dir = data_dir, force_reload = FALSE) {
FILE = paste0(data_dir, "/mtx_", disease, "_", data.type, "_", type, ".rda") # R object with data
if (all(file.exists(FILE), !(force_reload))) {
# If the data has been previously saved, load it
load(file = FILE)
} else {
# If no saved data exists, get it from the remote source
mtx <- getTCGA(disease = disease, data.type = data.type, type = type, clinical = TRUE)
save(file = FILE, list = c("mtx")) # Save it
}
return(mtx)
}
# A function to get data overview
summarize_data <- function(mtx = mtx) {
print(paste0("Dimensions of expression matrix, genex X patients: ", paste(dim(mtx$dat), collapse = " ")))
print(paste0("Dimensions of clinical matrix, patients X parameters: ", paste(dim(mtx$clinical), collapse = " ")))
print(paste0("Dimensions of merged matrix, patients X parameters + genes: ", paste(dim(mtx$merged.dat), collapse = " ")))
print("Head of the merged matrix")
print(mtx$merged.dat[1:5, 1:10])
print("Head of the clinical matrix")
print(mtx$clinical[1:5, 1:7])
print("List of clinical values, and frequency of each variable: ")
clin_vars <- apply(mtx$clinical, 2, function(x) length(table(x[ !(is.na(x) & x != "" )]))) %>% as.data.frame()
# Filter clinical variables to have at least 2, but no more than 10 categories,
# And they are not dates
clin_vars <- clin_vars[ as.numeric(clin_vars$.) > 1 & as.numeric(clin_vars$.) < 10 & !grepl("years|days|date|vital", rownames(clin_vars), perl = TRUE) , , drop = FALSE]
print(kable(clin_vars))
return(rownames(clin_vars))
}
## Gives count, mean, standard deviation, standard error of the mean, and confidence interval (default 95%).
## data: a data frame.
## measurevar: the name of a column that contains the variable to be summariezed
## groupvars: a vector containing names of columns that contain grouping variables
## na.rm: a boolean that indicates whether to ignore NA's
## conf.interval: the percent range of the confidence interval (default is 95%)
summarySE <- function(data=NULL, measurevar, groupvars=NULL, na.rm=FALSE,
conf.interval=.95, .drop=TRUE) {
library(plyr)
# New version of length which can handle NA's: if na.rm==T, don't count them
length2 <- function (x, na.rm=FALSE) {
if (na.rm) sum(!is.na(x))
else length(x)
}
# This does the summary. For each group's data frame, return a vector with
# N, mean, and sd
datac <- ddply(data, groupvars, .drop=.drop,
.fun = function(xx, col) {
c(N = length2(xx[[col]], na.rm=na.rm),
mean = mean (xx[[col]], na.rm=na.rm),
sd = sd (xx[[col]], na.rm=na.rm)
)
},
measurevar
)
# Rename the "mean" column
datac <- rename(datac, c("mean" = measurevar))
datac$se <- datac$sd / sqrt(datac$N) # Calculate standard error of the mean
# Confidence interval multiplier for standard error
# Calculate t-statistic for confidence interval:
# e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
ciMult <- qt(conf.interval/2 + .5, datac$N-1)
datac$ci <- datac$se * ciMult
return(datac)
}
```
# Methods
```{r echo=TRUE}
# Cancer type
cancer = "LUAD"
# Minimum number of samples in a subcategory to be considered
min_samples <- 15
# Path where the downloaded data is stored
data_dir = "/Users/mdozmorov/Documents/Data/GenomeRunner/TCGAsurvival/data" # Mac
# General settings
useTPM = FALSE # Whether or not to convert expression counts to TPM
# What is the difference between CNV and CNA?
# https://github.com/mksamur/RTCGAToolbox/issues/3
# RNA-seq data
data.type = "RNASeq2" # "CNA_SNP" # "CNV_SNP" # "RNASeq2"
type = ""
# Functional category, from http://amp.pharm.mssm.edu/Enrichr/#stats. If NA, no enrichment analysis
enrichr_category <- c("KEGG_2019_Human", "Chromosome_Location", "GO_Biological_Process_2017b", "GO_Molecular_Function_2017b", "GO_Cellular_Component_2017b") # NA
# miRNA-seq data
data.type = "miRNASeq" # miRNASeq - "count" for raw read counts (default); "rpmmm" for normalized read counts
type = "rpmmm"
enrichr_category <- NA
# Selected clinical subcategory, will be run if not NA. Defined from summarize_data(mtx = mtx)
selected_clinical_subcategory <- "race" # Set to NA to run all clinical subcategories
selected_clinical_subgroup1 <- "black or african american" # Set both subgroups to NA
selected_clinical_subgroup2 <- "white" # To run all pairs of subgroups
# Differential expression cutoff
pval_cutoff <- 0.3 # Adjusted P-value cutoff
max_num_genes <- 5000 # Maximum number of genes to feed into functional enrichment
full_degs_output <- TRUE # If TRUE, output differential expression statistics for all genes
# Enrichment cutoffs
fdr.cutoff <- 0.1
top_X <- 10 # How many top significant differentially expressed genes/pathways to output
nplot <- 50 # How many genes to plot on a heatmap
# Filename to same the results
fileNameRes <- paste0("results/", cancer, "_", data.type, "_DEGs_clin_subcategories.xlsx")
# Color palette for the heatmap, https://www.nceas.ucsb.edu/~frazier/RSpatialGuides/colorPaletteCheatsheet.pdf
col3 <- colorRampPalette(c('blue', 'white', 'red'))(20)
col3 <- colorRampPalette(c('blue', 'gray', 'yellow'))(20)
col3 <- colorRampPalette(c('green', 'black', 'red'))(20)
# col3 <- colorRamps::green2red(n = 20)
```
```{r}
# Do TPM conversion, if needed
if (useTPM) {
# Check if the TPMs have already been precalculated for a given cancer
fileNameTPM <- paste0(data_dir, "/", cancer, "_TPM.Rda")
if (!file.exists(fileNameTPM)) {
source("calcTPM.R")
load(file = "data/feature_length.Rda")
common_genes <- intersect(colnames(expr), feature_length$Symbol) # Common gene symbols
expr <- expr[, c("AffyID", common_genes)] # Subset expression matrix
feature_length <- feature_length[feature_length$Symbol %in% common_genes, ] # Subset feature length
feature_length <- feature_length[match(colnames(expr)[ -1 ], feature_length$Symbol), ] # Match order
all.equal(colnames(expr), c("AffyID", feature_length$Symbol)) # Should be true
expr_tpm <- calcTPM(t(expr[, -1]), feature_length = feature_length) # Convert to TPM, takes time
expr_tpm <- data.frame(AffyID = expr[, "AffyID"], t(expr_tpm), stringsAsFactors = FALSE)
expr <- expr_tpm
save(list = c("expr"), file = fileNameTPM) # Save the object
} else {
load(file = fileNameTPM)
}
}
## Gives count, mean, standard deviation, standard error of the mean, and confidence interval (default 95%).
## data: a data frame.
## measurevar: the name of a column that contains the variable to be summariezed
## groupvars: a vector containing names of columns that contain grouping variables
## na.rm: a boolean that indicates whether to ignore NA's
## conf.interval: the percent range of the confidence interval (default is 95%)
summarySE <- function(data=NULL, measurevar, groupvars=NULL, na.rm=FALSE,
conf.interval=.95, .drop=TRUE) {
library(plyr)
# New version of length which can handle NA's: if na.rm==T, don't count them
length2 <- function (x, na.rm=FALSE) {
if (na.rm) sum(!is.na(x))
else length(x)
}
# This does the summary. For each group's data frame, return a vector with
# N, mean, and sd
datac <- ddply(data, groupvars, .drop=.drop,
.fun = function(xx, col) {
c(N = length2(xx[[col]], na.rm=na.rm),
mean = mean (xx[[col]], na.rm=na.rm),
sd = sd (xx[[col]], na.rm=na.rm)
)
},
measurevar
)
# Rename the "mean" column
datac <- rename(datac, c("mean" = measurevar))
datac$se <- datac$sd / sqrt(datac$N) # Calculate standard error of the mean
# Confidence interval multiplier for standard error
# Calculate t-statistic for confidence interval:
# e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
ciMult <- qt(conf.interval/2 + .5, datac$N-1)
datac$ci <- datac$se * ciMult
return(datac)
}
```
## Differential expression analysis
Samples in the selected cancer cohort were separated into clinical subgroups, e.g., within "race" clinical category pairs of subcategories, e.g., "black or african american" vs. "white" subgroups were compared for differentially expressed genes using `limma` v `r packageVersion("limma")` R package [@Ritchie:2015aa; @Smyth:2004aa]. P-values were corrected for multiple testing using False Discovery Rate (FDR) method [@Benjamini:1995aa]. Genes differentially expressed at unadjusted p-value cutoff `r pval_cutoff` were selected for '`r enrichr_category`' functional enrichment analysis.
Results are stored in the Excel file `r fileNameRes`.
- Sheets are named by numbers
- One sheet contains differentially expressed genes, followed by sheets with functional analysis results
- Each results output has a header explaining what the results are
- Legend for differentially expressed gene lists: "Gene" - gene annotations; "logFC" - log fold change; "AveExpr" - average expression, log2; "t" - t-statistics; "P.Val"/"adj.P.Val" - non-/FDR-adjusted p-value, "B" - another statistics.
- Legend for functional enrichment analysis: "database" - source of functional annotations, "category" - name of functional annotation, "pval" - unadjusted enrichment p-value, "qval" - FDR-adjusted p-value, "genes" - comma-separated differentially expressed genes enriched in a corresponding functional category.
```{r}
# Create results folder
system("mkdir -p results")
# Initialize results file
unlink(fileNameRes)
wb <- openxlsx::createWorkbook(fileNameRes) # openxlsx::loadWorkbook(fileName)
sheetCount <- 0 # Simple sheet name placeholder
cancer_type <- cancer
print(paste0("Processing cancer ", cancer_type))
mtx <- load_data(disease = cancer_type, data.type = data.type, type = type, data_dir = data_dir, force_reload = FALSE)
clinical_annotations <- summarize_data(mtx = mtx)
# Prepare expression data
expr <- mtx$merged.dat[ , 4:ncol(mtx$merged.dat)] %>% as.matrix
# Filter out low expressed genes
# Should be more than 90% of non-zero values
# ff <- genefilter::pOverA(p = 0.9, A = 0, na.rm = TRUE)
# expr <- expr[, apply(expr, 2, ff)]
expr <- data.frame(AffyID = mtx$merged.dat$bcr, expr, stringsAsFactors = FALSE)
# Prepare clinical data
clin <- mtx$merged.dat[, 1:3]
colnames(clin)[1] <- "AffyID"
# Full clinical information
clin_full <- mtx$clinical[rownames(mtx$clinical) %in% clin$AffyID, ]
clin_full <- clin_full[match(expr$AffyID, rownames(clin_full)), ]
all.equal(rownames(clin_full), expr$AffyID)
# Save clinical data
# wb1 <- openxlsx::createWorkbook(paste0("results/", cancer, "_clinical.xlsx")) # openxlsx::loadWorkbook(fileName)
# save_res(data.frame(Patients = rownames(clin_full), clin_full), fileName = paste0("results/", cancer, "_clinical.xlsx"), wb = wb1, sheetName = paste0(cancer, "_clinical"))
# Override clinical subcategories, if non-empty
if (!is.na(selected_clinical_subcategory) | selected_clinical_subcategory != "") {
clinical_annotations <- selected_clinical_subcategory
table(clin_full[, clinical_annotations], useNA = "no") # Check subgroups
}
# For each clinical annotation
for (annotation in clinical_annotations) {
# Get the number of patients per category in the current annotation
annotations <- table(clin_full[, annotation], useNA = "no")
# Remove annotations less than a pre-defined number of samples
annotations <- annotations[annotations >= min_samples]
# Get combinations of all pairs of clinical subgroups
annotations_combn <- combn(names(annotations), 2)
# If custom subgroups selected, subset the combination matrix
if (!(is.na(selected_clinical_subgroup1) & is.na(selected_clinical_subgroup2))) {
annotations_combn <- annotations_combn[, grepl(selected_clinical_subgroup1, annotations_combn[1, ]) &
grepl(selected_clinical_subgroup2, annotations_combn[2, ]), drop = FALSE]
}
# Go through each combination
for (i in 1:ncol(annotations_combn)) {
group1 <- annotations_combn[1, i] # First group
group2 <- annotations_combn[2, i] # second group
# Samples for second vs. first group comparison
ind_up <- which(rownames(clin_full) %in% rownames(clin_full[ clin_full[, annotation] %in% group2 , ]))
ind_lo <- which(rownames(clin_full) %in% rownames(clin_full[ clin_full[, annotation] %in% group1 , ]))
# For differential analysis, create group labels
group <- vector(mode = "numeric", length = nrow(expr)) # Empty bector
group[ind_up] <- 1 # Assign numeric groups
group[ind_lo] <- 2
# table(group) # How many patients we have
expr_subset <- expr[group != 0, ] # Remove those that are not in quantiles
group <- group[ group != 0 ]
names(group) <- rownames(expr_subset)
# Reshape expr_subset matrix
expr_subset <- (t(expr_subset))
colnames(expr_subset) <- expr_subset[1, ]
expr_subset <- expr_subset[-1, ]
class(expr_subset) <- "numeric"
# Convert to the proper numerical scale for limma
if (grepl("miRNA", data.type)) {
expr_subset <- log2(expr_subset + 1) # miRNA data needs log2 transformation
}
if (grepl("RNA", data.type)) {
expr_subset <- voom(expr_subset)$E
}
if (grepl("CN", data.type)) {
expr_subset <- ((1 + expr_subset) / 2) * 10
}
# boxplot(expr_subset)
# Limma
design <- model.matrix(~0 + factor(group))
colnames(design) <- c("up", "lo")
fit <- lmFit(expr_subset, design)
contrast.matrix <- makeContrasts(up-lo, levels = design)
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)
degs <- topTable(fit2, coef = 1, number = Inf, p.value = pval_cutoff)
# Check if the list of DEGs is non-empty, proceed with enrichment analysis
if (nrow(degs) > 0) {
# Subset the number of DEGs for KEGG analysis to the pre-defined maximum
if (nrow(degs) > max_num_genes) {
degs_subset <- degs[1:max_num_genes, ]
} else {
degs_subset <- degs
}
# Save DEGs, with header
sheetCount <- sheetCount + 1 # Increase sheet count
if (full_degs_output) { # Save the full DEG list?
degs_to_save <- topTable(fit2, coef = 1, number = Inf)
degs_to_save <- data.frame(Gene = rownames(degs_to_save), degs_to_save) # Append gene names to the DEG list
} else {
degs_to_save <- data.frame(Gene = rownames(degs), degs) #Append gene names to the DEG list
}
# Make header
header <- data.frame(matrix("", ncol = ncol(degs_to_save), nrow = 1, dimnames = list(" ", colnames(degs_to_save)))) # Header one-row data frame, column names matching DEGs
header[1, 1] <- paste0(nrow(degs), " DEGs in ", cancer, ", between groups \'", annotation, ":", group2, "\' vs. \'", annotation, ":", group1, "\'") # Construct header
save_res(rbind(header, degs_to_save), fileName = fileNameRes, wb = wb, sheetName = as.character(sheetCount))
# Get list of up- and downregulated genes
up.genes <- sort(unique(rownames(degs_subset)[ degs_subset$t > 0 ]))
dn.genes <- sort(unique(rownames(degs_subset)[ degs_subset$t < 0 ]))
# Run functional enrichment, if not NA
if (!is.na(enrichr_category)) {
for (j in enrichr_category) {
res.kegg <- save_enrichr(up.genes = unique(c(up.genes, dn.genes)), databases = j, fdr.cutoff = fdr.cutoff)
# Save the results, if non-empty
if (nrow(res.kegg) > 0) {
sheetCount <- sheetCount + 1 # Increase sheet count
header <- data.frame(matrix("", ncol = ncol(res.kegg), nrow = 1, dimnames = list(" ", colnames(res.kegg)))) # Header one-row data frame, column names matching DEGs
header[1, 1] <- paste0(nrow(res.kegg), " enrichments in ", j, " in ", cancer, ", between groups \'", annotation, ":", group2, "\' vs. \'", annotation, ":", group1, "\'") # Construct header
save_res(rbind(header, res.kegg), fileName = fileNameRes, wb = wb, sheetName = as.character(sheetCount))
}
}
}
}
}
}
```
```{r fig.height=7}
matrix.to.plot <- expr_subset[rownames(expr_subset) %in% rownames(degs)[1:min(nrow(degs), nplot)], ]
matrix.to.plot[1:5, 1:5]
group.to.plot <- group
group.to.plot <- ifelse(group.to.plot == 1, "EA", "AA")
group.to.plot <- data.frame(Group = group.to.plot)
rownames(group.to.plot) <- colnames(matrix.to.plot)
pheatmap(matrix.to.plot, color=col3, clustering_method = "ward.D", scale = "row", annotation_col = group.to.plot, treeheight_row = 0, treeheight_col = 0, show_colnames = FALSE)
```
```{r fig.height=3, fig.width=5, eval=FALSE}
selected_genes <- c("UBA1", "PMAIP1")
selected_genes_expr <- expr_subset[rownames(expr_subset) %in% selected_genes, ]
colnames(selected_genes_expr) <- ifelse(group == 1, "Black, or AA", "White") %>% as.character()
# Reshape the data
gdata <- reshape2::melt(selected_genes_expr)
colnames(gdata) <- c("Gene", "Race", "value")
# ggplot(gdata, aes(x = gene, y = value, fill = group)) + geom_boxplot()
# ggplot(gdata, aes(x = cancer, y = value, fill = variable)) + geom_bar(position=position_dodge(), stat = "summary", fun.y = "mean")
# http://www.cookbook-r.com/Graphs/Plotting_means_and_error_bars_(ggplot2)/
gdata_summary <- summarySE(gdata, measurevar="value", groupvars=c("Gene", "Race"))
ggplot(gdata_summary, aes(x = Gene, y = value, fill = Race)) +
geom_bar(position=position_dodge(), stat="identity",
colour="black", # Use black outlines,
size=.3) + # Thinner lines
geom_errorbar(aes(ymin=value-se, ymax=value+se),
size=.3, # Thinner lines
width=.2,
position=position_dodge(.9)) +
xlab("Gene") +
ylab("log2 expression") +
# scale_fill_hue(name="Gene", # Legend label, use darker colors
# breaks=selected_genes,
# labels=selected_genes) +
ggtitle("Expression of selected genes in different races") +
scale_y_continuous(breaks=0:20*4) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
```
# References