Skip to content
This repository has been archived by the owner on Oct 2, 2024. It is now read-only.

Generate summary enrichment heatmaps #55

Merged
merged 8 commits into from
Feb 29, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
343 changes: 71 additions & 272 deletions analyses/add-histologies/02-summary_stats.R

Large diffs are not rendered by default.

184 changes: 184 additions & 0 deletions analyses/add-histologies/03-alluvial_plot.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
# R Corbett 2023
#
# Generate alluvial plot for PBTA ancestry project

# load libraries and set directories
library(data.table)
library(tidyverse)
# library(ComplexHeatmap)
# library(RColorBrewer)
# library(circlize)
library(colorblindr)
library(ggpubr)
library(ggalluvial)
library(cowplot)

root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))

# Set file paths
data_dir <- file.path(root_dir, "data")
analysis_dir <- file.path(root_dir, "analyses", "add-histologies")
results_dir <- file.path(analysis_dir, "results")
input_dir <- file.path(analysis_dir, "input")
plots_dir <- file.path(analysis_dir, "plots")

# call plot theme
source(file.path(root_dir, "figures", "theme.R"))

# set file paths
ancestry_file <- file.path(results_dir, "merged_ancestry_histology_data.tsv")

# wrangle data
ancestry <- read_tsv(ancestry_file)

# consolidate unreported/unknown reported races and ethnicities
ancestry <- ancestry %>%
dplyr::mutate(race = case_when(
race %in% c("Not Reported", "Other", "Reported Unknown", "Not Reported/Unknown", "Unknown") | is.na(race) ~ "Race Unknown",
race == "White/Caucasian" ~ "White",
race == "American Indian or Alaska Native" ~ "AI/AN",
race == "Native Hawaiian or Other Pacific Islander" ~ "NHPI",
race == "Black or African American" ~ "Black/Afr. Am.",
race == "More Than One Race" ~ ">1 Race",
TRUE ~ race
)) %>%
mutate(ethnicity = case_when(
grepl("Reported|Available|Unavailable", ethnicity) | is.na(ethnicity) ~ "Unknown",
grepl("Non-Hispanic|Not Hispanic", ethnicity) ~ "Not Hispanic/Latino",
TRUE ~ "Hispanic/Latino"
))


# Create data frame for alluvial plots, including only predicted_ancestry, race, and ethnicity columns
alluvial_df <- as.data.frame(table(ancestry$predicted_ancestry, ancestry$race, ancestry$ethnicity)) %>%
# format for alluvial plot
dplyr::rename(predicted_ancestry = Var1,
race = Var2,
ethnicity = Var3) %>%
to_lodes_form(axes = 1:3) %>%
dplyr::rename(Group = stratum) %>%
mutate(Group = factor(Group, levels = c("EAS", "SAS", "AFR", "AMR", "EUR",
"Asian", "Black/Afr. Am.", "AI/AN",
"NHPI", "White", ">1 Race",
"Race Unknown",
"Hispanic/Latino",
"Not Hispanic/Latino",
"Unknown"))) %>%
mutate(race = case_when(Group %in% c("Asian", "Black/Afr. Am.", "AI/AN",
"NHPI", "White", ">1 Race",
"Race Unknown") ~ Group,
TRUE ~ NA),
predicted_ancestry = case_when(Group %in% c("EAS", "SAS", "AFR", "EUR", "AMR") ~ Group,
TRUE ~ NA),
ethnicity = case_when(Group %in% c("Hispanic/Latino",
"Not Hispanic/Latino",
"Unknown") ~ Group,
TRUE ~ NA))

# Generate alluvial plot
p1 <- ggplot(alluvial_df, aes(y = Freq, stratum = Group, alluvium = alluvium, x = x, fill = Group)) +
geom_alluvium(show.legend = F) +
geom_stratum(show.legend = F) +
scale_fill_manual(values = c("Asian" = "#009E73", "SAS" = "#D55E00", "EAS" = "#009E73",
"White" = "#0072B2", "EUR" = "#0072B2",
"Black/Afr. Am." = "#E69F00", "AFR" = "#E69F00",
"NHPI" = "skyblue", "AMR" = "#56B4E9",
"AI/AN" = "#56B4E9",
"Race Unknown" = "grey", ">1 Race" = "brown",
"Hispanic/Latino" = "#56B4E9",
"Not Hispanic/Latino" = "#0072B2",
"Unknown" = "grey")) +
xlab("") +
ylab("Number of Patients") +
scale_x_discrete(labels = c("predicted ancestry", "reported race", "reported ethnicity")) +
theme_Publication()

# Create separate ancestry, race, and ethnicity dfs for legend generation
race_df <- data.frame(race = c("Asian", "Black/Afr. Am.", "AI/AN",
"NHPI", "White", ">1 Race",
"Race Unknown"),
value = 1)
ancestry_df <- data.frame(predicted_ancestry = c("EAS", "SAS", "AFR", "EUR", "AMR"),
value = 1)
ethnicity_df <- data.frame(ethnicity = c("Hispanic/Latino",
"Not Hispanic/Latino",
"Unknown"),
value = 1)

# plot race legend
lgd_race <- ggplot(race_df, aes(x = value, y = factor(race, levels = c("Race Unknown",
">1 Race", "White",
"AI/AN",
"NHPI",
"Black/Afr. Am.", "Asian")),
fill = race)) +
geom_tile(show.legend = T, color = "black",
lwd = 0.5, linetype = 1) +
scale_fill_manual(values = c("Asian" = "#009E73",
"White" = "#0072B2",
"Black/Afr. Am." = "#E69F00",
"AI/AN" = "#56B4E9",
"NHPI" = "skyblue",
">1 Race" = "brown",
"Race Unknown" = "grey"),
breaks = c("Asian",
"Black/Afr. Am.",
"AI/AN",
"NHPI",
"White",
">1 Race",
"Race Unknown")) +
labs(fill = "Reported Race") +
theme_Publication()
leg_race <- get_legend(lgd_race)

# Plot ancestry legend
lgd_anc <- ggplot(ancestry_df, aes(x = value, y = factor(predicted_ancestry,
levels = c("SAS","EAS","EUR","AMR","AFR")),
fill = predicted_ancestry)) +
geom_tile(show.legend = T, col = "black",
lwd = 0.5, linetype = 1) +
#xlim() +
scale_fill_manual(values = c("SAS" = "#D55E00",
"EAS" = "#009E73",
"EUR" = "#0072B2",
"AFR" = "#E69F00",
"AMR" = "#56B4E9"), breaks = c("EAS", "SAS", "AFR", "AMR", "EUR")) +
labs(fill = "Predicted Ancestry") +
theme_Publication()
leg_anc <- get_legend(lgd_anc)

# Plot ethnicity legend
lgd_ethn <- ggplot(ethnicity_df, aes(x = value, y = factor(ethnicity,
levels = c("Hispanic/Latino",
"Not Hispanic/Latino",
"Unknown")),
fill = ethnicity)) +
geom_tile(show.legend = T, col = "black",
lwd = 0.5, linetype = 1) +
#xlim() +
scale_fill_manual(values = c("Hispanic/Latino" = "#56B4E9",
"Not Hispanic/Latino" = "#0072B2",
"Unknown" = "grey")) +
labs(fill = "Reported Ethnicity") +
theme_Publication()
leg_ethn <- get_legend(lgd_ethn)

leg <- plot_grid(leg_anc, leg_race, leg_ethn,
nrow = 3,
align = "v",
rel_heights = c(1.5, 0.4, 1.5))


final_p <- plot_grid(p1, leg,
nrow = 1,
align = "none",
axis = "t",
rel_widths = c(1,0.45))

pdf(file.path(plots_dir, "ancestry-race-ethnicity-alluvial.pdf"),
width = 10, height = 6)

final_p

dev.off()
Binary file modified analyses/add-histologies/plots/ancestry-pcs.pdf
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file modified analyses/add-histologies/plots/low_major_ancestry_heatmap.pdf
Binary file not shown.
Binary file modified analyses/add-histologies/plots/major_predicted_ancestry_hist.pdf
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file modified analyses/add-histologies/plots/plot_group_by_ancestry.pdf
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
63 changes: 63 additions & 0 deletions analyses/add-histologies/util/heatmap_function.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Functions for generating count and enrichment heatmaps
#
# Ryan Corbett
#
# 2024

# Function to create frequency count and enrichment heatmap for two selected variables. Significance of enrichment is calculated using hypergeometric tests

plot_enr <- function(df, var1, var2,
var1_names, var2_names,
padjust = FALSE) {

enr <- matrix(0, nrow(unique(df[,var1])),
nrow(unique(df[,var2])),
dimnames = list(var1_names,
var2_names))
pval <- enr
ct <- enr

# loop through cancer groups to calculate enrichment
for (i in 1:nrow(enr)){
no_var1 <- sum(unlist(df[,var1]) == rownames(enr)[i])
for (j in 1:ncol(enr)){
no_var2 <- sum(unlist(df[,var2]) == colnames(enr)[j] & !is.na(unlist(df[,var2])))
no_var1_var2 <- sum(unlist(df[,var1]) == rownames(enr)[i] & unlist(df[,var2]) == colnames(enr)[j])
ct[i,j] <- no_var1_var2
enr[i,j] <- (no_var1_var2/no_var2)/(no_var1/nrow(df))
pval[i,j] <- phyper(no_var1_var2, no_var1, nrow(df) - no_var1, no_var2, lower.tail = F)
}
}

if (padjust == TRUE) {

fdr <- t(apply(pval, 1, function(x) p.adjust(x, "fdr")))
sig_mat <- ifelse(fdr < 0.05 & enr > 1 & ct > 1, "*", "")

} else {

sig_mat <- ifelse(pval < 0.05 & enr > 1 & ct > 1, "*", "")

}

fill_mat <- matrix(glue::glue("{round(enr, 1)}{sig_mat}"),
nrow(enr), ncol(enr))

ct_enr_mat <- matrix(glue::glue("{ct}\n({fill_mat})"),
nrow(ct), ncol(ct))

col_fun = colorRamp2(c(0, ceiling(max(enr))), c("white", "orangered"))

region_ht <- Heatmap(enr,
name = "Odds ratio",
cluster_rows = F,
cluster_columns = F,
rect_gp = gpar(col = "black", lwd = 2),
col = col_fun,
cell_fun = function(j, i, x, y, width, height, fill) {
grid.text(sprintf("%s", ct_enr_mat[i, j]), x, y, gp = gpar(fontsize = 12))
})

return(region_ht)

}
Loading