Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CI related edits to MB subtype steps #1

Merged
merged 3 commits into from
Aug 20, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 91 additions & 0 deletions analyses/molecular-subtyping-MB/00-filter-and-batch-correction.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@

# load libraries
suppressPackageStartupMessages(library(optparse))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(sva))

option_list <- list(
make_option(c("--batch_col"), type = "character",
default = NULL,
help = "Combine and batch correct input matrices using which column?"),
make_option(c("--output_dir"), type = "character",
help = "Output directory"),
make_option(c("--output_prefix"), type = "character",
help = "Output file prefix")

)

# parse options
opt <- parse_args(OptionParser(option_list = option_list))
batch_col <- opt$batch_col
output_prefix <- opt$output_prefix
output_dir <- opt$output_dir

# directories
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
data_dir <- file.path(root_dir, "data")
dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)

# input files
polya.file <- file.path(data_dir,
"pbta-gene-expression-rsem-fpkm-collapsed.polya.rds")
stranded.file <- file.path(data_dir,
"pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds")
clin.file <- file.path(data_dir,
"pbta-histologies.tsv")

# output files
uncorrected.file <- file.path(output_dir, paste0(output_prefix, ".rds"))
corrected.file <- file.path(output_dir, paste0(output_prefix, "-batch-corrected.rds"))

# expression from polya and stranded data
polya <- readRDS(polya.file)
stranded <- readRDS(stranded.file)

# read and subset clinical file to MB samples
clin <- read.delim(clin.file, stringsAsFactors = F)
clin.mb <- clin %>%
filter(experimental_strategy == "RNA-Seq",
integrated_diagnosis == "Medulloblastoma") %>%
dplyr::select(Kids_First_Biospecimen_ID, sample_id, RNA_library)

# function to filter only MB samples
filter.mat <- function(expr.input, clin.mb) {

# get data
expr.input <- get(expr.input)

# subset expression to MB samples
mb.samples <- intersect(clin.mb$Kids_First_Biospecimen_ID, colnames(expr.input))
expr.input <- expr.input %>%
dplyr::select(mb.samples)

return(expr.input)
}

# filter matrices to MB only
expr.input <- c("polya", "stranded")
expr.input.mb <- lapply(expr.input, FUN = function(x) filter.mat(expr.input = x, clin = clin.mb))

# combine both matrices using common genes
expr.input.mb <- expr.input.mb[[1]] %>%
rownames_to_column('gene') %>%
inner_join(expr.input.mb[[2]] %>%
rownames_to_column('gene'), by = 'gene') %>%
column_to_rownames('gene')

uncorrected.df <- log2(expr.input.mb + 1)
write_rds(uncorrected.df, uncorrected.file)

if(!is.null(batch_col)){
print("Batch correct input matrices...")

# match clinical rows and expression cols
expr.input.mb <- as.matrix(expr.input.mb[,clin.mb$Kids_First_Biospecimen_ID])

# batch correct using batch_col
corrected.mat <- ComBat(dat = log2(expr.input.mb + 1), batch = clin.mb[, batch_col])

write_rds(corrected.mat, corrected.file)
}

13 changes: 5 additions & 8 deletions analyses/molecular-subtyping-MB/01-classify-mb.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ option_list <- list(
make_option(c("--strandedexprs"), type = "character",
help = "Stranded Expression data: HUGO symbol x Sample identifiers (.rds)"),
make_option(c("--batch_col"), type = "character",
default = NA,
default = NULL,
help = "Combine and batch correct input matrices using which column?"),
make_option(c("--clin"), type = "character",
help = "Clinical file (.tsv)"),
Expand Down Expand Up @@ -68,7 +68,7 @@ filter.mat <- function(expr.input, clin.mb) {
# subset expression to MB samples
mb.samples <- intersect(clin.mb$Kids_First_Biospecimen_ID, colnames(expr.input))
expr.input <- expr.input %>%
dplyr::select(all_of(mb.samples))
dplyr::select(mb.samples)

return(expr.input)
}
Expand All @@ -85,17 +85,14 @@ expr.input.mb <- expr.input.mb[[1]] %>%
column_to_rownames('gene')

# if batch_col is set, do the following:
if(!is.na(batch_col)){
if(!is.null(batch_col)){
print("Batch correct input matrices...")

# match clinical rows and expression cols
clin.mb <- clin.mb %>%
mutate(tmp = Kids_First_Biospecimen_ID) %>%
column_to_rownames('tmp')
expr.input.mb <- expr.input.mb[,rownames(clin.mb)]
expr.input.mb <- as.matrix(expr.input.mb[,clin.mb$Kids_First_Biospecimen_ID])

# batch correct using batch_col
expr.input.mb <- ComBat(dat = log2(expr.input.mb + 1), batch = clin.mb$RNA_library)
expr.input.mb <- ComBat(dat = log2(expr.input.mb + 1), batch = clin.mb[, batch_col])
} else {
# log transform
expr.input.mb <- log2(expr.input.mb + 1)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ Rscript --vanilla 01-classify-mb.R \
--polyaexprs ../../data/pbta-gene-expression-rsem-fpkm-collapsed.polya.rds \
--strandedexprs ../../data/pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds \
--clin ../../data/pbta-histologies.tsv \
--batch_col NA \
--hist_column integrated_diagnosis \
--method MM2S \
--outputfile results/mb-molecular-subtypes-MM2S.rds
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ Rscript --vanilla 01-classify-mb.R \
--polyaexprs ../../data/pbta-gene-expression-rsem-fpkm-collapsed.polya.rds \
--strandedexprs ../../data/pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds \
--clin ../../data/pbta-histologies.tsv \
--batch_col NA \
--hist_column integrated_diagnosis \
--method medullo-classifier \
--outputfile results/mb-molecular-subtypes-medullo-classifier.rds
Expand All @@ -32,4 +31,4 @@ Rscript --vanilla 01-classify-mb.R \
Rscript 02-compare-classes.R \
--observed_class results/mb-molecular-subtypes-medullo-classifier.rds \
--expected_class input/expected_class.rds \
--outputfile results/comparison-medullo-classifier.rds
--outputfile results/comparison-medullo-classifier.rds