From 9b4d7f98d937463b339baa5241b2e9e11df0712d Mon Sep 17 00:00:00 2001 From: Jaclyn Taroni Date: Wed, 19 Aug 2020 21:55:37 -0400 Subject: [PATCH 1/2] Changes to get the ComBat step to run in Docker --- analyses/molecular-subtyping-MB/01-classify-mb.R | 13 +++++-------- .../run-molecular-subtyping-MB-MM2S.sh | 1 - ...run-molecular-subtyping-MB-medullo-classifier.sh | 3 +-- 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/analyses/molecular-subtyping-MB/01-classify-mb.R b/analyses/molecular-subtyping-MB/01-classify-mb.R index c95d59c35f..07608c870f 100644 --- a/analyses/molecular-subtyping-MB/01-classify-mb.R +++ b/analyses/molecular-subtyping-MB/01-classify-mb.R @@ -21,7 +21,7 @@ option_list <- list( make_option(c("--strandedexprs"), type = "character", help = "Stranded Expression data: HUGO symbol x Sample identifiers (.rds)"), make_option(c("--batch_col"), type = "character", - default = NA, + default = NULL, help = "Combine and batch correct input matrices using which column?"), make_option(c("--clin"), type = "character", help = "Clinical file (.tsv)"), @@ -68,7 +68,7 @@ filter.mat <- function(expr.input, clin.mb) { # subset expression to MB samples mb.samples <- intersect(clin.mb$Kids_First_Biospecimen_ID, colnames(expr.input)) expr.input <- expr.input %>% - dplyr::select(all_of(mb.samples)) + dplyr::select(mb.samples) return(expr.input) } @@ -85,17 +85,14 @@ expr.input.mb <- expr.input.mb[[1]] %>% column_to_rownames('gene') # if batch_col is set, do the following: -if(!is.na(batch_col)){ +if(!is.null(batch_col)){ print("Batch correct input matrices...") # match clinical rows and expression cols - clin.mb <- clin.mb %>% - mutate(tmp = Kids_First_Biospecimen_ID) %>% - column_to_rownames('tmp') - expr.input.mb <- expr.input.mb[,rownames(clin.mb)] + expr.input.mb <- as.matrix(expr.input.mb[,clin.mb$Kids_First_Biospecimen_ID]) # batch correct using batch_col - expr.input.mb <- ComBat(dat = log2(expr.input.mb + 1), batch = clin.mb$RNA_library) + expr.input.mb <- ComBat(dat = log2(expr.input.mb + 1), batch = clin.mb[, batch_col]) } else { # log transform expr.input.mb <- log2(expr.input.mb + 1) diff --git a/analyses/molecular-subtyping-MB/run-molecular-subtyping-MB-MM2S.sh b/analyses/molecular-subtyping-MB/run-molecular-subtyping-MB-MM2S.sh index 5a017c9f0d..85440ed33b 100644 --- a/analyses/molecular-subtyping-MB/run-molecular-subtyping-MB-MM2S.sh +++ b/analyses/molecular-subtyping-MB/run-molecular-subtyping-MB-MM2S.sh @@ -23,7 +23,6 @@ Rscript --vanilla 01-classify-mb.R \ --polyaexprs ../../data/pbta-gene-expression-rsem-fpkm-collapsed.polya.rds \ --strandedexprs ../../data/pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds \ --clin ../../data/pbta-histologies.tsv \ ---batch_col NA \ --hist_column integrated_diagnosis \ --method MM2S \ --outputfile results/mb-molecular-subtypes-MM2S.rds diff --git a/analyses/molecular-subtyping-MB/run-molecular-subtyping-MB-medullo-classifier.sh b/analyses/molecular-subtyping-MB/run-molecular-subtyping-MB-medullo-classifier.sh index 5c2f943281..9daac02f80 100644 --- a/analyses/molecular-subtyping-MB/run-molecular-subtyping-MB-medullo-classifier.sh +++ b/analyses/molecular-subtyping-MB/run-molecular-subtyping-MB-medullo-classifier.sh @@ -23,7 +23,6 @@ Rscript --vanilla 01-classify-mb.R \ --polyaexprs ../../data/pbta-gene-expression-rsem-fpkm-collapsed.polya.rds \ --strandedexprs ../../data/pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds \ --clin ../../data/pbta-histologies.tsv \ ---batch_col NA \ --hist_column integrated_diagnosis \ --method medullo-classifier \ --outputfile results/mb-molecular-subtypes-medullo-classifier.rds @@ -32,4 +31,4 @@ Rscript --vanilla 01-classify-mb.R \ Rscript 02-compare-classes.R \ --observed_class results/mb-molecular-subtypes-medullo-classifier.rds \ --expected_class input/expected_class.rds \ ---outputfile results/comparison-medullo-classifier.rds \ No newline at end of file +--outputfile results/comparison-medullo-classifier.rds From adf3287529575e9793c818f13de0f5eae9bde3e4 Mon Sep 17 00:00:00 2001 From: Jaclyn Taroni Date: Wed, 19 Aug 2020 22:46:08 -0400 Subject: [PATCH 2/2] Add idea for filter and ComBat script --- .../00-filter-and-batch-correction.R | 91 +++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 analyses/molecular-subtyping-MB/00-filter-and-batch-correction.R diff --git a/analyses/molecular-subtyping-MB/00-filter-and-batch-correction.R b/analyses/molecular-subtyping-MB/00-filter-and-batch-correction.R new file mode 100644 index 0000000000..ed3af2a002 --- /dev/null +++ b/analyses/molecular-subtyping-MB/00-filter-and-batch-correction.R @@ -0,0 +1,91 @@ + +# load libraries +suppressPackageStartupMessages(library(optparse)) +suppressPackageStartupMessages(library(tidyverse)) +suppressPackageStartupMessages(library(sva)) + +option_list <- list( + make_option(c("--batch_col"), type = "character", + default = NULL, + help = "Combine and batch correct input matrices using which column?"), + make_option(c("--output_dir"), type = "character", + help = "Output directory"), + make_option(c("--output_prefix"), type = "character", + help = "Output file prefix") + +) + +# parse options +opt <- parse_args(OptionParser(option_list = option_list)) +batch_col <- opt$batch_col +output_prefix <- opt$output_prefix +output_dir <- opt$output_dir + +# directories +root_dir <- rprojroot::find_root(rprojroot::has_dir(".git")) +data_dir <- file.path(root_dir, "data") +dir.create(output_dir, showWarnings = FALSE, recursive = TRUE) + +# input files +polya.file <- file.path(data_dir, + "pbta-gene-expression-rsem-fpkm-collapsed.polya.rds") +stranded.file <- file.path(data_dir, + "pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds") +clin.file <- file.path(data_dir, + "pbta-histologies.tsv") + +# output files +uncorrected.file <- file.path(output_dir, paste0(output_prefix, ".rds")) +corrected.file <- file.path(output_dir, paste0(output_prefix, "-batch-corrected.rds")) + +# expression from polya and stranded data +polya <- readRDS(polya.file) +stranded <- readRDS(stranded.file) + +# read and subset clinical file to MB samples +clin <- read.delim(clin.file, stringsAsFactors = F) +clin.mb <- clin %>% + filter(experimental_strategy == "RNA-Seq", + integrated_diagnosis == "Medulloblastoma") %>% + dplyr::select(Kids_First_Biospecimen_ID, sample_id, RNA_library) + +# function to filter only MB samples +filter.mat <- function(expr.input, clin.mb) { + + # get data + expr.input <- get(expr.input) + + # subset expression to MB samples + mb.samples <- intersect(clin.mb$Kids_First_Biospecimen_ID, colnames(expr.input)) + expr.input <- expr.input %>% + dplyr::select(mb.samples) + + return(expr.input) +} + +# filter matrices to MB only +expr.input <- c("polya", "stranded") +expr.input.mb <- lapply(expr.input, FUN = function(x) filter.mat(expr.input = x, clin = clin.mb)) + +# combine both matrices using common genes +expr.input.mb <- expr.input.mb[[1]] %>% + rownames_to_column('gene') %>% + inner_join(expr.input.mb[[2]] %>% + rownames_to_column('gene'), by = 'gene') %>% + column_to_rownames('gene') + +uncorrected.df <- log2(expr.input.mb + 1) +write_rds(uncorrected.df, uncorrected.file) + +if(!is.null(batch_col)){ + print("Batch correct input matrices...") + + # match clinical rows and expression cols + expr.input.mb <- as.matrix(expr.input.mb[,clin.mb$Kids_First_Biospecimen_ID]) + + # batch correct using batch_col + corrected.mat <- ComBat(dat = log2(expr.input.mb + 1), batch = clin.mb[, batch_col]) + + write_rds(corrected.mat, corrected.file) +} +