From cedba6024de51b0fcf473c19bd59ba8f644c4dce Mon Sep 17 00:00:00 2001
From: Adam Dinan <adam1989ie@gmail.com>
Date: Tue, 26 Mar 2024 12:10:09 -0600
Subject: [PATCH] Add additional R dependencies to Dockerfile

---
 Dockerfile                    |   1 +
 bin/DESeq2_normalise_counts.R |  10 +++-
 bin/TMM_normalise_counts.R    | 100 ++++++++++++++++++++--------------
 3 files changed, 70 insertions(+), 41 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index b6d7656..55c7cc2 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -30,6 +30,7 @@ RUN echo "r <- getOption('repos'); r['CRAN'] <- 'http://cran.rstudio.com'; \
 
 RUN R -e 'install.packages(c(  \
     "optparse", "RColorBrewer", "reshape2", "tidyverse",   \
+    "readr", "tibble", "purrr", "tidyr", "stringr", "ggplot2", \
     "scales", "pheatmap", "matrixstats", "umap", "BiocManager"))'
 
 RUN R -e 'BiocManager::install(c("edgeR", "DESeq2"))'
diff --git a/bin/DESeq2_normalise_counts.R b/bin/DESeq2_normalise_counts.R
index 5924663..948d26f 100755
--- a/bin/DESeq2_normalise_counts.R
+++ b/bin/DESeq2_normalise_counts.R
@@ -2,7 +2,15 @@
 
 library(optparse)
 library(DESeq2)
-library(tidyverse)
+if (!require("tidyverse")) {
+    lapply(c("readr", "tibble", "purrr", "tidyr", "stringr", "ggplot2"),
+        library,
+        character.only = TRUE
+    )
+} else {
+    library(tidyverse)
+}
+
 
 option_list <- list(
     make_option(c("-c", "--counts"),
diff --git a/bin/TMM_normalise_counts.R b/bin/TMM_normalise_counts.R
index a9db338..93f7105 100755
--- a/bin/TMM_normalise_counts.R
+++ b/bin/TMM_normalise_counts.R
@@ -3,31 +3,50 @@
 library(optparse)
 library(DESeq2)
 library(edgeR)
-library(tidyverse)
+if (!require("tidyverse")) {
+    lapply(c("readr", "tibble", "purrr", "tidyr", "stringr", "ggplot2"),
+        library,
+        character.only = TRUE
+    )
+} else {
+    library(tidyverse)
+}
 
 option_list <- list(
-    make_option(c("-c", "--counts"), type="character", default=NULL,
-        help="table of read counts per gene", metavar="character"),
-    make_option(c("-l", "--lengths"), type="character", default=NULL,
-        help="table of effective lengths per gene", metavar="character"),
-    make_option(c("-g", "--genes"), type="character", default=NULL,
-        help="core gene subset to be used", metavar="character"),
-    make_option(c("-p", "--perc"), type="character", default=NULL,
-        help="filter to core genes?", metavar="character"),
-    make_option(c("-t", "--log_transform"), type="character", default=NULL,
-        help="log transform the counts? default = FALSE", metavar="character"),
-    make_option(c("-o", "--outdir"), type="character", default=NULL,
-        help="output directory for results", metavar="character")
+    make_option(c("-c", "--counts"),
+        type = "character", default = NULL,
+        help = "table of read counts per gene", metavar = "character"
+    ),
+    make_option(c("-l", "--lengths"),
+        type = "character", default = NULL,
+        help = "table of effective lengths per gene", metavar = "character"
+    ),
+    make_option(c("-g", "--genes"),
+        type = "character", default = NULL,
+        help = "core gene subset to be used", metavar = "character"
+    ),
+    make_option(c("-p", "--perc"),
+        type = "character", default = NULL,
+        help = "filter to core genes?", metavar = "character"
+    ),
+    make_option(c("-t", "--log_transform"),
+        type = "character", default = NULL,
+        help = "log transform the counts? default = FALSE", metavar = "character"
+    ),
+    make_option(c("-o", "--outdir"),
+        type = "character", default = NULL,
+        help = "output directory for results", metavar = "character"
+    )
 )
 
-opt_parser <- OptionParser(option_list=option_list)
+opt_parser <- OptionParser(option_list = option_list)
 opt <- parse_args(opt_parser)
 
 counts_f <- opt$counts
 lengths_f <- opt$lengths
 gene_f <- opt$genes
-perc <- if(opt$perc == "TRUE") TRUE else FALSE
-log <- if(opt$log_transform == "TRUE") TRUE else FALSE
+perc <- if (opt$perc == "TRUE") TRUE else FALSE
+log <- if (opt$log_transform == "TRUE") TRUE else FALSE
 outdir <- opt$outdir
 
 
@@ -36,18 +55,18 @@ counts_tab <- suppressMessages(read_tsv(counts_f))
 lengths_tab <- suppressMessages(read_tsv(lengths_f))
 core_genome <- suppressMessages(read_tsv(gene_f))
 
-lengths_tab <- lengths_tab[match(counts_tab$Gene, lengths_tab$Gene),]
+lengths_tab <- lengths_tab[match(counts_tab$Gene, lengths_tab$Gene), ]
 
 gene_ids <- counts_tab$Gene
-counts_tab <- data.frame(counts_tab[,2:ncol(counts_tab)])
-lengths_tab <- data.frame(lengths_tab[,2:ncol(lengths_tab)])
+counts_tab <- data.frame(counts_tab[, 2:ncol(counts_tab)])
+lengths_tab <- data.frame(lengths_tab[, 2:ncol(lengths_tab)])
 rownames(counts_tab) <- rownames(lengths_tab) <- gene_ids
 
 ## scale counts to reads per median gene length
-median_lens <- rowMedians(as.matrix(lengths_tab), na.rm=TRUE)
+median_lens <- rowMedians(as.matrix(lengths_tab), na.rm = TRUE)
 names(median_lens) <- gene_ids
 
-counts_tab_scaled <- (counts_tab/lengths_tab)
+counts_tab_scaled <- (counts_tab / lengths_tab)
 counts_tab_scaled <- sweep(counts_tab_scaled, 1, median_lens, "*")
 
 
@@ -64,17 +83,17 @@ size_factors <- effectiveLibSizes(y)
 
 ## NOTE: = effectiveLibSizes(y) = (y$samples$lib.size)*(y$samples$norm.factors)
 
-## NOTE: the output of edgeR effectiveLibSizes() is equivalent to the output of 
+## NOTE: the output of edgeR effectiveLibSizes() is equivalent to the output of
 ## sizeFactors() in DESeq2; see: https://support.bioconductor.org/p/46779/
 
 
 
 
-if(isTRUE(perc)){
-
+if (isTRUE(perc)) {
     ## subset to `perc` genes
     counts_tab_perc <- subset(counts_tab_scaled, rownames(
-         counts_tab_scaled) %in% core_genome$gene)
+        counts_tab_scaled
+    ) %in% core_genome$gene)
     counts_tab_perc[is.na(counts_tab_perc)] <- 0
 
     median_sub <- median_lens[rownames(counts_tab_perc)]
@@ -83,50 +102,51 @@ if(isTRUE(perc)){
     res_df <- sweep(counts_tab_perc, 2, size_factors, `/`)
 
     ## get RPKM values
-    lib_sf <- (colSums(counts_tab_perc, na.rm=TRUE)/1e6)
-    gene_sf <- sapply(median_sub, function(l){ l/1e3 * lib_sf })
+    lib_sf <- (colSums(counts_tab_perc, na.rm = TRUE) / 1e6)
+    gene_sf <- sapply(median_sub, function(l) {
+        l / 1e3 * lib_sf
+    })
     gene_sf <- as.data.frame(t(gene_sf))
     rpkm_df <- counts_tab_perc / gene_sf
-
-
 } else {
-
     ## get size factor-scaled counts
     res_df <- sweep(counts_tab_scaled, 2, size_factors, `/`)
 
     ## get RPKM values
-    lib_sf <- (colSums(counts_tab_scaled, na.rm=TRUE)/1e6)
-    gene_sf <- sapply(median_lens, function(l){ l/1e3 * lib_sf })
+    lib_sf <- (colSums(counts_tab_scaled, na.rm = TRUE) / 1e6)
+    gene_sf <- sapply(median_lens, function(l) {
+        l / 1e3 * lib_sf
+    })
     gene_sf <- as.data.frame(t(gene_sf))
     rpkm_df <- counts_tab_scaled / gene_sf
-
 }
 
-if(isTRUE(log)){
-    res_df <- log2(res_df+1)
-    rpkm_df <- log2(rpkm_df+1) 
+if (isTRUE(log)) {
+    res_df <- log2(res_df + 1)
+    rpkm_df <- log2(rpkm_df + 1)
 }
 
 ## convert rownames to column
 res_df <- tibble::rownames_to_column(as.data.frame(res_df), "feature_id")
 rpkm_df <- tibble::rownames_to_column(as.data.frame(rpkm_df), "feature_id")
 counts_tab_scaled <- tibble::rownames_to_column(as.data.frame(
-    counts_tab_scaled), "feature_id")
+    counts_tab_scaled
+), "feature_id")
 
 write.table(
-    counts_tab_scaled, file.path(outdir,"raw_counts.tsv"), 
+    counts_tab_scaled, file.path(outdir, "raw_counts.tsv"),
     col.names = TRUE, row.names = FALSE,
     sep = "\t", quote = FALSE
 )
 
 write.table(
-    res_df, file.path(outdir,"norm_counts.tsv"),
+    res_df, file.path(outdir, "norm_counts.tsv"),
     col.names = TRUE, row.names = FALSE,
     sep = "\t", quote = FALSE
 )
 
 write.table(
-    rpkm_df, file.path(outdir,"rpkm_counts.tsv"),
+    rpkm_df, file.path(outdir, "rpkm_counts.tsv"),
     col.names = TRUE, row.names = FALSE,
     sep = "\t", quote = FALSE
 )