Skip to content

Commit

Permalink
Add additional R dependencies to Dockerfile
Browse files Browse the repository at this point in the history
  • Loading branch information
adamd3 committed Mar 26, 2024
1 parent b6d55ac commit cedba60
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 41 deletions.
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ RUN echo "r <- getOption('repos'); r['CRAN'] <- 'http://cran.rstudio.com'; \

RUN R -e 'install.packages(c( \
"optparse", "RColorBrewer", "reshape2", "tidyverse", \
"readr", "tibble", "purrr", "tidyr", "stringr", "ggplot2", \
"scales", "pheatmap", "matrixstats", "umap", "BiocManager"))'

RUN R -e 'BiocManager::install(c("edgeR", "DESeq2"))'
10 changes: 9 additions & 1 deletion bin/DESeq2_normalise_counts.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,15 @@

library(optparse)
library(DESeq2)
library(tidyverse)
if (!require("tidyverse")) {
lapply(c("readr", "tibble", "purrr", "tidyr", "stringr", "ggplot2"),
library,
character.only = TRUE
)
} else {
library(tidyverse)
}


option_list <- list(
make_option(c("-c", "--counts"),
Expand Down
100 changes: 60 additions & 40 deletions bin/TMM_normalise_counts.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,31 +3,50 @@
library(optparse)
library(DESeq2)
library(edgeR)
library(tidyverse)
if (!require("tidyverse")) {
lapply(c("readr", "tibble", "purrr", "tidyr", "stringr", "ggplot2"),
library,
character.only = TRUE
)
} else {
library(tidyverse)
}

option_list <- list(
make_option(c("-c", "--counts"), type="character", default=NULL,
help="table of read counts per gene", metavar="character"),
make_option(c("-l", "--lengths"), type="character", default=NULL,
help="table of effective lengths per gene", metavar="character"),
make_option(c("-g", "--genes"), type="character", default=NULL,
help="core gene subset to be used", metavar="character"),
make_option(c("-p", "--perc"), type="character", default=NULL,
help="filter to core genes?", metavar="character"),
make_option(c("-t", "--log_transform"), type="character", default=NULL,
help="log transform the counts? default = FALSE", metavar="character"),
make_option(c("-o", "--outdir"), type="character", default=NULL,
help="output directory for results", metavar="character")
make_option(c("-c", "--counts"),
type = "character", default = NULL,
help = "table of read counts per gene", metavar = "character"
),
make_option(c("-l", "--lengths"),
type = "character", default = NULL,
help = "table of effective lengths per gene", metavar = "character"
),
make_option(c("-g", "--genes"),
type = "character", default = NULL,
help = "core gene subset to be used", metavar = "character"
),
make_option(c("-p", "--perc"),
type = "character", default = NULL,
help = "filter to core genes?", metavar = "character"
),
make_option(c("-t", "--log_transform"),
type = "character", default = NULL,
help = "log transform the counts? default = FALSE", metavar = "character"
),
make_option(c("-o", "--outdir"),
type = "character", default = NULL,
help = "output directory for results", metavar = "character"
)
)

opt_parser <- OptionParser(option_list=option_list)
opt_parser <- OptionParser(option_list = option_list)
opt <- parse_args(opt_parser)

counts_f <- opt$counts
lengths_f <- opt$lengths
gene_f <- opt$genes
perc <- if(opt$perc == "TRUE") TRUE else FALSE
log <- if(opt$log_transform == "TRUE") TRUE else FALSE
perc <- if (opt$perc == "TRUE") TRUE else FALSE
log <- if (opt$log_transform == "TRUE") TRUE else FALSE
outdir <- opt$outdir


Expand All @@ -36,18 +55,18 @@ counts_tab <- suppressMessages(read_tsv(counts_f))
lengths_tab <- suppressMessages(read_tsv(lengths_f))
core_genome <- suppressMessages(read_tsv(gene_f))

lengths_tab <- lengths_tab[match(counts_tab$Gene, lengths_tab$Gene),]
lengths_tab <- lengths_tab[match(counts_tab$Gene, lengths_tab$Gene), ]

gene_ids <- counts_tab$Gene
counts_tab <- data.frame(counts_tab[,2:ncol(counts_tab)])
lengths_tab <- data.frame(lengths_tab[,2:ncol(lengths_tab)])
counts_tab <- data.frame(counts_tab[, 2:ncol(counts_tab)])
lengths_tab <- data.frame(lengths_tab[, 2:ncol(lengths_tab)])
rownames(counts_tab) <- rownames(lengths_tab) <- gene_ids

## scale counts to reads per median gene length
median_lens <- rowMedians(as.matrix(lengths_tab), na.rm=TRUE)
median_lens <- rowMedians(as.matrix(lengths_tab), na.rm = TRUE)
names(median_lens) <- gene_ids

counts_tab_scaled <- (counts_tab/lengths_tab)
counts_tab_scaled <- (counts_tab / lengths_tab)
counts_tab_scaled <- sweep(counts_tab_scaled, 1, median_lens, "*")


Expand All @@ -64,17 +83,17 @@ size_factors <- effectiveLibSizes(y)

## NOTE: = effectiveLibSizes(y) = (y$samples$lib.size)*(y$samples$norm.factors)

## NOTE: the output of edgeR effectiveLibSizes() is equivalent to the output of
## NOTE: the output of edgeR effectiveLibSizes() is equivalent to the output of
## sizeFactors() in DESeq2; see: https://support.bioconductor.org/p/46779/




if(isTRUE(perc)){

if (isTRUE(perc)) {
## subset to `perc` genes
counts_tab_perc <- subset(counts_tab_scaled, rownames(
counts_tab_scaled) %in% core_genome$gene)
counts_tab_scaled
) %in% core_genome$gene)
counts_tab_perc[is.na(counts_tab_perc)] <- 0

median_sub <- median_lens[rownames(counts_tab_perc)]
Expand All @@ -83,50 +102,51 @@ if(isTRUE(perc)){
res_df <- sweep(counts_tab_perc, 2, size_factors, `/`)

## get RPKM values
lib_sf <- (colSums(counts_tab_perc, na.rm=TRUE)/1e6)
gene_sf <- sapply(median_sub, function(l){ l/1e3 * lib_sf })
lib_sf <- (colSums(counts_tab_perc, na.rm = TRUE) / 1e6)
gene_sf <- sapply(median_sub, function(l) {
l / 1e3 * lib_sf
})
gene_sf <- as.data.frame(t(gene_sf))
rpkm_df <- counts_tab_perc / gene_sf


} else {

## get size factor-scaled counts
res_df <- sweep(counts_tab_scaled, 2, size_factors, `/`)

## get RPKM values
lib_sf <- (colSums(counts_tab_scaled, na.rm=TRUE)/1e6)
gene_sf <- sapply(median_lens, function(l){ l/1e3 * lib_sf })
lib_sf <- (colSums(counts_tab_scaled, na.rm = TRUE) / 1e6)
gene_sf <- sapply(median_lens, function(l) {
l / 1e3 * lib_sf
})
gene_sf <- as.data.frame(t(gene_sf))
rpkm_df <- counts_tab_scaled / gene_sf

}

if(isTRUE(log)){
res_df <- log2(res_df+1)
rpkm_df <- log2(rpkm_df+1)
if (isTRUE(log)) {
res_df <- log2(res_df + 1)
rpkm_df <- log2(rpkm_df + 1)
}

## convert rownames to column
res_df <- tibble::rownames_to_column(as.data.frame(res_df), "feature_id")
rpkm_df <- tibble::rownames_to_column(as.data.frame(rpkm_df), "feature_id")
counts_tab_scaled <- tibble::rownames_to_column(as.data.frame(
counts_tab_scaled), "feature_id")
counts_tab_scaled
), "feature_id")

write.table(
counts_tab_scaled, file.path(outdir,"raw_counts.tsv"),
counts_tab_scaled, file.path(outdir, "raw_counts.tsv"),
col.names = TRUE, row.names = FALSE,
sep = "\t", quote = FALSE
)

write.table(
res_df, file.path(outdir,"norm_counts.tsv"),
res_df, file.path(outdir, "norm_counts.tsv"),
col.names = TRUE, row.names = FALSE,
sep = "\t", quote = FALSE
)

write.table(
rpkm_df, file.path(outdir,"rpkm_counts.tsv"),
rpkm_df, file.path(outdir, "rpkm_counts.tsv"),
col.names = TRUE, row.names = FALSE,
sep = "\t", quote = FALSE
)

0 comments on commit cedba60

Please sign in to comment.