From ba487cc8671b24460ccc15b67fb8815d23f2ecc7 Mon Sep 17 00:00:00 2001 From: salsalsal97 Date: Fri, 18 Oct 2024 00:07:25 +0100 Subject: [PATCH] Added new functions --- DESCRIPTION | 3 +- NAMESPACE | 8 +- R/compute_downsampled_corr.r | 84 +++++++++ R/downsampling_DEanalysis.r | 2 +- R/downsampling_corrplots.r | 293 ++++++++++++++++++++++++++++++++ R/power_analysis.r | 142 ++++++++++++++++ R/power_plots.r | 3 +- R/preliminary_plots.r | 115 +++++++++++++ man/compute_downsampled_corr.Rd | 23 +++ man/downsampling_DEanalysis.Rd | 6 +- man/downsampling_corrplots.Rd | 84 +++++++++ man/power_analysis.Rd | 99 +++++++++++ man/power_plots.Rd | 6 +- man/preliminary_plots.Rd | 79 +++++++++ vignettes/getting_started.Rmd | 33 ++++ vignettes/poweranalysis.Rmd | 50 +++++- 16 files changed, 1017 insertions(+), 13 deletions(-) create mode 100644 R/compute_downsampled_corr.r create mode 100644 R/downsampling_corrplots.r create mode 100644 R/power_analysis.r create mode 100644 R/preliminary_plots.r create mode 100644 man/compute_downsampled_corr.Rd create mode 100644 man/downsampling_corrplots.Rd create mode 100644 man/power_analysis.Rd create mode 100644 man/preliminary_plots.Rd create mode 100644 vignettes/getting_started.Rmd diff --git a/DESCRIPTION b/DESCRIPTION index f8602a8..00dfa00 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -38,7 +38,8 @@ Imports: utils, gtools, stringr, - gridExtra + gridExtra, + ggcorrplot Suggests: knitr, rmarkdown, diff --git a/NAMESPACE b/NAMESPACE index 21361b0..a42fd6c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -19,12 +19,14 @@ importFrom(edgeR,calcNormFactors) importFrom(edgeR,estimateDisp) importFrom(edgeR,glmFit) importFrom(edgeR,glmLRT) +importFrom(ggcorrplot,ggcorrplot) importFrom(ggplot2,aes) importFrom(ggplot2,element_blank) importFrom(ggplot2,element_text) importFrom(ggplot2,facet_wrap) importFrom(ggplot2,geom_bar) importFrom(ggplot2,geom_boxplot) +importFrom(ggplot2,geom_histogram) importFrom(ggplot2,geom_hline) importFrom(ggplot2,geom_jitter) importFrom(ggplot2,geom_point) @@ -33,6 +35,7 @@ importFrom(ggplot2,ggsave) importFrom(ggplot2,ggtitle) importFrom(ggplot2,labs) importFrom(ggplot2,scale_colour_manual) +importFrom(ggplot2,scale_fill_gradient2) importFrom(ggplot2,scale_shape_manual) importFrom(ggplot2,stat_summary) importFrom(ggplot2,theme) @@ -45,11 +48,14 @@ importFrom(qs,qread) importFrom(reshape2,melt) importFrom(scales,percent) importFrom(stats,as.formula) +importFrom(stats,complete.cases) +importFrom(stats,cor) importFrom(stats,median) importFrom(stats,model.matrix) importFrom(stats,p.adjust) importFrom(stats,quantile) +importFrom(stats,reshape) importFrom(stringr,str_sub) importFrom(utils,globalVariables) importFrom(viridis,scale_colour_viridis) -importFrom(viridis,scale_fill_viridis) \ No newline at end of file +importFrom(viridis,scale_fill_viridis) diff --git a/R/compute_downsampled_corr.r b/R/compute_downsampled_corr.r new file mode 100644 index 0000000..b31582d --- /dev/null +++ b/R/compute_downsampled_corr.r @@ -0,0 +1,84 @@ +# Define global variables +utils::globalVariables(c(".","dataset")) + +#' For a given down-sampled DE output, computes the correlation of the log-foldchange of the DEGs (at specified p-value) for a given dataset (celltype) + +#' @importFrom ggcorrplot ggcorrplot +#' @importFrom data.table rbindlist setkey +#' @importFrom stats reshape complete.cases cor +#' @importFrom ggplot2 theme element_text scale_fill_gradient2 labs + +#' @param allstudies - a list containing DGE analysis outputs (subset to "celltype_all_genes" and the specified cell type) at each down-sampled point +#' @param sampled - downsampling carried out based on what (either "individuals" or "cells") + +#' @return correlation matrix, plot and the number of DEGs at the specified p-value + +compute_downsampled_corr <- function(allstudies, + sampled="individuals"){ + + # check input parameters are fine + if(class(allstudies)!="list"){ + stop("Error: allstudies should be a list.") + } + + # variable to redefine allstudies + allstudies_new <- list() + j <- 1 + # select data for each down-sampling point + for(sampling_point in names(allstudies)){ + # redefine allstudies so each element only contains data for that down-sampled point + allstudies_new[[j]] <- allstudies[[sampling_point]] + names(allstudies_new)[[j]] <- sampling_point + j <- j+1 + } + # reshape data so "dataset" is now a variable + allstudies_dt <- rbindlist(allstudies_new,idcol="dataset") + #filter dataset + setkey(allstudies_dt,name) + + # make matrix for corr() - cols will be [datset, name, logFC] + mat_lfc <- allstudies_dt[,.(dataset,name,logFC)] + # reshape so cols now are [(gene) name, logFC.dataset1, logFC.dataset2,...,logFC.datasetN] (N being length(names(allstudies))) + mat_lfc <- + reshape(mat_lfc, idvar = "name", timevar = "dataset", + direction = "wide") + # remove logFC. from name (now cols are just [name, dataset1, dataset2,...,datasetN] with logFC values in each column) + colnames(mat_lfc)<- + c(colnames(mat_lfc)[1], + substr(colnames(mat_lfc[,2:ncol(mat_lfc)]), + 7,nchar(colnames(mat_lfc[,2:ncol(mat_lfc)])))) + # remove NA rows + mat_lfc <- mat_lfc[complete.cases(mat_lfc), ] + + # get correlation matrix + corr_lfc <- cor(mat_lfc[,2:ncol(mat_lfc)],method = "spearman") + + # plot correlation matrix + if(sampled=="individuals"){ + corr_plot.plot <- ggcorrplot(round(corr_lfc,2), + hc.order = F, insig="pch",pch=5,pch.col = "grey", + pch.cex=9, + title=paste0("LogFC Correlation Matrix when downsampling individuals"), + colors = c("#FC4E07", "white", "#00AFBB"), + outline.color = "white", lab = TRUE, lab_size=3.5, + sig.level=0.05) + + theme(plot.title = element_text(hjust = 0.7)) + + scale_fill_gradient2(breaks=c(0, 1), limit=c(0, 1)) + + labs(fill="Corr") + }else{ + corr_plot.plot <- ggcorrplot(round(corr_lfc,2), + hc.order = F, insig="pch",pch=5,pch.col = "grey", + pch.cex=9, + title=paste0("LogFC Correlation Matrix when downsampling cells"), + colors = c("#FC4E07", "white", "#00AFBB"), + outline.color = "white", lab = TRUE, lab_size=3.5, + sig.level=0.05) + + theme(plot.title = element_text(hjust = 0.7)) + + scale_fill_gradient2(breaks=c(0, 1), limit=c(0, 1)) + + labs(fill="Corr") + } + + # output matrix, plot + return(list(corr_lfc, corr_plot.plot)) + +} \ No newline at end of file diff --git a/R/downsampling_DEanalysis.r b/R/downsampling_DEanalysis.r index 14dc169..0068904 100644 --- a/R/downsampling_DEanalysis.r +++ b/R/downsampling_DEanalysis.r @@ -23,7 +23,7 @@ utils::globalVariables(c("PValue","name")) #' @param pval_adjust_method the adjustment method for the p-value in the differential expression analysis. Default is benjamini hochberg "BH". See stats::p.adjust for available options #' @param rmv_zero_count_genes whether genes with no count values in any cell should be removed. Default is TRUE -#' @return saves all DE outputs for downsampled files as well as a summary table of results showing number of true DEGs detected at each number of samples/cells +#' Saves all DE outputs for downsampled files as well as a summary table of results showing number of true DEGs detected at each number of samples/cells downsampling_DEanalysis <- function(data, range_downsampled="placeholder", diff --git a/R/downsampling_corrplots.r b/R/downsampling_corrplots.r new file mode 100644 index 0000000..9a81d91 --- /dev/null +++ b/R/downsampling_corrplots.r @@ -0,0 +1,293 @@ +#' Create correlation plots of the effect sizes of the top 1000 and top 500 DEGs + +#' @importFrom gtools mixedsort +#' @importFrom stringr str_sub +#' @importFrom ggplot2 labs theme ggsave element_text scale_fill_gradient2 +#' @importFrom ggcorrplot ggcorrplot + +#' @param data - the input data (should be an SCE object) +#' @param range_downsampled - range of values to be downsampled for, in ascending order +#' @param output_path - base path in which outputs will be stored +#' @param inpath - base path where downsampled DE output folders are stored (taken to be output_path if not provided) +#' @param sampled - downsampling carried out based on what (either "individuals" or "cells") +#' @param sampleID - sample ID +#' @param celltypeID - cell type ID +#' @param coeff - which coefficient to carry out DE analysis with respect to +#' @param Nperms - number of subsets created when downsampling at each level +#' @param y - the column name in the SCE object for the return variable e.g. "diagnosis" - Case or disease. Default is the last variable in the design formula. y can be discrete (logistic regression) or continuous (linear regression) +#' @param region - the column name in the SCE object for the study region. If there are multiple regions in the study (for example two brain regions). Pseudobulk values can be derived separately. Default is "single_region" which will not split by region. +#' @param control - character specifying which control level for the differential expression analysis e.g. in a case/control/other study use "control" in the y column to compare against. NOTE only need to specify if more than two groups in y, leave as default value for two groups or continuous y. Default is NULL. +#' @param pval_adjust_method - the adjustment method for the p-value in the differential expression analysis. Default is benjamini hochberg "BH". See stats::p.adjust for available options +#' @param rmv_zero_count_genes - whether genes with no count values in any cell should be removed. Default is TRUE + +#' Saves all plots in the appropriate directory + +downsampling_corrplots <- function(data, + range_downsampled="placeholder", + output_path=getwd(), + inpath="placeholder", + sampled="individuals", + sampleID="donor_id", + celltypeID="cell_type", + coeff="male", + Nperms=20, + y=NULL, + region="single_region", + control=NULL, + pval_adjust_method="BH", + rmv_zero_count_genes=TRUE){ + + # alter range_downsampled + if(range_downsampled=="placeholder"){ + range_downsampled <- downsampling_range(data, sampled, sampleID) + } + # alter inpath + if(inpath=="placeholder"){ + inpath <- output_path + } + + # create output path if doesn't already exist + setwd(output_path) + dir.create(output_path,showWarnings=FALSE) + # create directory for correlation analysis + dir.create(paste0(output_path, "/corr_analysis/"), showWarnings=FALSE) + # validate function input params + validate_input_parameters_power(data=data, range_downsampled=range_downsampled, output_path=output_path, + inpath=inpath, sampled=sampled, sampleID=sampleID, + celltypeID=celltypeID, coeff=coeff, y=y, + region=region, control=control, pval_adjust_method=pval_adjust_method, + rmv_zero_count_genes=rmv_zero_count_genes) + + # get celltype name from dataset + celltype_name <- toString(unique(data[[celltypeID]])) + # check if DE analysis output present already in output_path + if(!"DEout.RData" %in% list.files(output_path)){ + # run and save DE analysis + assign("DEout", sc_cell_type_de(data, design=design, pseudobulk_ID=sampleID, celltype_ID=celltypeID, y=y, region=region, control=control, pval_adjust_method=pval_adjust_method, rmv_zero_count_genes=rmv_zero_count_genes, verbose=T, coef=coeff)) + save(DEout,file=paste0(output_path,"/DEout.RData")) + }else{ + load(paste0(output_path,"/DEout.RData")) + } + + ## correlation using down-sampled datasets - top 1000 genes + allgenes <- DEout$celltype_all_genes[[celltype_name]] + # rank DEGs by significance + allgenes_ordered <- allgenes[order(allgenes$PValue),] + # get top 1000 most signficant DEGs from full dataset + top1000_full <- allgenes_ordered[1:1000,] + top1000 <- data.frame(top1000_full$name) + colnames(top1000) <- c("name") + top1000$`logFC` <- top1000_full$logFC + ## correlation using down-sampled datasets - top 500 genes + # get top 500 most signficant DEGs from full dataset + top500_full <- allgenes_ordered[1:500,] + top500 <- data.frame(top500_full$name) + colnames(top500) <- c("name") + top500$`logFC` <- top500_full$logFC + + ## if down-sampled individuals + if(sampled=="individuals"){ + + # define path and folders + path <- paste0(inpath,"/DE_downsampling/") + downsampled_folders <- paste0(paste(range_downsampled, sep=" ", collapse=NULL),"samples") + # get corr matrices for each permutation + corrMats_1000 <- list() + for(i in 1:Nperms){ + setwd(path) + for(folder in mixedsort(list.files())){ + # remove other files above + if(folder%in%downsampled_folders){ + newpath <- paste0(path,folder) + # enter directory + setwd(newpath) + # get num_samples + num_samples <- str_sub(folder,1,-8) + # go into permutation + subpath <- paste0(newpath,"/",paste0(num_samples,"_",i)) + setwd(subpath) + # read DE output + load(paste0("DEout",num_samples,"_",i,".RData")) + # get df for top 1000 genes + all_genes <- get(paste0("DEout_",num_samples))$celltype_all_genes[[celltype_name]] + assign(paste0("samples_",num_samples),subset(all_genes,name%in%top1000$name)) + # move out again + setwd(newpath) + } + } + allstudies <- mget(paste0("samples_", paste(range_downsampled, sep=" ", collapse=NULL))) + names(allstudies) <- paste0(paste(range_downsampled, sep=" ", collapse=NULL),"_samples") + # compute correlation for this permutation + corrMats_1000[[i]] <- compute_downsampled_corr(allstudies)[[1]] + } + # compute mean of correlation matrices + meanCorr_1000 <- Reduce("+",corrMats_1000) / length(corrMats_1000) + # plot + setwd(paste0(output_path, "/corr_analysis/")) + # plot correlation matrix + meanCorr_downsampling_1000.plot <- ggcorrplot(round(meanCorr_1000,2), + hc.order = F, insig="pch",pch=5,pch.col = "grey", + pch.cex=9, + title=paste0("Mean LogFC Correlation Matrix (across permutations) for top 1000 genes when downsampling"), + colors = c("#FC4E07", "white", "#00AFBB"), + outline.color = "white", lab = TRUE, lab_size=3.5, + sig.level=0.05) + + theme(plot.title = element_text(hjust = 0.7)) + + scale_fill_gradient2(breaks=c(0, 1), limit=c(0, 1)) + + labs(fill="Corr") + ggsave("meanCorr_downsampling_1000.png",meanCorr_downsampling_1000.plot,width=25,height=20,units="cm",bg="white") + ggsave("meanCorr_downsampling_1000.pdf",meanCorr_downsampling_1000.plot,width=25,height=20,units="cm",bg="white") + + # get corr matrices for each permutation + corrMats_500 <- list() + for(i in 1:Nperms){ + setwd(path) + for(folder in mixedsort(list.files())){ + # remove other files above + if(folder%in%downsampled_folders){ + newpath <- paste0(path,folder) + # enter directory + setwd(newpath) + # get num_samples + num_samples <- str_sub(folder,1,-8) + # go into permutation + subpath <- paste0(newpath,"/",paste0(num_samples,"_",i)) + setwd(subpath) + # read DE output + load(paste0("DEout",num_samples,"_",i,".RData")) + # get df for top 500 genes + all_genes <- get(paste0("DEout_",num_samples))$celltype_all_genes[[celltype_name]] + assign(paste0("samples_",num_samples),subset(all_genes,name%in%top500$name)) + # move out again + setwd(newpath) + } + } + allstudies <- mget(paste0("samples_", paste(range_downsampled, sep=" ", collapse=NULL))) + names(allstudies) <- paste0(paste(range_downsampled, sep=" ", collapse=NULL),"_samples") + # compute correlation for this permutation + corrMats_500[[i]] <- compute_downsampled_corr(allstudies)[[1]] + } + # compute mean of correlation matrices + meanCorr_500 <- Reduce("+",corrMats_500) / length(corrMats_500) + # plot + setwd(paste0(output_path, "/corr_analysis/")) + # plot correlation matrix + meanCorr_downsampling_500.plot <- ggcorrplot(round(meanCorr_500,2), + hc.order = F, insig="pch",pch=5,pch.col = "grey", + pch.cex=9, + title=paste0("Mean LogFC Correlation Matrix (across permutations) for top 500 genes when downsampling"), + colors = c("#FC4E07", "white", "#00AFBB"), + outline.color = "white", lab = TRUE, lab_size=3.5, + sig.level=0.05) + + theme(plot.title = element_text(hjust = 0.7)) + + scale_fill_gradient2(breaks=c(0, 1), limit=c(0, 1)) + + labs(fill="Corr") + ggsave("meanCorr_downsampling_500.png",meanCorr_downsampling_500.plot,width=25,height=20,units="cm",bg="white") + ggsave("meanCorr_downsampling_500.pdf",meanCorr_downsampling_500.plot,width=25,height=20,units="cm",bg="white") + + }else{ + + ## if down-sampled cells + # define path and folders + path <- paste0(inpath,"/DE_downsampling_cells/") + downsampled_folders <- paste0(paste(range_downsampled, sep=" ", collapse=NULL),"cells_persample") + # get corr matrices for each permutation + corrMats_cells_1000 <- list() + for(i in 1:Nperms){ + setwd(path) + for(folder in mixedsort(list.files())){ + # remove other files above + if(folder%in%downsampled_folders){ + newpath <- paste0(path,folder) + # enter directory + setwd(newpath) + # get num_cells + num_cells <- str_sub(folder,1,-16) + # go into permutation + subpath <- paste0(newpath,"/",paste0(num_cells,"_",i)) + setwd(subpath) + # read DE output + load(paste0("DEout",num_cells,"_",i,".RData")) + # get df for top 1000 genes + all_genes <- get(paste0("DEout_",num_cells))$celltype_all_genes[[celltype_name]] + assign(paste0("cells_",num_cells),subset(all_genes,name%in%top1000$name)) + # move out again + setwd(newpath) + } + } + allstudies <- mget(paste0("cells_", paste(range_downsampled, sep=" ", collapse=NULL))) + names(allstudies) <- paste0(paste(range_downsampled, sep=" ", collapse=NULL),"_cells/sample") + # compute correlation for this permutation + corrMats_cells_1000[[i]] <- compute_downsampled_corr(allstudies,"cells")[[1]] + } + # compute mean of correlation matrices + meanCorr_cells_1000 <- Reduce("+",corrMats_cells_1000) / length(corrMats_cells_1000) + # plot + setwd(paste0(output_path, "/corr_analysis/")) + # plot correlation matrix + meanCorr_downsampling_cells_1000.plot <- ggcorrplot(round(meanCorr_cells_1000,2), + hc.order = F, insig="pch",pch=5,pch.col = "grey", + pch.cex=9, + title=paste0("Mean LogFC Correlation Matrix (across permutations) for top 1000 genes when downsampling cells"), + colors = c("#FC4E07", "white", "#00AFBB"), + outline.color = "white", lab = TRUE, lab_size=3.5, + sig.level=0.05) + + theme(plot.title = element_text(hjust = 0.7)) + + scale_fill_gradient2(breaks=c(0, 1), limit=c(0, 1)) + + labs(fill="Corr") + ggsave("meanCorr_downsampling_cells_1000.png",meanCorr_downsampling_cells_1000.plot,width=25,height=20,units="cm",bg="white") + ggsave("meanCorr_downsampling_cells_1000.pdf",meanCorr_downsampling_cells_1000.plot,width=25,height=20,units="cm",bg="white") + + ## correlation using down-sampled datasets (cells) - top 500 genes + # load in DE outputs for down-sampled datasets, take union of DEGs across perms + setwd(path) + # get corr matrices for each permutation + corrMats_cells_500 <- list() + for(i in 1:Nperms){ + setwd(path) + for(folder in mixedsort(list.files())){ + # remove other files above + if(folder%in%downsampled_folders){ + newpath <- paste0(path,folder) + # enter directory + setwd(newpath) + # get num_cells + num_cells <- str_sub(folder,1,-16) + # go into permutation + subpath <- paste0(newpath,"/",paste0(num_cells,"_",i)) + setwd(subpath) + # read DE output + load(paste0("DEout",num_cells,"_",i,".RData")) + # get df for top 500 genes + all_genes <- get(paste0("DEout_",num_cells))$celltype_all_genes[[celltype_name]] + assign(paste0("cells_",num_cells),subset(all_genes,name%in%top500$name)) + # move out again + setwd(newpath) + } + } + allstudies <- mget(paste0("cells_", paste(range_downsampled, sep=" ", collapse=NULL))) + names(allstudies) <- paste0(paste(range_downsampled, sep=" ", collapse=NULL),"_cells/sample") + # compute correlation for this permutation + corrMats_cells_500[[i]] <- compute_downsampled_corr(allstudies,"cells")[[1]] + } + # compute mean of correlation matrices + meanCorr_cells_500 <- Reduce("+",corrMats_cells_500) / length(corrMats_cells_500) + # plot + setwd(paste0(output_path, "/corr_analysis/")) + # plot correlation matrix + meanCorr_downsampling_cells_500.plot <- ggcorrplot(round(meanCorr_cells_500,2), + hc.order = F, insig="pch",pch=5,pch.col = "grey", + pch.cex=9, + title=paste0("Mean LogFC Correlation Matrix (across permutations) for top 500 genes when downsampling cells"), + colors = c("#FC4E07", "white", "#00AFBB"), + outline.color = "white", lab = TRUE, lab_size=3.5, + sig.level=0.05) + + theme(plot.title = element_text(hjust = 0.7)) + + scale_fill_gradient2(breaks=c(0, 1), limit=c(0, 1)) + + labs(fill="Corr") + ggsave("meanCorr_downsampling_cells_500.png",meanCorr_downsampling_cells_500.plot,width=25,height=20,units="cm",bg="white") + ggsave("meanCorr_downsampling_cells_500.pdf",meanCorr_downsampling_cells_500.plot,width=25,height=20,units="cm",bg="white") + + } + +} \ No newline at end of file diff --git a/R/power_analysis.r b/R/power_analysis.r new file mode 100644 index 0000000..fa2b411 --- /dev/null +++ b/R/power_analysis.r @@ -0,0 +1,142 @@ +# Define global variables +utils::globalVariables(c("dataset")) + +#' Runs entire power analysis + +#' @importFrom stats as.formula + +#' @param data - the input data (should be an SCE object) +#' @param range_downsampled_individuals - range of values to downsample individuals for +#' @param range_downsampled_cells - range of values to downsample cells for +#' @param output_path - base path in which outputs will be stored +#' @param sampleID - sample ID +#' @param design - the design formula of class type `formula`. Equation used to fit the model- data for the generalised linear model e.g. expression ~ sex + pmi + disease +#' @param sexID - sex ID +#' @param celltypeID - cell type ID +#' @param coeff - which coefficient to carry out DE analysis with respect to +#' @param fdr - the cut-off False Discovery Rate below which to select DEGs +#' @param nom_pval - the cut-off nominal P-value below which to select DEGs (as an alternative to FDR) +#' @param Nperms - number of subsets created when downsampling at each level +#' @param y - the column name in the SCE object for the return variable e.g. "diagnosis" - Case or disease. Default is the last variable in the design formula. y can be discrete (logistic regression) or continuous (linear regression) +#' @param region - the column name in the SCE object for the study region. If there are multiple regions in the study (for example two brain regions). Pseudobulk values can be derived separately. Default is "single_region" which will not split by region. +#' @param control - character specifying which control level for the differential expression analysis e.g. in a case/control/other study use "control" in the y column to compare against. NOTE only need to specify if more than two groups in y, leave as default value for two groups or continuous y. Default is NULL. +#' @param pval_adjust_method - the adjustment method for the p-value in the differential expression analysis. Default is benjamini hochberg "BH". See stats::p.adjust for available options +#' @param rmv_zero_count_genes - whether genes with no count values in any cell should be removed. Default is TRUE + +#' Saves all plots and DE outputs in the appropriate directories + +power_analysis <- function(data, + range_downsampled_individuals="placeholder", + range_downsampled_cells="placeholder", + output_path=getwd(), + sampleID="donor_id", + design="placeholder", + sexID="sex", + celltypeID="cell_type", + coeff="male", + fdr=0.05, + nom_pval=0.05, + Nperms=20, + y=NULL, + region="single_region", + control=NULL, + pval_adjust_method="BH", + rmv_zero_count_genes=TRUE){ + + setwd(output_path) + + # alter range_downsampled_individuals + if(range_downsampled_individuals=="placeholder"){ + range_downsampled_individuals <- downsampling_range(data, "individuals", sampleID) + } + # alter range_downsampled_cells + if(range_downsampled_cells=="placeholder"){ + range_downsampled_cells <- downsampling_range(data, "cells", sampleID) + } + # alter design + if(design=="placeholder"){ + design=as.formula(paste0("~",sexID)) + } + + # create preliminary plots + preliminary_plots(data=data, + output_path=output_path, + sampleID=sampleID, + design=design, + sexID=sexID, + celltypeID=celltypeID, + coeff=coeff, + fdr=fdr) + + ## create power plots for down-sampling individuals, cells + # down-sample individuals and run DE analysis + downsampling_DEanalysis(data=data, + range_downsampled=range_downsampled_individuals, + output_path=output_path, + sampled="individuals", + sampleID=sampleID, + design=design, + sexID=sexID, + celltypeID=celltypeID, + y=y, + region=region, + control=control, + pval_adjust_method=pval_adjust_method, + rmv_zero_count_genes=rmv_zero_count_genes, + coeff=coeff, + fdr=fdr, + nom_pval=nom_pval, + Nperms=Nperms) + # create power plots + power_plots(data=data, + range_downsampled=range_downsampled_individuals, + output_path=output_path, + sampled="individuals", + sampleID=sampleID, + celltypeID=celltypeID, + fdr=fdr, + nom_pval=nom_pval, + Nperms=Nperms) + # down-sample cells and run DE analysis + downsampling_DEanalysis(data=data, + range_downsampled=range_downsampled_cells, + output_path=output_path, + sampled="cells", + sampleID=sampleID, + design=design, + sexID=sexID, + celltypeID=celltypeID, + y=y, + region=region, + control=control, + pval_adjust_method=pval_adjust_method, + rmv_zero_count_genes=rmv_zero_count_genes, + coeff=coeff, + fdr=fdr, + nom_pval=nom_pval, + Nperms=Nperms) + # create power plots + power_plots(data=data, + range_downsampled=range_downsampled_cells, + output_path=output_path, + sampled="cells", + sampleID=sampleID, + celltypeID=celltypeID, + fdr=fdr, + nom_pval=nom_pval, + Nperms=Nperms) + # create down-sampling correlation plots (individuals) + downsampling_corrplots(data=data, + range_downsampled=range_downsampled_individuals, + output_path=output_path, + sampled="individuals", + celltypeID=celltypeID, + Nperms=Nperms) + # create down-sampling correlation plots (cells) + downsampling_corrplots(data=data, + range_downsampled=range_downsampled_cells, + output_path=output_path, + sampled="cells", + celltypeID=celltypeID, + Nperms=Nperms) +} \ No newline at end of file diff --git a/R/power_plots.r b/R/power_plots.r index 3284ae1..eefc2a2 100644 --- a/R/power_plots.r +++ b/R/power_plots.r @@ -29,7 +29,7 @@ utils::globalVariables(c("design","PValue","logFC","name","variable")) #' @param pval_adjust_method the adjustment method for the p-value in the differential expression analysis. Default is benjamini hochberg "BH". See stats::p.adjust for available options #' @param rmv_zero_count_genes whether genes with no count values in any cell should be removed. Default is TRUE -#' @return saves all plots in the appropriate directory +#' Saves all plots in the appropriate directory power_plots <- function(data, range_downsampled="placeholder", @@ -58,6 +58,7 @@ power_plots <- function(data, } # create output path if doesn't already exist + setwd(output_path) dir.create(output_path,showWarnings=FALSE) # validate function input params validate_input_parameters_power(data=data, range_downsampled=range_downsampled, output_path=output_path, diff --git a/R/preliminary_plots.r b/R/preliminary_plots.r new file mode 100644 index 0000000..7b1b2db --- /dev/null +++ b/R/preliminary_plots.r @@ -0,0 +1,115 @@ +# Define global variables +utils::globalVariables(c("..density..")) + +#' Create preliminary plots for data exploration + +#' @importFrom stats as.formula +#' @importFrom ggplot2 ggplot labs theme ggsave aes element_text element_blank geom_histogram +#' @importFrom cowplot theme_cowplot +#' @importFrom gridExtra arrangeGrob +#' @importFrom SingleCellExperiment colData + +#' @param data - the input data (should be an SCE object) +#' @param output_path - base path in which outputs will be stored +#' @param sampleID - sample ID +#' @param design - the design formula of class type `formula`. Equation used to fit the model- data for the generalised linear model e.g. expression ~ sex + pmi + disease +#' @param sexID - sex ID +#' @param celltypeID - cell type ID +#' @param coeff - which coefficient to carry out DE analysis with respect to +#' @param fdr - the cut-off False Discovery Rate below which to select DEGs +#' @param y - the column name in the SCE object for the return variable e.g. "diagnosis" - Case or disease. Default is the last variable in the design formula. y can be discrete (logistic regression) or continuous (linear regression) +#' @param region - the column name in the SCE object for the study region. If there are multiple regions in the study (for example two brain regions). Pseudobulk values can be derived separately. Default is "single_region" which will not split by region. +#' @param control - character specifying which control level for the differential expression analysis e.g. in a case/control/other study use "control" in the y column to compare against. NOTE only need to specify if more than two groups in y, leave as default value for two groups or continuous y. Default is NULL. +#' @param pval_adjust_method - the adjustment method for the p-value in the differential expression analysis. Default is benjamini hochberg "BH". See stats::p.adjust for available options +#' @param rmv_zero_count_genes - whether genes with no count values in any cell should be removed. Default is TRUE + +#' Saves all plots in the appropriate directory + +preliminary_plots <- function(data, + output_path=getwd(), + sampleID="donor_id", + design="placeholder", + sexID="sex", + celltypeID="cell_type", + coeff="male", + fdr=0.05, + y=NULL, + region="single_region", + control=NULL, + pval_adjust_method="BH", + rmv_zero_count_genes=TRUE){ + + # create output path if doesn't already exist + setwd(output_path) + dir.create(output_path,showWarnings=FALSE) + # alter design + if(design=="placeholder"){ + design=as.formula(paste0("~",sexID)) + } + # validate function input params + validate_input_parameters_power(data=data, output_path=output_path, sampleID=sampleID, + design=design, sexID=sexID, celltypeID=celltypeID, + coeff=coeff, fdr=fdr, y=y, + region=region, control=control, pval_adjust_method=pval_adjust_method, + rmv_zero_count_genes=rmv_zero_count_genes) + + # get celltype name from dataset + celltype_name <- toString(unique(data[[celltypeID]])) + # check if DE analysis output present already in output_path + if(!"DEout.RData" %in% list.files(output_path)){ + # run and save DE analysis + assign("DEout", sc_cell_type_de(data, design=design, pseudobulk_ID=sampleID, celltype_ID=celltypeID, y=y, region=region, control=control, pval_adjust_method=pval_adjust_method, rmv_zero_count_genes=rmv_zero_count_genes, verbose=T, coef=coeff)) + save(DEout,file=paste0(output_path,"/DEout.RData")) + }else{ + load(paste0(output_path,"/DEout.RData")) + } + + ## plot histogram of effect sizes in DEGs and all genes + # for DEGs (at fdr cut-off) + DEGs_full <- subset(DEout$celltype_all_genes[[celltype_name]], adj_pval + %\VignetteIndexEntry{getting_started} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` +*** +# Introduction + +The *poweranalysis* R package is designed to run robust power analysis for differential gene expression in scRNA-seq studies, addressing the challenge of low statistical power due to small sample sizes. It provides tools to estimate the optimal number of samples and cells needed to achieve reliable power levels. +
+
+Using *poweranalysis* involves four steps:
+**1. Differential Expression (DE) Analysis:** Import your SCE object and perform DE analysis using a pseudobulking approach, allowing for the robust identification of differentially expressed genes (DEGs) across conditions or groups.
+**2. Correlation Analysis:** Assess the consistency of DEG effect sizes across random permutations, subsets of your data, and different studies by computing correlation matrices.
+**3. Downsampling and Power Analysis:** Test the robustness of your findings by downsampling both individuals and cells, then analyze the effect on DEG detection power.
+**4. Visualisation:** Generate comprehensive power plots and visual summaries of the power analysis results, including DEG detection rates, false discovery rates, and mean LogFC correlation between down-sampled subsets. +
+ +# Setup + +```{r setup} +library(poweranalysis) +``` diff --git a/vignettes/poweranalysis.Rmd b/vignettes/poweranalysis.Rmd index a498173..353272d 100644 --- a/vignettes/poweranalysis.Rmd +++ b/vignettes/poweranalysis.Rmd @@ -1,5 +1,5 @@ --- -title: "poweranalysis" +title: "Power Analysis" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{poweranalysis} @@ -13,7 +13,55 @@ knitr::opts_chunk$set( comment = "#>" ) ``` +*** +### Authors: *Salman Fawad*, *Yunning Yuan*, *Nathan Skene*
+### README updated: *Aug-09-2024*
+
+# Introduction + +The *poweranalysis* R package is designed to run robust power analysis for differential gene expression in scRNA-seq studies, addressing the challenge of low statistical power due to small sample sizes. It provides tools to estimate the optimal number of samples and cells needed to achieve reliable power levels. +
+
+Using *poweranalysis* involves four steps:
+**1. Differential Expression (DE) Analysis:** Import your SCE object and perform DE analysis using a pseudobulking approach, allowing for the robust identification of differentially expressed genes (DEGs) across conditions or groups.
+**2. Correlation Analysis:** Assess the consistency of DEG effect sizes across random permutations, subsets of your data, and different studies by computing correlation matrices.
+**3. Downsampling and Power Analysis:** Test the robustness of your findings by downsampling both individuals and cells, then analyze the effect on DEG detection power.
+**4. Visualisation:** Generate comprehensive power plots and visual summaries of the power analysis results, including DEG detection rates, false discovery rates, and mean LogFC correlation between down-sampled subsets. + + +# Installation +
+ +# Setup ```{r setup} library(poweranalysis) ``` +
+ +# Documentation +## [Getting Started](http://127.0.0.1:18836/library/poweranalysis/doc/getting_started.html) +
+ +# Troubleshooting +If you have any problems, please do submit an [Issue here on GitHub](https://github.com/neurogenomics/Power_Analysis_package) with a reproducible example. +
+ +# Citation + +
+*** +
+ +# Contact +## [Neurogenomis Lab](https://www.neurogenomics.co.uk/) +UK Dementia Research Institute
+Department of Brain Sciences
+Faculty of Medicine
+Imperial College London
+[GitHub](https://github.com/neurogenomics) +[DockerHub](https://hub.docker.com/orgs/neurogenomicslab) +
+ +# References +