From ba487cc8671b24460ccc15b67fb8815d23f2ecc7 Mon Sep 17 00:00:00 2001
From: salsalsal97 <salman.fawad@hotmail.com>
Date: Fri, 18 Oct 2024 00:07:25 +0100
Subject: [PATCH] Added new functions

---
 DESCRIPTION                     |   3 +-
 NAMESPACE                       |   8 +-
 R/compute_downsampled_corr.r    |  84 +++++++++
 R/downsampling_DEanalysis.r     |   2 +-
 R/downsampling_corrplots.r      | 293 ++++++++++++++++++++++++++++++++
 R/power_analysis.r              | 142 ++++++++++++++++
 R/power_plots.r                 |   3 +-
 R/preliminary_plots.r           | 115 +++++++++++++
 man/compute_downsampled_corr.Rd |  23 +++
 man/downsampling_DEanalysis.Rd  |   6 +-
 man/downsampling_corrplots.Rd   |  84 +++++++++
 man/power_analysis.Rd           |  99 +++++++++++
 man/power_plots.Rd              |   6 +-
 man/preliminary_plots.Rd        |  79 +++++++++
 vignettes/getting_started.Rmd   |  33 ++++
 vignettes/poweranalysis.Rmd     |  50 +++++-
 16 files changed, 1017 insertions(+), 13 deletions(-)
 create mode 100644 R/compute_downsampled_corr.r
 create mode 100644 R/downsampling_corrplots.r
 create mode 100644 R/power_analysis.r
 create mode 100644 R/preliminary_plots.r
 create mode 100644 man/compute_downsampled_corr.Rd
 create mode 100644 man/downsampling_corrplots.Rd
 create mode 100644 man/power_analysis.Rd
 create mode 100644 man/preliminary_plots.Rd
 create mode 100644 vignettes/getting_started.Rmd

diff --git a/DESCRIPTION b/DESCRIPTION
index f8602a8..00dfa00 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -38,7 +38,8 @@ Imports:
     utils,
     gtools,
     stringr,
-    gridExtra
+    gridExtra,
+    ggcorrplot
 Suggests: 
     knitr,
     rmarkdown,
diff --git a/NAMESPACE b/NAMESPACE
index 21361b0..a42fd6c 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -19,12 +19,14 @@ importFrom(edgeR,calcNormFactors)
 importFrom(edgeR,estimateDisp)
 importFrom(edgeR,glmFit)
 importFrom(edgeR,glmLRT)
+importFrom(ggcorrplot,ggcorrplot)
 importFrom(ggplot2,aes)
 importFrom(ggplot2,element_blank)
 importFrom(ggplot2,element_text)
 importFrom(ggplot2,facet_wrap)
 importFrom(ggplot2,geom_bar)
 importFrom(ggplot2,geom_boxplot)
+importFrom(ggplot2,geom_histogram)
 importFrom(ggplot2,geom_hline)
 importFrom(ggplot2,geom_jitter)
 importFrom(ggplot2,geom_point)
@@ -33,6 +35,7 @@ importFrom(ggplot2,ggsave)
 importFrom(ggplot2,ggtitle)
 importFrom(ggplot2,labs)
 importFrom(ggplot2,scale_colour_manual)
+importFrom(ggplot2,scale_fill_gradient2)
 importFrom(ggplot2,scale_shape_manual)
 importFrom(ggplot2,stat_summary)
 importFrom(ggplot2,theme)
@@ -45,11 +48,14 @@ importFrom(qs,qread)
 importFrom(reshape2,melt)
 importFrom(scales,percent)
 importFrom(stats,as.formula)
+importFrom(stats,complete.cases)
+importFrom(stats,cor)
 importFrom(stats,median)
 importFrom(stats,model.matrix)
 importFrom(stats,p.adjust)
 importFrom(stats,quantile)
+importFrom(stats,reshape)
 importFrom(stringr,str_sub)
 importFrom(utils,globalVariables)
 importFrom(viridis,scale_colour_viridis)
-importFrom(viridis,scale_fill_viridis)
\ No newline at end of file
+importFrom(viridis,scale_fill_viridis)
diff --git a/R/compute_downsampled_corr.r b/R/compute_downsampled_corr.r
new file mode 100644
index 0000000..b31582d
--- /dev/null
+++ b/R/compute_downsampled_corr.r
@@ -0,0 +1,84 @@
+# Define global variables
+utils::globalVariables(c(".","dataset"))
+
+#' For a given down-sampled DE output, computes the correlation of the log-foldchange of the DEGs (at specified p-value) for a given dataset (celltype)
+
+#' @importFrom ggcorrplot ggcorrplot
+#' @importFrom data.table rbindlist setkey
+#' @importFrom stats reshape complete.cases cor
+#' @importFrom ggplot2 theme element_text scale_fill_gradient2 labs
+
+#' @param allstudies - a list containing DGE analysis outputs (subset to "celltype_all_genes" and the specified cell type) at each down-sampled point
+#' @param sampled - downsampling carried out based on what (either "individuals" or "cells")
+
+#' @return correlation matrix, plot and the number of DEGs at the specified p-value
+
+compute_downsampled_corr <- function(allstudies,
+                                     sampled="individuals"){
+
+    # check input parameters are fine
+    if(class(allstudies)!="list"){
+        stop("Error: allstudies should be a list.")
+    }
+
+    # variable to redefine allstudies
+    allstudies_new <- list()
+    j <- 1
+    # select data for each down-sampling point
+    for(sampling_point in names(allstudies)){
+        # redefine allstudies so each element only contains data for that down-sampled point
+        allstudies_new[[j]] <- allstudies[[sampling_point]]
+        names(allstudies_new)[[j]] <- sampling_point
+        j <- j+1
+    }
+    # reshape data so "dataset" is now a variable
+    allstudies_dt <- rbindlist(allstudies_new,idcol="dataset")
+    #filter dataset
+    setkey(allstudies_dt,name)
+    
+    # make matrix for corr() - cols will be [datset, name, logFC]
+    mat_lfc <- allstudies_dt[,.(dataset,name,logFC)]
+    # reshape so cols now are [(gene) name, logFC.dataset1, logFC.dataset2,...,logFC.datasetN] (N being length(names(allstudies)))
+    mat_lfc <-
+    reshape(mat_lfc, idvar = "name", timevar = "dataset", 
+            direction = "wide")
+    # remove logFC. from name (now cols are just [name, dataset1, dataset2,...,datasetN] with logFC values in each column)
+    colnames(mat_lfc)<-
+        c(colnames(mat_lfc)[1],
+            substr(colnames(mat_lfc[,2:ncol(mat_lfc)]),
+                    7,nchar(colnames(mat_lfc[,2:ncol(mat_lfc)]))))
+    # remove NA rows
+    mat_lfc <- mat_lfc[complete.cases(mat_lfc), ]
+
+    # get correlation matrix
+    corr_lfc <- cor(mat_lfc[,2:ncol(mat_lfc)],method = "spearman")
+
+    # plot correlation matrix
+    if(sampled=="individuals"){
+        corr_plot.plot <- ggcorrplot(round(corr_lfc,2), 
+                hc.order = F, insig="pch",pch=5,pch.col = "grey",
+                pch.cex=9,
+                title=paste0("LogFC Correlation Matrix when downsampling individuals"),
+                colors = c("#FC4E07", "white", "#00AFBB"),
+                outline.color = "white", lab = TRUE, lab_size=3.5,
+                sig.level=0.05) + 
+                theme(plot.title = element_text(hjust = 0.7)) + 
+                scale_fill_gradient2(breaks=c(0, 1), limit=c(0, 1)) +
+                labs(fill="Corr")
+    }else{
+        corr_plot.plot <- ggcorrplot(round(corr_lfc,2), 
+        hc.order = F, insig="pch",pch=5,pch.col = "grey",
+        pch.cex=9,
+        title=paste0("LogFC Correlation Matrix when downsampling cells"),
+        colors = c("#FC4E07", "white", "#00AFBB"),
+        outline.color = "white", lab = TRUE, lab_size=3.5,
+        sig.level=0.05) + 
+        theme(plot.title = element_text(hjust = 0.7)) +
+        scale_fill_gradient2(breaks=c(0, 1), limit=c(0, 1)) +
+        labs(fill="Corr")
+    }
+
+    # output matrix, plot
+    return(list(corr_lfc, corr_plot.plot))
+
+}
\ No newline at end of file
diff --git a/R/downsampling_DEanalysis.r b/R/downsampling_DEanalysis.r
index 14dc169..0068904 100644
--- a/R/downsampling_DEanalysis.r
+++ b/R/downsampling_DEanalysis.r
@@ -23,7 +23,7 @@ utils::globalVariables(c("PValue","name"))
 #' @param pval_adjust_method the adjustment method for the p-value in the differential expression analysis. Default is benjamini hochberg "BH". See  stats::p.adjust for available options
 #' @param rmv_zero_count_genes whether genes with no count values in any cell should be removed. Default is TRUE
 
-#' @return saves all DE outputs for downsampled files as well as a summary table of results showing number of true DEGs detected at each number of samples/cells
+#' Saves all DE outputs for downsampled files as well as a summary table of results showing number of true DEGs detected at each number of samples/cells
 
 downsampling_DEanalysis <- function(data,
                                     range_downsampled="placeholder",
diff --git a/R/downsampling_corrplots.r b/R/downsampling_corrplots.r
new file mode 100644
index 0000000..9a81d91
--- /dev/null
+++ b/R/downsampling_corrplots.r
@@ -0,0 +1,293 @@
+#' Create correlation plots of the effect sizes of the top 1000 and top 500 DEGs
+
+#' @importFrom gtools mixedsort
+#' @importFrom stringr str_sub
+#' @importFrom ggplot2 labs theme ggsave element_text scale_fill_gradient2
+#' @importFrom ggcorrplot ggcorrplot
+
+#' @param data - the input data (should be an SCE object)
+#' @param range_downsampled - range of values to be downsampled for, in ascending order
+#' @param output_path - base path in which outputs will be stored
+#' @param inpath - base path where downsampled DE output folders are stored (taken to be output_path if not provided)
+#' @param sampled - downsampling carried out based on what (either "individuals" or "cells")
+#' @param sampleID - sample ID
+#' @param celltypeID - cell type ID
+#' @param coeff - which coefficient to carry out DE analysis with respect to
+#' @param Nperms - number of subsets created when downsampling at each level
+#' @param y - the column name in the SCE object for the return variable e.g. "diagnosis" - Case or disease. Default is the last variable in the design formula. y can be discrete (logistic regression) or continuous (linear regression)
+#' @param region - the column name in the SCE object for the study region. If there are multiple regions in the study (for example two brain regions). Pseudobulk values can be derived separately. Default is "single_region" which will not split by region.
+#' @param control - character specifying which control level for the differential expression analysis e.g. in a case/control/other study use "control" in the y column to compare against. NOTE only need to specify if more than two groups in y, leave as default value for two groups or continuous y. Default is NULL.
+#' @param pval_adjust_method - the adjustment method for the p-value in the differential expression analysis. Default is benjamini hochberg "BH". See  stats::p.adjust for available options
+#' @param rmv_zero_count_genes - whether genes with no count values in any cell should be removed. Default is TRUE
+
+#' Saves all plots in the appropriate directory
+
+downsampling_corrplots <- function(data,
+                                   range_downsampled="placeholder",
+                                   output_path=getwd(),
+                                   inpath="placeholder",
+                                   sampled="individuals",
+                                   sampleID="donor_id",
+                                   celltypeID="cell_type",
+                                   coeff="male",
+                                   Nperms=20,
+                                   y=NULL,
+                                   region="single_region",
+                                   control=NULL,
+                                   pval_adjust_method="BH",
+                                   rmv_zero_count_genes=TRUE){
+    
+    # alter range_downsampled
+    if(range_downsampled=="placeholder"){
+        range_downsampled <- downsampling_range(data, sampled, sampleID)
+    }
+    # alter inpath
+    if(inpath=="placeholder"){
+        inpath <- output_path
+    }
+
+    # create output path if doesn't already exist
+    setwd(output_path)
+    dir.create(output_path,showWarnings=FALSE)
+    # create directory for correlation analysis
+    dir.create(paste0(output_path, "/corr_analysis/"), showWarnings=FALSE)
+    # validate function input params
+    validate_input_parameters_power(data=data, range_downsampled=range_downsampled, output_path=output_path,
+                                    inpath=inpath, sampled=sampled, sampleID=sampleID,
+                                    celltypeID=celltypeID, coeff=coeff, y=y,
+                                    region=region, control=control, pval_adjust_method=pval_adjust_method,
+                                    rmv_zero_count_genes=rmv_zero_count_genes)
+    
+    # get celltype name from dataset
+    celltype_name <- toString(unique(data[[celltypeID]]))
+    # check if DE analysis output present already in output_path
+    if(!"DEout.RData" %in% list.files(output_path)){
+        # run and save DE analysis
+        assign("DEout", sc_cell_type_de(data, design=design, pseudobulk_ID=sampleID, celltype_ID=celltypeID, y=y, region=region, control=control, pval_adjust_method=pval_adjust_method, rmv_zero_count_genes=rmv_zero_count_genes, verbose=T, coef=coeff))
+        save(DEout,file=paste0(output_path,"/DEout.RData"))
+    }else{
+        load(paste0(output_path,"/DEout.RData"))
+    }
+
+    ## correlation using down-sampled datasets - top 1000 genes
+    allgenes <- DEout$celltype_all_genes[[celltype_name]]
+    # rank DEGs by significance
+    allgenes_ordered <- allgenes[order(allgenes$PValue),]
+    # get top 1000 most signficant DEGs from full dataset
+    top1000_full <- allgenes_ordered[1:1000,]
+    top1000 <- data.frame(top1000_full$name)
+    colnames(top1000) <- c("name")
+    top1000$`logFC` <- top1000_full$logFC
+    ## correlation using down-sampled datasets - top 500 genes
+    # get top 500 most signficant DEGs from full dataset
+    top500_full <- allgenes_ordered[1:500,]
+    top500 <- data.frame(top500_full$name)
+    colnames(top500) <- c("name")
+    top500$`logFC` <- top500_full$logFC
+
+    ## if down-sampled individuals
+    if(sampled=="individuals"){
+        
+        # define path and folders
+        path <- paste0(inpath,"/DE_downsampling/")
+        downsampled_folders <- paste0(paste(range_downsampled, sep=" ", collapse=NULL),"samples")
+        # get corr matrices for each permutation
+        corrMats_1000 <- list()
+        for(i in 1:Nperms){
+            setwd(path)
+            for(folder in mixedsort(list.files())){
+                # remove other files above
+                if(folder%in%downsampled_folders){
+                    newpath <- paste0(path,folder)
+                    # enter directory
+                    setwd(newpath)
+                    # get num_samples
+                    num_samples <- str_sub(folder,1,-8) 
+                    # go into permutation
+                    subpath <- paste0(newpath,"/",paste0(num_samples,"_",i))
+                    setwd(subpath)
+                    # read DE output
+                    load(paste0("DEout",num_samples,"_",i,".RData"))
+                    # get df for top 1000 genes
+                    all_genes <- get(paste0("DEout_",num_samples))$celltype_all_genes[[celltype_name]]
+                    assign(paste0("samples_",num_samples),subset(all_genes,name%in%top1000$name))
+                    # move out again
+                    setwd(newpath)
+                }
+            }
+            allstudies <- mget(paste0("samples_", paste(range_downsampled, sep=" ", collapse=NULL)))
+            names(allstudies) <- paste0(paste(range_downsampled, sep=" ", collapse=NULL),"_samples")
+            # compute correlation for this permutation
+            corrMats_1000[[i]] <- compute_downsampled_corr(allstudies)[[1]]
+        }
+        # compute mean of correlation matrices
+        meanCorr_1000 <- Reduce("+",corrMats_1000) / length(corrMats_1000)
+        # plot
+        setwd(paste0(output_path, "/corr_analysis/"))
+        # plot correlation matrix
+        meanCorr_downsampling_1000.plot <- ggcorrplot(round(meanCorr_1000,2), 
+                hc.order = F, insig="pch",pch=5,pch.col = "grey",
+                pch.cex=9,
+                title=paste0("Mean LogFC Correlation Matrix (across permutations) for top 1000 genes when downsampling"),
+                colors = c("#FC4E07", "white", "#00AFBB"),
+                outline.color = "white", lab = TRUE, lab_size=3.5,
+                sig.level=0.05) +
+                theme(plot.title = element_text(hjust = 0.7)) +
+                scale_fill_gradient2(breaks=c(0, 1), limit=c(0, 1)) +
+                labs(fill="Corr")
+        ggsave("meanCorr_downsampling_1000.png",meanCorr_downsampling_1000.plot,width=25,height=20,units="cm",bg="white")
+        ggsave("meanCorr_downsampling_1000.pdf",meanCorr_downsampling_1000.plot,width=25,height=20,units="cm",bg="white")
+
+        # get corr matrices for each permutation
+        corrMats_500 <- list()
+        for(i in 1:Nperms){
+            setwd(path)
+            for(folder in mixedsort(list.files())){
+                # remove other files above
+                if(folder%in%downsampled_folders){
+                    newpath <- paste0(path,folder)
+                    # enter directory
+                    setwd(newpath)
+                    # get num_samples
+                    num_samples <- str_sub(folder,1,-8) 
+                    # go into permutation
+                    subpath <- paste0(newpath,"/",paste0(num_samples,"_",i))
+                    setwd(subpath)
+                    # read DE output
+                    load(paste0("DEout",num_samples,"_",i,".RData"))
+                    # get df for top 500 genes
+                    all_genes <- get(paste0("DEout_",num_samples))$celltype_all_genes[[celltype_name]]
+                    assign(paste0("samples_",num_samples),subset(all_genes,name%in%top500$name))
+                    # move out again
+                    setwd(newpath)
+                }
+            }
+            allstudies <- mget(paste0("samples_", paste(range_downsampled, sep=" ", collapse=NULL)))
+            names(allstudies) <- paste0(paste(range_downsampled, sep=" ", collapse=NULL),"_samples")
+            # compute correlation for this permutation
+            corrMats_500[[i]] <- compute_downsampled_corr(allstudies)[[1]]
+        }
+        # compute mean of correlation matrices
+        meanCorr_500 <- Reduce("+",corrMats_500) / length(corrMats_500)
+        # plot
+        setwd(paste0(output_path, "/corr_analysis/"))
+        # plot correlation matrix
+        meanCorr_downsampling_500.plot <- ggcorrplot(round(meanCorr_500,2), 
+                hc.order = F, insig="pch",pch=5,pch.col = "grey",
+                pch.cex=9,
+                title=paste0("Mean LogFC Correlation Matrix (across permutations) for top 500 genes when downsampling"),
+                colors = c("#FC4E07", "white", "#00AFBB"),
+                outline.color = "white", lab = TRUE, lab_size=3.5,
+                sig.level=0.05) +
+                theme(plot.title = element_text(hjust = 0.7)) +
+                scale_fill_gradient2(breaks=c(0, 1), limit=c(0, 1)) +
+                labs(fill="Corr")
+        ggsave("meanCorr_downsampling_500.png",meanCorr_downsampling_500.plot,width=25,height=20,units="cm",bg="white")
+        ggsave("meanCorr_downsampling_500.pdf",meanCorr_downsampling_500.plot,width=25,height=20,units="cm",bg="white")
+
+    }else{
+
+        ## if down-sampled cells
+        # define path and folders
+        path <- paste0(inpath,"/DE_downsampling_cells/")
+        downsampled_folders <- paste0(paste(range_downsampled, sep=" ", collapse=NULL),"cells_persample")
+        # get corr matrices for each permutation
+        corrMats_cells_1000 <- list()
+        for(i in 1:Nperms){
+            setwd(path)
+            for(folder in mixedsort(list.files())){
+                # remove other files above
+                if(folder%in%downsampled_folders){
+                    newpath <- paste0(path,folder)
+                    # enter directory
+                    setwd(newpath)
+                    # get num_cells
+                    num_cells <- str_sub(folder,1,-16)
+                    # go into permutation
+                    subpath <- paste0(newpath,"/",paste0(num_cells,"_",i))
+                    setwd(subpath)
+                    # read DE output
+                    load(paste0("DEout",num_cells,"_",i,".RData"))
+                    # get df for top 1000 genes
+                    all_genes <- get(paste0("DEout_",num_cells))$celltype_all_genes[[celltype_name]]
+                    assign(paste0("cells_",num_cells),subset(all_genes,name%in%top1000$name))
+                    # move out again
+                    setwd(newpath)
+                }
+            }
+            allstudies <- mget(paste0("cells_", paste(range_downsampled, sep=" ", collapse=NULL)))
+            names(allstudies) <- paste0(paste(range_downsampled, sep=" ", collapse=NULL),"_cells/sample")
+            # compute correlation for this permutation
+            corrMats_cells_1000[[i]] <- compute_downsampled_corr(allstudies,"cells")[[1]]
+        }
+        # compute mean of correlation matrices
+        meanCorr_cells_1000 <- Reduce("+",corrMats_cells_1000) / length(corrMats_cells_1000)
+        # plot
+        setwd(paste0(output_path, "/corr_analysis/"))
+        # plot correlation matrix
+        meanCorr_downsampling_cells_1000.plot <- ggcorrplot(round(meanCorr_cells_1000,2), 
+                hc.order = F, insig="pch",pch=5,pch.col = "grey",
+                pch.cex=9,
+                title=paste0("Mean LogFC Correlation Matrix (across permutations) for top 1000 genes when downsampling cells"),
+                colors = c("#FC4E07", "white", "#00AFBB"),
+                outline.color = "white", lab = TRUE, lab_size=3.5,
+                sig.level=0.05) +
+                theme(plot.title = element_text(hjust = 0.7)) +
+                scale_fill_gradient2(breaks=c(0, 1), limit=c(0, 1)) +
+                labs(fill="Corr")
+        ggsave("meanCorr_downsampling_cells_1000.png",meanCorr_downsampling_cells_1000.plot,width=25,height=20,units="cm",bg="white")
+        ggsave("meanCorr_downsampling_cells_1000.pdf",meanCorr_downsampling_cells_1000.plot,width=25,height=20,units="cm",bg="white")
+
+        ## correlation using down-sampled datasets (cells) - top 500 genes
+        # load in DE outputs for down-sampled datasets, take union of DEGs across perms
+        setwd(path)
+        # get corr matrices for each permutation
+        corrMats_cells_500 <- list()
+        for(i in 1:Nperms){
+            setwd(path)
+            for(folder in mixedsort(list.files())){
+                # remove other files above
+                if(folder%in%downsampled_folders){
+                    newpath <- paste0(path,folder)
+                    # enter directory
+                    setwd(newpath)
+                    # get num_cells
+                    num_cells <- str_sub(folder,1,-16)
+                    # go into permutation
+                    subpath <- paste0(newpath,"/",paste0(num_cells,"_",i))
+                    setwd(subpath)
+                    # read DE output
+                    load(paste0("DEout",num_cells,"_",i,".RData"))
+                    # get df for top 500 genes
+                    all_genes <- get(paste0("DEout_",num_cells))$celltype_all_genes[[celltype_name]]
+                    assign(paste0("cells_",num_cells),subset(all_genes,name%in%top500$name))
+                    # move out again
+                    setwd(newpath)
+                }
+            }
+            allstudies <- mget(paste0("cells_", paste(range_downsampled, sep=" ", collapse=NULL)))
+            names(allstudies) <- paste0(paste(range_downsampled, sep=" ", collapse=NULL),"_cells/sample")
+            # compute correlation for this permutation
+            corrMats_cells_500[[i]] <- compute_downsampled_corr(allstudies,"cells")[[1]]
+        }
+        # compute mean of correlation matrices
+        meanCorr_cells_500 <- Reduce("+",corrMats_cells_500) / length(corrMats_cells_500)
+        # plot
+        setwd(paste0(output_path, "/corr_analysis/"))
+        # plot correlation matrix
+        meanCorr_downsampling_cells_500.plot <- ggcorrplot(round(meanCorr_cells_500,2), 
+                hc.order = F, insig="pch",pch=5,pch.col = "grey",
+                pch.cex=9,
+                title=paste0("Mean LogFC Correlation Matrix (across permutations) for top 500 genes when downsampling cells"),
+                colors = c("#FC4E07", "white", "#00AFBB"),
+                outline.color = "white", lab = TRUE, lab_size=3.5,
+                sig.level=0.05) +
+                theme(plot.title = element_text(hjust = 0.7)) +
+                scale_fill_gradient2(breaks=c(0, 1), limit=c(0, 1)) +
+                labs(fill="Corr")
+        ggsave("meanCorr_downsampling_cells_500.png",meanCorr_downsampling_cells_500.plot,width=25,height=20,units="cm",bg="white")
+        ggsave("meanCorr_downsampling_cells_500.pdf",meanCorr_downsampling_cells_500.plot,width=25,height=20,units="cm",bg="white")
+
+    }
+    
+}
\ No newline at end of file
diff --git a/R/power_analysis.r b/R/power_analysis.r
new file mode 100644
index 0000000..fa2b411
--- /dev/null
+++ b/R/power_analysis.r
@@ -0,0 +1,142 @@
+# Define global variables
+utils::globalVariables(c("dataset"))
+
+#' Runs entire power analysis
+
+#' @importFrom stats as.formula
+
+#' @param data - the input data (should be an SCE object)
+#' @param range_downsampled_individuals - range of values to downsample individuals for
+#' @param range_downsampled_cells - range of values to downsample cells for
+#' @param output_path - base path in which outputs will be stored
+#' @param sampleID - sample ID
+#' @param design - the design formula of class type `formula`. Equation used to fit the model- data for the generalised linear model e.g. expression ~ sex + pmi + disease
+#' @param sexID - sex ID
+#' @param celltypeID - cell type ID
+#' @param coeff - which coefficient to carry out DE analysis with respect to
+#' @param fdr - the cut-off False Discovery Rate below which to select DEGs
+#' @param nom_pval - the cut-off nominal P-value below which to select DEGs (as an alternative to FDR)
+#' @param Nperms - number of subsets created when downsampling at each level
+#' @param y - the column name in the SCE object for the return variable e.g. "diagnosis" - Case or disease. Default is the last variable in the design formula. y can be discrete (logistic regression) or continuous (linear regression)
+#' @param region - the column name in the SCE object for the study region. If there are multiple regions in the study (for example two brain regions). Pseudobulk values can be derived separately. Default is "single_region" which will not split by region.
+#' @param control - character specifying which control level for the differential expression analysis e.g. in a case/control/other study use "control" in the y column to compare against. NOTE only need to specify if more than two groups in y, leave as default value for two groups or continuous y. Default is NULL.
+#' @param pval_adjust_method - the adjustment method for the p-value in the differential expression analysis. Default is benjamini hochberg "BH". See  stats::p.adjust for available options
+#' @param rmv_zero_count_genes - whether genes with no count values in any cell should be removed. Default is TRUE
+
+#' Saves all plots and DE outputs in the appropriate directories
+
+power_analysis <- function(data,
+                           range_downsampled_individuals="placeholder",
+                           range_downsampled_cells="placeholder",
+                           output_path=getwd(),
+                           sampleID="donor_id",
+                           design="placeholder",
+                           sexID="sex",
+                           celltypeID="cell_type",
+                           coeff="male",
+                           fdr=0.05,
+                           nom_pval=0.05,
+                           Nperms=20,
+                           y=NULL,
+                           region="single_region",
+                           control=NULL,
+                           pval_adjust_method="BH",
+                           rmv_zero_count_genes=TRUE){
+    
+    setwd(output_path)
+
+    # alter range_downsampled_individuals
+    if(range_downsampled_individuals=="placeholder"){
+        range_downsampled_individuals <- downsampling_range(data, "individuals", sampleID)
+    }
+    # alter range_downsampled_cells
+    if(range_downsampled_cells=="placeholder"){
+        range_downsampled_cells <- downsampling_range(data, "cells", sampleID)
+    }
+    # alter design
+    if(design=="placeholder"){
+        design=as.formula(paste0("~",sexID))
+    }    
+
+    # create preliminary plots
+    preliminary_plots(data=data, 
+                      output_path=output_path, 
+                      sampleID=sampleID, 
+                      design=design, 
+                      sexID=sexID, 
+                      celltypeID=celltypeID, 
+                      coeff=coeff, 
+                      fdr=fdr)
+    
+    ## create power plots for down-sampling individuals, cells
+    # down-sample individuals and run DE analysis
+    downsampling_DEanalysis(data=data,
+                            range_downsampled=range_downsampled_individuals,
+                            output_path=output_path,
+                            sampled="individuals",
+                            sampleID=sampleID,
+                            design=design,
+                            sexID=sexID,
+                            celltypeID=celltypeID,
+                            y=y,
+                            region=region, 
+                            control=control, 
+                            pval_adjust_method=pval_adjust_method, 
+                            rmv_zero_count_genes=rmv_zero_count_genes, 
+                            coeff=coeff, 
+                            fdr=fdr, 
+                            nom_pval=nom_pval, 
+                            Nperms=Nperms)
+    # create power plots
+    power_plots(data=data, 
+                range_downsampled=range_downsampled_individuals, 
+                output_path=output_path,
+                sampled="individuals", 
+                sampleID=sampleID, 
+                celltypeID=celltypeID, 
+                fdr=fdr, 
+                nom_pval=nom_pval, 
+                Nperms=Nperms)
+    # down-sample cells and run DE analysis
+    downsampling_DEanalysis(data=data,
+                            range_downsampled=range_downsampled_cells,
+                            output_path=output_path,
+                            sampled="cells",
+                            sampleID=sampleID,
+                            design=design,
+                            sexID=sexID,
+                            celltypeID=celltypeID,
+                            y=y,
+                            region=region, 
+                            control=control, 
+                            pval_adjust_method=pval_adjust_method, 
+                            rmv_zero_count_genes=rmv_zero_count_genes, 
+                            coeff=coeff, 
+                            fdr=fdr, 
+                            nom_pval=nom_pval, 
+                            Nperms=Nperms)
+    # create power plots
+    power_plots(data=data, 
+                range_downsampled=range_downsampled_cells, 
+                output_path=output_path,
+                sampled="cells", 
+                sampleID=sampleID, 
+                celltypeID=celltypeID, 
+                fdr=fdr, 
+                nom_pval=nom_pval, 
+                Nperms=Nperms)
+    # create down-sampling correlation plots (individuals)
+    downsampling_corrplots(data=data, 
+                           range_downsampled=range_downsampled_individuals, 
+                           output_path=output_path, 
+                           sampled="individuals", 
+                           celltypeID=celltypeID, 
+                           Nperms=Nperms)
+    # create down-sampling correlation plots (cells)
+    downsampling_corrplots(data=data, 
+                           range_downsampled=range_downsampled_cells, 
+                           output_path=output_path, 
+                           sampled="cells", 
+                           celltypeID=celltypeID, 
+                           Nperms=Nperms)
+}
\ No newline at end of file
diff --git a/R/power_plots.r b/R/power_plots.r
index 3284ae1..eefc2a2 100644
--- a/R/power_plots.r
+++ b/R/power_plots.r
@@ -29,7 +29,7 @@ utils::globalVariables(c("design","PValue","logFC","name","variable"))
 #' @param pval_adjust_method the adjustment method for the p-value in the differential expression analysis. Default is benjamini hochberg "BH". See  stats::p.adjust for available options
 #' @param rmv_zero_count_genes whether genes with no count values in any cell should be removed. Default is TRUE
 
-#' @return saves all plots in the appropriate directory
+#' Saves all plots in the appropriate directory
 
 power_plots <- function(data,
                         range_downsampled="placeholder",
@@ -58,6 +58,7 @@ power_plots <- function(data,
     }
 
     # create output path if doesn't already exist
+    setwd(output_path)
     dir.create(output_path,showWarnings=FALSE)
     # validate function input params
     validate_input_parameters_power(data=data, range_downsampled=range_downsampled, output_path=output_path,
diff --git a/R/preliminary_plots.r b/R/preliminary_plots.r
new file mode 100644
index 0000000..7b1b2db
--- /dev/null
+++ b/R/preliminary_plots.r
@@ -0,0 +1,115 @@
+# Define global variables
+utils::globalVariables(c("..density.."))
+
+#' Create preliminary plots for data exploration
+
+#' @importFrom stats as.formula
+#' @importFrom ggplot2 ggplot labs theme ggsave aes element_text element_blank geom_histogram
+#' @importFrom cowplot theme_cowplot
+#' @importFrom gridExtra arrangeGrob
+#' @importFrom SingleCellExperiment colData
+
+#' @param data - the input data (should be an SCE object)
+#' @param output_path - base path in which outputs will be stored
+#' @param sampleID - sample ID
+#' @param design - the design formula of class type `formula`. Equation used to fit the model- data for the generalised linear model e.g. expression ~ sex + pmi + disease
+#' @param sexID - sex ID
+#' @param celltypeID - cell type ID
+#' @param coeff - which coefficient to carry out DE analysis with respect to
+#' @param fdr - the cut-off False Discovery Rate below which to select DEGs
+#' @param y - the column name in the SCE object for the return variable e.g. "diagnosis" - Case or disease. Default is the last variable in the design formula. y can be discrete (logistic regression) or continuous (linear regression)
+#' @param region - the column name in the SCE object for the study region. If there are multiple regions in the study (for example two brain regions). Pseudobulk values can be derived separately. Default is "single_region" which will not split by region.
+#' @param control - character specifying which control level for the differential expression analysis e.g. in a case/control/other study use "control" in the y column to compare against. NOTE only need to specify if more than two groups in y, leave as default value for two groups or continuous y. Default is NULL.
+#' @param pval_adjust_method - the adjustment method for the p-value in the differential expression analysis. Default is benjamini hochberg "BH". See  stats::p.adjust for available options
+#' @param rmv_zero_count_genes - whether genes with no count values in any cell should be removed. Default is TRUE
+
+#' Saves all plots in the appropriate directory
+
+preliminary_plots <- function(data,
+                              output_path=getwd(),
+                              sampleID="donor_id",
+                              design="placeholder",
+                              sexID="sex",
+                              celltypeID="cell_type",
+                              coeff="male",
+                              fdr=0.05,
+                              y=NULL,
+                              region="single_region",
+                              control=NULL,
+                              pval_adjust_method="BH",
+                              rmv_zero_count_genes=TRUE){
+
+    # create output path if doesn't already exist
+    setwd(output_path)
+    dir.create(output_path,showWarnings=FALSE)
+    # alter design
+    if(design=="placeholder"){
+        design=as.formula(paste0("~",sexID))
+    }
+    # validate function input params
+    validate_input_parameters_power(data=data, output_path=output_path, sampleID=sampleID,
+                                    design=design, sexID=sexID, celltypeID=celltypeID, 
+                                    coeff=coeff, fdr=fdr, y=y,
+                                    region=region, control=control, pval_adjust_method=pval_adjust_method,
+                                    rmv_zero_count_genes=rmv_zero_count_genes)
+
+    # get celltype name from dataset
+    celltype_name <- toString(unique(data[[celltypeID]]))
+    # check if DE analysis output present already in output_path
+    if(!"DEout.RData" %in% list.files(output_path)){
+    # run and save DE analysis
+        assign("DEout", sc_cell_type_de(data, design=design, pseudobulk_ID=sampleID, celltype_ID=celltypeID, y=y, region=region, control=control, pval_adjust_method=pval_adjust_method, rmv_zero_count_genes=rmv_zero_count_genes, verbose=T, coef=coeff))
+        save(DEout,file=paste0(output_path,"/DEout.RData"))
+    }else{
+        load(paste0(output_path,"/DEout.RData"))
+    }
+    
+    ## plot histogram of effect sizes in DEGs and all genes
+    # for DEGs (at fdr cut-off)
+    DEGs_full <- subset(DEout$celltype_all_genes[[celltype_name]], adj_pval<fdr)
+    logFCdist.plot <- ggplot(DEGs_full,aes(x=logFC)) + 
+                            geom_histogram(aes(y=..density..), color="grey1", fill="grey1", alpha=0.5, bins=25) + 
+                            labs(title=paste0("Distribution of LogFC (DEGs at ", fdr,"%)"),y="Density",x ="LogFC") + 
+                            theme_cowplot() +
+                            theme(axis.text=element_text(size=20),axis.title=element_text(size=22),plot.title=element_text(size=22))
+    ggsave(paste0(output_path, "/logFCdist.png"),logFCdist.plot,width=20,height=20,units="cm",bg="white")
+    ggsave(paste0(output_path, "/logFCdist.pdf"),logFCdist.plot,width=20,height=20,units="cm",bg="white")
+    # for all genes
+    allgenes <- DEout$celltype_all_genes[[celltype_name]]
+    logFCdist_allgenes.plot <- ggplot(allgenes,aes(x=logFC)) + 
+                            geom_histogram(aes(y=..density..), color="grey1", fill="grey1", alpha=0.5, bins=25) + 
+                            labs(title=paste0("Distribution of LogFC (all genes)"),y="Density",x ="LogFC") + 
+                            theme_cowplot() +
+                            theme(axis.text=element_text(size=20),axis.title=element_text(size=22),plot.title=element_text(size=22))
+    ggsave(paste0(output_path, "/logFCdist_allgenes.png"),logFCdist_allgenes.plot,width=20,height=20,units="cm",bg="white")
+    ggsave(paste0(output_path, "/logFCdist_allgenes.pdf"),logFCdist_allgenes.plot,width=20,height=20,units="cm",bg="white")
+
+    ## plot distribution of cells across individuals
+    ## get table of sample ID/number of cells
+    # get list of sample IDs
+    coldata <- colData(data)
+    IDs <- unique(coldata[[sampleID]])
+    # get list of number of cells
+    numIDs <- length(IDs)
+    numCells <- list()
+    for(i in 1:numIDs){
+        numCells[[i]] <- dim(coldata[coldata[[sampleID]]==IDs[[i]],])[[1]]
+    }
+    numCells <- unlist(numCells)
+    # put in dataframe
+    cellcount <- data.frame(IDs)
+    cellcount$numCells <- numCells
+    # create and save histogram
+    cellcount.plot <- ggplot(cellcount,aes(x=numCells)) + 
+                            geom_histogram(aes(y=..density..), color="grey1", fill="grey1", alpha=0.5, bins=25) + 
+                            labs(title="Distribution of numbers of cells across individuals",y="Density",x ="Number of cells") + 
+                            theme_cowplot()+
+                            theme(axis.text.y=element_blank(),axis.ticks.y=element_blank(),axis.text=element_text(size=20),axis.title=element_text(size=22),plot.title=element_text(size=22),axis.text.x = element_text(hjust=0.8))
+    ggsave(paste0(output_path, "/cellcount.png"),cellcount.plot,width=20,height=20,units="cm",bg="white")
+    ggsave(paste0(output_path, "/cellcount.pdf"),cellcount.plot,width=20,height=20,units="cm",bg="white")
+
+    # put plots in a subplot together
+    ggsave(paste0(output_path, "/QC_plots.png"),arrangeGrob(logFCdist.plot,logFCdist_allgenes.plot,cellcount.plot,ncol=3),width=60,height=20,units="cm",bg="white")
+    ggsave(paste0(output_path, "/QC_plots.pdf"),arrangeGrob(logFCdist.plot,logFCdist_allgenes.plot,cellcount.plot,ncol=3),width=60,height=20,units="cm",bg="white")
+
+}
\ No newline at end of file
diff --git a/man/compute_downsampled_corr.Rd b/man/compute_downsampled_corr.Rd
new file mode 100644
index 0000000..a4deb5f
--- /dev/null
+++ b/man/compute_downsampled_corr.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/compute_downsampled_corr.r
+\name{compute_downsampled_corr}
+\alias{compute_downsampled_corr}
+\title{For a given down-sampled DE output, computes the correlation of the log-foldchange of the DEGs (at specified p-value) for a given dataset (celltype)}
+\usage{
+compute_downsampled_corr(allstudies, sampled = "individuals")
+}
+\arguments{
+\item{allstudies}{\itemize{
+\item a list containing DGE analysis outputs (subset to "celltype_all_genes" and the specified cell type) at each down-sampled point
+}}
+
+\item{sampled}{\itemize{
+\item downsampling carried out based on what (either "individuals" or "cells")
+}}
+}
+\value{
+correlation matrix, plot and the number of DEGs at the specified p-value
+}
+\description{
+For a given down-sampled DE output, computes the correlation of the log-foldchange of the DEGs (at specified p-value) for a given dataset (celltype)
+}
diff --git a/man/downsampling_DEanalysis.Rd b/man/downsampling_DEanalysis.Rd
index 3e40bb6..5d322ec 100644
--- a/man/downsampling_DEanalysis.Rd
+++ b/man/downsampling_DEanalysis.Rd
@@ -57,10 +57,8 @@ downsampling_DEanalysis(
 
 \item{pval_adjust_method}{the adjustment method for the p-value in the differential expression analysis. Default is benjamini hochberg "BH". See  stats::p.adjust for available options}
 
-\item{rmv_zero_count_genes}{whether genes with no count values in any cell should be removed. Default is TRUE}
-}
-\value{
-saves all DE outputs for downsampled files as well as a summary table of results showing number of true DEGs detected at each number of samples/cells
+\item{rmv_zero_count_genes}{whether genes with no count values in any cell should be removed. Default is TRUE
+Saves all DE outputs for downsampled files as well as a summary table of results showing number of true DEGs detected at each number of samples/cells}
 }
 \description{
 Downsample the dataset, based either on the individuals or cells, and run DE analysis on each downsampled output. Save results in a dataframe
diff --git a/man/downsampling_corrplots.Rd b/man/downsampling_corrplots.Rd
new file mode 100644
index 0000000..643620b
--- /dev/null
+++ b/man/downsampling_corrplots.Rd
@@ -0,0 +1,84 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/downsampling_corrplots.r
+\name{downsampling_corrplots}
+\alias{downsampling_corrplots}
+\title{Create correlation plots of the effect sizes of the top 1000 and top 500 DEGs}
+\usage{
+downsampling_corrplots(
+  data,
+  range_downsampled = "placeholder",
+  output_path = getwd(),
+  inpath = "placeholder",
+  sampled = "individuals",
+  sampleID = "donor_id",
+  celltypeID = "cell_type",
+  coeff = "male",
+  Nperms = 20,
+  y = NULL,
+  region = "single_region",
+  control = NULL,
+  pval_adjust_method = "BH",
+  rmv_zero_count_genes = TRUE
+)
+}
+\arguments{
+\item{data}{\itemize{
+\item the input data (should be an SCE object)
+}}
+
+\item{range_downsampled}{\itemize{
+\item range of values to be downsampled for, in ascending order
+}}
+
+\item{output_path}{\itemize{
+\item base path in which outputs will be stored
+}}
+
+\item{inpath}{\itemize{
+\item base path where downsampled DE output folders are stored (taken to be output_path if not provided)
+}}
+
+\item{sampled}{\itemize{
+\item downsampling carried out based on what (either "individuals" or "cells")
+}}
+
+\item{sampleID}{\itemize{
+\item sample ID
+}}
+
+\item{celltypeID}{\itemize{
+\item cell type ID
+}}
+
+\item{coeff}{\itemize{
+\item which coefficient to carry out DE analysis with respect to
+}}
+
+\item{Nperms}{\itemize{
+\item number of subsets created when downsampling at each level
+}}
+
+\item{y}{\itemize{
+\item the column name in the SCE object for the return variable e.g. "diagnosis" - Case or disease. Default is the last variable in the design formula. y can be discrete (logistic regression) or continuous (linear regression)
+}}
+
+\item{region}{\itemize{
+\item the column name in the SCE object for the study region. If there are multiple regions in the study (for example two brain regions). Pseudobulk values can be derived separately. Default is "single_region" which will not split by region.
+}}
+
+\item{control}{\itemize{
+\item character specifying which control level for the differential expression analysis e.g. in a case/control/other study use "control" in the y column to compare against. NOTE only need to specify if more than two groups in y, leave as default value for two groups or continuous y. Default is NULL.
+}}
+
+\item{pval_adjust_method}{\itemize{
+\item the adjustment method for the p-value in the differential expression analysis. Default is benjamini hochberg "BH". See  stats::p.adjust for available options
+}}
+
+\item{rmv_zero_count_genes}{\itemize{
+\item whether genes with no count values in any cell should be removed. Default is TRUE
+Saves all plots in the appropriate directory
+}}
+}
+\description{
+Create correlation plots of the effect sizes of the top 1000 and top 500 DEGs
+}
diff --git a/man/power_analysis.Rd b/man/power_analysis.Rd
new file mode 100644
index 0000000..78eef9b
--- /dev/null
+++ b/man/power_analysis.Rd
@@ -0,0 +1,99 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/power_analysis.r
+\name{power_analysis}
+\alias{power_analysis}
+\title{Runs entire power analysis}
+\usage{
+power_analysis(
+  data,
+  range_downsampled_individuals = "placeholder",
+  range_downsampled_cells = "placeholder",
+  output_path = getwd(),
+  sampleID = "donor_id",
+  design = "placeholder",
+  sexID = "sex",
+  celltypeID = "cell_type",
+  coeff = "male",
+  fdr = 0.05,
+  nom_pval = 0.05,
+  Nperms = 20,
+  y = NULL,
+  region = "single_region",
+  control = NULL,
+  pval_adjust_method = "BH",
+  rmv_zero_count_genes = TRUE
+)
+}
+\arguments{
+\item{data}{\itemize{
+\item the input data (should be an SCE object)
+}}
+
+\item{range_downsampled_individuals}{\itemize{
+\item range of values to downsample individuals for
+}}
+
+\item{range_downsampled_cells}{\itemize{
+\item range of values to downsample cells for
+}}
+
+\item{output_path}{\itemize{
+\item base path in which outputs will be stored
+}}
+
+\item{sampleID}{\itemize{
+\item sample ID
+}}
+
+\item{design}{\itemize{
+\item the design formula of class type \code{formula}. Equation used to fit the model- data for the generalised linear model e.g. expression ~ sex + pmi + disease
+}}
+
+\item{sexID}{\itemize{
+\item sex ID
+}}
+
+\item{celltypeID}{\itemize{
+\item cell type ID
+}}
+
+\item{coeff}{\itemize{
+\item which coefficient to carry out DE analysis with respect to
+}}
+
+\item{fdr}{\itemize{
+\item the cut-off False Discovery Rate below which to select DEGs
+}}
+
+\item{nom_pval}{\itemize{
+\item the cut-off nominal P-value below which to select DEGs (as an alternative to FDR)
+}}
+
+\item{Nperms}{\itemize{
+\item number of subsets created when downsampling at each level
+}}
+
+\item{y}{\itemize{
+\item the column name in the SCE object for the return variable e.g. "diagnosis" - Case or disease. Default is the last variable in the design formula. y can be discrete (logistic regression) or continuous (linear regression)
+}}
+
+\item{region}{\itemize{
+\item the column name in the SCE object for the study region. If there are multiple regions in the study (for example two brain regions). Pseudobulk values can be derived separately. Default is "single_region" which will not split by region.
+}}
+
+\item{control}{\itemize{
+\item character specifying which control level for the differential expression analysis e.g. in a case/control/other study use "control" in the y column to compare against. NOTE only need to specify if more than two groups in y, leave as default value for two groups or continuous y. Default is NULL.
+}}
+
+\item{pval_adjust_method}{\itemize{
+\item the adjustment method for the p-value in the differential expression analysis. Default is benjamini hochberg "BH". See  stats::p.adjust for available options
+}}
+
+\item{rmv_zero_count_genes}{\itemize{
+\item whether genes with no count values in any cell should be removed. Default is TRUE
+Saves all plots and DE outputs in the appropriate directories
+}}
+}
+\description{
+Runs entire power analysis
+}
diff --git a/man/power_plots.Rd b/man/power_plots.Rd
index db571aa..d7bdc32 100644
--- a/man/power_plots.Rd
+++ b/man/power_plots.Rd
@@ -54,10 +54,8 @@ power_plots(
 
 \item{pval_adjust_method}{the adjustment method for the p-value in the differential expression analysis. Default is benjamini hochberg "BH". See  stats::p.adjust for available options}
 
-\item{rmv_zero_count_genes}{whether genes with no count values in any cell should be removed. Default is TRUE}
-}
-\value{
-saves all plots in the appropriate directory
+\item{rmv_zero_count_genes}{whether genes with no count values in any cell should be removed. Default is TRUE
+Saves all plots in the appropriate directory}
 }
 \description{
 Create plots for power analysis, with down-sampling based either on the individuals or cells
diff --git a/man/preliminary_plots.Rd b/man/preliminary_plots.Rd
new file mode 100644
index 0000000..f5dddd7
--- /dev/null
+++ b/man/preliminary_plots.Rd
@@ -0,0 +1,79 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/preliminary_plots.r
+\name{preliminary_plots}
+\alias{preliminary_plots}
+\title{Create preliminary plots for data exploration}
+\usage{
+preliminary_plots(
+  data,
+  output_path = getwd(),
+  sampleID = "donor_id",
+  design = "placeholder",
+  sexID = "sex",
+  celltypeID = "cell_type",
+  coeff = "male",
+  fdr = 0.05,
+  y = NULL,
+  region = "single_region",
+  control = NULL,
+  pval_adjust_method = "BH",
+  rmv_zero_count_genes = TRUE
+)
+}
+\arguments{
+\item{data}{\itemize{
+\item the input data (should be an SCE object)
+}}
+
+\item{output_path}{\itemize{
+\item base path in which outputs will be stored
+}}
+
+\item{sampleID}{\itemize{
+\item sample ID
+}}
+
+\item{design}{\itemize{
+\item the design formula of class type \code{formula}. Equation used to fit the model- data for the generalised linear model e.g. expression ~ sex + pmi + disease
+}}
+
+\item{sexID}{\itemize{
+\item sex ID
+}}
+
+\item{celltypeID}{\itemize{
+\item cell type ID
+}}
+
+\item{coeff}{\itemize{
+\item which coefficient to carry out DE analysis with respect to
+}}
+
+\item{fdr}{\itemize{
+\item the cut-off False Discovery Rate below which to select DEGs
+}}
+
+\item{y}{\itemize{
+\item the column name in the SCE object for the return variable e.g. "diagnosis" - Case or disease. Default is the last variable in the design formula. y can be discrete (logistic regression) or continuous (linear regression)
+}}
+
+\item{region}{\itemize{
+\item the column name in the SCE object for the study region. If there are multiple regions in the study (for example two brain regions). Pseudobulk values can be derived separately. Default is "single_region" which will not split by region.
+}}
+
+\item{control}{\itemize{
+\item character specifying which control level for the differential expression analysis e.g. in a case/control/other study use "control" in the y column to compare against. NOTE only need to specify if more than two groups in y, leave as default value for two groups or continuous y. Default is NULL.
+}}
+
+\item{pval_adjust_method}{\itemize{
+\item the adjustment method for the p-value in the differential expression analysis. Default is benjamini hochberg "BH". See  stats::p.adjust for available options
+}}
+
+\item{rmv_zero_count_genes}{\itemize{
+\item whether genes with no count values in any cell should be removed. Default is TRUE
+Saves all plots in the appropriate directory
+}}
+}
+\description{
+Create preliminary plots for data exploration
+}
diff --git a/vignettes/getting_started.Rmd b/vignettes/getting_started.Rmd
new file mode 100644
index 0000000..0faaba7
--- /dev/null
+++ b/vignettes/getting_started.Rmd
@@ -0,0 +1,33 @@
+---
+title: "Getting Started"
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{getting_started}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r, include = FALSE}
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>"
+)
+```
+***
+# Introduction
+
+The *poweranalysis* R package is designed to run robust power analysis for differential gene expression in scRNA-seq studies, addressing the challenge of low statistical power due to small sample sizes. It provides tools to estimate the optimal number of samples and cells needed to achieve reliable power levels.
+<br>
+<br>
+Using *poweranalysis* involves four steps: <br>
+**1. Differential Expression (DE) Analysis:** Import your SCE object and perform DE analysis using a pseudobulking approach, allowing for the robust identification of differentially expressed genes (DEGs) across conditions or groups. <br>
+**2. Correlation Analysis:** Assess the consistency of DEG effect sizes across random permutations, subsets of your data, and different studies by computing correlation matrices. <br>
+**3. Downsampling and Power Analysis:** Test the robustness of your findings by downsampling both individuals and cells, then analyze the effect on DEG detection power. <br>
+**4. Visualisation:** Generate comprehensive power plots and visual summaries of the power analysis results, including DEG detection rates, false discovery rates, and mean LogFC correlation between down-sampled subsets. 
+<br>
+
+# Setup
+
+```{r setup}
+library(poweranalysis)
+```
diff --git a/vignettes/poweranalysis.Rmd b/vignettes/poweranalysis.Rmd
index a498173..353272d 100644
--- a/vignettes/poweranalysis.Rmd
+++ b/vignettes/poweranalysis.Rmd
@@ -1,5 +1,5 @@
 ---
-title: "poweranalysis"
+title: "Power Analysis"
 output: rmarkdown::html_vignette
 vignette: >
   %\VignetteIndexEntry{poweranalysis}
@@ -13,7 +13,55 @@ knitr::opts_chunk$set(
   comment = "#>"
 )
 ```
+***
+### Authors: *Salman Fawad*, *Yunning Yuan*, *Nathan Skene* <br>
+### README updated: *Aug-09-2024* <br>
+<br>
 
+# Introduction
+
+The *poweranalysis* R package is designed to run robust power analysis for differential gene expression in scRNA-seq studies, addressing the challenge of low statistical power due to small sample sizes. It provides tools to estimate the optimal number of samples and cells needed to achieve reliable power levels.
+<br>
+<br>
+Using *poweranalysis* involves four steps: <br>
+**1. Differential Expression (DE) Analysis:** Import your SCE object and perform DE analysis using a pseudobulking approach, allowing for the robust identification of differentially expressed genes (DEGs) across conditions or groups. <br>
+**2. Correlation Analysis:** Assess the consistency of DEG effect sizes across random permutations, subsets of your data, and different studies by computing correlation matrices. <br>
+**3. Downsampling and Power Analysis:** Test the robustness of your findings by downsampling both individuals and cells, then analyze the effect on DEG detection power. <br>
+**4. Visualisation:** Generate comprehensive power plots and visual summaries of the power analysis results, including DEG detection rates, false discovery rates, and mean LogFC correlation between down-sampled subsets.  
+
+
+# Installation  
+<br>
+
+# Setup
 ```{r setup}
 library(poweranalysis)
 ```
+<br>
+
+# Documentation  
+## [Getting Started](http://127.0.0.1:18836/library/poweranalysis/doc/getting_started.html)
+<br>
+
+# Troubleshooting
+If you have any problems, please do submit an [Issue here on GitHub](https://github.com/neurogenomics/Power_Analysis_package) with a reproducible example.  
+<br>
+
+# Citation  
+
+<br>
+***
+<br>
+
+# Contact  
+## [Neurogenomis Lab](https://www.neurogenomics.co.uk/)
+UK Dementia Research Institute <br>
+Department of Brain Sciences <br>
+Faculty of Medicine <br>
+Imperial College London <br>
+[GitHub](https://github.com/neurogenomics)
+[DockerHub](https://hub.docker.com/orgs/neurogenomicslab)  
+<br>
+
+# References
+