Updates bioc labs 01-03

NBISweden · Jan 30, 2024 · b88b04b · b88b04b
1 parent 0d58017
commit b88b04b
Show file tree

Hide file tree

Showing 6 changed files with 56 additions and 25 deletions.
diff --git a/labs/_metadata.yml b/labs/_metadata.yml
@@ -140,6 +140,13 @@ dimred_plotgenes: "Genes of interest"
 dimred_plotgenes_1: "Let's plot some marker genes for different cell types onto the embedding."
 dimred_plotgenes_2: "Select some of your dimensionality reductions and plot some of the QC stats that were calculated in the previous lab. Can you see if some of the separation in your data is driven by quality of the cells?"
 
+dimred_harmony: "Harmony"
+dimred_harmony_1: "An alternative method for integration is Harmony, for more details on the method, please se their paper  [Nat. Methods](https://www.nature.com/articles/s41592-019-0619-0). This method runs the integration on a dimensionality reduction, in most applications the PCA. So first, we will rerun scaling and PCA with the same set of genes that were used for the CCA integration."
+
+dimred_scanorama: "Scanorama"
+dimred_scanorama_1: "Another integration method is Scanorama (see [Nat. Biotech.](https://www.nature.com/articles/s41587-019-0113-3)). This method is implemented in python, but we can run it through the Reticulate package."
+dimred_scanorama_2: "We will run it with the same set of variable genes, but first we have to create a list of all the objects per sample."
+
 # save data
 dimred_save: "Save data"
 dimred_save_1: "We can finally save the object for use in future steps."

diff --git a/labs/bioc/bioc_01_qc.qmd b/labs/bioc/bioc_01_qc.qmd
@@ -92,18 +92,18 @@ ctrl.19 <- Seurat::Read10X_h5(
 {{< meta qc_collate_1 >}}
 
 ```{r}
-sce <- SingleCellExperiment(assays = list(counts = cbind(cov.1, cov.15, cov.17, ctrl.5, ctrl.13, ctrl.14)))
+sce <- SingleCellExperiment(assays = list(counts = cbind(cov.1, cov.15, cov.16, cov.17, ctrl.5, ctrl.13, ctrl.14,ctrl.19)))
 dim(sce)
 # Adding metadata
-sce@colData$sample <- unlist(sapply(c("cov.1", "cov.15", "cov.17", "ctrl.5", "ctrl.13", "ctrl.14"), function(x) rep(x, ncol(get(x)))))
+sce@colData$sample <- unlist(sapply(c("cov.1", "cov.15", "cov.16", "cov.17", "ctrl.5", "ctrl.13", "ctrl.14","ctrl.19"), function(x) rep(x, ncol(get(x)))))
 sce@colData$type <- ifelse(grepl("cov", sce@colData$sample), "Covid", "Control")
 ```
 
 {{< meta qc_collate_2 >}}
 
 ```{r}
 # remove all objects that will not be used.
-rm(cov.15, cov.1, cov.17, ctrl.5, ctrl.13, ctrl.14)
+rm(cov.15, cov.1, cov.17, cov.16, ctrl.5, ctrl.13, ctrl.14, ctrl.19)
 # run garbage collect to free up memory
 gc()
 ```
@@ -127,7 +127,7 @@ mito_genes <- rownames(sce)[grep("^MT-", rownames(sce))]
 # Ribosomal genes
 ribo_genes <- rownames(sce)[grep("^RP[SL]", rownames(sce))]
 # Hemoglobin genes - includes all genes starting with HB except HBP.
-hb_genes <- rownames(sce)[grep("^HB[^(P)]", rownames(sce))]
+hb_genes <- rownames(sce)[grep("^HB[^(P|E|S)]", rownames(sce))]
 ```
 
 First, let Scran calculate some general qc-stats for genes and cells with the function `perCellQCMetrics`. It can also calculate proportion of counts for specific gene subsets, so first we need to define which genes are mitochondrial, ribosomal and hemoglobin.
@@ -222,7 +222,7 @@ In Scater, you can also use the function `plotHighestExprs()` to plot the gene c
 # Compute the relative expression of each gene per cell
 # Use sparse matrix operations, if your dataset is large, doing matrix devisions the regular way will take a very long time.
 C <- counts(sce)
-C@x <- C@x / rep.int(colSums(C), diff(C@p))
+C@x <- C@x / rep.int(colSums(C), diff(C@p)) * 100
 most_expressed <- order(Matrix::rowSums(C), decreasing = T)[20:1]
 boxplot(as.matrix(t(C[most_expressed, ])), cex = .1, las = 1, xlab = "% total count per cell", col = scales::hue_pal()(20)[20:1], horizontal = TRUE)
 
@@ -238,7 +238,7 @@ rm(C)
 {{< meta qc_filter_mr_1 >}}
 
 ```{r}
-selected_mito <- sce.filt$subsets_mt_percent < 30
+selected_mito <- sce.filt$subsets_mt_percent < 20
 selected_ribo <- sce.filt$subsets_ribo_percent > 5
 
 # and subset the object to only keep those cells
@@ -282,8 +282,8 @@ sce.filt <- sce.filt[!grepl("^MT-", rownames(sce.filt)), ]
 # Filter Ribossomal gene (optional if that is a problem on your data)
 # sce.filt <- sce.filt[ ! grepl("^RP[SL]", rownames(sce.filt)), ]
 
-# Filter Hemoglobin gene
-sce.filt <- sce.filt[!grepl("^HB[^(PES)]", rownames(sce.filt)), ]
+# Filter Hemoglobin gene  (optional if that is a problem on your data)
+#sce.filt <- sce.filt[!grepl("^HB[^(P|E|S)]", rownames(sce.filt)), ]
 
 dim(sce.filt)
 ```
@@ -385,6 +385,7 @@ assignments <- cyclone(sce.filt[ensembl %in% cc.ensembl, ], hs.pairs, gene.names
 sce.filt$G1.score <- assignments$scores$G1
 sce.filt$G2M.score <- assignments$scores$G2M
 sce.filt$S.score <- assignments$scores$S
+sce.filt$phase <- assignments$phases
 ```
 
 {{< meta qc_cellcycle_2 >}}
@@ -394,7 +395,7 @@ sce.filt$S.score <- assignments$scores$S
 #| fig-width: 14
 
 wrap_plots(
-    plotColData(sce.filt, y = "G2M.score", x = "G1.score", colour_by = "sample"),
+    plotColData(sce.filt, y = "G2M.score", x = "G1.score", colour_by = "phase"),
     plotColData(sce.filt, y = "G2M.score", x = "sample", colour_by = "sample"),
     plotColData(sce.filt, y = "G1.score", x = "sample", colour_by = "sample"),
     plotColData(sce.filt, y = "S.score", x = "sample", colour_by = "sample"),
@@ -404,6 +405,8 @@ wrap_plots(
 
 Cyclone predicts most cells as G1, but also quite a lot of cells with high S-Phase scores. Compare to results with Seurat and Scanpy and see how different predictors will give clearly different results.
 
+Cyclone does an automatic prediction of cell cycle phase with a default cutoff of the scores at 0.5 As you can see this does not fit this data very well, so be cautious with using these predictions. Instead we suggest that you look at the scores.
+
 ## {{< meta qc_doublet >}}
 
 {{< meta qc_doublet_1 >}}

diff --git a/labs/bioc/bioc_02_dimred.qmd b/labs/bioc/bioc_02_dimred.qmd
@@ -110,6 +110,8 @@ wrap_plots(
 
 {{< meta dimred_pca_3 >}}
 
+Here, we can check how the different metadata variables contributes to each PC. This can be important to look at to understand different biases you may have in your data.
+
 ```{r}
 #| fig-height: 3.5
 #| fig-width: 10

diff --git a/labs/bioc/bioc_03_integration.qmd b/labs/bioc/bioc_03_integration.qmd
@@ -22,7 +22,20 @@ Code chunks run R commands unless otherwise specified.
 
 {{< meta int_prep_1 >}}
 
+
+
 ```{r}
+load("temp.rds")
+scanorama <- reticulate::import("scanorama")
+
+integrated.data <- scanorama$integrate(datasets_full = scelist, genes_list = genelist)
+```
+
+```{r}
+# Activate scanorama Python venv
+reticulate::use_virtualenv("/opt/venv/scanorama")
+reticulate::py_discover_config()
+
 suppressPackageStartupMessages({
     library(scater)
     library(scran)
@@ -33,9 +46,7 @@ suppressPackageStartupMessages({
     library(reticulate)
 })
 
-# Activate scanorama Python venv
-reticulate::use_virtualenv("/opt/venv/scanorama")
-reticulate::py_discover_config()
+
 ```
 
 ```{r}
@@ -155,10 +166,10 @@ for (i in c("CD3E", "CD4", "CD8A", "NKG7", "GNLY", "MS4A1", "CD14", "LYZ", "MS4A
 }
 wrap_plots(plotlist = plotlist, ncol = 3)
 ```
+## {{< meta dimred_harmony >}}
 
-INTEG_R1:
+{{< meta dimred_harmony_1 >}}
 
-INTEG_R2:
 
 ```{r}
 #| fig-height: 5
@@ -180,9 +191,13 @@ sce <- RunHarmony(
 sce <- runUMAP(sce, dimred = "harmony", n_dimred = 50, ncomponents = 2, name = "UMAP_on_Harmony")
 ```
 
-INTEG_R3:
+## {{< meta dimred_scanorama >}}
+
+{{< meta dimred_scanorama_1 >}}
+
+{{< meta dimred_scanorama_2 >}}
+
 
-INTEG_R4:
 
 ```{r}
 #| fig-height: 5
@@ -200,7 +215,7 @@ for (i in 1:length(sce.list)) {
 lapply(scelist, dim)
 ```
 
-INTEG_R5:
+Now we can run scanorama:
 
 ```{r}
 #| fig-height: 5
@@ -215,17 +230,22 @@ intdimred <- do.call(rbind, integrated.data[[1]])
 colnames(intdimred) <- paste0("PC_", 1:100)
 rownames(intdimred) <- colnames(logcounts(sce))
 
-# Add standard deviations in order to draw Elbow Plots in Seurat
+# Add standard deviations in order to draw Elbow Plots 
 stdevs <- apply(intdimred, MARGIN = 2, FUN = sd)
 attr(intdimred, "varExplained") <- stdevs
 
 reducedDim(sce, "Scanorama_PCA") <- intdimred
 
 # Here we use all PCs computed from Scanorama for UMAP calculation
 sce <- runUMAP(sce, dimred = "Scanorama_PCA", n_dimred = 50, ncomponents = 2, name = "UMAP_on_Scanorama")
+
+plotReducedDim(sce, dimred = "UMAP_on_Scanorama", colour_by = "sample", point_size = 0.6) + ggplot2::ggtitle(label = "UMAP_on_Scanorama")
 ```
 
-INTEG_R6:
+## Overview all methods
+
+Now we will plot UMAPS with all three integration methods side by side.
+
 
 ```{r}
 #| fig-height: 8
@@ -241,7 +261,6 @@ wrap_plots(p1, p2, p3, p4, nrow = 2) +
     plot_layout(guides = "collect")
 ```
 
-INTEG_R7:
 
 {{< meta int_save >}}
 

diff --git a/labs/seurat/seurat_01_qc.qmd b/labs/seurat/seurat_01_qc.qmd
@@ -287,7 +287,7 @@ data.filt <- data.filt[!grepl("^MT-", rownames(data.filt)), ]
 # data.filt <- data.filt[ ! grepl("^RP[SL]", rownames(data.filt)), ]
 
 # Filter Hemoglobin gene (optional if that is a problem on your data)
-data.filt <- data.filt[!grepl("^HB[^(PES)]", rownames(data.filt)), ]
+# data.filt <- data.filt[!grepl("^HB[^(P|E|S)]", rownames(data.filt)), ]
 
 dim(data.filt)
 ```

diff --git a/labs/seurat/seurat_05_dge.qmd b/labs/seurat/seurat_05_dge.qmd
@@ -422,8 +422,8 @@ nPatient <- sapply(unique(cell_selection$orig.ident), function(x) {
     rowSums(cell_selection@assays$RNA@counts[, cell_selection$orig.ident
     == x] > 2)
 })
-nCovid <- rowSums(nPatient[, 1:3] > 2)
-nCtrl <- rowSums(nPatient[, 4:6] > 2)
+nCovid <- rowSums(nPatient[, 1:4] > 2)
+nCtrl <- rowSums(nPatient[, 5:8] > 2)
 
 sel <- nCovid >= 2 | nCtrl >= 2
 cell_selection_sub <- cell_selection[sel, ]
@@ -502,7 +502,7 @@ top1 <- head(fcHurdle1[order(fcHurdle1$`Pr(>Chisq)`), ], 10)
 top1
 
 fcHurdle1$pval <- fcHurdle1$`Pr(>Chisq)`
-fcHurdle1$dir <- ifelse(fcHurdle1$z > 0, "up", "down")
+fcHurdle1$dir <- ifelse(fcHurdle1$z > 0, "Ctrl", "Covid")
 fcHurdle1 %>%
     group_by(dir) %>%
     top_n(-10, pval) %>%
@@ -518,7 +518,7 @@ top2 <- head(fcHurdle2[order(fcHurdle2$`Pr(>Chisq)`), ], 10)
 top2
 
 fcHurdle2$pval <- fcHurdle2$`Pr(>Chisq)`
-fcHurdle2$dir <- ifelse(fcHurdle2$z > 0, "up", "down")
+fcHurdle2$dir <- ifelse(fcHurdle2$z > 0, "Ctrl", "Covid")
 fcHurdle2 %>%
     group_by(dir) %>%
     top_n(-10, pval) %>%