From b88b04ba596a7dd909fb0dc6d8b4729adff3d14c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=85sa=20Bj=C3=B6rklund?= <asa.bjorklund@scilifelab.se>
Date: Tue, 30 Jan 2024 16:50:44 +0100
Subject: [PATCH] Updates bioc labs 01-03

---
 labs/_metadata.yml                |  7 ++++++
 labs/bioc/bioc_01_qc.qmd          | 21 +++++++++-------
 labs/bioc/bioc_02_dimred.qmd      |  2 ++
 labs/bioc/bioc_03_integration.qmd | 41 ++++++++++++++++++++++---------
 labs/seurat/seurat_01_qc.qmd      |  2 +-
 labs/seurat/seurat_05_dge.qmd     |  8 +++---
 6 files changed, 56 insertions(+), 25 deletions(-)

diff --git a/labs/_metadata.yml b/labs/_metadata.yml
index 5f3dc964..eb43684a 100644
--- a/labs/_metadata.yml
+++ b/labs/_metadata.yml
@@ -140,6 +140,13 @@ dimred_plotgenes: "Genes of interest"
 dimred_plotgenes_1: "Let's plot some marker genes for different cell types onto the embedding."
 dimred_plotgenes_2: "Select some of your dimensionality reductions and plot some of the QC stats that were calculated in the previous lab. Can you see if some of the separation in your data is driven by quality of the cells?"
 
+dimred_harmony: "Harmony"
+dimred_harmony_1: "An alternative method for integration is Harmony, for more details on the method, please se their paper  [Nat. Methods](https://www.nature.com/articles/s41592-019-0619-0). This method runs the integration on a dimensionality reduction, in most applications the PCA. So first, we will rerun scaling and PCA with the same set of genes that were used for the CCA integration."
+
+dimred_scanorama: "Scanorama"
+dimred_scanorama_1: "Another integration method is Scanorama (see [Nat. Biotech.](https://www.nature.com/articles/s41587-019-0113-3)). This method is implemented in python, but we can run it through the Reticulate package."
+dimred_scanorama_2: "We will run it with the same set of variable genes, but first we have to create a list of all the objects per sample."
+
 # save data
 dimred_save: "Save data"
 dimred_save_1: "We can finally save the object for use in future steps."
diff --git a/labs/bioc/bioc_01_qc.qmd b/labs/bioc/bioc_01_qc.qmd
index 533196f0..c05b8dfb 100644
--- a/labs/bioc/bioc_01_qc.qmd
+++ b/labs/bioc/bioc_01_qc.qmd
@@ -92,10 +92,10 @@ ctrl.19 <- Seurat::Read10X_h5(
 {{< meta qc_collate_1 >}}
 
 ```{r}
-sce <- SingleCellExperiment(assays = list(counts = cbind(cov.1, cov.15, cov.17, ctrl.5, ctrl.13, ctrl.14)))
+sce <- SingleCellExperiment(assays = list(counts = cbind(cov.1, cov.15, cov.16, cov.17, ctrl.5, ctrl.13, ctrl.14,ctrl.19)))
 dim(sce)
 # Adding metadata
-sce@colData$sample <- unlist(sapply(c("cov.1", "cov.15", "cov.17", "ctrl.5", "ctrl.13", "ctrl.14"), function(x) rep(x, ncol(get(x)))))
+sce@colData$sample <- unlist(sapply(c("cov.1", "cov.15", "cov.16", "cov.17", "ctrl.5", "ctrl.13", "ctrl.14","ctrl.19"), function(x) rep(x, ncol(get(x)))))
 sce@colData$type <- ifelse(grepl("cov", sce@colData$sample), "Covid", "Control")
 ```
 
@@ -103,7 +103,7 @@ sce@colData$type <- ifelse(grepl("cov", sce@colData$sample), "Covid", "Control")
 
 ```{r}
 # remove all objects that will not be used.
-rm(cov.15, cov.1, cov.17, ctrl.5, ctrl.13, ctrl.14)
+rm(cov.15, cov.1, cov.17, cov.16, ctrl.5, ctrl.13, ctrl.14, ctrl.19)
 # run garbage collect to free up memory
 gc()
 ```
@@ -127,7 +127,7 @@ mito_genes <- rownames(sce)[grep("^MT-", rownames(sce))]
 # Ribosomal genes
 ribo_genes <- rownames(sce)[grep("^RP[SL]", rownames(sce))]
 # Hemoglobin genes - includes all genes starting with HB except HBP.
-hb_genes <- rownames(sce)[grep("^HB[^(P)]", rownames(sce))]
+hb_genes <- rownames(sce)[grep("^HB[^(P|E|S)]", rownames(sce))]
 ```
 
 First, let Scran calculate some general qc-stats for genes and cells with the function `perCellQCMetrics`. It can also calculate proportion of counts for specific gene subsets, so first we need to define which genes are mitochondrial, ribosomal and hemoglobin.
@@ -222,7 +222,7 @@ In Scater, you can also use the function `plotHighestExprs()` to plot the gene c
 # Compute the relative expression of each gene per cell
 # Use sparse matrix operations, if your dataset is large, doing matrix devisions the regular way will take a very long time.
 C <- counts(sce)
-C@x <- C@x / rep.int(colSums(C), diff(C@p))
+C@x <- C@x / rep.int(colSums(C), diff(C@p)) * 100
 most_expressed <- order(Matrix::rowSums(C), decreasing = T)[20:1]
 boxplot(as.matrix(t(C[most_expressed, ])), cex = .1, las = 1, xlab = "% total count per cell", col = scales::hue_pal()(20)[20:1], horizontal = TRUE)
 
@@ -238,7 +238,7 @@ rm(C)
 {{< meta qc_filter_mr_1 >}}
 
 ```{r}
-selected_mito <- sce.filt$subsets_mt_percent < 30
+selected_mito <- sce.filt$subsets_mt_percent < 20
 selected_ribo <- sce.filt$subsets_ribo_percent > 5
 
 # and subset the object to only keep those cells
@@ -282,8 +282,8 @@ sce.filt <- sce.filt[!grepl("^MT-", rownames(sce.filt)), ]
 # Filter Ribossomal gene (optional if that is a problem on your data)
 # sce.filt <- sce.filt[ ! grepl("^RP[SL]", rownames(sce.filt)), ]
 
-# Filter Hemoglobin gene
-sce.filt <- sce.filt[!grepl("^HB[^(PES)]", rownames(sce.filt)), ]
+# Filter Hemoglobin gene  (optional if that is a problem on your data)
+#sce.filt <- sce.filt[!grepl("^HB[^(P|E|S)]", rownames(sce.filt)), ]
 
 dim(sce.filt)
 ```
@@ -385,6 +385,7 @@ assignments <- cyclone(sce.filt[ensembl %in% cc.ensembl, ], hs.pairs, gene.names
 sce.filt$G1.score <- assignments$scores$G1
 sce.filt$G2M.score <- assignments$scores$G2M
 sce.filt$S.score <- assignments$scores$S
+sce.filt$phase <- assignments$phases
 ```
 
 {{< meta qc_cellcycle_2 >}}
@@ -394,7 +395,7 @@ sce.filt$S.score <- assignments$scores$S
 #| fig-width: 14
 
 wrap_plots(
-    plotColData(sce.filt, y = "G2M.score", x = "G1.score", colour_by = "sample"),
+    plotColData(sce.filt, y = "G2M.score", x = "G1.score", colour_by = "phase"),
     plotColData(sce.filt, y = "G2M.score", x = "sample", colour_by = "sample"),
     plotColData(sce.filt, y = "G1.score", x = "sample", colour_by = "sample"),
     plotColData(sce.filt, y = "S.score", x = "sample", colour_by = "sample"),
@@ -404,6 +405,8 @@ wrap_plots(
 
 Cyclone predicts most cells as G1, but also quite a lot of cells with high S-Phase scores. Compare to results with Seurat and Scanpy and see how different predictors will give clearly different results.
 
+Cyclone does an automatic prediction of cell cycle phase with a default cutoff of the scores at 0.5 As you can see this does not fit this data very well, so be cautious with using these predictions. Instead we suggest that you look at the scores.
+
 ## {{< meta qc_doublet >}}
 
 {{< meta qc_doublet_1 >}}
diff --git a/labs/bioc/bioc_02_dimred.qmd b/labs/bioc/bioc_02_dimred.qmd
index a031d693..ea36f643 100644
--- a/labs/bioc/bioc_02_dimred.qmd
+++ b/labs/bioc/bioc_02_dimred.qmd
@@ -110,6 +110,8 @@ wrap_plots(
 
 {{< meta dimred_pca_3 >}}
 
+Here, we can check how the different metadata variables contributes to each PC. This can be important to look at to understand different biases you may have in your data.
+
 ```{r}
 #| fig-height: 3.5
 #| fig-width: 10
diff --git a/labs/bioc/bioc_03_integration.qmd b/labs/bioc/bioc_03_integration.qmd
index 8319a727..0fcea288 100644
--- a/labs/bioc/bioc_03_integration.qmd
+++ b/labs/bioc/bioc_03_integration.qmd
@@ -22,7 +22,20 @@ Code chunks run R commands unless otherwise specified.
 
 {{< meta int_prep_1 >}}
 
+
+
 ```{r}
+load("temp.rds")
+scanorama <- reticulate::import("scanorama")
+
+integrated.data <- scanorama$integrate(datasets_full = scelist, genes_list = genelist)
+```
+
+```{r}
+# Activate scanorama Python venv
+reticulate::use_virtualenv("/opt/venv/scanorama")
+reticulate::py_discover_config()
+
 suppressPackageStartupMessages({
     library(scater)
     library(scran)
@@ -33,9 +46,7 @@ suppressPackageStartupMessages({
     library(reticulate)
 })
 
-# Activate scanorama Python venv
-reticulate::use_virtualenv("/opt/venv/scanorama")
-reticulate::py_discover_config()
+
 ```
 
 ```{r}
@@ -155,10 +166,10 @@ for (i in c("CD3E", "CD4", "CD8A", "NKG7", "GNLY", "MS4A1", "CD14", "LYZ", "MS4A
 }
 wrap_plots(plotlist = plotlist, ncol = 3)
 ```
+## {{< meta dimred_harmony >}}
 
-INTEG_R1:
+{{< meta dimred_harmony_1 >}}
 
-INTEG_R2:
 
 ```{r}
 #| fig-height: 5
@@ -180,9 +191,13 @@ sce <- RunHarmony(
 sce <- runUMAP(sce, dimred = "harmony", n_dimred = 50, ncomponents = 2, name = "UMAP_on_Harmony")
 ```
 
-INTEG_R3:
+## {{< meta dimred_scanorama >}}
+
+{{< meta dimred_scanorama_1 >}}
+
+{{< meta dimred_scanorama_2 >}}
+
 
-INTEG_R4:
 
 ```{r}
 #| fig-height: 5
@@ -200,7 +215,7 @@ for (i in 1:length(sce.list)) {
 lapply(scelist, dim)
 ```
 
-INTEG_R5:
+Now we can run scanorama:
 
 ```{r}
 #| fig-height: 5
@@ -215,7 +230,7 @@ intdimred <- do.call(rbind, integrated.data[[1]])
 colnames(intdimred) <- paste0("PC_", 1:100)
 rownames(intdimred) <- colnames(logcounts(sce))
 
-# Add standard deviations in order to draw Elbow Plots in Seurat
+# Add standard deviations in order to draw Elbow Plots 
 stdevs <- apply(intdimred, MARGIN = 2, FUN = sd)
 attr(intdimred, "varExplained") <- stdevs
 
@@ -223,9 +238,14 @@ reducedDim(sce, "Scanorama_PCA") <- intdimred
 
 # Here we use all PCs computed from Scanorama for UMAP calculation
 sce <- runUMAP(sce, dimred = "Scanorama_PCA", n_dimred = 50, ncomponents = 2, name = "UMAP_on_Scanorama")
+
+plotReducedDim(sce, dimred = "UMAP_on_Scanorama", colour_by = "sample", point_size = 0.6) + ggplot2::ggtitle(label = "UMAP_on_Scanorama")
 ```
 
-INTEG_R6:
+## Overview all methods
+
+Now we will plot UMAPS with all three integration methods side by side.
+
 
 ```{r}
 #| fig-height: 8
@@ -241,7 +261,6 @@ wrap_plots(p1, p2, p3, p4, nrow = 2) +
     plot_layout(guides = "collect")
 ```
 
-INTEG_R7:
 
 {{< meta int_save >}}
 
diff --git a/labs/seurat/seurat_01_qc.qmd b/labs/seurat/seurat_01_qc.qmd
index d79417c8..beacc808 100644
--- a/labs/seurat/seurat_01_qc.qmd
+++ b/labs/seurat/seurat_01_qc.qmd
@@ -287,7 +287,7 @@ data.filt <- data.filt[!grepl("^MT-", rownames(data.filt)), ]
 # data.filt <- data.filt[ ! grepl("^RP[SL]", rownames(data.filt)), ]
 
 # Filter Hemoglobin gene (optional if that is a problem on your data)
-data.filt <- data.filt[!grepl("^HB[^(PES)]", rownames(data.filt)), ]
+# data.filt <- data.filt[!grepl("^HB[^(P|E|S)]", rownames(data.filt)), ]
 
 dim(data.filt)
 ```
diff --git a/labs/seurat/seurat_05_dge.qmd b/labs/seurat/seurat_05_dge.qmd
index d43fbd83..59361fbc 100644
--- a/labs/seurat/seurat_05_dge.qmd
+++ b/labs/seurat/seurat_05_dge.qmd
@@ -422,8 +422,8 @@ nPatient <- sapply(unique(cell_selection$orig.ident), function(x) {
     rowSums(cell_selection@assays$RNA@counts[, cell_selection$orig.ident
     == x] > 2)
 })
-nCovid <- rowSums(nPatient[, 1:3] > 2)
-nCtrl <- rowSums(nPatient[, 4:6] > 2)
+nCovid <- rowSums(nPatient[, 1:4] > 2)
+nCtrl <- rowSums(nPatient[, 5:8] > 2)
 
 sel <- nCovid >= 2 | nCtrl >= 2
 cell_selection_sub <- cell_selection[sel, ]
@@ -502,7 +502,7 @@ top1 <- head(fcHurdle1[order(fcHurdle1$`Pr(>Chisq)`), ], 10)
 top1
 
 fcHurdle1$pval <- fcHurdle1$`Pr(>Chisq)`
-fcHurdle1$dir <- ifelse(fcHurdle1$z > 0, "up", "down")
+fcHurdle1$dir <- ifelse(fcHurdle1$z > 0, "Ctrl", "Covid")
 fcHurdle1 %>%
     group_by(dir) %>%
     top_n(-10, pval) %>%
@@ -518,7 +518,7 @@ top2 <- head(fcHurdle2[order(fcHurdle2$`Pr(>Chisq)`), ], 10)
 top2
 
 fcHurdle2$pval <- fcHurdle2$`Pr(>Chisq)`
-fcHurdle2$dir <- ifelse(fcHurdle2$z > 0, "up", "down")
+fcHurdle2$dir <- ifelse(fcHurdle2$z > 0, "Ctrl", "Covid")
 fcHurdle2 %>%
     group_by(dir) %>%
     top_n(-10, pval) %>%