-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTraining_Day2.R
140 lines (111 loc) · 6.55 KB
/
Training_Day2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
### Training Day 2 ###
## By Cole Nawrocki ##
## Topic: Seurat Workflow Part 2 -- Adding QC and finishing the process ##
# Load packages. To install scDblFinder, see Training_Day3.R, which is on
# packages and package managers.
library(Seurat)
library(SeuratDisk)
library(Matrix)
library(scDblFinder)
# Set the objects to be the older, simpler version.
options(Seurat.object.assay.version = "v3")
# Read in data
cts <- Read10X("/Users/cnawrocki/Library/CloudStorage/OneDrive-UniversityofVirginia/Grainger-Lab/Bioinformatics/Learning-Resources/Example-Data/filtered_feature_bc_matrix")
cts <- cts[[1]]
# Last time, we didn't do QC. Normally, you want to.
# You want to filter out outlier cells and genes. You also want to detect possible doublets (two cells mistaken as one).
# For a long time, people would just make the Seurat object, look at violin plots and set arbitrary cutoffs.
# Now people use the MAD statistic to set the cutoffs.
obj <- CreateSeuratObject(counts = cts, project = "Training2")
# Looking at some QC features. People used to just eyeball it and cut it at, say, 15000 for nCount_RNA and 4000 for nFeature_RNA.
VlnPlot(obj, features = c("nCount_RNA", "nFeature_RNA"))
# First, lets calculate the necessary metrics for our new methods. I got these from Single Cell Best Practices.
obj$pct_mt <- PercentageFeatureSet(obj, pattern = "^MT")
top20 <- names(sort(rowSums(obj@assays$RNA@counts), decreasing = T))[1:20]
obj$pct_counts_inTop20_genes <- PercentageFeatureSet(obj, features = top20)
obj$log1p_nCount_RNA <- log1p(obj$nCount_RNA)
obj$log1p_nFeature_RNA <- log1p(obj$nFeature_RNA)
# Identifying possible outliers. We are using MAD > 5.
# A note on for loops. If you aren't looping through a ton of things, they are fine. Don't listen to the STAT department.
outlier_df <- data.frame(row.names = rownames([email protected]))
for (metric in c("pct_counts_inTop20_genes", "log1p_nCount_RNA", "log1p_nFeature_RNA", "pct_mt")) {
M <- [email protected][,c(metric)]
names(M) <- rownames([email protected])
mean_abs_dev <- mad(M)
outlier <- (M < median(M)-5*mean_abs_dev) | (M > median(M)+5*mean_abs_dev)
print(sum(outlier))
outlier_df[[metric]] <- outlier
}
# Filtering out the outliers.
obj <- AddMetaData(obj, rowSums(outlier_df) > 0, col.name = "outlier_status")
obj <- subset(obj, outlier_status == FALSE)
VlnPlot(obj, features = c("pct_counts_inTop20_genes", "nCount_RNA", "nFeature_RNA", "pct_mt"), ncol = 4)
# Doing doublet detection. We will decide if they need to be removed later.
doublets <- scDblFinder(obj@assays$RNA@counts)
obj <- AddMetaData(obj, metadata = doublets@colData$scDblFinder.class, col.name = "multiplet_class")
# Same steps as last week.
obj <- NormalizeData(obj, normalization.method = "LogNormalize")
obj <- FindVariableFeatures(obj, method = "vst", nfeatures = 2000)
FeatureScatter(obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
FeatureScatter(obj, feature1 = "nCount_RNA", feature2 = "pct_mt")
FeatureScatter(obj, feature1 = "nCount_RNA", feature2 = "pct_counts_inTop20_genes")
options(future.globals.maxSize = 2e9)
plan(strategy = "multisession", workers=4)
# If you see a variable that is positively correlated with nCount_RNA, then it
# makes sense to regress it out. nFeature_RNA should be positively correlated
# with nCount_RNA though (this makes intuitive sense), and should not be
# regressed out, typically. We will regress it out here just for example's sake
# (so we can get used to working with the functions).
obj <- ScaleData(obj, vars.to.regress = "nFeature_RNA")
obj <- RunPCA(obj, npcs = 50)
ElbowPlot(obj, ndims = 50)
obj <- RunUMAP(obj, dims = 1:20)
plan(strategy = "sequential")
# Should we remove doublets? They all sort of cluster together, so YES.
DimPlot(obj, group.by = "multiplet_class")
# Remove them and redo the previous steps.
obj <- subset(obj, multiplet_class == "singlet")
obj <- NormalizeData(obj, normalization.method = "LogNormalize")
obj <- FindVariableFeatures(obj, method = "vst", nfeatures = 2000)
FeatureScatter(obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
FeatureScatter(obj, feature1 = "nCount_RNA", feature2 = "pct_mt")
FeatureScatter(obj, feature1 = "nCount_RNA", feature2 = "pct_counts_inTop20_genes")
options(future.globals.maxSize = 2e9)
plan(strategy = "multisession", workers=4)
obj <- ScaleData(obj, vars.to.regress = "nFeature_RNA")
obj <- RunPCA(obj, npcs = 50)
ElbowPlot(obj, ndims = 50)
obj <- RunUMAP(obj, dims = 1:20)
plan(strategy = "sequential")
# Now the data looks like this.
DimPlot(obj, group.by = "multiplet_class")
# New steps.
# Finding neighbors.
obj <- FindNeighbors(obj, dims = 1:20)
# Clustering. Can do this iteratively. We are using the "Louvain" algorithm. The "Leiden" algorithm is good too.
obj <- FindClusters(obj, algorithm = 1, resolution = seq(0.5,3,0.5))
for (i in seq(0.5,3,0.5)) {
print(DimPlot(obj, group.by = paste("RNA_snn_res", i, sep = "."), label = T))
}
# Resolution of 1 looks good. This is sort of an arbitrary value though.
# What I find best is to look for a marker gene using FeaturePlot.
# Pick the resolution that confines the marker gene to a cluster.
obj$seurat_clusters <- obj$RNA_snn_res.1
ob <- SetIdent(obj, value = "seurat_clusters")
DimPlot(obj)
# Finding markers. We will discuss the arguments at a later date.
de <- FindAllMarkers(obj, logfc.threshold = 1, test.use = "wilcox", only.pos = T)
FeaturePlot(obj, "CD14", label = T)
# Saving the data.
# As .h5Seurat (Need SeuratDisk package, but more reliable, apparently)
SaveH5Seurat(obj, "/Users/cnawrocki/Library/CloudStorage/OneDrive-UniversityofVirginia/Grainger-Lab/Bioinformatics/Learning-Resources/Example-Data/Training2.h5Seurat")
# As .Rds (Don't need SeuratDisk package)
SaveSeuratRds(obj, "/Users/cnawrocki/Library/CloudStorage/OneDrive-UniversityofVirginia/Grainger-Lab/Bioinformatics/Learning-Resources/Example-Data/Training2.Rds")
# As .h5ad (AnnData format for opening in Scanpy)
SeuratDisk::Convert("/Users/cnawrocki/Library/CloudStorage/OneDrive-UniversityofVirginia/Grainger-Lab/Bioinformatics/Learning-Resources/Example-Data/Training2.h5Seurat",
"/Users/cnawrocki/Library/CloudStorage/OneDrive-UniversityofVirginia/Grainger-Lab/Bioinformatics/Learning-Resources/Example-Data/Training2.h5ad")
# Loading the data.
# From .h5Seurat
obj_test1 <- LoadH5Seurat("/Users/cnawrocki/Library/CloudStorage/OneDrive-UniversityofVirginia/Grainger-Lab/Bioinformatics/Learning-Resources/Example-Data/Training2.h5Seurat")
# From .Rds
obj_test2 <- LoadSeuratRds("/Users/cnawrocki/Library/CloudStorage/OneDrive-UniversityofVirginia/Grainger-Lab/Bioinformatics/Learning-Resources/Example-Data/Training2.Rds")