-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscRNA-seq_Annotation.R
406 lines (304 loc) · 16.4 KB
/
scRNA-seq_Annotation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
############ Load in packages ###############
lapply(c("dplyr","Seurat","HGNChelper","openxlsx", "patchwork", "SingleR", "scGate", "plotly", "purrr", "tidyverse"), library, character.only = T)
############ load gene set preparation function ###############
# GNU General Public License v3.0 (https://github.com/IanevskiAleksandr/sc-type/blob/master/LICENSE)
# Written by Aleksandr Ianevski <[email protected]>, June 2021
#
# Functions on this page:
# gene_sets_prepare: prepare gene sets and calculate marker sensitivity from input Cell Type excel file
#
# @params: path_to_db_file - DB file with cell types
# @cell_type - cell type (e.g. Immune system, Liver, Pancreas, Kidney, Eye, Brain)
#
gene_sets_prepare <- function(path_to_db_file, cell_type){
cell_markers = openxlsx::read.xlsx(path_to_db_file)
cell_markers = cell_markers[cell_markers$tissueType == cell_type,]
cell_markers$geneSymbolmore1 = gsub(" ","",cell_markers$geneSymbolmore1); cell_markers$geneSymbolmore2 = gsub(" ","",cell_markers$geneSymbolmore2)
# correct gene symbols from the given DB (up-genes)
cell_markers$geneSymbolmore1 = sapply(1:nrow(cell_markers), function(i){
markers_all = gsub(" ", "", unlist(strsplit(cell_markers$geneSymbolmore1[i],",")))
markers_all = toupper(markers_all[markers_all != "NA" & markers_all != ""])
markers_all = sort(markers_all)
if(length(markers_all) > 0){
suppressMessages({markers_all = unique(na.omit(checkGeneSymbols(markers_all)$Suggested.Symbol))})
paste0(markers_all, collapse=",")
} else {
""
}
})
# correct gene symbols from the given DB (down-genes)
cell_markers$geneSymbolmore2 = sapply(1:nrow(cell_markers), function(i){
markers_all = gsub(" ", "", unlist(strsplit(cell_markers$geneSymbolmore2[i],",")))
markers_all = toupper(markers_all[markers_all != "NA" & markers_all != ""])
markers_all = sort(markers_all)
if(length(markers_all) > 0){
suppressMessages({markers_all = unique(na.omit(checkGeneSymbols(markers_all)$Suggested.Symbol))})
paste0(markers_all, collapse=",")
} else {
""
}
})
cell_markers$geneSymbolmore1 = gsub("///",",",cell_markers$geneSymbolmore1);cell_markers$geneSymbolmore1 = gsub(" ","",cell_markers$geneSymbolmore1)
cell_markers$geneSymbolmore2 = gsub("///",",",cell_markers$geneSymbolmore2);cell_markers$geneSymbolmore2 = gsub(" ","",cell_markers$geneSymbolmore2)
gs = lapply(1:nrow(cell_markers), function(j) gsub(" ","",unlist(strsplit(toString(cell_markers$geneSymbolmore1[j]),",")))); names(gs) = cell_markers$cellName
gs2 = lapply(1:nrow(cell_markers), function(j) gsub(" ","",unlist(strsplit(toString(cell_markers$geneSymbolmore2[j]),",")))); names(gs2) = cell_markers$cellName
list(gs_positive = gs, gs_negative = gs2)
}
############ load cell type annotation function ############
# GNU General Public License v3.0 (https://github.com/IanevskiAleksandr/sc-type/blob/master/LICENSE)
# Written by Aleksandr Ianevski <[email protected]>, June 2021
#
# Functions on this page:
# sctype_score: calculate ScType scores and assign cell types
#
# @params: scRNAseqData - input scRNA-seq matrix (rownames - genes, column names - cells),
# @params: scale - indicates whether the matrix is scaled (TRUE by default)
# @params: gs - list of gene sets positively expressed in the cell type
# @params: gs2 - list of gene sets that should not be expressed in the cell type (NULL if not applicable)
sctype_score <- function(scRNAseqData, scaled = !0, gs, gs2 = NULL, gene_names_to_uppercase = !0, ...){
# check input matrix
if(!is.matrix(scRNAseqData)){
warning("scRNAseqData doesn't seem to be a matrix")
} else {
if(sum(dim(scRNAseqData))==0){
warning("The dimension of input scRNAseqData matrix equals to 0, is it an empty matrix?")
}
}
# marker sensitivity
marker_stat = sort(table(unlist(gs)), decreasing = T);
marker_sensitivity = data.frame(score_marker_sensitivity = scales::rescale(as.numeric(marker_stat), to = c(0,1), from = c(length(gs),1)),
gene_ = names(marker_stat), stringsAsFactors = !1)
# convert gene names to Uppercase
if(gene_names_to_uppercase){
rownames(scRNAseqData) = toupper(rownames(scRNAseqData));
}
# subselect genes only found in data
names_gs_cp = names(gs); names_gs_2_cp = names(gs2);
gs = lapply(1:length(gs), function(d_){
GeneIndToKeep = rownames(scRNAseqData) %in% as.character(gs[[d_]]); rownames(scRNAseqData)[GeneIndToKeep]})
gs2 = lapply(1:length(gs2), function(d_){
GeneIndToKeep = rownames(scRNAseqData) %in% as.character(gs2[[d_]]); rownames(scRNAseqData)[GeneIndToKeep]})
names(gs) = names_gs_cp; names(gs2) = names_gs_2_cp;
cell_markers_genes_score = marker_sensitivity[marker_sensitivity$gene_ %in% unique(unlist(gs)),]
# z-scale if not
if(!scaled) Z <- t(scale(t(scRNAseqData))) else Z <- scRNAseqData
# multiple by marker sensitivity
for(jj in 1:nrow(cell_markers_genes_score)){
Z[cell_markers_genes_score[jj,"gene_"], ] = Z[cell_markers_genes_score[jj,"gene_"], ] * cell_markers_genes_score[jj, "score_marker_sensitivity"]
}
# subselect only with marker genes
Z = Z[unique(c(unlist(gs),unlist(gs2))), ]
# combine scores
es = do.call("rbind", lapply(names(gs), function(gss_){
sapply(1:ncol(Z), function(j) {
gs_z = Z[gs[[gss_]], j]; gz_2 = Z[gs2[[gss_]], j] * -1
sum_t1 = (sum(gs_z) / sqrt(length(gs_z))); sum_t2 = sum(gz_2) / sqrt(length(gz_2));
if(is.na(sum_t2)){
sum_t2 = 0;
}
sum_t1 + sum_t2
})
}))
dimnames(es) = list(names(gs), colnames(Z))
es.max <- es[!apply(is.na(es) | es == "", 1, all),] # remove na rows
es.max
}
############ User Input and Output ###############
# Input File
sc_RNAseq_dir = "single_cell_analysis_GSE205013/GSM6204109_P01/Data"
# Input DB and study tissue (for sc_Type package)
db_ <- "ScTypeDB_full.xlsx";
tissue <- "Immune system" # e.g. Immune system,Pancreas,Liver,Eye,Kidney,Brain,Lung,Adrenal,Heart,Intestine,Muscle,Placenta,Spleen,Stomach,Thymus
# Project name
Prj = "GSE205013"
# Output directory
dir = getwd()
# Output File 1 cell-level meta
meta_dir = "GSM6204109_P01_meta_data_combined"
# Output File 2 cluster level annotatiion
cluster_annotation_dir = "GSM6204109_P01_cluster_annotation_percentage"
################ Load in seurat object ##################
# Load the sc-RNAseq dataset
pdac.data <- Read10X(data.dir = sc_RNAseq_dir)
# Initialize the Seurat object with the raw (non-normalized data).
seu_obj <- CreateSeuratObject(counts = pdac.data, project = Prj, min.cells = 3, min.features = 200)
seu_obj
# The [[ operator can add columns to object metadata. This is a great place to stash QC stats
seu_obj[["percent.mt"]] <- PercentageFeatureSet(seu_obj, pattern = "^MT-")
# Visualize QC metrics as a violin plot
VlnPlot(seu_obj, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
# Normalizing the data
seu_obj <- NormalizeData(seu_obj, normalization.method = "LogNormalize", scale.factor = 10000)
seu_obj <- NormalizeData(seu_obj)
# Identification of highly variable features
seu_obj <- FindVariableFeatures(seu_obj, selection.method = "vst", nfeatures = 2000)
# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(seu_obj), 10)
# Scailing the data
all.genes <- rownames(seu_obj)
seu_obj <- ScaleData(seu_obj, features = all.genes)
# Linear dimensional reduction
seu_obj <- RunPCA(seu_obj, features = VariableFeatures(object = seu_obj))
# Examine and visualize seurat object results a few different ways
print(seu_obj[["pca"]], dims = 1:5, nfeatures = 5)
# Cluster the cells
seu_obj <- FindNeighbors(seu_obj, dims = 1:10)
seu_obj <- FindClusters(seu_obj, resolution = 0.5)
# Look at cluster IDs of the first 5 cells
head(Idents(seu_obj), 5)
# Non-linear dimensional reduction
seu_obj <- RunUMAP(seu_obj, dims = 1:10)
# find markers for every cluster compared to all remaining cells, report only the positive ones
seu_obj.markers <- FindAllMarkers(seu_obj, only.pos = TRUE)
seu_obj.markers %>%
group_by(cluster) %>%
dplyr::filter(avg_log2FC > 1)
################ scGate Annoatation ##################
# List of model names and corresponding scGate models
scGate_models_DB <- get_scGateDB()
models <- list(
CD4_TIL = scGate_models_DB$human$CD4_TIL,
CD8_TIL = scGate_models_DB$human$CD8_TIL,
generic = scGate_models_DB$human$generic,
HiTME = scGate_models_DB$human$HiTME,
PBMC = scGate_models_DB$human$PBMC,
TME_broad = scGate_models_DB$human$TME_broad,
TME_HiRes = scGate_models_DB$human$TME_HiRes
)
# Create a list to store new columns
new_columns_list <- list()
# Loop through each model
for (model_name in names(models)) {
# Get the current model
my_scGate_model <- models[[model_name]]
# Copy the original Seurat object to avoid modification
seu_obj_copy <- seu_obj
# Apply scGate to the copied Seurat object
seu_obj_processed <- scGate(seu_obj_copy, model = my_scGate_model, ncores = 4)
# Assign the processed Seurat object to a variable named according to the model name
assign(paste0("seu_obj_", model_name), seu_obj_processed)
# Extract metadata
meta_data <- [email protected]
##### Process "is.pure_" Columns #####
# Identify columns that start with "is.pure_"
pure_columns <- grep("^is.pure_", colnames(meta_data), value = TRUE)
# Initialize the new column to store the features
new_col_name <- paste0("scGate_", model_name, "_Celltype")
meta_data[[new_col_name]] <- NA
# Loop through each "is.pure_" column and update the new column
for (col in pure_columns) {
feature <- sub("^is.pure_", "", col)
pure_barcodes <- rownames(meta_data)[meta_data[[col]] == "Pure"]
for (barcode in pure_barcodes) {
if (is.na(meta_data[barcode, new_col_name])) {
meta_data[barcode, new_col_name] <- feature
} else {
meta_data[barcode, new_col_name] <- paste(meta_data[barcode, new_col_name], feature, sep = " ")
}
}
}
# Store the new column in the list
new_columns_list[[new_col_name]] <- meta_data[, new_col_name, drop = F]
##### Process "scGate_multi" Column #####
if ("scGate_multi" %in% colnames(meta_data)) {
multi_col_name <- paste0("scGate_multi_", model_name)
meta_data[[multi_col_name]] <- meta_data[["scGate_multi"]]
# Store the renamed "scGate_multi" column in the list
new_columns_list[[multi_col_name]] <- meta_data[, multi_col_name, drop = F]
}
# Save the metadata to a CSV file
write.csv(meta_data, file = paste0(dir, "/meta_data_", model_name, ".csv"))
View(meta_data)
}
# Combine the original metadata with the new columns
meta_data_combined <- [email protected]
for (new_col in new_columns_list) {
meta_data_combined <- merge(meta_data_combined, new_col, by = "row.names", all.x = TRUE)
rownames(meta_data_combined) <- meta_data_combined$Row.names
meta_data_combined$Row.names <- NULL
}
# Update the original Seurat object with the combined metadata
[email protected] <- meta_data_combined
View([email protected])
################ SingleR annotation ##################
# Automated annotation using SingleR
reference_HPCA <- celldex::HumanPrimaryCellAtlasData()
singleR_result_HPCA <- SingleR(test = seu_obj@assays$RNA$data, ref = reference_HPCA, labels = reference_HPCA$label.main)
seu_obj$SingleR.labels.HPCA <- singleR_result_HPCA$labels
DimPlot(seu_obj, reduction = "umap", group.by = "SingleR.labels.HPCA", label = T, label.size = 3) + ggtitle("SingleR annotation HPCA")
################# sc_Type annotation #################
# prepare gene sets
gs_list <- gene_sets_prepare(db_, tissue)
# check Seurat object version (scRNA-seq matrix extracted differently in Seurat v4/v5)
seurat_package_v5 <- isFALSE('counts' %in% names(attributes(seu_obj[["RNA"]])));
print(sprintf("Seurat object %s is used", ifelse(seurat_package_v5, "v5", "v4")))
# extract scaled scRNA-seq matrix
scRNAseqData_scaled <- if (seurat_package_v5) as.matrix(seu_obj[["RNA"]]$scale.data) else as.matrix(seu_obj[["RNA"]]@scale.data)
# run ScType
es.max <- sctype_score(scRNAseqData = scRNAseqData_scaled, scaled = TRUE, gs = gs_list$gs_positive, gs2 = gs_list$gs_negative)
# NOTE: scRNAseqData parameter should correspond to your input scRNA-seq matrix. For raw (unscaled) count matrix set scaled = FALSE
# When using Seurat, we use "RNA" slot with 'scale.data' by default. Please change "RNA" to "SCT" for sctransform-normalized data,
# or to "integrated" for joint dataset analysis. To apply sctype with unscaled data, use e.g. pbmc[["RNA"]]$counts or pbmc[["RNA"]]@counts, with scaled set to FALSE.
# merge by cluster
cL_resutls <- do.call("rbind", lapply(unique([email protected]$seurat_clusters), function(cl){
es.max.cl = sort(rowSums(es.max[ ,rownames([email protected][[email protected]$seurat_clusters==cl, ])]), decreasing = !0)
head(data.frame(cluster = cl, type = names(es.max.cl), scores = es.max.cl, ncells = sum([email protected]$seurat_clusters==cl)), 10)
}))
sctype_scores <- cL_resutls %>% group_by(cluster) %>% top_n(n = 1, wt = scores)
# set low-confident (low ScType score) clusters to "unknown"
sctype_scores$type[as.numeric(as.character(sctype_scores$scores)) < sctype_scores$ncells/4] <- "Unknown"
print(sctype_scores[,1:3])
[email protected]$sctype_classification = ""
for(j in unique(sctype_scores$cluster)){
cl_type = sctype_scores[sctype_scores$cluster==j,];
[email protected]$sctype_classification[[email protected]$seurat_clusters == j] = as.character(cl_type$type[1])
}
DimPlot(seu_obj, reduction = "umap", label = TRUE, repel = TRUE, group.by = 'sctype_classification')
################# find the percentage of each cell types in each clusters #####################
# List of cell type columns to analyze
cell_type_columns <- c(
"scGate_generic_Celltype", "scGate_HiTME_Celltype", "scGate_PBMC_Celltype",
"scGate_TME_broad_Celltype", "scGate_TME_HiRes_Celltype", "sctype_classification", "SingleR.labels.HPCA",
"scGate_CD4_TIL_Celltype", "scGate_CD8_TIL_Celltype"
)
# Initialize an empty list to store the results
cluster_percentage_list <- list()
# Loop through each cell type column
for (cell_type_col in cell_type_columns) {
# Generate a df with cluster and cell type information
cluster_celltype_df <- data.frame(
cluster = seu_obj$RNA_snn_res.0.5,
cell_type = seu_obj[[cell_type_col]]
)
cluster_celltype_df
# Rename the cell_type column if it's not named correctly
colnames(cluster_celltype_df)[2] <- "cell_type"
# Convert NA values to a string so they are treated as a category
cluster_celltype_df$cell_type <- ifelse(is.na(cluster_celltype_df$cell_type), "NA", cluster_celltype_df$cell_type)
# Generate a table of cluster vs cell type counts
cluster_celltype_count <- table(cluster_celltype_df$cluster, cluster_celltype_df$cell_type)
# Convert counts to percentages
cluster_celltype_percentage <- prop.table(cluster_celltype_count, margin = 1) * 100
# Convert to df for easier viewing
cluster_celltype_df <- as.data.frame(cluster_celltype_percentage)
colnames(cluster_celltype_df) <- c('cluster', 'cell_type', 'percentage')
# Cell type percentage grouped by cluster
cluster_percentage_strings <- cluster_celltype_df %>%
group_by(cluster) %>%
arrange(desc(percentage)) %>%
summarise(!!paste0(cell_type_col, "_percentage") := paste0(cell_type, "(", round(percentage, 2), "%)", collapse = " "))
# Store the result in the list
cluster_percentage_list[[cell_type_col]] <- cluster_percentage_strings
}
# Merge all the results into a single data frame
cluster_percentage_strings_combined <- purrr::reduce(cluster_percentage_list, full_join, by = "cluster")
# View the final combined data frame
View(cluster_percentage_strings_combined)
# Generate cluster level annotation
write.csv(cluster_percentage_strings_combined, file = paste0(dir, "/", cluster_annotation_dir, ".csv"))
# Generate the combined metadata to a csv file
write.csv([email protected], file = paste0(dir, "/", meta_dir, ".csv"))
# ##### Generate Umap #####
# # Metadata column to group
# group_col = "scGate_multi_generic"
# DimPlot(seu_obj, reduction = 'umap', group.by = group_col, label = T, label.size = 3) + ggtitle(group_col)