-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgs_first.R
1148 lines (972 loc) · 47 KB
/
gs_first.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
## ==========================================================================
## Installing Libraries
## ==========================================================================
##
## Install Several Required packages:
install.packages(c("devtools", "gplots",
"ggplot2", "igraph",
"lattice", "knitr","rsample",
"RColorBrewer", "rmarkdown",
"stringr", "UpSetR", "vcfR",
"pcaPP", "SciViews", "tidyverse"))
## ---------------------------------------------------------------------------
## TCGAbiolinks package (from github):
devtools::install_github(repo = "BioinformaticsFMRP/TCGAbiolinks")
## ---------------------------------------------------------------------------
## Bioconductor packages: (R >= 3.5)
if(!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install(c("IRanges", "GenomicRanges", "GenomicFeatures",
"org.Hs.eg.db", "Rsamtools",
"SummarizedExperiment",
"TxDb.Hsapiens.UCSC.hg19.knownGene",
"TxDb.Hsapiens.UCSC.hg38.knownGene",
"VariantAnnotation"))
## ---------------------------------------------------------------------------
## Install ISMA - For Integrative retrieval
## and analysis of Mutations
install.packages("isma/isma_0.1.4.tar.gz", repos=NULL)
## ---------------------------------------------------------------------------
## Install mND - For Multi Network Diffusion
install.packages("mND/mND_0.1.7.tar.gz", repos = NULL)
## ---------------------------------------------------------------------------
## Get GeneSurrounder.R and calc_p.R from github and Source it
source("genesurrounder/GeneSurrounder.R")
source("genesurrounder/run_geneSurrounder.R")
source("mND/calc_p.R")
## ==========================================================================
## Loading Libraries
## ==========================================================================
library(isma)
library(mND)
library(limma)
library(igraph)
library(pcaPP)
library(SciViews)
library(tidyverse)
library(lubridate)
#library(rsample)
## ==========================================================================
## Get Data Inputs
## ==========================================================================
## Now we are ready to start our analysis
data(X0)
data(A)
## ==========================================================================
## First we Try mND Only - Without GeneSurrounder
## ==========================================================================
## Normalize the Adjacency Matrix
W <- normalize_adj_mat(A)
## Permute the Layers Matrix
X0_perm <- perm_X0(X0, r = 50, W, seed_n = 2)
## Perform Network Diffusion - Non-Windows
#Xs <- ND(X0_perm, W, cores = 2)
## Perform Network Diffusion - Windows
#Xs <- ND(X0_perm, W)
#saveRDS(Xs, "Data/Xs.rds")
data(Xs)
## Get Indices of Adjacent Neighbours
ind_adj <- neighbour_index(W)
## Perform mND considering sets of 3-neighbors - Non-Windows
mND_score <- mND(Xs, ind_adj, k = 3, cores = 2)
## Perform mND considering sets of 3-neighbors - Windows
mND_score <- mND(Xs, ind_adj, k = 3)
mND_score <- signif_assess(mND_score)
#saveRDS(mND_score, "Data/mND_scores.rds")
## ==========================================================================
## GeneSurrounder (Trial)
## ==========================================================================
## Preprocessing
ge <- X0[,2]
plot(sort(ge))
ggplot(data.frame(ge = ge), aes(ge)) + geom_density()
## Gives 438 genes, roughly 2 hours to run GeneSurrounder
ge_filtered <- ge[ge >= 10]
int_network <- graph_from_adjacency_matrix(A, mode = "undirected")
ge_dist <- calcAllPairsDistances(int_network, directionPaths = "all", networkName = "int_network")
ge_cor <- calcCorMatrix(data.frame(ge_filtered), corMethod = "pearson", exprName = "ge_filtered", useMethod = "everything")
diam <- diameter(int_network)
genes.assayedETnetwork <- intersect(rownames(ge_dist), rownames(ge_cor))
## Resampling
ge_resampled <- data.frame(matrix(ncol = length(ge_filtered), nrow = 1000))
colnames(ge_resampled) <- rownames(data.frame(ge_filtered))
set.seed(123)
for(i in 1:1000){
ge_resampled[i,] <- sample(ge_filtered, length(ge_filtered), replace = TRUE)
}
## GeneSurrounder
gs_results <- data.frame()
time <- vector()
for(i in 1:length(ge_filtered)){
start_time <- Sys.time()
print(paste("Run", i))
gs <- geneNIDG(distance.matrix = ge_dist,
cor.matrix = ge_cor,
geneStats.observed = ge_filtered,
perm.geneStats.matrix = as.matrix(ge_resampled),
genes.assayedETnetwork = genes.assayedETnetwork,
#> diameter >= 8 # diameter < 8 gives an
#> error due to geneid.d line 376 of GeneSurrounder.R
diameter = diam,
num.Sphere.resamples = 1000,
gene.id = rownames(data.frame(ge_filtered))[i]
)
print(gs[which.min(gs$p.Fisher),])
gs_results <- rbind(gs_results, gs[which.min(gs$p.Fisher),])
end_time <- Sys.time()
time[i] <- end_time - start_time
print(paste("Time:", time[i], ""))
}
gs_results <- gs_results %>% mutate(time = time)
write.csv(gs_results, "gs_result_1.csv")
ggplot(gs_results, aes(p.Sphere)) +
geom_density()
ggplot(gs_results, aes(p.Fisher)) +
geom_density()
ggplot(gs_results, aes(time)) +
geom_density()
## ==========================================================================
## GeneSurrounder on mND (Trial)
## ==========================================================================
mND_sig_genes <- rownames(mND_score$mND)[mND_score$mND$mNDp <= 0.05]
ge_mND_filtered <- ge[mND_sig_genes]
ge_mND_filtered <- ge_mND_filtered[ge_mND_filtered != 0]
ge_cor_mND <- calcCorMatrix(ge_mND_filtered, corMethod = "pearson", exprName = "ge_mND_filtered", useMethod = "everything")
ge_resampled_mND <- data.frame(matrix(ncol = length(ge_mND_filtered), nrow = 1000))
colnames(ge_resampled_mND) <- names(ge_mND_filtered)
set.seed(123)
for(i in 1:1000){
ge_resampled_mND[i,] <- sample(ge_mND_filtered, length(ge_mND_filtered), replace = TRUE)
}
gs_new_results <- run_geneSurrounder(distance.matrix = ge_dist,
cor.matrix = ge_cor_mND,
geneStats.observed = ge_mND_filtered,
perm.geneStats.matrix = as.matrix(ge_resampled_mND),
diameter = diam,
num.Sphere.resamples = 1000,
gene.id = names(ge_mND_filtered),
decay_only = TRUE
)
ggplot(gs_new_results, aes(p.Decay)) +
geom_density()
ggplot(gs_new_results, aes(radius)) +
geom_bar()
ggplot(gs_new_results, aes(time)) +
geom_density()
sum(gs_new_results$time)/3600
######### P values need to be based on ND score not mND (excluding top k neighbors)
NDp <- data.frame(calc_p(Xs))
NDp_filtered_genes <- rownames(NDp %>% filter(L1 <= 0.05 | L2 <= 0.05))
ge_NDp_filtered <- X0[NDp_filtered_genes,2]
ge_NDp_filtered <- ge_NDp_filtered[ge_NDp_filtered > 0]
ge_cor_NDp <- calcCorMatrix(ge_NDp_filtered, corMethod = "pearson", exprName = "ge_NDp_filtered", useMethod = "everything")
ge_resampled_NDp <- data.frame(matrix(ncol = length(ge_NDp_filtered), nrow = 1000))
colnames(ge_resampled_NDp) <- names(ge_NDp_filtered)
set.seed(123)
for(i in 1:1000){
ge_resampled_NDp[i,] <- sample(ge_NDp_filtered, length(ge_NDp_filtered), replace = TRUE)
}
gs_NDp_filtered_results <- run_geneSurrounder(distance.matrix = ge_dist,
cor.matrix = ge_cor_NDp,
geneStats.observed = ge_NDp_filtered,
perm.geneStats.matrix = as.matrix(ge_resampled_NDp),
diameter = diam,
num.Sphere.resamples = 1000,
gene.id = names(ge_NDp_filtered),
decay_only = TRUE
)
ggplot(gs_NDp_filtered_results, aes(p.Decay)) +
geom_density()
ggplot(gs_NDp_filtered_results, aes(radius)) +
geom_bar()
ggplot(gs_NDp_filtered_results, aes(time)) +
geom_density()
sum(gs_NDp_filtered_results$time)/60
### These results can then be used to adjust top k neighbors in mND
## ==========================================================================
## mND on GeneSurrounder
## ==========================================================================
## Strategy:
## 1. Run GeneSurrounder to obtain p-decay, p-sphere, p-fisher
## 2. Use one or more of the above to adjust the values of some layer.
## 3. Run mND on the new adjusted layers.
## 4. Save the Results & Compare with the previous results.
## Step 1: GeneSurrounder ##
##--------------------------------------##
#> Optional: Filter non zero values
#X0_new <- data.frame(X0) %>%
# mutate(sum = L1 + L2) %>%
# filter(sum > 0) %>%
# select(-sum)
#> Load in the Data
data(X0)
data(A)
#> Get a Correlation Matrix (Required to run GS)
ge <- X0[,2]
ge_no_0 <- data.frame(ge[ge > 0])
ge_cor <- calcCorMatrix(ge_no_0, corMethod = "pearson", exprName = "ge_no_0",
useMethod = "everything")
#> Get a resampled GE matrix (Required to run GS)
ge_resampled <- data.frame(matrix(ncol = length(ge[ge > 0]), nrow = 1000))
colnames(ge_resampled) <- names(ge[ge > 0])
set.seed(123)
for(i in 1:1000){
ge_resampled[i,] <- sample(ge[ge > 0], length(ge[ge > 0]), replace = TRUE)
}
#> Get the interaction network & distance matrix (Required to run GS)
int_network <- graph_from_adjacency_matrix(A, mode = "undirected")
ge_dist <- calcAllPairsDistances(int_network, directionPaths = "all",
networkName = "int_network")
#> Run Gene Surrounder
#> Takes as parameters:
#> 1. Genes distance matrix
#> 2. GE correlation matrix
#> 3. GE layer with ge > 0
#> 4. Resampled GE matrix
#> 5. Gene IDs of > 0 GE
#> 6. Output file name
gs_results_all <- run_geneSurrounder(distance.matrix = ge_dist,
cor.matrix = ge_cor,
geneStats.observed = ge[ge > 0],
perm.geneStats.matrix = as.matrix(ge_resampled),
diameter = diam,
num.Sphere.resamples = 1000,
gene.id = names(ge[ge > 0]), #subset for chunks
decay_only = TRUE,
file_name = "gs_results_all.csv"
) # Took too much time, ran in chunks
#> Save Results to CSV
## write.csv(gs_results_all, "Data/gs_results_all.csv")
##---------------------------------------------------------------##
## Step 2: mND on GeneSurrounder - Adjusted scores ##
##---------------------------------------------------------------##
## 1. Read the results of Gene Surrounder
gs_results_all <- read.csv("Data/gs_results_all.csv") %>% mutate(time = duration(time)) %>% select(-X)
rownames(gs_results_all) <- gs_results_all$gene.id
## 2. Adjust the Scores of one Layer
## Adjusted score = -log10(p.Decay)
X0_gs_adjusted <- data.frame(X0) %>%
rownames_to_column('rn') %>%
mutate(L2 = if_else(rn %in% gs_results_all$gene.id, -log10(gs_results_all[rn,]$p.Decay)*L2, L2)) %>%
column_to_rownames('rn')
X0_gs_adjusted <- as.matrix(X0_gs_adjusted)
#> Save Results to RDS
## saveRDS(X0_gs_adjusted, "Data/X0_gs_adjusted.rds")
#> Load the Results
data(X0)
X0_gs_adjusted <- readRDS("Data/X0_gs_adjusted.rds")
## 3. Visualize the difference between
## the two layers: old & adjusted
## Plot Results with Density plot
ggplot(data.frame(X0), aes(L2)) + geom_density()
ggplot(data.frame(X0_gs_adjusted), aes(L2)) + geom_density()
## 4. Run mND on Adjusted Values
## take different k-values to run mND
data(A)
W <- normalize_adj_mat(A)
ind_adj <- neighbour_index(W)
X0_perm_new <- perm_X0(X0_gs_adjusted, r = 50, W, seed_n = 2)
Xs_new <- ND(X0_perm_new, W, cores = 2)
#saveRDS(Xs_new, "Data/Xs_new.rds")
mND_score_new_k2 <- mND(Xs_new, ind_adj, k=2, cores = 2)
#mND_score_new <- mND(Xs_new, ind_adj, k=3, cores = 2)
mND_score_new_k2 <- signif_assess(mND_score_new_k2)
#mND_score_new <- signif_assess(mND_score_new)
## 5. Save Results in RDS
#saveRDS(mND_score_new, "Data/mND_gs_adjusted_scores.rds")
#saveRDS(mND_score_new_k2, "Data/mND_gs_adjusted_scores_k2.rds")
## ==========================================================================
## Getting the Results
## ==========================================================================
## Load Data
#> Adjacency Matrix
data(A)
## Normalized AM
W <- normalize_adj_mat(A)
ind_adj <- neighbour_index(W)
## Matrices with Original Values
data(X0)
data(Xs)
## Matrices with Adjusted Values
X0_gs_adjusted <- readRDS("Data/X0_gs_adjusted.rds")
Xs_new <- readRDS("Data/Xs_new.rds")
## mND Scores Results
mND_score <- readRDS("Data/mND_scores.rds")
#mND_score_new_k2 <- readRDS("Data/mND_gs_adjusted_scores_k2.rds")
mND_score_new <- readRDS("Data/mND_gs_adjusted_scores.rds")
##### mND Alone results #####
# H1: genes with a mutation frequency greater than zero;
# H2: top 1200 differentially expressed genes (FDR < 10^-7).
# Further, we set the cardinalities of gene sets N1 and N2,
# containing the genes with the highest scoring neighborhoods,
# as |H1|=|N1| and |H2|=|N2|
Hl <- list(l1 = rownames(X0[X0[,1]>0,]),
l2 = names(X0[order(X0[,2], decreasing = T),2][1:1200])
)
top_Nl <- unlist(lapply(Hl, function(x) length(x)))
top_Nl
class_res <- classification(mND_score, X0, Hl, top = top_Nl)
## Classification of genes in every layer
head(class_res$gene_class)
## Occurrence of (M; L; I; NS)
## for each gene across layers
head(class_res$occ_labels)
## Plot the results of mND Score
plot_results(mND_score, class_res, W, n = 150, directory = "Results/mND_results/")
## Optimizing k (Mac only)
k_val <- seq(1,6,1)
k_results <- optimize_k(Xs, X0, k_val, ind_adj, W, top = 200, cores = 2)
k_results <- data.frame(k_results)
colnames(k_results) <- k_val
## Save Results to CSV
#> can also be loaded through data(k_results) but is a list not data.frame
# write.csv(k_results, "Data/k_results.csv")
## New Results -- GS Adjusted mND Results ##
## -------------------------------------- ##
#H1: genes with a mutation frequency greater than zero;
#H2: top 1200 differentially expressed genes (FDR < 10^-7).
#Further, we set the cardinalities of gene sets N1 and N2,
# containing the genes with the highest scoring neighborhoods,
# as |H1|=|N1| and |H2|=|N2|
Hl_new <- list(l1 = rownames(X0[X0[,1]>0,]),
l2 = names(X0_gs_adjusted[order(X0_gs_adjusted[,2], decreasing = T),2][1:1200])
)
top_Nl_new <- unlist(lapply(Hl_new, function(x) length(x)))
top_Nl_new
class_res_new <- classification(mND_score_new, X0_gs_adjusted, Hl_new, top = top_Nl_new)
## Classification of genes in every layer
head(class_res_new$gene_class)
## Occurrence of (M; L; I; NS) for each gene across layers
head(class_res_new$occ_labels)
## Plotting new mND Scores Results
plot_results(mND_score_new, class_res_new, W, n = 150, directory = "Results/mND_results_new_k2/")
#plot_results(mND_score_new, class_res_new, W, n = 150, directory = "Results/mND_results_new/")
## Optimizing k (Mac only)
k_results_new <- optimize_k(Xs_new, X0_gs_adjusted, k_val, ind_adj, W, top = 200, cores = 2)
k_results_new <- data.frame(k_results_new)
colnames(k_results_new) <- k_val
## Saving Results
#write.csv(k_results_new, "Data/k_results_new.csv")
#> k = 2 seems like a better choice in this case
## ==========================================================================
## Analysis of Results - Ghadi
## A. Classification Change
## B. Network Visualization
## ==========================================================================
## A. Discussing Classification Change
## % Label Change
## -----------------------------------
## Sanity check:
#> Get new Classes - new Results
class_res_new$gene_class <- class_res_new$gene_class[match(rownames(class_res$gene_class), rownames(class_res_new$gene_class)),]
#> Results for Somatic Mutations Layer
sum(class_res_new$gene_class[,1] != class_res$gene_class[,1])
shift_sm <- table(mND = class_res$gene_class[,1], GS_adjusted_mND = class_res_new$gene_class[,1])
#> Results for Gene Expression Adjusted Layer
sum(class_res_new$gene_class[,2] != class_res$gene_class[,2])
shift_ge <- table(mND = class_res$gene_class[,2], GS_adjusted_mND = class_res_new$gene_class[,2])
## Confusion Matrix Percentages over original mND genes
results_ge_mND <- shift_ge
for(i in 1:nrow(shift_ge)){
results_ge_mND[i,] <- (shift_ge[i,]/sum(shift_ge[i,]))*100
}
results_ge_mND
## Total selected mND genes (I, L, M)
11796 - sum(shift_ge[4,]) #2297 k = 3
## Percentages over GS_adjusted genes
results_ge_GS_adjusted <- shift_ge
for(i in 1:ncol(shift_ge)){
results_ge_GS_adjusted[,i] <- (shift_ge[,i]/sum(shift_ge[,i]))*100
}
results_ge_GS_adjusted
## Total selected GS-adjusted mND genes (I, L, M)
#> 2109 k = 3
11796 - sum(shift_ge[,4])
#> Difference btween k=3 and k=2
#> -188 genes k = 3, -205 k = 2
11796 - sum(shift_ge[,4]) - (11796 - sum(shift_ge[4,]))
## Difference in M
## GS-adjusted (314 with k = 3)
## --> +37 modules
sum(shift_ge[,3])
## Percent over all genes
(shift_ge/11796)*100
#> Analysis of previous results:
#>
## ---------------------------------------------------------------
## B. Network Visualization
## ------------------------------------
library(visNetwork)
library(RCy3)
ls("package:igraph", pattern = "^layout_.")
## mND graph
genes_subset_mND <- class_res$gene_class %>% filter(L2 != "NS")
A_subset_mND <- A[rownames(A) %in% rownames(genes_subset_mND),
colnames(A) %in% rownames(genes_subset_mND)]
graph_mND <- graph_from_adjacency_matrix(A_subset_mND) %>%
set_vertex_attr("class", value = genes_subset_mND[,2])
vis_mND <- toVisNetworkData(graph_mND)
visNetwork(nodes = vis_mND$nodes, edges = vis_mND$edges) %>%
visIgraphLayout(layout = "layout_") %>%
visOptions(nodesIdSelection = TRUE, selectedBy = "class") %>%
visNodes(color = vis_mND$nodes$class)
#> Trying Cytoscape: Took a loong time
# cytoscapePing()
# createNetworkFromIgraph(graph_mND,new.title='mND')
## 1 = Isolated, 2 = Linker, 3 = Module
## GS-adjusted mND graph
genes_subset_adjusted_mND <- class_res_new$gene_class %>% filter(L2 != "NS")
A_subset_adjusted_mND <- A[rownames(A) %in% rownames(genes_subset_adjusted_mND),
colnames(A) %in% rownames(genes_subset_adjusted_mND)]
graph_adjusted_mND <- graph_from_adjacency_matrix(A_subset_adjusted_mND) %>%
set_vertex_attr("class", value = genes_subset_adjusted_mND[,2])
vis_adjusted_mND <- toVisNetworkData(graph_adjusted_mND)
visNetwork(nodes = vis_adjusted_mND$nodes, edges = vis_adjusted_mND$edges) %>%
visIgraphLayout(layout = "layout_in_circle") %>%
visOptions(nodesIdSelection = TRUE, selectedBy = "class")
## ==========================================================================
## Analysis of Results - Bilal
## A. Checking Old vs New Genes Count
## B. Introducing Cumulative Decay Score
## ==========================================================================
## A. Checking Old vs New Genes Count
## ----------------------------------
## Load Data
#> Adjacency Matrix
data(A)
## Normalized AM
W <- normalize_adj_mat(A)
ind_adj <- neighbour_index(W)
## Matrices with Original Values
data(X0)
data(Xs)
## Matrices with Adjusted Values
X0_gs_adjusted <- readRDS("Data/X0_gs_adjusted.rds")
Xs_new <- readRDS("Data/Xs_new.rds")
## mND Scores Results
mND_score <- readRDS("Data/mND_scores.rds")
mND_score_new_k2 <- readRDS("Data/mND_gs_adjusted_scores_k2.rds")
mND_score_new <- readRDS("Data/mND_gs_adjusted_scores.rds")
## Get new and old genes
new_genes = rownames(mND_score_new$mND)[which(mND_score_new$mND$mNDp < 0.05)]
length(new_genes)
old_genes = rownames(mND_score$mND)[which(mND_score$mND$mNDp < 0.05)]
length(old_genes)
## Ratio - we have a 7% change in the genes
## Ghadi: " 7.7% of the total genes were additionally
## significant with a 0.05 mNDp threshold "
sum(!(new_genes %in% old_genes))/nrow(mND_score$mND)
# Ratio would be length(new_genes)/length(old_genes)
# (diff lengths means hard to get a single percentage to reflect change)
(length(new_genes)-length(old_genes))/length(new_genes)
## B. Introducing Cumulative Decay Score
## -------------------------------------
## Load Classificaiton of Old vs New Genes
data(X0)
Hl_old <- list(l1 = rownames(X0[X0[,1]>0,]),
l2 = names(X0[order(X0[,2], decreasing = T),2][1:1200])
)
top_Nl_old <- unlist(lapply(Hl_old, function(x) length(x)))
top_Nl_old
class_res_old <- classification(mND_score, X0, Hl_old, top = top_Nl_old)
head(class_res_old$gene_class)
Hl_new <- list(l1 = rownames(X0_gs_adjusted[X0_gs_adjusted[,1]>0,]),
l2 = names(X0_gs_adjusted[order(X0_gs_adjusted[,2], decreasing = T),2][1:1200])
)
top_Nl_new <- unlist(lapply(Hl_new, function(x) length(x)))
top_Nl_new
class_res_new <- classification(mND_score_new, X0_gs_adjusted, Hl_new, top = top_Nl_new)
#class_res_new <- classification(mND_score_new_k2, X0_gs_adjusted, Hl_new, top = top_Nl_new)
head(class_res_new$gene_class)
## After obtaining the classification of each gene
## we wish to check the cumulative_decay_score on target genes
## Target Genes are new genes that are obtained
## Cumulative Decay Score = GE_neighbor/d_neighbor_target
## Stategy:
## For ever node in the graph:
## for all modules in the list of old modules:
## distance = get_distance (old_module, node)
## score[node] += Measure(old_module)/distance
##------------------------------------------------
## Now, this Measure could be taken as GE
## or the adjusted -log_10(p-value)
##-------------------------------------------------
## 1. First lets convert the Adjacency Matrix to Distance Matrix
## install.packages("netmeta")
library(netmeta)
int_network <- graph_from_adjacency_matrix(A, mode = "undirected")
ge_dist <- calcAllPairsDistances(int_network, directionPaths = "all", networkName = "int_network")
## 2. Second, lets get the decay effect of each gene from gs_results_all
gs_results_all <- read.csv("Data/gs_results_all.csv")
## 3. Get List of all old nodes classification
old_class = class_res_old$gene_class[,2]
## 4. Get list of all new nodes classification
new_class = class_res_new$gene_class[,2]
## 5. Remember - adjusted scores of genes
X0_gs_adjusted
## 6. initialize empty vector
results_c_decay_score = rep(0,nrow(ge_dist))
new_module = rep(0,nrow(ge_dist))
## 7. Get Cumulative Decay Score for each Gene
for(i in seq(1,nrow(gs_results_all))){
## Gene Attributes
gene = gs_results_all[i,]
g_id = gene$gene.id
g_rd = gene$radius
g_old_class = old_class[g_id]
g_new_class = new_class[g_id]
## Get the measure of the module node
## we choose measyre as -log_10_pvalue
## Other choice is original GE
## measure = X0_gs_adjusted[i,2]
measure <- -log10(gene$p.Decay)
#> print(measure)
## If the gene is an old module
#> then we add its affect to neighboring (d<r) nodes
if(g_old_class == "M"){
## Traverse other genes in the surrounding radius
## and update their cumulative_decay_score
for(j in seq(1, nrow(ge_dist))){
if (j == i){
next
}
## Get distance between the two nodes
g_dist = ge_dist[i,j]
if (g_dist <= g_rd){
## Update cumulative score of gene_j
results_c_decay_score[j] = as.numeric(results_c_decay_score[j]) + measure/g_dist
}
}
}
## If not old module:
#> check if it is a new module,
#> if yes put 1 in the vector
else{
if (g_new_class == "M"){
new_module[i] = 1
}
}
}
## We have 69 new modules - validate this
## Ghadi: 72 with k = 2, 69 with k = 3
sum(new_module==1)
## How many modules are in new but are not in old
sum(class_res_old$gene_class[,2]=="M")
sum(class_res_new$gene_class[,2]=="M")
## Get Module Genes in our method: GS + mND
new_mdg = rownames(class_res_new$gene_class)[class_res_new$gene_class[,2] == "M"]
## Get Module Genes in 1st method: Only mND
old_mdg = rownames(class_res_old$gene_class)[class_res_old$gene_class[,2] == "M"]
#> Count Common Module Genes in our Method
sum(new_mdg %in% old_mdg)
#> Count Unique Module Genes in both Method
## 32 (k=2) modules discarded by GS
sum(!(old_mdg %in% new_mdg))
## Ghadi: 72 with k = 2, 69 k = 3
sum(!(new_mdg %in% old_mdg))
## Get names of unique genes (distinct module genes: mdg)
distinct_mdg = new_mdg[which(!(new_mdg %in% old_mdg))]
discarded_mdg <- new_mdg[which(!(old_mdg %in% new_mdg))]
## Get Indices of unique genes
dmdg_indices = which(rownames(X0_gs_adjusted)%in%distinct_mdg)
discarded_mdg_indices <- which(rownames(X0_gs_adjusted)%in%discarded_mdg)
## Mean Results of Decay Score in 1st method vs our method
mean(results_c_decay_score[dmdg_indices])
mean(results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%distinct_mdg))])
mean(results_c_decay_score[discarded_mdg_indices])
mean(results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%discarded_mdg))])
## Distinct Density Plots
ggplot(as.data.frame(as.matrix(results_c_decay_score[dmdg_indices])), aes(V1)) + geom_density()
ggplot(as.data.frame(as.matrix(results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%distinct_mdg))])), aes(V1)) + geom_density()
## Overlaying the Density Plots
#> Distinct
a = data.frame(cumulative_decay_score = results_c_decay_score[dmdg_indices])
b = data.frame(y = results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%distinct_mdg))])
ggplot() +
geom_density(data = a, aes(x = cumulative_decay_score),
fill = "#DAF7A6", color = "black", alpha = 0.7) +
geom_density(data = b, aes(x = y),
fill = "#C70039", color = "black", alpha = 0.7)
#> Discarded
x = data.frame(cumulative_decay_score = results_c_decay_score[discarded_mdg_indices])
y = data.frame(y = results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%discarded_mdg))])
ggplot() +
geom_density(data = x, aes(x = cumulative_decay_score),
fill = "#DAF7A6", color = "black", alpha = 0.7) +
geom_density(data = y, aes(x = y),
fill = "#C70039", color = "black", alpha = 0.7)
## Next: Frederick Abi Chahine
## Significance Test between two vectors which are (distinct):
## 1. results_c_decay_score[dmdg_indices]
## 2. results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%distinct_mdg))]
## ----------------------------------------------------------------------------
vector1 <- results_c_decay_score[dmdg_indices]
vector2 <- results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%distinct_mdg))]
## Comparing Variances:
#Using var.test() in order to compare the variances
#Running a two-sided variance test on both vectors with alpha = 5% (0.05) and null hypothesis (H0) -> variances are equal
#ratio (default is 1) is the expected ratio for H0, and the alternative hypothesis (H1) displays the other side (default is two.sided)
var.test(x = vector1, y = vector2, ratio = 1, alternative = c("two.sided", "less", "greater"), conf.level = 0.95)
#we obtain a p-value = 0.002163 with a cut-off of 0.05 which is statistically significant
#we obtain a ratio = 0.554 for x/y; This means that the variance of the distinctive genes that were selected as new modules was significantly lower than the old modules for the calculated cumulative decay score
#the 95% confidence interval is [0.406, 0.801]
## Comparing Means:
#Using t.test() in order to compare the means
#mu is either the population's expected mean or the expected difference in the means of the two samples
#mu = 0 as the null hypothesis.
#paired indicates if the data is paired or not
#var.equal indicates if both the samples are treated as equal or not equal w.r.t their variances
#the above var.equal would be based on the var.test done before (in our case = FALSE)
#since our data is unpaired => we perform the following comparison:
t.test(x = vector1, y = vector2, alternative = "two.sided", mu = 0, paired = FALSE, var.equal = FALSE, conf.level = 0.95)
#we obtain a p-value = 5.896e-09 with a cut-off of 0.05 which is statistically significant
#the 95% confidence interval is [29.82, 55.46]
#the means of vector1 and vector2 are 168.97 and 126.33 respectively.
## Next:
## Significance Test between the other two vectors which are (discarded):
## 3. results_c_decay_score[discarded_mdg_indices]
## 4. results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%discarded_mdg))]
## ----------------------------------------------------------------------------
vector3 <- results_c_decay_score[discarded_mdg_indices]
vector4 <- results_c_decay_score[which(!(rownames(X0_gs_adjusted)%in%discarded_mdg))]
#Now we perform the exact same steps as before and analyze the output for the discarded genes...
## Comparing Variances:
var.test(x = vector3, y = vector4, ratio = 1, alternative = c("two.sided", "less", "greater"), conf.level = 0.95)
#we obtain a p-value = 0.08254 with a cut-off of 0.05 which implies that it is not statistically significant
#we obtain a ratio = 0.605 for x/y; This means that the variance of the discarded genes that were selected as new modules was not significantly lower than the old modules for the calculated cumulative decay score
#the 95% confidence interval is [0.388, 1.070]
## Comparing Means:
t.test(x = vector3, y = vector4, alternative = "two.sided", mu = 0, paired = FALSE, var.equal = FALSE, conf.level = 0.95)
#we obtain a p-value = 5.055e-06 with a cut-off of 0.05 which is statistically significant
#the 95% confidence interval is [33.95, 74.00]
#the means of vector3 and vector4 are 180.41 and 126.44 respectively.
## ----------------------------------------------------------------------------
## Sanity Re-check:
sum(class_res_new$gene_class[,1] != class_res_old$gene_class[,1])
shift_sm <- table(mND = class_res_old$gene_class[,1], GS_adjusted_mND = class_res_new$gene_class[,1])
shift_ge <- table(mND = class_res_old$gene_class[,2], GS_adjusted_mND = class_res_new$gene_class[,2])
# Percentages over mND genes
results_ge_mND <- shift_ge
for(i in 1:nrow(shift_ge)){
results_ge_mND[i,] <- (shift_ge[i,]/sum(shift_ge[i,]))*100
}
results_ge_mND
# Total selected mND genes (I, L, M)
11796 - sum(shift_ge[4,])
# Percentages over GS_adjusted genes
results_ge_GS_adjusted <- shift_ge
for(i in 1:ncol(shift_ge)){
results_ge_GS_adjusted[,i] <- (shift_ge[,i]/sum(shift_ge[,i]))*100
}
results_ge_GS_adjusted
# Total selected GS-adjusted mND genes (I, L, M) --> -205 genes
11796 - sum(shift_ge[,4])
11796 - sum(shift_ge[,4]) - (11796 - sum(shift_ge[4,]))
# Percent over all genes
(shift_ge/11796)*100
## ============================================================================
## Finally Analyzing coverage of Breast Cancer Related Genes - Ghadi
## ============================================================================
## Getting Disease-related genes with AutoSeed
## (includes eDGAR, DrugBank, and MalaCards)
## install.packages("Autoseed")
library(Autoseed)
bc_genes_search <- AutoSeed("breast cancer")
bc_genes_search2 <- AutoSeed("breast carcinoma")
bc_genes <- Reduce(union, c(bc_genes_search$edgar,bc_genes_search$malacards,
bc_genes_search$drugbank,
bc_genes_search2$edgar,bc_genes_search2$malacards,
bc_genes_search2$drugbank))
bc_genes_clean <- unique(vapply(bc_genes, function(x){
return(unlist(strsplit(x, split='::', fixed=TRUE))[1])},
c("x")))
bc_genes_clean <- bc_genes_clean[bc_genes_clean%in%rownames(X0)]
# Matched distinct genes
sum(distinct_mdg %in% bc_genes_clean)
# Matched discarded genes
sum(discarded_mdg %in% bc_genes_clean)
# Matched old M genes
sum(old_mdg %in% bc_genes_clean)
# Matched new M genes
sum(new_mdg %in% bc_genes_clean)
genes_selected_old <- rownames(class_res_old$gene_class %>% filter(L2 != "NS"))
genes_selected_new <- rownames(class_res_new$gene_class %>% filter(L2 != "NS"))
# Matched selected old
sum(genes_selected_old %in% bc_genes_clean)/length(bc_genes_clean)*100
# Matched selected new
sum(genes_selected_new %in% bc_genes_clean)/length(bc_genes_clean)*100
# Analyzing coverage by mNDp cutoffs
cutoffs <- seq(0.0001, 0.06, 0.0001)
genes_selected_cutoffs <- data.frame(mNDp_cutoffs = 1:length(cutoffs),
old = 1:length(cutoffs),
new = 1:length(cutoffs),
new_k2 = 1:length(cutoffs))
for(i in 1:length(cutoffs)){
# %>% filter(L2 != "NS"))
genes_cutoff_old <- rownames(mND_score$mND %>% filter(mNDp <= cutoffs[i]))
# %>% filter(L2 != "NS"))
genes_cutoff_new <- rownames(mND_score_new$mND %>% filter(mNDp <= cutoffs[i]))
genes_cutoff_new_k2 <- rownames(mND_score_new_k2$mND %>% filter(mNDp <= cutoffs[i]))
print(paste("Cutoff:", i))
# Matched selected by mNDp cutoff old
old <- sum(genes_cutoff_old %in% bc_genes_clean)/length(bc_genes_clean)*100
print("Old")
print(old)
# Matched selected by mNDp cutoff new
new <- sum(genes_cutoff_new %in% bc_genes_clean)/length(bc_genes_clean)*100
print("New (k = 3)")
print(new)
# Matched selected by mNDp cutoff new k = 2
new_k2 <- sum(genes_cutoff_new_k2 %in% bc_genes_clean)/length(bc_genes_clean)*100
print("New (k = 2)")
print(new_k2)
genes_selected_cutoffs[i,] <- c(cutoffs[i], old, new, new_k2)
}
# Interesting how we get more coverage no matter the cutoff although converges
# slightly at xtrmly low cutoffs. k = 2 shows slightly less coverage but at lower
# cutoffs coverage is almost the same (seems that lower k prioritizes important genes)
ggplot() +
geom_line(data = genes_selected_cutoffs, aes(cutoffs, old), color = "black") +
geom_line(data = genes_selected_cutoffs, aes(cutoffs, new), color = "red") +
geom_line(data = genes_selected_cutoffs, aes(cutoffs, new_k2), color = "blue", alpha = 0.5) +
ylab("Cumulative Coverage (%)") +
xlab("mNDp Cutoff") +
ggtitle("Cumulative Coverage of disease-related genes by mNDp cutoffs",
subtitle = "mND (k = 3) in black, GS-adjusted mND (k = 2) in blue, GS-adjusted mND (k = 3) in red")
## ===========================================================================
## Enrichment Analysis - Ghadi
## ===========================================================================
#BiocManager::install("KEGGREST")
library(KEGGREST)
pathways.list <- keggList("pathway", "hsa") #Needs Internet connection
pathway.codes <- sub("path:", "", names(pathways.list))
genes.by.pathway <- sapply(pathway.codes,
function(pwid){
pw <- keggGet(pwid)
if (is.null(pw[[1]]$GENE)) return(NA)
pw2 <- pw[[1]]$GENE[c(FALSE,TRUE)] # may need to modify this to c(FALSE, TRUE) for other organisms
pw2 <- unlist(lapply(strsplit(pw2, split = ";", fixed = T), function(x)x[1]))
return(pw2)
}
)
#saveRDS(genes.by.pathway, "Data/genes.by.pathway.rds")
genes.by.pathway <- readRDS("Data/genes.by.pathway.rds")
enrich <- function(mND_score, pathways.list, pathway.codes, genes.by.pathway)
{
geneList <- mND_score$mND$mNDp
names(geneList) <- rownames(mND_score$mND)
# Wilcoxon test for each pathway
pVals.by.pathway <- t(sapply(names(genes.by.pathway),
function(pathway) {
pathway.genes <- genes.by.pathway[[pathway]]
list.genes.in.pathway <- intersect(names(geneList), pathway.genes)
list.genes.not.in.pathway <- setdiff(names(geneList), list.genes.in.pathway)
scores.in.pathway <- geneList[list.genes.in.pathway]
scores.not.in.pathway <- geneList[list.genes.not.in.pathway]
if (length(scores.in.pathway) > 0){
p.value <- wilcox.test(scores.in.pathway, scores.not.in.pathway, alternative = "less")$p.value
} else{
p.value <- NA
}
return(c(p.value = p.value, Annotated = length(list.genes.in.pathway)))
}
))
# Assemble output table
outdat <- data.frame(pathway.code = rownames(pVals.by.pathway))
outdat$pathway.name <- pathways.list[paste("path:",outdat$pathway.code, sep = "")]
outdat$p.value <- pVals.by.pathway[,"p.value"]
outdat$Annotated <- pVals.by.pathway[,"Annotated"]
return(outdat[order(outdat$p.value),])
}
enrichment_old <- enrich(mND_score, pathways.list, pathway.codes, genes.by.pathway)
enrichment_new <- enrich(mND_score_new, pathways.list, pathway.codes, genes.by.pathway)
enrichment_new_k2 <- enrich(mND_score_new_k2, pathways.list, pathway.codes, genes.by.pathway)
# Write and Load Codes
#write.csv(enrichment_old, "Results/Enrichment/enrichment_old.csv")
enrichment_old <- read.csv("Results/Enrichment/enrichment_old.csv")
#write.csv(enrichment_new, "Results/Enrichment/enrichment_new.csv")
enrichment_new <- read.csv("Results/Enrichment/enrichment_new.csv")
#write.csv(enrichment_new_k2, "Results/Enrichment/enrichment_new_k2.csv")
enrichment_new_k2 <- read.csv("Results/Enrichment/enrichment_new_k2.csv")
comparative_enrich <- function(X0, mND_scores = list(), pathways.list, pathway.codes, genes.by.pathway){
result <- list()
for(i in 1:length(genes.by.pathway)){
pathway_result <- data.frame(matrix(nrow = length(seq (50, 500, 1)),
ncol = length(mND_scores)+2))
colnames(pathway_result) <- append(names(mND_scores), c("DE_scores","cutoff"))
count <- 0
for(mND_score in mND_scores){
count <- count + 1
count2 <- 0
genes <- mND_score$mND %>% arrange(desc(mND))
control_genes <- sort(X0[,2], decreasing = TRUE)
for(cutoff in seq (50, 500, 1)){
count2 <- count2 + 1
pathway_result[count2,count] <- sum(rownames(genes[1:cutoff,]) %in% genes.by.pathway[[i]])/length(genes.by.pathway[[i]])*100
pathway_result[count2,ncol(pathway_result)] <- cutoff
if(count == 1){
pathway_result[count2,ncol(pathway_result)-1] <- sum(names(control_genes[1:cutoff]) %in% genes.by.pathway[[i]])/length(genes.by.pathway[[i]])*100
}
}
}
result[[pathways.list[[i]]]] <- pathway_result
}
return(result)
}
comparative_enrichment <- comparative_enrich(X0, list(mND_old_k3 = mND_score,
mND_new_k3 = mND_score_new,
mND_new_k2 = mND_score_new_k2),
pathways.list, pathway.codes, genes.by.pathway)
bc_pathways <- c("Breast cancer - Homo sapiens (human)",
"Epstein-Barr virus infection - Homo sapiens (human)",
"Pathways in cancer - Homo sapiens (human)",
"Viral carcinogenesis - Homo sapiens (human)")
other_pathways <- c("Cell cycle - Homo sapiens (human)",
"FoxO signaling pathway - Homo sapiens (human)",
"Hippo signaling pathway - Homo sapiens (human)",
"Oocyte meiosis - Homo sapiens (human)",
"p53 signaling pathway - Homo sapiens (human)",
"PI3K-Akt signaling pathway - Homo sapiens (human)",
"Progesterone-mediated oocyte maturation - Homo sapiens (human)",
"Proteasome - Homo sapiens (human)",
"Rap1 signaling pathway - Homo sapiens (human)",
"Ras signaling pathway - Homo sapiens (human)",
"Sphingolipid signaling pathway - Homo sapiens (human)",
"TGF-beta signaling pathway - Homo sapiens (human)")
for(pathway in append(bc_pathways, other_pathways)){
plot <- ggplot(data = comparative_enrichment[[pathway]]) +
geom_line(aes(cutoff, mND_old_k3), color = "black") +
geom_line(aes(cutoff, mND_new_k3), color = "red") +
geom_line(aes(cutoff, mND_new_k2), color = "blue") +
geom_line(aes(cutoff, DE_scores), color = "magenta") +
ylab("Cumulative Coverage (%)") +
xlab("Cutoff of sorted gene list") +
ggtitle(pathway)
#subtitle = "Black: mND (k = 3), Blue: GS-adjusted mND (k = 2),
#Red: GS-adjusted mND (k = 3), Magenta: original DE scores")
print(plot)