-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmotif_relative_to_TSS.R
139 lines (100 loc) · 5.48 KB
/
motif_relative_to_TSS.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
library(tidyverse)
library(openxlsx)
library(Seurat)
library(ggplot2)
library(ggfortify)
library(jcolors)
require(gridExtra)
library(grid)
library(tidytext)
library(seqinr)
peak.genes <- read.xlsx("../Input/toxo_scATAC_MJ_ME49_59/peak_gene_assigned_final.xlsx")
sc.rna.sig.markers <- readRDS("../Input/toxo_cdc/rds_ME49_59/rna_markers_rna_trns_sig_v2.rds")
colnames(sc.rna.sig.markers) <- gsub("cluster", "phase", colnames(sc.rna.sig.markers))
sc.rna.sig.markers <- sc.rna.sig.markers %>% transmute(GeneID, phase) %>% distinct()
sc.rna.sig.markers$GeneID <- gsub("-", "_", sc.rna.sig.markers$GeneID)
sc.rna.sig.markers.stat <- sc.rna.sig.markers %>% group_by(phase) %>% summarise(total = n())
sc.rna.sig.markers.stat
sc.rna.sig.markers.peaks <-
right_join(sc.rna.sig.markers, peak.genes, by = c("GeneID" = "gene_name" )) %>% na.omit()
sc.rna.sig.markers.peaks$phase <- gsub("\\.", "_", sc.rna.sig.markers.peaks$phase)
colnames(sc.rna.sig.markers.peaks) <- gsub("V1.x", "chr" ,
gsub("V11", "strand",
colnames(sc.rna.sig.markers.peaks)))
sc.rna.sig.markers.peaks <- sc.rna.sig.markers.peaks %>%
dplyr::select(chr, start_peak, end_peak, strand, phase, GeneID)
sc.rna.sig.markers.peaks.list <- split(sc.rna.sig.markers.peaks,
f = sc.rna.sig.markers.peaks$phase)
## write fasta file (atac regions) for rna transition markers to perform motif search
## fasta file have been generated in terminal with strandness option enabled
options(bedtools.path = "/Users/kourosh.zarringhalam/miniconda3/bin/")
out.dir <- "../Input/toxo_cdc/rds_ME49_59/bed_files/rna_transitions_v4/"
cluster.bed.list <- lapply(1:length(sc.rna.sig.markers.peaks.list), function(i){
bed.name <- paste(names(sc.rna.sig.markers.peaks.list)[i], "bed", sep = ".")
fasta.name <- paste(names(sc.rna.sig.markers.peaks.list)[i], "fasta", sep = ".")
tmp <- sc.rna.sig.markers.peaks.list[[i]]
cluster.bed <- tmp %>% dplyr::transmute(V1 = chr, V2 = start_peak, V3 = end_peak, V4 = GeneID ,V5 = ".", V6 = strand)
#write.table(cluster.bed, paste(out.dir, bed.name, sep = ""), sep = "\t", quote = F, row.names = F, col.names = F)
# cluster.fasta <- bedtoolsr::bt.getfasta(fi = "../Input/toxo_cdc/Genome/ToxoDB-59_TgondiiME49_Genome.fasta",
# bed = tmp,
# fo =paste(out.dir, fasta.name, sep = ""))
return(cluster.bed)
})
#####
## read in the bed file for each motif identified by BAM
## overlap with the gene.peaks table
## to see which gene has that specific motif
## this version is without specifying strand, this gives the bed file corresponding to the occurence of motif
prod.desc <- read.xlsx('../Input/toxo_genomics/genes/ProductDescription_GT1_ME49.xlsx')
peak.genes <- read.xlsx("../Input/toxo_scATAC_MJ_ME49_59/peak_gene_assigned_final.xlsx")
in.dir <- "../Input/toxo_cdc/bed_files/rna_transitions_v2/motif_beds_BAMM/"
out.dir <- "../Input/toxo_cdc/bed_files/rna_transitions_v2/motif_beds_genes/"
bed.files <- list.files(in.dir)
bed.names <- gsub(".bed", "", bed.files)
options(bedtools.path = "/Users/kourosh.zarringhalam/miniconda3/bin/")
bed.genes.list <- lapply(1:length(bed.files), function(i){
bed.tmp <- read.table(paste(in.dir, bed.files[i], sep = "/"), header = F, quote = NULL, sep = "\t")
## intersect with the peak gene
bed.genes <- bedtoolsr::bt.intersect(a = bed.tmp, b = peak.genes, wo = T)
bed.genes <- bed.genes %>% group_by(V13) %>% mutate(motif_freq = n())
bed.genes <- left_join(bed.genes, prod.desc, by = c("V13" = "TGME49"))
#bed.genes <- bed.genes %>% distinct(GeneID, .keep_all = T)
bed.genes <- bed.genes %>% select(V1, V2, V3, V4, V5, V6, V12,V13, motif_freq, ProductDescription)
#write.xlsx(bed.genes, paste(out.dir, paste(bed.names[i], "genes.xlsx", sep = "_"), sep =""))
# write.table(bed.genes, paste(out.dir, paste(bed.names[i], "genes.txt", sep = "_"), sep =""),
# sep = "\t", quote = F, row.names = F, col.names = F)
#
return(bed.genes)
})
names(bed.genes.list) <- bed.names
saveRDS(bed.genes.list,"../Input/toxo_cdc/rds_ME49_59/motif_dist_from_TSS.rds")
## distance of motifs relative to TSS
gtf.file <- "../Genomes/ToxoDB-59_TgondiiME49.gtf"
gtf <- read.table(gtf.file, header = F, sep = '\t', quote = NULL)
gtf.filt <- gtf %>% dplyr::filter(!grepl('KE.*', V1))
gtf.filt$gene_name <- gsub("\\;.*", "", gsub("transcript_id ", "", gsub("-t.*", "", gtf.filt$V9)))
gtf.filt$gene_name <- gsub("\"", "", gtf.filt$gene_name)
tss.df <- lapply(1:length(bed.genes.list), function(i){
tmp <- bed.genes.list[[i]]
tmp.gtf <- left_join(tmp, gtf.filt, by = c("V13" = "gene_name") )
tmp.gtf <- tmp.gtf %>%
filter(V3.y == "transcript") %>% mutate(tss = ifelse((V7 == "+"), V4.y, V5.y))
tmp.gtf <- tmp.gtf %>% mutate(tss.dist = tss - V2.x)
tmp.gtf$group <- names(bed.genes.list)[i]
return(tmp.gtf)
})
names(tss.df) <- bed.names
plt.title <- sub("^([^-]+)_([^-]+)_", "\\1-\\2.", bed.names)
tss.plt <- lapply(1:length(tss.df), function(i){
p <- ggplot(tss.df[[i]], aes(x = tss.dist)) +
geom_histogram(aes(y = ..density..),
colour = 1, fill = "white") +
geom_density(lwd = 0.7,linetype = 1,colour = 2) +
theme_bw() +
ggtitle(plt.title[[i]])
return(p)
})
pp <- grid.arrange(grobs = tss.plt, ncol = 3)
ggsave( "../Output/toxo_cdc/ME49_59/figures_paper/motifs_dist_tss.pdf",
plot = pp,
height = 8,width = 8, dpi = 300)