-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTranScan.R
296 lines (263 loc) · 11.3 KB
/
TranScan.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
#' Hi-C inter-chromosomal translocation detection via Scan clustering
#'
#' This function allows you to detect the inter-chromosomal translocation
#' from Hi-C data. It requires two intra-regions and the corresponding
#' inter-region as inputs. We suggest rounded log counts as inputs.
#' @param chr1 First chromosome
#' @param chr2 Second chromosome
#' @param chr1_2 Inter-chromosomal region of chr1 and chr2
#' @param outputpath The path to store all results
#' @param sizes Chrom sizes of chr1_2; vector of length 2
#' @param resolution Resolution of Hi-C data
#' @param quantitest Grid of testing, must be 2^k, default 256, meaning 256 * 256
#' @param alpha FDX, default 0.05
#' @param gammath Gamma threshold, default 0.1
#' @param number_of_bandwidth Number of bandwidths used, default 20
#' @param diag_remove Removed short-range interactions, must be k * resolution, default 2e+06
#' @param add_gap Added gap between two chromosomes, must be k * resolution, defult 2500000
#' @return A heatmap and a rejected region plot will be generated under the output folder. The matrix form of rejected region is saved as RR.txt. All intermediate objects will also be saved.
#' @examples
#' data(chr10)
#' data(chr20)
#' data(chr10_20)
#' p_sizes = "../hg38.sizes"
#' p_outdir = "output"
#' sizes = read.table(p_sizes, col.name=c("chrom", "size"))
#' chr10_l = subset(sizes, chrom=="chr10")$size
#' chr20_l = subset(sizes, chrom=="chr20")$size
#' size_dim = c(chr10_l, chr20_l)
#' TranScan(chr10, chr20, chr10_20, p_outdir, size_dim)
#' @export
TranScan <- function(chr1, chr2, chr1_2, outputpath, sizes, resolution = 500000,
quantitest = 256, alpha = 0.05, gammath = 0.1,
number_of_bandwidth = 20,
diag_remove = 2000000, add_gap = 2500000){
library(ggplot2); library(dplyr); library(ggpubr)
dir.create(outputpath, F)
setwd(outputpath)
bins1 <- (sizes[1] %/% resolution)+1
bins2 <- (sizes[2] %/% resolution)+1
chr1[, 1:2] <- chr1[, 1:2] %/% resolution + 1
chr2[, 1:2] <- chr2[, 1:2] %/% resolution + 1
chr1_2[, 1:2] <- chr1_2[, 1:2] %/% resolution + 1
#bin_seg <- max(max(chr1[, 1:2]), max(chr1_2[, 1]))
bin_seg <- bins1
add_line <- add_gap / resolution
chr2[, 1:2] <- chr2[, 1:2] + add_line + bin_seg
chr1_2[, 2] <- chr1_2[, 2] + add_line + bin_seg
sep_line <- bin_seg + add_line %/% 2 + 1
dat <- rbind(chr1, chr2, chr1_2)
colnames(dat) <- c("V1", "V2", "V3")
dims <- c(bins1+add_line+bins2, bins1+add_line+bins2)
dat <- dat %>% filter(V1 <= V2 - diag_remove / resolution)
dat <- Matrix::sparseMatrix(dat[, 1], dat[, 2], x = dat[, 3], dims=dims, symmetric = T)
dat <- reshape2::melt(as.matrix(dat))
rm(list = c("chr1", "chr2", "chr1_2"))
colnames(dat) <- c("V1", "V2", "V3")
p1 <- ggplot(dat, aes(V1, -V2)) +
geom_tile(aes(fill = V3)) +
scale_fill_gradient2(low = "blue", mid = "white",
high = "red", midpoint = mean(dat$V3)) +
theme_classic() +
theme(aspect.ratio = 1,
axis.line = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
legend.position = "none",
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
plot.margin = unit(c(0,0,0,0), "lines"))+
geom_hline(yintercept = -sep_line) + geom_vline(xintercept = sep_line)
png("Heatmap.png")
print(p1)
dev.off()
dati <- dat
xmin <- min(dati[, 1])
xmax <- max(dati[, 1])
ymin <- min(dati[, 2])
ymax <- max(dati[, 2])
dati[,1] <- (dati[, 1] - min(dati[, 1])) / (max(dati[, 1]) - min(dati[, 1]))
dati[,2] <- (dati[, 2] - min(dati[, 2])) / (max(dati[, 2]) - min(dati[, 2]))
colnames(dati) <- c("V1", "V2", "V3")
sep_ratio <- (sep_line - xmin) / (xmax - xmin)
################################
# Prepare grid point positions #
################################
Repeat_it <- function(row){
rep(1, row[3]) %*% t(row[1:2])
}
newdat <- do.call("rbind", apply(dati, 1, Repeat_it))
dati <- as.matrix(newdat)
#############################
# Kernel density estimation #
#############################
grigliay <- rep(1, quantitest) %*% t(seq(0, 1, length= quantitest))
grigliax <- t(grigliay)
# stabilisci i valori di n, alpha e gamma
n <- dim(dati)[1] # total number of events
# KERNEL SMOOTHING
# input: vector of 2 bandwidths, 2 column matrix of data
# output: matrix of density estimate vettorizzata
library("MASS")
kernel.fun <- function(bandwidths, dati){
zeta <- kde2d(dati[,1], dati[,2], h= bandwidths, lims = c(0,1,0,1), n=quantitest)$z
return(c(zeta))
}
kernel.spezza.fun <- function(bandwidths, dati){
ampiezza <- dim(dati)[1] /10
ris <- 0
for (blocco in 1:10) {
datib <- dati[(blocco-1)*ampiezza + 1:ampiezza , ]
zeta <- kde2d(datib[,1], datib[,2], h= bandwidths, lims = c(0,1,0,1), n=quantitest)$z
ris <- ris + zeta
}
ris <- ris/10
return(c(ris))
}
# TEST STATISTICS
# input: vector of 2 bandwidths, vector of density estimate
# output: vector of standardized values
teststat.fun <- function(bandwidths, fhat) {
mu0 <- 1
sigma0 <- sqrt(1/(2*pi*4*bandwidths[1]*bandwidths[2]) - 1)
teststat <- sqrt(n)*(fhat - mu0)/sigma0
return(teststat)
}
# PETERBARG (it is "almost" it: we do not consider here the area of the set)
# input: vector of 2 bandwidths, vector of standardized test statistics
# output: vector of "almost" tail probability
peterbarg.fun <- function(bandwidths, teststat){
detC <- (4*bandwidths[1]*bandwidths[2])*(1- 4*bandwidths[1]*bandwidths[2])
peterbarg <- ((pi*detC)^(-1)) * (teststat^2) * (1-pnorm(teststat))
return(peterbarg)
}
# SUPERSET
# input: vector of vector of density estimate
# output: binary vector of superset
piterpolin <- function(z){(z^2)*(1-pnorm(z))}
minroot <- optimize (f=piterpolin, interval=c(0, 10), maximum=TRUE)$maximum # it's the mimimum value for piterbargh
superset.fun <- function(bandwidths, fhat){
teststat <- teststat.fun(bandwidths, fhat)
teststat <- pmax(teststat, minroot)
sorttest <- sort(teststat, decreasing=TRUE)
detC <- (4*bandwidths[1]*bandwidths[2])*(1- 4*bandwidths[1]*bandwidths[2])
piter <- ((pi*detC)^(-1))* (sorttest^2)*(1-pnorm(sorttest)) # piterbarg formula (without area of set)
piter <- rev(cummin(rev(piter))) # force pvalues to be monotone increasing
piter <- piter*(seq(1, 1/length(piter), length=length(piter))) # include area of the set
if(max(piter) < alpha){thresh <- min(sorttest - 1)} else{
thresh <- sorttest[min((1:length(piter))[piter>=alpha])]
}
superset <- 1*(teststat <= thresh)
return(superset)
}
onearg.superset.fun <- function(vect){
superset.fun(vect[1:2], vect[-(1:2)])
}
# AUGMENTATION
# input: binary vector of familywise rejected region, vector of density estimate
# output: binary vector of augmented region
augment.fun <- function(fwise, fhat){
areafwise <- sum(fwise)
quantinuovi <- floor(areafwise*gammath/(1-gammath)) + length(fwise)*(areafwise<((1-gammath)/gammath))
scegli <- (1-fwise)*fhat
thresh <- sort(scegli, decreasing=TRUE)[quantinuovi] + (1+max(fhat))*(areafwise<((1-gammath)/gammath))
nuovi <- 1*(scegli>=thresh)
return(nuovi)
}
onearg.augment.fun <- function(vect){
lungh <- length(vect)/2
augment.fun(vect[1:lungh], vect[lungh+(1:lungh)])
}
# SHAVING
# input: binary vector of rejected region, vector of 2 bandwidths
# output: binary vector of shaved region
shave.fun <- function(bands, reject){
ellypse <- 1*(((grigliax-grigliax[quantitest/2, quantitest/2])/(1*bands[1]))^2 +
((grigliay-grigliay[quantitest/2, quantitest/2])/(1*bands[2]))^2 <= 1)
ellypse <- rbind(row(ellypse)[ellypse==1], col(ellypse)[ellypse==1]) - quantitest/2
xmin <- abs(min(ellypse[1,]))
xmax <- abs(max(ellypse[1,]))
ymin <- abs(min(ellypse[2,]))
ymax <- abs(max(ellypse[2,]))
bigmatrix <- matrix(1, nrow=quantitest + xmin + xmax, ncol=quantitest + ymin + ymax)
bigmatrix[xmin + (1:quantitest), ymin + (1:quantitest)] <- matrix(reject, nrow=quantitest, ncol=quantitest)
shift.fun <- function(pair){
matr <- bigmatrix[xmin + pair[1] + (1:quantitest), ymin + pair[2] + (1:quantitest)]
return(c(matr))
}
shaved <- reject
for (index in (1:dim(ellypse)[2])){shaved <- shaved*shift.fun(ellypse[,index])}
#shaved <- apply(ellypse, 2, shift.fun)
#shaved <- apply(shaved, 1, prod)
return(shaved)
}
# input: a vector where the first 2 entries are the bandwidths and the others are reject
onearg.shave.fun <- function(vect){
shave.fun(vect[1:2], vect[-(1:2)])
}
oversmooth <- 1.1* sd(c(dati)) /(n^(1/6))
hvect <- c(seq(1.5/quantitest, oversmooth, length=number_of_bandwidth))
# bandwidth for x and y
h1vect <- hvect*sd(dati[,1])/min(sd(dati))
h2vect <- hvect*sd(dati[,2])/min(sd(dati))
bandmatr <- rbind(h1vect, h2vect) # 2 * 20 matrix
fhat <- apply(bandmatr, 2, kernel.spezza.fun, dati=dati) # 65536 * 20 matrix
bandfhat <- rbind(bandmatr, fhat) # Each column: 2 bandwidth + 256^2 test point
Ucompl <- 1- apply(bandfhat, 2, onearg.superset.fun)
Ucomplfhat <- rbind(Ucompl, fhat)
FDXaugment<- apply(Ucomplfhat, 2, onearg.augment.fun)
FDXreject <- Ucompl + FDXaugment
print("2")
bandUcompl <- rbind(bandmatr, Ucompl)
Ushaved <- apply(bandUcompl, 2, onearg.shave.fun)
Ushavedfhat <- rbind(Ushaved, fhat)
augshaved <- apply(Ushavedfhat, 2, onearg.augment.fun)
augshaved <- Ushaved + augshaved
#
# Drop the 1 pixel situation
#
sep_rej <- quantitest * sep_ratio
inter_sum <- function(vec){
mat <- matrix(vec, quantitest, quantitest)
mat[1:floor(sep_rej), 1:floor(sep_rej)] <- 0
mat[ceiling(sep_rej):quantitest, ceiling(sep_rej):quantitest] <- 0
return(sum(mat))
}
selected_index <- which.max(apply(augshaved, 2, inter_sum))
finalset <- augshaved[, selected_index]
toplot <- matrix(finalset, quantitest, quantitest)
dat <- reshape2::melt(toplot)
p1 <- ggplot(dat, aes(Var1, -Var2)) +
geom_tile(aes(fill = value), show.legend = F) +
scale_fill_gradient(low = "white", high = "red") +
theme_classic() +
theme(aspect.ratio = 1,
axis.line = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
legend.position = "none",
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
plot.margin = unit(c(0,0,0,0), "lines"))+
geom_hline(yintercept = -sep_rej) + geom_vline(xintercept = sep_rej)
png("RR.png")
print(p1)
dev.off()
# extract kde matrix with the best selected bandwidth for breakpoint selection
kde_mat = matrix(fhat[,selected_index], quantitest, quantitest)
kde_long = reshape2::melt(kde_mat)
# write out rejection regions and kde smoothed matrix
if (inter_sum(finalset) >= 3){
write.table(toplot, file = "RR.txt", row.names = F, col.names = F)
write.table(kde_mat, file = "KDE.txt", row.names = F, col.names = F)
}
save(list=ls(all.names = TRUE), file="all_objects.Rdata")
}