Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
Zhang-Data-Science-Lab authored Apr 29, 2024
1 parent 315d977 commit 1e936cb
Show file tree
Hide file tree
Showing 15 changed files with 16,024 additions and 2 deletions.
674 changes: 674 additions & 0 deletions LICENSE

Large diffs are not rendered by default.

14,702 changes: 14,702 additions & 0 deletions Notebook01_Run_TranScan.html

Large diffs are not rendered by default.

296 changes: 296 additions & 0 deletions R/TranScan.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,296 @@
#' Hi-C inter-chromosomal translocation detection via Scan clustering
#'
#' This function allows you to detect the inter-chromosomal translocation
#' from Hi-C data. It requires two intra-regions and the corresponding
#' inter-region as inputs. We suggest rounded log counts as inputs.
#' @param chr1 First chromosome
#' @param chr2 Second chromosome
#' @param chr1_2 Inter-chromosomal region of chr1 and chr2
#' @param outputpath The path to store all results
#' @param sizes Chrom sizes of chr1_2; vector of length 2
#' @param resolution Resolution of Hi-C data
#' @param quantitest Grid of testing, must be 2^k, default 256, meaning 256 * 256
#' @param alpha FDX, default 0.05
#' @param gammath Gamma threshold, default 0.1
#' @param number_of_bandwidth Number of bandwidths used, default 20
#' @param diag_remove Removed short-range interactions, must be k * resolution, default 2e+06
#' @param add_gap Added gap between two chromosomes, must be k * resolution, defult 2500000
#' @return A heatmap and a rejected region plot will be generated under the output folder. The matrix form of rejected region is saved as RR.txt. All intermediate objects will also be saved.
#' @examples
#' data(chr10)
#' data(chr20)
#' data(chr10_20)
#' p_sizes = "../hg38.sizes"
#' p_outdir = "output"
#' sizes = read.table(p_sizes, col.name=c("chrom", "size"))
#' chr10_l = subset(sizes, chrom=="chr10")$size
#' chr20_l = subset(sizes, chrom=="chr20")$size
#' size_dim = c(chr10_l, chr20_l)
#' TranScan(chr10, chr20, chr10_20, p_outdir, size_dim)
#' @export
TranScan <- function(chr1, chr2, chr1_2, outputpath, sizes, resolution = 500000,
quantitest = 256, alpha = 0.05, gammath = 0.1,
number_of_bandwidth = 20,
diag_remove = 2000000, add_gap = 2500000){
library(ggplot2); library(dplyr); library(ggpubr)
dir.create(outputpath, F)
setwd(outputpath)

bins1 <- (sizes[1] %/% resolution)+1
bins2 <- (sizes[2] %/% resolution)+1
chr1[, 1:2] <- chr1[, 1:2] %/% resolution + 1
chr2[, 1:2] <- chr2[, 1:2] %/% resolution + 1
chr1_2[, 1:2] <- chr1_2[, 1:2] %/% resolution + 1
#bin_seg <- max(max(chr1[, 1:2]), max(chr1_2[, 1]))
bin_seg <- bins1
add_line <- add_gap / resolution
chr2[, 1:2] <- chr2[, 1:2] + add_line + bin_seg
chr1_2[, 2] <- chr1_2[, 2] + add_line + bin_seg
sep_line <- bin_seg + add_line %/% 2 + 1
dat <- rbind(chr1, chr2, chr1_2)
colnames(dat) <- c("V1", "V2", "V3")
dims <- c(bins1+add_line+bins2, bins1+add_line+bins2)
dat <- dat %>% filter(V1 <= V2 - diag_remove / resolution)
dat <- Matrix::sparseMatrix(dat[, 1], dat[, 2], x = dat[, 3], dims=dims, symmetric = T)
dat <- reshape2::melt(as.matrix(dat))
rm(list = c("chr1", "chr2", "chr1_2"))
colnames(dat) <- c("V1", "V2", "V3")

p1 <- ggplot(dat, aes(V1, -V2)) +
geom_tile(aes(fill = V3)) +
scale_fill_gradient2(low = "blue", mid = "white",
high = "red", midpoint = mean(dat$V3)) +
theme_classic() +
theme(aspect.ratio = 1,
axis.line = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
legend.position = "none",
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
plot.margin = unit(c(0,0,0,0), "lines"))+
geom_hline(yintercept = -sep_line) + geom_vline(xintercept = sep_line)
png("Heatmap.png")
print(p1)
dev.off()

dati <- dat
xmin <- min(dati[, 1])
xmax <- max(dati[, 1])
ymin <- min(dati[, 2])
ymax <- max(dati[, 2])
dati[,1] <- (dati[, 1] - min(dati[, 1])) / (max(dati[, 1]) - min(dati[, 1]))
dati[,2] <- (dati[, 2] - min(dati[, 2])) / (max(dati[, 2]) - min(dati[, 2]))
colnames(dati) <- c("V1", "V2", "V3")
sep_ratio <- (sep_line - xmin) / (xmax - xmin)

################################
# Prepare grid point positions #
################################
Repeat_it <- function(row){
rep(1, row[3]) %*% t(row[1:2])
}
newdat <- do.call("rbind", apply(dati, 1, Repeat_it))
dati <- as.matrix(newdat)

#############################
# Kernel density estimation #
#############################
grigliay <- rep(1, quantitest) %*% t(seq(0, 1, length= quantitest))
grigliax <- t(grigliay)
# stabilisci i valori di n, alpha e gamma
n <- dim(dati)[1] # total number of events

# KERNEL SMOOTHING
# input: vector of 2 bandwidths, 2 column matrix of data
# output: matrix of density estimate vettorizzata

library("MASS")
kernel.fun <- function(bandwidths, dati){
zeta <- kde2d(dati[,1], dati[,2], h= bandwidths, lims = c(0,1,0,1), n=quantitest)$z
return(c(zeta))
}

kernel.spezza.fun <- function(bandwidths, dati){
ampiezza <- dim(dati)[1] /10
ris <- 0
for (blocco in 1:10) {
datib <- dati[(blocco-1)*ampiezza + 1:ampiezza , ]
zeta <- kde2d(datib[,1], datib[,2], h= bandwidths, lims = c(0,1,0,1), n=quantitest)$z
ris <- ris + zeta
}
ris <- ris/10
return(c(ris))
}

# TEST STATISTICS
# input: vector of 2 bandwidths, vector of density estimate
# output: vector of standardized values

teststat.fun <- function(bandwidths, fhat) {
mu0 <- 1
sigma0 <- sqrt(1/(2*pi*4*bandwidths[1]*bandwidths[2]) - 1)
teststat <- sqrt(n)*(fhat - mu0)/sigma0
return(teststat)
}

# PETERBARG (it is "almost" it: we do not consider here the area of the set)
# input: vector of 2 bandwidths, vector of standardized test statistics
# output: vector of "almost" tail probability

peterbarg.fun <- function(bandwidths, teststat){
detC <- (4*bandwidths[1]*bandwidths[2])*(1- 4*bandwidths[1]*bandwidths[2])
peterbarg <- ((pi*detC)^(-1)) * (teststat^2) * (1-pnorm(teststat))
return(peterbarg)
}

# SUPERSET
# input: vector of vector of density estimate
# output: binary vector of superset

piterpolin <- function(z){(z^2)*(1-pnorm(z))}
minroot <- optimize (f=piterpolin, interval=c(0, 10), maximum=TRUE)$maximum # it's the mimimum value for piterbargh

superset.fun <- function(bandwidths, fhat){
teststat <- teststat.fun(bandwidths, fhat)
teststat <- pmax(teststat, minroot)
sorttest <- sort(teststat, decreasing=TRUE)
detC <- (4*bandwidths[1]*bandwidths[2])*(1- 4*bandwidths[1]*bandwidths[2])
piter <- ((pi*detC)^(-1))* (sorttest^2)*(1-pnorm(sorttest)) # piterbarg formula (without area of set)
piter <- rev(cummin(rev(piter))) # force pvalues to be monotone increasing
piter <- piter*(seq(1, 1/length(piter), length=length(piter))) # include area of the set
if(max(piter) < alpha){thresh <- min(sorttest - 1)} else{
thresh <- sorttest[min((1:length(piter))[piter>=alpha])]
}
superset <- 1*(teststat <= thresh)
return(superset)
}

onearg.superset.fun <- function(vect){
superset.fun(vect[1:2], vect[-(1:2)])
}

# AUGMENTATION
# input: binary vector of familywise rejected region, vector of density estimate
# output: binary vector of augmented region


augment.fun <- function(fwise, fhat){
areafwise <- sum(fwise)
quantinuovi <- floor(areafwise*gammath/(1-gammath)) + length(fwise)*(areafwise<((1-gammath)/gammath))
scegli <- (1-fwise)*fhat
thresh <- sort(scegli, decreasing=TRUE)[quantinuovi] + (1+max(fhat))*(areafwise<((1-gammath)/gammath))
nuovi <- 1*(scegli>=thresh)
return(nuovi)
}

onearg.augment.fun <- function(vect){
lungh <- length(vect)/2
augment.fun(vect[1:lungh], vect[lungh+(1:lungh)])
}

# SHAVING
# input: binary vector of rejected region, vector of 2 bandwidths
# output: binary vector of shaved region

shave.fun <- function(bands, reject){
ellypse <- 1*(((grigliax-grigliax[quantitest/2, quantitest/2])/(1*bands[1]))^2 +
((grigliay-grigliay[quantitest/2, quantitest/2])/(1*bands[2]))^2 <= 1)
ellypse <- rbind(row(ellypse)[ellypse==1], col(ellypse)[ellypse==1]) - quantitest/2
xmin <- abs(min(ellypse[1,]))
xmax <- abs(max(ellypse[1,]))
ymin <- abs(min(ellypse[2,]))
ymax <- abs(max(ellypse[2,]))
bigmatrix <- matrix(1, nrow=quantitest + xmin + xmax, ncol=quantitest + ymin + ymax)
bigmatrix[xmin + (1:quantitest), ymin + (1:quantitest)] <- matrix(reject, nrow=quantitest, ncol=quantitest)

shift.fun <- function(pair){
matr <- bigmatrix[xmin + pair[1] + (1:quantitest), ymin + pair[2] + (1:quantitest)]
return(c(matr))
}

shaved <- reject
for (index in (1:dim(ellypse)[2])){shaved <- shaved*shift.fun(ellypse[,index])}
#shaved <- apply(ellypse, 2, shift.fun)
#shaved <- apply(shaved, 1, prod)
return(shaved)
}


# input: a vector where the first 2 entries are the bandwidths and the others are reject
onearg.shave.fun <- function(vect){
shave.fun(vect[1:2], vect[-(1:2)])
}

oversmooth <- 1.1* sd(c(dati)) /(n^(1/6))
hvect <- c(seq(1.5/quantitest, oversmooth, length=number_of_bandwidth))
# bandwidth for x and y
h1vect <- hvect*sd(dati[,1])/min(sd(dati))
h2vect <- hvect*sd(dati[,2])/min(sd(dati))
bandmatr <- rbind(h1vect, h2vect) # 2 * 20 matrix
fhat <- apply(bandmatr, 2, kernel.spezza.fun, dati=dati) # 65536 * 20 matrix
bandfhat <- rbind(bandmatr, fhat) # Each column: 2 bandwidth + 256^2 test point

Ucompl <- 1- apply(bandfhat, 2, onearg.superset.fun)
Ucomplfhat <- rbind(Ucompl, fhat)
FDXaugment<- apply(Ucomplfhat, 2, onearg.augment.fun)
FDXreject <- Ucompl + FDXaugment
print("2")
bandUcompl <- rbind(bandmatr, Ucompl)
Ushaved <- apply(bandUcompl, 2, onearg.shave.fun)

Ushavedfhat <- rbind(Ushaved, fhat)
augshaved <- apply(Ushavedfhat, 2, onearg.augment.fun)
augshaved <- Ushaved + augshaved

#
# Drop the 1 pixel situation
#
sep_rej <- quantitest * sep_ratio
inter_sum <- function(vec){
mat <- matrix(vec, quantitest, quantitest)
mat[1:floor(sep_rej), 1:floor(sep_rej)] <- 0
mat[ceiling(sep_rej):quantitest, ceiling(sep_rej):quantitest] <- 0
return(sum(mat))
}
selected_index <- which.max(apply(augshaved, 2, inter_sum))
finalset <- augshaved[, selected_index]
toplot <- matrix(finalset, quantitest, quantitest)
dat <- reshape2::melt(toplot)
p1 <- ggplot(dat, aes(Var1, -Var2)) +
geom_tile(aes(fill = value), show.legend = F) +
scale_fill_gradient(low = "white", high = "red") +
theme_classic() +
theme(aspect.ratio = 1,
axis.line = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
legend.position = "none",
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
plot.margin = unit(c(0,0,0,0), "lines"))+
geom_hline(yintercept = -sep_rej) + geom_vline(xintercept = sep_rej)
png("RR.png")
print(p1)
dev.off()

# extract kde matrix with the best selected bandwidth for breakpoint selection
kde_mat = matrix(fhat[,selected_index], quantitest, quantitest)
kde_long = reshape2::melt(kde_mat)

# write out rejection regions and kde smoothed matrix
if (inter_sum(finalset) >= 3){
write.table(toplot, file = "RR.txt", row.names = F, col.names = F)
write.table(kde_mat, file = "KDE.txt", row.names = F, col.names = F)
}
save(list=ls(all.names = TRUE), file="all_objects.Rdata")
}

29 changes: 29 additions & 0 deletions R/data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#' Chromosome 10 of T47D
#'
#' Intra region of chromosome 10 in cell line T47D (500kb resolution)
#'
#' @docType data
#' @format The third column refers to rounded log count
#' @usage data(chr10)
#'
"chr10"

#' Chromosome 20 of T47D
#'
#' Intra region of chromosome 20 in cell line T47D (500kb resolution)
#'
#' @docType data
#' @format The third column refers to rounded log count
#' @usage data(chr20)
#'
"chr20"

#' Chromosome 10_20 of T47D
#'
#' Inter region between chromosome 10 and 20 in cell line T47D (500kb resolution)
#'
#' @docType data
#' @format The third column refers to rounded log count
#' @usage data(chr10)
#'
"chr10_20"
57 changes: 55 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,55 @@
# TranScan
R-package for the paper entitled "Translocation Detection from Hi-C data via Scan Statistics'' by Anthony Cheng, Disheng Mao, Yuping Zhang, Joseph Glaz, Zhengqing Ouyang
TranScan software presented in the paper entitled "Translocation Detection from Hi-C data via Scan Statistics" by Anthony Cheng, Disheng Mao, Yuping Zhang, Joseph Glaz, and Zhengqing Ouyang.

The TranScan R package detects chromosome translocation events in genome-wide proximity ligation data such as Hi-C. The accompanying translocation breakpoint finder is post-processing scripts that are written in Python.

# Software Requirements

## OS Requirements
This package has been tested on the following systems:
+ macOS Sierra (v 10.12.6)
+ Linux: CentOS 7

## Dependencies and Prerequisties
The following prerequisites are required to run TranScan
+ R (3.5.1)
+ ggplot2 (3.1.0)
+ dplyr (0.7.8)
+ ggpubr (0.2.5)
+ MASS (7.3-51.1)

The following prerequisites are required to run the breakpoint finder
+ Python (2.7.14)
+ matplotlib (2.1.0)
+ numpy (1.15.2)
+ seaborn (0.9.0)

### Optimized Performance (Optional)
For an optimized performance, consider installing OpenBLAS or ATLAS and export
the following environmental variable that points to the appropriate shared object.
There is an approximately 5 fold improvement in the runtime performance as compared
to the default R blas implementation: libRblas.

### OpenBLAS
```
export LD_PRELOAD=</path/to/libopenblas.so>
```

### ATLAS
```
export LD_PRELOAD=</path/to/libatlas.so>
```

# Installation

```
install.packages("devtools")
devtools::install("TranScan")
library(TranScan)
```

# Examples/Demos
Reproducible code examples can be found in the following HTML renders of the jupyter notebooks
+ Notebook01_Run_TranScan.html

# License
The implementations written for this project is covered by the GNU General Public License, version 3.0 (GPL-3.0).
Binary file added TranScan-manual.pdf
Binary file not shown.
Binary file added data/chr10.RData
Binary file not shown.
Binary file added data/chr10_20.RData
Binary file not shown.
Binary file added data/chr20.RData
Binary file not shown.
Loading

0 comments on commit 1e936cb

Please sign in to comment.