Add files via upload

Zhang-Data-Science-Research-Lab · Apr 29, 2024 · 1e936cb · 1e936cb
1 parent 315d977
commit 1e936cb
Show file tree

Hide file tree

Showing 15 changed files with 16,024 additions and 2 deletions.
diff --git a/LICENSE b/LICENSE
diff --git a/Notebook01_Run_TranScan.html b/Notebook01_Run_TranScan.html
diff --git a/R/TranScan.R b/R/TranScan.R
@@ -0,0 +1,296 @@
+#' Hi-C inter-chromosomal translocation detection via Scan clustering
+#'
+#' This function allows you to detect the inter-chromosomal translocation
+#' from Hi-C data. It requires two intra-regions and the corresponding
+#' inter-region as inputs. We suggest rounded log counts as inputs.
+#' @param chr1 First chromosome
+#' @param chr2 Second chromosome
+#' @param chr1_2 Inter-chromosomal region of chr1 and chr2
+#' @param outputpath The path to store all results
+#' @param sizes Chrom sizes of chr1_2; vector of length 2
+#' @param resolution Resolution of Hi-C data
+#' @param quantitest Grid of testing, must be 2^k, default 256, meaning 256 * 256
+#' @param alpha FDX, default 0.05
+#' @param gammath Gamma threshold, default 0.1
+#' @param number_of_bandwidth Number of bandwidths used, default 20
+#' @param diag_remove Removed short-range interactions, must be k * resolution, default 2e+06
+#' @param add_gap Added gap between two chromosomes, must be k * resolution, defult 2500000
+#' @return A heatmap and a rejected region plot will be generated under the output folder. The matrix form of rejected region is saved as RR.txt. All intermediate objects will also be saved.
+#' @examples
+#' data(chr10)
+#' data(chr20)
+#' data(chr10_20)
+#' p_sizes = "../hg38.sizes"
+#' p_outdir = "output"
+#' sizes = read.table(p_sizes, col.name=c("chrom", "size"))
+#' chr10_l = subset(sizes, chrom=="chr10")$size
+#' chr20_l = subset(sizes, chrom=="chr20")$size
+#' size_dim = c(chr10_l, chr20_l)
+#' TranScan(chr10, chr20, chr10_20, p_outdir, size_dim)
+#' @export
+TranScan <- function(chr1, chr2, chr1_2, outputpath, sizes, resolution = 500000,
+                     quantitest = 256, alpha = 0.05, gammath = 0.1,
+                     number_of_bandwidth = 20,
+                     diag_remove = 2000000, add_gap = 2500000){
+  library(ggplot2); library(dplyr); library(ggpubr)
+  dir.create(outputpath, F)
+  setwd(outputpath)
+
+  bins1 <- (sizes[1] %/% resolution)+1
+  bins2 <- (sizes[2] %/% resolution)+1
+  chr1[, 1:2] <- chr1[, 1:2] %/% resolution + 1
+  chr2[, 1:2] <- chr2[, 1:2] %/% resolution + 1
+  chr1_2[, 1:2] <- chr1_2[, 1:2] %/% resolution + 1
+  #bin_seg <- max(max(chr1[, 1:2]), max(chr1_2[, 1]))
+  bin_seg <- bins1
+  add_line <- add_gap / resolution
+  chr2[, 1:2] <- chr2[, 1:2] + add_line + bin_seg
+  chr1_2[, 2] <- chr1_2[, 2] + add_line + bin_seg
+  sep_line <- bin_seg + add_line %/% 2 + 1
+  dat <- rbind(chr1, chr2, chr1_2)
+  colnames(dat) <- c("V1", "V2", "V3")
+  dims <- c(bins1+add_line+bins2, bins1+add_line+bins2)
+  dat <- dat %>% filter(V1 <= V2 - diag_remove / resolution)
+  dat <- Matrix::sparseMatrix(dat[, 1], dat[, 2], x = dat[, 3], dims=dims, symmetric = T)
+  dat <- reshape2::melt(as.matrix(dat))
+  rm(list = c("chr1", "chr2", "chr1_2"))
+  colnames(dat) <- c("V1", "V2", "V3")
+
+  p1 <- ggplot(dat, aes(V1, -V2)) +
+    geom_tile(aes(fill = V3)) +
+    scale_fill_gradient2(low = "blue", mid = "white",
+                         high = "red", midpoint = mean(dat$V3)) +
+    theme_classic() +
+    theme(aspect.ratio = 1,
+          axis.line = element_blank(),
+          axis.text.x = element_blank(),
+          axis.text.y = element_blank(),
+          axis.ticks = element_blank(),
+          axis.title.x = element_blank(),
+          axis.title.y = element_blank(),
+          legend.position = "none",
+          panel.border = element_blank(),
+          panel.grid.major = element_blank(),
+          panel.grid.minor = element_blank(),
+          plot.margin = unit(c(0,0,0,0), "lines"))+
+    geom_hline(yintercept = -sep_line) + geom_vline(xintercept = sep_line)
+  png("Heatmap.png")
+  print(p1)
+  dev.off()
+
+  dati <- dat
+  xmin <- min(dati[, 1])
+  xmax <- max(dati[, 1])
+  ymin <- min(dati[, 2])
+  ymax <- max(dati[, 2])
+  dati[,1] <- (dati[, 1] - min(dati[, 1])) / (max(dati[, 1]) - min(dati[, 1]))
+  dati[,2] <- (dati[, 2] - min(dati[, 2])) / (max(dati[, 2]) - min(dati[, 2]))
+  colnames(dati) <- c("V1", "V2", "V3")
+  sep_ratio <- (sep_line - xmin) / (xmax - xmin)
+
+  ################################
+  # Prepare grid point positions #
+  ################################
+  Repeat_it <- function(row){
+    rep(1, row[3]) %*% t(row[1:2])
+  }
+  newdat <- do.call("rbind", apply(dati, 1, Repeat_it))
+  dati <- as.matrix(newdat)
+
+  #############################
+  # Kernel density estimation #
+  #############################
+  grigliay <- rep(1, quantitest) %*% t(seq(0, 1, length= quantitest))
+  grigliax <- t(grigliay)
+  # stabilisci i valori di n, alpha e gamma
+  n <- dim(dati)[1]   # total number of events
+
+  # KERNEL SMOOTHING
+  # input: vector of 2 bandwidths, 2 column matrix of data
+  # output: matrix of density estimate vettorizzata
+
+  library("MASS")
+  kernel.fun <- function(bandwidths, dati){
+    zeta <- kde2d(dati[,1], dati[,2], h= bandwidths, lims = c(0,1,0,1), n=quantitest)$z
+    return(c(zeta))
+  }
+
+  kernel.spezza.fun <- function(bandwidths, dati){
+    ampiezza <- dim(dati)[1] /10
+    ris <- 0
+    for (blocco in 1:10) {
+      datib <- dati[(blocco-1)*ampiezza + 1:ampiezza , ]
+      zeta <- kde2d(datib[,1], datib[,2], h= bandwidths, lims = c(0,1,0,1), n=quantitest)$z
+      ris <- ris + zeta
+    }
+    ris <- ris/10
+    return(c(ris))
+  }
+
+  # TEST STATISTICS
+  # input: vector of 2 bandwidths, vector of density estimate
+  # output: vector of standardized values
+
+  teststat.fun <- function(bandwidths, fhat) {
+    mu0 <- 1
+    sigma0 <- sqrt(1/(2*pi*4*bandwidths[1]*bandwidths[2]) - 1)
+    teststat <- sqrt(n)*(fhat - mu0)/sigma0
+    return(teststat)
+  }
+
+  # PETERBARG (it is "almost" it: we do not consider here the area of the set)
+  # input: vector of 2 bandwidths, vector of standardized test statistics
+  # output: vector of "almost" tail probability
+
+  peterbarg.fun <- function(bandwidths, teststat){
+    detC <- (4*bandwidths[1]*bandwidths[2])*(1- 4*bandwidths[1]*bandwidths[2])
+    peterbarg <- ((pi*detC)^(-1)) * (teststat^2) * (1-pnorm(teststat))
+    return(peterbarg)
+  }
+
+  # SUPERSET
+  # input: vector of vector of density estimate
+  # output: binary vector of superset
+
+  piterpolin <- function(z){(z^2)*(1-pnorm(z))}
+  minroot <- optimize (f=piterpolin, interval=c(0, 10), maximum=TRUE)$maximum  # it's the mimimum value for piterbargh
+
+  superset.fun <- function(bandwidths, fhat){
+    teststat <- teststat.fun(bandwidths, fhat)
+    teststat <- pmax(teststat, minroot)
+    sorttest <- sort(teststat, decreasing=TRUE)
+    detC <- (4*bandwidths[1]*bandwidths[2])*(1- 4*bandwidths[1]*bandwidths[2])
+    piter <- ((pi*detC)^(-1))* (sorttest^2)*(1-pnorm(sorttest))   # piterbarg formula (without area of set)
+    piter <- rev(cummin(rev(piter)))      # force pvalues to be monotone increasing
+    piter <- piter*(seq(1, 1/length(piter), length=length(piter)))      # include area of the set
+    if(max(piter) < alpha){thresh <- min(sorttest - 1)} else{
+      thresh <- sorttest[min((1:length(piter))[piter>=alpha])]
+    }
+    superset <- 1*(teststat <= thresh)
+    return(superset)
+  }
+
+  onearg.superset.fun <- function(vect){
+    superset.fun(vect[1:2], vect[-(1:2)])
+  }
+
+  # AUGMENTATION
+  # input: binary vector of familywise rejected region, vector of density estimate
+  # output: binary vector of augmented region
+
+
+  augment.fun <- function(fwise, fhat){
+    areafwise <- sum(fwise)
+    quantinuovi <- floor(areafwise*gammath/(1-gammath)) + length(fwise)*(areafwise<((1-gammath)/gammath))
+    scegli <- (1-fwise)*fhat
+    thresh <- sort(scegli, decreasing=TRUE)[quantinuovi] + (1+max(fhat))*(areafwise<((1-gammath)/gammath))
+    nuovi <- 1*(scegli>=thresh)
+    return(nuovi)
+  }
+
+  onearg.augment.fun <- function(vect){
+    lungh <- length(vect)/2
+    augment.fun(vect[1:lungh], vect[lungh+(1:lungh)])
+  }
+
+  # SHAVING
+  # input: binary vector of rejected region, vector of 2 bandwidths
+  # output: binary vector of shaved region
+
+  shave.fun <- function(bands, reject){
+    ellypse <- 1*(((grigliax-grigliax[quantitest/2, quantitest/2])/(1*bands[1]))^2 +
+                    ((grigliay-grigliay[quantitest/2, quantitest/2])/(1*bands[2]))^2 <= 1)
+    ellypse <- rbind(row(ellypse)[ellypse==1], col(ellypse)[ellypse==1])  - quantitest/2
+    xmin <- abs(min(ellypse[1,]))
+    xmax <- abs(max(ellypse[1,]))
+    ymin <- abs(min(ellypse[2,]))
+    ymax <- abs(max(ellypse[2,]))
+    bigmatrix <- matrix(1, nrow=quantitest + xmin + xmax, ncol=quantitest + ymin + ymax)
+    bigmatrix[xmin + (1:quantitest), ymin + (1:quantitest)] <- matrix(reject, nrow=quantitest, ncol=quantitest)
+
+    shift.fun <- function(pair){
+      matr <- bigmatrix[xmin + pair[1] + (1:quantitest), ymin + pair[2] + (1:quantitest)]
+      return(c(matr))
+    }
+
+    shaved <- reject
+    for (index in (1:dim(ellypse)[2])){shaved <- shaved*shift.fun(ellypse[,index])}
+    #shaved <- apply(ellypse, 2, shift.fun)
+    #shaved <- apply(shaved, 1, prod)
+    return(shaved)
+  }
+
+
+  # input: a vector where the first 2 entries are the bandwidths and the others are reject
+  onearg.shave.fun <- function(vect){
+    shave.fun(vect[1:2], vect[-(1:2)])
+  }
+
+  oversmooth <-  1.1* sd(c(dati)) /(n^(1/6))
+  hvect <- c(seq(1.5/quantitest, oversmooth, length=number_of_bandwidth))
+  # bandwidth for x and y
+  h1vect <- hvect*sd(dati[,1])/min(sd(dati))
+  h2vect <- hvect*sd(dati[,2])/min(sd(dati))
+  bandmatr <- rbind(h1vect, h2vect) # 2 * 20 matrix
+  fhat <- apply(bandmatr, 2, kernel.spezza.fun, dati=dati) # 65536 * 20 matrix
+  bandfhat <- rbind(bandmatr, fhat) # Each column: 2 bandwidth + 256^2 test point
+
+  Ucompl <- 1- apply(bandfhat, 2, onearg.superset.fun)
+  Ucomplfhat <- rbind(Ucompl, fhat)
+  FDXaugment<- apply(Ucomplfhat, 2, onearg.augment.fun)
+  FDXreject <- Ucompl + FDXaugment
+  print("2")
+  bandUcompl <- rbind(bandmatr, Ucompl)
+  Ushaved <- apply(bandUcompl, 2, onearg.shave.fun)
+
+  Ushavedfhat <- rbind(Ushaved, fhat)
+  augshaved <- apply(Ushavedfhat, 2, onearg.augment.fun)
+  augshaved <- Ushaved + augshaved
+
+  #
+  # Drop the 1 pixel situation
+  #
+  sep_rej <- quantitest * sep_ratio
+  inter_sum <- function(vec){
+    mat <- matrix(vec, quantitest, quantitest)
+    mat[1:floor(sep_rej), 1:floor(sep_rej)] <- 0
+    mat[ceiling(sep_rej):quantitest, ceiling(sep_rej):quantitest] <- 0
+    return(sum(mat))
+  }
+  selected_index <- which.max(apply(augshaved, 2, inter_sum))
+  finalset <- augshaved[, selected_index]
+  toplot <- matrix(finalset, quantitest, quantitest)
+  dat <- reshape2::melt(toplot)
+  p1 <- ggplot(dat, aes(Var1, -Var2)) +
+    geom_tile(aes(fill = value), show.legend = F) +
+    scale_fill_gradient(low = "white", high = "red") +
+    theme_classic() +
+    theme(aspect.ratio = 1,
+          axis.line = element_blank(),
+          axis.text.x = element_blank(),
+          axis.text.y = element_blank(),
+          axis.ticks = element_blank(),
+          axis.title.x = element_blank(),
+          axis.title.y = element_blank(),
+          legend.position = "none",
+          panel.border = element_blank(),
+          panel.grid.major = element_blank(),
+          panel.grid.minor = element_blank(),
+          plot.margin = unit(c(0,0,0,0), "lines"))+
+    geom_hline(yintercept = -sep_rej) + geom_vline(xintercept = sep_rej)
+  png("RR.png")
+  print(p1)
+  dev.off()
+
+  # extract kde matrix with the best selected bandwidth for breakpoint selection
+  kde_mat = matrix(fhat[,selected_index], quantitest, quantitest)
+  kde_long = reshape2::melt(kde_mat)
+
+  # write out rejection regions and kde smoothed matrix
+  if (inter_sum(finalset) >= 3){
+    write.table(toplot, file = "RR.txt", row.names = F, col.names = F)
+    write.table(kde_mat, file = "KDE.txt", row.names = F, col.names = F)
+  }
+  save(list=ls(all.names = TRUE), file="all_objects.Rdata")
+}
+
diff --git a/R/data.R b/R/data.R
@@ -0,0 +1,29 @@
+#' Chromosome 10 of T47D
+#'
+#' Intra region of chromosome 10 in cell line T47D (500kb resolution)
+#'
+#' @docType data
+#' @format The third column refers to rounded log count
+#' @usage data(chr10)
+#'
+"chr10"
+
+#' Chromosome 20 of T47D
+#'
+#' Intra region of chromosome 20 in cell line T47D (500kb resolution)
+#'
+#' @docType data
+#' @format The third column refers to rounded log count
+#' @usage data(chr20)
+#'
+"chr20"
+
+#' Chromosome 10_20 of T47D
+#'
+#' Inter region between chromosome 10 and 20 in cell line T47D (500kb resolution)
+#'
+#' @docType data
+#' @format The third column refers to rounded log count
+#' @usage data(chr10)
+#'
+"chr10_20"
diff --git a/README.md b/README.md
@@ -1,2 +1,55 @@
-# TranScan
-R-package for the paper entitled "Translocation Detection from Hi-C data via Scan Statistics'' by Anthony Cheng, Disheng Mao, Yuping Zhang, Joseph Glaz, Zhengqing Ouyang
+TranScan software presented in the paper entitled "Translocation Detection from Hi-C data via Scan Statistics" by Anthony Cheng, Disheng Mao, Yuping Zhang, Joseph Glaz, and Zhengqing Ouyang.
+
+The TranScan R package detects chromosome translocation events in genome-wide proximity ligation data such as Hi-C. The accompanying translocation breakpoint finder is post-processing scripts that are written in Python.
+
+# Software Requirements
+
+## OS Requirements
+This package has been tested on the following systems:
++ macOS Sierra (v 10.12.6)
++ Linux: CentOS 7
+
+## Dependencies and Prerequisties
+The following prerequisites are required to run TranScan
++ R (3.5.1)
++ ggplot2 (3.1.0)
++ dplyr (0.7.8)
++ ggpubr (0.2.5)
++ MASS (7.3-51.1)
+
+The following prerequisites are required to run the breakpoint finder
++ Python (2.7.14)
++ matplotlib (2.1.0)
++ numpy (1.15.2)
++ seaborn (0.9.0)
+
+### Optimized Performance (Optional)
+For an optimized performance, consider installing OpenBLAS or ATLAS and export
+the following environmental variable that points to the appropriate shared object.
+There is an approximately 5 fold improvement in the runtime performance as compared
+to the default R blas implementation: libRblas.
+
+### OpenBLAS
+```
+export LD_PRELOAD=</path/to/libopenblas.so>
+```
+
+### ATLAS
+```
+export LD_PRELOAD=</path/to/libatlas.so>
+```
+
+# Installation
+
+```
+install.packages("devtools")
+devtools::install("TranScan")
+library(TranScan)
+```
+
+# Examples/Demos
+Reproducible code examples can be found in the following HTML renders of the jupyter notebooks
++ Notebook01_Run_TranScan.html
+
+# License
+The implementations written for this project is covered by the GNU General Public License, version 3.0 (GPL-3.0).
diff --git a/TranScan-manual.pdf b/TranScan-manual.pdf
diff --git a/data/chr10.RData b/data/chr10.RData
diff --git a/data/chr10_20.RData b/data/chr10_20.RData
diff --git a/data/chr20.RData b/data/chr20.RData