Skip to content

Commit

Permalink
add data selection by tlen and read length
Browse files Browse the repository at this point in the history
  • Loading branch information
Ozonov committed Oct 8, 2024
1 parent cde5332 commit c5595a4
Show file tree
Hide file tree
Showing 15 changed files with 338 additions and 56 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: fetchNOMe
Type: Package
Title: Package for fast and convenient retrieval of NOMe-seq data from BAM files
Version: 0.3
Version: 0.4
Date: 2024-05-09
Authors@R: person(given = "Evgeniy A.", family="Ozonov", email = "[email protected]", role = c("aut", "cre"))
Description: This R package is tailored for fast and convenient retrieval of NOMe-seq data from BAM files aligned with QuasR, Bismark and BISCUIT.
Expand Down
12 changes: 6 additions & 6 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

fetch_cooc_ctable_from_bams_cpp <- function(infiles, regionChr, regionStart, regionEnd, max_spac, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed) {
.Call(`_fetchNOMe_fetch_cooc_ctable_from_bams_cpp`, infiles, regionChr, regionStart, regionEnd, max_spac, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed)
fetch_cooc_ctable_from_bams_cpp <- function(infiles, regionChr, regionStart, regionEnd, max_spac, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, absIsizeMin, absIsizeMax, min_read_size, max_read_size, alignerUsed) {
.Call(`_fetchNOMe_fetch_cooc_ctable_from_bams_cpp`, infiles, regionChr, regionStart, regionEnd, max_spac, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, absIsizeMin, absIsizeMax, min_read_size, max_read_size, alignerUsed)
}

fetch_data_matrix_from_bams_cpp <- function(whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed) {
.Call(`_fetchNOMe_fetch_data_matrix_from_bams_cpp`, whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed)
fetch_data_matrix_from_bams_cpp <- function(whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, absIsizeMin, absIsizeMax, min_read_size, max_read_size, alignerUsed) {
.Call(`_fetchNOMe_fetch_data_matrix_from_bams_cpp`, whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, absIsizeMin, absIsizeMax, min_read_size, max_read_size, alignerUsed)
}

fetch_protect_stats_from_bams_cpp <- function(infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed) {
.Call(`_fetchNOMe_fetch_protect_stats_from_bams_cpp`, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed)
fetch_protect_stats_from_bams_cpp <- function(infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, absIsizeMin, absIsizeMax, min_read_size, max_read_size, alignerUsed) {
.Call(`_fetchNOMe_fetch_protect_stats_from_bams_cpp`, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, absIsizeMin, absIsizeMax, min_read_size, max_read_size, alignerUsed)
}

30 changes: 26 additions & 4 deletions R/get_ctable_from_bams.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,12 @@
#' i.e. only fragments with number of non-GCH, and non-WCG cytosines higher then \code{min_bisC_size} are subject to filtering based on bisulfite conversion efficiency.
#' @param mapqMin \code{integer} setting minimum MAPQ of reads to be included into analysis.
#' @param mapqMax \code{integer} setting maximum MAPQ of reads to be included into analysis.
#' @param max_read_size maximum read length expected in the BAM file. This parameter only controls how much regions of interest are flanked left and right to extract reference sequences.
#' This parameter does not influence any filtering of fragments based on the length.
#' @param absIsizeMin \code{integer} or \code{NULL} (default). For paired-end experiments, minimal absolute insert
#' size (TLEN field in SAM Spec v1.4) of alignments to be included.
#' @param absIsizeMax \code{integer} or \code{NULL} (default). For paired-end experiments, maximal absolute insert
#' size (TLEN field in SAM Spec v1.4) of alignments to be included.
#' @param min_read_size \code{integer}. minimum read length to be included.
#' @param max_read_size \code{integer}. maximum read length to be included.
#' @param ncores number of cores to use.
#'
#' @return \code{tibble} where each row corresponds to sample - region combination.
Expand Down Expand Up @@ -60,6 +64,9 @@ get_ctable_from_bams <- function(bamfiles,
min_bisC_size = 10,
mapqMin = 0,
mapqMax = 255,
absIsizeMin = NULL,
absIsizeMax = NULL,
min_read_size = 25L,
max_read_size = 1000L,
ncores = 1L){

Expand All @@ -86,17 +93,28 @@ get_ctable_from_bams <- function(bamfiles,

alignerUsed <- match.arg(alignerUsed)

if (is.null(absIsizeMin)) # no tlen filtering
absIsizeMin <- -1L
if (is.null(absIsizeMax))
absIsizeMax <- -1L



message(paste0("Collecting co-occurrence frequencies from BAM files generated by ",alignerUsed,"."))


## load refsequences for regions
## extend regions by max_read_size from left and right
## extend regions from left and right
if(absIsizeMax > 0)
ext_len <- absIsizeMax
else
ext_len <- max_read_size

fa_index <- Rsamtools::scanFaIndex(genome)
GenomeInfoDb::seqlevels(regions) <- GenomeInfoDb::seqlevels(fa_index)
GenomeInfoDb::seqinfo(regions) <- suppressMessages(GenomeInfoDb::Seqinfo(seqnames = as.character(GenomeInfoDb::seqnames(fa_index)),
seqlengths = GenomicRanges::width(fa_index)))
refseq_gr <- suppressWarnings(GenomicRanges::resize(regions,width = GenomicRanges::width(regions) + 2*max_read_size,fix="center"))
refseq_gr <- suppressWarnings(GenomicRanges::resize(regions,width = GenomicRanges::width(regions) + 2*ext_len,fix="center"))

## trim to remove parts out of chromosomes
refseq_gr <- GenomicRanges::trim(refseq_gr)
Expand Down Expand Up @@ -132,6 +150,10 @@ get_ctable_from_bams <- function(bamfiles,
min_bisC_size = min_bisC_size,
mapqMin = mapqMin,
mapqMax = mapqMax,
absIsizeMin = absIsizeMin,
absIsizeMax = absIsizeMax,
min_read_size = min_read_size,
max_read_size = max_read_size,
alignerUsed = alignerUsed)


Expand Down
30 changes: 25 additions & 5 deletions R/get_data_matrix_from_bams.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,12 @@
#' i.e. only fragments with number of non-GCH, and non-WCG cytosines higher then \code{min_bisC_size} are subject to filtering based on bisulfite conversion efficiency.
#' @param mapqMin \code{integer} setting minimum MAPQ of reads to be included into analysis.
#' @param mapqMax \code{integer} setting maximum MAPQ of reads to be included into analysis.
#' @param max_read_size maximum read length expected in the BAM file. This parameter only controls how much regions of interest are flanked left and right to extract reference sequences.
#' This parameter does not influence any filtering of fragments based on the length.
#' @param absIsizeMin \code{integer} or \code{NULL} (default). For paired-end experiments, minimal absolute insert
#' size (TLEN field in SAM Spec v1.4) of alignments to be included.
#' @param absIsizeMax \code{integer} or \code{NULL} (default). For paired-end experiments, maximal absolute insert
#' size (TLEN field in SAM Spec v1.4) of alignments to be included.
#' @param min_read_size \code{integer}. minimum read length to be included.
#' @param max_read_size \code{integer}. maximum read length to be included.
#' @param ncores number of cores to use.
#' @return \code{tibble} where each row corresponds to sample - region combination.
#' Columns of the returned \code{tibble} represent:
Expand Down Expand Up @@ -63,6 +67,9 @@ get_data_matrix_from_bams <- function(bamfiles,
min_bisC_size = 10,
mapqMin = 0,
mapqMax = 255,
absIsizeMin = NULL,
absIsizeMax = NULL,
min_read_size = 25L,
max_read_size = 1000L,
ncores = 1L){
if(!inherits(bamfiles,"character"))
Expand All @@ -89,20 +96,29 @@ get_data_matrix_from_bams <- function(bamfiles,

alignerUsed <- match.arg(alignerUsed)

if (is.null(absIsizeMin)) # no tlen filtering
absIsizeMin <- -1L
if (is.null(absIsizeMax))
absIsizeMax <- -1L

message(paste0("Fetching data from BAM files generated by ",alignerUsed,"."))


## remove metadata from regions
GenomicRanges::mcols(regions) <- NULL

## load refsequences for regions
## extend regions by max_read_size from left and right


## extend regions from left and right
if(absIsizeMax > 0)
ext_len <- absIsizeMax
else
ext_len <- max_read_size
fa_index <- Rsamtools::scanFaIndex(genome)
GenomeInfoDb::seqlevels(regions) <- GenomeInfoDb::seqlevels(fa_index)
GenomeInfoDb::seqinfo(regions) <- suppressMessages(GenomeInfoDb::Seqinfo(seqnames = as.character(GenomeInfoDb::seqnames(fa_index)),
seqlengths = GenomicRanges::width(fa_index)))
refseq_gr <- suppressWarnings(GenomicRanges::resize(regions,width = GenomicRanges::width(regions) + 2*max_read_size,fix="center"))
refseq_gr <- suppressWarnings(GenomicRanges::resize(regions,width = GenomicRanges::width(regions) + 2*ext_len,fix="center"))

## trim to remove parts out of chromosomes
refseq_gr <- GenomicRanges::trim(refseq_gr)
Expand Down Expand Up @@ -140,6 +156,10 @@ get_data_matrix_from_bams <- function(bamfiles,
min_bisC_size = min_bisC_size,
mapqMin = mapqMin,
mapqMax = mapqMax,
absIsizeMin = absIsizeMin,
absIsizeMax = absIsizeMax,
min_read_size = min_read_size,
max_read_size = max_read_size,
alignerUsed = alignerUsed)

data_mat <- outlist[["DataMatrix"]]
Expand Down
29 changes: 23 additions & 6 deletions R/get_protect_stats_from_bams.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,12 @@
#' i.e. only fragments with number of non-GCH, and non-WCG cytosines higher then \code{min_bisC_size} are subject to filtering based on bisulfite conversion efficiency.
#' @param mapqMin \code{integer} setting minimum MAPQ of reads to be included into analysis.
#' @param mapqMax \code{integer} setting maximum MAPQ of reads to be included into analysis.
#' @param max_read_size maximum read length expected in the BAM file. This parameter only controls how much regions of interest are flanked left and right to extract reference sequences.
#' This parameter does not influence any filtering of fragments based on the length.
#' @param absIsizeMin \code{integer} or \code{NULL} (default). For paired-end experiments, minimal absolute insert
#' size (TLEN field in SAM Spec v1.4) of alignments to be included.
#' @param absIsizeMax \code{integer} or \code{NULL} (default). For paired-end experiments, maximal absolute insert
#' size (TLEN field in SAM Spec v1.4) of alignments to be included.
#' @param min_read_size \code{integer}. minimum read length to be included.
#' @param max_read_size \code{integer}. maximum read length to be included.
#' @param ncores number of cores to use.
#'
#' @return \code{tibble} where each row corresponds to sample - region combination.
Expand Down Expand Up @@ -60,6 +64,9 @@ get_protect_stats_from_bams <- function(bamfiles,
min_bisC_size = 10,
mapqMin = 0,
mapqMax = 255,
absIsizeMin = NULL,
absIsizeMax = NULL,
min_read_size = 25L,
max_read_size = 1000L,
ncores = 1L){
if(!inherits(bamfiles,"character"))
Expand All @@ -82,18 +89,24 @@ get_protect_stats_from_bams <- function(bamfiles,
stop("Could not find genome:", genome)

alignerUsed <- match.arg(alignerUsed)

if (is.null(absIsizeMin)) # no tlen filtering
absIsizeMin <- -1L
if (is.null(absIsizeMax))
absIsizeMax <- -1L
message(paste0("Collecting protection statistics from BAM files generated by ",alignerUsed,"."))


## load refsequences for regions
## extend regions by max_read_size from left and right

## extend regions from left and right
if(absIsizeMax > 0)
ext_len <- absIsizeMax
else
ext_len <- max_read_size
fa_index <- Rsamtools::scanFaIndex(genome)
GenomeInfoDb::seqlevels(regions) <- GenomeInfoDb::seqlevels(fa_index)
GenomeInfoDb::seqinfo(regions) <- suppressMessages(GenomeInfoDb::Seqinfo(seqnames = as.character(GenomeInfoDb::seqnames(fa_index)),
seqlengths = GenomicRanges::width(fa_index)))
refseq_gr <- suppressWarnings(GenomicRanges::resize(regions,width = GenomicRanges::width(regions) + 2*max_read_size,fix="center"))
refseq_gr <- suppressWarnings(GenomicRanges::resize(regions,width = GenomicRanges::width(regions) + 2*ext_len,fix="center"))

## trim to remove parts out of chromosomes
refseq_gr <- GenomicRanges::trim(refseq_gr)
Expand Down Expand Up @@ -129,6 +142,10 @@ get_protect_stats_from_bams <- function(bamfiles,
min_bisC_size = min_bisC_size,
mapqMin = mapqMin,
mapqMax = mapqMax,
absIsizeMin = absIsizeMin,
absIsizeMax = absIsizeMax,
min_read_size = min_read_size,
max_read_size = max_read_size,
alignerUsed = alignerUsed)

data_mat <- outlist[["ProtectStats"]]
Expand Down
14 changes: 12 additions & 2 deletions man/get_ctable_from_bams.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 12 additions & 2 deletions man/get_data_matrix_from_bams.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 12 additions & 2 deletions man/get_protect_stats_from_bams.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit c5595a4

Please sign in to comment.