From c5595a4ce97ef1c0b2c43a5235a0672819b6bb43 Mon Sep 17 00:00:00 2001 From: Ozonov Date: Tue, 8 Oct 2024 14:01:10 +0200 Subject: [PATCH] add data selection by tlen and read length --- DESCRIPTION | 2 +- R/RcppExports.R | 12 +- R/get_ctable_from_bams.R | 30 ++++- R/get_data_matrix_from_bams.R | 30 ++++- R/get_protect_stats_from_bams.R | 29 ++++- man/get_ctable_from_bams.Rd | 14 ++- man/get_data_matrix_from_bams.Rd | 14 ++- man/get_protect_stats_from_bams.Rd | 14 ++- src/RcppExports.cpp | 36 ++++-- src/fetch_data_from_bam.cpp | 27 +++-- src/fetch_data_from_bam.h | 4 + src/functions_export_rcpp.cpp | 49 +++++++- src/functions_export_rcpp.h | 12 ++ src/utilities.h | 16 ++- .../testthat/test-get_data_matrix_from_bams.R | 105 ++++++++++++++++++ 15 files changed, 338 insertions(+), 56 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 1263dd5..a8273a6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: fetchNOMe Type: Package Title: Package for fast and convenient retrieval of NOMe-seq data from BAM files -Version: 0.3 +Version: 0.4 Date: 2024-05-09 Authors@R: person(given = "Evgeniy A.", family="Ozonov", email = "evgeniy.ozonov@fmi.ch", role = c("aut", "cre")) Description: This R package is tailored for fast and convenient retrieval of NOMe-seq data from BAM files aligned with QuasR, Bismark and BISCUIT. diff --git a/R/RcppExports.R b/R/RcppExports.R index 2163da0..0b285ec 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,15 +1,15 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -fetch_cooc_ctable_from_bams_cpp <- function(infiles, regionChr, regionStart, regionEnd, max_spac, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed) { - .Call(`_fetchNOMe_fetch_cooc_ctable_from_bams_cpp`, infiles, regionChr, regionStart, regionEnd, max_spac, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed) +fetch_cooc_ctable_from_bams_cpp <- function(infiles, regionChr, regionStart, regionEnd, max_spac, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, absIsizeMin, absIsizeMax, min_read_size, max_read_size, alignerUsed) { + .Call(`_fetchNOMe_fetch_cooc_ctable_from_bams_cpp`, infiles, regionChr, regionStart, regionEnd, max_spac, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, absIsizeMin, absIsizeMax, min_read_size, max_read_size, alignerUsed) } -fetch_data_matrix_from_bams_cpp <- function(whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed) { - .Call(`_fetchNOMe_fetch_data_matrix_from_bams_cpp`, whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed) +fetch_data_matrix_from_bams_cpp <- function(whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, absIsizeMin, absIsizeMax, min_read_size, max_read_size, alignerUsed) { + .Call(`_fetchNOMe_fetch_data_matrix_from_bams_cpp`, whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, absIsizeMin, absIsizeMax, min_read_size, max_read_size, alignerUsed) } -fetch_protect_stats_from_bams_cpp <- function(infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed) { - .Call(`_fetchNOMe_fetch_protect_stats_from_bams_cpp`, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed) +fetch_protect_stats_from_bams_cpp <- function(infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, absIsizeMin, absIsizeMax, min_read_size, max_read_size, alignerUsed) { + .Call(`_fetchNOMe_fetch_protect_stats_from_bams_cpp`, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, absIsizeMin, absIsizeMax, min_read_size, max_read_size, alignerUsed) } diff --git a/R/get_ctable_from_bams.R b/R/get_ctable_from_bams.R index 729aad3..63c5d38 100644 --- a/R/get_ctable_from_bams.R +++ b/R/get_ctable_from_bams.R @@ -22,8 +22,12 @@ #' i.e. only fragments with number of non-GCH, and non-WCG cytosines higher then \code{min_bisC_size} are subject to filtering based on bisulfite conversion efficiency. #' @param mapqMin \code{integer} setting minimum MAPQ of reads to be included into analysis. #' @param mapqMax \code{integer} setting maximum MAPQ of reads to be included into analysis. -#' @param max_read_size maximum read length expected in the BAM file. This parameter only controls how much regions of interest are flanked left and right to extract reference sequences. -#' This parameter does not influence any filtering of fragments based on the length. +#' @param absIsizeMin \code{integer} or \code{NULL} (default). For paired-end experiments, minimal absolute insert +#' size (TLEN field in SAM Spec v1.4) of alignments to be included. +#' @param absIsizeMax \code{integer} or \code{NULL} (default). For paired-end experiments, maximal absolute insert +#' size (TLEN field in SAM Spec v1.4) of alignments to be included. +#' @param min_read_size \code{integer}. minimum read length to be included. +#' @param max_read_size \code{integer}. maximum read length to be included. #' @param ncores number of cores to use. #' #' @return \code{tibble} where each row corresponds to sample - region combination. @@ -60,6 +64,9 @@ get_ctable_from_bams <- function(bamfiles, min_bisC_size = 10, mapqMin = 0, mapqMax = 255, + absIsizeMin = NULL, + absIsizeMax = NULL, + min_read_size = 25L, max_read_size = 1000L, ncores = 1L){ @@ -86,17 +93,28 @@ get_ctable_from_bams <- function(bamfiles, alignerUsed <- match.arg(alignerUsed) + if (is.null(absIsizeMin)) # no tlen filtering + absIsizeMin <- -1L + if (is.null(absIsizeMax)) + absIsizeMax <- -1L + + + message(paste0("Collecting co-occurrence frequencies from BAM files generated by ",alignerUsed,".")) ## load refsequences for regions - ## extend regions by max_read_size from left and right + ## extend regions from left and right + if(absIsizeMax > 0) + ext_len <- absIsizeMax + else + ext_len <- max_read_size fa_index <- Rsamtools::scanFaIndex(genome) GenomeInfoDb::seqlevels(regions) <- GenomeInfoDb::seqlevels(fa_index) GenomeInfoDb::seqinfo(regions) <- suppressMessages(GenomeInfoDb::Seqinfo(seqnames = as.character(GenomeInfoDb::seqnames(fa_index)), seqlengths = GenomicRanges::width(fa_index))) - refseq_gr <- suppressWarnings(GenomicRanges::resize(regions,width = GenomicRanges::width(regions) + 2*max_read_size,fix="center")) + refseq_gr <- suppressWarnings(GenomicRanges::resize(regions,width = GenomicRanges::width(regions) + 2*ext_len,fix="center")) ## trim to remove parts out of chromosomes refseq_gr <- GenomicRanges::trim(refseq_gr) @@ -132,6 +150,10 @@ get_ctable_from_bams <- function(bamfiles, min_bisC_size = min_bisC_size, mapqMin = mapqMin, mapqMax = mapqMax, + absIsizeMin = absIsizeMin, + absIsizeMax = absIsizeMax, + min_read_size = min_read_size, + max_read_size = max_read_size, alignerUsed = alignerUsed) diff --git a/R/get_data_matrix_from_bams.R b/R/get_data_matrix_from_bams.R index 12af51a..134a5b2 100644 --- a/R/get_data_matrix_from_bams.R +++ b/R/get_data_matrix_from_bams.R @@ -27,8 +27,12 @@ #' i.e. only fragments with number of non-GCH, and non-WCG cytosines higher then \code{min_bisC_size} are subject to filtering based on bisulfite conversion efficiency. #' @param mapqMin \code{integer} setting minimum MAPQ of reads to be included into analysis. #' @param mapqMax \code{integer} setting maximum MAPQ of reads to be included into analysis. -#' @param max_read_size maximum read length expected in the BAM file. This parameter only controls how much regions of interest are flanked left and right to extract reference sequences. -#' This parameter does not influence any filtering of fragments based on the length. +#' @param absIsizeMin \code{integer} or \code{NULL} (default). For paired-end experiments, minimal absolute insert +#' size (TLEN field in SAM Spec v1.4) of alignments to be included. +#' @param absIsizeMax \code{integer} or \code{NULL} (default). For paired-end experiments, maximal absolute insert +#' size (TLEN field in SAM Spec v1.4) of alignments to be included. +#' @param min_read_size \code{integer}. minimum read length to be included. +#' @param max_read_size \code{integer}. maximum read length to be included. #' @param ncores number of cores to use. #' @return \code{tibble} where each row corresponds to sample - region combination. #' Columns of the returned \code{tibble} represent: @@ -63,6 +67,9 @@ get_data_matrix_from_bams <- function(bamfiles, min_bisC_size = 10, mapqMin = 0, mapqMax = 255, + absIsizeMin = NULL, + absIsizeMax = NULL, + min_read_size = 25L, max_read_size = 1000L, ncores = 1L){ if(!inherits(bamfiles,"character")) @@ -89,6 +96,11 @@ get_data_matrix_from_bams <- function(bamfiles, alignerUsed <- match.arg(alignerUsed) + if (is.null(absIsizeMin)) # no tlen filtering + absIsizeMin <- -1L + if (is.null(absIsizeMax)) + absIsizeMax <- -1L + message(paste0("Fetching data from BAM files generated by ",alignerUsed,".")) @@ -96,13 +108,17 @@ get_data_matrix_from_bams <- function(bamfiles, GenomicRanges::mcols(regions) <- NULL ## load refsequences for regions - ## extend regions by max_read_size from left and right - + + ## extend regions from left and right + if(absIsizeMax > 0) + ext_len <- absIsizeMax + else + ext_len <- max_read_size fa_index <- Rsamtools::scanFaIndex(genome) GenomeInfoDb::seqlevels(regions) <- GenomeInfoDb::seqlevels(fa_index) GenomeInfoDb::seqinfo(regions) <- suppressMessages(GenomeInfoDb::Seqinfo(seqnames = as.character(GenomeInfoDb::seqnames(fa_index)), seqlengths = GenomicRanges::width(fa_index))) - refseq_gr <- suppressWarnings(GenomicRanges::resize(regions,width = GenomicRanges::width(regions) + 2*max_read_size,fix="center")) + refseq_gr <- suppressWarnings(GenomicRanges::resize(regions,width = GenomicRanges::width(regions) + 2*ext_len,fix="center")) ## trim to remove parts out of chromosomes refseq_gr <- GenomicRanges::trim(refseq_gr) @@ -140,6 +156,10 @@ get_data_matrix_from_bams <- function(bamfiles, min_bisC_size = min_bisC_size, mapqMin = mapqMin, mapqMax = mapqMax, + absIsizeMin = absIsizeMin, + absIsizeMax = absIsizeMax, + min_read_size = min_read_size, + max_read_size = max_read_size, alignerUsed = alignerUsed) data_mat <- outlist[["DataMatrix"]] diff --git a/R/get_protect_stats_from_bams.R b/R/get_protect_stats_from_bams.R index 93e3721..e22b1fa 100644 --- a/R/get_protect_stats_from_bams.R +++ b/R/get_protect_stats_from_bams.R @@ -19,8 +19,12 @@ #' i.e. only fragments with number of non-GCH, and non-WCG cytosines higher then \code{min_bisC_size} are subject to filtering based on bisulfite conversion efficiency. #' @param mapqMin \code{integer} setting minimum MAPQ of reads to be included into analysis. #' @param mapqMax \code{integer} setting maximum MAPQ of reads to be included into analysis. -#' @param max_read_size maximum read length expected in the BAM file. This parameter only controls how much regions of interest are flanked left and right to extract reference sequences. -#' This parameter does not influence any filtering of fragments based on the length. +#' @param absIsizeMin \code{integer} or \code{NULL} (default). For paired-end experiments, minimal absolute insert +#' size (TLEN field in SAM Spec v1.4) of alignments to be included. +#' @param absIsizeMax \code{integer} or \code{NULL} (default). For paired-end experiments, maximal absolute insert +#' size (TLEN field in SAM Spec v1.4) of alignments to be included. +#' @param min_read_size \code{integer}. minimum read length to be included. +#' @param max_read_size \code{integer}. maximum read length to be included. #' @param ncores number of cores to use. #' #' @return \code{tibble} where each row corresponds to sample - region combination. @@ -60,6 +64,9 @@ get_protect_stats_from_bams <- function(bamfiles, min_bisC_size = 10, mapqMin = 0, mapqMax = 255, + absIsizeMin = NULL, + absIsizeMax = NULL, + min_read_size = 25L, max_read_size = 1000L, ncores = 1L){ if(!inherits(bamfiles,"character")) @@ -82,18 +89,24 @@ get_protect_stats_from_bams <- function(bamfiles, stop("Could not find genome:", genome) alignerUsed <- match.arg(alignerUsed) - + if (is.null(absIsizeMin)) # no tlen filtering + absIsizeMin <- -1L + if (is.null(absIsizeMax)) + absIsizeMax <- -1L message(paste0("Collecting protection statistics from BAM files generated by ",alignerUsed,".")) ## load refsequences for regions - ## extend regions by max_read_size from left and right - + ## extend regions from left and right + if(absIsizeMax > 0) + ext_len <- absIsizeMax + else + ext_len <- max_read_size fa_index <- Rsamtools::scanFaIndex(genome) GenomeInfoDb::seqlevels(regions) <- GenomeInfoDb::seqlevels(fa_index) GenomeInfoDb::seqinfo(regions) <- suppressMessages(GenomeInfoDb::Seqinfo(seqnames = as.character(GenomeInfoDb::seqnames(fa_index)), seqlengths = GenomicRanges::width(fa_index))) - refseq_gr <- suppressWarnings(GenomicRanges::resize(regions,width = GenomicRanges::width(regions) + 2*max_read_size,fix="center")) + refseq_gr <- suppressWarnings(GenomicRanges::resize(regions,width = GenomicRanges::width(regions) + 2*ext_len,fix="center")) ## trim to remove parts out of chromosomes refseq_gr <- GenomicRanges::trim(refseq_gr) @@ -129,6 +142,10 @@ get_protect_stats_from_bams <- function(bamfiles, min_bisC_size = min_bisC_size, mapqMin = mapqMin, mapqMax = mapqMax, + absIsizeMin = absIsizeMin, + absIsizeMax = absIsizeMax, + min_read_size = min_read_size, + max_read_size = max_read_size, alignerUsed = alignerUsed) data_mat <- outlist[["ProtectStats"]] diff --git a/man/get_ctable_from_bams.Rd b/man/get_ctable_from_bams.Rd index 73550f1..114627e 100644 --- a/man/get_ctable_from_bams.Rd +++ b/man/get_ctable_from_bams.Rd @@ -19,6 +19,9 @@ get_ctable_from_bams( min_bisC_size = 10, mapqMin = 0, mapqMax = 255, + absIsizeMin = NULL, + absIsizeMax = NULL, + min_read_size = 25L, max_read_size = 1000L, ncores = 1L ) @@ -54,8 +57,15 @@ i.e. only fragments with number of non-GCH, and non-WCG cytosines higher then \c \item{mapqMax}{\code{integer} setting maximum MAPQ of reads to be included into analysis.} -\item{max_read_size}{maximum read length expected in the BAM file. This parameter only controls how much regions of interest are flanked left and right to extract reference sequences. -This parameter does not influence any filtering of fragments based on the length.} +\item{absIsizeMin}{\code{integer} or \code{NULL} (default). For paired-end experiments, minimal absolute insert +size (TLEN field in SAM Spec v1.4) of alignments to be included.} + +\item{absIsizeMax}{\code{integer} or \code{NULL} (default). For paired-end experiments, maximal absolute insert +size (TLEN field in SAM Spec v1.4) of alignments to be included.} + +\item{min_read_size}{\code{integer}. minimum read length to be included.} + +\item{max_read_size}{\code{integer}. maximum read length to be included.} \item{ncores}{number of cores to use.} } diff --git a/man/get_data_matrix_from_bams.Rd b/man/get_data_matrix_from_bams.Rd index 54c43e4..aa37a89 100644 --- a/man/get_data_matrix_from_bams.Rd +++ b/man/get_data_matrix_from_bams.Rd @@ -19,6 +19,9 @@ get_data_matrix_from_bams( min_bisC_size = 10, mapqMin = 0, mapqMax = 255, + absIsizeMin = NULL, + absIsizeMax = NULL, + min_read_size = 25L, max_read_size = 1000L, ncores = 1L ) @@ -61,8 +64,15 @@ i.e. only fragments with number of non-GCH, and non-WCG cytosines higher then \c \item{mapqMax}{\code{integer} setting maximum MAPQ of reads to be included into analysis.} -\item{max_read_size}{maximum read length expected in the BAM file. This parameter only controls how much regions of interest are flanked left and right to extract reference sequences. -This parameter does not influence any filtering of fragments based on the length.} +\item{absIsizeMin}{\code{integer} or \code{NULL} (default). For paired-end experiments, minimal absolute insert +size (TLEN field in SAM Spec v1.4) of alignments to be included.} + +\item{absIsizeMax}{\code{integer} or \code{NULL} (default). For paired-end experiments, maximal absolute insert +size (TLEN field in SAM Spec v1.4) of alignments to be included.} + +\item{min_read_size}{\code{integer}. minimum read length to be included.} + +\item{max_read_size}{\code{integer}. maximum read length to be included.} \item{ncores}{number of cores to use.} } diff --git a/man/get_protect_stats_from_bams.Rd b/man/get_protect_stats_from_bams.Rd index 90d48cd..a201c5e 100644 --- a/man/get_protect_stats_from_bams.Rd +++ b/man/get_protect_stats_from_bams.Rd @@ -18,6 +18,9 @@ get_protect_stats_from_bams( min_bisC_size = 10, mapqMin = 0, mapqMax = 255, + absIsizeMin = NULL, + absIsizeMax = NULL, + min_read_size = 25L, max_read_size = 1000L, ncores = 1L ) @@ -51,8 +54,15 @@ i.e. only fragments with number of non-GCH, and non-WCG cytosines higher then \c \item{mapqMax}{\code{integer} setting maximum MAPQ of reads to be included into analysis.} -\item{max_read_size}{maximum read length expected in the BAM file. This parameter only controls how much regions of interest are flanked left and right to extract reference sequences. -This parameter does not influence any filtering of fragments based on the length.} +\item{absIsizeMin}{\code{integer} or \code{NULL} (default). For paired-end experiments, minimal absolute insert +size (TLEN field in SAM Spec v1.4) of alignments to be included.} + +\item{absIsizeMax}{\code{integer} or \code{NULL} (default). For paired-end experiments, maximal absolute insert +size (TLEN field in SAM Spec v1.4) of alignments to be included.} + +\item{min_read_size}{\code{integer}. minimum read length to be included.} + +\item{max_read_size}{\code{integer}. maximum read length to be included.} \item{ncores}{number of cores to use.} } diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index de05d37..b1a19d8 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -12,8 +12,8 @@ Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif // fetch_cooc_ctable_from_bams_cpp -Rcpp::List fetch_cooc_ctable_from_bams_cpp(const Rcpp::CharacterVector& infiles, const Rcpp::CharacterVector& regionChr, const Rcpp::IntegerVector& regionStart, const Rcpp::IntegerVector& regionEnd, const Rcpp::IntegerVector& max_spac, const Rcpp::CharacterVector& seqstring, const Rcpp::IntegerVector& seqStart, const Rcpp::IntegerVector& seqEnd, const Rcpp::LogicalVector& remove_nonunique, const Rcpp::IntegerVector& clip_until_nbg, const Rcpp::NumericVector& max_protect_frac, const Rcpp::NumericVector& max_bisC_meth, const Rcpp::IntegerVector& min_bisC_size, const Rcpp::IntegerVector& mapqMin, const Rcpp::IntegerVector& mapqMax, const Rcpp::CharacterVector& alignerUsed); -RcppExport SEXP _fetchNOMe_fetch_cooc_ctable_from_bams_cpp(SEXP infilesSEXP, SEXP regionChrSEXP, SEXP regionStartSEXP, SEXP regionEndSEXP, SEXP max_spacSEXP, SEXP seqstringSEXP, SEXP seqStartSEXP, SEXP seqEndSEXP, SEXP remove_nonuniqueSEXP, SEXP clip_until_nbgSEXP, SEXP max_protect_fracSEXP, SEXP max_bisC_methSEXP, SEXP min_bisC_sizeSEXP, SEXP mapqMinSEXP, SEXP mapqMaxSEXP, SEXP alignerUsedSEXP) { +Rcpp::List fetch_cooc_ctable_from_bams_cpp(const Rcpp::CharacterVector& infiles, const Rcpp::CharacterVector& regionChr, const Rcpp::IntegerVector& regionStart, const Rcpp::IntegerVector& regionEnd, const Rcpp::IntegerVector& max_spac, const Rcpp::CharacterVector& seqstring, const Rcpp::IntegerVector& seqStart, const Rcpp::IntegerVector& seqEnd, const Rcpp::LogicalVector& remove_nonunique, const Rcpp::IntegerVector& clip_until_nbg, const Rcpp::NumericVector& max_protect_frac, const Rcpp::NumericVector& max_bisC_meth, const Rcpp::IntegerVector& min_bisC_size, const Rcpp::IntegerVector& mapqMin, const Rcpp::IntegerVector& mapqMax, const Rcpp::IntegerVector& absIsizeMin, const Rcpp::IntegerVector& absIsizeMax, const Rcpp::IntegerVector& min_read_size, const Rcpp::IntegerVector& max_read_size, const Rcpp::CharacterVector& alignerUsed); +RcppExport SEXP _fetchNOMe_fetch_cooc_ctable_from_bams_cpp(SEXP infilesSEXP, SEXP regionChrSEXP, SEXP regionStartSEXP, SEXP regionEndSEXP, SEXP max_spacSEXP, SEXP seqstringSEXP, SEXP seqStartSEXP, SEXP seqEndSEXP, SEXP remove_nonuniqueSEXP, SEXP clip_until_nbgSEXP, SEXP max_protect_fracSEXP, SEXP max_bisC_methSEXP, SEXP min_bisC_sizeSEXP, SEXP mapqMinSEXP, SEXP mapqMaxSEXP, SEXP absIsizeMinSEXP, SEXP absIsizeMaxSEXP, SEXP min_read_sizeSEXP, SEXP max_read_sizeSEXP, SEXP alignerUsedSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -32,14 +32,18 @@ BEGIN_RCPP Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type min_bisC_size(min_bisC_sizeSEXP); Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type mapqMin(mapqMinSEXP); Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type mapqMax(mapqMaxSEXP); + Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type absIsizeMin(absIsizeMinSEXP); + Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type absIsizeMax(absIsizeMaxSEXP); + Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type min_read_size(min_read_sizeSEXP); + Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type max_read_size(max_read_sizeSEXP); Rcpp::traits::input_parameter< const Rcpp::CharacterVector& >::type alignerUsed(alignerUsedSEXP); - rcpp_result_gen = Rcpp::wrap(fetch_cooc_ctable_from_bams_cpp(infiles, regionChr, regionStart, regionEnd, max_spac, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed)); + rcpp_result_gen = Rcpp::wrap(fetch_cooc_ctable_from_bams_cpp(infiles, regionChr, regionStart, regionEnd, max_spac, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, absIsizeMin, absIsizeMax, min_read_size, max_read_size, alignerUsed)); return rcpp_result_gen; END_RCPP } // fetch_data_matrix_from_bams_cpp -Rcpp::List fetch_data_matrix_from_bams_cpp(const Rcpp::CharacterVector& whichContext, const Rcpp::CharacterVector& infiles, const Rcpp::CharacterVector& regionChr, const Rcpp::IntegerVector& regionStart, const Rcpp::IntegerVector& regionEnd, const Rcpp::CharacterVector& seqstring, const Rcpp::IntegerVector& seqStart, const Rcpp::IntegerVector& seqEnd, const Rcpp::LogicalVector& remove_nonunique, const Rcpp::IntegerVector& clip_until_nbg, const Rcpp::NumericVector& max_protect_frac, const Rcpp::NumericVector& max_bisC_meth, const Rcpp::IntegerVector& min_bisC_size, const Rcpp::IntegerVector& mapqMin, const Rcpp::IntegerVector& mapqMax, const Rcpp::CharacterVector& alignerUsed); -RcppExport SEXP _fetchNOMe_fetch_data_matrix_from_bams_cpp(SEXP whichContextSEXP, SEXP infilesSEXP, SEXP regionChrSEXP, SEXP regionStartSEXP, SEXP regionEndSEXP, SEXP seqstringSEXP, SEXP seqStartSEXP, SEXP seqEndSEXP, SEXP remove_nonuniqueSEXP, SEXP clip_until_nbgSEXP, SEXP max_protect_fracSEXP, SEXP max_bisC_methSEXP, SEXP min_bisC_sizeSEXP, SEXP mapqMinSEXP, SEXP mapqMaxSEXP, SEXP alignerUsedSEXP) { +Rcpp::List fetch_data_matrix_from_bams_cpp(const Rcpp::CharacterVector& whichContext, const Rcpp::CharacterVector& infiles, const Rcpp::CharacterVector& regionChr, const Rcpp::IntegerVector& regionStart, const Rcpp::IntegerVector& regionEnd, const Rcpp::CharacterVector& seqstring, const Rcpp::IntegerVector& seqStart, const Rcpp::IntegerVector& seqEnd, const Rcpp::LogicalVector& remove_nonunique, const Rcpp::IntegerVector& clip_until_nbg, const Rcpp::NumericVector& max_protect_frac, const Rcpp::NumericVector& max_bisC_meth, const Rcpp::IntegerVector& min_bisC_size, const Rcpp::IntegerVector& mapqMin, const Rcpp::IntegerVector& mapqMax, const Rcpp::IntegerVector& absIsizeMin, const Rcpp::IntegerVector& absIsizeMax, const Rcpp::IntegerVector& min_read_size, const Rcpp::IntegerVector& max_read_size, const Rcpp::CharacterVector& alignerUsed); +RcppExport SEXP _fetchNOMe_fetch_data_matrix_from_bams_cpp(SEXP whichContextSEXP, SEXP infilesSEXP, SEXP regionChrSEXP, SEXP regionStartSEXP, SEXP regionEndSEXP, SEXP seqstringSEXP, SEXP seqStartSEXP, SEXP seqEndSEXP, SEXP remove_nonuniqueSEXP, SEXP clip_until_nbgSEXP, SEXP max_protect_fracSEXP, SEXP max_bisC_methSEXP, SEXP min_bisC_sizeSEXP, SEXP mapqMinSEXP, SEXP mapqMaxSEXP, SEXP absIsizeMinSEXP, SEXP absIsizeMaxSEXP, SEXP min_read_sizeSEXP, SEXP max_read_sizeSEXP, SEXP alignerUsedSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -58,14 +62,18 @@ BEGIN_RCPP Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type min_bisC_size(min_bisC_sizeSEXP); Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type mapqMin(mapqMinSEXP); Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type mapqMax(mapqMaxSEXP); + Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type absIsizeMin(absIsizeMinSEXP); + Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type absIsizeMax(absIsizeMaxSEXP); + Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type min_read_size(min_read_sizeSEXP); + Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type max_read_size(max_read_sizeSEXP); Rcpp::traits::input_parameter< const Rcpp::CharacterVector& >::type alignerUsed(alignerUsedSEXP); - rcpp_result_gen = Rcpp::wrap(fetch_data_matrix_from_bams_cpp(whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed)); + rcpp_result_gen = Rcpp::wrap(fetch_data_matrix_from_bams_cpp(whichContext, infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, absIsizeMin, absIsizeMax, min_read_size, max_read_size, alignerUsed)); return rcpp_result_gen; END_RCPP } // fetch_protect_stats_from_bams_cpp -Rcpp::List fetch_protect_stats_from_bams_cpp(const Rcpp::CharacterVector& infiles, const Rcpp::CharacterVector& regionChr, const Rcpp::IntegerVector& regionStart, const Rcpp::IntegerVector& regionEnd, const Rcpp::CharacterVector& seqstring, const Rcpp::IntegerVector& seqStart, const Rcpp::IntegerVector& seqEnd, const Rcpp::LogicalVector& remove_nonunique, const Rcpp::IntegerVector& clip_until_nbg, const Rcpp::NumericVector& max_protect_frac, const Rcpp::NumericVector& max_bisC_meth, const Rcpp::IntegerVector& min_bisC_size, const Rcpp::IntegerVector& mapqMin, const Rcpp::IntegerVector& mapqMax, const Rcpp::CharacterVector& alignerUsed); -RcppExport SEXP _fetchNOMe_fetch_protect_stats_from_bams_cpp(SEXP infilesSEXP, SEXP regionChrSEXP, SEXP regionStartSEXP, SEXP regionEndSEXP, SEXP seqstringSEXP, SEXP seqStartSEXP, SEXP seqEndSEXP, SEXP remove_nonuniqueSEXP, SEXP clip_until_nbgSEXP, SEXP max_protect_fracSEXP, SEXP max_bisC_methSEXP, SEXP min_bisC_sizeSEXP, SEXP mapqMinSEXP, SEXP mapqMaxSEXP, SEXP alignerUsedSEXP) { +Rcpp::List fetch_protect_stats_from_bams_cpp(const Rcpp::CharacterVector& infiles, const Rcpp::CharacterVector& regionChr, const Rcpp::IntegerVector& regionStart, const Rcpp::IntegerVector& regionEnd, const Rcpp::CharacterVector& seqstring, const Rcpp::IntegerVector& seqStart, const Rcpp::IntegerVector& seqEnd, const Rcpp::LogicalVector& remove_nonunique, const Rcpp::IntegerVector& clip_until_nbg, const Rcpp::NumericVector& max_protect_frac, const Rcpp::NumericVector& max_bisC_meth, const Rcpp::IntegerVector& min_bisC_size, const Rcpp::IntegerVector& mapqMin, const Rcpp::IntegerVector& mapqMax, const Rcpp::IntegerVector& absIsizeMin, const Rcpp::IntegerVector& absIsizeMax, const Rcpp::IntegerVector& min_read_size, const Rcpp::IntegerVector& max_read_size, const Rcpp::CharacterVector& alignerUsed); +RcppExport SEXP _fetchNOMe_fetch_protect_stats_from_bams_cpp(SEXP infilesSEXP, SEXP regionChrSEXP, SEXP regionStartSEXP, SEXP regionEndSEXP, SEXP seqstringSEXP, SEXP seqStartSEXP, SEXP seqEndSEXP, SEXP remove_nonuniqueSEXP, SEXP clip_until_nbgSEXP, SEXP max_protect_fracSEXP, SEXP max_bisC_methSEXP, SEXP min_bisC_sizeSEXP, SEXP mapqMinSEXP, SEXP mapqMaxSEXP, SEXP absIsizeMinSEXP, SEXP absIsizeMaxSEXP, SEXP min_read_sizeSEXP, SEXP max_read_sizeSEXP, SEXP alignerUsedSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -83,16 +91,20 @@ BEGIN_RCPP Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type min_bisC_size(min_bisC_sizeSEXP); Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type mapqMin(mapqMinSEXP); Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type mapqMax(mapqMaxSEXP); + Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type absIsizeMin(absIsizeMinSEXP); + Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type absIsizeMax(absIsizeMaxSEXP); + Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type min_read_size(min_read_sizeSEXP); + Rcpp::traits::input_parameter< const Rcpp::IntegerVector& >::type max_read_size(max_read_sizeSEXP); Rcpp::traits::input_parameter< const Rcpp::CharacterVector& >::type alignerUsed(alignerUsedSEXP); - rcpp_result_gen = Rcpp::wrap(fetch_protect_stats_from_bams_cpp(infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, alignerUsed)); + rcpp_result_gen = Rcpp::wrap(fetch_protect_stats_from_bams_cpp(infiles, regionChr, regionStart, regionEnd, seqstring, seqStart, seqEnd, remove_nonunique, clip_until_nbg, max_protect_frac, max_bisC_meth, min_bisC_size, mapqMin, mapqMax, absIsizeMin, absIsizeMax, min_read_size, max_read_size, alignerUsed)); return rcpp_result_gen; END_RCPP } static const R_CallMethodDef CallEntries[] = { - {"_fetchNOMe_fetch_cooc_ctable_from_bams_cpp", (DL_FUNC) &_fetchNOMe_fetch_cooc_ctable_from_bams_cpp, 16}, - {"_fetchNOMe_fetch_data_matrix_from_bams_cpp", (DL_FUNC) &_fetchNOMe_fetch_data_matrix_from_bams_cpp, 16}, - {"_fetchNOMe_fetch_protect_stats_from_bams_cpp", (DL_FUNC) &_fetchNOMe_fetch_protect_stats_from_bams_cpp, 15}, + {"_fetchNOMe_fetch_cooc_ctable_from_bams_cpp", (DL_FUNC) &_fetchNOMe_fetch_cooc_ctable_from_bams_cpp, 20}, + {"_fetchNOMe_fetch_data_matrix_from_bams_cpp", (DL_FUNC) &_fetchNOMe_fetch_data_matrix_from_bams_cpp, 20}, + {"_fetchNOMe_fetch_protect_stats_from_bams_cpp", (DL_FUNC) &_fetchNOMe_fetch_protect_stats_from_bams_cpp, 19}, {NULL, NULL, 0} }; diff --git a/src/fetch_data_from_bam.cpp b/src/fetch_data_from_bam.cpp index 991209d..99d2b21 100644 --- a/src/fetch_data_from_bam.cpp +++ b/src/fetch_data_from_bam.cpp @@ -369,13 +369,26 @@ static int collectRegionData_Bisq(const bam1_t *hit, void *data) { // checks validity of an alignment bool isValidAln(const bam1_t *hit,const obj_pnts* pdata){ - return(!bam_is_not_primary(hit) && - !bam_is_qc_fail(hit) && - !bam_is_pcr_dupl(hit) && - !bam_is_suppl(hit) && - ((int )(hit->core).qual >= pdata->mapqMin) && - ((int )(hit->core).qual <= pdata->mapqMax) - ); + bool valaln = !bam_is_not_primary(hit) && + !bam_is_qc_fail(hit) && + !bam_is_pcr_dupl(hit) && + !bam_is_suppl(hit) && + ((int )(hit->core).qual >= pdata->mapqMin) && + ((int )(hit->core).qual <= pdata->mapqMax) && + ((int )(hit->core).l_qseq >= pdata->min_read_size) && + ((int )(hit->core).l_qseq <= pdata->max_read_size); + + // check insert length + if(bam_is_proper_pair(hit)){ + if(pdata->absIsizeMin > 0){ + valaln = valaln && llabs(((int )(hit->core).isize)) >= pdata->absIsizeMin; + } + if(pdata->absIsizeMax > 0){ + valaln = valaln && llabs(((int )(hit->core).isize)) <= pdata->absIsizeMax; + } + } + + return(valaln); } diff --git a/src/fetch_data_from_bam.h b/src/fetch_data_from_bam.h index 40ffd77..f648e55 100644 --- a/src/fetch_data_from_bam.h +++ b/src/fetch_data_from_bam.h @@ -31,6 +31,10 @@ typedef struct { // for use in callback function. contains pointers to required regionData *reg_data; // point to an object which stores data in region int mapqMin; // minimum mapping quality score int mapqMax; // maximum mapping quality score + int absIsizeMin; // minimum absolute tlen for paired reads + int absIsizeMax; // maximum absolute tlen for paired reads + int min_read_size; // minimum read length + int max_read_size; // maximum read length string prefix; // prefix for qnames to avoid problems with identical qnames in different bams } obj_pnts; diff --git a/src/functions_export_rcpp.cpp b/src/functions_export_rcpp.cpp index 9e588f6..40cd79e 100644 --- a/src/functions_export_rcpp.cpp +++ b/src/functions_export_rcpp.cpp @@ -17,6 +17,10 @@ Rcpp::List fetch_cooc_ctable_from_bams_cpp(const Rcpp::CharacterVector& infiles, const Rcpp::IntegerVector& min_bisC_size, const Rcpp::IntegerVector& mapqMin, const Rcpp::IntegerVector& mapqMax, + const Rcpp::IntegerVector& absIsizeMin, + const Rcpp::IntegerVector& absIsizeMax, + const Rcpp::IntegerVector& min_read_size, + const Rcpp::IntegerVector& max_read_size, const Rcpp::CharacterVector& alignerUsed){ string alignerUsed_ = Rcpp::as(alignerUsed); @@ -47,6 +51,11 @@ Rcpp::List fetch_cooc_ctable_from_bams_cpp(const Rcpp::CharacterVector& infiles, int min_bisC_size_ = Rcpp::as(min_bisC_size); int mapqMin_ = Rcpp::as(mapqMin); int mapqMax_ = Rcpp::as(mapqMax); + int absIsizeMin_ = Rcpp::as(absIsizeMin); + int absIsizeMax_ = Rcpp::as(absIsizeMax); + int min_read_size_ = Rcpp::as(min_read_size); + int max_read_size_ = Rcpp::as(max_read_size); + // initialize container for reference sequence refSeqInfo refseqInfo(seqstring_, seqStart_,seqEnd_); @@ -55,9 +64,15 @@ Rcpp::List fetch_cooc_ctable_from_bams_cpp(const Rcpp::CharacterVector& infiles, obj_pnts pnts; // store pointer to reference sequence container pnts.refseq_info = &refseqInfo; - // store min and amp MAPQ + // store min and max MAPQ pnts.mapqMin = mapqMin_; pnts.mapqMax = mapqMax_; + // store min and max tlen + pnts.absIsizeMin = absIsizeMin_; + pnts.absIsizeMax = absIsizeMax_; + // store min and max read lengths + pnts.min_read_size = min_read_size_; + pnts.max_read_size = max_read_size_; // initialize container for cooccurrence counts and counters coocCntTable cTable(max_spac_); @@ -146,6 +161,10 @@ Rcpp::List fetch_data_matrix_from_bams_cpp(const Rcpp::CharacterVector& whichCon const Rcpp::IntegerVector& min_bisC_size, const Rcpp::IntegerVector& mapqMin, const Rcpp::IntegerVector& mapqMax, + const Rcpp::IntegerVector& absIsizeMin, + const Rcpp::IntegerVector& absIsizeMax, + const Rcpp::IntegerVector& min_read_size, + const Rcpp::IntegerVector& max_read_size, const Rcpp::CharacterVector& alignerUsed){ // convert input parameters @@ -191,7 +210,10 @@ Rcpp::List fetch_data_matrix_from_bams_cpp(const Rcpp::CharacterVector& whichCon int min_bisC_size_ = Rcpp::as(min_bisC_size); int mapqMin_ = Rcpp::as(mapqMin); int mapqMax_ = Rcpp::as(mapqMax); - + int absIsizeMin_ = Rcpp::as(absIsizeMin); + int absIsizeMax_ = Rcpp::as(absIsizeMax); + int min_read_size_ = Rcpp::as(min_read_size); + int max_read_size_ = Rcpp::as(max_read_size); // initialize container for reference sequence refSeqInfo refseqInfo(seqstring_, seqStart_,seqEnd_); @@ -202,7 +224,12 @@ Rcpp::List fetch_data_matrix_from_bams_cpp(const Rcpp::CharacterVector& whichCon // store min and amp MAPQ pnts.mapqMin = mapqMin_; pnts.mapqMax = mapqMax_; - + // store min and max tlen + pnts.absIsizeMin = absIsizeMin_; + pnts.absIsizeMax = absIsizeMax_; + // store min and max read lengths + pnts.min_read_size = min_read_size_; + pnts.max_read_size = max_read_size_; // create container for fragments and counters regionData regData(regionChr_,regionStart_,regionEnd_); pnts.reg_data = ®Data; @@ -273,6 +300,10 @@ Rcpp::List fetch_protect_stats_from_bams_cpp(const Rcpp::CharacterVector& infile const Rcpp::IntegerVector& min_bisC_size, const Rcpp::IntegerVector& mapqMin, const Rcpp::IntegerVector& mapqMax, + const Rcpp::IntegerVector& absIsizeMin, + const Rcpp::IntegerVector& absIsizeMax, + const Rcpp::IntegerVector& min_read_size, + const Rcpp::IntegerVector& max_read_size, const Rcpp::CharacterVector& alignerUsed){ @@ -305,7 +336,10 @@ Rcpp::List fetch_protect_stats_from_bams_cpp(const Rcpp::CharacterVector& infile int min_bisC_size_ = Rcpp::as(min_bisC_size); int mapqMin_ = Rcpp::as(mapqMin); int mapqMax_ = Rcpp::as(mapqMax); - + int absIsizeMin_ = Rcpp::as(absIsizeMin); + int absIsizeMax_ = Rcpp::as(absIsizeMax); + int min_read_size_ = Rcpp::as(min_read_size); + int max_read_size_ = Rcpp::as(max_read_size); // initialize container for reference sequence refSeqInfo refseqInfo(seqstring_, seqStart_,seqEnd_); @@ -317,7 +351,12 @@ Rcpp::List fetch_protect_stats_from_bams_cpp(const Rcpp::CharacterVector& infile // store min and max MAPQ pnts.mapqMin = mapqMin_; pnts.mapqMax = mapqMax_; - + // store min and max tlen + pnts.absIsizeMin = absIsizeMin_; + pnts.absIsizeMax = absIsizeMax_; + // store min and max read lengths + pnts.min_read_size = min_read_size_; + pnts.max_read_size = max_read_size_; // initialize container for collecting protect statistics and counters protectStatsTable protStats; int n_fetched = 0, n_nonUnique = 0, n_bisfailed = 0, n_analysed = 0, n_clipped = 0; diff --git a/src/functions_export_rcpp.h b/src/functions_export_rcpp.h index fd93602..ff86799 100644 --- a/src/functions_export_rcpp.h +++ b/src/functions_export_rcpp.h @@ -43,6 +43,10 @@ Rcpp::List fetch_cooc_ctable_from_bams_cpp(const Rcpp::CharacterVector& infiles, const Rcpp::IntegerVector& min_bisC_size, const Rcpp::IntegerVector& mapqMin, const Rcpp::IntegerVector& mapqMax, + const Rcpp::IntegerVector& absIsizeMin, + const Rcpp::IntegerVector& absIsizeMax, + const Rcpp::IntegerVector& min_read_size, + const Rcpp::IntegerVector& max_read_size, const Rcpp::CharacterVector& alignerUsed); @@ -62,6 +66,10 @@ Rcpp::List fetch_data_matrix_from_bams_cpp(const Rcpp::CharacterVector& whichCon const Rcpp::IntegerVector& min_bisC_size, const Rcpp::IntegerVector& mapqMin, const Rcpp::IntegerVector& mapqMax, + const Rcpp::IntegerVector& absIsizeMin, + const Rcpp::IntegerVector& absIsizeMax, + const Rcpp::IntegerVector& min_read_size, + const Rcpp::IntegerVector& max_read_size, const Rcpp::CharacterVector& alignerUsed); // [[Rcpp::export]] @@ -79,6 +87,10 @@ Rcpp::List fetch_protect_stats_from_bams_cpp(const Rcpp::CharacterVector& infile const Rcpp::IntegerVector& min_bisC_size, const Rcpp::IntegerVector& mapqMin, const Rcpp::IntegerVector& mapqMax, + const Rcpp::IntegerVector& absIsizeMin, + const Rcpp::IntegerVector& absIsizeMax, + const Rcpp::IntegerVector& min_read_size, + const Rcpp::IntegerVector& max_read_size, const Rcpp::CharacterVector& alignerUsed); #endif \ No newline at end of file diff --git a/src/utilities.h b/src/utilities.h index 640faf7..9ab4f35 100644 --- a/src/utilities.h +++ b/src/utilities.h @@ -23,7 +23,7 @@ /*! @function - * @abstrat Get whether the query is not primary alignment + * @abstract Get whether the query is not primary alignment * @param b pointer to an alignment * @return boolean true if query is not primary */ @@ -31,7 +31,7 @@ #define bam_is_not_primary(b) (((b)->core.flag&BAM_FSECONDARY) != 0) /*! @function - * @abstrat Get whether the query failed QC + * @abstract Get whether the query failed QC * @param b pointer to an alignment * @return boolean true if query failed QC */ @@ -40,7 +40,7 @@ /*! @function - * @abstrat Get whether the query is optical or PCR duplicate + * @abstract Get whether the query is optical or PCR duplicate * @param b pointer to an alignment * @return boolean true if query is optical or PCR duplicate */ @@ -48,7 +48,7 @@ #define bam_is_pcr_dupl(b) (((b)->core.flag&BAM_FDUP) != 0) /*! @function - * @abstrat Get whether the query is supplementary alignment + * @abstract Get whether the query is supplementary alignment * @param b pointer to an alignment * @return boolean true if query is supplementary alignment */ @@ -56,6 +56,14 @@ #define bam_is_suppl(b) (((b)->core.flag&BAM_FSUPPLEMENTARY) != 0) +/* + * @function + * @abstract Get whether read is mapped in a proper pair + * @param b pointer to an alignment + * @return boolean true if query is mapped in a proper pair + */ +#define bam_is_proper_pair(b) (((b)->core.flag&BAM_FPROPER_PAIR) != 0) + samfile_t * _bam_tryopen(const char *filename, const char *filemode, void *aux); diff --git a/tests/testthat/test-get_data_matrix_from_bams.R b/tests/testthat/test-get_data_matrix_from_bams.R index 80d6e6e..68956d4 100644 --- a/tests/testthat/test-get_data_matrix_from_bams.R +++ b/tests/testthat/test-get_data_matrix_from_bams.R @@ -22,6 +22,61 @@ test_that("get_data_matrix_from_bams works for QuasR bam file", { ## test that identical expect_true(all(quasR_methmat$allC_DataMatrix[[1]] == quasr_expected_methmat)) + + + #### check if filtering by tlen works #### + quasR_methmat <- get_data_matrix_from_bams(bamfiles = "QuasR_test.bam", + samplenames = "quasr_pe", + regions = GenomicRanges::GRanges(seqnames = "random_genome_700bp", + strand = "+", + IRanges::IRanges(start = 200,end = 500)), + genome = "random_genome_700bp.fa", + whichContext ="allC", + remove_nonunique = F, + clip_until_nbg = 0L, + max_bisC_meth = 1, + absIsizeMin = 500) + + ## test whether 0 reads are fetched + expect_true(nrow(quasR_methmat$allC_DataMatrix[[1]]) == 0) + quasR_methmat <- get_data_matrix_from_bams(bamfiles = "QuasR_test.bam", + samplenames = "quasr_pe", + regions = GenomicRanges::GRanges(seqnames = "random_genome_700bp", + strand = "+", + IRanges::IRanges(start = 200,end = 500)), + genome = "random_genome_700bp.fa", + whichContext ="allC", + remove_nonunique = F, + clip_until_nbg = 0L, + max_bisC_meth = 1, + absIsizeMax = 50) + expect_true(nrow(quasR_methmat$allC_DataMatrix[[1]]) == 0) + + #### check if filtering by read length works + quasR_methmat <- get_data_matrix_from_bams(bamfiles = "QuasR_test.bam", + samplenames = "quasr_pe", + regions = GenomicRanges::GRanges(seqnames = "random_genome_700bp", + strand = "+", + IRanges::IRanges(start = 200,end = 500)), + genome = "random_genome_700bp.fa", + whichContext ="allC", + remove_nonunique = F, + clip_until_nbg = 0L, + max_bisC_meth = 1, + min_read_size = 400) + expect_true(nrow(quasR_methmat$allC_DataMatrix[[1]]) == 0) + quasR_methmat <- get_data_matrix_from_bams(bamfiles = "QuasR_test.bam", + samplenames = "quasr_pe", + regions = GenomicRanges::GRanges(seqnames = "random_genome_700bp", + strand = "+", + IRanges::IRanges(start = 200,end = 500)), + genome = "random_genome_700bp.fa", + whichContext ="allC", + remove_nonunique = F, + clip_until_nbg = 0L, + max_bisC_meth = 1, + max_read_size = 50) + expect_true(nrow(quasR_methmat$allC_DataMatrix[[1]]) == 0) }) @@ -54,4 +109,54 @@ test_that("get_data_matrix_from_bams works for Bismark bam file with indels and ## test that identical expect_true(all(bismark_methmat$allC_DataMatrix[[1]] == bismark_expected_methmat)) + + #### test if filtering by tlen works #### + bismark_methmat <- get_data_matrix_from_bams(bamfiles = "Bismark_test.bam", + samplenames = "bismark_pe", + regions = GenomicRanges::GRanges(seqnames = "random_genome_700bp", + strand = "+", + IRanges::IRanges(start = 200,end = 500)), + genome = "random_genome_700bp.fa", + whichContext ="allC", + remove_nonunique = F, + clip_until_nbg = 0L, + absIsizeMin = 500) + expect_true(nrow(bismark_methmat$allC_DataMatrix[[1]]) == 0) + bismark_methmat <- get_data_matrix_from_bams(bamfiles = "Bismark_test.bam", + samplenames = "bismark_pe", + regions = GenomicRanges::GRanges(seqnames = "random_genome_700bp", + strand = "+", + IRanges::IRanges(start = 200,end = 500)), + genome = "random_genome_700bp.fa", + whichContext ="allC", + remove_nonunique = F, + clip_until_nbg = 0L, + absIsizeMax = 50) + expect_true(nrow(bismark_methmat$allC_DataMatrix[[1]]) == 0) + + #### test if filtering by read length works #### + bismark_methmat <- get_data_matrix_from_bams(bamfiles = "Bismark_test.bam", + samplenames = "bismark_pe", + regions = GenomicRanges::GRanges(seqnames = "random_genome_700bp", + strand = "+", + IRanges::IRanges(start = 200,end = 500)), + genome = "random_genome_700bp.fa", + whichContext ="allC", + remove_nonunique = F, + clip_until_nbg = 0L, + min_read_size = 500) + expect_true(nrow(bismark_methmat$allC_DataMatrix[[1]]) == 0) + bismark_methmat <- get_data_matrix_from_bams(bamfiles = "Bismark_test.bam", + samplenames = "bismark_pe", + regions = GenomicRanges::GRanges(seqnames = "random_genome_700bp", + strand = "+", + IRanges::IRanges(start = 200,end = 500)), + genome = "random_genome_700bp.fa", + whichContext ="allC", + remove_nonunique = F, + clip_until_nbg = 0L, + max_read_size = 50) + expect_true(nrow(bismark_methmat$allC_DataMatrix[[1]]) == 0) + + }) \ No newline at end of file