From 241c0401f04fb40a9b4a82a43168e13441e5e95e Mon Sep 17 00:00:00 2001 From: clintval Date: Tue, 10 Dec 2024 09:40:18 -1000 Subject: [PATCH] Ensure SvPileup ignores duplicate records by default --- .../fulcrumgenomics/sv/tools/SvPileup.scala | 42 ++++++--- .../sv/tools/SvPileupTest.scala | 88 +++++++++++++++++++ 2 files changed, 116 insertions(+), 14 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/sv/tools/SvPileup.scala b/src/main/scala/com/fulcrumgenomics/sv/tools/SvPileup.scala index f74dd59..b04603e 100644 --- a/src/main/scala/com/fulcrumgenomics/sv/tools/SvPileup.scala +++ b/src/main/scala/com/fulcrumgenomics/sv/tools/SvPileup.scala @@ -122,7 +122,8 @@ class SvPileup |side of an aligned segment when merging segments.""") slop: Int = 5, @arg(flag='t', doc="Optional bed file of target regions") targetsBed: Option[FilePath] = None, @arg(flag='T', doc="Requirement on if each side of the breakpoint must overlap a target. Will always annotate each side of the breakpoint.") - targetsBedRequirement: TargetBedRequirement.Requirement = TargetBedRequirement.AnnotateOnly + targetsBedRequirement: TargetBedRequirement.Requirement = TargetBedRequirement.AnnotateOnly, + @arg(doc="Whether to include duplicate marked records in the pileup or not.") val includeDuplicates: Boolean = false ) extends SvTool { import SvPileup._ @@ -145,7 +146,14 @@ class SvPileup Bams.templateIterator(source) .tapEach(t => progress.record(t.allReads.next())) - .flatMap(template => filterTemplate(template, minPrimaryMapq=minPrimaryMappingQuality, minSupplementaryMapq=minSupplementaryMappingQuality)) + .flatMap { template => + filterTemplate( + template, + minPrimaryMapq=minPrimaryMappingQuality, + minSupplementaryMapq=minSupplementaryMappingQuality, + includeDuplicates=includeDuplicates, + ) + } .foreach { template => // Find the breakpoints val evidences = findBreakpoints( @@ -297,18 +305,24 @@ object SvPileup extends LazyLogging { */ def filterTemplate(t: Template, minPrimaryMapq: Int, - minSupplementaryMapq: Int): Option[Template] = { - val r1PrimaryOk = t.r1.exists(r => r.mapped && r.mapq >= minPrimaryMapq) - val r2PrimaryOk = t.r2.exists(r => r.mapped && r.mapq >= minPrimaryMapq) - - if (!r1PrimaryOk && !r2PrimaryOk) None else Some( - Template( - r1 = if (r1PrimaryOk) t.r1 else None, - r2 = if (r2PrimaryOk) t.r2 else None, - r1Supplementals = if (r1PrimaryOk) t.r1Supplementals.filter(_.mapq >= minSupplementaryMapq) else Nil, - r2Supplementals = if (r2PrimaryOk) t.r2Supplementals.filter(_.mapq >= minSupplementaryMapq) else Nil, - ) - ) + minSupplementaryMapq: Int, + includeDuplicates: Boolean = false): Option[Template] = { + t.allReads.filter(rec => includeDuplicates || !rec.duplicate).toList match { + case Nil => None + case records => + val duplicateFiltered = Template(records.iterator) + val r1PrimaryOk = duplicateFiltered.r1.exists(r => r.mapped && r.mapq >= minPrimaryMapq) + val r2PrimaryOk = duplicateFiltered.r2.exists(r => r.mapped && r.mapq >= minPrimaryMapq) + + if (!r1PrimaryOk && !r2PrimaryOk) None else Some( + Template( + r1 = if (r1PrimaryOk) duplicateFiltered.r1 else None, + r2 = if (r2PrimaryOk) duplicateFiltered.r2 else None, + r1Supplementals = if (r1PrimaryOk) duplicateFiltered.r1Supplementals.filter(_.mapq >= minSupplementaryMapq) else Nil, + r2Supplementals = if (r2PrimaryOk) duplicateFiltered.r2Supplementals.filter(_.mapq >= minSupplementaryMapq) else Nil, + ) + ) + } } diff --git a/src/test/scala/com/fulcrumgenomics/sv/tools/SvPileupTest.scala b/src/test/scala/com/fulcrumgenomics/sv/tools/SvPileupTest.scala index 1597c91..9aee4e7 100644 --- a/src/test/scala/com/fulcrumgenomics/sv/tools/SvPileupTest.scala +++ b/src/test/scala/com/fulcrumgenomics/sv/tools/SvPileupTest.scala @@ -420,6 +420,94 @@ class SvPileupTest extends UnitSpec { SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20) shouldBe None } + it should s"ignore records for the entire ordinal where R1 (true) or R2 (false) are marked as duplicates" in { + val r1 = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = false, mapq = 50) + val r2 = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = false, mapq = 50) + val r1supp = r("chr7", 800, Plus, r=1, cigar="50S50M", supp=true, mapq=50) + val r2supp = r("chr7", 800, Plus, r=2, cigar="50S50M", supp=true, mapq=50) + + val template = t(r1, r2, r1supp, r2supp) + + SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20).value shouldBe template + + r1.duplicate = true + r2.duplicate = false + + val expected = template.copy(r1=None, r1Supplementals = Nil, r1Secondaries = Nil) + + SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20).value shouldBe expected + } + + it should s"ignore records for the entire ordinal where R1 (false) or R2 (true) are marked as duplicates" in { + val r1 = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = false, mapq = 50) + val r2 = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = false, mapq = 50) + val r1supp = r("chr7", 800, Plus, r=1, cigar="50S50M", supp=true, mapq=50) + val r2supp = r("chr7", 800, Plus, r=2, cigar="50S50M", supp=true, mapq=50) + + val template = t(r1, r2, r1supp, r2supp) + + SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20).value shouldBe template + + r1.duplicate = false + r2.duplicate = true + + val expected = template.copy(r2=None, r2Supplementals = Nil, r2Secondaries = Nil) + + SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20).value shouldBe expected + } + + it should s"ignore all records where R1 (true) or R2 (true) are both marked as duplicates" in { + val r1 = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = false, mapq = 50) + val r2 = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = false, mapq = 50) + val r1supp = r("chr7", 800, Plus, r=1, cigar="50S50M", supp=true, mapq=50) + val r2supp = r("chr7", 800, Plus, r=2, cigar="50S50M", supp=true, mapq=50) + + val template = t(r1, r2, r1supp, r2supp) + + SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20).value shouldBe template + + r1.duplicate = true + r2.duplicate = true + + SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20) shouldBe None + } + + it should "ignore duplicate supplementals when asked to" in { + val r1 = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = false, mapq = 50) + val r2 = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = false, mapq = 50) + val r1supp = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = true, mapq = 50) + val r2supp = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = true, mapq = 50) + + val template = t(r1, r2, r1supp, r2supp) + + SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20).value shouldBe template + + r1supp.duplicate = true + r2supp.duplicate = true + + val expected = t(r1, r2) + + SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20).value shouldBe expected + } + + it should "not ignore duplicate records when asked to" in { + val r1 = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = false, mapq = 50) + val r2 = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = false, mapq = 50) + val r1supp = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = true, mapq = 50) + val r2supp = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = true, mapq = 50) + + val template = t(r1, r2, r1supp, r2supp) + + SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeDuplicates = true).value shouldBe template + + r1.duplicate = true + r2.duplicate = true + r1supp.duplicate = true + r2supp.duplicate = true + + SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeDuplicates = true).value shouldBe template + } + /** Writes the given BAM records to a BAM file */ private def toInput(rec: SamRecord*): PathToBam = { val path = makeTempFile("input.", ".bam")