diff --git a/src/main/scala/com/fulcrumgenomics/sv/tools/SvPileup.scala b/src/main/scala/com/fulcrumgenomics/sv/tools/SvPileup.scala index f74dd59..f7fed9b 100644 --- a/src/main/scala/com/fulcrumgenomics/sv/tools/SvPileup.scala +++ b/src/main/scala/com/fulcrumgenomics/sv/tools/SvPileup.scala @@ -122,7 +122,9 @@ class SvPileup |side of an aligned segment when merging segments.""") slop: Int = 5, @arg(flag='t', doc="Optional bed file of target regions") targetsBed: Option[FilePath] = None, @arg(flag='T', doc="Requirement on if each side of the breakpoint must overlap a target. Will always annotate each side of the breakpoint.") - targetsBedRequirement: TargetBedRequirement.Requirement = TargetBedRequirement.AnnotateOnly + targetsBedRequirement: TargetBedRequirement.Requirement = TargetBedRequirement.AnnotateOnly, + @arg(doc="Whether to include duplicate marked records for breakpoint pileup or not.") val includeDuplicates: Boolean = false, + @arg(doc="Whether to include QC failed records for breakpoint pileup or not.") val includeQcFails: Boolean = false, ) extends SvTool { import SvPileup._ @@ -145,7 +147,15 @@ class SvPileup Bams.templateIterator(source) .tapEach(t => progress.record(t.allReads.next())) - .flatMap(template => filterTemplate(template, minPrimaryMapq=minPrimaryMappingQuality, minSupplementaryMapq=minSupplementaryMappingQuality)) + .flatMap { template => + filterTemplate( + template, + minPrimaryMapq=minPrimaryMappingQuality, + minSupplementaryMapq=minSupplementaryMappingQuality, + includeDuplicates=includeDuplicates, + includeQcFails=includeQcFails, + ) + } .foreach { template => // Find the breakpoints val evidences = findBreakpoints( @@ -297,18 +307,25 @@ object SvPileup extends LazyLogging { */ def filterTemplate(t: Template, minPrimaryMapq: Int, - minSupplementaryMapq: Int): Option[Template] = { - val r1PrimaryOk = t.r1.exists(r => r.mapped && r.mapq >= minPrimaryMapq) - val r2PrimaryOk = t.r2.exists(r => r.mapped && r.mapq >= minPrimaryMapq) - - if (!r1PrimaryOk && !r2PrimaryOk) None else Some( - Template( - r1 = if (r1PrimaryOk) t.r1 else None, - r2 = if (r2PrimaryOk) t.r2 else None, - r1Supplementals = if (r1PrimaryOk) t.r1Supplementals.filter(_.mapq >= minSupplementaryMapq) else Nil, - r2Supplementals = if (r2PrimaryOk) t.r2Supplementals.filter(_.mapq >= minSupplementaryMapq) else Nil, - ) - ) + minSupplementaryMapq: Int, + includeDuplicates: Boolean = false, + includeQcFails: Boolean = false): Option[Template] = { + t.allReads.filter(rec => (includeDuplicates || !rec.duplicate) && (includeQcFails || rec.pf)).toList match { + case Nil => None + case records => + val filteredTemplate = Template(records.iterator) + val r1PrimaryOk = filteredTemplate.r1.exists(r => r.mapped && r.mapq >= minPrimaryMapq) + val r2PrimaryOk = filteredTemplate.r2.exists(r => r.mapped && r.mapq >= minPrimaryMapq) + + if (!r1PrimaryOk && !r2PrimaryOk) None else Some( + Template( + r1 = if (r1PrimaryOk) filteredTemplate.r1 else None, + r2 = if (r2PrimaryOk) filteredTemplate.r2 else None, + r1Supplementals = if (r1PrimaryOk) filteredTemplate.r1Supplementals.filter(_.mapq >= minSupplementaryMapq) else Nil, + r2Supplementals = if (r2PrimaryOk) filteredTemplate.r2Supplementals.filter(_.mapq >= minSupplementaryMapq) else Nil, + ) + ) + } } diff --git a/src/test/scala/com/fulcrumgenomics/sv/tools/SvPileupTest.scala b/src/test/scala/com/fulcrumgenomics/sv/tools/SvPileupTest.scala index 1597c91..13a8355 100644 --- a/src/test/scala/com/fulcrumgenomics/sv/tools/SvPileupTest.scala +++ b/src/test/scala/com/fulcrumgenomics/sv/tools/SvPileupTest.scala @@ -332,7 +332,7 @@ class SvPileupTest extends UnitSpec { r("chr1", 300, Plus, r=2, cigar="100M", supp=false, mapq=50), ) - SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20).value shouldBe template + SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20, includeDuplicates = true, includeQcFails = true).value shouldBe template } it should "do nothing to a template with high mapq primaries and supplementaries" in { @@ -343,7 +343,7 @@ class SvPileupTest extends UnitSpec { r("chr1", 120, Minus, r=2, cigar="30M70S", supp=true, mapq=50), ) - SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20).value shouldBe template + SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20, includeDuplicates = true, includeQcFails = true).value shouldBe template } it should "remove a low quality supplementary record" in { @@ -354,7 +354,7 @@ class SvPileupTest extends UnitSpec { r("chr1", 120, Minus, r=2, cigar="30M70S", supp=true, mapq=1), ) - SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20).value shouldBe template.copy(r2Supplementals=Nil) + SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20, includeDuplicates = true, includeQcFails = true).value shouldBe template.copy(r2Supplementals=Nil) } it should "remove all evidence of R2 if the primary mapping is low quality" in { @@ -366,7 +366,7 @@ class SvPileupTest extends UnitSpec { ) val expected = template.copy(r2=None, r2Supplementals=Nil) - SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20).value shouldBe expected + SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20, includeDuplicates = true, includeQcFails = true).value shouldBe expected } it should "return None if both R1 and R2 primaries are low quality" in { @@ -377,7 +377,7 @@ class SvPileupTest extends UnitSpec { r("chr1", 120, Minus, r=2, cigar="30M70S", supp=true, mapq=50), ) - SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20) shouldBe None + SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20, includeDuplicates = true, includeQcFails = true) shouldBe None } it should "remove an unmapped R2" in { @@ -387,7 +387,7 @@ class SvPileupTest extends UnitSpec { Seq(r("chr1", 100, Plus, r=2, cigar="100M", supp=false, mapq=0)).tapEach(_.unmapped = true).head, ) - SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20).value shouldBe template.copy(r2=None) + SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20, includeDuplicates = true, includeQcFails = true).value shouldBe template.copy(r2=None) } it should "handle a template with fragment data with high mapping quality" in { @@ -396,7 +396,7 @@ class SvPileupTest extends UnitSpec { r("chr7", 800, Plus, r=0, cigar="50S50M", supp=true, mapq=50), ) - SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20).value shouldBe template + SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20, includeDuplicates = true, includeQcFails = true).value shouldBe template } it should "remove low mapq supplementary records from a fragment template" in { @@ -407,7 +407,7 @@ class SvPileupTest extends UnitSpec { ) val expected = template.copy(r1Supplementals = template.r1Supplementals.filter(_.start != 800)) - SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20).value shouldBe expected + SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20, includeDuplicates = true, includeQcFails = true).value shouldBe expected } it should "return None for a fragment template with a low quality primary mapping" in { @@ -417,7 +417,183 @@ class SvPileupTest extends UnitSpec { r("chr7", 820, Plus, r=0, cigar="70S30M", supp=true, mapq=50), ) - SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20) shouldBe None + SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20, includeDuplicates = true, includeQcFails = true) shouldBe None + } + + it should s"ignore records for the entire ordinal where R1 (true) or R2 (false) are marked as duplicates" in { + val r1 = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = false, mapq = 50) + val r2 = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = false, mapq = 50) + val r1supp = r("chr7", 800, Plus, r=1, cigar="50S50M", supp=true, mapq=50) + val r2supp = r("chr7", 800, Plus, r=2, cigar="50S50M", supp=true, mapq=50) + + val template = t(r1, r2, r1supp, r2supp) + + SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeQcFails = true).value shouldBe template + + r1.duplicate = true + r2.duplicate = false + + val expected = template.copy(r1=None, r1Supplementals = Nil, r1Secondaries = Nil) + + SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeQcFails = true).value shouldBe expected + } + + it should s"ignore records for the entire ordinal where R1 (false) or R2 (true) are marked as duplicates" in { + val r1 = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = false, mapq = 50) + val r2 = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = false, mapq = 50) + val r1supp = r("chr7", 800, Plus, r=1, cigar="50S50M", supp=true, mapq=50) + val r2supp = r("chr7", 800, Plus, r=2, cigar="50S50M", supp=true, mapq=50) + + val template = t(r1, r2, r1supp, r2supp) + + SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeQcFails = true).value shouldBe template + + r1.duplicate = false + r2.duplicate = true + + val expected = template.copy(r2=None, r2Supplementals = Nil, r2Secondaries = Nil) + + SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeQcFails = true).value shouldBe expected + } + + it should s"ignore all records where R1 (true) or R2 (true) are both marked as duplicates" in { + val r1 = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = false, mapq = 50) + val r2 = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = false, mapq = 50) + val r1supp = r("chr7", 800, Plus, r=1, cigar="50S50M", supp=true, mapq=50) + val r2supp = r("chr7", 800, Plus, r=2, cigar="50S50M", supp=true, mapq=50) + + val template = t(r1, r2, r1supp, r2supp) + + SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeQcFails = true).value shouldBe template + + r1.duplicate = true + r2.duplicate = true + + SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeQcFails = true) shouldBe None + } + + it should "ignore duplicate supplementals when asked to" in { + val r1 = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = false, mapq = 50) + val r2 = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = false, mapq = 50) + val r1supp = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = true, mapq = 50) + val r2supp = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = true, mapq = 50) + + val template = t(r1, r2, r1supp, r2supp) + + SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeQcFails = true).value shouldBe template + + r1supp.duplicate = true + r2supp.duplicate = true + + val expected = t(r1, r2) + + SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeQcFails = true).value shouldBe expected + } + + it should "not ignore duplicate records when asked to" in { + val r1 = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = false, mapq = 50) + val r2 = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = false, mapq = 50) + val r1supp = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = true, mapq = 50) + val r2supp = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = true, mapq = 50) + + val template = t(r1, r2, r1supp, r2supp) + + SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeDuplicates = true, includeQcFails = true).value shouldBe template + + r1.duplicate = true + r2.duplicate = true + r1supp.duplicate = true + r2supp.duplicate = true + + SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeDuplicates = true, includeQcFails = true).value shouldBe template + } + + it should s"ignore records for the entire ordinal where R1 (false) or R2 (true) are marked as failing QC" in { + val r1 = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = false, mapq = 50) + val r2 = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = false, mapq = 50) + val r1supp = r("chr7", 800, Plus, r=1, cigar="50S50M", supp=true, mapq=50) + val r2supp = r("chr7", 800, Plus, r=2, cigar="50S50M", supp=true, mapq=50) + + val template = t(r1, r2, r1supp, r2supp) + + SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeDuplicates = true).value shouldBe template + + r1.pf = false + r2.pf = true + + val expected = template.copy(r1=None, r1Supplementals = Nil, r1Secondaries = Nil) + + SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeDuplicates = true).value shouldBe expected + } + + it should s"ignore records for the entire ordinal where R1 (true) or R2 (false) are marked as failing QC" in { + val r1 = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = false, mapq = 50) + val r2 = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = false, mapq = 50) + val r1supp = r("chr7", 800, Plus, r=1, cigar="50S50M", supp=true, mapq=50) + val r2supp = r("chr7", 800, Plus, r=2, cigar="50S50M", supp=true, mapq=50) + + val template = t(r1, r2, r1supp, r2supp) + + SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeDuplicates = true).value shouldBe template + + r1.pf = true + r2.pf = false + + val expected = template.copy(r2=None, r2Supplementals = Nil, r2Secondaries = Nil) + + SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeDuplicates = true).value shouldBe expected + } + + it should s"ignore all records where R1 (false) or R2 (false) are both marked as failing QC" in { + val r1 = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = false, mapq = 50) + val r2 = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = false, mapq = 50) + val r1supp = r("chr7", 800, Plus, r=1, cigar="50S50M", supp=true, mapq=50) + val r2supp = r("chr7", 800, Plus, r=2, cigar="50S50M", supp=true, mapq=50) + + val template = t(r1, r2, r1supp, r2supp) + + SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeDuplicates = true).value shouldBe template + + r1.pf = false + r2.pf = false + + SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeDuplicates = true) shouldBe None + } + + it should "ignore QC failed supplementals when asked to" in { + val r1 = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = false, mapq = 50) + val r2 = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = false, mapq = 50) + val r1supp = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = true, mapq = 50) + val r2supp = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = true, mapq = 50) + + val template = t(r1, r2, r1supp, r2supp) + + SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeDuplicates = true).value shouldBe template + + r1supp.pf = false + r2supp.pf = false + + val expected = t(r1, r2) + + SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeDuplicates = true).value shouldBe expected + } + + it should "not ignore passing QC records when asked to" in { + val r1 = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = false, mapq = 50) + val r2 = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = false, mapq = 50) + val r1supp = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = true, mapq = 50) + val r2supp = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = true, mapq = 50) + + val template = t(r1, r2, r1supp, r2supp) + + SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeDuplicates = true, includeQcFails = true).value shouldBe template + + r1.pf = false + r2.pf = false + r1supp.pf = false + r2supp.pf = false + + SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeDuplicates = true, includeQcFails = true).value shouldBe template } /** Writes the given BAM records to a BAM file */