Skip to content

Commit

Permalink
Ensure SvPileup ignores duplicate records by default
Browse files Browse the repository at this point in the history
  • Loading branch information
clintval committed Dec 10, 2024
1 parent 0f5a127 commit 565e637
Show file tree
Hide file tree
Showing 2 changed files with 216 additions and 23 deletions.
45 changes: 31 additions & 14 deletions src/main/scala/com/fulcrumgenomics/sv/tools/SvPileup.scala
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,9 @@ class SvPileup
|side of an aligned segment when merging segments.""") slop: Int = 5,
@arg(flag='t', doc="Optional bed file of target regions") targetsBed: Option[FilePath] = None,
@arg(flag='T', doc="Requirement on if each side of the breakpoint must overlap a target. Will always annotate each side of the breakpoint.")
targetsBedRequirement: TargetBedRequirement.Requirement = TargetBedRequirement.AnnotateOnly
targetsBedRequirement: TargetBedRequirement.Requirement = TargetBedRequirement.AnnotateOnly,
@arg(doc="Whether to include duplicate marked records for breakpoint pileup or not.") val includeDuplicates: Boolean = false,
@arg(doc="Whether to include QC failed records for breakpoint pileup or not.") val includeQcFails: Boolean = false,
) extends SvTool {

import SvPileup._
Expand All @@ -145,7 +147,15 @@ class SvPileup

Bams.templateIterator(source)
.tapEach(t => progress.record(t.allReads.next()))
.flatMap(template => filterTemplate(template, minPrimaryMapq=minPrimaryMappingQuality, minSupplementaryMapq=minSupplementaryMappingQuality))
.flatMap { template =>
filterTemplate(
template,
minPrimaryMapq=minPrimaryMappingQuality,
minSupplementaryMapq=minSupplementaryMappingQuality,
includeDuplicates=includeDuplicates,
includeQcFails=includeQcFails,
)
}
.foreach { template =>
// Find the breakpoints
val evidences = findBreakpoints(
Expand Down Expand Up @@ -297,18 +307,25 @@ object SvPileup extends LazyLogging {
*/
def filterTemplate(t: Template,
minPrimaryMapq: Int,
minSupplementaryMapq: Int): Option[Template] = {
val r1PrimaryOk = t.r1.exists(r => r.mapped && r.mapq >= minPrimaryMapq)
val r2PrimaryOk = t.r2.exists(r => r.mapped && r.mapq >= minPrimaryMapq)

if (!r1PrimaryOk && !r2PrimaryOk) None else Some(
Template(
r1 = if (r1PrimaryOk) t.r1 else None,
r2 = if (r2PrimaryOk) t.r2 else None,
r1Supplementals = if (r1PrimaryOk) t.r1Supplementals.filter(_.mapq >= minSupplementaryMapq) else Nil,
r2Supplementals = if (r2PrimaryOk) t.r2Supplementals.filter(_.mapq >= minSupplementaryMapq) else Nil,
)
)
minSupplementaryMapq: Int,
includeDuplicates: Boolean = false,
includeQcFails: Boolean = false): Option[Template] = {
t.allReads.filter(rec => (includeDuplicates || !rec.duplicate) && (includeQcFails || rec.pf)).toList match {
case Nil => None
case records =>
val filteredTemplate = Template(records.iterator)
val r1PrimaryOk = filteredTemplate.r1.exists(r => r.mapped && r.mapq >= minPrimaryMapq)
val r2PrimaryOk = filteredTemplate.r2.exists(r => r.mapped && r.mapq >= minPrimaryMapq)

if (!r1PrimaryOk && !r2PrimaryOk) None else Some(
Template(
r1 = if (r1PrimaryOk) filteredTemplate.r1 else None,
r2 = if (r2PrimaryOk) filteredTemplate.r2 else None,
r1Supplementals = if (r1PrimaryOk) filteredTemplate.r1Supplementals.filter(_.mapq >= minSupplementaryMapq) else Nil,
r2Supplementals = if (r2PrimaryOk) filteredTemplate.r2Supplementals.filter(_.mapq >= minSupplementaryMapq) else Nil,
)
)
}
}


Expand Down
194 changes: 185 additions & 9 deletions src/test/scala/com/fulcrumgenomics/sv/tools/SvPileupTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -332,7 +332,7 @@ class SvPileupTest extends UnitSpec {
r("chr1", 300, Plus, r=2, cigar="100M", supp=false, mapq=50),
)

SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20).value shouldBe template
SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20, includeDuplicates = true, includeQcFails = true).value shouldBe template
}

it should "do nothing to a template with high mapq primaries and supplementaries" in {
Expand All @@ -343,7 +343,7 @@ class SvPileupTest extends UnitSpec {
r("chr1", 120, Minus, r=2, cigar="30M70S", supp=true, mapq=50),
)

SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20).value shouldBe template
SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20, includeDuplicates = true, includeQcFails = true).value shouldBe template
}

it should "remove a low quality supplementary record" in {
Expand All @@ -354,7 +354,7 @@ class SvPileupTest extends UnitSpec {
r("chr1", 120, Minus, r=2, cigar="30M70S", supp=true, mapq=1),
)

SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20).value shouldBe template.copy(r2Supplementals=Nil)
SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20, includeDuplicates = true, includeQcFails = true).value shouldBe template.copy(r2Supplementals=Nil)
}

it should "remove all evidence of R2 if the primary mapping is low quality" in {
Expand All @@ -366,7 +366,7 @@ class SvPileupTest extends UnitSpec {
)

val expected = template.copy(r2=None, r2Supplementals=Nil)
SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20).value shouldBe expected
SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20, includeDuplicates = true, includeQcFails = true).value shouldBe expected
}

it should "return None if both R1 and R2 primaries are low quality" in {
Expand All @@ -377,7 +377,7 @@ class SvPileupTest extends UnitSpec {
r("chr1", 120, Minus, r=2, cigar="30M70S", supp=true, mapq=50),
)

SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20) shouldBe None
SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20, includeDuplicates = true, includeQcFails = true) shouldBe None
}

it should "remove an unmapped R2" in {
Expand All @@ -387,7 +387,7 @@ class SvPileupTest extends UnitSpec {
Seq(r("chr1", 100, Plus, r=2, cigar="100M", supp=false, mapq=0)).tapEach(_.unmapped = true).head,
)

SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20).value shouldBe template.copy(r2=None)
SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20, includeDuplicates = true, includeQcFails = true).value shouldBe template.copy(r2=None)
}

it should "handle a template with fragment data with high mapping quality" in {
Expand All @@ -396,7 +396,7 @@ class SvPileupTest extends UnitSpec {
r("chr7", 800, Plus, r=0, cigar="50S50M", supp=true, mapq=50),
)

SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20).value shouldBe template
SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20, includeDuplicates = true, includeQcFails = true).value shouldBe template
}

it should "remove low mapq supplementary records from a fragment template" in {
Expand All @@ -407,7 +407,7 @@ class SvPileupTest extends UnitSpec {
)

val expected = template.copy(r1Supplementals = template.r1Supplementals.filter(_.start != 800))
SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20).value shouldBe expected
SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20, includeDuplicates = true, includeQcFails = true).value shouldBe expected
}

it should "return None for a fragment template with a low quality primary mapping" in {
Expand All @@ -417,7 +417,183 @@ class SvPileupTest extends UnitSpec {
r("chr7", 820, Plus, r=0, cigar="70S30M", supp=true, mapq=50),
)

SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20) shouldBe None
SvPileup.filterTemplate(template, minPrimaryMapq=30, minSupplementaryMapq=20, includeDuplicates = true, includeQcFails = true) shouldBe None
}

it should s"ignore records for the entire ordinal where R1 (true) or R2 (false) are marked as duplicates" in {
val r1 = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = false, mapq = 50)
val r2 = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = false, mapq = 50)
val r1supp = r("chr7", 800, Plus, r=1, cigar="50S50M", supp=true, mapq=50)
val r2supp = r("chr7", 800, Plus, r=2, cigar="50S50M", supp=true, mapq=50)

val template = t(r1, r2, r1supp, r2supp)

SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeQcFails = true).value shouldBe template

r1.duplicate = true
r2.duplicate = false

val expected = template.copy(r1=None, r1Supplementals = Nil, r1Secondaries = Nil)

SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeQcFails = true).value shouldBe expected
}

it should s"ignore records for the entire ordinal where R1 (false) or R2 (true) are marked as duplicates" in {
val r1 = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = false, mapq = 50)
val r2 = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = false, mapq = 50)
val r1supp = r("chr7", 800, Plus, r=1, cigar="50S50M", supp=true, mapq=50)
val r2supp = r("chr7", 800, Plus, r=2, cigar="50S50M", supp=true, mapq=50)

val template = t(r1, r2, r1supp, r2supp)

SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeQcFails = true).value shouldBe template

r1.duplicate = false
r2.duplicate = true

val expected = template.copy(r2=None, r2Supplementals = Nil, r2Secondaries = Nil)

SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeQcFails = true).value shouldBe expected
}

it should s"ignore all records where R1 (true) or R2 (true) are both marked as duplicates" in {
val r1 = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = false, mapq = 50)
val r2 = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = false, mapq = 50)
val r1supp = r("chr7", 800, Plus, r=1, cigar="50S50M", supp=true, mapq=50)
val r2supp = r("chr7", 800, Plus, r=2, cigar="50S50M", supp=true, mapq=50)

val template = t(r1, r2, r1supp, r2supp)

SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeQcFails = true).value shouldBe template

r1.duplicate = true
r2.duplicate = true

SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeQcFails = true) shouldBe None
}

it should "ignore duplicate supplementals when asked to" in {
val r1 = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = false, mapq = 50)
val r2 = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = false, mapq = 50)
val r1supp = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = true, mapq = 50)
val r2supp = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = true, mapq = 50)

val template = t(r1, r2, r1supp, r2supp)

SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeQcFails = true).value shouldBe template

r1supp.duplicate = true
r2supp.duplicate = true

val expected = t(r1, r2)

SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeQcFails = true).value shouldBe expected
}

it should "not ignore duplicate records when asked to" in {
val r1 = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = false, mapq = 50)
val r2 = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = false, mapq = 50)
val r1supp = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = true, mapq = 50)
val r2supp = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = true, mapq = 50)

val template = t(r1, r2, r1supp, r2supp)

SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeDuplicates = true, includeQcFails = true).value shouldBe template

r1.duplicate = true
r2.duplicate = true
r1supp.duplicate = true
r2supp.duplicate = true

SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeDuplicates = true, includeQcFails = true).value shouldBe template
}

it should s"ignore records for the entire ordinal where R1 (false) or R2 (true) are marked as failing QC" in {
val r1 = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = false, mapq = 50)
val r2 = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = false, mapq = 50)
val r1supp = r("chr7", 800, Plus, r=1, cigar="50S50M", supp=true, mapq=50)
val r2supp = r("chr7", 800, Plus, r=2, cigar="50S50M", supp=true, mapq=50)

val template = t(r1, r2, r1supp, r2supp)

SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeDuplicates = true).value shouldBe template

r1.pf = false
r2.pf = true

val expected = template.copy(r1=None, r1Supplementals = Nil, r1Secondaries = Nil)

SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeDuplicates = true).value shouldBe expected
}

it should s"ignore records for the entire ordinal where R1 (true) or R2 (false) are marked as failing QC" in {
val r1 = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = false, mapq = 50)
val r2 = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = false, mapq = 50)
val r1supp = r("chr7", 800, Plus, r=1, cigar="50S50M", supp=true, mapq=50)
val r2supp = r("chr7", 800, Plus, r=2, cigar="50S50M", supp=true, mapq=50)

val template = t(r1, r2, r1supp, r2supp)

SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeDuplicates = true).value shouldBe template

r1.pf = true
r2.pf = false

val expected = template.copy(r2=None, r2Supplementals = Nil, r2Secondaries = Nil)

SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeDuplicates = true).value shouldBe expected
}

it should s"ignore all records where R1 (false) or R2 (false) are both marked as failing QC" in {
val r1 = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = false, mapq = 50)
val r2 = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = false, mapq = 50)
val r1supp = r("chr7", 800, Plus, r=1, cigar="50S50M", supp=true, mapq=50)
val r2supp = r("chr7", 800, Plus, r=2, cigar="50S50M", supp=true, mapq=50)

val template = t(r1, r2, r1supp, r2supp)

SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeDuplicates = true).value shouldBe template

r1.pf = false
r2.pf = false

SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeDuplicates = true) shouldBe None
}

it should "ignore QC failed supplementals when asked to" in {
val r1 = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = false, mapq = 50)
val r2 = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = false, mapq = 50)
val r1supp = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = true, mapq = 50)
val r2supp = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = true, mapq = 50)

val template = t(r1, r2, r1supp, r2supp)

SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeDuplicates = true).value shouldBe template

r1supp.pf = false
r2supp.pf = false

val expected = t(r1, r2)

SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeDuplicates = true).value shouldBe expected
}

it should "not ignore passing QC records when asked to" in {
val r1 = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = false, mapq = 50)
val r2 = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = false, mapq = 50)
val r1supp = r("chr1", 100, Plus, r = 1, cigar = "100M", supp = true, mapq = 50)
val r2supp = r("chr1", 300, Plus, r = 2, cigar = "100M", supp = true, mapq = 50)

val template = t(r1, r2, r1supp, r2supp)

SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeDuplicates = true, includeQcFails = true).value shouldBe template

r1.pf = false
r2.pf = false
r1supp.pf = false
r2supp.pf = false

SvPileup.filterTemplate(template, minPrimaryMapq = 30, minSupplementaryMapq = 20, includeDuplicates = true, includeQcFails = true).value shouldBe template
}

/** Writes the given BAM records to a BAM file */
Expand Down

0 comments on commit 565e637

Please sign in to comment.