Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add option to maintain N in umi #907

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
Original file line number Diff line number Diff line change
Expand Up @@ -482,6 +482,7 @@ class GroupReadsByUmi
@arg(flag='T', doc="The output tag for UMI grouping.") val assignTag: String = "MI",
@arg(flag='m', doc="Minimum mapping quality for mapped reads.") val minMapQ: Int = 1,
@arg(flag='n', doc="Include non-PF reads.") val includeNonPfReads: Boolean = false,
@arg(flag='N', doc="Include UMIs which contain non-ATCG bases (N) in output.") val includeNonATCG: Boolean = false,
JoeVieira marked this conversation as resolved.
Show resolved Hide resolved
@arg(flag='s', doc="The UMI assignment strategy.") val strategy: Strategy,
@arg(flag='e', doc="The allowable number of edits between UMIs.") val edits: Int = 1,
@arg(flag='l', doc= """The minimum UMI length. If not specified then all UMIs must have the same length,
Expand Down Expand Up @@ -551,7 +552,8 @@ class GroupReadsByUmi
.filter(r => (r.mapped || (r.paired && r.mateMapped)) || { filteredPoorAlignment += 1; false })
.filter(r => (allowInterContig || r.unpaired || r.refIndex == r.mateRefIndex) || { filteredPoorAlignment += 1; false })
.filter(r => mapqOk(r, this.minMapQ) || { filteredPoorAlignment += 1; false })
.filter(r => !r.get[String](rawTag).exists(_.contains('N')) || { filteredNsInUmi += 1; false })
.filter(r =>
(this.includeNonATCG || !r.get[String](rawTag).exists(_.contains('N'))) || { filteredNsInUmi += 1; false })
.filter { r =>
this.minUmiLength.forall { l =>
r.get[String](this.rawTag).forall { umi =>
Expand Down Expand Up @@ -644,7 +646,7 @@ class GroupReadsByUmi
logger.info(f"Accepted $kept%,d reads for grouping.")
if (filteredNonPf > 0) logger.info(f"Filtered out $filteredNonPf%,d non-PF reads.")
logger.info(f"Filtered out $filteredPoorAlignment%,d reads due to mapping issues.")
logger.info(f"Filtered out $filteredNsInUmi%,d reads that contained one or more Ns in their UMIs.")
if (!this.includeNonATCG) logger.info(f"Filtered out $filteredNsInUmi%,d reads that contained one or more Ns in their UMIs.")
this.minUmiLength.foreach { _ => logger.info(f"Filtered out $filterUmisTooShort%,d reads that contained UMIs that were too short.") }
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,70 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
groups should contain theSameElementsAs Seq(Set("a01", "a02"), Set("a03", "a04"), Set("a05", "a06"), Set("a07", "a08"))
}

it should "correctly group together single-end reads with UMIs containing N's ,if option is set" in {
val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate))
builder.addFrag(name = "a01", start = 100, attrs = Map("RX" -> "AAAAAAAA"))
builder.addFrag(name = "a02", start = 100, attrs = Map("RX" -> "AAAAAAAN"))
builder.addFrag(name = "a03", start = 100, attrs = Map("RX" -> "CACACACA"))
builder.addFrag(name = "a04", start = 100, attrs = Map("RX" -> "CACACACC"))
builder.addFrag(name = "a05", start = 105, attrs = Map("RX" -> "GTAGTAGG"))
builder.addFrag(name = "a06", start = 105, attrs = Map("RX" -> "GTAGTAGG"))
builder.addFrag(name = "a07", start = 107, attrs = Map("RX" -> "AAAAAAAA"))
builder.addFrag(name = "a08", start = 107, attrs = Map("RX" -> "AAAAAAAA"))

val in = builder.toTempFile()
val out = Files.createTempFile("umi_grouped.", ".sam")
val hist = Files.createTempFile("umi_grouped.", ".histogram.txt")
new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), rawTag = "RX", assignTag = "MI", includeNonATCG = true, strategy = Strategy.Edit, edits = 1).execute()

val recs = readBamRecs(out)
recs should have size 8

val groups = recs.groupBy(r => r[String]("MI")).values.map(rs => rs.map(_.name).toSet)
groups should have size 4
groups should contain theSameElementsAs Seq(Set("a01", "a02"), Set("a03", "a04"), Set("a05", "a06"), Set("a07", "a08"))
}

it should "correctly filter and not group together single-end reads with UMIs containing N's" in {
val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate))
builder.addFrag(name = "a01", start = 100, attrs = Map("RX" -> "AAAAAAAA"))
builder.addFrag(name = "a02", start = 100, attrs = Map("RX" -> "AAAAAAAN"))
builder.addFrag(name = "a03", start = 100, attrs = Map("RX" -> "CACACACA"))
builder.addFrag(name = "a04", start = 100, attrs = Map("RX" -> "CACACACC"))
builder.addFrag(name = "a05", start = 105, attrs = Map("RX" -> "GTAGTAGG"))
builder.addFrag(name = "a06", start = 105, attrs = Map("RX" -> "GTAGTAGG"))
builder.addFrag(name = "a07", start = 107, attrs = Map("RX" -> "AAAAAAAA"))
builder.addFrag(name = "a08", start = 107, attrs = Map("RX" -> "AAAAAAAA"))

val in = builder.toTempFile()
val out = Files.createTempFile("umi_grouped.", ".sam")
val hist = Files.createTempFile("umi_grouped.", ".histogram.txt")
new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), rawTag = "RX", assignTag = "MI", includeNonATCG = false, strategy = Strategy.Edit, edits = 1).execute()

val recs = readBamRecs(out)
recs should have size 7

val groups = recs.groupBy(r => r[String]("MI")).values.map(rs => rs.map(_.name).toSet)
groups should have size 4
groups should contain theSameElementsAs Seq(Set("a01"), Set("a03", "a04"), Set("a05", "a06"), Set("a07", "a08"))
}



it should "include reads that contain an N in the UMI, if option is set." in {
val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate))
builder.addPair(name = "a01", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACT"))
builder.addPair(name = "a02", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACT"))
builder.addPair(name = "a03", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ANN"))

val in = builder.toTempFile()
val out = Files.createTempFile("umi_grouped.", ".bam")
new GroupReadsByUmi(input = in, output = out, rawTag = "RX", assignTag = "MI", includeNonATCG = true, strategy = Strategy.Paired, edits = 2).execute()

readBamRecs(out).map(_.name).distinct shouldBe Seq("a01", "a02", "a03")
}


it should "exclude reads that contain an N in the UMI" in {
val builder = new SamBuilder(readLength=100, sort=Some(SamOrder.Coordinate))
builder.addPair(name="a01", start1=100, start2=300, strand1=Plus, strand2=Minus, attrs=Map("RX" -> "ACT-ACT"))
Expand Down