Skip to content

Commit

Permalink
change to having an option that disables default behavior
Browse files Browse the repository at this point in the history
  • Loading branch information
jdidion committed Jul 15, 2024
1 parent 700741e commit 56b4d75
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 38 deletions.
24 changes: 10 additions & 14 deletions src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala
Original file line number Diff line number Diff line change
Expand Up @@ -44,26 +44,22 @@ import com.fulcrumgenomics.util.{Io, ProgressLogger}
|The `--umi-delimiter` option specifies the delimiter on which to split. The resulting UMI in the `RX` tag
|will always be hyphen delimited.
|
|Some tools (e.g. BCL Convert) may reverse-complement UMIs on R2 and add a prefix to indicate that the sequence
|has been reverse-complemented. The `--reverse-complement-prefix` option specifies the prefix character(s) and
|causes them to be removed. Any reverse-complemented UMIs will be normalized (i.e., reverse-complemented back to
|be in the forward orientation).
|
|To obtain behavior similar to `umi_tools`' `--umi-separator=":r"`, specify the delimiter and
|prefix separately, i.e. `--field-delimiter=":"` and `--reverse-complement-prefix="r"`.
|Some tools (e.g. BCL Convert) may reverse-complement UMIs on R2 and add an 'r' prefix to indicate that the sequence
|has been reverse-complemented. By default, the 'r' prefix is removed and the sequence is reverse-complemented
|back to the forward orientation. The `--override-reverse-complement-umis` option disables the latter behavior, such that
|the 'r' prefix is removed but the UMI sequence is left as reverse-complemented.
""")
class CopyUmiFromReadName
( @arg(flag='i', doc="The input BAM file.") input: PathToBam,
@arg(flag='o', doc="The output BAM file.") output: PathToBam,
@arg(doc="Remove the UMI from the read name.") removeUmi: Boolean = false,
@arg(doc="Delimiter between the read name and UMI.") fieldDelimiter: Char = ':',
@arg(doc="Delimiter between UMI sequences.") umiDelimiter: Char = '+',
@arg(flag='p', doc="The prefix to a UMI sequence that indicates it is reverse-complemented.") reverseComplementPrefix: Option[String] = None,
@arg(doc="Do not reverse-complement UMIs prefixed with 'r'.") overrideReverseComplementUmis: Boolean = false,
) extends FgBioTool with LazyLogging {

Io.assertReadable(input)
Io.assertCanWriteFile(output)
validate(reverseComplementPrefix.forall(_.nonEmpty), "--reverse-complement-prefix cannot be an empty string")

override def execute(): Unit = {
val source = SamSource(input)
Expand All @@ -73,11 +69,11 @@ class CopyUmiFromReadName
progress.record(rec)

writer += Umis.copyUmiFromReadName(
rec = rec,
removeUmi = removeUmi,
fieldDelimiter = fieldDelimiter,
umiDelimiter = umiDelimiter,
reverseComplementPrefix = reverseComplementPrefix,
rec = rec,
removeUmi = removeUmi,
fieldDelimiter = fieldDelimiter,
umiDelimiter = umiDelimiter,
reverseComplementPrefixedUmis = !overrideReverseComplementUmis,
)
}
progress.logLast()
Expand Down
43 changes: 27 additions & 16 deletions src/main/scala/com/fulcrumgenomics/umi/Umis.scala
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import com.fulcrumgenomics.bam.api.SamRecord
import com.fulcrumgenomics.util.Sequences

object Umis {
val RevcompPrefix: String = "r"

/** Copies the UMI sequence from the read name.
*
Expand All @@ -41,21 +42,21 @@ object Umis {
* @param removeUmi true to remove the UMI from the read name, otherwise only copy the UMI to the tag
* @param fieldDelimiter the delimiter of fields within the read name
* @param umiDelimiter the delimiter between sequences in the UMI string
* @param reverseComplementPrefix the prefix of a UMI that indicates it is reverse-complemented
* @param reverseComplementPrefixedUmis whether to reverse-complement UMIs prefixed with 'r'
* @return the modified record
*/
def copyUmiFromReadName(rec: SamRecord,
removeUmi: Boolean = false,
fieldDelimiter: Char = ':',
umiDelimiter: Char = '+',
reverseComplementPrefix: Option[String] = None): SamRecord = {
reverseComplementPrefixedUmis: Boolean = true): SamRecord = {
// Extract and set the UMI
val umi = extractUmisFromReadName(
name = rec.name,
fieldDelimiter = fieldDelimiter,
strict = false,
umiDelimiter = umiDelimiter,
reverseComplementPrefix = reverseComplementPrefix,
name = rec.name,
fieldDelimiter = fieldDelimiter,
strict = false,
umiDelimiter = umiDelimiter,
reverseComplementPrefixedUmis = reverseComplementPrefixedUmis,
)
require(umi.nonEmpty, f"No valid UMI found in: ${rec.name}")
umi.foreach(u => rec(ConsensusTags.UmiBases) = u)
Expand Down Expand Up @@ -84,7 +85,7 @@ object Umis {
fieldDelimiter: Char = ':',
strict: Boolean,
umiDelimiter: Char = '+',
reverseComplementPrefix: Option[String] = None): Option[String] = {
reverseComplementPrefixedUmis: Boolean = true): Option[String] = {
// If strict, check that the read name actually has eight parts, which is expected
val rawUmi = if (strict) {
val colons = name.count(_ == fieldDelimiter)
Expand All @@ -97,14 +98,24 @@ object Umis {
Some(name.substring(idx + 1, name.length))
}

var umi = rawUmi.map(raw => reverseComplementPrefix match {
case Some(prefix) if raw.indexOf(prefix) >= 0 =>
raw.split(umiDelimiter).map(seq =>
(if (seq.startsWith(prefix)) Sequences.revcomp(seq.stripPrefix(prefix)) else seq).toUpperCase
).mkString("-")
case _ if raw.indexOf(umiDelimiter) > 0 => raw.replace(umiDelimiter, '-').toUpperCase
case _ => raw.toUpperCase
})
// Remove 'r' prefixes, optionally reverse-complementing the prefixed UMIs if
// reverseComplementPrefixedUmis = true, replace the delimiter (if any) with '-',
// and make sure the sequence is upper-case.
var umi = rawUmi.map(raw =>
(raw.indexOf(RevcompPrefix) >= 0, raw.indexOf(umiDelimiter) > 0) match {
case (true, true) if reverseComplementPrefixedUmis =>
raw.split(umiDelimiter).map(seq =>
if (seq.startsWith(RevcompPrefix)) Sequences.revcomp(seq.stripPrefix(RevcompPrefix))
else seq.stripPrefix(RevcompPrefix)
).mkString("-").toUpperCase
case (true, false) if reverseComplementPrefixedUmis =>
Sequences.revcomp(raw.stripPrefix(RevcompPrefix)).toUpperCase
case (true, true) => raw.replace(RevcompPrefix, "").replace(umiDelimiter, '-').toUpperCase
case (true, false) => raw.replace(RevcompPrefix, "").toUpperCase
case (false, true) => raw.replace(umiDelimiter, '-').toUpperCase
case (false, false) => raw.toUpperCase
}
)

val valid = umi.forall(u => u.forall(isValidUmiCharacter))

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,18 +35,18 @@ class CopyUmiFromReadNameTest extends UnitSpec with OptionValues {
/** Runs CopyUmiFromReadName using the given read names returning the output read names and UMIs. */
private def run(names: Iterable[String],
removeUmi: Boolean,
reverseComplementPrefix: Option[String] = None): IndexedSeq[Result] = {
overrideReverseComplementUmis: Boolean = false): IndexedSeq[Result] = {
// build the reads
val builder = new SamBuilder()
names.foreach { name => builder.addFrag(name=name, unmapped=true) }

// run the tool
val out = makeTempFile("test.", ".bam")
val tool = new CopyUmiFromReadName(
input = builder.toTempFile(),
output = out,
removeUmi = removeUmi,
reverseComplementPrefix = reverseComplementPrefix,
input = builder.toTempFile(),
output = out,
removeUmi = removeUmi,
overrideReverseComplementUmis = overrideReverseComplementUmis
)
executeFgbioTool(tool)

Expand Down Expand Up @@ -80,9 +80,20 @@ class CopyUmiFromReadNameTest extends UnitSpec with OptionValues {
it should "remove a reverse-complement prefix to the UMI" in {
val names = Seq("1:rAAAA", "1:2:rCCCC", "1:2:3:rGGGG", "blah:rAAAA+CCCC")
val results = run(
names = names,
removeUmi = true,
reverseComplementPrefix = Some("r"),
names = names,
removeUmi = true,
overrideReverseComplementUmis = true
)
results.map(_.name) should contain theSameElementsInOrderAs Seq("1", "1:2", "1:2:3", "blah")
results.map(_.umi) should contain theSameElementsInOrderAs Seq("AAAA", "CCCC", "GGGG", "AAAA-CCCC")
}

it should "remove a reverse-complement prefix to the UMI and reverse-complement the UMI" in {
val names = Seq("1:rAAAA", "1:2:rCCCC", "1:2:3:rGGGG", "blah:rAAAA+CCCC")
val results = run(
names = names,
removeUmi = true,
overrideReverseComplementUmis = false
)
results.map(_.name) should contain theSameElementsInOrderAs Seq("1", "1:2", "1:2:3", "blah")
results.map(_.umi) should contain theSameElementsInOrderAs Seq("TTTT", "GGGG", "CCCC", "TTTT-CCCC")
Expand Down

0 comments on commit 56b4d75

Please sign in to comment.