diff --git a/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala b/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala index 332471c26..deb38f3a6 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/CopyUmiFromReadName.scala @@ -44,13 +44,10 @@ import com.fulcrumgenomics.util.{Io, ProgressLogger} |The `--umi-delimiter` option specifies the delimiter on which to split. The resulting UMI in the `RX` tag |will always be hyphen delimited. | - |Some tools (e.g. BCL Convert) may reverse-complement UMIs on R2 and add a prefix to indicate that the sequence - |has been reverse-complemented. The `--reverse-complement-prefix` option specifies the prefix character(s) and - |causes them to be removed. Any reverse-complemented UMIs will be normalized (i.e., reverse-complemented back to - |be in the forward orientation). - | - |To obtain behavior similar to `umi_tools`' `--umi-separator=":r"`, specify the delimiter and - |prefix separately, i.e. `--field-delimiter=":"` and `--reverse-complement-prefix="r"`. + |Some tools (e.g. BCL Convert) may reverse-complement UMIs on R2 and add an 'r' prefix to indicate that the sequence + |has been reverse-complemented. By default, the 'r' prefix is removed and the sequence is reverse-complemented + |back to the forward orientation. The `--override-reverse-complement-umis` option disables the latter behavior, such that + |the 'r' prefix is removed but the UMI sequence is left as reverse-complemented. 
""") class CopyUmiFromReadName ( @arg(flag='i', doc="The input BAM file.") input: PathToBam, @@ -58,12 +55,11 @@ class CopyUmiFromReadName @arg(doc="Remove the UMI from the read name.") removeUmi: Boolean = false, @arg(doc="Delimiter between the read name and UMI.") fieldDelimiter: Char = ':', @arg(doc="Delimiter between UMI sequences.") umiDelimiter: Char = '+', - @arg(flag='p', doc="The prefix to a UMI sequence that indicates it is reverse-complemented.") reverseComplementPrefix: Option[String] = None, + @arg(doc="Do not reverse-complement UMIs prefixed with 'r'.") overrideReverseComplementUmis: Boolean = false, ) extends FgBioTool with LazyLogging { Io.assertReadable(input) Io.assertCanWriteFile(output) - validate(reverseComplementPrefix.forall(_.nonEmpty), "--reverse-complement-prefix cannot be an empty string") override def execute(): Unit = { val source = SamSource(input) @@ -73,11 +69,11 @@ class CopyUmiFromReadName progress.record(rec) writer += Umis.copyUmiFromReadName( - rec = rec, - removeUmi = removeUmi, - fieldDelimiter = fieldDelimiter, - umiDelimiter = umiDelimiter, - reverseComplementPrefix = reverseComplementPrefix, + rec = rec, + removeUmi = removeUmi, + fieldDelimiter = fieldDelimiter, + umiDelimiter = umiDelimiter, + reverseComplementPrefixedUmis = !overrideReverseComplementUmis, ) } progress.logLast() diff --git a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala index b293dac83..08369a682 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala @@ -29,6 +29,7 @@ import com.fulcrumgenomics.bam.api.SamRecord import com.fulcrumgenomics.util.Sequences object Umis { + val RevcompPrefix: String = "r" /** Copies the UMI sequence from the read name. 
 * @@ -41,21 +42,21 @@ object Umis { * @param removeUmi true to remove the UMI from the read name, otherwise only copy the UMI to the tag * @param fieldDelimiter the delimiter of fields within the read name * @param umiDelimiter the delimiter between sequences in the UMI string - * @param reverseComplementPrefix the prefix of a UMI that indicates it is reverse-complimented + * @param reverseComplementPrefixedUmis whether to reverse-complement UMIs prefixed with 'r' * @return the modified record */ def copyUmiFromReadName(rec: SamRecord, removeUmi: Boolean = false, fieldDelimiter: Char = ':', umiDelimiter: Char = '+', - reverseComplementPrefix: Option[String] = None): SamRecord = { + reverseComplementPrefixedUmis: Boolean = true): SamRecord = { // Extract and set the UMI val umi = extractUmisFromReadName( - name = rec.name, - fieldDelimiter = fieldDelimiter, - strict = false, - umiDelimiter = umiDelimiter, - reverseComplementPrefix = reverseComplementPrefix, + name = rec.name, + fieldDelimiter = fieldDelimiter, + strict = false, + umiDelimiter = umiDelimiter, + reverseComplementPrefixedUmis = reverseComplementPrefixedUmis, ) require(umi.nonEmpty, f"No valid UMI found in: ${rec.name}") umi.foreach(u => rec(ConsensusTags.UmiBases) = u) @@ -84,7 +85,7 @@ object Umis { fieldDelimiter: Char = ':', strict: Boolean, umiDelimiter: Char = '+', - reverseComplementPrefix: Option[String] = None): Option[String] = { + reverseComplementPrefixedUmis: Boolean = true): Option[String] = { // If strict, check that the read name actually has eight parts, which is expected val rawUmi = if (strict) { val colons = name.count(_ == fieldDelimiter) @@ -97,14 +98,24 @@ object Umis { Some(name.substring(idx + 1, name.length)) } - var umi = rawUmi.map(raw => reverseComplementPrefix match { - case Some(prefix) if raw.indexOf(prefix) >= 0 => - raw.split(umiDelimiter).map(seq => - (if (seq.startsWith(prefix)) Sequences.revcomp(seq.stripPrefix(prefix)) else seq).toUpperCase - ).mkString("-") - case 
_ if raw.indexOf(umiDelimiter) > 0 => raw.replace(umiDelimiter, '-').toUpperCase - case _ => raw.toUpperCase - }) + // Remove 'r' prefixes, optionally reverse-complementing the prefixed UMIs if + // reverseComplementPrefixedUmis = true, replace the delimiter (if any) with '-', + // and make sure the sequence is upper-case. + var umi = rawUmi.map(raw => + (raw.indexOf(RevcompPrefix) >= 0, raw.indexOf(umiDelimiter) > 0) match { + case (true, true) if reverseComplementPrefixedUmis => + raw.split(umiDelimiter).map(seq => + if (seq.startsWith(RevcompPrefix)) Sequences.revcomp(seq.stripPrefix(RevcompPrefix)) + else seq.stripPrefix(RevcompPrefix) + ).mkString("-").toUpperCase + case (true, false) if reverseComplementPrefixedUmis => + Sequences.revcomp(raw.stripPrefix(RevcompPrefix)).toUpperCase + case (true, true) => raw.replace(RevcompPrefix, "").replace(umiDelimiter, '-').toUpperCase + case (true, false) => raw.replace(RevcompPrefix, "").toUpperCase + case (false, true) => raw.replace(umiDelimiter, '-').toUpperCase + case (false, false) => raw.toUpperCase + } + ) val valid = umi.forall(u => u.forall(isValidUmiCharacter)) diff --git a/src/test/scala/com/fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala b/src/test/scala/com/fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala index 1159e99ef..2e4eb14de 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/CopyUmiFromReadNameTest.scala @@ -35,7 +35,7 @@ class CopyUmiFromReadNameTest extends UnitSpec with OptionValues { /** Runs CopyUmiFromReadName using the given read names returning the output read names and UMIs. 
*/ private def run(names: Iterable[String], removeUmi: Boolean, - reverseComplementPrefix: Option[String] = None): IndexedSeq[Result] = { + overrideReverseComplementUmis: Boolean = false): IndexedSeq[Result] = { // build the reads val builder = new SamBuilder() names.foreach { name => builder.addFrag(name=name, unmapped=true) } @@ -43,10 +43,10 @@ class CopyUmiFromReadNameTest extends UnitSpec with OptionValues { // run the tool val out = makeTempFile("test.", ".bam") val tool = new CopyUmiFromReadName( - input = builder.toTempFile(), - output = out, - removeUmi = removeUmi, - reverseComplementPrefix = reverseComplementPrefix, + input = builder.toTempFile(), + output = out, + removeUmi = removeUmi, + overrideReverseComplementUmis = overrideReverseComplementUmis ) executeFgbioTool(tool) @@ -80,9 +80,20 @@ class CopyUmiFromReadNameTest extends UnitSpec with OptionValues { it should "remove a reverse-complement prefix to the UMI" in { val names = Seq("1:rAAAA", "1:2:rCCCC", "1:2:3:rGGGG", "blah:rAAAA+CCCC") val results = run( - names = names, - removeUmi = true, - reverseComplementPrefix = Some("r"), + names = names, + removeUmi = true, + overrideReverseComplementUmis = true + ) + results.map(_.name) should contain theSameElementsInOrderAs Seq("1", "1:2", "1:2:3", "blah") + results.map(_.umi) should contain theSameElementsInOrderAs Seq("AAAA", "CCCC", "GGGG", "AAAA-CCCC") + } + + it should "remove a reverse-complement prefix to the UMI and reverse-complement the UMI" in { + val names = Seq("1:rAAAA", "1:2:rCCCC", "1:2:3:rGGGG", "blah:rAAAA+CCCC") + val results = run( + names = names, + removeUmi = true, + overrideReverseComplementUmis = false ) results.map(_.name) should contain theSameElementsInOrderAs Seq("1", "1:2", "1:2:3", "blah") results.map(_.umi) should contain theSameElementsInOrderAs Seq("TTTT", "GGGG", "CCCC", "TTTT-CCCC")