From f559b3fe9eea60a94b85ca7d3a9cdad00641b31a Mon Sep 17 00:00:00 2001 From: jdidion Date: Wed, 3 Apr 2024 15:39:34 -0700 Subject: [PATCH] add tests --- .../fulcrumgenomics/vcf/DownsampleVcf.scala | 111 ++++++++++++------ .../vcf/DownsampleVcfTest.scala | 90 +++++++++++--- 2 files changed, 145 insertions(+), 56 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala b/src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala index b2c40eb0c..a66460f20 100644 --- a/src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala +++ b/src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala @@ -97,69 +97,102 @@ object DownsampleVcf extends LazyLogging { def downsampleAndRegenotype(gt: Genotype, proportion: Double, random: Random, epsilon: Double): Genotype = { val oldAds = gt.getOrElse[IndexedSeq[Int]]("AD", throw new Exception(s"AD tag not found for sample ${gt.sample}")) val newAds = downsampleADs(oldAds, proportion, random) - val Seq(aa, ab, bb) = computePls(newAds) - val Seq(alleleA, alleleB) = gt.alleles.toSeq - - val calls = { - if (aa == 0 && ab == 0 && bb == 0) IndexedSeq(NoCallAllele, NoCallAllele) - else if (aa < ab && aa < bb) IndexedSeq(alleleA, alleleA) - else if (bb < ab && bb < aa) IndexedSeq(alleleB, alleleB) - else IndexedSeq(alleleA, alleleB) - } - gt.copy(attrs=Map("PL" -> IndexedSeq(aa, ab, bb), "AD" -> newAds, "DP" -> newAds.sum), calls=calls) - } - - /** - * Compute the genotype likelihoods given the allele depths, assuming a diploid genotype (i.e. - * two allele depths). - * @param ads The input depths for the two alleles A and B. - * @return a list of three likelihoods for the alleles AA, AB, and BB. - */ - def computePls(ads: IndexedSeq[Int]): IndexedSeq[Int] = { - require(ads.length == 2, "there must be exactly two allele depths") - val likelihoods = Likelihoods(ads(0), ads(1)) - IndexedSeq(likelihoods.aa.round.toInt, likelihoods.ab.round.toInt, likelihoods.bb.round.toInt) + val likelihoods = Likelihoods(newAds) + val pls = likelihoods.pls + val calls = likelihoods.mostLikelyCall(gt.alleles.toSeq) + gt.copy(attrs=Map("PL" -> pls, "AD" -> newAds, "DP" -> newAds.sum), calls=calls) } object Likelihoods { - /** Computes the likelihoods for each possible genotype. - * + /** Computes the likelihoods for each possible biallelic genotype. * @param alleleDepthA the reference allele depth * @param alleleDepthB the alternate allele depth * @param epsilon the error rate for genotyping * @return a new `Likelihood` that has the likelihoods of AA, AB, and BB */ - def apply(alleleDepthA: Int, alleleDepthB: Int, epsilon: Double=0.01): Likelihoods = { + def biallelic(alleleDepthA: Int, alleleDepthB: Int, epsilon: Double = 0.01): Likelihoods = { val aGivenAA = log10(1 - epsilon) val aGivenBB = log10(epsilon) val aGivenAB = log10((1 - epsilon) / 2) - val rawGlAA = ((alleleDepthA * aGivenAA) + (alleleDepthB * aGivenBB)) * -10 - val rawGlBB = ((alleleDepthA * aGivenBB) + (alleleDepthB * aGivenAA)) * -10 - val rawGlAB = ((alleleDepthA + alleleDepthB) * aGivenAB) * -10 + val rawGlAA = ((alleleDepthA * aGivenAA) + (alleleDepthB * aGivenBB)) + val rawGlBB = ((alleleDepthA * aGivenBB) + (alleleDepthB * aGivenAA)) + val rawGlAB = ((alleleDepthA + alleleDepthB) * aGivenAB) - val minGL = math.min(math.min(rawGlAA, rawGlAB), rawGlBB) + Likelihoods(2, IndexedSeq(rawGlAA, rawGlAB, rawGlBB)) + } + /** Computes the likelihoods for each possible multiallelic genotype. + * @param alleleDepths the sequence of allele depths in the order specified in the VCF + * @param epsilon the error rate for genotyping + * @return a new `Likelihood` that has the likelihoods of all possible genotypes in the order + * specified in VFC spec for the GL/PL tags. + */ + def multiallelic(alleleDepths: IndexedSeq[Int], epsilon: Double = 0.01): Likelihoods = { + val numAlleles = alleleDepths.length + // probabilities associated with each possible genotype for a pair of alleles + val probs: Array[Double] = Array( + math.log10(epsilon), + math.log10((1 - epsilon) / 2), + math.log10(1 - epsilon) + ) + // raw genotype log-likelihoods Likelihoods( - aa = rawGlAA - minGL, - ab = rawGlAB - minGL, - bb = rawGlBB - minGL + numAlleles, + (0 until numAlleles).flatMap(b => + (0 to b).map(a => + (0 until numAlleles).map(allele => + probs(Array(a, b).count(_ == allele)) * alleleDepths(allele) + ).sum + ) + ) ) } + + def apply(alleleDepths: IndexedSeq[Int], epsilon: Double = 0.01): Likelihoods = { + require(alleleDepths.length >= 2, "at least two alleles are required to calculate genotype likelihoods") + if (alleleDepths.length > 2) multiallelic(alleleDepths, epsilon) + else biallelic(alleleDepths(0), alleleDepths(1), epsilon) + } } - /** Stores the log10(likelihoods) for all possible bi-allelic genotypes. - * - * @param aa likelihood of AA - * @param ab likelihood of AB - * @param bb likelihood of BB + /** Stores the log10(likelihoods) for all possible genotypes. + * @param numAlleles the number of alleles the variant has + * @param genotypeLikelihoods sequence of GLs in the order specified in the VCF spec */ - case class Likelihoods(aa: Double, ab: Double, bb: Double) { + case class Likelihoods(numAlleles: Int, genotypeLikelihoods: IndexedSeq[Double]) { /** * Returns the likelihoods as a list of phred-scaled integers (i.e, the value of the PL tag). * @return a list of phred-scaled likelihooodS for AA, AB, BB. */ - def pls = IndexedSeq(aa.round.toInt, ab.round.toInt, bb.round.toInt) + def pls: IndexedSeq[Int] = { + // subtract the min value so the smallest GL is 0, then multiply by -10 and convert to + // Int to make it PHRED-scale + val rawPL = genotypeLikelihoods.map(gl => gl * -10) + val minPL = rawPL.min + rawPL.map(pl => (pl - minPL).round.toInt) + } + + def mostLikelyGenotype: Option[(Int, Int)] = { + val minIndexes = pls.zipWithIndex.filter(pair => pair._1 == 0) + minIndexes.length match { + case 0 => throw new RuntimeException("expected the most likely PL to have a value of 0.0") + case 1 => { + val genotypes = + for (b <- 0 until numAlleles; a <- 0 to b) + yield (a, b) + Some(genotypes(minIndexes.head._2)) + } + case _ => None // if multiple genotypes are most likely, don't make a call + } + } + + def mostLikelyCall(alleles: Seq[Allele]): IndexedSeq[Allele] = { + mostLikelyGenotype match { + case None => IndexedSeq(NoCallAllele, NoCallAllele) + case Some((a, b)) => IndexedSeq(alleles(a), alleles(b)) + } + } } } diff --git a/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala b/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala index 1af287e18..f6b6eb7c2 100644 --- a/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala +++ b/src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala @@ -6,7 +6,7 @@ import com.fulcrumgenomics.util.Metric import com.fulcrumgenomics.vcf.api.Allele.SimpleAllele import com.fulcrumgenomics.vcf.api.{Allele, AlleleSet, Genotype, Variant} import com.fulcrumgenomics.testing.UnitSpec -import com.fulcrumgenomics.vcf.DownsampleVcf.{Likelihoods, computePls, downsampleAndRegenotype} +import com.fulcrumgenomics.vcf.DownsampleVcf.{Likelihoods, downsampleAndRegenotype} import scala.util.Random @@ -187,7 +187,7 @@ class DownsampleVcfTest extends UnitSpec { "DownsampleVcf.computePls" should "return new PLs that are not always 0,0,0" in { val ads = IndexedSeq[Int](0, 100) val expected = IndexedSeq(1996, 301, 0) - val newlikelihoods = computePls(ads) + val newlikelihoods = Likelihoods(ads).pls newlikelihoods should contain theSameElementsInOrderAs expected } @@ -196,51 +196,57 @@ class DownsampleVcfTest extends UnitSpec { */ "DownsampleVcf.Likelihoods" should "return ref if all allele depths are zero" in { - val likelihood = Likelihoods(alleleDepthA=0, alleleDepthB=0) + val likelihood = Likelihoods(IndexedSeq(0, 0)) val expected = IndexedSeq[Int](0, 0, 0) likelihood.pls.length shouldBe expected.length likelihood.pls should contain theSameElementsInOrderAs expected } it should "return a likelihood of 0 for AA if there are only ref alleles observed" in { - val likelihood = Likelihoods(alleleDepthA = 10, alleleDepthB = 0) + val likelihood = Likelihoods(IndexedSeq(10, 0)) val expected = IndexedSeq[Int](0, 30, 200) likelihood.pls should contain theSameElementsInOrderAs expected } it should "return a likelihood of 0 for BB if there are only alt alleles observed" in { - val likelihood = Likelihoods(alleleDepthA = 0, alleleDepthB = 10) + val likelihood = Likelihoods(IndexedSeq(0, 10)) val expected = IndexedSeq[Int](200, 30, 0) likelihood.pls should contain theSameElementsInOrderAs expected } it should "return a likelihood of 0 for AB if there are an equal number of ref and alt alleles" in { - val likelihood = Likelihoods(alleleDepthA = 5, alleleDepthB = 5) + val likelihood = Likelihoods(IndexedSeq(5, 5)) val expected = IndexedSeq[Int](70, 0, 70) likelihood.pls should contain theSameElementsInOrderAs expected } it should "return a likelihood of 0 for AA if the AD A >> AD B" in { - val likelihood = Likelihoods(alleleDepthA = 15, alleleDepthB = 2) + val likelihood = Likelihoods(IndexedSeq(15, 2)) likelihood.pls(0) == 0 } it should "return a likelihood of 0 for AB if AD.A and AD.B are similar but not equal" in { - val likelihood = Likelihoods(alleleDepthA = 15, alleleDepthB = 17) + val likelihood = Likelihoods(IndexedSeq(15, 17)) likelihood.pls(1) == 0 } it should "return a likelihood of 0 for BB if AD.B >> AD.A but neither are 0" in { - val likelihood = Likelihoods(alleleDepthA = 3, alleleDepthB = 30) + val likelihood = Likelihoods(IndexedSeq(3, 30)) likelihood.pls(2) == 0 } it should "return correct values when there are very few reads" in { - Likelihoods(0, 0).pls should contain theSameElementsInOrderAs IndexedSeq(0, 0, 0) - Likelihoods(1, 0).pls should contain theSameElementsInOrderAs IndexedSeq(0, 3, 20) - Likelihoods(1, 1).pls should contain theSameElementsInOrderAs IndexedSeq(14, 0, 14) - Likelihoods(0, 2).pls should contain theSameElementsInOrderAs IndexedSeq(40, 6, 0) - Likelihoods(1, 2).pls should contain theSameElementsInOrderAs IndexedSeq(31, 0, 11) + Likelihoods(IndexedSeq(0, 0)).pls should contain theSameElementsInOrderAs IndexedSeq(0, 0, 0) + Likelihoods(IndexedSeq(1, 0)).pls should contain theSameElementsInOrderAs IndexedSeq(0, 3, 20) + Likelihoods(IndexedSeq(1, 1)).pls should contain theSameElementsInOrderAs IndexedSeq(14, 0, 14) + Likelihoods(IndexedSeq(0, 2)).pls should contain theSameElementsInOrderAs IndexedSeq(40, 6, 0) + Likelihoods(IndexedSeq(1, 2)).pls should contain theSameElementsInOrderAs IndexedSeq(31, 0, 11) + } + + it should "return correct values for multi-allelic variants" in { + Likelihoods(IndexedSeq(0, 0, 0)).pls should contain theSameElementsInOrderAs IndexedSeq(0, 0, 0, 0, 0, 0) + Likelihoods(IndexedSeq(10, 0, 0)).pls should contain theSameElementsInOrderAs IndexedSeq(0, 30, 200, 30, 200, 200) + Likelihoods(IndexedSeq(10, 10, 0)).pls should contain theSameElementsInOrderAs IndexedSeq(139, 0, 139, 169, 169, 339) } @@ -251,10 +257,10 @@ class DownsampleVcfTest extends UnitSpec { Genotype(alleles=AlleleSet(ref=SimpleAllele(ref), alts=IndexedSeq(Allele(alt))), sample=sample, calls=IndexedSeq[Allele](Allele(ref), Allele(alt)), - attrs=Map("AD" -> ads, "PL" -> Likelihoods(alleleDepthA = ads(0), alleleDepthB = ads(1)))) + attrs=Map("AD" -> ads, "PL" -> Likelihoods(ads)) + ) } - "DownsampleVcf.downsampleAndRegneotype(Genotype)" should "return no call if all allele depths are zero" in { val geno = makeGt(ref="A", alt="T", ads=IndexedSeq(0,0)) val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.01, random = new Random(42), epsilon = 0.01) @@ -298,6 +304,30 @@ class DownsampleVcfTest extends UnitSpec { newGeno.calls should contain theSameElementsInOrderAs expected } + /* + testing DownsampleVcf.downsampleAndRegenotype on downsampleAndRegenotypes + */ + private def makeTriallelicGt(ref: String, alt1: String, alt2: String, ads: IndexedSeq[Int], sample: String ="test"): Genotype = { + val likelihoods = Likelihoods(ads) + val alleles = AlleleSet(ref=SimpleAllele(ref), alts=IndexedSeq(Allele(alt1), Allele(alt2))) + val calls = likelihoods.mostLikelyCall(alleles.toSeq) + Genotype(alleles, sample=sample, calls=calls, attrs=Map("AD" -> ads, "PL" -> likelihoods.pls)) + } + + it should "return ref,alt1 for a tri-allelic genotype if those alleles have the highest depth" in { + val geno = makeTriallelicGt(ref="A", alt1="T", alt2="G", ads=IndexedSeq(100, 100, 0)) + val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.1, random = new Random(42), epsilon = 0.01) + val expected = IndexedSeq(Allele("A"), Allele("T")) + newGeno.calls should contain theSameElementsInOrderAs expected + } + + it should "return alt1,alt2 for a tri-allelic genotype if those alleles have the highest depth" in { + val geno = makeTriallelicGt(ref="A", alt1="T", alt2="G", ads=IndexedSeq(0, 100, 100)) + val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.1, random = new Random(42), epsilon = 0.01) + val expected = IndexedSeq(Allele("T"), Allele("G")) + newGeno.calls should contain theSameElementsInOrderAs expected + } + /* testing DownsampleVcf.downsampleAndRegenotype on Variant */ @@ -306,7 +336,7 @@ class DownsampleVcfTest extends UnitSpec { Variant(chrom="1", pos=10, alleles=AlleleSet(ref=Allele(ref), alts=Allele(alt)), - genotypes=Map(sample -> makeGt(ref=ref, alt=alt, ads=ads, sample = sample)) + genotypes=Map(sample -> makeGt(ref=ref, alt=alt, ads=ads, sample=sample)) ) } @@ -345,6 +375,32 @@ class DownsampleVcfTest extends UnitSpec { newVariant.genotypes("test").calls should contain theSameElementsInOrderAs expected } + /* + testing DownsampleVcf.downsampleAndRegenotype on downsampleAndRegenotypes + */ + private def makeTriallelicVariant(ref: String, alt1: String, alt2: String, ads: IndexedSeq[Int], sample: String ="test"): Variant = { + val likelihoods = Likelihoods(ads) + val alleles = AlleleSet(ref=SimpleAllele(ref), alts=IndexedSeq(Allele(alt1), Allele(alt2))) + Variant(chrom="1", + pos=10, + alleles=alleles, + genotypes=Map(sample -> makeTriallelicGt(ref=ref, alt1=alt1, alt2=alt2, ads=ads, sample=sample))) + } + + it should "return ref,alt1 for a tri-allelic variant if those alleles have the highest depth" in { + val variant = makeTriallelicVariant(ref="A", alt1="T", alt2="G", ads=IndexedSeq(100, 100, 0)) + val newVariant = downsampleAndRegenotype(variant=variant, proportions = Map("test" -> 0.1), random = new Random(42), epsilon = 0.01) + val expected = IndexedSeq(Allele("A"), Allele("T")) + newVariant.genotypes("test").calls should contain theSameElementsInOrderAs expected + } + + it should "return alt1,alt2 for a tri-allelic variant if those alleles have the highest depth" in { + val variant = makeTriallelicVariant(ref="A", alt1="T", alt2="G", ads=IndexedSeq(0, 100, 100)) + val newVariant = downsampleAndRegenotype(variant=variant, proportions = Map("test" -> 0.1), random = new Random(42), epsilon = 0.01) + val expected = IndexedSeq(Allele("T"), Allele("G")) + newVariant.genotypes("test").calls should contain theSameElementsInOrderAs expected + } + private val sample = "test1" private val builder = VcfBuilder(samples=Seq(sample)) builder.add(chrom="chr1", pos=100, id="1", alleles=Seq("A", "C"), info=Map(),