Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adapt hl.de_novo function #760

Open
wants to merge 45 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
f24e6a5
Add function to compute post probability of de novos
KoalaQin Jan 21, 2025
3f3f8e3
confidence and fail check function
KoalaQin Jan 24, 2025
e681bdc
Modify de novo function
KoalaQin Jan 28, 2025
4ee8cdc
black formatting
KoalaQin Jan 29, 2025
376811d
Reformat docstring
KoalaQin Jan 29, 2025
f80698c
Change the citation
KoalaQin Jan 29, 2025
ddf3811
Apply suggestions from code review
KoalaQin Jan 30, 2025
b5fc1b7
Transpose the thresholds table
KoalaQin Jan 30, 2025
05a4320
merge origin changes
KoalaQin Jan 30, 2025
b565fa2
Add a call_de_novo function
KoalaQin Jan 31, 2025
2f11811
Formatting
KoalaQin Jan 31, 2025
f4dfe32
isort
KoalaQin Jan 31, 2025
dce95f0
Add test module for de novo functions
KoalaQin Feb 3, 2025
7ccda17
small formatting
KoalaQin Feb 3, 2025
e9d49d4
black formatting
KoalaQin Feb 3, 2025
ef6dbe1
Black
KoalaQin Feb 3, 2025
0a5616d
isort
KoalaQin Feb 3, 2025
4376c66
Add missing docstring and notes
KoalaQin Feb 3, 2025
6bfd82e
formatting
KoalaQin Feb 3, 2025
3d977d7
Apply suggestions from code review
KoalaQin Feb 5, 2025
7252583
Reformat docstring and move helper functions
KoalaQin Feb 5, 2025
90ab407
Correct the confidence errors
KoalaQin Feb 5, 2025
2816f9c
Move is_de_novo
KoalaQin Feb 5, 2025
50f4745
reST hyperlink format in docstring
KoalaQin Feb 5, 2025
bffe574
Add test
KoalaQin Feb 6, 2025
f9a625b
Black
KoalaQin Feb 6, 2025
46538d2
remove imports
KoalaQin Feb 6, 2025
d7d69b5
docstring typo
KoalaQin Feb 6, 2025
9d37bcb
Apply suggestions from code review
KoalaQin Feb 7, 2025
f70125d
Docstring changes
KoalaQin Feb 7, 2025
fcfa796
Merge suggestions
KoalaQin Feb 7, 2025
a387d19
A small change
KoalaQin Feb 7, 2025
f45bc3c
Add error case
KoalaQin Feb 7, 2025
9c36896
Add suggested test cases
KoalaQin Feb 7, 2025
768d9f4
Black
KoalaQin Feb 7, 2025
c2050d0
extra space removal
KoalaQin Feb 7, 2025
2742e26
Add warning block back
KoalaQin Feb 7, 2025
285865c
Change wording
KoalaQin Feb 7, 2025
ace2d37
Apply suggestions from code review
KoalaQin Feb 8, 2025
49f6249
Address review comments
KoalaQin Feb 8, 2025
457cd9a
Black
KoalaQin Feb 8, 2025
3571265
Change indel HIGH code
KoalaQin Feb 8, 2025
9b1515b
typo
KoalaQin Feb 8, 2025
b899e7b
Remove extra
KoalaQin Feb 9, 2025
5631c62
Adjust table
KoalaQin Feb 9, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
366 changes: 366 additions & 0 deletions gnomad/sample_qc/relatedness.py
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I missed this in previous reviews -- is there a way to italicize de novo in these docstrings?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I did for my functions except the ones with the superlink and in bold, is it okay? Do you want me to change the font for de novo above in the same file?

Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
import hail as hl
import networkx as nx

from gnomad.utils.annotations import get_copy_state_by_sex
from gnomad.utils.filtering import add_filters_expr

logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s")
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
Expand Down Expand Up @@ -1298,3 +1301,366 @@ def _get_alt_count(locus, gt, is_female):
)

return sib_stats


def get_freq_prior(freq_prior_expr: hl.expr.Float64Expression, min_pop_prior=100 / 3e7):
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
"""
Get the population frequency prior for a de novo mutation.

:param freq_prior_expr: The population frequency prior for the variant.
:param min_pop_prior: The minimum population frequency prior.
"""
return hl.max(
hl.or_else(
ch-kr marked this conversation as resolved.
Show resolved Hide resolved
hl.case()
.when((freq_prior_expr >= 0) & (freq_prior_expr <= 1), freq_prior_expr)
.or_error(
hl.format(
"de_novo: expect 0 <= freq_prior_expr <= 1, found %.3e",
freq_prior_expr,
)
),
0.0,
),
min_pop_prior,
)


def transform_pl_to_pp(pl_expr: hl.expr.ArrayExpression) -> hl.expr.ArrayExpression:
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
"""
Transform the PLs into the probability of observing genotype.

:param pl_expr: ArrayExpression of PL.
:return: ArrayExpression of the probability of observing each genotype.
"""
return hl.bind(lambda x: x / hl.sum(x), 10 ** (-pl_expr / 10))


def calculate_de_novo_post_prob(
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
proband_pl: hl.expr.ArrayExpression,
father_pl: hl.expr.ArrayExpression,
mother_pl: hl.expr.ArrayExpression,
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
locus_expr: hl.expr.LocusExpression,
is_xx_expr: hl.expr.BooleanExpression,
freq_prior_expr: hl.expr.Float64Expression,
min_pop_prior: Optional[float] = 100 / 3e7,
de_novo_prior: Optional[float] = 1 / 3e7,
) -> hl.expr.Float64Expression:
"""
Calculate the posterior probability of a de novo mutation.

This function computes the posterior probability of a de novo mutation (P_dn)
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
using the likelihoods of the proband's and parents' genotypes and the population
frequency prior for the variant.

Based on Kaitlin Samocha's [de novo caller](
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you also add a version of this documentation back to the docstring?

    The simplified version is the same as Hail's methods when using the
    `ignore_in_sample_allele_frequency` parameter. The main difference is that this
    mode should be used when families larger than a single trio are in the dataset, in
    which an allele might be de novo in a parent and transmitted to a child in the
    dataset. This mode will not consider the allele count (AC) in the dataset, and will
    only consider the Phred-scaled likelihoods (PL) of the child and parents,
    allele balance (AB) of the child and parents, the genotype quality (GQ) of the
    child, the depth (DP) of the child and parents, and the population frequency prior.

the explanation that this function behaves the same as Hail's + ignore_in_sample_allele_frequency and which exact fields the function needs is clear and helpful to have

https://github.com/ksamocha/de_novo_scripts),
the posterior probability of a de novo mutation (`P_dn`) is computed as:

P_dn = P(DN | data) / (P(DN | data) + P(missed het in parent(s) | data))
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved

The terms are defined as:
P(DN | data): The probability of a de novo mutation given the data. This is
computed as:

P(DN | data) = P(data | DN) * P(DN)

P(data | DN): The probability of observing the data under the assumption of a de novo mutation:
* Autosomes and PAR regions:

P(data | DN) = P(hom_ref in father) * P(hom_ref in mother) * P(het in proband)

* X non-PAR regions (XY only):

P(data | DN) = P(hom_ref in mother) * P(hom_alt in proband)

* Y non-PAR regions (XY only):

P(data | DN) = P(hom_ref in father) * P(hom_alt in proband)

P(DN): The prior probability of a de novo mutation from the literature,
P(DN) = 1 / 3e7

P(missed het in parent(s) | data): The probability of observing missed het in
parent(s) given the data. This is computed as:

P(missed het in parent(s) | data) = P(data | at least one parent is het) * P(one parent is het)

P(data | missed het in parent(s)): The probability of observing the data under the assumption of a missed het in parent(s):
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
* Autosomes and PAR regions:

P(data | missed het in parents) = (P(het in father) * P(hom_ref in mother) + P(hom_ref in father) * P(het in mother)) * P(het in proband)

* X non-PAR regions:

P(data | missed het in mother) = (P(het in mother) + P(hom_var in mother)) * P(hom_var in proband)

* Y non-PAR regions:

P(data | missed het in father) = (P(het in father) + P(hom_var in father)) * P(hom_var in proband)

- P(het in one parent): The prior probability for at least one alternate allele between the parents depends on the alternate allele frequency:
1 - (1 - freq_prior)**4, where freq_prior is the population frequency prior for the variant.

:param proband_pl: Phred-scaled genotype likelihoods for the proband.
:param father_pl: Phred-scaled genotype likelihoods for the father.
:param mother_pl: Phred-scaled genotype likelihoods for the mother.
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
:param locus_expr: LocusExpression of the variant.
:param is_xx_expr: BooleanExpression indicating whether the proband has XX sex karyotype.
:param freq_prior_expr: Population frequency prior for the variant.
:param min_pop_prior: Minimum population frequency prior (default: 100/3e7).
:param de_novo_prior: Prior probability of a de novo mutation (default: 1/3e7).
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
:return: Posterior probability of a de novo mutation (P_dn).
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
"""
# Ensure valid genomic context
diploid_expr, hemi_x, hemi_y = get_copy_state_by_sex(locus_expr, is_xx_expr)

# Adjust frequency prior
freq_prior_expr = get_freq_prior(freq_prior_expr, min_pop_prior)
prior_one_parent_het = 1 - (1 - freq_prior_expr) ** 4

# Convert PL to probabilities
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I changed this back because your suggestion will need more changes below.

pp_proband = transform_pl_to_pp(proband_pl)
pp_father = transform_pl_to_pp(father_pl)
pp_mother = transform_pl_to_pp(mother_pl)
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved

# Compute P(data | DN)
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
prob_data_given_dn = (
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
hl.case()
.when(hemi_x, pp_mother[0] * pp_proband[2])
.when(hemi_y, pp_father[0] * pp_proband[2])
.when(diploid_expr, pp_father[0] * pp_mother[0] * pp_proband[1])
.or_missing()
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
)

# Compute P(data | missed het in parent(s))
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
prob_data_missed_het = (
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
hl.case()
.when(
hemi_x, (pp_mother[1] + pp_mother[2]) * pp_proband[2] * prior_one_parent_het
)
.when(
hemi_y, (pp_father[1] + pp_father[2]) * pp_proband[2] * prior_one_parent_het
)
.when(
diploid_expr,
(pp_father[1] * pp_mother[0] + pp_father[0] * pp_mother[1])
* pp_proband[1]
* prior_one_parent_het,
)
.or_missing()
)

# Calculate posterior probability of de novo mutation
prob_dn_given_data = prob_data_given_dn * de_novo_prior
p_dn = prob_dn_given_data / (prob_dn_given_data + prob_data_missed_het)
return p_dn
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved


def call_de_novo(
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
locus_expr: hl.expr.LocusExpression,
proband_expr: hl.expr.StructExpression,
father_expr: hl.expr.StructExpression,
mother_expr: hl.expr.StructExpression,
is_xx_expr: hl.expr.BooleanExpression,
) -> hl.expr.BooleanExpression:
"""
Call a de novo mutation based on the proband and parent genotypes.

:param locus_expr: Variant locus.
:param proband_expr: Proband genotype info, required field: GT.
:param father_expr: Father genotype info, required field: GT.
:param mother_expr: Mother genotype info, required field: GT.
:param is_xx_expr: Whether the proband is XX.
:return: BooleanExpression indicating whether the variant is a de novo mutation.
"""
# Ensure valid genomic context
diploid_expr, hemi_x_expr, hemi_y_expr = get_copy_state_by_sex(
locus_expr, is_xx_expr
)

is_de_novo = (
diploid_expr
& (
proband_expr.GT.is_het()
& father_expr.GT.is_hom_ref()
& mother_expr.GT.is_hom_ref()
)
| hemi_x_expr & (proband_expr.GT.is_hom_var() & mother_expr.GT.is_hom_ref())
| hemi_y_expr & (proband_expr.GT.is_hom_var() & father_expr.GT.is_hom_ref())
)

KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
return is_de_novo


def get_de_novo_expr(
locus_expr: hl.expr.LocusExpression,
alleles_expr: hl.expr.ArrayExpression,
proband_expr: hl.expr.StructExpression,
father_expr: hl.expr.StructExpression,
mother_expr: hl.expr.StructExpression,
is_xx_expr: hl.expr.BooleanExpression,
freq_prior_expr: hl.expr.Float64Expression,
min_pop_prior: float = 100 / 3e7,
de_novo_prior: float = 1 / 3e7,
min_dp_ratio: float = 0.1,
min_gq: int = 20,
min_proband_ab: float = 0.2,
max_parent_ab: float = 0.05,
min_de_novo_p: float = 0.05,
high_conf_dp_ratio: float = 0.2,
dp_threshold_snp: int = 10,
high_med_conf_ab: float = 0.3,
low_conf_ab: float = 0.2,
high_conf_p: float = 0.99,
med_conf_p: float = 0.5,
low_conf_p: float = 0.2,
) -> hl.expr.StructExpression:
"""
Get the de novo status of a variant based on the proband and parent genotypes.

Thresholds:
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved

+----------------+------------+----------------------+------+------+------+------+------+
| Category | P(de novo) | AB* | AD* | DP* | DR* | GQ* | AC* |
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
+----------------+------------+----------------------+------+------+------+------+------+
| FAIL | < 0.05 | AB(parents) > 0.05 | 0 | | <0.1 | <20 | |
| | | OR AB(proband) < 0.2 | | | | | |
| HIGH (Indel) | > 0.99 | > 0.3 | | | | | =1 |
| HIGH (SNV) 1 | > 0.99 | > 0.3 | | >10 | | | |
| HIGH (SNV) 2 | > 0.5 | > 0.3 | | | >0.2 | | <10 |
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the columns might be a little mixed up here -- aren't the criteria either:

  • p > 0.99 AND AB > 0.3 AND DR > 0.2
  • p > 0.5 AND AB > 0.3 AND DP > 10
    (from Hail's documentation)? these columns have:
  • p > 0.99 AND AB > 0.3 AND DP > 10
  • p > 0.5 AND AB > 0.3 AND DR > 0.2

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great catch, the table is a bit mixed up, based on https://github.com/ksamocha/de_novo_scripts/blob/bde3e40cba46e02d5b45bc4d780de7761e02aee6/de_novo_finder_3.py#L100
Kaitlin wanted to rescue some MEDIUM SNVs by their DP. Given we're having too many de novos, should we ignore this logic?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's fine to keep Kaitlin's original flags. we'll only release the high confidence de novos in a TSV and full set of de novos in the Hail Table (along with documentation of the Hail Table and how to filter it)

| MEDIUM | > 0.5 | > 0.3 | | | | | |
| LOW | > 0.2 | > 0.2 | | | | | |
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in combination with this comment, the > for AB should be >= based on your code below

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the P(de novo) threshold for LOW is 0.05, not 0.2, right?

| VERY LOW | >= 0.05 | | | | | | |
+----------------+------------+----------------------+------+------+------+------+------+

* AB: Normally refers to AB for the proband, except when a threshold for parent(s) is specified for FAIL.
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved

* DP: DP for the proband.
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved

* DR: Defined as DP(proband) / DP(parent(s)).

* GQ: GQ for the proband.
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved

* AC: Supposed to be the sum of alternate alleles in the proband and parents. This has not been implemented yet due to multiple trios in one family, where an allele might be de novo in a parent and transmitted to a child in the dataset.
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved

:param locus_expr: Variant locus.
:param alleles_expr: Variant alleles. It assumes bi-allelic variants, meaning
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should this message about assuming multiallelics have been split be a warning?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added an error message, does that work?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yep looks good, can you reword this statement though from

It assumes bi-allelic variants, meaning that the matrix table or table should be already split to bi-allelics.

to

Function assumes all variants are biallelic, meaning that multiallelic variants in the input dataset should be split prior to running this function.

that the matrix table or table should be already split to bi-allelics.
:param proband_expr: Proband genotype info, required fields: GT, DP, GQ, AD, PL.
:param father_expr: Father genotype info, required fields: GT, DP, GQ, AD, PL.
:param mother_expr: Mother genotype info, required fields: GT, DP, GQ, AD, PL.
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
:param is_xx_expr: Whether the proband is XX.
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
:param freq_prior_expr: Population frequency prior for the variant.
:param min_pop_prior: Minimum population frequency prior, default to 100 / 3e7.
:param de_novo_prior: Prior probability of a de novo mutation, default to 1 / 3e7.
:param min_dp_ratio: Minimum depth ratio for proband to parents, default to 0.1.
:param min_gq: Minimum genotype quality for the proband, default to 20.
:param min_proband_ab: Minimum allele balance for the proband, default to 0.2.
:param max_parent_ab: Maximum allele balance for parents, default to 0.05.
:param min_de_novo_p: Minimum probability for variant to be called de novo, default to 0.05.
:param high_conf_dp_ratio: DP ratio threshold of proband DP to combined DP in parents for high confidence, default to 0.2.
:param dp_threshold_snp: Minimum depth for high-confidence SNPs, default to 10.
:param high_med_conf_ab: AB threshold for high/medium confidence, default to 0.3.
:param low_conf_ab: AB threshold for low confidence, default to 0.2.
:param high_conf_p: P(de novo) threshold for high confidence, default to 0.99.
:param med_conf_p: P(de novo) threshold for medium confidence, default to 0.5.
:param low_conf_p: P(de novo) threshold for low confidence, default to 0.2.
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved

KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
:return: A StructExpression with the de novo status and confidence.
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
"""
# Determine genomic context
diploid_expr, hemi_x_expr, hemi_y_expr = get_copy_state_by_sex(
locus_expr, is_xx_expr
)

p_de_novo = calculate_de_novo_post_prob(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is the code to approximate PLs stored somewhere else? I thought that was part of the work done to update this de novo caller

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe let's keep that in gnomad_qc?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

makes sense to me to keep the actual code in gnomad_qc, but can you add this documentation back to the function? I took this from Julia's function:

    .. warning::

        This method assumes that the PL and AD fields are present in the genotype fields
        of the child and parents. If they are missing, this method will not work.
        Many of our larger datasets have the PL and AD fields intentionally removed to
        save storage space. If this is the reason that the PL and AD fields are
        missing, the only way to use this method is to set them to their approximate
        values:

        .. code-block:: python

                PL=hl.or_else(PL, [0, GQ, 2 * GQ])
                AD=hl.or_else(AD, [DP, 0])

this is very clear documentation for our users both as a warning that this function needs AD and PL and how to approximate them if they also need to do so

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just saw this one and added now.

KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
proband_expr.PL,
father_expr.PL,
mother_expr.PL,
locus_expr,
is_xx_expr,
freq_prior_expr,
min_pop_prior=min_pop_prior,
de_novo_prior=de_novo_prior,
)

# Calculate DP ratio
parent_dp = (
hl.case()
.when(diploid_expr, father_expr.DP + mother_expr.DP)
.when(hemi_x_expr, mother_expr.DP)
.when(hemi_y_expr, father_expr.DP)
.or_missing()
)
dp_ratio = proband_expr.DP / parent_dp

# Calculate proband AB and assign variant type
proband_ab = proband_expr.AD[1] / hl.sum(proband_expr.AD)
ch-kr marked this conversation as resolved.
Show resolved Hide resolved
is_snp = hl.is_snp(alleles_expr[0], alleles_expr[1])

# Confidence assignment
confidence = (
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
hl.case()
.when(
(
is_snp
& (p_de_novo > 0.99)
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
& (proband_ab > high_med_conf_ab)
& (
(proband_expr.DP > dp_threshold_snp)
| (dp_ratio > high_conf_dp_ratio)
)
)
| (~is_snp & (p_de_novo > high_conf_p) & (proband_ab > high_med_conf_ab)),
"HIGH",
)
.when((p_de_novo > med_conf_p) & (proband_ab > high_med_conf_ab), "MEDIUM")
.when((p_de_novo > low_conf_p) & (proband_ab >= low_conf_ab), "LOW")
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
.when(
(p_de_novo >= min_de_novo_p),
"VERY LOW",
)
.or_missing()
)

parent_sum_ad_0 = (
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
hl.case()
.when(
diploid_expr, (hl.sum(father_expr.AD) == 0) | (hl.sum(mother_expr.AD) == 0)
)
.when(hemi_x_expr, hl.sum(mother_expr.AD) == 0)
.when(hemi_y_expr, hl.sum(father_expr.AD) == 0)
.or_missing()
)

fail_max_parent_ab = (
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
hl.case()
.when(
diploid_expr,
(father_expr.AD[1] / hl.sum(father_expr.AD) > max_parent_ab)
| (mother_expr.AD[1] / hl.sum(mother_expr.AD) > max_parent_ab),
)
.when(hemi_x_expr, mother_expr.AD[1] / hl.sum(mother_expr.AD) > max_parent_ab)
.when(hemi_y_expr, father_expr.AD[1] / hl.sum(father_expr.AD) > max_parent_ab)
.or_missing()
)

# Fail checks
fail_checks = {
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
"min_dp_ratio": dp_ratio < min_dp_ratio,
"parent_sum_ad_0": parent_sum_ad_0,
"max_parent_ab": fail_max_parent_ab,
"min_proband_ab": proband_ab < min_proband_ab,
KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
"min_proband_gq": proband_expr.GQ < min_gq,
"min_de_novo_p": p_de_novo <= min_de_novo_p,
}

fail = hl.any(list(fail_checks.values()))
result_expr = hl.struct(
p_de_novo=hl.if_else(fail, hl.missing(hl.tfloat64), p_de_novo),
confidence=hl.if_else(fail, hl.missing(hl.tstr), confidence),
fail_reason=add_filters_expr(filters=fail_checks),
)

KoalaQin marked this conversation as resolved.
Show resolved Hide resolved
return result_expr
Loading