Skip to content

Commit

Permalink
feat!: get_genomic_mane_genes should return mane status (#398)
Browse files Browse the repository at this point in the history
  • Loading branch information
korikuzma authored Feb 12, 2025
1 parent ef342d4 commit de3d5c0
Show file tree
Hide file tree
Showing 5 changed files with 142 additions and 31 deletions.
12 changes: 10 additions & 2 deletions src/cool_seq_tool/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,18 @@ def values(cls) -> list[str]:
return [item.value for item in cls]


class ManeStatus(str, Enum):
    """Define constraints for MANE status.

    Members also subclass ``str``, so each member compares equal to its raw
    string value (e.g. ``ManeStatus.SELECT == "mane_select"``).
    """

    # Lowercased, underscore-separated form of the "MANE Select" label
    SELECT = "mane_select"
    # Lowercased, underscore-separated form of the "MANE Plus Clinical" label
    PLUS_CLINICAL = "mane_plus_clinical"


class TranscriptPriority(str, Enum):
    """Create Enum for Transcript Priority labels"""

    # The MANE labels are taken from ManeStatus so the two enums cannot drift
    # apart. Each member name is assigned exactly once: Enum raises TypeError
    # when a member name is reused within the class body.
    MANE_SELECT = ManeStatus.SELECT.value
    MANE_PLUS_CLINICAL = ManeStatus.PLUS_CLINICAL.value
    LONGEST_COMPATIBLE_REMAINING = "longest_compatible_remaining"
    GRCH38 = "grch38"

Expand Down Expand Up @@ -137,6 +144,7 @@ class ManeGeneData(BaseModel, extra="forbid"):
ncbi_gene_id: StrictInt
hgnc_id: StrictInt | None
symbol: StrictStr
status: list[ManeStatus]


class ServiceMeta(BaseModelForbidExtra):
Expand Down
60 changes: 46 additions & 14 deletions src/cool_seq_tool/sources/mane_transcript_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,26 +117,58 @@ def get_genomic_mane_genes(
:param end: Genomic end position. Assumes residue coordinates.
:return: Unique MANE gene(s) found for a genomic location
"""
# Only interested in rows where genomic location lives
mane_rows = self.df.filter(
(start >= pl.col("chr_start"))
& (end <= pl.col("chr_end"))
& (pl.col("GRCh38_chr") == ac)
).unique(subset=["#NCBI_GeneID"])
)

if len(mane_rows) == 0:
if mane_rows.is_empty():
return []

mane_rows = mane_rows.with_columns(
pl.col("#NCBI_GeneID")
.str.split_exact(":", 1)
.struct.field("field_1")
.cast(pl.Int32)
.alias("ncbi_gene_id"),
pl.col("HGNC_ID")
.str.split_exact(":", 1)
.struct.field("field_1")
.cast(pl.Int32)
.alias("hgnc_id"),
# Group rows by NCBI gene ID and transform values into the representation we
# want. MANE statuses are collected into a list sorted in descending order
# (i.e. "mane_select" before "mane_plus_clinical").
mane_rows = mane_rows.group_by("#NCBI_GeneID").agg(
[
pl.col("#NCBI_GeneID")
.first()
.str.split_exact(":", 1)
.struct.field("field_1")
.cast(pl.Int32)
.alias("ncbi_gene_id"),
pl.col("HGNC_ID")
.first()
.str.split_exact(":", 1)
.struct.field("field_1")
.cast(pl.Int32)
.alias("hgnc_id"),
pl.col("MANE_status")
.unique()
.str.to_lowercase()
.str.replace_all(" ", "_")
.alias("status")
.sort(descending=True),
pl.col("symbol").first(),
]
)

# Sort final rows based on MANE status:
# 1. Number of statuses, descending (genes with both MANE Select and MANE Plus
#    Clinical come first)
# 2. Joined status string, descending (MANE Select before MANE Plus Clinical)
# 3. NCBI gene ID, ascending
mane_rows = (
mane_rows.with_columns(
[
pl.col("status").list.len().alias("status_count"),
pl.col("status").list.join("_").alias("status_str"),
pl.col("ncbi_gene_id"),
]
)
.sort(
["status_count", "status_str", "ncbi_gene_id"],
descending=[True, True, False],
)
.drop(["status_count", "status_str", "#NCBI_GeneID"])
)
mane_rows = mane_rows.select(["ncbi_gene_id", "hgnc_id", "symbol"])
return [ManeGeneData(**mane_gene) for mane_gene in mane_rows.to_dicts()]
15 changes: 12 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,10 +340,19 @@ def genomic_tx_data():
@pytest.fixture(scope="session")
def egfr_mane_gene():
    """Create test fixture for EGFR MANE gene

    :return: ``ManeGeneData`` for EGFR whose only status is ``mane_select``
    """
    # ``status`` must be supplied: it is declared without a default on
    # ``ManeGeneData``
    return ManeGeneData(
        ncbi_gene_id=1956, hgnc_id=3236, symbol="EGFR", status=["mane_select"]
    )


@pytest.fixture(scope="session")
def braf_mane_genes():
    """Create test fixture for BRAF MANE genes

    :return: Single-element list containing the BRAF ``ManeGeneData``, which
        carries both the ``mane_select`` and ``mane_plus_clinical`` statuses
    """
    return [
        ManeGeneData(
            ncbi_gene_id=673,
            hgnc_id=1097,
            symbol="BRAF",
            # Statuses are listed in descending string order
            status=["mane_select", "mane_plus_clinical"],
        ),
    ]
4 changes: 2 additions & 2 deletions tests/mappers/test_mane_transcript.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,13 +143,13 @@ def grch38_egfr(egfr_mane_gene):


@pytest.fixture(scope="module")
def grch38_braf(braf_mane_genes):
    """Create a test fixture for grch38 responses BRAF V600E (genomic).

    :param braf_mane_genes: Fixture providing the list of BRAF MANE gene data
    :return: ``GenomicRepresentation`` built from the BRAF V600E parameters
    """
    params = {
        "pos": (140753335, 140753336),
        "status": TranscriptPriority.GRCH38.value,
        "ac": "NC_000007.14",
        # The fixture already provides a list, so it is passed through unwrapped
        "mane_genes": braf_mane_genes,
    }
    return GenomicRepresentation(**params)

Expand Down
82 changes: 72 additions & 10 deletions tests/sources/test_mane_transcript_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,43 +209,105 @@ def test_get_mane_data_from_chr_pos(


def test_get_genomic_mane_genes(
test_mane_transcript_mappings, braf_mane_gene, egfr_mane_gene
test_mane_transcript_mappings, braf_mane_genes, egfr_mane_gene
):
"""Test that get_genomic_mane_genes method works correctly"""
new_df = pl.DataFrame(
{
"#NCBI_GeneID": ["GeneID:673", "GeneID:673", "GeneID:1956", "GeneID:1"],
"#NCBI_GeneID": [
"GeneID:673",
"GeneID:673",
"GeneID:1956",
"GeneID:1",
"GeneID:2",
"GeneID:2",
"GeneID:3",
],
"Ensembl_Gene": [
"ENSG00000157764.14",
"ENSG00000157764.14",
"ENSG00000146648.21",
"ENSG1.1",
"ENSG1.1",
"ENSG1.1",
"ENSG1.1",
],
"HGNC_ID": [
"HGNC:1097",
"HGNC:1097",
"HGNC:3236",
"HGNC:1",
"HGNC:2",
"HGNC:2",
"HGNC:3",
],
"HGNC_ID": ["HGNC:1097", "HGNC:1097", "HGNC:3236", "HGNC:2"],
"symbol": ["BRAF", "BRAF", "EGFR", "Dummy"],
"symbol": ["BRAF", "BRAF", "EGFR", "Dummy1", "Dummy2", "Dummy2", "Dummy3"],
"GRCh38_chr": [
"NC_000007.14",
"NC_000007.14",
"NC_000007.14",
"NC_000007.14",
"NC_000007.14",
"NC_000007.14",
"NC_000007.14",
],
"chr_start": [
140719337,
140730665,
55019017,
55019017,
55019017,
55019017,
55019017,
],
"chr_end": [
140924929,
140924929,
55211628,
55211628,
55211628,
55211628,
55211628,
],
"MANE_status": [
"MANE Plus Clinical",
"MANE Select",
"MANE Select",
"MANE Plus Clinical",
"MANE Select",
"MANE Plus Clinical",
"MANE Select",
],
"chr_start": [140719337, 140730665, 55019017, 55019017],
"chr_end": [140924929, 140924929, 55211628, 55211628],
}
)

with patch.object(test_mane_transcript_mappings, "df", new_df):
mane_genes = test_mane_transcript_mappings.get_genomic_mane_genes(
"NC_000007.14", 140753336, 140753336
)
assert mane_genes == [braf_mane_gene]
assert mane_genes == braf_mane_genes

mane_genes = test_mane_transcript_mappings.get_genomic_mane_genes(
"NC_000007.14", 55191822, 55191822
)
assert len(mane_genes) == 2
assert egfr_mane_gene in mane_genes
assert ManeGeneData(ncbi_gene_id=1, hgnc_id=2, symbol="Dummy") in mane_genes
assert mane_genes == [
ManeGeneData(
ncbi_gene_id=2,
hgnc_id=2,
symbol="Dummy2",
status=["mane_select", "mane_plus_clinical"],
),
ManeGeneData(
ncbi_gene_id=3, hgnc_id=3, symbol="Dummy3", status=["mane_select"]
),
egfr_mane_gene,
ManeGeneData(
ncbi_gene_id=1,
hgnc_id=1,
symbol="Dummy1",
status=["mane_plus_clinical"],
),
]

# No MANE genes found for given genomic location
mane_genes = test_mane_transcript_mappings.get_genomic_mane_genes(
Expand Down

0 comments on commit de3d5c0

Please sign in to comment.