Skip to content

Commit

Permalink
sort final list
Browse files Browse the repository at this point in the history
  • Loading branch information
korikuzma committed Feb 12, 2025
1 parent a29f211 commit 482519a
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 13 deletions.
23 changes: 22 additions & 1 deletion src/cool_seq_tool/sources/mane_transcript_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ def get_genomic_mane_genes(
:param end: Genomic end position. Assumes residue coordinates.
:return: Unique MANE gene(s) found for a genomic location
"""
# Keep only rows whose [chr_start, chr_end] range contains the queried positions
mane_rows = self.df.filter(
(start >= pl.col("chr_start"))
& (end <= pl.col("chr_end"))
Expand All @@ -126,6 +127,8 @@ def get_genomic_mane_genes(
if mane_rows.is_empty():
return []

# Group rows by NCBI gene ID and transform values to the representation we want.
# MANE status values are collected into a list in DESC order.
mane_rows = mane_rows.group_by("#NCBI_GeneID").agg(
[
pl.col("#NCBI_GeneID")
Expand All @@ -149,5 +152,23 @@ def get_genomic_mane_genes(
pl.col("symbol").first(),
]
)
mane_rows = mane_rows.drop("#NCBI_GeneID")

# Sort final rows based on MANE status:
# 1. By number of statuses, DESC (two statuses means the gene has both
#    MANE Select and MANE Plus Clinical, so those genes come first)
# 2. Then by the joined status string, DESC
# 3. Then by NCBI gene ID, ASC
mane_rows = (
mane_rows.with_columns(
[
pl.col("status").list.len().alias("status_count"),
pl.col("status").list.join("_").alias("status_str"),
pl.col("ncbi_gene_id"),
]
)
.sort(
["status_count", "status_str", "ncbi_gene_id"],
descending=[True, True, False],
)
.drop(["status_count", "status_str", "#NCBI_GeneID"])
)
return [ManeGeneData(**mane_gene) for mane_gene in mane_rows.to_dicts()]
75 changes: 63 additions & 12 deletions tests/sources/test_mane_transcript_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,27 +214,68 @@ def test_get_genomic_mane_genes(
"""Test that get_genomic_mane_genes method works correctly"""
new_df = pl.DataFrame(
{
"#NCBI_GeneID": ["GeneID:673", "GeneID:673", "GeneID:1956", "GeneID:1"],
"#NCBI_GeneID": [
"GeneID:673",
"GeneID:673",
"GeneID:1956",
"GeneID:1",
"GeneID:2",
"GeneID:2",
"GeneID:3",
],
"Ensembl_Gene": [
"ENSG00000157764.14",
"ENSG00000157764.14",
"ENSG00000146648.21",
"ENSG1.1",
"ENSG1.1",
"ENSG1.1",
"ENSG1.1",
],
"HGNC_ID": ["HGNC:1097", "HGNC:1097", "HGNC:3236", "HGNC:2"],
"symbol": ["BRAF", "BRAF", "EGFR", "Dummy"],
"HGNC_ID": [
"HGNC:1097",
"HGNC:1097",
"HGNC:3236",
"HGNC:1",
"HGNC:2",
"HGNC:2",
"HGNC:3",
],
"symbol": ["BRAF", "BRAF", "EGFR", "Dummy1", "Dummy2", "Dummy2", "Dummy3"],
"GRCh38_chr": [
"NC_000007.14",
"NC_000007.14",
"NC_000007.14",
"NC_000007.14",
"NC_000007.14",
"NC_000007.14",
"NC_000007.14",
],
"chr_start": [
140719337,
140730665,
55019017,
55019017,
55019017,
55019017,
55019017,
],
"chr_end": [
140924929,
140924929,
55211628,
55211628,
55211628,
55211628,
55211628,
],
"chr_start": [140719337, 140730665, 55019017, 55019017],
"chr_end": [140924929, 140924929, 55211628, 55211628],
"MANE_status": [
"MANE Plus Clinical",
"MANE Select",
"MANE Select",
"MANE Plus Clinical",
"MANE Select",
"MANE Plus Clinical",
"MANE Select",
],
}
Expand All @@ -249,14 +290,24 @@ def test_get_genomic_mane_genes(
mane_genes = test_mane_transcript_mappings.get_genomic_mane_genes(
"NC_000007.14", 55191822, 55191822
)
assert len(mane_genes) == 2
assert egfr_mane_gene in mane_genes
assert (
assert mane_genes == [
ManeGeneData(
ncbi_gene_id=1, hgnc_id=2, symbol="Dummy", status=["mane_select"]
)
in mane_genes
)
ncbi_gene_id=2,
hgnc_id=2,
symbol="Dummy2",
status=["mane_select", "mane_plus_clinical"],
),
ManeGeneData(
ncbi_gene_id=3, hgnc_id=3, symbol="Dummy3", status=["mane_select"]
),
egfr_mane_gene,
ManeGeneData(
ncbi_gene_id=1,
hgnc_id=1,
symbol="Dummy1",
status=["mane_plus_clinical"],
),
]

# No MANE genes found for given genomic location
mane_genes = test_mane_transcript_mappings.get_genomic_mane_genes(
Expand Down

0 comments on commit 482519a

Please sign in to comment.