Skip to content

Commit

Permalink
group alt alleles using gene symbols
Browse files Browse the repository at this point in the history
refs #9

enables grouping genes that are not grouped in alt_allele table.
  • Loading branch information
dhimmel committed Feb 15, 2024
1 parent 65b8711 commit a38d1d9
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 86 deletions.
97 changes: 38 additions & 59 deletions ensembl_genes/ensembl_genes.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,15 +44,17 @@ def __init__(self, species: str | Species = "human", release: str = "latest"):
def connection_url(self) -> str:
"""
Ensembl public MySQL Servers information at
https://uswest.ensembl.org/info/data/mysql.html
<https://uswest.ensembl.org/info/data/mysql.html>.
NOTE: Common Table Expression (CTEs) are not supported in MySQL < 8.0 or MariaDB < 10.2.
Window functions are also not available.
"""
import sqlalchemy

url = sqlalchemy.engine.url.URL.create(
drivername="mysql+mysqlconnector",
username="anonymous",
host="ensembldb.ensembl.org", # From Ensembl 48 onwards only
# host="useastdb.ensembl.org", # Current and previous Ensembl version only
host="ensembldb.ensembl.org", # From Ensembl 48 onwards only, MySQL 5.6.33
# host="useastdb.ensembl.org", # Current and previous Ensembl version only, MariaDB 10.0.30
port=3306,
database=self.database,
)
Expand Down Expand Up @@ -96,7 +98,9 @@ def gene_df(self) -> pd.DataFrame:
gene_df = gene_df.join(desc_df)
# add ensembl_representative_gene_id column
gene_repr_df = gene_df.merge(
self.alt_allele_df[["ensembl_gene_id", "ensembl_representative_gene_id"]],
self.representative_gene_df[
["ensembl_gene_id", "ensembl_representative_gene_id"]
],
how="left",
)
gene_repr_df.ensembl_representative_gene_id = (
Expand Down Expand Up @@ -130,36 +134,26 @@ def _check_gene_df(self, gene_df: pd.DataFrame) -> None:
assert bad_chromosome_df.empty

@cached_property
def alt_allele_df(self) -> pd.DataFrame:
alt_allele_df = self.run_query("gene_alt_alleles")
expected_cols = [
*alt_allele_df.columns,
"ensembl_representative_gene_id",
"is_representative_gene",
"representative_gene_method",
]
if not alt_allele_df.empty:
alt_allele_df = alt_allele_df.groupby("alt_allele_group_id").apply(
self._alt_allele_add_representative
)
else:
# Force expected output columns, since groupby-apply does not add columns
# from _alt_allele_add_representative when alt_allele_df is empty.
alt_allele_df = alt_allele_df.reindex(columns=expected_cols)
# ensembl_gene_id can be duplicated due to multiple alt_allele_attrib values
alt_allele_df = alt_allele_df.drop_duplicates("ensembl_gene_id", keep="first")
self._check_alt_allele_df(alt_allele_df)
return alt_allele_df
def representative_gene_df(self) -> pd.DataFrame:
df = (
self.run_query("gene_alt_alleles")
.groupby("rs_allele_group", group_keys=False)
.apply(self._alt_allele_add_representative)
# ensembl_gene_id can be duplicated due to multiple alt_allele_attrib values
.drop_duplicates("ensembl_gene_id", keep="first")
)
self._check_representative_gene_df(df)
return df

@staticmethod
def _check_alt_allele_df(alt_allele_df: pd.DataFrame) -> None:
def _check_representative_gene_df(alt_allele_df: pd.DataFrame) -> None:
dup_df = alt_allele_df[alt_allele_df.ensembl_gene_id.duplicated(keep=False)]
assert dup_df.empty

@staticmethod
def _alt_allele_get_representative(df: pd.DataFrame) -> "tuple[str, str]":
def _alt_allele_get_representative(df: pd.DataFrame) -> str:
"""
For a subset of the alt_allele_df corresponding to a single `alt_allele_group_id`,
For a subset of the alt_allele_df corresponding to a single `rs_allele_group`,
return a (ensembl_representative_gene_id, representative_gene_method) tuple,
where `ensembl_representative_gene_id` is the selected representative gene from this group
and `representative_gene_method` is the method by which it was selected.
Expand All @@ -169,43 +163,28 @@ def _alt_allele_get_representative(df: pd.DataFrame) -> "tuple[str, str]":
2. primary_assembly: a single gene is on the primary assembly.
3. first_added: the gene first added to the Ensembl database.
"""
representatives = list(
df.loc[df.alt_allele_is_representative.astype("bool"), "ensembl_gene_id"]
)
if len(representatives) == 1:
return representatives[0], "alt_allele_is_representative"
if len(representatives) > 1:
raise ValueError(
"expected at most 1 IS_REPRESENTATIVE gene per alt_allele_group"
)
representatives = list(
df.loc[df.primary_assembly.astype("bool"), "ensembl_gene_id"]
)
if len(representatives) == 1:
return representatives[0], "primary_assembly"
if len(representatives) > 1:
raise ValueError(
"expected at most 1 primary assembly gene per alt_allele_group"
)
return (
df.sort_values(
["ensembl_created_date", "ensembl_gene_id"]
).ensembl_gene_id.iloc[0],
"first_added",
)

@staticmethod
def _alt_allele_add_representative(df: pd.DataFrame) -> pd.DataFrame:
representative_gene = df.sort_values(
[
"alt_allele_is_representative",
"primary_assembly",
"ensembl_created_date",
"ensembl_gene_id",
],
ascending=[False, False, True, True],
).ensembl_gene_id.iloc[0]
assert isinstance(representative_gene, str)
return representative_gene

@classmethod
def _alt_allele_add_representative(cls, df: pd.DataFrame) -> pd.DataFrame:
"""
Apply to alt_allele_df grouped by `alt_allele_group_id` to add columns:
Apply to alt_allele_df grouped by `rs_allele_group` to add columns:
`ensembl_representative_gene_id`, `is_representative_gene`, `representative_gene_method`.
"""
representative, method = Ensembl_Gene_Queries._alt_allele_get_representative(df)
df["ensembl_representative_gene_id"] = representative
df["ensembl_representative_gene_id"] = cls._alt_allele_get_representative(df)
df["is_representative_gene"] = (
df.ensembl_gene_id == df.ensembl_representative_gene_id
)
df["representative_gene_method"] = method
# the ordering of alt_allele_df should always place the representative gene first.
# at some point we might be able to move all logic to SQL
assert df["is_representative_gene"].iloc[0]
Expand Down Expand Up @@ -481,7 +460,7 @@ class Ensembl_Gene_Catalog_Writer(Ensembl_Gene_Queries):
),
DatasetExport(
name="alt_alleles",
query_fxn="alt_allele_df",
query_fxn="representative_gene_df",
description=(
"This is an intermediate table that groups genes if they are alternate alleles of each other. "
"A representative gene is selected from each group."
Expand Down
15 changes: 3 additions & 12 deletions ensembl_genes/notebooks/ensembl_genes_eda.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@
"metadata": {},
"outputs": [],
"source": [
"ensg.alt_allele_df.head()"
"ensg.representative_gene_df.head()"
]
},
{
Expand All @@ -221,7 +221,7 @@
"metadata": {},
"outputs": [],
"source": [
"ensg.alt_allele_df.alt_allele_attrib.value_counts()"
"ensg.representative_gene_df.alt_allele_attrib.value_counts()"
]
},
{
Expand All @@ -230,16 +230,7 @@
"metadata": {},
"outputs": [],
"source": [
"ensg.alt_allele_df.query(\"is_representative_gene\").representative_gene_method.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ensg.gene_df.query(\"ensembl_gene_id != ensembl_representative_gene_id\").head(2)"
"ensg.representative_gene_df.query(\"ensembl_gene_id != ensembl_representative_gene_id\").head(2)"
]
},
{
Expand Down
47 changes: 32 additions & 15 deletions ensembl_genes/queries/gene_alt_alleles.sql
Original file line number Diff line number Diff line change
@@ -1,24 +1,41 @@
-- Get alt allele groups: genes with multiple alleles
-- Get alt allele groups: genes with multiple alleles.
-- Includes all genes, even if they are in a group of only themselves.
-- Use gene symbols to group genes, because the alt_allele table alone is incomplete.
-- https://github.com/related-sciences/ensembl-genes/issues/9.
SELECT
COALESCE(
xref.display_label,
CAST(alt_allele_group_id AS CHAR),
gene.stable_id
) AS rs_allele_group,
gene.stable_id AS ensembl_gene_id,
alt_allele.alt_allele_group_id,
alt_allele_attrib.attrib = "IS_REPRESENTATIVE" AS alt_allele_is_representative,
xref.display_label AS gene_symbol,
gene.created_date AS ensembl_created_date,
seq_region.name AS seq_region,
-- we used to use assembly_exception to determine primary assembly, but this table is now empty
-- https://github.com/related-sciences/ensembl-genes/issues/22#issuecomment-1664197773
-- instead just look for a short seq_region name (e.g. '19' instead of 'HSCHR19LRC_PGF1_CTG3_1'),
-- even though this is a flawed method that would miss scaffolds that are primary assemblies.
LENGTH(seq_region.name) <= 3 AS primary_assembly,
seq_region.name AS seq_region,
alt_allele.alt_allele_group_id AS alt_allele_group_id,
-- WARNING: alt_allele_attrib can introduce multiple rows per gene!
alt_allele_attrib.attrib AS alt_allele_attrib,
gene.created_date AS ensembl_created_date
FROM alt_allele
INNER JOIN gene
ON gene.gene_id = alt_allele.gene_id
INNER JOIN alt_allele_attrib
ON alt_allele.alt_allele_id = alt_allele_attrib.alt_allele_id
INNER JOIN seq_region
ON gene.seq_region_id = seq_region.seq_region_id
-- all genes were current when query was written, ensure this is always the case
IFNULL(alt_allele_attrib.attrib = "IS_REPRESENTATIVE", FALSE) AS alt_allele_is_representative
FROM gene
LEFT JOIN seq_region USING (seq_region_id)
LEFT JOIN xref ON xref.xref_id = gene.display_xref_id
LEFT JOIN alt_allele USING (gene_id)
LEFT JOIN alt_allele_attrib USING (alt_allele_id)
WHERE gene.is_current
ORDER BY alt_allele_group_id, alt_allele_is_representative DESC, primary_assembly DESC, ensembl_created_date, ensembl_gene_id
-- LIMIT 5
ORDER BY rs_allele_group, alt_allele_is_representative DESC, primary_assembly DESC, ensembl_created_date, ensembl_gene_id

-- it would be nice to identify the representative gene from a group in this query,
-- but window functions like the following are not supported:
-- ROW_NUMBER() OVER (
-- PARTITION BY rs_allele_group
-- ORDER BY
-- alt_allele_is_representative DESC NULLS LAST,
-- primary_assembly DESC NULLS LAST,
-- ensembl_created_date ASC NULLS LAST,
-- ensembl_gene_id ASC NULLS LAST
-- ) AS representative_gene_rank

0 comments on commit a38d1d9

Please sign in to comment.