Skip to content

Commit

Permalink
[builder] update uberon and tissue_general mapping to schema 4 standa…
Browse files Browse the repository at this point in the history
…rd (#1018)

* update uberon and tissue_general mapping to schema 4 standard

* incorporate fixes to resolve 841

* lint

* update incorrect comment
  • Loading branch information
Bruce Martin authored Feb 22, 2024
1 parent aaccb5b commit f0593c4
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 42 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -730,7 +730,7 @@ def get_summary_stats(experiment_builders: Sequence[ExperimentBuilder]) -> Summa
def add_tissue_mapping(obs_df: pd.DataFrame, dataset_id: str) -> None:
"""Inplace addition of tissue_general-related column."""
# UBERON tissue term mapper
from .tissue_mapper import TissueMapper # type: ignore
from .tissue_mapper import TissueMapper

tissue_mapper: TissueMapper = TissueMapper()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

# NOTE: The UBERON ontology URL needs to be manually updated if the CXG Dataset Schema is updated. This is a temporary
# hassle, however, since the TissueMapper, which relies upon this ontology, will eventually be removed from the Builder
CXG_UBERON_ONTOLOGY_URL = "https://github.com/obophenotype/uberon/releases/download/v2023-06-28/uberon.owl"
CXG_UBERON_ONTOLOGY_URL = "https://github.com/obophenotype/uberon/releases/download/v2023-09-05/uberon.owl"

# Columns expected in the census_datasets dataframe
CENSUS_DATASETS_TABLE_SPEC = TableSpec.create(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,18 +1,4 @@
# type: ignore
# isort:skip_file
# flake8: noqa
"""
NOTE: This is a (literal) copy of
https://github.com/chanzuckerberg/single-cell-data-portal/blob/9b94ccb0a2e0a8f6182b213aa4852c491f6f6aff/backend/wmg/data/tissue_mapper.py
Please do not modify this file directly here. Instead, modify the original file in single-cell-data-portal, run the unit tests (which exist in that repo),
get the PR approved and merged, and then port back the changes to this file.
In the future, this code will be part of an ontology service library.
This code contains several places that do not pass the lint/static analysis CI for this pipeline, so the analysis is disabled in this prologue.
"""
from typing import List
"""NOTE: In the future, this code will be part of an ontology service library."""

import owlready2

Expand Down Expand Up @@ -106,8 +92,8 @@ class TissueMapper:
"UBERON_0001868", # skin of chest
"UBERON_0001511", # skin of leg
"UBERON_0002190", # subcutaneous adipose tissue
"UBERON_0035328", # upper outer quadrant of breast
"UBERON_0000014", # zone of skin
"UBERON_0000916", # abdomen
]

# Terms to ignore when mapping
Expand All @@ -122,22 +108,21 @@ class TissueMapper:
"UBERON_0001062", # anatomical entity
]

def __init__(self):
self._cached_tissues = {}
self._cached_labels = {}
def __init__(self) -> None:
self._cached_tissues: dict[str, str] = {}
self._cached_labels: dict[str, str] = {}
self._uberon = owlready2.get_ontology(CXG_UBERON_ONTOLOGY_URL).load()

def get_high_level_tissue(self, tissue_ontology_term_id: str) -> str:
"""
Returns the associated high-level tissue ontology term ID from any other ID
"""Returns the associated high-level tissue ontology term ID from any other ID.
Edge cases:
- If multiple high-level tissues exist for a given tissue, returns the one with higher priority (the first
appearance in list self.HIGH_LEVEL_TISSUES).
- If no high-level tissue is found, returns the same as input.
- If the input tissue is not found in the ontology, return the same as input.
- This could happen with something like "UBERON:0002048 (cell culture)"
"""

tissue_ontology_term_id = self.reformat_ontology_term_id(tissue_ontology_term_id, to_writable=False)

if tissue_ontology_term_id in self._cached_tissues:
Expand All @@ -155,7 +140,7 @@ def get_high_level_tissue(self, tissue_ontology_term_id: str) -> str:
# List ancestors for this entity, including itself. Ignore any ancestors that
# are not descendents of UBERON_0000061 (anatomical structure).
ancestors = [entity.name]
branch_ancestors = []
branch_ancestors: list[str] = []
for is_a in entity.is_a:
branch_ancestors = self._list_ancestors(is_a, branch_ancestors)

Expand All @@ -175,33 +160,31 @@ def get_high_level_tissue(self, tissue_ontology_term_id: str) -> str:
self._cached_tissues[tissue_ontology_term_id] = result
return result

def get_label_from_writable_id(self, ontology_term_id: str):
"""
Returns the label from and ontology term id that is in writable form
def get_label_from_writable_id(self, ontology_term_id: str) -> str:
"""Returns the label from an ontology term id that is in writable form.
Example: "UBERON:0002048" returns "lung"
Example: "UBERON_0002048" raises ValueError because the ID is not in writable form
"""

if ontology_term_id in self._cached_labels:
return self._cached_labels[ontology_term_id]

entity = self._get_entity_from_id(self.reformat_ontology_term_id(ontology_term_id, to_writable=False))
if entity:
result = entity.label[0]
result: str = entity.label[0]
else:
result = ontology_term_id

self._cached_labels[ontology_term_id] = result
return result

@staticmethod
def reformat_ontology_term_id(ontology_term_id: str, to_writable: bool = True):
"""
Converts ontology term id string between two formats:
- `to_writable == True`: from "UBERON_0002048" to "UBERON:0002048"
- `to_writable == False`: from "UBERON:0002048" to "UBERON_0002048"
"""
def reformat_ontology_term_id(ontology_term_id: str, to_writable: bool = True) -> str:
"""Converts ontology term id string between two formats.
- `to_writable == True`: from "UBERON_0002048" to "UBERON:0002048"
- `to_writable == False`: from "UBERON:0002048" to "UBERON_0002048"
"""
if to_writable:
if ontology_term_id.count("_") != 1:
raise ValueError(f"{ontology_term_id} is an invalid ontology term id, it must contain exactly one '_'")
Expand All @@ -211,11 +194,12 @@ def reformat_ontology_term_id(ontology_term_id: str, to_writable: bool = True):
raise ValueError(f"{ontology_term_id} is an invalid ontology term id, it must contain exactly one ':'")
return ontology_term_id.replace(":", "_")

def _list_ancestors(self, entity: owlready2.entity.ThingClass, ancestors: List[str] = []) -> List[str]:
"""
Recursive function that given an entity of an ontology, it traverses the ontology and returns
def _list_ancestors(self, entity: owlready2.entity.ThingClass, ancestors: list[str] | None = None) -> list[str]:
"""Recursive function that given an entity of an ontology, it traverses the ontology and returns
a list of all ancestors associated with the entity.
"""
if ancestors is None:
ancestors = []

if self._is_restriction(entity):
# Entity is a restriction, check for part_of relationship
Expand All @@ -242,10 +226,10 @@ def _list_ancestors(self, entity: owlready2.entity.ThingClass, ancestors: List[s
self._list_ancestors(super_entity, ancestors)
return ancestors

raise ValueError("Unexpected condition in ontology.")

def _get_entity_from_id(self, ontology_term_id: str) -> owlready2.entity.ThingClass:
"""
Given a readable ontology term id (e.g. "UBERON_0002048"), it returns the associated ontology entity
"""
"""Given a readable ontology term id (e.g. "UBERON_0002048"), it returns the associated ontology entity."""
return self._uberon.search_one(iri=f"http://purl.obolibrary.org/obo/{ontology_term_id}")

@staticmethod
Expand Down
41 changes: 41 additions & 0 deletions tools/cellxgene_census_builder/tests/test_tissue_mapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import unittest

from cellxgene_census_builder.build_soma.tissue_mapper import TissueMapper


class TissueMapperTest(unittest.TestCase):
    """Tests for TissueMapper: high-level tissue lookup, ID reformatting and label lookup."""

    def setUp(self) -> None:
        # A new TissueMapper instance is constructed before every test method.
        self.tissue_mapper = TissueMapper()

    def test__high_level_tissue_retrieval_exists(self) -> None:
        """A low-level tissue resolves to its high-level tissue."""
        # UBERON:0008951 is "left lung lobe"; UBERON:0002048 is "lung".
        self.assertEqual(
            self.tissue_mapper.get_high_level_tissue("UBERON:0008951"),
            "UBERON:0002048",
        )

    def test__high_level_tissue_retrieval_does_not_exist(self) -> None:
        """An ID that cannot be resolved is handed back unchanged."""
        unknown_id = "UBERON:noId"
        self.assertEqual(self.tissue_mapper.get_high_level_tissue(unknown_id), unknown_id)

    def test__high_level_tissue_retrieval_suffix(self) -> None:
        """An ID carrying a suffix such as " (organoid)" is handed back unchanged."""
        # The base term is "left lung lobe", but the suffixed ID is expected back as-is, not mapped to "lung".
        suffixed_id = "UBERON:0008951 (organoid)"
        self.assertEqual(self.tissue_mapper.get_high_level_tissue(suffixed_id), suffixed_id)

    def test__making_ontology_id_writable(self) -> None:
        """Underscore form ("UBERON_...") converts to colon form ("UBERON:...")."""
        self.assertEqual(
            self.tissue_mapper.reformat_ontology_term_id("UBERON_0008951", to_writable=True),
            "UBERON:0008951",
        )

    def test__making_ontology_id_readable(self) -> None:
        """Colon form ("UBERON:...") converts to underscore form ("UBERON_...")."""
        self.assertEqual(
            self.tissue_mapper.reformat_ontology_term_id("UBERON:0008951", to_writable=False),
            "UBERON_0008951",
        )

    def test__get_label_from_id(self) -> None:
        """A writable ID resolves to its human-readable label."""
        self.assertEqual(
            self.tissue_mapper.get_label_from_writable_id("UBERON:0008951"),
            "left lung lobe",
        )

0 comments on commit f0593c4

Please sign in to comment.