[builder] update uberon and tissue_general mapping to schema 4 standa…

…rd (#1018) * update uberon and tissue_general mapping to schema 4 standard * incorporate fixes to resolve 841 * lint * update incorrect comment
chanzuckerberg · Feb 22, 2024 · f0593c4 · f0593c4
1 parent aaccb5b
commit f0593c4
Show file tree

Hide file tree

Showing 4 changed files with 67 additions and 42 deletions.
diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/experiment_builder.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/experiment_builder.py
@@ -730,7 +730,7 @@ def get_summary_stats(experiment_builders: Sequence[ExperimentBuilder]) -> Summa
 def add_tissue_mapping(obs_df: pd.DataFrame, dataset_id: str) -> None:
     """Inplace addition of tissue_general-related column."""
     # UBERON tissue term mapper
-    from .tissue_mapper import TissueMapper  # type: ignore
+    from .tissue_mapper import TissueMapper
 
     tissue_mapper: TissueMapper = TissueMapper()
 

diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/globals.py
@@ -18,7 +18,7 @@
 
 # NOTE: The UBERON ontology URL needs to be manually updated if the CXG Dataset Schema is updated. This is a temporary
 # hassle, however, since the TissueMapper, which relies upon this ontology, will eventually be removed from the Builder
-CXG_UBERON_ONTOLOGY_URL = "https://github.com/obophenotype/uberon/releases/download/v2023-06-28/uberon.owl"
+CXG_UBERON_ONTOLOGY_URL = "https://github.com/obophenotype/uberon/releases/download/v2023-09-05/uberon.owl"
 
 # Columns expected in the census_datasets dataframe
 CENSUS_DATASETS_TABLE_SPEC = TableSpec.create(

diff --git a/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/tissue_mapper.py b/tools/cellxgene_census_builder/src/cellxgene_census_builder/build_soma/tissue_mapper.py
@@ -1,18 +1,4 @@
-# type: ignore
-# isort:skip_file
-# flake8: noqa
-"""
-NOTE: This is a (literal) copy of
-https://github.com/chanzuckerberg/single-cell-data-portal/blob/9b94ccb0a2e0a8f6182b213aa4852c491f6f6aff/backend/wmg/data/tissue_mapper.py
-
-Please do not modify this file directly here. Instead, modify the original file in single-cell-data-portal, run the unit tests (which exist in that repo),
-get the PR approved and merged, and then port back the changes to this file.
-
-In the future, this code will be part of an ontology service library.
-
-This code contains several places that do not pass the lint/static analysis CI for this pipeline, so the analysis is disabled in this prologue.
-"""
-from typing import List
+"""NOTE: In the future, this code will be part of an ontology service library."""
 
 import owlready2
 
@@ -106,8 +92,8 @@ class TissueMapper:
         "UBERON_0001868",  # skin of chest
         "UBERON_0001511",  # skin of leg
         "UBERON_0002190",  # subcutaneous adipose tissue
-        "UBERON_0035328",  # upper outer quadrant of breast
         "UBERON_0000014",  # zone of skin
+        "UBERON_0000916",  # abdomen
     ]
 
     # Terms to ignore when mapping
@@ -122,22 +108,21 @@ class TissueMapper:
         "UBERON_0001062",  # anatomical entity
     ]
 
-    def __init__(self):
-        self._cached_tissues = {}
-        self._cached_labels = {}
+    def __init__(self) -> None:
+        self._cached_tissues: dict[str, str] = {}
+        self._cached_labels: dict[str, str] = {}
         self._uberon = owlready2.get_ontology(CXG_UBERON_ONTOLOGY_URL).load()
 
     def get_high_level_tissue(self, tissue_ontology_term_id: str) -> str:
-        """
-        Returns the associated high-level tissue ontology term ID from any other ID
+        """Returns the associated high-level tissue ontology term ID from any other ID.
+
         Edge cases:
             - If multiple high-level tissues exists for a given tissue, returns the one with higher priority (the first
             appearance in list self.HIGH_LEVEL_TISSUES).
             - If no high-level tissue is found, returns the same as input.
             - If the input tissue is not found in the ontology, return the same as input.
                 - This could happen with something like "UBERON:0002048 (cell culture)"
         """
-
         tissue_ontology_term_id = self.reformat_ontology_term_id(tissue_ontology_term_id, to_writable=False)
 
         if tissue_ontology_term_id in self._cached_tissues:
@@ -155,7 +140,7 @@ def get_high_level_tissue(self, tissue_ontology_term_id: str) -> str:
         # List ancestors for this entity, including itself. Ignore any ancestors that
         # are not descendents of UBERON_0000061 (anatomical structure).
         ancestors = [entity.name]
-        branch_ancestors = []
+        branch_ancestors: list[str] = []
         for is_a in entity.is_a:
             branch_ancestors = self._list_ancestors(is_a, branch_ancestors)
 
@@ -175,33 +160,31 @@ def get_high_level_tissue(self, tissue_ontology_term_id: str) -> str:
         self._cached_tissues[tissue_ontology_term_id] = result
         return result
 
-    def get_label_from_writable_id(self, ontology_term_id: str):
-        """
-        Returns the label from and ontology term id that is in writable form
+    def get_label_from_writable_id(self, ontology_term_id: str) -> str:
+        """Returns the label from an ontology term id that is in writable form.
+
         Example: "UBERON:0002048" returns "lung"
         Example: "UBERON_0002048" raises ValueError because the ID is not in writable form
         """
-
         if ontology_term_id in self._cached_labels:
             return self._cached_labels[ontology_term_id]
 
         entity = self._get_entity_from_id(self.reformat_ontology_term_id(ontology_term_id, to_writable=False))
         if entity:
-            result = entity.label[0]
+            result: str = entity.label[0]
         else:
             result = ontology_term_id
 
         self._cached_labels[ontology_term_id] = result
         return result
 
     @staticmethod
-    def reformat_ontology_term_id(ontology_term_id: str, to_writable: bool = True):
-        """
-        Converts ontology term id string between two formats:
-            - `to_writable == True`: from "UBERON_0002048" to "UBERON:0002048"
-            - `to_writable == False`: from "UBERON:0002048" to "UBERON_0002048"
-        """
+    def reformat_ontology_term_id(ontology_term_id: str, to_writable: bool = True) -> str:
+        """Converts ontology term id string between two formats.
 
+        - `to_writable == True`: from "UBERON_0002048" to "UBERON:0002048"
+        - `to_writable == False`: from "UBERON:0002048" to "UBERON_0002048"
+        """
         if to_writable:
             if ontology_term_id.count("_") != 1:
                 raise ValueError(f"{ontology_term_id} is an invalid ontology term id, it must contain exactly one '_'")
@@ -211,11 +194,12 @@ def reformat_ontology_term_id(ontology_term_id: str, to_writable: bool = True):
                 raise ValueError(f"{ontology_term_id} is an invalid ontology term id, it must contain exactly one ':'")
             return ontology_term_id.replace(":", "_")
 
-    def _list_ancestors(self, entity: owlready2.entity.ThingClass, ancestors: List[str] = []) -> List[str]:
-        """
-        Recursive function that given an entity of an ontology, it traverses the ontology and returns
+    def _list_ancestors(self, entity: owlready2.entity.ThingClass, ancestors: list[str] | None = None) -> list[str]:
+        """Recursive function that, given an entity of an ontology, traverses the ontology and returns
         a list of all ancestors associated with the entity.
         """
+        if ancestors is None:
+            ancestors = []
 
         if self._is_restriction(entity):
             # Entity is a restriction, check for part_of relationship
@@ -242,10 +226,10 @@ def _list_ancestors(self, entity: owlready2.entity.ThingClass, ancestors: List[s
                 self._list_ancestors(super_entity, ancestors)
             return ancestors
 
+        raise ValueError("Unexpected condition in ontology.")
+
     def _get_entity_from_id(self, ontology_term_id: str) -> owlready2.entity.ThingClass:
-        """
-        Given a readable ontology term id (e.g. "UBERON_0002048"), it returns the associated ontology entity
-        """
+        """Given a readable ontology term id (e.g. "UBERON_0002048"), it returns the associated ontology entity."""
         return self._uberon.search_one(iri=f"http://purl.obolibrary.org/obo/{ontology_term_id}")
 
     @staticmethod

diff --git a/tools/cellxgene_census_builder/tests/test_tissue_mapper.py b/tools/cellxgene_census_builder/tests/test_tissue_mapper.py
@@ -0,0 +1,41 @@
+import unittest
+
+from cellxgene_census_builder.build_soma.tissue_mapper import TissueMapper
+
+
+class TissueMapperTest(unittest.TestCase):
+    def setUp(self) -> None:
+        self.tissue_mapper = TissueMapper()
+
+    def test__high_level_tissue_retrieval_exists(self) -> None:
+        low_level_tissue = "UBERON:0008951"  # left lung lobe
+        expected_high_level_tissue = "UBERON:0002048"  # lung
+        self.assertEqual(self.tissue_mapper.get_high_level_tissue(low_level_tissue), expected_high_level_tissue)
+
+    def test__high_level_tissue_retrieval_does_not_exist(self) -> None:
+        low_level_tissue = "UBERON:noId"
+        expected_high_level_tissue = "UBERON:noId"
+        self.assertEqual(self.tissue_mapper.get_high_level_tissue(low_level_tissue), expected_high_level_tissue)
+
+    def test__high_level_tissue_retrieval_suffix(self) -> None:
+        low_level_tissue = "UBERON:0008951 (organoid)"  # left lung lobe, with organoid suffix
+        expected_high_level_tissue = "UBERON:0008951 (organoid)"  # returned unchanged (same as input)
+        self.assertEqual(self.tissue_mapper.get_high_level_tissue(low_level_tissue), expected_high_level_tissue)
+
+    def test__making_ontology_id_writable(self) -> None:
+        tissue = "UBERON_0008951"
+        expected_tissue = "UBERON:0008951"
+
+        self.assertEqual(self.tissue_mapper.reformat_ontology_term_id(tissue, to_writable=True), expected_tissue)
+
+    def test__making_ontology_id_readable(self) -> None:
+        tissue = "UBERON:0008951"
+        expected_tissue = "UBERON_0008951"
+
+        self.assertEqual(self.tissue_mapper.reformat_ontology_term_id(tissue, to_writable=False), expected_tissue)
+
+    def test__get_label_from_id(self) -> None:
+        tissue = "UBERON:0008951"
+        expected_label = "left lung lobe"
+
+        self.assertEqual(self.tissue_mapper.get_label_from_writable_id(tissue), expected_label)