Merge pull request #1 from chembl/refactoring

Refactoring
chembl · Feb 21, 2024 · d757444 · d757444
2 parents 6982ee6 + 3fd8e87
commit d757444
Show file tree

Hide file tree

Showing 16 changed files with 1,380 additions and 1,460 deletions.
diff --git a/src/add_chembl_compound_properties.py b/src/add_chembl_compound_properties.py
@@ -1,11 +1,18 @@
+"""
+Add ChEMBL compound properties to the dataset.
+"""
+
 import sqlite3
 
 import pandas as pd
 
+from dataset import Dataset
+import sanity_checks
+
 
 ########### Add Compound Properties Based on ChEMBL Data ###########
-def add_first_publication_date(
-    df_combined: pd.DataFrame, chembl_con: sqlite3.Connection, limit_to_literature: bool
+def get_first_publication_cpd_date(
+    chembl_con: sqlite3.Connection, limit_to_literature: bool
 ) -> pd.DataFrame:
     """
     Query and calculate the first publication of a compound
@@ -14,13 +21,11 @@ def add_first_publication_date(
     of the compound in the literature according to ChEMBL.
     Otherwise this is the first appearance in any source in ChEMBL.
 
-    :param df_combined: Pandas DataFrame with compound-target pairs
-    :type df_combined: pd.DataFrame
     :param chembl_con: Sqlite3 connection to ChEMBL database.
     :type chembl_con: sqlite3.Connection
     :param limit_to_literature: Base first_publication_cpd on literature sources only if True.
     :type limit_to_literature: bool
-    :return: Pandas DataFrame with added first_publication_cpd.
+    :return: Pandas DataFrame with parent_molregno and first_publication_cpd from ChEMBL.
     :rtype: pd.DataFrame
     """
     # information about salts is aggregated in the parent
@@ -42,26 +47,21 @@ def add_first_publication_date(
     ].transform("min")
     df_docs = df_docs[["parent_molregno", "first_publication_cpd"]].drop_duplicates()
 
-    df_combined = df_combined.merge(df_docs, on="parent_molregno", how="left")
+    return df_docs
 
-    return df_combined
 
-
-def add_chembl_properties_and_structures(
-    df_combined: pd.DataFrame, chembl_con: sqlite3.Connection
-) -> tuple[pd.DataFrame, pd.DataFrame]:
+def get_chembl_properties_and_structures(
+    chembl_con: sqlite3.Connection,
+) -> pd.DataFrame:
     """
-    Add compound properties from the compound_properties table 
+    Get compound properties from the compound_properties table
     (e.g., alogp, #hydrogen bond acceptors / donors, etc.).
-    Add InChI, InChI key and canonical smiles. 
+    Get InChI, InChI key and canonical smiles.
 
-    :param df_combined: Pandas DataFrame with compound-target pairs
-    :type df_combined: pd.DataFrame
     :param chembl_con: Sqlite3 connection to ChEMBL database.
     :type chembl_con: sqlite3.Connection
-    :return: - Pandas DataFrame with added compound properties and structures. \\
-        - Pandas DataFrame with compound properties and structures for all compound ids in ChEMBL.
-    :rtype: (pd.DataFrame, pd.DataFrame)
+    :return: Pandas DataFrame with compound properties and structures for all compound ids in ChEMBL
+    :rtype: pd.DataFrame
     """
     sql = """
     SELECT DISTINCT mh.parent_molregno, 
@@ -79,14 +79,12 @@ def add_chembl_properties_and_structures(
 
     df_cpd_props = pd.read_sql_query(sql, con=chembl_con)
 
-    df_combined = df_combined.merge(df_cpd_props, on="parent_molregno", how="left")
-
-    return df_combined, df_cpd_props
+    return df_cpd_props
 
 
-def add_ligand_efficiency_metrics(df_combined: pd.DataFrame) -> pd.DataFrame:
+def calculate_ligand_efficiency_metrics(dataset: Dataset):
     """
-    Calculate the ligand efficiency metrics for the compounds
+    Calculate and add the ligand efficiency metrics for the compounds
     based on the mean pchembl values for a compound-target pair and
     the following ligand efficiency (LE) formulas:
 
@@ -108,33 +106,37 @@ def add_ligand_efficiency_metrics(df_combined: pd.DataFrame) -> pd.DataFrame:
     Once for the pchembl values based on binding + functional assays (BF)
     and once for the pchembl values based on binding assays only (B).
 
-    :param df_combined: Pandas DataFrame with compound-target pairs
-    :type df_combined: pd.DataFrame
-    :return: Pandas DataFrame with added ligand efficiency metrics
-    :rtype: pd.DataFrame
+    :param dataset: Dataset with compound-target pairs.
+        Will be updated to include ligand efficiency metrics.
+    :type dataset: Dataset
     """
     for suffix in ["BF", "B"]:
-        df_combined.loc[df_combined["heavy_atoms"] != 0, f"LE_{suffix}"] = (
-            df_combined[f"pchembl_value_mean_{suffix}"]
-            / df_combined["heavy_atoms"]
+        dataset.df_result.loc[dataset.df_result["heavy_atoms"] != 0, f"LE_{suffix}"] = (
+            dataset.df_result[f"pchembl_value_mean_{suffix}"]
+            / dataset.df_result["heavy_atoms"]
             * (2.303 * 298 * 0.00199)
         )
 
-        df_combined.loc[df_combined["mw_freebase"] != 0, f"BEI_{suffix}"] = (
-            df_combined[f"pchembl_value_mean_{suffix}"]
+        dataset.df_result.loc[
+            dataset.df_result["mw_freebase"] != 0, f"BEI_{suffix}"
+        ] = (
+            dataset.df_result[f"pchembl_value_mean_{suffix}"]
             * 1000
-            / df_combined["mw_freebase"]
+            / dataset.df_result["mw_freebase"]
         )
 
-        df_combined.loc[df_combined["psa"] != 0, f"SEI_{suffix}"] = (
-            df_combined[f"pchembl_value_mean_{suffix}"] * 100 / df_combined["psa"]
+        dataset.df_result.loc[dataset.df_result["psa"] != 0, f"SEI_{suffix}"] = (
+            dataset.df_result[f"pchembl_value_mean_{suffix}"]
+            * 100
+            / dataset.df_result["psa"]
         )
 
-        df_combined[f"LLE_{suffix}"] = (
-            df_combined[f"pchembl_value_mean_{suffix}"] - df_combined["alogp"]
+        dataset.df_result[f"LLE_{suffix}"] = (
+            dataset.df_result[f"pchembl_value_mean_{suffix}"]
+            - dataset.df_result["alogp"]
         )
 
-        df_combined = df_combined.astype(
+        dataset.df_result = dataset.df_result.astype(
             {
                 f"LE_{suffix}": "float64",
                 f"BEI_{suffix}": "float64",
@@ -143,26 +145,19 @@ def add_ligand_efficiency_metrics(df_combined: pd.DataFrame) -> pd.DataFrame:
             }
         )
 
-    return df_combined
-
 
-def add_atc_classification(
-    df_combined: pd.DataFrame, chembl_con: sqlite3.Connection
-) -> tuple[pd.DataFrame, pd.DataFrame]:
+def get_atc_classification(chembl_con: sqlite3.Connection) -> pd.DataFrame:
     """
-    Query and add ATC classifications (level 1) from the atc_classification and 
+    Query ATC classifications (level 1) from the atc_classification and
     molecule_atc_classification tables.
-    ATC level annotations for the same parent_molregno are combined into one description 
-    that concatenates all descriptions sorted alphabetically 
+    ATC level annotations for the same parent_molregno are combined into one description
+    that concatenates all descriptions sorted alphabetically
     into one string with ' | ' as a separator.
 
-    :param df_combined: Pandas DataFrame with compound-target pairs
-    :type df_combined: pd.DataFrame
     :param chembl_con: Sqlite3 connection to ChEMBL database.
     :type chembl_con: sqlite3.Connection
-    :return: - Pandas DataFrame with added ATC classifications \\
-        - Pandas DataFrame with ATC annotations in ChEMBL
-    :rtype: (pd.DataFrame, pd.DataFrame)
+    :return: Pandas DataFrame with ATC annotations in ChEMBL.
+    :rtype: pd.DataFrame
     """
     sql = """
     SELECT DISTINCT mh.parent_molregno, atc.level1, atc.level1_description
@@ -185,14 +180,12 @@ def add_atc_classification(
     ].transform(lambda x: between_str_join.join(sorted(x)))
     atc_levels = atc_levels[["parent_molregno", "atc_level1"]].drop_duplicates()
 
-    df_combined = df_combined.merge(atc_levels, on="parent_molregno", how="left")
-
-    return df_combined, atc_levels
+    return atc_levels
 
 
 def add_all_chembl_compound_properties(
-    df_combined: pd.DataFrame, chembl_con: sqlite3.Connection, limit_to_literature: bool
-) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+    dataset: Dataset, chembl_con: sqlite3.Connection, limit_to_literature: bool
+):
     """
     Add ChEMBL-based compound properties to the given compound-target pairs, specifically:
 
@@ -202,24 +195,33 @@ def add_all_chembl_compound_properties(
     - ligand efficiency metrics
     - ATC classifications
 
-    :param df_combined: Pandas DataFrame with compound-target pairs
-    :type df_combined: pd.DataFrame
+    :param dataset: Dataset with compound-target pairs.
+        Will be updated to include compound properties.
+    :type dataset: Dataset
     :param chembl_con: Sqlite3 connection to ChEMBL database.
     :type chembl_con: sqlite3.Connection
-    :param limit_to_literature: Base first_publication_cpd on literature sources only if True. 
+    :param limit_to_literature: Base first_publication_cpd on literature sources only if True.
         Base it on all available sources otherwise.
     :type limit_to_literature: bool
-    :return: - Pandas DataFrame with added compound properties \\
-        - Pandas DataFrame with compound properties and structures for all compound ids in ChEMBL \\
-        - Pandas DataFrame with ATC annotations in ChEMBL
-    :rtype: (pd.DataFrame, pd.DataFrame, pd.DataFrame)
     """
-    df_combined = add_first_publication_date(
-        df_combined, chembl_con, limit_to_literature
+    df_docs = get_first_publication_cpd_date(chembl_con, limit_to_literature)
+    dataset.df_result = dataset.df_result.merge(
+        df_docs, on="parent_molregno", how="left"
     )
-    df_combined, df_cpd_props = add_chembl_properties_and_structures(
-        df_combined, chembl_con
+
+    df_cpd_props = get_chembl_properties_and_structures(chembl_con)
+    dataset.df_cpd_props = df_cpd_props
+    dataset.df_result = dataset.df_result.merge(
+        df_cpd_props, on="parent_molregno", how="left"
+    )
+    sanity_checks.check_compound_props(dataset.df_result, df_cpd_props)
+
+    calculate_ligand_efficiency_metrics(dataset)
+    sanity_checks.check_ligand_efficiency_metrics(dataset.df_result)
+
+    atc_levels = get_atc_classification(chembl_con)
+    dataset.atc_levels = atc_levels
+    dataset.df_result = dataset.df_result.merge(
+        atc_levels, on="parent_molregno", how="left"
     )
-    df_combined = add_ligand_efficiency_metrics(df_combined)
-    df_combined, atc_levels = add_atc_classification(df_combined, chembl_con)
-    return df_combined, df_cpd_props, atc_levels
+    sanity_checks.check_atc(dataset.df_result, atc_levels)