Remove unnecessary variables from Dataset

chembl · Feb 20, 2024 · fe2a49a · fe2a49a
1 parent a4721c6
commit fe2a49a
Show file tree

Hide file tree

Showing 10 changed files with 285 additions and 226 deletions.
diff --git a/src/add_chembl_compound_properties.py b/src/add_chembl_compound_properties.py
@@ -3,26 +3,26 @@
 import pandas as pd
 
 from dataset import Dataset
+import sanity_checks
 
 
 ########### Add Compound Properties Based on ChEMBL Data ###########
-def add_first_publication_date(
-    dataset: Dataset, chembl_con: sqlite3.Connection, limit_to_literature: bool
-):
+def get_first_publication_cpd_date(
+    chembl_con: sqlite3.Connection, limit_to_literature: bool
+) -> pd.DataFrame:
     """
     Query and calculate the first publication of a compound
     based on ChEMBL data (column name: first_publication_cpd).
     If limit_to_literature is True, this corresponds to the first appearance
     of the compound in the literature according to ChEMBL.
     Otherwise this is the first appearance in any source in ChEMBL.
 
-    :param dataset: Dataset with compound-target pairs.
-        Will be updated to include first_publication_cpd
-    :type dataset: Dataset
     :param chembl_con: Sqlite3 connection to ChEMBL database.
     :type chembl_con: sqlite3.Connection
     :param limit_to_literature: Base first_publication_cpd on literature sources only if True.
     :type limit_to_literature: bool
+    :return: Pandas DataFrame with parent_molregno and first_publication_cpd from ChEMBL.
+    :rtype: pd.DataFrame
     """
     # information about salts is aggregated in the parent
     sql = """
@@ -43,26 +43,21 @@ def add_first_publication_date(
     ].transform("min")
     df_docs = df_docs[["parent_molregno", "first_publication_cpd"]].drop_duplicates()
 
-    dataset.df_result = dataset.df_result.merge(
-        df_docs, on="parent_molregno", how="left"
-    )
+    return df_docs
 
 
-def add_chembl_properties_and_structures(
-    dataset: Dataset, chembl_con: sqlite3.Connection
-):
+def get_chembl_properties_and_structures(
+    chembl_con: sqlite3.Connection,
+) -> pd.DataFrame:
     """
-    Add compound properties from the compound_properties table
+    Get compound properties from the compound_properties table
     (e.g., alogp, #hydrogen bond acceptors / donors, etc.).
-    Add InChI, InChI key and canonical smiles.
+    Get InChI, InChI key and canonical smiles.
 
-    :param dataset: Dataset with compound-target pairs.
-        Will be updated to include compound properties and structures.
-        dataset.df_cpd_props will be set to
-        compound properties and structures for all compound ids in ChEMBL.
-    :type dataset: Dataset
     :param chembl_con: Sqlite3 connection to ChEMBL database.
     :type chembl_con: sqlite3.Connection
+    :return: Pandas DataFrame with compound properties and structures for all compound ids in ChEMBL
+    :rtype: pd.DataFrame
     """
     sql = """
     SELECT DISTINCT mh.parent_molregno, 
@@ -79,16 +74,13 @@ def add_chembl_properties_and_structures(
     """
 
     df_cpd_props = pd.read_sql_query(sql, con=chembl_con)
-    dataset.df_cpd_props = df_cpd_props
 
-    dataset.df_result = dataset.df_result.merge(
-        df_cpd_props, on="parent_molregno", how="left"
-    )
+    return df_cpd_props
 
 
-def add_ligand_efficiency_metrics(dataset: Dataset):
+def calculate_ligand_efficiency_metrics(dataset: Dataset):
     """
-    Calculate the ligand efficiency metrics for the compounds
+    Calculate and add the ligand efficiency metrics for the compounds
     based on the mean pchembl values for a compound-target pair and
     the following ligand efficiency (LE) formulas:
 
@@ -150,20 +142,18 @@ def add_ligand_efficiency_metrics(dataset: Dataset):
         )
 
 
-def add_atc_classification(dataset: Dataset, chembl_con: sqlite3.Connection):
+def get_atc_classification(chembl_con: sqlite3.Connection) -> pd.DataFrame:
     """
-    Query and add ATC classifications (level 1) from the atc_classification and
+    Query ATC classifications (level 1) from the atc_classification and
     molecule_atc_classification tables.
     ATC level annotations for the same parent_molregno are combined into one description
     that concatenates all descriptions sorted alphabetically
     into one string with ' | ' as a separator.
 
-    :param dataset: Dataset with compound-target pairs.
-        Will be updated to include ATC classifications.
-        dataset.atc_levels will be set to ATC annotations in ChEMBL.
-    :type dataset: Dataset
     :param chembl_con: Sqlite3 connection to ChEMBL database.
     :type chembl_con: sqlite3.Connection
+    :return: Pandas DataFrame with ATC annotations in ChEMBL.
+    :rtype: pd.DataFrame
     """
     sql = """
     SELECT DISTINCT mh.parent_molregno, atc.level1, atc.level1_description
@@ -186,11 +176,7 @@ def add_atc_classification(dataset: Dataset, chembl_con: sqlite3.Connection):
     ].transform(lambda x: between_str_join.join(sorted(x)))
     atc_levels = atc_levels[["parent_molregno", "atc_level1"]].drop_duplicates()
 
-    dataset.atc_levels = atc_levels
-
-    dataset.df_result = dataset.df_result.merge(
-        atc_levels, on="parent_molregno", how="left"
-    )
+    return atc_levels
 
 
 def add_all_chembl_compound_properties(
@@ -214,10 +200,24 @@ def add_all_chembl_compound_properties(
         Base it on all available sources otherwise.
     :type limit_to_literature: bool
     """
-    add_first_publication_date(dataset, chembl_con, limit_to_literature)
+    df_docs = get_first_publication_cpd_date(chembl_con, limit_to_literature)
+    dataset.df_result = dataset.df_result.merge(
+        df_docs, on="parent_molregno", how="left"
+    )
 
-    add_chembl_properties_and_structures(dataset, chembl_con)
+    df_cpd_props = get_chembl_properties_and_structures(chembl_con)
+    dataset.df_cpd_props = df_cpd_props
+    dataset.df_result = dataset.df_result.merge(
+        df_cpd_props, on="parent_molregno", how="left"
+    )
+    sanity_checks.check_compound_props(dataset.df_result, df_cpd_props)
 
-    add_ligand_efficiency_metrics(dataset)
+    calculate_ligand_efficiency_metrics(dataset)
+    sanity_checks.check_ligand_efficiency_metrics(dataset.df_result)
 
-    add_atc_classification(dataset, chembl_con)
+    atc_levels = get_atc_classification(chembl_con)
+    dataset.atc_levels = atc_levels
+    dataset.df_result = dataset.df_result.merge(
+        atc_levels, on="parent_molregno", how="left"
+    )
+    sanity_checks.check_atc(dataset.df_result, atc_levels)
diff --git a/src/add_chembl_target_class_annotations.py b/src/add_chembl_target_class_annotations.py
@@ -7,6 +7,7 @@
 import write_subsets
 from arguments import OutputArgs, CalculationArgs
 from dataset import Dataset
+import sanity_checks
 
 
 ########### Add Target Class Annotations Based on ChEMBL Data ###########
@@ -80,44 +81,31 @@ def get_target_class_table(
     return df_target_classes
 
 
-def add_chembl_target_class_annotations(
+def get_aggregated_target_classes(
     dataset: Dataset,
     chembl_con: sqlite3.Connection,
-    args: CalculationArgs,
-    out: OutputArgs,
-):
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """
-    Add level 1 and 2 target class annotations.
-    Assignments for target IDs with more than one target class assignment per level
-    are summarised into one string with '|' as a separator
-    between the different target class annotations.
-
-    Targets with more than one level 1 / level 2 target class assignment are written to a file.
-    These could be reassigned by hand if a single target class is preferable.
+    Get mappings for target id to aggregated level 1 / level 2 target class.
 
     :param dataset: Dataset with compound-target pairs.
-        Will be updated to only include target class annotations.
-        dataset.target_classes_level1 will be set to
-            pandas DataFrame with mapping from target id to level 1 target class
-        dataset.target_classes_level2 will be set to
-            pandas DataFrame with mapping from target id to level 2 target class
     :type dataset: Dataset
     :param chembl_con: Sqlite3 connection to ChEMBL database.
     :type chembl_con: sqlite3.Connection
-    :param args: Arguments related to how to calculate the dataset
-    :type args: CalculationArgs
-    :param out: Arguments related to how to output the dataset
-    :type out: OutputArgs
+    :return: [pandas DataFrame with mapping from target id to level 1 target class,
+        pandas DataFrame with mapping from target id to level 2 target class]
+    :rtype: tuple[pd.DataFrame, pd.DataFrame]
     """
     current_tids = set(dataset.df_result["tid"])
     df_target_classes = get_target_class_table(chembl_con, current_tids)
 
+    between_str_join = "|"
+
     # Summarise the information for a target id with
     # several assigned target classes of level 1 into one description.
     # If a target id has more than one assigned target class,
     # the target class 'Unclassified protein' is discarded.
     level = "l1"
-    between_str_join = "|"
     target_classes_level1 = df_target_classes[["tid", level]].drop_duplicates().dropna()
 
     # remove 'Unclassified protein' from targets with more than one target class, level 1
@@ -145,10 +133,6 @@ def add_chembl_target_class_annotations(
         ["tid", "target_class_l1"]
     ].drop_duplicates()
 
-    dataset.df_result = dataset.df_result.merge(
-        target_classes_level1, on="tid", how="left"
-    )
-
     # Repeat the summary step for target classes of level 2.
     level = "l2"
     target_classes_level2 = df_target_classes[["tid", level]].drop_duplicates().dropna()
@@ -159,11 +143,24 @@ def add_chembl_target_class_annotations(
         ["tid", "target_class_l2"]
     ].drop_duplicates()
 
-    dataset.df_result = dataset.df_result.merge(
-        target_classes_level2, on="tid", how="left"
-    )
+    return target_classes_level1, target_classes_level2
+
 
-    # Output targets have more than one target class assignment
+def output_ambiguous_target_classes(
+    dataset: Dataset,
+    args: CalculationArgs,
+    out: OutputArgs,
+):
+    """
+    Output targets that have more than one target class assignment.
+
+    :param dataset: Dataset with compound-target pairs.
+    :type dataset: Dataset
+    :param args: Arguments related to how to calculate the dataset
+    :type args: CalculationArgs
+    :param out: Arguments related to how to output the dataset
+    :type out: OutputArgs
+    """
     more_than_one_level_1 = dataset.df_result[
         (dataset.df_result["target_class_l1"].notnull())
         & (dataset.df_result["target_class_l1"].str.contains("|", regex=False))
@@ -203,5 +200,50 @@ def add_chembl_target_class_annotations(
         out,
     )
 
-    dataset.target_classes_level1 = target_classes_level1
-    dataset.target_classes_level2 = target_classes_level2
+
+def add_chembl_target_class_annotations(
+    dataset: Dataset,
+    chembl_con: sqlite3.Connection,
+    args: CalculationArgs,
+    out: OutputArgs,
+):
+    """
+    Add level 1 and 2 target class annotations.
+    Assignments for target IDs with more than one target class assignment per level
+    are summarised into one string with '|' as a separator
+    between the different target class annotations.
+
+    Targets with more than one level 1 / level 2 target class assignment are written to a file.
+    These could be reassigned by hand if a single target class is preferable.
+
+    :param dataset: Dataset with compound-target pairs.
+        dataset.df_result will be updated to include target class annotations:
+        the level 1 mapping from target id to target class is
+            left-merged into dataset.df_result on tid,
+        and the level 2 mapping from target id to target class is
+            left-merged into dataset.df_result on tid.
+    :type dataset: Dataset
+    :param chembl_con: Sqlite3 connection to ChEMBL database.
+    :type chembl_con: sqlite3.Connection
+    :param args: Arguments related to how to calculate the dataset
+    :type args: CalculationArgs
+    :param out: Arguments related to how to output the dataset
+    :type out: OutputArgs
+    """
+    target_classes_level1, target_classes_level2 = get_aggregated_target_classes(
+        dataset, chembl_con
+    )
+
+    dataset.df_result = dataset.df_result.merge(
+        target_classes_level1, on="tid", how="left"
+    )
+
+    dataset.df_result = dataset.df_result.merge(
+        target_classes_level2, on="tid", how="left"
+    )
+
+    sanity_checks.check_target_classes(
+        dataset.df_result, target_classes_level1, target_classes_level2
+    )
+
+    output_ambiguous_target_classes(dataset, args, out)
diff --git a/src/add_dti_annotations.py b/src/add_dti_annotations.py
@@ -80,27 +80,31 @@ def add_dti_annotations(
         ),
         "DTI",
     ] = "D_DT"
+
     dataset.df_result.loc[
         (
             dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set)
             & (dataset.df_result["max_phase"] == 3)
         ),
         "DTI",
     ] = "C3_DT"
+
     dataset.df_result.loc[
         (
             dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set)
             & (dataset.df_result["max_phase"] == 2)
         ),
         "DTI",
     ] = "C2_DT"
+
     dataset.df_result.loc[
         (
             dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set)
             & (dataset.df_result["max_phase"] == 1)
         ),
         "DTI",
     ] = "C1_DT"
+
     # Compounds that are in the drug_mechanism table but don't have a known phase between 1-4:
     dataset.df_result.loc[
         (

diff --git a/src/add_rdkit_compound_descriptors.py b/src/add_rdkit_compound_descriptors.py
@@ -4,6 +4,7 @@
 from tqdm import tqdm
 
 from dataset import Dataset
+import sanity_checks
 
 
 def add_built_in_descriptors(dataset: Dataset):
@@ -168,3 +169,4 @@ def add_rdkit_compound_descriptors(dataset: Dataset):
     """
     add_built_in_descriptors(dataset)
     add_aromaticity_descriptors(dataset)
+    sanity_checks.check_rdkit_props(dataset.df_result)
diff --git a/src/clean_dataset.py b/src/clean_dataset.py
@@ -6,6 +6,7 @@
 from dataset import Dataset
 
 
+########### Remove Irrelevant Compounds ###########
 def remove_compounds_without_smiles_and_mixtures(
     dataset: Dataset, chembl_con: sqlite3.Connection
 ):
@@ -96,9 +97,8 @@ def remove_compounds_without_smiles_and_mixtures(
         )
     ]
 
-    return dataset.df_result
-
 
+########### General Cleaning Steps ###########
 def clean_none_values(dataset: Dataset):
     """
     Change nan values and empty strings to None for consistency.

diff --git a/src/dataset.py b/src/dataset.py
@@ -7,22 +7,15 @@
 class Dataset:
     """
     df_result:                  Pandas DataFrame with the full dataset
+    drug_mechanism_pairs_set:   Set of compound-target pairs in the drug_mechanism table,
+                                used for DTI assignments
+    drug_mechanism_targets_set: Set of targets in the drug_mechanism table,
+                                used for DTI assignments
     df_sizes_all:               List of intermediate sizes of the dataset used for debugging
     df_sizes_pchembl:           List of intermediate sizes of the dataset used for debugging
-    drug_mechanism_pairs_set:   Set of compound-target pairs in the drug_mechanism table
-    drug_mechanism_targets_set: Set of targets in the drug_mechanism table
-    df_cpd_props:               Pandas DataFrame with compound properties and
-                                structures for all compound ids in ChEMBL
-    atc_levels:                 Pandas DataFrame with ATC annotations in ChEMBL
-    target_classes_level1:      Pandas DataFrame with mapping from target id to level 1 target class
-    target_classes_level2:      Pandas DataFrame with mapping from target id to level 2 target class
     """
 
     df_result: pd.DataFrame
-    df_cpd_props: pd.DataFrame
-    atc_levels: pd.DataFrame
-    target_classes_level1: pd.DataFrame
-    target_classes_level2: pd.DataFrame
     drug_mechanism_pairs_set: set
     drug_mechanism_targets_set: set
     df_sizes_all: list[int]