diff --git a/src/add_chembl_compound_properties.py b/src/add_chembl_compound_properties.py index 836b06a..5f6afe8 100644 --- a/src/add_chembl_compound_properties.py +++ b/src/add_chembl_compound_properties.py @@ -1,11 +1,18 @@ +""" +Add ChEMBL compound properties to the dataset. +""" + import sqlite3 import pandas as pd +from dataset import Dataset +import sanity_checks + ########### Add Compound Properties Based on ChEMBL Data ########### -def add_first_publication_date( - df_combined: pd.DataFrame, chembl_con: sqlite3.Connection, limit_to_literature: bool +def get_first_publication_cpd_date( + chembl_con: sqlite3.Connection, limit_to_literature: bool ) -> pd.DataFrame: """ Query and calculate the first publication of a compound @@ -14,13 +21,11 @@ def add_first_publication_date( of the compound in the literature according to ChEMBL. Otherwise this is the first appearance in any source in ChEMBL. - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame :param chembl_con: Sqlite3 connection to ChEMBL database. :type chembl_con: sqlite3.Connection :param limit_to_literature: Base first_publication_cpd on literature sources only if True. :type limit_to_literature: bool - :return: Pandas DataFrame with added first_publication_cpd. + :return: Pandas DataFrame with parent_molregno and first_publication_cpd from ChEMBL. :rtype: pd.DataFrame """ # information about salts is aggregated in the parent @@ -42,26 +47,21 @@ def add_first_publication_date( ].transform("min") df_docs = df_docs[["parent_molregno", "first_publication_cpd"]].drop_duplicates() - df_combined = df_combined.merge(df_docs, on="parent_molregno", how="left") + return df_docs - return df_combined - -def add_chembl_properties_and_structures( - df_combined: pd.DataFrame, chembl_con: sqlite3.Connection -) -> tuple[pd.DataFrame, pd.DataFrame]: +def get_chembl_properties_and_structures( + chembl_con: sqlite3.Connection, +) -> pd.DataFrame: """ - Add compound properties from the compound_properties table + Get compound properties from the compound_properties table (e.g., alogp, #hydrogen bond acceptors / donors, etc.). - Add InChI, InChI key and canonical smiles. + Get InChI, InChI key and canonical smiles. - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame :param chembl_con: Sqlite3 connection to ChEMBL database. :type chembl_con: sqlite3.Connection - :return: - Pandas DataFrame with added compound properties and structures. \\ - - Pandas DataFrame with compound properties and structures for all compound ids in ChEMBL. - :rtype: (pd.DataFrame, pd.DataFrame) + :return: Pandas DataFrame with compound properties and structures for all compound ids in ChEMBL + :rtype: pd.DataFrame """ sql = """ SELECT DISTINCT mh.parent_molregno, @@ -79,14 +79,12 @@ def add_chembl_properties_and_structures( df_cpd_props = pd.read_sql_query(sql, con=chembl_con) - df_combined = df_combined.merge(df_cpd_props, on="parent_molregno", how="left") - - return df_combined, df_cpd_props + return df_cpd_props -def add_ligand_efficiency_metrics(df_combined: pd.DataFrame) -> pd.DataFrame: +def calculate_ligand_efficiency_metrics(dataset: Dataset): """ - Calculate the ligand efficiency metrics for the compounds + Calculate and add the ligand efficiency metrics for the compounds based on the mean pchembl values for a compound-target pair and the following ligand efficiency (LE) formulas: @@ -108,33 +106,37 @@ def add_ligand_efficiency_metrics(df_combined: pd.DataFrame) -> pd.DataFrame: Once for the pchembl values based on binding + functional assays (BF) and once for the pchembl values based on binding assays only (B). - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame - :return: Pandas DataFrame with added ligand efficiency metrics - :rtype: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + Will be updated to include ligand efficiency metrics. + :type dataset: Dataset """ for suffix in ["BF", "B"]: - df_combined.loc[df_combined["heavy_atoms"] != 0, f"LE_{suffix}"] = ( - df_combined[f"pchembl_value_mean_{suffix}"] - / df_combined["heavy_atoms"] + dataset.df_result.loc[dataset.df_result["heavy_atoms"] != 0, f"LE_{suffix}"] = ( + dataset.df_result[f"pchembl_value_mean_{suffix}"] + / dataset.df_result["heavy_atoms"] * (2.303 * 298 * 0.00199) ) - df_combined.loc[df_combined["mw_freebase"] != 0, f"BEI_{suffix}"] = ( - df_combined[f"pchembl_value_mean_{suffix}"] + dataset.df_result.loc[ + dataset.df_result["mw_freebase"] != 0, f"BEI_{suffix}" + ] = ( + dataset.df_result[f"pchembl_value_mean_{suffix}"] * 1000 - / df_combined["mw_freebase"] + / dataset.df_result["mw_freebase"] ) - df_combined.loc[df_combined["psa"] != 0, f"SEI_{suffix}"] = ( - df_combined[f"pchembl_value_mean_{suffix}"] * 100 / df_combined["psa"] + dataset.df_result.loc[dataset.df_result["psa"] != 0, f"SEI_{suffix}"] = ( + dataset.df_result[f"pchembl_value_mean_{suffix}"] + * 100 + / dataset.df_result["psa"] ) - df_combined[f"LLE_{suffix}"] = ( - df_combined[f"pchembl_value_mean_{suffix}"] - df_combined["alogp"] + dataset.df_result[f"LLE_{suffix}"] = ( + dataset.df_result[f"pchembl_value_mean_{suffix}"] + - dataset.df_result["alogp"] ) - df_combined = df_combined.astype( + dataset.df_result = dataset.df_result.astype( { f"LE_{suffix}": "float64", f"BEI_{suffix}": "float64", @@ -143,26 +145,19 @@ def add_ligand_efficiency_metrics(df_combined: pd.DataFrame) -> pd.DataFrame: } ) - return df_combined - -def add_atc_classification( - df_combined: pd.DataFrame, chembl_con: sqlite3.Connection -) -> tuple[pd.DataFrame, pd.DataFrame]: +def get_atc_classification(chembl_con: sqlite3.Connection) -> pd.DataFrame: """ - Query and add ATC classifications (level 1) from the atc_classification and + Query ATC classifications (level 1) from the atc_classification and molecule_atc_classification tables. - ATC level annotations for the same parent_molregno are combined into one description - that concatenates all descriptions sorted alphabetically + ATC level annotations for the same parent_molregno are combined into one description + that concatenates all descriptions sorted alphabetically into one string with ' | ' as a separator. - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame :param chembl_con: Sqlite3 connection to ChEMBL database. :type chembl_con: sqlite3.Connection - :return: - Pandas DataFrame with added ATC classifications \\ - - Pandas DataFrame with ATC annotations in ChEMBL - :rtype: (pd.DataFrame, pd.DataFrame) + :return: Pandas DataFrame with ATC annotations in ChEMBL. + :rtype: pd.DataFrame """ sql = """ SELECT DISTINCT mh.parent_molregno, atc.level1, atc.level1_description @@ -185,14 +180,12 @@ def add_atc_classification( ].transform(lambda x: between_str_join.join(sorted(x))) atc_levels = atc_levels[["parent_molregno", "atc_level1"]].drop_duplicates() - df_combined = df_combined.merge(atc_levels, on="parent_molregno", how="left") - - return df_combined, atc_levels + return atc_levels def add_all_chembl_compound_properties( - df_combined: pd.DataFrame, chembl_con: sqlite3.Connection, limit_to_literature: bool -) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + dataset: Dataset, chembl_con: sqlite3.Connection, limit_to_literature: bool +): """ Add ChEMBL-based compound properties to the given compound-target pairs, specifically: @@ -202,24 +195,33 @@ def add_all_chembl_compound_properties( - ligand efficiency metrics - ATC classifications - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + Will be updated to include compound properties. + :type dataset: Dataset :param chembl_con: Sqlite3 connection to ChEMBL database. :type chembl_con: sqlite3.Connection - :param limit_to_literature: Base first_publication_cpd on literature sources only if True. + :param limit_to_literature: Base first_publication_cpd on literature sources only if True. Base it on all available sources otherwise. :type limit_to_literature: bool - :return: - Pandas DataFrame with added compound properties \\ - - Pandas DataFrame with compound properties and structures for all compound ids in ChEMBL \\ - - Pandas DataFrame with ATC annotations in ChEMBL - :rtype: (pd.DataFrame, pd.DataFrame, pd.DataFrame) """ - df_combined = add_first_publication_date( - df_combined, chembl_con, limit_to_literature + df_docs = get_first_publication_cpd_date(chembl_con, limit_to_literature) + dataset.df_result = dataset.df_result.merge( + df_docs, on="parent_molregno", how="left" ) - df_combined, df_cpd_props = add_chembl_properties_and_structures( - df_combined, chembl_con + + df_cpd_props = get_chembl_properties_and_structures(chembl_con) + dataset.df_cpd_props = df_cpd_props + dataset.df_result = dataset.df_result.merge( + df_cpd_props, on="parent_molregno", how="left" + ) + sanity_checks.check_compound_props(dataset.df_result, df_cpd_props) + + calculate_ligand_efficiency_metrics(dataset) + sanity_checks.check_ligand_efficiency_metrics(dataset.df_result) + + atc_levels = get_atc_classification(chembl_con) + dataset.atc_levels = atc_levels + dataset.df_result = dataset.df_result.merge( + atc_levels, on="parent_molregno", how="left" ) - df_combined = add_ligand_efficiency_metrics(df_combined) - df_combined, atc_levels = add_atc_classification(df_combined, chembl_con) - return df_combined, df_cpd_props, atc_levels + sanity_checks.check_atc(dataset.df_result, atc_levels) diff --git a/src/add_chembl_target_class_annotations.py b/src/add_chembl_target_class_annotations.py index 25c1d01..bb8d080 100644 --- a/src/add_chembl_target_class_annotations.py +++ b/src/add_chembl_target_class_annotations.py @@ -1,10 +1,17 @@ +""" +Add target class annotations based on ChEMBL data to the dataset. +""" + import logging import os import sqlite3 import pandas as pd -import write_subsets +from arguments import OutputArgs, CalculationArgs +from dataset import Dataset +import output +import sanity_checks ########### Add Target Class Annotations Based on ChEMBL Data ########### @@ -78,56 +85,31 @@ def get_target_class_table( return df_target_classes -def add_chembl_target_class_annotations( - df_combined: pd.DataFrame, +def get_aggregated_target_classes( + dataset: Dataset, chembl_con: sqlite3.Connection, - output_path: str, - write_to_csv: bool, - write_to_excel: bool, - delimiter: str, - chembl_version: str, - limited_flag: str, -) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: +) -> tuple[pd.DataFrame, pd.DataFrame]: """ - Add level 1 and 2 target class annotations. - Assignments for target IDs with more than one target class assignment per level - are summarised into one string with '|' as a separator - between the different target class annotations. + Get mappings for target id to aggregated level 1 / level 2 target class. - Targets with more than one level 1 / level 2 target class assignment are written to a file. - These could be reassigned by hand if a single target class is preferable. - - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + :type dataset: Dataset :param chembl_con: Sqlite3 connection to ChEMBL database. :type chembl_con: sqlite3.Connection - :param output_path: Path to write the targets with more than one target class assignment to - :type output_path: str - :param write_to_csv: True if output should be written to csv - :type write_to_csv: bool - :param write_to_excel: True if output should be written to excel - :type write_to_excel: bool - :param delimiter: Delimiter in csv-output - :type delimiter: str - :param chembl_version: Version of ChEMBL for output files - :type chembl_version: str - :param limited_flag: Document suffix indicating - whether the dataset was limited to literature sources - :type limited_flag: str - :return: - Pandas DataFrame with added target class annotations \\ - - Pandas DataFrame with mapping from target id to level 1 target class \\ - - Pandas DataFrame with mapping from target id to level 2 target class - :rtype: (pd.DataFrame, pd.DataFrame, pd.DataFrame) + :return: [pandas DataFrame with mapping from target id to level 1 target class, + pandas DataFrame with mapping from target id to level 2 target class] + :rtype: tuple[pd.DataFrame, pd.DataFrame] """ - current_tids = set(df_combined["tid"]) + current_tids = set(dataset.df_result["tid"]) df_target_classes = get_target_class_table(chembl_con, current_tids) + between_str_join = "|" + # Summarise the information for a target id with # several assigned target classes of level 1 into one description. # If a target id has more than one assigned target class, # the target class 'Unclassified protein' is discarded. level = "l1" - between_str_join = "|" target_classes_level1 = df_target_classes[["tid", level]].drop_duplicates().dropna() # remove 'Unclassified protein' from targets with more than one target class, level 1 @@ -155,8 +137,6 @@ def add_chembl_target_class_annotations( ["tid", "target_class_l1"] ].drop_duplicates() - df_combined = df_combined.merge(target_classes_level1, on="tid", how="left") - # Repeat the summary step for target classes of level 2. level = "l2" target_classes_level2 = df_target_classes[["tid", level]].drop_duplicates().dropna() @@ -167,12 +147,27 @@ def add_chembl_target_class_annotations( ["tid", "target_class_l2"] ].drop_duplicates() - df_combined = df_combined.merge(target_classes_level2, on="tid", how="left") + return target_classes_level1, target_classes_level2 + - # Output targets have more than one target class assignment - more_than_one_level_1 = df_combined[ - (df_combined["target_class_l1"].notnull()) - & (df_combined["target_class_l1"].str.contains("|", regex=False)) +def output_ambiguous_target_classes( + dataset: Dataset, + args: CalculationArgs, + out: OutputArgs, +): + """ + Output targets have more than one target class assignment + + :param dataset: Dataset with compound-target pairs. + :type dataset: Dataset + :param args: Arguments related to how to calculate the dataset + :type args: CalculationArgs + :param out: Arguments related to how to output the dataset + :type out: OutputArgs + """ + more_than_one_level_1 = dataset.df_result[ + (dataset.df_result["target_class_l1"].notnull()) + & (dataset.df_result["target_class_l1"].str.contains("|", regex=False)) ][ ["tid", "target_pref_name", "target_type", "target_class_l1", "target_class_l2"] ].drop_duplicates() @@ -180,9 +175,9 @@ def add_chembl_target_class_annotations( "Targets with more than one level 1 target class assignment: %s", len(more_than_one_level_1), ) - more_than_one_level_2 = df_combined[ - (df_combined["target_class_l2"].notnull()) - & (df_combined["target_class_l2"].str.contains("|", regex=False)) + more_than_one_level_2 = dataset.df_result[ + (dataset.df_result["target_class_l2"].notnull()) + & (dataset.df_result["target_class_l2"].str.contains("|", regex=False)) ][ ["tid", "target_pref_name", "target_type", "target_class_l1", "target_class_l2"] ].drop_duplicates() @@ -199,15 +194,60 @@ def add_chembl_target_class_annotations( ) name_more_than_one_tclass = os.path.join( - output_path, - f"ChEMBL{chembl_version}_CTI_{limited_flag}_targets_w_more_than_one_tclass", + out.output_path, + f"ChEMBL{args.chembl_version}_" + f"CTI_{args.limited_flag}_targets_w_more_than_one_tclass", ) - write_subsets.write_output( + output.write_output( more_than_one_tclass, name_more_than_one_tclass, - write_to_csv, - write_to_excel, - delimiter, + out, + ) + + +def add_chembl_target_class_annotations( + dataset: Dataset, + chembl_con: sqlite3.Connection, + args: CalculationArgs, + out: OutputArgs, +): + """ + Add level 1 and 2 target class annotations. + Assignments for target IDs with more than one target class assignment per level + are summarised into one string with '|' as a separator + between the different target class annotations. + + Targets with more than one level 1 / level 2 target class assignment are written to a file. + These could be reassigned by hand if a single target class is preferable. + + :param dataset: Dataset with compound-target pairs. + Will be updated to only include target class annotations. + dataset.target_classes_level1 will be set to + pandas DataFrame with mapping from target id to level 1 target class + dataset.target_classes_level2 will be set to + pandas DataFrame with mapping from target id to level 2 target class + :type dataset: Dataset + :param chembl_con: Sqlite3 connection to ChEMBL database. + :type chembl_con: sqlite3.Connection + :param args: Arguments related to how to calculate the dataset + :type args: CalculationArgs + :param out: Arguments related to how to output the dataset + :type out: OutputArgs + """ + target_classes_level1, target_classes_level2 = get_aggregated_target_classes( + dataset, chembl_con + ) + + dataset.df_result = dataset.df_result.merge( + target_classes_level1, on="tid", how="left" + ) + + dataset.df_result = dataset.df_result.merge( + target_classes_level2, on="tid", how="left" + ) + + sanity_checks.check_target_classes( + dataset.df_result, target_classes_level1, target_classes_level2 ) - return df_combined, target_classes_level1, target_classes_level2 + output_ambiguous_target_classes(dataset, args, out) diff --git a/src/add_dti_annotations.py b/src/add_dti_annotations.py index 9f181c5..94367e9 100644 --- a/src/add_dti_annotations.py +++ b/src/add_dti_annotations.py @@ -1,12 +1,14 @@ -import pandas as pd +""" +Add DTI (Drug-Target Interaction) Annotations to the dataset. +""" +from dataset import Dataset -########### CTI (Compound-Target Interaction) Annotations ########### + +########### DTI (Drug-Target Interaction) Annotations ########### def add_dti_annotations( - df_combined: pd.DataFrame, - drug_mechanism_pairs_set: set, - drug_mechanism_targets_set: set, -) -> pd.DataFrame: + dataset: Dataset, +): """ Every compound-target pair is assigned a DTI (drug target interaction) annotation. @@ -60,84 +62,95 @@ def add_dti_annotations( and for which the target was also not in the drug_mechanisms table (not a comparator compound), are discarded. - :param df_combined: Pandas DataFrame with compound-target pairs - based on activities AND drug_mechanism table - :type df_combined: pd.DataFrame - :param drug_mechanism_pairs_set: set of compound-target pairs in the drug_mechanism table - :type drug_mechanism_pairs_set: set - :param drug_mechanism_targets_set: set of targets in the drug_mechanism table - :type drug_mechanism_targets_set: set - :return: Pandas DataFrame with all compound-target pairs and their DTI annotations. - :rtype: pd.DataFrame + :param dataset: Dataset with all relevant information: + - Pandas DataFrame with compound-target pairs + based on activities AND drug_mechanism table + - set of compound-target pairs in the drug_mechanism table + - set of targets in the drug_mechanism table + :type dataset: Dataset """ # Add a new column *therapeutic_target* which is set to True # if the target is in the drug_mechanism table - df_combined["therapeutic_target"] = df_combined["tid"].isin( - drug_mechanism_targets_set + dataset.df_result["therapeutic_target"] = dataset.df_result["tid"].isin( + dataset.drug_mechanism_targets_set ) # Assign the annotations based on the table. # Compound-target pairs from the drug mechanism table - df_combined.loc[ + dataset.df_result.loc[ ( - df_combined["cpd_target_pair"].isin(drug_mechanism_pairs_set) - & (df_combined["max_phase"] == 4) + dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set) + & (dataset.df_result["max_phase"] == 4) ), "DTI", ] = "D_DT" - df_combined.loc[ + + dataset.df_result.loc[ ( - df_combined["cpd_target_pair"].isin(drug_mechanism_pairs_set) - & (df_combined["max_phase"] == 3) + dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set) + & (dataset.df_result["max_phase"] == 3) ), "DTI", ] = "C3_DT" - df_combined.loc[ + + dataset.df_result.loc[ ( - df_combined["cpd_target_pair"].isin(drug_mechanism_pairs_set) - & (df_combined["max_phase"] == 2) + dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set) + & (dataset.df_result["max_phase"] == 2) ), "DTI", ] = "C2_DT" - df_combined.loc[ + + dataset.df_result.loc[ ( - df_combined["cpd_target_pair"].isin(drug_mechanism_pairs_set) - & (df_combined["max_phase"] == 1) + dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set) + & (dataset.df_result["max_phase"] == 1) ), "DTI", ] = "C1_DT" + # Compounds that are in the drug_mechanism table but don't have a known phase between 1-4: - df_combined.loc[ + dataset.df_result.loc[ ( - df_combined["cpd_target_pair"].isin(drug_mechanism_pairs_set) - & (~df_combined["max_phase"].isin([1, 2, 3, 4])) + dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set) + & (~dataset.df_result["max_phase"].isin([1, 2, 3, 4])) ), "DTI", ] = "C0_DT" # Target is in the drug mechanism table - df_combined.loc[ + dataset.df_result.loc[ ( - (~df_combined["cpd_target_pair"].isin(drug_mechanism_pairs_set)) - & (df_combined["therapeutic_target"] == True) + ( + ~dataset.df_result["cpd_target_pair"].isin( + dataset.drug_mechanism_pairs_set + ) + ) + & (dataset.df_result["therapeutic_target"]) ), "DTI", ] = "DT" # Other compound-target pairs # if target is not a therapeutic target, 'cpd_target_pair' cannot be in DTIs_set - # (~df_combined['cpd_target_pair'].isin(DTIs_set)) is included for clarity - df_combined.loc[ + # (~dataset.df_result['cpd_target_pair'].isin(DTIs_set)) is included for clarity + dataset.df_result.loc[ ( - (~df_combined["cpd_target_pair"].isin(drug_mechanism_pairs_set)) - & (df_combined["therapeutic_target"] == False) + ( + ~dataset.df_result["cpd_target_pair"].isin( + dataset.drug_mechanism_pairs_set + ) + ) + & ~(dataset.df_result["therapeutic_target"]) ), "DTI", ] = "NDT" # Discard NDT rows - df_combined = df_combined[ - (df_combined["DTI"].isin(["D_DT", "C3_DT", "C2_DT", "C1_DT", "C0_DT", "DT"])) + dataset.df_result = dataset.df_result[ + ( + dataset.df_result["DTI"].isin( + ["D_DT", "C3_DT", "C2_DT", "C1_DT", "C0_DT", "DT"] + ) + ) ] - - return df_combined diff --git a/src/add_filtering_columns.py b/src/add_filtering_columns.py new file mode 100644 index 0000000..27d4076 --- /dev/null +++ b/src/add_filtering_columns.py @@ -0,0 +1,214 @@ +""" +Add filtering columns for obtaining the different subsets to the dataset. +""" + +import logging +import os + +import pandas as pd + +from arguments import CalculationArgs, OutputArgs +from dataset import Dataset +import get_stats +import output + + +def get_data_subsets(data: pd.DataFrame, min_nof_cpds: int, desc: str) -> tuple[ + tuple[pd.DataFrame, str], + tuple[pd.DataFrame, str], + tuple[pd.DataFrame, str], + tuple[pd.DataFrame, str], +]: + """ + Calculate and return the different subsets of interest. + + :param data: Pandas DataFrame with compound-target pairs + :type data: pd.DataFrame + :param min_nof_cpds: Miminum number of compounds per target + :type min_nof_cpds: int + :param desc: Types of assays current_df contains information about. + Options: "BF" (binding+functional), "B" (binding) + :type desc: str + :return: List of dataset subsets and the string describing them + - data: Pandas DataFrame with compound-target pairs + without filtering columns and without + the annotations for the opposite desc, + e.g. if desc = "BF", the average pchembl value based on + binding data only is dropped + - df_enough_cpds: Pandas DataFrame with targets + with at least compounds with a pchembl value, + - df_c_dt_d_dt: As df_enough_cpds but with + at least one compound-target pair labelled as + 'D_DT', 'C3_DT', 'C2_DT', 'C1_DT' or 'C0_DT' (i.e., known interaction), + - df_d_dt: As df_enough_cpds but with + at least one compound-target pair labelled as + 'D_DT' (i.e., known drug-target interaction) + :rtype: tuple[tuple[pd.DataFrame, str], + tuple[pd.DataFrame, str], + tuple[pd.DataFrame, str], + tuple[pd.DataFrame, str]] + """ + if desc == "B": + drop_desc = "BF" + else: + drop_desc = "B" + data = data.drop( + columns=[ + f"pchembl_value_mean_{drop_desc}", + f"pchembl_value_max_{drop_desc}", + f"pchembl_value_median_{drop_desc}", + f"first_publication_cpd_target_pair_{drop_desc}", + f"first_publication_cpd_target_pair_w_pchembl_{drop_desc}", + f"LE_{drop_desc}", + f"BEI_{drop_desc}", + f"SEI_{drop_desc}", + f"LLE_{drop_desc}", + ] + + [ # exclude columns related to the other assay types + col for col in data.columns if col.startswith("B_") or col.startswith("BF_") + ] # exclude filtering columns + ).drop_duplicates() + + # Restrict the dataset to targets with at least *min_nof_cpds* compounds with a pchembl value. + comparator_counts = ( + data[data[f"pchembl_value_mean_{desc}"].notnull()] + .groupby(["tid_mutation"])["parent_molregno"] + .count() + ) + # pylint: disable-next=unused-variable + targets_w_enough_cpds = comparator_counts[ + comparator_counts >= min_nof_cpds + ].index.tolist() + df_enough_cpds = data.query("tid_mutation in @targets_w_enough_cpds") + + # Restrict the dataset further to targets + # with at least one compound-target pair labelled as + # 'D_DT', 'C3_DT', 'C2_DT', 'C1_DT' or 'C0_DT', + # i.e., compound-target pairs with a known interactions. + # pylint: disable-next=unused-variable + c_dt_d_dt_targets = set( + df_enough_cpds[ + df_enough_cpds["DTI"].isin(["D_DT", "C3_DT", "C2_DT", "C1_DT", "C0_DT"]) + ].tid_mutation.to_list() + ) + df_c_dt_d_dt = df_enough_cpds.query("tid_mutation in @c_dt_d_dt_targets") + + # Restrict the dataset further to targets with + # at least one compound-target pair labelled as 'D_DT', + # i.e., known drug-target interactions. + # pylint: disable-next=unused-variable + d_dt_targets = set( + df_enough_cpds[df_enough_cpds["DTI"] == "D_DT"].tid_mutation.to_list() + ) + df_d_dt = df_enough_cpds.query("tid_mutation in @d_dt_targets") + + return [ + [data, f"{desc}"], + [df_enough_cpds, f"{desc}_{min_nof_cpds}"], + [df_c_dt_d_dt, f"{desc}_{min_nof_cpds}_c_dt_d_dt"], + [df_d_dt, f"{desc}_{min_nof_cpds}_d_dt"], + ] + + +def add_subset_filtering_columns( + df_combined_subset: pd.DataFrame, + dataset: Dataset, + desc: str, + args: CalculationArgs, + out: OutputArgs, +): + """ + Add filtering column for binding + functional vs binding + + :param df_combined_subset: Subset with binding+functional (BF) or binding (B) assay-based data + in df_combined + :type df_combined_subset: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + Will be updated to only include filtering columns. + :type dataset: Dataset + :param desc: Assay description, + either "BF" (binding+functional) or "B" (binding) + :type desc: str + :param args: Arguments related to how to calculate the dataset + :type args: CalculationArgs + :param out: Arguments related to how to output the dataset + :type out: OutputArgs + """ + subsets = get_data_subsets( + df_combined_subset, + args.min_nof_cpds_bf if desc == "BF" else args.min_nof_cpds_b, + desc, + ) + + # write subsets if required + if (desc == "BF" and out.write_bf) or (desc == "B" and out.write_b): + for [df_subset, subset_desc] in subsets: + name_subset = os.path.join( + out.output_path, + f"ChEMBL{args.chembl_version}_" + f"CTI_{args.limited_flag}_" + f"{subset_desc}", + ) + output.write_and_check_output( + df_subset, + name_subset, + desc, + args, + out, + ) + + # add filtering columns to df_combined + # do not add a filtering column for BF / B (-> [1:]) + for [df, col_name] in subsets[1:]: + dataset.df_result[col_name] = False + dataset.df_result.loc[(dataset.df_result.index.isin(df.index)), col_name] = True + # check that filtering works + assert dataset.df_result[dataset.df_result[col_name]][df.columns].equals( + df + ), f"Filtering is not accurate for {col_name}." + + if logging.DEBUG >= logging.root.level: + for [df_subset, subset_desc] in subsets: + get_stats.add_debugging_info(dataset, df_subset, subset_desc) + + +def add_filtering_columns( + dataset: Dataset, + args: CalculationArgs, + out: OutputArgs, +): + """ + Add filtering columns to main dataset and save subsets if required. + + :param dataset: Dataset with compound-target pairs. + Will be updated to only include filtering columns. + :type dataset: Dataset + :param args: Arguments related to how to calculate the dataset + :type args: CalculationArgs + :param out: Arguments related to how to output the dataset + :type out: OutputArgs + """ + # consider binding and functional assays + # assay description = binding+functional + desc = "BF" + # df_combined without binding only data + df_combined_subset = dataset.df_result.copy() + add_subset_filtering_columns( + df_combined_subset, + dataset, + desc, + args, + out, + ) + + # consider only binding assays + # assay description = binding + desc = "B" + df_combined_subset = dataset.df_result[dataset.df_result["keep_for_binding"]].copy() + add_subset_filtering_columns( + df_combined_subset, + dataset, + desc, + args, + out, + ) diff --git a/src/add_rdkit_compound_descriptors.py b/src/add_rdkit_compound_descriptors.py index 5c7d4e8..889ff70 100644 --- a/src/add_rdkit_compound_descriptors.py +++ b/src/add_rdkit_compound_descriptors.py @@ -1,72 +1,79 @@ -import pandas as pd +""" +Add RDKit-based compound properties to the dataset. +""" + from rdkit import Chem from rdkit.Chem import Descriptors from rdkit.Chem import PandasTools from tqdm import tqdm +from dataset import Dataset +import sanity_checks + -def add_built_in_descriptors(df_combined: pd.DataFrame) -> pd.DataFrame: +def add_built_in_descriptors(dataset: Dataset): """ Add RDKit built-in compound descriptors. + :param dataset: Dataset with compound-target pairs. + Will be updated to only include built-in RDKit compound descriptors. + :type dataset: Dataset :param df_combined: Pandas DataFrame with compound-target pairs :type df_combined: pd.DataFrame - :return: Pandas DataFrame with added built-in RDKit compound descriptors - :rtype: pd.DataFrame """ # add a column with RDKit molecules, used to calculate the descriptors PandasTools.AddMoleculeColumnToFrame( - df_combined, "canonical_smiles", "mol", includeFingerprints=False + dataset.df_result, "canonical_smiles", "mol", includeFingerprints=False ) - df_combined.loc[:, "fraction_csp3"] = df_combined["mol"].apply( + dataset.df_result.loc[:, "fraction_csp3"] = dataset.df_result["mol"].apply( Descriptors.FractionCSP3 ) - df_combined.loc[:, "ring_count"] = df_combined["mol"].apply(Descriptors.RingCount) - df_combined.loc[:, "num_aliphatic_rings"] = df_combined["mol"].apply( - Descriptors.NumAliphaticRings + dataset.df_result.loc[:, "ring_count"] = dataset.df_result["mol"].apply( + Descriptors.RingCount ) - df_combined.loc[:, "num_aliphatic_carbocycles"] = df_combined["mol"].apply( - Descriptors.NumAliphaticCarbocycles - ) - df_combined.loc[:, "num_aliphatic_heterocycles"] = df_combined["mol"].apply( - Descriptors.NumAliphaticHeterocycles + dataset.df_result.loc[:, "num_aliphatic_rings"] = dataset.df_result["mol"].apply( + Descriptors.NumAliphaticRings ) - df_combined.loc[:, "num_aromatic_rings"] = df_combined["mol"].apply( + dataset.df_result.loc[:, "num_aliphatic_carbocycles"] = dataset.df_result[ + "mol" + ].apply(Descriptors.NumAliphaticCarbocycles) + dataset.df_result.loc[:, "num_aliphatic_heterocycles"] = dataset.df_result[ + "mol" + ].apply(Descriptors.NumAliphaticHeterocycles) + dataset.df_result.loc[:, "num_aromatic_rings"] = dataset.df_result["mol"].apply( Descriptors.NumAromaticRings ) - df_combined.loc[:, "num_aromatic_carbocycles"] = df_combined["mol"].apply( - Descriptors.NumAromaticCarbocycles - ) - df_combined.loc[:, "num_aromatic_heterocycles"] = df_combined["mol"].apply( - Descriptors.NumAromaticHeterocycles - ) - df_combined.loc[:, "num_saturated_rings"] = df_combined["mol"].apply( + dataset.df_result.loc[:, "num_aromatic_carbocycles"] = dataset.df_result[ + "mol" + ].apply(Descriptors.NumAromaticCarbocycles) + dataset.df_result.loc[:, "num_aromatic_heterocycles"] = dataset.df_result[ + "mol" + ].apply(Descriptors.NumAromaticHeterocycles) + dataset.df_result.loc[:, "num_saturated_rings"] = dataset.df_result["mol"].apply( Descriptors.NumSaturatedRings ) - df_combined.loc[:, "num_saturated_carbocycles"] = df_combined["mol"].apply( - Descriptors.NumSaturatedCarbocycles - ) - df_combined.loc[:, "num_saturated_heterocycles"] = df_combined["mol"].apply( - Descriptors.NumSaturatedHeterocycles - ) - df_combined.loc[:, "num_stereocentres"] = df_combined["mol"].apply( + dataset.df_result.loc[:, "num_saturated_carbocycles"] = dataset.df_result[ + "mol" + ].apply(Descriptors.NumSaturatedCarbocycles) + dataset.df_result.loc[:, "num_saturated_heterocycles"] = dataset.df_result[ + "mol" + ].apply(Descriptors.NumSaturatedHeterocycles) + dataset.df_result.loc[:, "num_stereocentres"] = dataset.df_result["mol"].apply( Chem.rdMolDescriptors.CalcNumAtomStereoCenters ) - df_combined.loc[:, "num_heteroatoms"] = df_combined["mol"].apply( + dataset.df_result.loc[:, "num_heteroatoms"] = dataset.df_result["mol"].apply( Descriptors.NumHeteroatoms ) # add scaffolds - PandasTools.AddMurckoToFrame(df_combined, "mol", "scaffold_w_stereo") + PandasTools.AddMurckoToFrame(dataset.df_result, "mol", "scaffold_w_stereo") # remove stereo information of the molecule to add scaffolds without stereo information - df_combined["mol"].apply(Chem.RemoveStereochemistry) - PandasTools.AddMurckoToFrame(df_combined, "mol", "scaffold_wo_stereo") + dataset.df_result["mol"].apply(Chem.RemoveStereochemistry) + PandasTools.AddMurckoToFrame(dataset.df_result, "mol", "scaffold_wo_stereo") # drop the column with RDKit molecules - df_combined = df_combined.drop(["mol"], axis=1) - - return df_combined + dataset.df_result = dataset.df_result.drop(["mol"], axis=1) def calculate_aromatic_atoms( @@ -121,7 +128,7 @@ def calculate_aromatic_atoms( return aromatic_atoms_dict, aromatic_c_dict, aromatic_n_dict, aromatic_hetero_dict -def add_aromaticity_descriptors(df_combined: pd.DataFrame) -> pd.DataFrame: +def add_aromaticity_descriptors(dataset: Dataset): """ Add number of aromatic atoms in a compounds, specifically: @@ -130,40 +137,40 @@ def add_aromaticity_descriptors(df_combined: pd.DataFrame) -> pd.DataFrame: - # aromatic nitrogen atoms (aromatic_n) - # aromatic hetero atoms (aromatic_hetero) - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame - :return: Pandas DataFrame with added counts of aromatic atoms - :rtype: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + Will be updated to only include counts of aromatic atoms + :type dataset: Dataset """ # use df_combined_w_smiles to exclude null values - smiles_set = set(df_combined["canonical_smiles"]) + smiles_set = set(dataset.df_result["canonical_smiles"]) aromatic_atoms_dict, aromatic_c_dict, aromatic_n_dict, aromatic_hetero_dict = ( calculate_aromatic_atoms(smiles_set) ) - df_combined["aromatic_atoms"] = df_combined["canonical_smiles"].map( + dataset.df_result["aromatic_atoms"] = dataset.df_result["canonical_smiles"].map( aromatic_atoms_dict ) - df_combined["aromatic_c"] = df_combined["canonical_smiles"].map(aromatic_c_dict) - df_combined["aromatic_n"] = df_combined["canonical_smiles"].map(aromatic_n_dict) - df_combined["aromatic_hetero"] = df_combined["canonical_smiles"].map( + dataset.df_result["aromatic_c"] = dataset.df_result["canonical_smiles"].map( + aromatic_c_dict + ) + dataset.df_result["aromatic_n"] = dataset.df_result["canonical_smiles"].map( + aromatic_n_dict + ) + dataset.df_result["aromatic_hetero"] = dataset.df_result["canonical_smiles"].map( aromatic_hetero_dict ) - return df_combined - -def add_rdkit_compound_descriptors(df_combined: pd.DataFrame) -> pd.DataFrame: +def add_rdkit_compound_descriptors(dataset: Dataset): """ Add RDKit-based compound descriptors (built-in and numbers of aromatic atoms). - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame - :return: Pandas DataFrame with added built-in RDKit compound descriptors - and numbers of aromatic atoms - :rtype: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + Will be updated to only include + built-in RDKit compound descriptors + and numbers of aromatic atoms. + :type dataset: Dataset """ - df_combined = add_built_in_descriptors(df_combined) - df_combined = add_aromaticity_descriptors(df_combined) - - return df_combined + add_built_in_descriptors(dataset) + add_aromaticity_descriptors(dataset) + sanity_checks.check_rdkit_props(dataset.df_result) diff --git a/src/arguments.py b/src/arguments.py new file mode 100644 index 0000000..ea02154 --- /dev/null +++ b/src/arguments.py @@ -0,0 +1,178 @@ +""" +Dataclasses related to handling arguments, +specifically arguments related to how to calculate or output the dataset. +""" + +import argparse + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class CalculationArgs: + """ + Collection of arguments related to how to calculate the dataset. + + chembl_version: Version of ChEMBL for output file names + calculate_rdkit: True if RDKit-based compound properties should be calculated + limit_to_literature: Include only literature sources if True + limited_flag: String version of limit_to_literature used in file names + min_nof_cpds_bf: Minimum number of compounds per target for the BF subset + min_nof_cpds_b: Minimum number of compounds per target for the B subset + """ + + chembl_version: str + calculate_rdkit: bool + limit_to_literature: bool + limited_flag: str + min_nof_cpds_bf: int + min_nof_cpds_b: int + + +@dataclass(frozen=True) +class OutputArgs: + """ + Collection of arguments related to how to output the dataset. + + output_path: Path to write output files to + delimiter: Delimiter in csv-output + write_to_csv: True if output should be written to csv + write_to_excel: True if output should be written to excel + write_full_dataset: True if the full dataset should be written to output + write_bf: True if subsets based on binding+functional data should be written to output + write_b: True if subsets based on binding data only should be written to output + """ + + output_path: str + delimiter: str + write_to_csv: bool + write_to_excel: bool + write_full_dataset: bool + write_bf: bool + write_b: bool + + +def parse_args() -> argparse.Namespace: + """ + Get arguments with argparse. + + :return: Populated argparse.Namespace + :rtype: argparse.Namespace + """ + parser = argparse.ArgumentParser( + description="Extract the compound-target pairs dataset from ChEMBL. \ + The full dataset plus filtering columns for binding vs. binding+functional data \ + will always be written to csv. \ + Additional outputs and output types can be chosen with the parameters below." + ) + + parser.add_argument( + "--chembl", + "-v", + dest="chembl_version", + metavar="", + type=str, + default=None, + help="ChEMBL version. \ + Latest version if None. \ + Required if a path to a SQLite database is provided, \ + i.e., if --sqlite is set. (default: None)", + ) + parser.add_argument( + "--sqlite", + "-s", + metavar="", + type=str, + default=None, + help="Path to SQLite database. \ + ChEMBL is downloaded as an SQLite database \ + and handled by chembl_downloader if None. (default: None)", + ) + parser.add_argument( + "--output", + "-o", + dest="output_path", + metavar="", + type=str, + required=True, + help="Path to write the output file(s) to. (required)", + ) + parser.add_argument( + "--delimiter", + "-d", + metavar="", + type=str, + default=";", + help="Delimiter in output csv-files. (default: ;)", + ) + parser.add_argument( + "--all_sources", + action="store_true", + help="If this is set, the dataset is calculated based on all sources in ChEMBL. \ + This includes data from BindingDB which may skew the results. \ + Default (not set): the dataset is calculated based on only literature data.", + ) + parser.add_argument( + "--rdkit", + dest="calculate_rdkit", + action="store_true", + help="Calculate RDKit-based compound properties.", + ) + parser.add_argument( + "--excel", + dest="write_to_excel", + action="store_true", + help="Write the results to excel. Note: this may fail if the output is too large.", + ) + parser.add_argument( + "--BF", + dest="write_bf", + action="store_true", + help="Write binding+functional data subsets.", + ) + parser.add_argument( + "--B", dest="write_b", action="store_true", help="Write binding data subsets." + ) + parser.add_argument( + "--debug", action="store_true", help="Log additional debugging information." + ) + args = parser.parse_args() + + return args + + +def get_args() -> tuple[argparse.Namespace, CalculationArgs, OutputArgs]: + """ + Get parsed and default arguments. + + :return: parserd arguments, + arguments related to how to calculate the dataset as CalculationArgs, + arguments related to how to output the dataset as OutputArgs + :rtype: tuple[argparse.Namespace, CalculationArgs, OutputArgs] + """ + args = parse_args() + + calc_args = CalculationArgs( + chembl_version=args.chembl_version, + calculate_rdkit=args.calculate_rdkit, + limit_to_literature=not args.all_sources, + # used in file names + limited_flag="literature_only" if not args.all_sources else "all_sources", + min_nof_cpds_bf=100, + min_nof_cpds_b=100, + ) + + output_args = OutputArgs( + output_path=args.output_path, + delimiter=args.delimiter, + # Always write the results to csv. + write_to_csv=True, + write_to_excel=args.write_to_excel, + # Always write the full dataset plus filtering columns + # for binding vs. binding+functional data. + write_full_dataset=True, + write_bf=args.write_bf, + write_b=args.write_b, + ) + + return args, calc_args, output_args diff --git a/src/clean_dataset.py b/src/clean_dataset.py index 01a2564..56517dd 100644 --- a/src/clean_dataset.py +++ b/src/clean_dataset.py @@ -1,12 +1,19 @@ +""" +Methods related to cleaning the dataset. +""" + import logging import sqlite3 import pandas as pd +from dataset import Dataset + +########### Remove Irrelevant Compounds ########### def remove_compounds_without_smiles_and_mixtures( - df_combined: pd.DataFrame, chembl_con: sqlite3.Connection -) -> pd.DataFrame: + dataset: Dataset, chembl_con: sqlite3.Connection +): """ Remove @@ -16,12 +23,12 @@ def remove_compounds_without_smiles_and_mixtures( Since compound information is aggregated for the parents of salts, the number of smiles with a dot is relatively low. - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + Will be updated to only include + compound-target pairs with a smiles that does not contain a '.' + :type dataset: Dataset :param chembl_con: Sqlite3 connection to ChEMBL database. :type chembl_con: sqlite3.Connection - :return: Pandas DataFrame with compound-target pairs with a smiles that does not contain a '.' - :rtype: pd.DataFrame """ # Double-check that rows with a SMILES containing a '.' are the parent structures, # i.e., there was no error in using salt information instead of parent information. @@ -31,9 +38,9 @@ def remove_compounds_without_smiles_and_mixtures( """ df_hierarchy = pd.read_sql_query(sql, con=chembl_con) - smiles_with_dot = df_combined[ - df_combined["canonical_smiles"].notnull() - & df_combined["canonical_smiles"].str.contains(".", regex=False) + smiles_with_dot = dataset.df_result[ + dataset.df_result["canonical_smiles"].notnull() + & dataset.df_result["canonical_smiles"].str.contains(".", regex=False) ][["canonical_smiles", "parent_molregno"]].drop_duplicates() for parent_molregno in set(smiles_with_dot["parent_molregno"]): @@ -72,42 +79,45 @@ def remove_compounds_without_smiles_and_mixtures( the smiles for the compound in ChEMBL ({parent_smiles_in_chembl})." # Remove rows that contain a SMILES with a dot or that don't have a SMILES. - len_missing_smiles = len(df_combined[df_combined["canonical_smiles"].isnull()]) + len_missing_smiles = len( + dataset.df_result[dataset.df_result["canonical_smiles"].isnull()] + ) len_smiles_w_dot = len( - df_combined[ - df_combined["parent_molregno"].isin(set(smiles_with_dot["parent_molregno"])) + dataset.df_result[ + dataset.df_result["parent_molregno"].isin( + set(smiles_with_dot["parent_molregno"]) + ) ] ) logging.debug("#Compounds without a SMILES: %s", len_missing_smiles) logging.debug("#SMILES with a dot: %s", len_smiles_w_dot) - df_combined = df_combined[ - (df_combined["canonical_smiles"].notnull()) + dataset.df_result = dataset.df_result[ + (dataset.df_result["canonical_smiles"].notnull()) & ~( - df_combined["parent_molregno"].isin(set(smiles_with_dot["parent_molregno"])) + dataset.df_result["parent_molregno"].isin( + set(smiles_with_dot["parent_molregno"]) + ) ) ] - return df_combined - -def clean_none_values(df_combined): +########### General Cleaning Steps ########### +def clean_none_values(dataset: Dataset): """ Change nan values and empty strings to None for consistency. """ # Change all None / nan values to None - df_combined = df_combined.where(pd.notnull(df_combined), None) + dataset.df_result = dataset.df_result.where(pd.notnull(dataset.df_result), None) # replace empty strings with None - df_combined = df_combined.replace("", None).reset_index(drop=True) + dataset.df_result = dataset.df_result.replace("", None).reset_index(drop=True) - return df_combined - -def set_types_to_int(df_combined, calculate_rdkit): +def set_types_to_int(dataset, calculate_rdkit): """ Set the type of relevant columns to Int64. """ - df_combined = df_combined.astype( + dataset.df_result = dataset.df_result.astype( { "first_approval": "Int64", "usan_year": "Int64", @@ -129,7 +139,7 @@ def set_types_to_int(df_combined, calculate_rdkit): ) if calculate_rdkit: - df_combined = df_combined.astype( + dataset.df_result = dataset.df_result.astype( { "num_aliphatic_carbocycles": "Int64", "num_aliphatic_heterocycles": "Int64", @@ -150,26 +160,26 @@ def set_types_to_int(df_combined, calculate_rdkit): } ) - return df_combined - -def round_floats(df_combined, decimal_places=4): +def round_floats(dataset, decimal_places=4): """ Round float columns to decimal places. This does not apply to max_phase. """ - for _, (col, dtype) in enumerate(df_combined.dtypes.to_dict().items()): + for _, (col, dtype) in enumerate(dataset.df_result.dtypes.to_dict().items()): if (dtype in ("float64", "Float64")) and col != "max_phase": - df_combined[col] = df_combined[col].round(decimals=decimal_places) + dataset.df_result[col] = dataset.df_result[col].round( + decimals=decimal_places + ) - return df_combined + return dataset.df_result -def reorder_columns(df_combined, calculate_rdkit): +def reorder_columns(dataset, calculate_rdkit): """ Reorder the columns in the DataFrame. """ - len_columns_before = len(df_combined.columns) + len_columns_before = len(dataset.df_result.columns) compound_target_pair_columns = [ "parent_molregno", @@ -283,7 +293,7 @@ def reorder_columns(df_combined, calculate_rdkit): + rdkit_columns + filtering_columns ) - df_combined = df_combined[columns] + dataset.df_result = dataset.df_result[columns] else: columns = ( compound_target_pair_columns @@ -296,18 +306,16 @@ def reorder_columns(df_combined, calculate_rdkit): + chembl_target_annotations + filtering_columns ) - df_combined = df_combined[columns] + dataset.df_result = dataset.df_result[columns] - len_columns_after = len(df_combined.columns) + len_columns_after = len(dataset.df_result.columns) assert ( len_columns_before == len_columns_after ), f"Different number of columns after reordering \ (before: {len_columns_before}, after: {len_columns_after})." - return df_combined - -def clean_dataset(df_combined: pd.DataFrame, calculate_rdkit: bool) -> pd.DataFrame: +def clean_dataset(dataset: Dataset, calculate_rdkit: bool) -> pd.DataFrame: """ Clean the dataset by @@ -317,18 +325,16 @@ def clean_dataset(df_combined: pd.DataFrame, calculate_rdkit: bool) -> pd.DataFr - reordering columns - sorting rows by cpd_target_pair_mutation - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + Will be updated to clean version with the updates described above. + :type dataset: Dataset :param calculate_rdkit: True if the DataFrame contains RDKit-based compound properties :type calculate_rdkit: bool - :return: Cleaned pandas DataFrame with compound-target pairs - :rtype: pd.DataFrame """ - df_combined = clean_none_values(df_combined) - df_combined = set_types_to_int(df_combined, calculate_rdkit) - df_combined = round_floats(df_combined, decimal_places=4) - df_combined = reorder_columns(df_combined, calculate_rdkit) - df_combined = df_combined.sort_values(by=["cpd_target_pair_mutation"]).reset_index( - drop=True - ) - return df_combined + clean_none_values(dataset) + set_types_to_int(dataset, calculate_rdkit) + round_floats(dataset, decimal_places=4) + reorder_columns(dataset, calculate_rdkit) + dataset.df_result = dataset.df_result.sort_values( + by=["cpd_target_pair_mutation"] + ).reset_index(drop=True) diff --git a/src/dataset.py b/src/dataset.py new file mode 100644 index 0000000..19fd259 --- /dev/null +++ b/src/dataset.py @@ -0,0 +1,30 @@ +""" +Dataclass for handling the calculated compound-target pair dataset +and related data. +""" + +from dataclasses import dataclass + +import pandas as pd + + +@dataclass +class Dataset: + """ + df_result: Pandas DataFrame with the full dataset + drug_mechanism_pairs_set: Set of compound-target pairs in the drug_mechanism table, + used for DTI assignments + drug_mechanism_targets_set: Set of targets in the drug_mechanism table, + used for DTI assigments + df_sizes_all: Pandas DataFrame of intermediate sizes of the dataset, + used for debugging + df_sizes_pchembl: Pandas DataFrame of intermediate sizes of the dataset, + restricted to entries with a pchembl value, + used for debugging + """ + + df_result: pd.DataFrame + drug_mechanism_pairs_set: set + drug_mechanism_targets_set: set + df_sizes_all: pd.DataFrame + df_sizes_pchembl: pd.DataFrame diff --git a/src/get_activity_ct_pairs.py b/src/get_activity_ct_pairs.py index 7c825db..3f6a02c 100644 --- a/src/get_activity_ct_pairs.py +++ b/src/get_activity_ct_pairs.py @@ -1,17 +1,20 @@ -import logging +""" +Get initial set of compound-target pairs with an associated activity +for the dataset. +""" + import sqlite3 import numpy as np import pandas as pd -import get_stats +from dataset import Dataset ########### Get Initial Compound-Target Data From ChEMBL ########### def get_compound_target_pairs_with_pchembl( chembl_con: sqlite3.Connection, limit_to_literature: bool, - df_sizes: list[list[int], list[int]], ) -> pd.DataFrame: """ Query ChEMBL activities and related assay for compound-target pairs @@ -27,8 +30,6 @@ def get_compound_target_pairs_with_pchembl( :param limit_to_literature: Include only literature sources if True. Include all available sources otherwise. :type limit_to_literature: bool - :param df_sizes: List of intermediate sized of the dataset used for debugging. - :type df_sizes: list[list[int], list[int]] :return: Pandas DataFrame with compound-target pairs with a pchembl value. :rtype: pd.DataFrame """ @@ -84,9 +85,6 @@ def get_compound_target_pairs_with_pchembl( f"{a}_{b}" for a, b in zip(df_mols["parent_molregno"], df_mols["tid_mutation"]) ] - if logging.DEBUG >= logging.root.level: - get_stats.add_dataset_sizes(df_mols, "initial query", df_sizes) - return df_mols @@ -170,10 +168,9 @@ def get_average_info(df: pd.DataFrame, suffix: str) -> pd.DataFrame: ########### Get Aggregated Compound-Target Pair Information ########### -def get_aggregated_activity_ct_pairs( +def get_aggregated_compound_target_pairs_with_pchembl( chembl_con: sqlite3.Connection, limit_to_literature: bool, - df_sizes: list[list[int], list[int]], ) -> pd.DataFrame: """ Get dataset of compound target-pairs with an associated pchembl value @@ -194,14 +191,13 @@ def get_aggregated_activity_ct_pairs( :param limit_to_literature: Include only literature sources if True. Include all available sources otherwise. :type limit_to_literature: bool - :param df_sizes: List of intermediate sized of the dataset used for debugging. - :type df_sizes: list[list[int], list[int]] - :return: Pandas Dataframe with compound-target pairs based on ChEMBL activity data - aggregated into one entry per compound-target pair. + :return: Pandas Dataframe with compound-target pairs + based on ChEMBL activity data aggregated into one entry per compound-target pair. :rtype: pd.DataFrame """ df_mols = get_compound_target_pairs_with_pchembl( - chembl_con, limit_to_literature, df_sizes + chembl_con, + limit_to_literature, ) # Summarise the information for binding and functional assays @@ -232,3 +228,34 @@ def get_aggregated_activity_ct_pairs( ) return df_combined + + +def get_aggregated_activity_ct_pairs( + chembl_con: sqlite3.Connection, + limit_to_literature: bool, +) -> Dataset: + """ + Wrapper for get_aggregated_compound_target_pairs_with_pchembl, + initialising a dataset. + + :param chembl_con: Sqlite3 connection to ChEMBL database. + :type chembl_con: sqlite3.Connection + :param limit_to_literature: Include only literature sources if True. + Include all available sources otherwise. + :type limit_to_literature: bool + :return: Dataset with a pandas Dataframe with compound-target pairs + based on ChEMBL activity data aggregated into one entry per compound-target pair. + :rtype: Dataset + """ + df_result = get_aggregated_compound_target_pairs_with_pchembl( + chembl_con, limit_to_literature + ) + + dataset = Dataset( + df_result, + set(), + set(), + pd.DataFrame(), + pd.DataFrame(), + ) + return dataset diff --git a/src/get_dataset.py b/src/get_dataset.py index 2fa6b91..faffb3b 100644 --- a/src/get_dataset.py +++ b/src/get_dataset.py @@ -1,224 +1,90 @@ +""" +Main workflow to calculate the compound-target pairs dataset. +""" + import logging -import os import sqlite3 +from arguments import OutputArgs, CalculationArgs +import add_filtering_columns import get_activity_ct_pairs -import get_drug_mechanism_ct_pairs -import add_dti_annotations import add_chembl_compound_properties -import clean_dataset import add_chembl_target_class_annotations +import get_drug_mechanism_ct_pairs +import add_dti_annotations import add_rdkit_compound_descriptors -import sanity_checks -import write_subsets +import clean_dataset import get_stats +import output +import sanity_checks def get_ct_pair_dataset( - chembl_con: sqlite3.Connection, - chembl_version: str, - output_path: str, - limit_to_literature: bool, - calculate_rdkit: bool, - write_to_csv: bool, - write_to_excel: bool, - delimiter: str, - write_full_dataset: bool, - write_bf: bool, - write_b: bool, + chembl_con: sqlite3.Connection, args: CalculationArgs, out: OutputArgs ): """ Calculate and output the compound-target pair dataset. :param chembl_con: Sqlite3 connection to ChEMBL database :type chembl_con: sqlite3.Connection - :param chembl_version: Version of ChEMBL for output file names - :type chembl_version: str - :param output_path: Path to write output files to - :type output_path: str - :param limit_to_literature: Include only literature sources if True. - Include all available sources otherwise. - :type limit_to_literature: bool - :param calculate_rdkit: True if RDKit-based compound properties should be calculated - :type calculate_rdkit: bool - :param write_to_csv: True if output should be written to csv - :type write_to_csv: bool - :param write_to_excel: True if output should be written to excel - :type write_to_excel: bool - :param delimiter: Delimiter in csv-output - :type delimiter: str - :param write_full_dataset: True if the full dataset should be written to output - :type write_full_dataset: bool - :param write_bf: True if subsets based on binding+functional data should be written to output - :type write_bf: bool - :param write_b: True if subsets based on binding data only should be written to output - :type write_b: bool + :param args: Arguments related to how to calculate the dataset + :type args: CalculationArgs + :param out: Arguments related to how to output the dataset + :type out: OutputArgs """ - # list with sizes of full dataset and dataset subset with pchembl values for debugging - df_sizes = [[], []] - - # used in file names - if limit_to_literature: - limited_flag = "literature_only" - else: - limited_flag = "all_sources" - logging.info("get_aggregated_activity_ct_pairs") - df_combined = get_activity_ct_pairs.get_aggregated_activity_ct_pairs( - chembl_con, limit_to_literature, df_sizes + dataset = get_activity_ct_pairs.get_aggregated_activity_ct_pairs( + chembl_con, args.limit_to_literature ) - if logging.DEBUG >= logging.root.level: - get_stats.add_dataset_sizes(df_combined, "activity ct-pairs", df_sizes) + get_stats.add_debugging_info(dataset, dataset.df_result, "activity ct-pairs") logging.info("add_cti_from_drug_mechanisms") - df_combined, drug_mechanism_pairs_set, drug_mechanism_targets_set = ( - get_drug_mechanism_ct_pairs.add_drug_mechanism_ct_pairs(df_combined, chembl_con) - ) - if logging.DEBUG >= logging.root.level: - get_stats.add_dataset_sizes(df_combined, "dm ct-pairs", df_sizes) + get_drug_mechanism_ct_pairs.add_drug_mechanism_ct_pairs(dataset, chembl_con) + get_stats.add_debugging_info(dataset, dataset.df_result, "dm ct-pairs") logging.info("add_cti_annotations") - df_combined = add_dti_annotations.add_dti_annotations( - df_combined, drug_mechanism_pairs_set, drug_mechanism_targets_set - ) - if logging.DEBUG >= logging.root.level: - get_stats.add_dataset_sizes(df_combined, "DTI annotations", df_sizes) + add_dti_annotations.add_dti_annotations(dataset) + get_stats.add_debugging_info(dataset, dataset.df_result, "DTI annotations") logging.info("add_all_chembl_compound_properties") - df_combined, df_cpd_props, atc_levels = ( - add_chembl_compound_properties.add_all_chembl_compound_properties( - df_combined, chembl_con, limit_to_literature - ) + add_chembl_compound_properties.add_all_chembl_compound_properties( + dataset, chembl_con, args.limit_to_literature ) - if logging.DEBUG >= logging.root.level: - get_stats.add_dataset_sizes(df_combined, "ChEMBL props", df_sizes) + get_stats.add_debugging_info(dataset, dataset.df_result, "ChEMBL props") logging.info("remove_compounds_without_smiles_and_mixtures") - df_combined = clean_dataset.remove_compounds_without_smiles_and_mixtures( - df_combined, chembl_con - ) - if logging.DEBUG >= logging.root.level: - get_stats.add_dataset_sizes(df_combined, "removed smiles", df_sizes) + clean_dataset.remove_compounds_without_smiles_and_mixtures(dataset, chembl_con) + get_stats.add_debugging_info(dataset, dataset.df_result, "removed smiles") logging.info("add_chembl_target_class_annotations") - df_combined, target_classes_level1, target_classes_level2 = ( - add_chembl_target_class_annotations.add_chembl_target_class_annotations( - df_combined, - chembl_con, - output_path, - write_to_csv, - write_to_excel, - delimiter, - chembl_version, - limited_flag, - ) + add_chembl_target_class_annotations.add_chembl_target_class_annotations( + dataset, + chembl_con, + args, + out, ) - if logging.DEBUG >= logging.root.level: - get_stats.add_dataset_sizes(df_combined, "tclass annotations", df_sizes) + get_stats.add_debugging_info(dataset, dataset.df_result, "tclass annotations") - logging.info("add_rdkit_compound_descriptors") - if calculate_rdkit: - df_combined = add_rdkit_compound_descriptors.add_rdkit_compound_descriptors( - df_combined - ) - if logging.DEBUG >= logging.root.level: - get_stats.add_dataset_sizes(df_combined, "RDKit props", df_sizes) + if args.calculate_rdkit: + logging.info("add_rdkit_compound_descriptors") + add_rdkit_compound_descriptors.add_rdkit_compound_descriptors(dataset) + get_stats.add_debugging_info(dataset, dataset.df_result, "RDKit props") logging.info("clean_dataset") - df_combined = clean_dataset.clean_dataset(df_combined, calculate_rdkit) - if logging.DEBUG >= logging.root.level: - get_stats.add_dataset_sizes(df_combined, "clean df", df_sizes) + clean_dataset.clean_dataset(dataset, args.calculate_rdkit) + get_stats.add_debugging_info(dataset, dataset.df_result, "clean df") logging.info("sanity_checks") - sanity_checks.sanity_checks( - df_combined, - df_cpd_props, - atc_levels, - target_classes_level1, - target_classes_level2, - calculate_rdkit, - ) + sanity_checks.sanity_checks(dataset) - logging.info("write_BF_to_file") - min_nof_cpds_bf = 100 - df_combined_annotated = write_subsets.write_bf_to_file( - df_combined, - chembl_version, - min_nof_cpds_bf, - output_path, - write_bf, - write_to_csv, - write_to_excel, - delimiter, - limited_flag, - calculate_rdkit, - df_sizes, - ) - - logging.info("write_B_to_file") - min_nof_cpds_b = 100 - df_combined_annotated = write_subsets.write_b_to_file( - df_combined, - df_combined_annotated, - chembl_version, - min_nof_cpds_b, - output_path, - write_b, - write_to_csv, - write_to_excel, - delimiter, - limited_flag, - calculate_rdkit, - df_sizes, - ) + logging.info("add_filtering_columns") + add_filtering_columns.add_filtering_columns(dataset, args, out) logging.info("write_full_dataset_to_file") - write_subsets.write_full_dataset_to_file( - df_combined_annotated, - chembl_version, - output_path, - write_full_dataset, - write_to_csv, - write_to_excel, - delimiter, - limited_flag, - calculate_rdkit, - ) + output.write_full_dataset_to_file(dataset, args, out) logging.info("output_stats") - - output_file = os.path.join( - output_path, f"ChEMBL{chembl_version}_CTI_{limited_flag}_full_dataset_stats" - ) - write_subsets.output_stats( - df_combined_annotated, output_file, write_to_csv, write_to_excel, delimiter - ) - if write_bf: - output_file = os.path.join( - output_path, - f"ChEMBL{chembl_version}_CTI_{limited_flag}_BF_{min_nof_cpds_bf}_c_dt_d_dt_stats", - ) - write_subsets.output_stats( - df_combined_annotated[df_combined_annotated["BF_100_c_dt_d_dt"]], - output_file, - write_to_csv, - write_to_excel, - delimiter, - ) - if write_b: - output_file = os.path.join( - output_path, - f"ChEMBL{chembl_version}_CTI_{limited_flag}_B_{min_nof_cpds_b}_c_dt_d_dt_stats", - ) - write_subsets.output_stats( - df_combined_annotated[df_combined_annotated["B_100_c_dt_d_dt"]], - output_file, - write_to_csv, - write_to_excel, - delimiter, - ) + output.output_all_stats(dataset, args, out) if logging.DEBUG >= logging.root.level: - write_subsets.output_debug_sizes( - df_sizes, output_path, write_to_csv, write_to_excel, delimiter - ) + output.write_debug_sizes(dataset, out) diff --git a/src/get_drug_mechanism_ct_pairs.py b/src/get_drug_mechanism_ct_pairs.py index be67c59..6121acd 100644 --- a/src/get_drug_mechanism_ct_pairs.py +++ b/src/get_drug_mechanism_ct_pairs.py @@ -1,8 +1,16 @@ +""" +Get and add compound-target pairs based on information +in the drug_mechanism table. +""" + import logging import sqlite3 import pandas as pd +from dataset import Dataset +import sanity_checks + ########### Extract Drug-Target Interactions From the drug_mechanism Table ########### def get_drug_mechanisms_interactions(chembl_con: sqlite3.Connection) -> pd.DataFrame: @@ -151,7 +159,7 @@ def add_annotations_to_drug_mechanisms_cti( :return: Updated pandas DataFrame with the additional annotations. :rtype: pd.DataFrame """ - ##### Set columns existing in the df_combined table. ##### + ##### Set columns existing in the df_results table. ##### # None of the targets from the drug mechanism table have any mutation annotation, # hence tid_mutation = tid cpd_target_pairs["tid_mutation"] = cpd_target_pairs["tid"].astype("str") @@ -239,50 +247,73 @@ def get_drug_mechanism_ct_pairs(chembl_con: sqlite3.Connection) -> pd.DataFrame: ########### Add Compounds From the drug_mechanism Table to the Dataset ########### -def add_drug_mechanism_ct_pairs( - df_combined: pd.DataFrame, chembl_con: sqlite3.Connection -) -> tuple[pd.DataFrame, set, set]: +def add_dm_filtering_columns(dataset: Dataset): """ - Add compound-target pairs from the drug_mechanism table - that are not in the dataset based on the initial ChEMBL query. - These are compound-target pairs for which there is no associated pchembl value data. - Since the pairs are known interactions, - they are added to the dataset despite not having a pchembl value. + Add filtering columns related to the drug_mechanism table. + - pair_mutation_in_dm_table: pair is in dm table (incl. mutations) + - pair_in_dm_table: pair is in dm table (excl. mutations) + - keep_for_binding: use to limit to binding assays - :param df_combined: Pandas Dataframe with compound-target pairs based on ChEMBL activity data - :type df_combined: pd.DataFrame - :param chembl_con: Sqlite3 connection to ChEMBL database. - :type chembl_con: sqlite3.Connection - :return: - Pandas DataFrame with compound-target pairs - based on activities AND drug_mechanism table \\ - - set of compound-target pairs in the drug_mechanism table \\ - - set of targets in the drug_mechanism table - :rtype: (pd.DataFrame, set, set) + :param dataset: Pandas Dataframe with compound-target pairs based on ChEMBL activity data + :type dataset: Dataset """ - cpd_target_pairs = get_drug_mechanism_ct_pairs(chembl_con) - drug_mechanism_pairs_set = set( - f"{a}_{b}" - for a, b in zip(cpd_target_pairs["parent_molregno"], cpd_target_pairs["tid"]) - ) - - drug_mechanism_targets_set = set(cpd_target_pairs["tid"]) - # Add a new column *pair_mutation_in_dm_table* which is set to True if the compound target pair # (taking mutation annotations into account) is in the drug_mechanism table. - df_combined["pair_mutation_in_dm_table"] = False - df_combined.loc[ - (df_combined["cpd_target_pair_mutation"].isin(drug_mechanism_pairs_set)), + dataset.df_result["pair_mutation_in_dm_table"] = False + dataset.df_result.loc[ + ( + dataset.df_result["cpd_target_pair_mutation"].isin( + dataset.drug_mechanism_pairs_set + ) + ), "pair_mutation_in_dm_table", ] = True # Add a new column *pair_in_dm_table* which is set to True if the compound target pair # (NOT taking mutation annotations into account) is in the drug_mechanism table. - df_combined["pair_in_dm_table"] = False - df_combined.loc[ - (df_combined["cpd_target_pair"].isin(drug_mechanism_pairs_set)), + dataset.df_result["pair_in_dm_table"] = False + dataset.df_result.loc[ + (dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set)), "pair_in_dm_table", ] = True + # Add a new column *keep_for_binding* which is set to True if the row should be kept + # if you want to limit the dataset to only data based on binding assays. + # Rows are kept if + # - there is a binding data-based pchembl value or + # - the compound-target pair (including mutation info) is in the drug_mechanism table + dataset.df_result["keep_for_binding"] = False + dataset.df_result.loc[ + ( + (dataset.df_result["pchembl_value_mean_B"].notnull()) + | (dataset.df_result["pair_mutation_in_dm_table"]) + ), + "keep_for_binding", + ] = True + + +def add_drug_mechanism_ct_pairs(dataset: Dataset, chembl_con: sqlite3.Connection): + """ + Add compound-target pairs from the drug_mechanism table + that are not in the dataset based on the initial ChEMBL query. + These are compound-target pairs for which there is no associated pchembl value data. + Since the pairs are known interactions, + they are added to the dataset despite not having a pchembl value. + Add the set of compound-target pairs in the drug_mechanism table and + the set of targets in the drug_mechanism table to the dataset. + + :param dataset: Pandas Dataframe with compound-target pairs based on ChEMBL activity data + :type dataset: Dataset + :param chembl_con: Sqlite3 connection to ChEMBL database. + :type chembl_con: sqlite3.Connection + """ + cpd_target_pairs = get_drug_mechanism_ct_pairs(chembl_con) + dataset.drug_mechanism_pairs_set = set( + f"{a}_{b}" + for a, b in zip(cpd_target_pairs["parent_molregno"], cpd_target_pairs["tid"]) + ) + dataset.drug_mechanism_targets_set = set(cpd_target_pairs["tid"]) + ##### Limit the drug_mechanism pairs to the ones that are not yet in the dataset. ##### # Mutation annotations are taken into account. # Therefore, *(cpd A, target B without mutation)* will be added @@ -291,7 +322,7 @@ def add_drug_mechanism_ct_pairs( cpd_target_pairs = cpd_target_pairs[ ~( cpd_target_pairs["cpd_target_pair_mutation"].isin( - set(df_combined["cpd_target_pair_mutation"]) + set(dataset.df_result["cpd_target_pair_mutation"]) ) ) ].copy() @@ -302,20 +333,8 @@ def add_drug_mechanism_ct_pairs( ) # Combined data of existing query with new compound-target pairs. - df_combined = pd.concat([df_combined, cpd_target_pairs]) + dataset.df_result = pd.concat([dataset.df_result, cpd_target_pairs]) - # Add a new column *keep_for_binding* which is set to True if the row should be kept - # if you want to limit the dataset to only data based on binding assays. - # Rows are kept if - # - there is a binding data-based pchembl value or - # - the compound-target pair (including mutation info) is in the drug_mechanism table - df_combined["keep_for_binding"] = False - df_combined.loc[ - ( - (df_combined["pchembl_value_mean_B"].notnull()) - | (df_combined["pair_mutation_in_dm_table"] == True) - ), - "keep_for_binding", - ] = True + add_dm_filtering_columns(dataset) - return df_combined, drug_mechanism_pairs_set, drug_mechanism_targets_set + sanity_checks.check_pairs_without_pchembl_are_in_drug_mechanisms(dataset.df_result) diff --git a/src/get_stats.py b/src/get_stats.py index e27a4a5..662f937 100644 --- a/src/get_stats.py +++ b/src/get_stats.py @@ -1,79 +1,40 @@ -import pandas as pd - - -##### Debugging Stats ##### -def calculate_dataset_sizes(df: pd.DataFrame) -> list[int]: - """ - Calculate the number of unique compounds, targets and pairs - for df and df limited to drugs. - - :param df: Pandas DataFrame for which the dataset sizes should be calculated. - :type df: pd.DataFrame - :return: List of calculated unique counts. - :rtype: list[int] - """ - now_mols = df["parent_molregno"].nunique() - now_targets = df["tid"].nunique() - now_targets_mutation = df["tid_mutation"].nunique() - now_pairs = df["cpd_target_pair"].nunique() - now_pairs_mutation = df["cpd_target_pair_mutation"].nunique() +""" +Get statistics of dataset for final results and debugging. +""" - if "DTI" in df.columns: - # drugs = compounds of a compound-target pair with a known interaction - df_drugs = df[df["DTI"] == "D_DT"] - else: - df_drugs = df[df["max_phase"] == 4] - - now_drugs = df_drugs["parent_molregno"].nunique() - now_drug_targets = df_drugs["tid"].nunique() - now_drug_targets_mutation = df_drugs["tid_mutation"].nunique() - now_drug_pairs = df_drugs["cpd_target_pair"].nunique() - now_drug_pairs_mutation = df_drugs["cpd_target_pair_mutation"].nunique() +import logging +import pandas as pd - return [ - now_mols, - now_drugs, - now_targets, - now_drug_targets, - now_targets_mutation, - now_drug_targets_mutation, - now_pairs, - now_drug_pairs, - now_pairs_mutation, - now_drug_pairs_mutation, - ] +from dataset import Dataset -def add_dataset_sizes( - df: pd.DataFrame, label: str, df_sizes: list[list[int], list[int]] -): +##### Logging Stats ##### +def get_stats_columns() -> tuple[list[str], list[str]]: """ - Count and add representative counts of df to the list df_sizes used for debugging. - - :param df: Pandas DataFrame with current compound-target pairs - :type df: pd.DataFrame - :param label: Description of pipeline step (e.g., initial query). - :type label: str - :param df_sizes: List of intermediate sized of the dataset used for debugging. - :type df_sizes: list[list[int], list[int]] + Get the relevant columns for which stats should be calculated + and a list of descriptions corresponding to the columns. """ - df_copy = df.copy() - df_sizes[0].append([label] + calculate_dataset_sizes(df_copy)) - - # restrict to data with any pchembl value (any data with a pchembl, - # even if it is based on only functional data) - # these statistics are purely based on removing - # compound-target pairs without pchembl information, - # i.e., the subset of the dataset is determined by the given df and not recalculated - df_pchembl = df_copy.dropna( - subset=[x for x in df_copy.columns if x.startswith("pchembl_value")], how="all" - ) - df_sizes[1].append([label] + calculate_dataset_sizes(df_pchembl)) + df_columns = [ + "parent_molregno", + "tid", + "tid_mutation", + "cpd_target_pair", + "cpd_target_pair_mutation", + ] + columns_descs = [ + "compound ID", + "target ID", + "target ID with mutation annotations", + "compound-target pair", + "compound-target pair with mutation annotations", + ] + return df_columns, columns_descs -##### Logging Stats ##### def get_stats_for_column( - df: pd.DataFrame, column: str, columns_desc: str + df: pd.DataFrame, + column: str, + columns_desc: str, ) -> list[list[str, str, int]]: """ Calculate the number of unique values in df[column] and various subsets of df. @@ -127,3 +88,78 @@ def get_stats_for_column( df[df["DTI"] == "C0_DT"][column].nunique(), ], ] + + +##### Debugging Stats ##### +def get_dataset_sizes(df: pd.DataFrame, label: str) -> pd.DataFrame: + """ + Calculate the number of unique compounds, targets and pairs + for df and df limited to drugs. + + :param df: Pandas DataFrame for which the dataset sizes should be calculated. + :type df: pd.DataFrame + :param label: Description of pipeline step (e.g., initial query). + :type label: str + :return: Pandas DataFrame with calculated unique counts. + :rtype: pd.DataFrame + """ + stats = {"step": label} + + if "DTI" in df.columns: + # drugs = compounds of a compound-target pair with a known interaction + df_drugs = df[df["DTI"] == "D_DT"] + else: + df_drugs = df[df["max_phase"] == 4] + + df_columns, _ = get_stats_columns() + for column in df_columns: + stats[f"{column}_all"] = df[column].nunique() + stats[f"{column}_drugs"] = df_drugs[column].nunique() + + df_stats = pd.DataFrame([stats]) + return df_stats + + +def add_dataset_sizes( + dataset: Dataset, + df: pd.DataFrame, + label: str, +): + """ + Count and add representative counts of df used for debugging to the dataset. + + :param dataset: Dataset with compound-target pairs and debugging sizes. + :type dataset: Dataset + :param df: Pandas DataFrame with current compound-target pairs + :type df: pd.DataFrame + :param label: Description of pipeline step (e.g., initial query). + :type label: str + """ + df_stats = get_dataset_sizes(df, label) + + dataset.df_sizes_all = pd.concat([dataset.df_sizes_all, df_stats]) + + # restrict to data with any pchembl value (any data with a pchembl, + # even if it is based on only functional data) + # these statistics are purely based on removing + # compound-target pairs without pchembl information, + # i.e., the subset of the dataset is determined by the given df and not recalculated + df_copy = df.copy() + df_pchembl = df_copy.dropna( + subset=[x for x in df_copy.columns if x.startswith("pchembl_value")], how="all" + ) + df_stats = get_dataset_sizes(df_pchembl, label) + dataset.df_sizes_pchembl = pd.concat([dataset.df_sizes_pchembl, df_stats]) + + +def add_debugging_info( + dataset: Dataset, + df: pd.DataFrame, + label: str, +): + """ + Wrapper for add_dataset_sizes. + Handles logging level. + """ + if logging.DEBUG >= logging.root.level: + add_dataset_sizes(dataset, df, label) diff --git a/src/main.py b/src/main.py index 8198f3e..5b297b6 100644 --- a/src/main.py +++ b/src/main.py @@ -1,103 +1,21 @@ -import argparse +""" +Get the compound-target pairs dataset from ChEMBL using the given arguments. +""" + import logging import sqlite3 import chembl_downloader +import arguments import get_dataset -def parse_args() -> argparse.Namespace: - """ - Get arguments with argparse. - - :return: Populated argparse.Namespace - :rtype: argparse.Namespace - """ - parser = argparse.ArgumentParser( - description="Extract the compound-target pairs dataset from ChEMBL. \ - The full dataset plus filtering columns for binding vs. binding+functional data \ - will always be written to csv. \ - Additional outputs and output types can be chosen with the parameters below." - ) - - parser.add_argument( - "--chembl", - "-v", - metavar="", - type=str, - default=None, - help="ChEMBL version. \ - Latest version if None. \ - Required if a path to a SQLite database is provided, \ - i.e., if --sqlite is set. (default: None)", - ) - parser.add_argument( - "--sqlite", - "-s", - metavar="", - type=str, - default=None, - help="Path to SQLite database. \ - ChEMBL is downloaded as an SQLite database \ - and handled by chembl_downloader if None. (default: None)", - ) - parser.add_argument( - "--output", - "-o", - metavar="", - type=str, - required=True, - help="Path to write the output file(s) to. (required)", - ) - parser.add_argument( - "--delimiter", - "-d", - metavar="", - type=str, - default=";", - help="Delimiter in output csv-files. (default: ;)", - ) - parser.add_argument( - "--all_sources", - action="store_true", - help="If this is set, the dataset is calculated based on all sources in ChEMBL. \ - This includes data from BindingDB which may skew the results. \ - Default (not set): the dataset is calculated based on only literature data.", - ) - parser.add_argument( - "--rdkit", - action="store_true", - help="Calculate RDKit-based compound properties.", - ) - parser.add_argument( - "--excel", - action="store_true", - help="Write the results to excel. Note: this may fail if the output is too large.", - ) - parser.add_argument( - "--BF", action="store_true", help="Write binding+functional data subsets." - ) - parser.add_argument("--B", action="store_true", help="Write binding data subsets.") - parser.add_argument( - "--debug", action="store_true", help="Log additional debugging information." - ) - args = parser.parse_args() - - return args - - def main(): """ Call get_ct_pair_dataset to get the compound-target dataset using the given arguments. """ - args = parse_args() - - # Set arguments that are always true. - # Write the results to csv. - csv = True - # Write the full dataset plus filtering columns for binding vs. binding+functional data. - full_df = True + args, calc_args, output_args = arguments.get_args() log_level = "DEBUG" if args.debug else "INFO" numeric_log_level = getattr(logging, log_level, None) @@ -112,35 +30,19 @@ def main(): with sqlite3.connect(args.sqlite) as chembl_con: get_dataset.get_ct_pair_dataset( chembl_con, - args.chembl, - args.output, - not args.all_sources, - args.rdkit, - csv, - args.excel, - args.delimiter, - full_df, - args.BF, - args.B, + calc_args, + output_args, ) else: logging.info("Using chembl_downloader to connect to ChEMBL.") - if args.chembl is None: - args.chembl = chembl_downloader.latest() + if args.chembl_version is None: + args.chembl_version = chembl_downloader.latest() - with chembl_downloader.connect(version=args.chembl) as chembl_con: + with chembl_downloader.connect(version=args.chembl_version) as chembl_con: get_dataset.get_ct_pair_dataset( chembl_con, - args.chembl, - args.output, - not args.all_sources, - args.rdkit, - csv, - args.excel, - args.delimiter, - full_df, - args.BF, - args.B, + calc_args, + output_args, ) diff --git a/src/output.py b/src/output.py new file mode 100644 index 0000000..f1a4a5f --- /dev/null +++ b/src/output.py @@ -0,0 +1,223 @@ +""" +Write the dataset, subsets and related statistics to files +and to the command line. +""" + +import logging +import os +import pandas as pd +import sanity_checks + +from arguments import OutputArgs, CalculationArgs +from dataset import Dataset +import get_stats + + +##### Writing Output ##### +def write_output( + df: pd.DataFrame, + filename: str, + out: OutputArgs, +) -> list[str]: + """ + Write DataFrame df to output file named . + + :param df: Pandas Dataframe to write to output file. + :type df: pd.DataFrame + :param filename: Filename to write the output to + :type filename: bool + :param out: Arguments related to how to output the dataset + :type out: OutputArgs + :return: Returns list of types of files that was written to (csv and/or xlsx) + :rtype: list[str] + """ + file_type_list = [] + if out.write_to_csv: + df.to_csv(f"{filename}.csv", sep=out.delimiter, index=False) + file_type_list.append("csv") + if out.write_to_excel: + try: + with pd.ExcelWriter(f"{filename}.xlsx", engine="xlsxwriter") as writer: + writer.book.use_zip64() + df.to_excel(writer, index=False) + file_type_list.append("xlsx") + except ValueError as e: # full dataset may be too large to write to excel + # remove empty file in case of error to avoid confusion + if os.path.exists(f"{filename}.xlsx"): + os.remove(f"{filename}.xlsx") + print(e) + return file_type_list + + +def write_and_check_output( + df: pd.DataFrame, + filename: str, + assay_type: str, + args: CalculationArgs, + out: OutputArgs, +): + """ + Write df to file and check that writing was successful. + + :param df: Pandas Dataframe to write to output file. + :type df: pd.DataFrame + :param filename: Filename to write the output to + :type filename: bool + :param assay_type: Types of assays current_df contains information about. \ + Options: "BF" (binding+functional), + "B" (binding), + "all" (contains both BF and B information) + :type assay_type: str + :param args: Arguments related to how to calculate the dataset + :type args: CalculationArgs + :param out: Arguments related to how to output the dataset + :type out: OutputArgs + """ + file_type_list = write_output(df, filename, out) + sanity_checks.test_equality( + df, filename, assay_type, file_type_list, args.calculate_rdkit + ) + + +##### Output Specific Results ##### +def write_full_dataset_to_file( + dataset: Dataset, + args: CalculationArgs, + out: OutputArgs, +): + """ + If write_full_dataset, write df_combined with filtering columns to output_path. + + :param dataset: Dataset with compound-target pairs. + :type dataset: Dataset + :param args: Arguments related to how to calculate the dataset + :type args: CalculationArgs + :param out: Arguments related to how to output the dataset + :type out: OutputArgs + """ + desc = "all" + if out.write_full_dataset: + name_all = os.path.join( + out.output_path, + f"ChEMBL{args.chembl_version}_CTI_{args.limited_flag}_full_dataset", + ) + write_and_check_output(dataset.df_result, name_all, desc, args, out) + + +def output_stats( + df: pd.DataFrame, + output_file: str, + out: OutputArgs, +): + """ + Summarise and output the number of unique values in the following columns: + + - parent_molregno (compound ID) + - tid (target ID) + - tid_mutation (target ID + mutation annotations) + - cpd_target_pair (compound-target pairs) + - cpd_target_pair_mutation (compound-target pairs including mutation annotations) + + :param df: Pandas Dataframe for which the stats should be calculated + :type df: pd.DataFrame + :param output_file: Path and filename to write the dataset stats to + :type output_file: str + :param out: Arguments related to how to output the dataset + :type out: OutputArgs + """ + logging.debug("Stats for %s", output_file) + stats = [] + df_columns, columns_descs = get_stats.get_stats_columns() + for column, columns_desc in zip(df_columns, columns_descs): + logging.debug("Stats for column %s:", column) + column_stats = get_stats.get_stats_for_column(df, column, columns_desc) + stats += column_stats + for colum_stat in column_stats: + logging.debug("%20s %s", colum_stat[2], colum_stat[3]) + + df_stats = pd.DataFrame( + stats, columns=["column", "column_description", "subset_type", "counts"] + ) + write_output( + df_stats, + output_file, + out, + ) + + +def output_all_stats(dataset: Dataset, args: CalculationArgs, out: OutputArgs): + """ + Output stats for all datasets and subsets calculated. + + :param dataset: Dataset with compound-target pairs. + :type dataset: Dataset + :param args: Arguments related to how to calculate the dataset + :type args: CalculationArgs + :param out: Arguments related to how to output the dataset + :type out: OutputArgs + """ + output_file = os.path.join( + out.output_path, + f"ChEMBL{args.chembl_version}_CTI_{args.limited_flag}_full_dataset_stats", + ) + + output_stats(dataset.df_result, output_file, out) + + if out.write_bf: + output_file = os.path.join( + out.output_path, + f"ChEMBL{args.chembl_version}_" + f"CTI_{args.limited_flag}_" + f"BF_{args.min_nof_cpds_bf}_c_dt_d_dt_stats", + ) + output_stats( + dataset.df_result[dataset.df_result["BF_100_c_dt_d_dt"]], + output_file, + out, + ) + + if out.write_b: + output_file = os.path.join( + out.output_path, + f"ChEMBL{args.chembl_version}_" + f"CTI_{args.limited_flag}_" + f"B_{args.min_nof_cpds_b}_c_dt_d_dt_stats", + ) + output_stats( + dataset.df_result[dataset.df_result["B_100_c_dt_d_dt"]], + output_file, + out, + ) + + +def write_debug_sizes( + dataset: Dataset, + out: OutputArgs, +): + """ + Output counts at various points during calculating the final dataset for debugging. + + :param dataset: Dataset with compound-target pairs and debugging sizes. + :type dataset: Dataset + :param args: Arguments related to how to calculate the dataset + :type args: CalculationArgs + :param out: Arguments related to how to output the dataset + :type out: OutputArgs + """ + # Size of full dataset at different points. + name_full_df_sizes = os.path.join(out.output_path, "debug_full_df_sizes") + write_output( + dataset.df_sizes_all, + name_full_df_sizes, + out, + ) + + # Size of dataset with any pchembl values at different points. + # This includes data for which we only have pchembl data + # for functional assays but not for binding assays. + name_pchembl_df_sizes = os.path.join(out.output_path, "debug_pchembl_df_sizes") + write_output( + dataset.df_sizes_pchembl, + name_pchembl_df_sizes, + out, + ) diff --git a/src/sanity_checks.py b/src/sanity_checks.py index b34517e..94c5d6d 100644 --- a/src/sanity_checks.py +++ b/src/sanity_checks.py @@ -1,39 +1,14 @@ -import pandas as pd - - -########### Sanity checks for the dataset ########### -def check_null_values(df_combined: pd.DataFrame): - """ - Check if any columns contain nan or null which aren't recognised as null values. - """ - for col in df_combined.columns: - col_as_str = set(df_combined[df_combined[col].notnull()][col].astype(str)) - assert ( - "nan" not in col_as_str - ), f"Problem with unrecognised nan value in column {col}" - assert ( - "null" not in col_as_str - ), f"Problem with unrecognised null value in column {col}" +""" +Perform sanity checks on the dataset. +""" +import pandas as pd -def check_for_mixed_types(df_combined: pd.DataFrame): - """ - Check that there are no mixed types in columns with dtype=object. - """ - for col, dtype in df_combined.dtypes.to_dict().items(): - if dtype == object: - col_original = set(df_combined[df_combined[col].notnull()][col]) - col_as_str = set(df_combined[df_combined[col].notnull()][col].astype(str)) - # is there a difference in the two sets (ignoring null values) - assert ( - len(col_original - col_as_str) == 0 - ), f"Mixed types in colum {col}: {col_original-col_as_str}" - assert ( - len(col_as_str - col_original) == 0 - ), f"Mixed types in colum {col}: {col_as_str-col_original}" +from dataset import Dataset -def check_pairs_without_pchembl_are_in_drug_mechanisms(df_combined: pd.DataFrame): +########### Sanity checks during assignments ########### +def check_pairs_without_pchembl_are_in_drug_mechanisms(df_result: pd.DataFrame): """ Check that rows without a pchembl value based on binding+functional assays (pchembl_x_BF) are in the drug_mechanism table. @@ -47,55 +22,15 @@ def check_pairs_without_pchembl_are_in_drug_mechanisms(df_combined: pd.DataFrame "pchembl_value_max_BF", "pchembl_value_median_BF", ]: - assert df_combined[(df_combined[pchembl_col].isnull())].equals( - df_combined[ - (df_combined["pair_mutation_in_dm_table"] == True) - & (df_combined[pchembl_col].isnull()) + assert df_result[(df_result[pchembl_col].isnull())].equals( + df_result[ + (df_result["pair_mutation_in_dm_table"]) + & (df_result[pchembl_col].isnull()) ] ), f"Missing pchembl value in column {pchembl_col}" -def check_ligand_efficiency_metrics(df_combined: pd.DataFrame): - """ - Check that ligand efficiency metrics are only null - when at least one of the values used to calculate them is null. - Ligand efficiency metrics are only null when at least - one of the values used to calculate them is null. - """ - for suffix in ["BF", "B"]: - assert df_combined[(df_combined[f"LE_{suffix}"].isnull())].equals( - df_combined[ - (df_combined[f"pchembl_value_mean_{suffix}"].isnull()) - | (df_combined["heavy_atoms"].isnull()) - | (df_combined["heavy_atoms"] == 0) - ] - ), f"Missing LE value in LE_{suffix}" - - assert df_combined[(df_combined[f"BEI_{suffix}"].isnull())].equals( - df_combined[ - (df_combined[f"pchembl_value_mean_{suffix}"].isnull()) - | (df_combined["mw_freebase"].isnull()) - | (df_combined["mw_freebase"] == 0) - ] - ), f"Missing BEI value in BEI_{suffix}" - - assert df_combined[(df_combined[f"SEI_{suffix}"].isnull())].equals( - df_combined[ - (df_combined[f"pchembl_value_mean_{suffix}"].isnull()) - | (df_combined["psa"].isnull()) - | (df_combined["psa"] == 0) - ] - ), f"Missing SEI value in SEI_{suffix}" - - assert df_combined[(df_combined[f"LLE_{suffix}"].isnull())].equals( - df_combined[ - (df_combined[f"pchembl_value_mean_{suffix}"].isnull()) - | (df_combined["alogp"].isnull()) - ] - ), f"Missing LLE value in LLE_{suffix}" - - -def check_compound_props(df_combined: pd.DataFrame, df_cpd_props: pd.DataFrame): +def check_compound_props(df_result: pd.DataFrame, df_cpd_props: pd.DataFrame): """ Check that compound props are only null if @@ -104,8 +39,8 @@ def check_compound_props(df_combined: pd.DataFrame, df_cpd_props: pd.DataFrame): """ # missing values because the parent_molregno is not in the compound props table no_cpd_prop_info = len( - df_combined[ - ~df_combined["parent_molregno"].isin(set(df_cpd_props["parent_molregno"])) + df_result[ + ~df_result["parent_molregno"].isin(set(df_cpd_props["parent_molregno"])) ] ) @@ -113,47 +48,95 @@ def check_compound_props(df_combined: pd.DataFrame, df_cpd_props: pd.DataFrame): if col != "parent_molregno": # missing values because the compound props query returns null (exists but is null) missing_values = len( - df_combined[ - df_combined["parent_molregno"].isin( + df_result[ + df_result["parent_molregno"].isin( set(df_cpd_props[df_cpd_props[col].isnull()]["parent_molregno"]) ) ] ) null_values = no_cpd_prop_info + missing_values assert null_values == len( - df_combined[df_combined[col].isnull()] + df_result[df_result[col].isnull()] ), f"Too many null values in {col}" -def check_atc_and_target_classes( - df_combined: pd.DataFrame, +def check_ligand_efficiency_metrics(df_result: pd.DataFrame): + """ + Check that ligand efficiency metrics are only null + when at least one of the values used to calculate them is null. + Ligand efficiency metrics are only null when at least + one of the values used to calculate them is null. + """ + for suffix in ["BF", "B"]: + assert df_result[(df_result[f"LE_{suffix}"].isnull())].equals( + df_result[ + (df_result[f"pchembl_value_mean_{suffix}"].isnull()) + | (df_result["heavy_atoms"].isnull()) + | (df_result["heavy_atoms"] == 0) + ] + ), f"Missing LE value in LE_{suffix}" + + assert df_result[(df_result[f"BEI_{suffix}"].isnull())].equals( + df_result[ + (df_result[f"pchembl_value_mean_{suffix}"].isnull()) + | (df_result["mw_freebase"].isnull()) + | (df_result["mw_freebase"] == 0) + ] + ), f"Missing BEI value in BEI_{suffix}" + + assert df_result[(df_result[f"SEI_{suffix}"].isnull())].equals( + df_result[ + (df_result[f"pchembl_value_mean_{suffix}"].isnull()) + | (df_result["psa"].isnull()) + | (df_result["psa"] == 0) + ] + ), f"Missing SEI value in SEI_{suffix}" + + assert df_result[(df_result[f"LLE_{suffix}"].isnull())].equals( + df_result[ + (df_result[f"pchembl_value_mean_{suffix}"].isnull()) + | (df_result["alogp"].isnull()) + ] + ), f"Missing LLE value in LLE_{suffix}" + + +def check_atc( + df_result: pd.DataFrame, atc_levels: pd.DataFrame, - target_classes_level1: pd.DataFrame, - target_classes_level2: pd.DataFrame, ): """ - Check that atc_level1 and target class information is only null - if the parent_molregno / target id is not in the respective table. + Check that atc_level1 information is only null + if the parent_molregno is not in the respective table. """ - assert df_combined[(df_combined["atc_level1"].isnull())].equals( - df_combined[ - ~df_combined["parent_molregno"].isin(set(atc_levels["parent_molregno"])) + assert df_result[(df_result["atc_level1"].isnull())].equals( + df_result[ + ~df_result["parent_molregno"].isin(set(atc_levels["parent_molregno"])) ] ), "Null values in atc_level1 are not exclusively \ because the parent_molregno is not in the atc_classification table." - assert df_combined[(df_combined["target_class_l1"].isnull())].equals( - df_combined[~df_combined["tid"].isin(set(target_classes_level1["tid"]))] + +def check_target_classes( + df_result: pd.DataFrame, + target_classes_level1: pd.DataFrame, + target_classes_level2: pd.DataFrame, +): + """ + Check that target class information is only null + if the target id is not in the respective table. + """ + assert df_result[(df_result["target_class_l1"].isnull())].equals( + df_result[~df_result["tid"].isin(set(target_classes_level1["tid"]))] ), "Null values in target_class_l1 are not exclusively \ because the tid is not in the protein_classification table." - assert df_combined[(df_combined["target_class_l2"].isnull())].equals( - df_combined[~df_combined["tid"].isin(set(target_classes_level2["tid"]))] + assert df_result[(df_result["target_class_l2"].isnull())].equals( + df_result[~df_result["tid"].isin(set(target_classes_level2["tid"]))] ), "Null values in target_class_l2 are not exclusively \ because the tid is not in the protein_classification table." -def check_rdkit_props(df_combined: pd.DataFrame): +def check_rdkit_props(df_result: pd.DataFrame): """ Check that columns set by the RDKit are only null if there is no canonical SMILES for the molecule. @@ -179,61 +162,59 @@ def check_rdkit_props(df_combined: pd.DataFrame): "aromatic_n", "aromatic_hetero", ]: - assert len(df_combined[df_combined[col].isnull()]) == len( - df_combined[df_combined["canonical_smiles"].isnull()].copy() + assert len(df_result[df_result[col].isnull()]) == len( + df_result[df_result["canonical_smiles"].isnull()].copy() ), f"Missing value in {col} despite a smiles being available." +########### Final sanity checks for the dataset ########### +def check_null_values(df_result: pd.DataFrame): + """ + Check if any columns contain nan or null which aren't recognised as null values. + """ + for col in df_result.columns: + col_as_str = set(df_result[df_result[col].notnull()][col].astype(str)) + assert ( + "nan" not in col_as_str + ), f"Problem with unrecognised nan value in column {col}" + assert ( + "null" not in col_as_str + ), f"Problem with unrecognised null value in column {col}" + + +def check_for_mixed_types(df_result: pd.DataFrame): + """ + Check that there are no mixed types in columns with dtype=object. + """ + for col, dtype in df_result.dtypes.to_dict().items(): + if dtype == object: + col_original = set(df_result[df_result[col].notnull()][col]) + col_as_str = set(df_result[df_result[col].notnull()][col].astype(str)) + # is there a difference in the two sets (ignoring null values) + assert ( + len(col_original - col_as_str) == 0 + ), f"Mixed types in colum {col}: {col_original-col_as_str}" + assert ( + len(col_as_str - col_original) == 0 + ), f"Mixed types in colum {col}: {col_as_str-col_original}" + + def sanity_checks( - df_combined: pd.DataFrame, - df_cpd_props: pd.DataFrame, - atc_levels: pd.DataFrame, - target_classes_level1: pd.DataFrame, - target_classes_level2: pd.DataFrame, - calculate_rdkit: bool, + dataset: Dataset, ): """ Check basic assumptions about the finished dataset, specifically: - no columns contain nan or null values which aren't recognised as null values - there are no mixed types in columns with dtype=object - - rows without a pchembl value based on binding+functional assays (pchembl_x_BF) - are in the drug_mechanism table - - ligand efficiency metrics are only null when at least one of the values - used to calculate them is null - - compound props are only null if the compound is not in df_cpd_props - or the value in that table is null - - atc_level1 and target class information is only null if - the parent_molregno / target id is not in the respective table - - columns set by the RDKit are only null if there is no canonical SMILES - for the molecule (excluding scaffolds) - - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame - :param df_cpd_props: Pandas DataFrame with compound properties - and structures for all compound ids in ChEMBL. - :type df_cpd_props: pd.DataFrame - :param atc_levels: Pandas DataFrame with ATC annotations in ChEMBL - :type atc_levels: pd.DataFrame - :param target_classes_level1: Pandas DataFrame with mapping - from target id to level 1 target class - :type target_classes_level1: pd.DataFrame - :param target_classes_level2: Pandas DataFrame with mapping - from target id to level 2 target class - :type target_classes_level2: pd.DataFrame + + :param dataset: Dataset with compound-target pairs. + :type dataset: Dataset :param calculate_rdkit: True if the DataFrame contains RDKit-based compound properties :type calculate_rdkit: bool """ - check_null_values(df_combined) - check_for_mixed_types(df_combined) - check_pairs_without_pchembl_are_in_drug_mechanisms(df_combined) - check_ligand_efficiency_metrics(df_combined) - check_compound_props(df_combined, df_cpd_props) - check_atc_and_target_classes( - df_combined, atc_levels, target_classes_level1, target_classes_level2 - ) - if calculate_rdkit: - check_rdkit_props(df_combined) + check_null_values(dataset.df_result) + check_for_mixed_types(dataset.df_result) ########### Sanity checks for writing and reading a dataset ########### diff --git a/src/write_subsets.py b/src/write_subsets.py deleted file mode 100644 index b138b35..0000000 --- a/src/write_subsets.py +++ /dev/null @@ -1,624 +0,0 @@ -import logging -import os -import pandas as pd -import sanity_checks - -import get_stats - - -def write_output( - df: pd.DataFrame, - filename: str, - write_to_csv: bool, - write_to_excel: bool, - delimiter: str, -) -> list[str]: - """ - Write DataFrame df to output file named . - - :param df: Pandas Dataframe to write to output file. - :type df: pd.DataFrame - :param filename: Filename to write the output to - :type filename: bool - :param write_to_csv: True if output should be written to csv - :type write_to_csv: bool - :param write_to_excel: True if output should be written to excel - :type write_to_excel: bool - :param delimiter: Delimiter in csv-output - :type delimiter: str - :return: Returns list of types of files that was written to (csv and/or xlsx) - :rtype: list[str] - """ - file_type_list = [] - if write_to_csv: - df.to_csv(f"{filename}.csv", sep=delimiter, index=False) - file_type_list.append("csv") - if write_to_excel: - try: - with pd.ExcelWriter(f"{filename}.xlsx", engine="xlsxwriter") as writer: - writer.book.use_zip64() - df.to_excel(writer, index=False) - file_type_list.append("xlsx") - except ValueError as e: # full dataset may be too large to write to excel - # remove empty file in case of error to avoid confusion - if os.path.exists(f"{filename}.xlsx"): - os.remove(f"{filename}.xlsx") - print(e) - return file_type_list - - -def write_and_check_output( - df: pd.DataFrame, - filename: str, - write_to_csv: bool, - write_to_excel: bool, - delimiter: str, - assay_type: str, - calculate_rdkit: bool, -): - """ - Write df to file and check that writing was successful. - - :param df: Pandas Dataframe to write to output file. - :type df: pd.DataFrame - :param filename: Filename to write the output to - :type filename: bool - :param write_to_csv: True if output should be written to csv - :type write_to_csv: bool - :param write_to_excel: True if output should be written to excel - :type write_to_excel: bool - :param delimiter: Delimiter in csv-output - :type delimiter: str - :param assay_type: Types of assays current_df contains information about. \ - Options: "BF" (binding+functional), - "B" (binding), - "all" (contains both BF and B information) - :type assay_type: str - :param calculate_rdkit: If True, current_df contains RDKit-based columns - :type calculate_rdkit: bool - """ - file_type_list = write_output(df, filename, write_to_csv, write_to_excel, delimiter) - sanity_checks.test_equality( - df, filename, assay_type, file_type_list, calculate_rdkit - ) - - -def get_data_subsets( - data: pd.DataFrame, min_nof_cpds: int, desc: str -) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: - """ - Calculate and return the different subsets of interest. - - :param data: Pandas DataFrame with compound-target pairs - :type data: pd.DataFrame - :param min_nof_cpds: Miminum number of compounds per target - :type min_nof_cpds: int - :param desc: Types of assays current_df contains information about. \ - Options: "BF" (binding+functional), "B" (binding) - :type desc: str - :return: - - data: Pandas DataFrame with compound-target pairs - without the annotations for the opposite desc, \ - e.g. if desc = "BF", the average pchembl value based on - binding data only is dropped - - df_enough_cpds: Pandas DataFrame with targets - with at least compounds with a pchembl value, - - df_c_dt_d_dt: As df_enough_cpds but with \ - at least one compound-target pair labelled as - 'D_DT', 'C3_DT', 'C2_DT', 'C1_DT' or 'C0_DT' (i.e., known interaction), - - df_d_dt: As df_enough_cpds but with \ - at least one compound-target pair labelled as - 'D_DT' (i.e., known drug-target interaction) - :rtype: (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame) - """ - if desc == "B": - drop_desc = "BF" - else: - drop_desc = "B" - data = data.drop( - columns=[ - f"pchembl_value_mean_{drop_desc}", - f"pchembl_value_max_{drop_desc}", - f"pchembl_value_median_{drop_desc}", - f"first_publication_cpd_target_pair_{drop_desc}", - f"first_publication_cpd_target_pair_w_pchembl_{drop_desc}", - f"LE_{drop_desc}", - f"BEI_{drop_desc}", - f"SEI_{drop_desc}", - f"LLE_{drop_desc}", - ] - ).drop_duplicates() - - # Restrict the dataset to targets with at least *min_nof_cpds* compounds with a pchembl value. - comparator_counts = ( - data[data[f"pchembl_value_mean_{desc}"].notnull()] - .groupby(["tid_mutation"])["parent_molregno"] - .count() - ) - # pylint: disable-next=unused-variable - targets_w_enough_cpds = comparator_counts[ - comparator_counts >= min_nof_cpds - ].index.tolist() - df_enough_cpds = data.query("tid_mutation in @targets_w_enough_cpds") - - # Restrict the dataset further to targets - # with at least one compound-target pair labelled as - # 'D_DT', 'C3_DT', 'C2_DT', 'C1_DT' or 'C0_DT', - # i.e., compound-target pairs with a known interactions. - # pylint: disable-next=unused-variable - c_dt_d_dt_targets = set( - df_enough_cpds[ - df_enough_cpds["DTI"].isin(["D_DT", "C3_DT", "C2_DT", "C1_DT", "C0_DT"]) - ].tid_mutation.to_list() - ) - df_c_dt_d_dt = df_enough_cpds.query("tid_mutation in @c_dt_d_dt_targets") - - # Restrict the dataset further to targets with - # at least one compound-target pair labelled as 'D_DT', - # i.e., known drug-target interactions. - # pylint: disable-next=unused-variable - d_dt_targets = set( - df_enough_cpds[df_enough_cpds["DTI"] == "D_DT"].tid_mutation.to_list() - ) - df_d_dt = df_enough_cpds.query("tid_mutation in @d_dt_targets") - - return data, df_enough_cpds, df_c_dt_d_dt, df_d_dt - - -def write_bf_to_file( - df_combined: pd.DataFrame, - chembl_version: str, - min_nof_cpds_bf: int, - output_path: str, - write_bf: bool, - write_to_csv: bool, - write_to_excel: bool, - delimiter: str, - limited_flag: str, - calculate_rdkit: bool, - df_sizes: list[list[int], list[int]], -) -> pd.DataFrame: - """ - Calculate relevant subsets for the portion of df_combined - that is based on binding+functional data. - If write_bf the subsets are written to output_path. - Independent of write_bf, filtering columns for BF are added to df_combined and returned. - - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame - :param chembl_version: Version of ChEMBL for output files - :type chembl_version: str - :param min_nof_cpds_bf: Miminum number of compounds per target - :type min_nof_cpds_bf: int - :param output_path: Path to write the output to - :type output_path: str - :param write_bf: Should the subsets be written to files? - :type write_bf: bool - :param write_to_csv: Should the subsets be written to csv? - :type write_to_csv: bool - :param write_to_excel: Should the subsets be written to excel? - :type write_to_excel: bool - :param delimiter: Delimiter for csv output - :type delimiter: str - :param limited_flag: Document suffix indicating - whether the dataset was limited to literature sources - :type limited_flag: str - :param calculate_rdkit: Does df_combined include RDKit-based columns? - :type calculate_rdkit: bool - :param df_sizes: List of intermediate sized of the dataset used for debugging. - :type df_sizes: list[list[int], list[int]] - :return: Pandas DataFrame with additional filtering columns for BF subsets - :rtype: pd.Dataframe - """ - # consider binding and functional assays - # assay description = binding+functional - desc = "BF" - # df_combined with additional filtering columns - df_combined_annotated = df_combined.copy() - # df_combined without binding only data - df_combined_bf = df_combined.copy() - ( - df_combined_bf, - df_combined_bf_enough_cpds, - df_combined_bf_c_dt_d_dt, - df_combined_bf_d_dt, - ) = get_data_subsets(df_combined_bf, min_nof_cpds_bf, desc) - - # add filtering columns to df_combined_annotated - for df, col_name in zip( - [ - df_combined_bf_enough_cpds, - df_combined_bf_c_dt_d_dt, - df_combined_bf_d_dt, - ], - [ - f"BF_{min_nof_cpds_bf}", - f"BF_{min_nof_cpds_bf}_c_dt_d_dt", - f"BF_{min_nof_cpds_bf}_d_dt", - ], - ): - df_combined_annotated[col_name] = False - df_combined_annotated.loc[ - (df_combined_annotated.index.isin(df.index)), col_name - ] = True - # check that filtering works - assert df_combined_annotated[df_combined_annotated[col_name] == True][ - df.columns - ].equals(df), f"Filtering is not accurate for {col_name}." - - if write_bf: - # NOTE: This is almost identical to the full dataset which will be saved later on. - # However, the binding-related columns are dropped - name_bf = os.path.join( - output_path, f"ChEMBL{chembl_version}_CTI_{limited_flag}_BF" - ) - write_and_check_output( - df_combined_bf, - name_bf, - write_to_csv, - write_to_excel, - delimiter, - desc, - calculate_rdkit, - ) - - name_bf_100 = os.path.join( - output_path, - f"ChEMBL{chembl_version}_CTI_{limited_flag}_BF_{min_nof_cpds_bf}", - ) - write_and_check_output( - df_combined_bf_enough_cpds, - name_bf_100, - write_to_csv, - write_to_excel, - delimiter, - desc, - calculate_rdkit, - ) - - name_bf_100_c_dt_d_dt = os.path.join( - output_path, - f"ChEMBL{chembl_version}_CTI_{limited_flag}_BF_{min_nof_cpds_bf}_c_dt_d_dt", - ) - write_and_check_output( - df_combined_bf_c_dt_d_dt, - name_bf_100_c_dt_d_dt, - write_to_csv, - write_to_excel, - delimiter, - desc, - calculate_rdkit, - ) - - name_bf_100_d_dt = os.path.join( - output_path, - f"ChEMBL{chembl_version}_CTI_{limited_flag}_BF_{min_nof_cpds_bf}_d_dt", - ) - write_and_check_output( - df_combined_bf_d_dt, - name_bf_100_d_dt, - write_to_csv, - write_to_excel, - delimiter, - desc, - calculate_rdkit, - ) - - if logging.DEBUG >= logging.root.level: - get_stats.add_dataset_sizes(df_combined_bf, "binding + functional", df_sizes) - get_stats.add_dataset_sizes(df_combined_bf_enough_cpds, "BF, >= 100", df_sizes) - get_stats.add_dataset_sizes( - df_combined_bf_c_dt_d_dt, "BF, >= 100, c_dt and d_dt", df_sizes - ) - get_stats.add_dataset_sizes(df_combined_bf_d_dt, "BF, >= 100, d_dt", df_sizes) - - return df_combined_annotated - - -def write_b_to_file( - df_combined: pd.DataFrame, - df_combined_annotated: pd.DataFrame, - chembl_version: str, - min_nof_cpds_b: int, - output_path: str, - write_b: bool, - write_to_csv: bool, - write_to_excel: bool, - delimiter: str, - limited_flag: str, - calculate_rdkit: bool, - df_sizes: list[list[int], list[int]], -) -> pd.DataFrame: - """ - Calculate relevant subsets for the portion of df_combined that is based on binding data. - If write_b the subsets are written to output_path. - Independent of write_b, filtering columns for B are added to df_combined_annotated. - - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame - :param df_combined_annotated: Pandas DataFrame with additional filtering columns - :type df_combined_annotated: pd.DataFrame - :param chembl_version: Version of ChEMBL for output files - :type chembl_version: str - :param min_nof_cpds_b: Miminum number of compounds per target - :type min_nof_cpds_b: int - :param output_path: Path to write the output to - :type output_path: str - :param write_b: Should the subsets be written to files? - :type write_b: bool - :param write_to_csv: Should the subsets be written to csv? - :type write_to_csv: bool - :param write_to_excel: Should the subsets be written to excel? - :type write_to_excel: bool - :param delimiter: Delimiter for csv output - :type delimiter: str - :param limited_flag: Document suffix indicating - whether the dataset was limited to literature sources - :type limited_flag: str - :param calculate_rdkit: Does df_combined include RDKit-based columns? - :type calculate_rdkit: bool - :param df_sizes: List of intermediate sized of the dataset used for debugging. - :type df_sizes: list[list[int], list[int]] - :return: Pandas DataFrame with additional filtering columns for B subsets - :rtype: pd.Dataframe - """ - # consider only binding assays - # assay description = binding - desc = "B" - df_combined_b = df_combined[df_combined["keep_for_binding"] == True].copy() - ( - df_combined_b, - df_combined_b_enough_cpds, - df_combined_b_c_dt_d_dt, - df_combined_b_d_dt, - ) = get_data_subsets(df_combined_b, min_nof_cpds_b, desc) - - # add filtering columns to df_combined_annotated - for df, col_name in zip( - [df_combined_b_enough_cpds, df_combined_b_c_dt_d_dt, df_combined_b_d_dt], - [ - f"B_{min_nof_cpds_b}", - f"B_{min_nof_cpds_b}_c_dt_d_dt", - f"B_{min_nof_cpds_b}_d_dt", - ], - ): - df_combined_annotated[col_name] = False - df_combined_annotated.loc[ - (df_combined_annotated.index.isin(df.index)), col_name - ] = True - # check that filtering works - assert df_combined_annotated[df_combined_annotated[col_name] == True][ - df.columns - ].equals(df), f"Filtering is not accurate for {col_name}." - - if write_b: - name_b = os.path.join( - output_path, f"ChEMBL{chembl_version}_CTI_{limited_flag}_B" - ) - write_and_check_output( - df_combined_b, - name_b, - write_to_csv, - write_to_excel, - delimiter, - desc, - calculate_rdkit, - ) - - name_b_100 = os.path.join( - output_path, f"ChEMBL{chembl_version}_CTI_{limited_flag}_B_{min_nof_cpds_b}" - ) - write_and_check_output( - df_combined_b_enough_cpds, - name_b_100, - write_to_csv, - write_to_excel, - delimiter, - desc, - calculate_rdkit, - ) - - name_b_100_c_dt_d_dt = os.path.join( - output_path, - f"ChEMBL{chembl_version}_CTI_{limited_flag}_B_{min_nof_cpds_b}_c_dt_d_dt", - ) - write_and_check_output( - df_combined_b_c_dt_d_dt, - name_b_100_c_dt_d_dt, - write_to_csv, - write_to_excel, - delimiter, - desc, - calculate_rdkit, - ) - - name_b_100_d_dt = os.path.join( - output_path, - f"ChEMBL{chembl_version}_CTI_{limited_flag}_B_{min_nof_cpds_b}_d_dt", - ) - write_and_check_output( - df_combined_b_d_dt, - name_b_100_d_dt, - write_to_csv, - write_to_excel, - delimiter, - desc, - calculate_rdkit, - ) - - if logging.DEBUG >= logging.root.level: - get_stats.add_dataset_sizes(df_combined_b, "binding", df_sizes) - get_stats.add_dataset_sizes(df_combined_b_enough_cpds, "B, >= 100", df_sizes) - get_stats.add_dataset_sizes( - df_combined_b_c_dt_d_dt, "B, >= 100, c_dt and d_dt", df_sizes - ) - get_stats.add_dataset_sizes(df_combined_b_d_dt, "B, >= 100, d_dt", df_sizes) - - return df_combined_annotated - - -def write_full_dataset_to_file( - df_combined: pd.DataFrame, - chembl_version: str, - output_path: str, - write_full_dataset: bool, - write_to_csv: bool, - write_to_excel: bool, - delimiter: str, - limited_flag: str, - calculate_rdkit: bool, -): - """ - If write_full_dataset, write df_combined with filtering columns to output_path. - - :param df_combined: Pandas DataFrame with compound-target pairs and filtering columns - :type df_combined: pd.DataFrame - :param chembl_version: Version of ChEMBL for output files - :type chembl_version: str - :param output_path: Path to write the output to - :type output_path: str - :param write_full_dataset: Should the subsets be written to files? - :type write_full_dataset: bool - :param write_to_csv: Should the subsets be written to csv? - :type write_to_csv: bool - :param write_to_excel: Should the subsets be written to excel? - :type write_to_excel: bool - :param delimiter: Delimiter for csv output - :type delimiter: str - :param limited_flag: Document suffix indicating - whether the dataset was limited to literature sources - :type limited_flag: str - :param calculate_rdkit: Does df_combined include RDKit-based columns? - :type calculate_rdkit: bool - """ - desc = "all" - if write_full_dataset: - name_all = os.path.join( - output_path, f"ChEMBL{chembl_version}_CTI_{limited_flag}_full_dataset" - ) - write_and_check_output( - df_combined, - name_all, - write_to_csv, - write_to_excel, - delimiter, - desc, - calculate_rdkit, - ) - - -def output_debug_sizes( - df_sizes: list[list[int], list[int]], - output_path: str, - write_to_csv: bool, - write_to_excel: bool, - delimiter: str, -): - """ - Output counts at various points during calculating the final dataset for debugging. - - :param df_sizes: List of intermediate sized of the dataset used for debugging. - :type df_sizes: list[list[int], list[int]] - :param output_path: Path to write the dataset counts to - :type output_path: str - :param write_to_csv: True if counts should be written to csv - :type write_to_csv: bool - :param write_to_excel: True if counts should be written to excel - :type write_to_excel: bool - :param delimiter: Delimiter in csv-output - :type delimiter: str - """ - column_names = [ - "type", - "#mols", - "#drugs", - "#targets", - "#drug_ targets", - "#targets_ mutation", - "#drug_ targets_mutation", - "#cpd_tid_ pairs", - "#drug_tid_ pairs", - "#cpd_ tid_mutation_ pairs", - "#drug_ tid_mutation_ pairs", - ] - - logging.debug("Size of full dataset at different points.") - full_df_sizes = pd.DataFrame(df_sizes[0], columns=column_names) - logging.debug(full_df_sizes) - name_full_df_sizes = os.path.join(output_path, "debug_full_df_sizes") - write_output( - full_df_sizes, name_full_df_sizes, write_to_csv, write_to_excel, delimiter - ) - - logging.debug("Size of dataset with any pchembl values at different points.") - logging.debug( - "This includes data for which we only have pchembl data \ - for functional assays but not for binding assays." - ) - df_pchembl_sizes = pd.DataFrame(df_sizes[1], columns=column_names) - logging.debug(df_pchembl_sizes) - name_pchembl_df_sizes = os.path.join(output_path, "debug_pchembl_df_sizes") - write_output( - full_df_sizes, name_pchembl_df_sizes, write_to_csv, write_to_excel, delimiter - ) - - -def output_stats( - df: pd.DataFrame, - output_file: str, - write_to_csv: bool, - write_to_excel: bool, - delimiter: str, -): - """ - Summarise and output the number of unique values in the following columns: - - - parent_molregno (compound ID) - - tid (target ID) - - tid_mutation (target ID + mutation annotations) - - cpd_target_pair (compound-target pairs) - - cpd_target_pair_mutation (compound-target pairs including mutation annotations) - - :param df: Pandas Dataframe for which the stats should be calculated - :type df: pd.DataFrame - :param output_file: Path and filename to write the dataset stats to - :type output_file: str - :param write_to_csv: True if stats should be written to csv - :type write_to_csv: bool - :param write_to_excel: True if stats should be written to excel - :type write_to_excel: bool - :param delimiter: Delimiter in csv-output - :type delimiter: str - """ - df_columns = [ - "parent_molregno", - "tid", - "tid_mutation", - "cpd_target_pair", - "cpd_target_pair_mutation", - ] - columns_descs = [ - "compound ID", - "target ID", - "target ID with mutation annotations", - "compound-target pair", - "compound-target pair with mutation annotations", - ] - - logging.debug("Stats for %s", output_file) - stats = [] - for column, columns_desc in zip(df_columns, columns_descs): - logging.debug("Stats for column %s:", column) - column_stats = get_stats.get_stats_for_column(df, column, columns_desc) - stats += column_stats - for colum_stat in column_stats: - logging.debug( - "%20s %s", - colum_stat[2], - colum_stat[3], - ) - - df_stats = pd.DataFrame( - stats, columns=["column", "column_description", "subset_type", "counts"] - ) - write_output(df_stats, output_file, write_to_csv, write_to_excel, delimiter)