From c7e76b544d892c1704762dad552200c565400b20 Mon Sep 17 00:00:00 2001 From: Lina Heinzke Date: Thu, 15 Feb 2024 14:53:30 +0000 Subject: [PATCH 1/8] Group arguments together in dataclasses - add CalculationArgs dataclass with arguments related to how the dataset is calculated - add OutputArgs dataclass with arguments related to the output --- src/add_chembl_target_class_annotations.py | 35 +- src/arguments.py | 172 +++++++ src/get_activity_ct_pairs.py | 15 +- src/get_dataset.py | 140 +----- src/main.py | 120 +---- src/write_subsets.py | 541 +++++++++------------ 6 files changed, 458 insertions(+), 565 deletions(-) create mode 100644 src/arguments.py diff --git a/src/add_chembl_target_class_annotations.py b/src/add_chembl_target_class_annotations.py index 25c1d01..009b8d4 100644 --- a/src/add_chembl_target_class_annotations.py +++ b/src/add_chembl_target_class_annotations.py @@ -5,6 +5,7 @@ import pandas as pd import write_subsets +from arguments import OutputArgs, CalculationArgs ########### Add Target Class Annotations Based on ChEMBL Data ########### @@ -81,12 +82,8 @@ def get_target_class_table( def add_chembl_target_class_annotations( df_combined: pd.DataFrame, chembl_con: sqlite3.Connection, - output_path: str, - write_to_csv: bool, - write_to_excel: bool, - delimiter: str, - chembl_version: str, - limited_flag: str, + args: CalculationArgs, + out: OutputArgs, ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: """ Add level 1 and 2 target class annotations. @@ -101,19 +98,10 @@ def add_chembl_target_class_annotations( :type df_combined: pd.DataFrame :param chembl_con: Sqlite3 connection to ChEMBL database. :type chembl_con: sqlite3.Connection - :param output_path: Path to write the targets with more than one target class assignment to - :type output_path: str - :param write_to_csv: True if output should be written to csv - :type write_to_csv: bool - :param write_to_excel: True if output should be written to excel - :type write_to_excel: bool - :param delimiter: Delimiter in csv-output - :type delimiter: str - :param chembl_version: Version of ChEMBL for output files - :type chembl_version: str - :param limited_flag: Document suffix indicating - whether the dataset was limited to literature sources - :type limited_flag: str + :param args: Arguments related to how to calculate the dataset + :type args: CalculationArgs + :param out: Arguments related to how to output the dataset + :type out: OutputArgs :return: - Pandas DataFrame with added target class annotations \\ - Pandas DataFrame with mapping from target id to level 1 target class \\ - Pandas DataFrame with mapping from target id to level 2 target class @@ -199,15 +187,14 @@ def add_chembl_target_class_annotations( ) name_more_than_one_tclass = os.path.join( - output_path, - f"ChEMBL{chembl_version}_CTI_{limited_flag}_targets_w_more_than_one_tclass", + out.output_path, + f"ChEMBL{args.chembl_version}_" + f"CTI_{args.limited_flag}_targets_w_more_than_one_tclass", ) write_subsets.write_output( more_than_one_tclass, name_more_than_one_tclass, - write_to_csv, - write_to_excel, - delimiter, + out, ) return df_combined, target_classes_level1, target_classes_level2 diff --git a/src/arguments.py b/src/arguments.py new file mode 100644 index 0000000..080b331 --- /dev/null +++ b/src/arguments.py @@ -0,0 +1,172 @@ +import argparse +from dataclasses import dataclass + + +@dataclass(frozen=True) +class CalculationArgs: + """ + Collection of arguments related to how to calculate the dataset. 
+ + chembl_version: Version of ChEMBL for output file names + calculate_rdkit: True if RDKit-based compound properties should be calculated + limit_to_literature: Include only literature sources if True + limited_flag: String version of limit_to_literature used in file names + min_nof_cpds_bf: Minimum number of compounds per target for the BF subset + min_nof_cpds_b: Minimum number of compounds per target for the B subset + """ + + chembl_version: str + calculate_rdkit: bool + limit_to_literature: bool + limited_flag: str + min_nof_cpds_bf: int + min_nof_cpds_b: int + + +@dataclass(frozen=True) +class OutputArgs: + """ + Collection of arguments related to how to output the dataset. + + output_path: Path to write output files to + delimiter: Delimiter in csv-output + write_to_csv: True if output should be written to csv + write_to_excel: True if output should be written to excel + write_full_dataset: True if the full dataset should be written to output + write_bf: True if subsets based on binding+functional data should be written to output + write_b: True if subsets based on binding data only should be written to output + """ + + output_path: str + delimiter: str + write_to_csv: bool + write_to_excel: bool + write_full_dataset: bool + write_bf: bool + write_b: bool + + +def parse_args() -> argparse.Namespace: + """ + Get arguments with argparse. + + :return: Populated argparse.Namespace + :rtype: argparse.Namespace + """ + parser = argparse.ArgumentParser( + description="Extract the compound-target pairs dataset from ChEMBL. \ + The full dataset plus filtering columns for binding vs. binding+functional data \ + will always be written to csv. \ + Additional outputs and output types can be chosen with the parameters below." + ) + + parser.add_argument( + "--chembl", + "-v", + dest="chembl_version", + metavar="", + type=str, + default=None, + help="ChEMBL version. \ + Latest version if None. \ + Required if a path to a SQLite database is provided, \ + i.e., if --sqlite is set. (default: None)", + ) + parser.add_argument( + "--sqlite", + "-s", + metavar="", + type=str, + default=None, + help="Path to SQLite database. \ + ChEMBL is downloaded as an SQLite database \ + and handled by chembl_downloader if None. (default: None)", + ) + parser.add_argument( + "--output", + "-o", + dest="output_path", + metavar="", + type=str, + required=True, + help="Path to write the output file(s) to. (required)", + ) + parser.add_argument( + "--delimiter", + "-d", + metavar="", + type=str, + default=";", + help="Delimiter in output csv-files. (default: ;)", + ) + parser.add_argument( + "--all_sources", + action="store_true", + help="If this is set, the dataset is calculated based on all sources in ChEMBL. \ + This includes data from BindingDB which may skew the results. \ + Default (not set): the dataset is calculated based on only literature data.", + ) + parser.add_argument( + "--rdkit", + dest="calculate_rdkit", + action="store_true", + help="Calculate RDKit-based compound properties.", + ) + parser.add_argument( + "--excel", + dest="write_to_excel", + action="store_true", + help="Write the results to excel. Note: this may fail if the output is too large.", + ) + parser.add_argument( + "--BF", + dest="write_bf", + action="store_true", + help="Write binding+functional data subsets.", + ) + parser.add_argument( + "--B", dest="write_b", action="store_true", help="Write binding data subsets." + ) + parser.add_argument( + "--debug", action="store_true", help="Log additional debugging information." 
+    )
+    args = parser.parse_args()
+
+    return args
+
+
+def get_args() -> tuple[argparse.Namespace, CalculationArgs, OutputArgs]:
+    """
+    Get parsed and default arguments.
+
+    :return: parsed arguments,
+        arguments related to how to calculate the dataset as CalculationArgs,
+        arguments related to how to output the dataset as OutputArgs
+    :rtype: tuple[argparse.Namespace, CalculationArgs, OutputArgs]
+    """
+    args = parse_args()
+
+    calc_args = CalculationArgs(
+        chembl_version=args.chembl_version,
+        calculate_rdkit=args.calculate_rdkit,
+        limit_to_literature=not args.all_sources,
+        # used in file names
+        limited_flag="literature_only" if not args.all_sources else "all_sources",
+        min_nof_cpds_bf=100,
+        min_nof_cpds_b=100,
+    )
+
+    output_args = OutputArgs(
+        output_path=args.output_path,
+        delimiter=args.delimiter,
+        # Always write the results to csv.
+        write_to_csv=True,
+        write_to_excel=args.write_to_excel,
+        # Always write the full dataset plus filtering columns
+        # for binding vs. binding+functional data.
+        write_full_dataset=True,
+        write_bf=args.write_bf,
+        write_b=args.write_b,
+    )
+
+    return args, calc_args, output_args
diff --git a/src/get_activity_ct_pairs.py b/src/get_activity_ct_pairs.py
index 7c825db..4e440da 100644
--- a/src/get_activity_ct_pairs.py
+++ b/src/get_activity_ct_pairs.py
@@ -1,17 +1,13 @@
-import logging
 import sqlite3
 
 import numpy as np
 import pandas as pd
 
-import get_stats
-
 
 ########### Get Initial Compound-Target Data From ChEMBL ###########
 def get_compound_target_pairs_with_pchembl(
     chembl_con: sqlite3.Connection,
     limit_to_literature: bool,
-    df_sizes: list[list[int], list[int]],
 ) -> pd.DataFrame:
     """
     Query ChEMBL activities and related assay for compound-target pairs
@@ -27,8 +23,6 @@
     :param limit_to_literature: Include only literature sources if True.
         Include all available sources otherwise.
     :type limit_to_literature: bool
-    :param df_sizes: List of intermediate sized of the dataset used for debugging.
-    :type df_sizes: list[list[int], list[int]]
     :return: Pandas DataFrame with compound-target pairs with a pchembl value.
     :rtype: pd.DataFrame
     """
@@ -84,9 +78,6 @@
         f"{a}_{b}" for a, b in zip(df_mols["parent_molregno"], df_mols["tid_mutation"])
     ]
 
-    if logging.DEBUG >= logging.root.level:
-        get_stats.add_dataset_sizes(df_mols, "initial query", df_sizes)
-
     return df_mols
 
 
@@ -173,7 +164,6 @@ def get_average_info(df: pd.DataFrame, suffix: str) -> pd.DataFrame:
 def get_aggregated_activity_ct_pairs(
     chembl_con: sqlite3.Connection,
     limit_to_literature: bool,
-    df_sizes: list[list[int], list[int]],
 ) -> pd.DataFrame:
     """
     Get dataset of compound target-pairs with an associated pchembl value
@@ -194,14 +184,13 @@
     :param limit_to_literature: Include only literature sources if True.
         Include all available sources otherwise.
     :type limit_to_literature: bool
-    :param df_sizes: List of intermediate sized of the dataset used for debugging.
-    :type df_sizes: list[list[int], list[int]]
     :return: Pandas Dataframe with compound-target pairs based on ChEMBL activity data
        aggregated into one entry per compound-target pair. 
:rtype: pd.DataFrame """ df_mols = get_compound_target_pairs_with_pchembl( - chembl_con, limit_to_literature, df_sizes + chembl_con, + limit_to_literature, ) # Summarise the information for binding and functional assays diff --git a/src/get_dataset.py b/src/get_dataset.py index 2fa6b91..053ec8e 100644 --- a/src/get_dataset.py +++ b/src/get_dataset.py @@ -1,5 +1,4 @@ import logging -import os import sqlite3 import get_activity_ct_pairs @@ -12,60 +11,28 @@ import sanity_checks import write_subsets import get_stats +from arguments import OutputArgs, CalculationArgs def get_ct_pair_dataset( - chembl_con: sqlite3.Connection, - chembl_version: str, - output_path: str, - limit_to_literature: bool, - calculate_rdkit: bool, - write_to_csv: bool, - write_to_excel: bool, - delimiter: str, - write_full_dataset: bool, - write_bf: bool, - write_b: bool, + chembl_con: sqlite3.Connection, args: CalculationArgs, out: OutputArgs ): """ Calculate and output the compound-target pair dataset. :param chembl_con: Sqlite3 connection to ChEMBL database :type chembl_con: sqlite3.Connection - :param chembl_version: Version of ChEMBL for output file names - :type chembl_version: str - :param output_path: Path to write output files to - :type output_path: str - :param limit_to_literature: Include only literature sources if True. - Include all available sources otherwise. - :type limit_to_literature: bool - :param calculate_rdkit: True if RDKit-based compound properties should be calculated - :type calculate_rdkit: bool - :param write_to_csv: True if output should be written to csv - :type write_to_csv: bool - :param write_to_excel: True if output should be written to excel - :type write_to_excel: bool - :param delimiter: Delimiter in csv-output - :type delimiter: str - :param write_full_dataset: True if the full dataset should be written to output - :type write_full_dataset: bool - :param write_bf: True if subsets based on binding+functional data should be written to output - :type write_bf: bool - :param write_b: True if subsets based on binding data only should be written to output - :type write_b: bool + :param args: Arguments related to how to calculate the dataset + :type args: CalculationArgs + :param out: Arguments related to how to output the dataset + :type out: OutputArgs """ # list with sizes of full dataset and dataset subset with pchembl values for debugging df_sizes = [[], []] - # used in file names - if limit_to_literature: - limited_flag = "literature_only" - else: - limited_flag = "all_sources" - logging.info("get_aggregated_activity_ct_pairs") df_combined = get_activity_ct_pairs.get_aggregated_activity_ct_pairs( - chembl_con, limit_to_literature, df_sizes + chembl_con, args.limit_to_literature ) if logging.DEBUG >= logging.root.level: get_stats.add_dataset_sizes(df_combined, "activity ct-pairs", df_sizes) @@ -87,7 +54,7 @@ def get_ct_pair_dataset( logging.info("add_all_chembl_compound_properties") df_combined, df_cpd_props, atc_levels = ( add_chembl_compound_properties.add_all_chembl_compound_properties( - df_combined, chembl_con, limit_to_literature + df_combined, chembl_con, args.limit_to_literature ) ) if logging.DEBUG >= logging.root.level: @@ -105,19 +72,15 @@ def get_ct_pair_dataset( add_chembl_target_class_annotations.add_chembl_target_class_annotations( df_combined, chembl_con, - output_path, - write_to_csv, - write_to_excel, - delimiter, - chembl_version, - limited_flag, + args, + out, ) ) if logging.DEBUG >= logging.root.level: get_stats.add_dataset_sizes(df_combined, "tclass annotations", 
df_sizes) logging.info("add_rdkit_compound_descriptors") - if calculate_rdkit: + if args.calculate_rdkit: df_combined = add_rdkit_compound_descriptors.add_rdkit_compound_descriptors( df_combined ) @@ -125,7 +88,7 @@ def get_ct_pair_dataset( get_stats.add_dataset_sizes(df_combined, "RDKit props", df_sizes) logging.info("clean_dataset") - df_combined = clean_dataset.clean_dataset(df_combined, calculate_rdkit) + df_combined = clean_dataset.clean_dataset(df_combined, args.calculate_rdkit) if logging.DEBUG >= logging.root.level: get_stats.add_dataset_sizes(df_combined, "clean df", df_sizes) @@ -136,89 +99,34 @@ def get_ct_pair_dataset( atc_levels, target_classes_level1, target_classes_level2, - calculate_rdkit, + args.calculate_rdkit, ) logging.info("write_BF_to_file") - min_nof_cpds_bf = 100 - df_combined_annotated = write_subsets.write_bf_to_file( + df_combined = write_subsets.write_bf_to_file( df_combined, - chembl_version, - min_nof_cpds_bf, - output_path, - write_bf, - write_to_csv, - write_to_excel, - delimiter, - limited_flag, - calculate_rdkit, df_sizes, + args, + out, ) logging.info("write_B_to_file") - min_nof_cpds_b = 100 - df_combined_annotated = write_subsets.write_b_to_file( + df_combined = write_subsets.write_b_to_file( df_combined, - df_combined_annotated, - chembl_version, - min_nof_cpds_b, - output_path, - write_b, - write_to_csv, - write_to_excel, - delimiter, - limited_flag, - calculate_rdkit, df_sizes, + args, + out, ) logging.info("write_full_dataset_to_file") write_subsets.write_full_dataset_to_file( - df_combined_annotated, - chembl_version, - output_path, - write_full_dataset, - write_to_csv, - write_to_excel, - delimiter, - limited_flag, - calculate_rdkit, + df_combined, + args, + out, ) logging.info("output_stats") - - output_file = os.path.join( - output_path, f"ChEMBL{chembl_version}_CTI_{limited_flag}_full_dataset_stats" - ) - write_subsets.output_stats( - df_combined_annotated, output_file, write_to_csv, write_to_excel, delimiter - ) - if write_bf: - output_file = os.path.join( - output_path, - f"ChEMBL{chembl_version}_CTI_{limited_flag}_BF_{min_nof_cpds_bf}_c_dt_d_dt_stats", - ) - write_subsets.output_stats( - df_combined_annotated[df_combined_annotated["BF_100_c_dt_d_dt"]], - output_file, - write_to_csv, - write_to_excel, - delimiter, - ) - if write_b: - output_file = os.path.join( - output_path, - f"ChEMBL{chembl_version}_CTI_{limited_flag}_B_{min_nof_cpds_b}_c_dt_d_dt_stats", - ) - write_subsets.output_stats( - df_combined_annotated[df_combined_annotated["B_100_c_dt_d_dt"]], - output_file, - write_to_csv, - write_to_excel, - delimiter, - ) + write_subsets.output_all_stats(df_combined, args, out) if logging.DEBUG >= logging.root.level: - write_subsets.output_debug_sizes( - df_sizes, output_path, write_to_csv, write_to_excel, delimiter - ) + write_subsets.output_debug_sizes(df_sizes, out) diff --git a/src/main.py b/src/main.py index 8198f3e..ffe2bdd 100644 --- a/src/main.py +++ b/src/main.py @@ -1,103 +1,17 @@ -import argparse import logging import sqlite3 import chembl_downloader +import arguments import get_dataset -def parse_args() -> argparse.Namespace: - """ - Get arguments with argparse. - - :return: Populated argparse.Namespace - :rtype: argparse.Namespace - """ - parser = argparse.ArgumentParser( - description="Extract the compound-target pairs dataset from ChEMBL. \ - The full dataset plus filtering columns for binding vs. binding+functional data \ - will always be written to csv. 
\ - Additional outputs and output types can be chosen with the parameters below." - ) - - parser.add_argument( - "--chembl", - "-v", - metavar="", - type=str, - default=None, - help="ChEMBL version. \ - Latest version if None. \ - Required if a path to a SQLite database is provided, \ - i.e., if --sqlite is set. (default: None)", - ) - parser.add_argument( - "--sqlite", - "-s", - metavar="", - type=str, - default=None, - help="Path to SQLite database. \ - ChEMBL is downloaded as an SQLite database \ - and handled by chembl_downloader if None. (default: None)", - ) - parser.add_argument( - "--output", - "-o", - metavar="", - type=str, - required=True, - help="Path to write the output file(s) to. (required)", - ) - parser.add_argument( - "--delimiter", - "-d", - metavar="", - type=str, - default=";", - help="Delimiter in output csv-files. (default: ;)", - ) - parser.add_argument( - "--all_sources", - action="store_true", - help="If this is set, the dataset is calculated based on all sources in ChEMBL. \ - This includes data from BindingDB which may skew the results. \ - Default (not set): the dataset is calculated based on only literature data.", - ) - parser.add_argument( - "--rdkit", - action="store_true", - help="Calculate RDKit-based compound properties.", - ) - parser.add_argument( - "--excel", - action="store_true", - help="Write the results to excel. Note: this may fail if the output is too large.", - ) - parser.add_argument( - "--BF", action="store_true", help="Write binding+functional data subsets." - ) - parser.add_argument("--B", action="store_true", help="Write binding data subsets.") - parser.add_argument( - "--debug", action="store_true", help="Log additional debugging information." - ) - args = parser.parse_args() - - return args - - def main(): """ Call get_ct_pair_dataset to get the compound-target dataset using the given arguments. """ - args = parse_args() - - # Set arguments that are always true. - # Write the results to csv. - csv = True - # Write the full dataset plus filtering columns for binding vs. binding+functional data. 
- full_df = True + args, calc_args, output_args = arguments.get_args() log_level = "DEBUG" if args.debug else "INFO" numeric_log_level = getattr(logging, log_level, None) @@ -112,35 +26,19 @@ def main(): with sqlite3.connect(args.sqlite) as chembl_con: get_dataset.get_ct_pair_dataset( chembl_con, - args.chembl, - args.output, - not args.all_sources, - args.rdkit, - csv, - args.excel, - args.delimiter, - full_df, - args.BF, - args.B, + calc_args, + output_args, ) else: logging.info("Using chembl_downloader to connect to ChEMBL.") - if args.chembl is None: - args.chembl = chembl_downloader.latest() + if args.chembl_version is None: + args.chembl_version = chembl_downloader.latest() - with chembl_downloader.connect(version=args.chembl) as chembl_con: + with chembl_downloader.connect(version=args.chembl_version) as chembl_con: get_dataset.get_ct_pair_dataset( chembl_con, - args.chembl, - args.output, - not args.all_sources, - args.rdkit, - csv, - args.excel, - args.delimiter, - full_df, - args.BF, - args.B, + calc_args, + output_args, ) diff --git a/src/write_subsets.py b/src/write_subsets.py index b138b35..c979511 100644 --- a/src/write_subsets.py +++ b/src/write_subsets.py @@ -4,14 +4,13 @@ import sanity_checks import get_stats +from arguments import OutputArgs, CalculationArgs def write_output( df: pd.DataFrame, filename: str, - write_to_csv: bool, - write_to_excel: bool, - delimiter: str, + out: OutputArgs, ) -> list[str]: """ Write DataFrame df to output file named . @@ -20,20 +19,16 @@ def write_output( :type df: pd.DataFrame :param filename: Filename to write the output to :type filename: bool - :param write_to_csv: True if output should be written to csv - :type write_to_csv: bool - :param write_to_excel: True if output should be written to excel - :type write_to_excel: bool - :param delimiter: Delimiter in csv-output - :type delimiter: str + :param out: Arguments related to how to output the dataset + :type out: OutputArgs :return: Returns list of types of files that was written to (csv and/or xlsx) :rtype: list[str] """ file_type_list = [] - if write_to_csv: - df.to_csv(f"{filename}.csv", sep=delimiter, index=False) + if out.write_to_csv: + df.to_csv(f"{filename}.csv", sep=out.delimiter, index=False) file_type_list.append("csv") - if write_to_excel: + if out.write_to_excel: try: with pd.ExcelWriter(f"{filename}.xlsx", engine="xlsxwriter") as writer: writer.book.use_zip64() @@ -50,11 +45,9 @@ def write_output( def write_and_check_output( df: pd.DataFrame, filename: str, - write_to_csv: bool, - write_to_excel: bool, - delimiter: str, assay_type: str, - calculate_rdkit: bool, + args: CalculationArgs, + out: OutputArgs, ): """ Write df to file and check that writing was successful. @@ -63,23 +56,19 @@ def write_and_check_output( :type df: pd.DataFrame :param filename: Filename to write the output to :type filename: bool - :param write_to_csv: True if output should be written to csv - :type write_to_csv: bool - :param write_to_excel: True if output should be written to excel - :type write_to_excel: bool - :param delimiter: Delimiter in csv-output - :type delimiter: str :param assay_type: Types of assays current_df contains information about. 
\ Options: "BF" (binding+functional), "B" (binding), "all" (contains both BF and B information) :type assay_type: str - :param calculate_rdkit: If True, current_df contains RDKit-based columns - :type calculate_rdkit: bool + :param args: Arguments related to how to calculate the dataset + :type args: CalculationArgs + :param out: Arguments related to how to output the dataset + :type out: OutputArgs """ - file_type_list = write_output(df, filename, write_to_csv, write_to_excel, delimiter) + file_type_list = write_output(df, filename, out) sanity_checks.test_equality( - df, filename, assay_type, file_type_list, calculate_rdkit + df, filename, assay_type, file_type_list, args.calculate_rdkit ) @@ -127,6 +116,9 @@ def get_data_subsets( f"SEI_{drop_desc}", f"LLE_{drop_desc}", ] + + [ # exclude columns related to the other assay types + col for col in data.columns if col.startswith("B_") or col.startswith("BF_") + ] # exclude filtering columns ).drop_duplicates() # Restrict the dataset to targets with at least *min_nof_cpds* compounds with a pchembl value. @@ -165,145 +157,167 @@ def get_data_subsets( return data, df_enough_cpds, df_c_dt_d_dt, df_d_dt -def write_bf_to_file( +def write_subset_to_file( + df_combined_subset: pd.DataFrame, df_combined: pd.DataFrame, - chembl_version: str, - min_nof_cpds_bf: int, - output_path: str, - write_bf: bool, - write_to_csv: bool, - write_to_excel: bool, - delimiter: str, - limited_flag: str, - calculate_rdkit: bool, - df_sizes: list[list[int], list[int]], -) -> pd.DataFrame: + desc: str, + args: CalculationArgs, + out: OutputArgs, +) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: """ - Calculate relevant subsets for the portion of df_combined - that is based on binding+functional data. - If write_bf the subsets are written to output_path. - Independent of write_bf, filtering columns for BF are added to df_combined and returned. + Write BF or B subsets to file. + :param df_combined_subset: Subset with binding+functional (BF) or binding (B) assay-based data + in df_combined + :type df_combined_subset: pd.DataFrame :param df_combined: Pandas DataFrame with compound-target pairs :type df_combined: pd.DataFrame - :param chembl_version: Version of ChEMBL for output files - :type chembl_version: str - :param min_nof_cpds_bf: Miminum number of compounds per target - :type min_nof_cpds_bf: int - :param output_path: Path to write the output to - :type output_path: str - :param write_bf: Should the subsets be written to files? - :type write_bf: bool - :param write_to_csv: Should the subsets be written to csv? - :type write_to_csv: bool - :param write_to_excel: Should the subsets be written to excel? - :type write_to_excel: bool - :param delimiter: Delimiter for csv output - :type delimiter: str - :param limited_flag: Document suffix indicating - whether the dataset was limited to literature sources - :type limited_flag: str - :param calculate_rdkit: Does df_combined include RDKit-based columns? - :type calculate_rdkit: bool - :param df_sizes: List of intermediate sized of the dataset used for debugging. 
- :type df_sizes: list[list[int], list[int]] - :return: Pandas DataFrame with additional filtering columns for BF subsets - :rtype: pd.Dataframe + :param desc: Assay description, + either "BF" (binding+functional) or "B" (binding) + :type desc: str + :param args: Arguments related to how to calculate the dataset + :type args: CalculationArgs + :param out: Arguments related to how to output the dataset + :type out: OutputArgs + :return: List of calculated subsets + :rtype: tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame] """ - # consider binding and functional assays - # assay description = binding+functional - desc = "BF" - # df_combined with additional filtering columns - df_combined_annotated = df_combined.copy() - # df_combined without binding only data - df_combined_bf = df_combined.copy() ( - df_combined_bf, - df_combined_bf_enough_cpds, - df_combined_bf_c_dt_d_dt, - df_combined_bf_d_dt, - ) = get_data_subsets(df_combined_bf, min_nof_cpds_bf, desc) + df_combined_subset, + df_combined_subset_enough_cpds, + df_combined_subset_c_dt_d_dt, + df_combined_subset_d_dt, + ) = get_data_subsets( + df_combined_subset, + args.min_nof_cpds_bf if desc == "BF" else args.min_nof_cpds_b, + desc, + ) - # add filtering columns to df_combined_annotated + # add filtering columns to df_combined for df, col_name in zip( [ - df_combined_bf_enough_cpds, - df_combined_bf_c_dt_d_dt, - df_combined_bf_d_dt, + df_combined_subset_enough_cpds, + df_combined_subset_c_dt_d_dt, + df_combined_subset_d_dt, ], [ - f"BF_{min_nof_cpds_bf}", - f"BF_{min_nof_cpds_bf}_c_dt_d_dt", - f"BF_{min_nof_cpds_bf}_d_dt", + f"{desc}_{args.min_nof_cpds_bf}", + f"{desc}_{args.min_nof_cpds_bf}_c_dt_d_dt", + f"{desc}_{args.min_nof_cpds_bf}_d_dt", ], ): - df_combined_annotated[col_name] = False - df_combined_annotated.loc[ - (df_combined_annotated.index.isin(df.index)), col_name - ] = True + df_combined[col_name] = False + df_combined.loc[(df_combined.index.isin(df.index)), col_name] = True # check that filtering works - assert df_combined_annotated[df_combined_annotated[col_name] == True][ - df.columns - ].equals(df), f"Filtering is not accurate for {col_name}." - - if write_bf: - # NOTE: This is almost identical to the full dataset which will be saved later on. - # However, the binding-related columns are dropped - name_bf = os.path.join( - output_path, f"ChEMBL{chembl_version}_CTI_{limited_flag}_BF" + assert df_combined[df_combined[col_name] == True][df.columns].equals( + df + ), f"Filtering is not accurate for {col_name}." + + if (desc == "BF" and out.write_bf) or (desc == "B" and out.write_b): + # NOTE: For BF this is almost identical to the full dataset + # which will be saved later on. + # However, the binding-related columns are dropped. 
+ name_subset = os.path.join( + out.output_path, + f"ChEMBL{args.chembl_version}_CTI_{args.limited_flag}_{desc}", ) write_and_check_output( - df_combined_bf, - name_bf, - write_to_csv, - write_to_excel, - delimiter, + df_combined_subset, + name_subset, desc, - calculate_rdkit, + args, + out, ) - name_bf_100 = os.path.join( - output_path, - f"ChEMBL{chembl_version}_CTI_{limited_flag}_BF_{min_nof_cpds_bf}", + name_subset_100 = os.path.join( + out.output_path, + f"ChEMBL{args.chembl_version}_" + f"CTI_{args.limited_flag}_" + f"{desc}_{args.min_nof_cpds_bf}", ) write_and_check_output( - df_combined_bf_enough_cpds, - name_bf_100, - write_to_csv, - write_to_excel, - delimiter, + df_combined_subset_enough_cpds, + name_subset_100, desc, - calculate_rdkit, + args, + out, ) - name_bf_100_c_dt_d_dt = os.path.join( - output_path, - f"ChEMBL{chembl_version}_CTI_{limited_flag}_BF_{min_nof_cpds_bf}_c_dt_d_dt", + name_subset_100_c_dt_d_dt = os.path.join( + out.output_path, + f"ChEMBL{args.chembl_version}_" + f"CTI_{args.limited_flag}_" + f"{desc}_{args.min_nof_cpds_bf}_c_dt_d_dt", ) write_and_check_output( - df_combined_bf_c_dt_d_dt, - name_bf_100_c_dt_d_dt, - write_to_csv, - write_to_excel, - delimiter, + df_combined_subset_c_dt_d_dt, + name_subset_100_c_dt_d_dt, desc, - calculate_rdkit, + args, + out, ) - name_bf_100_d_dt = os.path.join( - output_path, - f"ChEMBL{chembl_version}_CTI_{limited_flag}_BF_{min_nof_cpds_bf}_d_dt", + name_subset_100_d_dt = os.path.join( + out.output_path, + f"ChEMBL{args.chembl_version}_" + f"CTI_{args.limited_flag}_" + f"{desc}_{args.min_nof_cpds_bf}_d_dt", ) write_and_check_output( - df_combined_bf_d_dt, - name_bf_100_d_dt, - write_to_csv, - write_to_excel, - delimiter, - desc, - calculate_rdkit, + df_combined_subset_d_dt, name_subset_100_d_dt, desc, args, out ) + return ( + df_combined, + df_combined_subset, + df_combined_subset_enough_cpds, + df_combined_subset_c_dt_d_dt, + df_combined_subset_d_dt, + ) + + +def write_bf_to_file( + df_combined: pd.DataFrame, + df_sizes: list[list[int], list[int]], + args: CalculationArgs, + out: OutputArgs, +) -> pd.DataFrame: + """ + Calculate relevant subsets for the portion of df_combined + that is based on binding+functional data. + If write_bf the subsets are written to output_path. + Independent of write_bf, filtering columns for BF are added to df_combined and returned. + + :param df_combined: Pandas DataFrame with compound-target pairs + :type df_combined: pd.DataFrame + :param df_sizes: List of intermediate sized of the dataset used for debugging. 
+ :type df_sizes: list[list[int], list[int]] + :param args: Arguments related to how to calculate the dataset + :type args: CalculationArgs + :param out: Arguments related to how to output the dataset + :type out: OutputArgs + :return: Pandas DataFrame with additional filtering columns for BF subsets + :rtype: pd.Dataframe + """ + # consider binding and functional assays + # assay description = binding+functional + desc = "BF" + # df_combined without binding only data + df_combined_subset = df_combined.copy() + ( + df_combined, + df_combined_bf, + df_combined_bf_enough_cpds, + df_combined_bf_c_dt_d_dt, + df_combined_bf_d_dt, + ) = write_subset_to_file( + df_combined_subset, + df_combined, + desc, + args, + out, + ) + if logging.DEBUG >= logging.root.level: get_stats.add_dataset_sizes(df_combined_bf, "binding + functional", df_sizes) get_stats.add_dataset_sizes(df_combined_bf_enough_cpds, "BF, >= 100", df_sizes) @@ -312,22 +326,14 @@ def write_bf_to_file( ) get_stats.add_dataset_sizes(df_combined_bf_d_dt, "BF, >= 100, d_dt", df_sizes) - return df_combined_annotated + return df_combined def write_b_to_file( df_combined: pd.DataFrame, - df_combined_annotated: pd.DataFrame, - chembl_version: str, - min_nof_cpds_b: int, - output_path: str, - write_b: bool, - write_to_csv: bool, - write_to_excel: bool, - delimiter: str, - limited_flag: str, - calculate_rdkit: bool, df_sizes: list[list[int], list[int]], + args: CalculationArgs, + out: OutputArgs, ) -> pd.DataFrame: """ Calculate relevant subsets for the portion of df_combined that is based on binding data. @@ -336,115 +342,32 @@ def write_b_to_file( :param df_combined: Pandas DataFrame with compound-target pairs :type df_combined: pd.DataFrame - :param df_combined_annotated: Pandas DataFrame with additional filtering columns - :type df_combined_annotated: pd.DataFrame - :param chembl_version: Version of ChEMBL for output files - :type chembl_version: str - :param min_nof_cpds_b: Miminum number of compounds per target - :type min_nof_cpds_b: int - :param output_path: Path to write the output to - :type output_path: str - :param write_b: Should the subsets be written to files? - :type write_b: bool - :param write_to_csv: Should the subsets be written to csv? - :type write_to_csv: bool - :param write_to_excel: Should the subsets be written to excel? - :type write_to_excel: bool - :param delimiter: Delimiter for csv output - :type delimiter: str - :param limited_flag: Document suffix indicating - whether the dataset was limited to literature sources - :type limited_flag: str - :param calculate_rdkit: Does df_combined include RDKit-based columns? - :type calculate_rdkit: bool :param df_sizes: List of intermediate sized of the dataset used for debugging. 
:type df_sizes: list[list[int], list[int]] + :param args: Arguments related to how to calculate the dataset + :type args: CalculationArgs + :param out: Arguments related to how to output the dataset + :type out: OutputArgs :return: Pandas DataFrame with additional filtering columns for B subsets :rtype: pd.Dataframe """ # consider only binding assays # assay description = binding desc = "B" - df_combined_b = df_combined[df_combined["keep_for_binding"] == True].copy() + df_combined_subset = df_combined[df_combined["keep_for_binding"] == True].copy() ( + df_combined, df_combined_b, df_combined_b_enough_cpds, df_combined_b_c_dt_d_dt, df_combined_b_d_dt, - ) = get_data_subsets(df_combined_b, min_nof_cpds_b, desc) - - # add filtering columns to df_combined_annotated - for df, col_name in zip( - [df_combined_b_enough_cpds, df_combined_b_c_dt_d_dt, df_combined_b_d_dt], - [ - f"B_{min_nof_cpds_b}", - f"B_{min_nof_cpds_b}_c_dt_d_dt", - f"B_{min_nof_cpds_b}_d_dt", - ], - ): - df_combined_annotated[col_name] = False - df_combined_annotated.loc[ - (df_combined_annotated.index.isin(df.index)), col_name - ] = True - # check that filtering works - assert df_combined_annotated[df_combined_annotated[col_name] == True][ - df.columns - ].equals(df), f"Filtering is not accurate for {col_name}." - - if write_b: - name_b = os.path.join( - output_path, f"ChEMBL{chembl_version}_CTI_{limited_flag}_B" - ) - write_and_check_output( - df_combined_b, - name_b, - write_to_csv, - write_to_excel, - delimiter, - desc, - calculate_rdkit, - ) - - name_b_100 = os.path.join( - output_path, f"ChEMBL{chembl_version}_CTI_{limited_flag}_B_{min_nof_cpds_b}" - ) - write_and_check_output( - df_combined_b_enough_cpds, - name_b_100, - write_to_csv, - write_to_excel, - delimiter, - desc, - calculate_rdkit, - ) - - name_b_100_c_dt_d_dt = os.path.join( - output_path, - f"ChEMBL{chembl_version}_CTI_{limited_flag}_B_{min_nof_cpds_b}_c_dt_d_dt", - ) - write_and_check_output( - df_combined_b_c_dt_d_dt, - name_b_100_c_dt_d_dt, - write_to_csv, - write_to_excel, - delimiter, - desc, - calculate_rdkit, - ) - - name_b_100_d_dt = os.path.join( - output_path, - f"ChEMBL{chembl_version}_CTI_{limited_flag}_B_{min_nof_cpds_b}_d_dt", - ) - write_and_check_output( - df_combined_b_d_dt, - name_b_100_d_dt, - write_to_csv, - write_to_excel, - delimiter, - desc, - calculate_rdkit, - ) + ) = write_subset_to_file( + df_combined_subset, + df_combined, + desc, + args, + out, + ) if logging.DEBUG >= logging.root.level: get_stats.add_dataset_sizes(df_combined_b, "binding", df_sizes) @@ -454,79 +377,46 @@ def write_b_to_file( ) get_stats.add_dataset_sizes(df_combined_b_d_dt, "B, >= 100, d_dt", df_sizes) - return df_combined_annotated + return df_combined def write_full_dataset_to_file( df_combined: pd.DataFrame, - chembl_version: str, - output_path: str, - write_full_dataset: bool, - write_to_csv: bool, - write_to_excel: bool, - delimiter: str, - limited_flag: str, - calculate_rdkit: bool, + args: CalculationArgs, + out: OutputArgs, ): """ If write_full_dataset, write df_combined with filtering columns to output_path. :param df_combined: Pandas DataFrame with compound-target pairs and filtering columns :type df_combined: pd.DataFrame - :param chembl_version: Version of ChEMBL for output files - :type chembl_version: str - :param output_path: Path to write the output to - :type output_path: str - :param write_full_dataset: Should the subsets be written to files? - :type write_full_dataset: bool - :param write_to_csv: Should the subsets be written to csv? 
-    :type write_to_csv: bool
-    :param write_to_excel: Should the subsets be written to excel?
-    :type write_to_excel: bool
-    :param delimiter: Delimiter for csv output
-    :type delimiter: str
-    :param limited_flag: Document suffix indicating
-        whether the dataset was limited to literature sources
-    :type limited_flag: str
-    :param calculate_rdkit: Does df_combined include RDKit-based columns?
-    :type calculate_rdkit: bool
+    :param args: Arguments related to how to calculate the dataset
+    :type args: CalculationArgs
+    :param out: Arguments related to how to output the dataset
+    :type out: OutputArgs
     """
     desc = "all"
-    if write_full_dataset:
+    if out.write_full_dataset:
         name_all = os.path.join(
-            output_path, f"ChEMBL{chembl_version}_CTI_{limited_flag}_full_dataset"
-        )
-        write_and_check_output(
-            df_combined,
-            name_all,
-            write_to_csv,
-            write_to_excel,
-            delimiter,
-            desc,
-            calculate_rdkit,
+            out.output_path,
+            f"ChEMBL{args.chembl_version}_CTI_{args.limited_flag}_full_dataset",
         )
+        write_and_check_output(df_combined, name_all, desc, args, out)
 
 
 def output_debug_sizes(
     df_sizes: list[list[int], list[int]],
-    output_path: str,
-    write_to_csv: bool,
-    write_to_excel: bool,
-    delimiter: str,
+    out: OutputArgs,
 ):
     """
     Output counts at various points during calculating the final dataset for debugging.
 
     :param df_sizes: List of intermediate sizes of the dataset used for debugging.
     :type df_sizes: list[list[int], list[int]]
-    :param output_path: Path to write the dataset counts to
-    :type output_path: str
-    :param write_to_csv: True if counts should be written to csv
-    :type write_to_csv: bool
-    :param write_to_excel: True if counts should be written to excel
-    :type write_to_excel: bool
-    :param delimiter: Delimiter in csv-output
-    :type delimiter: str
+    :param out: Arguments related to how to output the dataset
+    :type out: OutputArgs
     """
     column_names = [
         "type",
@@ -545,19 +435,21 @@
     logging.debug("Size of full dataset at different points.")
     full_df_sizes = pd.DataFrame(df_sizes[0], columns=column_names)
     logging.debug(full_df_sizes)
-    name_full_df_sizes = os.path.join(output_path, "debug_full_df_sizes")
+    name_full_df_sizes = os.path.join(out.output_path, "debug_full_df_sizes")
     write_output(
-        full_df_sizes, name_full_df_sizes, write_to_csv, write_to_excel, delimiter
+        full_df_sizes,
+        name_full_df_sizes,
+        out,
     )
 
     logging.debug("Size of dataset with any pchembl values at different points.")
     logging.debug(
         "This dataset is a subset of the full dataset, "
         "therefore the dataset sizes are smaller than the sizes of the full dataset."
     )
     df_pchembl_sizes = pd.DataFrame(df_sizes[1], columns=column_names)
     logging.debug(df_pchembl_sizes)
-    name_pchembl_df_sizes = os.path.join(output_path, "debug_pchembl_df_sizes")
+    name_pchembl_df_sizes = os.path.join(out.output_path, "debug_pchembl_df_sizes")
     write_output(
-        full_df_sizes, name_pchembl_df_sizes, write_to_csv, write_to_excel, delimiter
+        df_pchembl_sizes,
+        name_pchembl_df_sizes,
+        out,
     )
 
 
 def output_stats(
     df: pd.DataFrame,
     output_file: str,
-    write_to_csv: bool,
-    write_to_excel: bool,
-    delimiter: str,
+    out: OutputArgs,
 ):
     """
     Summarise and output the number of unique values in the following columns:
@@ -598,12 +489,8 @@
     :param df: Pandas DataFrame for which the stats should be calculated
     :type df: pd.DataFrame
     :param output_file: Path and filename to write the dataset stats to
     :type output_file: str
-    :param write_to_csv: True if stats should be written to csv
-    :type write_to_csv: bool
-    :param write_to_excel: True if stats should be written to excel
-    :type write_to_excel: bool
-    :param delimiter: Delimiter in csv-output
-    :type delimiter: str
+    :param out: Arguments related to how to output the dataset
+    :type out: OutputArgs
     """
     df_columns = [
         "parent_molregno",
@@ -621,4 +509,55 @@
     df_stats = pd.DataFrame(
         stats, columns=["column", "column_description", "subset_type", "counts"]
     )
-    write_output(df_stats, output_file, write_to_csv, write_to_excel, delimiter)
+    write_output(
+        df_stats,
+        output_file,
+        out,
+    )
+
+
+def output_all_stats(
+    df_combined_annotated: pd.DataFrame, args: CalculationArgs, out: OutputArgs
+):
+    """
+    Output stats for all datasets and subsets calculated.
+
+    :param df_combined_annotated: Pandas DataFrame with additional filtering columns
+    :type df_combined_annotated: pd.DataFrame
+    :param args: Arguments related to how to calculate the dataset
+    :type args: CalculationArgs
+    :param out: Arguments related to how to output the dataset
+    :type out: OutputArgs
+    """
+    output_file = os.path.join(
+        out.output_path,
+        f"ChEMBL{args.chembl_version}_CTI_{args.limited_flag}_full_dataset_stats",
+    )
+
+    output_stats(df_combined_annotated, output_file, out)
+
+    if out.write_bf:
+        output_file = os.path.join(
+            out.output_path,
+            f"ChEMBL{args.chembl_version}_"
+            f"CTI_{args.limited_flag}_"
+            f"BF_{args.min_nof_cpds_bf}_c_dt_d_dt_stats",
+        )
+        output_stats(
+            df_combined_annotated[df_combined_annotated[f"BF_{args.min_nof_cpds_bf}_c_dt_d_dt"]],
+            output_file,
+            out,
+        )
+
+    if out.write_b:
+        output_file = os.path.join(
+            out.output_path,
+            f"ChEMBL{args.chembl_version}_"
+            f"CTI_{args.limited_flag}_"
+            f"B_{args.min_nof_cpds_b}_c_dt_d_dt_stats",
+        )
+        output_stats(
+            df_combined_annotated[df_combined_annotated[f"B_{args.min_nof_cpds_b}_c_dt_d_dt"]],
+            output_file,
+            out,
+        )
From d8331c9bb26828d4800250e06568ef756085d38c Mon Sep 17 00:00:00 2001
From: Lina Heinzke
Date: Thu, 15 Feb 2024 16:32:10 +0000
Subject: [PATCH 2/8] Add module to add filtering columns

---
 src/add_filtering_columns.py | 233 ++++++++++++++++++++++++++
 src/get_dataset.py           |  13 +-
 src/write_subsets.py         | 308 -----------------------------------
 3 files changed, 236 insertions(+), 318 deletions(-)
 create mode 100644 src/add_filtering_columns.py

diff --git a/src/add_filtering_columns.py b/src/add_filtering_columns.py
new file mode 100644
index 0000000..053d9ad
--- /dev/null
+++ b/src/add_filtering_columns.py
@@ -0,0 +1,233 @@
+import logging
+import os
+
+import pandas as pd
+
+from arguments import CalculationArgs, OutputArgs
+import get_stats
+import write_subsets
+
+
+def get_data_subsets(
+    data: pd.DataFrame, min_nof_cpds: int, desc: str
+) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+    """
+    Calculate and return the different subsets of interest.
+
+    :param data: Pandas DataFrame with compound-target pairs
+    :type data: pd.DataFrame
+    :param min_nof_cpds: Minimum number of compounds per target
+    :type min_nof_cpds: int
+    :param desc: Types of assays current_df contains information about. \
+        Options: "BF" (binding+functional), "B" (binding)
+    :type desc: str
+    :return:
+        - data: Pandas DataFrame with compound-target pairs
+        without the annotations for the opposite desc, \
+        e.g. 
if desc = "BF", the average pchembl value based on
+            binding data only is dropped
+        - df_enough_cpds: Pandas DataFrame with targets
+        with at least *min_nof_cpds* compounds with a pchembl value,
+        - df_c_dt_d_dt: As df_enough_cpds but with \
+        at least one compound-target pair labelled as
+        'D_DT', 'C3_DT', 'C2_DT', 'C1_DT' or 'C0_DT' (i.e., known interaction),
+        - df_d_dt: As df_enough_cpds but with \
+        at least one compound-target pair labelled as
+            'D_DT' (i.e., known drug-target interaction)
+    :rtype: (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame)
+    """
+    if desc == "B":
+        drop_desc = "BF"
+    else:
+        drop_desc = "B"
+    data = data.drop(
+        columns=[
+            f"pchembl_value_mean_{drop_desc}",
+            f"pchembl_value_max_{drop_desc}",
+            f"pchembl_value_median_{drop_desc}",
+            f"first_publication_cpd_target_pair_{drop_desc}",
+            f"first_publication_cpd_target_pair_w_pchembl_{drop_desc}",
+            f"LE_{drop_desc}",
+            f"BEI_{drop_desc}",
+            f"SEI_{drop_desc}",
+            f"LLE_{drop_desc}",
+        ]
+        + [  # exclude columns related to the other assay types
+            col for col in data.columns if col.startswith("B_") or col.startswith("BF_")
+        ]  # exclude filtering columns
+    ).drop_duplicates()
+
+    # Restrict the dataset to targets with at least *min_nof_cpds* compounds with a pchembl value.
+    comparator_counts = (
+        data[data[f"pchembl_value_mean_{desc}"].notnull()]
+        .groupby(["tid_mutation"])["parent_molregno"]
+        .count()
+    )
+    # pylint: disable-next=unused-variable
+    targets_w_enough_cpds = comparator_counts[
+        comparator_counts >= min_nof_cpds
+    ].index.tolist()
+    df_enough_cpds = data.query("tid_mutation in @targets_w_enough_cpds")
+
+    # Restrict the dataset further to targets
+    # with at least one compound-target pair labelled as
+    # 'D_DT', 'C3_DT', 'C2_DT', 'C1_DT' or 'C0_DT',
+    # i.e., compound-target pairs with known interactions.
+    # pylint: disable-next=unused-variable
+    c_dt_d_dt_targets = set(
+        df_enough_cpds[
+            df_enough_cpds["DTI"].isin(["D_DT", "C3_DT", "C2_DT", "C1_DT", "C0_DT"])
+        ].tid_mutation.to_list()
+    )
+    df_c_dt_d_dt = df_enough_cpds.query("tid_mutation in @c_dt_d_dt_targets")
+
+    # Restrict the dataset further to targets with
+    # at least one compound-target pair labelled as 'D_DT',
+    # i.e., known drug-target interactions.
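+    # (d_dt_targets is referenced via the "@" syntax inside the pandas query
+    # string below, which pylint cannot see, hence the disable comment.)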
+ # pylint: disable-next=unused-variable + d_dt_targets = set( + df_enough_cpds[df_enough_cpds["DTI"] == "D_DT"].tid_mutation.to_list() + ) + df_d_dt = df_enough_cpds.query("tid_mutation in @d_dt_targets") + + return data, df_enough_cpds, df_c_dt_d_dt, df_d_dt + + +def add_subset_filtering_columns( + df_combined_subset: pd.DataFrame, + df_combined: pd.DataFrame, + desc: str, + args: CalculationArgs, + out: OutputArgs, + df_sizes, +) -> pd.DataFrame: + # TODO update documentation + """ + Add filtering column for binding + functional vs binding + + :param df_combined_subset: Subset with binding+functional (BF) or binding (B) assay-based data + in df_combined + :type df_combined_subset: pd.DataFrame + :param df_combined: Pandas DataFrame with compound-target pairs + :type df_combined: pd.DataFrame + :param desc: Assay description, + either "BF" (binding+functional) or "B" (binding) + :type desc: str + :param args: Arguments related to how to calculate the dataset + :type args: CalculationArgs + :return: List of calculated subsets + :rtype: pd.DataFrame + """ + ( + df_combined_subset, + df_combined_subset_enough_cpds, + df_combined_subset_c_dt_d_dt, + df_combined_subset_d_dt, + ) = get_data_subsets( + df_combined_subset, + args.min_nof_cpds_bf if desc == "BF" else args.min_nof_cpds_b, + desc, + ) + + # write subsets if required + if (desc == "BF" and out.write_bf) or (desc == "B" and out.write_b): + for df_subset, subset_desc in zip( + [ + df_combined_subset, + df_combined_subset_enough_cpds, + df_combined_subset_c_dt_d_dt, + df_combined_subset_d_dt, + ], + [ + f"{desc}", + f"{desc}_{args.min_nof_cpds_bf}", + f"{desc}_{args.min_nof_cpds_bf}_c_dt_d_dt", + f"{desc}_{args.min_nof_cpds_bf}_d_dt", + ], + ): + name_subset = os.path.join( + out.output_path, + f"ChEMBL{args.chembl_version}_" + f"CTI_{args.limited_flag}_" + f"{subset_desc}", + ) + write_subsets.write_and_check_output( + df_subset, + name_subset, + desc, + args, + out, + ) + + # add filtering columns to df_combined + for df, col_name in zip( + [ + df_combined_subset_enough_cpds, + df_combined_subset_c_dt_d_dt, + df_combined_subset_d_dt, + ], + [ + f"{desc}_{args.min_nof_cpds_bf}", + f"{desc}_{args.min_nof_cpds_bf}_c_dt_d_dt", + f"{desc}_{args.min_nof_cpds_bf}_d_dt", + ], + ): + df_combined[col_name] = False + df_combined.loc[(df_combined.index.isin(df.index)), col_name] = True + # check that filtering works + assert df_combined[df_combined[col_name] == True][df.columns].equals( + df + ), f"Filtering is not accurate for {col_name}." 
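+        # (Selecting the rows flagged by the new column and restricting them to
+        # the subset's columns must reproduce the subset exactly.)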
+
+    if logging.DEBUG >= logging.root.level:
+        # label the entries with the assay types they are based on (BF or B)
+        get_stats.add_dataset_sizes(
+            df_combined_subset,
+            "binding + functional" if desc == "BF" else "binding",
+            df_sizes,
+        )
+        get_stats.add_dataset_sizes(
+            df_combined_subset_enough_cpds, f"{desc}, >= 100", df_sizes
+        )
+        get_stats.add_dataset_sizes(
+            df_combined_subset_c_dt_d_dt, f"{desc}, >= 100, c_dt and d_dt", df_sizes
+        )
+        get_stats.add_dataset_sizes(
+            df_combined_subset_d_dt, f"{desc}, >= 100, d_dt", df_sizes
+        )
+
+    return df_combined
+
+
+def add_filtering_columns(
+    df_combined,
+    df_sizes,
+    args,
+    out,
+):
+    # TODO: documentation
+    # consider binding and functional assays
+    # assay description = binding+functional
+    desc = "BF"
+    # df_combined without binding only data
+    df_combined_subset = df_combined.copy()
+    df_combined = add_subset_filtering_columns(
+        df_combined_subset,
+        df_combined,
+        desc,
+        args,
+        out,
+        df_sizes,
+    )
+
+    # consider only binding assays
+    # assay description = binding
+    desc = "B"
+    df_combined_subset = df_combined[df_combined["keep_for_binding"] == True].copy()
+    df_combined = add_subset_filtering_columns(
+        df_combined_subset,
+        df_combined,
+        desc,
+        args,
+        out,
+        df_sizes,
+    )
+
+    return df_combined
diff --git a/src/get_dataset.py b/src/get_dataset.py
index 053ec8e..7fd86c3 100644
--- a/src/get_dataset.py
+++ b/src/get_dataset.py
@@ -12,6 +12,7 @@
 import write_subsets
 import get_stats
 from arguments import OutputArgs, CalculationArgs
+import add_filtering_columns
 
 
 def get_ct_pair_dataset(
@@ -102,16 +103,8 @@
         args.calculate_rdkit,
     )
 
-    logging.info("write_BF_to_file")
-    df_combined = write_subsets.write_bf_to_file(
-        df_combined,
-        df_sizes,
-        args,
-        out,
-    )
-
-    logging.info("write_B_to_file")
-    df_combined = write_subsets.write_b_to_file(
+    logging.info("add_filtering_columns")
+    df_combined = add_filtering_columns.add_filtering_columns(
         df_combined,
         df_sizes,
         args,
diff --git a/src/write_subsets.py b/src/write_subsets.py
index c979511..5b0bc21 100644
--- a/src/write_subsets.py
+++ b/src/write_subsets.py
@@ -72,314 +72,6 @@
     )
 
 
-def get_data_subsets(
-    data: pd.DataFrame, min_nof_cpds: int, desc: str
-) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
-    """
-    Calculate and return the different subsets of interest.
-
-    :param data: Pandas DataFrame with compound-target pairs
-    :type data: pd.DataFrame
-    :param min_nof_cpds: Miminum number of compounds per target
-    :type min_nof_cpds: int
-    :param desc: Types of assays current_df contains information about. \
-        Options: "BF" (binding+functional), "B" (binding)
-    :type desc: str
-    :return:
-        - data: Pandas DataFrame with compound-target pairs
-        without the annotations for the opposite desc, \
-        e.g. 
if desc = "BF", the average pchembl value based on - binding data only is dropped - - df_enough_cpds: Pandas DataFrame with targets - with at least compounds with a pchembl value, - - df_c_dt_d_dt: As df_enough_cpds but with \ - at least one compound-target pair labelled as - 'D_DT', 'C3_DT', 'C2_DT', 'C1_DT' or 'C0_DT' (i.e., known interaction), - - df_d_dt: As df_enough_cpds but with \ - at least one compound-target pair labelled as - 'D_DT' (i.e., known drug-target interaction) - :rtype: (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame) - """ - if desc == "B": - drop_desc = "BF" - else: - drop_desc = "B" - data = data.drop( - columns=[ - f"pchembl_value_mean_{drop_desc}", - f"pchembl_value_max_{drop_desc}", - f"pchembl_value_median_{drop_desc}", - f"first_publication_cpd_target_pair_{drop_desc}", - f"first_publication_cpd_target_pair_w_pchembl_{drop_desc}", - f"LE_{drop_desc}", - f"BEI_{drop_desc}", - f"SEI_{drop_desc}", - f"LLE_{drop_desc}", - ] - + [ # exclude columns related to the other assay types - col for col in data.columns if col.startswith("B_") or col.startswith("BF_") - ] # exclude filtering columns - ).drop_duplicates() - - # Restrict the dataset to targets with at least *min_nof_cpds* compounds with a pchembl value. - comparator_counts = ( - data[data[f"pchembl_value_mean_{desc}"].notnull()] - .groupby(["tid_mutation"])["parent_molregno"] - .count() - ) - # pylint: disable-next=unused-variable - targets_w_enough_cpds = comparator_counts[ - comparator_counts >= min_nof_cpds - ].index.tolist() - df_enough_cpds = data.query("tid_mutation in @targets_w_enough_cpds") - - # Restrict the dataset further to targets - # with at least one compound-target pair labelled as - # 'D_DT', 'C3_DT', 'C2_DT', 'C1_DT' or 'C0_DT', - # i.e., compound-target pairs with a known interactions. - # pylint: disable-next=unused-variable - c_dt_d_dt_targets = set( - df_enough_cpds[ - df_enough_cpds["DTI"].isin(["D_DT", "C3_DT", "C2_DT", "C1_DT", "C0_DT"]) - ].tid_mutation.to_list() - ) - df_c_dt_d_dt = df_enough_cpds.query("tid_mutation in @c_dt_d_dt_targets") - - # Restrict the dataset further to targets with - # at least one compound-target pair labelled as 'D_DT', - # i.e., known drug-target interactions. - # pylint: disable-next=unused-variable - d_dt_targets = set( - df_enough_cpds[df_enough_cpds["DTI"] == "D_DT"].tid_mutation.to_list() - ) - df_d_dt = df_enough_cpds.query("tid_mutation in @d_dt_targets") - - return data, df_enough_cpds, df_c_dt_d_dt, df_d_dt - - -def write_subset_to_file( - df_combined_subset: pd.DataFrame, - df_combined: pd.DataFrame, - desc: str, - args: CalculationArgs, - out: OutputArgs, -) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: - """ - Write BF or B subsets to file. 
- - :param df_combined_subset: Subset with binding+functional (BF) or binding (B) assay-based data - in df_combined - :type df_combined_subset: pd.DataFrame - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame - :param desc: Assay description, - either "BF" (binding+functional) or "B" (binding) - :type desc: str - :param args: Arguments related to how to calculate the dataset - :type args: CalculationArgs - :param out: Arguments related to how to output the dataset - :type out: OutputArgs - :return: List of calculated subsets - :rtype: tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame] - """ - ( - df_combined_subset, - df_combined_subset_enough_cpds, - df_combined_subset_c_dt_d_dt, - df_combined_subset_d_dt, - ) = get_data_subsets( - df_combined_subset, - args.min_nof_cpds_bf if desc == "BF" else args.min_nof_cpds_b, - desc, - ) - - # add filtering columns to df_combined - for df, col_name in zip( - [ - df_combined_subset_enough_cpds, - df_combined_subset_c_dt_d_dt, - df_combined_subset_d_dt, - ], - [ - f"{desc}_{args.min_nof_cpds_bf}", - f"{desc}_{args.min_nof_cpds_bf}_c_dt_d_dt", - f"{desc}_{args.min_nof_cpds_bf}_d_dt", - ], - ): - df_combined[col_name] = False - df_combined.loc[(df_combined.index.isin(df.index)), col_name] = True - # check that filtering works - assert df_combined[df_combined[col_name] == True][df.columns].equals( - df - ), f"Filtering is not accurate for {col_name}." - - if (desc == "BF" and out.write_bf) or (desc == "B" and out.write_b): - # NOTE: For BF this is almost identical to the full dataset - # which will be saved later on. - # However, the binding-related columns are dropped. - name_subset = os.path.join( - out.output_path, - f"ChEMBL{args.chembl_version}_CTI_{args.limited_flag}_{desc}", - ) - write_and_check_output( - df_combined_subset, - name_subset, - desc, - args, - out, - ) - - name_subset_100 = os.path.join( - out.output_path, - f"ChEMBL{args.chembl_version}_" - f"CTI_{args.limited_flag}_" - f"{desc}_{args.min_nof_cpds_bf}", - ) - write_and_check_output( - df_combined_subset_enough_cpds, - name_subset_100, - desc, - args, - out, - ) - - name_subset_100_c_dt_d_dt = os.path.join( - out.output_path, - f"ChEMBL{args.chembl_version}_" - f"CTI_{args.limited_flag}_" - f"{desc}_{args.min_nof_cpds_bf}_c_dt_d_dt", - ) - write_and_check_output( - df_combined_subset_c_dt_d_dt, - name_subset_100_c_dt_d_dt, - desc, - args, - out, - ) - - name_subset_100_d_dt = os.path.join( - out.output_path, - f"ChEMBL{args.chembl_version}_" - f"CTI_{args.limited_flag}_" - f"{desc}_{args.min_nof_cpds_bf}_d_dt", - ) - write_and_check_output( - df_combined_subset_d_dt, name_subset_100_d_dt, desc, args, out - ) - - return ( - df_combined, - df_combined_subset, - df_combined_subset_enough_cpds, - df_combined_subset_c_dt_d_dt, - df_combined_subset_d_dt, - ) - - -def write_bf_to_file( - df_combined: pd.DataFrame, - df_sizes: list[list[int], list[int]], - args: CalculationArgs, - out: OutputArgs, -) -> pd.DataFrame: - """ - Calculate relevant subsets for the portion of df_combined - that is based on binding+functional data. - If write_bf the subsets are written to output_path. - Independent of write_bf, filtering columns for BF are added to df_combined and returned. - - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame - :param df_sizes: List of intermediate sized of the dataset used for debugging. 
- :type df_sizes: list[list[int], list[int]] - :param args: Arguments related to how to calculate the dataset - :type args: CalculationArgs - :param out: Arguments related to how to output the dataset - :type out: OutputArgs - :return: Pandas DataFrame with additional filtering columns for BF subsets - :rtype: pd.Dataframe - """ - # consider binding and functional assays - # assay description = binding+functional - desc = "BF" - # df_combined without binding only data - df_combined_subset = df_combined.copy() - ( - df_combined, - df_combined_bf, - df_combined_bf_enough_cpds, - df_combined_bf_c_dt_d_dt, - df_combined_bf_d_dt, - ) = write_subset_to_file( - df_combined_subset, - df_combined, - desc, - args, - out, - ) - - if logging.DEBUG >= logging.root.level: - get_stats.add_dataset_sizes(df_combined_bf, "binding + functional", df_sizes) - get_stats.add_dataset_sizes(df_combined_bf_enough_cpds, "BF, >= 100", df_sizes) - get_stats.add_dataset_sizes( - df_combined_bf_c_dt_d_dt, "BF, >= 100, c_dt and d_dt", df_sizes - ) - get_stats.add_dataset_sizes(df_combined_bf_d_dt, "BF, >= 100, d_dt", df_sizes) - - return df_combined - - -def write_b_to_file( - df_combined: pd.DataFrame, - df_sizes: list[list[int], list[int]], - args: CalculationArgs, - out: OutputArgs, -) -> pd.DataFrame: - """ - Calculate relevant subsets for the portion of df_combined that is based on binding data. - If write_b the subsets are written to output_path. - Independent of write_b, filtering columns for B are added to df_combined_annotated. - - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame - :param df_sizes: List of intermediate sized of the dataset used for debugging. - :type df_sizes: list[list[int], list[int]] - :param args: Arguments related to how to calculate the dataset - :type args: CalculationArgs - :param out: Arguments related to how to output the dataset - :type out: OutputArgs - :return: Pandas DataFrame with additional filtering columns for B subsets - :rtype: pd.Dataframe - """ - # consider only binding assays - # assay description = binding - desc = "B" - df_combined_subset = df_combined[df_combined["keep_for_binding"] == True].copy() - ( - df_combined, - df_combined_b, - df_combined_b_enough_cpds, - df_combined_b_c_dt_d_dt, - df_combined_b_d_dt, - ) = write_subset_to_file( - df_combined_subset, - df_combined, - desc, - args, - out, - ) - - if logging.DEBUG >= logging.root.level: - get_stats.add_dataset_sizes(df_combined_b, "binding", df_sizes) - get_stats.add_dataset_sizes(df_combined_b_enough_cpds, "B, >= 100", df_sizes) - get_stats.add_dataset_sizes( - df_combined_b_c_dt_d_dt, "B, >= 100, c_dt and d_dt", df_sizes - ) - get_stats.add_dataset_sizes(df_combined_b_d_dt, "B, >= 100, d_dt", df_sizes) - - return df_combined - - def write_full_dataset_to_file( df_combined: pd.DataFrame, args: CalculationArgs, From 4fe5118f13ee4fa6a73511fb1b68ce01e6e551ee Mon Sep 17 00:00:00 2001 From: Lina Heinzke Date: Mon, 19 Feb 2024 16:06:47 +0000 Subject: [PATCH 3/8] Improve add_filtering_columns --- src/add_filtering_columns.py | 130 ++++++++++++++++------------------- 1 file changed, 60 insertions(+), 70 deletions(-) diff --git a/src/add_filtering_columns.py b/src/add_filtering_columns.py index 053d9ad..ce6a575 100644 --- a/src/add_filtering_columns.py +++ b/src/add_filtering_columns.py @@ -8,9 +8,12 @@ import write_subsets -def get_data_subsets( - data: pd.DataFrame, min_nof_cpds: int, desc: str -) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: +def 
get_data_subsets(data: pd.DataFrame, min_nof_cpds: int, desc: str) -> tuple[ + tuple[pd.DataFrame, str], + tuple[pd.DataFrame, str], + tuple[pd.DataFrame, str], + tuple[pd.DataFrame, str], +]: """ Calculate and return the different subsets of interest. @@ -18,23 +21,27 @@ def get_data_subsets( :type data: pd.DataFrame :param min_nof_cpds: Miminum number of compounds per target :type min_nof_cpds: int - :param desc: Types of assays current_df contains information about. \ + :param desc: Types of assays current_df contains information about. Options: "BF" (binding+functional), "B" (binding) :type desc: str - :return: - - data: Pandas DataFrame with compound-target pairs - without the annotations for the opposite desc, \ - e.g. if desc = "BF", the average pchembl value based on + :return: List of dataset subsets and the string describing them + - data: Pandas DataFrame with compound-target pairs + without filtering columns and without + the annotations for the opposite desc, + e.g. if desc = "BF", the average pchembl value based on binding data only is dropped - - df_enough_cpds: Pandas DataFrame with targets - with at least compounds with a pchembl value, - - df_c_dt_d_dt: As df_enough_cpds but with \ - at least one compound-target pair labelled as - 'D_DT', 'C3_DT', 'C2_DT', 'C1_DT' or 'C0_DT' (i.e., known interaction), - - df_d_dt: As df_enough_cpds but with \ - at least one compound-target pair labelled as + - df_enough_cpds: Pandas DataFrame with targets + with at least compounds with a pchembl value, + - df_c_dt_d_dt: As df_enough_cpds but with + at least one compound-target pair labelled as + 'D_DT', 'C3_DT', 'C2_DT', 'C1_DT' or 'C0_DT' (i.e., known interaction), + - df_d_dt: As df_enough_cpds but with + at least one compound-target pair labelled as 'D_DT' (i.e., known drug-target interaction) - :rtype: (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame) + :rtype: tuple[tuple[pd.DataFrame, str], + tuple[pd.DataFrame, str], + tuple[pd.DataFrame, str], + tuple[pd.DataFrame, str]] """ if desc == "B": drop_desc = "BF" @@ -90,7 +97,12 @@ def get_data_subsets( ) df_d_dt = df_enough_cpds.query("tid_mutation in @d_dt_targets") - return data, df_enough_cpds, df_c_dt_d_dt, df_d_dt + return [ + [data, f"{desc}"], + [df_enough_cpds, f"{desc}_{min_nof_cpds}"], + [df_c_dt_d_dt, f"{desc}_{min_nof_cpds}_c_dt_d_dt"], + [df_d_dt, f"{desc}_{min_nof_cpds}_d_dt"], + ] def add_subset_filtering_columns( @@ -99,9 +111,8 @@ def add_subset_filtering_columns( desc: str, args: CalculationArgs, out: OutputArgs, - df_sizes, + df_sizes: list[list[int], list[int]], ) -> pd.DataFrame: - # TODO update documentation """ Add filtering column for binding + functional vs binding @@ -115,15 +126,14 @@ def add_subset_filtering_columns( :type desc: str :param args: Arguments related to how to calculate the dataset :type args: CalculationArgs - :return: List of calculated subsets + :param out: Arguments related to how to output the dataset + :type out: OutputArgs + :param df_sizes: List of intermediate sized of the dataset used for debugging. 
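Returning each subset together with its label avoids the parallel lists used further down, which had to be kept in sync by hand and reused args.min_nof_cpds_bf in the column names even when desc was "B". A toy sketch (made-up frames and threshold) of the paired form:

    import pandas as pd

    df = pd.DataFrame({"v": [1, 2, 3]})
    desc, min_nof_cpds = "B", 2
    subsets = [
        [df, f"{desc}"],
        [df[df["v"] >= min_nof_cpds], f"{desc}_{min_nof_cpds}"],
    ]
    # the label travels with its subset, so callers cannot mispair them
    for [df_subset, subset_desc] in subsets:
        print(subset_desc, len(df_subset))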
+    :type df_sizes: list[list[int], list[int]]
+    :return: Pandas DataFrame with added filtering columns
     :rtype: pd.DataFrame
     """
-    (
-        df_combined_subset,
-        df_combined_subset_enough_cpds,
-        df_combined_subset_c_dt_d_dt,
-        df_combined_subset_d_dt,
-    ) = get_data_subsets(
+    subsets = get_data_subsets(
         df_combined_subset,
         args.min_nof_cpds_bf if desc == "BF" else args.min_nof_cpds_b,
         desc,
@@ -131,20 +141,7 @@ def add_subset_filtering_columns(

     # write subsets if required
     if (desc == "BF" and out.write_bf) or (desc == "B" and out.write_b):
-        for df_subset, subset_desc in zip(
-            [
-                df_combined_subset,
-                df_combined_subset_enough_cpds,
-                df_combined_subset_c_dt_d_dt,
-                df_combined_subset_d_dt,
-            ],
-            [
-                f"{desc}",
-                f"{desc}_{args.min_nof_cpds_bf}",
-                f"{desc}_{args.min_nof_cpds_bf}_c_dt_d_dt",
-                f"{desc}_{args.min_nof_cpds_bf}_d_dt",
-            ],
-        ):
+        for [df_subset, subset_desc] in subsets:
             name_subset = os.path.join(
                 out.output_path,
                 f"ChEMBL{args.chembl_version}_"
@@ -160,18 +157,8 @@ def add_subset_filtering_columns(
             )

     # add filtering columns to df_combined
-    for df, col_name in zip(
-        [
-            df_combined_subset_enough_cpds,
-            df_combined_subset_c_dt_d_dt,
-            df_combined_subset_d_dt,
-        ],
-        [
-            f"{desc}_{args.min_nof_cpds_bf}",
-            f"{desc}_{args.min_nof_cpds_bf}_c_dt_d_dt",
-            f"{desc}_{args.min_nof_cpds_bf}_d_dt",
-        ],
-    ):
+    # do not add a filtering column for BF / B (-> [1:])
+    for [df, col_name] in subsets[1:]:
         df_combined[col_name] = False
         df_combined.loc[(df_combined.index.isin(df.index)), col_name] = True
         # check that filtering works
@@ -180,29 +167,32 @@ def add_subset_filtering_columns(
         ), f"Filtering is not accurate for {col_name}."

     if logging.DEBUG >= logging.root.level:
-        get_stats.add_dataset_sizes(
-            df_combined_subset, "binding + functional", df_sizes
-        )
-        get_stats.add_dataset_sizes(
-            df_combined_subset_enough_cpds, "BF, >= 100", df_sizes
-        )
-        get_stats.add_dataset_sizes(
-            df_combined_subset_c_dt_d_dt, "BF, >= 100, c_dt and d_dt", df_sizes
-        )
-        get_stats.add_dataset_sizes(
-            df_combined_subset_d_dt, "BF, >= 100, d_dt", df_sizes
-        )
+        for [df_subset, subset_desc] in subsets:
+            get_stats.add_dataset_sizes(df_subset, subset_desc, df_sizes)

     return df_combined


 def add_filtering_columns(
-    df_combined,
-    df_sizes,
-    args,
-    out,
-):
-    # TODO: documentation
+    df_combined: pd.DataFrame,
+    df_sizes: list[list[int], list[int]],
+    args: CalculationArgs,
+    out: OutputArgs,
+) -> pd.DataFrame:
+    """
+    Add filtering columns to main dataset and save subsets if required.
+
+    :param df_combined: Pandas DataFrame with compound-target pairs
+    :type df_combined: pd.DataFrame
+    :param df_sizes: List of intermediate sizes of the dataset used for debugging.
+ :type df_sizes: list[list[int], list[int]] + :param args: Arguments related to how to calculate the dataset + :type args: CalculationArgs + :param out: Arguments related to how to output the dataset + :type out: OutputArgs + :return: Pandas DataFrame with added filering columns + :rtype: pd.DataFrame + """ # consider binding and functional assays # assay description = binding+functional desc = "BF" From 1ee99f99b61f47f81cd31c39bebfdbc41e0ba8cd Mon Sep 17 00:00:00 2001 From: Lina Heinzke Date: Tue, 20 Feb 2024 20:14:13 +0000 Subject: [PATCH 4/8] Add dataset dataclass --- src/add_chembl_compound_properties.py | 138 +++++++++--------- src/add_chembl_target_class_annotations.py | 49 ++++--- src/add_dti_annotations.py | 89 ++++++------ src/add_filtering_columns.py | 61 ++++---- src/add_rdkit_compound_descriptors.py | 117 ++++++++-------- src/clean_dataset.py | 102 +++++++------- src/dataset.py | 29 ++++ src/get_activity_ct_pairs.py | 23 ++- src/get_dataset.py | 93 ++++-------- src/get_drug_mechanism_ct_pairs.py | 57 ++++---- src/get_stats.py | 30 +++- src/sanity_checks.py | 156 ++++++++++----------- src/write_subsets.py | 33 +++-- 13 files changed, 496 insertions(+), 481 deletions(-) create mode 100644 src/dataset.py diff --git a/src/add_chembl_compound_properties.py b/src/add_chembl_compound_properties.py index 836b06a..4e5d623 100644 --- a/src/add_chembl_compound_properties.py +++ b/src/add_chembl_compound_properties.py @@ -2,11 +2,13 @@ import pandas as pd +from dataset import Dataset + ########### Add Compound Properties Based on ChEMBL Data ########### def add_first_publication_date( - df_combined: pd.DataFrame, chembl_con: sqlite3.Connection, limit_to_literature: bool -) -> pd.DataFrame: + dataset: Dataset, chembl_con: sqlite3.Connection, limit_to_literature: bool +): """ Query and calculate the first publication of a compound based on ChEMBL data (column name: first_publication_cpd). @@ -14,14 +16,13 @@ def add_first_publication_date( of the compound in the literature according to ChEMBL. Otherwise this is the first appearance in any source in ChEMBL. - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + Will be updated to include first_publication_cpd + :type dataset: Dataset :param chembl_con: Sqlite3 connection to ChEMBL database. :type chembl_con: sqlite3.Connection :param limit_to_literature: Base first_publication_cpd on literature sources only if True. :type limit_to_literature: bool - :return: Pandas DataFrame with added first_publication_cpd. - :rtype: pd.DataFrame """ # information about salts is aggregated in the parent sql = """ @@ -42,26 +43,26 @@ def add_first_publication_date( ].transform("min") df_docs = df_docs[["parent_molregno", "first_publication_cpd"]].drop_duplicates() - df_combined = df_combined.merge(df_docs, on="parent_molregno", how="left") - - return df_combined + dataset.df_result = dataset.df_result.merge( + df_docs, on="parent_molregno", how="left" + ) def add_chembl_properties_and_structures( - df_combined: pd.DataFrame, chembl_con: sqlite3.Connection -) -> tuple[pd.DataFrame, pd.DataFrame]: + dataset: Dataset, chembl_con: sqlite3.Connection +): """ - Add compound properties from the compound_properties table + Add compound properties from the compound_properties table (e.g., alogp, #hydrogen bond acceptors / donors, etc.). - Add InChI, InChI key and canonical smiles. + Add InChI, InChI key and canonical smiles. 
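From here on, patch 4 applies one conversion pattern throughout: instead of taking and returning DataFrames, each step receives the shared Dataset and writes back to dataset.df_result. A self-contained sketch of that pattern, with a toy one-field Dataset standing in for the real class defined in src/dataset.py later in this patch:

    from dataclasses import dataclass

    import pandas as pd

    @dataclass
    class Dataset:
        df_result: pd.DataFrame

    def add_flag(dataset: Dataset) -> None:
        # mutate the shared state instead of returning a new frame
        dataset.df_result["flag"] = True

    dataset = Dataset(df_result=pd.DataFrame({"tid": [1, 2]}))
    add_flag(dataset)
    assert "flag" in dataset.df_result.columns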
- :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + Will be updated to include compound properties and structures. + dataset.df_cpd_props will be set to + compound properties and structures for all compound ids in ChEMBL. + :type dataset: Dataset :param chembl_con: Sqlite3 connection to ChEMBL database. :type chembl_con: sqlite3.Connection - :return: - Pandas DataFrame with added compound properties and structures. \\ - - Pandas DataFrame with compound properties and structures for all compound ids in ChEMBL. - :rtype: (pd.DataFrame, pd.DataFrame) """ sql = """ SELECT DISTINCT mh.parent_molregno, @@ -78,13 +79,14 @@ def add_chembl_properties_and_structures( """ df_cpd_props = pd.read_sql_query(sql, con=chembl_con) + dataset.df_cpd_props = df_cpd_props - df_combined = df_combined.merge(df_cpd_props, on="parent_molregno", how="left") - - return df_combined, df_cpd_props + dataset.df_result = dataset.df_result.merge( + df_cpd_props, on="parent_molregno", how="left" + ) -def add_ligand_efficiency_metrics(df_combined: pd.DataFrame) -> pd.DataFrame: +def add_ligand_efficiency_metrics(dataset: Dataset): """ Calculate the ligand efficiency metrics for the compounds based on the mean pchembl values for a compound-target pair and @@ -108,33 +110,37 @@ def add_ligand_efficiency_metrics(df_combined: pd.DataFrame) -> pd.DataFrame: Once for the pchembl values based on binding + functional assays (BF) and once for the pchembl values based on binding assays only (B). - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame - :return: Pandas DataFrame with added ligand efficiency metrics - :rtype: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + Will be updated to include ligand efficiency metrics. 
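For concreteness, the four metrics computed in this function, evaluated once with hypothetical input values (2.303 * 298 * 0.00199 is 2.303*R*T in kcal/mol at 298 K, roughly 1.37 kcal/mol per pchembl unit):

    pchembl_mean, heavy_atoms = 7.0, 30
    mw_freebase, psa, alogp = 350.0, 70.0, 2.5

    le = pchembl_mean / heavy_atoms * (2.303 * 298 * 0.00199)  # ~0.32
    bei = pchembl_mean * 1000 / mw_freebase                    # 20.0
    sei = pchembl_mean * 100 / psa                             # 10.0
    lle = pchembl_mean - alogp                                 # 4.5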
+ :type dataset: Dataset """ for suffix in ["BF", "B"]: - df_combined.loc[df_combined["heavy_atoms"] != 0, f"LE_{suffix}"] = ( - df_combined[f"pchembl_value_mean_{suffix}"] - / df_combined["heavy_atoms"] + dataset.df_result.loc[dataset.df_result["heavy_atoms"] != 0, f"LE_{suffix}"] = ( + dataset.df_result[f"pchembl_value_mean_{suffix}"] + / dataset.df_result["heavy_atoms"] * (2.303 * 298 * 0.00199) ) - df_combined.loc[df_combined["mw_freebase"] != 0, f"BEI_{suffix}"] = ( - df_combined[f"pchembl_value_mean_{suffix}"] + dataset.df_result.loc[ + dataset.df_result["mw_freebase"] != 0, f"BEI_{suffix}" + ] = ( + dataset.df_result[f"pchembl_value_mean_{suffix}"] * 1000 - / df_combined["mw_freebase"] + / dataset.df_result["mw_freebase"] ) - df_combined.loc[df_combined["psa"] != 0, f"SEI_{suffix}"] = ( - df_combined[f"pchembl_value_mean_{suffix}"] * 100 / df_combined["psa"] + dataset.df_result.loc[dataset.df_result["psa"] != 0, f"SEI_{suffix}"] = ( + dataset.df_result[f"pchembl_value_mean_{suffix}"] + * 100 + / dataset.df_result["psa"] ) - df_combined[f"LLE_{suffix}"] = ( - df_combined[f"pchembl_value_mean_{suffix}"] - df_combined["alogp"] + dataset.df_result[f"LLE_{suffix}"] = ( + dataset.df_result[f"pchembl_value_mean_{suffix}"] + - dataset.df_result["alogp"] ) - df_combined = df_combined.astype( + dataset.df_result = dataset.df_result.astype( { f"LE_{suffix}": "float64", f"BEI_{suffix}": "float64", @@ -143,26 +149,21 @@ def add_ligand_efficiency_metrics(df_combined: pd.DataFrame) -> pd.DataFrame: } ) - return df_combined - -def add_atc_classification( - df_combined: pd.DataFrame, chembl_con: sqlite3.Connection -) -> tuple[pd.DataFrame, pd.DataFrame]: +def add_atc_classification(dataset: Dataset, chembl_con: sqlite3.Connection): """ - Query and add ATC classifications (level 1) from the atc_classification and + Query and add ATC classifications (level 1) from the atc_classification and molecule_atc_classification tables. - ATC level annotations for the same parent_molregno are combined into one description - that concatenates all descriptions sorted alphabetically + ATC level annotations for the same parent_molregno are combined into one description + that concatenates all descriptions sorted alphabetically into one string with ' | ' as a separator. - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + Will be updated to include ATC classifications. + dataset.atc_levels will be set to ATC annotations in ChEMBL. + :type dataset: Dataset :param chembl_con: Sqlite3 connection to ChEMBL database. 
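The aggregation described here reduces to a groupby-transform with an alphabetical join; a toy illustration with made-up ATC descriptions:

    import pandas as pd

    between_str_join = " | "
    atc_levels = pd.DataFrame({
        "parent_molregno": [1, 1, 2],
        "atc_level1": [
            "N (nervous system)",
            "C (cardiovascular system)",
            "J (antiinfectives)",
        ],
    })
    atc_levels["atc_level1"] = atc_levels.groupby(["parent_molregno"])[
        "atc_level1"
    ].transform(lambda x: between_str_join.join(sorted(x)))
    atc_levels = atc_levels.drop_duplicates()
    # parent 1 -> "C (cardiovascular system) | N (nervous system)"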
:type chembl_con: sqlite3.Connection - :return: - Pandas DataFrame with added ATC classifications \\ - - Pandas DataFrame with ATC annotations in ChEMBL - :rtype: (pd.DataFrame, pd.DataFrame) """ sql = """ SELECT DISTINCT mh.parent_molregno, atc.level1, atc.level1_description @@ -185,14 +186,16 @@ def add_atc_classification( ].transform(lambda x: between_str_join.join(sorted(x))) atc_levels = atc_levels[["parent_molregno", "atc_level1"]].drop_duplicates() - df_combined = df_combined.merge(atc_levels, on="parent_molregno", how="left") + dataset.atc_levels = atc_levels - return df_combined, atc_levels + dataset.df_result = dataset.df_result.merge( + atc_levels, on="parent_molregno", how="left" + ) def add_all_chembl_compound_properties( - df_combined: pd.DataFrame, chembl_con: sqlite3.Connection, limit_to_literature: bool -) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + dataset: Dataset, chembl_con: sqlite3.Connection, limit_to_literature: bool +): """ Add ChEMBL-based compound properties to the given compound-target pairs, specifically: @@ -202,24 +205,19 @@ def add_all_chembl_compound_properties( - ligand efficiency metrics - ATC classifications - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + Will be updated to include compound properties. + :type dataset: Dataset :param chembl_con: Sqlite3 connection to ChEMBL database. :type chembl_con: sqlite3.Connection - :param limit_to_literature: Base first_publication_cpd on literature sources only if True. + :param limit_to_literature: Base first_publication_cpd on literature sources only if True. Base it on all available sources otherwise. :type limit_to_literature: bool - :return: - Pandas DataFrame with added compound properties \\ - - Pandas DataFrame with compound properties and structures for all compound ids in ChEMBL \\ - - Pandas DataFrame with ATC annotations in ChEMBL - :rtype: (pd.DataFrame, pd.DataFrame, pd.DataFrame) """ - df_combined = add_first_publication_date( - df_combined, chembl_con, limit_to_literature - ) - df_combined, df_cpd_props = add_chembl_properties_and_structures( - df_combined, chembl_con - ) - df_combined = add_ligand_efficiency_metrics(df_combined) - df_combined, atc_levels = add_atc_classification(df_combined, chembl_con) - return df_combined, df_cpd_props, atc_levels + add_first_publication_date(dataset, chembl_con, limit_to_literature) + + add_chembl_properties_and_structures(dataset, chembl_con) + + add_ligand_efficiency_metrics(dataset) + + add_atc_classification(dataset, chembl_con) diff --git a/src/add_chembl_target_class_annotations.py b/src/add_chembl_target_class_annotations.py index 009b8d4..0cb388c 100644 --- a/src/add_chembl_target_class_annotations.py +++ b/src/add_chembl_target_class_annotations.py @@ -6,6 +6,7 @@ import write_subsets from arguments import OutputArgs, CalculationArgs +from dataset import Dataset ########### Add Target Class Annotations Based on ChEMBL Data ########### @@ -80,34 +81,35 @@ def get_target_class_table( def add_chembl_target_class_annotations( - df_combined: pd.DataFrame, + dataset: Dataset, chembl_con: sqlite3.Connection, args: CalculationArgs, out: OutputArgs, -) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: +): """ - Add level 1 and 2 target class annotations. - Assignments for target IDs with more than one target class assignment per level - are summarised into one string with '|' as a separator + Add level 1 and 2 target class annotations. 
+ Assignments for target IDs with more than one target class assignment per level + are summarised into one string with '|' as a separator between the different target class annotations. Targets with more than one level 1 / level 2 target class assignment are written to a file. These could be reassigned by hand if a single target class is preferable. - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + Will be updated to only include target class annotations. + dataset.target_classes_level1 will be set to + pandas DataFrame with mapping from target id to level 1 target class + dataset.target_classes_level2 will be set to + pandas DataFrame with mapping from target id to level 2 target class + :type dataset: Dataset :param chembl_con: Sqlite3 connection to ChEMBL database. :type chembl_con: sqlite3.Connection :param args: Arguments related to how to calculate the dataset :type args: CalculationArgs :param out: Arguments related to how to output the dataset :type out: OutputArgs - :return: - Pandas DataFrame with added target class annotations \\ - - Pandas DataFrame with mapping from target id to level 1 target class \\ - - Pandas DataFrame with mapping from target id to level 2 target class - :rtype: (pd.DataFrame, pd.DataFrame, pd.DataFrame) """ - current_tids = set(df_combined["tid"]) + current_tids = set(dataset.df_result["tid"]) df_target_classes = get_target_class_table(chembl_con, current_tids) # Summarise the information for a target id with @@ -143,7 +145,9 @@ def add_chembl_target_class_annotations( ["tid", "target_class_l1"] ].drop_duplicates() - df_combined = df_combined.merge(target_classes_level1, on="tid", how="left") + dataset.df_result = dataset.df_result.merge( + target_classes_level1, on="tid", how="left" + ) # Repeat the summary step for target classes of level 2. 
level = "l2" @@ -155,12 +159,14 @@ def add_chembl_target_class_annotations( ["tid", "target_class_l2"] ].drop_duplicates() - df_combined = df_combined.merge(target_classes_level2, on="tid", how="left") + dataset.df_result = dataset.df_result.merge( + target_classes_level2, on="tid", how="left" + ) # Output targets have more than one target class assignment - more_than_one_level_1 = df_combined[ - (df_combined["target_class_l1"].notnull()) - & (df_combined["target_class_l1"].str.contains("|", regex=False)) + more_than_one_level_1 = dataset.df_result[ + (dataset.df_result["target_class_l1"].notnull()) + & (dataset.df_result["target_class_l1"].str.contains("|", regex=False)) ][ ["tid", "target_pref_name", "target_type", "target_class_l1", "target_class_l2"] ].drop_duplicates() @@ -168,9 +174,9 @@ def add_chembl_target_class_annotations( "Targets with more than one level 1 target class assignment: %s", len(more_than_one_level_1), ) - more_than_one_level_2 = df_combined[ - (df_combined["target_class_l2"].notnull()) - & (df_combined["target_class_l2"].str.contains("|", regex=False)) + more_than_one_level_2 = dataset.df_result[ + (dataset.df_result["target_class_l2"].notnull()) + & (dataset.df_result["target_class_l2"].str.contains("|", regex=False)) ][ ["tid", "target_pref_name", "target_type", "target_class_l1", "target_class_l2"] ].drop_duplicates() @@ -197,4 +203,5 @@ def add_chembl_target_class_annotations( out, ) - return df_combined, target_classes_level1, target_classes_level2 + dataset.target_classes_level1 = target_classes_level1 + dataset.target_classes_level2 = target_classes_level2 diff --git a/src/add_dti_annotations.py b/src/add_dti_annotations.py index 9f181c5..533aa2d 100644 --- a/src/add_dti_annotations.py +++ b/src/add_dti_annotations.py @@ -1,12 +1,10 @@ -import pandas as pd +from dataset import Dataset ########### CTI (Compound-Target Interaction) Annotations ########### def add_dti_annotations( - df_combined: pd.DataFrame, - drug_mechanism_pairs_set: set, - drug_mechanism_targets_set: set, -) -> pd.DataFrame: + dataset: Dataset, +): """ Every compound-target pair is assigned a DTI (drug target interaction) annotation. @@ -60,84 +58,91 @@ def add_dti_annotations( and for which the target was also not in the drug_mechanisms table (not a comparator compound), are discarded. - :param df_combined: Pandas DataFrame with compound-target pairs - based on activities AND drug_mechanism table - :type df_combined: pd.DataFrame - :param drug_mechanism_pairs_set: set of compound-target pairs in the drug_mechanism table - :type drug_mechanism_pairs_set: set - :param drug_mechanism_targets_set: set of targets in the drug_mechanism table - :type drug_mechanism_targets_set: set - :return: Pandas DataFrame with all compound-target pairs and their DTI annotations. - :rtype: pd.DataFrame + :param dataset: Dataset with all relevant information: + - Pandas DataFrame with compound-target pairs + based on activities AND drug_mechanism table + - set of compound-target pairs in the drug_mechanism table + - set of targets in the drug_mechanism table + :type dataset: Dataset """ # Add a new column *therapeutic_target* which is set to True # if the target is in the drug_mechanism table - df_combined["therapeutic_target"] = df_combined["tid"].isin( - drug_mechanism_targets_set + dataset.df_result["therapeutic_target"] = dataset.df_result["tid"].isin( + dataset.drug_mechanism_targets_set ) # Assign the annotations based on the table. 
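The .loc assignments that follow implement the mapping summarised in the docstring; written out as a table (a hypothetical compact form, not code from this patch):

    # DTI annotation for pairs that are in the drug_mechanism table:
    max_phase_to_dti = {4: "D_DT", 3: "C3_DT", 2: "C2_DT", 1: "C1_DT"}
    # any other max_phase for such a pair   -> "C0_DT"
    # pair not in the table, but target is  -> "DT"
    # neither pair nor target in the table  -> "NDT" (discarded at the end)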
# Compound-target pairs from the drug mechanism table - df_combined.loc[ + dataset.df_result.loc[ ( - df_combined["cpd_target_pair"].isin(drug_mechanism_pairs_set) - & (df_combined["max_phase"] == 4) + dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set) + & (dataset.df_result["max_phase"] == 4) ), "DTI", ] = "D_DT" - df_combined.loc[ + dataset.df_result.loc[ ( - df_combined["cpd_target_pair"].isin(drug_mechanism_pairs_set) - & (df_combined["max_phase"] == 3) + dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set) + & (dataset.df_result["max_phase"] == 3) ), "DTI", ] = "C3_DT" - df_combined.loc[ + dataset.df_result.loc[ ( - df_combined["cpd_target_pair"].isin(drug_mechanism_pairs_set) - & (df_combined["max_phase"] == 2) + dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set) + & (dataset.df_result["max_phase"] == 2) ), "DTI", ] = "C2_DT" - df_combined.loc[ + dataset.df_result.loc[ ( - df_combined["cpd_target_pair"].isin(drug_mechanism_pairs_set) - & (df_combined["max_phase"] == 1) + dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set) + & (dataset.df_result["max_phase"] == 1) ), "DTI", ] = "C1_DT" # Compounds that are in the drug_mechanism table but don't have a known phase between 1-4: - df_combined.loc[ + dataset.df_result.loc[ ( - df_combined["cpd_target_pair"].isin(drug_mechanism_pairs_set) - & (~df_combined["max_phase"].isin([1, 2, 3, 4])) + dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set) + & (~dataset.df_result["max_phase"].isin([1, 2, 3, 4])) ), "DTI", ] = "C0_DT" # Target is in the drug mechanism table - df_combined.loc[ + dataset.df_result.loc[ ( - (~df_combined["cpd_target_pair"].isin(drug_mechanism_pairs_set)) - & (df_combined["therapeutic_target"] == True) + ( + ~dataset.df_result["cpd_target_pair"].isin( + dataset.drug_mechanism_pairs_set + ) + ) + & (dataset.df_result["therapeutic_target"] == True) ), "DTI", ] = "DT" # Other compound-target pairs # if target is not a therapeutic target, 'cpd_target_pair' cannot be in DTIs_set - # (~df_combined['cpd_target_pair'].isin(DTIs_set)) is included for clarity - df_combined.loc[ + # (~dataset.df_result['cpd_target_pair'].isin(DTIs_set)) is included for clarity + dataset.df_result.loc[ ( - (~df_combined["cpd_target_pair"].isin(drug_mechanism_pairs_set)) - & (df_combined["therapeutic_target"] == False) + ( + ~dataset.df_result["cpd_target_pair"].isin( + dataset.drug_mechanism_pairs_set + ) + ) + & (dataset.df_result["therapeutic_target"] == False) ), "DTI", ] = "NDT" # Discard NDT rows - df_combined = df_combined[ - (df_combined["DTI"].isin(["D_DT", "C3_DT", "C2_DT", "C1_DT", "C0_DT", "DT"])) + dataset.df_result = dataset.df_result[ + ( + dataset.df_result["DTI"].isin( + ["D_DT", "C3_DT", "C2_DT", "C1_DT", "C0_DT", "DT"] + ) + ) ] - - return df_combined diff --git a/src/add_filtering_columns.py b/src/add_filtering_columns.py index ce6a575..c26a721 100644 --- a/src/add_filtering_columns.py +++ b/src/add_filtering_columns.py @@ -6,6 +6,7 @@ from arguments import CalculationArgs, OutputArgs import get_stats import write_subsets +from dataset import Dataset def get_data_subsets(data: pd.DataFrame, min_nof_cpds: int, desc: str) -> tuple[ @@ -107,20 +108,20 @@ def get_data_subsets(data: pd.DataFrame, min_nof_cpds: int, desc: str) -> tuple[ def add_subset_filtering_columns( df_combined_subset: pd.DataFrame, - df_combined: pd.DataFrame, + dataset: Dataset, desc: str, args: CalculationArgs, out: OutputArgs, - df_sizes: 
list[list[int], list[int]], -) -> pd.DataFrame: +): """ Add filtering column for binding + functional vs binding :param df_combined_subset: Subset with binding+functional (BF) or binding (B) assay-based data in df_combined :type df_combined_subset: pd.DataFrame - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + Will be updated to only include filtering columns. + :type dataset: Dataset :param desc: Assay description, either "BF" (binding+functional) or "B" (binding) :type desc: str @@ -128,10 +129,6 @@ def add_subset_filtering_columns( :type args: CalculationArgs :param out: Arguments related to how to output the dataset :type out: OutputArgs - :param df_sizes: List of intermediate sized of the dataset used for debugging. - :type df_sizes: list[list[int], list[int]] - :return: Pandas DataFrame with added filering columns - :rtype: pd.DataFrame """ subsets = get_data_subsets( df_combined_subset, @@ -159,65 +156,57 @@ def add_subset_filtering_columns( # add filtering columns to df_combined # do not add a filtering column for BF / B (-> [1:]) for [df, col_name] in subsets[1:]: - df_combined[col_name] = False - df_combined.loc[(df_combined.index.isin(df.index)), col_name] = True + dataset.df_result[col_name] = False + dataset.df_result.loc[(dataset.df_result.index.isin(df.index)), col_name] = True # check that filtering works - assert df_combined[df_combined[col_name] == True][df.columns].equals( - df - ), f"Filtering is not accurate for {col_name}." + assert dataset.df_result[dataset.df_result[col_name] == True][ + df.columns + ].equals(df), f"Filtering is not accurate for {col_name}." if logging.DEBUG >= logging.root.level: for [df_subset, subset_desc] in subsets: - get_stats.add_dataset_sizes(df_subset, subset_desc, df_sizes) - - return df_combined + get_stats.add_debugging_info(dataset, df_subset, subset_desc) def add_filtering_columns( - df_combined: pd.DataFrame, - df_sizes: list[list[int], list[int]], + dataset: Dataset, args: CalculationArgs, out: OutputArgs, -) -> pd.DataFrame: +): """ Add filtering columns to main dataset and save subsets if required. - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame - :param df_sizes: List of intermediate sized of the dataset used for debugging. - :type df_sizes: list[list[int], list[int]] + :param dataset: Dataset with compound-target pairs. + Will be updated to only include filtering columns. 
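The filtering columns added above come with an exact-reconstruction check: re-filtering the full frame by the new boolean column must reproduce the subset. A toy sketch of that invariant:

    import pandas as pd

    df_full = pd.DataFrame({"v": [1, 2, 3]})
    df_subset = df_full[df_full["v"] >= 2]

    col_name = "B_2"  # hypothetical subset column name
    df_full[col_name] = False
    df_full.loc[df_full.index.isin(df_subset.index), col_name] = True
    # filtering by the column recovers the subset exactly
    assert df_full[df_full[col_name]][df_subset.columns].equals(df_subset)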
+ :type dataset: Dataset :param args: Arguments related to how to calculate the dataset :type args: CalculationArgs :param out: Arguments related to how to output the dataset :type out: OutputArgs - :return: Pandas DataFrame with added filering columns - :rtype: pd.DataFrame """ # consider binding and functional assays # assay description = binding+functional desc = "BF" # df_combined without binding only data - df_combined_subset = df_combined.copy() - df_combined = add_subset_filtering_columns( + df_combined_subset = dataset.df_result.copy() + add_subset_filtering_columns( df_combined_subset, - df_combined, + dataset, desc, args, out, - df_sizes, ) # consider only binding assays # assay description = binding desc = "B" - df_combined_subset = df_combined[df_combined["keep_for_binding"] == True].copy() - df_combined = add_subset_filtering_columns( + df_combined_subset = dataset.df_result[ + dataset.df_result["keep_for_binding"] == True + ].copy() + add_subset_filtering_columns( df_combined_subset, - df_combined, + dataset, desc, args, out, - df_sizes, ) - - return df_combined diff --git a/src/add_rdkit_compound_descriptors.py b/src/add_rdkit_compound_descriptors.py index 5c7d4e8..1bc9268 100644 --- a/src/add_rdkit_compound_descriptors.py +++ b/src/add_rdkit_compound_descriptors.py @@ -1,72 +1,74 @@ -import pandas as pd from rdkit import Chem from rdkit.Chem import Descriptors from rdkit.Chem import PandasTools from tqdm import tqdm +from dataset import Dataset -def add_built_in_descriptors(df_combined: pd.DataFrame) -> pd.DataFrame: + +def add_built_in_descriptors(dataset: Dataset): """ Add RDKit built-in compound descriptors. + :param dataset: Dataset with compound-target pairs. + Will be updated to only include built-in RDKit compound descriptors. 
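A minimal, standalone version of the RDKit pattern used in this function, on a toy frame (requires rdkit to be installed):

    import pandas as pd
    from rdkit.Chem import Descriptors, PandasTools

    df = pd.DataFrame({"canonical_smiles": ["CCO", "c1ccccc1"]})
    PandasTools.AddMoleculeColumnToFrame(
        df, "canonical_smiles", "mol", includeFingerprints=False
    )
    df["fraction_csp3"] = df["mol"].apply(Descriptors.FractionCSP3)
    df["ring_count"] = df["mol"].apply(Descriptors.RingCount)
    df = df.drop(["mol"], axis=1)  # the molecule column is only a helper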
+ :type dataset: Dataset :param df_combined: Pandas DataFrame with compound-target pairs :type df_combined: pd.DataFrame - :return: Pandas DataFrame with added built-in RDKit compound descriptors - :rtype: pd.DataFrame """ # add a column with RDKit molecules, used to calculate the descriptors PandasTools.AddMoleculeColumnToFrame( - df_combined, "canonical_smiles", "mol", includeFingerprints=False + dataset.df_result, "canonical_smiles", "mol", includeFingerprints=False ) - df_combined.loc[:, "fraction_csp3"] = df_combined["mol"].apply( + dataset.df_result.loc[:, "fraction_csp3"] = dataset.df_result["mol"].apply( Descriptors.FractionCSP3 ) - df_combined.loc[:, "ring_count"] = df_combined["mol"].apply(Descriptors.RingCount) - df_combined.loc[:, "num_aliphatic_rings"] = df_combined["mol"].apply( - Descriptors.NumAliphaticRings - ) - df_combined.loc[:, "num_aliphatic_carbocycles"] = df_combined["mol"].apply( - Descriptors.NumAliphaticCarbocycles + dataset.df_result.loc[:, "ring_count"] = dataset.df_result["mol"].apply( + Descriptors.RingCount ) - df_combined.loc[:, "num_aliphatic_heterocycles"] = df_combined["mol"].apply( - Descriptors.NumAliphaticHeterocycles + dataset.df_result.loc[:, "num_aliphatic_rings"] = dataset.df_result["mol"].apply( + Descriptors.NumAliphaticRings ) - df_combined.loc[:, "num_aromatic_rings"] = df_combined["mol"].apply( + dataset.df_result.loc[:, "num_aliphatic_carbocycles"] = dataset.df_result[ + "mol" + ].apply(Descriptors.NumAliphaticCarbocycles) + dataset.df_result.loc[:, "num_aliphatic_heterocycles"] = dataset.df_result[ + "mol" + ].apply(Descriptors.NumAliphaticHeterocycles) + dataset.df_result.loc[:, "num_aromatic_rings"] = dataset.df_result["mol"].apply( Descriptors.NumAromaticRings ) - df_combined.loc[:, "num_aromatic_carbocycles"] = df_combined["mol"].apply( - Descriptors.NumAromaticCarbocycles - ) - df_combined.loc[:, "num_aromatic_heterocycles"] = df_combined["mol"].apply( - Descriptors.NumAromaticHeterocycles - ) - df_combined.loc[:, "num_saturated_rings"] = df_combined["mol"].apply( + dataset.df_result.loc[:, "num_aromatic_carbocycles"] = dataset.df_result[ + "mol" + ].apply(Descriptors.NumAromaticCarbocycles) + dataset.df_result.loc[:, "num_aromatic_heterocycles"] = dataset.df_result[ + "mol" + ].apply(Descriptors.NumAromaticHeterocycles) + dataset.df_result.loc[:, "num_saturated_rings"] = dataset.df_result["mol"].apply( Descriptors.NumSaturatedRings ) - df_combined.loc[:, "num_saturated_carbocycles"] = df_combined["mol"].apply( - Descriptors.NumSaturatedCarbocycles - ) - df_combined.loc[:, "num_saturated_heterocycles"] = df_combined["mol"].apply( - Descriptors.NumSaturatedHeterocycles - ) - df_combined.loc[:, "num_stereocentres"] = df_combined["mol"].apply( + dataset.df_result.loc[:, "num_saturated_carbocycles"] = dataset.df_result[ + "mol" + ].apply(Descriptors.NumSaturatedCarbocycles) + dataset.df_result.loc[:, "num_saturated_heterocycles"] = dataset.df_result[ + "mol" + ].apply(Descriptors.NumSaturatedHeterocycles) + dataset.df_result.loc[:, "num_stereocentres"] = dataset.df_result["mol"].apply( Chem.rdMolDescriptors.CalcNumAtomStereoCenters ) - df_combined.loc[:, "num_heteroatoms"] = df_combined["mol"].apply( + dataset.df_result.loc[:, "num_heteroatoms"] = dataset.df_result["mol"].apply( Descriptors.NumHeteroatoms ) # add scaffolds - PandasTools.AddMurckoToFrame(df_combined, "mol", "scaffold_w_stereo") + PandasTools.AddMurckoToFrame(dataset.df_result, "mol", "scaffold_w_stereo") # remove stereo information of the molecule to add scaffolds without 
stereo information - df_combined["mol"].apply(Chem.RemoveStereochemistry) - PandasTools.AddMurckoToFrame(df_combined, "mol", "scaffold_wo_stereo") + dataset.df_result["mol"].apply(Chem.RemoveStereochemistry) + PandasTools.AddMurckoToFrame(dataset.df_result, "mol", "scaffold_wo_stereo") # drop the column with RDKit molecules - df_combined = df_combined.drop(["mol"], axis=1) - - return df_combined + dataset.df_result = dataset.df_result.drop(["mol"], axis=1) def calculate_aromatic_atoms( @@ -121,7 +123,7 @@ def calculate_aromatic_atoms( return aromatic_atoms_dict, aromatic_c_dict, aromatic_n_dict, aromatic_hetero_dict -def add_aromaticity_descriptors(df_combined: pd.DataFrame) -> pd.DataFrame: +def add_aromaticity_descriptors(dataset: Dataset): """ Add number of aromatic atoms in a compounds, specifically: @@ -130,40 +132,39 @@ def add_aromaticity_descriptors(df_combined: pd.DataFrame) -> pd.DataFrame: - # aromatic nitrogen atoms (aromatic_n) - # aromatic hetero atoms (aromatic_hetero) - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame - :return: Pandas DataFrame with added counts of aromatic atoms - :rtype: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + Will be updated to only include counts of aromatic atoms + :type dataset: Dataset """ # use df_combined_w_smiles to exclude null values - smiles_set = set(df_combined["canonical_smiles"]) + smiles_set = set(dataset.df_result["canonical_smiles"]) aromatic_atoms_dict, aromatic_c_dict, aromatic_n_dict, aromatic_hetero_dict = ( calculate_aromatic_atoms(smiles_set) ) - df_combined["aromatic_atoms"] = df_combined["canonical_smiles"].map( + dataset.df_result["aromatic_atoms"] = dataset.df_result["canonical_smiles"].map( aromatic_atoms_dict ) - df_combined["aromatic_c"] = df_combined["canonical_smiles"].map(aromatic_c_dict) - df_combined["aromatic_n"] = df_combined["canonical_smiles"].map(aromatic_n_dict) - df_combined["aromatic_hetero"] = df_combined["canonical_smiles"].map( + dataset.df_result["aromatic_c"] = dataset.df_result["canonical_smiles"].map( + aromatic_c_dict + ) + dataset.df_result["aromatic_n"] = dataset.df_result["canonical_smiles"].map( + aromatic_n_dict + ) + dataset.df_result["aromatic_hetero"] = dataset.df_result["canonical_smiles"].map( aromatic_hetero_dict ) - return df_combined - -def add_rdkit_compound_descriptors(df_combined: pd.DataFrame) -> pd.DataFrame: +def add_rdkit_compound_descriptors(dataset: Dataset): """ Add RDKit-based compound descriptors (built-in and numbers of aromatic atoms). - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame - :return: Pandas DataFrame with added built-in RDKit compound descriptors - and numbers of aromatic atoms - :rtype: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + Will be updated to only include + built-in RDKit compound descriptors + and numbers of aromatic atoms. 
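calculate_aromatic_atoms itself is not shown in this hunk; the per-SMILES counts it produces can be derived with RDKit along these lines (a sketch, using pyridine as the example):

    from rdkit import Chem

    mol = Chem.MolFromSmiles("c1ccncc1")  # pyridine
    aromatic = [a for a in mol.GetAtoms() if a.GetIsAromatic()]
    aromatic_atoms = len(aromatic)                                 # 6
    aromatic_c = sum(a.GetSymbol() == "C" for a in aromatic)       # 5
    aromatic_n = sum(a.GetSymbol() == "N" for a in aromatic)       # 1
    aromatic_hetero = sum(a.GetSymbol() != "C" for a in aromatic)  # 1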
+ :type dataset: Dataset """ - df_combined = add_built_in_descriptors(df_combined) - df_combined = add_aromaticity_descriptors(df_combined) - - return df_combined + add_built_in_descriptors(dataset) + add_aromaticity_descriptors(dataset) diff --git a/src/clean_dataset.py b/src/clean_dataset.py index 01a2564..6efda90 100644 --- a/src/clean_dataset.py +++ b/src/clean_dataset.py @@ -3,10 +3,12 @@ import pandas as pd +from dataset import Dataset + def remove_compounds_without_smiles_and_mixtures( - df_combined: pd.DataFrame, chembl_con: sqlite3.Connection -) -> pd.DataFrame: + dataset: Dataset, chembl_con: sqlite3.Connection +): """ Remove @@ -16,12 +18,12 @@ def remove_compounds_without_smiles_and_mixtures( Since compound information is aggregated for the parents of salts, the number of smiles with a dot is relatively low. - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + Will be updated to only include + compound-target pairs with a smiles that does not contain a '.' + :type dataset: Dataset :param chembl_con: Sqlite3 connection to ChEMBL database. :type chembl_con: sqlite3.Connection - :return: Pandas DataFrame with compound-target pairs with a smiles that does not contain a '.' - :rtype: pd.DataFrame """ # Double-check that rows with a SMILES containing a '.' are the parent structures, # i.e., there was no error in using salt information instead of parent information. @@ -31,9 +33,9 @@ def remove_compounds_without_smiles_and_mixtures( """ df_hierarchy = pd.read_sql_query(sql, con=chembl_con) - smiles_with_dot = df_combined[ - df_combined["canonical_smiles"].notnull() - & df_combined["canonical_smiles"].str.contains(".", regex=False) + smiles_with_dot = dataset.df_result[ + dataset.df_result["canonical_smiles"].notnull() + & dataset.df_result["canonical_smiles"].str.contains(".", regex=False) ][["canonical_smiles", "parent_molregno"]].drop_duplicates() for parent_molregno in set(smiles_with_dot["parent_molregno"]): @@ -72,42 +74,46 @@ def remove_compounds_without_smiles_and_mixtures( the smiles for the compound in ChEMBL ({parent_smiles_in_chembl})." # Remove rows that contain a SMILES with a dot or that don't have a SMILES. - len_missing_smiles = len(df_combined[df_combined["canonical_smiles"].isnull()]) + len_missing_smiles = len( + dataset.df_result[dataset.df_result["canonical_smiles"].isnull()] + ) len_smiles_w_dot = len( - df_combined[ - df_combined["parent_molregno"].isin(set(smiles_with_dot["parent_molregno"])) + dataset.df_result[ + dataset.df_result["parent_molregno"].isin( + set(smiles_with_dot["parent_molregno"]) + ) ] ) logging.debug("#Compounds without a SMILES: %s", len_missing_smiles) logging.debug("#SMILES with a dot: %s", len_smiles_w_dot) - df_combined = df_combined[ - (df_combined["canonical_smiles"].notnull()) + dataset.df_result = dataset.df_result[ + (dataset.df_result["canonical_smiles"].notnull()) & ~( - df_combined["parent_molregno"].isin(set(smiles_with_dot["parent_molregno"])) + dataset.df_result["parent_molregno"].isin( + set(smiles_with_dot["parent_molregno"]) + ) ) ] - return df_combined + return dataset.df_result -def clean_none_values(df_combined): +def clean_none_values(dataset: Dataset): """ Change nan values and empty strings to None for consistency. 
""" # Change all None / nan values to None - df_combined = df_combined.where(pd.notnull(df_combined), None) + dataset.df_result = dataset.df_result.where(pd.notnull(dataset.df_result), None) # replace empty strings with None - df_combined = df_combined.replace("", None).reset_index(drop=True) - - return df_combined + dataset.df_result = dataset.df_result.replace("", None).reset_index(drop=True) -def set_types_to_int(df_combined, calculate_rdkit): +def set_types_to_int(dataset, calculate_rdkit): """ Set the type of relevant columns to Int64. """ - df_combined = df_combined.astype( + dataset.df_result = dataset.df_result.astype( { "first_approval": "Int64", "usan_year": "Int64", @@ -129,7 +135,7 @@ def set_types_to_int(df_combined, calculate_rdkit): ) if calculate_rdkit: - df_combined = df_combined.astype( + dataset.df_result = dataset.df_result.astype( { "num_aliphatic_carbocycles": "Int64", "num_aliphatic_heterocycles": "Int64", @@ -150,26 +156,26 @@ def set_types_to_int(df_combined, calculate_rdkit): } ) - return df_combined - -def round_floats(df_combined, decimal_places=4): +def round_floats(dataset, decimal_places=4): """ Round float columns to decimal places. This does not apply to max_phase. """ - for _, (col, dtype) in enumerate(df_combined.dtypes.to_dict().items()): + for _, (col, dtype) in enumerate(dataset.df_result.dtypes.to_dict().items()): if (dtype in ("float64", "Float64")) and col != "max_phase": - df_combined[col] = df_combined[col].round(decimals=decimal_places) + dataset.df_result[col] = dataset.df_result[col].round( + decimals=decimal_places + ) - return df_combined + return dataset.df_result -def reorder_columns(df_combined, calculate_rdkit): +def reorder_columns(dataset, calculate_rdkit): """ Reorder the columns in the DataFrame. """ - len_columns_before = len(df_combined.columns) + len_columns_before = len(dataset.df_result.columns) compound_target_pair_columns = [ "parent_molregno", @@ -283,7 +289,7 @@ def reorder_columns(df_combined, calculate_rdkit): + rdkit_columns + filtering_columns ) - df_combined = df_combined[columns] + dataset.df_result = dataset.df_result[columns] else: columns = ( compound_target_pair_columns @@ -296,18 +302,16 @@ def reorder_columns(df_combined, calculate_rdkit): + chembl_target_annotations + filtering_columns ) - df_combined = df_combined[columns] + dataset.df_result = dataset.df_result[columns] - len_columns_after = len(df_combined.columns) + len_columns_after = len(dataset.df_result.columns) assert ( len_columns_before == len_columns_after ), f"Different number of columns after reordering \ (before: {len_columns_before}, after: {len_columns_after})." - return df_combined - -def clean_dataset(df_combined: pd.DataFrame, calculate_rdkit: bool) -> pd.DataFrame: +def clean_dataset(dataset: Dataset, calculate_rdkit: bool) -> pd.DataFrame: """ Clean the dataset by @@ -317,18 +321,16 @@ def clean_dataset(df_combined: pd.DataFrame, calculate_rdkit: bool) -> pd.DataFr - reordering columns - sorting rows by cpd_target_pair_mutation - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + Will be updated to clean version with the updates described above. 
+ :type dataset: Dataset :param calculate_rdkit: True if the DataFrame contains RDKit-based compound properties :type calculate_rdkit: bool - :return: Cleaned pandas DataFrame with compound-target pairs - :rtype: pd.DataFrame """ - df_combined = clean_none_values(df_combined) - df_combined = set_types_to_int(df_combined, calculate_rdkit) - df_combined = round_floats(df_combined, decimal_places=4) - df_combined = reorder_columns(df_combined, calculate_rdkit) - df_combined = df_combined.sort_values(by=["cpd_target_pair_mutation"]).reset_index( - drop=True - ) - return df_combined + clean_none_values(dataset) + set_types_to_int(dataset, calculate_rdkit) + round_floats(dataset, decimal_places=4) + reorder_columns(dataset, calculate_rdkit) + dataset.df_result = dataset.df_result.sort_values( + by=["cpd_target_pair_mutation"] + ).reset_index(drop=True) diff --git a/src/dataset.py b/src/dataset.py new file mode 100644 index 0000000..8b3d29f --- /dev/null +++ b/src/dataset.py @@ -0,0 +1,29 @@ +from dataclasses import dataclass + +import pandas as pd + + +@dataclass +class Dataset: + """ + df_result: Pandas DataFrame with the full dataset + df_sizes_all: List of intermediate sized of the dataset used for debugging + df_sizes_pchembl: List of intermediate sized of the dataset used for debugging + drug_mechanism_pairs_set: Set of compound-target pairs in the drug_mechanism table + drug_mechanism_targets_set: Set of targets in the drug_mechanism table + df_cpd_props: Pandas DataFrame with compound properties and + structures for all compound ids in ChEMBL + atc_levels: Pandas DataFrame with ATC annotations in ChEMBL + target_classes_level1: Pandas DataFrame with mapping from target id to level 1 target class + target_classes_level2: Pandas DataFrame with mapping from target id to level 2 target class + """ + + df_result: pd.DataFrame + df_cpd_props: pd.DataFrame + atc_levels: pd.DataFrame + target_classes_level1: pd.DataFrame + target_classes_level2: pd.DataFrame + drug_mechanism_pairs_set: set + drug_mechanism_targets_set: set + df_sizes_all: list[int] + df_sizes_pchembl: list[int] diff --git a/src/get_activity_ct_pairs.py b/src/get_activity_ct_pairs.py index 4e440da..ba6811e 100644 --- a/src/get_activity_ct_pairs.py +++ b/src/get_activity_ct_pairs.py @@ -3,6 +3,8 @@ import numpy as np import pandas as pd +from dataset import Dataset + ########### Get Initial Compound-Target Data From ChEMBL ########### def get_compound_target_pairs_with_pchembl( @@ -164,7 +166,7 @@ def get_average_info(df: pd.DataFrame, suffix: str) -> pd.DataFrame: def get_aggregated_activity_ct_pairs( chembl_con: sqlite3.Connection, limit_to_literature: bool, -) -> pd.DataFrame: +) -> Dataset: """ Get dataset of compound target-pairs with an associated pchembl value with pchembl and publication dates aggregated into one entry per pair. @@ -184,9 +186,9 @@ def get_aggregated_activity_ct_pairs( :param limit_to_literature: Include only literature sources if True. Include all available sources otherwise. :type limit_to_literature: bool - :return: Pandas Dataframe with compound-target pairs based on ChEMBL activity data - aggregated into one entry per compound-target pair. - :rtype: pd.DataFrame + :return: Dataset with a pandas Dataframe with compound-target pairs + based on ChEMBL activity data aggregated into one entry per compound-target pair. 
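The hunk below fills the new Dataset positionally with empty placeholders; one possible alternative, not part of this patch, is to give the dataclass defaults so that only df_result has to be passed:

    from dataclasses import dataclass, field

    import pandas as pd

    @dataclass
    class Dataset:
        df_result: pd.DataFrame
        df_cpd_props: pd.DataFrame = field(default_factory=pd.DataFrame)
        drug_mechanism_pairs_set: set = field(default_factory=set)
        df_sizes_all: list = field(default_factory=list)
        # ... remaining fields with analogous defaults

    dataset = Dataset(df_result=pd.DataFrame())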
+ :rtype: Dataset """ df_mols = get_compound_target_pairs_with_pchembl( chembl_con, @@ -220,4 +222,15 @@ def get_aggregated_activity_ct_pairs( how="left", ) - return df_combined + dataset = Dataset( + df_combined, + pd.DataFrame(), + pd.DataFrame(), + pd.DataFrame(), + pd.DataFrame(), + set(), + set(), + [], + [], + ) + return dataset diff --git a/src/get_dataset.py b/src/get_dataset.py index 7fd86c3..c9966d5 100644 --- a/src/get_dataset.py +++ b/src/get_dataset.py @@ -28,98 +28,59 @@ def get_ct_pair_dataset( :param out: Arguments related to how to output the dataset :type out: OutputArgs """ - # list with sizes of full dataset and dataset subset with pchembl values for debugging - df_sizes = [[], []] - logging.info("get_aggregated_activity_ct_pairs") - df_combined = get_activity_ct_pairs.get_aggregated_activity_ct_pairs( + dataset = get_activity_ct_pairs.get_aggregated_activity_ct_pairs( chembl_con, args.limit_to_literature ) - if logging.DEBUG >= logging.root.level: - get_stats.add_dataset_sizes(df_combined, "activity ct-pairs", df_sizes) + get_stats.add_debugging_info(dataset, dataset.df_result, "activity ct-pairs") logging.info("add_cti_from_drug_mechanisms") - df_combined, drug_mechanism_pairs_set, drug_mechanism_targets_set = ( - get_drug_mechanism_ct_pairs.add_drug_mechanism_ct_pairs(df_combined, chembl_con) - ) - if logging.DEBUG >= logging.root.level: - get_stats.add_dataset_sizes(df_combined, "dm ct-pairs", df_sizes) + get_drug_mechanism_ct_pairs.add_drug_mechanism_ct_pairs(dataset, chembl_con) + get_stats.add_debugging_info(dataset, dataset.df_result, "dm ct-pairs") logging.info("add_cti_annotations") - df_combined = add_dti_annotations.add_dti_annotations( - df_combined, drug_mechanism_pairs_set, drug_mechanism_targets_set - ) - if logging.DEBUG >= logging.root.level: - get_stats.add_dataset_sizes(df_combined, "DTI annotations", df_sizes) + add_dti_annotations.add_dti_annotations(dataset) + get_stats.add_debugging_info(dataset, dataset.df_result, "DTI annotations") logging.info("add_all_chembl_compound_properties") - df_combined, df_cpd_props, atc_levels = ( - add_chembl_compound_properties.add_all_chembl_compound_properties( - df_combined, chembl_con, args.limit_to_literature - ) + add_chembl_compound_properties.add_all_chembl_compound_properties( + dataset, chembl_con, args.limit_to_literature ) - if logging.DEBUG >= logging.root.level: - get_stats.add_dataset_sizes(df_combined, "ChEMBL props", df_sizes) + get_stats.add_debugging_info(dataset, dataset.df_result, "ChEMBL props") logging.info("remove_compounds_without_smiles_and_mixtures") - df_combined = clean_dataset.remove_compounds_without_smiles_and_mixtures( - df_combined, chembl_con - ) - if logging.DEBUG >= logging.root.level: - get_stats.add_dataset_sizes(df_combined, "removed smiles", df_sizes) + clean_dataset.remove_compounds_without_smiles_and_mixtures(dataset, chembl_con) + get_stats.add_debugging_info(dataset, dataset.df_result, "removed smiles") logging.info("add_chembl_target_class_annotations") - df_combined, target_classes_level1, target_classes_level2 = ( - add_chembl_target_class_annotations.add_chembl_target_class_annotations( - df_combined, - chembl_con, - args, - out, - ) + add_chembl_target_class_annotations.add_chembl_target_class_annotations( + dataset, + chembl_con, + args, + out, ) - if logging.DEBUG >= logging.root.level: - get_stats.add_dataset_sizes(df_combined, "tclass annotations", df_sizes) + get_stats.add_debugging_info(dataset, dataset.df_result, "tclass annotations") - 
logging.info("add_rdkit_compound_descriptors") if args.calculate_rdkit: - df_combined = add_rdkit_compound_descriptors.add_rdkit_compound_descriptors( - df_combined - ) - if logging.DEBUG >= logging.root.level: - get_stats.add_dataset_sizes(df_combined, "RDKit props", df_sizes) + logging.info("add_rdkit_compound_descriptors") + add_rdkit_compound_descriptors.add_rdkit_compound_descriptors(dataset) + get_stats.add_debugging_info(dataset, dataset.df_result, "RDKit props") logging.info("clean_dataset") - df_combined = clean_dataset.clean_dataset(df_combined, args.calculate_rdkit) - if logging.DEBUG >= logging.root.level: - get_stats.add_dataset_sizes(df_combined, "clean df", df_sizes) + clean_dataset.clean_dataset(dataset, args.calculate_rdkit) + get_stats.add_debugging_info(dataset, dataset.df_result, "clean df") logging.info("sanity_checks") - sanity_checks.sanity_checks( - df_combined, - df_cpd_props, - atc_levels, - target_classes_level1, - target_classes_level2, - args.calculate_rdkit, - ) + sanity_checks.sanity_checks(dataset, args.calculate_rdkit) logging.info("add_filtering_columns") - add_filtering_columns.add_filtering_columns( - df_combined, - df_sizes, - args, - out, - ) + add_filtering_columns.add_filtering_columns(dataset, args, out) logging.info("write_full_dataset_to_file") - write_subsets.write_full_dataset_to_file( - df_combined, - args, - out, - ) + write_subsets.write_full_dataset_to_file(dataset, args, out) logging.info("output_stats") - write_subsets.output_all_stats(df_combined, args, out) + write_subsets.output_all_stats(dataset, args, out) if logging.DEBUG >= logging.root.level: - write_subsets.output_debug_sizes(df_sizes, out) + write_subsets.output_debug_sizes(dataset, out) diff --git a/src/get_drug_mechanism_ct_pairs.py b/src/get_drug_mechanism_ct_pairs.py index be67c59..ebabf45 100644 --- a/src/get_drug_mechanism_ct_pairs.py +++ b/src/get_drug_mechanism_ct_pairs.py @@ -3,6 +3,8 @@ import pandas as pd +from dataset import Dataset + ########### Extract Drug-Target Interactions From the drug_mechanism Table ########### def get_drug_mechanisms_interactions(chembl_con: sqlite3.Connection) -> pd.DataFrame: @@ -151,7 +153,7 @@ def add_annotations_to_drug_mechanisms_cti( :return: Updated pandas DataFrame with the additional annotations. :rtype: pd.DataFrame """ - ##### Set columns existing in the df_combined table. ##### + ##### Set columns existing in the df_results table. ##### # None of the targets from the drug mechanism table have any mutation annotation, # hence tid_mutation = tid cpd_target_pairs["tid_mutation"] = cpd_target_pairs["tid"].astype("str") @@ -239,47 +241,46 @@ def get_drug_mechanism_ct_pairs(chembl_con: sqlite3.Connection) -> pd.DataFrame: ########### Add Compounds From the drug_mechanism Table to the Dataset ########### -def add_drug_mechanism_ct_pairs( - df_combined: pd.DataFrame, chembl_con: sqlite3.Connection -) -> tuple[pd.DataFrame, set, set]: +def add_drug_mechanism_ct_pairs(dataset: Dataset, chembl_con: sqlite3.Connection): """ - Add compound-target pairs from the drug_mechanism table + Add compound-target pairs from the drug_mechanism table that are not in the dataset based on the initial ChEMBL query. These are compound-target pairs for which there is no associated pchembl value data. - Since the pairs are known interactions, + Since the pairs are known interactions, they are added to the dataset despite not having a pchembl value. 
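The bookkeeping below encodes each known pair as a "molregno_tid" string and tests membership with isin; a toy sketch of that encoding:

    import pandas as pd

    pairs = pd.DataFrame({"parent_molregno": [10, 11], "tid": [7, 8]})
    drug_mechanism_pairs_set = {
        f"{a}_{b}" for a, b in zip(pairs["parent_molregno"], pairs["tid"])
    }  # {"10_7", "11_8"}

    df = pd.DataFrame({"cpd_target_pair": ["10_7", "12_9"]})
    df["pair_in_dm_table"] = df["cpd_target_pair"].isin(drug_mechanism_pairs_set)
    # -> [True, False]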
+    Add the set of compound-target pairs in the drug_mechanism table and
+    the set of targets in the drug_mechanism table to the dataset.
 
-    :param df_combined: Pandas Dataframe with compound-target pairs based on ChEMBL activity data
-    :type df_combined: pd.DataFrame
+    :param dataset: Dataset with compound-target pairs based on ChEMBL activity data
+    :type dataset: Dataset
     :param chembl_con: Sqlite3 connection to ChEMBL database.
     :type chembl_con: sqlite3.Connection
-    :return: - Pandas DataFrame with compound-target pairs
-        based on activities AND drug_mechanism table \\
-        - set of compound-target pairs in the drug_mechanism table \\
-        - set of targets in the drug_mechanism table
-    :rtype: (pd.DataFrame, set, set)
     """
     cpd_target_pairs = get_drug_mechanism_ct_pairs(chembl_con)
 
-    drug_mechanism_pairs_set = set(
+    dataset.drug_mechanism_pairs_set = set(
         f"{a}_{b}"
         for a, b in zip(cpd_target_pairs["parent_molregno"], cpd_target_pairs["tid"])
     )
-    drug_mechanism_targets_set = set(cpd_target_pairs["tid"])
+    dataset.drug_mechanism_targets_set = set(cpd_target_pairs["tid"])
 
     # Add a new column *pair_mutation_in_dm_table* which is set to True if the compound target pair
     # (taking mutation annotations into account) is in the drug_mechanism table.
-    df_combined["pair_mutation_in_dm_table"] = False
-    df_combined.loc[
-        (df_combined["cpd_target_pair_mutation"].isin(drug_mechanism_pairs_set)),
+    dataset.df_result["pair_mutation_in_dm_table"] = False
+    dataset.df_result.loc[
+        (
+            dataset.df_result["cpd_target_pair_mutation"].isin(
+                dataset.drug_mechanism_pairs_set
+            )
+        ),
         "pair_mutation_in_dm_table",
     ] = True
 
     # Add a new column *pair_in_dm_table* which is set to True if the compound target pair
     # (NOT taking mutation annotations into account) is in the drug_mechanism table.
-    df_combined["pair_in_dm_table"] = False
-    df_combined.loc[
-        (df_combined["cpd_target_pair"].isin(drug_mechanism_pairs_set)),
+    dataset.df_result["pair_in_dm_table"] = False
+    dataset.df_result.loc[
+        (dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set)),
         "pair_in_dm_table",
     ] = True
 
@@ -291,7 +292,7 @@ def add_drug_mechanism_ct_pairs(
     cpd_target_pairs = cpd_target_pairs[
         ~(
             cpd_target_pairs["cpd_target_pair_mutation"].isin(
-                set(df_combined["cpd_target_pair_mutation"])
+                set(dataset.df_result["cpd_target_pair_mutation"])
             )
         )
     ].copy()
@@ -302,20 +303,18 @@ def add_drug_mechanism_ct_pairs(
     )
 
     # Combined data of existing query with new compound-target pairs.
-    df_combined = pd.concat([df_combined, cpd_target_pairs])
+    dataset.df_result = pd.concat([dataset.df_result, cpd_target_pairs])
 
     # Add a new column *keep_for_binding* which is set to True if the row should be kept
     # if you want to limit the dataset to only data based on binding assays.
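
The two flag columns above hinge on string pair keys of the form parent_molregno_tid collected into a set. The same masking idiom in miniature (toy values):

    import pandas as pd

    df = pd.DataFrame({"parent_molregno": [101, 102], "tid": [7, 9]})
    df["cpd_target_pair"] = (
        df["parent_molregno"].astype(str) + "_" + df["tid"].astype(str)
    )

    dm_pairs = {f"{a}_{b}" for a, b in zip([101], [7])}  # toy drug_mechanism pairs

    df["pair_in_dm_table"] = False
    df.loc[df["cpd_target_pair"].isin(dm_pairs), "pair_in_dm_table"] = True
    # -> True for 101_7, False for 102_9
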
# Rows are kept if # - there is a binding data-based pchembl value or # - the compound-target pair (including mutation info) is in the drug_mechanism table - df_combined["keep_for_binding"] = False - df_combined.loc[ + dataset.df_result["keep_for_binding"] = False + dataset.df_result.loc[ ( - (df_combined["pchembl_value_mean_B"].notnull()) - | (df_combined["pair_mutation_in_dm_table"] == True) + (dataset.df_result["pchembl_value_mean_B"].notnull()) + | (dataset.df_result["pair_mutation_in_dm_table"] == True) ), "keep_for_binding", ] = True - - return df_combined, drug_mechanism_pairs_set, drug_mechanism_targets_set diff --git a/src/get_stats.py b/src/get_stats.py index e27a4a5..0b96abc 100644 --- a/src/get_stats.py +++ b/src/get_stats.py @@ -1,5 +1,8 @@ +import logging import pandas as pd +from dataset import Dataset + ##### Debugging Stats ##### def calculate_dataset_sizes(df: pd.DataFrame) -> list[int]: @@ -45,20 +48,22 @@ def calculate_dataset_sizes(df: pd.DataFrame) -> list[int]: def add_dataset_sizes( - df: pd.DataFrame, label: str, df_sizes: list[list[int], list[int]] + dataset: Dataset, + df: pd.DataFrame, + label: str, ): """ - Count and add representative counts of df to the list df_sizes used for debugging. + Count and add representative counts of df used for debugging to the dataset. + :param dataset: Dataset with compound-target pairs and debugging sizes. + :type dataset: Dataset :param df: Pandas DataFrame with current compound-target pairs :type df: pd.DataFrame :param label: Description of pipeline step (e.g., initial query). :type label: str - :param df_sizes: List of intermediate sized of the dataset used for debugging. - :type df_sizes: list[list[int], list[int]] """ df_copy = df.copy() - df_sizes[0].append([label] + calculate_dataset_sizes(df_copy)) + dataset.df_sizes_all.append([label] + calculate_dataset_sizes(df_copy)) # restrict to data with any pchembl value (any data with a pchembl, # even if it is based on only functional data) @@ -68,7 +73,20 @@ def add_dataset_sizes( df_pchembl = df_copy.dropna( subset=[x for x in df_copy.columns if x.startswith("pchembl_value")], how="all" ) - df_sizes[1].append([label] + calculate_dataset_sizes(df_pchembl)) + dataset.df_sizes_pchembl.append([label] + calculate_dataset_sizes(df_pchembl)) + + +def add_debugging_info( + dataset: Dataset, + df: pd.DataFrame, + label: str, +): + """ + Wrapper for add_dataset_sizes. + Handles logging level. + """ + if logging.DEBUG >= logging.root.level: + add_dataset_sizes(dataset, df, label) ##### Logging Stats ##### diff --git a/src/sanity_checks.py b/src/sanity_checks.py index b34517e..8b74546 100644 --- a/src/sanity_checks.py +++ b/src/sanity_checks.py @@ -1,13 +1,15 @@ import pandas as pd +from dataset import Dataset + ########### Sanity checks for the dataset ########### -def check_null_values(df_combined: pd.DataFrame): +def check_null_values(df_result: pd.DataFrame): """ Check if any columns contain nan or null which aren't recognised as null values. 
""" - for col in df_combined.columns: - col_as_str = set(df_combined[df_combined[col].notnull()][col].astype(str)) + for col in df_result.columns: + col_as_str = set(df_result[df_result[col].notnull()][col].astype(str)) assert ( "nan" not in col_as_str ), f"Problem with unrecognised nan value in column {col}" @@ -16,14 +18,14 @@ def check_null_values(df_combined: pd.DataFrame): ), f"Problem with unrecognised null value in column {col}" -def check_for_mixed_types(df_combined: pd.DataFrame): +def check_for_mixed_types(df_result: pd.DataFrame): """ Check that there are no mixed types in columns with dtype=object. """ - for col, dtype in df_combined.dtypes.to_dict().items(): + for col, dtype in df_result.dtypes.to_dict().items(): if dtype == object: - col_original = set(df_combined[df_combined[col].notnull()][col]) - col_as_str = set(df_combined[df_combined[col].notnull()][col].astype(str)) + col_original = set(df_result[df_result[col].notnull()][col]) + col_as_str = set(df_result[df_result[col].notnull()][col].astype(str)) # is there a difference in the two sets (ignoring null values) assert ( len(col_original - col_as_str) == 0 @@ -33,7 +35,7 @@ def check_for_mixed_types(df_combined: pd.DataFrame): ), f"Mixed types in colum {col}: {col_as_str-col_original}" -def check_pairs_without_pchembl_are_in_drug_mechanisms(df_combined: pd.DataFrame): +def check_pairs_without_pchembl_are_in_drug_mechanisms(df_result: pd.DataFrame): """ Check that rows without a pchembl value based on binding+functional assays (pchembl_x_BF) are in the drug_mechanism table. @@ -47,15 +49,15 @@ def check_pairs_without_pchembl_are_in_drug_mechanisms(df_combined: pd.DataFrame "pchembl_value_max_BF", "pchembl_value_median_BF", ]: - assert df_combined[(df_combined[pchembl_col].isnull())].equals( - df_combined[ - (df_combined["pair_mutation_in_dm_table"] == True) - & (df_combined[pchembl_col].isnull()) + assert df_result[(df_result[pchembl_col].isnull())].equals( + df_result[ + (df_result["pair_mutation_in_dm_table"] == True) + & (df_result[pchembl_col].isnull()) ] ), f"Missing pchembl value in column {pchembl_col}" -def check_ligand_efficiency_metrics(df_combined: pd.DataFrame): +def check_ligand_efficiency_metrics(df_result: pd.DataFrame): """ Check that ligand efficiency metrics are only null when at least one of the values used to calculate them is null. @@ -63,39 +65,39 @@ def check_ligand_efficiency_metrics(df_combined: pd.DataFrame): one of the values used to calculate them is null. 
""" for suffix in ["BF", "B"]: - assert df_combined[(df_combined[f"LE_{suffix}"].isnull())].equals( - df_combined[ - (df_combined[f"pchembl_value_mean_{suffix}"].isnull()) - | (df_combined["heavy_atoms"].isnull()) - | (df_combined["heavy_atoms"] == 0) + assert df_result[(df_result[f"LE_{suffix}"].isnull())].equals( + df_result[ + (df_result[f"pchembl_value_mean_{suffix}"].isnull()) + | (df_result["heavy_atoms"].isnull()) + | (df_result["heavy_atoms"] == 0) ] ), f"Missing LE value in LE_{suffix}" - assert df_combined[(df_combined[f"BEI_{suffix}"].isnull())].equals( - df_combined[ - (df_combined[f"pchembl_value_mean_{suffix}"].isnull()) - | (df_combined["mw_freebase"].isnull()) - | (df_combined["mw_freebase"] == 0) + assert df_result[(df_result[f"BEI_{suffix}"].isnull())].equals( + df_result[ + (df_result[f"pchembl_value_mean_{suffix}"].isnull()) + | (df_result["mw_freebase"].isnull()) + | (df_result["mw_freebase"] == 0) ] ), f"Missing BEI value in BEI_{suffix}" - assert df_combined[(df_combined[f"SEI_{suffix}"].isnull())].equals( - df_combined[ - (df_combined[f"pchembl_value_mean_{suffix}"].isnull()) - | (df_combined["psa"].isnull()) - | (df_combined["psa"] == 0) + assert df_result[(df_result[f"SEI_{suffix}"].isnull())].equals( + df_result[ + (df_result[f"pchembl_value_mean_{suffix}"].isnull()) + | (df_result["psa"].isnull()) + | (df_result["psa"] == 0) ] ), f"Missing SEI value in SEI_{suffix}" - assert df_combined[(df_combined[f"LLE_{suffix}"].isnull())].equals( - df_combined[ - (df_combined[f"pchembl_value_mean_{suffix}"].isnull()) - | (df_combined["alogp"].isnull()) + assert df_result[(df_result[f"LLE_{suffix}"].isnull())].equals( + df_result[ + (df_result[f"pchembl_value_mean_{suffix}"].isnull()) + | (df_result["alogp"].isnull()) ] ), f"Missing LLE value in LLE_{suffix}" -def check_compound_props(df_combined: pd.DataFrame, df_cpd_props: pd.DataFrame): +def check_compound_props(dataset: Dataset): """ Check that compound props are only null if @@ -104,56 +106,65 @@ def check_compound_props(df_combined: pd.DataFrame, df_cpd_props: pd.DataFrame): """ # missing values because the parent_molregno is not in the compound props table no_cpd_prop_info = len( - df_combined[ - ~df_combined["parent_molregno"].isin(set(df_cpd_props["parent_molregno"])) + dataset.df_result[ + ~dataset.df_result["parent_molregno"].isin( + set(dataset.df_cpd_props["parent_molregno"]) + ) ] ) - for col in df_cpd_props.columns: + for col in dataset.df_cpd_props.columns: if col != "parent_molregno": # missing values because the compound props query returns null (exists but is null) missing_values = len( - df_combined[ - df_combined["parent_molregno"].isin( - set(df_cpd_props[df_cpd_props[col].isnull()]["parent_molregno"]) + dataset.df_result[ + dataset.df_result["parent_molregno"].isin( + set( + dataset.df_cpd_props[dataset.df_cpd_props[col].isnull()][ + "parent_molregno" + ] + ) ) ] ) null_values = no_cpd_prop_info + missing_values assert null_values == len( - df_combined[df_combined[col].isnull()] + dataset.df_result[dataset.df_result[col].isnull()] ), f"Too many null values in {col}" def check_atc_and_target_classes( - df_combined: pd.DataFrame, - atc_levels: pd.DataFrame, - target_classes_level1: pd.DataFrame, - target_classes_level2: pd.DataFrame, + dataset: Dataset, ): """ Check that atc_level1 and target class information is only null if the parent_molregno / target id is not in the respective table. 
""" - assert df_combined[(df_combined["atc_level1"].isnull())].equals( - df_combined[ - ~df_combined["parent_molregno"].isin(set(atc_levels["parent_molregno"])) + assert dataset.df_result[(dataset.df_result["atc_level1"].isnull())].equals( + dataset.df_result[ + ~dataset.df_result["parent_molregno"].isin( + set(dataset.atc_levels["parent_molregno"]) + ) ] ), "Null values in atc_level1 are not exclusively \ because the parent_molregno is not in the atc_classification table." - assert df_combined[(df_combined["target_class_l1"].isnull())].equals( - df_combined[~df_combined["tid"].isin(set(target_classes_level1["tid"]))] + assert dataset.df_result[(dataset.df_result["target_class_l1"].isnull())].equals( + dataset.df_result[ + ~dataset.df_result["tid"].isin(set(dataset.target_classes_level1["tid"])) + ] ), "Null values in target_class_l1 are not exclusively \ because the tid is not in the protein_classification table." - assert df_combined[(df_combined["target_class_l2"].isnull())].equals( - df_combined[~df_combined["tid"].isin(set(target_classes_level2["tid"]))] + assert dataset.df_result[(dataset.df_result["target_class_l2"].isnull())].equals( + dataset.df_result[ + ~dataset.df_result["tid"].isin(set(dataset.target_classes_level2["tid"])) + ] ), "Null values in target_class_l2 are not exclusively \ because the tid is not in the protein_classification table." -def check_rdkit_props(df_combined: pd.DataFrame): +def check_rdkit_props(df_result: pd.DataFrame): """ Check that columns set by the RDKit are only null if there is no canonical SMILES for the molecule. @@ -179,17 +190,13 @@ def check_rdkit_props(df_combined: pd.DataFrame): "aromatic_n", "aromatic_hetero", ]: - assert len(df_combined[df_combined[col].isnull()]) == len( - df_combined[df_combined["canonical_smiles"].isnull()].copy() + assert len(df_result[df_result[col].isnull()]) == len( + df_result[df_result["canonical_smiles"].isnull()].copy() ), f"Missing value in {col} despite a smiles being available." def sanity_checks( - df_combined: pd.DataFrame, - df_cpd_props: pd.DataFrame, - atc_levels: pd.DataFrame, - target_classes_level1: pd.DataFrame, - target_classes_level2: pd.DataFrame, + dataset: Dataset, calculate_rdkit: bool, ): """ @@ -208,32 +215,19 @@ def sanity_checks( - columns set by the RDKit are only null if there is no canonical SMILES for the molecule (excluding scaffolds) - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame - :param df_cpd_props: Pandas DataFrame with compound properties - and structures for all compound ids in ChEMBL. - :type df_cpd_props: pd.DataFrame - :param atc_levels: Pandas DataFrame with ATC annotations in ChEMBL - :type atc_levels: pd.DataFrame - :param target_classes_level1: Pandas DataFrame with mapping - from target id to level 1 target class - :type target_classes_level1: pd.DataFrame - :param target_classes_level2: Pandas DataFrame with mapping - from target id to level 2 target class - :type target_classes_level2: pd.DataFrame + :param dataset: Dataset with compound-target pairs. 
+ :type dataset: Dataset :param calculate_rdkit: True if the DataFrame contains RDKit-based compound properties :type calculate_rdkit: bool """ - check_null_values(df_combined) - check_for_mixed_types(df_combined) - check_pairs_without_pchembl_are_in_drug_mechanisms(df_combined) - check_ligand_efficiency_metrics(df_combined) - check_compound_props(df_combined, df_cpd_props) - check_atc_and_target_classes( - df_combined, atc_levels, target_classes_level1, target_classes_level2 - ) + check_null_values(dataset.df_result) + check_for_mixed_types(dataset.df_result) + check_pairs_without_pchembl_are_in_drug_mechanisms(dataset.df_result) + check_ligand_efficiency_metrics(dataset.df_result) + check_compound_props(dataset) + check_atc_and_target_classes(dataset) if calculate_rdkit: - check_rdkit_props(df_combined) + check_rdkit_props(dataset.df_result) ########### Sanity checks for writing and reading a dataset ########### diff --git a/src/write_subsets.py b/src/write_subsets.py index 5b0bc21..a5635a1 100644 --- a/src/write_subsets.py +++ b/src/write_subsets.py @@ -5,6 +5,7 @@ import get_stats from arguments import OutputArgs, CalculationArgs +from dataset import Dataset def write_output( @@ -73,15 +74,15 @@ def write_and_check_output( def write_full_dataset_to_file( - df_combined: pd.DataFrame, + dataset: Dataset, args: CalculationArgs, out: OutputArgs, ): """ If write_full_dataset, write df_combined with filtering columns to output_path. - :param df_combined: Pandas DataFrame with compound-target pairs and filtering columns - :type df_combined: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + :type dataset: Dataset :param args: Arguments related to how to calculate the dataset :type args: CalculationArgs :param out: Arguments related to how to output the dataset @@ -93,18 +94,18 @@ def write_full_dataset_to_file( out.output_path, f"ChEMBL{args.chembl_version}_CTI_{args.limited_flag}_full_dataset", ) - write_and_check_output(df_combined, name_all, desc, args, out) + write_and_check_output(dataset.df_result, name_all, desc, args, out) def output_debug_sizes( - df_sizes: list[list[int], list[int]], + dataset: Dataset, out: OutputArgs, ): """ Output counts at various points during calculating the final dataset for debugging. - :param df_sizes: List of intermediate sized of the dataset used for debugging. - :type df_sizes: list[list[int], list[int]] + :param dataset: Dataset with compound-target pairs and debugging sizes. + :type dataset: Dataset :param args: Arguments related to how to calculate the dataset :type args: CalculationArgs :param out: Arguments related to how to output the dataset @@ -125,7 +126,7 @@ def output_debug_sizes( ] logging.debug("Size of full dataset at different points.") - full_df_sizes = pd.DataFrame(df_sizes[0], columns=column_names) + full_df_sizes = pd.DataFrame(dataset.df_sizes_all, columns=column_names) logging.debug(full_df_sizes) name_full_df_sizes = os.path.join(out.output_path, "debug_full_df_sizes") write_output( @@ -139,7 +140,7 @@ def output_debug_sizes( "This includes data for which we only have pchembl data \ for functional assays but not for binding assays." 
) - df_pchembl_sizes = pd.DataFrame(df_sizes[1], columns=column_names) + df_pchembl_sizes = pd.DataFrame(dataset.df_sizes_pchembl, columns=column_names) logging.debug(df_pchembl_sizes) name_pchembl_df_sizes = os.path.join(out.output_path, "debug_pchembl_df_sizes") write_output( @@ -208,14 +209,12 @@ def output_stats( ) -def output_all_stats( - df_combined_annotated: pd.DataFrame, args: CalculationArgs, out: OutputArgs -): +def output_all_stats(dataset: Dataset, args: CalculationArgs, out: OutputArgs): """ Output stats for all datasets and subsets calculated. - :param df_combined_annotated: Pandas DataFrame with additional filtering columns - :type df_combined_annotated: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + :type dataset: Dataset :param args: Arguments related to how to calculate the dataset :type args: CalculationArgs :param out: Arguments related to how to output the dataset @@ -226,7 +225,7 @@ def output_all_stats( f"ChEMBL{args.chembl_version}_CTI_{args.limited_flag}_full_dataset_stats", ) - output_stats(df_combined_annotated, output_file, out) + output_stats(dataset.df_result, output_file, out) if out.write_bf: output_file = os.path.join( @@ -236,7 +235,7 @@ def output_all_stats( f"BF_{args.min_nof_cpds_bf}_c_dt_d_dt_stats", ) output_stats( - df_combined_annotated[df_combined_annotated["BF_100_c_dt_d_dt"]], + dataset.df_result[dataset.df_result["BF_100_c_dt_d_dt"]], output_file, out, ) @@ -249,7 +248,7 @@ def output_all_stats( f"B_{args.min_nof_cpds_b}_c_dt_d_dt_stats", ) output_stats( - df_combined_annotated[df_combined_annotated["B_100_c_dt_d_dt"]], + dataset.df_result[dataset.df_result["B_100_c_dt_d_dt"]], output_file, out, ) From a4721c664830bede376b8f278cba2e05de4c4aae Mon Sep 17 00:00:00 2001 From: Lina Heinzke Date: Tue, 20 Feb 2024 20:38:51 +0000 Subject: [PATCH 5/8] Fix handling truth values of filtering columns in pandas --- src/add_dti_annotations.py | 4 ++-- src/add_filtering_columns.py | 10 ++++------ src/get_drug_mechanism_ct_pairs.py | 2 +- src/sanity_checks.py | 2 +- 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/src/add_dti_annotations.py b/src/add_dti_annotations.py index 533aa2d..6596535 100644 --- a/src/add_dti_annotations.py +++ b/src/add_dti_annotations.py @@ -118,7 +118,7 @@ def add_dti_annotations( dataset.drug_mechanism_pairs_set ) ) - & (dataset.df_result["therapeutic_target"] == True) + & (dataset.df_result["therapeutic_target"]) ), "DTI", ] = "DT" @@ -133,7 +133,7 @@ def add_dti_annotations( dataset.drug_mechanism_pairs_set ) ) - & (dataset.df_result["therapeutic_target"] == False) + & ~(dataset.df_result["therapeutic_target"]) ), "DTI", ] = "NDT" diff --git a/src/add_filtering_columns.py b/src/add_filtering_columns.py index c26a721..88ce052 100644 --- a/src/add_filtering_columns.py +++ b/src/add_filtering_columns.py @@ -159,9 +159,9 @@ def add_subset_filtering_columns( dataset.df_result[col_name] = False dataset.df_result.loc[(dataset.df_result.index.isin(df.index)), col_name] = True # check that filtering works - assert dataset.df_result[dataset.df_result[col_name] == True][ - df.columns - ].equals(df), f"Filtering is not accurate for {col_name}." + assert dataset.df_result[dataset.df_result[col_name]][df.columns].equals( + df + ), f"Filtering is not accurate for {col_name}." 
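
The assert closing this hunk encodes a useful invariant: a boolean filtering column must reproduce exactly the subset it was derived from. The same round trip in miniature (toy frame):

    import pandas as pd

    df = pd.DataFrame({"pchembl": [7.0, None, 6.5]}, index=[10, 11, 12])
    subset = df[df["pchembl"].notnull()]

    df["in_subset"] = False
    df.loc[df.index.isin(subset.index), "in_subset"] = True

    # the boolean column must recover the subset exactly
    assert df[df["in_subset"]][subset.columns].equals(subset)
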
if logging.DEBUG >= logging.root.level: for [df_subset, subset_desc] in subsets: @@ -200,9 +200,7 @@ def add_filtering_columns( # consider only binding assays # assay description = binding desc = "B" - df_combined_subset = dataset.df_result[ - dataset.df_result["keep_for_binding"] == True - ].copy() + df_combined_subset = dataset.df_result[dataset.df_result["keep_for_binding"]].copy() add_subset_filtering_columns( df_combined_subset, dataset, diff --git a/src/get_drug_mechanism_ct_pairs.py b/src/get_drug_mechanism_ct_pairs.py index ebabf45..5f1fa7c 100644 --- a/src/get_drug_mechanism_ct_pairs.py +++ b/src/get_drug_mechanism_ct_pairs.py @@ -314,7 +314,7 @@ def add_drug_mechanism_ct_pairs(dataset: Dataset, chembl_con: sqlite3.Connection dataset.df_result.loc[ ( (dataset.df_result["pchembl_value_mean_B"].notnull()) - | (dataset.df_result["pair_mutation_in_dm_table"] == True) + | (dataset.df_result["pair_mutation_in_dm_table"]) ), "keep_for_binding", ] = True diff --git a/src/sanity_checks.py b/src/sanity_checks.py index 8b74546..ee8844d 100644 --- a/src/sanity_checks.py +++ b/src/sanity_checks.py @@ -51,7 +51,7 @@ def check_pairs_without_pchembl_are_in_drug_mechanisms(df_result: pd.DataFrame): ]: assert df_result[(df_result[pchembl_col].isnull())].equals( df_result[ - (df_result["pair_mutation_in_dm_table"] == True) + (df_result["pair_mutation_in_dm_table"]) & (df_result[pchembl_col].isnull()) ] ), f"Missing pchembl value in column {pchembl_col}" From fe2a49a4108134cc6cffcf196d8c6e4e1004ffe7 Mon Sep 17 00:00:00 2001 From: Lina Heinzke Date: Tue, 20 Feb 2024 23:41:07 +0000 Subject: [PATCH 6/8] Remove unnecessary variables from Dataset --- src/add_chembl_compound_properties.py | 80 ++++----- src/add_chembl_target_class_annotations.py | 104 ++++++++---- src/add_dti_annotations.py | 4 + src/add_rdkit_compound_descriptors.py | 2 + src/clean_dataset.py | 4 +- src/dataset.py | 15 +- src/get_activity_ct_pairs.py | 38 ++++- src/get_dataset.py | 2 +- src/get_drug_mechanism_ct_pairs.py | 77 +++++---- src/sanity_checks.py | 185 ++++++++++----------- 10 files changed, 285 insertions(+), 226 deletions(-) diff --git a/src/add_chembl_compound_properties.py b/src/add_chembl_compound_properties.py index 4e5d623..879c8dc 100644 --- a/src/add_chembl_compound_properties.py +++ b/src/add_chembl_compound_properties.py @@ -3,12 +3,13 @@ import pandas as pd from dataset import Dataset +import sanity_checks ########### Add Compound Properties Based on ChEMBL Data ########### -def add_first_publication_date( - dataset: Dataset, chembl_con: sqlite3.Connection, limit_to_literature: bool -): +def get_first_publication_cpd_date( + chembl_con: sqlite3.Connection, limit_to_literature: bool +) -> pd.DataFrame: """ Query and calculate the first publication of a compound based on ChEMBL data (column name: first_publication_cpd). @@ -16,13 +17,12 @@ def add_first_publication_date( of the compound in the literature according to ChEMBL. Otherwise this is the first appearance in any source in ChEMBL. - :param dataset: Dataset with compound-target pairs. - Will be updated to include first_publication_cpd - :type dataset: Dataset :param chembl_con: Sqlite3 connection to ChEMBL database. :type chembl_con: sqlite3.Connection :param limit_to_literature: Base first_publication_cpd on literature sources only if True. :type limit_to_literature: bool + :return: Pandas DataFrame with parent_molregno and first_publication_cpd from ChEMBL. 
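
The recurring guard if logging.DEBUG >= logging.root.level: (used throughout the pipeline and wrapped by add_debugging_info in a later commit) is a plain numeric comparison against the root logger's configured level; a minimal demonstration:

    import logging

    logging.basicConfig(level=logging.DEBUG)

    # logging.DEBUG == 10, logging.INFO == 20; root.level is whatever was configured
    if logging.DEBUG >= logging.root.level:
        print("debug-only bookkeeping runs")
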
+ :rtype: pd.DataFrame """ # information about salts is aggregated in the parent sql = """ @@ -43,26 +43,21 @@ def add_first_publication_date( ].transform("min") df_docs = df_docs[["parent_molregno", "first_publication_cpd"]].drop_duplicates() - dataset.df_result = dataset.df_result.merge( - df_docs, on="parent_molregno", how="left" - ) + return df_docs -def add_chembl_properties_and_structures( - dataset: Dataset, chembl_con: sqlite3.Connection -): +def get_chembl_properties_and_structures( + chembl_con: sqlite3.Connection, +) -> pd.DataFrame: """ - Add compound properties from the compound_properties table + Get compound properties from the compound_properties table (e.g., alogp, #hydrogen bond acceptors / donors, etc.). - Add InChI, InChI key and canonical smiles. + Get InChI, InChI key and canonical smiles. - :param dataset: Dataset with compound-target pairs. - Will be updated to include compound properties and structures. - dataset.df_cpd_props will be set to - compound properties and structures for all compound ids in ChEMBL. - :type dataset: Dataset :param chembl_con: Sqlite3 connection to ChEMBL database. :type chembl_con: sqlite3.Connection + :return: Pandas DataFrame with compound properties and structures for all compound ids in ChEMBL + :rtype: pd.DataFrame """ sql = """ SELECT DISTINCT mh.parent_molregno, @@ -79,16 +74,13 @@ def add_chembl_properties_and_structures( """ df_cpd_props = pd.read_sql_query(sql, con=chembl_con) - dataset.df_cpd_props = df_cpd_props - dataset.df_result = dataset.df_result.merge( - df_cpd_props, on="parent_molregno", how="left" - ) + return df_cpd_props -def add_ligand_efficiency_metrics(dataset: Dataset): +def calculate_ligand_efficiency_metrics(dataset: Dataset): """ - Calculate the ligand efficiency metrics for the compounds + Calculate and add the ligand efficiency metrics for the compounds based on the mean pchembl values for a compound-target pair and the following ligand efficiency (LE) formulas: @@ -150,20 +142,18 @@ def add_ligand_efficiency_metrics(dataset: Dataset): ) -def add_atc_classification(dataset: Dataset, chembl_con: sqlite3.Connection): +def get_atc_classification(chembl_con: sqlite3.Connection) -> pd.DataFrame: """ - Query and add ATC classifications (level 1) from the atc_classification and + Query ATC classifications (level 1) from the atc_classification and molecule_atc_classification tables. ATC level annotations for the same parent_molregno are combined into one description that concatenates all descriptions sorted alphabetically into one string with ' | ' as a separator. - :param dataset: Dataset with compound-target pairs. - Will be updated to include ATC classifications. - dataset.atc_levels will be set to ATC annotations in ChEMBL. - :type dataset: Dataset :param chembl_con: Sqlite3 connection to ChEMBL database. :type chembl_con: sqlite3.Connection + :return: Pandas DataFrame with ATC annotations in ChEMBL. 
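
get_first_publication_cpd_date above reduces per-document years to one value per parent compound with a grouped transform followed by de-duplication. The core idiom on toy data:

    import pandas as pd

    df_docs = pd.DataFrame({"parent_molregno": [1, 1, 2], "year": [2003, 1999, 2010]})
    df_docs["first_publication_cpd"] = df_docs.groupby("parent_molregno")[
        "year"
    ].transform("min")
    df_docs = df_docs[["parent_molregno", "first_publication_cpd"]].drop_duplicates()
    # -> parent 1: 1999, parent 2: 2010
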
+ :rtype: pd.DataFrame """ sql = """ SELECT DISTINCT mh.parent_molregno, atc.level1, atc.level1_description @@ -186,11 +176,7 @@ def add_atc_classification(dataset: Dataset, chembl_con: sqlite3.Connection): ].transform(lambda x: between_str_join.join(sorted(x))) atc_levels = atc_levels[["parent_molregno", "atc_level1"]].drop_duplicates() - dataset.atc_levels = atc_levels - - dataset.df_result = dataset.df_result.merge( - atc_levels, on="parent_molregno", how="left" - ) + return atc_levels def add_all_chembl_compound_properties( @@ -214,10 +200,24 @@ def add_all_chembl_compound_properties( Base it on all available sources otherwise. :type limit_to_literature: bool """ - add_first_publication_date(dataset, chembl_con, limit_to_literature) + df_docs = get_first_publication_cpd_date(chembl_con, limit_to_literature) + dataset.df_result = dataset.df_result.merge( + df_docs, on="parent_molregno", how="left" + ) - add_chembl_properties_and_structures(dataset, chembl_con) + df_cpd_props = get_chembl_properties_and_structures(chembl_con) + dataset.df_cpd_props = df_cpd_props + dataset.df_result = dataset.df_result.merge( + df_cpd_props, on="parent_molregno", how="left" + ) + sanity_checks.check_compound_props(dataset.df_result, df_cpd_props) - add_ligand_efficiency_metrics(dataset) + calculate_ligand_efficiency_metrics(dataset) + sanity_checks.check_ligand_efficiency_metrics(dataset.df_result) - add_atc_classification(dataset, chembl_con) + atc_levels = get_atc_classification(chembl_con) + dataset.atc_levels = atc_levels + dataset.df_result = dataset.df_result.merge( + atc_levels, on="parent_molregno", how="left" + ) + sanity_checks.check_atc(dataset.df_result, atc_levels) diff --git a/src/add_chembl_target_class_annotations.py b/src/add_chembl_target_class_annotations.py index 0cb388c..d9aca47 100644 --- a/src/add_chembl_target_class_annotations.py +++ b/src/add_chembl_target_class_annotations.py @@ -7,6 +7,7 @@ import write_subsets from arguments import OutputArgs, CalculationArgs from dataset import Dataset +import sanity_checks ########### Add Target Class Annotations Based on ChEMBL Data ########### @@ -80,44 +81,31 @@ def get_target_class_table( return df_target_classes -def add_chembl_target_class_annotations( +def get_aggregated_target_classes( dataset: Dataset, chembl_con: sqlite3.Connection, - args: CalculationArgs, - out: OutputArgs, -): +) -> tuple[pd.DataFrame, pd.DataFrame]: """ - Add level 1 and 2 target class annotations. - Assignments for target IDs with more than one target class assignment per level - are summarised into one string with '|' as a separator - between the different target class annotations. - - Targets with more than one level 1 / level 2 target class assignment are written to a file. - These could be reassigned by hand if a single target class is preferable. + Get mappings for target id to aggregated level 1 / level 2 target class. :param dataset: Dataset with compound-target pairs. - Will be updated to only include target class annotations. - dataset.target_classes_level1 will be set to - pandas DataFrame with mapping from target id to level 1 target class - dataset.target_classes_level2 will be set to - pandas DataFrame with mapping from target id to level 2 target class :type dataset: Dataset :param chembl_con: Sqlite3 connection to ChEMBL database. 
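
get_atc_classification above uses the same grouped-transform idiom to concatenate all level 1 descriptions per compound, sorted and joined with ' | '. A sketch with made-up descriptions:

    import pandas as pd

    atc = pd.DataFrame(
        {
            "parent_molregno": [1, 1, 2],
            "level1_description": ["NERVOUS SYSTEM", "CARDIOVASCULAR SYSTEM", "DERMATOLOGICALS"],
        }
    )
    between_str_join = " | "
    atc["atc_level1"] = atc.groupby("parent_molregno")["level1_description"].transform(
        lambda x: between_str_join.join(sorted(x))
    )
    atc = atc[["parent_molregno", "atc_level1"]].drop_duplicates()
    # -> compound 1: "CARDIOVASCULAR SYSTEM | NERVOUS SYSTEM"
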
:type chembl_con: sqlite3.Connection
-    :param args: Arguments related to how to calculate the dataset
-    :type args: CalculationArgs
-    :param out: Arguments related to how to output the dataset
-    :type out: OutputArgs
+    :return: [pandas DataFrame with mapping from target id to level 1 target class,
+        pandas DataFrame with mapping from target id to level 2 target class]
+    :rtype: tuple[pd.DataFrame, pd.DataFrame]
     """
     current_tids = set(dataset.df_result["tid"])
     df_target_classes = get_target_class_table(chembl_con, current_tids)
 
+    between_str_join = "|"
+
     # Summarise the information for a target id with
     # several assigned target classes of level 1 into one description.
     # If a target id has more than one assigned target class,
     # the target class 'Unclassified protein' is discarded.
     level = "l1"
-    between_str_join = "|"
     target_classes_level1 = df_target_classes[["tid", level]].drop_duplicates().dropna()
 
     # remove 'Unclassified protein' from targets with more than one target class, level 1
@@ -145,10 +133,6 @@ def add_chembl_target_class_annotations(
         ["tid", "target_class_l1"]
     ].drop_duplicates()
 
-    dataset.df_result = dataset.df_result.merge(
-        target_classes_level1, on="tid", how="left"
-    )
-
     # Repeat the summary step for target classes of level 2.
     level = "l2"
     target_classes_level2 = df_target_classes[["tid", level]].drop_duplicates().dropna()
@@ -159,11 +143,24 @@ def add_chembl_target_class_annotations(
         ["tid", "target_class_l2"]
     ].drop_duplicates()
 
-    dataset.df_result = dataset.df_result.merge(
-        target_classes_level2, on="tid", how="left"
-    )
+    return target_classes_level1, target_classes_level2
+
 
-    # Output targets have more than one target class assignment
+def output_ambiguous_target_classes(
+    dataset: Dataset,
+    args: CalculationArgs,
+    out: OutputArgs,
+):
+    """
+    Output targets that have more than one target class assignment.
+
+    :param dataset: Dataset with compound-target pairs.
+    :type dataset: Dataset
+    :param args: Arguments related to how to calculate the dataset
+    :type args: CalculationArgs
+    :param out: Arguments related to how to output the dataset
+    :type out: OutputArgs
+    """
     more_than_one_level_1 = dataset.df_result[
         (dataset.df_result["target_class_l1"].notnull())
         & (dataset.df_result["target_class_l1"].str.contains("|", regex=False))
@@ -203,5 +200,50 @@ def add_chembl_target_class_annotations(
         out,
     )
 
-    dataset.target_classes_level1 = target_classes_level1
-    dataset.target_classes_level2 = target_classes_level2
+
+def add_chembl_target_class_annotations(
+    dataset: Dataset,
+    chembl_con: sqlite3.Connection,
+    args: CalculationArgs,
+    out: OutputArgs,
+):
+    """
+    Add level 1 and 2 target class annotations.
+    Assignments for target IDs with more than one target class assignment per level
+    are summarised into one string with '|' as a separator
+    between the different target class annotations.
+
+    Targets with more than one level 1 / level 2 target class assignment are written to a file.
+    These could be reassigned by hand if a single target class is preferable.
+
+    :param dataset: Dataset with compound-target pairs.
+        Will be updated to include target class annotations
+        (columns target_class_l1 and target_class_l2 in dataset.df_result).
+    :type dataset: Dataset
+    :param chembl_con: Sqlite3 connection to ChEMBL database.
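
get_aggregated_target_classes above does two things per level: drop 'Unclassified protein' for targets that also have a real class, then join the remaining classes with '|'. A condensed sketch of that logic (toy data; the real code works on the df_target_classes query result):

    import pandas as pd

    tc = pd.DataFrame(
        {"tid": [1, 1, 2], "l1": ["Enzyme", "Unclassified protein", "Ion channel"]}
    )

    # discard 'Unclassified protein' only where a target has more than one class
    n_classes = tc.groupby("tid")["l1"].transform("nunique")
    tc = tc[~((n_classes > 1) & (tc["l1"] == "Unclassified protein"))]

    tc["target_class_l1"] = tc.groupby("tid")["l1"].transform(
        lambda x: "|".join(sorted(set(x)))
    )
    tc = tc[["tid", "target_class_l1"]].drop_duplicates()
    # -> tid 1: "Enzyme", tid 2: "Ion channel"
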
+ :type chembl_con: sqlite3.Connection + :param args: Arguments related to how to calculate the dataset + :type args: CalculationArgs + :param out: Arguments related to how to output the dataset + :type out: OutputArgs + """ + target_classes_level1, target_classes_level2 = get_aggregated_target_classes( + dataset, chembl_con + ) + + dataset.df_result = dataset.df_result.merge( + target_classes_level1, on="tid", how="left" + ) + + dataset.df_result = dataset.df_result.merge( + target_classes_level2, on="tid", how="left" + ) + + sanity_checks.check_target_classes( + dataset.df_result, target_classes_level1, target_classes_level2 + ) + + output_ambiguous_target_classes(dataset, args, out) diff --git a/src/add_dti_annotations.py b/src/add_dti_annotations.py index 6596535..b1fdda3 100644 --- a/src/add_dti_annotations.py +++ b/src/add_dti_annotations.py @@ -80,6 +80,7 @@ def add_dti_annotations( ), "DTI", ] = "D_DT" + dataset.df_result.loc[ ( dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set) @@ -87,6 +88,7 @@ def add_dti_annotations( ), "DTI", ] = "C3_DT" + dataset.df_result.loc[ ( dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set) @@ -94,6 +96,7 @@ def add_dti_annotations( ), "DTI", ] = "C2_DT" + dataset.df_result.loc[ ( dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set) @@ -101,6 +104,7 @@ def add_dti_annotations( ), "DTI", ] = "C1_DT" + # Compounds that are in the drug_mechanism table but don't have a known phase between 1-4: dataset.df_result.loc[ ( diff --git a/src/add_rdkit_compound_descriptors.py b/src/add_rdkit_compound_descriptors.py index 1bc9268..1d9ccc3 100644 --- a/src/add_rdkit_compound_descriptors.py +++ b/src/add_rdkit_compound_descriptors.py @@ -4,6 +4,7 @@ from tqdm import tqdm from dataset import Dataset +import sanity_checks def add_built_in_descriptors(dataset: Dataset): @@ -168,3 +169,4 @@ def add_rdkit_compound_descriptors(dataset: Dataset): """ add_built_in_descriptors(dataset) add_aromaticity_descriptors(dataset) + sanity_checks.check_rdkit_props(dataset.df_result) diff --git a/src/clean_dataset.py b/src/clean_dataset.py index 6efda90..d20c1de 100644 --- a/src/clean_dataset.py +++ b/src/clean_dataset.py @@ -6,6 +6,7 @@ from dataset import Dataset +########### Remove Irrelevant Compounds ########### def remove_compounds_without_smiles_and_mixtures( dataset: Dataset, chembl_con: sqlite3.Connection ): @@ -96,9 +97,8 @@ def remove_compounds_without_smiles_and_mixtures( ) ] - return dataset.df_result - +########### General Cleaning Steps ########### def clean_none_values(dataset: Dataset): """ Change nan values and empty strings to None for consistency. 
diff --git a/src/dataset.py b/src/dataset.py
index 8b3d29f..2a39237 100644
--- a/src/dataset.py
+++ b/src/dataset.py
@@ -7,22 +7,15 @@ class Dataset:
     """
     df_result: Pandas DataFrame with the full dataset
+    drug_mechanism_pairs_set: Set of compound-target pairs in the drug_mechanism table,
+        used for DTI assignments
+    drug_mechanism_targets_set: Set of targets in the drug_mechanism table,
+        used for DTI assignments
     df_sizes_all: List of intermediate sized of the dataset used for debugging
     df_sizes_pchembl: List of intermediate sized of the dataset used for debugging
-    drug_mechanism_pairs_set: Set of compound-target pairs in the drug_mechanism table
-    drug_mechanism_targets_set: Set of targets in the drug_mechanism table
-    df_cpd_props: Pandas DataFrame with compound properties and
-        structures for all compound ids in ChEMBL
-    atc_levels: Pandas DataFrame with ATC annotations in ChEMBL
-    target_classes_level1: Pandas DataFrame with mapping from target id to level 1 target class
-    target_classes_level2: Pandas DataFrame with mapping from target id to level 2 target class
     """
 
     df_result: pd.DataFrame
-    df_cpd_props: pd.DataFrame
-    atc_levels: pd.DataFrame
-    target_classes_level1: pd.DataFrame
-    target_classes_level2: pd.DataFrame
     drug_mechanism_pairs_set: set
     drug_mechanism_targets_set: set
     df_sizes_all: list[int]
diff --git a/src/get_activity_ct_pairs.py b/src/get_activity_ct_pairs.py
index ba6811e..10cdef4 100644
--- a/src/get_activity_ct_pairs.py
+++ b/src/get_activity_ct_pairs.py
@@ -163,10 +163,10 @@ def get_average_info(df: pd.DataFrame, suffix: str) -> pd.DataFrame:
 
 ########### Get Aggregated Compound-Target Pair Information ###########
-def get_aggregated_activity_ct_pairs(
+def get_aggregated_compound_target_pairs_with_pchembl(
     chembl_con: sqlite3.Connection,
     limit_to_literature: bool,
-) -> Dataset:
+) -> pd.DataFrame:
     """
     Get dataset of compound target-pairs with an associated pchembl value
     with pchembl and publication dates aggregated into one entry per pair.
@@ -186,9 +186,9 @@ def get_aggregated_activity_ct_pairs(
     :param limit_to_literature: Include only literature sources if True.
         Include all available sources otherwise.
     :type limit_to_literature: bool
-    :return: Dataset with a pandas Dataframe with compound-target pairs
+    :return: Pandas Dataframe with compound-target pairs
         based on ChEMBL activity data aggregated into one entry per compound-target pair.
-    :rtype: Dataset
+    :rtype: pd.DataFrame
     """
     df_mols = get_compound_target_pairs_with_pchembl(
         chembl_con,
@@ -222,12 +222,32 @@ def get_aggregated_activity_ct_pairs(
         how="left",
     )
 
+    return df_combined
+
+
+def get_aggregated_activity_ct_pairs(
+    chembl_con: sqlite3.Connection,
+    limit_to_literature: bool,
+) -> Dataset:
+    """
+    Wrapper for get_aggregated_compound_target_pairs_with_pchembl,
+    initialising a dataset.
+
+    :param chembl_con: Sqlite3 connection to ChEMBL database.
+    :type chembl_con: sqlite3.Connection
+    :param limit_to_literature: Include only literature sources if True.
+        Include all available sources otherwise.
+    :type limit_to_literature: bool
+    :return: Dataset with a pandas Dataframe with compound-target pairs
+        based on ChEMBL activity data aggregated into one entry per compound-target pair.
+    :rtype: Dataset
+    """
+    df_result = get_aggregated_compound_target_pairs_with_pchembl(
+        chembl_con, limit_to_literature
+    )
+
     dataset = Dataset(
-        df_combined,
-        pd.DataFrame(),
-        pd.DataFrame(),
-        pd.DataFrame(),
-        pd.DataFrame(),
+        df_result,
         set(),
         set(),
         [],
diff --git a/src/get_dataset.py b/src/get_dataset.py
index c9966d5..8717cce 100644
--- a/src/get_dataset.py
+++ b/src/get_dataset.py
@@ -71,7 +71,7 @@ def get_ct_pair_dataset(
     get_stats.add_debugging_info(dataset, dataset.df_result, "clean df")
 
     logging.info("sanity_checks")
-    sanity_checks.sanity_checks(dataset, args.calculate_rdkit)
+    sanity_checks.sanity_checks(dataset)
 
     logging.info("add_filtering_columns")
     add_filtering_columns.add_filtering_columns(dataset, args, out)
diff --git a/src/get_drug_mechanism_ct_pairs.py b/src/get_drug_mechanism_ct_pairs.py
index 5f1fa7c..35b430f 100644
--- a/src/get_drug_mechanism_ct_pairs.py
+++ b/src/get_drug_mechanism_ct_pairs.py
@@ -4,6 +4,7 @@
 import pandas as pd
 
 from dataset import Dataset
+import sanity_checks
 
 
 ########### Extract Drug-Target Interactions From the drug_mechanism Table ###########
@@ -241,29 +242,16 @@ def get_drug_mechanism_ct_pairs(chembl_con: sqlite3.Connection) -> pd.DataFrame:
 
 ########### Add Compounds From the drug_mechanism Table to the Dataset ###########
-def add_drug_mechanism_ct_pairs(dataset: Dataset, chembl_con: sqlite3.Connection):
+def add_dm_filtering_columns(dataset: Dataset):
     """
-    Add compound-target pairs from the drug_mechanism table
-    that are not in the dataset based on the initial ChEMBL query.
-    These are compound-target pairs for which there is no associated pchembl value data.
-    Since the pairs are known interactions,
-    they are added to the dataset despite not having a pchembl value.
-    Add the set of compound-target pairs in the drug_mechanism table and
-    the set of targets in the drug_mechanism table to the dataset.
+    Add filtering columns related to the drug_mechanism table.
+    - pair_mutation_in_dm_table: pair is in dm table (incl. mutations)
+    - pair_in_dm_table: pair is in dm table (excl. mutations)
+    - keep_for_binding: use to limit to binding assays
 
     :param dataset: Dataset with compound-target pairs based on ChEMBL activity data
     :type dataset: Dataset
-    :param chembl_con: Sqlite3 connection to ChEMBL database.
-    :type chembl_con: sqlite3.Connection
     """
-    cpd_target_pairs = get_drug_mechanism_ct_pairs(chembl_con)
-    dataset.drug_mechanism_pairs_set = set(
-        f"{a}_{b}"
-        for a, b in zip(cpd_target_pairs["parent_molregno"], cpd_target_pairs["tid"])
-    )
-
-    dataset.drug_mechanism_targets_set = set(cpd_target_pairs["tid"])
-
     # Add a new column *pair_mutation_in_dm_table* which is set to True if the compound target pair
     # (taking mutation annotations into account) is in the drug_mechanism table.
     dataset.df_result["pair_mutation_in_dm_table"] = False
     dataset.df_result.loc[
         (
             dataset.df_result["cpd_target_pair_mutation"].isin(
                 dataset.drug_mechanism_pairs_set
             )
         ),
         "pair_mutation_in_dm_table",
     ] = True
 
     # Add a new column *pair_in_dm_table* which is set to True if the compound target pair
     # (NOT taking mutation annotations into account) is in the drug_mechanism table.
     dataset.df_result["pair_in_dm_table"] = False
     dataset.df_result.loc[
         (dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set)),
         "pair_in_dm_table",
     ] = True
 
+    # Add a new column *keep_for_binding* which is set to True if the row should be kept
+    # if you want to limit the dataset to only data based on binding assays.
+    # Rows are kept if
+    # - there is a binding data-based pchembl value or
+    # - the compound-target pair (including mutation info) is in the drug_mechanism table
+    dataset.df_result["keep_for_binding"] = False
+    dataset.df_result.loc[
+        (
+            (dataset.df_result["pchembl_value_mean_B"].notnull())
+            | (dataset.df_result["pair_mutation_in_dm_table"])
+        ),
+        "keep_for_binding",
+    ] = True
+
+
+def add_drug_mechanism_ct_pairs(dataset: Dataset, chembl_con: sqlite3.Connection):
+    """
+    Add compound-target pairs from the drug_mechanism table
+    that are not in the dataset based on the initial ChEMBL query.
+    These are compound-target pairs for which there is no associated pchembl value data.
+    Since the pairs are known interactions,
+    they are added to the dataset despite not having a pchembl value.
+    Add the set of compound-target pairs in the drug_mechanism table and
+    the set of targets in the drug_mechanism table to the dataset.
+
+    :param dataset: Dataset with compound-target pairs based on ChEMBL activity data
+    :type dataset: Dataset
+    :param chembl_con: Sqlite3 connection to ChEMBL database.
+    :type chembl_con: sqlite3.Connection
+    """
+    cpd_target_pairs = get_drug_mechanism_ct_pairs(chembl_con)
+    dataset.drug_mechanism_pairs_set = set(
+        f"{a}_{b}"
+        for a, b in zip(cpd_target_pairs["parent_molregno"], cpd_target_pairs["tid"])
+    )
+    dataset.drug_mechanism_targets_set = set(cpd_target_pairs["tid"])
+
     ##### Limit the drug_mechanism pairs to the ones that are not yet in the dataset. #####
     # Mutation annotations are taken into account.
     # Therefore, *(cpd A, target B without mutation)* will be added
@@ -305,16 +330,6 @@ def add_drug_mechanism_ct_pairs(dataset: Dataset, chembl_con: sqlite3.Connection
     # Combined data of existing query with new compound-target pairs.
     dataset.df_result = pd.concat([dataset.df_result, cpd_target_pairs])
 
-    # Add a new column *keep_for_binding* which is set to True if the row should be kept
-    # if you want to limit the dataset to only data based on binding assays.
-    # Rows are kept if
-    # - there is a binding data-based pchembl value or
-    # - the compound-target pair (including mutation info) is in the drug_mechanism table
-    dataset.df_result["keep_for_binding"] = False
-    dataset.df_result.loc[
-        (
-            (dataset.df_result["pchembl_value_mean_B"].notnull())
-            | (dataset.df_result["pair_mutation_in_dm_table"])
-        ),
-        "keep_for_binding",
-    ] = True
+    add_dm_filtering_columns(dataset)
+
+    sanity_checks.check_pairs_without_pchembl_are_in_drug_mechanisms(dataset.df_result)
diff --git a/src/sanity_checks.py b/src/sanity_checks.py
index ee8844d..ad94c89 100644
--- a/src/sanity_checks.py
+++ b/src/sanity_checks.py
@@ -3,38 +3,7 @@
 from dataset import Dataset
 
 
-########### Sanity checks for the dataset ###########
-def check_null_values(df_result: pd.DataFrame):
-    """
-    Check if any columns contain nan or null which aren't recognised as null values.
-    """
-    for col in df_result.columns:
-        col_as_str = set(df_result[df_result[col].notnull()][col].astype(str))
-        assert (
-            "nan" not in col_as_str
-        ), f"Problem with unrecognised nan value in column {col}"
-        assert (
-            "null" not in col_as_str
-        ), f"Problem with unrecognised null value in column {col}"
-
-
-def check_for_mixed_types(df_result: pd.DataFrame):
-    """
-    Check that there are no mixed types in columns with dtype=object.
- """ - for col, dtype in df_result.dtypes.to_dict().items(): - if dtype == object: - col_original = set(df_result[df_result[col].notnull()][col]) - col_as_str = set(df_result[df_result[col].notnull()][col].astype(str)) - # is there a difference in the two sets (ignoring null values) - assert ( - len(col_original - col_as_str) == 0 - ), f"Mixed types in colum {col}: {col_original-col_as_str}" - assert ( - len(col_as_str - col_original) == 0 - ), f"Mixed types in colum {col}: {col_as_str-col_original}" - - +########### Sanity checks during assignments ########### def check_pairs_without_pchembl_are_in_drug_mechanisms(df_result: pd.DataFrame): """ Check that rows without a pchembl value based on binding+functional assays (pchembl_x_BF) @@ -57,6 +26,36 @@ def check_pairs_without_pchembl_are_in_drug_mechanisms(df_result: pd.DataFrame): ), f"Missing pchembl value in column {pchembl_col}" +def check_compound_props(df_result: pd.DataFrame, df_cpd_props: pd.DataFrame): + """ + Check that compound props are only null if + + - the property in the parent_molregno is not in df_cpd_props + - or if the value in the compound props table is null. + """ + # missing values because the parent_molregno is not in the compound props table + no_cpd_prop_info = len( + df_result[ + ~df_result["parent_molregno"].isin(set(df_cpd_props["parent_molregno"])) + ] + ) + + for col in df_cpd_props.columns: + if col != "parent_molregno": + # missing values because the compound props query returns null (exists but is null) + missing_values = len( + df_result[ + df_result["parent_molregno"].isin( + set(df_cpd_props[df_cpd_props[col].isnull()]["parent_molregno"]) + ) + ] + ) + null_values = no_cpd_prop_info + missing_values + assert null_values == len( + df_result[df_result[col].isnull()] + ), f"Too many null values in {col}" + + def check_ligand_efficiency_metrics(df_result: pd.DataFrame): """ Check that ligand efficiency metrics are only null @@ -97,69 +96,38 @@ def check_ligand_efficiency_metrics(df_result: pd.DataFrame): ), f"Missing LLE value in LLE_{suffix}" -def check_compound_props(dataset: Dataset): +def check_atc( + df_result: pd.DataFrame, + atc_levels: pd.DataFrame, +): """ - Check that compound props are only null if - - - the property in the parent_molregno is not in df_cpd_props - - or if the value in the compound props table is null. + Check that atc_level1 information is only null + if the parent_molregno is not in the respective table. """ - # missing values because the parent_molregno is not in the compound props table - no_cpd_prop_info = len( - dataset.df_result[ - ~dataset.df_result["parent_molregno"].isin( - set(dataset.df_cpd_props["parent_molregno"]) - ) + assert df_result[(df_result["atc_level1"].isnull())].equals( + df_result[ + ~df_result["parent_molregno"].isin(set(atc_levels["parent_molregno"])) ] - ) - - for col in dataset.df_cpd_props.columns: - if col != "parent_molregno": - # missing values because the compound props query returns null (exists but is null) - missing_values = len( - dataset.df_result[ - dataset.df_result["parent_molregno"].isin( - set( - dataset.df_cpd_props[dataset.df_cpd_props[col].isnull()][ - "parent_molregno" - ] - ) - ) - ] - ) - null_values = no_cpd_prop_info + missing_values - assert null_values == len( - dataset.df_result[dataset.df_result[col].isnull()] - ), f"Too many null values in {col}" + ), "Null values in atc_level1 are not exclusively \ + because the parent_molregno is not in the atc_classification table." 
-def check_atc_and_target_classes( - dataset: Dataset, +def check_target_classes( + df_result: pd.DataFrame, + target_classes_level1: pd.DataFrame, + target_classes_level2: pd.DataFrame, ): """ - Check that atc_level1 and target class information is only null - if the parent_molregno / target id is not in the respective table. + Check that target class information is only null + if the target id is not in the respective table. """ - assert dataset.df_result[(dataset.df_result["atc_level1"].isnull())].equals( - dataset.df_result[ - ~dataset.df_result["parent_molregno"].isin( - set(dataset.atc_levels["parent_molregno"]) - ) - ] - ), "Null values in atc_level1 are not exclusively \ - because the parent_molregno is not in the atc_classification table." - - assert dataset.df_result[(dataset.df_result["target_class_l1"].isnull())].equals( - dataset.df_result[ - ~dataset.df_result["tid"].isin(set(dataset.target_classes_level1["tid"])) - ] + assert df_result[(df_result["target_class_l1"].isnull())].equals( + df_result[~df_result["tid"].isin(set(target_classes_level1["tid"]))] ), "Null values in target_class_l1 are not exclusively \ because the tid is not in the protein_classification table." - assert dataset.df_result[(dataset.df_result["target_class_l2"].isnull())].equals( - dataset.df_result[ - ~dataset.df_result["tid"].isin(set(dataset.target_classes_level2["tid"])) - ] + assert df_result[(df_result["target_class_l2"].isnull())].equals( + df_result[~df_result["tid"].isin(set(target_classes_level2["tid"]))] ), "Null values in target_class_l2 are not exclusively \ because the tid is not in the protein_classification table." @@ -195,25 +163,46 @@ def check_rdkit_props(df_result: pd.DataFrame): ), f"Missing value in {col} despite a smiles being available." +########### Final sanity checks for the dataset ########### +def check_null_values(df_result: pd.DataFrame): + """ + Check if any columns contain nan or null which aren't recognised as null values. + """ + for col in df_result.columns: + col_as_str = set(df_result[df_result[col].notnull()][col].astype(str)) + assert ( + "nan" not in col_as_str + ), f"Problem with unrecognised nan value in column {col}" + assert ( + "null" not in col_as_str + ), f"Problem with unrecognised null value in column {col}" + + +def check_for_mixed_types(df_result: pd.DataFrame): + """ + Check that there are no mixed types in columns with dtype=object. 
+ """ + for col, dtype in df_result.dtypes.to_dict().items(): + if dtype == object: + col_original = set(df_result[df_result[col].notnull()][col]) + col_as_str = set(df_result[df_result[col].notnull()][col].astype(str)) + # is there a difference in the two sets (ignoring null values) + assert ( + len(col_original - col_as_str) == 0 + ), f"Mixed types in colum {col}: {col_original-col_as_str}" + assert ( + len(col_as_str - col_original) == 0 + ), f"Mixed types in colum {col}: {col_as_str-col_original}" + + def sanity_checks( dataset: Dataset, - calculate_rdkit: bool, ): """ Check basic assumptions about the finished dataset, specifically: - no columns contain nan or null values which aren't recognised as null values - there are no mixed types in columns with dtype=object - - rows without a pchembl value based on binding+functional assays (pchembl_x_BF) - are in the drug_mechanism table - - ligand efficiency metrics are only null when at least one of the values - used to calculate them is null - - compound props are only null if the compound is not in df_cpd_props - or the value in that table is null - - atc_level1 and target class information is only null if - the parent_molregno / target id is not in the respective table - - columns set by the RDKit are only null if there is no canonical SMILES - for the molecule (excluding scaffolds) :param dataset: Dataset with compound-target pairs. :type dataset: Dataset @@ -222,12 +211,6 @@ def sanity_checks( """ check_null_values(dataset.df_result) check_for_mixed_types(dataset.df_result) - check_pairs_without_pchembl_are_in_drug_mechanisms(dataset.df_result) - check_ligand_efficiency_metrics(dataset.df_result) - check_compound_props(dataset) - check_atc_and_target_classes(dataset) - if calculate_rdkit: - check_rdkit_props(dataset.df_result) ########### Sanity checks for writing and reading a dataset ########### From 0c8274873e1678dd0e1b22d7817990df76b61473 Mon Sep 17 00:00:00 2001 From: Lina Heinzke Date: Wed, 21 Feb 2024 13:43:09 +0000 Subject: [PATCH 7/8] Simplify methods to output stats --- src/dataset.py | 11 ++- src/get_activity_ct_pairs.py | 4 +- src/get_dataset.py | 2 +- src/get_stats.py | 178 +++++++++++++++++++---------------- src/write_subsets.py | 110 ++++++++-------------- 5 files changed, 143 insertions(+), 162 deletions(-) diff --git a/src/dataset.py b/src/dataset.py index 2a39237..352f5bd 100644 --- a/src/dataset.py +++ b/src/dataset.py @@ -11,12 +11,15 @@ class Dataset: used for DTI assignments drug_mechanism_targets_set: Set of targets in the drug_mechanism table, used for DTI assigments - df_sizes_all: List of intermediate sized of the dataset used for debugging - df_sizes_pchembl: List of intermediate sized of the dataset used for debugging + df_sizes_all: Pandas DataFrame of intermediate sizes of the dataset, + used for debugging + df_sizes_pchembl: Pandas DataFrame of intermediate sizes of the dataset, + restricted to entries with a pchembl value, + used for debugging """ df_result: pd.DataFrame drug_mechanism_pairs_set: set drug_mechanism_targets_set: set - df_sizes_all: list[int] - df_sizes_pchembl: list[int] + df_sizes_all: pd.DataFrame + df_sizes_pchembl: pd.DataFrame diff --git a/src/get_activity_ct_pairs.py b/src/get_activity_ct_pairs.py index 10cdef4..1b68394 100644 --- a/src/get_activity_ct_pairs.py +++ b/src/get_activity_ct_pairs.py @@ -250,7 +250,7 @@ def get_aggregated_activity_ct_pairs( df_result, set(), set(), - [], - [], + pd.DataFrame(), + pd.DataFrame(), ) return dataset diff --git a/src/get_dataset.py 
b/src/get_dataset.py index 8717cce..5324e9e 100644 --- a/src/get_dataset.py +++ b/src/get_dataset.py @@ -83,4 +83,4 @@ def get_ct_pair_dataset( write_subsets.output_all_stats(dataset, args, out) if logging.DEBUG >= logging.root.level: - write_subsets.output_debug_sizes(dataset, out) + write_subsets.write_debug_sizes(dataset, out) diff --git a/src/get_stats.py b/src/get_stats.py index 0b96abc..c6ee8e2 100644 --- a/src/get_stats.py +++ b/src/get_stats.py @@ -4,94 +4,33 @@ from dataset import Dataset -##### Debugging Stats ##### -def calculate_dataset_sizes(df: pd.DataFrame) -> list[int]: +##### Logging Stats ##### +def get_stats_columns() -> tuple[list[str], list[str]]: """ - Calculate the number of unique compounds, targets and pairs - for df and df limited to drugs. - - :param df: Pandas DataFrame for which the dataset sizes should be calculated. - :type df: pd.DataFrame - :return: List of calculated unique counts. - :rtype: list[int] + Get the relevant columns for which stats should be calculated + and a list of descriptions corresponding to the columns. """ - now_mols = df["parent_molregno"].nunique() - now_targets = df["tid"].nunique() - now_targets_mutation = df["tid_mutation"].nunique() - now_pairs = df["cpd_target_pair"].nunique() - now_pairs_mutation = df["cpd_target_pair_mutation"].nunique() - - if "DTI" in df.columns: - # drugs = compounds of a compound-target pair with a known interaction - df_drugs = df[df["DTI"] == "D_DT"] - else: - df_drugs = df[df["max_phase"] == 4] - - now_drugs = df_drugs["parent_molregno"].nunique() - now_drug_targets = df_drugs["tid"].nunique() - now_drug_targets_mutation = df_drugs["tid_mutation"].nunique() - now_drug_pairs = df_drugs["cpd_target_pair"].nunique() - now_drug_pairs_mutation = df_drugs["cpd_target_pair_mutation"].nunique() - - return [ - now_mols, - now_drugs, - now_targets, - now_drug_targets, - now_targets_mutation, - now_drug_targets_mutation, - now_pairs, - now_drug_pairs, - now_pairs_mutation, - now_drug_pairs_mutation, + df_columns = [ + "parent_molregno", + "tid", + "tid_mutation", + "cpd_target_pair", + "cpd_target_pair_mutation", ] + columns_descs = [ + "compound ID", + "target ID", + "target ID with mutation annotations", + "compound-target pair", + "compound-target pair with mutation annotations", + ] + return df_columns, columns_descs -def add_dataset_sizes( - dataset: Dataset, - df: pd.DataFrame, - label: str, -): - """ - Count and add representative counts of df used for debugging to the dataset. - - :param dataset: Dataset with compound-target pairs and debugging sizes. - :type dataset: Dataset - :param df: Pandas DataFrame with current compound-target pairs - :type df: pd.DataFrame - :param label: Description of pipeline step (e.g., initial query). - :type label: str - """ - df_copy = df.copy() - dataset.df_sizes_all.append([label] + calculate_dataset_sizes(df_copy)) - - # restrict to data with any pchembl value (any data with a pchembl, - # even if it is based on only functional data) - # these statistics are purely based on removing - # compound-target pairs without pchembl information, - # i.e., the subset of the dataset is determined by the given df and not recalculated - df_pchembl = df_copy.dropna( - subset=[x for x in df_copy.columns if x.startswith("pchembl_value")], how="all" - ) - dataset.df_sizes_pchembl.append([label] + calculate_dataset_sizes(df_pchembl)) - - -def add_debugging_info( - dataset: Dataset, - df: pd.DataFrame, - label: str, -): - """ - Wrapper for add_dataset_sizes. - Handles logging level. 
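
get_stats_for_column above reduces every statistic to df[column].nunique() over some subset of rows. A toy version of that counting:

    import pandas as pd

    df = pd.DataFrame(
        {"parent_molregno": [1, 1, 2, 3], "DTI": ["D_DT", "C0_DT", "D_DT", "C0_DT"]}
    )
    column = "parent_molregno"
    counts = [
        ["all", df[column].nunique()],
        ["D_DT", df[df["DTI"] == "D_DT"][column].nunique()],
        ["C0_DT", df[df["DTI"] == "C0_DT"][column].nunique()],
    ]
    # -> [['all', 3], ['D_DT', 2], ['C0_DT', 2]]
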
- """ - if logging.DEBUG >= logging.root.level: - add_dataset_sizes(dataset, df, label) - - -##### Logging Stats ##### def get_stats_for_column( - df: pd.DataFrame, column: str, columns_desc: str + df: pd.DataFrame, + column: str, + columns_desc: str, ) -> list[list[str, str, int]]: """ Calculate the number of unique values in df[column] and various subsets of df. @@ -145,3 +84,78 @@ def get_stats_for_column( df[df["DTI"] == "C0_DT"][column].nunique(), ], ] + + +##### Debugging Stats ##### +def get_dataset_sizes(df: pd.DataFrame, label: str) -> pd.DataFrame: + """ + Calculate the number of unique compounds, targets and pairs + for df and df limited to drugs. + + :param df: Pandas DataFrame for which the dataset sizes should be calculated. + :type df: pd.DataFrame + :param label: Description of pipeline step (e.g., initial query). + :type label: str + :return: Pandas DataFrame with calculated unique counts. + :rtype: pd.DataFrame + """ + stats = {"step": label} + + if "DTI" in df.columns: + # drugs = compounds of a compound-target pair with a known interaction + df_drugs = df[df["DTI"] == "D_DT"] + else: + df_drugs = df[df["max_phase"] == 4] + + df_columns, _ = get_stats_columns() + for column in df_columns: + stats[f"{column}_all"] = df[column].nunique() + stats[f"{column}_drugs"] = df_drugs[column].nunique() + + df_stats = pd.DataFrame([stats]) + return df_stats + + +def add_dataset_sizes( + dataset: Dataset, + df: pd.DataFrame, + label: str, +): + """ + Count and add representative counts of df used for debugging to the dataset. + + :param dataset: Dataset with compound-target pairs and debugging sizes. + :type dataset: Dataset + :param df: Pandas DataFrame with current compound-target pairs + :type df: pd.DataFrame + :param label: Description of pipeline step (e.g., initial query). + :type label: str + """ + df_stats = get_dataset_sizes(df, label) + + dataset.df_sizes_all = pd.concat([dataset.df_sizes_all, df_stats]) + + # restrict to data with any pchembl value (any data with a pchembl, + # even if it is based on only functional data) + # these statistics are purely based on removing + # compound-target pairs without pchembl information, + # i.e., the subset of the dataset is determined by the given df and not recalculated + df_copy = df.copy() + df_pchembl = df_copy.dropna( + subset=[x for x in df_copy.columns if x.startswith("pchembl_value")], how="all" + ) + df_stats = get_dataset_sizes(df_pchembl, label) + dataset.df_sizes_pchembl = pd.concat([dataset.df_sizes_pchembl, df_stats]) + + +def add_debugging_info( + dataset: Dataset, + df: pd.DataFrame, + label: str, +): + """ + Wrapper for add_dataset_sizes. + Handles logging level. + """ + if logging.DEBUG >= logging.root.level: + add_dataset_sizes(dataset, df, label) diff --git a/src/write_subsets.py b/src/write_subsets.py index a5635a1..edf08b3 100644 --- a/src/write_subsets.py +++ b/src/write_subsets.py @@ -8,6 +8,7 @@ from dataset import Dataset +##### Writing Output ##### def write_output( df: pd.DataFrame, filename: str, @@ -73,6 +74,7 @@ def write_and_check_output( ) +##### Output Specific Results ##### def write_full_dataset_to_file( dataset: Dataset, args: CalculationArgs, @@ -97,59 +99,6 @@ def write_full_dataset_to_file( write_and_check_output(dataset.df_result, name_all, desc, args, out) -def output_debug_sizes( - dataset: Dataset, - out: OutputArgs, -): - """ - Output counts at various points during calculating the final dataset for debugging. - - :param dataset: Dataset with compound-target pairs and debugging sizes. 
-    :type dataset: Dataset
-    :param args: Arguments related to how to calculate the dataset
-    :type args: CalculationArgs
-    :param out: Arguments related to how to output the dataset
-    :type out: OutputArgs
-    """
-    column_names = [
-        "type",
-        "#mols",
-        "#drugs",
-        "#targets",
-        "#drug_ targets",
-        "#targets_ mutation",
-        "#drug_ targets_mutation",
-        "#cpd_tid_ pairs",
-        "#drug_tid_ pairs",
-        "#cpd_ tid_mutation_ pairs",
-        "#drug_ tid_mutation_ pairs",
-    ]
-
-    logging.debug("Size of full dataset at different points.")
-    full_df_sizes = pd.DataFrame(dataset.df_sizes_all, columns=column_names)
-    logging.debug(full_df_sizes)
-    name_full_df_sizes = os.path.join(out.output_path, "debug_full_df_sizes")
-    write_output(
-        full_df_sizes,
-        name_full_df_sizes,
-        out,
-    )
-
-    logging.debug("Size of dataset with any pchembl values at different points.")
-    logging.debug(
-        "This includes data for which we only have pchembl data \
-        for functional assays but not for binding assays."
-    )
-    df_pchembl_sizes = pd.DataFrame(dataset.df_sizes_pchembl, columns=column_names)
-    logging.debug(df_pchembl_sizes)
-    name_pchembl_df_sizes = os.path.join(out.output_path, "debug_pchembl_df_sizes")
-    write_output(
-        full_df_sizes,
-        name_pchembl_df_sizes,
-        out,
-    )
-
-
 def output_stats(
     df: pd.DataFrame,
     output_file: str,
@@ -171,33 +120,15 @@ def output_stats(
     :param out: Arguments related to how to output the dataset
     :type out: OutputArgs
     """
-    df_columns = [
-        "parent_molregno",
-        "tid",
-        "tid_mutation",
-        "cpd_target_pair",
-        "cpd_target_pair_mutation",
-    ]
-    columns_descs = [
-        "compound ID",
-        "target ID",
-        "target ID with mutation annotations",
-        "compound-target pair",
-        "compound-target pair with mutation annotations",
-    ]
-
     logging.debug("Stats for %s", output_file)
     stats = []
+    df_columns, columns_descs = get_stats.get_stats_columns()
     for column, columns_desc in zip(df_columns, columns_descs):
         logging.debug("Stats for column %s:", column)
         column_stats = get_stats.get_stats_for_column(df, column, columns_desc)
         stats += column_stats
         for colum_stat in column_stats:
-            logging.debug(
-                "%20s %s",
-                colum_stat[2],
-                colum_stat[3],
-            )
+            logging.debug("%20s %s", colum_stat[2], colum_stat[3])

     df_stats = pd.DataFrame(
         stats, columns=["column", "column_description", "subset_type", "counts"]
@@ -252,3 +183,34 @@ def output_all_stats(dataset: Dataset, args: CalculationArgs, out: OutputArgs):
             output_file,
             out,
         )
+
+
+def write_debug_sizes(
+    dataset: Dataset,
+    out: OutputArgs,
+):
+    """
+    Output counts at various points during the calculation of the final dataset for debugging.
+
+    :param dataset: Dataset with compound-target pairs and debugging sizes.
+    :type dataset: Dataset
+    :param out: Arguments related to how to output the dataset
+    :type out: OutputArgs
+    """
+    # Size of full dataset at different points.
+    name_full_df_sizes = os.path.join(out.output_path, "debug_full_df_sizes")
+    write_output(
+        dataset.df_sizes_all,
+        name_full_df_sizes,
+        out,
+    )
+
+    # Size of dataset with any pchembl values at different points.
+    # This includes data for which we only have pchembl data
+    # for functional assays but not for binding assays. 
+ name_pchembl_df_sizes = os.path.join(out.output_path, "debug_pchembl_df_sizes") + write_output( + dataset.df_sizes_pchembl, + name_pchembl_df_sizes, + out, + ) From 3fd8e87071bc6ae6c07b3cabce9cc8c8dede6b81 Mon Sep 17 00:00:00 2001 From: Lina Heinzke Date: Wed, 21 Feb 2024 16:48:06 +0000 Subject: [PATCH 8/8] Add module docstrings --- src/add_chembl_compound_properties.py | 4 ++++ src/add_chembl_target_class_annotations.py | 8 ++++++-- src/add_dti_annotations.py | 6 +++++- src/add_filtering_columns.py | 10 ++++++--- src/add_rdkit_compound_descriptors.py | 4 ++++ src/arguments.py | 6 ++++++ src/clean_dataset.py | 4 ++++ src/dataset.py | 5 +++++ src/get_activity_ct_pairs.py | 5 +++++ src/get_dataset.py | 24 +++++++++++++--------- src/get_drug_mechanism_ct_pairs.py | 5 +++++ src/get_stats.py | 4 ++++ src/main.py | 4 ++++ src/{write_subsets.py => output.py} | 7 ++++++- src/sanity_checks.py | 4 ++++ 15 files changed, 83 insertions(+), 17 deletions(-) rename src/{write_subsets.py => output.py} (98%) diff --git a/src/add_chembl_compound_properties.py b/src/add_chembl_compound_properties.py index 879c8dc..5f6afe8 100644 --- a/src/add_chembl_compound_properties.py +++ b/src/add_chembl_compound_properties.py @@ -1,3 +1,7 @@ +""" +Add ChEMBL compound properties to the dataset. +""" + import sqlite3 import pandas as pd diff --git a/src/add_chembl_target_class_annotations.py b/src/add_chembl_target_class_annotations.py index d9aca47..bb8d080 100644 --- a/src/add_chembl_target_class_annotations.py +++ b/src/add_chembl_target_class_annotations.py @@ -1,12 +1,16 @@ +""" +Add target class annotations based on ChEMBL data to the dataset. +""" + import logging import os import sqlite3 import pandas as pd -import write_subsets from arguments import OutputArgs, CalculationArgs from dataset import Dataset +import output import sanity_checks @@ -194,7 +198,7 @@ def output_ambiguous_target_classes( f"ChEMBL{args.chembl_version}_" f"CTI_{args.limited_flag}_targets_w_more_than_one_tclass", ) - write_subsets.write_output( + output.write_output( more_than_one_tclass, name_more_than_one_tclass, out, diff --git a/src/add_dti_annotations.py b/src/add_dti_annotations.py index b1fdda3..94367e9 100644 --- a/src/add_dti_annotations.py +++ b/src/add_dti_annotations.py @@ -1,7 +1,11 @@ +""" +Add DTI (Drug-Target Interaction) Annotations to the dataset. +""" + from dataset import Dataset -########### CTI (Compound-Target Interaction) Annotations ########### +########### DTI (Drug-Target Interaction) Annotations ########### def add_dti_annotations( dataset: Dataset, ): diff --git a/src/add_filtering_columns.py b/src/add_filtering_columns.py index 88ce052..27d4076 100644 --- a/src/add_filtering_columns.py +++ b/src/add_filtering_columns.py @@ -1,12 +1,16 @@ +""" +Add filtering columns for obtaining the different subsets to the dataset. 
+""" + import logging import os import pandas as pd from arguments import CalculationArgs, OutputArgs -import get_stats -import write_subsets from dataset import Dataset +import get_stats +import output def get_data_subsets(data: pd.DataFrame, min_nof_cpds: int, desc: str) -> tuple[ @@ -145,7 +149,7 @@ def add_subset_filtering_columns( f"CTI_{args.limited_flag}_" f"{subset_desc}", ) - write_subsets.write_and_check_output( + output.write_and_check_output( df_subset, name_subset, desc, diff --git a/src/add_rdkit_compound_descriptors.py b/src/add_rdkit_compound_descriptors.py index 1d9ccc3..889ff70 100644 --- a/src/add_rdkit_compound_descriptors.py +++ b/src/add_rdkit_compound_descriptors.py @@ -1,3 +1,7 @@ +""" +Add RDKit-based compound properties to the dataset. +""" + from rdkit import Chem from rdkit.Chem import Descriptors from rdkit.Chem import PandasTools diff --git a/src/arguments.py b/src/arguments.py index 080b331..ea02154 100644 --- a/src/arguments.py +++ b/src/arguments.py @@ -1,4 +1,10 @@ +""" +Dataclasses related to handling arguments, +specifically arguments related to how to calculate or output the dataset. +""" + import argparse + from dataclasses import dataclass diff --git a/src/clean_dataset.py b/src/clean_dataset.py index d20c1de..56517dd 100644 --- a/src/clean_dataset.py +++ b/src/clean_dataset.py @@ -1,3 +1,7 @@ +""" +Methods related to cleaning the dataset. +""" + import logging import sqlite3 diff --git a/src/dataset.py b/src/dataset.py index 352f5bd..19fd259 100644 --- a/src/dataset.py +++ b/src/dataset.py @@ -1,3 +1,8 @@ +""" +Dataclass for handling the calculated compound-target pair dataset +and related data. +""" + from dataclasses import dataclass import pandas as pd diff --git a/src/get_activity_ct_pairs.py b/src/get_activity_ct_pairs.py index 1b68394..3f6a02c 100644 --- a/src/get_activity_ct_pairs.py +++ b/src/get_activity_ct_pairs.py @@ -1,3 +1,8 @@ +""" +Get initial set of compound-target pairs with an associated activity +for the dataset. +""" + import sqlite3 import numpy as np diff --git a/src/get_dataset.py b/src/get_dataset.py index 5324e9e..faffb3b 100644 --- a/src/get_dataset.py +++ b/src/get_dataset.py @@ -1,18 +1,22 @@ +""" +Main workflow to calculate the compound-target pairs dataset. 
+""" + import logging import sqlite3 +from arguments import OutputArgs, CalculationArgs +import add_filtering_columns import get_activity_ct_pairs -import get_drug_mechanism_ct_pairs -import add_dti_annotations import add_chembl_compound_properties -import clean_dataset import add_chembl_target_class_annotations +import get_drug_mechanism_ct_pairs +import add_dti_annotations import add_rdkit_compound_descriptors -import sanity_checks -import write_subsets +import clean_dataset import get_stats -from arguments import OutputArgs, CalculationArgs -import add_filtering_columns +import output +import sanity_checks def get_ct_pair_dataset( @@ -77,10 +81,10 @@ def get_ct_pair_dataset( add_filtering_columns.add_filtering_columns(dataset, args, out) logging.info("write_full_dataset_to_file") - write_subsets.write_full_dataset_to_file(dataset, args, out) + output.write_full_dataset_to_file(dataset, args, out) logging.info("output_stats") - write_subsets.output_all_stats(dataset, args, out) + output.output_all_stats(dataset, args, out) if logging.DEBUG >= logging.root.level: - write_subsets.write_debug_sizes(dataset, out) + output.write_debug_sizes(dataset, out) diff --git a/src/get_drug_mechanism_ct_pairs.py b/src/get_drug_mechanism_ct_pairs.py index 35b430f..6121acd 100644 --- a/src/get_drug_mechanism_ct_pairs.py +++ b/src/get_drug_mechanism_ct_pairs.py @@ -1,3 +1,8 @@ +""" +Get and add compound-target pairs based on information +in the drug_mechanism table. +""" + import logging import sqlite3 diff --git a/src/get_stats.py b/src/get_stats.py index c6ee8e2..662f937 100644 --- a/src/get_stats.py +++ b/src/get_stats.py @@ -1,3 +1,7 @@ +""" +Get statistics of dataset for final results and debugging. +""" + import logging import pandas as pd diff --git a/src/main.py b/src/main.py index ffe2bdd..5b297b6 100644 --- a/src/main.py +++ b/src/main.py @@ -1,3 +1,7 @@ +""" +Get the compound-target pairs dataset from ChEMBL using the given arguments. +""" + import logging import sqlite3 diff --git a/src/write_subsets.py b/src/output.py similarity index 98% rename from src/write_subsets.py rename to src/output.py index edf08b3..f1a4a5f 100644 --- a/src/write_subsets.py +++ b/src/output.py @@ -1,11 +1,16 @@ +""" +Write the dataset, subsets and related statistics to files +and to the command line. +""" + import logging import os import pandas as pd import sanity_checks -import get_stats from arguments import OutputArgs, CalculationArgs from dataset import Dataset +import get_stats ##### Writing Output ##### diff --git a/src/sanity_checks.py b/src/sanity_checks.py index ad94c89..94c5d6d 100644 --- a/src/sanity_checks.py +++ b/src/sanity_checks.py @@ -1,3 +1,7 @@ +""" +Perform sanity checks on the dataset. +""" + import pandas as pd from dataset import Dataset