From c7e76b544d892c1704762dad552200c565400b20 Mon Sep 17 00:00:00 2001 From: Lina Heinzke Date: Thu, 15 Feb 2024 14:53:30 +0000 Subject: [PATCH 1/8] Group arguments together in dataclasses - add CalculationArgs dataclass with arguments related to how the dataset is calculated - add OutputArgs dataclass with arguments related to the output --- src/add_chembl_target_class_annotations.py | 35 +- src/arguments.py | 172 +++++++ src/get_activity_ct_pairs.py | 15 +- src/get_dataset.py | 140 +----- src/main.py | 120 +---- src/write_subsets.py | 541 +++++++++------------ 6 files changed, 458 insertions(+), 565 deletions(-) create mode 100644 src/arguments.py diff --git a/src/add_chembl_target_class_annotations.py b/src/add_chembl_target_class_annotations.py index 25c1d01..009b8d4 100644 --- a/src/add_chembl_target_class_annotations.py +++ b/src/add_chembl_target_class_annotations.py @@ -5,6 +5,7 @@ import pandas as pd import write_subsets +from arguments import OutputArgs, CalculationArgs ########### Add Target Class Annotations Based on ChEMBL Data ########### @@ -81,12 +82,8 @@ def get_target_class_table( def add_chembl_target_class_annotations( df_combined: pd.DataFrame, chembl_con: sqlite3.Connection, - output_path: str, - write_to_csv: bool, - write_to_excel: bool, - delimiter: str, - chembl_version: str, - limited_flag: str, + args: CalculationArgs, + out: OutputArgs, ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: """ Add level 1 and 2 target class annotations. @@ -101,19 +98,10 @@ def add_chembl_target_class_annotations( :type df_combined: pd.DataFrame :param chembl_con: Sqlite3 connection to ChEMBL database. :type chembl_con: sqlite3.Connection - :param output_path: Path to write the targets with more than one target class assignment to - :type output_path: str - :param write_to_csv: True if output should be written to csv - :type write_to_csv: bool - :param write_to_excel: True if output should be written to excel - :type write_to_excel: bool - :param delimiter: Delimiter in csv-output - :type delimiter: str - :param chembl_version: Version of ChEMBL for output files - :type chembl_version: str - :param limited_flag: Document suffix indicating - whether the dataset was limited to literature sources - :type limited_flag: str + :param args: Arguments related to how to calculate the dataset + :type args: CalculationArgs + :param out: Arguments related to how to output the dataset + :type out: OutputArgs :return: - Pandas DataFrame with added target class annotations \\ - Pandas DataFrame with mapping from target id to level 1 target class \\ - Pandas DataFrame with mapping from target id to level 2 target class @@ -199,15 +187,14 @@ def add_chembl_target_class_annotations( ) name_more_than_one_tclass = os.path.join( - output_path, - f"ChEMBL{chembl_version}_CTI_{limited_flag}_targets_w_more_than_one_tclass", + out.output_path, + f"ChEMBL{args.chembl_version}_" + f"CTI_{args.limited_flag}_targets_w_more_than_one_tclass", ) write_subsets.write_output( more_than_one_tclass, name_more_than_one_tclass, - write_to_csv, - write_to_excel, - delimiter, + out, ) return df_combined, target_classes_level1, target_classes_level2 diff --git a/src/arguments.py b/src/arguments.py new file mode 100644 index 0000000..080b331 --- /dev/null +++ b/src/arguments.py @@ -0,0 +1,172 @@ +import argparse +from dataclasses import dataclass + + +@dataclass(frozen=True) +class CalculationArgs: + """ + Collection of arguments related to how to calculate the dataset. 
+ + chembl_version: Version of ChEMBL for output file names + calculate_rdkit: True if RDKit-based compound properties should be calculated + limit_to_literature: Include only literature sources if True + limited_flag: String version of limit_to_literature used in file names + min_nof_cpds_bf: Minimum number of compounds per target for the BF subset + min_nof_cpds_b: Minimum number of compounds per target for the B subset + """ + + chembl_version: str + calculate_rdkit: bool + limit_to_literature: bool + limited_flag: str + min_nof_cpds_bf: int + min_nof_cpds_b: int + + +@dataclass(frozen=True) +class OutputArgs: + """ + Collection of arguments related to how to output the dataset. + + output_path: Path to write output files to + delimiter: Delimiter in csv-output + write_to_csv: True if output should be written to csv + write_to_excel: True if output should be written to excel + write_full_dataset: True if the full dataset should be written to output + write_bf: True if subsets based on binding+functional data should be written to output + write_b: True if subsets based on binding data only should be written to output + """ + + output_path: str + delimiter: str + write_to_csv: bool + write_to_excel: bool + write_full_dataset: bool + write_bf: bool + write_b: bool + + +def parse_args() -> argparse.Namespace: + """ + Get arguments with argparse. + + :return: Populated argparse.Namespace + :rtype: argparse.Namespace + """ + parser = argparse.ArgumentParser( + description="Extract the compound-target pairs dataset from ChEMBL. \ + The full dataset plus filtering columns for binding vs. binding+functional data \ + will always be written to csv. \ + Additional outputs and output types can be chosen with the parameters below." + ) + + parser.add_argument( + "--chembl", + "-v", + dest="chembl_version", + metavar="", + type=str, + default=None, + help="ChEMBL version. \ + Latest version if None. \ + Required if a path to a SQLite database is provided, \ + i.e., if --sqlite is set. (default: None)", + ) + parser.add_argument( + "--sqlite", + "-s", + metavar="", + type=str, + default=None, + help="Path to SQLite database. \ + ChEMBL is downloaded as an SQLite database \ + and handled by chembl_downloader if None. (default: None)", + ) + parser.add_argument( + "--output", + "-o", + dest="output_path", + metavar="", + type=str, + required=True, + help="Path to write the output file(s) to. (required)", + ) + parser.add_argument( + "--delimiter", + "-d", + metavar="", + type=str, + default=";", + help="Delimiter in output csv-files. (default: ;)", + ) + parser.add_argument( + "--all_sources", + action="store_true", + help="If this is set, the dataset is calculated based on all sources in ChEMBL. \ + This includes data from BindingDB which may skew the results. \ + Default (not set): the dataset is calculated based on only literature data.", + ) + parser.add_argument( + "--rdkit", + dest="calculate_rdkit", + action="store_true", + help="Calculate RDKit-based compound properties.", + ) + parser.add_argument( + "--excel", + dest="write_to_excel", + action="store_true", + help="Write the results to excel. Note: this may fail if the output is too large.", + ) + parser.add_argument( + "--BF", + dest="write_bf", + action="store_true", + help="Write binding+functional data subsets.", + ) + parser.add_argument( + "--B", dest="write_b", action="store_true", help="Write binding data subsets." + ) + parser.add_argument( + "--debug", action="store_true", help="Log additional debugging information." 
+    )
+    args = parser.parse_args()
+
+    return args
+
+
+def get_args() -> tuple[argparse.Namespace, CalculationArgs, OutputArgs]:
+    """
+    Get parsed and default arguments.
+
+    :return: parsed arguments,
+        arguments related to how to calculate the dataset as CalculationArgs,
+        arguments related to how to output the dataset as OutputArgs
+    :rtype: tuple[argparse.Namespace, CalculationArgs, OutputArgs]
+    """
+    args = parse_args()
+
+    calc_args = CalculationArgs(
+        chembl_version=args.chembl_version,
+        calculate_rdkit=args.calculate_rdkit,
+        limit_to_literature=not args.all_sources,
+        # used in file names
+        limited_flag="literature_only" if not args.all_sources else "all_sources",
+        min_nof_cpds_bf=100,
+        min_nof_cpds_b=100,
+    )
+
+    output_args = OutputArgs(
+        output_path=args.output_path,
+        delimiter=args.delimiter,
+        # Always write the results to csv.
+        write_to_csv=True,
+        write_to_excel=args.write_to_excel,
+        # Always write the full dataset plus filtering columns
+        # for binding vs. binding+functional data.
+        write_full_dataset=True,
+        write_bf=args.write_bf,
+        write_b=args.write_b,
+    )
+
+    return args, calc_args, output_args
diff --git a/src/get_activity_ct_pairs.py b/src/get_activity_ct_pairs.py
index 7c825db..4e440da 100644
--- a/src/get_activity_ct_pairs.py
+++ b/src/get_activity_ct_pairs.py
@@ -1,17 +1,13 @@
-import logging
 import sqlite3
 
 import numpy as np
 import pandas as pd
 
-import get_stats
-
 
 ########### Get Initial Compound-Target Data From ChEMBL ###########
 def get_compound_target_pairs_with_pchembl(
     chembl_con: sqlite3.Connection,
     limit_to_literature: bool,
-    df_sizes: list[list[int], list[int]],
 ) -> pd.DataFrame:
     """
     Query ChEMBL activities and related assay for compound-target pairs
@@ -27,8 +23,6 @@
     :param limit_to_literature: Include only literature sources if True.
         Include all available sources otherwise.
     :type limit_to_literature: bool
-    :param df_sizes: List of intermediate sized of the dataset used for debugging.
-    :type df_sizes: list[list[int], list[int]]
     :return: Pandas DataFrame with compound-target pairs with a pchembl value.
     :rtype: pd.DataFrame
     """
@@ -84,9 +78,6 @@
         f"{a}_{b}" for a, b in zip(df_mols["parent_molregno"], df_mols["tid_mutation"])
     ]
 
-    if logging.DEBUG >= logging.root.level:
-        get_stats.add_dataset_sizes(df_mols, "initial query", df_sizes)
-
     return df_mols
 
 
@@ -173,7 +164,6 @@ def get_average_info(df: pd.DataFrame, suffix: str) -> pd.DataFrame:
 def get_aggregated_activity_ct_pairs(
     chembl_con: sqlite3.Connection,
     limit_to_literature: bool,
-    df_sizes: list[list[int], list[int]],
 ) -> pd.DataFrame:
     """
     Get dataset of compound target-pairs with an associated pchembl value
@@ -194,14 +184,13 @@
     :param limit_to_literature: Include only literature sources if True.
         Include all available sources otherwise.
     :type limit_to_literature: bool
-    :param df_sizes: List of intermediate sized of the dataset used for debugging.
-    :type df_sizes: list[list[int], list[int]]
     :return: Pandas Dataframe with compound-target pairs based on ChEMBL activity data
        aggregated into one entry per compound-target pair. 
:rtype: pd.DataFrame """ df_mols = get_compound_target_pairs_with_pchembl( - chembl_con, limit_to_literature, df_sizes + chembl_con, + limit_to_literature, ) # Summarise the information for binding and functional assays diff --git a/src/get_dataset.py b/src/get_dataset.py index 2fa6b91..053ec8e 100644 --- a/src/get_dataset.py +++ b/src/get_dataset.py @@ -1,5 +1,4 @@ import logging -import os import sqlite3 import get_activity_ct_pairs @@ -12,60 +11,28 @@ import sanity_checks import write_subsets import get_stats +from arguments import OutputArgs, CalculationArgs def get_ct_pair_dataset( - chembl_con: sqlite3.Connection, - chembl_version: str, - output_path: str, - limit_to_literature: bool, - calculate_rdkit: bool, - write_to_csv: bool, - write_to_excel: bool, - delimiter: str, - write_full_dataset: bool, - write_bf: bool, - write_b: bool, + chembl_con: sqlite3.Connection, args: CalculationArgs, out: OutputArgs ): """ Calculate and output the compound-target pair dataset. :param chembl_con: Sqlite3 connection to ChEMBL database :type chembl_con: sqlite3.Connection - :param chembl_version: Version of ChEMBL for output file names - :type chembl_version: str - :param output_path: Path to write output files to - :type output_path: str - :param limit_to_literature: Include only literature sources if True. - Include all available sources otherwise. - :type limit_to_literature: bool - :param calculate_rdkit: True if RDKit-based compound properties should be calculated - :type calculate_rdkit: bool - :param write_to_csv: True if output should be written to csv - :type write_to_csv: bool - :param write_to_excel: True if output should be written to excel - :type write_to_excel: bool - :param delimiter: Delimiter in csv-output - :type delimiter: str - :param write_full_dataset: True if the full dataset should be written to output - :type write_full_dataset: bool - :param write_bf: True if subsets based on binding+functional data should be written to output - :type write_bf: bool - :param write_b: True if subsets based on binding data only should be written to output - :type write_b: bool + :param args: Arguments related to how to calculate the dataset + :type args: CalculationArgs + :param out: Arguments related to how to output the dataset + :type out: OutputArgs """ # list with sizes of full dataset and dataset subset with pchembl values for debugging df_sizes = [[], []] - # used in file names - if limit_to_literature: - limited_flag = "literature_only" - else: - limited_flag = "all_sources" - logging.info("get_aggregated_activity_ct_pairs") df_combined = get_activity_ct_pairs.get_aggregated_activity_ct_pairs( - chembl_con, limit_to_literature, df_sizes + chembl_con, args.limit_to_literature ) if logging.DEBUG >= logging.root.level: get_stats.add_dataset_sizes(df_combined, "activity ct-pairs", df_sizes) @@ -87,7 +54,7 @@ def get_ct_pair_dataset( logging.info("add_all_chembl_compound_properties") df_combined, df_cpd_props, atc_levels = ( add_chembl_compound_properties.add_all_chembl_compound_properties( - df_combined, chembl_con, limit_to_literature + df_combined, chembl_con, args.limit_to_literature ) ) if logging.DEBUG >= logging.root.level: @@ -105,19 +72,15 @@ def get_ct_pair_dataset( add_chembl_target_class_annotations.add_chembl_target_class_annotations( df_combined, chembl_con, - output_path, - write_to_csv, - write_to_excel, - delimiter, - chembl_version, - limited_flag, + args, + out, ) ) if logging.DEBUG >= logging.root.level: get_stats.add_dataset_sizes(df_combined, "tclass annotations", 
df_sizes) logging.info("add_rdkit_compound_descriptors") - if calculate_rdkit: + if args.calculate_rdkit: df_combined = add_rdkit_compound_descriptors.add_rdkit_compound_descriptors( df_combined ) @@ -125,7 +88,7 @@ def get_ct_pair_dataset( get_stats.add_dataset_sizes(df_combined, "RDKit props", df_sizes) logging.info("clean_dataset") - df_combined = clean_dataset.clean_dataset(df_combined, calculate_rdkit) + df_combined = clean_dataset.clean_dataset(df_combined, args.calculate_rdkit) if logging.DEBUG >= logging.root.level: get_stats.add_dataset_sizes(df_combined, "clean df", df_sizes) @@ -136,89 +99,34 @@ def get_ct_pair_dataset( atc_levels, target_classes_level1, target_classes_level2, - calculate_rdkit, + args.calculate_rdkit, ) logging.info("write_BF_to_file") - min_nof_cpds_bf = 100 - df_combined_annotated = write_subsets.write_bf_to_file( + df_combined = write_subsets.write_bf_to_file( df_combined, - chembl_version, - min_nof_cpds_bf, - output_path, - write_bf, - write_to_csv, - write_to_excel, - delimiter, - limited_flag, - calculate_rdkit, df_sizes, + args, + out, ) logging.info("write_B_to_file") - min_nof_cpds_b = 100 - df_combined_annotated = write_subsets.write_b_to_file( + df_combined = write_subsets.write_b_to_file( df_combined, - df_combined_annotated, - chembl_version, - min_nof_cpds_b, - output_path, - write_b, - write_to_csv, - write_to_excel, - delimiter, - limited_flag, - calculate_rdkit, df_sizes, + args, + out, ) logging.info("write_full_dataset_to_file") write_subsets.write_full_dataset_to_file( - df_combined_annotated, - chembl_version, - output_path, - write_full_dataset, - write_to_csv, - write_to_excel, - delimiter, - limited_flag, - calculate_rdkit, + df_combined, + args, + out, ) logging.info("output_stats") - - output_file = os.path.join( - output_path, f"ChEMBL{chembl_version}_CTI_{limited_flag}_full_dataset_stats" - ) - write_subsets.output_stats( - df_combined_annotated, output_file, write_to_csv, write_to_excel, delimiter - ) - if write_bf: - output_file = os.path.join( - output_path, - f"ChEMBL{chembl_version}_CTI_{limited_flag}_BF_{min_nof_cpds_bf}_c_dt_d_dt_stats", - ) - write_subsets.output_stats( - df_combined_annotated[df_combined_annotated["BF_100_c_dt_d_dt"]], - output_file, - write_to_csv, - write_to_excel, - delimiter, - ) - if write_b: - output_file = os.path.join( - output_path, - f"ChEMBL{chembl_version}_CTI_{limited_flag}_B_{min_nof_cpds_b}_c_dt_d_dt_stats", - ) - write_subsets.output_stats( - df_combined_annotated[df_combined_annotated["B_100_c_dt_d_dt"]], - output_file, - write_to_csv, - write_to_excel, - delimiter, - ) + write_subsets.output_all_stats(df_combined, args, out) if logging.DEBUG >= logging.root.level: - write_subsets.output_debug_sizes( - df_sizes, output_path, write_to_csv, write_to_excel, delimiter - ) + write_subsets.output_debug_sizes(df_sizes, out) diff --git a/src/main.py b/src/main.py index 8198f3e..ffe2bdd 100644 --- a/src/main.py +++ b/src/main.py @@ -1,103 +1,17 @@ -import argparse import logging import sqlite3 import chembl_downloader +import arguments import get_dataset -def parse_args() -> argparse.Namespace: - """ - Get arguments with argparse. - - :return: Populated argparse.Namespace - :rtype: argparse.Namespace - """ - parser = argparse.ArgumentParser( - description="Extract the compound-target pairs dataset from ChEMBL. \ - The full dataset plus filtering columns for binding vs. binding+functional data \ - will always be written to csv. 
\ - Additional outputs and output types can be chosen with the parameters below." - ) - - parser.add_argument( - "--chembl", - "-v", - metavar="", - type=str, - default=None, - help="ChEMBL version. \ - Latest version if None. \ - Required if a path to a SQLite database is provided, \ - i.e., if --sqlite is set. (default: None)", - ) - parser.add_argument( - "--sqlite", - "-s", - metavar="", - type=str, - default=None, - help="Path to SQLite database. \ - ChEMBL is downloaded as an SQLite database \ - and handled by chembl_downloader if None. (default: None)", - ) - parser.add_argument( - "--output", - "-o", - metavar="", - type=str, - required=True, - help="Path to write the output file(s) to. (required)", - ) - parser.add_argument( - "--delimiter", - "-d", - metavar="", - type=str, - default=";", - help="Delimiter in output csv-files. (default: ;)", - ) - parser.add_argument( - "--all_sources", - action="store_true", - help="If this is set, the dataset is calculated based on all sources in ChEMBL. \ - This includes data from BindingDB which may skew the results. \ - Default (not set): the dataset is calculated based on only literature data.", - ) - parser.add_argument( - "--rdkit", - action="store_true", - help="Calculate RDKit-based compound properties.", - ) - parser.add_argument( - "--excel", - action="store_true", - help="Write the results to excel. Note: this may fail if the output is too large.", - ) - parser.add_argument( - "--BF", action="store_true", help="Write binding+functional data subsets." - ) - parser.add_argument("--B", action="store_true", help="Write binding data subsets.") - parser.add_argument( - "--debug", action="store_true", help="Log additional debugging information." - ) - args = parser.parse_args() - - return args - - def main(): """ Call get_ct_pair_dataset to get the compound-target dataset using the given arguments. """ - args = parse_args() - - # Set arguments that are always true. - # Write the results to csv. - csv = True - # Write the full dataset plus filtering columns for binding vs. binding+functional data. 
- full_df = True + args, calc_args, output_args = arguments.get_args() log_level = "DEBUG" if args.debug else "INFO" numeric_log_level = getattr(logging, log_level, None) @@ -112,35 +26,19 @@ def main(): with sqlite3.connect(args.sqlite) as chembl_con: get_dataset.get_ct_pair_dataset( chembl_con, - args.chembl, - args.output, - not args.all_sources, - args.rdkit, - csv, - args.excel, - args.delimiter, - full_df, - args.BF, - args.B, + calc_args, + output_args, ) else: logging.info("Using chembl_downloader to connect to ChEMBL.") - if args.chembl is None: - args.chembl = chembl_downloader.latest() + if args.chembl_version is None: + args.chembl_version = chembl_downloader.latest() - with chembl_downloader.connect(version=args.chembl) as chembl_con: + with chembl_downloader.connect(version=args.chembl_version) as chembl_con: get_dataset.get_ct_pair_dataset( chembl_con, - args.chembl, - args.output, - not args.all_sources, - args.rdkit, - csv, - args.excel, - args.delimiter, - full_df, - args.BF, - args.B, + calc_args, + output_args, ) diff --git a/src/write_subsets.py b/src/write_subsets.py index b138b35..c979511 100644 --- a/src/write_subsets.py +++ b/src/write_subsets.py @@ -4,14 +4,13 @@ import sanity_checks import get_stats +from arguments import OutputArgs, CalculationArgs def write_output( df: pd.DataFrame, filename: str, - write_to_csv: bool, - write_to_excel: bool, - delimiter: str, + out: OutputArgs, ) -> list[str]: """ Write DataFrame df to output file named . @@ -20,20 +19,16 @@ def write_output( :type df: pd.DataFrame :param filename: Filename to write the output to :type filename: bool - :param write_to_csv: True if output should be written to csv - :type write_to_csv: bool - :param write_to_excel: True if output should be written to excel - :type write_to_excel: bool - :param delimiter: Delimiter in csv-output - :type delimiter: str + :param out: Arguments related to how to output the dataset + :type out: OutputArgs :return: Returns list of types of files that was written to (csv and/or xlsx) :rtype: list[str] """ file_type_list = [] - if write_to_csv: - df.to_csv(f"{filename}.csv", sep=delimiter, index=False) + if out.write_to_csv: + df.to_csv(f"{filename}.csv", sep=out.delimiter, index=False) file_type_list.append("csv") - if write_to_excel: + if out.write_to_excel: try: with pd.ExcelWriter(f"{filename}.xlsx", engine="xlsxwriter") as writer: writer.book.use_zip64() @@ -50,11 +45,9 @@ def write_output( def write_and_check_output( df: pd.DataFrame, filename: str, - write_to_csv: bool, - write_to_excel: bool, - delimiter: str, assay_type: str, - calculate_rdkit: bool, + args: CalculationArgs, + out: OutputArgs, ): """ Write df to file and check that writing was successful. @@ -63,23 +56,19 @@ def write_and_check_output( :type df: pd.DataFrame :param filename: Filename to write the output to :type filename: bool - :param write_to_csv: True if output should be written to csv - :type write_to_csv: bool - :param write_to_excel: True if output should be written to excel - :type write_to_excel: bool - :param delimiter: Delimiter in csv-output - :type delimiter: str :param assay_type: Types of assays current_df contains information about. 
\ Options: "BF" (binding+functional), "B" (binding), "all" (contains both BF and B information) :type assay_type: str - :param calculate_rdkit: If True, current_df contains RDKit-based columns - :type calculate_rdkit: bool + :param args: Arguments related to how to calculate the dataset + :type args: CalculationArgs + :param out: Arguments related to how to output the dataset + :type out: OutputArgs """ - file_type_list = write_output(df, filename, write_to_csv, write_to_excel, delimiter) + file_type_list = write_output(df, filename, out) sanity_checks.test_equality( - df, filename, assay_type, file_type_list, calculate_rdkit + df, filename, assay_type, file_type_list, args.calculate_rdkit ) @@ -127,6 +116,9 @@ def get_data_subsets( f"SEI_{drop_desc}", f"LLE_{drop_desc}", ] + + [ # exclude columns related to the other assay types + col for col in data.columns if col.startswith("B_") or col.startswith("BF_") + ] # exclude filtering columns ).drop_duplicates() # Restrict the dataset to targets with at least *min_nof_cpds* compounds with a pchembl value. @@ -165,145 +157,167 @@ def get_data_subsets( return data, df_enough_cpds, df_c_dt_d_dt, df_d_dt -def write_bf_to_file( +def write_subset_to_file( + df_combined_subset: pd.DataFrame, df_combined: pd.DataFrame, - chembl_version: str, - min_nof_cpds_bf: int, - output_path: str, - write_bf: bool, - write_to_csv: bool, - write_to_excel: bool, - delimiter: str, - limited_flag: str, - calculate_rdkit: bool, - df_sizes: list[list[int], list[int]], -) -> pd.DataFrame: + desc: str, + args: CalculationArgs, + out: OutputArgs, +) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: """ - Calculate relevant subsets for the portion of df_combined - that is based on binding+functional data. - If write_bf the subsets are written to output_path. - Independent of write_bf, filtering columns for BF are added to df_combined and returned. + Write BF or B subsets to file. + :param df_combined_subset: Subset with binding+functional (BF) or binding (B) assay-based data + in df_combined + :type df_combined_subset: pd.DataFrame :param df_combined: Pandas DataFrame with compound-target pairs :type df_combined: pd.DataFrame - :param chembl_version: Version of ChEMBL for output files - :type chembl_version: str - :param min_nof_cpds_bf: Miminum number of compounds per target - :type min_nof_cpds_bf: int - :param output_path: Path to write the output to - :type output_path: str - :param write_bf: Should the subsets be written to files? - :type write_bf: bool - :param write_to_csv: Should the subsets be written to csv? - :type write_to_csv: bool - :param write_to_excel: Should the subsets be written to excel? - :type write_to_excel: bool - :param delimiter: Delimiter for csv output - :type delimiter: str - :param limited_flag: Document suffix indicating - whether the dataset was limited to literature sources - :type limited_flag: str - :param calculate_rdkit: Does df_combined include RDKit-based columns? - :type calculate_rdkit: bool - :param df_sizes: List of intermediate sized of the dataset used for debugging. 
- :type df_sizes: list[list[int], list[int]] - :return: Pandas DataFrame with additional filtering columns for BF subsets - :rtype: pd.Dataframe + :param desc: Assay description, + either "BF" (binding+functional) or "B" (binding) + :type desc: str + :param args: Arguments related to how to calculate the dataset + :type args: CalculationArgs + :param out: Arguments related to how to output the dataset + :type out: OutputArgs + :return: List of calculated subsets + :rtype: tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame] """ - # consider binding and functional assays - # assay description = binding+functional - desc = "BF" - # df_combined with additional filtering columns - df_combined_annotated = df_combined.copy() - # df_combined without binding only data - df_combined_bf = df_combined.copy() ( - df_combined_bf, - df_combined_bf_enough_cpds, - df_combined_bf_c_dt_d_dt, - df_combined_bf_d_dt, - ) = get_data_subsets(df_combined_bf, min_nof_cpds_bf, desc) + df_combined_subset, + df_combined_subset_enough_cpds, + df_combined_subset_c_dt_d_dt, + df_combined_subset_d_dt, + ) = get_data_subsets( + df_combined_subset, + args.min_nof_cpds_bf if desc == "BF" else args.min_nof_cpds_b, + desc, + ) - # add filtering columns to df_combined_annotated + # add filtering columns to df_combined for df, col_name in zip( [ - df_combined_bf_enough_cpds, - df_combined_bf_c_dt_d_dt, - df_combined_bf_d_dt, + df_combined_subset_enough_cpds, + df_combined_subset_c_dt_d_dt, + df_combined_subset_d_dt, ], [ - f"BF_{min_nof_cpds_bf}", - f"BF_{min_nof_cpds_bf}_c_dt_d_dt", - f"BF_{min_nof_cpds_bf}_d_dt", + f"{desc}_{args.min_nof_cpds_bf}", + f"{desc}_{args.min_nof_cpds_bf}_c_dt_d_dt", + f"{desc}_{args.min_nof_cpds_bf}_d_dt", ], ): - df_combined_annotated[col_name] = False - df_combined_annotated.loc[ - (df_combined_annotated.index.isin(df.index)), col_name - ] = True + df_combined[col_name] = False + df_combined.loc[(df_combined.index.isin(df.index)), col_name] = True # check that filtering works - assert df_combined_annotated[df_combined_annotated[col_name] == True][ - df.columns - ].equals(df), f"Filtering is not accurate for {col_name}." - - if write_bf: - # NOTE: This is almost identical to the full dataset which will be saved later on. - # However, the binding-related columns are dropped - name_bf = os.path.join( - output_path, f"ChEMBL{chembl_version}_CTI_{limited_flag}_BF" + assert df_combined[df_combined[col_name] == True][df.columns].equals( + df + ), f"Filtering is not accurate for {col_name}." + + if (desc == "BF" and out.write_bf) or (desc == "B" and out.write_b): + # NOTE: For BF this is almost identical to the full dataset + # which will be saved later on. + # However, the binding-related columns are dropped. 
+ name_subset = os.path.join( + out.output_path, + f"ChEMBL{args.chembl_version}_CTI_{args.limited_flag}_{desc}", ) write_and_check_output( - df_combined_bf, - name_bf, - write_to_csv, - write_to_excel, - delimiter, + df_combined_subset, + name_subset, desc, - calculate_rdkit, + args, + out, ) - name_bf_100 = os.path.join( - output_path, - f"ChEMBL{chembl_version}_CTI_{limited_flag}_BF_{min_nof_cpds_bf}", + name_subset_100 = os.path.join( + out.output_path, + f"ChEMBL{args.chembl_version}_" + f"CTI_{args.limited_flag}_" + f"{desc}_{args.min_nof_cpds_bf}", ) write_and_check_output( - df_combined_bf_enough_cpds, - name_bf_100, - write_to_csv, - write_to_excel, - delimiter, + df_combined_subset_enough_cpds, + name_subset_100, desc, - calculate_rdkit, + args, + out, ) - name_bf_100_c_dt_d_dt = os.path.join( - output_path, - f"ChEMBL{chembl_version}_CTI_{limited_flag}_BF_{min_nof_cpds_bf}_c_dt_d_dt", + name_subset_100_c_dt_d_dt = os.path.join( + out.output_path, + f"ChEMBL{args.chembl_version}_" + f"CTI_{args.limited_flag}_" + f"{desc}_{args.min_nof_cpds_bf}_c_dt_d_dt", ) write_and_check_output( - df_combined_bf_c_dt_d_dt, - name_bf_100_c_dt_d_dt, - write_to_csv, - write_to_excel, - delimiter, + df_combined_subset_c_dt_d_dt, + name_subset_100_c_dt_d_dt, desc, - calculate_rdkit, + args, + out, ) - name_bf_100_d_dt = os.path.join( - output_path, - f"ChEMBL{chembl_version}_CTI_{limited_flag}_BF_{min_nof_cpds_bf}_d_dt", + name_subset_100_d_dt = os.path.join( + out.output_path, + f"ChEMBL{args.chembl_version}_" + f"CTI_{args.limited_flag}_" + f"{desc}_{args.min_nof_cpds_bf}_d_dt", ) write_and_check_output( - df_combined_bf_d_dt, - name_bf_100_d_dt, - write_to_csv, - write_to_excel, - delimiter, - desc, - calculate_rdkit, + df_combined_subset_d_dt, name_subset_100_d_dt, desc, args, out ) + return ( + df_combined, + df_combined_subset, + df_combined_subset_enough_cpds, + df_combined_subset_c_dt_d_dt, + df_combined_subset_d_dt, + ) + + +def write_bf_to_file( + df_combined: pd.DataFrame, + df_sizes: list[list[int], list[int]], + args: CalculationArgs, + out: OutputArgs, +) -> pd.DataFrame: + """ + Calculate relevant subsets for the portion of df_combined + that is based on binding+functional data. + If write_bf the subsets are written to output_path. + Independent of write_bf, filtering columns for BF are added to df_combined and returned. + + :param df_combined: Pandas DataFrame with compound-target pairs + :type df_combined: pd.DataFrame + :param df_sizes: List of intermediate sized of the dataset used for debugging. 
+ :type df_sizes: list[list[int], list[int]] + :param args: Arguments related to how to calculate the dataset + :type args: CalculationArgs + :param out: Arguments related to how to output the dataset + :type out: OutputArgs + :return: Pandas DataFrame with additional filtering columns for BF subsets + :rtype: pd.Dataframe + """ + # consider binding and functional assays + # assay description = binding+functional + desc = "BF" + # df_combined without binding only data + df_combined_subset = df_combined.copy() + ( + df_combined, + df_combined_bf, + df_combined_bf_enough_cpds, + df_combined_bf_c_dt_d_dt, + df_combined_bf_d_dt, + ) = write_subset_to_file( + df_combined_subset, + df_combined, + desc, + args, + out, + ) + if logging.DEBUG >= logging.root.level: get_stats.add_dataset_sizes(df_combined_bf, "binding + functional", df_sizes) get_stats.add_dataset_sizes(df_combined_bf_enough_cpds, "BF, >= 100", df_sizes) @@ -312,22 +326,14 @@ def write_bf_to_file( ) get_stats.add_dataset_sizes(df_combined_bf_d_dt, "BF, >= 100, d_dt", df_sizes) - return df_combined_annotated + return df_combined def write_b_to_file( df_combined: pd.DataFrame, - df_combined_annotated: pd.DataFrame, - chembl_version: str, - min_nof_cpds_b: int, - output_path: str, - write_b: bool, - write_to_csv: bool, - write_to_excel: bool, - delimiter: str, - limited_flag: str, - calculate_rdkit: bool, df_sizes: list[list[int], list[int]], + args: CalculationArgs, + out: OutputArgs, ) -> pd.DataFrame: """ Calculate relevant subsets for the portion of df_combined that is based on binding data. @@ -336,115 +342,32 @@ def write_b_to_file( :param df_combined: Pandas DataFrame with compound-target pairs :type df_combined: pd.DataFrame - :param df_combined_annotated: Pandas DataFrame with additional filtering columns - :type df_combined_annotated: pd.DataFrame - :param chembl_version: Version of ChEMBL for output files - :type chembl_version: str - :param min_nof_cpds_b: Miminum number of compounds per target - :type min_nof_cpds_b: int - :param output_path: Path to write the output to - :type output_path: str - :param write_b: Should the subsets be written to files? - :type write_b: bool - :param write_to_csv: Should the subsets be written to csv? - :type write_to_csv: bool - :param write_to_excel: Should the subsets be written to excel? - :type write_to_excel: bool - :param delimiter: Delimiter for csv output - :type delimiter: str - :param limited_flag: Document suffix indicating - whether the dataset was limited to literature sources - :type limited_flag: str - :param calculate_rdkit: Does df_combined include RDKit-based columns? - :type calculate_rdkit: bool :param df_sizes: List of intermediate sized of the dataset used for debugging. 
:type df_sizes: list[list[int], list[int]] + :param args: Arguments related to how to calculate the dataset + :type args: CalculationArgs + :param out: Arguments related to how to output the dataset + :type out: OutputArgs :return: Pandas DataFrame with additional filtering columns for B subsets :rtype: pd.Dataframe """ # consider only binding assays # assay description = binding desc = "B" - df_combined_b = df_combined[df_combined["keep_for_binding"] == True].copy() + df_combined_subset = df_combined[df_combined["keep_for_binding"] == True].copy() ( + df_combined, df_combined_b, df_combined_b_enough_cpds, df_combined_b_c_dt_d_dt, df_combined_b_d_dt, - ) = get_data_subsets(df_combined_b, min_nof_cpds_b, desc) - - # add filtering columns to df_combined_annotated - for df, col_name in zip( - [df_combined_b_enough_cpds, df_combined_b_c_dt_d_dt, df_combined_b_d_dt], - [ - f"B_{min_nof_cpds_b}", - f"B_{min_nof_cpds_b}_c_dt_d_dt", - f"B_{min_nof_cpds_b}_d_dt", - ], - ): - df_combined_annotated[col_name] = False - df_combined_annotated.loc[ - (df_combined_annotated.index.isin(df.index)), col_name - ] = True - # check that filtering works - assert df_combined_annotated[df_combined_annotated[col_name] == True][ - df.columns - ].equals(df), f"Filtering is not accurate for {col_name}." - - if write_b: - name_b = os.path.join( - output_path, f"ChEMBL{chembl_version}_CTI_{limited_flag}_B" - ) - write_and_check_output( - df_combined_b, - name_b, - write_to_csv, - write_to_excel, - delimiter, - desc, - calculate_rdkit, - ) - - name_b_100 = os.path.join( - output_path, f"ChEMBL{chembl_version}_CTI_{limited_flag}_B_{min_nof_cpds_b}" - ) - write_and_check_output( - df_combined_b_enough_cpds, - name_b_100, - write_to_csv, - write_to_excel, - delimiter, - desc, - calculate_rdkit, - ) - - name_b_100_c_dt_d_dt = os.path.join( - output_path, - f"ChEMBL{chembl_version}_CTI_{limited_flag}_B_{min_nof_cpds_b}_c_dt_d_dt", - ) - write_and_check_output( - df_combined_b_c_dt_d_dt, - name_b_100_c_dt_d_dt, - write_to_csv, - write_to_excel, - delimiter, - desc, - calculate_rdkit, - ) - - name_b_100_d_dt = os.path.join( - output_path, - f"ChEMBL{chembl_version}_CTI_{limited_flag}_B_{min_nof_cpds_b}_d_dt", - ) - write_and_check_output( - df_combined_b_d_dt, - name_b_100_d_dt, - write_to_csv, - write_to_excel, - delimiter, - desc, - calculate_rdkit, - ) + ) = write_subset_to_file( + df_combined_subset, + df_combined, + desc, + args, + out, + ) if logging.DEBUG >= logging.root.level: get_stats.add_dataset_sizes(df_combined_b, "binding", df_sizes) @@ -454,79 +377,46 @@ def write_b_to_file( ) get_stats.add_dataset_sizes(df_combined_b_d_dt, "B, >= 100, d_dt", df_sizes) - return df_combined_annotated + return df_combined def write_full_dataset_to_file( df_combined: pd.DataFrame, - chembl_version: str, - output_path: str, - write_full_dataset: bool, - write_to_csv: bool, - write_to_excel: bool, - delimiter: str, - limited_flag: str, - calculate_rdkit: bool, + args: CalculationArgs, + out: OutputArgs, ): """ If write_full_dataset, write df_combined with filtering columns to output_path. :param df_combined: Pandas DataFrame with compound-target pairs and filtering columns :type df_combined: pd.DataFrame - :param chembl_version: Version of ChEMBL for output files - :type chembl_version: str - :param output_path: Path to write the output to - :type output_path: str - :param write_full_dataset: Should the subsets be written to files? - :type write_full_dataset: bool - :param write_to_csv: Should the subsets be written to csv? 
-    :type write_to_csv: bool
-    :param write_to_excel: Should the subsets be written to excel?
-    :type write_to_excel: bool
-    :param delimiter: Delimiter for csv output
-    :type delimiter: str
-    :param limited_flag: Document suffix indicating
-        whether the dataset was limited to literature sources
-    :type limited_flag: str
-    :param calculate_rdkit: Does df_combined include RDKit-based columns?
-    :type calculate_rdkit: bool
+    :param args: Arguments related to how to calculate the dataset
+    :type args: CalculationArgs
+    :param out: Arguments related to how to output the dataset
+    :type out: OutputArgs
     """
     desc = "all"
-    if write_full_dataset:
+    if out.write_full_dataset:
         name_all = os.path.join(
-            output_path, f"ChEMBL{chembl_version}_CTI_{limited_flag}_full_dataset"
-        )
-        write_and_check_output(
-            df_combined,
-            name_all,
-            write_to_csv,
-            write_to_excel,
-            delimiter,
-            desc,
-            calculate_rdkit,
+            out.output_path,
+            f"ChEMBL{args.chembl_version}_CTI_{args.limited_flag}_full_dataset",
         )
+        write_and_check_output(df_combined, name_all, desc, args, out)
 
 
 def output_debug_sizes(
     df_sizes: list[list[int], list[int]],
-    output_path: str,
-    write_to_csv: bool,
-    write_to_excel: bool,
-    delimiter: str,
+    out: OutputArgs,
 ):
     """
     Output counts at various points during calculating the final dataset for debugging.
 
     :param df_sizes: List of intermediate sizes of the dataset used for debugging.
     :type df_sizes: list[list[int], list[int]]
-    :param output_path: Path to write the dataset counts to
-    :type output_path: str
-    :param write_to_csv: True if counts should be written to csv
-    :type write_to_csv: bool
-    :param write_to_excel: True if counts should be written to excel
-    :type write_to_excel: bool
-    :param delimiter: Delimiter in csv-output
-    :type delimiter: str
+    :param out: Arguments related to how to output the dataset
+    :type out: OutputArgs
     """
     column_names = [
         "type",
@@ -545,19 +435,21 @@
     logging.debug("Size of full dataset at different points.")
     full_df_sizes = pd.DataFrame(df_sizes[0], columns=column_names)
     logging.debug(full_df_sizes)
-    name_full_df_sizes = os.path.join(output_path, "debug_full_df_sizes")
+    name_full_df_sizes = os.path.join(out.output_path, "debug_full_df_sizes")
     write_output(
-        full_df_sizes, name_full_df_sizes, write_to_csv, write_to_excel, delimiter
+        full_df_sizes,
+        name_full_df_sizes,
+        out,
     )
 
     logging.debug("Size of dataset with any pchembl values at different points.")
     logging.debug(
         "This dataset is a subset of the full dataset, "
         "therefore the dataset sizes are smaller than the sizes of the full dataset."
     )
     df_pchembl_sizes = pd.DataFrame(df_sizes[1], columns=column_names)
     logging.debug(df_pchembl_sizes)
-    name_pchembl_df_sizes = os.path.join(output_path, "debug_pchembl_df_sizes")
+    name_pchembl_df_sizes = os.path.join(out.output_path, "debug_pchembl_df_sizes")
     write_output(
-        full_df_sizes, name_pchembl_df_sizes, write_to_csv, write_to_excel, delimiter
+        df_pchembl_sizes,
+        name_pchembl_df_sizes,
+        out,
     )
 
 
 def output_stats(
     df: pd.DataFrame,
     output_file: str,
-    write_to_csv: bool,
-    write_to_excel: bool,
-    delimiter: str,
+    out: OutputArgs,
 ):
     """
     Summarise and output the number of unique values in the following columns:
@@ -598,12 +489,8 @@
     :param df: Pandas DataFrame for which the stats should be calculated
     :type df: pd.DataFrame
     :param output_file: Path and filename to write the dataset stats to
     :type output_file: str
-    :param write_to_csv: True if stats should be written to csv
-    :type write_to_csv: bool
-    :param write_to_excel: True if stats should be written to excel
-    :type write_to_excel: bool
-    :param delimiter: Delimiter in csv-output
-    :type delimiter: str
+    :param out: Arguments related to how to output the dataset
+    :type out: OutputArgs
     """
     df_columns = [
         "parent_molregno",
@@ -621,4 +509,55 @@
     df_stats = pd.DataFrame(
         stats, columns=["column", "column_description", "subset_type", "counts"]
     )
-    write_output(df_stats, output_file, write_to_csv, write_to_excel, delimiter)
+    write_output(
+        df_stats,
+        output_file,
+        out,
+    )
+
+
+def output_all_stats(
+    df_combined_annotated: pd.DataFrame, args: CalculationArgs, out: OutputArgs
+):
+    """
+    Output stats for all datasets and subsets calculated.
+
+    :param df_combined_annotated: Pandas DataFrame with additional filtering columns
+    :type df_combined_annotated: pd.DataFrame
+    :param args: Arguments related to how to calculate the dataset
+    :type args: CalculationArgs
+    :param out: Arguments related to how to output the dataset
+    :type out: OutputArgs
+    """
+    output_file = os.path.join(
+        out.output_path,
+        f"ChEMBL{args.chembl_version}_CTI_{args.limited_flag}_full_dataset_stats",
+    )
+
+    output_stats(df_combined_annotated, output_file, out)
+
+    if out.write_bf:
+        output_file = os.path.join(
+            out.output_path,
+            f"ChEMBL{args.chembl_version}_"
+            f"CTI_{args.limited_flag}_"
+            f"BF_{args.min_nof_cpds_bf}_c_dt_d_dt_stats",
+        )
+        output_stats(
+            df_combined_annotated[df_combined_annotated[f"BF_{args.min_nof_cpds_bf}_c_dt_d_dt"]],
+            output_file,
+            out,
+        )
+
+    if out.write_b:
+        output_file = os.path.join(
+            out.output_path,
+            f"ChEMBL{args.chembl_version}_"
+            f"CTI_{args.limited_flag}_"
+            f"B_{args.min_nof_cpds_b}_c_dt_d_dt_stats",
+        )
+        output_stats(
+            df_combined_annotated[df_combined_annotated[f"B_{args.min_nof_cpds_b}_c_dt_d_dt"]],
+            output_file,
+            out,
+        )
From d8331c9bb26828d4800250e06568ef756085d38c Mon Sep 17 00:00:00 2001
From: Lina Heinzke
Date: Thu, 15 Feb 2024 16:32:10 +0000
Subject: [PATCH 2/8] Add module to add filtering columns

---
 src/add_filtering_columns.py | 233 ++++++++++++++++++++++++++
 src/get_dataset.py           |  13 +-
 src/write_subsets.py         | 308 -----------------------------------
 3 files changed, 236 insertions(+), 318 deletions(-)
 create mode 100644 src/add_filtering_columns.py

diff --git a/src/add_filtering_columns.py b/src/add_filtering_columns.py
new file mode 100644
index 0000000..053d9ad
--- /dev/null
+++ b/src/add_filtering_columns.py
@@ -0,0 +1,233 @@
+import logging
+import os
+
+import pandas as pd
+
+from arguments import CalculationArgs, OutputArgs
+import get_stats
+import write_subsets
+
+
+def get_data_subsets(
+    data: pd.DataFrame, min_nof_cpds: int, desc: str
+) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+    """
+    Calculate and return the different subsets of interest.
+
+    :param data: Pandas DataFrame with compound-target pairs
+    :type data: pd.DataFrame
+    :param min_nof_cpds: Minimum number of compounds per target
+    :type min_nof_cpds: int
+    :param desc: Types of assays current_df contains information about. \
+        Options: "BF" (binding+functional), "B" (binding)
+    :type desc: str
+    :return:
+        - data: Pandas DataFrame with compound-target pairs
+        without the annotations for the opposite desc, \
+        e.g. 
if desc = "BF", the average pchembl value based on
+            binding data only is dropped
+        - df_enough_cpds: Pandas DataFrame with targets
+        with at least *min_nof_cpds* compounds with a pchembl value,
+        - df_c_dt_d_dt: As df_enough_cpds but with \
+        at least one compound-target pair labelled as
+        'D_DT', 'C3_DT', 'C2_DT', 'C1_DT' or 'C0_DT' (i.e., known interaction),
+        - df_d_dt: As df_enough_cpds but with \
+        at least one compound-target pair labelled as
+            'D_DT' (i.e., known drug-target interaction)
+    :rtype: (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame)
+    """
+    if desc == "B":
+        drop_desc = "BF"
+    else:
+        drop_desc = "B"
+    data = data.drop(
+        columns=[
+            f"pchembl_value_mean_{drop_desc}",
+            f"pchembl_value_max_{drop_desc}",
+            f"pchembl_value_median_{drop_desc}",
+            f"first_publication_cpd_target_pair_{drop_desc}",
+            f"first_publication_cpd_target_pair_w_pchembl_{drop_desc}",
+            f"LE_{drop_desc}",
+            f"BEI_{drop_desc}",
+            f"SEI_{drop_desc}",
+            f"LLE_{drop_desc}",
+        ]
+        + [  # exclude columns related to the other assay types
+            col for col in data.columns if col.startswith("B_") or col.startswith("BF_")
+        ]  # exclude filtering columns
+    ).drop_duplicates()
+
+    # Restrict the dataset to targets with at least *min_nof_cpds* compounds with a pchembl value.
+    comparator_counts = (
+        data[data[f"pchembl_value_mean_{desc}"].notnull()]
+        .groupby(["tid_mutation"])["parent_molregno"]
+        .count()
+    )
+    # pylint: disable-next=unused-variable
+    targets_w_enough_cpds = comparator_counts[
+        comparator_counts >= min_nof_cpds
+    ].index.tolist()
+    df_enough_cpds = data.query("tid_mutation in @targets_w_enough_cpds")
+
+    # Restrict the dataset further to targets
+    # with at least one compound-target pair labelled as
+    # 'D_DT', 'C3_DT', 'C2_DT', 'C1_DT' or 'C0_DT',
+    # i.e., compound-target pairs with known interactions.
+    # pylint: disable-next=unused-variable
+    c_dt_d_dt_targets = set(
+        df_enough_cpds[
+            df_enough_cpds["DTI"].isin(["D_DT", "C3_DT", "C2_DT", "C1_DT", "C0_DT"])
+        ].tid_mutation.to_list()
+    )
+    df_c_dt_d_dt = df_enough_cpds.query("tid_mutation in @c_dt_d_dt_targets")
+
+    # Restrict the dataset further to targets with
+    # at least one compound-target pair labelled as 'D_DT',
+    # i.e., known drug-target interactions.
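+    # (d_dt_targets is referenced via the "@" syntax inside the pandas query
+    # string below, which pylint cannot see, hence the disable comment.)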
+ # pylint: disable-next=unused-variable + d_dt_targets = set( + df_enough_cpds[df_enough_cpds["DTI"] == "D_DT"].tid_mutation.to_list() + ) + df_d_dt = df_enough_cpds.query("tid_mutation in @d_dt_targets") + + return data, df_enough_cpds, df_c_dt_d_dt, df_d_dt + + +def add_subset_filtering_columns( + df_combined_subset: pd.DataFrame, + df_combined: pd.DataFrame, + desc: str, + args: CalculationArgs, + out: OutputArgs, + df_sizes, +) -> pd.DataFrame: + # TODO update documentation + """ + Add filtering column for binding + functional vs binding + + :param df_combined_subset: Subset with binding+functional (BF) or binding (B) assay-based data + in df_combined + :type df_combined_subset: pd.DataFrame + :param df_combined: Pandas DataFrame with compound-target pairs + :type df_combined: pd.DataFrame + :param desc: Assay description, + either "BF" (binding+functional) or "B" (binding) + :type desc: str + :param args: Arguments related to how to calculate the dataset + :type args: CalculationArgs + :return: List of calculated subsets + :rtype: pd.DataFrame + """ + ( + df_combined_subset, + df_combined_subset_enough_cpds, + df_combined_subset_c_dt_d_dt, + df_combined_subset_d_dt, + ) = get_data_subsets( + df_combined_subset, + args.min_nof_cpds_bf if desc == "BF" else args.min_nof_cpds_b, + desc, + ) + + # write subsets if required + if (desc == "BF" and out.write_bf) or (desc == "B" and out.write_b): + for df_subset, subset_desc in zip( + [ + df_combined_subset, + df_combined_subset_enough_cpds, + df_combined_subset_c_dt_d_dt, + df_combined_subset_d_dt, + ], + [ + f"{desc}", + f"{desc}_{args.min_nof_cpds_bf}", + f"{desc}_{args.min_nof_cpds_bf}_c_dt_d_dt", + f"{desc}_{args.min_nof_cpds_bf}_d_dt", + ], + ): + name_subset = os.path.join( + out.output_path, + f"ChEMBL{args.chembl_version}_" + f"CTI_{args.limited_flag}_" + f"{subset_desc}", + ) + write_subsets.write_and_check_output( + df_subset, + name_subset, + desc, + args, + out, + ) + + # add filtering columns to df_combined + for df, col_name in zip( + [ + df_combined_subset_enough_cpds, + df_combined_subset_c_dt_d_dt, + df_combined_subset_d_dt, + ], + [ + f"{desc}_{args.min_nof_cpds_bf}", + f"{desc}_{args.min_nof_cpds_bf}_c_dt_d_dt", + f"{desc}_{args.min_nof_cpds_bf}_d_dt", + ], + ): + df_combined[col_name] = False + df_combined.loc[(df_combined.index.isin(df.index)), col_name] = True + # check that filtering works + assert df_combined[df_combined[col_name] == True][df.columns].equals( + df + ), f"Filtering is not accurate for {col_name}." 
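+        # (Selecting the rows flagged by the new column and restricting them to
+        # the subset's columns must reproduce the subset exactly.)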
+
+    if logging.DEBUG >= logging.root.level:
+        # label the entries with the assay types they are based on (BF or B)
+        get_stats.add_dataset_sizes(
+            df_combined_subset,
+            "binding + functional" if desc == "BF" else "binding",
+            df_sizes,
+        )
+        get_stats.add_dataset_sizes(
+            df_combined_subset_enough_cpds, f"{desc}, >= 100", df_sizes
+        )
+        get_stats.add_dataset_sizes(
+            df_combined_subset_c_dt_d_dt, f"{desc}, >= 100, c_dt and d_dt", df_sizes
+        )
+        get_stats.add_dataset_sizes(
+            df_combined_subset_d_dt, f"{desc}, >= 100, d_dt", df_sizes
+        )
+
+    return df_combined
+
+
+def add_filtering_columns(
+    df_combined,
+    df_sizes,
+    args,
+    out,
+):
+    # TODO: documentation
+    # consider binding and functional assays
+    # assay description = binding+functional
+    desc = "BF"
+    # df_combined without binding only data
+    df_combined_subset = df_combined.copy()
+    df_combined = add_subset_filtering_columns(
+        df_combined_subset,
+        df_combined,
+        desc,
+        args,
+        out,
+        df_sizes,
+    )
+
+    # consider only binding assays
+    # assay description = binding
+    desc = "B"
+    df_combined_subset = df_combined[df_combined["keep_for_binding"] == True].copy()
+    df_combined = add_subset_filtering_columns(
+        df_combined_subset,
+        df_combined,
+        desc,
+        args,
+        out,
+        df_sizes,
+    )
+
+    return df_combined
diff --git a/src/get_dataset.py b/src/get_dataset.py
index 053ec8e..7fd86c3 100644
--- a/src/get_dataset.py
+++ b/src/get_dataset.py
@@ -12,6 +12,7 @@
 import write_subsets
 import get_stats
 from arguments import OutputArgs, CalculationArgs
+import add_filtering_columns
 
 
 def get_ct_pair_dataset(
@@ -102,16 +103,8 @@
         args.calculate_rdkit,
     )
 
-    logging.info("write_BF_to_file")
-    df_combined = write_subsets.write_bf_to_file(
-        df_combined,
-        df_sizes,
-        args,
-        out,
-    )
-
-    logging.info("write_B_to_file")
-    df_combined = write_subsets.write_b_to_file(
+    logging.info("add_filtering_columns")
+    df_combined = add_filtering_columns.add_filtering_columns(
         df_combined,
         df_sizes,
         args,
diff --git a/src/write_subsets.py b/src/write_subsets.py
index c979511..5b0bc21 100644
--- a/src/write_subsets.py
+++ b/src/write_subsets.py
@@ -72,314 +72,6 @@
     )
 
 
-def get_data_subsets(
-    data: pd.DataFrame, min_nof_cpds: int, desc: str
-) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
-    """
-    Calculate and return the different subsets of interest.
-
-    :param data: Pandas DataFrame with compound-target pairs
-    :type data: pd.DataFrame
-    :param min_nof_cpds: Miminum number of compounds per target
-    :type min_nof_cpds: int
-    :param desc: Types of assays current_df contains information about. \
-        Options: "BF" (binding+functional), "B" (binding)
-    :type desc: str
-    :return:
-        - data: Pandas DataFrame with compound-target pairs
-        without the annotations for the opposite desc, \
-        e.g. 
if desc = "BF", the average pchembl value based on - binding data only is dropped - - df_enough_cpds: Pandas DataFrame with targets - with at least compounds with a pchembl value, - - df_c_dt_d_dt: As df_enough_cpds but with \ - at least one compound-target pair labelled as - 'D_DT', 'C3_DT', 'C2_DT', 'C1_DT' or 'C0_DT' (i.e., known interaction), - - df_d_dt: As df_enough_cpds but with \ - at least one compound-target pair labelled as - 'D_DT' (i.e., known drug-target interaction) - :rtype: (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame) - """ - if desc == "B": - drop_desc = "BF" - else: - drop_desc = "B" - data = data.drop( - columns=[ - f"pchembl_value_mean_{drop_desc}", - f"pchembl_value_max_{drop_desc}", - f"pchembl_value_median_{drop_desc}", - f"first_publication_cpd_target_pair_{drop_desc}", - f"first_publication_cpd_target_pair_w_pchembl_{drop_desc}", - f"LE_{drop_desc}", - f"BEI_{drop_desc}", - f"SEI_{drop_desc}", - f"LLE_{drop_desc}", - ] - + [ # exclude columns related to the other assay types - col for col in data.columns if col.startswith("B_") or col.startswith("BF_") - ] # exclude filtering columns - ).drop_duplicates() - - # Restrict the dataset to targets with at least *min_nof_cpds* compounds with a pchembl value. - comparator_counts = ( - data[data[f"pchembl_value_mean_{desc}"].notnull()] - .groupby(["tid_mutation"])["parent_molregno"] - .count() - ) - # pylint: disable-next=unused-variable - targets_w_enough_cpds = comparator_counts[ - comparator_counts >= min_nof_cpds - ].index.tolist() - df_enough_cpds = data.query("tid_mutation in @targets_w_enough_cpds") - - # Restrict the dataset further to targets - # with at least one compound-target pair labelled as - # 'D_DT', 'C3_DT', 'C2_DT', 'C1_DT' or 'C0_DT', - # i.e., compound-target pairs with a known interactions. - # pylint: disable-next=unused-variable - c_dt_d_dt_targets = set( - df_enough_cpds[ - df_enough_cpds["DTI"].isin(["D_DT", "C3_DT", "C2_DT", "C1_DT", "C0_DT"]) - ].tid_mutation.to_list() - ) - df_c_dt_d_dt = df_enough_cpds.query("tid_mutation in @c_dt_d_dt_targets") - - # Restrict the dataset further to targets with - # at least one compound-target pair labelled as 'D_DT', - # i.e., known drug-target interactions. - # pylint: disable-next=unused-variable - d_dt_targets = set( - df_enough_cpds[df_enough_cpds["DTI"] == "D_DT"].tid_mutation.to_list() - ) - df_d_dt = df_enough_cpds.query("tid_mutation in @d_dt_targets") - - return data, df_enough_cpds, df_c_dt_d_dt, df_d_dt - - -def write_subset_to_file( - df_combined_subset: pd.DataFrame, - df_combined: pd.DataFrame, - desc: str, - args: CalculationArgs, - out: OutputArgs, -) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: - """ - Write BF or B subsets to file. 
- - :param df_combined_subset: Subset with binding+functional (BF) or binding (B) assay-based data - in df_combined - :type df_combined_subset: pd.DataFrame - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame - :param desc: Assay description, - either "BF" (binding+functional) or "B" (binding) - :type desc: str - :param args: Arguments related to how to calculate the dataset - :type args: CalculationArgs - :param out: Arguments related to how to output the dataset - :type out: OutputArgs - :return: List of calculated subsets - :rtype: tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame] - """ - ( - df_combined_subset, - df_combined_subset_enough_cpds, - df_combined_subset_c_dt_d_dt, - df_combined_subset_d_dt, - ) = get_data_subsets( - df_combined_subset, - args.min_nof_cpds_bf if desc == "BF" else args.min_nof_cpds_b, - desc, - ) - - # add filtering columns to df_combined - for df, col_name in zip( - [ - df_combined_subset_enough_cpds, - df_combined_subset_c_dt_d_dt, - df_combined_subset_d_dt, - ], - [ - f"{desc}_{args.min_nof_cpds_bf}", - f"{desc}_{args.min_nof_cpds_bf}_c_dt_d_dt", - f"{desc}_{args.min_nof_cpds_bf}_d_dt", - ], - ): - df_combined[col_name] = False - df_combined.loc[(df_combined.index.isin(df.index)), col_name] = True - # check that filtering works - assert df_combined[df_combined[col_name] == True][df.columns].equals( - df - ), f"Filtering is not accurate for {col_name}." - - if (desc == "BF" and out.write_bf) or (desc == "B" and out.write_b): - # NOTE: For BF this is almost identical to the full dataset - # which will be saved later on. - # However, the binding-related columns are dropped. - name_subset = os.path.join( - out.output_path, - f"ChEMBL{args.chembl_version}_CTI_{args.limited_flag}_{desc}", - ) - write_and_check_output( - df_combined_subset, - name_subset, - desc, - args, - out, - ) - - name_subset_100 = os.path.join( - out.output_path, - f"ChEMBL{args.chembl_version}_" - f"CTI_{args.limited_flag}_" - f"{desc}_{args.min_nof_cpds_bf}", - ) - write_and_check_output( - df_combined_subset_enough_cpds, - name_subset_100, - desc, - args, - out, - ) - - name_subset_100_c_dt_d_dt = os.path.join( - out.output_path, - f"ChEMBL{args.chembl_version}_" - f"CTI_{args.limited_flag}_" - f"{desc}_{args.min_nof_cpds_bf}_c_dt_d_dt", - ) - write_and_check_output( - df_combined_subset_c_dt_d_dt, - name_subset_100_c_dt_d_dt, - desc, - args, - out, - ) - - name_subset_100_d_dt = os.path.join( - out.output_path, - f"ChEMBL{args.chembl_version}_" - f"CTI_{args.limited_flag}_" - f"{desc}_{args.min_nof_cpds_bf}_d_dt", - ) - write_and_check_output( - df_combined_subset_d_dt, name_subset_100_d_dt, desc, args, out - ) - - return ( - df_combined, - df_combined_subset, - df_combined_subset_enough_cpds, - df_combined_subset_c_dt_d_dt, - df_combined_subset_d_dt, - ) - - -def write_bf_to_file( - df_combined: pd.DataFrame, - df_sizes: list[list[int], list[int]], - args: CalculationArgs, - out: OutputArgs, -) -> pd.DataFrame: - """ - Calculate relevant subsets for the portion of df_combined - that is based on binding+functional data. - If write_bf the subsets are written to output_path. - Independent of write_bf, filtering columns for BF are added to df_combined and returned. - - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame - :param df_sizes: List of intermediate sized of the dataset used for debugging. 
- :type df_sizes: list[list[int], list[int]] - :param args: Arguments related to how to calculate the dataset - :type args: CalculationArgs - :param out: Arguments related to how to output the dataset - :type out: OutputArgs - :return: Pandas DataFrame with additional filtering columns for BF subsets - :rtype: pd.Dataframe - """ - # consider binding and functional assays - # assay description = binding+functional - desc = "BF" - # df_combined without binding only data - df_combined_subset = df_combined.copy() - ( - df_combined, - df_combined_bf, - df_combined_bf_enough_cpds, - df_combined_bf_c_dt_d_dt, - df_combined_bf_d_dt, - ) = write_subset_to_file( - df_combined_subset, - df_combined, - desc, - args, - out, - ) - - if logging.DEBUG >= logging.root.level: - get_stats.add_dataset_sizes(df_combined_bf, "binding + functional", df_sizes) - get_stats.add_dataset_sizes(df_combined_bf_enough_cpds, "BF, >= 100", df_sizes) - get_stats.add_dataset_sizes( - df_combined_bf_c_dt_d_dt, "BF, >= 100, c_dt and d_dt", df_sizes - ) - get_stats.add_dataset_sizes(df_combined_bf_d_dt, "BF, >= 100, d_dt", df_sizes) - - return df_combined - - -def write_b_to_file( - df_combined: pd.DataFrame, - df_sizes: list[list[int], list[int]], - args: CalculationArgs, - out: OutputArgs, -) -> pd.DataFrame: - """ - Calculate relevant subsets for the portion of df_combined that is based on binding data. - If write_b the subsets are written to output_path. - Independent of write_b, filtering columns for B are added to df_combined_annotated. - - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame - :param df_sizes: List of intermediate sized of the dataset used for debugging. - :type df_sizes: list[list[int], list[int]] - :param args: Arguments related to how to calculate the dataset - :type args: CalculationArgs - :param out: Arguments related to how to output the dataset - :type out: OutputArgs - :return: Pandas DataFrame with additional filtering columns for B subsets - :rtype: pd.Dataframe - """ - # consider only binding assays - # assay description = binding - desc = "B" - df_combined_subset = df_combined[df_combined["keep_for_binding"] == True].copy() - ( - df_combined, - df_combined_b, - df_combined_b_enough_cpds, - df_combined_b_c_dt_d_dt, - df_combined_b_d_dt, - ) = write_subset_to_file( - df_combined_subset, - df_combined, - desc, - args, - out, - ) - - if logging.DEBUG >= logging.root.level: - get_stats.add_dataset_sizes(df_combined_b, "binding", df_sizes) - get_stats.add_dataset_sizes(df_combined_b_enough_cpds, "B, >= 100", df_sizes) - get_stats.add_dataset_sizes( - df_combined_b_c_dt_d_dt, "B, >= 100, c_dt and d_dt", df_sizes - ) - get_stats.add_dataset_sizes(df_combined_b_d_dt, "B, >= 100, d_dt", df_sizes) - - return df_combined - - def write_full_dataset_to_file( df_combined: pd.DataFrame, args: CalculationArgs, From 4fe5118f13ee4fa6a73511fb1b68ce01e6e551ee Mon Sep 17 00:00:00 2001 From: Lina Heinzke Date: Mon, 19 Feb 2024 16:06:47 +0000 Subject: [PATCH 3/8] Improve add_filtering_columns --- src/add_filtering_columns.py | 130 ++++++++++++++++------------------- 1 file changed, 60 insertions(+), 70 deletions(-) diff --git a/src/add_filtering_columns.py b/src/add_filtering_columns.py index 053d9ad..ce6a575 100644 --- a/src/add_filtering_columns.py +++ b/src/add_filtering_columns.py @@ -8,9 +8,12 @@ import write_subsets -def get_data_subsets( - data: pd.DataFrame, min_nof_cpds: int, desc: str -) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: +def 
get_data_subsets(data: pd.DataFrame, min_nof_cpds: int, desc: str) -> tuple[ + tuple[pd.DataFrame, str], + tuple[pd.DataFrame, str], + tuple[pd.DataFrame, str], + tuple[pd.DataFrame, str], +]: """ Calculate and return the different subsets of interest. @@ -18,23 +21,27 @@ def get_data_subsets( :type data: pd.DataFrame :param min_nof_cpds: Miminum number of compounds per target :type min_nof_cpds: int - :param desc: Types of assays current_df contains information about. \ + :param desc: Types of assays current_df contains information about. Options: "BF" (binding+functional), "B" (binding) :type desc: str - :return: - - data: Pandas DataFrame with compound-target pairs - without the annotations for the opposite desc, \ - e.g. if desc = "BF", the average pchembl value based on + :return: List of dataset subsets and the string describing them + - data: Pandas DataFrame with compound-target pairs + without filtering columns and without + the annotations for the opposite desc, + e.g. if desc = "BF", the average pchembl value based on binding data only is dropped - - df_enough_cpds: Pandas DataFrame with targets - with at least compounds with a pchembl value, - - df_c_dt_d_dt: As df_enough_cpds but with \ - at least one compound-target pair labelled as - 'D_DT', 'C3_DT', 'C2_DT', 'C1_DT' or 'C0_DT' (i.e., known interaction), - - df_d_dt: As df_enough_cpds but with \ - at least one compound-target pair labelled as + - df_enough_cpds: Pandas DataFrame with targets + with at least compounds with a pchembl value, + - df_c_dt_d_dt: As df_enough_cpds but with + at least one compound-target pair labelled as + 'D_DT', 'C3_DT', 'C2_DT', 'C1_DT' or 'C0_DT' (i.e., known interaction), + - df_d_dt: As df_enough_cpds but with + at least one compound-target pair labelled as 'D_DT' (i.e., known drug-target interaction) - :rtype: (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame) + :rtype: tuple[tuple[pd.DataFrame, str], + tuple[pd.DataFrame, str], + tuple[pd.DataFrame, str], + tuple[pd.DataFrame, str]] """ if desc == "B": drop_desc = "BF" @@ -90,7 +97,12 @@ def get_data_subsets( ) df_d_dt = df_enough_cpds.query("tid_mutation in @d_dt_targets") - return data, df_enough_cpds, df_c_dt_d_dt, df_d_dt + return [ + [data, f"{desc}"], + [df_enough_cpds, f"{desc}_{min_nof_cpds}"], + [df_c_dt_d_dt, f"{desc}_{min_nof_cpds}_c_dt_d_dt"], + [df_d_dt, f"{desc}_{min_nof_cpds}_d_dt"], + ] def add_subset_filtering_columns( @@ -99,9 +111,8 @@ def add_subset_filtering_columns( desc: str, args: CalculationArgs, out: OutputArgs, - df_sizes, + df_sizes: list[list[int], list[int]], ) -> pd.DataFrame: - # TODO update documentation """ Add filtering column for binding + functional vs binding @@ -115,15 +126,14 @@ def add_subset_filtering_columns( :type desc: str :param args: Arguments related to how to calculate the dataset :type args: CalculationArgs - :return: List of calculated subsets + :param out: Arguments related to how to output the dataset + :type out: OutputArgs + :param df_sizes: List of intermediate sized of the dataset used for debugging. 
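Returning each subset together with its label avoids the parallel lists used further down, which had to be kept in sync by hand and reused args.min_nof_cpds_bf in the column names even when desc was "B". A toy sketch (made-up frames and threshold) of the paired form:

    import pandas as pd

    df = pd.DataFrame({"v": [1, 2, 3]})
    desc, min_nof_cpds = "B", 2
    subsets = [
        [df, f"{desc}"],
        [df[df["v"] >= min_nof_cpds], f"{desc}_{min_nof_cpds}"],
    ]
    # the label travels with its subset, so callers cannot mispair them
    for [df_subset, subset_desc] in subsets:
        print(subset_desc, len(df_subset))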
+    :type df_sizes: list[list[int], list[int]]
+    :return: Pandas DataFrame with added filtering columns
     :rtype: pd.DataFrame
     """
-    (
-        df_combined_subset,
-        df_combined_subset_enough_cpds,
-        df_combined_subset_c_dt_d_dt,
-        df_combined_subset_d_dt,
-    ) = get_data_subsets(
+    subsets = get_data_subsets(
         df_combined_subset,
         args.min_nof_cpds_bf if desc == "BF" else args.min_nof_cpds_b,
         desc,
@@ -131,20 +141,7 @@ def add_subset_filtering_columns(

     # write subsets if required
     if (desc == "BF" and out.write_bf) or (desc == "B" and out.write_b):
-        for df_subset, subset_desc in zip(
-            [
-                df_combined_subset,
-                df_combined_subset_enough_cpds,
-                df_combined_subset_c_dt_d_dt,
-                df_combined_subset_d_dt,
-            ],
-            [
-                f"{desc}",
-                f"{desc}_{args.min_nof_cpds_bf}",
-                f"{desc}_{args.min_nof_cpds_bf}_c_dt_d_dt",
-                f"{desc}_{args.min_nof_cpds_bf}_d_dt",
-            ],
-        ):
+        for [df_subset, subset_desc] in subsets:
             name_subset = os.path.join(
                 out.output_path,
                 f"ChEMBL{args.chembl_version}_"
@@ -160,18 +157,8 @@ def add_subset_filtering_columns(
             )

     # add filtering columns to df_combined
-    for df, col_name in zip(
-        [
-            df_combined_subset_enough_cpds,
-            df_combined_subset_c_dt_d_dt,
-            df_combined_subset_d_dt,
-        ],
-        [
-            f"{desc}_{args.min_nof_cpds_bf}",
-            f"{desc}_{args.min_nof_cpds_bf}_c_dt_d_dt",
-            f"{desc}_{args.min_nof_cpds_bf}_d_dt",
-        ],
-    ):
+    # do not add a filtering column for BF / B (-> [1:])
+    for [df, col_name] in subsets[1:]:
         df_combined[col_name] = False
         df_combined.loc[(df_combined.index.isin(df.index)), col_name] = True
         # check that filtering works
@@ -180,29 +167,32 @@ def add_subset_filtering_columns(
         ), f"Filtering is not accurate for {col_name}."

     if logging.DEBUG >= logging.root.level:
-        get_stats.add_dataset_sizes(
-            df_combined_subset, "binding + functional", df_sizes
-        )
-        get_stats.add_dataset_sizes(
-            df_combined_subset_enough_cpds, "BF, >= 100", df_sizes
-        )
-        get_stats.add_dataset_sizes(
-            df_combined_subset_c_dt_d_dt, "BF, >= 100, c_dt and d_dt", df_sizes
-        )
-        get_stats.add_dataset_sizes(
-            df_combined_subset_d_dt, "BF, >= 100, d_dt", df_sizes
-        )
+        for [df_subset, subset_desc] in subsets:
+            get_stats.add_dataset_sizes(df_subset, subset_desc, df_sizes)

     return df_combined


 def add_filtering_columns(
-    df_combined,
-    df_sizes,
-    args,
-    out,
-):
-    # TODO: documentation
+    df_combined: pd.DataFrame,
+    df_sizes: list[list[int], list[int]],
+    args: CalculationArgs,
+    out: OutputArgs,
+) -> pd.DataFrame:
+    """
+    Add filtering columns to main dataset and save subsets if required.
+
+    :param df_combined: Pandas DataFrame with compound-target pairs
+    :type df_combined: pd.DataFrame
+    :param df_sizes: List of intermediate sizes of the dataset used for debugging.
+ :type df_sizes: list[list[int], list[int]] + :param args: Arguments related to how to calculate the dataset + :type args: CalculationArgs + :param out: Arguments related to how to output the dataset + :type out: OutputArgs + :return: Pandas DataFrame with added filering columns + :rtype: pd.DataFrame + """ # consider binding and functional assays # assay description = binding+functional desc = "BF" From 1ee99f99b61f47f81cd31c39bebfdbc41e0ba8cd Mon Sep 17 00:00:00 2001 From: Lina Heinzke Date: Tue, 20 Feb 2024 20:14:13 +0000 Subject: [PATCH 4/8] Add dataset dataclass --- src/add_chembl_compound_properties.py | 138 +++++++++--------- src/add_chembl_target_class_annotations.py | 49 ++++--- src/add_dti_annotations.py | 89 ++++++------ src/add_filtering_columns.py | 61 ++++---- src/add_rdkit_compound_descriptors.py | 117 ++++++++-------- src/clean_dataset.py | 102 +++++++------- src/dataset.py | 29 ++++ src/get_activity_ct_pairs.py | 23 ++- src/get_dataset.py | 93 ++++-------- src/get_drug_mechanism_ct_pairs.py | 57 ++++---- src/get_stats.py | 30 +++- src/sanity_checks.py | 156 ++++++++++----------- src/write_subsets.py | 33 +++-- 13 files changed, 496 insertions(+), 481 deletions(-) create mode 100644 src/dataset.py diff --git a/src/add_chembl_compound_properties.py b/src/add_chembl_compound_properties.py index 836b06a..4e5d623 100644 --- a/src/add_chembl_compound_properties.py +++ b/src/add_chembl_compound_properties.py @@ -2,11 +2,13 @@ import pandas as pd +from dataset import Dataset + ########### Add Compound Properties Based on ChEMBL Data ########### def add_first_publication_date( - df_combined: pd.DataFrame, chembl_con: sqlite3.Connection, limit_to_literature: bool -) -> pd.DataFrame: + dataset: Dataset, chembl_con: sqlite3.Connection, limit_to_literature: bool +): """ Query and calculate the first publication of a compound based on ChEMBL data (column name: first_publication_cpd). @@ -14,14 +16,13 @@ def add_first_publication_date( of the compound in the literature according to ChEMBL. Otherwise this is the first appearance in any source in ChEMBL. - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + Will be updated to include first_publication_cpd + :type dataset: Dataset :param chembl_con: Sqlite3 connection to ChEMBL database. :type chembl_con: sqlite3.Connection :param limit_to_literature: Base first_publication_cpd on literature sources only if True. :type limit_to_literature: bool - :return: Pandas DataFrame with added first_publication_cpd. - :rtype: pd.DataFrame """ # information about salts is aggregated in the parent sql = """ @@ -42,26 +43,26 @@ def add_first_publication_date( ].transform("min") df_docs = df_docs[["parent_molregno", "first_publication_cpd"]].drop_duplicates() - df_combined = df_combined.merge(df_docs, on="parent_molregno", how="left") - - return df_combined + dataset.df_result = dataset.df_result.merge( + df_docs, on="parent_molregno", how="left" + ) def add_chembl_properties_and_structures( - df_combined: pd.DataFrame, chembl_con: sqlite3.Connection -) -> tuple[pd.DataFrame, pd.DataFrame]: + dataset: Dataset, chembl_con: sqlite3.Connection +): """ - Add compound properties from the compound_properties table + Add compound properties from the compound_properties table (e.g., alogp, #hydrogen bond acceptors / donors, etc.). - Add InChI, InChI key and canonical smiles. + Add InChI, InChI key and canonical smiles. 
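From here on, patch 4 applies one conversion pattern throughout: instead of taking and returning DataFrames, each step receives the shared Dataset and writes back to dataset.df_result. A self-contained sketch of that pattern, with a toy one-field Dataset standing in for the real class defined in src/dataset.py later in this patch:

    from dataclasses import dataclass

    import pandas as pd

    @dataclass
    class Dataset:
        df_result: pd.DataFrame

    def add_flag(dataset: Dataset) -> None:
        # mutate the shared state instead of returning a new frame
        dataset.df_result["flag"] = True

    dataset = Dataset(df_result=pd.DataFrame({"tid": [1, 2]}))
    add_flag(dataset)
    assert "flag" in dataset.df_result.columns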
- :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + Will be updated to include compound properties and structures. + dataset.df_cpd_props will be set to + compound properties and structures for all compound ids in ChEMBL. + :type dataset: Dataset :param chembl_con: Sqlite3 connection to ChEMBL database. :type chembl_con: sqlite3.Connection - :return: - Pandas DataFrame with added compound properties and structures. \\ - - Pandas DataFrame with compound properties and structures for all compound ids in ChEMBL. - :rtype: (pd.DataFrame, pd.DataFrame) """ sql = """ SELECT DISTINCT mh.parent_molregno, @@ -78,13 +79,14 @@ def add_chembl_properties_and_structures( """ df_cpd_props = pd.read_sql_query(sql, con=chembl_con) + dataset.df_cpd_props = df_cpd_props - df_combined = df_combined.merge(df_cpd_props, on="parent_molregno", how="left") - - return df_combined, df_cpd_props + dataset.df_result = dataset.df_result.merge( + df_cpd_props, on="parent_molregno", how="left" + ) -def add_ligand_efficiency_metrics(df_combined: pd.DataFrame) -> pd.DataFrame: +def add_ligand_efficiency_metrics(dataset: Dataset): """ Calculate the ligand efficiency metrics for the compounds based on the mean pchembl values for a compound-target pair and @@ -108,33 +110,37 @@ def add_ligand_efficiency_metrics(df_combined: pd.DataFrame) -> pd.DataFrame: Once for the pchembl values based on binding + functional assays (BF) and once for the pchembl values based on binding assays only (B). - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame - :return: Pandas DataFrame with added ligand efficiency metrics - :rtype: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + Will be updated to include ligand efficiency metrics. 
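For concreteness, the four metrics computed in this function, evaluated once with hypothetical input values (2.303 * 298 * 0.00199 is 2.303*R*T in kcal/mol at 298 K, roughly 1.37 kcal/mol per pchembl unit):

    pchembl_mean, heavy_atoms = 7.0, 30
    mw_freebase, psa, alogp = 350.0, 70.0, 2.5

    le = pchembl_mean / heavy_atoms * (2.303 * 298 * 0.00199)  # ~0.32
    bei = pchembl_mean * 1000 / mw_freebase                    # 20.0
    sei = pchembl_mean * 100 / psa                             # 10.0
    lle = pchembl_mean - alogp                                 # 4.5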
+ :type dataset: Dataset """ for suffix in ["BF", "B"]: - df_combined.loc[df_combined["heavy_atoms"] != 0, f"LE_{suffix}"] = ( - df_combined[f"pchembl_value_mean_{suffix}"] - / df_combined["heavy_atoms"] + dataset.df_result.loc[dataset.df_result["heavy_atoms"] != 0, f"LE_{suffix}"] = ( + dataset.df_result[f"pchembl_value_mean_{suffix}"] + / dataset.df_result["heavy_atoms"] * (2.303 * 298 * 0.00199) ) - df_combined.loc[df_combined["mw_freebase"] != 0, f"BEI_{suffix}"] = ( - df_combined[f"pchembl_value_mean_{suffix}"] + dataset.df_result.loc[ + dataset.df_result["mw_freebase"] != 0, f"BEI_{suffix}" + ] = ( + dataset.df_result[f"pchembl_value_mean_{suffix}"] * 1000 - / df_combined["mw_freebase"] + / dataset.df_result["mw_freebase"] ) - df_combined.loc[df_combined["psa"] != 0, f"SEI_{suffix}"] = ( - df_combined[f"pchembl_value_mean_{suffix}"] * 100 / df_combined["psa"] + dataset.df_result.loc[dataset.df_result["psa"] != 0, f"SEI_{suffix}"] = ( + dataset.df_result[f"pchembl_value_mean_{suffix}"] + * 100 + / dataset.df_result["psa"] ) - df_combined[f"LLE_{suffix}"] = ( - df_combined[f"pchembl_value_mean_{suffix}"] - df_combined["alogp"] + dataset.df_result[f"LLE_{suffix}"] = ( + dataset.df_result[f"pchembl_value_mean_{suffix}"] + - dataset.df_result["alogp"] ) - df_combined = df_combined.astype( + dataset.df_result = dataset.df_result.astype( { f"LE_{suffix}": "float64", f"BEI_{suffix}": "float64", @@ -143,26 +149,21 @@ def add_ligand_efficiency_metrics(df_combined: pd.DataFrame) -> pd.DataFrame: } ) - return df_combined - -def add_atc_classification( - df_combined: pd.DataFrame, chembl_con: sqlite3.Connection -) -> tuple[pd.DataFrame, pd.DataFrame]: +def add_atc_classification(dataset: Dataset, chembl_con: sqlite3.Connection): """ - Query and add ATC classifications (level 1) from the atc_classification and + Query and add ATC classifications (level 1) from the atc_classification and molecule_atc_classification tables. - ATC level annotations for the same parent_molregno are combined into one description - that concatenates all descriptions sorted alphabetically + ATC level annotations for the same parent_molregno are combined into one description + that concatenates all descriptions sorted alphabetically into one string with ' | ' as a separator. - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + Will be updated to include ATC classifications. + dataset.atc_levels will be set to ATC annotations in ChEMBL. + :type dataset: Dataset :param chembl_con: Sqlite3 connection to ChEMBL database. 
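The aggregation described here reduces to a groupby-transform with an alphabetical join; a toy illustration with made-up ATC descriptions:

    import pandas as pd

    between_str_join = " | "
    atc_levels = pd.DataFrame({
        "parent_molregno": [1, 1, 2],
        "atc_level1": [
            "N (nervous system)",
            "C (cardiovascular system)",
            "J (antiinfectives)",
        ],
    })
    atc_levels["atc_level1"] = atc_levels.groupby(["parent_molregno"])[
        "atc_level1"
    ].transform(lambda x: between_str_join.join(sorted(x)))
    atc_levels = atc_levels.drop_duplicates()
    # parent 1 -> "C (cardiovascular system) | N (nervous system)"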
:type chembl_con: sqlite3.Connection - :return: - Pandas DataFrame with added ATC classifications \\ - - Pandas DataFrame with ATC annotations in ChEMBL - :rtype: (pd.DataFrame, pd.DataFrame) """ sql = """ SELECT DISTINCT mh.parent_molregno, atc.level1, atc.level1_description @@ -185,14 +186,16 @@ def add_atc_classification( ].transform(lambda x: between_str_join.join(sorted(x))) atc_levels = atc_levels[["parent_molregno", "atc_level1"]].drop_duplicates() - df_combined = df_combined.merge(atc_levels, on="parent_molregno", how="left") + dataset.atc_levels = atc_levels - return df_combined, atc_levels + dataset.df_result = dataset.df_result.merge( + atc_levels, on="parent_molregno", how="left" + ) def add_all_chembl_compound_properties( - df_combined: pd.DataFrame, chembl_con: sqlite3.Connection, limit_to_literature: bool -) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + dataset: Dataset, chembl_con: sqlite3.Connection, limit_to_literature: bool +): """ Add ChEMBL-based compound properties to the given compound-target pairs, specifically: @@ -202,24 +205,19 @@ def add_all_chembl_compound_properties( - ligand efficiency metrics - ATC classifications - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + Will be updated to include compound properties. + :type dataset: Dataset :param chembl_con: Sqlite3 connection to ChEMBL database. :type chembl_con: sqlite3.Connection - :param limit_to_literature: Base first_publication_cpd on literature sources only if True. + :param limit_to_literature: Base first_publication_cpd on literature sources only if True. Base it on all available sources otherwise. :type limit_to_literature: bool - :return: - Pandas DataFrame with added compound properties \\ - - Pandas DataFrame with compound properties and structures for all compound ids in ChEMBL \\ - - Pandas DataFrame with ATC annotations in ChEMBL - :rtype: (pd.DataFrame, pd.DataFrame, pd.DataFrame) """ - df_combined = add_first_publication_date( - df_combined, chembl_con, limit_to_literature - ) - df_combined, df_cpd_props = add_chembl_properties_and_structures( - df_combined, chembl_con - ) - df_combined = add_ligand_efficiency_metrics(df_combined) - df_combined, atc_levels = add_atc_classification(df_combined, chembl_con) - return df_combined, df_cpd_props, atc_levels + add_first_publication_date(dataset, chembl_con, limit_to_literature) + + add_chembl_properties_and_structures(dataset, chembl_con) + + add_ligand_efficiency_metrics(dataset) + + add_atc_classification(dataset, chembl_con) diff --git a/src/add_chembl_target_class_annotations.py b/src/add_chembl_target_class_annotations.py index 009b8d4..0cb388c 100644 --- a/src/add_chembl_target_class_annotations.py +++ b/src/add_chembl_target_class_annotations.py @@ -6,6 +6,7 @@ import write_subsets from arguments import OutputArgs, CalculationArgs +from dataset import Dataset ########### Add Target Class Annotations Based on ChEMBL Data ########### @@ -80,34 +81,35 @@ def get_target_class_table( def add_chembl_target_class_annotations( - df_combined: pd.DataFrame, + dataset: Dataset, chembl_con: sqlite3.Connection, args: CalculationArgs, out: OutputArgs, -) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: +): """ - Add level 1 and 2 target class annotations. - Assignments for target IDs with more than one target class assignment per level - are summarised into one string with '|' as a separator + Add level 1 and 2 target class annotations. 
+ Assignments for target IDs with more than one target class assignment per level + are summarised into one string with '|' as a separator between the different target class annotations. Targets with more than one level 1 / level 2 target class assignment are written to a file. These could be reassigned by hand if a single target class is preferable. - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + Will be updated to only include target class annotations. + dataset.target_classes_level1 will be set to + pandas DataFrame with mapping from target id to level 1 target class + dataset.target_classes_level2 will be set to + pandas DataFrame with mapping from target id to level 2 target class + :type dataset: Dataset :param chembl_con: Sqlite3 connection to ChEMBL database. :type chembl_con: sqlite3.Connection :param args: Arguments related to how to calculate the dataset :type args: CalculationArgs :param out: Arguments related to how to output the dataset :type out: OutputArgs - :return: - Pandas DataFrame with added target class annotations \\ - - Pandas DataFrame with mapping from target id to level 1 target class \\ - - Pandas DataFrame with mapping from target id to level 2 target class - :rtype: (pd.DataFrame, pd.DataFrame, pd.DataFrame) """ - current_tids = set(df_combined["tid"]) + current_tids = set(dataset.df_result["tid"]) df_target_classes = get_target_class_table(chembl_con, current_tids) # Summarise the information for a target id with @@ -143,7 +145,9 @@ def add_chembl_target_class_annotations( ["tid", "target_class_l1"] ].drop_duplicates() - df_combined = df_combined.merge(target_classes_level1, on="tid", how="left") + dataset.df_result = dataset.df_result.merge( + target_classes_level1, on="tid", how="left" + ) # Repeat the summary step for target classes of level 2. 
level = "l2" @@ -155,12 +159,14 @@ def add_chembl_target_class_annotations( ["tid", "target_class_l2"] ].drop_duplicates() - df_combined = df_combined.merge(target_classes_level2, on="tid", how="left") + dataset.df_result = dataset.df_result.merge( + target_classes_level2, on="tid", how="left" + ) # Output targets have more than one target class assignment - more_than_one_level_1 = df_combined[ - (df_combined["target_class_l1"].notnull()) - & (df_combined["target_class_l1"].str.contains("|", regex=False)) + more_than_one_level_1 = dataset.df_result[ + (dataset.df_result["target_class_l1"].notnull()) + & (dataset.df_result["target_class_l1"].str.contains("|", regex=False)) ][ ["tid", "target_pref_name", "target_type", "target_class_l1", "target_class_l2"] ].drop_duplicates() @@ -168,9 +174,9 @@ def add_chembl_target_class_annotations( "Targets with more than one level 1 target class assignment: %s", len(more_than_one_level_1), ) - more_than_one_level_2 = df_combined[ - (df_combined["target_class_l2"].notnull()) - & (df_combined["target_class_l2"].str.contains("|", regex=False)) + more_than_one_level_2 = dataset.df_result[ + (dataset.df_result["target_class_l2"].notnull()) + & (dataset.df_result["target_class_l2"].str.contains("|", regex=False)) ][ ["tid", "target_pref_name", "target_type", "target_class_l1", "target_class_l2"] ].drop_duplicates() @@ -197,4 +203,5 @@ def add_chembl_target_class_annotations( out, ) - return df_combined, target_classes_level1, target_classes_level2 + dataset.target_classes_level1 = target_classes_level1 + dataset.target_classes_level2 = target_classes_level2 diff --git a/src/add_dti_annotations.py b/src/add_dti_annotations.py index 9f181c5..533aa2d 100644 --- a/src/add_dti_annotations.py +++ b/src/add_dti_annotations.py @@ -1,12 +1,10 @@ -import pandas as pd +from dataset import Dataset ########### CTI (Compound-Target Interaction) Annotations ########### def add_dti_annotations( - df_combined: pd.DataFrame, - drug_mechanism_pairs_set: set, - drug_mechanism_targets_set: set, -) -> pd.DataFrame: + dataset: Dataset, +): """ Every compound-target pair is assigned a DTI (drug target interaction) annotation. @@ -60,84 +58,91 @@ def add_dti_annotations( and for which the target was also not in the drug_mechanisms table (not a comparator compound), are discarded. - :param df_combined: Pandas DataFrame with compound-target pairs - based on activities AND drug_mechanism table - :type df_combined: pd.DataFrame - :param drug_mechanism_pairs_set: set of compound-target pairs in the drug_mechanism table - :type drug_mechanism_pairs_set: set - :param drug_mechanism_targets_set: set of targets in the drug_mechanism table - :type drug_mechanism_targets_set: set - :return: Pandas DataFrame with all compound-target pairs and their DTI annotations. - :rtype: pd.DataFrame + :param dataset: Dataset with all relevant information: + - Pandas DataFrame with compound-target pairs + based on activities AND drug_mechanism table + - set of compound-target pairs in the drug_mechanism table + - set of targets in the drug_mechanism table + :type dataset: Dataset """ # Add a new column *therapeutic_target* which is set to True # if the target is in the drug_mechanism table - df_combined["therapeutic_target"] = df_combined["tid"].isin( - drug_mechanism_targets_set + dataset.df_result["therapeutic_target"] = dataset.df_result["tid"].isin( + dataset.drug_mechanism_targets_set ) # Assign the annotations based on the table. 
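The .loc assignments that follow implement the mapping summarised in the docstring; written out as a table (a hypothetical compact form, not code from this patch):

    # DTI annotation for pairs that are in the drug_mechanism table:
    max_phase_to_dti = {4: "D_DT", 3: "C3_DT", 2: "C2_DT", 1: "C1_DT"}
    # any other max_phase for such a pair   -> "C0_DT"
    # pair not in the table, but target is  -> "DT"
    # neither pair nor target in the table  -> "NDT" (discarded at the end)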
# Compound-target pairs from the drug mechanism table - df_combined.loc[ + dataset.df_result.loc[ ( - df_combined["cpd_target_pair"].isin(drug_mechanism_pairs_set) - & (df_combined["max_phase"] == 4) + dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set) + & (dataset.df_result["max_phase"] == 4) ), "DTI", ] = "D_DT" - df_combined.loc[ + dataset.df_result.loc[ ( - df_combined["cpd_target_pair"].isin(drug_mechanism_pairs_set) - & (df_combined["max_phase"] == 3) + dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set) + & (dataset.df_result["max_phase"] == 3) ), "DTI", ] = "C3_DT" - df_combined.loc[ + dataset.df_result.loc[ ( - df_combined["cpd_target_pair"].isin(drug_mechanism_pairs_set) - & (df_combined["max_phase"] == 2) + dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set) + & (dataset.df_result["max_phase"] == 2) ), "DTI", ] = "C2_DT" - df_combined.loc[ + dataset.df_result.loc[ ( - df_combined["cpd_target_pair"].isin(drug_mechanism_pairs_set) - & (df_combined["max_phase"] == 1) + dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set) + & (dataset.df_result["max_phase"] == 1) ), "DTI", ] = "C1_DT" # Compounds that are in the drug_mechanism table but don't have a known phase between 1-4: - df_combined.loc[ + dataset.df_result.loc[ ( - df_combined["cpd_target_pair"].isin(drug_mechanism_pairs_set) - & (~df_combined["max_phase"].isin([1, 2, 3, 4])) + dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set) + & (~dataset.df_result["max_phase"].isin([1, 2, 3, 4])) ), "DTI", ] = "C0_DT" # Target is in the drug mechanism table - df_combined.loc[ + dataset.df_result.loc[ ( - (~df_combined["cpd_target_pair"].isin(drug_mechanism_pairs_set)) - & (df_combined["therapeutic_target"] == True) + ( + ~dataset.df_result["cpd_target_pair"].isin( + dataset.drug_mechanism_pairs_set + ) + ) + & (dataset.df_result["therapeutic_target"] == True) ), "DTI", ] = "DT" # Other compound-target pairs # if target is not a therapeutic target, 'cpd_target_pair' cannot be in DTIs_set - # (~df_combined['cpd_target_pair'].isin(DTIs_set)) is included for clarity - df_combined.loc[ + # (~dataset.df_result['cpd_target_pair'].isin(DTIs_set)) is included for clarity + dataset.df_result.loc[ ( - (~df_combined["cpd_target_pair"].isin(drug_mechanism_pairs_set)) - & (df_combined["therapeutic_target"] == False) + ( + ~dataset.df_result["cpd_target_pair"].isin( + dataset.drug_mechanism_pairs_set + ) + ) + & (dataset.df_result["therapeutic_target"] == False) ), "DTI", ] = "NDT" # Discard NDT rows - df_combined = df_combined[ - (df_combined["DTI"].isin(["D_DT", "C3_DT", "C2_DT", "C1_DT", "C0_DT", "DT"])) + dataset.df_result = dataset.df_result[ + ( + dataset.df_result["DTI"].isin( + ["D_DT", "C3_DT", "C2_DT", "C1_DT", "C0_DT", "DT"] + ) + ) ] - - return df_combined diff --git a/src/add_filtering_columns.py b/src/add_filtering_columns.py index ce6a575..c26a721 100644 --- a/src/add_filtering_columns.py +++ b/src/add_filtering_columns.py @@ -6,6 +6,7 @@ from arguments import CalculationArgs, OutputArgs import get_stats import write_subsets +from dataset import Dataset def get_data_subsets(data: pd.DataFrame, min_nof_cpds: int, desc: str) -> tuple[ @@ -107,20 +108,20 @@ def get_data_subsets(data: pd.DataFrame, min_nof_cpds: int, desc: str) -> tuple[ def add_subset_filtering_columns( df_combined_subset: pd.DataFrame, - df_combined: pd.DataFrame, + dataset: Dataset, desc: str, args: CalculationArgs, out: OutputArgs, - df_sizes: 
list[list[int], list[int]], -) -> pd.DataFrame: +): """ Add filtering column for binding + functional vs binding :param df_combined_subset: Subset with binding+functional (BF) or binding (B) assay-based data in df_combined :type df_combined_subset: pd.DataFrame - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + Will be updated to only include filtering columns. + :type dataset: Dataset :param desc: Assay description, either "BF" (binding+functional) or "B" (binding) :type desc: str @@ -128,10 +129,6 @@ def add_subset_filtering_columns( :type args: CalculationArgs :param out: Arguments related to how to output the dataset :type out: OutputArgs - :param df_sizes: List of intermediate sized of the dataset used for debugging. - :type df_sizes: list[list[int], list[int]] - :return: Pandas DataFrame with added filering columns - :rtype: pd.DataFrame """ subsets = get_data_subsets( df_combined_subset, @@ -159,65 +156,57 @@ def add_subset_filtering_columns( # add filtering columns to df_combined # do not add a filtering column for BF / B (-> [1:]) for [df, col_name] in subsets[1:]: - df_combined[col_name] = False - df_combined.loc[(df_combined.index.isin(df.index)), col_name] = True + dataset.df_result[col_name] = False + dataset.df_result.loc[(dataset.df_result.index.isin(df.index)), col_name] = True # check that filtering works - assert df_combined[df_combined[col_name] == True][df.columns].equals( - df - ), f"Filtering is not accurate for {col_name}." + assert dataset.df_result[dataset.df_result[col_name] == True][ + df.columns + ].equals(df), f"Filtering is not accurate for {col_name}." if logging.DEBUG >= logging.root.level: for [df_subset, subset_desc] in subsets: - get_stats.add_dataset_sizes(df_subset, subset_desc, df_sizes) - - return df_combined + get_stats.add_debugging_info(dataset, df_subset, subset_desc) def add_filtering_columns( - df_combined: pd.DataFrame, - df_sizes: list[list[int], list[int]], + dataset: Dataset, args: CalculationArgs, out: OutputArgs, -) -> pd.DataFrame: +): """ Add filtering columns to main dataset and save subsets if required. - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame - :param df_sizes: List of intermediate sized of the dataset used for debugging. - :type df_sizes: list[list[int], list[int]] + :param dataset: Dataset with compound-target pairs. + Will be updated to only include filtering columns. 
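The filtering columns added above come with an exact-reconstruction check: re-filtering the full frame by the new boolean column must reproduce the subset. A toy sketch of that invariant:

    import pandas as pd

    df_full = pd.DataFrame({"v": [1, 2, 3]})
    df_subset = df_full[df_full["v"] >= 2]

    col_name = "B_2"  # hypothetical subset column name
    df_full[col_name] = False
    df_full.loc[df_full.index.isin(df_subset.index), col_name] = True
    # filtering by the column recovers the subset exactly
    assert df_full[df_full[col_name]][df_subset.columns].equals(df_subset)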
+ :type dataset: Dataset :param args: Arguments related to how to calculate the dataset :type args: CalculationArgs :param out: Arguments related to how to output the dataset :type out: OutputArgs - :return: Pandas DataFrame with added filering columns - :rtype: pd.DataFrame """ # consider binding and functional assays # assay description = binding+functional desc = "BF" # df_combined without binding only data - df_combined_subset = df_combined.copy() - df_combined = add_subset_filtering_columns( + df_combined_subset = dataset.df_result.copy() + add_subset_filtering_columns( df_combined_subset, - df_combined, + dataset, desc, args, out, - df_sizes, ) # consider only binding assays # assay description = binding desc = "B" - df_combined_subset = df_combined[df_combined["keep_for_binding"] == True].copy() - df_combined = add_subset_filtering_columns( + df_combined_subset = dataset.df_result[ + dataset.df_result["keep_for_binding"] == True + ].copy() + add_subset_filtering_columns( df_combined_subset, - df_combined, + dataset, desc, args, out, - df_sizes, ) - - return df_combined diff --git a/src/add_rdkit_compound_descriptors.py b/src/add_rdkit_compound_descriptors.py index 5c7d4e8..1bc9268 100644 --- a/src/add_rdkit_compound_descriptors.py +++ b/src/add_rdkit_compound_descriptors.py @@ -1,72 +1,74 @@ -import pandas as pd from rdkit import Chem from rdkit.Chem import Descriptors from rdkit.Chem import PandasTools from tqdm import tqdm +from dataset import Dataset -def add_built_in_descriptors(df_combined: pd.DataFrame) -> pd.DataFrame: + +def add_built_in_descriptors(dataset: Dataset): """ Add RDKit built-in compound descriptors. + :param dataset: Dataset with compound-target pairs. + Will be updated to only include built-in RDKit compound descriptors. 
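A minimal, standalone version of the RDKit pattern used in this function, on a toy frame (requires rdkit to be installed):

    import pandas as pd
    from rdkit.Chem import Descriptors, PandasTools

    df = pd.DataFrame({"canonical_smiles": ["CCO", "c1ccccc1"]})
    PandasTools.AddMoleculeColumnToFrame(
        df, "canonical_smiles", "mol", includeFingerprints=False
    )
    df["fraction_csp3"] = df["mol"].apply(Descriptors.FractionCSP3)
    df["ring_count"] = df["mol"].apply(Descriptors.RingCount)
    df = df.drop(["mol"], axis=1)  # the molecule column is only a helper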
+ :type dataset: Dataset :param df_combined: Pandas DataFrame with compound-target pairs :type df_combined: pd.DataFrame - :return: Pandas DataFrame with added built-in RDKit compound descriptors - :rtype: pd.DataFrame """ # add a column with RDKit molecules, used to calculate the descriptors PandasTools.AddMoleculeColumnToFrame( - df_combined, "canonical_smiles", "mol", includeFingerprints=False + dataset.df_result, "canonical_smiles", "mol", includeFingerprints=False ) - df_combined.loc[:, "fraction_csp3"] = df_combined["mol"].apply( + dataset.df_result.loc[:, "fraction_csp3"] = dataset.df_result["mol"].apply( Descriptors.FractionCSP3 ) - df_combined.loc[:, "ring_count"] = df_combined["mol"].apply(Descriptors.RingCount) - df_combined.loc[:, "num_aliphatic_rings"] = df_combined["mol"].apply( - Descriptors.NumAliphaticRings - ) - df_combined.loc[:, "num_aliphatic_carbocycles"] = df_combined["mol"].apply( - Descriptors.NumAliphaticCarbocycles + dataset.df_result.loc[:, "ring_count"] = dataset.df_result["mol"].apply( + Descriptors.RingCount ) - df_combined.loc[:, "num_aliphatic_heterocycles"] = df_combined["mol"].apply( - Descriptors.NumAliphaticHeterocycles + dataset.df_result.loc[:, "num_aliphatic_rings"] = dataset.df_result["mol"].apply( + Descriptors.NumAliphaticRings ) - df_combined.loc[:, "num_aromatic_rings"] = df_combined["mol"].apply( + dataset.df_result.loc[:, "num_aliphatic_carbocycles"] = dataset.df_result[ + "mol" + ].apply(Descriptors.NumAliphaticCarbocycles) + dataset.df_result.loc[:, "num_aliphatic_heterocycles"] = dataset.df_result[ + "mol" + ].apply(Descriptors.NumAliphaticHeterocycles) + dataset.df_result.loc[:, "num_aromatic_rings"] = dataset.df_result["mol"].apply( Descriptors.NumAromaticRings ) - df_combined.loc[:, "num_aromatic_carbocycles"] = df_combined["mol"].apply( - Descriptors.NumAromaticCarbocycles - ) - df_combined.loc[:, "num_aromatic_heterocycles"] = df_combined["mol"].apply( - Descriptors.NumAromaticHeterocycles - ) - df_combined.loc[:, "num_saturated_rings"] = df_combined["mol"].apply( + dataset.df_result.loc[:, "num_aromatic_carbocycles"] = dataset.df_result[ + "mol" + ].apply(Descriptors.NumAromaticCarbocycles) + dataset.df_result.loc[:, "num_aromatic_heterocycles"] = dataset.df_result[ + "mol" + ].apply(Descriptors.NumAromaticHeterocycles) + dataset.df_result.loc[:, "num_saturated_rings"] = dataset.df_result["mol"].apply( Descriptors.NumSaturatedRings ) - df_combined.loc[:, "num_saturated_carbocycles"] = df_combined["mol"].apply( - Descriptors.NumSaturatedCarbocycles - ) - df_combined.loc[:, "num_saturated_heterocycles"] = df_combined["mol"].apply( - Descriptors.NumSaturatedHeterocycles - ) - df_combined.loc[:, "num_stereocentres"] = df_combined["mol"].apply( + dataset.df_result.loc[:, "num_saturated_carbocycles"] = dataset.df_result[ + "mol" + ].apply(Descriptors.NumSaturatedCarbocycles) + dataset.df_result.loc[:, "num_saturated_heterocycles"] = dataset.df_result[ + "mol" + ].apply(Descriptors.NumSaturatedHeterocycles) + dataset.df_result.loc[:, "num_stereocentres"] = dataset.df_result["mol"].apply( Chem.rdMolDescriptors.CalcNumAtomStereoCenters ) - df_combined.loc[:, "num_heteroatoms"] = df_combined["mol"].apply( + dataset.df_result.loc[:, "num_heteroatoms"] = dataset.df_result["mol"].apply( Descriptors.NumHeteroatoms ) # add scaffolds - PandasTools.AddMurckoToFrame(df_combined, "mol", "scaffold_w_stereo") + PandasTools.AddMurckoToFrame(dataset.df_result, "mol", "scaffold_w_stereo") # remove stereo information of the molecule to add scaffolds without 
stereo information - df_combined["mol"].apply(Chem.RemoveStereochemistry) - PandasTools.AddMurckoToFrame(df_combined, "mol", "scaffold_wo_stereo") + dataset.df_result["mol"].apply(Chem.RemoveStereochemistry) + PandasTools.AddMurckoToFrame(dataset.df_result, "mol", "scaffold_wo_stereo") # drop the column with RDKit molecules - df_combined = df_combined.drop(["mol"], axis=1) - - return df_combined + dataset.df_result = dataset.df_result.drop(["mol"], axis=1) def calculate_aromatic_atoms( @@ -121,7 +123,7 @@ def calculate_aromatic_atoms( return aromatic_atoms_dict, aromatic_c_dict, aromatic_n_dict, aromatic_hetero_dict -def add_aromaticity_descriptors(df_combined: pd.DataFrame) -> pd.DataFrame: +def add_aromaticity_descriptors(dataset: Dataset): """ Add number of aromatic atoms in a compounds, specifically: @@ -130,40 +132,39 @@ def add_aromaticity_descriptors(df_combined: pd.DataFrame) -> pd.DataFrame: - # aromatic nitrogen atoms (aromatic_n) - # aromatic hetero atoms (aromatic_hetero) - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame - :return: Pandas DataFrame with added counts of aromatic atoms - :rtype: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + Will be updated to only include counts of aromatic atoms + :type dataset: Dataset """ # use df_combined_w_smiles to exclude null values - smiles_set = set(df_combined["canonical_smiles"]) + smiles_set = set(dataset.df_result["canonical_smiles"]) aromatic_atoms_dict, aromatic_c_dict, aromatic_n_dict, aromatic_hetero_dict = ( calculate_aromatic_atoms(smiles_set) ) - df_combined["aromatic_atoms"] = df_combined["canonical_smiles"].map( + dataset.df_result["aromatic_atoms"] = dataset.df_result["canonical_smiles"].map( aromatic_atoms_dict ) - df_combined["aromatic_c"] = df_combined["canonical_smiles"].map(aromatic_c_dict) - df_combined["aromatic_n"] = df_combined["canonical_smiles"].map(aromatic_n_dict) - df_combined["aromatic_hetero"] = df_combined["canonical_smiles"].map( + dataset.df_result["aromatic_c"] = dataset.df_result["canonical_smiles"].map( + aromatic_c_dict + ) + dataset.df_result["aromatic_n"] = dataset.df_result["canonical_smiles"].map( + aromatic_n_dict + ) + dataset.df_result["aromatic_hetero"] = dataset.df_result["canonical_smiles"].map( aromatic_hetero_dict ) - return df_combined - -def add_rdkit_compound_descriptors(df_combined: pd.DataFrame) -> pd.DataFrame: +def add_rdkit_compound_descriptors(dataset: Dataset): """ Add RDKit-based compound descriptors (built-in and numbers of aromatic atoms). - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame - :return: Pandas DataFrame with added built-in RDKit compound descriptors - and numbers of aromatic atoms - :rtype: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + Will be updated to only include + built-in RDKit compound descriptors + and numbers of aromatic atoms. 
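calculate_aromatic_atoms itself is not shown in this hunk; the per-SMILES counts it produces can be derived with RDKit along these lines (a sketch, using pyridine as the example):

    from rdkit import Chem

    mol = Chem.MolFromSmiles("c1ccncc1")  # pyridine
    aromatic = [a for a in mol.GetAtoms() if a.GetIsAromatic()]
    aromatic_atoms = len(aromatic)                                 # 6
    aromatic_c = sum(a.GetSymbol() == "C" for a in aromatic)       # 5
    aromatic_n = sum(a.GetSymbol() == "N" for a in aromatic)       # 1
    aromatic_hetero = sum(a.GetSymbol() != "C" for a in aromatic)  # 1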
+ :type dataset: Dataset """ - df_combined = add_built_in_descriptors(df_combined) - df_combined = add_aromaticity_descriptors(df_combined) - - return df_combined + add_built_in_descriptors(dataset) + add_aromaticity_descriptors(dataset) diff --git a/src/clean_dataset.py b/src/clean_dataset.py index 01a2564..6efda90 100644 --- a/src/clean_dataset.py +++ b/src/clean_dataset.py @@ -3,10 +3,12 @@ import pandas as pd +from dataset import Dataset + def remove_compounds_without_smiles_and_mixtures( - df_combined: pd.DataFrame, chembl_con: sqlite3.Connection -) -> pd.DataFrame: + dataset: Dataset, chembl_con: sqlite3.Connection +): """ Remove @@ -16,12 +18,12 @@ def remove_compounds_without_smiles_and_mixtures( Since compound information is aggregated for the parents of salts, the number of smiles with a dot is relatively low. - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + Will be updated to only include + compound-target pairs with a smiles that does not contain a '.' + :type dataset: Dataset :param chembl_con: Sqlite3 connection to ChEMBL database. :type chembl_con: sqlite3.Connection - :return: Pandas DataFrame with compound-target pairs with a smiles that does not contain a '.' - :rtype: pd.DataFrame """ # Double-check that rows with a SMILES containing a '.' are the parent structures, # i.e., there was no error in using salt information instead of parent information. @@ -31,9 +33,9 @@ def remove_compounds_without_smiles_and_mixtures( """ df_hierarchy = pd.read_sql_query(sql, con=chembl_con) - smiles_with_dot = df_combined[ - df_combined["canonical_smiles"].notnull() - & df_combined["canonical_smiles"].str.contains(".", regex=False) + smiles_with_dot = dataset.df_result[ + dataset.df_result["canonical_smiles"].notnull() + & dataset.df_result["canonical_smiles"].str.contains(".", regex=False) ][["canonical_smiles", "parent_molregno"]].drop_duplicates() for parent_molregno in set(smiles_with_dot["parent_molregno"]): @@ -72,42 +74,46 @@ def remove_compounds_without_smiles_and_mixtures( the smiles for the compound in ChEMBL ({parent_smiles_in_chembl})." # Remove rows that contain a SMILES with a dot or that don't have a SMILES. - len_missing_smiles = len(df_combined[df_combined["canonical_smiles"].isnull()]) + len_missing_smiles = len( + dataset.df_result[dataset.df_result["canonical_smiles"].isnull()] + ) len_smiles_w_dot = len( - df_combined[ - df_combined["parent_molregno"].isin(set(smiles_with_dot["parent_molregno"])) + dataset.df_result[ + dataset.df_result["parent_molregno"].isin( + set(smiles_with_dot["parent_molregno"]) + ) ] ) logging.debug("#Compounds without a SMILES: %s", len_missing_smiles) logging.debug("#SMILES with a dot: %s", len_smiles_w_dot) - df_combined = df_combined[ - (df_combined["canonical_smiles"].notnull()) + dataset.df_result = dataset.df_result[ + (dataset.df_result["canonical_smiles"].notnull()) & ~( - df_combined["parent_molregno"].isin(set(smiles_with_dot["parent_molregno"])) + dataset.df_result["parent_molregno"].isin( + set(smiles_with_dot["parent_molregno"]) + ) ) ] - return df_combined + return dataset.df_result -def clean_none_values(df_combined): +def clean_none_values(dataset: Dataset): """ Change nan values and empty strings to None for consistency. 
""" # Change all None / nan values to None - df_combined = df_combined.where(pd.notnull(df_combined), None) + dataset.df_result = dataset.df_result.where(pd.notnull(dataset.df_result), None) # replace empty strings with None - df_combined = df_combined.replace("", None).reset_index(drop=True) - - return df_combined + dataset.df_result = dataset.df_result.replace("", None).reset_index(drop=True) -def set_types_to_int(df_combined, calculate_rdkit): +def set_types_to_int(dataset, calculate_rdkit): """ Set the type of relevant columns to Int64. """ - df_combined = df_combined.astype( + dataset.df_result = dataset.df_result.astype( { "first_approval": "Int64", "usan_year": "Int64", @@ -129,7 +135,7 @@ def set_types_to_int(df_combined, calculate_rdkit): ) if calculate_rdkit: - df_combined = df_combined.astype( + dataset.df_result = dataset.df_result.astype( { "num_aliphatic_carbocycles": "Int64", "num_aliphatic_heterocycles": "Int64", @@ -150,26 +156,26 @@ def set_types_to_int(df_combined, calculate_rdkit): } ) - return df_combined - -def round_floats(df_combined, decimal_places=4): +def round_floats(dataset, decimal_places=4): """ Round float columns to decimal places. This does not apply to max_phase. """ - for _, (col, dtype) in enumerate(df_combined.dtypes.to_dict().items()): + for _, (col, dtype) in enumerate(dataset.df_result.dtypes.to_dict().items()): if (dtype in ("float64", "Float64")) and col != "max_phase": - df_combined[col] = df_combined[col].round(decimals=decimal_places) + dataset.df_result[col] = dataset.df_result[col].round( + decimals=decimal_places + ) - return df_combined + return dataset.df_result -def reorder_columns(df_combined, calculate_rdkit): +def reorder_columns(dataset, calculate_rdkit): """ Reorder the columns in the DataFrame. """ - len_columns_before = len(df_combined.columns) + len_columns_before = len(dataset.df_result.columns) compound_target_pair_columns = [ "parent_molregno", @@ -283,7 +289,7 @@ def reorder_columns(df_combined, calculate_rdkit): + rdkit_columns + filtering_columns ) - df_combined = df_combined[columns] + dataset.df_result = dataset.df_result[columns] else: columns = ( compound_target_pair_columns @@ -296,18 +302,16 @@ def reorder_columns(df_combined, calculate_rdkit): + chembl_target_annotations + filtering_columns ) - df_combined = df_combined[columns] + dataset.df_result = dataset.df_result[columns] - len_columns_after = len(df_combined.columns) + len_columns_after = len(dataset.df_result.columns) assert ( len_columns_before == len_columns_after ), f"Different number of columns after reordering \ (before: {len_columns_before}, after: {len_columns_after})." - return df_combined - -def clean_dataset(df_combined: pd.DataFrame, calculate_rdkit: bool) -> pd.DataFrame: +def clean_dataset(dataset: Dataset, calculate_rdkit: bool) -> pd.DataFrame: """ Clean the dataset by @@ -317,18 +321,16 @@ def clean_dataset(df_combined: pd.DataFrame, calculate_rdkit: bool) -> pd.DataFr - reordering columns - sorting rows by cpd_target_pair_mutation - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + Will be updated to clean version with the updates described above. 
+ :type dataset: Dataset :param calculate_rdkit: True if the DataFrame contains RDKit-based compound properties :type calculate_rdkit: bool - :return: Cleaned pandas DataFrame with compound-target pairs - :rtype: pd.DataFrame """ - df_combined = clean_none_values(df_combined) - df_combined = set_types_to_int(df_combined, calculate_rdkit) - df_combined = round_floats(df_combined, decimal_places=4) - df_combined = reorder_columns(df_combined, calculate_rdkit) - df_combined = df_combined.sort_values(by=["cpd_target_pair_mutation"]).reset_index( - drop=True - ) - return df_combined + clean_none_values(dataset) + set_types_to_int(dataset, calculate_rdkit) + round_floats(dataset, decimal_places=4) + reorder_columns(dataset, calculate_rdkit) + dataset.df_result = dataset.df_result.sort_values( + by=["cpd_target_pair_mutation"] + ).reset_index(drop=True) diff --git a/src/dataset.py b/src/dataset.py new file mode 100644 index 0000000..8b3d29f --- /dev/null +++ b/src/dataset.py @@ -0,0 +1,29 @@ +from dataclasses import dataclass + +import pandas as pd + + +@dataclass +class Dataset: + """ + df_result: Pandas DataFrame with the full dataset + df_sizes_all: List of intermediate sized of the dataset used for debugging + df_sizes_pchembl: List of intermediate sized of the dataset used for debugging + drug_mechanism_pairs_set: Set of compound-target pairs in the drug_mechanism table + drug_mechanism_targets_set: Set of targets in the drug_mechanism table + df_cpd_props: Pandas DataFrame with compound properties and + structures for all compound ids in ChEMBL + atc_levels: Pandas DataFrame with ATC annotations in ChEMBL + target_classes_level1: Pandas DataFrame with mapping from target id to level 1 target class + target_classes_level2: Pandas DataFrame with mapping from target id to level 2 target class + """ + + df_result: pd.DataFrame + df_cpd_props: pd.DataFrame + atc_levels: pd.DataFrame + target_classes_level1: pd.DataFrame + target_classes_level2: pd.DataFrame + drug_mechanism_pairs_set: set + drug_mechanism_targets_set: set + df_sizes_all: list[int] + df_sizes_pchembl: list[int] diff --git a/src/get_activity_ct_pairs.py b/src/get_activity_ct_pairs.py index 4e440da..ba6811e 100644 --- a/src/get_activity_ct_pairs.py +++ b/src/get_activity_ct_pairs.py @@ -3,6 +3,8 @@ import numpy as np import pandas as pd +from dataset import Dataset + ########### Get Initial Compound-Target Data From ChEMBL ########### def get_compound_target_pairs_with_pchembl( @@ -164,7 +166,7 @@ def get_average_info(df: pd.DataFrame, suffix: str) -> pd.DataFrame: def get_aggregated_activity_ct_pairs( chembl_con: sqlite3.Connection, limit_to_literature: bool, -) -> pd.DataFrame: +) -> Dataset: """ Get dataset of compound target-pairs with an associated pchembl value with pchembl and publication dates aggregated into one entry per pair. @@ -184,9 +186,9 @@ def get_aggregated_activity_ct_pairs( :param limit_to_literature: Include only literature sources if True. Include all available sources otherwise. :type limit_to_literature: bool - :return: Pandas Dataframe with compound-target pairs based on ChEMBL activity data - aggregated into one entry per compound-target pair. - :rtype: pd.DataFrame + :return: Dataset with a pandas Dataframe with compound-target pairs + based on ChEMBL activity data aggregated into one entry per compound-target pair. 
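The hunk below fills the new Dataset positionally with empty placeholders; one possible alternative, not part of this patch, is to give the dataclass defaults so that only df_result has to be passed:

    from dataclasses import dataclass, field

    import pandas as pd

    @dataclass
    class Dataset:
        df_result: pd.DataFrame
        df_cpd_props: pd.DataFrame = field(default_factory=pd.DataFrame)
        drug_mechanism_pairs_set: set = field(default_factory=set)
        df_sizes_all: list = field(default_factory=list)
        # ... remaining fields with analogous defaults

    dataset = Dataset(df_result=pd.DataFrame())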
+ :rtype: Dataset """ df_mols = get_compound_target_pairs_with_pchembl( chembl_con, @@ -220,4 +222,15 @@ def get_aggregated_activity_ct_pairs( how="left", ) - return df_combined + dataset = Dataset( + df_combined, + pd.DataFrame(), + pd.DataFrame(), + pd.DataFrame(), + pd.DataFrame(), + set(), + set(), + [], + [], + ) + return dataset diff --git a/src/get_dataset.py b/src/get_dataset.py index 7fd86c3..c9966d5 100644 --- a/src/get_dataset.py +++ b/src/get_dataset.py @@ -28,98 +28,59 @@ def get_ct_pair_dataset( :param out: Arguments related to how to output the dataset :type out: OutputArgs """ - # list with sizes of full dataset and dataset subset with pchembl values for debugging - df_sizes = [[], []] - logging.info("get_aggregated_activity_ct_pairs") - df_combined = get_activity_ct_pairs.get_aggregated_activity_ct_pairs( + dataset = get_activity_ct_pairs.get_aggregated_activity_ct_pairs( chembl_con, args.limit_to_literature ) - if logging.DEBUG >= logging.root.level: - get_stats.add_dataset_sizes(df_combined, "activity ct-pairs", df_sizes) + get_stats.add_debugging_info(dataset, dataset.df_result, "activity ct-pairs") logging.info("add_cti_from_drug_mechanisms") - df_combined, drug_mechanism_pairs_set, drug_mechanism_targets_set = ( - get_drug_mechanism_ct_pairs.add_drug_mechanism_ct_pairs(df_combined, chembl_con) - ) - if logging.DEBUG >= logging.root.level: - get_stats.add_dataset_sizes(df_combined, "dm ct-pairs", df_sizes) + get_drug_mechanism_ct_pairs.add_drug_mechanism_ct_pairs(dataset, chembl_con) + get_stats.add_debugging_info(dataset, dataset.df_result, "dm ct-pairs") logging.info("add_cti_annotations") - df_combined = add_dti_annotations.add_dti_annotations( - df_combined, drug_mechanism_pairs_set, drug_mechanism_targets_set - ) - if logging.DEBUG >= logging.root.level: - get_stats.add_dataset_sizes(df_combined, "DTI annotations", df_sizes) + add_dti_annotations.add_dti_annotations(dataset) + get_stats.add_debugging_info(dataset, dataset.df_result, "DTI annotations") logging.info("add_all_chembl_compound_properties") - df_combined, df_cpd_props, atc_levels = ( - add_chembl_compound_properties.add_all_chembl_compound_properties( - df_combined, chembl_con, args.limit_to_literature - ) + add_chembl_compound_properties.add_all_chembl_compound_properties( + dataset, chembl_con, args.limit_to_literature ) - if logging.DEBUG >= logging.root.level: - get_stats.add_dataset_sizes(df_combined, "ChEMBL props", df_sizes) + get_stats.add_debugging_info(dataset, dataset.df_result, "ChEMBL props") logging.info("remove_compounds_without_smiles_and_mixtures") - df_combined = clean_dataset.remove_compounds_without_smiles_and_mixtures( - df_combined, chembl_con - ) - if logging.DEBUG >= logging.root.level: - get_stats.add_dataset_sizes(df_combined, "removed smiles", df_sizes) + clean_dataset.remove_compounds_without_smiles_and_mixtures(dataset, chembl_con) + get_stats.add_debugging_info(dataset, dataset.df_result, "removed smiles") logging.info("add_chembl_target_class_annotations") - df_combined, target_classes_level1, target_classes_level2 = ( - add_chembl_target_class_annotations.add_chembl_target_class_annotations( - df_combined, - chembl_con, - args, - out, - ) + add_chembl_target_class_annotations.add_chembl_target_class_annotations( + dataset, + chembl_con, + args, + out, ) - if logging.DEBUG >= logging.root.level: - get_stats.add_dataset_sizes(df_combined, "tclass annotations", df_sizes) + get_stats.add_debugging_info(dataset, dataset.df_result, "tclass annotations") - 
logging.info("add_rdkit_compound_descriptors") if args.calculate_rdkit: - df_combined = add_rdkit_compound_descriptors.add_rdkit_compound_descriptors( - df_combined - ) - if logging.DEBUG >= logging.root.level: - get_stats.add_dataset_sizes(df_combined, "RDKit props", df_sizes) + logging.info("add_rdkit_compound_descriptors") + add_rdkit_compound_descriptors.add_rdkit_compound_descriptors(dataset) + get_stats.add_debugging_info(dataset, dataset.df_result, "RDKit props") logging.info("clean_dataset") - df_combined = clean_dataset.clean_dataset(df_combined, args.calculate_rdkit) - if logging.DEBUG >= logging.root.level: - get_stats.add_dataset_sizes(df_combined, "clean df", df_sizes) + clean_dataset.clean_dataset(dataset, args.calculate_rdkit) + get_stats.add_debugging_info(dataset, dataset.df_result, "clean df") logging.info("sanity_checks") - sanity_checks.sanity_checks( - df_combined, - df_cpd_props, - atc_levels, - target_classes_level1, - target_classes_level2, - args.calculate_rdkit, - ) + sanity_checks.sanity_checks(dataset, args.calculate_rdkit) logging.info("add_filtering_columns") - add_filtering_columns.add_filtering_columns( - df_combined, - df_sizes, - args, - out, - ) + add_filtering_columns.add_filtering_columns(dataset, args, out) logging.info("write_full_dataset_to_file") - write_subsets.write_full_dataset_to_file( - df_combined, - args, - out, - ) + write_subsets.write_full_dataset_to_file(dataset, args, out) logging.info("output_stats") - write_subsets.output_all_stats(df_combined, args, out) + write_subsets.output_all_stats(dataset, args, out) if logging.DEBUG >= logging.root.level: - write_subsets.output_debug_sizes(df_sizes, out) + write_subsets.output_debug_sizes(dataset, out) diff --git a/src/get_drug_mechanism_ct_pairs.py b/src/get_drug_mechanism_ct_pairs.py index be67c59..ebabf45 100644 --- a/src/get_drug_mechanism_ct_pairs.py +++ b/src/get_drug_mechanism_ct_pairs.py @@ -3,6 +3,8 @@ import pandas as pd +from dataset import Dataset + ########### Extract Drug-Target Interactions From the drug_mechanism Table ########### def get_drug_mechanisms_interactions(chembl_con: sqlite3.Connection) -> pd.DataFrame: @@ -151,7 +153,7 @@ def add_annotations_to_drug_mechanisms_cti( :return: Updated pandas DataFrame with the additional annotations. :rtype: pd.DataFrame """ - ##### Set columns existing in the df_combined table. ##### + ##### Set columns existing in the df_results table. ##### # None of the targets from the drug mechanism table have any mutation annotation, # hence tid_mutation = tid cpd_target_pairs["tid_mutation"] = cpd_target_pairs["tid"].astype("str") @@ -239,47 +241,46 @@ def get_drug_mechanism_ct_pairs(chembl_con: sqlite3.Connection) -> pd.DataFrame: ########### Add Compounds From the drug_mechanism Table to the Dataset ########### -def add_drug_mechanism_ct_pairs( - df_combined: pd.DataFrame, chembl_con: sqlite3.Connection -) -> tuple[pd.DataFrame, set, set]: +def add_drug_mechanism_ct_pairs(dataset: Dataset, chembl_con: sqlite3.Connection): """ - Add compound-target pairs from the drug_mechanism table + Add compound-target pairs from the drug_mechanism table that are not in the dataset based on the initial ChEMBL query. These are compound-target pairs for which there is no associated pchembl value data. - Since the pairs are known interactions, + Since the pairs are known interactions, they are added to the dataset despite not having a pchembl value. 
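The bookkeeping below encodes each known pair as a "molregno_tid" string and tests membership with isin; a toy sketch of that encoding:

    import pandas as pd

    pairs = pd.DataFrame({"parent_molregno": [10, 11], "tid": [7, 8]})
    drug_mechanism_pairs_set = {
        f"{a}_{b}" for a, b in zip(pairs["parent_molregno"], pairs["tid"])
    }  # {"10_7", "11_8"}

    df = pd.DataFrame({"cpd_target_pair": ["10_7", "12_9"]})
    df["pair_in_dm_table"] = df["cpd_target_pair"].isin(drug_mechanism_pairs_set)
    # -> [True, False]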
+    Add the set of compound-target pairs in the drug_mechanism table and
+    the set of targets in the drug_mechanism table to the dataset.
 
-    :param df_combined: Pandas Dataframe with compound-target pairs based on ChEMBL activity data
-    :type df_combined: pd.DataFrame
+    :param dataset: Dataset with compound-target pairs based on ChEMBL activity data
+    :type dataset: Dataset
     :param chembl_con: Sqlite3 connection to ChEMBL database.
     :type chembl_con: sqlite3.Connection
-    :return: - Pandas DataFrame with compound-target pairs
-        based on activities AND drug_mechanism table \\
-        - set of compound-target pairs in the drug_mechanism table \\
-        - set of targets in the drug_mechanism table
-    :rtype: (pd.DataFrame, set, set)
     """
     cpd_target_pairs = get_drug_mechanism_ct_pairs(chembl_con)
 
-    drug_mechanism_pairs_set = set(
+    dataset.drug_mechanism_pairs_set = set(
         f"{a}_{b}"
         for a, b in zip(cpd_target_pairs["parent_molregno"], cpd_target_pairs["tid"])
     )
-    drug_mechanism_targets_set = set(cpd_target_pairs["tid"])
+    dataset.drug_mechanism_targets_set = set(cpd_target_pairs["tid"])
 
     # Add a new column *pair_mutation_in_dm_table* which is set to True if the compound target pair
     # (taking mutation annotations into account) is in the drug_mechanism table.
-    df_combined["pair_mutation_in_dm_table"] = False
-    df_combined.loc[
-        (df_combined["cpd_target_pair_mutation"].isin(drug_mechanism_pairs_set)),
+    dataset.df_result["pair_mutation_in_dm_table"] = False
+    dataset.df_result.loc[
+        (
+            dataset.df_result["cpd_target_pair_mutation"].isin(
+                dataset.drug_mechanism_pairs_set
+            )
+        ),
         "pair_mutation_in_dm_table",
     ] = True
 
     # Add a new column *pair_in_dm_table* which is set to True if the compound target pair
     # (NOT taking mutation annotations into account) is in the drug_mechanism table.
-    df_combined["pair_in_dm_table"] = False
-    df_combined.loc[
-        (df_combined["cpd_target_pair"].isin(drug_mechanism_pairs_set)),
+    dataset.df_result["pair_in_dm_table"] = False
+    dataset.df_result.loc[
+        (dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set)),
         "pair_in_dm_table",
     ] = True
 
@@ -291,7 +292,7 @@ def add_drug_mechanism_ct_pairs(
     cpd_target_pairs = cpd_target_pairs[
         ~(
             cpd_target_pairs["cpd_target_pair_mutation"].isin(
-                set(df_combined["cpd_target_pair_mutation"])
+                set(dataset.df_result["cpd_target_pair_mutation"])
             )
         )
     ].copy()
@@ -302,20 +303,18 @@ def add_drug_mechanism_ct_pairs(
     )
 
     # Combined data of existing query with new compound-target pairs.
-    df_combined = pd.concat([df_combined, cpd_target_pairs])
+    dataset.df_result = pd.concat([dataset.df_result, cpd_target_pairs])
 
     # Add a new column *keep_for_binding* which is set to True if the row should be kept
     # if you want to limit the dataset to only data based on binding assays.
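
The two flag columns above hinge on string pair keys of the form parent_molregno_tid collected into a set. The same masking idiom in miniature (toy values):

    import pandas as pd

    df = pd.DataFrame({"parent_molregno": [101, 102], "tid": [7, 9]})
    df["cpd_target_pair"] = (
        df["parent_molregno"].astype(str) + "_" + df["tid"].astype(str)
    )

    dm_pairs = {f"{a}_{b}" for a, b in zip([101], [7])}  # toy drug_mechanism pairs

    df["pair_in_dm_table"] = False
    df.loc[df["cpd_target_pair"].isin(dm_pairs), "pair_in_dm_table"] = True
    # -> True for 101_7, False for 102_9
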
# Rows are kept if # - there is a binding data-based pchembl value or # - the compound-target pair (including mutation info) is in the drug_mechanism table - df_combined["keep_for_binding"] = False - df_combined.loc[ + dataset.df_result["keep_for_binding"] = False + dataset.df_result.loc[ ( - (df_combined["pchembl_value_mean_B"].notnull()) - | (df_combined["pair_mutation_in_dm_table"] == True) + (dataset.df_result["pchembl_value_mean_B"].notnull()) + | (dataset.df_result["pair_mutation_in_dm_table"] == True) ), "keep_for_binding", ] = True - - return df_combined, drug_mechanism_pairs_set, drug_mechanism_targets_set diff --git a/src/get_stats.py b/src/get_stats.py index e27a4a5..0b96abc 100644 --- a/src/get_stats.py +++ b/src/get_stats.py @@ -1,5 +1,8 @@ +import logging import pandas as pd +from dataset import Dataset + ##### Debugging Stats ##### def calculate_dataset_sizes(df: pd.DataFrame) -> list[int]: @@ -45,20 +48,22 @@ def calculate_dataset_sizes(df: pd.DataFrame) -> list[int]: def add_dataset_sizes( - df: pd.DataFrame, label: str, df_sizes: list[list[int], list[int]] + dataset: Dataset, + df: pd.DataFrame, + label: str, ): """ - Count and add representative counts of df to the list df_sizes used for debugging. + Count and add representative counts of df used for debugging to the dataset. + :param dataset: Dataset with compound-target pairs and debugging sizes. + :type dataset: Dataset :param df: Pandas DataFrame with current compound-target pairs :type df: pd.DataFrame :param label: Description of pipeline step (e.g., initial query). :type label: str - :param df_sizes: List of intermediate sized of the dataset used for debugging. - :type df_sizes: list[list[int], list[int]] """ df_copy = df.copy() - df_sizes[0].append([label] + calculate_dataset_sizes(df_copy)) + dataset.df_sizes_all.append([label] + calculate_dataset_sizes(df_copy)) # restrict to data with any pchembl value (any data with a pchembl, # even if it is based on only functional data) @@ -68,7 +73,20 @@ def add_dataset_sizes( df_pchembl = df_copy.dropna( subset=[x for x in df_copy.columns if x.startswith("pchembl_value")], how="all" ) - df_sizes[1].append([label] + calculate_dataset_sizes(df_pchembl)) + dataset.df_sizes_pchembl.append([label] + calculate_dataset_sizes(df_pchembl)) + + +def add_debugging_info( + dataset: Dataset, + df: pd.DataFrame, + label: str, +): + """ + Wrapper for add_dataset_sizes. + Handles logging level. + """ + if logging.DEBUG >= logging.root.level: + add_dataset_sizes(dataset, df, label) ##### Logging Stats ##### diff --git a/src/sanity_checks.py b/src/sanity_checks.py index b34517e..8b74546 100644 --- a/src/sanity_checks.py +++ b/src/sanity_checks.py @@ -1,13 +1,15 @@ import pandas as pd +from dataset import Dataset + ########### Sanity checks for the dataset ########### -def check_null_values(df_combined: pd.DataFrame): +def check_null_values(df_result: pd.DataFrame): """ Check if any columns contain nan or null which aren't recognised as null values. 
""" - for col in df_combined.columns: - col_as_str = set(df_combined[df_combined[col].notnull()][col].astype(str)) + for col in df_result.columns: + col_as_str = set(df_result[df_result[col].notnull()][col].astype(str)) assert ( "nan" not in col_as_str ), f"Problem with unrecognised nan value in column {col}" @@ -16,14 +18,14 @@ def check_null_values(df_combined: pd.DataFrame): ), f"Problem with unrecognised null value in column {col}" -def check_for_mixed_types(df_combined: pd.DataFrame): +def check_for_mixed_types(df_result: pd.DataFrame): """ Check that there are no mixed types in columns with dtype=object. """ - for col, dtype in df_combined.dtypes.to_dict().items(): + for col, dtype in df_result.dtypes.to_dict().items(): if dtype == object: - col_original = set(df_combined[df_combined[col].notnull()][col]) - col_as_str = set(df_combined[df_combined[col].notnull()][col].astype(str)) + col_original = set(df_result[df_result[col].notnull()][col]) + col_as_str = set(df_result[df_result[col].notnull()][col].astype(str)) # is there a difference in the two sets (ignoring null values) assert ( len(col_original - col_as_str) == 0 @@ -33,7 +35,7 @@ def check_for_mixed_types(df_combined: pd.DataFrame): ), f"Mixed types in colum {col}: {col_as_str-col_original}" -def check_pairs_without_pchembl_are_in_drug_mechanisms(df_combined: pd.DataFrame): +def check_pairs_without_pchembl_are_in_drug_mechanisms(df_result: pd.DataFrame): """ Check that rows without a pchembl value based on binding+functional assays (pchembl_x_BF) are in the drug_mechanism table. @@ -47,15 +49,15 @@ def check_pairs_without_pchembl_are_in_drug_mechanisms(df_combined: pd.DataFrame "pchembl_value_max_BF", "pchembl_value_median_BF", ]: - assert df_combined[(df_combined[pchembl_col].isnull())].equals( - df_combined[ - (df_combined["pair_mutation_in_dm_table"] == True) - & (df_combined[pchembl_col].isnull()) + assert df_result[(df_result[pchembl_col].isnull())].equals( + df_result[ + (df_result["pair_mutation_in_dm_table"] == True) + & (df_result[pchembl_col].isnull()) ] ), f"Missing pchembl value in column {pchembl_col}" -def check_ligand_efficiency_metrics(df_combined: pd.DataFrame): +def check_ligand_efficiency_metrics(df_result: pd.DataFrame): """ Check that ligand efficiency metrics are only null when at least one of the values used to calculate them is null. @@ -63,39 +65,39 @@ def check_ligand_efficiency_metrics(df_combined: pd.DataFrame): one of the values used to calculate them is null. 
""" for suffix in ["BF", "B"]: - assert df_combined[(df_combined[f"LE_{suffix}"].isnull())].equals( - df_combined[ - (df_combined[f"pchembl_value_mean_{suffix}"].isnull()) - | (df_combined["heavy_atoms"].isnull()) - | (df_combined["heavy_atoms"] == 0) + assert df_result[(df_result[f"LE_{suffix}"].isnull())].equals( + df_result[ + (df_result[f"pchembl_value_mean_{suffix}"].isnull()) + | (df_result["heavy_atoms"].isnull()) + | (df_result["heavy_atoms"] == 0) ] ), f"Missing LE value in LE_{suffix}" - assert df_combined[(df_combined[f"BEI_{suffix}"].isnull())].equals( - df_combined[ - (df_combined[f"pchembl_value_mean_{suffix}"].isnull()) - | (df_combined["mw_freebase"].isnull()) - | (df_combined["mw_freebase"] == 0) + assert df_result[(df_result[f"BEI_{suffix}"].isnull())].equals( + df_result[ + (df_result[f"pchembl_value_mean_{suffix}"].isnull()) + | (df_result["mw_freebase"].isnull()) + | (df_result["mw_freebase"] == 0) ] ), f"Missing BEI value in BEI_{suffix}" - assert df_combined[(df_combined[f"SEI_{suffix}"].isnull())].equals( - df_combined[ - (df_combined[f"pchembl_value_mean_{suffix}"].isnull()) - | (df_combined["psa"].isnull()) - | (df_combined["psa"] == 0) + assert df_result[(df_result[f"SEI_{suffix}"].isnull())].equals( + df_result[ + (df_result[f"pchembl_value_mean_{suffix}"].isnull()) + | (df_result["psa"].isnull()) + | (df_result["psa"] == 0) ] ), f"Missing SEI value in SEI_{suffix}" - assert df_combined[(df_combined[f"LLE_{suffix}"].isnull())].equals( - df_combined[ - (df_combined[f"pchembl_value_mean_{suffix}"].isnull()) - | (df_combined["alogp"].isnull()) + assert df_result[(df_result[f"LLE_{suffix}"].isnull())].equals( + df_result[ + (df_result[f"pchembl_value_mean_{suffix}"].isnull()) + | (df_result["alogp"].isnull()) ] ), f"Missing LLE value in LLE_{suffix}" -def check_compound_props(df_combined: pd.DataFrame, df_cpd_props: pd.DataFrame): +def check_compound_props(dataset: Dataset): """ Check that compound props are only null if @@ -104,56 +106,65 @@ def check_compound_props(df_combined: pd.DataFrame, df_cpd_props: pd.DataFrame): """ # missing values because the parent_molregno is not in the compound props table no_cpd_prop_info = len( - df_combined[ - ~df_combined["parent_molregno"].isin(set(df_cpd_props["parent_molregno"])) + dataset.df_result[ + ~dataset.df_result["parent_molregno"].isin( + set(dataset.df_cpd_props["parent_molregno"]) + ) ] ) - for col in df_cpd_props.columns: + for col in dataset.df_cpd_props.columns: if col != "parent_molregno": # missing values because the compound props query returns null (exists but is null) missing_values = len( - df_combined[ - df_combined["parent_molregno"].isin( - set(df_cpd_props[df_cpd_props[col].isnull()]["parent_molregno"]) + dataset.df_result[ + dataset.df_result["parent_molregno"].isin( + set( + dataset.df_cpd_props[dataset.df_cpd_props[col].isnull()][ + "parent_molregno" + ] + ) ) ] ) null_values = no_cpd_prop_info + missing_values assert null_values == len( - df_combined[df_combined[col].isnull()] + dataset.df_result[dataset.df_result[col].isnull()] ), f"Too many null values in {col}" def check_atc_and_target_classes( - df_combined: pd.DataFrame, - atc_levels: pd.DataFrame, - target_classes_level1: pd.DataFrame, - target_classes_level2: pd.DataFrame, + dataset: Dataset, ): """ Check that atc_level1 and target class information is only null if the parent_molregno / target id is not in the respective table. 
""" - assert df_combined[(df_combined["atc_level1"].isnull())].equals( - df_combined[ - ~df_combined["parent_molregno"].isin(set(atc_levels["parent_molregno"])) + assert dataset.df_result[(dataset.df_result["atc_level1"].isnull())].equals( + dataset.df_result[ + ~dataset.df_result["parent_molregno"].isin( + set(dataset.atc_levels["parent_molregno"]) + ) ] ), "Null values in atc_level1 are not exclusively \ because the parent_molregno is not in the atc_classification table." - assert df_combined[(df_combined["target_class_l1"].isnull())].equals( - df_combined[~df_combined["tid"].isin(set(target_classes_level1["tid"]))] + assert dataset.df_result[(dataset.df_result["target_class_l1"].isnull())].equals( + dataset.df_result[ + ~dataset.df_result["tid"].isin(set(dataset.target_classes_level1["tid"])) + ] ), "Null values in target_class_l1 are not exclusively \ because the tid is not in the protein_classification table." - assert df_combined[(df_combined["target_class_l2"].isnull())].equals( - df_combined[~df_combined["tid"].isin(set(target_classes_level2["tid"]))] + assert dataset.df_result[(dataset.df_result["target_class_l2"].isnull())].equals( + dataset.df_result[ + ~dataset.df_result["tid"].isin(set(dataset.target_classes_level2["tid"])) + ] ), "Null values in target_class_l2 are not exclusively \ because the tid is not in the protein_classification table." -def check_rdkit_props(df_combined: pd.DataFrame): +def check_rdkit_props(df_result: pd.DataFrame): """ Check that columns set by the RDKit are only null if there is no canonical SMILES for the molecule. @@ -179,17 +190,13 @@ def check_rdkit_props(df_combined: pd.DataFrame): "aromatic_n", "aromatic_hetero", ]: - assert len(df_combined[df_combined[col].isnull()]) == len( - df_combined[df_combined["canonical_smiles"].isnull()].copy() + assert len(df_result[df_result[col].isnull()]) == len( + df_result[df_result["canonical_smiles"].isnull()].copy() ), f"Missing value in {col} despite a smiles being available." def sanity_checks( - df_combined: pd.DataFrame, - df_cpd_props: pd.DataFrame, - atc_levels: pd.DataFrame, - target_classes_level1: pd.DataFrame, - target_classes_level2: pd.DataFrame, + dataset: Dataset, calculate_rdkit: bool, ): """ @@ -208,32 +215,19 @@ def sanity_checks( - columns set by the RDKit are only null if there is no canonical SMILES for the molecule (excluding scaffolds) - :param df_combined: Pandas DataFrame with compound-target pairs - :type df_combined: pd.DataFrame - :param df_cpd_props: Pandas DataFrame with compound properties - and structures for all compound ids in ChEMBL. - :type df_cpd_props: pd.DataFrame - :param atc_levels: Pandas DataFrame with ATC annotations in ChEMBL - :type atc_levels: pd.DataFrame - :param target_classes_level1: Pandas DataFrame with mapping - from target id to level 1 target class - :type target_classes_level1: pd.DataFrame - :param target_classes_level2: Pandas DataFrame with mapping - from target id to level 2 target class - :type target_classes_level2: pd.DataFrame + :param dataset: Dataset with compound-target pairs. 
+ :type dataset: Dataset :param calculate_rdkit: True if the DataFrame contains RDKit-based compound properties :type calculate_rdkit: bool """ - check_null_values(df_combined) - check_for_mixed_types(df_combined) - check_pairs_without_pchembl_are_in_drug_mechanisms(df_combined) - check_ligand_efficiency_metrics(df_combined) - check_compound_props(df_combined, df_cpd_props) - check_atc_and_target_classes( - df_combined, atc_levels, target_classes_level1, target_classes_level2 - ) + check_null_values(dataset.df_result) + check_for_mixed_types(dataset.df_result) + check_pairs_without_pchembl_are_in_drug_mechanisms(dataset.df_result) + check_ligand_efficiency_metrics(dataset.df_result) + check_compound_props(dataset) + check_atc_and_target_classes(dataset) if calculate_rdkit: - check_rdkit_props(df_combined) + check_rdkit_props(dataset.df_result) ########### Sanity checks for writing and reading a dataset ########### diff --git a/src/write_subsets.py b/src/write_subsets.py index 5b0bc21..a5635a1 100644 --- a/src/write_subsets.py +++ b/src/write_subsets.py @@ -5,6 +5,7 @@ import get_stats from arguments import OutputArgs, CalculationArgs +from dataset import Dataset def write_output( @@ -73,15 +74,15 @@ def write_and_check_output( def write_full_dataset_to_file( - df_combined: pd.DataFrame, + dataset: Dataset, args: CalculationArgs, out: OutputArgs, ): """ If write_full_dataset, write df_combined with filtering columns to output_path. - :param df_combined: Pandas DataFrame with compound-target pairs and filtering columns - :type df_combined: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + :type dataset: Dataset :param args: Arguments related to how to calculate the dataset :type args: CalculationArgs :param out: Arguments related to how to output the dataset @@ -93,18 +94,18 @@ def write_full_dataset_to_file( out.output_path, f"ChEMBL{args.chembl_version}_CTI_{args.limited_flag}_full_dataset", ) - write_and_check_output(df_combined, name_all, desc, args, out) + write_and_check_output(dataset.df_result, name_all, desc, args, out) def output_debug_sizes( - df_sizes: list[list[int], list[int]], + dataset: Dataset, out: OutputArgs, ): """ Output counts at various points during calculating the final dataset for debugging. - :param df_sizes: List of intermediate sized of the dataset used for debugging. - :type df_sizes: list[list[int], list[int]] + :param dataset: Dataset with compound-target pairs and debugging sizes. + :type dataset: Dataset :param args: Arguments related to how to calculate the dataset :type args: CalculationArgs :param out: Arguments related to how to output the dataset @@ -125,7 +126,7 @@ def output_debug_sizes( ] logging.debug("Size of full dataset at different points.") - full_df_sizes = pd.DataFrame(df_sizes[0], columns=column_names) + full_df_sizes = pd.DataFrame(dataset.df_sizes_all, columns=column_names) logging.debug(full_df_sizes) name_full_df_sizes = os.path.join(out.output_path, "debug_full_df_sizes") write_output( @@ -139,7 +140,7 @@ def output_debug_sizes( "This includes data for which we only have pchembl data \ for functional assays but not for binding assays." 
) - df_pchembl_sizes = pd.DataFrame(df_sizes[1], columns=column_names) + df_pchembl_sizes = pd.DataFrame(dataset.df_sizes_pchembl, columns=column_names) logging.debug(df_pchembl_sizes) name_pchembl_df_sizes = os.path.join(out.output_path, "debug_pchembl_df_sizes") write_output( @@ -208,14 +209,12 @@ def output_stats( ) -def output_all_stats( - df_combined_annotated: pd.DataFrame, args: CalculationArgs, out: OutputArgs -): +def output_all_stats(dataset: Dataset, args: CalculationArgs, out: OutputArgs): """ Output stats for all datasets and subsets calculated. - :param df_combined_annotated: Pandas DataFrame with additional filtering columns - :type df_combined_annotated: pd.DataFrame + :param dataset: Dataset with compound-target pairs. + :type dataset: Dataset :param args: Arguments related to how to calculate the dataset :type args: CalculationArgs :param out: Arguments related to how to output the dataset @@ -226,7 +225,7 @@ def output_all_stats( f"ChEMBL{args.chembl_version}_CTI_{args.limited_flag}_full_dataset_stats", ) - output_stats(df_combined_annotated, output_file, out) + output_stats(dataset.df_result, output_file, out) if out.write_bf: output_file = os.path.join( @@ -236,7 +235,7 @@ def output_all_stats( f"BF_{args.min_nof_cpds_bf}_c_dt_d_dt_stats", ) output_stats( - df_combined_annotated[df_combined_annotated["BF_100_c_dt_d_dt"]], + dataset.df_result[dataset.df_result["BF_100_c_dt_d_dt"]], output_file, out, ) @@ -249,7 +248,7 @@ def output_all_stats( f"B_{args.min_nof_cpds_b}_c_dt_d_dt_stats", ) output_stats( - df_combined_annotated[df_combined_annotated["B_100_c_dt_d_dt"]], + dataset.df_result[dataset.df_result["B_100_c_dt_d_dt"]], output_file, out, ) From a4721c664830bede376b8f278cba2e05de4c4aae Mon Sep 17 00:00:00 2001 From: Lina Heinzke Date: Tue, 20 Feb 2024 20:38:51 +0000 Subject: [PATCH 5/8] Fix handling truth values of filtering columns in pandas --- src/add_dti_annotations.py | 4 ++-- src/add_filtering_columns.py | 10 ++++------ src/get_drug_mechanism_ct_pairs.py | 2 +- src/sanity_checks.py | 2 +- 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/src/add_dti_annotations.py b/src/add_dti_annotations.py index 533aa2d..6596535 100644 --- a/src/add_dti_annotations.py +++ b/src/add_dti_annotations.py @@ -118,7 +118,7 @@ def add_dti_annotations( dataset.drug_mechanism_pairs_set ) ) - & (dataset.df_result["therapeutic_target"] == True) + & (dataset.df_result["therapeutic_target"]) ), "DTI", ] = "DT" @@ -133,7 +133,7 @@ def add_dti_annotations( dataset.drug_mechanism_pairs_set ) ) - & (dataset.df_result["therapeutic_target"] == False) + & ~(dataset.df_result["therapeutic_target"]) ), "DTI", ] = "NDT" diff --git a/src/add_filtering_columns.py b/src/add_filtering_columns.py index c26a721..88ce052 100644 --- a/src/add_filtering_columns.py +++ b/src/add_filtering_columns.py @@ -159,9 +159,9 @@ def add_subset_filtering_columns( dataset.df_result[col_name] = False dataset.df_result.loc[(dataset.df_result.index.isin(df.index)), col_name] = True # check that filtering works - assert dataset.df_result[dataset.df_result[col_name] == True][ - df.columns - ].equals(df), f"Filtering is not accurate for {col_name}." + assert dataset.df_result[dataset.df_result[col_name]][df.columns].equals( + df + ), f"Filtering is not accurate for {col_name}." 
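
The assert closing this hunk encodes a useful invariant: a boolean filtering column must reproduce exactly the subset it was derived from. The same round trip in miniature (toy frame):

    import pandas as pd

    df = pd.DataFrame({"pchembl": [7.0, None, 6.5]}, index=[10, 11, 12])
    subset = df[df["pchembl"].notnull()]

    df["in_subset"] = False
    df.loc[df.index.isin(subset.index), "in_subset"] = True

    # the boolean column must recover the subset exactly
    assert df[df["in_subset"]][subset.columns].equals(subset)
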
if logging.DEBUG >= logging.root.level: for [df_subset, subset_desc] in subsets: @@ -200,9 +200,7 @@ def add_filtering_columns( # consider only binding assays # assay description = binding desc = "B" - df_combined_subset = dataset.df_result[ - dataset.df_result["keep_for_binding"] == True - ].copy() + df_combined_subset = dataset.df_result[dataset.df_result["keep_for_binding"]].copy() add_subset_filtering_columns( df_combined_subset, dataset, diff --git a/src/get_drug_mechanism_ct_pairs.py b/src/get_drug_mechanism_ct_pairs.py index ebabf45..5f1fa7c 100644 --- a/src/get_drug_mechanism_ct_pairs.py +++ b/src/get_drug_mechanism_ct_pairs.py @@ -314,7 +314,7 @@ def add_drug_mechanism_ct_pairs(dataset: Dataset, chembl_con: sqlite3.Connection dataset.df_result.loc[ ( (dataset.df_result["pchembl_value_mean_B"].notnull()) - | (dataset.df_result["pair_mutation_in_dm_table"] == True) + | (dataset.df_result["pair_mutation_in_dm_table"]) ), "keep_for_binding", ] = True diff --git a/src/sanity_checks.py b/src/sanity_checks.py index 8b74546..ee8844d 100644 --- a/src/sanity_checks.py +++ b/src/sanity_checks.py @@ -51,7 +51,7 @@ def check_pairs_without_pchembl_are_in_drug_mechanisms(df_result: pd.DataFrame): ]: assert df_result[(df_result[pchembl_col].isnull())].equals( df_result[ - (df_result["pair_mutation_in_dm_table"] == True) + (df_result["pair_mutation_in_dm_table"]) & (df_result[pchembl_col].isnull()) ] ), f"Missing pchembl value in column {pchembl_col}" From fe2a49a4108134cc6cffcf196d8c6e4e1004ffe7 Mon Sep 17 00:00:00 2001 From: Lina Heinzke Date: Tue, 20 Feb 2024 23:41:07 +0000 Subject: [PATCH 6/8] Remove unnecessary variables from Dataset --- src/add_chembl_compound_properties.py | 80 ++++----- src/add_chembl_target_class_annotations.py | 104 ++++++++---- src/add_dti_annotations.py | 4 + src/add_rdkit_compound_descriptors.py | 2 + src/clean_dataset.py | 4 +- src/dataset.py | 15 +- src/get_activity_ct_pairs.py | 38 ++++- src/get_dataset.py | 2 +- src/get_drug_mechanism_ct_pairs.py | 77 +++++---- src/sanity_checks.py | 185 ++++++++++----------- 10 files changed, 285 insertions(+), 226 deletions(-) diff --git a/src/add_chembl_compound_properties.py b/src/add_chembl_compound_properties.py index 4e5d623..879c8dc 100644 --- a/src/add_chembl_compound_properties.py +++ b/src/add_chembl_compound_properties.py @@ -3,12 +3,13 @@ import pandas as pd from dataset import Dataset +import sanity_checks ########### Add Compound Properties Based on ChEMBL Data ########### -def add_first_publication_date( - dataset: Dataset, chembl_con: sqlite3.Connection, limit_to_literature: bool -): +def get_first_publication_cpd_date( + chembl_con: sqlite3.Connection, limit_to_literature: bool +) -> pd.DataFrame: """ Query and calculate the first publication of a compound based on ChEMBL data (column name: first_publication_cpd). @@ -16,13 +17,12 @@ def add_first_publication_date( of the compound in the literature according to ChEMBL. Otherwise this is the first appearance in any source in ChEMBL. - :param dataset: Dataset with compound-target pairs. - Will be updated to include first_publication_cpd - :type dataset: Dataset :param chembl_con: Sqlite3 connection to ChEMBL database. :type chembl_con: sqlite3.Connection :param limit_to_literature: Base first_publication_cpd on literature sources only if True. :type limit_to_literature: bool + :return: Pandas DataFrame with parent_molregno and first_publication_cpd from ChEMBL. 
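
The recurring guard if logging.DEBUG >= logging.root.level: (used throughout the pipeline and wrapped by add_debugging_info in a later commit) is a plain numeric comparison against the root logger's configured level; a minimal demonstration:

    import logging

    logging.basicConfig(level=logging.DEBUG)

    # logging.DEBUG == 10, logging.INFO == 20; root.level is whatever was configured
    if logging.DEBUG >= logging.root.level:
        print("debug-only bookkeeping runs")
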
+ :rtype: pd.DataFrame """ # information about salts is aggregated in the parent sql = """ @@ -43,26 +43,21 @@ def add_first_publication_date( ].transform("min") df_docs = df_docs[["parent_molregno", "first_publication_cpd"]].drop_duplicates() - dataset.df_result = dataset.df_result.merge( - df_docs, on="parent_molregno", how="left" - ) + return df_docs -def add_chembl_properties_and_structures( - dataset: Dataset, chembl_con: sqlite3.Connection -): +def get_chembl_properties_and_structures( + chembl_con: sqlite3.Connection, +) -> pd.DataFrame: """ - Add compound properties from the compound_properties table + Get compound properties from the compound_properties table (e.g., alogp, #hydrogen bond acceptors / donors, etc.). - Add InChI, InChI key and canonical smiles. + Get InChI, InChI key and canonical smiles. - :param dataset: Dataset with compound-target pairs. - Will be updated to include compound properties and structures. - dataset.df_cpd_props will be set to - compound properties and structures for all compound ids in ChEMBL. - :type dataset: Dataset :param chembl_con: Sqlite3 connection to ChEMBL database. :type chembl_con: sqlite3.Connection + :return: Pandas DataFrame with compound properties and structures for all compound ids in ChEMBL + :rtype: pd.DataFrame """ sql = """ SELECT DISTINCT mh.parent_molregno, @@ -79,16 +74,13 @@ def add_chembl_properties_and_structures( """ df_cpd_props = pd.read_sql_query(sql, con=chembl_con) - dataset.df_cpd_props = df_cpd_props - dataset.df_result = dataset.df_result.merge( - df_cpd_props, on="parent_molregno", how="left" - ) + return df_cpd_props -def add_ligand_efficiency_metrics(dataset: Dataset): +def calculate_ligand_efficiency_metrics(dataset: Dataset): """ - Calculate the ligand efficiency metrics for the compounds + Calculate and add the ligand efficiency metrics for the compounds based on the mean pchembl values for a compound-target pair and the following ligand efficiency (LE) formulas: @@ -150,20 +142,18 @@ def add_ligand_efficiency_metrics(dataset: Dataset): ) -def add_atc_classification(dataset: Dataset, chembl_con: sqlite3.Connection): +def get_atc_classification(chembl_con: sqlite3.Connection) -> pd.DataFrame: """ - Query and add ATC classifications (level 1) from the atc_classification and + Query ATC classifications (level 1) from the atc_classification and molecule_atc_classification tables. ATC level annotations for the same parent_molregno are combined into one description that concatenates all descriptions sorted alphabetically into one string with ' | ' as a separator. - :param dataset: Dataset with compound-target pairs. - Will be updated to include ATC classifications. - dataset.atc_levels will be set to ATC annotations in ChEMBL. - :type dataset: Dataset :param chembl_con: Sqlite3 connection to ChEMBL database. :type chembl_con: sqlite3.Connection + :return: Pandas DataFrame with ATC annotations in ChEMBL. 
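
get_first_publication_cpd_date above reduces per-document years to one value per parent compound with a grouped transform followed by de-duplication. The core idiom on toy data:

    import pandas as pd

    df_docs = pd.DataFrame({"parent_molregno": [1, 1, 2], "year": [2003, 1999, 2010]})
    df_docs["first_publication_cpd"] = df_docs.groupby("parent_molregno")[
        "year"
    ].transform("min")
    df_docs = df_docs[["parent_molregno", "first_publication_cpd"]].drop_duplicates()
    # -> parent 1: 1999, parent 2: 2010
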
+ :rtype: pd.DataFrame """ sql = """ SELECT DISTINCT mh.parent_molregno, atc.level1, atc.level1_description @@ -186,11 +176,7 @@ def add_atc_classification(dataset: Dataset, chembl_con: sqlite3.Connection): ].transform(lambda x: between_str_join.join(sorted(x))) atc_levels = atc_levels[["parent_molregno", "atc_level1"]].drop_duplicates() - dataset.atc_levels = atc_levels - - dataset.df_result = dataset.df_result.merge( - atc_levels, on="parent_molregno", how="left" - ) + return atc_levels def add_all_chembl_compound_properties( @@ -214,10 +200,24 @@ def add_all_chembl_compound_properties( Base it on all available sources otherwise. :type limit_to_literature: bool """ - add_first_publication_date(dataset, chembl_con, limit_to_literature) + df_docs = get_first_publication_cpd_date(chembl_con, limit_to_literature) + dataset.df_result = dataset.df_result.merge( + df_docs, on="parent_molregno", how="left" + ) - add_chembl_properties_and_structures(dataset, chembl_con) + df_cpd_props = get_chembl_properties_and_structures(chembl_con) + dataset.df_cpd_props = df_cpd_props + dataset.df_result = dataset.df_result.merge( + df_cpd_props, on="parent_molregno", how="left" + ) + sanity_checks.check_compound_props(dataset.df_result, df_cpd_props) - add_ligand_efficiency_metrics(dataset) + calculate_ligand_efficiency_metrics(dataset) + sanity_checks.check_ligand_efficiency_metrics(dataset.df_result) - add_atc_classification(dataset, chembl_con) + atc_levels = get_atc_classification(chembl_con) + dataset.atc_levels = atc_levels + dataset.df_result = dataset.df_result.merge( + atc_levels, on="parent_molregno", how="left" + ) + sanity_checks.check_atc(dataset.df_result, atc_levels) diff --git a/src/add_chembl_target_class_annotations.py b/src/add_chembl_target_class_annotations.py index 0cb388c..d9aca47 100644 --- a/src/add_chembl_target_class_annotations.py +++ b/src/add_chembl_target_class_annotations.py @@ -7,6 +7,7 @@ import write_subsets from arguments import OutputArgs, CalculationArgs from dataset import Dataset +import sanity_checks ########### Add Target Class Annotations Based on ChEMBL Data ########### @@ -80,44 +81,31 @@ def get_target_class_table( return df_target_classes -def add_chembl_target_class_annotations( +def get_aggregated_target_classes( dataset: Dataset, chembl_con: sqlite3.Connection, - args: CalculationArgs, - out: OutputArgs, -): +) -> tuple[pd.DataFrame, pd.DataFrame]: """ - Add level 1 and 2 target class annotations. - Assignments for target IDs with more than one target class assignment per level - are summarised into one string with '|' as a separator - between the different target class annotations. - - Targets with more than one level 1 / level 2 target class assignment are written to a file. - These could be reassigned by hand if a single target class is preferable. + Get mappings for target id to aggregated level 1 / level 2 target class. :param dataset: Dataset with compound-target pairs. - Will be updated to only include target class annotations. - dataset.target_classes_level1 will be set to - pandas DataFrame with mapping from target id to level 1 target class - dataset.target_classes_level2 will be set to - pandas DataFrame with mapping from target id to level 2 target class :type dataset: Dataset :param chembl_con: Sqlite3 connection to ChEMBL database. 
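
get_atc_classification above uses the same grouped-transform idiom to concatenate all level 1 descriptions per compound, sorted and joined with ' | '. A sketch with made-up descriptions:

    import pandas as pd

    atc = pd.DataFrame(
        {
            "parent_molregno": [1, 1, 2],
            "level1_description": ["NERVOUS SYSTEM", "CARDIOVASCULAR SYSTEM", "DERMATOLOGICALS"],
        }
    )
    between_str_join = " | "
    atc["atc_level1"] = atc.groupby("parent_molregno")["level1_description"].transform(
        lambda x: between_str_join.join(sorted(x))
    )
    atc = atc[["parent_molregno", "atc_level1"]].drop_duplicates()
    # -> compound 1: "CARDIOVASCULAR SYSTEM | NERVOUS SYSTEM"
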
:type chembl_con: sqlite3.Connection
-    :param args: Arguments related to how to calculate the dataset
-    :type args: CalculationArgs
-    :param out: Arguments related to how to output the dataset
-    :type out: OutputArgs
+    :return: [pandas DataFrame with mapping from target id to level 1 target class,
+        pandas DataFrame with mapping from target id to level 2 target class]
+    :rtype: tuple[pd.DataFrame, pd.DataFrame]
     """
     current_tids = set(dataset.df_result["tid"])
     df_target_classes = get_target_class_table(chembl_con, current_tids)
 
+    between_str_join = "|"
+
     # Summarise the information for a target id with
     # several assigned target classes of level 1 into one description.
     # If a target id has more than one assigned target class,
     # the target class 'Unclassified protein' is discarded.
     level = "l1"
-    between_str_join = "|"
     target_classes_level1 = df_target_classes[["tid", level]].drop_duplicates().dropna()
 
     # remove 'Unclassified protein' from targets with more than one target class, level 1
@@ -145,10 +133,6 @@ def add_chembl_target_class_annotations(
         ["tid", "target_class_l1"]
     ].drop_duplicates()
 
-    dataset.df_result = dataset.df_result.merge(
-        target_classes_level1, on="tid", how="left"
-    )
-
     # Repeat the summary step for target classes of level 2.
     level = "l2"
     target_classes_level2 = df_target_classes[["tid", level]].drop_duplicates().dropna()
@@ -159,11 +143,24 @@ def add_chembl_target_class_annotations(
         ["tid", "target_class_l2"]
     ].drop_duplicates()
 
-    dataset.df_result = dataset.df_result.merge(
-        target_classes_level2, on="tid", how="left"
-    )
+    return target_classes_level1, target_classes_level2
+
 
-    # Output targets have more than one target class assignment
+def output_ambiguous_target_classes(
+    dataset: Dataset,
+    args: CalculationArgs,
+    out: OutputArgs,
+):
+    """
+    Output targets that have more than one target class assignment.
+
+    :param dataset: Dataset with compound-target pairs.
+    :type dataset: Dataset
+    :param args: Arguments related to how to calculate the dataset
+    :type args: CalculationArgs
+    :param out: Arguments related to how to output the dataset
+    :type out: OutputArgs
+    """
     more_than_one_level_1 = dataset.df_result[
         (dataset.df_result["target_class_l1"].notnull())
         & (dataset.df_result["target_class_l1"].str.contains("|", regex=False))
@@ -203,5 +200,50 @@ def add_chembl_target_class_annotations(
         out,
     )
 
-    dataset.target_classes_level1 = target_classes_level1
-    dataset.target_classes_level2 = target_classes_level2
+
+def add_chembl_target_class_annotations(
+    dataset: Dataset,
+    chembl_con: sqlite3.Connection,
+    args: CalculationArgs,
+    out: OutputArgs,
+):
+    """
+    Add level 1 and 2 target class annotations.
+    Assignments for target IDs with more than one target class assignment per level
+    are summarised into one string with '|' as a separator
+    between the different target class annotations.
+
+    Targets with more than one level 1 / level 2 target class assignment are written to a file.
+    These could be reassigned by hand if a single target class is preferable.
+
+    :param dataset: Dataset with compound-target pairs.
+        Will be updated to include target class annotations
+        (columns target_class_l1 and target_class_l2 in dataset.df_result).
+    :type dataset: Dataset
+    :param chembl_con: Sqlite3 connection to ChEMBL database.
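
get_aggregated_target_classes above does two things per level: drop 'Unclassified protein' for targets that also have a real class, then join the remaining classes with '|'. A condensed sketch of that logic (toy data; the real code works on the df_target_classes query result):

    import pandas as pd

    tc = pd.DataFrame(
        {"tid": [1, 1, 2], "l1": ["Enzyme", "Unclassified protein", "Ion channel"]}
    )

    # discard 'Unclassified protein' only where a target has more than one class
    n_classes = tc.groupby("tid")["l1"].transform("nunique")
    tc = tc[~((n_classes > 1) & (tc["l1"] == "Unclassified protein"))]

    tc["target_class_l1"] = tc.groupby("tid")["l1"].transform(
        lambda x: "|".join(sorted(set(x)))
    )
    tc = tc[["tid", "target_class_l1"]].drop_duplicates()
    # -> tid 1: "Enzyme", tid 2: "Ion channel"
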
+ :type chembl_con: sqlite3.Connection + :param args: Arguments related to how to calculate the dataset + :type args: CalculationArgs + :param out: Arguments related to how to output the dataset + :type out: OutputArgs + """ + target_classes_level1, target_classes_level2 = get_aggregated_target_classes( + dataset, chembl_con + ) + + dataset.df_result = dataset.df_result.merge( + target_classes_level1, on="tid", how="left" + ) + + dataset.df_result = dataset.df_result.merge( + target_classes_level2, on="tid", how="left" + ) + + sanity_checks.check_target_classes( + dataset.df_result, target_classes_level1, target_classes_level2 + ) + + output_ambiguous_target_classes(dataset, args, out) diff --git a/src/add_dti_annotations.py b/src/add_dti_annotations.py index 6596535..b1fdda3 100644 --- a/src/add_dti_annotations.py +++ b/src/add_dti_annotations.py @@ -80,6 +80,7 @@ def add_dti_annotations( ), "DTI", ] = "D_DT" + dataset.df_result.loc[ ( dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set) @@ -87,6 +88,7 @@ def add_dti_annotations( ), "DTI", ] = "C3_DT" + dataset.df_result.loc[ ( dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set) @@ -94,6 +96,7 @@ def add_dti_annotations( ), "DTI", ] = "C2_DT" + dataset.df_result.loc[ ( dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set) @@ -101,6 +104,7 @@ def add_dti_annotations( ), "DTI", ] = "C1_DT" + # Compounds that are in the drug_mechanism table but don't have a known phase between 1-4: dataset.df_result.loc[ ( diff --git a/src/add_rdkit_compound_descriptors.py b/src/add_rdkit_compound_descriptors.py index 1bc9268..1d9ccc3 100644 --- a/src/add_rdkit_compound_descriptors.py +++ b/src/add_rdkit_compound_descriptors.py @@ -4,6 +4,7 @@ from tqdm import tqdm from dataset import Dataset +import sanity_checks def add_built_in_descriptors(dataset: Dataset): @@ -168,3 +169,4 @@ def add_rdkit_compound_descriptors(dataset: Dataset): """ add_built_in_descriptors(dataset) add_aromaticity_descriptors(dataset) + sanity_checks.check_rdkit_props(dataset.df_result) diff --git a/src/clean_dataset.py b/src/clean_dataset.py index 6efda90..d20c1de 100644 --- a/src/clean_dataset.py +++ b/src/clean_dataset.py @@ -6,6 +6,7 @@ from dataset import Dataset +########### Remove Irrelevant Compounds ########### def remove_compounds_without_smiles_and_mixtures( dataset: Dataset, chembl_con: sqlite3.Connection ): @@ -96,9 +97,8 @@ def remove_compounds_without_smiles_and_mixtures( ) ] - return dataset.df_result - +########### General Cleaning Steps ########### def clean_none_values(dataset: Dataset): """ Change nan values and empty strings to None for consistency. 
diff --git a/src/dataset.py b/src/dataset.py
index 8b3d29f..2a39237 100644
--- a/src/dataset.py
+++ b/src/dataset.py
@@ -7,22 +7,15 @@ class Dataset:
     """
     df_result: Pandas DataFrame with the full dataset
+    drug_mechanism_pairs_set: Set of compound-target pairs in the drug_mechanism table,
+        used for DTI assignments
+    drug_mechanism_targets_set: Set of targets in the drug_mechanism table,
+        used for DTI assignments
     df_sizes_all: List of intermediate sized of the dataset used for debugging
     df_sizes_pchembl: List of intermediate sized of the dataset used for debugging
-    drug_mechanism_pairs_set: Set of compound-target pairs in the drug_mechanism table
-    drug_mechanism_targets_set: Set of targets in the drug_mechanism table
-    df_cpd_props: Pandas DataFrame with compound properties and
-        structures for all compound ids in ChEMBL
-    atc_levels: Pandas DataFrame with ATC annotations in ChEMBL
-    target_classes_level1: Pandas DataFrame with mapping from target id to level 1 target class
-    target_classes_level2: Pandas DataFrame with mapping from target id to level 2 target class
     """
 
     df_result: pd.DataFrame
-    df_cpd_props: pd.DataFrame
-    atc_levels: pd.DataFrame
-    target_classes_level1: pd.DataFrame
-    target_classes_level2: pd.DataFrame
     drug_mechanism_pairs_set: set
     drug_mechanism_targets_set: set
     df_sizes_all: list[int]
diff --git a/src/get_activity_ct_pairs.py b/src/get_activity_ct_pairs.py
index ba6811e..10cdef4 100644
--- a/src/get_activity_ct_pairs.py
+++ b/src/get_activity_ct_pairs.py
@@ -163,10 +163,10 @@ def get_average_info(df: pd.DataFrame, suffix: str) -> pd.DataFrame:
 
 ########### Get Aggregated Compound-Target Pair Information ###########
-def get_aggregated_activity_ct_pairs(
+def get_aggregated_compound_target_pairs_with_pchembl(
     chembl_con: sqlite3.Connection,
     limit_to_literature: bool,
-) -> Dataset:
+) -> pd.DataFrame:
     """
     Get dataset of compound target-pairs with an associated pchembl value
     with pchembl and publication dates aggregated into one entry per pair.
@@ -186,9 +186,9 @@ def get_aggregated_activity_ct_pairs(
     :param limit_to_literature: Include only literature sources if True.
         Include all available sources otherwise.
     :type limit_to_literature: bool
-    :return: Dataset with a pandas Dataframe with compound-target pairs
+    :return: Pandas Dataframe with compound-target pairs
         based on ChEMBL activity data aggregated into one entry per compound-target pair.
-    :rtype: Dataset
+    :rtype: pd.DataFrame
     """
     df_mols = get_compound_target_pairs_with_pchembl(
         chembl_con,
@@ -222,12 +222,32 @@ def get_aggregated_activity_ct_pairs(
         how="left",
     )
 
+    return df_combined
+
+
+def get_aggregated_activity_ct_pairs(
+    chembl_con: sqlite3.Connection,
+    limit_to_literature: bool,
+) -> Dataset:
+    """
+    Wrapper for get_aggregated_compound_target_pairs_with_pchembl,
+    initialising a dataset.
+
+    :param chembl_con: Sqlite3 connection to ChEMBL database.
+    :type chembl_con: sqlite3.Connection
+    :param limit_to_literature: Include only literature sources if True.
+        Include all available sources otherwise.
+    :type limit_to_literature: bool
+    :return: Dataset with a pandas Dataframe with compound-target pairs
+        based on ChEMBL activity data aggregated into one entry per compound-target pair.
+    :rtype: Dataset
+    """
+    df_result = get_aggregated_compound_target_pairs_with_pchembl(
+        chembl_con, limit_to_literature
+    )
+
     dataset = Dataset(
-        df_combined,
-        pd.DataFrame(),
-        pd.DataFrame(),
-        pd.DataFrame(),
-        pd.DataFrame(),
+        df_result,
         set(),
         set(),
         [],
diff --git a/src/get_dataset.py b/src/get_dataset.py
index c9966d5..8717cce 100644
--- a/src/get_dataset.py
+++ b/src/get_dataset.py
@@ -71,7 +71,7 @@ def get_ct_pair_dataset(
     get_stats.add_debugging_info(dataset, dataset.df_result, "clean df")
 
     logging.info("sanity_checks")
-    sanity_checks.sanity_checks(dataset, args.calculate_rdkit)
+    sanity_checks.sanity_checks(dataset)
 
     logging.info("add_filtering_columns")
     add_filtering_columns.add_filtering_columns(dataset, args, out)
diff --git a/src/get_drug_mechanism_ct_pairs.py b/src/get_drug_mechanism_ct_pairs.py
index 5f1fa7c..35b430f 100644
--- a/src/get_drug_mechanism_ct_pairs.py
+++ b/src/get_drug_mechanism_ct_pairs.py
@@ -4,6 +4,7 @@
 import pandas as pd
 
 from dataset import Dataset
+import sanity_checks
 
 
 ########### Extract Drug-Target Interactions From the drug_mechanism Table ###########
@@ -241,29 +242,16 @@ def get_drug_mechanism_ct_pairs(chembl_con: sqlite3.Connection) -> pd.DataFrame:
 
 ########### Add Compounds From the drug_mechanism Table to the Dataset ###########
-def add_drug_mechanism_ct_pairs(dataset: Dataset, chembl_con: sqlite3.Connection):
+def add_dm_filtering_columns(dataset: Dataset):
     """
-    Add compound-target pairs from the drug_mechanism table
-    that are not in the dataset based on the initial ChEMBL query.
-    These are compound-target pairs for which there is no associated pchembl value data.
-    Since the pairs are known interactions,
-    they are added to the dataset despite not having a pchembl value.
-    Add the set of compound-target pairs in the drug_mechanism table and
-    the set of targets in the drug_mechanism table to the dataset.
+    Add filtering columns related to the drug_mechanism table.
+    - pair_mutation_in_dm_table: pair is in dm table (incl. mutations)
+    - pair_in_dm_table: pair is in dm table (excl. mutations)
+    - keep_for_binding: use to limit to binding assays
 
     :param dataset: Dataset with compound-target pairs based on ChEMBL activity data
     :type dataset: Dataset
-    :param chembl_con: Sqlite3 connection to ChEMBL database.
-    :type chembl_con: sqlite3.Connection
     """
-    cpd_target_pairs = get_drug_mechanism_ct_pairs(chembl_con)
-    dataset.drug_mechanism_pairs_set = set(
-        f"{a}_{b}"
-        for a, b in zip(cpd_target_pairs["parent_molregno"], cpd_target_pairs["tid"])
-    )
-
-    dataset.drug_mechanism_targets_set = set(cpd_target_pairs["tid"])
-
     # Add a new column *pair_mutation_in_dm_table* which is set to True if the compound target pair
     # (taking mutation annotations into account) is in the drug_mechanism table.
     dataset.df_result["pair_mutation_in_dm_table"] = False
     dataset.df_result.loc[
         (
             dataset.df_result["cpd_target_pair_mutation"].isin(
                 dataset.drug_mechanism_pairs_set
             )
         ),
         "pair_mutation_in_dm_table",
     ] = True
 
     # Add a new column *pair_in_dm_table* which is set to True if the compound target pair
     # (NOT taking mutation annotations into account) is in the drug_mechanism table.
     dataset.df_result["pair_in_dm_table"] = False
     dataset.df_result.loc[
         (dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set)),
         "pair_in_dm_table",
     ] = True
 
+    # Add a new column *keep_for_binding* which is set to True if the row should be kept
+    # if you want to limit the dataset to only data based on binding assays.
+    # Rows are kept if
+    # - there is a binding data-based pchembl value or
+    # - the compound-target pair (including mutation info) is in the drug_mechanism table
+    dataset.df_result["keep_for_binding"] = False
+    dataset.df_result.loc[
+        (
+            (dataset.df_result["pchembl_value_mean_B"].notnull())
+            | (dataset.df_result["pair_mutation_in_dm_table"])
+        ),
+        "keep_for_binding",
+    ] = True
+
+
+def add_drug_mechanism_ct_pairs(dataset: Dataset, chembl_con: sqlite3.Connection):
+    """
+    Add compound-target pairs from the drug_mechanism table
+    that are not in the dataset based on the initial ChEMBL query.
+    These are compound-target pairs for which there is no associated pchembl value data.
+    Since the pairs are known interactions,
+    they are added to the dataset despite not having a pchembl value.
+    Add the set of compound-target pairs in the drug_mechanism table and
+    the set of targets in the drug_mechanism table to the dataset.
+
+    :param dataset: Dataset with compound-target pairs based on ChEMBL activity data
+    :type dataset: Dataset
+    :param chembl_con: Sqlite3 connection to ChEMBL database.
+    :type chembl_con: sqlite3.Connection
+    """
+    cpd_target_pairs = get_drug_mechanism_ct_pairs(chembl_con)
+    dataset.drug_mechanism_pairs_set = set(
+        f"{a}_{b}"
+        for a, b in zip(cpd_target_pairs["parent_molregno"], cpd_target_pairs["tid"])
+    )
+    dataset.drug_mechanism_targets_set = set(cpd_target_pairs["tid"])
+
     ##### Limit the drug_mechanism pairs to the ones that are not yet in the dataset. #####
     # Mutation annotations are taken into account.
     # Therefore, *(cpd A, target B without mutation)* will be added
@@ -305,16 +330,6 @@ def add_drug_mechanism_ct_pairs(dataset: Dataset, chembl_con: sqlite3.Connection
     # Combined data of existing query with new compound-target pairs.
     dataset.df_result = pd.concat([dataset.df_result, cpd_target_pairs])
 
-    # Add a new column *keep_for_binding* which is set to True if the row should be kept
-    # if you want to limit the dataset to only data based on binding assays.
-    # Rows are kept if
-    # - there is a binding data-based pchembl value or
-    # - the compound-target pair (including mutation info) is in the drug_mechanism table
-    dataset.df_result["keep_for_binding"] = False
-    dataset.df_result.loc[
-        (
-            (dataset.df_result["pchembl_value_mean_B"].notnull())
-            | (dataset.df_result["pair_mutation_in_dm_table"])
-        ),
-        "keep_for_binding",
-    ] = True
+    add_dm_filtering_columns(dataset)
+
+    sanity_checks.check_pairs_without_pchembl_are_in_drug_mechanisms(dataset.df_result)
diff --git a/src/sanity_checks.py b/src/sanity_checks.py
index ee8844d..ad94c89 100644
--- a/src/sanity_checks.py
+++ b/src/sanity_checks.py
@@ -3,38 +3,7 @@
 from dataset import Dataset
 
 
-########### Sanity checks for the dataset ###########
-def check_null_values(df_result: pd.DataFrame):
-    """
-    Check if any columns contain nan or null which aren't recognised as null values.
-    """
-    for col in df_result.columns:
-        col_as_str = set(df_result[df_result[col].notnull()][col].astype(str))
-        assert (
-            "nan" not in col_as_str
-        ), f"Problem with unrecognised nan value in column {col}"
-        assert (
-            "null" not in col_as_str
-        ), f"Problem with unrecognised null value in column {col}"
-
-
-def check_for_mixed_types(df_result: pd.DataFrame):
-    """
-    Check that there are no mixed types in columns with dtype=object.
- """ - for col, dtype in df_result.dtypes.to_dict().items(): - if dtype == object: - col_original = set(df_result[df_result[col].notnull()][col]) - col_as_str = set(df_result[df_result[col].notnull()][col].astype(str)) - # is there a difference in the two sets (ignoring null values) - assert ( - len(col_original - col_as_str) == 0 - ), f"Mixed types in colum {col}: {col_original-col_as_str}" - assert ( - len(col_as_str - col_original) == 0 - ), f"Mixed types in colum {col}: {col_as_str-col_original}" - - +########### Sanity checks during assignments ########### def check_pairs_without_pchembl_are_in_drug_mechanisms(df_result: pd.DataFrame): """ Check that rows without a pchembl value based on binding+functional assays (pchembl_x_BF) @@ -57,6 +26,36 @@ def check_pairs_without_pchembl_are_in_drug_mechanisms(df_result: pd.DataFrame): ), f"Missing pchembl value in column {pchembl_col}" +def check_compound_props(df_result: pd.DataFrame, df_cpd_props: pd.DataFrame): + """ + Check that compound props are only null if + + - the property in the parent_molregno is not in df_cpd_props + - or if the value in the compound props table is null. + """ + # missing values because the parent_molregno is not in the compound props table + no_cpd_prop_info = len( + df_result[ + ~df_result["parent_molregno"].isin(set(df_cpd_props["parent_molregno"])) + ] + ) + + for col in df_cpd_props.columns: + if col != "parent_molregno": + # missing values because the compound props query returns null (exists but is null) + missing_values = len( + df_result[ + df_result["parent_molregno"].isin( + set(df_cpd_props[df_cpd_props[col].isnull()]["parent_molregno"]) + ) + ] + ) + null_values = no_cpd_prop_info + missing_values + assert null_values == len( + df_result[df_result[col].isnull()] + ), f"Too many null values in {col}" + + def check_ligand_efficiency_metrics(df_result: pd.DataFrame): """ Check that ligand efficiency metrics are only null @@ -97,69 +96,38 @@ def check_ligand_efficiency_metrics(df_result: pd.DataFrame): ), f"Missing LLE value in LLE_{suffix}" -def check_compound_props(dataset: Dataset): +def check_atc( + df_result: pd.DataFrame, + atc_levels: pd.DataFrame, +): """ - Check that compound props are only null if - - - the property in the parent_molregno is not in df_cpd_props - - or if the value in the compound props table is null. + Check that atc_level1 information is only null + if the parent_molregno is not in the respective table. """ - # missing values because the parent_molregno is not in the compound props table - no_cpd_prop_info = len( - dataset.df_result[ - ~dataset.df_result["parent_molregno"].isin( - set(dataset.df_cpd_props["parent_molregno"]) - ) + assert df_result[(df_result["atc_level1"].isnull())].equals( + df_result[ + ~df_result["parent_molregno"].isin(set(atc_levels["parent_molregno"])) ] - ) - - for col in dataset.df_cpd_props.columns: - if col != "parent_molregno": - # missing values because the compound props query returns null (exists but is null) - missing_values = len( - dataset.df_result[ - dataset.df_result["parent_molregno"].isin( - set( - dataset.df_cpd_props[dataset.df_cpd_props[col].isnull()][ - "parent_molregno" - ] - ) - ) - ] - ) - null_values = no_cpd_prop_info + missing_values - assert null_values == len( - dataset.df_result[dataset.df_result[col].isnull()] - ), f"Too many null values in {col}" + ), "Null values in atc_level1 are not exclusively \ + because the parent_molregno is not in the atc_classification table." 
-def check_atc_and_target_classes( - dataset: Dataset, +def check_target_classes( + df_result: pd.DataFrame, + target_classes_level1: pd.DataFrame, + target_classes_level2: pd.DataFrame, ): """ - Check that atc_level1 and target class information is only null - if the parent_molregno / target id is not in the respective table. + Check that target class information is only null + if the target id is not in the respective table. """ - assert dataset.df_result[(dataset.df_result["atc_level1"].isnull())].equals( - dataset.df_result[ - ~dataset.df_result["parent_molregno"].isin( - set(dataset.atc_levels["parent_molregno"]) - ) - ] - ), "Null values in atc_level1 are not exclusively \ - because the parent_molregno is not in the atc_classification table." - - assert dataset.df_result[(dataset.df_result["target_class_l1"].isnull())].equals( - dataset.df_result[ - ~dataset.df_result["tid"].isin(set(dataset.target_classes_level1["tid"])) - ] + assert df_result[(df_result["target_class_l1"].isnull())].equals( + df_result[~df_result["tid"].isin(set(target_classes_level1["tid"]))] ), "Null values in target_class_l1 are not exclusively \ because the tid is not in the protein_classification table." - assert dataset.df_result[(dataset.df_result["target_class_l2"].isnull())].equals( - dataset.df_result[ - ~dataset.df_result["tid"].isin(set(dataset.target_classes_level2["tid"])) - ] + assert df_result[(df_result["target_class_l2"].isnull())].equals( + df_result[~df_result["tid"].isin(set(target_classes_level2["tid"]))] ), "Null values in target_class_l2 are not exclusively \ because the tid is not in the protein_classification table." @@ -195,25 +163,46 @@ def check_rdkit_props(df_result: pd.DataFrame): ), f"Missing value in {col} despite a smiles being available." +########### Final sanity checks for the dataset ########### +def check_null_values(df_result: pd.DataFrame): + """ + Check if any columns contain nan or null which aren't recognised as null values. + """ + for col in df_result.columns: + col_as_str = set(df_result[df_result[col].notnull()][col].astype(str)) + assert ( + "nan" not in col_as_str + ), f"Problem with unrecognised nan value in column {col}" + assert ( + "null" not in col_as_str + ), f"Problem with unrecognised null value in column {col}" + + +def check_for_mixed_types(df_result: pd.DataFrame): + """ + Check that there are no mixed types in columns with dtype=object. 
+ """ + for col, dtype in df_result.dtypes.to_dict().items(): + if dtype == object: + col_original = set(df_result[df_result[col].notnull()][col]) + col_as_str = set(df_result[df_result[col].notnull()][col].astype(str)) + # is there a difference in the two sets (ignoring null values) + assert ( + len(col_original - col_as_str) == 0 + ), f"Mixed types in colum {col}: {col_original-col_as_str}" + assert ( + len(col_as_str - col_original) == 0 + ), f"Mixed types in colum {col}: {col_as_str-col_original}" + + def sanity_checks( dataset: Dataset, - calculate_rdkit: bool, ): """ Check basic assumptions about the finished dataset, specifically: - no columns contain nan or null values which aren't recognised as null values - there are no mixed types in columns with dtype=object - - rows without a pchembl value based on binding+functional assays (pchembl_x_BF) - are in the drug_mechanism table - - ligand efficiency metrics are only null when at least one of the values - used to calculate them is null - - compound props are only null if the compound is not in df_cpd_props - or the value in that table is null - - atc_level1 and target class information is only null if - the parent_molregno / target id is not in the respective table - - columns set by the RDKit are only null if there is no canonical SMILES - for the molecule (excluding scaffolds) :param dataset: Dataset with compound-target pairs. :type dataset: Dataset @@ -222,12 +211,6 @@ def sanity_checks( """ check_null_values(dataset.df_result) check_for_mixed_types(dataset.df_result) - check_pairs_without_pchembl_are_in_drug_mechanisms(dataset.df_result) - check_ligand_efficiency_metrics(dataset.df_result) - check_compound_props(dataset) - check_atc_and_target_classes(dataset) - if calculate_rdkit: - check_rdkit_props(dataset.df_result) ########### Sanity checks for writing and reading a dataset ########### From 0c8274873e1678dd0e1b22d7817990df76b61473 Mon Sep 17 00:00:00 2001 From: Lina Heinzke Date: Wed, 21 Feb 2024 13:43:09 +0000 Subject: [PATCH 7/8] Simplify methods to output stats --- src/dataset.py | 11 ++- src/get_activity_ct_pairs.py | 4 +- src/get_dataset.py | 2 +- src/get_stats.py | 178 +++++++++++++++++++---------------- src/write_subsets.py | 110 ++++++++-------------- 5 files changed, 143 insertions(+), 162 deletions(-) diff --git a/src/dataset.py b/src/dataset.py index 2a39237..352f5bd 100644 --- a/src/dataset.py +++ b/src/dataset.py @@ -11,12 +11,15 @@ class Dataset: used for DTI assignments drug_mechanism_targets_set: Set of targets in the drug_mechanism table, used for DTI assigments - df_sizes_all: List of intermediate sized of the dataset used for debugging - df_sizes_pchembl: List of intermediate sized of the dataset used for debugging + df_sizes_all: Pandas DataFrame of intermediate sizes of the dataset, + used for debugging + df_sizes_pchembl: Pandas DataFrame of intermediate sizes of the dataset, + restricted to entries with a pchembl value, + used for debugging """ df_result: pd.DataFrame drug_mechanism_pairs_set: set drug_mechanism_targets_set: set - df_sizes_all: list[int] - df_sizes_pchembl: list[int] + df_sizes_all: pd.DataFrame + df_sizes_pchembl: pd.DataFrame diff --git a/src/get_activity_ct_pairs.py b/src/get_activity_ct_pairs.py index 10cdef4..1b68394 100644 --- a/src/get_activity_ct_pairs.py +++ b/src/get_activity_ct_pairs.py @@ -250,7 +250,7 @@ def get_aggregated_activity_ct_pairs( df_result, set(), set(), - [], - [], + pd.DataFrame(), + pd.DataFrame(), ) return dataset diff --git a/src/get_dataset.py 
b/src/get_dataset.py index 8717cce..5324e9e 100644 --- a/src/get_dataset.py +++ b/src/get_dataset.py @@ -83,4 +83,4 @@ def get_ct_pair_dataset( write_subsets.output_all_stats(dataset, args, out) if logging.DEBUG >= logging.root.level: - write_subsets.output_debug_sizes(dataset, out) + write_subsets.write_debug_sizes(dataset, out) diff --git a/src/get_stats.py b/src/get_stats.py index 0b96abc..c6ee8e2 100644 --- a/src/get_stats.py +++ b/src/get_stats.py @@ -4,94 +4,33 @@ from dataset import Dataset -##### Debugging Stats ##### -def calculate_dataset_sizes(df: pd.DataFrame) -> list[int]: +##### Logging Stats ##### +def get_stats_columns() -> tuple[list[str], list[str]]: """ - Calculate the number of unique compounds, targets and pairs - for df and df limited to drugs. - - :param df: Pandas DataFrame for which the dataset sizes should be calculated. - :type df: pd.DataFrame - :return: List of calculated unique counts. - :rtype: list[int] + Get the relevant columns for which stats should be calculated + and a list of descriptions corresponding to the columns. """ - now_mols = df["parent_molregno"].nunique() - now_targets = df["tid"].nunique() - now_targets_mutation = df["tid_mutation"].nunique() - now_pairs = df["cpd_target_pair"].nunique() - now_pairs_mutation = df["cpd_target_pair_mutation"].nunique() - - if "DTI" in df.columns: - # drugs = compounds of a compound-target pair with a known interaction - df_drugs = df[df["DTI"] == "D_DT"] - else: - df_drugs = df[df["max_phase"] == 4] - - now_drugs = df_drugs["parent_molregno"].nunique() - now_drug_targets = df_drugs["tid"].nunique() - now_drug_targets_mutation = df_drugs["tid_mutation"].nunique() - now_drug_pairs = df_drugs["cpd_target_pair"].nunique() - now_drug_pairs_mutation = df_drugs["cpd_target_pair_mutation"].nunique() - - return [ - now_mols, - now_drugs, - now_targets, - now_drug_targets, - now_targets_mutation, - now_drug_targets_mutation, - now_pairs, - now_drug_pairs, - now_pairs_mutation, - now_drug_pairs_mutation, + df_columns = [ + "parent_molregno", + "tid", + "tid_mutation", + "cpd_target_pair", + "cpd_target_pair_mutation", ] + columns_descs = [ + "compound ID", + "target ID", + "target ID with mutation annotations", + "compound-target pair", + "compound-target pair with mutation annotations", + ] + return df_columns, columns_descs -def add_dataset_sizes( - dataset: Dataset, - df: pd.DataFrame, - label: str, -): - """ - Count and add representative counts of df used for debugging to the dataset. - - :param dataset: Dataset with compound-target pairs and debugging sizes. - :type dataset: Dataset - :param df: Pandas DataFrame with current compound-target pairs - :type df: pd.DataFrame - :param label: Description of pipeline step (e.g., initial query). - :type label: str - """ - df_copy = df.copy() - dataset.df_sizes_all.append([label] + calculate_dataset_sizes(df_copy)) - - # restrict to data with any pchembl value (any data with a pchembl, - # even if it is based on only functional data) - # these statistics are purely based on removing - # compound-target pairs without pchembl information, - # i.e., the subset of the dataset is determined by the given df and not recalculated - df_pchembl = df_copy.dropna( - subset=[x for x in df_copy.columns if x.startswith("pchembl_value")], how="all" - ) - dataset.df_sizes_pchembl.append([label] + calculate_dataset_sizes(df_pchembl)) - - -def add_debugging_info( - dataset: Dataset, - df: pd.DataFrame, - label: str, -): - """ - Wrapper for add_dataset_sizes. - Handles logging level. 
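
get_stats_for_column above reduces every statistic to df[column].nunique() over some subset of rows. A toy version of that counting:

    import pandas as pd

    df = pd.DataFrame(
        {"parent_molregno": [1, 1, 2, 3], "DTI": ["D_DT", "C0_DT", "D_DT", "C0_DT"]}
    )
    column = "parent_molregno"
    counts = [
        ["all", df[column].nunique()],
        ["D_DT", df[df["DTI"] == "D_DT"][column].nunique()],
        ["C0_DT", df[df["DTI"] == "C0_DT"][column].nunique()],
    ]
    # -> [['all', 3], ['D_DT', 2], ['C0_DT', 2]]
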
- """ - if logging.DEBUG >= logging.root.level: - add_dataset_sizes(dataset, df, label) - - -##### Logging Stats ##### def get_stats_for_column( - df: pd.DataFrame, column: str, columns_desc: str + df: pd.DataFrame, + column: str, + columns_desc: str, ) -> list[list[str, str, int]]: """ Calculate the number of unique values in df[column] and various subsets of df. @@ -145,3 +84,78 @@ def get_stats_for_column( df[df["DTI"] == "C0_DT"][column].nunique(), ], ] + + +##### Debugging Stats ##### +def get_dataset_sizes(df: pd.DataFrame, label: str) -> pd.DataFrame: + """ + Calculate the number of unique compounds, targets and pairs + for df and df limited to drugs. + + :param df: Pandas DataFrame for which the dataset sizes should be calculated. + :type df: pd.DataFrame + :param label: Description of pipeline step (e.g., initial query). + :type label: str + :return: Pandas DataFrame with calculated unique counts. + :rtype: pd.DataFrame + """ + stats = {"step": label} + + if "DTI" in df.columns: + # drugs = compounds of a compound-target pair with a known interaction + df_drugs = df[df["DTI"] == "D_DT"] + else: + df_drugs = df[df["max_phase"] == 4] + + df_columns, _ = get_stats_columns() + for column in df_columns: + stats[f"{column}_all"] = df[column].nunique() + stats[f"{column}_drugs"] = df_drugs[column].nunique() + + df_stats = pd.DataFrame([stats]) + return df_stats + + +def add_dataset_sizes( + dataset: Dataset, + df: pd.DataFrame, + label: str, +): + """ + Count and add representative counts of df used for debugging to the dataset. + + :param dataset: Dataset with compound-target pairs and debugging sizes. + :type dataset: Dataset + :param df: Pandas DataFrame with current compound-target pairs + :type df: pd.DataFrame + :param label: Description of pipeline step (e.g., initial query). + :type label: str + """ + df_stats = get_dataset_sizes(df, label) + + dataset.df_sizes_all = pd.concat([dataset.df_sizes_all, df_stats]) + + # restrict to data with any pchembl value (any data with a pchembl, + # even if it is based on only functional data) + # these statistics are purely based on removing + # compound-target pairs without pchembl information, + # i.e., the subset of the dataset is determined by the given df and not recalculated + df_copy = df.copy() + df_pchembl = df_copy.dropna( + subset=[x for x in df_copy.columns if x.startswith("pchembl_value")], how="all" + ) + df_stats = get_dataset_sizes(df_pchembl, label) + dataset.df_sizes_pchembl = pd.concat([dataset.df_sizes_pchembl, df_stats]) + + +def add_debugging_info( + dataset: Dataset, + df: pd.DataFrame, + label: str, +): + """ + Wrapper for add_dataset_sizes. + Handles logging level. + """ + if logging.DEBUG >= logging.root.level: + add_dataset_sizes(dataset, df, label) diff --git a/src/write_subsets.py b/src/write_subsets.py index a5635a1..edf08b3 100644 --- a/src/write_subsets.py +++ b/src/write_subsets.py @@ -8,6 +8,7 @@ from dataset import Dataset +##### Writing Output ##### def write_output( df: pd.DataFrame, filename: str, @@ -73,6 +74,7 @@ def write_and_check_output( ) +##### Output Specific Results ##### def write_full_dataset_to_file( dataset: Dataset, args: CalculationArgs, @@ -97,59 +99,6 @@ def write_full_dataset_to_file( write_and_check_output(dataset.df_result, name_all, desc, args, out) -def output_debug_sizes( - dataset: Dataset, - out: OutputArgs, -): - """ - Output counts at various points during calculating the final dataset for debugging. - - :param dataset: Dataset with compound-target pairs and debugging sizes. 
-    :type dataset: Dataset
-    :param args: Arguments related to how to calculate the dataset
-    :type args: CalculationArgs
-    :param out: Arguments related to how to output the dataset
-    :type out: OutputArgs
-    """
-    column_names = [
-        "type",
-        "#mols",
-        "#drugs",
-        "#targets",
-        "#drug_ targets",
-        "#targets_ mutation",
-        "#drug_ targets_mutation",
-        "#cpd_tid_ pairs",
-        "#drug_tid_ pairs",
-        "#cpd_ tid_mutation_ pairs",
-        "#drug_ tid_mutation_ pairs",
-    ]
-
-    logging.debug("Size of full dataset at different points.")
-    full_df_sizes = pd.DataFrame(dataset.df_sizes_all, columns=column_names)
-    logging.debug(full_df_sizes)
-    name_full_df_sizes = os.path.join(out.output_path, "debug_full_df_sizes")
-    write_output(
-        full_df_sizes,
-        name_full_df_sizes,
-        out,
-    )
-
-    logging.debug("Size of dataset with any pchembl values at different points.")
-    logging.debug(
-        "This includes data for which we only have pchembl data \
-        for functional assays but not for binding assays."
-    )
-    df_pchembl_sizes = pd.DataFrame(dataset.df_sizes_pchembl, columns=column_names)
-    logging.debug(df_pchembl_sizes)
-    name_pchembl_df_sizes = os.path.join(out.output_path, "debug_pchembl_df_sizes")
-    write_output(
-        full_df_sizes,
-        name_pchembl_df_sizes,
-        out,
-    )
-
-
 def output_stats(
     df: pd.DataFrame,
     output_file: str,
@@ -171,33 +120,15 @@ def output_stats(
     :param out: Arguments related to how to output the dataset
     :type out: OutputArgs
     """
-    df_columns = [
-        "parent_molregno",
-        "tid",
-        "tid_mutation",
-        "cpd_target_pair",
-        "cpd_target_pair_mutation",
-    ]
-    columns_descs = [
-        "compound ID",
-        "target ID",
-        "target ID with mutation annotations",
-        "compound-target pair",
-        "compound-target pair with mutation annotations",
-    ]
-
     logging.debug("Stats for %s", output_file)
     stats = []
+    df_columns, columns_descs = get_stats.get_stats_columns()
     for column, columns_desc in zip(df_columns, columns_descs):
         logging.debug("Stats for column %s:", column)
         column_stats = get_stats.get_stats_for_column(df, column, columns_desc)
         stats += column_stats
         for colum_stat in column_stats:
-            logging.debug(
-                "%20s %s",
-                colum_stat[2],
-                colum_stat[3],
-            )
+            logging.debug("%20s %s", colum_stat[2], colum_stat[3])

     df_stats = pd.DataFrame(
         stats, columns=["column", "column_description", "subset_type", "counts"]
@@ -252,3 +183,34 @@ def output_all_stats(dataset: Dataset, args: CalculationArgs, out: OutputArgs):
             output_file,
             out,
         )
+
+
+def write_debug_sizes(
+    dataset: Dataset,
+    out: OutputArgs,
+):
+    """
+    Output counts at various points during the calculation of the final dataset for debugging.
+
+    :param dataset: Dataset with compound-target pairs and debugging sizes.
+    :type dataset: Dataset
+    :param out: Arguments related to how to output the dataset
+    :type out: OutputArgs
+    """
+    # Size of full dataset at different points.
+    name_full_df_sizes = os.path.join(out.output_path, "debug_full_df_sizes")
+    write_output(
+        dataset.df_sizes_all,
+        name_full_df_sizes,
+        out,
+    )
+
+    # Size of dataset with any pchembl values at different points.
+    # This includes data for which we only have pchembl data
+    # for functional assays but not for binding assays. 
+ name_pchembl_df_sizes = os.path.join(out.output_path, "debug_pchembl_df_sizes") + write_output( + dataset.df_sizes_pchembl, + name_pchembl_df_sizes, + out, + ) From 3fd8e87071bc6ae6c07b3cabce9cc8c8dede6b81 Mon Sep 17 00:00:00 2001 From: Lina Heinzke Date: Wed, 21 Feb 2024 16:48:06 +0000 Subject: [PATCH 8/8] Add module docstrings --- src/add_chembl_compound_properties.py | 4 ++++ src/add_chembl_target_class_annotations.py | 8 ++++++-- src/add_dti_annotations.py | 6 +++++- src/add_filtering_columns.py | 10 ++++++--- src/add_rdkit_compound_descriptors.py | 4 ++++ src/arguments.py | 6 ++++++ src/clean_dataset.py | 4 ++++ src/dataset.py | 5 +++++ src/get_activity_ct_pairs.py | 5 +++++ src/get_dataset.py | 24 +++++++++++++--------- src/get_drug_mechanism_ct_pairs.py | 5 +++++ src/get_stats.py | 4 ++++ src/main.py | 4 ++++ src/{write_subsets.py => output.py} | 7 ++++++- src/sanity_checks.py | 4 ++++ 15 files changed, 83 insertions(+), 17 deletions(-) rename src/{write_subsets.py => output.py} (98%) diff --git a/src/add_chembl_compound_properties.py b/src/add_chembl_compound_properties.py index 879c8dc..5f6afe8 100644 --- a/src/add_chembl_compound_properties.py +++ b/src/add_chembl_compound_properties.py @@ -1,3 +1,7 @@ +""" +Add ChEMBL compound properties to the dataset. +""" + import sqlite3 import pandas as pd diff --git a/src/add_chembl_target_class_annotations.py b/src/add_chembl_target_class_annotations.py index d9aca47..bb8d080 100644 --- a/src/add_chembl_target_class_annotations.py +++ b/src/add_chembl_target_class_annotations.py @@ -1,12 +1,16 @@ +""" +Add target class annotations based on ChEMBL data to the dataset. +""" + import logging import os import sqlite3 import pandas as pd -import write_subsets from arguments import OutputArgs, CalculationArgs from dataset import Dataset +import output import sanity_checks @@ -194,7 +198,7 @@ def output_ambiguous_target_classes( f"ChEMBL{args.chembl_version}_" f"CTI_{args.limited_flag}_targets_w_more_than_one_tclass", ) - write_subsets.write_output( + output.write_output( more_than_one_tclass, name_more_than_one_tclass, out, diff --git a/src/add_dti_annotations.py b/src/add_dti_annotations.py index b1fdda3..94367e9 100644 --- a/src/add_dti_annotations.py +++ b/src/add_dti_annotations.py @@ -1,7 +1,11 @@ +""" +Add DTI (Drug-Target Interaction) Annotations to the dataset. +""" + from dataset import Dataset -########### CTI (Compound-Target Interaction) Annotations ########### +########### DTI (Drug-Target Interaction) Annotations ########### def add_dti_annotations( dataset: Dataset, ): diff --git a/src/add_filtering_columns.py b/src/add_filtering_columns.py index 88ce052..27d4076 100644 --- a/src/add_filtering_columns.py +++ b/src/add_filtering_columns.py @@ -1,12 +1,16 @@ +""" +Add filtering columns for obtaining the different subsets to the dataset. 
+""" + import logging import os import pandas as pd from arguments import CalculationArgs, OutputArgs -import get_stats -import write_subsets from dataset import Dataset +import get_stats +import output def get_data_subsets(data: pd.DataFrame, min_nof_cpds: int, desc: str) -> tuple[ @@ -145,7 +149,7 @@ def add_subset_filtering_columns( f"CTI_{args.limited_flag}_" f"{subset_desc}", ) - write_subsets.write_and_check_output( + output.write_and_check_output( df_subset, name_subset, desc, diff --git a/src/add_rdkit_compound_descriptors.py b/src/add_rdkit_compound_descriptors.py index 1d9ccc3..889ff70 100644 --- a/src/add_rdkit_compound_descriptors.py +++ b/src/add_rdkit_compound_descriptors.py @@ -1,3 +1,7 @@ +""" +Add RDKit-based compound properties to the dataset. +""" + from rdkit import Chem from rdkit.Chem import Descriptors from rdkit.Chem import PandasTools diff --git a/src/arguments.py b/src/arguments.py index 080b331..ea02154 100644 --- a/src/arguments.py +++ b/src/arguments.py @@ -1,4 +1,10 @@ +""" +Dataclasses related to handling arguments, +specifically arguments related to how to calculate or output the dataset. +""" + import argparse + from dataclasses import dataclass diff --git a/src/clean_dataset.py b/src/clean_dataset.py index d20c1de..56517dd 100644 --- a/src/clean_dataset.py +++ b/src/clean_dataset.py @@ -1,3 +1,7 @@ +""" +Methods related to cleaning the dataset. +""" + import logging import sqlite3 diff --git a/src/dataset.py b/src/dataset.py index 352f5bd..19fd259 100644 --- a/src/dataset.py +++ b/src/dataset.py @@ -1,3 +1,8 @@ +""" +Dataclass for handling the calculated compound-target pair dataset +and related data. +""" + from dataclasses import dataclass import pandas as pd diff --git a/src/get_activity_ct_pairs.py b/src/get_activity_ct_pairs.py index 1b68394..3f6a02c 100644 --- a/src/get_activity_ct_pairs.py +++ b/src/get_activity_ct_pairs.py @@ -1,3 +1,8 @@ +""" +Get initial set of compound-target pairs with an associated activity +for the dataset. +""" + import sqlite3 import numpy as np diff --git a/src/get_dataset.py b/src/get_dataset.py index 5324e9e..faffb3b 100644 --- a/src/get_dataset.py +++ b/src/get_dataset.py @@ -1,18 +1,22 @@ +""" +Main workflow to calculate the compound-target pairs dataset. 
+""" + import logging import sqlite3 +from arguments import OutputArgs, CalculationArgs +import add_filtering_columns import get_activity_ct_pairs -import get_drug_mechanism_ct_pairs -import add_dti_annotations import add_chembl_compound_properties -import clean_dataset import add_chembl_target_class_annotations +import get_drug_mechanism_ct_pairs +import add_dti_annotations import add_rdkit_compound_descriptors -import sanity_checks -import write_subsets +import clean_dataset import get_stats -from arguments import OutputArgs, CalculationArgs -import add_filtering_columns +import output +import sanity_checks def get_ct_pair_dataset( @@ -77,10 +81,10 @@ def get_ct_pair_dataset( add_filtering_columns.add_filtering_columns(dataset, args, out) logging.info("write_full_dataset_to_file") - write_subsets.write_full_dataset_to_file(dataset, args, out) + output.write_full_dataset_to_file(dataset, args, out) logging.info("output_stats") - write_subsets.output_all_stats(dataset, args, out) + output.output_all_stats(dataset, args, out) if logging.DEBUG >= logging.root.level: - write_subsets.write_debug_sizes(dataset, out) + output.write_debug_sizes(dataset, out) diff --git a/src/get_drug_mechanism_ct_pairs.py b/src/get_drug_mechanism_ct_pairs.py index 35b430f..6121acd 100644 --- a/src/get_drug_mechanism_ct_pairs.py +++ b/src/get_drug_mechanism_ct_pairs.py @@ -1,3 +1,8 @@ +""" +Get and add compound-target pairs based on information +in the drug_mechanism table. +""" + import logging import sqlite3 diff --git a/src/get_stats.py b/src/get_stats.py index c6ee8e2..662f937 100644 --- a/src/get_stats.py +++ b/src/get_stats.py @@ -1,3 +1,7 @@ +""" +Get statistics of dataset for final results and debugging. +""" + import logging import pandas as pd diff --git a/src/main.py b/src/main.py index ffe2bdd..5b297b6 100644 --- a/src/main.py +++ b/src/main.py @@ -1,3 +1,7 @@ +""" +Get the compound-target pairs dataset from ChEMBL using the given arguments. +""" + import logging import sqlite3 diff --git a/src/write_subsets.py b/src/output.py similarity index 98% rename from src/write_subsets.py rename to src/output.py index edf08b3..f1a4a5f 100644 --- a/src/write_subsets.py +++ b/src/output.py @@ -1,11 +1,16 @@ +""" +Write the dataset, subsets and related statistics to files +and to the command line. +""" + import logging import os import pandas as pd import sanity_checks -import get_stats from arguments import OutputArgs, CalculationArgs from dataset import Dataset +import get_stats ##### Writing Output ##### diff --git a/src/sanity_checks.py b/src/sanity_checks.py index ad94c89..94c5d6d 100644 --- a/src/sanity_checks.py +++ b/src/sanity_checks.py @@ -1,3 +1,7 @@ +""" +Perform sanity checks on the dataset. +""" + import pandas as pd from dataset import Dataset