diff --git a/src/add_chembl_compound_properties.py b/src/add_chembl_compound_properties.py
index 836b06a..5f6afe8 100644
--- a/src/add_chembl_compound_properties.py
+++ b/src/add_chembl_compound_properties.py
@@ -1,11 +1,18 @@
+"""
+Add ChEMBL compound properties to the dataset.
+"""
+
 import sqlite3
 
 import pandas as pd
 
+from dataset import Dataset
+import sanity_checks
+
 
 ########### Add Compound Properties Based on ChEMBL Data ###########
-def add_first_publication_date(
-    df_combined: pd.DataFrame, chembl_con: sqlite3.Connection, limit_to_literature: bool
+def get_first_publication_cpd_date(
+    chembl_con: sqlite3.Connection, limit_to_literature: bool
 ) -> pd.DataFrame:
     """
     Query and calculate the first publication of a compound
@@ -14,13 +21,11 @@ def add_first_publication_date(
     of the compound in the literature according to ChEMBL.
     Otherwise this is the first appearance in any source in ChEMBL.
 
-    :param df_combined: Pandas DataFrame with compound-target pairs
-    :type df_combined: pd.DataFrame
     :param chembl_con: Sqlite3 connection to ChEMBL database.
     :type chembl_con: sqlite3.Connection
     :param limit_to_literature: Base first_publication_cpd on literature sources only if True.
     :type limit_to_literature: bool
-    :return: Pandas DataFrame with added first_publication_cpd.
+    :return: Pandas DataFrame with parent_molregno and first_publication_cpd from ChEMBL.
     :rtype: pd.DataFrame
     """
     # information about salts is aggregated in the parent
@@ -42,26 +47,21 @@ def add_first_publication_date(
     ].transform("min")
     df_docs = df_docs[["parent_molregno", "first_publication_cpd"]].drop_duplicates()
 
-    df_combined = df_combined.merge(df_docs, on="parent_molregno", how="left")
+    return df_docs
 
-    return df_combined
 
-
-def add_chembl_properties_and_structures(
-    df_combined: pd.DataFrame, chembl_con: sqlite3.Connection
-) -> tuple[pd.DataFrame, pd.DataFrame]:
+def get_chembl_properties_and_structures(
+    chembl_con: sqlite3.Connection,
+) -> pd.DataFrame:
     """
-    Add compound properties from the compound_properties table 
+    Get compound properties from the compound_properties table
     (e.g., alogp, #hydrogen bond acceptors / donors, etc.).
-    Add InChI, InChI key and canonical smiles. 
+    Get InChI, InChI key and canonical smiles.
 
-    :param df_combined: Pandas DataFrame with compound-target pairs
-    :type df_combined: pd.DataFrame
     :param chembl_con: Sqlite3 connection to ChEMBL database.
     :type chembl_con: sqlite3.Connection
-    :return: - Pandas DataFrame with added compound properties and structures. \\
-        - Pandas DataFrame with compound properties and structures for all compound ids in ChEMBL.
-    :rtype: (pd.DataFrame, pd.DataFrame)
+    :return: Pandas DataFrame with compound properties and structures for all compound ids in ChEMBL
+    :rtype: pd.DataFrame
     """
     sql = """
     SELECT DISTINCT mh.parent_molregno, 
@@ -79,14 +79,12 @@ def add_chembl_properties_and_structures(
 
     df_cpd_props = pd.read_sql_query(sql, con=chembl_con)
 
-    df_combined = df_combined.merge(df_cpd_props, on="parent_molregno", how="left")
-
-    return df_combined, df_cpd_props
+    return df_cpd_props
 
 
-def add_ligand_efficiency_metrics(df_combined: pd.DataFrame) -> pd.DataFrame:
+def calculate_ligand_efficiency_metrics(dataset: Dataset):
     """
-    Calculate the ligand efficiency metrics for the compounds
+    Calculate and add the ligand efficiency metrics for the compounds
     based on the mean pchembl values for a compound-target pair and
     the following ligand efficiency (LE) formulas:
 
@@ -108,33 +106,37 @@ def add_ligand_efficiency_metrics(df_combined: pd.DataFrame) -> pd.DataFrame:
     Once for the pchembl values based on binding + functional assays (BF)
     and once for the pchembl values based on binding assays only (B).
 
-    :param df_combined: Pandas DataFrame with compound-target pairs
-    :type df_combined: pd.DataFrame
-    :return: Pandas DataFrame with added ligand efficiency metrics
-    :rtype: pd.DataFrame
+    :param dataset: Dataset with compound-target pairs.
+        Will be updated to include ligand efficiency metrics.
+    :type dataset: Dataset
     """
     for suffix in ["BF", "B"]:
-        df_combined.loc[df_combined["heavy_atoms"] != 0, f"LE_{suffix}"] = (
-            df_combined[f"pchembl_value_mean_{suffix}"]
-            / df_combined["heavy_atoms"]
+        dataset.df_result.loc[dataset.df_result["heavy_atoms"] != 0, f"LE_{suffix}"] = (
+            dataset.df_result[f"pchembl_value_mean_{suffix}"]
+            / dataset.df_result["heavy_atoms"]
             * (2.303 * 298 * 0.00199)
         )
 
-        df_combined.loc[df_combined["mw_freebase"] != 0, f"BEI_{suffix}"] = (
-            df_combined[f"pchembl_value_mean_{suffix}"]
+        dataset.df_result.loc[
+            dataset.df_result["mw_freebase"] != 0, f"BEI_{suffix}"
+        ] = (
+            dataset.df_result[f"pchembl_value_mean_{suffix}"]
             * 1000
-            / df_combined["mw_freebase"]
+            / dataset.df_result["mw_freebase"]
         )
 
-        df_combined.loc[df_combined["psa"] != 0, f"SEI_{suffix}"] = (
-            df_combined[f"pchembl_value_mean_{suffix}"] * 100 / df_combined["psa"]
+        dataset.df_result.loc[dataset.df_result["psa"] != 0, f"SEI_{suffix}"] = (
+            dataset.df_result[f"pchembl_value_mean_{suffix}"]
+            * 100
+            / dataset.df_result["psa"]
         )
 
-        df_combined[f"LLE_{suffix}"] = (
-            df_combined[f"pchembl_value_mean_{suffix}"] - df_combined["alogp"]
+        dataset.df_result[f"LLE_{suffix}"] = (
+            dataset.df_result[f"pchembl_value_mean_{suffix}"]
+            - dataset.df_result["alogp"]
         )
 
-        df_combined = df_combined.astype(
+        dataset.df_result = dataset.df_result.astype(
             {
                 f"LE_{suffix}": "float64",
                 f"BEI_{suffix}": "float64",
@@ -143,26 +145,19 @@ def add_ligand_efficiency_metrics(df_combined: pd.DataFrame) -> pd.DataFrame:
             }
         )
 
-    return df_combined
-
 
-def add_atc_classification(
-    df_combined: pd.DataFrame, chembl_con: sqlite3.Connection
-) -> tuple[pd.DataFrame, pd.DataFrame]:
+def get_atc_classification(chembl_con: sqlite3.Connection) -> pd.DataFrame:
     """
-    Query and add ATC classifications (level 1) from the atc_classification and 
+    Query ATC classifications (level 1) from the atc_classification and
     molecule_atc_classification tables.
-    ATC level annotations for the same parent_molregno are combined into one description 
-    that concatenates all descriptions sorted alphabetically 
+    ATC level annotations for the same parent_molregno are combined into one description
+    that concatenates all descriptions sorted alphabetically
     into one string with ' | ' as a separator.
 
-    :param df_combined: Pandas DataFrame with compound-target pairs
-    :type df_combined: pd.DataFrame
     :param chembl_con: Sqlite3 connection to ChEMBL database.
     :type chembl_con: sqlite3.Connection
-    :return: - Pandas DataFrame with added ATC classifications \\
-        - Pandas DataFrame with ATC annotations in ChEMBL
-    :rtype: (pd.DataFrame, pd.DataFrame)
+    :return: Pandas DataFrame with ATC annotations in ChEMBL.
+    :rtype: pd.DataFrame
     """
     sql = """
     SELECT DISTINCT mh.parent_molregno, atc.level1, atc.level1_description
@@ -185,14 +180,12 @@ def add_atc_classification(
     ].transform(lambda x: between_str_join.join(sorted(x)))
     atc_levels = atc_levels[["parent_molregno", "atc_level1"]].drop_duplicates()
 
-    df_combined = df_combined.merge(atc_levels, on="parent_molregno", how="left")
-
-    return df_combined, atc_levels
+    return atc_levels
 
 
 def add_all_chembl_compound_properties(
-    df_combined: pd.DataFrame, chembl_con: sqlite3.Connection, limit_to_literature: bool
-) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+    dataset: Dataset, chembl_con: sqlite3.Connection, limit_to_literature: bool
+):
     """
     Add ChEMBL-based compound properties to the given compound-target pairs, specifically:
 
@@ -202,24 +195,33 @@ def add_all_chembl_compound_properties(
     - ligand efficiency metrics
     - ATC classifications
 
-    :param df_combined: Pandas DataFrame with compound-target pairs
-    :type df_combined: pd.DataFrame
+    :param dataset: Dataset with compound-target pairs.
+        Will be updated to include compound properties.
+    :type dataset: Dataset
     :param chembl_con: Sqlite3 connection to ChEMBL database.
     :type chembl_con: sqlite3.Connection
-    :param limit_to_literature: Base first_publication_cpd on literature sources only if True. 
+    :param limit_to_literature: Base first_publication_cpd on literature sources only if True.
         Base it on all available sources otherwise.
     :type limit_to_literature: bool
-    :return: - Pandas DataFrame with added compound properties \\
-        - Pandas DataFrame with compound properties and structures for all compound ids in ChEMBL \\
-        - Pandas DataFrame with ATC annotations in ChEMBL
-    :rtype: (pd.DataFrame, pd.DataFrame, pd.DataFrame)
     """
-    df_combined = add_first_publication_date(
-        df_combined, chembl_con, limit_to_literature
+    df_docs = get_first_publication_cpd_date(chembl_con, limit_to_literature)
+    dataset.df_result = dataset.df_result.merge(
+        df_docs, on="parent_molregno", how="left"
     )
-    df_combined, df_cpd_props = add_chembl_properties_and_structures(
-        df_combined, chembl_con
+
+    df_cpd_props = get_chembl_properties_and_structures(chembl_con)
+    dataset.df_cpd_props = df_cpd_props
+    dataset.df_result = dataset.df_result.merge(
+        df_cpd_props, on="parent_molregno", how="left"
+    )
+    sanity_checks.check_compound_props(dataset.df_result, df_cpd_props)
+
+    calculate_ligand_efficiency_metrics(dataset)
+    sanity_checks.check_ligand_efficiency_metrics(dataset.df_result)
+
+    atc_levels = get_atc_classification(chembl_con)
+    dataset.atc_levels = atc_levels
+    dataset.df_result = dataset.df_result.merge(
+        atc_levels, on="parent_molregno", how="left"
     )
-    df_combined = add_ligand_efficiency_metrics(df_combined)
-    df_combined, atc_levels = add_atc_classification(df_combined, chembl_con)
-    return df_combined, df_cpd_props, atc_levels
+    sanity_checks.check_atc(dataset.df_result, atc_levels)
diff --git a/src/add_chembl_target_class_annotations.py b/src/add_chembl_target_class_annotations.py
index 25c1d01..bb8d080 100644
--- a/src/add_chembl_target_class_annotations.py
+++ b/src/add_chembl_target_class_annotations.py
@@ -1,10 +1,17 @@
+"""
+Add target class annotations based on ChEMBL data to the dataset.
+"""
+
 import logging
 import os
 import sqlite3
 
 import pandas as pd
 
-import write_subsets
+from arguments import OutputArgs, CalculationArgs
+from dataset import Dataset
+import output
+import sanity_checks
 
 
 ########### Add Target Class Annotations Based on ChEMBL Data ###########
@@ -78,56 +85,31 @@ def get_target_class_table(
     return df_target_classes
 
 
-def add_chembl_target_class_annotations(
-    df_combined: pd.DataFrame,
+def get_aggregated_target_classes(
+    dataset: Dataset,
     chembl_con: sqlite3.Connection,
-    output_path: str,
-    write_to_csv: bool,
-    write_to_excel: bool,
-    delimiter: str,
-    chembl_version: str,
-    limited_flag: str,
-) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """
-    Add level 1 and 2 target class annotations. 
-    Assignments for target IDs with more than one target class assignment per level 
-    are summarised into one string with '|' as a separator 
-    between the different target class annotations.
+    Get mappings for target id to aggregated level 1 / level 2 target class.
 
-    Targets with more than one level 1 / level 2 target class assignment are written to a file.
-    These could be reassigned by hand if a single target class is preferable.
-
-    :param df_combined: Pandas DataFrame with compound-target pairs
-    :type df_combined: pd.DataFrame
+    :param dataset: Dataset with compound-target pairs.
+    :type dataset: Dataset
     :param chembl_con: Sqlite3 connection to ChEMBL database.
     :type chembl_con: sqlite3.Connection
-    :param output_path: Path to write the targets with more than one target class assignment to
-    :type output_path: str
-    :param write_to_csv: True if output should be written to csv
-    :type write_to_csv: bool
-    :param write_to_excel: True if output should be written to excel
-    :type write_to_excel: bool
-    :param delimiter: Delimiter in csv-output
-    :type delimiter: str
-    :param chembl_version: Version of ChEMBL for output files
-    :type chembl_version: str
-    :param limited_flag: Document suffix indicating 
-        whether the dataset was limited to literature sources
-    :type limited_flag: str
-    :return: - Pandas DataFrame with added target class annotations \\
-        - Pandas DataFrame with mapping from target id to level 1 target class \\
-        - Pandas DataFrame with mapping from target id to level 2 target class
-    :rtype: (pd.DataFrame, pd.DataFrame, pd.DataFrame)
+    :return: [pandas DataFrame with mapping from target id to level 1 target class,
+        pandas DataFrame with mapping from target id to level 2 target class]
+    :rtype: tuple[pd.DataFrame, pd.DataFrame]
     """
-    current_tids = set(df_combined["tid"])
+    current_tids = set(dataset.df_result["tid"])
     df_target_classes = get_target_class_table(chembl_con, current_tids)
 
+    between_str_join = "|"
+
     # Summarise the information for a target id with
     # several assigned target classes of level 1 into one description.
     # If a target id has more than one assigned target class,
     # the target class 'Unclassified protein' is discarded.
     level = "l1"
-    between_str_join = "|"
     target_classes_level1 = df_target_classes[["tid", level]].drop_duplicates().dropna()
 
     # remove 'Unclassified protein' from targets with more than one target class, level 1
@@ -155,8 +137,6 @@ def add_chembl_target_class_annotations(
         ["tid", "target_class_l1"]
     ].drop_duplicates()
 
-    df_combined = df_combined.merge(target_classes_level1, on="tid", how="left")
-
     # Repeat the summary step for target classes of level 2.
     level = "l2"
     target_classes_level2 = df_target_classes[["tid", level]].drop_duplicates().dropna()
@@ -167,12 +147,27 @@ def add_chembl_target_class_annotations(
         ["tid", "target_class_l2"]
     ].drop_duplicates()
 
-    df_combined = df_combined.merge(target_classes_level2, on="tid", how="left")
+    return target_classes_level1, target_classes_level2
+
 
-    # Output targets have more than one target class assignment
-    more_than_one_level_1 = df_combined[
-        (df_combined["target_class_l1"].notnull())
-        & (df_combined["target_class_l1"].str.contains("|", regex=False))
+def output_ambiguous_target_classes(
+    dataset: Dataset,
+    args: CalculationArgs,
+    out: OutputArgs,
+):
+    """
+    Output targets have more than one target class assignment
+
+    :param dataset: Dataset with compound-target pairs.
+    :type dataset: Dataset
+    :param args: Arguments related to how to calculate the dataset
+    :type args: CalculationArgs
+    :param out: Arguments related to how to output the dataset
+    :type out: OutputArgs
+    """
+    more_than_one_level_1 = dataset.df_result[
+        (dataset.df_result["target_class_l1"].notnull())
+        & (dataset.df_result["target_class_l1"].str.contains("|", regex=False))
     ][
         ["tid", "target_pref_name", "target_type", "target_class_l1", "target_class_l2"]
     ].drop_duplicates()
@@ -180,9 +175,9 @@ def add_chembl_target_class_annotations(
         "Targets with more than one level 1 target class assignment: %s",
         len(more_than_one_level_1),
     )
-    more_than_one_level_2 = df_combined[
-        (df_combined["target_class_l2"].notnull())
-        & (df_combined["target_class_l2"].str.contains("|", regex=False))
+    more_than_one_level_2 = dataset.df_result[
+        (dataset.df_result["target_class_l2"].notnull())
+        & (dataset.df_result["target_class_l2"].str.contains("|", regex=False))
     ][
         ["tid", "target_pref_name", "target_type", "target_class_l1", "target_class_l2"]
     ].drop_duplicates()
@@ -199,15 +194,60 @@ def add_chembl_target_class_annotations(
     )
 
     name_more_than_one_tclass = os.path.join(
-        output_path,
-        f"ChEMBL{chembl_version}_CTI_{limited_flag}_targets_w_more_than_one_tclass",
+        out.output_path,
+        f"ChEMBL{args.chembl_version}_"
+        f"CTI_{args.limited_flag}_targets_w_more_than_one_tclass",
     )
-    write_subsets.write_output(
+    output.write_output(
         more_than_one_tclass,
         name_more_than_one_tclass,
-        write_to_csv,
-        write_to_excel,
-        delimiter,
+        out,
+    )
+
+
+def add_chembl_target_class_annotations(
+    dataset: Dataset,
+    chembl_con: sqlite3.Connection,
+    args: CalculationArgs,
+    out: OutputArgs,
+):
+    """
+    Add level 1 and 2 target class annotations.
+    Assignments for target IDs with more than one target class assignment per level
+    are summarised into one string with '|' as a separator
+    between the different target class annotations.
+
+    Targets with more than one level 1 / level 2 target class assignment are written to a file.
+    These could be reassigned by hand if a single target class is preferable.
+
+    :param dataset: Dataset with compound-target pairs.
+        Will be updated to only include target class annotations.
+        dataset.target_classes_level1 will be set to
+            pandas DataFrame with mapping from target id to level 1 target class
+        dataset.target_classes_level2 will be set to
+            pandas DataFrame with mapping from target id to level 2 target class
+    :type dataset: Dataset
+    :param chembl_con: Sqlite3 connection to ChEMBL database.
+    :type chembl_con: sqlite3.Connection
+    :param args: Arguments related to how to calculate the dataset
+    :type args: CalculationArgs
+    :param out: Arguments related to how to output the dataset
+    :type out: OutputArgs
+    """
+    target_classes_level1, target_classes_level2 = get_aggregated_target_classes(
+        dataset, chembl_con
+    )
+
+    dataset.df_result = dataset.df_result.merge(
+        target_classes_level1, on="tid", how="left"
+    )
+
+    dataset.df_result = dataset.df_result.merge(
+        target_classes_level2, on="tid", how="left"
+    )
+
+    sanity_checks.check_target_classes(
+        dataset.df_result, target_classes_level1, target_classes_level2
     )
 
-    return df_combined, target_classes_level1, target_classes_level2
+    output_ambiguous_target_classes(dataset, args, out)
diff --git a/src/add_dti_annotations.py b/src/add_dti_annotations.py
index 9f181c5..94367e9 100644
--- a/src/add_dti_annotations.py
+++ b/src/add_dti_annotations.py
@@ -1,12 +1,14 @@
-import pandas as pd
+"""
+Add DTI (Drug-Target Interaction) Annotations to the dataset.
+"""
 
+from dataset import Dataset
 
-########### CTI (Compound-Target Interaction) Annotations ###########
+
+########### DTI (Drug-Target Interaction) Annotations ###########
 def add_dti_annotations(
-    df_combined: pd.DataFrame,
-    drug_mechanism_pairs_set: set,
-    drug_mechanism_targets_set: set,
-) -> pd.DataFrame:
+    dataset: Dataset,
+):
     """
     Every compound-target pair is assigned a DTI (drug target interaction) annotation.  
 
@@ -60,84 +62,95 @@ def add_dti_annotations(
     and for which the target was also not in the drug_mechanisms table 
     (not a comparator compound), are discarded.
 
-    :param df_combined: Pandas DataFrame with compound-target pairs 
-        based on activities AND drug_mechanism table
-    :type df_combined: pd.DataFrame
-    :param drug_mechanism_pairs_set: set of compound-target pairs in the drug_mechanism table
-    :type drug_mechanism_pairs_set: set
-    :param drug_mechanism_targets_set: set of targets in the drug_mechanism table
-    :type drug_mechanism_targets_set: set
-    :return: Pandas DataFrame with all compound-target pairs and their DTI annotations.
-    :rtype: pd.DataFrame
+    :param dataset: Dataset with all relevant information:
+        - Pandas DataFrame with compound-target pairs
+            based on activities AND drug_mechanism table
+        - set of compound-target pairs in the drug_mechanism table
+        - set of targets in the drug_mechanism table
+    :type dataset: Dataset
     """
     # Add a new column *therapeutic_target* which is set to True
     # if the target is in the drug_mechanism table
-    df_combined["therapeutic_target"] = df_combined["tid"].isin(
-        drug_mechanism_targets_set
+    dataset.df_result["therapeutic_target"] = dataset.df_result["tid"].isin(
+        dataset.drug_mechanism_targets_set
     )
 
     # Assign the annotations based on the table.
     # Compound-target pairs from the drug mechanism table
-    df_combined.loc[
+    dataset.df_result.loc[
         (
-            df_combined["cpd_target_pair"].isin(drug_mechanism_pairs_set)
-            & (df_combined["max_phase"] == 4)
+            dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set)
+            & (dataset.df_result["max_phase"] == 4)
         ),
         "DTI",
     ] = "D_DT"
-    df_combined.loc[
+
+    dataset.df_result.loc[
         (
-            df_combined["cpd_target_pair"].isin(drug_mechanism_pairs_set)
-            & (df_combined["max_phase"] == 3)
+            dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set)
+            & (dataset.df_result["max_phase"] == 3)
         ),
         "DTI",
     ] = "C3_DT"
-    df_combined.loc[
+
+    dataset.df_result.loc[
         (
-            df_combined["cpd_target_pair"].isin(drug_mechanism_pairs_set)
-            & (df_combined["max_phase"] == 2)
+            dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set)
+            & (dataset.df_result["max_phase"] == 2)
         ),
         "DTI",
     ] = "C2_DT"
-    df_combined.loc[
+
+    dataset.df_result.loc[
         (
-            df_combined["cpd_target_pair"].isin(drug_mechanism_pairs_set)
-            & (df_combined["max_phase"] == 1)
+            dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set)
+            & (dataset.df_result["max_phase"] == 1)
         ),
         "DTI",
     ] = "C1_DT"
+
     # Compounds that are in the drug_mechanism table but don't have a known phase between 1-4:
-    df_combined.loc[
+    dataset.df_result.loc[
         (
-            df_combined["cpd_target_pair"].isin(drug_mechanism_pairs_set)
-            & (~df_combined["max_phase"].isin([1, 2, 3, 4]))
+            dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set)
+            & (~dataset.df_result["max_phase"].isin([1, 2, 3, 4]))
         ),
         "DTI",
     ] = "C0_DT"
 
     # Target is in the drug mechanism table
-    df_combined.loc[
+    dataset.df_result.loc[
         (
-            (~df_combined["cpd_target_pair"].isin(drug_mechanism_pairs_set))
-            & (df_combined["therapeutic_target"] == True)
+            (
+                ~dataset.df_result["cpd_target_pair"].isin(
+                    dataset.drug_mechanism_pairs_set
+                )
+            )
+            & (dataset.df_result["therapeutic_target"])
         ),
         "DTI",
     ] = "DT"
 
     # Other compound-target pairs
     # if target is not a therapeutic target, 'cpd_target_pair' cannot be in DTIs_set
-    # (~df_combined['cpd_target_pair'].isin(DTIs_set)) is included for clarity
-    df_combined.loc[
+    # (~dataset.df_result['cpd_target_pair'].isin(DTIs_set)) is included for clarity
+    dataset.df_result.loc[
         (
-            (~df_combined["cpd_target_pair"].isin(drug_mechanism_pairs_set))
-            & (df_combined["therapeutic_target"] == False)
+            (
+                ~dataset.df_result["cpd_target_pair"].isin(
+                    dataset.drug_mechanism_pairs_set
+                )
+            )
+            & ~(dataset.df_result["therapeutic_target"])
         ),
         "DTI",
     ] = "NDT"
 
     # Discard NDT rows
-    df_combined = df_combined[
-        (df_combined["DTI"].isin(["D_DT", "C3_DT", "C2_DT", "C1_DT", "C0_DT", "DT"]))
+    dataset.df_result = dataset.df_result[
+        (
+            dataset.df_result["DTI"].isin(
+                ["D_DT", "C3_DT", "C2_DT", "C1_DT", "C0_DT", "DT"]
+            )
+        )
     ]
-
-    return df_combined
diff --git a/src/add_filtering_columns.py b/src/add_filtering_columns.py
new file mode 100644
index 0000000..27d4076
--- /dev/null
+++ b/src/add_filtering_columns.py
@@ -0,0 +1,214 @@
+"""
+Add filtering columns for obtaining the different subsets to the dataset.
+"""
+
+import logging
+import os
+
+import pandas as pd
+
+from arguments import CalculationArgs, OutputArgs
+from dataset import Dataset
+import get_stats
+import output
+
+
+def get_data_subsets(data: pd.DataFrame, min_nof_cpds: int, desc: str) -> tuple[
+    tuple[pd.DataFrame, str],
+    tuple[pd.DataFrame, str],
+    tuple[pd.DataFrame, str],
+    tuple[pd.DataFrame, str],
+]:
+    """
+    Calculate and return the different subsets of interest.
+
+    :param data: Pandas DataFrame with compound-target pairs
+    :type data: pd.DataFrame
+    :param min_nof_cpds: Miminum number of compounds per target
+    :type min_nof_cpds: int
+    :param desc: Types of assays current_df contains information about.
+        Options: "BF" (binding+functional), "B" (binding)
+    :type desc: str
+    :return: List of dataset subsets and the string describing them
+        - data: Pandas DataFrame with compound-target pairs
+            without filtering columns and without
+            the annotations for the opposite desc,
+            e.g. if desc = "BF", the average pchembl value based on
+            binding data only is dropped
+        - df_enough_cpds: Pandas DataFrame with targets
+            with at least <min_nof_cpds> compounds with a pchembl value,
+        - df_c_dt_d_dt: As df_enough_cpds but with
+            at least one compound-target pair labelled as
+            'D_DT', 'C3_DT', 'C2_DT', 'C1_DT' or 'C0_DT' (i.e., known interaction),
+        - df_d_dt: As df_enough_cpds but with
+            at least one compound-target pair labelled as
+            'D_DT' (i.e., known drug-target interaction)
+    :rtype: tuple[tuple[pd.DataFrame, str],
+           tuple[pd.DataFrame, str],
+           tuple[pd.DataFrame, str],
+           tuple[pd.DataFrame, str]]
+    """
+    if desc == "B":
+        drop_desc = "BF"
+    else:
+        drop_desc = "B"
+    data = data.drop(
+        columns=[
+            f"pchembl_value_mean_{drop_desc}",
+            f"pchembl_value_max_{drop_desc}",
+            f"pchembl_value_median_{drop_desc}",
+            f"first_publication_cpd_target_pair_{drop_desc}",
+            f"first_publication_cpd_target_pair_w_pchembl_{drop_desc}",
+            f"LE_{drop_desc}",
+            f"BEI_{drop_desc}",
+            f"SEI_{drop_desc}",
+            f"LLE_{drop_desc}",
+        ]
+        + [  # exclude columns related to the other assay types
+            col for col in data.columns if col.startswith("B_") or col.startswith("BF_")
+        ]  # exclude filtering columns
+    ).drop_duplicates()
+
+    # Restrict the dataset to targets with at least *min_nof_cpds* compounds with a pchembl value.
+    comparator_counts = (
+        data[data[f"pchembl_value_mean_{desc}"].notnull()]
+        .groupby(["tid_mutation"])["parent_molregno"]
+        .count()
+    )
+    # pylint: disable-next=unused-variable
+    targets_w_enough_cpds = comparator_counts[
+        comparator_counts >= min_nof_cpds
+    ].index.tolist()
+    df_enough_cpds = data.query("tid_mutation in @targets_w_enough_cpds")
+
+    # Restrict the dataset further to targets
+    # with at least one compound-target pair labelled as
+    # 'D_DT', 'C3_DT', 'C2_DT', 'C1_DT' or 'C0_DT',
+    # i.e., compound-target pairs with a known interactions.
+    # pylint: disable-next=unused-variable
+    c_dt_d_dt_targets = set(
+        df_enough_cpds[
+            df_enough_cpds["DTI"].isin(["D_DT", "C3_DT", "C2_DT", "C1_DT", "C0_DT"])
+        ].tid_mutation.to_list()
+    )
+    df_c_dt_d_dt = df_enough_cpds.query("tid_mutation in @c_dt_d_dt_targets")
+
+    # Restrict the dataset further to targets with
+    # at least one compound-target pair labelled as 'D_DT',
+    # i.e., known drug-target interactions.
+    # pylint: disable-next=unused-variable
+    d_dt_targets = set(
+        df_enough_cpds[df_enough_cpds["DTI"] == "D_DT"].tid_mutation.to_list()
+    )
+    df_d_dt = df_enough_cpds.query("tid_mutation in @d_dt_targets")
+
+    return [
+        [data, f"{desc}"],
+        [df_enough_cpds, f"{desc}_{min_nof_cpds}"],
+        [df_c_dt_d_dt, f"{desc}_{min_nof_cpds}_c_dt_d_dt"],
+        [df_d_dt, f"{desc}_{min_nof_cpds}_d_dt"],
+    ]
+
+
+def add_subset_filtering_columns(
+    df_combined_subset: pd.DataFrame,
+    dataset: Dataset,
+    desc: str,
+    args: CalculationArgs,
+    out: OutputArgs,
+):
+    """
+    Add filtering column for binding + functional vs binding
+
+    :param df_combined_subset: Subset with binding+functional (BF) or binding (B) assay-based data
+        in df_combined
+    :type df_combined_subset: pd.DataFrame
+    :param dataset: Dataset with compound-target pairs.
+        Will be updated to only include filtering columns.
+    :type dataset: Dataset
+    :param desc: Assay description,
+        either "BF" (binding+functional) or "B" (binding)
+    :type desc: str
+    :param args: Arguments related to how to calculate the dataset
+    :type args: CalculationArgs
+    :param out: Arguments related to how to output the dataset
+    :type out: OutputArgs
+    """
+    subsets = get_data_subsets(
+        df_combined_subset,
+        args.min_nof_cpds_bf if desc == "BF" else args.min_nof_cpds_b,
+        desc,
+    )
+
+    # write subsets if required
+    if (desc == "BF" and out.write_bf) or (desc == "B" and out.write_b):
+        for [df_subset, subset_desc] in subsets:
+            name_subset = os.path.join(
+                out.output_path,
+                f"ChEMBL{args.chembl_version}_"
+                f"CTI_{args.limited_flag}_"
+                f"{subset_desc}",
+            )
+            output.write_and_check_output(
+                df_subset,
+                name_subset,
+                desc,
+                args,
+                out,
+            )
+
+    # add filtering columns to df_combined
+    # do not add a filtering column for BF / B (-> [1:])
+    for [df, col_name] in subsets[1:]:
+        dataset.df_result[col_name] = False
+        dataset.df_result.loc[(dataset.df_result.index.isin(df.index)), col_name] = True
+        # check that filtering works
+        assert dataset.df_result[dataset.df_result[col_name]][df.columns].equals(
+            df
+        ), f"Filtering is not accurate for {col_name}."
+
+    if logging.DEBUG >= logging.root.level:
+        for [df_subset, subset_desc] in subsets:
+            get_stats.add_debugging_info(dataset, df_subset, subset_desc)
+
+
+def add_filtering_columns(
+    dataset: Dataset,
+    args: CalculationArgs,
+    out: OutputArgs,
+):
+    """
+    Add filtering columns to main dataset and save subsets if required.
+
+    :param dataset: Dataset with compound-target pairs.
+        Will be updated to only include filtering columns.
+    :type dataset: Dataset
+    :param args: Arguments related to how to calculate the dataset
+    :type args: CalculationArgs
+    :param out: Arguments related to how to output the dataset
+    :type out: OutputArgs
+    """
+    # consider binding and functional assays
+    # assay description = binding+functional
+    desc = "BF"
+    # df_combined without binding only data
+    df_combined_subset = dataset.df_result.copy()
+    add_subset_filtering_columns(
+        df_combined_subset,
+        dataset,
+        desc,
+        args,
+        out,
+    )
+
+    # consider only binding assays
+    # assay description = binding
+    desc = "B"
+    df_combined_subset = dataset.df_result[dataset.df_result["keep_for_binding"]].copy()
+    add_subset_filtering_columns(
+        df_combined_subset,
+        dataset,
+        desc,
+        args,
+        out,
+    )
diff --git a/src/add_rdkit_compound_descriptors.py b/src/add_rdkit_compound_descriptors.py
index 5c7d4e8..889ff70 100644
--- a/src/add_rdkit_compound_descriptors.py
+++ b/src/add_rdkit_compound_descriptors.py
@@ -1,72 +1,79 @@
-import pandas as pd
+"""
+Add RDKit-based compound properties to the dataset.
+"""
+
 from rdkit import Chem
 from rdkit.Chem import Descriptors
 from rdkit.Chem import PandasTools
 from tqdm import tqdm
 
+from dataset import Dataset
+import sanity_checks
+
 
-def add_built_in_descriptors(df_combined: pd.DataFrame) -> pd.DataFrame:
+def add_built_in_descriptors(dataset: Dataset):
     """
     Add RDKit built-in compound descriptors.
 
+    :param dataset: Dataset with compound-target pairs.
+        Will be updated to only include built-in RDKit compound descriptors.
+    :type dataset: Dataset
     :param df_combined: Pandas DataFrame with compound-target pairs
     :type df_combined: pd.DataFrame
-    :return: Pandas DataFrame with added built-in RDKit compound descriptors
-    :rtype: pd.DataFrame
     """
     # add a column with RDKit molecules, used to calculate the descriptors
     PandasTools.AddMoleculeColumnToFrame(
-        df_combined, "canonical_smiles", "mol", includeFingerprints=False
+        dataset.df_result, "canonical_smiles", "mol", includeFingerprints=False
     )
 
-    df_combined.loc[:, "fraction_csp3"] = df_combined["mol"].apply(
+    dataset.df_result.loc[:, "fraction_csp3"] = dataset.df_result["mol"].apply(
         Descriptors.FractionCSP3
     )
-    df_combined.loc[:, "ring_count"] = df_combined["mol"].apply(Descriptors.RingCount)
-    df_combined.loc[:, "num_aliphatic_rings"] = df_combined["mol"].apply(
-        Descriptors.NumAliphaticRings
+    dataset.df_result.loc[:, "ring_count"] = dataset.df_result["mol"].apply(
+        Descriptors.RingCount
     )
-    df_combined.loc[:, "num_aliphatic_carbocycles"] = df_combined["mol"].apply(
-        Descriptors.NumAliphaticCarbocycles
-    )
-    df_combined.loc[:, "num_aliphatic_heterocycles"] = df_combined["mol"].apply(
-        Descriptors.NumAliphaticHeterocycles
+    dataset.df_result.loc[:, "num_aliphatic_rings"] = dataset.df_result["mol"].apply(
+        Descriptors.NumAliphaticRings
     )
-    df_combined.loc[:, "num_aromatic_rings"] = df_combined["mol"].apply(
+    dataset.df_result.loc[:, "num_aliphatic_carbocycles"] = dataset.df_result[
+        "mol"
+    ].apply(Descriptors.NumAliphaticCarbocycles)
+    dataset.df_result.loc[:, "num_aliphatic_heterocycles"] = dataset.df_result[
+        "mol"
+    ].apply(Descriptors.NumAliphaticHeterocycles)
+    dataset.df_result.loc[:, "num_aromatic_rings"] = dataset.df_result["mol"].apply(
         Descriptors.NumAromaticRings
     )
-    df_combined.loc[:, "num_aromatic_carbocycles"] = df_combined["mol"].apply(
-        Descriptors.NumAromaticCarbocycles
-    )
-    df_combined.loc[:, "num_aromatic_heterocycles"] = df_combined["mol"].apply(
-        Descriptors.NumAromaticHeterocycles
-    )
-    df_combined.loc[:, "num_saturated_rings"] = df_combined["mol"].apply(
+    dataset.df_result.loc[:, "num_aromatic_carbocycles"] = dataset.df_result[
+        "mol"
+    ].apply(Descriptors.NumAromaticCarbocycles)
+    dataset.df_result.loc[:, "num_aromatic_heterocycles"] = dataset.df_result[
+        "mol"
+    ].apply(Descriptors.NumAromaticHeterocycles)
+    dataset.df_result.loc[:, "num_saturated_rings"] = dataset.df_result["mol"].apply(
         Descriptors.NumSaturatedRings
     )
-    df_combined.loc[:, "num_saturated_carbocycles"] = df_combined["mol"].apply(
-        Descriptors.NumSaturatedCarbocycles
-    )
-    df_combined.loc[:, "num_saturated_heterocycles"] = df_combined["mol"].apply(
-        Descriptors.NumSaturatedHeterocycles
-    )
-    df_combined.loc[:, "num_stereocentres"] = df_combined["mol"].apply(
+    dataset.df_result.loc[:, "num_saturated_carbocycles"] = dataset.df_result[
+        "mol"
+    ].apply(Descriptors.NumSaturatedCarbocycles)
+    dataset.df_result.loc[:, "num_saturated_heterocycles"] = dataset.df_result[
+        "mol"
+    ].apply(Descriptors.NumSaturatedHeterocycles)
+    dataset.df_result.loc[:, "num_stereocentres"] = dataset.df_result["mol"].apply(
         Chem.rdMolDescriptors.CalcNumAtomStereoCenters
     )
-    df_combined.loc[:, "num_heteroatoms"] = df_combined["mol"].apply(
+    dataset.df_result.loc[:, "num_heteroatoms"] = dataset.df_result["mol"].apply(
         Descriptors.NumHeteroatoms
     )
 
     # add scaffolds
-    PandasTools.AddMurckoToFrame(df_combined, "mol", "scaffold_w_stereo")
+    PandasTools.AddMurckoToFrame(dataset.df_result, "mol", "scaffold_w_stereo")
     # remove stereo information of the molecule to add scaffolds without stereo information
-    df_combined["mol"].apply(Chem.RemoveStereochemistry)
-    PandasTools.AddMurckoToFrame(df_combined, "mol", "scaffold_wo_stereo")
+    dataset.df_result["mol"].apply(Chem.RemoveStereochemistry)
+    PandasTools.AddMurckoToFrame(dataset.df_result, "mol", "scaffold_wo_stereo")
 
     # drop the column with RDKit molecules
-    df_combined = df_combined.drop(["mol"], axis=1)
-
-    return df_combined
+    dataset.df_result = dataset.df_result.drop(["mol"], axis=1)
 
 
 def calculate_aromatic_atoms(
@@ -121,7 +128,7 @@ def calculate_aromatic_atoms(
     return aromatic_atoms_dict, aromatic_c_dict, aromatic_n_dict, aromatic_hetero_dict
 
 
-def add_aromaticity_descriptors(df_combined: pd.DataFrame) -> pd.DataFrame:
+def add_aromaticity_descriptors(dataset: Dataset):
     """
     Add number of aromatic atoms in a compounds, specifically:
 
@@ -130,40 +137,40 @@ def add_aromaticity_descriptors(df_combined: pd.DataFrame) -> pd.DataFrame:
     - # aromatic nitrogen atoms (aromatic_n)
     - # aromatic hetero atoms (aromatic_hetero)
 
-    :param df_combined: Pandas DataFrame with compound-target pairs
-    :type df_combined: pd.DataFrame
-    :return: Pandas DataFrame with added counts of aromatic atoms
-    :rtype: pd.DataFrame
+    :param dataset: Dataset with compound-target pairs.
+        Will be updated to only include counts of aromatic atoms
+    :type dataset: Dataset
     """
     # use df_combined_w_smiles to exclude null values
-    smiles_set = set(df_combined["canonical_smiles"])
+    smiles_set = set(dataset.df_result["canonical_smiles"])
     aromatic_atoms_dict, aromatic_c_dict, aromatic_n_dict, aromatic_hetero_dict = (
         calculate_aromatic_atoms(smiles_set)
     )
 
-    df_combined["aromatic_atoms"] = df_combined["canonical_smiles"].map(
+    dataset.df_result["aromatic_atoms"] = dataset.df_result["canonical_smiles"].map(
         aromatic_atoms_dict
     )
-    df_combined["aromatic_c"] = df_combined["canonical_smiles"].map(aromatic_c_dict)
-    df_combined["aromatic_n"] = df_combined["canonical_smiles"].map(aromatic_n_dict)
-    df_combined["aromatic_hetero"] = df_combined["canonical_smiles"].map(
+    dataset.df_result["aromatic_c"] = dataset.df_result["canonical_smiles"].map(
+        aromatic_c_dict
+    )
+    dataset.df_result["aromatic_n"] = dataset.df_result["canonical_smiles"].map(
+        aromatic_n_dict
+    )
+    dataset.df_result["aromatic_hetero"] = dataset.df_result["canonical_smiles"].map(
         aromatic_hetero_dict
     )
 
-    return df_combined
 
-
-def add_rdkit_compound_descriptors(df_combined: pd.DataFrame) -> pd.DataFrame:
+def add_rdkit_compound_descriptors(dataset: Dataset):
     """
     Add RDKit-based compound descriptors (built-in and numbers of aromatic atoms).
 
-    :param df_combined: Pandas DataFrame with compound-target pairs
-    :type df_combined: pd.DataFrame
-    :return: Pandas DataFrame with added built-in RDKit compound descriptors
-        and numbers of aromatic atoms
-    :rtype: pd.DataFrame
+    :param dataset: Dataset with compound-target pairs.
+        Will be updated to only include
+        built-in RDKit compound descriptors
+        and numbers of aromatic atoms.
+    :type dataset: Dataset
     """
-    df_combined = add_built_in_descriptors(df_combined)
-    df_combined = add_aromaticity_descriptors(df_combined)
-
-    return df_combined
+    add_built_in_descriptors(dataset)
+    add_aromaticity_descriptors(dataset)
+    sanity_checks.check_rdkit_props(dataset.df_result)
diff --git a/src/arguments.py b/src/arguments.py
new file mode 100644
index 0000000..ea02154
--- /dev/null
+++ b/src/arguments.py
@@ -0,0 +1,178 @@
+"""
+Dataclasses related to handling arguments, 
+specifically arguments related to how to calculate or output the dataset. 
+"""
+
+import argparse
+
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class CalculationArgs:
+    """
+    Collection of arguments related to how to calculate the dataset.
+
+    chembl_version:         Version of ChEMBL for output file names
+    calculate_rdkit:        True if RDKit-based compound properties should be calculated
+    limit_to_literature:    Include only literature sources if True
+    limited_flag:           String version of limit_to_literature used in file names
+    min_nof_cpds_bf:        Minimum number of compounds per target for the BF subset
+    min_nof_cpds_b:         Minimum number of compounds per target for the B subset
+    """
+
+    chembl_version: str
+    calculate_rdkit: bool
+    limit_to_literature: bool
+    limited_flag: str
+    min_nof_cpds_bf: int
+    min_nof_cpds_b: int
+
+
+@dataclass(frozen=True)
+class OutputArgs:
+    """
+    Collection of arguments related to how to output the dataset.
+
+    output_path:        Path to write output files to
+    delimiter:          Delimiter in csv-output
+    write_to_csv:       True if output should be written to csv
+    write_to_excel:     True if output should be written to excel
+    write_full_dataset: True if the full dataset should be written to output
+    write_bf:           True if subsets based on binding+functional data should be written to output
+    write_b:            True if subsets based on binding data only should be written to output
+    """
+
+    output_path: str
+    delimiter: str
+    write_to_csv: bool
+    write_to_excel: bool
+    write_full_dataset: bool
+    write_bf: bool
+    write_b: bool
+
+
+def parse_args() -> argparse.Namespace:
+    """
+    Get arguments with argparse.
+
+    :return: Populated argparse.Namespace
+    :rtype: argparse.Namespace
+    """
+    parser = argparse.ArgumentParser(
+        description="Extract the compound-target pairs dataset from ChEMBL. \
+            The full dataset plus filtering columns for binding vs. binding+functional data \
+            will always be written to csv. \
+            Additional outputs and output types can be chosen with the parameters below."
+    )
+
+    parser.add_argument(
+        "--chembl",
+        "-v",
+        dest="chembl_version",
+        metavar="<version>",
+        type=str,
+        default=None,
+        help="ChEMBL version. \
+            Latest version if None. \
+            Required if a path to a SQLite database is provided, \
+            i.e., if --sqlite is set. (default: None)",
+    )
+    parser.add_argument(
+        "--sqlite",
+        "-s",
+        metavar="<path>",
+        type=str,
+        default=None,
+        help="Path to SQLite database. \
+            ChEMBL is downloaded as an SQLite database \
+            and handled by chembl_downloader if None. (default: None)",
+    )
+    parser.add_argument(
+        "--output",
+        "-o",
+        dest="output_path",
+        metavar="<path>",
+        type=str,
+        required=True,
+        help="Path to write the output file(s) to. (required)",
+    )
+    parser.add_argument(
+        "--delimiter",
+        "-d",
+        metavar="<delimiter>",
+        type=str,
+        default=";",
+        help="Delimiter in output csv-files.  (default: ;)",
+    )
+    parser.add_argument(
+        "--all_sources",
+        action="store_true",
+        help="If this is set, the dataset is calculated based on all sources in ChEMBL. \
+            This includes data from BindingDB which may skew the results. \
+            Default (not set): the dataset is calculated based on only literature data.",
+    )
+    parser.add_argument(
+        "--rdkit",
+        dest="calculate_rdkit",
+        action="store_true",
+        help="Calculate RDKit-based compound properties.",
+    )
+    parser.add_argument(
+        "--excel",
+        dest="write_to_excel",
+        action="store_true",
+        help="Write the results to excel. Note: this may fail if the output is too large.",
+    )
+    parser.add_argument(
+        "--BF",
+        dest="write_bf",
+        action="store_true",
+        help="Write binding+functional data subsets.",
+    )
+    parser.add_argument(
+        "--B", dest="write_b", action="store_true", help="Write binding data subsets."
+    )
+    parser.add_argument(
+        "--debug", action="store_true", help="Log additional debugging information."
+    )
+    args = parser.parse_args()
+
+    return args
+
+
+def get_args() -> tuple[argparse.Namespace, CalculationArgs, OutputArgs]:
+    """
+    Get parsed and default arguments.
+
+    :return: parserd arguments,
+        arguments related to how to calculate the dataset as CalculationArgs,
+        arguments related to how to output the dataset as OutputArgs
+    :rtype: tuple[argparse.Namespace, CalculationArgs, OutputArgs]
+    """
+    args = parse_args()
+
+    calc_args = CalculationArgs(
+        chembl_version=args.chembl_version,
+        calculate_rdkit=args.calculate_rdkit,
+        limit_to_literature=not args.all_sources,
+        # used in file names
+        limited_flag="literature_only" if not args.all_sources else "all_sources",
+        min_nof_cpds_bf=100,
+        min_nof_cpds_b=100,
+    )
+
+    output_args = OutputArgs(
+        output_path=args.output_path,
+        delimiter=args.delimiter,
+        # Always write the results to csv.
+        write_to_csv=True,
+        write_to_excel=args.write_to_excel,
+        # Always write the full dataset plus filtering columns
+        # for binding vs. binding+functional data.
+        write_full_dataset=True,
+        write_bf=args.write_bf,
+        write_b=args.write_b,
+    )
+
+    return args, calc_args, output_args
diff --git a/src/clean_dataset.py b/src/clean_dataset.py
index 01a2564..56517dd 100644
--- a/src/clean_dataset.py
+++ b/src/clean_dataset.py
@@ -1,12 +1,19 @@
+"""
+Methods related to cleaning the dataset.
+"""
+
 import logging
 import sqlite3
 
 import pandas as pd
 
+from dataset import Dataset
+
 
+########### Remove Irrelevant Compounds ###########
 def remove_compounds_without_smiles_and_mixtures(
-    df_combined: pd.DataFrame, chembl_con: sqlite3.Connection
-) -> pd.DataFrame:
+    dataset: Dataset, chembl_con: sqlite3.Connection
+):
     """
     Remove
 
@@ -16,12 +23,12 @@ def remove_compounds_without_smiles_and_mixtures(
     Since compound information is aggregated for the parents of salts,
     the number of smiles with a dot is relatively low.
 
-    :param df_combined: Pandas DataFrame with compound-target pairs
-    :type df_combined: pd.DataFrame
+    :param dataset: Dataset with compound-target pairs.
+        Will be updated to only include
+        compound-target pairs with a smiles that does not contain a '.'
+    :type dataset: Dataset
     :param chembl_con: Sqlite3 connection to ChEMBL database.
     :type chembl_con: sqlite3.Connection
-    :return: Pandas DataFrame with compound-target pairs with a smiles that does not contain a '.'
-    :rtype: pd.DataFrame
     """
     # Double-check that rows with a SMILES containing a '.' are the parent structures,
     # i.e., there was no error in using salt information instead of parent information.
@@ -31,9 +38,9 @@ def remove_compounds_without_smiles_and_mixtures(
     """
     df_hierarchy = pd.read_sql_query(sql, con=chembl_con)
 
-    smiles_with_dot = df_combined[
-        df_combined["canonical_smiles"].notnull()
-        & df_combined["canonical_smiles"].str.contains(".", regex=False)
+    smiles_with_dot = dataset.df_result[
+        dataset.df_result["canonical_smiles"].notnull()
+        & dataset.df_result["canonical_smiles"].str.contains(".", regex=False)
     ][["canonical_smiles", "parent_molregno"]].drop_duplicates()
 
     for parent_molregno in set(smiles_with_dot["parent_molregno"]):
@@ -72,42 +79,45 @@ def remove_compounds_without_smiles_and_mixtures(
                 the smiles for the compound in ChEMBL ({parent_smiles_in_chembl})."
 
     # Remove rows that contain a SMILES with a dot or that don't have a SMILES.
-    len_missing_smiles = len(df_combined[df_combined["canonical_smiles"].isnull()])
+    len_missing_smiles = len(
+        dataset.df_result[dataset.df_result["canonical_smiles"].isnull()]
+    )
     len_smiles_w_dot = len(
-        df_combined[
-            df_combined["parent_molregno"].isin(set(smiles_with_dot["parent_molregno"]))
+        dataset.df_result[
+            dataset.df_result["parent_molregno"].isin(
+                set(smiles_with_dot["parent_molregno"])
+            )
         ]
     )
     logging.debug("#Compounds without a SMILES: %s", len_missing_smiles)
     logging.debug("#SMILES with a dot: %s", len_smiles_w_dot)
 
-    df_combined = df_combined[
-        (df_combined["canonical_smiles"].notnull())
+    dataset.df_result = dataset.df_result[
+        (dataset.df_result["canonical_smiles"].notnull())
         & ~(
-            df_combined["parent_molregno"].isin(set(smiles_with_dot["parent_molregno"]))
+            dataset.df_result["parent_molregno"].isin(
+                set(smiles_with_dot["parent_molregno"])
+            )
         )
     ]
 
-    return df_combined
-
 
-def clean_none_values(df_combined):
+########### General Cleaning Steps ###########
+def clean_none_values(dataset: Dataset):
     """
     Change nan values and empty strings to None for consistency.
     """
     # Change all None / nan values to None
-    df_combined = df_combined.where(pd.notnull(df_combined), None)
+    dataset.df_result = dataset.df_result.where(pd.notnull(dataset.df_result), None)
     # replace empty strings with None
-    df_combined = df_combined.replace("", None).reset_index(drop=True)
+    dataset.df_result = dataset.df_result.replace("", None).reset_index(drop=True)
 
-    return df_combined
 
-
-def set_types_to_int(df_combined, calculate_rdkit):
+def set_types_to_int(dataset, calculate_rdkit):
     """
     Set the type of relevant columns to Int64.
     """
-    df_combined = df_combined.astype(
+    dataset.df_result = dataset.df_result.astype(
         {
             "first_approval": "Int64",
             "usan_year": "Int64",
@@ -129,7 +139,7 @@ def set_types_to_int(df_combined, calculate_rdkit):
     )
 
     if calculate_rdkit:
-        df_combined = df_combined.astype(
+        dataset.df_result = dataset.df_result.astype(
             {
                 "num_aliphatic_carbocycles": "Int64",
                 "num_aliphatic_heterocycles": "Int64",
@@ -150,26 +160,26 @@ def set_types_to_int(df_combined, calculate_rdkit):
             }
         )
 
-    return df_combined
-
 
-def round_floats(df_combined, decimal_places=4):
+def round_floats(dataset, decimal_places=4):
     """
     Round float columns to <decimal_places> decimal places.
     This does not apply to max_phase.
     """
-    for _, (col, dtype) in enumerate(df_combined.dtypes.to_dict().items()):
+    for _, (col, dtype) in enumerate(dataset.df_result.dtypes.to_dict().items()):
         if (dtype in ("float64", "Float64")) and col != "max_phase":
-            df_combined[col] = df_combined[col].round(decimals=decimal_places)
+            dataset.df_result[col] = dataset.df_result[col].round(
+                decimals=decimal_places
+            )
 
-    return df_combined
+    return dataset.df_result
 
 
-def reorder_columns(df_combined, calculate_rdkit):
+def reorder_columns(dataset, calculate_rdkit):
     """
     Reorder the columns in the DataFrame.
     """
-    len_columns_before = len(df_combined.columns)
+    len_columns_before = len(dataset.df_result.columns)
 
     compound_target_pair_columns = [
         "parent_molregno",
@@ -283,7 +293,7 @@ def reorder_columns(df_combined, calculate_rdkit):
             + rdkit_columns
             + filtering_columns
         )
-        df_combined = df_combined[columns]
+        dataset.df_result = dataset.df_result[columns]
     else:
         columns = (
             compound_target_pair_columns
@@ -296,18 +306,16 @@ def reorder_columns(df_combined, calculate_rdkit):
             + chembl_target_annotations
             + filtering_columns
         )
-        df_combined = df_combined[columns]
+        dataset.df_result = dataset.df_result[columns]
 
-    len_columns_after = len(df_combined.columns)
+    len_columns_after = len(dataset.df_result.columns)
     assert (
         len_columns_before == len_columns_after
     ), f"Different number of columns after reordering \
         (before: {len_columns_before}, after: {len_columns_after})."
 
-    return df_combined
 
-
-def clean_dataset(df_combined: pd.DataFrame, calculate_rdkit: bool) -> pd.DataFrame:
+def clean_dataset(dataset: Dataset, calculate_rdkit: bool) -> pd.DataFrame:
     """
     Clean the dataset by
 
@@ -317,18 +325,16 @@ def clean_dataset(df_combined: pd.DataFrame, calculate_rdkit: bool) -> pd.DataFr
     - reordering columns
     - sorting rows by cpd_target_pair_mutation
 
-    :param df_combined: Pandas DataFrame with compound-target pairs
-    :type df_combined: pd.DataFrame
+    :param dataset: Dataset with compound-target pairs.
+        Will be updated to clean version with the updates described above.
+    :type dataset: Dataset
     :param calculate_rdkit: True if the DataFrame contains RDKit-based compound properties
     :type calculate_rdkit: bool
-    :return: Cleaned pandas DataFrame with compound-target pairs
-    :rtype: pd.DataFrame
     """
-    df_combined = clean_none_values(df_combined)
-    df_combined = set_types_to_int(df_combined, calculate_rdkit)
-    df_combined = round_floats(df_combined, decimal_places=4)
-    df_combined = reorder_columns(df_combined, calculate_rdkit)
-    df_combined = df_combined.sort_values(by=["cpd_target_pair_mutation"]).reset_index(
-        drop=True
-    )
-    return df_combined
+    clean_none_values(dataset)
+    set_types_to_int(dataset, calculate_rdkit)
+    round_floats(dataset, decimal_places=4)
+    reorder_columns(dataset, calculate_rdkit)
+    dataset.df_result = dataset.df_result.sort_values(
+        by=["cpd_target_pair_mutation"]
+    ).reset_index(drop=True)
diff --git a/src/dataset.py b/src/dataset.py
new file mode 100644
index 0000000..19fd259
--- /dev/null
+++ b/src/dataset.py
@@ -0,0 +1,30 @@
+"""
+Dataclass for handling the calculated compound-target pair dataset 
+and related data.
+"""
+
+from dataclasses import dataclass
+
+import pandas as pd
+
+
+@dataclass
+class Dataset:
+    """
+    df_result:                  Pandas DataFrame with the full dataset
+    drug_mechanism_pairs_set:   Set of compound-target pairs in the drug_mechanism table,
+                                used for DTI assignments
+    drug_mechanism_targets_set: Set of targets in the drug_mechanism table,
+                                used for DTI assigments
+    df_sizes_all:               Pandas DataFrame of intermediate sizes of the dataset,
+                                used for debugging
+    df_sizes_pchembl:           Pandas DataFrame of intermediate sizes of the dataset,
+                                restricted to entries with a pchembl value,
+                                used for debugging
+    """
+
+    df_result: pd.DataFrame
+    drug_mechanism_pairs_set: set
+    drug_mechanism_targets_set: set
+    df_sizes_all: pd.DataFrame
+    df_sizes_pchembl: pd.DataFrame
diff --git a/src/get_activity_ct_pairs.py b/src/get_activity_ct_pairs.py
index 7c825db..3f6a02c 100644
--- a/src/get_activity_ct_pairs.py
+++ b/src/get_activity_ct_pairs.py
@@ -1,17 +1,20 @@
-import logging
+"""
+Get initial set of compound-target pairs with an associated activity
+for the dataset.
+"""
+
 import sqlite3
 
 import numpy as np
 import pandas as pd
 
-import get_stats
+from dataset import Dataset
 
 
 ########### Get Initial Compound-Target Data From ChEMBL ###########
 def get_compound_target_pairs_with_pchembl(
     chembl_con: sqlite3.Connection,
     limit_to_literature: bool,
-    df_sizes: list[list[int], list[int]],
 ) -> pd.DataFrame:
     """
     Query ChEMBL activities and related assay for compound-target pairs
@@ -27,8 +30,6 @@ def get_compound_target_pairs_with_pchembl(
     :param limit_to_literature: Include only literature sources if True.
         Include all available sources otherwise.
     :type limit_to_literature: bool
-    :param df_sizes: List of intermediate sized of the dataset used for debugging.
-    :type df_sizes: list[list[int], list[int]]
     :return: Pandas DataFrame with compound-target pairs with a pchembl value.
     :rtype: pd.DataFrame
     """
@@ -84,9 +85,6 @@ def get_compound_target_pairs_with_pchembl(
         f"{a}_{b}" for a, b in zip(df_mols["parent_molregno"], df_mols["tid_mutation"])
     ]
 
-    if logging.DEBUG >= logging.root.level:
-        get_stats.add_dataset_sizes(df_mols, "initial query", df_sizes)
-
     return df_mols
 
 
@@ -170,10 +168,9 @@ def get_average_info(df: pd.DataFrame, suffix: str) -> pd.DataFrame:
 
 
 ########### Get Aggregated Compound-Target Pair Information ###########
-def get_aggregated_activity_ct_pairs(
+def get_aggregated_compound_target_pairs_with_pchembl(
     chembl_con: sqlite3.Connection,
     limit_to_literature: bool,
-    df_sizes: list[list[int], list[int]],
 ) -> pd.DataFrame:
     """
     Get dataset of compound target-pairs with an associated pchembl value
@@ -194,14 +191,13 @@ def get_aggregated_activity_ct_pairs(
     :param limit_to_literature: Include only literature sources if True.
         Include all available sources otherwise.
     :type limit_to_literature: bool
-    :param df_sizes: List of intermediate sized of the dataset used for debugging.
-    :type df_sizes: list[list[int], list[int]]
-    :return: Pandas Dataframe with compound-target pairs based on ChEMBL activity data
-        aggregated into one entry per compound-target pair.
+    :return: Pandas Dataframe with compound-target pairs
+        based on ChEMBL activity data aggregated into one entry per compound-target pair.
     :rtype: pd.DataFrame
     """
     df_mols = get_compound_target_pairs_with_pchembl(
-        chembl_con, limit_to_literature, df_sizes
+        chembl_con,
+        limit_to_literature,
     )
 
     # Summarise the information for binding and functional assays
@@ -232,3 +228,34 @@ def get_aggregated_activity_ct_pairs(
     )
 
     return df_combined
+
+
+def get_aggregated_activity_ct_pairs(
+    chembl_con: sqlite3.Connection,
+    limit_to_literature: bool,
+) -> Dataset:
+    """
+    Wrapper for get_aggregated_compound_target_pairs_with_pchembl,
+    initialising a dataset.
+
+    :param chembl_con: Sqlite3 connection to ChEMBL database.
+    :type chembl_con: sqlite3.Connection
+    :param limit_to_literature: Include only literature sources if True.
+        Include all available sources otherwise.
+    :type limit_to_literature: bool
+    :return: Dataset with a pandas Dataframe with compound-target pairs
+        based on ChEMBL activity data aggregated into one entry per compound-target pair.
+    :rtype: Dataset
+    """
+    df_result = get_aggregated_compound_target_pairs_with_pchembl(
+        chembl_con, limit_to_literature
+    )
+
+    dataset = Dataset(
+        df_result,
+        set(),
+        set(),
+        pd.DataFrame(),
+        pd.DataFrame(),
+    )
+    return dataset
diff --git a/src/get_dataset.py b/src/get_dataset.py
index 2fa6b91..faffb3b 100644
--- a/src/get_dataset.py
+++ b/src/get_dataset.py
@@ -1,224 +1,90 @@
+"""
+Main workflow to calculate the compound-target pairs dataset.
+"""
+
 import logging
-import os
 import sqlite3
 
+from arguments import OutputArgs, CalculationArgs
+import add_filtering_columns
 import get_activity_ct_pairs
-import get_drug_mechanism_ct_pairs
-import add_dti_annotations
 import add_chembl_compound_properties
-import clean_dataset
 import add_chembl_target_class_annotations
+import get_drug_mechanism_ct_pairs
+import add_dti_annotations
 import add_rdkit_compound_descriptors
-import sanity_checks
-import write_subsets
+import clean_dataset
 import get_stats
+import output
+import sanity_checks
 
 
 def get_ct_pair_dataset(
-    chembl_con: sqlite3.Connection,
-    chembl_version: str,
-    output_path: str,
-    limit_to_literature: bool,
-    calculate_rdkit: bool,
-    write_to_csv: bool,
-    write_to_excel: bool,
-    delimiter: str,
-    write_full_dataset: bool,
-    write_bf: bool,
-    write_b: bool,
+    chembl_con: sqlite3.Connection, args: CalculationArgs, out: OutputArgs
 ):
     """
     Calculate and output the compound-target pair dataset.
 
     :param chembl_con: Sqlite3 connection to ChEMBL database
     :type chembl_con: sqlite3.Connection
-    :param chembl_version: Version of ChEMBL for output file names
-    :type chembl_version: str
-    :param output_path: Path to write output files to
-    :type output_path: str
-    :param limit_to_literature: Include only literature sources if True.
-        Include all available sources otherwise.
-    :type limit_to_literature: bool
-    :param calculate_rdkit: True if RDKit-based compound properties should be calculated
-    :type calculate_rdkit: bool
-    :param write_to_csv: True if output should be written to csv
-    :type write_to_csv: bool
-    :param write_to_excel: True if output should be written to excel
-    :type write_to_excel: bool
-    :param delimiter: Delimiter in csv-output
-    :type delimiter: str
-    :param write_full_dataset: True if the full dataset should be written to output
-    :type write_full_dataset: bool
-    :param write_bf: True if subsets based on binding+functional data should be written to output
-    :type write_bf: bool
-    :param write_b: True if subsets based on binding data only should be written to output
-    :type write_b: bool
+    :param args: Arguments related to how to calculate the dataset
+    :type args: CalculationArgs
+    :param out: Arguments related to how to output the dataset
+    :type out: OutputArgs
     """
-    # list with sizes of full dataset and dataset subset with pchembl values for debugging
-    df_sizes = [[], []]
-
-    # used in file names
-    if limit_to_literature:
-        limited_flag = "literature_only"
-    else:
-        limited_flag = "all_sources"
-
     logging.info("get_aggregated_activity_ct_pairs")
-    df_combined = get_activity_ct_pairs.get_aggregated_activity_ct_pairs(
-        chembl_con, limit_to_literature, df_sizes
+    dataset = get_activity_ct_pairs.get_aggregated_activity_ct_pairs(
+        chembl_con, args.limit_to_literature
     )
-    if logging.DEBUG >= logging.root.level:
-        get_stats.add_dataset_sizes(df_combined, "activity ct-pairs", df_sizes)
+    get_stats.add_debugging_info(dataset, dataset.df_result, "activity ct-pairs")
 
     logging.info("add_cti_from_drug_mechanisms")
-    df_combined, drug_mechanism_pairs_set, drug_mechanism_targets_set = (
-        get_drug_mechanism_ct_pairs.add_drug_mechanism_ct_pairs(df_combined, chembl_con)
-    )
-    if logging.DEBUG >= logging.root.level:
-        get_stats.add_dataset_sizes(df_combined, "dm ct-pairs", df_sizes)
+    get_drug_mechanism_ct_pairs.add_drug_mechanism_ct_pairs(dataset, chembl_con)
+    get_stats.add_debugging_info(dataset, dataset.df_result, "dm ct-pairs")
 
     logging.info("add_cti_annotations")
-    df_combined = add_dti_annotations.add_dti_annotations(
-        df_combined, drug_mechanism_pairs_set, drug_mechanism_targets_set
-    )
-    if logging.DEBUG >= logging.root.level:
-        get_stats.add_dataset_sizes(df_combined, "DTI annotations", df_sizes)
+    add_dti_annotations.add_dti_annotations(dataset)
+    get_stats.add_debugging_info(dataset, dataset.df_result, "DTI annotations")
 
     logging.info("add_all_chembl_compound_properties")
-    df_combined, df_cpd_props, atc_levels = (
-        add_chembl_compound_properties.add_all_chembl_compound_properties(
-            df_combined, chembl_con, limit_to_literature
-        )
+    add_chembl_compound_properties.add_all_chembl_compound_properties(
+        dataset, chembl_con, args.limit_to_literature
     )
-    if logging.DEBUG >= logging.root.level:
-        get_stats.add_dataset_sizes(df_combined, "ChEMBL props", df_sizes)
+    get_stats.add_debugging_info(dataset, dataset.df_result, "ChEMBL props")
 
     logging.info("remove_compounds_without_smiles_and_mixtures")
-    df_combined = clean_dataset.remove_compounds_without_smiles_and_mixtures(
-        df_combined, chembl_con
-    )
-    if logging.DEBUG >= logging.root.level:
-        get_stats.add_dataset_sizes(df_combined, "removed smiles", df_sizes)
+    clean_dataset.remove_compounds_without_smiles_and_mixtures(dataset, chembl_con)
+    get_stats.add_debugging_info(dataset, dataset.df_result, "removed smiles")
 
     logging.info("add_chembl_target_class_annotations")
-    df_combined, target_classes_level1, target_classes_level2 = (
-        add_chembl_target_class_annotations.add_chembl_target_class_annotations(
-            df_combined,
-            chembl_con,
-            output_path,
-            write_to_csv,
-            write_to_excel,
-            delimiter,
-            chembl_version,
-            limited_flag,
-        )
+    add_chembl_target_class_annotations.add_chembl_target_class_annotations(
+        dataset,
+        chembl_con,
+        args,
+        out,
     )
-    if logging.DEBUG >= logging.root.level:
-        get_stats.add_dataset_sizes(df_combined, "tclass annotations", df_sizes)
+    get_stats.add_debugging_info(dataset, dataset.df_result, "tclass annotations")
 
-    logging.info("add_rdkit_compound_descriptors")
-    if calculate_rdkit:
-        df_combined = add_rdkit_compound_descriptors.add_rdkit_compound_descriptors(
-            df_combined
-        )
-        if logging.DEBUG >= logging.root.level:
-            get_stats.add_dataset_sizes(df_combined, "RDKit props", df_sizes)
+    if args.calculate_rdkit:
+        logging.info("add_rdkit_compound_descriptors")
+        add_rdkit_compound_descriptors.add_rdkit_compound_descriptors(dataset)
+        get_stats.add_debugging_info(dataset, dataset.df_result, "RDKit props")
 
     logging.info("clean_dataset")
-    df_combined = clean_dataset.clean_dataset(df_combined, calculate_rdkit)
-    if logging.DEBUG >= logging.root.level:
-        get_stats.add_dataset_sizes(df_combined, "clean df", df_sizes)
+    clean_dataset.clean_dataset(dataset, args.calculate_rdkit)
+    get_stats.add_debugging_info(dataset, dataset.df_result, "clean df")
 
     logging.info("sanity_checks")
-    sanity_checks.sanity_checks(
-        df_combined,
-        df_cpd_props,
-        atc_levels,
-        target_classes_level1,
-        target_classes_level2,
-        calculate_rdkit,
-    )
+    sanity_checks.sanity_checks(dataset)
 
-    logging.info("write_BF_to_file")
-    min_nof_cpds_bf = 100
-    df_combined_annotated = write_subsets.write_bf_to_file(
-        df_combined,
-        chembl_version,
-        min_nof_cpds_bf,
-        output_path,
-        write_bf,
-        write_to_csv,
-        write_to_excel,
-        delimiter,
-        limited_flag,
-        calculate_rdkit,
-        df_sizes,
-    )
-
-    logging.info("write_B_to_file")
-    min_nof_cpds_b = 100
-    df_combined_annotated = write_subsets.write_b_to_file(
-        df_combined,
-        df_combined_annotated,
-        chembl_version,
-        min_nof_cpds_b,
-        output_path,
-        write_b,
-        write_to_csv,
-        write_to_excel,
-        delimiter,
-        limited_flag,
-        calculate_rdkit,
-        df_sizes,
-    )
+    logging.info("add_filtering_columns")
+    add_filtering_columns.add_filtering_columns(dataset, args, out)
 
     logging.info("write_full_dataset_to_file")
-    write_subsets.write_full_dataset_to_file(
-        df_combined_annotated,
-        chembl_version,
-        output_path,
-        write_full_dataset,
-        write_to_csv,
-        write_to_excel,
-        delimiter,
-        limited_flag,
-        calculate_rdkit,
-    )
+    output.write_full_dataset_to_file(dataset, args, out)
 
     logging.info("output_stats")
-
-    output_file = os.path.join(
-        output_path, f"ChEMBL{chembl_version}_CTI_{limited_flag}_full_dataset_stats"
-    )
-    write_subsets.output_stats(
-        df_combined_annotated, output_file, write_to_csv, write_to_excel, delimiter
-    )
-    if write_bf:
-        output_file = os.path.join(
-            output_path,
-            f"ChEMBL{chembl_version}_CTI_{limited_flag}_BF_{min_nof_cpds_bf}_c_dt_d_dt_stats",
-        )
-        write_subsets.output_stats(
-            df_combined_annotated[df_combined_annotated["BF_100_c_dt_d_dt"]],
-            output_file,
-            write_to_csv,
-            write_to_excel,
-            delimiter,
-        )
-    if write_b:
-        output_file = os.path.join(
-            output_path,
-            f"ChEMBL{chembl_version}_CTI_{limited_flag}_B_{min_nof_cpds_b}_c_dt_d_dt_stats",
-        )
-        write_subsets.output_stats(
-            df_combined_annotated[df_combined_annotated["B_100_c_dt_d_dt"]],
-            output_file,
-            write_to_csv,
-            write_to_excel,
-            delimiter,
-        )
+    output.output_all_stats(dataset, args, out)
 
     if logging.DEBUG >= logging.root.level:
-        write_subsets.output_debug_sizes(
-            df_sizes, output_path, write_to_csv, write_to_excel, delimiter
-        )
+        output.write_debug_sizes(dataset, out)
diff --git a/src/get_drug_mechanism_ct_pairs.py b/src/get_drug_mechanism_ct_pairs.py
index be67c59..6121acd 100644
--- a/src/get_drug_mechanism_ct_pairs.py
+++ b/src/get_drug_mechanism_ct_pairs.py
@@ -1,8 +1,16 @@
+"""
+Get and add compound-target pairs based on information 
+in the drug_mechanism table.
+"""
+
 import logging
 import sqlite3
 
 import pandas as pd
 
+from dataset import Dataset
+import sanity_checks
+
 
 ########### Extract Drug-Target Interactions From the drug_mechanism Table ###########
 def get_drug_mechanisms_interactions(chembl_con: sqlite3.Connection) -> pd.DataFrame:
@@ -151,7 +159,7 @@ def add_annotations_to_drug_mechanisms_cti(
     :return: Updated pandas DataFrame with the additional annotations.
     :rtype: pd.DataFrame
     """
-    ##### Set columns existing in the df_combined table. #####
+    ##### Set columns existing in the df_results table. #####
     # None of the targets from the drug mechanism table have any mutation annotation,
     # hence tid_mutation = tid
     cpd_target_pairs["tid_mutation"] = cpd_target_pairs["tid"].astype("str")
@@ -239,50 +247,73 @@ def get_drug_mechanism_ct_pairs(chembl_con: sqlite3.Connection) -> pd.DataFrame:
 
 
 ########### Add Compounds From the drug_mechanism Table to the Dataset ###########
-def add_drug_mechanism_ct_pairs(
-    df_combined: pd.DataFrame, chembl_con: sqlite3.Connection
-) -> tuple[pd.DataFrame, set, set]:
+def add_dm_filtering_columns(dataset: Dataset):
     """
-    Add compound-target pairs from the drug_mechanism table 
-    that are not in the dataset based on the initial ChEMBL query.
-    These are compound-target pairs for which there is no associated pchembl value data.
-    Since the pairs are known interactions, 
-    they are added to the dataset despite not having a pchembl value.
+    Add filtering columns related to the drug_mechanism table.
+        - pair_mutation_in_dm_table: pair is in dm table (incl. mutations)
+        - pair_in_dm_table: pair is in dm table (excl. mutations)
+        - keep_for_binding: use to limit to binding assays
 
-    :param df_combined: Pandas Dataframe with compound-target pairs based on ChEMBL activity data
-    :type df_combined: pd.DataFrame
-    :param chembl_con: Sqlite3 connection to ChEMBL database.
-    :type chembl_con: sqlite3.Connection
-    :return: - Pandas DataFrame with compound-target pairs
-                based on activities AND drug_mechanism table \\
-        - set of compound-target pairs in the drug_mechanism table \\
-        - set of targets in the drug_mechanism table
-    :rtype: (pd.DataFrame, set, set)
+    :param dataset: Pandas Dataframe with compound-target pairs based on ChEMBL activity data
+    :type dataset: Dataset
     """
-    cpd_target_pairs = get_drug_mechanism_ct_pairs(chembl_con)
-    drug_mechanism_pairs_set = set(
-        f"{a}_{b}"
-        for a, b in zip(cpd_target_pairs["parent_molregno"], cpd_target_pairs["tid"])
-    )
-
-    drug_mechanism_targets_set = set(cpd_target_pairs["tid"])
-
     # Add a new column *pair_mutation_in_dm_table* which is set to True if the compound target pair
     # (taking mutation annotations into account) is in the drug_mechanism table.
-    df_combined["pair_mutation_in_dm_table"] = False
-    df_combined.loc[
-        (df_combined["cpd_target_pair_mutation"].isin(drug_mechanism_pairs_set)),
+    dataset.df_result["pair_mutation_in_dm_table"] = False
+    dataset.df_result.loc[
+        (
+            dataset.df_result["cpd_target_pair_mutation"].isin(
+                dataset.drug_mechanism_pairs_set
+            )
+        ),
         "pair_mutation_in_dm_table",
     ] = True
 
     # Add a new column *pair_in_dm_table* which is set to True if the compound target pair
     # (NOT taking mutation annotations into account) is in the drug_mechanism table.
-    df_combined["pair_in_dm_table"] = False
-    df_combined.loc[
-        (df_combined["cpd_target_pair"].isin(drug_mechanism_pairs_set)),
+    dataset.df_result["pair_in_dm_table"] = False
+    dataset.df_result.loc[
+        (dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set)),
         "pair_in_dm_table",
     ] = True
 
+    # Add a new column *keep_for_binding* which is set to True if the row should be kept
+    # if you want to limit the dataset to only data based on binding assays.
+    # Rows are kept if
+    # - there is a binding data-based pchembl value or
+    # - the compound-target pair (including mutation info) is in the drug_mechanism table
+    dataset.df_result["keep_for_binding"] = False
+    dataset.df_result.loc[
+        (
+            (dataset.df_result["pchembl_value_mean_B"].notnull())
+            | (dataset.df_result["pair_mutation_in_dm_table"])
+        ),
+        "keep_for_binding",
+    ] = True
+
+
+def add_drug_mechanism_ct_pairs(dataset: Dataset, chembl_con: sqlite3.Connection):
+    """
+    Add compound-target pairs from the drug_mechanism table
+    that are not in the dataset based on the initial ChEMBL query.
+    These are compound-target pairs for which there is no associated pchembl value data.
+    Since the pairs are known interactions,
+    they are added to the dataset despite not having a pchembl value.
+    Add the set of compound-target pairs in the drug_mechanism table and
+    the set of targets in the drug_mechanism table to the dataset.
+
+    :param dataset: Pandas Dataframe with compound-target pairs based on ChEMBL activity data
+    :type dataset: Dataset
+    :param chembl_con: Sqlite3 connection to ChEMBL database.
+    :type chembl_con: sqlite3.Connection
+    """
+    cpd_target_pairs = get_drug_mechanism_ct_pairs(chembl_con)
+    dataset.drug_mechanism_pairs_set = set(
+        f"{a}_{b}"
+        for a, b in zip(cpd_target_pairs["parent_molregno"], cpd_target_pairs["tid"])
+    )
+    dataset.drug_mechanism_targets_set = set(cpd_target_pairs["tid"])
+
     ##### Limit the drug_mechanism pairs to the ones that are not yet in the dataset. #####
     # Mutation annotations are taken into account.
     # Therefore, *(cpd A, target B without mutation)* will be added
@@ -291,7 +322,7 @@ def add_drug_mechanism_ct_pairs(
     cpd_target_pairs = cpd_target_pairs[
         ~(
             cpd_target_pairs["cpd_target_pair_mutation"].isin(
-                set(df_combined["cpd_target_pair_mutation"])
+                set(dataset.df_result["cpd_target_pair_mutation"])
             )
         )
     ].copy()
@@ -302,20 +333,8 @@ def add_drug_mechanism_ct_pairs(
     )
 
     # Combined data of existing query with new compound-target pairs.
-    df_combined = pd.concat([df_combined, cpd_target_pairs])
+    dataset.df_result = pd.concat([dataset.df_result, cpd_target_pairs])
 
-    # Add a new column *keep_for_binding* which is set to True if the row should be kept
-    # if you want to limit the dataset to only data based on binding assays.
-    # Rows are kept if
-    # - there is a binding data-based pchembl value or
-    # - the compound-target pair (including mutation info) is in the drug_mechanism table
-    df_combined["keep_for_binding"] = False
-    df_combined.loc[
-        (
-            (df_combined["pchembl_value_mean_B"].notnull())
-            | (df_combined["pair_mutation_in_dm_table"] == True)
-        ),
-        "keep_for_binding",
-    ] = True
+    add_dm_filtering_columns(dataset)
 
-    return df_combined, drug_mechanism_pairs_set, drug_mechanism_targets_set
+    sanity_checks.check_pairs_without_pchembl_are_in_drug_mechanisms(dataset.df_result)
diff --git a/src/get_stats.py b/src/get_stats.py
index e27a4a5..662f937 100644
--- a/src/get_stats.py
+++ b/src/get_stats.py
@@ -1,79 +1,40 @@
-import pandas as pd
-
-
-##### Debugging Stats #####
-def calculate_dataset_sizes(df: pd.DataFrame) -> list[int]:
-    """
-    Calculate the number of unique compounds, targets and pairs
-    for df and df limited to drugs.
-
-    :param df: Pandas DataFrame for which the dataset sizes should be calculated.
-    :type df: pd.DataFrame
-    :return: List of calculated unique counts.
-    :rtype: list[int]
-    """
-    now_mols = df["parent_molregno"].nunique()
-    now_targets = df["tid"].nunique()
-    now_targets_mutation = df["tid_mutation"].nunique()
-    now_pairs = df["cpd_target_pair"].nunique()
-    now_pairs_mutation = df["cpd_target_pair_mutation"].nunique()
+"""
+Get statistics of dataset for final results and debugging.
+"""
 
-    if "DTI" in df.columns:
-        # drugs = compounds of a compound-target pair with a known interaction
-        df_drugs = df[df["DTI"] == "D_DT"]
-    else:
-        df_drugs = df[df["max_phase"] == 4]
-
-    now_drugs = df_drugs["parent_molregno"].nunique()
-    now_drug_targets = df_drugs["tid"].nunique()
-    now_drug_targets_mutation = df_drugs["tid_mutation"].nunique()
-    now_drug_pairs = df_drugs["cpd_target_pair"].nunique()
-    now_drug_pairs_mutation = df_drugs["cpd_target_pair_mutation"].nunique()
+import logging
+import pandas as pd
 
-    return [
-        now_mols,
-        now_drugs,
-        now_targets,
-        now_drug_targets,
-        now_targets_mutation,
-        now_drug_targets_mutation,
-        now_pairs,
-        now_drug_pairs,
-        now_pairs_mutation,
-        now_drug_pairs_mutation,
-    ]
+from dataset import Dataset
 
 
-def add_dataset_sizes(
-    df: pd.DataFrame, label: str, df_sizes: list[list[int], list[int]]
-):
+##### Logging Stats #####
+def get_stats_columns() -> tuple[list[str], list[str]]:
     """
-    Count and add representative counts of df to the list df_sizes used for debugging.
-
-    :param df: Pandas DataFrame with current compound-target pairs
-    :type df: pd.DataFrame
-    :param label: Description of pipeline step (e.g., initial query).
-    :type label: str
-    :param df_sizes: List of intermediate sized of the dataset used for debugging.
-    :type df_sizes: list[list[int], list[int]]
+    Get the relevant columns for which stats should be calculated
+    and a list of descriptions corresponding to the columns.
     """
-    df_copy = df.copy()
-    df_sizes[0].append([label] + calculate_dataset_sizes(df_copy))
-
-    # restrict to data with any pchembl value (any data with a pchembl,
-    # even if it is based on only functional data)
-    # these statistics are purely based on removing
-    # compound-target pairs without pchembl information,
-    # i.e., the subset of the dataset is determined by the given df and not recalculated
-    df_pchembl = df_copy.dropna(
-        subset=[x for x in df_copy.columns if x.startswith("pchembl_value")], how="all"
-    )
-    df_sizes[1].append([label] + calculate_dataset_sizes(df_pchembl))
+    df_columns = [
+        "parent_molregno",
+        "tid",
+        "tid_mutation",
+        "cpd_target_pair",
+        "cpd_target_pair_mutation",
+    ]
+    columns_descs = [
+        "compound ID",
+        "target ID",
+        "target ID with mutation annotations",
+        "compound-target pair",
+        "compound-target pair with mutation annotations",
+    ]
+    return df_columns, columns_descs
 
 
-##### Logging Stats #####
 def get_stats_for_column(
-    df: pd.DataFrame, column: str, columns_desc: str
+    df: pd.DataFrame,
+    column: str,
+    columns_desc: str,
 ) -> list[list[str, str, int]]:
     """
     Calculate the number of unique values in df[column] and various subsets of df.
@@ -127,3 +88,78 @@ def get_stats_for_column(
             df[df["DTI"] == "C0_DT"][column].nunique(),
         ],
     ]
+
+
+##### Debugging Stats #####
+def get_dataset_sizes(df: pd.DataFrame, label: str) -> pd.DataFrame:
+    """
+    Calculate the number of unique compounds, targets and pairs
+    for df and df limited to drugs.
+
+    :param df: Pandas DataFrame for which the dataset sizes should be calculated.
+    :type df: pd.DataFrame
+    :param label: Description of pipeline step (e.g., initial query).
+    :type label: str
+    :return: Pandas DataFrame with calculated unique counts.
+    :rtype: pd.DataFrame
+    """
+    stats = {"step": label}
+
+    if "DTI" in df.columns:
+        # drugs = compounds of a compound-target pair with a known interaction
+        df_drugs = df[df["DTI"] == "D_DT"]
+    else:
+        df_drugs = df[df["max_phase"] == 4]
+
+    df_columns, _ = get_stats_columns()
+    for column in df_columns:
+        stats[f"{column}_all"] = df[column].nunique()
+        stats[f"{column}_drugs"] = df_drugs[column].nunique()
+
+    df_stats = pd.DataFrame([stats])
+    return df_stats
+
+
+def add_dataset_sizes(
+    dataset: Dataset,
+    df: pd.DataFrame,
+    label: str,
+):
+    """
+    Count and add representative counts of df used for debugging to the dataset.
+
+    :param dataset: Dataset with compound-target pairs and debugging sizes.
+    :type dataset: Dataset
+    :param df: Pandas DataFrame with current compound-target pairs
+    :type df: pd.DataFrame
+    :param label: Description of pipeline step (e.g., initial query).
+    :type label: str
+    """
+    df_stats = get_dataset_sizes(df, label)
+
+    dataset.df_sizes_all = pd.concat([dataset.df_sizes_all, df_stats])
+
+    # restrict to data with any pchembl value (any data with a pchembl,
+    # even if it is based on only functional data)
+    # these statistics are purely based on removing
+    # compound-target pairs without pchembl information,
+    # i.e., the subset of the dataset is determined by the given df and not recalculated
+    df_copy = df.copy()
+    df_pchembl = df_copy.dropna(
+        subset=[x for x in df_copy.columns if x.startswith("pchembl_value")], how="all"
+    )
+    df_stats = get_dataset_sizes(df_pchembl, label)
+    dataset.df_sizes_pchembl = pd.concat([dataset.df_sizes_pchembl, df_stats])
+
+
+def add_debugging_info(
+    dataset: Dataset,
+    df: pd.DataFrame,
+    label: str,
+):
+    """
+    Wrapper for add_dataset_sizes.
+    Handles logging level.
+    """
+    if logging.DEBUG >= logging.root.level:
+        add_dataset_sizes(dataset, df, label)
diff --git a/src/main.py b/src/main.py
index 8198f3e..5b297b6 100644
--- a/src/main.py
+++ b/src/main.py
@@ -1,103 +1,21 @@
-import argparse
+"""
+Get the compound-target pairs dataset from ChEMBL using the given arguments.
+"""
+
 import logging
 import sqlite3
 
 import chembl_downloader
 
+import arguments
 import get_dataset
 
 
-def parse_args() -> argparse.Namespace:
-    """
-    Get arguments with argparse.
-
-    :return: Populated argparse.Namespace
-    :rtype: argparse.Namespace
-    """
-    parser = argparse.ArgumentParser(
-        description="Extract the compound-target pairs dataset from ChEMBL. \
-            The full dataset plus filtering columns for binding vs. binding+functional data \
-            will always be written to csv. \
-            Additional outputs and output types can be chosen with the parameters below."
-    )
-
-    parser.add_argument(
-        "--chembl",
-        "-v",
-        metavar="<version>",
-        type=str,
-        default=None,
-        help="ChEMBL version. \
-            Latest version if None. \
-            Required if a path to a SQLite database is provided, \
-            i.e., if --sqlite is set. (default: None)",
-    )
-    parser.add_argument(
-        "--sqlite",
-        "-s",
-        metavar="<path>",
-        type=str,
-        default=None,
-        help="Path to SQLite database. \
-            ChEMBL is downloaded as an SQLite database \
-            and handled by chembl_downloader if None. (default: None)",
-    )
-    parser.add_argument(
-        "--output",
-        "-o",
-        metavar="<path>",
-        type=str,
-        required=True,
-        help="Path to write the output file(s) to. (required)",
-    )
-    parser.add_argument(
-        "--delimiter",
-        "-d",
-        metavar="<delimiter>",
-        type=str,
-        default=";",
-        help="Delimiter in output csv-files.  (default: ;)",
-    )
-    parser.add_argument(
-        "--all_sources",
-        action="store_true",
-        help="If this is set, the dataset is calculated based on all sources in ChEMBL. \
-            This includes data from BindingDB which may skew the results. \
-            Default (not set): the dataset is calculated based on only literature data.",
-    )
-    parser.add_argument(
-        "--rdkit",
-        action="store_true",
-        help="Calculate RDKit-based compound properties.",
-    )
-    parser.add_argument(
-        "--excel",
-        action="store_true",
-        help="Write the results to excel. Note: this may fail if the output is too large.",
-    )
-    parser.add_argument(
-        "--BF", action="store_true", help="Write binding+functional data subsets."
-    )
-    parser.add_argument("--B", action="store_true", help="Write binding data subsets.")
-    parser.add_argument(
-        "--debug", action="store_true", help="Log additional debugging information."
-    )
-    args = parser.parse_args()
-
-    return args
-
-
 def main():
     """
     Call get_ct_pair_dataset to get the compound-target dataset using the given arguments.
     """
-    args = parse_args()
-
-    # Set arguments that are always true.
-    # Write the results to csv.
-    csv = True
-    # Write the full dataset plus filtering columns for binding vs. binding+functional data.
-    full_df = True
+    args, calc_args, output_args = arguments.get_args()
 
     log_level = "DEBUG" if args.debug else "INFO"
     numeric_log_level = getattr(logging, log_level, None)
@@ -112,35 +30,19 @@ def main():
         with sqlite3.connect(args.sqlite) as chembl_con:
             get_dataset.get_ct_pair_dataset(
                 chembl_con,
-                args.chembl,
-                args.output,
-                not args.all_sources,
-                args.rdkit,
-                csv,
-                args.excel,
-                args.delimiter,
-                full_df,
-                args.BF,
-                args.B,
+                calc_args,
+                output_args,
             )
     else:
         logging.info("Using chembl_downloader to connect to ChEMBL.")
-        if args.chembl is None:
-            args.chembl = chembl_downloader.latest()
+        if args.chembl_version is None:
+            args.chembl_version = chembl_downloader.latest()
 
-        with chembl_downloader.connect(version=args.chembl) as chembl_con:
+        with chembl_downloader.connect(version=args.chembl_version) as chembl_con:
             get_dataset.get_ct_pair_dataset(
                 chembl_con,
-                args.chembl,
-                args.output,
-                not args.all_sources,
-                args.rdkit,
-                csv,
-                args.excel,
-                args.delimiter,
-                full_df,
-                args.BF,
-                args.B,
+                calc_args,
+                output_args,
             )
 
 
diff --git a/src/output.py b/src/output.py
new file mode 100644
index 0000000..f1a4a5f
--- /dev/null
+++ b/src/output.py
@@ -0,0 +1,223 @@
+"""
+Write the dataset, subsets and related statistics to files 
+and to the command line.
+"""
+
+import logging
+import os
+import pandas as pd
+import sanity_checks
+
+from arguments import OutputArgs, CalculationArgs
+from dataset import Dataset
+import get_stats
+
+
+##### Writing Output #####
+def write_output(
+    df: pd.DataFrame,
+    filename: str,
+    out: OutputArgs,
+) -> list[str]:
+    """
+    Write DataFrame df to output file named <filename>.
+
+    :param df: Pandas Dataframe to write to output file.
+    :type df: pd.DataFrame
+    :param filename: Filename to write the output to
+    :type filename: bool
+    :param out: Arguments related to how to output the dataset
+    :type out: OutputArgs
+    :return: Returns list of types of files that was written to (csv and/or xlsx)
+    :rtype: list[str]
+    """
+    file_type_list = []
+    if out.write_to_csv:
+        df.to_csv(f"{filename}.csv", sep=out.delimiter, index=False)
+        file_type_list.append("csv")
+    if out.write_to_excel:
+        try:
+            with pd.ExcelWriter(f"{filename}.xlsx", engine="xlsxwriter") as writer:
+                writer.book.use_zip64()
+                df.to_excel(writer, index=False)
+            file_type_list.append("xlsx")
+        except ValueError as e:  # full dataset may be too large to write to excel
+            # remove empty file in case of error to avoid confusion
+            if os.path.exists(f"{filename}.xlsx"):
+                os.remove(f"{filename}.xlsx")
+            print(e)
+    return file_type_list
+
+
+def write_and_check_output(
+    df: pd.DataFrame,
+    filename: str,
+    assay_type: str,
+    args: CalculationArgs,
+    out: OutputArgs,
+):
+    """
+    Write df to file and check that writing was successful.
+
+    :param df: Pandas Dataframe to write to output file.
+    :type df: pd.DataFrame
+    :param filename: Filename to write the output to
+    :type filename: bool
+    :param assay_type: Types of assays current_df contains information about. \
+        Options: "BF" (binding+functional), 
+        "B" (binding), 
+        "all" (contains both BF and B information)
+    :type assay_type: str
+    :param args: Arguments related to how to calculate the dataset
+    :type args: CalculationArgs
+    :param out: Arguments related to how to output the dataset
+    :type out: OutputArgs
+    """
+    file_type_list = write_output(df, filename, out)
+    sanity_checks.test_equality(
+        df, filename, assay_type, file_type_list, args.calculate_rdkit
+    )
+
+
+##### Output Specific Results #####
+def write_full_dataset_to_file(
+    dataset: Dataset,
+    args: CalculationArgs,
+    out: OutputArgs,
+):
+    """
+    If write_full_dataset, write df_combined with filtering columns to output_path.
+
+    :param dataset: Dataset with compound-target pairs.
+    :type dataset: Dataset
+    :param args: Arguments related to how to calculate the dataset
+    :type args: CalculationArgs
+    :param out: Arguments related to how to output the dataset
+    :type out: OutputArgs
+    """
+    desc = "all"
+    if out.write_full_dataset:
+        name_all = os.path.join(
+            out.output_path,
+            f"ChEMBL{args.chembl_version}_CTI_{args.limited_flag}_full_dataset",
+        )
+        write_and_check_output(dataset.df_result, name_all, desc, args, out)
+
+
+def output_stats(
+    df: pd.DataFrame,
+    output_file: str,
+    out: OutputArgs,
+):
+    """
+    Summarise and output the number of unique values in the following columns:
+
+    - parent_molregno (compound ID)
+    - tid (target ID)
+    - tid_mutation (target ID + mutation annotations)
+    - cpd_target_pair (compound-target pairs)
+    - cpd_target_pair_mutation (compound-target pairs including mutation annotations)
+
+    :param df: Pandas Dataframe for which the stats should be calculated
+    :type df: pd.DataFrame
+    :param output_file: Path and filename to write the dataset stats to
+    :type output_file: str
+    :param out: Arguments related to how to output the dataset
+    :type out: OutputArgs
+    """
+    logging.debug("Stats for %s", output_file)
+    stats = []
+    df_columns, columns_descs = get_stats.get_stats_columns()
+    for column, columns_desc in zip(df_columns, columns_descs):
+        logging.debug("Stats for column %s:", column)
+        column_stats = get_stats.get_stats_for_column(df, column, columns_desc)
+        stats += column_stats
+        for colum_stat in column_stats:
+            logging.debug("%20s %s", colum_stat[2], colum_stat[3])
+
+    df_stats = pd.DataFrame(
+        stats, columns=["column", "column_description", "subset_type", "counts"]
+    )
+    write_output(
+        df_stats,
+        output_file,
+        out,
+    )
+
+
+def output_all_stats(dataset: Dataset, args: CalculationArgs, out: OutputArgs):
+    """
+    Output stats for all datasets and subsets calculated.
+
+    :param dataset: Dataset with compound-target pairs.
+    :type dataset: Dataset
+    :param args: Arguments related to how to calculate the dataset
+    :type args: CalculationArgs
+    :param out: Arguments related to how to output the dataset
+    :type out: OutputArgs
+    """
+    output_file = os.path.join(
+        out.output_path,
+        f"ChEMBL{args.chembl_version}_CTI_{args.limited_flag}_full_dataset_stats",
+    )
+
+    output_stats(dataset.df_result, output_file, out)
+
+    if out.write_bf:
+        output_file = os.path.join(
+            out.output_path,
+            f"ChEMBL{args.chembl_version}_"
+            f"CTI_{args.limited_flag}_"
+            f"BF_{args.min_nof_cpds_bf}_c_dt_d_dt_stats",
+        )
+        output_stats(
+            dataset.df_result[dataset.df_result["BF_100_c_dt_d_dt"]],
+            output_file,
+            out,
+        )
+
+    if out.write_b:
+        output_file = os.path.join(
+            out.output_path,
+            f"ChEMBL{args.chembl_version}_"
+            f"CTI_{args.limited_flag}_"
+            f"B_{args.min_nof_cpds_b}_c_dt_d_dt_stats",
+        )
+        output_stats(
+            dataset.df_result[dataset.df_result["B_100_c_dt_d_dt"]],
+            output_file,
+            out,
+        )
+
+
+def write_debug_sizes(
+    dataset: Dataset,
+    out: OutputArgs,
+):
+    """
+    Output counts at various points during calculating the final dataset for debugging.
+
+    :param dataset: Dataset with compound-target pairs and debugging sizes.
+    :type dataset: Dataset
+    :param args: Arguments related to how to calculate the dataset
+    :type args: CalculationArgs
+    :param out: Arguments related to how to output the dataset
+    :type out: OutputArgs
+    """
+    # Size of full dataset at different points.
+    name_full_df_sizes = os.path.join(out.output_path, "debug_full_df_sizes")
+    write_output(
+        dataset.df_sizes_all,
+        name_full_df_sizes,
+        out,
+    )
+
+    # Size of dataset with any pchembl values at different points.
+    # This includes data for which we only have pchembl data
+    # for functional assays but not for binding assays.
+    name_pchembl_df_sizes = os.path.join(out.output_path, "debug_pchembl_df_sizes")
+    write_output(
+        dataset.df_sizes_pchembl,
+        name_pchembl_df_sizes,
+        out,
+    )
diff --git a/src/sanity_checks.py b/src/sanity_checks.py
index b34517e..94c5d6d 100644
--- a/src/sanity_checks.py
+++ b/src/sanity_checks.py
@@ -1,39 +1,14 @@
-import pandas as pd
-
-
-########### Sanity checks for the dataset ###########
-def check_null_values(df_combined: pd.DataFrame):
-    """
-    Check if any columns contain nan or null which aren't recognised as null values.
-    """
-    for col in df_combined.columns:
-        col_as_str = set(df_combined[df_combined[col].notnull()][col].astype(str))
-        assert (
-            "nan" not in col_as_str
-        ), f"Problem with unrecognised nan value in column {col}"
-        assert (
-            "null" not in col_as_str
-        ), f"Problem with unrecognised null value in column {col}"
+"""
+Perform sanity checks on the dataset.
+"""
 
+import pandas as pd
 
-def check_for_mixed_types(df_combined: pd.DataFrame):
-    """
-    Check that there are no mixed types in columns with dtype=object.
-    """
-    for col, dtype in df_combined.dtypes.to_dict().items():
-        if dtype == object:
-            col_original = set(df_combined[df_combined[col].notnull()][col])
-            col_as_str = set(df_combined[df_combined[col].notnull()][col].astype(str))
-            # is there a difference in the two sets (ignoring null values)
-            assert (
-                len(col_original - col_as_str) == 0
-            ), f"Mixed types in colum {col}: {col_original-col_as_str}"
-            assert (
-                len(col_as_str - col_original) == 0
-            ), f"Mixed types in colum {col}: {col_as_str-col_original}"
+from dataset import Dataset
 
 
-def check_pairs_without_pchembl_are_in_drug_mechanisms(df_combined: pd.DataFrame):
+########### Sanity checks during assignments ###########
+def check_pairs_without_pchembl_are_in_drug_mechanisms(df_result: pd.DataFrame):
     """
     Check that rows without a pchembl value based on binding+functional assays (pchembl_x_BF)
     are in the drug_mechanism table.
@@ -47,55 +22,15 @@ def check_pairs_without_pchembl_are_in_drug_mechanisms(df_combined: pd.DataFrame
         "pchembl_value_max_BF",
         "pchembl_value_median_BF",
     ]:
-        assert df_combined[(df_combined[pchembl_col].isnull())].equals(
-            df_combined[
-                (df_combined["pair_mutation_in_dm_table"] == True)
-                & (df_combined[pchembl_col].isnull())
+        assert df_result[(df_result[pchembl_col].isnull())].equals(
+            df_result[
+                (df_result["pair_mutation_in_dm_table"])
+                & (df_result[pchembl_col].isnull())
             ]
         ), f"Missing pchembl value in column {pchembl_col}"
 
 
-def check_ligand_efficiency_metrics(df_combined: pd.DataFrame):
-    """
-    Check that ligand efficiency metrics are only null
-    when at least one of the values used to calculate them is null.
-    Ligand efficiency metrics are only null when at least
-    one of the values used to calculate them is null.
-    """
-    for suffix in ["BF", "B"]:
-        assert df_combined[(df_combined[f"LE_{suffix}"].isnull())].equals(
-            df_combined[
-                (df_combined[f"pchembl_value_mean_{suffix}"].isnull())
-                | (df_combined["heavy_atoms"].isnull())
-                | (df_combined["heavy_atoms"] == 0)
-            ]
-        ), f"Missing LE value in LE_{suffix}"
-
-        assert df_combined[(df_combined[f"BEI_{suffix}"].isnull())].equals(
-            df_combined[
-                (df_combined[f"pchembl_value_mean_{suffix}"].isnull())
-                | (df_combined["mw_freebase"].isnull())
-                | (df_combined["mw_freebase"] == 0)
-            ]
-        ), f"Missing BEI value in BEI_{suffix}"
-
-        assert df_combined[(df_combined[f"SEI_{suffix}"].isnull())].equals(
-            df_combined[
-                (df_combined[f"pchembl_value_mean_{suffix}"].isnull())
-                | (df_combined["psa"].isnull())
-                | (df_combined["psa"] == 0)
-            ]
-        ), f"Missing SEI value in SEI_{suffix}"
-
-        assert df_combined[(df_combined[f"LLE_{suffix}"].isnull())].equals(
-            df_combined[
-                (df_combined[f"pchembl_value_mean_{suffix}"].isnull())
-                | (df_combined["alogp"].isnull())
-            ]
-        ), f"Missing LLE value in LLE_{suffix}"
-
-
-def check_compound_props(df_combined: pd.DataFrame, df_cpd_props: pd.DataFrame):
+def check_compound_props(df_result: pd.DataFrame, df_cpd_props: pd.DataFrame):
     """
     Check that compound props are only null if
 
@@ -104,8 +39,8 @@ def check_compound_props(df_combined: pd.DataFrame, df_cpd_props: pd.DataFrame):
     """
     # missing values because the parent_molregno is not in the compound props table
     no_cpd_prop_info = len(
-        df_combined[
-            ~df_combined["parent_molregno"].isin(set(df_cpd_props["parent_molregno"]))
+        df_result[
+            ~df_result["parent_molregno"].isin(set(df_cpd_props["parent_molregno"]))
         ]
     )
 
@@ -113,47 +48,95 @@ def check_compound_props(df_combined: pd.DataFrame, df_cpd_props: pd.DataFrame):
         if col != "parent_molregno":
             # missing values because the compound props query returns null (exists but is null)
             missing_values = len(
-                df_combined[
-                    df_combined["parent_molregno"].isin(
+                df_result[
+                    df_result["parent_molregno"].isin(
                         set(df_cpd_props[df_cpd_props[col].isnull()]["parent_molregno"])
                     )
                 ]
             )
             null_values = no_cpd_prop_info + missing_values
             assert null_values == len(
-                df_combined[df_combined[col].isnull()]
+                df_result[df_result[col].isnull()]
             ), f"Too many null values in {col}"
 
 
-def check_atc_and_target_classes(
-    df_combined: pd.DataFrame,
+def check_ligand_efficiency_metrics(df_result: pd.DataFrame):
+    """
+    Check that ligand efficiency metrics are only null
+    when at least one of the values used to calculate them is null.
+    Ligand efficiency metrics are only null when at least
+    one of the values used to calculate them is null.
+    """
+    for suffix in ["BF", "B"]:
+        assert df_result[(df_result[f"LE_{suffix}"].isnull())].equals(
+            df_result[
+                (df_result[f"pchembl_value_mean_{suffix}"].isnull())
+                | (df_result["heavy_atoms"].isnull())
+                | (df_result["heavy_atoms"] == 0)
+            ]
+        ), f"Missing LE value in LE_{suffix}"
+
+        assert df_result[(df_result[f"BEI_{suffix}"].isnull())].equals(
+            df_result[
+                (df_result[f"pchembl_value_mean_{suffix}"].isnull())
+                | (df_result["mw_freebase"].isnull())
+                | (df_result["mw_freebase"] == 0)
+            ]
+        ), f"Missing BEI value in BEI_{suffix}"
+
+        assert df_result[(df_result[f"SEI_{suffix}"].isnull())].equals(
+            df_result[
+                (df_result[f"pchembl_value_mean_{suffix}"].isnull())
+                | (df_result["psa"].isnull())
+                | (df_result["psa"] == 0)
+            ]
+        ), f"Missing SEI value in SEI_{suffix}"
+
+        assert df_result[(df_result[f"LLE_{suffix}"].isnull())].equals(
+            df_result[
+                (df_result[f"pchembl_value_mean_{suffix}"].isnull())
+                | (df_result["alogp"].isnull())
+            ]
+        ), f"Missing LLE value in LLE_{suffix}"
+
+
+def check_atc(
+    df_result: pd.DataFrame,
     atc_levels: pd.DataFrame,
-    target_classes_level1: pd.DataFrame,
-    target_classes_level2: pd.DataFrame,
 ):
     """
-    Check that atc_level1 and target class information is only null
-    if the parent_molregno / target id is not in the respective table.
+    Check that atc_level1 information is only null
+    if the parent_molregno is not in the respective table.
     """
-    assert df_combined[(df_combined["atc_level1"].isnull())].equals(
-        df_combined[
-            ~df_combined["parent_molregno"].isin(set(atc_levels["parent_molregno"]))
+    assert df_result[(df_result["atc_level1"].isnull())].equals(
+        df_result[
+            ~df_result["parent_molregno"].isin(set(atc_levels["parent_molregno"]))
         ]
     ), "Null values in atc_level1 are not exclusively \
         because the parent_molregno is not in the atc_classification table."
 
-    assert df_combined[(df_combined["target_class_l1"].isnull())].equals(
-        df_combined[~df_combined["tid"].isin(set(target_classes_level1["tid"]))]
+
+def check_target_classes(
+    df_result: pd.DataFrame,
+    target_classes_level1: pd.DataFrame,
+    target_classes_level2: pd.DataFrame,
+):
+    """
+    Check that target class information is only null
+    if the target id is not in the respective table.
+    """
+    assert df_result[(df_result["target_class_l1"].isnull())].equals(
+        df_result[~df_result["tid"].isin(set(target_classes_level1["tid"]))]
     ), "Null values in target_class_l1 are not exclusively \
         because the tid is not in the protein_classification table."
 
-    assert df_combined[(df_combined["target_class_l2"].isnull())].equals(
-        df_combined[~df_combined["tid"].isin(set(target_classes_level2["tid"]))]
+    assert df_result[(df_result["target_class_l2"].isnull())].equals(
+        df_result[~df_result["tid"].isin(set(target_classes_level2["tid"]))]
     ), "Null values in target_class_l2 are not exclusively \
         because the tid is not in the protein_classification table."
 
 
-def check_rdkit_props(df_combined: pd.DataFrame):
+def check_rdkit_props(df_result: pd.DataFrame):
     """
     Check that columns set by the RDKit are only null
     if there is no canonical SMILES for the molecule.
@@ -179,61 +162,59 @@ def check_rdkit_props(df_combined: pd.DataFrame):
         "aromatic_n",
         "aromatic_hetero",
     ]:
-        assert len(df_combined[df_combined[col].isnull()]) == len(
-            df_combined[df_combined["canonical_smiles"].isnull()].copy()
+        assert len(df_result[df_result[col].isnull()]) == len(
+            df_result[df_result["canonical_smiles"].isnull()].copy()
         ), f"Missing value in {col} despite a smiles being available."
 
 
+########### Final sanity checks for the dataset ###########
+def check_null_values(df_result: pd.DataFrame):
+    """
+    Check if any columns contain nan or null which aren't recognised as null values.
+    """
+    for col in df_result.columns:
+        col_as_str = set(df_result[df_result[col].notnull()][col].astype(str))
+        assert (
+            "nan" not in col_as_str
+        ), f"Problem with unrecognised nan value in column {col}"
+        assert (
+            "null" not in col_as_str
+        ), f"Problem with unrecognised null value in column {col}"
+
+
+def check_for_mixed_types(df_result: pd.DataFrame):
+    """
+    Check that there are no mixed types in columns with dtype=object.
+    """
+    for col, dtype in df_result.dtypes.to_dict().items():
+        if dtype == object:
+            col_original = set(df_result[df_result[col].notnull()][col])
+            col_as_str = set(df_result[df_result[col].notnull()][col].astype(str))
+            # is there a difference in the two sets (ignoring null values)
+            assert (
+                len(col_original - col_as_str) == 0
+            ), f"Mixed types in colum {col}: {col_original-col_as_str}"
+            assert (
+                len(col_as_str - col_original) == 0
+            ), f"Mixed types in colum {col}: {col_as_str-col_original}"
+
+
 def sanity_checks(
-    df_combined: pd.DataFrame,
-    df_cpd_props: pd.DataFrame,
-    atc_levels: pd.DataFrame,
-    target_classes_level1: pd.DataFrame,
-    target_classes_level2: pd.DataFrame,
-    calculate_rdkit: bool,
+    dataset: Dataset,
 ):
     """
     Check basic assumptions about the finished dataset, specifically:
 
     - no columns contain nan or null values which aren't recognised as null values
     - there are no mixed types in columns with dtype=object
-    - rows without a pchembl value based on binding+functional assays (pchembl_x_BF)
-        are in the drug_mechanism table
-    - ligand efficiency metrics are only null when at least one of the values
-        used to calculate them is null
-    - compound props are only null if the compound is not in df_cpd_props
-        or the value in that table is null
-    - atc_level1 and target class information is only null if
-        the parent_molregno / target id is not in the respective table
-    - columns set by the RDKit are only null if there is no canonical SMILES
-        for the molecule (excluding scaffolds)
-
-    :param df_combined: Pandas DataFrame with compound-target pairs
-    :type df_combined: pd.DataFrame
-    :param df_cpd_props: Pandas DataFrame with compound properties
-        and structures for all compound ids in ChEMBL.
-    :type df_cpd_props: pd.DataFrame
-    :param atc_levels: Pandas DataFrame with ATC annotations in ChEMBL
-    :type atc_levels: pd.DataFrame
-    :param target_classes_level1: Pandas DataFrame with mapping
-        from target id to level 1 target class
-    :type target_classes_level1: pd.DataFrame
-    :param target_classes_level2: Pandas DataFrame with mapping
-        from target id to level 2 target class
-    :type target_classes_level2: pd.DataFrame
+
+    :param dataset: Dataset with compound-target pairs.
+    :type dataset: Dataset
     :param calculate_rdkit: True if the DataFrame contains RDKit-based compound properties
     :type calculate_rdkit: bool
     """
-    check_null_values(df_combined)
-    check_for_mixed_types(df_combined)
-    check_pairs_without_pchembl_are_in_drug_mechanisms(df_combined)
-    check_ligand_efficiency_metrics(df_combined)
-    check_compound_props(df_combined, df_cpd_props)
-    check_atc_and_target_classes(
-        df_combined, atc_levels, target_classes_level1, target_classes_level2
-    )
-    if calculate_rdkit:
-        check_rdkit_props(df_combined)
+    check_null_values(dataset.df_result)
+    check_for_mixed_types(dataset.df_result)
 
 
 ########### Sanity checks for writing and reading a dataset ###########
diff --git a/src/write_subsets.py b/src/write_subsets.py
deleted file mode 100644
index b138b35..0000000
--- a/src/write_subsets.py
+++ /dev/null
@@ -1,624 +0,0 @@
-import logging
-import os
-import pandas as pd
-import sanity_checks
-
-import get_stats
-
-
-def write_output(
-    df: pd.DataFrame,
-    filename: str,
-    write_to_csv: bool,
-    write_to_excel: bool,
-    delimiter: str,
-) -> list[str]:
-    """
-    Write DataFrame df to output file named <filename>.
-
-    :param df: Pandas Dataframe to write to output file.
-    :type df: pd.DataFrame
-    :param filename: Filename to write the output to
-    :type filename: bool
-    :param write_to_csv: True if output should be written to csv
-    :type write_to_csv: bool
-    :param write_to_excel: True if output should be written to excel
-    :type write_to_excel: bool
-    :param delimiter: Delimiter in csv-output
-    :type delimiter: str
-    :return: Returns list of types of files that was written to (csv and/or xlsx)
-    :rtype: list[str]
-    """
-    file_type_list = []
-    if write_to_csv:
-        df.to_csv(f"{filename}.csv", sep=delimiter, index=False)
-        file_type_list.append("csv")
-    if write_to_excel:
-        try:
-            with pd.ExcelWriter(f"{filename}.xlsx", engine="xlsxwriter") as writer:
-                writer.book.use_zip64()
-                df.to_excel(writer, index=False)
-            file_type_list.append("xlsx")
-        except ValueError as e:  # full dataset may be too large to write to excel
-            # remove empty file in case of error to avoid confusion
-            if os.path.exists(f"{filename}.xlsx"):
-                os.remove(f"{filename}.xlsx")
-            print(e)
-    return file_type_list
-
-
-def write_and_check_output(
-    df: pd.DataFrame,
-    filename: str,
-    write_to_csv: bool,
-    write_to_excel: bool,
-    delimiter: str,
-    assay_type: str,
-    calculate_rdkit: bool,
-):
-    """
-    Write df to file and check that writing was successful.
-
-    :param df: Pandas Dataframe to write to output file.
-    :type df: pd.DataFrame
-    :param filename: Filename to write the output to
-    :type filename: bool
-    :param write_to_csv: True if output should be written to csv
-    :type write_to_csv: bool
-    :param write_to_excel: True if output should be written to excel
-    :type write_to_excel: bool
-    :param delimiter: Delimiter in csv-output
-    :type delimiter: str
-    :param assay_type: Types of assays current_df contains information about. \
-        Options: "BF" (binding+functional), 
-        "B" (binding), 
-        "all" (contains both BF and B information)
-    :type assay_type: str
-    :param calculate_rdkit: If True, current_df contains RDKit-based columns
-    :type calculate_rdkit: bool
-    """
-    file_type_list = write_output(df, filename, write_to_csv, write_to_excel, delimiter)
-    sanity_checks.test_equality(
-        df, filename, assay_type, file_type_list, calculate_rdkit
-    )
-
-
-def get_data_subsets(
-    data: pd.DataFrame, min_nof_cpds: int, desc: str
-) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
-    """
-    Calculate and return the different subsets of interest.
-
-    :param data: Pandas DataFrame with compound-target pairs
-    :type data: pd.DataFrame
-    :param min_nof_cpds: Miminum number of compounds per target
-    :type min_nof_cpds: int
-    :param desc: Types of assays current_df contains information about. \
-        Options: "BF" (binding+functional), "B" (binding)
-    :type desc: str
-    :return: 
-        - data: Pandas DataFrame with compound-target pairs 
-            without the annotations for the opposite desc, \
-            e.g. if desc = "BF", the average pchembl value based on 
-            binding data only is dropped
-        - df_enough_cpds: Pandas DataFrame with targets 
-            with at least <min_nof_cpds> compounds with a pchembl value, 
-        - df_c_dt_d_dt: As df_enough_cpds but with \
-            at least one compound-target pair labelled as 
-            'D_DT', 'C3_DT', 'C2_DT', 'C1_DT' or 'C0_DT' (i.e., known interaction), 
-        - df_d_dt: As df_enough_cpds but with \
-            at least one compound-target pair labelled as 
-            'D_DT' (i.e., known drug-target interaction)
-    :rtype: (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame)
-    """
-    if desc == "B":
-        drop_desc = "BF"
-    else:
-        drop_desc = "B"
-    data = data.drop(
-        columns=[
-            f"pchembl_value_mean_{drop_desc}",
-            f"pchembl_value_max_{drop_desc}",
-            f"pchembl_value_median_{drop_desc}",
-            f"first_publication_cpd_target_pair_{drop_desc}",
-            f"first_publication_cpd_target_pair_w_pchembl_{drop_desc}",
-            f"LE_{drop_desc}",
-            f"BEI_{drop_desc}",
-            f"SEI_{drop_desc}",
-            f"LLE_{drop_desc}",
-        ]
-    ).drop_duplicates()
-
-    # Restrict the dataset to targets with at least *min_nof_cpds* compounds with a pchembl value.
-    comparator_counts = (
-        data[data[f"pchembl_value_mean_{desc}"].notnull()]
-        .groupby(["tid_mutation"])["parent_molregno"]
-        .count()
-    )
-    # pylint: disable-next=unused-variable
-    targets_w_enough_cpds = comparator_counts[
-        comparator_counts >= min_nof_cpds
-    ].index.tolist()
-    df_enough_cpds = data.query("tid_mutation in @targets_w_enough_cpds")
-
-    # Restrict the dataset further to targets
-    # with at least one compound-target pair labelled as
-    # 'D_DT', 'C3_DT', 'C2_DT', 'C1_DT' or 'C0_DT',
-    # i.e., compound-target pairs with a known interactions.
-    # pylint: disable-next=unused-variable
-    c_dt_d_dt_targets = set(
-        df_enough_cpds[
-            df_enough_cpds["DTI"].isin(["D_DT", "C3_DT", "C2_DT", "C1_DT", "C0_DT"])
-        ].tid_mutation.to_list()
-    )
-    df_c_dt_d_dt = df_enough_cpds.query("tid_mutation in @c_dt_d_dt_targets")
-
-    # Restrict the dataset further to targets with
-    # at least one compound-target pair labelled as 'D_DT',
-    # i.e., known drug-target interactions.
-    # pylint: disable-next=unused-variable
-    d_dt_targets = set(
-        df_enough_cpds[df_enough_cpds["DTI"] == "D_DT"].tid_mutation.to_list()
-    )
-    df_d_dt = df_enough_cpds.query("tid_mutation in @d_dt_targets")
-
-    return data, df_enough_cpds, df_c_dt_d_dt, df_d_dt
-
-
-def write_bf_to_file(
-    df_combined: pd.DataFrame,
-    chembl_version: str,
-    min_nof_cpds_bf: int,
-    output_path: str,
-    write_bf: bool,
-    write_to_csv: bool,
-    write_to_excel: bool,
-    delimiter: str,
-    limited_flag: str,
-    calculate_rdkit: bool,
-    df_sizes: list[list[int], list[int]],
-) -> pd.DataFrame:
-    """
-    Calculate relevant subsets for the portion of df_combined
-    that is based on binding+functional data.
-    If write_bf the subsets are written to output_path.
-    Independent of write_bf, filtering columns for BF are added to df_combined and returned.
-
-    :param df_combined: Pandas DataFrame with compound-target pairs
-    :type df_combined: pd.DataFrame
-    :param chembl_version: Version of ChEMBL for output files
-    :type chembl_version: str
-    :param min_nof_cpds_bf: Miminum number of compounds per target
-    :type min_nof_cpds_bf: int
-    :param output_path: Path to write the output to
-    :type output_path: str
-    :param write_bf: Should the subsets be written to files?
-    :type write_bf: bool
-    :param write_to_csv: Should the subsets be written to csv?
-    :type write_to_csv: bool
-    :param write_to_excel: Should the subsets be written to excel?
-    :type write_to_excel: bool
-    :param delimiter: Delimiter for csv output
-    :type delimiter: str
-    :param limited_flag: Document suffix indicating
-        whether the dataset was limited to literature sources
-    :type limited_flag: str
-    :param calculate_rdkit: Does df_combined include RDKit-based columns?
-    :type calculate_rdkit: bool
-    :param df_sizes: List of intermediate sized of the dataset used for debugging.
-    :type df_sizes: list[list[int], list[int]]
-    :return: Pandas DataFrame with additional filtering columns for BF subsets
-    :rtype: pd.Dataframe
-    """
-    # consider binding and functional assays
-    # assay description = binding+functional
-    desc = "BF"
-    # df_combined with additional filtering columns
-    df_combined_annotated = df_combined.copy()
-    # df_combined without binding only data
-    df_combined_bf = df_combined.copy()
-    (
-        df_combined_bf,
-        df_combined_bf_enough_cpds,
-        df_combined_bf_c_dt_d_dt,
-        df_combined_bf_d_dt,
-    ) = get_data_subsets(df_combined_bf, min_nof_cpds_bf, desc)
-
-    # add filtering columns to df_combined_annotated
-    for df, col_name in zip(
-        [
-            df_combined_bf_enough_cpds,
-            df_combined_bf_c_dt_d_dt,
-            df_combined_bf_d_dt,
-        ],
-        [
-            f"BF_{min_nof_cpds_bf}",
-            f"BF_{min_nof_cpds_bf}_c_dt_d_dt",
-            f"BF_{min_nof_cpds_bf}_d_dt",
-        ],
-    ):
-        df_combined_annotated[col_name] = False
-        df_combined_annotated.loc[
-            (df_combined_annotated.index.isin(df.index)), col_name
-        ] = True
-        # check that filtering works
-        assert df_combined_annotated[df_combined_annotated[col_name] == True][
-            df.columns
-        ].equals(df), f"Filtering is not accurate for {col_name}."
-
-    if write_bf:
-        # NOTE: This is almost identical to the full dataset which will be saved later on.
-        # However, the binding-related columns are dropped
-        name_bf = os.path.join(
-            output_path, f"ChEMBL{chembl_version}_CTI_{limited_flag}_BF"
-        )
-        write_and_check_output(
-            df_combined_bf,
-            name_bf,
-            write_to_csv,
-            write_to_excel,
-            delimiter,
-            desc,
-            calculate_rdkit,
-        )
-
-        name_bf_100 = os.path.join(
-            output_path,
-            f"ChEMBL{chembl_version}_CTI_{limited_flag}_BF_{min_nof_cpds_bf}",
-        )
-        write_and_check_output(
-            df_combined_bf_enough_cpds,
-            name_bf_100,
-            write_to_csv,
-            write_to_excel,
-            delimiter,
-            desc,
-            calculate_rdkit,
-        )
-
-        name_bf_100_c_dt_d_dt = os.path.join(
-            output_path,
-            f"ChEMBL{chembl_version}_CTI_{limited_flag}_BF_{min_nof_cpds_bf}_c_dt_d_dt",
-        )
-        write_and_check_output(
-            df_combined_bf_c_dt_d_dt,
-            name_bf_100_c_dt_d_dt,
-            write_to_csv,
-            write_to_excel,
-            delimiter,
-            desc,
-            calculate_rdkit,
-        )
-
-        name_bf_100_d_dt = os.path.join(
-            output_path,
-            f"ChEMBL{chembl_version}_CTI_{limited_flag}_BF_{min_nof_cpds_bf}_d_dt",
-        )
-        write_and_check_output(
-            df_combined_bf_d_dt,
-            name_bf_100_d_dt,
-            write_to_csv,
-            write_to_excel,
-            delimiter,
-            desc,
-            calculate_rdkit,
-        )
-
-    if logging.DEBUG >= logging.root.level:
-        get_stats.add_dataset_sizes(df_combined_bf, "binding + functional", df_sizes)
-        get_stats.add_dataset_sizes(df_combined_bf_enough_cpds, "BF, >= 100", df_sizes)
-        get_stats.add_dataset_sizes(
-            df_combined_bf_c_dt_d_dt, "BF, >= 100, c_dt and d_dt", df_sizes
-        )
-        get_stats.add_dataset_sizes(df_combined_bf_d_dt, "BF, >= 100, d_dt", df_sizes)
-
-    return df_combined_annotated
-
-
-def write_b_to_file(
-    df_combined: pd.DataFrame,
-    df_combined_annotated: pd.DataFrame,
-    chembl_version: str,
-    min_nof_cpds_b: int,
-    output_path: str,
-    write_b: bool,
-    write_to_csv: bool,
-    write_to_excel: bool,
-    delimiter: str,
-    limited_flag: str,
-    calculate_rdkit: bool,
-    df_sizes: list[list[int], list[int]],
-) -> pd.DataFrame:
-    """
-    Calculate relevant subsets for the portion of df_combined that is based on binding data.
-    If write_b the subsets are written to output_path.
-    Independent of write_b, filtering columns for B are added to df_combined_annotated.
-
-    :param df_combined: Pandas DataFrame with compound-target pairs
-    :type df_combined: pd.DataFrame
-    :param df_combined_annotated: Pandas DataFrame with additional filtering columns
-    :type df_combined_annotated: pd.DataFrame
-    :param chembl_version: Version of ChEMBL for output files
-    :type chembl_version: str
-    :param min_nof_cpds_b: Miminum number of compounds per target
-    :type min_nof_cpds_b: int
-    :param output_path: Path to write the output to
-    :type output_path: str
-    :param write_b: Should the subsets be written to files?
-    :type write_b: bool
-    :param write_to_csv: Should the subsets be written to csv?
-    :type write_to_csv: bool
-    :param write_to_excel: Should the subsets be written to excel?
-    :type write_to_excel: bool
-    :param delimiter: Delimiter for csv output
-    :type delimiter: str
-    :param limited_flag: Document suffix indicating
-        whether the dataset was limited to literature sources
-    :type limited_flag: str
-    :param calculate_rdkit: Does df_combined include RDKit-based columns?
-    :type calculate_rdkit: bool
-    :param df_sizes: List of intermediate sized of the dataset used for debugging.
-    :type df_sizes: list[list[int], list[int]]
-    :return: Pandas DataFrame with additional filtering columns for B subsets
-    :rtype: pd.Dataframe
-    """
-    # consider only binding assays
-    # assay description = binding
-    desc = "B"
-    df_combined_b = df_combined[df_combined["keep_for_binding"] == True].copy()
-    (
-        df_combined_b,
-        df_combined_b_enough_cpds,
-        df_combined_b_c_dt_d_dt,
-        df_combined_b_d_dt,
-    ) = get_data_subsets(df_combined_b, min_nof_cpds_b, desc)
-
-    # add filtering columns to df_combined_annotated
-    for df, col_name in zip(
-        [df_combined_b_enough_cpds, df_combined_b_c_dt_d_dt, df_combined_b_d_dt],
-        [
-            f"B_{min_nof_cpds_b}",
-            f"B_{min_nof_cpds_b}_c_dt_d_dt",
-            f"B_{min_nof_cpds_b}_d_dt",
-        ],
-    ):
-        df_combined_annotated[col_name] = False
-        df_combined_annotated.loc[
-            (df_combined_annotated.index.isin(df.index)), col_name
-        ] = True
-        # check that filtering works
-        assert df_combined_annotated[df_combined_annotated[col_name] == True][
-            df.columns
-        ].equals(df), f"Filtering is not accurate for {col_name}."
-
-    if write_b:
-        name_b = os.path.join(
-            output_path, f"ChEMBL{chembl_version}_CTI_{limited_flag}_B"
-        )
-        write_and_check_output(
-            df_combined_b,
-            name_b,
-            write_to_csv,
-            write_to_excel,
-            delimiter,
-            desc,
-            calculate_rdkit,
-        )
-
-        name_b_100 = os.path.join(
-            output_path, f"ChEMBL{chembl_version}_CTI_{limited_flag}_B_{min_nof_cpds_b}"
-        )
-        write_and_check_output(
-            df_combined_b_enough_cpds,
-            name_b_100,
-            write_to_csv,
-            write_to_excel,
-            delimiter,
-            desc,
-            calculate_rdkit,
-        )
-
-        name_b_100_c_dt_d_dt = os.path.join(
-            output_path,
-            f"ChEMBL{chembl_version}_CTI_{limited_flag}_B_{min_nof_cpds_b}_c_dt_d_dt",
-        )
-        write_and_check_output(
-            df_combined_b_c_dt_d_dt,
-            name_b_100_c_dt_d_dt,
-            write_to_csv,
-            write_to_excel,
-            delimiter,
-            desc,
-            calculate_rdkit,
-        )
-
-        name_b_100_d_dt = os.path.join(
-            output_path,
-            f"ChEMBL{chembl_version}_CTI_{limited_flag}_B_{min_nof_cpds_b}_d_dt",
-        )
-        write_and_check_output(
-            df_combined_b_d_dt,
-            name_b_100_d_dt,
-            write_to_csv,
-            write_to_excel,
-            delimiter,
-            desc,
-            calculate_rdkit,
-        )
-
-    if logging.DEBUG >= logging.root.level:
-        get_stats.add_dataset_sizes(df_combined_b, "binding", df_sizes)
-        get_stats.add_dataset_sizes(df_combined_b_enough_cpds, "B, >= 100", df_sizes)
-        get_stats.add_dataset_sizes(
-            df_combined_b_c_dt_d_dt, "B, >= 100, c_dt and d_dt", df_sizes
-        )
-        get_stats.add_dataset_sizes(df_combined_b_d_dt, "B, >= 100, d_dt", df_sizes)
-
-    return df_combined_annotated
-
-
-def write_full_dataset_to_file(
-    df_combined: pd.DataFrame,
-    chembl_version: str,
-    output_path: str,
-    write_full_dataset: bool,
-    write_to_csv: bool,
-    write_to_excel: bool,
-    delimiter: str,
-    limited_flag: str,
-    calculate_rdkit: bool,
-):
-    """
-    If write_full_dataset, write df_combined with filtering columns to output_path.
-
-    :param df_combined: Pandas DataFrame with compound-target pairs and filtering columns
-    :type df_combined: pd.DataFrame
-    :param chembl_version: Version of ChEMBL for output files
-    :type chembl_version: str
-    :param output_path: Path to write the output to
-    :type output_path: str
-    :param write_full_dataset: Should the subsets be written to files?
-    :type write_full_dataset: bool
-    :param write_to_csv: Should the subsets be written to csv?
-    :type write_to_csv: bool
-    :param write_to_excel: Should the subsets be written to excel?
-    :type write_to_excel: bool
-    :param delimiter: Delimiter for csv output
-    :type delimiter: str
-    :param limited_flag: Document suffix indicating
-        whether the dataset was limited to literature sources
-    :type limited_flag: str
-    :param calculate_rdkit: Does df_combined include RDKit-based columns?
-    :type calculate_rdkit: bool
-    """
-    desc = "all"
-    if write_full_dataset:
-        name_all = os.path.join(
-            output_path, f"ChEMBL{chembl_version}_CTI_{limited_flag}_full_dataset"
-        )
-        write_and_check_output(
-            df_combined,
-            name_all,
-            write_to_csv,
-            write_to_excel,
-            delimiter,
-            desc,
-            calculate_rdkit,
-        )
-
-
-def output_debug_sizes(
-    df_sizes: list[list[int], list[int]],
-    output_path: str,
-    write_to_csv: bool,
-    write_to_excel: bool,
-    delimiter: str,
-):
-    """
-    Output counts at various points during calculating the final dataset for debugging.
-
-    :param df_sizes: List of intermediate sized of the dataset used for debugging.
-    :type df_sizes: list[list[int], list[int]]
-    :param output_path: Path to write the dataset counts to
-    :type output_path: str
-    :param write_to_csv: True if counts should be written to csv
-    :type write_to_csv: bool
-    :param write_to_excel: True if counts should be written to excel
-    :type write_to_excel: bool
-    :param delimiter: Delimiter in csv-output
-    :type delimiter: str
-    """
-    column_names = [
-        "type",
-        "#mols",
-        "#drugs",
-        "#targets",
-        "#drug_ targets",
-        "#targets_ mutation",
-        "#drug_ targets_mutation",
-        "#cpd_tid_ pairs",
-        "#drug_tid_ pairs",
-        "#cpd_ tid_mutation_ pairs",
-        "#drug_ tid_mutation_ pairs",
-    ]
-
-    logging.debug("Size of full dataset at different points.")
-    full_df_sizes = pd.DataFrame(df_sizes[0], columns=column_names)
-    logging.debug(full_df_sizes)
-    name_full_df_sizes = os.path.join(output_path, "debug_full_df_sizes")
-    write_output(
-        full_df_sizes, name_full_df_sizes, write_to_csv, write_to_excel, delimiter
-    )
-
-    logging.debug("Size of dataset with any pchembl values at different points.")
-    logging.debug(
-        "This includes data for which we only have pchembl data \
-            for functional assays but not for binding assays."
-    )
-    df_pchembl_sizes = pd.DataFrame(df_sizes[1], columns=column_names)
-    logging.debug(df_pchembl_sizes)
-    name_pchembl_df_sizes = os.path.join(output_path, "debug_pchembl_df_sizes")
-    write_output(
-        full_df_sizes, name_pchembl_df_sizes, write_to_csv, write_to_excel, delimiter
-    )
-
-
-def output_stats(
-    df: pd.DataFrame,
-    output_file: str,
-    write_to_csv: bool,
-    write_to_excel: bool,
-    delimiter: str,
-):
-    """
-    Summarise and output the number of unique values in the following columns:
-
-    - parent_molregno (compound ID)
-    - tid (target ID)
-    - tid_mutation (target ID + mutation annotations)
-    - cpd_target_pair (compound-target pairs)
-    - cpd_target_pair_mutation (compound-target pairs including mutation annotations)
-
-    :param df: Pandas Dataframe for which the stats should be calculated
-    :type df: pd.DataFrame
-    :param output_file: Path and filename to write the dataset stats to
-    :type output_file: str
-    :param write_to_csv: True if stats should be written to csv
-    :type write_to_csv: bool
-    :param write_to_excel: True if stats should be written to excel
-    :type write_to_excel: bool
-    :param delimiter: Delimiter in csv-output
-    :type delimiter: str
-    """
-    df_columns = [
-        "parent_molregno",
-        "tid",
-        "tid_mutation",
-        "cpd_target_pair",
-        "cpd_target_pair_mutation",
-    ]
-    columns_descs = [
-        "compound ID",
-        "target ID",
-        "target ID with mutation annotations",
-        "compound-target pair",
-        "compound-target pair with mutation annotations",
-    ]
-
-    logging.debug("Stats for %s", output_file)
-    stats = []
-    for column, columns_desc in zip(df_columns, columns_descs):
-        logging.debug("Stats for column %s:", column)
-        column_stats = get_stats.get_stats_for_column(df, column, columns_desc)
-        stats += column_stats
-        for colum_stat in column_stats:
-            logging.debug(
-                "%20s %s",
-                colum_stat[2],
-                colum_stat[3],
-            )
-
-    df_stats = pd.DataFrame(
-        stats, columns=["column", "column_description", "subset_type", "counts"]
-    )
-    write_output(df_stats, output_file, write_to_csv, write_to_excel, delimiter)