Skip to content

Commit

Permalink
Remove unnecessary variables from Dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
LinaHeinzke committed Feb 20, 2024
1 parent a4721c6 commit fe2a49a
Show file tree
Hide file tree
Showing 10 changed files with 285 additions and 226 deletions.
80 changes: 40 additions & 40 deletions src/add_chembl_compound_properties.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,26 @@
import pandas as pd

from dataset import Dataset
import sanity_checks


########### Add Compound Properties Based on ChEMBL Data ###########
def add_first_publication_date(
dataset: Dataset, chembl_con: sqlite3.Connection, limit_to_literature: bool
):
def get_first_publication_cpd_date(
chembl_con: sqlite3.Connection, limit_to_literature: bool
) -> pd.DataFrame:
"""
Query and calculate the first publication of a compound
based on ChEMBL data (column name: first_publication_cpd).
If limit_to_literature is True, this corresponds to the first appearance
of the compound in the literature according to ChEMBL.
Otherwise this is the first appearance in any source in ChEMBL.
:param dataset: Dataset with compound-target pairs.
Will be updated to include first_publication_cpd
:type dataset: Dataset
:param chembl_con: Sqlite3 connection to ChEMBL database.
:type chembl_con: sqlite3.Connection
:param limit_to_literature: Base first_publication_cpd on literature sources only if True.
:type limit_to_literature: bool
:return: Pandas DataFrame with parent_molregno and first_publication_cpd from ChEMBL.
:rtype: pd.DataFrame
"""
# information about salts is aggregated in the parent
sql = """
Expand All @@ -43,26 +43,21 @@ def add_first_publication_date(
].transform("min")
df_docs = df_docs[["parent_molregno", "first_publication_cpd"]].drop_duplicates()

dataset.df_result = dataset.df_result.merge(
df_docs, on="parent_molregno", how="left"
)
return df_docs


def add_chembl_properties_and_structures(
dataset: Dataset, chembl_con: sqlite3.Connection
):
def get_chembl_properties_and_structures(
chembl_con: sqlite3.Connection,
) -> pd.DataFrame:
"""
Add compound properties from the compound_properties table
Get compound properties from the compound_properties table
(e.g., alogp, #hydrogen bond acceptors / donors, etc.).
Add InChI, InChI key and canonical smiles.
Get InChI, InChI key and canonical smiles.
:param dataset: Dataset with compound-target pairs.
Will be updated to include compound properties and structures.
dataset.df_cpd_props will be set to
compound properties and structures for all compound ids in ChEMBL.
:type dataset: Dataset
:param chembl_con: Sqlite3 connection to ChEMBL database.
:type chembl_con: sqlite3.Connection
:return: Pandas DataFrame with compound properties and structures for all compound ids in ChEMBL
:rtype: pd.DataFrame
"""
sql = """
SELECT DISTINCT mh.parent_molregno,
Expand All @@ -79,16 +74,13 @@ def add_chembl_properties_and_structures(
"""

df_cpd_props = pd.read_sql_query(sql, con=chembl_con)
dataset.df_cpd_props = df_cpd_props

dataset.df_result = dataset.df_result.merge(
df_cpd_props, on="parent_molregno", how="left"
)
return df_cpd_props


def add_ligand_efficiency_metrics(dataset: Dataset):
def calculate_ligand_efficiency_metrics(dataset: Dataset):
"""
Calculate the ligand efficiency metrics for the compounds
Calculate and add the ligand efficiency metrics for the compounds
based on the mean pchembl values for a compound-target pair and
the following ligand efficiency (LE) formulas:
Expand Down Expand Up @@ -150,20 +142,18 @@ def add_ligand_efficiency_metrics(dataset: Dataset):
)


def add_atc_classification(dataset: Dataset, chembl_con: sqlite3.Connection):
def get_atc_classification(chembl_con: sqlite3.Connection) -> pd.DataFrame:
"""
Query and add ATC classifications (level 1) from the atc_classification and
Query ATC classifications (level 1) from the atc_classification and
molecule_atc_classification tables.
ATC level annotations for the same parent_molregno are combined into one description
that concatenates all descriptions sorted alphabetically
into one string with ' | ' as a separator.
:param dataset: Dataset with compound-target pairs.
Will be updated to include ATC classifications.
dataset.atc_levels will be set to ATC annotations in ChEMBL.
:type dataset: Dataset
:param chembl_con: Sqlite3 connection to ChEMBL database.
:type chembl_con: sqlite3.Connection
:return: Pandas DataFrame with ATC annotations in ChEMBL.
:rtype: pd.DataFrame
"""
sql = """
SELECT DISTINCT mh.parent_molregno, atc.level1, atc.level1_description
Expand All @@ -186,11 +176,7 @@ def add_atc_classification(dataset: Dataset, chembl_con: sqlite3.Connection):
].transform(lambda x: between_str_join.join(sorted(x)))
atc_levels = atc_levels[["parent_molregno", "atc_level1"]].drop_duplicates()

dataset.atc_levels = atc_levels

dataset.df_result = dataset.df_result.merge(
atc_levels, on="parent_molregno", how="left"
)
return atc_levels


def add_all_chembl_compound_properties(
Expand All @@ -214,10 +200,24 @@ def add_all_chembl_compound_properties(
Base it on all available sources otherwise.
:type limit_to_literature: bool
"""
add_first_publication_date(dataset, chembl_con, limit_to_literature)
df_docs = get_first_publication_cpd_date(chembl_con, limit_to_literature)
dataset.df_result = dataset.df_result.merge(
df_docs, on="parent_molregno", how="left"
)

add_chembl_properties_and_structures(dataset, chembl_con)
df_cpd_props = get_chembl_properties_and_structures(chembl_con)
dataset.df_cpd_props = df_cpd_props
dataset.df_result = dataset.df_result.merge(
df_cpd_props, on="parent_molregno", how="left"
)
sanity_checks.check_compound_props(dataset.df_result, df_cpd_props)

add_ligand_efficiency_metrics(dataset)
calculate_ligand_efficiency_metrics(dataset)
sanity_checks.check_ligand_efficiency_metrics(dataset.df_result)

add_atc_classification(dataset, chembl_con)
atc_levels = get_atc_classification(chembl_con)
dataset.atc_levels = atc_levels
dataset.df_result = dataset.df_result.merge(
atc_levels, on="parent_molregno", how="left"
)
sanity_checks.check_atc(dataset.df_result, atc_levels)
104 changes: 73 additions & 31 deletions src/add_chembl_target_class_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import write_subsets
from arguments import OutputArgs, CalculationArgs
from dataset import Dataset
import sanity_checks


########### Add Target Class Annotations Based on ChEMBL Data ###########
Expand Down Expand Up @@ -80,44 +81,31 @@ def get_target_class_table(
return df_target_classes


def add_chembl_target_class_annotations(
def get_aggregated_target_classes(
dataset: Dataset,
chembl_con: sqlite3.Connection,
args: CalculationArgs,
out: OutputArgs,
):
) -> tuple[pd.DataFrame, pd.DataFrame]:
"""
Add level 1 and 2 target class annotations.
Assignments for target IDs with more than one target class assignment per level
are summarised into one string with '|' as a separator
between the different target class annotations.
Targets with more than one level 1 / level 2 target class assignment are written to a file.
These could be reassigned by hand if a single target class is preferable.
Get mappings for target id to aggregated level 1 / level 2 target class.
:param dataset: Dataset with compound-target pairs.
Will be updated to only include target class annotations.
dataset.target_classes_level1 will be set to
pandas DataFrame with mapping from target id to level 1 target class
dataset.target_classes_level2 will be set to
pandas DataFrame with mapping from target id to level 2 target class
:type dataset: Dataset
:param chembl_con: Sqlite3 connection to ChEMBL database.
:type chembl_con: sqlite3.Connection
:param args: Arguments related to how to calculate the dataset
:type args: CalculationArgs
:param out: Arguments related to how to output the dataset
:type out: OutputArgs
:return: [pandas DataFrame with mapping from target id to level 1 target class,
pandas DataFrame with mapping from target id to level 2 target class]
:rtype: tuple[pd.DataFrame, pd.DataFrame]
"""
current_tids = set(dataset.df_result["tid"])
df_target_classes = get_target_class_table(chembl_con, current_tids)

between_str_join = "|"

# Summarise the information for a target id with
# several assigned target classes of level 1 into one description.
# If a target id has more than one assigned target class,
# the target class 'Unclassified protein' is discarded.
level = "l1"
between_str_join = "|"
target_classes_level1 = df_target_classes[["tid", level]].drop_duplicates().dropna()

# remove 'Unclassified protein' from targets with more than one target class, level 1
Expand Down Expand Up @@ -145,10 +133,6 @@ def add_chembl_target_class_annotations(
["tid", "target_class_l1"]
].drop_duplicates()

dataset.df_result = dataset.df_result.merge(
target_classes_level1, on="tid", how="left"
)

# Repeat the summary step for target classes of level 2.
level = "l2"
target_classes_level2 = df_target_classes[["tid", level]].drop_duplicates().dropna()
Expand All @@ -159,11 +143,24 @@ def add_chembl_target_class_annotations(
["tid", "target_class_l2"]
].drop_duplicates()

dataset.df_result = dataset.df_result.merge(
target_classes_level2, on="tid", how="left"
)
return target_classes_level1, target_classes_level2


# Output targets have more than one target class assignment
def output_ambiguous_target_classes(
dataset: Dataset,
args: CalculationArgs,
out: OutputArgs,
):
"""
Output targets that have more than one target class assignment.
:param dataset: Dataset with compound-target pairs.
:type dataset: Dataset
:param args: Arguments related to how to calculate the dataset
:type args: CalculationArgs
:param out: Arguments related to how to output the dataset
:type out: OutputArgs
"""
more_than_one_level_1 = dataset.df_result[
(dataset.df_result["target_class_l1"].notnull())
& (dataset.df_result["target_class_l1"].str.contains("|", regex=False))
Expand Down Expand Up @@ -203,5 +200,50 @@ def add_chembl_target_class_annotations(
out,
)

dataset.target_classes_level1 = target_classes_level1
dataset.target_classes_level2 = target_classes_level2

def add_chembl_target_class_annotations(
    dataset: Dataset,
    chembl_con: sqlite3.Connection,
    args: CalculationArgs,
    out: OutputArgs,
):
    """
    Add level 1 and 2 target class annotations.
    Assignments for target IDs with more than one target class assignment per level
    are summarised into one string with '|' as a separator
    between the different target class annotations.
    Targets with more than one level 1 / level 2 target class assignment are written to a file.
    These could be reassigned by hand if a single target class is preferable.
    The target id to target class mappings are only used locally;
    they are not stored on the dataset.

    :param dataset: Dataset with compound-target pairs.
        dataset.df_result will be updated to include the columns
        target_class_l1 and target_class_l2 (left-merged on tid).
    :type dataset: Dataset
    :param chembl_con: Sqlite3 connection to ChEMBL database.
    :type chembl_con: sqlite3.Connection
    :param args: Arguments related to how to calculate the dataset
    :type args: CalculationArgs
    :param out: Arguments related to how to output the dataset
    :type out: OutputArgs
    """
    # Mappings from target id to the aggregated level 1 / level 2 target class.
    target_classes_level1, target_classes_level2 = get_aggregated_target_classes(
        dataset, chembl_con
    )

    # Left merges keep every compound-target pair,
    # including pairs whose target has no class annotation.
    dataset.df_result = dataset.df_result.merge(
        target_classes_level1, on="tid", how="left"
    )

    dataset.df_result = dataset.df_result.merge(
        target_classes_level2, on="tid", how="left"
    )

    # Check the merged result against the mappings it was built from.
    sanity_checks.check_target_classes(
        dataset.df_result, target_classes_level1, target_classes_level2
    )

    # Requires the target_class_l1 / target_class_l2 columns added above.
    output_ambiguous_target_classes(dataset, args, out)
4 changes: 4 additions & 0 deletions src/add_dti_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,27 +80,31 @@ def add_dti_annotations(
),
"DTI",
] = "D_DT"

dataset.df_result.loc[
(
dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set)
& (dataset.df_result["max_phase"] == 3)
),
"DTI",
] = "C3_DT"

dataset.df_result.loc[
(
dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set)
& (dataset.df_result["max_phase"] == 2)
),
"DTI",
] = "C2_DT"

dataset.df_result.loc[
(
dataset.df_result["cpd_target_pair"].isin(dataset.drug_mechanism_pairs_set)
& (dataset.df_result["max_phase"] == 1)
),
"DTI",
] = "C1_DT"

# Compounds that are in the drug_mechanism table but don't have a known phase between 1-4:
dataset.df_result.loc[
(
Expand Down
2 changes: 2 additions & 0 deletions src/add_rdkit_compound_descriptors.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from tqdm import tqdm

from dataset import Dataset
import sanity_checks


def add_built_in_descriptors(dataset: Dataset):
Expand Down Expand Up @@ -168,3 +169,4 @@ def add_rdkit_compound_descriptors(dataset: Dataset):
"""
add_built_in_descriptors(dataset)
add_aromaticity_descriptors(dataset)
sanity_checks.check_rdkit_props(dataset.df_result)
4 changes: 2 additions & 2 deletions src/clean_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from dataset import Dataset


########### Remove Irrelevant Compounds ###########
def remove_compounds_without_smiles_and_mixtures(
dataset: Dataset, chembl_con: sqlite3.Connection
):
Expand Down Expand Up @@ -96,9 +97,8 @@ def remove_compounds_without_smiles_and_mixtures(
)
]

return dataset.df_result


########### General Cleaning Steps ###########
def clean_none_values(dataset: Dataset):
"""
Change nan values and empty strings to None for consistency.
Expand Down
15 changes: 4 additions & 11 deletions src/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,15 @@
class Dataset:
"""
df_result: Pandas DataFrame with the full dataset
drug_mechanism_pairs_set: Set of compound-target pairs in the drug_mechanism table,
used for DTI assignments
drug_mechanism_targets_set: Set of targets in the drug_mechanism table,
used for DTI assigments
df_sizes_all: List of intermediate sizes of the dataset used for debugging
df_sizes_pchembl: List of intermediate sizes of the dataset used for debugging
drug_mechanism_pairs_set: Set of compound-target pairs in the drug_mechanism table
drug_mechanism_targets_set: Set of targets in the drug_mechanism table
df_cpd_props: Pandas DataFrame with compound properties and
structures for all compound ids in ChEMBL
atc_levels: Pandas DataFrame with ATC annotations in ChEMBL
target_classes_level1: Pandas DataFrame with mapping from target id to level 1 target class
target_classes_level2: Pandas DataFrame with mapping from target id to level 2 target class
"""

df_result: pd.DataFrame
df_cpd_props: pd.DataFrame
atc_levels: pd.DataFrame
target_classes_level1: pd.DataFrame
target_classes_level2: pd.DataFrame
drug_mechanism_pairs_set: set
drug_mechanism_targets_set: set
df_sizes_all: list[int]
Expand Down
Loading

0 comments on commit fe2a49a

Please sign in to comment.