Skip to content

Commit

Permalink
fix(datasets): seq len cutoff #50
Browse files Browse the repository at this point in the history
Was running into memory issues with PDBbind, since 5 proteins in the dataset are over 2,000 amino acids in length.

#50
  • Loading branch information
jyaacoub committed Oct 31, 2023
1 parent fcce036 commit 0ab364e
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 12 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,6 @@ lib/mgltools_x86_64Linux2_1.5.7/MGLToolsPckgs/AutoDockTools/Utilities24/*
lib/mgltools_x86_64Linux2_1.5.7p1.tar.gz

log_test/
slurm_tests/
slurm_out_DDP/
/*.sh
results/model_checkpoints/ours/*.model*
32 changes: 27 additions & 5 deletions playground.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,31 @@
# %%
# Inspect protein sequence lengths in the PDBbind dataset to choose a
# max_seq_len cutoff that avoids out-of-memory issues from very long proteins.
from src.data_processing.datasets import PDBbindDataset
from src.utils import config as cfg
import pandas as pd
import matplotlib.pyplot as plt

# d0 = pd.read_csv(f'{cfg.DATA_ROOT}/DavisKibaDataset/davis/nomsa_anm/full/XY.csv', index_col=0)
d0 = pd.read_csv(f'{cfg.DATA_ROOT}/PDBbindDataset/nomsa_anm/full/XY.csv', index_col=0)

# per-row protein sequence length
d0['len'] = d0.prot_seq.str.len()

# %%
n, bins, patches = plt.hist(d0['len'], bins=20)
# Set labels and title
plt.xlabel('Protein Sequence length')
plt.ylabel('Frequency')
# fix: data above is PDBbind, not davis
plt.title('Histogram of Protein Sequence length (PDBbind)')

# Add counts to each bin
for count, x, patch in zip(n, bins, patches):
    plt.text(x + 0.5, count, str(int(count)), ha='center', va='bottom')

# How many entries (and unique proteins) would a 1500-residue cutoff remove?
cutoff = 1500
print(f"Eliminating codes above {cutoff} length would reduce the dataset by: {len(d0[d0['len'] > cutoff])}")
print(f"\t - Eliminates {len(d0[d0['len'] > cutoff].index.unique())} unique proteins")

# %% -d PDBbind -f nomsa -e anm
# Sanity-check that the dataset loads with the new cutoff applied.
from src.utils.loader import Loader
d1 = Loader.load_dataset('PDBbind', 'nomsa', 'anm')

# %%
print('test')
8 changes: 4 additions & 4 deletions src/data_processing/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,10 @@
# See: https://pytorch-geometric.readthedocs.io/en/latest/tutorial/create_dataset.html
# for details on how to create a dataset
class BaseDataset(torchg.data.InMemoryDataset, abc.ABC):
FEATURE_OPTIONS = cfg.PRO_FEAT_OPT
EDGE_OPTIONS = cfg.EDGE_OPT
LIGAND_FEATURE_OPTIONS = cfg.LIG_FEAT_OPT
FEATURE_OPTIONS = cfg.PRO_FEAT_OPT
LIGAND_EDGE_OPTIONS = cfg.LIG_EDGE_OPT
LIGAND_FEATURE_OPTIONS = cfg.LIG_FEAT_OPT

def __init__(self, save_root:str, data_root:str, aln_dir:str,
cmap_threshold:float, feature_opt='nomsa',
Expand Down Expand Up @@ -92,7 +92,7 @@ def __init__(self, save_root:str, data_root:str, aln_dir:str,
self.data_root = data_root
self.cmap_threshold = cmap_threshold
self.overwrite = overwrite
max_seq_len = 100000 or max_seq_len
max_seq_len = max_seq_len or 100000
assert max_seq_len >= 100, 'max_seq_len cant be smaller than 100.'
self.max_seq_len = max_seq_len

Expand Down Expand Up @@ -383,7 +383,7 @@ def process(self):


class PDBbindDataset(BaseDataset): # InMemoryDataset is used if the dataset is small and can fit in CPU memory
def __init__(self, save_root=f'{cfg.DATA_ROOT}/PDBbindDataset/nomsa',
def __init__(self, save_root=f'{cfg.DATA_ROOT}/PDBbindDataset',
data_root=f'{cfg.DATA_ROOT}/v2020-other-PL',
aln_dir=None,
cmap_threshold=8.0, feature_opt='nomsa', *args, **kwargs):
Expand Down
6 changes: 4 additions & 2 deletions src/utils/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,8 @@ def load_dataset(data:str, pro_feature:str, edge_opt:str, subset:str=None, path:
subset=subset,
af_conf_dir='../colabfold/pdbbind_af2_out/out0',
ligand_feature=ligand_feature,
ligand_edge=ligand_edge
ligand_edge=ligand_edge,
max_seq_len=1500
)
elif data in ['davis', 'kiba']:
dataset = DavisKibaDataset(
Expand All @@ -123,7 +124,8 @@ def load_dataset(data:str, pro_feature:str, edge_opt:str, subset:str=None, path:
edge_opt=edge_opt,
subset=subset,
ligand_feature=ligand_feature,
ligand_edge=ligand_edge
ligand_edge=ligand_edge,
max_seq_len=1500
)
else:
raise Exception(f'Invalid data option, pick from {Loader.data_opt}')
Expand Down

0 comments on commit 0ab364e

Please sign in to comment.