Merge pull request #144 from jyaacoub/final-paper
Final paper
jyaacoub authored Oct 28, 2024
2 parents a555663 + 6617610 commit fac38ef
Showing 28 changed files with 838,789 additions and 125 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -228,3 +228,4 @@ splits/**/*.csv
results/*/model_media/*/train_log/*.json
results/model_checkpoints.tar.gz
*.out
DDP_outs
74 changes: 74 additions & 0 deletions SBATCH/DDP_train_folds.sh
@@ -0,0 +1,74 @@
#!/bin/bash
# WARNING: DDP submission doesn't work with ARRAYS!
#SBATCH -t 20
#SBATCH --job-name=DDP_sub-folds #NOTE: change this and fold_selection arg below

#SBATCH --mail-type=FAIL
##SBATCH [email protected]

#SBATCH -c 4
#SBATCH --mem=4G

#SBATCH --output=./outs/%x.out
##SBATCH --array=0-4 # WARNING: Doesn't work with submitit

module load StdEnv/2020 && module load gcc/9.3.0 && module load arrow/12.0.1
ROOT_DIR="/lustre06/project/6069023"
cd ${ROOT_DIR}/jyaacoub/MutDTA/

source .venv/bin/activate
# 3 days == 4320 mins
# --batch_size is the local (per-GPU) batch size
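# e.g. with -s_ng 4 GPUs (set below) and --batch_size 40, the effective global batch size is 4 * 40 = 160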
for fold in {0..4}
do
echo "------------ FOLD $fold ------------"
# pdbbind_esm
python -u train_test_DDP.py --model_opt GVPL_ESM \
--data_opt PDBbind \
\
--feature_opt nomsa \
--edge_opt binary \
--ligand_feature_opt gvp \
--ligand_edge_opt binary \
\
--learning_rate 0.0001 \
--batch_size 40 \
\
--dropout 0.2328 \
--dropout_prot 0.0 \
--output_dim 128 \
--num_GVPLlayers 3 \
--pro_dropout_gnn 0.1582 \
--pro_extra_fc_lyr False \
--pro_emb_dim 512 \
\
--train \
--fold_selection $fold \
--num_epochs 2000 \
-odir ./DDP_outs/pdbbind_gvpl_esm/%j \
-s_t 4320 -s_m 18GB -s_nn 1 -s_ng 4 -s_cp 4
# # kiba_esm
# python -u train_test_DDP.py --model_opt EDI \
# --data_opt kiba \
# \
# --feature_opt nomsa \
# --edge_opt binary \
# --ligand_feature_opt original \
# --ligand_edge_opt binary \
# \
# --learning_rate 0.0001 \
# --batch_size 12 \
# \
# --dropout 0.4 \
# --dropout_prot 0.0 \
# --output_dim 128 \
# --pro_extra_fc_lyr False \
# --pro_emb_dim 512 \
# \
# --train \
# --fold_selection 0 \
# --num_epochs 2000 \
# -odir ./slurm_outs/kiba_esm/%j \
# -s_t 4320 -s_m 18GB -s_nn 1 -s_ng 4 -s_cp 4
done
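
# Hypothetical submission (path per this PR's layout):
#   sbatch SBATCH/DDP_train_folds.sh
# This submits one DDP job per fold in sequence; per the warnings above, do NOT
# convert the loop into a SLURM job array (arrays don't work with submitit here).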

150 changes: 150 additions & 0 deletions SBATCH/MAIN_SCRIPT.sh
@@ -0,0 +1,150 @@
#!/bin/bash
#SBATCH -t 15
#SBATCH --job-name=AlphaFlow_inference
#SBATCH --cpus-per-task=8
#SBATCH --mem=4G
#SBATCH --output=./%x_%A.out
# THIS SCRIPT RUNS MSA INPUTS THROUGH ALPHAFLOW, THEN RUNS INFERENCE WITH A PRETRAINED MODEL
# REQUIRES:
# - MSA dir full of relevant .a3m (or .aln) files for alphaflow
# - ligand sdf file

ROOT_DIR="/lustre06/project/6069023"
proj_dir="${ROOT_DIR}/jyaacoub/" # contains the relevant alphaflow/ and MutDTA/ repos with their venvs
MSA_DIRECTORY_PATH="${proj_dir}/MutDTA/SBATCH/samples/input/alphaflow/msa_input"
# MSA is needed as input so that we can run it through alphaflow to get our structure predictions
# MSA input format must match one of the following, depending on its extension:
# 1. ".aln" - plain protein sequences (one per line), with the first line being the target.
# 2. ".a3m" - normal MSA format ('>' header line preceding each protein sequence),
#             with the second line being the target sequence - no blank lines allowed.
#
# NOTE: this will run all MSAs (files ending in .a3m or .aln) in this directory through alphaflow
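#
# Illustrative examples (hypothetical sequences, shown only to demonstrate the layout above):
#   example.a3m:
#     >target
#     MKTAYIAKQR
#     >homolog_1
#     MKTAYIG-QR
#   example.aln:
#     MKTAYIAKQR
#     MKTAYIG-QR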

OUT_DIR="${ROOT_DIR}/MutDTA_outputs/" # will be populated with alphaflow_io/ and the results csv file
SBATCH_OUTS="${OUT_DIR}/SBATCH_OUTS/"
LIGAND_SDF_FILE="${proj_dir}/MutDTA/SBATCH/samples/input/inference/VWW_ideal.sdf" # SDF file is needed for GVPL features
MODEL_OPT="PDBbind_gvpl_aflow"
# select from one of the following:
# davis_DG, davis_gvpl, davis_esm,
# kiba_DG, kiba_esm, kiba_gvpl,
# PDBbind_DG, PDBbind_esm, PDBbind_gvpl, PDBbind_gvpl_aflow

N_CONFORMATIONS=10 # number of conformations for alphaflow to generate
ALPHAFLOW_TIME="3:00:00" # for 1 protein it should not take longer than an hour or so

mkdir -p ${SBATCH_OUTS}

###########################################
# ALPHAFLOW STEP:
###########################################
io_dir="${OUT_DIR}/alphaflow_io/" #PATH TO INPUT_CSVs DIR

# Check if Alphaflow output already exists
if [ -d "${io_dir}/out_pdb" ] && [ "$(ls -A ${io_dir}/out_pdb)" ]; then
echo "Alphaflow output already exists, skipping Alphaflow preparation and submission steps."
skip_alphaflow=true
else
skip_alphaflow=false
fi

if [ "$skip_alphaflow" = false ]; then
echo "Loading up Python virtual environment for Alphaflow prepare input."
cd "${proj_dir}/alphaflow"
source ".venv-ds/bin/activate"

echo "preparing msa inputs"
python -u prepare_input.py $MSA_DIRECTORY_PATH $io_dir

if [[ $? -ne 0 ]]; then
echo -e "\nERROR: non-zero exit for prepare_input.py"
exit 1
fi

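# sbatch prints "Submitted batch job <ID>"; the awk '{print $4}' below captures
# <ID> so the inference step can be made dependent on this job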
alphaflow_job_id=$(sbatch << EOF | awk '{print $4}'
#!/bin/bash
#SBATCH -t ${ALPHAFLOW_TIME}
#SBATCH --job-name=AlphaFlow
#SBATCH --gpus-per-node=a100:1 # If running into CUDA MEM errors increase this to 2
#SBATCH --cpus-per-task=2
#SBATCH --mem=8G
#SBATCH --output=${SBATCH_OUTS}/alphaflow/%x_%j.out
cd "${proj_dir}/alphaflow"
source ".venv-ds/bin/activate"
# For LMA use only, defaults are:
export DEFAULT_LMA_Q_CHUNK_SIZE=512 # 1024
export DEFAULT_LMA_KV_CHUNK_SIZE=2048 # 4096
export CUDA_LAUNCH_BLOCKING=1
# Determine the number of GPUs available
world_size=\$(nvidia-smi --list-gpus | wc -l)
# runs deepspeed if there are two or more GPUs requested
run_command() {
if [[ \$world_size -ge 2 ]]; then
deepspeed --num_gpus \$world_size predict_deepspeed.py --world_size \$world_size \$@
else
python predict.py \$@
fi
}
run_command --mode alphafold \
--input_csv "${io_dir}/input.csv" \
--msa_dir "${io_dir}/msa_dir" \
--weights "weights/alphaflow_md_distilled_202402.pt" \
--outpdb "${io_dir}/out_pdb" \
--samples ${N_CONFORMATIONS} \
--no_overwrite \
--no_diffusion \
--noisy_first
EOF
)

else
echo "Proceeding directly to the inference step."
alphaflow_job_id=""
fi


###########################################
# INFERENCE STEP:
###########################################

echo "Submitting second job for running through the selected model."

if [ -z "$alphaflow_job_id" ]; then
dependency_line=""
else
dependency_line="#SBATCH --dependency=afterok:${alphaflow_job_id}"
fi
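# afterok: the inference array starts only if the Alphaflow job finishes successfully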

sbatch << EOF
#!/bin/bash
#SBATCH -t 30:00
#SBATCH --job-name=run_inference
#SBATCH --mem=10G
#SBATCH --gpus-per-node=a100:1
#SBATCH --cpus-per-task=4
#SBATCH --output=${SBATCH_OUTS}/%x_%a.out
#SBATCH --array=0-4
${dependency_line}
CSV_OUT="${OUT_DIR}/inference_test.csv" # this is used for outputs
module load StdEnv/2020 && module load gcc/9.3.0 && module load arrow/12.0.1
cd ${proj_dir}/MutDTA
source .venv/bin/activate
python -u inference.py \
--ligand_sdf ${LIGAND_SDF_FILE} \
--pdb_files ${io_dir}/out_pdb/*.pdb \
--csv_out ${CSV_OUT} \
--model_opt ${MODEL_OPT} \
--fold \${SLURM_ARRAY_TASK_ID} \
--ligand_id "1a30_ligand" \
--batch_size 8
EOF
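
# Hypothetical end-to-end usage (path per this PR's layout):
#   sbatch SBATCH/MAIN_SCRIPT.sh
# This submits the Alphaflow job (if its outputs don't already exist) followed by a
# 5-task inference array (one fold per task) that writes predictions to
# ${OUT_DIR}/inference_test.csv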
File renamed without changes.
37 changes: 37 additions & 0 deletions SBATCH/run_inference.sh
@@ -0,0 +1,37 @@
#!/bin/bash
#SBATCH -t 30:00
#SBATCH --job-name=run_inference
#SBATCH --mem=10G

#SBATCH --gpus-per-node=a100:1
#SBATCH --cpus-per-task=4

#SBATCH --output=./outs/%x_%a.out
#SBATCH --array=0
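# NOTE: to run inference with all five model folds, set --array=0-4 (as MAIN_SCRIPT.sh does)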

ROOT_DIR="/lustre06/project/6069023"
CSV_OUT="${ROOT_DIR}/jyaacoub/MutDTA/SBATCH/samples/out/inference_test.csv" # this is used for outputs
proj_dir="${ROOT_DIR}/jyaacoub/MutDTA"

module load StdEnv/2020 && module load gcc/9.3.0 && module load arrow/12.0.1
cd ${proj_dir}
source .venv/bin/activate

# NOTE: To get a SMILES string from a .mol2 or .sdf file you can use RDKit:
#
# from rdkit import Chem
# Chem.MolToSmiles(Chem.MolFromMol2File(x), isomericSmiles=False)
# OR just pass in an sdf file and the script will extract the SMILES
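#
# A sketch for reading an .sdf directly (SDMolSupplier is standard RDKit; the
# filename is hypothetical):
#
# from rdkit import Chem
# mol = next(Chem.SDMolSupplier("ligand.sdf"))  # first molecule in the file
# print(Chem.MolToSmiles(mol, isomericSmiles=False))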

python -u inference.py \
--ligand_sdf "${proj_dir}/SBATCH/samples/input/inference/VWW_ideal.sdf" \
--pdb_files ${proj_dir}/SBATCH/samples/input/inference/alphaflow_pdbs/*.pdb \
--csv_out ${CSV_OUT} \
--model_opt PDBbind_gvpl_aflow \
--fold ${SLURM_ARRAY_TASK_ID} \
--ligand_id "1a30_ligand" \
--batch_size 8
# batch_size is used when we have multiple input pdbs, to speed up inference
# ./results/model_checkpoints/ours/GVPLM_PDBbind0D_nomsaF_binaryE_128B_0.0002LR_0.2D_2000E_gvpLF_binaryLE.model
# ./results/model_checkpoints/ours/GVPLM_PDBbind0D_nomsaF_aflowE_128B_0.00022659LR_0.02414D_2000E_gvpLF_binaryLE.model
# ./results/model_checkpoints/ours/GVPLM_PDBbind0D_nomsaF_binaryE_128B_0.0002LR_0.46616D_2000E_gvpLF_binaryLE.model_tmp
12 changes: 7 additions & 5 deletions SBATCH/run_mutagenesis.sh
@@ -6,7 +6,7 @@
#SBATCH --gpus-per-node=a100:1
#SBATCH --cpus-per-task=4

#SBATCH --output=./%x_%a.out
#SBATCH --output=./outs/%x_%a.out
#SBATCH --array=0

# runs across all folds for a model
@@ -15,14 +15,16 @@
# Then, to get the most accurate mutagenesis results, you can average these matrices
# and visualize them with src.analysis.mutagenesis_plot.plot_sequence
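# Rough sketch of that averaging step (the fold output filenames and the exact
# plot_sequence signature are assumptions, not verified against this repo):
#   import numpy as np
#   from src.analysis.mutagenesis_plot import plot_sequence
#   mats = np.stack([np.load(f"fold{i}_mut_matrix.npy") for i in range(5)])
#   plot_sequence(mats.mean(axis=0))  # hypothetical call; check the real signature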
ROOT_DIR="/lustre06/project/6069023"
SCRATCH_DIR="/lustre07/scratch/jyaacoub/" # this is used for outputs on narval
OUT_DIR="${ROOT_DIR}/jyaacoub/MutDTA/SBATCH/outs/mutagenesis_tests" # this is used for outputs
BIN_DIR="${ROOT_DIR}/jyaacoub/bin" # for modeller
proj_dir="${ROOT_DIR}/jyaacoub/MutDTA"

module load StdEnv/2020 && module load gcc/9.3.0 && module load arrow/12.0.1
# Modeller is needed for this to run... (see: Generic install - https://salilab.org/modeller/10.5/release.html#unix)
export PYTHONPATH="${PYTHONPATH}:${BIN_DIR}/modeller10.5/lib/x86_64-intel8/python3.3:${BIN_DIR}/modeller10.5/lib/x86_64-intel8:${BIN_DIR}/modeller10.5/modlib"
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${BIN_DIR}/modeller10.5/lib/x86_64-intel8"
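# Quick sanity check that modeller resolves on these paths (a valid license key
# must already be configured): python -c "import modeller"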

cd ${ROOT_DIR}/jyaacoub/MutDTA
cd ${proj_dir}
source .venv/bin/activate

# NOTE: To get a SMILES string from a .mol2 or .sdf file you can use RDKit:
@@ -34,8 +36,8 @@ source .venv/bin/activate
python -u run_mutagenesis.py \
--ligand_smile "CC(C)CC(NC(=O)C(CC(=O)[O-])NC(=O)C([NH3+])CCC(=O)[O-])C(=O)[O-]" \
--ligand_smile_name "1a30_ligand" \
--pdb_file "${ROOT_DIR}/jyaacoub/data/kiba/alphaflow_io/out_pdb_MD-distilled/P67870.pdb" \
--out_path "${SCRATCH_DIR}/mutagenesis_tests/" \
--pdb_file "${proj_dir}/SBATCH/samples/input/mutagenesis/P67870.pdb" \
--out_path "${OUT_DIR}/" \
--res_start 0 \
--res_end 5 \
--model_opt davis_esm \
2 changes: 1 addition & 1 deletion SBATCH/run_platinum.sh
@@ -6,7 +6,7 @@
#SBATCH --gpus-per-node=a100:1
#SBATCH --cpus-per-task=4

#SBATCH --output=./%x_%a.out
#SBATCH --output=./outs/%x_%a.out
#SBATCH --array=0

# runs across all folds for a model