Merge pull request #144 from jyaacoub/final-paper
Final paper
jyaacoub authored Oct 28, 2024
2 parents a555663 + 6617610 commit fac38ef
Showing 28 changed files with 838,789 additions and 125 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -228,3 +228,4 @@ splits/**/*.csv
results/*/model_media/*/train_log/*.json
results/model_checkpoints.tar.gz
*.out
DDP_outs
74 changes: 74 additions & 0 deletions SBATCH/DDP_train_folds.sh
@@ -0,0 +1,74 @@
#!/bin/bash
# WARNING: DDP submission doesn't work with ARRAYS!
#SBATCH -t 20
#SBATCH --job-name=DDP_sub-folds #NOTE: change this and fold_selection arg below

#SBATCH --mail-type=FAIL
##SBATCH [email protected]

#SBATCH -c 4
#SBATCH --mem=4G

#SBATCH --output=./outs/%x.out
##SBATCH --array=0-4 # WARNING: Doesn't work with submitit

module load StdEnv/2020 && module load gcc/9.3.0 && module load arrow/12.0.1
ROOT_DIR="/lustre06/project/6069023"
cd ${ROOT_DIR}/jyaacoub/MutDTA/

source .venv/bin/activate
# 3 days == 4320 mins
# --batch_size is the local (per-GPU) batch size
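# e.g. with -s_ng 4 GPUs (set below) and --batch_size 40, the effective global batch size is 4 * 40 = 160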
for fold in {0..4}
do
echo "------------ FOLD $fold ------------"
# pdbbind_esm
python -u train_test_DDP.py --model_opt GVPL_ESM \
--data_opt PDBbind \
\
--feature_opt nomsa \
--edge_opt binary \
--ligand_feature_opt gvp \
--ligand_edge_opt binary \
\
--learning_rate 0.0001 \
--batch_size 40 \
\
--dropout 0.2328 \
--dropout_prot 0.0 \
--output_dim 128 \
--num_GVPLlayers 3 \
--pro_dropout_gnn 0.1582 \
--pro_extra_fc_lyr False \
--pro_emb_dim 512 \
\
--train \
--fold_selection $fold \
--num_epochs 2000 \
-odir ./DDP_outs/pdbbind_gvpl_esm/%j \
-s_t 4320 -s_m 18GB -s_nn 1 -s_ng 4 -s_cp 4
# # kiba_esm
# python -u train_test_DDP.py --model_opt EDI \
# --data_opt kiba \
# \
# --feature_opt nomsa \
# --edge_opt binary \
# --ligand_feature_opt original \
# --ligand_edge_opt binary \
# \
# --learning_rate 0.0001 \
# --batch_size 12 \
# \
# --dropout 0.4 \
# --dropout_prot 0.0 \
# --output_dim 128 \
# --pro_extra_fc_lyr False \
# --pro_emb_dim 512 \
# \
# --train \
# --fold_selection 0 \
# --num_epochs 2000 \
# -odir ./slurm_outs/kiba_esm/%j \
# -s_t 4320 -s_m 18GB -s_nn 1 -s_ng 4 -s_cp 4
done
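
# Hypothetical submission (path per this PR's layout):
#   sbatch SBATCH/DDP_train_folds.sh
# This submits one DDP job per fold in sequence; per the warnings above, do NOT
# convert the loop into a SLURM job array (arrays don't work with submitit here).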

150 changes: 150 additions & 0 deletions SBATCH/MAIN_SCRIPT.sh
@@ -0,0 +1,150 @@
#!/bin/bash
#SBATCH -t 15
#SBATCH --job-name=AlphaFlow_inference
#SBATCH --cpus-per-task=8
#SBATCH --mem=4G
#SBATCH --output=./%x_%A.out
# THIS SCRIPT RUNS MSA INPUTS THROUGH ALPHAFLOW, THEN RUNS INFERENCE WITH A PRETRAINED MODEL
# REQUIRES:
# - MSA dir full of relevant .a3m (or .aln) files for alphaflow
# - ligand sdf file

ROOT_DIR="/lustre06/project/6069023"
proj_dir="${ROOT_DIR}/jyaacoub/" # contains the relevant alphaflow/ and MutDTA/ repos with their venvs
MSA_DIRECTORY_PATH="${proj_dir}/MutDTA/SBATCH/samples/input/alphaflow/msa_input"
# MSA is needed as input so that we can run it through alphaflow to get our structure predictions
# MSA input format must match one of the following, depending on its extension:
# 1. ".aln" - plain protein sequences (one per line), with the first line being the target.
# 2. ".a3m" - normal MSA format ('>' header line preceding each protein sequence),
#             with the second line being the target sequence - no blank lines allowed.
#
# NOTE: this will run all MSAs (files ending in .a3m or .aln) in this directory through alphaflow
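#
# Illustrative examples (hypothetical sequences, shown only to demonstrate the layout above):
#   example.a3m:
#     >target
#     MKTAYIAKQR
#     >homolog_1
#     MKTAYIG-QR
#   example.aln:
#     MKTAYIAKQR
#     MKTAYIG-QR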

OUT_DIR="${ROOT_DIR}/MutDTA_outputs/" # will be populated with alphaflow_io/ and the results csv file
SBATCH_OUTS="${OUT_DIR}/SBATCH_OUTS/"
LIGAND_SDF_FILE="${proj_dir}/MutDTA/SBATCH/samples/input/inference/VWW_ideal.sdf" # SDF file is needed for GVPL features
MODEL_OPT="PDBbind_gvpl_aflow"
# select from one of the following:
# davis_DG, davis_gvpl, davis_esm,
# kiba_DG, kiba_esm, kiba_gvpl,
# PDBbind_DG, PDBbind_esm, PDBbind_gvpl, PDBbind_gvpl_aflow

N_CONFORMATIONS=10 # number of conformations for alphaflow to generate
ALPHAFLOW_TIME="3:00:00" # for 1 protein it should not take longer than an hour or so

mkdir -p ${SBATCH_OUTS}

###########################################
# ALPHAFLOW STEP:
###########################################
io_dir="${OUT_DIR}/alphaflow_io/" #PATH TO INPUT_CSVs DIR

# Check if Alphaflow output already exists
if [ -d "${io_dir}/out_pdb" ] && [ "$(ls -A ${io_dir}/out_pdb)" ]; then
echo "Alphaflow output already exists, skipping Alphaflow preparation and submission steps."
skip_alphaflow=true
else
skip_alphaflow=false
fi

if [ "$skip_alphaflow" = false ]; then
echo "Loading up Python virtual environment for Alphaflow prepare input."
cd "${proj_dir}/alphaflow"
source ".venv-ds/bin/activate"

echo "preparing msa inputs"
python -u prepare_input.py $MSA_DIRECTORY_PATH $io_dir

if [[ $? -ne 0 ]]; then
echo -e "\nERROR: non-zero exit for prepare_input.py"
exit 1
fi

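# sbatch prints "Submitted batch job <ID>"; the awk '{print $4}' below captures
# <ID> so the inference step can be made dependent on this job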
alphaflow_job_id=$(sbatch << EOF | awk '{print $4}'
#!/bin/bash
#SBATCH -t ${ALPHAFLOW_TIME}
#SBATCH --job-name=AlphaFlow
#SBATCH --gpus-per-node=a100:1 # If running into CUDA MEM errors increase this to 2
#SBATCH --cpus-per-task=2
#SBATCH --mem=8G
#SBATCH --output=${SBATCH_OUTS}/alphaflow/%x_%j.out
cd "${proj_dir}/alphaflow"
source ".venv-ds/bin/activate"
# For LMA use only, defaults are:
export DEFAULT_LMA_Q_CHUNK_SIZE=512 # 1024
export DEFAULT_LMA_KV_CHUNK_SIZE=2048 # 4096
export CUDA_LAUNCH_BLOCKING=1
# Determine the number of GPUs available
world_size=\$(nvidia-smi --list-gpus | wc -l)
# runs deepspeed if there are two or more GPUs requested
run_command() {
if [[ \$world_size -ge 2 ]]; then
deepspeed --num_gpus \$world_size predict_deepspeed.py --world_size \$world_size \$@
else
python predict.py \$@
fi
}
run_command --mode alphafold \
--input_csv "${io_dir}/input.csv" \
--msa_dir "${io_dir}/msa_dir" \
--weights "weights/alphaflow_md_distilled_202402.pt" \
--outpdb "${io_dir}/out_pdb" \
--samples ${N_CONFORMATIONS} \
--no_overwrite \
--no_diffusion \
--noisy_first
EOF
)

else
echo "Proceeding directly to the inference step."
alphaflow_job_id=""
fi


###########################################
# INFERENCE STEP:
###########################################

echo "Submitting second job for running through the selected model."

if [ -z "$alphaflow_job_id" ]; then
dependency_line=""
else
dependency_line="#SBATCH --dependency=afterok:${alphaflow_job_id}"
fi
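# afterok: the inference array starts only if the Alphaflow job finishes successfully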

sbatch << EOF
#!/bin/bash
#SBATCH -t 30:00
#SBATCH --job-name=run_inference
#SBATCH --mem=10G
#SBATCH --gpus-per-node=a100:1
#SBATCH --cpus-per-task=4
#SBATCH --output=${SBATCH_OUTS}/%x_%a.out
#SBATCH --array=0-4
${dependency_line}
CSV_OUT="${OUT_DIR}/inference_test.csv" # this is used for outputs
module load StdEnv/2020 && module load gcc/9.3.0 && module load arrow/12.0.1
cd ${proj_dir}/MutDTA
source .venv/bin/activate
python -u inference.py \
--ligand_sdf ${LIGAND_SDF_FILE} \
--pdb_files ${io_dir}/out_pdb/*.pdb \
--csv_out ${CSV_OUT} \
--model_opt ${MODEL_OPT} \
--fold \${SLURM_ARRAY_TASK_ID} \
--ligand_id "1a30_ligand" \
--batch_size 8
EOF
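
# Hypothetical end-to-end usage (path per this PR's layout):
#   sbatch SBATCH/MAIN_SCRIPT.sh
# This submits the Alphaflow job (if its outputs don't already exist) followed by a
# 5-task inference array (one fold per task) that writes predictions to
# ${OUT_DIR}/inference_test.csv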
File renamed without changes.
37 changes: 37 additions & 0 deletions SBATCH/run_inference.sh
@@ -0,0 +1,37 @@
#!/bin/bash
#SBATCH -t 30:00
#SBATCH --job-name=run_inference
#SBATCH --mem=10G

#SBATCH --gpus-per-node=a100:1
#SBATCH --cpus-per-task=4

#SBATCH --output=./outs/%x_%a.out
#SBATCH --array=0
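# NOTE: to run inference with all five model folds, set --array=0-4 (as MAIN_SCRIPT.sh does)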

ROOT_DIR="/lustre06/project/6069023"
CSV_OUT="${ROOT_DIR}/jyaacoub/MutDTA/SBATCH/samples/out/inference_test.csv" # this is used for outputs
proj_dir="${ROOT_DIR}/jyaacoub/MutDTA"

module load StdEnv/2020 && module load gcc/9.3.0 && module load arrow/12.0.1
cd ${proj_dir}
source .venv/bin/activate

# NOTE: To get a SMILES string from a .mol2 or .sdf file you can use RDKit:
#
# from rdkit import Chem
# Chem.MolToSmiles(Chem.MolFromMol2File(x), isomericSmiles=False)
# OR just pass in an sdf file and the script will extract the SMILES
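#
# A sketch for reading an .sdf directly (SDMolSupplier is standard RDKit; the
# filename is hypothetical):
#
# from rdkit import Chem
# mol = next(Chem.SDMolSupplier("ligand.sdf"))  # first molecule in the file
# print(Chem.MolToSmiles(mol, isomericSmiles=False))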

python -u inference.py \
--ligand_sdf "${proj_dir}/SBATCH/samples/input/inference/VWW_ideal.sdf" \
--pdb_files ${proj_dir}/SBATCH/samples/input/inference/alphaflow_pdbs/*.pdb \
--csv_out ${CSV_OUT} \
--model_opt PDBbind_gvpl_aflow \
--fold ${SLURM_ARRAY_TASK_ID} \
--ligand_id "1a30_ligand" \
--batch_size 8
# batch_size is used when we have multiple input pdbs, to speed up inference
# ./results/model_checkpoints/ours/GVPLM_PDBbind0D_nomsaF_binaryE_128B_0.0002LR_0.2D_2000E_gvpLF_binaryLE.model
# ./results/model_checkpoints/ours/GVPLM_PDBbind0D_nomsaF_aflowE_128B_0.00022659LR_0.02414D_2000E_gvpLF_binaryLE.model
# ./results/model_checkpoints/ours/GVPLM_PDBbind0D_nomsaF_binaryE_128B_0.0002LR_0.46616D_2000E_gvpLF_binaryLE.model_tmp
12 changes: 7 additions & 5 deletions SBATCH/run_mutagenesis.sh
@@ -6,7 +6,7 @@
#SBATCH --gpus-per-node=a100:1
#SBATCH --cpus-per-task=4

#SBATCH --output=./%x_%a.out
#SBATCH --output=./outs/%x_%a.out
#SBATCH --array=0

# runs across all folds for a model
@@ -15,14 +15,16 @@
# Then, to get the most accurate mutagenesis results, you can average these matrices
# and visualize them with src.analysis.mutagenesis_plot.plot_sequence
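# Rough sketch of that averaging step (the fold output filenames and the exact
# plot_sequence signature are assumptions, not verified against this repo):
#   import numpy as np
#   from src.analysis.mutagenesis_plot import plot_sequence
#   mats = np.stack([np.load(f"fold{i}_mut_matrix.npy") for i in range(5)])
#   plot_sequence(mats.mean(axis=0))  # hypothetical call; check the real signature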
ROOT_DIR="/lustre06/project/6069023"
SCRATCH_DIR="/lustre07/scratch/jyaacoub/" # this is used for outputs on narval
OUT_DIR="${ROOT_DIR}/jyaacoub/MutDTA/SBATCH/outs/mutagenesis_tests" # this is used for outputs
BIN_DIR="${ROOT_DIR}/jyaacoub/bin" # for modeller
proj_dir="${ROOT_DIR}/jyaacoub/MutDTA"

module load StdEnv/2020 && module load gcc/9.3.0 && module load arrow/12.0.1
# Modeller is needed for this to run... (see: Generic install - https://salilab.org/modeller/10.5/release.html#unix)
export PYTHONPATH="${PYTHONPATH}:${BIN_DIR}/modeller10.5/lib/x86_64-intel8/python3.3:${BIN_DIR}/modeller10.5/lib/x86_64-intel8:${BIN_DIR}/modeller10.5/modlib"
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${BIN_DIR}/modeller10.5/lib/x86_64-intel8"
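# Quick sanity check that modeller resolves on these paths (a valid license key
# must already be configured): python -c "import modeller"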

cd ${ROOT_DIR}/jyaacoub/MutDTA
cd ${proj_dir}
source .venv/bin/activate

# NOTE: To get a SMILES string from a .mol2 or .sdf file you can use RDKit:
@@ -34,8 +36,8 @@ source .venv/bin/activate
python -u run_mutagenesis.py \
--ligand_smile "CC(C)CC(NC(=O)C(CC(=O)[O-])NC(=O)C([NH3+])CCC(=O)[O-])C(=O)[O-]" \
--ligand_smile_name "1a30_ligand" \
--pdb_file "${ROOT_DIR}/jyaacoub/data/kiba/alphaflow_io/out_pdb_MD-distilled/P67870.pdb" \
--out_path "${SCRATCH_DIR}/mutagenesis_tests/" \
--pdb_file "${proj_dir}/SBATCH/samples/input/mutagenesis/P67870.pdb" \
--out_path "${OUT_DIR}/" \
--res_start 0 \
--res_end 5 \
--model_opt davis_esm \
2 changes: 1 addition & 1 deletion SBATCH/run_platinum.sh
@@ -6,7 +6,7 @@
#SBATCH --gpus-per-node=a100:1
#SBATCH --cpus-per-task=4

#SBATCH --output=./%x_%a.out
#SBATCH --output=./outs/%x_%a.out
#SBATCH --array=0

# runs across all folds for a model