-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #144 from jyaacoub/final-paper
Final paper
- Loading branch information
Showing
28 changed files
with
838,789 additions
and
125 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -228,3 +228,4 @@ splits/**/*.csv | |
results/*/model_media/*/train_log/*.json | ||
results/model_checkpoints.tar.gz | ||
*.out | ||
DDP_outs |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
#!/bin/bash | ||
# WARNING: DDP submission doesnt work with ARRAYS! | ||
#SBATCH -t 20 | ||
#SBATCH --job-name=DDP_sub-folds #NOTE: change this and fold_selection arg below | ||
|
||
#SBATCH --mail-type=FAIL | ||
##SBATCH [email protected] | ||
|
||
#SBATCH -c 4 | ||
#SBATCH --mem=4G | ||
|
||
#SBATCH --output=./outs/%x.out | ||
##SBATCH --array=0-4 # WARNING: Doesnt work with submitit | ||
|
||
module load StdEnv/2020 && module load gcc/9.3.0 && module load arrow/12.0.1 | ||
ROOT_DIR="/lustre06/project/6069023" | ||
cd ${ROOT_DIR}/jyaacoub/MutDTA/ | ||
|
||
source .venv/bin/activate | ||
# 3days == 4320 mins | ||
# batch size is local batch sizes | ||
for fold in {0..4} #{0..4} | ||
do | ||
echo "------------ FOLD $fold ------------" | ||
# pdbbind_esm | ||
python -u train_test_DDP.py --model_opt GVPL_ESM \ | ||
--data_opt PDBbind \ | ||
\ | ||
--feature_opt nomsa \ | ||
--edge_opt binary \ | ||
--ligand_feature_opt gvp \ | ||
--ligand_edge_opt binary \ | ||
\ | ||
--learning_rate 0.0001 \ | ||
--batch_size 40 \ | ||
\ | ||
--dropout 0.2328 \ | ||
--dropout_prot 0.0 \ | ||
--output_dim 128 \ | ||
--num_GVPLlayers 3 \ | ||
--pro_dropout_gnn 0.1582 \ | ||
--pro_extra_fc_lyr False \ | ||
--pro_emb_dim 512 \ | ||
\ | ||
--train \ | ||
--fold_selection $fold \ | ||
--num_epochs 2000 \ | ||
-odir ./DDP_outs/pdbbind_gvpl_esm/%j \ | ||
-s_t 4320 -s_m 18GB -s_nn 1 -s_ng 4 -s_cp 4 | ||
# # kiba_esm | ||
# python -u train_test_DDP.py --model_opt EDI \ | ||
# --data_opt kiba \ | ||
# \ | ||
# --feature_opt nomsa \ | ||
# --edge_opt binary \ | ||
# --ligand_feature_opt original \ | ||
# --ligand_edge_opt binary \ | ||
# \ | ||
# --learning_rate 0.0001 \ | ||
# --batch_size 12 \ | ||
# \ | ||
# --dropout 0.4 \ | ||
# --dropout_prot 0.0 \ | ||
# --output_dim 128 \ | ||
# --pro_extra_fc_lyr False \ | ||
# --pro_emb_dim 512 \ | ||
# \ | ||
# --train \ | ||
# --fold_selection 0 \ | ||
# --num_epochs 2000 \ | ||
# -odir ./slurm_outs/kiba_esm/%j \ | ||
# -s_t 4320 -s_m 18GB -s_nn 1 -s_ng 4 -s_cp 4 | ||
done | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
#!/bin/bash | ||
#SBATCH -t 15 | ||
#SBATCH --job-name=AlphaFlow_inference | ||
#SBATCH --cpus-per-task=8 | ||
#SBATCH --mem=4G | ||
#SBATCH --output=./%x_%A.out | ||
# THIS SCRIPT RUNS MSA INPUTS THROUGH ALPHAFLOW AND INFERENCE ON PRETRAINED MODELS | ||
# REQUIRES: | ||
# - MSA dir full of relevant .a3m files for alphaflow | ||
# - ligand sdf file | ||
|
||
ROOT_DIR="/lustre06/project/6069023" | ||
proj_dir="${ROOT_DIR}/jyaacoub/" # contains the relevant alphaflow/ and MutDTA/ repos with their venvs | ||
MSA_DIRECTORY_PATH="${proj_dir}/MutDTASBATCH/samples/input/alphaflow/msa_input" | ||
# MSA is needed for input so that we can run it through alphaflow to get our predictions | ||
# MSA input format must match the following depending on its extension: | ||
# 1. ".aln" - purely just protein sequences (each line is a protein sequence) with the first line being the target. | ||
# 2. ".a3m" - Protein sequences on even line numbers (normal msa format with '>' preceeding each protein sequence) | ||
# with the second line being the target sequence - no blank lines allowed | ||
# | ||
# NOTE: this will run all msas (files ending in .a3m or .aln) in this directory through alphaflow | ||
|
||
OUT_DIR="${ROOT_DIR}/MutDTA_outputs/" # will populate alphaflow_io and results csv file | ||
SBATCH_OUTS="${OUT_DIR}/SBATCH_OUTS/" | ||
LIGAND_SDF_FILE="${proj_dir}/MutDTA/SBATCH/samples/input/inference/VWW_ideal.sdf" # SDF file is needed for GVPL features | ||
MODEL_opt="PDBbind_gvpl_aflow" | ||
# select from one of the following: | ||
# davis_DG, davis_gvpl, davis_esm, | ||
# kiba_DG, kiba_esm, kiba_gvpl, | ||
# PDBbind_DG, PDBbind_esm, PDBbind_gvpl, PDBbind_gvpl_aflow | ||
|
||
N_CONFORMATIONS=10 # number of conformations to generate by alphaflow | ||
ALPHAFLOW_TIME="3:00:00" # for 1 protein it should not take longer than an hour or so | ||
|
||
mkdir -p ${SBATCH_OUTS} | ||
|
||
########################################### | ||
# ALPHAFLOW STEP: | ||
########################################### | ||
io_dir="${OUT_DIR}/alphaflow_io/" #PATH TO INPUT_CSVs DIR | ||
|
||
# Check if Alphaflow output already exists | ||
if [ -d "${io_dir}/out_pdb" ] && [ "$(ls -A ${io_dir}/out_pdb)" ]; then | ||
echo "Alphaflow output already exists, skipping Alphaflow preparation and submission steps." | ||
skip_alphaflow=true | ||
else | ||
skip_alphaflow=false | ||
fi | ||
|
||
if [ "$skip_alphaflow" = false ]; then | ||
echo "Loading up Python virtual environment for Alphaflow prepare input." | ||
cd "${proj_dir}/alphaflow" | ||
source ".venv-ds/bin/activate" | ||
|
||
echo "preparing msa inputs" | ||
python -u prepare_input.py $MSA_DIRECTORY_PATH $io_dir | ||
|
||
if [[ $? -ne 0 ]]; then | ||
echo -e "\nERROR: non-zero exit for prepare_input.py" | ||
fi; | ||
|
||
alphaflow_job_id=$(sbatch << EOF | awk '{print $4}' | ||
#!/bin/bash | ||
#SBATCH -t ${ALPHAFLOW_TIME} | ||
#SBATCH --job-name=AlphaFlow | ||
#SBATCH --gpus-per-node=a100:1 # If running into CUDA MEM errors increase this to 2 | ||
#SBATCH --cpus-per-task=2 | ||
#SBATCH --mem=8G | ||
#SBATCH --output=${SBATCH_OUTS}/alphaflow/%x_%a.out | ||
cd "${proj_dir}/alphaflow" | ||
source ".venv-ds/bin/activate" | ||
# For LMA use only, defaults are: | ||
export DEFAULT_LMA_Q_CHUNK_SIZE=512 # 1024 | ||
export DEFAULT_LMA_KV_CHUNK_SIZE=2048 # 4096 | ||
export CUDA_LAUNCH_BLOCKING=1 | ||
# Determine the number of GPUs available | ||
world_size=\$(nvidia-smi --list-gpus | wc -l) | ||
# runs deepspeed if there are two or more GPUs requested | ||
run_command() { | ||
if [[ \$world_size -ge 2 ]]; then | ||
deepspeed --num_gpus \$world_size predict_deepspeed.py --world_size \$world_size \$@ | ||
else | ||
python predict.py \$@ | ||
fi | ||
} | ||
run_command --mode alphafold \ | ||
--input_csv "${io_dir}/input.csv" \ | ||
--msa_dir "${io_dir}/msa_dir" \ | ||
--weights "weights/alphaflow_md_distilled_202402.pt" \ | ||
--outpdb "${io_dir}/out_pdb" \ | ||
--samples ${N_CONFORMATIONS} \ | ||
--no_overwrite \ | ||
--no_diffusion \ | ||
--noisy_first | ||
EOF | ||
) | ||
|
||
else | ||
echo "Proceeding directly to the inference step." | ||
alphaflow_job_id="" | ||
fi | ||
|
||
|
||
########################################### | ||
# INFERENCE STEP: | ||
########################################### | ||
|
||
echo "Submitting second job for running through the selected model." | ||
|
||
if [ -z "$alphaflow_job_id" ]; then | ||
dependency_line="" | ||
else | ||
dependency_line="#SBATCH --dependency=afterok:${alphaflow_job_id}" | ||
fi | ||
|
||
SBATCH << EOF | ||
#!/bin/bash | ||
#SBATCH -t 30:00 | ||
#SBATCH --job-name=run_mutagenesis_davis_esm | ||
#SBATCH --mem=10G | ||
#SBATCH --gpus-per-node=a100:1 | ||
#SBATCH --cpus-per-task=4 | ||
#SBATCH --output=${SBATCH_OUTS}/%x_%a.out | ||
#SBATCH --array=0-4 | ||
${dependency_line} | ||
CSV_OUT="${OUT_DIR}/inference_test.csv" # this is used for outputs | ||
module load StdEnv/2020 && module load gcc/9.3.0 && module load arrow/12.0.1 | ||
cd ${proj_dir}/MutDTA | ||
source .venv/bin/activate | ||
python -u inference.py \ | ||
--ligand_sdf ${LIGAND_SDF_FILE} \ | ||
--pdb_files ${io_dir}/out_pdb/*.pdb \ | ||
--csv_out ${CSV_OUT} \ | ||
--model_opt ${MODEL_OPT} \ | ||
--fold \${SLURM_ARRAY_TASK_ID} \ | ||
--ligand_id "1a30_ligand" \ | ||
--batch_size 8 | ||
EOF |
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
#!/bin/bash | ||
#SBATCH -t 30:00 | ||
#SBATCH --job-name=run_mutagenesis_davis_esm | ||
#SBATCH --mem=10G | ||
|
||
#SBATCH --gpus-per-node=a100:1 | ||
#SBATCH --cpus-per-task=4 | ||
|
||
#SBATCH --output=./outs/%x_%a.out | ||
#SBATCH --array=0 | ||
|
||
ROOT_DIR="/lustre06/project/6069023" | ||
CSV_OUT="${ROOT_DIR}/jyaacoub/MutDTA/SBATCH/samples/out/inference_test.csv" # this is used for outputs | ||
proj_dir="${ROOT_DIR}/jyaacoub/MutDTA" | ||
|
||
module load StdEnv/2020 && module load gcc/9.3.0 && module load arrow/12.0.1 | ||
cd ${proj_dir} | ||
source .venv/bin/activate | ||
|
||
# NOTE: To get SMILE from a .mol2 or .sdf file you can use RDKIT: | ||
# | ||
# from rdkit import Chem | ||
# Chem.MolToSmiles(Chem.MolFromMol2File(x), isomericSmiles=False) | ||
# OR just pass in an sdf file and the script will extract the SMILE | ||
|
||
python -u inference.py \ | ||
--ligand_sdf "${proj_dir}/SBATCH/samples/input/inference/VWW_ideal.sdf" \ | ||
--pdb_files ${proj_dir}/SBATCH/samples/input/inference/alphaflow_pdbs/*.pdb \ | ||
--csv_out ${CSV_OUT} \ | ||
--model_opt PDBbind_gvpl_aflow \ | ||
--fold ${SLURM_ARRAY_TASK_ID} \ | ||
--ligand_id "1a30_ligand" \ | ||
--batch_size 8 | ||
# batch size is used for when we have multiple input pdbs and we want to optimize inference times | ||
# ./results/model_checkpoints/ours/GVPLM_PDBbind0D_nomsaF_binaryE_128B_0.0002LR_0.2D_2000E_gvpLF_binaryLE.model | ||
# ./results/model_checkpoints/ours/GVPLM_PDBbind0D_nomsaF_aflowE_128B_0.00022659LR_0.02414D_2000E_gvpLF_binaryLE.model | ||
# ./results/model_checkpoints/ours/GVPLM_PDBbind0D_nomsaF_binaryE_128B_0.0002LR_0.46616D_2000E_gvpLF_binaryLE.model_tmp |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.