From ed4468b144e72a337bd40904ff9dbc4e98409dde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julio=20C=C3=A9sar?= Date: Sat, 10 Aug 2024 01:58:50 +0000 Subject: [PATCH 01/15] added initial files --- notebooks/OpenFoldLocal.ipynb | 123 +++++++++++++++++++++++ notebooks/utils/__init__.py | 0 notebooks/utils/docker_commands.py | 152 +++++++++++++++++++++++++++++ notebooks/utils/model.py | 20 ++++ notebooks/utils/requirements.txt | 1 + 5 files changed, 296 insertions(+) create mode 100644 notebooks/OpenFoldLocal.ipynb create mode 100644 notebooks/utils/__init__.py create mode 100644 notebooks/utils/docker_commands.py create mode 100644 notebooks/utils/model.py create mode 100644 notebooks/utils/requirements.txt diff --git a/notebooks/OpenFoldLocal.ipynb b/notebooks/OpenFoldLocal.ipynb new file mode 100644 index 00000000..56214b79 --- /dev/null +++ b/notebooks/OpenFoldLocal.ipynb @@ -0,0 +1,123 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# OpenFold Local Notebook\n", + "\n", + "Provides the flexibility to run inference using a local installation of OpenFold with Docker, along with the convenience of visualizing results using the same plots from the OpenFold Colab Notebook.\n", + "\n", + "If you have access to a machine and want to do quick inference and visualize results, there are some useful things you can do with this notebook:\n", + "\n", + "- You can use precomputed alignments, which enables you to run inference with different model parameters to compare results.\n", + "- Get the best model and plots using the provided utility functions.\n", + "- Handle long-running executions.\n", + "\n", + "\n", + "Of course, you can do this solely using Docker commands in the terminal, but you would need to code/adjust the Colab functions to work with data locally. This notebook gives you a head start." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup the notebook\n", + "\n", + "Start your notebook:\n", + "\n", + "Go to the notebook folder:\n", + "\n", + "`cd notebooks`\n", + "\n", + "Install the requirements in your env:\n", + "\n", + "`pip install -r utils/requirements.txt`\n", + "\n", + "Start your Jupyter server\n", + "\n", + "`jupyter lab . 
--ip=\"0.0.0.0\"`\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Running Inference " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Example usage 1: Using a sequence string" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from utils.docker_commands import run_inference\n", + "\n", + "# For multiple sequences, separate sequences with a colon `:`\n", + "inference_input = \"DAGAQGAAIGSPGVLSGNVVQVPVHVPVNVCGNTVSVIGLLNPAFGNTCVNA:AGETGRTGVLVTSSATNDGDSGWGRFAG\"\n", + "\n", + "database_dir = \"/path\"\n", + "model_name = \"multimer\"\n", + "weight_set = 'AlphaFold'\n", + "\n", + "run_inference(inference_input, database_dir, weight_set, model_name)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Example usage 2: Using a pre-existing fasta directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from utils.docker_commands import run_inference\n", + "\n", + "# Example usage, put your .fasta file with the sequences to process in the inference dir\n", + "inference_dir = \"/path\"\n", + "databases_dir = \"/path\"\n", + "\n", + "\n", + "weight_set = 'AlphaFold'\n", + "model_name = \"multimer\"\n", + "\n", + "run_inference(inference_dir, databases_dir, weight_set, model_name, use_precomputed_alignments=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/utils/__init__.py b/notebooks/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/notebooks/utils/docker_commands.py b/notebooks/utils/docker_commands.py new file mode 100644 index 00000000..0aafc10d --- /dev/null +++ b/notebooks/utils/docker_commands.py @@ -0,0 +1,152 @@ +from .model import get_config_preset_list_for_model +import docker +import random +import string +import os + +def generate_random_directory_name(prefix="fasta_dir_", length=6): + return prefix + ''.join(random.choices(string.ascii_uppercase + string.digits, k=length)) + +def generate_random_sequence_name(length=8): + return ''.join(random.choices(string.ascii_uppercase + string.digits, k=length)) + +def write_sequences_to_fasta(sequences, fasta_dir): + os.makedirs(f'inference/{fasta_dir}', exist_ok=True) + fasta_file = os.path.join(f'inference/{fasta_dir}', "sequences.fasta") + + with open(fasta_file, 'w') as f: + for seq in sequences: + sequence_name = f"sequence_{generate_random_sequence_name()}" + f.write(f">{sequence_name}\n") + f.write(f"{seq}\n") + + return fasta_dir + +def validate_sequence(input_sequence, weight_set): + # Remove all whitespaces, tabs, and end lines; convert to upper-case + input_sequence = input_sequence.translate(str.maketrans('', '', ' \n\t')).upper() + aatypes = set('ACDEFGHIKLMNPQRSTVWY') # 20 standard amino acids + allowed_chars = aatypes.union({':'}) + + if not set(input_sequence).issubset(allowed_chars): + raise Exception(f'Input sequence contains non-amino acid letters: {set(input_sequence) - allowed_chars}. 
OpenFold only supports 20 standard amino acids as inputs.') + + if ':' in input_sequence and weight_set != 'AlphaFold': + raise ValueError('Input sequence is a multimer, must select Alphafold weight set') + + return input_sequence + + +def run_inference( + inference_input, + database_dir, + weight_set, + model_name, + use_precomputed_alignments=False): + + default_fasta_dir_name = '' + + # Determine if the inference_input is a directory or a sequence string + if os.path.isdir(inference_input): + fasta_dir = inference_input + default_fasta_dir_name = 'fasta_dir' + print(f"Input is a directory: {fasta_dir}/{default_fasta_dir_name}") + else: + # Treat inference_input as a sequence string + sequence_string = inference_input + print(f"Input is a sequence string: {sequence_string}") + + # Validate and clean the sequence string + sequence_string = validate_sequence(inference_input, weight_set) + + os.makedirs('inference', exist_ok=True) + + fasta_dir = f'{os.getcwd()}/inference' + + # Generate a random directory name for storing the fasta file + default_fasta_dir_name = generate_random_directory_name() + print(f"Generated directory: {default_fasta_dir_name}") + + inference_dir = write_sequences_to_fasta(sequence_string.split(':'), default_fasta_dir_name) + print(f"Sequences written to FASTA file in directory: {inference_dir}") + + + config_preset_list = get_config_preset_list_for_model(weight_set, model_name) + + for config_preset in config_preset_list: + run_inference_for_model( + inference_input, + database_dir, + fasta_dir, + default_fasta_dir_name, + config_preset, + use_precomputed_alignments + ) + + +def run_inference_for_model( + inference_input, + database_dir, + fasta_dir, + default_fasta_dir_name, + config_preset, + use_precomputed_alignments, + ): + + client = docker.from_env() + + # Set up the volumes dictionary + volumes = { + fasta_dir: {'bind': '/inference', 'mode': 'rw'}, + } + + volumes[database_dir] = {'bind': '/database', 'mode': 'rw'} + + command = [ + "python3", "/opt/openfold/run_pretrained_openfold.py", + f"/inference/{default_fasta_dir_name}", + "/database/pdb_mmcif/mmcif_files/", + # Databases + "--uniref90_database_path", "/database/uniref90/uniref90.fasta", + "--mgnify_database_path", "/database/mgnify/mgy_clusters_2022_05.fa", + "--pdb70_database_path", "/database/pdb70/pdb70", + "--uniclust30_database_path", "/database/uniclust30/uniclust30_2018_08/uniclust30_2018_08", + "--bfd_database_path", "/database/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt", + # Search MSA + "--jackhmmer_binary_path", "/opt/conda/bin/jackhmmer", + "--hhblits_binary_path", "/opt/conda/bin/hhblits", + "--hhsearch_binary_path", "/opt/conda/bin/hhsearch", + "--kalign_binary_path", "/opt/conda/bin/kalign", + # Inference settings + "--model_device", "cuda:0", + "--config_preset", config_preset, + "--save_outputs", + "--output_dir", f"/inference/results/{config_preset}", + ] + + if use_precomputed_alignments: + command.extend(["--use_precomputed_alignments", "/inference/alignments"]) + + try: + print("Running Docker container for inference...") + container = client.containers.run( + image="openfold:latest", + command=command, + volumes=volumes, + runtime="nvidia", # This is required for GPU usage + remove=True, + detach=True, # Running in the background to allow log streaming + stdout=True, + stderr=True + ) + + # Stream logs in real-time + for log in container.logs(stream=True): + print(log.decode('utf-8'), end='') + + except docker.errors.ContainerError as e: + 
print(f"ContainerError: {e}") + except docker.errors.ImageNotFound as e: + print(f"ImageNotFound: {e}") + except docker.errors.APIError as e: + print(f"APIError: {e}") \ No newline at end of file diff --git a/notebooks/utils/model.py b/notebooks/utils/model.py new file mode 100644 index 00000000..6029bd06 --- /dev/null +++ b/notebooks/utils/model.py @@ -0,0 +1,20 @@ + +def get_config_preset_list_for_model(weight_set, model_name): + if weight_set == "OpenFold" and model_name == "monomer": + model_names = [ + 'finetuning_3.pt', + 'finetuning_4.pt', + 'finetuning_5.pt', + 'finetuning_ptm_2.pt', + 'finetuning_no_templ_ptm_1.pt' + ] + elif weight_set == "AlphaFold" and model_name == "multimer": + # Generate 'model_1_multimer_v3' to 'model_5_multimer_v3' using a loop + model_names = [f'model_{i}_multimer_v3' for i in range(1, 6)] + elif weight_set == "AlphaFold" and model_name == "monomer": + # Generate 'model_1' to 'model_5' using a loop + model_names = [f'model_{i}' for i in range(1, 6)] + else: + raise ValueError(f"Invalid combination of weight_set '{weight_set}' and model_name '{model_name}'") + + return model_names \ No newline at end of file diff --git a/notebooks/utils/requirements.txt b/notebooks/utils/requirements.txt new file mode 100644 index 00000000..de5cce34 --- /dev/null +++ b/notebooks/utils/requirements.txt @@ -0,0 +1 @@ +docker==7.1.0 \ No newline at end of file From 5e0357766c7991ea5fa7180f21c59741effeb02f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julio=20C=C3=A9sar?= Date: Tue, 27 Aug 2024 00:34:00 +0000 Subject: [PATCH 02/15] refactored utils --- notebooks/utils/utils.py | 73 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 notebooks/utils/utils.py diff --git a/notebooks/utils/utils.py b/notebooks/utils/utils.py new file mode 100644 index 00000000..50a13683 --- /dev/null +++ b/notebooks/utils/utils.py @@ -0,0 +1,73 @@ +import random +import string +import os + + +def generate_random_run_id(length=6): + run_id = ''.join(random.choices(string.ascii_uppercase + string.digits, k=length)) + print(f"Run ID: {run_id}") + return run_id + +def generate_random_sequence_name(length=8): + return ''.join(random.choices(string.ascii_uppercase + string.digits, k=length)) + +def write_sequences_to_fasta(sequences, fasta_dir_root_path): + # os.makedirs(f'inference/{fasta_dir}', exist_ok=True) + fasta_file = os.path.join(fasta_dir_root_path, "sequences.fasta") + + with open(fasta_file, 'w') as f: + for seq in sequences: + sequence_name = f"sequence_{generate_random_sequence_name()}" + f.write(f">{sequence_name}\n") + f.write(f"{seq}\n") + + return fasta_file + +def validate_sequence(input_sequence, weight_set): + # Remove all whitespaces, tabs, and end lines; convert to upper-case + input_sequence = input_sequence.translate(str.maketrans('', '', ' \n\t')).upper() + aatypes = set('ACDEFGHIKLMNPQRSTVWY') # 20 standard amino acids + allowed_chars = aatypes.union({':'}) + + if not set(input_sequence).issubset(allowed_chars): + raise Exception(f'Input sequence contains non-amino acid letters: {set(input_sequence) - allowed_chars}. 
OpenFold only supports 20 standard amino acids as inputs.') + + if ':' in input_sequence and weight_set != 'AlphaFold': + raise ValueError('Input sequence is a multimer, must select Alphafold weight set') + + return input_sequence + + + +def generate_individual_sequence_files(output_path): + with open(f"{output_path}/fasta_dir/sequences.fasta", "r") as infile: + sequence_id = None + sequence_lines = [] + + output_path = f"{output_path}/fasta_dir/tmp" + os.makedirs(output_path, exist_ok=True) + + for line in infile: + line = line.strip() + if line.startswith(">"): + if sequence_id is not None: + # Save the previous sequence to a file + output_file = os.path.join(output_path, f"{sequence_id}.fasta") + with open(output_file, "w") as outfile: + outfile.write(f">{sequence_id}\n") + outfile.write("\n".join(sequence_lines) + "\n") + print(f"Saved {sequence_id} to {output_file}") + + # Start a new sequence + sequence_id = line[1:].split('.')[0] # Remove '>' and split by '.' to remove the suffix + sequence_lines = [] + else: + sequence_lines.append(line) + + # Save the last sequence + if sequence_id is not None: + output_file = os.path.join(output_path, f"{sequence_id}.fasta") + with open(output_file, "w") as outfile: + outfile.write(f">{sequence_id}\n") + outfile.write("\n".join(sequence_lines) + "\n") + print(f"Saved {sequence_id} to {output_file}") From 5db2d430cc8373982a49ee344f26d4d83eeac40e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julio=20C=C3=A9sar?= Date: Tue, 27 Aug 2024 00:34:28 +0000 Subject: [PATCH 03/15] added msa and inference commands --- notebooks/utils/model.py | 75 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/notebooks/utils/model.py b/notebooks/utils/model.py index 6029bd06..e211c757 100644 --- a/notebooks/utils/model.py +++ b/notebooks/utils/model.py @@ -1,3 +1,78 @@ +import os +import shutil +from .docker_commands import run_inference_for_model, run_msa_alignment +from .utils import validate_sequence, generate_random_run_id, write_sequences_to_fasta, generate_individual_sequence_files + +default_fasta_dir_name = 'fasta_dir' + +def run_inference( + inference_input, + database_dir, + weight_set, + model_name, + use_precomputed_alignments=False, + run_id=None): + + os.makedirs('data', exist_ok=True) + + if use_precomputed_alignments: + if not run_id: + raise ValueError(f"run_id is required when using pre computed aligments.") + + if not run_id: + + # Generate a random directory name for storing the fasta file + run_id = generate_random_run_id() + + run_path = f"{os.getcwd()}/data/run_{run_id}" + fasta_dir_root_path = f'{run_path}/{default_fasta_dir_name}' + + os.makedirs(fasta_dir_root_path, exist_ok=True) + + print(f"Fasta root directory: {fasta_dir_root_path}") + + # Determine if the inference_input is a directory or a sequence string + if os.path.isfile(inference_input): + shutil.copy2(inference_input, f"{fasta_dir_root_path}/sequences.fasta") + + print(f"Using input file : {inference_input}") + else: + # Treat inference_input as a sequence string + sequence_string = inference_input + print(f"Input is a sequence string: {sequence_string}") + + # Validate and clean the sequence string + sequence_string = validate_sequence(inference_input, weight_set) + + fasta_file = write_sequences_to_fasta(sequence_string.split(':'), fasta_dir_root_path) + print(f"Sequences written to FASTA file: {fasta_file}") + + + generate_individual_sequence_files(run_path) + + run_msa_alignment(run_path, database_dir, default_fasta_dir_name) + + 
run_model_with(run_path, database_dir, default_fasta_dir_name, weight_set, model_name) + + elif run_id and use_precomputed_alignments: + # Run inference with given Run ID + run_path = f"{os.getcwd()}/data/run_{run_id}" + + if os.path.isdir(run_path): + run_model_with(run_path, database_dir, default_fasta_dir_name, weight_set, model_name) + else: + raise ValueError(f"Provided Run ID does not exists") + +def run_model_with(run_path, database_dir, default_fasta_dir_name, weight_set, model_name): + config_preset_list = get_config_preset_list_for_model(weight_set, model_name) + + for config_preset in config_preset_list: + run_inference_for_model( + run_path, + database_dir, + default_fasta_dir_name, + config_preset + ) def get_config_preset_list_for_model(weight_set, model_name): if weight_set == "OpenFold" and model_name == "monomer": From 27dff8ad92a21b617573e72d055b60fc170c3ee8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julio=20C=C3=A9sar?= Date: Tue, 27 Aug 2024 00:34:52 +0000 Subject: [PATCH 04/15] added docker commad for msa --- notebooks/utils/docker_commands.py | 176 ++++++++++++++--------------- 1 file changed, 83 insertions(+), 93 deletions(-) diff --git a/notebooks/utils/docker_commands.py b/notebooks/utils/docker_commands.py index 0aafc10d..a4e7831e 100644 --- a/notebooks/utils/docker_commands.py +++ b/notebooks/utils/docker_commands.py @@ -1,110 +1,29 @@ -from .model import get_config_preset_list_for_model import docker -import random -import string import os -def generate_random_directory_name(prefix="fasta_dir_", length=6): - return prefix + ''.join(random.choices(string.ascii_uppercase + string.digits, k=length)) - -def generate_random_sequence_name(length=8): - return ''.join(random.choices(string.ascii_uppercase + string.digits, k=length)) - -def write_sequences_to_fasta(sequences, fasta_dir): - os.makedirs(f'inference/{fasta_dir}', exist_ok=True) - fasta_file = os.path.join(f'inference/{fasta_dir}', "sequences.fasta") - - with open(fasta_file, 'w') as f: - for seq in sequences: - sequence_name = f"sequence_{generate_random_sequence_name()}" - f.write(f">{sequence_name}\n") - f.write(f"{seq}\n") - - return fasta_dir - -def validate_sequence(input_sequence, weight_set): - # Remove all whitespaces, tabs, and end lines; convert to upper-case - input_sequence = input_sequence.translate(str.maketrans('', '', ' \n\t')).upper() - aatypes = set('ACDEFGHIKLMNPQRSTVWY') # 20 standard amino acids - allowed_chars = aatypes.union({':'}) - - if not set(input_sequence).issubset(allowed_chars): - raise Exception(f'Input sequence contains non-amino acid letters: {set(input_sequence) - allowed_chars}. 
OpenFold only supports 20 standard amino acids as inputs.') - - if ':' in input_sequence and weight_set != 'AlphaFold': - raise ValueError('Input sequence is a multimer, must select Alphafold weight set') - - return input_sequence - - -def run_inference( - inference_input, - database_dir, - weight_set, - model_name, - use_precomputed_alignments=False): - - default_fasta_dir_name = '' - - # Determine if the inference_input is a directory or a sequence string - if os.path.isdir(inference_input): - fasta_dir = inference_input - default_fasta_dir_name = 'fasta_dir' - print(f"Input is a directory: {fasta_dir}/{default_fasta_dir_name}") - else: - # Treat inference_input as a sequence string - sequence_string = inference_input - print(f"Input is a sequence string: {sequence_string}") - - # Validate and clean the sequence string - sequence_string = validate_sequence(inference_input, weight_set) - - os.makedirs('inference', exist_ok=True) - - fasta_dir = f'{os.getcwd()}/inference' - - # Generate a random directory name for storing the fasta file - default_fasta_dir_name = generate_random_directory_name() - print(f"Generated directory: {default_fasta_dir_name}") - - inference_dir = write_sequences_to_fasta(sequence_string.split(':'), default_fasta_dir_name) - print(f"Sequences written to FASTA file in directory: {inference_dir}") - - - config_preset_list = get_config_preset_list_for_model(weight_set, model_name) - - for config_preset in config_preset_list: - run_inference_for_model( - inference_input, - database_dir, - fasta_dir, - default_fasta_dir_name, - config_preset, - use_precomputed_alignments - ) - - def run_inference_for_model( - inference_input, + run_path, database_dir, - fasta_dir, default_fasta_dir_name, config_preset, - use_precomputed_alignments, + use_precomputed_alignments=True, ): client = docker.from_env() - # Set up the volumes dictionary volumes = { - fasta_dir: {'bind': '/inference', 'mode': 'rw'}, + run_path: {'bind': '/run_path', 'mode': 'rw'}, } volumes[database_dir] = {'bind': '/database', 'mode': 'rw'} - + + fasta_dir = f"/run_path/{default_fasta_dir_name}/tmp" + outpur_dir = "/run_path/output" + precomputed_alignments_dir = "/run_path/output/alignments" + command = [ "python3", "/opt/openfold/run_pretrained_openfold.py", - f"/inference/{default_fasta_dir_name}", + fasta_dir, "/database/pdb_mmcif/mmcif_files/", # Databases "--uniref90_database_path", "/database/uniref90/uniref90.fasta", @@ -117,15 +36,20 @@ def run_inference_for_model( "--hhblits_binary_path", "/opt/conda/bin/hhblits", "--hhsearch_binary_path", "/opt/conda/bin/hhsearch", "--kalign_binary_path", "/opt/conda/bin/kalign", + "--pdb_seqres_database_path", "/database/pdb_seqres/pdb_seqres.txt", + "--uniref30_database_path", "/database/uniref30/UniRef30_2021_03", + "--uniprot_database_path", "/database/uniprot/uniprot.fasta", + "--hmmsearch_binary_path", "/opt/conda/bin/hmmsearch", + "--hmmbuild_binary_path", "/opt/conda/bin/hmmbuild", # Inference settings "--model_device", "cuda:0", "--config_preset", config_preset, "--save_outputs", - "--output_dir", f"/inference/results/{config_preset}", + "--output_dir", outpur_dir ] if use_precomputed_alignments: - command.extend(["--use_precomputed_alignments", "/inference/alignments"]) + command.extend(["--use_precomputed_alignments", precomputed_alignments_dir]) try: print("Running Docker container for inference...") @@ -149,4 +73,70 @@ def run_inference_for_model( except docker.errors.ImageNotFound as e: print(f"ImageNotFound: {e}") except docker.errors.APIError as e: - 
print(f"APIError: {e}") \ No newline at end of file + print(f"APIError: {e}") + +def run_msa_alignment( + run_path, + database_dir, + default_fasta_dir_name, + cpus_per_task=32, + no_tasks=1, +): + client = docker.from_env() + + precomputed_alignments_dir = f"{run_path}/output/alignments" + os.makedirs(precomputed_alignments_dir, exist_ok=True) + + volumes = { + run_path: {'bind': '/run_path', 'mode': 'rw'}, + } + + volumes[database_dir] = {'bind': '/database', 'mode': 'rw'} + + fasta_dir = f"/run_path/{default_fasta_dir_name}/tmp" + precomputed_alignments_dir_docker = "/run_path/output/alignments" + + # Define the Docker command + command = [ + "python3", "/opt/openfold/scripts/precompute_alignments.py", + fasta_dir, + precomputed_alignments_dir_docker, + "--uniref90_database_path", "/database/uniref90/uniref90.fasta", + "--mgnify_database_path", "/database/mgnify/mgy_clusters_2022_05.fa", + "--pdb70_database_path", "/database/pdb70/pdb70", + "--uniclust30_database_path", "/database/uniclust30/uniclust30_2018_08/uniclust30_2018_08", + "--bfd_database_path", "/database/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt", + "--cpus_per_task", str(cpus_per_task), + "--no_tasks", str(no_tasks), + "--jackhmmer_binary_path", "/opt/conda/bin/jackhmmer", + "--hhblits_binary_path", "/opt/conda/bin/hhblits", + "--hhsearch_binary_path", "/opt/conda/bin/hhsearch", + "--kalign_binary_path", "/opt/conda/bin/kalign" + ] + + try: + print("Running Docker container for MSA alignment...") + container = client.containers.run( + image="openfold:latest", + command=command, + volumes=volumes, + runtime="nvidia", # This is required for GPU usage + remove=True, + detach=True, # Running in the background to allow log streaming + stdout=True, + stderr=True + ) + + # Stream logs in real-time + for log in container.logs(stream=True): + print(log.decode('utf-8'), end='') + + except docker.errors.ContainerError as e: + print(f"ContainerError: {e}") + raise e + except docker.errors.ImageNotFound as e: + print(f"ImageNotFound: {e}") + raise e + except docker.errors.APIError as e: + print(f"APIError: {e}") + raise e \ No newline at end of file From d2059ba179cbae2da37909714cfd0db33275f703 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julio=20C=C3=A9sar?= Date: Tue, 27 Aug 2024 02:31:24 +0000 Subject: [PATCH 05/15] rename codebase folder --- notebooks/utils/__init__.py | 0 notebooks/utils/docker_commands.py | 142 ----------------------------- notebooks/utils/model.py | 95 ------------------- notebooks/utils/requirements.txt | 1 - notebooks/utils/utils.py | 73 --------------- 5 files changed, 311 deletions(-) delete mode 100644 notebooks/utils/__init__.py delete mode 100644 notebooks/utils/docker_commands.py delete mode 100644 notebooks/utils/model.py delete mode 100644 notebooks/utils/requirements.txt delete mode 100644 notebooks/utils/utils.py diff --git a/notebooks/utils/__init__.py b/notebooks/utils/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/notebooks/utils/docker_commands.py b/notebooks/utils/docker_commands.py deleted file mode 100644 index a4e7831e..00000000 --- a/notebooks/utils/docker_commands.py +++ /dev/null @@ -1,142 +0,0 @@ -import docker -import os - -def run_inference_for_model( - run_path, - database_dir, - default_fasta_dir_name, - config_preset, - use_precomputed_alignments=True, - ): - - client = docker.from_env() - # Set up the volumes dictionary - volumes = { - run_path: {'bind': '/run_path', 'mode': 'rw'}, - } - - volumes[database_dir] = {'bind': '/database', 
'mode': 'rw'} - - fasta_dir = f"/run_path/{default_fasta_dir_name}/tmp" - outpur_dir = "/run_path/output" - precomputed_alignments_dir = "/run_path/output/alignments" - - command = [ - "python3", "/opt/openfold/run_pretrained_openfold.py", - fasta_dir, - "/database/pdb_mmcif/mmcif_files/", - # Databases - "--uniref90_database_path", "/database/uniref90/uniref90.fasta", - "--mgnify_database_path", "/database/mgnify/mgy_clusters_2022_05.fa", - "--pdb70_database_path", "/database/pdb70/pdb70", - "--uniclust30_database_path", "/database/uniclust30/uniclust30_2018_08/uniclust30_2018_08", - "--bfd_database_path", "/database/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt", - # Search MSA - "--jackhmmer_binary_path", "/opt/conda/bin/jackhmmer", - "--hhblits_binary_path", "/opt/conda/bin/hhblits", - "--hhsearch_binary_path", "/opt/conda/bin/hhsearch", - "--kalign_binary_path", "/opt/conda/bin/kalign", - "--pdb_seqres_database_path", "/database/pdb_seqres/pdb_seqres.txt", - "--uniref30_database_path", "/database/uniref30/UniRef30_2021_03", - "--uniprot_database_path", "/database/uniprot/uniprot.fasta", - "--hmmsearch_binary_path", "/opt/conda/bin/hmmsearch", - "--hmmbuild_binary_path", "/opt/conda/bin/hmmbuild", - # Inference settings - "--model_device", "cuda:0", - "--config_preset", config_preset, - "--save_outputs", - "--output_dir", outpur_dir - ] - - if use_precomputed_alignments: - command.extend(["--use_precomputed_alignments", precomputed_alignments_dir]) - - try: - print("Running Docker container for inference...") - container = client.containers.run( - image="openfold:latest", - command=command, - volumes=volumes, - runtime="nvidia", # This is required for GPU usage - remove=True, - detach=True, # Running in the background to allow log streaming - stdout=True, - stderr=True - ) - - # Stream logs in real-time - for log in container.logs(stream=True): - print(log.decode('utf-8'), end='') - - except docker.errors.ContainerError as e: - print(f"ContainerError: {e}") - except docker.errors.ImageNotFound as e: - print(f"ImageNotFound: {e}") - except docker.errors.APIError as e: - print(f"APIError: {e}") - -def run_msa_alignment( - run_path, - database_dir, - default_fasta_dir_name, - cpus_per_task=32, - no_tasks=1, -): - client = docker.from_env() - - precomputed_alignments_dir = f"{run_path}/output/alignments" - os.makedirs(precomputed_alignments_dir, exist_ok=True) - - volumes = { - run_path: {'bind': '/run_path', 'mode': 'rw'}, - } - - volumes[database_dir] = {'bind': '/database', 'mode': 'rw'} - - fasta_dir = f"/run_path/{default_fasta_dir_name}/tmp" - precomputed_alignments_dir_docker = "/run_path/output/alignments" - - # Define the Docker command - command = [ - "python3", "/opt/openfold/scripts/precompute_alignments.py", - fasta_dir, - precomputed_alignments_dir_docker, - "--uniref90_database_path", "/database/uniref90/uniref90.fasta", - "--mgnify_database_path", "/database/mgnify/mgy_clusters_2022_05.fa", - "--pdb70_database_path", "/database/pdb70/pdb70", - "--uniclust30_database_path", "/database/uniclust30/uniclust30_2018_08/uniclust30_2018_08", - "--bfd_database_path", "/database/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt", - "--cpus_per_task", str(cpus_per_task), - "--no_tasks", str(no_tasks), - "--jackhmmer_binary_path", "/opt/conda/bin/jackhmmer", - "--hhblits_binary_path", "/opt/conda/bin/hhblits", - "--hhsearch_binary_path", "/opt/conda/bin/hhsearch", - "--kalign_binary_path", "/opt/conda/bin/kalign" - ] - - try: - print("Running Docker container 
for MSA alignment...") - container = client.containers.run( - image="openfold:latest", - command=command, - volumes=volumes, - runtime="nvidia", # This is required for GPU usage - remove=True, - detach=True, # Running in the background to allow log streaming - stdout=True, - stderr=True - ) - - # Stream logs in real-time - for log in container.logs(stream=True): - print(log.decode('utf-8'), end='') - - except docker.errors.ContainerError as e: - print(f"ContainerError: {e}") - raise e - except docker.errors.ImageNotFound as e: - print(f"ImageNotFound: {e}") - raise e - except docker.errors.APIError as e: - print(f"APIError: {e}") - raise e \ No newline at end of file diff --git a/notebooks/utils/model.py b/notebooks/utils/model.py deleted file mode 100644 index e211c757..00000000 --- a/notebooks/utils/model.py +++ /dev/null @@ -1,95 +0,0 @@ -import os -import shutil -from .docker_commands import run_inference_for_model, run_msa_alignment -from .utils import validate_sequence, generate_random_run_id, write_sequences_to_fasta, generate_individual_sequence_files - -default_fasta_dir_name = 'fasta_dir' - -def run_inference( - inference_input, - database_dir, - weight_set, - model_name, - use_precomputed_alignments=False, - run_id=None): - - os.makedirs('data', exist_ok=True) - - if use_precomputed_alignments: - if not run_id: - raise ValueError(f"run_id is required when using pre computed aligments.") - - if not run_id: - - # Generate a random directory name for storing the fasta file - run_id = generate_random_run_id() - - run_path = f"{os.getcwd()}/data/run_{run_id}" - fasta_dir_root_path = f'{run_path}/{default_fasta_dir_name}' - - os.makedirs(fasta_dir_root_path, exist_ok=True) - - print(f"Fasta root directory: {fasta_dir_root_path}") - - # Determine if the inference_input is a directory or a sequence string - if os.path.isfile(inference_input): - shutil.copy2(inference_input, f"{fasta_dir_root_path}/sequences.fasta") - - print(f"Using input file : {inference_input}") - else: - # Treat inference_input as a sequence string - sequence_string = inference_input - print(f"Input is a sequence string: {sequence_string}") - - # Validate and clean the sequence string - sequence_string = validate_sequence(inference_input, weight_set) - - fasta_file = write_sequences_to_fasta(sequence_string.split(':'), fasta_dir_root_path) - print(f"Sequences written to FASTA file: {fasta_file}") - - - generate_individual_sequence_files(run_path) - - run_msa_alignment(run_path, database_dir, default_fasta_dir_name) - - run_model_with(run_path, database_dir, default_fasta_dir_name, weight_set, model_name) - - elif run_id and use_precomputed_alignments: - # Run inference with given Run ID - run_path = f"{os.getcwd()}/data/run_{run_id}" - - if os.path.isdir(run_path): - run_model_with(run_path, database_dir, default_fasta_dir_name, weight_set, model_name) - else: - raise ValueError(f"Provided Run ID does not exists") - -def run_model_with(run_path, database_dir, default_fasta_dir_name, weight_set, model_name): - config_preset_list = get_config_preset_list_for_model(weight_set, model_name) - - for config_preset in config_preset_list: - run_inference_for_model( - run_path, - database_dir, - default_fasta_dir_name, - config_preset - ) - -def get_config_preset_list_for_model(weight_set, model_name): - if weight_set == "OpenFold" and model_name == "monomer": - model_names = [ - 'finetuning_3.pt', - 'finetuning_4.pt', - 'finetuning_5.pt', - 'finetuning_ptm_2.pt', - 'finetuning_no_templ_ptm_1.pt' - ] - elif weight_set == 
"AlphaFold" and model_name == "multimer": - # Generate 'model_1_multimer_v3' to 'model_5_multimer_v3' using a loop - model_names = [f'model_{i}_multimer_v3' for i in range(1, 6)] - elif weight_set == "AlphaFold" and model_name == "monomer": - # Generate 'model_1' to 'model_5' using a loop - model_names = [f'model_{i}' for i in range(1, 6)] - else: - raise ValueError(f"Invalid combination of weight_set '{weight_set}' and model_name '{model_name}'") - - return model_names \ No newline at end of file diff --git a/notebooks/utils/requirements.txt b/notebooks/utils/requirements.txt deleted file mode 100644 index de5cce34..00000000 --- a/notebooks/utils/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -docker==7.1.0 \ No newline at end of file diff --git a/notebooks/utils/utils.py b/notebooks/utils/utils.py deleted file mode 100644 index 50a13683..00000000 --- a/notebooks/utils/utils.py +++ /dev/null @@ -1,73 +0,0 @@ -import random -import string -import os - - -def generate_random_run_id(length=6): - run_id = ''.join(random.choices(string.ascii_uppercase + string.digits, k=length)) - print(f"Run ID: {run_id}") - return run_id - -def generate_random_sequence_name(length=8): - return ''.join(random.choices(string.ascii_uppercase + string.digits, k=length)) - -def write_sequences_to_fasta(sequences, fasta_dir_root_path): - # os.makedirs(f'inference/{fasta_dir}', exist_ok=True) - fasta_file = os.path.join(fasta_dir_root_path, "sequences.fasta") - - with open(fasta_file, 'w') as f: - for seq in sequences: - sequence_name = f"sequence_{generate_random_sequence_name()}" - f.write(f">{sequence_name}\n") - f.write(f"{seq}\n") - - return fasta_file - -def validate_sequence(input_sequence, weight_set): - # Remove all whitespaces, tabs, and end lines; convert to upper-case - input_sequence = input_sequence.translate(str.maketrans('', '', ' \n\t')).upper() - aatypes = set('ACDEFGHIKLMNPQRSTVWY') # 20 standard amino acids - allowed_chars = aatypes.union({':'}) - - if not set(input_sequence).issubset(allowed_chars): - raise Exception(f'Input sequence contains non-amino acid letters: {set(input_sequence) - allowed_chars}. OpenFold only supports 20 standard amino acids as inputs.') - - if ':' in input_sequence and weight_set != 'AlphaFold': - raise ValueError('Input sequence is a multimer, must select Alphafold weight set') - - return input_sequence - - - -def generate_individual_sequence_files(output_path): - with open(f"{output_path}/fasta_dir/sequences.fasta", "r") as infile: - sequence_id = None - sequence_lines = [] - - output_path = f"{output_path}/fasta_dir/tmp" - os.makedirs(output_path, exist_ok=True) - - for line in infile: - line = line.strip() - if line.startswith(">"): - if sequence_id is not None: - # Save the previous sequence to a file - output_file = os.path.join(output_path, f"{sequence_id}.fasta") - with open(output_file, "w") as outfile: - outfile.write(f">{sequence_id}\n") - outfile.write("\n".join(sequence_lines) + "\n") - print(f"Saved {sequence_id} to {output_file}") - - # Start a new sequence - sequence_id = line[1:].split('.')[0] # Remove '>' and split by '.' 
to remove the suffix - sequence_lines = [] - else: - sequence_lines.append(line) - - # Save the last sequence - if sequence_id is not None: - output_file = os.path.join(output_path, f"{sequence_id}.fasta") - with open(output_file, "w") as outfile: - outfile.write(f">{sequence_id}\n") - outfile.write("\n".join(sequence_lines) + "\n") - print(f"Saved {sequence_id} to {output_file}") From 42b1383433f33d0b0f25f31653a4c64f502a4978 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julio=20C=C3=A9sar?= Date: Tue, 27 Aug 2024 02:31:51 +0000 Subject: [PATCH 06/15] refactor to use class based client --- notebooks/src/__init__.py | 0 notebooks/src/docker_commands.py | 142 +++++++++++++++++++++++++++++++ notebooks/src/model.py | 86 +++++++++++++++++++ notebooks/src/requirements.txt | 1 + notebooks/src/utils.py | 71 ++++++++++++++++ 5 files changed, 300 insertions(+) create mode 100644 notebooks/src/__init__.py create mode 100644 notebooks/src/docker_commands.py create mode 100644 notebooks/src/model.py create mode 100644 notebooks/src/requirements.txt create mode 100644 notebooks/src/utils.py diff --git a/notebooks/src/__init__.py b/notebooks/src/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/notebooks/src/docker_commands.py b/notebooks/src/docker_commands.py new file mode 100644 index 00000000..a4e7831e --- /dev/null +++ b/notebooks/src/docker_commands.py @@ -0,0 +1,142 @@ +import docker +import os + +def run_inference_for_model( + run_path, + database_dir, + default_fasta_dir_name, + config_preset, + use_precomputed_alignments=True, + ): + + client = docker.from_env() + # Set up the volumes dictionary + volumes = { + run_path: {'bind': '/run_path', 'mode': 'rw'}, + } + + volumes[database_dir] = {'bind': '/database', 'mode': 'rw'} + + fasta_dir = f"/run_path/{default_fasta_dir_name}/tmp" + outpur_dir = "/run_path/output" + precomputed_alignments_dir = "/run_path/output/alignments" + + command = [ + "python3", "/opt/openfold/run_pretrained_openfold.py", + fasta_dir, + "/database/pdb_mmcif/mmcif_files/", + # Databases + "--uniref90_database_path", "/database/uniref90/uniref90.fasta", + "--mgnify_database_path", "/database/mgnify/mgy_clusters_2022_05.fa", + "--pdb70_database_path", "/database/pdb70/pdb70", + "--uniclust30_database_path", "/database/uniclust30/uniclust30_2018_08/uniclust30_2018_08", + "--bfd_database_path", "/database/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt", + # Search MSA + "--jackhmmer_binary_path", "/opt/conda/bin/jackhmmer", + "--hhblits_binary_path", "/opt/conda/bin/hhblits", + "--hhsearch_binary_path", "/opt/conda/bin/hhsearch", + "--kalign_binary_path", "/opt/conda/bin/kalign", + "--pdb_seqres_database_path", "/database/pdb_seqres/pdb_seqres.txt", + "--uniref30_database_path", "/database/uniref30/UniRef30_2021_03", + "--uniprot_database_path", "/database/uniprot/uniprot.fasta", + "--hmmsearch_binary_path", "/opt/conda/bin/hmmsearch", + "--hmmbuild_binary_path", "/opt/conda/bin/hmmbuild", + # Inference settings + "--model_device", "cuda:0", + "--config_preset", config_preset, + "--save_outputs", + "--output_dir", outpur_dir + ] + + if use_precomputed_alignments: + command.extend(["--use_precomputed_alignments", precomputed_alignments_dir]) + + try: + print("Running Docker container for inference...") + container = client.containers.run( + image="openfold:latest", + command=command, + volumes=volumes, + runtime="nvidia", # This is required for GPU usage + remove=True, + detach=True, # Running in the background to allow log streaming + 
stdout=True, + stderr=True + ) + + # Stream logs in real-time + for log in container.logs(stream=True): + print(log.decode('utf-8'), end='') + + except docker.errors.ContainerError as e: + print(f"ContainerError: {e}") + except docker.errors.ImageNotFound as e: + print(f"ImageNotFound: {e}") + except docker.errors.APIError as e: + print(f"APIError: {e}") + +def run_msa_alignment( + run_path, + database_dir, + default_fasta_dir_name, + cpus_per_task=32, + no_tasks=1, +): + client = docker.from_env() + + precomputed_alignments_dir = f"{run_path}/output/alignments" + os.makedirs(precomputed_alignments_dir, exist_ok=True) + + volumes = { + run_path: {'bind': '/run_path', 'mode': 'rw'}, + } + + volumes[database_dir] = {'bind': '/database', 'mode': 'rw'} + + fasta_dir = f"/run_path/{default_fasta_dir_name}/tmp" + precomputed_alignments_dir_docker = "/run_path/output/alignments" + + # Define the Docker command + command = [ + "python3", "/opt/openfold/scripts/precompute_alignments.py", + fasta_dir, + precomputed_alignments_dir_docker, + "--uniref90_database_path", "/database/uniref90/uniref90.fasta", + "--mgnify_database_path", "/database/mgnify/mgy_clusters_2022_05.fa", + "--pdb70_database_path", "/database/pdb70/pdb70", + "--uniclust30_database_path", "/database/uniclust30/uniclust30_2018_08/uniclust30_2018_08", + "--bfd_database_path", "/database/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt", + "--cpus_per_task", str(cpus_per_task), + "--no_tasks", str(no_tasks), + "--jackhmmer_binary_path", "/opt/conda/bin/jackhmmer", + "--hhblits_binary_path", "/opt/conda/bin/hhblits", + "--hhsearch_binary_path", "/opt/conda/bin/hhsearch", + "--kalign_binary_path", "/opt/conda/bin/kalign" + ] + + try: + print("Running Docker container for MSA alignment...") + container = client.containers.run( + image="openfold:latest", + command=command, + volumes=volumes, + runtime="nvidia", # This is required for GPU usage + remove=True, + detach=True, # Running in the background to allow log streaming + stdout=True, + stderr=True + ) + + # Stream logs in real-time + for log in container.logs(stream=True): + print(log.decode('utf-8'), end='') + + except docker.errors.ContainerError as e: + print(f"ContainerError: {e}") + raise e + except docker.errors.ImageNotFound as e: + print(f"ImageNotFound: {e}") + raise e + except docker.errors.APIError as e: + print(f"APIError: {e}") + raise e \ No newline at end of file diff --git a/notebooks/src/model.py b/notebooks/src/model.py new file mode 100644 index 00000000..20354d1b --- /dev/null +++ b/notebooks/src/model.py @@ -0,0 +1,86 @@ +import os +import shutil +from .docker_commands import run_inference_for_model, run_msa_alignment +from .utils import validate_sequence, generate_random_run_id, write_sequences_to_fasta, generate_individual_sequence_files + +class InferenceClientOpenFold: + default_fasta_dir_name = 'fasta_dir' + + def __init__(self, database_dir): + self.database_dir = database_dir + + def run_inference(self, weight_set, model_name, inference_input=None, use_precomputed_alignments=False, run_id=None): + os.makedirs('data', exist_ok=True) + + if use_precomputed_alignments: + if not run_id: + raise ValueError("run_id is required when using pre-computed alignments.") + else: + if not inference_input: + raise ValueError("inference_input is required to compute alignments.") + + if not run_id: + run_id = generate_random_run_id() + run_path = f"{os.getcwd()}/data/run_{run_id}" + fasta_dir_root_path = f'{run_path}/{self.default_fasta_dir_name}' + + 
os.makedirs(fasta_dir_root_path, exist_ok=True) + print(f"Fasta root directory: {fasta_dir_root_path}") + + if os.path.isfile(inference_input): + shutil.copy2(inference_input, f"{fasta_dir_root_path}/sequences.fasta") + print(f"Using input file: {inference_input}") + else: + sequence_string = inference_input + print(f"Input is a sequence string: {sequence_string}") + + sequence_string = validate_sequence(inference_input, weight_set) + fasta_file = write_sequences_to_fasta(sequence_string.split(':'), fasta_dir_root_path) + print(f"Sequences written to FASTA file: {fasta_file}") + + generate_individual_sequence_files(run_path) + + self.run_msa_alignment(run_path) + self.run_model_with(run_path, weight_set, model_name) + + elif run_id and use_precomputed_alignments: + run_path = f"{os.getcwd()}/data/run_{run_id}" + + if os.path.isdir(run_path): + self.run_model_with(run_path, weight_set, model_name) + else: + raise ValueError("Provided Run ID does not exist") + + return run_id + + def run_model_with(self, run_path, weight_set, model_name): + config_preset_list = self.get_config_preset_list_for_model(weight_set, model_name) + + for config_preset in config_preset_list: + run_inference_for_model( + run_path, + self.database_dir, + self.default_fasta_dir_name, + config_preset + ) + + def run_msa_alignment(self, run_path): + run_msa_alignment(run_path, self.database_dir, self.default_fasta_dir_name) + + def get_config_preset_list_for_model(self, weight_set, model_name): + if weight_set == "OpenFold" and model_name == "monomer": + model_names = [ + 'finetuning_3.pt', + 'finetuning_4.pt', + 'finetuning_5.pt', + 'finetuning_ptm_2.pt', + 'finetuning_no_templ_ptm_1.pt' + ] + elif weight_set == "AlphaFold" and model_name == "multimer": + model_names = [f'model_{i}_multimer_v3' for i in range(1, 6)] + elif weight_set == "AlphaFold" and model_name == "monomer": + model_names = [f'model_{i}' for i in range(1, 6)] + else: + raise ValueError(f"Invalid combination of weight_set '{weight_set}' and model_name '{model_name}'") + + return model_names \ No newline at end of file diff --git a/notebooks/src/requirements.txt b/notebooks/src/requirements.txt new file mode 100644 index 00000000..de5cce34 --- /dev/null +++ b/notebooks/src/requirements.txt @@ -0,0 +1 @@ +docker==7.1.0 \ No newline at end of file diff --git a/notebooks/src/utils.py b/notebooks/src/utils.py new file mode 100644 index 00000000..38761c6f --- /dev/null +++ b/notebooks/src/utils.py @@ -0,0 +1,71 @@ +import random +import string +import os + + +def generate_random_run_id(length=6): + run_id = ''.join(random.choices(string.ascii_uppercase + string.digits, k=length)) + print(f"Run ID: {run_id}") + return run_id + +def generate_random_sequence_name(length=8): + return ''.join(random.choices(string.ascii_uppercase + string.digits, k=length)) + +def write_sequences_to_fasta(sequences, fasta_dir_root_path): + # os.makedirs(f'inference/{fasta_dir}', exist_ok=True) + fasta_file = os.path.join(fasta_dir_root_path, "sequences.fasta") + + with open(fasta_file, 'w') as f: + for seq in sequences: + sequence_name = f"sequence_{generate_random_sequence_name()}" + f.write(f">{sequence_name}\n") + f.write(f"{seq}\n") + + return fasta_file + +def validate_sequence(input_sequence, weight_set): + # Remove all whitespaces, tabs, and end lines; convert to upper-case + input_sequence = input_sequence.translate(str.maketrans('', '', ' \n\t')).upper() + aatypes = set('ACDEFGHIKLMNPQRSTVWY') # 20 standard amino acids + allowed_chars = aatypes.union({':'}) + + if not 
set(input_sequence).issubset(allowed_chars):
+        raise Exception(f'Input sequence contains non-amino acid letters: {set(input_sequence) - allowed_chars}. OpenFold only supports 20 standard amino acids as inputs.')
+
+    if ':' in input_sequence and weight_set != 'AlphaFold':
+        raise ValueError('Input sequence is a multimer, must select Alphafold weight set')
+
+    return input_sequence
+
+def generate_individual_sequence_files(output_path):
+    with open(f"{output_path}/fasta_dir/sequences.fasta", "r") as infile:
+        sequence_id = None
+        sequence_lines = []
+
+        output_path = f"{output_path}/fasta_dir/tmp"
+        os.makedirs(output_path, exist_ok=True)
+
+        for line in infile:
+            line = line.strip()
+            if line.startswith(">"):
+                if sequence_id is not None:
+                    # Save the previous sequence to a file
+                    output_file = os.path.join(output_path, f"{sequence_id}.fasta")
+                    with open(output_file, "w") as outfile:
+                        outfile.write(f">{sequence_id}\n")
+                        outfile.write("\n".join(sequence_lines) + "\n")
+                    print(f"Saved {sequence_id} to {output_file}")
+
+                # Start a new sequence
+                sequence_id = line[1:].split('.')[0]  # Remove '>' and split by '.' to remove the suffix
+                sequence_lines = []
+            else:
+                sequence_lines.append(line)
+
+        # Save the last sequence
+        if sequence_id is not None:
+            output_file = os.path.join(output_path, f"{sequence_id}.fasta")
+            with open(output_file, "w") as outfile:
+                outfile.write(f">{sequence_id}\n")
+                outfile.write("\n".join(sequence_lines) + "\n")
+            print(f"Saved {sequence_id} to {output_file}")

From 0a0f88f930b0553aa4a2fef84f43778977f0be83 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Julio=20C=C3=A9sar?=
Date: Tue, 27 Aug 2024 02:32:06 +0000
Subject: [PATCH 07/15] Updated notebook examples

---
 notebooks/OpenFoldLocal.ipynb | 84 ++++++++++++++++++++++++-----------
 1 file changed, 58 insertions(+), 26 deletions(-)

diff --git a/notebooks/OpenFoldLocal.ipynb b/notebooks/OpenFoldLocal.ipynb
index 56214b79..dc8cca88 100644
--- a/notebooks/OpenFoldLocal.ipynb
+++ b/notebooks/OpenFoldLocal.ipynb
@@ -8,12 +8,14 @@
     "\n",
     "Provides the flexibility to run inference using a local installation of OpenFold with Docker, along with the convenience of visualizing results using the same plots from the OpenFold Colab Notebook.\n",
     "\n",
-    "If you have access to a machine and want to do quick inference and visualize results, there are some useful things you can do with this notebook:\n",
+    "This notebook uses the provided utility functions to execute OpenFold via Docker and adds logic for handling the results, so you can experiment with different parameters, reuse computed MSAs, pick the best model, plot metrics, and manage long-running executions asynchronously.\n",
     "\n",
-    "- You can use precomputed alignments, which enables you to run inference with different model parameters to compare results.\n",
-    "- Get the best model and plots using the provided utility functions.\n",
-    "- Handle long-running executions.\n",
+    "If you have access to a machine and want to do quick inference and visualize results, there are some useful things you can do with this notebook:\n",
     "\n",
+    "- You can use precomputed alignments, which enables you to run inference with different model parameters to compare results\n",
+    "- Get the best model and metric plots\n",
+    "- Handle long-running executions\n",
+    "- Work with big datasets, e.g. split your input and perform async runs using asyncio on multiple GPUs\n",
     "\n",
     "Of course, you can do this solely using Docker commands in the terminal, but you would need to code/adjust the Colab functions to work with data locally. This notebook gives you a head start."
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "## Setup the notebook\n",
     "\n",
+    "First, build OpenFold using Docker. Follow this [guide](https://openfold.readthedocs.io/en/latest/original_readme.html#building-and-using-the-docker-container).\n",
+    "\n",
     "Start your notebook:\n",
     "\n",
     "Go to the notebook folder:\n",
@@ -36,22 +40,37 @@
     "\n",
     "Start your Jupyter server\n",
     "\n",
-    "`jupyter lab . --ip=\"0.0.0.0\"`\n",
-    "\n"
+    "`jupyter lab . --ip=\"0.0.0.0\"`\n"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Running Inference "
+    "## Running Inference \n",
+    "\n",
+    "**Inputs:** files or strings with sequences\n",
+    "\n",
+    "**Output:** \n",
+    "\n",
+    "```bash\n",
+    "data/ \n",
+    "├── run_<run_id>/ # each run is stored under a random ID; this ID can be used to re-run inference \n",
+    "│   ├── fasta_dir/ \n",
+    "│   │   ├── tmp/ # one generated .fasta file per sequence\n",
+    "│   │   └── sequences.fasta # validated input sequences are merged into a .fasta file\n",
+    "│   └── output/\n",
+    "│       ├── alignments/ # one folder of MSA results per sequence\n",
+    "│       ├── predictions/ # inference results as .pkl and .pdb files\n",
+    "│       └── timings.json # inference time\n",
+    "```"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Example usage 1: Using a sequence string"
+    "#### Example usage 1: Using a sequence string"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "from utils.docker_commands import run_inference\n",
+    "from src.model import InferenceClientOpenFold\n",
     "\n",
-    "# For multiple sequences, separate sequences with a colon `:`\n",
-    "inference_input = \"DAGAQGAAIGSPGVLSGNVVQVPVHVPVNVCGNTVSVIGLLNPAFGNTCVNA:AGETGRTGVLVTSSATNDGDSGWGRFAG\"\n",
+    "# Initialize the OpenFold Docker client setting the database path \n",
+    "databases_dir = \"/home/juliocesar/DataZ/Models/alphafold_all_data\"\n",
+    "openfold_client = InferenceClientOpenFold(databases_dir)\n",
+    "\n",
+    "# Set the input, for multiple sequences, separate sequences with a colon `:`\n",
+    "input_string = \"DAGAQGAAIGSPGVLSGNVVQVPVHVPVNVCGNTVSVIGLLNPAFGNTCVNA:AGETGRTGVLVTSSATNDGDSGWGRFAG\"\n",
     "\n",
-    "database_dir = \"/path\"\n",
     "model_name = \"multimer\"\n",
     "weight_set = 'AlphaFold'\n",
     "\n",
-    "run_inference(inference_input, database_dir, weight_set, model_name)\n"
+    "\n",
+    "# Run inference\n",
+    "run_id = openfold_client.run_inference(weight_set, model_name, inference_input=input_string)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Example usage 2: Using a pre-existing fasta directory"
+    "#### Example usage 2: Using a fasta file"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "from utils.docker_commands import run_inference\n",
-    "\n",
-    "# Example usage, put your .fasta file with the sequences to process in the inference dir\n",
-    "inference_dir = \"/path\"\n",
-    "databases_dir = \"/path\"\n",
-    "\n",
-    "\n",
-    "weight_set = 'AlphaFold'\n",
-    "model_name = \"multimer\"\n",
-    "\n",
-    "run_inference(inference_dir, databases_dir, weight_set, model_name, use_precomputed_alignments=True)"
+    "input_file = \"/home/juliocesar/Models/openfold/notebooks/data/test.fasta\"\n",
+    "\n",
+    "run_id = openfold_client.run_inference(weight_set, model_name, inference_input=input_file)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "#### Example usage 3: Using pre-computed alignments for a run_id"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
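+    "# Assumes run 8CJUIY exists under data/ from an earlier run; its precomputed MSAs are\n",
+    "# reused, so only the model inference step is executed again.\n",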
+ "openfold_client.run_inference(weight_set, model_name, use_precomputed_alignments=True, run_id=\"8CJUIY\")" ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "signalp-6.0", "language": "python", - "name": "python3" + "name": "signalp-6.0" }, "language_info": { "codemirror_mode": { From b8d26e30540a48a5fd56591c4df6e35369496ed7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julio=20C=C3=A9sar?= Date: Tue, 27 Aug 2024 03:30:46 +0000 Subject: [PATCH 08/15] updated notebook --- notebooks/OpenFoldLocal.ipynb | 41 +++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/notebooks/OpenFoldLocal.ipynb b/notebooks/OpenFoldLocal.ipynb index dc8cca88..c72dc7fa 100644 --- a/notebooks/OpenFoldLocal.ipynb +++ b/notebooks/OpenFoldLocal.ipynb @@ -70,28 +70,48 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Example usage 1: Using a sequence string" + "#### Initialize the client" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "from src.model import InferenceClientOpenFold\n", + "import docker\n", + "from src.inference import InferenceClientOpenFold\n", + "\n", + "# You can also use a remote docker server \n", + "docker_client = docker.from_env()\n", + "\n", + "# i.e connect to the remote Docker daemon\n", + "# remote_docker_client = docker.DockerClient(base_url='tcp://:2375')\n", "\n", "# Initialize the OpenFold Docker client setting the database path \n", "databases_dir = \"/home/juliocesar/DataZ/Models/alphafold_all_data\"\n", - "openfold_client = InferenceClientOpenFold(databases_dir)\n", - "\n", - "# Set the input, for multiple sequences, separate sequences with a colon `:`\n", + "openfold_client = InferenceClientOpenFold(databases_dir, docker_client)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Inference using a sequence string" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# For multiple sequences, separate sequences with a colon `:`\n", "input_string = \"DAGAQGAAIGSPGVLSGNVVQVPVHVPVNVCGNTVSVIGLLNPAFGNTCVNA:AGETGRTGVLVTSSATNDGDSGWGRFAG\"\n", "\n", "model_name = \"multimer\"\n", "weight_set = 'AlphaFold'\n", "\n", - "\n", "# Run inference\n", "run_id = openfold_client.run_inference(weight_set, model_name, inference_input=input_string)" ] @@ -100,7 +120,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Example usage 2: Using a fasta file" + "#### Inference using a fasta file" ] }, { @@ -118,7 +138,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Example usage 3: Using a pre-computed aligments for a run_id" + "#### Inference using pre-computed aligments for a run_id" ] }, { @@ -127,6 +147,9 @@ "metadata": {}, "outputs": [], "source": [ + "model_name = \"multimer\"\n", + "weight_set = 'AlphaFold'\n", + "\n", "openfold_client.run_inference(weight_set, model_name, use_precomputed_alignments=True, run_id=\"8CJUIY\")" ] } From 9ea27338a93595dd9c01fb5e09ba2ed1a273feae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julio=20C=C3=A9sar?= Date: Tue, 27 Aug 2024 03:31:03 +0000 Subject: [PATCH 09/15] refactor to granular class --- notebooks/src/docker_commands.py | 142 ------------------------------- notebooks/src/docker_runner.py | 113 ++++++++++++++++++++++++ notebooks/src/inference.py | 100 ++++++++++++++++++++++ notebooks/src/model.py | 86 ------------------- 4 files changed, 213 insertions(+), 228 
deletions(-) delete mode 100644 notebooks/src/docker_commands.py create mode 100644 notebooks/src/docker_runner.py create mode 100644 notebooks/src/inference.py delete mode 100644 notebooks/src/model.py diff --git a/notebooks/src/docker_commands.py b/notebooks/src/docker_commands.py deleted file mode 100644 index a4e7831e..00000000 --- a/notebooks/src/docker_commands.py +++ /dev/null @@ -1,142 +0,0 @@ -import docker -import os - -def run_inference_for_model( - run_path, - database_dir, - default_fasta_dir_name, - config_preset, - use_precomputed_alignments=True, - ): - - client = docker.from_env() - # Set up the volumes dictionary - volumes = { - run_path: {'bind': '/run_path', 'mode': 'rw'}, - } - - volumes[database_dir] = {'bind': '/database', 'mode': 'rw'} - - fasta_dir = f"/run_path/{default_fasta_dir_name}/tmp" - outpur_dir = "/run_path/output" - precomputed_alignments_dir = "/run_path/output/alignments" - - command = [ - "python3", "/opt/openfold/run_pretrained_openfold.py", - fasta_dir, - "/database/pdb_mmcif/mmcif_files/", - # Databases - "--uniref90_database_path", "/database/uniref90/uniref90.fasta", - "--mgnify_database_path", "/database/mgnify/mgy_clusters_2022_05.fa", - "--pdb70_database_path", "/database/pdb70/pdb70", - "--uniclust30_database_path", "/database/uniclust30/uniclust30_2018_08/uniclust30_2018_08", - "--bfd_database_path", "/database/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt", - # Search MSA - "--jackhmmer_binary_path", "/opt/conda/bin/jackhmmer", - "--hhblits_binary_path", "/opt/conda/bin/hhblits", - "--hhsearch_binary_path", "/opt/conda/bin/hhsearch", - "--kalign_binary_path", "/opt/conda/bin/kalign", - "--pdb_seqres_database_path", "/database/pdb_seqres/pdb_seqres.txt", - "--uniref30_database_path", "/database/uniref30/UniRef30_2021_03", - "--uniprot_database_path", "/database/uniprot/uniprot.fasta", - "--hmmsearch_binary_path", "/opt/conda/bin/hmmsearch", - "--hmmbuild_binary_path", "/opt/conda/bin/hmmbuild", - # Inference settings - "--model_device", "cuda:0", - "--config_preset", config_preset, - "--save_outputs", - "--output_dir", outpur_dir - ] - - if use_precomputed_alignments: - command.extend(["--use_precomputed_alignments", precomputed_alignments_dir]) - - try: - print("Running Docker container for inference...") - container = client.containers.run( - image="openfold:latest", - command=command, - volumes=volumes, - runtime="nvidia", # This is required for GPU usage - remove=True, - detach=True, # Running in the background to allow log streaming - stdout=True, - stderr=True - ) - - # Stream logs in real-time - for log in container.logs(stream=True): - print(log.decode('utf-8'), end='') - - except docker.errors.ContainerError as e: - print(f"ContainerError: {e}") - except docker.errors.ImageNotFound as e: - print(f"ImageNotFound: {e}") - except docker.errors.APIError as e: - print(f"APIError: {e}") - -def run_msa_alignment( - run_path, - database_dir, - default_fasta_dir_name, - cpus_per_task=32, - no_tasks=1, -): - client = docker.from_env() - - precomputed_alignments_dir = f"{run_path}/output/alignments" - os.makedirs(precomputed_alignments_dir, exist_ok=True) - - volumes = { - run_path: {'bind': '/run_path', 'mode': 'rw'}, - } - - volumes[database_dir] = {'bind': '/database', 'mode': 'rw'} - - fasta_dir = f"/run_path/{default_fasta_dir_name}/tmp" - precomputed_alignments_dir_docker = "/run_path/output/alignments" - - # Define the Docker command - command = [ - "python3", "/opt/openfold/scripts/precompute_alignments.py", - fasta_dir, 
-        precomputed_alignments_dir_docker,
-        "--uniref90_database_path", "/database/uniref90/uniref90.fasta",
-        "--mgnify_database_path", "/database/mgnify/mgy_clusters_2022_05.fa",
-        "--pdb70_database_path", "/database/pdb70/pdb70",
-        "--uniclust30_database_path", "/database/uniclust30/uniclust30_2018_08/uniclust30_2018_08",
-        "--bfd_database_path", "/database/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt",
-        "--cpus_per_task", str(cpus_per_task),
-        "--no_tasks", str(no_tasks),
-        "--jackhmmer_binary_path", "/opt/conda/bin/jackhmmer",
-        "--hhblits_binary_path", "/opt/conda/bin/hhblits",
-        "--hhsearch_binary_path", "/opt/conda/bin/hhsearch",
-        "--kalign_binary_path", "/opt/conda/bin/kalign"
-    ]
-
-    try:
-        print("Running Docker container for MSA alignment...")
-        container = client.containers.run(
-            image="openfold:latest",
-            command=command,
-            volumes=volumes,
-            runtime="nvidia",  # This is required for GPU usage
-            remove=True,
-            detach=True,  # Running in the background to allow log streaming
-            stdout=True,
-            stderr=True
-        )
-
-        # Stream logs in real-time
-        for log in container.logs(stream=True):
-            print(log.decode('utf-8'), end='')
-
-    except docker.errors.ContainerError as e:
-        print(f"ContainerError: {e}")
-        raise e
-    except docker.errors.ImageNotFound as e:
-        print(f"ImageNotFound: {e}")
-        raise e
-    except docker.errors.APIError as e:
-        print(f"APIError: {e}")
-        raise e
\ No newline at end of file
diff --git a/notebooks/src/docker_runner.py b/notebooks/src/docker_runner.py
new file mode 100644
index 00000000..70494003
--- /dev/null
+++ b/notebooks/src/docker_runner.py
@@ -0,0 +1,113 @@
+import docker
+import os
+
+class DockerRunner:
+    def __init__(self, client, run_path, database_dir, default_fasta_dir_name):
+        self.client = client
+        self.run_path = run_path
+        self.database_dir = database_dir
+        self.default_fasta_dir_name = default_fasta_dir_name
+
+    def _setup_volumes(self):
+        return {
+            self.run_path: {'bind': '/run_path', 'mode': 'rw'},
+            self.database_dir: {'bind': '/database', 'mode': 'rw'}
+        }
+
+    def _stream_logs(self, container):
+        for log in container.logs(stream=True):
+            print(log.decode('utf-8'), end='')
+
+    def run_inference_for_model(self, config_preset, use_precomputed_alignments=True):
+        command = self._build_inference_command(config_preset, use_precomputed_alignments)
+        self._run_container(command)
+
+    def run_msa_alignment(self, cpus_per_task=32, no_tasks=1):
+        precomputed_alignments_dir = f"{self.run_path}/output/alignments"
+        os.makedirs(precomputed_alignments_dir, exist_ok=True)
+        command = self._build_msa_alignment_command(cpus_per_task, no_tasks)
+        self._run_container(command)
+
+    def _build_inference_command(self, config_preset, use_precomputed_alignments):
+        fasta_dir = f"/run_path/{self.default_fasta_dir_name}/tmp"
+        output_dir = "/run_path/output"
+        precomputed_alignments_dir = "/run_path/output/alignments"
+
+        command = [
+            "python3", "/opt/openfold/run_pretrained_openfold.py",
+            fasta_dir,
+            "/database/pdb_mmcif/mmcif_files/",
+            "--uniref90_database_path", "/database/uniref90/uniref90.fasta",
+            "--mgnify_database_path", "/database/mgnify/mgy_clusters_2022_05.fa",
+            "--pdb70_database_path", "/database/pdb70/pdb70",
+            "--uniclust30_database_path", "/database/uniclust30/uniclust30_2018_08/uniclust30_2018_08",
+            "--bfd_database_path", "/database/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt",
+            "--jackhmmer_binary_path", "/opt/conda/bin/jackhmmer",
+            "--hhblits_binary_path", "/opt/conda/bin/hhblits",
+            "--hhsearch_binary_path", "/opt/conda/bin/hhsearch",
+ "--kalign_binary_path", "/opt/conda/bin/kalign", + "--pdb_seqres_database_path", "/database/pdb_seqres/pdb_seqres.txt", + "--uniref30_database_path", "/database/uniref30/UniRef30_2021_03", + "--uniprot_database_path", "/database/uniprot/uniprot.fasta", + "--hmmsearch_binary_path", "/opt/conda/bin/hmmsearch", + "--hmmbuild_binary_path", "/opt/conda/bin/hmmbuild", + "--model_device", "cuda:0", + "--config_preset", config_preset, + "--save_outputs", + "--output_dir", output_dir + ] + + if use_precomputed_alignments: + command.extend(["--use_precomputed_alignments", precomputed_alignments_dir]) + + return command + + def _build_msa_alignment_command(self, cpus_per_task, no_tasks): + fasta_dir = f"/run_path/{self.default_fasta_dir_name}/tmp" + precomputed_alignments_dir_docker = "/run_path/output/alignments" + + command = [ + "python3", "/opt/openfold/scripts/precompute_alignments.py", + fasta_dir, + precomputed_alignments_dir_docker, + "--uniref90_database_path", "/database/uniref90/uniref90.fasta", + "--mgnify_database_path", "/database/mgnify/mgy_clusters_2022_05.fa", + "--pdb70_database_path", "/database/pdb70/pdb70", + "--uniclust30_database_path", "/database/uniclust30/uniclust30_2018_08/uniclust30_2018_08", + "--bfd_database_path", "/database/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt", + "--cpus_per_task", str(cpus_per_task), + "--no_tasks", str(no_tasks), + "--jackhmmer_binary_path", "/opt/conda/bin/jackhmmer", + "--hhblits_binary_path", "/opt/conda/bin/hhblits", + "--hhsearch_binary_path", "/opt/conda/bin/hhsearch", + "--kalign_binary_path", "/opt/conda/bin/kalign" + ] + + return command + + def _run_container(self, command): + volumes = self._setup_volumes() + + try: + print("Running Docker container...") + container = self.client.containers.run( + image="openfold:latest", + command=command, + volumes=volumes, + runtime="nvidia", + remove=True, + detach=True, + stdout=True, + stderr=True + ) + + self._stream_logs(container) + + except docker.errors.ContainerError as e: + print(f"ContainerError: {e}") + raise e + except docker.errors.ImageNotFound as e: + print(f"ImageNotFound: {e}") + raise e + except docker.errors.APIError as e: + print(f"APIError: {e}") + raise e \ No newline at end of file diff --git a/notebooks/src/inference.py b/notebooks/src/inference.py new file mode 100644 index 00000000..6d53e508 --- /dev/null +++ b/notebooks/src/inference.py @@ -0,0 +1,100 @@ +import os +import shutil +from .docker_runner import DockerRunner +from .utils import validate_sequence, generate_random_run_id, write_sequences_to_fasta, generate_individual_sequence_files + +class InferenceClientOpenFold: + default_fasta_dir_name = 'fasta_dir' + + def __init__(self, database_dir, docker_client): + self.database_dir = database_dir + self.docker_client = docker_client + self.docker_runner = None + + def run_inference(self, weight_set, model_name, inference_input=None, use_precomputed_alignments=False, run_id=None): + os.makedirs('data', exist_ok=True) + + if use_precomputed_alignments: + if not run_id: + raise ValueError("run_id is required when using pre-computed alignments.") + return self._run_with_precomputed_alignments(run_id, weight_set, model_name) + + if not inference_input: + raise ValueError("inference_input is required to compute alignments.") + + if not run_id: + run_id = generate_random_run_id() + + return self._run_new_inference(run_id, weight_set, model_name, inference_input) + + def _run_new_inference(self, run_id, weight_set, model_name, inference_input): + 
run_path = os.path.join(os.getcwd(), 'data', f'run_{run_id}') + fasta_dir_root_path = os.path.join(run_path, self.default_fasta_dir_name) + self._initialize_docker_runner(run_path) + + self._prepare_fasta_directory(fasta_dir_root_path, inference_input, weight_set) + generate_individual_sequence_files(run_path) + + self.run_msa_alignment() + self.run_model_with_preset(run_path, weight_set, model_name) + + return run_id + + def _run_with_precomputed_alignments(self, run_id, weight_set, model_name): + run_path = os.path.join(os.getcwd(), 'data', f'run_{run_id}') + if not os.path.isdir(run_path): + raise ValueError(f"Provided Run ID '{run_id}' does not exist.") + + self._initialize_docker_runner(run_path) + self.run_model_with_preset(run_path, weight_set, model_name) + + return run_id + + def _initialize_docker_runner(self, run_path): + self.docker_runner = DockerRunner(self.docker_client, run_path, self.database_dir, self.default_fasta_dir_name) + + def _prepare_fasta_directory(self, fasta_dir_root_path, inference_input, weight_set): + os.makedirs(fasta_dir_root_path, exist_ok=True) + print(f"Fasta root directory: {fasta_dir_root_path}") + + if os.path.isfile(inference_input): + self._copy_input_file(inference_input, fasta_dir_root_path) + else: + self._write_sequence_to_fasta(inference_input, fasta_dir_root_path, weight_set) + + def _copy_input_file(self, input_file, fasta_dir_root_path): + destination = os.path.join(fasta_dir_root_path, 'sequences.fasta') + shutil.copy2(input_file, destination) + print(f"Using input file: {input_file}") + + def _write_sequence_to_fasta(self, sequence_string, fasta_dir_root_path, weight_set): + validated_sequence = validate_sequence(sequence_string, weight_set) + fasta_file = write_sequences_to_fasta(validated_sequence.split(':'), fasta_dir_root_path) + print(f"Sequences written to FASTA file: {fasta_file}") + + def run_model_with_preset(self, run_path, weight_set, model_name): + config_preset_list = self._get_config_preset_list_for_model(weight_set, model_name) + for config_preset in config_preset_list: + self.docker_runner.run_inference_for_model(config_preset) + + def run_msa_alignment(self, cpus_per_task=32, no_tasks=1): + self.docker_runner.run_msa_alignment(cpus_per_task=cpus_per_task, no_tasks=no_tasks) + + def _get_config_preset_list_for_model(self, weight_set, model_name): + model_configurations = { + ("OpenFold", "monomer"): [ + 'finetuning_3.pt', + 'finetuning_4.pt', + 'finetuning_5.pt', + 'finetuning_ptm_2.pt', + 'finetuning_no_templ_ptm_1.pt' + ], + ("AlphaFold", "multimer"): [f'model_{i}_multimer_v3' for i in range(1, 6)], + ("AlphaFold", "monomer"): [f'model_{i}' for i in range(1, 6)] + } + + config_preset_list = model_configurations.get((weight_set, model_name)) + if not config_preset_list: + raise ValueError(f"Invalid combination of weight_set '{weight_set}' and model_name '{model_name}'") + + return config_preset_list \ No newline at end of file diff --git a/notebooks/src/model.py b/notebooks/src/model.py deleted file mode 100644 index 20354d1b..00000000 --- a/notebooks/src/model.py +++ /dev/null @@ -1,86 +0,0 @@ -import os -import shutil -from .docker_commands import run_inference_for_model, run_msa_alignment -from .utils import validate_sequence, generate_random_run_id, write_sequences_to_fasta, generate_individual_sequence_files - -class InferenceClientOpenFold: - default_fasta_dir_name = 'fasta_dir' - - def __init__(self, database_dir): - self.database_dir = database_dir - - def run_inference(self, weight_set, model_name, 
inference_input=None, use_precomputed_alignments=False, run_id=None): - os.makedirs('data', exist_ok=True) - - if use_precomputed_alignments: - if not run_id: - raise ValueError("run_id is required when using pre-computed alignments.") - else: - if not inference_input: - raise ValueError("inference_input is required to compute alignments.") - - if not run_id: - run_id = generate_random_run_id() - run_path = f"{os.getcwd()}/data/run_{run_id}" - fasta_dir_root_path = f'{run_path}/{self.default_fasta_dir_name}' - - os.makedirs(fasta_dir_root_path, exist_ok=True) - print(f"Fasta root directory: {fasta_dir_root_path}") - - if os.path.isfile(inference_input): - shutil.copy2(inference_input, f"{fasta_dir_root_path}/sequences.fasta") - print(f"Using input file: {inference_input}") - else: - sequence_string = inference_input - print(f"Input is a sequence string: {sequence_string}") - - sequence_string = validate_sequence(inference_input, weight_set) - fasta_file = write_sequences_to_fasta(sequence_string.split(':'), fasta_dir_root_path) - print(f"Sequences written to FASTA file: {fasta_file}") - - generate_individual_sequence_files(run_path) - - self.run_msa_alignment(run_path) - self.run_model_with(run_path, weight_set, model_name) - - elif run_id and use_precomputed_alignments: - run_path = f"{os.getcwd()}/data/run_{run_id}" - - if os.path.isdir(run_path): - self.run_model_with(run_path, weight_set, model_name) - else: - raise ValueError("Provided Run ID does not exist") - - return run_id - - def run_model_with(self, run_path, weight_set, model_name): - config_preset_list = self.get_config_preset_list_for_model(weight_set, model_name) - - for config_preset in config_preset_list: - run_inference_for_model( - run_path, - self.database_dir, - self.default_fasta_dir_name, - config_preset - ) - - def run_msa_alignment(self, run_path): - run_msa_alignment(run_path, self.database_dir, self.default_fasta_dir_name) - - def get_config_preset_list_for_model(self, weight_set, model_name): - if weight_set == "OpenFold" and model_name == "monomer": - model_names = [ - 'finetuning_3.pt', - 'finetuning_4.pt', - 'finetuning_5.pt', - 'finetuning_ptm_2.pt', - 'finetuning_no_templ_ptm_1.pt' - ] - elif weight_set == "AlphaFold" and model_name == "multimer": - model_names = [f'model_{i}_multimer_v3' for i in range(1, 6)] - elif weight_set == "AlphaFold" and model_name == "monomer": - model_names = [f'model_{i}' for i in range(1, 6)] - else: - raise ValueError(f"Invalid combination of weight_set '{weight_set}' and model_name '{model_name}'") - - return model_names \ No newline at end of file From 55fcc7041099c9d47ea434300cea987924c20262 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julio=20C=C3=A9sar?= Date: Tue, 27 Aug 2024 21:57:19 +0000 Subject: [PATCH 10/15] added metrics and plots --- notebooks/src/metrics.py | 205 +++++++++++++++++++++++++++++++++ notebooks/src/plot_msas.py | 115 ++++++++++++++++++ notebooks/src/requirements.txt | 14 ++- notebooks/src/utils.py | 31 +++++ 4 files changed, 364 insertions(+), 1 deletion(-) create mode 100644 notebooks/src/metrics.py create mode 100644 notebooks/src/plot_msas.py diff --git a/notebooks/src/metrics.py b/notebooks/src/metrics.py new file mode 100644 index 00000000..8d755bb9 --- /dev/null +++ b/notebooks/src/metrics.py @@ -0,0 +1,205 @@ +import os +import json +import numpy as np +import pickle +import matplotlib +import matplotlib.pyplot as plt +import py3Dmol +from openfold.np import protein +from openfold.np.relax.utils import overwrite_b_factors +from .utils import 
get_config_preset_list_for_model, get_run_folder_by_id + +# Color bands for visualizing plddt +PLDDT_BANDS = [ + (0, 50, '#FF7D45'), + (50, 70, '#FFDB13'), + (70, 90, '#65CBF3'), + (90, 100, '#0053D6') +] + +plddts = {} +pae_outputs = {} +weighted_ptms = {} + + +def load_prediction_results(pkl_dir, fasta_id, model_name): + with open(f'{pkl_dir}/{fasta_id}_{model_name}_output_dict.pkl', 'rb') as f: + prediction_result = pickle.load(f) + return prediction_result + +def get_metrics_and_visualizations(run_id, weight_set, model_name, fasta_id, relax_prediction=False): + best_model_name, to_visualize_pdb_path = get_best_prediction_by_plddt(run_id, weight_set, model_name, fasta_id, relax_prediction) + plot_3d_structure(to_visualize_pdb_path) + get_plddt_pae(f"{get_run_folder_by_id(run_id)}/output/selected_predictions", best_model_name, fasta_id) + + +def get_best_prediction_by_plddt(run_id, weight_set, model_name, fasta_id, relax_prediction=False): + global plddts, pae_outputs, weighted_ptms, best_unrelaxed_prot + + run_folder = get_run_folder_by_id(run_id) + + pkl_dir = os.path.join(f"{run_folder}/output", 'predictions') + output_dir = os.path.join(f"{run_folder}/output", 'selected_predictions') + os.makedirs(output_dir, exist_ok=True) + + config_preset_list = get_config_preset_list_for_model(weight_set, model_name) + + for config_preset_item in config_preset_list: + prediction_result = load_prediction_results(pkl_dir, fasta_id, config_preset_item) + mean_plddt = prediction_result['plddt'].mean() + + plddts[config_preset_item] = prediction_result['plddt'] + + if model_name == 'multimer': + pae_outputs[config_preset_item] = (prediction_result['predicted_aligned_error'], prediction_result['max_predicted_aligned_error']) + weighted_ptms[config_preset_item] = prediction_result['weighted_ptm_score'] + + final_atom_mask = prediction_result['final_atom_mask'] + + # Find the best model according to the mean pLDDT. 
+ if model_name == 'monomer' or model_name == 'monomer_ptm': + best_model_name = max(plddts.keys(), key=lambda x: plddts[x].mean()) + elif model_name =='multimer': + best_model_name = max(weighted_ptms.keys(), key=lambda x: weighted_ptms[x]) + + print(f"Best model is: {best_model_name}, relaxed: {relax_prediction}") + + best_model_plddts = plddts[best_model_name].mean() + + print(f"Mean PLDDT: {best_model_plddts} ") + + # Save the mean pLDDT + pred_output_path = os.path.join(output_dir, f'{fasta_id}_mean_plddt.txt') + with open(pred_output_path, 'w') as f: + f.write(str(best_model_plddts)) + + unrelaxed_file_name = f'{pkl_dir}/{fasta_id}_{best_model_name}_unrelaxed.pdb' + + if relax_prediction: + pdb_file_name = f'{pkl_dir}/{fasta_id}_{best_model_name}_relaxed.pdb' + else: + pdb_file_name = unrelaxed_file_name + + with open(pdb_file_name, 'r') as file: + best_pdb = file.read() + + with open(unrelaxed_file_name, 'r') as file: + best_unrelaxed_pdb_str = file.read() + + best_unrelaxed_prot = protein.from_pdb_string(best_unrelaxed_pdb_str) + + pred_output_path = os.path.join(output_dir, f'{fasta_id}_selected_prediction.pdb') + with open(pred_output_path, 'w') as f: + f.write(best_pdb) + + banded_b_factors = [] + for plddt in plddts[best_model_name]: + for idx, (min_val, max_val, _) in enumerate(PLDDT_BANDS): + if plddt >= min_val and plddt <= max_val: + banded_b_factors.append(idx) + break + banded_b_factors = np.array(banded_b_factors)[:, None] * final_atom_mask + to_visualize_pdb = overwrite_b_factors(best_pdb, banded_b_factors) + + visualize_output_path = os.path.join(output_dir, f'{fasta_id}_selected_prediction_visualize.pdb') + with open(visualize_output_path, 'w') as f: + f.write(to_visualize_pdb) + + pae_output_path = os.path.join(output_dir, f'{fasta_id}_predicted_aligned_error.json') + if pae_outputs: + rounded_errors = np.round(pae_outputs[best_model_name][0].astype(np.float64), decimals=1) + indices = np.indices((len(rounded_errors), len(rounded_errors))) + 1 + indices_1 = indices[0].flatten().tolist() + indices_2 = indices[1].flatten().tolist() + pae_data = json.dumps([{ + 'residue1': indices_1, + 'residue2': indices_2, + 'distance': rounded_errors.flatten().tolist(), + 'max_predicted_aligned_error': pae_outputs[best_model_name][1].item() + }], + indent=None, + separators=(',', ':')) + with open(pae_output_path, 'w') as f: + f.write(pae_data) + + return best_model_name, visualize_output_path + + +def get_plddt_pae(output_dir, best_model_name, fasta_id): + if pae_outputs: + num_plots = 2 + else: + num_plots = 1 + + plt.figure(figsize=[8 * num_plots, 6]) + plt.subplot(1, num_plots, 1) + plt.plot(plddts[best_model_name]) + plt.title('Predicted LDDT') + plt.xlabel('Residue') + plt.ylabel('pLDDT') + + if num_plots == 2: + plt.subplot(1, 2, 2) + pae, max_pae = list(pae_outputs.values())[0] + plt.imshow(pae, vmin=0., vmax=max_pae, cmap='Greens_r') + plt.colorbar(fraction=0.046, pad=0.04) + + total_num_res = best_unrelaxed_prot.residue_index.shape[-1] + chain_ids = best_unrelaxed_prot.chain_index + for chain_boundary in np.nonzero(chain_ids[:-1] - chain_ids[1:]): + if chain_boundary.size: + plt.plot([0, total_num_res], [chain_boundary, chain_boundary], color='red') + plt.plot([chain_boundary, chain_boundary], [0, total_num_res], color='red') + plt.title('Predicted Aligned Error') + plt.xlabel('Scored residue') + plt.ylabel('Aligned residue') + + # Save the pLDDT and predicted aligned error plots as PNG + plt.savefig(os.path.join(output_dir, f'{fasta_id}_plddt_pae.png')) + print(f"Saved 
pLDDT and predicted aligned error plots as PNG to {output_dir}") + return plt + +def plot_plddt_legend(): + thresh = [ + 'Very low (pLDDT < 50)', + 'Low (70 > pLDDT > 50)', + 'Confident (90 > pLDDT > 70)', + 'Very high (pLDDT > 90)'] + + colors = [x[2] for x in PLDDT_BANDS] + + plt.figure(figsize=(2, 2)) + for c in colors: + plt.bar(0, 0, color=c) + plt.legend(thresh, frameon=False, loc='center', fontsize=20) + plt.xticks([]) + plt.yticks([]) + ax = plt.gca() + ax.spines['right'].set_visible(False) + ax.spines['top'].set_visible(False) + ax.spines['left'].set_visible(False) + ax.spines['bottom'].set_visible(False) + plt.title('Model Confidence', fontsize=20, pad=20) + plt.show() + return plt + +def plot_3d_structure(pdb_file_path): + """Plots the 3D structure for use in a Jupyter notebook.""" + show_sidechains = True + + with open(pdb_file_path, 'r') as f: + pdb_content = f.read() + + color_map = {i: bands[2] for i, bands in enumerate(PLDDT_BANDS)} + view = py3Dmol.view(width=800, height=600) + view.addModelsAsFrames(pdb_content) + style = {'cartoon': { + 'colorscheme': { + 'prop': 'b', + 'map': color_map} + }} + if show_sidechains: + style['stick'] = {} + view.setStyle({'model': -1}, style) + view.zoomTo() + view.show() \ No newline at end of file diff --git a/notebooks/src/plot_msas.py b/notebooks/src/plot_msas.py new file mode 100644 index 00000000..310a4607 --- /dev/null +++ b/notebooks/src/plot_msas.py @@ -0,0 +1,115 @@ +import os +import argparse +import matplotlib +import matplotlib.pyplot as plt +import numpy as np +from openfold.data import parsers +from .utils import get_run_folder_by_id, read_fasta_file + + +def get_msa_plot(run_id, fasta_id=None): + + run_folder = get_run_folder_by_id(run_id) + input_msa_folder = os.path.join(f"{run_folder}/output", 'alignments') + output_folder = os.path.join(f"{run_folder}/output", 'msa_plots') + os.makedirs(output_folder, exist_ok=True) + + + if fasta_id: + fasta_file_path = os.path.join(f"{run_folder}/fasta_dir/tmp", f"{fasta_id}.fasta") + sequence = read_fasta_file(fasta_file_path) + output_file_plot = os.path.join(output_folder, f"{fasta_id}_msa_plot.png") + create_msa_plot(fasta_id, sequence, output_file_plot, input_msa_folder) + + else: + # Iterate over each subfolder in output_msa_openfold + for subfolder in os.listdir(input_msa_folder): + + fasta_id = subfolder + fasta_file_path = os.path.join(f"{run_folder}/fasta_dir/tmp", f"{fasta_id}.fasta") + + if os.path.exists(fasta_file_path): + sequence = read_fasta_file(fasta_file_path) + output_file_plot = os.path.join(output_folder, f"{fasta_id}_msa_plot.png") + create_msa_plot(fasta_id, sequence, output_file_plot, input_msa_folder) + +def create_msa_plot(fasta_id, original_sequence, output_file, input_msa_folder): + # Path to the search results files + search_results_files = { + original_sequence: { + 'uniref90': os.path.join(input_msa_folder, f'{fasta_id}/uniref90_hits.sto'), + 'mgnify': os.path.join(input_msa_folder, f'{fasta_id}/mgnify_hits.sto'), + 'smallbfd': os.path.join(input_msa_folder, f'{fasta_id}/bfd_uniclust_hits.a3m'), + # Add other databases if needed + }, + } + + MAX_HITS_BY_DB = { + 'uniref90': 10000, + 'mgnify': 501, + 'smallbfd': 5000, + } + + msas_by_seq_by_db = {seq: {} for seq in search_results_files.keys()} + full_msa_by_seq = {seq: [] for seq in search_results_files.keys()} + + # Function to parse the MSA files + def parse_msa_file(file_path, file_type): + if file_type == 'sto': + with open(file_path, 'r') as f: + sto_content = f.read() + msa_obj = 
parsers.parse_stockholm(sto_content) + elif file_type == 'a3m': + with open(file_path, 'r') as f: + a3m_content = f.read() + msa_obj = parsers.parse_a3m(a3m_content) + return msa_obj + + # Load the search results from files + for seq_name, db_files in search_results_files.items(): + print(f'Loading results for sequence: {fasta_id}:{seq_name}') + for db_name, result_file in db_files.items(): + print(f' Loading database: {db_name}') + file_type = result_file.split('.')[-1] + msa_obj = parse_msa_file(result_file, file_type) + + msas, del_matrix, targets = msa_obj.sequences, msa_obj.deletion_matrix, msa_obj.descriptions + db_msas = parsers.Msa(msas, del_matrix, targets) + if db_msas: + if db_name in MAX_HITS_BY_DB: + db_msas.truncate(MAX_HITS_BY_DB[db_name]) + msas_by_seq_by_db[seq_name][db_name] = db_msas + full_msa_by_seq[seq_name].extend(msas) + msa_size = len(set(msas)) + print(f'{msa_size} Sequences Found in {db_name}') + + # Deduplicate full MSA and calculate total MSA size + for seq_name in full_msa_by_seq.keys(): + full_msa_by_seq[seq_name] = list(dict.fromkeys(full_msa_by_seq[seq_name])) + total_msa_size = len(full_msa_by_seq[seq_name]) + print(f'\n{total_msa_size} Sequences Found in Total for {seq_name}\n') + + # Visualize the results + fig = plt.figure(figsize=(12, 3)) + max_num_alignments = 0 + + for seq_idx, seq_name in enumerate(search_results_files.keys()): + full_msas = full_msa_by_seq[seq_name] + deduped_full_msa = list(dict.fromkeys(full_msas)) + total_msa_size = len(deduped_full_msa) + + aa_map = {restype: i for i, restype in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ-')} + msa_arr = np.array([[aa_map[aa] for aa in seq] for seq in deduped_full_msa]) + num_alignments, num_res = msa_arr.shape + plt.plot(np.sum(msa_arr != aa_map['-'], axis=0), label=f'Chain {seq_idx}') + max_num_alignments = max(num_alignments, max_num_alignments) + + plt.title('Per-Residue Count of Non-Gap Amino Acids in the MSA') + plt.ylabel('Non-Gap Count') + plt.yticks(range(0, max_num_alignments + 1, max(1, int(max_num_alignments / 3)))) + plt.legend() + + # Save the plot to a file + plt.savefig(output_file) + print(f'MSA plot saved to: {output_file}') + return fig \ No newline at end of file diff --git a/notebooks/src/requirements.txt b/notebooks/src/requirements.txt index de5cce34..3d5a234e 100644 --- a/notebooks/src/requirements.txt +++ b/notebooks/src/requirements.txt @@ -1 +1,13 @@ -docker==7.1.0 \ No newline at end of file +docker==7.1.0 +py3Dmol==2.3.0 +torch==2.4.0 +ml_collections==0.1.1 +modelcif==1.0 +tqdm==4.66.5 +dm-tree==0.1.8 +biopython==1.83 +OpenMM +numpy==1.23.5 +git+https://github.com/aqlaboratory/openfold.git@3bec3e9b2d1e8bdb83887899102eff7d42dc2ba9 +matplotlib +notebook \ No newline at end of file diff --git a/notebooks/src/utils.py b/notebooks/src/utils.py index 38761c6f..fedf0b87 100644 --- a/notebooks/src/utils.py +++ b/notebooks/src/utils.py @@ -2,6 +2,18 @@ import string import os +def get_run_folder_by_id(run_id): + base_path = os.path.join(os.getcwd(), 'data') + run_folder = next((entry.path for entry in os.scandir(base_path) if entry.is_dir() and entry.name.endswith(f'_{run_id}')), None) + if run_folder is None: + raise ValueError(f"Run ID '{run_id}' does not exist.") + return run_folder + +def read_fasta_file(fasta_file_path): + with open(fasta_file_path, 'r') as file: + lines = file.readlines() + sequence = ''.join(line.strip() for line in lines if not line.startswith('>')) + return sequence def generate_random_run_id(length=6): run_id = 
''.join(random.choices(string.ascii_uppercase + string.digits, k=length)) @@ -69,3 +81,22 @@ def generate_individual_sequence_files(output_path): outfile.write(f">{sequence_id}\n") outfile.write("\n".join(sequence_lines) + "\n") print(f"Saved {sequence_id} to {output_file}") + +def get_config_preset_list_for_model(weight_set, model_name): + model_configurations = { + ("OpenFold", "monomer"): [ + 'finetuning_3.pt', + 'finetuning_4.pt', + 'finetuning_5.pt', + 'finetuning_ptm_2.pt', + 'finetuning_no_templ_ptm_1.pt' + ], + ("AlphaFold", "multimer"): [f'model_{i}_multimer_v3' for i in range(1, 6)], + ("AlphaFold", "monomer"): [f'model_{i}' for i in range(1, 6)] + } + + config_preset_list = model_configurations.get((weight_set, model_name)) + if not config_preset_list: + raise ValueError(f"Invalid combination of weight_set '{weight_set}' and model_name '{model_name}'") + + return config_preset_list From 0bf3be00d8888eda1366795f5ed27d627578dee2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julio=20C=C3=A9sar?= Date: Tue, 27 Aug 2024 21:57:31 +0000 Subject: [PATCH 11/15] added date to run folder --- notebooks/src/inference.py | 34 ++++++++++------------------------ 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/notebooks/src/inference.py b/notebooks/src/inference.py index 6d53e508..2fd317cb 100644 --- a/notebooks/src/inference.py +++ b/notebooks/src/inference.py @@ -1,7 +1,8 @@ import os import shutil from .docker_runner import DockerRunner -from .utils import validate_sequence, generate_random_run_id, write_sequences_to_fasta, generate_individual_sequence_files +from datetime import datetime +from .utils import validate_sequence, generate_random_run_id, write_sequences_to_fasta, generate_individual_sequence_files, get_run_folder_by_id, get_config_preset_list_for_model class InferenceClientOpenFold: default_fasta_dir_name = 'fasta_dir' @@ -28,7 +29,10 @@ def run_inference(self, weight_set, model_name, inference_input=None, use_precom return self._run_new_inference(run_id, weight_set, model_name, inference_input) def _run_new_inference(self, run_id, weight_set, model_name, inference_input): - run_path = os.path.join(os.getcwd(), 'data', f'run_{run_id}') + # Get the current date and time in dd_MM_yy_hh_mm_ss format + current_datetime = datetime.now().strftime('%d_%m_%y_%H_%M_%S') + run_path = os.path.join(os.getcwd(), 'data', f'run_{current_datetime}_{run_id}') + # run_path = os.path.join(os.getcwd(), 'data', f'run_{run_id}') fasta_dir_root_path = os.path.join(run_path, self.default_fasta_dir_name) self._initialize_docker_runner(run_path) @@ -41,7 +45,8 @@ def _run_new_inference(self, run_id, weight_set, model_name, inference_input): return run_id def _run_with_precomputed_alignments(self, run_id, weight_set, model_name): - run_path = os.path.join(os.getcwd(), 'data', f'run_{run_id}') + run_path = get_run_folder_by_id(run_id) + print(f"Using pre-computed alignments from: {run_path}") if not os.path.isdir(run_path): raise ValueError(f"Provided Run ID '{run_id}' does not exist.") @@ -73,28 +78,9 @@ def _write_sequence_to_fasta(self, sequence_string, fasta_dir_root_path, weight_ print(f"Sequences written to FASTA file: {fasta_file}") def run_model_with_preset(self, run_path, weight_set, model_name): - config_preset_list = self._get_config_preset_list_for_model(weight_set, model_name) + config_preset_list = get_config_preset_list_for_model(weight_set, model_name) for config_preset in config_preset_list: self.docker_runner.run_inference_for_model(config_preset) def run_msa_alignment(self, 
cpus_per_task=32, no_tasks=1): - self.docker_runner.run_msa_alignment(cpus_per_task=cpus_per_task, no_tasks=no_tasks) - - def _get_config_preset_list_for_model(self, weight_set, model_name): - model_configurations = { - ("OpenFold", "monomer"): [ - 'finetuning_3.pt', - 'finetuning_4.pt', - 'finetuning_5.pt', - 'finetuning_ptm_2.pt', - 'finetuning_no_templ_ptm_1.pt' - ], - ("AlphaFold", "multimer"): [f'model_{i}_multimer_v3' for i in range(1, 6)], - ("AlphaFold", "monomer"): [f'model_{i}' for i in range(1, 6)] - } - - config_preset_list = model_configurations.get((weight_set, model_name)) - if not config_preset_list: - raise ValueError(f"Invalid combination of weight_set '{weight_set}' and model_name '{model_name}'") - - return config_preset_list \ No newline at end of file + self.docker_runner.run_msa_alignment(cpus_per_task=cpus_per_task, no_tasks=no_tasks) \ No newline at end of file From f5cce3e3d5b4d806feec2be4fe9a91bde4197bc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julio=20C=C3=A9sar?= Date: Tue, 27 Aug 2024 23:16:18 +0000 Subject: [PATCH 12/15] added GPU param --- notebooks/src/docker_runner.py | 8 ++++---- notebooks/src/inference.py | 18 +++++++++--------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/notebooks/src/docker_runner.py b/notebooks/src/docker_runner.py index 70494003..6f5605d9 100644 --- a/notebooks/src/docker_runner.py +++ b/notebooks/src/docker_runner.py @@ -17,8 +17,8 @@ def _stream_logs(self, container): for log in container.logs(stream=True): print(log.decode('utf-8'), end='') - def run_inference_for_model(self, config_preset, use_precomputed_alignments=True): - command = self._build_inference_command(config_preset, use_precomputed_alignments) + def run_inference_for_model(self, config_preset, gpu, use_precomputed_alignments=True): + command = self._build_inference_command(config_preset, gpu, use_precomputed_alignments) self._run_container(command) def run_msa_alignment(self, cpus_per_task=32, no_tasks=1): @@ -27,7 +27,7 @@ def run_msa_alignment(self, cpus_per_task=32, no_tasks=1): command = self._build_msa_alignment_command(cpus_per_task, no_tasks) self._run_container(command) - def _build_inference_command(self, config_preset, use_precomputed_alignments): + def _build_inference_command(self, config_preset, gpu, use_precomputed_alignments): fasta_dir = f"/run_path/{self.default_fasta_dir_name}/tmp" output_dir = "/run_path/output" precomputed_alignments_dir = "/run_path/output/alignments" @@ -50,7 +50,7 @@ def _build_inference_command(self, config_preset, use_precomputed_alignments): "--uniprot_database_path", "/database/uniprot/uniprot.fasta", "--hmmsearch_binary_path", "/opt/conda/bin/hmmsearch", "--hmmbuild_binary_path", "/opt/conda/bin/hmmbuild", - "--model_device", "cuda:0", + "--model_device", gpu, "--config_preset", config_preset, "--save_outputs", "--output_dir", output_dir diff --git a/notebooks/src/inference.py b/notebooks/src/inference.py index 2fd317cb..33e4366d 100644 --- a/notebooks/src/inference.py +++ b/notebooks/src/inference.py @@ -12,13 +12,13 @@ def __init__(self, database_dir, docker_client): self.docker_client = docker_client self.docker_runner = None - def run_inference(self, weight_set, model_name, inference_input=None, use_precomputed_alignments=False, run_id=None): + def run_inference(self, weight_set, model_name, inference_input=None, use_precomputed_alignments=False, run_id=None, gpu="cuda:0"): os.makedirs('data', exist_ok=True) if use_precomputed_alignments: if not run_id: raise ValueError("run_id is required when 
using pre-computed alignments.") - return self._run_with_precomputed_alignments(run_id, weight_set, model_name) + return self._run_with_precomputed_alignments(run_id, weight_set, model_name, gpu) if not inference_input: raise ValueError("inference_input is required to compute alignments.") @@ -26,9 +26,9 @@ def run_inference(self, weight_set, model_name, inference_input=None, use_precom if not run_id: run_id = generate_random_run_id() - return self._run_new_inference(run_id, weight_set, model_name, inference_input) + return self._run_new_inference(run_id, weight_set, model_name, inference_input, gpu) - def _run_new_inference(self, run_id, weight_set, model_name, inference_input): + def _run_new_inference(self, run_id, weight_set, model_name, inference_input, gpu): # Get the current date and time in dd_MM_yy_hh_mm_ss format current_datetime = datetime.now().strftime('%d_%m_%y_%H_%M_%S') run_path = os.path.join(os.getcwd(), 'data', f'run_{current_datetime}_{run_id}') @@ -40,18 +40,18 @@ def _run_new_inference(self, run_id, weight_set, model_name, inference_input): generate_individual_sequence_files(run_path) self.run_msa_alignment() - self.run_model_with_preset(run_path, weight_set, model_name) + self.run_model_with_preset(run_path, weight_set, model_name, gpu) return run_id - def _run_with_precomputed_alignments(self, run_id, weight_set, model_name): + def _run_with_precomputed_alignments(self, run_id, weight_set, model_name, gpu): run_path = get_run_folder_by_id(run_id) print(f"Using pre-computed alignments from: {run_path}") if not os.path.isdir(run_path): raise ValueError(f"Provided Run ID '{run_id}' does not exist.") self._initialize_docker_runner(run_path) - self.run_model_with_preset(run_path, weight_set, model_name) + self.run_model_with_preset(run_path, weight_set, model_name, gpu) return run_id @@ -77,10 +77,10 @@ def _write_sequence_to_fasta(self, sequence_string, fasta_dir_root_path, weight_ fasta_file = write_sequences_to_fasta(validated_sequence.split(':'), fasta_dir_root_path) print(f"Sequences written to FASTA file: {fasta_file}") - def run_model_with_preset(self, run_path, weight_set, model_name): + def run_model_with_preset(self, run_path, weight_set, model_name, gpu): config_preset_list = get_config_preset_list_for_model(weight_set, model_name) for config_preset in config_preset_list: - self.docker_runner.run_inference_for_model(config_preset) + self.docker_runner.run_inference_for_model(config_preset, gpu) def run_msa_alignment(self, cpus_per_task=32, no_tasks=1): self.docker_runner.run_msa_alignment(cpus_per_task=cpus_per_task, no_tasks=no_tasks) \ No newline at end of file From e36d391e3f0fcc60a0bf15b15376e2d137cd11ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julio=20C=C3=A9sar?= Date: Tue, 27 Aug 2024 23:19:15 +0000 Subject: [PATCH 13/15] Updated usage examples --- notebooks/OpenFoldLocal.ipynb | 255 ++++++++++++++++++++++++++++++---- 1 file changed, 228 insertions(+), 27 deletions(-) diff --git a/notebooks/OpenFoldLocal.ipynb b/notebooks/OpenFoldLocal.ipynb index c72dc7fa..57a7e463 100644 --- a/notebooks/OpenFoldLocal.ipynb +++ b/notebooks/OpenFoldLocal.ipynb @@ -6,18 +6,18 @@ "source": [ "# OpenFold Local Notebook\n", "\n", - "Provides the flexibility to run inference using a local installation of OpenFold with Docker, along with the convenience of visualizing results using the same plots from the OpenFold Colab Notebook.\n", + "Provides the flexibility to run inference on a target sequence using a local installation of OpenFold with Docker, along with the convenience 
of visualizing results using the same plots from the OpenFold Colab Notebook.\n",
    "\n",
-    "This notebook uses the provided utility functions to execute OpenFold via Docker it adds logic to handle results, so you can experiment with different parameters, re-use computed msas, filter the best model, plot metrics and, async and handle long running executions.\n",
+    "This notebook utilizes the provided utility functions to execute OpenFold via Docker. It includes logic to handle results, allowing you to experiment with different parameters, reuse computed MSAs, filter the best model, and plot metrics. It also supports asynchronous and long-running executions.\n",
    "\n",
-    "If you have access to a machine and want to do quick inference and visualize results, there are some useful things you can do with this notebook:\n",
+    "If you have access to a machine and want to perform quick inference and visualize results, this notebook offers several useful features:\n",
    "\n",
-    "- You can use precomputed alignments, which enables you to run inference with different model parameters to compare results\n",
-    "- Get the best model and metric plots \n",
-    "- Handle long-running executions\n",
-    "- Work with big datasets i.e split your input and perform async runs using asyncio on multiple GPUs\n",
+    "- Use precomputed alignments, enabling you to run inference with different model parameters for result comparison.\n",
+    "- Identify the best model and generate metric plots.\n",
+    "- Manage long-running executions.\n",
+    "- Work with large datasets by splitting your input and performing asynchronous runs using threads on multiple GPUs.\n",
    "\n",
-    "Of course, you can do this solely using Docker commands in the terminal, but you would need to code/adjust the Colab functions to work with data locally. This notebook gives you a head start."
+    "While you can achieve this entirely through Docker commands in the terminal, you would need to code or adjust the Colab functions to work with local data. This notebook gives you a head start."
   ]
  },
@@ -28,19 +28,27 @@
   "source": [
    "## Setup the notebook\n",
    "\n",
    "First, build OpenFold using Docker. Follow this [guide](https://openfold.readthedocs.io/en/latest/original_readme.html#building-and-using-the-docker-container).\n",
    "\n",
-    "Start your notebook:\n",
-    "\n",
-    "Go to the notebook folder:\n",
+    "Then, go to the notebook folder\n",
    "\n",
    "`cd notebooks`\n",
    "\n",
-    "Install the requirements in your env:\n",
+    "Create an environment to run Jupyter with the requirements\n",
+    "\n",
+    "`mamba create -n openfold_notebook python==3.10`\n",
+    "\n",
+    "Activate the environment\n",
+    "\n",
+    "`mamba activate openfold_notebook`\n",
    "\n",
-    "`pip install -r utils/requirements.txt`\n",
+    "Install the requirements\n",
    "\n",
+    "`pip install -r src/requirements.txt`\n",
    "\n",
-    "Start your Jupyter server\n",
    "\n",
-    "`jupyter lab . --ip=\"0.0.0.0\"`\n"
+    "Start your Jupyter server in the current folder\n",
+    "\n",
+    "`jupyter lab . --ip=\"0.0.0.0\"`\n",
+    "\n",
+    "Access the notebook URL or connect remotely using VSCode.\n"
   ]
  },
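Before the first run, it can help to confirm the Docker prerequisites from Python. A small sanity check using the same docker SDK the utilities rely on; the image tag `openfold:latest` is the one the Docker commands launch, and this snippet is a sketch rather than part of the notebook itself:

```python
# Sanity-check sketch: verify the Docker daemon is reachable and the image
# built in the setup step is present. Both calls are standard docker SDK APIs.
import docker
from docker.errors import ImageNotFound

client = docker.from_env()
client.ping()  # raises an APIError if the daemon is unreachable
try:
    client.images.get("openfold:latest")
    print("openfold:latest image found")
except ImageNotFound:
    print("Build the OpenFold image first (see the guide linked above)")
```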
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Running Inference \n",
    "\n",
    "**Inputs:** files or strings with sequences\n",
    "\n",
    "**Output:** \n",
    "\n",
    "```bash\n",
    "data/ \n",
-    "├── run_<run_id>/              # each run is stored with a random ID; this ID can be used to re-run inference \n",
+    "├── run_<date>_<run_id>/       # each run is stored with a random ID; this ID can be used to re-run inference \n",
    "│   ├── fasta_dir/ \n",
    "│   │   ├── tmp/               # generated .fasta file per sequence\n",
    "│   │   └── sequences.fasta    # validated input sequences are merged into a .fasta file\n",
    "│   └── output/\n",
    "│       ├── alignments/        # one folder per sequence with the resulting MSA\n",
+    "│       ├── msa_plots/         # one .png plot per alignment\n",
    "│       ├── predictions/       # inference results .pkl and .pdb files\n",
+    "│       ├── selected_predictions/  # selected best inference and metrics plots\n",
    "│       └── timings.json       # inference time\n",
    "```"
   ]
  },
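The run folder layout above can also be inspected programmatically; `get_run_folder_by_id` from `src/utils.py` resolves a run ID to its date-stamped folder under `data/`. A short sketch, with a hypothetical run ID:

```python
# A short sketch, assuming a finished run: resolve the run folder from its ID
# and list what landed under output/ (subfolders per the tree above).
import os
from src.utils import get_run_folder_by_id

run_folder = get_run_folder_by_id("0KCTJ6")  # hypothetical run ID under data/
for sub in ("alignments", "msa_plots", "predictions", "selected_predictions"):
    path = os.path.join(run_folder, "output", sub)
    if os.path.isdir(path):
        print(f"{sub}: {sorted(os.listdir(path))}")
```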
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Initialize the client"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import docker\n",
    "from src.inference import InferenceClientOpenFold\n",
    "\n",
    "# You can also use a remote docker server \n",
    "docker_client = docker.from_env()\n",
    "\n",
    "# i.e connect to the remote Docker daemon\n",
    "# remote_docker_client = docker.DockerClient(base_url='tcp://:2375')\n",
    "\n",
    "# Initialize the OpenFold Docker client setting the database path \n",
-    "databases_dir = \"/home/juliocesar/DataZ/Models/alphafold_all_data\"\n",
+    "\n",
+    "databases_dir = \"/path/to/databases\"\n",
+    "\n",
    "openfold_client = InferenceClientOpenFold(databases_dir, docker_client)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Inference using a sequence string"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Run ID: 0KCTJ6\n",
+      "Fasta root directory: /home/juliocesar/Models/openfold/notebooks/data/run_27_08_24_23_06_24_0KCTJ6/fasta_dir\n",
+      "Sequences written to FASTA file: /home/juliocesar/Models/openfold/notebooks/data/run_27_08_24_23_06_24_0KCTJ6/fasta_dir/sequences.fasta\n",
+      "Saved sequence_E3XARJK5 to /home/juliocesar/Models/openfold/notebooks/data/run_27_08_24_23_06_24_0KCTJ6/fasta_dir/tmp/sequence_E3XARJK5.fasta\n",
+      "Saved sequence_OYILNP8A to /home/juliocesar/Models/openfold/notebooks/data/run_27_08_24_23_06_24_0KCTJ6/fasta_dir/tmp/sequence_OYILNP8A.fasta\n",
+      "Running Docker container...\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "==========\n",
+      "== CUDA ==\n",
+      "==========\n",
+      "\n",
+      "CUDA Version 11.3.1\n",
+      "\n",
+      "Container image Copyright (c) 2016-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n",
+      "\n",
+      "This container image and its contents are governed by the NVIDIA Deep Learning Container License.\n",
+      "By pulling and using the container, you accept the terms and conditions of this license:\n",
+      "https://developer.nvidia.com/ngc/nvidia-deep-learning-container-license\n",
+      "\n",
+      "A copy of this license is made available in this container at /NGC-DL-CONTAINER-LICENSE for your convenience.\n",
+      "\n",
+      "/opt/conda/lib/python3.9/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.26.4\n",
+      " warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n"
+     ]
+    }
+   ],
   "source": [
    "# For multiple sequences, separate sequences with a colon `:`\n",
    "input_string = \"DAGAQGAAIGSPGVLSGNVVQVPVHVPVNVCGNTVSVIGLLNPAFGNTCVNA:AGETGRTGVLVTSSATNDGDSGWGRFAG\"\n",
    "\n",
-    "model_name = \"multimer\"\n",
-    "weight_set = 'AlphaFold'\n",
+    "model_name = \"multimer\" # or \"monomer\"\n",
+    "weight_set = 'AlphaFold' # or 'OpenFold'\n",
    "\n",
    "# Run inference\n",
    "run_id = openfold_client.run_inference(weight_set, model_name, inference_input=input_string)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Inference using a fasta file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
-    "input_file = \"/home/juliocesar/Models/openfold/notebooks/data/test.fasta\"\n",
+    "input_file = \"/path/to/test.fasta\"\n",
    "\n",
    "run_id = openfold_client.run_inference(weight_set, model_name, inference_input=input_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Inference using pre-computed alignments for a run_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
-    "model_name = \"multimer\"\n",
-    "weight_set = 'AlphaFold'\n",
+    "model_name = \"monomer\"\n",
+    "weight_set = 'OpenFold'\n",
    "\n",
-    "openfold_client.run_inference(weight_set, model_name, use_precomputed_alignments=True, run_id=\"8CJUIY\")"
+    "openfold_client.run_inference(weight_set, model_name, use_precomputed_alignments=True, run_id=run_id)"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Metrics and Visualizations "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Get the MSA Plots for one sequence in a run"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from src.plot_msas import get_msa_plot\n",
+    "\n",
+    "# Provide the fasta sequence id and the run_id\n",
+    "get_msa_plot(\"\", fasta_id=\"\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "# To get all sequence alignments\n",
+    "get_msa_plot(\"\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Get the best prediction by pLDDT and metrics"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from src.metrics import get_metrics_and_visualizations, plot_plddt_legend\n",
+    "\n",
+    "model_name = \"multimer\"\n",
+    "weight_set = 'AlphaFold'\n",
+    "\n",
+    "plot_plddt_legend()\n",
+    "get_metrics_and_visualizations(\"\", weight_set, model_name, \"sequence_id\", relax_prediction=True)"
+   ]
+  },
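The helpers above read their numbers from the pickled output dictionaries in `predictions/`; those files can also be opened directly when a custom metric is needed. A sketch, assuming a finished multimer run whose `run_id` is still in scope:

```python
# Direct-access sketch, assuming a finished multimer run: each
# predictions/<fasta_id>_<config_preset>_output_dict.pkl holds the arrays the
# metrics helpers use ('plddt', 'predicted_aligned_error', 'weighted_ptm_score').
import os
import pickle
from src.utils import get_run_folder_by_id

pkl_dir = os.path.join(get_run_folder_by_id(run_id), "output", "predictions")
for name in sorted(f for f in os.listdir(pkl_dir) if f.endswith("_output_dict.pkl")):
    with open(os.path.join(pkl_dir, name), "rb") as f:
        result = pickle.load(f)
    print(name, "mean pLDDT:", result["plddt"].mean(),
          "weighted pTM:", result.get("weighted_ptm_score"))
```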
\"/path/to/experiment_1.fasta\"\n", + " gpu = \"cuda:0\"\n", + " model_name = \"multimer\"\n", + " weight_set = 'AlphaFold'\n", + " run_id = openfold_client.run_inference(weight_set, model_name, inference_input=input_file, gpu=gpu) \n", + " return \"Experiment 1 completed\"\n", + "\n", + "def experiment_2():\n", + " print(\"Experiment 2 is running\")\n", + " input_file = \"/path/to/experiment_2.fasta\"\n", + " gpu = \"cuda:1\"\n", + " model_name = \"monomer\"\n", + " weight_set = 'OpenFold'\n", + " run_id = openfold_client.run_inference(weight_set, model_name, inference_input=input_file, gpu=gpu)\n", + " return \"Experiment 2 completed\"\n", + "\n", + "experiments = [experiment_1, experiment_2]\n", + "\n", + "# Function to handle keyboard interrupt\n", + "def signal_handler(sig, frame):\n", + " print(\"Interrupt received, stopping...\")\n", + " raise KeyboardInterrupt\n", + "\n", + "# Register the signal handler\n", + "signal.signal(signal.SIGINT, signal_handler)\n", + "\n", + "try:\n", + " # Execute tasks in parallel\n", + " with ProcessPoolExecutor() as executor:\n", + " futures = [executor.submit(task) for task in experiments]\n", + " results = []\n", + " for future in as_completed(futures):\n", + " results.append(future.result())\n", + " print(\"Results:\", results)\n", + "except KeyboardInterrupt:\n", + " print(\"Execution interrupted by user.\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# License and Disclaimer\n", + "\n", + "This notebook and other information provided is for theoretical modelling only, caution should be exercised in its use. It is provided ‘as-is’ without any warranty of any kind, whether expressed or implied. Information is not intended to be a substitute for professional medical advice, diagnosis, or treatment, and does not constitute medical or other professional advice.\n", + "\n", + "## AlphaFold/OpenFold Code License\n", + "\n", + "Copyright 2021 AlQuraishi Laboratory\n", + "\n", + "Copyright 2021 DeepMind Technologies Limited.\n", + "\n", + "Licensed under the Apache License, Version 2.0 (the \"License\"); you may not use this file except in compliance with the License. You may obtain a copy of the License at https://www.apache.org/licenses/LICENSE-2.0.\n", + "\n", + "Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.\n", + "\n", + "## Model Parameters License\n", + "\n", + "DeepMind's AlphaFold parameters are made available under the terms of the Creative Commons Attribution 4.0 International (CC BY 4.0) license. You can find details at: https://creativecommons.org/licenses/by/4.0/legalcode\n", + "\n", + "\n", + "## Third-party software\n", + "\n", + "Use of the third-party software, libraries or code referred to in this notebook may be governed by separate terms and conditions or license provisions. 
Your use of the third-party software, libraries or code is subject to any such terms and you should check that you can comply with any applicable restrictions or terms and conditions before use.\n", + "\n", + "\n", + "## Mirrored Databases\n", + "\n", + "The following databases have been mirrored by DeepMind, and are available with reference to the following:\n", + "* UniRef90: v2021\\_03 (unmodified), by The UniProt Consortium, available under a [Creative Commons Attribution-NoDerivatives 4.0 International License](http://creativecommons.org/licenses/by-nd/4.0/).\n", + "* MGnify: v2019\\_05 (unmodified), by Mitchell AL et al., available free of all copyright restrictions and made fully and freely available for both non-commercial and commercial use under [CC0 1.0 Universal (CC0 1.0) Public Domain Dedication](https://creativecommons.org/publicdomain/zero/1.0/).\n", + "* BFD: (modified), by Steinegger M. and Söding J., modified by DeepMind, available under a [Creative Commons Attribution-ShareAlike 4.0 International License](https://creativecommons.org/licenses/by/4.0/). See the Methods section of the [AlphaFold proteome paper](https://www.nature.com/articles/s41586-021-03828-1) for details." ] } ], "metadata": { "kernelspec": { - "display_name": "signalp-6.0", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "signalp-6.0" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -170,7 +371,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.4" + "version": "3.10.0" } }, "nbformat": 4, From 67ba1356e3aedbfd8b4c2aa5aec3b2cbc2151ff9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julio=20C=C3=A9sar?= Date: Wed, 28 Aug 2024 00:21:47 +0000 Subject: [PATCH 14/15] fixed openfold weight_set inference --- notebooks/src/docker_runner.py | 13 +++++++++---- notebooks/src/inference.py | 2 +- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/notebooks/src/docker_runner.py b/notebooks/src/docker_runner.py index 6f5605d9..f048ba41 100644 --- a/notebooks/src/docker_runner.py +++ b/notebooks/src/docker_runner.py @@ -17,8 +17,8 @@ def _stream_logs(self, container): for log in container.logs(stream=True): print(log.decode('utf-8'), end='') - def run_inference_for_model(self, config_preset, gpu, use_precomputed_alignments=True): - command = self._build_inference_command(config_preset, gpu, use_precomputed_alignments) + def run_inference_for_model(self, weight_set, config_preset, gpu, use_precomputed_alignments=True): + command = self._build_inference_command(weight_set, config_preset, gpu, use_precomputed_alignments) self._run_container(command) def run_msa_alignment(self, cpus_per_task=32, no_tasks=1): @@ -27,7 +27,7 @@ def run_msa_alignment(self, cpus_per_task=32, no_tasks=1): command = self._build_msa_alignment_command(cpus_per_task, no_tasks) self._run_container(command) - def _build_inference_command(self, config_preset, gpu, use_precomputed_alignments): + def _build_inference_command(self, weight_set, config_preset, gpu, use_precomputed_alignments): fasta_dir = f"/run_path/{self.default_fasta_dir_name}/tmp" output_dir = "/run_path/output" precomputed_alignments_dir = "/run_path/output/alignments" @@ -51,10 +51,15 @@ def _build_inference_command(self, config_preset, gpu, use_precomputed_alignment "--hmmsearch_binary_path", "/opt/conda/bin/hmmsearch", "--hmmbuild_binary_path", "/opt/conda/bin/hmmbuild", "--model_device", gpu, - "--config_preset", config_preset, "--save_outputs", "--output_dir", output_dir ] + + if 
weight_set == "AlphaFold": + command.extend(["--config_preset", config_preset]) + + if weight_set == "OpenFold": + command.extend(["--openfold_checkpoint_path", f"/database/openfold_params/{config_preset}"]) if use_precomputed_alignments: command.extend(["--use_precomputed_alignments", precomputed_alignments_dir]) diff --git a/notebooks/src/inference.py b/notebooks/src/inference.py index 33e4366d..b053e711 100644 --- a/notebooks/src/inference.py +++ b/notebooks/src/inference.py @@ -80,7 +80,7 @@ def _write_sequence_to_fasta(self, sequence_string, fasta_dir_root_path, weight_ def run_model_with_preset(self, run_path, weight_set, model_name, gpu): config_preset_list = get_config_preset_list_for_model(weight_set, model_name) for config_preset in config_preset_list: - self.docker_runner.run_inference_for_model(config_preset, gpu) + self.docker_runner.run_inference_for_model(weight_set, config_preset, gpu) def run_msa_alignment(self, cpus_per_task=32, no_tasks=1): self.docker_runner.run_msa_alignment(cpus_per_task=cpus_per_task, no_tasks=no_tasks) \ No newline at end of file From 3505bc660915675c103bbafe11c620bda45f5ddc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julio=20C=C3=A9sar?= Date: Wed, 28 Aug 2024 00:23:35 +0000 Subject: [PATCH 15/15] Added related info and clean notebook --- notebooks/OpenFoldLocal.ipynb | 69 +++++++++++++---------------------- 1 file changed, 25 insertions(+), 44 deletions(-) diff --git a/notebooks/OpenFoldLocal.ipynb b/notebooks/OpenFoldLocal.ipynb index 57a7e463..36f72ccd 100644 --- a/notebooks/OpenFoldLocal.ipynb +++ b/notebooks/OpenFoldLocal.ipynb @@ -6,7 +6,7 @@ "source": [ "# OpenFold Local Notebook\n", "\n", - "Provides the flexibility to run inference on a target sequence using a local installation of OpenFold with Docker, along with the convenience of visualizing results using the same plots from the OpenFold Colab Notebook.\n", + "Provides the flexibility to run inference on a target sequence using a local Docker installation of [OpenFold](https://github.com/aqlaboratory/openfold), along with the convenience of visualizing results using the same plots from the OpenFold Colab Notebook.\n", "\n", "This notebook utilizes the provided utility functions to execute OpenFold via Docker. It includes logic to handle results, allowing you to experiment with different parameters, reuse computed MSAs, filter the best model, and plot metrics. It also supports asynchronous and long-running executions.\n", "\n", @@ -17,7 +17,24 @@ "- Manage long-running executions.\n", "- Work with large datasets by splitting your input and performing asynchronous runs using threads on multiple GPUs.\n", "\n", - "While you can achieve this entirely through Docker commands in the terminal, you would need to code or adjust the Colab functions to work with local data. This notebook gives you a head start." + "While you can achieve this entirely through Docker commands in the terminal, you would need to code or adjust the Colab functions to work with local data. 
From 3505bc660915675c103bbafe11c620bda45f5ddc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Julio=20C=C3=A9sar?=
Date: Wed, 28 Aug 2024 00:23:35 +0000
Subject: [PATCH 15/15] Added related info and clean notebook

---
 notebooks/OpenFoldLocal.ipynb | 69 +++++++++++++----------------------
 1 file changed, 25 insertions(+), 44 deletions(-)

diff --git a/notebooks/OpenFoldLocal.ipynb b/notebooks/OpenFoldLocal.ipynb
index 57a7e463..36f72ccd 100644
--- a/notebooks/OpenFoldLocal.ipynb
+++ b/notebooks/OpenFoldLocal.ipynb
@@ -6,7 +6,7 @@
    "source": [
     "# OpenFold Local Notebook\n",
     "\n",
-    "Provides the flexibility to run inference on a target sequence using a local installation of OpenFold with Docker, along with the convenience of visualizing results using the same plots from the OpenFold Colab Notebook.\n",
+    "Provides the flexibility to run inference on a target sequence using a local Docker installation of [OpenFold](https://github.com/aqlaboratory/openfold), along with the convenience of visualizing results using the same plots as the OpenFold Colab Notebook.\n",
     "\n",
     "This notebook utilizes the provided utility functions to execute OpenFold via Docker. It includes logic to handle results, allowing you to experiment with different parameters, reuse computed MSAs, filter the best model, and plot metrics. It also supports asynchronous and long-running executions.\n",
     "\n",
@@ -17,7 +17,24 @@
     "- Manage long-running executions.\n",
     "- Work with large datasets by splitting your input and performing asynchronous runs using threads on multiple GPUs.\n",
     "\n",
-    "While you can achieve this entirely through Docker commands in the terminal, you would need to code or adjust the Colab functions to work with local data. This notebook gives you a head start."
+    "While you can achieve this entirely through Docker commands in the terminal, you would need to code or adjust the Colab functions to work with local data. This notebook gives you a head start.\n",
+    "\n",
+    "**Citing this work**\n",
+    "\n",
+    "Any publication that discloses findings arising from using this notebook should [cite](https://github.com/deepmind/alphafold/#citing-this-work) DeepMind's [AlphaFold paper](https://doi.org/10.1038/s41586-021-03819-2).\n",
+    "\n",
+    "**Licenses**\n",
+    "\n",
+    "This notebook supports inference with the [AlphaFold model parameters](https://github.com/deepmind/alphafold/#model-parameters-license), made available under the Creative Commons Attribution 4.0 International ([CC BY 4.0](https://creativecommons.org/licenses/by/4.0/legalcode)) license. The notebook itself is provided under the [Apache 2.0 license](https://www.apache.org/licenses/LICENSE-2.0). See the full license statement below.\n",
+    "\n",
+    "**More information**\n",
+    "\n",
+    "You can find more information about how AlphaFold/OpenFold works in DeepMind's two Nature papers:\n",
+    "\n",
+    "* [AlphaFold methods paper](https://www.nature.com/articles/s41586-021-03819-2)\n",
+    "* [AlphaFold predictions of the human proteome paper](https://www.nature.com/articles/s41586-021-03828-1)\n",
+    "\n",
+    "An FAQ on how to interpret AlphaFold/OpenFold predictions is available [here](https://alphafold.ebi.ac.uk/faq)."
    ]
   },
   {
@@ -85,7 +102,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -114,45 +131,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Run ID: 0KCTJ6\n",
-      "Fasta root directory: /home/juliocesar/Models/openfold/notebooks/data/run_27_08_24_23_06_24_0KCTJ6/fasta_dir\n",
-      "Sequences written to FASTA file: /home/juliocesar/Models/openfold/notebooks/data/run_27_08_24_23_06_24_0KCTJ6/fasta_dir/sequences.fasta\n",
-      "Saved sequence_E3XARJK5 to /home/juliocesar/Models/openfold/notebooks/data/run_27_08_24_23_06_24_0KCTJ6/fasta_dir/tmp/sequence_E3XARJK5.fasta\n",
-      "Saved sequence_OYILNP8A to /home/juliocesar/Models/openfold/notebooks/data/run_27_08_24_23_06_24_0KCTJ6/fasta_dir/tmp/sequence_OYILNP8A.fasta\n",
-      "Running Docker container...\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "==========\n",
-      "== CUDA ==\n",
-      "==========\n",
-      "\n",
-      "CUDA Version 11.3.1\n",
-      "\n",
-      "Container image Copyright (c) 2016-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n",
-      "\n",
-      "This container image and its contents are governed by the NVIDIA Deep Learning Container License.\n",
-      "By pulling and using the container, you accept the terms and conditions of this license:\n",
-      "https://developer.nvidia.com/ngc/nvidia-deep-learning-container-license\n",
-      "\n",
-      "A copy of this license is made available in this container at /NGC-DL-CONTAINER-LICENSE for your convenience.\n",
-      "\n",
-      "/opt/conda/lib/python3.9/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.26.4\n",
-      " warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# For multiple sequences, separate sequences with a colon `:`\n",
     "input_string = \"DAGAQGAAIGSPGVLSGNVVQVPVHVPVNVCGNTVSVIGLLNPAFGNTCVNA:AGETGRTGVLVTSSATNDGDSGWGRFAG\"\n",
@@ -224,7 +205,7 @@
     "from src.plot_msas import get_msa_plot\n",
     "\n",
     "# Provide the fasta sequence id and the run_id\n",
-    "get_msa_plot(\"\", fasta_id=\"\")"
+    "get_msa_plot(run_id, fasta_id=\"\")"
    ]
   },
   {
     "\n",
     "# To get all sequence alignments\n",
-    "get_msa_plot(\"\")"
+    "get_msa_plot(run_id)"
    ]
   },
   {
     "weight_set = 'AlphaFold'\n",
     "\n",
     "plot_plddt_legend()\n",
-    "get_metrics_and_visualizations(\"\", weight_set, model_name, \"sequence_id\", relax_prediction=True)"
+    "get_metrics_and_visualizations(run_id, weight_set, model_name, \"\", relax_prediction=True)"
    ]
   },
   {