Commit

Release 1.0.0

boratyng committed Dec 8, 2022
1 parent cd5f062 commit 7314398
Showing 47 changed files with 611 additions and 269 deletions.
4 changes: 2 additions & 2 deletions CITATION.cff
@@ -1,8 +1,8 @@
 cff-version: "1.1.0"
 message: "If you use this software, please cite it using these metadata."
 title: ElasticBLAST
-version: "0.2.7"
-date-released: 2022-08-11
+version: "1.0.0"
+date-released: 2022-12-05
 license: "NCBI Public Domain"
 repository-code: "https://github.com/ncbi/elastic-blast/"
 authors:
4 changes: 2 additions & 2 deletions Makefile
@@ -60,7 +60,7 @@ elastic-blast3.7: ${PYTHON_SRC} ${YAML_TEMPLATES} ${VENV} validate-cf-templates
 # Python support
 
 ${VENV}: requirements/base.txt requirements/test.txt
-	[ -d ${VENV} ] || python3 -m venv $@
+	[ -d ${VENV} ] || virtualenv -p python3 $@
 	source ${VENV}/bin/activate && pip3 install -qe . -r requirements/test.txt
 	source ${VENV}/bin/activate && python3 setup.py install_data
 
@@ -107,7 +107,7 @@ yamllint: ${VENV}
 	source ${VENV}/bin/activate && \
 	yamllint -d share/etc/yamllint-config.yaml src/elastic_blast/templates/storage-gcp-ssd.yaml
 	source ${VENV}/bin/activate && \
-	yamllint -d share/etc/yamllint-config.yaml src/elastic_blast/templates/pvc.yaml.template
+	yamllint -d share/etc/yamllint-config.yaml src/elastic_blast/templates/pvc-*.yaml.template
 	source ${VENV}/bin/activate && \
 	yamllint -d share/etc/yamllint-config.yaml src/elastic_blast/templates/job-init-*
 	source ${VENV}/bin/activate && \
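For local development, the bootstrap that this target automates can be reproduced by hand (a sketch assuming ${VENV} points at a .env directory; only the creation command changed in this commit, from "python3 -m venv" to "virtualenv -p python3"):

    virtualenv -p python3 .env
    source .env/bin/activate
    pip3 install -qe . -r requirements/test.txt
    python3 setup.py install_data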
1 change: 0 additions & 1 deletion bin/elastic-blast
@@ -46,7 +46,6 @@ from elastic_blast.constants import ElbCommand
 from elastic_blast.constants import ELB_DFLT_LOGLEVEL, ELB_DFLT_LOGFILE
 from elastic_blast.constants import CFG_CLOUD_PROVIDER, CFG_CP_GCP_PROJECT
 
-
 DESC = r"""This application facilitates running BLAST on large amounts of query sequence data
 on the cloud"""
 
10 changes: 10 additions & 0 deletions bin/fasta_split.py
@@ -43,6 +43,8 @@
 from elastic_blast.split import FASTAReader
 from elastic_blast.jobs import write_job_files
 from elastic_blast.constants import ELB_QUERY_BATCH_FILE_PREFIX
+from elastic_blast.constants import ELB_DFLT_LOGFILE, ELB_DFLT_LOGLEVEL
+from elastic_blast.util import config_logging
 
 
 DEFAULT_BATCH_LEN = 5000000
@@ -74,6 +76,12 @@ def parse_arguments():
                         help='file to report total number of bases/residues in input file')
     parser.add_argument("-n", "--dry-run", action='store_true',
                         help="Do not run any commands, just show what would be executed")
+    parser.add_argument("--logfile", default=argparse.SUPPRESS, type=str,
+                        help=f"Default: {ELB_DFLT_LOGFILE}")
+    parser.add_argument("--loglevel", default=argparse.SUPPRESS,
+                        help=f"Default: {ELB_DFLT_LOGLEVEL}",
+                        choices=["DEBUG", "INFO", "WARNING",
+                                 "ERROR", "CRITICAL"])
     return parser.parse_args()
 
 def main():
@@ -89,6 +97,8 @@ def main():
     count_file = args.count
     dry_run = args.dry_run
     job_template_text = ''
+
+    config_logging(args)
     try:
         if job_template:
             with open_for_read(job_template) as f:
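With these additions fasta_split.py configures logging the same way as the main elastic-blast driver. A quick smoke test (a sketch; the positional FASTA argument comes from the script's existing interface, and the log file name is a placeholder):

    python3 bin/fasta_split.py queries.fa --dry-run --loglevel DEBUG --logfile fasta_split.log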
2 changes: 1 addition & 1 deletion docker-blast/Makefile
@@ -30,7 +30,7 @@ GCP_IMG?=gcr.io/ncbi-sandbox-blast/${IMG}
 AWS_SERVER?=public.ecr.aws/i6v3i0i9
 AWS_IMG?=${AWS_SERVER}/elasticblast-elb
 AWS_REGION?=us-east-1
-VERSION?=1.1.1
+VERSION?=1.1.3
 
 ifeq (, $(shell which vmtouch 2>/dev/null))
 NOVMTOUCH?=--no-vmtouch
2 changes: 1 addition & 1 deletion docker-blast/splitq_download_db_search
@@ -249,7 +249,7 @@ def _download_database(args, is_user, db_done):
     verbose = ' --verbose --verbose --verbose --verbose --verbose --verbose' if args.verbose else ''
     creds = ' --no-sign-request' if args.no_creds else ''
     nprocs_to_download_db = min(MAX_PROCS_TO_DOWNLOAD_DB, int(os.cpu_count()/args.num_threads))
-    p = safe_exec(f"time update_blastdb.pl taxdb --decompress --source ncbi {verbose} --num_threads {nprocs_to_download_db}")
+    p = safe_exec(f"time update_blastdb.pl taxdb --decompress --source {args.source}{verbose} --num_threads {nprocs_to_download_db}")
    print(p.stdout.decode(), end='')
     print(p.stderr.decode(), end='')
     if is_user:
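With this change the taxonomy database follows the same --source selection as the main BLAST database instead of always coming from NCBI. The resulting command looks like this (a sketch, assuming --source resolves to gcp and four download processes):

    time update_blastdb.pl taxdb --decompress --source gcp --num_threads 4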
80 changes: 47 additions & 33 deletions docker-blast/update_blastdb.pl
@@ -55,8 +55,9 @@
 use constant AMI_URL => "http://169.254.169.254/latest/meta-data/local-hostname";
 use constant AWS_BUCKET => "ncbi-blast-databases";
 
+use constant GCS_URL => "https://storage.googleapis.com";
 use constant GCP_URL => "http://metadata.google.internal/computeMetadata/v1/instance/id";
-use constant GCP_BUCKET => "gs://blast-db";
+use constant GCP_BUCKET => "blast-db";
 
 # TODO: deprecate this in the next release 2.14.x
 #use constant BLASTDB_MANIFEST => "blastdb-manifest.json";
@@ -79,6 +80,7 @@
 my $opt_source;
 my $opt_legacy_exit_code = 0;
 my $opt_nt = &get_num_cores();
+my $opt_gcp_prj = undef;
 my $result = GetOptions("verbose+" => \$opt_verbose,
                         "quiet" => \$opt_quiet,
                         "force" => \$opt_force_download,
@@ -89,6 +91,7 @@
                         "blastdb_version:i" => \$opt_blastdb_ver,
                         "decompress" => \$opt_decompress,
                         "source=s" => \$opt_source,
+                        "gcp-project=s" => \$opt_gcp_prj,
                         "num_threads=i" => \$opt_nt,
                         "legacy_exit_code" => \$opt_legacy_exit_code,
                         "help" => \$opt_help);
@@ -168,15 +171,16 @@
     print "Error: $0 depends on curl to fetch data from cloud storage, please install this utility to access this data source.\n";
     exit(EXIT_FAILURE);
 }
-if ($location =~ /gcp/i and (not defined $gsutil or not defined $gcloud)) {
-    print "Error: $0 depends on gsutil and gcloud to fetch data from cloud storage, please install these utilities to access this data source.\n";
-    exit(EXIT_FAILURE);
-}
-my $gcp_prj = ($location =~ /gcp/i) ? &get_gcp_project() : undef;
-if ($location =~ /gcp/i and not defined $gcp_prj) {
-    print "Error: $0 depends on gcloud being configured to fetch data from cloud storage, please configure it per the instructions in https://cloud.google.com/sdk/docs/initializing .\n";
-    exit(EXIT_FAILURE);
-}
+if ($location =~ /gcp/i and defined($opt_gcp_prj) and (not defined $gsutil or not defined $gcloud)) {
+    print "Error: when providing a GCP project, $0 depends on gsutil and gcloud to fetch data from cloud storage, please install these utilities to access this data source.\n";
+    exit(EXIT_FAILURE);
+}
+my $gcp_prj = $opt_gcp_prj;
+#my $gcp_prj = ($location =~ /gcp/i) ? &get_gcp_project() : undef;
+#if ($location =~ /gcp/i and not defined $gcp_prj) {
+#    print "Error: $0 depends on gcloud being configured to fetch data from cloud storage, please configure it per the instructions in https://cloud.google.com/sdk/docs/initializing .\n";
+#    exit(EXIT_FAILURE);
+#}
 
 my $ftp;
 
@@ -285,10 +289,11 @@ sub showall_from_metadata_file_1_1
         }
     }
     if (@files2download) {
+        my $gsutil = &get_gsutil_path();
         my $awscli = &get_awscli_path();
         my $cmd;
         my $fh = File::Temp->new();
-        if ($location eq "GCP") {
+        if ($location eq "GCP" and defined($gcp_prj)) {
             $cmd = "$gsutil -u $gcp_prj ";
             if ($opt_nt > 1) {
                 $cmd .= "-m -q ";
@@ -298,29 +303,28 @@ sub showall_from_metadata_file_1_1
                 $cmd .= "-q cp ";
             }
             $cmd .= join(" ", @files2download) . " .";
-        } else {
-            if (defined ($awscli)) {
-                # https://registry.opendata.aws/ncbi-blast-databases/#usageexamples
-                my $aws_cmd = "$awscli s3 cp --no-sign-request ";
-                $aws_cmd .= "--only-show-errors " unless $opt_verbose >= 3;
-                print $fh join("\n", @files2download);
-                $cmd = "/usr/bin/xargs -P $opt_nt -n 1 -I{}";
-                $cmd .= " -t" if $opt_verbose > 3;
-                $cmd .= " $aws_cmd {} .";
-                $cmd .= " <$fh " ;
-            } else { # fall back to curl for AWS only
-                my $url = AWS_URL;
-                s,s3://,$url/, foreach (@files2download);
-                if ($opt_nt > 1 and -f "/usr/bin/xargs") {
-                    print $fh join("\n", @files2download);
-                    $cmd = "/usr/bin/xargs -P $opt_nt -n 1";
-                    $cmd .= " -t" if $opt_verbose > 3;
-                    $cmd .= " $curl -sSOR";
-                    $cmd .= " <$fh " ;
-                } else {
-                    $cmd = "$curl -sSR";
-                    $cmd .= " -O $_" foreach (@files2download);
-                }
-            }
+        } elsif ($location eq "AWS" and defined ($awscli)) {
+            # https://registry.opendata.aws/ncbi-blast-databases/#usageexamples
+            my $aws_cmd = "$awscli s3 cp --no-sign-request ";
+            $aws_cmd .= "--only-show-errors " unless $opt_verbose >= 3;
+            print $fh join("\n", @files2download);
+            $cmd = "/usr/bin/xargs -P $opt_nt -n 1 -I{}";
+            $cmd .= " -t" if $opt_verbose > 3;
+            $cmd .= " $aws_cmd {} .";
+            $cmd .= " <$fh " ;
+        } else { # fall back to curl
+            my $url = $location eq "AWS" ? AWS_URL : GCS_URL;
+            s,gs://,$url/, foreach (@files2download);
+            s,s3://,$url/, foreach (@files2download);
+            if ($opt_nt > 1 and -f "/usr/bin/xargs") {
+                print $fh join("\n", @files2download);
+                $cmd = "/usr/bin/xargs -P $opt_nt -n 1";
+                $cmd .= " -t" if $opt_verbose > 3;
+                $cmd .= " $curl -sSOR";
+                $cmd .= " <$fh " ;
+            } else {
+                $cmd = "$curl -sSR";
+                $cmd .= " -O $_" foreach (@files2download);
+            }
         }
         print "$cmd\n" if $opt_verbose > 3;
@@ -670,8 +674,13 @@ sub get_latest_dir
         $url = AWS_URL . "/" . AWS_BUCKET . "/latest-dir";
         $cmd = "$curl -s $url";
     } else {
-        $url = GCP_BUCKET . "/latest-dir";
-        $cmd = "$gsutil -u $gcp_prj cat $url";
+        if (defined($gcp_prj)) {
+            $url = 'gs://' . GCP_BUCKET . "/latest-dir";
+            $cmd = "$gsutil -u $gcp_prj cat $url";
+        } else {
+            $url = GCS_URL . "/" . GCP_BUCKET . "/latest-dir";
+            $cmd = "$curl -s $url";
+        }
     }
     print "$cmd\n" if DEBUG;
     chomp($retval = `$cmd`);
Expand All @@ -696,8 +705,13 @@ sub get_blastdb_metadata
$url = AWS_URL . "/" . AWS_BUCKET . "/$latest_dir/" . BLASTDB_METADATA;
$cmd = "curl -sf $url";
} elsif ($source eq "GCP") {
$url = GCP_BUCKET . "/$latest_dir/" . BLASTDB_METADATA;
$cmd = "$gsutil -u $gcp_prj cat $url";
if (defined($gcp_prj)) {
$url = 'gs://' . GCP_BUCKET . "/$latest_dir/" . BLASTDB_METADATA;
$cmd = "$gsutil -u $gcp_prj cat $url";
} else {
$url = GCS_URL . "/" . GCP_BUCKET . "/$latest_dir/" . BLASTDB_METADATA;
$cmd = "curl -sf $url";
}
} else {
$url = 'ftp://' . NCBI_FTP . "/blast/db/" . BLASTDB_METADATA;
$cmd = "curl -sf $url";
Expand Down
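Taken together, these changes make the GCP project optional: without --gcp-project the script falls back to anonymous HTTPS downloads from the public GCS bucket via curl, and with it the requester-pays gsutil path is used. A usage sketch (the database name and project ID are placeholders):

    # anonymous download from the public GCS mirror; needs only curl
    update_blastdb.pl --source gcp --decompress nr
    # requester-pays download via gsutil, billed to my-gcp-project
    update_blastdb.pl --source gcp --gcp-project my-gcp-project --decompress nr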
2 changes: 1 addition & 1 deletion docker-janitor/Makefile
@@ -27,7 +27,7 @@ SHELL=/bin/bash
 .PHONY: all pre-check check clean build publish gcp-build gcp-check gcp-clean
 
 IMG?=ncbi/elasticblast-janitor
-VERSION?=0.2.0
+VERSION?=0.3.0
 GCP_PROJECT?=$(shell gcloud config get-value project 2>/dev/null)
 GCP_TEST_BUCKET?=gs://elasticblast-test/query-split-run-test
 
5 changes: 4 additions & 1 deletion docker-job-submit/Dockerfile.gcp
@@ -28,10 +28,13 @@ LABEL Vendor="NCBI/NLM/NIH"
 LABEL [email protected]
 
 COPY cloud-job-submit.sh /usr/bin/
+COPY templates/volume-snapshot-class.yaml /templates/
+COPY templates/volume-snapshot.yaml /templates/
+COPY templates/pvc-rom.yaml.template /templates/
 
 RUN chmod +x /usr/bin/cloud-job-submit.sh && \
     apk -U upgrade && \
-    apk add --no-cache bash gettext curl && \
+    apk add --no-cache bash gettext curl jq && \
     curl -LO https://storage.googleapis.com/kubernetes-release/release/`curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt`/bin/linux/amd64/kubectl && \
     chmod +x ./kubectl && \
     mv kubectl /usr/bin/ && \
3 changes: 2 additions & 1 deletion docker-job-submit/Makefile
@@ -28,7 +28,7 @@ SHELL=/bin/bash
 .PHONY: all pre-check check clean build publish gcp-build gcp-check gcp-clean
 
 IMG?=ncbi/elasticblast-job-submit
-VERSION?=2.0.0
+VERSION?=3.0.0
 GCP_PROJECT?=$(shell gcloud config get-value project 2>/dev/null)
 GCP_TEST_BUCKET?=gs://elasticblast-test/cloud-job-submission
 AWS_REGION?=us-east-1
@@ -61,6 +61,7 @@ check:
 
 .PHONY: gcp-build
 gcp-build:
+	rsync -a ../src/elastic_blast/templates ${PWD}/
 	gcloud builds submit --config cloudbuild.yaml --substitutions _VERSION=${VERSION},_IMG=${IMG}
 
 .PHONY: aws-build
75 changes: 73 additions & 2 deletions docker-job-submit/cloud-job-submit.sh
@@ -34,6 +34,7 @@ K8S_JOB_GET_BLASTDB=get-blastdb
 K8S_JOB_IMPORT_QUERY_BATCHES=import-query-batches
 K8S_JOB_SUBMIT_JOBS=submit-jobs
 ELB_PAUSE_AFTER_INIT_PV=150
+ELB_DISK_ID_FILE=disk-id.txt
 
 GSUTIL_COPY='gsutil -q cp'
 GCLOUD=gcloud
@@ -53,6 +54,7 @@ ELB_RESULTS=test
 ELB_CLUSTER_NAME=test-cluster
 ELB_GCP_PROJECT=test-project
 ELB_GCP_ZONE=test-zone
+ELB_USE_LOCAL_SSD=false
 mkdir -p test/metadata
 cp ../src/elastic_blast/templates/blast-batch-job.yaml.template test/metadata/job.yaml.template
 for ((i=0; i<1020; i++)) do printf 'batch_%03d.fa\n' "$i" >> test/metadata/batch_list.txt; done
@@ -78,20 +80,51 @@ if [[ "$s" != Complete*( Complete) ]]; then
     exit 1
 fi
 
-# Unmount ReadWrite blastdb volume, necessary for cluster use
-
+# Get init-pv job logs
+pods=`kubectl get pods -l job-name=init-pv -o jsonpath='{.items[*].metadata.name}'`
+for pod in $pods; do
+    for c in ${K8S_JOB_GET_BLASTDB} ${K8S_JOB_IMPORT_QUERY_BATCHES}; do
+        ${KUBECTL} logs $pod -c $c --timestamps --since=24h --tail=-1 | ${GSUTIL_COPY} /dev/stdin ${ELB_RESULTS}/logs/k8s-$pod-$c.log
+    done
+done
+if [ ! -z "$pods" ]; then
+
+# no need to deal with persistent disks and snapshots if a local SSD is used
+if ! $ELB_USE_LOCAL_SSD ; then
+
+# Create a volume snapshot
+${KUBECTL} apply -f /templates/volume-snapshot-class.yaml
+${KUBECTL} apply -f /templates/volume-snapshot.yaml
+sleep 5
+
+# Wait for the snapshot to be ready
+while true; do
+    st=$(${KUBECTL} get volumesnapshot blast-dbs-snapshot -o jsonpath='{.status.readyToUse}')
+    [ $? -ne 0 ] && echo "ERROR: Getting volume snapshot status" && exit 1
+    [ $st == true ] && break
+    echo "Volume snapshot status: $st"
+    sleep 30
+done
+
+# save writable disk id
+export pv_rwo=$(${KUBECTL} get pvc blast-dbs-pvc-rwo -o jsonpath='{.spec.volumeName}')
+
+# Delete the job to unmount ReadWrite blastdb volume
 ${KUBECTL} delete job init-pv
 # Wait for disk to be unmounted
 echo Waiting for $ELB_PAUSE_AFTER_INIT_PV sec to unmount PV disk
 sleep $ELB_PAUSE_AFTER_INIT_PV
+
+# Delete ReadWriteOnce PVC
+${KUBECTL} delete pvc blast-dbs-pvc-rwo
+
+# Create ReadOnlyMany PVC
+envsubst '${ELB_PD_SIZE}' </templates/pvc-rom.yaml.template >pvc-rom.yaml
+${KUBECTL} apply -f pvc-rom.yaml
+fi
 
 
 # Debug job fail - set env variable ELB_DEBUG_SUBMIT_JOB_FAIL to non empty value
 [ -n "${ELB_DEBUG_SUBMIT_JOB_FAIL:-}" ] && echo Job submit job failed for debug && exit 1

Expand Down Expand Up @@ -144,4 +177,42 @@ if ${GSUTIL_COPY} ${ELB_RESULTS}/${ELB_METADATA_DIR}/job.yaml.template . &&
fi
copy_job_logs_to_results_bucket submit "${K8S_JOB_SUBMIT_JOBS}"
echo Done
else
echo "Job file or batch list not found in GCS"
exit 1
fi


+# no need to deal with persistent disks and snapshots if a local SSD is used
+if $ELB_USE_LOCAL_SSD ; then
+    exit 0
+fi
+
+# wait for PVC to bind
+while true; do
+    st=$(${KUBECTL} get -f pvc-rom.yaml -o jsonpath='{.status.phase}')
+    [ $? -ne 0 ] && echo "ERROR: Getting PVC bind state" && exit 1
+    [ $st == Bound ] && break
+    echo "PVC status: $st"
+    sleep 30
+done
+
+# label the new persistent disk
+export pv=$(${KUBECTL} get -f pvc-rom.yaml -o jsonpath='{.spec.volumeName}')
+jq -n --arg dd $pv '[$dd]' | gsutil cp - ${ELB_RESULTS}/${ELB_METADATA_DIR}/$ELB_DISK_ID_FILE
+gcloud compute disks update $pv --update-labels ${ELB_LABELS} --zone ${ELB_GCP_ZONE} --project ${ELB_GCP_PROJECT}
+
+# delete snapshot
+${KUBECTL} delete volumesnapshot --all
+
+# check if the writable disk was deleted and try deleting again,
+# if unsuccessful save its id in GS
+if gcloud compute disks describe $pv_rwo --zone $ELB_GCP_ZONE ; then
+    gcloud compute disks delete $pv_rwo --zone $ELB_GCP_ZONE
+    sleep 10
+
+    if gcloud compute disks describe $pv_rwo --zone $ELB_GCP_ZONE ; then
+        jq -n --arg d1 $pv_rwo --arg d2 $pv '[$d1, $d2]' | gsutil cp - ${ELB_RESULTS}/${ELB_METADATA_DIR}/$ELB_DISK_ID_FILE
+    fi
+fi
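The jq calls above write a JSON array of disk IDs to the results bucket so that later cleanup can find leftover disks; for example (a sketch with a placeholder volume name):

    jq -n --arg dd pvc-12345 '[$dd]'    # prints ["pvc-12345"]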