Fix/236 (fix Zenodo blocking downloads) #247

Merged: 5 commits, Dec 11, 2023
3 changes: 2 additions & 1 deletion Makefile
@@ -9,6 +9,7 @@ DATETIME=$(shell date -u +"%Y_%m_%dT%H_%M_%S")

THREADS=$(shell grep "^threads:" config.yaml | awk '{print $$2}')
MAX_DOWNLOAD_THREADS=$(shell grep "^max_download_threads" config.yaml | awk '{print $$2}')
DOWNLOAD_RETRIES=$(shell grep "^download_retries" config.yaml | awk '{print $$2}')
MAX_IO_HEAVY_THREADS=$(shell grep "^max_io_heavy_threads" config.yaml | awk '{print $$2}')
MAX_RAM_MB=$(shell grep "^max_ram_gb:" config.yaml | awk '{print $$2*1024}')

@@ -69,7 +70,7 @@ conda: ## Create the conda environments
	snakemake $(SMK_PARAMS) --conda-create-envs-only

download: ## Download the assemblies and COBS indexes
	snakemake download $(SMK_PARAMS) -j 99999
	snakemake download $(SMK_PARAMS) -j 99999 --restart-times $(DOWNLOAD_RETRIES)

download_asms: ## Download only the assemblies
	snakemake download_asms_batches $(SMK_PARAMS) -j 99999
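As a side note on how the new variable is consumed (a sketch, not part of the diff): DOWNLOAD_RETRIES is read from config.yaml with the same grep/awk pattern as the existing variables and handed to Snakemake as --restart-times, so a failed download job gets rescheduled instead of killing the whole run. Assuming the download_retries: 3 value added to config.yaml below:

# shell equivalent of the Makefile extraction (single $ instead of the Makefile's $$)
grep "^download_retries" config.yaml | awk '{print $2}'    # prints: 3
# `make download` then effectively runs something like:
#   snakemake download <SMK_PARAMS> -j 99999 --restart-times 3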
33 changes: 19 additions & 14 deletions Snakefile
@@ -2,6 +2,7 @@ import functools
import glob
from pathlib import Path
from snakemake.utils import min_version
import random
import re

##################################
@@ -199,10 +200,13 @@ def cobs_url_fct(wildcards):
    else:
        return f"https://zenodo.org/record/6845083/files/{x}.cobs_classic.xz"

def asms_url_fct(wildcards):
    asm_zenodo = 4602622
    asm_url = f"https://zenodo.org/record/{asm_zenodo}/files/{wildcards.batch}.tar.xz"
    return asm_url

asm_zenodo = 4602622
asms_url = f"https://zenodo.org/record/{asm_zenodo}/files"

def get_sleep_amount(attempt):
    return int(config["download_retry_wait"]) * (attempt - 1)

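A quick illustration of the backoff this produces (illustration only; it assumes the download_retry_wait: 10 default added to config.yaml in this PR): the sleep grows linearly with the attempt number, and the first attempt does not sleep at all.

download_retry_wait=10
for attempt in 1 2 3 4; do
    echo "attempt $attempt: sleep $((download_retry_wait * (attempt - 1)))s"
done
# attempt 1: sleep 0s
# attempt 2: sleep 10s
# attempt 3: sleep 20s
# attempt 4: sleep 30s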
##################################
## Top-level rules
@@ -261,34 +265,35 @@ rule download_asm_batch:
    """
    output:
        xz=f"{assemblies_dir}/{{batch}}.tar.xz",
    params:
        url=asms_url,
    threads: 1
    resources:
        max_download_threads=1,
        mem_mb=200,
    threads: 1
        # note: sleep_amount has to be defined as a resource
        # note: I tried a hack to route it to params, but it did not work, see https://github.com/snakemake/snakemake/issues/499
        sleep_amount=lambda wildcards, attempt: get_sleep_amount(attempt)
    params:
        url=asms_url_fct
    shell:
        """
        curl -L "{params.url}/{wildcards.batch}.tar.xz" > {output.xz}
        scripts/test_xz.py {output.xz}
        scripts/download.sh {params.url} {output.xz} {resources.sleep_amount}
        """


rule download_cobs_batch:
"""Download compressed cobs indexes
"""
output:
xz=f"{cobs_dir}/{{batch}}.cobs_classic.xz",
params:
url=cobs_url_fct,
threads: 1
resources:
max_download_threads=1,
mem_mb=200,
threads: 1
sleep_amount=lambda wildcards, attempt: get_sleep_amount(attempt)
params:
url=cobs_url_fct
shell:
"""
curl -L "{params.url}" > {output.xz}
scripts/test_xz.py {output.xz}
scripts/download.sh {params.url} {output.xz} {resources.sleep_amount}
"""


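Putting the two rules together (a sketch of the intended behaviour, not literal scheduler output; URL and output path are placeholders): Snakemake passes the attempt number to the sleep_amount resource callable, and --restart-times (driven by download_retries via the Makefile) is what pushes attempt above 1, so every retry of a failed batch calls download.sh with a longer sleep.

scripts/download.sh "$URL" asms/batchX.tar.xz 0     # attempt 1: no sleep
scripts/download.sh "$URL" asms/batchX.tar.xz 10    # attempt 2: back off 10 s first
scripts/download.sh "$URL" asms/batchX.tar.xz 20    # attempt 3: back off 20 s first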
22 changes: 16 additions & 6 deletions config.yaml
@@ -56,10 +56,25 @@ threads: all
# if index_load_mode == mmap-disk, then this parameter is ignored as the OS will manage RAM usage
# WARNING: this parameter is ignored when running on a cluster
max_ram_gb: 12
##################################################

##################################################
# download

# maximum number of download threads at a time (note: too many might slow down download speed)
# maximum number of download threads at a time (note: too many might slow down download speed and make Zenodo block your requests)
# WARNING: this parameter is ignored when running on a cluster
max_download_threads: 8

# how many times to retry a download if it fails
download_retries: 3

# how many seconds to wait between retries
download_retry_wait: 10

# directory to store all downloaded files. This is where an "asms" folder with all assemblies and a "cobs" folder
# with all COBS indexes will be created. This is a heavy directory; put it on a filesystem with at least
# 100 GB free
download_dir: "."
##################################################

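Worked example of how the two retry knobs combine (assuming the defaults above): download_retries: 3 allows up to 4 attempts per batch, and with download_retry_wait: 10 the pre-download sleeps are 0, 10, 20 and 30 seconds.

download_retries=3
download_retry_wait=10
attempts=$((download_retries + 1))
max_backoff=$((download_retry_wait * download_retries * (download_retries + 1) / 2))
echo "$attempts attempts, at most $max_backoff s of backoff per batch"    # 4 attempts, at most 60 s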
##################################################
@@ -121,9 +136,4 @@ keep_cobs_indexes: False
# directory to store the COBS decompressed indexes. Can be used to put the decompressed indexes in an external
# or large filesystem capable of holding them. If not defined, defaults to "intermediate/00_cobs"
# decompression_dir: cobs_decompressed_indexes

# directory to store all downloaded files. This is where an "asms" folder with all assemblies and a "cobs" folder
# with all COBS indexes will be created. This is a heavy directory; put it on a filesystem with at least
# 100 GB free
download_dir: "."
###################################################################################################
14 changes: 14 additions & 0 deletions scripts/download.sh
@@ -0,0 +1,14 @@
#!/usr/bin/env bash
url=$1
output=$2
sleep_amount=$3

if [ $sleep_amount -gt 0 ]; then
    echo "Detected previous failed downloads, probably Zenodo blocking downloads."
    echo "Now random sleeping for $sleep_amount seconds before retrying..."
    sleep $sleep_amount
    echo "Retrying..."
fi
echo "Downloading $url ..."
curl -s -L "${url}" > "${output}"
scripts/test_xz.py "${output}"
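The script is called from both download rules as scripts/download.sh {params.url} {output.xz} {resources.sleep_amount}. There is no set -e, so a curl failure is only caught indirectly: the final test_xz.py check should turn a truncated or HTML-error download into a non-zero exit status, which is what lets Snakemake count the job as failed and retry it. A manual invocation would look roughly like this (URL and output path are placeholders):

bash scripts/download.sh \
    "https://zenodo.org/record/4602622/files/<batch>.tar.xz" \
    asms/<batch>.tar.xz \
    0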