Merge pull request #247 from leoisl/fix/236
Fix/236 (fix Zenodo blocking downloads)
karel-brinda authored Dec 11, 2023
2 parents 94e15be + 04c9095 commit 0041b89
Showing 4 changed files with 51 additions and 21 deletions.
Makefile (3 changes: 2 additions & 1 deletion)
@@ -9,6 +9,7 @@ DATETIME=$(shell date -u +"%Y_%m_%dT%H_%M_%S")

THREADS=$(shell grep "^threads:" config.yaml | awk '{print $$2}')
MAX_DOWNLOAD_THREADS=$(shell grep "^max_download_threads" config.yaml | awk '{print $$2}')
DOWNLOAD_RETRIES=$(shell grep "^download_retries" config.yaml | awk '{print $$2}')
MAX_IO_HEAVY_THREADS=$(shell grep "^max_io_heavy_threads" config.yaml | awk '{print $$2}')
MAX_RAM_MB=$(shell grep "^max_ram_gb:" config.yaml | awk '{print $$2*1024}')

@@ -73,7 +74,7 @@ conda: ## Create the conda environments
	snakemake $(SMK_PARAMS) --conda-create-envs-only

download: ## Download the assemblies and COBS indexes
	snakemake download $(SMK_PARAMS) -j 99999
	snakemake download $(SMK_PARAMS) -j 99999 --restart-times $(DOWNLOAD_RETRIES)

download_asms: ## Download only the assemblies
	snakemake download_asms_batches $(SMK_PARAMS) -j 99999
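
The new DOWNLOAD_RETRIES value is pulled out of config.yaml with the same grep/awk pattern as the other settings and forwarded to Snakemake as --restart-times, so a failed download job is resubmitted instead of aborting the whole run. A rough Python rendering of that extraction and of the resulting command line (a sketch only: config_value is a hypothetical helper, and SMK_PARAMS is omitted):

# Sketch of what the Makefile's grep "^download_retries" | awk '{print $2}' does.
def config_value(key: str, path: str = "config.yaml") -> str:
    with open(path) as f:
        for line in f:
            if line.startswith(key):
                return line.split()[1]   # second whitespace-separated field, e.g. "3"
    raise KeyError(key)

download_retries = config_value("download_retries")
print(f"snakemake download -j 99999 --restart-times {download_retries}")
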
Snakefile (33 changes: 19 additions & 14 deletions)
@@ -2,6 +2,7 @@ import functools
import glob
from pathlib import Path
from snakemake.utils import min_version
import random
import re

##################################
@@ -199,10 +200,13 @@ def cobs_url_fct(wildcards):
    else:
        return f"https://zenodo.org/record/6845083/files/{x}.cobs_classic.xz"

def asms_url_fct(wildcards):
    asm_zenodo = 4602622
    asm_url = f"https://zenodo.org/record/{asm_zenodo}/files/{wildcards.batch}.tar.xz"
    return asm_url

asm_zenodo = 4602622
asms_url = f"https://zenodo.org/record/{asm_zenodo}/files"

def get_sleep_amount(attempt):
    return int(config["download_retry_wait"]) * (attempt - 1)

##################################
## Top-level rules
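
The get_sleep_amount helper added in the hunk above implements a simple linear backoff: Snakemake supplies the 1-based attempt number to resource callables, so the first attempt sleeps 0 s and every retry waits download_retry_wait seconds longer than the previous one. A minimal standalone sketch (the hard-coded 10 mirrors the default download_retry_wait in config.yaml):

download_retry_wait = 10  # seconds, from config.yaml

def get_sleep_amount(attempt: int) -> int:
    # attempt starts at 1, so the initial try never sleeps
    return download_retry_wait * (attempt - 1)

print([get_sleep_amount(a) for a in range(1, 5)])  # [0, 10, 20, 30]
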
@@ -261,34 +265,35 @@ rule download_asm_batch:
"""
output:
xz=f"{assemblies_dir}/{{batch}}.tar.xz",
params:
url=asms_url,
threads: 1
resources:
max_download_threads=1,
mem_mb=200,
threads: 1
# note: sleep_amount has to be defined as a resource
# note: I tried a hack to route it to params, but it did not work, see https://github.com/snakemake/snakemake/issues/499
sleep_amount=lambda wildcards, attempt: get_sleep_amount(attempt)
params:
url=asms_url_fct
shell:
"""
curl -L "{params.url}/{wildcards.batch}.tar.xz" > {output.xz}
scripts/test_xz.py {output.xz}
scripts/download.sh {params.url} {output.xz} {resources.sleep_amount}
"""


rule download_cobs_batch:
"""Download compressed cobs indexes
"""
output:
xz=f"{cobs_dir}/{{batch}}.cobs_classic.xz",
params:
url=cobs_url_fct,
threads: 1
resources:
max_download_threads=1,
mem_mb=200,
threads: 1
sleep_amount=lambda wildcards, attempt: get_sleep_amount(attempt)
params:
url=cobs_url_fct
shell:
"""
curl -L "{params.url}" > {output.xz}
scripts/test_xz.py {output.xz}
scripts/download.sh {params.url} {output.xz} {resources.sleep_amount}
"""


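In both download rules the per-attempt sleep cannot be routed through params (see the snakemake issue linked in the rule comment), so it is exposed as a resources entry and handed to scripts/download.sh on the command line; the old curl/test_xz pair in the shell block is replaced by that script. Outside Snakemake, the retry behaviour for a single batch roughly corresponds to the sketch below (a hypothetical driver: the real retries are performed by Snakemake via --restart-times, and the URL/output paths are placeholders):

import subprocess

def download_batch(url: str, output_xz: str, retries: int = 3, retry_wait: int = 10) -> None:
    # attempt 1 is the initial try; --restart-times allows `retries` further attempts
    for attempt in range(1, retries + 2):
        sleep_amount = retry_wait * (attempt - 1)  # same schedule as get_sleep_amount()
        result = subprocess.run(["scripts/download.sh", url, output_xz, str(sleep_amount)])
        if result.returncode == 0:
            return
    raise RuntimeError(f"{url}: download still failing after {retries + 1} attempts")
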
config.yaml (22 changes: 16 additions & 6 deletions)
@@ -56,10 +56,25 @@ threads: all
# if index_load_mode == mmap-disk, then this parameter is ignored as the OS will manage RAM usage
# WARNING: this parameter is ignored when running on a cluster
max_ram_gb: 12
##################################################

##################################################
# download

# maximum number of download threads at a time (note: too many might slow down download speed)
# maximum number of download threads at a time (note: too many might slow down download speed and make Zenodo block your requests)
# WARNING: this parameter is ignored when running on a cluster
max_download_threads: 8

# how many times to retry a download if it fails
download_retries: 3

# how many seconds to wait between retries
download_retry_wait: 10

# directory to store all downloaded files. This is where an "asms" folder with all assemblies and a "cobs" folder
# with all COBS indexes will be created. This is a heavy directory, put it in a filesystem that has at least
# 100 GB free
download_dir: "."
##################################################

##################################################
@@ -121,9 +136,4 @@ keep_cobs_indexes: False
# directory to store the COBS decompressed indexes. Can be used to put the decompressed indexes in an external
# or large filesystem capable of holding them. If not defined, defaults to "intermediate/00_cobs"
# decompression_dir: cobs_decompressed_indexes

# directory to store all downloaded files. This is where an "asms" folder with all assemblies and a "cobs" folder
# with all COBS indexes will be created. This is a heavy directory, put it in a filesystem that has at least
# 100 GB free
download_dir: "."
###################################################################################################
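
The download block of config.yaml now also documents where everything lands: an "asms" folder for assemblies and a "cobs" folder for COBS indexes are created under download_dir, which should sit on a filesystem with at least 100 GB free. A small sketch of reading these new keys (assumes PyYAML; the snippet is illustrative and not part of the workflow):

import pathlib
import yaml  # PyYAML

with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

download_dir = pathlib.Path(cfg.get("download_dir", "."))
asms_dir = download_dir / "asms"   # compressed assemblies
cobs_dir = download_dir / "cobs"   # compressed COBS indexes

retries = int(cfg.get("download_retries", 3))
retry_wait = int(cfg.get("download_retry_wait", 10))
print(asms_dir, cobs_dir, f"retry up to {retries}x, +{retry_wait}s per retry")
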
scripts/download.sh (14 changes: 14 additions & 0 deletions)
@@ -0,0 +1,14 @@
#!/usr/bin/env bash
url=$1
output=$2
sleep_amount=$3

if [ $sleep_amount -gt 0 ]; then
    echo "Detected previous failed downloads, probably Zenodo blocking downloads."
    echo "Now random sleeping for $sleep_amount seconds before retrying..."
    sleep $sleep_amount
    echo "Retrying..."
fi
echo "Downloading $url ..."
curl -s -L "${url}" > "${output}"
scripts/test_xz.py "${output}"
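
download.sh sleeps only when a previous attempt has already failed (sleep_amount > 0), then downloads the file quietly with curl and hands it to scripts/test_xz.py; if that integrity check exits non-zero, the Snakemake job fails and --restart-times schedules another attempt with a longer sleep. test_xz.py itself is not part of this diff; a plausible minimal version, shown only as an assumption about what it checks, could look like:

#!/usr/bin/env python3
# Hypothetical sketch, NOT the actual scripts/test_xz.py (that script is not in this diff).
import lzma
import sys

def check_xz(path: str, chunk_size: int = 1 << 20) -> None:
    # streaming decompression raises LZMAError/EOFError on a corrupt or truncated archive
    with lzma.open(path, "rb") as f:
        while f.read(chunk_size):
            pass

if __name__ == "__main__":
    check_xz(sys.argv[1])
    print(f"{sys.argv[1]}: OK")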
