From e273b65f58e0ea1194ee33950a2db48c3fd47e4f Mon Sep 17 00:00:00 2001 From: "Josh L. Espinoza" Date: Fri, 30 Aug 2024 13:19:22 -0700 Subject: [PATCH 1/4] Added ${N_JOBS} to download scripts --- CHANGELOG.md | 1 + install/download_databases-annotate.sh | 15 ++++++++------- install/download_databases-classify.sh | 15 ++++++++------- install/download_databases-contamination.sh | 5 +++-- install/download_databases-markers.sh | 5 +++-- install/download_databases.sh | 13 +++++++------ 6 files changed, 30 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7db7d40..b0c9159 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -472,6 +472,7 @@ ________________________________________________________________
Daily Change Log: +* [2024.8.30] - Added ${N_JOBS} to download scripts with default set to maximum threads available * [2024.8.29] - Added `VERSION` file created in `download_databases.sh` * [2024.7.11] - Alignment fraction threshold for genome clustering only applied to reference but should also apply to query. Added `--af_mode` with either `relaxed = max([Alignment_fraction_ref, Alignment_fraction_query]) > minimum_af` or `strict = (Alignment_fraction_ref > minimum_af) & (Alignment_fraction_query > minimum_af)` to `edgelist_to_clusters.py`, `global_clustering.py`, `local_clustering.py`, and `cluster.py`. * [2024.7.3] - Added `pigz` to `VEBA-annotate_env` which isn't a problem with most `conda` installations but needed for `docker` containers. diff --git a/install/download_databases-annotate.sh b/install/download_databases-annotate.sh index f3e44f4..7ea4346 100644 --- a/install/download_databases-annotate.sh +++ b/install/download_databases-annotate.sh @@ -1,5 +1,5 @@ #!/bin/bash -# __version__ = "2024.6.8" +# __version__ = "2024.8.30" # VEBA_DATABASE_VERSION = "VDB_v7" # MICROEUKAYROTIC_DATABASE_VERSION = "MicroEuk_v3" # usage: bash veba/download_databases-annotate.sh /path/to/veba_database_destination/ @@ -9,7 +9,8 @@ DATABASE_DIRECTORY=${1:-"."} REALPATH_DATABASE_DIRECTORY=$(realpath $DATABASE_DIRECTORY) SCRIPT_DIRECTORY=$(dirname "$0") -# N_JOBS=$(2:-"1") +MAXIMUM_NUMBER_OF_CPU=$(python -c "from multiprocessing import cpu_count; print(cpu_count())") +N_JOBS=$(2:-${MAXIMUM_NUMBER_OF_CPU}) # Database structure echo ". .. ... ..... ........ ............." 
@@ -78,19 +79,19 @@ mkdir -v -p ${DATABASE_DIRECTORY}/Annotate/UniRef wget -v -P ${DATABASE_DIRECTORY}/Annotate/UniRef/ https://ftp.uniprot.org/pub/databases/uniprot/current_release/uniref/uniref90/uniref90.release_note wget -v -P ${DATABASE_DIRECTORY} https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/uniref90.fasta.gz -diamond makedb --in ${DATABASE_DIRECTORY}/uniref90.fasta.gz --db ${DATABASE_DIRECTORY}/Annotate/UniRef/uniref90.dmnd +diamond makedb --in ${DATABASE_DIRECTORY}/uniref90.fasta.gz --db ${DATABASE_DIRECTORY}/Annotate/UniRef/uniref90.dmnd --threads ${N_JOBS} rm -rf ${DATABASE_DIRECTORY}/uniref90.fasta.gz wget -v -P ${DATABASE_DIRECTORY}/Annotate/UniRef/ https://ftp.uniprot.org/pub/databases/uniprot/current_release/uniref/uniref50/uniref50.release_note wget -v -P ${DATABASE_DIRECTORY} https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref50/uniref50.fasta.gz -diamond makedb --in ${DATABASE_DIRECTORY}/uniref50.fasta.gz --db ${DATABASE_DIRECTORY}/Annotate/UniRef/uniref50.dmnd +diamond makedb --in ${DATABASE_DIRECTORY}/uniref50.fasta.gz --db ${DATABASE_DIRECTORY}/Annotate/UniRef/uniref50.dmnd --threads ${N_JOBS} rm -rf ${DATABASE_DIRECTORY}/uniref50.fasta.gz #MiBIG mkdir -v -p ${DATABASE_DIRECTORY}/Annotate/MIBiG wget -v -P ${DATABASE_DIRECTORY} https://dl.secondarymetabolites.org/mibig/mibig_prot_seqs_3.1.fasta seqkit rmdup -s ${DATABASE_DIRECTORY}/mibig_prot_seqs_3.1.fasta > ${DATABASE_DIRECTORY}/mibig_prot_seqs_3.1.rmdup.fasta -diamond makedb --in ${DATABASE_DIRECTORY}/mibig_prot_seqs_3.1.rmdup.fasta --db ${DATABASE_DIRECTORY}/Annotate/MIBiG/mibig_v3.1.dmnd +diamond makedb --in ${DATABASE_DIRECTORY}/mibig_prot_seqs_3.1.rmdup.fasta --db ${DATABASE_DIRECTORY}/Annotate/MIBiG/mibig_v3.1.dmnd --threads ${N_JOBS} rm -rf ${DATABASE_DIRECTORY}/mibig_prot_seqs_3.1.fasta rm -rf ${DATABASE_DIRECTORY}/mibig_prot_seqs_3.1.rmdup.fasta @@ -104,13 +105,13 @@ rm -rf ${DATABASE_DIRECTORY}/mibig_prot_seqs_3.1.rmdup.fasta mkdir -v -p 
${DATABASE_DIRECTORY}/Annotate/VFDB wget -v -P ${DATABASE_DIRECTORY} http://www.mgc.ac.cn/VFs/Down/VFDB_setA_pro.fas.gz wget -v -P ${DATABASE_DIRECTORY}/Annotate/VFDB/ http://www.mgc.ac.cn/VFs/Down/VFs.xls.gz -diamond makedb --in ${DATABASE_DIRECTORY}/VFDB_setA_pro.fas.gz --db ${DATABASE_DIRECTORY}/Annotate/VFDB/VFDB_setA_pro.dmnd +diamond makedb --in ${DATABASE_DIRECTORY}/VFDB_setA_pro.fas.gz --db ${DATABASE_DIRECTORY}/Annotate/VFDB/VFDB_setA_pro.dmnd --threads ${N_JOBS} rm -rf ${DATABASE_DIRECTORY}/VFDB_setA_pro.fas.gz # CAZy mkdir -v -p ${DATABASE_DIRECTORY}/Annotate/CAZy wget -v -P ${DATABASE_DIRECTORY} https://bcb.unl.edu/dbCAN2/download/CAZyDB.07262023.fa -diamond makedb --in ${DATABASE_DIRECTORY}/CAZyDB.07262023.fa --db ${DATABASE_DIRECTORY}/Annotate/CAZy/CAZyDB.07262023.dmnd +diamond makedb --in ${DATABASE_DIRECTORY}/CAZyDB.07262023.fa --db ${DATABASE_DIRECTORY}/Annotate/CAZy/CAZyDB.07262023.dmnd --threads ${N_JOBS} rm -rf ${DATABASE_DIRECTORY}/CAZyDB.07262023.fa diff --git a/install/download_databases-classify.sh b/install/download_databases-classify.sh index 4689b3c..7140413 100644 --- a/install/download_databases-classify.sh +++ b/install/download_databases-classify.sh @@ -1,5 +1,5 @@ #!/bin/bash -# __version__ = "2024.6.8.1" +# __version__ = "2024.8.30" # VEBA_DATABASE_VERSION = "VDB_v7" # MICROEUKAYROTIC_DATABASE_VERSION = "MicroEuk_v3" # usage: bash veba/download_databases-classify.sh /path/to/veba_database_destination/ @@ -9,7 +9,8 @@ DATABASE_DIRECTORY=${1:-"."} REALPATH_DATABASE_DIRECTORY=$(realpath $DATABASE_DIRECTORY) SCRIPT_DIRECTORY=$(dirname "$0") -# N_JOBS=$(2:-"1") +MAXIMUM_NUMBER_OF_CPU=$(python -c "from multiprocessing import cpu_count; print(cpu_count())") +N_JOBS=$(2:-${MAXIMUM_NUMBER_OF_CPU}) # Database structure echo ". .. ... ..... ........ ............." 
@@ -81,7 +82,7 @@ wget -v -P ${DATABASE_DIRECTORY} https://portal.nersc.gov/CheckV/checkv-db-${CHE tar xvzf ${DATABASE_DIRECTORY}/checkv-db-${CHECKVDB_VERSION}.tar.gz -C ${DATABASE_DIRECTORY} mv ${DATABASE_DIRECTORY}/checkv-db-${CHECKVDB_VERSION} ${DATABASE_DIRECTORY}/Classify/CheckV echo "${CHECKV_VERSION}" > ${DATABASE_DIRECTORY}/Classify/CheckV/database_version -diamond makedb --in ${DATABASE_DIRECTORY}/Classify/CheckV/genome_db/checkv_reps.faa --db ${DATABASE_DIRECTORY}/Classify/CheckV/genome_db/checkv_reps.dmnd +diamond makedb --in ${DATABASE_DIRECTORY}/Classify/CheckV/genome_db/checkv_reps.faa --db ${DATABASE_DIRECTORY}/Classify/CheckV/genome_db/checkv_reps.dmnd --threads ${N_JOBS} rm -rf ${DATABASE_DIRECTORY}/checkv-db-${CHECKVDB_VERSION}.tar.gz # geNomad @@ -128,19 +129,19 @@ cp -rf ${DATABASE_DIRECTORY}/MicroEuk_v3/source_taxonomy.tsv.gz ${DATABASE_DIREC # MicroEuk100 gzip -d ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk100.faa.gz -mmseqs createdb --compressed 1 ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk100.faa ${DATABASE_DIRECTORY}/Classify/MicroEuk/MicroEuk100 +mmseqs createdb --threads ${N_JOBS} --compressed 1 ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk100.faa ${DATABASE_DIRECTORY}/Classify/MicroEuk/MicroEuk100 # MicroEuk100.eukaryota_odb10 gzip -d ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk100.eukaryota_odb10.list.gz -seqkit grep -f ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk100.eukaryota_odb10.list ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk100.faa | mmseqs createdb --compressed 1 stdin ${DATABASE_DIRECTORY}/Classify/MicroEuk/MicroEuk100.eukaryota_odb10 +seqkit grep -f ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk100.eukaryota_odb10.list ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk100.faa | mmseqs createdb --threads ${N_JOBS} --compressed 1 stdin ${DATABASE_DIRECTORY}/Classify/MicroEuk/MicroEuk100.eukaryota_odb10 # MicroEuk90 gzip -d -c ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk90_clusters.tsv.gz | cut -f1 | sort -u > 
${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk90.list -seqkit grep -f ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk90.list ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk100.faa | mmseqs createdb --compressed 1 stdin ${DATABASE_DIRECTORY}/Classify/MicroEuk/MicroEuk90 +seqkit grep -f ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk90.list ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk100.faa | mmseqs createdb --threads ${N_JOBS} --compressed 1 stdin ${DATABASE_DIRECTORY}/Classify/MicroEuk/MicroEuk90 # MicroEuk50 gzip -d -c ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk50_clusters.tsv.gz | cut -f1 | sort -u > ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk50.list -seqkit grep -f ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk50.list ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk100.faa | mmseqs createdb --compressed 1 stdin ${DATABASE_DIRECTORY}/Classify/MicroEuk/MicroEuk50 +seqkit grep -f ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk50.list ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk100.faa | mmseqs createdb --threads ${N_JOBS} --compressed 1 stdin ${DATABASE_DIRECTORY}/Classify/MicroEuk/MicroEuk50 # source_to_lineage.dict.pkl.gz build_source_to_lineage_dictionary.py -i ${DATABASE_DIRECTORY}/MicroEuk_v3/source_taxonomy.tsv.gz -o ${DATABASE_DIRECTORY}/Classify/MicroEuk/source_to_lineage.dict.pkl.gz diff --git a/install/download_databases-contamination.sh b/install/download_databases-contamination.sh index 30c0d6b..fac6c26 100644 --- a/install/download_databases-contamination.sh +++ b/install/download_databases-contamination.sh @@ -1,5 +1,5 @@ #!/bin/bash -# __version__ = "2024.6.8" +# __version__ = "2024.8.30" # VEBA_DATABASE_VERSION = "VDB_v7" # MICROEUKAYROTIC_DATABASE_VERSION = "MicroEuk_v3" # usage: bash veba/download_databases-contamination.sh /path/to/veba_database_destination/ @@ -9,7 +9,8 @@ DATABASE_DIRECTORY=${1:-"."} REALPATH_DATABASE_DIRECTORY=$(realpath $DATABASE_DIRECTORY) SCRIPT_DIRECTORY=$(dirname "$0") -# N_JOBS=$(2:-"1") +MAXIMUM_NUMBER_OF_CPU=$(python -c "from multiprocessing import 
cpu_count; print(cpu_count())") +N_JOBS=$(2:-${MAXIMUM_NUMBER_OF_CPU}) # Database structure echo ". .. ... ..... ........ ............." diff --git a/install/download_databases-markers.sh b/install/download_databases-markers.sh index 185336e..ceb2c67 100644 --- a/install/download_databases-markers.sh +++ b/install/download_databases-markers.sh @@ -1,5 +1,5 @@ #!/bin/bash -# __version__ = "2024.6.8" +# __version__ = "2024.8.30" # VEBA_DATABASE_VERSION = "VDB_v8" # MICROEUKAYROTIC_DATABASE_VERSION = "MicroEuk_v3" # usage: bash veba/download_databases-preprocess.sh /path/to/veba_database_destination/ @@ -9,7 +9,8 @@ DATABASE_DIRECTORY=${1:-"."} REALPATH_DATABASE_DIRECTORY=$(realpath $DATABASE_DIRECTORY) SCRIPT_DIRECTORY=$(dirname "$0") -# N_JOBS=$(2:-"1") +MAXIMUM_NUMBER_OF_CPU=$(python -c "from multiprocessing import cpu_count; print(cpu_count())") +N_JOBS=$(2:-${MAXIMUM_NUMBER_OF_CPU}) # Database structure echo ". .. ... ..... ........ ............." diff --git a/install/download_databases.sh b/install/download_databases.sh index fae2c96..4be5df2 100644 --- a/install/download_databases.sh +++ b/install/download_databases.sh @@ -1,5 +1,5 @@ #!/bin/bash -# __version__ = "2024.8.29" +# __version__ = "2024.8.30" # MICROEUKAYROTIC_DATABASE_VERSION = "MicroEuk_v3" # usage: bash veba/download_databases.sh /path/to/veba_database_destination/ [optional positional argument: /path/to/conda_environments/] # Version @@ -12,7 +12,8 @@ SCRIPT_DIRECTORY=$(dirname "$0") CONDA_ENVS_PATH=${2:-"$(conda info --base)/envs/"} -# N_JOBS=$(2:-"1") +MAXIMUM_NUMBER_OF_CPU=$(python -c "from multiprocessing import cpu_count; print(cpu_count())") +N_JOBS=$(3:-${MAXIMUM_NUMBER_OF_CPU}) # Database structure echo ". .. ... ..... ........ ............." @@ -33,24 +34,24 @@ echo $VEBA_DATABASE_VERSION > ${DATABASE_DIRECTORY}/VERSION echo ". .. ... ..... ........ ............." echo "Downloading and configuring database (markers)" echo ". .. ... ..... ........ ............." 
-bash ${SCRIPT_DIRECTORY}/download_databases-markers.sh ${DATABASE_DIRECTORY} | grep -v "\[partial-database\]" +bash ${SCRIPT_DIRECTORY}/download_databases-markers.sh ${DATABASE_DIRECTORY} ${N_JOBS} | grep -v "\[partial-database\]" echo ". .. ... ..... ........ ............." echo "Downloading and configuring database (contamination)" echo ". .. ... ..... ........ ............." -bash ${SCRIPT_DIRECTORY}/download_databases-contamination.sh ${DATABASE_DIRECTORY} | grep -v "\[partial-database\]" +bash ${SCRIPT_DIRECTORY}/download_databases-contamination.sh ${DATABASE_DIRECTORY} ${N_JOBS} | grep -v "\[partial-database\]" echo ". .. ... ..... ........ ............." echo "Downloading and configuring database (classify)" echo ". .. ... ..... ........ ............." echo "This might take a while depending on source database i/o speed..." -bash ${SCRIPT_DIRECTORY}/download_databases-classify.sh ${DATABASE_DIRECTORY} | grep -v "\[partial-database\]" +bash ${SCRIPT_DIRECTORY}/download_databases-classify.sh ${DATABASE_DIRECTORY} ${N_JOBS} | grep -v "\[partial-database\]" echo ". .. ... ..... ........ ............." echo "Downloading and configuring database (annotate)" echo ". .. ... ..... ........ ............." echo "This might take a while depending on source database i/o speed..." -bash ${SCRIPT_DIRECTORY}/download_databases-annotate.sh ${DATABASE_DIRECTORY} | grep -v "\[partial-database\]" +bash ${SCRIPT_DIRECTORY}/download_databases-annotate.sh ${DATABASE_DIRECTORY} ${N_JOBS} | grep -v "\[partial-database\]" # Environment variables echo ". .. ... ..... ........ ............." From 494ee56f92339f409172c9b9e1581e01f8537490 Mon Sep 17 00:00:00 2001 From: "Josh L. 
Espinoza" Date: Fri, 30 Aug 2024 13:52:49 -0700 Subject: [PATCH 2/4] Update N_JOBS --- install/download_databases-annotate.sh | 2 +- install/download_databases-classify.sh | 2 +- install/download_databases-contamination.sh | 2 +- install/download_databases-markers.sh | 2 +- install/download_databases.sh | 8 ++++++-- 5 files changed, 10 insertions(+), 6 deletions(-) diff --git a/install/download_databases-annotate.sh b/install/download_databases-annotate.sh index 7ea4346..7e8d5cc 100644 --- a/install/download_databases-annotate.sh +++ b/install/download_databases-annotate.sh @@ -10,7 +10,7 @@ REALPATH_DATABASE_DIRECTORY=$(realpath $DATABASE_DIRECTORY) SCRIPT_DIRECTORY=$(dirname "$0") MAXIMUM_NUMBER_OF_CPU=$(python -c "from multiprocessing import cpu_count; print(cpu_count())") -N_JOBS=$(2:-${MAXIMUM_NUMBER_OF_CPU}) +N_JOBS=${2:-${MAXIMUM_NUMBER_OF_CPU}} # Database structure echo ". .. ... ..... ........ ............." diff --git a/install/download_databases-classify.sh b/install/download_databases-classify.sh index 7140413..744e048 100644 --- a/install/download_databases-classify.sh +++ b/install/download_databases-classify.sh @@ -10,7 +10,7 @@ REALPATH_DATABASE_DIRECTORY=$(realpath $DATABASE_DIRECTORY) SCRIPT_DIRECTORY=$(dirname "$0") MAXIMUM_NUMBER_OF_CPU=$(python -c "from multiprocessing import cpu_count; print(cpu_count())") -N_JOBS=$(2:-${MAXIMUM_NUMBER_OF_CPU}) +N_JOBS=${2:-${MAXIMUM_NUMBER_OF_CPU}} # Database structure echo ". .. ... ..... ........ ............."
diff --git a/install/download_databases-contamination.sh b/install/download_databases-contamination.sh index fac6c26..2cc5a1e 100644 --- a/install/download_databases-contamination.sh +++ b/install/download_databases-contamination.sh @@ -10,7 +10,7 @@ REALPATH_DATABASE_DIRECTORY=$(realpath $DATABASE_DIRECTORY) SCRIPT_DIRECTORY=$(dirname "$0") MAXIMUM_NUMBER_OF_CPU=$(python -c "from multiprocessing import cpu_count; print(cpu_count())") -N_JOBS=$(2:-${MAXIMUM_NUMBER_OF_CPU}) +N_JOBS=${2:-${MAXIMUM_NUMBER_OF_CPU}} # Database structure echo ". .. ... ..... ........ ............." diff --git a/install/download_databases-markers.sh b/install/download_databases-markers.sh index ceb2c67..1d715d4 100644 --- a/install/download_databases-markers.sh +++ b/install/download_databases-markers.sh @@ -10,7 +10,7 @@ REALPATH_DATABASE_DIRECTORY=$(realpath $DATABASE_DIRECTORY) SCRIPT_DIRECTORY=$(dirname "$0") MAXIMUM_NUMBER_OF_CPU=$(python -c "from multiprocessing import cpu_count; print(cpu_count())") -N_JOBS=$(2:-${MAXIMUM_NUMBER_OF_CPU}) +N_JOBS=${2:-${MAXIMUM_NUMBER_OF_CPU}} # Database structure echo ". .. ... ..... ........ ............."
diff --git a/install/download_databases.sh b/install/download_databases.sh index 4be5df2..1de8f0d 100644 --- a/install/download_databases.sh +++ b/install/download_databases.sh @@ -1,7 +1,7 @@ #!/bin/bash # __version__ = "2024.8.30" # MICROEUKAYROTIC_DATABASE_VERSION = "MicroEuk_v3" -# usage: bash veba/download_databases.sh /path/to/veba_database_destination/ [optional positional argument: /path/to/conda_environments/] +# usage: bash veba/download_databases.sh /path/to/veba_database_destination/ [optional positional argument: /path/to/conda_environments/ number_of_threads] # Version VEBA_DATABASE_VERSION="VDB_v7" @@ -13,7 +13,11 @@ SCRIPT_DIRECTORY=$(dirname "$0") CONDA_ENVS_PATH=${2:-"$(conda info --base)/envs/"} MAXIMUM_NUMBER_OF_CPU=$(python -c "from multiprocessing import cpu_count; print(cpu_count())") -N_JOBS=$(3:-${MAXIMUM_NUMBER_OF_CPU}) +N_JOBS=${3:-${MAXIMUM_NUMBER_OF_CPU}} +echo ". .. ... ..... ........ ............." +echo "Detected ${MAXIMUM_NUMBER_OF_CPU} available threads" +echo "Using ${N_JOBS} threads" +echo ". .. ... ..... ........ ............." # Database structure echo ". .. ... ..... ........ ............." From 559e26309b272ac8bc540a94c5d4e803dd30cf30 Mon Sep 17 00:00:00 2001 From: "Josh L. 
Espinoza" Date: Fri, 30 Aug 2024 23:12:51 -0700 Subject: [PATCH 3/4] Update download_databases-classify.sh --- install/download_databases-classify.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/install/download_databases-classify.sh b/install/download_databases-classify.sh index 744e048..b234167 100644 --- a/install/download_databases-classify.sh +++ b/install/download_databases-classify.sh @@ -129,19 +129,19 @@ cp -rf ${DATABASE_DIRECTORY}/MicroEuk_v3/source_taxonomy.tsv.gz ${DATABASE_DIREC # MicroEuk100 gzip -d ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk100.faa.gz -mmseqs createdb --threads ${N_JOBS} --compressed 1 ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk100.faa ${DATABASE_DIRECTORY}/Classify/MicroEuk/MicroEuk100 +mmseqs createdb --compressed 1 ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk100.faa ${DATABASE_DIRECTORY}/Classify/MicroEuk/MicroEuk100 # MicroEuk100.eukaryota_odb10 gzip -d ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk100.eukaryota_odb10.list.gz -seqkit grep -f ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk100.eukaryota_odb10.list ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk100.faa | mmseqs createdb --threads ${N_JOBS} --compressed 1 stdin ${DATABASE_DIRECTORY}/Classify/MicroEuk/MicroEuk100.eukaryota_odb10 +seqkit grep -f ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk100.eukaryota_odb10.list ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk100.faa | mmseqs createdb --compressed 1 stdin ${DATABASE_DIRECTORY}/Classify/MicroEuk/MicroEuk100.eukaryota_odb10 # MicroEuk90 gzip -d -c ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk90_clusters.tsv.gz | cut -f1 | sort -u > ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk90.list -seqkit grep -f ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk90.list ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk100.faa | mmseqs createdb --threads ${N_JOBS} --compressed 1 stdin ${DATABASE_DIRECTORY}/Classify/MicroEuk/MicroEuk90 +seqkit grep -f ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk90.list ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk100.faa | 
mmseqs createdb --compressed 1 stdin ${DATABASE_DIRECTORY}/Classify/MicroEuk/MicroEuk90 # MicroEuk50 gzip -d -c ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk50_clusters.tsv.gz | cut -f1 | sort -u > ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk50.list -seqkit grep -f ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk50.list ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk100.faa | mmseqs createdb --threads ${N_JOBS} --compressed 1 stdin ${DATABASE_DIRECTORY}/Classify/MicroEuk/MicroEuk50 +seqkit grep -f ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk50.list ${DATABASE_DIRECTORY}/MicroEuk_v3/MicroEuk100.faa | mmseqs createdb --compressed 1 stdin ${DATABASE_DIRECTORY}/Classify/MicroEuk/MicroEuk50 # source_to_lineage.dict.pkl.gz build_source_to_lineage_dictionary.py -i ${DATABASE_DIRECTORY}/MicroEuk_v3/source_taxonomy.tsv.gz -o ${DATABASE_DIRECTORY}/Classify/MicroEuk/source_to_lineage.dict.pkl.gz From a46659c2e452bd8174230139b01ef62932cb2069 Mon Sep 17 00:00:00 2001 From: "Josh L. Espinoza" Date: Sat, 21 Sep 2024 23:39:33 -0700 Subject: [PATCH 4/4] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e4ca980..0ab5b2e 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ ___________________________________________________________________ ### Announcements -* **Current Stable Version:** [`v2.2.1`](https://github.com/jolespin/veba/releases/tag/v2.2.0) +* **Current Stable Version:** [`v2.2.1`](https://github.com/jolespin/veba/releases/tag/v2.2.1) * **Current Database Version:** [`VDB_v7`](install/DATABASE.md)