From 4ce5e60cf23fe3cffe18f15bb0463f50ba64e6cf Mon Sep 17 00:00:00 2001 From: deliaBlue <103108590+deliaBlue@users.noreply.github.com> Date: Thu, 16 Nov 2023 13:15:24 +0100 Subject: [PATCH] test: standardized usage (#118) * docs: update main README with new test paths * ci: update tests * refactor: add .upper() to adapter retrieve * docs: update main README * refactor: add convert_lib_format function * style: apply snakemake formatting * ci: swap conda and singularity tests --- .github/workflows/tests.yml | 51 +++++++++---- .snakemake-workflow-catalog.yml | 5 ++ README.md | 32 ++++---- config/README.md | 125 ++++++++++++++++++++++++++++++++ config/config_schema.json | 2 +- test/test_snakefmt.sh | 20 +++++ test/test_snakemake_lint.sh | 26 +++++++ workflow/rules/common.smk | 13 ++++ workflow/rules/map.smk | 9 ++- 9 files changed, 249 insertions(+), 34 deletions(-) create mode 100644 .snakemake-workflow-catalog.yml create mode 100644 config/README.md create mode 100755 test/test_snakefmt.sh create mode 100755 test/test_snakemake_lint.sh diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 6713d4bf..48137d3b 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -48,17 +48,8 @@ jobs: working-directory: ./scripts run: pylint --rcfile=../pylint.cfg ./*.py - - name: Check workflow descriptor files for lints - working-directory: ./test - run: | - snakemake --snakefile="../workflow/Snakefile" --configfile="config.yaml" --lint - snakemake --snakefile="../workflow/rules/common.smk" --configfile="config.yaml" --lint - snakemake --snakefile="../workflow/rules/prepare.smk" --configfile="config.yaml" --lint - snakemake --snakefile="../workflow/rules/map.smk" --configfile="config.yaml" --lint - snakemake --snakefile="../workflow/rules/quantify.smk" --configfile="config.yaml" --lint - - - snakemake-test: + + snakemake-format-graph-test: runs-on: ubuntu-latest defaults: run: @@ -77,17 +68,51 @@ jobs: environment-file: environment.yml 
auto-activate-base: false - - name: update mirflowz env with root packages - run: mamba env update -n mirflowz -f environment.root.yml + - name: update mirflowz env with dev packages + run: mamba env update -n mirflowz -f environment.dev.yml - name: display environment info run: | conda info -a conda list + - name: run test for snakemake format + run: bash test/test_snakefmt.sh + + - name: run test for snakemake lint + run: bash test/test_snakemake_lint.sh + - name: run test for rule graph run: bash test/test_rule_graph.sh + + + snakemake-integration-test: + runs-on: ubuntu-latest + defaults: + run: + shell: bash -l {0} + + steps: + + - name: check out repository + uses: actions/checkout@v4 + + - name: setup Conda/Mamba + uses: conda-incubator/setup-miniconda@v2 + with: + mamba-version: "*" + activate-environment: mirflowz + environment-file: environment.yml + auto-activate-base: false + - name: update mirflowz env with root packages + run: mamba env update -n mirflowz -f environment.root.yml + + - name: display environment info + run: | + conda info -a + conda list + - name: run local test with Singularity run: bash test/test_workflow_local_with_singularity.sh diff --git a/.snakemake-workflow-catalog.yml b/.snakemake-workflow-catalog.yml new file mode 100644 index 00000000..4ae567aa --- /dev/null +++ b/.snakemake-workflow-catalog.yml @@ -0,0 +1,5 @@ +usage: + software-stack-deployment: + conda: true + singularity: true + report: true diff --git a/README.md b/README.md index b0282e8c..eef5a214 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ _MIRFLOWZ_ is a [Snakemake][snakemake] workflow for mapping miRNAs and isomiRs. ## Installation The workflow lives inside this repository and will be available for you to run -after following the installation instructions layed out in this section. +after following the installation instructions laid out in this section.
### Cloning the repository @@ -58,8 +58,8 @@ conda env create -f environment.yml conda activate mirflowz ``` -If you plan to run _MIRFLOWZ_ via Conda, we recommend to use the following -command for a faster environment creation specially if it you will run it on a +If you plan to run _MIRFLOWZ_ via Conda, we recommend using the following +command for a faster environment creation, especially if you will run it on an HPC cluster. ```bash @@ -157,7 +157,7 @@ tested, you can go ahead and run the workflow on your samples. It is suggested to have all the input files for a given run (or hard links pointing to them) inside a dedicated directory, for instance under the -_MIRFLOWZ_ root directory. This way it is easier to keep the data together, +_MIRFLOWZ_ root directory. This way, it is easier to keep the data together, reproduce an analysis and set up Singularity access to them. #### 1. Prepare a sample table @@ -170,13 +170,12 @@ touch path/to/your/sample/table.tsv ``` > Fill the sample table according to the following requirements: > -> - `sample`. This column contains the library name. -> - `sample_file`. In this column, you must provide the path to the library file. -> The path must be relative to the working directory. -> - `adapter`. This field must contain the adapter sequence in capital letters. -> - `format`. In this field you must state the library format. It can either be -> `fa` if providing a FASTA file or `fastq` if the library is a FASTQ file. -> +> - `sample`. Arbitrary name for the miRNA sequencing library. +> - `sample_file`. Path to the miRNA sequencing library file. The path must be +> relative to the directory where the workflow will be run. +> - `adapter`. Sequence of the 3'-end adapter used during library preparation. +> - `format`. One of `fa`/`fasta` or `fq`/`fastq`, if the library file is in +> FASTA or FASTQ format, respectively. #### 2.
Prepare genome resources @@ -190,15 +189,16 @@ There are 4 files you must provide: > _MIRFLOWZ_ expects both the reference sequence and gene annotation files to > follow [Ensembl][ensembl] style/formatting. If you obtained these files from -> a source other than Ensembl, you may first need to convert them to the -> expected style to avoid issues! +> a source other than Ensembl, you must ensure that they adhere to the +> expected format by converting them, if necessary. 3. An **uncompressed GFF3** file with **microRNA annotations** for the reference sequences above. > _MIRFLOWZ_ expects the miRNA annotations to follow [miRBase][mirbase] > style/formatting. If you obtained this file from a source other than miRBase, -> you may first need to convert it to the expected style to avoid issues! +> you must ensure that it adheres to the expected format by converting it, if +> necessary. 4. An **uncompressed tab-separated file** with a **mapping between the reference names** used in the miRNA annotation file (column 1; "UCSC style") @@ -223,7 +223,7 @@ cp config/config_template.yaml path/to/config.yaml Open the new copy in your editor of choice and adjust the configuration parameters to your liking. The template explains what each of the -parameters means and how you can meaningfully adjust them. +parameters mean and how you can meaningfully adjust them. ### Running the workflow @@ -243,7 +243,7 @@ snakemake \ ``` > **NOTE:** Depending on your working directory, you do not need to use the -> parameters `--snakefile` and `--configfile`. For instance, if the `Snakefile` +> parameters `--snakefile` and `--configfile`. For instance, if the `Snakefile` > is in the same directory or the `workflow/` directory is beneath the current > working directory, there's no need for the `--snakefile` directory. Refer to > the [Snakemake documentation][snakemakeDocu] for more information. 
diff --git a/config/README.md b/config/README.md new file mode 100644 index 00000000..a52e86ed --- /dev/null +++ b/config/README.md @@ -0,0 +1,125 @@ +# Dependencies installation + +Create and activate the virtual environment with the required dependencies +with Conda: + +```bash +conda env create -f environment.yml +conda activate mirflowz +``` + +If you plan to run _MIRFLOWZ_ via Conda, we recommend using the following +command for a faster environment creation, especially if you will run it on an +HPC cluster. + +```bash +conda config --set channel_priority strict +``` + +For a faster creation of the environment (and Conda environments in general), +you can also install [Mamba][mamba] on top of Conda. In that case, replace +`conda` with `mamba` in the commands above (particularly in +`conda env create`). + +## Running _MIRFLOWZ_ with Singularity + +If you want to run _MIRFLOWZ_ via Singularity and do not already +have it installed globally on your system, you must further update the Conda +environment with: + +```bash +conda env update -f environment.root.yml +``` + +> Mind that you must have the environment activated and root permissions on +> your system to install Singularity. If you want to run _MIRFLOWZ_ on an HPC +> cluster (recommended in almost all cases), ask your system administrator +> about Singularity. + +# Run the workflow on your own samples + +In order to run _MIRFLOWZ_ on your own samples, we recommend having all the +input files inside a dedicated directory. This way, it is easier to keep the +data together and reproduce an analysis. Assuming that your current directory +is the repository's root directory, create a directory to store all your data +and traverse to it with: + +```bash +mkdir path/to/your_run +cd path/to/your_run +``` + +## 1. Prepare the sample table + +Create an empty sample table. Refer to the +[sample.tsv](../test/test_files/samples_table.tsv) test file to see what the +table must look like or use it as a template.
+ +```bash +touch samples.tsv +``` + +> Fill the sample table according to the following requirements: +> +> - `sample`. Arbitrary name for the miRNA sequencing library. +> - `sample_file`. Path to the miRNA sequencing library file. The path must be +> relative to the directory where the workflow will be run. +> - `adapter`. Sequence of the 3'-end adapter used during library preparation. +> - `format`. One of `fa`/`fasta` or `fq`/`fastq`, if the library file is in +> FASTA or FASTQ format, respectively. + +## 2. Prepare the genome resources + +There are 4 files you must provide: + +1. A **`gzip`ped FASTA** file containing **reference sequences**, typically the + genome of the source/organism from which the library was extracted. + +2. A **`gzip`ped GTF** file with matching **gene annotations** for the + reference sequences above. + +> _MIRFLOWZ_ expects both the reference sequence and gene annotation files to +> follow [Ensembl][ensembl] style/formatting. If you obtained these files from +> a source other than Ensembl, you must ensure that they adhere to the +> expected format by converting them, if necessary. + +3. An **uncompressed GFF3** file with **microRNA annotations** for the reference + sequences above. + +> _MIRFLOWZ_ expects the miRNA annotations to follow [miRBase][mirbase] +> style/formatting. If you obtained this file from a source other than miRBase, +> you must ensure that it adheres to the expected format by converting it, if +> necessary. + + +4. An **uncompressed tab-separated file** with a **mapping between the + reference names** used in the miRNA annotation file (column 1; "UCSC style") + and in the gene annotations and reference sequence files (column 2; "Ensembl + style"). Values in column 1 are expected to be unique, no header is + expected, and any additional columns will be ignored. [This + resource][chrMap] provides such files for various organisms, and in the + expected format. 
+ +> General note: If you want to process the genome resources before use (e.g., +> filtering), you can do that, but make sure the formats of any modified +> resource files meet the formatting expectations outlined above! + +## 3. Prepare the configuration file + +We recommend creating a copy of the +[configuration file template](config_template.yaml). + +```bash +cp ../config/config_template.yaml config.yaml + +``` + +Open the new copy in your editor of choice and adjust the configuration +parameters to your liking. The template explains what each of the parameters +means and how you can meaningfully adjust them. + + +[chrMap]: +[ensembl]: +[mamba]: +[mirbase]: diff --git a/config/config_schema.json b/config/config_schema.json index 0131db1f..2d56367e 100644 --- a/config/config_schema.json +++ b/config/config_schema.json @@ -99,4 +99,4 @@ "default": ["isomir", "mirna", "pri-mir"] } } -} \ No newline at end of file +} diff --git a/test/test_snakefmt.sh b/test/test_snakefmt.sh new file mode 100755 index 00000000..f6128e6d --- /dev/null +++ b/test/test_snakefmt.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# Tear down test environment +cleanup () { + rc=$? + cd $user_dir + echo "Exit status: $rc" +} +trap cleanup EXIT + +# Set up test environment +set -eo pipefail # ensures that script exits at first command that exits with non-zero status +set -u # ensures that script exits when unset variables are used +set -x # facilitates debugging by printing out executed commands +user_dir=$PWD +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" +cd $script_dir + +# Run tests +snakefmt --check -l 80 ../workflow diff --git a/test/test_snakemake_lint.sh b/test/test_snakemake_lint.sh new file mode 100644 index 00000000..2e986c6e --- /dev/null +++ b/test/test_snakemake_lint.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +# This script is currently exiting with non-zero status. +# This is expected behaviour though, as several parameters can't be inferred from the test files.
+ +# Tear down test environment +cleanup () { + rc=$? + cd $user_dir + echo "Exit status: $rc" +} +trap cleanup EXIT + +# Set up test environment +set -eo pipefail # ensures that script exits at first command that exits with non-zero status +set -u # ensures that script exits when unset variables are used +set -x # facilitates debugging by printing out executed commands +user_dir=$PWD +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" +cd $script_dir + +# Run tests +snakemake \ + --snakefile="../workflow/Snakefile" \ + --configfile="config.yaml" \ + --lint diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index c14b3d99..4bf25bc7 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -11,3 +11,16 @@ def get_sample(column_id: str, sample_id: int = None) -> str: ) else: return str(samples_table[column_id].iloc[0]) + + +def convert_lib_format(lib_format: str) -> str: + """Convert library file format.""" + formats = { + "fa": "fa", + "fasta": "fa", + "FASTA": "fa", + "fq": "fastq", + "fastq": "fastq", + "FASTQ": "fastq", + } + return formats[lib_format] diff --git a/workflow/rules/map.smk b/workflow/rules/map.smk index bc800e70..672b9c56 100644 --- a/workflow/rules/map.smk +++ b/workflow/rules/map.smk @@ -86,7 +86,7 @@ rule start: pd.Series( samples_table.loc[wildcards.sample, "sample_file"] ).values, - format=get_sample("format"), + format=convert_lib_format(get_sample("format")), ), output: reads=OUT_DIR / "{sample}" / "{format}" / "reads.{format}", @@ -162,7 +162,7 @@ rule format_fasta: input: reads=lambda wildcards: OUT_DIR / wildcards.sample - / get_sample("format", wildcards.sample) + / convert_lib_format(get_sample("format", wildcards.sample)) / "reads.fa", output: reads=OUT_DIR / "{sample}" / "reads_formatted.fasta", @@ -189,7 +189,9 @@ rule remove_adapters: output: reads=OUT_DIR / "{sample}" / "reads_trimmed_adapters.fasta", params: - adapter=lambda wildcards: get_sample("adapter", wildcards.sample), 
+ adapter=lambda wildcards: get_sample( + "adapter", wildcards.sample + ).upper(), error_rate=config["error_rate"], minimum_length=config["minimum_length"], overlap=config["overlap"], @@ -517,7 +519,6 @@ rule convert_transcriptome_to_sam_oligomap: > {output.tmap}) &> {log}" - ############################################################################### ### Merge genome mappings ###############################################################################