From 4ce5e60cf23fe3cffe18f15bb0463f50ba64e6cf Mon Sep 17 00:00:00 2001
From: deliaBlue <103108590+deliaBlue@users.noreply.github.com>
Date: Thu, 16 Nov 2023 13:15:24 +0100
Subject: [PATCH] test: standardized usage (#118)

* docs: update main README with new test paths

* ci: update tests

* refactor: add  .upper() to adapter retrieve

* docs: update main README

* refactor: add convert_lib_format function

* style: apply snakemake formatting

* ci: swap conda and singularity tests
---
 .github/workflows/tests.yml     |  51 +++++++++----
 .snakemake-workflow-catalog.yml |   5 ++
 README.md                       |  32 ++++----
 config/README.md                | 125 ++++++++++++++++++++++++++++++++
 config/config_schema.json       |   2 +-
 test/test_snakefmt.sh           |  20 +++++
 test/test_snakemake_lint.sh     |  26 +++++++
 workflow/rules/common.smk       |  13 ++++
 workflow/rules/map.smk          |   9 ++-
 9 files changed, 249 insertions(+), 34 deletions(-)
 create mode 100644 .snakemake-workflow-catalog.yml
 create mode 100644 config/README.md
 create mode 100755 test/test_snakefmt.sh
 create mode 100755 test/test_snakemake_lint.sh

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 6713d4bf..48137d3b 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -48,17 +48,8 @@ jobs:
         working-directory: ./scripts
         run: pylint --rcfile=../pylint.cfg ./*.py
 
-      - name: Check workflow descriptor files for lints
-        working-directory: ./test
-        run: |
-          snakemake --snakefile="../workflow/Snakefile" --configfile="config.yaml" --lint
-          snakemake --snakefile="../workflow/rules/common.smk" --configfile="config.yaml" --lint
-          snakemake --snakefile="../workflow/rules/prepare.smk" --configfile="config.yaml" --lint
-          snakemake --snakefile="../workflow/rules/map.smk" --configfile="config.yaml" --lint
-          snakemake --snakefile="../workflow/rules/quantify.smk" --configfile="config.yaml" --lint
-        
-
-  snakemake-test:
+
+  snakemake-format-graph-test:
     runs-on: ubuntu-latest
     defaults:
       run:
@@ -77,17 +68,51 @@ jobs:
           environment-file: environment.yml
           auto-activate-base: false
       
-      - name: update mirflowz env with root packages
-        run: mamba env update -n mirflowz -f environment.root.yml
+      - name: update mirflowz env with dev packages
+        run: mamba env update -n mirflowz -f environment.dev.yml
       
       - name: display environment info
         run: |
           conda info -a
           conda list   
 
+      - name: run test for snakemake format
+        run: bash test/test_snakefmt.sh
+
+      - name: run test for snakemake lint
+        run: bash test/test_snakemake_lint.sh
+
       - name: run test for rule graph
         run: bash test/test_rule_graph.sh
+
+
+  snakemake-integration-test:
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        shell: bash -l {0}
+  
+    steps:
+    
+      - name: check out repository
+        uses: actions/checkout@v4
+         
+      - name: setup Conda/Mamba
+        uses: conda-incubator/setup-miniconda@v2
+        with:
+          mamba-version: "*"
+          activate-environment: mirflowz
+          environment-file: environment.yml
+          auto-activate-base: false
       
+      - name: update mirflowz env with root packages
+        run: mamba env update -n mirflowz -f environment.root.yml
+      
+      - name: display environment info
+        run: |
+          conda info -a
+          conda list   
+
       - name: run local test with Singularity
         run: bash test/test_workflow_local_with_singularity.sh
 
diff --git a/.snakemake-workflow-catalog.yml b/.snakemake-workflow-catalog.yml
new file mode 100644
index 00000000..4ae567aa
--- /dev/null
+++ b/.snakemake-workflow-catalog.yml
@@ -0,0 +1,5 @@
+usage:  # usage metadata for the Snakemake workflow catalog listing
+  software-stack-deployment:
+    conda: true  # declares that the workflow can be run with --use-conda
+    singularity: true  # declares that it can be run with --use-singularity
+  report: true  # declares that `snakemake --report` is supported
diff --git a/README.md b/README.md
index b0282e8c..eef5a214 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ _MIRFLOWZ_ is a [Snakemake][snakemake] workflow for mapping miRNAs and isomiRs.
 ## Installation
 
 The workflow lives inside this repository and will be available for you to run
-after following the installation instructions layed out in this section.
+after following the installation instructions laid out in this section.
 
 ### Cloning the repository
 
@@ -58,8 +58,8 @@ conda env create -f environment.yml
 conda activate mirflowz
 ```
 
-If you plan to run _MIRFLOWZ_ via Conda, we recommend to use the following
-command for a faster environment creation specially if it you will run it on a
+If you plan to run _MIRFLOWZ_ via Conda, we recommend using the following
+command for a faster environment creation, especially if you will run it on an
 HPC cluster.
 
 ```bash
@@ -157,7 +157,7 @@ tested, you can go ahead and run the workflow on your samples.
 
 It is suggested to have all the input files for a given run (or hard links 
 pointing to them) inside a dedicated directory, for instance under the 
-_MIRFLOWZ_ root directory. This way it is easier to keep the data together, 
+_MIRFLOWZ_ root directory. This way, it is easier to keep the data together,
 reproduce an analysis and set up Singularity access to them.  
 
 #### 1. Prepare a sample table
@@ -170,13 +170,12 @@ touch path/to/your/sample/table.tsv
 ```
 > Fill the sample table according to the following requirements:  
 >
-> - `sample`. This column contains the library name.  
-> - `sample_file`. In this column, you must provide the path to the library file.
-> The path must be relative to the working directory.  
-> - `adapter`.  This field must contain the adapter sequence in capital letters.  
-> - `format`. In this field you must state the library format. It can either be 
-> `fa` if providing a FASTA file or `fastq` if the library is a FASTQ file.  
-> 
+> - `sample`. Arbitrary name for the miRNA sequencing library.
+> - `sample_file`. Path to the miRNA sequencing library file. The path must be
+> relative to the directory where the workflow will be run.
+> - `adapter`. Sequence of the 3'-end adapter used during library preparation.
+> - `format`. One of `fa`/`fasta` or `fq`/`fastq`, if the library file is in
+> FASTA or FASTQ format, respectively.
 
 #### 2. Prepare genome resources
 
@@ -190,15 +189,16 @@ There are 4 files you must provide:
 
 > _MIRFLOWZ_ expects both the reference sequence and gene annotation files to
 > follow [Ensembl][ensembl] style/formatting. If you obtained these files from
-> a source other than Ensembl, you may first need to convert them to the
-> expected style to avoid issues!
+> a source other than Ensembl, you must ensure that they adhere to the
+> expected format by converting them, if necessary.
 
 3. An **uncompressed GFF3** file with **microRNA annotations** for the reference
    sequences above.
 
 > _MIRFLOWZ_ expects the miRNA annotations to follow [miRBase][mirbase]
 > style/formatting. If you obtained this file from a source other than miRBase,
-> you may first need to convert it to the expected style to avoid issues!
+> you must ensure that it adheres to the expected format by converting it, if
+> necessary.
 
 4. An **uncompressed tab-separated file** with a **mapping between the
    reference names** used in the miRNA annotation file (column 1; "UCSC style")
@@ -223,7 +223,7 @@ cp  config/config_template.yaml  path/to/config.yaml
 
 Open the new copy in your editor of choice and adjust the configuration
 parameters to your liking. The template explains what each of the
-parameters means and how you can meaningfully adjust them. 
+parameters means and how you can meaningfully adjust them.
 
 ### Running the workflow
 
@@ -243,7 +243,7 @@ snakemake \
 ```
 
 > **NOTE:** Depending on your working directory, you do not need to use the 
-> parameters  `--snakefile` and `--configfile`. For instance, if the `Snakefile`
+> parameters `--snakefile` and `--configfile`. For instance, if the `Snakefile`
 > is in the same directory or the `workflow/` directory is beneath the current
 > working directory, there's no need for the `--snakefile` directory. Refer to 
 > the [Snakemake documentation][snakemakeDocu] for more information.
diff --git a/config/README.md b/config/README.md
new file mode 100644
index 00000000..a52e86ed
--- /dev/null
+++ b/config/README.md
@@ -0,0 +1,125 @@
+# Dependencies installation
+
+Create and activate the virtual environment with the required dependencies
+with Conda:
+
+```bash
+conda env create -f environment.yml
+conda activate mirflowz
+```
+
+If you plan to run _MIRFLOWZ_ via Conda, we recommend using the following
+command for a faster environment creation, especially if you will run it on an
+HPC cluster.
+
+```bash
+conda config --set channel_priority strict
+```
+
+For a faster creation of the environment (and Conda environments in general),
+you can also install [Mamba][mamba] on top of Conda. In that case, replace
+`conda` with `mamba` in the commands above (particularly in 
+`conda env create`).
+
+## Running _MIRFLOWZ_ with Singularity
+
+If you want to run _MIRFLOWZ_ via Singularity and do not already
+have it installed globally on your system, you must further update the Conda
+environment with:
+
+```bash
+conda env update -f environment.root.yml
+```
+
+> Mind that you must have the environment activated and root permissions on
+> your system to install Singularity. If you want to run _MIRFLOWZ_ on an HPC
+> cluster (recommended in almost all cases), ask your system administrator
+> about Singularity.
+
+# Run the workflow on your own samples
+
+In order to run _MIRFLOWZ_ on your own samples, we recommend having all the
+input files inside a dedicated directory. This way, it is easier to keep the
+data together and reproduce an analysis. Assuming that your current directory
+is the repository's root directory, create a directory to store all your data
+and traverse to it with:
+
+```bash
+mkdir path/to/your_run
+cd path/to/your_run
+```
+
+## 1. Prepare the sample table
+
+Create an empty sample table. Refer to the
+[samples_table.tsv](../test/test_files/samples_table.tsv) test file to see what the
+table must look like or use it as a template.
+
+```bash
+touch samples.tsv
+```
+
+> Fill the sample table according to the following requirements:  
+>
+> - `sample`. Arbitrary name for the miRNA sequencing library.
+> - `sample_file`. Path to the miRNA sequencing library file. The path must be
+> relative to the directory where the workflow will be run.
+> - `adapter`. Sequence of the 3'-end adapter used during library preparation.
+> - `format`. One of `fa`/`fasta` or `fq`/`fastq`, if the library file is in
+> FASTA or FASTQ format, respectively.
+
+## 2. Prepare the genome resources
+
+There are 4 files you must provide: 
+
+1. A **`gzip`ped FASTA** file containing **reference sequences**, typically the
+   genome of the source/organism from which the library was extracted.
+
+2. A **`gzip`ped GTF** file with matching **gene annotations** for the
+   reference sequences above.
+
+> _MIRFLOWZ_ expects both the reference sequence and gene annotation files to
+> follow [Ensembl][ensembl] style/formatting. If you obtained these files from
+> a source other than Ensembl, you must ensure that they adhere to the
+> expected format by converting them, if necessary.
+
+3. An **uncompressed GFF3** file with **microRNA annotations** for the reference
+   sequences above.
+
+> _MIRFLOWZ_ expects the miRNA annotations to follow [miRBase][mirbase]
+> style/formatting. If you obtained this file from a source other than miRBase,
+> you must ensure that it adheres to the expected format by converting it, if
+> necessary.
+
+
+4. An **uncompressed tab-separated file** with a **mapping between the
+   reference names** used in the miRNA annotation file (column 1; "UCSC style")
+   and in the gene annotations and reference sequence files (column 2; "Ensembl
+   style"). Values in column 1 are expected to be unique, no header is
+   expected, and any additional columns will be ignored. [This
+   resource][chrMap] provides such files for various organisms, and in the
+   expected format.
+
+> General note: If you want to process the genome resources before use (e.g.,
+> filtering), you can do that, but make sure the formats of any modified
+> resource files meet the formatting expectations outlined above!
+
+## 3. Prepare the configuration file
+
+We recommend creating a copy of the
+[configuration file template](config_template.yaml).
+
+```bash
+cp ../config/config_template.yaml config.yaml
+
+```
+
+Open the new copy in your editor of choice and adjust the configuration
+parameters to your liking. The template explains what each of the parameters
+means and how you can meaningfully adjust them.
+
+
+[chrMap]: <https://github.com/dpryan79/ChromosomeMappings>
+[ensembl]: <https://ensembl.org/>
+[mamba]: <https://github.com/mamba-org/mamba>
+[mirbase]: <https://mirbase.org/>
diff --git a/config/config_schema.json b/config/config_schema.json
index 0131db1f..2d56367e 100644
--- a/config/config_schema.json
+++ b/config/config_schema.json
@@ -99,4 +99,4 @@
             "default": ["isomir", "mirna", "pri-mir"]
         }
     }
-}
\ No newline at end of file
+}
diff --git a/test/test_snakefmt.sh b/test/test_snakefmt.sh
new file mode 100755
index 00000000..f6128e6d
--- /dev/null
+++ b/test/test_snakefmt.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+# Check (without rewriting) that files under ../workflow pass snakefmt at 80 columns.
+# Tear down test environment: go back to the caller's directory, report status
+cleanup () {
+    rc=$?
+    cd "$user_dir"  # quoted so paths containing spaces are not word-split
+    echo "Exit status: $rc"
+}
+trap cleanup EXIT
+
+# Set up test environment
+set -eo pipefail  # ensures that script exits at first command that exits with non-zero status
+set -u  # ensures that script exits when unset variables are used
+set -x  # facilitates debugging by printing out executed commands
+user_dir=$PWD
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
+cd "$script_dir"  # relative paths below are resolved from the test directory
+
+# Run tests
+snakefmt --check -l 80 ../workflow
diff --git a/test/test_snakemake_lint.sh b/test/test_snakemake_lint.sh
new file mode 100755
index 00000000..2e986c6e
--- /dev/null
+++ b/test/test_snakemake_lint.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Lint the workflow's main Snakefile against the test configuration.
+# This script is currently exiting with non-zero status.
+# This is expected behaviour though, as several parameters can't be inferred from the test files.
+
+# Tear down test environment: go back to the caller's directory, report status
+cleanup () {
+    rc=$?
+    cd "$user_dir"  # quoted so paths containing spaces are not word-split
+    echo "Exit status: $rc"
+}
+trap cleanup EXIT
+
+# Set up test environment
+set -eo pipefail  # ensures that script exits at first command that exits with non-zero status
+set -u  # ensures that script exits when unset variables are used
+set -x  # facilitates debugging by printing out executed commands
+user_dir=$PWD
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
+cd "$script_dir"  # relative paths below are resolved from the test directory
+
+# Run tests
+snakemake \
+    --snakefile="../workflow/Snakefile" \
+    --configfile="config.yaml" \
+    --lint
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index c14b3d99..4bf25bc7 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -11,3 +11,16 @@ def get_sample(column_id: str, sample_id: int = None) -> str:
         )
     else:
         return str(samples_table[column_id].iloc[0])
+
+
+def convert_lib_format(lib_format: str) -> str:
+    """Map a library format alias to the canonical "fa" or "fastq".
+
+    Matching ignores case and surrounding whitespace; an unsupported
+    value raises a KeyError that names the offending input.
+    """
+    formats = {"fa": "fa", "fasta": "fa", "fq": "fastq", "fastq": "fastq"}
+    key = lib_format.strip().lower()
+    if key not in formats:
+        raise KeyError(f"unsupported library format: {lib_format!r}")
+    return formats[key]
diff --git a/workflow/rules/map.smk b/workflow/rules/map.smk
index bc800e70..672b9c56 100644
--- a/workflow/rules/map.smk
+++ b/workflow/rules/map.smk
@@ -86,7 +86,7 @@ rule start:
             pd.Series(
                 samples_table.loc[wildcards.sample, "sample_file"]
             ).values,
-            format=get_sample("format"),
+            format=convert_lib_format(get_sample("format")),
         ),
     output:
         reads=OUT_DIR / "{sample}" / "{format}" / "reads.{format}",
@@ -162,7 +162,7 @@ rule format_fasta:
     input:
         reads=lambda wildcards: OUT_DIR
         / wildcards.sample
-        / get_sample("format", wildcards.sample)
+        / convert_lib_format(get_sample("format", wildcards.sample))
         / "reads.fa",
     output:
         reads=OUT_DIR / "{sample}" / "reads_formatted.fasta",
@@ -189,7 +189,9 @@ rule remove_adapters:
     output:
         reads=OUT_DIR / "{sample}" / "reads_trimmed_adapters.fasta",
     params:
-        adapter=lambda wildcards: get_sample("adapter", wildcards.sample),
+        adapter=lambda wildcards: get_sample(
+            "adapter", wildcards.sample
+        ).upper(),
         error_rate=config["error_rate"],
         minimum_length=config["minimum_length"],
         overlap=config["overlap"],
@@ -517,7 +519,6 @@ rule convert_transcriptome_to_sam_oligomap:
         > {output.tmap}) &> {log}"
 
 
-
 ###############################################################################
 ### Merge genome mappings
 ###############################################################################