Added a large dataset for test_full #91

Merged (2 commits) on Nov 12, 2024
6 changes: 5 additions & 1 deletion CHANGELOG.md
@@ -3,14 +3,18 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## v0.5.0dev - [07-Nov-2024]
## v0.5.0dev - [11-Nov-2024]

### `Added`

1. Added MultiQC [#65](https://github.com/plant-food-research-open/genepal/issues/65)
2. Updated nf-core template to 3.0.2 [#66](https://github.com/PlantandFoodResearch/genepal/issues/66)
3. Integrated nf-test into pipeline CI [#68](https://github.com/PlantandFoodResearch/genepal/issues/68)
4. Updated the flowchart [#87](https://github.com/PlantandFoodResearch/genepal/issues/87)
5. Added a large test dataset for the `test_full` profile [#90](https://github.com/PlantandFoodResearch/genepal/issues/90)
6. Now `.gff.gz` and `.gff3.gz` inputs are also allowed for the `benchmark` column in `--input`
7. Now removing liftoff genes with any intron shorter than 10 bp [#89](https://github.com/Plant-Food-Research-Open/genepal/issues/89)
8. Now also removing `rRNA` and `tRNA` after liftoff as the downstream logic in the pipeline cannot correctly handle these

### `Fixed`

8 changes: 4 additions & 4 deletions README.md
@@ -29,16 +29,16 @@
- With protein evidence alone, [BRAKER workflow C](https://github.com/Gaius-Augustus/BRAKER/tree/f58479fe5bb13a9e51c3ca09cb9e137cab3b8471?tab=readme-ov-file#overview-of-modes-for-running-braker) is executed
- With protein plus RNASeq evidence, [BRAKER workflow D](https://github.com/Gaius-Augustus/BRAKER/tree/f58479fe5bb13a9e51c3ca09cb9e137cab3b8471?tab=readme-ov-file#overview-of-modes-for-running-braker) is executed
- [Liftoff](https://github.com/agshumate/Liftoff): Optionally, liftoff annotations from reference genome FASTA/GFF
- [TSEBRA](https://github.com/Gaius-Augustus/TSEBRA)
- Ensure that each BRAKER model has [full intron support](./docs/usage.md#iso-forms-and-full-intron-support)
- Optionally, ensure that each Liftoff model has full intron support
- [TSEBRA](https://github.com/Gaius-Augustus/TSEBRA): Optionally, ensure that each BRAKER or both BRAKER and Liftoff models have [full intron support](./docs/usage.md#iso-forms-and-full-intron-support)
- [AGAT](https://github.com/NBISweden/AGAT)
- Merge multi-reference liftoffs
- Remove liftoff transcripts marked by _valid_ORF=False_
- Remove liftoff genes with any intron shorter than 10 bp
- Remove rRNA and tRNA from liftoff
- Optionally, allow or remove iso-forms
- Remove BRAKER models from Liftoff loci
- Optionally, remove models without any EggNOG-mapper hits
- Merge Liftoff and BRAKER models
- Optionally, remove models without any EggNOG-mapper hits
- [EggNOG-mapper](https://github.com/eggnogdb/eggnog-mapper): Add functional annotation to gff
- [GenomeTools](https://github.com/genometools/genometools): GFF format validation
- [GffRead](https://github.com/gpertea/gffread): Extraction of protein sequences
4 changes: 2 additions & 2 deletions assets/schema_input.json
@@ -64,14 +64,14 @@
"anyOf": [
{
"type": "string",
"pattern": "^\\S+\\.gff(3)?$"
"pattern": "^\\S+\\.gff(3)?(\\.gz)?$"
},
{
"type": "string",
"maxLength": 0
}
],
"errorMessage": "GFF/GFF3 file for benchmarking cannot contain spaces and must have extension '.gff' or '.gff3'"
"errorMessage": "GFF/GFF3 file for benchmarking cannot contain spaces and must have extension '.gff.gz', '.gff3.gz', '.gff' or '.gff3'"
}
},
"type": "object",
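As a quick, hypothetical sanity check (not part of this diff), the widened `benchmark` pattern can be exercised with a plain Groovy script; the file names below are made up.

```groovy
// Hypothetical check of the updated `benchmark` pattern from assets/schema_input.json:
// plain and gzipped GFF/GFF3 paths should match, anything else should not.
def pattern = '^\\S+\\.gff(3)?(\\.gz)?$'

assert 'annotation.gff' ==~ pattern
assert 'annotation.gff3' ==~ pattern
assert 'annotation.gff.gz' ==~ pattern
assert 'annotation.gff3.gz' ==~ pattern
assert !('annotation.gtf' ==~ pattern)      // wrong extension
assert !('my annotation.gff' ==~ pattern)   // spaces are rejected by \S+
```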
28 changes: 12 additions & 16 deletions conf/test_full.config
@@ -10,26 +10,22 @@
----------------------------------------------------------------------------------------
*/

process {
resourceLimits = [
cpus: 10,
memory: '32.GB',
time: '6.h'
]
}

params {
config_profile_name = 'Full test profile'
config_profile_description = 'Full test dataset to check pipeline function'

// Input data
input = "${projectDir}/tests/minimal/assemblysheet.csv"
protein_evidence = 'https://raw.githubusercontent.com/Gaius-Augustus/BRAKER/f58479fe5bb13a9e51c3ca09cb9e137cab3b8471/example/proteins.fa'
input = "${projectDir}/tests/full/assemblysheet.csv"
protein_evidence = 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/214/015/GCF_000214015.3_version_140606/GCF_000214015.3_version_140606_protein.faa.gz'
eggnogmapper_tax_scope = 33090
rna_evidence = "${projectDir}/tests/full/rnasheet.csv"
liftoff_annotations = "${projectDir}/tests/full/liftoffannotations.csv"
orthofinder_annotations = "${projectDir}/tests/full/orthofinderannotations.csv"

star_max_intron_length = 5000

// Braker options for faster test execution!
// WARNING: Do not use with actual data!
braker_extra_args = '--gm_max_intergenic 10000 --skipOptimize' // Added for faster test execution! Do not use with actual data!
busco_lineage_datasets = 'chlorophyta_odb10 eukaryota_odb10'

// BUSCO lineage
busco_lineage_datasets = 'eudicots_odb10'
// Relaxed filtering due to limited evidence
enforce_full_intron_support = false
filter_liftoff_by_hints = false
}
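For context, a hypothetical local launch of this profile (mirroring the `local_genepal` helper script further down, and assuming Docker is available) might look like:

```bash
# Hypothetical invocation of the full test profile; flags mirror local_genepal.
nextflow run main.nf \
    -profile docker,test_full \
    -resume \
    --outdir results
```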
Binary file modified docs/img/genepal.png
2 changes: 1 addition & 1 deletion docs/output.md
@@ -119,7 +119,7 @@ RNASeq alignment is performed with [STAR](https://github.com/alexdobin/STAR). Al
>
> BRAKER outputs are not the final outputs of the pipeline, which is why they are not stored by default; they are only intermediate files.
>
> The pipeline further processes the BRAKER predictions and stores the final validated outputs in the `annotations` directory. The `braker_save_outputs` option is only provided to allow a manual resume of the pipeline for advanced use cases.
> The pipeline further processes the BRAKER predictions and stores the final validated outputs in the `annotations` directory. The `braker_save_outputs` option is only provided to allow a manual resume of the pipeline for advanced use cases. See [Advanced inputs for manual resume](./usage.md#advanced-inputs-for-manual-resume) in the [usage doc](./usage.md).

### Annotation with Liftoff

13 changes: 12 additions & 1 deletion docs/usage.md
@@ -5,6 +5,7 @@
> This document does not describe every pipeline parameter. For an exhaustive list of parameters, see [parameters.md](./parameters.md).

- [Assemblysheet input](#assemblysheet-input)
- [Advanced inputs for manual resume](#advanced-inputs-for-manual-resume)
- [Protein evidence](#protein-evidence)
- [BRAKER workflow](#braker-workflow)
- [RNASeq evidence](#rnaseq-evidence)
@@ -39,7 +40,17 @@ You will need to create an assemblysheet with information about the genome assem
- `tag:` A unique tag which represents the target assembly throughout the pipeline. The `tag` and the `fasta` file name should not be the same, e.g. `tag.fasta`, as this can create file-name collisions in the pipeline or result in files being overwritten. It is also good practice to make all input files read-only.
- `fasta:` FASTA file for the genome
- `is_masked:` Whether the FASTA is masked or not. Use yes/no to indicate the masking. If the assembly is not masked, the pipeline will soft mask it before annotating it.
- `te_lib [Optional]`: If an assembly is not masked and a TE library is available which can be used to mask the assembly, the path of the TE library FASTA file can be provided here. If this column is absent and the assembly is not masked, the pipeline will first create a TE library so that it can soft mask the assembly.
- `te_lib [Optional]:` If an assembly is not masked and a TE library is available which can be used to mask the assembly, the path of the TE library FASTA file can be provided here. If this column is absent and the assembly is not masked, the pipeline will first create a TE library so that it can soft mask the assembly.
- `benchmark [Optional]:` A GFF3 file which can be used to benchmark or compare the results of the pipeline against an existing annotation. An example sheet is sketched below.
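For illustration only (not part of this PR), a minimal assemblysheet exercising these columns might look like the following; all tags and paths are hypothetical.

```csv
tag,fasta,is_masked,te_lib,benchmark
asm1,/data/asm1/genome.fasta,no,/data/asm1/te_lib.fasta,/data/asm1/known.gff3
asm2,/data/asm2/genome_softmasked.fasta,yes,,
```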

### Advanced inputs for manual resume

If the pipeline fails while processing large datasets, it is advisable to back up the repeat-masked genomes and the BRAKER outputs before attempting a [Nextflow resume](https://www.nextflow.io/docs/latest/cache-and-resume.html#caching-and-resuming). If the resume fails, these outputs from the first pipeline run can be used to set up a manual resume. This is achieved by providing the repeat-masked genomes under the `fasta` column, with the `is_masked` column set to `yes`. The BRAKER outputs can be provided under the following columns:

- `braker_gff3 [Optional]:` BRAKER GFF3 file
- `braker_hints [Optional]:` BRAKER hints file in GFF3 format

The pipeline will then automatically skip the repeat modelling, masking and BRAKER steps for these genomes, while still performing them for genomes for which these files are not provided. These files are not saved by the pipeline by default; to save them, set the `repeatmasker_save_outputs` and `braker_save_outputs` parameters to `true`.
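As a sketch (again hypothetical, not taken from this PR), a manual-resume sheet reusing the backed-up outputs of a first run could look like the following. The first run would have been launched with the save-output parameters enabled (for example `--repeatmasker_save_outputs true --braker_save_outputs true`) so that the masked FASTA and BRAKER files exist to back up.

```csv
tag,fasta,is_masked,braker_gff3,braker_hints
asm1,/backup/asm1.masked.fasta,yes,/backup/asm1.braker.gff3,/backup/asm1.braker.hints.gff3
```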

## Protein evidence

2 changes: 1 addition & 1 deletion local_genepal
@@ -17,6 +17,6 @@ nextflow run \
-profile docker,test_full \
-resume \
$stub \
--eggnogmapper_tax_scope 33090 \
-c ../nxf-config/resources.config \
--eggnogmapper_db_dir ../dbs/emapperdb/5.0.2 \
--outdir results
5 changes: 5 additions & 0 deletions modules.json
@@ -15,6 +15,11 @@
"git_sha": "a8939d36280e7d9037c7cf164eeede19e46546a4",
"installed_by": ["gxf_fasta_agat_spaddintrons_spextractsequences"]
},
"agat/spflagshortintrons": {
"branch": "main",
"git_sha": "d8f08700c82a3bd14811a3dfe7e7d63838130693",
"installed_by": ["modules"]
},
"braker3": {
"branch": "main",
"git_sha": "a8939d36280e7d9037c7cf164eeede19e46546a4",
7 changes: 7 additions & 0 deletions modules/gallvp/agat/spflagshortintrons/environment.yml
@@ -0,0 +1,7 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
channels:
- conda-forge
- bioconda
dependencies:
- "bioconda::agat=1.4.1"
50 changes: 50 additions & 0 deletions modules/gallvp/agat/spflagshortintrons/main.nf
@@ -0,0 +1,50 @@
process AGAT_SPFLAGSHORTINTRONS {
tag "$meta.id"
label 'process_single'

conda "${moduleDir}/environment.yml"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/agat:1.4.1--pl5321hdfd78af_0':
'biocontainers/agat:1.4.1--pl5321hdfd78af_0' }"

input:
tuple val(meta), path(gxf)
path config

output:
tuple val(meta), path("*.gff") , emit: gff
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when

script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def config_arg = config ? "-c $config" : ''
if( "$gxf" == "${prefix}.gff" ) error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
"""
agat_sp_flag_short_introns.pl \\
$args \\
-g $gxf \\
$config_arg \\
-o ${prefix}.gff

cat <<-END_VERSIONS > versions.yml
"${task.process}":
agat: \$(agat_sp_flag_short_introns.pl -h | sed -n 's/.*(AGAT) - Version: \\(.*\\) .*/\\1/p')
END_VERSIONS
"""

stub:
def prefix = task.ext.prefix ?: "${meta.id}"
if( "$gxf" == "${prefix}.gff" ) error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
"""
touch ${prefix}.gff

cat <<-END_VERSIONS > versions.yml
"${task.process}":
agat: \$(agat_sp_flag_short_introns.pl -h | sed -n 's/.*(AGAT) - Version: \\(.*\\) .*/\\1/p')
END_VERSIONS
"""
}
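For orientation, a hypothetical include-and-call of the new module from a DSL2 workflow is sketched below; the include path, channel content and file names are placeholders rather than actual pipeline code.

```nextflow
// Illustrative wiring only -- not part of this PR.
include { AGAT_SPFLAGSHORTINTRONS } from './modules/gallvp/agat/spflagshortintrons/main'

workflow {
    // A channel of [ meta, gff ] tuples; values here are placeholders
    ch_liftoff_gff = Channel.of(
        [ [ id: 'asm1' ], file('results/liftoff/asm1.liftoff.gff3') ]
    )

    // The second input is the optional AGAT config; pass [] to use the default
    AGAT_SPFLAGSHORTINTRONS ( ch_liftoff_gff, [] )

    AGAT_SPFLAGSHORTINTRONS.out.gff.view { meta, gff -> "Flagged short introns for ${meta.id}: ${gff}" }
}
```

In the pipeline itself this process presumably runs on the Liftoff annotations so that genes containing introns shorter than 10 bp can be flagged and then removed, per the changelog entry above.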
60 changes: 60 additions & 0 deletions modules/gallvp/agat/spflagshortintrons/meta.yml
@@ -0,0 +1,60 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
name: "agat_spflagshortintrons"
description: |
The script flags short introns with the attribute <pseudo>. It is useful to avoid an ERROR when submitting the data to EBI.
(Typical EBI error message: ********ERROR: Intron usually expected to be at least 10 nt long. Please check the accuracy)
keywords:
- genomics
- gtf
- gff
- intron
- short
- annotation
tools:
- "agat":
description: "Another Gff Analysis Toolkit (AGAT). Suite of tools to handle gene annotations in any GTF/GFF format."
homepage: "https://agat.readthedocs.io/en/latest/"
documentation: "https://agat.readthedocs.io/en/latest/"
tool_dev_url: "https://github.com/NBISweden/AGAT"
doi: "10.5281/zenodo.3552717"
licence: ["GPL v3"]
identifier: biotools:AGAT
input:
- - meta:
type: map
description: |
Groovy Map containing sample information
e.g. `[ id:'sample1', single_end:false ]`
- gxf:
type: file
description: Input GFF3/GTF file
pattern: "*.{gff,gff3,gtf}"
- - config:
type: file
description: |
Input agat config file. By default AGAT takes as input agat_config.yaml file from the working directory if any,
otherwise it takes the original agat_config.yaml shipped with AGAT. To get the agat_config.yaml locally, type: "agat config --expose".
The --config option gives you the possibility to use your own AGAT config file (located elsewhere or named differently).
pattern: "*.yaml"
output:
- gff:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. `[ id:'sample1', single_end:false ]`
- "*.gff":
type: file
description: Output GFF file.
pattern: "*.gff"
- versions:
- versions.yml:
type: file
description: File containing software versions
pattern: "versions.yml"

authors:
- "@GallVp"
maintainers:
- "@GallVp"
61 changes: 61 additions & 0 deletions modules/gallvp/agat/spflagshortintrons/tests/main.nf.test
@@ -0,0 +1,61 @@
nextflow_process {

name "Test Process AGAT_SPFLAGSHORTINTRONS"
script "../main.nf"
process "AGAT_SPFLAGSHORTINTRONS"

tag "modules"
tag "modules_gallvp"
tag "agat"
tag "agat/spflagshortintrons"

test("homo_sapiens - genome - gtf") {


when {
process {
"""
input[0] = [
[ id:'test', single_end:false ], // meta map
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr1/genome.gtf', checkIfExists: true)
]
input[1] = []
"""
}
}

then {
assertAll(
{ assert process.success },
{ assert snapshot(process.out).match() }
)
}

}

test("homo_sapiens - genome - gtf - stub") {

options '-stub'

when {
process {
"""
input[0] = [
[ id:'test', single_end:false ], // meta map
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr1/genome.gtf', checkIfExists: true)
]
input[1] = []
"""
}
}

then {
assertAll(
{ assert process.success },
{ assert snapshot(process.out).match() }
)
}

}

}