From 3114adb0783494b4ebf36e3ada9c66ce2d90a15f Mon Sep 17 00:00:00 2001 From: Usman Rashid <usman@smme.edu.pk> Date: Tue, 10 Dec 2024 21:09:25 +1300 Subject: [PATCH 1/6] Added parameter filter_genes_by_aa_length --- CHANGELOG.md | 1 + README.md | 1 + conf/modules.config | 4 + docs/output.md | 4 +- docs/parameters.md | 13 +-- modules.json | 5 + .../agat/spfilterbyorfsize/environment.yml | 7 ++ modules/gallvp/agat/spfilterbyorfsize/main.nf | 60 +++++++++++ .../gallvp/agat/spfilterbyorfsize/meta.yml | 67 ++++++++++++ .../agat/spfilterbyorfsize/tests/main.nf.test | 62 +++++++++++ .../spfilterbyorfsize/tests/main.nf.test.snap | 100 ++++++++++++++++++ nextflow.config | 1 + nextflow_schema.json | 7 ++ subworkflows/local/gff_merge_cleanup.nf | 23 +++- workflows/genepal.nf | 3 +- 15 files changed, 345 insertions(+), 13 deletions(-) create mode 100644 modules/gallvp/agat/spfilterbyorfsize/environment.yml create mode 100644 modules/gallvp/agat/spfilterbyorfsize/main.nf create mode 100644 modules/gallvp/agat/spfilterbyorfsize/meta.yml create mode 100644 modules/gallvp/agat/spfilterbyorfsize/tests/main.nf.test create mode 100644 modules/gallvp/agat/spfilterbyorfsize/tests/main.nf.test.snap diff --git a/CHANGELOG.md b/CHANGELOG.md index e4d0ca6..33813b7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 1. Added cDNA and CDS outputs to <OUTPUT_DIR>/annotations/<SAMPLE> directory [#118](https://github.com/Plant-Food-Research-Open/genepal/issues/118) 2. Added parameter `add_attrs_to_proteins_cds_fastas` +3. 
Added parameter `filter_genes_by_aa_length` with default set to `24` which allows removal of genes with ORFs shorter than 24 amino acids [#125](https://github.com/Plant-Food-Research-Open/genepal/issues/125) ### `Fixed` diff --git a/README.md b/README.md index 51f3a3e..177e8f5 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,7 @@ - Optionally, allow or remove iso-forms - Remove BRAKER models from Liftoff loci - Merge Liftoff and BRAKER models + - Optionally, remove models with ORFs shorter than `N` amino acids - Optionally, remove models without any EggNOG-mapper hits - [EggNOG-mapper](https://github.com/eggnogdb/eggnog-mapper): Add functional annotation to gff - [GenomeTools](https://github.com/genometools/genometools): GFF format validation diff --git a/conf/modules.config b/conf/modules.config index 44e6123..fbb5f52 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -240,6 +240,10 @@ process { // SUBWORKFLOW: GFF_MERGE_CLEANUP ext.prefix = { "${meta.id}.liftoff.braker" } } + withName: '.*:GFF_MERGE_CLEANUP:AGAT_SPFILTERBYORFSIZE' { + ext.args = params.filter_genes_by_aa_length ? 
"-s ${params.filter_genes_by_aa_length}" : '' + } + withName: '.*:GFF_MERGE_CLEANUP:GT_GFF3' { ext.args = '-tidy -retainids -sort' } diff --git a/docs/output.md b/docs/output.md index f4793b5..40b546c 100644 --- a/docs/output.md +++ b/docs/output.md @@ -169,8 +169,8 @@ If more than one genome is included in the pipeline, [ORTHOFINDER](https://githu - `Y/` - `Y.gt.gff3`: Final annotation file for genome `Y` which contains gene models and their functional annotations - `Y.pep.fasta`: Protein sequences for the gene models - - 'Y.cdna.fasta': cDNA sequences for the gene models - - 'Y.cds.fasta': Coding sequences for the gene models + - `Y.cdna.fasta`: cDNA sequences for the gene models + - `Y.cds.fasta`: Coding sequences for the gene models </details> diff --git a/docs/parameters.md b/docs/parameters.md index 9297c4a..0c2bb09 100644 --- a/docs/parameters.md +++ b/docs/parameters.md @@ -59,12 +59,13 @@ A Nextflow pipeline for consensus, phased and pan-genome annotation. ## Post-annotation filtering options -| Parameter | Description | Type | Default | Required | Hidden | -| ----------------------------- | ----------------------------------------------------------------- | --------- | ------- | -------- | ------ | -| `allow_isoforms` | Allow multiple isoforms for gene models | `boolean` | True | | | -| `enforce_full_intron_support` | Require every model to have external evidence for all its introns | `boolean` | True | | | -| `filter_liftoff_by_hints` | Use BRAKER hints to filter Liftoff models | `boolean` | True | | | -| `eggnogmapper_purge_nohits` | Purge transcripts which do not have a hit against eggnog | `boolean` | | | | +| Parameter | Description | Type | Default | Required | Hidden | +| ----------------------------- | -------------------------------------------------------------------------------------------------------------------------------------- | --------- | ------- | -------- | ------ | +| `allow_isoforms` | Allow multiple isoforms for gene models | 
`boolean` | True | | | +| `enforce_full_intron_support` | Require every model to have external evidence for all its introns | `boolean` | True | | | +| `filter_liftoff_by_hints` | Use BRAKER hints to filter Liftoff models | `boolean` | True | | | +| `eggnogmapper_purge_nohits` | Purge transcripts which do not have a hit against eggnog | `boolean` | | | | +| `filter_genes_by_aa_length` | Filter genes with open reading frames shorter than the specified number of amino acids. If set to `null`, this filter step is skipped. | `integer` | 24 | | | ## Annotation output options diff --git a/modules.json b/modules.json index da05f16..6b9d74a 100644 --- a/modules.json +++ b/modules.json @@ -15,6 +15,11 @@ "git_sha": "a8939d36280e7d9037c7cf164eeede19e46546a4", "installed_by": ["gxf_fasta_agat_spaddintrons_spextractsequences"] }, + "agat/spfilterbyorfsize": { + "branch": "main", + "git_sha": "a0054cdffbd84f002fb6582b28575b699e01098e", + "installed_by": ["modules"] + }, "agat/spflagshortintrons": { "branch": "main", "git_sha": "d8f08700c82a3bd14811a3dfe7e7d63838130693", diff --git a/modules/gallvp/agat/spfilterbyorfsize/environment.yml b/modules/gallvp/agat/spfilterbyorfsize/environment.yml new file mode 100644 index 0000000..2c3daab --- /dev/null +++ b/modules/gallvp/agat/spfilterbyorfsize/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - "bioconda::agat=1.4.2" diff --git a/modules/gallvp/agat/spfilterbyorfsize/main.nf b/modules/gallvp/agat/spfilterbyorfsize/main.nf new file mode 100644 index 0000000..502a9cd --- /dev/null +++ b/modules/gallvp/agat/spfilterbyorfsize/main.nf @@ -0,0 +1,60 @@ +process AGAT_SPFILTERBYORFSIZE { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/agat:1.4.2--pl5321hdfd78af_0': + 'biocontainers/agat:1.4.2--pl5321hdfd78af_0' }" + + input: + tuple val(meta), path(gxf) + path config + + output: + tuple val(meta), path("*.passed.gff") , emit: passed_gff + tuple val(meta), path("*.failed.gff") , emit: failed_gff + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def config_arg = config ? "-c $config" : '' + if( "$gxf" in [ "${prefix}.passed.gff", "${prefix}.failed.gff" ] ) error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + agat_sp_filter_by_ORF_size.pl \\ + -g $gxf \\ + $args \\ + $config_arg \\ + -o $prefix + + mv \\ + ${prefix}_NOT* \\ + "${prefix}.failed.gff" + + mv \\ + ${prefix}_* \\ + "${prefix}.passed.gff" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + agat: \$(agat_sp_flag_short_introns.pl -h | sed -n 's/.*(AGAT) - Version: \\(.*\\) .*/\\1/p') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + if( "$gxf" in [ "${prefix}.passed.gff", "${prefix}.failed.gff" ] ) error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
+ """ + touch ${prefix}.passed.gff + touch ${prefix}.failed.gff + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + agat: \$(agat_sp_flag_short_introns.pl -h | sed -n 's/.*(AGAT) - Version: \\(.*\\) .*/\\1/p') + END_VERSIONS + """ +} diff --git a/modules/gallvp/agat/spfilterbyorfsize/meta.yml b/modules/gallvp/agat/spfilterbyorfsize/meta.yml new file mode 100644 index 0000000..cf399da --- /dev/null +++ b/modules/gallvp/agat/spfilterbyorfsize/meta.yml @@ -0,0 +1,67 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "agat_spfilterbyorfsize" +description: The script reads a gff annotation file, and create two output files, + one contains the gene models with ORF passing the test, the other contains the rest. + By default the test is "> 100" that means all gene models that have ORF longer than + 100 Amino acids, will pass the test. +keywords: + - genomics + - GFF/GTF + - filter + - annotation +tools: + - "agat": + description: "Another Gff Analysis Toolkit (AGAT). Suite of tools to handle gene + annotations in any GTF/GFF format." + homepage: "https://agat.readthedocs.io/en/latest/" + documentation: "https://agat.readthedocs.io/en/latest/" + tool_dev_url: "https://github.com/NBISweden/AGAT" + doi: "10.5281/zenodo.3552717" + licence: ["GPL v3"] + identifier: biotools:AGAT + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - gxf: + type: file + description: Input GFF3/GTF file + pattern: "*.{gff,gff3,gtf}" + - - config: + type: file + description: | + Input agat config file. By default AGAT takes as input agat_config.yaml file from the working directory if any, + otherwise it takes the orignal agat_config.yaml shipped with AGAT. To get the agat_config.yaml locally type: "agat config --expose". + The --config option gives you the possibility to use your own AGAT config file (located elsewhere or named differently). 
+ pattern: "*.yaml" +output: + - passed_gff: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ] + - "*.passed.gff": + type: file + description: GFF file with gene models which pass the filter test + - failed_gff: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ] + - "*.failed.gff": + type: file + description: GFF file with remaining gene models + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@GallVp" +maintainers: + - "@GallVp" diff --git a/modules/gallvp/agat/spfilterbyorfsize/tests/main.nf.test b/modules/gallvp/agat/spfilterbyorfsize/tests/main.nf.test new file mode 100644 index 0000000..4a6e1fc --- /dev/null +++ b/modules/gallvp/agat/spfilterbyorfsize/tests/main.nf.test @@ -0,0 +1,62 @@ +nextflow_process { + + name "Test Process AGAT_SPFILTERBYORFSIZE" + script "../main.nf" + process "AGAT_SPFILTERBYORFSIZE" + + tag "modules" + tag "modules_gallvp" + tag "agat" + tag "agat/spfilterbyorfsize" + + test("actinidia_chinensis - genome - gtf") { + + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/eukaryotes/actinidia_chinensis/genome/chr1/genome.gtf.gz', checkIfExists: true) + ] + input[1] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("homo_sapiens - genome - gtf - stub") { + + options '-stub' + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr1/genome.gtf', checkIfExists: true) + ] + input[1] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + +} diff --git a/modules/gallvp/agat/spfilterbyorfsize/tests/main.nf.test.snap 
b/modules/gallvp/agat/spfilterbyorfsize/tests/main.nf.test.snap new file mode 100644 index 0000000..22b26fe --- /dev/null +++ b/modules/gallvp/agat/spfilterbyorfsize/tests/main.nf.test.snap @@ -0,0 +1,100 @@ +{ + "homo_sapiens - genome - gtf - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.passed.gff:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.failed.gff:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,bc298e3688f3f90f287f56ee6929bd29" + ], + "failed_gff": [ + [ + { + "id": "test" + }, + "test.failed.gff:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "passed_gff": [ + [ + { + "id": "test" + }, + "test.passed.gff:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,bc298e3688f3f90f287f56ee6929bd29" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.04.4" + }, + "timestamp": "2024-12-10T17:07:11.619928" + }, + "actinidia_chinensis - genome - gtf": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.passed.gff:md5,e2558c89e50df32d654f19f9a69e46a3" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.failed.gff:md5,d7eb6ae1c3dc30675138029b513073eb" + ] + ], + "2": [ + "versions.yml:md5,bc298e3688f3f90f287f56ee6929bd29" + ], + "failed_gff": [ + [ + { + "id": "test" + }, + "test.failed.gff:md5,d7eb6ae1c3dc30675138029b513073eb" + ] + ], + "passed_gff": [ + [ + { + "id": "test" + }, + "test.passed.gff:md5,e2558c89e50df32d654f19f9a69e46a3" + ] + ], + "versions": [ + "versions.yml:md5,bc298e3688f3f90f287f56ee6929bd29" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.04.4" + }, + "timestamp": "2024-12-10T17:07:06.829402" + } +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 363f0c5..c3ce861 100644 --- a/nextflow.config +++ b/nextflow.config @@ -54,6 +54,7 @@ params { enforce_full_intron_support = true filter_liftoff_by_hints = true eggnogmapper_purge_nohits = false + 
filter_genes_by_aa_length = 24 // Annotation output options braker_save_outputs = false diff --git a/nextflow_schema.json b/nextflow_schema.json index b7b5cc4..abe26a9 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -272,6 +272,13 @@ "type": "boolean", "description": "Purge transcripts which do not have a hit against eggnog", "fa_icon": "fas fa-question-circle" + }, + "filter_genes_by_aa_length": { + "type": "integer", + "default": 24, + "fa_icon": "fas fa-hashtag", + "description": "Filter genes with open reading frames shorter than the specified number of amino acids. If set to `null`, this filter step is skipped.", + "minimum": 3 } } }, diff --git a/subworkflows/local/gff_merge_cleanup.nf b/subworkflows/local/gff_merge_cleanup.nf index fc6c75e..fbdea37 100644 --- a/subworkflows/local/gff_merge_cleanup.nf +++ b/subworkflows/local/gff_merge_cleanup.nf @@ -1,18 +1,20 @@ include { AGAT_SPMERGEANNOTATIONS } from '../../modules/nf-core/agat/spmergeannotations/main' include { GT_GFF3 } from '../../modules/nf-core/gt/gff3/main' +include { AGAT_SPFILTERBYORFSIZE } from '../../modules/gallvp/agat/spfilterbyorfsize/main' include { AGAT_CONVERTSPGXF2GXF } from '../../modules/nf-core/agat/convertspgxf2gxf/main' workflow GFF_MERGE_CLEANUP { take: ch_braker_gff // Channel: [ meta, gff ] ch_liftoff_gff // Channel: [ meta, gff ] + val_filter_by_aa_length // val(null|Integer) main: ch_versions = Channel.empty() ch_gff_branch = ch_braker_gff | join(ch_liftoff_gff, remainder:true) - | branch { meta, braker_gff, liftoff_gff -> + | branch { _meta, braker_gff, liftoff_gff -> both : ( braker_gff && liftoff_gff ) braker_only : ( braker_gff && ( ! liftoff_gff ) ) liftoff_only: ( ( ! 
braker_gff ) && liftoff_gff ) @@ -25,12 +27,25 @@ workflow GFF_MERGE_CLEANUP { ) ch_merged_gff = AGAT_SPMERGEANNOTATIONS.out.gff - | mix ( ch_gff_branch.liftoff_only.map { meta, braker_gff, liftoff_gff -> [ meta, liftoff_gff ] } ) - | mix ( ch_gff_branch.braker_only.map { meta, braker_gff, liftoff_gff -> [ meta, braker_gff ] } ) + | mix ( ch_gff_branch.liftoff_only.map { meta, _braker_gff, liftoff_gff -> [ meta, liftoff_gff ] } ) + | mix ( ch_gff_branch.braker_only.map { meta, braker_gff, _liftoff_gff -> [ meta, braker_gff ] } ) ch_versions = ch_versions.mix(AGAT_SPMERGEANNOTATIONS.out.versions.first()) + // MODULE: AGAT_SPFILTERBYORFSIZE + ch_filter_input = ch_merged_gff + | branch { + filter: val_filter_by_aa_length != null + pass: val_filter_by_aa_length == null + } + + AGAT_SPFILTERBYORFSIZE ( ch_filter_input.filter, [] ) + + ch_filtered_gff = AGAT_SPFILTERBYORFSIZE.out.passed_gff + | mix ( ch_filter_input.pass ) + ch_versions = ch_versions.mix(AGAT_SPFILTERBYORFSIZE.out.versions.first()) + // MODULE: GT_GFF3 - GT_GFF3 ( ch_merged_gff ) + GT_GFF3 ( ch_filtered_gff ) ch_gt_gff = GT_GFF3.out.gt_gff3 ch_versions = ch_versions.mix(GT_GFF3.out.versions.first()) diff --git a/workflows/genepal.nf b/workflows/genepal.nf index 538fcfe..6ee525b 100644 --- a/workflows/genepal.nf +++ b/workflows/genepal.nf @@ -178,7 +178,8 @@ workflow GENEPAL { // SUBWORKFLOW: GFF_MERGE_CLEANUP GFF_MERGE_CLEANUP( ch_braker_purged_gff, - ch_liftoff_gff3 + ch_liftoff_gff3, + params.filter_genes_by_aa_length ) ch_merged_gff = GFF_MERGE_CLEANUP.out.gff From d694431b94985b425e9212c1078975d6020a2006 Mon Sep 17 00:00:00 2001 From: Usman Rashid <usman@smme.edu.pk> Date: Wed, 11 Dec 2024 06:37:33 +1300 Subject: [PATCH 2/6] Updated snapshots --- pfr/params.json | 3 ++- tests/minimal/main.nf.test.snap | 11 +++++++---- tests/stub/main.nf.test.snap | 11 +++++++---- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/pfr/params.json b/pfr/params.json index 7e993bf..9983398 100644 --- 
a/pfr/params.json +++ b/pfr/params.json @@ -32,8 +32,9 @@ "enforce_full_intron_support": true, "filter_liftoff_by_hints": true, "eggnogmapper_purge_nohits": false, + "filter_genes_by_aa_length": 24, "braker_save_outputs": false, - "add_attrs_to_proteins_fasta": false, + "add_attrs_to_proteins_cds_fastas": false, "busco_skip": false, "busco_lineage_datasets": "embryophyta_odb10" } diff --git a/tests/minimal/main.nf.test.snap b/tests/minimal/main.nf.test.snap index 48dae90..f4b2aba 100644 --- a/tests/minimal/main.nf.test.snap +++ b/tests/minimal/main.nf.test.snap @@ -2,7 +2,7 @@ "profile - test": { "content": [ { - "successful tasks": 20, + "successful tasks": 21, "versions": { "AGAT_CONVERTSPGFF2GTF": { "agat": "v1.4.0" @@ -16,6 +16,9 @@ "AGAT_SPEXTRACTSEQUENCES": { "agat": "v1.4.0" }, + "AGAT_SPFILTERBYORFSIZE": { + "agat": "v1.4.1" + }, "BRAKER3": { "braker3": "3.0.8", "augustus": "3.5.0", @@ -92,8 +95,8 @@ ], "meta": { "nf-test": "0.9.2", - "nextflow": "24.04.2" + "nextflow": "24.04.4" }, - "timestamp": "2024-12-05T07:51:43.818374" + "timestamp": "2024-12-11T06:36:01.956188" } -} +} \ No newline at end of file diff --git a/tests/stub/main.nf.test.snap b/tests/stub/main.nf.test.snap index 4516d50..1548c96 100644 --- a/tests/stub/main.nf.test.snap +++ b/tests/stub/main.nf.test.snap @@ -2,7 +2,7 @@ "full - stub": { "content": [ { - "successful tasks": 162, + "successful tasks": 166, "versions": { "AGAT_CONVERTSPGFF2GTF": { "agat": "v1.4.0" @@ -16,6 +16,9 @@ "AGAT_SPEXTRACTSEQUENCES": { "agat": "v1.4.0" }, + "AGAT_SPFILTERBYORFSIZE": { + "agat": "v1.4.1" + }, "AGAT_SPFILTERFEATUREFROMKILLLIST": { "agat": "v1.4.0" }, @@ -203,8 +206,8 @@ ], "meta": { "nf-test": "0.9.2", - "nextflow": "24.04.2" + "nextflow": "24.04.4" }, - "timestamp": "2024-12-05T07:56:38.915238" + "timestamp": "2024-12-10T21:52:10.308719" } -} +} \ No newline at end of file From 0f7784ccf5ecd7fcf14798145086afd912458745 Mon Sep 17 00:00:00 2001 From: Usman Rashid <usman@smme.edu.pk> Date: Wed, 11 Dec 
2024 21:18:26 +1300 Subject: [PATCH 3/6] Added test to verify that GFFREAD can filter mRNA by CDS length --- modules/local/tests/gffread/main.nf.test | 38 +++++++++++++++ modules/local/tests/gffread/main.nf.test.snap | 47 +++++++++++++++++++ modules/local/tests/gffread/nextflow.config | 5 ++ modules/local/tests/gffread/testdata/t.gff | 47 +++++++++++++++++++ subworkflows/local/gff_eggnogmapper.nf | 8 ++-- 5 files changed, 141 insertions(+), 4 deletions(-) create mode 100644 modules/local/tests/gffread/main.nf.test create mode 100644 modules/local/tests/gffread/main.nf.test.snap create mode 100644 modules/local/tests/gffread/nextflow.config create mode 100644 modules/local/tests/gffread/testdata/t.gff diff --git a/modules/local/tests/gffread/main.nf.test b/modules/local/tests/gffread/main.nf.test new file mode 100644 index 0000000..60e588b --- /dev/null +++ b/modules/local/tests/gffread/main.nf.test @@ -0,0 +1,38 @@ +nextflow_process { + + name "Test Process GFFREAD" + script "../../../nf-core/gffread/main.nf" + config "./nextflow.config" + process "GFFREAD" + + tag "gffread" + tag "modules_nfcore" + tag "modules" + + test("filter by length") { + + when { + process { + """ + input[0] = [ + [id: 'test'], + file("$baseDir" + '/modules/local/tests/gffread/testdata/t.gff', checkIfExists: true) + ] + input[1] = [] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert file(process.out.gffread_gff[0][1]).text.contains('gene19851') }, + { assert file(process.out.gffread_gff[0][1]).text.contains('gene19851.t1') }, + { assert ! 
file(process.out.gffread_gff[0][1]).text.contains('gene19851.t2') } // This is the only transcript which is being knocked out + ) + } + + } + +} diff --git a/modules/local/tests/gffread/main.nf.test.snap b/modules/local/tests/gffread/main.nf.test.snap new file mode 100644 index 0000000..261f436 --- /dev/null +++ b/modules/local/tests/gffread/main.nf.test.snap @@ -0,0 +1,47 @@ +{ + "filter by length": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + { + "id": "test" + }, + "test.gff3:md5,59a7d6ff7123589ef2b90b20043a347c" + ] + ], + "2": [ + + ], + "3": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ], + "gffread_fasta": [ + + ], + "gffread_gff": [ + [ + { + "id": "test" + }, + "test.gff3:md5,59a7d6ff7123589ef2b90b20043a347c" + ] + ], + "gtf": [ + + ], + "versions": [ + "versions.yml:md5,05f671c6c6e530acedad0af0a5948dbd" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.04.4" + }, + "timestamp": "2024-12-11T21:11:59.953464" + } +} \ No newline at end of file diff --git a/modules/local/tests/gffread/nextflow.config b/modules/local/tests/gffread/nextflow.config new file mode 100644 index 0000000..734d066 --- /dev/null +++ b/modules/local/tests/gffread/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: GFFREAD { + ext.args = '--no-pseudo --keep-genes -C -l 72' + } +} diff --git a/modules/local/tests/gffread/testdata/t.gff b/modules/local/tests/gffread/testdata/t.gff new file mode 100644 index 0000000..6b1c076 --- /dev/null +++ b/modules/local/tests/gffread/testdata/t.gff @@ -0,0 +1,47 @@ +##gff-version 3 +### +chr23 AUGUSTUS gene 16515075 16516672 . - . ID=gene19849;description=Protein%20of%20unknown%20function%20%28DUF1635%29 +chr23 AUGUSTUS mRNA 16515075 16516597 1 - . ID=gene19849.t1;Parent=gene19849;description=Protein%20of%20unknown%20function%20%28DUF1635%29 +chr23 AUGUSTUS exon 16515075 16515794 . - . 
ID=gene19849.t1.exon1;Parent=gene19849.t1 +chr23 AUGUSTUS CDS 16515075 16515794 1 - 0 ID=gene19849.t1.cds1;Parent=gene19849.t1 +chr23 AUGUSTUS exon 16516562 16516597 . - . ID=gene19849.t1.exon2;Parent=gene19849.t1 +chr23 AUGUSTUS CDS 16516562 16516597 1 - 0 ID=gene19849.t1.cds2;Parent=gene19849.t1 +chr23 gmst mRNA 16515075 16516672 . - . ID=gene19849.t2;Parent=gene19849;description=Protein%20of%20unknown%20function%20%28DUF1635%29 +chr23 gmst exon 16515075 16515794 50.2 - 0 ID=gene19849.t2.exon1;Parent=gene19849.t2 +chr23 gmst CDS 16515075 16515794 50.2 - 0 ID=gene19849.t2.cds1;Parent=gene19849.t2 +chr23 gmst exon 16516562 16516672 50.2 - 0 ID=gene19849.t2.exon2;Parent=gene19849.t2 +chr23 gmst CDS 16516562 16516672 50.2 - 0 ID=gene19849.t2.cds2;Parent=gene19849.t2 +### +chr23 gmst gene 16530414 16531453 . - . ID=gene19850;description=Myb-like%20DNA-binding%20domain +chr23 gmst mRNA 16530414 16531453 . - . ID=gene19850.t1;Parent=gene19850;description=Myb-like%20DNA-binding%20domain +chr23 gmst exon 16530414 16531041 42.7 - 1 ID=gene19850.t1.exon1;Parent=gene19850.t1 +chr23 gmst CDS 16530414 16531041 42.7 - 1 ID=gene19850.t1.cds1;Parent=gene19850.t1 +chr23 gmst exon 16531197 16531453 42.7 - 0 ID=gene19850.t1.exon2;Parent=gene19850.t1 +chr23 gmst CDS 16531197 16531453 42.7 - 0 ID=gene19850.t1.cds2;Parent=gene19850.t1 +### +chr23 AUGUSTUS gene 16530414 16531542 . - . ID=gene19851;description=Differing%20isoform%20descriptions +chr23 AUGUSTUS mRNA 16530414 16531542 1 - . ID=gene19851.t1;Parent=gene19851;description=Myb-like%20DNA-binding%20domain +chr23 AUGUSTUS exon 16530414 16530721 . - . ID=gene19851.t1.exon1;Parent=gene19851.t1 +chr23 AUGUSTUS CDS 16530414 16530721 1 - 2 ID=gene19851.t1.cds1;Parent=gene19851.t1 +chr23 AUGUSTUS exon 16530824 16531041 . - . ID=gene19851.t1.exon2;Parent=gene19851.t1 +chr23 AUGUSTUS CDS 16530824 16531041 1 - 1 ID=gene19851.t1.cds2;Parent=gene19851.t1 +chr23 AUGUSTUS exon 16531197 16531326 . - . 
ID=gene19851.t1.exon3;Parent=gene19851.t1 +chr23 AUGUSTUS CDS 16531197 16531326 1 - 2 ID=gene19851.t1.cds3;Parent=gene19851.t1 +chr23 AUGUSTUS exon 16531428 16531542 . - . ID=gene19851.t1.exon4;Parent=gene19851.t1 +chr23 AUGUSTUS CDS 16531428 16531542 1 - 0 ID=gene19851.t1.cds4;Parent=gene19851.t1 +chr23 GeneMark.hmm3 mRNA 16531514 16531542 . - . ID=gene19851.t2;Parent=gene19851;description=Hypothetical%20protein%20%7C%20no%20eggnog%20hit +chr23 GeneMark.hmm3 exon 16531514 16531542 . - 0 ID=gene19851.t2.exon1;Parent=gene19851.t2 +chr23 GeneMark.hmm3 CDS 16531514 16531542 . - 0 ID=gene19851.t2.cds1;Parent=gene19851.t2 +### +chr23 AUGUSTUS gene 16539401 16545431 . + . ID=gene19852;description=nuclease%20HARBI1 +chr23 AUGUSTUS mRNA 16539401 16545431 1 + . ID=gene19852.t1;Parent=gene19852;description=nuclease%20HARBI1 +chr23 AUGUSTUS exon 16539401 16539509 . + . ID=gene19852.t1.exon1;Parent=gene19852.t1 +chr23 AUGUSTUS CDS 16539401 16539509 1 + 0 ID=gene19852.t1.cds1;Parent=gene19852.t1 +chr23 AUGUSTUS exon 16544386 16545431 . + . ID=gene19852.t1.exon2;Parent=gene19852.t1 +chr23 AUGUSTUS CDS 16544386 16545431 1 + 2 ID=gene19852.t1.cds2;Parent=gene19852.t1 +### +chr23 AUGUSTUS gene 16556338 16556796 . + . ID=gene19853;description=Zinc%20finger%20protein +chr23 AUGUSTUS mRNA 16556338 16556796 1 + . ID=gene19853.t1;Parent=gene19853;description=Zinc%20finger%20protein +chr23 AUGUSTUS exon 16556338 16556796 . + . 
ID=gene19853.t1.exon1;Parent=gene19853.t1 +chr23 AUGUSTUS CDS 16556338 16556796 1 + 0 ID=gene19853.t1.cds1;Parent=gene19853.t1 +### diff --git a/subworkflows/local/gff_eggnogmapper.nf b/subworkflows/local/gff_eggnogmapper.nf index 841a243..8e402d4 100644 --- a/subworkflows/local/gff_eggnogmapper.nf +++ b/subworkflows/local/gff_eggnogmapper.nf @@ -16,8 +16,8 @@ workflow GFF_EGGNOGMAPPER { | join(ch_fasta) GFF2FASTA_FOR_EGGNOGMAPPER( - ch_gffread_inputs.map { meta, gff, fasta -> [ meta, gff ] }, - ch_gffread_inputs.map { meta, gff, fasta -> fasta } + ch_gffread_inputs.map { meta, gff, _fasta -> [ meta, gff ] }, + ch_gffread_inputs.map { _meta, _gff, fasta -> fasta } ) ch_gffread_fasta = GFF2FASTA_FOR_EGGNOGMAPPER.out.gffread_fasta @@ -30,9 +30,9 @@ workflow GFF_EGGNOGMAPPER { | combine(Channel.fromPath(db_folder)) EGGNOGMAPPER( - ch_eggnogmapper_inputs.map { meta, fasta, db -> [ meta, fasta ] }, + ch_eggnogmapper_inputs.map { meta, fasta, _db -> [ meta, fasta ] }, [], - ch_eggnogmapper_inputs.map { meta, fasta, db -> db }, + ch_eggnogmapper_inputs.map { _meta, _fasta, db -> db }, [ [], [] ] ) From f9807724bace7b0d2b3a9b3ce892dc49c887996d Mon Sep 17 00:00:00 2001 From: Usman Rashid <usman@smme.edu.pk> Date: Wed, 11 Dec 2024 21:55:17 +1300 Subject: [PATCH 4/6] Updated snapshots --- conf/modules.config | 4 +- modules.json | 5 - .../agat/spfilterbyorfsize/environment.yml | 7 -- modules/gallvp/agat/spfilterbyorfsize/main.nf | 60 ----------- .../gallvp/agat/spfilterbyorfsize/meta.yml | 67 ------------ .../agat/spfilterbyorfsize/tests/main.nf.test | 62 ----------- .../spfilterbyorfsize/tests/main.nf.test.snap | 100 ------------------ subworkflows/local/gff_merge_cleanup.nf | 10 +- tests/minimal/main.nf.test.snap | 12 +-- tests/stub/main.nf.test.snap | 8 +- 10 files changed, 17 insertions(+), 318 deletions(-) delete mode 100644 modules/gallvp/agat/spfilterbyorfsize/environment.yml delete mode 100644 modules/gallvp/agat/spfilterbyorfsize/main.nf delete mode 100644 
modules/gallvp/agat/spfilterbyorfsize/meta.yml delete mode 100644 modules/gallvp/agat/spfilterbyorfsize/tests/main.nf.test delete mode 100644 modules/gallvp/agat/spfilterbyorfsize/tests/main.nf.test.snap diff --git a/conf/modules.config b/conf/modules.config index fbb5f52..dd15d6c 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -240,8 +240,8 @@ process { // SUBWORKFLOW: GFF_MERGE_CLEANUP ext.prefix = { "${meta.id}.liftoff.braker" } } - withName: '.*:GFF_MERGE_CLEANUP:AGAT_SPFILTERBYORFSIZE' { - ext.args = params.filter_genes_by_aa_length ? "-s ${params.filter_genes_by_aa_length}" : '' + withName: '.*:GFF_MERGE_CLEANUP:FILTER_BY_ORF_SIZE' { + ext.args = params.filter_genes_by_aa_length ? "--no-pseudo --keep-genes -C -l ${params.filter_genes_by_aa_length * 3}" : '' } withName: '.*:GFF_MERGE_CLEANUP:GT_GFF3' { diff --git a/modules.json b/modules.json index 6b9d74a..da05f16 100644 --- a/modules.json +++ b/modules.json @@ -15,11 +15,6 @@ "git_sha": "a8939d36280e7d9037c7cf164eeede19e46546a4", "installed_by": ["gxf_fasta_agat_spaddintrons_spextractsequences"] }, - "agat/spfilterbyorfsize": { - "branch": "main", - "git_sha": "a0054cdffbd84f002fb6582b28575b699e01098e", - "installed_by": ["modules"] - }, "agat/spflagshortintrons": { "branch": "main", "git_sha": "d8f08700c82a3bd14811a3dfe7e7d63838130693", diff --git a/modules/gallvp/agat/spfilterbyorfsize/environment.yml b/modules/gallvp/agat/spfilterbyorfsize/environment.yml deleted file mode 100644 index 2c3daab..0000000 --- a/modules/gallvp/agat/spfilterbyorfsize/environment.yml +++ /dev/null @@ -1,7 +0,0 @@ ---- -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json -channels: - - conda-forge - - bioconda -dependencies: - - "bioconda::agat=1.4.2" diff --git a/modules/gallvp/agat/spfilterbyorfsize/main.nf b/modules/gallvp/agat/spfilterbyorfsize/main.nf deleted file mode 100644 index 502a9cd..0000000 --- 
a/modules/gallvp/agat/spfilterbyorfsize/main.nf +++ /dev/null @@ -1,60 +0,0 @@ -process AGAT_SPFILTERBYORFSIZE { - tag "$meta.id" - label 'process_single' - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/agat:1.4.2--pl5321hdfd78af_0': - 'biocontainers/agat:1.4.2--pl5321hdfd78af_0' }" - - input: - tuple val(meta), path(gxf) - path config - - output: - tuple val(meta), path("*.passed.gff") , emit: passed_gff - tuple val(meta), path("*.failed.gff") , emit: failed_gff - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def config_arg = config ? "-c $config" : '' - if( "$gxf" in [ "${prefix}.passed.gff", "${prefix}.failed.gff" ] ) error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" - """ - agat_sp_filter_by_ORF_size.pl \\ - -g $gxf \\ - $args \\ - $config_arg \\ - -o $prefix - - mv \\ - ${prefix}_NOT* \\ - "${prefix}.failed.gff" - - mv \\ - ${prefix}_* \\ - "${prefix}.passed.gff" - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - agat: \$(agat_sp_flag_short_introns.pl -h | sed -n 's/.*(AGAT) - Version: \\(.*\\) .*/\\1/p') - END_VERSIONS - """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - if( "$gxf" in [ "${prefix}.passed.gff", "${prefix}.failed.gff" ] ) error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
- """ - touch ${prefix}.passed.gff - touch ${prefix}.failed.gff - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - agat: \$(agat_sp_flag_short_introns.pl -h | sed -n 's/.*(AGAT) - Version: \\(.*\\) .*/\\1/p') - END_VERSIONS - """ -} diff --git a/modules/gallvp/agat/spfilterbyorfsize/meta.yml b/modules/gallvp/agat/spfilterbyorfsize/meta.yml deleted file mode 100644 index cf399da..0000000 --- a/modules/gallvp/agat/spfilterbyorfsize/meta.yml +++ /dev/null @@ -1,67 +0,0 @@ -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json -name: "agat_spfilterbyorfsize" -description: The script reads a gff annotation file, and create two output files, - one contains the gene models with ORF passing the test, the other contains the rest. - By default the test is "> 100" that means all gene models that have ORF longer than - 100 Amino acids, will pass the test. -keywords: - - genomics - - GFF/GTF - - filter - - annotation -tools: - - "agat": - description: "Another Gff Analysis Toolkit (AGAT). Suite of tools to handle gene - annotations in any GTF/GFF format." - homepage: "https://agat.readthedocs.io/en/latest/" - documentation: "https://agat.readthedocs.io/en/latest/" - tool_dev_url: "https://github.com/NBISweden/AGAT" - doi: "10.5281/zenodo.3552717" - licence: ["GPL v3"] - identifier: biotools:AGAT - -input: - - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. `[ id:'sample1' ]` - - gxf: - type: file - description: Input GFF3/GTF file - pattern: "*.{gff,gff3,gtf}" - - - config: - type: file - description: | - Input agat config file. By default AGAT takes as input agat_config.yaml file from the working directory if any, - otherwise it takes the orignal agat_config.yaml shipped with AGAT. To get the agat_config.yaml locally type: "agat config --expose". - The --config option gives you the possibility to use your own AGAT config file (located elsewhere or named differently). 
- pattern: "*.yaml" -output: - - passed_gff: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. `[ id:'sample1' ] - - "*.passed.gff": - type: file - description: GFF file with gene models which pass the filter test - - failed_gff: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. `[ id:'sample1' ] - - "*.failed.gff": - type: file - description: GFF file with remaining gene models - - versions: - - versions.yml: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@GallVp" -maintainers: - - "@GallVp" diff --git a/modules/gallvp/agat/spfilterbyorfsize/tests/main.nf.test b/modules/gallvp/agat/spfilterbyorfsize/tests/main.nf.test deleted file mode 100644 index 4a6e1fc..0000000 --- a/modules/gallvp/agat/spfilterbyorfsize/tests/main.nf.test +++ /dev/null @@ -1,62 +0,0 @@ -nextflow_process { - - name "Test Process AGAT_SPFILTERBYORFSIZE" - script "../main.nf" - process "AGAT_SPFILTERBYORFSIZE" - - tag "modules" - tag "modules_gallvp" - tag "agat" - tag "agat/spfilterbyorfsize" - - test("actinidia_chinensis - genome - gtf") { - - - when { - process { - """ - input[0] = [ - [ id:'test' ], // meta map - file(params.modules_testdata_base_path + 'genomics/eukaryotes/actinidia_chinensis/genome/chr1/genome.gtf.gz', checkIfExists: true) - ] - input[1] = [] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - - } - - test("homo_sapiens - genome - gtf - stub") { - - options '-stub' - - when { - process { - """ - input[0] = [ - [ id:'test' ], // meta map - file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr1/genome.gtf', checkIfExists: true) - ] - input[1] = [] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - - } - - -} diff --git a/modules/gallvp/agat/spfilterbyorfsize/tests/main.nf.test.snap 
b/modules/gallvp/agat/spfilterbyorfsize/tests/main.nf.test.snap deleted file mode 100644 index 22b26fe..0000000 --- a/modules/gallvp/agat/spfilterbyorfsize/tests/main.nf.test.snap +++ /dev/null @@ -1,100 +0,0 @@ -{ - "homo_sapiens - genome - gtf - stub": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test.passed.gff:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "1": [ - [ - { - "id": "test" - }, - "test.failed.gff:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "2": [ - "versions.yml:md5,bc298e3688f3f90f287f56ee6929bd29" - ], - "failed_gff": [ - [ - { - "id": "test" - }, - "test.failed.gff:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "passed_gff": [ - [ - { - "id": "test" - }, - "test.passed.gff:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "versions": [ - "versions.yml:md5,bc298e3688f3f90f287f56ee6929bd29" - ] - } - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.04.4" - }, - "timestamp": "2024-12-10T17:07:11.619928" - }, - "actinidia_chinensis - genome - gtf": { - "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "test.passed.gff:md5,e2558c89e50df32d654f19f9a69e46a3" - ] - ], - "1": [ - [ - { - "id": "test" - }, - "test.failed.gff:md5,d7eb6ae1c3dc30675138029b513073eb" - ] - ], - "2": [ - "versions.yml:md5,bc298e3688f3f90f287f56ee6929bd29" - ], - "failed_gff": [ - [ - { - "id": "test" - }, - "test.failed.gff:md5,d7eb6ae1c3dc30675138029b513073eb" - ] - ], - "passed_gff": [ - [ - { - "id": "test" - }, - "test.passed.gff:md5,e2558c89e50df32d654f19f9a69e46a3" - ] - ], - "versions": [ - "versions.yml:md5,bc298e3688f3f90f287f56ee6929bd29" - ] - } - ], - "meta": { - "nf-test": "0.9.2", - "nextflow": "24.04.4" - }, - "timestamp": "2024-12-10T17:07:06.829402" - } -} \ No newline at end of file diff --git a/subworkflows/local/gff_merge_cleanup.nf b/subworkflows/local/gff_merge_cleanup.nf index fbdea37..8a77eda 100644 --- a/subworkflows/local/gff_merge_cleanup.nf +++ b/subworkflows/local/gff_merge_cleanup.nf @@ -1,6 +1,6 @@ include { 
AGAT_SPMERGEANNOTATIONS } from '../../modules/nf-core/agat/spmergeannotations/main' include { GT_GFF3 } from '../../modules/nf-core/gt/gff3/main' -include { AGAT_SPFILTERBYORFSIZE } from '../../modules/gallvp/agat/spfilterbyorfsize/main' +include { GFFREAD as FILTER_BY_ORF_SIZE } from '../../modules/nf-core/gffread/main' include { AGAT_CONVERTSPGXF2GXF } from '../../modules/nf-core/agat/convertspgxf2gxf/main' workflow GFF_MERGE_CLEANUP { @@ -31,18 +31,18 @@ workflow GFF_MERGE_CLEANUP { | mix ( ch_gff_branch.braker_only.map { meta, braker_gff, _liftoff_gff -> [ meta, braker_gff ] } ) ch_versions = ch_versions.mix(AGAT_SPMERGEANNOTATIONS.out.versions.first()) - // MODULE: AGAT_SPFILTERBYORFSIZE + // MODULE: GFFREAD as FILTER_BY_ORF_SIZE ch_filter_input = ch_merged_gff | branch { filter: val_filter_by_aa_length != null pass: val_filter_by_aa_length == null } - AGAT_SPFILTERBYORFSIZE ( ch_filter_input.filter, [] ) + FILTER_BY_ORF_SIZE ( ch_filter_input.filter, [] ) - ch_filtered_gff = AGAT_SPFILTERBYORFSIZE.out.passed_gff + ch_filtered_gff = FILTER_BY_ORF_SIZE.out.gffread_gff | mix ( ch_filter_input.pass ) - ch_versions = ch_versions.mix(AGAT_SPFILTERBYORFSIZE.out.versions.first()) + ch_versions = ch_versions.mix(FILTER_BY_ORF_SIZE.out.versions.first()) // MODULE: GT_GFF3 GT_GFF3 ( ch_filtered_gff ) diff --git a/tests/minimal/main.nf.test.snap b/tests/minimal/main.nf.test.snap index f4b2aba..e0f2ce3 100644 --- a/tests/minimal/main.nf.test.snap +++ b/tests/minimal/main.nf.test.snap @@ -16,9 +16,6 @@ "AGAT_SPEXTRACTSEQUENCES": { "agat": "v1.4.0" }, - "AGAT_SPFILTERBYORFSIZE": { - "agat": "v1.4.1" - }, "BRAKER3": { "braker3": "3.0.8", "augustus": "3.5.0", @@ -40,6 +37,9 @@ "FASTAVALIDATOR": { "py_fasta_validator": 0.6 }, + "FILTER_BY_ORF_SIZE": { + "gffread": "0.12.7" + }, "FINAL_GFF_CHECK": { "genometools": "1.6.5" }, @@ -70,9 +70,9 @@ "stable paths": [ "a_thaliana.cdna.fasta:md5,12b9bef973e488640aec8c04ba3882fe", 
"a_thaliana.cds.fasta:md5,b81060419355a590560f92aec8536281", - "a_thaliana.gt.gff3:md5,8ab16549095f605ff8715ac4a3de58ed", + "a_thaliana.gt.gff3:md5,528459cf9596523bf66de99d24c37e20", "a_thaliana.pep.fasta:md5,4994c0393ca0245a1c57966d846d101e", - "a_thaliana.gff3:md5,d23d16cd86499d48a30ffb981ed27891", + "a_thaliana.gff3:md5,30adac1b21d7aaed6ca7fb71ab33f32d", "summary_stats.json:md5,007ba5cf2b7a2fd395a27d9458ca2d2e" ], "stable names": [ @@ -97,6 +97,6 @@ "nf-test": "0.9.2", "nextflow": "24.04.4" }, - "timestamp": "2024-12-11T06:36:01.956188" + "timestamp": "2024-12-11T21:49:09.751422" } } \ No newline at end of file diff --git a/tests/stub/main.nf.test.snap b/tests/stub/main.nf.test.snap index 1548c96..7ed6f6e 100644 --- a/tests/stub/main.nf.test.snap +++ b/tests/stub/main.nf.test.snap @@ -16,9 +16,6 @@ "AGAT_SPEXTRACTSEQUENCES": { "agat": "v1.4.0" }, - "AGAT_SPFILTERBYORFSIZE": { - "agat": "v1.4.1" - }, "AGAT_SPFILTERFEATUREFROMKILLLIST": { "agat": "v1.4.0" }, @@ -73,6 +70,9 @@ "FASTP": { "fastp": "0.23.4" }, + "FILTER_BY_ORF_SIZE": { + "gffread": "0.12.7" + }, "FINAL_GFF_CHECK": { "genometools": "1.6.5" }, @@ -208,6 +208,6 @@ "nf-test": "0.9.2", "nextflow": "24.04.4" }, - "timestamp": "2024-12-10T21:52:10.308719" + "timestamp": "2024-12-11T21:51:12.841395" } } \ No newline at end of file From ab3ae3755423e16218e95c7e1de64d94adbd6a15 Mon Sep 17 00:00:00 2001 From: Usman Rashid <usman@smme.edu.pk> Date: Thu, 12 Dec 2024 09:58:01 +1300 Subject: [PATCH 5/6] Updated README and snapshot --- README.md | 5 +++-- tests/minimal/main.nf.test | 3 +++ tests/minimal/main.nf.test.snap | 17 +++++++++++++++-- 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 177e8f5..af463db 100644 --- a/README.md +++ b/README.md @@ -39,11 +39,12 @@ - Optionally, allow or remove iso-forms - Remove BRAKER models from Liftoff loci - Merge Liftoff and BRAKER models - - Optionally, remove models with ORFs shorter than `N` amino acids - Optionally, remove models 
without any EggNOG-mapper hits - [EggNOG-mapper](https://github.com/eggnogdb/eggnog-mapper): Add functional annotation to gff - [GenomeTools](https://github.com/genometools/genometools): GFF format validation -- [GffRead](https://github.com/gpertea/gffread): Extraction of protein sequences +- [GffRead](https://github.com/gpertea/gffread) + - Extraction of protein sequences + - Optionally, remove models with ORFs shorter than `N` amino acids - [OrthoFinder](https://github.com/davidemms/OrthoFinder): Perform phylogenetic orthology inference across genomes - [GffCompare](https://github.com/gpertea/gffcompare): Compare and benchmark against an existing annotation - [BUSCO](https://gitlab.com/ezlab/busco): Completeness statistics for genome and annotation through proteins diff --git a/tests/minimal/main.nf.test b/tests/minimal/main.nf.test index cce8a77..5f1d1af 100644 --- a/tests/minimal/main.nf.test +++ b/tests/minimal/main.nf.test @@ -38,6 +38,8 @@ nextflow_pipeline { ['**'] ) + def summary_stats = (Map) new groovy.json.JsonSlurper().parseText(file("$outputDir/genepal_data/summary_stats.json").text) + assertAll( { assert workflow.success}, { assert snapshot( @@ -46,6 +48,7 @@ nextflow_pipeline { 'versions': removeNextflowVersion("$outputDir/pipeline_info/genepal_software_mqc_versions.yml"), 'stable paths': stable_path, 'stable names': getRelativePath(stable_name, outputDir), + 'summary_stats': summary_stats ] ).match() } ) diff --git a/tests/minimal/main.nf.test.snap b/tests/minimal/main.nf.test.snap index e0f2ce3..96c8444 100644 --- a/tests/minimal/main.nf.test.snap +++ b/tests/minimal/main.nf.test.snap @@ -90,13 +90,26 @@ "genepal_report.html", "multiqc_report.html", "pipeline_info" - ] + ], + "summary_stats": { + "stats": [ + { + "ID": "a_thaliana", + "Genes": 252, + "mRNA": 265, + "CDS": 1340, + "Exons": 1340, + "Intron": 1075, + "Non canon splice sites": 18 + } + ] + } } ], "meta": { "nf-test": "0.9.2", "nextflow": "24.04.4" }, - "timestamp": 
"2024-12-11T21:49:09.751422" + "timestamp": "2024-12-12T09:36:52.952048" } } \ No newline at end of file From c65ebaac0bd81f29c38391be1fd46cf46170284e Mon Sep 17 00:00:00 2001 From: Usman Rashid <usman@smme.edu.pk> Date: Mon, 16 Dec 2024 10:04:21 +1300 Subject: [PATCH 6/6] Added 1 to filter_genes_by_aa_length to exclude stop codon from filter length --- CHANGELOG.md | 2 +- conf/modules.config | 2 +- docs/parameters.md | 14 +++++++------- nextflow_schema.json | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 33813b7..9937888 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## v0.6.0 - [10-Dec-2024] +## v0.6.0 - [16-Dec-2024] ### 'Added' diff --git a/conf/modules.config b/conf/modules.config index dd15d6c..2a14621 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -241,7 +241,7 @@ process { // SUBWORKFLOW: GFF_MERGE_CLEANUP } withName: '.*:GFF_MERGE_CLEANUP:FILTER_BY_ORF_SIZE' { - ext.args = params.filter_genes_by_aa_length ? "--no-pseudo --keep-genes -C -l ${params.filter_genes_by_aa_length * 3}" : '' + ext.args = params.filter_genes_by_aa_length ? "--no-pseudo --keep-genes -C -l ${ ( params.filter_genes_by_aa_length + 1 ) * 3 }" : '' } withName: '.*:GFF_MERGE_CLEANUP:GT_GFF3' { diff --git a/docs/parameters.md b/docs/parameters.md index 0c2bb09..7ccd67a 100644 --- a/docs/parameters.md +++ b/docs/parameters.md @@ -59,13 +59,13 @@ A Nextflow pipeline for consensus, phased and pan-genome annotation. 
## Post-annotation filtering options -| Parameter | Description | Type | Default | Required | Hidden | -| ----------------------------- | -------------------------------------------------------------------------------------------------------------------------------------- | --------- | ------- | -------- | ------ | -| `allow_isoforms` | Allow multiple isoforms for gene models | `boolean` | True | | | -| `enforce_full_intron_support` | Require every model to have external evidence for all its introns | `boolean` | True | | | -| `filter_liftoff_by_hints` | Use BRAKER hints to filter Liftoff models | `boolean` | True | | | -| `eggnogmapper_purge_nohits` | Purge transcripts which do not have a hit against eggnog | `boolean` | | | | -| `filter_genes_by_aa_length` | Filter genes with open reading frames shorter than the specified number of amino acids. If set to `null`, this filter step is skipped. | `integer` | 24 | | | +| Parameter | Description | Type | Default | Required | Hidden | +| ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------- | ------- | -------- | ------ | +| `allow_isoforms` | Allow multiple isoforms for gene models | `boolean` | True | | | +| `enforce_full_intron_support` | Require every model to have external evidence for all its introns | `boolean` | True | | | +| `filter_liftoff_by_hints` | Use BRAKER hints to filter Liftoff models | `boolean` | True | | | +| `eggnogmapper_purge_nohits` | Purge transcripts which do not have a hit against eggnog | `boolean` | | | | +| `filter_genes_by_aa_length` | Filter genes with open reading frames shorter than the specified number of amino acids excluding the stop codon. If set to `null`, this filter step is skipped. 
| `integer` | 24 | | | ## Annotation output options diff --git a/nextflow_schema.json b/nextflow_schema.json index abe26a9..1012531 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -277,7 +277,7 @@ "type": "integer", "default": 24, "fa_icon": "fas fa-hashtag", - "description": "Filter genes with open reading frames shorter than the specified number of amino acids. If set to `null`, this filter step is skipped.", + "description": "Filter genes with open reading frames shorter than the specified number of amino acids excluding the stop codon. If set to `null`, this filter step is skipped.", "minimum": 3 } }