From f819e6b32c49460ceab5bdad9551a0041513d82f Mon Sep 17 00:00:00 2001
From: Usman Rashid
Date: Wed, 24 Apr 2024 09:59:22 +1200
Subject: [PATCH] Added GFF_STORE sub workflow

---
Review notes (this section is not part of the commit message):
* The line layout was rebuilt from a whitespace-collapsed copy of the patch.
  Indentation and column alignment are best-effort, so context lines may need
  `git apply --ignore-whitespace`.
* subworkflows/local/gff_store.nf was revised during review (anchored
  ID/Parent regexes, guards for short rows, single GFF read, comments), so
  the blob id on its `index` line is stale.

 CHANGELOG.md                             |   1 +
 conf/modules.config                      |  20 +++-
 subworkflows/local/gff_store.nf          | 140 +++++++++++++++++++++++
 subworkflows/local/purge_nohit_models.nf |  36 +------
 workflows/pangene.nf                     |  10 +-
 5 files changed, 170 insertions(+), 37 deletions(-)
 create mode 100644 subworkflows/local/gff_store.nf

diff --git a/CHANGELOG.md b/CHANGELOG.md
index cfcb217..952ea21 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -41,6 +41,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 32. Now using `description` field to store notes and textual annotations in the gff files
 33. Now using `mRNA` in place of `transcript` in gff files
 34. Now `eggnogmapper_purge_nohits` is set to `false` by default
+35. Added `GFF_STORE` sub workflow
 
 ### `Fixed`
 
diff --git a/conf/modules.config b/conf/modules.config
index 215847d..66163de 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -202,7 +202,7 @@ process {
     }
 
     withName: '.*:GFF_MERGE_CLEANUP:GT_GFF3' {
-        ext.args = '-tidy -retainids'
+        ext.args = '-tidy -retainids -sort'
     }
 }
 
@@ -220,6 +220,12 @@ process {
             "--itype proteins",
             '--go_evidence all'
         ].join(' ').trim()
+
+        publishDir = [
+            path: { "${params.outdir}/final/$meta.id" },
+            mode: "copy",
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+        ]
     }
 }
 
@@ -229,6 +235,18 @@ process {
     }
 }
 
+process {
+    withName: 'FINAL_GFF_CHECK' {
+        ext.args = '-tidy -retainids -sort'
+
+        publishDir = [
+            path: { "${params.outdir}/final/$meta.id" },
+            mode: "copy",
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+        ]
+    }
+}
+
 process {
     withName: 'CUSTOM_DUMPSOFTWAREVERSIONS' {
         publishDir = [
diff --git a/subworkflows/local/gff_store.nf b/subworkflows/local/gff_store.nf
new file mode 100644
index 0000000..f487a50
--- /dev/null
+++ b/subworkflows/local/gff_store.nf
@@ -0,0 +1,140 @@
+import java.net.URLEncoder
+
+include { GT_GFF3 as FINAL_GFF_CHECK } from '../../modules/nf-core/gt/gff3/main'
+
+/*
+ * GFF_STORE: adds a URL-encoded `description` attribute to the gene and
+ * transcript/mRNA records of each target GFF, using the text found in the
+ * eggNOG-mapper annotations table, then runs GT_GFF3 (as FINAL_GFF_CHECK) on it.
+ *
+ * Transcripts without a description get 'hypothetical protein | no eggnog hit'.
+ * A gene gets the description shared by all of its transcripts, or
+ * 'differing isoform descriptions' when its transcripts disagree.
+ */
+workflow GFF_STORE {
+    take:
+    ch_target_gff               // [ meta, gff ]
+    ch_eggnogmapper_annotations // [ meta, annotations ]
+
+    main:
+    ch_versions = Channel.empty()
+
+    // COLLECTFILE: Add eggnogmapper hits to gff
+    ch_described_gff = ch_target_gff
+    | join(ch_eggnogmapper_annotations)
+    | map { meta, gff, annotations ->
+        // Transcript id -> text of column 8 (presumably the emapper Description; TODO confirm)
+        def tx_annotations = annotations.readLines()
+            .findAll { ! it.startsWith('#') }
+            .collect { line ->
+                def cols = line.split('\t')
+                def id = cols[0]
+                // Blank or truncated rows are treated as having no description ('-')
+                def txt = cols.size() > 7 ? cols[7] : '-'
+
+                [ id, txt ]
+            }
+            .findAll { id, txt ->
+                txt != '-'
+            }.collectEntries { id, txt ->
+                [ id, txt ]
+            }
+
+        // Read the GFF once; it is scanned twice below
+        def gff_records = gff.readLines()
+
+        // Gene id -> [ transcript id: encoded description ]
+        def gene_tx_annotations = [:]
+        gff_records
+            .findAll { line ->
+                if ( line.startsWith('#') ) { return false }
+
+                def cols = line.split('\t')
+
+                // Blank lines and other non-feature rows lack the 9 GFF columns
+                if ( cols.size() < 9 ) { return false }
+
+                def feat = cols[2]
+
+                if ( ! ( feat == 'transcript' || feat == 'mRNA' ) ) { return false }
+
+                return true
+            }
+            .each { line ->
+                def cols = line.split('\t')
+                def atts = cols[8]
+
+                // Anchored at an attribute boundary so that keys merely ending
+                // in 'ID' or 'Parent' cannot be mistaken for the real attribute
+                def matches = atts =~ /(?:^|;)ID=([^;]*)/
+                def tx_id = matches[0][1]
+
+                def matches_p = atts =~ /(?:^|;)Parent=([^;]*)/
+                def gene_id = matches_p[0][1]
+
+                if ( ! gene_tx_annotations.containsKey(gene_id) ) {
+                    gene_tx_annotations[gene_id] = [:]
+                }
+
+                def anno = tx_annotations.containsKey(tx_id)
+                    ? URLEncoder.encode(tx_annotations[tx_id], "UTF-8").replace('+', '%20')
+                    : URLEncoder.encode('hypothetical protein | no eggnog hit', "UTF-8").replace('+', '%20')
+
+                gene_tx_annotations[gene_id] += [ ( tx_id ): anno ]
+            }
+
+        // Add a per-gene 'default' entry holding the gene-level description
+        gene_tx_annotations = gene_tx_annotations
+            .collectEntries { gene_id, tx_annos ->
+                def default_anno = tx_annos.values().first()
+
+                if ( tx_annos.values().any { it != default_anno } ) {
+                    return [ gene_id, ( tx_annos + [ 'default': 'differing%20isoform%20descriptions' ] ) ]
+                }
+
+                [ gene_id, ( tx_annos + [ 'default': default_anno ] ) ]
+            }
+
+        def gff_lines = gff_records
+            .collect { line ->
+
+                if ( line.startsWith('#') ) { return line }
+
+                def cols = line.split('\t')
+
+                // Pass blank lines and other non-feature rows through untouched
+                if ( cols.size() < 9 ) { return line }
+
+                def feat = cols[2]
+                def atts = cols[8]
+
+                if ( ! ( feat == 'gene' || feat == 'transcript' || feat == 'mRNA' ) ) { return line }
+
+                def id = feat == 'gene' ? ( atts =~ /(?:^|;)ID=([^;]*)/ )[0][1] : ( atts =~ /(?:^|;)Parent=([^;]*)/ )[0][1]
+
+                if ( ! gene_tx_annotations.containsKey(id) ) { return line }
+
+                def tx_id = feat == 'gene' ? null : ( atts =~ /(?:^|;)ID=([^;]*)/ )[0][1]
+                def desc = feat == 'gene' ? gene_tx_annotations[id]['default'] : gene_tx_annotations[id][tx_id]
+
+                return ( line + ";description=$desc" )
+            }
+
+        [ "${meta.id}.described.gff" ] + gff_lines.join('\n')
+    }
+    | collectFile(newLine: true)
+    | map { file ->
+        [ [ id: file.baseName.replace('.described', '') ], file ]
+    }
+
+    // MODULE: GT_GFF3 as FINAL_GFF_CHECK
+    FINAL_GFF_CHECK ( ch_described_gff )
+
+    ch_final_gff = FINAL_GFF_CHECK.out.gt_gff3
+    ch_versions = ch_versions.mix(FINAL_GFF_CHECK.out.versions.first())
+
+
+    emit:
+    final_gff = ch_final_gff // [ meta, gff ]
+    versions = ch_versions // [ versions.yml ]
+}
diff --git a/subworkflows/local/purge_nohit_models.nf b/subworkflows/local/purge_nohit_models.nf
index b75cb07..5c22f5f 100644
--- a/subworkflows/local/purge_nohit_models.nf
+++ b/subworkflows/local/purge_nohit_models.nf
@@ -58,41 +58,7 @@ workflow PURGE_NOHIT_MODELS {
     ch_target_purged_gff = AGAT_SPFILTERFEATUREFROMKILLLIST.out.gff
     ch_versions = ch_versions.mix(AGAT_SPFILTERFEATUREFROMKILLLIST.out.versions.first())
 
-    // COLLECTFILE: Mark transcripts with description=hypothetical%20protein%20%7C%20no%20eggnog%20hit
-    ch_marked_gff = val_purge_nohits
-    ? Channel.empty()
-    : ch_target_gff
-    | join(ch_kill_list)
-    | map { meta, gff, lst ->
-        def tx_without_hits = lst.readLines().collect { "$it".trim() }
-
-        def marked_gff_lines = gff.readLines()
-            .collect { line ->
-
-                if ( line.startsWith('#') ) { return line }
-
-                def cols = line.split('\t')
-                def feat = cols[2]
-
-                if ( ! ( feat == 'transcript' || feat == 'mRNA' ) ) { return line }
-
-                def atts = cols[8]
-                def matches = atts =~ /ID=([^;]*)/
-                def tx_id = matches[0][1]
-
-                if ( ! ( tx_id in tx_without_hits ) ) { return line }
-
-                return ( line + ';description=hypothetical%20protein%20%7C%20no%20eggnog%20hit' )
-            }
-
-        [ "${meta.id}.marked.gff" ] + marked_gff_lines.join('\n')
-    }
-    | collectFile(newLine: true)
-    | map { file ->
-        [ [ id: file.baseName.replace('.marked', '') ], file ]
-    }
-
     emit:
-    purged_or_marked_gff = ch_target_purged_gff.mix(ch_marked_gff)
+    purged_gff = ch_target_purged_gff.mix(val_purge_nohits ? Channel.empty() : ch_target_gff)
     versions = ch_versions // [ versions.yml ]
 }
diff --git a/workflows/pangene.nf b/workflows/pangene.nf
index 3712a34..66a0459 100644
--- a/workflows/pangene.nf
+++ b/workflows/pangene.nf
@@ -10,6 +10,7 @@ include { PURGE_BREAKER_MODELS                  } from '../subworkflows/local/pu
 include { GFF_MERGE_CLEANUP                     } from '../subworkflows/local/gff_merge_cleanup'
 include { GFF_EGGNOGMAPPER                      } from '../subworkflows/local/gff_eggnogmapper'
 include { PURGE_NOHIT_MODELS                    } from '../subworkflows/local/purge_nohit_models'
+include { GFF_STORE                             } from '../subworkflows/local/gff_store'
 include { CUSTOM_DUMPSOFTWAREVERSIONS           } from '../modules/nf-core/custom/dumpsoftwareversions'
 
 log.info paramsSummaryLog(workflow)
@@ -232,6 +233,7 @@ workflow PANGENE {
     )
 
     ch_eggnogmapper_hits = GFF_EGGNOGMAPPER.out.eggnogmapper_hits
+    ch_eggnogmapper_annotations = GFF_EGGNOGMAPPER.out.eggnogmapper_annotations
     ch_versions = ch_versions.mix(GFF_EGGNOGMAPPER.out.versions)
 
     // SUBWORKFLOW: PURGE_NOHIT_MODELS
@@ -241,9 +243,15 @@ workflow PANGENE {
         params.eggnogmapper_purge_nohits
     )
 
-    ch_purged_marked_gff = PURGE_NOHIT_MODELS.out.purged_or_marked_gff
+    ch_purged_gff = PURGE_NOHIT_MODELS.out.purged_gff
     ch_versions = ch_versions.mix(PURGE_NOHIT_MODELS.out.versions)
 
+    // SUBWORKFLOW: GFF_STORE
+    GFF_STORE(
+        ch_purged_gff,
+        ch_eggnogmapper_annotations
+    )
+
     // MODULE: CUSTOM_DUMPSOFTWAREVERSIONS
     CUSTOM_DUMPSOFTWAREVERSIONS (
         ch_versions.unique().collectFile(name: 'collated_versions.yml')