Skip to content

Commit

Permalink
Added GFF_STORE sub workflow
Browse files Browse the repository at this point in the history
  • Loading branch information
GallVp committed Apr 30, 2024
1 parent f6bc9c7 commit f819e6b
Show file tree
Hide file tree
Showing 5 changed files with 144 additions and 37 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
32. Now using `description` field to store notes and textual annotations in the gff files
33. Now using `mRNA` in place of `transcript` in gff files
34. Now `eggnogmapper_purge_nohits` is set to `false` by default
35. Added `GFF_STORE` sub workflow

### `Fixed`

Expand Down
20 changes: 19 additions & 1 deletion conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ process {
}

withName: '.*:GFF_MERGE_CLEANUP:GT_GFF3' {
ext.args = '-tidy -retainids'
ext.args = '-tidy -retainids -sort'
}
}

Expand All @@ -220,6 +220,12 @@ process {
"--itype proteins",
'--go_evidence all'
].join(' ').trim()

publishDir = [
path: { "${params.outdir}/final/$meta.id" },
mode: "copy",
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
]
}
}

Expand All @@ -229,6 +235,18 @@ process {
}
}

// Settings for the FINAL_GFF_CHECK process: command-line arguments and
// where its outputs are published.
process {
    withName: 'FINAL_GFF_CHECK' {
        // Extra arguments handed to the tool through ext.args
        ext.args = '-tidy -retainids -sort'

        publishDir = [
            // One output folder per sample id under <outdir>/final
            path: { "${params.outdir}/final/${meta.id}" },
            mode: 'copy',
            // saveAs returns null for versions.yml so that file is not published;
            // every other file keeps its name
            saveAs: { filename ->
                if ( filename.equals('versions.yml') ) { return null }
                return filename
            }
        ]
    }
}

process {
withName: 'CUSTOM_DUMPSOFTWAREVERSIONS' {
publishDir = [
Expand Down
114 changes: 114 additions & 0 deletions subworkflows/local/gff_store.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
import java.net.URLEncoder

include { GT_GFF3 as FINAL_GFF_CHECK } from '../../modules/nf-core/gt/gff3/main'

/*
 * GFF_STORE
 *
 * Adds a `description` attribute to gene and transcript/mRNA features of a
 * GFF file using column 8 (0-based index 7) of the eggnogmapper annotations
 * table, then passes the result through GT_GFF3 (aliased FINAL_GFF_CHECK).
 *
 * take:
 *   ch_target_gff                  [ meta, gff ]
 *   ch_eggnogmapper_annotations    [ meta, annotations ]
 *
 * emit:
 *   final_gff                      [ meta, gff ]
 *   versions                       [ versions.yml ]
 *
 * Rules applied while describing features:
 *   - a transcript with an annotation other than '-' gets that text, URL-encoded
 *   - a transcript without one gets 'hypothetical protein | no eggnog hit'
 *   - a gene gets the description shared by all of its transcripts, or
 *     'differing isoform descriptions' when they are not all equal
 *   - comment lines, lines with fewer than 9 columns and features lacking the
 *     needed ID/Parent attribute are passed through unchanged instead of
 *     raising an index error
 *
 * NOTE(review): `join` only emits samples present in both input channels, and
 * the meta map is rebuilt from the file name after collectFile, so only `id`
 * is carried forward -- confirm downstream steps need nothing else from meta.
 */
workflow GFF_STORE {
    take:
    ch_target_gff                   // [ meta, gff ]
    ch_eggnogmapper_annotations     // [ meta, annotations ]

    main:
    ch_versions                     = Channel.empty()

    // COLLECTFILE: Add eggnogmapper hits to gff
    ch_described_gff                = ch_target_gff
                                    | join(ch_eggnogmapper_annotations)
                                    | map { meta, gff, annotations ->

                                        // Keys are anchored to the start of the attribute column or to a
                                        // preceding ';' so that 'ID=' is not matched inside keys such as
                                        // 'geneID=' or 'transcript_ID='
                                        def id_pattern      = ~/(?:^|;)\s*ID=([^;]*)/
                                        def parent_pattern  = ~/(?:^|;)\s*Parent=([^;]*)/

                                        // Returns the first captured value, or null when the key is absent
                                        def first_capture   = { pattern, text ->
                                            def matcher = pattern.matcher(text)
                                            return matcher.find() ? matcher.group(1) : null
                                        }

                                        // URLEncoder emits '+' for spaces; GFF attribute values need '%20'
                                        def encode          = { text ->
                                            return URLEncoder.encode(text, "UTF-8").replace('+', '%20')
                                        }

                                        def is_tx           = { feat ->
                                            return ( feat == 'transcript' || feat == 'mRNA' )
                                        }

                                        // transcript id -> description text (rows with '-' are ignored)
                                        def tx_annotations  = [:]
                                        annotations.readLines().each { line ->
                                            if ( line.startsWith('#') ) { return }

                                            def cols = line.split('\t')

                                            // Skip blank or truncated rows rather than failing on cols[7]
                                            if ( cols.length < 8 ) { return }
                                            if ( cols[7] == '-' ) { return }

                                            tx_annotations[cols[0]] = cols[7]
                                        }

                                        // Read the GFF once; it is scanned twice below
                                        def gff_input_lines = gff.readLines()

                                        // gene id -> [ transcript id -> encoded description ]
                                        def gene_tx_annotations = [:]
                                        gff_input_lines.each { line ->
                                            if ( line.startsWith('#') ) { return }

                                            def cols = line.split('\t')

                                            if ( cols.length < 9 ) { return }
                                            if ( ! is_tx(cols[2]) ) { return }

                                            def tx_id   = first_capture(id_pattern, cols[8])
                                            def gene_id = first_capture(parent_pattern, cols[8])

                                            // A transcript without ID or Parent cannot be linked to a gene
                                            if ( tx_id == null || gene_id == null ) { return }

                                            if ( ! gene_tx_annotations.containsKey(gene_id) ) {
                                                gene_tx_annotations[gene_id] = [:]
                                            }

                                            def anno = tx_annotations.containsKey(tx_id) ? encode(tx_annotations[tx_id]) : encode('hypothetical protein | no eggnog hit')

                                            gene_tx_annotations[gene_id] += [ ( tx_id ): anno ]
                                        }

                                        // Add a 'default' entry per gene: the common transcript description,
                                        // or a fixed marker when the isoforms disagree
                                        gene_tx_annotations = gene_tx_annotations
                                            .collectEntries { gene_id, tx_annos ->
                                                def default_anno    = tx_annos.values().first()
                                                def isoforms_differ = tx_annos.values().any { it != default_anno }
                                                def gene_anno       = isoforms_differ ? 'differing%20isoform%20descriptions' : default_anno

                                                [ gene_id, ( tx_annos + [ 'default': gene_anno ] ) ]
                                            }

                                        def gff_lines = gff_input_lines
                                            .collect { line ->

                                                if ( line.startsWith('#') ) { return line }

                                                def cols = line.split('\t')

                                                if ( cols.length < 9 ) { return line }

                                                def feat = cols[2]
                                                def atts = cols[8]

                                                if ( ! ( feat == 'gene' || is_tx(feat) ) ) { return line }

                                                // Genes are keyed by their own ID, transcripts by their Parent
                                                def gene_id = feat == 'gene' ? first_capture(id_pattern, atts) : first_capture(parent_pattern, atts)

                                                if ( gene_id == null || ! gene_tx_annotations.containsKey(gene_id) ) { return line }

                                                def desc = feat == 'gene' ? gene_tx_annotations[gene_id]['default'] : gene_tx_annotations[gene_id][first_capture(id_pattern, atts)]

                                                // No description recorded for this feature; leave the line as it is
                                                if ( desc == null ) { return line }

                                                // Avoid producing an empty ';;' attribute when the line already ends with ';'
                                                def sep = line.endsWith(';') ? '' : ';'

                                                return ( line + "${sep}description=$desc" )
                                            }

                                        [ "${meta.id}.described.gff" ] + gff_lines.join('\n')
                                    }
                                    | collectFile(newLine: true)
                                    | map { file ->
                                        [ [ id: file.baseName.replace('.described', '') ], file ]
                                    }

    // MODULE: GT_GFF3 as FINAL_GFF_CHECK
    FINAL_GFF_CHECK ( ch_described_gff )

    ch_final_gff                    = FINAL_GFF_CHECK.out.gt_gff3
    ch_versions                     = ch_versions.mix(FINAL_GFF_CHECK.out.versions.first())


    emit:
    final_gff                       = ch_final_gff  // [ meta, gff ]
    versions                        = ch_versions   // [ versions.yml ]
}
36 changes: 1 addition & 35 deletions subworkflows/local/purge_nohit_models.nf
Original file line number Diff line number Diff line change
Expand Up @@ -58,41 +58,7 @@ workflow PURGE_NOHIT_MODELS {
ch_target_purged_gff = AGAT_SPFILTERFEATUREFROMKILLLIST.out.gff
ch_versions = ch_versions.mix(AGAT_SPFILTERFEATUREFROMKILLLIST.out.versions.first())

// COLLECTFILE: Mark transcripts with description=hypothetical%20protein%20%7C%20no%20eggnog%20hit
ch_marked_gff = val_purge_nohits
? Channel.empty()
: ch_target_gff
| join(ch_kill_list)
| map { meta, gff, lst ->
def tx_without_hits = lst.readLines().collect { "$it".trim() }

def marked_gff_lines = gff.readLines()
.collect { line ->

if ( line.startsWith('#') ) { return line }

def cols = line.split('\t')
def feat = cols[2]

if ( ! ( feat == 'transcript' || feat == 'mRNA' ) ) { return line }

def atts = cols[8]
def matches = atts =~ /ID=([^;]*)/
def tx_id = matches[0][1]

if ( ! ( tx_id in tx_without_hits ) ) { return line }

return ( line + ';description=hypothetical%20protein%20%7C%20no%20eggnog%20hit' )
}

[ "${meta.id}.marked.gff" ] + marked_gff_lines.join('\n')
}
| collectFile(newLine: true)
| map { file ->
[ [ id: file.baseName.replace('.marked', '') ], file ]
}

emit:
purged_or_marked_gff = ch_target_purged_gff.mix(ch_marked_gff)
purged_gff = ch_target_purged_gff.mix(val_purge_nohits ? Channel.empty() : ch_target_gff)
versions = ch_versions // [ versions.yml ]
}
10 changes: 9 additions & 1 deletion workflows/pangene.nf
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ include { PURGE_BREAKER_MODELS } from '../subworkflows/local/pu
include { GFF_MERGE_CLEANUP } from '../subworkflows/local/gff_merge_cleanup'
include { GFF_EGGNOGMAPPER } from '../subworkflows/local/gff_eggnogmapper'
include { PURGE_NOHIT_MODELS } from '../subworkflows/local/purge_nohit_models'
include { GFF_STORE } from '../subworkflows/local/gff_store'
include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions'

log.info paramsSummaryLog(workflow)
Expand Down Expand Up @@ -232,6 +233,7 @@ workflow PANGENE {
)

ch_eggnogmapper_hits = GFF_EGGNOGMAPPER.out.eggnogmapper_hits
ch_eggnogmapper_annotations = GFF_EGGNOGMAPPER.out.eggnogmapper_annotations
ch_versions = ch_versions.mix(GFF_EGGNOGMAPPER.out.versions)

// SUBWORKFLOW: PURGE_NOHIT_MODELS
Expand All @@ -241,9 +243,15 @@ workflow PANGENE {
params.eggnogmapper_purge_nohits
)

ch_purged_marked_gff = PURGE_NOHIT_MODELS.out.purged_or_marked_gff
ch_purged_gff = PURGE_NOHIT_MODELS.out.purged_gff
ch_versions = ch_versions.mix(PURGE_NOHIT_MODELS.out.versions)

// SUBWORKFLOW: GFF_STORE
GFF_STORE(
ch_purged_gff,
ch_eggnogmapper_annotations
)

// MODULE: CUSTOM_DUMPSOFTWAREVERSIONS
CUSTOM_DUMPSOFTWAREVERSIONS (
ch_versions.unique().collectFile(name: 'collated_versions.yml')
Expand Down

0 comments on commit f819e6b

Please sign in to comment.