From f819e6b32c49460ceab5bdad9551a0041513d82f Mon Sep 17 00:00:00 2001
From: Usman Rashid <usman@smme.edu.pk>
Date: Wed, 24 Apr 2024 09:59:22 +1200
Subject: [PATCH] Added GFF_STORE sub workflow

---
 CHANGELOG.md                             |   1 +
 conf/modules.config                      |  20 +++-
 subworkflows/local/gff_store.nf          | 114 +++++++++++++++++++++++
 subworkflows/local/purge_nohit_models.nf |  36 +------
 workflows/pangene.nf                     |  10 +-
 5 files changed, 144 insertions(+), 37 deletions(-)
 create mode 100644 subworkflows/local/gff_store.nf

diff --git a/CHANGELOG.md b/CHANGELOG.md
index cfcb217..952ea21 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -41,6 +41,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 32. Now using `description` field to store notes and textual annotations in the gff files
 33. Now using `mRNA` in place of `transcript` in gff files
 34. Now `eggnogmapper_purge_nohits` is set to `false` by default
+35. Added `GFF_STORE` subworkflow
 
 ### `Fixed`
 
diff --git a/conf/modules.config b/conf/modules.config
index 215847d..66163de 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -202,7 +202,7 @@ process {
     }
 
     withName: '.*:GFF_MERGE_CLEANUP:GT_GFF3' {
-        ext.args = '-tidy -retainids'
+        ext.args = '-tidy -retainids -sort'
     }
 }
 
@@ -220,6 +220,12 @@ process {
             "--itype proteins",
             '--go_evidence all'
         ].join(' ').trim()
+
+        publishDir = [
+            path: { "${params.outdir}/final/$meta.id" },
+            mode: "copy",
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+        ]
     }
 }
 
@@ -229,6 +235,18 @@ process {
     }
 }
 
+process {
+    withName: 'FINAL_GFF_CHECK' { // settings for the process named FINAL_GFF_CHECK
+        ext.args = '-tidy -retainids -sort' // same flags as configured for GFF_MERGE_CLEANUP:GT_GFF3
+
+        publishDir = [
+            path: { "${params.outdir}/final/$meta.id" }, // one folder per meta.id under <outdir>/final
+            mode: "copy",
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, // null for versions.yml, i.e. it is not published
+        ]
+    }
+}
+
 process {
     withName: 'CUSTOM_DUMPSOFTWAREVERSIONS' {
         publishDir = [
diff --git a/subworkflows/local/gff_store.nf b/subworkflows/local/gff_store.nf
new file mode 100644
index 0000000..f487a50
--- /dev/null
+++ b/subworkflows/local/gff_store.nf
@@ -0,0 +1,114 @@
+import java.net.URLEncoder
+
+include { GT_GFF3 as FINAL_GFF_CHECK    } from '../../modules/nf-core/gt/gff3/main'
+
+workflow GFF_STORE {
+    take:
+    ch_target_gff               // [ meta, gff ]
+    ch_eggnogmapper_annotations // [ meta, annotations ]
+
+    main:
+    ch_versions                 = Channel.empty()
+
+    // COLLECTFILE: Add eggnogmapper hits to gff
+    // Each gene/transcript/mRNA line gains a `description` attribute: transcripts get
+    // their own eggnog text, genes get the text shared by their isoforms (or a note
+    // that the isoforms differ). Lines that cannot be parsed are passed through as is.
+    ch_described_gff            = ch_target_gff
+                                | join(ch_eggnogmapper_annotations)
+                                | map { meta, gff, annotations ->
+                                    // Percent-encode free text so that it is safe inside gff column 9
+                                    def encode      = { String txt -> URLEncoder.encode(txt, "UTF-8").replace('+', '%20') }
+
+                                    // Value of attribute `key`, or null when absent. The match is anchored on
+                                    // the column start or a ';' so that `ID` never matches e.g. `sequence_ID`
+                                    def att_val     = { String atts, String key ->
+                                        def matcher = atts =~ "(?:^|;)${key}=([^;]*)"
+                                        matcher.find() ? matcher.group(1) : null
+                                    }
+
+                                    def is_tx       = { String feat -> feat == 'transcript' || feat == 'mRNA' }
+
+                                    // Transcript id -> raw description (cols[7]); rows that are blank,
+                                    // truncated or carry the '-' placeholder are skipped
+                                    def tx_annotations  = annotations.readLines()
+                                        .findAll { ! it.startsWith('#') }
+                                        .collect { it.split('\t').toList() }
+                                        .findAll { cols -> cols.size() > 7 && cols[7] != '-' }
+                                        .collectEntries { cols -> [ cols[0], cols[7] ] }
+
+                                    // Read the gff once; it is scanned twice below
+                                    def gff_lines_in    = gff.readLines()
+
+                                    // Gene id -> [ transcript id: encoded description ]
+                                    def gene_tx_annotations = [:]
+                                    gff_lines_in.each { line ->
+                                        if ( line.startsWith('#') ) { return }
+
+                                        def cols    = line.split('\t')
+                                        if ( cols.size() < 9 || ! is_tx(cols[2]) ) { return }
+
+                                        def tx_id   = att_val(cols[8], 'ID')
+                                        def gene_id = att_val(cols[8], 'Parent')
+
+                                        // Without both ID and Parent the transcript cannot be tied to a gene
+                                        if ( tx_id == null || gene_id == null ) { return }
+
+                                        def anno    = tx_annotations.containsKey(tx_id)
+                                                    ? encode(tx_annotations[tx_id])
+                                                    : encode('hypothetical protein | no eggnog hit')
+
+                                        gene_tx_annotations.get(gene_id, [:])[tx_id] = anno
+                                    }
+
+                                    // Gene id -> description for the gene line. Kept in its own map so that a
+                                    // transcript whose ID is literally 'default' cannot clobber it
+                                    def gene_annotations = gene_tx_annotations
+                                        .collectEntries { gene_id, tx_annos ->
+                                            def first_anno  = tx_annos.values().first()
+                                            def differing   = tx_annos.values().any { it != first_anno }
+
+                                            [ gene_id, differing ? 'differing%20isoform%20descriptions' : first_anno ]
+                                        }
+
+                                    def gff_lines = gff_lines_in
+                                        .collect { line ->
+                                            if ( line.startsWith('#') ) { return line }
+
+                                            def cols    = line.split('\t')
+                                            if ( cols.size() < 9 ) { return line }
+
+                                            def feat    = cols[2]
+                                            def atts    = cols[8]
+                                            def desc    = null
+
+                                            if ( feat == 'gene' ) {
+                                                desc    = gene_annotations[att_val(atts, 'ID')]
+                                            } else if ( is_tx(feat) ) {
+                                                desc    = gene_tx_annotations[att_val(atts, 'Parent')]?.get(att_val(atts, 'ID'))
+                                            }
+
+                                            if ( desc == null ) { return line }
+
+                                            // Do not emit ';;' when column 9 already ends with a separator
+                                            def sep     = line.endsWith(';') ? '' : ';'
+                                            return ( line + sep + "description=$desc" )
+                                        }
+
+                                    [ "${meta.id}.described.gff" ] + gff_lines.join('\n')
+                                }
+                                | collectFile(newLine: true)
+                                | map { file ->
+                                    [ [ id: file.baseName.replace('.described', '') ], file ]
+                                }
+
+    // MODULE: GT_GFF3 as FINAL_GFF_CHECK
+    FINAL_GFF_CHECK ( ch_described_gff )
+
+    ch_final_gff                = FINAL_GFF_CHECK.out.gt_gff3
+    ch_versions                 = ch_versions.mix(FINAL_GFF_CHECK.out.versions.first())
+
+    emit:
+    final_gff                   = ch_final_gff          // [ meta, gff ]
+    versions                    = ch_versions           // [ versions.yml ]
+}
diff --git a/subworkflows/local/purge_nohit_models.nf b/subworkflows/local/purge_nohit_models.nf
index b75cb07..5c22f5f 100644
--- a/subworkflows/local/purge_nohit_models.nf
+++ b/subworkflows/local/purge_nohit_models.nf
@@ -58,41 +58,7 @@ workflow PURGE_NOHIT_MODELS {
     ch_target_purged_gff        = AGAT_SPFILTERFEATUREFROMKILLLIST.out.gff
     ch_versions                 = ch_versions.mix(AGAT_SPFILTERFEATUREFROMKILLLIST.out.versions.first())
 
-    // COLLECTFILE: Mark transcripts with description=hypothetical%20protein%20%7C%20no%20eggnog%20hit
-    ch_marked_gff               = val_purge_nohits
-                                ? Channel.empty()
-                                : ch_target_gff
-                                | join(ch_kill_list)
-                                | map { meta, gff, lst ->
-                                    def tx_without_hits = lst.readLines().collect { "$it".trim() }
-
-                                    def marked_gff_lines = gff.readLines()
-                                        .collect { line ->
-
-                                            if ( line.startsWith('#') ) { return line }
-
-                                            def cols = line.split('\t')
-                                            def feat = cols[2]
-
-                                            if ( ! ( feat == 'transcript' || feat == 'mRNA' ) ) { return line }
-
-                                            def atts    = cols[8]
-                                            def matches = atts =~ /ID=([^;]*)/
-                                            def tx_id   = matches[0][1]
-
-                                            if ( ! ( tx_id in tx_without_hits ) ) { return line }
-
-                                            return ( line + ';description=hypothetical%20protein%20%7C%20no%20eggnog%20hit' )
-                                        }
-
-                                    [ "${meta.id}.marked.gff" ] + marked_gff_lines.join('\n')
-                                }
-                                | collectFile(newLine: true)
-                                | map { file ->
-                                    [ [ id: file.baseName.replace('.marked', '') ], file ]
-                                }
-
     emit:
-    purged_or_marked_gff        = ch_target_purged_gff.mix(ch_marked_gff)
+    purged_gff                  = ch_target_purged_gff.mix(val_purge_nohits ? Channel.empty() : ch_target_gff)
     versions                    = ch_versions   // [ versions.yml ]
 }
diff --git a/workflows/pangene.nf b/workflows/pangene.nf
index 3712a34..66a0459 100644
--- a/workflows/pangene.nf
+++ b/workflows/pangene.nf
@@ -10,6 +10,7 @@ include { PURGE_BREAKER_MODELS                  } from '../subworkflows/local/pu
 include { GFF_MERGE_CLEANUP                     } from '../subworkflows/local/gff_merge_cleanup'
 include { GFF_EGGNOGMAPPER                      } from '../subworkflows/local/gff_eggnogmapper'
 include { PURGE_NOHIT_MODELS                    } from '../subworkflows/local/purge_nohit_models'
+include { GFF_STORE                             } from '../subworkflows/local/gff_store'
 include { CUSTOM_DUMPSOFTWAREVERSIONS           } from '../modules/nf-core/custom/dumpsoftwareversions'
 
 log.info paramsSummaryLog(workflow)
@@ -232,6 +233,7 @@ workflow PANGENE {
     )
 
     ch_eggnogmapper_hits        = GFF_EGGNOGMAPPER.out.eggnogmapper_hits
+    ch_eggnogmapper_annotations = GFF_EGGNOGMAPPER.out.eggnogmapper_annotations
     ch_versions                 = ch_versions.mix(GFF_EGGNOGMAPPER.out.versions)
 
     // SUBWORKFLOW: PURGE_NOHIT_MODELS
@@ -241,9 +243,15 @@ workflow PANGENE {
         params.eggnogmapper_purge_nohits
     )
 
-    ch_purged_marked_gff        = PURGE_NOHIT_MODELS.out.purged_or_marked_gff
+    ch_purged_gff               = PURGE_NOHIT_MODELS.out.purged_gff
     ch_versions                 = ch_versions.mix(PURGE_NOHIT_MODELS.out.versions)
 
+    // SUBWORKFLOW: GFF_STORE
+    GFF_STORE(
+        ch_purged_gff,
+        ch_eggnogmapper_annotations
+    )
+
     // MODULE: CUSTOM_DUMPSOFTWAREVERSIONS
     CUSTOM_DUMPSOFTWAREVERSIONS (
         ch_versions.unique().collectFile(name: 'collated_versions.yml')