Merge pull request #30 from B-UMMI/add_snps

add number of snps to per reference table data
B-UMMI · Aug 23, 2022 · 9d8165a · 9d8165a
2 parents faf9a33 + 233ac5f
commit 9d8165a
Show file tree

Hide file tree

Showing 6 changed files with 84 additions and 11 deletions.
diff --git a/main.nf b/main.nf
@@ -98,7 +98,8 @@ workflow {
               postprocessing_wf.out.ngx_json, 
               postprocessing_wf.out.misassembly_reference_json, 
               postprocessing_wf.out.misassembly_plot_json, 
-              assembly_wf.out.all_versions)
+              assembly_wf.out.all_versions,
+              postprocessing_wf.out.snp_report_json)
 }
 
 workflow.onComplete {

diff --git a/modules/postprocessing/postprocessing.nf b/modules/postprocessing/postprocessing.nf
@@ -164,6 +164,7 @@ process SNP_ASSESSMENT {
     output:
     path('*.tsv'), emit: tsv
     path('*_snps.csv'), emit: csv
+    path('*_snps.json'), emit: json
 
     script:
     template "snp_assessment.py"
@@ -176,10 +177,12 @@ process PLOT_SNP_REFERENCE {
 
     input:
     path snp_coords_dataframes 
+    path snps_jsons
 
     output:
     path('*.html') optional true
-    path('*.json'), emit: json
+    path('snps_in_reference.json'), emit: json
+    path('snps_report_per_ref.json'), emit: reference_snps_json
 
     script:
     template "plot_snp.py"
@@ -278,7 +281,7 @@ workflow postprocessing_wf {
     PLOT_GAP_BOXPLOT(GAP_ASSESSMENT.out.json | collect)
     PLOT_GAP_REFERENCE(GAP_ASSESSMENT.out.csv | collect)
     SNP_ASSESSMENT(paf, triple_reference)
-    PLOT_SNP_REFERENCE(SNP_ASSESSMENT.out.csv | collect)
+    PLOT_SNP_REFERENCE(SNP_ASSESSMENT.out.csv | collect, SNP_ASSESSMENT.out.json | collect)
     MISASSEMBLY(misassembly_paf)
     PROCESS_MISASSEMBLY(MISASSEMBLY.out.trace_pkl | collect, MISASSEMBLY.out.contig_length_pkl | collect, MISASSEMBLY.out.misassembly_json | collect, MISASSEMBLY.out.misassembled_reference_json | collect)
     PLOT_MISASSEMBLY(MISASSEMBLY.out.csv | collect)
@@ -290,6 +293,7 @@ workflow postprocessing_wf {
     phred_json = PROCESS_SHRIMP_PLOT.out.json
     gap_reference_json = PLOT_GAP_REFERENCE.out.json
     snp_reference_json = PLOT_SNP_REFERENCE.out.json
+    snp_report_json = PLOT_SNP_REFERENCE.out.reference_snps_json
     gap_boxplot_json = PLOT_GAP_BOXPLOT.out.json
     misassembly_json = PROCESS_MISASSEMBLY.out.json
     misassembly_report_json = PROCESS_MISASSEMBLY.out.report_json

diff --git a/modules/report/report.nf b/modules/report/report.nf
@@ -44,6 +44,7 @@ process COMPILE_REPORT {
     file plot_misassembly_per_ref 
     file about_md 
     file containers_config 
+    file snp_report_json
 
     output:
     path('pipeline_report*.json')
@@ -84,9 +85,10 @@ workflow report_wf {
     misassembly_reference
     plot_misassembly
     version_file
+    snp_report_json
 
     main:
     COMPILE_VERSIONS(version_file)
-    COMPILE_REPORT(reads_info, stats_global, pipeline_stats, js, lmas_png, reference, plot_contig_distribution, stats_mapping, plot_completness, plot_lx, plot_phred, plot_gap_reference, plot_snp_reference, plot_gap_boxplot, misassembly_info, misassembly_report, plot_nax, plot_ngx, COMPILE_VERSIONS.out, misassembly_reference, plot_misassembly, IN_MD, containers_config)
+    COMPILE_REPORT(reads_info, stats_global, pipeline_stats, js, lmas_png, reference, plot_contig_distribution, stats_mapping, plot_completness, plot_lx, plot_phred, plot_gap_reference, plot_snp_reference, plot_gap_boxplot, misassembly_info, misassembly_report, plot_nax, plot_ngx, COMPILE_VERSIONS.out, misassembly_reference, plot_misassembly, IN_MD, containers_config, snp_report_json)
 
 }
diff --git a/templates/compile_reports.py b/templates/compile_reports.py
@@ -41,6 +41,7 @@
     ABOUT_MD = "$about_md"
     CONTAINERS = "$containers_config"
     PLOT_MISASSEMBLY_PER_REFERENCE = "$plot_misassembly_per_ref"
+    SNP_REPORT_REFERENCE = "$snp_report_json"
 
     logger.debug("Running {} with parameters:".format(
         os.path.basename(__file__)))
@@ -69,6 +70,7 @@
     logger.debug("ABOUT_MD: {}".format(ABOUT_MD))
     logger.debug("CONTAINERS: {}".format(CONTAINERS))
     logger.debug("CONTAINERS: {}".format(PLOT_MISASSEMBLY_PER_REFERENCE))
+    logger.debug("SNP_REPORT_REFERENCE: {}".format(SNP_REPORT_REFERENCE))
 
 
 html_template = """
@@ -327,7 +329,7 @@ def process_sample_reads(reads_jsons):
 def main(main_js, pipeline_stats, assembly_stats_report, contig_size_plots, mapping_stats_report, completness_plot,
          lmas_logo, reference_file, lx_json, shrimp_json, gap_reference_json, gap_histogram, plot_misassembly, misassembly_report,
          min_contig_size, nax_json, ngx_json, reads_json, snp_reference_json, versions_json, misassembly_per_ref, about_md,
-         containers_config, plot_misassembly_per_reference_json):
+         containers_config, plot_misassembly_per_reference_json, snp_per_ref):
 
     metadata = {
         "nfMetadata": {
@@ -433,6 +435,29 @@ def main(main_js, pipeline_stats, assembly_stats_report, contig_size_plots, mapp
                         except KeyError:
                             item['misassembled_contigs'] = 0
                             item['misassembly_events'] = 0
+
+    # add snp stats
+    with open(snp_per_ref) as snp_fh:
+        snp_stats = json.load(snp_fh)
+        for sample_id in main_data_tables_js.keys():
+            for reference in main_data_tables_js[sample_id]["ReferenceTables"].keys():
+                for item_row in main_data_tables_js[sample_id]["ReferenceTables"][reference]:
+                    for item in item_row:
+                        assembler = item['assembler']
+                        try:
+                            references = list(
+                                snp_stats[sample_id][assembler][0].keys())
+                            if reference in references:
+                                index = references.index(reference)
+                                ref_name = references[index]
+                                try:
+                                    item['snps'] = snp_stats[sample_id][assembler][0][ref_name]["snps"]
+                                except KeyError:
+                                    item['snps'] = 0
+                            else:
+                                item['snps'] = 0
+                        except KeyError:
+                            item['snps'] = 0
 
     for sample_id in main_data_tables_js.keys():
         main_data_plots_js[sample_id] = {}
@@ -628,5 +653,5 @@ def main(main_js, pipeline_stats, assembly_stats_report, contig_size_plots, mapp
     main(MAIN_JS, PIPELINE_STATS, ASSEMBLY_STATS_REPORT, CONTIG_SIZE_DISTRIBUTION, MAPPING_STATS_REPORT,
          COMPLETNESS_JSON, LMAS_LOGO, REFERENCE_FILE, LX_JSON, SHRIMP_JSON, GAP_REFERENCE_JSON, GAP_HISTOGRAM,
          MISASSEMBLY_PLOT, MISASSEMBLY_REPORT, MIN_CONTIG_SIZE, NAX_JSON, NGX_JSON, READS_NUMBER, SNP_REFERENCE_JSON,
-         VERSIONS_JSON, MISASSEMBLY_PER_REF, ABOUT_MD, CONTAINERS, PLOT_MISASSEMBLY_PER_REFERENCE)
+         VERSIONS_JSON, MISASSEMBLY_PER_REF, ABOUT_MD, CONTAINERS, PLOT_MISASSEMBLY_PER_REFERENCE, SNP_REPORT_REFERENCE)
 
diff --git a/templates/plot_snp.py b/templates/plot_snp.py
@@ -50,12 +50,37 @@
 
 if __file__.endswith(".command.sh"):
     DATAFRAME_LIST = '$snp_coords_dataframes'.split()
+    JSON_LIST = '$snps_jsons'.split()
     logger.debug("Running {} with parameters:".format(
         os.path.basename(__file__)))
     logger.debug("DATAFRAME_LIST: {}".format(DATAFRAME_LIST))
-
-
-def main(dataframes):
+    logger.debug("JSON_LIST: {}".format(JSON_LIST))
+
+def snps_per_ref(report_per_reference):
+    """
+    """
+    master_report_data_per_reference = {}
+    for report_reference in report_per_reference:
+        with open(report_reference) as json_fh:
+            data_json = json.load(json_fh)
+            print(data_json)
+            if data_json["sample"] not in master_report_data_per_reference.keys():
+                master_report_data_per_reference[data_json["sample"]] = {
+                    data_json["assembler"]: [data_json["reference"]]}
+            elif data_json["assembler"] not in master_report_data_per_reference[data_json["sample"]].keys():
+                master_report_data_per_reference[data_json["sample"]][data_json["assembler"]] = [
+                    data_json["reference"]]
+            else:
+                master_report_data_per_reference[data_json["sample"]][data_json["assembler"]].append(
+                    data_json["reference"])
+
+    logger.debug("Global report data: {}". format(master_report_data_per_reference))
+
+    with open("snps_report_per_ref.json", "w") as json_report:
+        json_report.write(json.dumps(
+            master_report_data_per_reference, separators=(",", ":")))
+
+def main(dataframes, jsons):
 
     li = []
     for filename in dataframes:
@@ -154,7 +179,10 @@ def main(dataframes):
 
     with open('snps_in_reference.json', 'w') as json_report:
         json_report.write(json.dumps(report_dict, separators=(",", ":")))
+
+    # SNPS STATS PER REF
+    snps_per_ref(jsons)
 
 
 if __name__ == '__main__':
-    main(DATAFRAME_LIST)
+    main(DATAFRAME_LIST, JSON_LIST)
diff --git a/templates/snp_assessment.py b/templates/snp_assessment.py
@@ -33,6 +33,7 @@
 from itertools import groupby
 from numpy.lib.function_base import append
 import pandas as pd
+import json
 try:
     import utils
 except ImportError:
@@ -123,6 +124,9 @@ def main(sample_id, assembler, assembly, mapping, reference):
 
     df = pd.DataFrame(columns=COLUMNS)
 
+    reference_report = {"sample": sample_id,
+                    "assembler": assembler, "reference": {}}
+
     # iterator for reference files (sequence length is needed)
     references = (x[1] for x in groupby(open(reference, "r"), lambda line: line[0] == ">"))
 
@@ -139,7 +143,16 @@ def main(sample_id, assembler, assembly, mapping, reference):
             #print(coord, substitution)
             df = df.append({'Sample': sample_id, 'Assembler': assembler, 'Reference': reference_name,
                             'Reference Length': len(seq)/3, 'SNP Location': coord, 'Substitution Type': substitution}, ignore_index=True)
-
+
+            if reference_name not in reference_report['reference'].keys():
+                reference_report['reference'][reference_name] = {"snps": 1}
+            else:
+                reference_report['reference'][reference_name]["snps"] += 1
+
+    # Write files for report
+    with open("{}_{}_snps.json".format(sample_id, assembler), "w") as json_report:
+        json_report.write(json.dumps(reference_report, separators=(",", ":")))
+
     df.to_csv(sample_id + '_' + assembler + '_snps.csv')