From 233ac5fcdc9dbaecfd76318626c99a0b3cef94c5 Mon Sep 17 00:00:00 2001 From: cimendes Date: Fri, 19 Aug 2022 13:44:12 +0100 Subject: [PATCH] add number of snps to per reference table data --- main.nf | 3 +- modules/postprocessing/postprocessing.nf | 8 ++++-- modules/report/report.nf | 4 ++- templates/compile_reports.py | 29 +++++++++++++++++-- templates/plot_snp.py | 36 +++++++++++++++++++++--- templates/snp_assessment.py | 15 +++++++++- 6 files changed, 84 insertions(+), 11 deletions(-) diff --git a/main.nf b/main.nf index f51e466..8558b7c 100644 --- a/main.nf +++ b/main.nf @@ -98,7 +98,8 @@ workflow { postprocessing_wf.out.ngx_json, postprocessing_wf.out.misassembly_reference_json, postprocessing_wf.out.misassembly_plot_json, - assembly_wf.out.all_versions) + assembly_wf.out.all_versions, + postprocessing_wf.out.snp_report_json) } workflow.onComplete { diff --git a/modules/postprocessing/postprocessing.nf b/modules/postprocessing/postprocessing.nf index 8c05d3e..1895c2e 100644 --- a/modules/postprocessing/postprocessing.nf +++ b/modules/postprocessing/postprocessing.nf @@ -164,6 +164,7 @@ process SNP_ASSESSMENT { output: path('*.tsv'), emit: tsv path('*_snps.csv'), emit: csv + path('*_snps.json'), emit: json script: template "snp_assessment.py" @@ -176,10 +177,12 @@ process PLOT_SNP_REFERENCE { input: path snp_coords_dataframes + path snps_jsons output: path('*.html') optional true - path('*.json'), emit: json + path('snps_in_reference.json'), emit: json + path('snps_report_per_ref.json'), emit: reference_snps_json script: template "plot_snp.py" @@ -278,7 +281,7 @@ workflow postprocessing_wf { PLOT_GAP_BOXPLOT(GAP_ASSESSMENT.out.json | collect) PLOT_GAP_REFERENCE(GAP_ASSESSMENT.out.csv | collect) SNP_ASSESSMENT(paf, triple_reference) - PLOT_SNP_REFERENCE(SNP_ASSESSMENT.out.csv | collect) + PLOT_SNP_REFERENCE(SNP_ASSESSMENT.out.csv | collect, SNP_ASSESSMENT.out.json | collect) MISASSEMBLY(misassembly_paf) PROCESS_MISASSEMBLY(MISASSEMBLY.out.trace_pkl | collect, MISASSEMBLY.out.contig_length_pkl | collect, MISASSEMBLY.out.misassembly_json | collect, MISASSEMBLY.out.misassembled_reference_json | collect) PLOT_MISASSEMBLY(MISASSEMBLY.out.csv | collect) @@ -290,6 +293,7 @@ workflow postprocessing_wf { phred_json = PROCESS_SHRIMP_PLOT.out.json gap_reference_json = PLOT_GAP_REFERENCE.out.json snp_reference_json = PLOT_SNP_REFERENCE.out.json + snp_report_json = PLOT_SNP_REFERENCE.out.reference_snps_json gap_boxplot_json = PLOT_GAP_BOXPLOT.out.json misassembly_json = PROCESS_MISASSEMBLY.out.json misassembly_report_json = PROCESS_MISASSEMBLY.out.report_json diff --git a/modules/report/report.nf b/modules/report/report.nf index 5cf3d58..3c84e6f 100644 --- a/modules/report/report.nf +++ b/modules/report/report.nf @@ -44,6 +44,7 @@ process COMPILE_REPORT { file plot_misassembly_per_ref file about_md file containers_config + file snp_report_json output: path('pipeline_report*.json') @@ -84,9 +85,10 @@ workflow report_wf { misassembly_reference plot_misassembly version_file + snp_report_json main: COMPILE_VERSIONS(version_file) - COMPILE_REPORT(reads_info, stats_global, pipeline_stats, js, lmas_png, reference, plot_contig_distribution, stats_mapping, plot_completness, plot_lx, plot_phred, plot_gap_reference, plot_snp_reference, plot_gap_boxplot, misassembly_info, misassembly_report, plot_nax, plot_ngx, COMPILE_VERSIONS.out, misassembly_reference, plot_misassembly, IN_MD, containers_config) + COMPILE_REPORT(reads_info, stats_global, pipeline_stats, js, lmas_png, reference, plot_contig_distribution, stats_mapping, plot_completness, plot_lx, plot_phred, plot_gap_reference, plot_snp_reference, plot_gap_boxplot, misassembly_info, misassembly_report, plot_nax, plot_ngx, COMPILE_VERSIONS.out, misassembly_reference, plot_misassembly, IN_MD, containers_config, snp_report_json) } \ No newline at end of file diff --git a/templates/compile_reports.py b/templates/compile_reports.py index 4479037..26cbed9 100644 --- a/templates/compile_reports.py +++ b/templates/compile_reports.py @@ -41,6 +41,7 @@ ABOUT_MD = "$about_md" CONTAINERS = "$containers_config" PLOT_MISASSEMBLY_PER_REFERENCE = "$plot_misassembly_per_ref" + SNP_REPORT_REFERENCE = "$snp_report_json" logger.debug("Running {} with parameters:".format( os.path.basename(__file__))) @@ -69,6 +70,7 @@ logger.debug("ABOUT_MD: {}".format(ABOUT_MD)) logger.debug("CONTAINERS: {}".format(CONTAINERS)) logger.debug("CONTAINERS: {}".format(PLOT_MISASSEMBLY_PER_REFERENCE)) + logger.debug("SNP_REPORT_REFERENCE: {}".format(SNP_REPORT_REFERENCE)) html_template = """ @@ -327,7 +329,7 @@ def process_sample_reads(reads_jsons): def main(main_js, pipeline_stats, assembly_stats_report, contig_size_plots, mapping_stats_report, completness_plot, lmas_logo, reference_file, lx_json, shrimp_json, gap_reference_json, gap_histogram, plot_misassembly, misassembly_report, min_contig_size, nax_json, ngx_json, reads_json, snp_reference_json, versions_json, misassembly_per_ref, about_md, - containers_config, plot_misassembly_per_reference_json): + containers_config, plot_misassembly_per_reference_json, snp_per_ref): metadata = { "nfMetadata": { @@ -433,6 +435,29 @@ def main(main_js, pipeline_stats, assembly_stats_report, contig_size_plots, mapp except KeyError: item['misassembled_contigs'] = 0 item['misassembly_events'] = 0 + + # add snp stats + with open(snp_per_ref) as snp_fh: + snp_stats = json.load(snp_fh) + for sample_id in main_data_tables_js.keys(): + for reference in main_data_tables_js[sample_id]["ReferenceTables"].keys(): + for item_row in main_data_tables_js[sample_id]["ReferenceTables"][reference]: + for item in item_row: + assembler = item['assembler'] + try: + references = list( + snp_stats[sample_id][assembler][0].keys()) + if reference in references: + index = references.index(reference) + ref_name = references[index] + try: + item['snps'] = snp_stats[sample_id][assembler][0][ref_name]["snps"] + except KeyError: + item['snps'] = 0 + else: + item['snps'] = 0 + except KeyError: + item['snps'] = 0 for sample_id in main_data_tables_js.keys(): main_data_plots_js[sample_id] = {} @@ -628,5 +653,5 @@ def main(main_js, pipeline_stats, assembly_stats_report, contig_size_plots, mapp main(MAIN_JS, PIPELINE_STATS, ASSEMBLY_STATS_REPORT, CONTIG_SIZE_DISTRIBUTION, MAPPING_STATS_REPORT, COMPLETNESS_JSON, LMAS_LOGO, REFERENCE_FILE, LX_JSON, SHRIMP_JSON, GAP_REFERENCE_JSON, GAP_HISTOGRAM, MISASSEMBLY_PLOT, MISASSEMBLY_REPORT, MIN_CONTIG_SIZE, NAX_JSON, NGX_JSON, READS_NUMBER, SNP_REFERENCE_JSON, - VERSIONS_JSON, MISASSEMBLY_PER_REF, ABOUT_MD, CONTAINERS, PLOT_MISASSEMBLY_PER_REFERENCE) + VERSIONS_JSON, MISASSEMBLY_PER_REF, ABOUT_MD, CONTAINERS, PLOT_MISASSEMBLY_PER_REFERENCE, SNP_REPORT_REFERENCE) diff --git a/templates/plot_snp.py b/templates/plot_snp.py index c126291..9069d03 100644 --- a/templates/plot_snp.py +++ b/templates/plot_snp.py @@ -50,12 +50,37 @@ if __file__.endswith(".command.sh"): DATAFRAME_LIST = '$snp_coords_dataframes'.split() + JSON_LIST = '$snps_jsons'.split() logger.debug("Running {} with parameters:".format( os.path.basename(__file__))) logger.debug("DATAFRAME_LIST: {}".format(DATAFRAME_LIST)) - - -def main(dataframes): + logger.debug("JSON_LIST: {}".format(JSON_LIST)) + +def snps_per_ref(report_per_reference): + """ + """ + master_report_data_per_reference = {} + for report_reference in report_per_reference: + with open(report_reference) as json_fh: + data_json = json.load(json_fh) + print(data_json) + if data_json["sample"] not in master_report_data_per_reference.keys(): + master_report_data_per_reference[data_json["sample"]] = { + data_json["assembler"]: [data_json["reference"]]} + elif data_json["assembler"] not in master_report_data_per_reference[data_json["sample"]].keys(): + master_report_data_per_reference[data_json["sample"]][data_json["assembler"]] = [ + data_json["reference"]] + else: + master_report_data_per_reference[data_json["sample"]][data_json["assembler"]].append( + data_json["reference"]) + + logger.debug("Global report data: {}". format(master_report_data_per_reference)) + + with open("snps_report_per_ref.json", "w") as json_report: + json_report.write(json.dumps( + master_report_data_per_reference, separators=(",", ":"))) + +def main(dataframes, jsons): li = [] for filename in dataframes: @@ -154,7 +179,10 @@ def main(dataframes): with open('snps_in_reference.json', 'w') as json_report: json_report.write(json.dumps(report_dict, separators=(",", ":"))) + + # SNPS STATS PER REF + snps_per_ref(jsons) if __name__ == '__main__': - main(DATAFRAME_LIST) + main(DATAFRAME_LIST, JSON_LIST) diff --git a/templates/snp_assessment.py b/templates/snp_assessment.py index ce036df..d24469b 100644 --- a/templates/snp_assessment.py +++ b/templates/snp_assessment.py @@ -33,6 +33,7 @@ from itertools import groupby from numpy.lib.function_base import append import pandas as pd +import json try: import utils except ImportError: @@ -123,6 +124,9 @@ def main(sample_id, assembler, assembly, mapping, reference): df = pd.DataFrame(columns=COLUMNS) + reference_report = {"sample": sample_id, + "assembler": assembler, "reference": {}} + # iterator for reference files (sequence length is needed) references = (x[1] for x in groupby(open(reference, "r"), lambda line: line[0] == ">")) @@ -139,7 +143,16 @@ def main(sample_id, assembler, assembly, mapping, reference): #print(coord, substitution) df = df.append({'Sample': sample_id, 'Assembler': assembler, 'Reference': reference_name, 'Reference Length': len(seq)/3, 'SNP Location': coord, 'Substitution Type': substitution}, ignore_index=True) - + + if reference_name not in reference_report['reference'].keys(): + reference_report['reference'][reference_name] = {"snps": 1} + else: + reference_report['reference'][reference_name]["snps"] += 1 + + # Write files for report + with open("{}_{}_snps.json".format(sample_id, assembler), "w") as json_report: + json_report.write(json.dumps(reference_report, separators=(",", ":"))) + df.to_csv(sample_id + '_' + assembler + '_snps.csv')