Skip to content

Commit

Permalink
Merge pull request #30 from B-UMMI/add_snps
Browse files Browse the repository at this point in the history
add number of snps to per reference table data
  • Loading branch information
cimendes authored Aug 23, 2022
2 parents faf9a33 + 233ac5f commit 9d8165a
Show file tree
Hide file tree
Showing 6 changed files with 84 additions and 11 deletions.
3 changes: 2 additions & 1 deletion main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,8 @@ workflow {
postprocessing_wf.out.ngx_json,
postprocessing_wf.out.misassembly_reference_json,
postprocessing_wf.out.misassembly_plot_json,
assembly_wf.out.all_versions)
assembly_wf.out.all_versions,
postprocessing_wf.out.snp_report_json)
}

workflow.onComplete {
Expand Down
8 changes: 6 additions & 2 deletions modules/postprocessing/postprocessing.nf
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ process SNP_ASSESSMENT {
output:
path('*.tsv'), emit: tsv
path('*_snps.csv'), emit: csv
path('*_snps.json'), emit: json

script:
template "snp_assessment.py"
Expand All @@ -176,10 +177,12 @@ process PLOT_SNP_REFERENCE {

input:
path snp_coords_dataframes
path snps_jsons

output:
path('*.html') optional true
path('*.json'), emit: json
path('snps_in_reference.json'), emit: json
path('snps_report_per_ref.json'), emit: reference_snps_json

script:
template "plot_snp.py"
Expand Down Expand Up @@ -278,7 +281,7 @@ workflow postprocessing_wf {
PLOT_GAP_BOXPLOT(GAP_ASSESSMENT.out.json | collect)
PLOT_GAP_REFERENCE(GAP_ASSESSMENT.out.csv | collect)
SNP_ASSESSMENT(paf, triple_reference)
PLOT_SNP_REFERENCE(SNP_ASSESSMENT.out.csv | collect)
PLOT_SNP_REFERENCE(SNP_ASSESSMENT.out.csv | collect, SNP_ASSESSMENT.out.json | collect)
MISASSEMBLY(misassembly_paf)
PROCESS_MISASSEMBLY(MISASSEMBLY.out.trace_pkl | collect, MISASSEMBLY.out.contig_length_pkl | collect, MISASSEMBLY.out.misassembly_json | collect, MISASSEMBLY.out.misassembled_reference_json | collect)
PLOT_MISASSEMBLY(MISASSEMBLY.out.csv | collect)
Expand All @@ -290,6 +293,7 @@ workflow postprocessing_wf {
phred_json = PROCESS_SHRIMP_PLOT.out.json
gap_reference_json = PLOT_GAP_REFERENCE.out.json
snp_reference_json = PLOT_SNP_REFERENCE.out.json
snp_report_json = PLOT_SNP_REFERENCE.out.reference_snps_json
gap_boxplot_json = PLOT_GAP_BOXPLOT.out.json
misassembly_json = PROCESS_MISASSEMBLY.out.json
misassembly_report_json = PROCESS_MISASSEMBLY.out.report_json
Expand Down
4 changes: 3 additions & 1 deletion modules/report/report.nf
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ process COMPILE_REPORT {
file plot_misassembly_per_ref
file about_md
file containers_config
file snp_report_json

output:
path('pipeline_report*.json')
Expand Down Expand Up @@ -84,9 +85,10 @@ workflow report_wf {
misassembly_reference
plot_misassembly
version_file
snp_report_json

main:
COMPILE_VERSIONS(version_file)
COMPILE_REPORT(reads_info, stats_global, pipeline_stats, js, lmas_png, reference, plot_contig_distribution, stats_mapping, plot_completness, plot_lx, plot_phred, plot_gap_reference, plot_snp_reference, plot_gap_boxplot, misassembly_info, misassembly_report, plot_nax, plot_ngx, COMPILE_VERSIONS.out, misassembly_reference, plot_misassembly, IN_MD, containers_config)
COMPILE_REPORT(reads_info, stats_global, pipeline_stats, js, lmas_png, reference, plot_contig_distribution, stats_mapping, plot_completness, plot_lx, plot_phred, plot_gap_reference, plot_snp_reference, plot_gap_boxplot, misassembly_info, misassembly_report, plot_nax, plot_ngx, COMPILE_VERSIONS.out, misassembly_reference, plot_misassembly, IN_MD, containers_config, snp_report_json)

}
29 changes: 27 additions & 2 deletions templates/compile_reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
ABOUT_MD = "$about_md"
CONTAINERS = "$containers_config"
PLOT_MISASSEMBLY_PER_REFERENCE = "$plot_misassembly_per_ref"
SNP_REPORT_REFERENCE = "$snp_report_json"

logger.debug("Running {} with parameters:".format(
os.path.basename(__file__)))
Expand Down Expand Up @@ -69,6 +70,7 @@
logger.debug("ABOUT_MD: {}".format(ABOUT_MD))
logger.debug("CONTAINERS: {}".format(CONTAINERS))
logger.debug("CONTAINERS: {}".format(PLOT_MISASSEMBLY_PER_REFERENCE))
logger.debug("SNP_REPORT_REFERENCE: {}".format(SNP_REPORT_REFERENCE))


html_template = """
Expand Down Expand Up @@ -327,7 +329,7 @@ def process_sample_reads(reads_jsons):
def main(main_js, pipeline_stats, assembly_stats_report, contig_size_plots, mapping_stats_report, completness_plot,
lmas_logo, reference_file, lx_json, shrimp_json, gap_reference_json, gap_histogram, plot_misassembly, misassembly_report,
min_contig_size, nax_json, ngx_json, reads_json, snp_reference_json, versions_json, misassembly_per_ref, about_md,
containers_config, plot_misassembly_per_reference_json):
containers_config, plot_misassembly_per_reference_json, snp_per_ref):

metadata = {
"nfMetadata": {
Expand Down Expand Up @@ -433,6 +435,29 @@ def main(main_js, pipeline_stats, assembly_stats_report, contig_size_plots, mapp
except KeyError:
item['misassembled_contigs'] = 0
item['misassembly_events'] = 0

# add snp stats
with open(snp_per_ref) as snp_fh:
snp_stats = json.load(snp_fh)
for sample_id in main_data_tables_js.keys():
for reference in main_data_tables_js[sample_id]["ReferenceTables"].keys():
for item_row in main_data_tables_js[sample_id]["ReferenceTables"][reference]:
for item in item_row:
assembler = item['assembler']
try:
references = list(
snp_stats[sample_id][assembler][0].keys())
if reference in references:
index = references.index(reference)
ref_name = references[index]
try:
item['snps'] = snp_stats[sample_id][assembler][0][ref_name]["snps"]
except KeyError:
item['snps'] = 0
else:
item['snps'] = 0
except KeyError:
item['snps'] = 0

for sample_id in main_data_tables_js.keys():
main_data_plots_js[sample_id] = {}
Expand Down Expand Up @@ -628,5 +653,5 @@ def main(main_js, pipeline_stats, assembly_stats_report, contig_size_plots, mapp
main(MAIN_JS, PIPELINE_STATS, ASSEMBLY_STATS_REPORT, CONTIG_SIZE_DISTRIBUTION, MAPPING_STATS_REPORT,
COMPLETNESS_JSON, LMAS_LOGO, REFERENCE_FILE, LX_JSON, SHRIMP_JSON, GAP_REFERENCE_JSON, GAP_HISTOGRAM,
MISASSEMBLY_PLOT, MISASSEMBLY_REPORT, MIN_CONTIG_SIZE, NAX_JSON, NGX_JSON, READS_NUMBER, SNP_REFERENCE_JSON,
VERSIONS_JSON, MISASSEMBLY_PER_REF, ABOUT_MD, CONTAINERS, PLOT_MISASSEMBLY_PER_REFERENCE)
VERSIONS_JSON, MISASSEMBLY_PER_REF, ABOUT_MD, CONTAINERS, PLOT_MISASSEMBLY_PER_REFERENCE, SNP_REPORT_REFERENCE)

36 changes: 32 additions & 4 deletions templates/plot_snp.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,37 @@

if __file__.endswith(".command.sh"):
DATAFRAME_LIST = '$snp_coords_dataframes'.split()
JSON_LIST = '$snps_jsons'.split()
logger.debug("Running {} with parameters:".format(
os.path.basename(__file__)))
logger.debug("DATAFRAME_LIST: {}".format(DATAFRAME_LIST))


def main(dataframes):
logger.debug("JSON_LIST: {}".format(JSON_LIST))

def snps_per_ref(report_per_reference):
"""
"""
master_report_data_per_reference = {}
for report_reference in report_per_reference:
with open(report_reference) as json_fh:
data_json = json.load(json_fh)
print(data_json)
if data_json["sample"] not in master_report_data_per_reference.keys():
master_report_data_per_reference[data_json["sample"]] = {
data_json["assembler"]: [data_json["reference"]]}
elif data_json["assembler"] not in master_report_data_per_reference[data_json["sample"]].keys():
master_report_data_per_reference[data_json["sample"]][data_json["assembler"]] = [
data_json["reference"]]
else:
master_report_data_per_reference[data_json["sample"]][data_json["assembler"]].append(
data_json["reference"])

logger.debug("Global report data: {}". format(master_report_data_per_reference))

with open("snps_report_per_ref.json", "w") as json_report:
json_report.write(json.dumps(
master_report_data_per_reference, separators=(",", ":")))

def main(dataframes, jsons):

li = []
for filename in dataframes:
Expand Down Expand Up @@ -154,7 +179,10 @@ def main(dataframes):

with open('snps_in_reference.json', 'w') as json_report:
json_report.write(json.dumps(report_dict, separators=(",", ":")))

# SNPS STATS PER REF
snps_per_ref(jsons)


if __name__ == '__main__':
main(DATAFRAME_LIST)
main(DATAFRAME_LIST, JSON_LIST)
15 changes: 14 additions & 1 deletion templates/snp_assessment.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from itertools import groupby
from numpy.lib.function_base import append
import pandas as pd
import json
try:
import utils
except ImportError:
Expand Down Expand Up @@ -123,6 +124,9 @@ def main(sample_id, assembler, assembly, mapping, reference):

df = pd.DataFrame(columns=COLUMNS)

reference_report = {"sample": sample_id,
"assembler": assembler, "reference": {}}

# iterator for reference files (sequence length is needed)
references = (x[1] for x in groupby(open(reference, "r"), lambda line: line[0] == ">"))

Expand All @@ -139,7 +143,16 @@ def main(sample_id, assembler, assembly, mapping, reference):
#print(coord, substitution)
df = df.append({'Sample': sample_id, 'Assembler': assembler, 'Reference': reference_name,
'Reference Length': len(seq)/3, 'SNP Location': coord, 'Substitution Type': substitution}, ignore_index=True)


if reference_name not in reference_report['reference'].keys():
reference_report['reference'][reference_name] = {"snps": 1}
else:
reference_report['reference'][reference_name]["snps"] += 1

# Write files for report
with open("{}_{}_snps.json".format(sample_id, assembler), "w") as json_report:
json_report.write(json.dumps(reference_report, separators=(",", ":")))

df.to_csv(sample_id + '_' + assembler + '_snps.csv')


Expand Down

0 comments on commit 9d8165a

Please sign in to comment.