-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathplot_gap_sizes.py
95 lines (74 loc) · 3.18 KB
/
plot_gap_sizes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Purpose
-------
Script to plot the distribution of gap sizes in the filtered assemblies relative to the reference sequences.
Produces one boxplot of the gap sizes per assembler.
Expected input
--------------
The following variables are expected whether using NextFlow or the
:py:func:`main` executor.
- ``gap_distance_json``: list of JSON files containing the sample ID and the list of gap sizes
e.g.: ``'[SampleA.json, SampleB.json]'``
Expected input
--------------
This script takes the following arguments (in this order):
* List of Paths with the JSON containing the sample ID and the list of gap sizes
Authorship
----------
Inês Mendes, [email protected]
https://github.com/cimendes
"""
import os
import json
import pandas as pd
from pandas.core.common import flatten
from plotly.offline import plot
import plotly.graph_objects as go
try:
import utils
except ImportError:
from templates import utils
# Template metadata.
__version__ = "0.0.1"
__build__ = "15.12.2020"
__template__ = "PLOT_GAP_BOXPLOT-nf"

logger = utils.get_logger(__file__)

# Column labels used for the gap size dataframe.
COLUMNS = ['Assembler', 'Gap size']

# GAP_JSON is only defined when this file is executed under the name
# ``.command.sh``. NOTE(review): the ``$gap_distance_json`` placeholder is
# presumably substituted by Nextflow before execution -- confirm.
if __file__.endswith(".command.sh"):
    GAP_JSON = '$gap_distance_json'.split()
    logger.debug(
        "Running {} with parameters:".format(os.path.basename(__file__))
    )
    logger.debug("GAP_JSON: {}".format(GAP_JSON))
def _collect_gap_sizes(gap_json):
    """Merge the gap sizes of every JSON report into a single mapping.

    Each report is read as ``{sample: {assembler: [gap, ...]}}``. Gap lists
    found for the same sample/assembler pair in several reports are
    concatenated, and nested lists are flattened.

    Parameters
    ----------
    gap_json : iterable of str
        Paths to the JSON reports.

    Returns
    -------
    dict
        ``{sample: {assembler: [gap, ...]}}`` with flat gap lists.
    """
    all_data = {}
    for json_file in gap_json:
        with open(json_file, encoding="utf-8") as jfh:
            data = json.load(jfh)
        for sample, assemblers in data.items():
            for assembler, gaps in assemblers.items():
                # The sample entry is created here, not in the outer loop,
                # so a sample without any assembler never yields a plot.
                sample_data = all_data.setdefault(sample, {})
                # extend (not append) keeps the stored list flat.
                sample_data.setdefault(assembler, []).extend(flatten(gaps))
    return all_data


def main(gap_json):
    """Plot the gap size distribution per assembler, one figure per sample.

    For every sample found in the reports, a figure with one horizontal box
    trace per assembler (log-scaled x axis) is written to the current
    working directory as ``<sample>_gap_size_boxplot.html`` and
    ``<sample>_gap_distance_histogram.json``.

    Parameters
    ----------
    gap_json : iterable of str
        Paths to the JSON reports holding, per sample and assembler, the
        list of gap sizes.

    Returns
    -------
    None
    """
    all_data = _collect_gap_sizes(gap_json)

    for sample, assemblers in all_data.items():
        fig = go.Figure()

        # Traces are added in reverse case-insensitive alphabetical order.
        for assembler in sorted(assemblers, key=str.upper, reverse=True):
            gaps = assemblers[assembler]
            if not gaps:
                # An assembler without gaps gets no box.
                continue
            # Plain lists are handed to plotly; building a DataFrame row by
            # row (DataFrame.append) is quadratic and the method no longer
            # exists in pandas >= 2.0.
            fig.add_trace(go.Box(x=gaps,
                                 name=assembler, boxpoints='outliers',
                                 boxmean=False, fillcolor='#D3D3D3',
                                 line=dict(color='#000000')))

        fig.update_layout(showlegend=False, xaxis_type="log",
                          xaxis_title="Gap size (Log bp)",
                          plot_bgcolor='rgb(255,255,255)',
                          xaxis=dict(zeroline=False, gridcolor='#DCDCDC'))

        plot(fig, filename='{}_gap_size_boxplot.html'.format(sample),
             auto_open=False)
        # NOTE(review): the name says "histogram" although the figure is a
        # boxplot; kept unchanged because other code may rely on it.
        fig.write_json(file='{}_gap_distance_histogram.json'.format(sample))
# Script entry point. GAP_JSON only exists when the file was executed as
# ``.command.sh``; NOTE(review): running it under another name looks like
# it would raise NameError here -- confirm intended usage.
if __name__ == "__main__":
    main(GAP_JSON)