From de1242bae65b1ece217328431097c2ac1af2ab3f Mon Sep 17 00:00:00 2001
From: Jethro Rainford
Date: Sun, 27 Sep 2020 22:14:23 +0100
Subject: [PATCH] PR changes
---
bin/coverage_report_single.py | 122 ++++++++++++++--------------
bin/coverage_stats_single.py | 16 ++--
data/templates/single_template.html | 9 +-
output/single_report.html | 0
4 files changed, 72 insertions(+), 75 deletions(-)
delete mode 100644 output/single_report.html
diff --git a/bin/coverage_report_single.py b/bin/coverage_report_single.py
index 8d7754a1..601382f9 100644
--- a/bin/coverage_report_single.py
+++ b/bin/coverage_report_single.py
@@ -60,8 +60,8 @@ def load_files(self, threshold, exon_stats,
template_dir = os.path.join(bin_dir, "../data/templates/")
single_template = os.path.join(template_dir, "single_template.html")
- with open(single_template, 'r') as temp:
- html_template = temp.read()
+ with open(single_template, 'r') as template:
+ html_template = template.read()
# read in exon stats file
with open(exon_stats.name) as exon_file:
@@ -84,11 +84,11 @@ def load_files(self, threshold, exon_stats,
key = ln.split(":")[0].strip("#")
val = ln.split(":")[1]
flagstat[key] = val
-
+
if "build" not in locals():
# build no. not included in gene_stats file
build = "Unknown"
-
+
column = [
"chrom", "exon_start", "exon_end",
"gene", "tx", "exon", "cov_start",
@@ -124,17 +124,17 @@ def load_files(self, threshold, exon_stats,
def build_report(self, html_template, total_stats, gene_stats,
- sub_thrshld_stats, snps_low_cov, snps_high_cov, fig,
+ sub_threshold_stats, snps_low_cov, snps_high_cov, fig,
all_plots, summary_plot, report_vals
):
"""
Build report from template and variables to write to file
Args:
- - html_template (str): string of HTML template
+ - html_template (str): string of HTML template file
- total_stats (df): total stats table of all genes & exons
- gene_stats (df): stats table of whole gene
- - sub_thrshld_stats (df): table of exons with < threshold
+ - sub_threshold_stats (df): table of exons with < threshold
- snps_low_cov (df): table of snps with cov < threshold
- snsp_high_cov (df): table of snps with cov > threshold
- fig (figure): grid of low coverage exon plots (plotly)
@@ -154,9 +154,9 @@ def build_report(self, html_template, total_stats, gene_stats,
threshold=report_vals["threshold"],
exon_issues=report_vals["exon_issues"],
gene_issues=report_vals["gene_issues"],
- covered_genes=report_vals["covered_genes"],
+ fully_covered_genes=report_vals["fully_covered_genes"],
name=report_vals["name"],
- sub_thrshld_stats=sub_thrshld_stats,
+ sub_threshold_stats=sub_threshold_stats,
low_cov_plots=fig,
all_plots=all_plots,
summary_plot=summary_plot,
@@ -272,7 +272,7 @@ def low_coverage_regions(self, cov_stats, raw_coverage, threshold):
# pandas is terrible and forces floats, change back to int
dtypes = {
- 'chrom': int,
+ 'chrom': str,
'exon': int,
'exon_start': int,
'exon_end': int,
@@ -612,18 +612,18 @@ def summary_gene_plot(self, cov_summary, threshold):
print("Generating summary plot")
- thrshld = str(threshold) + "x"
+ threshold = str(threshold) + "x"
# define colours based on values
cov_summary["colours"] = 'green'
- cov_summary.loc[cov_summary[thrshld] < 100, 'colours'] = 'orange'
- cov_summary.loc[cov_summary[thrshld] < 90, 'colours'] = 'red'
+ cov_summary.loc[cov_summary[threshold] < 100, 'colours'] = 'orange'
+ cov_summary.loc[cov_summary[threshold] < 90, 'colours'] = 'red'
- cov_summary = cov_summary.sort_values(by=[thrshld], ascending=False)
+ cov_summary = cov_summary.sort_values(by=[threshold], ascending=False)
summary_plot, axs = plt.subplots(figsize=(18, 10))
plt.bar(
- cov_summary["gene"], [int(x) for x in cov_summary[thrshld]],
+ cov_summary["gene"], [int(x) for x in cov_summary[threshold]],
color=cov_summary.colours
)
@@ -643,7 +643,7 @@ def summary_gene_plot(self, cov_summary, threshold):
axs.tick_params(axis='both', which='major', labelsize=10)
plt.xlabel("")
- plt.ylabel("% coverage >= {}".format(thrshld))
+ plt.ylabel("% coverage >= {}".format(threshold))
axs.yaxis.grid(linewidth=0.5, color="grey", linestyle="-.")
plt.box(False)
@@ -697,7 +697,7 @@ def generate_report(self, cov_stats, cov_summary, snps_low_cov,
print("Generating report")
# str of threshold for selecting df columns etc.
- thrshld = str(args.threshold) + "x"
+ threshold = str(args.threshold) + "x"
# get threshold columns and add to column names
threshold_cols = list(cov_stats.filter(regex='[0-9]+x', axis=1))
@@ -709,16 +709,16 @@ def generate_report(self, cov_stats, cov_summary, snps_low_cov,
column.extend(threshold_cols)
- sub_thrshld = pd.DataFrame(columns=column)
+ sub_threshold = pd.DataFrame(columns=column)
# get all exons with <100% coverage at threshold
for i, row in cov_stats.iterrows():
- if int(row[thrshld]) < 100:
- sub_thrshld = sub_thrshld.append(row, ignore_index=True)
+ if int(row[threshold]) < 100:
+ sub_threshold = sub_threshold.append(row, ignore_index=True)
# pandas is terrible and forces floats, change back to int
dtypes = {
- 'chrom': int,
+ 'chrom': str,
'exon': int,
'exon_len': int,
'exon_start': int,
@@ -727,7 +727,7 @@ def generate_report(self, cov_stats, cov_summary, snps_low_cov,
'max': int
}
- sub_thrshld = sub_thrshld.astype(dtypes)
+ sub_threshold = sub_threshold.astype(dtypes)
vals = ["min", "mean", "max"]
vals.extend(threshold_cols)
@@ -740,8 +740,8 @@ def generate_report(self, cov_stats, cov_summary, snps_low_cov,
values=vals
)
- sub_thrshld_stats = pd.pivot_table(
- sub_thrshld,
+ sub_threshold_stats = pd.pivot_table(
+ sub_threshold,
index=["gene", "tx", "chrom", "exon", "exon_len",
"exon_start", "exon_end"],
values=vals
@@ -749,12 +749,12 @@ def generate_report(self, cov_stats, cov_summary, snps_low_cov,
# reset index to fix formatting
total_stats = total_stats.reindex(vals, axis=1)
- sub_thrshld_stats = sub_thrshld_stats.reindex(vals, axis=1)
+ sub_threshold_stats = sub_threshold_stats.reindex(vals, axis=1)
total_stats.reset_index(inplace=True)
- sub_thrshld_stats.reset_index(inplace=True)
+ sub_threshold_stats.reset_index(inplace=True)
# rename columns to display properly
- sub_thrshld_stats = sub_thrshld_stats.rename(columns={
+ sub_threshold_stats = sub_threshold_stats.rename(columns={
"gene": "Gene",
"tx": "Transcript",
"chrom": "Chromosome",
@@ -791,60 +791,60 @@ def generate_report(self, cov_stats, cov_summary, snps_low_cov,
# get values to display in report
total_genes = len(cov_summary["Gene"])
- gene_issues = len(list(set(sub_thrshld_stats["Gene"].tolist())))
- exon_issues = len(sub_thrshld_stats["Exon"])
- covered_genes = total_genes - gene_issues
+ gene_issues = len(list(set(sub_threshold_stats["Gene"].tolist())))
+ exon_issues = len(sub_threshold_stats["Exon"])
+ fully_covered_genes = total_genes - gene_issues
# empty dict to add values for displaying in report text
report_vals = {}
report_vals["name"] = str(args.sample_name)
report_vals["total_genes"] = str(total_genes)
- report_vals["covered_genes"] = str(covered_genes)
+ report_vals["fully_covered_genes"] = str(fully_covered_genes)
report_vals["gene_issues"] = str(gene_issues)
- report_vals["threshold"] = thrshld
+ report_vals["threshold"] = threshold
report_vals["exon_issues"] = str(exon_issues)
report_vals["build"] = build
# set ranges for colouring cells
- x0 = pd.IndexSlice[sub_thrshld_stats.loc[(
- sub_thrshld_stats[thrshld] < 10
+ x0 = pd.IndexSlice[sub_threshold_stats.loc[(
+ sub_threshold_stats[threshold] < 10
) & (
- sub_thrshld_stats[thrshld] > 0)].index, thrshld]
- x10 = pd.IndexSlice[sub_thrshld_stats.loc[(
- sub_thrshld_stats[thrshld] < 30
+ sub_threshold_stats[threshold] > 0)].index, threshold]
+ x10 = pd.IndexSlice[sub_threshold_stats.loc[(
+ sub_threshold_stats[threshold] < 30
) & (
- sub_thrshld_stats[thrshld] >= 10)].index, thrshld]
- x30 = pd.IndexSlice[sub_thrshld_stats.loc[(
- sub_thrshld_stats[thrshld] < 50
+ sub_threshold_stats[threshold] >= 10)].index, threshold]
+ x30 = pd.IndexSlice[sub_threshold_stats.loc[(
+ sub_threshold_stats[threshold] < 50
) & (
- sub_thrshld_stats[thrshld] >= 30)].index, thrshld]
- x50 = pd.IndexSlice[sub_thrshld_stats.loc[(
- sub_thrshld_stats[thrshld] < 70
+ sub_threshold_stats[threshold] >= 30)].index, threshold]
+ x50 = pd.IndexSlice[sub_threshold_stats.loc[(
+ sub_threshold_stats[threshold] < 70
) & (
- sub_thrshld_stats[thrshld] >= 50)].index, thrshld]
- x70 = pd.IndexSlice[sub_thrshld_stats.loc[(
- sub_thrshld_stats[thrshld] < 90
+ sub_threshold_stats[threshold] >= 50)].index, threshold]
+ x70 = pd.IndexSlice[sub_threshold_stats.loc[(
+ sub_threshold_stats[threshold] < 90
) & (
- sub_thrshld_stats[thrshld] >= 70)].index, thrshld]
- x90 = pd.IndexSlice[sub_thrshld_stats.loc[(
- sub_thrshld_stats[thrshld] < 95
+ sub_threshold_stats[threshold] >= 70)].index, threshold]
+ x90 = pd.IndexSlice[sub_threshold_stats.loc[(
+ sub_threshold_stats[threshold] < 95
) & (
- sub_thrshld_stats[thrshld] >= 90)].index, thrshld]
- x95 = pd.IndexSlice[sub_thrshld_stats.loc[(
- sub_thrshld_stats[thrshld] >= 95)].index, thrshld]
+ sub_threshold_stats[threshold] >= 90)].index, threshold]
+ x95 = pd.IndexSlice[sub_threshold_stats.loc[(
+ sub_threshold_stats[threshold] >= 95)].index, threshold]
# df column index of threshold
- col_idx = sub_thrshld_stats.columns.get_loc(thrshld)
+ col_idx = sub_threshold_stats.columns.get_loc(threshold)
# make dict for rounding coverage columns to 2dp
rnd = {}
- for col in list(sub_thrshld_stats.columns[10:15]):
+ for col in list(sub_threshold_stats.columns[10:15]):
rnd[col] = '{0:.2f}%'
# apply colours to coverage cell based on value, 0 is given solid red
- s = sub_thrshld_stats.style.apply(lambda x: [
- "background-color: #d70000" if x[thrshld] == 0 and idx == col_idx
+ s = sub_threshold_stats.style.apply(lambda x: [
+ "background-color: #d70000" if x[threshold] == 0 and idx == col_idx
else "" for idx, v in enumerate(x)
], axis=1)\
.bar(subset=x0, color='red', vmin=0, vmax=100)\
@@ -858,7 +858,7 @@ def generate_report(self, cov_stats, cov_summary, snps_low_cov,
.set_table_attributes('table border="1"\
class="dataframe table table-hover table-bordered"')
- sub_thrshld_stats["Mean"] = sub_thrshld_stats["Mean"].apply(
+ sub_threshold_stats["Mean"] = sub_threshold_stats["Mean"].apply(
lambda x: int(x)
)
@@ -875,20 +875,20 @@ def generate_report(self, cov_stats, cov_summary, snps_low_cov,
total_stats = total_stats.to_html(justify='left').replace(
style[0], style[1]
)
- sub_thrshld_stats = s.render()
+ sub_threshold_stats = s.render()
if snps_low_cov is not None:
snps_not_covered = len(snps_low_cov.index)
snps_low_cov = snps_low_cov.to_html().replace(style[0], style[1])
else:
- snps_low_cov = "No SNPs present"
+ snps_low_cov = "No low covered SNPs"
snps_not_covered = 0
if snps_high_cov is not None:
snps_covered = len(snps_high_cov.index)
snps_high_cov = snps_high_cov.to_html().replace(style[0], style[1])
else:
- snps_high_cov = "No SNPs present"
+ snps_high_cov = "No covered SNPs"
snps_covered = 0
total_snps = str(snps_covered + snps_not_covered)
@@ -904,7 +904,7 @@ def generate_report(self, cov_stats, cov_summary, snps_low_cov,
# add tables & plots to template
html_string = self.build_report(
- html_template, total_stats, gene_stats, sub_thrshld_stats,
+ html_template, total_stats, gene_stats, sub_threshold_stats,
snps_low_cov, snps_high_cov, fig, all_plots, summary_plot,
report_vals
)
diff --git a/bin/coverage_stats_single.py b/bin/coverage_stats_single.py
index 1c8680d7..3f974e17 100644
--- a/bin/coverage_stats_single.py
+++ b/bin/coverage_stats_single.py
@@ -185,10 +185,14 @@ def cov_stats(self, data, thresholds):
exon_cov["cov"] > int(thrshld)
]["cov_bin_len"].sum()
- # calculate % bases at each threshold from raw to 2 dp.
+ # calculate % bases at each threshold from raw to 2 dp.
pct_bases = {}
for key, value in raw_bases.items():
- pct_bases[key] = round(value / tx_len * 100, 2)
+ # truncate to 2dp with math.floor instead of round(), since
+ # rounding can inflate a value, e.g. 99.996 => 100.00
+ raw_value = value / tx_len * 100
+ rounded_value = math.floor(raw_value * 100) / 100
+ pct_bases[key] = rounded_value
stats = {
"chrom": row["chrom"], "exon_start": row["exon_start"],
@@ -268,16 +272,16 @@ def summary_stats(self, cov_stats, thresholds):
cov_summary = cov_summary.append(stats, ignore_index=True)
- # round calculated vals to 2 dp
+ # truncate vals to 2 dp; NOTE(review): math.floor is scalar-only, np.floor may be needed
round_cols = ['mean'] + threshold_header
- cov_summary[round_cols] = cov_summary[round_cols].round(2)
+ cov_summary[round_cols] = math.floor(
+ cov_summary[round_cols] * 100) / 100
return cov_summary
-
def write_outfiles(self, cov_stats, cov_summary, outfile, flagstat, build):
"""
- If --outfile arg given, writes coverage stats to file.
+ Writes both exon and gene level coverage stats to file.
Args:
- cov_stats (df): df of generated coverage stats
diff --git a/data/templates/single_template.html b/data/templates/single_template.html
index fde1dd2b..45b5bb90 100644
--- a/data/templates/single_template.html
+++ b/data/templates/single_template.html
@@ -37,7 +37,6 @@
text-align:left;
}
-
td {
word-wrap: break-word;
word-break:break-all
@@ -46,11 +45,9 @@
tr:hover {background-color:#ecebfc !important}
/* styling for collapsible sections */
-
.active, .collapsible:hover {
background-color: rgb(255, 255, 255);
}
-
.collapsible:after {
content: '+';
color: rgb(255, 255, 255);
@@ -58,7 +55,6 @@
float: right;
margin-left: 5px;
}
-
.active:after {
content: "-";
}
@@ -100,7 +96,7 @@ Summary
- Of the $total_genes genes in the panel, $covered_genes genes had 100% coverage at $threshold,
+ Of the $total_genes genes in the panel, $fully_covered_genes genes had 100% coverage at $threshold,
with $gene_issues genes having less than 100% coverage at $threshold .
$summary_plot
@@ -209,7 +205,6 @@ Coverage of SNPs
}
-
-
-