From 7a68c1f82ba7b209fd98914d904561a249d55c0e Mon Sep 17 00:00:00 2001 From: Jethro Rainford Date: Fri, 4 Dec 2020 07:56:01 +0000 Subject: [PATCH 1/4] change styling of low stats table --- bin/coverage_report_single.py | 60 ++++++++++++----------------------- 1 file changed, 21 insertions(+), 39 deletions(-) diff --git a/bin/coverage_report_single.py b/bin/coverage_report_single.py index 6cb4c783..12910452 100644 --- a/bin/coverage_report_single.py +++ b/bin/coverage_report_single.py @@ -1152,37 +1152,19 @@ def generate_report(self, cov_stats, cov_summary, snps_low_cov, report_vals["version"] = version report_vals["panel_pct_coverage"] = panel_pct_coverage - # set ranges for colouring cells - x0 = pd.IndexSlice[sub_threshold_stats.loc[( - sub_threshold_stats[threshold] < 10 - ) & ( - sub_threshold_stats[threshold] > 0)].index, threshold] - x10 = pd.IndexSlice[sub_threshold_stats.loc[( - sub_threshold_stats[threshold] < 30 - ) & ( - sub_threshold_stats[threshold] >= 10)].index, threshold] - x30 = pd.IndexSlice[sub_threshold_stats.loc[( - sub_threshold_stats[threshold] < 50 - ) & ( - sub_threshold_stats[threshold] >= 30)].index, threshold] - x50 = pd.IndexSlice[sub_threshold_stats.loc[( - sub_threshold_stats[threshold] < 70 - ) & ( - sub_threshold_stats[threshold] >= 50)].index, threshold] - x70 = pd.IndexSlice[sub_threshold_stats.loc[( - sub_threshold_stats[threshold] < 90 - ) & ( - sub_threshold_stats[threshold] >= 70)].index, threshold] - x90 = pd.IndexSlice[sub_threshold_stats.loc[( - sub_threshold_stats[threshold] < 95 - ) & ( - sub_threshold_stats[threshold] >= 90)].index, threshold] - x95 = pd.IndexSlice[sub_threshold_stats.loc[( - sub_threshold_stats[threshold] < 99 - ) & ( - sub_threshold_stats[threshold] >= 95)].index, threshold] - x99 = pd.IndexSlice[sub_threshold_stats.loc[( - sub_threshold_stats[threshold] >= 99)].index, threshold] + # creat slices of sub_threshold stats df to add styling to + slice_ranges = { + "x0": (10, 0), "x10": (30, 10), "x30": (50, 30), "x50": (70, 50), + "x70": (90, 70), "x90": (95, 90), "x95": (99, 95), "x99": (101, 99) + } + + sub_slice = {} + + for key, val in slice_ranges.items(): + sub_slice[key] = pd.IndexSlice[sub_threshold_stats.loc[( + sub_threshold_stats[threshold] < val[0] + ) & ( + sub_threshold_stats[threshold] >= val[1])].index, threshold] # df column index of threshold col_idx = sub_threshold_stats.columns.get_loc(threshold) @@ -1200,14 +1182,14 @@ def generate_report(self, cov_stats, cov_summary, snps_low_cov, "background-color: #b30000" if x[threshold] == 0 and idx == col_idx else "" for idx, v in enumerate(x) ], axis=1)\ - .bar(subset=x0, color='#b30000', vmin=0, vmax=100)\ - .bar(subset=x10, color='#990000', vmin=0, vmax=100)\ - .bar(subset=x30, color='#C82538', vmin=0, vmax=100)\ - .bar(subset=x50, color='#FF4500', vmin=0, vmax=100)\ - .bar(subset=x70, color='#FF4500', vmin=0, vmax=100)\ - .bar(subset=x90, color='#FF4500', vmin=0, vmax=100)\ - .bar(subset=x95, color='#FFBF00', vmin=0, vmax=100)\ - .bar(subset=x99, color='#007600', vmin=0, vmax=100)\ + .bar(subset=sub_slice["x0"], color='#b30000', vmin=0, vmax=100)\ + .bar(subset=sub_slice["x10"], color='#990000', vmin=0, vmax=100)\ + .bar(subset=sub_slice["x30"], color='#C82538', vmin=0, vmax=100)\ + .bar(subset=sub_slice["x50"], color='#FF4500', vmin=0, vmax=100)\ + .bar(subset=sub_slice["x70"], color='#FF4500', vmin=0, vmax=100)\ + .bar(subset=sub_slice["x90"], color='#FF4500', vmin=0, vmax=100)\ + .bar(subset=sub_slice["x95"], color='#FFBF00', vmin=0, vmax=100)\ + .bar(subset=sub_slice["x99"], color='#007600', vmin=0, vmax=100)\ .format(rnd)\ .set_table_attributes('table border="1"\ class="dataframe table table-hover table-bordered"')\ From 18a663d9dba74bd0055bc8bffce33b54553fcd00 Mon Sep 17 00:00:00 2001 From: jethror1 Date: Fri, 4 Dec 2020 11:08:51 +0000 Subject: [PATCH 2/4] refactor generating plots and table to styling to seperate classes and functions --- bin/coverage_report_single.py | 2331 +++++++++++++++++---------------- 1 file changed, 1220 insertions(+), 1111 deletions(-) diff --git a/bin/coverage_report_single.py b/bin/coverage_report_single.py index 12910452..2984eb9a 100644 --- a/bin/coverage_report_single.py +++ b/bin/coverage_report_single.py @@ -31,1286 +31,1228 @@ from string import Template -class singleReport(): - def load_files(self, threshold, exon_stats, - gene_stats, raw_coverage, snp_vcfs, panel): +class generatePlots(): + """Functions to generate required plots""" + + def low_exon_plot(self, low_raw_cov, threshold): """ - Load in raw coverage data, coverage stats file and template. + Plot bp coverage of exon, used for those where coverage is given + threshold Args: - - threshold (int): threshold value passed from parse_args - - exon_stats (file): exon stats file (from args; - generated by coverage_stats_single.py) - - gene_stats (file): gene stats file (from args; - generated by coverage_stats_single.py) - - raw_coverage (file): from args; bp coverage file used as - input for coverage_stats_single.py - - snp_vcfs (list): VCFs of SNPs passed from args - - panel (file): panel bed file used for annotation, used to - display panel name in report if passed + - low_raw_cov (df): df of raw coverage for exons with low + coverage + - threshold (int): defined threshold level (default: 20) Returns: - - cov_stats (df): df of coverage stats for each exon - - cov_summary (df): df of gene level coverage - - raw_coverage (df): raw bp coverage for each exon - - html_template (str): string of HTML report template - - flagstat (dict): flagstat metrics, from gene_stats header - - build (str): ref build used, from gene_stats header - - panel (str): panes(s) / gene(s) included in report - - vcfs (str): list of vcf names used for SNP analysis - - version (str): version of Athena, used to add to report + - fig (figure): plots of low coverage regions """ - print("Reading in files") + print("Generating plots of low covered regions") - # read in single sample report template - bin_dir = os.path.dirname(os.path.abspath(__file__)) - template_dir = os.path.join(bin_dir, "../data/templates/") - single_template = os.path.join(template_dir, "single_template.html") + # get list of tuples of genes and exons to define plots + genes = low_raw_cov.drop_duplicates( + ["gene", "exon"])[["gene", "exon"]].values.tolist() + genes = [tuple(exon) for exon in genes] - with open(single_template, 'r') as template: - html_template = template.read() + if len(genes) == 0: + # everything above threshold, don't generate plots + fig = "

All regions in panel above threshold, no plots\ + to show.

" - try: - # attempt to get version tag from root dir name - # will only work if downloaded as zip / tar and not cloned - path = str(os.path.join(bin_dir, "../")).split("/") - version = [s for s in path if "athena" in s][0].split("-")[1] - version = "({})".format(version) - except Exception: - print("Error getting version from dir name, continuing.") - version = "" - pass + return fig - # read bootstrap into var to store in report html - bs = str(os.path.join(os.path.dirname( - os.path.abspath(__file__)), "../data/static/css/bootstrap.min.css" - )) - with open(bs) as bs: - bootstrap = bs.read() + # sort list of genes/exons by gene and exon + genes = sorted(genes, key=lambda element: (element[0], element[1])) - # read in exon stats file - with open(exon_stats.name) as exon_file: - dtypes = { - "chrom": str, "exon_start": int, "exon_end": int, "gene": str, - "tx": str, "exon": int, "min": int, "mean": float, "max": int, - r'[0-9]*x': float, "exon_len": int - } + plot_titles = [str(x[0]) + " exon: " + str(int(x[1])) for x in genes] - cov_stats = pd.read_csv( - exon_file, sep="\t", comment='#', dtype=dtypes - ) + low_raw_cov["exon_len"] =\ + low_raw_cov["exon_end"] - low_raw_cov["exon_start"] - # strip chr from chrom in cases of diff. formatted bed - cov_stats["chrom"] = cov_stats["chrom"].apply( - lambda x: str(x).replace("chr", "") - ) + low_raw_cov["relative_position"] = low_raw_cov["exon_end"] - round((( + low_raw_cov["cov_end"] + low_raw_cov["cov_start"]) / 2 + )) - # read in gene stats file - with open(gene_stats) as gene_file: - dtypes = { - "gene": str, "tx": str, "min": int, - "mean": float, "max": int, r'[0-9]*x': float - } + # set no. rows to no. of plots / no of columns to define grid + columns = 4 + rows = math.ceil(len(genes) / 4) - cov_summary = pd.read_csv( - gene_file, sep="\t", comment='#', dtype=dtypes - ) + # variable height depeendent on no. of plots + v_space = (1 / rows) * 0.25 - flagstat = {} - # read in flagstat and build from header of gene stats file - with open(gene_stats) as gene_file: - for ln in gene_file: - if ln.startswith("#"): - if "build" in ln: - # get build number - reference = ln.split(":")[1] - # add build to string to display - if "37" in reference: - build = "
  • Reference build used for aligment\ - {}
  • ".format(reference) - if "38" in build: - build = "
  • Reference build used for aligment\ - {}
  • ".format(reference) - else: - # read in flagstat from header - key = ln.split(":")[0].strip("#") - val = ln.split(":")[1] - flagstat[key] = val + # define grid to add plots to + fig = make_subplots( + rows=rows, cols=columns, print_grid=False, + horizontal_spacing=0.04, vertical_spacing=v_space, + subplot_titles=plot_titles + ) - if "build" not in locals(): - # build no. not included in gene_stats file - build = "" + # counter for grid + row_no = 1 + col_no = 1 - if panel is not None: - # if optional panel file given, get name and format for HTML - panel_name = Path(panel).stem + for gene in genes: + # make plot for each gene / exon - # format according to output of - # https://github.com/eastgenomics/eggd_generate_bed - panel_name = [x.strip("_") for x in panel_name.split("&&") if x] - panel_name = [ - x.strip("_b37").strip("_b38") for x in panel_name if x - ] - panel_name = [x.replace("_", " ") for x in panel_name if x] - panel_name = ", ".join(panel_name) - panel = "
  • Panel(s) / gene(s) included in report: {}\ -
  • ".format(panel_name) - else: - panel = "" + # counter for grid, by gets to 5th entry starts new row + if row_no // 5 == 1: + col_no += 1 + row_no = 1 - column = [ - "chrom", "exon_start", "exon_end", "gene", "tx", "exon", - "cov_start", "cov_end", "cov" - ] + # get rows for current gene and exon + exon_cov = low_raw_cov.loc[( + low_raw_cov["gene"] == gene[0] + ) & ( + low_raw_cov["exon"] == gene[1] + )] - dtypes = { - "chrom": str, "exon_start": int, "exon_end": int, "gene": str, - "tx": str, "exon": int, "cov_start": int, "cov_end": int, - "cov": int - } + exon_cov = exon_cov.sort_values(by='cov_start', ascending=True) + start = exon_cov.iloc[0] + end = exon_cov.iloc[-1] - # read in raw coverage stats file - with open(raw_coverage) as raw_file: - raw_coverage = pd.read_csv( - raw_file, sep="\t", names=column, dtype=dtypes - ) - # strip chr from chrom in cases of diff. formatted bed - raw_coverage["chrom"] = raw_coverage["chrom"].apply( - lambda x: str(x).replace("chr", "") - ) + if start["exon_start"] != start["cov_start"]: + # if cov_start is diff to tx start due to mosdepth + # binning, use tx start avoids wrongly estimating + # coverage by using wrong tx length + exon_cov.iloc[0, exon_cov.columns.get_loc( + "cov_start")] = int(start["exon_start"]) - if snp_vcfs: - # get names of SNP vcfs used to display in report - vcfs = ", ".join([Path(x).stem for x in snp_vcfs]) - vcfs = "
    VCF(s) of known variants included in report: {}\ -
    ".format(vcfs) - else: - vcfs = "" + if end["exon_end"] != end["cov_end"]: + # same as start + exon_cov.loc[ + exon_cov.index[-1], "cov_end"] = int(end["exon_end"]) - # check given threshold is in the stats files - if "x" not in str(threshold): - threshold = str(threshold) + "x" + # create empty df for unbinned data with same columns + exon_cov_unbinned = exon_cov[0:0] - if threshold not in list(cov_stats) and\ - threshold not in list(cov_summary): - print("""--threshold must be one of the gene and exon - stats coverage thresholds. Exiting now.""") - sys.exit() + for i, row in exon_cov.iterrows(): + for pos in range(row["cov_start"], row["cov_end"] + 1): + # unbin each row, set start & end to same value for each + # use +1 since range is non inclusive of final value + pos_row = row + pos_row["cov_start"] = pos + pos_row["cov_end"] = pos + exon_cov_unbinned = exon_cov_unbinned.append( + pos_row, ignore_index=True + ) - return cov_stats, cov_summary, raw_coverage, html_template, build,\ - panel, vcfs, bootstrap, version + # build list of first and last point for threshold line + xval = [x for x in range( + exon_cov_unbinned["cov_start"].iloc[0], + exon_cov_unbinned["cov_end"].iloc[-1] + )] + xval = xval[::len(xval) - 1] + yval = [threshold] * 2 + # info field for hovering on plot line + label = 'position: %{x}
    coverage: %{y}' - def build_report(self, html_template, total_stats, gene_stats, - sub_threshold_stats, snps_low_cov, snps_high_cov, - snps_no_cov, fig, all_plots, summary_plot, report_vals, - bootstrap - ): - """ - Build report from template and variables to write to file + # generate plot and threshold line to display + if sum(exon_cov_unbinned["cov"]) != 0: + plot = go.Scatter( + x=exon_cov_unbinned["cov_start"], y=exon_cov_unbinned["cov"], + mode="lines", + hovertemplate=label + ) + else: + # if any plots have no coverage, just display empty plot + # very hacky way by making data point transparent but + # ¯\_(ツ)_/¯ + plot = go.Scatter( + x=exon_cov_unbinned["cov_start"], y=exon_cov_unbinned["cov"], + mode="markers", marker={"opacity": 0} + ) - Args: - - html_template (str): string of HTML template file - - total_stats (df): total stats table of all genes & exons - - gene_stats (df): stats table of whole gene - - sub_threshold_stats (df): table of exons with < threshold - - snps_low_cov (df): table of snps with cov < threshold - - snsp_high_cov (df): table of snps with cov > threshold - - snps_no_cov (df): variants that span exon boundaries (i.e SVs) - - fig (figure): grid of low coverage exon plots (plotly) - - all-plots (figure): grid of all full gene- exon plots - - summary_plot (figure): gene summary plot - % at threshold - - report_vals (dict): values to display in report text - Returns: - - single_report (str): HTML string of filled report - """ - # convert logo image into string to pass in to template - logo = str(os.path.join(os.path.dirname( - os.path.abspath(__file__)), "../data/static/images/logo.png" - )) + threshold_line = go.Scatter( + x=xval, y=yval, hoverinfo='skip', mode="lines", + line=dict(color='rgb(205, 12, 24)', width=1) + ) - data_uri = base64.b64encode(open(logo, 'rb').read()).decode('utf-8') - logo = ''.format( - data_uri) + # add to subplot grid + fig.add_trace(plot, col_no, row_no) + fig.add_trace(threshold_line, col_no, row_no) - t = Template(html_template) + row_no = row_no + 1 - date = datetime.today().strftime('%Y-%m-%d') + # set height of grid by no. rows and scale value of 325 + height = (rows * 300) + 150 - single_report = t.safe_substitute( - bootstrap=bootstrap, - logo=logo, - total_genes=report_vals["total_genes"], - threshold=report_vals["threshold"], - summary_text=report_vals["summary_text"], - exon_issues=report_vals["exon_issues"], - gene_issues=report_vals["gene_issues"], - fully_covered_genes=report_vals["fully_covered_genes"], - name=report_vals["name"], - sub_threshold_stats=sub_threshold_stats, - low_cov_plots=fig, - all_plots=all_plots, - summary_plot=summary_plot, - gene_stats=gene_stats, - total_stats=total_stats, - snps_high_cov=snps_high_cov, - snps_low_cov=snps_low_cov, - snps_no_cov=snps_no_cov, - total_snps=report_vals["total_snps"], - snps_covered=report_vals["snps_covered"], - snps_pct_covered=report_vals["snps_pct_covered"], - snps_not_covered=report_vals["snps_not_covered"], - snps_pct_not_covered=report_vals["snps_pct_not_covered"], - snps_out_panel=report_vals["snps_out_panel"], - snps_pct_out_panel=report_vals["snps_pct_out_panel"], - date=date, - build=report_vals["build"], - vcfs=report_vals["vcfs"], - panel=report_vals["panel"], - panel_pct_coverage=report_vals["panel_pct_coverage"], - version=report_vals["version"] + # update plot formatting + fig.update_xaxes(nticks=3, ticks="", showgrid=True, tickformat=',d') + fig.update_yaxes(title='coverage', title_standoff=0) + fig.update_xaxes(title='exon position', color='#FFFFFF') + fig["layout"].update( + height=height, showlegend=False, margin=dict(l=50, r=0) ) - return single_report + # write plots to html string + fig = fig.to_html(full_html=False) + return fig - def panel_coverage(self, cov_stats, threshold): + + def all_gene_plots(self, raw_coverage, threshold): """ - Calculates mean coverage of all panel regions at given threshold, - normalised against length of each gene + Generate full plots for each gene Args: - - cov_stats (df): df of coverage stats for each exon - - threshold (int): threshold cut off for low coverage + - raw_coverage (file): from args; bp coverage file used as + input for coverage_stats_single.py + - threshold (int): defined threshold level (default: 20) Returns: - - panel_pct_coverage (str): % coverage of panel as str + - all-plots (figure): grid of all full gene- exon plots """ - print("Calculating panel average coverage") - # threshold column to check at - threshold = str(threshold) + "x" + all_plots = "" - gene_stats = pd.DataFrame( - columns=["gene", "gene_len", "coverage"]) + if len(raw_coverage.index) == 0: + # passed empty df, most likely because there were less genes + # than processes => empty df passed with multiprocess + return "" - # make list of genes - genes = sorted(list(set(cov_stats["gene"].tolist()))) + # get unique list of genes + genes = raw_coverage.drop_duplicates(["gene"])["gene"].values.tolist() for gene in genes: - # for each gene, calculate length and average % at threshold - gene_cov = cov_stats.loc[cov_stats["gene"] == gene] - length = sum(gene_cov["exon_len"]) - coverage = sum( - gene_cov[threshold] * gene_cov["exon_len"] / length) + # get coverage data for current gene + gene_cov = raw_coverage.loc[(raw_coverage["gene"] == gene)] + # get list of exons + exons = gene_cov.drop_duplicates(["exon"])["exon"].values.tolist() - gene_stats = gene_stats.append({ - "gene": gene, - "gene_len": length, - "coverage": coverage - }, ignore_index=True) + # no. plot columns = no. of exons + column_no = len(exons) - # calculate % panel coverage - panel_coverage = sum( - gene_stats["coverage"] * gene_stats["gene_len"] / sum( - gene_stats["gene_len"] - ) - ) + # make subplot grid size of no. of exons, height variable + # splits large genes to several rows and maintains height + height = math.ceil(len(exons) / 30) * 4 + fig = plt.figure(figsize=(30, height)) - # round to 12 dp to account for limit of accuracy of float & - # length of human genome - panel_coverage = round(panel_coverage, 12) + # generate grid with space for each exon + # splits genes with >25 exons to multiple rows + rows = math.ceil(len(exons) / 30) + if column_no > 30: + column_no = 30 - panel_pct_coverage = str(math.floor(panel_coverage * 100) / 100) + grid = fig.add_gridspec(rows, column_no, wspace=0) + axs = grid.subplots(sharey=True) - return panel_pct_coverage + if column_no == 1: + # handle single exon genes, axs needs turning into np + # array to flatten + axs = np.array([axs]) + axs = axs.flatten() - def snp_coverage(self, snp_vcfs, raw_coverage, threshold): - """ - Produces tables of coverage for variants inside of capture - regions, and larger structural variants spanning region - boundaries. + fig.suptitle(gene, fontweight="bold") + count = 0 - Args: - - snp_vcfs (str): list of vcf files used for SNP analysis - - raw_coverage (df): raw bp coverage for each exon - - threshold (int): threshold value passed from parse args + for exon in exons: + # get coverage data for current exon + exon_cov = raw_coverage.loc[( + raw_coverage["gene"] == gene + ) & ( + raw_coverage["exon"] == exon + )] - Returns: - - snps_low_cov (df): variants with lower coverage than threshold - - snps_high_cov (df): variants with higher coverage than threshold - - snps_no_cov (df): variants that span exon boundaries (i.e SVs) - """ - print("Calculating coverage of given SNPs") + exon_cov = exon_cov.reset_index(drop=True) - bedFile = raw_coverage[ - ["chrom", "exon_start", "exon_end"]].drop_duplicates() - coverageFile = raw_coverage[ - ["chrom", "cov_start", "cov_end", "cov"]].drop_duplicates() + # sort and check coordinates are correct + exon_cov = exon_cov.sort_values( + by='cov_start', ascending=True + ) - # turn dfs into BedTools objects - bed = bedtools.BedTool.from_dataframe(bedFile) - cov = bedtools.BedTool.from_dataframe(coverageFile) + start = exon_cov.iloc[0] + end = exon_cov.iloc[-1] - # empty df to add all SNP info to - snp_df = pd.DataFrame(columns=[ - 'VCF', 'chrom', 'pos', 'id', 'ref', 'alt', 'info' - ]) + if start["exon_start"] != start["cov_start"]: + # if cov_start is diff to tx start due to mosdepth + # binning, use tx start avoids wrongly estimating + # coverage by using wrong tx length + exon_cov.iloc[ + 0, exon_cov.columns.get_loc("cov_start") + ] = int(start["exon_start"]) - for vcf in snp_vcfs: - # read vcf into BedTools object - v = bedtools.BedTool(vcf) + if end["exon_end"] != end["cov_end"]: + # same as start + exon_cov.loc[exon_cov.index[-1], "cov_end"] = int( + end["exon_end"] + ) - # get vcf name to add to table, req. for multiple VCFS and - # recording variant source VCF - name = Path(vcf).stem.split("_")[0] + # check if coverage column empty + if (exon_cov['cov'] == 0).all(): + # no coverage, generate empty plot with just + # threshold line + axs[count].plot( + [0, 100], [threshold, threshold], + color='red', linestyle='-', linewidth=2 + ) + else: + axs[count].plot(exon_cov["cov_start"], exon_cov["cov"]) - # use bedtools intersect to get SNPs in capture region - snps = bed.intersect(v, wb=True) + # threshold line + axs[count].plot( + [exon_cov["exon_start"], exon_cov["exon_end"]], + [threshold, threshold], color='red', linestyle='-', + linewidth=1 + ) - for row in snps: - # get data from returned BedTools object, add to df - snp_data = str(row).split() - snp_df = snp_df.append({ - 'VCF': name, 'chrom': snp_data[3], - 'pos': snp_data[4], 'ref': snp_data[6], - 'alt': snp_data[7], 'info': snp_data[10] - }, ignore_index=True) + # add labels + xlab = str( + exon_cov["exon_end"].iloc[0] - + exon_cov["exon_start"].iloc[0] + ) + "\nbp" + axs[count].title.set_text(exon) + axs[count].set_xlabel(xlab) - snp_df = snp_df[ - ['VCF', 'chrom', 'pos', 'ref', 'alt', 'info']].drop_duplicates() + count += 1 - # reset index - raw_coverage = raw_coverage.reset_index(drop=True) + # remove y ticks & label for all but first plot of lines + for i in range(column_no * rows): + if i in [x * column_no for x in range(rows)]: + # first plot of line, keep ticks and labels + continue + else: + axs[i].yaxis.set_ticks_position('none') - # use pandasql to intersect SNPs against coverage df to find the - # coverage at each SNP position - coverage_sql = """ - SELECT snp_df.VCF, snp_df.chrom, snp_df.pos, snp_df.ref, - snp_df.alt, snp_df.info, raw_coverage.gene, raw_coverage.exon, - raw_coverage.cov_start, raw_coverage.cov_end, raw_coverage.cov - FROM snp_df - LEFT JOIN raw_coverage on snp_df.CHROM=raw_coverage.chrom - WHERE snp_df.POS > raw_coverage.cov_start AND - snp_df.POS <= raw_coverage.cov_end - """ + # strip x axis ticks and labels + plt.setp(plt.gcf().get_axes(), xticks=[]) - snp_cov = pdsql.sqldf(coverage_sql, locals()) + # adjust yaxis limits + ymax = max(gene_cov["cov"].tolist()) + 10 + plt.ylim(bottom=0, top=ymax) - # get SNPs that won't have coverage data but do intersect panel - # regions (i.e. large deletions that span a region) - snps_no_cov = snp_df.merge(snp_cov, how='outer', indicator=True).loc[ - lambda x: x['_merge'] == 'left_only'] + # remove outer white margins + fig.tight_layout(h_pad=1.2) - snps_no_cov = snps_no_cov[[ - "VCF", "chrom", "pos", "ref", "alt", "info" - ]].reset_index(drop=True) + # convert image to html string and append to one really long + # string to insert in report + buffer = BytesIO() + plt.savefig(buffer, format='png') + buffer.seek(0) + image_png = buffer.getvalue() + buffer.close() + graphic = base64.b64encode(image_png) + data_uri = graphic.decode('utf-8') + img_tag = "".format( + data_uri + ) - # get required columns for SNP tables - snps_cov = snp_cov[ - ["VCF", "gene", "exon", "chrom", "pos", "ref", "alt", "cov"] - ].drop_duplicates(subset=[ - "VCF", "chrom", "pos", "ref", "alt"]).reset_index(drop=True) + all_plots = all_plots + img_tag + "

    " - # rename columns for displaying in report - snps_cov.columns = ["VCF", "Gene", "Exon", "Chromosome", "Position", - "Ref", "Alt", "Coverage"] + plt.close(fig) - snps_no_cov.columns = [ - "VCF", "Chromosome", "Position", "Ref", "Alt", "Info" - ] + return all_plots - # remove <> from DELs to stop being interpreted as HTML tags - snps_no_cov["Alt"] = snps_no_cov["Alt"].str.strip("<>") - snps_cov["Coverage"] = snps_cov["Coverage"].astype(int) + def summary_gene_plot(self, cov_summary, threshold): + """ + Generate summary plot of all genes against threshold value - # sort no_cov table by chrom & pos, as pos is str first define - # order to sort by - order = [str(x) for x in range(0, 23)] - order.extend(["X", "Y", "MT"]) - snps_no_cov["Chromosome"] = pd.Categorical( - snps_no_cov["Chromosome"], order + Args: + - cov_summary (df): df of gene coverage values + - threshold (int): defined threshold level (default: 20) + + Returns: + - summary_plot (fig): plot of all genes + """ + print("Generating summary plot") + + threshold = str(threshold) + "x" + + summary_data = cov_summary.copy() + + # define colours based on values + summary_data["colours"] = 'green' + summary_data.loc[summary_data[threshold] < 100, 'colours'] = 'orange' + summary_data.loc[summary_data[threshold] < 90, 'colours'] = 'red' + + summary_data = summary_data.sort_values( + by=[threshold], ascending=False ) + summary_plot, axs = plt.subplots(figsize=(25, 7.5)) - snps_cov = snps_cov.sort_values(by=["Gene", "Exon", "Position"]) - snps_no_cov = snps_no_cov.sort_values(by=["Chromosome", "Position"]) + if len(summary_data.index) > 100: + # split off some of 100% covered genes to limit size of plot + if len(summary_data[summary_data[threshold] < 100]) > 100: + # over 100 sub threshold genes, remove all 100% genes + genes100pct = len(summary_data[summary_data[threshold] == 100]) + summary_data = summary_data[summary_data[threshold] < 100] + else: + # split off bottom 100 genes, plot includes some 100% covered + genes100pct = len(summary_data.iloc[:-100]) + summary_data = summary_data.iloc[-100:] - # split SNPs by coverage against threshold - snps_low_cov = snps_cov.loc[snps_cov["Coverage"] < threshold] - snps_high_cov = snps_cov.loc[snps_cov["Coverage"] >= threshold] + plt.bar( + summary_data["gene"], [int(x) for x in summary_data[threshold]], + color=summary_data.colours + ) - return snps_low_cov, snps_high_cov, snps_no_cov + if "genes100pct" in locals(): + genes100pct = str(genes100pct) + # more than 100 genes, add title inc. 100% covered not shown + axs.set_title( + r"$\bf{" + genes100pct + "}$" + " genes covered 100% at " + + r"$\bf{" + threshold + "}$" + + " were omitted from the plot due to the panel size", loc='left' + ) + # threshold lines + plt.axhline(y=99, linestyle='--', color="#565656", alpha=0.6) + plt.axhline(y=95, linestyle='--', color="#565656", alpha=0.6) - def low_coverage_regions(self, cov_stats, raw_coverage, threshold): + plt.text(1.005, 0.94, '99%', transform=axs.transAxes) + plt.text(1.005, 0.91, '95%', transform=axs.transAxes) + + # plot formatting + axs.tick_params(labelsize=6, length=0) + plt.xticks(rotation=55, color="#565656") + + # adjust whole plot marins + axs.margins(x=0.01) + axs.autoscale_view(scaley=True) + + # add legend + green = mpatches.Patch(color='green', label='100%') + orange = mpatches.Patch(color='orange', label='90-99.99%') + red = mpatches.Patch(color='red', label='<90%') + + plt.legend( + handles=[green, orange, red], loc='upper center', + bbox_to_anchor=(0.5, -0.1), + fancybox=True, shadow=True, ncol=12, fontsize=12 + ) + + vals = np.arange(0, 110, 10).tolist() + plt.yticks(vals, vals) + axs.tick_params(axis='both', which='major', labelsize=8) + + plt.xlabel("") + plt.ylabel("% coverage ({})".format(threshold), fontsize=11) + + axs.yaxis.grid(linewidth=0.5, color="grey", linestyle="-.") + plt.box(False) + axs.set_axisbelow(True) + plt.tight_layout() + + # convert image to html string to insert in report + buffer = BytesIO() + plt.savefig(buffer, format='png') + buffer.seek(0) + image_png = buffer.getvalue() + buffer.close() + graphic = base64.b64encode(image_png) + data_uri = graphic.decode('utf-8') + summary_plot = "".format( + data_uri + ) + + return summary_plot + + +class styleTables(): + """Functions for styling tables for displaying in report""" + + def style_sub_threshold( + self, cov_stats, threshold, threshold_cols, vals + ): """ - Get regions where coverage at given threshold is <100% + Styling of sub threshold stats df for displaying in report Args: - - cov_stats (df): df of coverage stats for each exon - - raw_coverage (df): raw bp coverage for each exon - - threshold (int): defined threshold level (default: 20) - + - cov_stats (df): df of per exon coverage stats + - threshold (str): low coverage threshold value + - threshold_cols (list): threshold values for coverage Returns: - - low_raw_cov (df): df of raw bp values for each region with - coverage less than 100% at threshold + - sub_threshold_stats (): """ - # threshold column to check at - threshold = str(threshold) + "x" - - # get threshold columns and add to column names - threshold_cols = list(cov_stats.filter(regex='[0-9]+x', axis=1)) - column = [ - "gene", "tx", "chrom", "exon", "exon_start", "exon_end", - "min", "mean", "max" + "gene", "tx", "chrom", "exon", "exon_len", "exon_start", + "exon_end", "min", "mean", "max" ] column.extend(threshold_cols) - # empty df - low_stats = pd.DataFrame(columns=column) + sub_threshold = pd.DataFrame(columns=column) - # get all exons with <100% coverage at given threshold + # get all exons with <100% coverage at threshold for i, row in cov_stats.iterrows(): if int(row[threshold]) < 100: - low_stats = low_stats.append(row, ignore_index=True) + sub_threshold = sub_threshold.append(row, ignore_index=True) # pandas is terrible and forces floats, change back to int dtypes = { 'chrom': str, 'exon': int, + 'exon_len': int, 'exon_start': int, 'exon_end': int, 'min': int, 'max': int } - low_stats = low_stats.astype(dtypes) + if not sub_threshold.empty: + # some low covered regions identified + sub_threshold = sub_threshold.astype(dtypes) - # get list of tuples of genes and exons with low coverage to - # select out raw coverage - low_exon_list = low_stats.reset_index()[['gene', - 'exon']].values.tolist() - low_exon_list = [tuple(exon) for exon in low_exon_list] + sub_threshold_stats = pd.pivot_table(sub_threshold, index=[ + "gene", "tx", "chrom", "exon", + "exon_len", "exon_start", "exon_end" + ], values=vals) - # get raw coverage for low coverage regions to plot - low_raw_cov = raw_coverage[raw_coverage[['gene', 'exon']].apply( - tuple, axis=1).isin(low_exon_list)].reset_index() + # reset index to fix formatting + sub_threshold_stats = sub_threshold_stats.reindex(vals, axis=1) + sub_threshold_stats.reset_index(inplace=True) - return low_raw_cov + gene_issues = len(list(set(sub_threshold_stats["gene"].tolist()))) + exon_issues = len(sub_threshold_stats["exon"]) + else: + # if no low regions set to empty df with appropriate columns + print("No low coverage regions, generating empty table") + sub_threshold_stats = pd.DataFrame(columns=column) + gene_issues = 0 + exon_issues = 0 + # rename columns to display properly + sub_threshold_stats = sub_threshold_stats.rename(columns={ + "gene": "Gene", + "tx": "Transcript", + "chrom": "Chr", + "exon": "Exon", + "exon_len": "Length", + "exon_start": "Start", + "exon_end": "End", + "min": "Min", + "mean": "Mean", + "max": "Max" + }) - def low_exon_plot(self, low_raw_cov, threshold): - """ - Plot bp coverage of exon, used for those where coverage is given - threshold + # reindex & set to begin at 1 + sub_threshold_stats.index = np.arange( + 1, len(sub_threshold_stats.index) + 1 + ) - Args: - - low_raw_cov (df): df of raw coverage for exons with low - coverage - - threshold (int): defined threshold level (default: 20) + # creat slices of sub_threshold stats df to add styling to + slice_ranges = { + "x0": (10, 0), "x10": (30, 10), "x30": (50, 30), "x50": (70, 50), + "x70": (90, 70), "x90": (95, 90), "x95": (99, 95), "x99": (101, 99) + } - Returns: - - fig (figure): plots of low coverage regions - """ - print("Generating plots of low covered regions") + sub_slice = {} - # get list of tuples of genes and exons to define plots - genes = low_raw_cov.drop_duplicates( - ["gene", "exon"])[["gene", "exon"]].values.tolist() - genes = [tuple(exon) for exon in genes] + for key, val in slice_ranges.items(): + sub_slice[key] = pd.IndexSlice[sub_threshold_stats.loc[( + sub_threshold_stats[threshold] < val[0] + ) & ( + sub_threshold_stats[threshold] >= val[1])].index, threshold] - if len(genes) == 0: - # everything above threshold, don't generate plots - fig = "

    All regions in panel above threshold, no plots\ - to show.

    " + # df column index of threshold + col_idx = sub_threshold_stats.columns.get_loc(threshold) - return fig + # make dict for rounding coverage columns to 2dp + rnd = {} + for col in list(sub_threshold_stats.columns[10:15]): + rnd[col] = '{0:.2f}%' - # sort list of genes/exons by gene and exon - genes = sorted(genes, key=lambda element: (element[0], element[1])) + # set threshold column widths as a fraction of 30% table width + t_width = str(30 / len(threshold_cols)) + "%" - plot_titles = [str(x[0]) + " exon: " + str(int(x[1])) for x in genes] + # apply colours to coverage cell based on value, 0 is given solid red + s = sub_threshold_stats.style.apply(lambda x: [ + "background-color: #b30000" if x[threshold] == 0 and idx == col_idx + else "" for idx, v in enumerate(x) + ], axis=1)\ + .bar(subset=sub_slice["x0"], color='#b30000', vmin=0, vmax=100)\ + .bar(subset=sub_slice["x10"], color='#990000', vmin=0, vmax=100)\ + .bar(subset=sub_slice["x30"], color='#C82538', vmin=0, vmax=100)\ + .bar(subset=sub_slice["x50"], color='#FF4500', vmin=0, vmax=100)\ + .bar(subset=sub_slice["x70"], color='#FF4500', vmin=0, vmax=100)\ + .bar(subset=sub_slice["x90"], color='#FF4500', vmin=0, vmax=100)\ + .bar(subset=sub_slice["x95"], color='#FFBF00', vmin=0, vmax=100)\ + .bar(subset=sub_slice["x99"], color='#007600', vmin=0, vmax=100)\ + .format(rnd)\ + .set_table_attributes('table border="1"\ + class="dataframe table table-hover table-bordered"')\ + .set_uuid("low_exon_table")\ + .set_properties(**{'font-size': '0.85vw', 'table-layout': 'auto'})\ + .set_properties(subset=threshold_cols, **{'width': t_width})\ - low_raw_cov["exon_len"] =\ - low_raw_cov["exon_end"] - low_raw_cov["exon_start"] + sub_threshold_stats["Mean"] = sub_threshold_stats["Mean"].apply( + lambda x: int(x) + ) - low_raw_cov["relative_position"] = low_raw_cov["exon_end"] - round((( - low_raw_cov["cov_end"] + low_raw_cov["cov_start"]) / 2 - )) + sub_threshold_stats = s.render() - # set no. rows to no. of plots / no of columns to define grid - columns = 4 - rows = math.ceil(len(genes) / 4) + return sub_threshold_stats, gene_issues, exon_issues - # variable height depeendent on no. of plots - v_space = (1 / rows) * 0.25 - # define grid to add plots to - fig = make_subplots( - rows=rows, cols=columns, print_grid=False, - horizontal_spacing=0.04, vertical_spacing=v_space, - subplot_titles=plot_titles + def style_total_stats(self, cov_stats, threshold_cols, vals): + """ + Styling of full gene-exon stats table for displaying in report + Args: + - + Returns: + - + """ + # do some excel level formatting to make table more readable + total_stats = pd.pivot_table( + cov_stats, + index=["gene", "tx", "chrom", "exon", "exon_len", + "exon_start", "exon_end"], + values=vals ) - # counter for grid - row_no = 1 - col_no = 1 - - for gene in genes: - # make plot for each gene / exon + # reset index to fix formatting, set beginning to 1 + total_stats = total_stats.reindex(vals, axis=1) + total_stats.index = np.arange(1, len(total_stats.index) + 1) - # counter for grid, by gets to 5th entry starts new row - if row_no // 5 == 1: - col_no += 1 - row_no = 1 + total_stats = total_stats.rename(columns={ + "gene": "Gene", + "tx": "Transcript", + "chrom": "Chr", + "exon": "Exon", + "exon_len": "Length", + "exon_start": "Start", + "exon_end": "End", + "min": "Min", + "mean": "Mean", + "max": "Max" + }) - # get rows for current gene and exon - exon_cov = low_raw_cov.loc[( - low_raw_cov["gene"] == gene[0] - ) & ( - low_raw_cov["exon"] == gene[1] - )] + # limit to 2dp using math.floor, use of round() with + # 2dp may lead to inaccuracy such as 99.99 => 100.00 + round_cols = ['Mean'] + threshold_cols - exon_cov = exon_cov.sort_values(by='cov_start', ascending=True) - start = exon_cov.iloc[0] - end = exon_cov.iloc[-1] + for col in round_cols: + total_stats[col] = total_stats[col].map( + lambda col: math.floor(col * 100) / 100 + ) - if start["exon_start"] != start["cov_start"]: - # if cov_start is diff to tx start due to mosdepth - # binning, use tx start avoids wrongly estimating - # coverage by using wrong tx length - exon_cov.iloc[0, exon_cov.columns.get_loc( - "cov_start")] = int(start["exon_start"]) + # CSS table class for styling tables + style = ( + '', + '
    ' + ) - if end["exon_end"] != end["cov_end"]: - # same as start - exon_cov.loc[ - exon_cov.index[-1], "cov_end"] = int(end["exon_end"]) + total_stats = total_stats.to_html(justify='left').replace( + style[0], style[1] + ) - # create empty df for unbinned data with same columns - exon_cov_unbinned = exon_cov[0:0] + return total_stats - for i, row in exon_cov.iterrows(): - for pos in range(row["cov_start"], row["cov_end"] + 1): - # unbin each row, set start & end to same value for each - # use +1 since range is non inclusive of final value - pos_row = row - pos_row["cov_start"] = pos - pos_row["cov_end"] = pos - exon_cov_unbinned = exon_cov_unbinned.append( - pos_row, ignore_index=True - ) - # build list of first and last point for threshold line - xval = [x for x in range( - exon_cov_unbinned["cov_start"].iloc[0], - exon_cov_unbinned["cov_end"].iloc[-1] - )] - xval = xval[::len(xval) - 1] - yval = [threshold] * 2 + def style_cov_summary(self, cov_summary, threshold_cols): + """ + """ + # rename columns for displaying in report + cov_summary = cov_summary.drop(columns=["exon"]) + cov_summary = cov_summary.rename(columns={ + "gene": "Gene", + "tx": "Transcript", + "min": "Min", + "mean": "Mean", + "max": "Max" + }) - # info field for hovering on plot line - label = 'position: %{x}
    coverage: %{y}' + # get values to display in report + total_genes = len(cov_summary["Gene"].tolist()) - # generate plot and threshold line to display - if sum(exon_cov_unbinned["cov"]) != 0: - plot = go.Scatter( - x=exon_cov_unbinned["cov_start"], y=exon_cov_unbinned["cov"], - mode="lines", - hovertemplate=label - ) - else: - # if any plots have no coverage, just display empty plot - # very hacky way by making data point transparent but - # ¯\_(ツ)_/¯ - plot = go.Scatter( - x=exon_cov_unbinned["cov_start"], y=exon_cov_unbinned["cov"], - mode="markers", marker={"opacity": 0} - ) + # limit to 2dp using math.floor, use of round() with + # 2dp may lead to inaccuracy such as 99.99 => 100.00 + round_cols = ['Mean'] + threshold_cols - threshold_line = go.Scatter( - x=xval, y=yval, hoverinfo='skip', mode="lines", - line=dict(color='rgb(205, 12, 24)', width=1) + for col in round_cols: + cov_summary[col] = cov_summary[col].map( + lambda col: math.floor(col * 100) / 100 ) - # add to subplot grid - fig.add_trace(plot, col_no, row_no) - fig.add_trace(threshold_line, col_no, row_no) - - row_no = row_no + 1 - - # set height of grid by no. rows and scale value of 325 - height = (rows * 300) + 150 + # reset index to start at 1 + cov_summary.index = np.arange(1, len(cov_summary.index) + 1) - # update plot formatting - fig.update_xaxes(nticks=3, ticks="", showgrid=True, tickformat=',d') - fig.update_yaxes(title='coverage', title_standoff=0) - fig.update_xaxes(title='exon position', color='#FFFFFF') - fig["layout"].update( - height=height, showlegend=False, margin=dict(l=50, r=0) + # CSS table class for styling tables + style = ( + '
    ', + '
    ' ) - # write plots to html string - fig = fig.to_html(full_html=False) + # generate HTML strings from table objects to write to report + gene_stats = cov_summary.to_html(justify='left').replace( + style[0], style[1] + ) - return fig + return gene_stats, total_genes - def all_gene_plots(self, raw_coverage, threshold): + def style_snps_low_cov(self, snps_low_cov): """ - Generate full plots for each gene - - Args: - - raw_coverage (file): from args; bp coverage file used as - input for coverage_stats_single.py - - threshold (int): defined threshold level (default: 20) - - Returns: - - all-plots (figure): grid of all full gene- exon plots """ + # get snps values and format dfs to display + if not snps_low_cov.empty: + # format low coverage SNPs table + snps_low_cov.index = np.arange(1, len(snps_low_cov.index) + 1) + snps_not_covered = len(snps_low_cov.index) + snps_low_cov = snps_low_cov.style\ + .set_table_attributes( + 'class="dataframe table table-striped"')\ + .set_uuid("var_low_cov")\ + .set_properties(**{ + 'font-size': '0.80vw', 'table-layout': 'auto' + })\ + .set_properties(subset=["VCF", "Gene"], **{'width': '10%'})\ + .set_properties(subset=["Exon"], **{'width': '7.5%'})\ + .set_properties(subset=["Chromosome"], **{'width': '10%'})\ + .set_properties(subset=["Position"], **{'width': '12.5%'})\ + .set_properties(subset=["Ref"], **{'width': '20%'})\ + .set_properties(subset=["Alt"], **{'width': '20%'})\ + .set_properties(subset=["Coverage"], **{'width': '10%'}) - all_plots = "" - - if len(raw_coverage.index) == 0: - # passed empty df, most likely because there were less genes - # than processes => empty df passed with multiprocess - return "" + snps_low_cov = snps_low_cov.render() + else: + snps_low_cov = "No low covered SNPs" + snps_not_covered = 0 - # get unique list of genes - genes = raw_coverage.drop_duplicates(["gene"])["gene"].values.tolist() + return snps_low_cov, snps_not_covered - for gene in genes: - # get coverage data for current gene - gene_cov = raw_coverage.loc[(raw_coverage["gene"] == gene)] - # get list of exons - exons = gene_cov.drop_duplicates(["exon"])["exon"].values.tolist() + def style_snps_high_cov(self, snps_high_cov): + """ + """ - # no. plot columns = no. of exons - column_no = len(exons) + if not snps_high_cov.empty: + # format high coverage SNPs table + snps_high_cov.index = np.arange(1, len(snps_high_cov.index) + 1) - # make subplot grid size of no. of exons, height variable - # splits large genes to several rows and maintains height - height = math.ceil(len(exons) / 30) * 4 - fig = plt.figure(figsize=(30, height)) + snps_covered = len(snps_high_cov.index) - # generate grid with space for each exon - # splits genes with >25 exons to multiple rows - rows = math.ceil(len(exons) / 30) - if column_no > 30: - column_no = 30 + snps_high_cov = snps_high_cov.style\ + .set_table_attributes( + 'class="dataframe table table-striped"')\ + .set_uuid("var_high_cov")\ + .set_properties(**{ + 'font-size': '0.80vw', 'table-layout': 'auto' + })\ + .set_properties(subset=["VCF", "Gene"], **{'width': '10%'})\ + .set_properties(subset=["Exon"], **{'width': '7.5%'})\ + .set_properties(subset=["Chromosome"], **{'width': '10%'})\ + .set_properties(subset=["Position"], **{'width': '12.5%'})\ + .set_properties(subset=["Ref"], **{'width': '20%'})\ + .set_properties(subset=["Alt"], **{'width': '20%'})\ + .set_properties(subset=["Coverage"], **{'width': '10%'}) - grid = fig.add_gridspec(rows, column_no, wspace=0) - axs = grid.subplots(sharey=True) + snps_high_cov = snps_high_cov.render() + else: + snps_high_cov = "No covered SNPs" + snps_covered = 0 + + return snps_high_cov, snps_covered - if column_no == 1: - # handle single exon genes, axs needs turning into np - # array to flatten - axs = np.array([axs]) - axs = axs.flatten() + def style_snps_no_cov(self, snps_no_cov): + """ + """ + # if variants from vcf found that span exon boundaries + if not snps_no_cov.empty: + # manually add div and styling around rendered table, allows + # to be fully absent from the report if the table is empty + snps_no_cov.index = np.arange(1, len(snps_no_cov) + 1) - fig.suptitle(gene, fontweight="bold") - count = 0 + # get number of variants to display in report + snps_out_panel = len(snps_no_cov.index) - for exon in exons: - # get coverage data for current exon - exon_cov = raw_coverage.loc[( - raw_coverage["gene"] == gene - ) & ( - raw_coverage["exon"] == exon - )] + html_string = snps_no_cov.style\ + .set_table_attributes( + 'class="dataframe table table-striped"')\ + .set_uuid("var_no_cov")\ + .set_properties(**{ + 'font-size': '0.80vw', 'table-layout': 'auto' + })\ + .set_properties(subset=["VCF"], **{ + 'width': '7.5%' + })\ + .set_properties(subset=[ + "Chromosome", "Position", "Ref", "Alt" + ], **{'width': '10%'}) - exon_cov = exon_cov.reset_index(drop=True) + html_string = html_string.render() - # sort and check coordinates are correct - exon_cov = exon_cov.sort_values( - by='cov_start', ascending=True - ) + snps_no_cov = """ +
    Variants included in the first table below either fully\ + or partially span panel region(s). These are most likely\ + large structural variants and as such do not have\ + coverage data available. See the "info" column for details\ + on the variant. +
    +
    Table of variants spanning panel regions(s)   + +
    +
    + {} +
    +
    + """.format(html_string) + else: + snps_no_cov = "" + snps_out_panel = 0 - start = exon_cov.iloc[0] - end = exon_cov.iloc[-1] + return snps_no_cov, snps_out_panel - if start["exon_start"] != start["cov_start"]: - # if cov_start is diff to tx start due to mosdepth - # binning, use tx start avoids wrongly estimating - # coverage by using wrong tx length - exon_cov.iloc[ - 0, exon_cov.columns.get_loc("cov_start") - ] = int(start["exon_start"]) - if end["exon_end"] != end["cov_end"]: - # same as start - exon_cov.loc[exon_cov.index[-1], "cov_end"] = int( - end["exon_end"] - ) +class singleReport(): + """Functions to calculate values and generate report""" - # check if coverage column empty - if (exon_cov['cov'] == 0).all(): - # no coverage, generate empty plot with just - # threshold line - axs[count].plot( - [0, 100], [threshold, threshold], - color='red', linestyle='-', linewidth=2 - ) - else: - axs[count].plot(exon_cov["cov_start"], exon_cov["cov"]) + def load_files(self, threshold, exon_stats, + gene_stats, raw_coverage, snp_vcfs, panel): + """ + Load in raw coverage data, coverage stats file and template. - # threshold line - axs[count].plot( - [exon_cov["exon_start"], exon_cov["exon_end"]], - [threshold, threshold], color='red', linestyle='-', - linewidth=1 - ) + Args: + - threshold (int): threshold value passed from parse_args + - exon_stats (file): exon stats file (from args; + generated by coverage_stats_single.py) + - gene_stats (file): gene stats file (from args; + generated by coverage_stats_single.py) + - raw_coverage (file): from args; bp coverage file used as + input for coverage_stats_single.py + - snp_vcfs (list): VCFs of SNPs passed from args + - panel (file): panel bed file used for annotation, used to + display panel name in report if passed - # add labels - xlab = str( - exon_cov["exon_end"].iloc[0] - - exon_cov["exon_start"].iloc[0] - ) + "\nbp" - axs[count].title.set_text(exon) - axs[count].set_xlabel(xlab) + Returns: + - cov_stats (df): df of coverage stats for each exon + - cov_summary (df): df of gene level coverage + - raw_coverage (df): raw bp coverage for each exon + - html_template (str): string of HTML report template + - flagstat (dict): flagstat metrics, from gene_stats header + - build (str): ref build used, from gene_stats header + - panel (str): panes(s) / gene(s) included in report + - vcfs (str): list of vcf names used for SNP analysis + - version (str): version of Athena, used to add to report + """ + print("Reading in files") - count += 1 + # read in single sample report template + bin_dir = os.path.dirname(os.path.abspath(__file__)) + template_dir = os.path.join(bin_dir, "../data/templates/") + single_template = os.path.join(template_dir, "single_template.html") - # remove y ticks & label for all but first plot of lines - for i in range(column_no * rows): - if i in [x * column_no for x in range(rows)]: - # first plot of line, keep ticks and labels - continue - else: - axs[i].yaxis.set_ticks_position('none') + with open(single_template, 'r') as template: + html_template = template.read() - # strip x axis ticks and labels - plt.setp(plt.gcf().get_axes(), xticks=[]) + try: + # attempt to get version tag from root dir name + # will only work if downloaded as zip / tar and not cloned + path = str(os.path.join(bin_dir, "../")).split("/") + version = [s for s in path if "athena" in s][0].split("-")[1] + version = "({})".format(version) + except Exception: + print("Error getting version from dir name, continuing.") + version = "" + pass - # adjust yaxis limits - ymax = max(gene_cov["cov"].tolist()) + 10 - plt.ylim(bottom=0, top=ymax) + # read bootstrap into var to store in report html + bs = str(os.path.join(os.path.dirname( + os.path.abspath(__file__)), "../data/static/css/bootstrap.min.css" + )) + with open(bs) as bs: + bootstrap = bs.read() - # remove outer white margins - fig.tight_layout(h_pad=1.2) + # read in exon stats file + with open(exon_stats.name) as exon_file: + dtypes = { + "chrom": str, "exon_start": int, "exon_end": int, "gene": str, + "tx": str, "exon": int, "min": int, "mean": float, "max": int, + r'[0-9]*x': float, "exon_len": int + } - # convert image to html string and append to one really long - # string to insert in report - buffer = BytesIO() - plt.savefig(buffer, format='png') - buffer.seek(0) - image_png = buffer.getvalue() - buffer.close() - graphic = base64.b64encode(image_png) - data_uri = graphic.decode('utf-8') - img_tag = "".format( - data_uri + cov_stats = pd.read_csv( + exon_file, sep="\t", comment='#', dtype=dtypes ) - all_plots = all_plots + img_tag + "

    " + # strip chr from chrom in cases of diff. formatted bed + cov_stats["chrom"] = cov_stats["chrom"].apply( + lambda x: str(x).replace("chr", "") + ) - plt.close(fig) + # read in gene stats file + with open(gene_stats) as gene_file: + dtypes = { + "gene": str, "tx": str, "min": int, + "mean": float, "max": int, r'[0-9]*x': float + } - return all_plots + cov_summary = pd.read_csv( + gene_file, sep="\t", comment='#', dtype=dtypes + ) + + flagstat = {} + # read in flagstat and build from header of gene stats file + with open(gene_stats) as gene_file: + for ln in gene_file: + if ln.startswith("#"): + if "build" in ln: + # get build number + reference = ln.split(":")[1] + # add build to string to display + if "37" in reference: + build = "
  • Reference build used for aligment\ + {}
  • ".format(reference) + if "38" in build: + build = "
  • Reference build used for aligment\ + {}
  • ".format(reference) + else: + # read in flagstat from header + key = ln.split(":")[0].strip("#") + val = ln.split(":")[1] + flagstat[key] = val + + if "build" not in locals(): + # build no. not included in gene_stats file + build = "" + + if panel is not None: + # if optional panel file given, get name and format for HTML + panel_name = Path(panel).stem + + # format according to output of + # https://github.com/eastgenomics/eggd_generate_bed + panel_name = [x.strip("_") for x in panel_name.split("&&") if x] + panel_name = [ + x.strip("_b37").strip("_b38") for x in panel_name if x + ] + panel_name = [x.replace("_", " ") for x in panel_name if x] + panel_name = ", ".join(panel_name) + panel = "
  • Panel(s) / gene(s) included in report: {}\ +
  • ".format(panel_name) + else: + panel = "" + + column = [ + "chrom", "exon_start", "exon_end", "gene", "tx", "exon", + "cov_start", "cov_end", "cov" + ] + + dtypes = { + "chrom": str, "exon_start": int, "exon_end": int, "gene": str, + "tx": str, "exon": int, "cov_start": int, "cov_end": int, + "cov": int + } + + # read in raw coverage stats file + with open(raw_coverage) as raw_file: + raw_coverage = pd.read_csv( + raw_file, sep="\t", names=column, dtype=dtypes + ) + # strip chr from chrom in cases of diff. formatted bed + raw_coverage["chrom"] = raw_coverage["chrom"].apply( + lambda x: str(x).replace("chr", "") + ) + + if snp_vcfs: + # get names of SNP vcfs used to display in report + vcfs = ", ".join([Path(x).stem for x in snp_vcfs]) + vcfs = "
    VCF(s) of known variants included in report: {}\ +
    ".format(vcfs) + else: + vcfs = "" + + # check given threshold is in the stats files + if "x" not in str(threshold): + threshold = str(threshold) + "x" + + if threshold not in list(cov_stats) and\ + threshold not in list(cov_summary): + print("""--threshold must be one of the gene and exon + stats coverage thresholds. Exiting now.""") + sys.exit() + + return cov_stats, cov_summary, raw_coverage, html_template, build,\ + panel, vcfs, bootstrap, version - def summary_gene_plot(self, cov_summary, threshold): + def panel_coverage(self, cov_stats, threshold): """ - Generate summary plot of all genes against threshold value + Calculates mean coverage of all panel regions at given threshold, + normalised against length of each gene Args: - - cov_summary (df): df of gene coverage values - - threshold (int): defined threshold level (default: 20) + - cov_stats (df): df of coverage stats for each exon + - threshold (int): threshold cut off for low coverage Returns: - - summary_plot (fig): plot of all genes + - panel_pct_coverage (str): % coverage of panel as str """ - print("Generating summary plot") + print("Calculating panel average coverage") + # threshold column to check at threshold = str(threshold) + "x" - summary_data = cov_summary.copy() + gene_stats = pd.DataFrame( + columns=["gene", "gene_len", "coverage"]) - # define colours based on values - summary_data["colours"] = 'green' - summary_data.loc[summary_data[threshold] < 100, 'colours'] = 'orange' - summary_data.loc[summary_data[threshold] < 90, 'colours'] = 'red' + # make list of genes + genes = sorted(list(set(cov_stats["gene"].tolist()))) - summary_data = summary_data.sort_values( - by=[threshold], ascending=False - ) - summary_plot, axs = plt.subplots(figsize=(25, 7.5)) + for gene in genes: + # for each gene, calculate length and average % at threshold + gene_cov = cov_stats.loc[cov_stats["gene"] == gene] - if len(summary_data.index) > 100: - # split off some of 100% covered genes to limit size of plot - if len(summary_data[summary_data[threshold] < 100]) > 100: - # over 100 sub threshold genes, remove all 100% genes - genes100pct = len(summary_data[summary_data[threshold] == 100]) - summary_data = summary_data[summary_data[threshold] < 100] - else: - # split off bottom 100 genes, plot includes some 100% covered - genes100pct = len(summary_data.iloc[:-100]) - summary_data = summary_data.iloc[-100:] + length = sum(gene_cov["exon_len"]) + coverage = sum( + gene_cov[threshold] * gene_cov["exon_len"] / length) - plt.bar( - summary_data["gene"], [int(x) for x in summary_data[threshold]], - color=summary_data.colours - ) + gene_stats = gene_stats.append({ + "gene": gene, + "gene_len": length, + "coverage": coverage + }, ignore_index=True) - if "genes100pct" in locals(): - genes100pct = str(genes100pct) - # more than 100 genes, add title inc. 100% covered not shown - axs.set_title( - r"$\bf{" + genes100pct + "}$" + " genes covered 100% at " + - r"$\bf{" + threshold + "}$" + - " were omitted from the plot due to the panel size", loc='left' + # calculate % panel coverage + panel_coverage = sum( + gene_stats["coverage"] * gene_stats["gene_len"] / sum( + gene_stats["gene_len"] ) + ) - # threshold lines - plt.axhline(y=99, linestyle='--', color="#565656", alpha=0.6) - plt.axhline(y=95, linestyle='--', color="#565656", alpha=0.6) + # round to 12 dp to account for limit of accuracy of float & + # length of human genome + panel_coverage = round(panel_coverage, 12) - plt.text(1.005, 0.94, '99%', transform=axs.transAxes) - plt.text(1.005, 0.91, '95%', transform=axs.transAxes) + panel_pct_coverage = str(math.floor(panel_coverage * 100) / 100) - # plot formatting - axs.tick_params(labelsize=6, length=0) - plt.xticks(rotation=55, color="#565656") + return panel_pct_coverage - # adjust whole plot marins - axs.margins(x=0.01) - axs.autoscale_view(scaley=True) - # add legend - green = mpatches.Patch(color='green', label='100%') - orange = mpatches.Patch(color='orange', label='90-99.99%') - red = mpatches.Patch(color='red', label='<90%') + def snp_coverage(self, snp_vcfs, raw_coverage, threshold): + """ + Produces tables of coverage for variants inside of capture + regions, and larger structural variants spanning region + boundaries. - plt.legend( - handles=[green, orange, red], loc='upper center', - bbox_to_anchor=(0.5, -0.1), - fancybox=True, shadow=True, ncol=12, fontsize=12 - ) + Args: + - snp_vcfs (str): list of vcf files used for SNP analysis + - raw_coverage (df): raw bp coverage for each exon + - threshold (int): threshold value passed from parse args - vals = np.arange(0, 110, 10).tolist() - plt.yticks(vals, vals) - axs.tick_params(axis='both', which='major', labelsize=8) + Returns: + - snps_low_cov (df): variants with lower coverage than threshold + - snps_high_cov (df): variants with higher coverage than threshold + - snps_no_cov (df): variants that span exon boundaries (i.e SVs) + """ + print("Calculating coverage of given SNPs") - plt.xlabel("") - plt.ylabel("% coverage ({})".format(threshold), fontsize=11) + bedFile = raw_coverage[ + ["chrom", "exon_start", "exon_end"]].drop_duplicates() + coverageFile = raw_coverage[ + ["chrom", "cov_start", "cov_end", "cov"]].drop_duplicates() - axs.yaxis.grid(linewidth=0.5, color="grey", linestyle="-.") - plt.box(False) - axs.set_axisbelow(True) - plt.tight_layout() + # turn dfs into BedTools objects + bed = bedtools.BedTool.from_dataframe(bedFile) + cov = bedtools.BedTool.from_dataframe(coverageFile) - # convert image to html string to insert in report - buffer = BytesIO() - plt.savefig(buffer, format='png') - buffer.seek(0) - image_png = buffer.getvalue() - buffer.close() - graphic = base64.b64encode(image_png) - data_uri = graphic.decode('utf-8') - summary_plot = "".format( - data_uri - ) + # empty df to add all SNP info to + snp_df = pd.DataFrame(columns=[ + 'VCF', 'chrom', 'pos', 'id', 'ref', 'alt', 'info' + ]) - return summary_plot + for vcf in snp_vcfs: + # read vcf into BedTools object + v = bedtools.BedTool(vcf) + # get vcf name to add to table, req. for multiple VCFS and + # recording variant source VCF + name = Path(vcf).stem.split("_")[0] - def writeSummary(self, cov_summary, threshold, panel_pct_coverage): - """ - Write summary paragraph with sequencing details and list of - genes / transcripts used in panel. + # use bedtools intersect to get SNPs in capture region + snps = bed.intersect(v, wb=True) - Args: - - cov_summary (df): df of gene coverage values - - threshold (int): defined threshold level (default: 20) - - panel_pct_coverage (str): % coverage of panel as str - Returns: - - summary_text (str): summary text with req. HTML markup - """ - threshold = str(threshold) + "x" + for row in snps: + # get data from returned BedTools object, add to df + snp_data = str(row).split() + snp_df = snp_df.append({ + 'VCF': name, 'chrom': snp_data[3], + 'pos': snp_data[4], 'ref': snp_data[6], + 'alt': snp_data[7], 'info': snp_data[10] + }, ignore_index=True) - pct_cov = str(math.floor(float(panel_pct_coverage))) + snp_df = snp_df[ + ['VCF', 'chrom', 'pos', 'ref', 'alt', 'info']].drop_duplicates() - # summary text paragraph with div styling - summary_text = """ -
  • Clinical report summary:
  • -
    -
    """ + # reset index + raw_coverage = raw_coverage.reset_index(drop=True) - for i, gene in cov_summary.iterrows(): - # build string of each gene, trascript and coverage at - # threshold to display in summary - summary = "{} ({}); ".format(gene["gene"], gene["tx"]) - summary_text += summary + # use pandasql to intersect SNPs against coverage df to find the + # coverage at each SNP position + coverage_sql = """ + SELECT snp_df.VCF, snp_df.chrom, snp_df.pos, snp_df.ref, + snp_df.alt, snp_df.info, raw_coverage.gene, raw_coverage.exon, + raw_coverage.cov_start, raw_coverage.cov_end, raw_coverage.cov + FROM snp_df + LEFT JOIN raw_coverage on snp_df.CHROM=raw_coverage.chrom + WHERE snp_df.POS > raw_coverage.cov_start AND + snp_df.POS <= raw_coverage.cov_end + """ - summary_text = summary_text.strip(" ;") + "." - summary_text += """ -

    {} % of this panel was sequenced to a depth of {} or - greater.
    """.format(pct_cov, threshold) + snp_cov = pdsql.sqldf(coverage_sql, locals()) - # add closing div and copy button for summary text - summary_text += """
    -
    """ + # get SNPs that won't have coverage data but do intersect panel + # regions (i.e. large deletions that span a region) + snps_no_cov = snp_df.merge(snp_cov, how='outer', indicator=True).loc[ + lambda x: x['_merge'] == 'left_only'] - return summary_text + snps_no_cov = snps_no_cov[[ + "VCF", "chrom", "pos", "ref", "alt", "info" + ]].reset_index(drop=True) + # get required columns for SNP tables + snps_cov = snp_cov[ + ["VCF", "gene", "exon", "chrom", "pos", "ref", "alt", "cov"] + ].drop_duplicates(subset=[ + "VCF", "chrom", "pos", "ref", "alt"]).reset_index(drop=True) - def generate_report(self, cov_stats, cov_summary, snps_low_cov, - snps_high_cov, snps_no_cov, fig, all_plots, - summary_plot, html_template, args, build, panel, vcfs, - panel_pct_coverage, bootstrap, version, summary_text - ): + # rename columns for displaying in report + snps_cov.columns = ["VCF", "Gene", "Exon", "Chromosome", "Position", + "Ref", "Alt", "Coverage"] + + snps_no_cov.columns = [ + "VCF", "Chromosome", "Position", "Ref", "Alt", "Info" + ] + + # remove <> from DELs to stop being interpreted as HTML tags + snps_no_cov["Alt"] = snps_no_cov["Alt"].str.strip("<>") + + snps_cov["Coverage"] = snps_cov["Coverage"].astype(int) + + # sort no_cov table by chrom & pos, as pos is str first define + # order to sort by + order = [str(x) for x in range(0, 23)] + order.extend(["X", "Y", "MT"]) + snps_no_cov["Chromosome"] = pd.Categorical( + snps_no_cov["Chromosome"], order + ) + + snps_cov = snps_cov.sort_values(by=["Gene", "Exon", "Position"]) + snps_no_cov = snps_no_cov.sort_values(by=["Chromosome", "Position"]) + + # split SNPs by coverage against threshold + snps_low_cov = snps_cov.loc[snps_cov["Coverage"] < threshold] + snps_high_cov = snps_cov.loc[snps_cov["Coverage"] >= threshold] + + return snps_low_cov, snps_high_cov, snps_no_cov + + + def low_coverage_regions(self, cov_stats, raw_coverage, threshold): """ - Generate single sample report from coverage stats + Get regions where coverage at given threshold is <100% Args: - cov_stats (df): df of coverage stats for each exon - - cov_summary (df): df of gene level coverage - - snps_low_cov (df): SNPs with lower coverage than threshold - - snps_high_cov (df): SNPs with higher coverage than threshold - - snps_no_cov (df): variants that span exon boundaries (i.e SVs) - - fig (figure): plots of low coverage regions - - all-plots (figure): grid of all full gene- exon plots - - summary_plot (figure): gene summary plot - % at threshold - - html_template (str): string of HTML template - - args (args): passed cmd line arguments - - build (str): build number used for alignment - - panel (str): panes(s) / gene(s) included in report - - vcfs (str): vcfs(s) passed for SNP analysis - - panel_pct_coverage (str): total % coverage of panel - - bootstrap (str): bootstrap to store directly in html - - version (str): version of Athena, used to add to report - - Returns: None + - raw_coverage (df): raw bp coverage for each exon + - threshold (int): defined threshold level (default: 20) - Outputs: - - coverage_report.html (file): HTML coverage report + Returns: + - low_raw_cov (df): df of raw bp values for each region with + coverage less than 100% at threshold """ - print("Generating report") - - # str of threshold for selecting df columns etc. - threshold = str(args.threshold) + "x" + # threshold column to check at + threshold = str(threshold) + "x" # get threshold columns and add to column names threshold_cols = list(cov_stats.filter(regex='[0-9]+x', axis=1)) column = [ - "gene", "tx", "chrom", "exon", "exon_len", "exon_start", - "exon_end", "min", "mean", "max" + "gene", "tx", "chrom", "exon", "exon_start", "exon_end", + "min", "mean", "max" ] column.extend(threshold_cols) - sub_threshold = pd.DataFrame(columns=column) + # empty df + low_stats = pd.DataFrame(columns=column) - # get all exons with <100% coverage at threshold + # get all exons with <100% coverage at given threshold for i, row in cov_stats.iterrows(): if int(row[threshold]) < 100: - sub_threshold = sub_threshold.append(row, ignore_index=True) + low_stats = low_stats.append(row, ignore_index=True) # pandas is terrible and forces floats, change back to int dtypes = { 'chrom': str, 'exon': int, - 'exon_len': int, 'exon_start': int, 'exon_end': int, 'min': int, 'max': int } - vals = ["min", "mean", "max"] - vals.extend(threshold_cols) - - if not sub_threshold.empty: - # some low covered regions identified - sub_threshold = sub_threshold.astype(dtypes) - - sub_threshold_stats = pd.pivot_table(sub_threshold, index=[ - "gene", "tx", "chrom", "exon", - "exon_len", "exon_start", "exon_end" - ], values=vals) - - # reset index to fix formatting - sub_threshold_stats = sub_threshold_stats.reindex(vals, axis=1) - sub_threshold_stats.reset_index(inplace=True) - - gene_issues = len(list(set(sub_threshold_stats["gene"].tolist()))) - exon_issues = len(sub_threshold_stats["exon"]) - else: - # if no low regions set to empty df with appropriate columns - print("No low coverage regions, generating empty table") - sub_threshold_stats = pd.DataFrame(columns=column) - gene_issues = 0 - exon_issues = 0 - - - # do some excel level formatting to make table more readable - total_stats = pd.pivot_table( - cov_stats, - index=["gene", "tx", "chrom", "exon", "exon_len", - "exon_start", "exon_end"], - values=vals - ) - - # reset index to fix formatting - total_stats = total_stats.reindex(vals, axis=1) - total_stats.reset_index(inplace=True) - - all_dfs = [ - total_stats, cov_summary, sub_threshold_stats, - snps_low_cov, snps_high_cov - ] - - # set index to start at 1 to be more readable - for df in all_dfs: - if not df.empty: - df.index = np.arange(1, len(df) + 1) - - # rename columns for displaying in report - cov_summary = cov_summary.drop(columns=["exon"]) - cov_summary = cov_summary.rename(columns={ - "gene": "Gene", - "tx": "Transcript", - "min": "Min", - "mean": "Mean", - "max": "Max" - }) - - total_stats = total_stats.rename(columns={ - "gene": "Gene", - "tx": "Transcript", - "chrom": "Chr", - "exon": "Exon", - "exon_len": "Length", - "exon_start": "Start", - "exon_end": "End", - "min": "Min", - "mean": "Mean", - "max": "Max" - }) - - # rename columns to display properly - sub_threshold_stats = sub_threshold_stats.rename(columns={ - "gene": "Gene", - "tx": "Transcript", - "chrom": "Chr", - "exon": "Exon", - "exon_len": "Length", - "exon_start": "Start", - "exon_end": "End", - "min": "Min", - "mean": "Mean", - "max": "Max" - }) - - # limit to 2dp using math.floor, use of round() with - # 2dp may lead to inaccuracy such as 99.99 => 100.00 - round_cols = ['Mean'] + threshold_cols - - for col in round_cols: - cov_summary[col] = cov_summary[col].map( - lambda col: math.floor(col * 100) / 100 - ) - - total_stats[col] = total_stats[col].map( - lambda col: math.floor(col * 100) / 100 - ) - - # get values to display in report - total_genes = len(cov_summary["Gene"]) - fully_covered_genes = total_genes - gene_issues - - # empty dict to add values for displaying in report text - report_vals = {} - - report_vals["summary_text"] = summary_text - report_vals["name"] = str(args.sample_name).replace("_", " ") - report_vals["total_genes"] = str(total_genes) - report_vals["fully_covered_genes"] = str(fully_covered_genes) - report_vals["gene_issues"] = str(gene_issues) - report_vals["threshold"] = threshold - report_vals["exon_issues"] = str(exon_issues) - report_vals["build"] = build - report_vals["panel"] = panel - report_vals["vcfs"] = vcfs - report_vals["version"] = version - report_vals["panel_pct_coverage"] = panel_pct_coverage - - # creat slices of sub_threshold stats df to add styling to - slice_ranges = { - "x0": (10, 0), "x10": (30, 10), "x30": (50, 30), "x50": (70, 50), - "x70": (90, 70), "x90": (95, 90), "x95": (99, 95), "x99": (101, 99) - } - - sub_slice = {} - - for key, val in slice_ranges.items(): - sub_slice[key] = pd.IndexSlice[sub_threshold_stats.loc[( - sub_threshold_stats[threshold] < val[0] - ) & ( - sub_threshold_stats[threshold] >= val[1])].index, threshold] - - # df column index of threshold - col_idx = sub_threshold_stats.columns.get_loc(threshold) - - # make dict for rounding coverage columns to 2dp - rnd = {} - for col in list(sub_threshold_stats.columns[10:15]): - rnd[col] = '{0:.2f}%' - - # set threshold column widths as a fraction of 30% table width - t_width = str(30 / len(threshold_cols)) + "%" - - # apply colours to coverage cell based on value, 0 is given solid red - s = sub_threshold_stats.style.apply(lambda x: [ - "background-color: #b30000" if x[threshold] == 0 and idx == col_idx - else "" for idx, v in enumerate(x) - ], axis=1)\ - .bar(subset=sub_slice["x0"], color='#b30000', vmin=0, vmax=100)\ - .bar(subset=sub_slice["x10"], color='#990000', vmin=0, vmax=100)\ - .bar(subset=sub_slice["x30"], color='#C82538', vmin=0, vmax=100)\ - .bar(subset=sub_slice["x50"], color='#FF4500', vmin=0, vmax=100)\ - .bar(subset=sub_slice["x70"], color='#FF4500', vmin=0, vmax=100)\ - .bar(subset=sub_slice["x90"], color='#FF4500', vmin=0, vmax=100)\ - .bar(subset=sub_slice["x95"], color='#FFBF00', vmin=0, vmax=100)\ - .bar(subset=sub_slice["x99"], color='#007600', vmin=0, vmax=100)\ - .format(rnd)\ - .set_table_attributes('table border="1"\ - class="dataframe table table-hover table-bordered"')\ - .set_uuid("low_exon_table")\ - .set_properties(**{'font-size': '0.85vw', 'table-layout': 'auto'})\ - .set_properties(subset=threshold_cols, **{'width': t_width})\ - - sub_threshold_stats["Mean"] = sub_threshold_stats["Mean"].apply( - lambda x: int(x) - ) + low_stats = low_stats.astype(dtypes) - # CSS table class for styling tables - style = ( - '', - '
    ' - ) + # get list of tuples of genes and exons with low coverage to + # select out raw coverage + low_exon_list = low_stats.reset_index()[['gene', + 'exon']].values.tolist() + low_exon_list = [tuple(exon) for exon in low_exon_list] - # generate HTML strings from table objects to write to report - gene_stats = cov_summary.to_html(justify='left').replace( - style[0], style[1] - ) - total_stats = total_stats.to_html(justify='left').replace( - style[0], style[1] - ) + # get raw coverage for low coverage regions to plot + low_raw_cov = raw_coverage[raw_coverage[['gene', 'exon']].apply( + tuple, axis=1).isin(low_exon_list)].reset_index() - sub_threshold_stats = s.render() + return low_raw_cov - # get snps values and format dfs to display - if not snps_low_cov.empty: - # format low coverage SNPs table - snps_not_covered = len(snps_low_cov.index) - snps_low_cov = snps_low_cov.style\ - .set_table_attributes( - 'class="dataframe table table-striped"')\ - .set_uuid("var_low_cov")\ - .set_properties(**{ - 'font-size': '0.80vw', 'table-layout': 'auto' - })\ - .set_properties(subset=["VCF", "Gene"], **{'width': '10%'})\ - .set_properties(subset=["Exon"], **{'width': '7.5%'})\ - .set_properties(subset=["Chromosome"], **{'width': '10%'})\ - .set_properties(subset=["Position"], **{'width': '12.5%'})\ - .set_properties(subset=["Ref"], **{'width': '20%'})\ - .set_properties(subset=["Alt"], **{'width': '20%'})\ - .set_properties(subset=["Coverage"], **{'width': '10%'}) - snps_low_cov = snps_low_cov.render() - else: - snps_low_cov = "No low covered SNPs" - snps_not_covered = 0 + def write_summary(self, cov_summary, threshold, panel_pct_coverage): + """ + Write summary paragraph with sequencing details and list of + genes / transcripts used in panel. - if not snps_high_cov.empty: - # format high coverage SNPs table - snps_covered = len(snps_high_cov.index) + Args: + - cov_summary (df): df of gene coverage values + - threshold (int): defined threshold level (default: 20) + - panel_pct_coverage (str): % coverage of panel as str + Returns: + - summary_text (str): summary text with req. HTML markup + """ + threshold = str(threshold) + "x" - snps_high_cov = snps_high_cov.style\ - .set_table_attributes( - 'class="dataframe table table-striped"')\ - .set_uuid("var_high_cov")\ - .set_properties(**{ - 'font-size': '0.80vw', 'table-layout': 'auto' - })\ - .set_properties(subset=["VCF", "Gene"], **{'width': '10%'})\ - .set_properties(subset=["Exon"], **{'width': '7.5%'})\ - .set_properties(subset=["Chromosome"], **{'width': '10%'})\ - .set_properties(subset=["Position"], **{'width': '12.5%'})\ - .set_properties(subset=["Ref"], **{'width': '20%'})\ - .set_properties(subset=["Alt"], **{'width': '20%'})\ - .set_properties(subset=["Coverage"], **{'width': '10%'}) + pct_cov = str(math.floor(float(panel_pct_coverage))) - snps_high_cov = snps_high_cov.render() - else: - snps_high_cov = "No covered SNPs" - snps_covered = 0 + # summary text paragraph with div styling + summary_text = """ +
  • Clinical report summary:
  • +
    +
    """ - # if variants from vcf found that span exon boundaries - if not snps_no_cov.empty: - # manually add div and styling around rendered table, allows - # to be fully absent from the report if the table is empty - snps_no_cov.index = np.arange(1, len(snps_no_cov) + 1) + for i, gene in cov_summary.iterrows(): + # build string of each gene, trascript and coverage at + # threshold to display in summary + summary = "{} ({}); ".format(gene["gene"], gene["tx"]) + summary_text += summary - # get number of variants to display in report - snps_out_panel = len(snps_no_cov.index) + summary_text = summary_text.strip(" ;") + "." + summary_text += """ +

    {} % of this panel was sequenced to a depth of {} or + greater.
    """.format(pct_cov, threshold) - html_string = snps_no_cov.style\ - .set_table_attributes( - 'class="dataframe table table-striped"')\ - .set_uuid("var_no_cov")\ - .set_properties(**{ - 'font-size': '0.80vw', 'table-layout': 'auto' - })\ - .set_properties(subset=["VCF"], **{ - 'width': '7.5%' - })\ - .set_properties(subset=[ - "Chromosome", "Position", "Ref", "Alt" - ], **{'width': '10%'}) + # add closing div and copy button for summary text + summary_text += """
    +
    """ - html_string = html_string.render() + return summary_text - snps_no_cov = """ -
    Variants included in the first table below either fully\ - or partially span panel region(s). These are most likely\ - large structural variants and as such do not have\ - coverage data available. See the "info" column for details\ - on the variant. -
    -
    Table of variants spanning panel regions(s)   - -
    -
    - {} -
    -
    - """.format(html_string) - else: - snps_no_cov = "" - snps_out_panel = 0 - snps_pct_out_panel = 0 + def calculate_snp_vals( + self, snps_covered, snps_not_covered, snps_out_panel): + """ + """ total_snps = str(snps_covered + snps_not_covered + snps_out_panel) # calculate % SNPs covered vs. not, limit to 2dp with math.floor @@ -1331,7 +1273,99 @@ def generate_report(self, cov_stats, cov_summary, snps_low_cov, snps_pct_out_panel = int( snps_out_panel) / int(total_snps) * 100 snps_pct_out_panel = math.floor(snps_pct_out_panel * 100) / 100 + else: + snps_pct_out_panel = 0 + + return total_snps, snps_pct_covered, snps_pct_not_covered,\ + snps_pct_out_panel + + + def generate_report(self, cov_stats, cov_summary, snps_low_cov, + snps_high_cov, snps_no_cov, fig, all_plots, + summary_plot, html_template, args, build, panel, vcfs, + panel_pct_coverage, bootstrap, version, summary_text + ): + """ + Generate single sample report from coverage stats + + Args: + - cov_stats (df): df of coverage stats for each exon + - cov_summary (df): df of gene level coverage + - snps_low_cov (df): SNPs with lower coverage than threshold + - snps_high_cov (df): SNPs with higher coverage than threshold + - snps_no_cov (df): variants that span exon boundaries (i.e SVs) + - fig (figure): plots of low coverage regions + - all-plots (figure): grid of all full gene- exon plots + - summary_plot (figure): gene summary plot - % at threshold + - html_template (str): string of HTML template + - args (args): passed cmd line arguments + - build (str): build number used for alignment + - panel (str): panes(s) / gene(s) included in report + - vcfs (str): vcfs(s) passed for SNP analysis + - panel_pct_coverage (str): total % coverage of panel + - bootstrap (str): bootstrap to store directly in html + - version (str): version of Athena, used to add to report + + Returns: None + + Outputs: + - coverage_report.html (file): HTML coverage report + """ + print("Generating report") + styling = styleTables() + + # format threshold val & select threshold columns + threshold = str(args.threshold) + "x" + threshold_cols = list(cov_stats.filter(regex='[0-9]+x', axis=1)) + vals = ["min", "mean", "max"] + vals.extend(threshold_cols) + # apply styling to tables for displaying in report + sub_threshold_stats, gene_issues, exon_issues = styling.style_sub_threshold( + cov_stats, threshold, threshold_cols, vals + ) + + total_stats = styling.style_total_stats( + cov_stats, threshold_cols, vals + ) + + gene_stats, total_genes = styling.style_cov_summary( + cov_summary, threshold_cols + ) + + snps_low_cov, snps_not_covered = styling.style_snps_low_cov( + snps_low_cov + ) + + snps_high_cov, snps_covered = styling.style_snps_high_cov( + snps_high_cov + ) + + snps_no_cov, snps_out_panel = styling.style_snps_no_cov(snps_no_cov) + + # get values to display in report + fully_covered_genes = total_genes - gene_issues + + total_snps, snps_pct_covered, snps_pct_not_covered,\ + snps_pct_out_panel = self.calculate_snp_vals( + snps_covered, snps_not_covered, snps_out_panel + ) + + # empty dict to add values for displaying in report text + report_vals = {} + + report_vals["summary_text"] = summary_text + report_vals["name"] = str(args.sample_name).replace("_", " ") + report_vals["total_genes"] = str(total_genes) + report_vals["fully_covered_genes"] = str(fully_covered_genes) + report_vals["gene_issues"] = str(gene_issues) + report_vals["threshold"] = threshold + report_vals["exon_issues"] = str(exon_issues) + report_vals["build"] = build + report_vals["panel"] = panel + report_vals["vcfs"] = vcfs + report_vals["version"] = version + report_vals["panel_pct_coverage"] = panel_pct_coverage report_vals["total_snps"] = total_snps report_vals["snps_covered"] = str(snps_covered) report_vals["snps_not_covered"] = str(snps_not_covered) @@ -1357,102 +1391,176 @@ def generate_report(self, cov_stats, cov_summary, snps_low_cov, file.close() - def parse_args(self): + def build_report(self, html_template, total_stats, gene_stats, + sub_threshold_stats, snps_low_cov, snps_high_cov, + snps_no_cov, fig, all_plots, summary_plot, report_vals, + bootstrap + ): """ - Parse cmd line arguments - - Args: None + Build report from template and variables to write to file + Args: + - html_template (str): string of HTML template file + - total_stats (df): total stats table of all genes & exons + - gene_stats (df): stats table of whole gene + - sub_threshold_stats (df): table of exons with < threshold + - snps_low_cov (df): table of snps with cov < threshold + - snsp_high_cov (df): table of snps with cov > threshold + - snps_no_cov (df): variants that span exon boundaries (i.e SVs) + - fig (figure): grid of low coverage exon plots (plotly) + - all-plots (figure): grid of all full gene- exon plots + - summary_plot (figure): gene summary plot - % at threshold + - report_vals (dict): values to display in report text Returns: - - args (arguments): args passed from cmd line + - single_report (str): HTML string of filled report """ + # convert logo image into string to pass in to template + logo = str(os.path.join(os.path.dirname( + os.path.abspath(__file__)), "../data/static/images/logo.png" + )) - parser = argparse.ArgumentParser( - description='Generate coverage report for a single sample.' - ) - parser.add_argument( - '-e', '--exon_stats', - help='exon stats file (from coverage_stats_single.py)', - type=argparse.FileType('r'), required=True - ) - parser.add_argument( - '-g', '--gene_stats', - help='gene stats file (from coverage_stats_single.py)', - required=True - ) - parser.add_argument( - '-r', '--raw_coverage', - help='raw coverage bed file used to generate stats', - required=True - ) - parser.add_argument( - '-s', '--snps', nargs='*', - help='Optional; check coverage of VCF(s) of SNPs.', - required=False - ) - parser.add_argument( - '-t', '--threshold', nargs='?', - default=20, type=int, - help="threshold to define low coverage (int), if not\ - given 20 will be used as default. Must be one of\ - the thresholds in the input file.", - required=False - ) - parser.add_argument( - '-n', '--sample_name', nargs='?', - help="Name of sample to display in report, if not\ - specified this will be the prefix of the\ - gene_stats input file.", - required=False - ) - parser.add_argument( - '-o', '--output', nargs='?', - help='Output report name, if not specified the sample\ - name from the report will be used.', - required=False - ) - parser.add_argument( - '-p', '--panel', nargs='?', - help='(Optional) Panel bed file used from annotation, if passed\ - name of file will be displayed in report to show what\ - panel(s) / gene(s) were included.', - required=False - ) - parser.add_argument( - '-l', '--limit', nargs='?', - help="Number of genes at which to limit including full gene plots,\ - large numbers of genes takes a long time to generate the plots.", - default=-1, - required=False - ) - parser.add_argument( - '-m', '--summary', - help="If passed, a short paragraph will be included in the\ - summary section. This includes details on the sequencing and the\ - genes/transcripts used in the panel.", - default=False, action='store_true' - ) - parser.add_argument( - '--cores', nargs='?', default=None, - help='Number of cores to utilise, for larger numbers of genes this\ - will drastically reduce run time. If not given will use maximum\ - available' + data_uri = base64.b64encode(open(logo, 'rb').read()).decode('utf-8') + logo = ''.format( + data_uri) + + t = Template(html_template) + + date = datetime.today().strftime('%Y-%m-%d') + + single_report = t.safe_substitute( + bootstrap=bootstrap, + logo=logo, + total_genes=report_vals["total_genes"], + threshold=report_vals["threshold"], + summary_text=report_vals["summary_text"], + exon_issues=report_vals["exon_issues"], + gene_issues=report_vals["gene_issues"], + fully_covered_genes=report_vals["fully_covered_genes"], + name=report_vals["name"], + sub_threshold_stats=sub_threshold_stats, + low_cov_plots=fig, + all_plots=all_plots, + summary_plot=summary_plot, + gene_stats=gene_stats, + total_stats=total_stats, + snps_high_cov=snps_high_cov, + snps_low_cov=snps_low_cov, + snps_no_cov=snps_no_cov, + total_snps=report_vals["total_snps"], + snps_covered=report_vals["snps_covered"], + snps_pct_covered=report_vals["snps_pct_covered"], + snps_not_covered=report_vals["snps_not_covered"], + snps_pct_not_covered=report_vals["snps_pct_not_covered"], + snps_out_panel=report_vals["snps_out_panel"], + snps_pct_out_panel=report_vals["snps_pct_out_panel"], + date=date, + build=report_vals["build"], + vcfs=report_vals["vcfs"], + panel=report_vals["panel"], + panel_pct_coverage=report_vals["panel_pct_coverage"], + version=report_vals["version"] ) - args = parser.parse_args() + return single_report + + +def parse_args(): + """ + Parse cmd line arguments + + Args: None + + Returns: + - args (arguments): args passed from cmd line + """ + + parser = argparse.ArgumentParser( + description='Generate coverage report for a single sample.' + ) + parser.add_argument( + '-e', '--exon_stats', + help='exon stats file (from coverage_stats_single.py)', + type=argparse.FileType('r'), required=True + ) + parser.add_argument( + '-g', '--gene_stats', + help='gene stats file (from coverage_stats_single.py)', + required=True + ) + parser.add_argument( + '-r', '--raw_coverage', + help='raw coverage bed file used to generate stats', + required=True + ) + parser.add_argument( + '-s', '--snps', nargs='*', + help='Optional; check coverage of VCF(s) of SNPs.', + required=False + ) + parser.add_argument( + '-t', '--threshold', nargs='?', + default=20, type=int, + help="threshold to define low coverage (int), if not\ + given 20 will be used as default. Must be one of\ + the thresholds in the input file.", + required=False + ) + parser.add_argument( + '-n', '--sample_name', nargs='?', + help="Name of sample to display in report, if not\ + specified this will be the prefix of the\ + gene_stats input file.", + required=False + ) + parser.add_argument( + '-o', '--output', nargs='?', + help='Output report name, if not specified the sample\ + name from the report will be used.', + required=False + ) + parser.add_argument( + '-p', '--panel', nargs='?', + help='(Optional) Panel bed file used from annotation, if passed\ + name of file will be displayed in report to show what\ + panel(s) / gene(s) were included.', + required=False + ) + parser.add_argument( + '-l', '--limit', nargs='?', + help="Number of genes at which to limit including full gene plots,\ + large numbers of genes takes a long time to generate the plots.", + default=-1, + required=False + ) + parser.add_argument( + '-m', '--summary', + help="If passed, a short paragraph will be included in the\ + summary section. This includes details on the sequencing and the\ + genes/transcripts used in the panel.", + default=False, action='store_true' + ) + parser.add_argument( + '--cores', nargs='?', default=None, + help='Number of cores to utilise, for larger numbers of genes this\ + will drastically reduce run time. If not given will use maximum\ + available' + ) + + args = parser.parse_args() - if not args.sample_name: - # sample name not given, use input file name - args.sample_name = Path(args.gene_stats).stem - if "_" in args.sample_name: - # if named X1000_ take prefix - args.sample_name = args.sample_name.split("_", 1)[0] + if not args.sample_name: + # sample name not given, use input file name + args.sample_name = Path(args.gene_stats).stem + if "_" in args.sample_name: + # if named X1000_ take prefix + args.sample_name = args.sample_name.split("_", 1)[0] - if not args.output: - # output file name not given, using sample name - args.output = args.sample_name + "_coverage_report.html" + if not args.output: + # output file name not given, using sample name + args.output = args.sample_name + "_coverage_report.html" - return args + return args def main(): @@ -1460,8 +1568,9 @@ def main(): Main function to generate coverage report """ report = singleReport() + plots = generatePlots() - args = report.parse_args() + args = parse_args() # read in files cov_stats, cov_summary, raw_coverage, html_template, build, panel,\ @@ -1503,7 +1612,7 @@ def main(): panel_pct_coverage = report.panel_coverage(cov_stats, args.threshold) # generate summary plot - summary_plot = report.summary_gene_plot( + summary_plot = plots.summary_gene_plot( cov_summary, args.threshold ) @@ -1513,7 +1622,7 @@ def main(): ) # generate plot of sub optimal regions - fig = report.low_exon_plot(low_raw_cov, args.threshold) + fig = plots.low_exon_plot(low_raw_cov, args.threshold) if len(cov_summary.index) < int(args.limit) or int(args.limit) == -1: # generate plots of each full gene @@ -1543,7 +1652,7 @@ def main(): all_plots = ''.join( pool.starmap( - report.all_gene_plots, map( + plots.all_gene_plots, map( lambda e: (e, args.threshold), split_dfs ) ) @@ -1554,7 +1663,7 @@ def main(): if args.summary: # summary text to be included - summary_text = report.writeSummary( + summary_text = report.write_summary( cov_summary, args.threshold, panel_pct_coverage ) else: From 6f28499c33a9470dd9d235631012f6853e893fc7 Mon Sep 17 00:00:00 2001 From: jethror1 Date: Fri, 4 Dec 2020 11:38:34 +0000 Subject: [PATCH 3/4] add docstrings --- bin/coverage_report_single.py | 61 +++++++++++++++++++++++++++++------ 1 file changed, 52 insertions(+), 9 deletions(-) diff --git a/bin/coverage_report_single.py b/bin/coverage_report_single.py index 2984eb9a..a6431e56 100644 --- a/bin/coverage_report_single.py +++ b/bin/coverage_report_single.py @@ -458,8 +458,7 @@ class styleTables(): """Functions for styling tables for displaying in report""" def style_sub_threshold( - self, cov_stats, threshold, threshold_cols, vals - ): + self, cov_stats, threshold, threshold_cols, vals): """ Styling of sub threshold stats df for displaying in report @@ -468,7 +467,10 @@ def style_sub_threshold( - threshold (str): low coverage threshold value - threshold_cols (list): threshold values for coverage Returns: - - sub_threshold_stats (): + - sub_threshold_stats (str): HTML formatted str of cov stats + table + - gene_issues (int): total number of genes under threshold + - exon_issues (int): total numbner of exons under threshold """ column = [ "gene", "tx", "chrom", "exon", "exon_len", "exon_start", @@ -594,7 +596,9 @@ def style_total_stats(self, cov_stats, threshold_cols, vals): """ Styling of full gene-exon stats table for displaying in report Args: - - + - cov_stats (df): df of exon stats + - threshold_cols (list): list of threshold columns + - vals (list): list of min, mean and max strs Returns: - """ @@ -647,6 +651,13 @@ def style_total_stats(self, cov_stats, threshold_cols, vals): def style_cov_summary(self, cov_summary, threshold_cols): """ + Add styling to per gene coverage summary table + Args: + - cov_summary (df): df of gene coverage stats + - threshold_cols (list): list of threshold values + Returns: + - gene_stats (str): HTML formatted str of gene summary df + - total_genes (int): total number of genes """ # rename columns for displaying in report cov_summary = cov_summary.drop(columns=["exon"]) @@ -689,6 +700,13 @@ def style_cov_summary(self, cov_summary, threshold_cols): def style_snps_low_cov(self, snps_low_cov): """ + Add styling to table of snps under coverage threshold + Args: + - snps_low_cov (df): df of snps under covegrage threshold + Returns: + - snps_low_cov (str): HTML formatted str of low covered snps + - snps_not_covered (int): total number snps not covered at + threshold """ # get snps values and format dfs to display if not snps_low_cov.empty: @@ -720,6 +738,12 @@ def style_snps_low_cov(self, snps_low_cov): def style_snps_high_cov(self, snps_high_cov): """ + Add styling to table of SNPs covered above threshold + Args: + - snps_high_cov (df): df of snps covered above threshold + Returns: + - snps_high_cov (str): HTML formatted str of covered snps + - snps_covered (int): total number of snps covered """ if not snps_high_cov.empty: @@ -753,6 +777,13 @@ def style_snps_high_cov(self, snps_high_cov): def style_snps_no_cov(self, snps_no_cov): """ + Add styling to table of snps that span exon boundaries => have + coverage values + Args: + - snps_no_cov (df): df of snps with no coverage values + Returns: + - snps_no_cov (str): HTML formatted str of snps with no cov + - snps_out_panel (int): total number snps with no cov """ # if variants from vcf found that span exon boundaries if not snps_no_cov.empty: @@ -1252,6 +1283,18 @@ def write_summary(self, cov_summary, threshold, panel_pct_coverage): def calculate_snp_vals( self, snps_covered, snps_not_covered, snps_out_panel): """ + Calculate % values for SNP totals + Args: + - snps_covered (int): total number snps covered at threshold + - snps_not_covered (int): total number snps not covered at + threshold + - snps_out_panel (int): total number snps spanning exon + boundaries + Returns: + - total_snps (int): sum of all snps + - snps_pct_covered (float): % value of snps_covered + - snps_pct_not_covered (float): % value of snps_not_covered + - snps_pct_out_panel (float): % value of snps_out_panel """ total_snps = str(snps_covered + snps_not_covered + snps_out_panel) @@ -1321,9 +1364,10 @@ def generate_report(self, cov_stats, cov_summary, snps_low_cov, vals.extend(threshold_cols) # apply styling to tables for displaying in report - sub_threshold_stats, gene_issues, exon_issues = styling.style_sub_threshold( - cov_stats, threshold, threshold_cols, vals - ) + sub_threshold_stats, gene_issues,\ + exon_issues = styling.style_sub_threshold( + cov_stats, threshold, threshold_cols, vals + ) total_stats = styling.style_total_stats( cov_stats, threshold_cols, vals @@ -1351,9 +1395,8 @@ def generate_report(self, cov_stats, cov_summary, snps_low_cov, snps_covered, snps_not_covered, snps_out_panel ) - # empty dict to add values for displaying in report text + # add values to dict to pass into report report_vals = {} - report_vals["summary_text"] = summary_text report_vals["name"] = str(args.sample_name).replace("_", " ") report_vals["total_genes"] = str(total_genes) From 52f474c780fd302104393dc1406b7223b922e8ac Mon Sep 17 00:00:00 2001 From: jethror1 Date: Fri, 4 Dec 2020 14:04:53 +0000 Subject: [PATCH 4/4] change width of coverage threshold columns --- bin/coverage_report_single.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/bin/coverage_report_single.py b/bin/coverage_report_single.py index a6431e56..46876142 100644 --- a/bin/coverage_report_single.py +++ b/bin/coverage_report_single.py @@ -31,7 +31,6 @@ from string import Template - class generatePlots(): """Functions to generate required plots""" @@ -557,11 +556,11 @@ def style_sub_threshold( # make dict for rounding coverage columns to 2dp rnd = {} - for col in list(sub_threshold_stats.columns[10:15]): + for col in list(sub_threshold_stats.columns[10:]): rnd[col] = '{0:.2f}%' - # set threshold column widths as a fraction of 30% table width - t_width = str(30 / len(threshold_cols)) + "%" + # set threshold column widths as a fraction of 40% table width + t_width = str(40 / len(threshold_cols)) + "%" # apply colours to coverage cell based on value, 0 is given solid red s = sub_threshold_stats.style.apply(lambda x: [