From 7a68c1f82ba7b209fd98914d904561a249d55c0e Mon Sep 17 00:00:00 2001
From: Jethro Rainford <rainford1995@gmail.com>
Date: Fri, 4 Dec 2020 07:56:01 +0000
Subject: [PATCH 1/4] change styling of low stats table

---
 bin/coverage_report_single.py | 60 ++++++++++++-----------------------
 1 file changed, 21 insertions(+), 39 deletions(-)

diff --git a/bin/coverage_report_single.py b/bin/coverage_report_single.py
index 6cb4c783..12910452 100644
--- a/bin/coverage_report_single.py
+++ b/bin/coverage_report_single.py
@@ -1152,37 +1152,19 @@ def generate_report(self, cov_stats, cov_summary, snps_low_cov,
         report_vals["version"] = version
         report_vals["panel_pct_coverage"] = panel_pct_coverage
 
-        # set ranges for colouring cells
-        x0 = pd.IndexSlice[sub_threshold_stats.loc[(
-            sub_threshold_stats[threshold] < 10
-        ) & (
-            sub_threshold_stats[threshold] > 0)].index, threshold]
-        x10 = pd.IndexSlice[sub_threshold_stats.loc[(
-            sub_threshold_stats[threshold] < 30
-        ) & (
-            sub_threshold_stats[threshold] >= 10)].index, threshold]
-        x30 = pd.IndexSlice[sub_threshold_stats.loc[(
-            sub_threshold_stats[threshold] < 50
-        ) & (
-            sub_threshold_stats[threshold] >= 30)].index, threshold]
-        x50 = pd.IndexSlice[sub_threshold_stats.loc[(
-            sub_threshold_stats[threshold] < 70
-        ) & (
-            sub_threshold_stats[threshold] >= 50)].index, threshold]
-        x70 = pd.IndexSlice[sub_threshold_stats.loc[(
-            sub_threshold_stats[threshold] < 90
-        ) & (
-            sub_threshold_stats[threshold] >= 70)].index, threshold]
-        x90 = pd.IndexSlice[sub_threshold_stats.loc[(
-            sub_threshold_stats[threshold] < 95
-        ) & (
-            sub_threshold_stats[threshold] >= 90)].index, threshold]
-        x95 = pd.IndexSlice[sub_threshold_stats.loc[(
-            sub_threshold_stats[threshold] < 99
-        ) & (
-            sub_threshold_stats[threshold] >= 95)].index, threshold]
-        x99 = pd.IndexSlice[sub_threshold_stats.loc[(
-            sub_threshold_stats[threshold] >= 99)].index, threshold]
+        # create slices of sub_threshold_stats df to add styling to
+        slice_ranges = {
+            "x0": (10, 0), "x10": (30, 10), "x30": (50, 30), "x50": (70, 50),
+            "x70": (90, 70), "x90": (95, 90), "x95": (99, 95), "x99": (101, 99)
+        }
+
+        sub_slice = {}
+
+        for key, val in slice_ranges.items():
+            sub_slice[key] = pd.IndexSlice[sub_threshold_stats.loc[(
+                sub_threshold_stats[threshold] < val[0]
+            ) & (
+                sub_threshold_stats[threshold] >= val[1])].index, threshold]
 
         # df column index of threshold
         col_idx = sub_threshold_stats.columns.get_loc(threshold)
@@ -1200,14 +1182,14 @@ def generate_report(self, cov_stats, cov_summary, snps_low_cov,
             "background-color: #b30000" if x[threshold] == 0 and idx == col_idx
             else "" for idx, v in enumerate(x)
         ], axis=1)\
-            .bar(subset=x0, color='#b30000', vmin=0, vmax=100)\
-            .bar(subset=x10, color='#990000', vmin=0, vmax=100)\
-            .bar(subset=x30, color='#C82538', vmin=0, vmax=100)\
-            .bar(subset=x50, color='#FF4500', vmin=0, vmax=100)\
-            .bar(subset=x70, color='#FF4500', vmin=0, vmax=100)\
-            .bar(subset=x90, color='#FF4500', vmin=0, vmax=100)\
-            .bar(subset=x95, color='#FFBF00', vmin=0, vmax=100)\
-            .bar(subset=x99, color='#007600', vmin=0, vmax=100)\
+            .bar(subset=sub_slice["x0"], color='#b30000', vmin=0, vmax=100)\
+            .bar(subset=sub_slice["x10"], color='#990000', vmin=0, vmax=100)\
+            .bar(subset=sub_slice["x30"], color='#C82538', vmin=0, vmax=100)\
+            .bar(subset=sub_slice["x50"], color='#FF4500', vmin=0, vmax=100)\
+            .bar(subset=sub_slice["x70"], color='#FF4500', vmin=0, vmax=100)\
+            .bar(subset=sub_slice["x90"], color='#FF4500', vmin=0, vmax=100)\
+            .bar(subset=sub_slice["x95"], color='#FFBF00', vmin=0, vmax=100)\
+            .bar(subset=sub_slice["x99"], color='#007600', vmin=0, vmax=100)\
             .format(rnd)\
             .set_table_attributes('table border="1"\
                 class="dataframe table table-hover table-bordered"')\

From 18a663d9dba74bd0055bc8bffce33b54553fcd00 Mon Sep 17 00:00:00 2001
From: jethror1 <rainford1995@gmail.com>
Date: Fri, 4 Dec 2020 11:08:51 +0000
Subject: [PATCH 2/4] refactor generating plots and table styling into
 separate classes and functions

---
 bin/coverage_report_single.py | 2331 +++++++++++++++++----------------
 1 file changed, 1220 insertions(+), 1111 deletions(-)

diff --git a/bin/coverage_report_single.py b/bin/coverage_report_single.py
index 12910452..2984eb9a 100644
--- a/bin/coverage_report_single.py
+++ b/bin/coverage_report_single.py
@@ -31,1286 +31,1228 @@
 from string import Template
 
 
-class singleReport():
 
-    def load_files(self, threshold, exon_stats,
-                   gene_stats, raw_coverage, snp_vcfs, panel):
+class generatePlots():
+    """Functions to generate required plots"""
+
+    def low_exon_plot(self, low_raw_cov, threshold):
         """
-        Load in raw coverage data, coverage stats file and template.
+        Plot bp coverage of each exon, used for those where coverage is
+        below the given threshold
 
         Args:
-            - threshold (int): threshold value passed from parse_args
-            - exon_stats (file): exon stats file (from args;
-                                generated by coverage_stats_single.py)
-            - gene_stats (file): gene stats file (from args;
-                                generated by coverage_stats_single.py)
-            - raw_coverage (file): from args; bp coverage file used as
-                                input for coverage_stats_single.py
-            - snp_vcfs (list): VCFs of SNPs passed from args
-            - panel (file): panel bed file used for annotation, used to
-                            display panel name in report if passed
+            - low_raw_cov (df): df of raw coverage for exons with low
+                                coverage
+            - threshold (int): defined threshold level (default: 20)
 
         Returns:
-            - cov_stats (df): df of coverage stats for each exon
-            - cov_summary (df): df of gene level coverage
-            - raw_coverage (df): raw bp coverage for each exon
-            - html_template (str): string of HTML report template
-            - flagstat (dict): flagstat metrics, from gene_stats header
-            - build (str): ref build used, from gene_stats header
-            - panel (str): panes(s) / gene(s) included in report
-            - vcfs (str): list of vcf names used for SNP analysis
-            - version (str): version of Athena, used to add to report
+            - fig (str): HTML string of plots of low coverage regions
         """
-        print("Reading in files")
+        print("Generating plots of low covered regions")
 
-        # read in single sample report template
-        bin_dir = os.path.dirname(os.path.abspath(__file__))
-        template_dir = os.path.join(bin_dir, "../data/templates/")
-        single_template = os.path.join(template_dir, "single_template.html")
+        # get list of tuples of genes and exons to define plots
+        genes = low_raw_cov.drop_duplicates(
+            ["gene", "exon"])[["gene", "exon"]].values.tolist()
+        genes = [tuple(exon) for exon in genes]
 
-        with open(single_template, 'r') as template:
-            html_template = template.read()
+        if len(genes) == 0:
+            # everything above threshold, don't generate plots
+            fig = "<br></br><b>All regions in panel above threshold, no plots\
+                to show.</b><br></br>"
 
-        try:
-            # attempt to get version tag from root dir name
-            # will only work if downloaded as zip / tar and not cloned
-            path = str(os.path.join(bin_dir, "../")).split("/")
-            version = [s for s in path if "athena" in s][0].split("-")[1]
-            version = "({})".format(version)
-        except Exception:
-            print("Error getting version from dir name, continuing.")
-            version = ""
-            pass
+            return fig
 
-        # read bootstrap into var to store in report html
-        bs = str(os.path.join(os.path.dirname(
-            os.path.abspath(__file__)), "../data/static/css/bootstrap.min.css"
-        ))
-        with open(bs) as bs:
-            bootstrap = bs.read()
+        # sort list of genes/exons by gene and exon
+        genes = sorted(genes, key=lambda element: (element[0], element[1]))
 
-        # read in exon stats file
-        with open(exon_stats.name) as exon_file:
-            dtypes = {
-                "chrom": str, "exon_start": int, "exon_end": int, "gene": str,
-                "tx": str, "exon": int, "min": int, "mean": float, "max": int,
-                r'[0-9]*x': float, "exon_len": int
-            }
+        plot_titles = [str(x[0]) + " exon: " + str(int(x[1])) for x in genes]
 
-            cov_stats = pd.read_csv(
-                exon_file, sep="\t", comment='#', dtype=dtypes
-            )
+        low_raw_cov["exon_len"] =\
+            low_raw_cov["exon_end"] - low_raw_cov["exon_start"]
 
-            # strip chr from chrom in cases of diff. formatted bed
-            cov_stats["chrom"] = cov_stats["chrom"].apply(
-                lambda x: str(x).replace("chr", "")
-            )
+        low_raw_cov["relative_position"] = low_raw_cov["exon_end"] - round(((
+            low_raw_cov["cov_end"] + low_raw_cov["cov_start"]) / 2
+        ))
 
-        # read in gene stats file
-        with open(gene_stats) as gene_file:
-            dtypes = {
-                "gene": str, "tx": str, "min": int,
-                "mean": float, "max": int, r'[0-9]*x': float
-            }
+        # set no. rows to no. of plots / no of columns to define grid
+        columns = 4
+        rows = math.ceil(len(genes) / 4)
 
-            cov_summary = pd.read_csv(
-                gene_file, sep="\t", comment='#', dtype=dtypes
-            )
+        # vertical spacing between plot rows, dependent on no. of rows
+        v_space = (1 / rows) * 0.25
 
-        flagstat = {}
-        # read in flagstat and build from header of gene stats file
-        with open(gene_stats) as gene_file:
-            for ln in gene_file:
-                if ln.startswith("#"):
-                    if "build" in ln:
-                        # get build number
-                        reference = ln.split(":")[1]
-                        # add build to string to display
-                        if "37" in reference:
-                            build = "<li>Reference build used for aligment<b>\
-                                {}</b></li>".format(reference)
-                        if "38" in build:
-                            build = "<li>Reference build used for aligment<b>\
-                                {}</b></li>".format(reference)
-                    else:
-                        # read in flagstat from header
-                        key = ln.split(":")[0].strip("#")
-                        val = ln.split(":")[1]
-                        flagstat[key] = val
+        # define grid to add plots to
+        fig = make_subplots(
+            rows=rows, cols=columns, print_grid=False,
+            horizontal_spacing=0.04, vertical_spacing=v_space,
+            subplot_titles=plot_titles
+        )
 
-        if "build" not in locals():
-            # build no. not included in gene_stats file
-            build = ""
+        # counter for grid
+        row_no = 1
+        col_no = 1
 
-        if panel is not None:
-            # if optional panel file given, get name and format for HTML
-            panel_name = Path(panel).stem
+        for gene in genes:
+            # make plot for each gene / exon
 
-            # format according to output of
-            # https://github.com/eastgenomics/eggd_generate_bed
-            panel_name = [x.strip("_") for x in panel_name.split("&&") if x]
-            panel_name = [
-                x.strip("_b37").strip("_b38") for x in panel_name if x
-            ]
-            panel_name = [x.replace("_", " ") for x in panel_name if x]
-            panel_name = ",&nbsp".join(panel_name)
-            panel = "<li>Panel(s) / gene(s) included in report: <b>{}</b>\
-                </li>".format(panel_name)
-        else:
-            panel = ""
+            # counter for grid, once it reaches the 5th entry start a new row
+            if row_no // 5 == 1:
+                col_no += 1
+                row_no = 1
 
-        column = [
-            "chrom", "exon_start", "exon_end", "gene", "tx", "exon",
-            "cov_start", "cov_end", "cov"
-        ]
+            # get rows for current gene and exon
+            exon_cov = low_raw_cov.loc[(
+                low_raw_cov["gene"] == gene[0]
+            ) & (
+                low_raw_cov["exon"] == gene[1]
+            )]
 
-        dtypes = {
-            "chrom": str, "exon_start": int, "exon_end": int, "gene": str,
-            "tx": str, "exon": int, "cov_start": int, "cov_end": int,
-            "cov": int
-        }
+            exon_cov = exon_cov.sort_values(by='cov_start', ascending=True)
+            start = exon_cov.iloc[0]
+            end = exon_cov.iloc[-1]
 
-        # read in raw coverage stats file
-        with open(raw_coverage) as raw_file:
-            raw_coverage = pd.read_csv(
-                raw_file, sep="\t", names=column, dtype=dtypes
-            )
-            # strip chr from chrom in cases of diff. formatted bed
-            raw_coverage["chrom"] = raw_coverage["chrom"].apply(
-                lambda x: str(x).replace("chr", "")
-            )
+            if start["exon_start"] != start["cov_start"]:
+                # if cov_start differs from exon start due to mosdepth
+                # binning, use exon start; this avoids wrongly estimating
+                # coverage by using the wrong exon length
+                exon_cov.iloc[0, exon_cov.columns.get_loc(
+                    "cov_start")] = int(start["exon_start"])
 
-        if snp_vcfs:
-            # get names of SNP vcfs used to display in report
-            vcfs = ", ".join([Path(x).stem for x in snp_vcfs])
-            vcfs = "<br>VCF(s) of known variants included in report: <b>{}</b>\
-                </br>".format(vcfs)
-        else:
-            vcfs = ""
+            if end["exon_end"] != end["cov_end"]:
+                # same as start
+                exon_cov.loc[
+                    exon_cov.index[-1], "cov_end"] = int(end["exon_end"])
 
-        # check given threshold is in the stats files
-        if "x" not in str(threshold):
-            threshold = str(threshold) + "x"
+            # create empty df for unbinned data with same columns
+            exon_cov_unbinned = exon_cov[0:0]
 
-        if threshold not in list(cov_stats) and\
-                threshold not in list(cov_summary):
-            print("""--threshold must be one of the gene and exon
-                stats coverage thresholds. Exiting now.""")
-            sys.exit()
+            for i, row in exon_cov.iterrows():
+                for pos in range(row["cov_start"], row["cov_end"] + 1):
+                    # unbin each row, set start & end to same value for each
+                    # use +1 since range is non inclusive of final value
+                    pos_row = row
+                    pos_row["cov_start"] = pos
+                    pos_row["cov_end"] = pos
+                    exon_cov_unbinned = exon_cov_unbinned.append(
+                        pos_row, ignore_index=True
+                    )
 
-        return cov_stats, cov_summary, raw_coverage, html_template, build,\
-            panel, vcfs, bootstrap, version
+            # build list of first and last point for threshold line
+            xval = [x for x in range(
+                exon_cov_unbinned["cov_start"].iloc[0],
+                exon_cov_unbinned["cov_end"].iloc[-1]
+            )]
+            xval = xval[::len(xval) - 1]
+            yval = [threshold] * 2
 
+            # info field for hovering on plot line
+            label = '<i>position: </i>%{x}<br>coverage: %{y}<extra></extra>'
 
-    def build_report(self, html_template, total_stats, gene_stats,
-                     sub_threshold_stats, snps_low_cov, snps_high_cov,
-                     snps_no_cov, fig, all_plots, summary_plot, report_vals,
-                     bootstrap
-                     ):
-        """
-        Build report from template and variables to write to file
+            # generate plot and threshold line to display
+            if sum(exon_cov_unbinned["cov"]) != 0:
+                plot = go.Scatter(
+                    x=exon_cov_unbinned["cov_start"], y=exon_cov_unbinned["cov"],
+                    mode="lines",
+                    hovertemplate=label
+                )
+            else:
+                # if any plots have no coverage, just display empty plot
+                # very hacky way by making data point transparent but
+                # ¯\_(ツ)_/¯
+                plot = go.Scatter(
+                    x=exon_cov_unbinned["cov_start"], y=exon_cov_unbinned["cov"],
+                    mode="markers", marker={"opacity": 0}
+                )
 
-        Args:
-            - html_template (str): string of HTML template file
-            - total_stats (df): total stats table of all genes & exons
-            - gene_stats (df): stats table of whole gene
-            - sub_threshold_stats (df): table of exons with < threshold
-            - snps_low_cov (df): table of snps with cov < threshold
-            - snsp_high_cov (df): table of snps with cov > threshold
-            - snps_no_cov (df): variants that span exon boundaries (i.e SVs)
-            - fig (figure): grid of low coverage exon plots (plotly)
-            - all-plots (figure): grid of all full gene- exon plots
-            - summary_plot (figure): gene summary plot - % at threshold
-            - report_vals (dict): values to display in report text
-        Returns:
-            - single_report (str): HTML string of filled report
-        """
-        # convert logo image into string to pass in to template
-        logo = str(os.path.join(os.path.dirname(
-            os.path.abspath(__file__)), "../data/static/images/logo.png"
-        ))
+            threshold_line = go.Scatter(
+                x=xval, y=yval, hoverinfo='skip', mode="lines",
+                line=dict(color='rgb(205, 12, 24)', width=1)
+            )
 
-        data_uri = base64.b64encode(open(logo, 'rb').read()).decode('utf-8')
-        logo = '<img height="25" width="22" src=data:image/png;base64,{0}\
-            alt="" style="vertical-align:middle; padding-bottom:3px">'.format(
-            data_uri)
+            # add to subplot grid
+            fig.add_trace(plot, col_no, row_no)
+            fig.add_trace(threshold_line, col_no, row_no)
 
-        t = Template(html_template)
+            row_no = row_no + 1
 
-        date = datetime.today().strftime('%Y-%m-%d')
+        # set height of grid: 300 per row of plots plus a fixed 150
+        height = (rows * 300) + 150
 
-        single_report = t.safe_substitute(
-            bootstrap=bootstrap,
-            logo=logo,
-            total_genes=report_vals["total_genes"],
-            threshold=report_vals["threshold"],
-            summary_text=report_vals["summary_text"],
-            exon_issues=report_vals["exon_issues"],
-            gene_issues=report_vals["gene_issues"],
-            fully_covered_genes=report_vals["fully_covered_genes"],
-            name=report_vals["name"],
-            sub_threshold_stats=sub_threshold_stats,
-            low_cov_plots=fig,
-            all_plots=all_plots,
-            summary_plot=summary_plot,
-            gene_stats=gene_stats,
-            total_stats=total_stats,
-            snps_high_cov=snps_high_cov,
-            snps_low_cov=snps_low_cov,
-            snps_no_cov=snps_no_cov,
-            total_snps=report_vals["total_snps"],
-            snps_covered=report_vals["snps_covered"],
-            snps_pct_covered=report_vals["snps_pct_covered"],
-            snps_not_covered=report_vals["snps_not_covered"],
-            snps_pct_not_covered=report_vals["snps_pct_not_covered"],
-            snps_out_panel=report_vals["snps_out_panel"],
-            snps_pct_out_panel=report_vals["snps_pct_out_panel"],
-            date=date,
-            build=report_vals["build"],
-            vcfs=report_vals["vcfs"],
-            panel=report_vals["panel"],
-            panel_pct_coverage=report_vals["panel_pct_coverage"],
-            version=report_vals["version"]
+        # update plot formatting
+        fig.update_xaxes(nticks=3, ticks="", showgrid=True, tickformat=',d')
+        fig.update_yaxes(title='coverage', title_standoff=0)
+        fig.update_xaxes(title='exon position', color='#FFFFFF')
+        fig["layout"].update(
+            height=height, showlegend=False, margin=dict(l=50, r=0)
         )
 
-        return single_report
+        # write plots to html string
+        fig = fig.to_html(full_html=False)
 
+        return fig
 
-    def panel_coverage(self, cov_stats, threshold):
+
+    def all_gene_plots(self, raw_coverage, threshold):
         """
-        Calculates mean coverage of all panel regions at given threshold,
-        normalised against length of each gene
+        Generate full plots for each gene
 
         Args:
-            - cov_stats (df): df of coverage stats for each exon
-            - threshold (int): threshold cut off for low coverage
+            - raw_coverage (df): df of raw bp coverage to plot, may be
+                                    empty
+            - threshold (int): defined threshold level (default: 20)
 
         Returns:
-            - panel_pct_coverage (str): % coverage of panel as str
+            - all_plots (str): HTML string of img tags, one plot per gene
         """
-        print("Calculating panel average coverage")
 
-        # threshold column to check at
-        threshold = str(threshold) + "x"
+        all_plots = ""
 
-        gene_stats = pd.DataFrame(
-            columns=["gene", "gene_len", "coverage"])
+        if len(raw_coverage.index) == 0:
+            # passed empty df, most likely because there were less genes
+            # than processes => empty df passed with multiprocess
+            return ""
 
-        # make list of genes
-        genes = sorted(list(set(cov_stats["gene"].tolist())))
+        # get unique list of genes
+        genes = raw_coverage.drop_duplicates(["gene"])["gene"].values.tolist()
 
         for gene in genes:
-            # for each gene, calculate length and average % at threshold
-            gene_cov = cov_stats.loc[cov_stats["gene"] == gene]
 
-            length = sum(gene_cov["exon_len"])
-            coverage = sum(
-                gene_cov[threshold] * gene_cov["exon_len"] / length)
+            # get coverage data for current gene
+            gene_cov = raw_coverage.loc[(raw_coverage["gene"] == gene)]
+            # get list of exons
+            exons = gene_cov.drop_duplicates(["exon"])["exon"].values.tolist()
 
-            gene_stats = gene_stats.append({
-                "gene": gene,
-                "gene_len": length,
-                "coverage": coverage
-            }, ignore_index=True)
+            # no. plot columns = no. of exons
+            column_no = len(exons)
 
-        # calculate % panel coverage
-        panel_coverage = sum(
-            gene_stats["coverage"] * gene_stats["gene_len"] / sum(
-                gene_stats["gene_len"]
-            )
-        )
+            # make subplot grid size of no. of exons, height variable
+            # splits large genes to several rows and maintains height
+            height = math.ceil(len(exons) / 30) * 4
+            fig = plt.figure(figsize=(30, height))
 
-        # round to 12 dp to account for limit of accuracy of float &
-        # length of human genome
-        panel_coverage = round(panel_coverage, 12)
+            # generate grid with space for each exon
+            # splits genes with >30 exons to multiple rows
+            rows = math.ceil(len(exons) / 30)
+            if column_no > 30:
+                column_no = 30
 
-        panel_pct_coverage = str(math.floor(panel_coverage * 100) / 100)
+            grid = fig.add_gridspec(rows, column_no, wspace=0)
+            axs = grid.subplots(sharey=True)
 
-        return panel_pct_coverage
+            if column_no == 1:
+                # handle single exon genes, axs needs turning into np
+                # array to flatten
+                axs = np.array([axs])
 
+            axs = axs.flatten()
 
-    def snp_coverage(self, snp_vcfs, raw_coverage, threshold):
-        """
-        Produces tables of coverage for variants inside of capture
-        regions, and larger structural variants spanning region
-        boundaries.
+            fig.suptitle(gene, fontweight="bold")
+            count = 0
 
-        Args:
-            - snp_vcfs (str): list of vcf files used for SNP analysis
-            - raw_coverage (df): raw bp coverage for each exon
-            - threshold (int): threshold value passed from parse args
+            for exon in exons:
+                # get coverage data for current exon
+                exon_cov = raw_coverage.loc[(
+                    raw_coverage["gene"] == gene
+                ) & (
+                    raw_coverage["exon"] == exon
+                )]
 
-        Returns:
-            - snps_low_cov (df): variants with lower coverage than threshold
-            - snps_high_cov (df): variants with higher coverage than threshold
-            - snps_no_cov (df): variants that span exon boundaries (i.e SVs)
-        """
-        print("Calculating coverage of given SNPs")
+                exon_cov = exon_cov.reset_index(drop=True)
 
-        bedFile = raw_coverage[
-            ["chrom", "exon_start", "exon_end"]].drop_duplicates()
-        coverageFile = raw_coverage[
-            ["chrom", "cov_start", "cov_end", "cov"]].drop_duplicates()
+                # sort and check coordinates are correct
+                exon_cov = exon_cov.sort_values(
+                    by='cov_start', ascending=True
+                )
 
-        # turn dfs into BedTools objects
-        bed = bedtools.BedTool.from_dataframe(bedFile)
-        cov = bedtools.BedTool.from_dataframe(coverageFile)
+                start = exon_cov.iloc[0]
+                end = exon_cov.iloc[-1]
 
-        # empty df to add all SNP info to
-        snp_df = pd.DataFrame(columns=[
-            'VCF', 'chrom', 'pos', 'id', 'ref', 'alt', 'info'
-        ])
+                if start["exon_start"] != start["cov_start"]:
+                    # if cov_start differs from exon start due to mosdepth
+                    # binning, use exon start; this avoids wrongly
+                    # estimating coverage by using the wrong exon length
+                    exon_cov.iloc[
+                        0, exon_cov.columns.get_loc("cov_start")
+                    ] = int(start["exon_start"])
 
-        for vcf in snp_vcfs:
-            # read vcf into BedTools object
-            v = bedtools.BedTool(vcf)
+                if end["exon_end"] != end["cov_end"]:
+                    # same as start
+                    exon_cov.loc[exon_cov.index[-1], "cov_end"] = int(
+                        end["exon_end"]
+                    )
 
-            # get vcf name to add to table, req. for multiple VCFS and
-            # recording variant source VCF
-            name = Path(vcf).stem.split("_")[0]
+                # check if coverage column empty
+                if (exon_cov['cov'] == 0).all():
+                    # no coverage, generate empty plot with just
+                    # threshold line
+                    axs[count].plot(
+                        [0, 100], [threshold, threshold],
+                        color='red', linestyle='-', linewidth=2
+                    )
+                else:
+                    axs[count].plot(exon_cov["cov_start"], exon_cov["cov"])
 
-            # use bedtools intersect to get SNPs in capture region
-            snps = bed.intersect(v, wb=True)
+                    # threshold line
+                    axs[count].plot(
+                        [exon_cov["exon_start"], exon_cov["exon_end"]],
+                        [threshold, threshold], color='red', linestyle='-',
+                        linewidth=1
+                    )
 
-            for row in snps:
-                # get data from returned BedTools object, add to df
-                snp_data = str(row).split()
-                snp_df = snp_df.append({
-                    'VCF': name, 'chrom': snp_data[3],
-                    'pos': snp_data[4], 'ref': snp_data[6],
-                    'alt': snp_data[7], 'info': snp_data[10]
-                }, ignore_index=True)
+                # add labels
+                xlab = str(
+                    exon_cov["exon_end"].iloc[0] -
+                    exon_cov["exon_start"].iloc[0]
+                ) + "\nbp"
+                axs[count].title.set_text(exon)
+                axs[count].set_xlabel(xlab)
 
-        snp_df = snp_df[
-            ['VCF', 'chrom', 'pos', 'ref', 'alt', 'info']].drop_duplicates()
+                count += 1
 
-        # reset index
-        raw_coverage = raw_coverage.reset_index(drop=True)
+            # remove y ticks & label for all but first plot of lines
+            for i in range(column_no * rows):
+                if i in [x * column_no for x in range(rows)]:
+                    # first plot of line, keep ticks and labels
+                    continue
+                else:
+                    axs[i].yaxis.set_ticks_position('none')
 
-        # use pandasql to intersect SNPs against coverage df to find the
-        # coverage at each SNP position
-        coverage_sql = """
-            SELECT snp_df.VCF, snp_df.chrom, snp_df.pos, snp_df.ref,
-            snp_df.alt, snp_df.info, raw_coverage.gene, raw_coverage.exon,
-            raw_coverage.cov_start, raw_coverage.cov_end, raw_coverage.cov
-            FROM snp_df
-            LEFT JOIN raw_coverage on snp_df.CHROM=raw_coverage.chrom
-            WHERE snp_df.POS > raw_coverage.cov_start AND
-            snp_df.POS <= raw_coverage.cov_end
-            """
+            # strip x axis ticks and labels
+            plt.setp(plt.gcf().get_axes(), xticks=[])
 
-        snp_cov = pdsql.sqldf(coverage_sql, locals())
+            # adjust yaxis limits
+            ymax = max(gene_cov["cov"].tolist()) + 10
+            plt.ylim(bottom=0, top=ymax)
 
-        # get SNPs that won't have coverage data but do intersect panel
-        # regions (i.e. large deletions that span a region)
-        snps_no_cov = snp_df.merge(snp_cov, how='outer', indicator=True).loc[
-            lambda x: x['_merge'] == 'left_only']
+            # remove outer white margins
+            fig.tight_layout(h_pad=1.2)
 
-        snps_no_cov = snps_no_cov[[
-            "VCF", "chrom", "pos", "ref", "alt", "info"
-        ]].reset_index(drop=True)
+            # convert image to html string and append to one really long
+            # string to insert in report
+            buffer = BytesIO()
+            plt.savefig(buffer, format='png')
+            buffer.seek(0)
+            image_png = buffer.getvalue()
+            buffer.close()
+            graphic = base64.b64encode(image_png)
+            data_uri = graphic.decode('utf-8')
+            img_tag = "<img src=data:image/png;base64,{0} style='max-width:\
+                100%; max-height: auto; object-fit: contain; ' />".format(
+                data_uri
+            )
 
-        # get required columns for SNP tables
-        snps_cov = snp_cov[
-            ["VCF", "gene", "exon", "chrom", "pos", "ref", "alt", "cov"]
-        ].drop_duplicates(subset=[
-            "VCF", "chrom", "pos", "ref", "alt"]).reset_index(drop=True)
+            all_plots = all_plots + img_tag + "<br></br>"
 
-        # rename columns for displaying in report
-        snps_cov.columns = ["VCF", "Gene", "Exon", "Chromosome", "Position",
-                            "Ref", "Alt", "Coverage"]
+            plt.close(fig)
 
-        snps_no_cov.columns = [
-            "VCF", "Chromosome", "Position", "Ref", "Alt", "Info"
-        ]
+        return all_plots
 
-        # remove <> from DELs to stop being interpreted as HTML tags
-        snps_no_cov["Alt"] = snps_no_cov["Alt"].str.strip("<>")
 
-        snps_cov["Coverage"] = snps_cov["Coverage"].astype(int)
+    def summary_gene_plot(self, cov_summary, threshold):
+        """
+        Generate summary plot of all genes against threshold value
 
-        # sort no_cov table by chrom & pos, as pos is str first define
-        # order to sort by
-        order = [str(x) for x in range(0, 23)]
-        order.extend(["X", "Y", "MT"])
-        snps_no_cov["Chromosome"] = pd.Categorical(
-            snps_no_cov["Chromosome"], order
+        Args:
+            - cov_summary (df): df of gene coverage values
+            - threshold (int): defined threshold level (default: 20)
+
+        Returns:
+            - summary_plot (fig): plot of all genes
+        """
+        print("Generating summary plot")
+
+        threshold = str(threshold) + "x"
+
+        summary_data = cov_summary.copy()
+
+        # define colours based on values
+        summary_data["colours"] = 'green'
+        summary_data.loc[summary_data[threshold] < 100, 'colours'] = 'orange'
+        summary_data.loc[summary_data[threshold] < 90, 'colours'] = 'red'
+
+        summary_data = summary_data.sort_values(
+            by=[threshold], ascending=False
         )
+        summary_plot, axs = plt.subplots(figsize=(25, 7.5))
 
-        snps_cov = snps_cov.sort_values(by=["Gene", "Exon", "Position"])
-        snps_no_cov = snps_no_cov.sort_values(by=["Chromosome", "Position"])
+        if len(summary_data.index) > 100:
+            # split off some of 100% covered genes to limit size of plot
+            if len(summary_data[summary_data[threshold] < 100]) > 100:
+                # over 100 sub threshold genes, remove all 100% genes
+                genes100pct = len(summary_data[summary_data[threshold] == 100])
+                summary_data = summary_data[summary_data[threshold] < 100]
+            else:
+                # split off bottom 100 genes, plot includes some 100% covered
+                genes100pct = len(summary_data.iloc[:-100])
+                summary_data = summary_data.iloc[-100:]
 
-        # split SNPs by coverage against threshold
-        snps_low_cov = snps_cov.loc[snps_cov["Coverage"] < threshold]
-        snps_high_cov = snps_cov.loc[snps_cov["Coverage"] >= threshold]
+        plt.bar(
+            summary_data["gene"], [int(x) for x in summary_data[threshold]],
+            color=summary_data.colours
+        )
 
-        return snps_low_cov, snps_high_cov, snps_no_cov
+        if "genes100pct" in locals():
+            genes100pct = str(genes100pct)
+            # more than 100 genes, add title inc. 100% covered not shown
+            axs.set_title(
+                r"$\bf{" + genes100pct + "}$" + " genes covered 100% at " +
+                r"$\bf{" + threshold + "}$" +
+                " were omitted from the plot due to the panel size", loc='left'
+            )
 
+        # threshold lines
+        plt.axhline(y=99, linestyle='--', color="#565656", alpha=0.6)
+        plt.axhline(y=95, linestyle='--', color="#565656", alpha=0.6)
 
-    def low_coverage_regions(self, cov_stats, raw_coverage, threshold):
+        plt.text(1.005, 0.94, '99%', transform=axs.transAxes)
+        plt.text(1.005, 0.91, '95%', transform=axs.transAxes)
+
+        # plot formatting
+        axs.tick_params(labelsize=6, length=0)
+        plt.xticks(rotation=55, color="#565656")
+
+        # adjust whole plot margins
+        axs.margins(x=0.01)
+        axs.autoscale_view(scaley=True)
+
+        # add legend
+        green = mpatches.Patch(color='green', label='100%')
+        orange = mpatches.Patch(color='orange', label='90-99.99%')
+        red = mpatches.Patch(color='red', label='<90%')
+
+        plt.legend(
+            handles=[green, orange, red], loc='upper center',
+            bbox_to_anchor=(0.5, -0.1),
+            fancybox=True, shadow=True, ncol=12, fontsize=12
+        )
+
+        vals = np.arange(0, 110, 10).tolist()
+        plt.yticks(vals, vals)
+        axs.tick_params(axis='both', which='major', labelsize=8)
+
+        plt.xlabel("")
+        plt.ylabel("% coverage ({})".format(threshold), fontsize=11)
+
+        axs.yaxis.grid(linewidth=0.5, color="grey", linestyle="-.")
+        plt.box(False)
+        axs.set_axisbelow(True)
+        plt.tight_layout()
+
+        # convert image to html string to insert in report
+        buffer = BytesIO()
+        plt.savefig(buffer, format='png')
+        buffer.seek(0)
+        image_png = buffer.getvalue()
+        buffer.close()
+        graphic = base64.b64encode(image_png)
+        data_uri = graphic.decode('utf-8')
+        summary_plot = "<img src=data:image/png;base64,{0} style='max-width:\
+            100%; max-height: auto; object-fit: contain; ' />".format(
+            data_uri
+        )
+
+        return summary_plot
+
+
+class styleTables():
+    """Functions for styling tables for displaying in report"""
+
+    def style_sub_threshold(
+            self, cov_stats, threshold, threshold_cols, vals
+        ):
         """
-        Get regions where coverage at given threshold is <100%
+        Styling of sub threshold stats df for displaying in report
 
         Args:
-            - cov_stats (df): df of coverage stats for each exon
-            - raw_coverage (df): raw bp coverage for each exon
-            - threshold (int): defined threshold level (default: 20)
-
+            - cov_stats (df): df of per exon coverage stats
+            - threshold (str): low coverage threshold value
+            - threshold_cols (list): threshold values for coverage
         Returns:
-            - low_raw_cov (df): df of raw bp values for each region with
-                                coverage less than 100% at threshold
+            - sub_threshold_stats (str): HTML string of styled table
         """
-        # threshold column to check at
-        threshold = str(threshold) + "x"
-
-        # get threshold columns and add to column names
-        threshold_cols = list(cov_stats.filter(regex='[0-9]+x', axis=1))
-
         column = [
-            "gene", "tx", "chrom", "exon", "exon_start", "exon_end",
-            "min", "mean", "max"
+            "gene", "tx", "chrom", "exon", "exon_len", "exon_start",
+            "exon_end", "min", "mean", "max"
         ]
 
         column.extend(threshold_cols)
 
-        # empty df
-        low_stats = pd.DataFrame(columns=column)
+        sub_threshold = pd.DataFrame(columns=column)
 
-        # get all exons with <100% coverage at given threshold
+        # get all exons with <100% coverage at threshold
         for i, row in cov_stats.iterrows():
             if int(row[threshold]) < 100:
-                low_stats = low_stats.append(row, ignore_index=True)
+                sub_threshold = sub_threshold.append(row, ignore_index=True)
 
         # pandas is terrible and forces floats, change back to int
         dtypes = {
             'chrom': str,
             'exon': int,
+            'exon_len': int,
             'exon_start': int,
             'exon_end': int,
             'min': int,
             'max': int
         }
 
-        low_stats = low_stats.astype(dtypes)
+        if not sub_threshold.empty:
+            # some low covered regions identified
+            sub_threshold = sub_threshold.astype(dtypes)
 
-        # get list of tuples of genes and exons with low coverage to
-        # select out raw coverage
-        low_exon_list = low_stats.reset_index()[['gene',
-                                                'exon']].values.tolist()
-        low_exon_list = [tuple(exon) for exon in low_exon_list]
+            sub_threshold_stats = pd.pivot_table(sub_threshold, index=[
+                "gene", "tx", "chrom", "exon",
+                "exon_len", "exon_start", "exon_end"
+            ], values=vals)
 
-        # get raw coverage for low coverage regions to plot
-        low_raw_cov = raw_coverage[raw_coverage[['gene', 'exon']].apply(
-            tuple, axis=1).isin(low_exon_list)].reset_index()
+            # reset index to fix formatting
+            sub_threshold_stats = sub_threshold_stats.reindex(vals, axis=1)
+            sub_threshold_stats.reset_index(inplace=True)
 
-        return low_raw_cov
+            gene_issues = len(list(set(sub_threshold_stats["gene"].tolist())))
+            exon_issues = len(sub_threshold_stats["exon"])
+        else:
+            # if no low regions set to empty df with appropriate columns
+            print("No low coverage regions, generating empty table")
+            sub_threshold_stats = pd.DataFrame(columns=column)
+            gene_issues = 0
+            exon_issues = 0
 
+        # rename columns to display properly
+        sub_threshold_stats = sub_threshold_stats.rename(columns={
+            "gene": "Gene",
+            "tx": "Transcript",
+            "chrom": "Chr",
+            "exon": "Exon",
+            "exon_len": "Length",
+            "exon_start": "Start",
+            "exon_end": "End",
+            "min": "Min",
+            "mean": "Mean",
+            "max": "Max"
+        })
 
-    def low_exon_plot(self, low_raw_cov, threshold):
-        """
-        Plot bp coverage of exon, used for those where coverage is given
-        threshold
+        # reindex & set to begin at 1
+        sub_threshold_stats.index = np.arange(
+            1, len(sub_threshold_stats.index) + 1
+        )
 
-        Args:
-            - low_raw_cov (df): df of raw coverage for exons with low
-                                coverage
-            - threshold (int): defined threshold level (default: 20)
+        # create slices of sub_threshold stats df to add styling to
+        slice_ranges = {
+            "x0": (10, 0), "x10": (30, 10), "x30": (50, 30), "x50": (70, 50),
+            "x70": (90, 70), "x90": (95, 90), "x95": (99, 95), "x99": (101, 99)
+        }
 
-        Returns:
-            - fig (figure): plots of low coverage regions
-        """
-        print("Generating plots of low covered regions")
+        sub_slice = {}
 
-        # get list of tuples of genes and exons to define plots
-        genes = low_raw_cov.drop_duplicates(
-            ["gene", "exon"])[["gene", "exon"]].values.tolist()
-        genes = [tuple(exon) for exon in genes]
+        for key, val in slice_ranges.items():
+            sub_slice[key] = pd.IndexSlice[sub_threshold_stats.loc[(
+                sub_threshold_stats[threshold] < val[0]
+            ) & (
+                sub_threshold_stats[threshold] >= val[1])].index, threshold]
 
-        if len(genes) == 0:
-            # everything above threshold, don't generate plots
-            fig = "<br></br><b>All regions in panel above threshold, no plots\
-                to show.</b><br></br>"
+        # df column index of threshold
+        col_idx = sub_threshold_stats.columns.get_loc(threshold)
 
-            return fig
+        # make dict for rounding coverage columns to 2dp
+        rnd = {}
+        for col in list(sub_threshold_stats.columns[10:15]):
+            rnd[col] = '{0:.2f}%'
 
-        # sort list of genes/exons by gene and exon
-        genes = sorted(genes, key=lambda element: (element[0], element[1]))
+        # set threshold column widths as a fraction of 30% table width
+        t_width = str(30 / len(threshold_cols)) + "%"
 
-        plot_titles = [str(x[0]) + " exon: " + str(int(x[1])) for x in genes]
+        # apply colours to coverage cell based on value, 0 is given solid red
+        s = sub_threshold_stats.style.apply(lambda x: [
+            "background-color: #b30000" if x[threshold] == 0 and idx == col_idx
+            else "" for idx, v in enumerate(x)
+        ], axis=1)\
+            .bar(subset=sub_slice["x0"], color='#b30000', vmin=0, vmax=100)\
+            .bar(subset=sub_slice["x10"], color='#990000', vmin=0, vmax=100)\
+            .bar(subset=sub_slice["x30"], color='#C82538', vmin=0, vmax=100)\
+            .bar(subset=sub_slice["x50"], color='#FF4500', vmin=0, vmax=100)\
+            .bar(subset=sub_slice["x70"], color='#FF4500', vmin=0, vmax=100)\
+            .bar(subset=sub_slice["x90"], color='#FF4500', vmin=0, vmax=100)\
+            .bar(subset=sub_slice["x95"], color='#FFBF00', vmin=0, vmax=100)\
+            .bar(subset=sub_slice["x99"], color='#007600', vmin=0, vmax=100)\
+            .format(rnd)\
+            .set_table_attributes('table border="1"\
+                class="dataframe table table-hover table-bordered"')\
+            .set_uuid("low_exon_table")\
+            .set_properties(**{'font-size': '0.85vw', 'table-layout': 'auto'})\
+            .set_properties(subset=threshold_cols, **{'width': t_width})\
 
-        low_raw_cov["exon_len"] =\
-            low_raw_cov["exon_end"] - low_raw_cov["exon_start"]
+        sub_threshold_stats["Mean"] = sub_threshold_stats["Mean"].apply(
+            lambda x: int(x)
+        )
 
-        low_raw_cov["relative_position"] = low_raw_cov["exon_end"] - round(((
-            low_raw_cov["cov_end"] + low_raw_cov["cov_start"]) / 2
-        ))
+        sub_threshold_stats = s.render()
 
-        # set no. rows to no. of plots / no of columns to define grid
-        columns = 4
-        rows = math.ceil(len(genes) / 4)
+        return sub_threshold_stats, gene_issues, exon_issues
 
-        # variable height depeendent on no. of plots
-        v_space = (1 / rows) * 0.25
 
-        # define grid to add plots to
-        fig = make_subplots(
-            rows=rows, cols=columns, print_grid=False,
-            horizontal_spacing=0.04, vertical_spacing=v_space,
-            subplot_titles=plot_titles
+    def style_total_stats(self, cov_stats, threshold_cols, vals):
+        """
+        Styling of full gene-exon stats table for displaying in report
+        Args:
+            -
+        Returns:
+            - total_stats (str): HTML string of full exon stats table
+        """
+        # do some excel level formatting to make table more readable
+        total_stats = pd.pivot_table(
+            cov_stats,
+            index=["gene", "tx", "chrom", "exon", "exon_len",
+                   "exon_start", "exon_end"],
+            values=vals
         )
 
-        # counter for grid
-        row_no = 1
-        col_no = 1
-
-        for gene in genes:
-            # make plot for each gene / exon
+        # reset index to fix formatting, set beginning to 1
+        total_stats = total_stats.reindex(vals, axis=1)
+        total_stats.index = np.arange(1, len(total_stats.index) + 1)
 
-            # counter for grid, by gets to 5th entry starts new row
-            if row_no // 5 == 1:
-                col_no += 1
-                row_no = 1
+        total_stats = total_stats.rename(columns={
+            "gene": "Gene",
+            "tx": "Transcript",
+            "chrom": "Chr",
+            "exon": "Exon",
+            "exon_len": "Length",
+            "exon_start": "Start",
+            "exon_end": "End",
+            "min": "Min",
+            "mean": "Mean",
+            "max": "Max"
+        })
 
-            # get rows for current gene and exon
-            exon_cov = low_raw_cov.loc[(
-                low_raw_cov["gene"] == gene[0]
-            ) & (
-                low_raw_cov["exon"] == gene[1]
-            )]
+        # limit to 2dp using math.floor, use of round() with
+        # 2dp may lead to inaccuracy such as 99.99 => 100.00
+        round_cols = ['Mean'] + threshold_cols
 
-            exon_cov = exon_cov.sort_values(by='cov_start', ascending=True)
-            start = exon_cov.iloc[0]
-            end = exon_cov.iloc[-1]
+        for col in round_cols:
+            total_stats[col] = total_stats[col].map(
+                lambda col: math.floor(col * 100) / 100
+            )
 
-            if start["exon_start"] != start["cov_start"]:
-                # if cov_start is diff to tx start due to mosdepth
-                # binning, use tx start avoids wrongly estimating
-                # coverage by using wrong tx length
-                exon_cov.iloc[0, exon_cov.columns.get_loc(
-                    "cov_start")] = int(start["exon_start"])
+        # CSS table class for styling tables
+        style = (
+            '<table border="1" class="dataframe">',
+            '<table class="table table-striped" style="font-size: 0.85vw;" >'
+        )
 
-            if end["exon_end"] != end["cov_end"]:
-                # same as start
-                exon_cov.loc[
-                    exon_cov.index[-1], "cov_end"] = int(end["exon_end"])
+        total_stats = total_stats.to_html(justify='left').replace(
+            style[0], style[1]
+        )
 
-            # create empty df for unbinned data with same columns
-            exon_cov_unbinned = exon_cov[0:0]
+        return total_stats
 
-            for i, row in exon_cov.iterrows():
-                for pos in range(row["cov_start"], row["cov_end"] + 1):
-                    # unbin each row, set start & end to same value for each
-                    # use +1 since range is non inclusive of final value
-                    pos_row = row
-                    pos_row["cov_start"] = pos
-                    pos_row["cov_end"] = pos
-                    exon_cov_unbinned = exon_cov_unbinned.append(
-                        pos_row, ignore_index=True
-                    )
 
-            # build list of first and last point for threshold line
-            xval = [x for x in range(
-                exon_cov_unbinned["cov_start"].iloc[0],
-                exon_cov_unbinned["cov_end"].iloc[-1]
-            )]
-            xval = xval[::len(xval) - 1]
-            yval = [threshold] * 2
+    def style_cov_summary(self, cov_summary, threshold_cols):
+        """
+        """
+        # rename columns for displaying in report
+        cov_summary = cov_summary.drop(columns=["exon"])
+        cov_summary = cov_summary.rename(columns={
+            "gene": "Gene",
+            "tx": "Transcript",
+            "min": "Min",
+            "mean": "Mean",
+            "max": "Max"
+        })
 
-            # info field for hovering on plot line
-            label = '<i>position: </i>%{x}<br>coverage: %{y}<extra></extra>'
+        # get values to display in report
+        total_genes = len(cov_summary["Gene"].tolist())
 
-            # generate plot and threshold line to display
-            if sum(exon_cov_unbinned["cov"]) != 0:
-                plot = go.Scatter(
-                    x=exon_cov_unbinned["cov_start"], y=exon_cov_unbinned["cov"],
-                    mode="lines",
-                    hovertemplate=label
-                )
-            else:
-                # if any plots have no coverage, just display empty plot
-                # very hacky way by making data point transparent but
-                # ¯\_(ツ)_/¯
-                plot = go.Scatter(
-                    x=exon_cov_unbinned["cov_start"], y=exon_cov_unbinned["cov"],
-                    mode="markers", marker={"opacity": 0}
-                )
+        # limit to 2dp using math.floor, use of round() with
+        # 2dp may lead to inaccuracy such as 99.99 => 100.00
+        round_cols = ['Mean'] + threshold_cols
 
-            threshold_line = go.Scatter(
-                x=xval, y=yval, hoverinfo='skip', mode="lines",
-                line=dict(color='rgb(205, 12, 24)', width=1)
+        for col in round_cols:
+            cov_summary[col] = cov_summary[col].map(
+                lambda col: math.floor(col * 100) / 100
             )
 
-            # add to subplot grid
-            fig.add_trace(plot, col_no, row_no)
-            fig.add_trace(threshold_line, col_no, row_no)
-
-            row_no = row_no + 1
-
-        # set height of grid by no. rows and scale value of 325
-        height = (rows * 300) + 150
+        # reset index to start at 1
+        cov_summary.index = np.arange(1, len(cov_summary.index) + 1)
 
-        # update plot formatting
-        fig.update_xaxes(nticks=3, ticks="", showgrid=True, tickformat=',d')
-        fig.update_yaxes(title='coverage', title_standoff=0)
-        fig.update_xaxes(title='exon position', color='#FFFFFF')
-        fig["layout"].update(
-            height=height, showlegend=False, margin=dict(l=50, r=0)
+        # CSS table class for styling tables
+        style = (
+            '<table border="1" class="dataframe">',
+            '<table class="table table-striped" style="font-size: 0.85vw;" >'
         )
 
-        # write plots to html string
-        fig = fig.to_html(full_html=False)
+        # generate HTML strings from table objects to write to report
+        gene_stats = cov_summary.to_html(justify='left').replace(
+            style[0], style[1]
+        )
 
-        return fig
+        return gene_stats, total_genes
 
 
-    def all_gene_plots(self, raw_coverage, threshold):
+    def style_snps_low_cov(self, snps_low_cov):
         """
-        Generate full plots for each gene
-
-        Args:
-            - raw_coverage (file): from args; bp coverage file used as
-                                    input for coverage_stats_single.py
-            - threshold (int): defined threshold level (default: 20)
-
-        Returns:
-            - all-plots (figure): grid of all full gene- exon plots
         """
+        # get snps values and format dfs to display
+        if not snps_low_cov.empty:
+            # format low coverage SNPs table
+            snps_low_cov.index = np.arange(1, len(snps_low_cov.index) + 1)
+            snps_not_covered = len(snps_low_cov.index)
+            snps_low_cov = snps_low_cov.style\
+                .set_table_attributes(
+                    'class="dataframe table table-striped"')\
+                .set_uuid("var_low_cov")\
+                .set_properties(**{
+                    'font-size': '0.80vw', 'table-layout': 'auto'
+                })\
+                .set_properties(subset=["VCF", "Gene"], **{'width': '10%'})\
+                .set_properties(subset=["Exon"], **{'width': '7.5%'})\
+                .set_properties(subset=["Chromosome"], **{'width': '10%'})\
+                .set_properties(subset=["Position"], **{'width': '12.5%'})\
+                .set_properties(subset=["Ref"], **{'width': '20%'})\
+                .set_properties(subset=["Alt"], **{'width': '20%'})\
+                .set_properties(subset=["Coverage"], **{'width': '10%'})
 
-        all_plots = ""
-
-        if len(raw_coverage.index) == 0:
-            # passed empty df, most likely because there were less genes
-            # than processes => empty df passed with multiprocess
-            return ""
+            snps_low_cov = snps_low_cov.render()
+        else:
+            snps_low_cov = "<b>No low covered SNPs</b>"
+            snps_not_covered = 0
 
-        # get unique list of genes
-        genes = raw_coverage.drop_duplicates(["gene"])["gene"].values.tolist()
+        return snps_low_cov, snps_not_covered
 
-        for gene in genes:
 
-            # get coverage data for current gene
-            gene_cov = raw_coverage.loc[(raw_coverage["gene"] == gene)]
-            # get list of exons
-            exons = gene_cov.drop_duplicates(["exon"])["exon"].values.tolist()
+    def style_snps_high_cov(self, snps_high_cov):
+        """
+        """
 
-            # no. plot columns = no. of exons
-            column_no = len(exons)
+        if not snps_high_cov.empty:
+            # format high coverage SNPs table
+            snps_high_cov.index = np.arange(1, len(snps_high_cov.index) + 1)
 
-            # make subplot grid size of no. of exons, height variable
-            # splits large genes to several rows and maintains height
-            height = math.ceil(len(exons) / 30) * 4
-            fig = plt.figure(figsize=(30, height))
+            snps_covered = len(snps_high_cov.index)
 
-            # generate grid with space for each exon
-            # splits genes with >25 exons to multiple rows
-            rows = math.ceil(len(exons) / 30)
-            if column_no > 30:
-                column_no = 30
+            snps_high_cov = snps_high_cov.style\
+                .set_table_attributes(
+                    'class="dataframe table table-striped"')\
+                .set_uuid("var_high_cov")\
+                .set_properties(**{
+                    'font-size': '0.80vw', 'table-layout': 'auto'
+                })\
+                .set_properties(subset=["VCF", "Gene"], **{'width': '10%'})\
+                .set_properties(subset=["Exon"], **{'width': '7.5%'})\
+                .set_properties(subset=["Chromosome"], **{'width': '10%'})\
+                .set_properties(subset=["Position"], **{'width': '12.5%'})\
+                .set_properties(subset=["Ref"], **{'width': '20%'})\
+                .set_properties(subset=["Alt"], **{'width': '20%'})\
+                .set_properties(subset=["Coverage"], **{'width': '10%'})
 
-            grid = fig.add_gridspec(rows, column_no, wspace=0)
-            axs = grid.subplots(sharey=True)
+            snps_high_cov = snps_high_cov.render()
+        else:
+            snps_high_cov = "<b>No covered SNPs</b>"
+            snps_covered = 0
+        
+        return snps_high_cov, snps_covered
 
-            if column_no == 1:
-                # handle single exon genes, axs needs turning into np
-                # array to flatten
-                axs = np.array([axs])
 
-            axs = axs.flatten()
+    def style_snps_no_cov(self, snps_no_cov):
+        """
+        """
+        # if variants from vcf found that span exon boundaries
+        if not snps_no_cov.empty:
+            # manually add div and styling around rendered table, allows
+            # to be fully absent from the report if the table is empty
+            snps_no_cov.index = np.arange(1, len(snps_no_cov) + 1)
 
-            fig.suptitle(gene, fontweight="bold")
-            count = 0
+            # get number of variants to display in report
+            snps_out_panel = len(snps_no_cov.index)
 
-            for exon in exons:
-                # get coverage data for current exon
-                exon_cov = raw_coverage.loc[(
-                    raw_coverage["gene"] == gene
-                ) & (
-                    raw_coverage["exon"] == exon
-                )]
+            html_string = snps_no_cov.style\
+                .set_table_attributes(
+                    'class="dataframe table table-striped"')\
+                .set_uuid("var_no_cov")\
+                .set_properties(**{
+                    'font-size': '0.80vw', 'table-layout': 'auto'
+                })\
+                .set_properties(subset=["VCF"], **{
+                    'width': '7.5%'
+                })\
+                .set_properties(subset=[
+                    "Chromosome", "Position", "Ref", "Alt"
+                ], **{'width': '10%'})
 
-                exon_cov = exon_cov.reset_index(drop=True)
+            html_string = html_string.render()
 
-                # sort and check coordinates are correct
-                exon_cov = exon_cov.sort_values(
-                    by='cov_start', ascending=True
-                )
+            snps_no_cov = """
+                <br> Variants included in the first table below either fully\
+                    or partially span panel region(s). These are most likely\
+                    large structural variants and as such do not have\
+                    coverage data available. See the "info" column for details\
+                    on the variant.
+                </br>
+                <br> Table of variants spanning panel regions(s) &nbsp
+                <button class="btn btn-info collapsible btn-sm">Show /\
+                     hide table</button>
+                <div class="content">
+                    <table>
+                        {}
+                    </table>
+                </div></br>
+                """.format(html_string)
+        else:
+            snps_no_cov = ""
+            snps_out_panel = 0
 
-                start = exon_cov.iloc[0]
-                end = exon_cov.iloc[-1]
+        return snps_no_cov, snps_out_panel
 
-                if start["exon_start"] != start["cov_start"]:
-                    # if cov_start is diff to tx start due to mosdepth
-                    # binning, use tx start avoids wrongly estimating
-                    # coverage by using wrong tx length
-                    exon_cov.iloc[
-                        0, exon_cov.columns.get_loc("cov_start")
-                    ] = int(start["exon_start"])
 
-                if end["exon_end"] != end["cov_end"]:
-                    # same as start
-                    exon_cov.loc[exon_cov.index[-1], "cov_end"] = int(
-                        end["exon_end"]
-                    )
+class singleReport():
+    """Functions to calculate values and generate report"""
 
-                # check if coverage column empty
-                if (exon_cov['cov'] == 0).all():
-                    # no coverage, generate empty plot with just
-                    # threshold line
-                    axs[count].plot(
-                        [0, 100], [threshold, threshold],
-                        color='red', linestyle='-', linewidth=2
-                    )
-                else:
-                    axs[count].plot(exon_cov["cov_start"], exon_cov["cov"])
+    def load_files(self, threshold, exon_stats,
+                   gene_stats, raw_coverage, snp_vcfs, panel):
+        """
+        Load in raw coverage data, coverage stats file and template.
 
-                    # threshold line
-                    axs[count].plot(
-                        [exon_cov["exon_start"], exon_cov["exon_end"]],
-                        [threshold, threshold], color='red', linestyle='-',
-                        linewidth=1
-                    )
+        Args:
+            - threshold (int): threshold value passed from parse_args
+            - exon_stats (file): exon stats file (from args;
+                                generated by coverage_stats_single.py)
+            - gene_stats (file): gene stats file (from args;
+                                generated by coverage_stats_single.py)
+            - raw_coverage (file): from args; bp coverage file used as
+                                input for coverage_stats_single.py
+            - snp_vcfs (list): VCFs of SNPs passed from args
+            - panel (file): panel bed file used for annotation, used to
+                            display panel name in report if passed
 
-                # add labels
-                xlab = str(
-                    exon_cov["exon_end"].iloc[0] -
-                    exon_cov["exon_start"].iloc[0]
-                ) + "\nbp"
-                axs[count].title.set_text(exon)
-                axs[count].set_xlabel(xlab)
+        Returns:
+            - cov_stats (df): df of coverage stats for each exon
+            - cov_summary (df): df of gene level coverage
+            - raw_coverage (df): raw bp coverage for each exon
+            - html_template (str): string of HTML report template
+            - flagstat (dict): flagstat metrics, from gene_stats header
+            - build (str): ref build used, from gene_stats header
+            - panel (str): panel(s) / gene(s) included in report
+            - vcfs (str): list of vcf names used for SNP analysis
+            - version (str): version of Athena, used to add to report
+        """
+        print("Reading in files")
 
-                count += 1
+        # read in single sample report template
+        bin_dir = os.path.dirname(os.path.abspath(__file__))
+        template_dir = os.path.join(bin_dir, "../data/templates/")
+        single_template = os.path.join(template_dir, "single_template.html")
 
-            # remove y ticks & label for all but first plot of lines
-            for i in range(column_no * rows):
-                if i in [x * column_no for x in range(rows)]:
-                    # first plot of line, keep ticks and labels
-                    continue
-                else:
-                    axs[i].yaxis.set_ticks_position('none')
+        with open(single_template, 'r') as template:
+            html_template = template.read()
 
-            # strip x axis ticks and labels
-            plt.setp(plt.gcf().get_axes(), xticks=[])
+        try:
+            # attempt to get version tag from root dir name
+            # will only work if downloaded as zip / tar and not cloned
+            path = str(os.path.join(bin_dir, "../")).split("/")
+            version = [s for s in path if "athena" in s][0].split("-")[1]
+            version = "({})".format(version)
+        except Exception:
+            print("Error getting version from dir name, continuing.")
+            version = ""
+            pass
 
-            # adjust yaxis limits
-            ymax = max(gene_cov["cov"].tolist()) + 10
-            plt.ylim(bottom=0, top=ymax)
+        # read bootstrap into var to store in report html
+        bs = str(os.path.join(os.path.dirname(
+            os.path.abspath(__file__)), "../data/static/css/bootstrap.min.css"
+        ))
+        with open(bs) as bs:
+            bootstrap = bs.read()
 
-            # remove outer white margins
-            fig.tight_layout(h_pad=1.2)
+        # read in exon stats file
+        with open(exon_stats.name) as exon_file:
+            dtypes = {
+                "chrom": str, "exon_start": int, "exon_end": int, "gene": str,
+                "tx": str, "exon": int, "min": int, "mean": float, "max": int,
+                r'[0-9]*x': float, "exon_len": int
+            }
 
-            # convert image to html string and append to one really long
-            # string to insert in report
-            buffer = BytesIO()
-            plt.savefig(buffer, format='png')
-            buffer.seek(0)
-            image_png = buffer.getvalue()
-            buffer.close()
-            graphic = base64.b64encode(image_png)
-            data_uri = graphic.decode('utf-8')
-            img_tag = "<img src=data:image/png;base64,{0} style='max-width:\
-                100%; max-height: auto; object-fit: contain; ' />".format(
-                data_uri
+            cov_stats = pd.read_csv(
+                exon_file, sep="\t", comment='#', dtype=dtypes
             )
 
-            all_plots = all_plots + img_tag + "<br></br>"
+            # strip chr from chrom in cases of diff. formatted bed
+            cov_stats["chrom"] = cov_stats["chrom"].apply(
+                lambda x: str(x).replace("chr", "")
+            )
 
-            plt.close(fig)
+        # read in gene stats file
+        with open(gene_stats) as gene_file:
+            dtypes = {
+                "gene": str, "tx": str, "min": int,
+                "mean": float, "max": int, r'[0-9]*x': float
+            }
 
-        return all_plots
+            cov_summary = pd.read_csv(
+                gene_file, sep="\t", comment='#', dtype=dtypes
+            )
+
+        flagstat = {}
+        # read in flagstat and build from header of gene stats file
+        with open(gene_stats) as gene_file:
+            for ln in gene_file:
+                if ln.startswith("#"):
+                    if "build" in ln:
+                        # get build number
+                        reference = ln.split(":")[1]
+                        # add build to string to display
+                        if "37" in reference:
+                            build = "<li>Reference build used for aligment<b>\
+                                {}</b></li>".format(reference)
+                        if "38" in build:
+                            build = "<li>Reference build used for aligment<b>\
+                                {}</b></li>".format(reference)
+                    else:
+                        # read in flagstat from header
+                        key = ln.split(":")[0].strip("#")
+                        val = ln.split(":")[1]
+                        flagstat[key] = val
+
+        if "build" not in locals():
+            # build no. not included in gene_stats file
+            build = ""
+
+        if panel is not None:
+            # if optional panel file given, get name and format for HTML
+            panel_name = Path(panel).stem
+
+            # format according to output of
+            # https://github.com/eastgenomics/eggd_generate_bed
+            panel_name = [x.strip("_") for x in panel_name.split("&&") if x]
+            panel_name = [
+                x.strip("_b37").strip("_b38") for x in panel_name if x
+            ]
+            panel_name = [x.replace("_", " ") for x in panel_name if x]
+            panel_name = ",&nbsp".join(panel_name)
+            panel = "<li>Panel(s) / gene(s) included in report: <b>{}</b>\
+                </li>".format(panel_name)
+        else:
+            panel = ""
+
+        column = [
+            "chrom", "exon_start", "exon_end", "gene", "tx", "exon",
+            "cov_start", "cov_end", "cov"
+        ]
+
+        dtypes = {
+            "chrom": str, "exon_start": int, "exon_end": int, "gene": str,
+            "tx": str, "exon": int, "cov_start": int, "cov_end": int,
+            "cov": int
+        }
+
+        # read in raw coverage stats file
+        with open(raw_coverage) as raw_file:
+            raw_coverage = pd.read_csv(
+                raw_file, sep="\t", names=column, dtype=dtypes
+            )
+            # strip chr from chrom in cases of diff. formatted bed
+            raw_coverage["chrom"] = raw_coverage["chrom"].apply(
+                lambda x: str(x).replace("chr", "")
+            )
+
+        if snp_vcfs:
+            # get names of SNP vcfs used to display in report
+            vcfs = ", ".join([Path(x).stem for x in snp_vcfs])
+            vcfs = "<br>VCF(s) of known variants included in report: <b>{}</b>\
+                </br>".format(vcfs)
+        else:
+            vcfs = ""
+
+        # check given threshold is in the stats files
+        if "x" not in str(threshold):
+            threshold = str(threshold) + "x"
+
+        if threshold not in list(cov_stats) and\
+                threshold not in list(cov_summary):
+            print("""--threshold must be one of the gene and exon
+                stats coverage thresholds. Exiting now.""")
+            sys.exit()
+
+        return cov_stats, cov_summary, raw_coverage, html_template, build,\
+            panel, vcfs, bootstrap, version
 
 
-    def summary_gene_plot(self, cov_summary, threshold):
+    def panel_coverage(self, cov_stats, threshold):
         """
-        Generate summary plot of all genes against threshold value
+        Calculates mean coverage of all panel regions at given threshold,
+        normalised against length of each gene
 
         Args:
-            - cov_summary (df): df of gene coverage values
-            - threshold (int): defined threshold level (default: 20)
+            - cov_stats (df): df of coverage stats for each exon
+            - threshold (int): threshold cut off for low coverage
 
         Returns:
-            - summary_plot (fig): plot of all genes
+            - panel_pct_coverage (str): % coverage of panel as str
         """
-        print("Generating summary plot")
+        print("Calculating panel average coverage")
 
+        # threshold column to check at
         threshold = str(threshold) + "x"
 
-        summary_data = cov_summary.copy()
+        gene_stats = pd.DataFrame(
+            columns=["gene", "gene_len", "coverage"])
 
-        # define colours based on values
-        summary_data["colours"] = 'green'
-        summary_data.loc[summary_data[threshold] < 100, 'colours'] = 'orange'
-        summary_data.loc[summary_data[threshold] < 90, 'colours'] = 'red'
+        # make list of genes
+        genes = sorted(list(set(cov_stats["gene"].tolist())))
 
-        summary_data = summary_data.sort_values(
-            by=[threshold], ascending=False
-        )
-        summary_plot, axs = plt.subplots(figsize=(25, 7.5))
+        for gene in genes:
+            # for each gene, calculate length and average % at threshold
+            gene_cov = cov_stats.loc[cov_stats["gene"] == gene]
 
-        if len(summary_data.index) > 100:
-            # split off some of 100% covered genes to limit size of plot
-            if len(summary_data[summary_data[threshold] < 100]) > 100:
-                # over 100 sub threshold genes, remove all 100% genes
-                genes100pct = len(summary_data[summary_data[threshold] == 100])
-                summary_data = summary_data[summary_data[threshold] < 100]
-            else:
-                # split off bottom 100 genes, plot includes some 100% covered
-                genes100pct = len(summary_data.iloc[:-100])
-                summary_data = summary_data.iloc[-100:]
+            length = sum(gene_cov["exon_len"])
+            coverage = sum(
+                gene_cov[threshold] * gene_cov["exon_len"] / length)
 
-        plt.bar(
-            summary_data["gene"], [int(x) for x in summary_data[threshold]],
-            color=summary_data.colours
-        )
+            gene_stats = gene_stats.append({
+                "gene": gene,
+                "gene_len": length,
+                "coverage": coverage
+            }, ignore_index=True)
 
-        if "genes100pct" in locals():
-            genes100pct = str(genes100pct)
-            # more than 100 genes, add title inc. 100% covered not shown
-            axs.set_title(
-                r"$\bf{" + genes100pct + "}$" + " genes covered 100% at " +
-                r"$\bf{" + threshold + "}$" +
-                " were omitted from the plot due to the panel size", loc='left'
+        # calculate % panel coverage
+        panel_coverage = sum(
+            gene_stats["coverage"] * gene_stats["gene_len"] / sum(
+                gene_stats["gene_len"]
             )
+        )
 
-        # threshold lines
-        plt.axhline(y=99, linestyle='--', color="#565656", alpha=0.6)
-        plt.axhline(y=95, linestyle='--', color="#565656", alpha=0.6)
+        # round to 12 dp to account for limit of accuracy of float &
+        # length of human genome
+        panel_coverage = round(panel_coverage, 12)
 
-        plt.text(1.005, 0.94, '99%', transform=axs.transAxes)
-        plt.text(1.005, 0.91, '95%', transform=axs.transAxes)
+        panel_pct_coverage = str(math.floor(panel_coverage * 100) / 100)
 
-        # plot formatting
-        axs.tick_params(labelsize=6, length=0)
-        plt.xticks(rotation=55, color="#565656")
+        return panel_pct_coverage
 
-        # adjust whole plot marins
-        axs.margins(x=0.01)
-        axs.autoscale_view(scaley=True)
 
-        # add legend
-        green = mpatches.Patch(color='green', label='100%')
-        orange = mpatches.Patch(color='orange', label='90-99.99%')
-        red = mpatches.Patch(color='red', label='<90%')
+    def snp_coverage(self, snp_vcfs, raw_coverage, threshold):
+        """
+        Produces tables of coverage for variants inside of capture
+        regions, and larger structural variants spanning region
+        boundaries.
 
-        plt.legend(
-            handles=[green, orange, red], loc='upper center',
-            bbox_to_anchor=(0.5, -0.1),
-            fancybox=True, shadow=True, ncol=12, fontsize=12
-        )
+        Args:
+            - snp_vcfs (list): list of vcf files used for SNP analysis
+            - raw_coverage (df): raw bp coverage for each exon
+            - threshold (int): threshold value passed from parse args
 
-        vals = np.arange(0, 110, 10).tolist()
-        plt.yticks(vals, vals)
-        axs.tick_params(axis='both', which='major', labelsize=8)
+        Returns:
+            - snps_low_cov (df): variants with lower coverage than threshold
+            - snps_high_cov (df): variants with higher coverage than threshold
+            - snps_no_cov (df): variants that span exon boundaries (i.e SVs)
+        """
+        print("Calculating coverage of given SNPs")
 
-        plt.xlabel("")
-        plt.ylabel("% coverage ({})".format(threshold), fontsize=11)
+        bedFile = raw_coverage[
+            ["chrom", "exon_start", "exon_end"]].drop_duplicates()
+        coverageFile = raw_coverage[
+            ["chrom", "cov_start", "cov_end", "cov"]].drop_duplicates()
 
-        axs.yaxis.grid(linewidth=0.5, color="grey", linestyle="-.")
-        plt.box(False)
-        axs.set_axisbelow(True)
-        plt.tight_layout()
+        # turn dfs into BedTools objects
+        bed = bedtools.BedTool.from_dataframe(bedFile)
+        cov = bedtools.BedTool.from_dataframe(coverageFile)
 
-        # convert image to html string to insert in report
-        buffer = BytesIO()
-        plt.savefig(buffer, format='png')
-        buffer.seek(0)
-        image_png = buffer.getvalue()
-        buffer.close()
-        graphic = base64.b64encode(image_png)
-        data_uri = graphic.decode('utf-8')
-        summary_plot = "<img src=data:image/png;base64,{0} style='max-width:\
-            100%; max-height: auto; object-fit: contain; ' />".format(
-            data_uri
-        )
+        # empty df to add all SNP info to
+        snp_df = pd.DataFrame(columns=[
+            'VCF', 'chrom', 'pos', 'id', 'ref', 'alt', 'info'
+        ])
 
-        return summary_plot
+        for vcf in snp_vcfs:
+            # read vcf into BedTools object
+            v = bedtools.BedTool(vcf)
 
+            # get vcf name to add to table, req. for multiple VCFS and
+            # recording variant source VCF
+            name = Path(vcf).stem.split("_")[0]
 
-    def writeSummary(self, cov_summary, threshold, panel_pct_coverage):
-        """
-        Write summary paragraph with sequencing details and list of
-        genes / transcripts used in panel.
+            # use bedtools intersect to get SNPs in capture region
+            snps = bed.intersect(v, wb=True)
 
-        Args:
-            - cov_summary (df): df of gene coverage values
-            - threshold (int): defined threshold level (default: 20)
-            - panel_pct_coverage (str): % coverage of panel as str
-        Returns:
-            - summary_text (str): summary text with req. HTML markup
-        """
-        threshold = str(threshold) + "x"
+            for row in snps:
+                # get data from returned BedTools object, add to df
+                snp_data = str(row).split()
+                snp_df = snp_df.append({
+                    'VCF': name, 'chrom': snp_data[3],
+                    'pos': snp_data[4], 'ref': snp_data[6],
+                    'alt': snp_data[7], 'info': snp_data[10]
+                }, ignore_index=True)
 
-        pct_cov = str(math.floor(float(panel_pct_coverage)))
+        snp_df = snp_df[
+            ['VCF', 'chrom', 'pos', 'ref', 'alt', 'info']].drop_duplicates()
 
-        # summary text paragraph with div styling
-        summary_text = """
-        <li>Clinical report summary:</li>
-        <div style="background-color:aliceblue; margin-top: 15px;
-        border-radius: 15px; padding-left:25px;">
-        <div id="summary_text" style="font-size: 14px;
-        padding-bottom: 15px; padding-top:10px">"""
+        # reset index
+        raw_coverage = raw_coverage.reset_index(drop=True)
 
-        for i, gene in cov_summary.iterrows():
-            # build string of each gene, trascript and coverage at
-            # threshold to display in summary
-            summary = "{} ({}); ".format(gene["gene"], gene["tx"])
-            summary_text += summary
+        # use pandasql to intersect SNPs against coverage df to find the
+        # coverage at each SNP position
+        coverage_sql = """
+            SELECT snp_df.VCF, snp_df.chrom, snp_df.pos, snp_df.ref,
+            snp_df.alt, snp_df.info, raw_coverage.gene, raw_coverage.exon,
+            raw_coverage.cov_start, raw_coverage.cov_end, raw_coverage.cov
+            FROM snp_df
+            LEFT JOIN raw_coverage on snp_df.CHROM=raw_coverage.chrom
+            WHERE snp_df.POS > raw_coverage.cov_start AND
+            snp_df.POS <= raw_coverage.cov_end
+            """
 
-        summary_text = summary_text.strip(" ;") + "."
-        summary_text += """
-            <br></br>{} % of this panel was sequenced to a depth of {} or
-            greater.<br>""".format(pct_cov, threshold)
+        snp_cov = pdsql.sqldf(coverage_sql, locals())
 
-        # add closing div and copy button for summary text
-        summary_text += """</div><div style="padding-bottom:15px;">
-        <button class="btn-info btn-sm summarybtn" onclick=
-        "CopyToClipboard('summary_text')";return false; style="font-size: 14px;
-        padding:5px 10px; border-radius: 10px;">Copy summary text
-        </button></div></div>"""
+        # get SNPs that won't have coverage data but do intersect panel
+        # regions (i.e. large deletions that span a region)
+        snps_no_cov = snp_df.merge(snp_cov, how='outer', indicator=True).loc[
+            lambda x: x['_merge'] == 'left_only']
 
-        return summary_text
+        snps_no_cov = snps_no_cov[[
+            "VCF", "chrom", "pos", "ref", "alt", "info"
+        ]].reset_index(drop=True)
 
+        # get required columns for SNP tables
+        snps_cov = snp_cov[
+            ["VCF", "gene", "exon", "chrom", "pos", "ref", "alt", "cov"]
+        ].drop_duplicates(subset=[
+            "VCF", "chrom", "pos", "ref", "alt"]).reset_index(drop=True)
 
-    def generate_report(self, cov_stats, cov_summary, snps_low_cov,
-                        snps_high_cov, snps_no_cov, fig, all_plots,
-                        summary_plot, html_template, args, build, panel, vcfs,
-                        panel_pct_coverage, bootstrap, version, summary_text
-                        ):
+        # rename columns for displaying in report
+        snps_cov.columns = ["VCF", "Gene", "Exon", "Chromosome", "Position",
+                            "Ref", "Alt", "Coverage"]
+
+        snps_no_cov.columns = [
+            "VCF", "Chromosome", "Position", "Ref", "Alt", "Info"
+        ]
+
+        # remove <> from DELs to stop being interpreted as HTML tags
+        snps_no_cov["Alt"] = snps_no_cov["Alt"].str.strip("<>")
+
+        snps_cov["Coverage"] = snps_cov["Coverage"].astype(int)
+
+        # sort no_cov table by chrom & pos; chrom is str, so first define
+        # the chromosome order to sort by
+        order = [str(x) for x in range(0, 23)]
+        order.extend(["X", "Y", "MT"])
+        snps_no_cov["Chromosome"] = pd.Categorical(
+            snps_no_cov["Chromosome"], order
+        )
+
+        snps_cov = snps_cov.sort_values(by=["Gene", "Exon", "Position"])
+        snps_no_cov = snps_no_cov.sort_values(by=["Chromosome", "Position"])
+
+        # split SNPs by coverage against threshold
+        snps_low_cov = snps_cov.loc[snps_cov["Coverage"] < threshold]
+        snps_high_cov = snps_cov.loc[snps_cov["Coverage"] >= threshold]
+
+        return snps_low_cov, snps_high_cov, snps_no_cov
+
+
+    def low_coverage_regions(self, cov_stats, raw_coverage, threshold):
         """
-        Generate single sample report from coverage stats
+        Get regions where coverage at given threshold is <100%
 
         Args:
             - cov_stats (df): df of coverage stats for each exon
-            - cov_summary (df): df of gene level coverage
-            - snps_low_cov (df): SNPs with lower coverage than threshold
-            - snps_high_cov (df): SNPs with higher coverage than threshold
-            - snps_no_cov (df): variants that span exon boundaries (i.e SVs)
-            - fig (figure): plots of low coverage regions
-            - all-plots (figure): grid of all full gene- exon plots
-            - summary_plot (figure): gene summary plot - % at threshold
-            - html_template (str): string of HTML template
-            - args (args): passed cmd line arguments
-            - build (str): build number used for alignment
-            - panel (str): panes(s) / gene(s) included in report
-            - vcfs (str): vcfs(s) passed for SNP analysis
-            - panel_pct_coverage (str): total % coverage of panel
-            - bootstrap (str): bootstrap to store directly in html
-            - version (str): version of Athena, used to add to report
-
-        Returns: None
+            - raw_coverage (df): raw bp coverage for each exon
+            - threshold (int): defined threshold level (default: 20)
 
-        Outputs:
-            - coverage_report.html (file): HTML coverage report
+        Returns:
+            - low_raw_cov (df): df of raw bp values for each region with
+                                coverage less than 100% at threshold
         """
-        print("Generating report")
-
-        # str of threshold for selecting df columns etc.
-        threshold = str(args.threshold) + "x"
+        # threshold column to check at
+        threshold = str(threshold) + "x"
 
         # get threshold columns and add to column names
         threshold_cols = list(cov_stats.filter(regex='[0-9]+x', axis=1))
 
         column = [
-            "gene", "tx", "chrom", "exon", "exon_len", "exon_start",
-            "exon_end", "min", "mean", "max"
+            "gene", "tx", "chrom", "exon", "exon_start", "exon_end",
+            "min", "mean", "max"
         ]
 
         column.extend(threshold_cols)
 
-        sub_threshold = pd.DataFrame(columns=column)
+        # empty df
+        low_stats = pd.DataFrame(columns=column)
 
-        # get all exons with <100% coverage at threshold
+        # get all exons with <100% coverage at given threshold
         for i, row in cov_stats.iterrows():
             if int(row[threshold]) < 100:
-                sub_threshold = sub_threshold.append(row, ignore_index=True)
+                low_stats = low_stats.append(row, ignore_index=True)
 
         # pandas is terrible and forces floats, change back to int
         dtypes = {
             'chrom': str,
             'exon': int,
-            'exon_len': int,
             'exon_start': int,
             'exon_end': int,
             'min': int,
             'max': int
         }
 
-        vals = ["min", "mean", "max"]
-        vals.extend(threshold_cols)
-
-        if not sub_threshold.empty:
-            # some low covered regions identified
-            sub_threshold = sub_threshold.astype(dtypes)
-
-            sub_threshold_stats = pd.pivot_table(sub_threshold, index=[
-                "gene", "tx", "chrom", "exon",
-                "exon_len", "exon_start", "exon_end"
-            ], values=vals)
-
-            # reset index to fix formatting
-            sub_threshold_stats = sub_threshold_stats.reindex(vals, axis=1)
-            sub_threshold_stats.reset_index(inplace=True)
-
-            gene_issues = len(list(set(sub_threshold_stats["gene"].tolist())))
-            exon_issues = len(sub_threshold_stats["exon"])
-        else:
-            # if no low regions set to empty df with appropriate columns
-            print("No low coverage regions, generating empty table")
-            sub_threshold_stats = pd.DataFrame(columns=column)
-            gene_issues = 0
-            exon_issues = 0
-
-
-        # do some excel level formatting to make table more readable
-        total_stats = pd.pivot_table(
-            cov_stats,
-            index=["gene", "tx", "chrom", "exon", "exon_len",
-                   "exon_start", "exon_end"],
-            values=vals
-        )
-
-        # reset index to fix formatting
-        total_stats = total_stats.reindex(vals, axis=1)
-        total_stats.reset_index(inplace=True)
-
-        all_dfs = [
-            total_stats, cov_summary, sub_threshold_stats,
-            snps_low_cov, snps_high_cov
-        ]
-
-        # set index to start at 1 to be more readable
-        for df in all_dfs:
-            if not df.empty:
-                df.index = np.arange(1, len(df) + 1)
-
-        # rename columns for displaying in report
-        cov_summary = cov_summary.drop(columns=["exon"])
-        cov_summary = cov_summary.rename(columns={
-            "gene": "Gene",
-            "tx": "Transcript",
-            "min": "Min",
-            "mean": "Mean",
-            "max": "Max"
-        })
-
-        total_stats = total_stats.rename(columns={
-            "gene": "Gene",
-            "tx": "Transcript",
-            "chrom": "Chr",
-            "exon": "Exon",
-            "exon_len": "Length",
-            "exon_start": "Start",
-            "exon_end": "End",
-            "min": "Min",
-            "mean": "Mean",
-            "max": "Max"
-        })
-
-        # rename columns to display properly
-        sub_threshold_stats = sub_threshold_stats.rename(columns={
-            "gene": "Gene",
-            "tx": "Transcript",
-            "chrom": "Chr",
-            "exon": "Exon",
-            "exon_len": "Length",
-            "exon_start": "Start",
-            "exon_end": "End",
-            "min": "Min",
-            "mean": "Mean",
-            "max": "Max"
-        })
-
-        # limit to 2dp using math.floor, use of round() with
-        # 2dp may lead to inaccuracy such as 99.99 => 100.00
-        round_cols = ['Mean'] + threshold_cols
-
-        for col in round_cols:
-            cov_summary[col] = cov_summary[col].map(
-                lambda col: math.floor(col * 100) / 100
-            )
-
-            total_stats[col] = total_stats[col].map(
-                lambda col: math.floor(col * 100) / 100
-            )
-
-        # get values to display in report
-        total_genes = len(cov_summary["Gene"])
-        fully_covered_genes = total_genes - gene_issues
-
-        # empty dict to add values for displaying in report text
-        report_vals = {}
-
-        report_vals["summary_text"] = summary_text
-        report_vals["name"] = str(args.sample_name).replace("_", " ")
-        report_vals["total_genes"] = str(total_genes)
-        report_vals["fully_covered_genes"] = str(fully_covered_genes)
-        report_vals["gene_issues"] = str(gene_issues)
-        report_vals["threshold"] = threshold
-        report_vals["exon_issues"] = str(exon_issues)
-        report_vals["build"] = build
-        report_vals["panel"] = panel
-        report_vals["vcfs"] = vcfs
-        report_vals["version"] = version
-        report_vals["panel_pct_coverage"] = panel_pct_coverage
-
-        # creat slices of sub_threshold stats df to add styling to
-        slice_ranges = {
-            "x0": (10, 0), "x10": (30, 10), "x30": (50, 30), "x50": (70, 50),
-            "x70": (90, 70), "x90": (95, 90), "x95": (99, 95), "x99": (101, 99)
-        }
-
-        sub_slice = {}
-
-        for key, val in slice_ranges.items():
-            sub_slice[key] = pd.IndexSlice[sub_threshold_stats.loc[(
-                sub_threshold_stats[threshold] < val[0]
-            ) & (
-                sub_threshold_stats[threshold] >= val[1])].index, threshold]
-
-        # df column index of threshold
-        col_idx = sub_threshold_stats.columns.get_loc(threshold)
-
-        # make dict for rounding coverage columns to 2dp
-        rnd = {}
-        for col in list(sub_threshold_stats.columns[10:15]):
-            rnd[col] = '{0:.2f}%'
-
-        # set threshold column widths as a fraction of 30% table width
-        t_width = str(30 / len(threshold_cols)) + "%"
-
-        # apply colours to coverage cell based on value, 0 is given solid red
-        s = sub_threshold_stats.style.apply(lambda x: [
-            "background-color: #b30000" if x[threshold] == 0 and idx == col_idx
-            else "" for idx, v in enumerate(x)
-        ], axis=1)\
-            .bar(subset=sub_slice["x0"], color='#b30000', vmin=0, vmax=100)\
-            .bar(subset=sub_slice["x10"], color='#990000', vmin=0, vmax=100)\
-            .bar(subset=sub_slice["x30"], color='#C82538', vmin=0, vmax=100)\
-            .bar(subset=sub_slice["x50"], color='#FF4500', vmin=0, vmax=100)\
-            .bar(subset=sub_slice["x70"], color='#FF4500', vmin=0, vmax=100)\
-            .bar(subset=sub_slice["x90"], color='#FF4500', vmin=0, vmax=100)\
-            .bar(subset=sub_slice["x95"], color='#FFBF00', vmin=0, vmax=100)\
-            .bar(subset=sub_slice["x99"], color='#007600', vmin=0, vmax=100)\
-            .format(rnd)\
-            .set_table_attributes('table border="1"\
-                class="dataframe table table-hover table-bordered"')\
-            .set_uuid("low_exon_table")\
-            .set_properties(**{'font-size': '0.85vw', 'table-layout': 'auto'})\
-            .set_properties(subset=threshold_cols, **{'width': t_width})\
-
-        sub_threshold_stats["Mean"] = sub_threshold_stats["Mean"].apply(
-            lambda x: int(x)
-        )
+        low_stats = low_stats.astype(dtypes)
 
-        # CSS table class for styling tables
-        style = (
-            '<table border="1" class="dataframe">',
-            '<table class="table table-striped" style="font-size: 0.85vw;" >'
-        )
+        # get list of tuples of genes and exons with low coverage to
+        # select out raw coverage
+        low_exon_list = low_stats.reset_index()[['gene',
+                                                'exon']].values.tolist()
+        low_exon_list = [tuple(exon) for exon in low_exon_list]
 
-        # generate HTML strings from table objects to write to report
-        gene_stats = cov_summary.to_html(justify='left').replace(
-            style[0], style[1]
-        )
-        total_stats = total_stats.to_html(justify='left').replace(
-            style[0], style[1]
-        )
+        # get raw coverage for low coverage regions to plot
+        low_raw_cov = raw_coverage[raw_coverage[['gene', 'exon']].apply(
+            tuple, axis=1).isin(low_exon_list)].reset_index()
 
-        sub_threshold_stats = s.render()
+        return low_raw_cov
 
-        # get snps values and format dfs to display
-        if not snps_low_cov.empty:
-            # format low coverage SNPs table
-            snps_not_covered = len(snps_low_cov.index)
-            snps_low_cov = snps_low_cov.style\
-                .set_table_attributes(
-                    'class="dataframe table table-striped"')\
-                .set_uuid("var_low_cov")\
-                .set_properties(**{
-                    'font-size': '0.80vw', 'table-layout': 'auto'
-                })\
-                .set_properties(subset=["VCF", "Gene"], **{'width': '10%'})\
-                .set_properties(subset=["Exon"], **{'width': '7.5%'})\
-                .set_properties(subset=["Chromosome"], **{'width': '10%'})\
-                .set_properties(subset=["Position"], **{'width': '12.5%'})\
-                .set_properties(subset=["Ref"], **{'width': '20%'})\
-                .set_properties(subset=["Alt"], **{'width': '20%'})\
-                .set_properties(subset=["Coverage"], **{'width': '10%'})
 
-            snps_low_cov = snps_low_cov.render()
-        else:
-            snps_low_cov = "<b>No low covered SNPs</b>"
-            snps_not_covered = 0
+    def write_summary(self, cov_summary, threshold, panel_pct_coverage):
+        """
+        Write summary paragraph with sequencing details and list of
+        genes / transcripts used in panel.
 
-        if not snps_high_cov.empty:
-            # format high coverage SNPs table
-            snps_covered = len(snps_high_cov.index)
+        Args:
+            - cov_summary (df): df of gene coverage values
+            - threshold (int): defined threshold level (default: 20)
+            - panel_pct_coverage (str): % coverage of panel as str
+        Returns:
+            - summary_text (str): summary text with req. HTML markup
+        """
+        threshold = str(threshold) + "x"
 
-            snps_high_cov = snps_high_cov.style\
-                .set_table_attributes(
-                    'class="dataframe table table-striped"')\
-                .set_uuid("var_high_cov")\
-                .set_properties(**{
-                    'font-size': '0.80vw', 'table-layout': 'auto'
-                })\
-                .set_properties(subset=["VCF", "Gene"], **{'width': '10%'})\
-                .set_properties(subset=["Exon"], **{'width': '7.5%'})\
-                .set_properties(subset=["Chromosome"], **{'width': '10%'})\
-                .set_properties(subset=["Position"], **{'width': '12.5%'})\
-                .set_properties(subset=["Ref"], **{'width': '20%'})\
-                .set_properties(subset=["Alt"], **{'width': '20%'})\
-                .set_properties(subset=["Coverage"], **{'width': '10%'})
+        pct_cov = str(math.floor(float(panel_pct_coverage)))
 
-            snps_high_cov = snps_high_cov.render()
-        else:
-            snps_high_cov = "<b>No covered SNPs</b>"
-            snps_covered = 0
+        # summary text paragraph with div styling
+        summary_text = """
+        <li>Clinical report summary:</li>
+        <div style="background-color:aliceblue; margin-top: 15px;
+        border-radius: 15px; padding-left:25px;">
+        <div id="summary_text" style="font-size: 14px;
+        padding-bottom: 15px; padding-top:10px">"""
 
-        # if variants from vcf found that span exon boundaries
-        if not snps_no_cov.empty:
-            # manually add div and styling around rendered table, allows
-            # to be fully absent from the report if the table is empty
-            snps_no_cov.index = np.arange(1, len(snps_no_cov) + 1)
+        for i, gene in cov_summary.iterrows():
+            # build string of each gene, transcript and coverage at
+            # threshold to display in summary
+            summary = "{} ({}); ".format(gene["gene"], gene["tx"])
+            summary_text += summary
 
-            # get number of variants to display in report
-            snps_out_panel = len(snps_no_cov.index)
+        summary_text = summary_text.strip(" ;") + "."
+        summary_text += """
+            <br></br>{} % of this panel was sequenced to a depth of {} or
+            greater.<br>""".format(pct_cov, threshold)
 
-            html_string = snps_no_cov.style\
-                .set_table_attributes(
-                    'class="dataframe table table-striped"')\
-                .set_uuid("var_no_cov")\
-                .set_properties(**{
-                    'font-size': '0.80vw', 'table-layout': 'auto'
-                })\
-                .set_properties(subset=["VCF"], **{
-                    'width': '7.5%'
-                })\
-                .set_properties(subset=[
-                    "Chromosome", "Position", "Ref", "Alt"
-                ], **{'width': '10%'})
+        # add closing div and copy button for summary text
+        summary_text += """</div><div style="padding-bottom:15px;">
+        <button class="btn-info btn-sm summarybtn" onclick=
+        "CopyToClipboard('summary_text')";return false; style="font-size: 14px;
+        padding:5px 10px; border-radius: 10px;">Copy summary text
+        </button></div></div>"""
 
-            html_string = html_string.render()
+        return summary_text
 
-            snps_no_cov = """
-                <br> Variants included in the first table below either fully\
-                    or partially span panel region(s). These are most likely\
-                    large structural variants and as such do not have\
-                    coverage data available. See the "info" column for details\
-                    on the variant.
-                </br>
-                <br> Table of variants spanning panel regions(s) &nbsp
-                <button class="btn btn-info collapsible btn-sm">Show /\
-                     hide table</button>
-                <div class="content">
-                    <table>
-                        {}
-                    </table>
-                </div></br>
-                """.format(html_string)
-        else:
-            snps_no_cov = ""
-            snps_out_panel = 0
-            snps_pct_out_panel = 0
 
+    def calculate_snp_vals(
+            self, snps_covered, snps_not_covered, snps_out_panel):
+        """
+        """
         total_snps = str(snps_covered + snps_not_covered + snps_out_panel)
 
         # calculate % SNPs covered vs. not, limit to 2dp with math.floor
@@ -1331,7 +1273,99 @@ def generate_report(self, cov_stats, cov_summary, snps_low_cov,
             snps_pct_out_panel = int(
                 snps_out_panel) / int(total_snps) * 100
             snps_pct_out_panel = math.floor(snps_pct_out_panel * 100) / 100
+        else:
+            snps_pct_out_panel = 0
+
+        return total_snps, snps_pct_covered, snps_pct_not_covered,\
+            snps_pct_out_panel
+
+
+    def generate_report(self, cov_stats, cov_summary, snps_low_cov,
+                        snps_high_cov, snps_no_cov, fig, all_plots,
+                        summary_plot, html_template, args, build, panel, vcfs,
+                        panel_pct_coverage, bootstrap, version, summary_text
+                        ):
+        """
+        Generate single sample report from coverage stats
+
+        Args:
+            - cov_stats (df): df of coverage stats for each exon
+            - cov_summary (df): df of gene level coverage
+            - snps_low_cov (df): SNPs with lower coverage than threshold
+            - snps_high_cov (df): SNPs with higher coverage than threshold
+            - snps_no_cov (df): variants that span exon boundaries (i.e SVs)
+            - fig (figure): plots of low coverage regions
+            - all_plots (figure): grid of all full gene-exon plots
+            - summary_plot (figure): gene summary plot - % at threshold
+            - html_template (str): string of HTML template
+            - args (args): passed cmd line arguments
+            - build (str): build number used for alignment
+            - panel (str): panel(s) / gene(s) included in report
+            - vcfs (str): vcfs(s) passed for SNP analysis
+            - panel_pct_coverage (str): total % coverage of panel
+            - bootstrap (str): bootstrap to store directly in html
+            - version (str): version of Athena, used to add to report
+
+        Returns: None
+
+        Outputs:
+            - coverage_report.html (file): HTML coverage report
+        """
+        print("Generating report")
+        styling = styleTables()
+
+        # format threshold val & select threshold columns
+        threshold = str(args.threshold) + "x"
+        threshold_cols = list(cov_stats.filter(regex='[0-9]+x', axis=1))
+        vals = ["min", "mean", "max"]
+        vals.extend(threshold_cols)
 
+        # apply styling to tables for displaying in report
+        sub_threshold_stats, gene_issues, exon_issues = styling.style_sub_threshold(
+            cov_stats, threshold, threshold_cols, vals
+        )
+
+        total_stats = styling.style_total_stats(
+            cov_stats, threshold_cols, vals
+        )
+
+        gene_stats, total_genes = styling.style_cov_summary(
+            cov_summary, threshold_cols
+        )
+
+        snps_low_cov, snps_not_covered = styling.style_snps_low_cov(
+            snps_low_cov
+        )
+
+        snps_high_cov, snps_covered = styling.style_snps_high_cov(
+            snps_high_cov
+        )
+
+        snps_no_cov, snps_out_panel = styling.style_snps_no_cov(snps_no_cov)
+
+        # get values to display in report
+        fully_covered_genes = total_genes - gene_issues
+
+        total_snps, snps_pct_covered, snps_pct_not_covered,\
+            snps_pct_out_panel = self.calculate_snp_vals(
+                snps_covered, snps_not_covered, snps_out_panel
+            )
+
+        # empty dict to add values for displaying in report text
+        report_vals = {}
+
+        report_vals["summary_text"] = summary_text
+        report_vals["name"] = str(args.sample_name).replace("_", " ")
+        report_vals["total_genes"] = str(total_genes)
+        report_vals["fully_covered_genes"] = str(fully_covered_genes)
+        report_vals["gene_issues"] = str(gene_issues)
+        report_vals["threshold"] = threshold
+        report_vals["exon_issues"] = str(exon_issues)
+        report_vals["build"] = build
+        report_vals["panel"] = panel
+        report_vals["vcfs"] = vcfs
+        report_vals["version"] = version
+        report_vals["panel_pct_coverage"] = panel_pct_coverage
         report_vals["total_snps"] = total_snps
         report_vals["snps_covered"] = str(snps_covered)
         report_vals["snps_not_covered"] = str(snps_not_covered)
@@ -1357,102 +1391,176 @@ def generate_report(self, cov_stats, cov_summary, snps_low_cov,
         file.close()
 
 
-    def parse_args(self):
+    def build_report(self, html_template, total_stats, gene_stats,
+                     sub_threshold_stats, snps_low_cov, snps_high_cov,
+                     snps_no_cov, fig, all_plots, summary_plot, report_vals,
+                     bootstrap
+                     ):
         """
-        Parse cmd line arguments
-
-        Args: None
+        Build report from template and variables to write to file
 
+        Args:
+            - html_template (str): string of HTML template file
+            - total_stats (df): total stats table of all genes & exons
+            - gene_stats (df): stats table of whole gene
+            - sub_threshold_stats (df): table of exons with < threshold
+            - snps_low_cov (df): table of snps with cov < threshold
+            - snps_high_cov (df): table of snps with cov > threshold
+            - snps_no_cov (df): variants that span exon boundaries (i.e SVs)
+            - fig (figure): grid of low coverage exon plots (plotly)
+            - all_plots (figure): grid of all full gene-exon plots
+            - summary_plot (figure): gene summary plot - % at threshold
+            - report_vals (dict): values to display in report text
         Returns:
-            - args (arguments): args passed from cmd line
+            - single_report (str): HTML string of filled report
         """
+        # convert logo image into string to pass in to template
+        logo = str(os.path.join(os.path.dirname(
+            os.path.abspath(__file__)), "../data/static/images/logo.png"
+        ))
 
-        parser = argparse.ArgumentParser(
-            description='Generate coverage report for a single sample.'
-        )
-        parser.add_argument(
-            '-e', '--exon_stats',
-            help='exon stats file (from coverage_stats_single.py)',
-            type=argparse.FileType('r'), required=True
-        )
-        parser.add_argument(
-            '-g', '--gene_stats',
-            help='gene stats file (from coverage_stats_single.py)',
-            required=True
-        )
-        parser.add_argument(
-            '-r', '--raw_coverage',
-            help='raw coverage bed file used to generate stats',
-            required=True
-        )
-        parser.add_argument(
-            '-s', '--snps', nargs='*',
-            help='Optional; check coverage of VCF(s) of SNPs.',
-            required=False
-        )
-        parser.add_argument(
-            '-t', '--threshold', nargs='?',
-            default=20, type=int,
-            help="threshold to define low coverage (int), if not\
-                given 20 will be used as default. Must be one of\
-                the thresholds in the input file.",
-            required=False
-        )
-        parser.add_argument(
-            '-n', '--sample_name', nargs='?',
-            help="Name of sample to display in report, if not\
-                specified this will be the prefix of the\
-                gene_stats input file.",
-            required=False
-        )
-        parser.add_argument(
-            '-o', '--output', nargs='?',
-            help='Output report name, if not specified the sample\
-            name from the report will be used.',
-            required=False
-        )
-        parser.add_argument(
-            '-p', '--panel', nargs='?',
-            help='(Optional) Panel bed file used from annotation, if passed\
-            name of file will be displayed in report to show what\
-            panel(s) / gene(s) were included.',
-            required=False
-        )
-        parser.add_argument(
-            '-l', '--limit', nargs='?',
-            help="Number of genes at which to limit including full gene plots,\
-            large numbers of genes takes a long time to generate the plots.",
-            default=-1,
-            required=False
-        )
-        parser.add_argument(
-            '-m', '--summary',
-            help="If passed, a short paragraph will be included in the\
-            summary section. This includes details on the sequencing and the\
-            genes/transcripts used in the panel.",
-            default=False, action='store_true'
-        )
-        parser.add_argument(
-            '--cores', nargs='?', default=None,
-            help='Number of cores to utilise, for larger numbers of genes this\
-            will drastically reduce run time. If not given will use maximum\
-            available'
+        data_uri = base64.b64encode(open(logo, 'rb').read()).decode('utf-8')
+        logo = '<img height="25" width="22" src=data:image/png;base64,{0}\
+            alt="" style="vertical-align:middle; padding-bottom:3px">'.format(
+            data_uri)
+
+        t = Template(html_template)
+
+        date = datetime.today().strftime('%Y-%m-%d')
+
+        single_report = t.safe_substitute(
+            bootstrap=bootstrap,
+            logo=logo,
+            total_genes=report_vals["total_genes"],
+            threshold=report_vals["threshold"],
+            summary_text=report_vals["summary_text"],
+            exon_issues=report_vals["exon_issues"],
+            gene_issues=report_vals["gene_issues"],
+            fully_covered_genes=report_vals["fully_covered_genes"],
+            name=report_vals["name"],
+            sub_threshold_stats=sub_threshold_stats,
+            low_cov_plots=fig,
+            all_plots=all_plots,
+            summary_plot=summary_plot,
+            gene_stats=gene_stats,
+            total_stats=total_stats,
+            snps_high_cov=snps_high_cov,
+            snps_low_cov=snps_low_cov,
+            snps_no_cov=snps_no_cov,
+            total_snps=report_vals["total_snps"],
+            snps_covered=report_vals["snps_covered"],
+            snps_pct_covered=report_vals["snps_pct_covered"],
+            snps_not_covered=report_vals["snps_not_covered"],
+            snps_pct_not_covered=report_vals["snps_pct_not_covered"],
+            snps_out_panel=report_vals["snps_out_panel"],
+            snps_pct_out_panel=report_vals["snps_pct_out_panel"],
+            date=date,
+            build=report_vals["build"],
+            vcfs=report_vals["vcfs"],
+            panel=report_vals["panel"],
+            panel_pct_coverage=report_vals["panel_pct_coverage"],
+            version=report_vals["version"]
         )
 
-        args = parser.parse_args()
+        return single_report
+
+
+def parse_args():
+    """
+    Parse cmd line arguments
+
+    Args: None
+
+    Returns:
+        - args (arguments): args passed from cmd line
+    """
+
+    parser = argparse.ArgumentParser(
+        description='Generate coverage report for a single sample.'
+    )
+    parser.add_argument(
+        '-e', '--exon_stats',
+        help='exon stats file (from coverage_stats_single.py)',
+        type=argparse.FileType('r'), required=True
+    )
+    parser.add_argument(
+        '-g', '--gene_stats',
+        help='gene stats file (from coverage_stats_single.py)',
+        required=True
+    )
+    parser.add_argument(
+        '-r', '--raw_coverage',
+        help='raw coverage bed file used to generate stats',
+        required=True
+    )
+    parser.add_argument(
+        '-s', '--snps', nargs='*',
+        help='Optional; check coverage of VCF(s) of SNPs.',
+        required=False
+    )
+    parser.add_argument(
+        '-t', '--threshold', nargs='?',
+        default=20, type=int,
+        help="threshold to define low coverage (int), if not\
+            given 20 will be used as default. Must be one of\
+            the thresholds in the input file.",
+        required=False
+    )
+    parser.add_argument(
+        '-n', '--sample_name', nargs='?',
+        help="Name of sample to display in report, if not\
+            specified this will be the prefix of the\
+            gene_stats input file.",
+        required=False
+    )
+    parser.add_argument(
+        '-o', '--output', nargs='?',
+        help='Output report name, if not specified the sample\
+        name from the report will be used.',
+        required=False
+    )
+    parser.add_argument(
+        '-p', '--panel', nargs='?',
+        help='(Optional) Panel bed file used from annotation, if passed\
+        name of file will be displayed in report to show what\
+        panel(s) / gene(s) were included.',
+        required=False
+    )
+    parser.add_argument(
+        '-l', '--limit', nargs='?',
+        help="Number of genes at which to limit including full gene plots,\
+        large numbers of genes takes a long time to generate the plots.",
+        default=-1,
+        required=False
+    )
+    parser.add_argument(
+        '-m', '--summary',
+        help="If passed, a short paragraph will be included in the\
+        summary section. This includes details on the sequencing and the\
+        genes/transcripts used in the panel.",
+        default=False, action='store_true'
+    )
+    parser.add_argument(
+        '--cores', nargs='?', default=None,
+        help='Number of cores to utilise, for larger numbers of genes this\
+        will drastically reduce run time. If not given will use maximum\
+        available'
+    )
+
+    args = parser.parse_args()
 
-        if not args.sample_name:
-            # sample name not given, use input file name
-            args.sample_name = Path(args.gene_stats).stem
-            if "_" in args.sample_name:
-                # if named X1000_ take prefix
-                args.sample_name = args.sample_name.split("_", 1)[0]
+    if not args.sample_name:
+        # sample name not given, use input file name
+        args.sample_name = Path(args.gene_stats).stem
+        if "_" in args.sample_name:
+            # if named X1000_ take prefix
+            args.sample_name = args.sample_name.split("_", 1)[0]
 
-        if not args.output:
-            # output file name not given, using sample name
-            args.output = args.sample_name + "_coverage_report.html"
+    if not args.output:
+        # output file name not given, using sample name
+        args.output = args.sample_name + "_coverage_report.html"
 
-        return args
+    return args
 
 
 def main():
@@ -1460,8 +1568,9 @@ def main():
     Main function to generate coverage report
     """
     report = singleReport()
+    plots = generatePlots()
 
-    args = report.parse_args()
+    args = parse_args()
 
     # read in files
     cov_stats, cov_summary, raw_coverage, html_template, build, panel,\
@@ -1503,7 +1612,7 @@ def main():
     panel_pct_coverage = report.panel_coverage(cov_stats, args.threshold)
 
     # generate summary plot
-    summary_plot = report.summary_gene_plot(
+    summary_plot = plots.summary_gene_plot(
         cov_summary, args.threshold
     )
 
@@ -1513,7 +1622,7 @@ def main():
     )
 
     # generate plot of sub optimal regions
-    fig = report.low_exon_plot(low_raw_cov, args.threshold)
+    fig = plots.low_exon_plot(low_raw_cov, args.threshold)
 
     if len(cov_summary.index) < int(args.limit) or int(args.limit) == -1:
         # generate plots of each full gene
@@ -1543,7 +1652,7 @@ def main():
 
             all_plots = ''.join(
                 pool.starmap(
-                    report.all_gene_plots, map(
+                    plots.all_gene_plots, map(
                         lambda e: (e, args.threshold), split_dfs
                     )
                 )
@@ -1554,7 +1663,7 @@ def main():
 
     if args.summary:
         # summary text to be included
-        summary_text = report.writeSummary(
+        summary_text = report.write_summary(
             cov_summary, args.threshold, panel_pct_coverage
         )
     else:

From 6f28499c33a9470dd9d235631012f6853e893fc7 Mon Sep 17 00:00:00 2001
From: jethror1 <rainford1995@gmail.com>
Date: Fri, 4 Dec 2020 11:38:34 +0000
Subject: [PATCH 3/4] add docstrings

---
 bin/coverage_report_single.py | 61 +++++++++++++++++++++++++++++------
 1 file changed, 52 insertions(+), 9 deletions(-)

diff --git a/bin/coverage_report_single.py b/bin/coverage_report_single.py
index 2984eb9a..a6431e56 100644
--- a/bin/coverage_report_single.py
+++ b/bin/coverage_report_single.py
@@ -458,8 +458,7 @@ class styleTables():
     """Functions for styling tables for displaying in report"""
 
     def style_sub_threshold(
-            self, cov_stats, threshold, threshold_cols, vals
-        ):
+            self, cov_stats, threshold, threshold_cols, vals):
         """
         Styling of sub threshold stats df for displaying in report
 
@@ -468,7 +467,10 @@ def style_sub_threshold(
             - threshold (str): low coverage threshold value
             - threshold_cols (list): threshold values for coverage
         Returns:
-            - sub_threshold_stats ():
+            - sub_threshold_stats (str): HTML formatted str of cov stats
+                table
+            - gene_issues (int): total number of genes under threshold
+            - exon_issues (int): total number of exons under threshold
         """
         column = [
             "gene", "tx", "chrom", "exon", "exon_len", "exon_start",
@@ -594,7 +596,9 @@ def style_total_stats(self, cov_stats, threshold_cols, vals):
         """
         Styling of full gene-exon stats table for displaying in report
         Args:
-            -
+            - cov_stats (df): df of exon stats
+            - threshold_cols (list): list of threshold columns
+            - vals (list): list of min, mean and max strs
         Returns:
             -
         """
@@ -647,6 +651,13 @@ def style_total_stats(self, cov_stats, threshold_cols, vals):
 
     def style_cov_summary(self, cov_summary, threshold_cols):
         """
+        Add styling to per gene coverage summary table
+        Args:
+            - cov_summary (df): df of gene coverage stats
+            - threshold_cols (list): list of threshold values
+        Returns:
+            - gene_stats (str): HTML formatted str of gene summary df
+            - total_genes (int): total number of genes
         """
         # rename columns for displaying in report
         cov_summary = cov_summary.drop(columns=["exon"])
@@ -689,6 +700,13 @@ def style_cov_summary(self, cov_summary, threshold_cols):
 
     def style_snps_low_cov(self, snps_low_cov):
         """
+        Add styling to table of snps under coverage threshold
+        Args:
+            - snps_low_cov (df): df of snps under coverage threshold
+        Returns:
+            - snps_low_cov (str): HTML formatted str of low covered snps
+            - snps_not_covered (int): total number snps not covered at
+                threshold
         """
         # get snps values and format dfs to display
         if not snps_low_cov.empty:
@@ -720,6 +738,12 @@ def style_snps_low_cov(self, snps_low_cov):
 
     def style_snps_high_cov(self, snps_high_cov):
         """
+        Add styling to table of SNPs covered above threshold
+        Args:
+            - snps_high_cov (df): df of snps covered above threshold
+        Returns:
+            - snps_high_cov (str): HTML formatted str of covered snps
+            - snps_covered (int): total number of snps covered
         """
 
         if not snps_high_cov.empty:
@@ -753,6 +777,13 @@ def style_snps_high_cov(self, snps_high_cov):
 
     def style_snps_no_cov(self, snps_no_cov):
         """
+        Add styling to table of snps that span exon boundaries => have
+        no coverage values
+        Args:
+            - snps_no_cov (df): df of snps with no coverage values
+        Returns:
+            - snps_no_cov (str): HTML formatted str of snps with no cov
+            - snps_out_panel (int): total number snps with no cov
         """
         # if variants from vcf found that span exon boundaries
         if not snps_no_cov.empty:
@@ -1252,6 +1283,18 @@ def write_summary(self, cov_summary, threshold, panel_pct_coverage):
     def calculate_snp_vals(
             self, snps_covered, snps_not_covered, snps_out_panel):
         """
+        Calculate % values for SNP totals
+        Args:
+            - snps_covered (int): total number snps covered at threshold
+            - snps_not_covered (int): total number snps not covered at
+                threshold
+            - snps_out_panel (int): total number snps spanning exon
+                boundaries
+        Returns:
+            - total_snps (str): sum of all snps, as a string
+            - snps_pct_covered (float): % value of snps_covered
+            - snps_pct_not_covered (float): % value of snps_not_covered
+            - snps_pct_out_panel (float): % value of snps_out_panel
         """
         total_snps = str(snps_covered + snps_not_covered + snps_out_panel)
 
@@ -1321,9 +1364,10 @@ def generate_report(self, cov_stats, cov_summary, snps_low_cov,
         vals.extend(threshold_cols)
 
         # apply styling to tables for displaying in report
-        sub_threshold_stats, gene_issues, exon_issues = styling.style_sub_threshold(
-            cov_stats, threshold, threshold_cols, vals
-        )
+        sub_threshold_stats, gene_issues,\
+            exon_issues = styling.style_sub_threshold(
+                cov_stats, threshold, threshold_cols, vals
+            )
 
         total_stats = styling.style_total_stats(
             cov_stats, threshold_cols, vals
@@ -1351,9 +1395,8 @@ def generate_report(self, cov_stats, cov_summary, snps_low_cov,
                 snps_covered, snps_not_covered, snps_out_panel
             )
 
-        # empty dict to add values for displaying in report text
+        # add values to dict to pass into report
         report_vals = {}
-
         report_vals["summary_text"] = summary_text
         report_vals["name"] = str(args.sample_name).replace("_", " ")
         report_vals["total_genes"] = str(total_genes)

From 52f474c780fd302104393dc1406b7223b922e8ac Mon Sep 17 00:00:00 2001
From: jethror1 <rainford1995@gmail.com>
Date: Fri, 4 Dec 2020 14:04:53 +0000
Subject: [PATCH 4/4] change width of coverage threshold columns

---
 bin/coverage_report_single.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/bin/coverage_report_single.py b/bin/coverage_report_single.py
index a6431e56..46876142 100644
--- a/bin/coverage_report_single.py
+++ b/bin/coverage_report_single.py
@@ -31,7 +31,6 @@
 from string import Template
 
 
-
 class generatePlots():
     """Functions to generate required plots"""
 
@@ -557,11 +556,11 @@ def style_sub_threshold(
 
         # make dict for rounding coverage columns to 2dp
         rnd = {}
-        for col in list(sub_threshold_stats.columns[10:15]):
+        for col in list(sub_threshold_stats.columns[10:]):
             rnd[col] = '{0:.2f}%'
 
-        # set threshold column widths as a fraction of 30% table width
-        t_width = str(30 / len(threshold_cols)) + "%"
+        # set threshold column widths as a fraction of 40% table width
+        t_width = str(40 / len(threshold_cols)) + "%"
 
         # apply colours to coverage cell based on value, 0 is given solid red
         s = sub_threshold_stats.style.apply(lambda x: [