Skip to content

Commit

Permalink
PIP-1277 add average coverage to qc (#39)
Browse files Browse the repository at this point in the history
  • Loading branch information
paul-sud authored Jul 27, 2020
1 parent 02f9820 commit 4db26ef
Show file tree
Hide file tree
Showing 10 changed files with 194 additions and 2 deletions.
2 changes: 1 addition & 1 deletion .isort.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ include_trailing_comma = True
force_grid_wrap = 0
use_parentheses = True
line_length = 88
known_third_party = attr,bs4,pandas,pytest,qc_utils
known_third_party = attr,bs4,pandas,pysam,pytest,qc_utils
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
beautifulsoup4==4.8.2
pandas==1.0.4
pysam==0.16.0.1
qc-utils>=20.6.1
2 changes: 2 additions & 0 deletions tests/functional/test_wgbs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,5 +73,7 @@
md5sum: a09ae01f70fa6d2461e37d5814ceb579
- path: test-output/coverage.bw
md5sum: afa224c2037829dccacea4a67b6fa84a
- path: test-output/average_coverage_qc.json
md5sum: 82ce31e21d361d52a7f19dce1988b827
- path: test-output/bed_pearson_correlation_qc.json
should_exist: false
5 changes: 5 additions & 0 deletions tests/integration/json/test_calculate_average_coverage.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"test_calculate_average_coverage.bam": "tests/data/sample_sample5.bam",
"test_calculate_average_coverage.chrom_sizes": "tests/data/sacCer3.contig.sizes",
"test_calculate_average_coverage.ncpus": 1
}
11 changes: 11 additions & 0 deletions tests/integration/test_calculate_average_coverage.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
- name: test_calculate_average_coverage
tags:
- integration
command: >-
tests/caper_run.sh
tests/integration/wdl/test_calculate_average_coverage.wdl
tests/integration/json/test_calculate_average_coverage.json
files:
- path: test-output/average_coverage_qc.json
md5sum: 1f6dd50510bb8007552229b99be0e724
13 changes: 13 additions & 0 deletions tests/integration/wdl/test_calculate_average_coverage.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import "../../../wgbs-pipeline.wdl" as wgbs

workflow test_calculate_average_coverage {
File bam
File chrom_sizes
Int ncpus

call wgbs.calculate_average_coverage { input:
bam = bam,
chrom_sizes = chrom_sizes,
ncpus = ncpus,
}
}
39 changes: 39 additions & 0 deletions tests/python/test_calculate_average_coverage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import pytest
from qc_utils import QCMetricRecord

from wgbs_pipeline.calculate_average_coverage import (
calculate_average_coverage,
calculate_genome_size,
get_samtools_stats,
make_qc_record,
)


@pytest.mark.filesystem
def test_get_samtools_stats(mocker):
mocker.patch("pysam.stats", return_value="SN\tfoo:\t3\n")
result = get_samtools_stats("bam_path", threads=3)
assert result == {"foo": 3}


def test_calculate_genome_size(mocker):
mocker.patch("builtins.open", mocker.mock_open(read_data="foo\t1\nbar\t5\n"))
result = calculate_genome_size("path")
assert result == 6


def test_calculate_average_coverage():
result = calculate_average_coverage(
genome_size=10, aligned_read_count=3, read_length=3
)
assert isinstance(result, dict)
assert result["average_coverage"] == pytest.approx(0.9)


def test_make_qc_record():
metric_1 = ("foo", {"bar": "baz"})
metric_2 = ("qux", {"quux": "corge"})
result = make_qc_record([metric_1, metric_2])
assert result.to_ordered_dict()["foo"] == {"bar": "baz"}
assert result.to_ordered_dict()["qux"] == {"quux": "corge"}
assert isinstance(result, QCMetricRecord)
6 changes: 5 additions & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ commands = python -m pytest --ignore=tests/functional/ --ignore=tests/integratio
basepython = python3.7
commands = python -m pytest --ignore=tests/python --symlink {posargs}
deps =
caper
caper==0.8.2.1
pytest
pytest-workflow>=1.3.0
passenv = WGBS_DOCKER_IMAGE_TAG
Expand Down Expand Up @@ -52,3 +52,7 @@ exclude_lines =

# Don't complain if non-runnable code isn't run:
if __name__ == .__main__.:

[pytest]
markers =
filesystem: mark tests using the filesystem.
31 changes: 31 additions & 0 deletions wgbs-pipeline.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ workflow wgbs {

String barcode_prefix = "sample_"

Int samtools_stats_ncpus = 1
Int bsmooth_num_workers = 8
Int bsmooth_num_threads = 2
Boolean run_bsmooth = true
Expand Down Expand Up @@ -96,6 +97,12 @@ workflow wgbs {
index = index,
}

call calculate_average_coverage { input:
bam = map.bam,
chrom_sizes = contig_sizes,
ncpus = samtools_stats_ncpus,
}

call extract { input:
gemBS_json = gemBS_json,
reference = reference,
Expand Down Expand Up @@ -289,6 +296,30 @@ task map {
}
}
task calculate_average_coverage {
File bam
File chrom_sizes
Int ncpus
command {
python3 "$(which calculate_average_coverage.py)" \
--bam ${bam} \
--chrom-sizes ${chrom_sizes} \
--threads ${ncpus} \
--outfile average_coverage_qc.json
}
output {
File average_coverage_qc = "average_coverage_qc.json"
}
runtime {
cpu: "${ncpus}"
memory: "8 GB"
disks: "local-disk 500 SSD"
}
}
task bscaller {
File reference
File gemBS_json
Expand Down
86 changes: 86 additions & 0 deletions wgbs_pipeline/calculate_average_coverage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
"""
Calculates average coverage from a bam file. The formula for this is given by this:
Coverage = (Aligned read counts * read length ) / total genome size
The total genome size is obtained by summing the values in the chromosome sizes. The
aligned read counts is the number of reads in the bam. The read length is obtained from
samtools-stats `RL`, see http://www.htslib.org/doc/samtools-stats.html for details.
"""

import argparse
import tempfile
from typing import Any, Dict, List, Tuple

import pysam
from qc_utils import QCMetric, QCMetricRecord
from qc_utils.parsers import parse_samtools_stats


def main() -> None: # pragma: no cover
"""
We already integration test this in the WDL tests, no need for Python test coverage.
"""
parser = get_parser()
args = parser.parse_args()
samtools_stats = get_samtools_stats(args.bam, args.threads)
genome_size = calculate_genome_size(args.chrom_sizes)
average_coverage = calculate_average_coverage(
genome_size=genome_size,
aligned_read_count=samtools_stats["reads mapped"],
read_length=samtools_stats["average length"],
)
qc_record = make_qc_record(
[("samtools_stats", samtools_stats), ("average_coverage", average_coverage)]
)
qc_record.save(args.outfile)


def get_samtools_stats(bam_path: str, threads: int) -> Dict[str, Any]:
"""
The buffer must be flushed first before it is read again by the parser, see
https://stackoverflow.com/questions/46004774/python-namedtemporaryfile-appears-empty-even-after-data-is-written
"""
samtools_stats_stdout = pysam.stats("--threads", str(threads), bam_path)
with tempfile.NamedTemporaryFile(mode="w") as samtools_stats_file:
samtools_stats_file.write(samtools_stats_stdout)
samtools_stats_file.flush()
samtools_stats = parse_samtools_stats(samtools_stats_file.name)
return samtools_stats


def calculate_genome_size(chrom_sizes_path: str) -> int:
size = 0
with open(chrom_sizes_path) as f:
for line in f:
size += int(line.split()[1])
return size


def calculate_average_coverage(
genome_size: int, aligned_read_count: int, read_length: int
) -> Dict[str, float]:
average_coverage = (aligned_read_count * read_length) / genome_size
return {"average_coverage": average_coverage}


def make_qc_record(qcs: List[Tuple[str, Dict[str, Any]]]) -> QCMetricRecord:
qc_record = QCMetricRecord()
for name, qc in qcs:
metric = QCMetric(name, qc)
qc_record.add(metric)
return qc_record


def get_parser() -> argparse.ArgumentParser: # pragma: nocover
parser = argparse.ArgumentParser()
parser.add_argument("--bam", help="Bam file to compute coverage on", required=True)
parser.add_argument("--chrom-sizes", help="Chromosome sizes file", required=True)
parser.add_argument(
"--threads", help="Number of threads for Samtools", required=True
)
parser.add_argument("--outfile", required=True)
return parser


if __name__ == "__main__":
main()

0 comments on commit 4db26ef

Please sign in to comment.