-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
PIP-1277 add average coverage to qc (#39)
- Loading branch information
Showing
10 changed files
with
194 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
beautifulsoup4==4.8.2 | ||
pandas==1.0.4 | ||
pysam==0.16.0.1 | ||
qc-utils>=20.6.1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
{ | ||
"test_calculate_average_coverage.bam": "tests/data/sample_sample5.bam", | ||
"test_calculate_average_coverage.chrom_sizes": "tests/data/sacCer3.contig.sizes", | ||
"test_calculate_average_coverage.ncpus": 1 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
--- | ||
- name: test_calculate_average_coverage | ||
tags: | ||
- integration | ||
command: >- | ||
tests/caper_run.sh | ||
tests/integration/wdl/test_calculate_average_coverage.wdl | ||
tests/integration/json/test_calculate_average_coverage.json | ||
files: | ||
- path: test-output/average_coverage_qc.json | ||
md5sum: 1f6dd50510bb8007552229b99be0e724 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
import "../../../wgbs-pipeline.wdl" as wgbs | ||
|
||
workflow test_calculate_average_coverage { | ||
File bam | ||
File chrom_sizes | ||
Int ncpus | ||
|
||
call wgbs.calculate_average_coverage { input: | ||
bam = bam, | ||
chrom_sizes = chrom_sizes, | ||
ncpus = ncpus, | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
import pytest | ||
from qc_utils import QCMetricRecord | ||
|
||
from wgbs_pipeline.calculate_average_coverage import ( | ||
calculate_average_coverage, | ||
calculate_genome_size, | ||
get_samtools_stats, | ||
make_qc_record, | ||
) | ||
|
||
|
||
@pytest.mark.filesystem | ||
def test_get_samtools_stats(mocker): | ||
mocker.patch("pysam.stats", return_value="SN\tfoo:\t3\n") | ||
result = get_samtools_stats("bam_path", threads=3) | ||
assert result == {"foo": 3} | ||
|
||
|
||
def test_calculate_genome_size(mocker): | ||
mocker.patch("builtins.open", mocker.mock_open(read_data="foo\t1\nbar\t5\n")) | ||
result = calculate_genome_size("path") | ||
assert result == 6 | ||
|
||
|
||
def test_calculate_average_coverage(): | ||
result = calculate_average_coverage( | ||
genome_size=10, aligned_read_count=3, read_length=3 | ||
) | ||
assert isinstance(result, dict) | ||
assert result["average_coverage"] == pytest.approx(0.9) | ||
|
||
|
||
def test_make_qc_record(): | ||
metric_1 = ("foo", {"bar": "baz"}) | ||
metric_2 = ("qux", {"quux": "corge"}) | ||
result = make_qc_record([metric_1, metric_2]) | ||
assert result.to_ordered_dict()["foo"] == {"bar": "baz"} | ||
assert result.to_ordered_dict()["qux"] == {"quux": "corge"} | ||
assert isinstance(result, QCMetricRecord) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
""" | ||
Calculates average coverage from a bam file. The formula for this is given by this: | ||
Coverage = (Aligned read counts * read length ) / total genome size | ||
The total genome size is obtained by summing the values in the chromosome sizes. The | ||
aligned read counts is the number of reads in the bam. The read length is obtained from | ||
samtools-stats `RL`, see http://www.htslib.org/doc/samtools-stats.html for details. | ||
""" | ||
|
||
import argparse | ||
import tempfile | ||
from typing import Any, Dict, List, Tuple | ||
|
||
import pysam | ||
from qc_utils import QCMetric, QCMetricRecord | ||
from qc_utils.parsers import parse_samtools_stats | ||
|
||
|
||
def main() -> None: # pragma: no cover | ||
""" | ||
We already integration test this in the WDL tests, no need for Python test coverage. | ||
""" | ||
parser = get_parser() | ||
args = parser.parse_args() | ||
samtools_stats = get_samtools_stats(args.bam, args.threads) | ||
genome_size = calculate_genome_size(args.chrom_sizes) | ||
average_coverage = calculate_average_coverage( | ||
genome_size=genome_size, | ||
aligned_read_count=samtools_stats["reads mapped"], | ||
read_length=samtools_stats["average length"], | ||
) | ||
qc_record = make_qc_record( | ||
[("samtools_stats", samtools_stats), ("average_coverage", average_coverage)] | ||
) | ||
qc_record.save(args.outfile) | ||
|
||
|
||
def get_samtools_stats(bam_path: str, threads: int) -> Dict[str, Any]: | ||
""" | ||
The buffer must be flushed first before it is read again by the parser, see | ||
https://stackoverflow.com/questions/46004774/python-namedtemporaryfile-appears-empty-even-after-data-is-written | ||
""" | ||
samtools_stats_stdout = pysam.stats("--threads", str(threads), bam_path) | ||
with tempfile.NamedTemporaryFile(mode="w") as samtools_stats_file: | ||
samtools_stats_file.write(samtools_stats_stdout) | ||
samtools_stats_file.flush() | ||
samtools_stats = parse_samtools_stats(samtools_stats_file.name) | ||
return samtools_stats | ||
|
||
|
||
def calculate_genome_size(chrom_sizes_path: str) -> int: | ||
size = 0 | ||
with open(chrom_sizes_path) as f: | ||
for line in f: | ||
size += int(line.split()[1]) | ||
return size | ||
|
||
|
||
def calculate_average_coverage( | ||
genome_size: int, aligned_read_count: int, read_length: int | ||
) -> Dict[str, float]: | ||
average_coverage = (aligned_read_count * read_length) / genome_size | ||
return {"average_coverage": average_coverage} | ||
|
||
|
||
def make_qc_record(qcs: List[Tuple[str, Dict[str, Any]]]) -> QCMetricRecord: | ||
qc_record = QCMetricRecord() | ||
for name, qc in qcs: | ||
metric = QCMetric(name, qc) | ||
qc_record.add(metric) | ||
return qc_record | ||
|
||
|
||
def get_parser() -> argparse.ArgumentParser: # pragma: nocover | ||
parser = argparse.ArgumentParser() | ||
parser.add_argument("--bam", help="Bam file to compute coverage on", required=True) | ||
parser.add_argument("--chrom-sizes", help="Chromosome sizes file", required=True) | ||
parser.add_argument( | ||
"--threads", help="Number of threads for Samtools", required=True | ||
) | ||
parser.add_argument("--outfile", required=True) | ||
return parser | ||
|
||
|
||
if __name__ == "__main__": | ||
main() |