Skip to content

Commit

Permalink
PIP-127-rrbs (#62)
Browse files Browse the repository at this point in the history
  • Loading branch information
paul-sud authored Apr 21, 2021
1 parent 33166d4 commit 522492e
Show file tree
Hide file tree
Showing 8 changed files with 148 additions and 32 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

## Overview

An [ENCODE](https://www.encodeproject.org/) pipeline for processing whole-genome bisulfite sequencing (WGBS) data using [gemBS](https://github.com/heathsc/gemBS) for alignment and methylation extraction.
An [ENCODE](https://www.encodeproject.org/) pipeline for processing whole-genome bisulfite sequencing (WGBS) and reduced representation bisulfite sequencing (RRBS) data using [gemBS](https://github.com/heathsc/gemBS) for alignment and methylation extraction.

## Installation

Expand Down
5 changes: 5 additions & 0 deletions conf/IHEC_RRBS.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
include IHEC_standard.conf

[calling]

keep_duplicates = True
2 changes: 2 additions & 0 deletions docs/reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ A input JSON to run the full pipeline, including building `gemBS` indexes, will
}
```

To process RRBS data, add `"wgbs.pipeline_type": "rrbs"` to your input JSON.

If you plan on running the pipeline multiple times it is recommended to create the `gemBS` index file once, then use that as input to the runs instead of the raw sequence `fasta` files. To build the references, your input should look like the following:

```json
Expand Down
17 changes: 17 additions & 0 deletions tests/functional/json/test_rrbs.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"wgbs.benchmark_mode": true,
"wgbs.fastqs": [
[
[
"tests/data/sample5_data_1_200000.fastq.gz",
"tests/data/sample5_data_2_200000.fastq.gz"
]
]
],
"wgbs.pipeline_type": "rrbs",
"wgbs.reference": "tests/data/sacCer3.fa.gz",
"wgbs.sample_names": [
"sample5"
],
"wgbs.underconversion_sequence_name": "NC_001416.1"
}
64 changes: 64 additions & 0 deletions tests/functional/test_rrbs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
---
# Not actually RRBS data, just run WGBS as RRBS
- name: test_rrbs
tags:
- functional
command: >-
tests/caper_run.sh
wgbs-pipeline.wdl
tests/functional/json/test_rrbs.json
files:
- path: test-output/gembs.conf
md5sum: 656a4e72a320f47adf9bddc7a864ac3f
- path: test-output/gembs_metadata.csv
md5sum: 40419347ed006135fbf6e909693f0e9c
- path: test-output/indexes.tar.gz
md5sum: 85754d48cc1ca9238ef115ca64a61445
- path: test-output/gemBS.json
md5sum: 38cc35382f907a63fc58baebe5e5de5f
- path: test-output/glob-e97d885c83d966247d485dc62b6ae799/sacCer3.contig.sizes
md5sum: 0497066e3880c6932cf6bde815c42c40
- path: test-output/glob-c8599c0b9048b55a8d5cfaad52995a94/sample_sample5.bam
md5sum: 8976efc0e5d3f51bf65e6e0493afeeb6
- path: test-output/glob-c0e92e4e9fb050e7e70bb645748b45dd/sample5.json
md5sum: 043a3f07cc800fcbfb2186e449fcb413
- path: test-output/glob-0b0236659b9524643e6454061959b28c/sample_sample5_pos.bw
md5sum: bd67706599694f3c8984c38ff7589eb3
- path: test-output/glob-041e1709c7dd1f320426281eb4649f9b/sample_sample5_neg.bw
md5sum: a3055e0651234c1f7dc1fa6be7012c73
- path: test-output/glob-708835e6a0042d33b00b6937266734f5/sample_sample5_chg.bb
md5sum: 5ca9c51b683180d4286e83f89b4b90f3
- path: test-output/glob-f70a6609728d4fb1448dba1f41361a30/sample_sample5_cpg.bb
md5sum: 295bd3b29aac0722565ac9c9bb124cfd
- path: test-output/glob-52f916d7cc14a5bcfb168d6910e04b56/sample_sample5_chg.bed.gz
md5sum: 1164171e1c7c2a62989590709bf379eb
- path: test-output/glob-2b5148d6967b43eee33eb370fd36b70e/sample_sample5_chh.bed.gz
md5sum: 2d8f5cd52cf74b0757e2599f06266d3c
- path: test-output/glob-f31cca1fcab505e10c2fe5ff003b211a/sample_sample5_cpg.bed.gz
md5sum: dc0a7f1831e9c0475bee112bc21f683a
- path: test-output/glob-b76cddd256e1197e0b726acc7184afc4/sample_sample5_cpg.txt.gz
md5sum: dbdff797c9dd1a855b8d4ca7b5498f7c
- path: test-output/glob-40c90aa4516b00209d682b819b1d021f/sample_sample5_non_cpg.txt.gz
md5sum: 97ed1234fbdd00e428e4c722f067134e
- path: test-output/gembs_map_qc.json
md5sum: d85e3700e41607e991bf7a7bfd6c799f
- path: test-output/glob-65c481a690a62b639d918bb70927f25e/sample_sample5.isize.png
md5sum: a4596aa822ed081788ed2c72ec555a9a
- path: test-output/mapping_reports/mapping/sample_sample5.mapq.png
md5sum: 3ed99b9be0131e9f7cc7d70f74194669
- path: test-output/glob-1aeed469ae5d1e8d7cbca51e8758b781/ENCODE.html
md5sum: b2ad3be0fa1ab6dd1252172a79b33b9c
- path: test-output/glob-1aeed469ae5d1e8d7cbca51e8758b781/sample5.html
md5sum: 54032ceac4315e9a74ef605831d37e47
- path: test-output/glob-1aeed469ae5d1e8d7cbca51e8758b781/sample_sample5.html
md5sum: ff5c58abb18ba97df3788ea5acb710ca
- path: test-output/glob-1aeed469ae5d1e8d7cbca51e8758b781/sample5.isize.png
md5sum: 329d748b732e27a0996956b288ad1071
- path: test-output/glob-1aeed469ae5d1e8d7cbca51e8758b781/sample5.mapq.png
md5sum: ec55f587784b6475607874f9837911ee
- path: test-output/coverage.bw
md5sum: 6b1da64b55ae1102b364903b3fea5289
- path: test-output/average_coverage_qc.json
md5sum: e98ea1f0eb34fedb394c4dd38fb52cdc
- path: test-output/bed_pearson_correlation_qc.json
should_exist: false
26 changes: 25 additions & 1 deletion tests/python/test_make_conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ class StubArgs:
does_not_raise(),
),
(["-r", "ref.fa.gz", "-e", "extra_ref.fa.gz"], does_not_raise()),
(["-r", "ref.fa.gz"], pytest.raises(SystemExit)),
(["-r", "ref.fa.gz"], does_not_raise()),
(["-e", "extra.fa.gz"], pytest.raises(SystemExit)),
],
)
Expand Down Expand Up @@ -132,6 +132,30 @@ def test_make_conf(
assert output[row_index] == expected_value


def test_make_conf_no_extra_reference(mocker):
mock_args = mocker.Mock(
reference="/foo/bar.baz",
extra_reference=None,
benchmark_mode=False,
include_file=None,
num_threads=8,
num_jobs=3,
)
result = make_conf(mock_args)
assert result == [
"reference = reference/bar.baz",
"index_dir = indexes",
"base = .",
"sequence_dir = ${base}/fastq/@SAMPLE",
"bam_dir = ${base}/mapping/@BARCODE",
"bcf_dir = ${base}/calls/@BARCODE",
"extract_dir = ${base}/extract/@BARCODE",
"report_dir = ${base}/report",
"threads = 8",
"jobs = 3",
]


def test_extract_underconversion_sequence(mocker):
extra_reference = b">chrL\nGATACA\n"
extra_reference_gzipped = gzip.compress(extra_reference)
Expand Down
16 changes: 9 additions & 7 deletions wgbs-pipeline.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,16 @@ workflow wgbs {
File reference
File? indexed_reference
File? indexed_contig_sizes
File extra_reference
File? extra_reference
# biological replicate[technical replicate[one (single ended) or two paired fastqs]]]
Array[Array[Array[File]]]? fastqs
Array[String]? sample_names

Int num_gembs_threads = 8
Int num_gembs_jobs = 3
String? underconversion_sequence_name
String include_conf_file = "/software/conf/IHEC_standard.conf"
String pipeline_type = "wgbs"
String include_conf_file = if pipeline_type=="wgbs" then "/software/conf/IHEC_standard.conf" else "/software/conf/IHEC_RRBS.conf"

String barcode_prefix = "sample_"

Expand Down Expand Up @@ -109,7 +110,7 @@ workflow wgbs {
metadata_file = select_first([make_metadata_csv.metadata_csv]),
reference = reference,
index = index,
extra_reference = extra_reference,
extra_reference = select_first([extra_reference, reference]),
num_cpus = prepare_num_cpus,
ram_gb = prepare_ram_gb,
disk_size_gb = prepare_disk_size_gb,
Expand Down Expand Up @@ -236,7 +237,7 @@ task make_conf {
Int num_threads
Int num_jobs
String reference
File extra_reference
File? extra_reference
String? include_file
String? underconversion_sequence_name
Int? num_cpus = 1
Expand All @@ -249,7 +250,7 @@ task make_conf {
-t "${num_threads}" \
-j "${num_jobs}" \
-r "${reference}" \
-e "${extra_reference}" \
${if defined(extra_reference) then ("-e " + extra_reference) else ""} \
${if defined(underconversion_sequence_name) then ("-u " + underconversion_sequence_name) else ""} \
${if defined(include_file) then ("-i " + include_file) else ""} \
${if benchmark_mode then ("--benchmark-mode") else ""} \
Expand Down Expand Up @@ -303,15 +304,16 @@ task prepare {
task index {
File configuration_file
File reference
File extra_reference
File? extra_reference
Int? num_cpus = 8
Int? ram_gb = 64
Int? disk_size_gb = 500

command {
set -euo pipefail
mkdir reference
ln -s ${reference} reference && ln -s ${extra_reference} reference
ln -s ${reference} reference
${if defined(extra_reference) then ("ln -s " + extra_reference + " reference") else ""}
# We need a gemBS JSON to create the index, and we need a metadata csv to create
# the gemBS JSON. Create a dummy one, we don't use the config later so it
# doesn't matter.
Expand Down
48 changes: 25 additions & 23 deletions wgbs_pipeline/make_conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,32 +20,35 @@ def make_conf(args: argparse.Namespace) -> List[str]:
If a name for an underconversion sequence is not provided, it is assumed that the
name of the first contig in the extra reference is the name to use.
"""
conf = [
f"reference = reference/{Path(args.reference).name}",
f"extra_references = reference/{Path(args.extra_reference).name}",
"index_dir = indexes",
"base = .",
"sequence_dir = ${base}/fastq/@SAMPLE",
"bam_dir = ${base}/mapping/@BARCODE",
"bcf_dir = ${base}/calls/@BARCODE",
"extract_dir = ${base}/extract/@BARCODE",
"report_dir = ${base}/report",
f"threads = {args.num_threads}",
f"jobs = {args.num_jobs}",
]
conf = [f"reference = reference/{Path(args.reference).name}"]
if args.extra_reference is not None:
conf.append(f"extra_references = reference/{Path(args.extra_reference).name}")
conf.extend(
[
"index_dir = indexes",
"base = .",
"sequence_dir = ${base}/fastq/@SAMPLE",
"bam_dir = ${base}/mapping/@BARCODE",
"bcf_dir = ${base}/calls/@BARCODE",
"extract_dir = ${base}/extract/@BARCODE",
"report_dir = ${base}/report",
f"threads = {args.num_threads}",
f"jobs = {args.num_jobs}",
]
)

if args.benchmark_mode:
conf.append("benchmark_mode = true")

conf.append("[mapping]")

if args.underconversion_sequence:
underconversion_sequence = args.underconversion_sequence
else:
underconversion_sequence = extract_underconversion_sequence(
args.extra_reference
)
conf.append(f"underconversion_sequence = {underconversion_sequence}")
if args.extra_reference is not None:
conf.append("[mapping]")
if args.underconversion_sequence:
underconversion_sequence = args.underconversion_sequence
else:
underconversion_sequence = extract_underconversion_sequence(
args.extra_reference
)
conf.append(f"underconversion_sequence = {underconversion_sequence}")

if args.include_file:
conf.append(f"include {args.include_file}")
Expand Down Expand Up @@ -76,7 +79,6 @@ def get_parser() -> argparse.ArgumentParser:
"-e",
"--extra-reference",
help="File name of extra reference for control sequence",
required=True,
)
parser.add_argument(
"-r", "--reference", help="File name of reference fasta", required=True
Expand Down

0 comments on commit 522492e

Please sign in to comment.