PIP-127-rrbs (#62)

ENCODE-DCC · Apr 21, 2021 · 522492e · 522492e
1 parent 33166d4
commit 522492e
Show file tree

Hide file tree

Showing 8 changed files with 148 additions and 32 deletions.
diff --git a/README.md b/README.md
@@ -6,7 +6,7 @@
 
 ## Overview
 
-An [ENCODE](https://www.encodeproject.org/) pipeline for processing whole-genome bisulfite sequencing (WGBS) data using [gemBS](https://github.com/heathsc/gemBS) for alignment and methylation extraction.
+An [ENCODE](https://www.encodeproject.org/) pipeline for processing whole-genome bisulfite sequencing (WGBS) and reduced representation bisulfite sequencing (RRBS) data using [gemBS](https://github.com/heathsc/gemBS) for alignment and methylation extraction.
 
 ## Installation
 

diff --git a/conf/IHEC_RRBS.conf b/conf/IHEC_RRBS.conf
@@ -0,0 +1,5 @@
+include IHEC_standard.conf
+
+[calling]
+
+keep_duplicates = True
diff --git a/docs/reference.md b/docs/reference.md
@@ -36,6 +36,8 @@ A input JSON to run the full pipeline, including building `gemBS` indexes, will
 }
 ```
 
+To process RRBS data, add `"wgbs.pipeline_type": "rrbs"` to your input JSON.
+
 If you plan on running the pipeline multiple times it is recommended to create the `gemBS` index file once, then use that as input to the runs instead of the raw sequence `fasta` files. To build the references, your input should look like the following:
 
 ```json

diff --git a/tests/functional/json/test_rrbs.json b/tests/functional/json/test_rrbs.json
@@ -0,0 +1,17 @@
+{
+  "wgbs.benchmark_mode": true,
+  "wgbs.fastqs": [
+    [
+      [
+        "tests/data/sample5_data_1_200000.fastq.gz",
+        "tests/data/sample5_data_2_200000.fastq.gz"
+      ]
+    ]
+  ],
+  "wgbs.pipeline_type": "rrbs",
+  "wgbs.reference": "tests/data/sacCer3.fa.gz",
+  "wgbs.sample_names": [
+    "sample5"
+  ],
+  "wgbs.underconversion_sequence_name": "NC_001416.1"
+}
diff --git a/tests/functional/test_rrbs.yaml b/tests/functional/test_rrbs.yaml
@@ -0,0 +1,64 @@
+---
+  # Not actually RRBS data, just run WGBS as RRBS
+  - name: test_rrbs
+    tags:
+      - functional
+    command: >-
+      tests/caper_run.sh
+      wgbs-pipeline.wdl
+      tests/functional/json/test_rrbs.json
+    files:
+      - path: test-output/gembs.conf
+        md5sum: 656a4e72a320f47adf9bddc7a864ac3f
+      - path: test-output/gembs_metadata.csv
+        md5sum: 40419347ed006135fbf6e909693f0e9c
+      - path: test-output/indexes.tar.gz
+        md5sum: 85754d48cc1ca9238ef115ca64a61445
+      - path: test-output/gemBS.json
+        md5sum: 38cc35382f907a63fc58baebe5e5de5f
+      - path: test-output/glob-e97d885c83d966247d485dc62b6ae799/sacCer3.contig.sizes
+        md5sum: 0497066e3880c6932cf6bde815c42c40
+      - path: test-output/glob-c8599c0b9048b55a8d5cfaad52995a94/sample_sample5.bam
+        md5sum: 8976efc0e5d3f51bf65e6e0493afeeb6
+      - path: test-output/glob-c0e92e4e9fb050e7e70bb645748b45dd/sample5.json
+        md5sum: 043a3f07cc800fcbfb2186e449fcb413
+      - path: test-output/glob-0b0236659b9524643e6454061959b28c/sample_sample5_pos.bw
+        md5sum: bd67706599694f3c8984c38ff7589eb3
+      - path: test-output/glob-041e1709c7dd1f320426281eb4649f9b/sample_sample5_neg.bw
+        md5sum: a3055e0651234c1f7dc1fa6be7012c73
+      - path: test-output/glob-708835e6a0042d33b00b6937266734f5/sample_sample5_chg.bb
+        md5sum: 5ca9c51b683180d4286e83f89b4b90f3
+      - path: test-output/glob-f70a6609728d4fb1448dba1f41361a30/sample_sample5_cpg.bb
+        md5sum: 295bd3b29aac0722565ac9c9bb124cfd
+      - path: test-output/glob-52f916d7cc14a5bcfb168d6910e04b56/sample_sample5_chg.bed.gz
+        md5sum: 1164171e1c7c2a62989590709bf379eb
+      - path: test-output/glob-2b5148d6967b43eee33eb370fd36b70e/sample_sample5_chh.bed.gz
+        md5sum: 2d8f5cd52cf74b0757e2599f06266d3c
+      - path: test-output/glob-f31cca1fcab505e10c2fe5ff003b211a/sample_sample5_cpg.bed.gz
+        md5sum: dc0a7f1831e9c0475bee112bc21f683a
+      - path: test-output/glob-b76cddd256e1197e0b726acc7184afc4/sample_sample5_cpg.txt.gz
+        md5sum: dbdff797c9dd1a855b8d4ca7b5498f7c
+      - path: test-output/glob-40c90aa4516b00209d682b819b1d021f/sample_sample5_non_cpg.txt.gz
+        md5sum: 97ed1234fbdd00e428e4c722f067134e
+      - path: test-output/gembs_map_qc.json
+        md5sum: d85e3700e41607e991bf7a7bfd6c799f
+      - path: test-output/glob-65c481a690a62b639d918bb70927f25e/sample_sample5.isize.png
+        md5sum: a4596aa822ed081788ed2c72ec555a9a
+      - path: test-output/mapping_reports/mapping/sample_sample5.mapq.png
+        md5sum: 3ed99b9be0131e9f7cc7d70f74194669
+      - path: test-output/glob-1aeed469ae5d1e8d7cbca51e8758b781/ENCODE.html
+        md5sum: b2ad3be0fa1ab6dd1252172a79b33b9c
+      - path: test-output/glob-1aeed469ae5d1e8d7cbca51e8758b781/sample5.html
+        md5sum: 54032ceac4315e9a74ef605831d37e47
+      - path: test-output/glob-1aeed469ae5d1e8d7cbca51e8758b781/sample_sample5.html
+        md5sum: ff5c58abb18ba97df3788ea5acb710ca
+      - path: test-output/glob-1aeed469ae5d1e8d7cbca51e8758b781/sample5.isize.png
+        md5sum: 329d748b732e27a0996956b288ad1071
+      - path: test-output/glob-1aeed469ae5d1e8d7cbca51e8758b781/sample5.mapq.png
+        md5sum: ec55f587784b6475607874f9837911ee
+      - path: test-output/coverage.bw
+        md5sum: 6b1da64b55ae1102b364903b3fea5289
+      - path: test-output/average_coverage_qc.json
+        md5sum: e98ea1f0eb34fedb394c4dd38fb52cdc
+      - path: test-output/bed_pearson_correlation_qc.json
+        should_exist: false
diff --git a/tests/python/test_make_conf.py b/tests/python/test_make_conf.py
@@ -51,7 +51,7 @@ class StubArgs:
             does_not_raise(),
         ),
         (["-r", "ref.fa.gz", "-e", "extra_ref.fa.gz"], does_not_raise()),
-        (["-r", "ref.fa.gz"], pytest.raises(SystemExit)),
+        (["-r", "ref.fa.gz"], does_not_raise()),
         (["-e", "extra.fa.gz"], pytest.raises(SystemExit)),
     ],
 )
@@ -132,6 +132,30 @@ def test_make_conf(
         assert output[row_index] == expected_value
 
 
+def test_make_conf_no_extra_reference(mocker):
+    mock_args = mocker.Mock(
+        reference="/foo/bar.baz",
+        extra_reference=None,
+        benchmark_mode=False,
+        include_file=None,
+        num_threads=8,
+        num_jobs=3,
+    )
+    result = make_conf(mock_args)
+    assert result == [
+        "reference = reference/bar.baz",
+        "index_dir = indexes",
+        "base = .",
+        "sequence_dir = ${base}/fastq/@SAMPLE",
+        "bam_dir = ${base}/mapping/@BARCODE",
+        "bcf_dir = ${base}/calls/@BARCODE",
+        "extract_dir = ${base}/extract/@BARCODE",
+        "report_dir = ${base}/report",
+        "threads = 8",
+        "jobs = 3",
+    ]
+
+
 def test_extract_underconversion_sequence(mocker):
     extra_reference = b">chrL\nGATACA\n"
     extra_reference_gzipped = gzip.compress(extra_reference)

diff --git a/wgbs-pipeline.wdl b/wgbs-pipeline.wdl
@@ -9,15 +9,16 @@ workflow wgbs {
     File reference
     File? indexed_reference
     File? indexed_contig_sizes
-    File extra_reference
+    File? extra_reference
     # biological replicate[technical replicate[one (single ended) or two paired fastqs]]]
     Array[Array[Array[File]]]? fastqs
     Array[String]? sample_names
 
     Int num_gembs_threads = 8
     Int num_gembs_jobs = 3
     String? underconversion_sequence_name
-    String include_conf_file = "/software/conf/IHEC_standard.conf"
+    String pipeline_type = "wgbs"
+    String include_conf_file = if pipeline_type=="wgbs" then "/software/conf/IHEC_standard.conf" else "/software/conf/IHEC_RRBS.conf"
 
     String barcode_prefix = "sample_"
 
@@ -109,7 +110,7 @@ workflow wgbs {
             metadata_file = select_first([make_metadata_csv.metadata_csv]),
             reference = reference,
             index = index,
-            extra_reference = extra_reference,
+            extra_reference = select_first([extra_reference, reference]),
             num_cpus = prepare_num_cpus,
             ram_gb = prepare_ram_gb,
             disk_size_gb = prepare_disk_size_gb,
@@ -236,7 +237,7 @@ task make_conf {
     Int num_threads
     Int num_jobs
     String reference
-    File extra_reference
+    File? extra_reference
     String? include_file
     String? underconversion_sequence_name
     Int? num_cpus = 1
@@ -249,7 +250,7 @@ task make_conf {
             -t "${num_threads}" \
             -j "${num_jobs}" \
             -r "${reference}" \
-            -e "${extra_reference}" \
+            ${if defined(extra_reference) then ("-e " + extra_reference) else ""} \
             ${if defined(underconversion_sequence_name) then ("-u " + underconversion_sequence_name) else ""} \
             ${if defined(include_file) then ("-i " + include_file) else ""} \
             ${if benchmark_mode then ("--benchmark-mode") else ""} \
@@ -303,15 +304,16 @@ task prepare {
 task index {
     File configuration_file
     File reference
-    File extra_reference
+    File? extra_reference
     Int? num_cpus = 8
     Int? ram_gb = 64
     Int? disk_size_gb = 500
 
     command {
         set -euo pipefail
         mkdir reference
-        ln -s ${reference} reference && ln -s ${extra_reference} reference
+        ln -s ${reference} reference
+        ${if defined(extra_reference) then ("ln -s " + extra_reference + " reference") else ""}
         # We need a gemBS JSON to create the index, and we need a metadata csv to create
         # the gemBS JSON. Create a dummy one, we don't use the config later so it
         # doesn't matter.

diff --git a/wgbs_pipeline/make_conf.py b/wgbs_pipeline/make_conf.py
@@ -20,32 +20,35 @@ def make_conf(args: argparse.Namespace) -> List[str]:
     If a name for an underconversion sequence is not provided, it is assumed that the
     name of the first contig in the extra reference is the name to use.
     """
-    conf = [
-        f"reference = reference/{Path(args.reference).name}",
-        f"extra_references = reference/{Path(args.extra_reference).name}",
-        "index_dir = indexes",
-        "base = .",
-        "sequence_dir = ${base}/fastq/@SAMPLE",
-        "bam_dir = ${base}/mapping/@BARCODE",
-        "bcf_dir = ${base}/calls/@BARCODE",
-        "extract_dir = ${base}/extract/@BARCODE",
-        "report_dir = ${base}/report",
-        f"threads = {args.num_threads}",
-        f"jobs = {args.num_jobs}",
-    ]
+    conf = [f"reference = reference/{Path(args.reference).name}"]
+    if args.extra_reference is not None:
+        conf.append(f"extra_references = reference/{Path(args.extra_reference).name}")
+    conf.extend(
+        [
+            "index_dir = indexes",
+            "base = .",
+            "sequence_dir = ${base}/fastq/@SAMPLE",
+            "bam_dir = ${base}/mapping/@BARCODE",
+            "bcf_dir = ${base}/calls/@BARCODE",
+            "extract_dir = ${base}/extract/@BARCODE",
+            "report_dir = ${base}/report",
+            f"threads = {args.num_threads}",
+            f"jobs = {args.num_jobs}",
+        ]
+    )
 
     if args.benchmark_mode:
         conf.append("benchmark_mode = true")
 
-    conf.append("[mapping]")
-
-    if args.underconversion_sequence:
-        underconversion_sequence = args.underconversion_sequence
-    else:
-        underconversion_sequence = extract_underconversion_sequence(
-            args.extra_reference
-        )
-    conf.append(f"underconversion_sequence = {underconversion_sequence}")
+    if args.extra_reference is not None:
+        conf.append("[mapping]")
+        if args.underconversion_sequence:
+            underconversion_sequence = args.underconversion_sequence
+        else:
+            underconversion_sequence = extract_underconversion_sequence(
+                args.extra_reference
+            )
+        conf.append(f"underconversion_sequence = {underconversion_sequence}")
 
     if args.include_file:
         conf.append(f"include {args.include_file}")
@@ -76,7 +79,6 @@ def get_parser() -> argparse.ArgumentParser:
         "-e",
         "--extra-reference",
         help="File name of extra reference for control sequence",
-        required=True,
     )
     parser.add_argument(
         "-r", "--reference", help="File name of reference fasta", required=True
-Original file line number
+Diff line change
@@ Expand Up @@
     }
     ```
+    To process RRBS data, add `"wgbs.pipeline_type": "rrbs"` to your input JSON.
     If you plan on running the pipeline multiple times it is recommended to create the `gemBS` index file once, then use that as input to the runs instead of the raw sequence `fasta` files. To build the references, your input should look like the following:
     ```json
@@ Expand Down @@