PIP-1240 add cpg correlation (#33)

ENCODE-DCC · Jun 17, 2020 · 52f334d · 52f334d
1 parent d63d65c
commit 52f334d
Show file tree

Hide file tree

Showing 26 changed files with 303 additions and 92 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -4,3 +4,4 @@
 !wgbs_pipeline/*/
 !Cargo.*
 !conf/*
+!requirements*.txt
diff --git a/.isort.cfg b/.isort.cfg
@@ -4,4 +4,4 @@ include_trailing_comma = True
 force_grid_wrap = 0
 use_parentheses = True
 line_length = 88
-known_third_party = attr,bs4,pytest,setuptools
+known_third_party = attr,bs4,pandas,pytest,qc_utils
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -51,10 +51,11 @@
       - id: flake8
       - id: trailing-whitespace
       - id: end-of-file-fixer
+        exclude: tests\/data\/bed_pearson_correlation_qc\.json$
       - id: debug-statements
       - id: check-json
       - id: pretty-format-json
-        exclude: tests\/data\/sample.+\.json$
+        exclude: tests\/data\/.+\.json$
         args:
           - --autofix
       - id: check-yaml

diff --git a/Dockerfile b/Dockerfile
@@ -33,7 +33,9 @@ RUN apt-get update && \
     && wget https://bootstrap.pypa.io/get-pip.py && python3 get-pip.py && rm get-pip.py \
     && rm -rf /var/lib/apt/lists/*
 
-RUN pip3 install --no-cache-dir beautifulsoup4 matplotlib==3.0.2 multiprocess
+COPY requirements*.txt ./
+RUN pip3 install --no-cache-dir -r requirements.txt -r requirements-gembs.txt && \
+    rm requirements*.txt
 
 # Install bsmooth.R dependencies
 RUN echo "r <- getOption('repos'); r['CRAN'] <- 'https://cloud.r-project.org'; options(repos = r);" > ~/.Rprofile && \

diff --git a/pyproject.toml b/pyproject.toml
diff --git a/requirements-gembs.txt b/requirements-gembs.txt
@@ -0,0 +1,2 @@
+matplotlib==3.0.2
+multiprocess
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,3 @@
+beautifulsoup4==4.8.2
+pandas==1.0.4
+qc-utils>=20.6.1
diff --git a/setup.py b/setup.py
diff --git a/tests/caper_run.sh b/tests/caper_run.sh
@@ -19,7 +19,17 @@ fi
 
 WDL=$1
 INPUT=$2
+WORKFLOW_OPTIONS="tests/pytest_workflow_options.json"
+WORKFLOW_OPTIONS_FLAG="--no-relative-output-paths"
 
-echo "Running caper with WDL ${WDL}, input ${INPUT}, and image ${WGBS_DOCKER_IMAGE_TAG}"
+if [ $# -gt 2 ]; then
+    if [ "$3" != "${WORKFLOW_OPTIONS_FLAG}" ]; then
+        echo "Third argument must be ${WORKFLOW_OPTIONS_FLAG}"
+        exit 1
+    fi
+    WORKFLOW_OPTIONS="tests/pytest_workflow_no_relative_output_paths.json"
+fi
+
+echo "Running caper with WDL ${WDL}, input ${INPUT}, workflow options ${WORKFLOW_OPTIONS}, and image ${WGBS_DOCKER_IMAGE_TAG}"
 
-caper run "${WDL}" -i "${INPUT}" --docker "${WGBS_DOCKER_IMAGE_TAG}" -o ./tests/pytest_workflow_options.json
+caper run "${WDL}" -i "${INPUT}" --docker "${WGBS_DOCKER_IMAGE_TAG}" -o "./${WORKFLOW_OPTIONS}"
diff --git a/tests/data/bed_pearson_correlation_qc.json b/tests/data/bed_pearson_correlation_qc.json
@@ -0,0 +1 @@
+{"pearson_correlation": {"pearson_correlation": 0.9446482132888102}}
diff --git a/tests/data/encode_cpg_10e5_gt_10x_coverage.bed.gz b/tests/data/encode_cpg_10e5_gt_10x_coverage.bed.gz
diff --git a/tests/data/sample5_data_1_200000_s100_100000.fastq.gz b/tests/data/sample5_data_1_200000_s100_100000.fastq.gz
diff --git a/tests/data/sample5_data_2_200000_s100_100000.fastq.gz b/tests/data/sample5_data_2_200000_s100_100000.fastq.gz
diff --git a/tests/functional/json/test_wgbs_two_reps.json b/tests/functional/json/test_wgbs_two_reps.json
@@ -0,0 +1,24 @@
+{
+  "wgbs.benchmark_mode": true,
+  "wgbs.extra_reference": "tests/data/conversion_control.fa.gz",
+  "wgbs.fastqs": [
+    [
+      [
+        "tests/data/sample5_data_1_200000.fastq.gz",
+        "tests/data/sample5_data_2_200000.fastq.gz"
+      ]
+    ],
+    [
+      [
+        "tests/data/sample5_data_1_200000_s100_100000.fastq.gz",
+        "tests/data/sample5_data_2_200000_s100_100000.fastq.gz"
+      ]
+    ]
+  ],
+  "wgbs.include_conf_file": "/software/conf/IHEC_standard.conf",
+  "wgbs.indexed_contig_sizes": "tests/data/sacCer3.contig.sizes",
+  "wgbs.indexed_reference": "tests/data/indexes.tar.gz",
+  "wgbs.reference": "tests/data/sacCer3.fa.gz",
+  "wgbs.run_bsmooth": false,
+  "wgbs.underconversion_sequence_name": "NC_001416.1"
+}
diff --git a/tests/functional/test_wgbs.yaml b/tests/functional/test_wgbs.yaml
@@ -73,3 +73,5 @@
         md5sum: a09ae01f70fa6d2461e37d5814ceb579
       - path: test-output/coverage.bw
         md5sum: afa224c2037829dccacea4a67b6fa84a
+      - path: test-output/bed_pearson_correlation_qc.json
+        should_exist: false
diff --git a/tests/functional/test_wgbs_two_reps.py b/tests/functional/test_wgbs_two_reps.py
@@ -0,0 +1,22 @@
+import json
+import math
+
+import pytest
+
+
+@pytest.mark.workflow("test_wgbs_two_reps")
+def test_wgbs_two_reps_check_pearson_qc(workflow_dir, test_data_dir):
+    with open(test_data_dir / "bed_pearson_correlation_qc.json") as f:
+        expected_qc = json.load(f)
+
+    result_path = next(
+        (workflow_dir / "test-output").glob(
+            "wgbs/*/call-calculate_bed_pearson_correlation/execution/bed_pearson_correlation_qc.json"
+        )
+    )
+    with open(result_path) as f:
+        result_qc = json.load(f)
+
+    result = result_qc["pearson_correlation"]["pearson_correlation"]
+    expected = expected_qc["pearson_correlation"]["pearson_correlation"]
+    assert math.isclose(result, expected, rel_tol=1e-5)
diff --git a/tests/functional/test_wgbs_two_reps.yaml b/tests/functional/test_wgbs_two_reps.yaml
@@ -0,0 +1,12 @@
+---
+  - name: test_wgbs_two_reps
+    tags:
+      - functional
+    # Float output in the QC is subject to rounding errors, so it needs a custom
+    # comparison. Due to path collisions we cannot use relative output paths for
+    # Cromwell outputs; the workflow will fail otherwise.
+    command: >-
+      tests/caper_run.sh
+      wgbs-pipeline.wdl
+      tests/functional/json/test_wgbs_two_reps.json
+      --no-relative-output-paths
diff --git a/tests/integration/json/test_calculate_bed_pearson_correlation.json b/tests/integration/json/test_calculate_bed_pearson_correlation.json
@@ -0,0 +1,4 @@
+{
+  "test_calculate_bed_pearson_correlation.bed1": "tests/data/encode_cpg_10e5.bed.gz",
+  "test_calculate_bed_pearson_correlation.bed2": "tests/data/encode_cpg_10e5_gt_10x_coverage.bed.gz"
+}
diff --git a/tests/integration/test_calculate_bed_pearson_correlation.yaml b/tests/integration/test_calculate_bed_pearson_correlation.yaml
@@ -0,0 +1,11 @@
+---
+  - name: test_calculate_bed_pearson_correlation
+    tags:
+      - integration
+    command: >-
+      tests/caper_run.sh
+      tests/integration/wdl/test_calculate_bed_pearson_correlation.wdl
+      tests/integration/json/test_calculate_bed_pearson_correlation.json
+    files:
+      - path: test-output/bed_pearson_correlation_qc.json
+        md5sum: cddfd85f87898abeff6701f1c58c71f7
diff --git a/tests/integration/wdl/test_calculate_bed_pearson_correlation.wdl b/tests/integration/wdl/test_calculate_bed_pearson_correlation.wdl
@@ -0,0 +1,11 @@
+import "../../../wgbs-pipeline.wdl" as wgbs
+
+workflow test_calculate_bed_pearson_correlation {
+    File bed1
+    File bed2
+
+    call wgbs.calculate_bed_pearson_correlation { input:
+        bed1 = bed1,
+        bed2 = bed2,
+    }
+}
diff --git a/tests/pytest_workflow_no_relative_output_paths.json b/tests/pytest_workflow_no_relative_output_paths.json
@@ -0,0 +1,8 @@
+{
+  "default_runtime_attributes": {
+    "docker_user": "$EUID"
+  },
+  "final_workflow_outputs_dir": "test-output",
+  "maxRetries": 1,
+  "use_relative_output_paths": false
+}
diff --git a/tests/python/__init__.py b/tests/python/__init__.py
diff --git a/tests/python/test_calculate_bed_pearson_correlation.py b/tests/python/test_calculate_bed_pearson_correlation.py
@@ -0,0 +1,66 @@
+from contextlib import suppress as does_not_raise
+from io import BytesIO
+
+import pandas as pd
+import pytest
+
+from wgbs_pipeline.calculate_bed_pearson_correlation import (
+    DF_COLUMN_NAMES,
+    calculate_pearson,
+    get_parser,
+    load_bedmethyl,
+    make_pearson_qc,
+)
+
+
+def test_load_bedmethyl():
+    data = BytesIO(
+        (
+            b'track name="ENCSR156JXJ" description="ENCSR156JXJ" visibility=2 itemRgb="'
+            b'On"\nchr1\t10468\t10469\t"ENCSR156JXJ"\t1\t+\t10468\t10469\t255,0,0\t1\t1'
+            b"00\tCG\tCG\t2"
+        )
+    )
+    df = load_bedmethyl(data)
+    assert df.shape[0] == 1
+    expected = ("chr1", 10468, 10469, 1, 100)
+    for key, expected in zip(DF_COLUMN_NAMES, expected):
+        assert df.at[0, key] == expected
+
+
+def test_calculate_pearson():
+    df1 = pd.DataFrame.from_records(
+        [("chr1", 1, 2, 14, 50), ("chr1", 3, 4, 12, 100), ("chr1", 5, 6, 3, 100)],
+        columns=DF_COLUMN_NAMES,
+    )
+    df2 = pd.DataFrame.from_records(
+        [
+            ("chr1", 1, 2, 16, 25),
+            ("chr1", 3, 4, 12, 50),
+            ("chr1", 5, 6, 3, 100),
+            ("chr1", 7, 8, 3, 100),
+        ],
+        columns=DF_COLUMN_NAMES,
+    )
+    pearson = calculate_pearson(df1, df2)
+    assert pearson == 1.0
+
+
+def test_make_pearson_qc():
+    qc = make_pearson_qc(0.33)
+    assert dict(qc.to_ordered_dict()) == {
+        "pearson_correlation": {"pearson_correlation": 0.33}
+    }
+
+
+@pytest.mark.parametrize(
+    "condition,args",
+    [
+        (does_not_raise(), ["--bedmethyls", "foo", "bar", "--outfile", "baz"]),
+        (pytest.raises(SystemExit), ["--bedmethyls", "foo", "--outfile", "baz"]),
+    ],
+)
+def test_get_parser(condition, args):
+    parser = get_parser()
+    with condition:
+        parser.parse_args(args)
diff --git a/tox.ini b/tox.ini
@@ -1,9 +1,17 @@
 [tox]
 envlist = lint,py37,coverage-report
 isolated_build = True
+skipsdist = True
+
+[base]
+deps =
+    attrs
+    pytest
+    pytest-mock
+    -rrequirements.txt
 
 [testenv]
-extras = tests
+deps = {[base]deps}
 commands = python -m pytest --ignore=tests/functional/ --ignore=tests/integration --ignore=tests/unit --noconftest {posargs}
 
 [testenv:wdl]
@@ -22,9 +30,10 @@ deps = pre-commit
 commands = pre-commit run --all-files
 
 [testenv:coverage-report]
-basepython = python3.7
-extras = tests
 commands = pytest --cov-report term-missing --ignore=tests/functional/ --ignore=tests/integration --ignore=tests/unit --noconftest --cov=wgbs_pipeline
+deps =
+    pytest-cov
+    {[base]deps}
 
 [flake8]
 max_line_length = 88
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"pearson_correlation": {"pearson_correlation": 0.9446482132888102}}