Commit c238150: Merge pull request #103 from ENCODE-DCC/dev

v1.3.3

Authored by leepc12 on Nov 1, 2019
2 parents (43ebe90, 22caf9e), commit c238150

Showing 120 changed files with 424 additions and 4,278 deletions.
13 changes: 4 additions & 9 deletions README.md

````diff
@@ -42,15 +42,12 @@
 ## Input JSON file
 
-An input JSON file specifies all the input parameters and files that are necessary for successfully running this pipeline. This includes a specification of the path to the genome reference files and the raw data fastq file. Please make sure to specify absolute paths rather than relative paths in your input JSON files.
-
-[Input JSON file specification](docs/input.md)
-
-## Running a pipeline without Caper
+> **IMPORTANT**: DO NOT BLINDLY USE A TEMPLATE/EXAMPLE INPUT JSON. READ THROUGH THE FOLLOWING GUIDE TO MAKE A CORRECT INPUT JSON FILE.
 
-> **WARNING**: This method has been deprecated. There are many unfixed known bugs. We no longer support it.
+An input JSON file specifies all the input parameters and files that are necessary for successfully running this pipeline. This includes a specification of the path to the genome reference files and the raw data fastq file. Please make sure to specify absolute paths rather than relative paths in your input JSON files.
 
-Caper uses the cromwell workflow execution engine to run the workflow on the platform you specify. While we recommend you use caper, if you want to run cromwell directly without caper you can learn about that [here](docs/deprecated/OLD_METHOD.md).
+1) [Input JSON file specification (short)](docs/input_short.md)
+2) [Input JSON file specification (long)](docs/input.md)
 
 ## Running a pipeline on DNAnexus
 
@@ -67,5 +67,3 @@
 $ pip install croo
 $ croo [METADATA_JSON_FILE]
 ```
-
-There is another [useful tool](utils/qc_jsons_to_tsv/README.md) to make a spreadsheet of QC metrics from multiple workflows. This tool recursively finds and parses all `qc.json` (pipeline's [final output](docs/example_output/v1.1.5/qc.json)) found from a specified root directory. It generates a TSV file that has all quality metrics tabulated in rows for each experiment and replicate. This tool also estimates overall quality of a sample by [a criteria definition JSON file](utils/qc_jsons_to_tsv/criteria.default.json) which can be a good guideline for QC'ing experiments.
````
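For context on the guidance above: a minimal input JSON for a single-ended TF ChIP-seq run might look like the sketch below. Key names follow docs/input.md; every path is a placeholder and must be absolute, as the README requires.

```bash
# A minimal sketch of an input JSON (keys per docs/input.md; all paths
# are placeholders and must be absolute on your system).
cat > chip_input.json << 'EOF'
{
    "chip.pipeline_type" : "tf",
    "chip.genome_tsv" : "/abs/path/to/hg38.tsv",
    "chip.fastqs_rep1_R1" : ["/abs/path/to/rep1.fastq.gz"],
    "chip.ctl_fastqs_rep1_R1" : ["/abs/path/to/ctl1.fastq.gz"],
    "chip.paired_end" : false,
    "chip.title" : "Example run",
    "chip.description" : "Minimal example input JSON"
}
EOF
```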
24 changes: 16 additions & 8 deletions chip.croo.json

```diff
@@ -570,21 +570,25 @@
 "chip.macs2_signal_track": {
     "pval_bw": {
         "path": "signal/rep${i+1}/${basename}",
-        "table": "Signal/Replicate ${i+1}/MACS2 signal track (p-val)"
+        "table": "Signal/Replicate ${i+1}/MACS2 signal track (p-val)",
+        "ucsc_track": "track type=bigWig name=\"MACS2 p-val (rep${i+1})\" priority=${i+1} smoothingWindow=off maxHeightPixels=80:60:40 color=255,0,0 autoScale=off viewLimits=0:40 visibility=full"
     },
     "fc_bw": {
         "path": "signal/rep${i+1}/${basename}",
-        "table": "Signal/Replicate ${i+1}/MACS2 signal track (fold-enrichment)"
+        "table": "Signal/Replicate ${i+1}/MACS2 signal track (fold-enrichment)",
+        "ucsc_track": "track type=bigWig name=\"MACS2 fc (rep${i+1})\" priority=${i+1} smoothingWindow=off maxHeightPixels=80:60:40 color=255,0,0 autoScale=off viewLimits=0:40 visibility=full"
     }
 },
 "chip.macs2_signal_track_pooled": {
     "pval_bw": {
         "path": "signal/pooled-rep/${basename}",
-        "table": "Signal/Pooled replicate/MACS2 signal track (p-val)"
+        "table": "Signal/Pooled replicate/MACS2 signal track (p-val)",
+        "ucsc_track": "track type=bigWig name=\"MACS2 p-val (pooled)\" priority=0 smoothingWindow=off maxHeightPixels=80:60:40 color=255,0,0 autoScale=off viewLimits=0:40 visibility=full"
     },
     "fc_bw": {
         "path": "signal/pooled-rep/${basename}",
-        "table": "Signal/Pooled replicate/MACS2 signal track (fold-enrichment)"
+        "table": "Signal/Pooled replicate/MACS2 signal track (fold-enrichment)",
+        "ucsc_track": "track type=bigWig name=\"MACS2 fc (pooled)\" priority=0 smoothingWindow=off maxHeightPixels=80:60:40 color=255,0,0 autoScale=off viewLimits=0:40 visibility=full"
     }
 },
 "chip.count_signal_track": {
@@ -758,7 +762,8 @@
 },
 "optimal_peak_bb": {
     "path": "peak/idr_reproducibility/${basename}",
-    "table": "Peak/IDR reproducibility/Optimal peak (BigBed)"
+    "table": "Peak/IDR reproducibility/Optimal peak (BigBed)",
+    "ucsc_track": "track type=bigBed name=\"Optimal IDR peak\" priority=100 color=0,0,255 visibility=full"
 },
 "optimal_peak_hammock": {
     "path": "peak/idr_reproducibility/${basename}",
@@ -770,7 +775,8 @@
 },
 "conservative_peak_bb": {
     "path": "peak/idr_reproducibility/${basename}",
-    "table": "Peak/IDR reproducibility/Conservative peak (BigBed)"
+    "table": "Peak/IDR reproducibility/Conservative peak (BigBed)",
+    "ucsc_track": "track type=bigBed name=\"Conservative IDR peak\" priority=100 color=0,0,255 visibility=full"
 },
 "conservative_peak_hammock": {
     "path": "peak/idr_reproducibility/${basename}",
@@ -788,7 +794,8 @@
 },
 "optimal_peak_bb": {
     "path": "peak/overlap_reproducibility/${basename}",
-    "table": "Peak/Overlap reproducibility/Optimal peak (BigBed)"
+    "table": "Peak/Overlap reproducibility/Optimal peak (BigBed)",
+    "ucsc_track": "track type=bigBed name=\"Optimal overlap peak\" priority=100 color=0,0,255 visibility=full"
 },
 "optimal_peak_hammock": {
     "path": "peak/overlap_reproducibility/${basename}",
@@ -800,7 +807,8 @@
 },
 "conservative_peak_bb": {
     "path": "peak/overlap_reproducibility/${basename}",
-    "table": "Peak/Overlap reproducibility/Conservative peak (BigBed)"
+    "table": "Peak/Overlap reproducibility/Conservative peak (BigBed)",
+    "ucsc_track": "track type=bigBed name=\"Conservative overlap peak\" priority=100 color=0,0,255 visibility=full"
 },
 "conservative_peak_hammock": {
     "path": "peak/overlap_reproducibility/${basename}",
```
33 changes: 26 additions & 7 deletions chip.wdl

```diff
@@ -1,12 +1,12 @@
 # ENCODE TF/Histone ChIP-Seq pipeline
 # Author: Jin Lee ([email protected])
-#CAPER docker quay.io/encode-dcc/chip-seq-pipeline:v1.3.2
-#CAPER singularity docker://quay.io/encode-dcc/chip-seq-pipeline:v1.3.2
+#CAPER docker quay.io/encode-dcc/chip-seq-pipeline:v1.3.3
+#CAPER singularity docker://quay.io/encode-dcc/chip-seq-pipeline:v1.3.3
 #CROO out_def https://storage.googleapis.com/encode-pipeline-output-definition/chip.croo.json
 
 workflow chip {
-    String pipeline_ver = 'v1.3.2'
+    String pipeline_ver = 'v1.3.3'
     ### sample name, description
     String title = 'Untitled'
     String description = 'No description'
@@ -94,7 +94,7 @@ workflow chip {
     Int align_cpu = 4
     Int align_mem_mb = 20000
     Int align_time_hr = 48
-    String align_disks = 'local-disk 200 HDD'
+    String align_disks = 'local-disk 400 HDD'
 
     Int filter_cpu = 2
     Int filter_mem_mb = 20000
@@ -127,6 +127,9 @@ workflow chip {
     Int call_peak_time_hr = 72
     String call_peak_disks = 'local-disk 200 HDD'
 
+    String filter_picard_java_heap = '4G'
+    String gc_bias_picard_java_heap = '6G'
+
     #### input file definition
     # pipeline can start from any type of inputs and then leave all other types undefined
     # supported types: fastq, bam, nodup_bam (filtered bam), ta (tagAlign), peak
@@ -219,10 +222,13 @@ workflow chip {
     File? blacklist2_ = if defined(blacklist2) then blacklist2
                         else read_genome_tsv.blacklist2
     # merge multiple blacklists
+    # two blacklists can have different number of columns (3 vs 6)
+    # so we limit merged blacklist's columns to 3
     Array[File] blacklists = select_all([blacklist1_, blacklist2_])
     if ( length(blacklists) > 1 ) {
         call pool_ta as pool_blacklist { input:
             tas = blacklists,
+            col = 3,
         }
     }
     File? blacklist_ = if length(blacklists) > 1 then pool_blacklist.ta_pooled
@@ -421,6 +427,7 @@ workflow chip {
             cpu = filter_cpu,
             mem_mb = filter_mem_mb,
+            picard_java_heap = filter_picard_java_heap,
             time_hr = filter_time_hr,
             disks = filter_disks,
         }
@@ -466,6 +473,7 @@ workflow chip {
         call gc_bias { input :
             nodup_bam = nodup_bam_,
             ref_fa = ref_fa_,
+            picard_java_heap = gc_bias_picard_java_heap,
         }
     }
@@ -502,6 +510,7 @@ workflow chip {
             cpu = filter_cpu,
             mem_mb = filter_mem_mb,
+            picard_java_heap = filter_picard_java_heap,
             time_hr = filter_time_hr,
             disks = filter_disks,
         }
@@ -534,6 +543,7 @@ workflow chip {
             cpu = filter_cpu,
             mem_mb = filter_mem_mb,
+            picard_java_heap = filter_picard_java_heap,
             time_hr = filter_time_hr,
             disks = filter_disks,
         }
@@ -630,6 +640,7 @@ workflow chip {
            cpu = filter_cpu,
            mem_mb = filter_mem_mb,
+           picard_java_heap = filter_picard_java_heap,
            time_hr = filter_time_hr,
            disks = filter_disks,
        }
@@ -1277,8 +1288,10 @@ task filter {
     File chrsz             # 2-col chromosome sizes file
     Boolean no_dup_removal # no dupe reads removal when filtering BAM
     String mito_chr_name
+
     Int cpu
     Int mem_mb
+    String picard_java_heap
     Int time_hr
     String disks
 
@@ -1293,7 +1306,8 @@ task filter {
         ${'--chrsz ' + chrsz} \
         ${if no_dup_removal then '--no-dup-removal' else ''} \
         ${'--mito-chr-name ' + mito_chr_name} \
-        ${'--nth ' + cpu}
+        ${'--nth ' + cpu} \
+        ${'--picard-java-heap ' + picard_java_heap}
     }
     output {
         File nodup_bam = glob('*.bam')[0]
```
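The new `picard_java_heap` inputs above are threaded through to each task's Python wrapper as `--picard-java-heap`, which presumably becomes the JVM heap cap for the underlying Picard step (duplicate marking in `filter`). A hedged sketch of the kind of call this bounds, with hypothetical file names:

```bash
# Assumed effect of filter_picard_java_heap = '4G': cap the JVM heap of
# the Picard MarkDuplicates call launched by the wrapper script.
# All file names below are hypothetical.
java -Xmx4G -jar picard.jar MarkDuplicates \
    INPUT=rep1.bam \
    OUTPUT=rep1.dupmark.bam \
    METRICS_FILE=rep1.dup.qc
```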
```diff
@@ -1366,10 +1380,12 @@ task spr { # make two self pseudo replicates
 task pool_ta {
     Array[File?] tas
+    Int? col    # number of columns in pooled TA
 
     command {
         python3 $(which encode_task_pool_ta.py) \
-            ${sep=' ' tas}
+            ${sep=' ' tas} \
+            ${'--col ' + col}
     }
     output {
         File ta_pooled = glob('*.tagAlign.gz')[0]
```
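`pool_ta` now accepts an optional `col`, which the workflow sets to 3 when pooling blacklists so that a 3-column and a 6-column BED can be merged safely. In shell terms, the truncating merge amounts to something like the sketch below; the real work happens in `encode_task_pool_ta.py`, and the file names are placeholders.

```bash
# Sketch of pooling two blacklists while truncating to 3 BED columns.
# encode_task_pool_ta.py does the real work; file names are placeholders.
zcat -f blacklist1.bed.gz blacklist2.bed.gz \
    | cut -f1-3 \
    | sort -k1,1 -k2,2n \
    | gzip -nc > pooled_blacklist.bed.gz
```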
```diff
@@ -1755,10 +1771,13 @@ task gc_bias {
     File nodup_bam
     File ref_fa
+    String picard_java_heap
+
     command {
         python3 $(which encode_task_gc_bias.py) \
             ${'--nodup-bam ' + nodup_bam} \
-            ${'--ref-fa ' + ref_fa}
+            ${'--ref-fa ' + ref_fa} \
+            ${'--picard-java-heap ' + picard_java_heap}
     }
     output {
         File gc_plot = glob('*.gc_plot.png')[0]
```
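`gc_bias` likewise gets its own heap setting (6G by default). Assuming the wrapper drives Picard's CollectGcBiasMetrics, the bounded call would look roughly as follows; the tool choice and file names are assumptions, not taken from this diff.

```bash
# Assumed underlying Picard call for the gc_bias task under the 6G heap
# cap. CollectGcBiasMetrics and all file names are assumptions.
java -Xmx6G -jar picard.jar CollectGcBiasMetrics \
    R=ref.fa \
    I=rep1.nodup.bam \
    O=rep1.gc_bias_metrics.txt \
    CHART=rep1.gc_plot.pdf \
    S=rep1.gc_summary.txt
```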
17 changes: 0 additions & 17 deletions dev/backends/backend_with_db.conf

This file was deleted.

32 changes: 4 additions & 28 deletions dev/dev.md → dev/build_on_dx.sh

````diff
@@ -1,30 +1,7 @@
-# Dev
-
-## Command line for version change
-```bash
-PREV_VER=v1.3.2
-NEW_VER=v1.3.2
-for f in $(grep -rl ${PREV_VER} --include=*.{wdl,md,sh})
-do
-  sed -i "s/${PREV_VER}/${NEW_VER}/g" ${f}
-done
-cd dev/workflow_opts
-for f in $(grep -rl ${PREV_VER} --include=*.json)
-do
-  sed -i "s/${PREV_VER}/${NEW_VER}/g" ${f}
-done
-cd ../../
-```
-
-## Building templates on DX for each genome
-
-Make sure that you have [`dxWDL-0.79.1.jar`](https://github.com/DNAnexus/dxWDL/releases/download/0.79.1/dxWDL-0.79.1.jar) on your `$HOME`. Install [DNAnexus Platform SDK](https://wiki.DNAnexus.com/downloads) with `pip install dxpy`. Log-in on DNAnexus with `dx login` and choose "ENCODE Uniform Processing Pipelines" (name of our official DNAnexus project for pipelines).
-
-Run the following command line locally to build out DX workflows for this pipeline on our official one. This will overwrite (`-f` parameter does it).
-
-```bash
-# version
-VER=v1.3.2
+#!/bin/bash
+set -e
+
+VER=$(cat chip.wdl | grep "#CAPER docker" | awk 'BEGIN{FS=":"} {print $2}')
 DOCKER=quay.io/encode-dcc/chip-seq-pipeline:$VER
 
 # general
@@ -79,4 +56,3 @@
 
 # test sample SE ENCSR000DYI (subsampled, chr19/chrM only)
 java -jar ~/dxWDL-0.79.1.jar compile chip.wdl -project "ENCODE Uniform Processing Pipelines Azure" -extras <(echo "{\"default_runtime_attributes\":{\"docker\":\"${DOCKER}\"}}") -f -folder /ChIP-seq2/workflows/$VER/test_ENCSR000DYI_subsampled_chr19_only -defaults example_input_json/dx_azure/ENCSR000DYI_subsampled_chr19_only_dx_azure.json
-```
````
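The rewritten script derives the version from the `#CAPER docker` line in chip.wdl rather than from a hand-edited variable; the deleted doc shows the failure mode this avoids (its `PREV_VER` and `NEW_VER` were both stuck at v1.3.2). The extraction simply splits that line on `:` and keeps the image tag:

```bash
# Step-by-step view of the VER extraction used in build_on_dx.sh.
grep "#CAPER docker" chip.wdl
# -> #CAPER docker quay.io/encode-dcc/chip-seq-pipeline:v1.3.3

grep "#CAPER docker" chip.wdl | awk 'BEGIN{FS=":"} {print $2}'
# -> v1.3.3
```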
26 changes: 0 additions & 26 deletions dev/examples/caper/ENCSR936XTK_subsampled_chr19_only.json

This file was deleted.

27 changes: 0 additions & 27 deletions dev/examples/caper/ENCSR936XTK_subsampled_chr19_only_rm_chrM.json

This file was deleted.

19 changes: 0 additions & 19 deletions dev/examples/dx/ENCSR000DYI_dx.json

This file was deleted.

18 changes: 0 additions & 18 deletions dev/examples/dx/ENCSR000DYI_subsampled_chr19_only_dx.json

This file was deleted.
