From 2755f91a1dd25994ae660032851e66cf048605e3 Mon Sep 17 00:00:00 2001 From: Jin Lee Date: Thu, 24 Oct 2019 19:25:57 -0700 Subject: [PATCH 01/15] croo: added UCSC browser tracks (MACS2 signal p-val/fc bigWigs, opt/consv idr/overlap peak bigBeds) --- chip.croo.json | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/chip.croo.json b/chip.croo.json index ad9b20a5..253c3390 100644 --- a/chip.croo.json +++ b/chip.croo.json @@ -570,21 +570,25 @@ "chip.macs2_signal_track": { "pval_bw": { "path": "signal/rep${i+1}/${basename}", - "table": "Signal/Replicate ${i+1}/MACS2 signal track (p-val)" + "table": "Signal/Replicate ${i+1}/MACS2 signal track (p-val)", + "ucsc_track": "track type=bigWig name=\"MACS2 p-val (rep${i+1})\" priority=${i+1} smoothingWindow=off maxHeightPixels=80:60:40 color=255,0,0 autoScale=off viewLimits=0:40 visibility=full" }, "fc_bw": { "path": "signal/rep${i+1}/${basename}", - "table": "Signal/Replicate ${i+1}/MACS2 signal track (fold-enrichment)" + "table": "Signal/Replicate ${i+1}/MACS2 signal track (fold-enrichment)", + "ucsc_track": "track type=bigWig name=\"MACS2 fc (rep${i+1})\" priority=${i+1} smoothingWindow=off maxHeightPixels=80:60:40 color=255,0,0 autoScale=off viewLimits=0:40 visibility=full" } }, "chip.macs2_signal_track_pooled": { "pval_bw": { "path": "signal/pooled-rep/${basename}", - "table": "Signal/Pooled replicate/MACS2 signal track (p-val)" + "table": "Signal/Pooled replicate/MACS2 signal track (p-val)", + "ucsc_track": "track type=bigWig name=\"MACS2 p-val (pooled)\" priority=0 smoothingWindow=off maxHeightPixels=80:60:40 color=255,0,0 autoScale=off viewLimits=0:40 visibility=full" }, "fc_bw": { "path": "signal/pooled-rep/${basename}", - "table": "Signal/Pooled replicate/MACS2 signal track (fold-enrichment)" + "table": "Signal/Pooled replicate/MACS2 signal track (fold-enrichment)", + "ucsc_track": "track type=bigWig name=\"MACS2 fc (pooled)\" priority=0 smoothingWindow=off maxHeightPixels=80:60:40 color=255,0,0 autoScale=off viewLimits=0:40 visibility=full" } }, "chip.count_signal_track": { @@ -758,7 +762,8 @@ }, "optimal_peak_bb": { "path": "peak/idr_reproducibility/${basename}", - "table": "Peak/IDR reproducibility/Optimal peak (BigBed)" + "table": "Peak/IDR reproducibility/Optimal peak (BigBed)", + "ucsc_track": "track type=bigBed name=\"Optimal IDR peak\" priority=100 color=0,0,255 visibility=full" }, "optimal_peak_hammock": { "path": "peak/idr_reproducibility/${basename}", @@ -770,7 +775,8 @@ }, "conservative_peak_bb": { "path": "peak/idr_reproducibility/${basename}", - "table": "Peak/IDR reproducibility/Conservative peak (BigBed)" + "table": "Peak/IDR reproducibility/Conservative peak (BigBed)", + "ucsc_track": "track type=bigBed name=\"Conservative IDR peak\" priority=100 color=0,0,255 visibility=full" }, "conservative_peak_hammock": { "path": "peak/idr_reproducibility/${basename}", @@ -788,7 +794,8 @@ }, "optimal_peak_bb": { "path": "peak/overlap_reproducibility/${basename}", - "table": "Peak/Overlap reproducibility/Optimal peak (BigBed)" + "table": "Peak/Overlap reproducibility/Optimal peak (BigBed)", + "ucsc_track": "track type=bigBed name=\"Optimal overlap peak\" priority=100 color=0,0,255 visibility=full" }, "optimal_peak_hammock": { "path": "peak/overlap_reproducibility/${basename}", @@ -800,7 +807,8 @@ }, "conservative_peak_bb": { "path": "peak/overlap_reproducibility/${basename}", - "table": "Peak/Overlap reproducibility/Conservative peak (BigBed)" + "table": "Peak/Overlap reproducibility/Conservative 
peak (BigBed)", + "ucsc_track": "track type=bigBed name=\"Conservative overlap peak\" priority=100 color=0,0,255 visibility=full" }, "conservative_peak_hammock": { "path": "peak/overlap_reproducibility/${basename}", From 813d31f04ef2bdbc46ed4822425ea1789328e16d Mon Sep 17 00:00:00 2001 From: Jin Lee Date: Mon, 28 Oct 2019 13:42:21 -0700 Subject: [PATCH 02/15] src: sync with atac (adapter trimmer param) --- src/encode_task_trim_adapter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/encode_task_trim_adapter.py b/src/encode_task_trim_adapter.py index 96812407..78b0f5ad 100755 --- a/src/encode_task_trim_adapter.py +++ b/src/encode_task_trim_adapter.py @@ -66,11 +66,11 @@ def parse_arguments(debug=False): args.adapters = [[a] for a in args.adapters] # make it a matrix # if adapter not given - if not args.adapters: # fill empty string in adapter list + if args.adapter or not args.adapters: # fill empty string in adapter list args.adapters = copy.deepcopy(args.fastqs) for i, adapters in enumerate(args.adapters): for j, adapter in enumerate(adapters): - args.adapters[i][j] = '' + args.adapters[i][j] = args.adapter if args.adapter else '' # check if fastqs, adapers have same/correct dimension if len(args.adapters) != len(args.fastqs): From 540bcf80980469217b2730d60aad5a1dee0d1bec Mon Sep 17 00:00:00 2001 From: Jin Lee Date: Mon, 28 Oct 2019 13:43:05 -0700 Subject: [PATCH 03/15] wdl, src: fix merged blacklist's col at 3 --- chip.wdl | 7 ++++++- src/encode_task_pool_ta.py | 12 +++++++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/chip.wdl b/chip.wdl index 22b7ed88..71b0eaa0 100644 --- a/chip.wdl +++ b/chip.wdl @@ -219,10 +219,13 @@ workflow chip { File? blacklist2_ = if defined(blacklist2) then blacklist2 else read_genome_tsv.blacklist2 # merge multiple blacklists + # two blacklists can have different number of columns (3 vs 6) + # so we limit merged blacklist's columns to 3 Array[File] blacklists = select_all([blacklist1_, blacklist2_]) if ( length(blacklists) > 1 ) { call pool_ta as pool_blacklist { input: tas = blacklists, + col = 3, } } File? blacklist_ = if length(blacklists) > 1 then pool_blacklist.ta_pooled @@ -1366,10 +1369,12 @@ task spr { # make two self pseudo replicates task pool_ta { Array[File?] tas + Int? col # number of columns in pooled TA command { python3 $(which encode_task_pool_ta.py) \ - ${sep=' ' tas} + ${sep=' ' tas} \ + ${'--col ' + col} } output { File ta_pooled = glob('*.tagAlign.gz')[0] diff --git a/src/encode_task_pool_ta.py b/src/encode_task_pool_ta.py index 4edc91a2..ff767032 100755 --- a/src/encode_task_pool_ta.py +++ b/src/encode_task_pool_ta.py @@ -17,6 +17,9 @@ def parse_arguments(): help='List of TAGALIGNs to be pooled.') parser.add_argument('--out-dir', default='', type=str, help='Output directory.') + parser.add_argument('--col', + help='Number of columns to keep in a pooled TAGALIGN. 
' 'Keep all columns if not defined.') parser.add_argument('--log-level', default='INFO', choices=['NOTSET', 'DEBUG', 'INFO', 'WARNING', 'CRITICAL', 'ERROR', @@ -29,13 +32,16 @@ def parse_arguments(): return args -def pool_ta(tas, out_dir): +def pool_ta(tas, col, out_dir): if len(tas) > 1: prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(tas[0]))) pooled_ta = '{}.pooled.tagAlign.gz'.format(prefix) - cmd = 'zcat -f {} | gzip -nc > {}' + cmd = 'zcat -f {} | ' + if col is not None: + cmd += 'cut -f 1-{} | '.format(col) + cmd += 'gzip -nc > {}' cmd = cmd.format( ' '.join(tas), pooled_ta) @@ -53,7 +59,7 @@ def main(): mkdir_p(args.out_dir) log.info('Pooling TAGALIGNs...') - pool_ta(args.tas, args.out_dir) + pool_ta(args.tas, args.col, args.out_dir) log.info('List all files in output directory...') ls_l(args.out_dir) From af19389002026a52ae4653a36ec0ab4e2abd1430 Mon Sep 17 00:00:00 2001 From: Jin Lee Date: Mon, 28 Oct 2019 14:19:38 -0700 Subject: [PATCH 04/15] sync with atac: bug fix for uncompressed fastqs --- src/detect_adapter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/detect_adapter.py b/src/detect_adapter.py index 1991bfa1..eea6470e 100755 --- a/src/detect_adapter.py +++ b/src/detect_adapter.py @@ -13,7 +13,7 @@ def open_gz(fname): - return gzip.open(fname) if fname.endswith('.gz') else open(fname, 'r') + return gzip.open(fname) if fname.endswith('.gz') else open(fname, 'rb') def detect_adapters_and_cnts(fname, max_n_lines=1000000): From bbbbde87dc91a6659d0f3976e0d2e1f1e191d2dd Mon Sep 17 00:00:00 2001 From: Jin Lee Date: Mon, 28 Oct 2019 16:31:50 -0700 Subject: [PATCH 05/15] doc: add desc for chip.fraglen --- docs/input.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/input.md b/docs/input.md index 25daa691..7cb90e74 100644 --- a/docs/input.md +++ b/docs/input.md @@ -182,6 +182,14 @@ Parameter|Default|Description `chip.enable_gc_bias` | true | Enable GC bias calculation `chip.enable_count_signal_track` | false | Enable count signal track generation +## Optional parameter for fragment length + +Our pipeline automatically estimates fragment lengths (required for TF ChIP-Seq) from cross-correlation analyses (task `xcor`), but `chip.fraglen`, if defined, will override the estimated values. Use this if your pipeline fails due to an invalid (negative) fragment length estimated from the cross-correlation analysis. + +Parameter|Type|Description ---------|-----|----------- `chip.fraglen` | `Array[Int]` | Fragment length for each replicate. 
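For example, to override the estimated values with fixed fragment lengths in a two-replicate experiment, a minimal input JSON fragment could look like the sketch below (the values 200 and 180 are illustrative only, not recommendations):

```json
{
    "chip.fraglen" : [200, 180]
}
```

One integer is expected per replicate, in the same order as the replicates are defined in the input JSON.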
+ ## Other optional parameters Parameter|Default|Description From 5a4b4671bfda00fe0b8936fbc5bb9662b133f000 Mon Sep 17 00:00:00 2001 From: Jin Lee Date: Mon, 28 Oct 2019 16:43:12 -0700 Subject: [PATCH 06/15] default, doc: default disk dize for align (200G->400G) --- chip.wdl | 2 +- docs/input.md | 10 +++++----- example_input_json/template.full.json | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/chip.wdl b/chip.wdl index 71b0eaa0..66797433 100644 --- a/chip.wdl +++ b/chip.wdl @@ -94,7 +94,7 @@ workflow chip { Int align_cpu = 4 Int align_mem_mb = 20000 Int align_time_hr = 48 - String align_disks = 'local-disk 200 HDD' + String align_disks = 'local-disk 400 HDD' Int filter_cpu = 2 Int filter_mem_mb = 20000 diff --git a/docs/input.md b/docs/input.md index 7cb90e74..75db8beb 100644 --- a/docs/input.md +++ b/docs/input.md @@ -207,14 +207,14 @@ Parameter|Default `chip.align_cpu` | 4 `chip.align_mem_mb` | 20000 `chip.align_time_hr` | 48 -`chip.align_disks` | `local-disk 100 HDD` +`chip.align_disks` | `local-disk 400 HDD` Parameter|Default ---------|------- `chip.filter_cpu` | 2 `chip.filter_mem_mb` | 20000 `chip.filter_time_hr` | 24 -`chip.filter_disks` | `local-disk 100 HDD` +`chip.filter_disks` | `local-disk 400 HDD` Parameter|Default ---------|------- @@ -232,7 +232,7 @@ Parameter|Default `chip.jsd_cpu` | 2 `chip.jsd_mem_mb` | 12000 `chip.jsd_time_hr` | 6 -`chip.jsd_disks` | `local-disk 100 HDD` +`chip.jsd_disks` | `local-disk 200 HDD` Parameter|Default ---------|------- @@ -246,13 +246,13 @@ Parameter|Default `chip.call_peak_cpu` | 2 `chip.call_peak_mem_mb` | 16000 `chip.call_peak_time_hr` | 24 -`chip.call_peak_disks` | `local-disk 100 HDD` +`chip.call_peak_disks` | `local-disk 200 HDD` Parameter|Default ---------|------- `chip.macs2_signal_track_mem_mb` | 16000 `chip.macs2_signal_track_time_hr` | 24 -`chip.macs2_signal_track_disks` | `local-disk 100 HDD` +`chip.macs2_signal_track_disks` | `local-disk 200 HDD` ## How to use a custom aligner diff --git a/example_input_json/template.full.json b/example_input_json/template.full.json index 7c325999..e4cabad0 100644 --- a/example_input_json/template.full.json +++ b/example_input_json/template.full.json @@ -55,7 +55,7 @@ "chip.align_cpu" : 4, "chip.align_mem_mb" : 20000, "chip.align_time_hr" : 48, - "chip.align_disks" : "local-disk 200 HDD", + "chip.align_disks" : "local-disk 400 HDD", "chip.filter_cpu" : 2, "chip.filter_mem_mb" : 20000, From a8795c8da684edf1ca4bb3bbe65353223d220629 Mon Sep 17 00:00:00 2001 From: Jin Lee Date: Mon, 28 Oct 2019 17:10:01 -0700 Subject: [PATCH 07/15] backend, doc: remove deprecated old method --- dev/backends/backend.conf | 354 ------------------ dev/backends/backend_with_db.conf | 17 - .../ENCSR936XTK_subsampled_chr19_only.json | 26 -- ...R936XTK_subsampled_chr19_only_rm_chrM.json | 27 -- dev/examples/dx/ENCSR000DYI_dx.json | 19 - .../ENCSR000DYI_subsampled_chr19_only_dx.json | 18 - ..._subsampled_chr19_only_old_fraglen_dx.json | 20 - ...R000DYI_subsampled_chr19_only_rep1_dx.json | 14 - .../dx/ENCSR000DYI_subsampled_dx.json | 18 - .../dx/ENCSR000DYI_subsampled_rep1_dx.json | 14 - dev/examples/dx/ENCSR936XTK_dx.json | 35 -- .../ENCSR936XTK_subsampled_chr19_only_dx.json | 36 -- ..._subsampled_chr19_only_old_fraglen_dx.json | 37 -- ...K_subsampled_chr19_only_single_rep_dx.json | 24 -- .../dx/ENCSR936XTK_subsampled_dx.json | 35 -- dev/examples/dx/template_general.json | 3 - dev/examples/dx/template_hg19.json | 4 - dev/examples/dx/template_hg38.json | 4 - dev/examples/dx/template_mm10.json | 4 - 
dev/examples/dx/template_mm9.json | 4 - .../dx_azure/ENCSR000DYI_dx_azure.json | 19 - ...000DYI_subsampled_chr19_only_dx_azure.json | 18 - .../ENCSR000DYI_subsampled_dx_azure.json | 18 - .../dx_azure/ENCSR936XTK_dx_azure.json | 35 -- ...936XTK_subsampled_chr19_only_dx_azure.json | 35 -- .../ENCSR936XTK_subsampled_dx_azure.json | 35 -- dev/examples/dx_azure/template_general.json | 3 - dev/examples/dx_azure/template_hg19.json | 4 - dev/examples/dx_azure/template_hg38.json | 4 - dev/examples/dx_azure/template_mm10.json | 4 - dev/examples/dx_azure/template_mm9.json | 4 - dev/examples/google/ENCSR000DYI.json | 1 - dev/examples/google/ENCSR936XTK.json | 1 - .../ENCSR936XTK_subsampled_chr19_only.json | 1 - dev/examples/klab/ENCSR000DYI_klab.json | 19 - ..._chr19_only_dx_style_fastq_input_klab.json | 21 -- ...ed_chr19_only_keep_irregular_chr_klab.json | 19 - ...NCSR000DYI_subsampled_chr19_only_klab.json | 18 - .../klab/ENCSR000DYI_subsampled_klab.json | 18 - dev/examples/klab/ENCSR936XTK_klab.json | 26 -- ...NCSR936XTK_subsampled_chr19_only_klab.json | 26 -- ...subsampled_chr19_only_single_rep_klab.json | 18 - .../klab/ENCSR936XTK_subsampled_klab.json | 26 -- .../local/ENCSR936XTK_subsampled.json | 22 -- .../ENCSR936XTK_subsampled_chr19_only.json | 49 --- ...R936XTK_subsampled_chr19_only_sge_conda.sh | 69 ---- ...K_subsampled_chr19_only_sge_singularity.sh | 66 ---- ...36XTK_subsampled_chr19_only_slurm_conda.sh | 65 ---- ...subsampled_chr19_only_slurm_singularity.sh | 62 --- .../local/ENCSR936XTK_subsampled_sge_conda.sh | 69 ---- .../ENCSR936XTK_subsampled_sge_singularity.sh | 66 ---- .../ENCSR936XTK_subsampled_slurm_conda.sh | 65 ---- ...NCSR936XTK_subsampled_slurm_singularity.sh | 62 --- dev/examples/nat_prot_paper/ENCSR000DYI.json | 14 - dev/examples/nat_prot_paper/ENCSR936XTK.json | 18 - ...ENCSR936XTK_subsampled_chr19_only_scg.json | 55 --- ...R936XTK_subsampled_chr19_only_scg_conda.sh | 66 ---- ...K_subsampled_chr19_only_scg_singularity.sh | 62 --- .../scg/ENCSR936XTK_subsampled_scg.json | 26 -- .../scg/ENCSR936XTK_subsampled_scg_conda.sh | 66 ---- .../ENCSR936XTK_subsampled_scg_singularity.sh | 62 --- ...936XTK_subsampled_chr19_only_sherlock.json | 53 --- ...TK_subsampled_chr19_only_sherlock_conda.sh | 65 ---- ...sampled_chr19_only_sherlock_singularity.sh | 62 --- .../ENCSR936XTK_subsampled_sherlock.json | 26 -- .../ENCSR936XTK_subsampled_sherlock_conda.sh | 65 ---- ...R936XTK_subsampled_sherlock_singularity.sh | 62 --- .../ENCSR000DYI_bowtie2_google.json | 21 -- .../test_bowtie2/ENCSR000DYI_google.json | 21 -- .../ENCSR936XTK_bowtie2_google.json | 28 -- .../test_bowtie2/ENCSR936XTK_google.json | 28 -- ...TK_subsampled_chr19_only_bowtie2_klab.json | 27 -- dev/workflow_opts/docker.json | 11 - dev/workflow_opts/scg.json | 7 - dev/workflow_opts/sge.json | 6 - dev/workflow_opts/sherlock.json | 7 - dev/workflow_opts/singularity.json | 5 - dev/workflow_opts/slurm.json | 7 - docs/deprecated/OLD_METHOD.md | 24 -- docs/deprecated/output.md | 64 ---- docs/deprecated/tutorial_google.md | 88 ----- docs/deprecated/tutorial_local_conda.md | 53 --- docs/deprecated/tutorial_local_docker.md | 38 -- docs/deprecated/tutorial_local_singularity.md | 60 --- docs/deprecated/tutorial_scg.md | 70 ---- docs/deprecated/tutorial_scg_backend.md | 121 ------ docs/deprecated/tutorial_sge.md | 90 ----- docs/deprecated/tutorial_sge_backend.md | 126 ------- docs/deprecated/tutorial_sherlock.md | 75 ---- docs/deprecated/tutorial_sherlock_backend.md | 124 ------ docs/deprecated/tutorial_slurm.md | 85 ----- 
docs/deprecated/tutorial_slurm_backend.md | 128 ------- 92 files changed, 3697 deletions(-) delete mode 100644 dev/backends/backend.conf delete mode 100644 dev/backends/backend_with_db.conf delete mode 100644 dev/examples/caper/ENCSR936XTK_subsampled_chr19_only.json delete mode 100644 dev/examples/caper/ENCSR936XTK_subsampled_chr19_only_rm_chrM.json delete mode 100644 dev/examples/dx/ENCSR000DYI_dx.json delete mode 100644 dev/examples/dx/ENCSR000DYI_subsampled_chr19_only_dx.json delete mode 100644 dev/examples/dx/ENCSR000DYI_subsampled_chr19_only_old_fraglen_dx.json delete mode 100644 dev/examples/dx/ENCSR000DYI_subsampled_chr19_only_rep1_dx.json delete mode 100644 dev/examples/dx/ENCSR000DYI_subsampled_dx.json delete mode 100644 dev/examples/dx/ENCSR000DYI_subsampled_rep1_dx.json delete mode 100644 dev/examples/dx/ENCSR936XTK_dx.json delete mode 100644 dev/examples/dx/ENCSR936XTK_subsampled_chr19_only_dx.json delete mode 100644 dev/examples/dx/ENCSR936XTK_subsampled_chr19_only_old_fraglen_dx.json delete mode 100644 dev/examples/dx/ENCSR936XTK_subsampled_chr19_only_single_rep_dx.json delete mode 100644 dev/examples/dx/ENCSR936XTK_subsampled_dx.json delete mode 100644 dev/examples/dx/template_general.json delete mode 100644 dev/examples/dx/template_hg19.json delete mode 100644 dev/examples/dx/template_hg38.json delete mode 100644 dev/examples/dx/template_mm10.json delete mode 100644 dev/examples/dx/template_mm9.json delete mode 100644 dev/examples/dx_azure/ENCSR000DYI_dx_azure.json delete mode 100644 dev/examples/dx_azure/ENCSR000DYI_subsampled_chr19_only_dx_azure.json delete mode 100644 dev/examples/dx_azure/ENCSR000DYI_subsampled_dx_azure.json delete mode 100644 dev/examples/dx_azure/ENCSR936XTK_dx_azure.json delete mode 100644 dev/examples/dx_azure/ENCSR936XTK_subsampled_chr19_only_dx_azure.json delete mode 100644 dev/examples/dx_azure/ENCSR936XTK_subsampled_dx_azure.json delete mode 100644 dev/examples/dx_azure/template_general.json delete mode 100644 dev/examples/dx_azure/template_hg19.json delete mode 100644 dev/examples/dx_azure/template_hg38.json delete mode 100644 dev/examples/dx_azure/template_mm10.json delete mode 100644 dev/examples/dx_azure/template_mm9.json delete mode 120000 dev/examples/google/ENCSR000DYI.json delete mode 120000 dev/examples/google/ENCSR936XTK.json delete mode 120000 dev/examples/google/ENCSR936XTK_subsampled_chr19_only.json delete mode 100644 dev/examples/klab/ENCSR000DYI_klab.json delete mode 100644 dev/examples/klab/ENCSR000DYI_subsampled_chr19_only_dx_style_fastq_input_klab.json delete mode 100644 dev/examples/klab/ENCSR000DYI_subsampled_chr19_only_keep_irregular_chr_klab.json delete mode 100644 dev/examples/klab/ENCSR000DYI_subsampled_chr19_only_klab.json delete mode 100644 dev/examples/klab/ENCSR000DYI_subsampled_klab.json delete mode 100644 dev/examples/klab/ENCSR936XTK_klab.json delete mode 100644 dev/examples/klab/ENCSR936XTK_subsampled_chr19_only_klab.json delete mode 100644 dev/examples/klab/ENCSR936XTK_subsampled_chr19_only_single_rep_klab.json delete mode 100644 dev/examples/klab/ENCSR936XTK_subsampled_klab.json delete mode 100644 dev/examples/local/ENCSR936XTK_subsampled.json delete mode 100644 dev/examples/local/ENCSR936XTK_subsampled_chr19_only.json delete mode 100644 dev/examples/local/ENCSR936XTK_subsampled_chr19_only_sge_conda.sh delete mode 100644 dev/examples/local/ENCSR936XTK_subsampled_chr19_only_sge_singularity.sh delete mode 100644 dev/examples/local/ENCSR936XTK_subsampled_chr19_only_slurm_conda.sh delete mode 100644 
dev/examples/local/ENCSR936XTK_subsampled_chr19_only_slurm_singularity.sh delete mode 100644 dev/examples/local/ENCSR936XTK_subsampled_sge_conda.sh delete mode 100644 dev/examples/local/ENCSR936XTK_subsampled_sge_singularity.sh delete mode 100644 dev/examples/local/ENCSR936XTK_subsampled_slurm_conda.sh delete mode 100644 dev/examples/local/ENCSR936XTK_subsampled_slurm_singularity.sh delete mode 100644 dev/examples/nat_prot_paper/ENCSR000DYI.json delete mode 100644 dev/examples/nat_prot_paper/ENCSR936XTK.json delete mode 100644 dev/examples/scg/ENCSR936XTK_subsampled_chr19_only_scg.json delete mode 100644 dev/examples/scg/ENCSR936XTK_subsampled_chr19_only_scg_conda.sh delete mode 100644 dev/examples/scg/ENCSR936XTK_subsampled_chr19_only_scg_singularity.sh delete mode 100644 dev/examples/scg/ENCSR936XTK_subsampled_scg.json delete mode 100644 dev/examples/scg/ENCSR936XTK_subsampled_scg_conda.sh delete mode 100644 dev/examples/scg/ENCSR936XTK_subsampled_scg_singularity.sh delete mode 100644 dev/examples/sherlock/ENCSR936XTK_subsampled_chr19_only_sherlock.json delete mode 100644 dev/examples/sherlock/ENCSR936XTK_subsampled_chr19_only_sherlock_conda.sh delete mode 100644 dev/examples/sherlock/ENCSR936XTK_subsampled_chr19_only_sherlock_singularity.sh delete mode 100644 dev/examples/sherlock/ENCSR936XTK_subsampled_sherlock.json delete mode 100644 dev/examples/sherlock/ENCSR936XTK_subsampled_sherlock_conda.sh delete mode 100644 dev/examples/sherlock/ENCSR936XTK_subsampled_sherlock_singularity.sh delete mode 100644 dev/examples/test_bowtie2/ENCSR000DYI_bowtie2_google.json delete mode 100644 dev/examples/test_bowtie2/ENCSR000DYI_google.json delete mode 100644 dev/examples/test_bowtie2/ENCSR936XTK_bowtie2_google.json delete mode 100644 dev/examples/test_bowtie2/ENCSR936XTK_google.json delete mode 100644 dev/examples/test_bowtie2/ENCSR936XTK_subsampled_chr19_only_bowtie2_klab.json delete mode 100644 dev/workflow_opts/docker.json delete mode 100644 dev/workflow_opts/scg.json delete mode 100644 dev/workflow_opts/sge.json delete mode 100644 dev/workflow_opts/sherlock.json delete mode 100644 dev/workflow_opts/singularity.json delete mode 100644 dev/workflow_opts/slurm.json delete mode 100644 docs/deprecated/OLD_METHOD.md delete mode 100644 docs/deprecated/output.md delete mode 100644 docs/deprecated/tutorial_google.md delete mode 100644 docs/deprecated/tutorial_local_conda.md delete mode 100644 docs/deprecated/tutorial_local_docker.md delete mode 100644 docs/deprecated/tutorial_local_singularity.md delete mode 100644 docs/deprecated/tutorial_scg.md delete mode 100644 docs/deprecated/tutorial_scg_backend.md delete mode 100644 docs/deprecated/tutorial_sge.md delete mode 100644 docs/deprecated/tutorial_sge_backend.md delete mode 100644 docs/deprecated/tutorial_sherlock.md delete mode 100644 docs/deprecated/tutorial_sherlock_backend.md delete mode 100644 docs/deprecated/tutorial_slurm.md delete mode 100644 docs/deprecated/tutorial_slurm_backend.md diff --git a/dev/backends/backend.conf b/dev/backends/backend.conf deleted file mode 100644 index 040ef6ca..00000000 --- a/dev/backends/backend.conf +++ /dev/null @@ -1,354 +0,0 @@ -include required(classpath("application")) - -backend { - default = "Local" - providers { - - pbs { - actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" - config { - script-epilogue = "sleep 30 && sync" - concurrent-job-limit = 50 - runtime-attributes = """ - Int cpu = 1 - Int? gpu - Int? time - Int? 
memory_mb - """ - submit = """ - qsub \ - -N ${job_name} \ - -o ${out} \ - -e ${err} \ - ${true="-lselect=1:ncpus=" false="" defined(cpu)}${cpu}${true=":mem=" false="" defined(memory_mb)}${memory_mb}${true="mb" false="" defined(memory_mb)} \ - ${true="-lwalltime=" false="" defined(time)}${time}${true=":0:0" false="" defined(time)} \ - ${true="-lngpus=" false="" gpu>1}${if gpu>1 then gpu else ""} \ - -V \ - ${script} - """ - kill = "qdel ${job_id}" - check-alive = "qstat ${job_id}" - job-id-regex = "(\\d+).+" - } - } - - pbs_singularity { - actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" - config { - script-epilogue = "sleep 30 && sync" - concurrent-job-limit = 50 - runtime-attributes = """ - Int cpu = 1 - Int? gpu - Int? time - Int? memory_mb - String singularity_container - String? singularity_bindpath - """ - submit = """ - echo "SINGULARITY_BINDPATH=$(echo ${cwd} | sed 's/cromwell-executions/\n/g' | head -n1)cromwell-executions,${singularity_bindpath},$SINGULARITY_BINDPATH singularity exec --cleanenv --home ${cwd} ${if defined(gpu) then '--nv' else ''} ${singularity_container} /bin/bash ${script}" | qsub \ - -N ${job_name} \ - -o ${out} \ - -e ${err} \ - ${true="-lselect=1:ncpus=" false="" defined(cpu)}${cpu}${true=":mem=" false="" defined(memory_mb)}${memory_mb}${true="mb" false="" defined(memory_mb)} \ - ${true="-lwalltime=" false="" defined(time)}${time}${true=":0:0" false="" defined(time)} \ - ${true="-lngpus=" false="" gpu>1}${if gpu>1 then gpu else ""} \ - -V - # If you see an error "The job was aborted from outside Cromwell" - # then check your singularity settings in a workflow options JSON file - # (e.g. check if you have an image file defined by "singularity_container") - # Also, make sure that your input data files (and genome database files) - # are on directories recursively bound by - # "singularity_bindpath" in a workflow options JSON file - # or singularity's built-in environment variable SINGULARITY_BINDPATH. - """ - # cromwell is desinged to monitor rc (return code) file, which is generated/controlled - # in ${script}, so if singularity does not run it due to some problems in singuarlity's - # internal settings then rc file is not generated. - # this can result in hanging of a cromwell process. - # setting the below parameter enables monitoring by "check-alive". - # it will take about "exit-code-timeout-seconds" x 3 time to detect failure. - exit-code-timeout-seconds = 180 - - kill = "qdel ${job_id}" - check-alive = "qstat -j ${job_id}" - job-id-regex = "(\\d+)" - } - } - - slurm_singularity { - actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" - config { - script-epilogue = "sleep 30 && sync" - concurrent-job-limit = 50 - runtime-attributes = """ - Int cpu = 1 - Int? gpu - Int? time - Int? memory_mb - String? slurm_partition - String? slurm_account - String? slurm_extra_param - String singularity_container - String? 
singularity_bindpath - """ - submit = """ - sbatch \ - --export=ALL \ - -J ${job_name} \ - -D ${cwd} \ - -o ${out} \ - -e ${err} \ - ${"-t " + time*60} \ - -n 1 \ - --ntasks-per-node=1 \ - ${true="--cpus-per-task=" false="" defined(cpu)}${cpu} \ - ${true="--mem=" false="" defined(memory_mb)}${memory_mb} \ - ${"-p " + slurm_partition} \ - ${"--account " + slurm_account} \ - ${true="--gres gpu:" false="" defined(gpu)}${gpu} \ - ${slurm_extra_param} \ - --wrap "SINGULARITY_BINDPATH=$(echo ${cwd} | sed 's/cromwell-executions/\n/g' | head -n1)cromwell-executions,${singularity_bindpath},$SINGULARITY_BINDPATH singularity exec --cleanenv --home ${cwd} ${if defined(gpu) then '--nv' else ''} ${singularity_container} /bin/bash ${script}" - # If you see an error "The job was aborted from outside Cromwell" - # then check your singularity settings in a workflow options JSON file - # (e.g. check if you have an image file defined by "singularity_container") - # Also, make sure that your input data files (and genome database files) - # are on directories recursively bound by - # "singularity_bindpath" in a workflow options JSON file - # or singularity's built-in environment variable SINGULARITY_BINDPATH. - """ - kill = "scancel ${job_id}" - # cromwell is desinged to monitor rc (return code) file, which is generated/controlled - # in ${script}, so if singularity does not run it due to some problems in singuarlity's - # internal settings then rc file is not generated. - # this can result in hanging of a cromwell process. - # setting the below parameter enables monitoring by "check-alive". - # it will take about "exit-code-timeout-seconds" x 3 time to detect failure. - exit-code-timeout-seconds = 180 - - # cromwell responds only to non-zero exit code from "check-alive", - # but "squeue -j [JOB_ID]" returns zero exit code even when job is not found - # workaround to exit with 1 (like SGE's qstat -j [JOB_ID] does) for such cases. - check-alive = "CHK_ALIVE=$(squeue --noheader -j ${job_id}); if [ -z $CHK_ALIVE ]; then /bin/bash -c 'exit 1'; else echo $CHK_ALIVE; fi" - job-id-regex = "Submitted batch job (\\d+).*" - } - } - - sge_singularity { - actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" - config { - script-epilogue = "sleep 30 && sync" - concurrent-job-limit = 50 - runtime-attributes = """ - String sge_pe = "shm" - Int cpu = 1 - Int? gpu - Int? time - Int? memory_mb - String? sge_queue - String? sge_extra_param - String singularity_container - String? 
singularity_bindpath - """ - submit = """ - echo "SINGULARITY_BINDPATH=$(echo ${cwd} | sed 's/cromwell-executions/\n/g' | head -n1)cromwell-executions,${singularity_bindpath},$SINGULARITY_BINDPATH singularity exec --cleanenv --home ${cwd} ${if defined(gpu) then '--nv' else ''} ${singularity_container} /bin/bash ${script}" | qsub \ - -S /bin/sh \ - -terse \ - -b n \ - -N ${job_name} \ - -wd ${cwd} \ - -o ${out} \ - -e ${err} \ - ${if cpu>1 then "-pe " + sge_pe + " " else ""}${if cpu>1 then cpu else ""} \ - ${true="-l h_vmem=$(expr " false="" defined(memory_mb)}${memory_mb}${true=" / " false="" defined(memory_mb)}${if defined(memory_mb) then cpu else ""}${true=")m" false="" defined(memory_mb)} \ - ${true="-l s_vmem=$(expr " false="" defined(memory_mb)}${memory_mb}${true=" / " false="" defined(memory_mb)}${if defined(memory_mb) then cpu else ""}${true=")m" false="" defined(memory_mb)} \ - ${true="-l h_rt=" false="" defined(time)}${time}${true=":00:00" false="" defined(time)}\ - ${true="-l s_rt=" false="" defined(time)}${time}${true=":00:00" false="" defined(time)}\ - ${"-q " + sge_queue} \ - ${"-l gpu=" + gpu} \ - ${sge_extra_param} \ - -V - # If you see an error "The job was aborted from outside Cromwell" - # then check your singularity settings in a workflow options JSON file - # (e.g. check if you have an image file defined by "singularity_container") - # Also, make sure that your input data files (and genome database files) - # are on directories recursively bound by - # "singularity_bindpath" in a workflow options JSON file - # or singularity's built-in environment variable SINGULARITY_BINDPATH. - """ - # cromwell is desinged to monitor rc (return code) file, which is generated/controlled - # in ${script}, so if singularity does not run it due to some problems in singuarlity's - # internal settings then rc file is not generated. - # this can result in hanging of a cromwell process. - # setting the below parameter enables monitoring by "check-alive". - # it will take about "exit-code-timeout-seconds" x 3 time to detect failure. - exit-code-timeout-seconds = 180 - - kill = "qdel ${job_id}" - check-alive = "qstat -j ${job_id}" - job-id-regex = "(\\d+)" - } - } - - singularity { - actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" - config { - script-epilogue = "sleep 5 && sync" - concurrent-job-limit = 10 - run-in-background = true - runtime-attributes = """ - Int? gpu - String singularity_container - String? singularity_bindpath - """ - submit = """ - SINGULARITY_BINDPATH=$(echo ${cwd} | sed 's/cromwell-executions/\n/g' | head -n1)cromwell-executions,${singularity_bindpath},$SINGULARITY_BINDPATH singularity exec --cleanenv --home ${cwd} ${if defined(gpu) then '--nv' else ''} ${singularity_container} /bin/bash ${script} - """ - } - } - - Local { - actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" - config { - concurrent-job-limit = 10 - } - } - - sge { - actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" - config { - script-epilogue = "sleep 30 && sync" - concurrent-job-limit = 50 - runtime-attributes = """ - String sge_pe = "shm" - Int cpu = 1 - Int? gpu - Int? time - Int? memory_mb - String? sge_queue - String? 
sge_extra_param - """ - submit = """ - qsub \ - -S /bin/sh \ - -terse \ - -b n \ - -N ${job_name} \ - -wd ${cwd} \ - -o ${out} \ - -e ${err} \ - ${if cpu>1 then "-pe " + sge_pe + " " else ""}${if cpu>1 then cpu else ""} \ - ${true="-l h_vmem=$(expr " false="" defined(memory_mb)}${memory_mb}${true=" / " false="" defined(memory_mb)}${if defined(memory_mb) then cpu else ""}${true=")m" false="" defined(memory_mb)} \ - ${true="-l s_vmem=$(expr " false="" defined(memory_mb)}${memory_mb}${true=" / " false="" defined(memory_mb)}${if defined(memory_mb) then cpu else ""}${true=")m" false="" defined(memory_mb)} \ - ${true="-l h_rt=" false="" defined(time)}${time}${true=":00:00" false="" defined(time)}\ - ${true="-l s_rt=" false="" defined(time)}${time}${true=":00:00" false="" defined(time)}\ - ${"-q " + sge_queue} \ - ${true="-l gpu=" false="" defined(gpu)}${gpu} \ - ${sge_extra_param} \ - -V \ - ${script} - """ - kill = "qdel ${job_id}" - check-alive = "qstat -j ${job_id}" - job-id-regex = "(\\d+)" - } - } - - slurm { - actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" - config { - script-epilogue = "sleep 30" - concurrent-job-limit = 50 - runtime-attributes = """ - Int cpu = 1 - Int? gpu - Int? time - Int? memory_mb - String? slurm_partition - String? slurm_account - String? slurm_extra_param - """ - submit = """ - sbatch \ - --export=ALL \ - -J ${job_name} \ - -D ${cwd} \ - -o ${out} \ - -e ${err} \ - ${"-t " + time*60} \ - -n 1 \ - --ntasks-per-node=1 \ - ${true="--cpus-per-task=" false="" defined(cpu)}${cpu} \ - ${true="--mem=" false="" defined(memory_mb)}${memory_mb} \ - ${"-p " + slurm_partition} \ - ${"--account " + slurm_account} \ - ${true="--gres gpu:" false="" defined(gpu)}${gpu} \ - ${slurm_extra_param} \ - --wrap "/bin/bash ${script}" - """ - kill = "scancel ${job_id}" - check-alive = "squeue -j ${job_id}" - job-id-regex = "Submitted batch job (\\d+).*" - } - } - - google { - actor-factory = "cromwell.backend.google.pipelines.v2alpha1.PipelinesApiLifecycleActorFactory" - - config { - # Google project - project = "your-project-name" - - # Base bucket for workflow executions - root = "gs://your-bucket-name" - - concurrent-job-limit = 1000 - genomics-api-queries-per-100-seconds = 1000 - maximum-polling-interval = 600 - - genomics { - auth = "application-default" - compute-service-account = "default" - endpoint-url = "https://genomics.googleapis.com/" - restrict-metadata-access = false - } - - filesystems { - gcs { - auth = "application-default" - } - } - } - } - } -} - -services { - LoadController { - class = "cromwell.services.loadcontroller.impl.LoadControllerServiceActor" - config { - # disable it (for login nodes on Stanford SCG, Sherlock) - control-frequency = 21474834 seconds - } - } -} - -system { - abort-jobs-on-terminate = true - graceful-server-shutdown = true -} - -call-caching { - enabled = false - invalidate-bad-cache-results = true -} - -google { - application-name = "cromwell" - auths = [ - { - name = "application-default" - scheme = "application_default" - } - ] -} diff --git a/dev/backends/backend_with_db.conf b/dev/backends/backend_with_db.conf deleted file mode 100644 index 95183b94..00000000 --- a/dev/backends/backend_with_db.conf +++ /dev/null @@ -1,17 +0,0 @@ -include "backend.conf" - -database { - profile = "slick.jdbc.MySQLProfile$" - db { - url = "jdbc:mysql://localhost/cromwell_db?useSSL=false&rewriteBatchedStatements=true" - user = "cromwell" - password = "cromwell" - driver = "com.mysql.jdbc.Driver" - } -} - -call-caching { - 
#enabled = true - #invalidate-bad-cache-results = true -} - diff --git a/dev/examples/caper/ENCSR936XTK_subsampled_chr19_only.json b/dev/examples/caper/ENCSR936XTK_subsampled_chr19_only.json deleted file mode 100644 index 38a557f2..00000000 --- a/dev/examples/caper/ENCSR936XTK_subsampled_chr19_only.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "https://storage.googleapis.com/encode-pipeline-genome-data/genome_tsv/v1/hg38_chr19_chrM_caper.tsv", - "chip.fastqs_rep1_R1" : ["https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep1-R1.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep1_R2" : ["https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep1-R2.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep2_R1" : ["https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep2-R1.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep2_R2" : ["https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep2-R2.subsampled.67.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : ["https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl1-R1.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R2" : ["https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl1-R2.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R1" : ["https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl2-R1.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R2" : ["https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl2-R2.subsampled.80.fastq.gz" - ], - - "chip.paired_end" : true, - - "chip.always_use_pooled_ctl" : true, - "chip.title" : "ENCSR936XTK (subsampled 1/67, chr19 and chrM Only)", - "chip.description" : "ZNF143 ChIP-seq on human GM12878" -} diff --git a/dev/examples/caper/ENCSR936XTK_subsampled_chr19_only_rm_chrM.json b/dev/examples/caper/ENCSR936XTK_subsampled_chr19_only_rm_chrM.json deleted file mode 100644 index 1aac6730..00000000 --- a/dev/examples/caper/ENCSR936XTK_subsampled_chr19_only_rm_chrM.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "https://storage.googleapis.com/encode-pipeline-genome-data/genome_tsv/v1/hg38_chr19_chrM_caper.tsv", - "chip.fastqs_rep1_R1" : ["https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep1-R1.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep1_R2" : ["https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep1-R2.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep2_R1" : ["https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep2-R1.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep2_R2" : ["https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep2-R2.subsampled.67.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : ["https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl1-R1.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R2" : 
["https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl1-R2.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R1" : ["https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl2-R1.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R2" : ["https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl2-R2.subsampled.80.fastq.gz" - ], - - "chip.paired_end" : true, - - "chip.always_use_pooled_ctl" : true, - "chip.title" : "ENCSR936XTK (subsampled 1/67, chr19 and chrM Only)", - "chip.description" : "ZNF143 ChIP-seq on human GM12878", - "chip.regex_filter_reads" : "chrM" -} diff --git a/dev/examples/dx/ENCSR000DYI_dx.json b/dev/examples/dx/ENCSR000DYI_dx.json deleted file mode 100644 index d931587f..00000000 --- a/dev/examples/dx/ENCSR000DYI_dx.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-genome-data/genome_tsv/v1/hg38_dx.tsv", - "chip.fastqs_rep1_R1" : ["dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/rep1.fastq.gz" - ], - "chip.fastqs_rep2_R1" : ["dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/rep2.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : ["dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/ctl1.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R1" : ["dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/ctl2.fastq.gz" - ], - - "chip.paired_end" : false, - "chip.always_use_pooled_ctl" : true, - "chip.spp_cpu" : 2, - - "chip.title" : "ENCSR000DYI", - "chip.description" : "CEBPB ChIP-seq on human A549 produced by the Snyder lab" -} diff --git a/dev/examples/dx/ENCSR000DYI_subsampled_chr19_only_dx.json b/dev/examples/dx/ENCSR000DYI_subsampled_chr19_only_dx.json deleted file mode 100644 index 3cd61e10..00000000 --- a/dev/examples/dx/ENCSR000DYI_subsampled_chr19_only_dx.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-genome-data/genome_tsv/v1/hg38_chr19_chrM_dx.tsv", - "chip.fastqs_rep1_R1" : ["dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/rep1.subsampled.25.fastq.gz" - ], - "chip.fastqs_rep2_R1" : ["dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/rep2.subsampled.20.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : ["dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/ctl1.subsampled.25.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R1" : ["dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/ctl2.subsampled.25.fastq.gz" - ], - - "chip.paired_end" : false, - "chip.always_use_pooled_ctl" : true, - - "chip.title" : "ENCSR000DYI (subsampled 1/25, chr19 and chrM only)", - "chip.description" : "CEBPB ChIP-seq on human A549 produced by the Snyder lab" -} diff --git a/dev/examples/dx/ENCSR000DYI_subsampled_chr19_only_old_fraglen_dx.json b/dev/examples/dx/ENCSR000DYI_subsampled_chr19_only_old_fraglen_dx.json deleted file mode 100644 index 9c8268a5..00000000 --- 
a/dev/examples/dx/ENCSR000DYI_subsampled_chr19_only_old_fraglen_dx.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-genome-data/genome_tsv/v1/hg38_chr19_chrM_dx.tsv", - "chip.fastqs_rep1_R1" : ["dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/rep1.subsampled.25.fastq.gz" - ], - "chip.fastqs_rep2_R1" : ["dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/rep2.subsampled.20.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : ["dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/ctl1.subsampled.25.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R1" : ["dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/ctl2.subsampled.25.fastq.gz" - ], - - "chip.fraglen" : [95,105], - - "chip.paired_end" : false, - "chip.always_use_pooled_ctl" : true, - - "chip.title" : "ENCSR000DYI (subsampled 1/25, chr19 and chrM only)", - "chip.description" : "CEBPB ChIP-seq on human A549 produced by the Snyder lab" -} diff --git a/dev/examples/dx/ENCSR000DYI_subsampled_chr19_only_rep1_dx.json b/dev/examples/dx/ENCSR000DYI_subsampled_chr19_only_rep1_dx.json deleted file mode 100644 index a1d7432c..00000000 --- a/dev/examples/dx/ENCSR000DYI_subsampled_chr19_only_rep1_dx.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-genome-data/genome_tsv/v1/hg38_chr19_chrM_dx.tsv", - "chip.fastqs_rep1_R1" : ["dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/rep1.subsampled.25.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : ["dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/ctl1.subsampled.25.fastq.gz" - ], - - "chip.paired_end" : false, - "chip.always_use_pooled_ctl" : true, - - "chip.title" : "ENCSR000DYI (subsampled 1/25, chr19 and chrM only)", - "chip.description" : "CEBPB ChIP-seq on human A549 produced by the Snyder lab" -} diff --git a/dev/examples/dx/ENCSR000DYI_subsampled_dx.json b/dev/examples/dx/ENCSR000DYI_subsampled_dx.json deleted file mode 100644 index b4258a9c..00000000 --- a/dev/examples/dx/ENCSR000DYI_subsampled_dx.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-genome-data/genome_tsv/v1/hg38_dx.tsv", - "chip.fastqs_rep1_R1" : ["dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/rep1.subsampled.25.fastq.gz" - ], - "chip.fastqs_rep2_R1" : ["dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/rep2.subsampled.15.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : ["dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/ctl1.subsampled.25.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R1" : ["dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/ctl2.subsampled.25.fastq.gz" - ], - - "chip.paired_end" : false, - "chip.always_use_pooled_ctl" : true, - - "chip.title" : "ENCSR000DYI (subsampled 1/25)", - "chip.description" : "CEBPB ChIP-seq on human A549 produced by the Snyder 
lab" -} diff --git a/dev/examples/dx/ENCSR000DYI_subsampled_rep1_dx.json b/dev/examples/dx/ENCSR000DYI_subsampled_rep1_dx.json deleted file mode 100644 index 42d143c0..00000000 --- a/dev/examples/dx/ENCSR000DYI_subsampled_rep1_dx.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-genome-data/genome_tsv/v1/hg38_dx.tsv", - "chip.fastqs_rep1_R1" : ["dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/rep1.subsampled.25.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : ["dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/ctl1.subsampled.25.fastq.gz" - ], - - "chip.paired_end" : false, - "chip.always_use_pooled_ctl" : true, - - "chip.title" : "ENCSR000DYI (unreplicated, subsampled 1/25)", - "chip.description" : "CEBPB ChIP-seq on human A549 produced by the Snyder lab" -} diff --git a/dev/examples/dx/ENCSR936XTK_dx.json b/dev/examples/dx/ENCSR936XTK_dx.json deleted file mode 100644 index 02e7d93a..00000000 --- a/dev/examples/dx/ENCSR936XTK_dx.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-genome-data/genome_tsv/v1/hg38_dx.tsv", - - "chip.fastqs_rep1_R1" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep1-R1.fastq.gz" - ], - "chip.fastqs_rep1_R2" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep1-R2.fastq.gz" - ], - "chip.fastqs_rep2_R1" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep2-R1.fastq.gz" - ], - "chip.fastqs_rep2_R2" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep2-R2.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl1-R1.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R2" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl1-R2.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R1" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl2-R1.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R2" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl2-R2.fastq.gz" - ], - - "chip.paired_end" : true, - - "chip.always_use_pooled_ctl" : true, - "chip.title" : "ENCSR936XTK", - "chip.description" : "ZNF143 ChIP-seq on human GM12878" -} diff --git a/dev/examples/dx/ENCSR936XTK_subsampled_chr19_only_dx.json b/dev/examples/dx/ENCSR936XTK_subsampled_chr19_only_dx.json deleted file mode 100644 index f2596383..00000000 --- a/dev/examples/dx/ENCSR936XTK_subsampled_chr19_only_dx.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-genome-data/genome_tsv/v1/hg38_chr19_chrM_dx.tsv", - - "chip.fastqs_rep1_R1" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep1-R1.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep1_R2" : [ - 
"dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep1-R2.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep2_R1" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep2-R1.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep2_R2" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep2-R2.subsampled.67.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl1-R1.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R2" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl1-R2.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R1" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl2-R1.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R2" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl2-R2.subsampled.80.fastq.gz" - ], - - "chip.paired_end" : true, - - "chip.always_use_pooled_ctl" : true, - "chip.title" : "ENCSR936XTK (subsampled 1/67, chr19 and chrM Only)", - "chip.description" : "ZNF143 ChIP-seq on human GM12878" - -} diff --git a/dev/examples/dx/ENCSR936XTK_subsampled_chr19_only_old_fraglen_dx.json b/dev/examples/dx/ENCSR936XTK_subsampled_chr19_only_old_fraglen_dx.json deleted file mode 100644 index 9e00bcf7..00000000 --- a/dev/examples/dx/ENCSR936XTK_subsampled_chr19_only_old_fraglen_dx.json +++ /dev/null @@ -1,37 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-genome-data/genome_tsv/v1/hg38_chr19_chrM_dx.tsv", - - "chip.fastqs_rep1_R1" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep1-R1.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep1_R2" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep1-R2.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep2_R1" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep2-R1.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep2_R2" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep2-R2.subsampled.67.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl1-R1.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R2" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl1-R2.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R1" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl2-R1.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R2" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl2-R2.subsampled.80.fastq.gz" - ], - - "chip.paired_end" : true, - "chip.fraglen" : [210,235], - - "chip.always_use_pooled_ctl" : true, - "chip.title" : "ENCSR936XTK (subsampled 1/67, 
chr19 and chrM Only)", - "chip.description" : "ZNF143 ChIP-seq on human GM12878" - -} diff --git a/dev/examples/dx/ENCSR936XTK_subsampled_chr19_only_single_rep_dx.json b/dev/examples/dx/ENCSR936XTK_subsampled_chr19_only_single_rep_dx.json deleted file mode 100644 index 6984643f..00000000 --- a/dev/examples/dx/ENCSR936XTK_subsampled_chr19_only_single_rep_dx.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-genome-data/genome_tsv/v1/hg38_chr19_chrM_dx.tsv", - - "chip.fastqs_rep1_R1" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep1-R1.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep1_R2" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep1-R2.subsampled.67.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl1-R1.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R2" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl1-R2.subsampled.80.fastq.gz" - ], - - "chip.paired_end" : true, - - "chip.always_use_pooled_ctl" : true, - "chip.title" : "ENCSR936XTK (subsampled 1/67, chr19 and chrM Only)", - "chip.description" : "ZNF143 ChIP-seq on human GM12878" - -} diff --git a/dev/examples/dx/ENCSR936XTK_subsampled_dx.json b/dev/examples/dx/ENCSR936XTK_subsampled_dx.json deleted file mode 100644 index 43ec09f6..00000000 --- a/dev/examples/dx/ENCSR936XTK_subsampled_dx.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-genome-data/genome_tsv/v1/hg38_dx.tsv", - - "chip.fastqs_rep1_R1" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep1-R1.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep1_R2" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep1-R2.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep2_R1" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep2-R1.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep2_R2" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep2-R2.subsampled.67.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl1-R1.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R2" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl1-R2.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R1" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl2-R1.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R2" : [ - "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl2-R2.subsampled.80.fastq.gz" - ], - - "chip.paired_end" : true, - - "chip.always_use_pooled_ctl" : true, - "chip.title" : "ENCSR936XTK (subsampled 1/67, chr19 and chrM Only)", - "chip.description" : "ZNF143 ChIP-seq on human GM12878" 
-} diff --git a/dev/examples/dx/template_general.json b/dev/examples/dx/template_general.json deleted file mode 100644 index dd070109..00000000 --- a/dev/examples/dx/template_general.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "chip.pipeline_type" : "tf" -} diff --git a/dev/examples/dx/template_hg19.json b/dev/examples/dx/template_hg19.json deleted file mode 100644 index 8c87b94f..00000000 --- a/dev/examples/dx/template_hg19.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-genome-data/genome_tsv/v1/hg19_dx.tsv" -} diff --git a/dev/examples/dx/template_hg38.json b/dev/examples/dx/template_hg38.json deleted file mode 100644 index 1b00d440..00000000 --- a/dev/examples/dx/template_hg38.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-genome-data/genome_tsv/v1/hg38_dx.tsv" -} diff --git a/dev/examples/dx/template_mm10.json b/dev/examples/dx/template_mm10.json deleted file mode 100644 index dcf5dd1c..00000000 --- a/dev/examples/dx/template_mm10.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-genome-data/genome_tsv/v1/mm10_dx.tsv" -} diff --git a/dev/examples/dx/template_mm9.json b/dev/examples/dx/template_mm9.json deleted file mode 100644 index fc42863a..00000000 --- a/dev/examples/dx/template_mm9.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-genome-data/genome_tsv/v1/mm9_dx.tsv" -} diff --git a/dev/examples/dx_azure/ENCSR000DYI_dx_azure.json b/dev/examples/dx_azure/ENCSR000DYI_dx_azure.json deleted file mode 100644 index 8c07d7c8..00000000 --- a/dev/examples/dx_azure/ENCSR000DYI_dx_azure.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-genome-data/genome_tsv/v1/hg38_dx_azure.tsv", - "chip.fastqs_rep1_R1" : ["dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/rep1.fastq.gz" - ], - "chip.fastqs_rep2_R1" : ["dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/rep2.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : ["dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/ctl1.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R1" : ["dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/ctl2.fastq.gz" - ], - - "chip.paired_end" : false, - "chip.always_use_pooled_ctl" : true, - "chip.spp_cpu" : 2, - - "chip.title" : "ENCSR000DYI", - "chip.description" : "CEBPB ChIP-seq on human A549 produced by the Snyder lab" -} diff --git a/dev/examples/dx_azure/ENCSR000DYI_subsampled_chr19_only_dx_azure.json b/dev/examples/dx_azure/ENCSR000DYI_subsampled_chr19_only_dx_azure.json deleted file mode 100644 index cc404bf0..00000000 --- a/dev/examples/dx_azure/ENCSR000DYI_subsampled_chr19_only_dx_azure.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-genome-data/genome_tsv/v1/hg38_chr19_chrM_dx_azure.tsv", - "chip.fastqs_rep1_R1" : ["dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/rep1.subsampled.25.fastq.gz" - ], - "chip.fastqs_rep2_R1" : 
["dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/rep2.subsampled.15.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : ["dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/ctl1.subsampled.25.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R1" : ["dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/ctl2.subsampled.25.fastq.gz" - ], - - "chip.paired_end" : false, - "chip.always_use_pooled_ctl" : true, - - "chip.title" : "ENCSR000DYI (subsampled 1/25, chr19/chrM only)", - "chip.description" : "CEBPB ChIP-seq on human A549 produced by the Snyder lab" -} diff --git a/dev/examples/dx_azure/ENCSR000DYI_subsampled_dx_azure.json b/dev/examples/dx_azure/ENCSR000DYI_subsampled_dx_azure.json deleted file mode 100644 index 2ce25cdc..00000000 --- a/dev/examples/dx_azure/ENCSR000DYI_subsampled_dx_azure.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-genome-data/genome_tsv/v1/hg38_dx_azure.tsv", - "chip.fastqs_rep1_R1" : ["dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/rep1.subsampled.25.fastq.gz" - ], - "chip.fastqs_rep2_R1" : ["dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/rep2.subsampled.15.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : ["dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/ctl1.subsampled.25.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R1" : ["dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/ctl2.subsampled.25.fastq.gz" - ], - - "chip.paired_end" : false, - "chip.always_use_pooled_ctl" : true, - - "chip.title" : "ENCSR000DYI (subsampled 1/25)", - "chip.description" : "CEBPB ChIP-seq on human A549 produced by the Snyder lab" -} diff --git a/dev/examples/dx_azure/ENCSR936XTK_dx_azure.json b/dev/examples/dx_azure/ENCSR936XTK_dx_azure.json deleted file mode 100644 index d835ac46..00000000 --- a/dev/examples/dx_azure/ENCSR936XTK_dx_azure.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-genome-data/genome_tsv/v1/hg38_dx_azure.tsv", - - "chip.fastqs_rep1_R1" : [ - "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep1-R1.fastq.gz" - ], - "chip.fastqs_rep1_R2" : [ - "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep1-R2.fastq.gz" - ], - "chip.fastqs_rep2_R1" : [ - "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep2-R1.fastq.gz" - ], - "chip.fastqs_rep2_R2" : [ - "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep2-R2.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : [ - "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl1-R1.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R2" : [ - "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl1-R2.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R1" : [ - 
"dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl2-R1.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R2" : [ - "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl2-R2.fastq.gz" - ], - - "chip.paired_end" : true, - - "chip.always_use_pooled_ctl" : true, - "chip.title" : "ENCSR936XTK", - "chip.description" : "ZNF143 ChIP-seq on human GM12878" -} diff --git a/dev/examples/dx_azure/ENCSR936XTK_subsampled_chr19_only_dx_azure.json b/dev/examples/dx_azure/ENCSR936XTK_subsampled_chr19_only_dx_azure.json deleted file mode 100644 index be4fa124..00000000 --- a/dev/examples/dx_azure/ENCSR936XTK_subsampled_chr19_only_dx_azure.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-genome-data/genome_tsv/v1/hg38_chr19_chrM_dx_azure.tsv", - - "chip.fastqs_rep1_R1" : [ - "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep1-R1.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep1_R2" : [ - "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep1-R2.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep2_R1" : [ - "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep2-R1.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep2_R2" : [ - "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep2-R2.subsampled.67.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : [ - "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl1-R1.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R2" : [ - "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl1-R2.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R1" : [ - "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl2-R1.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R2" : [ - "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl2-R2.subsampled.80.fastq.gz" - ], - - "chip.paired_end" : true, - - "chip.always_use_pooled_ctl" : true, - "chip.title" : "ENCSR936XTK (subsampled 1/67, chr19 and chrM Only)", - "chip.description" : "ZNF143 ChIP-seq on human GM12878" -} diff --git a/dev/examples/dx_azure/ENCSR936XTK_subsampled_dx_azure.json b/dev/examples/dx_azure/ENCSR936XTK_subsampled_dx_azure.json deleted file mode 100644 index 757c8efd..00000000 --- a/dev/examples/dx_azure/ENCSR936XTK_subsampled_dx_azure.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-genome-data/genome_tsv/v1/hg38_dx_azure.tsv", - - "chip.fastqs_rep1_R1" : [ - "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep1-R1.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep1_R2" : [ - "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep1-R2.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep2_R1" : [ - 
"dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep2-R1.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep2_R2" : [ - "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep2-R2.subsampled.67.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : [ - "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl1-R1.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R2" : [ - "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl1-R2.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R1" : [ - "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl2-R1.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R2" : [ - "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl2-R2.subsampled.80.fastq.gz" - ], - - "chip.paired_end" : true, - - "chip.always_use_pooled_ctl" : true, - "chip.title" : "ENCSR936XTK (subsampled 1/67)", - "chip.description" : "ZNF143 ChIP-seq on human GM12878" -} diff --git a/dev/examples/dx_azure/template_general.json b/dev/examples/dx_azure/template_general.json deleted file mode 100644 index dd070109..00000000 --- a/dev/examples/dx_azure/template_general.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "chip.pipeline_type" : "tf" -} diff --git a/dev/examples/dx_azure/template_hg19.json b/dev/examples/dx_azure/template_hg19.json deleted file mode 100644 index ce29e235..00000000 --- a/dev/examples/dx_azure/template_hg19.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-genome-data/genome_tsv/v1/hg19_dx_azure.tsv" -} diff --git a/dev/examples/dx_azure/template_hg38.json b/dev/examples/dx_azure/template_hg38.json deleted file mode 100644 index a8e4db3b..00000000 --- a/dev/examples/dx_azure/template_hg38.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-genome-data/genome_tsv/v1/hg38_dx_azure.tsv" -} diff --git a/dev/examples/dx_azure/template_mm10.json b/dev/examples/dx_azure/template_mm10.json deleted file mode 100644 index f5cb1c99..00000000 --- a/dev/examples/dx_azure/template_mm10.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-genome-data/genome_tsv/v1/mm10_dx_azure.tsv" -} diff --git a/dev/examples/dx_azure/template_mm9.json b/dev/examples/dx_azure/template_mm9.json deleted file mode 100644 index a44fed52..00000000 --- a/dev/examples/dx_azure/template_mm9.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-genome-data/genome_tsv/v1/mm9_dx_azure.tsv" -} diff --git a/dev/examples/google/ENCSR000DYI.json b/dev/examples/google/ENCSR000DYI.json deleted file mode 120000 index 727a8a2a..00000000 --- a/dev/examples/google/ENCSR000DYI.json +++ /dev/null @@ -1 +0,0 @@ -../../test/test_workflow/ENCSR000DYI.json \ No newline at end of file diff --git a/dev/examples/google/ENCSR936XTK.json b/dev/examples/google/ENCSR936XTK.json deleted file mode 120000 index 98de4f57..00000000 --- a/dev/examples/google/ENCSR936XTK.json +++ /dev/null @@ -1 +0,0 @@ 
-../../test/test_workflow/ENCSR936XTK.json \ No newline at end of file diff --git a/dev/examples/google/ENCSR936XTK_subsampled_chr19_only.json b/dev/examples/google/ENCSR936XTK_subsampled_chr19_only.json deleted file mode 120000 index 846d8d87..00000000 --- a/dev/examples/google/ENCSR936XTK_subsampled_chr19_only.json +++ /dev/null @@ -1 +0,0 @@ -../../test/test_workflow/ENCSR936XTK_subsampled_chr19_only.json \ No newline at end of file diff --git a/dev/examples/klab/ENCSR000DYI_klab.json b/dev/examples/klab/ENCSR000DYI_klab.json deleted file mode 100644 index 322132ff..00000000 --- a/dev/examples/klab/ENCSR000DYI_klab.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "/mnt/data/pipeline_genome_data/genome_tsv/v1/hg38/hg38_klab.tsv", - "chip.fastqs_rep1_R1" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/rep1.fastq.gz" - ], - "chip.fastqs_rep2_R1" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/rep2.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/ctl1.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R1" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/ctl2.fastq.gz" - ], - - "chip.paired_end" : false, - "chip.always_use_pooled_ctl" : true, - "chip.spp_cpu" : 2, - - "chip.title" : "ENCSR000DYI", - "chip.description" : "CEBPB ChIP-seq on human A549 produced by the Snyder lab" -} diff --git a/dev/examples/klab/ENCSR000DYI_subsampled_chr19_only_dx_style_fastq_input_klab.json b/dev/examples/klab/ENCSR000DYI_subsampled_chr19_only_dx_style_fastq_input_klab.json deleted file mode 100644 index f27b397d..00000000 --- a/dev/examples/klab/ENCSR000DYI_subsampled_chr19_only_dx_style_fastq_input_klab.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "/mnt/data/pipeline_genome_data/genome_tsv/v1/hg38_chr19_chrM/hg38_chr19_chrM_klab.tsv", - "chip.fastqs_rep1_R1" : [ - "/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/rep1.subsampled.25.fastq.gz" - ], - "chip.fastqs_rep2_R1" : [ - "/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/rep2.subsampled.20.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : [ - "/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/ctl1.subsampled.25.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R1" : [ - "/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/ctl2.subsampled.25.fastq.gz" - ], - "chip.paired_end" : false, - "chip.always_use_pooled_ctl" : true, - - "chip.title" : "ENCSR000DYI (subsampled 1/25, chr19 and chrM only, dx style fastqs def)", - "chip.description" : "CEBPB ChIP-seq on human A549 produced by the Snyder lab" -} diff --git a/dev/examples/klab/ENCSR000DYI_subsampled_chr19_only_keep_irregular_chr_klab.json b/dev/examples/klab/ENCSR000DYI_subsampled_chr19_only_keep_irregular_chr_klab.json deleted file mode 100644 index 08988e9b..00000000 --- a/dev/examples/klab/ENCSR000DYI_subsampled_chr19_only_keep_irregular_chr_klab.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "/mnt/data/pipeline_genome_data/genome_tsv/v1/hg38_chr19_chrM/hg38_chr19_chrM_klab.tsv", - "chip.fastqs_rep1_R1" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/rep1.subsampled.25.fastq.gz" - ], - "chip.fastqs_rep2_R1" : 
["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/rep2.subsampled.20.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/ctl1.subsampled.25.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R1" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/ctl2.subsampled.25.fastq.gz" - ], - - "chip.paired_end" : false, - "chip.always_use_pooled_ctl" : true, - "chip.keep_irregular_chr_in_bfilt_peak" : true, - - "chip.title" : "ENCSR000DYI (subsampled 1/25, chr19 and chrM only)", - "chip.description" : "CEBPB ChIP-seq on human A549 produced by the Snyder lab" -} diff --git a/dev/examples/klab/ENCSR000DYI_subsampled_chr19_only_klab.json b/dev/examples/klab/ENCSR000DYI_subsampled_chr19_only_klab.json deleted file mode 100644 index 0ef5c019..00000000 --- a/dev/examples/klab/ENCSR000DYI_subsampled_chr19_only_klab.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "/mnt/data/pipeline_genome_data/genome_tsv/v1/hg38_chr19_chrM/hg38_chr19_chrM_klab.tsv", - "chip.fastqs_rep1_R1" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/rep1.subsampled.25.fastq.gz" - ], - "chip.fastqs_rep2_R1" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/rep2.subsampled.20.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/ctl1.subsampled.25.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R1" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/ctl2.subsampled.25.fastq.gz" - ], - - "chip.paired_end" : false, - "chip.always_use_pooled_ctl" : true, - - "chip.title" : "ENCSR000DYI (subsampled 1/25, chr19 and chrM only)", - "chip.description" : "CEBPB ChIP-seq on human A549 produced by the Snyder lab" -} diff --git a/dev/examples/klab/ENCSR000DYI_subsampled_klab.json b/dev/examples/klab/ENCSR000DYI_subsampled_klab.json deleted file mode 100644 index a181d209..00000000 --- a/dev/examples/klab/ENCSR000DYI_subsampled_klab.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "/mnt/data/pipeline_genome_data/genome_tsv/v1/hg38/hg38_klab.tsv", - "chip.fastqs_rep1_R1" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/rep1.subsampled.25.fastq.gz" - ], - "chip.fastqs_rep2_R1" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/rep2.subsampled.15.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/ctl1.subsampled.25.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R1" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/ctl2.subsampled.25.fastq.gz" - ], - - "chip.paired_end" : false, - "chip.always_use_pooled_ctl" : true, - - "chip.title" : "ENCSR000DYI (subsampled 1/25)", - "chip.description" : "CEBPB ChIP-seq on human A549 produced by the Snyder lab" -} diff --git a/dev/examples/klab/ENCSR936XTK_klab.json b/dev/examples/klab/ENCSR936XTK_klab.json deleted file mode 100644 index 4640be5b..00000000 --- a/dev/examples/klab/ENCSR936XTK_klab.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "/mnt/data/pipeline_genome_data/genome_tsv/v1/hg38/hg38_klab.tsv", - "chip.fastqs_rep1_R1" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep1-R1.fastq.gz" - ], - "chip.fastqs_rep1_R2" : 
["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep1-R2.fastq.gz" - ], - "chip.fastqs_rep2_R1" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep2-R1.fastq.gz" - ], - "chip.fastqs_rep2_R2" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep2-R2.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl1-R1.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R2" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl1-R2.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R1" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl2-R1.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R2" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl2-R2.fastq.gz" - ], - - "chip.paired_end" : true, - - "chip.always_use_pooled_ctl" : true, - "chip.title" : "ENCSR936XTK", - "chip.description" : "ZNF143 ChIP-seq on human GM12878" -} diff --git a/dev/examples/klab/ENCSR936XTK_subsampled_chr19_only_klab.json b/dev/examples/klab/ENCSR936XTK_subsampled_chr19_only_klab.json deleted file mode 100644 index a190fa54..00000000 --- a/dev/examples/klab/ENCSR936XTK_subsampled_chr19_only_klab.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "/mnt/data/pipeline_genome_data/genome_tsv/v1/hg38_chr19_chrM/hg38_chr19_chrM_klab.tsv", - "chip.fastqs_rep1_R1" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep1-R1.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep1_R2" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep1-R2.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep2_R1" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep2-R1.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep2_R2" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep2-R2.subsampled.67.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl1-R1.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R2" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl1-R2.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R1" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl2-R1.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R2" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl2-R2.subsampled.80.fastq.gz" - ], - - "chip.paired_end" : true, - - "chip.always_use_pooled_ctl" : true, - "chip.title" : "ENCSR936XTK (subsampled 1/67, chr19 and chrM Only)", - "chip.description" : "ZNF143 ChIP-seq on human GM12878" -} diff --git a/dev/examples/klab/ENCSR936XTK_subsampled_chr19_only_single_rep_klab.json b/dev/examples/klab/ENCSR936XTK_subsampled_chr19_only_single_rep_klab.json deleted file mode 100644 index 0dc553eb..00000000 --- a/dev/examples/klab/ENCSR936XTK_subsampled_chr19_only_single_rep_klab.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "/mnt/data/pipeline_genome_data/genome_tsv/v1/hg38_chr19_chrM/hg38_chr19_chrM_klab.tsv", - "chip.fastqs_rep1_R1" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep1-R1.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep1_R2" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep1-R2.subsampled.67.fastq.gz" - ], 
- "chip.ctl_fastqs_rep1_R1" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl1-R1.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R2" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl1-R2.subsampled.80.fastq.gz" - ], - - "chip.paired_end" : true, - - "chip.always_use_pooled_ctl" : true, - "chip.title" : "ENCSR936XTK (subsampled 1/67, chr19 and chrM Only)", - "chip.description" : "ZNF143 ChIP-seq on human GM12878" -} diff --git a/dev/examples/klab/ENCSR936XTK_subsampled_klab.json b/dev/examples/klab/ENCSR936XTK_subsampled_klab.json deleted file mode 100644 index 46375cd1..00000000 --- a/dev/examples/klab/ENCSR936XTK_subsampled_klab.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "/mnt/data/pipeline_genome_data/genome_tsv/v1/hg38/hg38_klab.tsv", - "chip.fastqs_rep1_R1" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep1-R1.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep1_R2" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep1-R2.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep2_R1" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep2-R1.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep2_R2" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep2-R2.subsampled.67.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl1-R1.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R2" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl1-R2.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R1" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl2-R1.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R2" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl2-R2.subsampled.80.fastq.gz" - ], - - "chip.paired_end" : true, - - "chip.always_use_pooled_ctl" : true, - "chip.title" : "ENCSR936XTK (subsampled 1/67)", - "chip.description" : "ZNF143 ChIP-seq on human GM12878" -} diff --git a/dev/examples/local/ENCSR936XTK_subsampled.json b/dev/examples/local/ENCSR936XTK_subsampled.json deleted file mode 100644 index 5af03a88..00000000 --- a/dev/examples/local/ENCSR936XTK_subsampled.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "test_genome_database/hg38_local.tsv", - "chip.fastqs" : [ - [["test_sample/ENCSR936XTK/fastq_subsampled/rep1-R1.subsampled.67.fastq.gz", - "test_sample/ENCSR936XTK/fastq_subsampled/rep1-R2.subsampled.67.fastq.gz"]], - [["test_sample/ENCSR936XTK/fastq_subsampled/rep2-R1.subsampled.67.fastq.gz", - "test_sample/ENCSR936XTK/fastq_subsampled/rep2-R2.subsampled.67.fastq.gz"]] - ], - "chip.ctl_fastqs" : [ - [["test_sample/ENCSR936XTK/fastq_subsampled/ctl1-R1.subsampled.80.fastq.gz", - "test_sample/ENCSR936XTK/fastq_subsampled/ctl1-R2.subsampled.80.fastq.gz"]], - [["test_sample/ENCSR936XTK/fastq_subsampled/ctl2-R1.subsampled.80.fastq.gz", - "test_sample/ENCSR936XTK/fastq_subsampled/ctl2-R2.subsampled.80.fastq.gz"]] - ], - - "chip.paired_end" : true, - - "chip.always_use_pooled_ctl" : true, - "chip.title" : "ENCSR936XTK (subsampled 1/67)", - "chip.description" : "ZNF143 ChIP-seq on human GM12878" -} diff --git a/dev/examples/local/ENCSR936XTK_subsampled_chr19_only.json b/dev/examples/local/ENCSR936XTK_subsampled_chr19_only.json deleted file mode 100644 index 
2c68fd89..00000000 --- a/dev/examples/local/ENCSR936XTK_subsampled_chr19_only.json +++ /dev/null @@ -1,49 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "test_genome_database/hg38_chr19_chrM_local.tsv", - "chip.fastqs" : [ - [["test_sample/ENCSR936XTK/fastq_subsampled/rep1-R1.subsampled.67.fastq.gz", - "test_sample/ENCSR936XTK/fastq_subsampled/rep1-R2.subsampled.67.fastq.gz"]], - [["test_sample/ENCSR936XTK/fastq_subsampled/rep2-R1.subsampled.67.fastq.gz", - "test_sample/ENCSR936XTK/fastq_subsampled/rep2-R2.subsampled.67.fastq.gz"]] - ], - "chip.ctl_fastqs" : [ - [["test_sample/ENCSR936XTK/fastq_subsampled/ctl1-R1.subsampled.80.fastq.gz", - "test_sample/ENCSR936XTK/fastq_subsampled/ctl1-R2.subsampled.80.fastq.gz"]], - [["test_sample/ENCSR936XTK/fastq_subsampled/ctl2-R1.subsampled.80.fastq.gz", - "test_sample/ENCSR936XTK/fastq_subsampled/ctl2-R2.subsampled.80.fastq.gz"]] - ], - - "chip.paired_end" : true, - - "chip.always_use_pooled_ctl" : true, - "chip.title" : "ENCSR936XTK (subsampled 1/67, chr19_chrM only)", - "chip.description" : "ZNF143 ChIP-seq on human GM12878", - - "chip.align_cpu" : 1, - "chip.align_mem_mb" : 4000, - "chip.align_time_hr" : 4, - - "chip.filter_cpu" : 1, - "chip.filter_mem_mb" : 4000, - "chip.filter_time_hr" : 4, - - "chip.bam2ta_cpu" : 1, - "chip.bam2ta_mem_mb" : 4000, - "chip.bam2ta_time_hr" : 4, - - "chip.spr_mem_mb" : 4000, - - "chip.fingerprint_cpu" : 1, - "chip.fingerprint_mem_mb" : 4000, - "chip.fingerprint_time_hr" : 6, - - "chip.xcor_cpu" : 1, - "chip.xcor_mem_mb" : 4000, - "chip.xcor_time_hr" : 4, - - "chip.call_peak_mem_mb" : 4000, - "chip.call_peak_time_hr" : 4, - - "chip.call_peak_cpu" : 1 -} diff --git a/dev/examples/local/ENCSR936XTK_subsampled_chr19_only_sge_conda.sh b/dev/examples/local/ENCSR936XTK_subsampled_chr19_only_sge_conda.sh deleted file mode 100644 index 83384c82..00000000 --- a/dev/examples/local/ENCSR936XTK_subsampled_chr19_only_sge_conda.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash - -# do not touch these settings -# number of tasks and nodes are fixed at 1 -#$ -S /bin/sh -#$ -terse -#$ -V - -# job name for pipeline -# this name will appear when you monitor jobs with "squeue -u $USER" -#$ -N ENCSR936XTK_subsampled_chr19_only - -# walltime for your job -# give long time enough to finish your pipeline -# <12 hr: small/test samples -# >24 hr: large samples -#$ -l h_rt=12:00:00 -#$ -l s_rt=12:00:00 - -# total amount of memory -# depends on the size of your FASTQs -# but should be <= NUM_CONCURRENT_TASK x 20GB for big samples -# or <= NUM_CONCURRENT_TASK x 10GB for small samples -# do not request too much memory -# cluster will not accept your job -#$ -l h_vmem=20G -#$ -l s_vmem=20G - -# max number of cpus for each pipeline -# should be <= NUM_CONCURRENT_TASK x "chip.bwa_cpu" in input JSON file -# since bwa is a bottlenecking task in the pipeline -# "chip.bwa_cpu" is a number of cpus per replicate -# SGE has a parallel environment (PE). 
-# ask your admin to add a new PE named "shm" -# or use your cluster's own PE instead of "shm" -# 2 means number of cpus per pipeline -#$ -pe shm 2 - -# load java module if it exists -module load java || true - -# activate pipeline's Conda environment if Conda env exists -source activate encode-chip-seq-pipeline - -# use input JSON for a small test sample -# you make an input JSON for your own sample -# start from any of two templates for single-ended and paired-ended samples -# (examples/template_se.json, examples/template_pe.json) -# do not use an input JSON file for a test sample (ENCSR936XTK) -# it's a sample with multimapping reads -INPUT=examples/local/ENCSR936XTK_subsampled_chr19_only.json - -# If this pipeline fails, then use this metadata JSON file to resume a failed pipeline from where it left -# See details in /utils/resumer/README.md -PIPELINE_METADATA=metadata.json - -# limit number of concurrent tasks -# we recommend to use a number of replicates here -# so that all replicates are processed in parellel at the same time. -# make sure that resource settings in your input JSON file -# are consistent with SBATCH resource settings (--mem, --cpus-per-task) -# in this script -NUM_CONCURRENT_TASK=2 - -# run pipeline -# you can monitor your jobs with "squeue -u $USER" -java -jar -Dconfig.file=backends/backend.conf \ --Dbackend.providers.Local.config.concurrent-job-limit=${NUM_CONCURRENT_TASK} \ -$HOME/cromwell-38.jar run chip.wdl -i ${INPUT} -m ${PIPELINE_METADATA} \ No newline at end of file diff --git a/dev/examples/local/ENCSR936XTK_subsampled_chr19_only_sge_singularity.sh b/dev/examples/local/ENCSR936XTK_subsampled_chr19_only_sge_singularity.sh deleted file mode 100644 index 4df78341..00000000 --- a/dev/examples/local/ENCSR936XTK_subsampled_chr19_only_sge_singularity.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash - -# do not touch these settings -# number of tasks and nodes are fixed at 1 -#$ -S /bin/sh -#$ -terse -#$ -V - -# job name for pipeline -# this name will appear when you monitor jobs with "squeue -u $USER" -#$ -N ENCSR936XTK_subsampled_chr19_only - -# walltime for your job -# give long time enough to finish your pipeline -# <12 hr: small/test samples -# >24 hr: large samples -#$ -l h_rt=12:00:00 -#$ -l s_rt=12:00:00 - -# total amount of memory -# depends on the size of your FASTQs -# but should be <= NUM_CONCURRENT_TASK x 20GB for big samples -# or <= NUM_CONCURRENT_TASK x 10GB for small samples -# do not request too much memory -# cluster will not accept your job -#$ -l h_vmem=20G -#$ -l s_vmem=20G - -# max number of cpus for each pipeline -# should be <= NUM_CONCURRENT_TASK x "chip.bwa_cpu" in input JSON file -# since bwa is a bottlenecking task in the pipeline -# "chip.bwa_cpu" is a number of cpus per replicate -# SGE has a parallel environment (PE). 
-# ask your admin to add a new PE named "shm" -# or use your cluster's own PE instead of "shm" -# 2 means number of cpus per pipeline -#$ -pe shm 2 - -# load java module if it exists -module load java || true - -# use input JSON for a small test sample -# you make an input JSON for your own sample -# start from any of two templates for single-ended and paired-ended samples -# (examples/template_se.json, examples/template_pe.json) -# do not use an input JSON file for a test sample (ENCSR936XTK) -# it's a sample with multimapping reads -INPUT=examples/local/ENCSR936XTK_subsampled_chr19_only.json - -# If this pipeline fails, then use this metadata JSON file to resume a failed pipeline from where it left -# See details in /utils/resumer/README.md -PIPELINE_METADATA=metadata.json - -# limit number of concurrent tasks -# we recommend to use a number of replicates here -# so that all replicates are processed in parellel at the same time. -# make sure that resource settings in your input JSON file -# are consistent with SBATCH resource settings (--mem, --cpus-per-task) -# in this script -NUM_CONCURRENT_TASK=2 - -# run pipeline -# you can monitor your jobs with "squeue -u $USER" -java -jar -Dconfig.file=backends/backend.conf -Dbackend.default=singularity \ --Dbackend.providers.singularity.config.concurrent-job-limit=${NUM_CONCURRENT_TASK} \ -$HOME/cromwell-38.jar run chip.wdl -i ${INPUT} -o workflow_opts/singularity.json -m ${PIPELINE_METADATA} \ No newline at end of file diff --git a/dev/examples/local/ENCSR936XTK_subsampled_chr19_only_slurm_conda.sh b/dev/examples/local/ENCSR936XTK_subsampled_chr19_only_slurm_conda.sh deleted file mode 100644 index 231d6108..00000000 --- a/dev/examples/local/ENCSR936XTK_subsampled_chr19_only_slurm_conda.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash - -# do not touch these settings -# number of tasks and nodes are fixed at 1 -#SBATCH -n 1 -#SBATCH --ntasks-per-node=1 - -# job name for pipeline -# this name will appear when you monitor jobs with "squeue -u $USER" -#SBATCH --job-name=ENCSR936XTK_subsampled_chr19_only - -# walltime for your job -# give long time enough to finish your pipeline -# <12 hr: small/test samples -# >24 hr: large samples -#SBATCH --time=12:00:00 - -# total amount of memory -# depends on the size of your FASTQs -# but should be <= NUM_CONCURRENT_TASK x 20GB for big samples -# or <= NUM_CONCURRENT_TASK x 10GB for small samples -# do not request too much memory -# cluster will not accept your job -#SBATCH --mem=20G - -# max number of cpus for each pipeline -# should be <= NUM_CONCURRENT_TASK x "chip.bwa_cpu" in input JSON file -# since bwa is a bottlenecking task in the pipeline -# "chip.bwa_cpu" is a number of cpus per replicate -#SBATCH --cpus-per-task=2 - -# email notification for job status -#SBATCH --mail-type=END,FAIL - -# load java module if it exists -module load java || true - -# activate pipeline's Conda environment if Conda env exists -source activate encode-chip-seq-pipeline - -# use input JSON for a small test sample -# you make an input JSON for your own sample -# start from any of two templates for single-ended and paired-ended samples -# (examples/template_se.json, examples/template_pe.json) -# do not use an input JSON file for a test sample (ENCSR936XTK) -# it's a sample with multimapping reads -INPUT=examples/local/ENCSR936XTK_subsampled_chr19_only.json - -# If this pipeline fails, then use this metadata JSON file to resume a failed pipeline from where it left -# See details in /utils/resumer/README.md 
-PIPELINE_METADATA=metadata.json - -# limit number of concurrent tasks -# we recommend to use a number of replicates here -# so that all replicates are processed in parellel at the same time. -# make sure that resource settings in your input JSON file -# are consistent with SBATCH resource settings (--mem, --cpus-per-task) -# in this script -NUM_CONCURRENT_TASK=2 - -# run pipeline -# you can monitor your jobs with "squeue -u $USER" -java -jar -Dconfig.file=backends/backend.conf \ --Dbackend.providers.Local.config.concurrent-job-limit=${NUM_CONCURRENT_TASK} \ -$HOME/cromwell-38.jar run chip.wdl -i ${INPUT} -m ${PIPELINE_METADATA} \ No newline at end of file diff --git a/dev/examples/local/ENCSR936XTK_subsampled_chr19_only_slurm_singularity.sh b/dev/examples/local/ENCSR936XTK_subsampled_chr19_only_slurm_singularity.sh deleted file mode 100644 index 4b4a578a..00000000 --- a/dev/examples/local/ENCSR936XTK_subsampled_chr19_only_slurm_singularity.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/bin/bash - -# do not touch these settings -# number of tasks and nodes are fixed at 1 -#SBATCH -n 1 -#SBATCH --ntasks-per-node=1 - -# job name for pipeline -# this name will appear when you monitor jobs with "squeue -u $USER" -#SBATCH --job-name=ENCSR936XTK_subsampled_chr19_only - -# walltime for your job -# give long time enough to finish your pipeline -# <12 hr: small/test samples -# >24 hr: large samples -#SBATCH --time=12:00:00 - -# total amount of memory -# depends on the size of your FASTQs -# but should be <= NUM_CONCURRENT_TASK x 20GB for big samples -# or <= NUM_CONCURRENT_TASK x 10GB for small samples -# do not request too much memory -# cluster will not accept your job -#SBATCH --mem=20G - -# max number of cpus for each pipeline -# should be <= NUM_CONCURRENT_TASK x "chip.bwa_cpu" in input JSON file -# since bwa is a bottlenecking task in the pipeline -# "chip.bwa_cpu" is a number of cpus per replicate -#SBATCH --cpus-per-task=2 - -# email notification for job status -#SBATCH --mail-type=END,FAIL - -# load java module if it exists -module load java || true - -# use input JSON for a small test sample -# you make an input JSON for your own sample -# start from any of two templates for single-ended and paired-ended samples -# (examples/template_se.json, examples/template_pe.json) -# do not use an input JSON file for a test sample (ENCSR936XTK) -# it's a sample with multimapping reads -INPUT=examples/local/ENCSR936XTK_subsampled_chr19_only.json - -# If this pipeline fails, then use this metadata JSON file to resume a failed pipeline from where it left -# See details in /utils/resumer/README.md -PIPELINE_METADATA=metadata.json - -# limit number of concurrent tasks -# we recommend to use a number of replicates here -# so that all replicates are processed in parellel at the same time. 
-# make sure that resource settings in your input JSON file -# are consistent with SBATCH resource settings (--mem, --cpus-per-task) -# in this script -NUM_CONCURRENT_TASK=2 - -# run pipeline -# you can monitor your jobs with "squeue -u $USER" -java -jar -Dconfig.file=backends/backend.conf -Dbackend.default=singularity \ --Dbackend.providers.singularity.config.concurrent-job-limit=${NUM_CONCURRENT_TASK} \ -$HOME/cromwell-38.jar run chip.wdl -i ${INPUT} -o workflow_opts/singularity.json -m ${PIPELINE_METADATA} \ No newline at end of file diff --git a/dev/examples/local/ENCSR936XTK_subsampled_sge_conda.sh b/dev/examples/local/ENCSR936XTK_subsampled_sge_conda.sh deleted file mode 100644 index e625f36e..00000000 --- a/dev/examples/local/ENCSR936XTK_subsampled_sge_conda.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash - -# do not touch these settings -# number of tasks and nodes are fixed at 1 -#$ -S /bin/sh -#$ -terse -#$ -V - -# job name for pipeline -# this name will appear when you monitor jobs with "squeue -u $USER" -#$ -N ENCSR936XTK_subsampled - -# walltime for your job -# give long time enough to finish your pipeline -# <12 hr: small/test samples -# >24 hr: large samples -#$ -l h_rt=12:00:00 -#$ -l s_rt=12:00:00 - -# total amount of memory -# depends on the size of your FASTQs -# but should be <= NUM_CONCURRENT_TASK x 20GB for big samples -# or <= NUM_CONCURRENT_TASK x 10GB for small samples -# do not request too much memory -# cluster will not accept your job -#$ -l h_vmem=20G -#$ -l s_vmem=20G - -# max number of cpus for each pipeline -# should be <= NUM_CONCURRENT_TASK x "chip.bwa_cpu" in input JSON file -# since bwa is a bottlenecking task in the pipeline -# "chip.bwa_cpu" is a number of cpus per replicate -# SGE has a parallel environment (PE). -# ask your admin to add a new PE named "shm" -# or use your cluster's own PE instead of "shm" -# 2 means number of cpus per pipeline -#$ -pe shm 2 - -# load java module if it exists -module load java || true - -# activate pipeline's Conda environment if Conda env exists -source activate encode-chip-seq-pipeline - -# use input JSON for a small test sample -# you make an input JSON for your own sample -# start from any of two templates for single-ended and paired-ended samples -# (examples/template_se.json, examples/template_pe.json) -# do not use an input JSON file for a test sample (ENCSR936XTK) -# it's a sample with multimapping reads -INPUT=examples/local/ENCSR936XTK_subsampled.json - -# If this pipeline fails, then use this metadata JSON file to resume a failed pipeline from where it left -# See details in /utils/resumer/README.md -PIPELINE_METADATA=metadata.json - -# limit number of concurrent tasks -# we recommend to use a number of replicates here -# so that all replicates are processed in parellel at the same time. 
-# make sure that resource settings in your input JSON file -# are consistent with SBATCH resource settings (--mem, --cpus-per-task) -# in this script -NUM_CONCURRENT_TASK=2 - -# run pipeline -# you can monitor your jobs with "squeue -u $USER" -java -jar -Dconfig.file=backends/backend.conf \ --Dbackend.providers.Local.config.concurrent-job-limit=${NUM_CONCURRENT_TASK} \ -$HOME/cromwell-38.jar run chip.wdl -i ${INPUT} -m ${PIPELINE_METADATA} \ No newline at end of file diff --git a/dev/examples/local/ENCSR936XTK_subsampled_sge_singularity.sh b/dev/examples/local/ENCSR936XTK_subsampled_sge_singularity.sh deleted file mode 100644 index 50617ab6..00000000 --- a/dev/examples/local/ENCSR936XTK_subsampled_sge_singularity.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash - -# do not touch these settings -# number of tasks and nodes are fixed at 1 -#$ -S /bin/sh -#$ -terse -#$ -V - -# job name for pipeline -# this name will appear when you monitor jobs with "squeue -u $USER" -#$ -N ENCSR936XTK_subsampled - -# walltime for your job -# give long time enough to finish your pipeline -# <12 hr: small/test samples -# >24 hr: large samples -#$ -l h_rt=12:00:00 -#$ -l s_rt=12:00:00 - -# total amount of memory -# depends on the size of your FASTQs -# but should be <= NUM_CONCURRENT_TASK x 20GB for big samples -# or <= NUM_CONCURRENT_TASK x 10GB for small samples -# do not request too much memory -# cluster will not accept your job -#$ -l h_vmem=20G -#$ -l s_vmem=20G - -# max number of cpus for each pipeline -# should be <= NUM_CONCURRENT_TASK x "chip.bwa_cpu" in input JSON file -# since bwa is a bottlenecking task in the pipeline -# "chip.bwa_cpu" is a number of cpus per replicate -# SGE has a parallel environment (PE). -# ask your admin to add a new PE named "shm" -# or use your cluster's own PE instead of "shm" -# 2 means number of cpus per pipeline -#$ -pe shm 2 - -# load java module if it exists -module load java || true - -# use input JSON for a small test sample -# you make an input JSON for your own sample -# start from any of two templates for single-ended and paired-ended samples -# (examples/template_se.json, examples/template_pe.json) -# do not use an input JSON file for a test sample (ENCSR936XTK) -# it's a sample with multimapping reads -INPUT=examples/local/ENCSR936XTK_subsampled.json - -# If this pipeline fails, then use this metadata JSON file to resume a failed pipeline from where it left -# See details in /utils/resumer/README.md -PIPELINE_METADATA=metadata.json - -# limit number of concurrent tasks -# we recommend to use a number of replicates here -# so that all replicates are processed in parellel at the same time. 
-# make sure that resource settings in your input JSON file -# are consistent with SBATCH resource settings (--mem, --cpus-per-task) -# in this script -NUM_CONCURRENT_TASK=2 - -# run pipeline -# you can monitor your jobs with "squeue -u $USER" -java -jar -Dconfig.file=backends/backend.conf -Dbackend.default=singularity \ --Dbackend.providers.singularity.config.concurrent-job-limit=${NUM_CONCURRENT_TASK} \ -$HOME/cromwell-38.jar run chip.wdl -i ${INPUT} -o workflow_opts/singularity.json -m ${PIPELINE_METADATA} \ No newline at end of file diff --git a/dev/examples/local/ENCSR936XTK_subsampled_slurm_conda.sh b/dev/examples/local/ENCSR936XTK_subsampled_slurm_conda.sh deleted file mode 100644 index e7bdf937..00000000 --- a/dev/examples/local/ENCSR936XTK_subsampled_slurm_conda.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash - -# do not touch these settings -# number of tasks and nodes are fixed at 1 -#SBATCH -n 1 -#SBATCH --ntasks-per-node=1 - -# job name for pipeline -# this name will appear when you monitor jobs with "squeue -u $USER" -#SBATCH --job-name=ENCSR936XTK_subsampled - -# walltime for your job -# give long time enough to finish your pipeline -# <12 hr: small/test samples -# >24 hr: large samples -#SBATCH --time=12:00:00 - -# total amount of memory -# depends on the size of your FASTQs -# but should be <= NUM_CONCURRENT_TASK x 20GB for big samples -# or <= NUM_CONCURRENT_TASK x 10GB for small samples -# do not request too much memory -# cluster will not accept your job -#SBATCH --mem=20G - -# max number of cpus for each pipeline -# should be <= NUM_CONCURRENT_TASK x "chip.bwa_cpu" in input JSON file -# since bwa is a bottlenecking task in the pipeline -# "chip.bwa_cpu" is a number of cpus per replicate -#SBATCH --cpus-per-task=2 - -# email notification for job status -#SBATCH --mail-type=END,FAIL - -# load java module if it exists -module load java || true - -# activate pipeline's Conda environment if Conda env exists -source activate encode-chip-seq-pipeline - -# use input JSON for a small test sample -# you make an input JSON for your own sample -# start from any of two templates for single-ended and paired-ended samples -# (examples/template_se.json, examples/template_pe.json) -# do not use an input JSON file for a test sample (ENCSR936XTK) -# it's a sample with multimapping reads -INPUT=examples/local/ENCSR936XTK_subsampled.json - -# If this pipeline fails, then use this metadata JSON file to resume a failed pipeline from where it left -# See details in /utils/resumer/README.md -PIPELINE_METADATA=metadata.json - -# limit number of concurrent tasks -# we recommend to use a number of replicates here -# so that all replicates are processed in parellel at the same time. 
-# make sure that resource settings in your input JSON file -# are consistent with SBATCH resource settings (--mem, --cpus-per-task) -# in this script -NUM_CONCURRENT_TASK=2 - -# run pipeline -# you can monitor your jobs with "squeue -u $USER" -java -jar -Dconfig.file=backends/backend.conf \ --Dbackend.providers.Local.config.concurrent-job-limit=${NUM_CONCURRENT_TASK} \ -$HOME/cromwell-38.jar run chip.wdl -i ${INPUT} -m ${PIPELINE_METADATA} \ No newline at end of file diff --git a/dev/examples/local/ENCSR936XTK_subsampled_slurm_singularity.sh b/dev/examples/local/ENCSR936XTK_subsampled_slurm_singularity.sh deleted file mode 100644 index f0a163f1..00000000 --- a/dev/examples/local/ENCSR936XTK_subsampled_slurm_singularity.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/bin/bash - -# do not touch these settings -# number of tasks and nodes are fixed at 1 -#SBATCH -n 1 -#SBATCH --ntasks-per-node=1 - -# job name for pipeline -# this name will appear when you monitor jobs with "squeue -u $USER" -#SBATCH --job-name=ENCSR936XTK_subsampled - -# walltime for your job -# give long time enough to finish your pipeline -# <12 hr: small/test samples -# >24 hr: large samples -#SBATCH --time=12:00:00 - -# total amount of memory -# depends on the size of your FASTQs -# but should be <= NUM_CONCURRENT_TASK x 20GB for big samples -# or <= NUM_CONCURRENT_TASK x 10GB for small samples -# do not request too much memory -# cluster will not accept your job -#SBATCH --mem=20G - -# max number of cpus for each pipeline -# should be <= NUM_CONCURRENT_TASK x "chip.bwa_cpu" in input JSON file -# since bwa is a bottlenecking task in the pipeline -# "chip.bwa_cpu" is a number of cpus per replicate -#SBATCH --cpus-per-task=2 - -# email notification for job status -#SBATCH --mail-type=END,FAIL - -# load java module if it exists -module load java || true - -# use input JSON for a small test sample -# you make an input JSON for your own sample -# start from any of two templates for single-ended and paired-ended samples -# (examples/template_se.json, examples/template_pe.json) -# do not use an input JSON file for a test sample (ENCSR936XTK) -# it's a sample with multimapping reads -INPUT=examples/local/ENCSR936XTK_subsampled.json - -# If this pipeline fails, then use this metadata JSON file to resume a failed pipeline from where it left -# See details in /utils/resumer/README.md -PIPELINE_METADATA=metadata.json - -# limit number of concurrent tasks -# we recommend to use a number of replicates here -# so that all replicates are processed in parellel at the same time. 
-# make sure that resource settings in your input JSON file -# are consistent with SBATCH resource settings (--mem, --cpus-per-task) -# in this script -NUM_CONCURRENT_TASK=2 - -# run pipeline -# you can monitor your jobs with "squeue -u $USER" -java -jar -Dconfig.file=backends/backend.conf -Dbackend.default=singularity \ --Dbackend.providers.singularity.config.concurrent-job-limit=${NUM_CONCURRENT_TASK} \ -$HOME/cromwell-38.jar run chip.wdl -i ${INPUT} -o workflow_opts/singularity.json -m ${PIPELINE_METADATA} \ No newline at end of file diff --git a/dev/examples/nat_prot_paper/ENCSR000DYI.json b/dev/examples/nat_prot_paper/ENCSR000DYI.json deleted file mode 100644 index 849f35da..00000000 --- a/dev/examples/nat_prot_paper/ENCSR000DYI.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "hg38/hg38.tsv", - "chip.fastqs_rep1_R1" : ["rep1.fastq.gz"], - "chip.fastqs_rep2_R1" : ["rep2.fastq.gz"], - "chip.ctl_fastqs_rep1_R1" : ["ctl1.fastq.gz"], - "chip.ctl_fastqs_rep2_R1" : ["ctl2.fastq.gz"], - - "chip.paired_end" : false, - "chip.always_use_pooled_ctl" : true, - - "chip.title" : "ENCSR000DYI", - "chip.description" : "CEBPB ChIP-seq on human A549 produced by the Snyder lab" -} diff --git a/dev/examples/nat_prot_paper/ENCSR936XTK.json b/dev/examples/nat_prot_paper/ENCSR936XTK.json deleted file mode 100644 index ffe249ca..00000000 --- a/dev/examples/nat_prot_paper/ENCSR936XTK.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "hg38/hg38.tsv", - "chip.fastqs_rep1_R1" : ["rep1-R1.fastq.gz"], - "chip.fastqs_rep1_R2" : ["rep1-R2.fastq.gz"], - "chip.fastqs_rep2_R1" : ["rep2-R1.fastq.gz"], - "chip.fastqs_rep2_R2" : ["rep2-R2.fastq.gz"], - "chip.ctl_fastqs_rep1_R1" : ["ctl1-R1.fastq.gz"], - "chip.ctl_fastqs_rep1_R2" : ["ctl1-R2.fastq.gz"], - "chip.ctl_fastqs_rep2_R1" : ["ctl2-R1.fastq.gz"], - "chip.ctl_fastqs_rep2_R2" : ["ctl2-R2.fastq.gz"], - - "chip.paired_end" : true, - "chip.always_use_pooled_ctl" : true, - - "chip.title" : "ENCSR936XTK", - "chip.description" : "ZNF143 ChIP-seq on human GM12878" -} diff --git a/dev/examples/scg/ENCSR936XTK_subsampled_chr19_only_scg.json b/dev/examples/scg/ENCSR936XTK_subsampled_chr19_only_scg.json deleted file mode 100644 index dd88685b..00000000 --- a/dev/examples/scg/ENCSR936XTK_subsampled_chr19_only_scg.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "/reference/ENCODE/pipeline_genome_data/genome_tsv/v1/hg38_chr19_chrM_scg.tsv", - "chip.fastqs_rep1_R1" : ["/reference/ENCODE/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep1-R1.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep1_R2" : ["/reference/ENCODE/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep1-R2.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep2_R1" : ["/reference/ENCODE/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep2-R1.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep2_R2" : ["/reference/ENCODE/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep2-R2.subsampled.67.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : ["/reference/ENCODE/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl1-R1.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R2" : ["/reference/ENCODE/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl1-R2.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R1" : 
["/reference/ENCODE/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl2-R1.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R2" : ["/reference/ENCODE/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl2-R2.subsampled.80.fastq.gz" - ], - - "chip.paired_end" : true, - - "chip.always_use_pooled_ctl" : true, - "chip.title" : "ENCSR936XTK (subsampled 1/67, chr19_chrM only)", - "chip.description" : "ZNF143 ChIP-seq on human GM12878", - - "chip.align_cpu" : 1, - "chip.align_mem_mb" : 4000, - "chip.align_time_hr" : 4, - - "chip.filter_cpu" : 1, - "chip.filter_mem_mb" : 4000, - "chip.filter_time_hr" : 4, - - "chip.bam2ta_cpu" : 1, - "chip.bam2ta_mem_mb" : 4000, - "chip.bam2ta_time_hr" : 4, - - "chip.spr_mem_mb" : 4000, - - "chip.fingerprint_cpu" : 1, - "chip.fingerprint_mem_mb" : 4000, - "chip.fingerprint_time_hr" : 6, - - "chip.xcor_cpu" : 1, - "chip.xcor_mem_mb" : 4000, - "chip.xcor_time_hr" : 4, - - "chip.call_peak_mem_mb" : 4000, - "chip.call_peak_time_hr" : 4, - - "chip.call_peak_cpu" : 1 - "chip.spp_mem_mb" : 4000, - "chip.spp_time_hr" : 4 -} diff --git a/dev/examples/scg/ENCSR936XTK_subsampled_chr19_only_scg_conda.sh b/dev/examples/scg/ENCSR936XTK_subsampled_chr19_only_scg_conda.sh deleted file mode 100644 index 2eaae870..00000000 --- a/dev/examples/scg/ENCSR936XTK_subsampled_chr19_only_scg_conda.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash - -# do not touch these settings -# number of tasks and nodes are fixed at 1 -#SBATCH -n 1 -#SBATCH --ntasks-per-node=1 - -# job name for pipeline -# this name will appear when you monitor jobs with "squeue -u $USER" -#SBATCH --job-name=ENCSR936XTK_subsampled_chr19_only - -# walltime for your job -# give long time enough to finish your pipeline -# <12 hr: small/test samples -# >24 hr: large samples -#SBATCH --time=12:00:00 - -# total amount of memory -# depends on the size of your FASTQs -# but should be <= NUM_CONCURRENT_TASK x 20GB for big samples -# or <= NUM_CONCURRENT_TASK x 10GB for small samples -# do not request too much memory -# cluster will not accept your job -#SBATCH --mem=20G - -# max number of cpus for each pipeline -# should be <= NUM_CONCURRENT_TASK x "chip.bwa_cpu" in input JSON file -# since bwa is a bottlenecking task in the pipeline -# "chip.bwa_cpu" is a number of cpus per replicate -#SBATCH --cpus-per-task=2 - -# email notification for job status -#SBATCH --mail-type=END,FAIL - -# load java module if it exists -module load java -module load miniconda/3 - -# activate pipeline's Conda environment if Conda env exists -source activate encode-chip-seq-pipeline - -# use input JSON for a small test sample -# you make an input JSON for your own sample -# start from any of two templates for single-ended and paired-ended samples -# (examples/template_se.json, examples/template_pe.json) -# do not use an input JSON file for a test sample (ENCSR936XTK) -# it's a sample with multimapping reads -INPUT=examples/scg/ENCSR936XTK_subsampled_chr19_only_scg.json - -# If this pipeline fails, then use this metadata JSON file to resume a failed pipeline from where it left -# See details in /utils/resumer/README.md -PIPELINE_METADATA=metadata.json - -# limit number of concurrent tasks -# we recommend to use a number of replicates here -# so that all replicates are processed in parellel at the same time. 
-# make sure that resource settings in your input JSON file -# are consistent with SBATCH resource settings (--mem, --cpus-per-task) -# in this script -NUM_CONCURRENT_TASK=2 - -# run pipeline -# you can monitor your jobs with "squeue -u $USER" -java -jar -Dconfig.file=backends/backend.conf \ --Dbackend.providers.Local.config.concurrent-job-limit=${NUM_CONCURRENT_TASK} \ -$HOME/cromwell-38.jar run chip.wdl -i ${INPUT} -m ${PIPELINE_METADATA} \ No newline at end of file diff --git a/dev/examples/scg/ENCSR936XTK_subsampled_chr19_only_scg_singularity.sh b/dev/examples/scg/ENCSR936XTK_subsampled_chr19_only_scg_singularity.sh deleted file mode 100644 index 28e5acd3..00000000 --- a/dev/examples/scg/ENCSR936XTK_subsampled_chr19_only_scg_singularity.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/bin/bash - -# do not touch these settings -# number of tasks and nodes are fixed at 1 -#SBATCH -n 1 -#SBATCH --ntasks-per-node=1 - -# job name for pipeline -# this name will appear when you monitor jobs with "squeue -u $USER" -#SBATCH --job-name=ENCSR936XTK_subsampled_chr19_only - -# walltime for your job -# give long time enough to finish your pipeline -# <12 hr: small/test samples -# >24 hr: large samples -#SBATCH --time=12:00:00 - -# total amount of memory -# depends on the size of your FASTQs -# but should be <= NUM_CONCURRENT_TASK x 20GB for big samples -# or <= NUM_CONCURRENT_TASK x 10GB for small samples -# do not request too much memory -# cluster will not accept your job -#SBATCH --mem=20G - -# max number of cpus for each pipeline -# should be <= NUM_CONCURRENT_TASK x "chip.bwa_cpu" in input JSON file -# since bwa is a bottlenecking task in the pipeline -# "chip.bwa_cpu" is a number of cpus per replicate -#SBATCH --cpus-per-task=2 - -# email notification for job status -#SBATCH --mail-type=END,FAIL - -# load java module if it exists -module load java - -# use input JSON for a small test sample -# you make an input JSON for your own sample -# start from any of two templates for single-ended and paired-ended samples -# (examples/template_se.json, examples/template_pe.json) -# do not use an input JSON file for a test sample (ENCSR936XTK) -# it's a sample with multimapping reads -INPUT=examples/scg/ENCSR936XTK_subsampled_chr19_only_scg.json - -# If this pipeline fails, then use this metadata JSON file to resume a failed pipeline from where it left -# See details in /utils/resumer/README.md -PIPELINE_METADATA=metadata.json - -# limit number of concurrent tasks -# we recommend to use a number of replicates here -# so that all replicates are processed in parellel at the same time. 
-# make sure that resource settings in your input JSON file -# are consistent with SBATCH resource settings (--mem, --cpus-per-task) -# in this script -NUM_CONCURRENT_TASK=2 - -# run pipeline -# you can monitor your jobs with "squeue -u $USER" -java -jar -Dconfig.file=backends/backend.conf -Dbackend.default=singularity \ --Dbackend.providers.singularity.config.concurrent-job-limit=${NUM_CONCURRENT_TASK} \ -$HOME/cromwell-38.jar run chip.wdl -i ${INPUT} -o workflow_opts/scg.json -m ${PIPELINE_METADATA} \ No newline at end of file diff --git a/dev/examples/scg/ENCSR936XTK_subsampled_scg.json b/dev/examples/scg/ENCSR936XTK_subsampled_scg.json deleted file mode 100644 index 98d247c7..00000000 --- a/dev/examples/scg/ENCSR936XTK_subsampled_scg.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "/reference/ENCODE/pipeline_genome_data/genome_tsv/v1/hg38_scg.tsv", - "chip.fastqs_rep1_R1" : ["/reference/ENCODE/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep1-R1.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep1_R2" : ["/reference/ENCODE/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep1-R2.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep2_R1" : ["/reference/ENCODE/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep2-R1.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep2_R2" : ["/reference/ENCODE/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep2-R2.subsampled.67.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : ["/reference/ENCODE/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl1-R1.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R2" : ["/reference/ENCODE/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl1-R2.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R1" : ["/reference/ENCODE/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl2-R1.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R2" : ["/reference/ENCODE/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl2-R2.subsampled.80.fastq.gz" - ], - - "chip.paired_end" : true, - - "chip.always_use_pooled_ctl" : true, - "chip.title" : "ENCSR936XTK (subsampled 1/67)", - "chip.description" : "ZNF143 ChIP-seq on human GM12878" -} diff --git a/dev/examples/scg/ENCSR936XTK_subsampled_scg_conda.sh b/dev/examples/scg/ENCSR936XTK_subsampled_scg_conda.sh deleted file mode 100644 index 2e5c335f..00000000 --- a/dev/examples/scg/ENCSR936XTK_subsampled_scg_conda.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash - -# do not touch these settings -# number of tasks and nodes are fixed at 1 -#SBATCH -n 1 -#SBATCH --ntasks-per-node=1 - -# job name for pipeline -# this name will appear when you monitor jobs with "squeue -u $USER" -#SBATCH --job-name=ENCSR936XTK_subsampled - -# walltime for your job -# give long time enough to finish your pipeline -# <12 hr: small/test samples -# >24 hr: large samples -#SBATCH --time=12:00:00 - -# total amount of memory -# depends on the size of your FASTQs -# but should be <= NUM_CONCURRENT_TASK x 20GB for big samples -# or <= NUM_CONCURRENT_TASK x 10GB for small samples -# do not request too much memory -# cluster will not accept your job -#SBATCH --mem=20G - -# max number of cpus for each pipeline -# should be <= NUM_CONCURRENT_TASK x "chip.bwa_cpu" in input JSON file -# since bwa is a bottlenecking task in the pipeline -# "chip.bwa_cpu" is a number of cpus per replicate -#SBATCH --cpus-per-task=2 - -# email notification for job status -#SBATCH 
--mail-type=END,FAIL - -# load java module if it exists -module load java -module load miniconda/3 - -# activate pipeline's Conda environment if Conda env exists -source activate encode-chip-seq-pipeline - -# use input JSON for a small test sample -# you make an input JSON for your own sample -# start from any of two templates for single-ended and paired-ended samples -# (examples/template_se.json, examples/template_pe.json) -# do not use an input JSON file for a test sample (ENCSR936XTK) -# it's a sample with multimapping reads -INPUT=examples/scg/ENCSR936XTK_subsampled_scg.json - -# If this pipeline fails, then use this metadata JSON file to resume a failed pipeline from where it left -# See details in /utils/resumer/README.md -PIPELINE_METADATA=metadata.json - -# limit number of concurrent tasks -# we recommend to use a number of replicates here -# so that all replicates are processed in parellel at the same time. -# make sure that resource settings in your input JSON file -# are consistent with SBATCH resource settings (--mem, --cpus-per-task) -# in this script -NUM_CONCURRENT_TASK=2 - -# run pipeline -# you can monitor your jobs with "squeue -u $USER" -java -jar -Dconfig.file=backends/backend.conf \ --Dbackend.providers.Local.config.concurrent-job-limit=${NUM_CONCURRENT_TASK} \ -$HOME/cromwell-38.jar run chip.wdl -i ${INPUT} -m ${PIPELINE_METADATA} \ No newline at end of file diff --git a/dev/examples/scg/ENCSR936XTK_subsampled_scg_singularity.sh b/dev/examples/scg/ENCSR936XTK_subsampled_scg_singularity.sh deleted file mode 100644 index 11c05254..00000000 --- a/dev/examples/scg/ENCSR936XTK_subsampled_scg_singularity.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/bin/bash - -# do not touch these settings -# number of tasks and nodes are fixed at 1 -#SBATCH -n 1 -#SBATCH --ntasks-per-node=1 - -# job name for pipeline -# this name will appear when you monitor jobs with "squeue -u $USER" -#SBATCH --job-name=ENCSR936XTK_subsampled - -# walltime for your job -# give long time enough to finish your pipeline -# <12 hr: small/test samples -# >24 hr: large samples -#SBATCH --time=12:00:00 - -# total amount of memory -# depends on the size of your FASTQs -# but should be <= NUM_CONCURRENT_TASK x 20GB for big samples -# or <= NUM_CONCURRENT_TASK x 10GB for small samples -# do not request too much memory -# cluster will not accept your job -#SBATCH --mem=20G - -# max number of cpus for each pipeline -# should be <= NUM_CONCURRENT_TASK x "chip.bwa_cpu" in input JSON file -# since bwa is a bottlenecking task in the pipeline -# "chip.bwa_cpu" is a number of cpus per replicate -#SBATCH --cpus-per-task=2 - -# email notification for job status -#SBATCH --mail-type=END,FAIL - -# load java module if it exists -module load java - -# use input JSON for a small test sample -# you make an input JSON for your own sample -# start from any of two templates for single-ended and paired-ended samples -# (examples/template_se.json, examples/template_pe.json) -# do not use an input JSON file for a test sample (ENCSR936XTK) -# it's a sample with multimapping reads -INPUT=examples/scg/ENCSR936XTK_subsampled_scg.json - -# If this pipeline fails, then use this metadata JSON file to resume a failed pipeline from where it left -# See details in /utils/resumer/README.md -PIPELINE_METADATA=metadata.json - -# limit number of concurrent tasks -# we recommend to use a number of replicates here -# so that all replicates are processed in parellel at the same time. 
-# make sure that resource settings in your input JSON file -# are consistent with SBATCH resource settings (--mem, --cpus-per-task) -# in this script -NUM_CONCURRENT_TASK=2 - -# run pipeline -# you can monitor your jobs with "squeue -u $USER" -java -jar -Dconfig.file=backends/backend.conf -Dbackend.default=singularity \ --Dbackend.providers.singularity.config.concurrent-job-limit=${NUM_CONCURRENT_TASK} \ -$HOME/cromwell-38.jar run chip.wdl -i ${INPUT} -o workflow_opts/scg.json -m ${PIPELINE_METADATA} \ No newline at end of file diff --git a/dev/examples/sherlock/ENCSR936XTK_subsampled_chr19_only_sherlock.json b/dev/examples/sherlock/ENCSR936XTK_subsampled_chr19_only_sherlock.json deleted file mode 100644 index a4807adb..00000000 --- a/dev/examples/sherlock/ENCSR936XTK_subsampled_chr19_only_sherlock.json +++ /dev/null @@ -1,53 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "/home/groups/cherry/encode/pipeline_genome_data/genome_tsv/v1/hg38_chr19_chrM_sherlock.tsv", - "chip.fastqs_rep1_R1" : ["/home/groups/cherry/encode/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep1-R1.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep1_R2" : ["/home/groups/cherry/encode/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep1-R2.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep2_R1" : ["/home/groups/cherry/encode/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep2-R1.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep2_R2" : ["/home/groups/cherry/encode/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep2-R2.subsampled.67.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : ["/home/groups/cherry/encode/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl1-R1.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R2" : ["/home/groups/cherry/encode/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl1-R2.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R1" : ["/home/groups/cherry/encode/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl2-R1.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R2" : ["/home/groups/cherry/encode/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl2-R2.subsampled.80.fastq.gz" - ], - - "chip.paired_end" : true, - - "chip.always_use_pooled_ctl" : true, - "chip.title" : "ENCSR936XTK (subsampled 1/67, chr19_chrM only)", - "chip.description" : "ZNF143 ChIP-seq on human GM12878", - - "chip.align_cpu" : 1, - "chip.align_mem_mb" : 4000, - "chip.align_time_hr" : 4, - - "chip.filter_cpu" : 1, - "chip.filter_mem_mb" : 4000, - "chip.filter_time_hr" : 4, - - "chip.bam2ta_cpu" : 1, - "chip.bam2ta_mem_mb" : 4000, - "chip.bam2ta_time_hr" : 4, - - "chip.spr_mem_mb" : 4000, - - "chip.fingerprint_cpu" : 1, - "chip.fingerprint_mem_mb" : 4000, - "chip.fingerprint_time_hr" : 6, - - "chip.xcor_cpu" : 1, - "chip.xcor_mem_mb" : 4000, - "chip.xcor_time_hr" : 4, - - "chip.call_peak_mem_mb" : 4000, - "chip.call_peak_time_hr" : 4, - - "chip.call_peak_cpu" : 1 -} diff --git a/dev/examples/sherlock/ENCSR936XTK_subsampled_chr19_only_sherlock_conda.sh b/dev/examples/sherlock/ENCSR936XTK_subsampled_chr19_only_sherlock_conda.sh deleted file mode 100644 index a7bf3c7d..00000000 --- a/dev/examples/sherlock/ENCSR936XTK_subsampled_chr19_only_sherlock_conda.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash - -# do not touch these settings -# number of tasks and nodes are fixed at 1 -#SBATCH -n 1 -#SBATCH --ntasks-per-node=1 - -# job name for pipeline -# this name will appear when you 
monitor jobs with "squeue -u $USER" -#SBATCH --job-name=ENCSR936XTK_subsampled_chr19_only - -# walltime for your job -# give long time enough to finish your pipeline -# <12 hr: small/test samples -# >24 hr: large samples -#SBATCH --time=24:00:00 - -# total amount of memory -# depends on the size of your FASTQs -# but should be <= NUM_CONCURRENT_TASK x 20GB for big samples -# or <= NUM_CONCURRENT_TASK x 10GB for small samples -# do not request too much memory -# cluster will not accept your job -#SBATCH --mem=20G - -# max number of cpus for each pipeline -# should be <= NUM_CONCURRENT_TASK x "chip.bwa_cpu" in input JSON file -# since bwa is a bottlenecking task in the pipeline -# "chip.bwa_cpu" is a number of cpus per replicate -#SBATCH --cpus-per-task=2 - -# email notification for job status -#SBATCH --mail-type=END,FAIL - -# load java module if it exists -module load java - -# activate pipeline's Conda environment if Conda env exists -source activate encode-chip-seq-pipeline - -# use input JSON for a small test sample -# you make an input JSON for your own sample -# start from any of two templates for single-ended and paired-ended samples -# (examples/template_se.json, examples/template_pe.json) -# do not use an input JSON file for a test sample (ENCSR936XTK) -# it's a sample with multimapping reads -INPUT=examples/sherlock/ENCSR936XTK_subsampled_chr19_only_sherlock.json - -# If this pipeline fails, then use this metadata JSON file to resume a failed pipeline from where it left -# See details in /utils/resumer/README.md -PIPELINE_METADATA=metadata.json - -# limit number of concurrent tasks -# we recommend to use a number of replicates here -# so that all replicates are processed in parellel at the same time. -# make sure that resource settings in your input JSON file -# are consistent with SBATCH resource settings (--mem, --cpus-per-task) -# in this script -NUM_CONCURRENT_TASK=2 - -# run pipeline -# you can monitor your jobs with "squeue -u $USER" -java -jar -Dconfig.file=backends/backend.conf \ --Dbackend.providers.Local.config.concurrent-job-limit=${NUM_CONCURRENT_TASK} \ -$HOME/cromwell-38.jar run chip.wdl -i ${INPUT} -m ${PIPELINE_METADATA} \ No newline at end of file diff --git a/dev/examples/sherlock/ENCSR936XTK_subsampled_chr19_only_sherlock_singularity.sh b/dev/examples/sherlock/ENCSR936XTK_subsampled_chr19_only_sherlock_singularity.sh deleted file mode 100644 index 215a86d8..00000000 --- a/dev/examples/sherlock/ENCSR936XTK_subsampled_chr19_only_sherlock_singularity.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/bin/bash - -# do not touch these settings -# number of tasks and nodes are fixed at 1 -#SBATCH -n 1 -#SBATCH --ntasks-per-node=1 - -# job name for pipeline -# this name will appear when you monitor jobs with "squeue -u $USER" -#SBATCH --job-name=ENCSR936XTK_subsampled_chr19_only - -# walltime for your job -# give long time enough to finish your pipeline -# <12 hr: small/test samples -# >24 hr: large samples -#SBATCH --time=24:00:00 - -# total amount of memory -# depends on the size of your FASTQs -# but should be <= NUM_CONCURRENT_TASK x 20GB for big samples -# or <= NUM_CONCURRENT_TASK x 10GB for small samples -# do not request too much memory -# cluster will not accept your job -#SBATCH --mem=20G - -# max number of cpus for each pipeline -# should be <= NUM_CONCURRENT_TASK x "chip.bwa_cpu" in input JSON file -# since bwa is a bottlenecking task in the pipeline -# "chip.bwa_cpu" is a number of cpus per replicate -#SBATCH --cpus-per-task=2 - -# email notification for job status 
-#SBATCH --mail-type=END,FAIL - -# load java module if it exists -module load java - -# use input JSON for a small test sample -# you make an input JSON for your own sample -# start from any of two templates for single-ended and paired-ended samples -# (examples/template_se.json, examples/template_pe.json) -# do not use an input JSON file for a test sample (ENCSR936XTK) -# it's a sample with multimapping reads -INPUT=examples/sherlock/ENCSR936XTK_subsampled_chr19_only_sherlock.json - -# If this pipeline fails, then use this metadata JSON file to resume a failed pipeline from where it left -# See details in /utils/resumer/README.md -PIPELINE_METADATA=metadata.json - -# limit number of concurrent tasks -# we recommend to use a number of replicates here -# so that all replicates are processed in parellel at the same time. -# make sure that resource settings in your input JSON file -# are consistent with SBATCH resource settings (--mem, --cpus-per-task) -# in this script -NUM_CONCURRENT_TASK=2 - -# run pipeline -# you can monitor your jobs with "squeue -u $USER" -java -jar -Dconfig.file=backends/backend.conf -Dbackend.default=singularity \ --Dbackend.providers.singularity.config.concurrent-job-limit=${NUM_CONCURRENT_TASK} \ -$HOME/cromwell-38.jar run chip.wdl -i ${INPUT} -o workflow_opts/sherlock.json -m ${PIPELINE_METADATA} \ No newline at end of file diff --git a/dev/examples/sherlock/ENCSR936XTK_subsampled_sherlock.json b/dev/examples/sherlock/ENCSR936XTK_subsampled_sherlock.json deleted file mode 100644 index 31505811..00000000 --- a/dev/examples/sherlock/ENCSR936XTK_subsampled_sherlock.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "/home/groups/cherry/encode/pipeline_genome_data/genome_tsv/v1/hg38_sherlock.tsv", - "chip.fastqs_rep1_R1" : ["/home/groups/cherry/encode/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep1-R1.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep1_R2" : ["/home/groups/cherry/encode/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep1-R2.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep2_R1" : ["/home/groups/cherry/encode/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep2-R1.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep2_R2" : ["/home/groups/cherry/encode/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep2-R2.subsampled.67.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : ["/home/groups/cherry/encode/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl1-R1.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R2" : ["/home/groups/cherry/encode/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl1-R2.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R1" : ["/home/groups/cherry/encode/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl2-R1.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R2" : ["/home/groups/cherry/encode/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl2-R2.subsampled.80.fastq.gz" - ], - - "chip.paired_end" : true, - - "chip.always_use_pooled_ctl" : true, - "chip.title" : "ENCSR936XTK (subsampled 1/67)", - "chip.description" : "ZNF143 ChIP-seq on human GM12878" -} diff --git a/dev/examples/sherlock/ENCSR936XTK_subsampled_sherlock_conda.sh b/dev/examples/sherlock/ENCSR936XTK_subsampled_sherlock_conda.sh deleted file mode 100644 index a899d5cf..00000000 --- a/dev/examples/sherlock/ENCSR936XTK_subsampled_sherlock_conda.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash - -# do 
not touch these settings -# number of tasks and nodes are fixed at 1 -#SBATCH -n 1 -#SBATCH --ntasks-per-node=1 - -# job name for pipeline -# this name will appear when you monitor jobs with "squeue -u $USER" -#SBATCH --job-name=ENCSR936XTK_subsampled - -# walltime for your job -# give long time enough to finish your pipeline -# <12 hr: small/test samples -# >24 hr: large samples -#SBATCH --time=24:00:00 - -# total amount of memory -# depends on the size of your FASTQs -# but should be <= NUM_CONCURRENT_TASK x 20GB for big samples -# or <= NUM_CONCURRENT_TASK x 10GB for small samples -# do not request too much memory -# cluster will not accept your job -#SBATCH --mem=20G - -# max number of cpus for each pipeline -# should be <= NUM_CONCURRENT_TASK x "chip.bwa_cpu" in input JSON file -# since bwa is a bottlenecking task in the pipeline -# "chip.bwa_cpu" is a number of cpus per replicate -#SBATCH --cpus-per-task=2 - -# email notification for job status -#SBATCH --mail-type=END,FAIL - -# load java module if it exists -module load java - -# activate pipeline's Conda environment if Conda env exists -source activate encode-chip-seq-pipeline - -# use input JSON for a small test sample -# you make an input JSON for your own sample -# start from any of two templates for single-ended and paired-ended samples -# (examples/template_se.json, examples/template_pe.json) -# do not use an input JSON file for a test sample (ENCSR936XTK) -# it's a sample with multimapping reads -INPUT=examples/sherlock/ENCSR936XTK_subsampled_sherlock.json - -# If this pipeline fails, then use this metadata JSON file to resume a failed pipeline from where it left -# See details in /utils/resumer/README.md -PIPELINE_METADATA=metadata.json - -# limit number of concurrent tasks -# we recommend to use a number of replicates here -# so that all replicates are processed in parellel at the same time. 
-# make sure that resource settings in your input JSON file -# are consistent with SBATCH resource settings (--mem, --cpus-per-task) -# in this script -NUM_CONCURRENT_TASK=2 - -# run pipeline -# you can monitor your jobs with "squeue -u $USER" -java -jar -Dconfig.file=backends/backend.conf \ --Dbackend.providers.Local.config.concurrent-job-limit=${NUM_CONCURRENT_TASK} \ -$HOME/cromwell-38.jar run chip.wdl -i ${INPUT} -m ${PIPELINE_METADATA} \ No newline at end of file diff --git a/dev/examples/sherlock/ENCSR936XTK_subsampled_sherlock_singularity.sh b/dev/examples/sherlock/ENCSR936XTK_subsampled_sherlock_singularity.sh deleted file mode 100644 index ebebcf82..00000000 --- a/dev/examples/sherlock/ENCSR936XTK_subsampled_sherlock_singularity.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/bin/bash - -# do not touch these settings -# number of tasks and nodes are fixed at 1 -#SBATCH -n 1 -#SBATCH --ntasks-per-node=1 - -# job name for pipeline -# this name will appear when you monitor jobs with "squeue -u $USER" -#SBATCH --job-name=ENCSR936XTK_subsampled - -# walltime for your job -# give long time enough to finish your pipeline -# <12 hr: small/test samples -# >24 hr: large samples -#SBATCH --time=24:00:00 - -# total amount of memory -# depends on the size of your FASTQs -# but should be <= NUM_CONCURRENT_TASK x 20GB for big samples -# or <= NUM_CONCURRENT_TASK x 10GB for small samples -# do not request too much memory -# cluster will not accept your job -#SBATCH --mem=20G - -# max number of cpus for each pipeline -# should be <= NUM_CONCURRENT_TASK x "chip.bwa_cpu" in input JSON file -# since bwa is a bottlenecking task in the pipeline -# "chip.bwa_cpu" is a number of cpus per replicate -#SBATCH --cpus-per-task=2 - -# email notification for job status -#SBATCH --mail-type=END,FAIL - -# load java module if it exists -module load java - -# use input JSON for a small test sample -# you make an input JSON for your own sample -# start from any of two templates for single-ended and paired-ended samples -# (examples/template_se.json, examples/template_pe.json) -# do not use an input JSON file for a test sample (ENCSR936XTK) -# it's a sample with multimapping reads -INPUT=examples/sherlock/ENCSR936XTK_subsampled_sherlock.json - -# If this pipeline fails, then use this metadata JSON file to resume a failed pipeline from where it left -# See details in /utils/resumer/README.md -PIPELINE_METADATA=metadata.json - -# limit number of concurrent tasks -# we recommend to use a number of replicates here -# so that all replicates are processed in parellel at the same time. 
-# make sure that resource settings in your input JSON file -# are consistent with SBATCH resource settings (--mem, --cpus-per-task) -# in this script -NUM_CONCURRENT_TASK=2 - -# run pipeline -# you can monitor your jobs with "squeue -u $USER" -java -jar -Dconfig.file=backends/backend.conf -Dbackend.default=singularity \ --Dbackend.providers.singularity.config.concurrent-job-limit=${NUM_CONCURRENT_TASK} \ -$HOME/cromwell-38.jar run chip.wdl -i ${INPUT} -o workflow_opts/sherlock.json -m ${PIPELINE_METADATA} \ No newline at end of file diff --git a/dev/examples/test_bowtie2/ENCSR000DYI_bowtie2_google.json b/dev/examples/test_bowtie2/ENCSR000DYI_bowtie2_google.json deleted file mode 100644 index fe1638cd..00000000 --- a/dev/examples/test_bowtie2/ENCSR000DYI_bowtie2_google.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "chip.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ref_output/v1.1.5/ENCSR000DYI/qc.json", - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "gs://encode-pipeline-genome-data/genome_tsv/v1/hg38_google.tsv", - "chip.fastqs_rep1_R1" : ["gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/rep1.fastq.gz" - ], - "chip.fastqs_rep2_R1" : ["gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/rep2.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : ["gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/ctl1.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R1" : ["gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/ctl2.fastq.gz" - ], - "chip.aligner" : "bowtie2", - - "chip.paired_end" : false, - "chip.always_use_pooled_ctl" : true, - "chip.spp_cpu" : 2, - - "chip.title" : "ENCSR000DYI", - "chip.description" : "CEBPB ChIP-seq on human A549 produced by the Snyder lab" -} diff --git a/dev/examples/test_bowtie2/ENCSR000DYI_google.json b/dev/examples/test_bowtie2/ENCSR000DYI_google.json deleted file mode 100644 index 0f82e05b..00000000 --- a/dev/examples/test_bowtie2/ENCSR000DYI_google.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "chip.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ref_output/v1.1.5/ENCSR000DYI/qc.json", - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "gs://encode-pipeline-genome-data/genome_tsv/v1/hg38_google.tsv", - "chip.fastqs_rep1_R1" : ["gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/rep1.fastq.gz" - ], - "chip.fastqs_rep2_R1" : ["gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/rep2.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : ["gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/ctl1.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R1" : ["gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq/ctl2.fastq.gz" - ], - - "chip.aligner" : "bwa", - "chip.paired_end" : false, - "chip.always_use_pooled_ctl" : true, - "chip.spp_cpu" : 2, - - "chip.title" : "ENCSR000DYI", - "chip.description" : "CEBPB ChIP-seq on human A549 produced by the Snyder lab" -} diff --git a/dev/examples/test_bowtie2/ENCSR936XTK_bowtie2_google.json b/dev/examples/test_bowtie2/ENCSR936XTK_bowtie2_google.json deleted file mode 100644 index 6b06b1d7..00000000 --- a/dev/examples/test_bowtie2/ENCSR936XTK_bowtie2_google.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "chip.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ref_output/v1.1.5/ENCSR936XTK/qc.json", - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : 
"gs://encode-pipeline-genome-data/genome_tsv/v1/hg38_google.tsv", - "chip.fastqs_rep1_R1" : ["gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep1-R1.fastq.gz" - ], - "chip.fastqs_rep1_R2" : ["gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep1-R2.fastq.gz" - ], - "chip.fastqs_rep2_R1" : ["gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep2-R1.fastq.gz" - ], - "chip.fastqs_rep2_R2" : ["gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep2-R2.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : ["gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl1-R1.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R2" : ["gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl1-R2.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R1" : ["gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl2-R1.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R2" : ["gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl2-R2.fastq.gz" - ], - "chip.aligner" : "bowtie2", - - "chip.paired_end" : true, - - "chip.always_use_pooled_ctl" : true, - "chip.title" : "ENCSR936XTK", - "chip.description" : "ZNF143 ChIP-seq on human GM12878" -} diff --git a/dev/examples/test_bowtie2/ENCSR936XTK_google.json b/dev/examples/test_bowtie2/ENCSR936XTK_google.json deleted file mode 100644 index f761622e..00000000 --- a/dev/examples/test_bowtie2/ENCSR936XTK_google.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "chip.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ref_output/v1.1.5/ENCSR936XTK/qc.json", - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "gs://encode-pipeline-genome-data/genome_tsv/v1/hg38_google.tsv", - "chip.fastqs_rep1_R1" : ["gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep1-R1.fastq.gz" - ], - "chip.fastqs_rep1_R2" : ["gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep1-R2.fastq.gz" - ], - "chip.fastqs_rep2_R1" : ["gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep2-R1.fastq.gz" - ], - "chip.fastqs_rep2_R2" : ["gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep2-R2.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : ["gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl1-R1.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R2" : ["gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl1-R2.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R1" : ["gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl2-R1.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R2" : ["gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl2-R2.fastq.gz" - ], - "chip.aligner" : "bwa", - - "chip.paired_end" : true, - - "chip.always_use_pooled_ctl" : true, - "chip.title" : "ENCSR936XTK", - "chip.description" : "ZNF143 ChIP-seq on human GM12878" -} diff --git a/dev/examples/test_bowtie2/ENCSR936XTK_subsampled_chr19_only_bowtie2_klab.json b/dev/examples/test_bowtie2/ENCSR936XTK_subsampled_chr19_only_bowtie2_klab.json deleted file mode 100644 index 71114bb0..00000000 --- a/dev/examples/test_bowtie2/ENCSR936XTK_subsampled_chr19_only_bowtie2_klab.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "/mnt/data/pipeline_genome_data/genome_tsv/v1/hg38_chr19_chrM/hg38_chr19_chrM_klab.tsv", - "chip.fastqs_rep1_R1" : 
["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep1-R1.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep1_R2" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep1-R2.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep2_R1" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep2-R1.subsampled.67.fastq.gz" - ], - "chip.fastqs_rep2_R2" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep2-R2.subsampled.67.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R1" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl1-R1.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep1_R2" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl1-R2.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R1" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl2-R1.subsampled.80.fastq.gz" - ], - "chip.ctl_fastqs_rep2_R2" : ["/mnt/data/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl2-R2.subsampled.80.fastq.gz" - ], - "chip.aligner" : "bowtie2", - - "chip.paired_end" : true, - - "chip.always_use_pooled_ctl" : true, - "chip.title" : "ENCSR936XTK (subsampled 1/67, chr19 and chrM Only)", - "chip.description" : "ZNF143 ChIP-seq on human GM12878" -} diff --git a/dev/workflow_opts/docker.json b/dev/workflow_opts/docker.json deleted file mode 100644 index f5241a3d..00000000 --- a/dev/workflow_opts/docker.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "default_runtime_attributes" : { - "docker" : "quay.io/encode-dcc/chip-seq-pipeline:v1.3.2", - "zones": "us-west1-a us-west1-b us-west1-c us-central1-c us-central1-b", - "failOnStderr" : false, - "continueOnReturnCode" : 0, - "preemptible": "0", - "bootDiskSizeGb": "10", - "noAddress": "false" - } -} diff --git a/dev/workflow_opts/scg.json b/dev/workflow_opts/scg.json deleted file mode 100644 index d4780793..00000000 --- a/dev/workflow_opts/scg.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "default_runtime_attributes" : { - "slurm_account" : "YOUR_SLURM_ACCOUNT", - "singularity_container" : "/reference/ENCODE/pipeline_singularity_images/chip-seq-pipeline-v1.3.2.simg", - "singularity_bindpath" : "/reference/ENCODE,/scratch,/srv/gsfs0" - } -} diff --git a/dev/workflow_opts/sge.json b/dev/workflow_opts/sge.json deleted file mode 100644 index c20ffb9d..00000000 --- a/dev/workflow_opts/sge.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "default_runtime_attributes" : { - "sge_pe" : "shm", - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.3.2.simg" - } -} diff --git a/dev/workflow_opts/sherlock.json b/dev/workflow_opts/sherlock.json deleted file mode 100644 index d57cb7ba..00000000 --- a/dev/workflow_opts/sherlock.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "default_runtime_attributes" : { - "slurm_partition" : "normal", - "singularity_container" : "/home/groups/cherry/encode/pipeline_singularity_images/chip-seq-pipeline-v1.3.2.simg", - "singularity_bindpath" : "/scratch,/lscratch,/oak/stanford,/home/groups/cherry/encode" - } -} diff --git a/dev/workflow_opts/singularity.json b/dev/workflow_opts/singularity.json deleted file mode 100644 index dbabf500..00000000 --- a/dev/workflow_opts/singularity.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "default_runtime_attributes" : { - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.3.2.simg" - } -} diff --git a/dev/workflow_opts/slurm.json b/dev/workflow_opts/slurm.json deleted file mode 100644 index 2396efe5..00000000 --- 
a/dev/workflow_opts/slurm.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "default_runtime_attributes" : { - "slurm_partition" : "YOUR_SLURM_PARTITION", - "slurm_account" : "YOUR_SLURM_ACCOUNT", - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.3.2.simg" - } -} diff --git a/docs/deprecated/OLD_METHOD.md b/docs/deprecated/OLD_METHOD.md deleted file mode 100644 index 91750599..00000000 --- a/docs/deprecated/OLD_METHOD.md +++ /dev/null @@ -1,24 +0,0 @@ -## Installation and tutorial - -This pipeline supports many cloud platforms and cluster engines. It also supports `docker`, `singularity` and `Conda` to resolve complicated software dependencies for the pipeline. A tutorial-based instruction for each platform will be helpful to understand how to run pipelines. There are special instructions for two major Stanford HPC servers (SCG4 and Sherlock). - -* Cloud platforms - * Web interface - * [DNAnexus Platform](../tutorial_dx_web.md) - * CLI (command line interface) - * [Google Cloud Platform](tutorial_google.md) - * [DNAnexus Platform](../tutorial_dx_cli.md) -* Stanford HPC servers (CLI) - * [Stanford SCG4](tutorial_scg.md) - * [Stanford Sherlock 2.0](tutorial_sherlock.md) -* Cluster engines (CLI) - * [SLURM](tutorial_slurm.md) - * [Sun GridEngine (SGE/PBS)](tutorial_sge.md) -* Local computers (CLI) - * [Local system with `singularity`](tutorial_local_singularity.md) - * [Local system with `docker`](tutorial_local_docker.md) - * [Local system with `Conda`](tutorial_local_conda.md) - -## Output directories - -[Output directory specification](output.md) diff --git a/docs/deprecated/output.md b/docs/deprecated/output.md deleted file mode 100644 index f6c076f4..00000000 --- a/docs/deprecated/output.md +++ /dev/null @@ -1,64 +0,0 @@ -# Output specification - -All output filenames keep prefixes from corresponding input filenames. For example. If you have started from `REP1.fastq.gz` and `REP2.fastq.gz` then corresponding alignment log for each replicate has a filename of `REP1.flagstat.qc` and `REP2.flagstat.qc`, respectively. - -Final HTML report (`qc.html`) and QC json (`qc.json`) files do not have any prefix. - -1. `DNAnexus`: Output will be stored on the specified output directory without any subdirectories. - -2. `Cromwell`: `Cromwell` will store outputs for each task under `cromwell-executions/[WORKFLOW_ID]/call-[TASK_NAME]/shard-[IDX]`. For all tasks except two peak calling tasks `idr` (irreproducible discovery rate) and `overlap` (naive overlapping peaks), `[IDX]` means a zero-based index for each replicate. For two tasks `idr` and `overlap`, `[IDX]` stands for a zero-based index for all possible pair of replicates. For example, you have 3 replicates and all possible combination of two replicates are `[(rep1,rep2), (rep1,rep3), (rep2,rep3)]`. Therefore, `call-idr/shard-2` should be an output directory for the pair of replicate 2 and 3. - -There can be duplicate output files on `execution/` and `execution/glob-*/` directories. A file on the latter (`execution/glob-*/`) is a symbolic link of an actual output file on the former. For Google Cloud Storage bucket (`gs://`) there is no `execution/` directory and files on `glob-*/` are actual outputs. 
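For example, a minimal sketch (assuming the default local Cromwell layout described above; the workflow ID placeholder must be replaced with the ID of your own run) for locating the final QC report and JSON once a run finishes:

```bash
# Replace [RANDOM_HASH_STRING] with the workflow ID that Cromwell printed for your run.
WORKFLOW_ID=[RANDOM_HASH_STRING]

# The final HTML report (qc.html) and QC JSON (qc.json) are written by the qc_report task.
find "cromwell-executions/chip/${WORKFLOW_ID}" -type f \
    \( -name 'qc.html' -o -name 'qc.json' \) \
    -path '*call-qc_report*'
```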
- -|task|filename| description| -|-|-|-| -|merge_fastq| merge_fastqs_R?_*.fastq.gz| Merged FASTQ| -|trim_fastq| *.trim_*bp.fastq.gz| Trimmed FASTQ (R1 only) | -|bwa| * .bam| Raw BAM| -|bwa| * .bai| BAI for Raw BAM| -|bwa| * .flagstat.qc| Samtools flagstat log for raw BAM| -|filter| * .nodup.bam| Filtered/deduped BAM| -|filter| * .nodup.flagstat.qc| Samtools flagstat log for filtered/deduped BAM| -|filter| * .dup.qc| Picard/sambamba markdup log| -|filter| * .pbc.qc| PBC QC log| -|bam2ta| * .tagAlign.gz| TAG-ALIGN generated from filtered BAM| -|bam2ta| * .N.tagAlign.gz| Subsampled (N reads) TAG-ALIGN generated from filtered BAM| -|bam2ta| * .tn5.tagAlign.gz| TN5-shifted TAG-ALIGN| -|spr| * .pr1.tagAlign.gz| 1st pseudo-replicated TAG-ALIGN| -|spr| * .pr2.tagAlign.gz| 2nd pseudo-replicated TAG-ALIGN| -|pool_ta| * .tagAlign.gz| Pooled TAG-ALIGN from all replciates| -|fingerprint| * .jsd.qc| DeepTools fingerprint log| -|fingerprint| * .png| DeepTools fingerprint plot| -|choose_ctl| ctl_for_rep*.tagAlign.gz | Chosen control for each IP replicate| -|xcor| * .cc.plot.pdf| Cross-correlation plot PDF| -|xcor| * .cc.plot.png| Cross-correlation plot PNG| -|xcor| * .cc.qc| Cross-correlation analysis score log| -|xcor| * .cc.fraglen.txt| Estimated fragment length| -|macs2| * .narrowPeak.gz| NARROWPEAK| -|macs2| * .bfilt.narrowPeak.gz| Blacklist-filtered NARROWPEAK| -|macs2| * .bfilt.narrowPeak.bb| Blacklist-filtered NARROWPEAK in BigBed format| -|macs2| * .pval.signal.bigwig| p-val signal BIGWIG| -|macs2| * .fc.signal.bigwig| fold enrichment signal BIGWIG| -|macs2| * .frip.qc| Fraction of read (TAG-ALIGN) in peaks (NARROWPEAK)| -|spp| * .regionPeak.gz| SPP NARROWPEAK(REGIONPEAK)| -|spp| * .bfilt.regionPeak.gz| Blacklist-filtered REGIONPEAK| -|spp| * .bfilt.regionPeak.bb| Blacklist-filtered REGIONPEAK in BigBed format| -|spp| * .frip.qc| Fraction of read (TAG-ALIGN) in peaks (REGIONPEAK)| -|idr| * .*Peak.gz| IDR NARROWPEAK| -|idr| * .bfilt.*Peak.gz| Blacklist-filtered IDR NARROWPEAK| -|idr| * .bfilt.*Peak.bb| Blacklist-filtered IDR NARROWPEAK in BigBed format| -|idr| * .txt.png| IDR plot PNG| -|idr| * .txt.gz| Unthresholded IDR output| -|idr| * .log| IDR STDOUT log| -|idr| * .frip.qc| Fraction of read (TAG-ALIGN) in peaks (IDR NARROWPEAK)| -|overlap| * .*Peak.gz| Overlapping NARROWPEAK| -|overlap| * .bfilt.*Peak.gz| Blacklist-filtered overlapping NARROWPEAK| -|overlap| * .bfilt.*Peak.bb| Blacklist-filtered overlapping NARROWPEAK in BigBed format| -|overlap| * .frip.qc| Fraction of read (TAG-ALIGN) in peaks (overlapping NARROWPEAK)| -|reproducibility| * .reproducibility.qc| Reproducibililty QC log| -|reproducibility| optimal_peak.gz| Optimal final peak file| -|reproducibility| optimal_peak.bb| Optimal final peak file in BigBed format| -|reproducibility| conservative_peak.gz| Conservative final peak file| -|reproducibility| conservative_peak.bb| Conservative final peak file in BigBed format| -|qc_report| qc.html| Final HTML QC report| -|qc_report| qc.json| Final QC JSON| diff --git a/docs/deprecated/tutorial_google.md b/docs/deprecated/tutorial_google.md deleted file mode 100644 index 9ece9d5c..00000000 --- a/docs/deprecated/tutorial_google.md +++ /dev/null @@ -1,88 +0,0 @@ -# Tutorial for Google Cloud Platform - -All test samples and genome data are shared on our public Google Cloud buckets. You don't have to download any data for testing our pipeline on Google Cloud. - -1. Sign up for a Google account. -2. 
Go to [Google Project](https://console.developers.google.com/project) page and click "SIGN UP FOR FREE TRIAL" on the top left and agree to terms. -3. Set up a payment method and click "START MY FREE TRIAL". -4. Create a [Google Project](https://console.developers.google.com/project) `[YOUR_PROJECT_NAME]` and choose it on the top of the page. -5. Create a [Google Cloud Storage bucket](https://console.cloud.google.com/storage/browser) `gs://[YOUR_BUCKET_NAME]` by clicking on a button "CREATE BUCKET" and create it to store pipeline outputs. -6. Find and enable following APIs in your [API Manager](https://console.developers.google.com/apis/library). Click a back button on your web brower after enabling each. - * Compute Engine API - * Google Cloud Storage (DO NOT click on "Create credentials") - * Google Cloud Storage JSON API - * Genomics API - -7. Install [Google Cloud Platform SDK](https://cloud.google.com/sdk/downloads) and authenticate through it. You will be asked to enter verification keys. Get keys from the URLs they provide. - ```bash - $ gcloud auth login --no-launch-browser - $ gcloud auth application-default login --no-launch-browser - ``` - -8. If you see permission errors at runtime, then unset environment variable `GOOGLE_APPLICATION_CREDENTIALS` or add it to your BASH startup scripts (`$HOME/.bashrc` or `$HOME/.bash_profile`). - ```bash - unset GOOGLE_APPLICATION_CREDENTIALS - ``` - -7. Set your default Google Cloud Project. Pipeline will provision instances on this project. - ```bash - $ gcloud config set project [YOUR_PROJECT_NAME] - ``` - -8. Download [cromwell](https://github.com/broadinstitute/cromwell). - ```bash - $ cd - $ wget https://github.com/broadinstitute/cromwell/releases/download/38/cromwell-38.jar - $ chmod +rx cromwell-38.jar - ``` - -9. Git clone this pipeline and move into it. - ```bash - $ cd - $ git clone https://github.com/ENCODE-DCC/chip-seq-pipeline2 - $ cd chip-seq-pipeline2 - ``` - -10. Run a pipeline for the test sample. - ```bash - $ PROJECT=[YOUR_PROJECT_NAME] - $ BUCKET=gs://[YOUR_BUCKET_NAME]/ENCSR936XTK_subsampled - $ INPUT=dev/examples/google/ENCSR936XTK_subsampled_chr19_only.json - - $ java -jar -Dconfig.file=dev/backends/backend.conf -Dbackend.default=google -Dbackend.providers.google.config.project=${PROJECT} -Dbackend.providers.google.config.root=${BUCKET} cromwell-38.jar run chip.wdl -i ${INPUT} -o dev/workflow_opts/docker.json - ``` - -11. It will take about 6 hours. You will be able to find all outputs on your Google Cloud bucket. Final QC report/JSON will be written on `gs://[YOUR_BUCKET_NAME]/ENCSR936XTK_subsampled/chip/[SOME_HASH_STRING]/call-qc_report/execution/glob*/qc.html` or `qc.json`. See [output directory structure](output.md) for details. - -12. See full specification for [input JSON file](input.md). - -13. You can resume a failed pipeline from where it left off by using `PIPELINE_METADATA`(`metadata.json`) file. This file is created for each pipeline run. See [here](../utils/resumer/README.md) for details. Once you get a new input JSON file from the resumer, use it `INPUT=resume.[FAILED_WORKFLOW_ID].json` instead of `INPUT=dev/examples/google/ENCSR936XTK_subsampled_chr19_only.json`. - -## Extras for advanced users - -1. Set quota for [Google Compute Engine API](https://console.cloud.google.com/iam-admin/quotas) per region. Increase quota for SSD/HDD storage, number of vCPUs to process more samples faster simulateneouly. 
- * CPUs - * Persistent Disk Standard (GB) - * Persistent Disk SSD (GB) - * In-use IP addresses - * Networks - -2. Set `default_runtime_attributes.zones` in `dev/workflow_opts/docker.json` as your preferred Google Cloud zone. - ```javascript - { - "default_runtime_attributes" : { - ... - "zones": "us-west1-a us-west1-b us-west1-c", - ... - } - ``` - -3. Set `default_runtime_attributes.preemptible` as `"0"` to disable preemptible instances. This value means a number of retrial for failures in a preemtible instance. Pipeline defaults not to use [preemptible instances](https://cloud.google.com/compute/docs/instances/preemptible). If all retrial fails then the instance will be upgraded to a regular one. **Disabling preemtible instances will cost you significantly more** but you can get your samples processed much faster and stabler. Preemptible instance is disabled by default. Some hard tasks like `bowtie2`, `bwa` and `spp` will not be executed on preemtible instances since they can take longer than the limit (24 hours) of preemptible instances. - ```javascript - { - "default_runtime_attributes" : { - ... - "preemptible": "0", - ... - } - ``` \ No newline at end of file diff --git a/docs/deprecated/tutorial_local_conda.md b/docs/deprecated/tutorial_local_conda.md deleted file mode 100644 index 1fb66fe5..00000000 --- a/docs/deprecated/tutorial_local_conda.md +++ /dev/null @@ -1,53 +0,0 @@ -# Tutorial for general UNIX computers without docker - -1. Download [cromwell](https://github.com/broadinstitute/cromwell). - ```bash - $ cd - $ wget https://github.com/broadinstitute/cromwell/releases/download/38/cromwell-38.jar - $ chmod +rx cromwell-38.jar - ``` - -2. Git clone this pipeline and move into it. - ```bash - $ cd - $ git clone https://github.com/ENCODE-DCC/chip-seq-pipeline2 - $ cd chip-seq-pipeline2 - ``` - -3. Download a SUBSAMPLED paired-end sample of [ENCSR936XTK](https://www.encodeproject.org/experiments/ENCSR936XTK/). - ```bash - $ wget https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/ENCSR936XTK_fastq_subsampled.tar - $ tar xvf ENCSR936XTK_fastq_subsampled.tar - ``` - -4. Download pre-built chr19/chrM-only genome database for hg38. - ```bash - $ wget https://storage.googleapis.com/encode-pipeline-genome-data/test_genome_database_hg38_chr19_chrM_chip.tar - $ tar xvf test_genome_database_hg38_chr19_chrM_chip.tar - ``` - -5. [Install Conda](https://conda.io/miniconda.html). Skip this if you already have equivalent Conda alternatives (Anaconda Python). Download and run the [installer](https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh). Agree to the license term by typing `yes`. It will ask you about the installation location. On Stanford clusters (Sherlock and SCG4), we recommend to install it outside of your `$HOME` directory since its filesystem is slow and has very limited space. At the end of the installation, choose `yes` to add Miniconda's binary to `$PATH` in your BASH startup script. - ```bash - $ wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh - $ bash Miniconda3-latest-Linux-x86_64.sh - ``` - -6. Install Conda dependencies. - ```bash - $ bash conda/uninstall_dependencies.sh # to remove any existing pipeline env - $ bash conda/install_dependencies.sh - ``` - -7. Run a pipeline for the test sample. - ```bash - $ source activate encode-chip-seq-pipeline # IMPORTANT! 
- $ INPUT=dev/examples/local/ENCSR936XTK_subsampled_chr19_only.json - $ PIPELINE_METADATA=metadata.json - $ java -jar -Dconfig.file=dev/backends/backend.conf cromwell-38.jar run chip.wdl -i ${INPUT} -m ${PIPELINE_METADATA} - ``` - -8. It will take about 6 hours. You will be able to find all outputs on `cromwell-executions/chip/[RANDOM_HASH_STRING]/`. See [output directory structure](output.md) for details. - -9. See full specification for [input JSON file](input.md). - -10. You can resume a failed pipeline from where it left off by using `PIPELINE_METADATA`(`metadata.json`) file. This file is created for each pipeline run. See [here](../utils/resumer/README.md) for details. Once you get a new input JSON file from the resumer, use it `INPUT=resume.[FAILED_WORKFLOW_ID].json` instead of `INPUT=dev/examples/local/ENCSR936XTK_subsampled_chr19_only.json`. diff --git a/docs/deprecated/tutorial_local_docker.md b/docs/deprecated/tutorial_local_docker.md deleted file mode 100644 index 46a50789..00000000 --- a/docs/deprecated/tutorial_local_docker.md +++ /dev/null @@ -1,38 +0,0 @@ -# Tutorial for general UNIX computers with docker - -1. Download [cromwell](https://github.com/broadinstitute/cromwell). - ```bash - $ wget https://github.com/broadinstitute/cromwell/releases/download/38/cromwell-38.jar - $ chmod +rx cromwell-38.jar - ``` - -2. Git clone this pipeline and move into it. - ```bash - $ git clone https://github.com/ENCODE-DCC/chip-seq-pipeline2 - $ cd chip-seq-pipeline2 - ``` - -3. Download a SUBSAMPLED paired-end sample of [ENCSR936XTK](https://www.encodeproject.org/experiments/ENCSR936XTK/). - ```bash - $ wget https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/ENCSR936XTK_fastq_subsampled.tar - $ tar xvf ENCSR936XTK_fastq_subsampled.tar - ``` - -4. Download pre-built chr19/chrM-only genome database for hg38. - ```bash - $ wget https://storage.googleapis.com/encode-pipeline-genome-data/test_genome_database_hg38_chr19_chrM_chip.tar - $ tar xvf test_genome_database_hg38_chr19_chrM_chip.tar - ``` - -5. Run a pipeline for the test sample. - ```bash - $ INPUT=dev/examples/local/ENCSR936XTK_subsampled_chr19_only.json - $ PIPELINE_METADATA=metadata.json - $ java -jar -Dconfig.file=dev/backends/backend.conf cromwell-38.jar run chip.wdl -i ${INPUT} -o dev/workflow_opts/docker.json -m ${PIPELINE_METADATA} - ``` - -6. It will take about 6 hours. You will be able to find all outputs on `cromwell-executions/chip/[RANDOM_HASH_STRING]/`. See [output directory structure](output.md) for details. - -7. See full specification for [input JSON file](input.md). - -8. You can resume a failed pipeline from where it left off by using `PIPELINE_METADATA`(`metadata.json`) file. This file is created for each pipeline run. See [here](../utils/resumer/README.md) for details. Once you get a new input JSON file from the resumer, use it `INPUT=resume.[FAILED_WORKFLOW_ID].json` instead of `INPUT=dev/examples/local/ENCSR936XTK_subsampled_chr19_only.json`. diff --git a/docs/deprecated/tutorial_local_singularity.md b/docs/deprecated/tutorial_local_singularity.md deleted file mode 100644 index f6c4a58a..00000000 --- a/docs/deprecated/tutorial_local_singularity.md +++ /dev/null @@ -1,60 +0,0 @@ -# Tutorial for general UNIX computers with singularity - -1. Download [cromwell](https://github.com/broadinstitute/cromwell). - ```bash - $ cd - $ wget https://github.com/broadinstitute/cromwell/releases/download/38/cromwell-38.jar - $ chmod +rx cromwell-38.jar - ``` - -2. 
Git clone this pipeline and move into it. - ```bash - $ cd - $ git clone https://github.com/ENCODE-DCC/chip-seq-pipeline2 - $ cd chip-seq-pipeline2 - ``` - -3. Download a SUBSAMPLED paired-end sample of [ENCSR936XTK](https://www.encodeproject.org/experiments/ENCSR936XTK/). - ```bash - $ wget https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/ENCSR936XTK_fastq_subsampled.tar - $ tar xvf ENCSR936XTK_fastq_subsampled.tar - ``` - -4. Download pre-built chr19/chrM-only genome database for hg38. - ```bash - $ wget https://storage.googleapis.com/encode-pipeline-genome-data/test_genome_database_hg38_chr19_chrM_chip.tar - $ tar xvf test_genome_database_hg38_chr19_chrM_chip.tar - ``` - -5. CHECK YOUR SINGULARITY VERSION FIRST AND UPGRADE IT TO A VERSION `>=2.5.2` OR PIPELINE WILL NOT WORK CORRECTLY. - ```bash - $ singularity --version - ``` - -6. Pull a singularity container for the pipeline. This will pull pipeline's docker container first and build a singularity one on `~/.singularity`. - ```bash - $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.3.2.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.3.2 - ``` - -7. Run a pipeline for the test sample. - ```bash - $ INPUT=dev/examples/local/ENCSR936XTK_subsampled_chr19_only.json - $ PIPELINE_METADATA=metadata.json - $ java -jar -Xmx1G -Dconfig.file=dev/backends/backend.conf -Dbackend.default=singularity cromwell-38.jar run chip.wdl -i ${INPUT} -o dev/workflow_opts/singularity.json -m ${PIPELINE_METADATA} - ``` - -8. It will take about 6 hours. You will be able to find all outputs on `cromwell-executions/chip/[RANDOM_HASH_STRING]/`. See [output directory structure](output.md) for details. - -9. See full specification for [input JSON file](input.md). - -10. You can resume a failed pipeline from where it left off by using `PIPELINE_METADATA`(`metadata.json`) file. This file is created for each pipeline run. See [here](../utils/resumer/README.md) for details. Once you get a new input JSON file from the resumer, use it `INPUT=resume.[FAILED_WORKFLOW_ID].json` instead of `INPUT=dev/examples/local/ENCSR936XTK_subsampled_chr19_only.json`. - -11. IF YOU WANT TO RUN PIPELINES WITH YOUR OWN INPUT DATA/GENOME DATABASE, PLEASE ADD THEIR DIRECTORIES TO `dev/workflow_opts/singularity.json`. For example, you have input FASTQs on `/your/input/fastqs/` and genome database installed on `/your/genome/database/` then add `/your/` to `singularity_bindpath`. You can also define multiple directories there. It's comma-separated. - ```javascript - { - "default_runtime_attributes" : { - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.3.2.simg", - "singularity_bindpath" : "/your/,YOUR_OWN_DATA_DIR1,YOUR_OWN_DATA_DIR2,..." - } - } - ``` diff --git a/docs/deprecated/tutorial_scg.md b/docs/deprecated/tutorial_scg.md deleted file mode 100644 index 0eca235a..00000000 --- a/docs/deprecated/tutorial_scg.md +++ /dev/null @@ -1,70 +0,0 @@ -# Tutorial for Stanford SCG4 cluster - -This tutorial shows how to run pipelines on SCG4. You may need to have a paid account on it because SCG4 does not offer any free of charge SLURM partition. We recommend that free users use [Sherlock](tutorial_sherlock.md) instead. - -All test samples and genome data are shared on Stanford SCG4 cluster based on SLURM. You don't have to download any data for testing our pipeline on it. - -1. SSH to SCG's login node. 
- ```bash - $ ssh login.scg.stanford.edu - ``` - -2. Download [cromwell](https://github.com/broadinstitute/cromwell) on your `$HOME` directory. - ```bash - $ cd - $ wget https://github.com/broadinstitute/cromwell/releases/download/38/cromwell-38.jar - $ chmod +rx cromwell-38.jar - ``` - -3. Git clone this pipeline and move into its directory. - ```bash - $ cd - $ git clone https://github.com/ENCODE-DCC/chip-seq-pipeline2 - $ cd chip-seq-pipeline2 - ``` - -Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](https://singularity.lbl.gov/). - -## For Conda users - -4. Install Conda dependencies. - ```bash - $ module load miniconda/3 - $ bash conda/uninstall_dependencies.sh # to remove any existing pipeline env - $ bash conda/install_dependencies.sh - ``` - -5. Run a pipeline for the test sample. You must have a paid account on SCG4. - ```bash - $ sbatch --account [YOUR_PAID_ACCOUNT_ON_SCG4] dev/examples/scg/ENCSR936XTK_subsampled_chr19_only_scg_conda.sh - ``` - -## For singularity users - -4. Run a pipeline for the test sample. You must have a paid account on SCG4. - ```bash - $ sbatch --account [YOUR_PAID_ACCOUNT_ON_SCG4] dev/examples/scg/ENCSR936XTK_subsampled_chr19_only_scg_singularity.sh - ``` - -## For all users - -6. It will take about an hour. You will be able to find all outputs on `cromwell-executions/chip/[RANDOM_HASH_STRING]/`. See [output directory structure](output.md) for details. You can monitor your jobs with the following command: - ```bash - $ squeue -u $USER - ``` - -7. See full specification for [input JSON file](input.md). - -8. You can resume a failed pipeline from where it left off by using `PIPELINE_METADATA`(`metadata.json`) file. This file is created for each pipeline run. See [here](../utils/resumer/README.md) for details. Once you get a new input JSON file from the resumer, then edit your shell script (`dev/examples/scg/ENCSR936XTK_subsampled_chr19_only_scg_*.sh`) to use it `INPUT=resume.[FAILED_WORKFLOW_ID].json` instead of `INPUT=dev/examples/...`. - -## For singularity users - -9. IF YOU WANT TO RUN PIPELINES WITH YOUR OWN INPUT DATA/GENOME DATABASE, PLEASE ADD THEIR DIRECTORIES TO `dev/workflow_opts/scg.json`. For example, you have input FASTQs on `/your/input/fastqs/` and genome database installed on `/your/genome/database/` then add `/your/` to `singularity_bindpath`. You can also define multiple directories there. It's comma-separated. - ```javascript - { - "default_runtime_attributes" : { - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.3.2.simg", - "singularity_bindpath" : "/reference/ENCODE,/scratch,/srv/gsfs0,YOUR_OWN_DATA_DIR1,YOUR_OWN_DATA_DIR1,..." - } - } - ``` diff --git a/docs/deprecated/tutorial_scg_backend.md b/docs/deprecated/tutorial_scg_backend.md deleted file mode 100644 index 47ea5f06..00000000 --- a/docs/deprecated/tutorial_scg_backend.md +++ /dev/null @@ -1,121 +0,0 @@ -# Tutorial for Stanford SCG cluster - -All test samples and genome data are shared on Stanford SCG cluster based on SLURM. You don't have to download any data for testing our pipeline on it. - -1. SSH to SCG's login node. - ```bash - $ ssh login.scg.stanford.edu - ``` - -2. Download [cromwell](https://github.com/broadinstitute/cromwell). - ```bash - $ cd - $ wget https://github.com/broadinstitute/cromwell/releases/download/38/cromwell-38.jar - $ chmod +rx cromwell-38.jar - ``` - -3. Git clone this pipeline and move into it. 
- ```bash - $ cd - $ git clone https://github.com/ENCODE-DCC/chip-seq-pipeline2 - $ cd chip-seq-pipeline2 - ``` - - -4. Set your account in `dev/workflow_opts/scg.json`. [PIPELINE WILL NOT WORK WITHOUT A SLURM ACCOUNT](https://web.stanford.edu/group/scgpm/cgi-bin/informatics/wiki/index.php/Getting_A_Cluster_Account). Ignore other runtime attributes for singularity. - ```javascript - { - "default_runtime_attributes" : { - "slurm_account" : "YOUR_SLURM_ACCOUNT" - } - } - ``` - -Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](https://singularity.lbl.gov/). - -## For Conda users - -5. Install Conda dependencies. - ```bash - $ bash conda/uninstall_dependencies.sh # to remove any existing pipeline env - $ bash conda/install_dependencies.sh - ``` - -6. Run a pipeline for a SUBSAMPLED paired-end sample of [ENCSR936XTK](https://www.encodeproject.org/experiments/ENCSR936XTK/). - ```bash - $ module load java miniconda/3 - $ source activate encode-chip-seq-pipeline # IMPORTANT! - $ INPUT=dev/examples/scg/ENCSR936XTK_subsampled_scg.json - $ java -jar -Xmx1G -Dconfig.file=dev/backends/backend.conf -Dbackend.default=slurm cromwell-38.jar run chip.wdl -i ${INPUT} -o dev/workflow_opts/scg.json - ``` - -7. It will take about 2 hours. You will be able to find all outputs on `cromwell-executions/chip/[RANDOM_HASH_STRING]/`. See [output directory structure](output.md) for details. - -8. See full specification for [input JSON file](input.md). - -## For singularity users - -5. Pull a singularity container for the pipeline. This will pull pipeline's docker container first and build a singularity one on `~/.singularity`. - ```bash - $ sdev # SCG cluster does not allow building a container on login node - $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.3.2.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.3.2 - $ exit - ``` - -6. Run a pipeline for a SUBSAMPLED paired-end sample of [ENCSR936XTK](https://www.encodeproject.org/experiments/ENCSR936XTK/). - ```bash - $ source activate encode-chip-seq-pipeline # IMPORTANT! - $ INPUT=dev/examples/scg/ENCSR936XTK_subsampled_scg.json - $ java -jar -Xmx1G -Dconfig.file=dev/backends/backend.conf -Dbackend.default=slurm_singularity cromwell-38.jar run chip.wdl -i ${INPUT} -o dev/workflow_opts/scg.json - ``` - -7. It will take about 6 hours. You will be able to find all outputs on `cromwell-executions/chip/[RANDOM_HASH_STRING]/`. See [output directory structure](output.md) for details. - -8. See full specification for [input JSON file](input.md). - -9. IF YOU WANT TO RUN PIPELINES WITH YOUR OWN INPUT DATA/GENOME DATABASE, PLEASE ADD THEIR DIRECTORIES TO `dev/workflow_opts/scg.json`. For example, you have input FASTQs on `/your/input/fastqs/` and genome database installed on `/your/genome/database/` then add `/your/` to `singularity_bindpath`. You can also define multiple directories there. It's comma-separated. - ```javascript - { - "default_runtime_attributes" : { - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.3.2.simg", - "singularity_bindpath" : "/scratch/users,/srv/gsfs0,/your/,YOUR_OWN_DATA_DIR1,YOUR_OWN_DATA_DIR1,..." - } - } - ``` - -## Running multiple pipelines with cromwell server mode - -1. If you want to run multiple (>10) pipelines, then run a cromwell server on an interactive node. 
We recommend to use `screen` or `tmux` to keep your session alive and note that all running pipelines will be killed after walltime. Run a Cromwell server with the following commands. - - ```bash - $ srun -n 2 --mem 5G -t 3-0 --qos normal --account [YOUR_SCG_ACCOUNT] --pty /bin/bash -i -l # 2 CPU, 5 GB RAM and 3 day walltime - $ hostname -f # to get [CROMWELL_SVR_IP] - ``` - - For Conda users, - ```bash - $ source activate encode-chip-seq-pipeline - $ _JAVA_OPTIONS="-Xmx5G" java -jar -Dconfig.file=dev/backends/backend.conf -Dbackend.default=slurm cromwell-38.jar server - ``` - - For singularity users, - ```bash - $ _JAVA_OPTIONS="-Xmx5G" java -jar -Dconfig.file=dev/backends/backend.conf -Dbackend.default=slurm_singularity cromwell-38.jar server - ``` - - -2. You can modify `backend.providers.slurm.concurrent-job-limit` or `backend.providers.slurm_singularity.concurrent-job-limit` in `dev/backends/backend.conf` to increase maximum concurrent jobs. This limit is **NOT PER SAMPLE**. It's for all sub-tasks of all submitted samples. - -3. On a login node, submit jobs to the cromwell server. You will get `[WORKFLOW_ID]` as a return value. Keep these workflow IDs for monitoring pipelines and finding outputs for a specific sample later. - ```bash - $ INPUT=YOUR_INPUT.json - $ curl -X POST --header "Accept: application/json" -v "[CROMWELL_SVR_IP]:8000/api/workflows/v1" \ - -F workflowSource=@chip.wdl \ - -F workflowInputs=@${INPUT} \ - -F workflowOptions=@dev/workflow_opts/scg.json - ``` - - To monitor pipelines, see [cromwell server REST API description](http://cromwell.readthedocs.io/en/develop/api/RESTAPI/#cromwell-server-rest-api>) for more details. `squeue` will not give you enough information for monitoring jobs per sample. - ```bash - $ curl -X GET --header "Accept: application/json" -v "[CROMWELL_SVR_IP]:8000/api/workflows/v1/[WORKFLOW_ID]/status" - ``` diff --git a/docs/deprecated/tutorial_sge.md b/docs/deprecated/tutorial_sge.md deleted file mode 100644 index b331e026..00000000 --- a/docs/deprecated/tutorial_sge.md +++ /dev/null @@ -1,90 +0,0 @@ -# Tutorial for GridEngine (SGE/PBS) clusters - -1. Download [cromwell](https://github.com/broadinstitute/cromwell) on your `$HOME` directory. - ```bash - $ cd - $ wget https://github.com/broadinstitute/cromwell/releases/download/38/cromwell-38.jar - $ chmod +rx cromwell-38.jar - ``` - -2. Git clone this pipeline and move into its directory. - ```bash - $ cd - $ git clone https://github.com/ENCODE-DCC/chip-seq-pipeline2 - $ cd chip-seq-pipeline2 - ``` - -3. Download a SUBSAMPLED (1/400) paired-end sample of [ENCSR936XTK](https://www.encodeproject.org/experiments/ENCSR936XTK/). - ```bash - $ wget https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/ENCSR936XTK_fastq_subsampled.tar - $ tar xvf ENCSR936XTK_fastq_subsampled.tar - ``` - -4. Download pre-built chr19/chrM-only genome database for hg38. - ```bash - $ wget https://storage.googleapis.com/encode-pipeline-genome-data/test_genome_database_hg38_chr19_chrM_chip.tar - $ tar xvf test_genome_database_hg38_chip.tar - ``` - -5. Get information about a parallel environment (PE) on your SGE system. If your system doesn't have a PE then ask your admin to add one with name `shm` to SGE master. - ``` - $ qconf -spl - ``` - -Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](https://singularity.lbl.gov/). - -## For Conda users - -6. [Install Conda](https://conda.io/miniconda.html). 
Skip this if you already have equivalent Conda alternatives (Anaconda Python). Download and run the [installer](https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh). Agree to the license term by typing `yes`. It will ask you about the installation location. On Stanford clusters (Sherlock and SCG4), we recommend to install it outside of your `$HOME` directory since its filesystem is slow and has very limited space. At the end of the installation, choose `yes` to add Miniconda's binary to `$PATH` in your BASH startup script. - ```bash - $ wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh - $ bash Miniconda3-latest-Linux-x86_64.sh - ``` - -7. Install Conda dependencies. - ```bash - $ bash conda/uninstall_dependencies.sh # to remove any existing pipeline env - $ bash conda/install_dependencies.sh - ``` - -8. Run a pipeline for the test sample. If your parallel environment (PE) found from step 5) has a different name from `shm` then edit the following shell script to change the PE name. - ```bash - $ qsub dev/examples/local/ENCSR936XTK_subsampled_chr19_only_sge_conda.sh - ``` - -## For singularity users - -6. CHECK YOUR SINGULARITY VERSION FIRST AND UPGRADE IT TO A VERSION `>=2.5.2` OR PIPELINE WILL NOT WORK CORRECTLY. - ```bash - $ singularity --version - ``` - -7. Pull a singularity container for the pipeline. This will pull pipeline's docker container first and build a singularity one on `~/.singularity`. - ```bash - $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.3.2.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.3.2 - ``` - -8. Run a pipeline for the test sample. If your parallel environment (PE) found from step 5) has a different name from `shm` then edit the following shell script to change the PE name. - ```bash - $ qsub dev/examples/local/ENCSR936XTK_subsampled_sge_singularity.sh - ``` - -## For all users - -9. It will take about an hour. You will be able to find all outputs on `cromwell-executions/chip/[RANDOM_HASH_STRING]/`. See [output directory structure](output.md) for details. - -10. See full specification for [input JSON file](input.md). - -11. You can resume a failed pipeline from where it left off by using `PIPELINE_METADATA`(`metadata.json`) file. This file is created for each pipeline run. See [here](../utils/resumer/README.md) for details. Once you get a new input JSON file from the resumer, then edit your shell script (`dev/examples/local/ENCSR936XTK_subsampled_chr19_only_sge_*.sh`) to use it `INPUT=resume.[FAILED_WORKFLOW_ID].json` instead of `INPUT=dev/examples/...`. - -## For singularity users - -12. IF YOU WANT TO RUN PIPELINES WITH YOUR OWN INPUT DATA/GENOME DATABASE, PLEASE ADD THEIR DIRECTORIES TO `dev/workflow_opts/sge.json`. For example, you have input FASTQs on `/your/input/fastqs/` and genome database installed on `/your/genome/database/` then add `/your/` to `singularity_bindpath`. You can also define multiple directories there. It's comma-separated. - ```javascript - { - "default_runtime_attributes" : { - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.3.2.simg", - "singularity_bindpath" : "/your/,YOUR_OWN_DATA_DIR1,YOUR_OWN_DATA_DIR2,..." 
- } - } - ``` diff --git a/docs/deprecated/tutorial_sge_backend.md b/docs/deprecated/tutorial_sge_backend.md deleted file mode 100644 index 59ba64bf..00000000 --- a/docs/deprecated/tutorial_sge_backend.md +++ /dev/null @@ -1,126 +0,0 @@ -# Tutorial for Sun GridEngine (SGE) clusters - -1. Download [cromwell](https://github.com/broadinstitute/cromwell). - ```bash - $ cd - $ wget https://github.com/broadinstitute/cromwell/releases/download/38/cromwell-38.jar - $ chmod +rx cromwell-38.jar - ``` - -2. Git clone this pipeline and move into it. - ```bash - $ cd - $ git clone https://github.com/ENCODE-DCC/chip-seq-pipeline2 - $ cd chip-seq-pipeline2 - ``` - -3. Download a SUBSAMPLED (1/400) paired-end sample of [ENCSR936XTK](https://www.encodeproject.org/experiments/ENCSR936XTK/). - ```bash - $ wget https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/ENCSR936XTK_fastq_subsampled.tar - $ tar xvf ENCSR936XTK_fastq_subsampled.tar - ``` - -4. Download pre-built genome database for hg38. - ```bash - $ wget https://storage.googleapis.com/encode-pipeline-genome-data/test_genome_database_hg38_chip.tar - $ tar xvf test_genome_database_hg38_chip.tar - ``` - -5. Set your parallel environment (PE) and queue in `dev/workflow_opts/sge.json`. If your SGE cluster does not have any PE, ask your admin to add one for our pipeline. If you don't want to specify any queue then remove `, "sge_queue" : "YOUR_SGE_QUEUE"` from the file. See [here](how_to_config_sge.md) to find details about how to configure SGE for the pipeline. - ```javascript - { - "default_runtime_attributes" : { - "sge_pe" : "YOUR_SGE_PE", - "sge_queue" : "YOUR_SGE_QUEUE" - } - } - ``` - -Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](https://singularity.lbl.gov/). - -## For Conda users, - -6. [Install Conda](https://conda.io/miniconda.html) - -7. Install Conda dependencies. - ```bash - $ bash conda/uninstall_dependencies.sh # to remove any existing pipeline env - $ bash conda/install_dependencies.sh - ``` - -8. Run a pipeline for the test sample. - ```bash - $ source activate encode-chip-seq-pipeline # IMPORTANT! - $ INPUT=dev/examples/local/ENCSR936XTK_subsampled.json - $ java -jar -Xmx1G -Dconfig.file=dev/backends/backend.conf -Dbackend.default=sge cromwell-38.jar run chip.wdl -i ${INPUT} -o dev/workflow_opts/sge.json - ``` - -9. It will take about 6 hours. You will be able to find all outputs on `cromwell-executions/chip/[RANDOM_HASH_STRING]/`. See [output directory structure](output.md) for details. - -10. See full specification for [input JSON file](input.md). - -## For singularity users, - -6. CHECK YOUR SINGULARITY VERSION FIRST AND UPGRADE IT TO A VERSION `>=2.5.2` OR PIPELINE WILL NOT WORK CORRECTLY. - ```bash - $ singularity --version - ``` - -7. Pull a singularity container for the pipeline. This will pull pipeline's docker container first and build a singularity one on `~/.singularity`. - ```bash - $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.3.2.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.1 - ``` - -8. Run a pipeline for the test sample. - ```bash - $ INPUT=dev/examples/local/ENCSR936XTK_subsampled.json - $ java -jar -Xmx1G -Dconfig.file=dev/backends/backend.conf -Dbackend.default=sge_singularity cromwell-38.jar run chip.wdl -i ${INPUT} -o dev/workflow_opts/sge.json - ``` - -9. It will take about 6 hours. 
You will be able to find all outputs on `cromwell-executions/chip/[RANDOM_HASH_STRING]/`. See [output directory structure](output.md) for details. - -10. See full specification for [input JSON file](input.md). - -11. IF YOU WANT TO RUN PIPELINES WITH YOUR OWN INPUT DATA/GENOME DATABASE, PLEASE ADD THEIR DIRECTORIES TO `dev/workflow_opts/sge.json`. For example, you have input FASTQs on `/your/input/fastqs/` and genome database installed on `/your/genome/database/` then add `/your/` to `singularity_bindpath`. You can also define multiple directories there. It's comma-separated. - ```javascript - { - "default_runtime_attributes" : { - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.1.simg", - "singularity_bindpath" : "/your/,YOUR_OWN_DATA_DIR1,YOUR_OWN_DATA_DIR2,..." - } - } - ``` - -## Running multiple pipelines with cromwell server mode - -1. If you want to run multiple (>10) pipelines, then run a cromwell server on an interactive node. We recommend to use `screen` or `tmux` to keep your session alive and note that all running pipelines will be killed after walltime. Run a Cromwell server with the following commands. - ```bash - $ qlogin -h_vmem=5G -h_rt=72:00:00 # long walltime - $ hostname -f # to get [CROMWELL_SVR_IP] - ``` - - For Conda users, - ```bash - $ source activate encode-chip-seq-pipeline - $ _JAVA_OPTIONS="-Xmx5G" java -jar -Dconfig.file=dev/backends/backend.conf -Dbackend.default=sge cromwell-38.jar server - ``` - For singularity users, - ```bash - $ _JAVA_OPTIONS="-Xmx5G" java -jar -Dconfig.file=dev/backends/backend.conf -Dbackend.default=sge_singularity cromwell-38.jar server - ``` - -2. You can modify `backend.providers.sge.concurrent-job-limit` or `backend.providers.sge_singularity.concurrent-job-limit` in `dev/backends/backend.conf` to increase maximum concurrent jobs. This limit is **not per sample**. It's for all sub-tasks of all submitted samples. - -3. On a login node, submit jobs to the cromwell server. You will get `[WORKFLOW_ID]` as a return value. Keep these workflow IDs for monitoring pipelines and finding outputs for a specific sample later. - ```bash - $ INPUT=YOUR_INPUT.json - $ curl -X POST --header "Accept: application/json" -v "[CROMWELL_SVR_IP]:8000/api/workflows/v1" \ - -F workflowSource=@chip.wdl \ - -F workflowInputs=@${INPUT} \ - -F workflowOptions=@dev/workflow_opts/sge.json - ``` - - To monitor pipelines, see [cromwell server REST API description](http://cromwell.readthedocs.io/en/develop/api/RESTAPI/#cromwell-server-rest-api>) for more details. `squeue` will not give you enough information for monitoring jobs per sample. - ```bash - $ curl -X GET --header "Accept: application/json" -v "[CROMWELL_SVR_IP]:8000/api/workflows/v1/[WORKFLOW_ID]/status" - ``` diff --git a/docs/deprecated/tutorial_sherlock.md b/docs/deprecated/tutorial_sherlock.md deleted file mode 100644 index da9410d7..00000000 --- a/docs/deprecated/tutorial_sherlock.md +++ /dev/null @@ -1,75 +0,0 @@ -# Tutorial for Stanford Sherlock cluster - -This tutorial shows how to run pipelines on Sherlock. - -All test samples and genome data are shared on Stanford Sherlock cluster based on SLURM. You don't have to download any data for testing our pipeline on it. - -1. SSH to Sherlock's login node. - ```bash - $ ssh login.sherlock.stanford.edu - ``` - -2. Download [cromwell](https://github.com/broadinstitute/cromwell) on your `$HOME` directory. 
- ```bash - $ cd - $ wget https://github.com/broadinstitute/cromwell/releases/download/38/cromwell-38.jar - $ chmod +rx cromwell-38.jar - ``` - -3. Git clone this pipeline and move into its directory. - ```bash - $ cd - $ git clone https://github.com/ENCODE-DCC/chip-seq-pipeline2 - $ cd chip-seq-pipeline2 - ``` - -Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](https://singularity.lbl.gov/). - -## For Conda users - -4. [Install Conda](https://conda.io/miniconda.html). Skip this if you already have equivalent Conda alternatives (Anaconda Python). Download and run the [installer](https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh). Agree to the license term by typing `yes`. It will ask you about the installation location. On Stanford clusters (Sherlock and SCG4), we recommend to install it outside of your `$HOME` directory since its filesystem is slow and has very limited space. At the end of the installation, choose `yes` to add Miniconda's binary to `$PATH` in your BASH startup script. - ```bash - $ wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh - $ bash Miniconda3-latest-Linux-x86_64.sh - ``` - -5. Install Conda dependencies. - ```bash - $ bash conda/uninstall_dependencies.sh # to remove any existing pipeline env - $ bash conda/install_dependencies.sh - ``` - -6. Run a pipeline for the test sample. - ```bash - $ sbatch --partition normal dev/examples/sherlock/ENCSR936XTK_subsampled_chr19_only_sherlock_conda.sh - ``` - -## For singularity users - -6. Run a pipeline for the test sample. - ```bash - $ sbatch --partition normal dev/examples/sherlock/ENCSR936XTK_subsampled_chr19_only_sherlock_singularity.sh - ``` - -## For all users - -7. It will take about an hour. You will be able to find all outputs on `cromwell-executions/chip/[RANDOM_HASH_STRING]/`. See [output directory structure](output.md) for details. You can monitor your jobs with the following command: - ```bash - $ squeue -u $USER - ``` - -8. See full specification for [input JSON file](input.md). - -9. You can resume a failed pipeline from where it left off by using `PIPELINE_METADATA`(`metadata.json`) file. This file is created for each pipeline run. See [here](../utils/resumer/README.md) for details. Once you get a new input JSON file from the resumer, then edit your shell script (`dev/examples/sherlock/ENCSR936XTK_subsampled_chr19_only_sherlock_*.sh`) to use it `INPUT=resume.[FAILED_WORKFLOW_ID].json` instead of `INPUT=dev/examples/...`. - -## For singularity users - -10. IF YOU WANT TO RUN PIPELINES WITH YOUR OWN INPUT DATA/GENOME DATABASE, PLEASE ADD THEIR DIRECTORIES TO `dev/workflow_opts/sherlock.json`. For example, you have input FASTQs on `/your/input/fastqs/` and genome database installed on `/your/genome/database/` then add `/your/` to `singularity_bindpath`. You can also define multiple directories there. It's comma-separated. - ```javascript - { - "default_runtime_attributes" : { - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.3.2.simg", - "singularity_bindpath" : "/scratch,/lscratch,/oak/stanford,/home/groups/cherry/encode,/your/,YOUR_OWN_DATA_DIR1,YOUR_OWN_DATA_DIR1,..." 
- } - } - ``` diff --git a/docs/deprecated/tutorial_sherlock_backend.md b/docs/deprecated/tutorial_sherlock_backend.md deleted file mode 100644 index 8524dc62..00000000 --- a/docs/deprecated/tutorial_sherlock_backend.md +++ /dev/null @@ -1,124 +0,0 @@ -# Tutorial for Stanford Sherlock 2.0 cluster - -All test samples and genome data are shared on Stanford Sherlock cluster. You don't have to download any data for testing our pipeline on it. - -1. SSH to Sherlock's login node. - ```bash - $ ssh login.sherlock.stanford.edu - ``` - -2. Download [cromwell](https://github.com/broadinstitute/cromwell). - ```bash - $ cd - $ wget https://github.com/broadinstitute/cromwell/releases/download/38/cromwell-38.jar - $ chmod +rx cromwell-38.jar - ``` - -3. Git clone this pipeline and move into it. - ```bash - $ cd - $ git clone https://github.com/ENCODE-DCC/chip-seq-pipeline2 - $ cd chip-seq-pipeline2 - ``` - -4. Set your partition in `dev/workflow_opts/sherlock.json`. PIPELINE WILL NOT WORK WITHOUT A PAID SLURM PARTITION DUE TO LIMITED RESOURCE SETTINGS FOR FREE USERS. Ignore other runtime attributes for singularity. - ```javascript - { - "default_runtime_attributes" : { - "slurm_partition": "YOUR_SLURM_PARTITON" - } - } - ``` - -Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](https://singularity.lbl.gov/). - -## For Conda users - -5. [Install Conda](https://conda.io/miniconda.html) - -6. Install Conda dependencies. - ```bash - $ bash conda/uninstall_dependencies.sh # to remove any existing pipeline env - $ bash conda/install_dependencies.sh - ``` - -7. Run a pipeline for a SUBSAMPLED (1/400) paired-end sample of [ENCSR936XTK](https://www.encodeproject.org/experiments/ENCSR936XTK/). DO NOT SBATCH THIS COMMAND LINE! RUN IT DIRECTLY ON A LOGIN NODE! FREE USERS ON SHERLOCK SHOULD KEEP `-Dbackend.providers.slurm.config.concurrent-job-limit=1` IN THE COMMAND LINE. USERS WITH A PAID PARTITON CAN INCREASE IT TO >=30 AND ALSO INCREASE RESOURCES DEFINED IN THE INPUT JSON FILE. - ```bash - $ source activate encode-chip-seq-pipeline # IMPORTANT! - $ INPUT=dev/examples/sherlock/ENCSR936XTK_subsampled_sherlock.json - $ java -jar -Xmx1G -Dconfig.file=dev/backends/backend.conf -Dbackend.default=slurm -Dbackend.providers.slurm.config.concurrent-job-limit=1 cromwell-38.jar run chip.wdl -i ${INPUT} -o dev/workflow_opts/sherlock.json - ``` - -8. It will take about 6 hours. You will be able to find all outputs on `cromwell-executions/chip/[RANDOM_HASH_STRING]/`. See [output directory structure](output.md) for details. - -9. See full specification for [input JSON file](input.md). - -## For singularity users - -5. Add the following line to your BASH startup script (`~/.bashrc` or `~/.bash_profile`). - ```bash - module load system singularity - ``` - -6. Pull a singularity container for the pipeline. This will pull pipeline's docker container first and build a singularity one on `~/.singularity`. Stanford Sherlock does not allow building a container on login nodes. Wait until you get a command prompt after `sdev`. - ```bash - $ sdev # sherlock cluster does not allow building a container on login node - $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.3.2.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.3.2 - $ exit # exit from an interactive node - ``` - -7. 
Run a pipeline for a SUBSAMPLED (1/400) paired-end sample of [ENCSR936XTK](https://www.encodeproject.org/experiments/ENCSR936XTK/). DO NOT SBATCH THIS COMMAND LINE! RUN IT DIRECTLY ON A LOGIN NODE! FREE USERS ON SHERLOCK SHOULD KEEP `-Dbackend.providers.slurm_singularity.config.concurrent-job-limit=1` IN THE COMMAND LINE. USERS WITH A PAID PARTITON CAN INCREASE IT TO >=30 AND ALSO INCREASE RESOURCES DEFINED IN THE INPUT JSON FILE. - ```bash - $ source activate encode-chip-seq-pipeline # IMPORTANT! - $ INPUT=dev/examples/sherlock/ENCSR936XTK_subsampled_sherlock.json - $ java -jar -Xmx1G -Dconfig.file=dev/backends/backend.conf -Dbackend.default=slurm_singularity -Dbackend.providers.slurm_singularity.config.concurrent-job-limit=1 cromwell-38.jar run chip.wdl -i ${INPUT} -o dev/workflow_opts/sherlock.json - ``` - -8. It will take about 6 hours. You will be able to find all outputs on `cromwell-executions/chip/[RANDOM_HASH_STRING]/`. See [output directory structure](output.md) for details. - -9. See full specification for [input JSON file](input.md). - -10. IF YOU WANT TO RUN PIPELINES WITH YOUR OWN INPUT DATA/GENOME DATABASE, PLEASE ADD THEIR DIRECTORIES TO `dev/workflow_opts/sherlock.json`. For example, you have input FASTQs on `/your/input/fastqs/` and genome database installed on `/your/genome/database/` then add `/your/` to `singularity_bindpath`. You can also define multiple directories there. It's comma-separated. - ```javascript - { - "default_runtime_attributes" : { - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.3.2.simg", - "singularity_bindpath" : "/scratch,/oak/stanford,/your/,YOUR_OWN_DATA_DIR1,YOUR_OWN_DATA_DIR1,..." - } - } - ``` - -## Running multiple pipelines with cromwell server mode - -1. If you want to run multiple (>10) pipelines, then run a cromwell server on an interactive node. We recommend to use `screen` or `tmux` to keep your session alive and note that all running pipelines will be killed after walltime. Run a Cromwell server with the following commands. - - ```bash - $ srun -n 2 --mem 5G -t 3-0 --qos normal -p [YOUR_SLURM_PARTITION] --pty /bin/bash -i -l # 2 CPU, 5 GB RAM and 3 day walltime - $ hostname -f # to get [CROMWELL_SVR_IP] - ``` - - For Conda users, - ```bash - $ source activate encode-chip-seq-pipeline - $ _JAVA_OPTIONS="-Xmx5G" java -jar -Dconfig.file=dev/backends/backend.conf -Dbackend.default=slurm cromwell-38.jar server - ``` - For singularity users, - ```bash - $ _JAVA_OPTIONS="-Xmx5G" java -jar -Dconfig.file=dev/backends/backend.conf -Dbackend.default=slurm_singularity cromwell-38.jar server - ``` - -2. You can modify `backend.providers.slurm.concurrent-job-limit` or `backend.providers.slurm_singularity.concurrent-job-limit` in `dev/backends/backend.conf` to increase maximum concurrent jobs. This limit is **not per sample**. It's for all sub-tasks of all submitted samples. - -3. On a login node, submit jobs to the cromwell server. You will get `[WORKFLOW_ID]` as a return value. Keep these workflow IDs for monitoring pipelines and finding outputs for a specific sample later. - ```bash - $ INPUT=YOUR_INPUT.json - $ curl -X POST --header "Accept: application/json" -v "[CROMWELL_SVR_IP]:8000/api/workflows/v1" \ - -F workflowSource=@chip.wdl \ - -F workflowInputs=@${INPUT} \ - -F workflowOptions=@dev/workflow_opts/sherlock.json - ``` - - To monitor pipelines, see [cromwell server REST API description](http://cromwell.readthedocs.io/en/develop/api/RESTAPI/#cromwell-server-rest-api>) for more details. 
`squeue` will not give you enough information for monitoring jobs per sample. - ```bash - $ curl -X GET --header "Accept: application/json" -v "[CROMWELL_SVR_IP]:8000/api/workflows/v1/[WORKFLOW_ID]/status" - ``` diff --git a/docs/deprecated/tutorial_slurm.md b/docs/deprecated/tutorial_slurm.md deleted file mode 100644 index e0fc0b96..00000000 --- a/docs/deprecated/tutorial_slurm.md +++ /dev/null @@ -1,85 +0,0 @@ -# Tutorial for SLURM clusters - -1. Download [cromwell](https://github.com/broadinstitute/cromwell) on your `$HOME` directory. - ```bash - $ cd - $ wget https://github.com/broadinstitute/cromwell/releases/download/38/cromwell-38.jar - $ chmod +rx cromwell-38.jar - ``` - -2. Git clone this pipeline and move into its directory. - ```bash - $ cd - $ git clone https://github.com/ENCODE-DCC/chip-seq-pipeline2 - $ cd chip-seq-pipeline2 - ``` - -3. Download a SUBSAMPLED (1/400) paired-end sample of [ENCSR936XTK](https://www.encodeproject.org/experiments/ENCSR936XTK/). - ```bash - $ wget https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/ENCSR936XTK_fastq_subsampled.tar - $ tar xvf ENCSR936XTK_fastq_subsampled.tar - ``` - -4. Download pre-built chr19/chrM-only genome database for hg38. - ```bash - $ wget https://storage.googleapis.com/encode-pipeline-genome-data/test_genome_database_hg38_chr19_chrM_chip.tar - $ tar xvf test_genome_database_hg38_chr19_chrM_chip.tar - ``` - -Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](https://singularity.lbl.gov/). - -## For Conda users - -5. [Install Conda](https://conda.io/miniconda.html). Skip this if you already have equivalent Conda alternatives (Anaconda Python). Download and run the [installer](https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh). Agree to the license term by typing `yes`. It will ask you about the installation location. On Stanford clusters (Sherlock and SCG4), we recommend to install it outside of your `$HOME` directory since its filesystem is slow and has very limited space. At the end of the installation, choose `yes` to add Miniconda's binary to `$PATH` in your BASH startup script. - ```bash - $ wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh - $ bash Miniconda3-latest-Linux-x86_64.sh - ``` - -6. Install Conda dependencies. - ```bash - $ bash conda/uninstall_dependencies.sh # to remove any existing pipeline env - $ bash conda/install_dependencies.sh - ``` - -7. Run a pipeline for the test sample. Try without partition and account settings first. If your cluster requires to specify any of them then add one to the command line. - ```bash - $ sbatch --partition [YOUR_PARTITION] --account [YOUR_ACCOUNT] dev/examples/local/ENCSR936XTK_subsampled_chr19_only_slurm_conda.sh - ``` - -## For singularity users - -6. CHECK YOUR SINGULARITY VERSION FIRST AND UPGRADE IT TO A VERSION `>=2.5.2` OR PIPELINE WILL NOT WORK CORRECTLY. - ```bash - $ singularity --version - ``` - -7. Pull a singularity container for the pipeline. This will pull pipeline's docker container first and build a singularity one on `~/.singularity`. - ```bash - $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.3.2.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.3.2 - ``` - -8. Run a pipeline for the test sample. If your cluster requires to specify any of them then add one to the command line. 
- ```bash - $ sbatch --partition [YOUR_PARTITION] --account [YOUR_ACCOUNT] dev/examples/local/ENCSR936XTK_subsampled_chr19_only_slurm_singularity.sh - ``` - -## For all users - -8. It will take about an hour. You will be able to find all outputs on `cromwell-executions/chip/[RANDOM_HASH_STRING]/`. See [output directory structure](output.md) for details. - -9. See full specification for [input JSON file](input.md). - -10. You can resume a failed pipeline from where it left off by using `PIPELINE_METADATA`(`metadata.json`) file. This file is created for each pipeline run. See [here](../utils/resumer/README.md) for details. Once you get a new input JSON file from the resumer, then edit your shell script (`dev/examples/local/ENCSR936XTK_subsampled_chr19_only_slurm_*.sh`) to use it `INPUT=resume.[FAILED_WORKFLOW_ID].json` instead of `INPUT=dev/examples/...`. - -## For singularity users - -11. IF YOU WANT TO RUN PIPELINES WITH YOUR OWN INPUT DATA/GENOME DATABASE, PLEASE ADD THEIR DIRECTORIES TO `dev/workflow_opts/slurm.json`. For example, you have input FASTQs on `/your/input/fastqs/` and genome database installed on `/your/genome/database/` then add `/your/` to `singularity_bindpath`. You can also define multiple directories there. It's comma-separated. - ```javascript - { - "default_runtime_attributes" : { - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.3.2.simg", - "singularity_bindpath" : "/your/,YOUR_OWN_DATA_DIR1,YOUR_OWN_DATA_DIR2,..." - } - } - ``` diff --git a/docs/deprecated/tutorial_slurm_backend.md b/docs/deprecated/tutorial_slurm_backend.md deleted file mode 100644 index b61d0cb3..00000000 --- a/docs/deprecated/tutorial_slurm_backend.md +++ /dev/null @@ -1,128 +0,0 @@ -# Tutorial for SLURM clusters - -1. Download [cromwell](https://github.com/broadinstitute/cromwell) on your `$HOME` directory. - ```bash - $ cd - $ wget https://github.com/broadinstitute/cromwell/releases/download/38/cromwell-38.jar - $ chmod +rx cromwell-38.jar - ``` - -2. Git clone this pipeline and move into it. - ```bash - $ cd - $ git clone https://github.com/ENCODE-DCC/chip-seq-pipeline2 - $ cd chip-seq-pipeline2 - ``` - -3. Download a SUBSAMPLED (1/400) paired-end sample of [ENCSR936XTK](https://www.encodeproject.org/experiments/ENCSR936XTK/). - ```bash - $ wget https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/ENCSR936XTK_fastq_subsampled.tar - $ tar xvf ENCSR936XTK_fastq_subsampled.tar - ``` - -4. Download pre-built genome database for hg38. - ```bash - $ wget https://storage.googleapis.com/encode-pipeline-genome-data/test_genome_database_hg38_chip.tar - $ tar xvf test_genome_database_hg38_chip.tar - ``` - -5. Set your partition/account in `dev/workflow_opts/slurm.json`. If your SLURM cluster does not require either user's partition or account information, then remove them from this file. Otherwise, `YOUR_SLURM_PARTITON` or `YOUR_SLURM_ACCOUNT` will be used internally for `srun ... --partition YOUR_SLURM_PARTITON` or `srun ... --account YOUR_SLURM_PARTITON`, respectively. - ```javascript - { - "default_runtime_attributes" : { - "slurm_partition": "YOUR_SLURM_PARTITON", - "slurm_account": "YOUR_SLURM_ACCOUNT" - } - } - ``` - -Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](https://singularity.lbl.gov/). - -## For Conda users, - -6. [Install Conda](https://conda.io/miniconda.html) - -7. Install Conda dependencies. 
- ```bash - $ bash conda/uninstall_dependencies.sh # to remove any existing pipeline env - $ bash conda/install_dependencies.sh - ``` - -8. Run a pipeline for the test sample. - ```bash - $ source activate encode-chip-seq-pipeline # IMPORTANT! - $ INPUT=dev/examples/local/ENCSR936XTK_subsampled.json - $ java -jar -Xmx1G -Dconfig.file=dev/backends/backend.conf -Dbackend.default=slurm cromwell-38.jar run chip.wdl -i ${INPUT} -o dev/workflow_opts/slurm.json - ``` - -9. It will take about 6 hours. You will be able to find all outputs on `cromwell-executions/chip/[RANDOM_HASH_STRING]/`. See [output directory structure](output.md) for details. - -10. See full specification for [input JSON file](input.md). - -## For singularity users - -6. CHECK YOUR SINGULARITY VERSION FIRST AND UPGRADE IT TO A VERSION `>=2.5.2` OR PIPELINE WILL NOT WORK CORRECTLY. - ```bash - $ singularity --version - ``` - -7. Pull a singularity container for the pipeline. This will pull pipeline's docker container first and build a singularity one on `~/.singularity`. - ```bash - $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.3.2.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.3.2 - ``` - -8. Run a pipeline for the test sample. - ```bash - $ source activate encode-chip-seq-pipeline # IMPORTANT! - $ INPUT=dev/examples/local/ENCSR936XTK_subsampled.json - $ java -jar -Xmx1G -Dconfig.file=dev/backends/backend.conf -Dbackend.default=slurm_singularity cromwell-38.jar run chip.wdl -i ${INPUT} -o dev/workflow_opts/slurm.json - ``` - -9. It will take about 6 hours. You will be able to find all outputs on `cromwell-executions/chip/[RANDOM_HASH_STRING]/`. See [output directory structure](output.md) for details. - -10. See full specification for [input JSON file](input.md). - -11. IF YOU WANT TO RUN PIPELINES WITH YOUR OWN INPUT DATA/GENOME DATABASE, PLEASE ADD THEIR DIRECTORIES TO `dev/workflow_opts/slurm.json`. For example, you have input FASTQs on `/your/input/fastqs/` and genome database installed on `/your/genome/database/` then add `/your/` to `singularity_bindpath`. You can also define multiple directories there. It's comma-separated. - ```javascript - { - "default_runtime_attributes" : { - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.3.2.simg", - "singularity_bindpath" : "/your/,YOUR_OWN_DATA_DIR1,YOUR_OWN_DATA_DIR2,..." - } - } - ``` - -## Running multiple pipelines with cromwell server mode - -1. If you want to run multiple (>10) pipelines, then run a cromwell server on an interactive node. We recommend to use `screen` or `tmux` to keep your session alive and note that all running pipelines will be killed after walltime. Run a Cromwell server with the following commands. You can skip `-p [YOUR_SLURM_PARTITION]` or `--account [YOUR_SLURM_ACCOUNT]` according to your cluster's SLURM configuration. 
- - ```bash - $ srun -n 2 --mem 5G -t 3-0 --qos normal -p [YOUR_SLURM_PARTITION] --account [YOUR_SLURM_ACCOUNT] --pty /bin/bash -i -l # 2 CPU, 5 GB RAM and 3 day walltime - $ hostname -f # to get [CROMWELL_SVR_IP] - ``` - - For Conda users, - ```bash - $ source activate encode-chip-seq-pipeline - $ _JAVA_OPTIONS="-Xmx5G" java -jar -Dconfig.file=dev/backends/backend.conf -Dbackend.default=slurm cromwell-38.jar server - ``` - For singularity users, - ```bash - $ _JAVA_OPTIONS="-Xmx5G" java -jar -Dconfig.file=dev/backends/backend.conf -Dbackend.default=slurm_singularity cromwell-38.jar server - ``` - -2. You can modify `backend.providers.slurm.concurrent-job-limit` or `backend.providers.slurm_singularity.concurrent-job-limit` in `dev/backends/backend.conf` to increase maximum concurrent jobs. This limit is **not per sample**. It's for all sub-tasks of all submitted samples. - -3. On a login node, submit jobs to the cromwell server. You will get `[WORKFLOW_ID]` as a return value. Keep these workflow IDs for monitoring pipelines and finding outputs for a specific sample later. - ```bash - $ INPUT=YOUR_INPUT.json - $ curl -X POST --header "Accept: application/json" -v "[CROMWELL_SVR_IP]:8000/api/workflows/v1" \ - -F workflowSource=@chip.wdl \ - -F workflowInputs=@${INPUT} \ - -F workflowOptions=@dev/workflow_opts/slurm.json - ``` - - To monitor pipelines, see [cromwell server REST API description](http://cromwell.readthedocs.io/en/develop/api/RESTAPI/#cromwell-server-rest-api>) for more details. `squeue` will not give you enough information for monitoring jobs per sample. - ```bash - $ curl -X GET --header "Accept: application/json" -v "[CROMWELL_SVR_IP]:8000/api/workflows/v1/[WORKFLOW_ID]/status" - ``` From dce73c200deef4af42d459568f03db9a07a82bd2 Mon Sep 17 00:00:00 2001 From: Jin Lee Date: Mon, 28 Oct 2019 17:11:11 -0700 Subject: [PATCH 08/15] doc: remove deprecated method from README --- README.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/README.md b/README.md index 514a2973..0094fc3e 100644 --- a/README.md +++ b/README.md @@ -46,12 +46,6 @@ An input JSON file specifies all the input parameters and files that are necessa [Input JSON file specification](docs/input.md) -## Running a pipeline without Caper - -> **WARNING**: This method has been deprecated. There are many unfixed known bugs. We no longer support it. - -Caper uses the cromwell workflow execution engine to run the workflow on the platform you specify. While we recommend you use caper, if you want to run cromwell directly without caper you can learn about that [here](docs/deprecated/OLD_METHOD.md). - ## Running a pipeline on DNAnexus You can also run this pipeline on DNAnexus without using Caper or Cromwell. There are two ways to build a workflow on DNAnexus based on our WDL. 
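The deprecated server-mode instructions above tell users to raise `concurrent-job-limit` by editing `dev/backends/backend.conf`. The same limit can instead be overridden on the Cromwell command line with a Java system property, the pattern already used elsewhere in these docs (the Sherlock tutorial and the task-level test script pass `-Dbackend.providers.<backend>.config.concurrent-job-limit=N`). Below is a minimal sketch for the SLURM backend, assuming the deprecated Cromwell setup from the tutorials above (the `encode-chip-seq-pipeline` Conda environment, `cromwell-38.jar`, and `dev/backends/backend.conf`); the value `30` is only an illustrative number, not a recommendation, and should match what your cluster allows.

```bash
# Start a Cromwell server with a higher concurrent-job-limit for the slurm backend.
# The -Dbackend.providers.slurm.config.concurrent-job-limit property overrides the
# value defined in dev/backends/backend.conf without editing the file.
$ source activate encode-chip-seq-pipeline
$ _JAVA_OPTIONS="-Xmx5G" java -jar \
    -Dconfig.file=dev/backends/backend.conf \
    -Dbackend.default=slurm \
    -Dbackend.providers.slurm.config.concurrent-job-limit=30 \
    cromwell-38.jar server
```

As the tutorials note, this limit applies to all sub-tasks of all submitted samples, not to each sample individually.
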
From fb81c5ef18dd0b79c5a4218760e343efc47bf41c Mon Sep 17 00:00:00 2001 From: Jin Lee Date: Mon, 28 Oct 2019 17:26:55 -0700 Subject: [PATCH 09/15] test: fix broken task-level tests (due to removal of backend.conf) --- dev/test/test_task/backend.conf | 354 ++++++++++++++++++++++++++++++++ dev/test/test_task/test.sh | 2 +- 2 files changed, 355 insertions(+), 1 deletion(-) create mode 100644 dev/test/test_task/backend.conf diff --git a/dev/test/test_task/backend.conf b/dev/test/test_task/backend.conf new file mode 100644 index 00000000..040ef6ca --- /dev/null +++ b/dev/test/test_task/backend.conf @@ -0,0 +1,354 @@ +include required(classpath("application")) + +backend { + default = "Local" + providers { + + pbs { + actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" + config { + script-epilogue = "sleep 30 && sync" + concurrent-job-limit = 50 + runtime-attributes = """ + Int cpu = 1 + Int? gpu + Int? time + Int? memory_mb + """ + submit = """ + qsub \ + -N ${job_name} \ + -o ${out} \ + -e ${err} \ + ${true="-lselect=1:ncpus=" false="" defined(cpu)}${cpu}${true=":mem=" false="" defined(memory_mb)}${memory_mb}${true="mb" false="" defined(memory_mb)} \ + ${true="-lwalltime=" false="" defined(time)}${time}${true=":0:0" false="" defined(time)} \ + ${true="-lngpus=" false="" gpu>1}${if gpu>1 then gpu else ""} \ + -V \ + ${script} + """ + kill = "qdel ${job_id}" + check-alive = "qstat ${job_id}" + job-id-regex = "(\\d+).+" + } + } + + pbs_singularity { + actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" + config { + script-epilogue = "sleep 30 && sync" + concurrent-job-limit = 50 + runtime-attributes = """ + Int cpu = 1 + Int? gpu + Int? time + Int? memory_mb + String singularity_container + String? singularity_bindpath + """ + submit = """ + echo "SINGULARITY_BINDPATH=$(echo ${cwd} | sed 's/cromwell-executions/\n/g' | head -n1)cromwell-executions,${singularity_bindpath},$SINGULARITY_BINDPATH singularity exec --cleanenv --home ${cwd} ${if defined(gpu) then '--nv' else ''} ${singularity_container} /bin/bash ${script}" | qsub \ + -N ${job_name} \ + -o ${out} \ + -e ${err} \ + ${true="-lselect=1:ncpus=" false="" defined(cpu)}${cpu}${true=":mem=" false="" defined(memory_mb)}${memory_mb}${true="mb" false="" defined(memory_mb)} \ + ${true="-lwalltime=" false="" defined(time)}${time}${true=":0:0" false="" defined(time)} \ + ${true="-lngpus=" false="" gpu>1}${if gpu>1 then gpu else ""} \ + -V + # If you see an error "The job was aborted from outside Cromwell" + # then check your singularity settings in a workflow options JSON file + # (e.g. check if you have an image file defined by "singularity_container") + # Also, make sure that your input data files (and genome database files) + # are on directories recursively bound by + # "singularity_bindpath" in a workflow options JSON file + # or singularity's built-in environment variable SINGULARITY_BINDPATH. + """ + # cromwell is desinged to monitor rc (return code) file, which is generated/controlled + # in ${script}, so if singularity does not run it due to some problems in singuarlity's + # internal settings then rc file is not generated. + # this can result in hanging of a cromwell process. + # setting the below parameter enables monitoring by "check-alive". + # it will take about "exit-code-timeout-seconds" x 3 time to detect failure. 
+ exit-code-timeout-seconds = 180 + + kill = "qdel ${job_id}" + check-alive = "qstat -j ${job_id}" + job-id-regex = "(\\d+)" + } + } + + slurm_singularity { + actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" + config { + script-epilogue = "sleep 30 && sync" + concurrent-job-limit = 50 + runtime-attributes = """ + Int cpu = 1 + Int? gpu + Int? time + Int? memory_mb + String? slurm_partition + String? slurm_account + String? slurm_extra_param + String singularity_container + String? singularity_bindpath + """ + submit = """ + sbatch \ + --export=ALL \ + -J ${job_name} \ + -D ${cwd} \ + -o ${out} \ + -e ${err} \ + ${"-t " + time*60} \ + -n 1 \ + --ntasks-per-node=1 \ + ${true="--cpus-per-task=" false="" defined(cpu)}${cpu} \ + ${true="--mem=" false="" defined(memory_mb)}${memory_mb} \ + ${"-p " + slurm_partition} \ + ${"--account " + slurm_account} \ + ${true="--gres gpu:" false="" defined(gpu)}${gpu} \ + ${slurm_extra_param} \ + --wrap "SINGULARITY_BINDPATH=$(echo ${cwd} | sed 's/cromwell-executions/\n/g' | head -n1)cromwell-executions,${singularity_bindpath},$SINGULARITY_BINDPATH singularity exec --cleanenv --home ${cwd} ${if defined(gpu) then '--nv' else ''} ${singularity_container} /bin/bash ${script}" + # If you see an error "The job was aborted from outside Cromwell" + # then check your singularity settings in a workflow options JSON file + # (e.g. check if you have an image file defined by "singularity_container") + # Also, make sure that your input data files (and genome database files) + # are on directories recursively bound by + # "singularity_bindpath" in a workflow options JSON file + # or singularity's built-in environment variable SINGULARITY_BINDPATH. + """ + kill = "scancel ${job_id}" + # cromwell is desinged to monitor rc (return code) file, which is generated/controlled + # in ${script}, so if singularity does not run it due to some problems in singuarlity's + # internal settings then rc file is not generated. + # this can result in hanging of a cromwell process. + # setting the below parameter enables monitoring by "check-alive". + # it will take about "exit-code-timeout-seconds" x 3 time to detect failure. + exit-code-timeout-seconds = 180 + + # cromwell responds only to non-zero exit code from "check-alive", + # but "squeue -j [JOB_ID]" returns zero exit code even when job is not found + # workaround to exit with 1 (like SGE's qstat -j [JOB_ID] does) for such cases. + check-alive = "CHK_ALIVE=$(squeue --noheader -j ${job_id}); if [ -z $CHK_ALIVE ]; then /bin/bash -c 'exit 1'; else echo $CHK_ALIVE; fi" + job-id-regex = "Submitted batch job (\\d+).*" + } + } + + sge_singularity { + actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" + config { + script-epilogue = "sleep 30 && sync" + concurrent-job-limit = 50 + runtime-attributes = """ + String sge_pe = "shm" + Int cpu = 1 + Int? gpu + Int? time + Int? memory_mb + String? sge_queue + String? sge_extra_param + String singularity_container + String? 
singularity_bindpath + """ + submit = """ + echo "SINGULARITY_BINDPATH=$(echo ${cwd} | sed 's/cromwell-executions/\n/g' | head -n1)cromwell-executions,${singularity_bindpath},$SINGULARITY_BINDPATH singularity exec --cleanenv --home ${cwd} ${if defined(gpu) then '--nv' else ''} ${singularity_container} /bin/bash ${script}" | qsub \ + -S /bin/sh \ + -terse \ + -b n \ + -N ${job_name} \ + -wd ${cwd} \ + -o ${out} \ + -e ${err} \ + ${if cpu>1 then "-pe " + sge_pe + " " else ""}${if cpu>1 then cpu else ""} \ + ${true="-l h_vmem=$(expr " false="" defined(memory_mb)}${memory_mb}${true=" / " false="" defined(memory_mb)}${if defined(memory_mb) then cpu else ""}${true=")m" false="" defined(memory_mb)} \ + ${true="-l s_vmem=$(expr " false="" defined(memory_mb)}${memory_mb}${true=" / " false="" defined(memory_mb)}${if defined(memory_mb) then cpu else ""}${true=")m" false="" defined(memory_mb)} \ + ${true="-l h_rt=" false="" defined(time)}${time}${true=":00:00" false="" defined(time)}\ + ${true="-l s_rt=" false="" defined(time)}${time}${true=":00:00" false="" defined(time)}\ + ${"-q " + sge_queue} \ + ${"-l gpu=" + gpu} \ + ${sge_extra_param} \ + -V + # If you see an error "The job was aborted from outside Cromwell" + # then check your singularity settings in a workflow options JSON file + # (e.g. check if you have an image file defined by "singularity_container") + # Also, make sure that your input data files (and genome database files) + # are on directories recursively bound by + # "singularity_bindpath" in a workflow options JSON file + # or singularity's built-in environment variable SINGULARITY_BINDPATH. + """ + # cromwell is desinged to monitor rc (return code) file, which is generated/controlled + # in ${script}, so if singularity does not run it due to some problems in singuarlity's + # internal settings then rc file is not generated. + # this can result in hanging of a cromwell process. + # setting the below parameter enables monitoring by "check-alive". + # it will take about "exit-code-timeout-seconds" x 3 time to detect failure. + exit-code-timeout-seconds = 180 + + kill = "qdel ${job_id}" + check-alive = "qstat -j ${job_id}" + job-id-regex = "(\\d+)" + } + } + + singularity { + actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" + config { + script-epilogue = "sleep 5 && sync" + concurrent-job-limit = 10 + run-in-background = true + runtime-attributes = """ + Int? gpu + String singularity_container + String? singularity_bindpath + """ + submit = """ + SINGULARITY_BINDPATH=$(echo ${cwd} | sed 's/cromwell-executions/\n/g' | head -n1)cromwell-executions,${singularity_bindpath},$SINGULARITY_BINDPATH singularity exec --cleanenv --home ${cwd} ${if defined(gpu) then '--nv' else ''} ${singularity_container} /bin/bash ${script} + """ + } + } + + Local { + actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" + config { + concurrent-job-limit = 10 + } + } + + sge { + actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" + config { + script-epilogue = "sleep 30 && sync" + concurrent-job-limit = 50 + runtime-attributes = """ + String sge_pe = "shm" + Int cpu = 1 + Int? gpu + Int? time + Int? memory_mb + String? sge_queue + String? 
sge_extra_param + """ + submit = """ + qsub \ + -S /bin/sh \ + -terse \ + -b n \ + -N ${job_name} \ + -wd ${cwd} \ + -o ${out} \ + -e ${err} \ + ${if cpu>1 then "-pe " + sge_pe + " " else ""}${if cpu>1 then cpu else ""} \ + ${true="-l h_vmem=$(expr " false="" defined(memory_mb)}${memory_mb}${true=" / " false="" defined(memory_mb)}${if defined(memory_mb) then cpu else ""}${true=")m" false="" defined(memory_mb)} \ + ${true="-l s_vmem=$(expr " false="" defined(memory_mb)}${memory_mb}${true=" / " false="" defined(memory_mb)}${if defined(memory_mb) then cpu else ""}${true=")m" false="" defined(memory_mb)} \ + ${true="-l h_rt=" false="" defined(time)}${time}${true=":00:00" false="" defined(time)}\ + ${true="-l s_rt=" false="" defined(time)}${time}${true=":00:00" false="" defined(time)}\ + ${"-q " + sge_queue} \ + ${true="-l gpu=" false="" defined(gpu)}${gpu} \ + ${sge_extra_param} \ + -V \ + ${script} + """ + kill = "qdel ${job_id}" + check-alive = "qstat -j ${job_id}" + job-id-regex = "(\\d+)" + } + } + + slurm { + actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" + config { + script-epilogue = "sleep 30" + concurrent-job-limit = 50 + runtime-attributes = """ + Int cpu = 1 + Int? gpu + Int? time + Int? memory_mb + String? slurm_partition + String? slurm_account + String? slurm_extra_param + """ + submit = """ + sbatch \ + --export=ALL \ + -J ${job_name} \ + -D ${cwd} \ + -o ${out} \ + -e ${err} \ + ${"-t " + time*60} \ + -n 1 \ + --ntasks-per-node=1 \ + ${true="--cpus-per-task=" false="" defined(cpu)}${cpu} \ + ${true="--mem=" false="" defined(memory_mb)}${memory_mb} \ + ${"-p " + slurm_partition} \ + ${"--account " + slurm_account} \ + ${true="--gres gpu:" false="" defined(gpu)}${gpu} \ + ${slurm_extra_param} \ + --wrap "/bin/bash ${script}" + """ + kill = "scancel ${job_id}" + check-alive = "squeue -j ${job_id}" + job-id-regex = "Submitted batch job (\\d+).*" + } + } + + google { + actor-factory = "cromwell.backend.google.pipelines.v2alpha1.PipelinesApiLifecycleActorFactory" + + config { + # Google project + project = "your-project-name" + + # Base bucket for workflow executions + root = "gs://your-bucket-name" + + concurrent-job-limit = 1000 + genomics-api-queries-per-100-seconds = 1000 + maximum-polling-interval = 600 + + genomics { + auth = "application-default" + compute-service-account = "default" + endpoint-url = "https://genomics.googleapis.com/" + restrict-metadata-access = false + } + + filesystems { + gcs { + auth = "application-default" + } + } + } + } + } +} + +services { + LoadController { + class = "cromwell.services.loadcontroller.impl.LoadControllerServiceActor" + config { + # disable it (for login nodes on Stanford SCG, Sherlock) + control-frequency = 21474834 seconds + } + } +} + +system { + abort-jobs-on-terminate = true + graceful-server-shutdown = true +} + +call-caching { + enabled = false + invalidate-bad-cache-results = true +} + +google { + application-name = "cromwell" + auths = [ + { + name = "application-default" + scheme = "application_default" + } + ] +} diff --git a/dev/test/test_task/test.sh b/dev/test/test_task/test.sh index 324fdad2..f107c3fa 100755 --- a/dev/test/test_task/test.sh +++ b/dev/test/test_task/test.sh @@ -26,7 +26,7 @@ else wget -N -c https://storage.googleapis.com/encode-pipeline-test-samples/cromwell_jar/cromwell-42.jar fi CROMWELL_JAR=cromwell-42.jar -BACKEND_CONF=../../backends/backend.conf +BACKEND_CONF=backend.conf BACKEND=Local EXTRA_PARAM="-Dbackend.providers.Local.config.concurrent-job-limit=2" 
PREFIX=$(basename ${WDL} .wdl) From 390b002401f1b796bcc6ff395bd86035ebd55422 Mon Sep 17 00:00:00 2001 From: Jin Lee Date: Mon, 28 Oct 2019 21:05:22 -0700 Subject: [PATCH 10/15] ver: v1.3.2 -> dev-v1.3.3 --- chip.wdl | 6 +++--- dev/dev.md | 12 +++--------- dev/test/test_task/test.sh | 2 +- dev/test/test_workflow/test_chip.sh | 2 +- docs/tutorial_dx_cli.md | 2 +- docs/tutorial_dx_web.md | 24 ++++++++++++------------ 6 files changed, 21 insertions(+), 27 deletions(-) diff --git a/chip.wdl b/chip.wdl index 66797433..72d99787 100644 --- a/chip.wdl +++ b/chip.wdl @@ -1,12 +1,12 @@ # ENCODE TF/Histone ChIP-Seq pipeline # Author: Jin Lee (leepc12@gmail.com) -#CAPER docker quay.io/encode-dcc/chip-seq-pipeline:v1.3.2 -#CAPER singularity docker://quay.io/encode-dcc/chip-seq-pipeline:v1.3.2 +#CAPER docker quay.io/encode-dcc/chip-seq-pipeline:dev-v1.3.3 +#CAPER singularity docker://quay.io/encode-dcc/chip-seq-pipeline:dev-v1.3.3 #CROO out_def https://storage.googleapis.com/encode-pipeline-output-definition/chip.croo.json workflow chip { - String pipeline_ver = 'v1.3.2' + String pipeline_ver = 'dev-v1.3.3' ### sample name, description String title = 'Untitled' String description = 'No description' diff --git a/dev/dev.md b/dev/dev.md index bc49fe39..52a9030d 100644 --- a/dev/dev.md +++ b/dev/dev.md @@ -2,18 +2,12 @@ ## Command line for version change ```bash -PREV_VER=v1.3.2 -NEW_VER=v1.3.2 +PREV_VER=dev-v1.3.3 +NEW_VER=dev-v1.3.3 for f in $(grep -rl ${PREV_VER} --include=*.{wdl,md,sh}) do sed -i "s/${PREV_VER}/${NEW_VER}/g" ${f} done -cd dev/workflow_opts -for f in $(grep -rl ${PREV_VER} --include=*.json) -do - sed -i "s/${PREV_VER}/${NEW_VER}/g" ${f} -done -cd ../../ ``` ## Building templates on DX for each genome @@ -24,7 +18,7 @@ Run the following command line locally to build out DX workflows for this pipeli ```bash # version -VER=v1.3.2 +VER=dev-v1.3.3 DOCKER=quay.io/encode-dcc/chip-seq-pipeline:$VER # general diff --git a/dev/test/test_task/test.sh b/dev/test/test_task/test.sh index f107c3fa..61d3a164 100755 --- a/dev/test/test_task/test.sh +++ b/dev/test/test_task/test.sh @@ -12,7 +12,7 @@ INPUT=$2 if [ $# -gt 2 ]; then DOCKER_IMAGE=$3 else - DOCKER_IMAGE=quay.io/encode-dcc/chip-seq-pipeline:v1.3.2 + DOCKER_IMAGE=quay.io/encode-dcc/chip-seq-pipeline:dev-v1.3.3 fi if [ $# -gt 3 ]; then NUM_TASK=$4 diff --git a/dev/test/test_workflow/test_chip.sh b/dev/test/test_workflow/test_chip.sh index a6b19fc1..f08bc884 100755 --- a/dev/test/test_workflow/test_chip.sh +++ b/dev/test/test_workflow/test_chip.sh @@ -8,7 +8,7 @@ fi if [ $# -gt 2 ]; then DOCKER_IMAGE=$3 else - DOCKER_IMAGE=quay.io/encode-dcc/chip-seq-pipeline:v1.3.2 + DOCKER_IMAGE=quay.io/encode-dcc/chip-seq-pipeline:dev-v1.3.3 fi INPUT=$1 GCLOUD_SERVICE_ACCOUNT_SECRET_JSON_FILE=$2 diff --git a/docs/tutorial_dx_cli.md b/docs/tutorial_dx_cli.md index b04ab6a5..d480f442 100644 --- a/docs/tutorial_dx_cli.md +++ b/docs/tutorial_dx_cli.md @@ -45,7 +45,7 @@ This document describes instruction for the item 1). 
```bash $ PROJECT=[YOUR_PROJECT_NAME] $ OUT_FOLDER=/test_sample_chip_ENCSR936XTK_subsampled_chr19_only - $ DOCKER=quay.io/encode-dcc/chip-seq-pipeline:v1.3.2 + $ DOCKER=quay.io/encode-dcc/chip-seq-pipeline:dev-v1.3.3 $ java -jar dxWDL-0.77.jar compile chip.wdl -project ${PROJECT} -f -folder ${OUT_FOLDER} -defaults ${INPUT} -extras <(echo "{\"default_runtime_attributes\":{\"docker\":\"${DOCKER}\"}}") ``` diff --git a/docs/tutorial_dx_web.md b/docs/tutorial_dx_web.md index 33518e50..fd587f77 100644 --- a/docs/tutorial_dx_web.md +++ b/docs/tutorial_dx_web.md @@ -15,8 +15,8 @@ This document describes instruction for the item 2). 3. Move to one of the following workflow directories according to the platform you have chosen for your project (AWS or Azure). These DX workflows are pre-built with all parameters defined. -* [AWS test workflow](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.3.2/test_ENCSR936XTK_subsampled_chr19_only) -* [Azure test workflow](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.3.2/test_ENCSR936XTK_subsampled_chr19_only) +* [AWS test workflow](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/dev-v1.3.3/test_ENCSR936XTK_subsampled_chr19_only) +* [Azure test workflow](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/dev-v1.3.3/test_ENCSR936XTK_subsampled_chr19_only) 4. Copy it to your project by right-clicking on the DX workflow `chip` and choose "Copy". @@ -40,16 +40,16 @@ This document describes instruction for the item 2). 1. DNAnexus allows only one copy of a workflow per project. The example workflow in the previous section is pre-built for the subsampled test sample [ENCSR936XTK](https://www.encodeproject.org/experiments/ENCSR936XTK/) with all parameters defined already. 2. Copy one of the following workflows according to the platform you have chosen for your project (AWS or Azure). -* [AWS general](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.3.2/general) without pre-defined reference genome. -* [AWS hg38](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.3.2/hg38) with pre-defined hg38 reference genome. -* [AWS hg19](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.3.2/hg19) with pre-defined hg19 reference genome. -* [AWS mm10](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.3.2/mm10) with pre-defined mm10 reference genome. -* [AWS mm9](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.3.2/mm9) with pre-defined mm9 reference genome. -* [Azure general](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.3.2/general) without pre-defined reference genome. -* [Azure hg38](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.3.2/hg38) with pre-defined hg38 reference genome. -* [Azure hg19](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.3.2/hg19) with pre-defined hg19 reference genome. -* [Azure mm10](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.3.2/mm10) with pre-defined mm10 reference genome. 
-* [Azure mm9](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.3.2/mm9) with pre-defined mm9 reference genome. +* [AWS general](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/dev-v1.3.3/general) without pre-defined reference genome. +* [AWS hg38](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/dev-v1.3.3/hg38) with pre-defined hg38 reference genome. +* [AWS hg19](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/dev-v1.3.3/hg19) with pre-defined hg19 reference genome. +* [AWS mm10](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/dev-v1.3.3/mm10) with pre-defined mm10 reference genome. +* [AWS mm9](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/dev-v1.3.3/mm9) with pre-defined mm9 reference genome. +* [Azure general](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/dev-v1.3.3/general) without pre-defined reference genome. +* [Azure hg38](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/dev-v1.3.3/hg38) with pre-defined hg38 reference genome. +* [Azure hg19](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/dev-v1.3.3/hg19) with pre-defined hg19 reference genome. +* [Azure mm10](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/dev-v1.3.3/mm10) with pre-defined mm10 reference genome. +* [Azure mm9](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/dev-v1.3.3/mm9) with pre-defined mm9 reference genome. 3. Click on the DX workflow `chip`. 
From 5760f4d9bb898c9f57d9ca551d24c2917ec112f3 Mon Sep 17 00:00:00 2001 From: Jin Lee Date: Tue, 29 Oct 2019 15:18:54 -0700 Subject: [PATCH 11/15] new feature: nadded params for some tasks with java -Xmx (for picard tools) --- chip.wdl | 18 ++++++++++++++++-- dev/test/test_task/test_filter.wdl | 4 ++++ dev/test/test_task/test_gc_bias.wdl | 2 +- docs/input.md | 9 +++++++++ example_input_json/template.full.json | 5 ++++- src/encode_task_filter.py | 13 ++++++++++--- src/encode_task_fraglen_stat_pe.py | 18 ++++++++++++++---- src/encode_task_gc_bias.py | 18 ++++++++++++++---- src/encode_task_preseq.py | 16 ++++++++++++---- src/encode_task_tss_enrich.py | 9 +++++++++ 10 files changed, 93 insertions(+), 19 deletions(-) diff --git a/chip.wdl b/chip.wdl index 72d99787..9ee8c923 100644 --- a/chip.wdl +++ b/chip.wdl @@ -127,6 +127,9 @@ workflow chip { Int call_peak_time_hr = 72 String call_peak_disks = 'local-disk 200 HDD' + String filter_picard_java_heap = '4G' + String gc_bias_picard_java_heap = '6G' + #### input file definition # pipeline can start from any type of inputs and then leave all other types undefined # supported types: fastq, bam, nodup_bam (filtered bam), ta (tagAlign), peak @@ -424,6 +427,7 @@ workflow chip { cpu = filter_cpu, mem_mb = filter_mem_mb, + picard_java_heap = filter_picard_java_heap, time_hr = filter_time_hr, disks = filter_disks, } @@ -469,6 +473,7 @@ workflow chip { call gc_bias { input : nodup_bam = nodup_bam_, ref_fa = ref_fa_, + picard_java_heap = gc_bias_picard_java_heap, } } @@ -505,6 +510,7 @@ workflow chip { cpu = filter_cpu, mem_mb = filter_mem_mb, + picard_java_heap = filter_picard_java_heap, time_hr = filter_time_hr, disks = filter_disks, } @@ -537,6 +543,7 @@ workflow chip { cpu = filter_cpu, mem_mb = filter_mem_mb, + picard_java_heap = filter_picard_java_heap, time_hr = filter_time_hr, disks = filter_disks, } @@ -633,6 +640,7 @@ workflow chip { cpu = filter_cpu, mem_mb = filter_mem_mb, + picard_java_heap = filter_picard_java_heap, time_hr = filter_time_hr, disks = filter_disks, } @@ -1280,8 +1288,10 @@ task filter { File chrsz # 2-col chromosome sizes file Boolean no_dup_removal # no dupe reads removal when filtering BAM String mito_chr_name + Int cpu Int mem_mb + String picard_java_heap Int time_hr String disks @@ -1296,7 +1306,8 @@ task filter { ${'--chrsz ' + chrsz} \ ${if no_dup_removal then '--no-dup-removal' else ''} \ ${'--mito-chr-name ' + mito_chr_name} \ - ${'--nth ' + cpu} + ${'--nth ' + cpu} \ + ${'--picard-java-heap ' + picard_java_heap} } output { File nodup_bam = glob('*.bam')[0] @@ -1760,10 +1771,13 @@ task gc_bias { File nodup_bam File ref_fa + String picard_java_heap + command { python3 $(which encode_task_gc_bias.py) \ ${'--nodup-bam ' + nodup_bam} \ - ${'--ref-fa ' + ref_fa} + ${'--ref-fa ' + ref_fa} \ + ${'--picard-java-heap ' + picard_java_heap} } output { File gc_plot = glob('*.gc_plot.png')[0] diff --git a/dev/test/test_task/test_filter.wdl b/dev/test/test_task/test_filter.wdl index 22fbb53b..98c721b5 100644 --- a/dev/test/test_task/test_filter.wdl +++ b/dev/test/test_task/test_filter.wdl @@ -39,6 +39,7 @@ workflow test_filter { cpu = filter_cpu, mem_mb = filter_mem_mb, + picard_java_heap = '4G', time_hr = filter_time_hr, disks = filter_disks, } @@ -54,6 +55,7 @@ workflow test_filter { cpu = filter_cpu, mem_mb = filter_mem_mb, + picard_java_heap = '4G', time_hr = filter_time_hr, disks = filter_disks, } @@ -69,6 +71,7 @@ workflow test_filter { cpu = filter_cpu, mem_mb = filter_mem_mb, + picard_java_heap = '4G', time_hr = filter_time_hr, 
disks = filter_disks, } @@ -84,6 +87,7 @@ workflow test_filter { cpu = filter_cpu, mem_mb = filter_mem_mb, + picard_java_heap = '4G', time_hr = filter_time_hr, disks = filter_disks, } diff --git a/dev/test/test_task/test_gc_bias.wdl b/dev/test/test_task/test_gc_bias.wdl index b9ff4bf2..1786a1ce 100644 --- a/dev/test/test_task/test_gc_bias.wdl +++ b/dev/test/test_task/test_gc_bias.wdl @@ -12,8 +12,8 @@ workflow test_gc_bias { call chip.gc_bias { input : nodup_bam = nodup_bam, - ref_fa = ref_fa, + picard_java_heap = '4G', } call remove_comments_from_gc_log { input : diff --git a/docs/input.md b/docs/input.md index 75db8beb..ca747db2 100644 --- a/docs/input.md +++ b/docs/input.md @@ -254,6 +254,15 @@ Parameter|Default `chip.macs2_signal_track_time_hr` | 24 `chip.macs2_signal_track_disks` | `local-disk 200 HDD` +> **IMPORTANT**: If you see Java memory errors, check the following resource parameters. + +There are special parameters to control maximum Java heap memory (e.g. `java -Xmx4G`) for Picard tools. They are strings including size units. Such string will be directly appended to Java's parameter `-Xmx`. + +Parameter|Default +---------|------- +`chip.filter_picard_java_heap` | `4G` +`chip.gc_bias_picard_java_heap` | `6G` + ## How to use a custom aligner ENCODE ChIP-Seq pipeline currently supports `bwa` and `bowtie2`. In order to use your own aligner you need to define the following parameters first. You can define `custom_aligner_idx_tar` either in your input JSON file or in your genome TSV file. Such index TAR file should be an uncompressed TAR file without any directory structured. diff --git a/example_input_json/template.full.json b/example_input_json/template.full.json index e4cabad0..e8571010 100644 --- a/example_input_json/template.full.json +++ b/example_input_json/template.full.json @@ -86,5 +86,8 @@ "chip.macs2_signal_track_mem_mb" : 16000, "chip.macs2_signal_track_time_hr" : 24, - "chip.macs2_signal_track_disks" : "local-disk 200 HDD" + "chip.macs2_signal_track_disks" : "local-disk 200 HDD", + + "chip.filter_picard_java_heap" : "4G", + "chip.gc_bias_picard_java_heap" : "6G" } diff --git a/src/encode_task_filter.py b/src/encode_task_filter.py index 763d2c52..5af891e1 100755 --- a/src/encode_task_filter.py +++ b/src/encode_task_filter.py @@ -40,6 +40,9 @@ def parse_arguments(): help='Mito chromosome name.') parser.add_argument('--nth', type=int, default=1, help='Number of threads to parallelize.') + parser.add_argument('--picard-java-heap', + help='Picard\'s Java max. 
heap: java -jar picard.jar ' + '-Xmx[MAX_HEAP]') parser.add_argument('--out-dir', default='', type=str, help='Output directory.') parser.add_argument('--log-level', default='INFO', @@ -144,15 +147,19 @@ def rm_unmapped_lowq_reads_pe(bam, multimapping, mapq_thresh, nth, out_dir): return filt_bam -def mark_dup_picard(bam, out_dir): # shared by both se and pe +def mark_dup_picard(bam, out_dir, java_heap=None): # shared by both se and pe prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam))) # strip extension appended in the previous step prefix = strip_ext(prefix, 'filt') dupmark_bam = '{}.dupmark.bam'.format(prefix) dup_qc = '{}.dup.qc'.format(prefix) + if java_heap is None: + java_heap_param = '-Xmx4G' + else: + java_heap_param = '-Xmx{}'.format(java_heap) - cmd = 'java -Xmx4G -XX:ParallelGCThreads=1 -jar ' + cmd = 'java {} -XX:ParallelGCThreads=1 -jar '.format(java_heap_param) cmd += locate_picard() cmd += ' MarkDuplicates ' # cmd = 'picard MarkDuplicates ' @@ -295,7 +302,7 @@ def main(): log.info('Marking dupes with {}...'.format(args.dup_marker)) if args.dup_marker == 'picard': dupmark_bam, dup_qc = mark_dup_picard( - filt_bam, args.out_dir) + filt_bam, args.out_dir, args.picard_java_heap) elif args.dup_marker == 'sambamba': dupmark_bam, dup_qc = mark_dup_sambamba( filt_bam, args.nth, args.out_dir) diff --git a/src/encode_task_fraglen_stat_pe.py b/src/encode_task_fraglen_stat_pe.py index fcecec82..8102ac8e 100755 --- a/src/encode_task_fraglen_stat_pe.py +++ b/src/encode_task_fraglen_stat_pe.py @@ -86,6 +86,9 @@ def parse_arguments(): parser = argparse.ArgumentParser(prog='ENCODE fragment length stat') parser.add_argument('--nodup-bam', type=str, help='Raw BAM file (from task filter).') + parser.add_argument('--picard-java-heap', + help='Picard\'s Java max. 
heap: java -jar picard.jar ' + '-Xmx[MAX_HEAP]') parser.add_argument('--out-dir', default='', type=str, help='Output directory.') parser.add_argument('--log-level', default='INFO', help='Log level', @@ -107,14 +110,18 @@ def read_picard_histogram(data_file): return data -def get_insert_distribution(final_bam, prefix): +def get_insert_distribution(final_bam, prefix, java_heap=None): ''' Calls Picard CollectInsertSizeMetrics ''' log.info('insert size distribution...') insert_data = '{0}.inserts.hist_data.log'.format(prefix) insert_plot = '{0}.inserts.hist_graph.pdf'.format(prefix) - graph_insert_dist = ('java -Xmx6G -XX:ParallelGCThreads=1 -jar ' + if java_heap is None: + java_heap_param = '-Xmx6G' + else: + java_heap_param = '-Xmx{}'.format(java_heap) + graph_insert_dist = ('java {4} -XX:ParallelGCThreads=1 -jar ' '{3} ' 'CollectInsertSizeMetrics ' 'INPUT={0} OUTPUT={1} H={2} ' @@ -123,7 +130,8 @@ def get_insert_distribution(final_bam, prefix): 'W=1000 STOP_AFTER=5000000').format(final_bam, insert_data, insert_plot, - locate_picard()) + locate_picard(), + java_heap_param) log.info(graph_insert_dist) os.system(graph_insert_dist) return insert_data, insert_plot @@ -213,10 +221,12 @@ def main(): args.out_dir, os.path.basename(strip_ext_bam(FINAL_BAM))) RG_FREE_FINAL_BAM = remove_read_group(FINAL_BAM) + JAVA_HEAP = args.picard_java_heap # Insert size distribution - CAN'T GET THIS FOR SE FILES insert_data, insert_plot = get_insert_distribution(RG_FREE_FINAL_BAM, - OUTPUT_PREFIX) + OUTPUT_PREFIX, + JAVA_HEAP) # Also need to run n-nucleosome estimation fragment_length_qc(read_picard_histogram(insert_data), OUTPUT_PREFIX) diff --git a/src/encode_task_gc_bias.py b/src/encode_task_gc_bias.py index 7a86061c..93b1264f 100755 --- a/src/encode_task_gc_bias.py +++ b/src/encode_task_gc_bias.py @@ -25,6 +25,9 @@ def parse_arguments(): parser.add_argument('--nodup-bam', type=str, help='Raw BAM file (from task filter).') parser.add_argument('--ref-fa', type=str, help='Reference fasta file.') + parser.add_argument('--picard-java-heap', + help='Picard\'s Java max. heap: java -jar picard.jar ' + '-Xmx[MAX_HEAP]') parser.add_argument('--out-dir', default='', type=str, help='Output directory.') parser.add_argument('--log-level', default='INFO', help='Log level', @@ -36,7 +39,7 @@ def parse_arguments(): return args -def get_gc(qsorted_bam_file, reference_fasta, prefix): +def get_gc(qsorted_bam_file, reference_fasta, prefix, java_heap=None): ''' Uses picard tools (CollectGcBiasMetrics). Note that the reference MUST be the same fasta file that generated the bowtie indices. 
@@ -47,7 +50,11 @@ def get_gc(qsorted_bam_file, reference_fasta, prefix): output_file = '{0}.gc.txt'.format(prefix) plot_file = '{0}.gcPlot.pdf'.format(prefix) summary_file = '{0}.gcSummary.txt'.format(prefix) - get_gc_metrics = ('java -Xmx6G -XX:ParallelGCThreads=1 -jar ' + if java_heap is None: + java_heap_param = '-Xmx6G' + else: + java_heap_param = '-Xmx{}'.format(java_heap) + get_gc_metrics = ('java {6} -XX:ParallelGCThreads=1 -jar ' '{5} ' 'CollectGcBiasMetrics R={0} I={1} O={2} ' 'USE_JDK_DEFLATER=TRUE USE_JDK_INFLATER=TRUE ' @@ -58,7 +65,8 @@ def get_gc(qsorted_bam_file, reference_fasta, prefix): output_file, plot_file, summary_file, - locate_picard()) + locate_picard(), + java_heap_param) logging.info(get_gc_metrics) os.system(get_gc_metrics) return output_file, plot_file, summary_file @@ -114,10 +122,12 @@ def main(): args.out_dir, os.path.basename(strip_ext_bam(FINAL_BAM))) RG_FREE_FINAL_BAM = remove_read_group(FINAL_BAM) + JAVA_HEAP = args.picard_java_heap gc_out, gc_plot_pdf, gc_summary = get_gc(RG_FREE_FINAL_BAM, REF, - OUTPUT_PREFIX) + OUTPUT_PREFIX, + JAVA_HEAP) # will generate PNG format from gc_out plot_gc(gc_out, OUTPUT_PREFIX) diff --git a/src/encode_task_preseq.py b/src/encode_task_preseq.py index 5dc2464a..2c6b5e78 100755 --- a/src/encode_task_preseq.py +++ b/src/encode_task_preseq.py @@ -24,6 +24,9 @@ def parse_arguments(): parser.add_argument('--paired-end', action="store_true", help='Paired-end BAM.') parser.add_argument('--bam', type=str, help='Raw BAM file.') + parser.add_argument('--picard-java-heap', + help='Picard\'s Java max. heap: java -jar picard.jar ' + '-Xmx[MAX_HEAP]') parser.add_argument('--out-dir', default='', type=str, help='Output directory.') parser.add_argument('--log-level', default='INFO', help='Log level', @@ -35,21 +38,25 @@ def parse_arguments(): return args -def get_picard_complexity_metrics(aligned_bam, prefix): +def get_picard_complexity_metrics(aligned_bam, prefix, java_heap=None): ''' Picard EsimateLibraryComplexity ''' # remove redundant (or malformed) info (read group) from bam out_file = '{0}.picardcomplexity.qc'.format(prefix) + if java_heap is None: + java_heap_param = '-Xmx6G' + else: + java_heap_param = '-Xmx{}'.format(java_heap) get_gc_metrics = ( 'mkdir -p tmp_java && java -Djava.io.tmpdir=$PWD/tmp_java ' - '-Xmx6G -XX:ParallelGCThreads=1 -jar ' + '{3} -XX:ParallelGCThreads=1 -jar ' '{2} ' 'EstimateLibraryComplexity INPUT={0} OUTPUT={1} ' 'USE_JDK_DEFLATER=TRUE USE_JDK_INFLATER=TRUE ' 'VERBOSITY=ERROR ' 'QUIET=TRUE && rm -rf tmp_java').format( - aligned_bam, out_file, locate_picard()) + aligned_bam, out_file, locate_picard(), java_heap_param) os.system(get_gc_metrics) # Extract the actual estimated library size @@ -128,10 +135,11 @@ def main(): args.out_dir, os.path.basename(strip_ext_bam(ALIGNED_BAM))) RG_FREE_ALIGNED_BAM = remove_read_group(ALIGNED_BAM) + JAVA_HEAP = args.picard_java_heap # Library complexity: Preseq results, NRF, PBC1, PBC2 if args.paired_end: picard_est_lib_size = get_picard_complexity_metrics( - RG_FREE_ALIGNED_BAM, OUTPUT_PREFIX) + RG_FREE_ALIGNED_BAM, OUTPUT_PREFIX, JAVA_HEAP) else: picard_est_lib_size = None preseq_data, preseq_log = run_preseq( diff --git a/src/encode_task_tss_enrich.py b/src/encode_task_tss_enrich.py index 6d4905aa..8c9c2cc7 100755 --- a/src/encode_task_tss_enrich.py +++ b/src/encode_task_tss_enrich.py @@ -25,6 +25,9 @@ def parse_arguments(): parser = argparse.ArgumentParser(prog='ENCODE TSS enrichment.') parser.add_argument('--read-len-log', type=str, help='Read length log file (from 
aligner task).') + parser.add_argument('--read-len', type=int, + help='Read length (integer). This is ignored if ' + '--read-len-log is defined.') parser.add_argument('--nodup-bam', type=str, help='Raw BAM file (from task filter).') parser.add_argument('--chrsz', type=str, @@ -36,6 +39,10 @@ def parse_arguments(): choices=['NOTSET', 'DEBUG', 'INFO', 'WARNING', 'CRITICAL', 'ERROR', 'CRITICAL']) args = parser.parse_args() + + if args.read_len_log is None and args.read_len is None: + raise ValueError('Either --read-len-log or --read-len must be defined.') + log.setLevel(args.log_level) log.info(sys.argv) return args @@ -144,6 +151,8 @@ def main(): if args.read_len_log: with open(args.read_len_log, 'r') as fp: read_len = int(fp.read().strip()) + elif args.read_len: + read_len = args.read_len else: read_len = None From d2f6e97122734a0260bad348dc9ba3961dd778d5 Mon Sep 17 00:00:00 2001 From: Jin Lee Date: Wed, 30 Oct 2019 14:31:28 -0700 Subject: [PATCH 12/15] doc: added TLDR version of input spec --- README.md | 5 +- docs/input_short.md | 242 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 246 insertions(+), 1 deletion(-) create mode 100644 docs/input_short.md diff --git a/README.md b/README.md index 0094fc3e..9525a8c3 100644 --- a/README.md +++ b/README.md @@ -42,9 +42,12 @@ Use `https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq ## Input JSON file +> **IMPORTANT**: DO NOT BLINDLY USE A TEMPLATE/EXAMPLE INPUT JSON. READ THROUGH THE FOLLOWING GUIDE TO MAKE A CORRECT INPUT JSON FILE. + An input JSON file specifies all the input parameters and files that are necessary for successfully running this pipeline. This includes a specification of the path to the genome reference files and the raw data fastq file. Please make sure to specify absolute paths rather than relative paths in your input JSON files. -[Input JSON file specification](docs/input.md) + - [Input JSON file specification (short)](docs/input_short.md) + - [Input JSON file specification (long)](docs/input.md) ## Running a pipeline on DNAnexus diff --git a/docs/input_short.md b/docs/input_short.md new file mode 100644 index 00000000..2c672a4c --- /dev/null +++ b/docs/input_short.md @@ -0,0 +1,242 @@ +# Input JSON + +An input JSON file includes all genomic data files, parameters and metadata for running pipelines. Our pipeline will use default values if they are not defined in an input JSON file. We provide a set of template JSON files: [minimum](../example_input_json/template.json) and [full](../example_input_json/template.full.json). We recommend to use a minimum template instead of full one. A full template includes all parameters of the pipeline with default values defined. + +# Checklist + +Mandatory parameters: + + 1) Pipeline type + - `chip.pipeline_type`: `tf` for TF ChIP-seq or `histone` for histone ChIP-seq. One major difference between two types is that `tf` uses `spp` peak caller with controls but `histone` uses `macs2` peak caller without controls. + + 2) Experiment title/description + - `chip.title`: experiment title for a final HTML report. + - `chip.description`: experiment description for a final HTML report. + + 3) Read endedness + - `chip.paired_end`: `true` if ALL replicates are paired-ended. + - (Optional) `chip.paired_ends`: For samples with mixed read ends, you can define read endedness for each biological replicate (e.g. `[true, false]` means paired-ended biorep-1 and single-ended biorep-2). + - `chip.ctl_paired_end`: `true` if ALL controls are paired-ended. 
If not defined then `chip.paired_end` will be used.
+  - (Optional) `chip.ctl_paired_ends`: For controls with mixed read ends, you can define read endedness for each biological replicate (e.g. `[true, false]` means paired-ended biorep-1 and single-ended biorep-2). If not defined then `chip.paired_ends` will be used.
+
+ 4) Reference genome
+  - `chip.genome_tsv`: Use `https://storage.googleapis.com/encode-pipeline-genome-data/genome_tsv/v1/[GENOME]_caper.tsv`.
+  - Supported `GENOME`s are hg38, mm10, hg19 and mm9.
+  - We provide a genome TSV file that defines all genome-specific parameters and reference data files. Caper will automatically download big reference data files from our ENCODE repository.
+  - However, we also have reference data mirrors for [some platforms](input_details.md/#reference-genome) (GCP, AWS, Sherlock, SCG, ...). On these platforms, you can use a different TSV file to prevent downloading such big reference data.
+  - To build a new TSV file from your own FASTA (`.fa` and `.2bit`), see [this](build_genome_database.md).
+
+ 5) [Input files](#input-files) and [adapters](#adapters)
+  - See [this](#input-files) for how to define FASTQ/BAM/TAG-ALIGNs for your sample.
+  - See [this](#adapters) for how to define adapters to be trimmed.
+
+ 6) Important parameters
+  - `chip.always_use_pooled_ctl`: (For TF ChIP-seq only) Always use a pooled control to compare with each replicate. If a single control is given then use it. It is disabled by default.
+  - `chip.ctl_depth_ratio`: (For TF ChIP-seq only) If the ratio of depth between controls is higher than this, then always use a pooled control for all replicates. It's 1.2 by default.
+
+ 7) [Resources](#resources)
+  - If your FASTQs/BAMs are big (>10GB) then try with higher resource settings, especially for memory (`chip.[TASK_NAME]_mem_mb`).
+
+Optional parameters:
+
+ 8) Useful parameters
+  - `chip.subsample_reads`: Subsample experiment reads. This will affect all downstream analyses including peak-calling. It's 0 by default, which means no subsampling.
+  - `chip.ctl_subsample_reads`: Subsample control reads. This will affect all downstream analyses including peak-calling. It's 0 by default, which means no subsampling.
+  - `chip.fraglen`: Array of Integers. Fragment length for each bio replicate. If you start from FASTQs then our pipeline automatically estimates it from the cross-correlation analysis (task `xcor`) result since such analysis requires a special treatment for FASTQs. It is possible that fragment length is not estimated correctly (or the pipeline can fail due to negative fraglen) if you start from different types (BAM/TAG-ALIGN). For such cases, you can manually define fragment length for each bio rep (e.g. `[200, 150]` means 200 for rep1 and 150 for rep2).
+
+ 9) Flags
+  - `chip.align_only`: Peak calling and its downstream analyses will be disabled. Useful if you just want to align your FASTQs into filtered BAMs/TAG-ALIGNs and don't want to call peaks on them.
+  - `chip.true_rep_only`: Disable pseudo replicate generation and all related analyses.
+
+
+## Input files
+
+> **IMPORTANT**: Our pipeline considers a replicate (`rep`) as a biological replicate. You can still define technical replicates for each bio replicate. Tech replicates will be merged together to make a single FASTQ for each bio replicate. Controls can also have technical replicates.
+
+> **IMPORTANT**: Our pipeline supports up to 10 bio replicates and 10 controls.
+
+> **IMPORTANT**: Our pipeline has cross-validation analyses (IDR/overlap) comparing every pair of all replicates. Number of tasks for such analyses will grow as nC2 (n choose 2). This number will be 45 for 10 bio replicates. It's recommended to keep the number of replicates <= 4.
+
+The pipeline can start from any of the following data types (FASTQ, BAM, NODUP_BAM and TAG-ALIGN).
+
+ 1) Starting from FASTQs
+  - Technical replicates for each bio-rep will be **MERGED** in the very early stage of the pipeline. Each read end R1 and R2 have separate arrays `chip.fastqs_repX_R1` and `chip.fastqs_repX_R2`. Do not define R2 array for single-ended replicates.
+  - Example of 3 paired-ended biological replicates and 2 technical replicates for each bio rep. Two technical replicates `BIOREPX_TECHREP1.R1.fq.gz` and `BIOREPX_TECHREP2.R1.fq.gz` for each bio replicate will be merged.
+
+    ```javascript
+    {
+        "chip.paired_end" : true,
+        "chip.fastqs_rep1_R1" : ["BIOREP1_TECHREP1.R1.fq.gz", "BIOREP1_TECHREP2.R1.fq.gz"],
+        "chip.fastqs_rep1_R2" : ["BIOREP1_TECHREP1.R2.fq.gz", "BIOREP1_TECHREP2.R2.fq.gz"],
+        "chip.fastqs_rep2_R1" : ["BIOREP2_TECHREP1.R1.fq.gz", "BIOREP2_TECHREP2.R1.fq.gz"],
+        "chip.fastqs_rep2_R2" : ["BIOREP2_TECHREP1.R2.fq.gz", "BIOREP2_TECHREP2.R2.fq.gz"],
+        "chip.fastqs_rep3_R1" : ["BIOREP3_TECHREP1.R1.fq.gz", "BIOREP3_TECHREP2.R1.fq.gz"],
+        "chip.fastqs_rep3_R2" : ["BIOREP3_TECHREP1.R2.fq.gz", "BIOREP3_TECHREP2.R2.fq.gz"]
+    }
+    ```
+
+ 2) Starting from BAMs
+  - Define a BAM for each replicate. Our pipeline does not determine read endedness from a BAM file. You need to explicitly define read endedness.
+  - Example of 3 single-ended replicates.
+    ```javascript
+    {
+        "chip.paired_end" : false,
+        "chip.bams" : ["rep1.bam", "rep2.bam", "rep3.bam"]
+    }
+    ```
+
+ 3) Starting from filtered/deduped BAMs
+  - Define a filtered/deduped BAM for each replicate. Our pipeline does not determine read endedness from a BAM file. You need to explicitly define read endedness. These BAMs should not have unmapped reads or duplicates.
+  - Example of 2 single-ended replicates.
+    ```javascript
+    {
+        "chip.paired_end" : false,
+        "chip.nodup_bams" : ["rep1.nodup.bam", "rep2.nodup.bam"]
+    }
+    ```
+
+ 4) Starting from TAG-ALIGN BEDs
+  - Define a TAG-ALIGN for each replicate. Our pipeline does not determine read endedness from a TAG-ALIGN file. You need to explicitly define read endedness.
+  - Example of 4 paired-ended replicates.
+
+    ```javascript
+    {
+        "chip.paired_end" : true,
+        "chip.tas" : ["rep1.tagAlign.gz", "rep2.tagAlign.gz", "rep3.tagAlign.gz", "rep4.tagAlign.gz"]
+    }
+    ```
+
+You need to define controls for the TF ChIP-seq pipeline. Skip this if you want to run histone ChIP-seq pipelines. You can define controls similarly to experiment IP replicates. Just add the `ctl_` prefix to parameter names.
+
+ 1) Control FASTQs
+  - Technical replicates for each bio-rep will be **MERGED** in the very early stage of the pipeline. Each read end R1 and R2 have separate arrays `chip.ctl_fastqs_repX_R1` and `chip.ctl_fastqs_repX_R2`. Do not define R2 array for single-ended replicates.
+  - Example of 2 paired-ended biological replicates and 2 technical replicates for each bio rep. Two technical replicates `BIOREPX_TECHREP1.R1.fq.gz` and `BIOREPX_TECHREP2.R1.fq.gz` for each bio replicate will be merged.
+
+    ```javascript
+    {
+        "chip.ctl_paired_end" : true,
+        "chip.ctl_fastqs_rep1_R1" : ["BIOREP1_TECHREP1.R1.fq.gz", "BIOREP1_TECHREP2.R1.fq.gz"],
+        "chip.ctl_fastqs_rep1_R2" : ["BIOREP1_TECHREP1.R2.fq.gz", "BIOREP1_TECHREP2.R2.fq.gz"],
+        "chip.ctl_fastqs_rep2_R1" : ["BIOREP2_TECHREP1.R1.fq.gz", "BIOREP2_TECHREP2.R1.fq.gz"],
+        "chip.ctl_fastqs_rep2_R2" : ["BIOREP2_TECHREP1.R2.fq.gz", "BIOREP2_TECHREP2.R2.fq.gz"]
+    }
+    ```
+
+ 2) Control BAMs
+  - Define a BAM for each replicate. Our pipeline does not determine read endedness from a BAM file. You need to explicitly define read endedness.
+  - Example of 3 single-ended replicates.
+
+    ```javascript
+    {
+        "chip.ctl_paired_end" : false,
+        "chip.ctl_bams" : ["ctl1.bam", "ctl2.bam", "ctl3.bam"]
+    }
+    ```
+
+ 3) Control filtered/deduped BAMs
+  - Define a filtered/deduped BAM for each replicate. Our pipeline does not determine read endedness from a BAM file. You need to explicitly define read endedness. These BAMs should not have unmapped reads or duplicates.
+  - Example of 2 single-ended replicates.
+    ```javascript
+    {
+        "chip.ctl_paired_end" : false,
+        "chip.ctl_nodup_bams" : ["ctl1.nodup.bam", "ctl2.nodup.bam"]
+    }
+    ```
+
+ 4) Control TAG-ALIGN BEDs
+  - Define a TAG-ALIGN for each replicate. Our pipeline does not determine read endedness from a TAG-ALIGN file. You need to explicitly define read endedness.
+  - Example of 4 paired-ended replicates.
+
+    ```javascript
+    {
+        "chip.ctl_paired_end" : true,
+        "chip.ctl_tas" : ["ctl1.tagAlign.gz", "ctl2.tagAlign.gz", "ctl3.tagAlign.gz", "ctl4.tagAlign.gz"]
+    }
+    ```
+
+You can also mix up different data types for individual bio replicates and controls. For example, the pipeline can start from FASTQs for rep1 (SE) and rep3 (PE), BAMs for rep2 (SE), NODUP_BAMs for rep4 (SE) and TAG-ALIGNs for rep5 (PE). This example has two controls (ctl1: SE BAM, ctl2: PE FASTQs).
+
+```javascript
+{
+    "chip.paired_ends" : [false, false, true, false, true],
+    "chip.fastqs_rep1_R1" : ["rep1.fastq.gz"],
+    "chip.fastqs_rep3_R1" : ["rep3.R1.fastq.gz"],
+    "chip.fastqs_rep3_R2" : ["rep3.R2.fastq.gz"],
+    "chip.bams" : [null, "rep2.bam", null, null, null],
+    "chip.nodup_bams" : [null, null, null, "rep4.nodup.bam", null],
+    "chip.tas" : [null, null, null, null, "rep5.tagAlign.gz"],
+
+    "chip.ctl_paired_ends" : [false, true],
+    "chip.ctl_fastqs_rep2_R1" : ["ctl2.R1.fastq.gz"],
+    "chip.ctl_fastqs_rep2_R2" : ["ctl2.R2.fastq.gz"],
+    "chip.ctl_bams" : ["ctl1.bam", null]
+}
+```
+
+## Resources
+
+> **WARNING**: It is recommended not to change the following parameters unless you get resource-related errors for a certain task and you want to increase resources for that task. The following parameters are provided for users who want to run our pipeline with Caper's `local` backend on HPCs.
+
+Resources defined here are **PER BIO REPLICATE**. Therefore, the total number of cores will be approximately `chip.align_cpu` x `NUMBER_OF_BIO_REPLICATES` because `align` is a bottlenecking task of the pipeline. This total number of cores will be useful **ONLY** when you use a `local` backend of Caper and manually `qsub` or `sbatch` your job. `disks` is used for Google Cloud and DNAnexus only.
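+
+For example, assuming the default `chip.align_cpu` of 4 and an experiment with 3 biological replicates, the `align` step alone can occupy roughly 4 x 3 = 12 cores at its peak. The snippet below is a minimal, illustrative set of resource overrides (the values are placeholders for this example, not recommendations); it raises memory for the `align` and `filter` tasks and the Picard Java heap used by the `filter` and `gc_bias` tasks:
+
+```javascript
+{
+    "chip.align_cpu" : 4,
+    "chip.align_mem_mb" : 32000,
+    "chip.filter_mem_mb" : 30000,
+    "chip.filter_picard_java_heap" : "8G",
+    "chip.gc_bias_picard_java_heap" : "8G"
+}
+```
+
+As the warning above says, raise these only for the task that actually hits a resource error.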
+ + +Parameter|Default +---------|------- +`chip.align_cpu` | 4 +`chip.align_mem_mb` | 20000 +`chip.align_time_hr` | 48 +`chip.align_disks` | `local-disk 400 HDD` + +Parameter|Default +---------|------- +`chip.filter_cpu` | 2 +`chip.filter_mem_mb` | 20000 +`chip.filter_time_hr` | 24 +`chip.filter_disks` | `local-disk 400 HDD` + +Parameter|Default +---------|------- +`chip.bam2ta_cpu` | 2 +`chip.bam2ta_mem_mb` | 10000 +`chip.bam2ta_time_hr` | 6 +`chip.bam2ta_disks` | `local-disk 100 HDD` + +Parameter|Default +---------|------- +`chip.spr_mem_mb` | 16000 + +Parameter|Default +---------|------- +`chip.jsd_cpu` | 2 +`chip.jsd_mem_mb` | 12000 +`chip.jsd_time_hr` | 6 +`chip.jsd_disks` | `local-disk 200 HDD` + +Parameter|Default +---------|------- +`chip.xcor_cpu` | 2 +`chip.xcor_mem_mb` | 16000 +`chip.xcor_time_hr` | 24 +`chip.xcor_disks` | `local-disk 100 HDD` + +Parameter|Default +---------|------- +`chip.call_peak_cpu` | 2 +`chip.call_peak_mem_mb` | 16000 +`chip.call_peak_time_hr` | 24 +`chip.call_peak_disks` | `local-disk 200 HDD` + +Parameter|Default +---------|------- +`chip.macs2_signal_track_mem_mb` | 16000 +`chip.macs2_signal_track_time_hr` | 24 +`chip.macs2_signal_track_disks` | `local-disk 200 HDD` + +> **IMPORTANT**: If you see Java memory errors, check the following resource parameters. + +There are special parameters to control maximum Java heap memory (e.g. `java -Xmx4G`) for Picard tools. They are strings including size units. Such string will be directly appended to Java's parameter `-Xmx`. + +Parameter|Default +---------|------- +`chip.filter_picard_java_heap` | `4G` +`chip.gc_bias_picard_java_heap` | `6G` \ No newline at end of file From bd96aa5b504a65d9717189b6525cca54e466f175 Mon Sep 17 00:00:00 2001 From: Jin Lee Date: Wed, 30 Oct 2019 16:42:24 -0700 Subject: [PATCH 13/15] deprecate: old qc json organizer --- utils/qc_jsons_to_tsv/README.md | 52 ---- utils/qc_jsons_to_tsv/criteria.default.json | 56 ---- utils/qc_jsons_to_tsv/qc_jsons_to_tsv.py | 326 -------------------- utils/resumer/README.md | 189 ------------ utils/resumer/chip.json | 108 ------- utils/resumer/default.json | 1 - utils/resumer/resumer.py | 100 ------ 7 files changed, 832 deletions(-) delete mode 100644 utils/qc_jsons_to_tsv/README.md delete mode 100644 utils/qc_jsons_to_tsv/criteria.default.json delete mode 100755 utils/qc_jsons_to_tsv/qc_jsons_to_tsv.py delete mode 100644 utils/resumer/README.md delete mode 100644 utils/resumer/chip.json delete mode 120000 utils/resumer/default.json delete mode 100755 utils/resumer/resumer.py diff --git a/utils/qc_jsons_to_tsv/README.md b/utils/qc_jsons_to_tsv/README.md deleted file mode 100644 index d56466c8..00000000 --- a/utils/qc_jsons_to_tsv/README.md +++ /dev/null @@ -1,52 +0,0 @@ -# qc_jsons_to_tsv - -## Introduction - -This python script finds `qc.json` (or can be specified) recursively, parses all of them and make a TSV spreadsheet file. - -## Usage - -``` -usage: qc.json parser for ENCODE ATAC/Chip-Seq pipelines [-h] - [--search-dir SEARCH_DIR] - [--out-file OUT_FILE] - [--criteria-def-json-file CRITERIA_DEF_JSON_FILE] - [--qc-json-file-basename QC_JSON_FILE_BASENAME] - [--tsv-mapping-qc-json-path-to-title TSV_MAPPING_QC_JSON_PATH_TO_TITLE] - [--tsv-mapping-workflow-id-to-title TSV_MAPPING_WORKFLOW_ID_TO_TITLE] - -Recursively find qc.json, parse it and make a TSV spreadsheet of all quality -metrics. 
- -optional arguments: - -h, --help show this help message and exit - --search-dir SEARCH_DIR - Root directory to be recursively searched for qc.json - (or --qc-json-file-basename). - --out-file OUT_FILE (Optional) Output TSV filename. Prints to STDOUT if - not defined. - --criteria-def-json-file CRITERIA_DEF_JSON_FILE - (Optional but important) Specify criteria definition - JSON file. "criteria" category will be added to the - output file/STDOUT. - --qc-json-file-basename QC_JSON_FILE_BASENAME - (Optional) Specify QC JSON file basename to be parsed. - Files with this name will be recursively found and - parsed. - --tsv-mapping-qc-json-path-to-title TSV_MAPPING_QC_JSON_PATH_TO_TITLE - (Optional) Two-column TSV (ABSOLUTE path for qc.json - (or --qc-json-file-basename) [TAB] title).This is - useful when you forgot to define titles for your - pipeline runs. - --tsv-mapping-workflow-id-to-title TSV_MAPPING_WORKFLOW_ID_TO_TITLE - (Optional) Two-column TSV (cromwell workflow ID [TAB] - title).This is useful when you forgot to define titles - for your pipeline runs. -``` - -## Examples - -``` -python qc_jsons_to_tsv.py --search-dir test/v1.1.4 --criteria-def-json-file criteria.default.json > test_v1.1.4.tsv -python qc_jsons_to_tsv.py --search-dir test/v1.1.5 --criteria-def-json-file criteria.default.json > test_v1.1.5.tsv -``` diff --git a/utils/qc_jsons_to_tsv/criteria.default.json b/utils/qc_jsons_to_tsv/criteria.default.json deleted file mode 100644 index cd808113..00000000 --- a/utils/qc_jsons_to_tsv/criteria.default.json +++ /dev/null @@ -1,56 +0,0 @@ -{ - "01_No. of mapped nodup non-mito reads" : { - "eval" : "qc['flagstat_qc']['rep?']['total']", - "condition_se" : { "OK" : ">= 25000000", "FAIL" : "< 25000000" }, - "condition_pe" : { "OK" : ">= 50000000", "FAIL" : "< 50000000" } - }, - "02_Alignment rate" : { - "eval" : "qc['nodup_flagstat_qc']['rep?']['total']/qc['flagstat_qc']['rep?']['total']", - "condition" : { "OK" : ">= 0.95", "ACCEPTABLE" : ">= 0.80", "FAIL" : "< 0.80" } - }, - "03_No. of IDR peaks" : { - "eval" : "qc['idr_reproducibility_qc']['N_opt']", - "condition" : { "OK" : ">= 70000", "FAIL" : "< 70000" } - }, - "04_No. 
of naive overlap peaks" : { - "eval" : "qc['overlap_reproducibility_qc']['N_opt']", - "condition" : { "OK" : ">= 150000", "FAIL" : "< 150000" } - }, - "05_IDR FRiP per replicate" : { - "eval" : "qc['idr_frip_qc']['rep?-pr']['FRiP']", - "condition" : { "OK" : ">= 0.1", "FAIL" : "< 0.1" } - }, - "05_Overlap FRiP per replicate" : { - "eval" : "qc['overlap_frip_qc']['rep?-pr']['FRiP']" - }, - "06_IDR FRiP of pooled sample" : { - "eval" : "qc['idr_frip_qc']['ppr']['FRiP']", - "condition" : { "OK" : ">= 0.1", "FAIL" : "< 0.1" } - }, - "06_Overlap FRiP of pooled sample" : { - "eval" : "qc['overlap_frip_qc']['ppr']['FRiP']", - "condition" : { "OK" : ">= 0.1", "FAIL" : "< 0.1" } - }, - "07_IDR reproducibility" : { - "eval" : "qc['idr_reproducibility_qc']['reproducibility']" - }, - "08_TSS enrichment" : { - "eval" : "qc['ataqc']['rep?']['TSS_enrichment']", - "condition" : { "OK" : ">= 10.0", "FAIL" : "< 10.0" } - }, - "09_NFR region" : { - "eval" : "qc['ataqc']['rep?']['Presence of NFR peak']" - }, - "10_mono-nuc region" : { - "eval" : "qc['ataqc']['rep?']['Presence of Mono-Nuc peak']" - }, - "11_NRF" : { - "eval" : "qc['pbc_qc']['rep?']['NRF']" - }, - "12_PBC1" : { - "eval" : "qc['pbc_qc']['rep?']['PBC1']" - }, - "13_PBC2" : { - "eval" : "qc['pbc_qc']['rep?']['PBC2']" - } -} \ No newline at end of file diff --git a/utils/qc_jsons_to_tsv/qc_jsons_to_tsv.py b/utils/qc_jsons_to_tsv/qc_jsons_to_tsv.py deleted file mode 100755 index e3a81d27..00000000 --- a/utils/qc_jsons_to_tsv/qc_jsons_to_tsv.py +++ /dev/null @@ -1,326 +0,0 @@ -#!/usr/bin/env python2 - -# written by Jin Lee, 2016 - -import os -import glob -import sys -import re -import argparse -import json -import csv -import hashlib -from collections import OrderedDict, defaultdict - -def parse_arguments(): - parser = argparse.ArgumentParser(prog='qc.json parser for ENCODE ATAC/Chip-Seq pipelines', - description='Recursively find qc.json, ' - 'parse it and make a TSV spreadsheet of all quality metrics.') - parser.add_argument('--search-dir', type=str, default='.', - help='Root directory to be recursively searched for qc.json (or --qc-json-file-basename).') - parser.add_argument('--out-file', type=argparse.FileType('w'), default=sys.stdout, \ - help='(Optional) Output TSV filename. Prints to STDOUT if not defined.') - parser.add_argument('--criteria-def-json-file', type=str, - help='(Optional but important) Specify criteria definition JSON file. ' - '"criteria" category will be added to the output file/STDOUT.') - parser.add_argument('--qc-json-file-basename', type=str, default='qc.json', - help='(Optional) Specify QC JSON file basename to be parsed. ' - 'Files with this name will be recursively found and parsed.') - parser.add_argument('--tsv-mapping-qc-json-path-to-title', type=str, - help='(Optional) Two-column TSV (ABSOLUTE path for qc.json (or --qc-json-file-basename) [TAB] title).' - 'This is useful when you forgot to define titles for your pipeline runs.') - parser.add_argument('--tsv-mapping-workflow-id-to-title', type=str, - help='(Optional) Two-column TSV (cromwell workflow ID [TAB] title).' 
- 'This is useful when you forgot to define titles for your pipeline runs.') - args = parser.parse_args() - return args - -def find_workflow_id_from_path(path): - abspath = os.path.abspath(path) - # *cromwell-executions/*/WORKFLOW_ID/call-* - pattern = 'cromwell-executions\/.+\/(.+)\/call-.+' - m = re.findall(pattern, abspath) - return m[0] if m else None - -def calc_md5(fname): - """https://stackoverflow.com/a/3431838 - """ - hash_md5 = hashlib.md5() - with open(fname, "rb") as f: - for chunk in iter(lambda: f.read(4096), b""): - hash_md5.update(chunk) - return hash_md5.hexdigest() - -def recursively_read_qc_jsons(search_dir, qc_json_file_basename, - map_qc_json_path_to_title=None, map_workflow_id_to_title=None): - # find all qc.json recursively - json_files = [y for x in os.walk(search_dir) \ - for y in glob.glob(os.path.join(x[0], qc_json_file_basename))] - - uniq_json_files = [] - json_files_md5sum = set() - for json_file in json_files: - md5 = calc_md5(json_file) - if md5 in json_files_md5sum: - continue - json_files_md5sum.add(md5) - uniq_json_files.append(json_file) - - # read all qc.json files - qc_jsons = [] - for json_file in uniq_json_files: - with open(json_file,'r') as fp: - qc = json.load(fp, object_pairs_hook=OrderedDict) - - # backward compatibility: change qc.json format (v1.1.4->v1.1.5) - if 'name' in qc: # < v1.1.5 - if 'ataqc' in qc: - paired_end = (qc['ataqc'][0]['Paired/Single-ended']=="Paired-ended") - elif 'flagstat_qc' in qc: - paired_end = (qc['flagstat_qc'][0]['paired']>0) - elif 'nodup_flagstat_qc' in qc: - paired_end = (qc['nodup_flagstat_qc'][0]['paired']>0) - else: - paired_end = None - qc['general'] = OrderedDict() - qc['general']['genome'] = qc['ataqc'][0]['Genome'] if 'ataqc' in qc else None - qc['general']['description'] = qc.pop('desc') - qc['general']['title'] = qc.pop('name') - qc['general']['paired_end'] = paired_end - - for category_name in qc: - if type(qc[category_name])==list: - tmp_dict = OrderedDict() - for i, category_item in enumerate(qc[category_name]): - tmp_dict['rep{}'.format(i+1)] = category_item - qc[category_name] = tmp_dict - # print(json.dumps(tmp_dict["rep1"], sort_keys=False, indent=4, separators=(',',': '))) - - qc['general']['rep_id'] = None - - # find workflow_id (hash string) if possible - workflow_id = find_workflow_id_from_path(json_file) - - if workflow_id and map_workflow_id_to_title: - qc['general']['title'] = map_workflow_id_to_title[workflow_id] - elif map_qc_json_path_to_title: - qc['general']['title'] = map_qc_json_path_to_title[os.path.abspath(json_file)] - - qc_jsons.append(qc) - return qc_jsons - -def read_2col_tsv(tsv): # tsv (key \t val) -> map (key:val) - if tsv: - with open(tsv) as fp: - tsv = dict(csv.reader(fp, delimiter='\t')) - return {row : tsv[row] for row in tsv} - else: - return None - -def read_json_file(json_file): - if json_file: - with open(json_file,'r') as fp: - return json.load(fp, object_pairs_hook=OrderedDict) - else: - return None - -def parse_rep_key_name(rep_key_name): - # repX-pr, repX-pr1/2 goes to repX, others (ppr, rep1-rep2, ...) 
go to rep1 - # 'suffix' will be suffixed to the quality_metric_name - repX = re.findall('^rep(\d+)$',rep_key_name) - repX_repY = re.findall('^rep(\d+)-rep(\d+)$',rep_key_name) - repX_pr = re.findall('^rep(\d+)-pr$',rep_key_name) - repX_pr1 = re.findall('^rep(\d+)-pr1$',rep_key_name) - repX_pr2 = re.findall('^rep(\d+)-pr2$',rep_key_name) - pooled = re.findall('^pooled$',rep_key_name) - ppr = re.findall('^ppr$',rep_key_name) - ppr1 = re.findall('^ppr1$',rep_key_name) - ppr2 = re.findall('^ppr2$',rep_key_name) - if repX: - rep = 'rep{}'.format(repX[0]) - suffix = '' - elif repX_repY: - rep = 'rep1' - suffix = ' (rep{}-rep{})'.format(repX_repY[0][0], repX_repY[0][1]) - elif repX_pr: - rep = 'rep{}'.format(repX_pr[0]) - suffix = ' (rep{}-pr)'.format(repX_pr[0]) - elif repX_pr1: - rep = 'rep{}'.format(repX_pr1[0]) - suffix = ' (rep{}-pr1)'.format(repX_pr1[0]) - elif repX_pr2: - rep = 'rep{}'.format(repX_pr2[0]) - suffix = ' (rep{}-pr2)'.format(repX_pr2[0]) - elif pooled: - rep = 'rep1' - suffix = ' (pooled)' - elif ppr: - rep = 'rep1' - suffix = ' (ppr)' - elif ppr1: - rep = 'rep1' - suffix = ' (ppr1)' - elif ppr2: - rep = 'rep1' - suffix = ' (ppr2)' - else: - rep = 'rep1' - suffix = '' - return rep, suffix - -def make_a_sorted_qc_json(qc): - result = OrderedDict() - result['general'] = qc['general'] - if 'criteria' in qc: - result['criteria'] = qc['criteria'] - for category in qc: - if category in ['general','criteria']: - continue - result[category] = qc[category] - return result - -def pretty_print_json(d): - print(json.dumps(d, sort_keys=False, indent=4, separators=(',',': '))) - -def add_criteria_category_to_qc_json(qc, criteria): - # count number of replicates - tmp_dict_rep = OrderedDict() - for category in qc: # for each category - for rep_key_name in qc[category]: # for each rep - rep, suffix = parse_rep_key_name(rep_key_name) - tmp_dict_rep[rep] = None - - # add criteria if criterie definition file is given - if criteria: - qc['criteria'] = OrderedDict() - - for rep in tmp_dict_rep: - qc['criteria'][rep] = OrderedDict() - for c in criteria: - # two quality metric items (val, pass/fail) per condition - # read condition - if 'condition' in criteria[c]: - condition = criteria[c]['condition'] - elif qc['general']['paired_end']!=None: - if 'condition_pe' in criteria[c] and qc['general']['paired_end']: - condition = criteria[c]['condition_pe'] - elif 'condition_se' in criteria[c] and not qc['general']['paired_end']: - condition = criteria[c]['condition_se'] - else: - condition = None - else: - condition = None - - val1 = 'N/A' - if condition: - val2 = 'N/A' - try: - val1 = eval(criteria[c]['eval'].replace('rep?',rep)) - if condition: - for key in condition: - cond_met = eval('{} {}'.format(val1, condition[key])) - if cond_met: - val2 = key - break - - except: - sys.stderr.write('Failed to evaluate condition ({}) of criterion ({})\n'.format(condition, c)) - - qc['criteria'][rep][c] = val1 - if condition: - qc['criteria'][rep]['{} (QC)'.format(c)] = val2 - # pretty_print_json(qc['criteria']) - return qc - -def main(): - args = parse_arguments() - - map_qc_json_path_to_title = read_2col_tsv(args.tsv_mapping_qc_json_path_to_title) - map_workflow_id_to_title = read_2col_tsv(args.tsv_mapping_workflow_id_to_title) - criteria = read_json_file(args.criteria_def_json_file) - - qc_jsons = recursively_read_qc_jsons(args.search_dir, args.qc_json_file_basename, - map_qc_json_path_to_title, map_workflow_id_to_title) - - # parse each qc_json and add criteria category if criteria definition file is given - 
for qc_json in qc_jsons: - add_criteria_category_to_qc_json(qc_json, criteria) - - # sort qc json (general, criteria, ...) - sorted_qc_jsons = [] - for qc_json in qc_jsons: - sorted_qc_jsons.append(make_a_sorted_qc_json(qc_json)) - - flattened_qc_jsons = [] - for qc_json in sorted_qc_jsons: - flat_qc_json = OrderedDict() - for cat in qc_json: - flat_qc_json[cat] = OrderedDict() - for rep_key_name in qc_json[cat]: - if type(qc_json[cat][rep_key_name])==OrderedDict: - rep, suffix = parse_rep_key_name(rep_key_name) - for key in qc_json[cat][rep_key_name]: - val = qc_json[cat][rep_key_name][key] - if not rep in flat_qc_json[cat]: - flat_qc_json[cat][rep] = OrderedDict() - flat_qc_json[cat][rep][key+suffix] = val - else: - flat_qc_json[cat]['rep1'] = qc_json[cat] - break - # pretty_print_json(flat_qc_json) - flattened_qc_jsons.append(flat_qc_json) - - # dict with all category and quality metric name - all_metric_names = OrderedDict() - - for qc_json in flattened_qc_jsons: - for category in qc_json: - if not category in all_metric_names: - all_metric_names[category] = OrderedDict() - if 'rep1' in qc_json[category]: - for key in qc_json[category]['rep1']: - all_metric_names[category][key] = None - # TSV format - # header - # layer1: category - # layer2: quality metric name - # cells - # quality metric value - - # layer1 - header_layer1 = '\t'.join([cat if i==0 else '' \ - for cat in all_metric_names \ - for i, metric_name in enumerate(all_metric_names[cat])]) - # layer2 - header_layer2 = '\t'.join([metric_name \ - for cat in all_metric_names \ - for i, metric_name in enumerate(all_metric_names[cat])]) - - args.out_file.write(header_layer1+'\n') - args.out_file.write(header_layer2+'\n') - - # cells = [] - for qc_json in flattened_qc_jsons: - num_rep = max([len(qc_json[cat]) for cat in qc_json]) - for rep_id in range(1,num_rep+1): - line = [] - for cat in all_metric_names: - for i, metric_name in enumerate(all_metric_names[cat]): - if metric_name=='rep_id': - line.append(str(rep_id)) - break - found_elem = False - if cat in qc_json: - for rep in qc_json[cat]: # for each rep - if rep=='rep{}'.format(rep_id): - for key in qc_json[cat][rep]: - if key==metric_name: - line.append(str(qc_json[cat][rep][key])) - found_elem = True - break - if not found_elem: - line.append('') - args.out_file.write('\t'.join(line)+'\n') - -if __name__=='__main__': - main() diff --git a/utils/resumer/README.md b/utils/resumer/README.md deleted file mode 100644 index c32cd02d..00000000 --- a/utils/resumer/README.md +++ /dev/null @@ -1,189 +0,0 @@ -# resumer - -** RESUMER HAS BEEN DEPRECATED. PIPELINE NO LONGER SUPPORTS RESUMING. USE [CAPER](https://github.com/ENCODE-DCC/caper) WITH MYSQL DATABASE TO USE CROMWELL'S CALL-CACHING. WE WILL KEEP IT FOR A COUPLE OF NEXT RELEASES BUT WILL BE REMOVED SOON AFTER THAT ** - -## Introduction - -This python script parses a metadata JSON file from a previous failed workflow and generates a new input JSON file to start a pipeline from where it left off. - -## How to use it - -Before running this script, you should have a metadata JSON file for a previous failed workflow. You can get it by adding a parameter `-m metadata.json` to the cromwell Java command line. If you stop a workflow (CTRL+C or kill) metadata then JSON file will not be generated. -```bash -$ java -jar ... cromwell-38.jar run chip.wdl -i original_input.json ... -m metadata.json -``` - -Unfortunately your workflow failed for some reasons but you can fix the problem and want to resume it from where it left off. 
-```bash -$ python resumer.py metadata.json -``` - -You will get a new input JSON file `resume.FAILED_WORKFLOW_ID.json` and run cromwell with it instead of the original one `original_input.json`. -```bash -$ java -jar ... cromwell-38.jar run chip.wdl -i resume.FAILED_WORKFLOW_ID.json ... -``` - -## Usage - -```bash -usage: Resumer for ENCODE ATAC/Chip-Seq pipelines [-h] - [--output-def-json-file OUTPUT_DEF_JSON_FILE] - metadata_json_file - -Parse cromwell's metadata JSON file and generate a new input JSON file to -resume a pipeline from where it left off. - -positional arguments: - metadata_json_file Cromwell metadata JSON file from a previous failed - run. - -optional arguments: - -h, --help show this help message and exit - --output-def-json-file OUTPUT_DEF_JSON_FILE - Output definition JSON file for your pipeline. If not - specified, it will look for a valid JSON file on - script's directory. You can use your own JSON file for - your pipeline. Entries in "Array[Object]" is for - Array[Object] in an input JSON. This is useful to take - outputs from a scatter block. For example, the 1st - entry of "Array[Object]" in chip.json is "chip.bwa" : - {"bam" : "chip.bams", "flagstat_qc" : - "chip.flagstat_qcs"}. chip.flagstat_qcs : [...(taken - from an output of chip.bwa.flagstat_qc)...] will be - added to your new input JSON. For example, the 1st - entry of "Object" in chip.json is "chip.pool_ta" : - {"ta_pooled" : "chip.ta_pooled"}. chip.ta_pooled : - "(taken from an output of chip.pool_ta.ta_pooled)" - will be added to your new input JSON. -``` - -## Examples - -```bash -$ python resumer.py metadata.json -``` - -## How it works (for developers) - -In order to use this script, your pipeline should be able to start from any type of inputs (e.g. FASTQ, BAM, ...) and inputs to the previous task (e.g. map_fastq) should be ignored if next step (e.g. filter_bam)'s input is already given in the input JSON file. - -``` -# example toy_chip workflow that processes through FASTQ->BAM->FILT_BAM->PEAK->REPORT -# this pipeline can start from any types of input FASTQ, BAM, FILT_BAM, PEAK -# key idea of resuming workflow is to skip previous step -# if next step's input is already given in the input JSON file -# this is controlled by `Boolean` variables (`need_to_process_XXX`). 
- -workflow toy_chip { - # input definition - Array[File] fastqs = [] # per replicate - Array[File] bams = [] # per replicate - Array[File] filt_bams = [] # per replicate - Array[File] peaks = [] # per replicate - - Boolean need_to_process_peak = true # trivial - Boolean need_to_process_filt_bam = need_to_process_peak && length(peaks)==0 - Boolean need_to_process_bam = need_to_process_filt_bam && length(filt_bams)==0 - Boolean need_to_process_fastq = need_to_process_bam && length(bams)==0 - - scatter(fastq in if need_to_process_fastq then fastqs else []) { - call map_fastq { input: fastq = fastq } - } - - # temporary array to deal with outputs from either previous step or from an input JSON file - Array[File] bams_ = flatten([map_fastq.bam, bams]) - scatter(bam in if need_to_process_bam then bams_ else []) { - call filter_bam { input: bam = bam } - } - - Array[File] filt_bams_ = flatten([filter_bam.filt_bam, filt_bams]) # temporary array again - scatter(filt_bam in if need_to_process_filt_bam then filt_bams_ else []) { - call call_peak { input: filt_bam = filt_bam } - } - - Array[File] peaks_ = flatten([call_peak.peak, peaks]) # temporary array again - if (need_to_process_peak) { - call generate_report { input: peaks = peaks_ } - } -} -``` - -Output definition JSON file `toy_chip.json` for the above example workflow should look like: -```javascript -{ - "Array[Object]" : { - "toy_chip.map_fastq" : { - "bam" : "toy_chip.bams" - }, - "toy_chip.filter_bam" : { - "filt_bam" : "toy_chip.filt_bams" - } - "toy_chip.call_peak" : { - "peak" : "toy_chip.peaks" - } - } -} -``` - -An original input JSON file to start from fastqs. -```javscript -{ - "toy_chip.fastqs" : ["rep1.fastq.gz", "rep1.fastq.gz"] -} -``` - -Run a pipeline with this original input JSON. -```bash -$ java -jar cromwell-38.jar run toy_chip.wdl -i org_input.json -m metadata.json -``` - -Pipeline fails due to some errors in `call_peak` task. Run `resumer.py` to make a new input JSON file to resume. -```bash -$ python resumer.py metadata.json --output-def-json-file toy_chip.json -``` - -Then `result.WORKFLOW_ID.json` will be generated. -```javscript -{ - "toy_chip.fastqs" : ["rep1.fastq.gz", "rep1.fastq.gz"] - "toy_chip.bams" : ["rep1.bam", "rep1.bam"] - "toy_chip.filt_bams" : ["rep1.filt.bam", "rep1.filt.bam"] -} -``` - -You feed it to the cromwell java command line after fixing the problem. Then pipeline will start from ``scatter` block for `call_peak` tasks. -```bash -$ java -jar cromwell-38.jar run toy_chip.wdl -i resume.WORKFLOW_ID.json -``` - -## Output definition JSON file (for developers) - -An output definition JSON file must have at least one object from `"Array[Object]"` and `"Object"`. It can have both. The following JSON is a simplified version of an output definition JSON file for ChIP-Seq pipeline (`chip.json`). -```javascript -{ - "Array[Object]" : { - "chip.bwa" : { - "bam" : "chip.bams", - "flagstat_qc" : "chip.flagstat_qcs" - } - }, - - "Object" : { - "chip.pool_ta" : { - "ta_pooled" : "chip.ta_pooled" - } - } -} -``` - -`"Array[Object]"` is useful to take an array of outputs from a `scatter` block and `"Object"` is good for taking a single value from any tasks. - -Using this JSON file for `resumer.py` will add the following extra input data definitions to the original input JSON file. 
-```javascript -{ - "chip.bams" : [...(an array of values taken from chip.bwa.bam)...], - "chip.flagstat_qcs" : [...(an array of values taken from chip.bwa.flagstat_qc)...], - "chip.ta_pooled" : "...(a value taken from chip.pool_ta.ta_pooled)..." -} - - diff --git a/utils/resumer/chip.json b/utils/resumer/chip.json deleted file mode 100644 index 6629bb50..00000000 --- a/utils/resumer/chip.json +++ /dev/null @@ -1,108 +0,0 @@ -{ - "Array[Object]" : { - "chip.bwa" : { - "bam" : "chip.bams", - "flagstat_qc" : "chip.flagstat_qcs" - }, - "chip.filter" : { - "nodup_bam" : "chip.nodup_bams", - "flagstat_qc" : "chip.nodup_flagstat_qcs", - "dup_qc" : "chip.dup_qcs", - "pbc_qc" : "chip.pbc_qcs" - }, - "chip.bam2ta" : { - "ta" : "chip.tas" - }, - "chip.align_ctl" : { - "bam" : "chip.ctl_bams", - "flagstat_qc" : "chip.ctl_flagstat_qcs" - }, - "chip.filter_ctl" : { - "nodup_bam" : "chip.ctl_nodup_bams", - "flagstat_qc" : "chip.ctl_nodup_flagstat_qcs", - "dup_qc" : "chip.ctl_dup_qcs", - "pbc_qc" : "chip.ctl_pbc_qcs" - }, - "chip.bam2ta_ctl" : { - "ta" : "chip.ctl_tas" - }, - "chip.xcor" : { - "plot_png" : "chip.xcor_plots", - "score" : "chip.xcor_scores", - "fraglen" : "chip.fraglen" - }, - "chip.macs2" : { - "npeak" : "chip.peaks", - "frip_qc" : "chip.macs2_frip_qcs", - "sig_pval" : "chip.pval_bws" - }, - "chip.macs2_signal_track" : { - "pval_bw" : "chip.pval_bws" - }, - "chip.macs2_pr1" : { - "npeak" : "chip.peaks_pr1", - "frip_qc" : "chip.macs2_pr1_frip_qcs" - }, - "chip.macs2_pr2" : { - "npeak" : "chip.peaks_pr2", - "frip_qc" : "chip.macs2_pr2_frip_qcs" - }, - "chip.spp" : { - "rpeak" : "chip.peaks", - "frip_qc" : "chip.spp_frip_qcs" - }, - "chip.spp_pr1" : { - "rpeak" : "chip.peaks_pr1", - "frip_qc" : "chip.spp_pr1_frip_qcs" - }, - "chip.spp_pr2" : { - "rpeak" : "chip.peaks_pr2", - "frip_qc" : "chip.spp_pr2_frip_qcs" - }, - "chip.count_signal_track" : { - "pos_bw" : "chip.count_signal_track_pos_bws", - "neg_bw" : "chip.count_signal_track_neg_bws" - } - }, - - "Object" : { - "chip.pool_ta" : { - "ta_pooled" : "chip.ta_pooled" - }, - "chip.pool_ta_ctl" : { - "ta_pooled" : "chip.ctl_ta_pooled" - }, - "chip.macs2_pooled" : { - "npeak" : "chip.peak_pooled", - "frip_qc" : "chip.macs2_pooled_frip_qc_" - }, - "chip.macs2_ppr1" : { - "npeak" : "chip.peak_ppr1", - "frip_qc" : "chip.macs2_ppr1_frip_qc_" - }, - "chip.macs2_ppr2" : { - "npeak" : "chip.peak_ppr2", - "frip_qc" : "chip.macs2_ppr2_frip_qc_" - }, - "chip.spp_pooled" : { - "rpeak" : "chip.peak_pooled", - "frip_qc" : "chip.spp_pooled_frip_qc_" - }, - "chip.spp_ppr1" : { - "rpeak" : "chip.peak_ppr1", - "frip_qc" : "chip.spp_ppr1_frip_qc_" - }, - "chip.spp_ppr2" : { - "rpeak" : "chip.peak_ppr2", - "frip_qc" : "chip.spp_ppr2_frip_qc_" - }, - "chip.fingerprint" : { - "jsd_qcs" : "chip.jsd_qcs", - "plot" : "chip.jsd_plot" - }, - "chip.count_signal_track_pooled" : { - "pos_bw" : "chip.count_signal_track_pooled_pos_bw_", - "neg_bw" : "chip.count_signal_track_pooled_neg_bw_" - } - } -} diff --git a/utils/resumer/default.json b/utils/resumer/default.json deleted file mode 120000 index e1156357..00000000 --- a/utils/resumer/default.json +++ /dev/null @@ -1 +0,0 @@ -chip.json \ No newline at end of file diff --git a/utils/resumer/resumer.py b/utils/resumer/resumer.py deleted file mode 100755 index 5b8709ca..00000000 --- a/utils/resumer/resumer.py +++ /dev/null @@ -1,100 +0,0 @@ -#!/usr/bin/env python2 - -# written by Jin Lee, 2019 - -import os -import argparse -import json -from collections import OrderedDict - -def parse_arguments(): - parser = 
argparse.ArgumentParser(prog='Resumer for ENCODE ATAC/Chip-Seq pipelines', - description='Parse cromwell\'s metadata JSON file and generate a new input JSON file ' - 'to resume a pipeline from where it left off.') - parser.add_argument('metadata_json_file', type=str, help='Cromwell metadata JSON file from a previous failed run.') - parser.add_argument('--output-def-json-file', type=str, help='Output definition JSON file for your pipeline. ' - 'If not specified, it will look for a valid JSON file on script\'s directory. ' - 'You can use your own JSON file for your pipeline. ' - 'Entries in "Array[Object]" is for Array[Object] in an input JSON. This is useful to take outputs from a scatter block. ' - 'For example, the 1st entry of "Array[Object]" in chip.json is "chip.bwa" : {"bam" : "chip.bams", "flagstat_qc" : "chip.flagstat_qcs"}. ' - 'chip.flagstat_qcs : [...(taken from an output of chip.bwa.flagstat_qc)...] will be added to your new input JSON. ' - 'For example, the 1st entry of "Object" in chip.json is "chip.pool_ta" : {"ta_pooled" : "chip.ta_pooled"}. ' - 'chip.ta_pooled : "(taken from an output of chip.pool_ta.ta_pooled)" will be added to your new input JSON. ') - args = parser.parse_args() - - # if not specified by user, look into this array on script's directory - if not args.output_def_json_file: - script_dir = os.path.dirname(os.path.realpath(__file__)) - default_output_def_json_files = ['default.json'] - for f in default_output_def_json_files: - json_file = os.path.join(script_dir, f) - if os.path.exists(json_file): - args.output_def_json_file = json_file - break - return args - -def read_json_file(json_file): - with open(json_file,'r') as fp: - return json.load(fp, object_pairs_hook=OrderedDict) - -def parse_cromwell_metadata_json_file(json_file): - metadata_json = read_json_file(json_file) - - workflow_id = metadata_json['labels']['cromwell-workflow-id'].replace('cromwell-','') - org_input_json = json.loads(metadata_json['submittedFiles']['inputs'], object_pairs_hook=OrderedDict) - calls = metadata_json['calls'] - - return workflow_id, org_input_json, calls - -def find_output_of_successful_calls(calls, output_def_json): - result = OrderedDict() - - if 'Array[Object]' in output_def_json: - for call_name in output_def_json['Array[Object]']: - if call_name in calls: - call = calls[call_name] # call is a list of the same task for multiple replicates - failed = False - for i, c in enumerate(call): # i = 0-based replicate id - if c['executionStatus']!='Done': - failed = True - break - if not failed: - for key in output_def_json['Array[Object]'][call_name]: - wdl_var_name = output_def_json['Array[Object]'][call_name][key] - result[wdl_var_name] = [call[i]['outputs'][key] for i, _ in enumerate(call)] - - if 'Object' in output_def_json: - for call_name in output_def_json['Object']: - if call_name in calls: - call = calls[call_name] # call is a list of the same task for multiple replicates - failed = False - for i, c in enumerate(call): # i = 0-based replicate id - if c['executionStatus']!='Done': - failed = True - break - if not failed: - assert(len(call)==1) - for key in output_def_json['Object'][call_name]: - wdl_var_name = output_def_json['Object'][call_name][key] - result[wdl_var_name] = call[0]['outputs'][key] - - return result - -def main(): - args = parse_arguments() - - workflow_id, org_input_json, calls = parse_cromwell_metadata_json_file(args.metadata_json_file) - - output_def_json = read_json_file(args.output_def_json_file) - - new_input_json = 
find_output_of_successful_calls(calls, output_def_json) - - # merge new input json over original input json - for key in new_input_json: - org_input_json[key] = new_input_json[key] - - with open('resume.{}.json'.format(workflow_id),'w') as fp: - fp.write(json.dumps(org_input_json, indent=4)) - -if __name__=='__main__': - main() From 10299398417b0a4c221ff4d12e7bbcec6a6936f1 Mon Sep 17 00:00:00 2001 From: Jin Lee Date: Wed, 30 Oct 2019 16:42:53 -0700 Subject: [PATCH 14/15] doc: fix indent in md --- README.md | 6 +- docs/input_short.md | 248 ++++++++++++++++++++++---------------------- 2 files changed, 126 insertions(+), 128 deletions(-) diff --git a/README.md b/README.md index 9525a8c3..6ff8dd41 100644 --- a/README.md +++ b/README.md @@ -46,8 +46,8 @@ Use `https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq An input JSON file specifies all the input parameters and files that are necessary for successfully running this pipeline. This includes a specification of the path to the genome reference files and the raw data fastq file. Please make sure to specify absolute paths rather than relative paths in your input JSON files. - - [Input JSON file specification (short)](docs/input_short.md) - - [Input JSON file specification (long)](docs/input.md) +1) [Input JSON file specification (short)](docs/input_short.md) +2) [Input JSON file specification (long)](docs/input.md) ## Running a pipeline on DNAnexus @@ -64,5 +64,3 @@ Install [Croo](https://github.com/ENCODE-DCC/croo#installation). **You can skip $ pip install croo $ croo [METADATA_JSON_FILE] ``` - -There is another [useful tool](utils/qc_jsons_to_tsv/README.md) to make a spreadsheet of QC metrics from multiple workflows. This tool recursively finds and parses all `qc.json` (pipeline's [final output](docs/example_output/v1.1.5/qc.json)) found from a specified root directory. It generates a TSV file that has all quality metrics tabulated in rows for each experiment and replicate. This tool also estimates overall quality of a sample by [a criteria definition JSON file](utils/qc_jsons_to_tsv/criteria.default.json) which can be a good guideline for QC'ing experiments. diff --git a/docs/input_short.md b/docs/input_short.md index 2c672a4c..6ce2a7bb 100644 --- a/docs/input_short.md +++ b/docs/input_short.md @@ -4,49 +4,49 @@ An input JSON file includes all genomic data files, parameters and metadata for # Checklist -Mandatory parameters: +Mandatory parameters. - 1) Pipeline type - - `chip.pipeline_type`: `tf` for TF ChIP-seq or `histone` for histone ChIP-seq. One major difference between two types is that `tf` uses `spp` peak caller with controls but `histone` uses `macs2` peak caller without controls. +1) Pipeline type + * `chip.pipeline_type`: `tf` for TF ChIP-seq or `histone` for histone ChIP-seq. One major difference between two types is that `tf` uses `spp` peak caller with controls but `histone` uses `macs2` peak caller without controls. - 2) Experiment title/description - - `chip.title`: experiment title for a final HTML report. - - `chip.description`: experiment description for a final HTML report. +2) Experiment title/description + * `chip.title`: experiment title for a final HTML report. + * `chip.description`: experiment description for a final HTML report. - 3) Read endedness - - `chip.paired_end`: `true` if ALL replicates are paired-ended. - - (Optional) `chip.paired_ends`: For samples with mixed read ends, you can define read endedness for each biological replicate (e.g. 
`[true, false]` means paired-ended biorep-1 and single-ended biorep-2). - - `chip.ctl_paired_end`: `true` if ALL controls are paired-ended. If not defined then `chip.paired_end` will be used. - - (Optional) `chip.ctl_paired_ends`: For controls with mixed read ends, you can define read endedness for each biological replicate (e.g. `[true, false]` means paired-ended biorep-1 and single-ended biorep-2). If not defined then `chip.paired_ends` will be used. +3) Read endedness + * `chip.paired_end`: `true` if ALL replicates are paired-ended. + * (Optional) `chip.paired_ends`: For samples with mixed read ends, you can define read endedness for each biological replicate (e.g. `[true, false]` means paired-ended biorep-1 and single-ended biorep-2). + * `chip.ctl_paired_end`: `true` if ALL controls are paired-ended. If not defined then `chip.paired_end` will be used. + * (Optional) `chip.ctl_paired_ends`: For controls with mixed read ends, you can define read endedness for each biological replicate (e.g. `[true, false]` means paired-ended biorep-1 and single-ended biorep-2). If not defined then `chip.paired_ends` will be used. - 4) Reference genome - - `chip.genome_tsv`: Use `https://storage.googleapis.com/encode-pipeline-genome-data/genome_tsv/v1/[GENOME]_caper.tsv`. - - Supported `GENOME`s: are hg38, mm10, hg19 and mm9. - - We provide a genome TSV file that defines all genome-specific parameters and reference data files. Caper will automatically download big reference data files from our ENCODE repository. - - However, we also have reference data mirrors for [some platforms](input_details.md/#reference-genome) (GCP, AWS, Sherlock, SCG, ...). On these platforms, you can use a different TSV file to prevent downloading such big reference data. - - To build a new TSV file from use your own FASTA (`.fa` and `.2bit`) see [this](build_genome_database.md). +4) Reference genome + * `chip.genome_tsv`: Use `https://storage.googleapis.com/encode-pipeline-genome-data/genome_tsv/v1/[GENOME]_caper.tsv`. + * Supported `GENOME`s: are hg38, mm10, hg19 and mm9. + * We provide a genome TSV file that defines all genome-specific parameters and reference data files. Caper will automatically download big reference data files from our ENCODE repository. + * However, we also have reference data mirrors for [some platforms](input_details.md/#reference-genome) (GCP, AWS, Sherlock, SCG, ...). On these platforms, you can use a different TSV file to prevent downloading such big reference data. + * To build a new TSV file from use your own FASTA (`.fa` and `.2bit`) see [this](build_genome_database.md). - 5) [Input files](#input-files) and [adapters](#adapters) - - See [this](#input-files) for how to define FASTQ/BAM/TAG-ALIGNs for your sample. - - See [this](#adapters) for how to define adapters to be trimmed. +5) [Input files](#input-files) and [adapters](#adapters) + * See [this](#input-files) for how to define FASTQ/BAM/TAG-ALIGNs for your sample. + * See [this](#adapters) for how to define adapters to be trimmed. - 6) Important parameters - - `chip.always_use_pooled_ctl`: (For TF ChIP-seq only) Always use a pooled control to compare with each replicate. If a single control is given then use it. It is disabled by default. - - `chip.ctl_depth_ratio`: (For TF ChIP-seq only) If ratio of depth between controls is higher than this. then always use a pooled control for all replicates. It's 1.2 by default. 
+6) Important parameters + * `chip.always_use_pooled_ctl`: (For TF ChIP-seq only) Always use a pooled control to compare with each replicate. If a single control is given then use it. It is disabled by default. + * `chip.ctl_depth_ratio`: (For TF ChIP-seq only) If ratio of depth between controls is higher than this. then always use a pooled control for all replicates. It's 1.2 by default. - 7) [Resources](#resources) - - If your FASTQs/BAMs are big (>10GB) then try with higher resource settings, especially for memory (`chip.[TASK_NAME]_mem_mb`). +7) [Resources](#resources) + * If your FASTQs/BAMs are big (>10GB) then try with higher resource settings, especially for memory (`chip.[TASK_NAME]_mem_mb`). -Optional parameters: +Optional parameters. - 8) Useful parameters - - `chip.subsample_reads`: Subsample experimet reads. This will affect all downsteam analyses including peak-calling. It's 0 by default, which means no subsampling. - - `chip.ctl_subsample_reads`: Subsample control reads. This will affect all downsteam analyses including peak-calling. It's 0 by default, which means no subsampling. - - `chip.fraglen`: Array of Integers. Fragment length for each bio replicate. If you start from FASTQs then our pipeline automatically estimate it from cross-correlation analysis (task `xcor`) result since such analysis requires a special treamtment for FASTQs. It is possible that fragment length is not estimated correctly (or pipeline can fail due to negative fraglen) if you start from different types (BAM/TAG-ALIGN). For such case, you can manually define fragment length for each bio rep. (e.g. `[200, 150]` means 200 for rep1 and 150 for rep2). +8) Useful parameters + * `chip.subsample_reads`: Subsample experimet reads. This will affect all downsteam analyses including peak-calling. It's 0 by default, which means no subsampling. + * `chip.ctl_subsample_reads`: Subsample control reads. This will affect all downsteam analyses including peak-calling. It's 0 by default, which means no subsampling. + * `chip.fraglen`: Array of Integers. Fragment length for each bio replicate. If you start from FASTQs then our pipeline automatically estimate it from cross-correlation analysis (task `xcor`) result since such analysis requires a special treamtment for FASTQs. It is possible that fragment length is not estimated correctly (or pipeline can fail due to negative fraglen) if you start from different types (BAM/TAG-ALIGN). For such case, you can manually define fragment length for each bio rep. (e.g. `[200, 150]` means 200 for rep1 and 150 for rep2). - 9) Flags - - `chip.align_only`: Peak calling and its downstream analyses will be disabled. Useful if you just want to align your FASTQs into filtered BAMs/TAG-ALIGNs and don't want to call peaks on them. - - `chip.true_rep_only`: Disable pseudo replicate generation and all related analyses +9) Flags + * `chip.align_only`: Peak calling and its downstream analyses will be disabled. Useful if you just want to align your FASTQs into filtered BAMs/TAG-ALIGNs and don't want to call peaks on them. + * `chip.true_rep_only`: Disable pseudo replicate generation and all related analyses ## Input files @@ -59,100 +59,100 @@ Optional parameters: Pipeline can start from any of the following data types (FASTQ, BAM, NODUP_BAM and TAG-ALIGN). - 1) Starting from FASTQs - - Technical replicates for each bio-rep will be **MERGED** in the very early stage of the pipeline. Each read end R1 and R2 have separate arrays `chip.fastqs_repX_R1` and `chip.fastqs_repX_R2`. 
Do not define R2 array for single-ended replicates. - - Example of 3 paired-ended biological replicates and 2 technical replicates for each bio rep. Two technical replicates `BIOREPX_TECHREP1.R1.fq.gz` and `BIOREPX_TECHREP2.R1.fq.gz` for each bio replicate will be merged. - - ```javascript - { - "chip.paired_end" : true, - "chip.fastqs_rep1_R1" : ["BIOREP1_TECHREP1.R1.fq.gz", "BIOREP1_TECHREP2.R1.fq.gz"], - "chip.fastqs_rep1_R2" : ["BIOREP1_TECHREP1.R2.fq.gz", "BIOREP1_TECHREP2.R2.fq.gz"], - "chip.fastqs_rep2_R1" : ["BIOREP2_TECHREP1.R1.fq.gz", "BIOREP2_TECHREP2.R1.fq.gz"], - "chip.fastqs_rep2_R2" : ["BIOREP2_TECHREP1.R2.fq.gz", "BIOREP2_TECHREP2.R2.fq.gz"], - "chip.fastqs_rep3_R1" : ["BIOREP3_TECHREP1.R1.fq.gz", "BIOREP3_TECHREP2.R1.fq.gz"], - "chip.fastqs_rep3_R2" : ["BIOREP3_TECHREP1.R2.fq.gz", "BIOREP3_TECHREP2.R2.fq.gz"] - } - ``` - - 2) Starting from BAMs - - Define a BAM for each replicate. Our pipeline does not determine read endedness from a BAM file. You need to explicitly define read endedness. - - Example of 3 singled-ended replicates. - ```javascript - { - "chip.paired_end" : false, - "chip.bams" : ["rep1.bam", "rep2.bam", "rep3.bam"] - } - ``` - - 3) Starting from filtered/deduped BAMs - - Define a filtered/deduped BAM for each replicate. Our pipeline does not determine read endedness from a BAM file. You need to explicitly define read endedness. These BAMs should not have unmapped reads or duplicates. - - Example of 2 singled-ended replicates. - ```javascript - { - "chip.paired_end" : false, - "chip.nodup_bams" : ["rep1.nodup.bam", "rep2.nodup.bam"] - } - ``` - - 4) Starting from TAG-ALIGN BEDs - - Define a TAG-ALIGN for each replicate. Our pipeline does not determine read endedness from a TAG-ALIGN file. You need to explicitly define read endedness. - - Example of 4 paired-ended replicates. - - ```javascript - { - "chip.paired_end" : true, - "chip.tas" : ["rep1.tagAlign.gz", "rep2.tagAlign.gz", "rep3.tagAlign.gz", "rep3.tagAlign.gz"] - } - ``` +1) Starting from FASTQs + * Technical replicates for each bio-rep will be **MERGED** in the very early stage of the pipeline. Each read end R1 and R2 have separate arrays `chip.fastqs_repX_R1` and `chip.fastqs_repX_R2`. Do not define R2 array for single-ended replicates. + * Example of 3 paired-ended biological replicates and 2 technical replicates for each bio rep. Two technical replicates `BIOREPX_TECHREP1.R1.fq.gz` and `BIOREPX_TECHREP2.R1.fq.gz` for each bio replicate will be merged. + + ```javascript + { + "chip.paired_end" : true, + "chip.fastqs_rep1_R1" : ["BIOREP1_TECHREP1.R1.fq.gz", "BIOREP1_TECHREP2.R1.fq.gz"], + "chip.fastqs_rep1_R2" : ["BIOREP1_TECHREP1.R2.fq.gz", "BIOREP1_TECHREP2.R2.fq.gz"], + "chip.fastqs_rep2_R1" : ["BIOREP2_TECHREP1.R1.fq.gz", "BIOREP2_TECHREP2.R1.fq.gz"], + "chip.fastqs_rep2_R2" : ["BIOREP2_TECHREP1.R2.fq.gz", "BIOREP2_TECHREP2.R2.fq.gz"], + "chip.fastqs_rep3_R1" : ["BIOREP3_TECHREP1.R1.fq.gz", "BIOREP3_TECHREP2.R1.fq.gz"], + "chip.fastqs_rep3_R2" : ["BIOREP3_TECHREP1.R2.fq.gz", "BIOREP3_TECHREP2.R2.fq.gz"] + } + ``` + +2) Starting from BAMs + * Define a BAM for each replicate. Our pipeline does not determine read endedness from a BAM file. You need to explicitly define read endedness. + * Example of 3 singled-ended replicates. + ```javascript + { + "chip.paired_end" : false, + "chip.bams" : ["rep1.bam", "rep2.bam", "rep3.bam"] + } + ``` + +3) Starting from filtered/deduped BAMs + * Define a filtered/deduped BAM for each replicate. Our pipeline does not determine read endedness from a BAM file. 
You need to explicitly define read endedness. These BAMs should not have unmapped reads or duplicates. + * Example of 2 singled-ended replicates. + ```javascript + { + "chip.paired_end" : false, + "chip.nodup_bams" : ["rep1.nodup.bam", "rep2.nodup.bam"] + } + ``` + +4) Starting from TAG-ALIGN BEDs + * Define a TAG-ALIGN for each replicate. Our pipeline does not determine read endedness from a TAG-ALIGN file. You need to explicitly define read endedness. + * Example of 4 paired-ended replicates. + + ```javascript + { + "chip.paired_end" : true, + "chip.tas" : ["rep1.tagAlign.gz", "rep2.tagAlign.gz", "rep3.tagAlign.gz", "rep3.tagAlign.gz"] + } + ``` You need to define controls for TF ChIP-seq pipeline. Skip this if you want to run histone ChIP-seq pipelines. You can define controls similarly to experiment IP replicates. Just add `ctl_` prefix to parameter names. - 1) Control FASTQs - - Technical replicates for each bio-rep will be **MERGED** in the very early stage of the pipeline. Each read end R1 and R2 have separate arrays `chip.ctl_fastqs_repX_R1` and `chip.ctl_fastqs_repX_R2`. Do not define R2 array for single-ended replicates. - - Example of 3 paired-ended biological replicates and 2 technical replicates for each bio rep. Two technical replicates `BIOREPX_TECHREP1.R1.fq.gz` and `BIOREPX_TECHREP2.R1.fq.gz` for each bio replicate will be merged. - - ```javascript - { - "chip.ctl_paired_end" : true, - "chip.ctl_fastqs_rep1_R1" : ["BIOREP1_TECHREP1.R1.fq.gz", "BIOREP1_TECHREP2.R1.fq.gz"], - "chip.ctl_fastqs_rep1_R2" : ["BIOREP1_TECHREP1.R2.fq.gz", "BIOREP1_TECHREP2.R2.fq.gz"], - "chip.ctl_fastqs_rep2_R1" : ["BIOREP2_TECHREP1.R1.fq.gz", "BIOREP2_TECHREP2.R1.fq.gz"], - "chip.ctl_fastqs_rep2_R2" : ["BIOREP2_TECHREP1.R2.fq.gz", "BIOREP2_TECHREP2.R2.fq.gz"], - } - ``` - - 2) Control BAMs - - Define a BAM for each replicate. Our pipeline does not determine read endedness from a BAM file. You need to explicitly define read endedness. - - Example of 3 singled-ended replicates. - - ```javascript - { - "chip.ctl_paired_end" : false, - "chip.ctl_bams" : ["ctl1.bam", "ctl2.bam", "ctl3.bam"] - } - ``` - - 3) Control BAMs - - Define a filtered/deduped BAM for each replicate. Our pipeline does not determine read endedness from a BAM file. You need to explicitly define read endedness. These BAMs should not have unmapped reads or duplicates. - - Example of 2 singled-ended replicates. - ```javascript - { - "chip.ctl_paired_end" : false, - "chip.ctl_nodup_bams" : ["ctl1.nodup.bam", "ctl2.nodup.bam"] - } - ``` - - 4) Control TAG-ALIGN BEDs - - Define a TAG-ALIGN for each replicate. Our pipeline does not determine read endedness from a TAG-ALIGN file. You need to explicitly define read endedness. - - Example of 4 paired-ended replicates. - - ```javascript - { - "chip.ctl_paired_end" : true, - "chip.ctl_tas" : ["ctl1.tagAlign.gz", "ctl2.tagAlign.gz", "ctl3.tagAlign.gz", "ctl4.tagAlign.gz"] - } - ``` +1) Control FASTQs + * Technical replicates for each bio-rep will be **MERGED** in the very early stage of the pipeline. Each read end R1 and R2 have separate arrays `chip.ctl_fastqs_repX_R1` and `chip.ctl_fastqs_repX_R2`. Do not define R2 array for single-ended replicates. + * Example of 3 paired-ended biological replicates and 2 technical replicates for each bio rep. Two technical replicates `BIOREPX_TECHREP1.R1.fq.gz` and `BIOREPX_TECHREP2.R1.fq.gz` for each bio replicate will be merged. 
+ + ```javascript + { + "chip.ctl_paired_end" : true, + "chip.ctl_fastqs_rep1_R1" : ["BIOREP1_TECHREP1.R1.fq.gz", "BIOREP1_TECHREP2.R1.fq.gz"], + "chip.ctl_fastqs_rep1_R2" : ["BIOREP1_TECHREP1.R2.fq.gz", "BIOREP1_TECHREP2.R2.fq.gz"], + "chip.ctl_fastqs_rep2_R1" : ["BIOREP2_TECHREP1.R1.fq.gz", "BIOREP2_TECHREP2.R1.fq.gz"], + "chip.ctl_fastqs_rep2_R2" : ["BIOREP2_TECHREP1.R2.fq.gz", "BIOREP2_TECHREP2.R2.fq.gz"], + } + ``` + +2) Control BAMs + * Define a BAM for each replicate. Our pipeline does not determine read endedness from a BAM file. You need to explicitly define read endedness. + * Example of 3 singled-ended replicates. + + ```javascript + { + "chip.ctl_paired_end" : false, + "chip.ctl_bams" : ["ctl1.bam", "ctl2.bam", "ctl3.bam"] + } + ``` + +3) Control BAMs + * Define a filtered/deduped BAM for each replicate. Our pipeline does not determine read endedness from a BAM file. You need to explicitly define read endedness. These BAMs should not have unmapped reads or duplicates. + * Example of 2 singled-ended replicates. + ```javascript + { + "chip.ctl_paired_end" : false, + "chip.ctl_nodup_bams" : ["ctl1.nodup.bam", "ctl2.nodup.bam"] + } + ``` + +4) Control TAG-ALIGN BEDs + * Define a TAG-ALIGN for each replicate. Our pipeline does not determine read endedness from a TAG-ALIGN file. You need to explicitly define read endedness. + * Example of 4 paired-ended replicates. + + ```javascript + { + "chip.ctl_paired_end" : true, + "chip.ctl_tas" : ["ctl1.tagAlign.gz", "ctl2.tagAlign.gz", "ctl3.tagAlign.gz", "ctl4.tagAlign.gz"] + } + ``` You can also mix up different data types for individual bio replicate and control. For example, pipeline can start from FASTQs for rep1 (SE) and rep3 (PE), BAMs for rep2 (SE), NODUP_BAMs for rep4 (SE) and TAG-ALIGNs for rep5 (PE). This example has two controls (ctl1: SE BAM, ctl2: PE FASTQs). 
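To tie the checklist and the input/control sections above together, here is a minimal illustrative TF ChIP-seq input JSON. This is only a sketch assembled from the parameters documented above: the FASTQ/control paths, title and description are placeholders (use absolute paths in a real run), only the hg38 genome TSV is shown as an example of the `[GENOME]_caper.tsv` pattern, and the final flag is optional.

```javascript
{
    "chip.pipeline_type" : "tf",
    "chip.title" : "Example experiment (placeholder)",
    "chip.description" : "Minimal example combining the mandatory parameters above",

    "chip.genome_tsv" : "https://storage.googleapis.com/encode-pipeline-genome-data/genome_tsv/v1/hg38_caper.tsv",

    "chip.paired_end" : true,
    "chip.fastqs_rep1_R1" : ["/path/to/rep1.R1.fq.gz"],
    "chip.fastqs_rep1_R2" : ["/path/to/rep1.R2.fq.gz"],
    "chip.fastqs_rep2_R1" : ["/path/to/rep2.R1.fq.gz"],
    "chip.fastqs_rep2_R2" : ["/path/to/rep2.R2.fq.gz"],

    "chip.ctl_paired_end" : true,
    "chip.ctl_fastqs_rep1_R1" : ["/path/to/ctl1.R1.fq.gz"],
    "chip.ctl_fastqs_rep1_R2" : ["/path/to/ctl1.R2.fq.gz"],

    "chip.always_use_pooled_ctl" : true
}
```

Extend the `fastqs_repX_*`/`ctl_fastqs_repX_*` arrays (or switch to BAM/NODUP_BAM/TAG-ALIGN keys) exactly as described in the sections above.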
From 22caf9e23d8e68544c04479c854cb6bbacc2ed2e Mon Sep 17 00:00:00 2001 From: Jin Lee Date: Thu, 31 Oct 2019 15:52:37 -0700 Subject: [PATCH 15/15] remove all hardcoded version strings except for WDL --- chip.wdl | 6 ++--- dev/{dev.md => build_on_dx.sh} | 24 +++----------------- dev/test/test_task/test.sh | 2 +- dev/test/test_workflow/test_chip.sh | 9 ++++++-- docs/tutorial_dx_cli.md | 3 ++- docs/tutorial_dx_web.md | 35 +++++++++++++++-------------- 6 files changed, 34 insertions(+), 45 deletions(-) rename dev/{dev.md => build_on_dx.sh} (86%) diff --git a/chip.wdl b/chip.wdl index 9ee8c923..ab9ff544 100644 --- a/chip.wdl +++ b/chip.wdl @@ -1,12 +1,12 @@ # ENCODE TF/Histone ChIP-Seq pipeline # Author: Jin Lee (leepc12@gmail.com) -#CAPER docker quay.io/encode-dcc/chip-seq-pipeline:dev-v1.3.3 -#CAPER singularity docker://quay.io/encode-dcc/chip-seq-pipeline:dev-v1.3.3 +#CAPER docker quay.io/encode-dcc/chip-seq-pipeline:v1.3.3 +#CAPER singularity docker://quay.io/encode-dcc/chip-seq-pipeline:v1.3.3 #CROO out_def https://storage.googleapis.com/encode-pipeline-output-definition/chip.croo.json workflow chip { - String pipeline_ver = 'dev-v1.3.3' + String pipeline_ver = 'v1.3.3' ### sample name, description String title = 'Untitled' String description = 'No description' diff --git a/dev/dev.md b/dev/build_on_dx.sh similarity index 86% rename from dev/dev.md rename to dev/build_on_dx.sh index 52a9030d..da5fa43e 100644 --- a/dev/dev.md +++ b/dev/build_on_dx.sh @@ -1,24 +1,7 @@ -# Dev +#!/bin/bash +set -e -## Command line for version change -```bash -PREV_VER=dev-v1.3.3 -NEW_VER=dev-v1.3.3 -for f in $(grep -rl ${PREV_VER} --include=*.{wdl,md,sh}) -do - sed -i "s/${PREV_VER}/${NEW_VER}/g" ${f} -done -``` - -## Building templates on DX for each genome - -Make sure that you have [`dxWDL-0.79.1.jar`](https://github.com/DNAnexus/dxWDL/releases/download/0.79.1/dxWDL-0.79.1.jar) on your `$HOME`. Install [DNAnexus Platform SDK](https://wiki.DNAnexus.com/downloads) with `pip install dxpy`. Log-in on DNAnexus with `dx login` and choose "ENCODE Uniform Processing Pipelines" (name of our official DNAnexus project for pipelines). - -Run the following command line locally to build out DX workflows for this pipeline on our official one. This will overwrite (`-f` parameter does it). 
- -```bash -# version -VER=dev-v1.3.3 +VER=$(cat chip.wdl | grep "#CAPER docker" | awk 'BEGIN{FS=":"} {print $2}') DOCKER=quay.io/encode-dcc/chip-seq-pipeline:$VER # general @@ -73,4 +56,3 @@ java -jar ~/dxWDL-0.79.1.jar compile chip.wdl -project "ENCODE Uniform Processin # test sample SE ENCSR000DYI (subsampled, chr19/chrM only) java -jar ~/dxWDL-0.79.1.jar compile chip.wdl -project "ENCODE Uniform Processing Pipelines Azure" -extras <(echo "{\"default_runtime_attributes\":{\"docker\":\"${DOCKER}\"}}") -f -folder /ChIP-seq2/workflows/$VER/test_ENCSR000DYI_subsampled_chr19_only -defaults example_input_json/dx_azure/ENCSR000DYI_subsampled_chr19_only_dx_azure.json -``` diff --git a/dev/test/test_task/test.sh b/dev/test/test_task/test.sh index 61d3a164..30923a99 100755 --- a/dev/test/test_task/test.sh +++ b/dev/test/test_task/test.sh @@ -12,7 +12,7 @@ INPUT=$2 if [ $# -gt 2 ]; then DOCKER_IMAGE=$3 else - DOCKER_IMAGE=quay.io/encode-dcc/chip-seq-pipeline:dev-v1.3.3 + DOCKER_IMAGE="conda" fi if [ $# -gt 3 ]; then NUM_TASK=$4 diff --git a/dev/test/test_workflow/test_chip.sh b/dev/test/test_workflow/test_chip.sh index f08bc884..2d2db026 100755 --- a/dev/test/test_workflow/test_chip.sh +++ b/dev/test/test_workflow/test_chip.sh @@ -8,7 +8,7 @@ fi if [ $# -gt 2 ]; then DOCKER_IMAGE=$3 else - DOCKER_IMAGE=quay.io/encode-dcc/chip-seq-pipeline:dev-v1.3.3 + DOCKER_IMAGE="conda" fi INPUT=$1 GCLOUD_SERVICE_ACCOUNT_SECRET_JSON_FILE=$2 @@ -31,6 +31,11 @@ cat > $TMP_WF_OPT << EOM } } EOM +if [ $DOCKER_IMAGE == 'conda' ]; then + WF_OPT= +else + WF_OPT="-o ${TMP_WF_OPT}" +fi METADATA=${PREFIX}.metadata.json # metadata RESULT=${PREFIX}.result.json # output @@ -45,6 +50,6 @@ java -Dconfig.file=backend_gcp_service_account.conf \ -Dbackend.providers.google.config.filesystems.gcs.auth=service-account \ -jar ${CROMWELL_JAR} run \ ../../../chip.wdl \ --i ${INPUT} -o ${TMP_WF_OPT} -m ${METADATA} +-i ${INPUT} ${WF_OPT} -m ${METADATA} rm -f tmp_secret_key ${TMP_WF_OPT} diff --git a/docs/tutorial_dx_cli.md b/docs/tutorial_dx_cli.md index d480f442..42a55aeb 100644 --- a/docs/tutorial_dx_cli.md +++ b/docs/tutorial_dx_cli.md @@ -43,9 +43,10 @@ This document describes instruction for the item 1). 7. Compile `chip.wdl` with an input JSON for the SUBSAMPLED paired-end sample of [ENCSR936XTK](https://www.encodeproject.org/experiments/ENCSR936XTK/). ```bash + $ WDL=chip.wdl $ PROJECT=[YOUR_PROJECT_NAME] $ OUT_FOLDER=/test_sample_chip_ENCSR936XTK_subsampled_chr19_only - $ DOCKER=quay.io/encode-dcc/chip-seq-pipeline:dev-v1.3.3 + $ DOCKER=$(cat ${WDL} | grep "#CAPER docker" | awk '{print $3}') $ java -jar dxWDL-0.77.jar compile chip.wdl -project ${PROJECT} -f -folder ${OUT_FOLDER} -defaults ${INPUT} -extras <(echo "{\"default_runtime_attributes\":{\"docker\":\"${DOCKER}\"}}") ``` diff --git a/docs/tutorial_dx_web.md b/docs/tutorial_dx_web.md index fd587f77..2977d563 100644 --- a/docs/tutorial_dx_web.md +++ b/docs/tutorial_dx_web.md @@ -15,8 +15,8 @@ This document describes instruction for the item 2). 3. Move to one of the following workflow directories according to the platform you have chosen for your project (AWS or Azure). These DX workflows are pre-built with all parameters defined. 
-* [AWS test workflow](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/dev-v1.3.3/test_ENCSR936XTK_subsampled_chr19_only) -* [Azure test workflow](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/dev-v1.3.3/test_ENCSR936XTK_subsampled_chr19_only) +* [AWS test workflow](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows): Use `[LATEST_VER]/test_ENCSR936XTK_subsampled_chr19_only`. +* [Azure test workflow](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows): Use `[LATEST_VER]/test_ENCSR936XTK_subsampled_chr19_only`. 4. Copy it to your project by right-clicking on the DX workflow `chip` and choose "Copy". @@ -39,22 +39,23 @@ This document describes instruction for the item 2). 1. DNAnexus allows only one copy of a workflow per project. The example workflow in the previous section is pre-built for the subsampled test sample [ENCSR936XTK](https://www.encodeproject.org/experiments/ENCSR936XTK/) with all parameters defined already. -2. Copy one of the following workflows according to the platform you have chosen for your project (AWS or Azure). -* [AWS general](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/dev-v1.3.3/general) without pre-defined reference genome. -* [AWS hg38](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/dev-v1.3.3/hg38) with pre-defined hg38 reference genome. -* [AWS hg19](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/dev-v1.3.3/hg19) with pre-defined hg19 reference genome. -* [AWS mm10](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/dev-v1.3.3/mm10) with pre-defined mm10 reference genome. -* [AWS mm9](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/dev-v1.3.3/mm9) with pre-defined mm9 reference genome. -* [Azure general](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/dev-v1.3.3/general) without pre-defined reference genome. -* [Azure hg38](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/dev-v1.3.3/hg38) with pre-defined hg38 reference genome. -* [Azure hg19](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/dev-v1.3.3/hg19) with pre-defined hg19 reference genome. -* [Azure mm10](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/dev-v1.3.3/mm10) with pre-defined mm10 reference genome. -* [Azure mm9](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/dev-v1.3.3/mm9) with pre-defined mm9 reference genome. +2. Choose your main platform (AWS or Azure). Move to [ENCODE ChIP-seq pipeline repository for AWS](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows) or [ENCODE ChIP-seq pipeline repository for Azure](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows). -3. Click on the DX workflow `chip`. +3. Choose a folder with the latest available version. -4. Specify your input files (FASTQs, BAMs, TAG-ALIGNs, ...) on the top left. For example, click on the item "fastqs_rep1_R1" and choose your R1 FASTQ file for replicate 1. See details [here](input.md) for other input types. +4. 
Copy one of the following workflows according to the platform you have chosen for your project.
> **IMPORTANT**: Make sure that you have chosen a correct platform (AWS or Azure) for your project.

    * general: General workflow without pre-defined reference genome.
    * hg38: Workflow with pre-defined hg38 reference genome.
    * hg19: Workflow with pre-defined hg19 reference genome.
    * mm10: Workflow with pre-defined mm10 reference genome.
    * mm9: Workflow with pre-defined mm9 reference genome.

5. Click on the DX workflow `chip`.

6. Specify your input files (FASTQs, BAMs, TAG-ALIGNs, ...) on the top left. For example, click on the item "fastqs_rep1_R1" and choose your R1 FASTQ file for replicate 1. See details [here](input.md) for other input types.

7. Choose a reference genome. See details [here](input.md).

8. Click on "Run as Analysis..." and you will be automatically redirected to the "Monitor" tab.
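
A note on the version-string cleanup in this last patch: the two new one-liners (in `dev/build_on_dx.sh` and `docs/tutorial_dx_cli.md`) both parse the same `#CAPER docker` comment at the top of `chip.wdl`; they just extract different fields. A quick sketch, assuming the header keeps the `image:tag` format shown in the `chip.wdl` hunk above:

```bash
#!/bin/bash
# Header comment in chip.wdl (see the chip.wdl hunk in this patch):
#   #CAPER docker quay.io/encode-dcc/chip-seq-pipeline:v1.3.3

WDL=chip.wdl

# docs/tutorial_dx_cli.md: 3rd whitespace-separated field -> full image name with tag
DOCKER=$(grep "#CAPER docker" "${WDL}" | awk '{print $3}')

# dev/build_on_dx.sh: split on ':' and take the 2nd field -> tag only
VER=$(grep "#CAPER docker" "${WDL}" | awk 'BEGIN{FS=":"} {print $2}')

echo "DOCKER=${DOCKER}"  # quay.io/encode-dcc/chip-seq-pipeline:v1.3.3
echo "VER=${VER}"        # v1.3.3
```

This keeps `chip.wdl` as the single source of truth for the pipeline version, which is exactly what the commit message ("remove all hardcoded version strings except for WDL") intends.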