Skip to content

Commit

Permalink
Merge pull request #263 from ENCODE-DCC/PIPE-36_add_more_info_to_report
Browse files Browse the repository at this point in the history
more info in qc_report
  • Loading branch information
leepc12 authored Feb 15, 2022
2 parents fbf7b20 + 9594bee commit 3b918b6
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 8 deletions.
4 changes: 4 additions & 0 deletions chip.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -2096,6 +2096,7 @@ workflow chip {
ctl_paired_ends = ctl_paired_end_,
pipeline_type = pipeline_type,
aligner = aligner_,
no_dup_removal = no_dup_removal,
peak_caller = peak_caller_,
cap_num_peak = cap_num_peak_,
idr_thresh = idr_thresh,
Expand Down Expand Up @@ -3046,6 +3047,7 @@ task qc_report {
Array[Boolean] ctl_paired_ends
String pipeline_type
String aligner
Boolean no_dup_removal
String peak_caller
Int cap_num_peak
Float idr_thresh
Expand Down Expand Up @@ -3105,6 +3107,7 @@ task qc_report {
command {
set -e
python3 $(which encode_task_qc_report.py) \
--pipeline-prefix chip \
${'--pipeline-ver ' + pipeline_ver} \
${"--title '" + sub(title,"'","_") + "'"} \
${"--desc '" + sub(description,"'","_") + "'"} \
Expand All @@ -3114,6 +3117,7 @@ task qc_report {
--ctl-paired-ends ${sep=' ' ctl_paired_ends} \
--pipeline-type ${pipeline_type} \
--aligner ${aligner} \
${if (no_dup_removal) then '--no-dup-removal ' else ''} \
--peak-caller ${peak_caller} \
${'--cap-num-peak ' + cap_num_peak} \
--idr-thresh ${idr_thresh} \
Expand Down
30 changes: 22 additions & 8 deletions src/encode_task_qc_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ def parse_arguments():
help='Description for sample.')
parser.add_argument('--genome', type=str,
help='Reference genome.')
parser.add_argument('--pipeline-prefix', type=str, required=True,
help='Pipeline. e.g. atac, chip.')
parser.add_argument('--pipeline-ver', type=str,
help='Pipeline version.')
parser.add_argument('--multimapping', default=0, type=int,
Expand All @@ -50,6 +52,8 @@ def parse_arguments():
help='Pipeline type.')
parser.add_argument('--aligner', type=str, required=True,
help='Aligner.')
parser.add_argument('--no-dup-removal', action='store_true',
help='No duplicate removal.')
parser.add_argument('--peak-caller', type=str, required=True,
help='Peak caller.')
parser.add_argument('--cap-num-peak', default=0, type=int,
Expand Down Expand Up @@ -302,7 +306,7 @@ def make_cat_align(args, cat_root):
html_head='<h2>Marking duplicates (filtered BAM)</h2>',
html_foot="""
<div id='help-filter'>
Filtered out (samtools view -F 1804):
Filtered with samtools flag 1804 (samtools view -F 1804):
<ul>
<li>read unmapped (0x4)</li>
<li>mate unmapped (0x8, for paired-end)</li>
Expand Down Expand Up @@ -360,8 +364,16 @@ def make_cat_align(args, cat_root):
'nodup_samstat',
html_head='<h2>SAMstat (filtered/deduped BAM)</h2>',
html_foot="""
<p>Filtered and duplicates removed</p><br>
""",
<p>Filtered {dup_removal_detail}.
Subsampling with {pipeline_prefix}.{subsample_param_name} is not done in alignment steps.
Nodup BAM is converted into a BED type (TAGALIGN) later and then TAGALIGN is subsampled
with such parameter in the peak-calling step.<br>
</p>
""".format(
dup_removal_detail='but duplicates are kept' if args.no_dup_removal else 'and duplicates are removed',
pipeline_prefix=args.pipeline_prefix,
subsample_param_name='subsample_reads',
),
parser=parse_flagstat_qc,
map_key_desc=MAP_KEY_DESC_FLAGSTAT_QC,
parent=cat_align
Expand Down Expand Up @@ -465,9 +477,10 @@ def make_cat_lib_complexity(args, cat_root):
locations with EXACTLY two read pairs. The PBC2 should be significantly
greater than 1. {pipeline_specific_info}
</p><br>
<p>NRF (non redundant fraction) <br>
PBC1 (PCR Bottleneck coefficient 1) <br>
PBC2 (PCR Bottleneck coefficient 2) <br>
<p>Fragment: read for a single-ended dataset, pair of reads for a paired-ended dataset <br>
NRF: non redundant fraction <br>
PBC1: PCR Bottleneck coefficient 1 <br>
PBC2: PCR Bottleneck coefficient 2 <br>
PBC1 is the primary measure. Provisionally <br>
<ul>
<li>0-0.5 is severe bottlenecking</li>
Expand Down Expand Up @@ -580,7 +593,7 @@ def make_cat_replication(args, cat_root):
'num_peaks',
html_head='<h2>Number of raw peaks</h2>',
html_foot="""
Top {num_peak} raw peaks from {peak_caller} {extra_info}
The number of peaks is capped at {num_peak}<br>Peaks are called from {peak_caller} {extra_info}
""".format(
num_peak=args.cap_num_peak,
peak_caller=args.peak_caller,
Expand Down Expand Up @@ -651,7 +664,7 @@ def make_cat_align_enrich(args, cat_root):
html_head_xcor = '<h2>Strand cross-correlation measures (trimmed/filtered SE BAM)</h2>'
html_foot_xcor = """
<br><p>Performed on subsampled ({xcor_subsample_reads}) reads mapped from FASTQs that are trimmed to {xcor_trim_bp}.
Such FASTQ trimming and subsampling reads are for cross-corrleation analysis only.
Such FASTQ trimming and subsampling are for the cross-correlation analysis only and only R1 reads are taken.
Untrimmed FASTQs are used for all the other analyses.</p>
<div id='help-xcor'><p>
NOTE1: For SE datasets, reads from replicates are randomly subsampled to {xcor_subsample_reads}.<br>
Expand All @@ -670,6 +683,7 @@ def make_cat_align_enrich(args, cat_root):
xcor_subsample_reads=args.xcor_subsample_reads
)
html_foot_xcor += """<ul>
<li>Fragment = read (for single-ended dataset) or pair of reads (for paired-ended dataset) </li>
<li>Normalized strand cross-correlation coefficient (NSC) = col9 in outFile </li>
<li>Relative strand cross-correlation coefficient (RSC) = col10 in outFile </li>
<li>Estimated fragment length = col3 in outFile, take the top value </li>
Expand Down

0 comments on commit 3b918b6

Please sign in to comment.