Skip to content

Commit

Permalink
Merge branch '20191223efrat'
Browse files Browse the repository at this point in the history
  • Loading branch information
Wanding Zhou committed Jan 4, 2020
2 parents 736ebef + 2064438 commit ede9180
Showing 1 changed file with 57 additions and 53 deletions.
110 changes: 57 additions & 53 deletions scripts/QC.sh
Original file line number Diff line number Diff line change
Expand Up @@ -216,8 +216,8 @@ function biscuitQC {
>&2 echo "`date`---- BISCUIT_QC_BASECOV ----"
# bedtools genomecov -bga -split -ibam $input_bam -g ${BISCUIT_REFERENCE}.fai | bedtools sort >$QCdir/${sname}_bga.bed
# samtools view -q 40 -b $input_bam | bedtools genomecov -ibam stdin -g ${BISCUIT_REFERENCE}.fai -bga -split | bedtools sort >$QCdir/${sname}_bga_q40.bed
bedtools genomecov -bga -split -ibam $input_bam -g ${BISCUIT_REFERENCE}.fai | LC_ALL=C sort --parallel=$processes -k1,1 -k2,2n -T $QCdir >$QCdir/${sname}_bga.bed
samtools view -q 40 -b $input_bam | bedtools genomecov -ibam stdin -g ${BISCUIT_REFERENCE}.fai -bga -split | LC_ALL=C sort --parallel=$processes -k1,1 -k2,2n -T $QCdir >$QCdir/${sname}_bga_q40.bed
bedtools genomecov -bga -split -ibam $input_bam | LC_ALL=C sort --parallel=$processes -k1,1 -k2,2n -T $QCdir >$QCdir/${sname}_bga.bed
samtools view -q 40 -b $input_bam | bedtools genomecov -ibam stdin -bga -split | LC_ALL=C sort --parallel=$processes -k1,1 -k2,2n -T $QCdir >$QCdir/${sname}_bga_q40.bed

echo -e "BISCUITqc Depth Distribution (All)" >$QCdir/${sname}_covdist_table.txt
echo -e "depth\tcount" >>$QCdir/${sname}_covdist_table.txt
Expand All @@ -237,52 +237,56 @@ function biscuitQC {
>&2 echo "`date`---- BISCUIT_QC_DUPLICATE ----"
# duplicate
#samtools view -f 0x400 -b $input_bam | bedtools genomecov -ibam stdin -g $BISCUIT_REFERENCE.fai -bga -split | bedtools sort >$QCdir/${sname}_bga_dup.bed
samtools view -f 0x400 -b $input_bam | bedtools genomecov -ibam stdin -g $BISCUIT_REFERENCE.fai -bga -split | LC_ALL=C sort --parallel=$processes -k1,1 -k2,2n -T $QCdir >$QCdir/${sname}_bga_dup.bed

# duplication rate
echo -e "BISCUITqc Read Duplication Table" >$QCdir/${sname}_dup_report.txt
echo -ne "#bases covered by all reads: " >>$QCdir/${sname}_dup_report.txt
awk 'BEGIN{a=0}$4>0{a+=$3-$2}END{print a}' $QCdir/${sname}_bga.bed >>$QCdir/${sname}_dup_report.txt
echo -ne "#bases covered by duplicate reads: " >>$QCdir/${sname}_dup_report.txt
awk 'BEGIN{a=0}$4>0{a+=$3-$2}END{print a}' $QCdir/${sname}_bga_dup.bed >>$QCdir/${sname}_dup_report.txt

if [[ -f "$BISCUIT_TOPGC_BED" && -f "$BISCUIT_BOTGC_BED" ]]; then
# high GC content
echo -ne "#high-GC bases covered by all reads: " >>$QCdir/${sname}_dup_report.txt
bedtools intersect -a $QCdir/${sname}_bga.bed -b $BISCUIT_TOPGC_BED -sorted | awk 'BEGIN{a=0}$4>0{a+=$3-$2}END{print a}' >>$QCdir/${sname}_dup_report.txt
echo -ne "#high-GC bases covered by duplicate reads: " >>$QCdir/${sname}_dup_report.txt
bedtools intersect -a $QCdir/${sname}_bga_dup.bed -b $BISCUIT_TOPGC_BED -sorted | awk 'BEGIN{a=0}$4>0{a+=$3-$2}END{print a}' >>$QCdir/${sname}_dup_report.txt

# low GC content
echo -ne "#low-GC bases covered by all reads: " >>$QCdir/${sname}_dup_report.txt
bedtools intersect -a $QCdir/${sname}_bga.bed -b $BISCUIT_BOTGC_BED -sorted | awk 'BEGIN{a=0}$4>0{a+=$3-$2}END{print a}' >>$QCdir/${sname}_dup_report.txt
echo -ne "#low-GC bases covered by duplicate reads: " >>$QCdir/${sname}_dup_report.txt
bedtools intersect -a $QCdir/${sname}_bga_dup.bed -b $BISCUIT_BOTGC_BED -sorted | awk 'BEGIN{a=0}$4>0{a+=$3-$2}END{print a}' >>$QCdir/${sname}_dup_report.txt
fi

## Q40
# duplicate
# samtools view -f 0x400 -q 40 -b $input_bam | bedtools genomecov -ibam stdin -g $BISCUIT_REFERENCE.fai -bga -split | bedtools sort >$QCdir/${sname}_bga_dup_q40.bed
samtools view -f 0x400 -q 40 -b $input_bam | bedtools genomecov -ibam stdin -g $BISCUIT_REFERENCE.fai -bga -split | LC_ALL=C sort --parallel=$processes -k1,1 -k2,2n -T $QCdir >$QCdir/${sname}_bga_dup_q40.bed

# duplication rate
echo -ne "#bases covered by all q40-reads: " >>$QCdir/${sname}_dup_report.txt
awk '$4>0{a+=$3-$2}END{print a}' $QCdir/${sname}_bga_q40.bed >>$QCdir/${sname}_dup_report.txt
echo -ne "#bases covered by duplicate q40-reads: " >>$QCdir/${sname}_dup_report.txt
awk '$4>0{a+=$3-$2}END{print a}' $QCdir/${sname}_bga_dup_q40.bed >>$QCdir/${sname}_dup_report.txt

if [[ -f "$BISCUIT_TOPGC_BED" && -f "$BISCUIT_BOTGC_BED" ]]; then
# high GC content
echo -ne "#high-GC bases covered by all q40-reads: " >>$QCdir/${sname}_dup_report.txt
bedtools intersect -a $QCdir/${sname}_bga_q40.bed -b $BISCUIT_TOPGC_BED -sorted | awk '$4>0{a+=$3-$2}END{print a}' >>$QCdir/${sname}_dup_report.txt
echo -ne "#high-GC bases covered by duplicate q40-reads: " >>$QCdir/${sname}_dup_report.txt
bedtools intersect -a $QCdir/${sname}_bga_dup_q40.bed -b $BISCUIT_TOPGC_BED -sorted | awk '$4>0{a+=$3-$2}END{print a}' >>$QCdir/${sname}_dup_report.txt

# low GC content
echo -ne "#low-GC bases covered by all q40-reads: " >>$QCdir/${sname}_dup_report.txt
bedtools intersect -a $QCdir/${sname}_bga_q40.bed -b $BISCUIT_BOTGC_BED -sorted | awk '$4>0{a+=$3-$2}END{print a}' >>$QCdir/${sname}_dup_report.txt
echo -ne "#low-GC bases covered by duplicate q40-reads: " >>$QCdir/${sname}_dup_report.txt
bedtools intersect -a $QCdir/${sname}_bga_dup_q40.bed -b $BISCUIT_BOTGC_BED -sorted | awk '$4>0{a+=$3-$2}END{print a}' >>$QCdir/${sname}_dup_report.txt
samtools view -f 0x400 -b $input_bam >${sname}_dup_tmp.bam
if [[ $(samtools view ${sname}_dup_tmp.bam | wc -l) -ge 2 ]]; then
bedtools genomecov -ibam ${sname}_dup_tmp.bam -bga -split | LC_ALL=C sort --parallel=$processes -k1,1 -k2,2n -T $QCdir >$QCdir/${sname}_bga_dup.bed
rm -f ${sname}_dup_tmp.bam

# duplication rate
echo -e "BISCUITqc Read Duplication Table" >$QCdir/${sname}_dup_report.txt
echo -ne "#bases covered by all reads: " >>$QCdir/${sname}_dup_report.txt
awk 'BEGIN{a=0}$4>0{a+=$3-$2}END{print a}' $QCdir/${sname}_bga.bed >>$QCdir/${sname}_dup_report.txt
echo -ne "#bases covered by duplicate reads: " >>$QCdir/${sname}_dup_report.txt
awk 'BEGIN{a=0}$4>0{a+=$3-$2}END{print a}' $QCdir/${sname}_bga_dup.bed >>$QCdir/${sname}_dup_report.txt

if [[ -f "$BISCUIT_TOPGC_BED" && -f "$BISCUIT_BOTGC_BED" ]]; then
# high GC content
echo -ne "#high-GC bases covered by all reads: " >>$QCdir/${sname}_dup_report.txt
bedtools intersect -a $QCdir/${sname}_bga.bed -b $BISCUIT_TOPGC_BED -sorted | awk 'BEGIN{a=0}$4>0{a+=$3-$2}END{print a}' >>$QCdir/${sname}_dup_report.txt
echo -ne "#high-GC bases covered by duplicate reads: " >>$QCdir/${sname}_dup_report.txt
bedtools intersect -a $QCdir/${sname}_bga_dup.bed -b $BISCUIT_TOPGC_BED -sorted | awk 'BEGIN{a=0}$4>0{a+=$3-$2}END{print a}' >>$QCdir/${sname}_dup_report.txt

# low GC content
echo -ne "#low-GC bases covered by all reads: " >>$QCdir/${sname}_dup_report.txt
bedtools intersect -a $QCdir/${sname}_bga.bed -b $BISCUIT_BOTGC_BED -sorted | awk 'BEGIN{a=0}$4>0{a+=$3-$2}END{print a}' >>$QCdir/${sname}_dup_report.txt
echo -ne "#low-GC bases covered by duplicate reads: " >>$QCdir/${sname}_dup_report.txt
bedtools intersect -a $QCdir/${sname}_bga_dup.bed -b $BISCUIT_BOTGC_BED -sorted | awk 'BEGIN{a=0}$4>0{a+=$3-$2}END{print a}' >>$QCdir/${sname}_dup_report.txt
fi

## Q40
# duplicate
# samtools view -f 0x400 -q 40 -b $input_bam | bedtools genomecov -ibam stdin -g $BISCUIT_REFERENCE.fai -bga -split | bedtools sort >$QCdir/${sname}_bga_dup_q40.bed
samtools view -f 0x400 -q 40 -b $input_bam | bedtools genomecov -ibam stdin -bga -split | LC_ALL=C sort --parallel=$processes -k1,1 -k2,2n -T $QCdir >$QCdir/${sname}_bga_dup_q40.bed

# duplication rate
echo -ne "#bases covered by all q40-reads: " >>$QCdir/${sname}_dup_report.txt
awk '$4>0{a+=$3-$2}END{print a}' $QCdir/${sname}_bga_q40.bed >>$QCdir/${sname}_dup_report.txt
echo -ne "#bases covered by duplicate q40-reads: " >>$QCdir/${sname}_dup_report.txt
awk '$4>0{a+=$3-$2}END{print a}' $QCdir/${sname}_bga_dup_q40.bed >>$QCdir/${sname}_dup_report.txt

if [[ -f "$BISCUIT_TOPGC_BED" && -f "$BISCUIT_BOTGC_BED" ]]; then
# high GC content
echo -ne "#high-GC bases covered by all q40-reads: " >>$QCdir/${sname}_dup_report.txt
bedtools intersect -a $QCdir/${sname}_bga_q40.bed -b $BISCUIT_TOPGC_BED -sorted | awk '$4>0{a+=$3-$2}END{print a}' >>$QCdir/${sname}_dup_report.txt
echo -ne "#high-GC bases covered by duplicate q40-reads: " >>$QCdir/${sname}_dup_report.txt
bedtools intersect -a $QCdir/${sname}_bga_dup_q40.bed -b $BISCUIT_TOPGC_BED -sorted | awk '$4>0{a+=$3-$2}END{print a}' >>$QCdir/${sname}_dup_report.txt

# low GC content
echo -ne "#low-GC bases covered by all q40-reads: " >>$QCdir/${sname}_dup_report.txt
bedtools intersect -a $QCdir/${sname}_bga_q40.bed -b $BISCUIT_BOTGC_BED -sorted | awk '$4>0{a+=$3-$2}END{print a}' >>$QCdir/${sname}_dup_report.txt
echo -ne "#low-GC bases covered by duplicate q40-reads: " >>$QCdir/${sname}_dup_report.txt
bedtools intersect -a $QCdir/${sname}_bga_dup_q40.bed -b $BISCUIT_BOTGC_BED -sorted | awk '$4>0{a+=$3-$2}END{print a}' >>$QCdir/${sname}_dup_report.txt
fi
fi
fi

Expand Down Expand Up @@ -420,7 +424,7 @@ function biscuitQC {
samtools view -h -q 40 $input_bam | biscuit bsconv $BISCUIT_REFERENCE - | grep -E -o --color "ZN:Z:[^ ].*" | awk -F '[^0-9]*' -v OFS="\t" '{ra[$2]+=1;rc[$4]+=1;rg[$6]+=1;rt[$8]+=1;}END{for(k in ra) {print "CA", k, ra[k]} for(k in rc) {print "CC", k, rc[k]} for(k in rg) {print "CG", k, rg[k]} for(k in rt) {print "CT", k, rt[k]}}' | sort -k1,1 -k2,2n >>$QCdir/${sname}_freqOfTotalRetentionPerRead.txt

echo -e "BISCUITqc Conversion Rate by Base Average Table" >$QCdir/${sname}_totalBaseConversionRate.txt
biscuit vcf2bed -et c $input_vcf | awk '{beta_sum[$6]+=$8; beta_cnt[$6]+=1;} END{print "CA\tCC\tCG\tCT"; print beta_sum["CA"]/beta_cnt["CA"]"\t"beta_sum["CC"]/beta_cnt["CC"]"\t"beta_sum["CG"]/beta_cnt["CG"]"\t"beta_sum["CT"]/beta_cnt["CT"];}' >>$QCdir/${sname}_totalBaseConversionRate.txt
biscuit vcf2bed -et c $input_vcf | awk '{beta_sum[$6]+=$8; beta_cnt[$6]+=1;} END{print "CA\tCC\tCG\tCT"; if(beta_cnt["CA"] < 20 || beta_cnt["CC"] < 20 || beta_cnt["CG"] < 20 || beta_cnt["CT"] < 20) print "-1\t-1\t-1\t-1"; else print beta_sum["CA"]/beta_cnt["CA"]"\t"beta_sum["CC"]/beta_cnt["CC"]"\t"beta_sum["CG"]/beta_cnt["CG"]"\t"beta_sum["CT"]/beta_cnt["CT"];}' >>$QCdir/${sname}_totalBaseConversionRate.txt

echo -e "BISCUITqc Conversion Rate by Read Average Table" >$QCdir/${sname}_totalReadConversionRate.txt
samtools view -hq 40 -F 0x900 $input_bam | biscuit bsconv -b $BISCUIT_REFERENCE - | awk '{for(i=1;i<=8;++i) a[i]+=$i;}END{print "CpA\tCpC\tCpG\tCpT"; print a[1]/(a[1]+a[2])"\t"a[3]/(a[3]+a[4])"\t"a[5]/(a[5]+a[6])"\t"a[7]/(a[7]+a[8]);}' >>$QCdir/${sname}_totalReadConversionRate.txt
Expand Down Expand Up @@ -474,10 +478,10 @@ function biscuitQC {
rm $QCdir/*.bed
fi

########################################
## Running check on output files
########################################
basic_check_output_filled
########################################
## Running check on output files
########################################
basic_check_output_filled
}


Expand Down

0 comments on commit ede9180

Please sign in to comment.