Skip to content

Commit

Permalink
Merge pull request #454 from fidelram/fingerprint_summation
Browse files Browse the repository at this point in the history
Fingerprint summation
  • Loading branch information
dpryan79 authored Dec 2, 2016
2 parents bd865bb + c896eda commit 41920d5
Show file tree
Hide file tree
Showing 9 changed files with 498 additions and 278 deletions.
1 change: 1 addition & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
* Scale factor is now set to 1 in bamCoverage if no normalization is used. The fact that this wasn't being done previously was a bug.
* Fixed a bug (#451) affecting BED files with a `deepTools_group` column that caused a problem with `--sortRegions keep` in computeMatrix.
* Fixed a bug where some matrices produced with `computeMatrixOperations cbind` would result in the right-most samples sometimes getting squished due to having ticks outside of their graph bounds. Ticks are now scaled if they don't match the data range (issue #452).
* In plotFingerprint, the number of reads per-bin are no longer used. Instead, the sum of the per-base coverage (or signal if bigWig input is used) is used. This leads to more similar metrics produced by us and others regarding things like Jensen-Shannon metrics. For those just interested in the plots, there's little effective change here.

2.4.0

Expand Down
1 change: 0 additions & 1 deletion deeptools/SES_scaleFactor.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,6 @@ def estimateScaleFactor(bamFilesList, binLength, numberOfSamples,
# the transpose is taken to easily iterate by columns which are now
# converted to rows
num_reads_per_bin = num_reads_per_bin.transpose()
# np.savetxt("/home/ramirez/tmp/test.num_reads", num_reads_per_bin)
# size factors based on order statistics
# see Signal extraction scaling (SES) method in: Diaz et al (2012)
# Normalization, bias correction, and peak calling for ChIP-seq.
Expand Down
26 changes: 10 additions & 16 deletions deeptools/countReadsPerBin.py
Original file line number Diff line number Diff line change
Expand Up @@ -563,22 +563,11 @@ def get_coverage_of_region(self, bamHandle, chrom, regions,
start_time = time.time()
# caching seems faster. TODO: profile the function
c = 0
try:
# BAM input
if chrom in bamHandle.references:
reads = [r for r in bamHandle.fetch(chrom, regStart, regEnd)
if r.flag & 4 == 0]
else:
raise NameError("chromosome {} not found in bam file".format(chrom))
except:
# bigWig input, as used by plotFingerprint
if bamHandle.chroms(chrom):
_ = np.array(bamHandle.stats(chrom, regStart, regEnd, type="mean", nBins=nRegBins), dtype=np.float)
_[np.isnan(_)] = 0.0
coverages += _
continue
else:
raise NameError("chromosome {} not found in bigWig file with chroms {}".format(chrom, bamHandle.chroms()))
if chrom in bamHandle.references:
reads = [r for r in bamHandle.fetch(chrom, regStart, regEnd)
if r.flag & 4 == 0]
else:
raise NameError("chromosome {} not found in bam file".format(chrom))

prev_start_pos = None # to store the start positions
# of previous processed read pair
Expand Down Expand Up @@ -625,6 +614,11 @@ def get_coverage_of_region(self, bamHandle, chrom, regions,
if fragmentEnd <= reg[0] or fragmentStart >= reg[1]:
continue

if fragmentStart < reg[0]:
fragmentStart = reg[0]
if fragmentEnd > reg[0] + len(coverages) * tileSize:
fragmentEnd = reg[0] + len(coverages) * tileSize

sIdx = vector_start + max((fragmentStart - reg[0]) // tileSize, 0)
eIdx = vector_start + min(np.ceil(float(fragmentEnd - reg[0]) / tileSize).astype('int'), nRegBins)
if last_eIdx is not None:
Expand Down
3 changes: 2 additions & 1 deletion deeptools/plotFingerprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from scipy.stats import poisson

import deeptools.countReadsPerBin as countR
import deeptools.sumCoveragePerBin as sumR
from deeptools import parserCommon

old_settings = np.seterr(all='ignore')
Expand Down Expand Up @@ -342,7 +343,7 @@ def getExpected(mu):
def main(args=None):
args = process_args(args)

cr = countR.CountReadsPerBin(
cr = sumR.SumCoveragePerBin(
args.bamfiles,
args.binSize,
args.numberOfSamples,
Expand Down
225 changes: 225 additions & 0 deletions deeptools/sumCoveragePerBin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
import numpy as np
import multiprocessing
import time

from deeptools import countReadsPerBin
from deeptoolsintervals import GTF


class SumCoveragePerBin(countReadsPerBin.CountReadsPerBin):
r"""This is an extension of CountReadsPerBin for use with plotFingerprint.
There, we need to sum the per-base coverage.
"""
def get_coverage_of_region(self, bamHandle, chrom, regions,
fragmentFromRead_func=None):
"""
Returns a numpy array that corresponds to the number of reads
that overlap with each tile.
>>> test = Tester()
>>> import pysam
>>> c = SumCoveragePerBin([], stepSize=1, extendReads=300)
For this case the reads are length 36. The number of overlapping
read fragments is 4 and 5 for the positions tested. Note that reads are
NOT extended, due to there being a 0 length input list of BAM files!
>>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile_PE), 'chr2',
... [(5000833, 5000834), (5000834, 5000835)])
array([ 4., 5.])
In the following case the reads length is 50. Reads are not extended.
>>> c.extendReads=False
>>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile2), '3R', [(148, 150), (150, 152), (152, 154)])
array([ 2., 4., 4.])
"""
if not fragmentFromRead_func:
fragmentFromRead_func = self.get_fragment_from_read
nbins = len(regions)
if len(regions[0]) == 3:
nbins = 0
for reg in regions:
nbins += (reg[1] - reg[0]) // reg[2]
coverages = np.zeros(nbins, dtype='float64')

if self.defaultFragmentLength == 'read length':
extension = 0
else:
extension = self.maxPairedFragmentLength

blackList = None
if self.blackListFileName is not None:
blackList = GTF(self.blackListFileName)

vector_start = 0
for idx, reg in enumerate(regions):
if len(reg) == 3:
tileSize = int(reg[2])
nRegBins = (reg[1] - reg[0]) // tileSize
else:
nRegBins = 1
tileSize = int(reg[1] - reg[0])

# Blacklisted regions have a coverage of 0
if blackList and blackList.findOverlaps(chrom, reg[0], reg[1]):
continue
regStart = int(max(0, reg[0] - extension))
regEnd = reg[1] + int(extension)

# If alignments are extended and there's a blacklist, ensure that no
# reads originating in a blacklist are fetched
if blackList and reg[0] > 0 and extension > 0:
o = blackList.findOverlaps(chrom, regStart, reg[0])
if o is not None and len(o) > 0:
regStart = o[-1][1]
o = blackList.findOverlaps(chrom, reg[1], regEnd)
if o is not None and len(o) > 0:
regEnd = o[0][0]

start_time = time.time()
# caching seems faster. TODO: profile the function
c = 0
try:
# BAM input
if chrom in bamHandle.references:
reads = [r for r in bamHandle.fetch(chrom, regStart, regEnd)
if r.flag & 4 == 0]
else:
raise NameError("chromosome {} not found in bam file".format(chrom))
except:
# bigWig input, as used by plotFingerprint
if bamHandle.chroms(chrom):
_ = np.array(bamHandle.stats(chrom, regStart, regEnd, type="mean", nBins=nRegBins), dtype=np.float)
_[np.isnan(_)] = 0.0
_ = _ * tileSize
coverages += _
continue
else:
raise NameError("chromosome {} not found in bigWig file with chroms {}".format(chrom, bamHandle.chroms()))

prev_start_pos = None # to store the start positions
# of previous processed read pair
for read in reads:
if self.minMappingQuality and read.mapq < self.minMappingQuality:
continue

# filter reads based on SAM flag
if self.samFlag_include and read.flag & self.samFlag_include != self.samFlag_include:
continue
if self.samFlag_exclude and read.flag & self.samFlag_exclude != 0:
continue

# Fragment lengths
if self.minFragmentLength > 0 and abs(read.template_length) < self.minFragmentLength:
continue
if self.maxFragmentLength > 0 and abs(read.template_length) > self.maxFragmentLength:
continue

# get rid of duplicate reads that have same position on each of the
# pairs
if self.ignoreDuplicates and prev_start_pos \
and prev_start_pos == (read.reference_start, read.pnext, read.is_reverse):
continue

# since reads can be split (e.g. RNA-seq reads) each part of the
# read that maps is called a position block.
try:
position_blocks = fragmentFromRead_func(read)
except TypeError:
# the get_fragment_from_read functions returns None in some cases.
# Those cases are to be skipped, hence the continue line.
continue

last_eIdx = None
for fragmentStart, fragmentEnd in position_blocks:
if fragmentEnd is None or fragmentStart is None:
continue
fragmentLength = fragmentEnd - fragmentStart
if fragmentLength == 0:
continue
# skip reads that are not in the region being
# evaluated.
if fragmentEnd <= reg[0] or fragmentStart >= reg[1]:
continue

if fragmentStart < reg[0]:
fragmentStart = reg[0]
if fragmentEnd > reg[0] + len(coverages) * tileSize:
fragmentEnd = reg[0] + len(coverages) * tileSize

sIdx = vector_start + max((fragmentStart - reg[0]) // tileSize, 0)
eIdx = vector_start + min(np.ceil(float(fragmentEnd - reg[0]) / tileSize).astype('int'), nRegBins)
if eIdx >= len(coverages):
eIdx = len(coverages) - 1
if last_eIdx is not None:
sIdx = max(last_eIdx, sIdx)
if sIdx >= eIdx:
continue

# First bin
if fragmentEnd < reg[0] + (sIdx + 1) * tileSize:
_ = fragmentEnd - fragmentStart
else:
_ = reg[0] + (sIdx + 1) * tileSize - fragmentStart
if _ > tileSize:
_ = tileSize
coverages[sIdx] += _
_ = sIdx + 1
while _ < eIdx:
coverages[_] += tileSize
_ += 1
while eIdx - sIdx >= nRegBins:
eIdx -= 1
if eIdx > sIdx:
_ = fragmentEnd - (reg[0] + eIdx * tileSize)
if _ > tileSize:
_ = tileSize
elif _ < 0:
_ = 0
coverages[eIdx] += _
last_eIdx = eIdx

prev_start_pos = (read.reference_start, read.pnext, read.is_reverse)
c += 1

if self.verbose:
endTime = time.time()
print("%s, processing %s (%.1f per sec) reads @ %s:%s-%s" % (
multiprocessing.current_process().name, c, c / (endTime - start_time), chrom, reg[0], reg[1]))

vector_start += nRegBins

# change zeros to NAN
if self.zerosToNans:
coverages[coverages == 0] = np.nan

return coverages


class Tester(object):

def __init__(self):
"""
The distribution of reads between the two bam files is as follows.
They cover 200 bp
0 100 200
|------------------------------------------------------------|
A ===============
===============
B =============== ===============
===============
===============
"""
import os
self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/"
self.bamFile1 = self.root + "testA.bam"
self.bamFile2 = self.root + "testB.bam"
self.bamFile_PE = self.root + "test_paired2.bam"
self.chrom = '3R'
8 changes: 4 additions & 4 deletions docs/content/tools/plotFingerprint.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,14 @@ It determines how well the signal in the ChIP-seq sample can be differentiated f
For factors that will enrich well-defined, rather narrow regions (e.g. transcription factors such as p300), the resulting plot can be used to assess the strength of a ChIP, but the broader the enrichments are to be expected, the less clear the plot will be.
Vice versa, if you do not know what kind of signal to expect, the fingerprint plot will give you a straight-forward indication of how careful you will have to be during your downstream analyses to separate biological noise from meaningful signal.

Similar to ``multiBamSummary``, ``plotFingerprint`` randomly samples genome regions (bins) of a specified length and counts the reads from indexed [BAM][] files that overlap with those regions.
These counts are then sorted according to their rank and the cumulative sum of read counts is plotted.
Similar to ``multiBamSummary``, ``plotFingerprint`` randomly samples genome regions (bins) of a specified length and sums the per-base coverage in indexed [BAM][] (or bigWig) files that overlap with those regions.
These values are then sorted according to their rank and the cumulative sum of read counts is plotted.


What the plots tell you
~~~~~~~~~~~~~~~~~~~~~~~~

An ideal [input][] with perfect uniform distribution of reads along the genome (i.e. without enrichments in open chromatin etc.) should generate a straight diagonal line. A very specific and strong ChIP enrichment will be indicated by a prominent and steep rise of the cumulative sum towards the highest rank. This means that a big chunk of reads from the ChIP sample is located in few bins which corresponds to high, narrow enrichments typically seen for transcription factors.
An ideal [input][] with perfect uniform distribution of reads along the genome (i.e. without enrichments in open chromatin etc.) and infinite sequencing coverage should generate a straight diagonal line. A very specific and strong ChIP enrichment will be indicated by a prominent and steep rise of the cumulative sum towards the highest rank. This means that a big chunk of reads from the ChIP sample is located in few bins which corresponds to high, narrow enrichments typically seen for transcription factors.

Here you see 3 different fingerprint plots.
We chose these examples to show you how the nature of the ChIP signal (narrow and high vs. wide and not extremely high) is reflected in the "fingerprint" plots.
Expand Down Expand Up @@ -58,7 +58,7 @@ The following example generates the fingerprints for the invididual ENCODE histo
.. image:: ../../images/test_plots/fingerprints.png

The table that you can obtain via ``--outRawCounts`` simply contains the number of reads overlapping with each individually sampled genome bin. For the plot above, each column is sorted in increasing order and then the cumulative sum is plotted.
The table that you can obtain via ``--outRawCounts`` simply contains the sum of the per-base coverage inside each sampled genome bin. For the plot above, each column is sorted in increasing order and then the cumulative sum is plotted.

.. code:: bash
Expand Down
10 changes: 5 additions & 5 deletions galaxy/wrapper/plotFingerprint.xml
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ Output
The default output is a diagnostic plot (see below for an example and further down for some background information).
Optionally, you can obtain the table of raw counts that was used to generate the plot.
Optionally, you can obtain the table of raw values that were used to generate the plot.
.. image:: $PATH_TO_IMAGES/plotFingerprint_output.png
:width: 600
Expand All @@ -170,10 +170,10 @@ What follows is the output of ``plotFingerprint`` with our test ChIP-Seq data se
Theoretical Background
----------------------
The tool first samples indexed BAM files and counts all reads overlapping a window (bin) of the specified length.
These counts are then sorted according to their rank (the bin with the highest number of reads has the highest rank)
and the cumulative sum of read counts are plotted. An ideal input (control) with a uniform distribution of reads alignments
result in a diagonal line. A very specific and strong ChIP enrichment, on the other hand, would result in a large portion
The tool first samples indexed BAM files and sums the per-base coverage of reads/fragments overlapping a window (bin) of the specified length.
These values are then sorted according to their rank (the bin with the highest number of reads has the highest rank)
and the cumulative sum plotted. An ideal input (control) with a uniform distribution of reads alignments
and infinite sequencing depth will result in a diagonal line. A very specific and strong ChIP enrichment, on the other hand, would result in a large portion
of reads accumulating in just a few bins and the resulting plot showing a steep rise toward the right-most edge. Such results are
most commonly seen with transcription factors.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
Sample AUC Synthetic AUC X-intercept Synthetic X-intercept Elbow Point Synthetic Elbow Point JS Distance Synthetic JS Distance % genome enriched diff. enrichment CHANCE divergence
bowtie2 test1.bam 0.00739484047583 0.270844774362 0.984443061605 0.905310085331 0.984380833852 0.597688388779 NA 0.177435375809 NA NA NA
bowtie2 test1.bam 0.00739484047583 0.270844774362 0.984443061605 0.905310085331 0.984380833852 0.597688388779 NA 0.177435375809 NA NA NA
bowtie2 test1.bam 0.00493632029864 0.481650684758 0.984443061605 1.15310443503e-24 0.984940883634 0.523268829811 NA 0.269861238192 NA NA NA
bowtie2 test1.bam 0.00493632029864 0.481650684758 0.984443061605 1.15310443503e-24 0.984940883634 0.523268829811 NA 0.269861238192 NA NA NA
Loading

0 comments on commit 41920d5

Please sign in to comment.