Merge pull request #454 from fidelram/fingerprint_summation

Fingerprint summation
deeptools · Dec 2, 2016 · 41920d5 · 41920d5
2 parents bd865bb + c896eda
commit 41920d5
Show file tree

Hide file tree

Showing 9 changed files with 498 additions and 278 deletions.
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -4,6 +4,7 @@
  * Scale factor is now set to 1 in bamCoverage if no normalization is used. The fact that this wasn't being done previously was a bug.
  * Fixed a bug (#451) affecting BED files with a `deepTools_group` column that caused a problem with `--sortRegions keep` in computeMatrix.
  * Fixed a bug where some matrices produced with `computeMatrixOperations cbind` would result in the right-most samples sometimes getting squished due to having ticks outside of their graph bounds. Ticks are now scaled if they don't match the data range (issue #452).
+ * In plotFingerprint, the number of reads per-bin are no longer used. Instead, the sum of the per-base coverage (or signal if bigWig input is used) is used. This leads to more similar metrics produced by us and others regarding things like Jensen-Shannon metrics. For those just interested in the plots, there's little effective change here.
 
 2.4.0
 

diff --git a/deeptools/SES_scaleFactor.py b/deeptools/SES_scaleFactor.py
@@ -102,7 +102,6 @@ def estimateScaleFactor(bamFilesList, binLength, numberOfSamples,
     # the transpose is taken to easily iterate by columns which are now
     # converted to rows
     num_reads_per_bin = num_reads_per_bin.transpose()
-#    np.savetxt("/home/ramirez/tmp/test.num_reads", num_reads_per_bin)
     # size factors based on order statistics
     # see Signal extraction scaling (SES) method in: Diaz et al (2012)
     # Normalization, bias correction, and peak calling for ChIP-seq.

diff --git a/deeptools/countReadsPerBin.py b/deeptools/countReadsPerBin.py
@@ -563,22 +563,11 @@ def get_coverage_of_region(self, bamHandle, chrom, regions,
             start_time = time.time()
             # caching seems faster. TODO: profile the function
             c = 0
-            try:
-                # BAM input
-                if chrom in bamHandle.references:
-                    reads = [r for r in bamHandle.fetch(chrom, regStart, regEnd)
-                             if r.flag & 4 == 0]
-                else:
-                    raise NameError("chromosome {} not found in bam file".format(chrom))
-            except:
-                # bigWig input, as used by plotFingerprint
-                if bamHandle.chroms(chrom):
-                    _ = np.array(bamHandle.stats(chrom, regStart, regEnd, type="mean", nBins=nRegBins), dtype=np.float)
-                    _[np.isnan(_)] = 0.0
-                    coverages += _
-                    continue
-                else:
-                    raise NameError("chromosome {} not found in bigWig file with chroms {}".format(chrom, bamHandle.chroms()))
+            if chrom in bamHandle.references:
+                reads = [r for r in bamHandle.fetch(chrom, regStart, regEnd)
+                         if r.flag & 4 == 0]
+            else:
+                raise NameError("chromosome {} not found in bam file".format(chrom))
 
             prev_start_pos = None  # to store the start positions
             # of previous processed read pair
@@ -625,6 +614,11 @@ def get_coverage_of_region(self, bamHandle, chrom, regions,
                     if fragmentEnd <= reg[0] or fragmentStart >= reg[1]:
                         continue
 
+                    if fragmentStart < reg[0]:
+                        fragmentStart = reg[0]
+                    if fragmentEnd > reg[0] + len(coverages) * tileSize:
+                        fragmentEnd = reg[0] + len(coverages) * tileSize
+
                     sIdx = vector_start + max((fragmentStart - reg[0]) // tileSize, 0)
                     eIdx = vector_start + min(np.ceil(float(fragmentEnd - reg[0]) / tileSize).astype('int'), nRegBins)
                     if last_eIdx is not None:

diff --git a/deeptools/plotFingerprint.py b/deeptools/plotFingerprint.py
@@ -13,6 +13,7 @@
 from scipy.stats import poisson
 
 import deeptools.countReadsPerBin as countR
+import deeptools.sumCoveragePerBin as sumR
 from deeptools import parserCommon
 
 old_settings = np.seterr(all='ignore')
@@ -342,7 +343,7 @@ def getExpected(mu):
 def main(args=None):
     args = process_args(args)
 
-    cr = countR.CountReadsPerBin(
+    cr = sumR.SumCoveragePerBin(
         args.bamfiles,
         args.binSize,
         args.numberOfSamples,

diff --git a/deeptools/sumCoveragePerBin.py b/deeptools/sumCoveragePerBin.py
@@ -0,0 +1,225 @@
+import numpy as np
+import multiprocessing
+import time
+
+from deeptools import countReadsPerBin
+from deeptoolsintervals import GTF
+
+
+class SumCoveragePerBin(countReadsPerBin.CountReadsPerBin):
+    r"""This is an extension of CountReadsPerBin for use with plotFingerprint.
+    There, we need to sum the per-base coverage.
+    """
+    def get_coverage_of_region(self, bamHandle, chrom, regions,
+                               fragmentFromRead_func=None):
+        """
+        Returns a numpy array that corresponds to the number of reads
+        that overlap with each tile.
+
+        >>> test = Tester()
+        >>> import pysam
+        >>> c = SumCoveragePerBin([], stepSize=1, extendReads=300)
+
+        For this case the reads are length 36. The number of overlapping
+        read fragments is 4 and 5 for the positions tested. Note that reads are
+        NOT extended, due to there being a 0 length input list of BAM files!
+
+        >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile_PE), 'chr2',
+        ... [(5000833, 5000834), (5000834, 5000835)])
+        array([ 4.,  5.])
+
+        In the following  case the reads length is 50. Reads are not extended.
+
+        >>> c.extendReads=False
+        >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile2), '3R', [(148, 150), (150, 152), (152, 154)])
+        array([ 2.,  4.,  4.])
+
+
+        """
+        if not fragmentFromRead_func:
+            fragmentFromRead_func = self.get_fragment_from_read
+        nbins = len(regions)
+        if len(regions[0]) == 3:
+            nbins = 0
+            for reg in regions:
+                nbins += (reg[1] - reg[0]) // reg[2]
+        coverages = np.zeros(nbins, dtype='float64')
+
+        if self.defaultFragmentLength == 'read length':
+            extension = 0
+        else:
+            extension = self.maxPairedFragmentLength
+
+        blackList = None
+        if self.blackListFileName is not None:
+            blackList = GTF(self.blackListFileName)
+
+        vector_start = 0
+        for idx, reg in enumerate(regions):
+            if len(reg) == 3:
+                tileSize = int(reg[2])
+                nRegBins = (reg[1] - reg[0]) // tileSize
+            else:
+                nRegBins = 1
+                tileSize = int(reg[1] - reg[0])
+
+            # Blacklisted regions have a coverage of 0
+            if blackList and blackList.findOverlaps(chrom, reg[0], reg[1]):
+                continue
+            regStart = int(max(0, reg[0] - extension))
+            regEnd = reg[1] + int(extension)
+
+            # If alignments are extended and there's a blacklist, ensure that no
+            # reads originating in a blacklist are fetched
+            if blackList and reg[0] > 0 and extension > 0:
+                o = blackList.findOverlaps(chrom, regStart, reg[0])
+                if o is not None and len(o) > 0:
+                    regStart = o[-1][1]
+                o = blackList.findOverlaps(chrom, reg[1], regEnd)
+                if o is not None and len(o) > 0:
+                    regEnd = o[0][0]
+
+            start_time = time.time()
+            # caching seems faster. TODO: profile the function
+            c = 0
+            try:
+                # BAM input
+                if chrom in bamHandle.references:
+                    reads = [r for r in bamHandle.fetch(chrom, regStart, regEnd)
+                             if r.flag & 4 == 0]
+                else:
+                    raise NameError("chromosome {} not found in bam file".format(chrom))
+            except:
+                # bigWig input, as used by plotFingerprint
+                if bamHandle.chroms(chrom):
+                    _ = np.array(bamHandle.stats(chrom, regStart, regEnd, type="mean", nBins=nRegBins), dtype=np.float)
+                    _[np.isnan(_)] = 0.0
+                    _ = _ * tileSize
+                    coverages += _
+                    continue
+                else:
+                    raise NameError("chromosome {} not found in bigWig file with chroms {}".format(chrom, bamHandle.chroms()))
+
+            prev_start_pos = None  # to store the start positions
+            # of previous processed read pair
+            for read in reads:
+                if self.minMappingQuality and read.mapq < self.minMappingQuality:
+                    continue
+
+                # filter reads based on SAM flag
+                if self.samFlag_include and read.flag & self.samFlag_include != self.samFlag_include:
+                    continue
+                if self.samFlag_exclude and read.flag & self.samFlag_exclude != 0:
+                    continue
+
+                # Fragment lengths
+                if self.minFragmentLength > 0 and abs(read.template_length) < self.minFragmentLength:
+                    continue
+                if self.maxFragmentLength > 0 and abs(read.template_length) > self.maxFragmentLength:
+                    continue
+
+                # get rid of duplicate reads that have same position on each of the
+                # pairs
+                if self.ignoreDuplicates and prev_start_pos \
+                        and prev_start_pos == (read.reference_start, read.pnext, read.is_reverse):
+                    continue
+
+                # since reads can be split (e.g. RNA-seq reads) each part of the
+                # read that maps is called a position block.
+                try:
+                    position_blocks = fragmentFromRead_func(read)
+                except TypeError:
+                    # the get_fragment_from_read functions returns None in some cases.
+                    # Those cases are to be skipped, hence the continue line.
+                    continue
+
+                last_eIdx = None
+                for fragmentStart, fragmentEnd in position_blocks:
+                    if fragmentEnd is None or fragmentStart is None:
+                        continue
+                    fragmentLength = fragmentEnd - fragmentStart
+                    if fragmentLength == 0:
+                        continue
+                    # skip reads that are not in the region being
+                    # evaluated.
+                    if fragmentEnd <= reg[0] or fragmentStart >= reg[1]:
+                        continue
+
+                    if fragmentStart < reg[0]:
+                        fragmentStart = reg[0]
+                    if fragmentEnd > reg[0] + len(coverages) * tileSize:
+                        fragmentEnd = reg[0] + len(coverages) * tileSize
+
+                    sIdx = vector_start + max((fragmentStart - reg[0]) // tileSize, 0)
+                    eIdx = vector_start + min(np.ceil(float(fragmentEnd - reg[0]) / tileSize).astype('int'), nRegBins)
+                    if eIdx >= len(coverages):
+                        eIdx = len(coverages) - 1
+                    if last_eIdx is not None:
+                        sIdx = max(last_eIdx, sIdx)
+                        if sIdx >= eIdx:
+                            continue
+
+                    # First bin
+                    if fragmentEnd < reg[0] + (sIdx + 1) * tileSize:
+                        _ = fragmentEnd - fragmentStart
+                    else:
+                        _ = reg[0] + (sIdx + 1) * tileSize - fragmentStart
+                    if _ > tileSize:
+                        _ = tileSize
+                    coverages[sIdx] += _
+                    _ = sIdx + 1
+                    while _ < eIdx:
+                        coverages[_] += tileSize
+                        _ += 1
+                    while eIdx - sIdx >= nRegBins:
+                        eIdx -= 1
+                    if eIdx > sIdx:
+                        _ = fragmentEnd - (reg[0] + eIdx * tileSize)
+                        if _ > tileSize:
+                            _ = tileSize
+                        elif _ < 0:
+                            _ = 0
+                        coverages[eIdx] += _
+                    last_eIdx = eIdx
+
+                prev_start_pos = (read.reference_start, read.pnext, read.is_reverse)
+                c += 1
+
+            if self.verbose:
+                endTime = time.time()
+                print("%s,  processing %s (%.1f per sec) reads @ %s:%s-%s" % (
+                    multiprocessing.current_process().name, c, c / (endTime - start_time), chrom, reg[0], reg[1]))
+
+            vector_start += nRegBins
+
+        # change zeros to NAN
+        if self.zerosToNans:
+            coverages[coverages == 0] = np.nan
+
+        return coverages
+
+
+class Tester(object):
+
+    def __init__(self):
+        """
+        The distribution of reads between the two bam files is as follows.
+
+        They cover 200 bp
+
+          0                              100                           200
+          |------------------------------------------------------------|
+        A                                ===============
+                                                        ===============
+
+
+        B                 ===============               ===============
+                                         ===============
+                                                        ===============
+        """
+        import os
+        self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/"
+        self.bamFile1 = self.root + "testA.bam"
+        self.bamFile2 = self.root + "testB.bam"
+        self.bamFile_PE = self.root + "test_paired2.bam"
+        self.chrom = '3R'
diff --git a/docs/content/tools/plotFingerprint.rst b/docs/content/tools/plotFingerprint.rst
@@ -21,14 +21,14 @@ It determines how well the signal in the ChIP-seq sample can be differentiated f
 For factors that will enrich well-defined, rather narrow regions (e.g. transcription factors such as p300), the resulting plot can be used to assess the strength of a ChIP, but the broader the enrichments are to be expected, the less clear the plot will be.
 Vice versa, if you do not know what kind of signal to expect, the fingerprint plot will give you a straight-forward indication of how careful you will have to be during your downstream analyses to separate biological noise from meaningful signal.
 
-Similar to ``multiBamSummary``, ``plotFingerprint`` randomly samples genome regions (bins) of a specified length and counts the reads from indexed [BAM][] files that overlap with those regions.
-These counts are then sorted according to their rank and the cumulative sum of read counts is plotted. 
+Similar to ``multiBamSummary``, ``plotFingerprint`` randomly samples genome regions (bins) of a specified length and sums the per-base coverage in indexed [BAM][] (or bigWig) files that overlap with those regions.
+These values are then sorted according to their rank and the cumulative sum of read counts is plotted. 
 
 
 What the plots tell you
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
-An ideal [input][] with perfect uniform distribution of reads along the genome (i.e. without enrichments in open chromatin etc.) should generate a straight diagonal line. A very specific and strong ChIP enrichment will be indicated by a prominent and steep rise of the cumulative sum towards the highest rank. This means that a big chunk of reads from the ChIP sample is located in few bins which corresponds to high, narrow enrichments typically seen for transcription factors.
+An ideal [input][] with perfect uniform distribution of reads along the genome (i.e. without enrichments in open chromatin etc.) and infinite sequencing coverage should generate a straight diagonal line. A very specific and strong ChIP enrichment will be indicated by a prominent and steep rise of the cumulative sum towards the highest rank. This means that a big chunk of reads from the ChIP sample is located in few bins which corresponds to high, narrow enrichments typically seen for transcription factors.
 
 Here you see 3 different fingerprint plots.
 We chose these examples to show you how the nature of the ChIP signal (narrow and high vs. wide and not extremely high) is reflected in the "fingerprint" plots. 
@@ -58,7 +58,7 @@ The following example generates the fingerprints for the invididual ENCODE histo
 
 .. image:: ../../images/test_plots/fingerprints.png
 
-The table that you can obtain via ``--outRawCounts`` simply contains the number of reads overlapping with each individually sampled genome bin. For the plot above, each column is sorted in increasing order and then the cumulative sum is plotted.
+The table that you can obtain via ``--outRawCounts`` simply contains the sum of the per-base coverage inside each sampled genome bin. For the plot above, each column is sorted in increasing order and then the cumulative sum is plotted.
 
 .. code:: bash
 

diff --git a/galaxy/wrapper/plotFingerprint.xml b/galaxy/wrapper/plotFingerprint.xml
@@ -148,7 +148,7 @@ Output
 
 The default output is a diagnostic plot (see below for an example and further down for some background information).
 
-Optionally, you can obtain the table of raw counts that was used to generate the plot.
+Optionally, you can obtain the table of raw values that were used to generate the plot.
 
 .. image:: $PATH_TO_IMAGES/plotFingerprint_output.png
   :width: 600
@@ -170,10 +170,10 @@ What follows is the output of ``plotFingerprint`` with our test ChIP-Seq data se
 Theoretical Background
 ----------------------
 
-The tool first samples indexed BAM files and counts all reads overlapping a window (bin) of the specified length.
-These counts are then sorted according to their rank (the bin with the highest number of reads has the highest rank)
-and the cumulative sum of read counts are plotted. An ideal input (control) with a uniform distribution of reads alignments
-result in a diagonal line. A very specific and strong ChIP enrichment, on the other hand, would result in a large portion
+The tool first samples indexed BAM files and sums the per-base coverage of reads/fragments overlapping a window (bin) of the specified length.
+These values are then sorted according to their rank (the bin with the highest number of reads has the highest rank)
+and the cumulative sum plotted. An ideal input (control) with a uniform distribution of reads alignments
+and infinite sequencing depth will result in a diagonal line. A very specific and strong ChIP enrichment, on the other hand, would result in a large portion
 of reads accumulating in just a few bins and the resulting plot showing a steep rise toward the right-most edge. Such results are
 most commonly seen with transcription factors.
 

diff --git a/galaxy/wrapper/test-data/plotFingerprint_quality_metrics.tabular b/galaxy/wrapper/test-data/plotFingerprint_quality_metrics.tabular
@@ -1,3 +1,3 @@
 Sample	AUC	Synthetic AUC	X-intercept	Synthetic X-intercept	Elbow Point	Synthetic Elbow Point	JS Distance	Synthetic JS Distance	% genome enriched	diff. enrichment	CHANCE divergence
-bowtie2 test1.bam	0.00739484047583	0.270844774362	0.984443061605	0.905310085331	0.984380833852	0.597688388779	NA	0.177435375809	NA	NA	NA
-bowtie2 test1.bam	0.00739484047583	0.270844774362	0.984443061605	0.905310085331	0.984380833852	0.597688388779	NA	0.177435375809	NA	NA	NA
+bowtie2 test1.bam	0.00493632029864	0.481650684758	0.984443061605	1.15310443503e-24	0.984940883634	0.523268829811	NA	0.269861238192	NA	NA	NA
+bowtie2 test1.bam	0.00493632029864	0.481650684758	0.984443061605	1.15310443503e-24	0.984940883634	0.523268829811	NA	0.269861238192	NA	NA	NA