Merge branch 'release/v2.2'
dpryan79 committed Mar 1, 2016
2 parents ae628ec + 0ec91d3 commit 5400ccd
Showing 47 changed files with 1,427 additions and 1,244 deletions.
11 changes: 11 additions & 0 deletions CHANGES.txt
@@ -1,3 +1,14 @@
2.2.0

* plotFingerprint now iterates through line styles as well as colors. This allows up to 35 samples per plot without repeating a style (not that that many samples would ever be recommended). This was issue #80.
* Fixed a number of Galaxy wrappers that were rendered incorrectly because they included a section titled "Background".
* A number of image file handles were not explicitly closed, which occasionally caused a plot* program to finish without its output files actually being written. This only happened on some NFS mount points.
* The Galaxy wrappers now support the `--outFileNameData` option on plotProfile and plotHeatmap.
* Added support for blacklist regions. These can be supplied as a BED file and the regions will largely be skipped during processing (they are also ignored during normalization). This is very useful for skipping regions known to attract excess signal. This was issue #101.
* Modified plotPCA to include the actual eigenvalues rather than rescaled ones. Also, plotPCA can now output the underlying values (issue #231).
* Regions within each feature body can now be left unscaled when using `computeMatrix`. If you're interested in the unscaled signal around the TSS/TES, you can now use the `--unscaled5prime` and `--unscaled3prime` options. This was issue #108.
* bamCoverage now has a `--filterRNAstrand` option that produces coverage for only a single strand. Note that the strand referred to is the DNA strand, not sense/anti-sense.
* Issues with plotHeatmap x-axis labels were fixed (issue #301).

2.1.1

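The 2.2.0 entries above introduce several new command-line options. The following is a minimal sketch, not part of this commit, of invoking bamCoverage with two of them from Python; bamCoverage.main() accepts the same argument list as the command-line tool, and all file names are placeholders.

    from deeptools import bamCoverage

    bamCoverage.main([
        "--bam", "sample.bam",                    # indexed BAM file (placeholder)
        "--outFileName", "forward_strand.bw",     # output coverage track (placeholder)
        "--outFileFormat", "bigwig",
        "--filterRNAstrand", "forward",           # new in 2.2.0: coverage for a single DNA strand
        "--blackListFileName", "blacklist.bed",   # new in 2.2.0: regions skipped during processing
    ])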
5 changes: 4 additions & 1 deletion deeptools/SES_scaleFactor.py
@@ -13,7 +13,7 @@

def estimateScaleFactor(bamFilesList, binLength, numberOfSamples,
normalizationLength,
avg_method='median', numberOfProcessors=1,
avg_method='median', blackListFileName=None, numberOfProcessors=1,
verbose=False, chrsToSkip=[]):
r"""
Subdivides the genome into chunks to be analyzed in parallel
@@ -41,6 +41,8 @@ def estimateScaleFactor(bamFilesList, binLength, numberOfSamples,
chrsToSkip : list
names of the chromosomes to be excluded from the
scale estimation. Usually chrX is included in this list.
blackListFileName : str
BED file containing blacklisted regions
Returns
-------
@@ -84,6 +86,7 @@ def estimateScaleFactor(bamFilesList, binLength, numberOfSamples,
binLength=binLength,
numberOfSamples=numberOfSamples,
extendReads=False,
blackListFileName=blackListFileName,
numberOfProcessors=numberOfProcessors,
verbose=verbose,
chrsToSkip=chrsToSkip)
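For illustration, a hypothetical call using the extended signature shown above; the BAM and BED file names are placeholders and the parameter values are arbitrary.

    from deeptools.SES_scaleFactor import estimateScaleFactor

    size_factors = estimateScaleFactor(
        ["treatment.bam", "input.bam"],      # BAM files to compare (placeholders)
        binLength=1000,
        numberOfSamples=10000,
        normalizationLength=1,
        avg_method="median",
        blackListFileName="blacklist.bed",   # new argument: regions excluded from the estimate
        numberOfProcessors=4,
        chrsToSkip=["chrX"],
    )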
2 changes: 1 addition & 1 deletion deeptools/_version.py
@@ -2,4 +2,4 @@
# This file is originally generated from Git information by running 'setup.py
# version'. Distribution tarballs contain a pre-generated copy of this file.

__version__ = '2.1.0'
__version__ = '2.2.0'
8 changes: 8 additions & 0 deletions deeptools/bamCompare.py
@@ -157,7 +157,12 @@ def get_scale_factors(args):
bam2 = bamHandler.openBam(args.bamfile2)

bam1_mapped = parserCommon.bam_total_reads(bam1, args.ignoreForNormalization)
blacklisted = parserCommon.bam_blacklisted_reads(bam1, args.ignoreForNormalization, args.blackListFileName)
bam1_mapped -= blacklisted

bam2_mapped = parserCommon.bam_total_reads(bam2, args.ignoreForNormalization)
blacklisted = parserCommon.bam_blacklisted_reads(bam2, args.ignoreForNormalization, args.blackListFileName)
bam2_mapped -= blacklisted

if args.scaleFactors:
scale_factors = map(float, args.scaleFactors.split(":"))
@@ -167,6 +172,7 @@
[bam1.filename, bam2.filename],
args.sampleLength, args.numberOfSamples,
1,
blackListFileName=args.blackListFileName,
numberOfProcessors=args.numberOfProcessors,
verbose=args.verbose,
chrsToSkip=args.ignoreForNormalization)
@@ -214,6 +220,7 @@ def get_scale_factors(args):
from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
frag_len_dict, read_len_dict = get_read_and_fragment_length(bamfile,
return_lengths=False,
blackListFileName=args.blackListFileName,
numberOfProcessors=args.numberOfProcessors,
verbose=args.verbose)
if args.extendReads:
@@ -295,6 +302,7 @@ def main(args=None):
region=args.region,
numberOfProcessors=args.numberOfProcessors,
extendReads=args.extendReads,
blackListFileName=args.blackListFileName,
minMappingQuality=args.minMappingQuality,
ignoreDuplicates=args.ignoreDuplicates,
center_read=args.centerReads,
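A simplified sketch of the adjustment introduced above: reads overlapping blacklisted regions are subtracted from each file's mapped-read total before scale factors are computed, so blacklisted signal does not inflate the effective library size. The counts below are made up.

    def effective_mapped_reads(total_mapped, blacklisted_reads):
        # Read count used for normalization once blacklisted reads are removed.
        return total_mapped - blacklisted_reads

    bam1_effective = effective_mapped_reads(30000000, 1200000)   # 28800000
    bam2_effective = effective_mapped_reads(25000000, 900000)    # 24100000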
86 changes: 86 additions & 0 deletions deeptools/bamCoverage.py
@@ -77,6 +77,12 @@ def get_optional_args():
'*NOTE*: Requires paired-end data. A bin size of 1 is recommended.',
action='store_true')

optional.add_argument('--filterRNAstrand',
help='Selects RNA-seq reads (single-end or paired-end) '
'originating from the given strand.',
choices=['forward', 'reverse'],
default=None)

return parser


@@ -110,12 +116,15 @@ def get_scale_factor(args):
scale_factor = args.scaleFactor
bam_handle = bamHandler.openBam(args.bam)
bam_mapped = parserCommon.bam_total_reads(bam_handle, args.ignoreForNormalization)
blacklisted = parserCommon.bam_blacklisted_reads(bam_handle, args.ignoreForNormalization, args.blackListFileName)
bam_mapped -= blacklisted

if args.normalizeTo1x:
# try to guess fragment length if the bam file contains paired end reads
from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam,
return_lengths=False,
blackListFileName=args.blackListFileName,
numberOfProcessors=args.numberOfProcessors,
verbose=args.verbose)
if args.extendReads:
@@ -174,12 +183,14 @@ def main(args=None):
debug = 0

func_args = {'scaleFactor': get_scale_factor(args)}

if args.MNase:
# check that library is paired end
# using getFragmentAndReadSize
from deeptools.getFragmentAndReadSize import get_read_and_fragment_length
frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam,
return_lengths=False,
blackListFileName=args.blackListFileName,
numberOfProcessors=args.numberOfProcessors,
verbose=args.verbose)
if frag_len_dict is None:
@@ -189,6 +200,7 @@
binLength=args.binSize,
stepSize=args.binSize,
region=args.region,
blackListFileName=args.blackListFileName,
numberOfProcessors=args.numberOfProcessors,
extendReads=args.extendReads,
minMappingQuality=args.minMappingQuality,
@@ -200,11 +212,29 @@
verbose=args.verbose,
)

elif args.filterRNAstrand:
wr = filterRnaStrand([args.bam],
binLength=args.binSize,
stepSize=args.binSize,
region=args.region,
numberOfProcessors=args.numberOfProcessors,
extendReads=args.extendReads,
minMappingQuality=args.minMappingQuality,
ignoreDuplicates=args.ignoreDuplicates,
center_read=args.centerReads,
zerosToNans=args.skipNonCoveredRegions,
samFlag_include=args.samFlagInclude,
samFlag_exclude=args.samFlagExclude,
verbose=args.verbose,
)

wr.filter_strand = args.filterRNAstrand
else:
wr = writeBedGraph.WriteBedGraph([args.bam],
binLength=args.binSize,
stepSize=args.binSize,
region=args.region,
blackListFileName=args.blackListFileName,
numberOfProcessors=args.numberOfProcessors,
extendReads=args.extendReads,
minMappingQuality=args.minMappingQuality,
@@ -217,6 +247,7 @@
)

wr.run(writeBedGraph.scaleCoverage, func_args, args.outFileName,
blackListFileName=args.blackListFileName,
format=args.outFileFormat, smoothLength=args.smoothLength)


@@ -249,3 +280,58 @@ def get_fragment_from_read(self, read):
fragment_end = fragment_start + 3

return [(fragment_start, fragment_end)]


class filterRnaStrand(writeBedGraph.WriteBedGraph):
"""
Class to redefine the get_fragment_from_read for the --filterRNAstrand case
Only reads either forward or reverse are kept as follows:
For paired-end
--------------
reads forward:
1. alignments of the second in pair (128) if they map to the forward strand (~16)
2. alignments of the first in pair (64) if they map to the reverse strand (~32)
1. include 128, exclude 16
or
2. include 64 exclude 32
reads reverse:
1. alignments of the second in pair (128) if it maps to the reverse strand (16) 128 & 16 = 144
2. alignments of the first in pair (64) if their mates map to the reverse strand (32) 64 & 32 = 96
1. include 144
or
2. include 96
For single-end
--------------
forward: include 16 (map forward strand)
reverse: exclude 16
"""

def get_fragment_from_read(self, read):
"""
Gets only reads for the given strand
"""
fragment_start = fragment_end = None

# paired-end reads: select alignments by their SAM flag combination
if read.is_paired:
if self.filter_strand == 'forward':
if (read.flag & 128 == 128 and read.flag & 16 == 0) or (read.flag & 64 == 64 and read.flag & 32 == 0):
return read.get_blocks()
else:
if read.flag & 144 == 144 or read.flag & 96 == 96:
return read.get_blocks()
else:
if self.filter_strand == 'forward' and read.flag & 16 == 16:
return read.get_blocks()
elif self.filter_strand == 'reverse' and read.flag & 16 == 0:
return read.get_blocks()

return [(fragment_start, fragment_end)]
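To make the bit arithmetic above concrete, here is a small self-contained sketch of the same paired-end flag tests, with a few example SAM flags (16 = read reverse strand, 32 = mate reverse strand, 64 = first in pair, 128 = second in pair). It mirrors the paired-end branches of get_fragment_from_read and is not part of this commit.

    def keeps_forward(flag):
        # Paired-end test used for --filterRNAstrand forward (include 128 / exclude 16, or include 64 / exclude 32).
        return (flag & 128 == 128 and flag & 16 == 0) or (flag & 64 == 64 and flag & 32 == 0)

    def keeps_reverse(flag):
        # Paired-end test used for --filterRNAstrand reverse (include 144, or include 96).
        return flag & 144 == 144 or flag & 96 == 96

    assert keeps_forward(163)       # 1+2+32+128: second in pair, mapped to the forward strand
    assert keeps_reverse(147)       # 1+2+16+128: second in pair, mapped to the reverse strand
    assert keeps_reverse(99)        # 1+2+32+64: first in pair, mate on the reverse strand
    assert not keeps_forward(147)   # a reverse-strand second-in-pair read is excluded from "forward"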
6 changes: 6 additions & 0 deletions deeptools/bamPEFragmentSize.py
@@ -54,6 +54,10 @@ def parse_arguments():
'decreased. (default 1000000)',
default=1000000,
type=int)
parser.add_argument('--blackListFileName', '-bl',
help="A BED file containing regions that should be excluded from all analyses. Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered.",
metavar="BED file",
required=False)
parser.add_argument('--verbose',
help='Set if processing data messages are wanted.',
action='store_true',
@@ -67,6 +71,7 @@
def main(args=None):
args = parse_arguments().parse_args(args)
fragment_len_dict, read_len_dict = get_read_and_fragment_length(args.bam, return_lengths=True,
blackListFileName=args.blackListFileName,
numberOfProcessors=args.numberOfProcessors,
verbose=args.verbose,
binSize=args.binSize,
@@ -111,6 +116,7 @@ def main(args=None):
plt.ylabel('Frequency')
plt.title(args.plotTitle)
plt.savefig(args.histogram, bbox_inches=0)
plt.close()


if __name__ == "__main__":
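A hypothetical call mirroring the change above, assuming a paired-end BAM; the file paths are placeholders.

    from deeptools.getFragmentAndReadSize import get_read_and_fragment_length

    frag_len_dict, read_len_dict = get_read_and_fragment_length(
        "paired_end.bam",                    # placeholder BAM file
        return_lengths=True,
        blackListFileName="blacklist.bed",   # new: fragments in these regions are ignored
        numberOfProcessors=4,
        verbose=False,
    )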
1 change: 1 addition & 0 deletions deeptools/bigwigCompare.py
@@ -98,6 +98,7 @@ def main(args=None):
(args.bigwig2, 'bigwig')],
args.outFileName, 0, FUNC,
function_args, tileSize=args.binSize, region=args.region,
blackListFileName=args.blackListFileName,
numberOfProcessors=args.numberOfProcessors,
format=args.outFileFormat,
smoothLength=False,
27 changes: 4 additions & 23 deletions deeptools/computeGCBias.py
@@ -22,7 +22,7 @@


def parse_arguments(args=None):
parentParser = parserCommon.getParentArgParse(binSize=False)
parentParser = parserCommon.getParentArgParse(binSize=False, blackList=True)
requiredArgs = getRequiredArgs()
parser = argparse.ArgumentParser(
parents=[requiredArgs, parentParser],
@@ -96,26 +96,6 @@ def getRequiredArgs():
help='Number of sampling points to be considered.',
type=int)

optional.add_argument(
'--filterOut', '-fo',
help='BED file containing genomic regions to be excluded '
'from the estimation of the correction. Such regions '
'usually contain repetitive regions and peaks that, if '
'included, would bias the correction. '
'It is recommended to filter out known repetitive regions '
'if multimappers (reads that map to more than one genomic '
'position) were excluded. In the case of ChIP-seq data, '
'it is recommended to first use '
'a peak caller to identify and filter out the identified peaks.\n\n'
'Mappability tracks for different read lengths can be '
'found at: http://hgdownload.cse.ucsc.edu/gbdb/mm9/bbi/ '
'and http://hgdownload.cse.ucsc.edu/gbdb/hg19/bbi \n\n'
'The script: scripts/mappabilityBigWig_to_unmappableBed.sh '
'can be used to get a BED file from the mappability '
'bigWig.',
type=argparse.FileType('r'),
metavar='BED file')

optional.add_argument('--extraSampling',
help='BED file containing genomic regions for which '
'extra sampling is required because they are '
@@ -632,13 +612,14 @@ def plotGCbias(file_name, frequencies, reads_per_gc, region_size, image_format=N
ax2.set_xlim(0.2, 0.7)
plt.tight_layout()
plt.savefig(file_name, bbox_inches='tight', dpi=100, format=image_format)
plt.close()


def main(args=None):
args = parse_arguments().parse_args(args)
# check if directory is writable
if args.filterOut:
filter_out_file = args.filterOut.name
if args.blackListFileName:
filter_out_file = args.blackListFileName
else:
filter_out_file = None
24 changes: 22 additions & 2 deletions deeptools/computeMatrix.py
@@ -176,6 +176,17 @@ def computeMatrixOptArgs(case=['scale-regions', 'reference-point'][0]):
'of the given regions. If the '
'regions are genes, this would be the distance '
'downstream of the transcription end site.')
optional.add_argument("--unscaled5prime",
default=0,
type=int,
help='Number of bases at the 5-prime end of the '
'region to exclude from scaling. By default, '
'each region is scaled to a given length (see the --regionBodyLength option). In some cases it is useful to look at unscaled signals around region boundaries, so this setting specifies the number of unscaled bases on the 5-prime end of each boundary.')
optional.add_argument("--unscaled3prime",
default=0,
type=int,
help='Like --unscaled5prime, but for the 3-prime '
'end.')

elif case == 'reference-point':
optional.add_argument('--referencePoint',
@@ -191,6 +202,8 @@
# set region body length to zero for reference point mode
optional.add_argument('--regionBodyLength', help=argparse.SUPPRESS,
default=0, type=int)
optional.add_argument('--unscaled5prime', default=0, type=int, help=argparse.SUPPRESS)
optional.add_argument('--unscaled3prime', default=0, type=int, help=argparse.SUPPRESS)
optional.add_argument('--beforeRegionStartLength', '-b', '--upstream',
default=500,
type=int,
@@ -279,6 +292,11 @@ def computeMatrixOptArgs(case=['scale-regions', 'reference-point'][0]):
'(e.g. micro satellites) that may bias the average '
'values.')

optional.add_argument('--blackListFileName', '-bl',
help="A BED file containing regions that should be excluded from all analyses. Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered.",
metavar="BED file",
required=False)

# in contrast to other tools,
# computeMatrix by default outputs
# messages and the --quiet flag suppresses them
@@ -360,13 +378,15 @@ def main(args=None):
'nan after end': args.nanAfterEnd,
'proc number': args.numberOfProcessors,
'sort regions': args.sortRegions,
'sort using': args.sortUsing
'sort using': args.sortUsing,
'unscaled 5 prime': args.unscaled5prime,
'unscaled 3 prime': args.unscaled3prime
}

hm = heatmapper.heatmapper()

scores_file_list = args.scoreFileName
hm.computeMatrix(scores_file_list, bed_file, parameters, verbose=args.verbose)
hm.computeMatrix(scores_file_list, bed_file, parameters, blackListFileName=args.blackListFileName, verbose=args.verbose)
if args.sortRegions != 'no':
hm.matrix.sort_groups(sort_using=args.sortUsing, sort_method=args.sortRegions)

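A minimal sketch, not part of this commit, of driving computeMatrix with the new unscaled-region and blacklist options; computeMatrix.main() takes the same argument list as the command-line tool, and the file names are placeholders.

    from deeptools import computeMatrix

    computeMatrix.main([
        "scale-regions",
        "--scoreFileName", "signal.bw",       # bigWig with the signal (placeholder)
        "--regionsFileName", "regions.bed",   # BED file with the regions (placeholder)
        "--outFileName", "matrix.mat.gz",
        "--regionBodyLength", "2000",
        "--unscaled5prime", "200",            # new in 2.2.0: 200 bp at the 5' end left unscaled
        "--unscaled3prime", "200",            # new in 2.2.0: 200 bp at the 3' end left unscaled
        "--blackListFileName", "blacklist.bed",
    ])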
2 changes: 1 addition & 1 deletion deeptools/correctGCBias.py
@@ -21,7 +21,7 @@


def parse_arguments(args=None):
parentParser = parserCommon.getParentArgParse(binSize=True)
parentParser = parserCommon.getParentArgParse(binSize=True, blackList=False)
requiredArgs = getRequiredArgs()
parser = argparse.ArgumentParser(
parents=[requiredArgs, parentParser],