From a613fb2d5998c47d443f94470f471028c9180cc2 Mon Sep 17 00:00:00 2001 From: dpryan79 Date: Wed, 26 Oct 2016 21:40:54 +0200 Subject: [PATCH 1/4] Shhh, don't tell anyone, but plotFingerprint handles bigWig files. The deepBlue support here is temporary. --- deeptools/countReadsPerBin.py | 71 ++++++++++++++++++++++++++++------- deeptools/plotFingerprint.py | 43 ++++++++++++++++++++- deeptools/utilities.py | 8 +++- 3 files changed, 105 insertions(+), 17 deletions(-) diff --git a/deeptools/countReadsPerBin.py b/deeptools/countReadsPerBin.py index 1b971914e..3cef89a44 100644 --- a/deeptools/countReadsPerBin.py +++ b/deeptools/countReadsPerBin.py @@ -10,6 +10,7 @@ from deeptools import bamHandler from deeptools import mapReduce from deeptoolsintervals import GTF +import pyBigWig debug = 0 old_settings = np.seterr(all='ignore') @@ -235,7 +236,13 @@ def run(self, allArgs=None): # workers for analysis. If too short, too much time is spend loading the files # if too long, some processors end up free. # the following values are empirical - bamFilesHandlers = [bamHandler.openBam(x) for x in self.bamFilesList] + bamFilesHandlers = [] + for x in self.bamFilesList: + try: + y = bamHandler.openBam(x) + except: + y = pyBigWig.open(x) + bamFilesHandlers.append(y) chromSizes, non_common = deeptools.utilities.getCommonChrNames(bamFilesHandlers, verbose=self.verbose) # skip chromosome in the list. This is usually for the @@ -263,14 +270,28 @@ def run(self, allArgs=None): min_num_of_samples = int(genomeSize / np.mean(chrLengths)) raise ValueError("numberOfSamples has to be bigger than {} ".format(min_num_of_samples)) - max_mapped = max([x.mapped for x in bamFilesHandlers]) - - reads_per_bp = float(max_mapped) / genomeSize - # chunkSize = int(100 / ( reads_per_bp * len(bamFilesList)) ) - - chunkSize = int(self.stepSize * 1e3 / (reads_per_bp * len(bamFilesHandlers))) + max_mapped = [] + for x in bamFilesHandlers: + try: + max_mapped.append(x.mapped) + except: + # bigWig, use a fixed value + max_mapped.append(0) + max_mapped = max(max_mapped) + + # If max_mapped is 0 (i.e., bigWig input), set chunkSize to a multiple of binLength and use every bin + if max_mapped == 0: + chunkSize = 10000 * self.binLength + self.stepSize = self.binLength + else: + reads_per_bp = float(max_mapped) / genomeSize + chunkSize = int(self.stepSize * 1e3 / (reads_per_bp * len(bamFilesHandlers))) [bam_h.close() for bam_h in bamFilesHandlers] + # Ensure that chunkSize is always at least self.stepSize + if chunkSize < self.stepSize: + chunkSize = self.stepSize + if self.verbose: print("step size is {}".format(self.stepSize)) @@ -388,7 +409,12 @@ def count_reads_in_region(self, chrom, start, end, bed_regions_list=None): start_time = time.time() - bam_handlers = [bamHandler.openBam(bam) for bam in self.bamFilesList] + bam_handlers = [] + for fname in self.bamFilesList: + try: + bam_handlers.append(bamHandler.openBam(fname)) + except: + bam_handlers.append(pyBigWig.open(fname)) blackList = None if self.blackListFileName is not None: @@ -537,11 +563,30 @@ def get_coverage_of_region(self, bamHandle, chrom, regions, start_time = time.time() # caching seems faster. TODO: profile the function c = 0 - if chrom in bamHandle.references: - reads = [r for r in bamHandle.fetch(chrom, regStart, regEnd) - if r.flag & 4 == 0] - else: - raise NameError("chromosome {} not found in bam file".format(chrom)) + try: + # BAM input + if chrom in bamHandle.references: + reads = [r for r in bamHandle.fetch(chrom, regStart, regEnd) + if r.flag & 4 == 0] + else: + raise NameError("chromosome {} not found in bam file".format(chrom)) + except: + # bigWig input, as used by plotFingerprint + if bamHandle.chroms(chrom): + sIdx = 0 + _ = np.array(bamHandle.values(chrom, regStart, regEnd)) + _[np.isnan(_)] = 0.0 + # Partition the values into appropriately sized chunks + parts = np.linspace(0, reg[1] - reg[0], num=nRegBins, endpoint=False, dtype=int) + if parts[-1] != reg[1] - reg[0]: + parts = np.hstack([parts, reg[1] - reg[0]]) + nSteps = len(parts) - 1 + while sIdx < nSteps: + coverages[sIdx] += np.sum(_[parts[sIdx]:parts[sIdx + 1]]) + sIdx += 1 + continue + else: + raise NameError("chromosome {} not found in bigWig file with chroms {}".format(chrom, bamHandle.chroms())) prev_start_pos = None # to store the start positions # of previous processed read pair diff --git a/deeptools/plotFingerprint.py b/deeptools/plotFingerprint.py index 1dbdb4814..5275799d0 100644 --- a/deeptools/plotFingerprint.py +++ b/deeptools/plotFingerprint.py @@ -11,9 +11,12 @@ import matplotlib.pyplot as plt from scipy import interpolate from scipy.stats import poisson +import multiprocessing +import os import deeptools.countReadsPerBin as countR from deeptools import parserCommon +import deeptools.deepBlue as db old_settings = np.seterr(all='ignore') @@ -24,9 +27,10 @@ def parse_arguments(args=None): output_args = get_output_args() optional_args = get_optional_args() read_options_parser = parserCommon.read_options() + deepBlue_optional_parser = parserCommon.deepBlueOptionalArgs() parser = argparse.ArgumentParser( parents=[required_args, output_args, read_options_parser, - optional_args, parent_parser], + optional_args, deepBlue_optional_parser, parent_parser], formatter_class=argparse.ArgumentDefaultsHelpFormatter, description='This tool samples indexed BAM files ' 'and plots a profile of cumulative read coverages for each. ' @@ -318,7 +322,7 @@ def signalAndBinDist(x): # Compute the JSD from the PMFs M = (PMFinput + PMFchip) / 2.0 - JSD = 0.5 * (np.sum(PMFinput * np.log2(PMFinput / M))) + 0.5 * (np.sum(PMFchip * np.log2(PMFchip / M))) + JSD = 0.5 * (np.nansum(PMFinput * np.log2(PMFinput / M))) + 0.5 * (np.nansum(PMFchip * np.log2(PMFchip / M))) return np.sqrt(JSD) @@ -342,6 +346,32 @@ def getExpected(mu): def main(args=None): args = process_args(args) + # Preload deepBlue files, which need to then be deleted + deepBlueFiles = [] + for idx, fname in enumerate(args.bamfiles): + if db.isDeepBlue(fname): + deepBlueFiles.append([fname, idx]) + if len(deepBlueFiles) > 0: + sys.stderr.write("Preloading the following deepBlue files: {}\n".format(",".join([x[0] for x in deepBlueFiles]))) + foo = db.deepBlue(deepBlueFiles[0][0], url=args.deepBlueURL, userKey=args.userKey) + regs = db.makeChromTiles(foo) + for x in deepBlueFiles: + x.extend([args, regs]) + if len(deepBlueFiles) > 1 and args.numberOfProcessors > 1: + pool = multiprocessing.Pool(args.numberOfProcessors) + res = pool.map_async(db.preloadWrapper, deepBlueFiles).get(9999999) + else: + res = list(map(db.preloadWrapper, deepBlueFiles)) + + # substitute the file names with the temp files + for (ftuple, r) in zip(deepBlueFiles, res): + if ftuple[1] == 0: + args.bigwig1 = r + else: + args.bigwig2 = r + deepBlueFiles = [[x[0], x[1]] for x in deepBlueFiles] + del regs + cr = countR.CountReadsPerBin( args.bamfiles, args.binSize, @@ -424,5 +454,14 @@ def main(args=None): args.outQualityMetrics.write("\n") args.outQualityMetrics.close() + # Clean up temporary bigWig files, if applicable + if not args.deepBlueKeepTemp: + for k, v in deepBlueFiles: + os.remove(args.bamfiles[v]) + else: + for k, v in deepBlueFiles: + print("{} is stored in {}".format(k, args.bamfiles[v])) + + if __name__ == "__main__": main() diff --git a/deeptools/utilities.py b/deeptools/utilities.py index d4b0865bd..013b4ab56 100644 --- a/deeptools/utilities.py +++ b/deeptools/utilities.py @@ -107,8 +107,12 @@ def get_chrom_and_size(bam_handler): :param bam_handler: :return: list of (chrom, size) tuples """ - return [(bam_handler.references[i], bam_handler.lengths[i]) - for i in range(0, len(bam_handler.references))] + try: + # BAM file + return [(bam_handler.references[i], bam_handler.lengths[i]) + for i in range(0, len(bam_handler.references))] + except: + return [(k, v) for k, v in bam_handler.chroms().items()] def print_chr_names_and_size(chr_set): sys.stderr.write("chromosome\tlength\n") From bd21d82833cdeaae1cfc58cd821eb8764fa182e8 Mon Sep 17 00:00:00 2001 From: dpryan79 Date: Fri, 28 Oct 2016 09:37:41 +0200 Subject: [PATCH 2/4] Simplify bigWig processing in plotFingerprint. This only makes sense if the bin sizes in the bigWig match what plotFingerprint is told to use. --- deeptools/countReadsPerBin.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/deeptools/countReadsPerBin.py b/deeptools/countReadsPerBin.py index 3cef89a44..17b727d6d 100644 --- a/deeptools/countReadsPerBin.py +++ b/deeptools/countReadsPerBin.py @@ -573,17 +573,9 @@ def get_coverage_of_region(self, bamHandle, chrom, regions, except: # bigWig input, as used by plotFingerprint if bamHandle.chroms(chrom): - sIdx = 0 - _ = np.array(bamHandle.values(chrom, regStart, regEnd)) + _ = np.array(bamHandle.stats(chrom, regStart, regEnd, type="mean", nBins=nRegBins), dtype=np.float) _[np.isnan(_)] = 0.0 - # Partition the values into appropriately sized chunks - parts = np.linspace(0, reg[1] - reg[0], num=nRegBins, endpoint=False, dtype=int) - if parts[-1] != reg[1] - reg[0]: - parts = np.hstack([parts, reg[1] - reg[0]]) - nSteps = len(parts) - 1 - while sIdx < nSteps: - coverages[sIdx] += np.sum(_[parts[sIdx]:parts[sIdx + 1]]) - sIdx += 1 + coverages += _ continue else: raise NameError("chromosome {} not found in bigWig file with chroms {}".format(chrom, bamHandle.chroms())) From b7dcfdcb763fa5c0dc2d40f6bbf2e8a7ab830d43 Mon Sep 17 00:00:00 2001 From: dpryan79 Date: Wed, 2 Nov 2016 14:15:48 +0100 Subject: [PATCH 3/4] Remove deepBlue support from plotFingerprint, in prep for merging --- deeptools/plotFingerprint.py | 38 +----------------------------------- 1 file changed, 1 insertion(+), 37 deletions(-) diff --git a/deeptools/plotFingerprint.py b/deeptools/plotFingerprint.py index 5275799d0..dd0fab8b7 100644 --- a/deeptools/plotFingerprint.py +++ b/deeptools/plotFingerprint.py @@ -16,7 +16,6 @@ import deeptools.countReadsPerBin as countR from deeptools import parserCommon -import deeptools.deepBlue as db old_settings = np.seterr(all='ignore') @@ -27,10 +26,9 @@ def parse_arguments(args=None): output_args = get_output_args() optional_args = get_optional_args() read_options_parser = parserCommon.read_options() - deepBlue_optional_parser = parserCommon.deepBlueOptionalArgs() parser = argparse.ArgumentParser( parents=[required_args, output_args, read_options_parser, - optional_args, deepBlue_optional_parser, parent_parser], + optional_args, parent_parser], formatter_class=argparse.ArgumentDefaultsHelpFormatter, description='This tool samples indexed BAM files ' 'and plots a profile of cumulative read coverages for each. ' @@ -346,32 +344,6 @@ def getExpected(mu): def main(args=None): args = process_args(args) - # Preload deepBlue files, which need to then be deleted - deepBlueFiles = [] - for idx, fname in enumerate(args.bamfiles): - if db.isDeepBlue(fname): - deepBlueFiles.append([fname, idx]) - if len(deepBlueFiles) > 0: - sys.stderr.write("Preloading the following deepBlue files: {}\n".format(",".join([x[0] for x in deepBlueFiles]))) - foo = db.deepBlue(deepBlueFiles[0][0], url=args.deepBlueURL, userKey=args.userKey) - regs = db.makeChromTiles(foo) - for x in deepBlueFiles: - x.extend([args, regs]) - if len(deepBlueFiles) > 1 and args.numberOfProcessors > 1: - pool = multiprocessing.Pool(args.numberOfProcessors) - res = pool.map_async(db.preloadWrapper, deepBlueFiles).get(9999999) - else: - res = list(map(db.preloadWrapper, deepBlueFiles)) - - # substitute the file names with the temp files - for (ftuple, r) in zip(deepBlueFiles, res): - if ftuple[1] == 0: - args.bigwig1 = r - else: - args.bigwig2 = r - deepBlueFiles = [[x[0], x[1]] for x in deepBlueFiles] - del regs - cr = countR.CountReadsPerBin( args.bamfiles, args.binSize, @@ -454,14 +426,6 @@ def main(args=None): args.outQualityMetrics.write("\n") args.outQualityMetrics.close() - # Clean up temporary bigWig files, if applicable - if not args.deepBlueKeepTemp: - for k, v in deepBlueFiles: - os.remove(args.bamfiles[v]) - else: - for k, v in deepBlueFiles: - print("{} is stored in {}".format(k, args.bamfiles[v])) - if __name__ == "__main__": main() From 0cb808a0ff761ea59f3ec9d136cffcdc820ccabf Mon Sep 17 00:00:00 2001 From: dpryan79 Date: Wed, 2 Nov 2016 14:33:29 +0100 Subject: [PATCH 4/4] PEP8 and fix miniconda URL --- .travis.yml | 4 ++-- deeptools/plotFingerprint.py | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 5200d19e2..f5717c024 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,8 +10,8 @@ os: before_install: - if [[ "$TRAVIS_OS_NAME" == "linux" && "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then pip install virtualenv --user ; fi - if [[ "$TRAVIS_OS_NAME" == "linux" && "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then virtualenv foo; source foo/bin/activate; pip install planemo ; deactivate ; fi - - if [[ "$TRAVIS_OS_NAME" == "linux" && "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then curl http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -o miniconda.sh ; fi - - if [[ "$TRAVIS_OS_NAME" == "linux" && "$TRAVIS_PYTHON_VERSION" == "3.5" ]]; then curl http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -o miniconda.sh ; fi + - if [[ "$TRAVIS_OS_NAME" == "linux" && "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then curl https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -o miniconda.sh ; fi + - if [[ "$TRAVIS_OS_NAME" == "linux" && "$TRAVIS_PYTHON_VERSION" == "3.5" ]]; then curl https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -o miniconda.sh ; fi - if [[ "$TRAVIS_OS_NAME" == "osx" && "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then curl https://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh -o miniconda.sh ; fi - if [[ "$TRAVIS_OS_NAME" == "osx" && "$TRAVIS_PYTHON_VERSION" == "3.5" ]]; then curl https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -o miniconda.sh ; fi - bash miniconda.sh -b -p $HOME/miniconda diff --git a/deeptools/plotFingerprint.py b/deeptools/plotFingerprint.py index dd0fab8b7..abf40c9d0 100644 --- a/deeptools/plotFingerprint.py +++ b/deeptools/plotFingerprint.py @@ -11,8 +11,6 @@ import matplotlib.pyplot as plt from scipy import interpolate from scipy.stats import poisson -import multiprocessing -import os import deeptools.countReadsPerBin as countR from deeptools import parserCommon