-
Notifications
You must be signed in to change notification settings - Fork 214
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #454 from fidelram/fingerprint_summation
Fingerprint summation
- Loading branch information
Showing
9 changed files
with
498 additions
and
278 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,225 @@ | ||
import numpy as np | ||
import multiprocessing | ||
import time | ||
|
||
from deeptools import countReadsPerBin | ||
from deeptoolsintervals import GTF | ||
|
||
|
||
class SumCoveragePerBin(countReadsPerBin.CountReadsPerBin):
    r"""This is an extension of CountReadsPerBin for use with plotFingerprint.
    There, we need to sum the per-base coverage.
    """

    def get_coverage_of_region(self, bamHandle, chrom, regions,
                               fragmentFromRead_func=None):
        """
        Returns a numpy array holding, for each bin, the summed per-base
        coverage, i.e. the number of bases of each (possibly extended)
        read fragment that fall inside the bin, added up over all fragments.
        With bins of width 1 this equals the number of overlapping fragments.

        Parameters
        ----------
        bamHandle
            Either a BAM-like handle exposing ``references`` and
            ``fetch(chrom, start, end)``, or a bigWig-like handle exposing
            ``chroms(...)`` and ``stats(...)``. Handles without a
            ``references`` attribute are treated as bigWig input.
        chrom
            Name of the chromosome/contig to query.
        regions
            Non-empty sequence of ``(start, end)`` or
            ``(start, end, tileSize)`` tuples. With two elements each region
            is one bin; with three elements each region is split into
            ``(end - start) // tileSize`` bins. The layout of the first
            region decides how the output length is computed.
        fragmentFromRead_func
            Callable mapping a read to a list of ``(start, end)`` blocks.
            Defaults to ``self.get_fragment_from_read``.

        Returns
        -------
        numpy.ndarray
            float64 vector with one entry per bin, in region order. Bins of
            blacklisted regions stay 0. If ``self.zerosToNans`` is set, zeros
            are replaced by NaN.

        Raises
        ------
        NameError
            If ``chrom`` is not present in the BAM or bigWig file.

        >>> test = Tester()
        >>> import pysam
        >>> c = SumCoveragePerBin([], stepSize=1, extendReads=300)

        For this case the reads are length 36. The number of overlapping
        read fragments is 4 and 5 for the positions tested. Note that reads are
        NOT extended, due to there being a 0 length input list of BAM files!

        >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile_PE), 'chr2',
        ...                          [(5000833, 5000834), (5000834, 5000835)])
        array([ 4., 5.])

        In the following case the reads length is 50. Reads are not extended.

        >>> c.extendReads=False
        >>> c.get_coverage_of_region(pysam.AlignmentFile(test.bamFile2), '3R', [(148, 150), (150, 152), (152, 154)])
        array([ 2., 4., 4.])
        """
        if not fragmentFromRead_func:
            fragmentFromRead_func = self.get_fragment_from_read

        # One bin per region, unless the regions carry a tile size as third
        # element, in which case every region contributes several bins.
        nbins = len(regions)
        if len(regions[0]) == 3:
            nbins = 0
            for reg in regions:
                nbins += (reg[1] - reg[0]) // reg[2]
        coverages = np.zeros(nbins, dtype='float64')

        # How far to look beyond the region for reads whose extended
        # fragment may reach into it.
        if self.defaultFragmentLength == 'read length':
            extension = 0
        else:
            extension = self.maxPairedFragmentLength

        blackList = None
        if self.blackListFileName is not None:
            blackList = GTF(self.blackListFileName)

        vector_start = 0
        for reg in regions:
            if len(reg) == 3:
                tileSize = int(reg[2])
                nRegBins = (reg[1] - reg[0]) // tileSize
            else:
                nRegBins = 1
                tileSize = int(reg[1] - reg[0])

            # Reserve this region's bins before any early ``continue`` so
            # that skipped regions (blacklist, bigWig) never shift the bins
            # of the regions that follow them.
            binOffset = vector_start
            vector_start += nRegBins

            # Blacklisted regions have a coverage of 0
            if blackList and blackList.findOverlaps(chrom, reg[0], reg[1]):
                continue
            regStart = int(max(0, reg[0] - extension))
            regEnd = reg[1] + int(extension)

            # If alignments are extended and there's a blacklist, ensure that no
            # reads originating in a blacklist are fetched
            if blackList and reg[0] > 0 and extension > 0:
                o = blackList.findOverlaps(chrom, regStart, reg[0])
                if o is not None and len(o) > 0:
                    regStart = o[-1][1]
                o = blackList.findOverlaps(chrom, reg[1], regEnd)
                if o is not None and len(o) > 0:
                    regEnd = o[0][0]

            start_time = time.time()
            c = 0  # number of reads processed for this region
            if hasattr(bamHandle, 'references'):
                # BAM input. Dispatching explicitly (instead of a bare
                # ``except``) lets this NameError and any genuine fetch
                # error reach the caller unmasked.
                if chrom not in bamHandle.references:
                    raise NameError("chromosome {} not found in bam file".format(chrom))
                # caching seems faster. TODO: profile the function
                # ``flag & 4`` is the SAM "read unmapped" bit.
                reads = [r for r in bamHandle.fetch(chrom, regStart, regEnd)
                         if r.flag & 4 == 0]
            else:
                # bigWig input, as used by plotFingerprint
                if not bamHandle.chroms(chrom):
                    raise NameError("chromosome {} not found in bigWig file with chroms {}".format(chrom, bamHandle.chroms()))
                binMeans = np.array(bamHandle.stats(chrom, regStart, regEnd, type="mean", nBins=nRegBins),
                                    dtype=np.float64)
                binMeans[np.isnan(binMeans)] = 0.0
                # mean coverage * bin width == summed per-base coverage.
                # Only this region's bins are updated.
                coverages[binOffset:binOffset + nRegBins] += binMeans * tileSize
                continue

            prev_start_pos = None  # to store the start positions
            # of previous processed read pair
            for read in reads:
                if self.minMappingQuality and read.mapq < self.minMappingQuality:
                    continue

                # filter reads based on SAM flag
                if self.samFlag_include and read.flag & self.samFlag_include != self.samFlag_include:
                    continue
                if self.samFlag_exclude and read.flag & self.samFlag_exclude != 0:
                    continue

                # Fragment lengths
                if self.minFragmentLength > 0 and abs(read.template_length) < self.minFragmentLength:
                    continue
                if self.maxFragmentLength > 0 and abs(read.template_length) > self.maxFragmentLength:
                    continue

                # get rid of duplicate reads that have same position on each of the
                # pairs
                if self.ignoreDuplicates and prev_start_pos \
                        and prev_start_pos == (read.reference_start, read.pnext, read.is_reverse):
                    continue

                # since reads can be split (e.g. RNA-seq reads) each part of the
                # read that maps is called a position block.
                try:
                    position_blocks = fragmentFromRead_func(read)
                except TypeError:
                    # the get_fragment_from_read functions returns None in some cases.
                    # Those cases are to be skipped, hence the continue line.
                    continue
                if position_blocks is None:
                    # A None result would otherwise fail on iteration below.
                    continue

                # NOTE(review): the boundary arithmetic below combines
                # offsets relative to reg[0] with bin indices that include
                # this region's offset into ``coverages``; it is kept
                # exactly as before. Confirm its behaviour for inputs with
                # several regions.
                last_eIdx = None
                for fragmentStart, fragmentEnd in position_blocks:
                    if fragmentEnd is None or fragmentStart is None:
                        continue
                    fragmentLength = fragmentEnd - fragmentStart
                    if fragmentLength == 0:
                        continue
                    # skip reads that are not in the region being
                    # evaluated.
                    if fragmentEnd <= reg[0] or fragmentStart >= reg[1]:
                        continue

                    # Clip the fragment to the area covered by the bins.
                    if fragmentStart < reg[0]:
                        fragmentStart = reg[0]
                    if fragmentEnd > reg[0] + len(coverages) * tileSize:
                        fragmentEnd = reg[0] + len(coverages) * tileSize

                    sIdx = binOffset + max((fragmentStart - reg[0]) // tileSize, 0)
                    eIdx = binOffset + min(np.ceil(float(fragmentEnd - reg[0]) / tileSize).astype('int'), nRegBins)
                    if eIdx >= len(coverages):
                        eIdx = len(coverages) - 1
                    if last_eIdx is not None:
                        # Do not revisit bins already handled by an earlier
                        # block of the same read.
                        sIdx = max(last_eIdx, sIdx)
                        if sIdx >= eIdx:
                            continue

                    # First bin
                    if fragmentEnd < reg[0] + (sIdx + 1) * tileSize:
                        _ = fragmentEnd - fragmentStart
                    else:
                        _ = reg[0] + (sIdx + 1) * tileSize - fragmentStart
                    if _ > tileSize:
                        _ = tileSize
                    coverages[sIdx] += _
                    # Bins strictly between the first and the last one are
                    # fully covered.
                    _ = sIdx + 1
                    while _ < eIdx:
                        coverages[_] += tileSize
                        _ += 1
                    # Last bin (only if distinct from the first one)
                    while eIdx - sIdx >= nRegBins:
                        eIdx -= 1
                    if eIdx > sIdx:
                        _ = fragmentEnd - (reg[0] + eIdx * tileSize)
                        if _ > tileSize:
                            _ = tileSize
                        elif _ < 0:
                            _ = 0
                        coverages[eIdx] += _
                    last_eIdx = eIdx

                prev_start_pos = (read.reference_start, read.pnext, read.is_reverse)
                c += 1

            if self.verbose:
                elapsed = time.time() - start_time
                # Avoid ZeroDivisionError when the clock did not advance.
                rate = c / elapsed if elapsed > 0 else 0.0
                print("%s, processing %s (%.1f per sec) reads @ %s:%s-%s" % (
                    multiprocessing.current_process().name, c, rate, chrom, reg[0], reg[1]))

        # change zeros to NAN
        if self.zerosToNans:
            coverages[coverages == 0] = np.nan

        return coverages
|
||
|
||
class Tester(object):
    """Holds the paths of the BAM test files used by the doctests."""

    def __init__(self):
        """
        Build the test-data paths relative to the directory of this module.

        The distribution of reads between the two bam files is as follows.
        They cover 200 bp

          0                              100                           200
          |------------------------------------------------------------|
        A                                ===============
                                                        ===============
        B                 ===============               ===============
                                         ===============
                                                        ===============

        NOTE(review): the horizontal placement of the reads in this sketch
        is approximate -- confirm against the actual test data.
        """
        import os

        module_dir = os.path.dirname(os.path.abspath(__file__))
        self.root = "{}/test/test_data/".format(module_dir)
        self.bamFile1 = "{}testA.bam".format(self.root)
        self.bamFile2 = "{}testB.bam".format(self.root)
        self.bamFile_PE = "{}test_paired2.bam".format(self.root)
        self.chrom = '3R'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
4 changes: 2 additions & 2 deletions
4
galaxy/wrapper/test-data/plotFingerprint_quality_metrics.tabular
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,3 @@ | ||
Sample AUC Synthetic AUC X-intercept Synthetic X-intercept Elbow Point Synthetic Elbow Point JS Distance Synthetic JS Distance % genome enriched diff. enrichment CHANCE divergence | ||
bowtie2 test1.bam 0.00739484047583 0.270844774362 0.984443061605 0.905310085331 0.984380833852 0.597688388779 NA 0.177435375809 NA NA NA | ||
bowtie2 test1.bam 0.00739484047583 0.270844774362 0.984443061605 0.905310085331 0.984380833852 0.597688388779 NA 0.177435375809 NA NA NA | ||
bowtie2 test1.bam 0.00493632029864 0.481650684758 0.984443061605 1.15310443503e-24 0.984940883634 0.523268829811 NA 0.269861238192 NA NA NA | ||
bowtie2 test1.bam 0.00493632029864 0.481650684758 0.984443061605 1.15310443503e-24 0.984940883634 0.523268829811 NA 0.269861238192 NA NA NA |
Oops, something went wrong.