From 20c3ccaaf2e004992a689e803fb763deab2acc57 Mon Sep 17 00:00:00 2001 From: "Ricardo D'O. Albanus" Date: Tue, 17 Sep 2019 13:42:04 -0400 Subject: [PATCH] Update version to 0.1.9 --- README.rst | 26 +- atactk/__init__.py | 4 +- atactk/command.py | 52 +++- atactk/data.py | 59 ++-- atactk/metrics/__init__.py | 7 + atactk/metrics/common.py | 36 +++ atactk/metrics/cut.py | 242 +++++++++++++++ atactk/metrics/mid.py | 361 +++++++++++++++++++++++ atactk/util.py | 50 +++- docs/installation.rst | 3 +- docs/usage.rst | 8 +- scripts/make_cut_matrix | 310 +++++++++---------- scripts/make_midpoint_matrix | 245 +++++++++++++++ scripts/measure_features | 143 +++++++++ scripts/measure_signal | 201 +++++++++++++ scripts/plot_aggregate_cut_matrix.R | 62 ++++ scripts/plot_aggregate_midpoint_matrix.R | 60 ++++ scripts/plot_signal.R | 177 +++++++++++ scripts/trim_adapters | 10 +- setup.py | 22 +- tox.ini | 15 +- 21 files changed, 1853 insertions(+), 240 deletions(-) create mode 100644 atactk/metrics/__init__.py create mode 100644 atactk/metrics/common.py create mode 100644 atactk/metrics/cut.py create mode 100644 atactk/metrics/mid.py create mode 100755 scripts/make_midpoint_matrix create mode 100755 scripts/measure_features create mode 100755 scripts/measure_signal create mode 100755 scripts/plot_aggregate_cut_matrix.R create mode 100755 scripts/plot_aggregate_midpoint_matrix.R create mode 100755 scripts/plot_signal.R diff --git a/README.rst b/README.rst index 370710d..43dcb93 100644 --- a/README.rst +++ b/README.rst @@ -7,15 +7,24 @@ A toolkit for working with ATAC-seq data. What's in the box? ================== -Programs we've found useful in ATAC-seq pipelines -------------------------------------------------- +Programs we've found useful in analyzing ATAC-seq data +------------------------------------------------------ -* ``trim_adapters``: based on Jason Buenrostro's utility for trimming - Illumina adapters by aligning paired reads to each other. 
* ``make_cut_matrix``: useful in conjunction with CENTIPEDE, and in generating plots of transcription factor binding sites. -* ``plot_aggregate_matrix.R``: generates plots for motifs given the +* ``make_midpoint_matrix``: useful in conjunction with CENTIPEDE, and in + generating plots of transcription factor binding sites. +* ``measure_signal``: measures size and position of ATAC-seq fragments + overlapping genomic features. +* ``measure_features``: given a bigWig file of coverage counts and a BED + file of features, calculates a requested statistic for each feature. +* ``plot_aggregate_cut_matrix.R``: generates plots for motifs given the aggregate output of `make_cut_matrix` +* ``plot_aggregate_midpoint_matrix.R``: generates plots for motifs given the + aggregate output of `make_midpoint_matrix` +* ``plot_signal.R``: plots the output of `measure_signal` +* ``trim_adapters``: based on Jason Buenrostro's utility for trimming + Illumina adapters by aligning paired reads to each other. A Python library you can use in your own tools for processing ATAC-seq data --------------------------------------------------------------------------- @@ -31,7 +40,8 @@ you might find your pipeline can be simplified too. Requirements ============ -* Python. We've run it successfully under versions 2.7.10 and 3.4.3. +* Python. We've run it successfully under versions 2.7 and 3.5. 
+* pyBigWig * pysam * python-levenshtein * sexpdata @@ -44,6 +54,10 @@ At the command line:: git clone https://github.com/ParkerLab/atactk pip install ./atactk +Or in one step, if you don't want a local copy:: + + pip install git+https://github.com/ParkerLab/atactk + Documentation ============= diff --git a/atactk/__init__.py b/atactk/__init__.py index fa59cbc..f1cb310 100644 --- a/atactk/__init__.py +++ b/atactk/__init__.py @@ -1,7 +1,7 @@ # # atactk: ATAC-seq toolkit # -# Copyright 2015 The Parker Lab at the University of Michigan +# Copyright 2015 Stephen Parker # # Licensed under Version 3 of the GPL or any later version # @@ -9,4 +9,4 @@ __author__ = 'The Parker Lab' __email__ = 'parkerlab-software@umich.edu' -__version__ = '0.1.6' +__version__ = '0.1.9' diff --git a/atactk/command.py b/atactk/command.py index 1e21b52..1b096db 100644 --- a/atactk/command.py +++ b/atactk/command.py @@ -1,7 +1,7 @@ # # atactk: ATAC-seq toolkit # -# Copyright 2015 The Parker Lab at the University of Michigan +# Copyright 2015 Stephen Parker # # Licensed under Version 3 of the GPL or any later version # @@ -14,10 +14,13 @@ from __future__ import print_function import argparse +import logging import sys import sexpdata +import atactk + def check_bins_for_overlap(bins): """ @@ -39,28 +42,51 @@ def check_bins_for_overlap(bins): for bin in bins: start, end, resolution = bin if start <= last_end: - raise argparse.ArgumentTypeError("Bin %d-%d overlaps %d-%d." % (start, end, last_start, last_end)) + raise argparse.ArgumentTypeError("Bin {:d}-{:d} overlaps {:d}-{:d}.".format(start, end, last_start, last_end)) last_start, last_end = start, end +def check_bin_resolutions(bins, extension): + """ + Check the resolutions of a flattened list of bins. + + Look for inconsistent resolutions, which tend to give suboptimal + results, or resolutions that don't divide the extended region + evenly. 
+ """ + logger = logging.getLogger("{}.{}".format(__name__, check_bin_resolutions.__name__)) + + resolutions = set(bin[2] for bin in bins) + if len(resolutions) > 1: + logger.warn("""We've found we usually get better results using the same resolution in each bin.""") + + for bin in bins: + start, end, resolution = bin + if extension % resolution != 0: + logger.warn("Bin {:d}-{:d} resolution {:d} is not a divisor of extension {:d}".format(start, end, resolution, extension)) + + def parse_bins(bins_string): """ - Parse the string representing template size groups. + Parse the string representing fragment size groups. The bins are specified as a list of groups, each group comprising one or more bins, and ending with a resolution value, which controls how many individual cuts in the extended region around the feature are aggregated. Within the feature itself, we always - count the cut points for each base. A complete example: + count the cut points for each base. - (36-149 1) (150-224 225-324 2) (325-400 5) + We recommend using the same resolution in all bins. The following + example only shows different resolutions to be thorough. + + Assume a --bins value of '(36-149 1) (150-224 225-324 2) (325-400 5)'. With a resolution of 1, every base in the extended region - around motifs overlapping templates of length 36-149 would be + around motifs overlapping fragments of length 36-149 would be scored independently; each base's cut count would be added to the matrix. - The second group, for templates of length 150-224 or 225-324, + The second group, for fragments of length 150-224 or 225-324, with a resolution of 2, would result in every two bases in the extended region around motifs being added together. 
Then the aggregate scores of the two bins in the group would be summed, @@ -72,7 +98,7 @@ def parse_bins(bins_string): To illustrate, assume these settings and an imaginary motif 5 bases long, with a 10-base extended region on either side, and - for the sake of example pretend that each template length bin + for the sake of example pretend that each fragment length bin had the same counts of cut points around the motif, shown here:: @@ -120,7 +146,7 @@ def parse_bins(bins_string): if resolution < 1: raise ValueError except ValueError: - raise argparse.ArgumentTypeError("Resolution in bin group %s is not a positive integer." % g) + raise argparse.ArgumentTypeError("Resolution in bin group {} is not a positive integer.".format(g)) for i, bin_string in enumerate(bin_group): bin_string = bin_string.value() @@ -131,14 +157,18 @@ def parse_bins(bins_string): start, end = [int(s) for s in bin] if start > end: start, end = end, start - print("Bin %s specified backward; corrected to %d-%d" % (bin_string, start, end), file=sys.stderr) + print("Bin {} specified backward; corrected to {:d}-{:d}".format(bin_string, start, end), file=sys.stderr) group.append((start, end, resolution)) except ValueError: - raise argparse.ArgumentTypeError("Bin %s in group %s is malformed." 
% (i, g)) + raise argparse.ArgumentTypeError("Bin {} in group {} is malformed.".format(i, g)) groups.append(group) # flatten groups to just a list of bins, sort, check for overlaps bins = sorted([b for bins in groups for b in bins]) check_bins_for_overlap(bins) return groups + + +def version(program_name=''): + return program_name + ' ' + atactk.__version__ diff --git a/atactk/data.py b/atactk/data.py index f75c0f8..b17da6d 100644 --- a/atactk/data.py +++ b/atactk/data.py @@ -1,7 +1,7 @@ # # atactk: ATAC-seq toolkit # -# Copyright 2015 The Parker Lab at the University of Michigan +# Copyright 2015 Stephen Parker # # Licensed under Version 3 of the GPL or any later version # @@ -11,8 +11,6 @@ Code for reading and manipulating data commonly used in ATAC-seq pipelines. """ -from __future__ import print_function - import csv import gzip import pysam @@ -80,15 +78,12 @@ def __init__(self, reference=None, start=None, end=None, name=None, score=0, str self.feature_end = int(end) # optional BED fields - self.name = name - self.score = float(score) - self.strand = strand + self.name = name or None + self.score = score and float(score) or None + self.strand = strand or None # region adjustments self.extension = int(extension) - self.is_reverse = strand == '-' - self.region_start = self.feature_start - self.extension - self.region_end = self.feature_end + self.extension def __str__(self): return '\t'.join(str(attribute or '') for attribute in [ @@ -101,13 +96,30 @@ def __str__(self): self.extension, ]) + @property + def center(self): + return self.feature_start + int(round(self.feature_length / 2.0)) + @property def feature_length(self): return self.feature_end - self.feature_start + @property + def is_reverse(self): + return self.strand == '-' + + @property + def region_end(self): + return self.center + self.extension + @property def region_length(self): - return self.region_end - self.region_start + return self.extension * 2 + + @property + def region_start(self): + return 
self.center - self.extension + def complement(seq): """ @@ -175,8 +187,9 @@ def open_maybe_gzipped(filename): def count_features(filename): count = 0 - for line in open_maybe_gzipped(filename): - count += 1 + with open_maybe_gzipped(filename) as f: + for line in f: + count += 1 return count @@ -203,24 +216,16 @@ def read_features(filename, extension=100, feature_class=ExtendedFeature): An :class:`ExtendedFeature` instance for each row of the file. """ - if filename == '-': - source = sys.stdin - else: - source = open_maybe_gzipped(filename) - reader = csv.DictReader(source, fieldnames=FEATURE_FIELDNAMES, restkey='extra_fields', dialect='excel-tab') - for row in reader: - if 'extra_fields' in row: - del row['extra_fields'] - yield feature_class(extension=extension, **row) + with filename == '-' and sys.stdin or open_maybe_gzipped(filename) as source: + reader = csv.DictReader(source, fieldnames=FEATURE_FIELDNAMES, restkey='extra_fields', dialect='excel-tab') - -ALIGNMENT_FILE_CACHE = {} + for row in reader: + if 'extra_fields' in row: + del row['extra_fields'] + yield feature_class(extension=extension, **row) def open_alignment_file(alignment_filename): - if alignment_filename in ALIGNMENT_FILE_CACHE: - return ALIGNMENT_FILE_CACHE[alignment_filename] - alignment_file = pysam.AlignmentFile(alignment_filename, 'rb') try: alignment_file.check_index() @@ -228,7 +233,7 @@ def open_alignment_file(alignment_filename): raise AttributeError('The alignments file {} is not in BAM format. Please supply an indexed BAM file.'.format(alignment_filename)) except ValueError: raise ValueError('The alignment file {} is not usable. 
Please supply an indexed BAM file.'.format(alignment_filename)) - ALIGNMENT_FILE_CACHE[alignment_filename] = alignment_file + return alignment_file diff --git a/atactk/metrics/__init__.py b/atactk/metrics/__init__.py new file mode 100644 index 0000000..05a8532 --- /dev/null +++ b/atactk/metrics/__init__.py @@ -0,0 +1,7 @@ +# +# atactk: ATAC-seq toolkit +# +# Copyright 2015 Stephen Parker +# +# Licensed under Version 3 of the GPL or any later version +# diff --git a/atactk/metrics/common.py b/atactk/metrics/common.py new file mode 100644 index 0000000..658ef71 --- /dev/null +++ b/atactk/metrics/common.py @@ -0,0 +1,36 @@ +# +# atactk: ATAC-seq toolkit +# +# Copyright 2015 Stephen Parker +# +# Licensed under Version 3 of the GPL or any later version +# + + +import atactk.data +import atactk.util + + +DEFAULT_CUT_POINT_OFFSET = 4 + + +def reduce_scores(scores, resolution): + """ + Reduce a sequence of scores by summing every `resolution` values. + + Called with `scores` of [0, 1, 1, 4, 2], you'd get the following + results at various resolutions: + + ========== ====== + Resolution Result + ========== ====== + 1 [0, 1, 1, 4, 2] + 2 [1, 5, 2] + 3 [2, 6] + 4 [6, 2] + 10 [8] + ========== ====== + """ + if resolution == 1: + return scores + return [sum(chunk) for chunk in atactk.util.partition(resolution, scores)] diff --git a/atactk/metrics/cut.py b/atactk/metrics/cut.py new file mode 100644 index 0000000..60a0678 --- /dev/null +++ b/atactk/metrics/cut.py @@ -0,0 +1,242 @@ +# +# atactk: ATAC-seq toolkit +# +# Copyright 2015 Stephen Parker +# +# Licensed under Version 3 of the GPL or any later version +# + + +""" +Code for making quantitative observations about ATAC-seq experiments. 
+""" + +import collections +import functools +import multiprocessing +import signal + +import atactk.data +import atactk.metrics.common as common +import atactk.util + + +def find_cut_point(aligned_segment, cut_point_offset=common.DEFAULT_CUT_POINT_OFFSET): + """Return the position of the given aligned segment's ATAC-seq cut point. + + Parameters + ---------- + aligned_segment: :class:`pysam.AlignedSegment` + https://pysam.readthedocs.org/en/latest/api.html#pysam.AlignedSegment + + Returns + ------- + int + Position of the ATAC-seq cut point. + """ + if aligned_segment.is_reverse: + # the cut point is the reference_end minus (cut_point_offset + 1) + # (pysam's reference_end is one past the last aligned residue) + cut_point = aligned_segment.reference_end - (cut_point_offset + 1) + else: + cut_point = aligned_segment.reference_start + cut_point_offset # start of the read plus offset + return cut_point + + +def count_cut_points(aligned_segments, start, end, cut_point_offset=common.DEFAULT_CUT_POINT_OFFSET): + """ + Return any cut points in the region from the aligned segments. + + Parameters + ---------- + aligned_segments: list + A list of :class:`pysam.AlignedSegment`. + start: int + The start of the region of interest. + end: int + The end of the region of interest. + + Returns + ------- + list + A list of counts, one for each position from `start` to `end`, + of the aligned segments' cut points at that position. 
+ """ + + cut_points_in_region = [] + for segment in aligned_segments: + cut_point = find_cut_point(segment, cut_point_offset) + if start <= cut_point < end: + cut_points_in_region.append(cut_point) + + # initialize the region with zero counts + counts = {p: 0 for p in range(start, end)} + + # add the cut points + counts.update(collections.Counter(cut_points_in_region)) + + cut_point_counts = [counts[position] for position in sorted(counts.keys())] + return cut_point_counts + + +def add_cut_points_to_region_tree(region_tree, group_key, strand, cut_points): + """ + Record cut points by position, fragment size, and strand. + + The tree consists of nested dictionaries. The first level keys are + positions in a region. The second level keys are the names of + fragment size groups. The third level keys are the strand, and the + values are the cut point counts for that position, fragment size + group, and strand. + + + Parameters + ---------- + region_tree: dict + The tree to which the cut point counts will be added. + group_key: str + The key corresponding to the fragment size group. + strand: str + The strand of the cut points' aligned segments: ``+`` or ``-``. + cut_points: list + The count of cut points at each position in the region that matched the fragment size group and strand. + + Notes + ----- + + Only positive counts are recorded. It takes a lot of time and + space to record so many zeroes, and it's better to produce them on + demand via :class:`collections.defaultdict`. So instead, collect + all the scores, and after that work is done, update a tree built + with :class:`collections.defaultdict`, then work with that. See + the ``make_cut_matrix`` script included with ``atactk`` for an + example. 
+ """ + + for position, count in enumerate(cut_points, 0 - (len(cut_points) // 2)): + if count > 0: + if position not in region_tree: + region_tree[position] = {} + if group_key not in region_tree[position]: + region_tree[position][group_key] = {} + if strand not in region_tree[position][group_key]: + region_tree[position][group_key][strand] = count + else: + region_tree[position][group_key][strand] += count + + +def score_feature_with_cut_points(bin_groups, include_flags, exclude_flags, quality, cut_point_offset, feature): + """ + Count the number of transposition events around the given feature. + + Intended to be run only within a `multiprocessing.Pool`, in which + each worker initializes its own copy of the indexed BAM file + containing aligned reads, stored in the global + `atactk.metrics.cut.alignment_file`. + + Parameters + ---------- + bin_groups: iterable + A sequence of iterables containing fragment size bins and the resolution with which they should be scored. If + omitted, the matrix will contain a column for each position in the extended region, representing the count of + cuts at that position. + include_flags: iterable + The SAM flags to use when selecting aligned segments to score. + exclude_flags: iterable + The SAM flags to use when excluding aligned segments to score; any flag present on a read excludes it. + quality: int + The minimum mapping quality a read must have to be scored. + feature: ExtendedFeature + The feature to score. + + Returns + ------- + tuple + A tuple of `(row, tree)` where + + * `row` is a tab-separated list of scores in the region around the feature + * `tree` is a three-level dict holding a score for each position, in each of the fragment size bins given, on each strand, e.g.:: + + >>> tree[0]['36_149']['F'] + 22 + >>> tree[0]['36_149']['R'] + 15 + + + See Also + -------- + add_cut_points_to_region_tree: Where the tree for the cut point aggregate matrix is described more fully. 
+ """ + + aligned_segments = alignment_file.fetch(feature.reference, max(0, feature.region_start), feature.region_end) + aligned_segments = atactk.data.filter_aligned_segments(aligned_segments, include_flags, exclude_flags, quality) + + row = [] + tree = {} + + if bin_groups: + for group in bin_groups: + group_rows = [] + group_key = atactk.util.make_bin_group_key(group) + for (minimum_length, maximum_length, resolution) in group: + bin_scores = [] + aligned_segments_in_bin = [a for a in aligned_segments if minimum_length <= abs(a.isize) <= maximum_length] + forward_aligned_segments = [a for a in aligned_segments_in_bin if not a.is_reverse] + reverse_aligned_segments = [a for a in aligned_segments_in_bin if a.is_reverse] + + forward_cut_points = count_cut_points(forward_aligned_segments, feature.region_start, feature.region_end, cut_point_offset) + reverse_cut_points = count_cut_points(reverse_aligned_segments, feature.region_start, feature.region_end, cut_point_offset) + + if feature.is_reverse: + # need to orient the cut point positions to the motif in the matrix + forward_cut_points, reverse_cut_points = list(reversed(reverse_cut_points)), list(reversed(forward_cut_points)) + + # for the discrete matrix: scores for each feature + bin_scores.extend(common.reduce_scores(forward_cut_points, resolution)) + bin_scores.extend(common.reduce_scores(reverse_cut_points, resolution)) + group_rows.append(bin_scores) + + # for the aggregate matrix: scores for the entire region + add_cut_points_to_region_tree(tree, group_key, 'F', forward_cut_points) + add_cut_points_to_region_tree(tree, group_key, 'R', reverse_cut_points) + + if len(group) == 1: + row.extend(group_rows[0]) + else: + row.extend(functools.reduce(atactk.util.add_lists, group_rows)) + else: + row = count_cut_points(aligned_segments, feature.region_start, feature.region_end, cut_point_offset) + if feature.is_reverse: + row = list(reversed(row)) + add_cut_points_to_region_tree(tree, 'All', 'Both', row) + + row = 
'\t'.join(str(score) for score in row) + return feature, row, tree + + +alignment_file = None + + +def scoring_process_init(alignment_filename): + signal.signal(signal.SIGINT, signal.SIG_IGN) + + global alignment_file + if alignment_file is not None: + alignment_file.close() + alignment_file = atactk.data.open_alignment_file(alignment_filename) + + +def score_features_with_cut_points(alignment_file, bin_groups, include_flags, exclude_flags, quality, cut_point_offset, features, parallel=1): + + partial_scoring_function = functools.partial( + score_feature_with_cut_points, + bin_groups, + include_flags, + exclude_flags, + quality, + cut_point_offset + ) + + pool = multiprocessing.Pool(processes=parallel, initializer=scoring_process_init, initargs=[alignment_file]) + + return pool.imap(partial_scoring_function, features, parallel) diff --git a/atactk/metrics/mid.py b/atactk/metrics/mid.py new file mode 100644 index 0000000..96deabb --- /dev/null +++ b/atactk/metrics/mid.py @@ -0,0 +1,361 @@ +# +# atactk: ATAC-seq toolkit +# +# Copyright 2015 Stephen Parker +# +# Licensed under Version 3 of the GPL or any later version +# + + +""" +Code for making quantitative observations about ATAC-seq experiments. +""" + +import collections +import functools +import multiprocessing +import signal + +import atactk.data +import atactk.metrics.common as common +import atactk.util + + +# +# When determining a fragment's position relative to a feature, we consider it 'left' if both cut point positions are less than +# the feature's center reference position, 'right' if both are greater than the feature's center, and and 'overlapping' if they +# span the center position. +# +FEATURE_RELATIVE_POSITIONS = ('L', 'O', 'R') + + +def find_midpoint(aligned_segment): + """ + Return the location of the midpoint of the aligned segment. + + For forward reads, it's reference_start plus half the fragment length. + + For reverse reads, it's reference_end plus half the fragment length. 
+ + (Pysam stores fragment length as a negative number for reverse reads.) + + Parameters + ---------- + aligned_segment: :class:`pysam.AlignedSegment`. + + Returns + ------- + midpoint + The integer position of the segment's midpoint. + """ + if aligned_segment.is_reverse: + return aligned_segment.reference_end + (aligned_segment.isize // 2) + else: + return aligned_segment.reference_start + (aligned_segment.isize // 2) + + +def count_midpoints(aligned_segments, feature): + """ + Count the aligned segments' midpoints that fall in the feature's extended region. + + The list of counts at positions will be ordered with respect to + the feature's strand. This just means that the returned sequence + will be reversed for features on the reverse strand. + + Parameters + ---------- + aligned_segments: list + A list of :class:`pysam.AlignedSegment`. + feature: ExtendedFeature + The feature around which to count fragment midpoints. + + Returns + ------- + generator + A generator of counts, one for each position from `start` to `end`, + of fragment midpoints at that position relative to the feature. 
+ """ + + reads_seen = {} # map of read names already seen, to avoid double-counting fragments + midpoints_in_region = [] + for segment in aligned_segments: + if segment.qname in reads_seen: + # skip this one; we've already found the midpoint of its fragment using its mate + continue + + reads_seen[segment.qname] = 1 + + midpoint = find_midpoint(segment) + if feature.region_start <= midpoint < feature.region_end: + midpoints_in_region.append(midpoint) + + # initialize the region with zero counts + counts = {p: 0 for p in range(feature.region_start, feature.region_end)} + + # add the cut points + counts.update(collections.Counter(midpoints_in_region)) + + midpoint_counts = [counts[position] for position in sorted(counts.keys())] + if feature.is_reverse: + midpoint_counts = list(reversed(midpoint_counts)) + return midpoint_counts + + +def add_midpoints_to_region_tree(region_tree, group_key, midpoints): + """ + Record fragment midpoints by position and fragment size. + + The tree consists of nested dictionaries. The first level keys are + positions in a region. The second level keys are the names of + fragment size groups, and the values are the cut point counts for + that position and fragment size group. + + + Parameters + ---------- + region_tree: dict + The tree to which the cut point counts will be added. + group_key: str + The key corresponding to the fragment size group. + midpoints: list + The count of cut points at each position in the region that matched the fragment size group and strand. + + Notes + ----- + + Only positive counts are recorded. It takes a lot of time and + space to record so many zeroes, and it's better to produce them on + demand via :class:`collections.defaultdict`. So instead, collect + all the scores, and after that work is done, update a tree built + with :class:`collections.defaultdict`, then work with that. See + the ``make_cut_matrix`` script included with ``atactk`` for an + example. 
+ """ + + for position, count in enumerate(midpoints, 0 - (len(midpoints) // 2)): + if count > 0: + if position not in region_tree: + region_tree[position] = {} + if group_key not in region_tree[position]: + region_tree[position][group_key] = count + else: + region_tree[position][group_key] += count + + +def score_feature_with_midpoints(bin_groups, include_flags, exclude_flags, quality, feature): + """ + Count the number of fragment midpoints around the given feature. + + Intended to be run only within a `multiprocessing.Pool`, in which + each worker initializes its own copy of the indexed BAM file + containing aligned reads, stored in the global + `atactk.metrics.cut.alignment_file`. + + Parameters + ---------- + bin_groups: iterable + A sequence of iterables containing bins and the resolution with which they should be scored. + include_flags: iterable + The SAM flags to use when selecting aligned segments to score. + exclude_flags: iterable + The SAM flags to use when excluding aligned segments to score; any flag present on a read excludes it. + quality: int + The minimum mapping quality a read must have to be scored. + feature: ExtendedFeature + The feature to score. + + Returns + ------- + tuple + A tuple of `(row, tree)` where + + * `row` is a tab-separated list of scores in the region around the feature + * `tree` is a two-level dict holding a score for each position, in each of the fragment size bins given, e.g.:: + + >>> tree[0]['36_149'] + 22 + >>> tree[0]['36_149'] + 15 + + + See Also + -------- + add_midpoints_to_region_tree: Where the tree for the midpoint aggregate matrix is described more fully. 
+ + """ + + alignment_search_region_extension = max([b[1] for bins in bin_groups for b in bins]) // 2 + + aligned_segments = alignment_file.fetch( + feature.reference, + max(0, feature.region_start - alignment_search_region_extension), + feature.region_end + alignment_search_region_extension + ) + aligned_segments = atactk.data.filter_aligned_segments(aligned_segments, include_flags, exclude_flags, quality) + + row = [] + tree = {} + + for group in bin_groups: + group_rows = [] + group_key = atactk.util.make_bin_group_key(group) + for (minimum_length, maximum_length, resolution) in group: + bin_scores = [] + aligned_segments_in_bin = [a for a in aligned_segments if minimum_length <= abs(a.isize) <= maximum_length] + midpoint_counts = count_midpoints(aligned_segments_in_bin, feature) + + # for the discrete matrix: scores for each feature + bin_scores.extend(common.reduce_scores(midpoint_counts, resolution)) + group_rows.append(bin_scores) + + # for the aggregate matrix: scores for the entire region + add_midpoints_to_region_tree(tree, group_key, midpoint_counts) + + if len(group) == 1: + row.extend(group_rows[0]) + else: + row.extend(functools.reduce(atactk.util.add_lists, group_rows)) + + row = '\t'.join(str(score) for score in row) + return feature, row, tree + + +alignment_file = None + + +def scoring_process_init(alignment_filename): + signal.signal(signal.SIGINT, signal.SIG_IGN) + + global alignment_file + if alignment_file is not None: + alignment_file.close() + alignment_file = atactk.data.open_alignment_file(alignment_filename) + + +def score_features_with_midpoints(alignment_file, bin_groups, include_flags, exclude_flags, quality, cut_point_offset, features, parallel=1): + + partial_scoring_function = functools.partial( + score_feature_with_midpoints, + bin_groups, + include_flags, + exclude_flags, + quality, + cut_point_offset + ) + + pool = multiprocessing.Pool(processes=parallel, initializer=scoring_process_init, initargs=[alignment_file]) + + return 
pool.imap(partial_scoring_function, features, parallel) + + +def get_fragment_cut_points(aligned_segment, cut_point_offset=common.DEFAULT_CUT_POINT_OFFSET): + """ + Return the positions of both of the cut points in the the given aligned segment's fragment. + + 80bp fragment mapped to reference: + S-------------------------------------------------------------------------------E + + 50bp paired end read 1 (forward): + S---C---------------------------------------------E + + 50bp paired end read 2 (reverse): + S---------------------------------------------C---E + + Starts (S) and ends (E) are always just the AlignedSegment's reference_start and reference_end. The cut points are always + relative to the start of the read, which for reverse reads is reference_end. Pysam's reference_end is one *past* the last + actual position of the read, so we have to take that into account when calculating the cut point position. + + Parameters + ---------- + aligned_segment: :class:`pysam.AlignedSegment` + https://pysam.readthedocs.org/en/latest/api.html#pysam.AlignedSegment + + Returns + ------- + tuple + A tuple of (int, int) containing the fragment's sorted cut point positions. + """ + if aligned_segment.is_reverse: + right_cut_point = aligned_segment.reference_end - (cut_point_offset + 1) + left_cut_point = aligned_segment.next_reference_start + cut_point_offset + else: + left_cut_point = aligned_segment.reference_start + cut_point_offset + right_cut_point = aligned_segment.reference_start + aligned_segment.isize - (cut_point_offset + 1) + return (left_cut_point, right_cut_point) + + +def get_fragment_position_relative_to_feature(aligned_segment, cut_point_offset, feature): + """ + Determine where an aligned segment's fragment maps relative to a feature. + + If both cut points in the fragment map before the center position of the feature, we say the fragment is to the feature's + left. If both map after the center, it's to the right. Otherwise the fragment overlaps the center. 
+ """ + + cut_points = get_fragment_cut_points(aligned_segment, cut_point_offset) + if feature.center > max(cut_points): + # 'left' of the feature + if feature.is_reverse: + position = 'R' + else: + position = 'L' + elif feature.center < min(cut_points): + # 'right' of the feature + if feature.is_reverse: + position = 'L' + else: + position = 'R' + else: + # overlapping the feature + position = 'O' + return position + + +def find_midpoints_around_feature(alignment_filename, include_flags, exclude_flags, quality, cut_point_offset, feature): + """ + Find fragment midpoints around a feature. + + Parameters + ---------- + alignment_filename: str + The BAM file containing aligned reads. + include_flags: iterable + The SAM flags to use when selecting aligned segments to score. + exclude_flags: iterable + The SAM flags to use when excluding aligned segments to score; any flag present on a read excludes it. + quality: int + The minimum mapping quality a read must have to be scored. + feature: ExtendedFeature + The feature to score. + + Returns + ------- + A list containing for each midpoint found its position relative to the feature center, its fragment size, and its + fragment's position relative to the feature center. 
+ """ + + alignment_file = atactk.data.open_alignment_file(alignment_filename) + + feature_center = feature.center + reads_seen = {} # map of read names already seen, to avoid double-counting fragments + midpoints = [] + + aligned_segments = alignment_file.fetch(feature.reference, max(0, feature.region_start), feature.region_end) + aligned_segments = atactk.data.filter_aligned_segments(aligned_segments, include_flags, exclude_flags, quality) + + for aligned_segment in aligned_segments: + if aligned_segment.qname in reads_seen: + # skip this one; we've already found the midpoint of its fragment using its mate + continue + + reads_seen[aligned_segment.qname] = 1 + + midpoint = find_midpoint(aligned_segment) + if feature.region_start <= midpoint <= feature.region_end: + distance_to_center = midpoint - feature_center + if feature.is_reverse: + distance_to_center *= -1 + fragment_relative_position = get_fragment_position_relative_to_feature(aligned_segment, cut_point_offset, feature) + midpoints.append((feature_center, midpoint, distance_to_center, abs(aligned_segment.isize), fragment_relative_position)) + + return midpoints diff --git a/atactk/util.py b/atactk/util.py index 35b3c80..f8e2654 100644 --- a/atactk/util.py +++ b/atactk/util.py @@ -1,7 +1,7 @@ # # atactk: ATAC-seq toolkit # -# Copyright 2015 The Parker Lab at the University of Michigan +# Copyright 2015 Stephen Parker # # Licensed under Version 3 of the GPL or any later version # @@ -11,7 +11,11 @@ """ import collections +import logging import operator +import os +import resource +import signal def add_lists(l1, l2): @@ -37,6 +41,50 @@ def add_lists(l1, l2): return map(operator.add, l1, l2) +def exit_forcefully(signum, stack): + logging.fatal('Exiting forcefully.') + os.kill(os.getpid(), signal.SIGTERM) + + +def humanize_time(seconds): + s = '' + days = hours = minutes = 0 + if seconds >= 86400: + days = seconds // 86400 + seconds = seconds % 86400 + s += '{:.0f}d '.format(days) + if seconds >= 3600: + hours 
= seconds // 3600 + seconds = seconds % 3600 + s += '{:.0f}h '.format(hours) + if seconds >= 60: + minutes = seconds // 60 + seconds = seconds % 60 + s += '{:.0f}m '.format(minutes) + if not s: + s += '{:.2f}s'.format(seconds) + return s + + +def log_memory_usage(signum, stack): + logging.debug('Memory in use: {}'.format(memory_in_use())) + signal.alarm(5) + + +def make_bin_group_key(bin_group): + keys = [] + for bin in bin_group: + key = '{}'.format(bin[0]) + if bin[1]: + key = '{}_{}'.format(key, bin[1]) + keys.append(key) + return ','.join(keys) + + +def memory_in_use(): + return '{} mb'.format(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1000) + + def take(count, seq): """ Return a list of up to `count` elements from the iterable `seq`. diff --git a/docs/installation.rst b/docs/installation.rst index d52c3b9..65ea5b5 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -12,8 +12,7 @@ The dependencies should be installed automatically. Requirements ------------ -* Python. We've run it successfully under versions 2.7.10 and 3.4.3. +* Python. We've run it successfully under versions 2.7.10 and 3.5. * pysam * python-levenshtein * sexpdata - diff --git a/docs/usage.rst b/docs/usage.rst index 871fb04..71ce756 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -82,15 +82,15 @@ Aggregate matrices After you've run CENTIPEDE with the resulting `discrete` matrix, and identified bound and unbound motifs (perhaps using posterior -probabilities of at least 0.95 or at most 0.5, respectively), you can +probabilities of at least 0.99 or at most 0.5, respectively), you can move on to generating what we call `aggregate` matrices. These are designed for creating a plot of the ATAC-seq signal around a single motif. An `aggregate` matrix contains a row for each combination of position, fragment size bin, and strand within and around the motif, with -columns representing the absolute and mean cut point counts at each -position. 
+columns representing the absolute count and count divided by the +number of total motifs at each position. An example invocation, using a BED file of bound motifs:: @@ -127,7 +127,7 @@ this:: make_cut_matrix -a -b '(36-149 1) (150-224 225-324 2) (325-400 5)' ... Pretend you're scoring a motif 5 bases long, with a 10-base extended -region on either side, and for simplicity, pretend that each template +region on either side, and for simplicity, pretend that each fragment length bin had the same counts of cut points around the motif, shown here:: diff --git a/scripts/make_cut_matrix b/scripts/make_cut_matrix index 8d921e6..c8981ee 100755 --- a/scripts/make_cut_matrix +++ b/scripts/make_cut_matrix @@ -5,7 +5,7 @@ # ATAC-seq experiment and a BED file of motifs, creates a matrix of the # transposition cuts around the motifs. # -# Copyright 2015 The Parker Lab at the University of Michigan +# Copyright 2015 Stephen Parker # # Licensed under Version 3 of the GPL or any later version # @@ -13,11 +13,9 @@ from __future__ import print_function import argparse -import collections -import functools +import decimal import logging -import multiprocessing -import multiprocessing.managers +import os import signal import sys import time @@ -27,32 +25,11 @@ import traceback import atactk.command import atactk.data import atactk.metrics +import atactk.metrics.cut +import atactk.util -LOGGING_FORMAT = '%(asctime)s %(levelname)s %(message)s' - -def worker_init(): - signal.signal(signal.SIGINT, signal.SIG_IGN) - - -def humanize_time(seconds): - s = '' - days = hours = minutes = 0 - if seconds >= 86400: - days = seconds // 86400 - seconds = seconds % 86400 - s += '{:.0f}d '.format(days) - if seconds >= 3600: - hours = seconds // 3600 - seconds = seconds % 3600 - s += '{:.0f}h '.format(hours) - if seconds >= 60: - minutes = seconds // 60 - seconds = seconds % 60 - s += '{:.0f}m '.format(minutes) - if not s: - s += '{:.2f}s'.format(seconds) - return s +LOGGING_FORMAT = '%(levelname)s 
%(message)s' def parse_arguments(): @@ -61,72 +38,31 @@ def parse_arguments(): formatter_class=argparse.RawDescriptionHelpFormatter, epilog=textwrap.dedent(""" - Given a BAM file containing alignments from an ATAC-seq experiment and a - BED file of motifs, creates a matrix of counts of the transposition - integration events (cuts) around the motifs. + Given an indexed BAM file containing alignments from an + ATAC-seq experiment and a BED file of motifs, creates a matrix + of counts of the transposition integration events (cuts) + around the motifs. BINNING ======= - Each motif in the BED file is scored for each of the given fragment - length bins. The bins are specified as a list of groups, each group - comprising one or more bins, and ending with a resolution value, which - controls how many individual cuts in the extended region around the - motif are aggregated. Within the motif itself, we always count the cut - points for each base. A complete example: - - (36-149 1) (150-224 225-324 2) (325-400 5) - - With a resolution of 1, every base in the extended region around motifs - overlapping fragments of length 36-149 would be scored independently; - each base's cut count would be added to the matrix. - - The second group, for fragments of length 150-224 or 225-324, with a - resolution of 2, would result in every two bases in the extended region - around motifs being added together. Then the aggregate scores of the two - bins in the group would be summed, and the result would be added to the - matrix. - - The last bin group, (325-400 5), with a resolution of 5, would also - produce aggregate scores in the extended region, each being the sum of - five bases' cut counts. 
- - To illustrate, assume these settings and an imaginary motif 5 bases - long, with a 10-base extended region on either side, and for the sake of - example pretend that each fragment length bin had the same counts of cut - points around the motif, shown here: - - extended region motif extended region - ------------------- --------- ------------------- - 0 1 2 3 3 4 4 4 4 5 9 2 0 2 7 5 4 4 4 4 3 3 2 1 0 - - The scores for the first bin group, (36-149 1): + If fragment size bins are not specified with the --bins argument, each + column in the matrix will represent the total number of cut points at that + position in the extended region around the motif. - extended region motif extended region - ------------------- --------- ------------------- - 0 1 2 3 3 4 4 4 4 5 9 2 0 2 7 5 4 4 4 4 3 3 2 1 0 + If bins are specified, the matrix will contain a column for each position in + the extended region, for each fragment size bin, and each strand. - The scores for the second bin group, (150-224 225-324 2): - - e.r. motif e.r. - --------- --------- --------- - 1 5 7 8 9 9 2 0 2 7 9 8 7 5 1 - - The scores for the last bin group, (325-400 5): - - e.r. motif e.r. - ---- --------- ---- - 9 21 9 2 0 2 7 21 9 + {} INPUT ===== BAM files just work. - BED files mean different things to different people. We - require a file of tab-separated values, where the first six - fields (and their labels in the BED format documentation at - https://genome.ucsc.edu/FAQ/FAQformat.html) are: + BED files mean different things to different people. We require a file of + tab-separated values, where the first six fields (and their labels in the + BED format documentation at https://genome.ucsc.edu/FAQ/FAQformat.html) are: 1. The reference sequence name ('chrom'): text 2. The feature start position ('chromStart'): integer @@ -141,32 +77,34 @@ def parse_arguments(): ====== You can request either discrete or aggregate output. Discrete output is - intended to be usable as input to CENTIPEDE. 
After you've run CENTIPEDE, - you can feed different parts of the resulting BED file back into this - script and request aggregate output to produce a file suitable for - plotting with out plot_aggregate_matrix.R script. As an example, we like - to visualize all motifs with a posterior probability of binding >0.95 - and a random sampling of motifs with a posterior probability of - <0.5. This provides a good visual inspection of the results. - - Each row of the discrete matrix represents the cut point counts around a - single motif from your input BED file. The positions reported depend on - the resolution specified in the --bins argument. For example, specifying - a resolution of 1 will result in single nucleotide resolution -- you'll - get the count at each position in the extended region around the - motif. Specifying 10 will report the sum of every 10 positions. - - Aggregate output reports absolute and mean cut point counts for all of - your motifs at each position in the region around the motif, for each - strand and for each fragment size bin represented at that position. - - \0""") + intended to be usable as input to CENTIPEDE. After you've run CENTIPEDE, you + can feed different parts of the resulting BED file back into this script and + request aggregate output to produce a file suitable for + plot_aggregate_matrix.R. As an example, we like to extract from the + CENTIPEDE output all motifs with a posterior probability of binding greater + than 0.99, and a random sampling of motifs with a posterior probability of + less than 0.5. Plotting those provides a good visual inspection of the + results. + + Each row of the discrete matrix represents the scores around a single motif + from your input BED file. The positions reported depend on the resolution + specified in the --bins argument. For example, specifying a resolution of 1 + will result in single nucleotide resolution -- you'll get the score at each + position in the extended region around the motif. 
Specifying 10 will report + the sum of every 10 positions. + + Aggregate output includes absolute counts and counts divided by the number + of motifs for all of your motifs at each position in the region around the + motif, for each fragment size bin represented at that position, on each + strand. + + \0""".format('\n '.join(atactk.command.parse_bins.__doc__.split('\n')[3:]).lstrip())) ) group = parser.add_mutually_exclusive_group(required=True) - group.add_argument('-a', '--aggregate-output', dest='aggregate', action='store_true', help='Requests a matrix in which each row represents a position in the extended region and the mean cut point count at that position across all motifs. See OUTPUT, below.') - group.add_argument('-d', '--discrete-output', dest='discrete', action='store_true', help='Requests a matrix in which each row represents all the cut point counts around one motif. See OUTPUT, below.') + group.add_argument('-a', '--aggregate-output', dest='aggregate', action='store_true', help='Requests an aggregate matrix. See OUTPUT, below.') + group.add_argument('-d', '--discrete-output', dest='discrete', action='store_true', help='Requests a discrete matrix. See OUTPUT, below.') - parser.add_argument('-b', '--bins', dest='bins', type=atactk.command.parse_bins, required=True, help='A list of fragment size bin groups and their resolutions. See BINNING, below.') + parser.add_argument('-b', '--bins', dest='bins', default=[], type=atactk.command.parse_bins, help='A list of fragment size bin groups and their resolutions. See BINNING, below.') parser.add_argument('-F', '--exclude-flags', type=int, dest='exclude_flags', action='append', help='A SAM flag used to exclude alignments from the BAM file. More than one may be specified. Alignments matching any exclude flag will not be counted. 
The default is to exclude all unmapped reads/mates by filtering out any alignments with SAM flags 4 or 8 set.') parser.add_argument('-f', '--include-flags', type=int, dest='include_flags', action='append', help='A SAM flag that determines which alignments from the BAM file will be included in the counts. More than one may be specified. Any alignment matching any include flag will be counted. The default is to include properly paired and mapped reads by filtering for SAM flags 83, 99, 147, or 163.') parser.add_argument('-o', '--cut-point-offset', type=int, default=4, dest='cut_point_offset', help='The position of cut points relative to the beginning of a read and in the direction toward the read end, as a number of bases (default: 4).') @@ -174,7 +112,7 @@ def parse_arguments(): parser.add_argument('-q', '--quality', type=int, default=30, dest='quality', help='The minimum mapping quality required for a read to be counted (default: 30).') parser.add_argument('-r', '--region-extension', type=int, default=100, dest='extension', help='The number of bases to score on either side of the motifs (default: 100).') parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', help='Requests more detailed output.') - parser.add_argument('--version', action='version', version='%%(prog)s %s' % atactk.__version__) + parser.add_argument('--version', action='version', version='%(prog)s {}'.format(atactk.__version__)) parser.add_argument('alignments', metavar='BAM-file-of-aligned-reads', help='The indexed BAM file containing the aligned ATAC-seq reads.') parser.add_argument('motifs', metavar='BED-file-of-motifs', help='The BED file containing the motifs. 
Use "-" to read from standard input.') @@ -182,6 +120,50 @@ def parse_arguments(): return parser.parse_args() +def print_discrete_matrix(scored_motifs): + motif_count = 0 + for motif_count, (motif, row, tree) in enumerate(scored_motifs, 1): + print(row) + return motif_count + + +def print_aggregate_matrix(scored_motifs, aggregate_positions, bins): + strands = ('F', 'R') + + if not bins: + bins = [[('All', None, None)]] + strands = ['Both'] + + matrix = {} + for position in aggregate_positions: + matrix[position] = {} + for bin_group in bins: + bin_group_key = atactk.util.make_bin_group_key(bin_group) + matrix[position][bin_group_key] = {} + for strand in strands: + matrix[position][bin_group_key][strand] = 0 + + for motif_count, scored_motif in enumerate(scored_motifs, 1): + motif, row, tree = scored_motif + for position in tree.keys(): + for bin_group_key in tree[position].keys(): + for strand in strands: + try: + count = tree[position][bin_group_key][strand] + matrix[position][bin_group_key][strand] += count + except KeyError: + pass # no count to add + + decimal_motif_count = decimal.Decimal(motif_count) + print('Position\tFragmentSizeBin\tStrand\tCutPointCount\tCutPointCountFraction') + for position, fragment_size_bins in sorted(matrix.items()): + for fragment_size_bin, strands in sorted(fragment_size_bins.items(), key=lambda item: '_' in item and int(item[0].split('_')[0])): + for strand, count in sorted(strands.items()): + print('{}\t{}\t{}\t{:d}\t{}'.format(position, fragment_size_bin, strand, count, count / decimal_motif_count)) + + return motif_count + + def make_cut_matrix(args): job_start = time.time() @@ -195,83 +177,71 @@ def make_cut_matrix(args): else: args.exclude_flags = [4, 8] - loglevel = args.verbose and logging.DEBUG or logging.INFO - logging.basicConfig(level=loglevel, format=LOGGING_FORMAT) - logger = logging.getLogger('make_cut_matrix') - if args.parallel > 1: - logger.info('Using %s concurrent scoring processes' % args.parallel) - 
logger.info('Filtering alignments for quality >= %s, with flags %s and without flags %s' % (args.quality, args.include_flags, args.exclude_flags)) - logger.info('Using these fragment size bins: %s' % args.bins) - - matrix = collections.defaultdict( # position - lambda: collections.defaultdict( # template size bin - lambda: collections.defaultdict(int) # strand - ) - ) + logger.info('Using {:d} concurrent scoring processes'.format(args.parallel)) + logger.info('Filtering alignments for quality >= {}, with flags {} and without flags {}'.format(args.quality, args.include_flags, args.exclude_flags)) + if args.bins: + logger.info('Using these fragment size bins: {}'.format(args.bins)) logger.info('Reading motifs from {}...'.format(args.motifs == '-' and 'standard input' or args.motifs)) + motifs = atactk.data.read_features(args.motifs, args.extension) - score = functools.partial( - atactk.metrics.score_feature, - args.alignments, - args.bins, - args.include_flags, - args.exclude_flags, - args.quality, - args.cut_point_offset - ) + if args.verbose: + logging.debug('Memory in use after loading motifs: {}'.format(atactk.util.memory_in_use())) + + motif_count = 0 + + aggregate_positions = range(0 - args.extension, args.extension + 1) + pool = None - logger.info('Making cut point matrix...') try: - if args.parallel > 1: - pool = multiprocessing.Pool(processes=args.parallel, initializer=worker_init) - results = pool.imap(score, motifs, args.parallel) - else: - results = (score(motif) for motif in motifs) - - motif_count = None - for motif_count, (motif, row, tree) in enumerate(results, 1): - if args.discrete: - print(row) - elif args.aggregate: - # since the tree returned from atactk.metrics.score_feature is sparse, but we want to print a row for every - # position in the extended region around the motif, we can't just iterate tree.items() - max_position = motif.region_length // 2 - min_position = 0 - max_position - - for position in range(min_position, max_position): - for 
bin_group in args.bins: - bin_key = ','.join('%s_%s' % (bin[0], bin[1]) for bin in bin_group) - for strand in ('F', 'R'): - if position in tree and bin_key in tree[position]: - count = tree[position][bin_key].get(strand, 0) - else: - count = 0 - matrix[position][bin_key][strand] += count + logger.info('Making {} cut point matrix...'.format(args.discrete and 'discrete' or 'aggregate')) + scored_motifs = atactk.metrics.cut.score_features_with_cut_points(args.alignments, args.bins, args.include_flags, args.exclude_flags, args.quality, args.cut_point_offset, motifs, args.parallel) + + if args.discrete: + motif_count = print_discrete_matrix(scored_motifs) + elif args.aggregate: + motif_count = print_aggregate_matrix(scored_motifs, aggregate_positions, args.bins) if not motif_count: - logger.error('No motifs were found in the BED input.') - sys.exit(1) - - if args.aggregate: - motif_count = float(motif_count) - print('Position\tBin\tStrand\tCutPointCount\tMeanCutPointCount') - for position, template_size_bins in sorted(matrix.items()): - for template_size_bin, strands in sorted(template_size_bins.items(), key=lambda item: int(item[0].split('_')[0])): - for strand, count in sorted(strands.items()): - print('%s\t%s\t%s\t%d\t%.3f' % (position, template_size_bin, strand, count, count / motif_count)) - - logger.info('Processed {:.0f} motif{} in {}'.format(motif_count, motif_count > 1 and 's' or '', humanize_time(time.time() - job_start))) + logger.warn("""No motifs were found in the BED input. Make sure it is in the format\nspecified in this program's help and at https://genome.ucsc.edu/FAQ/FAQformat.html.""") + + if args.verbose: + logging.debug('Memory in use at end: {}'.format(atactk.util.memory_in_use())) + + logger.info('Processed {:.0f} feature{} in {}'.format(motif_count, motif_count == 1 and '' or 's', atactk.util.humanize_time(time.time() - job_start))) except KeyboardInterrupt: - print('Keyboard interrupt received. 
Stopping...', file=sys.stderr) + logger.info('Keyboard interrupt received in process {}.'.format(os.getpid())) + + if pool: + if sys.version_info.major < 3: + logger.warn('Interrupt handling in multiprocessing programs is not reliable before Python 3, so I may have to exit without cleaning up all worker processes. Sorry.') + signal.signal(signal.SIGALRM, atactk.util.exit_forcefully) + signal.alarm(10) + + logger.info('Telling worker processes to terminate...') + pool.terminate() + logger.info('Waiting for worker processes to terminate...') + pool.join() + logger.info('Exiting.') sys.exit(1) except Exception as e: logger.error(e) - traceback.print_exc(file=sys.stderr) + logger.info("""Check your input files. Make sure the BED file of motifs is in the proper format,\nas specified in this program's help and at https://genome.ucsc.edu/FAQ/FAQformat.html.""") + if args.verbose: + traceback.print_exc(file=sys.stderr) sys.exit(1) if __name__ == '__main__': - make_cut_matrix(parse_arguments()) + args = parse_arguments() + + loglevel = args.verbose and logging.DEBUG or logging.INFO + logging.basicConfig(level=loglevel, format=LOGGING_FORMAT) + logger = logging.getLogger('make_cut_matrix') + + bins = sorted([b for bins in args.bins for b in bins]) + atactk.command.check_bin_resolutions(bins, args.extension) + + make_cut_matrix(args) diff --git a/scripts/make_midpoint_matrix b/scripts/make_midpoint_matrix new file mode 100755 index 0000000..2af6d51 --- /dev/null +++ b/scripts/make_midpoint_matrix @@ -0,0 +1,245 @@ +#!/usr/bin/env python + +# +# make_midpoint_matrix: Given a BAM file containing alignments from an +# ATAC-seq experiment and a BED file of motifs, creates a matrix of the +# fragment midpoints around the motifs. 
+# +# Copyright 2015 Stephen Parker +# +# Licensed under Version 3 of the GPL or any later version +# + +from __future__ import print_function + +import argparse +import collections +import decimal +import functools +import logging +import multiprocessing +import multiprocessing.managers +import os +import signal +import sys +import time +import textwrap +import traceback + +import atactk.command +import atactk.data +import atactk.metrics +import atactk.metrics.mid +import atactk.util + + +LOGGING_FORMAT = '%(levelname)s %(message)s' + + +def worker_init(): + signal.signal(signal.SIGINT, signal.SIG_IGN) + + +def parse_arguments(): + parser = argparse.ArgumentParser( + prog='make_midpoint_matrix', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(""" + + Given a BAM file containing alignments from an ATAC-seq + experiment and a BED file of motifs, creates a matrix of + counts of the fragment midpoints around the motifs. + + BINNING + ======= + + Each motif in the BED file is scored for each of the given + fragment length bins. + + {} + + INPUT + ===== + + BAM files just work. + + BED files mean different things to different people. We + require a file of tab-separated values, where the first six + fields (and their labels in the BED format documentation at + https://genome.ucsc.edu/FAQ/FAQformat.html) are: + + 1. The reference sequence name ('chrom'): text + 2. The feature start position ('chromStart'): integer + 3. The feature end position ('chromEnd'): integer + 4. The name of the feature ('name'): text + 5. The feature's score ('score'): number + 6. The feature's strand ('strand'): '+' or '-' + + Extra fields are ignored. + + OUTPUT + ====== + + You can request either discrete or aggregate output. Discrete + output is intended to be usable as input to CENTIPEDE. 
After + you've run CENTIPEDE, you can feed different parts of the + resulting BED file back into this script and request aggregate + output to produce a file suitable for plotting with one of our + R scripts. As an example, we like to extract from the CENTIPEDE + output all motifs with a posterior probability of binding greater + than 0.99, and a random sampling of motifs with a posterior + probability of less than 0.5. Plotting those provides a good + visual inspection of the results. + + Each row of the discrete matrix represents the scores around a + single motif from your input BED file. The positions reported + depend on the resolution specified in the --bins argument. For + example, specifying a resolution of 1 will result in single + nucleotide resolution -- you'll get the score at each position + in the extended region around the motif. Specifying 10 will + report the sum of every 10 positions. + + Aggregate output reports absolute counts and counts divided by + the number of motifs for all of your motifs at each position + in the region around the motif, for each fragment size bin + represented at that position. + + \0""".format('\n '.join(atactk.command.parse_bins.__doc__.split('\n')[3:]).lstrip())) + ) + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument('-a', '--aggregate-output', dest='aggregate', action='store_true', help='Requests an aggregate matrix. See OUTPUT, below.') + group.add_argument('-d', '--discrete-output', dest='discrete', action='store_true', help='Requests a discrete matrix. See OUTPUT, below.') + + parser.add_argument('-b', '--bins', dest='bins', type=atactk.command.parse_bins, required=True, help='A list of fragment size bin groups and their resolutions. See BINNING, below.') + parser.add_argument('-F', '--exclude-flags', type=int, dest='exclude_flags', action='append', help='A SAM flag used to exclude alignments from the BAM file. More than one may be specified. 
Alignments matching any exclude flag will not be counted. The default is to exclude all unmapped reads/mates by filtering out any alignments with SAM flags 4 or 8 set.') + parser.add_argument('-f', '--include-flags', type=int, dest='include_flags', action='append', help='A SAM flag that determines which alignments from the BAM file will be included in the counts. More than one may be specified. Any alignment matching any include flag will be counted. The default is to include properly paired and mapped reads by filtering for SAM flags 83, 99, 147, or 163.') + parser.add_argument('-p', '--parallel', type=int, default=1, dest='parallel', help='The number of parallel scoring processes to use (default: 1).') + parser.add_argument('-q', '--quality', type=int, default=30, dest='quality', help='The minimum mapping quality required for a read to be counted (default: 30).') + parser.add_argument('-r', '--region-extension', type=int, default=100, dest='extension', help='The number of bases to score on either side of the motifs (default: 100).') + parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', help='Requests more detailed output.') + parser.add_argument('--version', action='version', version='%(prog)s {}'.format(atactk.__version__)) + + parser.add_argument('alignments', metavar='BAM-file-of-aligned-reads', help='The indexed BAM file containing the aligned ATAC-seq reads.') + parser.add_argument('motifs', metavar='BED-file-of-motifs', help='The BED file containing the motifs. 
Use "-" to read from standard input.') + + return parser.parse_args() + + +def print_discrete_matrix(scored_motifs): + motif_count = 0 + for motif_count, (motif, row, tree) in enumerate(scored_motifs, 1): + print(row) + return motif_count + + +def print_aggregate_matrix(scored_motifs, aggregate_positions, bins): + matrix = collections.defaultdict( # position + lambda: collections.defaultdict(int) # fragment size bin + ) + motif_count = 0 + + # since the tree returned from atactk.metrics.mid.score_feature_with_midpoints is sparse, but we want to print a row for every + # position in the extended region around the motif, we can't just iterate tree.items() + for motif_count, (motif, row, tree) in enumerate(scored_motifs, 1): + for position in aggregate_positions: + for bin_group in bins: + group_key = atactk.util.make_bin_group_key(bin_group) + if position in tree: + count = tree[position].get(group_key, 0) + else: + count = 0 + matrix[position][group_key] += count + + decimal_motif_count = decimal.Decimal(motif_count) + print('Position\tFragmentSizeBin\tMidpointCount\tMidpointCountFraction') + for position, fragment_size_bins in sorted(matrix.items()): + for fragment_size_bin, count in sorted(fragment_size_bins.items(), key=lambda item: int(item[0].split('_')[0])): + print('{}\t{}\t{:d}\t{}'.format(position, fragment_size_bin, count, count / decimal_motif_count)) + + return motif_count + + +def make_midpoint_matrix(args): + job_start = time.time() + + if args.include_flags: + args.include_flags = sorted(set(args.include_flags)) + else: + args.include_flags = [83, 99, 147, 163] + + if args.exclude_flags: + args.exclude_flags = sorted(set(args.exclude_flags)) + else: + args.exclude_flags = [4, 8] + + if args.parallel > 1: + logger.info('Using {:d} concurrent scoring processes'.format(args.parallel)) + logger.info('Filtering alignments for quality >= {}, with flags {} and without flags {}'.format(args.quality, args.include_flags, args.exclude_flags)) + 
logger.info('Using these fragment size bins: {}'.format(args.bins)) + logger.info('Reading motifs from {}...'.format(args.motifs == '-' and 'standard input' or args.motifs)) + + motifs = atactk.data.read_features(args.motifs, args.extension) + + motif_count = 0 + score = functools.partial( + atactk.metrics.mid.score_feature_with_midpoints, + args.alignments, + args.bins, + args.include_flags, + args.exclude_flags, + args.quality, + ) + + aggregate_positions = range(0 - args.extension, args.extension) + pool = None + + try: + logger.info('Making {} midpoint matrix...'.format(args.discrete and 'discrete' or 'aggregate')) + pool = multiprocessing.Pool(processes=args.parallel, initializer=worker_init) + scored_motifs = pool.imap(score, motifs, args.parallel) + + if args.discrete: + motif_count = print_discrete_matrix(scored_motifs) + elif args.aggregate: + motif_count = print_aggregate_matrix(scored_motifs, aggregate_positions, args.bins) + + if not motif_count: + logger.warn('No motifs were found in the BED input.') + + if args.verbose: + logging.debug('Memory in use: {}'.format(atactk.util.memory_in_use())) + + logger.info('Processed {:.0f} feature{} in {}'.format(motif_count, motif_count == 1 and '' or 's', atactk.util.humanize_time(time.time() - job_start))) + except KeyboardInterrupt: + logger.info('Keyboard interrupt received in process {}.'.format(os.getpid())) + + if pool: + if sys.version_info.major < 3: + logger.warn('Interrupt handling in multiprocessing programs is not reliable before Python 3, so I may have to exit without cleaning up all worker processes. 
Sorry.') + signal.signal(signal.SIGALRM, atactk.util.exit_forcefully) + signal.alarm(10) + + logger.info('Telling worker processes to terminate...') + pool.terminate() + logger.info('Waiting for worker processes to terminate...') + pool.join() + logger.info('Exiting.') + sys.exit(1) + except Exception as e: + logger.error(e) + if args.verbose: + traceback.print_exc(file=sys.stderr) + sys.exit(1) + + +if __name__ == '__main__': + args = parse_arguments() + + loglevel = args.verbose and logging.DEBUG or logging.INFO + logging.basicConfig(level=loglevel, format=LOGGING_FORMAT) + logger = logging.getLogger('make_midpoint_matrix') + + bins = sorted([b for bins in args.bins for b in bins]) + atactk.command.check_bin_resolutions(bins, args.extension) + + make_midpoint_matrix(args) diff --git a/scripts/measure_features b/scripts/measure_features new file mode 100755 index 0000000..dbb887c --- /dev/null +++ b/scripts/measure_features @@ -0,0 +1,143 @@ +#!/usr/bin/env python + +# +# measure_features: Get coverage statistics for features +# + +from __future__ import print_function + +import argparse +import csv +import errno +import functools +import logging +import math +import multiprocessing +import multiprocessing.managers +import signal +import sys +import textwrap + +import pyBigWig + +import atactk.data + + +STAT_CHOICES = [ + 'coverage', + 'max', + 'mean', + 'median', + 'min', + 'std', +] + + +def parse_arguments(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description=textwrap.dedent(""" + + Given a bigWig file containing coverage counts and a BED file of features, + calculates a requested statistic for each feature. 
+ + """) + '\n\n' + ) + + parser.add_argument('-n', '--null-value', default='NA', dest='nullvalue', help="""The string to record for features that cannot be scored for some reason (usually because the bigWig has no data in their regions).""") + parser.add_argument('-p', '--parallel', type=int, default=1, dest='parallel', help='The number of parallel scoring processes to use (default: 1).') + parser.add_argument('-s', '--statistic', default='mean', choices=STAT_CHOICES, help='The statistic to calculate for each feature.') + parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', help='Requests more detailed output.') + + parser.add_argument('coverage', help='The bigWig file containing coverage counts.') + parser.add_argument('features', default='-', help='The BED file containing the feature data. Omit or use "-" to read from stdin.') + + return parser.parse_args() + + +def clean_measurement(m, nullvalue): + result = m + try: + if math.isnan(m): + result = nullvalue + except: + result = nullvalue + + return result + + +def measure_feature(coverage_file, statistic, nullvalue, feature): + bw = pyBigWig.open(coverage_file) + + measurement = None + measured = False + try: + if statistic == 'median': + values = sorted(bw.values(feature[0], int(feature[1]), int(feature[2]))) + count = len(values) + if count == 0: + measurement = 0 + elif count % 2 == 0: + measurement = (values[count / 2 - 1] + values[count / 2]) / 2 + else: + measurement = values[count / 2] + try: + measurement = [int(measurement)] + except: + measurement = [measurement] + else: + measurement = bw.stats(feature[0], int(feature[1]), int(feature[2]), type=statistic, exact=True) + + measurement = [clean_measurement(m, nullvalue) for m in measurement] + + measured = True + except RuntimeError as e: + logging.error('Error measuring {} for feature {}:{}-{} -- {}'.format(statistic, feature[0], feature[1], feature[2], e)) + measurement = [nullvalue] + + return feature + measurement, measured + 
+ +def worker_init(): + signal.signal(signal.SIGINT, signal.SIG_IGN) + + +if __name__ == '__main__': + args = parse_arguments() + + loglevel = args.verbose and logging.DEBUG or logging.INFO + logging.basicConfig(level=loglevel, format='%(message)s') + + if args.features == '-': + feature_source = sys.stdin + else: + feature_source = atactk.data.open_maybe_gzipped(args.features) + + features = csv.reader(feature_source, dialect='excel-tab') + + score = functools.partial( + measure_feature, + args.coverage, + args.statistic, + args.nullvalue + ) + + if args.parallel > 1: + print('Using {} parallel processes'.format(args.parallel), file=sys.stderr) + pool = multiprocessing.Pool(processes=args.parallel, initializer=worker_init) + scored_features = pool.imap(score, features, args.parallel) + else: + scored_features = (score(feature) for feature in features) + + errors = False + try: + for feature, measured in scored_features: + print('\t'.join(str(t) for t in feature)) + if not measured: + errors = True + if errors: + print('Some features could not be measured, probably because the bigWig file lacked data in their regions.', file=sys.stderr) + + except IOError as e: + if e.errno != errno.EPIPE: + print('Error: {}'.format(e), file=sys.stderr) diff --git a/scripts/measure_signal b/scripts/measure_signal new file mode 100755 index 0000000..471c29c --- /dev/null +++ b/scripts/measure_signal @@ -0,0 +1,201 @@ +#!/usr/bin/env python + +# +# measure_signal: Given a BAM file containing alignments from an +# ATAC-seq experiment and a BED file of features, creates a +# tab-separated value file containing for each feature, a line with +# each overlapping fragment's size and position relative to the +# feature. 
+# +# Copyright 2015 Stephen Parker +# +# Licensed under Version 3 of the GPL or any later version +# + +from __future__ import print_function + +import argparse +import functools +import logging +import multiprocessing +import os +import random +import signal +import sys +import time +import textwrap +import traceback + +import atactk.command +import atactk.data +import atactk.metrics +import atactk.metrics.mid +import atactk.util + + +LOGGING_FORMAT = '%(levelname)s %(message)s' + + +def worker_init(): + signal.signal(signal.SIGINT, signal.SIG_IGN) + + +def parse_arguments(): + parser = argparse.ArgumentParser( + prog='measure_signal', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=textwrap.dedent(""" + + Given a BAM file containing alignments from an ATAC-seq experiment and a + BED file of features, measures ATAC-seq reads around each feature. + + INPUT + ===== + + BAM files just work. + + BED files mean different things to different people. We + require a file of tab-separated values, where the first six + fields (and their labels in the BED format documentation at + https://genome.ucsc.edu/FAQ/FAQformat.html) are: + + 1. The reference sequence name ('chrom'): text + 2. The feature start position ('chromStart'): integer + 3. The feature end position ('chromEnd'): integer + 4. The name of the feature ('name'): text + 5. The feature's score ('score'): number + 6. The feature's strand ('strand'): '+' or '-' + + Extra fields are ignored. + + OUTPUT + ====== + + A tab-separated value file containing a line for each fragment overlapping + any feature's region. 
Each line contains three columns: + + FeatureMidpoint: the absolute position of the feature's midpoint + + FragmentMidpoint: the absolute position of the fragment midpoint + + FragmentMidpointDistanceToFeatureMidpoint: the distance in + nucleotides between the fragment and feature midpoints + + FragmentSize: the length of the fragment (template length or + insert size in SAM terminology) + + FragmentPositionRelativeToFeatureMidpoint: if a fragment's cut + points both map upstream of the feature's midpoint, it's 'L' for + 'left'; if both map downstream of the feature's midpoint, it's + 'R' for 'right'; if one cut point maps to each side of the + midpoint, it's 'O' for 'overlapping'. + + \0""".format('\n '.join(atactk.command.parse_bins.__doc__.split('\n')[3:]).lstrip())) + ) + + parser.add_argument('-F', '--exclude-flags', type=int, dest='exclude_flags', action='append', help='A SAM flag used to exclude alignments from the BAM file. More than one may be specified. Alignments matching any exclude flag will not be counted. The default is to exclude all unmapped reads/mates by filtering out any alignments with SAM flags 4 or 8 set.') + parser.add_argument('-f', '--include-flags', type=int, dest='include_flags', action='append', help='A SAM flag that determines which alignments from the BAM file will be included in the counts. More than one may be specified. Any alignment matching any include flag will be counted. 
The default is to include properly paired and mapped reads by filtering for SAM flags 83, 99, 147, or 163.') + parser.add_argument('-o', '--cut-point-offset', type=int, default=4, dest='cut_point_offset', help='The position of cut points relative to the beginning of a read and in the direction toward the read end, as a number of bases (default: 4).') + parser.add_argument('-p', '--parallel', type=int, default=1, dest='parallel', help='The number of parallel scoring processes to use (default: 1).') + parser.add_argument('-q', '--quality', type=int, default=30, dest='quality', help='The minimum mapping quality required for a read to be counted (default: 30).') + parser.add_argument('-r', '--region-extension', type=int, default=100, dest='extension', help='The number of bases to score on either side of the features (default: 100).') + parser.add_argument('-s', '--sample', type=int, default=None, help='Only measure a random sample of this many features from the input file.') + parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', help='Requests more detailed output.') + parser.add_argument('--version', action='version', version='%(prog)s {}'.format(atactk.__version__)) + + parser.add_argument('alignments', metavar='BAM-file-of-aligned-reads', help='The indexed BAM file containing the aligned ATAC-seq reads.') + parser.add_argument('features', metavar='BED-file-of-features', help='The BED file containing the features. 
Use "-" to read from standard input.') + + return parser.parse_args() + + +def measure_signal(args): + job_start = time.time() + + if args.include_flags: + args.include_flags = sorted(set(args.include_flags)) + else: + args.include_flags = [83, 99, 147, 163] + + if args.exclude_flags: + args.exclude_flags = sorted(set(args.exclude_flags)) + else: + args.exclude_flags = [4, 8] + + if args.parallel > 1: + logger.info('Using {:d} concurrent scoring processes'.format(args.parallel)) + + logger.info('Filtering alignments for quality >= {}, with flags {} and without flags {}'.format(args.quality, args.include_flags, args.exclude_flags)) + logger.info('Reading features from {}...'.format(args.features == '-' and 'standard input' or args.features)) + + features = atactk.data.read_features(args.features, args.extension) + + if args.sample: + features = list(features) + total_feature_count = len(features) + features = random.sample(features, args.sample) + sample_count = len(features) + logger.info('Using a sample of {:d} feature{} from the total of {:d}'.format(sample_count, sample_count == 1 and '' or 's', total_feature_count)) + + feature_count = 0 + finder = functools.partial( + atactk.metrics.mid.find_midpoints_around_feature, + args.alignments, + args.include_flags, + args.exclude_flags, + args.quality, + args.cut_point_offset + ) + + pool = None + + try: + logger.info('Finding fragment midpoints...') + if args.parallel > 1: + pool = multiprocessing.Pool(processes=args.parallel, initializer=worker_init) + midpoint_lists = pool.imap(finder, features, args.parallel) + else: + midpoint_lists = (finder(feature) for feature in features) + + print('FeatureMidpoint\tFragmentMidpoint\tFragmentMidpointDistanceToFeatureMidpoint\tFragmentSize\tFragmentPositionRelativeToFeatureMidpoint\tFeatureNumber') + for feature_count, midpoint_list in enumerate(midpoint_lists, 1): + for feature_midpoint, fragment_midpoint, distance, fragment_size, fragment_position in midpoint_list: + 
print('{:d}\t{:d}\t{:d}\t{:d}\t{}\t{}'.format(feature_midpoint, fragment_midpoint, distance, fragment_size, fragment_position, feature_count)) + + if not feature_count: + logger.warn('No features were found in the BED input.') + + if args.verbose: + logging.debug('Memory in use: {}'.format(atactk.util.memory_in_use())) + + logger.info('Processed {:.0f} feature{} in {}'.format(feature_count, feature_count == 1 and '' or 's', atactk.util.humanize_time(time.time() - job_start))) + except KeyboardInterrupt: + logger.info('Keyboard interrupt received in process {}.'.format(os.getpid())) + + if pool: + if sys.version_info.major < 3: + logger.warn('Interrupt handling in multiprocessing programs is not reliable before Python 3, so I may have to exit without cleaning up all worker processes. Sorry.') + signal.signal(signal.SIGALRM, atactk.util.exit_forcefully) + signal.alarm(10) + + logger.info('Telling worker processes to terminate...') + pool.terminate() + logger.info('Waiting for worker processes to terminate...') + pool.join() + logger.info('Exiting.') + sys.exit(1) + except Exception as e: + logger.error(e) + if args.verbose: + traceback.print_exc(file=sys.stderr) + sys.exit(1) + + +if __name__ == '__main__': + args = parse_arguments() + + loglevel = args.verbose and logging.DEBUG or logging.INFO + logging.basicConfig(level=loglevel, format=LOGGING_FORMAT) + logger = logging.getLogger('measure_signal') + + measure_signal(args) diff --git a/scripts/plot_aggregate_cut_matrix.R b/scripts/plot_aggregate_cut_matrix.R new file mode 100755 index 0000000..24caee0 --- /dev/null +++ b/scripts/plot_aggregate_cut_matrix.R @@ -0,0 +1,62 @@ +#!/usr/bin/env Rscript +# -*- mode: R -*- + +# plot_aggregate_matrix: produces a plot of ATAC-seq signal around +# features, as generated by calling make_cut_matrix with the +# --aggregate-output option. 
+ + +suppressMessages(library("RColorBrewer")) +suppressMessages(library("ggplot2")) +suppressMessages(library("grid")) +suppressMessages(library("gtools")) + +argv <- commandArgs(TRUE) + +bound_file = argv[1] +unbound_file = argv[2] +title = argv[3] +output_filename <- argv[4] + +bound = read.delim(file=bound_file, header=TRUE, sep="\t") +if (nrow(bound) > 0) { + bound$Bound = "Bound" + bound$FragmentSizeBin <- as.character(bound$FragmentSizeBin) +} + +if (unbound_file != "") { + cat("Unbound file given\n") + unbound = read.delim(file=unbound_file, header=TRUE, sep="\t") +} else { + cat("No unbound file given\n") + unbound = bound[FALSE,] +} +if (nrow(unbound) > 0) { + unbound$Bound = "Unbound" + unbound$FragmentSizeBin <- as.character(unbound$FragmentSizeBin) +} + +data <- rbind(bound, unbound) +data$FragmentSizeBin <- factor(data$FragmentSizeBin) +data$FragmentSizeBin <- factor(data$FragmentSizeBin, levels=mixedsort(levels(data$FragmentSizeBin))) +if (length(data$Strand) > 0) { + data$Strand <- as.factor(data$Strand) +} + +p <- ggplot(data, aes(x=Position, y=CutPointCountFraction, colour=Strand)) +p <- p + geom_line(alpha=0.75) + facet_grid(FragmentSizeBin ~ Bound, scales="free") + ggtitle(title) +p <- p + xlab('Position relative to feature') + ylab('ATAC-seq signal at position / feature count') +p <- p + theme_bw(base_size=10) +p <- p + theme( + strip.background = element_rect(fill="gray90", colour=FALSE), + panel.border = element_rect(colour="gray90"), + panel.margin = unit(1, "lines"), + plot.title=element_text(vjust=1), + axis.title.x=element_text(vjust=-0.5), + axis.title.y=element_text(vjust=1) +) +p <- p + scale_fill_brewer(palette="Dark2") + +ggsave(p, file=output_filename, width=11, height=8.5, units="in") + +cat(paste("Plot written to", output_filename, "\n")) diff --git a/scripts/plot_aggregate_midpoint_matrix.R b/scripts/plot_aggregate_midpoint_matrix.R new file mode 100755 index 0000000..d89f862 --- /dev/null +++ 
b/scripts/plot_aggregate_midpoint_matrix.R
@@ -0,0 +1,60 @@
+#!/usr/bin/env Rscript
+# -*- mode: R -*-
+
+# plot_aggregate_midpoint_matrix: produces a plot of ATAC-seq signal around
+# features, as generated by calling make_midpoint_matrix with the
+# --aggregate-output option.
+
+
+suppressMessages(library("RColorBrewer"))
+suppressMessages(library("ggplot2"))
+suppressMessages(library("grid"))
+suppressMessages(library("gtools"))
+
+argv <- commandArgs(TRUE)
+
+bound_file = argv[1]
+unbound_file = argv[2]
+title = argv[3]
+output_filename <- argv[4]
+
+cat(paste("Reading aggregate matrix for bound motifs from", bound_file, "\n"))
+bound = read.delim(file=bound_file, header=TRUE, sep="\t")
+if (nrow(bound) > 0) {
+    bound$Bound = "Bound"
+    bound$FragmentSizeBin <- as.character(bound$FragmentSizeBin)
+}
+
+if (unbound_file != "") {
+    cat("Unbound file given\n")
+    unbound = read.delim(file=unbound_file, header=TRUE, sep="\t")
+} else {
+    cat("No unbound file given\n")
+    unbound = bound[FALSE,]
+}
+if (nrow(unbound) > 0) {
+    unbound$Bound = "Unbound"
+    unbound$FragmentSizeBin <- as.character(unbound$FragmentSizeBin)
+}
+
+data <- rbind(bound, unbound)
+data$FragmentSizeBin <- factor(data$FragmentSizeBin)
+data$FragmentSizeBin <- factor(data$FragmentSizeBin, levels=mixedsort(levels(data$FragmentSizeBin)))
+
+p <- ggplot(data, aes(x=Position, y=MidpointCountFraction))
+p <- p + geom_line(alpha=0.75) + facet_grid(FragmentSizeBin ~ Bound, scales="free") + ggtitle(title)
+p <- p + xlab('Position relative to feature') + ylab('ATAC-seq signal at position / feature count')
+p <- p + theme_bw(base_size=10)
+p <- p + theme(
+    strip.background = element_rect(fill="gray90", colour=FALSE),
+    panel.border = element_rect(colour="gray90"),
+    panel.margin = unit(1, "lines"),
+    plot.title=element_text(vjust=1),
+    axis.title.x=element_text(vjust=-0.5),
+    axis.title.y=element_text(vjust=1)
+)
+p <- p + scale_fill_brewer(palette="Dark2")
+
+ggsave(p, file=output_filename, width=11, height=8.5,
units="in") + +cat(paste("Plot written to", output_filename, "\n")) diff --git a/scripts/plot_signal.R b/scripts/plot_signal.R new file mode 100755 index 0000000..5da2e22 --- /dev/null +++ b/scripts/plot_signal.R @@ -0,0 +1,177 @@ +#!/usr/bin/env Rscript +# -*- mode: R -*- + +# plot_signal: produces a plot of ATAC-seq signal around features, +# using the output of the measure_signal script + +options(stringsAsFactors = F) +library(entropy) +library(ggplot2) +suppressMessages(library("RColorBrewer")) +suppressMessages(library("ggplot2")) + +multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) { + library(grid) + + # Make a list from the ... arguments and plotlist + plots <- c(list(...), plotlist) + + numPlots = length(plots) + + # If layout is NULL, then use 'cols' to determine layout + if (is.null(layout)) { + # Make the panel + # ncol: Number of columns of plots + # nrow: Number of rows needed, calculated from # of cols + layout <- matrix(seq(1, cols * ceiling(numPlots/cols)), + ncol = cols, nrow = ceiling(numPlots/cols)) + } + + if (numPlots==1) { + print(plots[[1]]) + + } else { + # Set up the page + grid.newpage() + pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout)))) + + # Make each plot, in the correct location + for (i in 1:numPlots) { + # Get the i,j matrix positions of the regions that contain this subplot + matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE)) + + print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row, + layout.pos.col = matchidx$col)) + } + } +} + +argv <- commandArgs(TRUE) + +signal_file = argv[1] +title = argv[2] +output_filename <- argv[3] +plot_filename <- argv[4] + +## Step 1: Read and pre-process data +# Read data table +d <- read.delim(signal_file) +# Trim data to our region of interest +d <- d[d$FragmentSize >= 36 & d$FragmentSize <= 400,] +# Transform frag sizes in factors to avoid 0-count sizes not being included +d2 <- d +d2$FragmentSize <- factor(d2$FragmentSize, levels = 
36:400) +# Make counts matrix +d2 <- table(d2$FragmentMidpointDistanceToFeatureMidpoint, d2$FragmentSize) +d2 <- as.data.frame.matrix(d2) + + +## Step 2: Make compressed matrices +# Compress matrix in 5 and 11 bp windows to boost signal +comp_mat <- matrix(nrow = 91, ncol = 365) +for(i in 0:90){ + j <- 11*i + comp_mat[i+1,] <- as.numeric(apply(d2[(j+1):(j+11),], 2, sum)) +}; rm(i,j) +comp_mat2 <- matrix(nrow = 91, ncol = 73) +for(i in 0:72){ + j <- 5*i + comp_mat2[,i+1] <- as.numeric(apply(comp_mat[,(j+1):(j+5)], 1, sum)) +}; rm(i,j) + +# Compress matrix in 11 bp windows and 3 fragment sizes bins +sizes <- list(s = c(36,149), m = c(150, 325), l = c(325,400)) +comp_mat_bins <- matrix(nrow = 91, ncol = 3) +for(i in 1:3){ + start = sizes[[i]][1]-35 + end = sizes[[i]][2]-35 + comp_mat_bins[,i] <- as.numeric(apply(comp_mat[,(start):(end)], 1, sum))/(end-start) +}; rm(i, sizes, start, end) + + +## Step 3: Calculate entropies +# Normalized entropy function +normentropy <- function(x){ + maxentro <- entropy(rep(1,length(x))) + normentro <- ( maxentro - entropy(x) ) / maxentro + return(normentro) +} +# Calculate entropy +entro_bins100 <- apply(comp_mat_bins[36:55,],1,normentropy)/20 +entro_bins500 <- apply(comp_mat_bins[c(1:35,56:91),],1,normentropy)/71 +entro_full <- sum(apply(comp_mat_bins,1,normentropy)) +entro100 <- sum(entro_bins100) +entro500 <- sum(entro_bins500) +entro_ratio <- entro100/entro500 + +## Step 4: Output data +write.table(matrix(c(entro100, entro500, entro_ratio, entro_full),nrow = 1), row.names = F, + col.names = F, file = output_filename, sep = '\t') +cat(paste("Entropies written to", output_filename, "\n")) + +## Step 5: Plot information content per region + +posinfo <- data.frame(info = apply(comp_mat2, 1, normentropy), pos = -501+c(1:91)*11) +fraginfo <- data.frame(info = apply(comp_mat2, 2, normentropy), size = 35+c(1:73)*5) + +p1 <- ggplot(posinfo, aes(x = pos , y= info)) + geom_line() + + theme_bw() + theme( + strip.background = 
element_rect(fill="gray90", colour=FALSE), + panel.border = element_rect(colour="gray90"), + plot.title=element_text(vjust= 1, face = 'bold'), + axis.title.x=element_text(vjust = -0.5, size = 7), + axis.title.y=element_text(vjust = 1, size = 7), + axis.text.x = element_text(size = 5), + axis.text.y = element_text(size = 5) + ) + + labs(x = 'Fragment midpoint position relative to feature midpoint', + y = 'Normalized information') + + ylim(0, 0.28) + +p2 <- ggplot(fraginfo, aes(x = size, y = info)) + geom_line() + coord_flip() + + theme_bw() + theme( + strip.background = element_rect(fill="gray90", colour=FALSE), + panel.border = element_rect(colour="gray90"), + plot.title=element_text(vjust= 1, face = 'bold'), + axis.title.x=element_text(vjust = -0.5, size = 7), + axis.title.y=element_text(vjust = 1, size = 7), + axis.text.x = element_text(size = 5), + axis.text.y = element_text(size = 5) + ) + + labs(x = 'Fragment size (bp)', + y = 'Normalized information', + title = '') + + ylim(0, 0.2) + +if(nrow(d) >= 250000){ + d_to_plot <- d[sample(nrow(d), 250000, replace = F),] +} else {d_to_plot <- d} + +p <- ggplot(d_to_plot, aes(x=FragmentMidpointDistanceToFeatureMidpoint, y=FragmentSize, + colour=FragmentPositionRelativeToFeatureMidpoint)) +p <- p + geom_point(size=0.025, alpha=0.05) +p <- p + ggtitle(title) +p <- p + theme_bw(base_size=10) +p <- p + theme( + strip.background = element_rect(fill="gray90", colour=FALSE), + panel.border = element_rect(colour="gray90"), + panel.margin = unit(1, "lines"), + plot.title=element_text(vjust=1, hjust = 0.5), + axis.title.y=element_text(vjust=1), + axis.title.x=element_blank() + ) + + labs(y = '') +p <- p + scale_colour_manual(values = c("blue", "red", "black"), guide = F) + + +p0 <- ggplot() + theme_bw() + theme(strip.background = element_rect(fill="white", colour=FALSE), + panel.border = element_rect(colour="white")) + + +layout <- matrix(c(1,2,2, + 1,2,2, + 4,3,3), ncol=3, byrow = T) +pdf(file = plot_filename, width = 6, height 
= 5) +multiplot(layout = layout, p2, p, p1, p0) +dev.off() +cat(paste("Plot written to", plot_filename, "\n")) diff --git a/scripts/trim_adapters b/scripts/trim_adapters index 8b0cd89..deca820 100755 --- a/scripts/trim_adapters +++ b/scripts/trim_adapters @@ -11,13 +11,9 @@ from __future__ import print_function import argparse -import codecs -import collections import gzip import os import re -import string -import sys import Levenshtein @@ -25,6 +21,8 @@ import atactk.data FQ_FILENAME_RE = re.compile('^(?P[^/]+)\.(?Pf(?:ast)?q)*(?P\.gz)*$') + + def make_trimmed_filename(original_filename): output_directory = os.path.dirname(original_filename) basename = os.path.basename(original_filename) @@ -86,8 +84,8 @@ def trim_pair(pair): forward_read = trim_record(forward_read, trim_start, cut) reverse_read = trim_record(reverse_read, trim_start, cut) - forward_rec = ['%s\n' % i for i in forward_read] - reverse_rec = ['%s\n' % i for i in reverse_read] + forward_rec = ['{}\n'.format(i) for i in forward_read] + reverse_rec = ['{}\n'.format(i) for i in reverse_read] return forward_rec, reverse_rec diff --git a/setup.py b/setup.py index c8059e8..7a22704 100755 --- a/setup.py +++ b/setup.py @@ -11,7 +11,8 @@ readme = open('README.rst').read() requirements = [ - 'pysam', + 'pyBigWig', + 'pysam>=0.10.0', 'python-levenshtein', 'sexpdata', ] @@ -20,17 +21,22 @@ setup( name='atactk', - version='0.1.6', + version='0.1.9', description="A toolkit for working with ATAC-seq data.", long_description=readme + '\n\n', author="The Parker Lab", author_email='parkerlab-software@umich.edu', url='https://github.com/ParkerLab/atactk', - packages=['atactk', ], + packages=['atactk', 'atactk.metrics'], scripts=[ 'scripts/make_cut_matrix', + 'scripts/make_midpoint_matrix', + 'scripts/measure_features', + 'scripts/measure_signal', + 'scripts/plot_aggregate_cut_matrix.R', + 'scripts/plot_aggregate_midpoint_matrix.R', + 'scripts/plot_signal.R', 'scripts/trim_adapters', - 'scripts/plot_aggregate_matrix.R' 
], include_package_data=True, install_requires=requirements, @@ -38,16 +44,14 @@ zip_safe=False, keywords='atactk', classifiers=[ - 'Development Status :: 2 - Pre-Alpha', - 'Intended Audience :: Developers', + 'Development Status :: 4 - Beta', + 'Intended Audience :: Science/Research', 'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)', 'Natural Language :: English', "Programming Language :: Python :: 2", - 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.3', - 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', ], test_suite='tests', tests_require=test_requirements diff --git a/tox.ini b/tox.ini index 3329dc8..027507d 100644 --- a/tox.ini +++ b/tox.ini @@ -1,9 +1,20 @@ [tox] -envlist = py26, py27, py33, py34 +envlist = clean,py27,py35,stats [testenv] setenv = PYTHONPATH = {toxinidir}:{toxinidir}/atactk -commands = python setup.py test +commands= + coverage run --source atactk -a setup.py test deps = -r{toxinidir}/requirements.txt + coverage + +[testenv:clean] +commands= + coverage erase + +[testenv:stats] +commands= + coverage report + coverage html