Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

make pairsio.py to read (and, in the future, write) .pairs files #195

Merged
merged 10 commits into from
Mar 9, 2024
126 changes: 85 additions & 41 deletions doc/examples/scalings_example.ipynb

Large diffs are not rendered by default.

20 changes: 10 additions & 10 deletions pairtools/cli/scaling.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import click
import pandas as pd

from ..lib import fileio, pairsam_format, headerops
from ..lib import fileio
from . import cli, common_io_options

from ..lib.scaling import compute_scaling
Expand Down Expand Up @@ -39,21 +39,21 @@
@click.option(
"--dist-range",
type=click.Tuple([int, int]),
default=(10, 1_000_000_000),
default=(1, 1_000_000_000),
show_default=True,
required=False,
help="Distance range. ",
)
@click.option(
"--n-dist-bins",
"--n-dist-bins-decade",
type=int,
default=128,
default=8,
show_default=True,
required=False,
help="Number of distance bins to split the distance range. ",
help="Number of bins to split the distance range in log10-space, specified per a factor of 10 difference.",
)
@common_io_options
def scaling(input_path, output, view, chunksize, dist_range, n_dist_bins, **kwargs):
def scaling(input_path, output, view, chunksize, dist_range, n_dist_bins_decade, **kwargs):
"""Calculate pairs scalings.

INPUT_PATH : by default, a .pairs/.pairsam file to calculate statistics.
Expand All @@ -63,10 +63,10 @@ def scaling(input_path, output, view, chunksize, dist_range, n_dist_bins, **kwar

Output is .tsv file with scaling stats (both cis scalings and trans levels).
"""
scaling_py(input_path, output, view, chunksize, dist_range, n_dist_bins, **kwargs)
scaling_py(input_path, output, view, chunksize, dist_range, n_dist_bins_decade, **kwargs)


def scaling_py(input_path, output, view, chunksize, dist_range, n_dist_bins, **kwargs):
def scaling_py(input_path, output, view, chunksize, dist_range, n_dist_bins_decade, **kwargs):

if len(input_path) == 0:
raise ValueError(f"No input paths: {input_path}")
Expand All @@ -93,13 +93,13 @@ def scaling_py(input_path, output, view, chunksize, dist_range, n_dist_bins, **k
regions=view,
chromsizes=None,
dist_range=dist_range,
n_dist_bins=n_dist_bins,
n_dist_bins_decade=n_dist_bins_decade,
chunksize=chunksize,
)
summary_stats = pd.concat([cis_scalings, trans_levels])

# save statistics to the file
summary_stats.to_csv(outstream, sep="\t")
summary_stats.to_csv(outstream, sep="\t", index=False)

if instream != sys.stdin:
instream.close()
Expand Down
1 change: 1 addition & 0 deletions pairtools/lib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from . import dedup
from . import filterbycov
from . import headerops
from . import pairsio
from . import pairsam_format
from . import parse
from . import parse_pysam
Expand Down
30 changes: 29 additions & 1 deletion pairtools/lib/fileio.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import subprocess
import sys


# Module-specific exception type; it adds no behavior on top of Exception.
class ParseError(Exception):
pass

Expand Down Expand Up @@ -235,3 +234,32 @@ def close(self, timeout=None):
self._stream.close()
retcode = self._proc.wait(timeout=timeout)
return retcode


def get_stream_handlers(instream):
    """
    Get the readline and peek functions for the provided input stream.

    If the stream has a ``buffer`` attribute that itself provides ``peek``,
    the buffer's bound methods are returned, so both handlers act on that
    underlying buffer. Otherwise the stream's own ``peek``/``readline`` are
    returned.

    Parameters:
        instream (file-like object): The input stream to get the handlers for.

    Returns:
        tuple: A tuple containing the following elements:
            - readline_f (function): The readline function for the input stream.
            - peek_f (function): The peek function for the input stream.

    Raises:
        ValueError: If the peek function cannot be found for the provided stream.
    """
    # Having a ``buffer`` attribute is not enough: the buffer may be missing
    # (None) or not peekable (e.g. a text wrapper around io.BytesIO). Check
    # for ``peek`` explicitly so such streams end in the documented
    # ValueError instead of an AttributeError.
    buffer = getattr(instream, "buffer", None)
    if buffer is not None and hasattr(buffer, "peek"):
        return buffer.readline, buffer.peek

    if hasattr(instream, "peek"):
        return instream.readline, instream.peek

    raise ValueError("Cannot find the peek() function of the provided stream!")


15 changes: 1 addition & 14 deletions pairtools/lib/headerops.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from .. import __version__
from . import pairsam_format
from .fileio import ParseError
from .fileio import ParseError, get_stream_handlers

from .._logging import get_logger

Expand All @@ -21,19 +21,6 @@
COMMENT_CHAR = "#"


def get_stream_handlers(instream):
    """Return the (readline, peek) functions of a peekable input stream.

    If the stream has a ``buffer`` attribute that itself provides ``peek``,
    the buffer's bound methods are returned; otherwise the stream's own
    ``readline``/``peek`` are returned.

    Raises
    ------
    ValueError
        If no peek() function can be found for the provided stream.
    """
    # Verify that the buffer is actually peekable; a ``buffer`` attribute
    # alone (e.g. io.BytesIO under a text wrapper, or None) would otherwise
    # raise AttributeError instead of the intended ValueError.
    buffer = getattr(instream, "buffer", None)
    if buffer is not None and hasattr(buffer, "peek"):
        return buffer.readline, buffer.peek

    if hasattr(instream, "peek"):
        return instream.readline, instream.peek

    raise ValueError("Cannot find the peek() function of the provided stream!")


def get_header(instream, comment_char=COMMENT_CHAR, ignore_warning=False):
"""Returns a header from the stream and the remainder of the stream
Expand Down
49 changes: 49 additions & 0 deletions pairtools/lib/pairsio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import pandas as pd

from . import fileio, headerops

def read_pairs(pairs, nproc=3, cmd_in=None, **kwargs):
    """
    Read a .pairs file and return a dataframe of pairs, the header, and chromsizes.

    Parameters:
        pairs (str, os.PathLike or file-like object): A path to a .pairs file to read
            or an already open file-like object/handle. Paths are opened with
            fileio.auto_open; anything else is used as a stream as-is.
        nproc (int): Number of processes to use for reading the file; passed to
            fileio.auto_open when `pairs` is a path. Default is 3.
        cmd_in (str): The command to be used for reading the file; passed to
            fileio.auto_open when `pairs` is a path. Default is None.

        **kwargs: Additional keyword arguments to be passed to pd.read_csv.
            `header`, `names`, `sep` and `dtype` are already set by this function
            and must not be passed again. Useful options include:
            - chunksize (int): If specified, return an iterable object of type
              TextFileReader that reads in chunks of lines.
            - usecols (list-like or callable): Return a subset of the columns. If
              list-like, all elements must either be positional or strings. If
              callable, the callable function will be evaluated against the column
              names, returning names where the callable function evaluates to True.

    Returns:
        tuple: A tuple containing the following elements, in this order:
            - pairs_df (pd.DataFrame): A pandas DataFrame with pairs (or the object
              returned by pd.read_csv when e.g. `chunksize` is given).
            - header (list of str): The original header of the pairs file.
            - chromsizes (dict): A dictionary containing chromosome sizes extracted
              from the header.
    """
    # Local import keeps this block self-contained; only needed to recognise
    # path-like inputs such as pathlib.Path.
    import os

    if isinstance(pairs, (str, os.PathLike)):
        # NOTE: the stream opened here is not closed by this function; with
        # `chunksize` the returned reader presumably still reads from it lazily.
        pairs_stream = fileio.auto_open(
            os.fspath(pairs),
            mode="r",
            nproc=nproc,
            command=cmd_in,
        )
    else:
        pairs_stream = pairs

    # Consume the header lines first; the remaining stream holds the pairs body.
    header, pairs_body = headerops.get_header(pairs_stream)
    cols = headerops.extract_column_names(header)
    chromsizes = headerops.extract_chromsizes(header)

    pairs_df = pd.read_csv(
        pairs_body,
        header=None,
        names=cols,
        sep="\t",
        # Force the chromosome columns to strings.
        dtype={"chrom1": str, "chrom2": str},
        **kwargs
    )

    return pairs_df, header, chromsizes
Loading
Loading