Skip to content

Commit

Permalink
Add fast_concat function to Metric (#53)
Browse files Browse the repository at this point in the history
* Appends Metrics without parsing them
* Checks for consistent headers
* Rolled specialized appending functions into writer functions by adding a new parameter
  • Loading branch information
kockan authored Nov 7, 2023
1 parent 28ab1e3 commit 1b0e0ff
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 12 deletions.
1 change: 0 additions & 1 deletion fgpyo/fasta/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ def samtools_dict(*args: Any) -> None:
def samtools_faidx(*args: Any) -> None:
pass


else:
from pysam import dict as samtools_dict
from pysam import faidx as samtools_faidx
Expand Down
28 changes: 17 additions & 11 deletions fgpyo/io/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ def assert_path_is_writeable(path: Path, parent_must_exist: bool = True) -> None


def to_reader(path: Path) -> Union[io.TextIOWrapper, TextIO, IO[Any]]:
"""Opens a Path for reading.
"""Opens a Path for reading and based on extension uses open() or gzip.open()
Args:
path: Path to read from
Expand All @@ -144,21 +144,27 @@ def to_reader(path: Path) -> Union[io.TextIOWrapper, TextIO, IO[Any]]:
return path.open(mode="r")


def to_writer(path: Path, append: bool = False) -> Union[IO[Any], io.TextIOWrapper]:
    """Opens a Path for writing (or appending) and based on extension uses open() or gzip.open()

    Args:
        path: Path to write (or append) to
        append: When True the file is opened in append mode ("a"); when False (the
            default) it is opened in write mode ("w").

    Returns:
        A text handle. Paths whose suffix is in COMPRESSED_FILE_EXTENSIONS are opened with
        gzip.open() in binary mode and wrapped in a UTF-8 TextIOWrapper; all other paths
        are opened with Path.open().

    Example:
        writer = fio.to_writer(path = Path("writer.txt"))
        writer.write(f'{something}\\n')
        writer.close()
    """
    mode_prefix = "a" if append else "w"

    if path.suffix in COMPRESSED_FILE_EXTENSIONS:
        # gzip is opened in binary mode and wrapped so callers always write text.
        return io.TextIOWrapper(
            cast(IO[bytes], gzip.open(path, mode=mode_prefix + "b")), encoding="utf-8"
        )
    else:
        return path.open(mode=mode_prefix)


def read_lines(path: Path, strip: bool = False) -> Iterator[str]:
Expand All @@ -183,19 +189,19 @@ def read_lines(path: Path, strip: bool = False) -> Iterator[str]:
yield line.rstrip("\r\n")


def write_lines(path: Path, lines_to_write: Iterable[Any], append: bool = False) -> None:
    """Writes (or appends) a file with one line per item in provided iterable

    Each item is converted with str() and followed by a newline.

    Args:
        path: Path to write (or append) to
        lines_to_write: items to write (or append) to file
        append: When True the lines are appended to the file; when False (the default)
            the file is opened in write mode.

    Example:
        lines: List[Any] = ["things to write", 100]
        path_to_write_to: Path = Path("file_to_write_to.txt")
        fio.write_lines(path = path_to_write_to, lines_to_write = lines)
    """
    with to_writer(path=path, append=append) as writer:
        for line in lines_to_write:
            writer.write(str(line))
            writer.write("\n")
Expand Down
14 changes: 14 additions & 0 deletions fgpyo/util/metric.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,3 +305,17 @@ def format_value(cls, value: Any) -> str:
def to_list(cls, value: str) -> List[Any]:
    """Splits a comma-delimited string into a list; the empty string gives an empty list."""
    if value == "":
        return []
    return value.split(",")

@staticmethod
def fast_concat(*inputs: Path, output: Path) -> None:
    """Concatenates metric files line by line, without parsing them into Metric objects.

    The first line of every input is treated as its header. All headers must be identical;
    the shared header is written to ``output`` once, followed by the remaining lines of
    each input in the order the inputs were given.

    Args:
        *inputs: One or more paths of metric files to concatenate.
        output: Path the concatenated metrics are written to.

    Raises:
        ValueError: If no inputs are provided, or if an input has no lines at all.
        AssertionError: If the headers of the inputs do not all match.
    """
    from itertools import chain, islice

    if len(inputs) == 0:
        raise ValueError("No inputs provided")

    headers = []
    for input_path in inputs:
        # A default for next() keeps an empty input from leaking a bare StopIteration.
        header = next(io.read_lines(input_path), None)
        if header is None:
            raise ValueError(f"Input has no header line: {input_path}")
        headers.append(header)

    # Raised explicitly (rather than via `assert`) so the check still runs under
    # `python -O`; the exception type and message are the same as before.
    if len(set(headers)) != 1:
        raise AssertionError("Input headers do not match")

    # Stream every input lazily, skipping its header line, so no input is ever held in
    # memory in full and the output only has to be opened once.
    records = chain.from_iterable(
        islice(io.read_lines(input_path), 1, None) for input_path in inputs
    )
    io.write_lines(path=output, lines_to_write=chain(headers[:1], records))
23 changes: 23 additions & 0 deletions fgpyo/util/tests/test_metric.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,3 +319,26 @@ def test_metric_list_parse_with_none() -> None:
assert ListPerson.parse(fields=["Max,Sally", ","]) == ListPerson(
name=["Max", "Sally"], age=[None, None]
)


def test_metrics_fast_concat(tmpdir: TmpDir) -> None:
    """Writes three single-metric files, concatenates them, and reads the result back."""
    workdir = Path(tmpdir)
    path_input = [
        workdir / file_name
        for file_name in ("metrics_1.txt", "metrics_2.txt", "metrics_3.txt")
    ]
    path_output: Path = workdir / "metrics_concat.txt"

    # One dummy metric per input file, in index order.
    for index, source in enumerate(path_input):
        DummyMetric.write(source, DUMMY_METRICS[index])

    Metric.fast_concat(*path_input, output=path_output)
    metrics: List[DummyMetric] = list(DummyMetric.read(path=path_output))

    assert len(metrics) == len(DUMMY_METRICS)
    for index in range(len(path_input)):
        assert metrics[index].header() == DummyMetric.header()
    for index in range(len(path_input)):
        assert metrics[index] == DUMMY_METRICS[index]

0 comments on commit 1b0e0ff

Please sign in to comment.