diff --git a/fgpyo/fasta/builder.py b/fgpyo/fasta/builder.py
index bb58316d..3efdc640 100755
--- a/fgpyo/fasta/builder.py
+++ b/fgpyo/fasta/builder.py
@@ -43,7 +43,6 @@ def samtools_dict(*args: Any) -> None:
 
     def samtools_faidx(*args: Any) -> None:
         pass
-
 else:
     from pysam import dict as samtools_dict
     from pysam import faidx as samtools_faidx
diff --git a/fgpyo/io/__init__.py b/fgpyo/io/__init__.py
index cb47ed99..34c95139 100644
--- a/fgpyo/io/__init__.py
+++ b/fgpyo/io/__init__.py
@@ -128,7 +128,7 @@ def assert_path_is_writeable(path: Path, parent_must_exist: bool = True) -> None
 
 
 def to_reader(path: Path) -> Union[io.TextIOWrapper, TextIO, IO[Any]]:
-    """Opens a Path for reading.
+    """Opens a Path for reading and based on extension uses open() or gzip.open()
 
     Args:
         path: Path to read from
@@ -144,21 +144,26 @@ def to_reader(path: Path) -> Union[io.TextIOWrapper, TextIO, IO[Any]]:
         return path.open(mode="r")
 
 
-def to_writer(path: Path) -> Union[IO[Any], io.TextIOWrapper]:
-    """Opens a Path for reading and based on extension uses open() or gzip.open()
+def to_writer(path: Path, append: bool = False) -> Union[IO[Any], io.TextIOWrapper]:
+    """Opens a Path for writing (or appending) and based on extension uses open() or gzip.open()
 
     Args:
-        path: Path to write to
+        path: Path to write (or append) to
+        append: True to append to the file, False (the default) to overwrite it
 
     Example:
         writer = fio.to_writer(path = Path("writer.txt"))
         writer.write(f'{something}\n')
         writer.close()
     """
+    mode_prefix = "a" if append else "w"
+
     if path.suffix in COMPRESSED_FILE_EXTENSIONS:
-        return io.TextIOWrapper(cast(IO[bytes], gzip.open(path, mode="wb")), encoding="utf-8")
+        return io.TextIOWrapper(
+            cast(IO[bytes], gzip.open(path, mode=mode_prefix + "b")), encoding="utf-8"
+        )
     else:
-        return path.open(mode="w")
+        return path.open(mode=mode_prefix)
 
 
 def read_lines(path: Path, strip: bool = False) -> Iterator[str]:
@@ -183,19 +188,20 @@ def read_lines(path: Path, strip: bool = False) -> Iterator[str]:
                 yield line.rstrip("\r\n")
 
 
-def write_lines(path: Path, lines_to_write: Iterable[Any]) -> None:
-    """Writes a file with one line per item in provided iterable
+def write_lines(path: Path, lines_to_write: Iterable[Any], append: bool = False) -> None:
+    """Writes (or appends) a file with one line per item in provided iterable
 
     Args:
-        path: Path to write to
-        lines_to_write: items to write to file
+        path: Path to write (or append) to
+        lines_to_write: items to write (or append) to file
+        append: True to append to the file, False (the default) to overwrite it
 
     Example:
         lines: List[Any] = ["things to write", 100]
         path_to_write_to: Path = Path("file_to_write_to.txt")
         fio.write_lines(path = path_to_write_to, lines_to_write = lines)
     """
-    with to_writer(path=path) as writer:
+    with to_writer(path=path, append=append) as writer:
         for line in lines_to_write:
             writer.write(str(line))
             writer.write("\n")
diff --git a/fgpyo/util/metric.py b/fgpyo/util/metric.py
index 2bff416f..a4681994 100644
--- a/fgpyo/util/metric.py
+++ b/fgpyo/util/metric.py
@@ -305,3 +305,35 @@ def format_value(cls, value: Any) -> str:
     def to_list(cls, value: str) -> List[Any]:
         """Returns a list value split on comma delimeter."""
         return [] if value == "" else value.split(",")
+
+    @staticmethod
+    def fast_concat(*inputs: Path, output: Path) -> None:
+        """Concatenates metric files that share a header line without parsing records.
+
+        The header is written to the output once, followed by the non-header lines of each
+        input, in the order the inputs were given.
+
+        Args:
+            inputs: Paths of the metric files to concatenate
+            output: Path of the concatenated file to write
+
+        Raises:
+            ValueError: if no inputs are given, an input has no header line, or the headers
+                of the inputs differ
+        """
+        if len(inputs) == 0:
+            raise ValueError("No inputs provided")
+
+        headers = [next(io.read_lines(input_path), None) for input_path in inputs]
+        if None in headers:
+            raise ValueError("Input has no header line")
+        # Raise instead of asserting so the check is not stripped when running with -O
+        if len(set(headers)) != 1:
+            raise ValueError("Input headers do not match")
+        io.write_lines(path=output, lines_to_write=[headers[0]])
+
+        for input_path in inputs:
+            # Stream the records instead of loading the whole input into memory
+            lines = io.read_lines(input_path)
+            next(lines, None)  # skip the header, which has already been written
+            io.write_lines(path=output, lines_to_write=lines, append=True)
diff --git a/fgpyo/util/tests/test_metric.py b/fgpyo/util/tests/test_metric.py
index d011873c..5f2e2eaa 100644
--- a/fgpyo/util/tests/test_metric.py
+++ b/fgpyo/util/tests/test_metric.py
@@ -319,3 +319,26 @@ def test_metric_list_parse_with_none() -> None:
     assert ListPerson.parse(fields=["Max,Sally", ","]) == ListPerson(
         name=["Max", "Sally"], age=[None, None]
     )
+
+
+def test_metrics_fast_concat(tmpdir: TmpDir) -> None:
+    path_input = []
+    path_input.append(Path(tmpdir) / "metrics_1.txt")
+    path_input.append(Path(tmpdir) / "metrics_2.txt")
+    path_input.append(Path(tmpdir) / "metrics_3.txt")
+    path_output: Path = Path(tmpdir) / "metrics_concat.txt"
+
+    DummyMetric.write(path_input[0], DUMMY_METRICS[0])
+    DummyMetric.write(path_input[1], DUMMY_METRICS[1])
+    DummyMetric.write(path_input[2], DUMMY_METRICS[2])
+
+    Metric.fast_concat(*path_input, output=path_output)
+    metrics: List[DummyMetric] = list(DummyMetric.read(path=path_output))
+
+    assert len(metrics) == len(DUMMY_METRICS)
+    assert metrics[0].header() == DummyMetric.header()
+    assert metrics[1].header() == DummyMetric.header()
+    assert metrics[2].header() == DummyMetric.header()
+    assert metrics[0] == DUMMY_METRICS[0]
+    assert metrics[1] == DUMMY_METRICS[1]
+    assert metrics[2] == DUMMY_METRICS[2]