From a1b609b8ff11e85fb70e416bc3964aeb39208928 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Tue, 19 Nov 2024 16:16:00 -0800 Subject: [PATCH 1/2] src/utils/xcounts.cpp: Adding a check for chrom order being consistent between the header or reference and the internals of the counts file --- src/utils/xcounts.cpp | 96 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 82 insertions(+), 14 deletions(-) diff --git a/src/utils/xcounts.cpp b/src/utils/xcounts.cpp index 13e7dc3..0cfd982 100644 --- a/src/utils/xcounts.cpp +++ b/src/utils/xcounts.cpp @@ -23,6 +23,7 @@ #include #include #include +#include // std::underlying_type_t #include // from smithlab_cpp @@ -45,6 +46,49 @@ using std::vector; using bamxx::bgzf_file; +enum class xcounts_err { + // clang-format off + ok = 0, + open_failure = 1, + header_failure = 2, + chromosome_not_found = 3, + inconsistent_chromosome_order = 4, + incorrect_chromosome_size = 5, + failed_to_write_file = 6, + failed_to_parse_site = 7, + // clang-format on +}; + +struct xcounts_err_cat : std::error_category { + auto name() const noexcept -> const char * override { + return "xcounts error"; + } + auto message(const int condition) const -> std::string override { + using std::string_literals::operator""s; + switch (condition) { + case 0: return "ok"s; + case 1: return "failed to open methylome file"s; + case 2: return "failed to parse xcounts header"s; + case 3: return "failed to find chromosome in xcounts header"s; + case 4: return "inconsistent chromosome order"s; + case 5: return "incorrect chromosome size"s; + case 6: return "failed to write to file"s; + case 7: return "failed to parse site"s; + } + std::abort(); // unreacheable + } +}; + +template <> +struct std::is_error_code_enum : public std::true_type {}; + +std::error_code +make_error_code(xcounts_err e) { + static auto category = xcounts_err_cat{}; + return std::error_code(static_cast>(e), + category); +} + template static inline uint32_t fill_output_buffer(const uint32_t offset, const MSite &s, T &buf) { @@ -137,10 +181,12 @@ main_xcounts(int argc, const char **argv) { tpool.set_io(out); } + std::unordered_map chrom_order; if (!header_file.empty()) - write_counts_header_from_file(header_file, out); + chrom_order = write_counts_header_from_file(header_file, out); else if (!genome_file.empty()) - write_counts_header_from_chrom_sizes(chrom_names, chrom_sizes, out); + chrom_order = + write_counts_header_from_chrom_sizes(chrom_names, chrom_sizes, out); // use the kstring_t type to more directly use the BGZF file kstring_t line{0, 0, nullptr}; @@ -152,11 +198,14 @@ main_xcounts(int argc, const char **argv) { uint32_t offset = 0; string prev_chrom; - bool status_ok = true; bool found_header = (!genome_file.empty() || !header_file.empty()); + std::error_code ec{}; + + uint32_t chrom_counter = 0; + MSite site; - while (status_ok && bamxx::getline(in, line)) { + while (ec == std::errc{} && bamxx::getline(in, line)) { if (is_counts_header_line(line.s)) { if (!genome_file.empty() || !header_file.empty()) continue; @@ -166,34 +215,53 @@ main_xcounts(int argc, const char **argv) { continue; } - status_ok = site.initialize(line.s, line.s + line.l); - if (!status_ok || !found_header) + if (!site.initialize(line.s, line.s + line.l)) { + ec = xcounts_err::failed_to_parse_site; + break; + } + if (!found_header) { + ec = xcounts_err::header_failure; break; + } if (site.chrom != prev_chrom) { if (verbose) cerr << "processing: " << site.chrom << endl; + + if (!chrom_order.empty()) { + const auto expected_chrom_counter = chrom_order.find(site.chrom); + if (expected_chrom_counter == std::cend(chrom_order)) { + ec = xcounts_err::chromosome_not_found; + break; + } + if (expected_chrom_counter->second != chrom_counter) { + ec = xcounts_err::inconsistent_chromosome_order; + break; + } + } + + chrom_counter++; + prev_chrom = site.chrom; offset = 0; site.chrom += '\n'; const int64_t sz = size(site.chrom); - status_ok = bgzf_write(out.f, site.chrom.data(), sz) == sz; + if (bgzf_write(out.f, site.chrom.data(), sz) != sz) + ec = xcounts_err::failed_to_write_file; } if (site.n_reads > 0) { const int64_t sz = fill_output_buffer(offset, site, buf); - status_ok = bgzf_write(out.f, buf.data(), sz) == sz; + if (bgzf_write(out.f, buf.data(), sz) != sz) + ec = xcounts_err::failed_to_write_file; offset = site.pos; } } ks_free(&line); - if (!status_ok) { - cerr << "failed converting " << filename << " to " << outfile << endl; - return EXIT_FAILURE; - } - if (!found_header) { - cerr << "no header provided or found" << endl; + if (ec) { + cerr << "failed converting " << filename << " to " << outfile << endl + << ec << endl; return EXIT_FAILURE; } } From cb04f309c9784266db67466dbde2984e6507f50c Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Tue, 19 Nov 2024 16:16:48 -0800 Subject: [PATCH 2/2] src/common/counts_header.chpp: returning an unordered map with chrom order from functions that write chrom order from a header or ref genome --- src/common/counts_header.cpp | 65 +++++++++++++++++++++++------------- src/common/counts_header.hpp | 15 +++++---- 2 files changed, 50 insertions(+), 30 deletions(-) diff --git a/src/common/counts_header.cpp b/src/common/counts_header.cpp index 6ac4ff0..523408e 100644 --- a/src/common/counts_header.cpp +++ b/src/common/counts_header.cpp @@ -18,13 +18,13 @@ #include "counts_header.hpp" -#include -#include -#include -#include #include #include +#include +#include #include +#include +#include #include "bam_record_utils.hpp" @@ -34,40 +34,54 @@ #include "bamxx.hpp" #include "dnmt_error.hpp" -using std::vector; using std::string; using std::to_string; +using std::unordered_map; +using std::vector; using bamxx::bgzf_file; -void +std::unordered_map write_counts_header_from_chrom_sizes(const vector &chrom_names, const vector &chrom_sizes, bgzf_file &out) { const auto version = "#DNMTOOLS " + string(VERSION) + "\n"; out.write(version.c_str()); + + std::unordered_map chrom_order; + std::uint32_t chrom_count = 0; + for (auto i = 0u; i < size(chrom_sizes); ++i) { const string tmp = "#" + chrom_names[i] + " " + to_string(chrom_sizes[i]) + "\n"; - out.write(tmp.c_str()); + out.write(tmp.data()); + chrom_order.emplace(chrom_names[i], chrom_count++); } out.write("#\n"); -} + return chrom_order; +} -void +std::unordered_map write_counts_header_from_file(const string &header_file, bgzf_file &out) { std::ifstream in(header_file); if (!in.is_open()) { - throw dnmt_error("failed to open header file: " + header_file); + throw dnmt_error("failed to open header file: " + header_file); } + + std::unordered_map chrom_order; + std::uint32_t chrom_count = 0; + string line; - while(getline(in, line)) { + while (getline(in, line)) { out.write(line + '\n'); + const auto name = line.substr(1, line.find(' ') - 1); + chrom_order.emplace(name, chrom_count++); } in.close(); -} + return chrom_order; +} bamxx::bgzf_file & skip_counts_header(bamxx::bgzf_file &in) { @@ -75,7 +89,8 @@ skip_counts_header(bamxx::bgzf_file &in) { // use the kstring_t type to more directly use the BGZF file kstring_t line{0, 0, nullptr}; const int ret = ks_resize(&line, 1024); - if (ret) return in; + if (ret) + return in; while (bamxx::getline(in, line) && line.s[0] == '#') { if (line.s[0] == '#' && line.l == 1) @@ -86,7 +101,6 @@ skip_counts_header(bamxx::bgzf_file &in) { return in; } - int get_chrom_sizes_for_counts_header(const uint32_t n_threads, const string &filename, @@ -96,7 +110,8 @@ get_chrom_sizes_for_counts_header(const uint32_t n_threads, bamxx::bam_tpool tpool(n_threads); bgzf_file in(filename, "r"); - if (!in) return -1; + if (!in) + return -1; if (n_threads > 1 && in.is_bgzf()) tpool.set_io(in); @@ -106,18 +121,22 @@ get_chrom_sizes_for_counts_header(const uint32_t n_threads, // use the kstring_t type to more directly use the BGZF file kstring_t line{0, 0, nullptr}; const int ret = ks_resize(&line, 1024); - if (ret) return -1; + if (ret) + return -1; uint64_t chrom_size = 0; while (getline(in, line)) { if (line.s[0] == '>') { - if (!chrom_names.empty()) chrom_sizes.push_back(chrom_size); + if (!chrom_names.empty()) + chrom_sizes.push_back(chrom_size); chrom_names.emplace_back(line.s + 1); chrom_size = 0; } - else chrom_size += line.l; + else + chrom_size += line.l; } - if (!chrom_names.empty()) chrom_sizes.push_back(chrom_size); + if (!chrom_names.empty()) + chrom_sizes.push_back(chrom_size); ks_free(&line); @@ -126,7 +145,6 @@ get_chrom_sizes_for_counts_header(const uint32_t n_threads, return 0; } - void write_counts_header_from_bam_header(const bamxx::bam_header &hdr, bgzf_file &out) { @@ -142,7 +160,6 @@ write_counts_header_from_bam_header(const bamxx::bam_header &hdr, out.write("#\n"); } - bool write_counts_header_line(string line, bgzf_file &out) { line += '\n'; @@ -153,8 +170,10 @@ write_counts_header_line(string line, bgzf_file &out) { bool get_has_counts_header(const string &filename) { bgzf_file in(filename, "r"); - if (!in) return false; + if (!in) + return false; string line; - if (!getline(in, line)) return false; + if (!getline(in, line)) + return false; return line[0] == '#'; } diff --git a/src/common/counts_header.hpp b/src/common/counts_header.hpp index 520cab5..5f38007 100644 --- a/src/common/counts_header.hpp +++ b/src/common/counts_header.hpp @@ -19,18 +19,19 @@ #ifndef COUNTS_HEADER_HPP #define COUNTS_HEADER_HPP +#include #include +#include #include -#include #include "bamxx.hpp" -void -write_counts_header_from_chrom_sizes(const std::vector &chrom_names, - const std::vector &chrom_sizes, - bamxx::bgzf_file &out); +std::unordered_map +write_counts_header_from_chrom_sizes( + const std::vector &chrom_names, + const std::vector &chrom_sizes, bamxx::bgzf_file &out); -void +std::unordered_map write_counts_header_from_file(const std::string &header_file, bamxx::bgzf_file &out); @@ -60,7 +61,7 @@ is_counts_header_version_line(const std::string &line) { return line.compare(0, 9, version_line) == 0; } -template +template inline bool is_counts_header_line(T &line) { return line[0] == '#';