diff --git a/src/analysis/pmd.cpp b/src/analysis/pmd.cpp index 4ff6609..d18f3b1 100644 --- a/src/analysis/pmd.cpp +++ b/src/analysis/pmd.cpp @@ -577,13 +577,11 @@ build_domains(const vector &bins, //Modified to take multiple replicates template static void -separate_regions(const bool VERBOSE, const size_t desert_size, +separate_regions(const size_t desert_size, vector > &bins, - vector > &meth, vector > &reads, + vector > &meth, vector> &reads, vector &reset_points, vector &dists_btwn_bins) { - if (VERBOSE) - cerr << "[SEPARATING BY CPG DESERT]" << endl; const size_t n_replicates = bins.size(); // eliminate the zero-read cpg sites if no coverage in any rep @@ -623,12 +621,8 @@ separate_regions(const bool VERBOSE, const size_t desert_size, reset_points.push_back(i); prev_cpg = bins[0][i].get_start(); } - assert(reset_points.size() > 0); + assert(std::size(reset_points) > 0); reset_points.push_back(bins[0].size()); - if (VERBOSE) - cerr << "BINS RETAINED: " << bins[0].size() << endl - << "NUMBER OF DISTANCES BETWEEN: " << dists_btwn_bins.size() << endl - << "DESERTS REMOVED: " << reset_points.size() - 2 << endl << endl; } @@ -675,7 +669,7 @@ assign_p_values(const vector &random_scores, static void -read_params_file(const bool VERBOSE, +read_params_file(const bool verbose, const string ¶ms_file, double &fg_alpha, double &fg_beta, @@ -702,7 +696,7 @@ read_params_file(const bool VERBOSE, >> jnk >> end_trans[1] >> jnk >> fdr_cutoff; - if (VERBOSE) + if (verbose) cerr << "Read in params from " << params_file << endl << "FG_ALPHA\t" << fg_alpha << endl << "FG_BETA\t" << fg_beta << endl @@ -1134,10 +1128,22 @@ add_missing_bins(const vector &all_bins, } +static void +write_empty_summary(const string &summary_file) { + if (!summary_file.empty()) { + ofstream summary_out(summary_file); + if (!summary_out) + throw runtime_error("failed to open: " + summary_file); + summary_out << pmd_summary({}).tostring() << endl; + } +} + + int main_pmd(int argc, const char **argv) { try { + static const size_t min_observations_for_inference = 100; static const size_t max_bin_size = 500000; static const size_t min_bin_size = 1000; size_t resolution = 500; @@ -1152,7 +1158,7 @@ main_pmd(int argc, const char **argv) { size_t bin_size = 1000; size_t max_iterations = 10; // run mode flags - bool VERBOSE = false; + bool verbose = false; bool ARRAY_MODE = false; bool fixed_bin_size = false; @@ -1187,7 +1193,7 @@ main_pmd(int argc, const char **argv) { opt_parse.add_opt("arraymode",'a', "All samples are array", false, ARRAY_MODE); opt_parse.add_opt("itr", 'i', "max iterations", false, max_iterations); - opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); + opt_parse.add_opt("verbose", 'v', "print more run info", false, verbose); opt_parse.add_opt("debug", 'D', "print more run info", false, DEBUG); opt_parse.add_opt("params-in", 'P', "HMM parameter files for " "individual methylomes (separated with comma)", @@ -1246,7 +1252,7 @@ main_pmd(int argc, const char **argv) { // Sanity checks input file format and dynamically selects bin // size from WGBS samples. if (!fixed_bin_size && !ARRAY_MODE) { - if (VERBOSE) + if (verbose) cerr << "[DYNAMICALLY SELECTING BIN SIZE]" << endl; double confidence_interval = 0.80; double prop_accept = 0.80; @@ -1278,18 +1284,12 @@ main_pmd(int argc, const char **argv) { if (insufficient_data) { // ADS: first check for insufficient data; another is needed if // fixed bin size is used - if (VERBOSE) - cerr << "EXITING: INSUFFICIENT DATA" << endl; - if (!summary_file.empty()) { - ofstream summary_out(summary_file); - if (!summary_out) - throw runtime_error("failed to open: " + summary_file); - summary_out << pmd_summary({}).tostring() << endl; - } + if (verbose) cerr << "EXITING: INSUFFICIENT DATA" << endl; + if (!summary_file.empty()) write_empty_summary(summary_file); return EXIT_SUCCESS; } - if (VERBOSE) + if (verbose) cerr << "[READING IN AT BIN SIZE " << bin_size << "]" << endl; // separate the regions by chrom and by desert @@ -1299,7 +1299,7 @@ main_pmd(int argc, const char **argv) { vector array_status; for (size_t i = 0; i < n_replicates && !insufficient_data; ++i) { - if (VERBOSE) + if (verbose) cerr << "[READING CPGS AND METH PROPS] from " << cpgs_file[i] << endl; load_bins(bin_size, cpgs_file[i], bins[i], meth[i], @@ -1308,7 +1308,7 @@ main_pmd(int argc, const char **argv) { accumulate(begin(reads[i]), end(reads[i]), 0); if (total_observations <= num_lim::min()) insufficient_data = true; - if (VERBOSE) + if (verbose) cerr << "TOTAL BINS: " << bins[i].size() << endl << "MEAN COVERAGE: " << total_observations/std::max(1ul, reads[i].size()) @@ -1316,16 +1316,10 @@ main_pmd(int argc, const char **argv) { } if (insufficient_data) { - // ADS: first check for insufficient data; another is needed if - // fixed bin size is used - if (VERBOSE) - cerr << "EXITING: INSUFFICIENT DATA" << endl; - if (!summary_file.empty()) { - ofstream summary_out(summary_file); - if (!summary_out) - throw runtime_error("failed to open: " + summary_file); - summary_out << pmd_summary({}).tostring() << endl; - } + // ADS: second check for insufficient data; another is needed if + // filtered number of bins is too few + if (verbose) cerr << "EXITING: INSUFFICIENT DATA" << endl; + if (!summary_file.empty()) write_empty_summary(summary_file); return EXIT_SUCCESS; } @@ -1347,14 +1341,31 @@ main_pmd(int argc, const char **argv) { // separate regions by chrom and desert; eliminate isolated Bins vector reset_points; vector dists_btwn_bins; - separate_regions(VERBOSE, desert_size, bins, meth, reads, + if (verbose) + cerr << "[separating by CpG desert]" << endl; + separate_regions(desert_size, bins, meth, reads, reset_points, dists_btwn_bins); + if (size(bins[0]) < min_observations_for_inference) + insufficient_data = true; + + if (insufficient_data) { + // ADS: final check for sufficient data failed; too few bins + // after filtering + if (verbose) cerr << "EXITING: INSUFFICIENT DATA" << endl; + if (!summary_file.empty()) write_empty_summary(summary_file); + return EXIT_SUCCESS; + } + + if (verbose) + cerr << "bins retained: " << std::size(bins[0]) << endl + << "number of distances between: " << std::size(dists_btwn_bins) << endl + << "deserts removed: " << size(reset_points) - 2 << endl; /****************** Read in params *****************/ vector start_trans(2, 0.5), end_trans(2, 1e-10); vector > trans(2, vector(2, 0.01)); trans[0][0] = trans[1][1] = 0.99; - const TwoStateHMM hmm(min_prob, tolerance, max_iterations, VERBOSE, DEBUG); + const TwoStateHMM hmm(min_prob, tolerance, max_iterations, verbose, DEBUG); vector reps_fg_alpha(n_replicates, 0.05); vector reps_fg_beta(n_replicates, 0.95); vector reps_bg_alpha(n_replicates, 0.95); @@ -1363,8 +1374,8 @@ main_pmd(int argc, const char **argv) { if (!params_in_file.empty()) { // read parameters files - for (size_t i= 0; i< n_replicates; ++i) - read_params_file(VERBOSE, params_in_file[i], reps_fg_alpha[i], + for (size_t i= 0; i < n_replicates; ++i) + read_params_file(verbose, params_in_file[i], reps_fg_alpha[i], reps_fg_beta[i], reps_bg_alpha[i], reps_bg_beta[i], start_trans, trans, end_trans, score_cutoff_for_fdr); } @@ -1416,7 +1427,7 @@ main_pmd(int argc, const char **argv) { vector domain_scores; get_domain_scores(classes, meth, reset_points, domain_scores); - if (VERBOSE) + if (verbose) cerr << "[RANDOMIZING SCORES FOR FDR]" << endl; vector random_scores; diff --git a/src/common/TwoStateHMM_PMD.cpp b/src/common/TwoStateHMM_PMD.cpp index 3297a35..189ba59 100644 --- a/src/common/TwoStateHMM_PMD.cpp +++ b/src/common/TwoStateHMM_PMD.cpp @@ -29,6 +29,9 @@ using std::string; using std::setprecision; using std::isfinite; using std::unique_ptr; +using std::make_pair; + +template using num_lim = std::numeric_limits; inline double TwoStateHMM::log_sum_log(const double p, const double q) const { @@ -43,16 +46,14 @@ TwoStateHMM::log_sum_log(const double p, const double q) const { double TwoStateHMM::log_sum_log_vec(const vector &vals, size_t limit) const { const vector::const_iterator x = - std::max_element(vals.begin(), vals.begin() + limit); + max_element(cbegin(vals), cbegin(vals) + limit); const double max_val = *x; - const size_t max_idx = x - vals.begin(); + const size_t max_idx = std::distance(cbegin(vals), x); double sum = 1.0; for (size_t i = 0; i < limit; ++i) { if (i != max_idx) { sum += exp(vals[i] - max_val); -#ifdef DEBUG assert(isfinite(sum)); -#endif } } return max_val + log(sum); @@ -60,8 +61,8 @@ TwoStateHMM::log_sum_log_vec(const vector &vals, size_t limit) const { void -TwoStateHMM::estimate_emissions(const vector > &f, - const vector > &b, +TwoStateHMM::estimate_emissions(const vector> &f, + const vector> &b, vector &fg_probs, vector &bg_probs) const { for (size_t i = 0; i < b.size(); ++i) { @@ -76,10 +77,10 @@ TwoStateHMM::estimate_emissions(const vector > &f, void TwoStateHMM::TransitionPosteriors_rep( - const vector > > &values, + const vector> > &values, const vector &reset_points, const vector &start_trans, - const vector > &trans, + const vector> &trans, const vector &end_trans, const vector &fg_alpha, const vector &fg_beta, @@ -89,8 +90,8 @@ TwoStateHMM::TransitionPosteriors_rep( const size_t transition, vector &llr_scores) const { - vector > fg_distro; - vector > bg_distro; + vector> fg_distro; + vector> bg_distro; size_t NREP = values.size(); for (size_t i = 0; i < NREP; ++i) { @@ -123,15 +124,15 @@ TwoStateHMM::TransitionPosteriors_rep( //////////////////////////////////////////////////////////////////////////////// double TwoStateHMM::forward_algorithm_rep( - const vector > > &vals, + const vector> > &vals, const size_t start, const size_t end, const double lp_sf, const double lp_sb, const double lp_ff, const double lp_fb, const double lp_ft, const double lp_bf, const double lp_bb, const double lp_bt, - const vector > &fg_distro, - const vector > &bg_distro, - vector > &f) const { + const vector> &fg_distro, + const vector> &bg_distro, + vector> &f) const { size_t NREP = vals.size(); f[start].first = lp_sf; f[start].second = lp_sb; @@ -160,15 +161,15 @@ TwoStateHMM::forward_algorithm_rep( double -TwoStateHMM::backward_algorithm_rep(const vector > > &vals, +TwoStateHMM::backward_algorithm_rep(const vector> > &vals, const size_t start, const size_t end, const double lp_sf, const double lp_sb, const double lp_ff, const double lp_fb, const double lp_ft, const double lp_bf, const double lp_bb, const double lp_bt, - const vector > &fg_distro, - const vector > &bg_distro, - vector > &b) const { + const vector> &fg_distro, + const vector> &bg_distro, + vector> &b) const { size_t NREP = vals.size(); b[end - 1].first = lp_ft; @@ -206,19 +207,19 @@ TwoStateHMM::backward_algorithm_rep(const vector > > //ff_vals: ksi_t(1,1), where 1 is the S_1, i.e. posterior prob of transitions void TwoStateHMM::estimate_transitions_rep( - const vector > > &vals, - const size_t start, const size_t end, - const vector > &f, - const vector > &b, - const double total, - const vector > &fg_distro, - const vector > &bg_distro, - const double lp_ff, const double lp_fb, - const double lp_bf, const double lp_bb, - vector &ff_vals, - vector &fb_vals, - vector &bf_vals, - vector &bb_vals) const { + const vector> > &vals, + const size_t start, const size_t end, + const vector> &f, + const vector> &b, + const double total, + const vector> &fg_distro, + const vector> &bg_distro, + const double lp_ff, const double lp_fb, + const double lp_bf, const double lp_bb, + vector &ff_vals, + vector &fb_vals, + vector &bf_vals, + vector &bb_vals) const { size_t NREP = vals.size(); for (size_t i = start + 1; i < end; ++i) { const size_t k = i - 1; @@ -244,17 +245,17 @@ TwoStateHMM::estimate_transitions_rep( double -TwoStateHMM::single_iteration_rep(const vector > > &values, - const vector > &vals_a_reps, - const vector > &vals_b_reps, +TwoStateHMM::single_iteration_rep(const vector> > &values, + const vector> &vals_a_reps, + const vector> &vals_b_reps, const vector &reset_points, - vector > &forward, - vector > &backward, + vector> &forward, + vector> &backward, double &p_sf, double &p_sb, double &p_ff, double &p_fb, double &p_ft, double &p_bf, double &p_bb, double &p_bt, - vector > &fg_distro, - vector > &bg_distro) const { + vector> &fg_distro, + vector> &bg_distro) const { size_t NREP= values.size(); vector log_fg_expected; @@ -397,17 +398,17 @@ TwoStateHMM::single_iteration_rep(const vector > > & double TwoStateHMM::BaumWelchTraining_rep( - const vector > > &values, + const vector> > &values, const vector &reset_points, vector &start_trans, - vector > &trans, + vector> &trans, vector &end_trans, vector &fg_alpha, vector &fg_beta, vector &bg_alpha, vector &bg_beta, const vector &array_status) const { - vector > fg_distro; - vector > bg_distro; + vector> fg_distro; + vector> bg_distro; size_t NREP = values.size(); for (size_t i = 0; i < NREP; ++i) { @@ -447,20 +448,20 @@ TwoStateHMM::BaumWelchTraining_rep( double TwoStateHMM::BaumWelchTraining_rep( - const vector > > &values, + const vector> > &values, const vector &reset_points, double &p_sf, double &p_sb, double &p_ff, double &p_fb, double &p_ft, double &p_bf, double &p_bb, double &p_bt, - vector > &fg_distro, - vector > &bg_distro, + vector> &fg_distro, + vector> &bg_distro, const vector &array_status) const { size_t NREP = values.size(); - vector > forward(values[0].size(), - pair(0, 0)); - vector > backward(values[0].size(), - pair(0, 0)); + vector> forward(values[0].size(), + make_pair(0.0, 0.0)); + vector> backward(values[0].size(), + make_pair(0.0, 0.0)); if (VERBOSE) { cerr << "MAX_ITER=" << max_iterations << "\tTOLERANCE=" @@ -472,22 +473,20 @@ TwoStateHMM::BaumWelchTraining_rep( << endl; } - double prev_total = -std::numeric_limits::max(); + double prev_total = -num_lim::max(); - vector > vals_a_reps(NREP, - vector(values[0].size(), 0)); - vector > vals_b_reps(NREP, - vector(values[0].size(), 0)); + vector> vals_a_reps(NREP, vector(values[0].size(), 0)); + vector> vals_b_reps(NREP, vector(values[0].size(), 0)); for (size_t r = 0; r < NREP; ++r) { - if(array_status[r]) { + if (array_status[r]) { for (size_t i = 0; i < values[0].size(); ++i) { - if(values[r][i].second > 0) { + if (values[r][i].second > 0) { vals_a_reps[r][i] = - log(std::min(std::max(values[r][i].first/values[r][i].second, - 1e-2), 1.0 - 1e-2)); + log(min(max(values[r][i].first/values[r][i].second, + 1e-2), 1.0 - 1e-2)); vals_b_reps[r][i] = - log(1 - std::min(std::max(values[r][i].first/values[r][i].second, - 1e-2), 1.0 - 1e-2)); + log(1.0 - min(max(values[r][i].first/values[r][i].second, + 1e-2), 1.0 - 1e-2)); } } } @@ -495,11 +494,13 @@ TwoStateHMM::BaumWelchTraining_rep( for (size_t i = 0; i < values[0].size(); ++i) { if (values[r][i].first + values[r][i].second >= 1) { vals_a_reps[r][i] = - log(std::min(std::max(values[r][i].first/(values[r][i].first + - values[r][i].second), 1e-2), 1.0 - 1e-2)); + log(min(max(values[r][i].first/ + (values[r][i].first + + values[r][i].second), 1e-2), 1.0 - 1e-2)); vals_b_reps[r][i] = - log(1 - std::min(std::max(values[r][i].first/(values[r][i].first + - values[r][i].second), 1e-2), 1.0 - 1e-2)); + log(1 - min(max(values[r][i].first/ + (values[r][i].first + + values[r][i].second), 1e-2), 1.0 - 1e-2)); } } } @@ -535,8 +536,8 @@ TwoStateHMM::BaumWelchTraining_rep( << "B_E\t" << p_bt_est << endl << endl; for (size_t r = 0; r < NREP; ++r) - cerr << "Emission parameters for Rep" << r+1 - << setw(14) << fg_distro[r]->getalpha() << setw(14) + cerr << "Emission parameters for Rep" << r+1 + << setw(14) << fg_distro[r]->getalpha() << setw(14) << fg_distro[r]->getbeta() << setw(14) << bg_distro[r]->getalpha() << setw(14) << bg_distro[r]->getbeta() << endl; } @@ -574,10 +575,10 @@ TwoStateHMM::BaumWelchTraining_rep( void -TwoStateHMM::PosteriorScores_rep(const vector > > &values, +TwoStateHMM::PosteriorScores_rep(const vector> > &values, const vector &reset_points, const vector &start_trans, - const vector > &trans, + const vector> &trans, const vector &end_trans, const vector fg_alpha, const vector fg_beta, @@ -587,8 +588,8 @@ TwoStateHMM::PosteriorScores_rep(const vector > > &v vector &llr_scores, const vector &array_status) const { - vector > fg_distro; - vector > bg_distro; + vector> fg_distro; + vector> bg_distro; size_t NREP = values.size(); for (size_t i = 0; i < NREP; ++i) { @@ -639,10 +640,10 @@ TwoStateHMM::PosteriorScores_rep( isfinite(lp_ff) && isfinite(lp_fb) && isfinite(lp_ft) && isfinite(lp_bf) && isfinite(lp_bb) && isfinite(lp_bt)); - vector > forward(values[0].size(), - pair(0, 0)); - vector > backward(values[0].size(), - pair(0, 0)); + vector> forward(values[0].size(), + make_pair(0.0, 0.0)); + vector> backward(values[0].size(), + make_pair(0.0, 0.0)); for (size_t i = 0; i < reset_points.size() - 1; ++i) { const double score = forward_algorithm_rep(values, @@ -682,10 +683,10 @@ TwoStateHMM::PosteriorScores_rep( } double -TwoStateHMM::PosteriorDecoding_rep(const vector > > &values, +TwoStateHMM::PosteriorDecoding_rep(const vector> > &values, const vector &reset_points, const vector &start_trans, - const vector > &trans, + const vector> &trans, const vector &end_trans, const vector fg_alpha, const vector fg_beta, const vector bg_alpha, const vector bg_beta, @@ -693,8 +694,8 @@ TwoStateHMM::PosteriorDecoding_rep(const vector > > vector &llr_scores, const vector &array_status) const { - vector > fg_distro; - vector > bg_distro; + vector> fg_distro; + vector> bg_distro; size_t NREP = values.size(); for (size_t i = 0; i < NREP; ++i) { @@ -748,10 +749,10 @@ TwoStateHMM::TransitionPosteriors_rep( isfinite(lp_ff) && isfinite(lp_fb) && isfinite(lp_ft) && isfinite(lp_bf) && isfinite(lp_bb) && isfinite(lp_bt)); - vector > forward(values[0].size(), - pair(0, 0)); - vector > backward(values[0].size(), - pair(0, 0)); + vector> forward(values[0].size(), + make_pair(0.0, 0.0)); + vector> backward(values[0].size(), + make_pair(0.0, 0.0)); for (size_t i = 0; i < reset_points.size() - 1; ++i) { const double score = forward_algorithm_rep(values, reset_points[i], @@ -813,13 +814,13 @@ TwoStateHMM::TransitionPosteriors_rep( double -TwoStateHMM::PosteriorDecoding_rep(const vector > > &values, +TwoStateHMM::PosteriorDecoding_rep(const vector> > &values, const vector &reset_points, double p_sf, double p_sb, double p_ff, double p_fb, double p_ft, double p_bf, double p_bb, double p_bt, - const vector > &fg_distro, - const vector > &bg_distro, + const vector> &fg_distro, + const vector> &bg_distro, vector &classes, vector &llr_scores) const { double total_score = 0; @@ -837,10 +838,8 @@ TwoStateHMM::PosteriorDecoding_rep(const vector > > isfinite(lp_ff) && isfinite(lp_fb) && isfinite(lp_ft) && isfinite(lp_bf) && isfinite(lp_bb) && isfinite(lp_bt)); - vector > forward(values[0].size(), - pair(0, 0)); - vector > backward(values[0].size(), - pair(0, 0)); + vector> forward(values[0].size(), make_pair(0.0, 0.0)); + vector> backward(values[0].size(), make_pair(0.0, 0.0)); for (size_t i = 0; i < reset_points.size() - 1; ++i) { const double score = forward_algorithm_rep(values, reset_points[i],