diff --git a/Correct.cpp b/Correct.cpp index c312f07..2a62bad 100644 --- a/Correct.cpp +++ b/Correct.cpp @@ -1773,7 +1773,7 @@ inline int move_gap_greedy(char* path, int path_i, int path_length, char* x, int { break; } - else + else ///path[path_i] = 1 || path[path_i] = 0, exchange path[path_i] with path[path_i-1] { if(path[path_i] == 1 && x[x_i] == y[y_i]) { @@ -3286,12 +3286,17 @@ inline void recalcate_window_advance(overlap_region_alloc* overlap_list, All_rea // fprintf(stderr, "[M::%s::j->%lld] utg%.6dl(%c), align_length::%u, overlap_length::%lld\n", __func__, // j, (int32_t)z->y_id + 1, "+-"[z->y_pos_strand], z->align_length, overlap_length); // } + // if(y_id == 4) { + // fprintf(stderr, "[M::%s::idx->%lld::] z::x_pos_s->%u, z::x_pos_e->%u, ovl->%lld, aln->%u\n", + // __func__, j, z->x_pos_s, z->x_pos_e, overlap_length, z->align_length); + // } ///debug_scan_cigar(&(overlap_list->list[j])); ///only calculate cigar for high quality overlaps if ((rref && (overlap_length*OVERLAP_THRESHOLD_FILTER <= z->align_length)) || (uref && (overlap_length*(1-e_rate) <= z->align_length))) { - a_nw = z->w_list.n; + a_nw = z->w_list.n; + // int64_t tt = 0; for (i = 0, is_srt = 1; i < a_nw; i++) { p = &(z->w_list.a[i]); ///check if the cigar of this window has been got @@ -3352,17 +3357,30 @@ inline void recalcate_window_advance(overlap_region_alloc* overlap_list, All_rea p->y_end = y_start + end_site - extra_begin; p->error = error; } + + // if(y_id == 4) { + // fprintf(stderr, "+[M::idx->%lld::] y_start->%d, y_end->%d, error->%d\n", + // j, p->y_start, p->y_end, p->error); + // } } else { p->y_end -= p->extra_begin; + // if(y_id == 4) { + // fprintf(stderr, "-[M::idx->%lld::] y_start->%d, y_end->%d, error->%d\n", + // j, p->y_start, p->y_end, p->error); + // } } - + // tt += p->error; if(is_srt && i > 0 && p->x_start < z->w_list.a[i-1].x_start) is_srt = 0; } if(!is_srt) radix_sort_window_list_xs_srt(z->w_list.a, z->w_list.a + z->w_list.n); error_rate = non_trim_error_rate(z, rref, uref, v_idx, dumy, g_read, e_rate, block_s); z->is_match = 0; + // if(y_id == 4) { + // fprintf(stderr, "[M::%s::idx->%lld::] z::x_pos_s->%u, z::x_pos_e->%u, ovl->%lld, aln->%u, error_rate->%f, e_rate_final->%f, tt->%ld\n", + // __func__, j, z->x_pos_s, z->x_pos_e, overlap_length, z->align_length, error_rate, e_rate_final, tt); + // } if (error_rate <= e_rate_final/**asm_opt.max_ov_diff_final**/) { overlap_list->mapped_overlaps_length += overlap_length; @@ -3405,7 +3423,330 @@ inline void recalcate_window_advance(overlap_region_alloc* overlap_list, All_rea ///debug_window_cigar(overlap_list, g_read, dumy, rref, 1, 1); } +uint32_t inline simi_pass(int64_t ol, int64_t aln_ol, All_reads *rref, const ul_idx_t *uref, double *e_rate) +{ + if(aln_ol == 0 || ol == 0) return 0; + if(rref) { + if((ol*OVERLAP_THRESHOLD_FILTER) <= aln_ol) return 1; + } else if(uref) { + if(e_rate) { + if((ol*((double)(((double)1.0)-(*e_rate)))) <= aln_ol) return 1; + } else { + if(((ol*MIN_UL_ALIN_RATE) <= aln_ol) && (aln_ol >= MIN_UL_ALIN_LEN)) return 1; + } + } + + return 0; +} + +inline uint32_t gen_backtrace(window_list *p, overlap_region *z, All_reads *rref, const ul_idx_t *uref, UC_Read* g_read, Correct_dumy* dumy, +int32_t y_strand, int32_t y_id) +{ + int64_t x_start, x_end, x_len, Window_Len, o_len; + int32_t threshold; long long y_start; + int real_y_start = 0, end_site, extra_begin, extra_end; + char *x_string, *y_string; unsigned int error; + ///there is no problem for x + x_start = p->x_start; x_end = p->x_end; x_len = x_end - x_start + 1; + threshold = p->error_threshold; Window_Len = x_len + (threshold << 1); + + + ///y_start is the real y_start + ///for the window with cigar, y_start has already reduced extra_begin + y_start = p->y_start; extra_begin = p->extra_begin; extra_end = p->extra_end; + o_len = Window_Len - extra_end - extra_begin; + if(rref) { + fill_subregion(dumy->overlap_region, y_start, o_len, y_strand, rref, y_id, extra_begin, extra_end); + } else { + fill_subregion_ul(dumy->overlap_region, y_start, o_len, y_strand, uref, y_id, extra_begin, extra_end); + } + x_string = g_read->seq + x_start; y_string = dumy->overlap_region; + + + ///note!!! need notification + end_site = Reserve_Banded_BPM_PATH(y_string, Window_Len, x_string, x_len, threshold, &error, &real_y_start, + &(dumy->path_length), dumy->matrix_bit, dumy->path, p->error, p->y_end - y_start); + // assert(error != (unsigned int)-1); + if(error != (unsigned int)-1) { + ///this condition is always wrong + ///in best case, real_y_start = threshold, end_site = Window_Len - threshold - 1 + if (end_site == Window_Len - 1 || real_y_start == 0) { + if(rref) { + if(fix_boundary(x_string, x_len, threshold, y_start, real_y_start, end_site, + extra_begin, extra_end, y_id, Window_Len, rref, dumy, y_strand, error, + &y_start, &real_y_start, &end_site, &extra_begin, &extra_end, &error)) { + p->error = error; p->extra_begin = extra_begin; p->extra_end = extra_end; + } + } else { + if(fix_ul_boundary(x_string, x_len, threshold, y_start, real_y_start, + end_site, extra_begin, extra_end, y_id, Window_Len, uref, dumy, + y_strand, error, &y_start, &real_y_start, &end_site, &extra_begin, + &extra_end, &error)) { + p->error = error; p->extra_begin = extra_begin; p->extra_end = extra_end; + } + } + } + + generate_cigar(dumy->path, dumy->path_length, p, &(z->w_list), &real_y_start, &end_site, &error, x_string, x_len, y_string); + + ///note!!! need notification + real_y_start = y_start + real_y_start - extra_begin; + p->y_start = real_y_start; + p->y_end = y_start + end_site - extra_begin; + p->error = error; + return 1; + } + p->error = -1; + return 0; +} + +inline uint32_t aln_wlst(overlap_region *z, All_reads *rref, const ul_idx_t *uref, UC_Read* g_read, Correct_dumy* dumy, +int32_t y_strand, int32_t y_id, int64_t x_start, int64_t x_end, long long y_start, int64_t block_s, double e_rate, int32_t is_cigar) +{ + int64_t x_len, Window_Len; window_list *p = NULL; long long o_len; + int32_t threshold; int real_y_start = 0, end_site, extra_begin, extra_end; + char *x_string, *y_string; unsigned int error; + x_len = x_end + 1 - x_start; + ///there are two potiential reasons for unmatched window: + ///1. this window has a large number of differences + ///2. DP does not start from the right offset + if(rref) { + threshold = double_error_threshold(get_init_err_thres(x_len, e_rate, block_s, THRESHOLD), x_len); + } else { + threshold = double_ul_error_threshold(get_init_err_thres(x_len, e_rate, block_s, THRESHOLD_MAX_SIZE), x_len); + } + Window_Len = x_len + (threshold << 1); + ///y_start might be less than 0 + if(!determine_overlap_region(threshold, y_start, y_id, Window_Len, (rref?(Get_READ_LENGTH((*rref), y_id)):(uref->ug->u.a[y_id].len)), + &extra_begin, &extra_end, &y_start, &o_len)) { + return 0; + } + + if(o_len + threshold < x_len) return 0; + if(rref) { + fill_subregion(dumy->overlap_region, y_start, o_len, y_strand, rref, y_id, extra_begin, extra_end); + } else { + fill_subregion_ul(dumy->overlap_region, y_start, o_len, y_strand, uref, y_id, extra_begin, extra_end); + } + x_string = g_read->seq + x_start; + y_string = dumy->overlap_region; + + if(is_cigar) { + ///note!!! need notification + end_site = Reserve_Banded_BPM_PATH(y_string, Window_Len, x_string, x_len, threshold, &error, &real_y_start, + &(dumy->path_length), dumy->matrix_bit, dumy->path, -1, -1); + } else { + ///note!!! need notification + end_site = Reserve_Banded_BPM(y_string, Window_Len, x_string, x_len, threshold, &error); + } + if(error!=(unsigned int)-1) { + if(is_cigar) { + ///this condition is always wrong + ///in best case, real_y_start = threshold, end_site = Window_Len - threshold - 1 + if (end_site == Window_Len - 1 || real_y_start == 0) { + if(rref) { + fix_boundary(x_string, x_len, threshold, y_start, real_y_start, + end_site, extra_begin, extra_end, y_id, Window_Len, rref, dumy, + y_strand, error, &y_start, &real_y_start, &end_site, &extra_begin, + &extra_end, &error); + } else { + fix_ul_boundary(x_string, x_len, threshold, y_start, real_y_start, + end_site, extra_begin, extra_end, y_id, Window_Len, uref, dumy, + y_strand, error, &y_start, &real_y_start, &end_site, &extra_begin, + &extra_end, &error); + } + } + } + + kv_pushp(window_list, z->w_list, &p); + p->x_start = x_start; p->x_end = x_end; ///must set x_start/x_end here + if(is_cigar) { + generate_cigar(dumy->path, dumy->path_length, p, &(z->w_list), &real_y_start, &end_site, &error, x_string, x_len, y_string); + } else { + p->cidx = p->clen = 0; + } + p->y_start = y_start + real_y_start;///difference + p->y_end = y_start + end_site; + p->error = error; + p->extra_begin = extra_begin; + p->extra_end = extra_end; + p->error_threshold = threshold; + z->align_length += x_len; + + return 1; + } + return 0; +} + +inline void refine_ed_aln(overlap_region_alloc* overlap_list, All_reads *rref, const ul_idx_t *uref, + UC_Read* g_read, Correct_dumy* dumy, UC_Read* overlap_read, kvec_t_u64_warp* v_idx, int64_t block_s, double e_rate, double e_rate_final) +{ + int64_t j, k, i, on, y_id, y_readLen, x_start, x_end, x_len, total_y_start, total_y_end; + int32_t y_strand, real_y_start; + int64_t nw, a_nw, w_id, w_s, w_e, is_srt, mm_we, mm_ws, mm_aln, ovl; + double error_rate; uint64_t *w_idx; overlap_region *z; window_list *p = NULL; + + overlap_list->mapped_overlaps_length = 0; on = overlap_list->length; + for (j = 0; j < on; ++j) { + z = &(overlap_list->list[j]); z->is_match = 0; is_srt = 1; + if(z->w_list.n == 0) continue;///no alignment + nw = get_num_wins(z->x_pos_s, z->x_pos_e+1, block_s); a_nw = z->w_list.n; + kv_resize(uint64_t, v_idx->a, (uint64_t)nw); + memset(v_idx->a.a, -1, sizeof((*v_idx->a.a))*nw); w_idx = v_idx->a.a; + for (i = 0; i < a_nw; i++) { ///w_idx[] == (uint64_t) if unmatched + assert(z->w_list.a[i].y_end != -1); + w_id = get_win_id_by_s(z, z->w_list.a[i].x_start, block_s, NULL); + w_idx[w_id] = i; + } + + y_id = z->y_id; y_strand = z->y_pos_strand; + ovl = z->x_pos_e+1-z->x_pos_s; mm_we = z->x_pos_s; mm_aln = 0; + y_readLen = (rref?(Get_READ_LENGTH((*rref), y_id)):(uref->ug->u.a[y_id].len)); + for (i = a_nw-1; i >= 0; i--) { //utilize the the end pos of pre-window in forward + w_id = get_win_id_by_s(z, z->w_list.a[i].x_start, block_s, &w_e); + assert(z->w_list.a[i].x_end == w_e); + if(w_e > mm_we) mm_we = w_e; + ///in most cases, extra_begin = 0 + total_y_start = z->w_list.a[i].y_end + 1 - z->w_list.a[i].extra_begin; + for (k = w_id + 1; k < nw && total_y_start < y_readLen; k++) { + if(w_idx[k] != (uint64_t)-1) break; + w_s = w_e + 1; + w_id = get_win_id_by_s(z, w_s, block_s, &w_e); + assert(w_id == k); + x_start = w_s; x_end = w_e; + if(aln_wlst(z, rref, uref, g_read, dumy, y_strand, y_id, x_start, x_end, + total_y_start, block_s, e_rate, 0)) { + p = &(z->w_list.a[z->w_list.n-1]); + w_idx[k] = z->w_list.n - 1; + if(x_end > mm_we) mm_we = x_end; + if(is_srt && z->w_list.n > 1 && p->x_start < z->w_list.a[z->w_list.n-2].x_start) is_srt = 0; + } else { + break; + } + total_y_start = p->y_end + 1 - p->extra_begin; + } + } + mm_ws = z->x_pos_s; mm_aln = mm_we+1-mm_ws; + if(!simi_pass(ovl, mm_aln, rref, uref, NULL)) continue; + if(nw > 0 && w_idx[0] != (uint64_t)-1) mm_ws = z->w_list.a[w_idx[0]].x_end+1; + + for (i = 1; i < nw; i++) { //utilize the the start pos of next window in backward + ///find the first matched window, which should not be the first window + ///the pre-window of this matched window must be unmatched + if(w_idx[i] != (uint64_t)-1 && w_idx[i-1] == (uint64_t)-1) { + w_s = z->w_list.a[w_idx[i]].x_start; mm_aln -= (w_s-mm_ws); + ///check if the start pos of this matched window has been calculated + if(z->w_list.a[w_idx[i]].clen == 0) { + p = &(z->w_list.a[w_idx[i]]); + gen_backtrace(p, z, rref, uref, g_read, dumy, y_strand, y_id); + assert(p->error != -1); + p->y_end += p->extra_begin; + } + real_y_start = p->y_start; + + ///the end pos for pre window is real_y_start - 1 + total_y_end = real_y_start - 1; + ///find the unmatched window on the left of current matched window + ///k starts from i - 1 + for (k = i - 1; k >= 0 && w_idx[k] == (uint64_t)-1 && total_y_end > 0; k--) { + w_e = w_s - 1; + w_id = get_win_id_by_e(z, w_e, block_s, &w_s); + assert(w_id == k); + x_start = w_s; x_end = w_e; x_len = x_end + 1 - x_start; + if(aln_wlst(z, rref, uref, g_read, dumy, y_strand, y_id, x_start, x_end, total_y_end+1-x_len, block_s, e_rate, 1)) { + p = &(z->w_list.a[z->w_list.n-1]); + p->y_start -= p->extra_begin; ///y_start has no shift, but y_end has shift + w_idx[k] = z->w_list.n - 1; + mm_aln += x_len; + if(is_srt && z->w_list.n > 1 && p->x_start < z->w_list.a[z->w_list.n-2].x_start) is_srt = 0; + } else { + break; + } + total_y_end = p->y_start - 1; + } + if(!simi_pass(ovl, mm_aln, rref, uref, NULL)) break; + } + if(w_idx[i] != (uint64_t)-1) mm_ws = z->w_list.a[w_idx[i]].x_end+1; + } + + if(i < nw) continue; + if(uref && simi_pass(ovl, z->align_length, NULL, uref, NULL)) { + z->is_match = 3; overlap_list->mapped_overlaps_length += z->align_length; + ///sort for set_herror_win + if(!is_srt) radix_sort_window_list_xs_srt(z->w_list.a, z->w_list.a + z->w_list.n); + } + } + + if(uref && overlap_list->mapped_overlaps_length > 0) { + set_herror_win(overlap_list, dumy, v_idx, e_rate, g_read->length, block_s); + } + + overlap_list->mapped_overlaps_length = 0; + for (j = 0; j < (long long)overlap_list->length; j++) { + z = &(overlap_list->list[j]); + y_id = z->y_id; y_strand = z->y_pos_strand; + y_readLen = (rref?(Get_READ_LENGTH((*rref), y_id)):(uref->ug->u.a[y_id].len)); + ovl = z->x_pos_e + 1 - z->x_pos_s; //z->is_match = 0; + // if(y_id == 4) { + // fprintf(stderr, "[M::%s::idx->%ld::] z::x_pos_s->%u, z::x_pos_e->%u, ovl->%ld, aln->%u\n", + // __func__, j, z->x_pos_s, z->x_pos_e, ovl, z->align_length); + // } + ///debug_scan_cigar(&(overlap_list->list[j])); + ///only calculate cigar for high quality overlaps + // int64_t tt = 0; + if(simi_pass(ovl, z->align_length, rref, uref, &e_rate)) { + a_nw = z->w_list.n; + for (i = 0, is_srt = 1; i < a_nw; i++) { + p = &(z->w_list.a[i]); + ///check if the cigar of this window has been got + if(p->clen == 0) { + gen_backtrace(p, z, rref, uref, g_read, dumy, y_strand, y_id); + assert(p->error != -1); + // if(y_id == 4) { + // fprintf(stderr, "+[M::idx->%ld::] y_start->%d, y_end->%d, error->%d\n", + // j, p->y_start, p->y_end, p->error); + // } + } + else { + p->y_end -= p->extra_begin; + // if(y_id == 4) { + // fprintf(stderr, "-[M::idx->%ld::] y_start->%d, y_end->%d, error->%d\n", + // j, p->y_start, p->y_end, p->error); + // } + } + // tt += p->error; + if(is_srt && i > 0 && p->x_start < z->w_list.a[i-1].x_start) is_srt = 0; + } + if(!is_srt) radix_sort_window_list_xs_srt(z->w_list.a, z->w_list.a + z->w_list.n); + error_rate = non_trim_error_rate(z, rref, uref, v_idx, dumy, g_read, e_rate, block_s); + z->is_match = 0;///must be here; + // if(y_id == 4) { + // fprintf(stderr, "[M::%s::idx->%ld::] block_s->%ld, z::x_pos_s->%u, z::x_pos_e->%u, ovl->%ld, aln->%u, error_rate->%f, e_rate_final->%f\n", + // __func__, j, block_s, z->x_pos_s, z->x_pos_e, ovl, z->align_length, error_rate, e_rate_final); + // exit(1); + // } + if (error_rate <= e_rate_final) { + overlap_list->mapped_overlaps_length += ovl; + z->is_match = 1; append_unmatched_wins(z, block_s); + if(rref) { + calculate_boundary_cigars(z, rref, dumy, g_read, e_rate); + } else { + calculate_ul_boundary_cigars(z, uref, dumy, g_read, e_rate, block_s); + } + // assert(get_num_wins(z->x_pos_s, z->x_pos_e+1, block_s)==(int64_t)z->w_list.n); + // assert((int64_t)z->x_pos_s==z->w_list.a[0].x_start && + // (int64_t)z->x_pos_e==z->w_list.a[z->w_list.n-1].x_end); + } else if (error_rate <= e_rate_final * 1.5) { + z->is_match = 3; + } + } else {///it impossible to be matched + z->is_match = 0; + // fprintf(stderr, "[M::%s::idx->%ld::is_match->%u] z::x_pos_s->%u, z::x_pos_e->%u, error_rate->-1, e_threshold->%f\n", + // __func__, j, z->is_match, z->x_pos_s, z->x_pos_e, e_rate); + } + } +} inline void add_base_to_correct_read_directly(Correct_dumy* dumy, char base) { @@ -8855,7 +9196,77 @@ void correct_ul_overlap(overlap_region_alloc* overlap_list, const ul_idx_t *uref // recalcate_window(overlap_list, R_INF, g_read, dumy, overlap_read); // partition_overlaps(overlap_list, R_INF, g_read, dumy, hap, force_repeat); // recalcate_window_ul_advance(overlap_list, uref, g_read, dumy, overlap_read, max_ov_diff_ec, w_inf.window_length, km); - recalcate_window_advance(overlap_list, NULL, uref, g_read, dumy, overlap_read, v_idx, w_inf.window_length, max_ov_diff_ec, max_ov_diff_ec); + // recalcate_window_advance(overlap_list, NULL, uref, g_read, dumy, overlap_read, v_idx, w_inf.window_length, max_ov_diff_ec, max_ov_diff_ec); + refine_ed_aln(overlap_list, NULL, uref, g_read, dumy, overlap_read, v_idx, w_inf.window_length, max_ov_diff_ec, max_ov_diff_ec); + // fprintf(stderr, "[M::%s-beg] occ[0]->%lu, occ[1]->%lu, occ[2]->%lu, occ[3]->%lu\n", __func__, + // ovlp_occ(overlap_list, 0), ovlp_occ(overlap_list, 1), ovlp_occ(overlap_list, 2), ovlp_occ(overlap_list, 3)); + ///after this function, overlap_list is sorted by x_pos_e; used for g_chain + partition_ul_overlaps_advance(overlap_list, uref, g_read, overlap_read, dumy, hap, force_repeat, max_ov_diff_ec, w_inf.window_length, km); + // print_ovlp_occ_stat(overlap_list, g_read->length, 1); + // print_ovlp_occ_stat(overlap_list, g_read->length, 2); + // fprintf(stderr, "[M::%s-end] occ[0]->%lu, occ[1]->%lu, occ[2]->%lu, occ[3]->%lu\n", __func__, + // ovlp_occ(overlap_list, 0), ovlp_occ(overlap_list, 1), ovlp_occ(overlap_list, 2), ovlp_occ(overlap_list, 3)); + // debug_phasing_status(overlap_list, uref->ug, 0, hap, g_read, 20, 1176); + // debug_phasing_status(overlap_list, uref->ug, 0, hap, g_read, 20, 1167); + // debug_phasing_status(overlap_list, uref->ug, 0, hap, g_read, 20, 1170); + /** + + + if(is_consensus) + { + generate_consensus(overlap_list, R_INF, g_read, dumy, g, DAGCon, current_cigar, second_round); + } + + + (*fully_cov) = check_if_fully_covered(overlap_list, R_INF, g_read, dumy, g, abnormal); + **/ +} + + + + +void lchain_align(overlap_region_alloc* overlap_list, const ul_idx_t *uref, + UC_Read* g_read, Correct_dumy* dumy, UC_Read* overlap_read, + Graph* g, Graph* DAGCon, Cigar_record* current_cigar, + haplotype_evdience_alloc* hap, Round2_alignment* second_round, + kvec_t_u64_warp* v_idx, window_list_alloc* win_ciagr_buf, + int force_repeat, int is_consensus, int* fully_cov, int* abnormal, + double max_ov_diff_ec, long long winLen, void *km) +{ + + clear_Correct_dumy(dumy, overlap_list, km); + + long long window_start, window_end; + + Window_Pool w_inf; + + init_Window_Pool(&w_inf, g_read->length, winLen, (int)(1.0/max_ov_diff_ec)); + int flag = 0; + + while(get_Window(&w_inf, &window_start, &window_end) && flag != -2) { + dumy->length = 0; dumy->lengthNT = 0; + flag = get_interval(window_start, window_end, overlap_list, dumy, w_inf.window_length); + + switch (flag) { + case 1: ///no match here + break; + case 0: ///no match here + break; + case -2: ///if flag == -2, loop would be terminated + break; + } + + ///dumy->lengthNT represent how many overlaps that the length of them is not equal to WINDOW; may larger or less than WINDOW + ///dumy->length represent how many overlaps that the length of them is WINDOW + ///now the windows which are larger than WINDOW are verified one-by-one, to improve it, we can do it group-bygroup + verify_ul_window(window_start, window_end, overlap_list, dumy, uref, g_read->seq, max_ov_diff_ec, w_inf.window_length, /**THRESHOLD**/THRESHOLD_MAX_SIZE, km); + // verify_ul_ll_window(window_start, window_end, overlap_list, dumy, uref, g_read->seq, max_ov_diff_ec, w_inf.window_length, km); + } + + // recalcate_window(overlap_list, R_INF, g_read, dumy, overlap_read); + // partition_overlaps(overlap_list, R_INF, g_read, dumy, hap, force_repeat); + // recalcate_window_ul_advance(overlap_list, uref, g_read, dumy, overlap_read, max_ov_diff_ec, w_inf.window_length, km); + refine_ed_aln(overlap_list, NULL, uref, g_read, dumy, overlap_read, v_idx, w_inf.window_length, max_ov_diff_ec, max_ov_diff_ec); // fprintf(stderr, "[M::%s-beg] occ[0]->%lu, occ[1]->%lu, occ[2]->%lu, occ[3]->%lu\n", __func__, // ovlp_occ(overlap_list, 0), ovlp_occ(overlap_list, 1), ovlp_occ(overlap_list, 2), ovlp_occ(overlap_list, 3)); ///after this function, overlap_list is sorted by x_pos_e; used for g_chain diff --git a/Correct.h b/Correct.h index df6de7a..801e85f 100644 --- a/Correct.h +++ b/Correct.h @@ -1131,6 +1131,13 @@ void correct_ul_overlap(overlap_region_alloc* overlap_list, const ul_idx_t *uref int force_repeat, int is_consensus, int* fully_cov, int* abnormal, double max_ov_diff_ec, long long winLen, void *km); +void lchain_align(overlap_region_alloc* overlap_list, const ul_idx_t *uref, + UC_Read* g_read, Correct_dumy* dumy, UC_Read* overlap_read, + Graph* g, Graph* DAGCon, Cigar_record* current_cigar, + haplotype_evdience_alloc* hap, Round2_alignment* second_round, + kvec_t_u64_warp* v_idx, window_list_alloc* win_ciagr_buf, + int force_repeat, int is_consensus, int* fully_cov, int* abnormal, + double max_ov_diff_ec, long long winLen, void *km); /*** type: 0. match diff --git a/Hash_Table.cpp b/Hash_Table.cpp index 86c73a2..41156dc 100644 --- a/Hash_Table.cpp +++ b/Hash_Table.cpp @@ -78,6 +78,94 @@ int get_fake_gap_shift(Fake_Cigar* x, int index) return result; } +int ovlp_chain_gen(overlap_region_alloc* ol, overlap_region* t, int64_t xl, int64_t yl, int64_t apend_be) +{ + if (ol->length + 1 > ol->size) { + uint64_t sl = ol->size; + ol->size = ol->length + 1; + kroundup64(ol->size); + REALLOC(ol->list, ol->size); + /// need to set new space to be 0 + memset(ol->list + sl, 0, sizeof(overlap_region)*(ol->size - sl)); + } + + if ((ol->length!=0) && (ol->list[ol->length-1].y_id==t->y_id)) { + if((ol->list[ol->length-1].shared_seed > t->shared_seed) || + ((ol->list[ol->length-1].shared_seed == t->shared_seed) && + (ol->list[ol->length-1].overlapLen <= t->overlapLen))) { + return 0; + } + else { + ol->length--; + } + } + + int64_t xr, yr, dd, pdd, dq, dr, id, i, fn; + if(t->x_pos_s <= t->y_pos_s) { + t->y_pos_s -= t->x_pos_s; t->x_pos_s = 0; + } else { + t->x_pos_s -= t->y_pos_s; t->y_pos_s = 0; + } + + xr = xl-t->x_pos_e-1; yr = yl-t->y_pos_e-1; + if(xr <= yr) { + t->x_pos_e = xl-1; t->y_pos_e += xr; + } else { + t->y_pos_e = yl-1; t->x_pos_e += yr; + } + + overlap_region *o = &(ol->list[ol->length++]); + o->shared_seed = t->shared_seed; + o->align_length = 0; + o->is_match = 0; + o->non_homopolymer_errors = 0; + o->strong = 0; + o->x_id = t->x_id; + o->y_id = t->y_id; + o->x_pos_strand = 0;///always 0 + o->y_pos_strand = t->x_pos_strand; + + resize_fake_cigar(&(o->f_cigar), (t->f_cigar.length + 2), NULL); + if(apend_be == 1) add_fake_cigar(&(o->f_cigar), o->x_pos_s, 0, NULL); + + if (t->x_pos_strand == 1) { + o->x_pos_e = xl-t->x_pos_s-1; o->x_pos_s = xl-t->x_pos_e-1; + o->y_pos_e = yl-t->y_pos_s-1; o->y_pos_s = yl-t->y_pos_e-1; + + pdd = INT32_MAX; fn = t->f_cigar.length; + for (i = 0; i < fn; i++) { + dd = get_fake_gap_shift(&(t->f_cigar), i); + if(dd != pdd) { + pdd = dd; + add_fake_cigar(&(o->f_cigar), xl-get_fake_gap_pos(&(t->f_cigar), i)-1, pdd, NULL); + } + } + } else { + o->x_pos_e = t->x_pos_e; o->x_pos_s = t->x_pos_s; + o->y_pos_e = t->y_pos_e; o->y_pos_s = t->y_pos_s; + + dq = t->x_pos_e - t->x_pos_s; + dr = t->y_pos_e - t->y_pos_s; + id = dr - dq; + + pdd = INT32_MAX; fn = t->f_cigar.length; + for (i = fn-1; i >= 0; i--) { + dd = get_fake_gap_shift(&(t->f_cigar), i); + if(dd != pdd) { + pdd = dd; + add_fake_cigar(&(o->f_cigar), get_fake_gap_pos(&(t->f_cigar), i), id - pdd, NULL); + } + } + } + + if((apend_be == 1) && (get_fake_gap_pos(&(o->f_cigar), o->f_cigar.length-1) != ((int64_t)o->x_pos_e))) { + add_fake_cigar(&(o->f_cigar), o->x_pos_e, + get_fake_gap_shift(&(o->f_cigar), o->f_cigar.length-1), NULL); + } + + return 1; +} + int append_inexact_overlap_region_alloc(overlap_region_alloc* list, overlap_region* tmp, long long xLen, long long yLen, int add_beg_end, void *km) { @@ -1107,3 +1195,217 @@ void resize_window_list_alloc(window_list_alloc* x, uint64_t size) for (k = 0; k < x->m; k++) x->a[k].error = -1; x->c.n = 0; } + +#define normal_cw(x) (((x)&(0xffu))>=((x)>>8)?(((x)&(0xffu))/((x)>>8)):1) + +int32_t lchain_check(k_mer_hit *a, int32_t n_a, Chain_Data *dp, double bw_thres) +{ + int32_t i, tot_g = 0, sc, dg, dq, dr, dd, span; + double bw_pen; + if (n_a == 0) return -1; + if (n_a > 1) { + if ((a[0].self_offset >= a[n_a-1].self_offset)||(a[0].offset == a[n_a-1].offset)) return -1; + dq = (int32_t)a[n_a-1].self_offset - (int32_t)a[0].self_offset; + dr = (int32_t)a[n_a-1].offset - (int32_t)a[0].offset; + dd = ((dq>=dr)? (dq-dr): (dr-dq));//gap + dg = ((dq>=dr)? (dr): (dq));///len + if (dd > (dg*bw_thres)) return -1; + } + + for (i = 1; i < n_a; ++i) {///a[] is sorted by offset, instead of self_offset; but offset might be equal + if(a[i-1].self_offset >= a[i].self_offset) break; + if(a[i-1].offset == a[i].offset) break; + } + if (i < n_a) return -1; + + bw_pen = 1.0 / bw_thres; + dp->score[0] = normal_w((a[0].cnt&(0xffu)), (a[0].cnt>>8)); dp->pre[0] = -1; + for (i = 1; i < n_a; ++i) { + dq = (int32_t)a[i].self_offset - (int32_t)a[i-1].self_offset; + dr = (int32_t)a[i].offset - (int32_t)a[i-1].offset; + dd = ((dq>=dr)? (dq-dr): (dr-dq));//gap + dg = ((dq>=dr)? (dr): (dq));///len + + tot_g += dd; + if (dd > THRESHOLD_MAX_SIZE && dd > (dg*bw_thres)) break; + span = a[i].cnt&(0xffu); + sc = dg < span? dg : span; + sc = normal_w(sc, ((int32_t)(a[i].cnt>>8))); + sc -= (int32_t)((((double)dd)/((double)dg))*bw_pen*((double)sc));///bw_pen is 20 for HiFi + + dp->score[i] = dp->score[i-1] + sc; + dp->pre[i] = i - 1; + } + if (i < n_a) return -1; + + if(n_a > 1) { + dq = (int32_t)a[n_a-1].self_offset - (int32_t)a[0].self_offset; + dr = (int32_t)a[n_a-1].offset - (int32_t)a[0].offset; + dg = ((dq>=dr)? (dr): (dq));///len + dd = tot_g;///gap + if (dd > (dg*bw_thres)) return -1; + } + return n_a; +} + +inline int32_t cal_bw(const k_mer_hit *ai, const k_mer_hit *aj, double bw_rate, int64_t sf_l, int64_t ot_l) +{ + ///ai is the suffix of aj + int64_t sf_s = aj->self_offset, sf_e = ai->self_offset + 1; + int64_t ot_s = aj->offset, ot_e = ai->offset + 1; + int64_t sf_r = sf_l - sf_e, ot_r = ot_l - ot_e; + if(sf_s <= ot_s) sf_s = 0; + else sf_s -= ot_s; + + if(sf_r <= ot_r) sf_e = sf_l; + else sf_e += ot_r; + + return (sf_e - sf_s)*bw_rate; +} + +inline int32_t comput_sc_ch(const k_mer_hit *ai, const k_mer_hit *aj, double bw_rate, double chn_pen_gap, double chn_pen_skip, int64_t sl, int64_t ol) +{ + ///ai is the suffix of aj + int32_t dq, dr, dd, dg, q_span, sc; + dq = (int64_t)(ai->self_offset) - (int64_t)(aj->self_offset); + if(dq < 0) return INT32_MIN; + dr = (int64_t)(ai->offset) - (int64_t)(aj->offset); + if(dr < 0) return INT32_MIN; + dd = dr > dq? dr - dq : dq - dr;//gap + if((dd > 16) && (dd > cal_bw(ai, aj, bw_rate, sl, ol))) return INT32_MIN; + dg = dr < dq? dr : dq;//len + q_span = ai->cnt&(0xffu); + sc = q_span < dg? q_span : dg; + sc = normal_w(sc, ((int32_t)(ai->cnt>>8))); + if (dd || dg > q_span) { + double lin_pen, a_pen; + lin_pen = (chn_pen_gap*(double)dd); + a_pen = ((double)(sc))*((((double)dd)/((double)dg))/bw_rate); + if(lin_pen > a_pen) lin_pen = a_pen; + lin_pen += (chn_pen_skip*(double)dg); + sc -= (int32_t)lin_pen; + } + return sc; +} + +uint64_t lchain_dp(k_mer_hit* a, int64_t a_n, k_mer_hit* des, Chain_Data* dp, overlap_region* res, + int64_t max_skip, int64_t max_iter, int64_t max_dis, double chn_pen_gap, double chn_pen_skip, double bw_rate, + int64_t xl, int64_t yl, int64_t quick_check) +{ + int64_t *p, *t, max_f, n_skip, st, max_j, end_j, sc, msc, msc_i, bw, max_ii, ovl, movl; + int32_t *f, max, tmp; int64_t i, j, ret, dq, dr, dd, pdd, cL = 0; + resize_Chain_Data(dp, a_n, NULL); + t = dp->tmp; f = dp->score; p = dp->pre; + bw = ((xl < yl)?xl:yl); bw *= bw_rate; + msc = msc_i = -1; movl = INT32_MAX; + + if(quick_check) { + ret = lchain_check(a, a_n, dp, bw_rate); + if (ret > 0) { + a_n = ret; msc_i = a_n-1; msc = f[msc_i]; + goto skip_ldp; + } + } + + memset(t, 0, (a_n*sizeof((*t)))); + for (i = st = 0, max_ii = -1; i < a_n; ++i) { + max_f = a[i].cnt&(0xffu); + n_skip = 0; max_j = end_j = -1; + if ((i-st) > max_iter) st = i-max_iter; + + for (j = i - 1; j >= st; --j) { + sc = comput_sc_ch(&a[i], &a[j], bw_rate, chn_pen_gap, chn_pen_skip, xl, yl); + if (sc == INT32_MIN) continue; + sc += f[j]; + if (sc > max_f) { + max_f = sc, max_j = j; + if (n_skip > 0) --n_skip; + } else if (t[j] == (int32_t)i) { + if (++n_skip > max_skip) + break; + } + if (p[j] >= 0) t[p[j]] = i; + } + end_j = j; + + if (max_ii < 0 || ((int64_t)a[i].offset) - ((int64_t)a[max_ii].offset) > max_dis) { + max = INT32_MIN; max_ii = -1; + for (j = i - 1; j >= st; --j) { + if (max < f[j]) { + max = f[j], max_ii = j; + } + } + } + + if (max_ii >= 0 && max_ii < end_j) {///just have a try with a[i]<->a[max_ii] + tmp = comput_sc_ch(&a[i], &a[max_ii], bw_rate, chn_pen_gap, chn_pen_skip, xl, yl); + if (tmp != INT32_MIN && max_f < tmp + f[max_ii]) + max_f = tmp + f[max_ii], max_j = max_ii; + } + f[i] = max_f; p[i] = max_j; + if ((max_ii < 0) || (((((int64_t)a[i].offset)-((int64_t)a[max_ii].offset))<=max_dis) && (f[max_ii]= msc) { + ovl = get_chainLen(a[i].self_offset, a[i].self_offset, xl, a[i].offset, a[i].offset, yl); + if(f[i] > msc || ovl < movl) { + msc = f[i]; msc_i = i; movl = ovl; + } + } + } + + skip_ldp: + clear_fake_cigar(&(res->f_cigar)); + ///a[] has been sorted by offset + i = msc_i; + res->x_pos_e = a[i].self_offset; + res->y_pos_e = a[i].offset; + res->shared_seed = msc; + // res->overlapLen = movl; + + dq = res->x_pos_e - a[i].self_offset; + dr = res->y_pos_e - a[i].offset; + dd = dr - dq; pdd = dd; + ///record first site + ///the length of f_cigar should be at least 1 + ///record the offset of reference + add_fake_cigar(&(res->f_cigar), a[i].self_offset, dd, NULL); + cL = 0; + if(res->x_pos_strand == 1) { + while (i >= 0) { + dq = res->x_pos_e - a[i].self_offset; + dr = res->y_pos_e - a[i].offset; + dd = dr - dq; + if(dd != pdd) { + pdd = dd; + ///record this site + add_fake_cigar(&(res->f_cigar), a[i].self_offset, pdd, NULL); + } + t[cL++] = i; + res->x_pos_s = a[i].self_offset; + res->y_pos_s = a[i].offset; + msc_i = i; i = p[i]; + } + } + else { + while (i >= 0) { + dq = res->x_pos_e - a[i].self_offset; + dr = res->y_pos_e - a[i].offset; + dd = dr - dq; + if(dd == pdd) { + res->f_cigar.length--; + add_fake_cigar(&(res->f_cigar), a[i].self_offset, pdd, NULL); + } else { + pdd = dd; + add_fake_cigar(&(res->f_cigar), a[i].self_offset, pdd, NULL); + } + t[cL++] = i; + res->x_pos_s = a[i].self_offset; + res->y_pos_s = a[i].offset; + msc_i = i; i = p[i]; + } + } + res->overlapLen = get_chainLen(a[msc_i].self_offset, a[i].self_offset, xl, a[msc_i].offset, a[i].offset, yl); + for (i = 0; i < cL; i++) des[i] = a[t[cL-i-1]]; + return cL; +} \ No newline at end of file diff --git a/Hash_Table.h b/Hash_Table.h index 14c7409..c780b24 100644 --- a/Hash_Table.h +++ b/Hash_Table.h @@ -187,6 +187,11 @@ void clear_window_list_alloc(window_list_alloc* x); void destory_window_list_alloc(window_list_alloc* x); void resize_window_list_alloc(window_list_alloc* x, uint64_t size); long long chain_DP(k_mer_hit* a, long long a_n, Chain_Data* dp, overlap_region* result, double band_width_threshold, int max_skip, int x_readLen, int y_readLen, void *km); +uint64_t lchain_dp(k_mer_hit* a, int64_t a_n, k_mer_hit* des, Chain_Data* dp, overlap_region* res, + int64_t max_skip, int64_t max_iter, int64_t max_dis, double chn_pen_gap, double chn_pen_skip, double bw_rate, + int64_t xl, int64_t yl, int64_t quick_check); +int ovlp_chain_gen(overlap_region_alloc* ol, overlap_region* t, int64_t xl, int64_t yl, int64_t apend_be); int append_utg_inexact_overlap_region_alloc(overlap_region_alloc* list, overlap_region* tmp, ma_utg_v *ua, int add_beg_end, void *km); + #endif diff --git a/Makefile b/Makefile index 30c843c..9d70e79 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ INCLUDES= OBJS= CommandLines.o Process_Read.o Assembly.o Hash_Table.o \ POA.o Correct.o Levenshtein_distance.o Overlaps.o Trio.o kthread.o Purge_Dups.o \ htab.o hist.o sketch.o anchor.o extract.o sys.o ksw2_extz2_sse.o hic.o rcut.o horder.o \ - tovlp.o inter.o kalloc.o gfa_ut.o + tovlp.o inter.o kalloc.o gfa_ut.o gchain_map.o EXE= hifiasm LIBS= -lz -lpthread -lm @@ -78,4 +78,5 @@ horder.o: horder.h tovlp.o: tovlp.h inter.o: inter.h Process_Read.h kalloc.o: kalloc.h -gfa_ut.o: Overlaps.h \ No newline at end of file +gfa_ut.o: Overlaps.h +gchain_map.o: gchain_map.h \ No newline at end of file diff --git a/anchor.cpp b/anchor.cpp index 15df407..75e11ed 100644 --- a/anchor.cpp +++ b/anchor.cpp @@ -1,5 +1,6 @@ #include #include +#include #include "htab.h" #include "ksort.h" #include "Hash_Table.h" @@ -780,3 +781,171 @@ void ha_sort_list_by_anchor(overlap_region_alloc *overlap_list) { ks_introsort_or_xs(overlap_list->length, overlap_list->list); } + + +void minimizers_gen(ha_abufl_t *ab, char* rs, int64_t rl, uint64_t mz_w, uint64_t mz_k, Candidates_list *cl, kvec_t_u8_warp* k_flag, +void *ha_flt_tab, ha_pt_t *ha_idx, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t high_occ) +{ + uint64_t i, k, l; int n, j; ha_mzl_t *z; seedl_t *s; + if(high_occ < 1) high_occ = 1; + clear_Candidates_list(cl); ab->mz.n = 0, ab->n_a = 0; + + // get the list of anchors + mz2_ha_sketch(rs, rl, mz_w, mz_k, 0, !(asm_opt.flag & HA_F_NO_HPC), &ab->mz, ha_flt_tab, asm_opt.mz_sample_dist, k_flag, dbg_ct, NULL, -1, asm_opt.dp_min_len, -1, sp, asm_opt.mz_rewin, 0, NULL); + + // minimizer of queried read + if (ab->mz.m > ab->old_mz_m) { + ab->old_mz_m = ab->mz.m; + REALLOC(ab->seed, ab->old_mz_m); + } + + for (i = 0, ab->n_a = 0; i < ab->mz.n; ++i) { + ab->seed[i].a = ha_ptl_get(ha_idx, ab->mz.a[i].x, &n); + ab->seed[i].n = n; + ab->n_a += n; + } + + if (ab->n_a > ab->m_a) { + ab->m_a = ab->n_a; + REALLOC(ab->a, ab->m_a); + } + + for (i = 0, k = 0; i < ab->mz.n; ++i) { + ///z is one of the minimizer + z = &ab->mz.a[i]; s = &ab->seed[i]; + for (j = 0; j < s->n; ++j) { + const ha_idxposl_t *y = &s->a[j]; + anchor1_t *an = &ab->a[k++]; + uint8_t rev = z->rev == y->rev? 0 : 1; + an->other_off = y->pos; + an->self_off = rev? rl - 1 - (z->pos + 1 - z->span) : z->pos; + ///an->cnt: cnt<<8|span + an->cnt = s->n; if(an->cnt > ((uint32_t)(0xffffffu))) an->cnt = 0xffffffu; + an->cnt <<= 8; an->cnt |= ((z->span <= ((uint32_t)(0xffu)))?z->span:((uint32_t)(0xffu))); + an->srt = (uint64_t)y->rid<<33 | (uint64_t)rev<<32 | an->other_off; + } + } + + radix_sort_ha_an1(ab->a, ab->a + ab->n_a); + for (k = 1, l = 0; k <= ab->n_a; ++k) { + if (k == ab->n_a || ab->a[k].srt != ab->a[l].srt) { + if (k - l > 1) + radix_sort_ha_an2(ab->a + l, ab->a + k); + l = k; + } + } + + // copy over to _cl_ + if (ab->m_a >= (uint64_t)cl->size) { + cl->size = ab->m_a; + REALLOC(cl->list, cl->size); + } + + for (k = 0; k < ab->n_a; ++k) { + k_mer_hit *p = &cl->list[k]; + p->readID = ab->a[k].srt >> 33; + p->strand = ab->a[k].srt >> 32 & 1; + p->offset = ab->a[k].other_off; + p->self_offset = ab->a[k].self_off; + if((ab->a[k].cnt>>8) <= high_occ){ + p->cnt = 1; + } + else{ + p->cnt = 1 + (((ab->a[k].cnt>>8) + (high_occ<<1) - 1)/(high_occ<<1)); + p->cnt = pow(p->cnt, 1.1); + } + if(p->cnt > ((uint32_t)(0xffffffu))) p->cnt = 0xffffffu; + p->cnt <<= 8; p->cnt |= (((uint32_t)(0xffu))&(ab->a[k].cnt)); + } + cl->length = ab->n_a; +} + +void lchain_gen(Candidates_list* cl, overlap_region_alloc* ol, uint64_t rid, uint64_t rl, All_reads* rdb, + const ul_idx_t *udb, uint32_t beg_tail, overlap_region* tf, uint64_t max_n_chain, + int64_t max_skip, int64_t max_iter, int64_t max_dis, double chn_pen_gap, double chn_pen_skip, double bw_rate, int64_t quick_check) +{ + uint64_t i, k, l, m, sm, cn = cl->length; + clear_overlap_region_alloc(ol); + clear_fake_cigar(&(tf->f_cigar)); + + ///calculate_overlap_region_by_chaining(cl, overlap_list, chain_idx, rid, rl, NULL, uref, bw_thres, keep_whole_chain, f_cigar); + for (l = 0, k = 1, m = 0; k <= cn; k++) { + if((k == cn) || (cl->list[k].readID != cl->list[l].readID) + || (cl->list[k].strand != cl->list[l].strand)) { + if(cl->list[l].readID != rid) { + tf->x_id = rid; + tf->x_pos_strand = cl->list[l].strand; + tf->y_id = cl->list[l].readID; + tf->y_pos_strand = 0;///always 0 + sm = lchain_dp(cl->list+l, k-l, cl->list+m, &(cl->chainDP), tf, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_rate, + rl, rdb?Get_READ_LENGTH((*rdb), (*tf).y_id):udb->ug->u.a[(*tf).y_id].len, quick_check); + assert(sm > 0); + if(ovlp_chain_gen(ol, tf, rl, rdb?Get_READ_LENGTH((*rdb), (*tf).y_id):udb->ug->u.a[(*tf).y_id].len, beg_tail)) { + m += sm; + } + } + l = k; + } + } + cl->length = m; + + + + if (ol->length > max_n_chain) { + int32_t w, n[4], s[4]; overlap_region t; + n[0] = n[1] = n[2] = n[3] = 0, s[0] = s[1] = s[2] = s[3] = 0; + ks_introsort_or_ss(ol->length, ol->list); + for (i = 0; i < ol->length; ++i) { + const overlap_region *r = &(ol->list[i]); + w = ha_ov_type(r, rl); + ++n[w]; + if (((uint64_t)n[w]) == max_n_chain) s[w] = r->shared_seed; + } + if (s[0] > 0 || s[1] > 0 || s[2] > 0 || s[3] > 0) { + // n[0] = n[1] = n[2] = n[3] = 0; + for (i = 0, k = 0; i < ol->length; ++i) { + overlap_region *r = &(ol->list[i]); + w = ha_ov_type(r, rl); + // ++n[w]; + // if (((int)n[w] <= max_n_chain) || (r->shared_seed >= s[w] && s[w] >= (asm_opt.k_mer_length<<1))) { + if (r->shared_seed >= s[w]) { + if (k != i) { + t = ol->list[k]; + ol->list[k] = ol->list[i]; + ol->list[i] = t; + } + ++k; + } + } + ol->length = k; + } + } + ks_introsort_or_xs(ol->length, ol->list); +} + +void set_lchain_dp_op(uint32_t is_accurate, uint32_t mz_k, int64_t *max_skip, int64_t *max_iter, int64_t *max_dis, double *chn_pen_gap, double *chn_pen_skip, int64_t *quick_check) +{ + double div, pen_gap, pen_skip, tmp; + if(is_accurate) { + (*quick_check) = 1; (*max_skip) = 25; (*max_iter) = 5000; (*max_dis) = 5000; div = 0.01; pen_gap = 0.5f; pen_skip = 0.0005f; + } else { + (*quick_check) = 0; (*max_skip) = 25; (*max_iter) = 5000; (*max_dis) = 5000; div = 0.1; pen_gap = 0.5f; pen_skip = 0.0005f; + } + tmp = expf(-div * (double)mz_k);///0.60049557881 -> HiFi; 0.18268352405 -> ont + *chn_pen_gap = pen_gap * tmp;///0.300247789405 -> HiFi; 0.091341762025 -> ont + ///0.000300247789405 -> HiFi (>3330 will be negative); + //0.000091341762025 -> ont (>10947 will be negative); + *chn_pen_skip = pen_skip * tmp; +} + +void ul_map_lchain(ha_abufl_t *ab, int64_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, const ul_idx_t *uref, overlap_region_alloc *overlap_list, overlap_region_alloc *overlap_list_hp, Candidates_list *cl, double bw_thres, + int max_n_chain, int keep_whole_chain, kvec_t_u8_warp* k_flag, overlap_region* f_cigar, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t high_occ, uint32_t is_accurate) +{ + extern void *ha_flt_tab; + extern ha_pt_t *ha_idx; + int64_t max_skip, max_iter, max_dis, quick_check; double chn_pen_gap, chn_pen_skip; + set_lchain_dp_op(is_accurate, mz_k, &max_skip, &max_iter, &max_dis, &chn_pen_gap, &chn_pen_skip, &quick_check); + minimizers_gen(ab, rs, rl, mz_w, mz_k, cl, k_flag, ha_flt_tab, ha_idx, dbg_ct, sp, high_occ); + lchain_gen(cl, overlap_list, rid, rl, NULL, uref, keep_whole_chain, f_cigar, max_n_chain, max_skip, max_iter, max_dis, chn_pen_gap, chn_pen_skip, bw_thres, quick_check); + ///no need to sort here, overlap_list has been sorted at lchain_gen +} diff --git a/gchain_map.cpp b/gchain_map.cpp new file mode 100644 index 0000000..09f142b --- /dev/null +++ b/gchain_map.cpp @@ -0,0 +1,204 @@ +#include +#include +#include +#include +#include +#include "kseq.h" // FASTA/Q parser +#include "kavl.h" +#include "khash.h" +#include "kalloc.h" +#include "kthread.h" +#include "inter.h" +#include "Overlaps.h" +#include "CommandLines.h" +#include "htab.h" +#include "Hash_Table.h" +#include "Correct.h" +#include "Process_Read.h" +#include "Assembly.h" +#include "gchain_map.h" +KSEQ_INIT(gzFile, gzread) +void ul_map_lchain(ha_abufl_t *ab, int64_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, const ul_idx_t *uref, overlap_region_alloc *overlap_list, overlap_region_alloc *overlap_list_hp, Candidates_list *cl, double bw_thres, + int max_n_chain, int keep_whole_chain, kvec_t_u8_warp* k_flag, overlap_region* f_cigar, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t high_occ, uint32_t is_accurate); + +typedef struct { // global data structure for kt_pipeline() + const void *ha_flt_tab; + const ha_pt_t *ha_idx; + const mg_idxopt_t *opt; + const ma_ug_t *ug; + const asg_t *rg; + const ug_opt_t *uopt; + const ul_idx_t *uu; + ucr_file_t *ucr_s; + kseq_t *ks; + int64_t chunk_size; + uint64_t n_thread; + uint64_t total_base; + uint64_t total_pair; + uint64_t num_bases, num_corrected_bases, num_recorrected_bases; + uint64_t remap, mini_cut; +} gmap_t; + +typedef struct { // data structure for each step in kt_pipeline() + const mg_idxopt_t *opt; + const void *ha_flt_tab; + const ha_pt_t *ha_idx; + const ma_ug_t *ug; + const asg_t *rg; + const ug_opt_t *uopt; + const ul_idx_t *uu; + int n, m, sum_len; + uint64_t *len, id; + char **seq; + // ha_mzl_v *mzs;///useless + // st_mt_t *sps;///useless + // mg_gchains_t **gcs;///useless + mg_tbuf_t **buf;///useless + ha_ovec_buf_t **hab; + kv_ul_ov_t *res; + // glchain_t *ll; + // gdpchain_t *gdp; + // glchain_t *sec_ll; + uint64_t num_bases, num_corrected_bases, num_recorrected_bases, mini_cut; + int64_t n_thread; +} sstep_t; + +/** +static void worker_ul_map(void *data, long i, int tid) // callback for kt_for() +{ + sstep_t *s = (sstep_t*)data; + ha_ovec_buf_t *b = s->hab[tid]; + kv_ul_ov_t *res = (s->res?(&(s->res[tid])):(NULL)); + mg_tbuf_t *buf = (s->buf?s->buf[tid]:NULL); + int64_t winLen = MIN((((double)THRESHOLD_MAX_SIZE)/s->opt->diff_ec_ul), WINDOW); + int fully_cov, abnormal; + assert(UL_INF.a[s->id+i].rlen == s->len[i]); + // if(s->id+i!=43) return; + + ul_map_lchain(b->abl, i, s->seq[i], s->len[i], s->opt->w, s->opt->k, s->uu, &b->olist, &b->olist_hp, &b->clist, s->opt->bw_thres, + s->opt->max_n_chain, 1, NULL, &(b->tmp_region), NULL, &(b->sp), s->mini_cut, 0); + + clear_Cigar_record(&b->cigar1); + clear_Round2_alignment(&b->round2); + + b->self_read.seq = s->seq[i]; b->self_read.length = s->len[i]; b->self_read.size = 0; + lchain_align(&b->olist, s->uu, &b->self_read, &b->correct, &b->ovlp_read, &b->POA_Graph, &b->DAGCon, + &b->cigar1, &b->hap, &b->round2, &b->r_buf, &(b->tmp_region.w_list), 0, 1, &fully_cov, &abnormal, s->opt->diff_ec_ul, winLen, NULL); + + // gl_chain_refine(&b->olist, &b->correct, &b->hap, bl, s->uu, s->opt->diff_ec_ul, winLen, s->len[i], km); + gl_chain_refine_advance_combine(s->buf[tid], &(UL_INF.a[s->id+i]), &b->olist, &b->correct, &b->hap, &(s->sps[tid]), bl, &(s->gdp[tid]), s->uu, s->opt->diff_ec_ul, winLen, s->len[i], s->uopt, s->id+i, tid, NULL); + + memset(&b->self_read, 0, sizeof(b->self_read)); + if(UL_INF.a[s->id+i].dd) { + free(s->seq[i]); s->seq[i] = NULL; b->num_correct_base++; + } + s->hab[tid]->num_read_base++; +} + +static void *worker_gmap_work_ovec_pip(void *data, int step, void *in) // callback for kt_pipeline() +{ + gmap_t *p = (gmap_t*)data; + if (step == 0) { // step 1: read a block of sequences + int32_t ret; uint64_t l; sstep_t *s; CALLOC(s, 1); + s->ha_flt_tab = p->ha_flt_tab; s->ha_idx = p->ha_idx; s->id = p->total_pair; + s->opt = p->opt; s->uu = p->uu; s->uopt = p->uopt; s->rg = p->rg; s->mini_cut = p->mini_cut;///need set + while ((ret = kseq_read(p->ks)) >= 0) { + if (p->ks->seq.l < (uint64_t)p->opt->k) continue; + if (s->n == s->m) { + s->m = s->m < 16? 16 : s->m + (s->n>>1); + REALLOC(s->len, s->m); + REALLOC(s->seq, s->m); + } + if(!(p->remap)) { + append_ul_t(&UL_INF, NULL, p->ks->name.s, p->ks->name.l, NULL, 0, NULL, 0, P_CHAIN_COV, s->uopt, 0); + } + l = p->ks->seq.l; + MALLOC(s->seq[s->n], l); + s->sum_len += l; + memcpy(s->seq[s->n], p->ks->seq.s, l); + s->len[s->n++] = l; + if (s->sum_len >= p->chunk_size) break; + } + p->total_pair += s->n; + if (s->sum_len == 0) free(s); + else return s; + } + else if (step == 1) { // step 2: alignment + sstep_t *s = (sstep_t*)in; uint64_t i; + CALLOC(s->hab, p->n_thread); + CALLOC(s->buf, p->n_thread); + if(!(p->remap)) CALLOC(s->res, p->n_thread);//for results + for (i = 0; i < p->n_thread; ++i) { + s->hab[i] = ha_ovec_init(0, 0, 1); s->buf[i] = mg_tbuf_init(); + } + // kt_for(p->n_thread, worker_for_ul_scall_alignment, s, s->n); + + for (i = 0; i < p->n_thread; ++i) { + s->num_bases += s->hab[i]->num_read_base; + s->num_corrected_bases += s->hab[i]->num_correct_base; + s->num_recorrected_bases += s->hab[i]->num_recorrect_base; + ha_ovec_destroy(s->hab[i]); mg_tbuf_destroy(s->buf[i]); + } + free(s->hab); free(s->buf); + return s; + } + else if (step == 2) { // step 3: dump + sstep_t *s = (sstep_t*)in; + uint64_t i, rid, sn = s->n; + p->num_bases += s->num_bases; + p->num_corrected_bases += s->num_corrected_bases; + p->num_recorrected_bases += s->num_recorrected_bases; + if(!(p->remap)) { + for (i = 0; i < p->n_thread; ++i) { + push_uc_block_t(s->uopt, &(s->res[i]), s->seq, s->len, s->id); + kv_destroy(s->res[i]); + } + free(s->res); + + for (i = 0; i < sn; ++i) { + rid = s->id + i; + if((UL_INF.n <= rid) || (UL_INF.n > rid && UL_INF.a[rid].rlen != s->len[i])) {///reads without alignment + append_ul_t(&UL_INF, &rid, NULL, 0, s->seq[i], s->len[i], NULL, 0, P_CHAIN_COV, s->uopt, 0); + } + free(s->seq[i]); + } + } else { + for (i = 0; i < sn; ++i) { + rid = s->id + i; + if(UL_INF.a[rid].dd == 0 && p->ucr_s && p->ucr_s->flag == 1) { + assert(s->seq[i]); + ///for debug interval + write_compress_base_disk(p->ucr_s->fp, rid, s->seq[i], s->len[i], &(p->ucr_s->u)); + } + free(s->seq[i]); + } + } + free(s->len); free(s->seq); free(s); + } + return 0; +} +**/ + +// int gmap_work_ovec(gmap_t* sl, const enzyme *fn) +// { +// double index_time = yak_realtime(); +// int i; + +// init_all_ul_t(&UL_INF, &R_INF); +// for (i = 0; i < fn->n; i++){ +// gzFile fp; +// if ((fp = gzopen(fn->a[i], "r")) == 0) return 0; +// sl->ks = kseq_init(fp); +// kt_pipeline(3, worker_gmap_work_ovec_pip, sl, 3); +// kseq_destroy(sl->ks); +// gzclose(fp); +// } +// sl->hits.total_base = sl->total_base; +// sl->hits.total_pair = sl->total_pair; +// fprintf(stderr, "[M::%s::%.3f] ==> Qualification\n", __func__, yak_realtime()-index_time); +// fprintf(stderr, "[M::%s::] ==> # reads: %lu, # bases: %lu\n", __func__, UL_INF.n, sl->total_base); +// fprintf(stderr, "[M::%s::] ==> # bases: %lu; # corrected bases: %lu; # recorrected bases: %lu\n", +// __func__, sl->num_bases, sl->num_corrected_bases, sl->num_recorrected_bases); +// gen_ul_vec_rid_t(&UL_INF, &R_INF, NULL); +// return 1; +// } \ No newline at end of file diff --git a/gchain_map.h b/gchain_map.h new file mode 100644 index 0000000..c2d53a4 --- /dev/null +++ b/gchain_map.h @@ -0,0 +1,6 @@ +#ifndef __INTER__ +#define __INTER__ +#include "Overlaps.h" +#include "Process_Read.h" + +#endif diff --git a/inter.cpp b/inter.cpp index f112bcf..7effda3 100644 --- a/inter.cpp +++ b/inter.cpp @@ -20,22 +20,6 @@ KSEQ_INIT(gzFile, gzread) void ha_get_ul_candidates_interface(ha_abufl_t *ab, int64_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, const ul_idx_t *uref, overlap_region_alloc *overlap_list, overlap_region_alloc *overlap_list_hp, Candidates_list *cl, double bw_thres, int max_n_chain, int keep_whole_chain, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* chain_idx, overlap_region* f_cigar, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t high_occ, void *km); -#define G_CHAIN_BW 16//128 -#define FLANK_M (0x7fffU) -#define P_CHAIN_COV 0.985 -#define P_FRAGEMENT_CHAIN_COV 0.20 -#define P_FRAGEMENT_PRIMARY_CHAIN_COV 0.70 -#define P_FRAGEMENT_PRIMARY_SECOND_COV 0.25 -#define P_CHAIN_SCORE 0.6 -#define G_CHAIN_GAP 0.1 -#define UG_SKIP 5 -#define RG_SKIP 25 -#define G_CHAIN_TRANS_RATE 0.25 -#define G_CHAIN_TRANS_WEIGHT -1 -#define G_CHAIN_INDEL 128 -#define W_CHN_PEN_GAP 0.1 -#define N_GCHAIN_RATE 0.04 -#define PRIMARY_UL_CHAIN_MIN 75000 #define MG_SEED_IGNORE (1ULL<<41) #define MG_SEED_TANDEM (1ULL<<42) @@ -85,13 +69,6 @@ KRADIX_SORT_INIT(hap_ev_cov_srt, haplotype_evdience, hap_ev_cov_key, member_size #define uc_block_t_qe_key(x) ((x).qe) KRADIX_SORT_INIT(uc_block_t_qe_srt, uc_block_t, uc_block_t_qe_key, member_size(uc_block_t, qe)); -struct mg_tbuf_s { - void *km; - int frag_gap; -}; -typedef struct mg_tbuf_s mg_tbuf_t; - - mg_tbuf_t *mg_tbuf_init(void) { mg_tbuf_t *b; @@ -112,15 +89,6 @@ void *mg_tbuf_get_km(mg_tbuf_t *b) return b->km; } -typedef struct { - int w, k, bw, max_gap, is_HPC, hap_n, occ_weight, max_gap_pre, max_gc_seq_ext, seed; - int max_lc_skip, max_lc_iter, min_lc_cnt, min_lc_score, max_gc_skip, ref_bonus; - int min_gc_cnt, min_gc_score, sub_diff, best_n; - float chn_pen_gap, mask_level, pri_ratio; - ///base-alignment - double bw_thres, diff_ec_ul, diff_ec_ul_low; int max_n_chain, ec_ul_round; -} mg_idxopt_t; - typedef struct { ///off: start idx in mg128_t * a[]; ///cnt: how many eles in this chain @@ -159,58 +127,6 @@ KRADIX_SORT_INIT(mg_pathv_t_v_srt, mg_pathv_t, mg_pathv_t_v_srt_key, member_size KRADIX_SORT_INIT(mg_pathv_t_d_srt, mg_pathv_t, mg_pathv_t_d_srt_key, member_size(mg_pathv_t, d)) -typedef struct { - int32_t off, cnt; - uint32_t v; - int32_t score; -} mg_llchain_t; - -typedef struct { - int32_t id, parent; - int32_t off, cnt; - int32_t n_anchor, score; - int32_t qs, qe; - int32_t plen, ps, pe; - int32_t blen, mlen; - float div; - uint32_t hash; - int32_t subsc, n_sub; - uint32_t mapq:8, flt:1, dummy:23; -} mg_gchain_t; - -typedef struct { - size_t n,m; - uint64_t *a, tl; - kvec_t(char) cc; -} mg_dbn_t; - -typedef struct { - int32_t cnt; - uint32_t v; - int32_t score; - uint32_t qs, qe, ts, te; -} mg_lres_t; - -typedef struct { - int32_t n_gc, n_lc; - mg_gchain_t *gc;///g_chain; idx in l_chains - mg_lres_t *lc;///l_chain - uint64_t qid, qlen; -} mg_gres_t; - -typedef struct { - size_t n,m; - mg_gres_t *a; - uint64_t total_base; - uint64_t total_pair; -} mg_gres_a; - -typedef struct { - FILE *fp; - ul_vec_t u; - uint64_t flag; -} ucr_file_t; - typedef struct { // global data structure for kt_pipeline() const void *ha_flt_tab; const ha_pt_t *ha_idx; @@ -5309,7 +5225,7 @@ static void worker_for_ul_rescall_alignment(void *data, long i, int tid) // call // if(s->id+i!=41927 && s->id+i!=47072 && s->id+i!=67641 && s->id+i!=90305 && s->id+i!=698342 && s->id+i!=329421) { // return; // } - // if(s->id+i!=43) return; + // if(s->id+i!=97) return; // fprintf(stderr, "\n[M::%s] rid::%ld, len::%lu, name::%.*s\n", __func__, s->id+i, s->len[i], // (int32_t)UL_INF.nid.a[s->id+i].n, UL_INF.nid.a[s->id+i].a); @@ -5326,7 +5242,7 @@ static void worker_for_ul_rescall_alignment(void *data, long i, int tid) // call b->self_read.seq = s->seq[i]; b->self_read.length = s->len[i]; b->self_read.size = 0; correct_ul_overlap(&b->olist, s->uu, &b->self_read, &b->correct, &b->ovlp_read, &b->POA_Graph, &b->DAGCon, &b->cigar1, &b->hap, &b->round2, &b->r_buf, &(b->tmp_region.w_list), 0, 1, &fully_cov, &abnormal, s->opt->diff_ec_ul, winLen, NULL); - + // exit(1); // uint64_t k; // for (k = 0; k < b->olist.length; k++) { // if(b->olist.list[k].is_match == 1) b->num_correct_base += b->olist.list[k].x_pos_e+1-b->olist.list[k].x_pos_s; @@ -10688,7 +10604,7 @@ ma_ug_t *ul_realignment(const ug_opt_t *uopt, asg_t *sg, uint32_t double_check_c if(!load_all_ul_t(&UL_INF, gfa_name, &R_INF, ug)/**1**/) { gen_UL_reovlps(&sl, ug, sg, gfa_name, cutoff); // exit(1); - write_all_ul_t(&UL_INF, gfa_name, ug); + // write_all_ul_t(&UL_INF, gfa_name, ug); } else if(double_check_cache){ if(drenew_UL_reovlps(&sl, ug, sg, gfa_name, cutoff)) { write_all_ul_t(&UL_INF, gfa_name, ug); diff --git a/inter.h b/inter.h index aaf9c42..77ac3eb 100644 --- a/inter.h +++ b/inter.h @@ -3,6 +3,98 @@ #include "Overlaps.h" #include "Process_Read.h" +#define G_CHAIN_BW 16//128 +#define FLANK_M (0x7fffU) +#define P_CHAIN_COV 0.985 +#define P_FRAGEMENT_CHAIN_COV 0.20 +#define P_FRAGEMENT_PRIMARY_CHAIN_COV 0.70 +#define P_FRAGEMENT_PRIMARY_SECOND_COV 0.25 +#define P_CHAIN_SCORE 0.6 +#define G_CHAIN_GAP 0.1 +#define UG_SKIP 5 +#define RG_SKIP 25 +#define G_CHAIN_TRANS_RATE 0.25 +#define G_CHAIN_TRANS_WEIGHT -1 +#define G_CHAIN_INDEL 128 +#define W_CHN_PEN_GAP 0.1 +#define N_GCHAIN_RATE 0.04 +#define PRIMARY_UL_CHAIN_MIN 75000 + +typedef struct { + int w, k, bw, max_gap, is_HPC, hap_n, occ_weight, max_gap_pre, max_gc_seq_ext, seed; + int max_lc_skip, max_lc_iter, min_lc_cnt, min_lc_score, max_gc_skip, ref_bonus; + int min_gc_cnt, min_gc_score, sub_diff, best_n; + float chn_pen_gap, mask_level, pri_ratio; + ///base-alignment + double bw_thres, diff_ec_ul, diff_ec_ul_low; int max_n_chain, ec_ul_round; +} mg_idxopt_t; + +struct mg_tbuf_s { + void *km; + int frag_gap; +}; +typedef struct mg_tbuf_s mg_tbuf_t; + + +mg_tbuf_t *mg_tbuf_init(void); + +void mg_tbuf_destroy(mg_tbuf_t *b); + +void *mg_tbuf_get_km(mg_tbuf_t *b); + +typedef struct { + FILE *fp; + ul_vec_t u; + uint64_t flag; +} ucr_file_t; + +typedef struct { + int32_t off, cnt; + uint32_t v; + int32_t score; +} mg_llchain_t; + +typedef struct { + int32_t id, parent; + int32_t off, cnt; + int32_t n_anchor, score; + int32_t qs, qe; + int32_t plen, ps, pe; + int32_t blen, mlen; + float div; + uint32_t hash; + int32_t subsc, n_sub; + uint32_t mapq:8, flt:1, dummy:23; +} mg_gchain_t; + +typedef struct { + size_t n,m; + uint64_t *a, tl; + kvec_t(char) cc; +} mg_dbn_t; + +typedef struct { + int32_t cnt; + uint32_t v; + int32_t score; + uint32_t qs, qe, ts, te; +} mg_lres_t; + +typedef struct { + int32_t n_gc, n_lc; + mg_gchain_t *gc;///g_chain; idx in l_chains + mg_lres_t *lc;///l_chain + uint64_t qid, qlen; +} mg_gres_t; + +typedef struct { + size_t n,m; + mg_gres_t *a; + uint64_t total_base; + uint64_t total_pair; +} mg_gres_a; + +void push_uc_block_t(const ug_opt_t *uopt, kv_ul_ov_t *z, char **seq, uint64_t *len, uint64_t b_id); void ul_resolve(ma_ug_t *ug, const asg_t *rg, const ug_opt_t *uopt, int hap_n); void ul_load(const ug_opt_t *uopt); uint64_t* get_hifi2ul_list(all_ul_t *x, uint64_t hid, uint64_t* a_n);