diff --git a/anchor.cpp b/anchor.cpp index 3a07f95..bad4b55 100644 --- a/anchor.cpp +++ b/anchor.cpp @@ -265,130 +265,127 @@ void calculate_ug_chaining(Candidates_list* candidates, overlap_region_alloc* ov } } -/** -void ha_get_inter_candidates(ha_abufl_t *ab, int64_t uid, ma_utg_v *ua, overlap_region_alloc *ovlp, -Candidates_list *cl, double bw_thres, int max_n_chain, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* chain_idx, -void *ha_flt_tab, ha_pt_t *ha_idx, overlap_region* f_cigar, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, -double chain_match_rate, uint32_t uk, uint32_t uw, uint32_t is_hpc, uint32_t h_occ) -{ - uint32_t i, rlen; - uint64_t k, l; - ma_utg_t *u = &(ua->a[uid]); +void ha_get_inter_candidates(ha_abufl_t *ab, uint64_t id, char* r, uint64_t rlen, uint64_t rw, uint64_t rk, uint64_t is_hpc, + overlap_region_alloc *ol, Candidates_list *cl, double bw_thres, int max_n_chain, int keep_whole_chain, + kvec_t_u8_warp* k_flag, kvec_t_u64_warp* chain_idx, void *ha_flt_tab, ha_pt_t *ha_idx, + overlap_region* f_cigar, kvec_t_u64_warp* dbg_ct, st_mt_t *sp) +{ + uint64_t i, k, l; // prepare clear_Candidates_list(cl); - clear_overlap_region_alloc(ovlp); + clear_overlap_region_alloc(ol); ab->mz.n = 0, ab->n_a = 0; - mz2_ha_sketch(u->s, u->len, uw, uk, uid, is_hpc, &ab->mz, ha_flt_tab, asm_opt.mz_sample_dist, k_flag, dbg_ct, - NULL, -1, asm_opt.dp_min_len, -1, sp, asm_opt.mz_rewin, 1); + // get the list of anchors + mz2_ha_sketch(r, rlen, rw, rk, 0, is_hpc, &ab->mz, ha_flt_tab, asm_opt.mz_sample_dist, k_flag, dbg_ct, + NULL, -1, asm_opt.dp_min_len, -1, sp, asm_opt.mz_rewin, 1); + // minimizer of queried read if (ab->mz.m > ab->old_mz_m) { - ab->old_mz_m = ab->mz.m; - REALLOC(ab->seed, ab->old_mz_m); - } - for (i = 0, ab->n_a = 0; i < ab->mz.n; ++i) { - int n; - ab->seed[i].a = ha_ptl_get(ha_idx, ab->mz.a[i].x, &n); - ab->seed[i].n = n; - ab->seed[i].cnt = ha_ft_cnt(ha_flt_tab, ab->mz.a[i].x); - ab->n_a += n; - } - if (ab->n_a > ab->m_a) { - ab->m_a = ab->n_a; - kroundup64(ab->m_a); - REALLOC(ab->a, ab->m_a); - } - for (i = 0, k = 0; i < ab->mz.n; ++i) { - int j; - ///z is one of the minimizer - ha_mzl_t *z = &ab->mz.a[i]; - seedl_t *s = &ab->seed[i]; - for (j = 0; j < s->n; ++j) { - const ha_idxposl_t *y = &s->a[j]; - anchor1_t *an = &ab->a[k++]; - uint8_t rev = z->rev == y->rev? 0 : 1; - an->other_off = y->pos; - an->self_off = rev? u->len - 1 - (z->pos + 1 - z->span) : z->pos; - an->cnt = s->cnt; - an->srt = (uint64_t)y->rid<<33 | (uint64_t)rev<<32 | an->other_off; - } - } + ab->old_mz_m = ab->mz.m; + REALLOC(ab->seed, ab->old_mz_m); + } + for (i = 0, ab->n_a = 0; i < ab->mz.n; ++i) { + int n; + ab->seed[i].a = ha_ptl_get(ha_idx, ab->mz.a[i].x, &n); + ab->seed[i].n = n; + ab->seed[i].cnt = ha_ft_cnt(ha_flt_tab, ab->mz.a[i].x); + ab->n_a += n; + } + if (ab->n_a > ab->m_a) { + ab->m_a = ab->n_a; + kroundup64(ab->m_a); + REALLOC(ab->a, ab->m_a); + } + for (i = 0, k = 0; i < ab->mz.n; ++i) { + int j; + ///z is one of the minimizer + ha_mzl_t *z = &ab->mz.a[i]; + seedl_t *s = &ab->seed[i]; + for (j = 0; j < s->n; ++j) { + const ha_idxposl_t *y = &s->a[j]; + anchor1_t *an = &ab->a[k++]; + uint8_t rev = z->rev == y->rev? 0 : 1; + an->other_off = y->pos; + an->self_off = rev? rlen - 1 - (z->pos + 1 - z->span) : z->pos; + an->cnt = s->n; + an->srt = (uint64_t)y->rid<<33 | (uint64_t)rev<<32 | an->other_off; + } + } - // sort anchors - radix_sort_ha_an1(ab->a, ab->a + ab->n_a); - for (k = 1, l = 0; k <= ab->n_a; ++k) { - if (k == ab->n_a || ab->a[k].srt != ab->a[l].srt) { - if (k - l > 1) - radix_sort_ha_an2(ab->a + l, ab->a + k); - l = k; - } - } + // sort anchors + radix_sort_ha_an1(ab->a, ab->a + ab->n_a); + for (k = 1, l = 0; k <= ab->n_a; ++k) { + if (k == ab->n_a || ab->a[k].srt != ab->a[l].srt) { + if (k - l > 1) + radix_sort_ha_an2(ab->a + l, ab->a + k); + l = k; + } + } - // copy over to _cl_ - if (ab->m_a >= (uint64_t)cl->size) { - cl->size = ab->m_a; - REALLOC(cl->list, cl->size); - } - for (k = 0; k < ab->n_a; ++k) { - k_mer_hit *p = &cl->list[k]; - p->readID = ab->a[k].srt >> 33; - p->strand = ab->a[k].srt >> 32 & 1; - p->offset = ab->a[k].other_off; - p->self_offset = ab->a[k].self_off; - p->cnt = (ab->a[k].cnt == 1? 1 : 16); - } - cl->length = ab->n_a; - // calculate_overlap_region_by_chaining(cl, ovlp, chain_idx, rid, ucr->length, &R_INF, bw_thres, keep_whole_chain, f_cigar); - // calculate_ug_chaining(cl, overlap_list, chain_idx, rid, ua, bw_thres, keep_whole_chain, f_cigar, ab->mz.n, chain_match_rate); + // copy over to _cl_ + if (ab->m_a >= (uint64_t)cl->size) { + cl->size = ab->m_a; + REALLOC(cl->list, cl->size); + } + for (k = 0; k < ab->n_a; ++k) { + k_mer_hit *p = &cl->list[k]; + p->readID = ab->a[k].srt >> 33; + p->strand = ab->a[k].srt >> 32 & 1; + p->offset = ab->a[k].other_off; + p->self_offset = ab->a[k].self_off; + p->cnt = (ab->a[k].cnt == 1? 1 : 16); + } + cl->length = ab->n_a; + calculate_overlap_region_by_chaining(cl, ol, chain_idx, id, rlen, &R_INF, bw_thres, keep_whole_chain, f_cigar); - #if 0 - if (overlap_list->length > 0) { - fprintf(stderr, "B\t%ld\t%ld\t%d\n", (long)rid, (long)overlap_list->length, rlen); - for (int i = 0; i < (int)overlap_list->length; ++i) { - overlap_region *r = &overlap_list->list[i]; - fprintf(stderr, "C\t%d\t%d\t%d\t%c\t%d\t%ld\t%d\t%d\t%c\t%d\t%d\n", (int)r->x_id, (int)r->x_pos_s, (int)r->x_pos_e, "+-"[r->x_pos_strand], - (int)r->y_id, (long)Get_READ_LENGTH(R_INF, r->y_id), (int)r->y_pos_s, (int)r->y_pos_e, "+-"[r->y_pos_strand], (int)r->shared_seed, ha_ov_type(r, rlen)); - } - } - #endif - - if ((int)ovlp->length > max_n_chain) { - int32_t w, n[4], s[4]; - n[0] = n[1] = n[2] = n[3] = 0, s[0] = s[1] = s[2] = s[3] = 0; - ks_introsort_or_ss(ovlp->length, ovlp->list); - for (i = 0; i < (uint32_t)ovlp->length; ++i) { - const overlap_region *r = &ovlp->list[i]; - w = ha_ov_type(r, rlen); - ++n[w]; - if ((int)n[w] == max_n_chain) s[w] = r->shared_seed; - } - if (s[0] > 0 || s[1] > 0 || s[2] > 0 || s[3] > 0) { - // n[0] = n[1] = n[2] = n[3] = 0; - for (i = 0, k = 0; i < (uint32_t)ovlp->length; ++i) { - overlap_region *r = &ovlp->list[i]; - w = ha_ov_type(r, rlen); - // ++n[w]; - // if (((int)n[w] <= max_n_chain) || (r->shared_seed >= s[w] && s[w] >= (asm_opt.k_mer_length<<1))) { - if (r->shared_seed >= s[w]) { - if ((uint32_t)k != i) { - overlap_region t; - t = ovlp->list[k]; - ovlp->list[k] = ovlp->list[i]; - ovlp->list[i] = t; - } - ++k; - } - } - ovlp->length = k; - } - } - ///ks_introsort_or_xs(overlap_list->length, overlap_list->list); -} -**/ + #if 0 + if (ol->length > 0) { + fprintf(stderr, "B\t%ld\t%ld\t%d\n", (long)rid, (long)ol->length, rlen); + for (int i = 0; i < (int)ol->length; ++i) { + overlap_region *r = &ol->list[i]; + fprintf(stderr, "C\t%d\t%d\t%d\t%c\t%d\t%ld\t%d\t%d\t%c\t%d\t%d\n", (int)r->x_id, (int)r->x_pos_s, (int)r->x_pos_e, "+-"[r->x_pos_strand], + (int)r->y_id, (long)Get_READ_LENGTH(R_INF, r->y_id), (int)r->y_pos_s, (int)r->y_pos_e, "+-"[r->y_pos_strand], (int)r->shared_seed, ha_ov_type(r, rlen)); + } + } + #endif + + if ((int)ol->length > max_n_chain) { + int32_t w, n[4], s[4]; + n[0] = n[1] = n[2] = n[3] = 0, s[0] = s[1] = s[2] = s[3] = 0; + ks_introsort_or_ss(ol->length, ol->list); + for (i = 0; i < (uint32_t)ol->length; ++i) { + const overlap_region *r = &ol->list[i]; + w = ha_ov_type(r, rlen); + ++n[w]; + if ((int)n[w] == max_n_chain) s[w] = r->shared_seed; + } + if (s[0] > 0 || s[1] > 0 || s[2] > 0 || s[3] > 0) { + // n[0] = n[1] = n[2] = n[3] = 0; + for (i = 0, k = 0; i < (uint32_t)ol->length; ++i) { + overlap_region *r = &ol->list[i]; + w = ha_ov_type(r, rlen); + // ++n[w]; + // if (((int)n[w] <= max_n_chain) || (r->shared_seed >= s[w] && s[w] >= (asm_opt.k_mer_length<<1))) { + if (r->shared_seed >= s[w]) { + if ((uint32_t)k != i) { + overlap_region t; + t = ol->list[k]; + ol->list[k] = ol->list[i]; + ol->list[i] = t; + } + ++k; + } + } + ol->length = k; + } + } + ///ks_introsort_or_xs(overlap_list->length, overlap_list->list); +} void ha_get_ug_candidates(ha_abuf_t *ab, int64_t rid, ma_utg_t *u, ma_utg_v *ua, overlap_region_alloc *overlap_list, Candidates_list *cl, double bw_thres, int max_n_chain, int keep_whole_chain, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* chain_idx, void *ha_flt_tab, ha_pt_t *ha_idx, overlap_region* f_cigar, kvec_t_u64_warp* dbg_ct, double chain_match_rate) diff --git a/inter.cpp b/inter.cpp index ce6b2d5..e4d6315 100644 --- a/inter.cpp +++ b/inter.cpp @@ -8,12 +8,20 @@ #include "Overlaps.h" #include "CommandLines.h" #include "htab.h" +#include "Hash_Table.h" KSEQ_INIT(gzFile, gzread) typedef struct { int w, k, bw, max_gap, is_HPC, hap_n; } mg_idxopt_t; +typedef struct { + ha_abufl_t *abl; + st_mt_t sp; + Candidates_list clist; + overlap_region_alloc olist; +} ma_ov_buf_t; + typedef struct { // global data structure for kt_pipeline() const void *ha_flt_tab; const ha_pt_t *ha_idx; @@ -25,6 +33,39 @@ typedef struct { // global data structure for kt_pipeline() uint64_t total_pair; } uldat_t; +typedef struct { // data structure for each step in kt_pipeline() + const void *ha_flt_tab; + const ha_pt_t *ha_idx; + int n, m, sum_len; + uint64_t *len, id; + char **seq; + ma_ov_buf_t **mo; +} utepdat_t; + +ma_ov_buf_t **init_ma_ov_buf_t_arr(uint64_t n) +{ + uint64_t i; + ma_ov_buf_t **p = NULL; CALLOC(p, 1); CALLOC(*p, n); + for (i = 0; i < n; i++){ + kv_init((*p)[i].sp); (*p)[i].abl = ha_abufl_init(); + init_Candidates_list(&((*p)[i].clist)); + init_overlap_region_alloc(&((*p)[i].olist)); + } + return p; +} + +void destory_ma_ov_buf_t_arr(ma_ov_buf_t ***p, uint64_t n) +{ + uint64_t i; + for (i = 0; i < n; i++){ + kv_destroy((**p)[i].sp); ha_abufl_destroy((**p)[i].abl); + destory_Candidates_list(&((**p)[i].clist)); + destory_overlap_region_alloc(&((**p)[i].olist)); + } + free((**p)); free((*p)); +} + + void init_mg_opt(mg_idxopt_t *opt, int is_HPC, int k, int w, int hap_n) { opt->k = k; @@ -50,33 +91,39 @@ void uidx_destory() ha_pt_destroy(ha_idx); } +static void worker_for_ul_alignment(void *data, long i, int tid) // callback for kt_for() +{ + utepdat_t *s = (utepdat_t*)data; + ma_ov_buf_t *b = s->mo[tid]; + uint64_t len = s->len[i]; + char *r = s->seq[i]; + + +} + static void *worker_ul_pipeline(void *data, int step, void *in) // callback for kt_pipeline() { - /** uldat_t *p = (uldat_t*)data; ///uint64_t total_base = 0, total_pair = 0; if (step == 0) { // step 1: read a block of sequences - int ret1, ret2; - uint64_t l1, l2; - stepdat_t *s; + int ret; + uint64_t l; + utepdat_t *s; CALLOC(s, 1); - s->idx = p->idx; s->id = p->total_pair; s->t_ch = p->t_ch; - while (((ret1 = kseq_read(p->ks1)) >= 0)&&((ret2 = kseq_read(p->ks2)) >= 0)) + s->ha_flt_tab = p->ha_flt_tab; s->ha_idx = p->ha_idx; s->id = p->total_pair; + while ((ret = kseq_read(p->ks)) >= 0) { - if (p->ks1->seq.l < p->idx->k || p->ks2->seq.l < p->idx->k) continue; + if (p->ks->seq.l < (uint64_t)p->opt->k) continue; if (s->n == s->m) { s->m = s->m < 16? 16 : s->m + (s->n>>1); REALLOC(s->len, s->m); REALLOC(s->seq, s->m); } - - l1 = p->ks1->seq.l; l2 = p->ks2->seq.l; - MALLOC(s->seq[s->n], l1+l2); - s->sum_len += l1+l2; - memcpy(s->seq[s->n], p->ks1->seq.s, l1); - memcpy(s->seq[s->n]+l1, p->ks2->seq.s, l2); - s->len[s->n++] = (uint64_t)(l1<<32)|(uint64_t)l2; - + l = p->ks->seq.l; + MALLOC(s->seq[s->n], l); + s->sum_len += l; + memcpy(s->seq[s->n], p->ks->seq.s, l); + s->len[s->n++] = l; if (s->sum_len >= p->chunk_size) break; } p->total_pair += s->n; @@ -84,25 +131,24 @@ static void *worker_ul_pipeline(void *data, int step, void *in) // callback for else return s; } else if (step == 1) { // step 2: alignment - stepdat_t *s = (stepdat_t*)in; - CALLOC(s->pos_buf, p->n_thread); + utepdat_t *s = (utepdat_t*)in; + s->mo = init_ma_ov_buf_t_arr(p->n_thread); + /** CALLOC(s->pos, s->n); + **/ int i; - kt_for(p->n_thread, worker_for_alignment, s, s->n); + kt_for(p->n_thread, worker_for_ul_alignment, s, s->n); for (i = 0; i < s->n; ++i) { free(s->seq[i]); - p->total_base += (s->len[i]>>32) + (uint32_t)s->len[i]; + p->total_base += s->len[i]; } - free(s->seq); free(s->len); - for (i = 0; i < (int)p->n_thread; ++i) { - free(s->pos_buf[i].a.a); - } - free(s->pos_buf); + destory_ma_ov_buf_t_arr(&(s->mo), p->n_thread); return s; } else if (step == 2) { // step 3: dump - stepdat_t *s = (stepdat_t*)in; + utepdat_t *s = (utepdat_t*)in; + /** int i; for (i = 0; i < s->n; ++i) { // if(s->pos[i].a == NULL) continue; @@ -111,9 +157,9 @@ static void *worker_ul_pipeline(void *data, int step, void *in) // callback for kv_push(pe_hit, p->hits.a, s->pos[i]); } free(s->pos); + **/ free(s); } - **/ return 0; }