From 7dd4a848b93dc1c6b9403c181e2e06621139c7bb Mon Sep 17 00:00:00 2001 From: chhylp123 Date: Sun, 26 Sep 2021 13:51:13 -0400 Subject: [PATCH] debug graph --- CommandLines.cpp | 4 +- CommandLines.h | 3 +- Makefile | 3 +- Overlaps.cpp | 127 ++++- Overlaps.h | 2 +- Trio.cpp | 77 +++ htab.h | 1 + inter.cpp | 1254 ++++++++++++++++++++++++++++++++++++++++++++-- kalloc.cpp | 205 ++++++++ kalloc.h | 77 +++ kavl.h | 414 +++++++++++++++ khash.h | 635 +++++++++++++++++++++++ tovlp.cpp | 6 +- 13 files changed, 2747 insertions(+), 61 deletions(-) create mode 100644 kalloc.cpp create mode 100644 kalloc.h create mode 100644 kavl.h create mode 100644 khash.h diff --git a/CommandLines.cpp b/CommandLines.cpp index 064e2b7..7746faf 100644 --- a/CommandLines.cpp +++ b/CommandLines.cpp @@ -157,6 +157,7 @@ void init_opt(hifiasm_opt_t* asm_opt) asm_opt->hic_enzymes = NULL; asm_opt->hic_reads[0] = NULL; asm_opt->hic_reads[1] = NULL; + asm_opt->fn_bin_poy = NULL; asm_opt->ar = NULL; asm_opt->thread_num = 1; asm_opt->k_mer_length = 51; @@ -654,7 +655,7 @@ int CommandLine_process(int argc, char *argv[], hifiasm_opt_t* asm_opt) int c; - while ((c = ketopt(&opt, argc, argv, 1, "hvt:o:k:w:m:n:r:a:b:z:x:y:p:c:d:M:P:if:D:FN:1:2:3:4:l:s:O:eu", long_options)) >= 0) { + while ((c = ketopt(&opt, argc, argv, 1, "hvt:o:k:w:m:n:r:a:b:z:x:y:p:c:d:M:P:if:D:FN:1:2:3:4:5:l:s:O:eu", long_options)) >= 0) { if (c == 'h') { Print_H(asm_opt); @@ -684,6 +685,7 @@ int CommandLine_process(int argc, char *argv[], hifiasm_opt_t* asm_opt) else if (c == '2' || c == 'M') asm_opt->fn_bin_yak[1] = opt.arg; else if (c == '3') asm_opt->fn_bin_list[0] = opt.arg; else if (c == '4') asm_opt->fn_bin_list[1] = opt.arg; + else if (c == '5') asm_opt->fn_bin_poy = opt.arg; else if (c == 'x') asm_opt->max_drop_rate = atof(opt.arg); else if (c == 'y') asm_opt->min_drop_rate = atof(opt.arg); else if (c == 'p') asm_opt->small_pop_bubble_size = atoll(opt.arg); diff --git a/CommandLines.h b/CommandLines.h index 5490e8f..66f99a7 100644 --- a/CommandLines.h +++ b/CommandLines.h @@ -4,7 +4,7 @@ #include #include -#define HA_VERSION "0.16.1-r375" +#define HA_VERSION "0.16.1-r377" #define VERBOSE 0 @@ -38,6 +38,7 @@ typedef struct { char* required_read_name; char *fn_bin_yak[2]; char *fn_bin_list[2]; + char *fn_bin_poy; char *extract_list; enzyme *hic_reads[2]; enzyme *hic_enzymes; diff --git a/Makefile b/Makefile index 3d85a74..8da074f 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ INCLUDES= OBJS= CommandLines.o Process_Read.o Assembly.o Hash_Table.o \ POA.o Correct.o Levenshtein_distance.o Overlaps.o Trio.o kthread.o Purge_Dups.o \ htab.o hist.o sketch.o anchor.o extract.o sys.o ksw2_extz2_sse.o hic.o rcut.o horder.o \ - tovlp.o inter.o + tovlp.o inter.o kalloc.o EXE= hifiasm LIBS= -lz -lpthread -lm @@ -77,3 +77,4 @@ rcut.o: rcut.h horder.o: horder.h tovlp.o: tovlp.h inter.o: inter.h +kalloc.o: kalloc.h \ No newline at end of file diff --git a/Overlaps.cpp b/Overlaps.cpp index a1b794b..0c3e4c2 100644 --- a/Overlaps.cpp +++ b/Overlaps.cpp @@ -13090,6 +13090,61 @@ void hic_clean(asg_t* read_g) kv_destroy(ax); } +void update_poly_trio(uint32_t mm, uint32_t *hapS, uint32_t rn) +{ + uint32_t i; + for (i = 0; i < rn; i++) { + if(R_INF.trio_flag[i] == DROP) continue; + R_INF.trio_flag[i] = AMBIGU; + if(!hapS[i]) continue; + R_INF.trio_flag[i] = (hapS[i]&mm?FATHER:MOTHER); + } +} +void debug_hapS(uint32_t *hapS, uint32_t rn) +{ + uint32_t i, p, tot, nt, max_tot = 0, *occ = NULL, *freq = NULL; + for (i = 0; i < rn; i++) { + for (p = hapS[i], tot = 0; p; p>>=1, tot++); 
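        // Illustration: hapS[i] is a per-read bitmask in which bit (h-1) is set when
        // read i was binned to haplotype h (see ha_polybin_list below), so the loop
        // above shifts the mask right until it is empty and `tot` ends up as
        // 1 + the index of the highest set bit. A minimal sketch with a made-up mask:
        //   uint32_t p = 0x5, tot = 0;      // hypothetical read assigned to hap1 and hap3
        //   for (; p; p >>= 1, tot++);      // tot == 3, the highest haplotype id seen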
+ max_tot = MAX(max_tot, tot); + } + fprintf(stderr, "[M::%s:] ==> %u haplotypes in total\n", __func__, max_tot); + if(max_tot){ + CALLOC(occ, max_tot+1); + CALLOC(freq, max_tot+1); + for (i = 0; i < rn; i++) { + for (p = hapS[i], tot = nt = 0; p; p>>=1, tot++){ + if(p&1) occ[tot+1]++, nt++; + } + freq[nt]++; + } + for (i = 1; i <= max_tot; i++) { + fprintf(stderr, "[M::%s:] ==> # reads in hap%u: %u\n", __func__, i, occ[i]); + } + for (i = 0; i <= max_tot; i++) { + fprintf(stderr, "[M::%s:] ==> # reads within %u haplotypes: %u\n", __func__, i, freq[i]); + } + free(occ); free(freq); + } +} +void output_poly_trio(asg_t *sg, ma_sub_t* coverage_cut, char* output_file_name, ma_hit_t_alloc* sources, +ma_hit_t_alloc* reverse_sources, long long tipsLen, float tip_drop_ratio, long long stops_threshold, +R_to_U* ruIndex, float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, int is_bench, +bub_label_t* b_mask_t, uint32_t hapN) +{ + uint32_t i; + uint32_t *hapS = ha_polybin_list(&asm_opt); + // debug_hapS(hapS, sg->n_seq); + char *fp = NULL; MALLOC(fp, 100); + for (i = 0; i < hapN; i++){ + update_poly_trio(1<n_seq); + sprintf(fp, "hap%u", i+1); + output_trio_unitig_graph(sg, coverage_cut, output_file_name, FATHER, sources, reverse_sources, tipsLen, tip_drop_ratio, + stops_threshold, ruIndex, chimeric_rate, drop_ratio, max_hang, min_ovlp, is_bench, b_mask_t, fp); + } + free(fp); free(hapS); +} + + void output_contig_graph_alternative(asg_t *sg, ma_sub_t* coverage_cut, char* output_file_name, ma_hit_t_alloc* sources, R_to_U* ruIndex, int max_hang, int min_ovlp); void clean_u_trans_t_idx(kv_u_trans_t *ta, ma_ug_t *ug, asg_t *read_g); @@ -13212,9 +13267,9 @@ long long gap_fuzz, bub_label_t* b_mask_t) reduce_hamming_error(sg, sources, coverage_cut, max_hang, min_ovlp, gap_fuzz); ug_fa = output_trio_unitig_graph(sg, coverage_cut, output_file_name, FATHER, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, - 0.05, 0.9, max_hang, min_ovlp, rhits?1:0, b_mask_t); + 0.05, 0.9, max_hang, min_ovlp, rhits?1:0, b_mask_t, NULL); ug_mo = output_trio_unitig_graph(sg, coverage_cut, output_file_name, MOTHER, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, - 0.05, 0.9, max_hang, min_ovlp, rhits?1:0, b_mask_t); + 0.05, 0.9, max_hang, min_ovlp, rhits?1:0, b_mask_t, NULL); if(rhits) { ha_aware_order(rhits, sg, ug_fa, ug_mo, cov?&(cov->t_ch->k_trans):&(t_ch->k_trans), &opt, 3); @@ -13320,9 +13375,9 @@ long long gap_fuzz, bub_label_t* b_mask_t) reduce_hamming_error(sg, sources, coverage_cut, max_hang, min_ovlp, gap_fuzz); output_trio_unitig_graph(sg, coverage_cut, output_file_name, FATHER, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, - 0.05, 0.9, max_hang, min_ovlp, 0, b_mask_t); + 0.05, 0.9, max_hang, min_ovlp, 0, b_mask_t, NULL); output_trio_unitig_graph(sg, coverage_cut, output_file_name, MOTHER, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, - 0.05, 0.9, max_hang, min_ovlp, 0, b_mask_t); + 0.05, 0.9, max_hang, min_ovlp, 0, b_mask_t, NULL); } void set_trio_flag_by_cov(ma_ug_t *ug, asg_t *read_g, hap_cov_t *cov) @@ -14117,9 +14172,9 @@ bub_label_t* b_mask_t) kv_destroy(new_rtg_edges.a); output_trio_unitig_graph(sg, coverage_cut, output_file_name, FATHER, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, - 0.05, 0.9, max_hang, min_ovlp, 0, b_mask_t); + 0.05, 0.9, max_hang, min_ovlp, 0, b_mask_t, NULL); output_trio_unitig_graph(sg, coverage_cut, output_file_name, MOTHER, sources, reverse_sources, 
(asm_opt.max_short_tip*2), 0.15, 3, ruIndex, - 0.05, 0.9, max_hang, min_ovlp, 0, b_mask_t); + 0.05, 0.9, max_hang, min_ovlp, 0, b_mask_t, NULL); } ma_ug_t* merge_utg(ma_ug_t **dest, ma_ug_t **src) @@ -14186,11 +14241,11 @@ float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, bub_label_t* { ma_ug_t *ug_1 = output_trio_unitig_graph(sg, coverage_cut, output_file_name, FATHER, sources, reverse_sources, tipsLen, tip_drop_ratio, stops_threshold, ruIndex, - chimeric_rate, drop_ratio, max_hang, min_ovlp, 1, b_mask_t); + chimeric_rate, drop_ratio, max_hang, min_ovlp, 1, b_mask_t, NULL); ma_ug_t *ug_2 = output_trio_unitig_graph(sg, coverage_cut, output_file_name, MOTHER, sources, reverse_sources, tipsLen, tip_drop_ratio, stops_threshold, ruIndex, - chimeric_rate, drop_ratio, max_hang, min_ovlp, 1, b_mask_t); + chimeric_rate, drop_ratio, max_hang, min_ovlp, 1, b_mask_t, NULL); fprintf(stderr, "ug_1->u.n: %u, ug_2->u.n: %u\n", (uint32_t)ug_1->u.n, (uint32_t)ug_2->u.n); ma_ug_t *ug = merge_utg(&ug_1, &ug_2); fprintf(stderr, "ug->u.n: %u\n", (uint32_t)ug->u.n); @@ -15258,10 +15313,7 @@ hap_cov_t *cov, utg_trans_t *o) uint8_t if_primary_unitig(ma_utg_t* u, asg_t* read_g, ma_sub_t* coverage_cut, ma_hit_t_alloc* sources, R_to_U* ruIndex, uint8_t* r_flag) { - if(asm_opt.recover_atg_cov_min < 0 || asm_opt.recover_atg_cov_max < 0) - { - return 0; - } + if(asm_opt.recover_atg_cov_min < 0 || asm_opt.recover_atg_cov_max < 0) return 0; long long R_bases = 0, C_bases = 0, C_bases_primary = 0, C_bases_alter = 0; long long total_C_bases = 0, total_C_bases_primary = 0; uint32_t available_reads = 0, k, j, rId, tn, is_Unitig; @@ -15979,7 +16031,7 @@ float drop_ratio, uint32_t trio_flag, float trio_drop_rate, hap_cov_t *cov) cut_trio_tip_primary(g, ug, tipsLen, trio_flag, 0, read_g, reverse_sources, ruIndex, cov->is_r_het, 2); } - ///print_debug_gfa(read_g, ug, coverage_cut, "debug_dups", sources, ruIndex, asm_opt.max_hang_Len, asm_opt.min_overlap_Len); + // print_debug_gfa(read_g, ug, coverage_cut, "debug_dups", sources, ruIndex, asm_opt.max_hang_Len, asm_opt.min_overlap_Len); magic_trio_phasing(g, ug, read_g, coverage_cut, sources, reverse_sources, 2, ruIndex, trio_flag, trio_drop_rate); resolve_tangles(ug, read_g, reverse_sources, 20, 100, 0.05, 0.2, ruIndex, cov->is_r_het, trio_flag, drop_ratio); drop_semi_circle(ug, g, read_g, reverse_sources, ruIndex, cov->is_r_het); @@ -16870,6 +16922,31 @@ ma_hit_t_alloc* sources, R_to_U* ruIndex) return u_flag; } +void purge_dump(ma_ug_t* ug) +{ + asg_t* nsg = ug->g; + uint32_t v, n_vtx = nsg->n_seq, k, rId; + ma_utg_t *u = NULL; + for (v = 0; v < n_vtx; ++v) { + if (nsg->seq[v].del) continue; + u = &((ug)->u.a[v]); + if(u->m == 0) continue; + for (k = 0; k < u->n; k++){ + rId = u->a[k]>>33; + if(R_INF.trio_flag[rId] != AMBIGU && R_INF.trio_flag[rId] != DROP) break; + } + if(k >= u->n){ + if(u->m != 0){ + u->circ = u->end = u->len = u->m = u->n = u->start = 0; + free(u->a); + u->a = NULL; + } + asg_seq_del(nsg, v); + } + } + asg_cleanup(nsg); +} + void adjust_utg_by_trio(ma_ug_t **ug, asg_t* read_g, uint8_t flag, float drop_rate, ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, ma_sub_t* coverage_cut, @@ -16974,6 +17051,7 @@ kvec_asg_arc_t_warp* new_rtg_edges, bub_label_t* b_mask_t) set_drop_trio_flag(*ug); destory_hap_cov_t(&cov); + // purge_dump(*ug); renew_utg(ug, read_g, new_rtg_edges); } @@ -16999,10 +17077,11 @@ int debug_untig_length(ma_ug_t *g, uint32_t tipsLen, const char* name) ma_ug_t* output_trio_unitig_graph(asg_t *sg, ma_sub_t* coverage_cut, 
char* output_file_name, uint8_t flag, ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, long long tipsLen, float tip_drop_ratio, long long stops_threshold, R_to_U* ruIndex, -float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, int is_bench, bub_label_t* b_mask_t) +float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, int is_bench, bub_label_t* b_mask_t, +char *f_prefix) { char* gfa_name = (char*)malloc(strlen(output_file_name)+100); - sprintf(gfa_name, "%s.%s.p_ctg.gfa", output_file_name, (flag==FATHER?"hap1":"hap2")); + sprintf(gfa_name, "%s.%s.p_ctg.gfa", output_file_name, f_prefix?f_prefix:(flag==FATHER?"hap1":"hap2")); FILE* output_file = NULL; if(is_bench == 0) output_file = fopen(gfa_name, "w"); @@ -17043,13 +17122,13 @@ float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, int is_bench, ma_ug_print(ug, sg, coverage_cut, sources, ruIndex, (flag==FATHER?"h1tg":"h2tg"), output_file); fclose(output_file); - sprintf(gfa_name, "%s.%s.p_ctg.noseq.gfa", output_file_name, (flag==FATHER?"hap1":"hap2")); + sprintf(gfa_name, "%s.%s.p_ctg.noseq.gfa", output_file_name, f_prefix?f_prefix:(flag==FATHER?"hap1":"hap2")); output_file = fopen(gfa_name, "w"); ma_ug_print_simple(ug, sg, coverage_cut, sources, ruIndex, (flag==FATHER?"h1tg":"h2tg"), output_file); fclose(output_file); if(asm_opt.bed_inconsist_rate != 0) { - sprintf(gfa_name, "%s.%s.p_ctg.lowQ.bed", output_file_name, (flag==FATHER?"hap1":"hap2")); + sprintf(gfa_name, "%s.%s.p_ctg.lowQ.bed", output_file_name, f_prefix?f_prefix:(flag==FATHER?"hap1":"hap2")); output_file = fopen(gfa_name, "w"); ma_ug_print_bed(ug, sg, &R_INF, coverage_cut, sources, &new_rtg_edges, max_hang, min_ovlp, asm_opt.bed_inconsist_rate, (flag==FATHER?"h1tg":"h2tg"), output_file, NULL); @@ -30888,8 +30967,14 @@ ma_sub_t **coverage_cut_ptr, int debug_g) write_debug_graph(sg, sources, coverage_cut, output_file_name, n_read, reverse_sources, ruIndex); debug_gfa:; } - - if (ha_opt_triobin(&asm_opt) && ha_opt_hic(&asm_opt)) + + if(asm_opt.fn_bin_poy) + { + if(asm_opt.flag & HA_F_PARTITION) asm_opt.flag -= HA_F_PARTITION; + output_poly_trio(sg, coverage_cut, o_file, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, + 0.05, 0.9, max_hang_length, mini_overlap_length, 0, &b_mask_t, asm_opt.polyploidy); + } + else if (ha_opt_triobin(&asm_opt) && ha_opt_hic(&asm_opt)) { if(asm_opt.flag & HA_F_PARTITION) asm_opt.flag -= HA_F_PARTITION; benchmark_hic_graph(sg, coverage_cut, o_file, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, @@ -30900,10 +30985,10 @@ ma_sub_t **coverage_cut_ptr, int debug_g) if(asm_opt.flag & HA_F_PARTITION) asm_opt.flag -= HA_F_PARTITION; output_trio_unitig_graph(sg, coverage_cut, o_file, FATHER, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, - 0.05, 0.9, max_hang_length, mini_overlap_length, 0, &b_mask_t); + 0.05, 0.9, max_hang_length, mini_overlap_length, 0, &b_mask_t, NULL); output_trio_unitig_graph(sg, coverage_cut, o_file, MOTHER, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, - 0.05, 0.9, max_hang_length, mini_overlap_length, 0, &b_mask_t); + 0.05, 0.9, max_hang_length, mini_overlap_length, 0, &b_mask_t, NULL); } else if(ha_opt_hic(&asm_opt)) { diff --git a/Overlaps.h b/Overlaps.h index 9430e15..04e25cf 100644 --- a/Overlaps.h +++ b/Overlaps.h @@ -913,7 +913,7 @@ ma_ug_t* copy_untig_graph(ma_ug_t *src); ma_ug_t* output_trio_unitig_graph(asg_t *sg, ma_sub_t* coverage_cut, char* output_file_name, uint8_t flag, ma_hit_t_alloc* 
sources, ma_hit_t_alloc* reverse_sources, long long tipsLen, float tip_drop_ratio, long long stops_threshold, R_to_U* ruIndex, -float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, int is_bench, bub_label_t* b_mask_t); +float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, int is_bench, bub_label_t* b_mask_t, char *f_prefix); asg_t* copy_read_graph(asg_t *src); ma_ug_t *ma_ug_gen(asg_t *g); void ma_ug_destroy(ma_ug_t *ug); diff --git a/Trio.cpp b/Trio.cpp index 75f20f4..f03db1b 100644 --- a/Trio.cpp +++ b/Trio.cpp @@ -349,6 +349,83 @@ static void ha_triobin_list(const hifiasm_opt_t *opt) fprintf(stderr, "[M::%s::%.3f*%.2f] ==> partitioned reads with external lists\n", __func__, yak_realtime(), yak_cpu_usage()); } +inline void phrase_hstatus(char *s, char **rname, uint32_t *hid) +{ + char *p = NULL, *id = NULL; *rname = NULL; *hid = (uint32_t)-1; + uint32_t tot; + for (p = s, tot = 0; *p; ++p){ + if (*p == '\t' || *p == ' '){ + *p = 0; + if(!tot) *rname = p+1; + else break; + tot++; + } + } + for (p = s, tot = 0; *p; ++p){ + if (*p == '_') id = p + 1; + } + *hid = atoi(id); +} + +uint32_t *ha_polybin_list(const hifiasm_opt_t *opt) +{ + int64_t i; + khint_t k; + cstr_ht_t *h; + assert(R_INF.total_reads < (uint32_t)-1); + h = cstr_ht_init(); + for (i = 0; i < (int64_t)R_INF.total_reads; ++i) { + int absent; + char *str = (char*)calloc(Get_NAME_LENGTH(R_INF, i) + 1, 1); + strncpy(str, Get_NAME(R_INF, i), Get_NAME_LENGTH(R_INF, i)); + k = cstr_ht_put(h, str, &absent); + if (absent) kh_val(h, k) = i; + } + fprintf(stderr, "[M::%s::%.3f*%.2f] created the hash table for read names\n", __func__, yak_realtime(), yak_cpu_usage()); + + gzFile fp; + kstream_t *ks; + kstring_t str = {0,0,0}; + char *rname = NULL; + uint32_t hid, *ss = NULL; + int dret; + int64_t n_tot = 0, n_bin = 0; + fp = gzopen(opt->fn_bin_poy, "r"); + if (fp == 0) { + fprintf(stderr, "ERROR: failed to open file '%s'\n", opt->fn_bin_poy); + for (k = 0; k < kh_end(h); ++k) + if (kh_exist(h, k)) + free((char*)kh_key(h, k)); + cstr_ht_destroy(h); + return NULL; + } + CALLOC(ss, R_INF.total_reads); + ks = ks_init(fp); + while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) { + khint_t k; ++n_tot; + phrase_hstatus(str.s, &rname, &hid); + if((!(*rname)) || hid == (uint32_t)-1) { + fprintf(stderr, "ERROR: wrong hap status\n"); + continue; + } + k = cstr_ht_get(h, rname); + if (k != kh_end(h)) { + ss[kh_val(h, k)] |= (((uint32_t)1)<<(hid-1)); + ++n_bin; + } + } + free(str.s); + ks_destroy(ks); + gzclose(fp); + + for (k = 0; k < kh_end(h); ++k) + if (kh_exist(h, k)) + free((char*)kh_key(h, k)); + cstr_ht_destroy(h); + fprintf(stderr, "[M::%s::%.3f*%.2f] ==> partitioned reads with external lists\n", __func__, yak_realtime(), yak_cpu_usage()); + return ss; +} + void ha_triobin(const hifiasm_opt_t *opt) { memset(R_INF.trio_flag, AMBIGU, R_INF.total_reads * sizeof(uint8_t)); diff --git a/htab.h b/htab.h index 259de57..9f92171 100644 --- a/htab.h +++ b/htab.h @@ -105,6 +105,7 @@ double yak_peakrss_in_gb(void); double yak_cpu_usage(void); void ha_triobin(const hifiasm_opt_t *opt); +uint32_t *ha_polybin_list(const hifiasm_opt_t *opt); void mz1_ha_sketch(const char *str, int len, int w, int k, uint32_t rid, int is_hpc, ha_mz1_v *p, const void *hf, int sample_dist, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, ha_pt_t *pt, int min_freq, int32_t dp_min_len, float dp_e, st_mt_t *mt, int32_t ws, int32_t is_unique); void mz2_ha_sketch(const char *str, int len, int w, int k, uint32_t rid, int is_hpc, ha_mzl_v *p, const void 
*hf, int sample_dist, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, ha_pt_t *pt, int min_freq, int32_t dp_min_len, float dp_e, st_mt_t *mt, int32_t ws, int32_t is_unique); diff --git a/inter.cpp b/inter.cpp index e4d6315..a966820 100644 --- a/inter.cpp +++ b/inter.cpp @@ -3,6 +3,9 @@ #include #include #include "kseq.h" // FASTA/Q parser +#include "kavl.h" +#include "khash.h" +#include "kalloc.h" #include "kthread.h" #include "inter.h" #include "Overlaps.h" @@ -11,8 +14,55 @@ #include "Hash_Table.h" KSEQ_INIT(gzFile, gzread) +#define MG_SEED_IGNORE (1ULL<<41) +#define MG_SEED_TANDEM (1ULL<<42) +#define MG_SEED_KEPT (1ULL<<43) + +#define MG_MAX_SEG 255 +#define MG_SEED_SEG_SHIFT 48 +#define MG_SEED_SEG_MASK (0xffULL<<(MG_SEED_SEG_SHIFT)) +#define mg_seg_id(a) ((int32_t)(((a).y&MG_SEED_SEG_MASK) >> MG_SEED_SEG_SHIFT)) + +#define MG_SEED_WT_SHIFT 56 +#define MG_MAX_SHORT_K 15 + +#define MG_SHORT_K_EXT 10000 ///1000 in minigraph + + +#define generic_key(x) (x) +KRADIX_SORT_INIT(gfa64, uint64_t, generic_key, 8) + +struct mg_tbuf_s { + void *km; + int frag_gap; +}; +typedef struct mg_tbuf_s mg_tbuf_t; + + +mg_tbuf_t *mg_tbuf_init(void) +{ + mg_tbuf_t *b; + b = (mg_tbuf_t*)calloc(1, sizeof(mg_tbuf_t)); + b->km = km_init(); + return b; +} + +void mg_tbuf_destroy(mg_tbuf_t *b) +{ + if (b == 0) return; + if (b->km) km_destroy(b->km); + free(b); +} + +void *mg_tbuf_get_km(mg_tbuf_t *b) +{ + return b->km; +} + typedef struct { - int w, k, bw, max_gap, is_HPC, hap_n; + int w, k, bw, max_gap, is_HPC, hap_n, occ_weight, max_gap_pre; + int max_lc_skip, max_lc_iter, min_lc_cnt, min_lc_score; + float chn_pen_gap; } mg_idxopt_t; typedef struct { @@ -26,6 +76,7 @@ typedef struct { // global data structure for kt_pipeline() const void *ha_flt_tab; const ha_pt_t *ha_idx; const mg_idxopt_t *opt; + const ma_ug_t *ug; kseq_t *ks; int64_t chunk_size; uint64_t n_thread; @@ -34,37 +85,118 @@ typedef struct { // global data structure for kt_pipeline() } uldat_t; typedef struct { // data structure for each step in kt_pipeline() + const mg_idxopt_t *opt; const void *ha_flt_tab; const ha_pt_t *ha_idx; + const ma_ug_t *ug; int n, m, sum_len; uint64_t *len, id; char **seq; - ma_ov_buf_t **mo; + ha_mzl_v *mzs; + st_mt_t *sps; + mg_tbuf_t **buf; } utepdat_t; -ma_ov_buf_t **init_ma_ov_buf_t_arr(uint64_t n) -{ - uint64_t i; - ma_ov_buf_t **p = NULL; CALLOC(p, 1); CALLOC(*p, n); - for (i = 0; i < n; i++){ - kv_init((*p)[i].sp); (*p)[i].abl = ha_abufl_init(); - init_Candidates_list(&((*p)[i].clist)); - init_overlap_region_alloc(&((*p)[i].olist)); - } - return p; -} +typedef struct { + ///off: start idx in mg128_t * a[]; + ///cnt: how many eles in this chain + ///a[off, off+cnt) saves the eles in this chain + int32_t off, cnt:31, inner_pre:1; + ///ref_id|rev + uint32_t v; + ///chain in ref: [rs, re) + ///chain in query: [qs, qe) + int32_t rs, re, qs, qe; + ///score: chain score + int32_t score, dist_pre; + uint32_t hash_pre; +} mg_lchain_t; -void destory_ma_ov_buf_t_arr(ma_ov_buf_t ***p, uint64_t n) -{ - uint64_t i; - for (i = 0; i < n; i++){ - kv_destroy((**p)[i].sp); ha_abufl_destroy((**p)[i].abl); - destory_Candidates_list(&((**p)[i].clist)); - destory_overlap_region_alloc(&((**p)[i].olist)); - } - free((**p)); free((*p)); -} +typedef struct { + uint32_t v, d; + int32_t pre; +} mg_pathv_t; + +KHASH_MAP_INIT_INT(sp, sp_topk_t) +KHASH_MAP_INIT_INT(sp2, uint64_t) + +///mg128_t->y: weight(8)seg_id(8)flag(8)span(8)pos(32) +///mg128_t->x: rid(31)rev(1)pos(33); keep reference +typedef struct { uint64_t x, y; } mg128_t; +#define 
sort_key_128x(a) ((a).x) +KRADIX_SORT_INIT(128x, mg128_t, sort_key_128x, 8) +void radix_sort_128x(mg128_t *beg, mg128_t *end); + +typedef struct { + uint32_t n; ///length of candidate list + uint64_t q_span:31, rev:1, q_pos:32; + uint32_t qid:16, weight:15, is_tandem:1; + const ha_idxposl_t *cr; ///candidate list +} mg_match_t; + +// shortest path +typedef struct { + // input + ///(lj_ref_id)|(lj_ref_rev^1) + uint32_t v; + ///target_dist should like the overlap length in string graph + ///it should be used to evaluate if the identified path is close to real path/alignment + int32_t target_dist; + uint32_t target_hash; + ///inner: if li and lj are at the same ref id + ///meta: j + uint32_t meta:30, check_hash:1, inner:1; + /** + * There are two cases: + * (1) lj->qs************lj->qe + * li->qs************li->qe + * (2) lj->qs************lj->qe + * li->qs************li->qe + * qlen = li->qs - lj->qe;///might be negative + * **/ + int32_t qlen; + // output + uint32_t n_path:31, is_0:1;///I guess n_path is how many path from src to dest + int32_t path_end;///looks like an idx to alignment + int32_t dist, mlen; + uint32_t hash; + // aux + uint64_t srt_key; +} mg_path_dst_t; + +typedef struct { + uint32_t srt; + int32_t i; +} gc_frag_t; +///I think this structure is just used for iteration +///iterate each ref id, instead of alignment id +typedef struct sp_node_s { + uint64_t di; // dist<<32 | node_id in avl tree(doesn't matter too much) + uint32_t v;///ref_id|rev + int32_t pre; + uint32_t hash; + int32_t is_0; + KAVL_HEAD(struct sp_node_s) head; +} sp_node_t, *sp_node_p; + +typedef struct { + int32_t k, mlen;//k: number of walks from src to this node + int32_t qs, qe; + sp_node_t *p[MG_MAX_SHORT_K]; // this forms a max-heap +} sp_topk_t; + +#define gc_frag_key(p) ((p).srt) +KRADIX_SORT_INIT(gc, gc_frag_t, gc_frag_key, 4) + +#define dst_key(p) ((p).srt_key) +KRADIX_SORT_INIT(dst, mg_path_dst_t, dst_key, 8) + +#define sp_node_cmp(a, b) (((a)->di > (b)->di) - ((a)->di < (b)->di)) +KAVL_INIT(sp, sp_node_t, head, sp_node_cmp) + +#define sp_node_lt(a, b) ((a)->di < (b)->di) +KSORT_INIT(sp, sp_node_p, sp_node_lt) void init_mg_opt(mg_idxopt_t *opt, int is_HPC, int k, int w, int hap_n) { @@ -74,6 +206,14 @@ void init_mg_opt(mg_idxopt_t *opt, int is_HPC, int k, int w, int hap_n) opt->is_HPC = is_HPC; opt->bw = 2000; opt->max_gap = 5000; + opt->occ_weight = 20; + opt->max_gap_pre = 10000;///1000 in minigraph + opt->max_lc_iter = 10000; + opt->chn_pen_gap = 0.19;///using minimap2's value + opt->max_lc_skip = 25;// mo->max_gc_skip = 25; + opt->max_lc_iter = 10000; + opt->min_lc_cnt = 2; + opt->min_lc_score = 30; } void uidx_build(ma_ug_t *ug, mg_idxopt_t *opt) @@ -91,13 +231,1052 @@ void uidx_destory() ha_pt_destroy(ha_idx); } + +///only use non-repetitive minimizers +static mg_match_t *collect_matches(void *km, int *_n_m, int max_occ, const void *ha_flt_tab, const ha_pt_t *ha_idx, int check_unique, const ha_mzl_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, int32_t **mini_pos) +{ + int rep_st = 0, rep_en = 0, n_m, tn, tw; + size_t i; + mg_match_t *m; + *n_mini_pos = 0; + KMALLOC(km, *mini_pos, mv->n);///mv->n how many minimizers in query + m = (mg_match_t*)kmalloc(km, mv->n * sizeof(mg_match_t)); + for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < mv->n; ++i) { + const ha_idxposl_t *cr; + ha_mzl_t *z = &mv->a[i]; + cr = ha_ptl_get(ha_idx, z->x, &tn); + tw = ha_ft_cnt(ha_flt_tab, z->x); + if (tw > max_occ) { ///the frequency of repetitive regions; ignore those minimizers + int en = z->pos + 1, st = en 
- z->span;//[st, en) + if (st > rep_en) { ///just record the length of repetive regions + *rep_len += rep_en - rep_st; + rep_st = st, rep_en = en; + } else rep_en = en; + } else { + mg_match_t *q = &m[n_m++]; + q->q_pos = z->pos, q->q_span = z->span, q->rev = z->rev, q->cr = cr, q->n = tn, q->qid = 0; + q->is_tandem = 0, q->weight = 255; + if(check_unique && tw != 1) q->is_tandem = 1, q->weight = 15; + *n_a += q->n;///how many candidates + (*mini_pos)[(*n_mini_pos)++] = z->pos;///minimizer offset in query + } + } + *rep_len += rep_en - rep_st; ///the length of repetitive regions + *_n_m = n_m; + return m; +} + +mg128_t *collect_seed_hits(void *km, const mg_idxopt_t *opt, int max_occ, const void *ha_flt_tab, const ha_pt_t *ha_idx, +const ma_ug_t *ug, const ha_mzl_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, int32_t **mini_pos) +{ + int i, n_m; + mg128_t *a = NULL; + mg_match_t *m = collect_matches(km, &n_m, max_occ, ha_flt_tab, ha_idx, 1, mv, n_a, rep_len, n_mini_pos, mini_pos); + a = (mg128_t*)kmalloc(km, *n_a * sizeof(mg128_t));///n_a: how many available candidates in total + for (i = 0, *n_a = 0; i < n_m; ++i) {///n_m: how many available seeds, instead of candidates + mg_match_t *q = &m[i]; + const ha_idxposl_t *r = q->cr; + uint32_t k; + for (k = 0; k < q->n; ++k) {///q->n: number of candidates belonging to seed m[i] + mg128_t *p; + p = &a[(*n_a)++];///pick up a slot for one candidate + if (r[k].rev == q->rev) // forward strand + p->x = (uint64_t)(r[k].rid)<<33|r[k].pos; ///reference: rid(31)|rev(1)|pos(32) + else // reverse strand + p->x = (uint64_t)(r[k].rid)<<33 | 1ULL<<32 | (ug->g->seq[r[k].rid].len - (r[k].pos + 1 - r[k].span) - 1); + p->y = (uint64_t)q->q_span << 32 | q->q_pos; + p->y |= (uint64_t)q->qid << MG_SEED_SEG_SHIFT; + if (q->is_tandem) p->y |= MG_SEED_TANDEM; + p->y |= (uint64_t)q->weight << MG_SEED_WT_SHIFT; + ///p->y: weight(8)seg_id(8)flag(8)span(8)pos(32) + ///p->x: rid(31)rev(1)pos(33); keep reference + } + } + kfree(km, m); + radix_sort_128x(a, a + (*n_a)); + return a; +} + +///r is 1000 in default +///remove isolated hits, whic are not close enough to others +static int64_t flt_anchors(int64_t n_a, mg128_t *a, int32_t r) +{ + int64_t i, j; + for (i = 0; i < n_a; ++i) { + for (j = i - 1; j >= 0; --j) { + /** + * a is sorted by x + * a[].x: ref_id(31)rev(1)r_pos(32) + * a[].y: weight(8)query_id(8)flag(8)span(8)q_pos(32) + **/ + int32_t dq; + int64_t dr = a[i].x - a[j].x;///a is sorted by x + if (dr > r) break;///if two candidates coming from differnt unitigs, dr would be extremly large + dq = (int32_t)a[i].y - (int32_t)a[j].y; + if (dq > r || dq < 0) continue; + a[j].y |= MG_SEED_KEPT; + a[i].y |= MG_SEED_KEPT; + break; + } + } + for (i = n_a - 1; i >= 0; --i) { + if (a[i].y & MG_SEED_KEPT) continue; + for (j = i + 1; j < n_a; ++j) { + int32_t dq; + int64_t dr = a[j].x - a[i].x; + if (dr > r) break; + dq = (int32_t)a[j].y - (int32_t)a[i].y; + if (dq > r || dq < 0) continue; + a[j].y |= MG_SEED_KEPT; + a[i].y |= MG_SEED_KEPT; + break; + } + } + for (i = j = 0; i < n_a; ++i) + if (a[i].y & MG_SEED_KEPT) + a[j++] = a[i]; + return j; +} + +static inline float mg_log2(float x) // NB: this doesn't work when x<2 +{ + union { float f; uint32_t i; } z = { x }; + float log_2 = ((z.i >> 23) & 255) - 128; + z.i &= ~(255 << 23); + z.i += 127 << 23; + log_2 += (-0.34484843f * z.f + 2.02466578f) * z.f - 0.67487759f; + return log_2; +} + +inline int32_t normal_sc(uint64_t w, int32_t sc) +{ + if(w < 255){ + int32_t tmp = (int)(0.00392156862745098 * w * sc); // 0.00392... 
= 1/255 + sc = tmp > 1? tmp : 1; + } + return sc; +} +// ai[].x: ref_id(31)rev(1)r_pos(32) +// ai[].y: weight(8)query_id(8)flag(8)span(8)q_pos(32) +// comput_sc(&a[i], &a[j], max_dist_x, max_dist_y, bw, chn_pen_gap, chn_pen_skip, is_cdna, n_segs); +static inline int32_t comput_sc(const mg128_t *ai, const mg128_t *aj, int32_t max_dist_x, int32_t max_dist_y, int32_t bw, float chn_pen_gap) +{ + int32_t dq = (int32_t)ai->y - (int32_t)aj->y, dr = (int32_t)ai->x - (int32_t)aj->x, dd, dg, q_span, sc; + ///ai and aj has already been sorted by x + ///which means ai->x >= aj->x + if (dq <= 0 || dq > max_dist_x) return INT32_MIN; + if (dr <= 0 || dr > max_dist_y) return INT32_MIN; + dd = dr > dq? dr - dq : dq - dr; ///indel, dd is always >= 0 + if (dd > bw) return INT32_MIN; + dg = dr < dq? dr : dq;///MIN(dr, dq) + q_span = aj->y>>32&0xff;///query span; should be ai->y>>32&0xff, is it a bug? + sc = normal_sc(aj->y>>MG_SEED_WT_SHIFT, (q_span q_span: there are some bases that are not covered between ai and aj + ///it is if (dd || dg > q_span) in minigraph + if (dd) { + float lin_pen, log_pen; + lin_pen = chn_pen_gap * (float)dd; + log_pen = dd >= 2? mg_log2(dd) : 0.0f; // mg_log2() only works for dd>=2 + sc -= (int)(lin_pen + log_pen); + } + return sc; +} + + +///p[]: id of last +///f[]: the score ending at i, not always the peak +///v[]: keeps the peak score up to i; +///t[]: id of next +///min_cnt = 2; min_sc = 30; extra_u = 0 +///u = mg_chain_backtrack(n, f, p, v, t, min_cnt, min_sc, 0, &n_u, &n_v); +uint64_t *mg_chain_backtrack(void *km, int64_t n, const int32_t *f, const int64_t *p, int32_t *v, int32_t *t, int32_t min_cnt, int32_t min_sc, int32_t extra_u, int32_t *n_u_, int32_t *n_v_) +{ + mg128_t *z; + uint64_t *u; + int64_t i, k, n_z, n_v; + int32_t n_u; + // v[] keeps the peak score up to i; f[] is the score ending at i, not always the peak + *n_u_ = *n_v_ = 0; + for (i = 0, n_z = 0; i < n; ++i) // precompute n_z + if (f[i] >= min_sc) ++n_z; + if (n_z == 0) return 0; + KMALLOC(km, z, n_z); + for (i = 0, k = 0; i < n; ++i) // populate z[] + if (f[i] >= min_sc) z[k].x = f[i], z[k++].y = i; + radix_sort_128x(z, z + n_z);///sort by score + + memset(t, 0, n * 4);///t is a buffer + ///from the largest to the smallest + for (k = n_z - 1, n_v = n_u = 0; k >= 0; --k) { // precompute n_u + int64_t n_v0 = n_v; + int32_t sc; + ///note t[i] == 0 is not used to find local alignment + ///say if we have already found a long chain, then the secondary might be able to merged to the long chain + ///t[i] == 0 is used to find those chains + for (i = z[k].y; i >= 0 && t[i] == 0; i = p[i]) + ++n_v, t[i] = 1; + sc = i < 0? z[k].x : (int32_t)z[k].x - f[i]; + if (sc >= min_sc && n_v > n_v0 && n_v - n_v0 >= min_cnt) + ++n_u;///how many chains, including primary chains and non-primary chains + else n_v = n_v0; + } + KMALLOC(km, u, n_u + extra_u); + memset(t, 0, n * 4); + for (k = n_z - 1, n_v = n_u = 0; k >= 0; --k) { // populate u[] + int64_t n_v0 = n_v; + int32_t sc; + for (i = z[k].y; i >= 0 && t[i] == 0; i = p[i]) + v[n_v++] = i, t[i] = 1; + sc = i < 0? 
z[k].x : (int32_t)z[k].x - f[i]; + if (sc >= min_sc && n_v > n_v0 && n_v - n_v0 >= min_cnt) + u[n_u++] = (uint64_t)sc << 32 | (n_v - n_v0); + else n_v = n_v0; + } + kfree(km, z); + assert(n_v < INT32_MAX); + *n_u_ = n_u, *n_v_ = n_v; + return u; +} + +//u[]: sc|occ of chains +//v[]: idx of each element +static mg128_t *compact_a(void *km, int32_t n_u, uint64_t *u, int32_t n_v, int32_t *v, mg128_t *a) +{ + mg128_t *b, *w; + uint64_t *u2; + int64_t i, j, k; + + // write the result to b[] + KMALLOC(km, b, n_v); + for (i = 0, k = 0; i < n_u; ++i) { + int32_t k0 = k, ni = (int32_t)u[i]; + for (j = 0; j < ni; ++j) + b[k++] = a[v[k0 + (ni - j - 1)]];///write all elements of a chain together + } + kfree(km, v); + + // sort u[] and a[] by the target position, such that adjacent chains may be joined + KMALLOC(km, w, n_u); + for (i = k = 0; i < n_u; ++i) {///n_u: how many chains + ///x: ref_id(31)rev(1)r_pos(32) + w[i].x = b[k].x, w[i].y = (uint64_t)k<<32|i; + k += (int32_t)u[i]; + } + radix_sort_128x(w, w + n_u);///sort by ref_id(31)rev(1)r_pos(32); r_pos is the start pos of chain + KMALLOC(km, u2, n_u); + for (i = k = 0; i < n_u; ++i) {///note merge chain; just place close chains together + ///j is chain id; n is how many elements in j-th chain + int32_t j = (int32_t)w[i].y, n = (int32_t)u[j]; + u2[i] = u[j]; + memcpy(&a[k], &b[w[i].y>>32], n * sizeof(mg128_t)); + k += n; + } + memcpy(u, u2, n_u * 8); + memcpy(b, a, k * sizeof(mg128_t)); // write _a_ to _b_ and deallocate _a_ because _a_ is oversized, sometimes a lot + kfree(km, a); kfree(km, w); kfree(km, u2); + return b; +} + +/* Input: + * a[].x: ref_id(31)rev(1)r_pos(32) + * a[].y: weight(8)query_id(8)flag(8)span(8)q_pos(32) + * n: length of a[] + * Output: + * n_u: #chains + * u[]: score<<32 | #anchors (sum of lower 32 bits of u[] is the returned length of a[]) + * input a[] is deallocated on return + */ +///is_cdna is is_splice +mg128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float chn_pen_gap, + int64_t n, mg128_t *a, int *n_u_, uint64_t **_u, void *km) +{ // TODO: make sure this works when n has more than 32 bits + int32_t *f, *t, *v, n_u, n_v; + int64_t *p, i, j, max_ii, st = 0; + uint64_t *u; + + if (_u) *_u = 0, *n_u_ = 0; + if (n == 0 || a == 0) return 0; + KMALLOC(km, p, n);///id of last cell + KMALLOC(km, f, n);///f[] is the score ending at i, not always the peak + KMALLOC(km, v, n);///v[] keeps the peak score up to i; + KCALLOC(km, t, n);///t doesn't matter too much; it is mainly used to accelrate the iteration + + // a[].x: ref_id(31)rev(1)r_pos(32) + // a[].y: weight(8)query_id(8)flag(8)span(8)q_pos(32) + // fill the score and backtrack arrays + for (i = st = 0, max_ii = -1; i < n; ++i) { + int64_t max_j = -1, end_j; + ///max_f -> minimizer span length in query, which is the initial score + int32_t max_f = normal_sc(a[i].y>>MG_SEED_WT_SHIFT, a[i].y>>32&0xff), n_skip = 0; + ///until we are at the same rid, same direction, and the coordinates are close enough + while (st < i && (a[i].x>>32 != a[st].x>>32 || a[i].x > a[st].x + max_dist_x)) ++st; + ///max_iter = 10000 in default, which means dp can go back to up to 10000 cells + if (i - st > max_iter) st = i - max_iter; + for (j = i - 1; j >= st; --j) { + int32_t sc; + sc = comput_sc(&a[i], &a[j], max_dist_x, max_dist_y, bw, chn_pen_gap); + if (sc == INT32_MIN) continue; + sc += f[j]; + if (sc > max_f) { + max_f = sc, max_j = j; + if (n_skip > 0) --n_skip; + } else if (t[j] == (int32_t)i) {///note we scan j backwards; 
we don't need to update t[] for each i + if (++n_skip > max_skip) + break; + } + if (p[j] >= 0) t[p[j]] = i; + } + end_j = j;///end_j might be > 0 + ///if not close enough, select a new max + if (max_ii < 0 || (int64_t)(a[i].x - a[max_ii].x) > (int64_t)max_dist_x) {///select a new max + int32_t max = INT32_MIN; + max_ii = -1; + for (j = i - 1; j >= st; --j) + if (max < f[j]) max = f[j], max_ii = j; + } + ///note: it will happen when `max_ii` < `end_j`; + ///iteration is terminated at `end_j` mostly because of `max_skip` and `max_iter` + if (max_ii >= 0 && max_ii < end_j) { + int32_t tmp; + tmp = comput_sc(&a[i], &a[max_ii], max_dist_x, max_dist_y, bw, chn_pen_gap); + if (tmp != INT32_MIN && max_f < tmp + f[max_ii]) + max_f = tmp + f[max_ii], max_j = max_ii; + } + // v[] keeps the peak score up to i; f[] is the score ending at i, not always the peak + f[i] = max_f, p[i] = max_j; + v[i] = max_j >= 0 && v[max_j] > max_f? v[max_j] : max_f; + if (max_ii < 0 || ((int64_t)(a[i].x - a[max_ii].x) <= (int64_t)max_dist_x && f[max_ii] < f[i])) + max_ii = i; + } + ///after mg_chain_backtrack, the results are saved in u and v; + u = mg_chain_backtrack(km, n, f, p, v, t, min_cnt, min_sc, 0, &n_u, &n_v); + *n_u_ = n_u, *_u = u; // NB: note that u[] may not be sorted by score here + kfree(km, p); kfree(km, f); kfree(km, t); + if (n_u == 0) { + kfree(km, a); kfree(km, v); + return 0; + } + //u[]: sc|occ of chains + //v[]: idx of each element + return compact_a(km, n_u, u, n_v, v, a); +} + +///qlen: query length +///u[]: sc|occ of chains +///a[]: candidate list +mg_lchain_t *mg_lchain_gen(void *km, int qlen, int n_u, uint64_t *u, mg128_t *a) +{ + mg128_t *z; + mg_lchain_t *r; + int i, k; + + if (n_u == 0) return 0; + KCALLOC(km, r, n_u); KMALLOC(km, z, n_u); + // u[] is sorted by query position + for (i = k = 0; i < n_u; ++i) { + /** + * a[].x: ref_id(31)rev(1)r_pos(32) + * a[].y: weight(8)query_id(8)flag(8)span(8)q_pos(32) + **/ + ///u[]: sc(32)occ(32) + int32_t qs = (int32_t)a[k].y + 1 - (a[k].y>>32 & 0xff); + z[i].x = (uint64_t)qs << 32 | u[i] >> 32; + z[i].y = (uint64_t)k << 32 | (int32_t)u[i]; + k += (int32_t)u[i]; + } + radix_sort_128x(z, z + n_u);//sort by qs|sc + + // populate r[] + for (i = 0; i < n_u; ++i) { + mg_lchain_t *ri = &r[i]; + /** + * z[].x: query start pos| chain score + * z[].y: idx in a[] | chain occ + * a[].x: ref_id(31)rev(1)r_pos(32) + * a[].y: weight(8)query_id(8)flag(8)span(8)q_pos(32) + * **/ + int32_t k = z[i].y >> 32, q_span = a[k].y >> 32 & 0xff; + ri->off = k; + ri->cnt = (int32_t)z[i].y; + ri->score = (uint32_t)z[i].x; + ri->v = a[k].x >> 32;///ref_id|rev + ri->rs = (int32_t)a[k].x + 1 > q_span? 
(int32_t)a[k].x + 1 - q_span : 0; // for HPC k-mer + ri->qs = z[i].x >> 32; + ri->re = (int32_t)a[k + ri->cnt - 1].x + 1; + ri->qe = (int32_t)a[k + ri->cnt - 1].y + 1; + } + kfree(km, z); + return r; +} + +static int32_t get_mini_idx(const mg128_t *a, int32_t n, const int32_t *mini_pos) +{ + int32_t x, L = 0, R = n - 1; + x = (int32_t)a->y; + while (L <= R) { // binary search + int32_t m = ((uint64_t)L + R) >> 1; + int32_t y = mini_pos[m]; + if (y < x) L = m + 1; + else if (y > x) R = m - 1; + else return m; + } + return -1; +} + +/* Before: + * a[].x: ref_id(31)rev(1)r_pos(32) + * a[].y: weight(8)query_id(8)flag(8)span(8)q_pos(32) + * After: + * a[].x: idx_in_minimizer_arr(32)r_pos(32) + * a[].y: weight(8)query_id(8)flag(8)span(8)q_pos(32) + */ +void mg_update_anchors(int32_t n_a, mg128_t *a, int32_t n, const int32_t *mini_pos) +{ + int32_t st, j, k; + if (n_a <= 0) return; + st = get_mini_idx(&a[0], n, mini_pos); + assert(st >= 0); + for (k = 0, j = st; j < n && k < n_a; ++j) + if ((int32_t)a[k].y == mini_pos[j]) + a[k].x = (uint64_t)j << 32 | (a[k].x & 0xffffffffU), ++k; + assert(k == n_a); +} + +static int32_t find_max(int32_t n, const gc_frag_t *gf, uint32_t x) +{ + int32_t s = 0, e = n; + if (n == 0) return -1; + if (gf[n-1].srt < x) return n - 1; + if (gf[0].srt >= x) return -1; + while (e > s) { // TODO: finish this block + int32_t m = s + (e - s) / 2; + if (gf[m].srt >= x) e = m; + else s = m + 1; + } + assert(s == e); + return s; +} +///target_dist should like the overlap length in string graph +///it should be used to evaluate if the identified path is close to real path/alignment +static int32_t mg_target_dist(const asg_t *g, const mg_lchain_t *l0, const mg_lchain_t *l1) +{ + /** + case 1: l0->qs************l0->qe + l1->qs************l1->qe + case 2: l0->qs************l0->qe + l1->qs************l1->qe + + *****l0->rs************l0->re***** + ****l1->rs************l1->re** + * **/ + ///min_dist = l1->rs + (g->seg[l0->v>>1].len - l0->re); + // below equals (l1->qs - l0->qe) - min_dist + g->seg[l1->v>>1].len; see mg_gchain1_dp() for the calculation of min_dist + //(l1->qs - l0->qe) is the gap in query, min_dist is the gap in reference + return (l1->qs - l0->qe) - (g->seq[l0->v>>1].len - l0->re) + (g->seq[l1->v>>1].len - l1->rs); + // when l0->v == l1->v, the above becomes (l1->qs - l0->qe) - (l1->rs - l0->re), which is what we want +} + +static inline sp_node_t *gen_sp_node(void *km, uint32_t v, int32_t d, int32_t id) +{ + sp_node_t *p; + KMALLOC(km, p, 1); + p->v = v, p->di = (uint64_t)d<<32 | id, p->pre = -1, p->is_0 = 1; + return p; +} + +///max_dist is like the overlap length in string graph +///mg_shortest_k(km, g, li->v^1, n_dst, dst, max_dist_g + (g->seg[li->v>>1].len - li->rs), MG_MAX_SHORT_K, 0, 0, 1, 0); +///the end position of qs is li->qs; dst[]->->qlen indicate the region that need to be checked in bases +mg_pathv_t *mg_shortest_k(void *km0, const asg_t *g, uint32_t src, int32_t n_dst, mg_path_dst_t *dst, int32_t max_dist, int32_t max_k, int32_t ql, const char *qs, int is_rev, int32_t *n_pathv) +{ + sp_node_t *p, *root = 0, **out; + sp_topk_t *q; + khash_t(sp) *h; + khash_t(sp2) *h2; + void *km; + khint_t k; + int absent; + int32_t i, j, n_done, n_found, n_seeds = 0; + uint32_t id, n_out, m_out; + int8_t *dst_done; + mg_pathv_t *ret = 0; + uint64_t *dst_group, *seeds = 0; + /** //path + void *h_seeds = 0; + mg128_v mini = {0,0,0}; + **/ + + if (n_pathv) *n_pathv = 0;///for us, n_pathv = NULL + if (n_dst <= 0) return 0;///n_dst: how many candidate nodes + for (i = 0; i 
< n_dst; ++i) { // initialize + mg_path_dst_t *t = &dst[i]; + ///if src and dest are at the same ref id, there are already one path + if (t->inner)///if two chains are at the same ref id + t->dist = 0, t->n_path = 1, t->path_end = -1; + else + t->dist = -1, t->n_path = 0, t->path_end = -1; + } + if (max_k > MG_MAX_SHORT_K) max_k = MG_MAX_SHORT_K; + km = km_init2(km0, 0x4000); + + /** //path + ///for the first time, we just check th reachability without sequence (qs); + ///but for the second round, we need to check sequence + ///qs is the sequence between two minimizers + if (ql > 0 && qs) { // build the seed hash table for the query + mg_sketch(km, qs, ql, MG_SHORT_KW, MG_SHORT_KK, 0, &mini); + // mini->a[].x = hash_key<<8 | kmerSpan + // mini->a[].y = rid<<32 | lastPos<<1 | strand + if (is_rev)///is_rev = 1; + for (i = 0; i < mini.n; ++i)///reverse qs[0, ql) to qs(ql, 0] + mini.a[i].y = (ql - (((int32_t)mini.a[i].y>>1) + 1 - MG_SHORT_KK) - 1) << 1 | ((mini.a[i].y&1)^1); + ///h_seeds is the ordinary hash index + h_seeds = mg_idx_a2h(km, mini.n, mini.a, 0, &seeds, &n_seeds); + ///h_seeds+seeds+n_seeds ----> hash index of qs[0, ql) + } + **/ + + KCALLOC(km, dst_done, n_dst); + KMALLOC(km, dst_group, n_dst); + // multiple dst[] may have the same dst[].v. We need to group them first. + // in other words, one ref id may have multiple dst alignment chains + for (i = 0; i < n_dst; ++i) + dst_group[i] = (uint64_t)dst[i].v<<32 | i; + radix_sort_gfa64(dst_group, dst_group + n_dst); + + h2 = kh_init2(sp2, km); // this hash table keeps all destinations from the same ref id + kh_resize(sp2, h2, n_dst * 2); + ///please note that one contig in ref may have multiple alignment chains + ///so h2 is a index that helps us to query it + ///key(h2) = ref id; value(h2) = start_idx | occ + for (i = 1, j = 0; i <= n_dst; ++i) { + if (i == n_dst || dst_group[i]>>32 != dst_group[j]>>32) { + k = kh_put(sp2, h2, dst_group[j]>>32, &absent); + kh_val(h2, k) = (uint64_t)j << 32 | (i - j); + assert(absent); + j = i; + } + } + + + + h = kh_init2(sp, km); // this hash table keeps visited vertices; path to each visited vertice + kh_resize(sp, h, 16); + + m_out = 16, n_out = 0;///16 is just the initial size + KMALLOC(km, out, m_out); + + /** + typedef struct { + int32_t k, mlen;//k: number of walks from src to this node + int32_t qs, qe; + sp_node_t *p[MG_MAX_SHORT_K]; // this forms a max-heap; all path + } sp_topk_t; + **/ + id = 0; + p = gen_sp_node(km, src, 0, id++);///just malloc a node for src; the distance is 0 + p->hash = __ac_Wang_hash(src); + kavl_insert(sp, &root, p, 0);///should be avl tree + + k = kh_put(sp, h, src, &absent);///here is a hash table + q = &kh_val(h, k); + q->k = 1, q->p[0] = p, q->mlen = 0, q->qs = q->qe = -1; + + n_done = 0; + ///the key of avl tree: #define sp_node_cmp(a, b) (((a)->di > (b)->di) - ((a)->di < (b)->di)) + ///the higher bits of (*)->di is distance to src node + ///so the key of avl tree is distance + while (kavl_size(head, root) > 0) {///thr first root is src + int32_t i, nv; + asg_arc_t *av; + sp_node_t *r; + ///note that one node might be visited multiple times if there are circles + ///delete the first node + r = kavl_erase_first(sp, &root); // take out the closest vertex in the heap (as a binary tree) + //fprintf(stderr, "XX\t%d\t%d\t%d\t%c%s[%d]\t%d\n", n_out, kavl_size(head, root), n_finished, "><"[(r->v&1)^1], g->seg[r->v>>1].name, r->v, (int32_t)(r->di>>32)); + if (n_out == m_out) KEXPAND(km, out, m_out); + ///higher 32 bits might be the distance to root node + // lower 32 
bits now for position in the out[] array + r->di = r->di>>32<<32 | n_out; ///n_out is just the id in out + ///so one node id in graph might be saved multiple times in avl tree and out[] + out[n_out++] = r;///out[0] = src + + ///r->v is the dst vertex id + ///sometimes k==kh_end(h2). Some nodes are found by graph travesal but not in linear chain alignment + k = kh_get(sp2, h2, r->v); + // we have reached one dst vertex + // note that one dst vertex may have multipe alignment chains + // we can visit some nodes in graph which are not reachable during chaining + // h2 is used to determine if one node is reachable or not + if (k != kh_end(h2)) { + ///node r->v might be visited multiple times + int32_t j, dist = r->di>>32, off = kh_val(h2, k) >> 32, cnt = (int32_t)kh_val(h2, k); + //src can reach ref id r->v; there might be not only one alignment chain in r->v + //so we need to scan all of them + for (j = 0; j < cnt; ++j) { + mg_path_dst_t *t = &dst[(int32_t)dst_group[off + j]]; + int32_t done = 0; + ///the src and dest are at the same ref id, say we directly find the shortest path + if (t->inner) { + done = 1; + } else { + int32_t mlen = 0, copy = 0; + ///in the first round, we just check reachability without sequence + ///so h_seeds = NULL; we can assume mlen = 0 + /** //path + mlen = h_seeds? path_mlen(out, n_out - 1, h, t->qlen) : 0; + **/ + //if (mg_dbg_flag & MG_DBG_GC1) fprintf(stderr, " src=%c%s[%d],qlen=%d\tdst=%c%s[%d]\ttarget_distx=%d,target_hash=%x\tdistx=%d,mlen=%d,hash=%x\n", "><"[src&1], g->seg[src>>1].name, src, ql, "><"[t->v&1], g->seg[t->v>>1].name, t->v, t->target_dist - g->seg[src>>1].len, t->target_hash, dist - g->seg[src>>1].len, mlen, r->hash); + // note: t indicates a linear alignmnet, instead of a node in graph + ///target_dist should be the distance on query + if (t->n_path == 0) { // keep the shortest path + copy = 1; + } else if (t->target_dist >= 0) { // we have a target distance; choose the closest + if (dist == t->target_dist && t->check_hash && r->hash == t->target_hash) { // we found the target path + copy = 1, done = 1; + } else { + int32_t d0 = t->dist, d1 = dist; + d0 = d0 > t->target_dist? d0 - t->target_dist : t->target_dist - d0; + d1 = d1 > t->target_dist? 
d1 - t->target_dist : t->target_dist - d1; + ///if the new distance (d1) is smaller than the old distance (d0), update the results + ///in other words, the length of new path should be closer to t->target_dist + if (d1 - mlen/2 < d0 - t->mlen/2) copy = 1; + } + } + if (copy) { + t->path_end = n_out - 1, t->dist = dist, t->hash = r->hash, t->mlen = mlen, t->is_0 = r->is_0; + if (t->target_dist >= 0) { + ///src is from li from li to lj, so the dis is generally increased + ///target_dist should be the distance on query + if (dist == t->target_dist && t->check_hash && r->hash == t->target_hash) done = 1; + else if (dist > t->target_dist + MG_SHORT_K_EXT) done = 1; + } + } + ++t->n_path;///we found a path to the alignment t + if (t->n_path >= max_k) done = 1; + } + if (dst_done[off + j] == 0 && done) + dst_done[off + j] = 1, ++n_done; + } + ///if all alignments have been settle down + ///pre-end; accelerate the loop + if (n_done == n_dst) break; + } + + ///below is used to push new nodes to avl tree for iteration + nv = asg_arc_n(g, r->v); + av = asg_arc_a(g, r->v); + for (i = 0; i < nv; ++i) { // visit all neighbors + asg_arc_t *ai = &av[i]; + ///v_lv is the (dest_length - overlap_length); it is a normal path length in string graph + ///ai->v_lv is the path length from r->v to ai->w + ///(r->di>>32) + int32_t d = (r->di>>32) + (uint32_t)ai->ul; + if (d > max_dist) continue; // don't probe vertices too far away + ///ai->w is the dest ref id; we insert a new ref id, instead of an alignment chain + k = kh_put(sp, h, ai->v, &absent);///one node might be visited multiple times + q = &kh_val(h, k); + if (absent) { // a new vertex visited + ///q->k: number of walks from src to ai->w + q->k = 0, q->qs = q->qe = -1; q->mlen = 0; + ///h_seeds = NULL; so q->mlen = 0 + /** //path + q->mlen = h_seeds && d + gfa_arc_lw(g, *ai) <= max_dist? node_mlen(km, g, ai->w, &mini, h_seeds, n_seeds, seeds, &q->qs, &q->qe) : 0; + **/ + //if (ql && qs) fprintf(stderr, "ql=%d,src=%d\tv=%c%s[%d],n_seeds=%d,mlen=%d\n", ql, src, "><"[ai->w&1], g->seg[ai->w>>1].name, ai->w, n_seeds, q->mlen); + } + ///if there are less than walks from src to ai->w, directly add + ///if there are more, keep the smallest walks + if (q->k < max_k) { // enough room: add to the heap + p = gen_sp_node(km, ai->v, d, id++); + p->pre = n_out - 1;///the parent node of this one + p->hash = r->hash + __ac_Wang_hash(ai->v); + p->is_0 = r->is_0; + /** //path + if (ai->rank > 0) p->is_0 = 0; + **/ + kavl_insert(sp, &root, p, 0); + q->p[q->k++] = p; + ks_heapup_sp(q->k, q->p);///adjust heap by distance + } else if (q->p[0]->di>>32 > d) { // shorter than the longest path so far: replace the longest + p = kavl_erase(sp, &root, q->p[0], 0); + if (p) { + p->di = (uint64_t)d<<32 | (id++); + p->pre = n_out - 1; + p->hash = r->hash + __ac_Wang_hash(ai->v); + p->is_0 = r->is_0; + /** //path + if (ai->rank > 0) p->is_0 = 0; + **/ + kavl_insert(sp, &root, p, 0); + ks_heapdown_sp(0, q->k, q->p); + } else { + fprintf(stderr, "Warning: logical bug in gfa_shortest_k(): q->k=%d,q->p[0]->{d,i}={%d,%d},d=%d,src=%u,max_dist=%d,n_dst=%d\n", q->k, (int32_t)(q->p[0]->di>>32), (int32_t)q->p[0]->di, d, src, max_dist, n_dst); + km_destroy(km); + return 0; + } + } // else: the path is longer than all the existing paths ended at ai->w + } + } + + kfree(km, dst_group); + kfree(km, dst_done); + kh_destroy(sp, h); + mg_idx_hfree(h_seeds); + kfree(km, seeds); + kfree(km, mini.a); + // NB: AVL nodes are not deallocated. When km==0, they are memory leaks. 
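    // The sp_node_t records above come from the per-call kalloc pool `km`
    // (km_init2(km0, 0x4000) near the top of this function), so the km_destroy(km)
    // below releases them wholesale; the leak noted above only applies when the
    // pool pointer is 0 and allocations fall through to libc malloc. A minimal
    // sketch of that pool-per-task pattern, assuming the kalloc API added by this
    // patch (km_init/kmalloc/kfree/km_destroy):
    //   void *km = km_init();                          // thread-/call-local pool
    //   sp_node_t *p = (sp_node_t*)kmalloc(km, sizeof(sp_node_t));
    //   /* ... use p; no per-object kfree(km, p) is required ... */
    //   km_destroy(km);                                // frees every block in the pool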
+ + for (i = 0, n_found = 0; i < n_dst; ++i) + if (dst[i].n_path > 0) ++n_found;///n_path might be larger than 16 + ///we can assume n_pathv = NULL for now + if (n_found > 0 && n_pathv) { // then generate the backtrack array + int32_t n, *trans; + KCALLOC(km, trans, n_out); // used to squeeze unused elements in out[] + for (i = 0; i < n_dst; ++i) { // mark dst vertices with a target distance + mg_path_dst_t *t = &dst[i]; + if (t->n_path > 0 && t->target_dist >= 0 && t->path_end >= 0) + trans[(int32_t)out[t->path_end]->di] = 1; + } + for (i = 0; i < n_out; ++i) { // mark dst vertices without a target distance + k = kh_get(sp2, h2, out[i]->v); + if (k != kh_end(h2)) { // TODO: check if this is correct! + int32_t off = kh_val(h2, k)>>32, cnt = (int32_t)kh_val(h2, k); + for (j = off; j < off + cnt; ++j) + if (dst[j].target_dist < 0) + trans[i] = 1; + } + } + for (i = n_out - 1; i >= 0; --i) // mark all predecessors + if (trans[i] && out[i]->pre >= 0) + trans[out[i]->pre] = 1; + for (i = n = 0; i < n_out; ++i) // generate coordinate translations + if (trans[i]) trans[i] = n++; + else trans[i] = -1; + + *n_pathv = n; + KMALLOC(km0, ret, n); + for (i = 0; i < n_out; ++i) { // generate the backtrack array + mg_pathv_t *p; + if (trans[i] < 0) continue; + p = &ret[trans[i]]; + p->v = out[i]->v, p->d = out[i]->di >> 32; + p->pre = out[i]->pre < 0? out[i]->pre : trans[out[i]->pre]; + } + for (i = 0; i < n_dst; ++i) // translate "path_end" + if (dst[i].path_end >= 0) + dst[i].path_end = trans[dst[i].path_end]; + } + + km_destroy(km); + return ret; +} + + +int32_t mg_gchain1_dp(void *km, const ma_ug_t *ug, int32_t *n_lc_, mg_lchain_t *lc, int32_t qlen, int32_t max_dist_g, int32_t max_dist_q, int32_t bw, int32_t max_skip, + int32_t ref_bonus, float chn_pen_gap, float chn_pen_skip, float mask_level, int32_t max_gc_seq_ext, const char *qseq, const mg128_t *an, uint64_t **u_) +{ + int32_t i, j, k, m_dst, n_dst, n_ext, n_u, n_v, n_lc = *n_lc_; + int32_t *f, *v, *t; + int64_t *p; + uint64_t *u; + mg_path_dst_t *dst; + gc_frag_t *a; + mg_lchain_t *swap; + char *qs; + asg_t *g = ug->g; + + *u_ = 0; + if (n_lc == 0) return 0; + + KMALLOC(km, a, n_lc); + ///n_lc how many linear chains + for (i = n_ext = 0; i < n_lc; ++i) { // a[] is a view of frag[]; for sorting + mg_lchain_t *r = &lc[i]; + gc_frag_t *ai = &a[i]; + int32_t is_isolated = 0, min_end_dist_g; + r->dist_pre = -1;///indicate parent in graph chain + min_end_dist_g = g->seq[r->v>>1].len - r->re;///r->v: ref_id|rev + if (r->rs < min_end_dist_g) min_end_dist_g = r->rs; + if (min_end_dist_g > max_dist_g) is_isolated = 1; // if too far from segment ends + else if (min_end_dist_g>>3 > r->score) is_isolated = 1; // if the lchain too small relative to distance to the segment ends + ai->srt = (uint32_t)is_isolated<<31 | r->qe; + ai->i = i; + if (!is_isolated) ++n_ext; + } + ///if the alignment is too far from segment ends, which means it cannot contribute to graph alignment + if (n_ext < 2) { // no graph chaining needed; early return + kfree(km, a); + KMALLOC(km, u, n_lc); + for (i = 0; i < n_lc; ++i) + u[i] = (uint64_t)lc[i].score<<32 | 1; + *u_ = u; + return n_lc; + } + radix_sort_gc(a, a + n_lc);///sort by: is_isolated(1):qe + + KMALLOC(km, v, n_lc); + KMALLOC(km, f, n_ext); + KMALLOC(km, p, n_ext); + KCALLOC(km, t, n_ext); + // KMALLOC(km, qs, max_dist_q + 1);//for + + m_dst = n_dst = 0, dst = 0; + ///n_ext is number of linear chains that might be included in graph chains + ///sorted by the positions in query; sorted by qe of each chain + for (i = 0; i < 
n_ext; ++i) { // core loop + gc_frag_t *ai = &a[i]; + mg_lchain_t *li = &lc[ai->i];///linear chain; sorted by qe, i.e. end position in query + ///note segi is query id, instead of ref id; it is not such useful + /** + * a[].x: idx_in_minimizer_arr(32)r_pos(32) + * a[].y: weight(8)query_id(8)flag(8)span(8)q_pos(32) + **/ + { // collect end points potentially reachable from _i_ + int32_t x = li->qs + bw, n_skip = 0; + if (x > qlen) x = qlen; + ///collect alignments that can be reachable from the left side + ///that is, a[x].qe <= x + x = find_max(i, a, x); + n_dst = 0; + for (j = x; j >= 0; --j) { // collect potential destination vertices + gc_frag_t *aj = &a[j]; + //potential chains that might be overlapped with the left side of li + mg_lchain_t *lj = &lc[aj->i]; + mg_path_dst_t *q; + int32_t target_dist, dq; + ///lj->qs >= li->qs && lj->qe <= li->qs, so lj is contained + if (lj->qs >= li->qs) continue; // lj is contained in li on the query coordinate + ///lj->qs************lj->qe + /// li->qs************li->qe + if (lj->qe > li->qs) { // test overlap on the query + int o = lj->qe - li->qs; + ///mask_level = 0.5, if overlap is too long + ///note here is the overlap in query, so too long overlaps might be wrong + if (o > (lj->qe - lj->qs) * mask_level || o > (li->qe - li->qs) * mask_level) + continue; + } + dq = li->qs - lj->qe;///dq might be smaller than 0 + if (dq > max_dist_q) break; // if query gap too large, stop + ///The above filter chains like: + ///1. lj is contained in li + ///2. the overlap between li and lj is too large + ///3. li and lj are too far + /** + lj->qs************lj->qe + li->qs************li->qe + + *****lj->rs************lj->re***** + ****li->rs************li->re** + **/ + ///above we have checked gap/overlap in query + ///then we need to check gap/overlap in reference + if (li->v != lj->v) { // the two linear chains are on two different refs + // minimal graph gap; the real graph gap might be larger + int32_t min_dist = li->rs + (g->seq[lj->v>>1].len - lj->re); + if (min_dist > max_dist_g) continue; // graph gap too large + //note here min_dist - (lj->qs - li->qe) > bw is important + //min_dist is always larger than 0, (lj->qs - li->qe) might be negative + if (min_dist - bw > li->qs - lj->qe) continue; ///note seg* is the query id, instead of ref id + target_dist = mg_target_dist(g, lj, li); + if (target_dist < 0) continue; // this may happen if the query overlap is far too large + } else if (lj->rs >= li->rs || lj->re >= li->re) { // not colinear + continue; + } else {///li->v == lj->v and colinear; at the same ref id + /** + case 1: lj->qs************lj->qe + li->qs************li->qe + case 2: lj->qs************lj->qe + li->qs************li->qe + + *****lj->rs************lj->re***** + ****li->rs************li->re** + * **/ + ///w is indel, w is always positive + int32_t dr = li->rs - lj->re, w = dr > dq? dr - dq : dq - dr; + ///note that l*->v is the ref id, while seg* is the query id + if (w > bw) continue; // test bandwidth + if (dr > max_dist_g || dr < -max_dist_g) continue; + if (lj->re > li->rs) { // test overlap on the graph segment + int o = lj->re - li->rs; + if (o > (lj->re - lj->rs) * mask_level || o > (li->re - li->rs) * mask_level) + continue; + } + target_dist = mg_target_dist(g, lj, li); + } + if (n_dst == m_dst) KEXPAND(km, dst, m_dst); // TODO: watch out the quadratic behavior! 
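                // The quadratic worry above is the j-loop itself: for every chain i it may
                // rescan all earlier chains when few candidates are filtered out. KEXPAND()
                // only grows dst[] geometrically from the kalloc pool; a minimal sketch of
                // what that macro is assumed to expand to (kalloc-style, illustrative only):
                //   if (n_dst == m_dst) {
                //       m_dst = m_dst >= 4 ? m_dst + (m_dst >> 1) : 16;   // ~1.5x growth
                //       dst = (mg_path_dst_t*)krealloc(km, dst, m_dst * sizeof(mg_path_dst_t));
                //   }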
+ q = &dst[n_dst++];///q saves information for i->j + memset(q, 0, sizeof(mg_path_dst_t)); + ///note v is (rid:rev), so two alignment chains might be at the same ref id with different directions + q->inner = (li->v == lj->v); + q->v = lj->v^1;///must be v^1 instead of v + q->meta = j; + ///lj->qs************lj->qe + /// li->qs************li->qe + q->qlen = li->qs - lj->qe;///might be negative + q->target_dist = target_dist;///cannot understand the target_dist + q->target_hash = 0; + q->check_hash = 0; + if (t[j] == i) {///this pre-cut is weird; attention + if (++n_skip > max_skip) + break; + } + if (p[j] >= 0) t[p[j]] = i; + } + } + ///the above saves all linear chains that might be reached to the left side of chain i + ///all those chains are saved to dst + { // confirm reach-ability + int32_t k; + // test reach-ability without sequences + /** + *****lj->rs************lj->re***** + ****li->rs************li->re*** + (g->seg[li->v>>1].len - li->rs) ----> is like the node length in string graph + **/ + mg_shortest_k(km, g, li->v^1, n_dst, dst, max_dist_g + (g->seq[li->v>>1].len - li->rs), MG_MAX_SHORT_K, 0, 0, 1, 0); + // remove unreachable destinations + for (j = k = 0; j < n_dst; ++j) { + mg_path_dst_t *dj = &dst[j]; + int32_t sc; + if (dj->n_path == 0) continue; // unreachable + sc = cal_sc(dj, li, lc, an, a, f, bw, ref_bonus, chn_pen_gap); + if (sc == INT32_MIN) continue; // out of band + if (sc + li->score < 0) continue; // negative score and too low + dst[k] = dst[j]; + dst[k++].srt_key = INT64_MAX/2 - (int64_t)sc; // sort in the descending order + } + n_dst = k; + if (n_dst > 0) { + radix_sort_dst(dst, dst + n_dst); + // discard weaker chains if the best score is much larger (assuming base-level heuristic won't lift it to the top chain) + // dst[0].srt_key has the largest score + for (j = 1; j < n_dst; ++j) + if (dst[j].srt_key - dst[0].srt_key > li->score)//discard chains with too small weight + break; + n_dst = j; + if (n_dst > max_gc_seq_ext) n_dst = max_gc_seq_ext; // discard weaker chains + } + } + if (n_dst > 0) { // find paths with sequences + int32_t min_qs = li->qs; + for (j = 0; j < n_dst; ++j) { + const mg_lchain_t *lj; + assert(dst[j].n_path > 0); + ///a[]->srt = (uint32_t)is_isolated<<31 | r->qe; + ///a[]->i = i; + lj = &lc[a[dst[j].meta].i]; + if (lj->qe < min_qs) min_qs = lj->qe; + } + ///qs keeps the sequence at the gap between the li and lj in query + memcpy(qs, &qseq[min_qs], li->qs - min_qs); + mg_shortest_k(km, g, li->v^1, n_dst, dst, max_dist_g + (g->seg[li->v>>1].len - li->rs), MG_MAX_SHORT_K, li->qs - min_qs, qs, 1, 0); + if (mg_dbg_flag & MG_DBG_GC1) fprintf(stderr, "[src:%d] q_intv=[%d,%d), src=%c%s[%d], n_dst=%d, max_dist=%d, min_qs=%d, lc_score=%d\n", ai->i, li->qs, li->qe, "><"[(li->v&1)^1], g->seg[li->v>>1].name, li->v^1, n_dst, max_dist_g + (g->seg[li->v>>1].len - li->rs), min_qs, li->score); + } + { // DP + int32_t max_f = li->score, max_j = -1, max_d = -1, max_inner = 0; + uint32_t max_hash = 0; + for (j = 0; j < n_dst; ++j) { + mg_path_dst_t *dj = &dst[j]; + int32_t sc; + sc = cal_sc(dj, li, lc, an, a, f, bw, ref_bonus, chn_pen_gap); + if (sc == INT32_MIN) continue; + if (mg_dbg_flag & MG_DBG_GC1) { + mg_lchain_t *lj = &lc[a[dj->meta].i]; + fprintf(stderr, " [dst:%d] dst=%c%s[%d], n_path=%d, target=%d, opt_dist=%d, score=%d, q_intv=[%d,%d), g_intv=[%d,%d)\n", dj->meta, "><"[dj->v&1], g->seg[dj->v>>1].name, dj->v, dj->n_path, dj->target_dist - g->seg[li->v>>1].len, dj->dist - g->seg[li->v>>1].len, sc, lj->qs, lj->qe, lj->rs, lj->re); + } + if (sc > 
max_f) max_f = sc, max_j = dj->meta, max_d = dj->dist, max_hash = dj->hash, max_inner = dj->inner; + } + f[i] = max_f, p[i] = max_j; + li->dist_pre = max_d; + li->hash_pre = max_hash; + li->inner_pre = max_inner; + v[i] = max_j >= 0 && v[max_j] > max_f? v[max_j] : max_f; + if (mg_dbg_flag & MG_DBG_GC1) fprintf(stderr, " [opt:%d] opt=%d, max_f=%d\n", ai->i, max_j, max_f); + } + } + kfree(km, dst); + kfree(km, qs); + if (mg_dbg_flag & MG_DBG_GC1) { + int32_t mmax_f = 0, mmax_i = -1; + for (i = 0; i < n_ext; ++i) if (f[i] > mmax_f) mmax_f = f[i], mmax_i = i; + i = mmax_i; while (i >= 0) { fprintf(stderr, "[best] i=%d, seg=%s, max_f=%d, chn_pen_gap=%f\n", a[i].i, g->seg[lc[a[i].i].v>>1].name, f[i], chn_pen_gap); i = p[i]; } + } + ///n_ext: number of useful chains + ///n_lc - n_ext: number of isoated chains + u = mg_chain_backtrack(km, n_ext, f, p, v, t, 0, 0, n_lc - n_ext, &n_u, &n_v); + kfree(km, f); kfree(km, p); kfree(km, t); + ///store the extra isoated chains + for (i = 0; i < n_lc - n_ext; ++i) { + u[n_u++] = (uint64_t)lc[a[n_ext + i].i].score << 32 | 1; + v[n_v++] = n_ext + i; + } + + ///reorganize lc; + KMALLOC(km, swap, n_v); + for (i = 0, k = 0; i < n_u; ++i) { + int32_t k0 = k, ni = (int32_t)u[i]; + for (j = 0; j < ni; ++j) + swap[k++] = lc[a[v[k0 + (ni - j - 1)]].i]; + } + assert(k == n_v); + memcpy(lc, swap, n_v * sizeof(mg_lchain_t)); + *n_lc_ = n_v; + *u_ = u; + + kfree(km, a); + kfree(km, swap); + kfree(km, v); + return n_u; +} + + +void mg_map_frag(const void *ha_flt_tab, const ha_pt_t *ha_idx, const ma_ug_t *ug, const uint32_t qid, const int qlen, const char *qseq, ha_mzl_v *mz, +st_mt_t *sp, mg_tbuf_t *b, int32_t w, int32_t k, int32_t hpc, int32_t mz_sd, int32_t mz_rewin, const mg_idxopt_t *opt) +{ + mg128_t *a = NULL; + int64_t n_a; + int32_t *mini_pos; + int i, rep_len, n_mini_pos, n_lc, max_chain_gap_qry, max_chain_gap_ref; + uint64_t *u; + mg_lchain_t *lc; + mz->n = 0; + mz2_ha_sketch(qseq, qlen, w, k, 0, hpc, mz, ha_flt_tab, mz_sd, NULL, NULL, NULL, -1, -1, -1, sp, mz_rewin, 1); + ///a[]->y: weight(8)seg_id(8)flag(8)span(8)pos(32);--->query + ///a[]->x: rid(31)rev(1)rpos(33);--->reference + a = collect_seed_hits(b->km, opt, opt->hap_n, ha_flt_tab, ha_idx, ug, mz, &n_a, &rep_len, &n_mini_pos, &mini_pos); + if (opt->max_gap_pre > 0 && opt->max_gap_pre * 2 < opt->max_gap) n_a = flt_anchors(n_a, a, opt->max_gap_pre); + max_chain_gap_qry = max_chain_gap_ref = opt->max_gap; + if (n_a == 0) { + if(a) kfree(b->km, a); + a = 0, n_lc = 0, u = 0; + } else { + a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_lc_skip, opt->max_lc_iter, + opt->min_lc_cnt, opt->min_lc_score, opt->chn_pen_gap, n_a, a, &n_lc, &u, b->km); + } + + if (n_lc) {///n_lc is how many chain we found + lc = mg_lchain_gen(b->km, qlen, n_lc, u, a); + for (i = 0; i < n_lc; ++i)///update a[] since ref_id|rev has already been saved to lc[].v + mg_update_anchors(lc[i].cnt, &a[lc[i].off], n_mini_pos, mini_pos);///update a[].x + } else lc = 0; + kfree(b->km, mini_pos); kfree(b->km, u); + + /** + * up to here, a[] has been changed + * a[].x: idx_in_minimizer_arr(32)r_pos(32) + * a[].y: weight(8)query_id(8)flag(8)span(8)q_pos(32) + **/ + +} + static void worker_for_ul_alignment(void *data, long i, int tid) // callback for kt_for() { utepdat_t *s = (utepdat_t*)data; - ma_ov_buf_t *b = s->mo[tid]; - uint64_t len = s->len[i]; - char *r = s->seq[i]; - + // ma_ov_buf_t *b = s->mo[tid]; + mg_map_frag(s->ha_flt_tab, s->ha_idx, s->ug, s->id+i, s->len[i], s->seq[i], &(s->mzs[tid]), &(s->sps[tid]), 
s->buf[tid], s->opt->w, s->opt->k, + s->opt->is_HPC, asm_opt.mz_sample_dist, asm_opt.mz_rewin, s->opt); } @@ -110,7 +1289,7 @@ static void *worker_ul_pipeline(void *data, int step, void *in) // callback for uint64_t l; utepdat_t *s; CALLOC(s, 1); - s->ha_flt_tab = p->ha_flt_tab; s->ha_idx = p->ha_idx; s->id = p->total_pair; + s->ha_flt_tab = p->ha_flt_tab; s->ha_idx = p->ha_idx; s->id = p->total_pair; s->opt = p->opt; s->ug = p->ug; while ((ret = kseq_read(p->ks)) >= 0) { if (p->ks->seq.l < (uint64_t)p->opt->k) continue; @@ -131,19 +1310,25 @@ static void *worker_ul_pipeline(void *data, int step, void *in) // callback for else return s; } else if (step == 1) { // step 2: alignment + uint64_t i; utepdat_t *s = (utepdat_t*)in; - s->mo = init_ma_ov_buf_t_arr(p->n_thread); + CALLOC(s->mzs, p->n_thread); + CALLOC(s->sps, p->n_thread); + + s->buf = (mg_tbuf_t**)calloc(p->n_thread, sizeof(mg_tbuf_t*)); + for (i = 0; i < p->n_thread; ++i) s->buf[i] = mg_tbuf_init(); /** CALLOC(s->pos, s->n); **/ - int i; kt_for(p->n_thread, worker_for_ul_alignment, s, s->n); - for (i = 0; i < s->n; ++i) { + for (i = 0; i < (uint64_t)s->n; ++i) { free(s->seq[i]); p->total_base += s->len[i]; } free(s->seq); free(s->len); - destory_ma_ov_buf_t_arr(&(s->mo), p->n_thread); + + for (i = 0; i < p->n_thread; ++i) mg_tbuf_destroy(s->buf[i]); + free(s->buf); return s; } else if (step == 2) { // step 3: dump @@ -179,7 +1364,7 @@ int alignment_ul_pipeline(uldat_t* sl, const enzyme *fn) return 1; } -int ul_align(mg_idxopt_t *opt, const enzyme *fn, void *ha_flt_tab, ha_pt_t *ha_idx) +int ul_align(mg_idxopt_t *opt, const enzyme *fn, void *ha_flt_tab, ha_pt_t *ha_idx, ma_ug_t *ug) { uldat_t sl; memset(&sl, 0, sizeof(sl)); sl.ha_flt_tab = ha_flt_tab; @@ -187,6 +1372,7 @@ int ul_align(mg_idxopt_t *opt, const enzyme *fn, void *ha_flt_tab, ha_pt_t *ha_i sl.opt = opt; sl.chunk_size = 20000000; sl.n_thread = asm_opt.thread_num; + sl.ug = ug; alignment_ul_pipeline(&sl, fn); return 1; } @@ -196,7 +1382,7 @@ void ul_resolve(ma_ug_t *ug, int hap_n) mg_idxopt_t opt; init_mg_opt(&opt, 0, 19, 10, hap_n); uidx_build(ug, &opt); - ul_align(&opt, asm_opt.ar, ha_flt_tab, ha_idx); + ul_align(&opt, asm_opt.ar, ha_flt_tab, ha_idx, ug); diff --git a/kalloc.cpp b/kalloc.cpp new file mode 100644 index 0000000..8499552 --- /dev/null +++ b/kalloc.cpp @@ -0,0 +1,205 @@ +#include +#include +#include +#include "kalloc.h" + +/* In kalloc, a *core* is a large chunk of contiguous memory. Each core is + * associated with a master header, which keeps the size of the current core + * and the pointer to next core. Kalloc allocates small *blocks* of memory from + * the cores and organizes free memory blocks in a circular single-linked list. + * + * In the following diagram, "@" stands for the header of a free block (of type + * header_t), "#" for the header of an allocated block (of type size_t), "-" + * for free memory, and "+" for allocated memory. + * + * master This region is core 1. master This region is core 2. 
+ * | | + * *@-------#++++++#++++++++++++@-------- *@----------#++++++++++++#+++++++@------------ + * | | | | + * p=p->ptr->ptr->ptr->ptr p->ptr p->ptr->ptr p->ptr->ptr->ptr + */ +typedef struct header_t { + size_t size; + struct header_t *ptr; +} header_t; + +typedef struct { + void *par; + size_t min_core_size; + header_t base, *loop_head, *core_head; /* base is a zero-sized block always kept in the loop */ +} kmem_t; + +static void panic(const char *s) +{ + fprintf(stderr, "%s\n", s); + abort(); +} + +void *km_init2(void *km_par, size_t min_core_size) +{ + kmem_t *km; + km = (kmem_t*)kcalloc(km_par, 1, sizeof(kmem_t)); + km->par = km_par; + km->min_core_size = min_core_size > 0? min_core_size : 0x80000; + return (void*)km; +} + +void *km_init(void) { return km_init2(0, 0); } + +void km_destroy(void *_km) +{ + kmem_t *km = (kmem_t*)_km; + void *km_par; + header_t *p, *q; + if (km == NULL) return; + km_par = km->par; + for (p = km->core_head; p != NULL;) { + q = p->ptr; + kfree(km_par, p); + p = q; + } + kfree(km_par, km); +} + +static header_t *morecore(kmem_t *km, size_t nu) +{ + header_t *q; + size_t bytes, *p; + nu = (nu + 1 + (km->min_core_size - 1)) / km->min_core_size * km->min_core_size; /* the first +1 for core header */ + bytes = nu * sizeof(header_t); + q = (header_t*)kmalloc(km->par, bytes); + if (!q) panic("[morecore] insufficient memory"); + q->ptr = km->core_head, q->size = nu, km->core_head = q; + p = (size_t*)(q + 1); + *p = nu - 1; /* the size of the free block; -1 because the first unit is used for the core header */ + kfree(km, p + 1); /* initialize the new "core"; NB: the core header is not looped. */ + return km->loop_head; +} + +void kfree(void *_km, void *ap) /* kfree() also adds a new core to the circular list */ +{ + header_t *p, *q; + kmem_t *km = (kmem_t*)_km; + + if (!ap) return; + if (km == NULL) { + free(ap); + return; + } + p = (header_t*)((size_t*)ap - 1); + p->size = *((size_t*)ap - 1); + /* Find the pointer that points to the block to be freed. 
The following loop can stop on two conditions: + * + * a) "p>q && pptr": @------#++++++++#+++++++@------- @---------------#+++++++@------- + * (can also be in | | | -> | | + * two cores) q p q->ptr q q->ptr + * + * @-------- #+++++++++@-------- @-------- @------------------ + * | | | -> | | + * q p q->ptr q q->ptr + * + * b) "q>=q->ptr && (p>q || pptr)": @-------#+++++ @--------#+++++++ @-------#+++++ @---------------- + * | | | -> | | + * q->ptr q p q->ptr q + * + * #+++++++@----- #++++++++@------- @------------- #++++++++@------- + * | | | -> | | + * p q->ptr q q->ptr q + */ + for (q = km->loop_head; !(p > q && p < q->ptr); q = q->ptr) + if (q >= q->ptr && (p > q || p < q->ptr)) break; + if (p + p->size == q->ptr) { /* two adjacent blocks, merge p and q->ptr (the 2nd and 4th cases) */ + p->size += q->ptr->size; + p->ptr = q->ptr->ptr; + } else if (p + p->size > q->ptr && q->ptr >= p) { + panic("[kfree] The end of the allocated block enters a free block."); + } else p->ptr = q->ptr; /* backup q->ptr */ + + if (q + q->size == p) { /* two adjacent blocks, merge q and p (the other two cases) */ + q->size += p->size; + q->ptr = p->ptr; + km->loop_head = q; + } else if (q + q->size > p && p >= q) { + panic("[kfree] The end of a free block enters the allocated block."); + } else km->loop_head = p, q->ptr = p; /* in two cores, cannot be merged; create a new block in the list */ +} + +void *kmalloc(void *_km, size_t n_bytes) +{ + kmem_t *km = (kmem_t*)_km; + size_t n_units; + header_t *p, *q; + + if (n_bytes == 0) return 0; + if (km == NULL) return malloc(n_bytes); + n_units = (n_bytes + sizeof(size_t) + sizeof(header_t) - 1) / sizeof(header_t); /* header+n_bytes requires at least this number of units */ + + if (!(q = km->loop_head)) /* the first time when kmalloc() is called, intialize it */ + q = km->loop_head = km->base.ptr = &km->base; + for (p = q->ptr;; q = p, p = p->ptr) { /* search for a suitable block */ + if (p->size >= n_units) { /* p->size if the size of current block. This line means the current block is large enough. */ + if (p->size == n_units) q->ptr = p->ptr; /* no need to split the block */ + else { /* split the block. NB: memory is allocated at the end of the block! 
*/ + p->size -= n_units; /* reduce the size of the free block */ + p += p->size; /* p points to the allocated block */ + *(size_t*)p = n_units; /* set the size */ + } + km->loop_head = q; /* set the end of chain */ + return (size_t*)p + 1; + } + if (p == km->loop_head) { /* then ask for more "cores" */ + if ((p = morecore(km, n_units)) == 0) return 0; + } + } +} + +void *kcalloc(void *_km, size_t count, size_t size) +{ + kmem_t *km = (kmem_t*)_km; + void *p; + if (size == 0 || count == 0) return 0; + if (km == NULL) return calloc(count, size); + p = kmalloc(km, count * size); + memset(p, 0, count * size); + return p; +} + +void *krealloc(void *_km, void *ap, size_t n_bytes) // TODO: this can be made more efficient in principle +{ + kmem_t *km = (kmem_t*)_km; + size_t cap, *p, *q; + + if (n_bytes == 0) { + kfree(km, ap); return 0; + } + if (km == NULL) return realloc(ap, n_bytes); + if (ap == NULL) return kmalloc(km, n_bytes); + p = (size_t*)ap - 1; + cap = (*p) * sizeof(header_t) - sizeof(size_t); + if (cap >= n_bytes) return ap; /* TODO: this prevents shrinking */ + q = (size_t*)kmalloc(km, n_bytes); + memcpy(q, ap, cap); + kfree(km, ap); + return q; +} + +void km_stat(const void *_km, km_stat_t *s) +{ + kmem_t *km = (kmem_t*)_km; + header_t *p; + memset(s, 0, sizeof(km_stat_t)); + if (km == NULL || km->loop_head == NULL) return; + for (p = km->loop_head;; p = p->ptr) { + s->available += p->size * sizeof(header_t); + if (p->size != 0) ++s->n_blocks; /* &kmem_t::base is always one of the cores. It is zero-sized. */ + if (p->ptr > p && p + p->size > p->ptr) + panic("[km_stat] The end of a free block enters another free block."); + if (p->ptr == km->loop_head) break; + } + for (p = km->core_head; p != NULL; p = p->ptr) { + size_t size = p->size * sizeof(header_t); + ++s->n_cores; + s->capacity += size; + s->largest = s->largest > size? s->largest : size; + } +} diff --git a/kalloc.h b/kalloc.h new file mode 100644 index 0000000..5ef432a --- /dev/null +++ b/kalloc.h @@ -0,0 +1,77 @@ +#ifndef _KALLOC_H_ +#define _KALLOC_H_ + +#include /* for size_t */ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + size_t capacity, available, n_blocks, n_cores, largest; +} km_stat_t; + +void *kmalloc(void *km, size_t size); +void *krealloc(void *km, void *ptr, size_t size); +void *kcalloc(void *km, size_t count, size_t size); +void kfree(void *km, void *ptr); + +void *km_init(void); +void *km_init2(void *km_par, size_t min_core_size); +void km_destroy(void *km); +void km_stat(const void *_km, km_stat_t *s); + +#ifdef __cplusplus +} +#endif + +#define KMALLOC(km, ptr, len) ((ptr) = (__typeof__(ptr))kmalloc((km), (len) * sizeof(*(ptr)))) +#define KCALLOC(km, ptr, len) ((ptr) = (__typeof__(ptr))kcalloc((km), (len), sizeof(*(ptr)))) +#define KREALLOC(km, ptr, len) ((ptr) = (__typeof__(ptr))krealloc((km), (ptr), (len) * sizeof(*(ptr)))) + +#define KEXPAND(km, a, m) do { \ + (m) = (m) >= 4? 
(m) + ((m)>>1) : 16; \ + KREALLOC((km), (a), (m)); \ + } while (0) + +#ifndef klib_unused +#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) +#define klib_unused __attribute__ ((__unused__)) +#else +#define klib_unused +#endif +#endif /* klib_unused */ + +// adapted from klist.h +#define KALLOC_POOL_INIT2(SCOPE, name, kmptype_t) \ + typedef struct { \ + size_t cnt, n, max; \ + kmptype_t **buf; \ + void *km; \ + } kmp_##name##_t; \ + SCOPE kmp_##name##_t *kmp_init_##name(void *km) { \ + kmp_##name##_t *mp; \ + KCALLOC(km, mp, 1); \ + mp->km = km; \ + return mp; \ + } \ + SCOPE void kmp_destroy_##name(kmp_##name##_t *mp) { \ + size_t k; \ + for (k = 0; k < mp->n; ++k) kfree(mp->km, mp->buf[k]); \ + kfree(mp->km, mp->buf); kfree(mp->km, mp); \ + } \ + SCOPE kmptype_t *kmp_alloc_##name(kmp_##name##_t *mp) { \ + ++mp->cnt; \ + if (mp->n == 0) return (kmptype_t*)kcalloc(mp->km, 1, sizeof(kmptype_t)); \ + return mp->buf[--mp->n]; \ + } \ + SCOPE void kmp_free_##name(kmp_##name##_t *mp, kmptype_t *p) { \ + --mp->cnt; \ + if (mp->n == mp->max) KEXPAND(mp->km, mp->buf, mp->max); \ + mp->buf[mp->n++] = p; \ + } + +#define KALLOC_POOL_INIT(name, kmptype_t) \ + KALLOC_POOL_INIT2(static inline klib_unused, name, kmptype_t) + +#endif diff --git a/kavl.h b/kavl.h new file mode 100644 index 0000000..e0a8e1b --- /dev/null +++ b/kavl.h @@ -0,0 +1,414 @@ +/* The MIT License + + Copyright (c) 2018 by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* An example: + +#include +#include +#include +#include "kavl.h" + +struct my_node { + char key; + KAVL_HEAD(struct my_node) head; +}; +#define my_cmp(p, q) (((q)->key < (p)->key) - ((p)->key < (q)->key)) +KAVL_INIT(my, struct my_node, head, my_cmp) + +int main(void) { + const char *str = "MNOLKQOPHIA"; // from wiki, except a duplicate + struct my_node *root = 0; + int i, l = strlen(str); + for (i = 0; i < l; ++i) { // insert in the input order + struct my_node *q, *p = malloc(sizeof(*p)); + p->key = str[i]; + q = kavl_insert(my, &root, p, 0); + if (p != q) free(p); // if already present, free + } + kavl_itr_t(my) itr; + kavl_itr_first(my, root, &itr); // place at first + do { // traverse + const struct my_node *p = kavl_at(&itr); + putchar(p->key); + free((void*)p); // free node + } while (kavl_itr_next(my, &itr)); + putchar('\n'); + return 0; +} +*/ + +#ifndef KAVL_H +#define KAVL_H + +#ifdef __STRICT_ANSI__ +#define inline __inline__ +#endif + +#define KAVL_MAX_DEPTH 64 + +#define kavl_size(head, p) ((p)? 
(p)->head.size : 0) +#define kavl_size_child(head, q, i) ((q)->head.p[(i)]? (q)->head.p[(i)]->head.size : 0) + +#define KAVL_HEAD(__type) \ + struct { \ + __type *p[2]; \ + signed char balance; /* balance factor */ \ + unsigned size; /* #elements in subtree */ \ + } + +#define __KAVL_FIND(suf, __scope, __type, __head, __cmp) \ + __scope __type *kavl_find_##suf(const __type *root, const __type *x, unsigned *cnt_) { \ + const __type *p = root; \ + unsigned cnt = 0; \ + while (p != 0) { \ + int cmp; \ + cmp = __cmp(x, p); \ + if (cmp >= 0) cnt += kavl_size_child(__head, p, 0) + 1; \ + if (cmp < 0) p = p->__head.p[0]; \ + else if (cmp > 0) p = p->__head.p[1]; \ + else break; \ + } \ + if (cnt_) *cnt_ = cnt; \ + return (__type*)p; \ + } \ + __scope __type *kavl_interval_##suf(const __type *root, const __type *x, __type **lower, __type **upper) { \ + const __type *p = root, *l = 0, *u = 0; \ + while (p != 0) { \ + int cmp; \ + cmp = __cmp(x, p); \ + if (cmp < 0) u = p, p = p->__head.p[0]; \ + else if (cmp > 0) l = p, p = p->__head.p[1]; \ + else { l = u = p; break; } \ + } \ + if (lower) *lower = (__type*)l; \ + if (upper) *upper = (__type*)u; \ + return (__type*)p; \ + } + +#define __KAVL_ROTATE(suf, __type, __head) \ + /* one rotation: (a,(b,c)q)p => ((a,b)p,c)q */ \ + static inline __type *kavl_rotate1_##suf(__type *p, int dir) { /* dir=0 to left; dir=1 to right */ \ + int opp = 1 - dir; /* opposite direction */ \ + __type *q = p->__head.p[opp]; \ + unsigned size_p = p->__head.size; \ + p->__head.size -= q->__head.size - kavl_size_child(__head, q, dir); \ + q->__head.size = size_p; \ + p->__head.p[opp] = q->__head.p[dir]; \ + q->__head.p[dir] = p; \ + return q; \ + } \ + /* two consecutive rotations: (a,((b,c)r,d)q)p => ((a,b)p,(c,d)q)r */ \ + static inline __type *kavl_rotate2_##suf(__type *p, int dir) { \ + int b1, opp = 1 - dir; \ + __type *q = p->__head.p[opp], *r = q->__head.p[dir]; \ + unsigned size_x_dir = kavl_size_child(__head, r, dir); \ + r->__head.size = p->__head.size; \ + p->__head.size -= q->__head.size - size_x_dir; \ + q->__head.size -= size_x_dir + 1; \ + p->__head.p[opp] = r->__head.p[dir]; \ + r->__head.p[dir] = p; \ + q->__head.p[dir] = r->__head.p[opp]; \ + r->__head.p[opp] = q; \ + b1 = dir == 0? 
+1 : -1; \ + if (r->__head.balance == b1) q->__head.balance = 0, p->__head.balance = -b1; \ + else if (r->__head.balance == 0) q->__head.balance = p->__head.balance = 0; \ + else q->__head.balance = b1, p->__head.balance = 0; \ + r->__head.balance = 0; \ + return r; \ + } + +#define __KAVL_INSERT(suf, __scope, __type, __head, __cmp) \ + __scope __type *kavl_insert_##suf(__type **root_, __type *x, unsigned *cnt_) { \ + unsigned char stack[KAVL_MAX_DEPTH]; \ + __type *path[KAVL_MAX_DEPTH]; \ + __type *bp, *bq; \ + __type *p, *q, *r = 0; /* _r_ is potentially the new root */ \ + int i, which = 0, top, b1, path_len; \ + unsigned cnt = 0; \ + bp = *root_, bq = 0; \ + /* find the insertion location */ \ + for (p = bp, q = bq, top = path_len = 0; p; q = p, p = p->__head.p[which]) { \ + int cmp; \ + cmp = __cmp(x, p); \ + if (cmp >= 0) cnt += kavl_size_child(__head, p, 0) + 1; \ + if (cmp == 0) { \ + if (cnt_) *cnt_ = cnt; \ + return p; \ + } \ + if (p->__head.balance != 0) \ + bq = q, bp = p, top = 0; \ + stack[top++] = which = (cmp > 0); \ + path[path_len++] = p; \ + } \ + if (cnt_) *cnt_ = cnt; \ + x->__head.balance = 0, x->__head.size = 1, x->__head.p[0] = x->__head.p[1] = 0; \ + if (q == 0) *root_ = x; \ + else q->__head.p[which] = x; \ + if (bp == 0) return x; \ + for (i = 0; i < path_len; ++i) ++path[i]->__head.size; \ + for (p = bp, top = 0; p != x; p = p->__head.p[stack[top]], ++top) /* update balance factors */ \ + if (stack[top] == 0) --p->__head.balance; \ + else ++p->__head.balance; \ + if (bp->__head.balance > -2 && bp->__head.balance < 2) return x; /* no re-balance needed */ \ + /* re-balance */ \ + which = (bp->__head.balance < 0); \ + b1 = which == 0? +1 : -1; \ + q = bp->__head.p[1 - which]; \ + if (q->__head.balance == b1) { \ + r = kavl_rotate1_##suf(bp, which); \ + q->__head.balance = bp->__head.balance = 0; \ + } else r = kavl_rotate2_##suf(bp, which); \ + if (bq == 0) *root_ = r; \ + else bq->__head.p[bp != bq->__head.p[0]] = r; \ + return x; \ + } + +#define __KAVL_ERASE(suf, __scope, __type, __head, __cmp) \ + __scope __type *kavl_erase_##suf(__type **root_, const __type *x, unsigned *cnt_) { \ + __type *p, *path[KAVL_MAX_DEPTH], fake; \ + unsigned char dir[KAVL_MAX_DEPTH]; \ + int i, d = 0, cmp; \ + unsigned cnt = 0; \ + fake.__head.p[0] = *root_, fake.__head.p[1] = 0; \ + if (cnt_) *cnt_ = 0; \ + if (x) { \ + for (cmp = -1, p = &fake; cmp; cmp = __cmp(x, p)) { \ + int which = (cmp > 0); \ + if (cmp > 0) cnt += kavl_size_child(__head, p, 0) + 1; \ + dir[d] = which; \ + path[d++] = p; \ + p = p->__head.p[which]; \ + if (p == 0) { \ + if (cnt_) *cnt_ = 0; \ + return 0; \ + } \ + } \ + cnt += kavl_size_child(__head, p, 0) + 1; /* because p==x is not counted */ \ + } else { \ + for (p = &fake, cnt = 1; p; p = p->__head.p[0]) \ + dir[d] = 0, path[d++] = p; \ + p = path[--d]; \ + } \ + if (cnt_) *cnt_ = cnt; \ + for (i = 1; i < d; ++i) --path[i]->__head.size; \ + if (p->__head.p[1] == 0) { /* ((1,.)2,3)4 => (1,3)4; p=2 */ \ + path[d-1]->__head.p[dir[d-1]] = p->__head.p[0]; \ + } else { \ + __type *q = p->__head.p[1]; \ + if (q->__head.p[0] == 0) { /* ((1,2)3,4)5 => ((1)2,4)5; p=3 */ \ + q->__head.p[0] = p->__head.p[0]; \ + q->__head.balance = p->__head.balance; \ + path[d-1]->__head.p[dir[d-1]] = q; \ + path[d] = q, dir[d++] = 1; \ + q->__head.size = p->__head.size - 1; \ + } else { /* ((1,((.,2)3,4)5)6,7)8 => ((1,(2,4)5)3,7)8; p=6 */ \ + __type *r; \ + int e = d++; /* backup _d_ */\ + for (;;) { \ + dir[d] = 0; \ + path[d++] = q; \ + r = q->__head.p[0]; \ + if (r->__head.p[0] 
== 0) break; \ + q = r; \ + } \ + r->__head.p[0] = p->__head.p[0]; \ + q->__head.p[0] = r->__head.p[1]; \ + r->__head.p[1] = p->__head.p[1]; \ + r->__head.balance = p->__head.balance; \ + path[e-1]->__head.p[dir[e-1]] = r; \ + path[e] = r, dir[e] = 1; \ + for (i = e + 1; i < d; ++i) --path[i]->__head.size; \ + r->__head.size = p->__head.size - 1; \ + } \ + } \ + while (--d > 0) { \ + __type *q = path[d]; \ + int which, other, b1 = 1, b2 = 2; \ + which = dir[d], other = 1 - which; \ + if (which) b1 = -b1, b2 = -b2; \ + q->__head.balance += b1; \ + if (q->__head.balance == b1) break; \ + else if (q->__head.balance == b2) { \ + __type *r = q->__head.p[other]; \ + if (r->__head.balance == -b1) { \ + path[d-1]->__head.p[dir[d-1]] = kavl_rotate2_##suf(q, which); \ + } else { \ + path[d-1]->__head.p[dir[d-1]] = kavl_rotate1_##suf(q, which); \ + if (r->__head.balance == 0) { \ + r->__head.balance = -b1; \ + q->__head.balance = b1; \ + break; \ + } else r->__head.balance = q->__head.balance = 0; \ + } \ + } \ + } \ + *root_ = fake.__head.p[0]; \ + return p; \ + } + +#define kavl_free(__type, __head, __root, __free) do { \ + __type *_p, *_q; \ + for (_p = __root; _p; _p = _q) { \ + if (_p->__head.p[0] == 0) { \ + _q = _p->__head.p[1]; \ + __free(_p); \ + } else { \ + _q = _p->__head.p[0]; \ + _p->__head.p[0] = _q->__head.p[1]; \ + _q->__head.p[1] = _p; \ + } \ + } \ + } while (0) + +#define __KAVL_ITR(suf, __scope, __type, __head, __cmp) \ + struct kavl_itr_##suf { \ + const __type *stack[KAVL_MAX_DEPTH], **top; \ + }; \ + __scope void kavl_itr_first_##suf(const __type *root, struct kavl_itr_##suf *itr) { \ + const __type *p; \ + for (itr->top = itr->stack - 1, p = root; p; p = p->__head.p[0]) \ + *++itr->top = p; \ + } \ + __scope int kavl_itr_find_##suf(const __type *root, const __type *x, struct kavl_itr_##suf *itr) { \ + const __type *p = root; \ + itr->top = itr->stack - 1; \ + while (p != 0) { \ + int cmp; \ + *++itr->top = p; \ + cmp = __cmp(x, p); \ + if (cmp < 0) p = p->__head.p[0]; \ + else if (cmp > 0) p = p->__head.p[1]; \ + else break; \ + } \ + return p? 1 : 0; \ + } \ + __scope int kavl_itr_next_bidir_##suf(struct kavl_itr_##suf *itr, int dir) { \ + const __type *p; \ + if (itr->top < itr->stack) return 0; \ + dir = !!dir; \ + p = (*itr->top)->__head.p[dir]; \ + if (p) { /* go down */ \ + for (; p; p = p->__head.p[!dir]) \ + *++itr->top = p; \ + return 1; \ + } else { /* go up */ \ + do { \ + p = *itr->top--; \ + } while (itr->top >= itr->stack && p == (*itr->top)->__head.p[dir]); \ + return itr->top < itr->stack? 0 : 1; \ + } \ + } \ + +/** + * Insert a node to the tree + * + * @param suf name suffix used in KAVL_INIT() + * @param proot pointer to the root of the tree (in/out: root may change) + * @param x node to insert (in) + * @param cnt number of nodes smaller than or equal to _x_; can be NULL (out) + * + * @return _x_ if not present in the tree, or the node equal to x. 
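+ *             (if an equal node is already in the tree, _x_ is not inserted; see the example at the
+ *             top of this header, which frees _x_ in that case)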
+ */ +#define kavl_insert(suf, proot, x, cnt) kavl_insert_##suf(proot, x, cnt) + +/** + * Find a node in the tree + * + * @param suf name suffix used in KAVL_INIT() + * @param root root of the tree + * @param x node value to find (in) + * @param cnt number of nodes smaller than or equal to _x_; can be NULL (out) + * + * @return node equal to _x_ if present, or NULL if absent + */ +#define kavl_find(suf, root, x, cnt) kavl_find_##suf(root, x, cnt) +#define kavl_interval(suf, root, x, lower, upper) kavl_interval_##suf(root, x, lower, upper) + +/** + * Delete a node from the tree + * + * @param suf name suffix used in KAVL_INIT() + * @param proot pointer to the root of the tree (in/out: root may change) + * @param x node value to delete; if NULL, delete the first node (in) + * + * @return node removed from the tree if present, or NULL if absent + */ +#define kavl_erase(suf, proot, x, cnt) kavl_erase_##suf(proot, x, cnt) +#define kavl_erase_first(suf, proot) kavl_erase_##suf(proot, 0, 0) + +#define kavl_itr_t(suf) struct kavl_itr_##suf + +/** + * Place the iterator at the smallest object + * + * @param suf name suffix used in KAVL_INIT() + * @param root root of the tree + * @param itr iterator + */ +#define kavl_itr_first(suf, root, itr) kavl_itr_first_##suf(root, itr) + +/** + * Place the iterator at the object equal to or greater than the query + * + * @param suf name suffix used in KAVL_INIT() + * @param root root of the tree + * @param x query (in) + * @param itr iterator (out) + * + * @return 1 if find; 0 otherwise. kavl_at(itr) is NULL if and only if query is + * larger than all objects in the tree + */ +#define kavl_itr_find(suf, root, x, itr) kavl_itr_find_##suf(root, x, itr) + +/** + * Move to the next object in order + * + * @param itr iterator (modified) + * + * @return 1 if there is a next object; 0 otherwise + */ +#define kavl_itr_next(suf, itr) kavl_itr_next_bidir_##suf(itr, 1) +#define kavl_itr_prev(suf, itr) kavl_itr_next_bidir_##suf(itr, 0) + +/** + * Return the pointer at the iterator + * + * @param itr iterator + * + * @return pointer if present; NULL otherwise + */ +#define kavl_at(itr) ((itr)->top < (itr)->stack? 0 : *(itr)->top) + +#define KAVL_INIT2(suf, __scope, __type, __head, __cmp) \ + __KAVL_FIND(suf, __scope, __type, __head, __cmp) \ + __KAVL_ROTATE(suf, __type, __head) \ + __KAVL_INSERT(suf, __scope, __type, __head, __cmp) \ + __KAVL_ERASE(suf, __scope, __type, __head, __cmp) \ + __KAVL_ITR(suf, __scope, __type, __head, __cmp) + +#define KAVL_INIT(suf, __type, __head, __cmp) \ + KAVL_INIT2(suf,, __type, __head, __cmp) + +#endif diff --git a/khash.h b/khash.h new file mode 100644 index 0000000..311e88d --- /dev/null +++ b/khash.h @@ -0,0 +1,635 @@ +/* The MIT License + + Copyright (c) 2008, 2009, 2011 by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* + An example: + +#include "khash.h" +KHASH_MAP_INIT_INT(32, char) +int main() { + int ret, is_missing; + khiter_t k; + khash_t(32) *h = kh_init(32); + k = kh_put(32, h, 5, &ret); + kh_value(h, k) = 10; + k = kh_get(32, h, 10); + is_missing = (k == kh_end(h)); + k = kh_get(32, h, 5); + kh_del(32, h, k); + for (k = kh_begin(h); k != kh_end(h); ++k) + if (kh_exist(h, k)) kh_value(h, k) = 1; + kh_destroy(32, h); + return 0; +} +*/ + +/* + 2013-05-02 (0.2.8): + + * Use quadratic probing. When the capacity is power of 2, stepping function + i*(i+1)/2 guarantees to traverse each bucket. It is better than double + hashing on cache performance and is more robust than linear probing. + + In theory, double hashing should be more robust than quadratic probing. + However, my implementation is probably not for large hash tables, because + the second hash function is closely tied to the first hash function, + which reduce the effectiveness of double hashing. + + Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php + + 2011-12-29 (0.2.7): + + * Minor code clean up; no actual effect. + + 2011-09-16 (0.2.6): + + * The capacity is a power of 2. This seems to dramatically improve the + speed for simple keys. Thank Zilong Tan for the suggestion. Reference: + + - http://code.google.com/p/ulib/ + - http://nothings.org/computer/judy/ + + * Allow to optionally use linear probing which usually has better + performance for random input. Double hashing is still the default as it + is more robust to certain non-random input. + + * Added Wang's integer hash function (not used by default). This hash + function is more robust to certain non-random input. + + 2011-02-14 (0.2.5): + + * Allow to declare global functions. + + 2009-09-26 (0.2.4): + + * Improve portability + + 2008-09-19 (0.2.3): + + * Corrected the example + * Improved interfaces + + 2008-09-11 (0.2.2): + + * Improved speed a little in kh_put() + + 2008-09-10 (0.2.1): + + * Added kh_clear() + * Fixed a compiling error + + 2008-09-02 (0.2.0): + + * Changed to token concatenation which increases flexibility. + + 2008-08-31 (0.1.2): + + * Fixed a bug in kh_get(), which has not been tested previously. + + 2008-08-31 (0.1.1): + + * Added destructor +*/ + + +#ifndef __AC_KHASH_H +#define __AC_KHASH_H + +/*! + @header + + Generic hash table library. 
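+  This copy is kalloc-aware: a table created with kh_init2() keeps a kalloc handle and draws all of
+  its memory (flags, keys, values and the table struct itself) from that pool; kh_init() falls back
+  to the libc heap. See kalloc.h, added by this patch.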
+ */ + +#define AC_VERSION_KHASH_H "0.2.8" + +#include +#include +#include +#include "kalloc.h" + +/* compiler specific configuration */ + +#if UINT_MAX == 0xffffffffu +typedef unsigned int khint32_t; +#elif ULONG_MAX == 0xffffffffu +typedef unsigned long khint32_t; +#endif + +#if ULONG_MAX == ULLONG_MAX +typedef unsigned long khint64_t; +#else +typedef unsigned long long khint64_t; +#endif + +#ifndef kh_inline +#ifdef _MSC_VER +#define kh_inline __inline +#else +#define kh_inline inline +#endif +#endif /* kh_inline */ + +#ifndef klib_unused +#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) +#define klib_unused __attribute__ ((__unused__)) +#else +#define klib_unused +#endif +#endif /* klib_unused */ + +typedef khint32_t khint_t; +typedef khint_t khiter_t; + +#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) +#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) +#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) +#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) +#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) +#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) +#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) + +#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4) + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +static const double __ac_HASH_UPPER = 0.77; + +#define __KHASH_TYPE(name, khkey_t, khval_t) \ + typedef struct kh_##name##_s { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + khint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + void *km; \ + } kh_##name##_t; + +#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \ + extern kh_##name##_t *kh_init_##name(void); \ + extern void kh_destroy_##name(kh_##name##_t *h); \ + extern void kh_clear_##name(kh_##name##_t *h); \ + extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ + extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ + extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ + extern void kh_del_##name(kh_##name##_t *h, khint_t x); + +#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + SCOPE kh_##name##_t *kh_init2_##name(void *km) { \ + kh_##name##_t *h; \ + h = (kh_##name##_t*)kcalloc(km, 1, sizeof(kh_##name##_t)); \ + h->km = km; \ + return h; \ + } \ + SCOPE kh_##name##_t *kh_init_##name(void) { return kh_init2_##name(0); } \ + SCOPE void kh_destroy_##name(kh_##name##_t *h) \ + { \ + if (h) { \ + void *km = h->km; \ + kfree(km, (void *)h->keys); kfree(km, h->flags); \ + kfree(km, (void *)h->vals); \ + kfree(km, h); \ + } \ + } \ + SCOPE void kh_clear_##name(kh_##name##_t *h) \ + { \ + if (h && h->flags) { \ + memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \ + h->size = h->n_occupied = 0; \ + } \ + } \ + SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ + { \ + if (h->n_buckets) { \ + khint_t k, i, last, mask, step = 0; \ + mask = h->n_buckets - 1; \ + k = __hash_func(key); i = k & mask; \ + last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + i = (i + (++step)) & mask; \ + if (i == last) return h->n_buckets; \ + } \ + return __ac_iseither(h->flags, i)? 
h->n_buckets : i; \ + } else return 0; \ + } \ + SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ + { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \ + khint32_t *new_flags = 0; \ + khint_t j = 1; \ + { \ + kroundup32(new_n_buckets); \ + if (new_n_buckets < 4) new_n_buckets = 4; \ + if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ + else { /* hash table size to be changed (shrink or expand); rehash */ \ + new_flags = (khint32_t*)kmalloc(h->km, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (!new_flags) return -1; \ + memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (h->n_buckets < new_n_buckets) { /* expand */ \ + khkey_t *new_keys = (khkey_t*)krealloc(h->km, (void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (!new_keys) { kfree(h->km, new_flags); return -1; } \ + h->keys = new_keys; \ + if (kh_is_map) { \ + khval_t *new_vals = (khval_t*)krealloc(h->km, (void *)h->vals, new_n_buckets * sizeof(khval_t)); \ + if (!new_vals) { kfree(h->km, new_flags); return -1; } \ + h->vals = new_vals; \ + } \ + } /* otherwise shrink */ \ + } \ + } \ + if (j) { /* rehashing is needed */ \ + for (j = 0; j != h->n_buckets; ++j) { \ + if (__ac_iseither(h->flags, j) == 0) { \ + khkey_t key = h->keys[j]; \ + khval_t val; \ + khint_t new_mask; \ + new_mask = new_n_buckets - 1; \ + if (kh_is_map) val = h->vals[j]; \ + __ac_set_isdel_true(h->flags, j); \ + while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ + khint_t k, i, step = 0; \ + k = __hash_func(key); \ + i = k & new_mask; \ + while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \ + __ac_set_isempty_false(new_flags, i); \ + if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ + { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ + if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ + __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \ + } else { /* write the element and jump out of the loop */ \ + h->keys[i] = key; \ + if (kh_is_map) h->vals[i] = val; \ + break; \ + } \ + } \ + } \ + } \ + if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ + h->keys = (khkey_t*)krealloc(h->km, (void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) h->vals = (khval_t*)krealloc(h->km, (void *)h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + kfree(h->km, h->flags); /* free the working space */ \ + h->flags = new_flags; \ + h->n_buckets = new_n_buckets; \ + h->n_occupied = h->size; \ + h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ + } \ + return 0; \ + } \ + SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ + { \ + khint_t x; \ + if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ + if (h->n_buckets > (h->size<<1)) { \ + if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \ + *ret = -1; return h->n_buckets; \ + } \ + } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \ + *ret = -1; return h->n_buckets; \ + } \ + } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ + { \ + khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \ + x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ + if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ + else { \ 
+ last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (__ac_isdel(h->flags, i)) site = i; \ + i = (i + (++step)) & mask; \ + if (i == last) { x = site; break; } \ + } \ + if (x == h->n_buckets) { \ + if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ + else x = i; \ + } \ + } \ + } \ + if (__ac_isempty(h->flags, x)) { /* not present at all */ \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; ++h->n_occupied; \ + *ret = 1; \ + } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; \ + *ret = 2; \ + } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ + return x; \ + } \ + SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ + { \ + if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ + __ac_set_isdel_true(h->flags, x); \ + --h->size; \ + } \ + } + +#define KHASH_DECLARE(name, khkey_t, khval_t) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_PROTOTYPES(name, khkey_t, khval_t) + +#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + +#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + KHASH_INIT2(name, static kh_inline klib_unused, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + +/* --- BEGIN OF HASH FUNCTIONS --- */ + +/*! @function + @abstract Integer hash function + @param key The integer [khint32_t] + @return The hash value [khint_t] + */ +#define kh_int_hash_func(key) (khint32_t)(key) +/*! @function + @abstract Integer comparison function + */ +#define kh_int_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract 64-bit integer hash function + @param key The integer [khint64_t] + @return The hash value [khint_t] + */ +#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) +/*! @function + @abstract 64-bit integer comparison function + */ +#define kh_int64_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract const char* hash function + @param s Pointer to a null terminated string + @return The hash value + */ +static kh_inline khint_t __ac_X31_hash_string(const char *s) +{ + khint_t h = (khint_t)*s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; + return h; +} +/*! @function + @abstract Another interface to const char* hash function + @param key Pointer to a null terminated string [const char*] + @return The hash value [khint_t] + */ +#define kh_str_hash_func(key) __ac_X31_hash_string(key) +/*! @function + @abstract Const char* comparison function + */ +#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) + +static kh_inline khint_t __ac_Wang_hash(khint_t key) +{ + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; +} +#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)key) + +static kh_inline khint64_t __ac_Wang_hash64(khint64_t key) +{ + key = ~key + (key << 21); + key = key ^ key >> 24; + key = (key + (key << 3)) + (key << 8); + key = key ^ key >> 14; + key = (key + (key << 2)) + (key << 4); + key = key ^ key >> 28; + key = key + (key << 31); + return key; +} +#define kh_int_hash64_func2(key) __ac_Wang_hash64((khint64_t)key) + +/* --- END OF HASH FUNCTIONS --- */ + +/* Other convenient macros... */ + +/*! 
+ @abstract Type of the hash table. + @param name Name of the hash table [symbol] + */ +#define khash_t(name) kh_##name##_t + +/*! @function + @abstract Initiate a hash table. + @param name Name of the hash table [symbol] + @return Pointer to the hash table [khash_t(name)*] + */ +#define kh_init(name) kh_init_##name() +#define kh_init2(name, km) kh_init2_##name(km) + +/*! @function + @abstract Destroy a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_destroy(name, h) kh_destroy_##name(h) + +/*! @function + @abstract Reset a hash table without deallocating memory. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_clear(name, h) kh_clear_##name(h) + +/*! @function + @abstract Resize a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param s New size [khint_t] + */ +#define kh_resize(name, h, s) kh_resize_##name(h, s) + +/*! @function + @abstract Insert a key to the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @param r Extra return code: -1 if the operation failed; + 0 if the key is present in the hash table; + 1 if the bucket is empty (never used); 2 if the element in + the bucket has been deleted [int*] + @return Iterator to the inserted element [khint_t] + */ +#define kh_put(name, h, k, r) kh_put_##name(h, k, r) + +/*! @function + @abstract Retrieve a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t] + */ +#define kh_get(name, h, k) kh_get_##name(h, k) + +/*! @function + @abstract Remove a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Iterator to the element to be deleted [khint_t] + */ +#define kh_del(name, h, k) kh_del_##name(h, k) + +/*! @function + @abstract Test whether a bucket contains data. + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return 1 if containing data; 0 otherwise [int] + */ +#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) + +/*! @function + @abstract Get key given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Key [type of keys] + */ +#define kh_key(h, x) ((h)->keys[x]) + +/*! @function + @abstract Get value given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Value [type of values] + @discussion For hash sets, calling this results in segfault. + */ +#define kh_val(h, x) ((h)->vals[x]) + +/*! @function + @abstract Alias of kh_val() + */ +#define kh_value(h, x) ((h)->vals[x]) + +/*! @function + @abstract Get the start iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The start iterator [khint_t] + */ +#define kh_begin(h) (khint_t)(0) + +/*! @function + @abstract Get the end iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The end iterator [khint_t] + */ +#define kh_end(h) ((h)->n_buckets) + +/*! 
@function + @abstract Get the number of elements in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of elements in the hash table [khint_t] + */ +#define kh_size(h) ((h)->size) + +/*! @function + @abstract Get the number of buckets in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of buckets in the hash table [khint_t] + */ +#define kh_n_buckets(h) ((h)->n_buckets) + +/*! @function + @abstract Iterate over the entries in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @param kvar Variable to which key will be assigned + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (kvar) = kh_key(h,__i); \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + +/*! @function + @abstract Iterate over the values in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach_value(h, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + +/* More conenient interfaces */ + +/*! @function + @abstract Instantiate a hash set containing integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT(name) \ + KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT(name, khval_t) \ + KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT64(name) \ + KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT64(name, khval_t) \ + KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + +typedef const char *kh_cstr_t; +/*! @function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_STR(name) \ + KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) + +/*! 
@function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_STR(name, khval_t) \ + KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) + +#endif /* __AC_KHASH_H */ diff --git a/tovlp.cpp b/tovlp.cpp index 8401f89..55923cc 100644 --- a/tovlp.cpp +++ b/tovlp.cpp @@ -968,7 +968,7 @@ pdq *pq_p, pdq *pq_a, uint8_t fp, uint8_t fa, long long *d) for (i = 0; i < ns; i++) { if(as[i].del || as[i].v == sv) continue; - nc = dfs_set(g, as[i].v, s>>1, stack, tt, vis, fa); + nc = dfs_set(g, as[i].v, s>>1, stack, tt, vis, fa);///nc > 0 means as[i].v and sv have common suffix if(nc && check_trans_relation_by_path(sv, as[i].v, pq_p, NULL, NULL, pq_a, NULL, NULL, g, vis, fp+fa, nc, 0.45, d)) { @@ -1100,11 +1100,12 @@ int get_min_dec(clean_mul_t *cl, uint32_t positive_flag, uint32_t negative_flag, clean_t *p = NULL; (*v) = (uint32_t)-1; for (i = 0; i < cl->n; i++) cl->a[i].is_b = 0, cl->a[i].min_v = (uint32_t)-1; - kt_for(cl->n, bub_iden_worker, cl, cl->g->n_seq<<1); + kt_for(cl->n, bub_iden_worker, cl, cl->g->n_seq<<1);///fast check if there are bubbles in graph for (i = 0; i < cl->n; i++) { if(cl->a[i].is_b) ///pop bubble { + ///pop all bubbles n_pop = asg_pop_bubble_primary_trio(cl->g, cl->ug, cl->bs_flag, &(cl->a[i].b), cl->max_dist, positive_flag, negative_flag, o); if(n_pop ==0) fprintf(stderr, "ERROR-n_pop\n"); @@ -1116,6 +1117,7 @@ int get_min_dec(clean_mul_t *cl, uint32_t positive_flag, uint32_t negative_flag, { if(cl->a[i].min_v == (uint32_t)-1) continue; // if(b->min_v == (uint32_t)-1 || (b->min_d > d) || (b->min_d == d && b->min_v < eid)) + ///looks like select the minum node, and back if(!p || p->min_d > cl->a[i].min_d || (p->min_d == cl->a[i].min_d && cl->g->seq[p->min_v>>1].len > cl->g->seq[cl->a[i].min_v>>1].len) || (p->min_d == cl->a[i].min_d && cl->g->seq[p->min_v>>1].len == cl->g->seq[cl->a[i].min_v>>1].len && p->min_v > cl->a[i].min_v))