From 7dd4a848b93dc1c6b9403c181e2e06621139c7bb Mon Sep 17 00:00:00 2001 From: chhylp123 Date: Sun, 26 Sep 2021 13:51:13 -0400 Subject: [PATCH] debug graph --- CommandLines.cpp | 4 +- CommandLines.h | 3 +- Makefile | 3 +- Overlaps.cpp | 127 ++++- Overlaps.h | 2 +- Trio.cpp | 77 +++ htab.h | 1 + inter.cpp | 1254 ++++++++++++++++++++++++++++++++++++++++++++-- kalloc.cpp | 205 ++++++++ kalloc.h | 77 +++ kavl.h | 414 +++++++++++++++ khash.h | 635 +++++++++++++++++++++++ tovlp.cpp | 6 +- 13 files changed, 2747 insertions(+), 61 deletions(-) create mode 100644 kalloc.cpp create mode 100644 kalloc.h create mode 100644 kavl.h create mode 100644 khash.h diff --git a/CommandLines.cpp b/CommandLines.cpp index 064e2b7..7746faf 100644 --- a/CommandLines.cpp +++ b/CommandLines.cpp @@ -157,6 +157,7 @@ void init_opt(hifiasm_opt_t* asm_opt) asm_opt->hic_enzymes = NULL; asm_opt->hic_reads[0] = NULL; asm_opt->hic_reads[1] = NULL; + asm_opt->fn_bin_poy = NULL; asm_opt->ar = NULL; asm_opt->thread_num = 1; asm_opt->k_mer_length = 51; @@ -654,7 +655,7 @@ int CommandLine_process(int argc, char *argv[], hifiasm_opt_t* asm_opt) int c; - while ((c = ketopt(&opt, argc, argv, 1, "hvt:o:k:w:m:n:r:a:b:z:x:y:p:c:d:M:P:if:D:FN:1:2:3:4:l:s:O:eu", long_options)) >= 0) { + while ((c = ketopt(&opt, argc, argv, 1, "hvt:o:k:w:m:n:r:a:b:z:x:y:p:c:d:M:P:if:D:FN:1:2:3:4:5:l:s:O:eu", long_options)) >= 0) { if (c == 'h') { Print_H(asm_opt); @@ -684,6 +685,7 @@ int CommandLine_process(int argc, char *argv[], hifiasm_opt_t* asm_opt) else if (c == '2' || c == 'M') asm_opt->fn_bin_yak[1] = opt.arg; else if (c == '3') asm_opt->fn_bin_list[0] = opt.arg; else if (c == '4') asm_opt->fn_bin_list[1] = opt.arg; + else if (c == '5') asm_opt->fn_bin_poy = opt.arg; else if (c == 'x') asm_opt->max_drop_rate = atof(opt.arg); else if (c == 'y') asm_opt->min_drop_rate = atof(opt.arg); else if (c == 'p') asm_opt->small_pop_bubble_size = atoll(opt.arg); diff --git a/CommandLines.h b/CommandLines.h index 5490e8f..66f99a7 100644 --- a/CommandLines.h +++ b/CommandLines.h @@ -4,7 +4,7 @@ #include #include -#define HA_VERSION "0.16.1-r375" +#define HA_VERSION "0.16.1-r377" #define VERBOSE 0 @@ -38,6 +38,7 @@ typedef struct { char* required_read_name; char *fn_bin_yak[2]; char *fn_bin_list[2]; + char *fn_bin_poy; char *extract_list; enzyme *hic_reads[2]; enzyme *hic_enzymes; diff --git a/Makefile b/Makefile index 3d85a74..8da074f 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ INCLUDES= OBJS= CommandLines.o Process_Read.o Assembly.o Hash_Table.o \ POA.o Correct.o Levenshtein_distance.o Overlaps.o Trio.o kthread.o Purge_Dups.o \ htab.o hist.o sketch.o anchor.o extract.o sys.o ksw2_extz2_sse.o hic.o rcut.o horder.o \ - tovlp.o inter.o + tovlp.o inter.o kalloc.o EXE= hifiasm LIBS= -lz -lpthread -lm @@ -77,3 +77,4 @@ rcut.o: rcut.h horder.o: horder.h tovlp.o: tovlp.h inter.o: inter.h +kalloc.o: kalloc.h \ No newline at end of file diff --git a/Overlaps.cpp b/Overlaps.cpp index a1b794b..0c3e4c2 100644 --- a/Overlaps.cpp +++ b/Overlaps.cpp @@ -13090,6 +13090,61 @@ void hic_clean(asg_t* read_g) kv_destroy(ax); } +void update_poly_trio(uint32_t mm, uint32_t *hapS, uint32_t rn) +{ + uint32_t i; + for (i = 0; i < rn; i++) { + if(R_INF.trio_flag[i] == DROP) continue; + R_INF.trio_flag[i] = AMBIGU; + if(!hapS[i]) continue; + R_INF.trio_flag[i] = (hapS[i]&mm?FATHER:MOTHER); + } +} +void debug_hapS(uint32_t *hapS, uint32_t rn) +{ + uint32_t i, p, tot, nt, max_tot = 0, *occ = NULL, *freq = NULL; + for (i = 0; i < rn; i++) { + for (p = hapS[i], tot = 0; p; p>>=1, tot++); 
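        // Illustration: hapS[i] is a per-read bitmask in which bit (h-1) is set when
        // read i was binned to haplotype h (see ha_polybin_list below), so the loop
        // above shifts the mask right until it is empty and `tot` ends up as
        // 1 + the index of the highest set bit. A minimal sketch with a made-up mask:
        //   uint32_t p = 0x5, tot = 0;      // hypothetical read assigned to hap1 and hap3
        //   for (; p; p >>= 1, tot++);      // tot == 3, the highest haplotype id seen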
+ max_tot = MAX(max_tot, tot); + } + fprintf(stderr, "[M::%s:] ==> %u haplotypes in total\n", __func__, max_tot); + if(max_tot){ + CALLOC(occ, max_tot+1); + CALLOC(freq, max_tot+1); + for (i = 0; i < rn; i++) { + for (p = hapS[i], tot = nt = 0; p; p>>=1, tot++){ + if(p&1) occ[tot+1]++, nt++; + } + freq[nt]++; + } + for (i = 1; i <= max_tot; i++) { + fprintf(stderr, "[M::%s:] ==> # reads in hap%u: %u\n", __func__, i, occ[i]); + } + for (i = 0; i <= max_tot; i++) { + fprintf(stderr, "[M::%s:] ==> # reads within %u haplotypes: %u\n", __func__, i, freq[i]); + } + free(occ); free(freq); + } +} +void output_poly_trio(asg_t *sg, ma_sub_t* coverage_cut, char* output_file_name, ma_hit_t_alloc* sources, +ma_hit_t_alloc* reverse_sources, long long tipsLen, float tip_drop_ratio, long long stops_threshold, +R_to_U* ruIndex, float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, int is_bench, +bub_label_t* b_mask_t, uint32_t hapN) +{ + uint32_t i; + uint32_t *hapS = ha_polybin_list(&asm_opt); + // debug_hapS(hapS, sg->n_seq); + char *fp = NULL; MALLOC(fp, 100); + for (i = 0; i < hapN; i++){ + update_poly_trio(1<n_seq); + sprintf(fp, "hap%u", i+1); + output_trio_unitig_graph(sg, coverage_cut, output_file_name, FATHER, sources, reverse_sources, tipsLen, tip_drop_ratio, + stops_threshold, ruIndex, chimeric_rate, drop_ratio, max_hang, min_ovlp, is_bench, b_mask_t, fp); + } + free(fp); free(hapS); +} + + void output_contig_graph_alternative(asg_t *sg, ma_sub_t* coverage_cut, char* output_file_name, ma_hit_t_alloc* sources, R_to_U* ruIndex, int max_hang, int min_ovlp); void clean_u_trans_t_idx(kv_u_trans_t *ta, ma_ug_t *ug, asg_t *read_g); @@ -13212,9 +13267,9 @@ long long gap_fuzz, bub_label_t* b_mask_t) reduce_hamming_error(sg, sources, coverage_cut, max_hang, min_ovlp, gap_fuzz); ug_fa = output_trio_unitig_graph(sg, coverage_cut, output_file_name, FATHER, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, - 0.05, 0.9, max_hang, min_ovlp, rhits?1:0, b_mask_t); + 0.05, 0.9, max_hang, min_ovlp, rhits?1:0, b_mask_t, NULL); ug_mo = output_trio_unitig_graph(sg, coverage_cut, output_file_name, MOTHER, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, - 0.05, 0.9, max_hang, min_ovlp, rhits?1:0, b_mask_t); + 0.05, 0.9, max_hang, min_ovlp, rhits?1:0, b_mask_t, NULL); if(rhits) { ha_aware_order(rhits, sg, ug_fa, ug_mo, cov?&(cov->t_ch->k_trans):&(t_ch->k_trans), &opt, 3); @@ -13320,9 +13375,9 @@ long long gap_fuzz, bub_label_t* b_mask_t) reduce_hamming_error(sg, sources, coverage_cut, max_hang, min_ovlp, gap_fuzz); output_trio_unitig_graph(sg, coverage_cut, output_file_name, FATHER, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, - 0.05, 0.9, max_hang, min_ovlp, 0, b_mask_t); + 0.05, 0.9, max_hang, min_ovlp, 0, b_mask_t, NULL); output_trio_unitig_graph(sg, coverage_cut, output_file_name, MOTHER, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, - 0.05, 0.9, max_hang, min_ovlp, 0, b_mask_t); + 0.05, 0.9, max_hang, min_ovlp, 0, b_mask_t, NULL); } void set_trio_flag_by_cov(ma_ug_t *ug, asg_t *read_g, hap_cov_t *cov) @@ -14117,9 +14172,9 @@ bub_label_t* b_mask_t) kv_destroy(new_rtg_edges.a); output_trio_unitig_graph(sg, coverage_cut, output_file_name, FATHER, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, - 0.05, 0.9, max_hang, min_ovlp, 0, b_mask_t); + 0.05, 0.9, max_hang, min_ovlp, 0, b_mask_t, NULL); output_trio_unitig_graph(sg, coverage_cut, output_file_name, MOTHER, sources, reverse_sources, 
(asm_opt.max_short_tip*2), 0.15, 3, ruIndex, - 0.05, 0.9, max_hang, min_ovlp, 0, b_mask_t); + 0.05, 0.9, max_hang, min_ovlp, 0, b_mask_t, NULL); } ma_ug_t* merge_utg(ma_ug_t **dest, ma_ug_t **src) @@ -14186,11 +14241,11 @@ float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, bub_label_t* { ma_ug_t *ug_1 = output_trio_unitig_graph(sg, coverage_cut, output_file_name, FATHER, sources, reverse_sources, tipsLen, tip_drop_ratio, stops_threshold, ruIndex, - chimeric_rate, drop_ratio, max_hang, min_ovlp, 1, b_mask_t); + chimeric_rate, drop_ratio, max_hang, min_ovlp, 1, b_mask_t, NULL); ma_ug_t *ug_2 = output_trio_unitig_graph(sg, coverage_cut, output_file_name, MOTHER, sources, reverse_sources, tipsLen, tip_drop_ratio, stops_threshold, ruIndex, - chimeric_rate, drop_ratio, max_hang, min_ovlp, 1, b_mask_t); + chimeric_rate, drop_ratio, max_hang, min_ovlp, 1, b_mask_t, NULL); fprintf(stderr, "ug_1->u.n: %u, ug_2->u.n: %u\n", (uint32_t)ug_1->u.n, (uint32_t)ug_2->u.n); ma_ug_t *ug = merge_utg(&ug_1, &ug_2); fprintf(stderr, "ug->u.n: %u\n", (uint32_t)ug->u.n); @@ -15258,10 +15313,7 @@ hap_cov_t *cov, utg_trans_t *o) uint8_t if_primary_unitig(ma_utg_t* u, asg_t* read_g, ma_sub_t* coverage_cut, ma_hit_t_alloc* sources, R_to_U* ruIndex, uint8_t* r_flag) { - if(asm_opt.recover_atg_cov_min < 0 || asm_opt.recover_atg_cov_max < 0) - { - return 0; - } + if(asm_opt.recover_atg_cov_min < 0 || asm_opt.recover_atg_cov_max < 0) return 0; long long R_bases = 0, C_bases = 0, C_bases_primary = 0, C_bases_alter = 0; long long total_C_bases = 0, total_C_bases_primary = 0; uint32_t available_reads = 0, k, j, rId, tn, is_Unitig; @@ -15979,7 +16031,7 @@ float drop_ratio, uint32_t trio_flag, float trio_drop_rate, hap_cov_t *cov) cut_trio_tip_primary(g, ug, tipsLen, trio_flag, 0, read_g, reverse_sources, ruIndex, cov->is_r_het, 2); } - ///print_debug_gfa(read_g, ug, coverage_cut, "debug_dups", sources, ruIndex, asm_opt.max_hang_Len, asm_opt.min_overlap_Len); + // print_debug_gfa(read_g, ug, coverage_cut, "debug_dups", sources, ruIndex, asm_opt.max_hang_Len, asm_opt.min_overlap_Len); magic_trio_phasing(g, ug, read_g, coverage_cut, sources, reverse_sources, 2, ruIndex, trio_flag, trio_drop_rate); resolve_tangles(ug, read_g, reverse_sources, 20, 100, 0.05, 0.2, ruIndex, cov->is_r_het, trio_flag, drop_ratio); drop_semi_circle(ug, g, read_g, reverse_sources, ruIndex, cov->is_r_het); @@ -16870,6 +16922,31 @@ ma_hit_t_alloc* sources, R_to_U* ruIndex) return u_flag; } +void purge_dump(ma_ug_t* ug) +{ + asg_t* nsg = ug->g; + uint32_t v, n_vtx = nsg->n_seq, k, rId; + ma_utg_t *u = NULL; + for (v = 0; v < n_vtx; ++v) { + if (nsg->seq[v].del) continue; + u = &((ug)->u.a[v]); + if(u->m == 0) continue; + for (k = 0; k < u->n; k++){ + rId = u->a[k]>>33; + if(R_INF.trio_flag[rId] != AMBIGU && R_INF.trio_flag[rId] != DROP) break; + } + if(k >= u->n){ + if(u->m != 0){ + u->circ = u->end = u->len = u->m = u->n = u->start = 0; + free(u->a); + u->a = NULL; + } + asg_seq_del(nsg, v); + } + } + asg_cleanup(nsg); +} + void adjust_utg_by_trio(ma_ug_t **ug, asg_t* read_g, uint8_t flag, float drop_rate, ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, ma_sub_t* coverage_cut, @@ -16974,6 +17051,7 @@ kvec_asg_arc_t_warp* new_rtg_edges, bub_label_t* b_mask_t) set_drop_trio_flag(*ug); destory_hap_cov_t(&cov); + // purge_dump(*ug); renew_utg(ug, read_g, new_rtg_edges); } @@ -16999,10 +17077,11 @@ int debug_untig_length(ma_ug_t *g, uint32_t tipsLen, const char* name) ma_ug_t* output_trio_unitig_graph(asg_t *sg, ma_sub_t* coverage_cut, 
char* output_file_name, uint8_t flag, ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, long long tipsLen, float tip_drop_ratio, long long stops_threshold, R_to_U* ruIndex, -float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, int is_bench, bub_label_t* b_mask_t) +float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, int is_bench, bub_label_t* b_mask_t, +char *f_prefix) { char* gfa_name = (char*)malloc(strlen(output_file_name)+100); - sprintf(gfa_name, "%s.%s.p_ctg.gfa", output_file_name, (flag==FATHER?"hap1":"hap2")); + sprintf(gfa_name, "%s.%s.p_ctg.gfa", output_file_name, f_prefix?f_prefix:(flag==FATHER?"hap1":"hap2")); FILE* output_file = NULL; if(is_bench == 0) output_file = fopen(gfa_name, "w"); @@ -17043,13 +17122,13 @@ float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, int is_bench, ma_ug_print(ug, sg, coverage_cut, sources, ruIndex, (flag==FATHER?"h1tg":"h2tg"), output_file); fclose(output_file); - sprintf(gfa_name, "%s.%s.p_ctg.noseq.gfa", output_file_name, (flag==FATHER?"hap1":"hap2")); + sprintf(gfa_name, "%s.%s.p_ctg.noseq.gfa", output_file_name, f_prefix?f_prefix:(flag==FATHER?"hap1":"hap2")); output_file = fopen(gfa_name, "w"); ma_ug_print_simple(ug, sg, coverage_cut, sources, ruIndex, (flag==FATHER?"h1tg":"h2tg"), output_file); fclose(output_file); if(asm_opt.bed_inconsist_rate != 0) { - sprintf(gfa_name, "%s.%s.p_ctg.lowQ.bed", output_file_name, (flag==FATHER?"hap1":"hap2")); + sprintf(gfa_name, "%s.%s.p_ctg.lowQ.bed", output_file_name, f_prefix?f_prefix:(flag==FATHER?"hap1":"hap2")); output_file = fopen(gfa_name, "w"); ma_ug_print_bed(ug, sg, &R_INF, coverage_cut, sources, &new_rtg_edges, max_hang, min_ovlp, asm_opt.bed_inconsist_rate, (flag==FATHER?"h1tg":"h2tg"), output_file, NULL); @@ -30888,8 +30967,14 @@ ma_sub_t **coverage_cut_ptr, int debug_g) write_debug_graph(sg, sources, coverage_cut, output_file_name, n_read, reverse_sources, ruIndex); debug_gfa:; } - - if (ha_opt_triobin(&asm_opt) && ha_opt_hic(&asm_opt)) + + if(asm_opt.fn_bin_poy) + { + if(asm_opt.flag & HA_F_PARTITION) asm_opt.flag -= HA_F_PARTITION; + output_poly_trio(sg, coverage_cut, o_file, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, + 0.05, 0.9, max_hang_length, mini_overlap_length, 0, &b_mask_t, asm_opt.polyploidy); + } + else if (ha_opt_triobin(&asm_opt) && ha_opt_hic(&asm_opt)) { if(asm_opt.flag & HA_F_PARTITION) asm_opt.flag -= HA_F_PARTITION; benchmark_hic_graph(sg, coverage_cut, o_file, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, @@ -30900,10 +30985,10 @@ ma_sub_t **coverage_cut_ptr, int debug_g) if(asm_opt.flag & HA_F_PARTITION) asm_opt.flag -= HA_F_PARTITION; output_trio_unitig_graph(sg, coverage_cut, o_file, FATHER, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, - 0.05, 0.9, max_hang_length, mini_overlap_length, 0, &b_mask_t); + 0.05, 0.9, max_hang_length, mini_overlap_length, 0, &b_mask_t, NULL); output_trio_unitig_graph(sg, coverage_cut, o_file, MOTHER, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, - 0.05, 0.9, max_hang_length, mini_overlap_length, 0, &b_mask_t); + 0.05, 0.9, max_hang_length, mini_overlap_length, 0, &b_mask_t, NULL); } else if(ha_opt_hic(&asm_opt)) { diff --git a/Overlaps.h b/Overlaps.h index 9430e15..04e25cf 100644 --- a/Overlaps.h +++ b/Overlaps.h @@ -913,7 +913,7 @@ ma_ug_t* copy_untig_graph(ma_ug_t *src); ma_ug_t* output_trio_unitig_graph(asg_t *sg, ma_sub_t* coverage_cut, char* output_file_name, uint8_t flag, ma_hit_t_alloc* 
sources, ma_hit_t_alloc* reverse_sources, long long tipsLen, float tip_drop_ratio, long long stops_threshold, R_to_U* ruIndex, -float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, int is_bench, bub_label_t* b_mask_t); +float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, int is_bench, bub_label_t* b_mask_t, char *f_prefix); asg_t* copy_read_graph(asg_t *src); ma_ug_t *ma_ug_gen(asg_t *g); void ma_ug_destroy(ma_ug_t *ug); diff --git a/Trio.cpp b/Trio.cpp index 75f20f4..f03db1b 100644 --- a/Trio.cpp +++ b/Trio.cpp @@ -349,6 +349,83 @@ static void ha_triobin_list(const hifiasm_opt_t *opt) fprintf(stderr, "[M::%s::%.3f*%.2f] ==> partitioned reads with external lists\n", __func__, yak_realtime(), yak_cpu_usage()); } +inline void phrase_hstatus(char *s, char **rname, uint32_t *hid) +{ + char *p = NULL, *id = NULL; *rname = NULL; *hid = (uint32_t)-1; + uint32_t tot; + for (p = s, tot = 0; *p; ++p){ + if (*p == '\t' || *p == ' '){ + *p = 0; + if(!tot) *rname = p+1; + else break; + tot++; + } + } + for (p = s, tot = 0; *p; ++p){ + if (*p == '_') id = p + 1; + } + *hid = atoi(id); +} + +uint32_t *ha_polybin_list(const hifiasm_opt_t *opt) +{ + int64_t i; + khint_t k; + cstr_ht_t *h; + assert(R_INF.total_reads < (uint32_t)-1); + h = cstr_ht_init(); + for (i = 0; i < (int64_t)R_INF.total_reads; ++i) { + int absent; + char *str = (char*)calloc(Get_NAME_LENGTH(R_INF, i) + 1, 1); + strncpy(str, Get_NAME(R_INF, i), Get_NAME_LENGTH(R_INF, i)); + k = cstr_ht_put(h, str, &absent); + if (absent) kh_val(h, k) = i; + } + fprintf(stderr, "[M::%s::%.3f*%.2f] created the hash table for read names\n", __func__, yak_realtime(), yak_cpu_usage()); + + gzFile fp; + kstream_t *ks; + kstring_t str = {0,0,0}; + char *rname = NULL; + uint32_t hid, *ss = NULL; + int dret; + int64_t n_tot = 0, n_bin = 0; + fp = gzopen(opt->fn_bin_poy, "r"); + if (fp == 0) { + fprintf(stderr, "ERROR: failed to open file '%s'\n", opt->fn_bin_poy); + for (k = 0; k < kh_end(h); ++k) + if (kh_exist(h, k)) + free((char*)kh_key(h, k)); + cstr_ht_destroy(h); + return NULL; + } + CALLOC(ss, R_INF.total_reads); + ks = ks_init(fp); + while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) { + khint_t k; ++n_tot; + phrase_hstatus(str.s, &rname, &hid); + if((!(*rname)) || hid == (uint32_t)-1) { + fprintf(stderr, "ERROR: wrong hap status\n"); + continue; + } + k = cstr_ht_get(h, rname); + if (k != kh_end(h)) { + ss[kh_val(h, k)] |= (((uint32_t)1)<<(hid-1)); + ++n_bin; + } + } + free(str.s); + ks_destroy(ks); + gzclose(fp); + + for (k = 0; k < kh_end(h); ++k) + if (kh_exist(h, k)) + free((char*)kh_key(h, k)); + cstr_ht_destroy(h); + fprintf(stderr, "[M::%s::%.3f*%.2f] ==> partitioned reads with external lists\n", __func__, yak_realtime(), yak_cpu_usage()); + return ss; +} + void ha_triobin(const hifiasm_opt_t *opt) { memset(R_INF.trio_flag, AMBIGU, R_INF.total_reads * sizeof(uint8_t)); diff --git a/htab.h b/htab.h index 259de57..9f92171 100644 --- a/htab.h +++ b/htab.h @@ -105,6 +105,7 @@ double yak_peakrss_in_gb(void); double yak_cpu_usage(void); void ha_triobin(const hifiasm_opt_t *opt); +uint32_t *ha_polybin_list(const hifiasm_opt_t *opt); void mz1_ha_sketch(const char *str, int len, int w, int k, uint32_t rid, int is_hpc, ha_mz1_v *p, const void *hf, int sample_dist, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, ha_pt_t *pt, int min_freq, int32_t dp_min_len, float dp_e, st_mt_t *mt, int32_t ws, int32_t is_unique); void mz2_ha_sketch(const char *str, int len, int w, int k, uint32_t rid, int is_hpc, ha_mzl_v *p, const void 
*hf, int sample_dist, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* dbg_ct, ha_pt_t *pt, int min_freq, int32_t dp_min_len, float dp_e, st_mt_t *mt, int32_t ws, int32_t is_unique); diff --git a/inter.cpp b/inter.cpp index e4d6315..a966820 100644 --- a/inter.cpp +++ b/inter.cpp @@ -3,6 +3,9 @@ #include #include #include "kseq.h" // FASTA/Q parser +#include "kavl.h" +#include "khash.h" +#include "kalloc.h" #include "kthread.h" #include "inter.h" #include "Overlaps.h" @@ -11,8 +14,55 @@ #include "Hash_Table.h" KSEQ_INIT(gzFile, gzread) +#define MG_SEED_IGNORE (1ULL<<41) +#define MG_SEED_TANDEM (1ULL<<42) +#define MG_SEED_KEPT (1ULL<<43) + +#define MG_MAX_SEG 255 +#define MG_SEED_SEG_SHIFT 48 +#define MG_SEED_SEG_MASK (0xffULL<<(MG_SEED_SEG_SHIFT)) +#define mg_seg_id(a) ((int32_t)(((a).y&MG_SEED_SEG_MASK) >> MG_SEED_SEG_SHIFT)) + +#define MG_SEED_WT_SHIFT 56 +#define MG_MAX_SHORT_K 15 + +#define MG_SHORT_K_EXT 10000 ///1000 in minigraph + + +#define generic_key(x) (x) +KRADIX_SORT_INIT(gfa64, uint64_t, generic_key, 8) + +struct mg_tbuf_s { + void *km; + int frag_gap; +}; +typedef struct mg_tbuf_s mg_tbuf_t; + + +mg_tbuf_t *mg_tbuf_init(void) +{ + mg_tbuf_t *b; + b = (mg_tbuf_t*)calloc(1, sizeof(mg_tbuf_t)); + b->km = km_init(); + return b; +} + +void mg_tbuf_destroy(mg_tbuf_t *b) +{ + if (b == 0) return; + if (b->km) km_destroy(b->km); + free(b); +} + +void *mg_tbuf_get_km(mg_tbuf_t *b) +{ + return b->km; +} + typedef struct { - int w, k, bw, max_gap, is_HPC, hap_n; + int w, k, bw, max_gap, is_HPC, hap_n, occ_weight, max_gap_pre; + int max_lc_skip, max_lc_iter, min_lc_cnt, min_lc_score; + float chn_pen_gap; } mg_idxopt_t; typedef struct { @@ -26,6 +76,7 @@ typedef struct { // global data structure for kt_pipeline() const void *ha_flt_tab; const ha_pt_t *ha_idx; const mg_idxopt_t *opt; + const ma_ug_t *ug; kseq_t *ks; int64_t chunk_size; uint64_t n_thread; @@ -34,37 +85,118 @@ typedef struct { // global data structure for kt_pipeline() } uldat_t; typedef struct { // data structure for each step in kt_pipeline() + const mg_idxopt_t *opt; const void *ha_flt_tab; const ha_pt_t *ha_idx; + const ma_ug_t *ug; int n, m, sum_len; uint64_t *len, id; char **seq; - ma_ov_buf_t **mo; + ha_mzl_v *mzs; + st_mt_t *sps; + mg_tbuf_t **buf; } utepdat_t; -ma_ov_buf_t **init_ma_ov_buf_t_arr(uint64_t n) -{ - uint64_t i; - ma_ov_buf_t **p = NULL; CALLOC(p, 1); CALLOC(*p, n); - for (i = 0; i < n; i++){ - kv_init((*p)[i].sp); (*p)[i].abl = ha_abufl_init(); - init_Candidates_list(&((*p)[i].clist)); - init_overlap_region_alloc(&((*p)[i].olist)); - } - return p; -} +typedef struct { + ///off: start idx in mg128_t * a[]; + ///cnt: how many eles in this chain + ///a[off, off+cnt) saves the eles in this chain + int32_t off, cnt:31, inner_pre:1; + ///ref_id|rev + uint32_t v; + ///chain in ref: [rs, re) + ///chain in query: [qs, qe) + int32_t rs, re, qs, qe; + ///score: chain score + int32_t score, dist_pre; + uint32_t hash_pre; +} mg_lchain_t; -void destory_ma_ov_buf_t_arr(ma_ov_buf_t ***p, uint64_t n) -{ - uint64_t i; - for (i = 0; i < n; i++){ - kv_destroy((**p)[i].sp); ha_abufl_destroy((**p)[i].abl); - destory_Candidates_list(&((**p)[i].clist)); - destory_overlap_region_alloc(&((**p)[i].olist)); - } - free((**p)); free((*p)); -} +typedef struct { + uint32_t v, d; + int32_t pre; +} mg_pathv_t; + +KHASH_MAP_INIT_INT(sp, sp_topk_t) +KHASH_MAP_INIT_INT(sp2, uint64_t) + +///mg128_t->y: weight(8)seg_id(8)flag(8)span(8)pos(32) +///mg128_t->x: rid(31)rev(1)pos(33); keep reference +typedef struct { uint64_t x, y; } mg128_t; +#define 
sort_key_128x(a) ((a).x) +KRADIX_SORT_INIT(128x, mg128_t, sort_key_128x, 8) +void radix_sort_128x(mg128_t *beg, mg128_t *end); + +typedef struct { + uint32_t n; ///length of candidate list + uint64_t q_span:31, rev:1, q_pos:32; + uint32_t qid:16, weight:15, is_tandem:1; + const ha_idxposl_t *cr; ///candidate list +} mg_match_t; + +// shortest path +typedef struct { + // input + ///(lj_ref_id)|(lj_ref_rev^1) + uint32_t v; + ///target_dist should like the overlap length in string graph + ///it should be used to evaluate if the identified path is close to real path/alignment + int32_t target_dist; + uint32_t target_hash; + ///inner: if li and lj are at the same ref id + ///meta: j + uint32_t meta:30, check_hash:1, inner:1; + /** + * There are two cases: + * (1) lj->qs************lj->qe + * li->qs************li->qe + * (2) lj->qs************lj->qe + * li->qs************li->qe + * qlen = li->qs - lj->qe;///might be negative + * **/ + int32_t qlen; + // output + uint32_t n_path:31, is_0:1;///I guess n_path is how many path from src to dest + int32_t path_end;///looks like an idx to alignment + int32_t dist, mlen; + uint32_t hash; + // aux + uint64_t srt_key; +} mg_path_dst_t; + +typedef struct { + uint32_t srt; + int32_t i; +} gc_frag_t; +///I think this structure is just used for iteration +///iterate each ref id, instead of alignment id +typedef struct sp_node_s { + uint64_t di; // dist<<32 | node_id in avl tree(doesn't matter too much) + uint32_t v;///ref_id|rev + int32_t pre; + uint32_t hash; + int32_t is_0; + KAVL_HEAD(struct sp_node_s) head; +} sp_node_t, *sp_node_p; + +typedef struct { + int32_t k, mlen;//k: number of walks from src to this node + int32_t qs, qe; + sp_node_t *p[MG_MAX_SHORT_K]; // this forms a max-heap +} sp_topk_t; + +#define gc_frag_key(p) ((p).srt) +KRADIX_SORT_INIT(gc, gc_frag_t, gc_frag_key, 4) + +#define dst_key(p) ((p).srt_key) +KRADIX_SORT_INIT(dst, mg_path_dst_t, dst_key, 8) + +#define sp_node_cmp(a, b) (((a)->di > (b)->di) - ((a)->di < (b)->di)) +KAVL_INIT(sp, sp_node_t, head, sp_node_cmp) + +#define sp_node_lt(a, b) ((a)->di < (b)->di) +KSORT_INIT(sp, sp_node_p, sp_node_lt) void init_mg_opt(mg_idxopt_t *opt, int is_HPC, int k, int w, int hap_n) { @@ -74,6 +206,14 @@ void init_mg_opt(mg_idxopt_t *opt, int is_HPC, int k, int w, int hap_n) opt->is_HPC = is_HPC; opt->bw = 2000; opt->max_gap = 5000; + opt->occ_weight = 20; + opt->max_gap_pre = 10000;///1000 in minigraph + opt->max_lc_iter = 10000; + opt->chn_pen_gap = 0.19;///using minimap2's value + opt->max_lc_skip = 25;// mo->max_gc_skip = 25; + opt->max_lc_iter = 10000; + opt->min_lc_cnt = 2; + opt->min_lc_score = 30; } void uidx_build(ma_ug_t *ug, mg_idxopt_t *opt) @@ -91,13 +231,1052 @@ void uidx_destory() ha_pt_destroy(ha_idx); } + +///only use non-repetitive minimizers +static mg_match_t *collect_matches(void *km, int *_n_m, int max_occ, const void *ha_flt_tab, const ha_pt_t *ha_idx, int check_unique, const ha_mzl_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, int32_t **mini_pos) +{ + int rep_st = 0, rep_en = 0, n_m, tn, tw; + size_t i; + mg_match_t *m; + *n_mini_pos = 0; + KMALLOC(km, *mini_pos, mv->n);///mv->n how many minimizers in query + m = (mg_match_t*)kmalloc(km, mv->n * sizeof(mg_match_t)); + for (i = 0, n_m = 0, *rep_len = 0, *n_a = 0; i < mv->n; ++i) { + const ha_idxposl_t *cr; + ha_mzl_t *z = &mv->a[i]; + cr = ha_ptl_get(ha_idx, z->x, &tn); + tw = ha_ft_cnt(ha_flt_tab, z->x); + if (tw > max_occ) { ///the frequency of repetitive regions; ignore those minimizers + int en = z->pos + 1, st = en 
- z->span;//[st, en) + if (st > rep_en) { ///just record the length of repetive regions + *rep_len += rep_en - rep_st; + rep_st = st, rep_en = en; + } else rep_en = en; + } else { + mg_match_t *q = &m[n_m++]; + q->q_pos = z->pos, q->q_span = z->span, q->rev = z->rev, q->cr = cr, q->n = tn, q->qid = 0; + q->is_tandem = 0, q->weight = 255; + if(check_unique && tw != 1) q->is_tandem = 1, q->weight = 15; + *n_a += q->n;///how many candidates + (*mini_pos)[(*n_mini_pos)++] = z->pos;///minimizer offset in query + } + } + *rep_len += rep_en - rep_st; ///the length of repetitive regions + *_n_m = n_m; + return m; +} + +mg128_t *collect_seed_hits(void *km, const mg_idxopt_t *opt, int max_occ, const void *ha_flt_tab, const ha_pt_t *ha_idx, +const ma_ug_t *ug, const ha_mzl_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, int32_t **mini_pos) +{ + int i, n_m; + mg128_t *a = NULL; + mg_match_t *m = collect_matches(km, &n_m, max_occ, ha_flt_tab, ha_idx, 1, mv, n_a, rep_len, n_mini_pos, mini_pos); + a = (mg128_t*)kmalloc(km, *n_a * sizeof(mg128_t));///n_a: how many available candidates in total + for (i = 0, *n_a = 0; i < n_m; ++i) {///n_m: how many available seeds, instead of candidates + mg_match_t *q = &m[i]; + const ha_idxposl_t *r = q->cr; + uint32_t k; + for (k = 0; k < q->n; ++k) {///q->n: number of candidates belonging to seed m[i] + mg128_t *p; + p = &a[(*n_a)++];///pick up a slot for one candidate + if (r[k].rev == q->rev) // forward strand + p->x = (uint64_t)(r[k].rid)<<33|r[k].pos; ///reference: rid(31)|rev(1)|pos(32) + else // reverse strand + p->x = (uint64_t)(r[k].rid)<<33 | 1ULL<<32 | (ug->g->seq[r[k].rid].len - (r[k].pos + 1 - r[k].span) - 1); + p->y = (uint64_t)q->q_span << 32 | q->q_pos; + p->y |= (uint64_t)q->qid << MG_SEED_SEG_SHIFT; + if (q->is_tandem) p->y |= MG_SEED_TANDEM; + p->y |= (uint64_t)q->weight << MG_SEED_WT_SHIFT; + ///p->y: weight(8)seg_id(8)flag(8)span(8)pos(32) + ///p->x: rid(31)rev(1)pos(33); keep reference + } + } + kfree(km, m); + radix_sort_128x(a, a + (*n_a)); + return a; +} + +///r is 1000 in default +///remove isolated hits, whic are not close enough to others +static int64_t flt_anchors(int64_t n_a, mg128_t *a, int32_t r) +{ + int64_t i, j; + for (i = 0; i < n_a; ++i) { + for (j = i - 1; j >= 0; --j) { + /** + * a is sorted by x + * a[].x: ref_id(31)rev(1)r_pos(32) + * a[].y: weight(8)query_id(8)flag(8)span(8)q_pos(32) + **/ + int32_t dq; + int64_t dr = a[i].x - a[j].x;///a is sorted by x + if (dr > r) break;///if two candidates coming from differnt unitigs, dr would be extremly large + dq = (int32_t)a[i].y - (int32_t)a[j].y; + if (dq > r || dq < 0) continue; + a[j].y |= MG_SEED_KEPT; + a[i].y |= MG_SEED_KEPT; + break; + } + } + for (i = n_a - 1; i >= 0; --i) { + if (a[i].y & MG_SEED_KEPT) continue; + for (j = i + 1; j < n_a; ++j) { + int32_t dq; + int64_t dr = a[j].x - a[i].x; + if (dr > r) break; + dq = (int32_t)a[j].y - (int32_t)a[i].y; + if (dq > r || dq < 0) continue; + a[j].y |= MG_SEED_KEPT; + a[i].y |= MG_SEED_KEPT; + break; + } + } + for (i = j = 0; i < n_a; ++i) + if (a[i].y & MG_SEED_KEPT) + a[j++] = a[i]; + return j; +} + +static inline float mg_log2(float x) // NB: this doesn't work when x<2 +{ + union { float f; uint32_t i; } z = { x }; + float log_2 = ((z.i >> 23) & 255) - 128; + z.i &= ~(255 << 23); + z.i += 127 << 23; + log_2 += (-0.34484843f * z.f + 2.02466578f) * z.f - 0.67487759f; + return log_2; +} + +inline int32_t normal_sc(uint64_t w, int32_t sc) +{ + if(w < 255){ + int32_t tmp = (int)(0.00392156862745098 * w * sc); // 0.00392... 
= 1/255 + sc = tmp > 1? tmp : 1; + } + return sc; +} +// ai[].x: ref_id(31)rev(1)r_pos(32) +// ai[].y: weight(8)query_id(8)flag(8)span(8)q_pos(32) +// comput_sc(&a[i], &a[j], max_dist_x, max_dist_y, bw, chn_pen_gap, chn_pen_skip, is_cdna, n_segs); +static inline int32_t comput_sc(const mg128_t *ai, const mg128_t *aj, int32_t max_dist_x, int32_t max_dist_y, int32_t bw, float chn_pen_gap) +{ + int32_t dq = (int32_t)ai->y - (int32_t)aj->y, dr = (int32_t)ai->x - (int32_t)aj->x, dd, dg, q_span, sc; + ///ai and aj has already been sorted by x + ///which means ai->x >= aj->x + if (dq <= 0 || dq > max_dist_x) return INT32_MIN; + if (dr <= 0 || dr > max_dist_y) return INT32_MIN; + dd = dr > dq? dr - dq : dq - dr; ///indel, dd is always >= 0 + if (dd > bw) return INT32_MIN; + dg = dr < dq? dr : dq;///MIN(dr, dq) + q_span = aj->y>>32&0xff;///query span; should be ai->y>>32&0xff, is it a bug? + sc = normal_sc(aj->y>>MG_SEED_WT_SHIFT, (q_span q_span: there are some bases that are not covered between ai and aj + ///it is if (dd || dg > q_span) in minigraph + if (dd) { + float lin_pen, log_pen; + lin_pen = chn_pen_gap * (float)dd; + log_pen = dd >= 2? mg_log2(dd) : 0.0f; // mg_log2() only works for dd>=2 + sc -= (int)(lin_pen + log_pen); + } + return sc; +} + + +///p[]: id of last +///f[]: the score ending at i, not always the peak +///v[]: keeps the peak score up to i; +///t[]: id of next +///min_cnt = 2; min_sc = 30; extra_u = 0 +///u = mg_chain_backtrack(n, f, p, v, t, min_cnt, min_sc, 0, &n_u, &n_v); +uint64_t *mg_chain_backtrack(void *km, int64_t n, const int32_t *f, const int64_t *p, int32_t *v, int32_t *t, int32_t min_cnt, int32_t min_sc, int32_t extra_u, int32_t *n_u_, int32_t *n_v_) +{ + mg128_t *z; + uint64_t *u; + int64_t i, k, n_z, n_v; + int32_t n_u; + // v[] keeps the peak score up to i; f[] is the score ending at i, not always the peak + *n_u_ = *n_v_ = 0; + for (i = 0, n_z = 0; i < n; ++i) // precompute n_z + if (f[i] >= min_sc) ++n_z; + if (n_z == 0) return 0; + KMALLOC(km, z, n_z); + for (i = 0, k = 0; i < n; ++i) // populate z[] + if (f[i] >= min_sc) z[k].x = f[i], z[k++].y = i; + radix_sort_128x(z, z + n_z);///sort by score + + memset(t, 0, n * 4);///t is a buffer + ///from the largest to the smallest + for (k = n_z - 1, n_v = n_u = 0; k >= 0; --k) { // precompute n_u + int64_t n_v0 = n_v; + int32_t sc; + ///note t[i] == 0 is not used to find local alignment + ///say if we have already found a long chain, then the secondary might be able to merged to the long chain + ///t[i] == 0 is used to find those chains + for (i = z[k].y; i >= 0 && t[i] == 0; i = p[i]) + ++n_v, t[i] = 1; + sc = i < 0? z[k].x : (int32_t)z[k].x - f[i]; + if (sc >= min_sc && n_v > n_v0 && n_v - n_v0 >= min_cnt) + ++n_u;///how many chains, including primary chains and non-primary chains + else n_v = n_v0; + } + KMALLOC(km, u, n_u + extra_u); + memset(t, 0, n * 4); + for (k = n_z - 1, n_v = n_u = 0; k >= 0; --k) { // populate u[] + int64_t n_v0 = n_v; + int32_t sc; + for (i = z[k].y; i >= 0 && t[i] == 0; i = p[i]) + v[n_v++] = i, t[i] = 1; + sc = i < 0? 
z[k].x : (int32_t)z[k].x - f[i]; + if (sc >= min_sc && n_v > n_v0 && n_v - n_v0 >= min_cnt) + u[n_u++] = (uint64_t)sc << 32 | (n_v - n_v0); + else n_v = n_v0; + } + kfree(km, z); + assert(n_v < INT32_MAX); + *n_u_ = n_u, *n_v_ = n_v; + return u; +} + +//u[]: sc|occ of chains +//v[]: idx of each element +static mg128_t *compact_a(void *km, int32_t n_u, uint64_t *u, int32_t n_v, int32_t *v, mg128_t *a) +{ + mg128_t *b, *w; + uint64_t *u2; + int64_t i, j, k; + + // write the result to b[] + KMALLOC(km, b, n_v); + for (i = 0, k = 0; i < n_u; ++i) { + int32_t k0 = k, ni = (int32_t)u[i]; + for (j = 0; j < ni; ++j) + b[k++] = a[v[k0 + (ni - j - 1)]];///write all elements of a chain together + } + kfree(km, v); + + // sort u[] and a[] by the target position, such that adjacent chains may be joined + KMALLOC(km, w, n_u); + for (i = k = 0; i < n_u; ++i) {///n_u: how many chains + ///x: ref_id(31)rev(1)r_pos(32) + w[i].x = b[k].x, w[i].y = (uint64_t)k<<32|i; + k += (int32_t)u[i]; + } + radix_sort_128x(w, w + n_u);///sort by ref_id(31)rev(1)r_pos(32); r_pos is the start pos of chain + KMALLOC(km, u2, n_u); + for (i = k = 0; i < n_u; ++i) {///note merge chain; just place close chains together + ///j is chain id; n is how many elements in j-th chain + int32_t j = (int32_t)w[i].y, n = (int32_t)u[j]; + u2[i] = u[j]; + memcpy(&a[k], &b[w[i].y>>32], n * sizeof(mg128_t)); + k += n; + } + memcpy(u, u2, n_u * 8); + memcpy(b, a, k * sizeof(mg128_t)); // write _a_ to _b_ and deallocate _a_ because _a_ is oversized, sometimes a lot + kfree(km, a); kfree(km, w); kfree(km, u2); + return b; +} + +/* Input: + * a[].x: ref_id(31)rev(1)r_pos(32) + * a[].y: weight(8)query_id(8)flag(8)span(8)q_pos(32) + * n: length of a[] + * Output: + * n_u: #chains + * u[]: score<<32 | #anchors (sum of lower 32 bits of u[] is the returned length of a[]) + * input a[] is deallocated on return + */ +///is_cdna is is_splice +mg128_t *mg_lchain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int max_iter, int min_cnt, int min_sc, float chn_pen_gap, + int64_t n, mg128_t *a, int *n_u_, uint64_t **_u, void *km) +{ // TODO: make sure this works when n has more than 32 bits + int32_t *f, *t, *v, n_u, n_v; + int64_t *p, i, j, max_ii, st = 0; + uint64_t *u; + + if (_u) *_u = 0, *n_u_ = 0; + if (n == 0 || a == 0) return 0; + KMALLOC(km, p, n);///id of last cell + KMALLOC(km, f, n);///f[] is the score ending at i, not always the peak + KMALLOC(km, v, n);///v[] keeps the peak score up to i; + KCALLOC(km, t, n);///t doesn't matter too much; it is mainly used to accelrate the iteration + + // a[].x: ref_id(31)rev(1)r_pos(32) + // a[].y: weight(8)query_id(8)flag(8)span(8)q_pos(32) + // fill the score and backtrack arrays + for (i = st = 0, max_ii = -1; i < n; ++i) { + int64_t max_j = -1, end_j; + ///max_f -> minimizer span length in query, which is the initial score + int32_t max_f = normal_sc(a[i].y>>MG_SEED_WT_SHIFT, a[i].y>>32&0xff), n_skip = 0; + ///until we are at the same rid, same direction, and the coordinates are close enough + while (st < i && (a[i].x>>32 != a[st].x>>32 || a[i].x > a[st].x + max_dist_x)) ++st; + ///max_iter = 10000 in default, which means dp can go back to up to 10000 cells + if (i - st > max_iter) st = i - max_iter; + for (j = i - 1; j >= st; --j) { + int32_t sc; + sc = comput_sc(&a[i], &a[j], max_dist_x, max_dist_y, bw, chn_pen_gap); + if (sc == INT32_MIN) continue; + sc += f[j]; + if (sc > max_f) { + max_f = sc, max_j = j; + if (n_skip > 0) --n_skip; + } else if (t[j] == (int32_t)i) {///note we scan j backwards; 
we don't need to update t[] for each i + if (++n_skip > max_skip) + break; + } + if (p[j] >= 0) t[p[j]] = i; + } + end_j = j;///end_j might be > 0 + ///if not close enough, select a new max + if (max_ii < 0 || (int64_t)(a[i].x - a[max_ii].x) > (int64_t)max_dist_x) {///select a new max + int32_t max = INT32_MIN; + max_ii = -1; + for (j = i - 1; j >= st; --j) + if (max < f[j]) max = f[j], max_ii = j; + } + ///note: it will happen when `max_ii` < `end_j`; + ///iteration is terminated at `end_j` mostly because of `max_skip` and `max_iter` + if (max_ii >= 0 && max_ii < end_j) { + int32_t tmp; + tmp = comput_sc(&a[i], &a[max_ii], max_dist_x, max_dist_y, bw, chn_pen_gap); + if (tmp != INT32_MIN && max_f < tmp + f[max_ii]) + max_f = tmp + f[max_ii], max_j = max_ii; + } + // v[] keeps the peak score up to i; f[] is the score ending at i, not always the peak + f[i] = max_f, p[i] = max_j; + v[i] = max_j >= 0 && v[max_j] > max_f? v[max_j] : max_f; + if (max_ii < 0 || ((int64_t)(a[i].x - a[max_ii].x) <= (int64_t)max_dist_x && f[max_ii] < f[i])) + max_ii = i; + } + ///after mg_chain_backtrack, the results are saved in u and v; + u = mg_chain_backtrack(km, n, f, p, v, t, min_cnt, min_sc, 0, &n_u, &n_v); + *n_u_ = n_u, *_u = u; // NB: note that u[] may not be sorted by score here + kfree(km, p); kfree(km, f); kfree(km, t); + if (n_u == 0) { + kfree(km, a); kfree(km, v); + return 0; + } + //u[]: sc|occ of chains + //v[]: idx of each element + return compact_a(km, n_u, u, n_v, v, a); +} + +///qlen: query length +///u[]: sc|occ of chains +///a[]: candidate list +mg_lchain_t *mg_lchain_gen(void *km, int qlen, int n_u, uint64_t *u, mg128_t *a) +{ + mg128_t *z; + mg_lchain_t *r; + int i, k; + + if (n_u == 0) return 0; + KCALLOC(km, r, n_u); KMALLOC(km, z, n_u); + // u[] is sorted by query position + for (i = k = 0; i < n_u; ++i) { + /** + * a[].x: ref_id(31)rev(1)r_pos(32) + * a[].y: weight(8)query_id(8)flag(8)span(8)q_pos(32) + **/ + ///u[]: sc(32)occ(32) + int32_t qs = (int32_t)a[k].y + 1 - (a[k].y>>32 & 0xff); + z[i].x = (uint64_t)qs << 32 | u[i] >> 32; + z[i].y = (uint64_t)k << 32 | (int32_t)u[i]; + k += (int32_t)u[i]; + } + radix_sort_128x(z, z + n_u);//sort by qs|sc + + // populate r[] + for (i = 0; i < n_u; ++i) { + mg_lchain_t *ri = &r[i]; + /** + * z[].x: query start pos| chain score + * z[].y: idx in a[] | chain occ + * a[].x: ref_id(31)rev(1)r_pos(32) + * a[].y: weight(8)query_id(8)flag(8)span(8)q_pos(32) + * **/ + int32_t k = z[i].y >> 32, q_span = a[k].y >> 32 & 0xff; + ri->off = k; + ri->cnt = (int32_t)z[i].y; + ri->score = (uint32_t)z[i].x; + ri->v = a[k].x >> 32;///ref_id|rev + ri->rs = (int32_t)a[k].x + 1 > q_span? 
(int32_t)a[k].x + 1 - q_span : 0; // for HPC k-mer + ri->qs = z[i].x >> 32; + ri->re = (int32_t)a[k + ri->cnt - 1].x + 1; + ri->qe = (int32_t)a[k + ri->cnt - 1].y + 1; + } + kfree(km, z); + return r; +} + +static int32_t get_mini_idx(const mg128_t *a, int32_t n, const int32_t *mini_pos) +{ + int32_t x, L = 0, R = n - 1; + x = (int32_t)a->y; + while (L <= R) { // binary search + int32_t m = ((uint64_t)L + R) >> 1; + int32_t y = mini_pos[m]; + if (y < x) L = m + 1; + else if (y > x) R = m - 1; + else return m; + } + return -1; +} + +/* Before: + * a[].x: ref_id(31)rev(1)r_pos(32) + * a[].y: weight(8)query_id(8)flag(8)span(8)q_pos(32) + * After: + * a[].x: idx_in_minimizer_arr(32)r_pos(32) + * a[].y: weight(8)query_id(8)flag(8)span(8)q_pos(32) + */ +void mg_update_anchors(int32_t n_a, mg128_t *a, int32_t n, const int32_t *mini_pos) +{ + int32_t st, j, k; + if (n_a <= 0) return; + st = get_mini_idx(&a[0], n, mini_pos); + assert(st >= 0); + for (k = 0, j = st; j < n && k < n_a; ++j) + if ((int32_t)a[k].y == mini_pos[j]) + a[k].x = (uint64_t)j << 32 | (a[k].x & 0xffffffffU), ++k; + assert(k == n_a); +} + +static int32_t find_max(int32_t n, const gc_frag_t *gf, uint32_t x) +{ + int32_t s = 0, e = n; + if (n == 0) return -1; + if (gf[n-1].srt < x) return n - 1; + if (gf[0].srt >= x) return -1; + while (e > s) { // TODO: finish this block + int32_t m = s + (e - s) / 2; + if (gf[m].srt >= x) e = m; + else s = m + 1; + } + assert(s == e); + return s; +} +///target_dist should like the overlap length in string graph +///it should be used to evaluate if the identified path is close to real path/alignment +static int32_t mg_target_dist(const asg_t *g, const mg_lchain_t *l0, const mg_lchain_t *l1) +{ + /** + case 1: l0->qs************l0->qe + l1->qs************l1->qe + case 2: l0->qs************l0->qe + l1->qs************l1->qe + + *****l0->rs************l0->re***** + ****l1->rs************l1->re** + * **/ + ///min_dist = l1->rs + (g->seg[l0->v>>1].len - l0->re); + // below equals (l1->qs - l0->qe) - min_dist + g->seg[l1->v>>1].len; see mg_gchain1_dp() for the calculation of min_dist + //(l1->qs - l0->qe) is the gap in query, min_dist is the gap in reference + return (l1->qs - l0->qe) - (g->seq[l0->v>>1].len - l0->re) + (g->seq[l1->v>>1].len - l1->rs); + // when l0->v == l1->v, the above becomes (l1->qs - l0->qe) - (l1->rs - l0->re), which is what we want +} + +static inline sp_node_t *gen_sp_node(void *km, uint32_t v, int32_t d, int32_t id) +{ + sp_node_t *p; + KMALLOC(km, p, 1); + p->v = v, p->di = (uint64_t)d<<32 | id, p->pre = -1, p->is_0 = 1; + return p; +} + +///max_dist is like the overlap length in string graph +///mg_shortest_k(km, g, li->v^1, n_dst, dst, max_dist_g + (g->seg[li->v>>1].len - li->rs), MG_MAX_SHORT_K, 0, 0, 1, 0); +///the end position of qs is li->qs; dst[]->->qlen indicate the region that need to be checked in bases +mg_pathv_t *mg_shortest_k(void *km0, const asg_t *g, uint32_t src, int32_t n_dst, mg_path_dst_t *dst, int32_t max_dist, int32_t max_k, int32_t ql, const char *qs, int is_rev, int32_t *n_pathv) +{ + sp_node_t *p, *root = 0, **out; + sp_topk_t *q; + khash_t(sp) *h; + khash_t(sp2) *h2; + void *km; + khint_t k; + int absent; + int32_t i, j, n_done, n_found, n_seeds = 0; + uint32_t id, n_out, m_out; + int8_t *dst_done; + mg_pathv_t *ret = 0; + uint64_t *dst_group, *seeds = 0; + /** //path + void *h_seeds = 0; + mg128_v mini = {0,0,0}; + **/ + + if (n_pathv) *n_pathv = 0;///for us, n_pathv = NULL + if (n_dst <= 0) return 0;///n_dst: how many candidate nodes + for (i = 0; i 
< n_dst; ++i) { // initialize + mg_path_dst_t *t = &dst[i]; + ///if src and dest are at the same ref id, there are already one path + if (t->inner)///if two chains are at the same ref id + t->dist = 0, t->n_path = 1, t->path_end = -1; + else + t->dist = -1, t->n_path = 0, t->path_end = -1; + } + if (max_k > MG_MAX_SHORT_K) max_k = MG_MAX_SHORT_K; + km = km_init2(km0, 0x4000); + + /** //path + ///for the first time, we just check th reachability without sequence (qs); + ///but for the second round, we need to check sequence + ///qs is the sequence between two minimizers + if (ql > 0 && qs) { // build the seed hash table for the query + mg_sketch(km, qs, ql, MG_SHORT_KW, MG_SHORT_KK, 0, &mini); + // mini->a[].x = hash_key<<8 | kmerSpan + // mini->a[].y = rid<<32 | lastPos<<1 | strand + if (is_rev)///is_rev = 1; + for (i = 0; i < mini.n; ++i)///reverse qs[0, ql) to qs(ql, 0] + mini.a[i].y = (ql - (((int32_t)mini.a[i].y>>1) + 1 - MG_SHORT_KK) - 1) << 1 | ((mini.a[i].y&1)^1); + ///h_seeds is the ordinary hash index + h_seeds = mg_idx_a2h(km, mini.n, mini.a, 0, &seeds, &n_seeds); + ///h_seeds+seeds+n_seeds ----> hash index of qs[0, ql) + } + **/ + + KCALLOC(km, dst_done, n_dst); + KMALLOC(km, dst_group, n_dst); + // multiple dst[] may have the same dst[].v. We need to group them first. + // in other words, one ref id may have multiple dst alignment chains + for (i = 0; i < n_dst; ++i) + dst_group[i] = (uint64_t)dst[i].v<<32 | i; + radix_sort_gfa64(dst_group, dst_group + n_dst); + + h2 = kh_init2(sp2, km); // this hash table keeps all destinations from the same ref id + kh_resize(sp2, h2, n_dst * 2); + ///please note that one contig in ref may have multiple alignment chains + ///so h2 is a index that helps us to query it + ///key(h2) = ref id; value(h2) = start_idx | occ + for (i = 1, j = 0; i <= n_dst; ++i) { + if (i == n_dst || dst_group[i]>>32 != dst_group[j]>>32) { + k = kh_put(sp2, h2, dst_group[j]>>32, &absent); + kh_val(h2, k) = (uint64_t)j << 32 | (i - j); + assert(absent); + j = i; + } + } + + + + h = kh_init2(sp, km); // this hash table keeps visited vertices; path to each visited vertice + kh_resize(sp, h, 16); + + m_out = 16, n_out = 0;///16 is just the initial size + KMALLOC(km, out, m_out); + + /** + typedef struct { + int32_t k, mlen;//k: number of walks from src to this node + int32_t qs, qe; + sp_node_t *p[MG_MAX_SHORT_K]; // this forms a max-heap; all path + } sp_topk_t; + **/ + id = 0; + p = gen_sp_node(km, src, 0, id++);///just malloc a node for src; the distance is 0 + p->hash = __ac_Wang_hash(src); + kavl_insert(sp, &root, p, 0);///should be avl tree + + k = kh_put(sp, h, src, &absent);///here is a hash table + q = &kh_val(h, k); + q->k = 1, q->p[0] = p, q->mlen = 0, q->qs = q->qe = -1; + + n_done = 0; + ///the key of avl tree: #define sp_node_cmp(a, b) (((a)->di > (b)->di) - ((a)->di < (b)->di)) + ///the higher bits of (*)->di is distance to src node + ///so the key of avl tree is distance + while (kavl_size(head, root) > 0) {///thr first root is src + int32_t i, nv; + asg_arc_t *av; + sp_node_t *r; + ///note that one node might be visited multiple times if there are circles + ///delete the first node + r = kavl_erase_first(sp, &root); // take out the closest vertex in the heap (as a binary tree) + //fprintf(stderr, "XX\t%d\t%d\t%d\t%c%s[%d]\t%d\n", n_out, kavl_size(head, root), n_finished, "><"[(r->v&1)^1], g->seg[r->v>>1].name, r->v, (int32_t)(r->di>>32)); + if (n_out == m_out) KEXPAND(km, out, m_out); + ///higher 32 bits might be the distance to root node + // lower 32 
bits now for position in the out[] array + r->di = r->di>>32<<32 | n_out; ///n_out is just the id in out + ///so one node id in graph might be saved multiple times in avl tree and out[] + out[n_out++] = r;///out[0] = src + + ///r->v is the dst vertex id + ///sometimes k==kh_end(h2). Some nodes are found by graph travesal but not in linear chain alignment + k = kh_get(sp2, h2, r->v); + // we have reached one dst vertex + // note that one dst vertex may have multipe alignment chains + // we can visit some nodes in graph which are not reachable during chaining + // h2 is used to determine if one node is reachable or not + if (k != kh_end(h2)) { + ///node r->v might be visited multiple times + int32_t j, dist = r->di>>32, off = kh_val(h2, k) >> 32, cnt = (int32_t)kh_val(h2, k); + //src can reach ref id r->v; there might be not only one alignment chain in r->v + //so we need to scan all of them + for (j = 0; j < cnt; ++j) { + mg_path_dst_t *t = &dst[(int32_t)dst_group[off + j]]; + int32_t done = 0; + ///the src and dest are at the same ref id, say we directly find the shortest path + if (t->inner) { + done = 1; + } else { + int32_t mlen = 0, copy = 0; + ///in the first round, we just check reachability without sequence + ///so h_seeds = NULL; we can assume mlen = 0 + /** //path + mlen = h_seeds? path_mlen(out, n_out - 1, h, t->qlen) : 0; + **/ + //if (mg_dbg_flag & MG_DBG_GC1) fprintf(stderr, " src=%c%s[%d],qlen=%d\tdst=%c%s[%d]\ttarget_distx=%d,target_hash=%x\tdistx=%d,mlen=%d,hash=%x\n", "><"[src&1], g->seg[src>>1].name, src, ql, "><"[t->v&1], g->seg[t->v>>1].name, t->v, t->target_dist - g->seg[src>>1].len, t->target_hash, dist - g->seg[src>>1].len, mlen, r->hash); + // note: t indicates a linear alignmnet, instead of a node in graph + ///target_dist should be the distance on query + if (t->n_path == 0) { // keep the shortest path + copy = 1; + } else if (t->target_dist >= 0) { // we have a target distance; choose the closest + if (dist == t->target_dist && t->check_hash && r->hash == t->target_hash) { // we found the target path + copy = 1, done = 1; + } else { + int32_t d0 = t->dist, d1 = dist; + d0 = d0 > t->target_dist? d0 - t->target_dist : t->target_dist - d0; + d1 = d1 > t->target_dist? 
d1 - t->target_dist : t->target_dist - d1; + ///if the new distance (d1) is smaller than the old distance (d0), update the results + ///in other words, the length of new path should be closer to t->target_dist + if (d1 - mlen/2 < d0 - t->mlen/2) copy = 1; + } + } + if (copy) { + t->path_end = n_out - 1, t->dist = dist, t->hash = r->hash, t->mlen = mlen, t->is_0 = r->is_0; + if (t->target_dist >= 0) { + ///src is from li from li to lj, so the dis is generally increased + ///target_dist should be the distance on query + if (dist == t->target_dist && t->check_hash && r->hash == t->target_hash) done = 1; + else if (dist > t->target_dist + MG_SHORT_K_EXT) done = 1; + } + } + ++t->n_path;///we found a path to the alignment t + if (t->n_path >= max_k) done = 1; + } + if (dst_done[off + j] == 0 && done) + dst_done[off + j] = 1, ++n_done; + } + ///if all alignments have been settle down + ///pre-end; accelerate the loop + if (n_done == n_dst) break; + } + + ///below is used to push new nodes to avl tree for iteration + nv = asg_arc_n(g, r->v); + av = asg_arc_a(g, r->v); + for (i = 0; i < nv; ++i) { // visit all neighbors + asg_arc_t *ai = &av[i]; + ///v_lv is the (dest_length - overlap_length); it is a normal path length in string graph + ///ai->v_lv is the path length from r->v to ai->w + ///(r->di>>32) + int32_t d = (r->di>>32) + (uint32_t)ai->ul; + if (d > max_dist) continue; // don't probe vertices too far away + ///ai->w is the dest ref id; we insert a new ref id, instead of an alignment chain + k = kh_put(sp, h, ai->v, &absent);///one node might be visited multiple times + q = &kh_val(h, k); + if (absent) { // a new vertex visited + ///q->k: number of walks from src to ai->w + q->k = 0, q->qs = q->qe = -1; q->mlen = 0; + ///h_seeds = NULL; so q->mlen = 0 + /** //path + q->mlen = h_seeds && d + gfa_arc_lw(g, *ai) <= max_dist? node_mlen(km, g, ai->w, &mini, h_seeds, n_seeds, seeds, &q->qs, &q->qe) : 0; + **/ + //if (ql && qs) fprintf(stderr, "ql=%d,src=%d\tv=%c%s[%d],n_seeds=%d,mlen=%d\n", ql, src, "><"[ai->w&1], g->seg[ai->w>>1].name, ai->w, n_seeds, q->mlen); + } + ///if there are less than walks from src to ai->w, directly add + ///if there are more, keep the smallest walks + if (q->k < max_k) { // enough room: add to the heap + p = gen_sp_node(km, ai->v, d, id++); + p->pre = n_out - 1;///the parent node of this one + p->hash = r->hash + __ac_Wang_hash(ai->v); + p->is_0 = r->is_0; + /** //path + if (ai->rank > 0) p->is_0 = 0; + **/ + kavl_insert(sp, &root, p, 0); + q->p[q->k++] = p; + ks_heapup_sp(q->k, q->p);///adjust heap by distance + } else if (q->p[0]->di>>32 > d) { // shorter than the longest path so far: replace the longest + p = kavl_erase(sp, &root, q->p[0], 0); + if (p) { + p->di = (uint64_t)d<<32 | (id++); + p->pre = n_out - 1; + p->hash = r->hash + __ac_Wang_hash(ai->v); + p->is_0 = r->is_0; + /** //path + if (ai->rank > 0) p->is_0 = 0; + **/ + kavl_insert(sp, &root, p, 0); + ks_heapdown_sp(0, q->k, q->p); + } else { + fprintf(stderr, "Warning: logical bug in gfa_shortest_k(): q->k=%d,q->p[0]->{d,i}={%d,%d},d=%d,src=%u,max_dist=%d,n_dst=%d\n", q->k, (int32_t)(q->p[0]->di>>32), (int32_t)q->p[0]->di, d, src, max_dist, n_dst); + km_destroy(km); + return 0; + } + } // else: the path is longer than all the existing paths ended at ai->w + } + } + + kfree(km, dst_group); + kfree(km, dst_done); + kh_destroy(sp, h); + mg_idx_hfree(h_seeds); + kfree(km, seeds); + kfree(km, mini.a); + // NB: AVL nodes are not deallocated. When km==0, they are memory leaks. 
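    // The sp_node_t records above come from the per-call kalloc pool `km`
    // (km_init2(km0, 0x4000) near the top of this function), so the km_destroy(km)
    // below releases them wholesale; the leak noted above only applies when the
    // pool pointer is 0 and allocations fall through to libc malloc. A minimal
    // sketch of that pool-per-task pattern, assuming the kalloc API added by this
    // patch (km_init/kmalloc/kfree/km_destroy):
    //   void *km = km_init();                          // thread-/call-local pool
    //   sp_node_t *p = (sp_node_t*)kmalloc(km, sizeof(sp_node_t));
    //   /* ... use p; no per-object kfree(km, p) is required ... */
    //   km_destroy(km);                                // frees every block in the pool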
+ + for (i = 0, n_found = 0; i < n_dst; ++i) + if (dst[i].n_path > 0) ++n_found;///n_path might be larger than 16 + ///we can assume n_pathv = NULL for now + if (n_found > 0 && n_pathv) { // then generate the backtrack array + int32_t n, *trans; + KCALLOC(km, trans, n_out); // used to squeeze unused elements in out[] + for (i = 0; i < n_dst; ++i) { // mark dst vertices with a target distance + mg_path_dst_t *t = &dst[i]; + if (t->n_path > 0 && t->target_dist >= 0 && t->path_end >= 0) + trans[(int32_t)out[t->path_end]->di] = 1; + } + for (i = 0; i < n_out; ++i) { // mark dst vertices without a target distance + k = kh_get(sp2, h2, out[i]->v); + if (k != kh_end(h2)) { // TODO: check if this is correct! + int32_t off = kh_val(h2, k)>>32, cnt = (int32_t)kh_val(h2, k); + for (j = off; j < off + cnt; ++j) + if (dst[j].target_dist < 0) + trans[i] = 1; + } + } + for (i = n_out - 1; i >= 0; --i) // mark all predecessors + if (trans[i] && out[i]->pre >= 0) + trans[out[i]->pre] = 1; + for (i = n = 0; i < n_out; ++i) // generate coordinate translations + if (trans[i]) trans[i] = n++; + else trans[i] = -1; + + *n_pathv = n; + KMALLOC(km0, ret, n); + for (i = 0; i < n_out; ++i) { // generate the backtrack array + mg_pathv_t *p; + if (trans[i] < 0) continue; + p = &ret[trans[i]]; + p->v = out[i]->v, p->d = out[i]->di >> 32; + p->pre = out[i]->pre < 0? out[i]->pre : trans[out[i]->pre]; + } + for (i = 0; i < n_dst; ++i) // translate "path_end" + if (dst[i].path_end >= 0) + dst[i].path_end = trans[dst[i].path_end]; + } + + km_destroy(km); + return ret; +} + + +int32_t mg_gchain1_dp(void *km, const ma_ug_t *ug, int32_t *n_lc_, mg_lchain_t *lc, int32_t qlen, int32_t max_dist_g, int32_t max_dist_q, int32_t bw, int32_t max_skip, + int32_t ref_bonus, float chn_pen_gap, float chn_pen_skip, float mask_level, int32_t max_gc_seq_ext, const char *qseq, const mg128_t *an, uint64_t **u_) +{ + int32_t i, j, k, m_dst, n_dst, n_ext, n_u, n_v, n_lc = *n_lc_; + int32_t *f, *v, *t; + int64_t *p; + uint64_t *u; + mg_path_dst_t *dst; + gc_frag_t *a; + mg_lchain_t *swap; + char *qs; + asg_t *g = ug->g; + + *u_ = 0; + if (n_lc == 0) return 0; + + KMALLOC(km, a, n_lc); + ///n_lc how many linear chains + for (i = n_ext = 0; i < n_lc; ++i) { // a[] is a view of frag[]; for sorting + mg_lchain_t *r = &lc[i]; + gc_frag_t *ai = &a[i]; + int32_t is_isolated = 0, min_end_dist_g; + r->dist_pre = -1;///indicate parent in graph chain + min_end_dist_g = g->seq[r->v>>1].len - r->re;///r->v: ref_id|rev + if (r->rs < min_end_dist_g) min_end_dist_g = r->rs; + if (min_end_dist_g > max_dist_g) is_isolated = 1; // if too far from segment ends + else if (min_end_dist_g>>3 > r->score) is_isolated = 1; // if the lchain too small relative to distance to the segment ends + ai->srt = (uint32_t)is_isolated<<31 | r->qe; + ai->i = i; + if (!is_isolated) ++n_ext; + } + ///if the alignment is too far from segment ends, which means it cannot contribute to graph alignment + if (n_ext < 2) { // no graph chaining needed; early return + kfree(km, a); + KMALLOC(km, u, n_lc); + for (i = 0; i < n_lc; ++i) + u[i] = (uint64_t)lc[i].score<<32 | 1; + *u_ = u; + return n_lc; + } + radix_sort_gc(a, a + n_lc);///sort by: is_isolated(1):qe + + KMALLOC(km, v, n_lc); + KMALLOC(km, f, n_ext); + KMALLOC(km, p, n_ext); + KCALLOC(km, t, n_ext); + // KMALLOC(km, qs, max_dist_q + 1);//for + + m_dst = n_dst = 0, dst = 0; + ///n_ext is number of linear chains that might be included in graph chains + ///sorted by the positions in query; sorted by qe of each chain + for (i = 0; i < 
n_ext; ++i) { // core loop + gc_frag_t *ai = &a[i]; + mg_lchain_t *li = &lc[ai->i];///linear chain; sorted by qe, i.e. end position in query + ///note segi is query id, instead of ref id; it is not such useful + /** + * a[].x: idx_in_minimizer_arr(32)r_pos(32) + * a[].y: weight(8)query_id(8)flag(8)span(8)q_pos(32) + **/ + { // collect end points potentially reachable from _i_ + int32_t x = li->qs + bw, n_skip = 0; + if (x > qlen) x = qlen; + ///collect alignments that can be reachable from the left side + ///that is, a[x].qe <= x + x = find_max(i, a, x); + n_dst = 0; + for (j = x; j >= 0; --j) { // collect potential destination vertices + gc_frag_t *aj = &a[j]; + //potential chains that might be overlapped with the left side of li + mg_lchain_t *lj = &lc[aj->i]; + mg_path_dst_t *q; + int32_t target_dist, dq; + ///lj->qs >= li->qs && lj->qe <= li->qs, so lj is contained + if (lj->qs >= li->qs) continue; // lj is contained in li on the query coordinate + ///lj->qs************lj->qe + /// li->qs************li->qe + if (lj->qe > li->qs) { // test overlap on the query + int o = lj->qe - li->qs; + ///mask_level = 0.5, if overlap is too long + ///note here is the overlap in query, so too long overlaps might be wrong + if (o > (lj->qe - lj->qs) * mask_level || o > (li->qe - li->qs) * mask_level) + continue; + } + dq = li->qs - lj->qe;///dq might be smaller than 0 + if (dq > max_dist_q) break; // if query gap too large, stop + ///The above filter chains like: + ///1. lj is contained in li + ///2. the overlap between li and lj is too large + ///3. li and lj are too far + /** + lj->qs************lj->qe + li->qs************li->qe + + *****lj->rs************lj->re***** + ****li->rs************li->re** + **/ + ///above we have checked gap/overlap in query + ///then we need to check gap/overlap in reference + if (li->v != lj->v) { // the two linear chains are on two different refs + // minimal graph gap; the real graph gap might be larger + int32_t min_dist = li->rs + (g->seq[lj->v>>1].len - lj->re); + if (min_dist > max_dist_g) continue; // graph gap too large + //note here min_dist - (lj->qs - li->qe) > bw is important + //min_dist is always larger than 0, (lj->qs - li->qe) might be negative + if (min_dist - bw > li->qs - lj->qe) continue; ///note seg* is the query id, instead of ref id + target_dist = mg_target_dist(g, lj, li); + if (target_dist < 0) continue; // this may happen if the query overlap is far too large + } else if (lj->rs >= li->rs || lj->re >= li->re) { // not colinear + continue; + } else {///li->v == lj->v and colinear; at the same ref id + /** + case 1: lj->qs************lj->qe + li->qs************li->qe + case 2: lj->qs************lj->qe + li->qs************li->qe + + *****lj->rs************lj->re***** + ****li->rs************li->re** + * **/ + ///w is indel, w is always positive + int32_t dr = li->rs - lj->re, w = dr > dq? dr - dq : dq - dr; + ///note that l*->v is the ref id, while seg* is the query id + if (w > bw) continue; // test bandwidth + if (dr > max_dist_g || dr < -max_dist_g) continue; + if (lj->re > li->rs) { // test overlap on the graph segment + int o = lj->re - li->rs; + if (o > (lj->re - lj->rs) * mask_level || o > (li->re - li->rs) * mask_level) + continue; + } + target_dist = mg_target_dist(g, lj, li); + } + if (n_dst == m_dst) KEXPAND(km, dst, m_dst); // TODO: watch out the quadratic behavior! 
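                // The quadratic worry above is the j-loop itself: for every chain i it may
                // rescan all earlier chains when few candidates are filtered out. KEXPAND()
                // only grows dst[] geometrically from the kalloc pool; a minimal sketch of
                // what that macro is assumed to expand to (kalloc-style, illustrative only):
                //   if (n_dst == m_dst) {
                //       m_dst = m_dst >= 4 ? m_dst + (m_dst >> 1) : 16;   // ~1.5x growth
                //       dst = (mg_path_dst_t*)krealloc(km, dst, m_dst * sizeof(mg_path_dst_t));
                //   }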
+ q = &dst[n_dst++];///q saves information for i->j + memset(q, 0, sizeof(mg_path_dst_t)); + ///note v is (rid:rev), so two alignment chains might be at the same ref id with different directions + q->inner = (li->v == lj->v); + q->v = lj->v^1;///must be v^1 instead of v + q->meta = j; + ///lj->qs************lj->qe + /// li->qs************li->qe + q->qlen = li->qs - lj->qe;///might be negative + q->target_dist = target_dist;///cannot understand the target_dist + q->target_hash = 0; + q->check_hash = 0; + if (t[j] == i) {///this pre-cut is weird; attention + if (++n_skip > max_skip) + break; + } + if (p[j] >= 0) t[p[j]] = i; + } + } + ///the above saves all linear chains that might be reached to the left side of chain i + ///all those chains are saved to dst + { // confirm reach-ability + int32_t k; + // test reach-ability without sequences + /** + *****lj->rs************lj->re***** + ****li->rs************li->re*** + (g->seg[li->v>>1].len - li->rs) ----> is like the node length in string graph + **/ + mg_shortest_k(km, g, li->v^1, n_dst, dst, max_dist_g + (g->seq[li->v>>1].len - li->rs), MG_MAX_SHORT_K, 0, 0, 1, 0); + // remove unreachable destinations + for (j = k = 0; j < n_dst; ++j) { + mg_path_dst_t *dj = &dst[j]; + int32_t sc; + if (dj->n_path == 0) continue; // unreachable + sc = cal_sc(dj, li, lc, an, a, f, bw, ref_bonus, chn_pen_gap); + if (sc == INT32_MIN) continue; // out of band + if (sc + li->score < 0) continue; // negative score and too low + dst[k] = dst[j]; + dst[k++].srt_key = INT64_MAX/2 - (int64_t)sc; // sort in the descending order + } + n_dst = k; + if (n_dst > 0) { + radix_sort_dst(dst, dst + n_dst); + // discard weaker chains if the best score is much larger (assuming base-level heuristic won't lift it to the top chain) + // dst[0].srt_key has the largest score + for (j = 1; j < n_dst; ++j) + if (dst[j].srt_key - dst[0].srt_key > li->score)//discard chains with too small weight + break; + n_dst = j; + if (n_dst > max_gc_seq_ext) n_dst = max_gc_seq_ext; // discard weaker chains + } + } + if (n_dst > 0) { // find paths with sequences + int32_t min_qs = li->qs; + for (j = 0; j < n_dst; ++j) { + const mg_lchain_t *lj; + assert(dst[j].n_path > 0); + ///a[]->srt = (uint32_t)is_isolated<<31 | r->qe; + ///a[]->i = i; + lj = &lc[a[dst[j].meta].i]; + if (lj->qe < min_qs) min_qs = lj->qe; + } + ///qs keeps the sequence at the gap between the li and lj in query + memcpy(qs, &qseq[min_qs], li->qs - min_qs); + mg_shortest_k(km, g, li->v^1, n_dst, dst, max_dist_g + (g->seg[li->v>>1].len - li->rs), MG_MAX_SHORT_K, li->qs - min_qs, qs, 1, 0); + if (mg_dbg_flag & MG_DBG_GC1) fprintf(stderr, "[src:%d] q_intv=[%d,%d), src=%c%s[%d], n_dst=%d, max_dist=%d, min_qs=%d, lc_score=%d\n", ai->i, li->qs, li->qe, "><"[(li->v&1)^1], g->seg[li->v>>1].name, li->v^1, n_dst, max_dist_g + (g->seg[li->v>>1].len - li->rs), min_qs, li->score); + } + { // DP + int32_t max_f = li->score, max_j = -1, max_d = -1, max_inner = 0; + uint32_t max_hash = 0; + for (j = 0; j < n_dst; ++j) { + mg_path_dst_t *dj = &dst[j]; + int32_t sc; + sc = cal_sc(dj, li, lc, an, a, f, bw, ref_bonus, chn_pen_gap); + if (sc == INT32_MIN) continue; + if (mg_dbg_flag & MG_DBG_GC1) { + mg_lchain_t *lj = &lc[a[dj->meta].i]; + fprintf(stderr, " [dst:%d] dst=%c%s[%d], n_path=%d, target=%d, opt_dist=%d, score=%d, q_intv=[%d,%d), g_intv=[%d,%d)\n", dj->meta, "><"[dj->v&1], g->seg[dj->v>>1].name, dj->v, dj->n_path, dj->target_dist - g->seg[li->v>>1].len, dj->dist - g->seg[li->v>>1].len, sc, lj->qs, lj->qe, lj->rs, lj->re); + } + if (sc > 
max_f) max_f = sc, max_j = dj->meta, max_d = dj->dist, max_hash = dj->hash, max_inner = dj->inner; + } + f[i] = max_f, p[i] = max_j; + li->dist_pre = max_d; + li->hash_pre = max_hash; + li->inner_pre = max_inner; + v[i] = max_j >= 0 && v[max_j] > max_f? v[max_j] : max_f; + if (mg_dbg_flag & MG_DBG_GC1) fprintf(stderr, " [opt:%d] opt=%d, max_f=%d\n", ai->i, max_j, max_f); + } + } + kfree(km, dst); + kfree(km, qs); + if (mg_dbg_flag & MG_DBG_GC1) { + int32_t mmax_f = 0, mmax_i = -1; + for (i = 0; i < n_ext; ++i) if (f[i] > mmax_f) mmax_f = f[i], mmax_i = i; + i = mmax_i; while (i >= 0) { fprintf(stderr, "[best] i=%d, seg=%s, max_f=%d, chn_pen_gap=%f\n", a[i].i, g->seg[lc[a[i].i].v>>1].name, f[i], chn_pen_gap); i = p[i]; } + } + ///n_ext: number of useful chains + ///n_lc - n_ext: number of isoated chains + u = mg_chain_backtrack(km, n_ext, f, p, v, t, 0, 0, n_lc - n_ext, &n_u, &n_v); + kfree(km, f); kfree(km, p); kfree(km, t); + ///store the extra isoated chains + for (i = 0; i < n_lc - n_ext; ++i) { + u[n_u++] = (uint64_t)lc[a[n_ext + i].i].score << 32 | 1; + v[n_v++] = n_ext + i; + } + + ///reorganize lc; + KMALLOC(km, swap, n_v); + for (i = 0, k = 0; i < n_u; ++i) { + int32_t k0 = k, ni = (int32_t)u[i]; + for (j = 0; j < ni; ++j) + swap[k++] = lc[a[v[k0 + (ni - j - 1)]].i]; + } + assert(k == n_v); + memcpy(lc, swap, n_v * sizeof(mg_lchain_t)); + *n_lc_ = n_v; + *u_ = u; + + kfree(km, a); + kfree(km, swap); + kfree(km, v); + return n_u; +} + + +void mg_map_frag(const void *ha_flt_tab, const ha_pt_t *ha_idx, const ma_ug_t *ug, const uint32_t qid, const int qlen, const char *qseq, ha_mzl_v *mz, +st_mt_t *sp, mg_tbuf_t *b, int32_t w, int32_t k, int32_t hpc, int32_t mz_sd, int32_t mz_rewin, const mg_idxopt_t *opt) +{ + mg128_t *a = NULL; + int64_t n_a; + int32_t *mini_pos; + int i, rep_len, n_mini_pos, n_lc, max_chain_gap_qry, max_chain_gap_ref; + uint64_t *u; + mg_lchain_t *lc; + mz->n = 0; + mz2_ha_sketch(qseq, qlen, w, k, 0, hpc, mz, ha_flt_tab, mz_sd, NULL, NULL, NULL, -1, -1, -1, sp, mz_rewin, 1); + ///a[]->y: weight(8)seg_id(8)flag(8)span(8)pos(32);--->query + ///a[]->x: rid(31)rev(1)rpos(33);--->reference + a = collect_seed_hits(b->km, opt, opt->hap_n, ha_flt_tab, ha_idx, ug, mz, &n_a, &rep_len, &n_mini_pos, &mini_pos); + if (opt->max_gap_pre > 0 && opt->max_gap_pre * 2 < opt->max_gap) n_a = flt_anchors(n_a, a, opt->max_gap_pre); + max_chain_gap_qry = max_chain_gap_ref = opt->max_gap; + if (n_a == 0) { + if(a) kfree(b->km, a); + a = 0, n_lc = 0, u = 0; + } else { + a = mg_lchain_dp(max_chain_gap_ref, max_chain_gap_qry, opt->bw, opt->max_lc_skip, opt->max_lc_iter, + opt->min_lc_cnt, opt->min_lc_score, opt->chn_pen_gap, n_a, a, &n_lc, &u, b->km); + } + + if (n_lc) {///n_lc is how many chain we found + lc = mg_lchain_gen(b->km, qlen, n_lc, u, a); + for (i = 0; i < n_lc; ++i)///update a[] since ref_id|rev has already been saved to lc[].v + mg_update_anchors(lc[i].cnt, &a[lc[i].off], n_mini_pos, mini_pos);///update a[].x + } else lc = 0; + kfree(b->km, mini_pos); kfree(b->km, u); + + /** + * up to here, a[] has been changed + * a[].x: idx_in_minimizer_arr(32)r_pos(32) + * a[].y: weight(8)query_id(8)flag(8)span(8)q_pos(32) + **/ + +} + static void worker_for_ul_alignment(void *data, long i, int tid) // callback for kt_for() { utepdat_t *s = (utepdat_t*)data; - ma_ov_buf_t *b = s->mo[tid]; - uint64_t len = s->len[i]; - char *r = s->seq[i]; - + // ma_ov_buf_t *b = s->mo[tid]; + mg_map_frag(s->ha_flt_tab, s->ha_idx, s->ug, s->id+i, s->len[i], s->seq[i], &(s->mzs[tid]), &(s->sps[tid]), 
s->buf[tid], s->opt->w, s->opt->k, + s->opt->is_HPC, asm_opt.mz_sample_dist, asm_opt.mz_rewin, s->opt); } @@ -110,7 +1289,7 @@ static void *worker_ul_pipeline(void *data, int step, void *in) // callback for uint64_t l; utepdat_t *s; CALLOC(s, 1); - s->ha_flt_tab = p->ha_flt_tab; s->ha_idx = p->ha_idx; s->id = p->total_pair; + s->ha_flt_tab = p->ha_flt_tab; s->ha_idx = p->ha_idx; s->id = p->total_pair; s->opt = p->opt; s->ug = p->ug; while ((ret = kseq_read(p->ks)) >= 0) { if (p->ks->seq.l < (uint64_t)p->opt->k) continue; @@ -131,19 +1310,25 @@ static void *worker_ul_pipeline(void *data, int step, void *in) // callback for else return s; } else if (step == 1) { // step 2: alignment + uint64_t i; utepdat_t *s = (utepdat_t*)in; - s->mo = init_ma_ov_buf_t_arr(p->n_thread); + CALLOC(s->mzs, p->n_thread); + CALLOC(s->sps, p->n_thread); + + s->buf = (mg_tbuf_t**)calloc(p->n_thread, sizeof(mg_tbuf_t*)); + for (i = 0; i < p->n_thread; ++i) s->buf[i] = mg_tbuf_init(); /** CALLOC(s->pos, s->n); **/ - int i; kt_for(p->n_thread, worker_for_ul_alignment, s, s->n); - for (i = 0; i < s->n; ++i) { + for (i = 0; i < (uint64_t)s->n; ++i) { free(s->seq[i]); p->total_base += s->len[i]; } free(s->seq); free(s->len); - destory_ma_ov_buf_t_arr(&(s->mo), p->n_thread); + + for (i = 0; i < p->n_thread; ++i) mg_tbuf_destroy(s->buf[i]); + free(s->buf); return s; } else if (step == 2) { // step 3: dump @@ -179,7 +1364,7 @@ int alignment_ul_pipeline(uldat_t* sl, const enzyme *fn) return 1; } -int ul_align(mg_idxopt_t *opt, const enzyme *fn, void *ha_flt_tab, ha_pt_t *ha_idx) +int ul_align(mg_idxopt_t *opt, const enzyme *fn, void *ha_flt_tab, ha_pt_t *ha_idx, ma_ug_t *ug) { uldat_t sl; memset(&sl, 0, sizeof(sl)); sl.ha_flt_tab = ha_flt_tab; @@ -187,6 +1372,7 @@ int ul_align(mg_idxopt_t *opt, const enzyme *fn, void *ha_flt_tab, ha_pt_t *ha_i sl.opt = opt; sl.chunk_size = 20000000; sl.n_thread = asm_opt.thread_num; + sl.ug = ug; alignment_ul_pipeline(&sl, fn); return 1; } @@ -196,7 +1382,7 @@ void ul_resolve(ma_ug_t *ug, int hap_n) mg_idxopt_t opt; init_mg_opt(&opt, 0, 19, 10, hap_n); uidx_build(ug, &opt); - ul_align(&opt, asm_opt.ar, ha_flt_tab, ha_idx); + ul_align(&opt, asm_opt.ar, ha_flt_tab, ha_idx, ug); diff --git a/kalloc.cpp b/kalloc.cpp new file mode 100644 index 0000000..8499552 --- /dev/null +++ b/kalloc.cpp @@ -0,0 +1,205 @@ +#include +#include +#include +#include "kalloc.h" + +/* In kalloc, a *core* is a large chunk of contiguous memory. Each core is + * associated with a master header, which keeps the size of the current core + * and the pointer to next core. Kalloc allocates small *blocks* of memory from + * the cores and organizes free memory blocks in a circular single-linked list. + * + * In the following diagram, "@" stands for the header of a free block (of type + * header_t), "#" for the header of an allocated block (of type size_t), "-" + * for free memory, and "+" for allocated memory. + * + * master This region is core 1. master This region is core 2. 
+ * | | + * *@-------#++++++#++++++++++++@-------- *@----------#++++++++++++#+++++++@------------ + * | | | | + * p=p->ptr->ptr->ptr->ptr p->ptr p->ptr->ptr p->ptr->ptr->ptr + */ +typedef struct header_t { + size_t size; + struct header_t *ptr; +} header_t; + +typedef struct { + void *par; + size_t min_core_size; + header_t base, *loop_head, *core_head; /* base is a zero-sized block always kept in the loop */ +} kmem_t; + +static void panic(const char *s) +{ + fprintf(stderr, "%s\n", s); + abort(); +} + +void *km_init2(void *km_par, size_t min_core_size) +{ + kmem_t *km; + km = (kmem_t*)kcalloc(km_par, 1, sizeof(kmem_t)); + km->par = km_par; + km->min_core_size = min_core_size > 0? min_core_size : 0x80000; + return (void*)km; +} + +void *km_init(void) { return km_init2(0, 0); } + +void km_destroy(void *_km) +{ + kmem_t *km = (kmem_t*)_km; + void *km_par; + header_t *p, *q; + if (km == NULL) return; + km_par = km->par; + for (p = km->core_head; p != NULL;) { + q = p->ptr; + kfree(km_par, p); + p = q; + } + kfree(km_par, km); +} + +static header_t *morecore(kmem_t *km, size_t nu) +{ + header_t *q; + size_t bytes, *p; + nu = (nu + 1 + (km->min_core_size - 1)) / km->min_core_size * km->min_core_size; /* the first +1 for core header */ + bytes = nu * sizeof(header_t); + q = (header_t*)kmalloc(km->par, bytes); + if (!q) panic("[morecore] insufficient memory"); + q->ptr = km->core_head, q->size = nu, km->core_head = q; + p = (size_t*)(q + 1); + *p = nu - 1; /* the size of the free block; -1 because the first unit is used for the core header */ + kfree(km, p + 1); /* initialize the new "core"; NB: the core header is not looped. */ + return km->loop_head; +} + +void kfree(void *_km, void *ap) /* kfree() also adds a new core to the circular list */ +{ + header_t *p, *q; + kmem_t *km = (kmem_t*)_km; + + if (!ap) return; + if (km == NULL) { + free(ap); + return; + } + p = (header_t*)((size_t*)ap - 1); + p->size = *((size_t*)ap - 1); + /* Find the pointer that points to the block to be freed. 
The following loop can stop on two conditions: + * + * a) "p>q && pptr": @------#++++++++#+++++++@------- @---------------#+++++++@------- + * (can also be in | | | -> | | + * two cores) q p q->ptr q q->ptr + * + * @-------- #+++++++++@-------- @-------- @------------------ + * | | | -> | | + * q p q->ptr q q->ptr + * + * b) "q>=q->ptr && (p>q || pptr)": @-------#+++++ @--------#+++++++ @-------#+++++ @---------------- + * | | | -> | | + * q->ptr q p q->ptr q + * + * #+++++++@----- #++++++++@------- @------------- #++++++++@------- + * | | | -> | | + * p q->ptr q q->ptr q + */ + for (q = km->loop_head; !(p > q && p < q->ptr); q = q->ptr) + if (q >= q->ptr && (p > q || p < q->ptr)) break; + if (p + p->size == q->ptr) { /* two adjacent blocks, merge p and q->ptr (the 2nd and 4th cases) */ + p->size += q->ptr->size; + p->ptr = q->ptr->ptr; + } else if (p + p->size > q->ptr && q->ptr >= p) { + panic("[kfree] The end of the allocated block enters a free block."); + } else p->ptr = q->ptr; /* backup q->ptr */ + + if (q + q->size == p) { /* two adjacent blocks, merge q and p (the other two cases) */ + q->size += p->size; + q->ptr = p->ptr; + km->loop_head = q; + } else if (q + q->size > p && p >= q) { + panic("[kfree] The end of a free block enters the allocated block."); + } else km->loop_head = p, q->ptr = p; /* in two cores, cannot be merged; create a new block in the list */ +} + +void *kmalloc(void *_km, size_t n_bytes) +{ + kmem_t *km = (kmem_t*)_km; + size_t n_units; + header_t *p, *q; + + if (n_bytes == 0) return 0; + if (km == NULL) return malloc(n_bytes); + n_units = (n_bytes + sizeof(size_t) + sizeof(header_t) - 1) / sizeof(header_t); /* header+n_bytes requires at least this number of units */ + + if (!(q = km->loop_head)) /* the first time when kmalloc() is called, intialize it */ + q = km->loop_head = km->base.ptr = &km->base; + for (p = q->ptr;; q = p, p = p->ptr) { /* search for a suitable block */ + if (p->size >= n_units) { /* p->size if the size of current block. This line means the current block is large enough. */ + if (p->size == n_units) q->ptr = p->ptr; /* no need to split the block */ + else { /* split the block. NB: memory is allocated at the end of the block! 
*/ + p->size -= n_units; /* reduce the size of the free block */ + p += p->size; /* p points to the allocated block */ + *(size_t*)p = n_units; /* set the size */ + } + km->loop_head = q; /* set the end of chain */ + return (size_t*)p + 1; + } + if (p == km->loop_head) { /* then ask for more "cores" */ + if ((p = morecore(km, n_units)) == 0) return 0; + } + } +} + +void *kcalloc(void *_km, size_t count, size_t size) +{ + kmem_t *km = (kmem_t*)_km; + void *p; + if (size == 0 || count == 0) return 0; + if (km == NULL) return calloc(count, size); + p = kmalloc(km, count * size); + memset(p, 0, count * size); + return p; +} + +void *krealloc(void *_km, void *ap, size_t n_bytes) // TODO: this can be made more efficient in principle +{ + kmem_t *km = (kmem_t*)_km; + size_t cap, *p, *q; + + if (n_bytes == 0) { + kfree(km, ap); return 0; + } + if (km == NULL) return realloc(ap, n_bytes); + if (ap == NULL) return kmalloc(km, n_bytes); + p = (size_t*)ap - 1; + cap = (*p) * sizeof(header_t) - sizeof(size_t); + if (cap >= n_bytes) return ap; /* TODO: this prevents shrinking */ + q = (size_t*)kmalloc(km, n_bytes); + memcpy(q, ap, cap); + kfree(km, ap); + return q; +} + +void km_stat(const void *_km, km_stat_t *s) +{ + kmem_t *km = (kmem_t*)_km; + header_t *p; + memset(s, 0, sizeof(km_stat_t)); + if (km == NULL || km->loop_head == NULL) return; + for (p = km->loop_head;; p = p->ptr) { + s->available += p->size * sizeof(header_t); + if (p->size != 0) ++s->n_blocks; /* &kmem_t::base is always one of the cores. It is zero-sized. */ + if (p->ptr > p && p + p->size > p->ptr) + panic("[km_stat] The end of a free block enters another free block."); + if (p->ptr == km->loop_head) break; + } + for (p = km->core_head; p != NULL; p = p->ptr) { + size_t size = p->size * sizeof(header_t); + ++s->n_cores; + s->capacity += size; + s->largest = s->largest > size? s->largest : size; + } +} diff --git a/kalloc.h b/kalloc.h new file mode 100644 index 0000000..5ef432a --- /dev/null +++ b/kalloc.h @@ -0,0 +1,77 @@ +#ifndef _KALLOC_H_ +#define _KALLOC_H_ + +#include /* for size_t */ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + size_t capacity, available, n_blocks, n_cores, largest; +} km_stat_t; + +void *kmalloc(void *km, size_t size); +void *krealloc(void *km, void *ptr, size_t size); +void *kcalloc(void *km, size_t count, size_t size); +void kfree(void *km, void *ptr); + +void *km_init(void); +void *km_init2(void *km_par, size_t min_core_size); +void km_destroy(void *km); +void km_stat(const void *_km, km_stat_t *s); + +#ifdef __cplusplus +} +#endif + +#define KMALLOC(km, ptr, len) ((ptr) = (__typeof__(ptr))kmalloc((km), (len) * sizeof(*(ptr)))) +#define KCALLOC(km, ptr, len) ((ptr) = (__typeof__(ptr))kcalloc((km), (len), sizeof(*(ptr)))) +#define KREALLOC(km, ptr, len) ((ptr) = (__typeof__(ptr))krealloc((km), (ptr), (len) * sizeof(*(ptr)))) + +#define KEXPAND(km, a, m) do { \ + (m) = (m) >= 4? 
(m) + ((m)>>1) : 16; \ + KREALLOC((km), (a), (m)); \ + } while (0) + +#ifndef klib_unused +#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) +#define klib_unused __attribute__ ((__unused__)) +#else +#define klib_unused +#endif +#endif /* klib_unused */ + +// adapted from klist.h +#define KALLOC_POOL_INIT2(SCOPE, name, kmptype_t) \ + typedef struct { \ + size_t cnt, n, max; \ + kmptype_t **buf; \ + void *km; \ + } kmp_##name##_t; \ + SCOPE kmp_##name##_t *kmp_init_##name(void *km) { \ + kmp_##name##_t *mp; \ + KCALLOC(km, mp, 1); \ + mp->km = km; \ + return mp; \ + } \ + SCOPE void kmp_destroy_##name(kmp_##name##_t *mp) { \ + size_t k; \ + for (k = 0; k < mp->n; ++k) kfree(mp->km, mp->buf[k]); \ + kfree(mp->km, mp->buf); kfree(mp->km, mp); \ + } \ + SCOPE kmptype_t *kmp_alloc_##name(kmp_##name##_t *mp) { \ + ++mp->cnt; \ + if (mp->n == 0) return (kmptype_t*)kcalloc(mp->km, 1, sizeof(kmptype_t)); \ + return mp->buf[--mp->n]; \ + } \ + SCOPE void kmp_free_##name(kmp_##name##_t *mp, kmptype_t *p) { \ + --mp->cnt; \ + if (mp->n == mp->max) KEXPAND(mp->km, mp->buf, mp->max); \ + mp->buf[mp->n++] = p; \ + } + +#define KALLOC_POOL_INIT(name, kmptype_t) \ + KALLOC_POOL_INIT2(static inline klib_unused, name, kmptype_t) + +#endif diff --git a/kavl.h b/kavl.h new file mode 100644 index 0000000..e0a8e1b --- /dev/null +++ b/kavl.h @@ -0,0 +1,414 @@ +/* The MIT License + + Copyright (c) 2018 by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* An example: + +#include +#include +#include +#include "kavl.h" + +struct my_node { + char key; + KAVL_HEAD(struct my_node) head; +}; +#define my_cmp(p, q) (((q)->key < (p)->key) - ((p)->key < (q)->key)) +KAVL_INIT(my, struct my_node, head, my_cmp) + +int main(void) { + const char *str = "MNOLKQOPHIA"; // from wiki, except a duplicate + struct my_node *root = 0; + int i, l = strlen(str); + for (i = 0; i < l; ++i) { // insert in the input order + struct my_node *q, *p = malloc(sizeof(*p)); + p->key = str[i]; + q = kavl_insert(my, &root, p, 0); + if (p != q) free(p); // if already present, free + } + kavl_itr_t(my) itr; + kavl_itr_first(my, root, &itr); // place at first + do { // traverse + const struct my_node *p = kavl_at(&itr); + putchar(p->key); + free((void*)p); // free node + } while (kavl_itr_next(my, &itr)); + putchar('\n'); + return 0; +} +*/ + +#ifndef KAVL_H +#define KAVL_H + +#ifdef __STRICT_ANSI__ +#define inline __inline__ +#endif + +#define KAVL_MAX_DEPTH 64 + +#define kavl_size(head, p) ((p)? 
(p)->head.size : 0) +#define kavl_size_child(head, q, i) ((q)->head.p[(i)]? (q)->head.p[(i)]->head.size : 0) + +#define KAVL_HEAD(__type) \ + struct { \ + __type *p[2]; \ + signed char balance; /* balance factor */ \ + unsigned size; /* #elements in subtree */ \ + } + +#define __KAVL_FIND(suf, __scope, __type, __head, __cmp) \ + __scope __type *kavl_find_##suf(const __type *root, const __type *x, unsigned *cnt_) { \ + const __type *p = root; \ + unsigned cnt = 0; \ + while (p != 0) { \ + int cmp; \ + cmp = __cmp(x, p); \ + if (cmp >= 0) cnt += kavl_size_child(__head, p, 0) + 1; \ + if (cmp < 0) p = p->__head.p[0]; \ + else if (cmp > 0) p = p->__head.p[1]; \ + else break; \ + } \ + if (cnt_) *cnt_ = cnt; \ + return (__type*)p; \ + } \ + __scope __type *kavl_interval_##suf(const __type *root, const __type *x, __type **lower, __type **upper) { \ + const __type *p = root, *l = 0, *u = 0; \ + while (p != 0) { \ + int cmp; \ + cmp = __cmp(x, p); \ + if (cmp < 0) u = p, p = p->__head.p[0]; \ + else if (cmp > 0) l = p, p = p->__head.p[1]; \ + else { l = u = p; break; } \ + } \ + if (lower) *lower = (__type*)l; \ + if (upper) *upper = (__type*)u; \ + return (__type*)p; \ + } + +#define __KAVL_ROTATE(suf, __type, __head) \ + /* one rotation: (a,(b,c)q)p => ((a,b)p,c)q */ \ + static inline __type *kavl_rotate1_##suf(__type *p, int dir) { /* dir=0 to left; dir=1 to right */ \ + int opp = 1 - dir; /* opposite direction */ \ + __type *q = p->__head.p[opp]; \ + unsigned size_p = p->__head.size; \ + p->__head.size -= q->__head.size - kavl_size_child(__head, q, dir); \ + q->__head.size = size_p; \ + p->__head.p[opp] = q->__head.p[dir]; \ + q->__head.p[dir] = p; \ + return q; \ + } \ + /* two consecutive rotations: (a,((b,c)r,d)q)p => ((a,b)p,(c,d)q)r */ \ + static inline __type *kavl_rotate2_##suf(__type *p, int dir) { \ + int b1, opp = 1 - dir; \ + __type *q = p->__head.p[opp], *r = q->__head.p[dir]; \ + unsigned size_x_dir = kavl_size_child(__head, r, dir); \ + r->__head.size = p->__head.size; \ + p->__head.size -= q->__head.size - size_x_dir; \ + q->__head.size -= size_x_dir + 1; \ + p->__head.p[opp] = r->__head.p[dir]; \ + r->__head.p[dir] = p; \ + q->__head.p[dir] = r->__head.p[opp]; \ + r->__head.p[opp] = q; \ + b1 = dir == 0? 
+1 : -1; \ + if (r->__head.balance == b1) q->__head.balance = 0, p->__head.balance = -b1; \ + else if (r->__head.balance == 0) q->__head.balance = p->__head.balance = 0; \ + else q->__head.balance = b1, p->__head.balance = 0; \ + r->__head.balance = 0; \ + return r; \ + } + +#define __KAVL_INSERT(suf, __scope, __type, __head, __cmp) \ + __scope __type *kavl_insert_##suf(__type **root_, __type *x, unsigned *cnt_) { \ + unsigned char stack[KAVL_MAX_DEPTH]; \ + __type *path[KAVL_MAX_DEPTH]; \ + __type *bp, *bq; \ + __type *p, *q, *r = 0; /* _r_ is potentially the new root */ \ + int i, which = 0, top, b1, path_len; \ + unsigned cnt = 0; \ + bp = *root_, bq = 0; \ + /* find the insertion location */ \ + for (p = bp, q = bq, top = path_len = 0; p; q = p, p = p->__head.p[which]) { \ + int cmp; \ + cmp = __cmp(x, p); \ + if (cmp >= 0) cnt += kavl_size_child(__head, p, 0) + 1; \ + if (cmp == 0) { \ + if (cnt_) *cnt_ = cnt; \ + return p; \ + } \ + if (p->__head.balance != 0) \ + bq = q, bp = p, top = 0; \ + stack[top++] = which = (cmp > 0); \ + path[path_len++] = p; \ + } \ + if (cnt_) *cnt_ = cnt; \ + x->__head.balance = 0, x->__head.size = 1, x->__head.p[0] = x->__head.p[1] = 0; \ + if (q == 0) *root_ = x; \ + else q->__head.p[which] = x; \ + if (bp == 0) return x; \ + for (i = 0; i < path_len; ++i) ++path[i]->__head.size; \ + for (p = bp, top = 0; p != x; p = p->__head.p[stack[top]], ++top) /* update balance factors */ \ + if (stack[top] == 0) --p->__head.balance; \ + else ++p->__head.balance; \ + if (bp->__head.balance > -2 && bp->__head.balance < 2) return x; /* no re-balance needed */ \ + /* re-balance */ \ + which = (bp->__head.balance < 0); \ + b1 = which == 0? +1 : -1; \ + q = bp->__head.p[1 - which]; \ + if (q->__head.balance == b1) { \ + r = kavl_rotate1_##suf(bp, which); \ + q->__head.balance = bp->__head.balance = 0; \ + } else r = kavl_rotate2_##suf(bp, which); \ + if (bq == 0) *root_ = r; \ + else bq->__head.p[bp != bq->__head.p[0]] = r; \ + return x; \ + } + +#define __KAVL_ERASE(suf, __scope, __type, __head, __cmp) \ + __scope __type *kavl_erase_##suf(__type **root_, const __type *x, unsigned *cnt_) { \ + __type *p, *path[KAVL_MAX_DEPTH], fake; \ + unsigned char dir[KAVL_MAX_DEPTH]; \ + int i, d = 0, cmp; \ + unsigned cnt = 0; \ + fake.__head.p[0] = *root_, fake.__head.p[1] = 0; \ + if (cnt_) *cnt_ = 0; \ + if (x) { \ + for (cmp = -1, p = &fake; cmp; cmp = __cmp(x, p)) { \ + int which = (cmp > 0); \ + if (cmp > 0) cnt += kavl_size_child(__head, p, 0) + 1; \ + dir[d] = which; \ + path[d++] = p; \ + p = p->__head.p[which]; \ + if (p == 0) { \ + if (cnt_) *cnt_ = 0; \ + return 0; \ + } \ + } \ + cnt += kavl_size_child(__head, p, 0) + 1; /* because p==x is not counted */ \ + } else { \ + for (p = &fake, cnt = 1; p; p = p->__head.p[0]) \ + dir[d] = 0, path[d++] = p; \ + p = path[--d]; \ + } \ + if (cnt_) *cnt_ = cnt; \ + for (i = 1; i < d; ++i) --path[i]->__head.size; \ + if (p->__head.p[1] == 0) { /* ((1,.)2,3)4 => (1,3)4; p=2 */ \ + path[d-1]->__head.p[dir[d-1]] = p->__head.p[0]; \ + } else { \ + __type *q = p->__head.p[1]; \ + if (q->__head.p[0] == 0) { /* ((1,2)3,4)5 => ((1)2,4)5; p=3 */ \ + q->__head.p[0] = p->__head.p[0]; \ + q->__head.balance = p->__head.balance; \ + path[d-1]->__head.p[dir[d-1]] = q; \ + path[d] = q, dir[d++] = 1; \ + q->__head.size = p->__head.size - 1; \ + } else { /* ((1,((.,2)3,4)5)6,7)8 => ((1,(2,4)5)3,7)8; p=6 */ \ + __type *r; \ + int e = d++; /* backup _d_ */\ + for (;;) { \ + dir[d] = 0; \ + path[d++] = q; \ + r = q->__head.p[0]; \ + if (r->__head.p[0] 
== 0) break; \ + q = r; \ + } \ + r->__head.p[0] = p->__head.p[0]; \ + q->__head.p[0] = r->__head.p[1]; \ + r->__head.p[1] = p->__head.p[1]; \ + r->__head.balance = p->__head.balance; \ + path[e-1]->__head.p[dir[e-1]] = r; \ + path[e] = r, dir[e] = 1; \ + for (i = e + 1; i < d; ++i) --path[i]->__head.size; \ + r->__head.size = p->__head.size - 1; \ + } \ + } \ + while (--d > 0) { \ + __type *q = path[d]; \ + int which, other, b1 = 1, b2 = 2; \ + which = dir[d], other = 1 - which; \ + if (which) b1 = -b1, b2 = -b2; \ + q->__head.balance += b1; \ + if (q->__head.balance == b1) break; \ + else if (q->__head.balance == b2) { \ + __type *r = q->__head.p[other]; \ + if (r->__head.balance == -b1) { \ + path[d-1]->__head.p[dir[d-1]] = kavl_rotate2_##suf(q, which); \ + } else { \ + path[d-1]->__head.p[dir[d-1]] = kavl_rotate1_##suf(q, which); \ + if (r->__head.balance == 0) { \ + r->__head.balance = -b1; \ + q->__head.balance = b1; \ + break; \ + } else r->__head.balance = q->__head.balance = 0; \ + } \ + } \ + } \ + *root_ = fake.__head.p[0]; \ + return p; \ + } + +#define kavl_free(__type, __head, __root, __free) do { \ + __type *_p, *_q; \ + for (_p = __root; _p; _p = _q) { \ + if (_p->__head.p[0] == 0) { \ + _q = _p->__head.p[1]; \ + __free(_p); \ + } else { \ + _q = _p->__head.p[0]; \ + _p->__head.p[0] = _q->__head.p[1]; \ + _q->__head.p[1] = _p; \ + } \ + } \ + } while (0) + +#define __KAVL_ITR(suf, __scope, __type, __head, __cmp) \ + struct kavl_itr_##suf { \ + const __type *stack[KAVL_MAX_DEPTH], **top; \ + }; \ + __scope void kavl_itr_first_##suf(const __type *root, struct kavl_itr_##suf *itr) { \ + const __type *p; \ + for (itr->top = itr->stack - 1, p = root; p; p = p->__head.p[0]) \ + *++itr->top = p; \ + } \ + __scope int kavl_itr_find_##suf(const __type *root, const __type *x, struct kavl_itr_##suf *itr) { \ + const __type *p = root; \ + itr->top = itr->stack - 1; \ + while (p != 0) { \ + int cmp; \ + *++itr->top = p; \ + cmp = __cmp(x, p); \ + if (cmp < 0) p = p->__head.p[0]; \ + else if (cmp > 0) p = p->__head.p[1]; \ + else break; \ + } \ + return p? 1 : 0; \ + } \ + __scope int kavl_itr_next_bidir_##suf(struct kavl_itr_##suf *itr, int dir) { \ + const __type *p; \ + if (itr->top < itr->stack) return 0; \ + dir = !!dir; \ + p = (*itr->top)->__head.p[dir]; \ + if (p) { /* go down */ \ + for (; p; p = p->__head.p[!dir]) \ + *++itr->top = p; \ + return 1; \ + } else { /* go up */ \ + do { \ + p = *itr->top--; \ + } while (itr->top >= itr->stack && p == (*itr->top)->__head.p[dir]); \ + return itr->top < itr->stack? 0 : 1; \ + } \ + } \ + +/** + * Insert a node to the tree + * + * @param suf name suffix used in KAVL_INIT() + * @param proot pointer to the root of the tree (in/out: root may change) + * @param x node to insert (in) + * @param cnt number of nodes smaller than or equal to _x_; can be NULL (out) + * + * @return _x_ if not present in the tree, or the node equal to x. 
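+ *             (if an equal node is already in the tree, _x_ is not inserted; see the example at the
+ *             top of this header, which frees _x_ in that case)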
+ */ +#define kavl_insert(suf, proot, x, cnt) kavl_insert_##suf(proot, x, cnt) + +/** + * Find a node in the tree + * + * @param suf name suffix used in KAVL_INIT() + * @param root root of the tree + * @param x node value to find (in) + * @param cnt number of nodes smaller than or equal to _x_; can be NULL (out) + * + * @return node equal to _x_ if present, or NULL if absent + */ +#define kavl_find(suf, root, x, cnt) kavl_find_##suf(root, x, cnt) +#define kavl_interval(suf, root, x, lower, upper) kavl_interval_##suf(root, x, lower, upper) + +/** + * Delete a node from the tree + * + * @param suf name suffix used in KAVL_INIT() + * @param proot pointer to the root of the tree (in/out: root may change) + * @param x node value to delete; if NULL, delete the first node (in) + * + * @return node removed from the tree if present, or NULL if absent + */ +#define kavl_erase(suf, proot, x, cnt) kavl_erase_##suf(proot, x, cnt) +#define kavl_erase_first(suf, proot) kavl_erase_##suf(proot, 0, 0) + +#define kavl_itr_t(suf) struct kavl_itr_##suf + +/** + * Place the iterator at the smallest object + * + * @param suf name suffix used in KAVL_INIT() + * @param root root of the tree + * @param itr iterator + */ +#define kavl_itr_first(suf, root, itr) kavl_itr_first_##suf(root, itr) + +/** + * Place the iterator at the object equal to or greater than the query + * + * @param suf name suffix used in KAVL_INIT() + * @param root root of the tree + * @param x query (in) + * @param itr iterator (out) + * + * @return 1 if find; 0 otherwise. kavl_at(itr) is NULL if and only if query is + * larger than all objects in the tree + */ +#define kavl_itr_find(suf, root, x, itr) kavl_itr_find_##suf(root, x, itr) + +/** + * Move to the next object in order + * + * @param itr iterator (modified) + * + * @return 1 if there is a next object; 0 otherwise + */ +#define kavl_itr_next(suf, itr) kavl_itr_next_bidir_##suf(itr, 1) +#define kavl_itr_prev(suf, itr) kavl_itr_next_bidir_##suf(itr, 0) + +/** + * Return the pointer at the iterator + * + * @param itr iterator + * + * @return pointer if present; NULL otherwise + */ +#define kavl_at(itr) ((itr)->top < (itr)->stack? 0 : *(itr)->top) + +#define KAVL_INIT2(suf, __scope, __type, __head, __cmp) \ + __KAVL_FIND(suf, __scope, __type, __head, __cmp) \ + __KAVL_ROTATE(suf, __type, __head) \ + __KAVL_INSERT(suf, __scope, __type, __head, __cmp) \ + __KAVL_ERASE(suf, __scope, __type, __head, __cmp) \ + __KAVL_ITR(suf, __scope, __type, __head, __cmp) + +#define KAVL_INIT(suf, __type, __head, __cmp) \ + KAVL_INIT2(suf,, __type, __head, __cmp) + +#endif diff --git a/khash.h b/khash.h new file mode 100644 index 0000000..311e88d --- /dev/null +++ b/khash.h @@ -0,0 +1,635 @@ +/* The MIT License + + Copyright (c) 2008, 2009, 2011 by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* + An example: + +#include "khash.h" +KHASH_MAP_INIT_INT(32, char) +int main() { + int ret, is_missing; + khiter_t k; + khash_t(32) *h = kh_init(32); + k = kh_put(32, h, 5, &ret); + kh_value(h, k) = 10; + k = kh_get(32, h, 10); + is_missing = (k == kh_end(h)); + k = kh_get(32, h, 5); + kh_del(32, h, k); + for (k = kh_begin(h); k != kh_end(h); ++k) + if (kh_exist(h, k)) kh_value(h, k) = 1; + kh_destroy(32, h); + return 0; +} +*/ + +/* + 2013-05-02 (0.2.8): + + * Use quadratic probing. When the capacity is power of 2, stepping function + i*(i+1)/2 guarantees to traverse each bucket. It is better than double + hashing on cache performance and is more robust than linear probing. + + In theory, double hashing should be more robust than quadratic probing. + However, my implementation is probably not for large hash tables, because + the second hash function is closely tied to the first hash function, + which reduce the effectiveness of double hashing. + + Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php + + 2011-12-29 (0.2.7): + + * Minor code clean up; no actual effect. + + 2011-09-16 (0.2.6): + + * The capacity is a power of 2. This seems to dramatically improve the + speed for simple keys. Thank Zilong Tan for the suggestion. Reference: + + - http://code.google.com/p/ulib/ + - http://nothings.org/computer/judy/ + + * Allow to optionally use linear probing which usually has better + performance for random input. Double hashing is still the default as it + is more robust to certain non-random input. + + * Added Wang's integer hash function (not used by default). This hash + function is more robust to certain non-random input. + + 2011-02-14 (0.2.5): + + * Allow to declare global functions. + + 2009-09-26 (0.2.4): + + * Improve portability + + 2008-09-19 (0.2.3): + + * Corrected the example + * Improved interfaces + + 2008-09-11 (0.2.2): + + * Improved speed a little in kh_put() + + 2008-09-10 (0.2.1): + + * Added kh_clear() + * Fixed a compiling error + + 2008-09-02 (0.2.0): + + * Changed to token concatenation which increases flexibility. + + 2008-08-31 (0.1.2): + + * Fixed a bug in kh_get(), which has not been tested previously. + + 2008-08-31 (0.1.1): + + * Added destructor +*/ + + +#ifndef __AC_KHASH_H +#define __AC_KHASH_H + +/*! + @header + + Generic hash table library. 
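+  This copy is kalloc-aware: a table created with kh_init2() keeps a kalloc handle and draws all of
+  its memory (flags, keys, values and the table struct itself) from that pool; kh_init() falls back
+  to the libc heap. See kalloc.h, added by this patch.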
+ */ + +#define AC_VERSION_KHASH_H "0.2.8" + +#include +#include +#include +#include "kalloc.h" + +/* compiler specific configuration */ + +#if UINT_MAX == 0xffffffffu +typedef unsigned int khint32_t; +#elif ULONG_MAX == 0xffffffffu +typedef unsigned long khint32_t; +#endif + +#if ULONG_MAX == ULLONG_MAX +typedef unsigned long khint64_t; +#else +typedef unsigned long long khint64_t; +#endif + +#ifndef kh_inline +#ifdef _MSC_VER +#define kh_inline __inline +#else +#define kh_inline inline +#endif +#endif /* kh_inline */ + +#ifndef klib_unused +#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) +#define klib_unused __attribute__ ((__unused__)) +#else +#define klib_unused +#endif +#endif /* klib_unused */ + +typedef khint32_t khint_t; +typedef khint_t khiter_t; + +#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) +#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) +#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) +#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) +#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) +#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) +#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) + +#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4) + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +static const double __ac_HASH_UPPER = 0.77; + +#define __KHASH_TYPE(name, khkey_t, khval_t) \ + typedef struct kh_##name##_s { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + khint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + void *km; \ + } kh_##name##_t; + +#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \ + extern kh_##name##_t *kh_init_##name(void); \ + extern void kh_destroy_##name(kh_##name##_t *h); \ + extern void kh_clear_##name(kh_##name##_t *h); \ + extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ + extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ + extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ + extern void kh_del_##name(kh_##name##_t *h, khint_t x); + +#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + SCOPE kh_##name##_t *kh_init2_##name(void *km) { \ + kh_##name##_t *h; \ + h = (kh_##name##_t*)kcalloc(km, 1, sizeof(kh_##name##_t)); \ + h->km = km; \ + return h; \ + } \ + SCOPE kh_##name##_t *kh_init_##name(void) { return kh_init2_##name(0); } \ + SCOPE void kh_destroy_##name(kh_##name##_t *h) \ + { \ + if (h) { \ + void *km = h->km; \ + kfree(km, (void *)h->keys); kfree(km, h->flags); \ + kfree(km, (void *)h->vals); \ + kfree(km, h); \ + } \ + } \ + SCOPE void kh_clear_##name(kh_##name##_t *h) \ + { \ + if (h && h->flags) { \ + memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \ + h->size = h->n_occupied = 0; \ + } \ + } \ + SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ + { \ + if (h->n_buckets) { \ + khint_t k, i, last, mask, step = 0; \ + mask = h->n_buckets - 1; \ + k = __hash_func(key); i = k & mask; \ + last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + i = (i + (++step)) & mask; \ + if (i == last) return h->n_buckets; \ + } \ + return __ac_iseither(h->flags, i)? 
h->n_buckets : i; \ + } else return 0; \ + } \ + SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ + { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \ + khint32_t *new_flags = 0; \ + khint_t j = 1; \ + { \ + kroundup32(new_n_buckets); \ + if (new_n_buckets < 4) new_n_buckets = 4; \ + if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ + else { /* hash table size to be changed (shrink or expand); rehash */ \ + new_flags = (khint32_t*)kmalloc(h->km, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (!new_flags) return -1; \ + memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (h->n_buckets < new_n_buckets) { /* expand */ \ + khkey_t *new_keys = (khkey_t*)krealloc(h->km, (void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (!new_keys) { kfree(h->km, new_flags); return -1; } \ + h->keys = new_keys; \ + if (kh_is_map) { \ + khval_t *new_vals = (khval_t*)krealloc(h->km, (void *)h->vals, new_n_buckets * sizeof(khval_t)); \ + if (!new_vals) { kfree(h->km, new_flags); return -1; } \ + h->vals = new_vals; \ + } \ + } /* otherwise shrink */ \ + } \ + } \ + if (j) { /* rehashing is needed */ \ + for (j = 0; j != h->n_buckets; ++j) { \ + if (__ac_iseither(h->flags, j) == 0) { \ + khkey_t key = h->keys[j]; \ + khval_t val; \ + khint_t new_mask; \ + new_mask = new_n_buckets - 1; \ + if (kh_is_map) val = h->vals[j]; \ + __ac_set_isdel_true(h->flags, j); \ + while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ + khint_t k, i, step = 0; \ + k = __hash_func(key); \ + i = k & new_mask; \ + while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \ + __ac_set_isempty_false(new_flags, i); \ + if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ + { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ + if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ + __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \ + } else { /* write the element and jump out of the loop */ \ + h->keys[i] = key; \ + if (kh_is_map) h->vals[i] = val; \ + break; \ + } \ + } \ + } \ + } \ + if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ + h->keys = (khkey_t*)krealloc(h->km, (void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) h->vals = (khval_t*)krealloc(h->km, (void *)h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + kfree(h->km, h->flags); /* free the working space */ \ + h->flags = new_flags; \ + h->n_buckets = new_n_buckets; \ + h->n_occupied = h->size; \ + h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ + } \ + return 0; \ + } \ + SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ + { \ + khint_t x; \ + if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ + if (h->n_buckets > (h->size<<1)) { \ + if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \ + *ret = -1; return h->n_buckets; \ + } \ + } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \ + *ret = -1; return h->n_buckets; \ + } \ + } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ + { \ + khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \ + x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ + if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ + else { \ 
+ last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (__ac_isdel(h->flags, i)) site = i; \ + i = (i + (++step)) & mask; \ + if (i == last) { x = site; break; } \ + } \ + if (x == h->n_buckets) { \ + if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ + else x = i; \ + } \ + } \ + } \ + if (__ac_isempty(h->flags, x)) { /* not present at all */ \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; ++h->n_occupied; \ + *ret = 1; \ + } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; \ + *ret = 2; \ + } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ + return x; \ + } \ + SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ + { \ + if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ + __ac_set_isdel_true(h->flags, x); \ + --h->size; \ + } \ + } + +#define KHASH_DECLARE(name, khkey_t, khval_t) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_PROTOTYPES(name, khkey_t, khval_t) + +#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + +#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + KHASH_INIT2(name, static kh_inline klib_unused, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + +/* --- BEGIN OF HASH FUNCTIONS --- */ + +/*! @function + @abstract Integer hash function + @param key The integer [khint32_t] + @return The hash value [khint_t] + */ +#define kh_int_hash_func(key) (khint32_t)(key) +/*! @function + @abstract Integer comparison function + */ +#define kh_int_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract 64-bit integer hash function + @param key The integer [khint64_t] + @return The hash value [khint_t] + */ +#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) +/*! @function + @abstract 64-bit integer comparison function + */ +#define kh_int64_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract const char* hash function + @param s Pointer to a null terminated string + @return The hash value + */ +static kh_inline khint_t __ac_X31_hash_string(const char *s) +{ + khint_t h = (khint_t)*s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; + return h; +} +/*! @function + @abstract Another interface to const char* hash function + @param key Pointer to a null terminated string [const char*] + @return The hash value [khint_t] + */ +#define kh_str_hash_func(key) __ac_X31_hash_string(key) +/*! @function + @abstract Const char* comparison function + */ +#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) + +static kh_inline khint_t __ac_Wang_hash(khint_t key) +{ + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; +} +#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)key) + +static kh_inline khint64_t __ac_Wang_hash64(khint64_t key) +{ + key = ~key + (key << 21); + key = key ^ key >> 24; + key = (key + (key << 3)) + (key << 8); + key = key ^ key >> 14; + key = (key + (key << 2)) + (key << 4); + key = key ^ key >> 28; + key = key + (key << 31); + return key; +} +#define kh_int_hash64_func2(key) __ac_Wang_hash64((khint64_t)key) + +/* --- END OF HASH FUNCTIONS --- */ + +/* Other convenient macros... */ + +/*! 
+ @abstract Type of the hash table. + @param name Name of the hash table [symbol] + */ +#define khash_t(name) kh_##name##_t + +/*! @function + @abstract Initiate a hash table. + @param name Name of the hash table [symbol] + @return Pointer to the hash table [khash_t(name)*] + */ +#define kh_init(name) kh_init_##name() +#define kh_init2(name, km) kh_init2_##name(km) + +/*! @function + @abstract Destroy a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_destroy(name, h) kh_destroy_##name(h) + +/*! @function + @abstract Reset a hash table without deallocating memory. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_clear(name, h) kh_clear_##name(h) + +/*! @function + @abstract Resize a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param s New size [khint_t] + */ +#define kh_resize(name, h, s) kh_resize_##name(h, s) + +/*! @function + @abstract Insert a key to the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @param r Extra return code: -1 if the operation failed; + 0 if the key is present in the hash table; + 1 if the bucket is empty (never used); 2 if the element in + the bucket has been deleted [int*] + @return Iterator to the inserted element [khint_t] + */ +#define kh_put(name, h, k, r) kh_put_##name(h, k, r) + +/*! @function + @abstract Retrieve a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t] + */ +#define kh_get(name, h, k) kh_get_##name(h, k) + +/*! @function + @abstract Remove a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Iterator to the element to be deleted [khint_t] + */ +#define kh_del(name, h, k) kh_del_##name(h, k) + +/*! @function + @abstract Test whether a bucket contains data. + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return 1 if containing data; 0 otherwise [int] + */ +#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) + +/*! @function + @abstract Get key given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Key [type of keys] + */ +#define kh_key(h, x) ((h)->keys[x]) + +/*! @function + @abstract Get value given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Value [type of values] + @discussion For hash sets, calling this results in segfault. + */ +#define kh_val(h, x) ((h)->vals[x]) + +/*! @function + @abstract Alias of kh_val() + */ +#define kh_value(h, x) ((h)->vals[x]) + +/*! @function + @abstract Get the start iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The start iterator [khint_t] + */ +#define kh_begin(h) (khint_t)(0) + +/*! @function + @abstract Get the end iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The end iterator [khint_t] + */ +#define kh_end(h) ((h)->n_buckets) + +/*! 
@function + @abstract Get the number of elements in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of elements in the hash table [khint_t] + */ +#define kh_size(h) ((h)->size) + +/*! @function + @abstract Get the number of buckets in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of buckets in the hash table [khint_t] + */ +#define kh_n_buckets(h) ((h)->n_buckets) + +/*! @function + @abstract Iterate over the entries in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @param kvar Variable to which key will be assigned + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (kvar) = kh_key(h,__i); \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + +/*! @function + @abstract Iterate over the values in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach_value(h, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + +/* More conenient interfaces */ + +/*! @function + @abstract Instantiate a hash set containing integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT(name) \ + KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT(name, khval_t) \ + KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT64(name) \ + KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT64(name, khval_t) \ + KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + +typedef const char *kh_cstr_t; +/*! @function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_STR(name) \ + KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) + +/*! 
@function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_STR(name, khval_t) \ + KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) + +#endif /* __AC_KHASH_H */ diff --git a/tovlp.cpp b/tovlp.cpp index 8401f89..55923cc 100644 --- a/tovlp.cpp +++ b/tovlp.cpp @@ -968,7 +968,7 @@ pdq *pq_p, pdq *pq_a, uint8_t fp, uint8_t fa, long long *d) for (i = 0; i < ns; i++) { if(as[i].del || as[i].v == sv) continue; - nc = dfs_set(g, as[i].v, s>>1, stack, tt, vis, fa); + nc = dfs_set(g, as[i].v, s>>1, stack, tt, vis, fa);///nc > 0 means as[i].v and sv have common suffix if(nc && check_trans_relation_by_path(sv, as[i].v, pq_p, NULL, NULL, pq_a, NULL, NULL, g, vis, fp+fa, nc, 0.45, d)) { @@ -1100,11 +1100,12 @@ int get_min_dec(clean_mul_t *cl, uint32_t positive_flag, uint32_t negative_flag, clean_t *p = NULL; (*v) = (uint32_t)-1; for (i = 0; i < cl->n; i++) cl->a[i].is_b = 0, cl->a[i].min_v = (uint32_t)-1; - kt_for(cl->n, bub_iden_worker, cl, cl->g->n_seq<<1); + kt_for(cl->n, bub_iden_worker, cl, cl->g->n_seq<<1);///fast check if there are bubbles in graph for (i = 0; i < cl->n; i++) { if(cl->a[i].is_b) ///pop bubble { + ///pop all bubbles n_pop = asg_pop_bubble_primary_trio(cl->g, cl->ug, cl->bs_flag, &(cl->a[i].b), cl->max_dist, positive_flag, negative_flag, o); if(n_pop ==0) fprintf(stderr, "ERROR-n_pop\n"); @@ -1116,6 +1117,7 @@ int get_min_dec(clean_mul_t *cl, uint32_t positive_flag, uint32_t negative_flag, { if(cl->a[i].min_v == (uint32_t)-1) continue; // if(b->min_v == (uint32_t)-1 || (b->min_d > d) || (b->min_d == d && b->min_v < eid)) + ///looks like select the minum node, and back if(!p || p->min_d > cl->a[i].min_d || (p->min_d == cl->a[i].min_d && cl->g->seq[p->min_v>>1].len > cl->g->seq[cl->a[i].min_v>>1].len) || (p->min_d == cl->a[i].min_d && cl->g->seq[p->min_v>>1].len == cl->g->seq[cl->a[i].min_v>>1].len && p->min_v > cl->a[i].min_v))