Skip to content

Commit

Permalink
code clean
Browse files Browse the repository at this point in the history
  • Loading branch information
chhylp123 committed Aug 28, 2022
1 parent 8b7ca5f commit 7b89478
Show file tree
Hide file tree
Showing 10 changed files with 1,206 additions and 93 deletions.
419 changes: 415 additions & 4 deletions Correct.cpp

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions Correct.h
Original file line number Diff line number Diff line change
Expand Up @@ -1131,6 +1131,13 @@ void correct_ul_overlap(overlap_region_alloc* overlap_list, const ul_idx_t *uref
int force_repeat, int is_consensus, int* fully_cov, int* abnormal,
double max_ov_diff_ec, long long winLen, void *km);

void lchain_align(overlap_region_alloc* overlap_list, const ul_idx_t *uref,
UC_Read* g_read, Correct_dumy* dumy, UC_Read* overlap_read,
Graph* g, Graph* DAGCon, Cigar_record* current_cigar,
haplotype_evdience_alloc* hap, Round2_alignment* second_round,
kvec_t_u64_warp* v_idx, window_list_alloc* win_ciagr_buf,
int force_repeat, int is_consensus, int* fully_cov, int* abnormal,
double max_ov_diff_ec, long long winLen, void *km);
/***
type:
0. match
Expand Down
302 changes: 302 additions & 0 deletions Hash_Table.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,94 @@ int get_fake_gap_shift(Fake_Cigar* x, int index)
return result;
}

int ovlp_chain_gen(overlap_region_alloc* ol, overlap_region* t, int64_t xl, int64_t yl, int64_t apend_be)
{
if (ol->length + 1 > ol->size) {
uint64_t sl = ol->size;
ol->size = ol->length + 1;
kroundup64(ol->size);
REALLOC(ol->list, ol->size);
/// need to set new space to be 0
memset(ol->list + sl, 0, sizeof(overlap_region)*(ol->size - sl));
}

if ((ol->length!=0) && (ol->list[ol->length-1].y_id==t->y_id)) {
if((ol->list[ol->length-1].shared_seed > t->shared_seed) ||
((ol->list[ol->length-1].shared_seed == t->shared_seed) &&
(ol->list[ol->length-1].overlapLen <= t->overlapLen))) {
return 0;
}
else {
ol->length--;
}
}

int64_t xr, yr, dd, pdd, dq, dr, id, i, fn;
if(t->x_pos_s <= t->y_pos_s) {
t->y_pos_s -= t->x_pos_s; t->x_pos_s = 0;
} else {
t->x_pos_s -= t->y_pos_s; t->y_pos_s = 0;
}

xr = xl-t->x_pos_e-1; yr = yl-t->y_pos_e-1;
if(xr <= yr) {
t->x_pos_e = xl-1; t->y_pos_e += xr;
} else {
t->y_pos_e = yl-1; t->x_pos_e += yr;
}

overlap_region *o = &(ol->list[ol->length++]);
o->shared_seed = t->shared_seed;
o->align_length = 0;
o->is_match = 0;
o->non_homopolymer_errors = 0;
o->strong = 0;
o->x_id = t->x_id;
o->y_id = t->y_id;
o->x_pos_strand = 0;///always 0
o->y_pos_strand = t->x_pos_strand;

resize_fake_cigar(&(o->f_cigar), (t->f_cigar.length + 2), NULL);
if(apend_be == 1) add_fake_cigar(&(o->f_cigar), o->x_pos_s, 0, NULL);

if (t->x_pos_strand == 1) {
o->x_pos_e = xl-t->x_pos_s-1; o->x_pos_s = xl-t->x_pos_e-1;
o->y_pos_e = yl-t->y_pos_s-1; o->y_pos_s = yl-t->y_pos_e-1;

pdd = INT32_MAX; fn = t->f_cigar.length;
for (i = 0; i < fn; i++) {
dd = get_fake_gap_shift(&(t->f_cigar), i);
if(dd != pdd) {
pdd = dd;
add_fake_cigar(&(o->f_cigar), xl-get_fake_gap_pos(&(t->f_cigar), i)-1, pdd, NULL);
}
}
} else {
o->x_pos_e = t->x_pos_e; o->x_pos_s = t->x_pos_s;
o->y_pos_e = t->y_pos_e; o->y_pos_s = t->y_pos_s;

dq = t->x_pos_e - t->x_pos_s;
dr = t->y_pos_e - t->y_pos_s;
id = dr - dq;

pdd = INT32_MAX; fn = t->f_cigar.length;
for (i = fn-1; i >= 0; i--) {
dd = get_fake_gap_shift(&(t->f_cigar), i);
if(dd != pdd) {
pdd = dd;
add_fake_cigar(&(o->f_cigar), get_fake_gap_pos(&(t->f_cigar), i), id - pdd, NULL);
}
}
}

if((apend_be == 1) && (get_fake_gap_pos(&(o->f_cigar), o->f_cigar.length-1) != ((int64_t)o->x_pos_e))) {
add_fake_cigar(&(o->f_cigar), o->x_pos_e,
get_fake_gap_shift(&(o->f_cigar), o->f_cigar.length-1), NULL);
}

return 1;
}

int append_inexact_overlap_region_alloc(overlap_region_alloc* list, overlap_region* tmp,
long long xLen, long long yLen, int add_beg_end, void *km)
{
Expand Down Expand Up @@ -1107,3 +1195,217 @@ void resize_window_list_alloc(window_list_alloc* x, uint64_t size)
for (k = 0; k < x->m; k++) x->a[k].error = -1;
x->c.n = 0;
}

#define normal_cw(x) (((x)&(0xffu))>=((x)>>8)?(((x)&(0xffu))/((x)>>8)):1)

int32_t lchain_check(k_mer_hit *a, int32_t n_a, Chain_Data *dp, double bw_thres)
{
int32_t i, tot_g = 0, sc, dg, dq, dr, dd, span;
double bw_pen;
if (n_a == 0) return -1;
if (n_a > 1) {
if ((a[0].self_offset >= a[n_a-1].self_offset)||(a[0].offset == a[n_a-1].offset)) return -1;
dq = (int32_t)a[n_a-1].self_offset - (int32_t)a[0].self_offset;
dr = (int32_t)a[n_a-1].offset - (int32_t)a[0].offset;
dd = ((dq>=dr)? (dq-dr): (dr-dq));//gap
dg = ((dq>=dr)? (dr): (dq));///len
if (dd > (dg*bw_thres)) return -1;
}

for (i = 1; i < n_a; ++i) {///a[] is sorted by offset, instead of self_offset; but offset might be equal
if(a[i-1].self_offset >= a[i].self_offset) break;
if(a[i-1].offset == a[i].offset) break;
}
if (i < n_a) return -1;

bw_pen = 1.0 / bw_thres;
dp->score[0] = normal_w((a[0].cnt&(0xffu)), (a[0].cnt>>8)); dp->pre[0] = -1;
for (i = 1; i < n_a; ++i) {
dq = (int32_t)a[i].self_offset - (int32_t)a[i-1].self_offset;
dr = (int32_t)a[i].offset - (int32_t)a[i-1].offset;
dd = ((dq>=dr)? (dq-dr): (dr-dq));//gap
dg = ((dq>=dr)? (dr): (dq));///len

tot_g += dd;
if (dd > THRESHOLD_MAX_SIZE && dd > (dg*bw_thres)) break;
span = a[i].cnt&(0xffu);
sc = dg < span? dg : span;
sc = normal_w(sc, ((int32_t)(a[i].cnt>>8)));
sc -= (int32_t)((((double)dd)/((double)dg))*bw_pen*((double)sc));///bw_pen is 20 for HiFi

dp->score[i] = dp->score[i-1] + sc;
dp->pre[i] = i - 1;
}
if (i < n_a) return -1;

if(n_a > 1) {
dq = (int32_t)a[n_a-1].self_offset - (int32_t)a[0].self_offset;
dr = (int32_t)a[n_a-1].offset - (int32_t)a[0].offset;
dg = ((dq>=dr)? (dr): (dq));///len
dd = tot_g;///gap
if (dd > (dg*bw_thres)) return -1;
}
return n_a;
}

inline int32_t cal_bw(const k_mer_hit *ai, const k_mer_hit *aj, double bw_rate, int64_t sf_l, int64_t ot_l)
{
///ai is the suffix of aj
int64_t sf_s = aj->self_offset, sf_e = ai->self_offset + 1;
int64_t ot_s = aj->offset, ot_e = ai->offset + 1;
int64_t sf_r = sf_l - sf_e, ot_r = ot_l - ot_e;
if(sf_s <= ot_s) sf_s = 0;
else sf_s -= ot_s;

if(sf_r <= ot_r) sf_e = sf_l;
else sf_e += ot_r;

return (sf_e - sf_s)*bw_rate;
}

inline int32_t comput_sc_ch(const k_mer_hit *ai, const k_mer_hit *aj, double bw_rate, double chn_pen_gap, double chn_pen_skip, int64_t sl, int64_t ol)
{
///ai is the suffix of aj
int32_t dq, dr, dd, dg, q_span, sc;
dq = (int64_t)(ai->self_offset) - (int64_t)(aj->self_offset);
if(dq < 0) return INT32_MIN;
dr = (int64_t)(ai->offset) - (int64_t)(aj->offset);
if(dr < 0) return INT32_MIN;
dd = dr > dq? dr - dq : dq - dr;//gap
if((dd > 16) && (dd > cal_bw(ai, aj, bw_rate, sl, ol))) return INT32_MIN;
dg = dr < dq? dr : dq;//len
q_span = ai->cnt&(0xffu);
sc = q_span < dg? q_span : dg;
sc = normal_w(sc, ((int32_t)(ai->cnt>>8)));
if (dd || dg > q_span) {
double lin_pen, a_pen;
lin_pen = (chn_pen_gap*(double)dd);
a_pen = ((double)(sc))*((((double)dd)/((double)dg))/bw_rate);
if(lin_pen > a_pen) lin_pen = a_pen;
lin_pen += (chn_pen_skip*(double)dg);
sc -= (int32_t)lin_pen;
}
return sc;
}

uint64_t lchain_dp(k_mer_hit* a, int64_t a_n, k_mer_hit* des, Chain_Data* dp, overlap_region* res,
int64_t max_skip, int64_t max_iter, int64_t max_dis, double chn_pen_gap, double chn_pen_skip, double bw_rate,
int64_t xl, int64_t yl, int64_t quick_check)
{
int64_t *p, *t, max_f, n_skip, st, max_j, end_j, sc, msc, msc_i, bw, max_ii, ovl, movl;
int32_t *f, max, tmp; int64_t i, j, ret, dq, dr, dd, pdd, cL = 0;
resize_Chain_Data(dp, a_n, NULL);
t = dp->tmp; f = dp->score; p = dp->pre;
bw = ((xl < yl)?xl:yl); bw *= bw_rate;
msc = msc_i = -1; movl = INT32_MAX;

if(quick_check) {
ret = lchain_check(a, a_n, dp, bw_rate);
if (ret > 0) {
a_n = ret; msc_i = a_n-1; msc = f[msc_i];
goto skip_ldp;
}
}

memset(t, 0, (a_n*sizeof((*t))));
for (i = st = 0, max_ii = -1; i < a_n; ++i) {
max_f = a[i].cnt&(0xffu);
n_skip = 0; max_j = end_j = -1;
if ((i-st) > max_iter) st = i-max_iter;

for (j = i - 1; j >= st; --j) {
sc = comput_sc_ch(&a[i], &a[j], bw_rate, chn_pen_gap, chn_pen_skip, xl, yl);
if (sc == INT32_MIN) continue;
sc += f[j];
if (sc > max_f) {
max_f = sc, max_j = j;
if (n_skip > 0) --n_skip;
} else if (t[j] == (int32_t)i) {
if (++n_skip > max_skip)
break;
}
if (p[j] >= 0) t[p[j]] = i;
}
end_j = j;

if (max_ii < 0 || ((int64_t)a[i].offset) - ((int64_t)a[max_ii].offset) > max_dis) {
max = INT32_MIN; max_ii = -1;
for (j = i - 1; j >= st; --j) {
if (max < f[j]) {
max = f[j], max_ii = j;
}
}
}

if (max_ii >= 0 && max_ii < end_j) {///just have a try with a[i]<->a[max_ii]
tmp = comput_sc_ch(&a[i], &a[max_ii], bw_rate, chn_pen_gap, chn_pen_skip, xl, yl);
if (tmp != INT32_MIN && max_f < tmp + f[max_ii])
max_f = tmp + f[max_ii], max_j = max_ii;
}
f[i] = max_f; p[i] = max_j;
if ((max_ii < 0) || (((((int64_t)a[i].offset)-((int64_t)a[max_ii].offset))<=max_dis) && (f[max_ii]<f[i]))) {
max_ii = i;
}
if(f[i] >= msc) {
ovl = get_chainLen(a[i].self_offset, a[i].self_offset, xl, a[i].offset, a[i].offset, yl);
if(f[i] > msc || ovl < movl) {
msc = f[i]; msc_i = i; movl = ovl;
}
}
}

skip_ldp:
clear_fake_cigar(&(res->f_cigar));
///a[] has been sorted by offset
i = msc_i;
res->x_pos_e = a[i].self_offset;
res->y_pos_e = a[i].offset;
res->shared_seed = msc;
// res->overlapLen = movl;

dq = res->x_pos_e - a[i].self_offset;
dr = res->y_pos_e - a[i].offset;
dd = dr - dq; pdd = dd;
///record first site
///the length of f_cigar should be at least 1
///record the offset of reference
add_fake_cigar(&(res->f_cigar), a[i].self_offset, dd, NULL);
cL = 0;
if(res->x_pos_strand == 1) {
while (i >= 0) {
dq = res->x_pos_e - a[i].self_offset;
dr = res->y_pos_e - a[i].offset;
dd = dr - dq;
if(dd != pdd) {
pdd = dd;
///record this site
add_fake_cigar(&(res->f_cigar), a[i].self_offset, pdd, NULL);
}
t[cL++] = i;
res->x_pos_s = a[i].self_offset;
res->y_pos_s = a[i].offset;
msc_i = i; i = p[i];
}
}
else {
while (i >= 0) {
dq = res->x_pos_e - a[i].self_offset;
dr = res->y_pos_e - a[i].offset;
dd = dr - dq;
if(dd == pdd) {
res->f_cigar.length--;
add_fake_cigar(&(res->f_cigar), a[i].self_offset, pdd, NULL);
} else {
pdd = dd;
add_fake_cigar(&(res->f_cigar), a[i].self_offset, pdd, NULL);
}
t[cL++] = i;
res->x_pos_s = a[i].self_offset;
res->y_pos_s = a[i].offset;
msc_i = i; i = p[i];
}
}
res->overlapLen = get_chainLen(a[msc_i].self_offset, a[i].self_offset, xl, a[msc_i].offset, a[i].offset, yl);
for (i = 0; i < cL; i++) des[i] = a[t[cL-i-1]];
return cL;
}
5 changes: 5 additions & 0 deletions Hash_Table.h
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,11 @@ void clear_window_list_alloc(window_list_alloc* x);
void destory_window_list_alloc(window_list_alloc* x);
void resize_window_list_alloc(window_list_alloc* x, uint64_t size);
long long chain_DP(k_mer_hit* a, long long a_n, Chain_Data* dp, overlap_region* result, double band_width_threshold, int max_skip, int x_readLen, int y_readLen, void *km);
uint64_t lchain_dp(k_mer_hit* a, int64_t a_n, k_mer_hit* des, Chain_Data* dp, overlap_region* res,
int64_t max_skip, int64_t max_iter, int64_t max_dis, double chn_pen_gap, double chn_pen_skip, double bw_rate,
int64_t xl, int64_t yl, int64_t quick_check);
int ovlp_chain_gen(overlap_region_alloc* ol, overlap_region* t, int64_t xl, int64_t yl, int64_t apend_be);
int append_utg_inexact_overlap_region_alloc(overlap_region_alloc* list, overlap_region* tmp,
ma_utg_v *ua, int add_beg_end, void *km);

#endif
5 changes: 3 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ INCLUDES=
OBJS= CommandLines.o Process_Read.o Assembly.o Hash_Table.o \
POA.o Correct.o Levenshtein_distance.o Overlaps.o Trio.o kthread.o Purge_Dups.o \
htab.o hist.o sketch.o anchor.o extract.o sys.o ksw2_extz2_sse.o hic.o rcut.o horder.o \
tovlp.o inter.o kalloc.o gfa_ut.o
tovlp.o inter.o kalloc.o gfa_ut.o gchain_map.o
EXE= hifiasm
LIBS= -lz -lpthread -lm

Expand Down Expand Up @@ -78,4 +78,5 @@ horder.o: horder.h
tovlp.o: tovlp.h
inter.o: inter.h Process_Read.h
kalloc.o: kalloc.h
gfa_ut.o: Overlaps.h
gfa_ut.o: Overlaps.h
gchain_map.o: gchain_map.h
Loading

0 comments on commit 7b89478

Please sign in to comment.