Skip to content

Commit

Permalink
ont simplex support
Browse files Browse the repository at this point in the history
  • Loading branch information
chhylp123 committed Nov 27, 2024
1 parent 80fa5ed commit 676385c
Show file tree
Hide file tree
Showing 10 changed files with 896 additions and 88 deletions.
49 changes: 38 additions & 11 deletions CommandLines.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
#include <sys/time.h>
#include "CommandLines.h"
#include "ketopt.h"
#include "kseq.h"

KSEQ_INIT(gzFile, gzread)

#define DEFAULT_OUTPUT "hifiasm.asm"

Expand Down Expand Up @@ -72,7 +75,9 @@ static ko_longopt_t long_options[] = {
{ "telo-s", ko_required_argument, 357},
{ "ctg-n", ko_required_argument, 358},
{ "ont", ko_no_argument, 359},
{ "sc-n", ko_no_argument, 360},
// { "sc-n", ko_no_argument, 360},
{ "chem-c", ko_required_argument, 361},
{ "chem-f", ko_required_argument, 362},
// { "path-round", ko_required_argument, 348},
{ 0, 0, 0 }
};
Expand Down Expand Up @@ -210,6 +215,14 @@ void Print_H(hifiasm_opt_t* asm_opt)
fprintf(stderr, " --telo-s INT\n");
fprintf(stderr, " min score for telomere reads [%ld]\n", asm_opt->telo_mic_sc);

fprintf(stderr, " ONT simplex assembly (beta):\n");
fprintf(stderr, " --ont assemble ONT simplex reads in fastq format\n");
// fprintf(stderr, " --sc-n consider base qual value for assembly\n");
fprintf(stderr, " --chem-c INT\n");
fprintf(stderr, " detect chemical reads with <=INT other reads support [%lu]\n", asm_opt->chemical_cov);
fprintf(stderr, " --chem-f INT\n");
fprintf(stderr, " length of flanking regions for chemical read detection [%lu]\n", asm_opt->chemical_flank);


fprintf(stderr, "Example: ./hifiasm -o NA12878.asm -t 32 NA12878.fq.gz\n");
fprintf(stderr, "See `https://hifiasm.readthedocs.io/en/latest/' or `man ./hifiasm.1' for complete documentation.\n");
Expand Down Expand Up @@ -342,6 +355,8 @@ void init_opt(hifiasm_opt_t* asm_opt)

asm_opt->is_ont = 0;
asm_opt->is_sc = 0;
asm_opt->chemical_cov = 1;
asm_opt->chemical_flank = 256;
}

void destory_enzyme(enzyme* f)
Expand Down Expand Up @@ -692,25 +707,33 @@ int check_option(hifiasm_opt_t* asm_opt)

void get_queries(int argc, char *argv[], ketopt_t* opt, hifiasm_opt_t* asm_opt)
{
if(opt->ind == argc)
{
if(opt->ind == argc) {
return;
}

asm_opt->num_reads = argc - opt->ind;
asm_opt->read_file_names = (char**)malloc(sizeof(char*)*asm_opt->num_reads);

long long i;
gzFile dfp;
for (i = 0; i < asm_opt->num_reads; i++)
{
long long i; int ret;
gzFile dfp; kseq_t *ks = NULL;
for (i = 0; i < asm_opt->num_reads; i++) {
asm_opt->read_file_names[i] = argv[i + opt->ind];
dfp = gzopen(asm_opt->read_file_names[i], "r");
if (dfp == 0)
{
if (dfp == 0) {
fprintf(stderr, "[ERROR] Cannot find the input read file: %s\n",
asm_opt->read_file_names[i]);
exit(0);
} else if(asm_opt->is_sc){
ks = kseq_init(dfp);
while (((ret = kseq_read(ks)) >= 0)) {
if((ks->qual.l == 0) || (ks->qual.s == NULL)) {
fprintf(stderr, "[ERROR] %s is in fasta format rather than fastq format\n", asm_opt->read_file_names[i]);
asm_opt->is_sc = 0;
exit(0);
}
break;
}
kseq_destroy(ks); ks = NULL;
}
gzclose(dfp);
}
Expand Down Expand Up @@ -914,9 +937,13 @@ int CommandLine_process(int argc, char *argv[], hifiasm_opt_t* asm_opt)
else if (c == 357) asm_opt->telo_mic_sc = atol(opt.arg);
else if (c == 358) asm_opt->max_contig_tip = atol(opt.arg);
else if (c == 359) {
asm_opt->is_ont = 1; asm_opt->max_ov_diff_ec = 0.07; ///asm_opt->mz_win = 37; asm_opt->k_mer_length = 37;
} else if (c == 360) {
asm_opt->is_ont = 1; asm_opt->max_ov_diff_ec = 0.07; asm_opt->is_sc = 1; ///asm_opt->mz_win = 37; asm_opt->k_mer_length = 37;
} /**else if (c == 360) {
asm_opt->is_sc = 1;
}**/ else if (c == 361) {
asm_opt->chemical_cov = atol(opt.arg);
} else if (c == 362) {
asm_opt->chemical_flank = atol(opt.arg);
} else if (c == 'l') { ///0: disable purge_dup; 1: purge containment; 2: purge overlap
asm_opt->purge_level_primary = asm_opt->purge_level_trio = atoi(opt.arg);
}
Expand Down
4 changes: 3 additions & 1 deletion CommandLines.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#include <pthread.h>
#include <stdint.h>

#define HA_VERSION "0.21.0-r666"
#define HA_VERSION "0.21.0-r686"

#define VERBOSE 0

Expand Down Expand Up @@ -163,6 +163,8 @@ typedef struct {

uint64_t is_ont;
uint64_t is_sc;
uint64_t chemical_cov;
uint64_t chemical_flank;
} hifiasm_opt_t;

extern hifiasm_opt_t asm_opt;
Expand Down
110 changes: 108 additions & 2 deletions Correct.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24531,7 +24531,108 @@ void reassign_gaps(overlap_region *z, overlap_region *aux, char* qstr, int64_t q
// }
}

void gen_hc_r_alin(overlap_region_alloc* ol, Candidates_list *cl, All_reads *rref, UC_Read* qu, UC_Read* tu, bit_extz_t *exz, overlap_region *aux_o, double e_rate, int64_t wl, int64_t rid, int64_t khit, int64_t move_gap, asg16_v* buf)
uint32_t is_ovlp_debug(int64_t s, int64_t e, int64_t ws, int64_t we, int64_t op)
{
int64_t os, oe, ovlp;
os = MAX(s, ws); oe = MIN(e, we);
ovlp = ((oe>os)? (oe-os):0);
if(op != 2) {
return (!!ovlp);
} else {
if(ws >= s && we <= e) return 1;
}
return 0;
}

uint32_t inline ff_tend(overlap_region *z, int64_t wn, int64_t dn, double dr, double er, int64_t min_err)
{
int64_t k, zwn = z->w_list.n, err, mm, qi, ci, cn, ql, ws, we, zs, ze, s, e, os, oe, ovlp; bit_extz_t ez; uint32_t cl; uint16_t c;
zs = z->x_pos_s; ze = z->x_pos_e + 1; ql = ze - zs;
if(ql < wn) return 0; if(dn > (ql*dr)) dn = ql*dr; if(dn < wn) return 0; if(ql < dn) return 0;

s = zs; e = zs + dn;
// if(z->y_id == 27) fprintf(stderr, "-a-[M::%s] tid::%u\t%.*s\tz::[%ld,%ld)\ti::[%ld,%ld)\n", __func__, z->y_id, (int)Get_NAME_LENGTH(R_INF, z->y_id), Get_NAME(R_INF, z->y_id), zs, ze, s, e);
for (k = err = mm = 0, qi = zs; (k < zwn) && (z->w_list.a[k].x_start < e); k++) {
// if(z->y_id == 27) fprintf(stderr, "-a-[M::%s] tid::%u\t%.*s\tw::[%d,%d)\terr::%d\n", __func__, z->y_id, (int)Get_NAME_LENGTH(R_INF, z->y_id), Get_NAME(R_INF, z->y_id), z->w_list.a[k].x_start, z->w_list.a[k].x_end + 1, z->w_list.a[k].error);
if(!(is_ualn_win(z->w_list.a[k]))) {
set_bit_extz_t(ez, (*z), k);
ci = 0; cn = ez.cigar.n; qi = ez.ts; //ti = ez.ps;
while (ci < cn && qi < e) {
ws = qi;
ci = pop_trace(&(ez.cigar), ci, &c, &cl);
if(c!=2) qi += cl;
// if(c!=3) ti += cl;
we = qi;

if(c == 0) {
os = MAX(s, ws); oe = MIN(e, we);
ovlp = ((oe>os)? (oe-os):0); mm += ovlp;
} else {
err += cl;
}
// assert(is_ovlp_debug(s, e, ws, we, c));

if((err > min_err) && ((mm + err) > wn) && (err > ((mm + err)*er))) {
// fprintf(stderr, "-0-[M::%s] tid::%u\t%.*s\tmm::%ld\terr::%ld\tdn::%ld\twn::%ld\tdif::%f\n", __func__, z->y_id, (int)Get_NAME_LENGTH(R_INF, z->y_id), Get_NAME(R_INF, z->y_id), mm, err, dn, wn, er);
return 1;
}
}
} else {
ws = z->w_list.a[k].x_start; we = z->w_list.a[k].x_end + 1;
err += we - ws;

// assert(is_ovlp_debug(s, e, ws, we, -1));

if((err > min_err) && ((mm + err) > wn) && (err > ((mm + err)*er))) {
// fprintf(stderr, "-1-[M::%s] tid::%u\t%.*s\tmm::%ld\terr::%ld\tdn::%ld\twn::%ld\tdif::%f\n", __func__, z->y_id, (int)Get_NAME_LENGTH(R_INF, z->y_id), Get_NAME(R_INF, z->y_id), mm, err, dn, wn, er);
return 1;
}
}
}

s = ze - dn; e = ze;
for (k = zwn - 1, err = mm = 0, qi = ze; (k >= 0) && ((z->w_list.a[k].x_end + 1) > s); k--) {
if(!(is_ualn_win(z->w_list.a[k]))) {
set_bit_extz_t(ez, (*z), k);
ci = ((int64_t)ez.cigar.n) - 1; qi = ez.te + 1; //ti = ez.pe + 1;
while (ci >= 0 && qi > s) {
we = qi;
ci = pop_trace_back(&(ez.cigar), ci, &c, &cl);
if(c!=2) qi -= cl;
// if(c!=3) ti += cl;
ws = qi;

if(c == 0) {
os = MAX(s, ws); oe = MIN(e, we);
ovlp = ((oe>os)? (oe-os):0); mm += ovlp;
} else {
err += cl;
}

// assert(is_ovlp_debug(s, e, ws, we, c));

if((err > min_err) && ((mm + err) > wn) && (err > ((mm + err)*er))) {
// fprintf(stderr, "-2-[M::%s] tid::%u\t%.*s\tmm::%ld\terr::%ld\tdn::%ld\twn::%ld\tdif::%f\n", __func__, z->y_id, (int)Get_NAME_LENGTH(R_INF, z->y_id), Get_NAME(R_INF, z->y_id), mm, err, dn, wn, er);
return 1;
}
}
} else {
ws = z->w_list.a[k].x_start; we = z->w_list.a[k].x_end + 1;
err += we - ws;

// assert(is_ovlp_debug(s, e, ws, we, -1));

if((err > min_err) && ((mm + err) > wn) && (err > ((mm + err)*er))) {
// fprintf(stderr, "-3-[M::%s] tid::%u\t%.*s\tmm::%ld\terr::%ld\tdn::%ld\twn::%ld\tdif::%f\n", __func__, z->y_id, (int)Get_NAME_LENGTH(R_INF, z->y_id), Get_NAME(R_INF, z->y_id), mm, err, dn, wn, er);
return 1;
}
}
}

return 0;
}

void gen_hc_r_alin(overlap_region_alloc* ol, Candidates_list *cl, All_reads *rref, UC_Read* qu, UC_Read* tu, bit_extz_t *exz, overlap_region *aux_o, double e_rate, int64_t wl, int64_t rid, int64_t khit, int64_t move_gap, asg16_v* buf, uint8_t chem_drop)
{
uint64_t i, bs, k, ql = qu->length; Window_Pool w; double err, e_max, rr; int64_t re;
overlap_region t; overlap_region *z; //asg64_v iidx, buf, buf1;
Expand Down Expand Up @@ -24565,6 +24666,9 @@ void gen_hc_r_alin(overlap_region_alloc* ol, Candidates_list *cl, All_reads *rre

if(!gen_hc_fast_cigar(z, cl, rref, w.window_length, qu->seq, tu, exz, aux_o, e_rate, ql, rid, khit, &re)) continue;

if(chem_drop && ff_tend(z, 384, 2000, 0.1, (((e_rate*10)<0.36)?(e_rate*10):(0.36)), 128)) continue;


// if(z->x_id == 3196 && z->y_id == 3199) fprintf(stderr, "-1-[M::%s] tid::%u\t%.*s\trr::%f\tre::%ld\n", __func__, z->y_id, (int)Get_NAME_LENGTH(R_INF, z->y_id), Get_NAME(R_INF, z->y_id), rr, re);

reassign_gaps(z, aux_o, qu->seq, ql, NULL, -1, rref, tu, buf);
Expand All @@ -24586,7 +24690,7 @@ void gen_hc_r_alin(overlap_region_alloc* ol, Candidates_list *cl, All_reads *rre
}


void gen_hc_r_alin_nec(overlap_region_alloc* ol, Candidates_list *cl, All_reads *rref, UC_Read* qu, UC_Read* tu, bit_extz_t *exz, overlap_region *aux_o, double e_rate, int64_t wl, int64_t rid, int64_t khit, int64_t move_gap, asg16_v* buf)
void gen_hc_r_alin_nec(overlap_region_alloc* ol, Candidates_list *cl, All_reads *rref, UC_Read* qu, UC_Read* tu, bit_extz_t *exz, overlap_region *aux_o, double e_rate, int64_t wl, int64_t rid, int64_t khit, int64_t move_gap, asg16_v* buf, uint8_t chem_drop)
{
uint64_t i, bs, k, ql = qu->length; Window_Pool w; double err, e_max, rr; int64_t re;
overlap_region t; overlap_region *z; //asg64_v iidx, buf, buf1;
Expand Down Expand Up @@ -24628,6 +24732,8 @@ void gen_hc_r_alin_nec(overlap_region_alloc* ol, Candidates_list *cl, All_reads

if(!gen_hc_fast_cigar(z, cl, rref, w.window_length, qu->seq, tu, exz, aux_o, e_rate, ql, rid, khit, &re)) continue;

if(chem_drop && ff_tend(z, 384, 2000, 0.1, (((e_rate*10)<0.36)?(e_rate*10):(0.36)), 128)) continue;

// if(z->x_id == 3196 && z->y_id == 3199) fprintf(stderr, "-1-[M::%s] tid::%u\t%.*s\trr::%f\tre::%ld\n", __func__, z->y_id, (int)Get_NAME_LENGTH(R_INF, z->y_id), Get_NAME(R_INF, z->y_id), rr, re);
///debug for memory
// snprintf(NULL, 0, "dwn::%u\tdcn::%u", (uint32_t)aux_o->w_list.n, (uint32_t)aux_o->w_list.c.n);
Expand Down
4 changes: 2 additions & 2 deletions Correct.h
Original file line number Diff line number Diff line change
Expand Up @@ -1391,8 +1391,8 @@ int64_t get_rid_backward_cigar_err(rtrace_iter *it, ul_ov_t *aln, kv_rtrace_t *t
const ul_idx_t *uref, char* qstr, UC_Read *tu, overlap_region_alloc *ol, overlap_region *o,
bit_extz_t *exz, double e_rate, int64_t qs);

void gen_hc_r_alin(overlap_region_alloc* ol, Candidates_list *cl, All_reads *rref, UC_Read* qu, UC_Read* tu, bit_extz_t *exz, overlap_region *aux_o, double e_rate, int64_t wl, int64_t rid, int64_t khit, int64_t move_gap, asg16_v* buf);
void gen_hc_r_alin_nec(overlap_region_alloc* ol, Candidates_list *cl, All_reads *rref, UC_Read* qu, UC_Read* tu, bit_extz_t *exz, overlap_region *aux_o, double e_rate, int64_t wl, int64_t rid, int64_t khit, int64_t move_gap, asg16_v* buf);
void gen_hc_r_alin(overlap_region_alloc* ol, Candidates_list *cl, All_reads *rref, UC_Read* qu, UC_Read* tu, bit_extz_t *exz, overlap_region *aux_o, double e_rate, int64_t wl, int64_t rid, int64_t khit, int64_t move_gap, asg16_v* buf, uint8_t chem_drop);
void gen_hc_r_alin_nec(overlap_region_alloc* ol, Candidates_list *cl, All_reads *rref, UC_Read* qu, UC_Read* tu, bit_extz_t *exz, overlap_region *aux_o, double e_rate, int64_t wl, int64_t rid, int64_t khit, int64_t move_gap, asg16_v* buf, uint8_t chem_drop);
uint64_t gen_hc_r_alin_re(overlap_region* z, Candidates_list *cl, char* qstr, uint64_t ql, char* tstr, uint64_t tl, bit_extz_t *exz, overlap_region *aux_o, double e_rate, int64_t wl, int64_t rid, int64_t khit, int64_t move_gap, asg16_v* buf);
void rphase_hc(overlap_region_alloc* ol, All_reads *rref, haplotype_evdience_alloc* hp, UC_Read* qu, UC_Read* tu, kv_ul_ov_t *c_idx, asg64_v* idx, asg64_v* buf, int64_t bd, int64_t wl, int64_t ql, uint8_t occ_thres/**, uint8_t is_dbg**/, uint64_t rid, uint64_t hpc_len, uint64_t std_bs, Chain_Data *dp, asg8_v *q8, asg8_v *t8);
void set_exact_exz(bit_extz_t *exz, int64_t qs, int64_t qe, int64_t ts, int64_t te);
Expand Down
5 changes: 3 additions & 2 deletions Hash_Table.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@
#define WINDOW 375
#define WINDOW_BOUNDARY 375
#define WINDOW_HC 775
#define WINDOW_OHC 475
// #define WINDOW_OHC 375 ///ONT high error
///ONT high error
// #define WINDOW_OHC 475
#define WINDOW_OHC 375
#define WINDOW_HC_FAST 512
///for one side, the first or last WINDOW_UNCORRECT_SINGLE_SIDE_BOUNDARY bases should not be corrected
#define WINDOW_UNCORRECT_SINGLE_SIDE_BOUNDARY 25
Expand Down
Loading

0 comments on commit 676385c

Please sign in to comment.