From 3fba279007d0e8a68b731299facd913b85a410ad Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 1 Mar 2023 10:37:37 +0000 Subject: [PATCH] Add VCF/BCF support for POS=0 coordinate Review of `bcftools merge` code and possibly others is necessary because it is relying on uninitialized POS==-1 in synced_bcf_reader This should resolve #1571 --- hts.c | 10 ++++++---- htscodecs | 2 +- htslib/synced_bcf_reader.h | 2 ++ synced_bcf_reader.c | 31 ++++++++++++++++--------------- tbx.c | 8 ++++---- 5 files changed, 29 insertions(+), 24 deletions(-) diff --git a/hts.c b/hts.c index cead9d537..0f07fd072 100644 --- a/hts.c +++ b/hts.c @@ -2205,6 +2205,8 @@ static inline int insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t of { int i; hts_pos_t beg, end; + if ( _beg<0 ) _beg = 0; + if ( _end<=_beg ) _end = _beg+1; beg = _beg >> min_shift; end = (_end - 1) >> min_shift; if (l->m < end + 1) { @@ -2905,6 +2907,8 @@ static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shi { int l, t, s = min_shift + (n_lvls<<1) + n_lvls; if (beg >= end) return 0; + if (beg < 0 ) beg = 0; + if (beg==end) end = 1; if (end >= 1LL<= idx->n || (bidx = idx->bidx[tid]) == NULL) { iter->finished = 1; } else { - if (beg < 0) beg = 0; + if (beg < -1) beg = -1; if (end < beg) { free(iter); return NULL; @@ -3103,7 +3107,7 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t if ( !kh_size(bidx) ) { iter->finished = 1; return iter; } - rel_off = beg>>idx->min_shift; + rel_off = (beg>=0 ? beg : 0) >> idx->min_shift; // compute min_off bin = hts_bin_first(idx->n_lvls) + rel_off; do { @@ -3138,8 +3142,6 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t min_off = kh_val(bidx, k).loff; } } else if (unmapped) { //CSI index - if (k != kh_end(bidx)) - min_off = kh_val(bidx, k).loff; } // compute max_off: a virtual offset from a bin to the right of end diff --git a/htscodecs b/htscodecs index cd0737fff..3ef17f6fb 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit cd0737fff5893b0842b047da5aa3209e5f65442c +Subproject commit 3ef17f6fb5b8b6b0ad2d4c1c562165664f0703f8 diff --git a/htslib/synced_bcf_reader.h b/htslib/synced_bcf_reader.h index 78e9a0b4a..c8c31ac13 100644 --- a/htslib/synced_bcf_reader.h +++ b/htslib/synced_bcf_reader.h @@ -64,6 +64,8 @@ DEALINGS IN THE SOFTWARE. */ extern "C" { #endif +#define CSI_COOR_EMPTY -2 + /* When reading multiple files in parallel, duplicate records within each file will be reordered and offered in intuitive order. For example, diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index 23e0ecaef..1eaf31324 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -371,7 +371,7 @@ int bcf_sr_add_reader(bcf_srs_t *files, const char *fname) if ( !files->regions ) files->regions = _regions_init_string(names[i]); else - _regions_add(files->regions, names[i], -1, -1); + _regions_add(files->regions, names[i], CSI_COOR_EMPTY, CSI_COOR_EMPTY); } free(names); _regions_sort_and_merge(files->regions); @@ -555,7 +555,7 @@ static int _readers_next_region(bcf_srs_t *files) int prev_iseq = files->regions->iseq; hts_pos_t prev_end = files->regions->end; if ( bcf_sr_regions_next(files->regions)<0 ) return -1; - files->regions->prev_end = prev_iseq==files->regions->iseq ? prev_end : -1; + files->regions->prev_end = prev_iseq==files->regions->iseq ? prev_end : CSI_COOR_EMPTY; for (i=0; inreaders; i++) _reader_seek(&files->readers[i],files->regions->seq_names[files->regions->iseq],files->regions->start,files->regions->end); @@ -616,7 +616,7 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) { reader->buffer[reader->mbuffer-i] = bcf_init1(); reader->buffer[reader->mbuffer-i]->max_unpack = files->max_unpack; - reader->buffer[reader->mbuffer-i]->pos = -1; // for rare cases when VCF starts from 1 + reader->buffer[reader->mbuffer-i]->pos = CSI_COOR_EMPTY; // for rare cases when VCF starts from 1 or 0 } } if ( files->streaming ) @@ -656,7 +656,6 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) if ( ret < 0 ) break; // no more lines or an error bcf_subset_format(reader->header,reader->buffer[reader->nbuffer+1]); } - // Prevent creation of duplicates from records overlapping multiple regions // and recognize true variant overlaps vs record overlaps (e.g. TA>T vs A>-) if ( files->regions ) @@ -676,7 +675,6 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) hts_log_error("This should never happen, just to keep clang compiler happy: %d",BCF_SR_AUX(files)->targets_overlap); exit(1); } - if ( beg <= files->regions->prev_end || end < files->regions->start || beg > files->regions->end ) continue; } @@ -954,9 +952,9 @@ int bcf_sr_set_samples(bcf_srs_t *files, const char *fname, int is_file) // inclusive. Sorting and merging step needed afterwards: qsort(..,cmp_regions) and merge_regions(). static int _regions_add(bcf_sr_regions_t *reg, const char *chr, hts_pos_t start, hts_pos_t end) { - if ( start==-1 && end==-1 ) + if ( start==CSI_COOR_EMPTY && end==CSI_COOR_EMPTY ) { - start = 0; end = MAX_CSI_COOR-1; + start = -1; end = MAX_CSI_COOR-1; } else { @@ -1029,8 +1027,9 @@ void _regions_sort_and_merge(bcf_sr_regions_t *reg) static bcf_sr_regions_t *_regions_init_string(const char *str) { bcf_sr_regions_t *reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t)); - reg->start = reg->end = -1; - reg->prev_start = reg->prev_end = reg->prev_seq = -1; + reg->start = reg->end = CSI_COOR_EMPTY; + reg->prev_start = reg->prev_end = CSI_COOR_EMPTY; + reg->prev_seq = -1; kstring_t tmp = {0,0,0}; const char *sp = str, *ep = str; @@ -1075,7 +1074,7 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) } else { - if ( tmp.l ) _regions_add(reg, tmp.s, -1, -1); + if ( tmp.l ) _regions_add(reg, tmp.s, CSI_COOR_EMPTY, CSI_COOR_EMPTY); if ( !*ep ) break; sp = ++ep; } @@ -1157,8 +1156,9 @@ bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr } reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t)); - reg->start = reg->end = -1; - reg->prev_start = reg->prev_end = reg->prev_seq = -1; + reg->start = reg->end = CSI_COOR_EMPTY; + reg->prev_start = reg->prev_end = CSI_COOR_EMPTY; + reg->prev_seq = -1; reg->file = hts_open(regions, "rb"); if ( !reg->file ) @@ -1253,7 +1253,8 @@ void bcf_sr_regions_destroy(bcf_sr_regions_t *reg) int bcf_sr_regions_seek(bcf_sr_regions_t *reg, const char *seq) { - reg->iseq = reg->start = reg->end = -1; + reg->iseq = -1; + reg->start = reg->end = CSI_COOR_EMPTY; if ( khash_str2int_get(reg->seq_hash, seq, ®->iseq) < 0 ) return -1; // sequence seq not in regions // using in-memory regions @@ -1284,7 +1285,7 @@ static int advance_creg(region_t *reg) int bcf_sr_regions_next(bcf_sr_regions_t *reg) { if ( reg->iseq<0 ) return -1; - reg->start = reg->end = -1; + reg->start = reg->end = CSI_COOR_EMPTY; reg->nals = 0; // using in-memory regions @@ -1442,7 +1443,7 @@ static int _bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_p bcf_sr_regions_flush(reg); bcf_sr_regions_seek(reg, seq); - reg->start = reg->end = -1; + reg->start = reg->end = CSI_COOR_EMPTY; } if ( reg->prev_seq==iseq && reg->iseq!=iseq ) return -2; // no more regions on this chromosome reg->prev_seq = reg->iseq; diff --git a/tbx.c b/tbx.c index d897a21f1..4fd6ace65 100644 --- a/tbx.c +++ b/tbx.c @@ -107,12 +107,12 @@ int tbx_parse1(const tbx_conf_t *conf, size_t len, char *line, tbx_intv_t *intv) if ( s==line+b ) return -1; // expected int if (!(conf->preset&TBX_UCSC)) --intv->beg; else ++intv->end; - if (intv->beg < 0) { + if (intv->beg < -1) { hts_log_warning("Coordinate <= 0 detected. " "Did you forget to use the -0 option?"); intv->beg = 0; } - if (intv->end < 1) intv->end = 1; + if (intv->end < intv->beg ) intv->end = intv->beg; } else { if ((conf->preset&0xffff) == TBX_GENERIC) { if (id == conf->ec) @@ -171,7 +171,7 @@ int tbx_parse1(const tbx_conf_t *conf, size_t len, char *line, tbx_intv_t *intv) ++id; } } - if (intv->ss == 0 || intv->se == 0 || intv->beg < 0 || intv->end < 0) return -1; + if (intv->ss == 0 || intv->se == 0 || intv->beg < -1 || intv->end < -1) return -1; return 0; } @@ -181,7 +181,7 @@ static inline int get_intv(tbx_t *tbx, kstring_t *str, tbx_intv_t *intv, int is_ int c = *intv->se; *intv->se = '\0'; intv->tid = get_tid(tbx, intv->ss, is_add); *intv->se = c; if (intv->tid < 0) return -2; // get_tid out of memory - return (intv->beg >= 0 && intv->end >= 0)? 0 : -1; + return (intv->beg >= -1 && intv->end >= -1)? 0 : -1; } else { char *type = NULL; switch (tbx->conf.preset&0xffff)