Skip to content

Commit

Permalink
review updates
Browse files Browse the repository at this point in the history
  • Loading branch information
vasudeva8 committed Jan 14, 2025
1 parent 6a0fb00 commit 67b2734
Show file tree
Hide file tree
Showing 4 changed files with 362 additions and 141 deletions.
53 changes: 52 additions & 1 deletion htslib/kstring.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/* The MIT License
Copyright (C) 2011 by Attractive Chaos <[email protected]>
Copyright (C) 2013-2014, 2016, 2018-2020, 2022, 2024 Genome Research Ltd.
Copyright (C) 2013-2014, 2016, 2018-2020, 2022, 2024-2025 Genome Research Ltd.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
Expand Down Expand Up @@ -449,6 +449,57 @@ static inline int *ksplit(kstring_t *s, int delimiter, int *n)
return offsets;
}

/**
* kinsert_char - inserts a char to kstring
* @param c - char to insert
* @param pos - position at which to insert, starting from 0
* @param s - pointer to output string
* Returns 0 on success and -1 on failure
* 0 for pos inserts at start and length of current string as pos appends at
* the end.
*/
static inline int kinsert_char(char c, size_t pos, kstring_t *s)
{
if (!s || pos > s->l) {
return EOF;
}
if (ks_resize(s, s->l + 2) < 0) {
return EOF;
}
memmove(s->s + pos + 1, s->s + pos, s->l - pos);
s->s[pos] = c;
s->s[++s->l] = 0;
return 0;
}

/**
* kinsert_str - inserts a null terminated string to kstring
* @param str - string to insert
* @param pos - position at which to insert, starting from 0
* @param s - pointer to output string
* Returns 0 on success and -1 on failure
* 0 for pos inserts at start and length of current string as pos appends at
* the end. empty string makes no update.
*/
static inline int kinsert_str(const char *str, size_t pos, kstring_t *s)
{
size_t len = 0;
if (!s || pos > s->l || !str) {
return EOF;
}
if (!(len = strlen(str))) {
return 0;
}
if (ks_resize(s, s->l + len + 1) < 0) {
return EOF;
}
memmove(s->s + pos + len, s->s + pos, s->l - pos);
memcpy(s->s + pos, str, len);
s->l += len;
s->s[s->l] = '\0';
return 0;
}

#ifdef HTSLIB_SSIZE_T
#undef HTSLIB_SSIZE_T
#undef ssize_t
Expand Down
138 changes: 11 additions & 127 deletions htslib/vcf.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
/// High-level VCF/BCF variant calling file operations.
/*
Copyright (C) 2012, 2013 Broad Institute.
Copyright (C) 2012-2020, 2022-2023 Genome Research Ltd.
Copyright (C) 2012-2020, 2022-2025 Genome Research Ltd.
Author: Heng Li <[email protected]>
Expand Down Expand Up @@ -1501,141 +1501,25 @@ static inline int bcf_float_is_vector_end(float f)
return u.i==bcf_float_vector_end ? 1 : 0;
}

typedef enum bcf_version {v41 = 1, v42, v43, v44} bcf_version;
/**
* bcf_get_version - get the version as bcf_version enumeration
* @param hdr - bcf header, to get version
* @param ipver - pointer to return version
* Returns 0 on success and -1 on failure
*/
static inline int bcf_get_version(const bcf_hdr_t *hdr, bcf_version *ver)
{
const char *version = NULL;

if (!hdr || !ver) {
return -1;
}

version = bcf_hdr_get_version(hdr);
if (!strcmp("VCFv4.1", version)) {
*ver = v41;
} else if (!strcmp("VCFv4.2", version)) {
*ver = v42;
} else if (!strcmp("VCFv4.3", version)) {
*ver = v43;
} else {
*ver = v44;
}
return 0;
}

static inline int bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str)
{
uint32_t e = 0;
#define BRANCH(type_t, convert, missing, vector_end) { \
uint8_t *ptr = fmt->p + isample*fmt->size; \
int i; \
for (i=0; i<fmt->n; i++, ptr += sizeof(type_t)) \
{ \
type_t val = convert(ptr); \
if ( val == vector_end ) break; \
if ( i ) e |= kputc("/|"[val&1], str) < 0; \
if ( !(val>>1) ) e |= kputc('.', str) < 0; \
else e |= kputw((val>>1) - 1, str) < 0; \
} \
if (i == 0) e |= kputc('.', str) < 0; \
}
switch (fmt->type) {
case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, bcf_int8_missing, bcf_int8_vector_end); break;
case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_missing, bcf_int16_vector_end); break;
case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_missing, bcf_int32_vector_end); break;
case BCF_BT_NULL: e |= kputc('.', str) < 0; break;
default: hts_log_error("Unexpected type %d", fmt->type); return -2;
}
#undef BRANCH
return e == 0 ? 0 : -1;
}

/**
* bcf_format_gt1 - formats GT information on a string
* bcf_format_gt_v2 - formats GT information on a string
* @param hdr - bcf header, to get version
* @param fmt - pointer to bcf format data
* @param isample - position of interested sample in data
* @param str - pointer to output string
* Returns 0 on success and -1 on failure
* This method is extended from bcf_format_gt to output phasing information
* in accordance with v4.4 format, which supports explicit / prefixed phasing
* for 1st allele.
* Explicit / prefixed phasing for 1st allele is used only when it is a must to
* correctly express phasing.
* This method is preferred over bcf_format_gt as this supports vcf4.4 and
* prefixed phasing. Explicit / prefixed phasing for 1st allele is used only
* when it is a must to correctly express phasing.
*/
static inline int bcf_format_gt1(const bcf_hdr_t *hdr, bcf_fmt_t *fmt, int isample, kstring_t *str)
HTSLIB_EXPORT
int bcf_format_gt_v2(const bcf_hdr_t *hdr, bcf_fmt_t *fmt, int isample,
kstring_t *str) HTS_RESULT_USED;

static inline int bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str)
{
uint32_t e = 0;
bcf_version ver = v42;
int ploidy = 1, anyunphased = 0;
int32_t val0 = 0;
kstring_t tmp1 = KS_INITIALIZE, tmp2 = KS_INITIALIZE;

if (bcf_get_version(hdr, &ver)) {
hts_log_error("Failed to get version information");
return -1;
}
#define BRANCH(type_t, convert, missing, vector_end) { \
uint8_t *ptr = fmt->p + isample*fmt->size; \
int i; \
for (i=0; i<fmt->n; i++, ptr += sizeof(type_t)) \
{ \
type_t val = convert(ptr); \
if ( val == vector_end ) break; \
if (!i) { val0 = val; } \
if (i) { \
e |= kputc("/|"[val & 1], &tmp1) < 0; \
anyunphased |= !(val & 1); \
} \
if (!(val >> 1)) e |= kputc('.', &tmp1) < 0; \
else e |= kputw((val >> 1) - 1, &tmp1) < 0; \
} \
if (i == 0) e |= kputc('.', &tmp1) < 0; \
ploidy = i; \
}
switch (fmt->type) {
case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, bcf_int8_missing, bcf_int8_vector_end); break;
case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_missing, bcf_int16_vector_end); break;
case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_missing, bcf_int32_vector_end); break;
case BCF_BT_NULL: e |= kputc('.', &tmp1) < 0; break;
default: hts_log_error("Unexpected type %d", fmt->type); return -2;
}
#undef BRANCH

if (ver >= v44) { //output which supports prefixed phasing
/* update 1st allele's phasing if required and append rest to it.
use prefixed phasing only when it is a must. i.e. without which the
inferred value will be incorrect */
if (val0 & 1) {
/* 1st one is phased, if ploidy is > 1 and an unphased allele exists
need to specify explicitly */
e |= (ploidy > 1 && anyunphased) ?
(kputc('|', &tmp2) < 0) :
(ploidy <= 1 && !((val0 >> 1)) ? //|. needs explicit o/p
(kputc('|', &tmp2) < 0) :
0);
} else {
/* 1st allele is unphased, if ploidy is = 1 or allele is '.' or
ploidy > 1 and no other unphased allele exist, need to specify
explicitly */
e |= ((ploidy <= 1 && val0 != 0) || (ploidy > 1 && !anyunphased)) ?
(kputc('/', &tmp2) < 0) :
0;
}
e |= kputsn(tmp1.s, tmp1.l, &tmp2) < 0; //append rest with updated one
ks_free(&tmp1);
tmp1 = tmp2;
}
//updated v44 string or <v44 without any update
e |= kputsn(tmp1.s, tmp1.l, str) < 0;
ks_free(&tmp1);
return e == 0 ? 0 : -1;
return bcf_format_gt_v2(NULL, fmt, isample, str);
}

static inline int bcf_enc_size(kstring_t *s, int size, int type)
Expand Down
136 changes: 135 additions & 1 deletion test/test_kstring.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* test_kstring.c -- kstring unit tests
Copyright (C) 2018, 2020, 2024 Genome Research Ltd.
Copyright (C) 2018, 2020, 2024-2025 Genome Research Ltd.
Author: Rob Davies <[email protected]>
Expand Down Expand Up @@ -451,6 +451,134 @@ static int test_kgetline2(void) {
return EXIT_SUCCESS;
}

static int test_kinsertchar(void) {
kstring_t t = KS_INITIALIZE, res = KS_INITIALIZE;
int i = 0;
struct data {
int pos;
const char *val;
};

struct data tdata[] = { { -1, ""}, {0, "X0123"}, {1, "0X123"}, {2, "01X23"},
{3, "012X3"}, {4, "0123X"}, {5, ""} };

for (i = -1; i < 6; ++i) {
kstring_t s = KS_INITIALIZE;
kputs("0123", &s);
if (kinsert_char('X', i, &s) < 0) {
if ( i < 0 || i > 4) { ks_free(&s); continue; } //expected failures
fprintf(stderr, "kinsert_char failed\n");
ks_free(&s);
return -1;
}
if (s.s[s.l] != '\0') {
fprintf(stderr, "No NUL termination on string from kinsert_char\n");
ks_free(&s);
return -1;
}
if (memcmp(s.s, tdata[i + 1].val, s.l + 1)) {
fprintf(stderr, "kinsert_char comparison failed\n");
ks_free(&s);
return -1;
}
ks_free(&s);
}
//realloc checks
for (i = 0; i < 7; ++i) {
kputc('A' + i, &res);
if (kinsert_char('A' + i, t.l, &t) < 0) {
fprintf(stderr, "kinsert_char failed in realloc\n");
ks_free(&res); ks_free(&t);
return -1;
}
if (t.s[t.l] != '\0') {
fprintf(stderr, "No NUL termination on string from kinsert_char in realloc\n");
ks_free(&res); ks_free(&t);
return -1;
}
if (memcmp(t.s, res.s, res.l+1)) {
fprintf(stderr, "kinsert_char realloc comparison failed in realloc\n");
ks_free(&res); ks_free(&t);
return -1;
}
}
ks_free(&t);
ks_free(&res);
return 0;
}

static int test_kinsertstr(void) {
kstring_t t = KS_INITIALIZE, res = KS_INITIALIZE;
int i = 0;
struct data {
int pos;
const char *val;
};

struct data tdata[] = { { -1, ""}, {0, "XYZ0123"}, {1, "0XYZ123"},
{2, "01XYZ23"}, {3, "012XYZ3"}, {4, "0123XYZ"}, {5, ""} };

for (i = -1; i < 6; ++i) {
kstring_t s = KS_INITIALIZE;
kputs("0123", &s);
if (kinsert_str("XYZ", i, &s) < 0) {
if ( i < 0 || i > 4) { ks_free(&s); continue; } //expected failures
fprintf(stderr, "kinsert_str failed\n");
return -1;
}
if (s.s[s.l] != '\0') {
fprintf(stderr, "No NUL termination on string from kinsert_str\n");
return -1;
}
if (memcmp(s.s, tdata[i + 1].val, s.l + 1)) {
fprintf(stderr, "kinsert_str comparison failed\n");
return -1;
}
ks_free(&s);
}
//realloc checks
for (i = 0; i < 15; ++i) {
kstring_t val = KS_INITIALIZE;
ksprintf(&val, "%c", 'A' + i);
kputs(val.s, &res);
if (kinsert_str(val.s, t.l, &t) < 0) {
ks_free(&val);
fprintf(stderr, "kinsert_str failed in realloc\n");
return -1;
}
if (t.s[t.l] != '\0') {
ks_free(&val); ks_free(&res);
fprintf(stderr, "No NUL termination on string from kinsert_str in realloc\n");
return -1;
}
if (memcmp(t.s, res.s, res.l+1)) {
ks_free(&val); ks_free(&res);
fprintf(stderr, "kinsert_str realloc comparison failed in realloc\n");
return -1;
}
ks_free(&val);
}
//empty strings
ks_free(&t);
if (kinsert_str("", 1, &t)) { //expected
if (kinsert_str("", 0, &t) || t.l != 0) {
fprintf(stderr, "kinsert_str empty insertion failed\n");
return -1;
}
} else {
fprintf(stderr, "kinsert_str empty ins to invalid pos succeeded\n");
return -1;
}
i = res.l;
if (kinsert_str("", 1, &res) || i != res.l) {
fprintf(stderr, "kinsert_str empty ins to valid pos failed\n");
ks_free(&res);
return -1;
}
ks_free(&res);
return 0;
}

int main(int argc, char **argv) {
int opt, res = EXIT_SUCCESS;
int64_t start = 0;
Expand Down Expand Up @@ -500,5 +628,11 @@ int main(int argc, char **argv) {
if (!test || strcmp(test, "kgetline2") == 0)
if (test_kgetline2() != 0) res = EXIT_FAILURE;

if (!test || strcmp(test, "kinsertchar") == 0)
if (test_kinsertchar() != 0) res = EXIT_FAILURE;

if (!test || strcmp(test, "kinsertstr") == 0)
if (test_kinsertstr() != 0) res = EXIT_FAILURE;

return res;
}
Loading

0 comments on commit 67b2734

Please sign in to comment.