Skip to content

Commit

Permalink
The -s, --select option was extended to print only one consequence.
Browse files Browse the repository at this point in the history
Previously it was possible to select a single transcript (e.g., the
one with the worst consequence), and it was possible to filter by
consequence severity (e.g., missing or worse), but in some cases
multiple consequences are reported within a single transcript
(e.g., start_lost&splice_region). The extended option allows to
print the worst part, for example as

        --select primary:missense+:worst
        --select ::worst
  • Loading branch information
pd3 committed Jan 7, 2025
1 parent 707336b commit 0d63570
Show file tree
Hide file tree
Showing 8 changed files with 1,063 additions and 15 deletions.
11 changes: 11 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,17 @@ Changes affecting specific commands:

- Support for setting missing genotypes with arbitrary ploidy via `-n c:./.` (#2303)

* bcftools +split-vep

- The `-s, --select` option was extended to print only one consequence. Previously it
was possible to select a single transcript (e.g., the one with the worst consequence),
and it was possible to filter by consequence severity (e.g., missing or worse),
but in some cases multiple consequences are reported within a single transcript
(e.g., start_lost&splice_region). The extended option allows to print the worst
part, for example as

--select primary:missense+:worst

* bcftools +trio-dnm2

- Fix a problem with --strictly-novel option which would neglect the presence of the apparent de novo
Expand Down
80 changes: 65 additions & 15 deletions plugins/split-vep.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* The MIT License
Copyright (c) 2019-2024 Genome Research Ltd.
Copyright (c) 2019-2025 Genome Research Ltd.
Author: Petr Danecek <[email protected]>
Expand Down Expand Up @@ -53,6 +53,8 @@
#define SELECT_TR_WORST 1
#define SELECT_TR_EXPR 2
#define SELECT_CSQ_ANY -1
#define PRN_CSQ_ALL 0
#define PRN_CSQ_WORST 1

#define TR_OP_EQ 0 // =
#define TR_OP_NE 1 // !=
Expand Down Expand Up @@ -132,6 +134,7 @@ typedef struct
int ncols_csq,mcols_csq; // the number of cols_csq transcripts and the high-water mark
int min_severity, max_severity; // ignore consequences outside this severity range
int drop_sites; // the -x, --drop-sites option
int prn_csq; // one of PRN_CSQ_*
int select_tr; // one of SELECT_TR_*
select_tr_t tr_expr; // with SELECT_TR_EXPR, see --select <FIELD><OP><VALUE>
uint8_t *smpl_pass; // for filtering at sample level, used with -f
Expand Down Expand Up @@ -237,17 +240,26 @@ static const char *usage_text(void)
" -H, --print-header Print header, -HH to omit column indices\n"
" -l, --list Parse the VCF header and list the annotation fields\n"
" -p, --annot-prefix STR Before doing anything else, prepend STR to all CSQ fields to avoid tag name conflicts\n"
" -s, --select TR:CSQ Select transcripts to extract by type and/or consequence severity. (See also -S and -x.)\n"
" TR, transcript: all,worst,primary,pick,mane,EXPRESSION [all]\n"
" CSQ, consequence: any,missense,missense+,etc [any]\n"
" Where the transcript EXPRESSION is of the form <FIELD><OPERATOR><VALUE>\n"
" FIELD: field name (e.g. \"CANONICAL\")\n"
" OPERATOR: string comparison (=,!=), regex matching (~,!~)\n"
" VALUE: required string value (e.g. \"YES\")\n"
" The TR presets are defined as follows\n"
" primary: CANONICAL=YES\n"
" pick: PICK=1\n"
" mane: MANE_SELECT!=\"\"\n"
" -s, --select TR:CSQ[:PRN] Select transcripts to extract by type and/or consequence severity, see also -S and -x\n"
" TR, filter transcripts: all,worst,primary,pick,mane,EXPRESSION [all]\n"
" CSQ, filter consequences: any,missense,missense+,etc [any]\n"
" PRN, print consequences: all,worst [all]\n"
" TR transcript selection\n"
" all .. list all transcripts\n"
" worst .. list only one transcript with the worst consequence (see -S)\n"
" primary .. list transcripts marked as CANONICAL=YES\n"
" pick .. as PICK=1\n"
" mane .. as MANE_SELECT!=\"\"\n"
" or an EXPRESSION in the form of \"<FIELD><OPERATOR><VALUE>\", where\n"
" FIELD .. field name (e.g. \"CANONICAL\")\n"
" OPERATOR .. string comparison (=,!=), regex matching (~,!~)\n"
" VALUE .. required string value (e.g. \"YES\")\n"
" CSQ consequence filtering, selects transcripts by CSQ severity (see \"-S -\")\n"
" missense .. selects only transcripts with a missense variant\n"
" missense+ .. transcripts with a missense consequence or more severe\n"
" PRN controls what consequences are actually printed\n"
" all .. print all consequences if multiple per transcript (e.g., start_lost&splice_region)\n"
" worst .. print the worst consequence per transcript (e.g., start_lost above)\n"
" -S, --severity -|FILE Pass \"-\" to print the default severity scale or FILE to override\n"
" the default scale\n"
" -u, --allow-undef-tags Print \".\" for undefined tags\n"
Expand Down Expand Up @@ -959,7 +971,8 @@ static void init_data(args_t *args)
if ( !args->select ) args->select = "all:any";
cols_t *cols = cols_split(args->select, NULL, ':');
char *sel_tr = cols->off[0][0] ? cols->off[0] : "all";
char *sel_csq = cols->n==2 && cols->off[1][0] ? cols->off[1] : "any";
char *sel_csq = cols->n > 1 && cols->off[1][0] ? cols->off[1] : "any";
char *prn_csq = cols->n > 2 && cols->off[2][0] ? cols->off[2] : "all";

// ... transcript selection
if ( !strcasecmp(sel_tr,"all") ) args->select_tr = SELECT_TR_ALL;
Expand All @@ -969,7 +982,7 @@ static void init_data(args_t *args)
else if ( !strcasecmp(sel_tr,"mane") ) init_select_tr_expr(args,"MANE_SELECT!=\"\"");
else init_select_tr_expr(args,sel_tr);

// ... consequence selection
// ... transcript selection by consequence severity
if ( !strcasecmp(sel_csq,"any") ) { args->min_severity = args->max_severity = SELECT_CSQ_ANY; } // to avoid unnecessary lookups
else
{
Expand All @@ -984,6 +997,11 @@ static void init_data(args_t *args)
else if ( modifier=='+' ) { args->min_severity = severity; args->max_severity = INT_MAX; }
else if ( modifier=='-' ) { args->min_severity = 0; args->max_severity = severity; }
}

// .. consequence printout - everything or only one (worst)?
if ( !strcasecmp(prn_csq,"all") ) args->prn_csq = PRN_CSQ_ALL;
else if ( !strcasecmp(prn_csq,"worst") ) args->prn_csq = PRN_CSQ_WORST;
else error("Error: could not parse \"%s\" in the expression \"%s\"\n",prn_csq,args->select);
cols_destroy(cols);

// The "Consequence" column to determine severity for filtering. The name of this column is hardwired for now, both VEP and bt/csq use the same name
Expand Down Expand Up @@ -1348,6 +1366,32 @@ static void restrict_csqs_to_genes(args_t *args)
args->ncols_csq = nhit;
}

// Beware: edits the string, writes \0 character. For example, overwrites the '&' in start_lost&splice_region
char *csq_rewrite_worst(args_t *args, char *str)
{
cols_t *tmp = cols_split(str,NULL,'&');
char *ret = str;
if ( tmp->n > 1 )
{
// find the consequence with max severity
int i, imax = 0, smax = -1;
for (i=0; i<tmp->n; i++)
{
int severity = -1;
khash_str2int_get(args->csq2severity, tmp->off[i], &severity);
if ( smax < severity ) smax = severity, imax = i;
}

// position return string to the most severe csq
ret = str + (tmp->off[imax] - tmp->off[0]);

// if max is not the last one, need to write the null termination byte
if ( imax+1 < tmp->n ) str[tmp->off[imax+1] - tmp->off[0] - 1] = 0;
}
cols_destroy(tmp);
return ret;
}

// Split the VEP annotation by transcript and by field, then check if the number of subfields looks alright.
// Unfortunately, we cannot enforce the number of subfields to match the header definition because that can
// be variable: `bcftools csq` outputs different number of fields for different consequence types.
Expand Down Expand Up @@ -1465,7 +1509,13 @@ static void process_record(args_t *args, bcf1_t *rec)

char *ann_str = NULL;
if ( ann->idx==-1 ) ann_str = args->cols_tr->off[i];
else if ( *cols_csq->off[ann->idx] ) ann_str = cols_csq->off[ann->idx];
else if ( *cols_csq->off[ann->idx] )
{
if ( ann->idx==args->csq_idx && args->prn_csq==PRN_CSQ_WORST )
ann_str = csq_rewrite_worst(args, cols_csq->off[ann->idx]);
else
ann_str = cols_csq->off[ann->idx];
}
if ( ann_str )
{
annot_append(ann, ann_str);
Expand Down
Loading

0 comments on commit 0d63570

Please sign in to comment.