-
Notifications
You must be signed in to change notification settings - Fork 242
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
The
-s, --select
option was extended to print only one consequence.
Previously it was possible to select a single transcript (e.g., the one with the worst consequence), and it was possible to filter by consequence severity (e.g., missing or worse), but in some cases multiple consequences are reported within a single transcript (e.g., start_lost&splice_region). The extended option allows to print the worst part, for example as --select primary:missense+:worst --select ::worst
- Loading branch information
Showing
8 changed files
with
1,063 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
/* The MIT License | ||
Copyright (c) 2019-2024 Genome Research Ltd. | ||
Copyright (c) 2019-2025 Genome Research Ltd. | ||
Author: Petr Danecek <[email protected]> | ||
|
@@ -53,6 +53,8 @@ | |
#define SELECT_TR_WORST 1 | ||
#define SELECT_TR_EXPR 2 | ||
#define SELECT_CSQ_ANY -1 | ||
#define PRN_CSQ_ALL 0 | ||
#define PRN_CSQ_WORST 1 | ||
|
||
#define TR_OP_EQ 0 // = | ||
#define TR_OP_NE 1 // != | ||
|
@@ -132,6 +134,7 @@ typedef struct | |
int ncols_csq,mcols_csq; // the number of cols_csq transcripts and the high-water mark | ||
int min_severity, max_severity; // ignore consequences outside this severity range | ||
int drop_sites; // the -x, --drop-sites option | ||
int prn_csq; // one of PRN_CSQ_* | ||
int select_tr; // one of SELECT_TR_* | ||
select_tr_t tr_expr; // with SELECT_TR_EXPR, see --select <FIELD><OP><VALUE> | ||
uint8_t *smpl_pass; // for filtering at sample level, used with -f | ||
|
@@ -237,17 +240,26 @@ static const char *usage_text(void) | |
" -H, --print-header Print header, -HH to omit column indices\n" | ||
" -l, --list Parse the VCF header and list the annotation fields\n" | ||
" -p, --annot-prefix STR Before doing anything else, prepend STR to all CSQ fields to avoid tag name conflicts\n" | ||
" -s, --select TR:CSQ Select transcripts to extract by type and/or consequence severity. (See also -S and -x.)\n" | ||
" TR, transcript: all,worst,primary,pick,mane,EXPRESSION [all]\n" | ||
" CSQ, consequence: any,missense,missense+,etc [any]\n" | ||
" Where the transcript EXPRESSION is of the form <FIELD><OPERATOR><VALUE>\n" | ||
" FIELD: field name (e.g. \"CANONICAL\")\n" | ||
" OPERATOR: string comparison (=,!=), regex matching (~,!~)\n" | ||
" VALUE: required string value (e.g. \"YES\")\n" | ||
" The TR presets are defined as follows\n" | ||
" primary: CANONICAL=YES\n" | ||
" pick: PICK=1\n" | ||
" mane: MANE_SELECT!=\"\"\n" | ||
" -s, --select TR:CSQ[:PRN] Select transcripts to extract by type and/or consequence severity, see also -S and -x\n" | ||
" TR, filter transcripts: all,worst,primary,pick,mane,EXPRESSION [all]\n" | ||
" CSQ, filter consequences: any,missense,missense+,etc [any]\n" | ||
" PRN, print consequences: all,worst [all]\n" | ||
" TR transcript selection\n" | ||
" all .. list all transcripts\n" | ||
" worst .. list only one transcript with the worst consequence (see -S)\n" | ||
" primary .. list transcripts marked as CANONICAL=YES\n" | ||
" pick .. as PICK=1\n" | ||
" mane .. as MANE_SELECT!=\"\"\n" | ||
" or an EXPRESSION in the form of \"<FIELD><OPERATOR><VALUE>\", where\n" | ||
" FIELD .. field name (e.g. \"CANONICAL\")\n" | ||
" OPERATOR .. string comparison (=,!=), regex matching (~,!~)\n" | ||
" VALUE .. required string value (e.g. \"YES\")\n" | ||
" CSQ consequence filtering, selects transcripts by CSQ severity (see \"-S -\")\n" | ||
" missense .. selects only transcripts with a missense variant\n" | ||
" missense+ .. transcripts with a missense consequence or more severe\n" | ||
" PRN controls what consequences are actually printed\n" | ||
" all .. print all consequences if multiple per transcript (e.g., start_lost&splice_region)\n" | ||
" worst .. print the worst consequence per transcript (e.g., start_lost above)\n" | ||
" -S, --severity -|FILE Pass \"-\" to print the default severity scale or FILE to override\n" | ||
" the default scale\n" | ||
" -u, --allow-undef-tags Print \".\" for undefined tags\n" | ||
|
@@ -959,7 +971,8 @@ static void init_data(args_t *args) | |
if ( !args->select ) args->select = "all:any"; | ||
cols_t *cols = cols_split(args->select, NULL, ':'); | ||
char *sel_tr = cols->off[0][0] ? cols->off[0] : "all"; | ||
char *sel_csq = cols->n==2 && cols->off[1][0] ? cols->off[1] : "any"; | ||
char *sel_csq = cols->n > 1 && cols->off[1][0] ? cols->off[1] : "any"; | ||
char *prn_csq = cols->n > 2 && cols->off[2][0] ? cols->off[2] : "all"; | ||
|
||
// ... transcript selection | ||
if ( !strcasecmp(sel_tr,"all") ) args->select_tr = SELECT_TR_ALL; | ||
|
@@ -969,7 +982,7 @@ static void init_data(args_t *args) | |
else if ( !strcasecmp(sel_tr,"mane") ) init_select_tr_expr(args,"MANE_SELECT!=\"\""); | ||
else init_select_tr_expr(args,sel_tr); | ||
|
||
// ... consequence selection | ||
// ... transcript selection by consequence severity | ||
if ( !strcasecmp(sel_csq,"any") ) { args->min_severity = args->max_severity = SELECT_CSQ_ANY; } // to avoid unnecessary lookups | ||
else | ||
{ | ||
|
@@ -984,6 +997,11 @@ static void init_data(args_t *args) | |
else if ( modifier=='+' ) { args->min_severity = severity; args->max_severity = INT_MAX; } | ||
else if ( modifier=='-' ) { args->min_severity = 0; args->max_severity = severity; } | ||
} | ||
|
||
// .. consequence printout - everything or only one (worst)? | ||
if ( !strcasecmp(prn_csq,"all") ) args->prn_csq = PRN_CSQ_ALL; | ||
else if ( !strcasecmp(prn_csq,"worst") ) args->prn_csq = PRN_CSQ_WORST; | ||
else error("Error: could not parse \"%s\" in the expression \"%s\"\n",prn_csq,args->select); | ||
cols_destroy(cols); | ||
|
||
// The "Consequence" column to determine severity for filtering. The name of this column is hardwired for now, both VEP and bt/csq use the same name | ||
|
@@ -1348,6 +1366,32 @@ static void restrict_csqs_to_genes(args_t *args) | |
args->ncols_csq = nhit; | ||
} | ||
|
||
// Beware: edits the string, writes \0 character. For example, overwrites the '&' in start_lost&splice_region | ||
char *csq_rewrite_worst(args_t *args, char *str) | ||
{ | ||
cols_t *tmp = cols_split(str,NULL,'&'); | ||
char *ret = str; | ||
if ( tmp->n > 1 ) | ||
{ | ||
// find the consequence with max severity | ||
int i, imax = 0, smax = -1; | ||
for (i=0; i<tmp->n; i++) | ||
{ | ||
int severity = -1; | ||
khash_str2int_get(args->csq2severity, tmp->off[i], &severity); | ||
if ( smax < severity ) smax = severity, imax = i; | ||
} | ||
|
||
// position return string to the most severe csq | ||
ret = str + (tmp->off[imax] - tmp->off[0]); | ||
|
||
// if max is not the last one, need to write the null termination byte | ||
if ( imax+1 < tmp->n ) str[tmp->off[imax+1] - tmp->off[0] - 1] = 0; | ||
} | ||
cols_destroy(tmp); | ||
return ret; | ||
} | ||
|
||
// Split the VEP annotation by transcript and by field, then check if the number of subfields looks alright. | ||
// Unfortunately, we cannot enforce the number of subfields to match the header definition because that can | ||
// be variable: `bcftools csq` outputs different number of fields for different consequence types. | ||
|
@@ -1465,7 +1509,13 @@ static void process_record(args_t *args, bcf1_t *rec) | |
|
||
char *ann_str = NULL; | ||
if ( ann->idx==-1 ) ann_str = args->cols_tr->off[i]; | ||
else if ( *cols_csq->off[ann->idx] ) ann_str = cols_csq->off[ann->idx]; | ||
else if ( *cols_csq->off[ann->idx] ) | ||
{ | ||
if ( ann->idx==args->csq_idx && args->prn_csq==PRN_CSQ_WORST ) | ||
ann_str = csq_rewrite_worst(args, cols_csq->off[ann->idx]); | ||
else | ||
ann_str = cols_csq->off[ann->idx]; | ||
} | ||
if ( ann_str ) | ||
{ | ||
annot_append(ann, ann_str); | ||
|
Oops, something went wrong.