From 2fe56797a6929402802fa57d8528d447711f1add Mon Sep 17 00:00:00 2001 From: Robert Sheridan Date: Wed, 1 Apr 2020 09:23:23 -0400 Subject: [PATCH] add command line option --inhibit-vep - update docs and README --- README.md | 2 ++ docs/vep_maf_readme.txt | 4 ++-- vcf2maf.pl | 32 +++++++++++++++++++++++--------- 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 8d7174f..73c521c 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,8 @@ If you have the VEP script in a different folder like `/opt/vep`, and its cache perl vcf2maf.pl --input-vcf tests/test.vcf --output-maf tests/test.vep.maf --vep-path /opt/vep --vep-data /srv/vep +You can also omit the running of VEP by including the option --inhibit-vep + maf2maf ------- diff --git a/docs/vep_maf_readme.txt b/docs/vep_maf_readme.txt index c2180e2..c63f53c 100644 --- a/docs/vep_maf_readme.txt +++ b/docs/vep_maf_readme.txt @@ -1,8 +1,8 @@ This file describes the columns of MAF files generated by maf2maf or vcf2maf, and how to interpret them, especially in the context of cancer genetics. -The first 34 columns are NCI's Mutation Annotation Format (MAF), described here: -https://docs.gdc.cancer.gov/Data/File_Formats/MAF_Format/ +The first 34 columns are NCI's standard TCGA MAF format, and described here: +https://wiki.nci.nih.gov/x/eJaPAQ The subsequent 10 columns are relevant to most analyses. diff --git a/vcf2maf.pl b/vcf2maf.pl index 810d7aa..c38d87a 100644 --- a/vcf2maf.pl +++ b/vcf2maf.pl @@ -13,6 +13,7 @@ # Set any default paths and constants my ( $tumor_id, $normal_id ) = ( "TUMOR", "NORMAL" ); +my $inhibit_vep = 0; my ( $vep_path, $vep_data, $vep_forks, $buffer_size, $any_allele, $online ) = ( "$ENV{HOME}/vep", "$ENV{HOME}/.vep", 4, 5000, 0, 0 ); my ( $ref_fasta, $filter_vcf ) = ( "$ENV{HOME}/.vep/homo_sapiens/95_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz", "$ENV{HOME}/.vep/ExAC_nonTCGA.r0.3.1.sites.vep.vcf.gz" ); my ( $species, $ncbi_build, $cache_version, $maf_center, $retain_info, $retain_fmt, $min_hom_vaf, $max_filter_ac ) = ( "homo_sapiens", "GRCh37", "", ".", "", "", 0.7, 10 ); @@ -202,6 +203,7 @@ sub GetBiotypePriority { 'vcf-tumor-id=s' => \$vcf_tumor_id, 'vcf-normal-id=s' => \$vcf_normal_id, 'custom-enst=s' => \$custom_enst_file, + 'inhibit-vep!' => \$inhibit_vep, 'vep-path=s' => \$vep_path, 'vep-data=s' => \$vep_data, 'vep-forks=s' => \$vep_forks, @@ -427,8 +429,10 @@ sub GetBiotypePriority { # Annotate variants in given VCF to all possible transcripts my $output_vcf = ( $remap_chain ? "$tmp_dir/$input_name.remap.vep.vcf" : "$tmp_dir/$input_name.vep.vcf" ); + # Skip running VEP if an annotated VCF already exists -unless( -s $output_vcf ) { +unless( not $inhibit_vep && -s $output_vcf ) { + warn "STATUS: Running VEP and writing to: $output_vcf\n"; # Make sure we can find the VEP script my $vep_script = ( -s "$vep_path/vep" ? "$vep_path/vep" : "$vep_path/variant_effect_predictor.pl" ); @@ -528,8 +532,8 @@ sub GetBiotypePriority { # one transcript per variant whose annotation will be used in the MAF my $maf_fh = IO::File->new( $output_maf, ">" ) or die "ERROR: Couldn't open --output-maf: $output_maf!\n"; $maf_fh->print( "#version 2.4\n" . join( "\t", @maf_header ), "\n" ); # Print MAF header -( -s $output_vcf ) or exit; # Warnings on this were printed earlier, but quit here, only after a blank MAF is created -my $annotated_vcf_fh = IO::File->new( $output_vcf ) or die "ERROR: Couldn't open annotated VCF: $output_vcf!\n"; +( -s $input_vcf ) or exit; # Warnings on this were printed earlier, but quit here, only after a blank MAF is created +my $annotated_vcf_fh = IO::File->new( $input_vcf ) or die "ERROR: Couldn't open un-annotated VCF: $input_vcf!\n"; my ( $vcf_tumor_idx, $vcf_normal_idx, %sv_pair ); while( my $line = $annotated_vcf_fh->getline ) { @@ -776,7 +780,11 @@ sub GetBiotypePriority { %maf_line = map{( $_, ( $maf_effect->{$_} ? $maf_effect->{$_} : '' ))} @maf_header; $maf_line{Hugo_Symbol} = $maf_effect->{Transcript_ID} unless( $maf_effect->{Hugo_Symbol} ); $maf_line{Hugo_Symbol} = 'Unknown' unless( $maf_effect->{Transcript_ID} ); - $maf_line{Entrez_Gene_Id} = ( defined $entrez_id_map{$maf_effect->{Gene}} ? $entrez_id_map{$maf_effect->{Gene}} : "0" ); + if( $inhibit_vep || ! defined $entrez_id_map{$maf_effect->{Gene}} ) { + $maf_line{Entrez_Gene_Id} = "0"; } + else { + $maf_line{Entrez_Gene_Id} = $entrez_id_map{$maf_effect->{Gene}}; + } $maf_line{Center} = $maf_center; $maf_line{NCBI_Build} = $ncbi_build; $maf_line{Chromosome} = $chrom; @@ -899,7 +907,7 @@ sub GetBiotypePriority { foreach my $fmt_col ( @addl_fmt_cols ) { my $fmt_key = $fmt_col; if ( $fmt_key =~ /^t_/ ) { $fmt_key =~ s/^t_//; $maf_line{$fmt_col} = ( defined $tum_info{$fmt_key} ? $tum_info{$fmt_key} : "" ); } - if ( $fmt_key =~ /^n_/ ) { $fmt_key =~ s/^n_//; $maf_line{$fmt_col} = ( defined $nrm_info{$fmt_key} ? $nrm_info{$fmt_key} : "" ); } + if ( $fmt_key =~ /^n_/ ) { $fmt_key =~ s/^n_//; $maf_line{$fmt_col} = ( defined $nrm_info{$fmt_key} ? $nrm_info{$fmt_key} : "" ); } } # If this is an SV, pair up gene names from separate lines to backfill the Fusion column later @@ -956,6 +964,11 @@ sub GetBiotypePriority { # Converts Sequence Ontology variant types to MAF variant classifications sub GetVariantClassification { + my $DEFAULT_CLASSIFICATION = "Targeted_Region"; + # When not using VEP, the effect is not known + if( $inhibit_vep ) { + return $DEFAULT_CLASSIFICATION; + } my ( $effect, $var_type, $inframe ) = @_; return "Splice_Site" if( $effect =~ /^(splice_acceptor_variant|splice_donor_variant|transcript_ablation|exon_loss_variant)$/ ); return "Nonsense_Mutation" if( $effect eq 'stop_gained' ); @@ -979,7 +992,7 @@ sub GetVariantClassification { # Annotate everything else simply as a targeted region # TFBS_ablation, TFBS_amplification,regulatory_region_ablation, regulatory_region_amplification, # feature_elongation, feature_truncation - return "Targeted_Region"; + return $DEFAULT_CLASSIFICATION; } # Fix the AD and DP fields, given data from a FORMATted genotype string @@ -1023,7 +1036,7 @@ sub FixAlleleDepths { elsif( !defined $fmt_info{AD} and defined $fmt_info{TIR} and defined $fmt_info{TAR}) { @depths = map{""} @alleles; $depths[0] = ( split /,/, $fmt_info{TAR} )[0]; - $depths[$var_allele_idx] = ( split /,/, $fmt_info{TIR} )[0]; + $depths[$var_allele_idx] = ( split /,/, $fmt_info{TIR} )[0]; } # Handle VCF lines by CaVEMan, where allele depths are in FAZ:FCZ:FGZ:FTZ:RAZ:RCZ:RGZ:RTZ elsif( !defined $fmt_info{AD} and scalar( grep{defined $fmt_info{$_}} qw/FAZ FCZ FGZ FTZ RAZ RCZ RGZ RTZ/ ) == 8 ) { @@ -1043,7 +1056,7 @@ sub FixAlleleDepths { $depths[$var_allele_idx] = $fmt_info{RV}; } # Handle VCF lines where REF/ALT allele counts must be extracted from DP4 - elsif( !defined $fmt_info{AD} and defined $fmt_info{DP4} and scalar( split( /,/, $fmt_info{DP4} )) == 4 ) { + elsif( !defined $fmt_info{AD} and defined $fmt_info{DP4} and scalar( my @fmt_terms = split( /,/, $fmt_info{DP4} )) == 4 ) { # Reference allele depth and depths for any other ALT alleles must be left undefined @depths = map{""} @alleles; # DP4 is usually a comma-delimited list for ref-forward, ref-reverse, alt-forward and alt-reverse read counts @@ -1124,6 +1137,7 @@ =head1 OPTIONS --vcf-tumor-id Tumor sample ID used in VCF's genotype columns [--tumor-id] --vcf-normal-id Matched normal ID used in VCF's genotype columns [--normal-id] --custom-enst List of custom ENST IDs that override canonical selection + --inhibit-vep Omit the running of VEP to determine variant effects --vep-path Folder containing the vep script [~/vep] --vep-data VEP's base cache/plugin directory [~/.vep] --vep-forks Number of forked processes to use when running VEP [4] @@ -1148,7 +1162,7 @@ =head1 DESCRIPTION To convert a VCF into a MAF, each variant must be mapped to only one of all possible gene transcripts/isoforms that it might affect. This selection of a single effect per variant, is often subjective. So this project is an attempt to make the selection criteria smarter, reproducible, and more configurable. -This script needs VEP, a variant annotator that maps effects of a variant on all possible genes and transcripts. For more info, see the README. +This script uses VEP, a variant annotator that maps effects of a variant on all possible genes and transcripts. For more info, see the README. =head2 Relevant links: