From 2fe56797a6929402802fa57d8528d447711f1add Mon Sep 17 00:00:00 2001
From: Robert Sheridan <sheridan@cbio.mskcc.org>
Date: Wed, 1 Apr 2020 09:23:23 -0400
Subject: [PATCH] add command line option --inhibit-vep - update docs and
 README

---
 README.md               |  2 ++
 docs/vep_maf_readme.txt |  4 ++--
 vcf2maf.pl              | 32 +++++++++++++++++++++++---------
 3 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index 8d7174f..73c521c 100644
--- a/README.md
+++ b/README.md
@@ -31,6 +31,8 @@ If you have the VEP script in a different folder like `/opt/vep`, and its cache
 
     perl vcf2maf.pl --input-vcf tests/test.vcf --output-maf tests/test.vep.maf --vep-path /opt/vep --vep-data /srv/vep
 
+You can also omit the running of VEP by including the option --inhibit-vep
+
 maf2maf
 -------
 
diff --git a/docs/vep_maf_readme.txt b/docs/vep_maf_readme.txt
index c2180e2..c63f53c 100644
--- a/docs/vep_maf_readme.txt
+++ b/docs/vep_maf_readme.txt
@@ -1,8 +1,8 @@
 This file describes the columns of MAF files generated by maf2maf or vcf2maf,
 and how to interpret them, especially in the context of cancer genetics.
 
-The first 34 columns are NCI's Mutation Annotation Format (MAF), described here:
-https://docs.gdc.cancer.gov/Data/File_Formats/MAF_Format/
+The first 34 columns are NCI's standard TCGA MAF format, and described here:
+https://wiki.nci.nih.gov/x/eJaPAQ
 
 The subsequent 10 columns are relevant to most analyses.
 
diff --git a/vcf2maf.pl b/vcf2maf.pl
index 810d7aa..c38d87a 100644
--- a/vcf2maf.pl
+++ b/vcf2maf.pl
@@ -13,6 +13,7 @@
 
 # Set any default paths and constants
 my ( $tumor_id, $normal_id ) = ( "TUMOR", "NORMAL" );
+my $inhibit_vep = 0;
 my ( $vep_path, $vep_data, $vep_forks, $buffer_size, $any_allele, $online ) = ( "$ENV{HOME}/vep", "$ENV{HOME}/.vep", 4, 5000, 0, 0 );
 my ( $ref_fasta, $filter_vcf ) = ( "$ENV{HOME}/.vep/homo_sapiens/95_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz", "$ENV{HOME}/.vep/ExAC_nonTCGA.r0.3.1.sites.vep.vcf.gz" );
 my ( $species, $ncbi_build, $cache_version, $maf_center, $retain_info, $retain_fmt, $min_hom_vaf, $max_filter_ac ) = ( "homo_sapiens", "GRCh37", "", ".", "", "", 0.7, 10 );
@@ -202,6 +203,7 @@ sub GetBiotypePriority {
     'vcf-tumor-id=s' => \$vcf_tumor_id,
     'vcf-normal-id=s' => \$vcf_normal_id,
     'custom-enst=s' => \$custom_enst_file,
+    'inhibit-vep!' => \$inhibit_vep,
     'vep-path=s' => \$vep_path,
     'vep-data=s' => \$vep_data,
     'vep-forks=s' => \$vep_forks,
@@ -427,8 +429,10 @@ sub GetBiotypePriority {
 
 # Annotate variants in given VCF to all possible transcripts
 my $output_vcf = ( $remap_chain ? "$tmp_dir/$input_name.remap.vep.vcf" : "$tmp_dir/$input_name.vep.vcf" );
+
 # Skip running VEP if an annotated VCF already exists
-unless( -s $output_vcf ) {
+unless( not $inhibit_vep && -s $output_vcf ) {
+
     warn "STATUS: Running VEP and writing to: $output_vcf\n";
     # Make sure we can find the VEP script
     my $vep_script = ( -s "$vep_path/vep" ? "$vep_path/vep" : "$vep_path/variant_effect_predictor.pl" );
@@ -528,8 +532,8 @@ sub GetBiotypePriority {
 # one transcript per variant whose annotation will be used in the MAF
 my $maf_fh = IO::File->new( $output_maf, ">" ) or die "ERROR: Couldn't open --output-maf: $output_maf!\n";
 $maf_fh->print( "#version 2.4\n" . join( "\t", @maf_header ), "\n" ); # Print MAF header
-( -s $output_vcf ) or exit; # Warnings on this were printed earlier, but quit here, only after a blank MAF is created
-my $annotated_vcf_fh = IO::File->new( $output_vcf ) or die "ERROR: Couldn't open annotated VCF: $output_vcf!\n";
+( -s $input_vcf ) or exit; # Warnings on this were printed earlier, but quit here, only after a blank MAF is created
+my $annotated_vcf_fh = IO::File->new( $input_vcf ) or die "ERROR: Couldn't open un-annotated VCF: $input_vcf!\n";
 my ( $vcf_tumor_idx, $vcf_normal_idx, %sv_pair );
 while( my $line = $annotated_vcf_fh->getline ) {
 
@@ -776,7 +780,11 @@ sub GetBiotypePriority {
     %maf_line = map{( $_, ( $maf_effect->{$_} ? $maf_effect->{$_} : '' ))} @maf_header;
     $maf_line{Hugo_Symbol} = $maf_effect->{Transcript_ID} unless( $maf_effect->{Hugo_Symbol} );
     $maf_line{Hugo_Symbol} = 'Unknown' unless( $maf_effect->{Transcript_ID} );
-    $maf_line{Entrez_Gene_Id} = ( defined $entrez_id_map{$maf_effect->{Gene}} ? $entrez_id_map{$maf_effect->{Gene}} : "0" );
+    if( $inhibit_vep || ! defined $entrez_id_map{$maf_effect->{Gene}} ) {
+        $maf_line{Entrez_Gene_Id} = "0"; }
+    else {
+        $maf_line{Entrez_Gene_Id} = $entrez_id_map{$maf_effect->{Gene}};
+    }
     $maf_line{Center} = $maf_center;
     $maf_line{NCBI_Build} = $ncbi_build;
     $maf_line{Chromosome} = $chrom;
@@ -899,7 +907,7 @@ sub GetBiotypePriority {
     foreach my $fmt_col ( @addl_fmt_cols ) {
         my $fmt_key = $fmt_col;
         if ( $fmt_key =~ /^t_/ ) { $fmt_key =~ s/^t_//; $maf_line{$fmt_col} = ( defined $tum_info{$fmt_key} ? $tum_info{$fmt_key} : "" ); }
-        if ( $fmt_key =~ /^n_/ ) { $fmt_key =~ s/^n_//; $maf_line{$fmt_col} = ( defined $nrm_info{$fmt_key} ? $nrm_info{$fmt_key} : "" ); }  
+        if ( $fmt_key =~ /^n_/ ) { $fmt_key =~ s/^n_//; $maf_line{$fmt_col} = ( defined $nrm_info{$fmt_key} ? $nrm_info{$fmt_key} : "" ); }
     }
 
     # If this is an SV, pair up gene names from separate lines to backfill the Fusion column later
@@ -956,6 +964,11 @@ sub GetBiotypePriority {
 
 # Converts Sequence Ontology variant types to MAF variant classifications
 sub GetVariantClassification {
+    my $DEFAULT_CLASSIFICATION = "Targeted_Region";
+    # When not using VEP, the effect is not known
+    if( $inhibit_vep ) {
+        return $DEFAULT_CLASSIFICATION;
+    }
     my ( $effect, $var_type, $inframe ) = @_;
     return "Splice_Site" if( $effect =~ /^(splice_acceptor_variant|splice_donor_variant|transcript_ablation|exon_loss_variant)$/ );
     return "Nonsense_Mutation" if( $effect eq 'stop_gained' );
@@ -979,7 +992,7 @@ sub GetVariantClassification {
     # Annotate everything else simply as a targeted region
     # TFBS_ablation, TFBS_amplification,regulatory_region_ablation, regulatory_region_amplification,
     # feature_elongation, feature_truncation
-    return "Targeted_Region";
+    return $DEFAULT_CLASSIFICATION;
 }
 
 # Fix the AD and DP fields, given data from a FORMATted genotype string
@@ -1023,7 +1036,7 @@ sub FixAlleleDepths {
     elsif( !defined $fmt_info{AD} and defined $fmt_info{TIR} and defined $fmt_info{TAR}) {
         @depths = map{""} @alleles;
         $depths[0] = ( split /,/, $fmt_info{TAR} )[0];
-        $depths[$var_allele_idx] = ( split /,/, $fmt_info{TIR} )[0];	
+        $depths[$var_allele_idx] = ( split /,/, $fmt_info{TIR} )[0];
     }
     # Handle VCF lines by CaVEMan, where allele depths are in FAZ:FCZ:FGZ:FTZ:RAZ:RCZ:RGZ:RTZ
     elsif( !defined $fmt_info{AD} and scalar( grep{defined $fmt_info{$_}} qw/FAZ FCZ FGZ FTZ RAZ RCZ RGZ RTZ/ ) == 8 ) {
@@ -1043,7 +1056,7 @@ sub FixAlleleDepths {
         $depths[$var_allele_idx] = $fmt_info{RV};
     }
     # Handle VCF lines where REF/ALT allele counts must be extracted from DP4
-    elsif( !defined $fmt_info{AD} and defined $fmt_info{DP4} and scalar( split( /,/, $fmt_info{DP4} )) == 4 ) {
+    elsif( !defined $fmt_info{AD} and defined $fmt_info{DP4} and scalar( my @fmt_terms = split( /,/, $fmt_info{DP4} )) == 4 ) {
         # Reference allele depth and depths for any other ALT alleles must be left undefined
         @depths = map{""} @alleles;
         # DP4 is usually a comma-delimited list for ref-forward, ref-reverse, alt-forward and alt-reverse read counts
@@ -1124,6 +1137,7 @@ =head1 OPTIONS
  --vcf-tumor-id   Tumor sample ID used in VCF's genotype columns [--tumor-id]
  --vcf-normal-id  Matched normal ID used in VCF's genotype columns [--normal-id]
  --custom-enst    List of custom ENST IDs that override canonical selection
+ --inhibit-vep    Omit the running of VEP to determine variant effects
  --vep-path       Folder containing the vep script [~/vep]
  --vep-data       VEP's base cache/plugin directory [~/.vep]
  --vep-forks      Number of forked processes to use when running VEP [4]
@@ -1148,7 +1162,7 @@ =head1 DESCRIPTION
 
 To convert a VCF into a MAF, each variant must be mapped to only one of all possible gene transcripts/isoforms that it might affect. This selection of a single effect per variant, is often subjective. So this project is an attempt to make the selection criteria smarter, reproducible, and more configurable.
 
-This script needs VEP, a variant annotator that maps effects of a variant on all possible genes and transcripts. For more info, see the README.
+This script uses VEP, a variant annotator that maps effects of a variant on all possible genes and transcripts. For more info, see the README.
 
 =head2 Relevant links: