forked from mskcc/vcf2maf
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvariant_effect_predictor.pl
executable file
·3023 lines (2444 loc) · 107 KB
/
variant_effect_predictor.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env perl
=head1 LICENSE
Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
Copyright [2016] EMBL-European Bioinformatics Institute
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
=cut
=head1 CONTACT
Please email comments or questions to the public Ensembl
developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
Questions may also be sent to the Ensembl help desk at
<http://www.ensembl.org/Help/Contact>.
=cut
=head1 NAME
Variant Effect Predictor - a script to predict the consequences of genomic variants
http://www.ensembl.org/info/docs/tools/vep/script/index.html
Version 86
by Will McLaren ([email protected])
=cut
use strict;
use Getopt::Long;
use FileHandle;
use CGI qw/:standard/;
use FindBin qw($RealBin);
use lib $RealBin;
use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code);
use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(overlap);
use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
use Bio::EnsEMBL::Variation::Utils::VEP qw(
parse_line
vf_to_consequences
validate_vf
convert_to_vcf
get_all_consequences
get_slice
build_full_cache
read_cache_info
get_version_data
get_time
debug
@OUTPUT_COLS
@VCF_COLS
@EXTRA_HEADERS
%COL_DESCS
@REG_FEAT_TYPES
%FILTER_SHORTCUTS
@PICK_ORDER
);
use Bio::EnsEMBL::Variation::Utils::FastaSequence qw(setup_fasta);
# global vars
my $VERSION = '86';
my %ts_tv = (
'A/G' => 'Ts',
'G/A' => 'Ts',
'C/T' => 'Ts',
'T/C' => 'Ts',
'A/C' => 'Tv',
'C/A' => 'Tv',
'G/T' => 'Tv',
'T/G' => 'Tv',
'C/G' => 'Tv',
'G/C' => 'Tv',
'A/T' => 'Tv',
'T/A' => 'Tv',
);
my %colour_keys = (
'polyphen' => {
'unknown' => 'blue',
'benign' => 'green',
'possibly damaging' => 'orange',
'probably damaging' => 'red',
},
'sift' => {
'tolerated' => 'green',
'deleterious' => 'red',
},
# copied from COLOUR.ini in web code via browser to check colours
'consequences' => {
'intergenic_variant' => 'gray',
'intron_variant' => '#02599c',
'upstream_gene_variant' => '#a2b5cd',
'downstream_gene_variant' => '#a2b5cd',
'5_prime_utr_variant' => '#7ac5cd',
'3_prime_utr_variant' => '#7ac5cd',
'splice_region_variant' => '#ff7f50',
'splice_donor_variant' => '#ff7f50',
'splice_acceptor_variant' => '#ff7f50',
'frameshift_variant' => '#ff69b4',
'transcript_ablation' => '#ff0000',
'transcript_amplification' => '#ff69b4',
'inframe_insertion' => '#ff69b4',
'inframe_deletion' => '#ff69b4',
'synonymous_variant' => '#76ee00',
'stop_retained_variant' => '#76ee00',
'missense_variant' => '#ffd700',
'initiator_codon_variant' => '#ffd700',
'stop_gained' => '#ff0000',
'stop_lost' => '#ff0000',
'mature_mirna_variant' => '#458b00',
'non_coding_exon_variant' => '#32cd32',
'nc_transcript_variant' => '#32cd32',
'incomplete_terminal_codon_variant' => '#ff00ff',
'nmd_transcript_variant' => '#ff4500',
'coding_sequence_variant' => '#458b00',
'tfbs_ablation' => 'brown',
'tfbs_amplification' => 'brown',
'tf_binding_site_variant' => 'brown',
'regulatory_region_variant' => 'brown',
'regulatory_region_ablation' => 'brown',
'regulatory_region_amplification' => 'brown',
},
);
# set output autoflush for progress bars
$| = 1;
# configure from command line opts
my $config = &configure(scalar @ARGV);
# run the main sub routine
&main($config);
# run with leaktrace
# use Test::LeakTrace;
# leaktrace { &main($config); undef $config; } -verbose;
# this is the main sub-routine - it needs the configured $config hash
sub main {
my $config = shift;
debug("Starting...") unless defined $config->{quiet};
# this is for counting seconds
$config->{start_time} = time();
$config->{last_time} = time();
# this is for stats
$config->{stats}->{start_time} = get_time();
my $tr_cache = {};
my $rf_cache = {};
# create a hash to hold slices so we don't get the same one twice
my %slice_cache = ();
my @vfs;
my ($vf_count, $total_vf_count);
my $in_file_handle = $config->{in_file_handle};
# initialize line number in config
$config->{line_number} = 0;
# read the file
while(<$in_file_handle>) {
# split again to avoid Windows character nonsense
foreach my $line(split /\r|(?>\v|\x0D\x0A)/) {
chomp($line);
next unless $line =~ /\w+/;
$config->{line_number}++;
# header line?
if($line =~ /^\#/) {
# retain header lines if we are outputting VCF
if(defined($config->{vcf})) {
push @{$config->{headers}}, $line;
}
# line with sample labels in VCF
if(defined($config->{individual}) && /^#CHROM/) {
my @split = split(/\s+/, $line);
# no individuals
die("ERROR: No individual data found in VCF\n") if scalar @split <= 9;
# get individual column indices
my %ind_cols = map {$split[$_] => $_} (9..$#split);
# all?
if(scalar @{$config->{individual}} == 1 && $config->{individual}->[0] =~ /^all$/i) {
$config->{ind_cols} = \%ind_cols;
}
else {
my %new_ind_cols;
# check we have specified individual(s)
foreach my $ind(@{$config->{individual}}) {
die("ERROR: Individual named \"$ind\" not found in VCF\n") unless defined $ind_cols{$ind};
$new_ind_cols{$ind} = $ind_cols{$ind};
}
$config->{ind_cols} = \%new_ind_cols;
}
}
next;
}
# strip off nasty characters
$line =~ s/\s+$//g;
# configure output file
$config->{out_file_handle} ||= &get_out_file_handle($config);
# some lines (pileup) may actually parse out into more than one variant
foreach my $vf(@{&parse_line($config, $line)}) {
$vf->{_line} = $line;
$vf->{_line_number} = $config->{line_number};
# now get the slice
if(!defined($vf->{slice})) {
my $slice;
# don't get slices if we're using cache
# we can steal them from transcript objects later
if((!defined($config->{cache}) && !defined($config->{whole_genome})) || defined($config->{check_ref}) || defined($config->{convert})) {
# check if we have fetched this slice already
if(defined $slice_cache{$vf->{chr}}) {
$slice = $slice_cache{$vf->{chr}};
}
# if not create a new one
else {
$slice = &get_slice($config, $vf->{chr});
# if failed, warn and skip this line
if(!defined($slice)) {
warn("WARNING: Could not fetch slice named ".$vf->{chr}." on line ".$config->{line_number}."\n") unless defined $config->{quiet};
next;
}
# store the hash
$slice_cache{$vf->{chr}} = $slice;
}
}
$vf->{slice} = $slice;
}
# validate the VF
next unless validate_vf($config, $vf) || defined($config->{dont_skip});
# make a name if one doesn't exist
$vf->{variation_name} ||= ($vf->{original_chr} || $vf->{chr}).'_'.$vf->{start}.'_'.($vf->{allele_string} || $vf->{class_SO_term});
# jump out to convert here
if(defined($config->{convert})) {
&convert_vf($config, $vf);
next;
}
if(defined $config->{whole_genome}) {
push @vfs, $vf;
$vf_count++;
$total_vf_count++;
if($vf_count == $config->{buffer_size}) {
debug("Read $vf_count variants into buffer") unless defined($config->{quiet});
$config->{stats}->{out_count} += print_line($config, $_) foreach @{get_all_consequences($config, \@vfs)};
# calculate stats
my $total_rate = sprintf("%.0f vars/sec", $total_vf_count / ((time() - $config->{start_time}) || 1));
my $rate = sprintf("%.0f vars/sec", $vf_count / ((time() - $config->{last_time}) || 1));
$config->{last_time} = time();
debug("Processed $total_vf_count total variants ($rate, $total_rate total)") unless defined($config->{quiet});
@vfs = ();
$vf_count = 0;
}
}
else {
$config->{stats}->{out_count} += print_line($config, $_) foreach @{vf_to_consequences($config, $vf)};
$vf_count++;
$total_vf_count++;
debug("Processed $vf_count variants") if $vf_count =~ /0$/ && defined($config->{verbose});
}
}
}
}
# if in whole-genome mode, finish off the rest of the buffer
if(defined $config->{whole_genome} && scalar @vfs) {
debug("Read $vf_count variants into buffer") unless defined($config->{quiet});
$config->{stats}->{out_count} += print_line($config, $_) foreach @{get_all_consequences($config, \@vfs)};
# calculate stats
my $total_rate = sprintf("%.0f vars/sec", $total_vf_count / ((time() - $config->{start_time}) || 1));
my $rate = sprintf("%.0f vars/sec", $vf_count / ((time() - $config->{last_time}) || 1));
$config->{last_time} = time();
debug("Processed $total_vf_count total variants ($rate, $total_rate total)") unless defined($config->{quiet});
}
debug($config->{stats}->{filter_count}, "/$total_vf_count variants remain after filtering") if (defined($config->{check_frequency})) && !defined($config->{quiet});
debug("Executed ", defined($Bio::EnsEMBL::DBSQL::StatementHandle::count_queries) ? $Bio::EnsEMBL::DBSQL::StatementHandle::count_queries : 'unknown number of', " SQL statements") if defined($config->{count_queries}) && !defined($config->{quiet});
# finalise run-time stats
$config->{stats}->{var_count} = $total_vf_count;
$config->{stats}->{end_time} = get_time();
$config->{stats}->{run_time} = time() - $config->{start_time};
# write stats
unless(defined($config->{no_stats})) {
summarise_stats($config);
debug("Wrote stats summary to ".$config->{stats_file}) unless defined($config->{quiet});
}
# tell user about any warnings
if($config->{warning_count}) {
debug("See ".$config->{warning_file}." for details of ".$config->{warning_count}." warnings") unless defined($config->{quiet});
}
# close HTML output
if(defined($config->{html}) && defined($config->{html_file_handle})) {
my $fh = $config->{html_file_handle};
print $fh "</tbody><tfoot><tr>".$config->{_th}."</tr></tfoot></table><p> </p></div></html></body>\n</html>\n";
$fh->close;
}
if(defined($config->{solr})) {
my $fh = $config->{out_file_handle};
print $fh "</add>\n";
}
# tabix?
if(defined($config->{tabix})) {
debug("Compressing and indexing output") unless defined($config->{quiet});
# check sorting
open SORT, $config->{output_file};
my $prev_pos = 0;
my $prev_chr = 0;
my $is_sorted = 0;
my %seen_chrs;
while(<SORT>) {
if(!/^\#/) {
$is_sorted = 1;
my @data = split /\s+/, $_;
if(
($data[0] eq $prev_chr && $data[1] < $prev_pos) ||
($data[0] ne $prev_chr && defined($seen_chrs{$data[0]}))
) {
$is_sorted = 0;
last;
}
$prev_pos = $data[1];
$prev_chr = $data[0];
$seen_chrs{$data[0]} = 1;
}
}
close SORT;
unless($is_sorted) {
system('grep "^#" '.$config->{output_file}.' > '.$config->{output_file}.'.sorted');
system('grep -v "^#" '.$config->{output_file}.' | sort -k1,1 -k2,2n >> '.$config->{output_file}.'.sorted');
system('mv '.$config->{output_file}.'.sorted '.$config->{output_file});
}
system("bgzip -f ".$config->{output_file}) == 0 or warn "WARNING: failed to generated bgzipped file for tabix\n$?\n";
rename($config->{output_file}.".gz", $config->{output_file});
system("tabix -p vcf -f ".$config->{output_file}) == 0 or warn "WARNING: failed to generated tabix index\n$?\n";
}
debug("Finished!") unless defined $config->{quiet};
}
# sets up configuration hash that is used throughout the script
sub configure {
my $args = shift;
my $config = {};
my @ARGV_copy = @ARGV;
$config->{stats}->{options} = \@ARGV_copy;
GetOptions(
$config,
'help', # displays help message
# input options,
'config=s', # config file name
'input_file|i=s', # input file name
'format=s', # input file format
# DB options
'species=s', # species e.g. human, homo_sapiens
'registry=s', # registry file
'host=s', # database host
'port=s', # database port
'user|u=s', # database user name
'password=s', # database password
'db_version=i', # Ensembl database version to use e.g. 62
'assembly|a=s', # assembly version to use
'genomes', # automatically sets DB params for e!Genomes
'refseq', # use otherfeatures RefSeq DB instead of Ensembl
'merged', # use merged cache
'all_refseq', # report consequences on all transcripts in RefSeq cache, includes CCDS, EST etc
'gencode_basic', # limit to using just GenCode basic transcript set
'is_multispecies=i', # '1' for a multispecies database (e.g protists_euglenozoa1_collection_core_29_82_1)
# runtime options
'minimal', # convert input alleles to minimal representation
'most_severe', # only return most severe consequence
'summary', # only return one line per variation with all consquence types
'per_gene', # only return most severe per gene
'pick', # used defined criteria to return most severe line
'flag_pick', # like --pick but just adds a flag to picked line
'pick_allele', # like --pick but chooses one con per allele
'flag_pick_allele', # like --flag_pick but flags one con per allele
'pick_order=s', # define the order of categories used by the --*pick* flags
'buffer_size=i', # number of variations to read in before analysis
'chunk_size=s', # size in bases of "chunks" used in internal hash structure
'failed=i', # include failed variations when finding existing
'no_whole_genome', # disables now default whole-genome mode
'whole_genome', # proxy for whole genome mode - now just warns user
'gp', # read coords from GP part of INFO column in VCF (probably only relevant to 1KG)
'chr=s', # analyse only these chromosomes, e.g. 1-5,10,MT
'check_ref', # check supplied reference allele against DB
'check_existing', # find existing co-located variations
'check_svs', # find overlapping structural variations
'check_alleles', # only attribute co-located if alleles are the same
'check_frequency', # enable frequency checking
'gmaf', # add global MAF of existing var
'maf_1kg', # add 1KG MAFs of existing vars
'maf_esp', # add ESP MAFs of existing vars
'maf_exac', # add ExAC MAFs of existing vars
'old_maf', # report 1KG/ESP MAFs in the old way (no allele, always < 0.5)
'pubmed', # add Pubmed IDs for publications that cite existing vars
'freq_filter=s', # exclude or include
'freq_freq=f', # frequency to filter on
'freq_gt_lt=s', # gt or lt (greater than or less than)
'freq_pop=s', # population to filter on
'filter_common', # shortcut to MAF filtering
'allow_non_variant', # allow non-variant VCF lines through
'process_ref_homs', # force processing of individuals with homozygous ref genotype
'individual=s', # give results by genotype for individuals
'phased', # force VCF genotypes to be interpreted as phased
'fork=i', # fork into N processes
'dont_skip', # don't skip vars that fail validation
# verbosity options
'verbose|v', # print out a bit more info while running
'quiet', # print nothing to STDOUT (unless using -o stdout)
'no_progress', # don't display progress bars
# output options
'everything|e', # switch on EVERYTHING :-)
'output_file|o=s', # output file name
'tabix', # bgzip and tabix-index output
'html', # generate an HTML version of output
'stats_file|sf=s', # stats file name
'stats_text', # write stats as text
'warning_file=s', # file to write warnings to
'no_stats', # don't write stats file
'force_overwrite', # force overwrite of output file if already exists
'terms|t=s', # consequence terms to use e.g. NCBI, SO
'coding_only', # only return results for consequences in coding regions
'canonical', # indicates if transcript is canonical
'tsl', # output transcript support level
'appris', # output APPRIS transcript annotation
'ccds', # output CCDS identifer
'xref_refseq', # output refseq mrna xref
'uniprot', # output Uniprot identifiers (includes UniParc)
'protein', # add e! protein ID to extra column
'biotype', # add biotype of transcript to output
'hgnc', # add HGNC gene ID to extra column
'symbol', # add gene symbol (e.g. HGNC)
'gene_phenotype', # indicate if genes are phenotype-associated
'hgvs', # add HGVS names to extra column
'shift_hgvs=i', # disable/enable 3-prime shifting of HGVS indels to comply with standard
'sift=s', # SIFT predictions
'polyphen=s', # PolyPhen predictions
'humdiv', # use humDiv instead of humVar for PolyPhen
'condel=s', # Condel predictions
'variant_class', # get SO variant type
'regulatory', # enable regulatory stuff
'cell_type=s' => ($config->{cell_type} ||= []), # filter cell types for regfeats
'convert=s', # convert input to another format (doesn't run VEP)
'no_intergenic', # don't print out INTERGENIC consequences
'gvf', # produce gvf output
'vcf', # produce vcf output
'solr', # produce XML output for Solr
'json', # produce JSON document output
'tab', # produce tabulated output
'vcf_info_field=s', # allow user to change VCF info field name
'keep_csq', # don't nuke existing CSQ fields in VCF
'keep_ann', # synonym for keep_csq
'no_consequences', # don't calculate consequences
'lrg', # enable LRG-based features
'fields=s', # define your own output fields
'domains', # output overlapping protein features
'numbers', # include exon and intron numbers
'total_length', # give total length alongside positions e.g. 14/203
'allele_number', # indicate allele by number to avoid confusion with VCF conversions
'no_escape', # don't percent-escape HGVS strings
# cache stuff
'database', # must specify this to use DB now
'cache', # use cache
'cache_version=i', # specify a different cache version
'write_cache', # enables writing to the cache
'show_cache_info', # print cache info and quit
'build=s', # builds cache from DB from scratch; arg is either all (all top-level seqs) or a list of chrs
'build_test', # disable some slow start-up stuff for speed when testing
'build_parts=s', # choose which bits of the cache to build (t=transcript, v=variants, r=regfeats)
'build_range=s', # for testing, give a coord range. Probably best when using one chrom only e.g. --build 21
'no_adaptor_cache', # don't write adaptor cache
'strip', # strips adaptors etc from objects before caching them
'rebuild=s', # rebuilds cache by reading in existing then redumping - probably don't need to use this any more
'dir=s', # dir where cache is found (defaults to $HOME/.vep/)
'dir_cache=s', # specific directory for cache
'dir_plugins=s', # specific directory for plugins
'cache_region_size=i', # size of region in bases for each cache file
'no_slice_cache', # tell API not to cache features on slice
'standalone', # standalone renamed offline
'offline', # offline mode uses minimal set of modules installed in same dir, no DB connection
'skip_db_check', # don't compare DB parameters with cached
'compress=s', # by default we use zcat to decompress; user may want to specify gzcat or "gzip -dc"
'custom=s' => ($config->{custom} ||= []), # specify custom tabixed bgzipped file with annotation
'tmpdir=s', # tmp dir used for BigWig retrieval
'plugin=s' => ($config->{plugin} ||= []), # specify a method in a module in the plugins directory
'safe', # die if plugins don't compile or spit warnings
'fasta=s', # file or dir containing FASTA files with reference sequence
'freq_file=s', # file containing freqs to add to cache build
'freq_vcf=s' => ($config->{freq_vcf} ||= []), # VCF file containing freqs
'sereal', # user Sereal instead of Storable for the cache
'synonyms=s', # chromosome synonyms file
# debug
'cluck', # these two need some mods to Bio::EnsEMBL::DBSQL::StatementHandle to work. Clucks callback trace and SQL
'count_queries', # counts SQL queries executed
'admin', # allows me to build off public hosts
'debug', # print out debug info
) or die "ERROR: Failed to parse command-line flags\n";
# print usage message if requested or no args supplied
if(defined($config->{help}) || !$args) {
&usage;
exit(0);
}
# config file?
if(defined $config->{config}) {
read_config_from_file($config, $config->{config});
}
# dir is where the cache and plugins live
my $default_dir = join '/', ($ENV{'HOME'}, '.vep');
$config->{dir_plugins} ||= ($config->{dir} ? $config->{dir}.'/Plugins' : $default_dir.'/Plugins');
$config->{dir} ||= $config->{dir_cache} || $default_dir;
# ini file?
my $ini_file = $config->{dir}.'/vep.ini';
if(-e $ini_file) {
read_config_from_file($config, $ini_file);
}
# everything?
if(defined($config->{everything})) {
my %everything = (
sift => 'b',
polyphen => 'b',
ccds => 1,
hgvs => 1,
symbol => 1,
numbers => 1,
domains => 1,
regulatory => 1,
canonical => 1,
protein => 1,
biotype => 1,
gmaf => 1,
maf_1kg => 1,
maf_esp => 1,
maf_exac => 1,
pubmed => 1,
uniprot => 1,
tsl => 1,
appris => 1,
variant_class => 1,
gene_phenotype => 1,
);
$config->{$_} = $everything{$_} for keys %everything;
}
# subroutine to check for illegal flags or combinations
check_flags($config);
# connection settings for Ensembl Genomes
if($config->{genomes}) {
$config->{host} ||= 'mysql-eg-publicsql.ebi.ac.uk';
$config->{port} ||= 4157;
}
# connection settings for main Ensembl
else {
$config->{species} ||= "homo_sapiens";
$config->{host} ||= 'ensembldb.ensembl.org';
$config->{port} ||= 3306;
}
# refseq or core?
if(defined($config->{refseq})) {
$config->{core_type} = 'otherfeatures';
}
else {
$config->{core_type} = 'core';
}
# turn on rest for json
if(defined($config->{json})) {
$config->{rest} = 1;
eval q{ use JSON; };
if($@) {
die("ERROR: Could not load required JSON module\n");
}
}
if(defined($config->{vcf})) {
$config->{keep_csq} = 1 if defined($config->{keep_ann});
$config->{$_} = 1 for qw(symbol biotype numbers);
}
# force quiet if outputting to STDOUT
if(defined($config->{output_file}) && $config->{output_file} =~ /stdout/i) {
delete $config->{verbose} if defined($config->{verbose});
$config->{quiet} = 1;
}
# individual(s) specified?
if(defined($config->{individual})) {
$config->{individual} = [split /\,/, $config->{individual}];
# force allow_non_variant
$config->{allow_non_variant} = 1;
}
# regulatory has to be on for cell_type
if(defined($config->{cell_type}) && scalar(@{$config->{cell_type}})) {
$config->{regulatory} = 1;
$config->{cell_type} = [map {split /\,/, $_} @{$config->{cell_type}}];
}
else {
delete $config->{cell_type};
}
# summarise options if verbose
if(defined $config->{verbose}) {
my $header =<<INTRO;
#----------------------------------#
# ENSEMBL VARIANT EFFECT PREDICTOR #
#----------------------------------#
version $VERSION
By Will McLaren (wm2\@ebi.ac.uk)
Configuration options:
INTRO
print $header;
my $max_length = (sort {$a <=> $b} map {length($_)} keys %$config)[-1];
foreach my $key(sort keys %$config) {
next if ref($config->{$key}) eq 'ARRAY' && scalar @{$config->{$key}} == 0;
print $key.(' ' x (($max_length - length($key)) + 4)).(ref($config->{$key}) eq 'ARRAY' ? join "\t", @{$config->{$key}} : $config->{$key})."\n";
}
print "\n".("-" x 20)."\n\n";
}
# set defaults
$config->{user} ||= 'anonymous';
$config->{buffer_size} ||= 5000;
$config->{chunk_size} ||= '50kb';
$config->{output_file} ||= "variant_effect_output.txt";
$config->{stats_file} ||= $config->{output_file}."_summary.".(defined($config->{stats_text}) ? 'txt' : 'html');
$config->{tmpdir} ||= '/tmp';
$config->{format} ||= 'guess';
$config->{terms} ||= 'SO';
$config->{cache_region_size} ||= 1000000;
$config->{compress} ||= 'gzip -dc';
$config->{polyphen_analysis} = defined($config->{humdiv}) ? 'humdiv' : 'humvar';
$config->{vcf_info_field} ||= 'CSQ';
# shift HGVS?
if(defined($config->{shift_hgvs})) {
use Bio::EnsEMBL::Variation::DBSQL::TranscriptVariationAdaptor;
use Bio::EnsEMBL::Variation::DBSQL::DBAdaptor;
no warnings 'once';
$Bio::EnsEMBL::Variation::DBSQL::TranscriptVariationAdaptor::DEFAULT_SHIFT_HGVS_VARIANTS_3PRIME = $config->{shift_hgvs};
no warnings 'once';
$Bio::EnsEMBL::Variation::DBSQL::DBAdaptor::DEFAULT_SHIFT_HGVS_VARIANTS_3PRIME = $config->{shift_hgvs};
}
# frequency filtering
if(defined($config->{filter_common})) {
$config->{check_frequency} = 1;
# set defaults
$config->{freq_freq} ||= 0.01;
$config->{freq_filter} ||= 'exclude';
$config->{freq_pop} ||= '1KG_ALL';
$config->{freq_gt_lt} ||= 'gt';
}
if(defined($config->{check_frequency})) {
foreach my $flag(qw(freq_freq freq_filter freq_pop freq_gt_lt)) {
die "ERROR: To use --check_frequency you must also specify flag --$flag\n" unless defined $config->{$flag};
}
# need to set check_existing
$config->{check_existing} = 1;
}
foreach my $flag(qw(check_existing check_alleles gmaf maf_1kg maf_esp maf_exac pubmed)) {
$config->{check_existing} = 1 if defined $config->{$flag};
}
# warn users still using whole_genome flag
if(defined($config->{whole_genome})) {
debug("INFO: Whole-genome mode is now the default run-mode for the script. To disable it, use --no_whole_genome") unless defined($config->{quiet});
}
$config->{whole_genome} = 1 unless defined $config->{no_whole_genome};
$config->{failed} = 0 unless defined $config->{failed};
$config->{chunk_size} =~ s/mb?/000000/i;
$config->{chunk_size} =~ s/kb?/000/i;
$config->{cache_region_size} =~ s/mb?/000000/i;
$config->{cache_region_size} =~ s/kb?/000/i;
# cluck and display executed SQL?
if (defined($config->{cluck})) {
no warnings 'once';
$Bio::EnsEMBL::DBSQL::StatementHandle::cluck = 1;
}
# write_cache needs cache
$config->{cache} = 1 if defined $config->{write_cache};
$config->{cache} = 1 if defined $config->{offline};
$config->{cache} = 1 if defined $config->{show_cache_info};
# no_slice_cache and whole_genome have to be on to use cache
if(defined($config->{cache})) {
$config->{no_slice_cache} = 1;
$config->{whole_genome} = 1;
$config->{strip} = 1;
}
$config->{build} = $config->{rebuild} if defined($config->{rebuild});
# force options for full build
if(defined($config->{build})) {
$config->{symbol} = 1;
$config->{no_slice_cache} = 1;
$config->{cache} = 1;
$config->{strip} = 1;
$config->{write_cache} = 1;
$config->{cell_type} = [1] if defined($config->{regulatory});
$config->{build_parts} ||= 'tvr';
die("ERROR: --build_parts [tvr] must have at least one of t (transcripts), v (variants), r (regfeats)\n") unless $config->{build_parts} =~ /^[tvr]+$/;
}
# connect to databases
$config->{reg} = &connect_to_dbs($config);
# setup cache dir etc
setup_cache($config) if defined($config->{cache});
if(defined($config->{show_cache_info})) {
my $v = get_version_data($config);
print "$_\t".$v->{$_}."\n" for keys %$v;
exit(0);
}
# chromosome synonyms
read_chromosome_synonyms($config, $config->{synonyms}) if defined($config->{synonyms});
# setup FASTA file
if(defined($config->{fasta})) {
# spoof a coordinate system
$config->{coord_system} = Bio::EnsEMBL::CoordSystem->new(
-NAME => 'chromosome',
-RANK => 1,
);
$config->{fasta_db} = setup_fasta(
-FASTA => $config->{fasta},
-ASSEMBLY => $config->{assembly},
-OFFLINE => $config->{offline},
-SYNONYMS => $config->{chromosome_synonyms},
) if defined($config->{fasta});
}
# setup custom files
setup_custom($config) if defined($config->{custom});
# setup forking
setup_forking($config) if defined($config->{fork});
# offline needs cache, can't use HGVS
if(defined($config->{offline})) {
die("ERROR: Cannot generate HGVS coordinates in offline mode without a FASTA file (see --fasta)\n") if defined($config->{hgvs}) && !defined($config->{fasta});
die("ERROR: Cannot use HGVS as input in offline mode\n") if $config->{format} eq 'hgvs';
die("ERROR: Cannot use variant identifiers as input in offline mode\n") if $config->{format} eq 'id';
die("ERROR: Cannot do frequency filtering in offline mode\n") if defined($config->{check_frequency}) && $config->{freq_pop} !~ /1kg.*(all|afr|amr|asn|eur)/i;
die("ERROR: Cannot retrieve overlapping structural variants in offline mode\n") if defined($config->{check_sv});
die("ERROR: Cannot check reference sequences without a FASTA file (see --fasta)\n") if defined($config->{check_ref}) && !defined($config->{fasta});
die("ERROR: Cannot map to LRGs in offline mode\n") if defined($config->{lrg});
}
# suppress warnings that the FeatureAdpators spit if using no_slice_cache
Bio::EnsEMBL::Utils::Exception::verbose(1999) if defined($config->{no_slice_cache});
# we configure plugins here because they can sometimes switch on the
# regulatory config option
configure_plugins($config);
# include regulatory modules if requested
if(defined($config->{regulatory})) {
# do the use statements here so that users don't have to have the
# funcgen API installed to use the rest of the script
eval q{
use Bio::EnsEMBL::Funcgen::DBSQL::RegulatoryFeatureAdaptor;
use Bio::EnsEMBL::Funcgen::DBSQL::MotifFeatureAdaptor;
use Bio::EnsEMBL::Funcgen::MotifFeature;
use Bio::EnsEMBL::Funcgen::RegulatoryFeature;
use Bio::EnsEMBL::Funcgen::BindingMatrix;
};
if($@) {
die("ERROR: Ensembl Funcgen API must be installed to use --regulatory or plugins that deal with regulatory features\n$@");
}
}
# user defined custom output fields
if(defined($config->{fields})) {
$config->{fields} = [split ',', $config->{fields}];
debug("Output fields redefined (".scalar @{$config->{fields}}." defined)") unless defined($config->{quiet});
$config->{fields_redefined} = 1;
}
$config->{fields} ||= \@OUTPUT_COLS;
# get adaptors (don't get them in offline mode)
unless(defined($config->{offline})) {
&get_adaptors($config);
# reg adaptors (only fetches if not retrieved from cache already)
&get_reg_adaptors($config) if defined($config->{regulatory});
}
# check regulatory available
if(defined($config->{regulatory}) && defined($config->{cache}) && !defined($config->{write_cache}) && !(defined($config->{cache_regulatory}) || defined($config->{cache_cell_types}))) {
# --everything this option is implicit so don't die
if(defined($config->{everything})) {
delete $config->{regulatory};
}
else {
die("ERROR: --regulatory is not available for this species");
}
}
# check cell types
if(defined($config->{cell_type}) && scalar @{$config->{cell_type}} && !defined($config->{build})) {
my $cls = '';
if(defined($config->{cache})) {
$cls = $config->{cache_cell_types};
}
else {
my $regulatory_build_adaptor = $config->{RegulatoryFeature_adaptor}->db->get_RegulatoryBuildAdaptor();
my $regulatory_build = $regulatory_build_adaptor->fetch_current_regulatory_build;
my @cell_types =
sort
map {s/ /\_/g; $_}
map {$_->display_label}
@{$regulatory_build->get_all_Epigenomes};
$cls = join ",", @cell_types;
}
foreach my $cl(@{$config->{cell_type}}) {
die "ERROR: cell type $cl not recognised; available cell types are:\n$cls\n" unless $cls =~ /(^|,)$cl(,|$)/;
}
}
# check SIFT/PolyPhen available?
foreach my $tool(grep {defined($config->{$_})} qw(sift polyphen)) {
my $vd = get_version_data($config);
unless(defined($vd->{$tool}) || defined($config->{'cache_'.$tool.'_version'})) {
# --everything this option is implicit so don't die
if(defined($config->{everything})) {
delete $config->{$tool};
}
else {
die("ERROR: --$tool is not available for this species or cache");
}
}
}
# get terminal width for progress bars
unless(defined($config->{quiet}) || defined($config->{no_progress})) {
my $width;
# module may not be installed
eval q{
use Term::ReadKey;
};
if(!$@) {
my ($w, $h);
# module may be installed, but e.g.
eval {
($w, $h) = GetTerminalSize();
};
$width = $w if defined $w;
}
$width ||= 60;
$width -= 12;
$config->{terminal_width} = $width;
}
# jump out to build cache if requested
if(defined($config->{build})) {
if($config->{host} =~ /^(ensembl|useast)db\.ensembl\.org$/ && !defined($config->{admin})) {
die("ERROR: Cannot build cache using public database server ", $config->{host}, "\n");
}
# get VCF freqs
if(defined($config->{freq_vcf})) {
my @new;
foreach my $vcf_conf(@{$config->{freq_vcf}}) {
my ($freq_file, @opts_and_pops) = split /\,/, $vcf_conf;
my @opts = grep {/\=/} @opts_and_pops;
my @file_pops = grep {!/\=/} @opts_and_pops;
my %opts = (
file => $freq_file,
pops => \@file_pops,
);