-
Notifications
You must be signed in to change notification settings - Fork 34
/
Copy pathncbi_ftp_concat_unpack.pl
408 lines (367 loc) · 20.6 KB
/
ncbi_ftp_concat_unpack.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
#!/usr/bin/perl
use warnings;
use strict;
use File::Find; # module to traverse directory trees
my $usage = << "USAGE";
#######################################################################
# $0 genbank|refseq [y] #
# #
# Unpacks and concatenates all draft and complete genomes (in genbank #
# and fasta format) downloaded from NCBI's FTP server #
# (ftp://ftp.ncbi.nlm.nih.gov/). The script traverses the downloaded #
# NCBI FTP folder structure and thus has to be called from the top #
# level (containing the folder './ftp.ncbi.nlm.nih.gov'). #
# Therefore, use the bash-shell wrapper script 'ncbi_ftp_download.sh',#
# which employs 'wget' to download the genomes and mirrors the NCBI #
# FTP server folder structure locally (with 'ftp.ncbi.nlm.nih.gov' #
# being the top folder). Afterwards, 'ncbi_ftp_download.sh' runs #
# 'ncbi_ftp_concat_unpack.pl' with both 'genbank' and 'refseq' #
# options, as well as option 'y' to overwrite the old result folders #
# (see below). Both scripts have to be in the same directory (or in #
# the path) to run 'ncbi_ftp_download.sh'. #
# For COMPLETE genomes PLASMIDS are concatenated to the CHROMOSOMES to#
# create multi-genbank/-fasta files (script 'split_multi-seq_file.pl' #
# can be used to split the multi-seq file to single-seq files). #
# In DRAFT genomes, SCAFFOLD and/or CONTIG files, designated by #
# 'draft_scaf' or 'draft_con', are controlled for annotation (i.e. if #
# gene primary feature tags exist). Usually only one file contains #
# annotations. The one with annotation is then used to create multi- #
# genbank files. Multi-fasta files are created for the corresponding #
# genbank-file or, if no annotation exists, for the file which #
# contains more sequence information (either contigs or scaffolds), #
# and if equal, scaffolds are preferred. #
# #
# Use option 'genbank' to extract/copy GenBank genomes and 'refseq' #
# for RefSeq genomes. Concatenated files are stored in the result #
# folders './genbank' and './refseq', respectively. Set option 'y' in #
# the program call to delete previous result folders and create new #
# ones (otherwise, the script will ask user if to proceed). #
# If sequence size discrepancies between a genbank and its #
# corresponding fasta file are found, error file 'seq_errors.txt' will#
# contain warnings. #
# #
# version 0.2.1, update: 13.07.2015 Andreas Leimbach #
# 15.09.2012 aleimba[at]gmx[dot]de #
#######################################################################
USAGE
;
### Print usage if -h|--h|--help is given as argument or options are not given
my ($db, $ask) = @ARGV; # if $ask not given, run subroutine 'ask_exit' (below) to get input from STDIN
if (!defined $db) {
die $usage;
} elsif ($db =~ m/-h/) {
die $usage;
}
my $err = 'seq_errors.txt'; # Error file will be created if sequence discrepancies between genbank and corresponding fasta files will be found (see sub 'warn_seq')
if (-e $err) { # remove error file from previous run, as it will be opened/created in append mode
unlink $err or die "Can't remove previous error file \'$err\' for new script run: $!\n";
}
### Set the correct directories to start traversion of the folder structure
my $dir_complete; # for complete genomes
my $dir_draft; # for draft genomes
if ($db =~ /genbank/i) {
$db = 'genbank';
$dir_complete = './ftp.ncbi.nlm.nih.gov/genbank/genomes/Bacteria';
$dir_draft = './ftp.ncbi.nlm.nih.gov/genbank/genomes/Bacteria_DRAFT';
ask_exit($db, $ask); # subroutine to ask if the current result folder should be deleted and created new, or exit the script
rm_dir($db); # subroutine to remove the result directory and all its contents prior filling it with new files (in case files have changed)
} elsif ($db =~ /refseq/i) {
$db = 'refseq';
$dir_complete = './ftp.ncbi.nlm.nih.gov/genomes/Bacteria';
$dir_draft = './ftp.ncbi.nlm.nih.gov/genomes/Bacteria_DRAFT';
ask_exit($db, $ask);
rm_dir($db);
} else {
die "\nFatal error: wrong database name, give either 'genbank' or 'refseq' as first argument!\n\n";
}
### Concatenate all genbank and fasta files for COMPLETE genomes in subdirectories and write to the genbank/refseq folders
find ({wanted => \&completes, no_chdir => 1}, $dir_complete); # the function 'find' from File::Find takes a reference to a subroutine and a starting directory to walk through recursively (e.g. 'find (\$concat, $dirname)')
# For each file/directory/link in the tree (also in the current directory) it calls the referenced function and changes to that directory with 'chdir()' --> this can be stopped with 'no_chdir => 1', parameters to find are passed as a hash reference
### Unzip and unpack all DRAFT files with *.tgz in subdirectories and concat in result folders
my %draft_files; # store the filenames of the concatenated files in here (in subroutine 'drafts')
find ({wanted => \&drafts, no_chdir => 1}, $dir_draft);
### For DRAFTS go through all the files in the result folder and keep only gbk files with annotation (either only in contig or scaffold) and the corresponding fastas for each strain; if no annotation exists keep the fasta file with the most sequence information (contig or scaffold); delete the other files (gbk and corresponding fasta) in the result folder
my ($A, $C, $G, $T); # to calculate the total base count (subs 'seq_size' and 'base_count')
my $skip = ''; # not possible to delete elements from a hash while iterating through it, thus need a variable to skip files
foreach my $file (sort{$b cmp $a} keys %draft_files) { # reverse sort to get scaf files first if existent
$skip =~ s/(\S+)\_draft\_(scaf|con)\.\w*$/$1/; # get rid of file-extension to skip for scaffold genomes corresponding contig gbk
if ($file =~ /$skip/) { # $skip stores $file from previous foreach
next;
}
if ($file =~ /\_draft\_scaf\.gbk$/) { # if scaffold file exists
print "\nProcessing DRAFT genome: $file\n"; # status message
my $scaf_gbkfile = my $scaf_fasfile = $file;
$scaf_fasfile =~ s/gbk$/fasta/;
my $con_gbkfile = $scaf_gbkfile;
$con_gbkfile =~ s/\_scaf\./\_con\./;
my $con_fasfile = $con_gbkfile;
$con_fasfile =~ s/gbk$/fasta/;
my $scaf_size = -s "./$db/$scaf_gbkfile"; # file size of the scaffold gbk file
my $con_size;
if (-e "./$db/$con_gbkfile") { # only a scaffold file might exist and not a contig file
$con_size = -s "./$db/$con_gbkfile";
} else {
$con_size = 0;
}
my $scaf_gene = count_gene($scaf_gbkfile); # subroutine to count genes in genbank files
# print "scaf file size: $scaf_size\tscaf_gene: $scaf_gene\n"; # print statement to test output
my $con_gene = count_gene($con_gbkfile); # 0 if '$con_gbkfile' doesn't exist
# print "con file size: $con_size\tcon_gene: $con_gene\n"; # print statement to test output
if ($scaf_gene > $con_gene && $scaf_size > $con_size && $con_gene == 0) { # so many conditions needed? Do they always work?
print "Annotation in scaffolds: Keeping \'$scaf_gbkfile\' and \'$scaf_fasfile\' and removing contig files (if they exist)!\n"; # status message
rm_exist_file($db, $con_gbkfile); # remove the contig-gbk without annotation; subroutine, if only a scaffold file exists and not a contig file
$skip = $file; # scaffold file with annotation found, skip a corresponding contig gbk in %draft_files (if existent)
my $scaf_gbkcount = seq_size($scaf_gbkfile); # subroutine to calculate total bases (A, C, G, T) of multi-genbank or -fasta files
rm_exist_file($db, $con_fasfile); # also delete the corresponding fasta
my $scaf_fascount = seq_size($scaf_fasfile); # count total bases of fasta
# print "scaf_gbkcount: $scaf_gbkcount\tscaf_fascount: $scaf_fascount\n"; # print statement to test output
warn_seq($scaf_fasfile, $scaf_gbkcount, $scaf_fascount); # subroutine to warn if the genbank and the fasta file have differing sequence sizes
next;
} elsif ($con_gene > $scaf_gene && $con_size > $scaf_size && $scaf_gene == 0) {
print "Annotation in contigs: Keeping \'$con_gbkfile\' and \'$con_fasfile\' and removing scaffold files!\n";
unlink "./$db/$scaf_gbkfile" or die "Can't remove file \'$scaf_gbkfile\': $!\n";
$skip = $file;
my $con_gbkcount = seq_size($con_gbkfile);
unlink "./$db/$scaf_fasfile" or die "Can't remove file \'$scaf_fasfile\': $!\n";
my $con_fascount = seq_size($con_fasfile);
# print "con_gbkcount: $con_gbkcount\tcon_fascount: $con_fascount\n"; # print statement to test output
warn_seq($con_fasfile, $con_gbkcount, $con_fascount);
next;
} elsif ($con_gene == 0 && $scaf_gene == 0) { # no annotation in both genbanks, just keep one fasta
print "No annotation in either scaffold or contig file, but ";
unlink "./$db/$scaf_gbkfile" or die "Can't remove file \'$scaf_gbkfile\': $!\n";
rm_exist_file($db, $con_gbkfile);
$skip = $file;
my $scaf_fascount = seq_size($scaf_fasfile);
my $con_fascount = seq_size($con_fasfile); # 0 if '$con_gbkfile' doesn't exist
# print "\nscaf_fascount: $scaf_fascount\tcon_fascount: $con_fascount\n"; # print statement to test output
if ($scaf_fascount > $con_fascount || $scaf_fascount == $con_fascount) { # keep scaffold file if both same base count
print "more/equal sequence information in scaffolds (or contigs don't exist). Thus keeping \'$scaf_fasfile\' and removing contig fasta file!\n";
rm_exist_file($db, $con_fasfile);
} elsif ($con_fascount > $scaf_fascount) {
print "more sequence information in contigs. Thus keeping \'$con_fasfile\' and removing scaffold fasta file!\n";
unlink "./$db/$scaf_fasfile" or die "Can't remove file \'$scaf_fasfile\': $!\n";
}
next;
}
} elsif ($file =~ /\_draft\_con\.gbk$/) { # if no scaffold file exists, contig file exists
print "\nProcessing DRAFT genome: $file\n"; # status message
print "Only contigs exist ";
$skip = $file;
my $con_gene = count_gene($file);
my $con_fasfile = $file;
$con_fasfile =~ s/gbk$/fasta/;
if ($con_gene == 0) { # no annotation exists, only keep the fasta
unlink "./$db/$file" or die "Can't remove file \'$file\': $!\n";
print "but not annotated, thus keeping only fasta file \'$con_fasfile\'!\n";
} else {
print "and annotated, thus keeping contig genbank, \'$file\', and fasta files, \'$con_fasfile\'!\n";
my $con_gbkcount = seq_size($file);
my $con_fascount = seq_size($con_fasfile);
# print "con_gbkcount: $con_gbkcount\tcon_fascount: $con_fascount\n"; # print statement to test output
warn_seq($con_fasfile, $con_gbkcount, $con_fascount);
}
}
}
### Print result folder
print "\n#### All genome files have been written to the folder \'./$db\'!\n";
if (-e $err) {
print "#### Sequence discrepancies between genbank and corresponding fasta files have been found, see file \'$err\'!\n";
}
exit;
###############
# Subroutines #
###############
### Subroutine to ask if the script should proceed with deleting the previous result folder or exit, if $ask not given as ARGV
sub ask_exit {
my ($db, $ask) = @_;
if (!defined($ask) && -e $db) {
print "The script will delete an already existing result folder \'./$db\', proceed [y|n]? ";
$ask = <STDIN>;
} elsif (!defined($ask) && !-e $db) {
$ask = 'y';
}
if ($ask !~ /y/i) {
die "Script aborted!\n";
}
return 1;
}
### Subroutine that counts all bases
sub base_count {
my $seq = shift;
$A = ($seq =~ tr/[aA]//) + $A; # pattern match modifier like 'i' don't work with transliterations
$C = ($seq =~ tr/[cC]//) + $C;
$G = ($seq =~ tr/[gG]//) + $G;
$T = ($seq =~ tr/[tT]//) + $T;
return 1;
}
### Subroutine to concatenate all genbank and fasta files for COMPLETE genomes and store in the result folder
sub completes { # subroutine gets one argument $_, the file/directory seen by 'find'
if (/^\.+$/ || /.*\/Bacteria$/) { # skip system folders (. and ..) and 'Bacteria' folder
return 1;
} elsif (-d) { # '-d' = only directories
my $file = $_; # because of 'no_chdir => 1', $_ contains the full path to the file/dir
$file =~ s/.*\/(\S*)$/$1/; # get the folder name to create the filename
print "\nProcessing COMPLETE genome: $file\n"; # status message
$file =~ s/Escherichia_coli_/Ecoli_/; # shorten final E. coli file names
# $file =~ s/Shigella_/S/; # 'Shigella_D9' ends up as 'SD9'
my $gbk_file = "$file.gbk";
my $fas_file = "$file.fasta";
dir_check($db); # subroutine to test if result directory exists, if not create; ALWAYS for first file since above 'rm_dir'
system("cat $_/*.gbk > ./$db/$gbk_file"); # unix system call for concatenation
system("cat $_/*.fna > ./$db/$fas_file");
my $gbk_count = seq_size($gbk_file); # subroutine to calculate total bases (A, C, G, and T) of genbank or fasta files
my $fas_count = seq_size($fas_file);
# print "gbk_count: $gbk_count\tfas_count: $fas_count\n"; # print statement to test output
warn_seq($fas_file, $gbk_count, $fas_count); # subroutine to warn if the genbank and the fasta file have differing sequence sizes
}
return 1;
}
### Count the genes in genbank files
sub count_gene {
my $file = shift;
my $gene_count = 0;
if (-e "./$db/$file") { # if the file doesn't exist (e.g. only scaffold file and not contig file) $gene_count will remain 0
open(IN, "<./$db/$file") or die "Can't open file \'$file\': $!\n";
while (<IN>) {
$gene_count++ if /\s{3,}gene\s{3,}/; # Some Genbank files only annotated with 'gene' tags (instead of 'gene' and 'CDS'); '\s{3,}' at least three whitespaces
}
close IN;
}
return $gene_count;
}
### Subroutine to check for result directories (./genbank and ./refseq), if they don't exist create; ALWAYS as the prior directory with its contents is deleted at the start of the script with 'rm_dir'
sub dir_check {
my $dir = shift;
if (!-e $db) { # mkdir if not existent
mkdir $db or die "Couldn't create result directory \'$db\': $!\n";
}
return 1;
}
### Subroutine to work through all possible DRAFT files and call next subroutine 'unpack_concat'
sub drafts {
my @result; # stores results of sub 'unpack_concat'
push(@result, unpack_concat('contig.gbk')); # subroutine to unpack and concat respective files; returns nothing if argument (here 'contig.gbk') doesn't fit to filename and $result[x] will be undefined
push(@result, unpack_concat('scaffold.gbk'));
push(@result, unpack_concat('contig.fna'));
push(@result, unpack_concat('scaffold.fna'));
foreach (@result) { # for each succesful subroutine run @result contains one element, i.e. the concatenated filename
$draft_files{$_} = 1; # save filename in hash '%draft_files' if array element '$result[x]' defined
}
return 1;
}
### Subroutine to remove the prior result directories (./genbank and ./refseq) and all their contents, before new files are written in
sub rm_dir {
my $directory = shift;
if (-e $directory) {
opendir (DIR, $directory) or die "Can't opendir $directory: $!"; # system unix call 'rm -rf' is too dangerous
while (defined(my $file = readdir(DIR))) {
if ($file =~ /^\.+$/) { # skip system folders (. and ..)
next;
}
unlink "./$directory/$file" or die "Can't remove file \'$file\': $!\n";
}
close DIR;
rmdir $directory or die "Can't remove directory \'$directory\': $!\n"; # delete empty directory
return 1;
}
return 0;
}
### Test for file existence and remove file
sub rm_exist_file {
my ($db, $file) = @_;
if (-e "./$db/$file") {
unlink "./$db/$file" or die "Can't remove file \'$file\': $!\n";
}
return 1;
}
### Subroutine to remove the unpacked files and clean up
sub rm_files {
my ($directory, $assembly) = @_;
opendir (DIR, $directory) or die "Can't opendir \'$directory\' to remove the unpacked files: $!";
$assembly =~ s/.*(\..*)$/$1/; # get the file-extension to remove files
while (defined(my $file = readdir(DIR))) {
if ($file =~ /.*$assembly$/) { # Why does it not work to include: '-f $file &&'?
unlink "$directory/$file";
}
}
close DIR;
return 1;
}
### Calculate the total sequence in multi-genbanks and -fastas
sub seq_size {
my $file = shift;
my $seq_count = 0;
if (-e "./$db/$file") { # if the file doesn't exist (e.g. only scaffold file and not contig file) $seq_count will remain 0
$A = $C = $G = $T = 0;
open(IN, "<./$db/$file") or die "Can't open file \'$file\': $!\n";
if ($file =~ /.*\.gbk/) {
while (my $seq = <IN>) {
if ($seq =~ /^ORIGIN\s*$/) {
$seq = <IN>; # don't want the ORIGIN line, but the next one onwards until '//'
while ($seq !~ /\/\//) {
base_count($seq);
$seq = <IN>;
}
}
}
} elsif ($file =~ /.*\.fasta$/) {
while (my $seq = <IN>) {
if ($seq =~ /^>/) {
next;
}
base_count($seq);
}
}
close IN;
$seq_count = $A + $C + $G + $T;
}
return $seq_count;
}
### Subroutine to unzip, unpack all DRAFT files with *.tgz, and concatenate the contigs/scaffolds in the result folder
sub unpack_concat {
my $assembly = shift;
if (-f && /.*\.$assembly\.tgz/) { # only if $_ is a file (-f) and fits to $assembly.tgz; because of 'no_chdir => 1' $_ contains the full path to the file (with filename)
my $file = $_;
my $directory = $File::Find::dir; # internal variable for File::Find, contains the directory path to the file
system("tar xfz $file --directory=$directory"); # system call to unpack the $file.tgz
$file = $File::Find::dir; # replace path+file to just directory path
$file =~ s/.*\/(\S*)$/$1/; # get the folder name to create the filename
$file =~ s/Escherichia_coli_/Ecoli_/; # shorten final E. coli file names
# $file =~ s/Shigella_/S/; # 'Shigella_D9' ends up as 'SD9'
dir_check($db); # subroutine to test if result directory exists, if not create; ALWAYS for first file since above 'rm_dir'
if ($assembly =~ /contig.gbk/) {
$file = $file . '_draft_con.gbk';
system("cat $directory/*.gbk > ./$db/$file");
rm_files($directory, $assembly); # subroutine to remove unpacked files
return $file; # @results in sub 'drafts' defined
} elsif ($assembly =~ /scaffold.gbk/) {
$file = $file . '_draft_scaf.gbk';
system("cat $directory/*.gbk > ./$db/$file");
rm_files($directory, $assembly);
return $file;
} elsif ($assembly =~ /contig.fna/) {
$file = $file . '_draft_con.fasta';
system("cat $directory/*.fna > ./$db/$file");
rm_files($directory, $assembly);
return $file;
} elsif ($assembly =~ /scaffold.fna/) {
$file = $file . '_draft_scaf.fasta';
system("cat $directory/*.fna > ./$db/$file");
rm_files($directory, $assembly);
return $file;
}
}
return; # return undefined to scan @results for the concatenated file in subroutine 'drafts'
}
### Subroutine to warn if the genbank and corresponding fasta file have differing sequence
sub warn_seq {
my ($file, $gbk_size, $fasta_size) = @_;
if ($gbk_size != $fasta_size) {
$file =~ s/\.fasta//; # get rid of file-extension
open (ERR, ">>$err");
print ERR "There is a difference in the sequence length of corresponding files \'$file\.gbk: $gbk_size\' and \'$file\.fasta: $fasta_size\'!\n\n";
close ERR;
}
return 1;
}