-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 37ffb36
Showing
13 changed files
with
65,551 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
*~ | ||
*/*~ | ||
*/*/*~ | ||
db/tmp | ||
db/blast/silva* | ||
examples/contigs.500.fasta | ||
examples/cyano.fna | ||
examples/test_output | ||
examples/\.* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
rRNAFinder is a small software package, which can be used to automatically predicts and annotates ribosome RNA using only assembled genome/metagenome contigs as input. It reports the location, type of the | ||
genes, and their taxonomic assignments. The outputs of the gene predictions are gff files and a set of fasta format sequence files, which are 16SrRNA.ffn, 18SrRNA.ffn, 5SrRNA.ffn, 5_8SrRNA.ffn, 23SrRNA.f | ||
fn, and 28SrRNA.ffn. It uses nhmmer program search against a hmm database to do the rRNA gene predication. | ||
The predicted 16s, 18s, 23s, and 28s rNRA fasta sequences can be further annotated by searching against SILVA database to get he taxonomic assignment. | ||
|
||
Features: | ||
The program can use multiple threads and runs very fast. | ||
Predict rRNA genes (5s, 5.8s, 16s, 18s, 23s, 28s rRNA genes) | ||
Taxonomic assignment of the predicted rRNA genes | ||
|
||
Example command | ||
#RNA prediction, produce rRNA.fasta and rRA.gff.txt file | ||
perl bin/rRNAFinder.pl examples/test.fasta | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
#!/usr/bin/env perl | ||
use warnings; | ||
use strict; | ||
|
||
my @taxons = (); | ||
my $index = 0; | ||
|
||
while (<DATA>){ | ||
chomp; | ||
my @terms = split(/;/, $_); | ||
next if @terms == 0; | ||
@{$taxons[$index]} = @terms; | ||
print join(";", @{$taxons[$index]}); | ||
print "\n"; | ||
$index++; | ||
} | ||
|
||
my $com = lca(@taxons); | ||
print join(";", @$com); | ||
print "\n"; | ||
# Get list of species names, find their Tax IDs, then find the Lowest Common Ancestor (LCA) | ||
sub lca { | ||
my @taxids = @_; | ||
|
||
my $size = @taxids; | ||
my $test = $taxids[0]; | ||
#print STDERR "lca size=$size\n"; | ||
#print STDERR "first*********=@$test\n"; | ||
|
||
|
||
for (my $index = 0; $index <$size; $index++){ | ||
#$test = $taxids[$index]; | ||
#print STDERR "index=$index*********=@$test\n"; | ||
} | ||
|
||
if (@taxids == 0) {return 0;} # Empty list? Not assigned=0 | ||
|
||
@taxids=grep {defined} @taxids; # screen out undefined values | ||
|
||
if (@taxids==0) {die("undefined values snuck into the taxid list; help?");} # how did undefined values get on the list? | ||
#print " taxid3: ",(map {my @z=@$_;sprintf("%7s ",$z[$#z])} @taxids),"\n"; | ||
|
||
if (@taxids==0) {return -1;} # No hits | ||
#print " taxid4: ",(map {my @z=@$_;sprintf("%7s ",$z[$#z])} @taxids),"\n"; | ||
|
||
# Find the lca | ||
my $n; | ||
for ($n=0;$n<100;$n++) { | ||
my @col; | ||
my $size = @taxids; | ||
|
||
for (my $m=0,@col=();$m<@taxids;$m++) { push @col,$taxids[$m][$n]; } | ||
if (!defined($col[0]) or !unanimous(@col)) {last;} | ||
} | ||
#if ($n<=0) {print STDERR "n=$n\n"; die("Impossible Error: Lca algorithm couldn't find any nodes in common!? Data:\n".Dumper(\@_,\@taxids)." ");} | ||
|
||
my @slice = (); | ||
|
||
for(my $i = 0; $i < $n; $i++){ | ||
push (@slice, $taxids[0][$i]); | ||
} | ||
|
||
#print STDERR "n=$n, common nodes=", $taxids[0][$n-1], "\n"; | ||
#print STDERR "n*******=$n, common nodes=@slice\n"; | ||
|
||
return \@slice; | ||
|
||
|
||
#return $taxids[0][$n-1]; #Return the taxid of the last common node | ||
} | ||
# Return 1 if the array is unanimous, 0 if not | ||
sub unanimous { | ||
if (!defined $_[0]) { # Unanimously 'undef'ined | ||
if (grep {defined} @_) {return 0;} | ||
return 1; | ||
} | ||
if (grep {!defined or $_ ne $_[0]} @_) {return 0;} | ||
return 1; | ||
} | ||
|
||
__DATA__ | ||
Bacteria;Patescibacteria;Gracilibacteria;Absconditabacteriales (SR1);uncultured bacterium | ||
Bacteria;Patescibacteria;Gracilibacteria;Absconditabacteriales (SR1);uncultured organism | ||
Bacteria;Patescibacteria;Gracilibacteria;Absconditabacteriales |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,173 @@ | ||
#!/usr/bin/env perl | ||
use File::Fetch; | ||
use Archive::Extract; | ||
use FindBin qw($Bin); | ||
use strict; | ||
use warnings; | ||
|
||
my $now_string = localtime; # e.g., "Thu Oct 13 04:54:34 1994" | ||
print STDERR "build database at: $now_string\n"; | ||
my $bin = "$FindBin::Bin"; | ||
my $tmpdir = "$bin/../db/tmp"; | ||
my $hmm_dir = "$bin/../db/hmm"; | ||
print STDERR "hmmdb_dir=$hmm_dir\n"; | ||
|
||
my $fp = find_exe("hmmbuild"); | ||
if(!$fp){ | ||
err("Cannot find hmmbuild tool"); | ||
} | ||
|
||
my $rfam = "Rfam.seed"; | ||
if(! -e "$tmpdir/$rfam.gz"){ | ||
my $ff = File::Fetch->new(uri => "ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/$rfam.gz"); | ||
|
||
### fetch the uri to local directory### | ||
print STDERR "Fetching $rfam.gz\n"; | ||
my $where = $ff->fetch(to => $tmpdir) or die $ff->error; | ||
} | ||
if(! -e "$tmpdir/$rfam"){ | ||
my $ae = Archive::Extract->new(archive =>"$tmpdir/$rfam.gz"); | ||
$ae->extract(to => $tmpdir); | ||
} | ||
my %queries = ( | ||
|
||
"5S_rRNA" => "$tmpdir/5S_rRNA.rfam.align.fasta", | ||
"5_8S_rRNA" =>"$tmpdir/5_8S_rRNA.rfam.align.fasta", | ||
"SSU_rRNA_bacteria" =>"$tmpdir/SSU_rRNA_bacteria.rfam.align.fasta", | ||
"SSU_rRNA_archaea" =>"$tmpdir/SSU_rRNA_archaea.rfam.align.fasta", | ||
"SSU_rRNA_eukarya" =>"$tmpdir/SSU_rRNA_eukarya.rfam.align.fasta", | ||
"LSU_rRNA_archaea" =>"$tmpdir/LSU_rRNA_archaea.rfam.align.fasta", | ||
"LSU_rRNA_bacteria" =>"$tmpdir/LSU_rRNA_bacteria.rfam.align.fasta", | ||
"LSU_rRNA_eukarya" =>"$tmpdir/LSU_rRNA_eukarya.rfam.align.fasta" | ||
); | ||
open(STOCK, "$tmpdir/$rfam") or die "cannot open $rfam for reading: $!"; | ||
|
||
my $i = 0; | ||
$/ = "\n\//"; | ||
|
||
foreach my $query (keys %queries){ | ||
|
||
my $outname = $queries{$query}; | ||
open(OUT, ">$outname") or die "cannot open $outname for writting: $!\n"; | ||
while(<STOCK>){ | ||
chomp; | ||
if(/\#=GF\s+ID\s+$query/s){ | ||
my @lines = split(/\n/, $_); | ||
foreach my $line (@lines){ | ||
if($line =~ /^\#/ || $line !~ /\S/){ | ||
next; | ||
} | ||
else{ | ||
#seqid alignment | ||
my @a = split(/\s+/, $line); | ||
print OUT ">$a[0]\n$a[1]\n"; | ||
} | ||
|
||
} | ||
|
||
} | ||
} | ||
close(OUT); | ||
seek STOCK, 0, 0; | ||
} | ||
$/ = "\n"; | ||
close(STOCK); | ||
|
||
fasta2domain("5S_rRNA.rfam.align.fasta", $tmpdir); | ||
fasta2domain("5_8S_rRNA.rfam.align.fasta", $tmpdir); | ||
|
||
my %hmms = ( | ||
"arc_5SrRNA" => "$tmpdir/arc_5S_rRNA.rfam.align.fasta", | ||
"euk_5SrRNA" => "$tmpdir/euk_5S_rRNA.rfam.align.fasta", | ||
"bac_5SrRNA" => "$tmpdir/bac_5S_rRNA.rfam.align.fasta", | ||
"euk_5_8SrRNA" => "$tmpdir/euk_5_8S_rRNA.rfam.align.fasta", | ||
"arc_16SrRNA" => "$tmpdir/SSU_rRNA_archaea.rfam.align.fasta", | ||
"bac_16SrRNA" => "$tmpdir/SSU_rRNA_bacteria.rfam.align.fasta", | ||
"euk_18SrRNA" => "$tmpdir/SSU_rRNA_eukarya.rfam.align.fasta", | ||
"arc_23SrRNA" => "$tmpdir/LSU_rRNA_archaea.rfam.align.fasta", | ||
"bac_23SrRNA" => "$tmpdir/LSU_rRNA_bacteria.rfam.align.fasta", | ||
"euk_28SrRNA" => "$tmpdir/LSU_rRNA_eukarya.rfam.align.fasta" | ||
); | ||
|
||
my $cmd = ""; | ||
foreach my $name (keys %hmms){ | ||
my $alignFile = $hmms{$name}; | ||
$cmd .= "hmmbuild --rna -n $name $tmpdir/$name.hmm $alignFile;" | ||
} | ||
|
||
print STDERR "cmd=$cmd\n"; | ||
system ($cmd) >> 8 and die "Could not execute cmd=$cmd, $!\n"; | ||
$cmd = "cat $tmpdir/arc_16SrRNA.hmm $tmpdir/arc_23SrRNA.hmm $tmpdir/arc_5SrRNA.hmm > $hmm_dir/arc.hmm;"; | ||
$cmd .= "cat $tmpdir/bac_16SrRNA.hmm $tmpdir/bac_23SrRNA.hmm $tmpdir/bac_5SrRNA.hmm > $hmm_dir/bac.hmm;"; | ||
$cmd .= "cat $tmpdir/euk_18SrRNA.hmm $tmpdir/euk_28SrRNA.hmm $tmpdir/euk_5_8SrRNA.hmm $tmpdir/euk_5SrRNA.hmm > $hmm_dir/euk.hmm;"; | ||
|
||
print STDERR "cmd=$cmd\n"; | ||
system ($cmd) >> 8 and die "Could not execute cmd=$cmd, $!\n"; | ||
|
||
my @tmp_files = glob "$tmpdir/*"; | ||
unlink glob "'$tmpdir/*'"; | ||
#rmdir $tmpdir or warn "couldn't rmdir $tmpdir: $!\n"; | ||
|
||
|
||
sub fasta2domain{ | ||
my ($fasta, $tmpdir) = @_; | ||
use Bio::DB::EUtilities; | ||
use Bio::SeqIO; | ||
|
||
open (FASTA, "$tmpdir/$fasta") or die "Could not open $tmpdir/$fasta to read, $!\n"; | ||
$/ = "\n>"; | ||
open (BAC, ">$tmpdir/bac\_$fasta") or die "Could not open $tmpdir\/bac\_$fasta for writting, $!\n"; | ||
open (ARC, ">$tmpdir\/arc\_$fasta") or die "Could not open $tmpdir\/arc\_$fasta for writting, $!\n";; | ||
open (EUK, ">$tmpdir\/euk\_$fasta") or die "Could not open $tmpdir\/euk\_$fasta for writting, $!\n";; | ||
|
||
while(<FASTA>){ | ||
chomp; | ||
if(my ($seqid,$other, $align) = /^>?(\S+?)(\/.*?)\n(.*)/s){ | ||
my $factory = Bio::DB::EUtilities->new(-eutil => 'efetch', | ||
-db => 'nucleotide', | ||
-rettype => 'gb', | ||
-email => '[email protected]', | ||
-id => $seqid); | ||
my $file = "$tmpdir/myseqs.gb"; | ||
|
||
# dump HTTP::Response content to a file (not retained in memory) | ||
$factory->get_Response(-file => $file); | ||
my $seqin = Bio::SeqIO->new(-file => $file, | ||
-format => 'genbank'); | ||
|
||
while (my $seq = $seqin->next_seq) { | ||
my @classification = $seq->species->classification; | ||
my $lineage = join('\t', @classification); | ||
if($lineage =~ /Bacteria/i){ | ||
print BAC ">$seqid$other\n$align\n"; | ||
} | ||
elsif($lineage =~ /Archaea/i){ | ||
print ARC ">$seqid$other\n$align\n"; | ||
} | ||
elsif($lineage =~ /Eukaryota/i){ | ||
print EUK ">$seqid$other\n$align\n"; | ||
} | ||
} | ||
|
||
} | ||
|
||
} | ||
|
||
$/ = "\n"; | ||
close(FASTA); | ||
close(BAC); | ||
close(ARC); | ||
close(EUK); | ||
} | ||
sub find_exe { | ||
my($bin) = shift; | ||
for my $dir (File::Spec->path) { | ||
my $exe = File::Spec->catfile($dir, $bin); | ||
|
||
if(-x $exe){ | ||
|
||
return $exe; | ||
} | ||
} | ||
return; | ||
} |
Oops, something went wrong.