Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
xiaoli-dong committed Aug 2, 2019
0 parents commit 37ffb36
Show file tree
Hide file tree
Showing 13 changed files with 65,551 additions and 0 deletions.
9 changes: 9 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
*~
*/*~
*/*/*~
db/tmp
db/blast/silva*
examples/contigs.500.fasta
examples/cyano.fna
examples/test_output
examples/\.*
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
rRNAFinder is a small software package, which can be used to automatically predicts and annotates ribosome RNA using only assembled genome/metagenome contigs as input. It reports the location, type of the
genes, and their taxonomic assignments. The outputs of the gene predictions are gff files and a set of fasta format sequence files, which are 16SrRNA.ffn, 18SrRNA.ffn, 5SrRNA.ffn, 5_8SrRNA.ffn, 23SrRNA.f
fn, and 28SrRNA.ffn. It uses nhmmer program search against a hmm database to do the rRNA gene predication.
The predicted 16s, 18s, 23s, and 28s rNRA fasta sequences can be further annotated by searching against SILVA database to get he taxonomic assignment.

Features:
The program can use multiple threads and runs very fast.
Predict rRNA genes (5s, 5.8s, 16s, 18s, 23s, 28s rRNA genes)
Taxonomic assignment of the predicted rRNA genes

Example command
#RNA prediction, produce rRNA.fasta and rRA.gff.txt file
perl bin/rRNAFinder.pl examples/test.fasta


84 changes: 84 additions & 0 deletions bin/lca.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#!/usr/bin/env perl
use warnings;
use strict;

my @taxons = ();
my $index = 0;

while (<DATA>){
chomp;
my @terms = split(/;/, $_);
next if @terms == 0;
@{$taxons[$index]} = @terms;
print join(";", @{$taxons[$index]});
print "\n";
$index++;
}

my $com = lca(@taxons);
print join(";", @$com);
print "\n";
# Get list of species names, find their Tax IDs, then find the Lowest Common Ancestor (LCA)
sub lca {
my @taxids = @_;

my $size = @taxids;
my $test = $taxids[0];
#print STDERR "lca size=$size\n";
#print STDERR "first*********=@$test\n";


for (my $index = 0; $index <$size; $index++){
#$test = $taxids[$index];
#print STDERR "index=$index*********=@$test\n";
}

if (@taxids == 0) {return 0;} # Empty list? Not assigned=0

@taxids=grep {defined} @taxids; # screen out undefined values

if (@taxids==0) {die("undefined values snuck into the taxid list; help?");} # how did undefined values get on the list?
#print " taxid3: ",(map {my @z=@$_;sprintf("%7s ",$z[$#z])} @taxids),"\n";

if (@taxids==0) {return -1;} # No hits
#print " taxid4: ",(map {my @z=@$_;sprintf("%7s ",$z[$#z])} @taxids),"\n";

# Find the lca
my $n;
for ($n=0;$n<100;$n++) {
my @col;
my $size = @taxids;

for (my $m=0,@col=();$m<@taxids;$m++) { push @col,$taxids[$m][$n]; }
if (!defined($col[0]) or !unanimous(@col)) {last;}
}
#if ($n<=0) {print STDERR "n=$n\n"; die("Impossible Error: Lca algorithm couldn't find any nodes in common!? Data:\n".Dumper(\@_,\@taxids)." ");}

my @slice = ();

for(my $i = 0; $i < $n; $i++){
push (@slice, $taxids[0][$i]);
}

#print STDERR "n=$n, common nodes=", $taxids[0][$n-1], "\n";
#print STDERR "n*******=$n, common nodes=@slice\n";

return \@slice;


#return $taxids[0][$n-1]; #Return the taxid of the last common node
}
# Return 1 if the array is unanimous, 0 if not
sub unanimous {
if (!defined $_[0]) { # Unanimously 'undef'ined
if (grep {defined} @_) {return 0;}
return 1;
}
if (grep {!defined or $_ ne $_[0]} @_) {return 0;}
return 1;
}

__DATA__
Bacteria;Patescibacteria;Gracilibacteria;Absconditabacteriales (SR1);uncultured bacterium
Bacteria;Patescibacteria;Gracilibacteria;Absconditabacteriales (SR1);uncultured organism
Bacteria;Patescibacteria;Gracilibacteria;Absconditabacteriales
173 changes: 173 additions & 0 deletions bin/make_hmm_db.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
#!/usr/bin/env perl
use File::Fetch;
use Archive::Extract;
use FindBin qw($Bin);
use strict;
use warnings;

my $now_string = localtime; # e.g., "Thu Oct 13 04:54:34 1994"
print STDERR "build database at: $now_string\n";
my $bin = "$FindBin::Bin";
my $tmpdir = "$bin/../db/tmp";
my $hmm_dir = "$bin/../db/hmm";
print STDERR "hmmdb_dir=$hmm_dir\n";

my $fp = find_exe("hmmbuild");
if(!$fp){
err("Cannot find hmmbuild tool");
}

my $rfam = "Rfam.seed";
if(! -e "$tmpdir/$rfam.gz"){
my $ff = File::Fetch->new(uri => "ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/$rfam.gz");

### fetch the uri to local directory###
print STDERR "Fetching $rfam.gz\n";
my $where = $ff->fetch(to => $tmpdir) or die $ff->error;
}
if(! -e "$tmpdir/$rfam"){
my $ae = Archive::Extract->new(archive =>"$tmpdir/$rfam.gz");
$ae->extract(to => $tmpdir);
}
my %queries = (

"5S_rRNA" => "$tmpdir/5S_rRNA.rfam.align.fasta",
"5_8S_rRNA" =>"$tmpdir/5_8S_rRNA.rfam.align.fasta",
"SSU_rRNA_bacteria" =>"$tmpdir/SSU_rRNA_bacteria.rfam.align.fasta",
"SSU_rRNA_archaea" =>"$tmpdir/SSU_rRNA_archaea.rfam.align.fasta",
"SSU_rRNA_eukarya" =>"$tmpdir/SSU_rRNA_eukarya.rfam.align.fasta",
"LSU_rRNA_archaea" =>"$tmpdir/LSU_rRNA_archaea.rfam.align.fasta",
"LSU_rRNA_bacteria" =>"$tmpdir/LSU_rRNA_bacteria.rfam.align.fasta",
"LSU_rRNA_eukarya" =>"$tmpdir/LSU_rRNA_eukarya.rfam.align.fasta"
);
open(STOCK, "$tmpdir/$rfam") or die "cannot open $rfam for reading: $!";

my $i = 0;
$/ = "\n\//";

foreach my $query (keys %queries){

my $outname = $queries{$query};
open(OUT, ">$outname") or die "cannot open $outname for writting: $!\n";
while(<STOCK>){
chomp;
if(/\#=GF\s+ID\s+$query/s){
my @lines = split(/\n/, $_);
foreach my $line (@lines){
if($line =~ /^\#/ || $line !~ /\S/){
next;
}
else{
#seqid alignment
my @a = split(/\s+/, $line);
print OUT ">$a[0]\n$a[1]\n";
}

}

}
}
close(OUT);
seek STOCK, 0, 0;
}
$/ = "\n";
close(STOCK);

fasta2domain("5S_rRNA.rfam.align.fasta", $tmpdir);
fasta2domain("5_8S_rRNA.rfam.align.fasta", $tmpdir);

my %hmms = (
"arc_5SrRNA" => "$tmpdir/arc_5S_rRNA.rfam.align.fasta",
"euk_5SrRNA" => "$tmpdir/euk_5S_rRNA.rfam.align.fasta",
"bac_5SrRNA" => "$tmpdir/bac_5S_rRNA.rfam.align.fasta",
"euk_5_8SrRNA" => "$tmpdir/euk_5_8S_rRNA.rfam.align.fasta",
"arc_16SrRNA" => "$tmpdir/SSU_rRNA_archaea.rfam.align.fasta",
"bac_16SrRNA" => "$tmpdir/SSU_rRNA_bacteria.rfam.align.fasta",
"euk_18SrRNA" => "$tmpdir/SSU_rRNA_eukarya.rfam.align.fasta",
"arc_23SrRNA" => "$tmpdir/LSU_rRNA_archaea.rfam.align.fasta",
"bac_23SrRNA" => "$tmpdir/LSU_rRNA_bacteria.rfam.align.fasta",
"euk_28SrRNA" => "$tmpdir/LSU_rRNA_eukarya.rfam.align.fasta"
);

my $cmd = "";
foreach my $name (keys %hmms){
my $alignFile = $hmms{$name};
$cmd .= "hmmbuild --rna -n $name $tmpdir/$name.hmm $alignFile;"
}

print STDERR "cmd=$cmd\n";
system ($cmd) >> 8 and die "Could not execute cmd=$cmd, $!\n";
$cmd = "cat $tmpdir/arc_16SrRNA.hmm $tmpdir/arc_23SrRNA.hmm $tmpdir/arc_5SrRNA.hmm > $hmm_dir/arc.hmm;";
$cmd .= "cat $tmpdir/bac_16SrRNA.hmm $tmpdir/bac_23SrRNA.hmm $tmpdir/bac_5SrRNA.hmm > $hmm_dir/bac.hmm;";
$cmd .= "cat $tmpdir/euk_18SrRNA.hmm $tmpdir/euk_28SrRNA.hmm $tmpdir/euk_5_8SrRNA.hmm $tmpdir/euk_5SrRNA.hmm > $hmm_dir/euk.hmm;";

print STDERR "cmd=$cmd\n";
system ($cmd) >> 8 and die "Could not execute cmd=$cmd, $!\n";

my @tmp_files = glob "$tmpdir/*";
unlink glob "'$tmpdir/*'";
#rmdir $tmpdir or warn "couldn't rmdir $tmpdir: $!\n";


sub fasta2domain{
my ($fasta, $tmpdir) = @_;
use Bio::DB::EUtilities;
use Bio::SeqIO;

open (FASTA, "$tmpdir/$fasta") or die "Could not open $tmpdir/$fasta to read, $!\n";
$/ = "\n>";
open (BAC, ">$tmpdir/bac\_$fasta") or die "Could not open $tmpdir\/bac\_$fasta for writting, $!\n";
open (ARC, ">$tmpdir\/arc\_$fasta") or die "Could not open $tmpdir\/arc\_$fasta for writting, $!\n";;
open (EUK, ">$tmpdir\/euk\_$fasta") or die "Could not open $tmpdir\/euk\_$fasta for writting, $!\n";;

while(<FASTA>){
chomp;
if(my ($seqid,$other, $align) = /^>?(\S+?)(\/.*?)\n(.*)/s){
my $factory = Bio::DB::EUtilities->new(-eutil => 'efetch',
-db => 'nucleotide',
-rettype => 'gb',
-email => '[email protected]',
-id => $seqid);
my $file = "$tmpdir/myseqs.gb";

# dump HTTP::Response content to a file (not retained in memory)
$factory->get_Response(-file => $file);
my $seqin = Bio::SeqIO->new(-file => $file,
-format => 'genbank');

while (my $seq = $seqin->next_seq) {
my @classification = $seq->species->classification;
my $lineage = join('\t', @classification);
if($lineage =~ /Bacteria/i){
print BAC ">$seqid$other\n$align\n";
}
elsif($lineage =~ /Archaea/i){
print ARC ">$seqid$other\n$align\n";
}
elsif($lineage =~ /Eukaryota/i){
print EUK ">$seqid$other\n$align\n";
}
}

}

}

$/ = "\n";
close(FASTA);
close(BAC);
close(ARC);
close(EUK);
}
sub find_exe {
my($bin) = shift;
for my $dir (File::Spec->path) {
my $exe = File::Spec->catfile($dir, $bin);

if(-x $exe){

return $exe;
}
}
return;
}
Loading

0 comments on commit 37ffb36

Please sign in to comment.