first commit

xiaoli-dong · Aug 2, 2019 · 37ffb36 · 37ffb36
commit 37ffb36
Show file tree

Hide file tree

Showing 13 changed files with 65,551 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,9 @@
+*~
+*/*~
+*/*/*~
+db/tmp
+db/blast/silva*
+examples/contigs.500.fasta
+examples/cyano.fna
+examples/test_output
+examples/\.*
diff --git a/README.md b/README.md
@@ -0,0 +1,15 @@
+rRNAFinder is a small software package, which can be used to automatically predicts and annotates ribosome RNA using only assembled genome/metagenome contigs as input. It reports the location, type of the
+ genes, and their taxonomic assignments. The outputs of the gene predictions are gff files and a set of fasta format sequence files, which are 16SrRNA.ffn, 18SrRNA.ffn, 5SrRNA.ffn, 5_8SrRNA.ffn, 23SrRNA.f
+fn, and 28SrRNA.ffn. It uses nhmmer program search against a hmm database to do the rRNA gene predication.
+The predicted 16s, 18s, 23s, and 28s rNRA fasta sequences can be further annotated by searching against SILVA database to get he taxonomic assignment.
+
+Features:
+The program can use multiple threads and runs very fast.
+Predict rRNA genes (5s, 5.8s, 16s, 18s, 23s, 28s rRNA genes)
+Taxonomic assignment of the predicted rRNA genes
+
+Example command
+#RNA prediction, produce rRNA.fasta and rRA.gff.txt file
+perl bin/rRNAFinder.pl  examples/test.fasta
+
+
diff --git a/bin/lca.pl b/bin/lca.pl
@@ -0,0 +1,84 @@
+#!/usr/bin/env perl
+use warnings;
+use strict;
+
+my @taxons = ();
+my $index = 0;
+
+while (<DATA>){
+    chomp;
+    my @terms = split(/;/, $_);
+    next if @terms == 0;
+    @{$taxons[$index]} = @terms;
+    print join(";", @{$taxons[$index]});
+    print "\n";
+    $index++;
+}
+
+my $com = lca(@taxons);
+print join(";", @$com);
+print "\n";
+# Get list of species names, find their Tax IDs, then find the Lowest Common Ancestor (LCA)
+sub lca {
+    my @taxids = @_;
+
+    my $size = @taxids;
+    my $test = $taxids[0];
+    #print STDERR "lca size=$size\n";
+    #print STDERR "first*********=@$test\n";
+
+
+    for (my $index = 0; $index <$size; $index++){
+	#$test = $taxids[$index];
+	#print STDERR "index=$index*********=@$test\n";
+    }
+
+    if (@taxids == 0) {return 0;} # Empty list? Not assigned=0
+
+    @taxids=grep {defined} @taxids;  # screen out undefined values
+
+    if (@taxids==0) {die("undefined values snuck into the taxid list; help?");} # how did undefined values get on the list?
+    #print "  taxid3: ",(map {my @z=@$_;sprintf("%7s ",$z[$#z])} @taxids),"\n";
+
+    if (@taxids==0) {return -1;} # No hits
+    #print "  taxid4: ",(map {my @z=@$_;sprintf("%7s ",$z[$#z])} @taxids),"\n";
+
+    # Find the lca
+    my $n;
+    for ($n=0;$n<100;$n++) {
+	my @col;
+	my $size = @taxids;
+
+	for (my $m=0,@col=();$m<@taxids;$m++) { push @col,$taxids[$m][$n]; }
+	if (!defined($col[0]) or !unanimous(@col)) {last;}
+    }
+    #if ($n<=0) {print STDERR "n=$n\n"; die("Impossible Error: Lca algorithm couldn't find any nodes in common!?  Data:\n".Dumper(\@_,\@taxids)." ");}
+
+    my @slice = ();
+
+    for(my $i = 0; $i < $n; $i++){
+	push (@slice, $taxids[0][$i]);
+    }
+
+    #print STDERR "n=$n, common nodes=", $taxids[0][$n-1], "\n";
+    #print STDERR "n*******=$n, common nodes=@slice\n";
+
+    return \@slice;
+
+
+    #return $taxids[0][$n-1]; #Return the taxid of the last common node
+}
+# Return 1 if the array is unanimous, 0 if not
+sub unanimous {
+  if (!defined $_[0]) {  # Unanimously 'undef'ined
+    if (grep {defined} @_) {return 0;}
+    return 1;
+  }
+  if (grep {!defined or $_ ne $_[0]} @_) {return 0;}
+  return 1;
+}
+
+__DATA__
+Bacteria;Patescibacteria;Gracilibacteria;Absconditabacteriales (SR1);uncultured bacterium
+Bacteria;Patescibacteria;Gracilibacteria;Absconditabacteriales (SR1);uncultured organism
+Bacteria;Patescibacteria;Gracilibacteria;Absconditabacteriales
diff --git a/bin/make_hmm_db.pl b/bin/make_hmm_db.pl
@@ -0,0 +1,173 @@
+#!/usr/bin/env perl
+use File::Fetch;
+use Archive::Extract;
+use FindBin qw($Bin);
+use strict;
+use warnings;
+
+my $now_string = localtime;  # e.g., "Thu Oct 13 04:54:34 1994"
+print STDERR "build database at: $now_string\n";
+my $bin = "$FindBin::Bin";
+my $tmpdir = "$bin/../db/tmp";
+my $hmm_dir = "$bin/../db/hmm";
+print STDERR "hmmdb_dir=$hmm_dir\n";
+
+my $fp = find_exe("hmmbuild");
+if(!$fp){
+    err("Cannot find hmmbuild tool");
+}
+
+my $rfam = "Rfam.seed";
+if(! -e "$tmpdir/$rfam.gz"){
+    my $ff = File::Fetch->new(uri => "ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/$rfam.gz");
+
+### fetch the uri to local directory###
+    print STDERR "Fetching $rfam.gz\n";
+    my $where = $ff->fetch(to => $tmpdir) or die $ff->error;
+}
+if(! -e "$tmpdir/$rfam"){
+    my $ae = Archive::Extract->new(archive =>"$tmpdir/$rfam.gz");
+    $ae->extract(to => $tmpdir);
+}
+my %queries = (
+
+    "5S_rRNA" => "$tmpdir/5S_rRNA.rfam.align.fasta",
+    "5_8S_rRNA" =>"$tmpdir/5_8S_rRNA.rfam.align.fasta",
+    "SSU_rRNA_bacteria" =>"$tmpdir/SSU_rRNA_bacteria.rfam.align.fasta",
+    "SSU_rRNA_archaea" =>"$tmpdir/SSU_rRNA_archaea.rfam.align.fasta",
+    "SSU_rRNA_eukarya" =>"$tmpdir/SSU_rRNA_eukarya.rfam.align.fasta",
+    "LSU_rRNA_archaea" =>"$tmpdir/LSU_rRNA_archaea.rfam.align.fasta",
+    "LSU_rRNA_bacteria" =>"$tmpdir/LSU_rRNA_bacteria.rfam.align.fasta",
+    "LSU_rRNA_eukarya" =>"$tmpdir/LSU_rRNA_eukarya.rfam.align.fasta"
+    );
+open(STOCK, "$tmpdir/$rfam") or die "cannot open $rfam for reading: $!";
+
+my $i = 0;
+$/ = "\n\//";
+
+foreach my $query (keys %queries){
+
+    my $outname = $queries{$query};
+    open(OUT, ">$outname") or die "cannot open $outname for writting: $!\n";
+    while(<STOCK>){
+	chomp;
+	if(/\#=GF\s+ID\s+$query/s){
+	    my @lines = split(/\n/, $_);
+	    foreach my $line (@lines){
+		if($line =~ /^\#/ || $line !~ /\S/){
+		    next;
+		} 	
+		else{	
+		    #seqid alignment
+		    my @a = split(/\s+/, $line);
+		    print OUT ">$a[0]\n$a[1]\n";
+		}	
+
+	    }
+
+	}
+    }
+    close(OUT);
+    seek STOCK, 0, 0;
+}
+$/ = "\n";
+close(STOCK);
+
+fasta2domain("5S_rRNA.rfam.align.fasta", $tmpdir);
+fasta2domain("5_8S_rRNA.rfam.align.fasta", $tmpdir);
+
+my %hmms = (
+    "arc_5SrRNA" => "$tmpdir/arc_5S_rRNA.rfam.align.fasta",
+    "euk_5SrRNA" => "$tmpdir/euk_5S_rRNA.rfam.align.fasta",
+    "bac_5SrRNA" => "$tmpdir/bac_5S_rRNA.rfam.align.fasta",
+    "euk_5_8SrRNA" => "$tmpdir/euk_5_8S_rRNA.rfam.align.fasta",
+    "arc_16SrRNA" => "$tmpdir/SSU_rRNA_archaea.rfam.align.fasta",
+    "bac_16SrRNA" => "$tmpdir/SSU_rRNA_bacteria.rfam.align.fasta",
+    "euk_18SrRNA" => "$tmpdir/SSU_rRNA_eukarya.rfam.align.fasta",
+    "arc_23SrRNA" => "$tmpdir/LSU_rRNA_archaea.rfam.align.fasta",
+    "bac_23SrRNA" => "$tmpdir/LSU_rRNA_bacteria.rfam.align.fasta",
+    "euk_28SrRNA" => "$tmpdir/LSU_rRNA_eukarya.rfam.align.fasta"
+    );
+
+my $cmd = "";
+foreach my $name (keys %hmms){
+    my $alignFile = $hmms{$name};
+    $cmd .= "hmmbuild --rna -n $name $tmpdir/$name.hmm $alignFile;"
+}
+
+print STDERR "cmd=$cmd\n";
+system ($cmd) >> 8 and  die "Could not execute cmd=$cmd, $!\n";
+$cmd = "cat $tmpdir/arc_16SrRNA.hmm $tmpdir/arc_23SrRNA.hmm $tmpdir/arc_5SrRNA.hmm > $hmm_dir/arc.hmm;";
+$cmd .= "cat $tmpdir/bac_16SrRNA.hmm $tmpdir/bac_23SrRNA.hmm $tmpdir/bac_5SrRNA.hmm > $hmm_dir/bac.hmm;";
+$cmd .= "cat $tmpdir/euk_18SrRNA.hmm $tmpdir/euk_28SrRNA.hmm $tmpdir/euk_5_8SrRNA.hmm $tmpdir/euk_5SrRNA.hmm > $hmm_dir/euk.hmm;";
+
+print STDERR "cmd=$cmd\n";
+system ($cmd) >> 8 and  die "Could not execute cmd=$cmd, $!\n";
+
+my @tmp_files = glob "$tmpdir/*";
+unlink glob "'$tmpdir/*'";
+#rmdir $tmpdir or warn "couldn't rmdir $tmpdir: $!\n";
+
+
+sub fasta2domain{
+    my ($fasta, $tmpdir) = @_;
+    use Bio::DB::EUtilities;
+    use Bio::SeqIO;
+
+    open (FASTA, "$tmpdir/$fasta") or die "Could not open $tmpdir/$fasta to read, $!\n";
+    $/ = "\n>";
+    open (BAC, ">$tmpdir/bac\_$fasta") or die "Could not open $tmpdir\/bac\_$fasta for writting, $!\n";
+    open (ARC, ">$tmpdir\/arc\_$fasta") or die "Could not open $tmpdir\/arc\_$fasta for writting, $!\n";;
+    open (EUK, ">$tmpdir\/euk\_$fasta") or die "Could not open $tmpdir\/euk\_$fasta for writting, $!\n";;
+
+    while(<FASTA>){
+	chomp;
+	if(my ($seqid,$other, $align) =  /^>?(\S+?)(\/.*?)\n(.*)/s){
+	    my $factory = Bio::DB::EUtilities->new(-eutil   => 'efetch',
+						   -db      => 'nucleotide',
+						   -rettype => 'gb',
+						   -email   => '[email protected]',
+						   -id      => $seqid);
+	    my $file = "$tmpdir/myseqs.gb";
+
+	    # dump HTTP::Response content to a file (not retained in memory)
+	    $factory->get_Response(-file => $file);
+	    my $seqin = Bio::SeqIO->new(-file   => $file,
+					-format => 'genbank');
+
+	    while (my $seq = $seqin->next_seq) {
+		my @classification = $seq->species->classification;
+		my $lineage = join('\t', @classification);
+		if($lineage =~ /Bacteria/i){
+		    print  BAC ">$seqid$other\n$align\n";
+		}
+		elsif($lineage =~ /Archaea/i){
+		    print  ARC ">$seqid$other\n$align\n";
+		}
+		elsif($lineage =~ /Eukaryota/i){
+		    print EUK ">$seqid$other\n$align\n";
+		}
+	    }
+
+	}
+
+    }
+
+    $/ = "\n";
+    close(FASTA);
+    close(BAC);
+    close(ARC);
+    close(EUK);
+}
+sub find_exe {
+    my($bin) = shift;
+  for my $dir (File::Spec->path) {
+    my $exe = File::Spec->catfile($dir, $bin);
+
+    if(-x $exe){
+
+	return $exe;
+    }
+  }
+  return;
+}