substantial spreed improvements to the buildBOLDdb function

can now handle GB files, thanks to avoiding loops
VascoElbrecht · Nov 3, 2019 · 61d9319 · 61d9319
1 parent bdde4f6
commit 61d9319
Showing 1 changed file with 34 additions and 24 deletions.
diff --git a/JAMP/R/DB_build_BOLD.R b/JAMP/R/DB_build_BOLD.R
@@ -1,22 +1,37 @@
 # 191025 convert BOLD ref data to DB
 
 
-buildBOLDdb <- function(tsv="ref_tsv_fro_bold.txt", savefasta=NA, savetaxonomy=NA, minlength=500, maxlength=NA, blacklist=NA, GenBank=F){
+buildBOLDdb <- function(tsv="ref_tsv_fro_bold.txt", savefasta=NA, savetaxonomy=NA, minlength=500, maxlength=NA, blacklist=NA, GenBank=F, stripptsv=T){
+
+
+
+if(file.size(tsv)/1000000>1000 & stripptsv){
+message("tsv is larger tan 1 GB, only the relavant columns will be extracted before importing into R!")
+tsv_stripped <- sub("\\.[tc].*$", "_stripped.tsv", tsv)
+system2("awk", paste("-v FS='\t' -v OFS='\t' ' {print $1, $6, $8, $ 9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $27, $72}' ", tsv, " > ", tsv_stripped, sep=""))
+tsv <- tsv_stripped
+}
+
 
 message("Building BOLD based refference database using: ", tsv)
 data <- read.csv(tsv, sep="\t", stringsAsFactors=F)
 
+if(!exists("tsv_stripped")){
+message("Removing unneeded coulumns from table.")
+data <- data[,c(1, 6, 8:24, 27, 72)]
+}
+
+
 if(!GenBank){
 GB <- data$institution_storing=="Mined from GenBank, NCBI"
 message("Excluding sequences mined from Genbank: ", sum(GB), "/", nrow(data), " (", round(sum(GB)/nrow(data)*100, 2), "% removed).")
 data <- data[!GB,]
 }
 
+# rm GB info
+data <- data[,c(-2)]
 
 
-message("Removing unneeded coulumns from table.")
-data <- data[,c(1, 8:24, 27, 72)]
-
 
 if(!is.na(blacklist)){
 rm <- data$processid %in% blacklist
@@ -58,17 +73,17 @@ data[data==""] <- NA
 
 
 # catch taxonomic ID
-
+dataID <- data[,grep("ID", names(data))]
 taxon <- rep(0, nrow(data))
 
-taxon[which(!is.na(data$phylum_taxID))] <- 3
-taxon[which(!is.na(data$class_taxID))] <- 5
-taxon[which(!is.na(data$order_taxID))] <- 7
-taxon[which(!is.na(data$family_taxID))] <- 9
-taxon[which(!is.na(data$subfamily_taxID))] <- 11
-taxon[which(!is.na(data$genus_taxID))] <- 13
-taxon[which(!is.na(data$species_taxID))] <- 15
-taxon[which(!is.na(data$subspecies_taxID))] <- 17
+taxon[which(!is.na(dataID$phylum_taxID))] <- 1
+taxon[which(!is.na(dataID$class_taxID))] <- 2
+taxon[which(!is.na(dataID$order_taxID))] <- 3
+taxon[which(!is.na(dataID$family_taxID))] <- 4
+taxon[which(!is.na(dataID$subfamily_taxID))] <- 5
+taxon[which(!is.na(dataID$genus_taxID))] <- 6
+taxon[which(!is.na(dataID$species_taxID))] <- 7
+taxon[which(!is.na(dataID$subspecies_taxID))] <- 8
 
 
 message("Preparing taxonomy reference files...")
@@ -80,20 +95,15 @@ if(is.na(savetaxonomy)){
 savetaxonomy <- paste("DB_", sub("(.*)\\..*$", "\\1", tsv), "_taxonomy.csv", sep="")
 }
 
-
-ID <- NULL
-for (i in 1:nrow(data)){
-ID[i] <- paste(data$processid[i], "_", data$bin_uri[i], "_taxID=BOLD:", data[i, taxon[i]], sep="")
-}
+# make sequence ID
+taxID <- as.vector(unlist(t(dataID)))[taxon+seq(0, nrow(dataID)*8-1, 8)]
+ID <- paste(data$processid, "_", data$bin_uri, "_taxID=BOLD:", taxID, sep="")
 
 
-meep <- rep(NA, nrow(data))
-taxonomyDB <- data.frame("taxID"=meep, "phylum"=meep, "class"=meep, "order"=meep, "family"=meep, "genus"=meep, "species"=meep, "subspecies"=meep, "taxRef"=meep)
-
-for (i in 1:nrow(data)){
-taxonomyDB[i,] <- c(paste("BOLD:", data[i, taxon[i]], sep=""), as.vector(unlist(data[i, c(4,6,8,10, 14, 16, 18, 19)])))
-}
+# make taxonomy table
+taxonomyDB <- data.frame("taxID"=paste("BOLD:", taxID, sep=""), "phylum"=data$phylum_name, "class"=data$class_name, "order"=data$order_name, "family"=data$family_name, "genus"=data$genus_name, "species"=data$species_name, "subspecies"=data$subspecies_name, "taxRef"=data$identification_reference, stringsAsFactors=F)
 
+# remove taxID duplicates
 taxonomyDB <- taxonomyDB[!duplicated(taxonomyDB$taxID),]
 message(nrow(taxonomyDB), " unique taxonomic IDs detected (from ", nrow(data), " specimens).")
 taxonomyDB$taxRef <- gsub("[()]", "", taxonomyDB$taxRef)