Skip to content

Commit

Permalink
substantial spreed improvements to the buildBOLDdb function
Browse files Browse the repository at this point in the history
can now handle GB files, thanks to avoiding loops
  • Loading branch information
VascoElbrecht committed Nov 3, 2019
1 parent bdde4f6 commit 61d9319
Showing 1 changed file with 34 additions and 24 deletions.
58 changes: 34 additions & 24 deletions JAMP/R/DB_build_BOLD.R
Original file line number Diff line number Diff line change
@@ -1,22 +1,37 @@
# 191025 convert BOLD ref data to DB


buildBOLDdb <- function(tsv="ref_tsv_fro_bold.txt", savefasta=NA, savetaxonomy=NA, minlength=500, maxlength=NA, blacklist=NA, GenBank=F){
buildBOLDdb <- function(tsv="ref_tsv_fro_bold.txt", savefasta=NA, savetaxonomy=NA, minlength=500, maxlength=NA, blacklist=NA, GenBank=F, stripptsv=T){



if(file.size(tsv)/1000000>1000 & stripptsv){
message("tsv is larger tan 1 GB, only the relavant columns will be extracted before importing into R!")
tsv_stripped <- sub("\\.[tc].*$", "_stripped.tsv", tsv)
system2("awk", paste("-v FS='\t' -v OFS='\t' ' {print $1, $6, $8, $ 9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $27, $72}' ", tsv, " > ", tsv_stripped, sep=""))
tsv <- tsv_stripped
}


message("Building BOLD based refference database using: ", tsv)
data <- read.csv(tsv, sep="\t", stringsAsFactors=F)

if(!exists("tsv_stripped")){
message("Removing unneeded coulumns from table.")
data <- data[,c(1, 6, 8:24, 27, 72)]
}


if(!GenBank){
GB <- data$institution_storing=="Mined from GenBank, NCBI"
message("Excluding sequences mined from Genbank: ", sum(GB), "/", nrow(data), " (", round(sum(GB)/nrow(data)*100, 2), "% removed).")
data <- data[!GB,]
}

# rm GB info
data <- data[,c(-2)]


message("Removing unneeded coulumns from table.")
data <- data[,c(1, 8:24, 27, 72)]


if(!is.na(blacklist)){
rm <- data$processid %in% blacklist
Expand Down Expand Up @@ -58,17 +73,17 @@ data[data==""] <- NA


# catch taxonomic ID

dataID <- data[,grep("ID", names(data))]
taxon <- rep(0, nrow(data))

taxon[which(!is.na(data$phylum_taxID))] <- 3
taxon[which(!is.na(data$class_taxID))] <- 5
taxon[which(!is.na(data$order_taxID))] <- 7
taxon[which(!is.na(data$family_taxID))] <- 9
taxon[which(!is.na(data$subfamily_taxID))] <- 11
taxon[which(!is.na(data$genus_taxID))] <- 13
taxon[which(!is.na(data$species_taxID))] <- 15
taxon[which(!is.na(data$subspecies_taxID))] <- 17
taxon[which(!is.na(dataID$phylum_taxID))] <- 1
taxon[which(!is.na(dataID$class_taxID))] <- 2
taxon[which(!is.na(dataID$order_taxID))] <- 3
taxon[which(!is.na(dataID$family_taxID))] <- 4
taxon[which(!is.na(dataID$subfamily_taxID))] <- 5
taxon[which(!is.na(dataID$genus_taxID))] <- 6
taxon[which(!is.na(dataID$species_taxID))] <- 7
taxon[which(!is.na(dataID$subspecies_taxID))] <- 8


message("Preparing taxonomy reference files...")
Expand All @@ -80,20 +95,15 @@ if(is.na(savetaxonomy)){
savetaxonomy <- paste("DB_", sub("(.*)\\..*$", "\\1", tsv), "_taxonomy.csv", sep="")
}


ID <- NULL
for (i in 1:nrow(data)){
ID[i] <- paste(data$processid[i], "_", data$bin_uri[i], "_taxID=BOLD:", data[i, taxon[i]], sep="")
}
# make sequence ID
taxID <- as.vector(unlist(t(dataID)))[taxon+seq(0, nrow(dataID)*8-1, 8)]
ID <- paste(data$processid, "_", data$bin_uri, "_taxID=BOLD:", taxID, sep="")


meep <- rep(NA, nrow(data))
taxonomyDB <- data.frame("taxID"=meep, "phylum"=meep, "class"=meep, "order"=meep, "family"=meep, "genus"=meep, "species"=meep, "subspecies"=meep, "taxRef"=meep)

for (i in 1:nrow(data)){
taxonomyDB[i,] <- c(paste("BOLD:", data[i, taxon[i]], sep=""), as.vector(unlist(data[i, c(4,6,8,10, 14, 16, 18, 19)])))
}
# make taxonomy table
taxonomyDB <- data.frame("taxID"=paste("BOLD:", taxID, sep=""), "phylum"=data$phylum_name, "class"=data$class_name, "order"=data$order_name, "family"=data$family_name, "genus"=data$genus_name, "species"=data$species_name, "subspecies"=data$subspecies_name, "taxRef"=data$identification_reference, stringsAsFactors=F)

# remove taxID duplicates
taxonomyDB <- taxonomyDB[!duplicated(taxonomyDB$taxID),]
message(nrow(taxonomyDB), " unique taxonomic IDs detected (from ", nrow(data), " specimens).")
taxonomyDB$taxRef <- gsub("[()]", "", taxonomyDB$taxRef)
Expand Down

0 comments on commit 61d9319

Please sign in to comment.