diff --git a/JAMP/R/DB_build_BOLD.R b/JAMP/R/DB_build_BOLD.R index f410458..28e1974 100644 --- a/JAMP/R/DB_build_BOLD.R +++ b/JAMP/R/DB_build_BOLD.R @@ -1,22 +1,37 @@ # 191025 convert BOLD ref data to DB -buildBOLDdb <- function(tsv="ref_tsv_fro_bold.txt", savefasta=NA, savetaxonomy=NA, minlength=500, maxlength=NA, blacklist=NA, GenBank=F){ +buildBOLDdb <- function(tsv="ref_tsv_fro_bold.txt", savefasta=NA, savetaxonomy=NA, minlength=500, maxlength=NA, blacklist=NA, GenBank=F, stripptsv=T){ + + + +if(file.size(tsv)/1000000>1000 & stripptsv){ +message("tsv is larger tan 1 GB, only the relavant columns will be extracted before importing into R!") +tsv_stripped <- sub("\\.[tc].*$", "_stripped.tsv", tsv) +system2("awk", paste("-v FS='\t' -v OFS='\t' ' {print $1, $6, $8, $ 9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $27, $72}' ", tsv, " > ", tsv_stripped, sep="")) +tsv <- tsv_stripped +} + message("Building BOLD based refference database using: ", tsv) data <- read.csv(tsv, sep="\t", stringsAsFactors=F) +if(!exists("tsv_stripped")){ +message("Removing unneeded coulumns from table.") +data <- data[,c(1, 6, 8:24, 27, 72)] +} + + if(!GenBank){ GB <- data$institution_storing=="Mined from GenBank, NCBI" message("Excluding sequences mined from Genbank: ", sum(GB), "/", nrow(data), " (", round(sum(GB)/nrow(data)*100, 2), "% removed).") data <- data[!GB,] } +# rm GB info +data <- data[,c(-2)] -message("Removing unneeded coulumns from table.") -data <- data[,c(1, 8:24, 27, 72)] - if(!is.na(blacklist)){ rm <- data$processid %in% blacklist @@ -58,17 +73,17 @@ data[data==""] <- NA # catch taxonomic ID - +dataID <- data[,grep("ID", names(data))] taxon <- rep(0, nrow(data)) -taxon[which(!is.na(data$phylum_taxID))] <- 3 -taxon[which(!is.na(data$class_taxID))] <- 5 -taxon[which(!is.na(data$order_taxID))] <- 7 -taxon[which(!is.na(data$family_taxID))] <- 9 -taxon[which(!is.na(data$subfamily_taxID))] <- 11 -taxon[which(!is.na(data$genus_taxID))] <- 13 -taxon[which(!is.na(data$species_taxID))] <- 15 -taxon[which(!is.na(data$subspecies_taxID))] <- 17 +taxon[which(!is.na(dataID$phylum_taxID))] <- 1 +taxon[which(!is.na(dataID$class_taxID))] <- 2 +taxon[which(!is.na(dataID$order_taxID))] <- 3 +taxon[which(!is.na(dataID$family_taxID))] <- 4 +taxon[which(!is.na(dataID$subfamily_taxID))] <- 5 +taxon[which(!is.na(dataID$genus_taxID))] <- 6 +taxon[which(!is.na(dataID$species_taxID))] <- 7 +taxon[which(!is.na(dataID$subspecies_taxID))] <- 8 message("Preparing taxonomy reference files...") @@ -80,20 +95,15 @@ if(is.na(savetaxonomy)){ savetaxonomy <- paste("DB_", sub("(.*)\\..*$", "\\1", tsv), "_taxonomy.csv", sep="") } - -ID <- NULL -for (i in 1:nrow(data)){ -ID[i] <- paste(data$processid[i], "_", data$bin_uri[i], "_taxID=BOLD:", data[i, taxon[i]], sep="") -} +# make sequence ID +taxID <- as.vector(unlist(t(dataID)))[taxon+seq(0, nrow(dataID)*8-1, 8)] +ID <- paste(data$processid, "_", data$bin_uri, "_taxID=BOLD:", taxID, sep="") -meep <- rep(NA, nrow(data)) -taxonomyDB <- data.frame("taxID"=meep, "phylum"=meep, "class"=meep, "order"=meep, "family"=meep, "genus"=meep, "species"=meep, "subspecies"=meep, "taxRef"=meep) - -for (i in 1:nrow(data)){ -taxonomyDB[i,] <- c(paste("BOLD:", data[i, taxon[i]], sep=""), as.vector(unlist(data[i, c(4,6,8,10, 14, 16, 18, 19)]))) -} +# make taxonomy table +taxonomyDB <- data.frame("taxID"=paste("BOLD:", taxID, sep=""), "phylum"=data$phylum_name, "class"=data$class_name, "order"=data$order_name, "family"=data$family_name, "genus"=data$genus_name, "species"=data$species_name, "subspecies"=data$subspecies_name, "taxRef"=data$identification_reference, stringsAsFactors=F) +# remove taxID duplicates taxonomyDB <- taxonomyDB[!duplicated(taxonomyDB$taxID),] message(nrow(taxonomyDB), " unique taxonomic IDs detected (from ", nrow(data), " specimens).") taxonomyDB$taxRef <- gsub("[()]", "", taxonomyDB$taxRef)