Skip to content

Commit

Permalink
Using INSERT with multiple values in insertTable() for DataBrick…
Browse files Browse the repository at this point in the history
…s for faster inserts.
  • Loading branch information
Admin_mschuemi authored and Admin_mschuemi committed Dec 16, 2024
1 parent 2076842 commit 514d535
Show file tree
Hide file tree
Showing 39 changed files with 524 additions and 117 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,6 @@ VignetteBuilder: knitr
URL: https://ohdsi.github.io/DatabaseConnector/, https://github.com/OHDSI/DatabaseConnector
BugReports: https://github.com/OHDSI/DatabaseConnector/issues
Copyright: See file COPYRIGHTS
RoxygenNote: 7.3.1
RoxygenNote: 7.3.2
Roxygen: list(markdown = TRUE)
Encoding: UTF-8
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ Changes:

- Updated Databricks driver to 2.6.36.

- Using `INSERT` with multiple values in `insertTable()` for Databricks for faster inserts.


DatabaseConnector 6.3.2
=======================
Expand Down
39 changes: 39 additions & 0 deletions R/CtasHack.R
Original file line number Diff line number Diff line change
Expand Up @@ -178,3 +178,42 @@ ctasHack <- function(connection, sqlTableName, tempTable, sqlFieldNames, sqlData
delta <- Sys.time() - startTime
inform(paste("Inserting data took", signif(delta, 3), attr(delta, "units")))
}

# Insert rows into a table using multi-row INSERT ... VALUES statements.
#
# Intended for platforms (currently DataBricks/Spark) where batched
# parameterized inserts are slow, but a single INSERT carrying many value
# tuples is fast.
#
# @param connection          A DatabaseConnector connection object.
# @param sqlTableName        (Possibly qualified) name of the target table.
# @param tempTable           Whether the target is a temp table. Kept for
#                            signature consistency with ctasHack(); not used
#                            in this function's body.
# @param sqlFieldNames       Comma-separated list of column names, inserted
#                            verbatim into the SQL.
# @param sqlDataTypes        SQL data types per column, passed to toStrings()
#                            to format the values.
# @param data                The data frame to insert.
# @param progressBar         Show a text progress bar?
# @param tempEmulationSchema Schema used when emulating temp tables, passed
#                            to SqlRender::translate().
#
# @return Invisibly returns NULL; called for its side effect on the database.
multiValuesInsert <- function(connection, sqlTableName, tempTable, sqlFieldNames, sqlDataTypes, data, progressBar, tempEmulationSchema) {
  logTrace(sprintf("Inserting %d rows into table '%s' using multi-values inserts", nrow(data), sqlTableName))

  # Nothing to insert. Also guards the batch loop below: seq(1, 0, by = 1000)
  # would throw "wrong sign in 'by' argument" for an empty data frame.
  if (nrow(data) == 0) {
    return(invisible(NULL))
  }

  # Suppress SQL logging while we issue the (potentially many and very long)
  # INSERT statements; restored on exit even if an insert fails:
  assign("noLogging", TRUE, envir = globalVars)
  on.exit(
    assign("noLogging", NULL, envir = globalVars)
  )
  startTime <- Sys.time()
  batchSize <- 1000

  # Insert data in batches using multi-value inserts:
  if (progressBar) {
    pb <- txtProgressBar(style = 3)
  }
  for (start in seq(1, nrow(data), by = batchSize)) {
    if (progressBar) {
      setTxtProgressBar(pb, start / nrow(data))
    }
    end <- min(start + batchSize - 1, nrow(data))
    # toStrings() renders each cell as a SQL literal according to its data type:
    batch <- toStrings(data[start:end, , drop = FALSE], sqlDataTypes)
    # One "(v1,v2,...),(v1,v2,...)" tuple list for the whole batch:
    valuesString <- paste("(", paste(apply(batch, MARGIN = 1, FUN = paste, collapse = ","), collapse = "),("), ")")

    sql <- "INSERT INTO @table (@fields) VALUES @values;"
    sql <- SqlRender::render(sql = sql,
                             table = sqlTableName,
                             fields = sqlFieldNames,
                             values = valuesString)
    sql <- SqlRender::translate(sql, targetDialect = dbms(connection), tempEmulationSchema = tempEmulationSchema)
    executeSql(connection, sql, progressBar = FALSE, reportOverallTime = FALSE)
  }
  if (progressBar) {
    setTxtProgressBar(pb, 1)
    close(pb)
  }
  delta <- Sys.time() - startTime
  inform(paste("Inserting data took", signif(delta, 3), attr(delta, "units")))
}

2 changes: 2 additions & 0 deletions R/InsertTable.R
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,8 @@ insertTable.default <- function(connection,
} else if (useCtasHack) {
# Inserting using CTAS hack ----------------------------------------------------------------
ctasHack(connection, sqlTableName, tempTable, sqlFieldNames, sqlDataTypes, data, progressBar, tempEmulationSchema)
} else if (dbms == "spark") {
multiValuesInsert(connection, sqlTableName, tempTable, sqlFieldNames, sqlDataTypes, data, progressBar, tempEmulationSchema)
} else {
# Inserting using SQL inserts --------------------------------------------------------------
logTrace(sprintf("Inserting %d rows into table '%s'", nrow(data), sqlTableName))
Expand Down
63 changes: 63 additions & 0 deletions extras/TestCode.R
Original file line number Diff line number Diff line change
Expand Up @@ -455,3 +455,66 @@ querySql.sqlite(connection = connection,

disconnect(connection)
DBI::dbGetQuery(sqliteConnection, "SELECT COUNT(*) FROM test;")


# Test insert table performance on DataBricks -----------------------------
library(DatabaseConnector)
# Connect to DataBricks (dbms = "spark"); credentials are pulled from the
# system keyring so no secrets live in this script:
connectionDetails <- createConnectionDetails(
  dbms = "spark",
  connectionString = keyring::key_get("databricksConnectionString"),
  user = "token",
  password = keyring::key_get("databricksToken")
)
# Schema SqlRender uses to emulate temp tables on platforms without real ones:
options(sqlRenderTempEmulationSchema = "scratch.scratch_mschuemi")

conn <- connect(connectionDetails)
# Fixed seed so the generated test data is reproducible across runs:
set.seed(1)
# One row per day over this range (~41k rows) for the insert benchmark:
day.start <- "1900/01/01"
day.end <- "2012/12/31"
dayseq <- seq.Date(as.Date(day.start), as.Date(day.end), by = "day")
# Generate a vector of random alphanumeric strings.
#
# @param n      Number of strings to generate.
# @param lenght Number of characters per string. (Parameter name is a typo,
#               kept as-is for backward compatibility with existing callers.)
#
# @return A character vector of length n. Returns character(0) when n = 0.
makeRandomStrings <- function(n = 1, lenght = 12) {
  alphabet <- c(0:9, letters, LETTERS)
  # vapply + seq_len is type-stable (always character) and, unlike the
  # 1:n loop over a numeric-preallocated vector, safe for n = 0:
  vapply(
    seq_len(n),
    function(i) paste(sample(alphabet, lenght, replace = TRUE), collapse = ""),
    character(1)
  )
}
# Test data frame covering the main column types: DATE, INTEGER, FLOAT, STRING.
data <- data.frame(start_date = dayseq,
                   person_id = as.integer(round(runif(length(dayseq), 1, 1e+07))),
                   value = runif(length(dayseq)),
                   id = makeRandomStrings(length(dayseq)))

# Plant one NA per column to verify NULL handling in the insert path:
data$start_date[4] <- NA
data$person_id[5] <- NA
data$value[2] <- NA
data$id[3] <- NA

# Uncomment to benchmark with a small two-column subset instead:
# data <- data[1:100, c("value", "id")]
# Time insertTable() end-to-end (table is dropped and recreated each run):
system.time(
  insertTable(connection = conn,
              tableName = "scratch.scratch_mschuemi.insert_test",
              data = data,
              dropTableIfExists = TRUE,
              createTable = TRUE,
              tempTable = FALSE,
              progressBar = TRUE,
              bulkLoad = FALSE)
)
# Using default inserts with parameterized queries:
# user system elapsed
# 2.87 1.67 212.97

# Using CTAS hack:
# user system elapsed
# 0.54 0.03 11.19

# Baseline: hand-built single multi-value INSERT for the value/id columns only.
system.time({
  sql <- "DROP TABLE IF EXISTS scratch.scratch_mschuemi.insert_test;"
  executeSql(conn, sql)
  sql <- "CREATE TABLE scratch.scratch_mschuemi.insert_test (value FLOAT, id STRING);"
  executeSql(conn, sql)
  # Render NAs as NULL per field, rather than gsub-ing "NA" out of the final
  # SQL string: a blanket gsub("NA", "NULL", sql) would also corrupt any
  # random id that happens to contain the substring "NA".
  valueStrings <- ifelse(is.na(data$value), "NULL", as.character(data$value))
  idStrings <- ifelse(is.na(data$id), "NULL", sprintf("'%s'", data$id))
  sql <- sprintf(
    "INSERT INTO scratch.scratch_mschuemi.insert_test (value, id) VALUES %s;",
    paste(sprintf("(%s, %s)", valueStrings, idStrings), collapse = ",")
  )
  executeSql(conn, sql)
})
# user system elapsed
# 0.16 0.07 7.07
Binary file modified inst/doc/Connecting.pdf
Binary file not shown.
21 changes: 12 additions & 9 deletions man/dbAppendTable-DatabaseConnectorConnection-character-method.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

34 changes: 29 additions & 5 deletions man/dbClearResult-DatabaseConnectorDbiResult-method.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

34 changes: 29 additions & 5 deletions man/dbClearResult-DatabaseConnectorJdbcResult-method.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 1 addition & 3 deletions man/dbColumnInfo-DatabaseConnectorDbiResult-method.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 1 addition & 3 deletions man/dbColumnInfo-DatabaseConnectorJdbcResult-method.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 10 additions & 5 deletions man/dbCreateTable-DatabaseConnectorConnection-method.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 9 additions & 1 deletion man/dbDisconnect-DatabaseConnectorConnection-method.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 514d535

Please sign in to comment.