ssi-dk · RasmusSkytte · Feb 3, 2025 · Nov 8, 2024 · Nov 8, 2024 · Jan 10, 2025
diff --git a/data-raw/benchmark.R b/data-raw/benchmark.R
@@ -38,6 +38,9 @@ for (version in c("CRAN", "main", "branch")) {
   missing <- jsonlite::fromJSON("SCDB.lock")$packages$ref %>%
     purrr::discard(rlang::is_installed)
   if (length(missing) > 0) pak::pkg_install(missing, lib = lib_path)
+
+  # Explicitly install the packages
+  pak::pkg_install(source, lib = lib_path, dependencies = FALSE)
 }
 
 
@@ -66,8 +69,12 @@ if (identical(Sys.getenv("CI"), "true") && identical(Sys.getenv("BACKEND"), ""))
       version == "branch" ~ glue::glue("ssi-dk-SCDB-{sha}")
     )
 
-    .libPaths(c(here::here("installations", lib_dir), lib_paths_default))
-    library("SCDB")
+    library("SCDB", lib.loc = here::here("installations", lib_dir))
+
+    # Add proper version labels to the benchmarks
+    if (version == "CRAN") {
+      version <- paste0("SCDB v",  packageVersion("SCDB"))
+    }
 
     # Open connection to the database
     conns <- get_test_conns()
@@ -137,8 +144,11 @@ if (identical(Sys.getenv("CI"), "true") && identical(Sys.getenv("BACKEND"), ""))
           "n" = n
         )
 
-      dir.create("data", showWarnings = FALSE)
-      saveRDS(update_snapshot_benchmark, glue::glue("data/benchmark-update_snapshot_{names(conns)[[1]]}_{version}.rds"))
+      dir.create("inst/extdata", showWarnings = FALSE, recursive = TRUE)
+      saveRDS(
+        update_snapshot_benchmark,
+        glue::glue("inst/extdata/benchmark-update_snapshot_{names(conns)[[1]]}_{version}.rds")
+      )
     })
 
     # Benchmark 2, update_snapshot() with increasing data size
@@ -164,10 +174,10 @@ if (identical(Sys.getenv("CI"), "true") && identical(Sys.getenv("BACKEND"), ""))
             "n" = n
           )
 
-        dir.create("data", showWarnings = FALSE)
+        dir.create("inst/extdata", showWarnings = FALSE, recursive = TRUE)
         saveRDS(
           update_snapshot_benchmark,
-          glue::glue("data/benchmark-update_snapshot_complexity_{n}_{names(conns)[[1]]}_{version}.rds")
+          glue::glue("inst/extdata/benchmark-update_snapshot_complexity_{n}_{names(conns)[[1]]}_{version}.rds")
         )
       }
 

diff --git a/inst/extdata/benchmarks.rds b/inst/extdata/benchmarks.rds
diff --git a/vignettes/benchmarks.Rmd b/vignettes/benchmarks.Rmd
@@ -14,6 +14,9 @@
   comment = "#>"
 )
 
+# Set a flag to determine if this is being run on CRAN
+on_cran <- !identical(Sys.getenv("NOT_CRAN"), "true")
+
 # NOTE:
 # To re-run the benchmarks, run the "benchmark" workflow on GitHub
 ```
@@ -41,8 +44,19 @@
 The performance of this benchmark function is timed with the `{microbenchmark}` package using 10 replicates.
 All benchmarks are run on the same machine.
 
-The results of the benchmark are shown graphically below (mean and standard deviation), where we compare the current
-development version of `SCDB` with the current CRAN version.
+```{r benchmark context, results = "asis", echo = FALSE}
+if (on_cran) {  # Emitted as markdown ("asis"); the wording depends on whether this is a CRAN build
+  cat(
+    "The results of the benchmark are shown graphically below (mean and standard deviation), where we measure the",
+    "performance of `SCDB`."
+  )
+} else {
+  cat(
+    "The results of the benchmark are shown graphically below (mean and standard deviation), where we compare the",
+    "current development version of `SCDB` with the current CRAN version."
+  )
+}
+```
 
 ```{r benchmark_preprocessing, echo = FALSE, eval = requireNamespace("here")}
 benchmark_location <- c(
@@ -52,12 +66,13 @@
   purrr::discard(~ identical(., "")) %>%
   purrr::pluck(1)
 
-benchmarks <- readRDS(benchmark_location)
+benchmarks <- readRDS(benchmark_location) %>%
+  dplyr::mutate("version" = as.character(.data$version))
 
 # Determine if the SHA is on main
 sha <- benchmarks %>%
-  dplyr::distinct(version) %>%
-  dplyr::filter(!(version %in% c("CRAN", "main", "branch"))) %>%
+  dplyr::distinct(.data$version) %>%
+  dplyr::filter(!startsWith(.data$version, "SCDB"), .data$version != "main") %>%
   dplyr::pull("version")
 
 # Check local git history
@@ -71,14 +86,17 @@
   return(identical(Sys.getenv("CI"), "true"))
 })
 
-# If the SHA has been merged, use as the "main" version and remove the other,
-# older, main version
-if (on_main) {
+# If we are on CRAN, use the newest benchmark (version = sha)
+# This benchmark is then labelled with the newest version number of SCDB (the one we just deployed to CRAN)
+if (on_cran) {
+  benchmarks <- benchmarks %>%
+    dplyr::filter(.data$version == !!sha) %>%
+    dplyr::mutate("version" = paste0("SCDB v", packageVersion("SCDB")))
+} else if (on_main) {
+  # If the SHA has been merged, use as the "main" version and remove the other, older, main version
   benchmarks <- benchmarks %>%
     dplyr::filter(.data$version != "main") %>%
-    dplyr::mutate(
-      "version" = dplyr::if_else(.data$version == "CRAN", "CRAN", "development")
-    )
+    dplyr::mutate("version" = dplyr::if_else(.data$version == sha, "development", .data$version))
 }
 
 # Mean and standard deviation (see ggplot2::mean_se())
@@ -96,44 +114,43 @@
     !stringr::str_ends(.data$benchmark_function, stringr::fixed("complexity"))
   )
 
-# Add note slow backends
-slow_backends <- benchmark_1 %>%
-  dplyr::distinct(.data$database, .data$n) %>%
-  dplyr::filter(.data$n < max(.data$n)) %>%
-  dplyr::pull("database")
-
-benchmark_1 <- benchmark_1 %>%
-  dplyr::mutate(
-    "database" = paste0(database, ifelse(database %in% slow_backends, "*", ""))
-  )
-
 # Insert newline into database name to improve rendering of figures
 labeller <- ggplot2::as_labeller(
   function(l) stringr::str_replace_all(l, stringr::fixed(" v"), "\nv")
 )
 
-
+# Apply "dodging" to sub-groups to show graphically
+dodge <- ggplot2::position_dodge(width = 0.6)
 
 g <- ggplot2::ggplot(
   benchmark_1,
-  ggplot2::aes(x = version, y = time / 1e9)
+  ggplot2::aes(x = version, y = time / 1e9, color = database)
 ) +
   ggplot2::stat_summary(
     fun.data = mean_sd,
     geom = "pointrange",
-    size = 0.5,
-    linewidth = 1
+    size = 0.5, linewidth = 1,
+    position = dodge
   ) +
+  ggplot2::scale_x_discrete(guide = ggplot2::guide_axis(n.dodge = 2)) +
+  ggplot2::labs(x = "Codebase version", y = "Time (s)") +
+  ggplot2::theme(legend.position = "bottom")
+
+
+if (on_cran) {
+  # Reduce font size for CRAN version
+  g <- g + ggplot2::theme(text = ggplot2::element_text(size = 8))
+
+  # Make the legend two rows
+  g <- g + ggplot2::guides(color = ggplot2::guide_legend(title = "", nrow = 2, byrow = TRUE))
+
+} else {
+  # Add facets to non-CRAN rendering
+  g <- g +
   ggplot2::facet_grid(
     rows = ggplot2::vars(benchmark_function),
     cols = ggplot2::vars(database),
     labeller = labeller
-  ) +
-  ggplot2::labs(x = "Codebase version", y = "Time (s)")
-
-if (length(slow_backends) > 1) {
-  g <- g + ggplot2::labs(
-    caption = "* IMPORTANT: Benchmark data halved for this backend!"
   )
 }
 
@@ -171,26 +188,52 @@
     )
   )
 
-ggplot2::ggplot(
+
+# Apply "dodging" to sub-groups to show graphically
+dodge <- ggplot2::position_dodge(width = 0.6)
+
+# Set aesthetics for CRAN and non-CRAN versions
+if (on_cran) {
+  aes <- ggplot2::aes(x = n * nrow(iris) / 1e3, y = time / 1e9, color = database)
+} else {
+  aes <- ggplot2::aes(x = n * nrow(iris) / 1e3, y = time / 1e9, color = version)
+}
+
+g <- ggplot2::ggplot(
   benchmark_2,
-  ggplot2::aes(x = n * nrow(iris) / 1e3, y = time / 1e9, color = version)
+  aes
 ) +
   ggplot2::stat_summary(
     fun.data = mean_sd,
     geom = "pointrange",
-    size = 0.5,
-    linewidth = 1
+    size = 0.5, linewidth = 1,
+    position = dodge
   ) +
   ggplot2::geom_smooth(method = "lm", formula = y ~ x, se = FALSE, linetype = 3) +
-  ggplot2::facet_grid(
-    rows = ggplot2::vars(benchmark_function),
-    cols = ggplot2::vars(database),
-    labeller = labeller
-  ) +
   ggplot2::labs(
     x = "Data size (1,000 rows)",
     y = "Time (s)",
     color = "Codebase version"
   ) +
   ggplot2::theme(panel.spacing = grid::unit(1, "lines"), legend.position = "bottom")
+
+
+if (on_cran) {
+  # CRAN rendering: shrink all text and wrap the legend onto two rows
+  g <- g +
+    ggplot2::theme(text = ggplot2::element_text(size = 8)) +
+    ggplot2::guides(
+      color = ggplot2::guide_legend(title = "", nrow = 2, byrow = TRUE)
+    )
+} else {
+  # Non-CRAN rendering: one panel per benchmark function and database
+  g <- g +
+    ggplot2::facet_grid(
+      rows = ggplot2::vars(benchmark_function),
+      cols = ggplot2::vars(database),
+      labeller = labeller
+    )
+}
+
+g
 ```