diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml new file mode 100644 index 0000000..c0d6c44 --- /dev/null +++ b/.github/workflows/test-coverage.yaml @@ -0,0 +1,61 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help +on: + push: + branches: [main, master] + pull_request: + branches: [main, master] + +name: test-coverage + +permissions: read-all + +jobs: + test-coverage: + runs-on: ubuntu-latest + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + + steps: + - uses: actions/checkout@v4 + + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::covr + needs: coverage + + - name: Test coverage + run: | + cov <- covr::package_coverage( + quiet = FALSE, + clean = FALSE, + install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package") + ) + covr::to_cobertura(cov) + shell: Rscript {0} + + - uses: codecov/codecov-action@v4 + with: + fail_ci_if_error: ${{ github.event_name != 'pull_request' && true || false }} + file: ./cobertura.xml + plugin: noop + disable_search: true + token: ${{ secrets.CODECOV_TOKEN }} + + - name: Show testthat output + if: always() + run: | + ## -------------------------------------------------------------------- + find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true + shell: bash + + - name: Upload test results + if: failure() + uses: actions/upload-artifact@v4 + with: + name: coverage-test-failures + path: ${{ runner.temp }}/package \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index c51f1ab..6498cf5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: chopin Title: Computation of Spatial Data by Hierarchical and Objective Partitioning of Inputs for Parallel Processing -Version: 0.8.3 +Version: 0.9.0 Authors@R: c( person("Insang", "Song", , "geoissong@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-8732-3256")), @@ -48,6 +48,7 @@ Imports: sf (>= 1.0-10), stars (>= 0.6-0), terra (>= 1.7-18), + mirai (>= 1.3.0), collapse, lifecycle Suggests: @@ -56,7 +57,6 @@ Suggests: targets, DiagrammeR, future.mirai, - mirai, knitr, rmarkdown, spatstat.random, diff --git a/NAMESPACE b/NAMESPACE index 79e5ee8..8d5db8e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,9 +4,12 @@ export(extract_at) export(kernelfunction) export(par_convert_f) export(par_grid) +export(par_grid_mirai) export(par_hierarchy) +export(par_hierarchy_mirai) export(par_merge_grid) export(par_multirasters) +export(par_multirasters_mirai) export(par_pad_balanced) export(par_pad_grid) export(par_split_list) @@ -39,6 +42,7 @@ importFrom(igraph,mst) importFrom(lifecycle,deprecated) importFrom(methods,findFunction) importFrom(methods,getPackageName) +importFrom(mirai,mirai_map) importFrom(rlang,"!!!") importFrom(rlang,inject) importFrom(rlang,sym) diff --git a/NEWS.md b/NEWS.md index ef54579..708a3cf 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# 0.9 +- `mirai` based `par_*` functions for parallelization +- terra::extract mode in `extract_at()` with `terra` argument and auxiliary arguments including exact, weights, touches + # 0.8 - Bumped version from 0.7.8 to 0.8.0: improving package coverage - README.md: two mermaid plots are pre-generated as png files diff --git a/R/processing.R b/R/processing.R index 95f2c7c..6996c43 100644 --- a/R/processing.R +++ b/R/processing.R @@ -52,6 +52,7 @@ kernelfunction <- y_vec, id, extracted, + terra = FALSE, kernel_func = stats::weighted.mean, kernel = NULL, bandwidth = NULL, @@ -70,9 +71,17 @@ kernelfunction <- ) y_vec <- terra::centroids(y_vec, inside = TRUE) } - name_surf_val <- - ifelse(terra::nlyr(x_ras) == 1, - "value", names(x_ras)) + if (!terra) { + name_surf_val <- + ifelse(terra::nlyr(x_ras) == 1, + "value", names(x_ras)) + } else { + # TODO: "ID", "x", "y" are too generic to exclude; + # need to find a generalized way of excluding names + exclude_vec <- c("id_chopin", "coverage_fraction", "ID", "x", "y") + exclude_vec <- append(exclude_vec, names(y_vec)) + name_surf_val <- setdiff(names(extracted), exclude_vec) + } # convert to data.frame coords_df <- as.data.frame(y_vec, geom = "XY") # apply strict order @@ -90,7 +99,9 @@ kernelfunction <- coverage_fraction <- NULL # post-processing - extracted <- do.call(rbind, extracted) + if (!is.data.frame(extracted)) { + extracted <- do.call(rbind, extracted) + } names(extracted)[grep("(x|y)", names(extracted))] <- c("xdest", "ydest") extracted_summary <- extracted |> @@ -127,8 +138,10 @@ kernelfunction <- #' @param id character(1). Name of unique identifier field. #' @param func character(1)/function. supported function names or functions #' taking `x` and `w` in `exactextractr::exact_extract` +#' @param terra logical(1). If `TRUE`, use `terra::extract` instead of +#' `exactextractr::exact_extract`. #' @param extent numeric. Passed to .check_vector -#' @param radius numeric(1). +#' @param radius numeric(1). Buffer radius. #' @param out_class character(1). "sf" or "terra" #' @param kernel character(1). Name of kernel functions [kernelfunction] #' @param kernel_func function. Kernel weight summary function. @@ -139,6 +152,9 @@ kernelfunction <- #' @param .standalone logical(1). Whether or not running standalone mode. #' `TRUE` will apply internal input check functions, whereas #' `FALSE` will let `par_*` functions will check inputs. +#' @param weights passed to `terra::extract()` Default is TRUE. +#' @param exact passed to `terra::extract()` Default is TRUE. +#' @param touches passed to `terra::extract()` Default is FALSE. #' @keywords internal #' @noRd .extract_at <- function( @@ -146,6 +162,7 @@ kernelfunction <- y = NULL, id = NULL, func = "mean", + terra = FALSE, extent = NULL, radius = NULL, out_class = "sf", @@ -153,6 +170,9 @@ kernelfunction <- kernel_func = stats::weighted.mean, bandwidth = NULL, max_cells = NULL, + exact = TRUE, + weights = TRUE, + touches = FALSE, .standalone = TRUE, ... ) { @@ -172,7 +192,7 @@ kernelfunction <- ) # reproject polygons to raster's crs y <- reproject_to_raster(vector = y, raster = x) - if (dep_check(y) == "terra") { + if (dep_check(y) == "terra" && !terra) { y <- dep_switch(y) } } @@ -188,20 +208,40 @@ kernelfunction <- } iskernel <- !is.null(kernel) - extracted <- - exactextractr::exact_extract( - x = x, - y = y, - fun = if (iskernel) NULL else func, - force_df = TRUE, - stack_apply = !iskernel, - append_cols = if (iskernel) NULL else id, - include_cols = if (iskernel) id else NULL, - progress = FALSE, - include_area = iskernel, - include_xy = iskernel, - max_cells_in_memory = max_cells - ) + if (!terra) { + extracted <- + exactextractr::exact_extract( + x = x, + y = y, + fun = if (iskernel) NULL else func, + force_df = TRUE, + stack_apply = !iskernel, + append_cols = if (iskernel) NULL else id, + include_cols = if (iskernel) id else NULL, + progress = FALSE, + include_area = iskernel, + include_xy = iskernel, + max_cells_in_memory = max_cells + ) + } else { + extracted <- + terra::extract( + x = x, y = y, + fun = if (iskernel) NULL else func, + xy = TRUE, + weights = weights, + exact = exact, + touches = touches, + bind = TRUE, + ID = TRUE + ) + + extracted[[id]] <- y[[id]][extracted$ID] + extracted$ID <- NULL + names(extracted)[names(extracted) == "id_chopin"] <- id + names(extracted)[names(extracted) %in% c("weight", "fraction")] <- + "coverage_fraction" + } if (iskernel) { stopifnot(!is.null(bandwidth)) @@ -215,6 +255,7 @@ kernelfunction <- y_vec = y, id = id, extracted = extracted, + terra = terra, kernel = kernel, kernel_func = kernel_func, bandwidth = bandwidth @@ -243,6 +284,8 @@ kernelfunction <- #' @param func function taking one numeric vector argument. #' Default is `"mean"` for all supported signatures in arguments #' `x` and `y`. +#' @param terra logical(1). If `TRUE`, use `terra::extract` instead of +#' `exactextractr::exact_extract`. #' @param extent numeric(4) or SpatExtent. Extent of clipping vector. #' It only works with `points` of character(1) file path. #' @param radius numeric(1). Buffer radius. @@ -254,6 +297,9 @@ kernelfunction <- #' Default is [`stats::weighted.mean()`] #' @param bandwidth numeric(1). Kernel bandwidth. #' @param max_cells integer(1). Maximum number of cells in memory. +#' @param weights passed to `terra::extract()` Default is TRUE. +#' @param exact passed to `terra::extract()` Default is TRUE. +#' @param touches passed to `terra::extract()` Default is FALSE. #' @param .standalone logical(1). Default is `TRUE`, which means that #' the function will be executed in a standalone mode. #' When using this function in `par_*` functions, @@ -305,6 +351,7 @@ setMethod( y = NULL, id = NULL, func = "mean", + terra = FALSE, extent = NULL, radius = NULL, out_class = "sf", @@ -312,11 +359,15 @@ setMethod( kernel_func = stats::weighted.mean, bandwidth = NULL, max_cells = 3e+07, + exact = TRUE, + weights = TRUE, + touches = FALSE, .standalone = TRUE, ... ) { .extract_at( x = x, y = y, id = id, func = func, + terra = terra, extent = extent, radius = radius, out_class = out_class, @@ -324,6 +375,9 @@ setMethod( kernel_func = kernel_func, bandwidth = bandwidth, max_cells = max_cells, + exact = exact, + weights = weights, + touches = touches, .standalone = .standalone ) } @@ -343,6 +397,7 @@ setMethod( y = NULL, id = NULL, func = "mean", + terra = FALSE, extent = NULL, radius = NULL, out_class = "sf", @@ -350,11 +405,15 @@ setMethod( kernel_func = stats::weighted.mean, bandwidth = NULL, max_cells = 3e+07, + exact = TRUE, + weights = TRUE, + touches = FALSE, .standalone = TRUE, ... ) { .extract_at( x = x, y = y, id = id, func = func, + terra = terra, extent = extent, radius = radius, out_class = out_class, @@ -362,6 +421,9 @@ setMethod( kernel_func = kernel_func, bandwidth = bandwidth, max_cells = max_cells, + exact = exact, + weights = weights, + touches = touches, .standalone = .standalone ) } @@ -381,6 +443,7 @@ setMethod( y = NULL, id = NULL, func = "mean", + terra = FALSE, extent = NULL, radius = NULL, out_class = "sf", @@ -388,11 +451,15 @@ setMethod( kernel_func = stats::weighted.mean, bandwidth = NULL, max_cells = 3e+07, + exact = TRUE, + weights = TRUE, + touches = FALSE, .standalone = TRUE, ... ) { .extract_at( x = x, y = y, id = id, func = func, + terra = terra, extent = extent, radius = radius, out_class = out_class, @@ -400,6 +467,9 @@ setMethod( kernel_func = kernel_func, bandwidth = bandwidth, max_cells = max_cells, + exact = exact, + weights = weights, + touches = touches, .standalone = .standalone ) } @@ -418,6 +488,7 @@ setMethod( y = NULL, id = NULL, func = "mean", + terra = FALSE, extent = NULL, radius = NULL, out_class = "sf", @@ -425,11 +496,15 @@ setMethod( kernel_func = stats::weighted.mean, bandwidth = NULL, max_cells = 3e+07, + exact = TRUE, + weights = TRUE, + touches = FALSE, .standalone = TRUE, ... ) { .extract_at( x = x, y = y, id = id, func = func, + terra = terra, extent = extent, radius = radius, out_class = out_class, @@ -437,6 +512,9 @@ setMethod( kernel_func = kernel_func, bandwidth = bandwidth, max_cells = max_cells, + exact = exact, + weights = weights, + touches = touches, .standalone = .standalone ) } @@ -456,6 +534,7 @@ setMethod( y = NULL, id = NULL, func = "mean", + terra = FALSE, extent = NULL, radius = NULL, out_class = "sf", @@ -463,11 +542,15 @@ setMethod( kernel_func = stats::weighted.mean, bandwidth = NULL, max_cells = 3e+07, + exact = TRUE, + weights = TRUE, + touches = FALSE, .standalone = TRUE, ... ) { .extract_at( x = x, y = y, id = id, func = func, + terra = terra, extent = extent, radius = radius, out_class = out_class, @@ -475,6 +558,9 @@ setMethod( kernel_func = kernel_func, bandwidth = bandwidth, max_cells = max_cells, + exact = exact, + weights = weights, + touches = touches, .standalone = .standalone ) } @@ -495,6 +581,7 @@ setMethod( y = NULL, id = NULL, func = "mean", + terra = FALSE, extent = NULL, radius = NULL, out_class = "sf", @@ -502,11 +589,15 @@ setMethod( kernel_func = stats::weighted.mean, bandwidth = NULL, max_cells = 3e+07, + exact = TRUE, + weights = TRUE, + touches = FALSE, .standalone = TRUE, ... ) { .extract_at( x = x, y = y, id = id, func = func, + terra = terra, extent = extent, radius = radius, out_class = out_class, @@ -514,6 +605,9 @@ setMethod( kernel_func = kernel_func, bandwidth = bandwidth, max_cells = max_cells, + exact = exact, + weights = weights, + touches = touches, .standalone = .standalone ) } diff --git a/R/scale_process_mirai.R b/R/scale_process_mirai.R new file mode 100644 index 0000000..103fdb0 --- /dev/null +++ b/R/scale_process_mirai.R @@ -0,0 +1,822 @@ +#' Parallelize spatial computation over the computational grids +#' @family Parallelization +#' @description +#' [mirai::daemons] will set the parallel backend then [mirai::mirai_map] +#' will parallelize the work in each grid. For details of the terminology +#' in `mirai` package, refer to [`mirai::mirai`]. This function assumes that +#' users have one raster file and a sizable and spatially distributed +#' target locations. Each thread will process +#' the nearest integer of $|N_g| / |N_t|$ grids +#' where $|N_g|$ denotes the number of grids and $|N_t|$ denotes +#' the number of threads. +#' @note In dynamic dots (`...`), `fun_dist` arguments should include +#' x and y where sf/terra class objects or file paths are accepted. +#' Virtually any sf/terra functions that accept two arguments +#' can be put in `fun_dist`, but please be advised that +#' some spatial operations do not necessarily give the +#' exact result from what would have been done single-thread. +#' For example, distance calculated through this function may return the +#' lower value than actual because the computational region was reduced. +#' This would be the case especially where the target features +#' are spatially sparsely distributed. +#' @param grids List of two sf/SpatVector objects. Computational grids. +#' It takes a strict assumption that the grid input is +#' an output of `par_pad_grid``. +#' @param fun_dist `sf`, `terra` or `chopin` functions. +#' This function should have `x` and `y` arguments. +#' @param ... Arguments passed to the argument `fun_dist`. +#' @param pad_y logical(1). Whether to filter y with the padded grid. +#' Should be TRUE when x is where the values are calculated. +#' Default is `FALSE`. In the reverse case, like `terra::extent` or +#' `exactextractr::exact_extract`, the raster (x) extent should be set +#' with the padded grid. +#' @param .debug logical(1). Default is `FALSE`. Otherwise, +#' if a unit computation fails, the error message and the `CGRIDID` +#' value where the error occurred will be included in the output. +#' @returns a data.frame object with computation results. +#' For entries of the results, consult the documentation of the function put +#' in `fun_dist` argument. +#' @author Insang Song \email{geoissong@@gmail.com} +#' @examples +#' library(sf) +#' library(mirai) +#' daemons(4, dispatcher = "process") +#' ncpath <- system.file("shape/nc.shp", package = "sf") +#' ncpoly <- sf::st_read(ncpath) +#' # sf object +#' ncpnts <- +#' readRDS( +#' system.file("extdata/nc_random_point.rds", package = "chopin") +#' ) +#' # file path +#' ncelev <- +#' system.file("extdata/nc_srtm15_otm.tif", package = "chopin") +#' +# generate grids +#' nccompreg <- +#' chopin::par_pad_grid( +#' input = ncpnts, +#' mode = "grid", +#' nx = 4L, +#' ny = 2L, +#' padding = 5e3L +#' ) +#' res <- +#' par_grid_mirai( +#' grids = nccompreg, +#' fun_dist = extract_at, +#' x = ncelev, +#' y = ncpnts, +#' qsegs = 90L, +#' radius = 5e3L, +#' id = "pid" +#' ) +#' @seealso +#' [`mirai::daemons`], [`mirai::mirai_map`], [`par_convert_f`] +#' @importFrom mirai mirai_map +#' @importFrom rlang inject !!! +#' @importFrom collapse rowbind +#' @importFrom sf sf_use_s2 +#' @importFrom cli cli_abort cli_alert_info +#' @importFrom methods getPackageName +#' @export +par_grid_mirai <- + function( + grids, + fun_dist, + ..., + pad_y = FALSE, + .debug = TRUE + ) { + sf::sf_use_s2(FALSE) + + if (inherits(grids[[1]], "SpatVector")) { + grids <- Map(sf::st_as_sf, grids) + } + + # grid id selection check + grids_target_in <- grids$original + grids_target_list <- + split(grids_target_in, unlist(grids_target_in[["CGRIDID"]])) + + # initiate an index list + results <- as.list(seq_along(grids_target_list)) + + # is the function sf? + funname <- as.character(substitute(fun_dist)) + # is the function extract_at? + is_extract_at <- any(endsWith(funname, "extract_at")) + funname <- funname[length(funname)] + pkgname <- try(.check_package(funname), silent = TRUE) + + # parallel worker will take terra class objects + # if chopin function is used + class_vec <- + if (pkgname == "chopin") { + if (is_extract_at) { + "sf" + } else { + "terra" + } + } else { + pkgname + } + + # clean additional arguments + args_input <- list(...) + if (funname == "chopin" && is.null(args_input$.standalone)) { + args_input$.standalone <- FALSE + } + + # Track spatraster file path + args_input$x <- .check_par_spatraster(args_input$x) + args_input$y <- .check_par_spatraster(args_input$y) + # get hints from the inputs on data model + peek_x <- try(.check_character(args_input$x), silent = TRUE) + peek_y <- try(.check_character(args_input$y), silent = TRUE) + if (inherits(peek_x, "try-error")) { + crs_x <- terra::crs(args_input$x) + } else { + crs_x <- .check_character(args_input$x) + crs_x <- attr(crs_x, "crs") + } + + # class identity check + .check_align_fxy(pkgname, args_input$x, args_input$y) + + # Main parallelization + results <- + mirai::mirai_map( + .x = results, + .f = + function( + i, + grids, grids_target_list, + fun_dist, args_input, + peek_x, peek_y, + crs_x, + pad_y, class_vec, .debug + ) { + # inside each parallel job, feel free to use terra functions + # technically we do not export terra objects, rather calling + # terra functions directly to make objects from scratch in + # parallel workers. + requireNamespace("chopin") + requireNamespace("sf") + requireNamespace("terra") + options(sf_use_s2 = FALSE) + tryCatch({ + grid_in <- grids_target_list[[i]] + gpad_in <- grids$padded[grids$padded$CGRIDID %in% grid_in$CGRIDID, ] + + grid_in <- reproject_std(grid_in, crs_x) + gpad_in <- reproject_std(gpad_in, crs_x) + + args_input$x <- + .par_screen( + type = peek_x, + input = args_input$x, + input_id = NULL, + out_class = class_vec, + .window = if (pad_y) grid_in else gpad_in + ) + + args_input$y <- + .par_screen( + type = peek_y, + input = args_input$y, + input_id = NULL, + out_class = class_vec, + .window = if (pad_y) gpad_in else grid_in + ) + + res <- rlang::inject(fun_dist(!!!args_input)) + cli::cli_alert_info( + sprintf( + "Task at CGRIDID: %s is successfully dispatched.\n", + as.character(unlist(grid_in[["CGRIDID"]])) + ) + ) + + res <- try(as.data.frame(res), silent = TRUE) + return(res) + }, + error = function(e) { + if (.debug) { + grid_in <- grids_target_list[[i]] + data.frame( + CGRIDID = grid_in[["CGRIDID"]], + error_message = paste(unlist(e), collapse = " ") + ) + } else { + return(NULL) + } + }) + }, + .args = + list( + grids = grids, + grids_target_list = grids_target_list, + fun_dist = fun_dist, + args_input = args_input, + peek_x = peek_x, + peek_y = peek_y, + crs_x = crs_x, + pad_y = pad_y, + class_vec = class_vec, + .debug = .debug + ) + ) + + .progress <- NULL + results[.progress] + + # remove NULL + results <- results[] + results <- results[!vapply(results, is.null, logical(1))] + + # Bind rows + results <- collapse::rowbind(results, fill = TRUE) + + return(results) + } + + +# nolint start +#' Parallelize spatial computation by hierarchy in input data +#' @family Parallelization +#' @description "Hierarchy" refers to a system, +#' which divides the entire study region into multiple subregions. +#' It is usually reflected in an area code system +#' (e.g., FIPS for US Census geographies and +#' Nomenclature of Territorial Units for Statistics (NUTS), etc.). +#' [mirai::daemons] will set the parallel backend then [mirai::mirai_map] +#' will the work by splitting lower level features into +#' several higher level feature group. For details of the terminology +#' in `mirai` package, refer to [`mirai::mirai`]. +#' Each thread will process the number of lower level features +#' in each higher level feature. Please be advised that +#' accessing the same file simultaneously with +#' multiple processes may result in errors. +#' @details +#' In dynamic dots (`...`), `fun_dist` arguments should include +#' x and y where sf/terra class objects or file paths are accepted. +#' Hierarchy is interpreted by the `regions_id` argument first. +#' `regions_id` is assumed to be a field name in the `x` or `y` argument +#' object. It is expected that `regions` represents the higher level +#' boundaries and `x` or `y` in `fun_dist` is the lower level boundaries. +#' However, if that is not the case, with `trim` argument, the function +#' will generate the higher level codes from `regions_id` by extracting +#' the code from the left end (controlled by `length_left`). +#' Whether `x` or `y` is searched is determined by `pad_y` value. +#' `pad_y = TRUE` will make the function attempt to find `regions_id` +#' in `x`, whereas `pad_y = FALSE` will look for `regions_id` at +#' `y`. If the `regions_id` doesn't exist in `x` or `y`, the function +#' will utilize spatial relationship (intersects) to filter the data. +#' Note that dispatching computation by subregions based on the spatial +#' relationship may lead to a slight discrepancy in the result. For +#' example, if the higher and lower level features are not perfectly +#' aligned, there may be some features that are not included or duplicated +#' in the subregions. The function will alert the user if spatial relation- +#' ship is used to filter the data. +#' +#' @note +#' Virtually any sf/terra functions that accept two arguments +#' can be put in `fun_dist`, but please be advised that +#' some spatial operations do not necessarily give the +#' exact result from what would have been done with single thread. +#' For example, distance calculated through this function may return the +#' lower value than actual because the computational region was reduced. +#' This would be the case especially where the target features +#' are spatially sparsely distributed. +#' @param regions `sf`/`SpatVector` object. +#' Computational regions. Only polygons are accepted. +#' @param regions_id character(1). Name of unique ID field in `regions`. +#' The regions will be split by the common level value. +#' @param length_left integer(1). Length of the first characters of +#' the `regions_id` values. Default is NULL, which will not manipulate +#' the `regions_id` values. If the number of characters is not +#' consistent (for example, numerics), the function will alert the user. +#' @param fun_dist `sf`, `terra`, or `chopin` functions. +#' This function should have `x` and `y` arguments. +#' @param pad numeric(1). Padding distance for each subregion defined +#' by `regions_id` or trimmed `regions_id` values. +#' in linear unit of coordinate system. Default is 0, which means +#' each subregion is used as is. If the value is greater than 0, +#' the subregion will be buffered by the value. The padding distance will +#' be applied to `x` (`pad_y = FALSE`) or `y` (`pad_y = TRUE`) to filter +#' the data. +#' @param pad_y logical(1). Whether to filter y with the padded grid. +#' Should be TRUE when x is where the values are calculated. +#' Default is `FALSE`. In the reverse case, like `terra::extent` or +#' `exactextractr::exact_extract`, the raster (x) should be scoped +#' with the padded grid. +#' @param ... Arguments passed to the argument `fun_dist`. +#' @param .debug logical(1). Default is `FALSE` +#' If a unit computation fails, the error message and the `regions_id` +#' value where the error occurred will be included in the output. +#' @returns a data.frame object with computation results. +#' For entries of the results, consult the function used in +#' \code{fun_dist} argument. +#' @seealso +#' [`mirai::mirai_map`], [`mirai::daemons`], [`par_convert_f`] +#' @author Insang Song \email{geoissong@@gmail.com} +#' @examples +#' library(terra) +#' library(sf) +#' library(mirai) +#' options(sf_use_s2 = FALSE) +#' mirai::daemons(4, dispatcher = "process") +#' +#' ncpath <- system.file("extdata/nc_hierarchy.gpkg", package = "chopin") +#' nccnty <- sf::st_read(ncpath, layer = "county") +#' nctrct <- sf::st_read(ncpath, layer = "tracts") +#' ncelev <- +#' system.file("extdata/nc_srtm15_otm.tif", package = "chopin") +#' +#' ncsamp <- +#' sf::st_sample( +#' nccnty, +#' size = 1e4L +#' ) +#' # sfc to sf +#' ncsamp <- sf::st_as_sf(ncsamp) +#' # assign ID +#' ncsamp$kid <- sprintf("K-%05d", seq_len(nrow(ncsamp))) +#' res <- +#' par_hierarchy_mirai( +#' regions = nccnty, +#' regions_id = "GEOID", +#' fun_dist = extract_at, +#' y = nctrct, +#' x = ncelev, +#' id = "GEOID", +#' func = "mean", +#' .debug = TRUE +#' ) +#' @importFrom rlang inject !!! +#' @importFrom mirai mirai_map +#' @importFrom collapse rowbind +#' @importFrom sf sf_use_s2 +#' @importFrom cli cli_abort cli_alert_info +#' @importFrom stats var +#' @export +par_hierarchy_mirai <- + function( + regions, + regions_id = NULL, + length_left = NULL, + pad = 0, + pad_y = FALSE, + fun_dist, + ..., + .debug = TRUE + ) { + args_input <- list(...) + + # is the function sf? + funname <- as.character(substitute(fun_dist)) + # is the function extract_at? + is_extract_at <- any(endsWith(funname, "extract_at")) + funname <- funname[length(funname)] + pkgname <- try(.check_package(funname), silent = TRUE) + + # parallel worker will take terra class objects + # if chopin function is used + class_vec <- + if (pkgname == "chopin") { + if (is_extract_at) { + "sf" + } else { + "terra" + } + } else { + pkgname + } + + # Track spatraster file path + args_input$x <- .check_par_spatraster(args_input$x) + args_input$y <- .check_par_spatraster(args_input$y) + # get hints from the inputs + peek_x <- try(.check_character(args_input$x), silent = TRUE) + peek_y <- try(.check_character(args_input$y), silent = TRUE) + if (inherits(peek_x, "try-error")) { + crs_x <- terra::crs(args_input$x) + } else { + crs_x <- .check_character(args_input$x) + crs_x <- attr(crs_x, "crs") + } + + if (length(regions_id) != 1) { + cli::cli_abort("The length of regions_id is not valid.") + } + + # class identity check + .check_align_fxy(pkgname, args_input$x, args_input$y) + + # Region ID cleaning to get unique high-level IDs + # what if regions refers to a path string? + # vectorize the regions_id + vec_regions_id <- unlist(regions[[regions_id]], use.names = FALSE) + + if (is.null(length_left)) { + cli::cli_alert_info( + sprintf( + "%s is used to stratify the process.", + regions_id + ) + ) + regions_idn <- unique(vec_regions_id) + } else { + cli::cli_alert_info( + sprintf( + paste0( + "Substring is extracted from the left for level definition. ", + "First %d characters are used to stratify the process." + ), + length_left + ) + ) + check_nchar <- nchar(vec_regions_id) + if (var(check_nchar) != 0) { + cli::cli_alert_warning( + paste0( + "The regions_id values are in different lengths. ", + "substr may not work properly." + ) + ) + } + regions_idn <- + unique(substr(vec_regions_id, 1, length_left)) + } + regions_list <- as.list(regions_idn) + + # Main parallelization + results <- + mirai::mirai_map( + .x = seq_along(regions_list), + .f = + function( + i, + fun_dist, args_input, + regions_list, + pad, pad_y, + peek_x, peek_y, + class_vec, + crs_x, + .debug + ) { + # inside each parallel job, feel free to use terra functions + # technically we do not export terra objects, rather calling + # terra functions directly to make objects from scratch in + # parallel workers. + requireNamespace("chopin") + requireNamespace("sf") + requireNamespace("terra") + options(sf_use_s2 = FALSE) + result <- + tryCatch( + { + # subregion header string retrieval + region_i <- regions_list[[i]] + regions_ids <- vec_regions_id + + # subregion object + subregion_in <- + regions[startsWith(regions_ids, region_i), ] + # padding if necessary + # can be expanded to other classes in common packages + # but it elongates the function and lint failure + if (inherits(subregion_in, "sf")) { + subregion_inb <- sf::st_buffer(subregion_in, pad) + } else { + subregion_inb <- terra::buffer(subregion_in, pad) + } + + # interpret the function input x and y + args_input$x <- + .par_screen( + type = peek_x, + input = args_input$x, + input_id = NULL, + out_class = class_vec, + .window = NULL + ) + args_input$y <- + .par_screen( + type = peek_y, + input = args_input$y, + input_id = NULL, + out_class = class_vec, + .window = NULL + ) + + # Here we use twofold approach to filter the data + # 1. If pad_y is TRUE, y is filtered with: + # 1a. the string prefix if the same field `regions_id` + # exists in y + # 1b. Otherwise, it uses the padded subregion + # I believe there would be a succinct and sophisticated + # way, but this is the most straightforward way. + # 2. If pad_y is FALSE, x is filtered with: + # 2a and 2b: ditto as 1a and 1b but x replaces y + if (pad_y) { + # aligning the CRS + args_input$y <- + reproject_std(args_input$y, crs_x) + # if the same regions_id present in the x + if (regions_id %in% names(args_input$x)) { + args_input$x <- + args_input$x[ + startsWith( + unlist(args_input$x[[regions_id]], use.names = FALSE), + region_i + ), + ] + } else { + cli::cli_alert_info( + paste0( + "The regions_id is not found in the x object.", + " Spatial relationship is used to filter x." + ) + ) + args_input$x <- .intersect(args_input$x, subregion_in) + } + args_input$y <- .intersect(args_input$y, subregion_inb) + } else { + args_input$y <- + reproject_std(args_input$y, crs_x) + if (regions_id %in% names(args_input$y)) { + args_input$y <- + args_input$y[ + startsWith( + unlist(args_input$y[[regions_id]], use.names = FALSE), + region_i + ), + ] + } else { + cli::cli_alert_info( + paste0( + "The regions_id is not found in the x object.", + " Spatial relationship is used to filter y." + ) + ) + args_input$y <- .intersect(args_input$y, subregion_in) + } + args_input$x <- .intersect(args_input$x, subregion_inb) + } + + # Main dispatch + res <- rlang::inject(fun_dist(!!!args_input)) + res <- try(as.data.frame(res), silent = TRUE) + cli::cli_alert_info( + sprintf("Your input function at %s is dispatched.\n", + region_i) + ) + + return(res) + }, + error = function(e) { + if (.debug) { + data.frame( + regions_id = regions_list[[i]], + error_message = paste(unlist(e), collapse = " ") + ) + } else { + return(NULL) + } + }) + }, + .args = + list( + fun_dist = fun_dist, + args_input = args_input, + regions_list = regions_list, + peek_x = peek_x, + peek_y = peek_y, + crs_x = crs_x, + pad = pad, + pad_y = pad_y, + class_vec = class_vec, + .debug = .debug + ) + ) + + .progress <- NULL + results[.progress] + + # remove NULL + results <- results[] + results <- results[!vapply(results, is.null, logical(1))] + + # combine results + results <- collapse::rowbind(results, fill = TRUE) + + return(results) + } +# nolint end + + +#' @title Parallelize spatial computation over multiple raster files +#' @family Parallelization +#' @description Large raster files usually exceed the memory capacity in size. +#' This function can be helpful to process heterogenous raster files with +#' homogeneous summary functions. Heterogenous raster files refer to +#' rasters with different spatial extents and resolutions. +#' Cropping a large raster into a small subset even consumes +#' a lot of memory and adds processing time. +#' This function leverages `terra` `SpatRaster` +#' to distribute computation jobs over multiple threads. +#' It is assumed that users have multiple large raster files +#' in their disk, then each file path is assigned to a thread. +#' Each thread will directly read raster values from +#' the disk using C++ pointers that operate in terra functions. +#' For use, it is strongly recommended to use vector data with +#' small and confined spatial extent for computation to avoid +#' out-of-memory error. `y` argument in `fun_dist` will be used as-is. +#' That means no preprocessing or subsetting will be +#' applied. Please be aware of the spatial extent and size of the +#' inputs. +#' @param filenames character. A vector or list of +#' full file paths of raster files. n is the total number of raster files. +#' @param fun_dist terra or chopin functions that accept `SpatRaster` +#' object in an argument. In particular, `x` and `y` arguments +#' should be present and `x` should be a `SpatRaster`. +#' @param ... Arguments passed to the argument `fun_dist`. +#' @param .debug logical(1). Default is `FALSE`. If `TRUE` and +#' a unit computation fails, the error message and the file path +#' where the error occurred will be included in the output. +#' @returns a data.frame object with computation results. +#' For entries of the results, +#' consult the function used in `fun_dist` argument. +#' @author Insang Song \email{geoissong@@gmail.com} +#' @seealso +#' [`mirai::mirai`], [`mirai::mirai_map`], [`mirai::daemons`], +#' [`par_convert_f`] +#' +#' @examples +#' library(terra) +#' library(sf) +#' library(mirai) +#' sf::sf_use_s2(FALSE) +#' mirai::daemons(4, dispatcher = "process") +#' +#' ncpath <- system.file("extdata/nc_hierarchy.gpkg", package = "chopin") +#' nccnty <- sf::st_read(ncpath, layer = "county") +#' ncelev <- +#' system.file("extdata/nc_srtm15_otm.tif", package = "chopin") +#' ncelevras <- terra::rast(ncelev) +#' +#' tdir <- tempdir(check = TRUE) +#' terra::writeRaster(ncelevras, file.path(tdir, "test1.tif"), overwrite = TRUE) +#' terra::writeRaster(ncelevras, file.path(tdir, "test2.tif"), overwrite = TRUE) +#' testfiles <- list.files(tdir, pattern = "tif$", full.names = TRUE) +#' +#' res <- par_multirasters_mirai( +#' filenames = testfiles, +#' fun_dist = extract_at, +#' x = ncelev, +#' y = nccnty, +#' id = "GEOID", +#' func = "mean" +#' ) +#' @importFrom mirai mirai_map +#' @importFrom terra rast +#' @importFrom rlang inject !!! +#' @importFrom collapse rowbind +#' @importFrom cli cli_inform cli_alert_info +#' @export +par_multirasters_mirai <- + function( + filenames, + fun_dist, + ..., + .debug = TRUE + ) { + file_list <- filenames + file_iter <- as.list(seq_along(file_list)) + args_input <- list(...) + + # is the function sf? + funname <- as.character(substitute(fun_dist)) + # is the function extract_at? + is_extract_at <- any(endsWith(funname, "extract_at")) + funname <- funname[length(funname)] + pkgname <- try(.check_package(funname), silent = TRUE) + + # parallel worker will take terra class objects + # if chopin function is used + class_vec <- + if (pkgname == "chopin") { + if (is_extract_at) { + "sf" + } else { + "terra" + } + } else { + pkgname + } + + # Unlike other par_* functions, raster paths are not + # tracked by the function since the raster file paths + # are required to be passed as an argument to each parallel worker. + # y class identification + peek_y <- try(.check_character(args_input$y), silent = TRUE) + + # get hints from the inputs + crs_x <- .check_character(filenames[1]) + + # Main parallelization + results <- + mirai::mirai_map( + .x = file_iter, + .f = + function( + i, + fun_dist, + args_input, + filenames, + peek_y, + class_vec, + crs_x, + .debug + ) { + # inside each parallel job, feel free to use terra functions + # technically we do not export terra objects, rather calling + # terra functions directly to make objects from scratch in + # parallel workers. + requireNamespace("chopin") + requireNamespace("sf") + requireNamespace("terra") + options(sf_use_s2 = FALSE) + result <- + tryCatch({ + if (!"id" %in% names(formals(fun_dist))) { + args_input$id <- NULL + } + + # interpret the function input x and y + args_input$x <- + .par_screen( + type = "raster", + input = filenames[i], + input_id = NULL, + out_class = class_vec, + .window = NULL + ) + args_input$y <- + .par_screen( + type = peek_y, + input = args_input$y, + input_id = NULL, + out_class = class_vec, + .window = NULL + ) + args_input$y <- reproject_std(args_input$y, attr(crs_x, "crs")) + + res <- rlang::inject(fun_dist(!!!args_input)) + cli::cli_alert_info( + sprintf( + "Your input function at %s is dispatched.\n", filenames[i] + ) + ) + res <- try(as.data.frame(res), silent = TRUE) + res$base_raster <- filenames[i] + return(res) + }, error = function(e) { + if (.debug) { + data.frame( + base_raster = filenames[i], + error_message = paste(unlist(e), collapse = " ") + ) + } else { + return(NULL) + } + } + ) + return(result) + }, + .args = + list( + filenames = filenames, + fun_dist = fun_dist, + args_input = args_input, + peek_y = peek_y, + crs_x = crs_x, + class_vec = class_vec, + .debug = .debug + ) + ) + + .progress <- NULL + results[.progress] + + # remove NULL + results <- results[] + results <- results[!vapply(results, is.null, logical(1))] + + # combine results + results <- collapse::rowbind(results, fill = TRUE) + + return(results) + } diff --git a/README.Rmd b/README.Rmd index bfb9adb..ef43e5f 100644 --- a/README.Rmd +++ b/README.Rmd @@ -17,7 +17,9 @@ knitr::opts_chunk$set( # Computation of Spatial Data by Hierarchical and Objective Partitioning of Inputs for Parallel Processing -[![cov](https://docs.ropensci.org/chopin/badges/coverage.svg)](https://github.com/ropensci/chopin/actions) +[![Codecov test +coverage](https://codecov.io/gh/ropensci/chopin/graph/badge.svg)](https://app.codecov.io/gh/chopin/osmapiR) + [![R-CMD-check](https://github.com/ropensci/chopin/actions/workflows/check-standard.yaml/badge.svg)](https://github.com/ropensci/chopin/actions/workflows/check-standard.yaml) [![Status at rOpenSci Software Peer Review](https://badges.ropensci.org/638_status.svg)](https://github.com/ropensci/software-review/issues/638) [![Lifecycle: @@ -48,6 +50,11 @@ __Second,__ users choose the proper data parallelization configuration by creati - `par_multirasters`: parallelize over multiple raster files +- Each of the `par_*` functions introduced above has `mirai` version with a suffix `_mirai` after the function names: `par_grid_mirai`, `par_hierarchy_mirai`, and `par_multirasters`. These functions will work properly after creating daemons with `mirai::daemons`. + +```r +mirai::daemons(4L, dispatcher = "process") +``` For grid partitioning, the entire study area will be divided into partly overlapped grids. We suggest two flowcharts to help which function to use for parallel processing below. The upper flowchart is raster-oriented and the lower is vector-oriented. They are supplementary to each other. When a user follows the raster-oriented one, they might visit the vector-oriented flowchart at each end of the raster-oriented flowchart. @@ -239,6 +246,7 @@ ncpoints_srtm <- ``` + ```{r compare-single-multi} colnames(ncpoints_srtm_mthr)[2] <- "mean_par" ncpoints_compar <- merge(ncpoints_srtm, ncpoints_srtm_mthr) @@ -256,6 +264,31 @@ plot(ncpoints_s[, "mean"], main = "Single-thread", pch = 19, cex = 0.33) plot(ncpoints_m[, "mean_par"], main = "Multi-thread", pch = 19, cex = 0.33) ``` +The same workflow operates on `mirai` dispatchers. + +```{r demo-par-grid-mirai} +future::plan(future::sequential) +mirai::daemons(n = 4L, dispatcher = "process") + +system.time( + ncpoints_srtm_mthr <- + par_grid_mirai( + grids = compregions, + fun_dist = extract_at, + x = srtm, + y = ncpoints, + id = "pid", + radius = 1e4L, + .standalone = FALSE + ) +) + +# remove mirai::daemons +mirai::daemons(0L) + +``` + + ### `chopin::par_hierarchy()`: parallelize geospatial computations using intrinsic data hierarchy We usually have nested/exhaustive hierarchies in real-world datasets. For example, land is organized by administrative/jurisdictional borders where multiple levels exist. In the U.S. context, a state consists of several counties, counties are split into census tracts, and they have a group of block groups. `chopin::par_hierarchy()` leverages such hierarchies to parallelize geospatial operations, which means that a group of lower-level geographic units in a higher-level geography is assigned to a process. A demonstration below shows that census tracts are grouped by their counties then each county will be processed in a CPU thread. @@ -279,6 +312,8 @@ nc_tracts$COUNTY <- substr(nc_tracts$GEOID, 1, 5) #### Extract average SRTM elevations by single and multiple threads ```{r compare-runtime-hierarchy} +future::plan(future.mirai::mirai_multisession, workers = 4L) + # single-thread system.time( nc_elev_tr_single <- diff --git a/README.md b/README.md index 9f2785d..815b352 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,9 @@ -[![cov](https://docs.ropensci.org/chopin/badges/coverage.svg)](https://github.com/ropensci/chopin/actions) +[![Codecov test +coverage](https://codecov.io/gh/ropensci/chopin/graph/badge.svg)](https://app.codecov.io/gh/chopin/osmapiR) + [![R-CMD-check](https://github.com/ropensci/chopin/actions/workflows/check-standard.yaml/badge.svg)](https://github.com/ropensci/chopin/actions/workflows/check-standard.yaml) [![Status at rOpenSci Software Peer Review](https://badges.ropensci.org/638_status.svg)](https://github.com/ropensci/software-review/issues/638) @@ -52,14 +54,25 @@ multiple raster file paths into `par_multirasters()`. **Finally,** users run `par_*()` function with the configurations set above to compute spatial variables from input data in parallel: -- `par_grid`: parallelize over artificial grid polygons that are - generated from the maximum extent of inputs. `par_pad_grid` is used to - generate the grid polygons before running this function. + - `par_grid`: parallelize over artificial grid polygons that are + generated from the maximum extent of inputs. `par_pad_grid` is used + to generate the grid polygons before running this function. -- `par_hierarchy`: parallelize over hierarchy coded in identifier fields - (for example, census blocks in each county in the US) + - `par_hierarchy`: parallelize over hierarchy coded in identifier + fields (for example, census blocks in each county in the US) -- `par_multirasters`: parallelize over multiple raster files + - `par_multirasters`: parallelize over multiple raster files + + - Each of the `par_*` functions introduced above has `mirai` version + with a suffix `_mirai` after the function names: `par_grid_mirai`, + `par_hierarchy_mirai`, and `par_multirasters`. These functions will + work properly after creating daemons with `mirai::daemons`. + + + +``` r +mirai::daemons(4L, dispatcher = "process") +``` For grid partitioning, the entire study area will be divided into partly overlapped grids. We suggest two flowcharts to help which function to @@ -75,14 +88,14 @@ classes for spatial data. Raster-vector overlay is done with `exactextractr`. Three helper functions encapsulate multiple geospatial data calculation steps over multiple CPU threads. -- `extract_at`: extract raster values with point buffers or polygons - with or without kernel weights + - `extract_at`: extract raster values with point buffers or polygons + with or without kernel weights -- `summarize_sedc`: calculate sums of [exponentially decaying - contributions](https://mserre.sph.unc.edu/BMElab_web/SEDCtutorial/index.html) + - `summarize_sedc`: calculate sums of [exponentially decaying + contributions](https://mserre.sph.unc.edu/BMElab_web/SEDCtutorial/index.html) -- `summarize_aw`: area-weighted covariates based on target and reference - polygons + - `summarize_aw`: area-weighted covariates based on target and + reference polygons ### Function selection guide @@ -93,32 +106,33 @@ users with large vector data. In **raster-oriented selection**, we suggest four factors to consider: -- Number of raster files: for multiple files, `par_multirasters` is - recommended. When there are multiple rasters that share the same - extent and resolution, consider stacking the rasters into multilayer - SpatRaster object by calling `terra::rast(filenames)`. -- Raster resolution: We suggest 100 meters as a threshold. Rasters with - resolution coarser than 100 meters and a few layers would be better - for the direct call of `exactextractr::exact_extract()`. -- Raster extent: Using `SpatRaster` in `exactextractr::exact_extract()` - is often minimally affected by the raster extent. -- Memory size: `max_cells_in_memory` argument value of - `exactextractr::exact_extract()`, raster resolution, and the number of - layers in `SpatRaster` are multiplicatively related to the memory - usage. + - Number of raster files: for multiple files, `par_multirasters` is + recommended. When there are multiple rasters that share the same + extent and resolution, consider stacking the rasters into multilayer + SpatRaster object by calling `terra::rast(filenames)`. + - Raster resolution: We suggest 100 meters as a threshold. Rasters + with resolution coarser than 100 meters and a few layers would be + better for the direct call of `exactextractr::exact_extract()`. + - Raster extent: Using `SpatRaster` in + `exactextractr::exact_extract()` is often minimally affected by the + raster extent. + - Memory size: `max_cells_in_memory` argument value of + `exactextractr::exact_extract()`, raster resolution, and the number + of layers in `SpatRaster` are multiplicatively related to the memory + usage. ![](man/figures/README-flowchart-raster.png) For **vector-oriented selection**, we suggest three factors to consider: -- Number of features: When the number of features is over 100,000, - consider using `par_grid` or `par_hierarchy` to split the data into - smaller chunks. -- Hierarchical structure: If the data has a hierarchical structure, - consider using `par_hierarchy` to parallelize the operation. -- Data grouping: If the data needs to be grouped in similar sizes, - consider using `par_pad_balanced` or `par_pad_grid` with - `mode = "grid_quantile"`. + - Number of features: When the number of features is over 100,000, + consider using `par_grid` or `par_hierarchy` to split the data into + smaller chunks. + - Hierarchical structure: If the data has a hierarchical structure, + consider using `par_hierarchy` to parallelize the operation. + - Data grouping: If the data needs to be grouped in similar sizes, + consider using `par_pad_balanced` or `par_pad_grid` with `mode = + "grid_quantile"`. ![](man/figures/README-flowchart-vector.png) @@ -164,7 +178,7 @@ library(dplyr) library(sf) #> Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.3.1; sf_use_s2() is TRUE library(terra) -#> terra 1.7.78 +#> terra 1.7.83 library(future) library(future.mirai) library(mirai) @@ -249,7 +263,7 @@ system.time( ) #> Input is a character. Attempt to read it with terra::rast... #> user system elapsed -#> 4.694 0.067 4.785 +#> 5.009 0.067 4.719 ``` #### Generate regular grid computational regions @@ -336,7 +350,7 @@ system.time( #> Input is a character. Attempt to read it with terra::rast... #> ℹ Task at CGRIDID: 4 is successfully dispatched. #> user system elapsed -#> 0.337 0.017 7.057 +#> 0.325 0.023 7.433 ncpoints_srtm <- extract_at( @@ -373,6 +387,34 @@ plot(ncpoints_m[, "mean_par"], main = "Multi-thread", pch = 19, cex = 0.33) +The same workflow operates on `mirai` dispatchers. + +``` r +future::plan(future::sequential) +mirai::daemons(n = 4L, dispatcher = "process") +#> [1] 4 + +system.time( + ncpoints_srtm_mthr <- + par_grid_mirai( + grids = compregions, + fun_dist = extract_at, + x = srtm, + y = ncpoints, + id = "pid", + radius = 1e4L, + .standalone = FALSE + ) +) +#> ℹ Input is not a character. +#> user system elapsed +#> 0.074 0.000 9.284 + +# remove mirai::daemons +mirai::daemons(0L) +#> [1] 0 +``` + ### `chopin::par_hierarchy()`: parallelize geospatial computations using intrinsic data hierarchy We usually have nested/exhaustive hierarchies in real-world datasets. @@ -395,7 +437,7 @@ path_nchrchy <- file.path(wdir, "nc_hierarchy.gpkg") nc_data <- path_nchrchy nc_county <- sf::st_read(nc_data, layer = "county") #> Reading layer `county' from data source -#> `/tmp/RtmpPmef5d/temp_libpath8faa11b6de6/chopin/extdata/nc_hierarchy.gpkg' +#> `/tmp/RtmpXuCaOL/temp_libpath358e3adc6f09/chopin/extdata/nc_hierarchy.gpkg' #> using driver `GPKG' #> Simple feature collection with 100 features and 1 field #> Geometry type: POLYGON @@ -404,7 +446,7 @@ nc_county <- sf::st_read(nc_data, layer = "county") #> Projected CRS: NAD83 / Conus Albers nc_tracts <- sf::st_read(nc_data, layer = "tracts") #> Reading layer `tracts' from data source -#> `/tmp/RtmpPmef5d/temp_libpath8faa11b6de6/chopin/extdata/nc_hierarchy.gpkg' +#> `/tmp/RtmpXuCaOL/temp_libpath358e3adc6f09/chopin/extdata/nc_hierarchy.gpkg' #> using driver `GPKG' #> Simple feature collection with 2672 features and 1 field #> Geometry type: MULTIPOLYGON @@ -421,6 +463,8 @@ nc_tracts$COUNTY <- substr(nc_tracts$GEOID, 1, 5) #### Extract average SRTM elevations by single and multiple threads ``` r +future::plan(future.mirai::mirai_multisession, workers = 4L) + # single-thread system.time( nc_elev_tr_single <- @@ -432,7 +476,7 @@ system.time( ) #> Input is a character. Attempt to read it with terra::rast... #> user system elapsed -#> 0.501 0.004 0.504 +#> 0.527 0.000 0.491 # hierarchical parallelization system.time( @@ -550,7 +594,7 @@ system.time( #> Input is a character. Attempt to read it with terra::rast...ℹ Your input function at 37055 is dispatched. #> Input is a character. Attempt to read it with terra::rast...ℹ Your input function at 37047 is dispatched. #> user system elapsed -#> 0.247 0.070 1.920 +#> 0.396 0.041 7.102 ``` ### `par_multirasters()`: parallelize over multiple rasters @@ -577,9 +621,9 @@ terra::writeRaster(ncelev, file.path(tdir, "test5.tif"), overwrite = TRUE) # check if the raster files were exported as expected testfiles <- list.files(tdir, pattern = "*.tif$", full.names = TRUE) testfiles -#> [1] "/tmp/RtmpVGq9CW/test1.tif" "/tmp/RtmpVGq9CW/test2.tif" -#> [3] "/tmp/RtmpVGq9CW/test3.tif" "/tmp/RtmpVGq9CW/test4.tif" -#> [5] "/tmp/RtmpVGq9CW/test5.tif" +#> [1] "/tmp/Rtmpe3n6BN/test1.tif" "/tmp/Rtmpe3n6BN/test2.tif" +#> [3] "/tmp/Rtmpe3n6BN/test3.tif" "/tmp/Rtmpe3n6BN/test4.tif" +#> [5] "/tmp/Rtmpe3n6BN/test5.tif" ``` ``` r @@ -596,32 +640,32 @@ system.time( ) #> ℹ Input is not a character. #> Input is a character. Attempt to read it with terra::rast... -#> ℹ Your input function at /tmp/RtmpVGq9CW/test1.tif is dispatched. +#> ℹ Your input function at /tmp/Rtmpe3n6BN/test1.tif is dispatched. #> #> Input is a character. Attempt to read it with terra::rast... -#> ℹ Your input function at /tmp/RtmpVGq9CW/test2.tif is dispatched. +#> ℹ Your input function at /tmp/Rtmpe3n6BN/test2.tif is dispatched. #> #> Input is a character. Attempt to read it with terra::rast... -#> ℹ Your input function at /tmp/RtmpVGq9CW/test3.tif is dispatched. +#> ℹ Your input function at /tmp/Rtmpe3n6BN/test3.tif is dispatched. #> #> Input is a character. Attempt to read it with terra::rast... -#> ℹ Your input function at /tmp/RtmpVGq9CW/test4.tif is dispatched. +#> ℹ Your input function at /tmp/Rtmpe3n6BN/test4.tif is dispatched. #> #> Input is a character. Attempt to read it with terra::rast... -#> ℹ Your input function at /tmp/RtmpVGq9CW/test5.tif is dispatched. +#> ℹ Your input function at /tmp/Rtmpe3n6BN/test5.tif is dispatched. #> user system elapsed -#> 1.149 0.183 2.393 +#> 1.329 0.087 2.830 knitr::kable(head(res)) ``` -| mean | base_raster | -|----------:|:--------------------------| -| 136.80203 | /tmp/RtmpVGq9CW/test1.tif | -| 189.76170 | /tmp/RtmpVGq9CW/test1.tif | -| 231.16968 | /tmp/RtmpVGq9CW/test1.tif | -| 98.03845 | /tmp/RtmpVGq9CW/test1.tif | -| 41.23463 | /tmp/RtmpVGq9CW/test1.tif | -| 270.96933 | /tmp/RtmpVGq9CW/test1.tif | +| mean | base\_raster | +| --------: | :------------------------ | +| 136.80203 | /tmp/Rtmpe3n6BN/test1.tif | +| 189.76170 | /tmp/Rtmpe3n6BN/test1.tif | +| 231.16968 | /tmp/Rtmpe3n6BN/test1.tif | +| 98.03845 | /tmp/Rtmpe3n6BN/test1.tif | +| 41.23463 | /tmp/Rtmpe3n6BN/test1.tif | +| 270.96933 | /tmp/Rtmpe3n6BN/test1.tif | ``` r @@ -657,7 +701,7 @@ pnts <- sf::st_as_sf(pnts) pnts$pid <- sprintf("RPID-%04d", seq(1, 5000)) rd1 <- sf::st_read(path_ncrd1) #> Reading layer `ncroads_first' from data source -#> `/tmp/RtmpPmef5d/temp_libpath8faa11b6de6/chopin/extdata/ncroads_first.gpkg' +#> `/tmp/RtmpXuCaOL/temp_libpath358e3adc6f09/chopin/extdata/ncroads_first.gpkg' #> using driver `GPKG' #> Simple feature collection with 620 features and 4 fields #> Geometry type: MULTILINESTRING @@ -710,11 +754,11 @@ system.time( restr <- terra::nearest(x = terra::vect(pntst), y = terra::vect(rd1t)) ) #> user system elapsed -#> 0.368 0.000 0.376 +#> 0.38 0.00 0.36 pnt_path <- file.path(tdir, "pntst.gpkg") sf::st_write(pntst, pnt_path) -#> Writing layer `pntst' to data source `/tmp/RtmpVGq9CW/pntst.gpkg' using driver `GPKG' +#> Writing layer `pntst' to data source `/tmp/Rtmpe3n6BN/pntst.gpkg' using driver `GPKG' #> Writing 5000 features with 1 fields and geometry type Point. # we use four threads that were configured above @@ -760,11 +804,13 @@ system.time( #> ℹ Input is a character. Trying to read with terra . #> ℹ Task at CGRIDID: 8 is successfully dispatched. #> user system elapsed -#> 0.070 0.000 0.502 +#> 0.066 0.000 0.374 ``` -- We will compare the results from the single-thread and multi-thread - calculation. + - We will compare the results from the single-thread and multi-thread + calculation. + + ``` r resj <- merge(restr, resd, by = c("from_x", "from_y")) diff --git a/_pkgdown.yml b/_pkgdown.yml index 7d84f0f..bc60d8e 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -16,6 +16,9 @@ reference: - par_grid - par_hierarchy - par_multirasters + - par_grid_mirai + - par_hierarchy_mirai + - par_multirasters_mirai - title: "`par_grid` preparation functions" desc: > Regular grid or equal number partitioning to run `par_grid` diff --git a/codemeta.json b/codemeta.json index 0e8aa73..efb7f38 100644 --- a/codemeta.json +++ b/codemeta.json @@ -8,13 +8,13 @@ "codeRepository": "https://github.com/ropensci/chopin", "issueTracker": "https://github.com/ropensci/chopin/issues", "license": "https://spdx.org/licenses/MIT", - "version": "0.8.3", + "version": "0.9.0", "programmingLanguage": { "@type": "ComputerLanguage", "name": "R", "url": "https://r-project.org" }, - "runtimePlatform": "R version 4.4.1 (2024-06-14)", + "runtimePlatform": "R version 4.4.2 (2024-10-31)", "author": [ { "@type": "Person", @@ -108,18 +108,6 @@ }, "sameAs": "https://CRAN.R-project.org/package=future.mirai" }, - { - "@type": "SoftwareApplication", - "identifier": "mirai", - "name": "mirai", - "provider": { - "@id": "https://cran.r-project.org", - "@type": "Organization", - "name": "Comprehensive R Archive Network (CRAN)", - "url": "https://cran.r-project.org" - }, - "sameAs": "https://CRAN.R-project.org/package=mirai" - }, { "@type": "SoftwareApplication", "identifier": "knitr", @@ -344,6 +332,19 @@ "sameAs": "https://CRAN.R-project.org/package=terra" }, "14": { + "@type": "SoftwareApplication", + "identifier": "mirai", + "name": "mirai", + "version": ">= 1.3.0", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=mirai" + }, + "15": { "@type": "SoftwareApplication", "identifier": "collapse", "name": "collapse", @@ -355,7 +356,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=collapse" }, - "15": { + "16": { "@type": "SoftwareApplication", "identifier": "lifecycle", "name": "lifecycle", @@ -369,7 +370,7 @@ }, "SystemRequirements": "netcdf" }, - "fileSize": "27889.584KB", + "fileSize": "27935.52KB", "releaseNotes": "https://github.com/ropensci/chopin/blob/master/NEWS.md", "readme": "https://github.com/ropensci/chopin/blob/main/README.md", "contIntegration": ["https://github.com/ropensci/chopin/actions", "https://github.com/ropensci/chopin/actions/workflows/check-standard.yaml"], diff --git a/man/chopin-package.Rd b/man/chopin-package.Rd index 9fc81ae..a667492 100644 --- a/man/chopin-package.Rd +++ b/man/chopin-package.Rd @@ -198,18 +198,17 @@ extracting various climate/weather datasets. \strong{Notes on data restrictions} \code{chopin} works best with \strong{two-dimensional} (\strong{planar}) geometries. -Users should disable \code{s2} spherical geometry mode in \code{sf} by setting. +Users should disable \code{s2} spherical geometry mode in \code{sf} by setting +\code{sf::sf_use_s2(FALSE)}. Running any \code{chopin} functions at spherical or three-dimensional (e.g., including M/Z dimensions) geometries may produce incorrect or unexpected results. - -\if{html}{\out{