From d3ac136c8b75624cd660b4eabba454d654a01b84 Mon Sep 17 00:00:00 2001 From: CJ Yetman Date: Mon, 2 Dec 2024 14:23:00 +0100 Subject: [PATCH] add `sector_classification` argument to `match_name()` (#509) --- NEWS.md | 6 +++++ R/add_sector_and_borderline.R | 19 +++++----------- R/match_name.R | 38 +++++++++++++++----------------- R/restructure_abcd.R | 8 +++---- man/match_name.Rd | 35 +++++++++++++---------------- tests/testthat/test-match_name.R | 12 +++++----- 6 files changed, 54 insertions(+), 64 deletions(-) diff --git a/NEWS.md b/NEWS.md index 7fc7326f..ea038e78 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,11 @@ # r2dii.match (development version) +## Lifecycle changes + +### Breaking changes + +* `match_name()` gains a new argument in its signature, `sector_classification`, which is placed before the `...` argument. Where users have used `...` to pass additional unnamed arguments by position, they will need to update their code to pass these arguments by name so that they are not confused as an input to `sector_classfication`. The new argument `sector_classification` is optional and defaults to `r2dii.data::sector_classifications`. Explicitly passing a `sector_classification` to `match_name()`, enables users to use their own sector classification systems to match loan books to the abcd, assuming the `sector_classification` is mapped appropriately to PACTA sectors and has the same format as `r2dii.data::sector_classifications`. There is no more need to pass own sector classifications to `match_name()` via options. + # r2dii.match 0.2.1 * r2dii.match is now [stable](https://lifecycle.r-lib.org/articles/stages.html). diff --git a/R/add_sector_and_borderline.R b/R/add_sector_and_borderline.R index 1f6da8a7..ccad5b38 100644 --- a/R/add_sector_and_borderline.R +++ b/R/add_sector_and_borderline.R @@ -31,7 +31,7 @@ #' out %>% #' select(new_columns, everything()) #' @noRd -add_sector_and_borderline <- function(data) { +add_sector_and_borderline <- function(data, sector_classification = default_sector_classification()) { crucial <- c( "sector_classification_system", "sector_classification_direct_loantaker" ) @@ -40,40 +40,31 @@ add_sector_and_borderline <- function(data) { check_crucial_names(crucial) %>% # Coerce crucial columns to character for more robust join() purrr::modify_at(crucial, as.character) %>% - check_classification(column = "sector_classification_system") %>% - check_classification(column = "sector_classification_direct_loantaker") + check_classification(column = "sector_classification_system", classification = sector_classification) %>% + check_classification(column = "sector_classification_direct_loantaker", classification = sector_classification) out <- left_join( - checked, get_classifications(), + checked, sector_classification, by = set_names(c("code_system", "code"), crucial) ) restore_typeof(data, out, crucial) } -get_classifications <- function() { - custom_sector_classifications() %||% default_sector_classification() -} - default_sector_classification <- function() { r2dii.data::sector_classifications } -custom_sector_classifications <- function() { - getOption("r2dii.match.sector_classifications") -} - check_classification <- function(data, column, # FIXME: Remove needless argument? classification = NULL) { - classification <- classification %||% get_classifications() # To call columns from both data and classification with the same colname reference <- rename_as_loanbook(classification) all_unknown <- !any(data[[column]] %in% reference[[column]]) known <- unique(reference[[column]]) - if (all_unknown && is.null(custom_sector_classifications())) { + if (all_unknown) { abort_all_sec_classif_unknown(column, known) } diff --git a/R/match_name.R b/R/match_name.R index c33611ab..3467b402 100644 --- a/R/match_name.R +++ b/R/match_name.R @@ -7,12 +7,6 @@ #' are assigned. The similarity between aliases in each of the loanbook and abcd #' is scored using [stringdist::stringsim()]. #' -#' @section Package options: -#' `r2dii.match.sector_classifications`: Allows you to use your own -#' `sector_classififications` instead of the default. This feature is -#' experimental and may be dropped and/or become a new argument to -#' `match_name()`. -#' #' @template alias-assign #' @template ignores-but-preserves-existing-groups #' @@ -35,6 +29,9 @@ #' character string, it assumes identical join columns between `loanbook` and #' `abcd`. If a named character vector, it uses the name as the join column of `loanbook` and #' the value as the join column of `abcd`. +#' @param sector_classification A data frame containing sector classifications +#' in the same format as `r2dii.data::sector_classifications`. The default +#' value is `r2dii.data::sector_classifications`. #' @param ... Arguments passed on to [stringdist::stringsim()]. #' #' @family main functions @@ -77,17 +74,9 @@ #' #' match_name(loanbook, abcd, min_score = 0.9) #' -#' # Use your own `sector_classifications` -#' your_classifications <- tibble( -#' sector = "power", -#' borderline = FALSE, -#' code = "D35.11", -#' code_system = "XYZ" -#' ) -#' #' # match on LEI #' loanbook <- tibble( -#' sector_classification_system = "XYZ", +#' sector_classification_system = "NACE", #' sector_classification_direct_loantaker = "D35.11", #' id_ultimate_parent = "UP15", #' name_ultimate_parent = "Won't fuzzy match", @@ -102,9 +91,15 @@ #' lei = "LEI123" #' ) #' -#' match_name(loanbook, abcd, join_by = c(lei_direct_loantaker = "lei")) +#' match_name(loanbook, abcd, join_id = c(lei_direct_loantaker = "lei")) #' -#' restore <- options(r2dii.match.sector_classifications = your_classifications) +#' # Use your own `sector_classifications` +#' your_classifications <- tibble( +#' sector = "power", +#' borderline = FALSE, +#' code = "D35.11", +#' code_system = "XYZ" +#' ) #' #' loanbook <- tibble( #' sector_classification_system = "XYZ", @@ -120,7 +115,7 @@ #' sector = "power" #' ) #' -#' match_name(loanbook, abcd) +#' match_name(loanbook, abcd, sector_classification = your_classifications) #' #' # Cleanup #' options(restore) @@ -133,6 +128,7 @@ match_name <- function(loanbook, p = 0.1, overwrite = NULL, join_id = NULL, + sector_classification = default_sector_classification(), ...) { restore <- options(datatable.allow.cartesian = TRUE) on.exit(options(restore), add = TRUE) @@ -152,7 +148,7 @@ match_name <- function(loanbook, prep_abcd <- dplyr::distinct(prep_abcd) - prep_lbk <- may_add_sector_and_borderline(loanbook) + prep_lbk <- may_add_sector_and_borderline(loanbook, sector_classification = sector_classification) prep_lbk <- distinct(prep_lbk) join_matched <- dplyr::inner_join( @@ -188,6 +184,7 @@ match_name <- function(loanbook, method = method, p = p, overwrite = overwrite, + sector_classification = sector_classification, ... ) } else { @@ -217,6 +214,7 @@ match_name_impl <- function(loanbook, method = "jw", p = 0.1, overwrite = NULL, + sector_classification = default_sector_classification(), ...) { old_groups <- dplyr::groups(loanbook) @@ -226,7 +224,7 @@ match_name_impl <- function(loanbook, if (!allow_reserved_columns()) abort_reserved_column(loanbook) loanbook_rowid <- tibble::rowid_to_column(loanbook) - prep_lbk <- restructure_loanbook(loanbook_rowid, overwrite = overwrite) + prep_lbk <- restructure_loanbook(loanbook_rowid, overwrite = overwrite, sector_classification = sector_classification) prep_abcd <- restructure_abcd(abcd) if (by_sector) { diff --git a/R/restructure_abcd.R b/R/restructure_abcd.R index 14a273b7..d4a82824 100644 --- a/R/restructure_abcd.R +++ b/R/restructure_abcd.R @@ -51,7 +51,7 @@ restructure_abcd <- function(data) { #' #' restructure_loanbook(lbk, overwrite = overwrite_demo) #' @noRd -restructure_loanbook <- function(data, overwrite = NULL) { +restructure_loanbook <- function(data, overwrite = NULL, sector_classification = default_sector_classification()) { check_prep_loanbook_overwrite(overwrite) check_prepare_loanbook_data(data) @@ -63,7 +63,7 @@ restructure_loanbook <- function(data, overwrite = NULL) { name_level <- extract_level_names(data, prefix = "name_") important_columns <- c("rowid", id_level, name_level) - out <- may_add_sector_and_borderline(data) + out <- may_add_sector_and_borderline(data, sector_classification = sector_classification) out <- select( out, all_of(c("rowid", important_columns, "sector", "borderline")) ) @@ -77,7 +77,7 @@ restructure_loanbook <- function(data, overwrite = NULL) { out } -may_add_sector_and_borderline <- function(data) { +may_add_sector_and_borderline <- function(data, sector_classification = default_sector_classification()) { if (lacks_borderline(data) || lacks_sector(data)) { abort("Must have both `sector` and `borderline`") } @@ -85,7 +85,7 @@ may_add_sector_and_borderline <- function(data) { if (has_sector(data) & has_borderline(data)) { data2 <- data } else { - data2 <- add_sector_and_borderline(data) + data2 <- add_sector_and_borderline(data, sector_classification = sector_classification) } data2 diff --git a/man/match_name.Rd b/man/match_name.Rd index 9fd5e97f..84746c39 100644 --- a/man/match_name.Rd +++ b/man/match_name.Rd @@ -13,6 +13,7 @@ match_name( p = 0.1, overwrite = NULL, join_id = NULL, + sector_classification = default_sector_classification(), ... ) } @@ -44,6 +45,10 @@ character string, it assumes identical join columns between \code{loanbook} and \code{abcd}. If a named character vector, it uses the name as the join column of \code{loanbook} and the value as the join column of \code{abcd}.} +\item{sector_classification}{A data frame containing sector classifications +in the same format as \code{r2dii.data::sector_classifications}. The default +value is \code{r2dii.data::sector_classifications}.} + \item{...}{Arguments passed on to \code{\link[stringdist:stringsim]{stringdist::stringsim()}}.} } \value{ @@ -80,14 +85,6 @@ can be \code{name_direct_loantaker}, \verb{name_intermediate_parent*} and are assigned. The similarity between aliases in each of the loanbook and abcd is scored using \code{\link[stringdist:stringsim]{stringdist::stringsim()}}. } -\section{Package options}{ - -\code{r2dii.match.sector_classifications}: Allows you to use your own -\code{sector_classififications} instead of the default. This feature is -experimental and may be dropped and/or become a new argument to -\code{match_name()}. -} - \section{Assigning aliases}{ The transformation process used to compare names between loanbook and abcd @@ -118,17 +115,9 @@ match_name(loanbook, abcd) match_name(loanbook, abcd, min_score = 0.9) -# Use your own `sector_classifications` -your_classifications <- tibble( - sector = "power", - borderline = FALSE, - code = "D35.11", - code_system = "XYZ" -) - # match on LEI loanbook <- tibble( - sector_classification_system = "XYZ", + sector_classification_system = "NACE", sector_classification_direct_loantaker = "D35.11", id_ultimate_parent = "UP15", name_ultimate_parent = "Won't fuzzy match", @@ -143,9 +132,15 @@ abcd <- tibble( lei = "LEI123" ) -match_name(loanbook, abcd, join_by = c(lei_direct_loantaker = "lei")) +match_name(loanbook, abcd, join_id = c(lei_direct_loantaker = "lei")) -restore <- options(r2dii.match.sector_classifications = your_classifications) +# Use your own `sector_classifications` +your_classifications <- tibble( + sector = "power", + borderline = FALSE, + code = "D35.11", + code_system = "XYZ" +) loanbook <- tibble( sector_classification_system = "XYZ", @@ -161,7 +156,7 @@ abcd <- tibble( sector = "power" ) -match_name(loanbook, abcd) +match_name(loanbook, abcd, sector_classification = your_classifications) # Cleanup options(restore) diff --git a/tests/testthat/test-match_name.R b/tests/testthat/test-match_name.R index 1c8ee124..7ea6efc3 100644 --- a/tests/testthat/test-match_name.R +++ b/tests/testthat/test-match_name.R @@ -700,19 +700,19 @@ test_that("errors if any id_loan is duplicated (#349)", { expect_error(class = "duplicated_id_loan", match_name(lbk, abcd)) }) -test_that("allows custom `sector_classifications` via options() (#354)", { - loanbook <- fake_lbk(sector_classification_system = "XYZ") +test_that("allows custom `sector_classification`", { + loanbook <- fake_lbk( + sector_classification_system = "XYZ", + sector_classification_direct_loantaker = "D35.11" + ) abcd <- fake_abcd() custom_classification <- tibble::tribble( ~sector, ~borderline, ~code, ~code_system, "power", FALSE, "D35.11", "XYZ", ) - # Allow users to inject their own `sector_classifications` - old <- options(r2dii.match.sector_classifications = custom_classification) - out <- match_name(loanbook, abcd) + out <- match_name(loanbook, abcd, sector_classification = custom_classification) expect_equal(nrow(out), 1L) - options(old) }) test_that("with `join_id`, joins as expected (#135)", {