From d3ac136c8b75624cd660b4eabba454d654a01b84 Mon Sep 17 00:00:00 2001
From: CJ Yetman <cyetman@rmi.org>
Date: Mon, 2 Dec 2024 14:23:00 +0100
Subject: [PATCH] add `sector_classification` argument to `match_name()` (#509)

---
 NEWS.md                          |  6 +++++
 R/add_sector_and_borderline.R    | 19 +++++-----------
 R/match_name.R                   | 38 +++++++++++++++-----------------
 R/restructure_abcd.R             |  8 +++----
 man/match_name.Rd                | 35 +++++++++++++----------------
 tests/testthat/test-match_name.R | 12 +++++-----
 6 files changed, 54 insertions(+), 64 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 7fc7326f..ea038e78 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,11 @@
 # r2dii.match (development version)
 
+## Lifecycle changes
+
+### Breaking changes
+
+* `match_name()` gains a new argument in its signature, `sector_classification`, which is placed before the `...` argument. Where users have used `...` to pass additional unnamed arguments by position, they will need to update their code to pass these arguments by name so that they are not confused as an input to `sector_classfication`. The new argument `sector_classification` is optional and defaults to `r2dii.data::sector_classifications`. Explicitly passing a `sector_classification` to `match_name()`, enables users to use their own sector classification systems to match loan books to the abcd, assuming the `sector_classification` is mapped appropriately to PACTA sectors and has the same format as `r2dii.data::sector_classifications`. There is no more need to pass own sector classifications to `match_name()` via options.
+
 # r2dii.match 0.2.1
 
 * r2dii.match is now [stable](https://lifecycle.r-lib.org/articles/stages.html).
diff --git a/R/add_sector_and_borderline.R b/R/add_sector_and_borderline.R
index 1f6da8a7..ccad5b38 100644
--- a/R/add_sector_and_borderline.R
+++ b/R/add_sector_and_borderline.R
@@ -31,7 +31,7 @@
 #' out %>%
 #'   select(new_columns, everything())
 #' @noRd
-add_sector_and_borderline <- function(data) {
+add_sector_and_borderline <- function(data, sector_classification = default_sector_classification()) {
   crucial <- c(
     "sector_classification_system", "sector_classification_direct_loantaker"
   )
@@ -40,40 +40,31 @@ add_sector_and_borderline <- function(data) {
     check_crucial_names(crucial) %>%
     # Coerce crucial columns to character for more robust join()
     purrr::modify_at(crucial, as.character) %>%
-    check_classification(column = "sector_classification_system") %>%
-    check_classification(column = "sector_classification_direct_loantaker")
+    check_classification(column = "sector_classification_system", classification = sector_classification) %>%
+    check_classification(column = "sector_classification_direct_loantaker", classification = sector_classification)
 
   out <- left_join(
-    checked, get_classifications(),
+    checked, sector_classification,
     by = set_names(c("code_system", "code"), crucial)
   )
 
   restore_typeof(data, out, crucial)
 }
 
-get_classifications <- function() {
-  custom_sector_classifications() %||% default_sector_classification()
-}
-
 default_sector_classification <- function() {
   r2dii.data::sector_classifications
 }
 
-custom_sector_classifications <- function() {
-  getOption("r2dii.match.sector_classifications")
-}
-
 check_classification <- function(data,
                                  column,
                                  # FIXME: Remove needless argument?
                                  classification = NULL) {
-  classification <- classification %||% get_classifications()
   # To call columns from both data and classification with the same colname
   reference <- rename_as_loanbook(classification)
 
   all_unknown <- !any(data[[column]] %in% reference[[column]])
   known <- unique(reference[[column]])
-  if (all_unknown && is.null(custom_sector_classifications())) {
+  if (all_unknown) {
     abort_all_sec_classif_unknown(column, known)
   }
 
diff --git a/R/match_name.R b/R/match_name.R
index c33611ab..3467b402 100644
--- a/R/match_name.R
+++ b/R/match_name.R
@@ -7,12 +7,6 @@
 #' are assigned. The similarity between aliases in each of the loanbook and abcd
 #'  is scored using [stringdist::stringsim()].
 #'
-#' @section Package options:
-#' `r2dii.match.sector_classifications`: Allows you to use your own
-#' `sector_classififications` instead of the default. This feature is
-#' experimental and may be dropped and/or become a new argument to
-#' `match_name()`.
-#'
 #' @template alias-assign
 #' @template ignores-but-preserves-existing-groups
 #'
@@ -35,6 +29,9 @@
 #'   character string, it assumes identical join columns between `loanbook` and
 #'   `abcd`. If a named character vector, it uses the name as the join column of `loanbook` and
 #'   the value as the join column of `abcd`.
+#' @param sector_classification A data frame containing sector classifications
+#'   in the same format as `r2dii.data::sector_classifications`. The default
+#'   value is `r2dii.data::sector_classifications`.
 #' @param ... Arguments passed on to [stringdist::stringsim()].
 #'
 #' @family main functions
@@ -77,17 +74,9 @@
 #'
 #' match_name(loanbook, abcd, min_score = 0.9)
 #'
-#' # Use your own `sector_classifications`
-#' your_classifications <- tibble(
-#'   sector = "power",
-#'   borderline = FALSE,
-#'   code = "D35.11",
-#'   code_system = "XYZ"
-#' )
-#'
 #' # match on LEI
 #' loanbook <- tibble(
-#'   sector_classification_system = "XYZ",
+#'   sector_classification_system = "NACE",
 #'   sector_classification_direct_loantaker = "D35.11",
 #'   id_ultimate_parent = "UP15",
 #'   name_ultimate_parent = "Won't fuzzy match",
@@ -102,9 +91,15 @@
 #'   lei = "LEI123"
 #' )
 #'
-#' match_name(loanbook, abcd, join_by = c(lei_direct_loantaker = "lei"))
+#' match_name(loanbook, abcd, join_id = c(lei_direct_loantaker = "lei"))
 #'
-#' restore <- options(r2dii.match.sector_classifications = your_classifications)
+#' # Use your own `sector_classifications`
+#' your_classifications <- tibble(
+#'   sector = "power",
+#'   borderline = FALSE,
+#'   code = "D35.11",
+#'   code_system = "XYZ"
+#' )
 #'
 #' loanbook <- tibble(
 #'   sector_classification_system = "XYZ",
@@ -120,7 +115,7 @@
 #'   sector = "power"
 #' )
 #'
-#' match_name(loanbook, abcd)
+#' match_name(loanbook, abcd, sector_classification = your_classifications)
 #'
 #' # Cleanup
 #' options(restore)
@@ -133,6 +128,7 @@ match_name <- function(loanbook,
                        p = 0.1,
                        overwrite = NULL,
                        join_id = NULL,
+                       sector_classification = default_sector_classification(),
                        ...) {
   restore <- options(datatable.allow.cartesian = TRUE)
   on.exit(options(restore), add = TRUE)
@@ -152,7 +148,7 @@ match_name <- function(loanbook,
 
     prep_abcd <- dplyr::distinct(prep_abcd)
 
-    prep_lbk <- may_add_sector_and_borderline(loanbook)
+    prep_lbk <- may_add_sector_and_borderline(loanbook, sector_classification = sector_classification)
     prep_lbk <- distinct(prep_lbk)
 
     join_matched <- dplyr::inner_join(
@@ -188,6 +184,7 @@ match_name <- function(loanbook,
       method = method,
       p = p,
       overwrite = overwrite,
+      sector_classification = sector_classification,
       ...
     )
   } else {
@@ -217,6 +214,7 @@ match_name_impl <- function(loanbook,
                             method = "jw",
                             p = 0.1,
                             overwrite = NULL,
+                            sector_classification = default_sector_classification(),
                             ...) {
 
   old_groups <- dplyr::groups(loanbook)
@@ -226,7 +224,7 @@ match_name_impl <- function(loanbook,
   if (!allow_reserved_columns()) abort_reserved_column(loanbook)
   loanbook_rowid <- tibble::rowid_to_column(loanbook)
 
-  prep_lbk <- restructure_loanbook(loanbook_rowid, overwrite = overwrite)
+  prep_lbk <- restructure_loanbook(loanbook_rowid, overwrite = overwrite, sector_classification = sector_classification)
   prep_abcd <- restructure_abcd(abcd)
 
   if (by_sector) {
diff --git a/R/restructure_abcd.R b/R/restructure_abcd.R
index 14a273b7..d4a82824 100644
--- a/R/restructure_abcd.R
+++ b/R/restructure_abcd.R
@@ -51,7 +51,7 @@ restructure_abcd <- function(data) {
 #'
 #' restructure_loanbook(lbk, overwrite = overwrite_demo)
 #' @noRd
-restructure_loanbook <- function(data, overwrite = NULL) {
+restructure_loanbook <- function(data, overwrite = NULL, sector_classification = default_sector_classification()) {
   check_prep_loanbook_overwrite(overwrite)
   check_prepare_loanbook_data(data)
 
@@ -63,7 +63,7 @@ restructure_loanbook <- function(data, overwrite = NULL) {
   name_level <- extract_level_names(data, prefix = "name_")
   important_columns <- c("rowid", id_level, name_level)
 
-  out <- may_add_sector_and_borderline(data)
+  out <- may_add_sector_and_borderline(data, sector_classification = sector_classification)
   out <- select(
     out, all_of(c("rowid", important_columns, "sector", "borderline"))
   )
@@ -77,7 +77,7 @@ restructure_loanbook <- function(data, overwrite = NULL) {
   out
 }
 
-may_add_sector_and_borderline <- function(data) {
+may_add_sector_and_borderline <- function(data, sector_classification = default_sector_classification()) {
   if (lacks_borderline(data) || lacks_sector(data)) {
     abort("Must have both `sector` and `borderline`")
   }
@@ -85,7 +85,7 @@ may_add_sector_and_borderline <- function(data) {
   if (has_sector(data) & has_borderline(data)) {
     data2 <- data
   } else {
-    data2 <- add_sector_and_borderline(data)
+    data2 <- add_sector_and_borderline(data, sector_classification = sector_classification)
   }
 
   data2
diff --git a/man/match_name.Rd b/man/match_name.Rd
index 9fd5e97f..84746c39 100644
--- a/man/match_name.Rd
+++ b/man/match_name.Rd
@@ -13,6 +13,7 @@ match_name(
   p = 0.1,
   overwrite = NULL,
   join_id = NULL,
+  sector_classification = default_sector_classification(),
   ...
 )
 }
@@ -44,6 +45,10 @@ character string, it assumes identical join columns between \code{loanbook} and
 \code{abcd}. If a named character vector, it uses the name as the join column of \code{loanbook} and
 the value as the join column of \code{abcd}.}
 
+\item{sector_classification}{A data frame containing sector classifications
+in the same format as \code{r2dii.data::sector_classifications}. The default
+value is \code{r2dii.data::sector_classifications}.}
+
 \item{...}{Arguments passed on to \code{\link[stringdist:stringsim]{stringdist::stringsim()}}.}
 }
 \value{
@@ -80,14 +85,6 @@ can be \code{name_direct_loantaker}, \verb{name_intermediate_parent*} and
 are assigned. The similarity between aliases in each of the loanbook and abcd
 is scored using \code{\link[stringdist:stringsim]{stringdist::stringsim()}}.
 }
-\section{Package options}{
-
-\code{r2dii.match.sector_classifications}: Allows you to use your own
-\code{sector_classififications} instead of the default. This feature is
-experimental and may be dropped and/or become a new argument to
-\code{match_name()}.
-}
-
 \section{Assigning aliases}{
 
 The transformation process used to compare names between loanbook and abcd
@@ -118,17 +115,9 @@ match_name(loanbook, abcd)
 
 match_name(loanbook, abcd, min_score = 0.9)
 
-# Use your own `sector_classifications`
-your_classifications <- tibble(
-  sector = "power",
-  borderline = FALSE,
-  code = "D35.11",
-  code_system = "XYZ"
-)
-
 # match on LEI
 loanbook <- tibble(
-  sector_classification_system = "XYZ",
+  sector_classification_system = "NACE",
   sector_classification_direct_loantaker = "D35.11",
   id_ultimate_parent = "UP15",
   name_ultimate_parent = "Won't fuzzy match",
@@ -143,9 +132,15 @@ abcd <- tibble(
   lei = "LEI123"
 )
 
-match_name(loanbook, abcd, join_by = c(lei_direct_loantaker = "lei"))
+match_name(loanbook, abcd, join_id = c(lei_direct_loantaker = "lei"))
 
-restore <- options(r2dii.match.sector_classifications = your_classifications)
+# Use your own `sector_classifications`
+your_classifications <- tibble(
+  sector = "power",
+  borderline = FALSE,
+  code = "D35.11",
+  code_system = "XYZ"
+)
 
 loanbook <- tibble(
   sector_classification_system = "XYZ",
@@ -161,7 +156,7 @@ abcd <- tibble(
   sector = "power"
 )
 
-match_name(loanbook, abcd)
+match_name(loanbook, abcd, sector_classification = your_classifications)
 
 # Cleanup
 options(restore)
diff --git a/tests/testthat/test-match_name.R b/tests/testthat/test-match_name.R
index 1c8ee124..7ea6efc3 100644
--- a/tests/testthat/test-match_name.R
+++ b/tests/testthat/test-match_name.R
@@ -700,19 +700,19 @@ test_that("errors if any id_loan is duplicated (#349)", {
   expect_error(class = "duplicated_id_loan", match_name(lbk, abcd))
 })
 
-test_that("allows custom `sector_classifications` via options() (#354)", {
-  loanbook <- fake_lbk(sector_classification_system = "XYZ")
+test_that("allows custom `sector_classification`", {
+  loanbook <- fake_lbk(
+    sector_classification_system = "XYZ",
+    sector_classification_direct_loantaker = "D35.11"
+  )
   abcd <- fake_abcd()
   custom_classification <- tibble::tribble(
     ~sector,       ~borderline,  ~code, ~code_system,
     "power",             FALSE, "D35.11",        "XYZ",
   )
 
-  # Allow users to inject their own `sector_classifications`
-  old <- options(r2dii.match.sector_classifications = custom_classification)
-  out <- match_name(loanbook, abcd)
+  out <- match_name(loanbook, abcd, sector_classification = custom_classification)
   expect_equal(nrow(out), 1L)
-  options(old)
 })
 
 test_that("with `join_id`, joins as expected (#135)", {