r-lib · MichaelChirico · Nov 14, 2023 · Nov 10, 2023 · Nov 12, 2023 · Nov 12, 2023
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -155,6 +155,7 @@ Collate:
     'regex_subset_linter.R'
     'repeat_linter.R'
     'routine_registration_linter.R'
+    'sample_int_linter.R'
     'scalar_in_linter.R'
     'semicolon_linter.R'
     'seq_linter.R'

diff --git a/NAMESPACE b/NAMESPACE
@@ -118,6 +118,7 @@ export(redundant_ifelse_linter)
 export(regex_subset_linter)
 export(repeat_linter)
 export(routine_registration_linter)
+export(sample_int_linter)
 export(sarif_output)
 export(scalar_in_linter)
 export(semicolon_linter)

diff --git a/NEWS.md b/NEWS.md
@@ -4,6 +4,10 @@
 
 * More helpful errors for invalid configs (#2253, @MichaelChirico).
 
+### New linters
+
+* `sample_int_linter()` for encouraging `sample.int(n, ...)` over equivalents like `sample(1:n, ...)` (part of #884, @MichaelChirico).
+
 ### Lint accuracy fixes: removing false positives
 
 * `unreachable_code_linter()` ignores reachable code in inline functions like `function(x) if (x > 2) stop() else x` (#2259, @MEO265).

diff --git a/R/sample_int_linter.R b/R/sample_int_linter.R
@@ -0,0 +1,60 @@
+#' Require usage of sample.int(n, m, ...) over sample(1:n, m, ...)
+#'
+#' [sample.int()] is preferable to `sample()` for the case of sampling numbers
+#'   between 1 and `n`. `sample` calls `sample.int()` "under the hood".
+#'
+#' @evalRd rd_tags("sample_int_linter")
+#' @seealso [linters] for a complete list of linters available in lintr.
+#' @export
+sample_int_linter <- function() {
+  # looking for anything like sample(1: that doesn't come after a $ extraction
+  # exclude TRUE/FALSE for sample(replace = TRUE, ...) usage. better
+  #   would be match.arg() but this also works.
+  xpath <- glue("
+  //SYMBOL_FUNCTION_CALL[text() = 'sample']
+    /parent::expr[not(OP-DOLLAR or OP-AT)]
+    /following-sibling::expr[1][
+      (
+        expr[1]/NUM_CONST[text() = '1' or text() = '1L']
+        and OP-COLON
+      )
+      or expr/SYMBOL_FUNCTION_CALL[text() = 'seq_len']
+      or (
+        expr/SYMBOL_FUNCTION_CALL[text() = 'seq']
+        and (
+          count(expr) = 2
+          or (
+            expr[2]/NUM_CONST[text() = '1' or text() = '1L']
+            and not(SYMBOL_SUB[
+              text() = 'by'
+              and not(following-sibling::expr[1]/NUM_CONST[text() = '1' or text() = '1L'])
+            ])
+          )
+        )
+      )
+      or NUM_CONST[not(text() = 'TRUE' or text() = 'FALSE')]
+    ]
+    /parent::expr
+  ")
+
+  Linter(function(source_expression) {
+    if (!is_lint_level(source_expression, "expression")) {
+      return(list())
+    }
+
+    xml <- source_expression$xml_parsed_content
+
+    bad_expr <- xml_find_all(xml, xpath)
+    first_call <- xp_call_name(bad_expr, depth = 2L)
+    original <- sprintf("%s(n)", first_call)
+    original[!is.na(xml_find_first(bad_expr, "expr[2]/OP-COLON"))] <- "1:n"
+    original[!is.na(xml_find_first(bad_expr, "expr[2]/NUM_CONST"))] <- "n"
+
+    xml_nodes_to_lints(
+      bad_expr,
+      source_expression = source_expression,
+      lint_message = glue("sample.int(n, m, ...) is preferable to sample({original}, m, ...)."),
+      type = "warning"
+    )
+  })
+}
diff --git a/inst/lintr/linters.csv b/inst/lintr/linters.csv
@@ -75,6 +75,7 @@ redundant_ifelse_linter,best_practices efficiency consistency configurable
 regex_subset_linter,best_practices efficiency
 repeat_linter,style readability
 routine_registration_linter,best_practices efficiency robustness
+sample_int_linter,efficiency readability robustness
 scalar_in_linter,readability consistency best_practices efficiency
 semicolon_linter,style readability default configurable
 semicolon_terminator_linter,style readability deprecated configurable

diff --git a/man/efficiency_linters.Rd b/man/efficiency_linters.Rd
diff --git a/man/linters.Rd b/man/linters.Rd
diff --git a/man/readability_linters.Rd b/man/readability_linters.Rd
diff --git a/man/robustness_linters.Rd b/man/robustness_linters.Rd
diff --git a/man/sample_int_linter.Rd b/man/sample_int_linter.Rd
diff --git a/tests/testthat/test-sample_int_linter.R b/tests/testthat/test-sample_int_linter.R
@@ -0,0 +1,95 @@
+test_that("sample_int_linter skips allowed usages", {
+  linter <- sample_int_linter()
+
+  expect_lint("sample(n, m)", NULL, linter)
+  expect_lint("sample(n, m, TRUE)", NULL, linter)
+  expect_lint("sample(n, m, prob = 1:n/n)", NULL, linter)
+  expect_lint("sample(foo(x), m, TRUE)", NULL, linter)
+  expect_lint("sample(n, replace = TRUE)", NULL, linter)
+
+  expect_lint("sample(10:1, m)", NULL, linter)
+})
+
+test_that("sample_int_linter blocks simple disallowed usages", {
+  linter <- sample_int_linter()
+  lint_msg <- rex::rex("sample.int(n, m, ...) is preferable to sample(1:n, m, ...).")
+
+  expect_lint("sample(1:10, 2)", lint_msg, linter)
+  # also matches literal integer
+  expect_lint("sample(1L:10L, 2)", lint_msg, linter)
+  expect_lint("sample(1:n, 2)", lint_msg, linter)
+  expect_lint("sample(1:k, replace = TRUE)", lint_msg, linter)
+  expect_lint("sample(1:foo(x), prob = bar(x))", lint_msg, linter)
+})
+
+test_that("sample_int_linter blocks sample(seq_len(n), ...) as well", {
+  expect_lint(
+    "sample(seq_len(10), 2)",
+    rex::rex("sample.int(n, m, ...) is preferable to sample(seq_len(n), m, ...)."),
+    sample_int_linter()
+  )
+})
+
+test_that("sample_int_linter blocks sample(seq(n)) and sample(seq(1, ...))", {
+  linter <- sample_int_linter()
+  lint_msg <- rex::rex("sample.int(n, m, ...) is preferable to sample(seq(n), m, ...).")
+
+  expect_lint("sample(seq(n), 5)", lint_msg, linter)
+  expect_lint("sample(seq(1, 10), 5)", lint_msg, linter)
+  expect_lint("sample(seq(1, 10, by = 1), 5)", lint_msg, linter)
+  expect_lint("sample(seq(1L, 10, by = 1L), 5)", lint_msg, linter)
+
+  # lint doesn't apply when by= is used (except when set to literal 1)
+  expect_lint("sample(seq(1, 10, by = 2), 5)", NULL, linter)
+  expect_lint("sample(seq(1, 10, by = n), 5)", NULL, linter)
+})
+
+test_that("sample_int_linter catches literal integer/numeric in the first arg", {
+  linter <- sample_int_linter()
+  lint_msg <- rex::rex("sample.int(n, m, ...) is preferable to sample(n, m, ...).")
+
+  expect_lint("sample(10L, 4)", lint_msg, linter)
+  expect_lint("sample(10, 5)", lint_msg, linter)
+})
+
+test_that("sample_int_linter skips TRUE or FALSE in the first argument", {
+  linter <- sample_int_linter()
+
+  expect_lint("sample(replace = TRUE, letters)", NULL, linter)
+  expect_lint("sample(replace = FALSE, letters)", NULL, linter)
+})
+
+test_that("sample_int_linter skips x$sample() usage", {
+  linter <- sample_int_linter()
+  lint_msg <- rex::rex("sample.int(n, m, ...) is preferable to sample(n, m, ...).")
+
+  expect_lint("foo$sample(1L)", NULL, linter)
+  expect_lint("foo$sample(1:10)", NULL, linter)
+  expect_lint("foo$sample(seq_len(10L))", NULL, linter)
+  # ditto for '@' slot extraction
+  expect_lint("foo@sample(1L)", NULL, linter)
+
+  # however, base::sample qualification is still caught
+  expect_lint("base::sample(10L)", lint_msg, linter)
+
+  # but also, not everything "below" a $ extraction is skipped
+  expect_lint("foo$bar(sample(10L))", lint_msg, linter)
+})
+
+test_that("multiple lints are generated correctly", {
+  expect_lint(
+    trim_some("{
+      sample(1:10, 2)
+      sample(10, 2)
+      sample(seq_len(10), 2)
+      sample(seq(10), 2)
+    }"),
+    list(
+      rex::rex("sample(1:n"),
+      rex::rex("sample(n"),
+      rex::rex("sample(seq_len(n)"),
+      rex::rex("sample(seq(n)")
+    ),
+    sample_int_linter()
+  )
+})