-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Check for non-ascii characters and block validation (#517)
* Add check for invalid characters before upload & validation * Update description/news * Update _pkgdown.yml * Use lowercase v in macOS dev GH Action * Update V8 version in renv.lock * Try adding env var for downloading static libv8 from Abby's comment * Fix my mess up with renaming things * Add a colon * Add more information to summary * Move v8 env var * Lint
- Loading branch information
Showing
12 changed files
with
251 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
#' @title Check for non-ascii characters
#'
#' @description Check every column of `data` for values that contain
#'   non-ascii characters or escaped character sequences (for example
#'   `<0xa0>` or an HTML entity).
#'
#' @param data Data to check
#' @inheritParams check_values
#' @return `NULL` if `data` is `NULL`. Otherwise a condition object indicating
#'   whether the data contains columns with a non-ascii character; on failure
#'   the names of the offending columns are supplied as the condition's data.
#' @export
#' @examples
#' dat <- tibble::tibble(
#'   fails1 = c("study 1", "study&amp;2"),
#'   succeeds = c("file1.ext", "file2.ext"),
#'   fails2 = c("foo<0xa0>", "bar")
#' )
#' check_invalid_characters(dat)
check_invalid_characters <- function(data,
                                     success_msg = "There are no invalid characters",
                                     fail_msg = "There is an invalid character in a column") { # nolint
  ## Nothing to check
  if (is.null(data)) {
    return(NULL)
  }
  ## One logical per column: does any value in the column look invalid?
  has_invalid <- purrr::map_lgl(data, contains_invalid)
  behavior <- glue::glue(
    "Only standard ascii characters are allowed."
  )
  if (!any(has_invalid)) {
    return(
      check_pass(
        msg = success_msg,
        behavior = behavior
      )
    )
  }
  ## Report the names of the columns that contain an invalid value
  check_condition(
    msg = fail_msg,
    behavior = behavior,
    data = names(has_invalid)[has_invalid],
    type = "check_fail"
  )
}
|
||
#' Check if a string contains an invalid character
#'
#' A value is considered invalid if it cannot be represented in plain ASCII
#' (this includes byte sequences that are not valid UTF-8), or if it contains
#' an escaped character sequence: text starting with `<0x`, or an `&...;` /
#' `&#...;` entity. `NA` values are never flagged.
#'
#' @noRd
#' @param text String, or vector of strings, that might have special
#'   characters
#' @return `TRUE` if any string contains an invalid character, else `FALSE`
contains_invalid <- function(text) {
  ## Don't flag NA values as unacceptable
  values <- as.character(text[!is.na(text)])
  if (length(values) == 0) {
    return(FALSE)
  }
  ## iconv() returns NA for any element holding a byte it cannot convert.
  ## Convert to plain "ASCII" rather than "ASCII//TRANSLIT": transliteration
  ## is platform dependent and may substitute a look-alike (or "?") for a
  ## non-ascii character instead of returning NA, hiding the problem.
  non_ascii <- is.na(iconv(values, from = "UTF-8", to = "ASCII"))
  ## Check for other types of invalid patterns
  pattern <- "<0x|&[a-zA-Z0-9]+;|&#[0-9]+;"
  escaped <- grepl(pattern, values, useBytes = TRUE)
  any(non_ascii | escaped)
}
|
||
## Build one newline-separated summary from a list of invalid character
## checks. Only elements inheriting from "check_fail" contribute; each is
## rendered with summarize_check().
summarize_invalid_char_check <- function(check_list) {
  is_failure <- function(check) {
    inherits(check, "check_fail")
  }
  failed_checks <- check_list[purrr::map_lgl(check_list, is_failure)]
  summaries <- purrr::map_chr(failed_checks, summarize_check)
  glue::glue_collapse(summaries, sep = "\n")
}
|
||
## Render a single check result as text: a fixed reminder that only ascii is
## allowed, followed by the check's message and a comma-separated list of the
## values held in its `data` element.
summarize_check <- function(check_result) {
  details <- glue::glue_collapse(x = check_result$data, sep = ", ")
  summary_text <- glue::glue(
    "Only standard ascii characters are allowed in the files.\n{check_result$message}: {details}" # nolint
  )
  summary_text
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
context("check-invalid-characters.R") | ||
|
||
test_that("check_invalid_characters returns check_pass if valid", {
  ## Spaces, dashes, underscores, slashes and backslashes are all acceptable
  valid_data <- tibble::tibble(
    study = c("study 1", "study-2", "study_3"),
    path = c("file.ext", "dir/file.ext", "drive:dir\\xdir\\file.ext")
  )
  result <- check_invalid_characters(valid_data)
  expect_true(inherits(result, "check_pass"))
})
|
||
test_that("check_invalid_characters returns check_fail if invalid", {
  ## Non-ascii
  dat1 <- tibble::tibble(
    study = c("study 1", "study\xF02", "study_3"),
    path = c("file.ext", "dir/file.ext", "drive:dir\\xdir\\file.ext")
  )
  expect_true(inherits(
    check_invalid_characters(dat1),
    "check_fail"
  ))
  ## Oddities that start with <0x
  dat2 <- tibble::tibble(
    study = c("study 1", "study-02", "study_3"),
    path = c("fi<0xa0>le.ext", "dir/file.ext", "drive:dir\\xdir\\file.ext")
  )
  expect_true(inherits(
    check_invalid_characters(dat2),
    "check_fail"
  ))
  ## Oddities that start with & and end with ;
  ## (the entity must stay literal text; a bare double quote here would end
  ## the string early)
  dat3 <- tibble::tibble(
    study = c("study 1", "study-02", "study_3"),
    path = c("fi&quote;le.ext", "dir/file.ext", "drive:dir\\xdir\\file.ext")
  )
  expect_true(inherits(
    check_invalid_characters(dat3),
    "check_fail"
  ))
  ## Oddities that start with &# and end with ;
  dat4 <- tibble::tibble(
    study = c("study 1", "study-02", "study&#168;3"),
    path = c("file.ext", "dir/file.ext", "drive:dir\\xdir\\file.ext")
  )
  expect_true(inherits(
    check_invalid_characters(dat4),
    "check_fail"
  ))
})
|
||
test_that("check_invalid_characters returns column name if invalid value", {
  ## One column: only `study` holds an invalid value
  one_bad <- tibble::tibble(
    study = c("study 1", "study\xF02", "study_3"),
    path = c("file.ext", "dir/file.ext", "drive:dir\\dir\\file.ext")
  )
  expect_equal(check_invalid_characters(one_bad)$data, "study")

  ## Two columns: both `study` and `path` hold an invalid value
  two_bad <- tibble::tibble(
    study = c("study 1", "study\xF02", "study_3"),
    path = c("fi<0xa0>le.ext", "dir/file.ext", "drive:dir\xdir\file.ext")
  )
  expect_equal(check_invalid_characters(two_bad)$data, c("study", "path"))
})
|
||
test_that("contains_invalid returns true for typical invalid characters", {
  ## Escaped byte sequences
  expect_true(contains_invalid("<0x00>"))
  expect_true(contains_invalid("<0x"))
  ## Entities written out as literal text
  expect_true(contains_invalid("&quote;"))
  expect_true(contains_invalid("&pound;"))
  ## A byte that is not valid UTF-8
  expect_true(contains_invalid("\xF0"))
})
|
||
test_that("contains_invalid returns false for valid characters", {
  ## A lone ampersand or a literal backslash is fine; only complete
  ## `&...;` entities and `<0x` sequences are rejected
  valid_values <- c(
    "foo",
    "foo-bar",
    "foo_bar",
    "&",
    "& foo",
    "foo\\xbar"
  )
  for (value in valid_values) {
    expect_false(contains_invalid(value))
  }
})
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters