Skip to content

Commit

Permalink
Check for non-ascii characters and block validation (#517)
Browse files Browse the repository at this point in the history
* Add check for invalid characters before upload & validation

* Update description/news

* Update _pkgdown.yml

* Use lowercase v in macOS dev GH Action

* Update V8 version in renv.lock

* Try adding env var for downloading static libv8 from Abby's comment

* Fix my mess up with renaming things

* Add a colon

* Add more information to summary

* Move v8 env var

* Lint
  • Loading branch information
Aryllen authored Sep 7, 2021
1 parent 33a76ab commit 4fb56b2
Show file tree
Hide file tree
Showing 12 changed files with 251 additions and 5 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/R-CMD-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,11 @@ jobs:
- name: Install system dependencies (macOS)
if: runner.os == 'macOS' && matrix.config.r == 'devel'
run: |
brew install V8
brew install libgit2
- name: Install dependencies
run: |
Sys.setenv(DOWNLOAD_STATIC_LIBV8 = 1)
library(remotes)
deps <- readRDS('depends.Rds')
deps[['installed']] <- vapply(deps[['package']], remotes:::local_sha, character(1))
Expand Down
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: dccvalidator
Title: Metadata Validation for Data Coordinating Centers
Version: 0.3.0.9025
Version: 0.3.0.9026
Authors@R:
c(person(given = "Kara",
family = "Woo",
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ export(check_files_manifest)
export(check_ids_match)
export(check_indiv_ids_dup)
export(check_indiv_ids_match)
export(check_invalid_characters)
export(check_keys)
export(check_parent_syn)
export(check_pass)
Expand Down
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
- Add ability to specify biospecimen templates based on type
- Fixed logic for resetting biospecimen type UI
- Fixed typo in `check_all()` that led to manifests being checked against assay templates
- Add `check_invalid_characters()` and include in `validator_server()` to block upload and validation

# dccvalidator v0.3.0

Expand Down
32 changes: 32 additions & 0 deletions R/check-all.R
Original file line number Diff line number Diff line change
Expand Up @@ -344,3 +344,35 @@ check_all <- function(data, annotations, syn, study = NA, samples_table = NA) {
)
res
}

## Check all for invalid characters
check_all_invalid_char <- function(manifest, indiv, biosp, assay) {

# Invalid characters ---------------------------------------------------------
invalid_characters_manifest <- check_invalid_characters(
manifest,
success_msg = "There are no invalid characters in the manifest",
fail_msg = "There are invalid characters in the manifest columns"
)
invalid_characters_individual <- check_invalid_characters(
indiv,
success_msg = "There are no invalid characters in the individual metadata",
fail_msg = "There are invalid characters in the individual metadata columns"
)
invalid_characters_biospecimen <- check_invalid_characters(
biosp,
success_msg = "There are no invalid characters in the biospecimen metadata",
fail_msg = "There are invalid characters in the biospecimen metadata columns" #nolint
)
invalid_characters_assay <- check_invalid_characters(
assay,
success_msg = "There are no invalid characters in the assay metadata",
fail_msg = "There are invalid characters in the assay metadata columns"
)
list(
invalid_characters_manifest = invalid_characters_manifest,
invalid_characters_individual = invalid_characters_individual,
invalid_characters_biospecimen = invalid_characters_biospecimen,
invalid_characters_assay = invalid_characters_assay
)
}
76 changes: 76 additions & 0 deletions R/check-invalid-characters.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#' @title Check for non-ascii characters
#'
#' @description Check for non-ascii characters in columns.
#'
#' @param data Data to check
#' @inheritParams check_values
#' @return A condition object indicating whether the data contains columns with
#' a non-ascii character.
#' @export
#' @examples
#' dat <- tibble::tibble(
#' fails1 = c("study 1", "study&amp;2"),
#' succeeds = c("file1.ext", "file2.ext"),
#' fails2 = c("foo<0xa0>", "bar")
#' )
#' check_invalid_characters(dat)
check_invalid_characters <- function(data,
success_msg = "There are no invalid characters",
fail_msg = "There is an invalid character in a column") { #nolint
if (is.null(data)) {
return(NULL)
}
has_invalid <- purrr::map_lgl(data, ~ contains_invalid(.))
behavior <- glue::glue(
"Only standard ascii characters are allowed."
)
if (any(has_invalid)) {
check_condition(
msg = fail_msg,
behavior = behavior,
data = names(has_invalid)[has_invalid],
type = "check_fail"
)
} else {
check_pass(
msg = success_msg,
behavior = behavior
)
}
}

#' Check if a string contains an invalid character
#'
#' @noRd
#' @param text String, or vector of strings, that might have special
#' characters
#' @return `TRUE` if any string contains an invalid character, else `FALSE`
contains_invalid <- function(text) {
any(purrr::map_lgl(text, function(value) {
## Don't flag NA values as unacceptable
if (is.na(value)) {
return(FALSE)
}
conv <- iconv(value, from = "UTF-8", to = "ASCII//TRANSLIT")
## Will recieve NA if there's an unacceptable character
## Check for other types of invalid patterns
pattern <- "<0x|&[a-zA-Z0-9]+;|&#[0-9]+;"
if (is.na(conv) | grepl(pattern, value, useBytes = TRUE)) {
return(TRUE)
}
return(FALSE)
}))
}

## Summarize all invalid character checks
summarize_invalid_char_check <- function(check_list) {
## Only checks that are check_fail
failed <- purrr::map_lgl(check_list, ~ inherits(., "check_fail"))
failed_text <- purrr::map_chr(check_list[failed], ~ summarize_check(.))
glue::glue_collapse(failed_text, sep = "\n")
}

summarize_check <- function(check_result) {
details <- glue::glue_collapse(check_result$data, sep = ", ")
glue::glue("Only standard ascii characters are allowed in the files.\n{check_result$message}: {details}") #nolint
}
17 changes: 17 additions & 0 deletions R/mod-validator.R
Original file line number Diff line number Diff line change
Expand Up @@ -504,6 +504,23 @@ validator_server <- function(input, output, session, study_names, species_list,
"Manifest file must be .tsv or .txt, not .csv"
)
)
## Check for any invalid characters in files before continuing
invalid_checks <- check_all_invalid_char(
manifest = manifest(),
indiv = indiv(),
biosp = biosp(),
assay = assay()
)
has_invalid <- purrr::map_lgl(
invalid_checks,
~ inherits(., "check_fail")
)
validate(
need(
!any(has_invalid),
message = summarize_invalid_char_check(invalid_checks)
)
)

## Upload only the files that have been given
if (!is.null(indiv())) {
Expand Down
1 change: 1 addition & 0 deletions _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ reference:
- check_ages_over_90
- check_duplicate_paths
- check_complete_ids
- check_invalid_characters

- title: Validation helpers
desc: >
Expand Down
34 changes: 34 additions & 0 deletions man/check_invalid_characters.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions renv.lock
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,10 @@
},
"V8": {
"Package": "V8",
"Version": "3.4.0",
"Version": "3.4.2",
"Source": "Repository",
"Repository": "CRAN",
"Hash": "8c5e8c58ff4fc8dfc67e24985005159b"
"Hash": "b181d4eb2904da0897a742a559964173"
},
"askpass": {
"Package": "askpass",
Expand Down
85 changes: 85 additions & 0 deletions tests/testthat/test-check-invalid-characters.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
context("check-invalid-characters.R")

test_that("check_invalid_characters returns check_pass if valid", {
dat <- tibble::tibble(
study = c("study 1", "study-2", "study_3"),
path = c("file.ext", "dir/file.ext", "drive:dir\\xdir\\file.ext")
)
expect_true(inherits(
check_invalid_characters(dat),
"check_pass"
))
})

test_that("check_invalid_characters returns check_fail if invalid", {
## Non-ascii
dat1 <- tibble::tibble(
study = c("study 1", "study\xF02", "study_3"),
path = c("file.ext", "dir/file.ext", "drive:dir\xdir\file.ext")
)
expect_true(inherits(
check_invalid_characters(dat1),
"check_fail"
))
## Oddities that start with <0x
dat2 <- tibble::tibble(
study = c("study 1", "study-02", "study_3"),
path = c("fi<0xa0>le.ext", "dir/file.ext", "drive:dir\xdir\file.ext")
)
expect_true(inherits(
check_invalid_characters(dat2),
"check_fail"
))
## Oddities that start with & and end with ;
dat3 <- tibble::tibble(
study = c("study 1", "study-02", "study_3"),
path = c("fi&quote;le.ext", "dir/file.ext", "drive:dir\xdir\file.ext")
)
expect_true(inherits(
check_invalid_characters(dat3),
"check_fail"
))
## Oddities that start with &# and end with ;
dat4 <- tibble::tibble(
study = c("study 1", "study-02", "study&#168;3"),
path = c("file.ext", "dir/file.ext", "drive:dir\xdir\file.ext")
)
expect_true(inherits(
check_invalid_characters(dat4),
"check_fail"
))
})

test_that("check_invalid_characters returns column name if invalid value", {
## One column
dat1 <- tibble::tibble(
study = c("study 1", "study\xF02", "study_3"),
path = c("file.ext", "dir/file.ext", "drive:dir\\dir\\file.ext")
)
res1 <- check_invalid_characters(dat1)
expect_equal(res1$data, "study")
## Two columns
dat2 <- tibble::tibble(
study = c("study 1", "study\xF02", "study_3"),
path = c("fi<0xa0>le.ext", "dir/file.ext", "drive:dir\xdir\file.ext")
)
res2 <- check_invalid_characters(dat2)
expect_equal(res2$data, c("study", "path"))
})

test_that("contains_invalid returns true for typical invalid characters", {
expect_true(contains_invalid("<0x00>"))
expect_true(contains_invalid("<0x"))
expect_true(contains_invalid("&quote;"))
expect_true(contains_invalid("&#163;"))
expect_true(contains_invalid("\xF0"))
})

test_that("contains_invalid returns false for valid characters", {
expect_false(contains_invalid("foo"))
expect_false(contains_invalid("foo-bar"))
expect_false(contains_invalid("foo_bar"))
expect_false(contains_invalid("&"))
expect_false(contains_invalid("& foo"))
expect_false(contains_invalid("foo\\xbar"))
})
1 change: 0 additions & 1 deletion tests/testthat/test-metadata-template-dictionary.R
Original file line number Diff line number Diff line change
Expand Up @@ -385,4 +385,3 @@ test_that("get_template_synIDs does not return JSON schema ids", {
expected <- c("syn111111", "syn222222")
expect_equal(get_template_synIDs(dat), expected)
})

0 comments on commit 4fb56b2

Please sign in to comment.