Skip to content

Commit

Permalink
Merge pull request #43 from finddx/sample-classification
Browse files Browse the repository at this point in the history
better find duplicates function
  • Loading branch information
m-mburu authored Jun 26, 2024
2 parents 70f0b8c + 62839c6 commit 1182ac5
Show file tree
Hide file tree
Showing 11 changed files with 139 additions and 8 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ export(get_site_id)
export(get_specimen_type_id)
export(groupby_vec)
export(make_clean_os_names)
export(mark_all_duplicates)
export(num_thousandth_sep)
export(nums_clean)
export(parse_all_sites)
Expand Down
2 changes: 1 addition & 1 deletion R/gen_clinical_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ gen_clinical_data <- function(clinical_data,
stop("ppid_col is not present in the additional data set")
}
clinical_data <- merge(additional_data,
clinical_data, by = ppid_col )
clinical_data, by = ppid_col, all.x = TRUE)


}else{
Expand Down
27 changes: 27 additions & 0 deletions R/mark_all_duplicates.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# WARNING - Generated by {fusen} from dev/sample_selections.Rmd: do not edit by hand

#' Mark All Duplicates in a Vector
#'
#' This function identifies all duplicate elements in a vector, marking both the first
#' occurrence and subsequent duplicates.
#'
#' @param vec A numeric or character vector where duplicates are to be identified.
#'
#' @return A logical vector of the same length as `vec`, where `TRUE` indicates that
#' the corresponding element occurs more than once in `vec` (first occurrence included).
#'
#' @export
#' @examples
#'
#' vec <- c("apple", "banana", "apple", "cherry", "banana")
#' mark_all_duplicates(vec)
mark_all_duplicates <- function(vec) {
  # duplicated() leaves the first member of each repeated group unflagged.
  # Scanning once from each end and OR-ing the results flags every member
  # of a repeated group, first occurrence included.
  seen_earlier <- duplicated(vec, fromLast = FALSE)
  seen_later <- duplicated(vec, fromLast = TRUE)

  seen_earlier | seen_later
}

12 changes: 10 additions & 2 deletions R/timestamp_to_date.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,24 +31,32 @@ timestamp_to_date <- function(timestamp, date_cols = NULL){

}


#' Convert Numeric Timestamp to Date
#'
#' This method converts a numeric timestamp to a datetime object.
#'
#' @param timestamp Numeric timestamp in milliseconds (it is divided by 1000 before conversion).
#' @param date_cols Names of the timestamp columns to convert if the object timestamp is a data.frame. This is not needed for this method
#' @param date_cols Names of the timestamp columns to convert if the object timestamp is a data.frame.
#'
#' @return doesn't return anything because the data frame is converted to data.table and function applied in place.
#' @return A datetime object.
#'
#' @export


timestamp_to_date.numeric <- function(timestamp, date_cols = NULL) {
  # `date_cols` is accepted for signature parity with the generic but is
  # never read by this method.

  # Guard for direct calls that bypass S3 dispatch: anything non-numeric is
  # handed back untouched, with a warning.
  if (!is.numeric(timestamp)) {
    warning("Input timestamp is not numeric. Skipping conversion.")
    return(timestamp)
  }

  # Scale down by 1000 before conversion (presumably milliseconds -> seconds).
  seconds <- as.numeric(timestamp) / 1000
  lubridate::as_datetime(seconds)
}


#' Convert Timestamp Columns in a Data Frame to Date
#'
#' This method converts timestamp columns in a data frame to date columns.
Expand Down
12 changes: 10 additions & 2 deletions dev/OpenSpecimenAPI.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -118,24 +118,32 @@ timestamp_to_date <- function(timestamp, date_cols = NULL){
}
#' Convert Numeric Timestamp to Date
#'
#' This method converts a numeric timestamp to a datetime object.
#'
#' @param timestamp Numeric timestamp in milliseconds (it is divided by 1000 before conversion).
#' @param date_cols Names of the timestamp columns to convert if the object timestamp is a data.frame. This is not needed for this method
#' @param date_cols Names of the timestamp columns to convert if the object timestamp is a data.frame.
#'
#' @return doesn't return anything because the data frame is converted to data.table and function applied in place.
#' @return A datetime object.
#'
#' @export
timestamp_to_date.numeric <- function(timestamp, date_cols = NULL) {
  # `date_cols` is accepted for signature parity with the generic but is
  # never read by this method.

  # Guard for direct calls that bypass S3 dispatch: anything non-numeric is
  # handed back untouched, with a warning.
  if (!is.numeric(timestamp)) {
    warning("Input timestamp is not numeric. Skipping conversion.")
    return(timestamp)
  }

  # Scale down by 1000 before conversion (presumably milliseconds -> seconds).
  seconds <- as.numeric(timestamp) / 1000
  lubridate::as_datetime(seconds)
}
#' Convert Timestamp Columns in a Data Frame to Date
#'
#' This method converts timestamp columns in a data frame to date columns.
Expand Down
2 changes: 2 additions & 0 deletions dev/config_fusen.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ sample_selections.Rmd:
- R/create_save_workbook.R
- R/gen_clinical_data.R
- R/make_clean_os_names.R
- R/mark_all_duplicates.R
- R/process_save_selected_aliquots.R
- R/read_os_data.R
- R/samples_distributed_summary.R
Expand All @@ -124,6 +125,7 @@ sample_selections.Rmd:
- tests/testthat/test-create_save_workbook.R
- tests/testthat/test-gen_clinical_data.R
- tests/testthat/test-samples_distributed_summary.R
- tests/testthat/test-mark_all_duplicates.R
vignettes: vignettes/sample-selections-vignette.Rmd
inflate:
flat_file: dev/sample_selections.Rmd
Expand Down
41 changes: 40 additions & 1 deletion dev/sample_selections.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -1256,7 +1256,7 @@ gen_clinical_data <- function(clinical_data,
stop("ppid_col is not present in the additional data set")
}
clinical_data <- merge(additional_data,
clinical_data, by = ppid_col )
clinical_data, by = ppid_col, all.x = TRUE)
}else{
Expand Down Expand Up @@ -1498,6 +1498,45 @@ test_that("samples_distributed_summary works", {
```


# mark_all_duplicates

```{r function-mark_all_duplicates}
#' Mark All Duplicates in a Vector
#'
#' This function identifies all duplicate elements in a vector, marking both the first
#' occurrence and subsequent duplicates.
#'
#' @param vec A numeric or character vector where duplicates are to be identified.
#'
#' @return A logical vector of the same length as `vec`, where `TRUE` indicates that
#' the corresponding element occurs more than once in `vec` (first occurrence included).
#'
#' @export
mark_all_duplicates <- function(vec) {
  # duplicated() leaves the first member of each repeated group unflagged.
  # Scanning once from each end and OR-ing the results flags every member
  # of a repeated group, first occurrence included.
  seen_earlier <- duplicated(vec, fromLast = FALSE)
  seen_later <- duplicated(vec, fromLast = TRUE)

  seen_earlier | seen_later
}
```

```{r example-mark_all_duplicates}
# "apple" and "banana" each occur twice, so every one of their positions is
# flagged; "cherry" occurs once and is not.
vec <- c("apple", "banana", "apple", "cherry", "banana")
mark_all_duplicates(vec)
# Expected: TRUE TRUE TRUE FALSE TRUE
```

```{r tests-mark_all_duplicates}
test_that("mark_all_duplicates works", {
  expect_true(inherits(mark_all_duplicates, "function"))

  # Every member of a repeated group is flagged, including the first one.
  expect_identical(
    mark_all_duplicates(c("apple", "banana", "apple", "cherry", "banana")),
    c(TRUE, TRUE, TRUE, FALSE, TRUE)
  )

  # A vector without repeats yields all FALSE.
  expect_identical(mark_all_duplicates(c(1, 2, 3)), c(FALSE, FALSE, FALSE))

  # Repeated NA values are treated as duplicates of each other.
  expect_identical(mark_all_duplicates(c(NA, 1, NA)), c(TRUE, FALSE, TRUE))

  # Empty input gives an empty logical vector.
  expect_identical(mark_all_duplicates(character(0)), logical(0))
})
```


```{r development-load}
# Load already included functions if relevant
pkgload::load_all(export_all = FALSE)
Expand Down
24 changes: 24 additions & 0 deletions man/mark_all_duplicates.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions man/timestamp_to_date.numeric.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions tests/testthat/test-mark_all_duplicates.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# WARNING - Generated by {fusen} from dev/sample_selections.Rmd: do not edit by hand

test_that("mark_all_duplicates works", {
  expect_true(inherits(mark_all_duplicates, "function"))

  # Every member of a repeated group is flagged, including the first one.
  expect_identical(
    mark_all_duplicates(c("apple", "banana", "apple", "cherry", "banana")),
    c(TRUE, TRUE, TRUE, FALSE, TRUE)
  )

  # A vector without repeats yields all FALSE.
  expect_identical(mark_all_duplicates(c(1, 2, 3)), c(FALSE, FALSE, FALSE))

  # Repeated NA values are treated as duplicates of each other.
  expect_identical(mark_all_duplicates(c(NA, 1, NA)), c(TRUE, FALSE, TRUE))

  # Empty input gives an empty logical vector.
  expect_identical(mark_all_duplicates(character(0)), logical(0))
})
17 changes: 17 additions & 0 deletions vignettes/sample-selections-vignette.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -461,3 +461,20 @@ samples_distributed_summary(df =list_dfs$clinical_data ,
order_col = "Total")
```

# mark_all_duplicates





```{r example-mark_all_duplicates}
# "apple" and "banana" each occur twice, so every one of their positions is
# flagged; "cherry" occurs once and is not.
vec <- c("apple", "banana", "apple", "cherry", "banana")
mark_all_duplicates(vec)
# Expected: TRUE TRUE TRUE FALSE TRUE
```






0 comments on commit 1182ac5

Please sign in to comment.