Merge pull request #43 from finddx/sample-classification

better find duplicates function
finddx · Jun 26, 2024 · 1182ac5 · 1182ac5
2 parents 70f0b8c + 62839c6
commit 1182ac5
Show file tree

Hide file tree

Showing 11 changed files with 139 additions and 8 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -36,6 +36,7 @@ export(get_site_id)
 export(get_specimen_type_id)
 export(groupby_vec)
 export(make_clean_os_names)
+export(mark_all_duplicates)
 export(num_thousandth_sep)
 export(nums_clean)
 export(parse_all_sites)

diff --git a/R/gen_clinical_data.R b/R/gen_clinical_data.R
@@ -120,7 +120,7 @@ gen_clinical_data <- function(clinical_data,
       stop("ppid_col is not present in the additional data set")
     }
     clinical_data <- merge(additional_data,
-                           clinical_data, by = ppid_col ) 
+                           clinical_data, by = ppid_col, all.x = TRUE) 
 
 
   }else{

diff --git a/R/mark_all_duplicates.R b/R/mark_all_duplicates.R
@@ -0,0 +1,27 @@
+# WARNING - Generated by {fusen} from dev/sample_selections.Rmd: do not edit by hand
+
+#' Mark All Duplicates in a Vector
+#'
+#' This function identifies all duplicate elements in a vector, marking both the first
+#' occurrence and subsequent duplicates.
+#'
+#' @param vec A numeric or character vector where duplicates are to be identified.
+#'
+#' @return A logical vector of the same length as `vec`, where TRUE marks every
+#' element whose value occurs more than once in `vec` (first occurrence included).
+#'
+#' @export
+#' @examples
+#'
+#' vec <- c("apple", "banana", "apple", "cherry", "banana")
+#' mark_all_duplicates(vec)
+mark_all_duplicates <- function(vec) {
+  # Check duplicates from the start and from the end
+  dup_from_start <- duplicated(vec)
+  dup_from_end <- duplicated(vec, fromLast = TRUE)
+
+  # Combine the two to mark all duplicates (including first occurrences)
+  all_duplicates <- dup_from_start | dup_from_end
+  return(all_duplicates)
+}
+
diff --git a/R/timestamp_to_date.R b/R/timestamp_to_date.R
@@ -31,24 +31,32 @@ timestamp_to_date <- function(timestamp, date_cols = NULL){
 
 }
 
+
 #' Convert Numeric Timestamp to Date
 #'
 #' This method converts a numeric timestamp to a datetime object.
 #'
 #' @param timestamp Numeric timestamp.
-#' @param date_cols Names of the timestamp columns to convert if the object timestamp is a data.frame. This is not needed for this method
+#' @param date_cols Names of the timestamp columns to convert if the object timestamp is a data.frame.
 #'
-#' @return doesn't return anything because the data frame is converted to data.table and function applied in place.
+#' @return A datetime object, or `timestamp` unchanged (with a warning) if it is not numeric.
 #'
 #' @export
+
+
 timestamp_to_date.numeric <- function(timestamp, date_cols = NULL){
+  if (!is.numeric(timestamp)) {
+    warning("Input timestamp is not numeric. Skipping conversion.")
+    return(timestamp)
+  }
 
   timestamp <- as.numeric(timestamp)
   timestamp <- timestamp/1000
   timestamp <- lubridate::as_datetime(timestamp)
   return(timestamp)
 }
 
+
 #' Convert Timestamp Columns in a Data Frame to Date
 #'
 #' This method converts timestamp columns in a data frame to date columns.

diff --git a/dev/OpenSpecimenAPI.Rmd b/dev/OpenSpecimenAPI.Rmd
@@ -118,24 +118,32 @@ timestamp_to_date <- function(timestamp, date_cols = NULL){
   
 }
 
+
 #' Convert Numeric Timestamp to Date
 #'
 #' This method converts a numeric timestamp to a datetime object.
 #'
 #' @param timestamp Numeric timestamp.
-#' @param date_cols Names of the timestamp columns to convert if the object timestamp is a data.frame. This is not needed for this method
+#' @param date_cols Names of the timestamp columns to convert if the object timestamp is a data.frame.
 #'
-#' @return doesn't return anything because the data frame is converted to data.table and function applied in place.
+#' @return A datetime object, or `timestamp` unchanged (with a warning) if it is not numeric.
 #'
 #' @export
+
+
 timestamp_to_date.numeric <- function(timestamp, date_cols = NULL){
+  if (!is.numeric(timestamp)) {
+    warning("Input timestamp is not numeric. Skipping conversion.")
+    return(timestamp)
+  }
   
   timestamp <- as.numeric(timestamp)
   timestamp <- timestamp/1000
   timestamp <- lubridate::as_datetime(timestamp)
   return(timestamp)
 }
 
+
 #' Convert Timestamp Columns in a Data Frame to Date
 #'
 #' This method converts timestamp columns in a data frame to date columns.

diff --git a/dev/config_fusen.yaml b/dev/config_fusen.yaml
@@ -102,6 +102,7 @@ sample_selections.Rmd:
   - R/create_save_workbook.R
   - R/gen_clinical_data.R
   - R/make_clean_os_names.R
+  - R/mark_all_duplicates.R
   - R/process_save_selected_aliquots.R
   - R/read_os_data.R
   - R/samples_distributed_summary.R
@@ -124,6 +125,7 @@ sample_selections.Rmd:
   - tests/testthat/test-create_save_workbook.R
   - tests/testthat/test-gen_clinical_data.R
   - tests/testthat/test-samples_distributed_summary.R
+  - tests/testthat/test-mark_all_duplicates.R
   vignettes: vignettes/sample-selections-vignette.Rmd
   inflate:
     flat_file: dev/sample_selections.Rmd

diff --git a/dev/sample_selections.Rmd b/dev/sample_selections.Rmd
@@ -1256,7 +1256,7 @@ gen_clinical_data <- function(clinical_data,
       stop("ppid_col is not present in the additional data set")
     }
     clinical_data <- merge(additional_data,
-                           clinical_data, by = ppid_col ) 
+                           clinical_data, by = ppid_col, all.x = TRUE) 
     
     
   }else{
@@ -1498,6 +1498,45 @@ test_that("samples_distributed_summary works", {
 ```
 
 
+# mark_all_duplicates
+
+```{r function-mark_all_duplicates}
+#' Mark All Duplicates in a Vector
+#'
+#' This function identifies all duplicate elements in a vector, marking both the first
+#' occurrence and subsequent duplicates.
+#'
+#' @param vec A numeric or character vector where duplicates are to be identified.
+#'
+#' @return A logical vector of the same length as `vec`, where TRUE marks every
+#' element whose value occurs more than once in `vec` (first occurrence included).
+#'
+#' @export
+mark_all_duplicates <- function(vec) {
+  # Check duplicates from the start and from the end
+  dup_from_start <- duplicated(vec)
+  dup_from_end <- duplicated(vec, fromLast = TRUE)
+  
+  # Combine the two to mark all duplicates (including first occurrences)
+  all_duplicates <- dup_from_start | dup_from_end
+  return(all_duplicates)
+}
+
+```
+
+```{r example-mark_all_duplicates}
+
+vec <- c("apple", "banana", "apple", "cherry", "banana")
+mark_all_duplicates(vec)
+```
+
+```{r tests-mark_all_duplicates}
+test_that("mark_all_duplicates works", {
+  expect_true(inherits(mark_all_duplicates, "function")) 
+})
+```
+
+
 ```{r development-load}
 # Load already included functions if relevant
 pkgload::load_all(export_all = FALSE)

diff --git a/man/mark_all_duplicates.Rd b/man/mark_all_duplicates.Rd
diff --git a/man/timestamp_to_date.numeric.Rd b/man/timestamp_to_date.numeric.Rd
diff --git a/tests/testthat/test-mark_all_duplicates.R b/tests/testthat/test-mark_all_duplicates.R
@@ -0,0 +1,5 @@
+# WARNING - Generated by {fusen} from dev/sample_selections.Rmd: do not edit by hand
+
+test_that("mark_all_duplicates works", {
+  expect_true(inherits(mark_all_duplicates, "function")) 
+})
diff --git a/vignettes/sample-selections-vignette.Rmd b/vignettes/sample-selections-vignette.Rmd
@@ -461,3 +461,20 @@ samples_distributed_summary(df =list_dfs$clinical_data ,
                             order_col = "Total")
 ```
 
+# mark_all_duplicates
+
+
+
+
+
+```{r example-mark_all_duplicates}
+
+vec <- c("apple", "banana", "apple", "cherry", "banana")
+mark_all_duplicates(vec)
+```
+
+
+
+
+
+