documentation updated

aminuldu07 · Dec 31, 2024 · cb37e06 · cb37e06
1 parent 2b389a6
commit cb37e06
Show file tree

Hide file tree

Showing 9 changed files with 934 additions and 42 deletions.
diff --git a/R/get_7col_harmonized_scores_df.R → R/get_col_harmonized_scores_df.R b/R/get_7col_harmonized_scores_df.R → R/get_col_harmonized_scores_df.R
@@ -1,20 +1,36 @@
-
-#' @title get_random_forest_model
-#' @param data_frame Mandatory, character \cr
-#'   Studyid number
-#' @param Liver_get_liver_om_lb_mi_tox_score_list Mandatory, character \cr
-#'   Studyid number
-#' @param not_Liver_get_liver_om_lb_mi_tox_score_list Mandatory, character \cr
-#'   path of database
-#' @return score
+#' @title get_col_harmonized_scores_df
+#' @description
+#' This function harmonizes liver score data by cleaning column names,
+#' replacing missing values with zeros, and optionally rounding specific columns.
+#' The function also identifies and harmonizes synonyms, removes unnecessary columns,
+#' and reorders the data based on column sums.
+#'
+#' @param liver_score_data_frame A data frame containing liver score data.
+#'   This data frame should have column names that may require harmonization.
+#' @param Round A logical value indicating whether the data should be rounded.
+#'   If TRUE, certain liver-related columns are floored and capped, and histology-related columns are ceiled. Default is FALSE.
+#'
+#' @details
+#' The function performs the following operations:
+#' - Harmonizes column names by replacing spaces, commas, and slashes with dots.
+#' - Replaces missing values (NA) with zero.
+#' - Identifies and harmonizes synonym columns, replacing their values with the higher value between the synonyms.
+#' - Removes specific unwanted columns such as 'INFILTRATE', 'UNREMARKABLE', 'THIKENING', and 'POSITIVE'.
+#' - Optionally rounds liver score columns by flooring and capping them at 5, and histology-related columns by ceiling.
+#' - Reorders columns based on the sum of their values.
+#'
+#' @return A data frame with harmonized liver scores, optional rounding, and columns reordered based on their sums.
 #'
 #' @examples
 #' \dontrun{
-#' get_liver_lb_score(studyid='1234123', database_path = dbtoken)
+#' # Example usage
+#' result <- get_col_harmonized_scores_df(liver_score_data_frame = liver_scores, Round = TRUE)
 #' }
-#' @export
 #'
-
+#' @export
+get_col_harmonized_scores_df <- function(liver_score_data_frame, Round = FALSE) {
+  # NOTE(review): empty placeholder stub (no statements in the body). A second definition of get_col_harmonized_scores_df begins a few lines below in this file and presumably replaces this one when the file is sourced. TODO: confirm, then remove this stub so the roxygen block above documents the real function.
+}
 
 
 get_col_harmonized_scores_df <- function(liver_score_data_frame,

diff --git a/R/get_8ml_data_and_tuned_hyperparameters.R → R/get_ml_data_and_tuned_hyperparameters.R b/R/get_8ml_data_and_tuned_hyperparameters.R → R/get_ml_data_and_tuned_hyperparameters.R
@@ -1,40 +1,47 @@
-#' Get Random Forest Data and Best Model
+#' @title Get Random Forest Data and Tuned Hyperparameters
 #'
-#' This function retrieves and processes data for random forest analysis from a SQLite database.
-#' It performs the following steps:
-#' 1. Connects to the SQLite database and retrieves unique `STUDYID` values from the `dm` table.
-#' 2. Generates liver toxicity scores for the given study IDs.
-#' 3. Harmonizes the columns in the scores data frame.
-#' 4. Reads metadata for study IDs.
-#' 5. Prepares the data and tunes hyperparameters for a random forest model.
+#' @description
+#' The `get_ml_data_and_tuned_hyperparameters` function processes input data and metadata to prepare data for
+#' random forest analysis. It includes steps for data preprocessing, optional imputation, rounding,
+#' error correction, and hyperparameter tuning.
 #'
-#' @param path_db Character. Path to the SQLite database.
-#' @param studyid_metadata_path Character. Path to the CSV file containing metadata for study IDs.
-#' @param fake_study Logical. Whether to use fake study IDs. Default is `TRUE`.
-#' @param use_xpt_file Logical. Whether to use XPT file format. Default is `FALSE`.
-#' @param output_individual_scores Logical. Whether to output individual scores. Default is `TRUE`.
-#' @param output_zscore_by_USUBJID Logical. Whether to output z-scores by `USUBJID`. Default is `FALSE`.
-#' @param Impute Logical. Whether to impute missing values in the data. Default is `TRUE`.
-#' @param Round Logical. Whether to round numerical values in the data. Default is `TRUE`.
-#' @param reps Integer. Number of repetitions for model evaluation. Default is `1`.
-#' @param holdback Numeric. Proportion of data to hold back for validation. Default is `0.75`.
-#' @param Undersample Logical. Whether to perform undersampling to balance the data. Default is `TRUE`.
-#' @param hyperparameter_tuning Logical. Whether to perform hyperparameter tuning. Default is `FALSE`.
-#' @param error_correction_method Character. Method for error correction. Default is `'None'`.
+#' @param Data data.frame. Input data frame containing scores, typically named `scores_df`.
+#' @param studyid_metadata data.frame. Metadata containing `STUDYID` values, used for joining with `Data`.
+#' @param Impute logical. Indicates whether to impute missing values in the dataset using random forest imputation. Default is `FALSE`.
+#' @param Round logical. Specifies whether to round specific numerical columns according to predefined rules. Default is `FALSE`.
+#' @param reps integer. Number of repetitions for cross-validation. A value of `0` skips repetition.
+#' @param holdback numeric. Fraction of data to hold back for testing. A value of `1` performs leave-one-out cross-validation.
+#' @param Undersample logical. Indicates whether to undersample the training data to balance the target classes. Default is `FALSE`.
+#' @param hyperparameter_tuning logical. Specifies whether to perform hyperparameter tuning for the random forest model. Default is `FALSE`.
+#' @param error_correction_method character. Specifies the method for error correction. Can be `"Flip"`, `"Prune"`, or `NULL`. Default is `NULL`.
+#'
+#' @return
+#' A list containing:
+#' \describe{
+#'   \item{rfData}{The final processed data after preprocessing and error correction.}
+#'   \item{best.m}{The best `mtry` hyperparameter determined for the random forest model.}
+#' }
 #'
-#' @return A list containing the processed data and the best model parameters.
 #' @export
 #'
 #' @examples
-#' path_db <- "C:/path/to/database.db"
-#' studyid_metadata_path <- "C:/path/to/study_metadata.csv"
-#' rfData_and_best_m <- get_rfData_and_best_m(
-#'   path_db = path_db,
-#'   studyid_metadata_path = studyid_metadata_path,
-#'   fake_study = TRUE,
+#' \dontrun{
+#' # Example usage (not run: `scores_df` and the CSV path are placeholders):
+#' Data <- scores_df
+#' studyid_metadata <- read.csv("path/to/study_metadata.csv")
+#' result <- get_ml_data_and_tuned_hyperparameters(
+#'   Data = Data,
+#'   studyid_metadata = studyid_metadata,
+#'   Impute = TRUE,
 #'   Round = TRUE,
-#'   Undersample = TRUE
+#'   reps = 10,
+#'   holdback = 0.75,
+#'   Undersample = TRUE,
+#'   hyperparameter_tuning = TRUE,
+#'   error_correction_method = "Flip"
 #' )
+#' rfData <- result$rfData
+#' best_mtry <- result$best.m
+#' }
+
 
 get_ml_data_and_tuned_hyperparameters <- function(Data, # Data == "scores_df"
                                                   studyid_metadata,

diff --git a/R/get_9rf_model_with_cv.R → R/get_rf_model_with_cv.R b/R/get_9rf_model_with_cv.R → R/get_rf_model_with_cv.R
@@ -1,3 +1,55 @@
+#' @title Random Forest with Cross-Validation
+#'
+#' @description
+#' This function builds a random forest model using the `randomForest` package, evaluates it through cross-validation,
+#' and computes performance metrics such as sensitivity, specificity, and accuracy.
+#' It optionally applies undersampling to handle class imbalance and supports custom settings for the number of predictors sampled at each split.
+#'
+#' @param Data Mandatory, data frame
+#'   The input dataset, which must include a column named `Target_Organ` as the response variable.
+#' @param Undersample Optional, logical
+#'   If `TRUE`, balances the dataset by undersampling the majority class. Default is `FALSE`.
+#' @param best.m Optional, numeric or `NULL`
+#'   Specifies the number of predictors sampled at each split. If `NULL`, the default value of `randomForest` is used.
+#' @param testReps Mandatory, integer
+#'   The number of cross-validation repetitions. Must be at least 2.
+#' @param Type Mandatory, numeric
+#'   Specifies the importance metric type: `1` for Mean Decrease Accuracy or `2` for Mean Decrease Gini.
+#'
+#' @return
+#' A list with the following elements:
+#' \itemize{
+#'   \item \code{performance_metrics}: A vector of aggregated performance metrics, including sensitivity, specificity, and accuracy.
+#'   \item \code{raw_results}: A list containing raw sensitivity, specificity, and accuracy values for each cross-validation fold.
+#' }
+#'
+#' @details
+#' This function splits the input data into training and testing subsets based on the specified `testReps` cross-validation folds.
+#' If undersampling is enabled, the function balances the training set to reduce class imbalance.
+#' A random forest model is trained on the training set, and predictions are evaluated on the test set. The results are aggregated to provide summary performance metrics.
+#'
+#' @examples
+#' # Load necessary libraries
+#' library(randomForest)
+#' library(caret)
+#'
+#' # Example dataset
+#' data(iris)
+#' iris$Target_Organ <- ifelse(iris$Species == "setosa", 1, 0)
+#' iris <- iris[, -5]  # Remove Species column
+#'
+#' # Run the function
+#' results <- get_rf_model_with_cv(Data = iris,
+#'                                 Undersample = TRUE,
+#'                                 best.m = 2,
+#'                                 testReps = 5,
+#'                                 Type = 2)
+#'
+#' # Print results
+#' print(results$performance_metrics)
+#'
+#' @export
+
 
 get_rf_model_with_cv <- function(Data,
                                  Undersample = FALSE,
@@ -195,6 +247,17 @@ print(".........................................................................
 
 }
 
+
+
+
+
+
+
+
+
+
+
+
 # #Add a special case for testReps = 1 that directly splits data into train and test sets without looping or iterative sampling.
 # if (testReps == 1) {
 #   # Use a single random split (e.g., 70% train, 30% test)

diff --git a/R/get_10zone_exclusioned_rf_model_with_cv.R → R/get_zone_exclusioned_rf_model_with_cv.R b/R/get_10zone_exclusioned_rf_model_with_cv.R → R/get_zone_exclusioned_rf_model_with_cv.R
@@ -1,3 +1,67 @@
+#' @title Random Forest Model with Cross-validation and Exclusion
+#' @description This function implements a Random Forest classification model
+#'   with cross-validation and allows for undersampling, handling indeterminate
+#'   predictions, and calculating various model performance metrics such as
+#'   sensitivity, specificity, and accuracy. It tracks the proportion of indeterminate
+#'   predictions and provides an aggregated performance summary across multiple test repetitions.
+#'
+#' @param Data A data frame containing the features and the target variable `Target_Organ`
+#'   to train the Random Forest model on.
+#'
+#' @param Undersample A logical value indicating whether to perform undersampling to
+#'   balance the classes in the training data. Defaults to `FALSE`.
+#'
+#' @param best.m A numeric value representing the best number of variables (`mtry`)
+#'   to use at each split in the Random Forest model. This can be manually set or
+#'   determined through optimization.
+#'
+#' @param testReps An integer specifying the number of test repetitions. This must
+#'   be at least 2, as the function relies on multiple test sets to assess the model performance.
+#'
+#' @param indeterminateUpper A numeric value giving the upper bound of the
+#'   indeterminate zone. Predictions whose predicted probability falls between
+#'   `indeterminateLower` and `indeterminateUpper` are marked as indeterminate.
+#'
+#' @param indeterminateLower A numeric value giving the lower bound of the
+#'   indeterminate zone. Predictions whose predicted probability falls between
+#'   `indeterminateLower` and `indeterminateUpper` are marked as indeterminate.
+#'
+#' @param Type An integer indicating the type of feature importance to use in the
+#'   Random Forest model. Typically, `1` for "Mean Decrease Accuracy" or `2` for "Mean Decrease Gini".
+#'
+#' @return A list containing two components:
+#' \describe{
+#'   \item{performance_metrics}{A vector with the aggregated performance metrics,
+#'     including sensitivity, specificity, accuracy, and others, calculated across
+#'     all test repetitions.}
+#'   \item{raw_results}{A list containing the raw performance metrics for each repetition,
+#'     including sensitivity, specificity, and accuracy.}
+#' }
+#'
+#' @examples
+#' \dontrun{
+#' # Example usage
+#' Data <- your_data_frame  # Replace with actual dataset
+#' results <- get_zone_exclusioned_rf_model_with_cv(Data = Data,
+#'                                                 Undersample = TRUE,
+#'                                                 best.m = 5,
+#'                                                 testReps = 10,
+#'                                                 indeterminateUpper = 0.8,
+#'                                                 indeterminateLower = 0.2,
+#'                                                 Type = 1)
+#'
+#' # View the aggregated performance metrics
+#' print(results$performance_metrics)
+#'
+#' # Access raw results for further analysis
+#' print(results$raw_results)
+#' }
+#'
+#' @seealso \link[randomForest]{randomForest}, \link[caret]{confusionMatrix}
+#'
+#' @import randomForest
+#' @import caret
+
 
 get_zone_exclusioned_rf_model_with_cv <- function(Data=NULL, #scores_df
                                       Undersample = FALSE,
@@ -6,7 +70,7 @@ get_zone_exclusioned_rf_model_with_cv <- function(Data=NULL, #scores_df
                                       indeterminateUpper,
                                       indeterminateLower,
                                       Type) {
-browser()
+
     rfData <- Data #rfData <- scores_df
     #---------------------------------------------------------------------
     # Initialize model performance metric trackers------------------------