diff --git a/NAMESPACE b/NAMESPACE index 778de89..b34b532 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,16 +1,31 @@ # Generated by roxygen2: do not edit by hand -export(get_all_lb_TESTCD_zscore) +export(get_Data_formatted_for_ml_and_best.m) +export(get_auc_curve_with_rf_model) export(get_bw_score) +export(get_col_harmonized_scores_df) export(get_compile_data) -export(get_fixed_parameter_rf_model) -export(get_harmonized_column) +export(get_histogram_barplot) +export(get_imp_features_from_rf_model_with_cv) export(get_lb_score) export(get_liver_om_lb_mi_tox_score_list) export(get_livertobw_score) export(get_mi_score) -export(get_random_forest_model) -export(predicted_random_forest_model) +export(get_ml_data_and_tuned_hyperparameters) +export(get_prediction_plot) +export(get_repeat_dose_parallel_studyids) +export(get_reprtree_from_rf_model) +export(get_rf_input_param_list_output_cv_imp) +export(get_rf_model_output_cv_imp) +export(get_rf_model_with_cv) +import(DBI) +import(ROCR) +import(RSQLite) +import(caret) +import(ggplot2) +import(randomForest) +import(reprtree) +import(stats) importFrom(RSQLite,SQLite) importFrom(RSQLite,dbConnect) -importFrom(magrittr,"%>%") +importFrom(stats,lm) diff --git a/R/get_c_1Data_formatted_for_ml_and_best.m.R b/R/get_Data_formatted_for_ml_and_best.m.R similarity index 70% rename from R/get_c_1Data_formatted_for_ml_and_best.m.R rename to R/get_Data_formatted_for_ml_and_best.m.R index 750b707..7c12b25 100644 --- a/R/get_c_1Data_formatted_for_ml_and_best.m.R +++ b/R/get_Data_formatted_for_ml_and_best.m.R @@ -1,4 +1,53 @@ - +#' @title Retrieve and Preprocess Data for Machine Learning Models +#' +#'@description +#' This function processes data from a given SQLite database or XPT file, calculates liver toxicity scores, and prepares data for machine learning models. +#' It can also tune hyperparameters and apply error correction methods. +#' +#' @param path_db A character string representing the path to the SQLite database or XPT file. +#' @param rat_studies A logical flag to filter for rat studies (default is FALSE). +#' @param studyid_metadata A data frame containing metadata for the study IDs. If NULL, metadata is generated (default is NULL). +#' @param fake_study A logical flag to use fake study data (default is FALSE). +#' @param use_xpt_file A logical flag to indicate whether to use an XPT file instead of a SQLite database (default is FALSE). +#' @param Round A logical flag to round liver toxicity scores (default is FALSE). +#' @param Impute A logical flag to impute missing values in the dataset (default is FALSE). +#' @param reps An integer specifying the number of repetitions for cross-validation. +#' @param holdback A numeric value indicating the fraction of data to hold back for validation. +#' @param Undersample A logical flag to undersample the majority class (default is FALSE). +#' @param hyperparameter_tuning A logical flag to perform hyperparameter tuning (default is FALSE). +#' @param error_correction_method A character string specifying the error correction method. Must be one of 'Flip', 'Prune', or 'None'. +#' +#' @return A list containing: +#' \item{Data}{A data frame containing the preprocessed data ready for machine learning.} +#' \item{best.m}{The best machine learning model after hyperparameter tuning, if applicable.} +#' +#' @details +#' This function performs several key steps: +#' - Retrieves study IDs from an SQLite database or XPT file. 
+#' - Generates or uses provided study metadata, including a random assignment of "Target_Organ" values (either "Liver" or "not_Liver"). +#' - Calculates liver toxicity scores using the `get_liver_om_lb_mi_tox_score_list` function. +#' - Harmonizes the calculated scores using the `get_col_harmonized_scores_df` function. +#' - Prepares the data for machine learning and tunes hyperparameters (if enabled) using the `get_ml_data_and_tuned_hyperparameters` function. +#' - Returns the processed data and the best model. +#' +#' @examples +#' \dontrun{ +#' result <- get_Data_formatted_for_ml_and_best.m( +#' path_db = "path/to/database.db", +#' rat_studies = TRUE, +#' reps = 5, +#' holdback = 0.2, +#' error_correction_method = "Flip" +#' ) +#' +#' # Access the processed data and the best model +#' processed_data <- result$Data +#' best_model <- result$best.m +#' } +#' +#' @import DBI +#' @import RSQLite +#' @export diff --git a/R/get_12auc_curve_with_rf_model.R b/R/get_auc_curve_with_rf_model.R similarity index 67% rename from R/get_12auc_curve_with_rf_model.R rename to R/get_auc_curve_with_rf_model.R index 4e14ff3..8b3ec2f 100644 --- a/R/get_12auc_curve_with_rf_model.R +++ b/R/get_auc_curve_with_rf_model.R @@ -1,42 +1,61 @@ -#' Generate and Plot AUC Curve for Random Forest Model +#' @title Compute and Plot AUC Curve with Random Forest Model #' -#' This function trains a Random Forest model on provided or dynamically generated data, computes the -#' Area Under the Curve (AUC) for the model's performance, and plots the Receiver Operating Characteristic (ROC) curve. +#'@description +#' This function trains a Random Forest model, computes the ROC curve, and +#' calculates the AUC (Area Under the Curve). It allows various preprocessing +#' options, such as imputation, rounding, undersampling, and hyperparameter tuning. #' -#' @param rfData Data frame. The input data for training the Random Forest model. If `NULL`, the data is generated using -#' \code{get_rfData_and_best_m}. -#' @param best.m Integer. The `mtry` hyperparameter for Random Forest. If `NULL`, the value is determined dynamically -#' using \code{get_rfData_and_best_m}. -#' @param path_db Character. Path to the SQLite database. Required if `rfData` or `best.m` is `NULL`. -#' @param studyid_metadata_path Character. Path to the CSV file containing study ID metadata. Required if `rfData` or -#' `best.m` is `NULL`. -#' @param fake_study Logical. Whether to use fake study IDs. Default is \code{TRUE}. -#' @param Round Logical. Whether to round numerical values in the data. Default is \code{TRUE}. -#' @param Undersample Logical. Whether to perform undersampling to balance the data. Default is \code{TRUE}. +#' @param Data A data frame containing the training data. If `NULL`, data will be fetched from the database. +#' @param path_db A string representing the path to the SQLite database used to fetch data when `Data` is `NULL`. +#' @param rat_studies Logical; whether to filter for rat studies. Defaults to `FALSE`. +#' @param studyid_metadata A data frame containing metadata associated with study IDs. +#' @param fake_study Logical; whether to use fake study IDs for data simulation. Defaults to `FALSE`. +#' @param use_xpt_file Logical; whether to use an XPT file for input data. Defaults to `FALSE`. +#' @param Round Logical; whether to round numerical values. Defaults to `FALSE`. +#' @param Impute Logical; whether to perform imputation on missing values. Defaults to `FALSE`. +#' @param best.m The 'mtry' hyperparameter for Random Forest. 
If `NULL`, it is determined by the function. +#' @param reps A numeric value indicating the number of repetitions for cross-validation. Defaults to a numeric value. +#' @param holdback Numeric; either 1 or a fraction value (e.g., 0.75) for holdback during cross-validation. +#' @param Undersample Logical; whether to perform undersampling. Defaults to `FALSE`. +#' @param hyperparameter_tuning Logical; whether to perform hyperparameter tuning. Defaults to `FALSE`. +#' @param error_correction_method Character; one of "Flip", "Prune", or "None", specifying the method of error correction. +#' @param output_individual_scores Logical; whether to output individual scores. Defaults to `TRUE`. +#' @param output_zscore_by_USUBJID Logical; whether to output z-scores by subject ID. Defaults to `FALSE`. #' -#' @return This function does not return a value. It prints the AUC value and plots the ROC curve. -#' @details -#' If `rfData` and `best.m` are not provided, the function dynamically generates the required data by connecting to -#' the specified SQLite database and processing metadata. +#' @return This function does not return any explicit value. It generates: +#' \itemize{ +#' \item The AUC (Area Under the Curve) printed to the console. +#' \item A ROC curve plot with the calculated AUC value. +#' \item Various performance metrics (e.g., True Positive Rate, False Positive Rate), displayed in the plot. +#' } #' -#' The function uses the `randomForest` package to train the model and the `ROCR` package to calculate and plot -#' the AUC and ROC curve. +#' @details +#' The function prepares data for training a Random Forest model by first fetching data from an SQLite database +#' or generating synthetic data (if `fake_study` is `TRUE`). It processes the data using various options such +#' as imputation, rounding, and undersampling. The model is trained using the Random Forest algorithm, and +#' performance is evaluated via the ROC curve and AUC metric. #' -#' @export +#' The function also allows for hyperparameter tuning and error correction. After training the model, +#' predictions are made, and the AUC is calculated and visualized with a ROC curve plot. 
#' #' @examples -#' # Using pre-calculated rfData and best.m -#' get_auc_curve(rfData = my_rfData, best.m = 5) +#' # Example 1: Using real data from the database +#' get_auc_curve_with_rf_model(Data = NULL, path_db = "path/to/database.db", rat_studies = TRUE, reps = 10, +#' holdback = 0.75, error_correction_method = "Prune") #' -#' # Dynamically generating rfData and best.m -#' get_auc_curve( -#' path_db = "path/to/database.db", -#' studyid_metadata_path = "path/to/study_metadata.csv", -#' fake_study = TRUE, -#' Round = TRUE, -#' Undersample = TRUE -#' ) - +#' # Example 2: Using synthetic data with fake study IDs +#' get_auc_curve_with_rf_model(Data = NULL, fake_study = TRUE, reps = 5, holdback = 0.8, +#' error_correction_method = "Flip") +#' +#' @seealso +#' `randomForest`, `ROCR` +#' +#' @import DBI +#' @import RSQLite +#' @import ROCR +#' @import randomForest +#' +#' @export diff --git a/R/get_13histogram_barplot.R b/R/get_histogram_barplot.R similarity index 61% rename from R/get_13histogram_barplot.R rename to R/get_histogram_barplot.R index b22a982..ad705a0 100644 --- a/R/get_13histogram_barplot.R +++ b/R/get_histogram_barplot.R @@ -1,3 +1,54 @@ +#' @title Generate Histogram or Bar Plot for Liver-Related Scores +#' +#'@description +#' This function generates a bar plot comparing liver-related findings to non-liver-related findings, +#' or returns processed data for further analysis. The function can fetch data from an SQLite database, +#' a provided XPT file, or simulate data if `fake_study` is set to TRUE. +#' +#' @param Data A data frame containing liver-related scores. If NULL, the function will attempt to +#' generate or fetch the data from a database or file. +#' @param generateBarPlot A logical flag (default = FALSE). If TRUE, generates a bar plot. If FALSE, +#' returns the processed data. +#' @param path_db A character string representing the path to the SQLite database. Required if +#' `use_xpt_file` is FALSE or `fake_study` is FALSE. +#' @param rat_studies A logical flag (default = FALSE) to filter for rat studies when fetching data +#' from the database. +#' @param studyid_metadata A data frame containing metadata associated with study IDs. Required when +#' `fake_study` is FALSE and real data is fetched. +#' @param fake_study A logical flag (default = FALSE). If TRUE, the function simulates study data +#' instead of fetching it from a database. +#' @param use_xpt_file A logical flag (default = FALSE). If TRUE, the function will use an XPT file +#' to fetch data, instead of relying on the database. +#' @param Round A logical flag (default = FALSE). Whether to round the liver scores. +#' @param output_individual_scores A logical flag (default = TRUE). Whether to output individual +#' scores or aggregated scores. +#' @param output_zscore_by_USUBJID A logical flag (default = FALSE). Whether to output z-scores +#' by USUBJID (unique subject identifier). +#' +#' @return If `generateBarPlot = TRUE`, a `ggplot2` bar plot object is returned displaying the +#' average scores for liver-related findings versus non-liver-related findings. If +#' `generateBarPlot = FALSE`, a data frame (`plotData`) containing the calculated values +#' for each finding, liver status (`LIVER`), and mean values (`Value`) is returned. +#' +#' @details +#' If no data is provided, the function attempts to fetch data from an SQLite database or simulate +#' data based on the `fake_study` flag. 
The function also supports the use of XPT files and allows +#' customization of study filtering through the `rat_studies` and `studyid_metadata` parameters. +#' When generating a plot, the function compares liver-related findings to other findings, +#' displaying the average scores for each finding in a bar plot. +#' +#' @examples +#' # Example 1: Generate a bar plot with fake study data +#' get_histogram_barplot(generateBarPlot = TRUE, fake_study = TRUE) +#' +#' # Example 2: Get processed data without generating a plot +#' data <- get_histogram_barplot(generateBarPlot = FALSE, fake_study = FALSE, path_db = "path/to/db") +#' +#' @import ggplot2 +#' @import DBI +#' @import RSQLite +#' @export + get_histogram_barplot <- function(Data =NULL, generateBarPlot= FALSE, diff --git a/R/get_11imp_features_from_rf_model_with_cv.R b/R/get_imp_features_from_rf_model_with_cv.R similarity index 68% rename from R/get_11imp_features_from_rf_model_with_cv.R rename to R/get_imp_features_from_rf_model_with_cv.R index 329be39..7eb6074 100644 --- a/R/get_11imp_features_from_rf_model_with_cv.R +++ b/R/get_imp_features_from_rf_model_with_cv.R @@ -1,3 +1,48 @@ +#' @title Get Important Features from Random Forest Model with Cross-Validation +#' +#'@description +#' This function performs cross-validation with test repetitions on a random forest model, calculates feature importance using Gini importance, and returns the top `n` important features. +#' +#' @param Data A data frame containing the training data (rows as samples, columns as features). The first column is assumed to be the target variable. +#' @param Undersample A logical value indicating whether to apply under-sampling to balance the classes in the training data. Default is `FALSE`. +#' @param best.m A numeric value representing the number of variables to consider at each split of the Random Forest model (or a function to determine this). Default is `NULL`. +#' @param testReps A numeric value indicating the number of test repetitions (must be at least 2). +#' @param Type A numeric value indicating the type of importance to be calculated. `1` for Mean Decrease Accuracy and `2` for Mean Decrease Gini. +#' @param nTopImportance A numeric value indicating the number of top important features to return based on their importance scores. +#' +#' @return A list containing: +#' \describe{ +#' \item{gini_scores}{A matrix of Gini importance scores for each feature across the different cross-validation iterations. The matrix has rows representing features and columns representing test iterations.} +#' } +#' +#' @details +#' This function trains a Random Forest model using cross-validation with specified repetitions and calculates the feature importance using Gini importance scores. The function also supports optional under-sampling to balance the class distribution in the training set. +#' +#' The function performs the following steps: +#' \itemize{ +#' \item Initializes performance metric trackers. +#' \item Prepares the input data for cross-validation. +#' \item Performs cross-validation, where each repetition involves training the model on a subset of data and testing on the remaining data. +#' \item Optionally applies under-sampling to the training data. +#' \item Trains a Random Forest model on each fold and calculates Gini importance scores. +#' \item Aggregates and sorts the Gini importance scores to identify the top features. +#' \item Plots the importance of top features. 
+#' } +#' +#' @examples +#' # Example of calling the function +#' result <- get_imp_features_from_rf_model_with_cv( +#' Data = scores_df, +#' Undersample = FALSE, +#' best.m = 3, +#' testReps = 5, +#' Type = 2, +#' nTopImportance = 10 +#' ) +#' +#' @export + + get_imp_features_from_rf_model_with_cv <- function(Data=NULL, #scores_df Undersample = FALSE, diff --git a/R/get_15prediction_plot.R b/R/get_prediction_plot.R similarity index 66% rename from R/get_15prediction_plot.R rename to R/get_prediction_plot.R index 3ed7aa8..14ee74d 100644 --- a/R/get_15prediction_plot.R +++ b/R/get_prediction_plot.R @@ -1,3 +1,49 @@ +#' @title Generate Prediction Plot for Random Forest Model +#' +#'@description +#' This function performs model building and prediction using a random forest algorithm. It iterates over multiple test repetitions, training the model on the training data and predicting on the test data. After predictions are made, a histogram plot is generated to visualize the distribution of predicted probabilities for the outcome variable (`LIVER`). +#' +#' @param Data A data frame containing the dataset to use for training and testing. If `NULL`, the function will attempt to fetch and format the data from the database using `get_Data_formatted_for_ml_and_best.m` function. +#' @param path_db A string indicating the path to the database that contains the dataset. +#' @param rat_studies A logical flag indicating whether to use rat studies data. Defaults to `FALSE`. +#' @param studyid_metadata A data frame containing metadata related to the study IDs. Defaults to `NULL`. +#' @param fake_study A logical flag indicating whether to use fake study data. Defaults to `FALSE`. +#' @param use_xpt_file A logical flag indicating whether to use an XPT file. Defaults to `FALSE`. +#' @param Round A logical flag indicating whether to round the predictions. Defaults to `FALSE`. +#' @param Impute A logical flag indicating whether to impute missing values. Defaults to `FALSE`. +#' @param reps An integer specifying the number of repetitions for cross-validation. +#' @param holdback A numeric value indicating the proportion of data to hold back for testing during cross-validation. +#' @param Undersample A logical flag indicating whether to perform undersampling on the dataset to balance the classes. Defaults to `FALSE`. +#' @param hyperparameter_tuning A logical flag indicating whether to perform hyperparameter tuning. Defaults to `FALSE`. +#' @param error_correction_method A string specifying the error correction method to be used. Possible values are "Flip", "Prune", or "None". +#' @param testReps An integer specifying the number of test repetitions for model evaluation. +#' +#' @return A `ggplot` object representing the histogram of predicted probabilities for the `LIVER` variable across test repetitions. +#' +#' @details +#' The function works as follows: +#' - If `Data` is `NULL`, the function fetches the data and the best model configuration by calling the `get_Data_formatted_for_ml_and_best.m` function. +#' - The dataset is divided into training and test sets for each repetition (`testReps`). +#' - If `Undersample` is enabled, undersampling is applied to balance the dataset. +#' - A random forest model is trained on the training data and predictions are made on the test data. +#' - The predictions are averaged over the test repetitions and a histogram is plotted to visualize the distribution of predicted probabilities for `LIVER`. 
+#' +#' @examples +#' # Example function call +#' get_prediction_plot( +#' path_db = "path_to_db", +#' rat_studies = FALSE, +#' reps = 10, +#' holdback = 0.2, +#' Undersample = TRUE, +#' hyperparameter_tuning = FALSE, +#' error_correction_method = "Flip", +#' testReps = 5 +#' ) +#' +#' @export + + get_prediction_plot <- function(Data=NULL, path_db, rat_studies=FALSE, diff --git a/R/get_0repeat_dose_parallel_studyids.R b/R/get_repeat_dose_parallel_studyids.R similarity index 69% rename from R/get_0repeat_dose_parallel_studyids.R rename to R/get_repeat_dose_parallel_studyids.R index a06cae7..5c645f8 100644 --- a/R/get_0repeat_dose_parallel_studyids.R +++ b/R/get_repeat_dose_parallel_studyids.R @@ -1,4 +1,29 @@ - +#' @title Get Repeat Dose Parallel Study IDs +#' +#'@description +#' This function retrieves study IDs from a database that correspond to parallel-design studies involving repeat-dose toxicity. +#' It optionally filters the studies for rat species. +#' +#' @param path_db A character string representing the file path to the SQLite database. This is a required parameter. +#' @param rat_studies A logical flag indicating whether to filter the studies for rats only. Defaults to `FALSE`. +#' +#' @return A vector of study IDs that meet the specified criteria. This includes: +#' \itemize{ +#' \item Study IDs that match both the parallel design and repeat-dose toxicity criteria. +#' \item Optionally, study IDs that match rat species if `rat_studies = TRUE`. +#' } +#' +#' @examples +#' \dontrun{ +#' # Example without filtering for rat studies +#' study_ids <- get_repeat_dose_parallel_studyids(path_db = "path/to/database.sqlite") +#' +#' # Example with filtering for rat studies +#' study_ids_rats <- get_repeat_dose_parallel_studyids(path_db = "path/to/database.sqlite", rat_studies = TRUE) +#' } +#' +#' @export +#' get_repeat_dose_parallel_studyids <- function (path_db, rat_studies = FALSE) { diff --git a/R/get_14reprtree_from_rf_model .R b/R/get_reprtree_from_rf_model .R similarity index 56% rename from R/get_14reprtree_from_rf_model .R rename to R/get_reprtree_from_rf_model .R index 74e8498..c09bb1c 100644 --- a/R/get_14reprtree_from_rf_model .R +++ b/R/get_reprtree_from_rf_model .R @@ -1,3 +1,52 @@ +#' @title Get Representation Tree from Random Forest Model +#' +#'@description +#' This function trains a Random Forest model on a provided dataset and generates a representation tree (ReprTree) from the trained model. It supports various preprocessing configurations, model hyperparameters, and sampling strategies, including random undersampling. The function also allows for error correction and hyperparameter tuning. +#' +#' @param Data A data frame containing the dataset to train the Random Forest model. If `NULL`, data is fetched using the `get_Data_formatted_for_ml_and_best.m` function. +#' @param path_db A character string representing the path to the database used for fetching or processing the data. +#' @param rat_studies A logical flag indicating whether rat studies are used (default: `FALSE`). +#' @param studyid_metadata A data frame containing metadata related to study IDs (default: `NULL`). +#' @param fake_study A logical flag indicating whether to use fake study data (default: `FALSE`). +#' @param use_xpt_file A logical flag indicating whether to use the XPT file format for data input (default: `FALSE`). +#' @param Round A logical flag indicating whether to round the data before processing (default: `FALSE`). 
+#' @param Impute A logical flag indicating whether to impute missing values in the data (default: `FALSE`). +#' @param reps An integer specifying the number of repetitions to perform for cross-validation or resampling. +#' @param holdback A numeric value representing the fraction of data to hold back for testing. +#' @param Undersample A logical flag indicating whether undersampling should be applied to balance the dataset (default: `FALSE`). +#' @param hyperparameter_tuning A logical flag indicating whether hyperparameter tuning should be performed (default: `FALSE`). +#' @param error_correction_method A character string specifying the method for error correction. Must be one of `'Flip'`, `'Prune'`, or `'None'`. +#' +#' @return A plot of the first tree from the Random Forest model is displayed. The function does not return the ReprTree object explicitly, but it is generated and used for plotting. +#' +#' @details +#' The function performs the following steps: +#' 1. **Data Preparation**: If `Data` is `NULL`, it is fetched using the `get_Data_formatted_for_ml_and_best.m` function. Data is then split into training (70%) and testing (30%) sets. If `Undersample` is `TRUE`, the training data is balanced using undersampling. +#' 2. **Model Training**: A Random Forest model is trained using the `randomForest::randomForest` function. The target variable is `Target_Organ`, and the model uses the best hyperparameter (`best.m`). The number of trees is set to 500. +#' 3. **ReprTree Generation**: The `reprtree::ReprTree` function is used to generate the representation tree from the trained Random Forest model. +#' 4. **Visualization**: The first tree from the Random Forest model is plotted using the `reprtree::plot.getTree` function. +#' +#' @examples +#' get_reprtree_from_rf_model( +#' Data = my_data, +#' path_db = "path/to/database", +#' rat_studies = TRUE, +#' studyid_metadata = my_metadata, +#' fake_study = FALSE, +#' use_xpt_file = TRUE, +#' Round = TRUE, +#' Impute = TRUE, +#' reps = 5, +#' holdback = 0.3, +#' Undersample = TRUE, +#' hyperparameter_tuning = FALSE, +#' error_correction_method = "Flip" +#' ) +#' +#' @import randomForest +#' @import reprtree +#' @export + get_reprtree_from_rf_model <- function ( Data=NULL, path_db, diff --git a/R/get_c2rf_input_param_list_output_cv_imp.R b/R/get_rf_input_param_list_output_cv_imp.R similarity index 61% rename from R/get_c2rf_input_param_list_output_cv_imp.R rename to R/get_rf_input_param_list_output_cv_imp.R index b0bbfdf..4fb33f4 100644 --- a/R/get_c2rf_input_param_list_output_cv_imp.R +++ b/R/get_rf_input_param_list_output_cv_imp.R @@ -1,4 +1,68 @@ - +#' @title Prepare and Evaluate Random Forest Model with Cross-Validation and Feature Importance +#' +#'@description +#' This function prepares the data for training a Random Forest (RF) model with cross-validation, handles imputation, hyperparameter tuning, and evaluates the model's performance. It supports both real and fake study data, with options for rat studies, error correction, and feature importance selection. +#' +#' @param path_db A character string specifying the path to the SQLite database or directory containing the XPT file. +#' @param rat_studies A logical value indicating whether to filter for rat studies. Default is `FALSE`. +#' @param studyid_metadata A data frame containing metadata for the studies. +#' @param fake_study A logical value indicating whether to use fake study data. Default is `FALSE`. +#' @param use_xpt_file A logical value indicating whether to use XPT file data. 
Default is `FALSE`. +#' @param Round A logical value indicating whether to round the liver scores. Default is `FALSE`. +#' @param Impute A logical value indicating whether to impute missing values. Default is `FALSE`. +#' @param reps An integer specifying the number of repetitions for model evaluation. +#' @param holdback A numeric value specifying the proportion of data to hold back for validation. +#' @param Undersample A logical value indicating whether to undersample the data to balance classes. Default is `FALSE`. +#' @param hyperparameter_tuning A logical value indicating whether to tune the Random Forest model's hyperparameters. Default is `FALSE`. +#' @param error_correction_method A character string specifying the error correction method. Options are 'Flip', 'Prune', or 'None'. +#' @param best.m A numeric value specifying the number of trees in the Random Forest model. If `NULL`, the function determines this automatically. +#' @param testReps An integer specifying the number of test repetitions for model evaluation. +#' @param indeterminateUpper A numeric value for the upper threshold of indeterminate predictions. +#' @param indeterminateLower A numeric value for the lower threshold of indeterminate predictions. +#' @param Type A character string specifying the type of Random Forest model to use. Options include 'classification' or 'regression'. +#' @param nTopImportance An integer specifying the number of top important features to consider for the model. +#' +#' @return A list containing the trained Random Forest model, cross-validation results, and feature importance scores. +#' The list is returned by the `get_rf_model_with_cv` function. +#' +#' @details +#' The function performs the following steps: +#' \itemize{ +#' \item Fetches the study data based on the specified parameters. +#' \item Calculates liver scores and harmonizes the data. +#' \item Prepares data for machine learning, including imputation and optional hyperparameter tuning. +#' \item Trains and evaluates the Random Forest model with cross-validation. +#' \item Applies error correction (if specified) and selects the most important features. +#' } +#' +#' @examples +#' # Example usage of the function +#' result <- get_rf_input_param_list_output_cv_imp( +#' path_db = "path/to/database", +#' rat_studies = TRUE, +#' studyid_metadata = metadata_df, +#' fake_study = FALSE, +#' use_xpt_file = FALSE, +#' Round = TRUE, +#' Impute = TRUE, +#' reps = 10, +#' holdback = 0.2, +#' Undersample = TRUE, +#' hyperparameter_tuning = TRUE, +#' error_correction_method = "Flip", +#' best.m = NULL, +#' testReps = 5, +#' indeterminateUpper = 0.9, +#' indeterminateLower = 0.1, +#' Type = "classification", +#' nTopImportance = 10 +#' ) +#' +#' @import DBI +#' @import RSQLite +#' @importFrom stats lm +#' +#' @export diff --git a/R/get_c3zone_exclusioned_rf_model_cv_imp.R b/R/get_zone_exclusioned_rf_model_cv_imp.R similarity index 73% rename from R/get_c3zone_exclusioned_rf_model_cv_imp.R rename to R/get_zone_exclusioned_rf_model_cv_imp.R index b067e69..ab71329 100644 --- a/R/get_c3zone_exclusioned_rf_model_cv_imp.R +++ b/R/get_zone_exclusioned_rf_model_cv_imp.R @@ -1,3 +1,59 @@ +#' @title Perform Cross-Validation with Random Forest and Feature Importance Calculation +#' +#'@description +#' This function performs cross-validation on a Random Forest model, tracks +#' performance metrics (such as sensitivity, specificity, accuracy), handles +#' indeterminate predictions, and computes feature importance based on either +#' Gini or Accuracy. 
The function returns performance summaries and feature +#' importance rankings after a specified number of test repetitions. +#' +#' @param scores_df A data frame containing the features and target variable for training and testing the model. +#' @param Undersample A logical flag indicating whether to apply undersampling to the training data. Defaults to `FALSE`. +#' @param best.m A numeric value representing the number of features to sample for the Random Forest model, or `NULL` to calculate it automatically. +#' @param testReps An integer specifying the number of repetitions for cross-validation. Must be at least 2. +#' @param indeterminateUpper A numeric threshold above which predictions are not considered indeterminate. +#' @param indeterminateLower A numeric threshold below which predictions are not considered indeterminate. +#' @param Type An integer specifying the type of importance to compute. `1` for MeanDecreaseAccuracy, `2` for MeanDecreaseGini. +#' @param nTopImportance An integer specifying the number of top features to display based on their importance scores. +#' +#' @return A list with the following elements: +#' \describe{ +#' \item{performance_metrics}{A vector of aggregated performance metrics (e.g., sensitivity, specificity, accuracy, etc.).} +#' \item{feature_importance}{A matrix containing the importance of the top `nTopImportance` features, ordered by their importance score.} +#' \item{raw_results}{A list containing raw results for debugging or further analysis, including sensitivity, specificity, accuracy, and Gini scores across all test repetitions.} +#' } +#' +#' @details +#' The function splits the input data into training and testing sets based on the specified number of test repetitions (`testReps`). +#' During each iteration, it trains a Random Forest model and makes predictions on the test data. Indeterminate predictions are handled +#' by marking them as `NA`. The function tracks performance metrics such as sensitivity, specificity, and accuracy, and computes the +#' top `nTopImportance` features based on either Mean Decrease Accuracy or Mean Decrease Gini. +#' +#' @examples +#' # Example usage of the function +#' result <- get_rf_model_output_cv_imp( +#' scores_df = your_data, +#' Undersample = FALSE, +#' best.m = 3, +#' testReps = 5, +#' indeterminateUpper = 0.8, +#' indeterminateLower = 0.2, +#' Type = 1, +#' nTopImportance = 10 +#' ) +#' +#' # View performance metrics +#' print(result$performance_metrics) +#' +#' # View top features by importance +#' print(result$feature_importance) +#' +#' @import randomForest +#' @import caret +#' @import stats +#' @export + + get_rf_model_output_cv_imp <- function(scores_df=NULL, Undersample = FALSE, diff --git a/README.Rmd b/README.Rmd index 3299e88..176191c 100644 --- a/README.Rmd +++ b/README.Rmd @@ -1,111 +1,55 @@ --- -title: "SENDQSAR" output: github_document --- -# SENDQSAR: QSAR Modeling with SEND Database + -## About - -This package facilitates developing Quantitative Structure-Activity Relationship (QSAR) models using the SEND database. It streamlines data acquisition, preprocessing, descriptor calculation, and model evaluation, enabling researchers to efficiently explore molecular descriptors and create robust predictive models. - -## Features - -- **Automated Data Processing**: Simplifies data acquisition and preprocessing steps. -- **Comprehensive Analysis**: Provides z-score calculations for various parameters such as body weight, liver-to-body weight ratio, and laboratory tests. 
-- **Machine Learning Integration**: Supports classification modeling, hyperparameter tuning, and performance evaluation. -- **Visualization Tools**: Includes histograms, bar plots, and AUC curves for better data interpretation. - -## Functions Overview - -### Data Acquisition and Processing - -- `get_compile_data` - Fetches data from the database specified by the database path into a structured data frame for analysis. -- `get_bw_score` - Calculates body weight (BW) z-scores for each animal. -- `get_livertobw_zscore` - Computes liver-to-body weight z-scores. -- `get_lb_score` - Calculates z-scores for laboratory test (LB) results. -- `get_mi_score` - Computes z-scores for microscopic findings (MI). -- `get_liver_om_lb_mi_tox_score_list` - Combines z-scores of LB, MI, and liver-to-BW into a single data frame. -- `get_col_harmonized_scores_df` - Harmonizes column names across studies. - -### Machine Learning Preparation and Modeling - -- `get_ml_data_and_tuned_hyperparameters` - Prepares data and tunes hyperparameters for machine learning. -- `get_rf_model_with_cv` - Builds a random forest model with cross-validation and outputs performance metrics. -- `get_zone_exclusioned_rf_model_with_cv` - Introduces an indeterminate zone for improved classification accuracy. -- `get_imp_features_from_rf_model_with_cv` - Computes feature importance for model interpretation. -- `get_auc_curve_with_rf_model` - Generates AUC curves to evaluate model performance. - -### Visualization and Reporting - -- `get_histogram_barplot` - Creates bar plots for target variable classes. -- `get_reprtree_from_rf_model` - Builds representative decision trees for interpretability. -- `get_prediction_plot` - Visualizes prediction probabilities with histograms. - -### Automated Pipelines - -- `get_Data_formatted_for_ml_and_best.m` - Formats data for machine learning pipelines. -- `get_rf_input_param_list_output_cv_imp` - Automates preprocessing, modeling, and evaluation in one step. -- `get_zone_exclusioned_rf_model_cv_imp` - Similar to the above function, but excludes uncertain predictions based on thresholds. - -## Workflow +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + fig.path = "man/figures/README-", + out.width = "100%" +) +``` -1. **Input Database Path**: Provide the database path containing nonclinical study results for each STUDYID. -2. **Preprocessing**: Use functions 1-8 to clean, harmonize, and prepare data. -3. **Model Building**: Employ machine learning functions (9-18) for training, validation, and evaluation. -4. **Visualization**: Generate plots and performance metrics for better interpretation. +# SENDQSAR -## Dependencies + + -- `randomForest` -- `ROCR` -- `ggplot2` -- `reprtree` +The goal of SENDQSAR is to ... ## Installation -```R -# Install from GitHub -devtools::install_github("aminuldu07/SENDQSAR") +You can install the development version of SENDQSAR from [GitHub](https://github.com/) with: + +``` r +# install.packages("pak") +pak::pak("aminuldu07/SENDQSAR") ``` -## Examples +## Example -### Example 1: Basic Data Compilation +This is a basic example which shows you how to solve a common problem: -```R +```{r example} library(SENDQSAR) -data <- get_compile_data("/path/to/database") +## basic example code ``` -### Example 2: Z-Score Calculation +What is special about using `README.Rmd` instead of just `README.md`? 
You can include R chunks like so: -```R -bw_scores <- get_bw_score(data) -liver_scores <- get_livertobw_zscore(data) +```{r cars} +summary(cars) ``` -### Example 3: Machine Learning Model - -```R -model <- get_rf_model_with_cv(data, n_repeats=10) -print(model$confusion_matrix) -``` +You'll still need to render `README.Rmd` regularly, to keep `README.md` up-to-date. `devtools::build_readme()` is handy for this. -### Example 4: Visualization +You can also embed plots, for example: -```R -get_histogram_barplot(data, target_col="target_variable") +```{r pressure, echo = FALSE} +plot(pressure) ``` -## Contribution - -Contributions are welcome! Feel free to submit issues or pull requests via GitHub. - -## License - -This project is licensed under the MIT License - see the LICENSE file for details. - -## Contact - -For more information, visit the project GitHub Page or contact email@example.com. +In that case, don't forget to commit and push the resulting figure files, so they display on GitHub and CRAN. diff --git a/README.html b/README.html new file mode 100644 index 0000000..7a5ee86 --- /dev/null +++ b/README.html @@ -0,0 +1,641 @@ + + + + + + + + + + + + + + + + + + + + + +

[README.html — 641 lines of rendered HTML added in this change; its text duplicates the README.md content below and is omitted here.]
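Since the regenerated README (the .Rmd above and the .md below) leaves its Example section as template boilerplate, here is a minimal, hypothetical usage sketch assembled from the @examples blocks documented elsewhere in this diff; the study ID and database path are placeholders, not real values.

```r
library(SENDQSAR)

# Compile study data from a SEND SQLite database (placeholder study ID and path)
data <- get_compile_data(studyid = "1234123", path_db = "path/to/database.db")

# Body-weight (BW) z-scores for the same study
bw_scores <- get_bw_score(studyid = "1234123", path_db = "path/to/database.db")

# Bar plot comparing liver-related findings to other findings, using simulated study data
get_histogram_barplot(generateBarPlot = TRUE, fake_study = TRUE)
```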
+ + + diff --git a/README.md b/README.md new file mode 100644 index 0000000..36f0f00 --- /dev/null +++ b/README.md @@ -0,0 +1,52 @@ + + + +# SENDQSAR + + + + +The goal of SENDQSAR is to … + +## Installation + +You can install the development version of SENDQSAR from +[GitHub](https://github.com/) with: + +``` r +# install.packages("pak") +pak::pak("aminuldu07/SENDQSAR") +``` + +## Example + +This is a basic example which shows you how to solve a common problem: + +``` r +library(SENDQSAR) +## basic example code +``` + +What is special about using `README.Rmd` instead of just `README.md`? +You can include R chunks like so: + +``` r +summary(cars) +#> speed dist +#> Min. : 4.0 Min. : 2.00 +#> 1st Qu.:12.0 1st Qu.: 26.00 +#> Median :15.0 Median : 36.00 +#> Mean :15.4 Mean : 42.98 +#> 3rd Qu.:19.0 3rd Qu.: 56.00 +#> Max. :25.0 Max. :120.00 +``` + +You’ll still need to render `README.Rmd` regularly, to keep `README.md` +up-to-date. `devtools::build_readme()` is handy for this. + +You can also embed plots, for example: + + + +In that case, don’t forget to commit and push the resulting figure +files, so they display on GitHub and CRAN. diff --git a/vignettes/.gitignore b/inst/.gitignore similarity index 100% rename from vignettes/.gitignore rename to inst/.gitignore diff --git a/man/get_Data_formatted_for_ml_and_best.m.Rd b/man/get_Data_formatted_for_ml_and_best.m.Rd new file mode 100644 index 0000000..2def295 --- /dev/null +++ b/man/get_Data_formatted_for_ml_and_best.m.Rd @@ -0,0 +1,82 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_Data_formatted_for_ml_and_best.m.R +\name{get_Data_formatted_for_ml_and_best.m} +\alias{get_Data_formatted_for_ml_and_best.m} +\title{Retrieve and Preprocess Data for Machine Learning Models} +\usage{ +get_Data_formatted_for_ml_and_best.m( + path_db, + rat_studies = FALSE, + studyid_metadata = NULL, + fake_study = FALSE, + use_xpt_file = FALSE, + Round = FALSE, + Impute = FALSE, + reps, + holdback, + Undersample = FALSE, + hyperparameter_tuning = FALSE, + error_correction_method +) +} +\arguments{ +\item{path_db}{A character string representing the path to the SQLite database or XPT file.} + +\item{rat_studies}{A logical flag to filter for rat studies (default is FALSE).} + +\item{studyid_metadata}{A data frame containing metadata for the study IDs. If NULL, metadata is generated (default is NULL).} + +\item{fake_study}{A logical flag to use fake study data (default is FALSE).} + +\item{use_xpt_file}{A logical flag to indicate whether to use an XPT file instead of a SQLite database (default is FALSE).} + +\item{Round}{A logical flag to round liver toxicity scores (default is FALSE).} + +\item{Impute}{A logical flag to impute missing values in the dataset (default is FALSE).} + +\item{reps}{An integer specifying the number of repetitions for cross-validation.} + +\item{holdback}{A numeric value indicating the fraction of data to hold back for validation.} + +\item{Undersample}{A logical flag to undersample the majority class (default is FALSE).} + +\item{hyperparameter_tuning}{A logical flag to perform hyperparameter tuning (default is FALSE).} + +\item{error_correction_method}{A character string specifying the error correction method. 
Must be one of 'Flip', 'Prune', or 'None'.} +} +\value{ +A list containing: +\item{Data}{A data frame containing the preprocessed data ready for machine learning.} +\item{best.m}{The best machine learning model after hyperparameter tuning, if applicable.} +} +\description{ +This function processes data from a given SQLite database or XPT file, calculates liver toxicity scores, and prepares data for machine learning models. +It can also tune hyperparameters and apply error correction methods. +} +\details{ +This function performs several key steps: +\itemize{ +\item Retrieves study IDs from an SQLite database or XPT file. +\item Generates or uses provided study metadata, including a random assignment of "Target_Organ" values (either "Liver" or "not_Liver"). +\item Calculates liver toxicity scores using the \code{get_liver_om_lb_mi_tox_score_list} function. +\item Harmonizes the calculated scores using the \code{get_col_harmonized_scores_df} function. +\item Prepares the data for machine learning and tunes hyperparameters (if enabled) using the \code{get_ml_data_and_tuned_hyperparameters} function. +\item Returns the processed data and the best model. +} +} +\examples{ +\dontrun{ +result <- get_Data_formatted_for_ml_and_best.m( + path_db = "path/to/database.db", + rat_studies = TRUE, + reps = 5, + holdback = 0.2, + error_correction_method = "Flip" +) + +# Access the processed data and the best model +processed_data <- result$Data +best_model <- result$best.m +} + +} diff --git a/man/get_all_lb_TESTCD_zscore.Rd b/man/get_all_lb_TESTCD_zscore.Rd deleted file mode 100644 index de17981..0000000 --- a/man/get_all_lb_TESTCD_zscore.Rd +++ /dev/null @@ -1,45 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_all_LB_TESTCD_score.R -\name{get_all_lb_TESTCD_zscore} -\alias{get_all_lb_TESTCD_zscore} -\title{get LB score for a given studyid} -\usage{ -get_all_lb_TESTCD_zscore( - studyid = NULL, - path_db, - fake_study = FALSE, - use_xpt_file = FALSE, - master_compiledata = NULL, - return_individual_scores = FALSE -) -} -\arguments{ -\item{studyid}{Mandatory, character \cr -Studyid number} - -\item{path_db}{Mandatory, character \cr -path of database} - -\item{fake_study}{optional, Boolean \cr -whether study generated by SENDsanitizer package} - -\item{use_xpt_file}{Mandatory, character \cr -Studyid number} - -\item{master_compiledata}{Mandatory, character \cr -path of database} - -\item{return_individual_scores}{optional, Boolean \cr -whether study generated by SENDsanitizer package} -} -\value{ -score -} -\description{ -get LB score for a given studyid -} -\examples{ -\dontrun{ -get_lb_score(studyid='1234123', path_db='path/to/database.db') -} -} diff --git a/man/get_auc_curve_with_rf_model.Rd b/man/get_auc_curve_with_rf_model.Rd new file mode 100644 index 0000000..3471971 --- /dev/null +++ b/man/get_auc_curve_with_rf_model.Rd @@ -0,0 +1,93 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_auc_curve_with_rf_model.R +\name{get_auc_curve_with_rf_model} +\alias{get_auc_curve_with_rf_model} +\title{Compute and Plot AUC Curve with Random Forest Model} +\usage{ +get_auc_curve_with_rf_model( + Data = NULL, + path_db = NULL, + rat_studies = FALSE, + studyid_metadata, + fake_study = FALSE, + use_xpt_file = FALSE, + Round = FALSE, + Impute = FALSE, + best.m = NULL, + reps, + holdback, + Undersample = FALSE, + hyperparameter_tuning = FALSE, + error_correction_method, + output_individual_scores = TRUE, + output_zscore_by_USUBJID = FALSE +) 
+} +\arguments{ +\item{Data}{A data frame containing the training data. If \code{NULL}, data will be fetched from the database.} + +\item{path_db}{A string representing the path to the SQLite database used to fetch data when \code{Data} is \code{NULL}.} + +\item{rat_studies}{Logical; whether to filter for rat studies. Defaults to \code{FALSE}.} + +\item{studyid_metadata}{A data frame containing metadata associated with study IDs.} + +\item{fake_study}{Logical; whether to use fake study IDs for data simulation. Defaults to \code{FALSE}.} + +\item{use_xpt_file}{Logical; whether to use an XPT file for input data. Defaults to \code{FALSE}.} + +\item{Round}{Logical; whether to round numerical values. Defaults to \code{FALSE}.} + +\item{Impute}{Logical; whether to perform imputation on missing values. Defaults to \code{FALSE}.} + +\item{best.m}{The 'mtry' hyperparameter for Random Forest. If \code{NULL}, it is determined by the function.} + +\item{reps}{A numeric value indicating the number of repetitions for cross-validation. Defaults to a numeric value.} + +\item{holdback}{Numeric; either 1 or a fraction value (e.g., 0.75) for holdback during cross-validation.} + +\item{Undersample}{Logical; whether to perform undersampling. Defaults to \code{FALSE}.} + +\item{hyperparameter_tuning}{Logical; whether to perform hyperparameter tuning. Defaults to \code{FALSE}.} + +\item{error_correction_method}{Character; one of "Flip", "Prune", or "None", specifying the method of error correction.} + +\item{output_individual_scores}{Logical; whether to output individual scores. Defaults to \code{TRUE}.} + +\item{output_zscore_by_USUBJID}{Logical; whether to output z-scores by subject ID. Defaults to \code{FALSE}.} +} +\value{ +This function does not return any explicit value. It generates: +\itemize{ +\item The AUC (Area Under the Curve) printed to the console. +\item A ROC curve plot with the calculated AUC value. +\item Various performance metrics (e.g., True Positive Rate, False Positive Rate), displayed in the plot. +} +} +\description{ +This function trains a Random Forest model, computes the ROC curve, and +calculates the AUC (Area Under the Curve). It allows various preprocessing +options, such as imputation, rounding, undersampling, and hyperparameter tuning. +} +\details{ +The function prepares data for training a Random Forest model by first fetching data from an SQLite database +or generating synthetic data (if \code{fake_study} is \code{TRUE}). It processes the data using various options such +as imputation, rounding, and undersampling. The model is trained using the Random Forest algorithm, and +performance is evaluated via the ROC curve and AUC metric. + +The function also allows for hyperparameter tuning and error correction. After training the model, +predictions are made, and the AUC is calculated and visualized with a ROC curve plot. 
+} +\examples{ +# Example 1: Using real data from the database +get_auc_curve_with_rf_model(Data = NULL, path_db = "path/to/database.db", rat_studies = TRUE, reps = 10, + holdback = 0.75, error_correction_method = "Prune") + +# Example 2: Using synthetic data with fake study IDs +get_auc_curve_with_rf_model(Data = NULL, fake_study = TRUE, reps = 5, holdback = 0.8, + error_correction_method = "Flip") + +} +\seealso{ +\code{randomForest}, \code{ROCR} +} diff --git a/man/get_bw_score.Rd b/man/get_bw_score.Rd index a03bb1e..fa9b7c4 100644 --- a/man/get_bw_score.Rd +++ b/man/get_bw_score.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/get_bw_score.R \name{get_bw_score} \alias{get_bw_score} -\title{get BW score for a given studyid} +\title{Calculate BW Score for a Given Study ID} \usage{ get_bw_score( studyid = NULL, @@ -16,34 +16,48 @@ get_bw_score( } \arguments{ \item{studyid}{Mandatory, character \cr -Studyid number} +The study ID for which the BW score is to be calculated. If \code{NULL}, all studies in the database are analyzed.} \item{path_db}{Mandatory, character \cr -path of database} +The path to the database file containing the study data.} -\item{fake_study}{optional, Boolean \cr -whether study generated by SENDsanitizer package} +\item{fake_study}{Optional, Boolean \cr +Indicates whether the study was generated by the \code{SENDsanitizer} package. Default is \code{FALSE}.} -\item{use_xpt_file}{Mandatory, character \cr -Studyid number} +\item{use_xpt_file}{Mandatory, Boolean \cr +If \code{TRUE}, the function uses \code{.xpt} files for processing the study data. Default is \code{FALSE}.} -\item{master_compiledata}{Mandatory, character \cr -path of database} +\item{master_compiledata}{Optional, character \cr +The path to an additional database or compiled data file for analysis. If \code{NULL}, only the primary database is used.} -\item{return_individual_scores}{optional, Boolean \cr -whether study generated by SENDsanitizer package} +\item{return_individual_scores}{Optional, Boolean \cr +If \code{TRUE}, the function returns individual scores for each record in the study. Default is \code{FALSE}.} -\item{return_zscore_by_USUBJID}{optional, Boolean \cr -whether study generated by SENDsanitizer package} +\item{return_zscore_by_USUBJID}{Optional, Boolean \cr +If \code{TRUE}, the function returns z-scores calculated by \code{USUBJID} (unique subject identifiers). Default is \code{FALSE}.} } \value{ -dataframe +A \code{data.frame} containing the calculated BW scores. The structure of the output depends on the provided parameters: +\itemize{ +\item If \code{return_individual_scores = TRUE}: Returns individual scores for each record. +\item If \code{return_zscore_by_USUBJID = TRUE}: Returns z-scores by \code{USUBJID}. +\item Otherwise, a summarized BW score for the specified \code{studyid}. +} } \description{ -get BW score for a given studyid +The \code{get_bw_score} function calculates the Bayesian Weighted (BW) score for a specified study ID using data from a provided database. +It supports optional parameters for fine-tuning the analysis and offers the flexibility to return individual scores or z-scores by \code{USUBJID}. 
} \examples{ \dontrun{ -get_bw_score(studyid='1234123', path_db='path/to/database.db') +# Example 1: Basic usage +get_bw_score(studyid = '1234123', path_db = 'path/to/database.db') + +# Example 2: Include individual scores +get_bw_score(studyid = '1234123', path_db = 'path/to/database.db', return_individual_scores = TRUE) + +# Example 3: Include z-scores by USUBJID +get_bw_score(studyid = '1234123', path_db = 'path/to/database.db', return_zscore_by_USUBJID = TRUE) } + } diff --git a/man/get_col_harmonized_scores_df.Rd b/man/get_col_harmonized_scores_df.Rd new file mode 100644 index 0000000..ef1ea98 --- /dev/null +++ b/man/get_col_harmonized_scores_df.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_col_harmonized_scores_df.R +\name{get_col_harmonized_scores_df} +\alias{get_col_harmonized_scores_df} +\title{get_col_harmonized_scores_df} +\usage{ +get_col_harmonized_scores_df(liver_score_data_frame, Round = FALSE) +} +\arguments{ +\item{liver_score_data_frame}{A data frame containing liver score data. +This data frame should have column names that may require harmonization.} + +\item{Round}{A logical value indicating whether the data should be rounded. +If TRUE, certain liver-related columns are floored and capped, and histology-related columns are ceiled. Default is FALSE.} +} +\value{ +A data frame with harmonized liver scores, optional rounding, and columns reordered based on their sums. +} +\description{ +This function harmonizes liver score data by cleaning column names, +replacing missing values with zeros, and optionally rounding specific columns. +The function also identifies and harmonizes synonyms, removes unnecessary columns, +and reorders the data based on column sums. +} +\details{ +The function performs the following operations: +\itemize{ +\item Harmonizes column names by replacing spaces, commas, and slashes with dots. +\item Replaces missing values (NA) with zero. +\item Identifies and harmonizes synonym columns, replacing their values with the higher value between the synonyms. +\item Removes specific unwanted columns such as 'INFILTRATE', 'UNREMARKABLE', 'THIKENING', and 'POSITIVE'. +\item Optionally rounds liver score columns by flooring and capping them at 5, and histology-related columns by ceiling. +\item Reorders columns based on the sum of their values. +} +} +\examples{ +\dontrun{ +# Example usage +result <- get_col_harmonized_scores_df(liver_score_data_frame = liver_scores, Round = TRUE) +} + +} diff --git a/man/get_compile_data.Rd b/man/get_compile_data.Rd index 1a1acee..8552073 100644 --- a/man/get_compile_data.Rd +++ b/man/get_compile_data.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/get_compile_data.R \name{get_compile_data} \alias{get_compile_data} -\title{filter out tk and recovery animal} +\title{Retrieve Compiled Data from SQLite Database or XPT File} \usage{ get_compile_data( studyid = NULL, @@ -12,26 +12,28 @@ get_compile_data( ) } \arguments{ -\item{studyid}{Mandatory, character \cr -Studyid number} +\item{studyid}{Character. Study ID number. Defaults to \code{NULL}. +If \code{NULL}, all available studies may be retrieved (behavior depends on the database structure).} -\item{path_db}{Mandatory, character \cr -path of database} +\item{path_db}{Character. Path to the SQLite database file. Mandatory.} -\item{fake_study}{optional, Boolean \cr -whether study generated by SENDsanitizer package} +\item{fake_study}{Logical. Whether the study data is generated by the \code{SENDsanitizer} package. 
Defaults to \code{FALSE}.} -\item{use_xpt_file}{optional, Boolean \cr -whether study generated by SENDsanitizer package} +\item{use_xpt_file}{Logical. Whether to retrieve study data from an XPT file format instead of the database. Defaults to \code{FALSE}.} } \value{ -dataframe +A data frame containing the compiled study data. The structure of the returned data frame depends on the database or XPT file contents. } \description{ -filter out tk and recovery animal +This function retrieves and compiles data for a given study ID +from either a SQLite database or XPT file. } \examples{ \dontrun{ -get_compile_data(studyid='1234123', path_db='path/to/database.db') +# Retrieve data for a specific study ID from the database +get_compile_data(studyid = '1234123', path_db = 'path/to/database.db') + +# Retrieve data from an XPT file +get_compile_data(path_db = 'path/to/file.xpt', use_xpt_file = TRUE) } } diff --git a/man/get_fixed_parameter_rf_model.Rd b/man/get_fixed_parameter_rf_model.Rd deleted file mode 100644 index cf62894..0000000 --- a/man/get_fixed_parameter_rf_model.Rd +++ /dev/null @@ -1,29 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/fixed_parameter_rf_model.R -\name{get_fixed_parameter_rf_model} -\alias{get_fixed_parameter_rf_model} -\title{get_random_forest_model} -\usage{ -get_fixed_parameter_rf_model( - Liver_get_liver_om_lb_mi_tox_score_list, - not_Liver_get_liver_om_lb_mi_tox_score_list -) -} -\arguments{ -\item{Liver_get_liver_om_lb_mi_tox_score_list}{Mandatory, character \cr -Studyid number} - -\item{not_Liver_get_liver_om_lb_mi_tox_score_list}{Mandatory, character \cr -path of database} -} -\value{ -score -} -\description{ -get_random_forest_model -} -\examples{ -\dontrun{ -get_liver_lb_score(studyid='1234123', database_path = dbtoken) -} -} diff --git a/man/get_harmonized_column.Rd b/man/get_harmonized_column.Rd deleted file mode 100644 index fbb7966..0000000 --- a/man/get_harmonized_column.Rd +++ /dev/null @@ -1,47 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/Copy__Ofcolumn_harmization.R, -% R/column_harmization.R -\name{get_harmonized_column} -\alias{get_harmonized_column} -\title{get_random_forest_model} -\usage{ -get_harmonized_column( - data_frame = NULL, - Liver_get_liver_om_lb_mi_tox_score_list, - not_Liver_get_liver_om_lb_mi_tox_score_list -) - -get_harmonized_column( - data_frame = NULL, - Liver_get_liver_om_lb_mi_tox_score_list, - not_Liver_get_liver_om_lb_mi_tox_score_list -) -} -\arguments{ -\item{data_frame}{Mandatory, character \cr -Studyid number} - -\item{Liver_get_liver_om_lb_mi_tox_score_list}{Mandatory, character \cr -Studyid number} - -\item{not_Liver_get_liver_om_lb_mi_tox_score_list}{Mandatory, character \cr -path of database} -} -\value{ -score - -score -} -\description{ -get_random_forest_model - -get_random_forest_model -} -\examples{ -\dontrun{ -get_liver_lb_score(studyid='1234123', database_path = dbtoken) -} -\dontrun{ -get_liver_lb_score(studyid='1234123', database_path = dbtoken) -} -} diff --git a/man/get_histogram_barplot.Rd b/man/get_histogram_barplot.Rd new file mode 100644 index 0000000..ccd6446 --- /dev/null +++ b/man/get_histogram_barplot.Rd @@ -0,0 +1,75 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_histogram_barplot.R +\name{get_histogram_barplot} +\alias{get_histogram_barplot} +\title{Generate Histogram or Bar Plot for Liver-Related Scores} +\usage{ +get_histogram_barplot( + Data = NULL, + generateBarPlot = FALSE, + 
path_db = FALSE, + rat_studies = FALSE, + studyid_metadata, + fake_study = FALSE, + use_xpt_file = FALSE, + Round = FALSE, + output_individual_scores = TRUE, + output_zscore_by_USUBJID = FALSE +) +} +\arguments{ +\item{Data}{A data frame containing liver-related scores. If NULL, the function will attempt to +generate or fetch the data from a database or file.} + +\item{generateBarPlot}{A logical flag (default = FALSE). If TRUE, generates a bar plot. If FALSE, +returns the processed data.} + +\item{path_db}{A character string representing the path to the SQLite database. Required if +\code{use_xpt_file} is FALSE or \code{fake_study} is FALSE.} + +\item{rat_studies}{A logical flag (default = FALSE) to filter for rat studies when fetching data +from the database.} + +\item{studyid_metadata}{A data frame containing metadata associated with study IDs. Required when +\code{fake_study} is FALSE and real data is fetched.} + +\item{fake_study}{A logical flag (default = FALSE). If TRUE, the function simulates study data +instead of fetching it from a database.} + +\item{use_xpt_file}{A logical flag (default = FALSE). If TRUE, the function will use an XPT file +to fetch data, instead of relying on the database.} + +\item{Round}{A logical flag (default = FALSE). Whether to round the liver scores.} + +\item{output_individual_scores}{A logical flag (default = TRUE). Whether to output individual +scores or aggregated scores.} + +\item{output_zscore_by_USUBJID}{A logical flag (default = FALSE). Whether to output z-scores +by USUBJID (unique subject identifier).} +} +\value{ +If \code{generateBarPlot = TRUE}, a \code{ggplot2} bar plot object is returned displaying the +average scores for liver-related findings versus non-liver-related findings. If +\code{generateBarPlot = FALSE}, a data frame (\code{plotData}) containing the calculated values +for each finding, liver status (\code{LIVER}), and mean values (\code{Value}) is returned. +} +\description{ +This function generates a bar plot comparing liver-related findings to non-liver-related findings, +or returns processed data for further analysis. The function can fetch data from an SQLite database, +a provided XPT file, or simulate data if \code{fake_study} is set to TRUE. +} +\details{ +If no data is provided, the function attempts to fetch data from an SQLite database or simulate +data based on the \code{fake_study} flag. The function also supports the use of XPT files and allows +customization of study filtering through the \code{rat_studies} and \code{studyid_metadata} parameters. +When generating a plot, the function compares liver-related findings to other findings, +displaying the average scores for each finding in a bar plot. 
+} +\examples{ +# Example 1: Generate a bar plot with fake study data +get_histogram_barplot(generateBarPlot = TRUE, fake_study = TRUE) + +# Example 2: Get processed data without generating a plot +data <- get_histogram_barplot(generateBarPlot = FALSE, fake_study = FALSE, path_db = "path/to/db") + +} diff --git a/man/get_imp_features_from_rf_model_with_cv.Rd b/man/get_imp_features_from_rf_model_with_cv.Rd new file mode 100644 index 0000000..a58735d --- /dev/null +++ b/man/get_imp_features_from_rf_model_with_cv.Rd @@ -0,0 +1,63 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_imp_features_from_rf_model_with_cv.R +\name{get_imp_features_from_rf_model_with_cv} +\alias{get_imp_features_from_rf_model_with_cv} +\title{Get Important Features from Random Forest Model with Cross-Validation} +\usage{ +get_imp_features_from_rf_model_with_cv( + Data = NULL, + Undersample = FALSE, + best.m = NULL, + testReps, + Type, + nTopImportance +) +} +\arguments{ +\item{Data}{A data frame containing the training data (rows as samples, columns as features). The first column is assumed to be the target variable.} + +\item{Undersample}{A logical value indicating whether to apply under-sampling to balance the classes in the training data. Default is \code{FALSE}.} + +\item{best.m}{A numeric value representing the number of variables to consider at each split of the Random Forest model (or a function to determine this). Default is \code{NULL}.} + +\item{testReps}{A numeric value indicating the number of test repetitions (must be at least 2).} + +\item{Type}{A numeric value indicating the type of importance to be calculated. \code{1} for Mean Decrease Accuracy and \code{2} for Mean Decrease Gini.} + +\item{nTopImportance}{A numeric value indicating the number of top important features to return based on their importance scores.} +} +\value{ +A list containing: +\describe{ +\item{gini_scores}{A matrix of Gini importance scores for each feature across the different cross-validation iterations. The matrix has rows representing features and columns representing test iterations.} +} +} +\description{ +This function performs cross-validation with test repetitions on a random forest model, calculates feature importance using Gini importance, and returns the top \code{n} important features. +} +\details{ +This function trains a Random Forest model using cross-validation with specified repetitions and calculates the feature importance using Gini importance scores. The function also supports optional under-sampling to balance the class distribution in the training set. + +The function performs the following steps: +\itemize{ +\item Initializes performance metric trackers. +\item Prepares the input data for cross-validation. +\item Performs cross-validation, where each repetition involves training the model on a subset of data and testing on the remaining data. +\item Optionally applies under-sampling to the training data. +\item Trains a Random Forest model on each fold and calculates Gini importance scores. +\item Aggregates and sorts the Gini importance scores to identify the top features. +\item Plots the importance of top features. 
+} +} +\examples{ +# Example of calling the function +result <- get_imp_features_from_rf_model_with_cv( + Data = scores_df, + Undersample = FALSE, + best.m = 3, + testReps = 5, + Type = 2, + nTopImportance = 10 +) + +} diff --git a/man/get_lb_score.Rd b/man/get_lb_score.Rd index 9825322..c9868f7 100644 --- a/man/get_lb_score.Rd +++ b/man/get_lb_score.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/get_lb_score.R \name{get_lb_score} \alias{get_lb_score} -\title{get LB score for a given studyid} +\title{Get LB Score for a Given Study ID} \usage{ get_lb_score( studyid = NULL, @@ -15,35 +15,39 @@ get_lb_score( ) } \arguments{ -\item{studyid}{Mandatory, character \cr -Studyid number} +\item{studyid}{Mandatory, character +The study ID number for which the LB score is calculated.} -\item{path_db}{Mandatory, character \cr -path of database} +\item{path_db}{Mandatory, character +The path to the database containing the necessary data for the calculation.} -\item{fake_study}{optional, Boolean \cr -whether study generated by SENDsanitizer package} +\item{fake_study}{Optional, boolean +Indicates whether the study is generated by the SENDsanitizer package. Defaults to \code{FALSE}.} -\item{use_xpt_file}{Mandatory, character \cr -Studyid number} +\item{use_xpt_file}{Mandatory, character +Specifies the path to the XPT (SAS transport) file if it is being used for the study.} -\item{master_compiledata}{Mandatory, character \cr -path of database} +\item{master_compiledata}{Mandatory, character +The path to the compiled master dataset that will be used to calculate the LB score.} -\item{return_individual_scores}{optional, Boolean \cr -whether study generated by SENDsanitizer package} +\item{return_individual_scores}{Optional, boolean +If \code{TRUE}, the function will return individual scores for each subject. Defaults to \code{FALSE}.} -\item{return_zscore_by_USUBJID}{optional, Boolean \cr -whether study generated by SENDsanitizer package} +\item{return_zscore_by_USUBJID}{Optional, boolean +If \code{TRUE}, the function will return Z-scores by \code{USUBJID}. Defaults to \code{FALSE}.} } \value{ -score +numeric +The calculated LB score based on the provided data and parameters. } \description{ -get LB score for a given studyid +This function computes the LB score for a given study ID using data stored in a specified database. +It offers various optional parameters to customize the output, such as whether to return individual scores or Z-scores by \code{USUBJID}. } \examples{ \dontrun{ +# Example usage of the function get_lb_score(studyid='1234123', path_db='path/to/database.db') } + } diff --git a/man/get_liver_om_lb_mi_tox_score_list.Rd b/man/get_liver_om_lb_mi_tox_score_list.Rd index 4d02fbe..ed79159 100644 --- a/man/get_liver_om_lb_mi_tox_score_list.Rd +++ b/man/get_liver_om_lb_mi_tox_score_list.Rd @@ -5,45 +5,59 @@ \title{get_liver_om_lb_mi_tox_score_list} \usage{ get_liver_om_lb_mi_tox_score_list( - selected_studies, + studyid_or_studyids = FALSE, path_db, fake_study = FALSE, use_xpt_file = FALSE, - multiple_xpt_folder = FALSE, output_individual_scores = FALSE, output_zscore_by_USUBJID = FALSE ) } \arguments{ -\item{selected_studies}{Mandatory, character \cr -Studyid number} +\item{studyid_or_studyids}{A character vector or a single study ID to process. +If multiple studies are provided, the function processes each study sequentially. 
(Mandatory)} -\item{path_db}{Mandatory, character \cr -path of database} +\item{path_db}{A character string specifying the path to the database or directory containing the data files. +(Mandatory)} -\item{fake_study}{optional, Boolean \cr -whether study generated by SENDsanitizer package} +\item{fake_study}{A boolean flag indicating if the study data is simulated (\code{TRUE}) or real (\code{FALSE}). Default is \code{FALSE}. (Optional)} -\item{use_xpt_file}{Mandatory, character \cr -path of database} +\item{use_xpt_file}{A boolean flag indicating whether to use an XPT file for the study data. Default is \code{FALSE}. (Mandatory)} -\item{multiple_xpt_folder}{Mandatory, character \cr -path of database} +\item{output_individual_scores}{A boolean flag indicating whether individual scores should be returned (\code{TRUE}) or averaged scores (\code{FALSE}). Default is \code{FALSE}. (Optional)} -\item{output_individual_scores}{optional, Boolean \cr -whether study generated by SENDsanitizer package} +\item{output_zscore_by_USUBJID}{A boolean flag indicating whether to output z-scores by \code{USUBJID} (\code{TRUE}) or averaged scores (\code{FALSE}). Default is \code{FALSE}. (Optional)} -\item{output_zscore_by_USUBJID}{optional, Boolean \cr -whether study generated by SENDsanitizer package} +\item{multiple_xpt_folder}{A character string specifying the path to the folder containing multiple XPT files. +(Optional)} } \value{ -dataframe +A data frame containing the calculated scores for each study. The type of result depends on the flags passed: +\itemize{ +\item If \code{output_individual_scores} is \code{TRUE}, a data frame with individual scores for each study is returned. +\item If \code{output_zscore_by_USUBJID} is \code{TRUE}, a data frame with z-scores by \code{USUBJID} for each study is returned. +\item If neither flag is set, the function returns a data frame with averaged scores for each study. +} } \description{ -get_liver_om_lb_mi_tox_score_list +This function processes liver organ toxicity scores, body weight z-scores, and other related metrics +for a set of studies or XPT files. It can output individual scores, z-scores by USUBJID, or averaged scores +for multiple studies, and handles errors during the processing steps. } \examples{ \dontrun{ -get_compile_data(studyid='1234123', path_db='path/to/database.db') +# Get averaged scores for a single study +result <- get_liver_om_lb_mi_tox_score_list( + studyid_or_studyids = "Study_001", + path_db = "path/to/database" +) + +# Get individual scores for multiple studies +result_individual_scores <- get_liver_om_lb_mi_tox_score_list( + studyid_or_studyids = c("Study_001", "Study_002"), + path_db = "path/to/database", + output_individual_scores = TRUE +) } + } diff --git a/man/get_livertobw_score.Rd b/man/get_livertobw_score.Rd index b8d4137..4b89855 100644 --- a/man/get_livertobw_score.Rd +++ b/man/get_livertobw_score.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/get_livertobw_score.R \name{get_livertobw_score} \alias{get_livertobw_score} -\title{get_liver_livertobw_score} +\title{Calculate Liver-to-Body-Weight Scores and Z-Scores} \usage{ get_livertobw_score( studyid = NULL, @@ -16,38 +16,66 @@ get_livertobw_score( ) } \arguments{ -\item{studyid}{Mandatory, character \cr -Studyid number} +\item{studyid}{Optional, character. \cr +Study ID for which the calculations are performed. If \code{NULL}, data for all studies in the database is used.} -\item{path_db}{Mandatory, character \cr -path of database} +\item{path_db}{Mandatory, character. 
\cr +Path to the SQLite database or directory containing \code{.xpt} files.} -\item{fake_study}{optional, Boolean \cr -whether study generated by SENDsanitizer package} +\item{fake_study}{Optional, logical. \cr +Indicates whether the study is a fake/test study generated by the \code{SENDsanitizer} package. Default is \code{FALSE}.} -\item{use_xpt_file}{optional, Boolean \cr -whether use_xpt_file is used on not} +\item{use_xpt_file}{Optional, logical. \cr +Specifies whether to use \code{.xpt} files instead of a SQLite database. Default is \code{FALSE}.} -\item{master_compiledata}{optional, Boolean \cr -whether use_xpt_file is used on not} +\item{master_compiledata}{Optional, data.frame. \cr +Precompiled dataset of study information. If \code{NULL}, the function fetches the data using \code{get_compile_data}.} -\item{bwzscore_BW}{optional, Boolean \cr -whether use_xpt_file is used on not} +\item{bwzscore_BW}{Optional, data.frame. \cr +Precomputed body weight z-scores. If \code{NULL}, they are calculated using \code{get_bw_score}.} -\item{return_individual_scores}{optional, logical \cr -whether use_xpt_file is used on not} +\item{return_individual_scores}{Optional, logical. \cr +If \code{TRUE}, returns individual z-scores averaged by study. Default is \code{FALSE}.} -\item{return_zscore_by_USUBJID}{optional, logical \cr -whether use_xpt_file is used on not} +\item{return_zscore_by_USUBJID}{Optional, logical. \cr +If \code{TRUE}, returns z-scores grouped by \code{USUBJID}. Default is \code{FALSE}.} } \value{ -dataframe +A data frame containing liver-to-body-weight z-scores: +\itemize{ +\item Averaged by study (default). +\item Individual scores averaged by study (\code{return_individual_scores = TRUE}). +\item Z-scores grouped by \code{USUBJID} (\code{return_zscore_by_USUBJID = TRUE}). +} } \description{ -get_liver_livertobw_score +This function computes liver-to-body-weight (Liver:BW) ratios and their corresponding z-scores from study data. +It supports retrieving data from SQLite databases or \code{.xpt} files and provides flexible options for output formats. 
} \examples{ \dontrun{ -get_compile_data(studyid='1234123', path_db='path/to/database.db') +# Example 1: Default averaged scores +result <- get_livertobw_score( + studyid = '1234123', + path_db = 'path/to/database.db' +) +head(result) + +# Example 2: Individual scores by study +result <- get_livertobw_score( + studyid = '1234123', + path_db = 'path/to/database.db', + return_individual_scores = TRUE +) +head(result) + +# Example 3: Z-scores by USUBJID +result <- get_livertobw_score( + studyid = '1234123', + path_db = 'path/to/database.db', + return_zscore_by_USUBJID = TRUE +) +head(result) } + } diff --git a/man/get_mi_score.Rd b/man/get_mi_score.Rd index ba12714..0a90a35 100644 --- a/man/get_mi_score.Rd +++ b/man/get_mi_score.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/get_mi_score.R \name{get_mi_score} \alias{get_mi_score} -\title{get MI score for a given studyid} +\title{Get MI score for a given studyid} \usage{ get_mi_score( studyid = NULL, @@ -16,34 +16,36 @@ get_mi_score( } \arguments{ \item{studyid}{Mandatory, character \cr -Studyid number} +The study ID number for the clinical study.} \item{path_db}{Mandatory, character \cr -path of database} +The file path to the database that contains the study data.} -\item{fake_study}{optional, Boolean \cr -whether study generated by SENDsanitizer package} +\item{fake_study}{Optional, logical \cr +If TRUE, the function assumes that the study data was generated by the SENDsanitizer package. Default is FALSE.} -\item{use_xpt_file}{Mandatory, character \cr -Studyid number} +\item{use_xpt_file}{Mandatory, logical \cr +If TRUE, indicates that an XPT file should be used instead of a database for analysis.} \item{master_compiledata}{Mandatory, character \cr -path of database} +The path to the master compile data, often used to supplement or compile data from multiple sources.} -\item{return_individual_scores}{optional, Boolean \cr -whether study generated by SENDsanitizer package} +\item{return_individual_scores}{Optional, logical \cr +If TRUE, the function returns individual MI scores for each participant. Default is FALSE.} -\item{return_zscore_by_USUBJID}{optional, Boolean \cr -whether study generated by SENDsanitizer package} +\item{return_zscore_by_USUBJID}{Optional, logical \cr +If TRUE, the function returns the Z-scores by \code{USUBJID} (subject identifier). Default is FALSE.} } \value{ -score +A numeric vector or data frame containing the MI scores. The format depends on the specified parameters, such as individual scores or aggregated scores. } \description{ -get MI score for a given studyid +This function calculates the MI score for a given study using the provided study ID and database. It allows flexibility in terms of returning individual scores, Z-scores, and more. The function is compatible with both SENDsanitizer-generated datasets and standard clinical study databases. 
} \examples{ \dontrun{ -get_mi_score(studyid='1234123', path_db='path/to/database.db') +# Example usage of get_mi_score +get_mi_score(studyid = '1234123', path_db = 'path/to/database.db') } + } diff --git a/man/get_ml_data_and_tuned_hyperparameters.Rd b/man/get_ml_data_and_tuned_hyperparameters.Rd new file mode 100644 index 0000000..20926ba --- /dev/null +++ b/man/get_ml_data_and_tuned_hyperparameters.Rd @@ -0,0 +1,67 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_ml_data_and_tuned_hyperparameters.R +\name{get_ml_data_and_tuned_hyperparameters} +\alias{get_ml_data_and_tuned_hyperparameters} +\title{Get Random Forest Data and Tuned Hyperparameters} +\usage{ +get_ml_data_and_tuned_hyperparameters( + Data, + studyid_metadata, + Impute = FALSE, + Round = FALSE, + reps, + holdback, + Undersample = FALSE, + hyperparameter_tuning = FALSE, + error_correction_method = NULL +) +} +\arguments{ +\item{Data}{data.frame. Input data frame containing scores, typically named \code{scores_df}.} + +\item{studyid_metadata}{data.frame. Metadata containing \code{STUDYID} values, used for joining with \code{Data}.} + +\item{Impute}{logical. Indicates whether to impute missing values in the dataset using random forest imputation. Default is \code{FALSE}.} + +\item{Round}{logical. Specifies whether to round specific numerical columns according to predefined rules. Default is \code{FALSE}.} + +\item{reps}{integer. Number of repetitions for cross-validation. A value of \code{0} skips repetition.} + +\item{holdback}{numeric. Fraction of data to hold back for testing. A value of \code{1} performs leave-one-out cross-validation.} + +\item{Undersample}{logical. Indicates whether to undersample the training data to balance the target classes. Default is \code{FALSE}.} + +\item{hyperparameter_tuning}{logical. Specifies whether to perform hyperparameter tuning for the random forest model. Default is \code{FALSE}.} + +\item{error_correction_method}{character. Specifies the method for error correction. Can be \code{"Flip"}, \code{"Prune"}, or \code{NULL}. Default is \code{NULL}.} +} +\value{ +A list containing: +\describe{ +\item{rfData}{The final processed data after preprocessing and error correction.} +\item{best.m}{The best \code{mtry} hyperparameter determined for the random forest model.} +} +} +\description{ +The \code{get_ml_data_and_tuned_hyperparameters} function processes input data and metadata to prepare data for +random forest analysis. It includes steps for data preprocessing, optional imputation, rounding, +error correction, and hyperparameter tuning. 
+} +\examples{ +# Example usage: +Data <- scores_df +studyid_metadata <- read.csv("path/to/study_metadata.csv") +result <- get_ml_data_and_tuned_hyperparameters( + Data = Data, + studyid_metadata = studyid_metadata, + Impute = TRUE, + Round = TRUE, + reps = 10, + holdback = 0.75, + Undersample = TRUE, + hyperparameter_tuning = TRUE, + error_correction_method = "Flip" +) +rfData <- result$rfData +best_mtry <- result$best.m +} diff --git a/man/get_prediction_plot.Rd b/man/get_prediction_plot.Rd new file mode 100644 index 0000000..ed49d38 --- /dev/null +++ b/man/get_prediction_plot.Rd @@ -0,0 +1,82 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_prediction_plot.R +\name{get_prediction_plot} +\alias{get_prediction_plot} +\title{Generate Prediction Plot for Random Forest Model} +\usage{ +get_prediction_plot( + Data = NULL, + path_db, + rat_studies = FALSE, + studyid_metadata = NULL, + fake_study = FALSE, + use_xpt_file = FALSE, + Round = FALSE, + Impute = FALSE, + reps, + holdback, + Undersample = FALSE, + hyperparameter_tuning = FALSE, + error_correction_method, + testReps +) +} +\arguments{ +\item{Data}{A data frame containing the dataset to use for training and testing. If \code{NULL}, the function will attempt to fetch and format the data from the database using \code{get_Data_formatted_for_ml_and_best.m} function.} + +\item{path_db}{A string indicating the path to the database that contains the dataset.} + +\item{rat_studies}{A logical flag indicating whether to use rat studies data. Defaults to \code{FALSE}.} + +\item{studyid_metadata}{A data frame containing metadata related to the study IDs. Defaults to \code{NULL}.} + +\item{fake_study}{A logical flag indicating whether to use fake study data. Defaults to \code{FALSE}.} + +\item{use_xpt_file}{A logical flag indicating whether to use an XPT file. Defaults to \code{FALSE}.} + +\item{Round}{A logical flag indicating whether to round the predictions. Defaults to \code{FALSE}.} + +\item{Impute}{A logical flag indicating whether to impute missing values. Defaults to \code{FALSE}.} + +\item{reps}{An integer specifying the number of repetitions for cross-validation.} + +\item{holdback}{A numeric value indicating the proportion of data to hold back for testing during cross-validation.} + +\item{Undersample}{A logical flag indicating whether to perform undersampling on the dataset to balance the classes. Defaults to \code{FALSE}.} + +\item{hyperparameter_tuning}{A logical flag indicating whether to perform hyperparameter tuning. Defaults to \code{FALSE}.} + +\item{error_correction_method}{A string specifying the error correction method to be used. Possible values are "Flip", "Prune", or "None".} + +\item{testReps}{An integer specifying the number of test repetitions for model evaluation.} +} +\value{ +A \code{ggplot} object representing the histogram of predicted probabilities for the \code{LIVER} variable across test repetitions. +} +\description{ +This function performs model building and prediction using a random forest algorithm. It iterates over multiple test repetitions, training the model on the training data and predicting on the test data. After predictions are made, a histogram plot is generated to visualize the distribution of predicted probabilities for the outcome variable (\code{LIVER}). 
+} +\details{ +The function works as follows: +\itemize{ +\item If \code{Data} is \code{NULL}, the function fetches the data and the best model configuration by calling the \code{get_Data_formatted_for_ml_and_best.m} function. +\item The dataset is divided into training and test sets for each repetition (\code{testReps}). +\item If \code{Undersample} is enabled, undersampling is applied to balance the dataset. +\item A random forest model is trained on the training data and predictions are made on the test data. +\item The predictions are averaged over the test repetitions and a histogram is plotted to visualize the distribution of predicted probabilities for \code{LIVER}. +} +} +\examples{ +# Example function call +get_prediction_plot( + path_db = "path_to_db", + rat_studies = FALSE, + reps = 10, + holdback = 0.2, + Undersample = TRUE, + hyperparameter_tuning = FALSE, + error_correction_method = "Flip", + testReps = 5 +) + +} diff --git a/man/get_random_forest_model.Rd b/man/get_random_forest_model.Rd deleted file mode 100644 index 6268746..0000000 --- a/man/get_random_forest_model.Rd +++ /dev/null @@ -1,29 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/train_random_forest_model.R -\name{get_random_forest_model} -\alias{get_random_forest_model} -\title{get_random_forest_model} -\usage{ -get_random_forest_model( - Liver_get_liver_om_lb_mi_tox_score_list, - not_Liver_get_liver_om_lb_mi_tox_score_list -) -} -\arguments{ -\item{Liver_get_liver_om_lb_mi_tox_score_list}{Mandatory, character \cr -Studyid number} - -\item{not_Liver_get_liver_om_lb_mi_tox_score_list}{Mandatory, character \cr -path of database} -} -\value{ -score -} -\description{ -get_random_forest_model -} -\examples{ -\dontrun{ -get_liver_lb_score(studyid='1234123', database_path = dbtoken) -} -} diff --git a/man/get_repeat_dose_parallel_studyids.Rd b/man/get_repeat_dose_parallel_studyids.Rd new file mode 100644 index 0000000..add15ec --- /dev/null +++ b/man/get_repeat_dose_parallel_studyids.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_repeat_dose_parallel_studyids.R +\name{get_repeat_dose_parallel_studyids} +\alias{get_repeat_dose_parallel_studyids} +\title{Get Repeat Dose Parallel Study IDs} +\usage{ +get_repeat_dose_parallel_studyids(path_db, rat_studies = FALSE) +} +\arguments{ +\item{path_db}{A character string representing the file path to the SQLite database. This is a required parameter.} + +\item{rat_studies}{A logical flag indicating whether to filter the studies for rats only. Defaults to \code{FALSE}.} +} +\value{ +A vector of study IDs that meet the specified criteria. This includes: +\itemize{ +\item Study IDs that match both the parallel design and repeat-dose toxicity criteria. +\item Optionally, study IDs that match rat species if \code{rat_studies = TRUE}. +} +} +\description{ +This function retrieves study IDs from a database that correspond to parallel-design studies involving repeat-dose toxicity. +It optionally filters the studies for rat species. 
+} +\examples{ +\dontrun{ + # Example without filtering for rat studies + study_ids <- get_repeat_dose_parallel_studyids(path_db = "path/to/database.sqlite") + + # Example with filtering for rat studies + study_ids_rats <- get_repeat_dose_parallel_studyids(path_db = "path/to/database.sqlite", rat_studies = TRUE) +} + +} diff --git a/man/get_reprtree_from_rf_model.Rd b/man/get_reprtree_from_rf_model.Rd new file mode 100644 index 0000000..bd477cf --- /dev/null +++ b/man/get_reprtree_from_rf_model.Rd @@ -0,0 +1,82 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_reprtree_from_rf_model .R +\name{get_reprtree_from_rf_model} +\alias{get_reprtree_from_rf_model} +\title{Get Representation Tree from Random Forest Model} +\usage{ +get_reprtree_from_rf_model( + Data = NULL, + path_db, + rat_studies = FALSE, + studyid_metadata = NULL, + fake_study = FALSE, + use_xpt_file = FALSE, + Round = FALSE, + Impute = FALSE, + reps, + holdback, + Undersample = FALSE, + hyperparameter_tuning = FALSE, + error_correction_method +) +} +\arguments{ +\item{Data}{A data frame containing the dataset to train the Random Forest model. If \code{NULL}, data is fetched using the \code{get_Data_formatted_for_ml_and_best.m} function.} + +\item{path_db}{A character string representing the path to the database used for fetching or processing the data.} + +\item{rat_studies}{A logical flag indicating whether rat studies are used (default: \code{FALSE}).} + +\item{studyid_metadata}{A data frame containing metadata related to study IDs (default: \code{NULL}).} + +\item{fake_study}{A logical flag indicating whether to use fake study data (default: \code{FALSE}).} + +\item{use_xpt_file}{A logical flag indicating whether to use the XPT file format for data input (default: \code{FALSE}).} + +\item{Round}{A logical flag indicating whether to round the data before processing (default: \code{FALSE}).} + +\item{Impute}{A logical flag indicating whether to impute missing values in the data (default: \code{FALSE}).} + +\item{reps}{An integer specifying the number of repetitions to perform for cross-validation or resampling.} + +\item{holdback}{A numeric value representing the fraction of data to hold back for testing.} + +\item{Undersample}{A logical flag indicating whether undersampling should be applied to balance the dataset (default: \code{FALSE}).} + +\item{hyperparameter_tuning}{A logical flag indicating whether hyperparameter tuning should be performed (default: \code{FALSE}).} + +\item{error_correction_method}{A character string specifying the method for error correction. Must be one of \code{'Flip'}, \code{'Prune'}, or \code{'None'}.} +} +\value{ +A plot of the first tree from the Random Forest model is displayed. The function does not return the ReprTree object explicitly, but it is generated and used for plotting. +} +\description{ +This function trains a Random Forest model on a provided dataset and generates a representation tree (ReprTree) from the trained model. It supports various preprocessing configurations, model hyperparameters, and sampling strategies, including random undersampling. The function also allows for error correction and hyperparameter tuning. +} +\details{ +The function performs the following steps: +\enumerate{ +\item \strong{Data Preparation}: If \code{Data} is \code{NULL}, it is fetched using the \code{get_Data_formatted_for_ml_and_best.m} function. Data is then split into training (70\%) and testing (30\%) sets. 
If \code{Undersample} is \code{TRUE}, the training data is balanced using undersampling. +\item \strong{Model Training}: A Random Forest model is trained using the \code{randomForest::randomForest} function. The target variable is \code{Target_Organ}, and the model uses the best hyperparameter (\code{best.m}). The number of trees is set to 500. +\item \strong{ReprTree Generation}: The \code{reprtree::ReprTree} function is used to generate the representation tree from the trained Random Forest model. +\item \strong{Visualization}: The first tree from the Random Forest model is plotted using the \code{reprtree::plot.getTree} function. +} +} +\examples{ +get_reprtree_from_rf_model( + Data = my_data, + path_db = "path/to/database", + rat_studies = TRUE, + studyid_metadata = my_metadata, + fake_study = FALSE, + use_xpt_file = TRUE, + Round = TRUE, + Impute = TRUE, + reps = 5, + holdback = 0.3, + Undersample = TRUE, + hyperparameter_tuning = FALSE, + error_correction_method = "Flip" +) + +} diff --git a/man/get_rf_input_param_list_output_cv_imp.Rd b/man/get_rf_input_param_list_output_cv_imp.Rd new file mode 100644 index 0000000..313fac4 --- /dev/null +++ b/man/get_rf_input_param_list_output_cv_imp.Rd @@ -0,0 +1,105 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_rf_input_param_list_output_cv_imp.R +\name{get_rf_input_param_list_output_cv_imp} +\alias{get_rf_input_param_list_output_cv_imp} +\title{Prepare and Evaluate Random Forest Model with Cross-Validation and Feature Importance} +\usage{ +get_rf_input_param_list_output_cv_imp( + path_db, + rat_studies = FALSE, + studyid_metadata, + fake_study = FALSE, + use_xpt_file = FALSE, + Round = FALSE, + Impute = FALSE, + reps, + holdback, + Undersample = FALSE, + hyperparameter_tuning = FALSE, + error_correction_method, + best.m = NULL, + testReps, + indeterminateUpper, + indeterminateLower, + Type, + nTopImportance +) +} +\arguments{ +\item{path_db}{A character string specifying the path to the SQLite database or directory containing the XPT file.} + +\item{rat_studies}{A logical value indicating whether to filter for rat studies. Default is \code{FALSE}.} + +\item{studyid_metadata}{A data frame containing metadata for the studies.} + +\item{fake_study}{A logical value indicating whether to use fake study data. Default is \code{FALSE}.} + +\item{use_xpt_file}{A logical value indicating whether to use XPT file data. Default is \code{FALSE}.} + +\item{Round}{A logical value indicating whether to round the liver scores. Default is \code{FALSE}.} + +\item{Impute}{A logical value indicating whether to impute missing values. Default is \code{FALSE}.} + +\item{reps}{An integer specifying the number of repetitions for model evaluation.} + +\item{holdback}{A numeric value specifying the proportion of data to hold back for validation.} + +\item{Undersample}{A logical value indicating whether to undersample the data to balance classes. Default is \code{FALSE}.} + +\item{hyperparameter_tuning}{A logical value indicating whether to tune the Random Forest model's hyperparameters. Default is \code{FALSE}.} + +\item{error_correction_method}{A character string specifying the error correction method. Options are 'Flip', 'Prune', or 'None'.} + +\item{best.m}{A numeric value specifying the number of trees in the Random Forest model. 
If \code{NULL}, the function determines this automatically.} + +\item{testReps}{An integer specifying the number of test repetitions for model evaluation.} + +\item{indeterminateUpper}{A numeric value for the upper threshold of indeterminate predictions.} + +\item{indeterminateLower}{A numeric value for the lower threshold of indeterminate predictions.} + +\item{Type}{A character string specifying the type of Random Forest model to use. Options include 'classification' or 'regression'.} + +\item{nTopImportance}{An integer specifying the number of top important features to consider for the model.} +} +\value{ +A list containing the trained Random Forest model, cross-validation results, and feature importance scores. +The list is returned by the \code{get_rf_model_with_cv} function. +} +\description{ +This function prepares the data for training a Random Forest (RF) model with cross-validation, handles imputation, hyperparameter tuning, and evaluates the model's performance. It supports both real and fake study data, with options for rat studies, error correction, and feature importance selection. +} +\details{ +The function performs the following steps: +\itemize{ +\item Fetches the study data based on the specified parameters. +\item Calculates liver scores and harmonizes the data. +\item Prepares data for machine learning, including imputation and optional hyperparameter tuning. +\item Trains and evaluates the Random Forest model with cross-validation. +\item Applies error correction (if specified) and selects the most important features. +} +} +\examples{ +# Example usage of the function +result <- get_rf_input_param_list_output_cv_imp( + path_db = "path/to/database", + rat_studies = TRUE, + studyid_metadata = metadata_df, + fake_study = FALSE, + use_xpt_file = FALSE, + Round = TRUE, + Impute = TRUE, + reps = 10, + holdback = 0.2, + Undersample = TRUE, + hyperparameter_tuning = TRUE, + error_correction_method = "Flip", + best.m = NULL, + testReps = 5, + indeterminateUpper = 0.9, + indeterminateLower = 0.1, + Type = "classification", + nTopImportance = 10 +) + +} diff --git a/man/get_rf_model_output_cv_imp.Rd b/man/get_rf_model_output_cv_imp.Rd new file mode 100644 index 0000000..4bc5263 --- /dev/null +++ b/man/get_rf_model_output_cv_imp.Rd @@ -0,0 +1,75 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_zone_exclusioned_rf_model_cv_imp.R +\name{get_rf_model_output_cv_imp} +\alias{get_rf_model_output_cv_imp} +\title{Perform Cross-Validation with Random Forest and Feature Importance Calculation} +\usage{ +get_rf_model_output_cv_imp( + scores_df = NULL, + Undersample = FALSE, + best.m = NULL, + testReps, + indeterminateUpper, + indeterminateLower, + Type, + nTopImportance +) +} +\arguments{ +\item{scores_df}{A data frame containing the features and target variable for training and testing the model.} + +\item{Undersample}{A logical flag indicating whether to apply undersampling to the training data. Defaults to \code{FALSE}.} + +\item{best.m}{A numeric value representing the number of features to sample for the Random Forest model, or \code{NULL} to calculate it automatically.} + +\item{testReps}{An integer specifying the number of repetitions for cross-validation. 
Must be at least 2.} + +\item{indeterminateUpper}{A numeric threshold above which predictions are not considered indeterminate.} + +\item{indeterminateLower}{A numeric threshold below which predictions are not considered indeterminate.} + +\item{Type}{An integer specifying the type of importance to compute. \code{1} for MeanDecreaseAccuracy, \code{2} for MeanDecreaseGini.} + +\item{nTopImportance}{An integer specifying the number of top features to display based on their importance scores.} +} +\value{ +A list with the following elements: +\describe{ +\item{performance_metrics}{A vector of aggregated performance metrics (e.g., sensitivity, specificity, accuracy, etc.).} +\item{feature_importance}{A matrix containing the importance of the top \code{nTopImportance} features, ordered by their importance score.} +\item{raw_results}{A list containing raw results for debugging or further analysis, including sensitivity, specificity, accuracy, and Gini scores across all test repetitions.} +} +} +\description{ +This function performs cross-validation on a Random Forest model, tracks +performance metrics (such as sensitivity, specificity, accuracy), handles +indeterminate predictions, and computes feature importance based on either +Gini or Accuracy. The function returns performance summaries and feature +importance rankings after a specified number of test repetitions. +} +\details{ +The function splits the input data into training and testing sets based on the specified number of test repetitions (\code{testReps}). +During each iteration, it trains a Random Forest model and makes predictions on the test data. Indeterminate predictions are handled +by marking them as \code{NA}. The function tracks performance metrics such as sensitivity, specificity, and accuracy, and computes the +top \code{nTopImportance} features based on either Mean Decrease Accuracy or Mean Decrease Gini. +} +\examples{ +# Example usage of the function +result <- get_rf_model_output_cv_imp( + scores_df = your_data, + Undersample = FALSE, + best.m = 3, + testReps = 5, + indeterminateUpper = 0.8, + indeterminateLower = 0.2, + Type = 1, + nTopImportance = 10 +) + +# View performance metrics +print(result$performance_metrics) + +# View top features by importance +print(result$feature_importance) + +} diff --git a/man/get_rf_model_with_cv.Rd b/man/get_rf_model_with_cv.Rd new file mode 100644 index 0000000..65bf759 --- /dev/null +++ b/man/get_rf_model_with_cv.Rd @@ -0,0 +1,62 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_rf_model_with_cv.R +\name{get_rf_model_with_cv} +\alias{get_rf_model_with_cv} +\title{Random Forest with Cross-Validation} +\usage{ +get_rf_model_with_cv(Data, Undersample = FALSE, best.m = NULL, testReps, Type) +} +\arguments{ +\item{Data}{Mandatory, data frame +The input dataset, which must include a column named \code{Target_Organ} as the response variable.} + +\item{Undersample}{Optional, logical +If \code{TRUE}, balances the dataset by undersampling the majority class. Default is \code{FALSE}.} + +\item{best.m}{Optional, numeric or \code{NULL} +Specifies the number of predictors sampled at each split. If \code{NULL}, the default value of \code{randomForest} is used.} + +\item{testReps}{Mandatory, integer +The number of cross-validation repetitions. 
Must be at least 2.} + +\item{Type}{Mandatory, numeric +Specifies the importance metric type: \code{1} for Mean Decrease Accuracy or \code{2} for Gini.} +} +\value{ +A list with the following elements: +\itemize{ +\item \code{performance_metrics}: A vector of aggregated performance metrics, including sensitivity, specificity, and accuracy. +\item \code{raw_results}: A list containing raw sensitivity, specificity, and accuracy values for each cross-validation fold. +} +} +\description{ +This function builds a random forest model using the \code{randomForest} package, evaluates it through cross-validation, +and computes performance metrics such as sensitivity, specificity, and accuracy. +It optionally applies undersampling to handle class imbalance and supports custom settings for the number of predictors sampled at each split. +} +\details{ +This function splits the input data into training and testing subsets based on the specified \code{testReps} cross-validation folds. +If undersampling is enabled, the function balances the training set to reduce class imbalance. +A random forest model is trained on the training set, and predictions are evaluated on the test set. The results are aggregated to provide summary performance metrics. +} +\examples{ +# Load necessary libraries +library(randomForest) +library(caret) + +# Example dataset +data(iris) +iris$Target_Organ <- ifelse(iris$Species == "setosa", 1, 0) +iris <- iris[, -5] # Remove Species column + +# Run the function +results <- get_rf_model_with_cv(Data = iris, + Undersample = TRUE, + best.m = 2, + testReps = 5, + Type = 2) + +# Print results +print(results$performance_metrics) + +} diff --git a/man/get_zone_exclusioned_rf_model_with_cv.Rd b/man/get_zone_exclusioned_rf_model_with_cv.Rd new file mode 100644 index 0000000..ba84b3e --- /dev/null +++ b/man/get_zone_exclusioned_rf_model_with_cv.Rd @@ -0,0 +1,81 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_zone_exclusioned_rf_model_with_cv.R +\name{get_zone_exclusioned_rf_model_with_cv} +\alias{get_zone_exclusioned_rf_model_with_cv} +\title{Random Forest Model with Cross-validation and Exclusion} +\usage{ +get_zone_exclusioned_rf_model_with_cv( + Data = NULL, + Undersample = FALSE, + best.m = NULL, + testReps, + indeterminateUpper, + indeterminateLower, + Type +) +} +\arguments{ +\item{Data}{A data frame containing the features and the target variable \code{Target_Organ} +to train the Random Forest model on.} + +\item{Undersample}{A logical value indicating whether to perform undersampling to +balance the classes in the training data. Defaults to \code{FALSE}.} + +\item{best.m}{A numeric value representing the best number of variables (\code{mytry}) +to use at each split in the Random Forest model. This can be manually set or +determined through optimization.} + +\item{testReps}{An integer specifying the number of test repetitions. This must +be at least 2, as the function relies on multiple test sets to assess the model performance.} + +\item{indeterminateUpper}{A numeric value indicating the upper bound for the +predicted probability to consider a prediction indeterminate. Predictions with +probabilities within this range are marked as indeterminate.} + +\item{indeterminateLower}{A numeric value indicating the lower bound for the +predicted probability to consider a prediction indeterminate. 
Predictions with +probabilities within this range are marked as indeterminate.} + +\item{Type}{An integer indicating the type of feature importance to use in the +Random Forest model. Typically, \code{1} for "Mean Decrease Accuracy" or \code{2} for "Mean Decrease Gini".} +} +\value{ +A list containing two components: +\describe{ +\item{performance_metrics}{A vector with the aggregated performance metrics, +including sensitivity, specificity, accuracy, and others, calculated across +all test repetitions.} +\item{raw_results}{A list containing the raw performance metrics for each repetition, +including sensitivity, specificity, and accuracy.} +} +} +\description{ +This function implements a Random Forest classification model +with cross-validation and allows for undersampling, handling indeterminate +predictions, and calculating various model performance metrics such as +sensitivity, specificity, and accuracy. It tracks the proportion of indeterminate +predictions and provides an aggregated performance summary across multiple test repetitions. +} +\examples{ +\dontrun{ +# Example usage +Data <- your_data_frame # Replace with actual dataset +results <- get_zone_exclusioned_rf_model_with_cv(Data = Data, + Undersample = TRUE, + best.m = 5, + testReps = 10, + indeterminateUpper = 0.8, + indeterminateLower = 0.2, + Type = 1) + +# View the aggregated performance metrics +print(results$performance_metrics) + +# Access raw results for further analysis +print(results$raw_results) +} + +} +\seealso{ +\link[randomForest]{randomForest}, \link[caret]{confusionMatrix} +} diff --git a/man/predicted_random_forest_model.Rd b/man/predicted_random_forest_model.Rd deleted file mode 100644 index d37a49c..0000000 --- a/man/predicted_random_forest_model.Rd +++ /dev/null @@ -1,36 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/generate_rf_predictions.R, -% R/predicted_random_forest_model.R -\name{predicted_random_forest_model} -\alias{predicted_random_forest_model} -\title{predicted_random_forest_model} -\usage{ -predicted_random_forest_model(dbPath_liver, dbPath_not_liver) - -predicted_random_forest_model(dbPath_liver, dbPath_not_liver) -} -\arguments{ -\item{dbPath_liver}{Mandatory, character \cr -Studyid number} - -\item{dbPath_not_liver}{Mandatory, character \cr -path of database} -} -\value{ -random forst model - -random forst model -} -\description{ -predicted_random_forest_model - -predicted_random_forest_model -} -\examples{ -\dontrun{ -get_mi_score(studyid='1234123', path_db='path/to/database.db') -} -\dontrun{ -get_mi_score(studyid='1234123', path_db='path/to/database.db') -} -} diff --git a/vignettes/get_Data_formatted_for_ml_and_best.m.Rmd b/vignettes/get_Data_formatted_for_ml_and_best.m.Rmd new file mode 100644 index 0000000..6d49073 --- /dev/null +++ b/vignettes/get_Data_formatted_for_ml_and_best.m.Rmd @@ -0,0 +1,69 @@ +--- +title: "Documentation for `get_Data_formatted_for_ml_and_best.m` Function" +author: "Your Name" +output: html_document +--- + +## Purpose +The function `get_Data_formatted_for_ml_and_best.m` is designed to retrieve and preprocess data for machine learning (ML) models from a given SQLite database or XPT file. It performs several tasks such as fetching study IDs, retrieving study metadata, calculating liver toxicity scores, and tuning hyperparameters for ML models. The final output is a list containing processed data ready for machine learning and the best model. 
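+
+Before walking through the parameters, a brief illustrative sketch of the two input modes mentioned above (an SQLite database versus a folder of XPT files) may help; the paths and the `reps`/`holdback` values are placeholders, and the full worked example appears in the "Example Usage" section below.
+
+```r
+# Hypothetical paths; numeric settings are arbitrary choices for illustration only.
+# SQLite database as the data source
+res_db <- get_Data_formatted_for_ml_and_best.m(
+  path_db = "path/to/database.db",
+  rat_studies = TRUE,
+  reps = 5,
+  holdback = 0.2,
+  error_correction_method = "None"
+)
+
+# Folder of XPT files as the data source
+res_xpt <- get_Data_formatted_for_ml_and_best.m(
+  path_db = "path/to/xpt_folder",
+  use_xpt_file = TRUE,
+  reps = 5,
+  holdback = 0.2,
+  error_correction_method = "None"
+)
+```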
+ +## Input Parameters + +| Parameter | Description | Type | Default Value | +|--------------------------|-----------------------------------------------------------------------------|-------------|-----------------| +| `path_db` | Path to the SQLite database or XPT file location. | character | None | +| `rat_studies` | Flag to filter for rat studies. | logical | FALSE | +| `studyid_metadata` | Optional metadata for study IDs. If NULL, will be generated. | data.frame | NULL | +| `fake_study` | Flag to use fake study data. | logical | FALSE | +| `use_xpt_file` | Flag to indicate whether to use an XPT file instead of SQLite database. | logical | FALSE | +| `Round` | Flag to round liver toxicity scores. | logical | FALSE | +| `Impute` | Flag to impute missing values in the dataset. | logical | FALSE | +| `reps` | Number of repetitions for cross-validation. | integer | None | +| `holdback` | Fraction of data to hold back for validation. | numeric | None | +| `Undersample` | Flag to undersample the majority class. | logical | FALSE | +| `hyperparameter_tuning` | Flag to perform hyperparameter tuning for the model. | logical | FALSE | +| `error_correction_method`| Method to handle error correction. Must be one of 'Flip', 'Prune', or 'None'.| character | None | + +## Output + +The function returns a list with the following elements: + +- `Data`: A data frame containing the preprocessed data ready for machine learning. +- `best.m`: The best machine learning model after hyperparameter tuning, if applicable. + +## Key Steps + +1. **Fetch Study IDs**: + - If `use_xpt_file` is `TRUE`, it retrieves study IDs from directories within the specified path. + - If `use_xpt_file` is `FALSE` and `fake_study` is `TRUE`, the function connects to an SQLite database and retrieves the study IDs from the 'dm' table. + - If `fake_study` is `FALSE`, it fetches repeat-dose and parallel study IDs from the database. + +2. **Process Study Metadata**: + - If `studyid_metadata` is not provided, it generates metadata by selecting unique study IDs and assigning random "Target_Organ" values (either "Liver" or "not_Liver"). + +3. **Calculate Liver Toxicity Scores**: + - The function calculates liver toxicity scores using the `get_liver_om_lb_mi_tox_score_list` function. + +4. **Harmonize Scores**: + - The calculated liver toxicity scores are harmonized using the `get_col_harmonized_scores_df` function, optionally rounding them based on the `Round` parameter. + +5. **Machine Learning Data Preparation**: + - The function prepares the data for machine learning and performs hyperparameter tuning (if `hyperparameter_tuning` is `TRUE`) using the `get_ml_data_and_tuned_hyperparameters` function. + +6. **Return Processed Data and Best Model**: + - The final output consists of the processed data and the best machine learning model (`best.m`). 
+ +## Example Usage + +```r +result <- get_Data_formatted_for_ml_and_best.m( + path_db = "path/to/database.db", + rat_studies = TRUE, + reps = 5, + holdback = 0.2, + error_correction_method = "Flip" +) + +# Access the processed data and the best model +processed_data <- result$Data +best_model <- result$best.m diff --git a/vignettes/get_auc_curve_with_rf_model.Rmd b/vignettes/get_auc_curve_with_rf_model.Rmd new file mode 100644 index 0000000..9744690 --- /dev/null +++ b/vignettes/get_auc_curve_with_rf_model.Rmd @@ -0,0 +1,63 @@ +--- +title: "Documentation for get_auc_curve_with_rf_model" +output: html_document +--- + +# Function: `get_auc_curve_with_rf_model` + +## Purpose + +The function `get_auc_curve_with_rf_model` is designed to train a Random Forest model using a provided dataset, optionally from an SQLite database. It computes and visualizes the ROC curve along with the AUC (Area Under the Curve) metric. The function offers various options for handling data preprocessing, including hyperparameter tuning, imputation, and undersampling, and outputs the model performance via the ROC curve. + +## Input Parameters + +The function accepts the following parameters: + +| Parameter | Type | Description | +|----------------------------------|----------|-------------| +| `Data` | data.frame | Input data frame for training. If `NULL`, the function fetches data from the database. | +| `path_db` | string | Path to the SQLite database. Used to fetch study data if `Data` is `NULL`. | +| `rat_studies` | logical | Whether to filter for rat studies (default is `FALSE`). | +| `studyid_metadata` | data.frame | Metadata associated with study IDs. | +| `fake_study` | logical | Whether to use fake study IDs for data simulation (default is `FALSE`). | +| `use_xpt_file` | logical | Whether to use an XPT file for input data (default is `FALSE`). | +| `Round` | logical | Whether to round numerical values (default is `FALSE`). | +| `Impute` | logical | Whether to perform imputation on missing values (default is `FALSE`). | +| `best.m` | numeric | The 'mtry' hyperparameter for Random Forest (optional). | +| `reps` | numeric | Number of repetitions for cross-validation (numeric value). | +| `holdback` | numeric | Fraction value (e.g., 0.75) for holdback during cross-validation. | +| `Undersample` | logical | Whether to perform undersampling (default is `FALSE`). | +| `hyperparameter_tuning` | logical | Whether to perform hyperparameter tuning (default is `FALSE`). | +| `error_correction_method` | string | Method for error correction: "Flip", "Prune", or "None". | +| `output_individual_scores` | logical | Whether to output individual scores (default is `TRUE`). | +| `output_zscore_by_USUBJID` | logical | Whether to output z-scores by subject ID (default is `FALSE`). | + +## Output + +The function does not return any explicit values. However, it generates the following outputs: + +1. **AUC Value**: The AUC of the ROC curve is printed to the console. +2. **ROC Curve Plot**: A ROC curve is displayed, showing the model's performance with the computed AUC value. +3. **Performance Metrics**: Other performance metrics (e.g., True Positive Rate, False Positive Rate) are computed but not returned directly. + +## Key Steps + +1. **Data Generation or Fetching**: + - If `Data` is not provided, the function fetches the data either from the SQLite database or generates synthetic data (if `fake_study` is `TRUE`). + - If `use_xpt_file` is `TRUE`, it fetches data from the specified XPT files. + +2. 
**Data Preprocessing**: + - The function performs data preprocessing, including imputation (if `Impute` is `TRUE`), rounding (if `Round` is `TRUE`), and undersampling (if `Undersample` is `TRUE`). + - It harmonizes the liver scores and prepares the data for machine learning. + +3. **Model Training**: + - The function then prepares the data for Random Forest (RF) modeling, tuning hyperparameters if `hyperparameter_tuning` is enabled. + - A Random Forest model is trained using the prepared data, and predictions are generated. + +4. **AUC Calculation and Plotting**: + - The model's performance is evaluated by computing the AUC (Area Under the Curve) and plotting the ROC curve. + - The AUC is printed to the console, and the ROC curve is displayed with the calculated AUC value. + +5. **Error Correction and Hyperparameter Tuning**: + - If specified, the function applies an error correction method (`error_correction_method`) and performs hyperparameter tuning to optimize the model. + diff --git a/vignettes/get_histogram_barplot.Rmd b/vignettes/get_histogram_barplot.Rmd new file mode 100644 index 0000000..a9d0ad9 --- /dev/null +++ b/vignettes/get_histogram_barplot.Rmd @@ -0,0 +1,57 @@ +--- +title: "Documentation for `get_histogram_barplot` function" +output: html_document +--- + +## Purpose + +The `get_histogram_barplot` function is designed to generate a bar plot displaying liver-related scores, based on data either provided directly or fetched from an SQLite database. It calculates mean values for specific findings, compares liver-related and non-liver-related groups, and produces either a plot or a processed data frame depending on the function’s parameters. + +## Input Parameters + +The function accepts the following parameters: + +| **Parameter** | **Type** | **Description** | +|----------------------------------|--------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------| +| `Data` | `data.frame`, optional | A data frame containing liver-related scores. If `NULL`, the function will attempt to generate or fetch data from a database or file. | +| `generateBarPlot` | `logical`, default = `FALSE` | Flag indicating whether to generate a bar plot. If `TRUE`, a bar plot is generated; otherwise, the function returns a data frame. | +| `path_db` | `character`, optional | Path to the SQLite database if data needs to be fetched from it. Required if `use_xpt_file` is `FALSE` or `fake_study` is `FALSE`. | +| `rat_studies` | `logical`, default = `FALSE` | Flag to specify whether to filter for rat studies when fetching data from the database. | +| `studyid_metadata` | `data.frame`, required | Metadata associated with the study IDs. Needed when `fake_study` is `FALSE` and real data is fetched. | +| `fake_study` | `logical`, default = `FALSE` | If `TRUE`, the function simulates study data instead of fetching it from a database. | +| `use_xpt_file` | `logical`, default = `FALSE` | If `TRUE`, the function will use an XPT file to fetch data, instead of relying on the database. | +| `Round` | `logical`, default = `FALSE` | Whether to round the liver scores. If `TRUE`, scores are rounded. | +| `output_individual_scores` | `logical`, default = `TRUE` | Whether to output individual scores or aggregated ones. | +| `output_zscore_by_USUBJID` | `logical`, default = `FALSE` | Whether to output z-scores by USUBJID (unique subject identifier). 
| + +## Output + +- If `generateBarPlot = TRUE`: The function returns a `ggplot2` bar plot object displaying the average scores for liver-related findings versus non-liver-related findings. + +- If `generateBarPlot = FALSE`: The function returns a `data.frame` (`plotData`) containing the calculated values for each finding, with columns for the finding, liver status (`LIVER`), and mean values (`Value`). + +## Key Steps + +1. **Data Generation/Fallback**: + - If no data is provided, the function attempts to fetch the data from an SQLite database or use a fake study dataset. + - It fetches study data from the `dm` domain of the database if `fake_study = FALSE`. The study IDs are then extracted, filtered for liver-related studies, and used for subsequent score calculations. + +2. **Data Harmonization**: + - The `get_liver_om_lb_mi_tox_score_list` function calculates liver scores for the provided study IDs. + - The resulting data is harmonized using `get_col_harmonized_scores_df` to ensure consistency in the output data frame. + +3. **Plot Generation**: + - If `generateBarPlot = TRUE`, the function iterates over the findings and computes the average liver-related score (`Liver` status) for each finding. + - It then generates a `ggplot2` bar plot with the findings on the x-axis, the average values on the y-axis, and distinct colors representing liver vs. non-liver status. + +4. **Error Handling**: + - The function checks whether the `Data` parameter is a valid data frame. If not, an error is thrown. + +## Example Usage + +```r +# Example with fake study data, generating a bar plot +get_histogram_barplot(generateBarPlot = TRUE, fake_study = TRUE) + +# Example with real study data, without generating a plot +data <- get_histogram_barplot(generateBarPlot = FALSE, fake_study = FALSE, path_db = "path/to/db") diff --git a/vignettes/get_imp_features_from_rf_model_with_cv.Rmd b/vignettes/get_imp_features_from_rf_model_with_cv.Rmd new file mode 100644 index 0000000..d6c7f40 --- /dev/null +++ b/vignettes/get_imp_features_from_rf_model_with_cv.Rmd @@ -0,0 +1,56 @@ +--- +title: "Function Documentation: get_imp_features_from_rf_model_with_cv" +author: "Your Name" +date: "`r Sys.Date()`" +output: html_document +--- + +## Purpose + +The `get_imp_features_from_rf_model_with_cv` function performs cross-validation with test repetitions on a random forest model, calculates feature importance using Gini importance, and returns the top `n` important features. It is primarily used for evaluating feature importance in classification tasks by utilizing Random Forest with optional under-sampling and custom test repetitions. + +## Input Parameters + +The function accepts the following parameters: + +- **`Data`**: A data frame containing the training data (typically with rows as samples and columns as features). The first column is assumed to be the target variable. +- **`Undersample`**: A logical value (`TRUE` or `FALSE`) indicating whether to apply under-sampling to balance the classes in the training data. Default is `FALSE`. +- **`best.m`**: A numeric value representing the number of variables to be considered at each split of the Random Forest model (or a function to determine this). Default is `NULL`. +- **`testReps`**: A numeric value indicating the number of test repetitions (must be at least 2). +- **`Type`**: A numeric value indicating the type of importance to be calculated. `1` for Mean Decrease Accuracy and `2` for Mean Decrease Gini. 
+- **`nTopImportance`**: A numeric value indicating the number of top important features to return based on their importance scores. + +## Output + +The function returns a list containing: + +- **`gini_scores`**: A matrix of Gini importance scores for each feature across the different cross-validation iterations. The matrix has rows representing features and columns representing test iterations. + +## Key Steps + +1. **Initialize Metrics**: The function starts by defining several empty vectors to track performance metrics like Sensitivity, Specificity, PPV, NPV, and others, which are initialized but not used in the current version. + +2. **Prepare Data**: The function prepares the data by renaming the columns of the input `Data` for consistency and initializing a new data frame (`rfTestData`) to store prediction results across iterations. + +3. **Cross-Validation Setup**: The function sets up a cross-validation loop with test repetitions. For each repetition, it selects a random subset of data to test and uses the rest for training. Optionally, under-sampling can be applied to balance the dataset. + +4. **Model Training**: A Random Forest model is trained on the training data in each iteration using the `randomForest` package. It uses the specified value for `best.m` to control the number of variables considered at each split. + +5. **Calculate Gini Importance**: After training the model, Gini importance scores are calculated for each feature using the `randomForest::importance` function. The Gini scores are aggregated across all test repetitions. + +6. **Aggregate and Sort Importance Scores**: After completing the cross-validation iterations, the mean Gini importance scores for each feature are calculated and sorted in decreasing order. + +7. **Plot Feature Importance**: A dotchart is generated to visualize the top `nTopImportance` features based on their importance scores. + +8. **Return Results**: The function returns a list containing the Gini importance scores across all iterations. + +```r +# Example of how to call the function +result <- get_imp_features_from_rf_model_with_cv( + Data = scores_df, + Undersample = FALSE, + best.m = 3, + testReps = 5, + Type = 2, + nTopImportance = 10 +) diff --git a/vignettes/get_prediction_plot.Rmd b/vignettes/get_prediction_plot.Rmd new file mode 100644 index 0000000..dd628dd --- /dev/null +++ b/vignettes/get_prediction_plot.Rmd @@ -0,0 +1,70 @@ +--- +title: "Documentation for get_prediction_plot Function" +output: html_document +--- + +## Function Purpose + +The `get_prediction_plot` function performs model building and prediction for a dataset using a random forest model. It iterates over multiple test repetitions, trains the model on the training data, and makes predictions on the test data. The function then generates a histogram to visualize the distribution of predictions for the outcome variable (`LIVER`). + +## Input Parameters + +The function accepts the following input parameters: + +| Parameter | Description | Type | +|------------------------------|-------------------------------------------------------------------------------------------------|---------| +| `Data` | The dataset to use for training and testing. If `NULL`, it will be fetched using the `get_Data_formatted_for_ml_and_best.m` function. | DataFrame (optional) | +| `path_db` | The path to the database that contains the dataset. | String | +| `rat_studies` | A flag indicating whether to use rat studies data. Default is `FALSE`. 
| Boolean | +| `studyid_metadata` | Metadata related to the study IDs. Default is `NULL`. | DataFrame (optional) | +| `fake_study` | A flag indicating whether to use fake study data. Default is `FALSE`. | Boolean | +| `use_xpt_file` | A flag indicating whether to use an XPT file. Default is `FALSE`. | Boolean | +| `Round` | A flag indicating whether to round the predictions. Default is `FALSE`. | Boolean | +| `Impute` | A flag indicating whether to impute missing values. Default is `FALSE`. | Boolean | +| `reps` | The number of repetitions for the cross-validation process. | Integer | +| `holdback` | The proportion of data to hold back for testing during cross-validation. | Numeric | +| `Undersample` | A flag indicating whether to perform undersampling on the dataset. Default is `FALSE`. | Boolean | +| `hyperparameter_tuning` | A flag indicating whether to perform hyperparameter tuning. Default is `FALSE`. | Boolean | +| `error_correction_method` | The method to use for error correction (e.g., "Flip", "Prune", or "None"). | String | +| `testReps` | The number of test repetitions for model evaluation. | Integer | + +## Output + +The function returns a histogram plot visualizing the predicted probabilities for the `LIVER` variable across test repetitions. The plot shows the distribution of predictions (probabilities) for both classes (LIVER = "Y" or "N"). + +## Key Steps + +1. **Data Preparation**: + - If `Data` is `NULL`, the function fetches and formats the data using the `get_Data_formatted_for_ml_and_best.m` function. + +2. **Cross-Validation**: + - The dataset is divided into training and testing sets for each repetition (`testReps`). + - If `Undersample` is enabled, undersampling is applied to balance the dataset. + +3. **Model Training**: + - A random forest model is trained using the training set for each repetition. + +4. **Prediction**: + - The model makes predictions on the test set. + - The predicted probabilities are stored for each repetition. + +5. **Result Visualization**: + - The predictions are averaged across repetitions, and a histogram is created to visualize the distribution of the predicted probabilities for the `LIVER` variable. + +6. **Plot**: + - The histogram is displayed using `ggplot2`, showing the predicted probabilities for the `LIVER` outcome (coded as "Y" or "N"). + +## Example Usage + +```r +# Example function call +get_prediction_plot( + path_db = "path_to_db", + rat_studies = FALSE, + reps = 10, + holdback = 0.2, + Undersample = TRUE, + hyperparameter_tuning = FALSE, + error_correction_method = "Flip", + testReps = 5 +) diff --git a/vignettes/get_repeat_dose_parallel_studyids.Rmd b/vignettes/get_repeat_dose_parallel_studyids.Rmd new file mode 100644 index 0000000..f0e7f90 --- /dev/null +++ b/vignettes/get_repeat_dose_parallel_studyids.Rmd @@ -0,0 +1,55 @@ +--- +title: "Documentation for `get_repeat_dose_parallel_studyids` Function" +output: html_document +--- + +# Function Purpose + +The `get_repeat_dose_parallel_studyids` function is designed to retrieve study IDs from a database that correspond to parallel-design studies involving repeat-dose toxicity. It filters the studies based on the specified design and whether the species involved are rats. 
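+
+As a quick orientation before the parameter details, a minimal call looks like the sketch below (the database path is a placeholder); the return value is a vector of study IDs, as described in the Output section.
+
+```r
+ids <- get_repeat_dose_parallel_studyids(path_db = "path/to/database.sqlite")
+
+length(ids)  # number of studies matching both the parallel-design and repeat-dose filters
+head(ids)    # first few study IDs
+```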
+ +# Input Parameters + +The function accepts the following parameters: + +| **Parameter** | **Description** | **Default Value** | +|---------------------------|----------------------------------------------------------------------------------------------------------|-------------------| +| `path_db` | The file path to the SQLite database. It must be provided as a valid string. | **Required** | +| `rat_studies` | A logical flag indicating whether to filter the studies for rats only. Defaults to `FALSE`. | `FALSE` | + +# Output + +The function returns a vector of study IDs that meet the specified criteria. The returned vector contains the following: + +- **Study IDs**: A list of study IDs that match both the parallel design and repeat-dose toxicity criteria (and rat species, if specified). + +# Key Steps + +1. **Database Existence Check**: + The function first checks if the database file exists at the provided path. If not, an error is raised. + +2. **Database Connection**: + The database connection is established using the `sendigR` package. A connection to the database is initialized using `sendigR::initEnvironment()`. + +3. **Retrieve Parallel Study IDs**: + The function uses `sendigR::getStudiesSDESIGN()` to retrieve all study IDs associated with the parallel design. + +4. **Retrieve Repeat-Dose Studies**: + A SQL query is executed via `sendigR::genericQuery()` to fetch study IDs that are associated with repeat-dose toxicity. This query looks for studies with specific `TSPARMCD` values related to repeat-dose toxicity. + +5. **Intersect Parallel and Repeat-Dose Studies**: + The study IDs obtained from the parallel design and the repeat-dose toxicity studies are intersected to identify common study IDs. + +6. **Optionally Filter for Rat Studies**: + If `rat_studies = TRUE`, the function retrieves study IDs that involve rats as the species. This is done by querying the `SPECIES` field in the database and filtering based on the presence of "RAT". + +7. **Return Study IDs**: + The final result is a vector of study IDs that meet the filter conditions, including parallel design, repeat-dose toxicity, and optionally, rat species. + +# Example Usage + +```r +# Example without filtering for rat studies +study_ids <- get_repeat_dose_parallel_studyids(path_db = "path/to/database.sqlite") + +# Example with filtering for rat studies +study_ids_rats <- get_repeat_dose_parallel_studyids(path_db = "path/to/database.sqlite", rat_studies = TRUE) diff --git a/vignettes/get_reprtree_from_rf_model .Rmd b/vignettes/get_reprtree_from_rf_model .Rmd new file mode 100644 index 0000000..b26d025 --- /dev/null +++ b/vignettes/get_reprtree_from_rf_model .Rmd @@ -0,0 +1,74 @@ +--- +title: "get_reprtree_from_rf_model Function Documentation" +author: "Your Name" +date: "2024-12-31" +output: html_document +--- + +## Purpose + +The `get_reprtree_from_rf_model` function is designed to train a Random Forest model on a provided dataset and generate a representation tree (ReprTree) from the trained model. The function supports various configurations for data preprocessing, model hyperparameters, and sampling strategies, including random undersampling. Additionally, it allows for error correction and hyperparameter tuning. 
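+
+For orientation, the reprtree workflow that this function wraps can be sketched on a generic `randomForest` fit as shown below. The toy `iris` data and settings are illustrative only and are not part of the package; the `reprtree::ReprTree` and `reprtree::plot.getTree` calls are the ones referenced in the Key Steps below.
+
+```r
+library(randomForest)
+library(reprtree)
+
+set.seed(42)
+rf <- randomForest(Species ~ ., data = iris, ntree = 100, proximity = TRUE)
+
+# Build the representation-tree object from the fitted forest
+rep_tree <- reprtree::ReprTree(rf, iris)
+
+# Plot a single tree from the forest by index (here k = 5, as in this vignette)
+reprtree::plot.getTree(rf, k = 5)
+```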
+
+## Input Parameters
+
+The following table describes the input parameters for the `get_reprtree_from_rf_model` function:
+
+| Parameter | Description | Type | Default Value |
+|-----------|-------------|------|---------------|
+| `Data` | A dataset used to train the Random Forest model. If `NULL`, data is fetched using the `get_Data_formatted_for_ml_and_best.m` function. | Data frame | `NULL` |
+| `path_db` | The path to the database used for fetching or processing the data. | Character string | - |
+| `rat_studies` | A flag indicating whether to restrict the analysis to rat studies. | Boolean (TRUE/FALSE) | FALSE |
+| `studyid_metadata` | Metadata related to study IDs. | Data frame | NULL |
+| `fake_study` | A flag indicating whether to use fake study data. | Boolean (TRUE/FALSE) | FALSE |
+| `use_xpt_file` | A flag indicating whether to use the XPT file format for data input. | Boolean (TRUE/FALSE) | FALSE |
+| `Round` | A flag indicating whether to round the data before processing. | Boolean (TRUE/FALSE) | FALSE |
+| `Impute` | A flag indicating whether to impute missing values in the data. | Boolean (TRUE/FALSE) | FALSE |
+| `reps` | The number of repetitions to perform for cross-validation or resampling. | Integer | - |
+| `holdback` | The fraction of data to hold back for testing. | Numeric | - |
+| `Undersample` | A flag indicating whether undersampling should be applied to balance the dataset. | Boolean (TRUE/FALSE) | FALSE |
+| `hyperparameter_tuning` | A flag indicating whether hyperparameter tuning should be performed. | Boolean (TRUE/FALSE) | FALSE |
+| `error_correction_method` | The method for error correction; valid options are `'Flip'`, `'Prune'`, or `'None'`. | Character string | - |
+
+## Output
+
+The function generates a representation tree (ReprTree) from the trained Random Forest model and visualizes one representative tree (the tree at index `k = 5`) from the forest.
+
+- A plot of the tree at index `k = 5` from the Random Forest is displayed.
+- The representation tree object is generated but not explicitly returned.
+
+## Key Steps
+
+1. **Data Preparation:**
+   - If the `Data` parameter is `NULL`, the function calls `get_Data_formatted_for_ml_and_best.m` to prepare the data for modeling.
+   - Data is split into training and testing sets (70% for training and 30% for testing).
+   - If undersampling is enabled (`Undersample = TRUE`), positive and negative samples are balanced in the training set by undersampling the majority class.
+
+2. **Model Training:**
+   - A Random Forest model is trained using the `randomForest` function. The target variable is `Target_Organ`, and the model uses the best hyperparameter (`best.m`) determined beforehand.
+   - The number of trees in the forest is set to 500, and proximity calculations are enabled.
+
+3. **ReprTree Generation:**
+   - A ReprTree is generated using the `reprtree::ReprTree` function, which creates a representation of the trained Random Forest model.
+   - The tree at index `k = 5` is plotted using `reprtree::plot.getTree`.
+
+4. **Visualization:**
+   - The selected tree from the Random Forest model is visualized using the `reprtree::plot.getTree` function.
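+
+The split-and-undersample step in **Data Preparation** can be sketched as follows. This is a minimal illustration under the assumption that the outcome column is named `Target_Organ` (as stated above); the data frame `Data` and the 70/30 proportions mirror the description, but the code is not the package's internal implementation.
+
+```r
+set.seed(123)
+
+# 70/30 train/test split
+train_idx <- sample(seq_len(nrow(Data)), size = floor(0.7 * nrow(Data)))
+train <- Data[train_idx, ]
+test  <- Data[-train_idx, ]
+
+# Undersample the majority class so both levels of Target_Organ are balanced
+class_counts <- table(train$Target_Organ)
+minority_n   <- min(class_counts)
+balanced_train <- do.call(
+  rbind,
+  lapply(split(train, train$Target_Organ),
+         function(d) d[sample(seq_len(nrow(d)), minority_n), ])
+)
+table(balanced_train$Target_Organ)  # equal counts per class
+```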
+
+## Example Usage
+
+```r
+get_reprtree_from_rf_model(
+  Data = my_data,
+  path_db = "path/to/database",
+  rat_studies = TRUE,
+  studyid_metadata = my_metadata,
+  fake_study = FALSE,
+  use_xpt_file = TRUE,
+  Round = TRUE,
+  Impute = TRUE,
+  reps = 5,
+  holdback = 0.3,
+  Undersample = TRUE,
+  hyperparameter_tuning = FALSE,
+  error_correction_method = "Flip"
+)
+```
diff --git a/vignettes/get_rf_input_param_list_output_cv_imp.Rmd b/vignettes/get_rf_input_param_list_output_cv_imp.Rmd
new file mode 100644
index 0000000..6fe6cc4
--- /dev/null
+++ b/vignettes/get_rf_input_param_list_output_cv_imp.Rmd
@@ -0,0 +1,81 @@
+---
+title: "Documentation for `get_rf_input_param_list_output_cv_imp` Function"
+output: html_document
+---
+
+## Purpose
+
+The `get_rf_input_param_list_output_cv_imp` function prepares the data needed to train and evaluate a Random Forest (RF) model with cross-validation and variable importance scores. It handles various configurations, such as imputation, hyperparameter tuning, and restriction to rat studies. The function reads study data from either an XPT file or an SQLite database, harmonizes it, and then performs model training and evaluation.
+
+## Input Parameters
+
+| Parameter | Type | Description | Default Value |
+|-----------|------|-------------|---------------|
+| `path_db` | character | Path to the SQLite database. | N/A |
+| `rat_studies` | logical | If `TRUE`, limits the studies to rat studies. | `FALSE` |
+| `studyid_metadata` | data.frame | A data frame containing metadata for the studies. | N/A |
+| `fake_study` | logical | If `TRUE`, uses a fake study for data processing. | `FALSE` |
+| `use_xpt_file` | logical | If `TRUE`, reads data from an XPT file instead of a database. | `FALSE` |
+| `Round` | logical | If `TRUE`, rounds the liver scores. | `FALSE` |
+| `Impute` | logical | If `TRUE`, imputes missing values in the data. | `FALSE` |
+| `reps` | integer | Number of repetitions for model evaluation. | N/A |
+| `holdback` | numeric | The proportion of data to hold back for validation. | N/A |
+| `Undersample` | logical | If `TRUE`, undersamples the data to balance the classes. | `FALSE` |
+| `hyperparameter_tuning` | logical | If `TRUE`, tunes hyperparameters for the Random Forest model. | `FALSE` |
+| `error_correction_method` | character | The error correction method. Options: 'Flip', 'Prune', or 'None'. | N/A |
+| `best.m` | numeric | A predefined value for the `mtry` hyperparameter (the number of variables sampled at each split) of the Random Forest model. If `NULL`, the function determines it automatically. | `NULL` |
+| `testReps` | integer | Number of test repetitions for model evaluation. | N/A |
+| `indeterminateUpper` | numeric | Upper threshold for indeterminate predictions. | N/A |
+| `indeterminateLower` | numeric | Lower threshold for indeterminate predictions. | N/A |
+| `Type` | character | The type of Random Forest model to use. Options include classification or regression models. | N/A |
+| `nTopImportance` | integer | The number of top important features to consider for the model. | N/A |
+
+## Output
+
+The function returns the result of the `get_rf_model_with_cv` function: a Random Forest model trained with cross-validation (CV), the cross-validation results, and the variable importance scores.
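+
+Because the returned object is whatever `get_rf_model_with_cv` produces, its exact element names are not fixed by this wrapper; a quick structural inspection is the simplest way to see them. A sketch, where `cv_output` stands for the result of a call such as the one in the Example Usage below:
+
+```r
+# Top-level components, expected per the description above:
+# the trained model, the cross-validation results, and the importance scores
+str(cv_output, max.level = 1)
+```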
+ +## Key Steps + +1. **Data Source Selection**: + - If `use_xpt_file` is `TRUE`, the function loads data from an XPT file. + - If `fake_study` is `TRUE`, it fetches data from a SQLite database and filters based on `rat_studies`. + - If neither condition is met, it retrieves study IDs from the database using `get_repeat_dose_parallel_studyids`. + +2. **Data Harmonization**: + - The function calls `get_liver_om_lb_mi_tox_score_list` to calculate liver scores for the studies, which are then harmonized using `get_col_harmonized_scores_df`. + +3. **Machine Learning Data Preparation**: + - The function prepares data for Random Forest model training by calling `get_ml_data_and_tuned_hyperparameters`. This step involves imputation, optional hyperparameter tuning, and data balancing. + +4. **Random Forest Model Training and Evaluation**: + - The function calls `get_rf_model_with_cv` to train and evaluate the Random Forest model with cross-validation. The model's performance is evaluated across multiple repetitions (`testReps`), with the option to include top importance features. + +5. **Error Correction**: + - If specified, the function applies an error correction method (either "Flip", "Prune", or "None"). + +6. **Return**: + - The function returns the trained Random Forest model along with cross-validation results and feature importance scores. + +## Example Usage + +```r +result <- get_rf_input_param_list_output_cv_imp( + path_db = "path/to/database", + rat_studies = TRUE, + studyid_metadata = metadata_df, + fake_study = FALSE, + use_xpt_file = FALSE, + Round = TRUE, + Impute = TRUE, + reps = 10, + holdback = 0.2, + Undersample = TRUE, + hyperparameter_tuning = TRUE, + error_correction_method = "Flip", + best.m = NULL, + testReps = 5, + indeterminateUpper = 0.9, + indeterminateLower = 0.1, + Type = "classification", + nTopImportance = 10 +) diff --git a/vignettes/get_zone_exclusioned_rf_model_cv_imp.Rmd b/vignettes/get_zone_exclusioned_rf_model_cv_imp.Rmd new file mode 100644 index 0000000..8f3e4d8 --- /dev/null +++ b/vignettes/get_zone_exclusioned_rf_model_cv_imp.Rmd @@ -0,0 +1,97 @@ +--- +title: "Random Forest Model with Cross-Validation and Feature Importance" +author: "Md Aminul Islam Prodhan" +output: html_document +--- + +## Function Purpose + +The `get_rf_model_output_cv_imp` function is designed to perform cross-validation on a Random Forest model, track performance metrics (such as sensitivity, specificity, accuracy), handle indeterminate predictions, and compute feature importance based on either Gini or Accuracy. This function outputs performance summaries and feature importance rankings after a specified number of test repetitions. + +## Input Parameters + +The function takes several input parameters that control the model's training process, validation, and feature importance calculations. Below is a table describing each parameter: + +| **Parameter** | **Type** | **Description** | +|-------------------------|--------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `scores_df` | `data.frame` | A data frame containing the features and target variable for training and testing the model. | +| `Undersample` | `logical` | A flag indicating whether to apply undersampling to the training data. Defaults to `FALSE`. 
|
+| `best.m` | `numeric` / `NULL` | A numeric value giving the number of features sampled at each split (`mtry`) for the Random Forest model, or `NULL` to have it calculated automatically. |
+| `testReps` | `integer` | The number of repetitions for cross-validation. This should be at least 2. |
+| `indeterminateUpper` | `numeric` | The upper bound of the indeterminate zone: predicted probabilities below this value (and above `indeterminateLower`) are treated as indeterminate. |
+| `indeterminateLower` | `numeric` | The lower bound of the indeterminate zone: predicted probabilities above this value (and below `indeterminateUpper`) are treated as indeterminate. |
+| `Type` | `integer` | The type of importance to compute (`1` for MeanDecreaseAccuracy, `2` for MeanDecreaseGini). |
+| `nTopImportance` | `integer` | The number of top features to display based on their importance scores. |
+
+## Output
+
+The function returns a list containing the following elements:
+
+- **`performance_metrics`**: A vector of aggregated performance metrics (e.g., sensitivity, specificity, accuracy).
+- **`feature_importance`**: A matrix containing the importance of the top `nTopImportance` features, ordered by their importance score.
+- **`raw_results`**: A list of raw results for debugging or further analysis, including sensitivity, specificity, accuracy, and Gini scores across all test repetitions.
+
+## Key Steps
+
+### 1. Data Preparation
+
+The input data is prepared by creating a copy of `scores_df` called `rfTestData`, which is initialized with `NA` values to hold the predictions from each test repetition. The column names are simplified to numeric identifiers.
+
+### 2. Cross-Validation
+
+The function iterates through `testReps` repetitions to perform cross-validation:
+
+- The dataset is split into training and testing sets in each iteration.
+- If `Undersample` is set to `TRUE`, the training set is undersampled to balance the class distribution.
+- A Random Forest model is trained on the training data.
+- Predictions are made on the test data and stored in `rfTestData`.
+
+### 3. Handling Indeterminate Predictions
+
+During each repetition, predictions with probabilities between the `indeterminateLower` and `indeterminateUpper` thresholds are considered indeterminate. These predictions are replaced with `NA`, and the proportion of indeterminate predictions is tracked.
+
+### 4. Performance Metrics
+
+For each test repetition, the function computes a confusion matrix using the `caret` package and extracts various performance metrics, including:
+
+- Sensitivity
+- Specificity
+- Positive Predictive Value (PPV)
+- Negative Predictive Value (NPV)
+- Prevalence
+- Accuracy
+
+These metrics are stored and aggregated across all test repetitions to provide an overall performance summary.
+
+### 5. Feature Importance
+
+Feature importance is computed using the `randomForest::importance()` function. The importance scores are aggregated over all repetitions, and the top `nTopImportance` features are identified and returned.
+
+### 6. Return Results
+
+The function returns a list containing:
+
+- Aggregated performance metrics
+- The top `nTopImportance` features ranked by their importance score
+- Raw results for further analysis (e.g., confusion matrix outputs)
+
+## Example Usage
+
+```r
+# Example usage of the function
+result <- get_rf_model_output_cv_imp(
+  scores_df = your_data,
+  Undersample = FALSE,
+  best.m = 3,
+  testReps = 5,
+  indeterminateUpper = 0.8,
+  indeterminateLower = 0.2,
+  Type = 1,
+  nTopImportance = 10
+)
+
+# View performance metrics
+print(result$performance_metrics)
+
+# View top features by importance
+print(result$feature_importance)
+```
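+
+The indeterminate-zone rule described in step 3 can be sketched as follows. This is an illustration only, under the assumption that class probabilities are compared against the two thresholds exactly as described above; the vector names are placeholders, not the package's internal variables.
+
+```r
+# Toy predicted probabilities for the positive class from one test repetition
+probs <- c(0.05, 0.35, 0.55, 0.92)
+indeterminateLower <- 0.2
+indeterminateUpper <- 0.8
+
+# Probabilities falling between the two thresholds are set to NA (indeterminate)
+screened <- ifelse(probs > indeterminateLower & probs < indeterminateUpper, NA, probs)
+
+mean(is.na(screened))  # proportion of predictions excluded as indeterminate
+```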