documentation updated

aminuldu07 · Jan 1, 2025 · 136841a · 136841a
1 parent cb37e06
commit 136841a
Show file tree

Hide file tree

Showing 47 changed files with 2,910 additions and 411 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,16 +1,31 @@
 # Generated by roxygen2: do not edit by hand
 
-export(get_all_lb_TESTCD_zscore)
+export(get_Data_formatted_for_ml_and_best.m)
+export(get_auc_curve_with_rf_model)
 export(get_bw_score)
+export(get_col_harmonized_scores_df)
 export(get_compile_data)
-export(get_fixed_parameter_rf_model)
-export(get_harmonized_column)
+export(get_histogram_barplot)
+export(get_imp_features_from_rf_model_with_cv)
 export(get_lb_score)
 export(get_liver_om_lb_mi_tox_score_list)
 export(get_livertobw_score)
 export(get_mi_score)
-export(get_random_forest_model)
-export(predicted_random_forest_model)
+export(get_ml_data_and_tuned_hyperparameters)
+export(get_prediction_plot)
+export(get_repeat_dose_parallel_studyids)
+export(get_reprtree_from_rf_model)
+export(get_rf_input_param_list_output_cv_imp)
+export(get_rf_model_output_cv_imp)
+export(get_rf_model_with_cv)
+import(DBI)
+import(ROCR)
+import(RSQLite)
+import(caret)
+import(ggplot2)
+import(randomForest)
+import(reprtree)
+import(stats)
 importFrom(RSQLite,SQLite)
 importFrom(RSQLite,dbConnect)
-importFrom(magrittr,"%>%")
+importFrom(stats,lm)
diff --git a/R/get_c_1Data_formatted_for_ml_and_best.m.R → R/get_Data_formatted_for_ml_and_best.m.R b/R/get_c_1Data_formatted_for_ml_and_best.m.R → R/get_Data_formatted_for_ml_and_best.m.R
@@ -1,4 +1,53 @@
-
+#' @title Retrieve and Preprocess Data for Machine Learning Models
+#'
+#' @description
+#' This function processes data from a given SQLite database or XPT file, calculates liver toxicity scores, and prepares data for machine learning models.
+#' It can also tune hyperparameters and apply error correction methods.
+#'
+#' @param path_db A character string representing the path to the SQLite database or XPT file.
+#' @param rat_studies A logical flag to filter for rat studies (default is FALSE).
+#' @param studyid_metadata A data frame containing metadata for the study IDs. If NULL, metadata is generated (default is NULL).
+#' @param fake_study A logical flag to use fake study data (default is FALSE).
+#' @param use_xpt_file A logical flag to indicate whether to use an XPT file instead of a SQLite database (default is FALSE).
+#' @param Round A logical flag to round liver toxicity scores (default is FALSE).
+#' @param Impute A logical flag to impute missing values in the dataset (default is FALSE).
+#' @param reps An integer specifying the number of repetitions for cross-validation.
+#' @param holdback A numeric value indicating the fraction of data to hold back for validation.
+#' @param Undersample A logical flag to undersample the majority class (default is FALSE).
+#' @param hyperparameter_tuning A logical flag to perform hyperparameter tuning (default is FALSE).
+#' @param error_correction_method A character string specifying the error correction method. Must be one of 'Flip', 'Prune', or 'None'.
+#'
+#' @return A list containing:
+#'   \item{Data}{A data frame containing the preprocessed data ready for machine learning.}
+#'   \item{best.m}{The best `mtry` value (number of variables tried at each split) for the Random Forest model after hyperparameter tuning, if applicable.}
+#'
+#' @details
+#' This function performs several key steps:
+#' - Retrieves study IDs from an SQLite database or XPT file.
+#' - Generates or uses provided study metadata, including a random assignment of "Target_Organ" values (either "Liver" or "not_Liver").
+#' - Calculates liver toxicity scores using the `get_liver_om_lb_mi_tox_score_list` function.
+#' - Harmonizes the calculated scores using the `get_col_harmonized_scores_df` function.
+#' - Prepares the data for machine learning and tunes hyperparameters (if enabled) using the `get_ml_data_and_tuned_hyperparameters` function.
+#' - Returns the processed data and the best `mtry` value (`best.m`).
+#'
+#' @examples
+#' \dontrun{
+#' result <- get_Data_formatted_for_ml_and_best.m(
+#'   path_db = "path/to/database.db",
+#'   rat_studies = TRUE,
+#'   reps = 5,
+#'   holdback = 0.2,
+#'   error_correction_method = "Flip"
+#' )
+#'
+#' # Access the processed data and the tuned `mtry` value
+#' processed_data <- result$Data
+#' best_model <- result$best.m
+#' }
+#'
+#' @import DBI
+#' @import RSQLite
+#' @export
 
 
 

diff --git a/R/get_12auc_curve_with_rf_model.R → R/get_auc_curve_with_rf_model.R b/R/get_12auc_curve_with_rf_model.R → R/get_auc_curve_with_rf_model.R
@@ -1,42 +1,61 @@
-#' Generate and Plot AUC Curve for Random Forest Model
+#' @title Compute and Plot AUC Curve with Random Forest Model
 #'
-#' This function trains a Random Forest model on provided or dynamically generated data, computes the
-#' Area Under the Curve (AUC) for the model's performance, and plots the Receiver Operating Characteristic (ROC) curve.
+#' @description
+#' This function trains a Random Forest model, computes the ROC curve, and
+#' calculates the AUC (Area Under the Curve). It allows various preprocessing
+#' options, such as imputation, rounding, undersampling, and hyperparameter tuning.
 #'
-#' @param rfData Data frame. The input data for training the Random Forest model. If `NULL`, the data is generated using
-#'   \code{get_rfData_and_best_m}.
-#' @param best.m Integer. The `mtry` hyperparameter for Random Forest. If `NULL`, the value is determined dynamically
-#'   using \code{get_rfData_and_best_m}.
-#' @param path_db Character. Path to the SQLite database. Required if `rfData` or `best.m` is `NULL`.
-#' @param studyid_metadata_path Character. Path to the CSV file containing study ID metadata. Required if `rfData` or
-#'   `best.m` is `NULL`.
-#' @param fake_study Logical. Whether to use fake study IDs. Default is \code{TRUE}.
-#' @param Round Logical. Whether to round numerical values in the data. Default is \code{TRUE}.
-#' @param Undersample Logical. Whether to perform undersampling to balance the data. Default is \code{TRUE}.
+#' @param Data A data frame containing the training data. If `NULL`, data will be fetched from the database.
+#' @param path_db A string representing the path to the SQLite database used to fetch data when `Data` is `NULL`.
+#' @param rat_studies Logical; whether to filter for rat studies. Defaults to `FALSE`.
+#' @param studyid_metadata A data frame containing metadata associated with study IDs.
+#' @param fake_study Logical; whether to use fake study IDs for data simulation. Defaults to `FALSE`.
+#' @param use_xpt_file Logical; whether to use an XPT file for input data. Defaults to `FALSE`.
+#' @param Round Logical; whether to round numerical values. Defaults to `FALSE`.
+#' @param Impute Logical; whether to perform imputation on missing values. Defaults to `FALSE`.
+#' @param best.m The 'mtry' hyperparameter for Random Forest. If `NULL`, it is determined by the function.
+#' @param reps A numeric value indicating the number of repetitions for cross-validation.
+#' @param holdback Numeric; either 1 or a fraction value (e.g., 0.75) for holdback during cross-validation.
+#' @param Undersample Logical; whether to perform undersampling. Defaults to `FALSE`.
+#' @param hyperparameter_tuning Logical; whether to perform hyperparameter tuning. Defaults to `FALSE`.
+#' @param error_correction_method Character; one of "Flip", "Prune", or "None", specifying the method of error correction.
+#' @param output_individual_scores Logical; whether to output individual scores. Defaults to `TRUE`.
+#' @param output_zscore_by_USUBJID Logical; whether to output z-scores by subject ID. Defaults to `FALSE`.
 #'
-#' @return This function does not return a value. It prints the AUC value and plots the ROC curve.
-#' @details
-#' If `rfData` and `best.m` are not provided, the function dynamically generates the required data by connecting to
-#' the specified SQLite database and processing metadata.
+#' @return This function does not return any explicit value. It generates:
+#'   \itemize{
+#'     \item The AUC (Area Under the Curve) printed to the console.
+#'     \item A ROC curve plot with the calculated AUC value.
+#'     \item Various performance metrics (e.g., True Positive Rate, False Positive Rate), displayed in the plot.
+#'   }
 #'
-#' The function uses the `randomForest` package to train the model and the `ROCR` package to calculate and plot
-#' the AUC and ROC curve.
+#' @details
+#' The function prepares data for training a Random Forest model by first fetching data from an SQLite database
+#' or generating synthetic data (if `fake_study` is `TRUE`). It processes the data using various options such
+#' as imputation, rounding, and undersampling. The model is trained using the Random Forest algorithm, and
+#' performance is evaluated via the ROC curve and AUC metric.
 #'
-#' @export
+#' The function also allows for hyperparameter tuning and error correction. After training the model,
+#' predictions are made, and the AUC is calculated and visualized with a ROC curve plot.
 #'
 #' @examples
-#' # Using pre-calculated rfData and best.m
-#' get_auc_curve(rfData = my_rfData, best.m = 5)
+#' \dontrun{
+#' # Example 1: Using real data from the database
+#' get_auc_curve_with_rf_model(Data = NULL, path_db = "path/to/database.db", rat_studies = TRUE, reps = 10,
+#'                             holdback = 0.75, error_correction_method = "Prune")
 #'
-#' # Dynamically generating rfData and best.m
-#' get_auc_curve(
-#'   path_db = "path/to/database.db",
-#'   studyid_metadata_path = "path/to/study_metadata.csv",
-#'   fake_study = TRUE,
-#'   Round = TRUE,
-#'   Undersample = TRUE
-#' )
-
+#' # Example 2: Using synthetic data with fake study IDs
+#' get_auc_curve_with_rf_model(Data = NULL, fake_study = TRUE, reps = 5, holdback = 0.8,
+#'                             error_correction_method = "Flip")
+#' }
+#'
+#' @seealso
+#' `randomForest`, `ROCR`
+#'
+#' @import DBI
+#' @import RSQLite
+#' @import ROCR
+#' @import randomForest
+#'
+#' @export
 
 
 

diff --git a/R/get_13histogram_barplot.R → R/get_histogram_barplot.R b/R/get_13histogram_barplot.R → R/get_histogram_barplot.R
@@ -1,3 +1,54 @@
+#' @title Generate Histogram or Bar Plot for Liver-Related Scores
+#'
+#' @description
+#' This function generates a bar plot comparing liver-related findings to non-liver-related findings,
+#' or returns processed data for further analysis. The function can fetch data from an SQLite database,
+#' a provided XPT file, or simulate data if `fake_study` is set to TRUE.
+#'
+#' @param Data A data frame containing liver-related scores. If NULL, the function will attempt to
+#'        generate or fetch the data from a database or file.
+#' @param generateBarPlot A logical flag (default = FALSE). If TRUE, generates a bar plot. If FALSE,
+#'        returns the processed data.
+#' @param path_db A character string representing the path to the SQLite database. Required if
+#'        `use_xpt_file` is FALSE or `fake_study` is FALSE.
+#' @param rat_studies A logical flag (default = FALSE) to filter for rat studies when fetching data
+#'        from the database.
+#' @param studyid_metadata A data frame containing metadata associated with study IDs. Required when
+#'        `fake_study` is FALSE and real data is fetched.
+#' @param fake_study A logical flag (default = FALSE). If TRUE, the function simulates study data
+#'        instead of fetching it from a database.
+#' @param use_xpt_file A logical flag (default = FALSE). If TRUE, the function will use an XPT file
+#'        to fetch data, instead of relying on the database.
+#' @param Round A logical flag (default = FALSE). Whether to round the liver scores.
+#' @param output_individual_scores A logical flag (default = TRUE). Whether to output individual
+#'        scores or aggregated scores.
+#' @param output_zscore_by_USUBJID A logical flag (default = FALSE). Whether to output z-scores
+#'        by USUBJID (unique subject identifier).
+#'
+#' @return If `generateBarPlot = TRUE`, a `ggplot2` bar plot object is returned displaying the
+#'         average scores for liver-related findings versus non-liver-related findings. If
+#'         `generateBarPlot = FALSE`, a data frame (`plotData`) containing the calculated values
+#'         for each finding, liver status (`LIVER`), and mean values (`Value`) is returned.
+#'
+#' @details
+#' If no data is provided, the function attempts to fetch data from an SQLite database or simulate
+#' data based on the `fake_study` flag. The function also supports the use of XPT files and allows
+#' customization of study filtering through the `rat_studies` and `studyid_metadata` parameters.
+#' When generating a plot, the function compares liver-related findings to other findings,
+#' displaying the average scores for each finding in a bar plot.
+#'
+#' @examples
+#' \dontrun{
+#' # Example 1: Generate a bar plot with fake study data
+#' get_histogram_barplot(generateBarPlot = TRUE, fake_study = TRUE)
+#'
+#' # Example 2: Get processed data without generating a plot
+#' data <- get_histogram_barplot(generateBarPlot = FALSE, fake_study = FALSE, path_db = "path/to/db")
+#' }
+#'
+#' @import ggplot2
+#' @import DBI
+#' @import RSQLite
+#' @export
+
 
 get_histogram_barplot <- function(Data =NULL,
                            generateBarPlot= FALSE,

diff --git a/R/get_11imp_features_from_rf_model_with_cv.R → R/get_imp_features_from_rf_model_with_cv.R b/R/get_11imp_features_from_rf_model_with_cv.R → R/get_imp_features_from_rf_model_with_cv.R
@@ -1,3 +1,48 @@
+#' @title Get Important Features from Random Forest Model with Cross-Validation
+#'
+#' @description
+#' This function performs cross-validation with test repetitions on a random forest model, calculates feature importance using Gini importance, and returns the top `nTopImportance` important features.
+#'
+#' @param Data A data frame containing the training data (rows as samples, columns as features). The first column is assumed to be the target variable.
+#' @param Undersample A logical value indicating whether to apply under-sampling to balance the classes in the training data. Default is `FALSE`.
+#' @param best.m A numeric value representing the number of variables to consider at each split of the Random Forest model (or a function to determine this). Default is `NULL`.
+#' @param testReps A numeric value indicating the number of test repetitions (must be at least 2).
+#' @param Type A numeric value indicating the type of importance to be calculated. `1` for Mean Decrease Accuracy and `2` for Mean Decrease Gini.
+#' @param nTopImportance A numeric value indicating the number of top important features to return based on their importance scores.
+#'
+#' @return A list containing:
+#' \describe{
+#'   \item{gini_scores}{A matrix of Gini importance scores for each feature across the different cross-validation iterations. The matrix has rows representing features and columns representing test iterations.}
+#' }
+#'
+#' @details
+#' This function trains a Random Forest model using cross-validation with specified repetitions and calculates the feature importance using Gini importance scores. The function also supports optional under-sampling to balance the class distribution in the training set.
+#'
+#' The function performs the following steps:
+#' \itemize{
+#'   \item Initializes performance metric trackers.
+#'   \item Prepares the input data for cross-validation.
+#'   \item Performs cross-validation, where each repetition involves training the model on a subset of data and testing on the remaining data.
+#'   \item Optionally applies under-sampling to the training data.
+#'   \item Trains a Random Forest model on each fold and calculates Gini importance scores.
+#'   \item Aggregates and sorts the Gini importance scores to identify the top features.
+#'   \item Plots the importance of top features.
+#' }
+#'
+#' @examples
+#' \dontrun{
+#' # Example of calling the function
+#' result <- get_imp_features_from_rf_model_with_cv(
+#'   Data = scores_df,
+#'   Undersample = FALSE,
+#'   best.m = 3,
+#'   testReps = 5,
+#'   Type = 2,
+#'   nTopImportance = 10
+#' )
+#' }
+#'
+#' @export
+
+
 
 get_imp_features_from_rf_model_with_cv <- function(Data=NULL, #scores_df
                                                   Undersample = FALSE,

diff --git a/R/get_15prediction_plot.R → R/get_prediction_plot.R b/R/get_15prediction_plot.R → R/get_prediction_plot.R
@@ -1,3 +1,49 @@
+#' @title Generate Prediction Plot for Random Forest Model
+#'
+#' @description
+#' This function performs model building and prediction using a random forest algorithm. It iterates over multiple test repetitions, training the model on the training data and predicting on the test data. After predictions are made, a histogram plot is generated to visualize the distribution of predicted probabilities for the outcome variable (`LIVER`).
+#'
+#' @param Data A data frame containing the dataset to use for training and testing. If `NULL`, the function will attempt to fetch and format the data from the database using `get_Data_formatted_for_ml_and_best.m` function.
+#' @param path_db A string indicating the path to the database that contains the dataset.
+#' @param rat_studies A logical flag indicating whether to use rat studies data. Defaults to `FALSE`.
+#' @param studyid_metadata A data frame containing metadata related to the study IDs. Defaults to `NULL`.
+#' @param fake_study A logical flag indicating whether to use fake study data. Defaults to `FALSE`.
+#' @param use_xpt_file A logical flag indicating whether to use an XPT file. Defaults to `FALSE`.
+#' @param Round A logical flag indicating whether to round the predictions. Defaults to `FALSE`.
+#' @param Impute A logical flag indicating whether to impute missing values. Defaults to `FALSE`.
+#' @param reps An integer specifying the number of repetitions for cross-validation.
+#' @param holdback A numeric value indicating the proportion of data to hold back for testing during cross-validation.
+#' @param Undersample A logical flag indicating whether to perform undersampling on the dataset to balance the classes. Defaults to `FALSE`.
+#' @param hyperparameter_tuning A logical flag indicating whether to perform hyperparameter tuning. Defaults to `FALSE`.
+#' @param error_correction_method A string specifying the error correction method to be used. Possible values are "Flip", "Prune", or "None".
+#' @param testReps An integer specifying the number of test repetitions for model evaluation.
+#'
+#' @return A `ggplot` object representing the histogram of predicted probabilities for the `LIVER` variable across test repetitions.
+#'
+#' @details
+#' The function works as follows:
+#' - If `Data` is `NULL`, the function fetches the data and the best model configuration by calling the `get_Data_formatted_for_ml_and_best.m` function.
+#' - The dataset is divided into training and test sets for each repetition (`testReps`).
+#' - If `Undersample` is enabled, undersampling is applied to balance the dataset.
+#' - A random forest model is trained on the training data and predictions are made on the test data.
+#' - The predictions are averaged over the test repetitions and a histogram is plotted to visualize the distribution of predicted probabilities for `LIVER`.
+#'
+#' @examples
+#' \dontrun{
+#' # Example function call
+#' get_prediction_plot(
+#'   path_db = "path_to_db",
+#'   rat_studies = FALSE,
+#'   reps = 10,
+#'   holdback = 0.2,
+#'   Undersample = TRUE,
+#'   hyperparameter_tuning = FALSE,
+#'   error_correction_method = "Flip",
+#'   testReps = 5
+#' )
+#' }
+#'
+#' @export
+
+
 get_prediction_plot <- function(Data=NULL,
                                 path_db,
                                 rat_studies=FALSE,

diff --git a/R/get_0repeat_dose_parallel_studyids.R → R/get_repeat_dose_parallel_studyids.R b/R/get_0repeat_dose_parallel_studyids.R → R/get_repeat_dose_parallel_studyids.R
@@ -1,4 +1,29 @@
-
+#' @title Get Repeat Dose Parallel Study IDs
+#'
+#' @description
+#' This function retrieves study IDs from a database that correspond to parallel-design studies involving repeat-dose toxicity.
+#' It optionally filters the studies for rat species.
+#'
+#' @param path_db A character string representing the file path to the SQLite database. This is a required parameter.
+#' @param rat_studies A logical flag indicating whether to filter the studies for rats only. Defaults to `FALSE`.
+#'
+#' @return A vector of study IDs that meet the specified criteria. This includes:
+#'   \itemize{
+#'     \item Study IDs that match both the parallel design and repeat-dose toxicity criteria.
+#'     \item Optionally, study IDs that match rat species if `rat_studies = TRUE`.
+#'   }
+#'
+#' @examples
+#' \dontrun{
+#'   # Example without filtering for rat studies
+#'   study_ids <- get_repeat_dose_parallel_studyids(path_db = "path/to/database.sqlite")
+#'
+#'   # Example with filtering for rat studies
+#'   study_ids_rats <- get_repeat_dose_parallel_studyids(path_db = "path/to/database.sqlite", rat_studies = TRUE)
+#' }
+#'
+#' @export
+#'
 get_repeat_dose_parallel_studyids <- function (path_db,
                                                rat_studies = FALSE) {