Skip to content

Commit

Permalink
documentation updated
Browse files Browse the repository at this point in the history
  • Loading branch information
aminuldu07 committed Jan 1, 2025
1 parent cb37e06 commit 136841a
Show file tree
Hide file tree
Showing 47 changed files with 2,910 additions and 411 deletions.
27 changes: 21 additions & 6 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,16 +1,31 @@
# Generated by roxygen2: do not edit by hand

export(get_all_lb_TESTCD_zscore)
export(get_Data_formatted_for_ml_and_best.m)
export(get_auc_curve_with_rf_model)
export(get_bw_score)
export(get_col_harmonized_scores_df)
export(get_compile_data)
export(get_fixed_parameter_rf_model)
export(get_harmonized_column)
export(get_histogram_barplot)
export(get_imp_features_from_rf_model_with_cv)
export(get_lb_score)
export(get_liver_om_lb_mi_tox_score_list)
export(get_livertobw_score)
export(get_mi_score)
export(get_random_forest_model)
export(predicted_random_forest_model)
export(get_ml_data_and_tuned_hyperparameters)
export(get_prediction_plot)
export(get_repeat_dose_parallel_studyids)
export(get_reprtree_from_rf_model)
export(get_rf_input_param_list_output_cv_imp)
export(get_rf_model_output_cv_imp)
export(get_rf_model_with_cv)
import(DBI)
import(ROCR)
import(RSQLite)
import(caret)
import(ggplot2)
import(randomForest)
import(reprtree)
import(stats)
importFrom(RSQLite,SQLite)
importFrom(RSQLite,dbConnect)
importFrom(magrittr,"%>%")
importFrom(stats,lm)
Original file line number Diff line number Diff line change
@@ -1,4 +1,53 @@

#' @title Retrieve and Preprocess Data for Machine Learning Models
#'
#'@description
#' This function processes data from a given SQLite database or XPT file, calculates liver toxicity scores, and prepares data for machine learning models.
#' It can also tune hyperparameters and apply error correction methods.
#'
#' @param path_db A character string representing the path to the SQLite database or XPT file.
#' @param rat_studies A logical flag to filter for rat studies (default is FALSE).
#' @param studyid_metadata A data frame containing metadata for the study IDs. If NULL, metadata is generated (default is NULL).
#' @param fake_study A logical flag to use fake study data (default is FALSE).
#' @param use_xpt_file A logical flag to indicate whether to use an XPT file instead of a SQLite database (default is FALSE).
#' @param Round A logical flag to round liver toxicity scores (default is FALSE).
#' @param Impute A logical flag to impute missing values in the dataset (default is FALSE).
#' @param reps An integer specifying the number of repetitions for cross-validation.
#' @param holdback A numeric value indicating the fraction of data to hold back for validation.
#' @param Undersample A logical flag to undersample the majority class (default is FALSE).
#' @param hyperparameter_tuning A logical flag to perform hyperparameter tuning (default is FALSE).
#' @param error_correction_method A character string specifying the error correction method. Must be one of 'Flip', 'Prune', or 'None'.
#'
#' @return A list containing:
#' \item{Data}{A data frame containing the preprocessed data ready for machine learning.}
#' \item{best.m}{The best `mtry` value (number of variables tried at each split) for the Random Forest model, selected by hyperparameter tuning if enabled.}
#'
#' @details
#' This function performs several key steps:
#' - Retrieves study IDs from an SQLite database or XPT file.
#' - Generates or uses provided study metadata, including a random assignment of "Target_Organ" values (either "Liver" or "not_Liver").
#' - Calculates liver toxicity scores using the `get_liver_om_lb_mi_tox_score_list` function.
#' - Harmonizes the calculated scores using the `get_col_harmonized_scores_df` function.
#' - Prepares the data for machine learning and tunes hyperparameters (if enabled) using the `get_ml_data_and_tuned_hyperparameters` function.
#' - Returns the processed data and the best `mtry` value (`best.m`).
#'
#' @examples
#' \dontrun{
#' result <- get_Data_formatted_for_ml_and_best.m(
#' path_db = "path/to/database.db",
#' rat_studies = TRUE,
#' reps = 5,
#' holdback = 0.2,
#' error_correction_method = "Flip"
#' )
#'
#' # Access the processed data and the best model
#' processed_data <- result$Data
#' best_model <- result$best.m
#' }
#'
#' @import DBI
#' @import RSQLite
#' @export



Expand Down
Original file line number Diff line number Diff line change
@@ -1,42 +1,61 @@
#' Generate and Plot AUC Curve for Random Forest Model
#' @title Compute and Plot AUC Curve with Random Forest Model
#'
#' This function trains a Random Forest model on provided or dynamically generated data, computes the
#' Area Under the Curve (AUC) for the model's performance, and plots the Receiver Operating Characteristic (ROC) curve.
#'@description
#' This function trains a Random Forest model, computes the ROC curve, and
#' calculates the AUC (Area Under the Curve). It allows various preprocessing
#' options, such as imputation, rounding, undersampling, and hyperparameter tuning.
#'
#' @param rfData Data frame. The input data for training the Random Forest model. If `NULL`, the data is generated using
#' \code{get_rfData_and_best_m}.
#' @param best.m Integer. The `mtry` hyperparameter for Random Forest. If `NULL`, the value is determined dynamically
#' using \code{get_rfData_and_best_m}.
#' @param path_db Character. Path to the SQLite database. Required if `rfData` or `best.m` is `NULL`.
#' @param studyid_metadata_path Character. Path to the CSV file containing study ID metadata. Required if `rfData` or
#' `best.m` is `NULL`.
#' @param fake_study Logical. Whether to use fake study IDs. Default is \code{TRUE}.
#' @param Round Logical. Whether to round numerical values in the data. Default is \code{TRUE}.
#' @param Undersample Logical. Whether to perform undersampling to balance the data. Default is \code{TRUE}.
#' @param Data A data frame containing the training data. If `NULL`, data will be fetched from the database.
#' @param path_db A string representing the path to the SQLite database used to fetch data when `Data` is `NULL`.
#' @param rat_studies Logical; whether to filter for rat studies. Defaults to `FALSE`.
#' @param studyid_metadata A data frame containing metadata associated with study IDs.
#' @param fake_study Logical; whether to use fake study IDs for data simulation. Defaults to `FALSE`.
#' @param use_xpt_file Logical; whether to use an XPT file for input data. Defaults to `FALSE`.
#' @param Round Logical; whether to round numerical values. Defaults to `FALSE`.
#' @param Impute Logical; whether to perform imputation on missing values. Defaults to `FALSE`.
#' @param best.m The 'mtry' hyperparameter for Random Forest. If `NULL`, it is determined by the function.
#' @param reps An integer specifying the number of repetitions for cross-validation.
#' @param holdback Numeric; either 1 or a fraction value (e.g., 0.75) for holdback during cross-validation.
#' @param Undersample Logical; whether to perform undersampling. Defaults to `FALSE`.
#' @param hyperparameter_tuning Logical; whether to perform hyperparameter tuning. Defaults to `FALSE`.
#' @param error_correction_method Character; one of "Flip", "Prune", or "None", specifying the method of error correction.
#' @param output_individual_scores Logical; whether to output individual scores. Defaults to `TRUE`.
#' @param output_zscore_by_USUBJID Logical; whether to output z-scores by subject ID. Defaults to `FALSE`.
#'
#' @return This function does not return a value. It prints the AUC value and plots the ROC curve.
#' @details
#' If `rfData` and `best.m` are not provided, the function dynamically generates the required data by connecting to
#' the specified SQLite database and processing metadata.
#' @return This function does not return any explicit value. It generates:
#' \itemize{
#' \item The AUC (Area Under the Curve) printed to the console.
#' \item A ROC curve plot with the calculated AUC value.
#' \item Various performance metrics (e.g., True Positive Rate, False Positive Rate), displayed in the plot.
#' }
#'
#' The function uses the `randomForest` package to train the model and the `ROCR` package to calculate and plot
#' the AUC and ROC curve.
#' @details
#' The function prepares data for training a Random Forest model by first fetching data from an SQLite database
#' or generating synthetic data (if `fake_study` is `TRUE`). It processes the data using various options such
#' as imputation, rounding, and undersampling. The model is trained using the Random Forest algorithm, and
#' performance is evaluated via the ROC curve and AUC metric.
#'
#' @export
#' The function also allows for hyperparameter tuning and error correction. After training the model,
#' predictions are made, and the AUC is calculated and visualized with a ROC curve plot.
#'
#' @examples
#' # Using pre-calculated rfData and best.m
#' get_auc_curve(rfData = my_rfData, best.m = 5)
#' # Example 1: Using real data from the database
#' get_auc_curve_with_rf_model(Data = NULL, path_db = "path/to/database.db", rat_studies = TRUE, reps = 10,
#' holdback = 0.75, error_correction_method = "Prune")
#'
#' # Dynamically generating rfData and best.m
#' get_auc_curve(
#' path_db = "path/to/database.db",
#' studyid_metadata_path = "path/to/study_metadata.csv",
#' fake_study = TRUE,
#' Round = TRUE,
#' Undersample = TRUE
#' )

#' # Example 2: Using synthetic data with fake study IDs
#' get_auc_curve_with_rf_model(Data = NULL, fake_study = TRUE, reps = 5, holdback = 0.8,
#' error_correction_method = "Flip")
#'
#' @seealso
#' `randomForest`, `ROCR`
#'
#' @import DBI
#' @import RSQLite
#' @import ROCR
#' @import randomForest
#'
#' @export



Expand Down
51 changes: 51 additions & 0 deletions R/get_13histogram_barplot.R → R/get_histogram_barplot.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,54 @@
#' @title Generate Histogram or Bar Plot for Liver-Related Scores
#'
#'@description
#' This function generates a bar plot comparing liver-related findings to non-liver-related findings,
#' or returns processed data for further analysis. The function can fetch data from an SQLite database,
#' a provided XPT file, or simulate data if `fake_study` is set to TRUE.
#'
#' @param Data A data frame containing liver-related scores. If NULL, the function will attempt to
#' generate or fetch the data from a database or file.
#' @param generateBarPlot A logical flag (default = FALSE). If TRUE, generates a bar plot. If FALSE,
#' returns the processed data.
#' @param path_db A character string representing the path to the SQLite database. Required if
#' `use_xpt_file` is FALSE or `fake_study` is FALSE.
#' @param rat_studies A logical flag (default = FALSE) to filter for rat studies when fetching data
#' from the database.
#' @param studyid_metadata A data frame containing metadata associated with study IDs. Required when
#' `fake_study` is FALSE and real data is fetched.
#' @param fake_study A logical flag (default = FALSE). If TRUE, the function simulates study data
#' instead of fetching it from a database.
#' @param use_xpt_file A logical flag (default = FALSE). If TRUE, the function will use an XPT file
#' to fetch data, instead of relying on the database.
#' @param Round A logical flag (default = FALSE). Whether to round the liver scores.
#' @param output_individual_scores A logical flag (default = TRUE). Whether to output individual
#' scores or aggregated scores.
#' @param output_zscore_by_USUBJID A logical flag (default = FALSE). Whether to output z-scores
#' by USUBJID (unique subject identifier).
#'
#' @return If `generateBarPlot = TRUE`, a `ggplot2` bar plot object is returned displaying the
#' average scores for liver-related findings versus non-liver-related findings. If
#' `generateBarPlot = FALSE`, a data frame (`plotData`) containing the calculated values
#' for each finding, liver status (`LIVER`), and mean values (`Value`) is returned.
#'
#' @details
#' If no data is provided, the function attempts to fetch data from an SQLite database or simulate
#' data based on the `fake_study` flag. The function also supports the use of XPT files and allows
#' customization of study filtering through the `rat_studies` and `studyid_metadata` parameters.
#' When generating a plot, the function compares liver-related findings to other findings,
#' displaying the average scores for each finding in a bar plot.
#'
#' @examples
#' \dontrun{
#' # Example 1: Generate a bar plot with fake study data
#' get_histogram_barplot(generateBarPlot = TRUE, fake_study = TRUE)
#'
#' # Example 2: Get processed data without generating a plot
#' data <- get_histogram_barplot(generateBarPlot = FALSE, fake_study = FALSE, path_db = "path/to/db")
#' }
#'
#' @import ggplot2
#' @import DBI
#' @import RSQLite
#' @export


get_histogram_barplot <- function(Data =NULL,
generateBarPlot= FALSE,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,48 @@
#' @title Get Important Features from Random Forest Model with Cross-Validation
#'
#'@description
#' This function performs cross-validation with test repetitions on a random forest model, calculates feature importance using Gini importance, and returns the top `n` important features.
#'
#' @param Data A data frame containing the training data (rows as samples, columns as features). The first column is assumed to be the target variable.
#' @param Undersample A logical value indicating whether to apply under-sampling to balance the classes in the training data. Default is `FALSE`.
#' @param best.m A numeric value representing the number of variables to consider at each split of the Random Forest model (or a function to determine this). Default is `NULL`.
#' @param testReps A numeric value indicating the number of test repetitions (must be at least 2).
#' @param Type A numeric value indicating the type of importance to be calculated. `1` for Mean Decrease Accuracy and `2` for Mean Decrease Gini.
#' @param nTopImportance A numeric value indicating the number of top important features to return based on their importance scores.
#'
#' @return A list containing:
#' \describe{
#' \item{gini_scores}{A matrix of Gini importance scores for each feature across the different cross-validation iterations. The matrix has rows representing features and columns representing test iterations.}
#' }
#'
#' @details
#' This function trains a Random Forest model using cross-validation with specified repetitions and calculates the feature importance using Gini importance scores. The function also supports optional under-sampling to balance the class distribution in the training set.
#'
#' The function performs the following steps:
#' \itemize{
#' \item Initializes performance metric trackers.
#' \item Prepares the input data for cross-validation.
#' \item Performs cross-validation, where each repetition involves training the model on a subset of data and testing on the remaining data.
#' \item Optionally applies under-sampling to the training data.
#' \item Trains a Random Forest model on each fold and calculates Gini importance scores.
#' \item Aggregates and sorts the Gini importance scores to identify the top features.
#' \item Plots the importance of top features.
#' }
#'
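#' @examples
#' \dontrun{
#' # Example of calling the function; `scores_df` is a placeholder for a
#' # prepared data frame whose first column is the target variable
#' result <- get_imp_features_from_rf_model_with_cv(
#' Data = scores_df,
#' Undersample = FALSE,
#' best.m = 3,
#' testReps = 5,
#' Type = 2,
#' nTopImportance = 10
#' )
#' }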
#'
#' @export



get_imp_features_from_rf_model_with_cv <- function(Data=NULL, #scores_df
Undersample = FALSE,
Expand Down
46 changes: 46 additions & 0 deletions R/get_15prediction_plot.R → R/get_prediction_plot.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,49 @@
#' @title Generate Prediction Plot for Random Forest Model
#'
#'@description
#' This function performs model building and prediction using a random forest algorithm. It iterates over multiple test repetitions, training the model on the training data and predicting on the test data. After predictions are made, a histogram plot is generated to visualize the distribution of predicted probabilities for the outcome variable (`LIVER`).
#'
#' @param Data A data frame containing the dataset to use for training and testing. If `NULL`, the function will attempt to fetch and format the data from the database using `get_Data_formatted_for_ml_and_best.m` function.
#' @param path_db A string indicating the path to the database that contains the dataset.
#' @param rat_studies A logical flag indicating whether to use rat studies data. Defaults to `FALSE`.
#' @param studyid_metadata A data frame containing metadata related to the study IDs. Defaults to `NULL`.
#' @param fake_study A logical flag indicating whether to use fake study data. Defaults to `FALSE`.
#' @param use_xpt_file A logical flag indicating whether to use an XPT file. Defaults to `FALSE`.
#' @param Round A logical flag indicating whether to round the liver toxicity scores. Defaults to `FALSE`.
#' @param Impute A logical flag indicating whether to impute missing values. Defaults to `FALSE`.
#' @param reps An integer specifying the number of repetitions for cross-validation.
#' @param holdback A numeric value indicating the proportion of data to hold back for testing during cross-validation.
#' @param Undersample A logical flag indicating whether to perform undersampling on the dataset to balance the classes. Defaults to `FALSE`.
#' @param hyperparameter_tuning A logical flag indicating whether to perform hyperparameter tuning. Defaults to `FALSE`.
#' @param error_correction_method A string specifying the error correction method to be used. Possible values are "Flip", "Prune", or "None".
#' @param testReps An integer specifying the number of test repetitions for model evaluation.
#'
#' @return A `ggplot` object representing the histogram of predicted probabilities for the `LIVER` variable across test repetitions.
#'
#' @details
#' The function works as follows:
#' - If `Data` is `NULL`, the function fetches the data and the best model configuration by calling the `get_Data_formatted_for_ml_and_best.m` function.
#' - The dataset is divided into training and test sets for each repetition (`testReps`).
#' - If `Undersample` is enabled, undersampling is applied to balance the dataset.
#' - A random forest model is trained on the training data and predictions are made on the test data.
#' - The predictions are averaged over the test repetitions and a histogram is plotted to visualize the distribution of predicted probabilities for `LIVER`.
#'
#' @examples
#' \dontrun{
#' # Example function call ("path_to_db" is a placeholder path)
#' get_prediction_plot(
#' path_db = "path_to_db",
#' rat_studies = FALSE,
#' reps = 10,
#' holdback = 0.2,
#' Undersample = TRUE,
#' hyperparameter_tuning = FALSE,
#' error_correction_method = "Flip",
#' testReps = 5
#' )
#' }
#'
#' @export


get_prediction_plot <- function(Data=NULL,
path_db,
rat_studies=FALSE,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,29 @@

#' @title Get Repeat Dose Parallel Study IDs
#'
#'@description
#' This function retrieves study IDs from a database that correspond to parallel-design studies involving repeat-dose toxicity.
#' It optionally filters the studies for rat species.
#'
#' @param path_db A character string representing the file path to the SQLite database. This is a required parameter.
#' @param rat_studies A logical flag indicating whether to filter the studies for rats only. Defaults to `FALSE`.
#'
#' @return A vector of study IDs that meet the specified criteria. This includes:
#' \itemize{
#' \item Study IDs that match both the parallel design and repeat-dose toxicity criteria.
#' \item Optionally, study IDs that match rat species if `rat_studies = TRUE`.
#' }
#'
#' @examples
#' \dontrun{
#' # Example without filtering for rat studies
#' study_ids <- get_repeat_dose_parallel_studyids(path_db = "path/to/database.sqlite")
#'
#' # Example with filtering for rat studies
#' study_ids_rats <- get_repeat_dose_parallel_studyids(path_db = "path/to/database.sqlite", rat_studies = TRUE)
#' }
#'
#' @export
#'
get_repeat_dose_parallel_studyids <- function (path_db,
rat_studies = FALSE) {

Expand Down
Loading

0 comments on commit 136841a

Please sign in to comment.