From 43c135cca2bc305fd1b12b61d70cfd33a83eb3ae Mon Sep 17 00:00:00 2001 From: Md Aminul Islam Prodhan Date: Sun, 29 Dec 2024 10:51:38 -0600 Subject: [PATCH] rf updated --- R/get_Data_formatted_for_ml.R | 162 +++++++++++++ R/get_imp_features_from_rf_model_cv_imp.R | 214 +++++++++++++++++ R/get_rf_input_param_list_output_cv_imp.R | 42 ++-- R/get_rf_model_with_cv.R | 219 ++++++++++++++++++ ...=> get_zone_exclusioned_rf_model_cv_imp.R} | 0 inst/test_get_Data_formatted_for_ml.R | 32 +++ inst/test_get_rf_model_with_cv.R | 88 +++++++ inst/test_train_eval_rf_wth_cv_imp.R | 66 ------ 8 files changed, 743 insertions(+), 80 deletions(-) create mode 100644 R/get_Data_formatted_for_ml.R create mode 100644 R/get_imp_features_from_rf_model_cv_imp.R create mode 100644 R/get_rf_model_with_cv.R rename R/{get_rf_model_cv_imp.R => get_zone_exclusioned_rf_model_cv_imp.R} (100%) create mode 100644 inst/test_get_Data_formatted_for_ml.R create mode 100644 inst/test_get_rf_model_with_cv.R delete mode 100644 inst/test_train_eval_rf_wth_cv_imp.R diff --git a/R/get_Data_formatted_for_ml.R b/R/get_Data_formatted_for_ml.R new file mode 100644 index 0000000..9e3be6b --- /dev/null +++ b/R/get_Data_formatted_for_ml.R @@ -0,0 +1,162 @@ + + + + + +get_Data_formatted_for_ml <- function(path_db, + rat_studies=FALSE, + studyid_metadata=NULL, + fake_study = FALSE, + use_xpt_file = FALSE, + Round = FALSE, + Impute = FALSE, + reps, + holdback, + Undersample = FALSE, + hyperparameter_tuning = FALSE, + error_correction_method # = must be 'Flip' or "Prune' or 'None' + ){ + + # Process the database to retrieve the vector of "STUDYIDs"------------- + if(use_xpt_file){ + + studyid_or_studyids <- list.dirs(path_db , full.names = TRUE, recursive = FALSE) + + } else { + + if (fake_study) { + # Helper function to fetch data from SQLite database + fetch_domain_data <- function(db_connection, domain_name) { + # Convert domain name to uppercase + domain_name <- toupper(domain_name) + # Create SQL query statement + query_statement <- paste0('SELECT * FROM ', domain_name) + # Execute query and fetch the data + query_result <- DBI::dbGetQuery(db_connection, statement = query_statement) + # Return the result + query_result + } + # Establish a connection to the SQLite database + db_connection <- DBI::dbConnect(RSQLite::SQLite(), dbname = path_db) + + # Fetch data for required domains + dm <- fetch_domain_data(db_connection, 'dm') + + # Close the database connection + DBI::dbDisconnect(db_connection) + + # get the studyids from the dm table + studyid_or_studyids <- as.vector(unique(dm$STUDYID)) # unique STUDYIDS from DM table + + # Filter the fake data for the "rat_studies" + if(rat_studies){ + + studyid_or_studyids <- studyid_or_studyids + } + + #-------------------------------------------------------------------- + #-----------we can set logic here for rat studies in "fake data"---- + #-------------------------------------------------------------------- + + } else { + # For the real data in sqlite database + # filter for the repeat-dose and parallel studyids + + studyid_or_studyids <- get_repeat_dose_parallel_studyids(path_db=path_db, + rat_studies = rat_studies) + + } + } + + + # process the database to get the "studyid_metadata"------------ + if(is.null(studyid_metadata)) { + if(fake_study) { + # Extract study ID metadata + studyid_metadata <- dm[, "STUDYID", drop=FALSE] + + # Remove duplicates based on STUDYID + studyid_metadata <- studyid_metadata[!duplicated(studyid_metadata$STUDYID), , drop =FALSE] + + # Add a new column for Target_Organ + 
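      # Reviewer note (sketch, not part of this patch): sample() with
      # prob = c(0.5, 0.5) below yields an *approximately* even split of
      # "Liver" / "not_Liver" labels. If an exactly balanced fake label set is
      # preferred, a shuffled rep() would do it, e.g.:
      #   studyid_metadata$Target_Organ <- sample(
      #     rep(c("Liver", "not_Liver"), length.out = nrow(studyid_metadata)))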
studyid_metadata$Target_Organ <- NA + + # assign "Target_Organ" column values randomly + # randomly 50% of the value is Liver and rest are not_Liver + set.seed(123) # Set seed for reproducibility + rows_number <- nrow(studyid_metadata) # Number of rows + + # Randomly sample 50% for "Liver" and rest for "not_Liver" + studyid_metadata$Target_Organ <- sample(c("Liver", "not_Liver"), size = rows_number, replace = TRUE, prob = c(0.5, 0.5)) + + # View the result + + } else { + + # create "studyid_metadata" data frame from "studyid_or_studyids" vector + studyid_metadata <- data.frame(STUDYID = studyid_or_studyids) + + # Remove duplicates based on STUDYID + studyid_metadata <- studyid_metadata[!duplicated(studyid_metadata$STUDYID), , drop = FALSE] + + # Add a new column for Target_Organ + studyid_metadata$Target_Organ <- NA + + # assign "Target_Organ" column values randomly + # randomly 50% of the value is Liver and rest are not_Liver + set.seed(123) # Set seed for reproducibility + rows_number <- nrow(studyid_metadata) # Number of rows + + # Randomly sample 50% for "Liver" and rest for "not_Liver" + studyid_metadata$Target_Organ <- sample(c("Liver", "not_Liver"), size = rows_number, replace = TRUE, prob = c(0.5, 0.5)) + + } + } + + + #----------------------------------------------------------------------- + # if studyid_metadata is not provided then use the data frame to + # creae a data frame with two columns "STUDYID" and "Target_Organ" + + #------------------------------------------------------------------------- + + # get_liver_om_lb_mi_tox_score_list( + calculated_liver_scores <- get_liver_om_lb_mi_tox_score_list(studyid_or_studyids = studyid_or_studyids, + path_db = path_db, + fake_study = fake_study, + use_xpt_file = use_xpt_file, + output_individual_scores = TRUE, + output_zscore_by_USUBJID = FALSE) + + # Harmonize the column + column_harmonized_liverscr_df <- get_col_harmonized_scores_df(liver_score_data_frame = calculated_liver_scores, + Round = Round) + + + + rfData_and_best_m <- get_ml_data_and_tuned_hyperparameters( scores_df = column_harmonized_liverscr_df, + studyid_metadata = studyid_metadata, + Impute = Impute, + Round = Round, + reps=reps, + holdback=holdback, + Undersample = Undersample, + hyperparameter_tuning = hyperparameter_tuning, + error_correction_method = error_correction_method) + + + + rfData <- rfData_and_best_m[["rfData"]] + + # best.m input handling------------------------------------------------ + # if(is.null(best.m)){ + # best.m <- rfData_and_best_m[["best.m"]] + # } else { + # best.m <- best.m + # } + + + +return(Data = rfData) + +} diff --git a/R/get_imp_features_from_rf_model_cv_imp.R b/R/get_imp_features_from_rf_model_cv_imp.R new file mode 100644 index 0000000..3213a1e --- /dev/null +++ b/R/get_imp_features_from_rf_model_cv_imp.R @@ -0,0 +1,214 @@ + +get_imp_features_from_rf_model_with_cv <- function(scores_df=NULL, + Undersample = FALSE, + best.m = NULL, # any numeric value or call function to get it + testReps, # testRps must be at least 2; + indeterminateUpper, + indeterminateLower, + Type, + nTopImportance) { + + rfData <- scores_df + #--------------------------------------------------------------------- + # Initialize model performance metric trackers------------------------ + #--------------------------------------------------------------------- + + # custom function definition + `%ni%` <- Negate('%in%') + + Sensitivity <- NULL + Specificity <- NULL + PPV <- NULL + NPV <- NULL + Prevalence <- NULL + Accuracy <- NULL + nRemoved <- NULL + + + 
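  # Reviewer note (assumed input layout, inferred from the code below): scores_df
  # is expected to be the rfData returned by get_Data_formatted_for_ml(), i.e. a
  # first column "Target_Organ" coded as a factor with levels c(1, 0)
  # (1 = liver positive) followed by numeric liver-score columns, one row per
  # STUDYID. A minimal toy example of that shape (score column names are
  # hypothetical):
  #   scores_df <- data.frame(
  #     Target_Organ  = factor(c(1, 0, 1, 0), levels = c(1, 0)),
  #     avg_alt_zscore = c(2.1, 0.3, 1.8, 0.1),
  #     avg_mi_score   = c(3, 0, 2, 0)
  #   )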
+  #-----------------doing cross-validation--------------------------
+  #------------------------------------------------------------------
+
+  #-----create and prepare the "rfTestData" data frame for storing predictions----
+  rfTestData <- rfData
+
+  # replace the existing column names with simple numeric identifiers
+  colnames(rfTestData) <- seq(ncol(rfTestData))
+
+  # empty the data frame
+  for (j in seq(ncol(rfTestData))) {
+    rfTestData[, j] <- NA
+  }
+
+  # keep only the first two columns so rfTestData retains a consistent
+  # structure for storing predictions in subsequent iterations of the loop
+  rfTestData <- rfTestData[, 1:2]
+
+  # remove 'gini' left over from a previous run
+  if (exists('gini')) {rm(gini)}
+
+  #-------------------------------------------------------------------
+  # model building and testing----------------------------------------
+  #-------------------------------------------------------------------
+
+  # Iterate through test repetitions----------------------------------
+  for (i in seq(testReps)) {
+    if (i == 1) {
+      sampleIndicies <- seq(nrow(rfData))
+    }
+    if (i < testReps) {
+      ind <- sample(seq(nrow(rfData)), floor((nrow(rfData)/testReps)-1), replace = F)
+      sampleIndicies <- sampleIndicies[which(sampleIndicies %ni% ind)]
+    } else {
+      ind <- sampleIndicies
+    }
+
+    trainIndex <- which(seq(nrow(rfData)) %ni% ind)
+    testIndex <- ind
+
+    # ind <- sample(2, nrow(rfData), replace = T, prob = c((1- testHoldBack), testHoldBack))
+    train <- rfData[trainIndex,]
+
+    test <- rfData[testIndex,]
+
+    # rfAll <- randomForest::randomForest(Target_Organ ~ ., data=rfData, mtry = best.m,
+    #                                     importance = F, ntree = 500, proximity = T)
+
+    # Perform undersampling if enabled
+    if (Undersample == T) {
+      posIndex <- which(train[,1] == 1)
+      nPos <- length(posIndex)
+      # trainIndex <- c(posIndex, sample(which(train[,1] == 0), nPos, replace = F))
+      trainIndex <- c(posIndex, sample(which(train[,1] == 0), nPos, replace = T))
+      train <- train[trainIndex,]
+      test <- rbind(train[-trainIndex,], test)
+    }
+
+    # model building with the current iteration's training data
+    # Train Random Forest model ("mtry" is the randomForest tuning parameter)----
+    rf <- randomForest::randomForest(Target_Organ ~ ., data=train, mtry = best.m,
+                                     importance = T, ntree = 500, proximity = T)
+
+    print(rf)
+
+    #----------------------------------------------------------------------
+    # Predict probabilities on the test data with the current model
+    # (NOTE: this prediction step was flagged as problematic during development)
+    #----------------------------------------------------------------------
+
+    p2r <- stats::predict(rf, test, type = 'prob')[,1]
+
+    # Store these predictions in a structured data frame
+    rfTestData[names(p2r), i] <- as.numeric(p2r)
+
+    #--------------------------------------------------------------------------
+    # Identify indeterminate predictions and track their proportion
+    # in each iteration
+    #--------------------------------------------------------------------------
+
+    indeterminateIndex <- which((p2r <
indeterminateUpper)&(p2r > indeterminateLower)) + + #Calculating the Proportion of Indeterminate Predictions + #Sets the indeterminate predictions to NA, effectively marking them + #as missing or invalid. + nRemoved <- c(nRemoved, length(indeterminateIndex)/length(p2r)) + + #Handling Indeterminate Predictions + p2r[indeterminateIndex] <- NA + + #Rounding the Predictions: + p2r <- round(p2r) + + + # Compute confusion matrix and extract metrics using "caret" package---- + + Results <- caret::confusionMatrix(factor(p2r, levels = c(1, 0)), factor(test$Target_Organ, levels = c(1, 0))) + Sensitivity <- c(Sensitivity, Results$byClass[['Sensitivity']]) + Specificity <- c(Specificity, Results$byClass[['Specificity']]) + PPV <- c(PPV, Results$byClass[['Pos Pred Value']]) + NPV <- c(NPV, Results$byClass[['Neg Pred Value']]) + Prevalence <- c(Prevalence, Results$byClass[['Prevalence']]) + Accuracy <- c(Accuracy, Results$byClass[['Balanced Accuracy']]) + + + # Aggregate Gini importance scores + giniTmp <- randomForest::importance(rf, type = Type) + if (exists('gini')) { + gini <- cbind(gini, giniTmp) + } else { + gini <- giniTmp + } + } + + + #------------------------------------------------------------------------ + # Performance Summary + #------------------------------------------------------------------------- + + PerformanceMatrix <- cbind(Sensitivity, + Specificity, + PPV, + NPV, + Prevalence, + Accuracy, + nRemoved) + PerformanceSummary <- colMeans(PerformanceMatrix, na.rm = T) + print(PerformanceSummary) + + #------------------------------------------------------------------------- + # Feature Importance------------------------------------------------------ + #------------------------------------------------------------------------- + + print("Feature Importance (Mean Decrease):") + print(sort(rowMeans(gini), decreasing = T)) + + + #------------------------------------------------------------------------- + # Top Important Features-------------------------------------------------- + #-------------------------------------------------------------------------- + imp <- as.matrix(rowMeans(gini)[1:nTopImportance]) + if (Type == 1) { + colnames(imp) <- 'MeanDecreaseAccuracy' + } else { + colnames(imp) <- 'MeanDecreaseGini' + } + ord <- order(imp[,1]) + + # #------------------------------------------------------------------------ + # # Dotchart for top Variable Importance + # #------------------------------------------------------------------------ + # dotchart(imp[ord, 1], xlab = colnames(imp)[1], ylab = "", + # main = paste0('Top ', nrow(imp), ' - Variable Importance'))#, xlim = c(xmin, max(imp[, i]))) + # # varImpPlot(rf, + # # sort = T, + # # n.var = 20, + # # main = "Top 20 - Variable Importance") +print(".............................................................................") + print(PerformanceSummary) + + return(list( + performance_metrics = PerformanceSummary, # Aggregated performance metrics + feature_importance = imp, # Top n features by importance + raw_results = list( # Raw data for debugging or extended analysis + sensitivity = Sensitivity, + specificity = Specificity, + accuracy = Accuracy, + gini_scores = gini + ) + )) + + } diff --git a/R/get_rf_input_param_list_output_cv_imp.R b/R/get_rf_input_param_list_output_cv_imp.R index 8f2ca12..b0bbfdf 100644 --- a/R/get_rf_input_param_list_output_cv_imp.R +++ b/R/get_rf_input_param_list_output_cv_imp.R @@ -15,6 +15,7 @@ get_rf_input_param_list_output_cv_imp <- function(path_db, Undersample = FALSE, hyperparameter_tuning = FALSE, 
error_correction_method, # = must be 'Flip' or "Prune' or 'None' + best.m = NULL, #rf mytr parameter testReps , # at least 2 indeterminateUpper, indeterminateLower, @@ -73,21 +74,27 @@ get_rf_input_param_list_output_cv_imp <- function(path_db, } } + #----------------------------------------------------------------------- + # if studyid_metadata is not provided then use the data frame to + # creae a data frame with two columns "STUDYID" and "Target_Organ" -calculated_liver_scores <- get_liver_om_lb_mi_tox_score_list(studyid_or_studyids = studyid_or_studyids, + #------------------------------------------------------------------------- + + # get_liver_om_lb_mi_tox_score_list( + calculated_liver_scores <- get_liver_om_lb_mi_tox_score_list(studyid_or_studyids = studyid_or_studyids, path_db = path_db, fake_study = fake_study, use_xpt_file = use_xpt_file, output_individual_scores = TRUE, output_zscore_by_USUBJID = FALSE) -# Harmonize the column -column_harmonized_liverscr_df <- get_col_harmonized_scores_df(liver_score_data_frame = calculated_liver_scores, + # Harmonize the column + column_harmonized_liverscr_df <- get_col_harmonized_scores_df(liver_score_data_frame = calculated_liver_scores, Round = Round) -rfData_and_best_m <- get_ml_data_and_tuned_hyperparameters( scores_df = column_harmonized_liverscr_df, + rfData_and_best_m <- get_ml_data_and_tuned_hyperparameters( scores_df = column_harmonized_liverscr_df, studyid_metadata = studyid_metadata, Impute = Impute, Round = Round, @@ -99,17 +106,24 @@ rfData_and_best_m <- get_ml_data_and_tuned_hyperparameters( scores_df = column_h -rfData <- rfData_and_best_m[["rfData"]] -best.m <- rfData_and_best_m[["best.m"]] + rfData <- rfData_and_best_m[["rfData"]] -train_and_evaluate_rf_model <- get_rf_model_output_cv_imp(scores_df = rfData, - Undersample = Undersample, - best.m = best.m , - testReps = testReps, - indeterminateUpper = indeterminateUpper, - indeterminateLower = indeterminateLower, - Type = Type , - nTopImportance = nTopImportance) + # best.m input handling------------------------------------------------ + if(is.null(best.m)){ + best.m <- rfData_and_best_m[["best.m"]] + } else { + best.m <- best.m + } + + + train_and_evaluate_rf_model <- get_rf_model_with_cv (scores_data_df = rfData, + Undersample = Undersample, + best.m = best.m , + testReps = testReps, + indeterminateUpper = indeterminateUpper, + indeterminateLower = indeterminateLower, + Type = Type , + nTopImportance = nTopImportance) diff --git a/R/get_rf_model_with_cv.R b/R/get_rf_model_with_cv.R new file mode 100644 index 0000000..fea7876 --- /dev/null +++ b/R/get_rf_model_with_cv.R @@ -0,0 +1,219 @@ + +get_rf_model_with_cv <- function(Data, + Undersample = FALSE, + best.m = NULL, # any numeric value or call function to get it + testReps, # testRps must be at least 2; + Type) { + + +# This functin must need a data input. +# This funcitno is is designed to work with the chanin way. 
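# Reviewer note (sketch, not part of this patch): best.m is the mtry value
# handed to randomForest(). When it is NULL, one option is to derive it from
# the out-of-bag error with randomForest::tuneRF, assuming the first column of
# Data is the Target_Organ response, e.g.:
#   if (is.null(best.m)) {
#     tuned  <- randomForest::tuneRF(x = Data[, -1], y = Data[, 1],
#                                    ntreeTry = 500, stepFactor = 1.5,
#                                    improve = 0.01, trace = FALSE, plot = FALSE)
#     best.m <- tuned[which.min(tuned[, "OOBError"]), "mtry"]
#   }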
+
+# A second function, get_rf_input_param_list_output_cv_imp(), accepts the full
+# list of input parameters and builds the Data argument internally.
+
+  rfData <- Data
+  #---------------------------------------------------------------------
+  # Initialize model performance metric trackers------------------------
+  #---------------------------------------------------------------------
+
+  # custom function definition
+  `%ni%` <- Negate('%in%')
+
+  Sensitivity <- NULL
+  Specificity <- NULL
+  PPV <- NULL
+  NPV <- NULL
+  Prevalence <- NULL
+  Accuracy <- NULL
+  #nRemoved <- NULL
+
+  #-----------------doing cross-validation--------------------------
+  #------------------------------------------------------------------
+
+  #-----create and prepare the "rfTestData" data frame for storing predictions----
+  rfTestData <- rfData
+
+  # replace the existing column names with simple numeric identifiers
+  colnames(rfTestData) <- seq(ncol(rfTestData))
+
+  # empty the data frame
+  for (j in seq(ncol(rfTestData))) {
+    rfTestData[, j] <- NA
+  }
+
+  # keep only the first two columns so rfTestData retains a consistent
+  # structure for storing predictions in subsequent iterations of the loop
+  rfTestData <- rfTestData[, 1:2]
+
+  # remove 'gini' from the previous iteration
+  #if (exists('gini')) {rm(gini)}
+
+  #-------------------------------------------------------------------
+  # model building and testing----------------------------------------
+  #-------------------------------------------------------------------
+
+  # Iterate through test repetitions----------------------------------
+  for (i in seq(testReps)) {
+    if (i == 1) {
+      sampleIndicies <- seq(nrow(rfData))
+    }
+    if (i < testReps) {
+      ind <- sample(seq(nrow(rfData)), floor((nrow(rfData)/testReps)-1), replace = F)
+      sampleIndicies <- sampleIndicies[which(sampleIndicies %ni% ind)]
+    } else {
+      ind <- sampleIndicies
+    }
+
+    trainIndex <- which(seq(nrow(rfData)) %ni% ind)
+    testIndex <- ind
+
+    # ind <- sample(2, nrow(rfData), replace = T, prob = c((1- testHoldBack), testHoldBack))
+    train <- rfData[trainIndex,]
+
+    test <- rfData[testIndex,]
+
+    # rfAll <- randomForest::randomForest(Target_Organ ~ ., data=rfData, mtry = best.m,
+    #                                     importance = F, ntree = 500, proximity = T)
+
+    # Perform undersampling if enabled
+    if (Undersample == T) {
+      posIndex <- which(train[,1] == 1)
+      nPos <- length(posIndex)
+      # trainIndex <- c(posIndex, sample(which(train[,1] == 0), nPos, replace = F))
+      trainIndex <- c(posIndex, sample(which(train[,1] == 0), nPos, replace = T))
+      train <- train[trainIndex,]
+      test <- rbind(train[-trainIndex,], test)
+    }
+
+    # model building with the current iteration's training data
+    # Train Random Forest model ("mtry" is the randomForest tuning parameter)----
+    rf <- randomForest::randomForest(Target_Organ ~ ., data=train, mtry = best.m,
+                                     importance = T, ntree = 500, proximity = T)
+
+    print(rf)
+
+    #----------------------------------------------------------------------
+    # Predict probabilities on the test data with the current model
+    # (NOTE: this prediction step was flagged as problematic during development)
+    #----------------------------------------------------------------------
+
+    p2r <- stats::predict(rf, test, type = 'prob')[,1]
+
+    # Store these predictions in a structured data frame
+    rfTestData[names(p2r), i] <- as.numeric(p2r)
+
+    # Round the predictions to 0/1 class calls:
+    p2r <- round(p2r)
+
+    # Compute confusion matrix and extract metrics using the "caret" package----
+
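    # Reviewer note: with factor levels c(1, 0), caret::confusionMatrix() treats
    # "1" (a positive liver call) as the positive class by default, so the
    # Sensitivity/PPV collected below refer to the liver-positive class and
    # Specificity/NPV to the negative class. Results$byClass is a named numeric
    # vector, so the extraction could equivalently be done in one step, e.g.:
    #   Results$byClass[c("Sensitivity", "Specificity", "Pos Pred Value",
    #                     "Neg Pred Value", "Prevalence", "Balanced Accuracy")]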
Results <- caret::confusionMatrix(factor(p2r, levels = c(1, 0)), factor(test$Target_Organ, levels = c(1, 0))) + Sensitivity <- c(Sensitivity, Results$byClass[['Sensitivity']]) + Specificity <- c(Specificity, Results$byClass[['Specificity']]) + PPV <- c(PPV, Results$byClass[['Pos Pred Value']]) + NPV <- c(NPV, Results$byClass[['Neg Pred Value']]) + Prevalence <- c(Prevalence, Results$byClass[['Prevalence']]) + Accuracy <- c(Accuracy, Results$byClass[['Balanced Accuracy']]) + + + # # Aggregate Gini importance scores + # giniTmp <- randomForest::importance(rf, type = Type) + # if (exists('gini')) { + # gini <- cbind(gini, giniTmp) + # } else { + # gini <- giniTmp + # } + } + + + #------------------------------------------------------------------------ + # Performance Summary + #------------------------------------------------------------------------- + + PerformanceMatrix <- cbind(Sensitivity, + Specificity, + PPV, + NPV, + Prevalence, + Accuracy) + PerformanceSummary <- colMeans(PerformanceMatrix, na.rm = T) + print(PerformanceSummary) + + # #------------------------------------------------------------------------- + # # Feature Importance------------------------------------------------------ + # #------------------------------------------------------------------------- + # + # print("Feature Importance (Mean Decrease):") + # print(sort(rowMeans(gini), decreasing = T)) + # + # + # #------------------------------------------------------------------------- + # # Top Important Features-------------------------------------------------- + # #-------------------------------------------------------------------------- + # imp <- as.matrix(rowMeans(gini)[1:nTopImportance]) + # if (Type == 1) { + # colnames(imp) <- 'MeanDecreaseAccuracy' + # } else { + # colnames(imp) <- 'MeanDecreaseGini' + # } + # ord <- order(imp[,1]) + + # #------------------------------------------------------------------------ + # # Dotchart for top Variable Importance + # #------------------------------------------------------------------------ + # dotchart(imp[ord, 1], xlab = colnames(imp)[1], ylab = "", + # main = paste0('Top ', nrow(imp), ' - Variable Importance'))#, xlim = c(xmin, max(imp[, i]))) + # # varImpPlot(rf, + # # sort = T, + # # n.var = 20, + # # main = "Top 20 - Variable Importance") +print(".............................................................................") + print(PerformanceSummary) + + return(list( + performance_metrics = PerformanceSummary, # Aggregated performance metrics + #feature_importance = imp, # Top n features by importance + raw_results = list( # Raw data for debugging or extended analysis + sensitivity = Sensitivity, + specificity = Specificity, + accuracy = Accuracy + + ) + )) + +} + +#-------------------------------------------------------------------------- +#-------------------------------------------------------------------------- +#-------------------------------------------------------------------------- +#Identifying Indeterminate Predictions (Tracking Indeterminate Predictions) +#Keeps track of the proportion of indeterminate predictions in each iteration +#Proportion Tracking +#------------------------------------------------------------------------ +#------------------------------------------------------------------------ + +# indeterminateIndex <- which((p2r < indeterminateUpper)&(p2r > indeterminateLower)) +# +# #Calculating the Proportion of Indeterminate Predictions +# #Sets the indeterminate predictions to NA, effectively marking them +# #as missing or invalid. 
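# Reviewer note: the commented-out block here is the "indeterminate zone"
# filter used by get_imp_features_from_rf_model_with_cv() and the
# zone-exclusioned variant: predicted probabilities strictly between
# indeterminateLower and indeterminateUpper are set to NA (and the fraction
# removed is tracked) before the remaining predictions are rounded to 0/1.
# A compact equivalent sketch:
#   keep <- p2r <= indeterminateLower | p2r >= indeterminateUpper
#   nRemoved <- c(nRemoved, mean(!keep))
#   p2r[!keep] <- NA
#   p2r <- round(p2r)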
+# nRemoved <- c(nRemoved, length(indeterminateIndex)/length(p2r)) +# +# #Handling Indeterminate Predictions +# p2r[indeterminateIndex] <- NA +# +# #Rounding the Predictions: +# p2r <- round(p2r) + diff --git a/R/get_rf_model_cv_imp.R b/R/get_zone_exclusioned_rf_model_cv_imp.R similarity index 100% rename from R/get_rf_model_cv_imp.R rename to R/get_zone_exclusioned_rf_model_cv_imp.R diff --git a/inst/test_get_Data_formatted_for_ml.R b/inst/test_get_Data_formatted_for_ml.R new file mode 100644 index 0000000..90b9182 --- /dev/null +++ b/inst/test_get_Data_formatted_for_ml.R @@ -0,0 +1,32 @@ +rm(list = ls()) +devtools::load_all(".") + +# Initialize a connection to the SQLite database +#path_db='C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_merged_liver_not_liver.db' + +#path_db='C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_xpt' + +#studyid_or_studyids <- list.dirs(path_db , full.names = TRUE, recursive = FALSE) + +path_db = "C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/TestDB.db" + +# studyid_metadata <- read.csv("C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_80_MD.csv", +# header = TRUE, sep = ",", stringsAsFactors = FALSE) + +# ---------------------------------------------------- +# For this function we need ml format data +#---------------------------------------------------- +Data <- get_Data_formatted_for_ml(path_db=path_db, + rat_studies=TRUE, + studyid_metadata=NULL, + fake_study = FALSE, + use_xpt_file = FALSE, + Round = TRUE, + Impute = TRUE, + reps=1, + holdback=0.25, + Undersample =TRUE, + hyperparameter_tuning = FALSE, + error_correction_method = 'None' # = must be 'Flip' or "Prune' or 'None' + ) + diff --git a/inst/test_get_rf_model_with_cv.R b/inst/test_get_rf_model_with_cv.R new file mode 100644 index 0000000..b9c9782 --- /dev/null +++ b/inst/test_get_rf_model_with_cv.R @@ -0,0 +1,88 @@ +rm(list = ls()) +devtools::load_all(".") + +# Initialize a connection to the SQLite database +path_db='C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_merged_liver_not_liver.db' + +#path_db='C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_xpt' +#studyid_or_studyids <- list.dirs(path_db , full.names = TRUE, recursive = FALSE) +#path_db = "C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/TestDB.db" +studyid_metadata <- read.csv("C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_80_MD.csv", + header = TRUE, sep = ",", stringsAsFactors = FALSE) + +# ---------------------------------------------------- +# For this function we need ml format data +#---------------------------------------------------- +Data <- get_Data_formatted_for_ml(path_db=path_db, + rat_studies=FALSE, + studyid_metadata=studyid_metadata, + fake_study = TRUE, + use_xpt_file = FALSE, + Round = TRUE, + Impute = TRUE, + reps=1, + holdback=0.25, + Undersample =TRUE, + hyperparameter_tuning = FALSE, + error_correction_method = 'None' # = must be 'Flip' or "Prune' or 'None' +) + + +simple_rf_model <- get_rf_model_with_cv(Data = Data, + Undersample = FALSE, + best.m = NULL, # any numeric value or call function to get it + testReps=2, # testRps must be at least 2; + Type=1) + + + +# rf_with_intermediate <- get_imp_features_from_rf_model_with_cv(scores_df=Data, +# Undersample = TRUE, +# best.m = 4, # any numeric value or call function to get it +# testReps=2, # testRps must be at least 2; +# indeterminateUpper=0.75, +# indeterminateLower=0.25, +# Type=1, +# 
nTopImportance=20) + + + + + + + + +#rf_model <- get_random_forest_model_amin2(Data=rf_Data) + + + +# # Create a connection to the database +# dbtoken <- DBI::dbConnect(RSQLite::SQLite(), dbname = path_db) +# +# # Retrieve the STUDYID column from the dm table +# query <- "SELECT STUDYID FROM dm" +# studyid_data <- DBI::dbGetQuery(dbtoken, query) +# +# # Extract unique STUDYID values +# unique_studyids <- unique(studyid_data$STUDYID) +# +# # Disconnect from the database +# DBI::dbDisconnect(dbtoken) +# +# studyid_or_studyids <- unique_studyids + +#studyid_or_studyids <- list.dirs(path_db , full.names = TRUE, recursive = FALSE) + +# #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +# rm(list = ls()) +# devtools::load_all(".") +# path_db <- "C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_merged_liver_not_liver.db" +# studyid_metadata_path <- "C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_80_MD.csv" +# +# rfData_and_best_m <- get_rfData_and_best_m( +# path_db = path_db, +# studyid_metadata_path = studyid_metadata_path, +# fake_study = TRUE, +# Round = TRUE, +# Undersample = TRUE +# ) diff --git a/inst/test_train_eval_rf_wth_cv_imp.R b/inst/test_train_eval_rf_wth_cv_imp.R deleted file mode 100644 index 3655434..0000000 --- a/inst/test_train_eval_rf_wth_cv_imp.R +++ /dev/null @@ -1,66 +0,0 @@ -rm(list = ls()) -devtools::load_all(".") - -# Initialize a connection to the SQLite database -path_db='C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_merged_liver_not_liver.db' - -#path_db='C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_xpt' -#studyid_or_studyids <- list.dirs(path_db , full.names = TRUE, recursive = FALSE) -#path_db = "C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/TestDB.db" -studyid_metadata <- read.csv("C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_80_MD.csv", - header = TRUE, sep = ",", stringsAsFactors = FALSE) - - -train_and_evaluate_rf_model <- train_eval_rf_with_cv_imp(scores_df=NULL, - path_db = path_db, # need when scores_df is NULL and need to calculate it - studyid_metadata=studyid_metadata, - use_xpt_file = FALSE, - fake_study= TRUE, - Impute = FALSE, - Round =TRUE, - reps=1, # from 0 to any numeric number - holdback=0.25, # either 1 or fraction value like 0.75 etc. 
- Undersample = FALSE, - hyperparameter_tuning = FALSE, - error_correction_method = 'None', - best.m = NULL, # any numeric value or call function to get it - testReps=1, # testRps must be at least 2; - indeterminateUpper = .75, - indeterminateLower = .25, - Type=1, - nTopImportance=20) - -#rf_model <- get_random_forest_model_amin2(Data=rf_Data) - - - -# # Create a connection to the database -# dbtoken <- DBI::dbConnect(RSQLite::SQLite(), dbname = path_db) -# -# # Retrieve the STUDYID column from the dm table -# query <- "SELECT STUDYID FROM dm" -# studyid_data <- DBI::dbGetQuery(dbtoken, query) -# -# # Extract unique STUDYID values -# unique_studyids <- unique(studyid_data$STUDYID) -# -# # Disconnect from the database -# DBI::dbDisconnect(dbtoken) -# -# studyid_or_studyids <- unique_studyids - -#studyid_or_studyids <- list.dirs(path_db , full.names = TRUE, recursive = FALSE) - -# #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ -# rm(list = ls()) -# devtools::load_all(".") -# path_db <- "C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_merged_liver_not_liver.db" -# studyid_metadata_path <- "C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_80_MD.csv" -# -# rfData_and_best_m <- get_rfData_and_best_m( -# path_db = path_db, -# studyid_metadata_path = studyid_metadata_path, -# fake_study = TRUE, -# Round = TRUE, -# Undersample = TRUE -# )
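For reviewers, the end-to-end flow exercised by the new test scripts above condenses to the sketch below (the database path is a placeholder; argument values mirror inst/test_get_rf_model_with_cv.R):

devtools::load_all(".")

# placeholder path to a SEND SQLite database (the test script uses fake study data)
path_db <- "path/to/fake_merged_liver_not_liver.db"

# Step 1: build the ML-ready data frame (one row per STUDYID: Target_Organ + liver scores)
Data <- get_Data_formatted_for_ml(path_db = path_db,
                                  rat_studies = FALSE,
                                  studyid_metadata = NULL,   # generated internally when NULL
                                  fake_study = TRUE,
                                  use_xpt_file = FALSE,
                                  Round = TRUE,
                                  Impute = TRUE,
                                  reps = 1,
                                  holdback = 0.25,
                                  Undersample = TRUE,
                                  hyperparameter_tuning = FALSE,
                                  error_correction_method = "None")  # 'Flip', 'Prune' or 'None'

# Step 2: cross-validated random forest on the formatted data
simple_rf_model <- get_rf_model_with_cv(Data = Data,
                                        Undersample = FALSE,
                                        best.m = NULL,   # or a numeric mtry value
                                        testReps = 2,    # must be at least 2
                                        Type = 1)

# Averaged Sensitivity, Specificity, PPV, NPV, Prevalence and (balanced) Accuracy
simple_rf_model$performance_metrics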