reprtree package installed

aminuldu07 · Dec 30, 2024 · 8fb4e96 · 8fb4e96
1 parent 43c135c
commit 8fb4e96
Show file tree

Hide file tree

Showing 27 changed files with 807 additions and 55 deletions.
diff --git a/R/get_repeat_dose_parallel_studyids.R → R/get_0repeat_dose_parallel_studyids.R b/R/get_repeat_dose_parallel_studyids.R → R/get_0repeat_dose_parallel_studyids.R
diff --git a/R/get_imp_features_from_rf_model_cv_imp.R → R/get_10zone_exclusioned_rf_model_with_cv.R b/R/get_imp_features_from_rf_model_cv_imp.R → R/get_10zone_exclusioned_rf_model_with_cv.R
@@ -1,14 +1,13 @@
 
-get_imp_features_from_rf_model_with_cv <- function(scores_df=NULL,
-                                                  Undersample = FALSE,
-                                                  best.m = NULL, # any numeric value or call function to get it
-                                                  testReps, # testRps must be at least 2;
-                                                  indeterminateUpper,
-                                                  indeterminateLower,
-                                                  Type,
-                                                  nTopImportance) {
-
-  rfData <- scores_df
+get_zone_exclusioned_rf_model_with_cv <- function(Data=NULL, #scores_df
+                                      Undersample = FALSE,
+                                      best.m = NULL, # any numeric value or call function to get it
+                                      testReps, # testRps must be at least 2;
+                                      indeterminateUpper,
+                                      indeterminateLower,
+                                      Type) {
+browser()
+    rfData <- Data #rfData <- scores_df
     #---------------------------------------------------------------------
     # Initialize model performance metric trackers------------------------
     #---------------------------------------------------------------------
@@ -91,7 +90,7 @@ get_imp_features_from_rf_model_with_cv <- function(scores_df=NULL,
 
       #train_data_two <- train
 
-browser()
+
       #model building with current iteration train data
       # Train Random Forest model--------------------------------------------
       rf <- randomForest::randomForest(Target_Organ ~ ., data=train, mytry = best.m,
@@ -146,12 +145,12 @@ browser()
 
 
       # Aggregate Gini importance scores
-      giniTmp <-  randomForest::importance(rf, type = Type)
-      if (exists('gini')) {
-        gini <- cbind(gini, giniTmp)
-      } else {
-        gini <- giniTmp
-      }
+      # giniTmp <-  randomForest::importance(rf, type = Type)
+      # if (exists('gini')) {
+      #   gini <- cbind(gini, giniTmp)
+      # } else {
+      #   gini <- giniTmp
+      # }
     }
 
 
@@ -173,20 +172,20 @@ browser()
     # Feature Importance------------------------------------------------------
     #-------------------------------------------------------------------------
 
-    print("Feature Importance (Mean Decrease):")
-    print(sort(rowMeans(gini), decreasing = T))
+    # print("Feature Importance (Mean Decrease):")
+    # print(sort(rowMeans(gini), decreasing = T))
 
 
     #-------------------------------------------------------------------------
     # Top Important Features--------------------------------------------------
     #--------------------------------------------------------------------------
-    imp <- as.matrix(rowMeans(gini)[1:nTopImportance])
-    if (Type == 1) {
-      colnames(imp) <- 'MeanDecreaseAccuracy'
-    } else {
-      colnames(imp) <- 'MeanDecreaseGini'
-    }
-    ord <- order(imp[,1])
+    # imp <- as.matrix(rowMeans(gini)[1:nTopImportance])
+    # if (Type == 1) {
+    #   colnames(imp) <- 'MeanDecreaseAccuracy'
+    # } else {
+    #   colnames(imp) <- 'MeanDecreaseGini'
+    # }
+    # ord <- order(imp[,1])
 
     # #------------------------------------------------------------------------
     # # Dotchart for top Variable Importance
@@ -202,12 +201,10 @@ print(".........................................................................
 
     return(list(
       performance_metrics = PerformanceSummary,  # Aggregated performance metrics
-      feature_importance = imp,                  # Top n features by importance
       raw_results = list(                        # Raw data for debugging or extended analysis
         sensitivity = Sensitivity,
         specificity = Specificity,
-        accuracy = Accuracy,
-        gini_scores = gini
+        accuracy = Accuracy
       )
     ))
 

diff --git a/R/get_11imp_features_from_rf_model_with_cv.R b/R/get_11imp_features_from_rf_model_with_cv.R
@@ -0,0 +1,150 @@
+
+get_imp_features_from_rf_model_with_cv <- function(Data=NULL, #scores_df
+                                                  Undersample = FALSE,
+                                                  best.m = NULL, # any numeric value or call function to get it
+                                                  testReps, # testRps must be at least 2;
+                                                  Type,
+                                                  nTopImportance) {
+
+
+    rfData <- Data #rfData <- scores_df
+    #---------------------------------------------------------------------
+    # Initialize model performance metric trackers------------------------
+    #---------------------------------------------------------------------
+
+    # custom function definition
+    `%ni%` <- Negate('%in%')
+
+    # Initialize model performance metric trackers
+    Sensitivity <- NULL
+    Specificity <- NULL
+    PPV <- NULL
+    NPV <- NULL
+    Prevalence <- NULL
+    Accuracy <- NULL
+    nRemoved <- NULL
+
+
+    #-----------------doing cross-validation--------------------------
+    #-----------------------------------------------------------------
+    #------------------------------------------------------------------
+
+    #-----create and prepare "`rfTestData data` frame" for storing predictions----
+    rfTestData <- rfData
+
+    #replaces the existing column names with simple numeric identifiers
+    colnames(rfTestData) <- seq(ncol(rfTestData))
+
+    #emptying the data frame.
+    for (j in seq(ncol(rfTestData))) {
+      rfTestData[,j] <- NA
+    }
+
+    #prepares rfTestData to maintain a consistent structure with the necessary
+    #columns for storing predictions in subsequent iterations of the loop
+    rfTestData <- rfTestData[,1:2] # Keep structure for predictions
+
+    #remove 'gini' from the previous iteration
+    if (exists('gini')) {rm(gini)}
+
+
+    #-------------------------------------------------------------------
+    # model building and testing----------------------------------------
+    #-------------------------------------------------------------------
+
+
+    # Perform cross-validation with test repetitions
+    # Iterate through test repetitions----------------------------------
+    for (i in seq(testReps)) {
+      if (i == 1) {
+        sampleIndicies <- seq(nrow(rfData))
+      }
+      if (i < testReps) {
+        ind <- sample(seq(nrow(rfData)), floor((nrow(rfData)/testReps)-1), replace = F)
+        sampleIndicies <- sampleIndicies[which(sampleIndicies %ni% ind)]
+      } else {
+        ind <- sampleIndicies
+      }
+
+      trainIndex <- which(seq(nrow(rfData)) %ni% ind)
+      testIndex <- ind
+
+      # Extract train and test data--------------------------------
+      # ind <- sample(2, nrow(rfData), replace = T, prob = c((1- testHoldBack), testHoldBack))
+      train <- rfData[trainIndex,]
+      test <- rfData[testIndex,]
+
+      # rfAll <- randomForest::randomForest(Target_Organ ~ ., data=rfData, mytry = best.m,
+      #                                     importance = F, ntree = 500, proximity = T)
+
+
+      # Perform under sampling if enabled
+      if (Undersample == T) {
+        posIndex <- which(train[,1] == 1)
+        nPos <- length(posIndex)
+        # trainIndex <- c(posIndex, sample(which(train[,1] == 0), nPos, replace = F))
+        trainIndex <- c(posIndex, sample(which(train[,1] == 0), nPos, replace = T))
+        train <- train[trainIndex,]
+        test <- rbind(train[-trainIndex,], test)
+      }
+
+
+      # Train Random Forest model with current iteration's train data
+      rf <- randomForest::randomForest(Target_Organ ~ ., data=train, mytry = best.m,
+                                       importance = T, ntree = 500, proximity = T)
+
+      print(rf)
+
+
+      # Calculate Gini importance scores for the model
+      giniTmp <-  randomForest::importance(rf, type = Type)
+
+      # Aggregate Gini importance scores across iterations
+      if (exists('gini')) {
+        gini <- cbind(gini, giniTmp)
+      } else {
+        gini <- giniTmp
+      }
+    }
+
+
+
+    #-------------------------------------------------------------------------
+    # Feature Importance------------------------------------------------------
+    #-------------------------------------------------------------------------
+
+    print("Feature Importance (Mean Decrease):")
+    print(sort(rowMeans(gini), decreasing = T))
+
+
+    #-------------------------------------------------------------------------
+    # Top Important Features--------------------------------------------------
+    #--------------------------------------------------------------------------
+    # Get the top n important features based on Gini importance
+    imp <- as.matrix(rowMeans(gini)[1:nTopImportance])
+    if (Type == 1) {
+      colnames(imp) <- 'MeanDecreaseAccuracy'
+    } else {
+      colnames(imp) <- 'MeanDecreaseGini'
+    }
+    ord <- order(imp[,1])
+
+    # #------------------------------------------------------------------------
+    # # Dotchart for top Variable Importance
+    # #------------------------------------------------------------------------
+    dotchart(imp[ord, 1], xlab = colnames(imp)[1], ylab = "",
+             main = paste0('Top ', nrow(imp), ' - Variable Importance'))#, xlim = c(xmin, max(imp[, i])))
+
+    # varImpPlot(rf,
+    #            sort = T,
+    #            n.var = 20,
+    #            main = "Top 20 - Variable Importance")
+print(".............................................................................")
+
+    return(list(
+
+        gini_scores = gini
+      )
+    )
+
+  }
diff --git a/R/get_auc_curve_with_rf_model.R → R/get_12auc_curve_with_rf_model.R b/R/get_auc_curve_with_rf_model.R → R/get_12auc_curve_with_rf_model.R
@@ -125,7 +125,7 @@ get_auc_curve_with_rf_model  <- function(Data = NULL, # Input data frame for tra
 
     #Data <- column_harmonized_liverscr_df
 
-    rfData_and_best_m <- get_ml_data_and_tuned_hyperparameters( scores_df = column_harmonized_liverscr_df,
+    rfData_and_best_m <- get_ml_data_and_tuned_hyperparameters( Data = column_harmonized_liverscr_df,
                                                                 studyid_metadata = studyid_metadata,
                                                                 Impute = Impute,
                                                                 Round = Round,
@@ -137,6 +137,7 @@ get_auc_curve_with_rf_model  <- function(Data = NULL, # Input data frame for tra
 
   }
 
+
   # reassignment of the data
   rfData <- rfData_and_best_m[["rfData"]]
 
@@ -148,9 +149,8 @@ get_auc_curve_with_rf_model  <- function(Data = NULL, # Input data frame for tra
   }
 
 
-  # Train a Random Forest model using the specified mtry value
-  rfAll <- randomForest::randomForest(Target_Organ ~ ., data = rfData, mytry = best.m,
-                                      importance = FALSE, ntree = 500, proximity = TRUE)
+
+
 
   # Predict probabilities and calculate AUC
   pred1 <- stats::predict(rfAll, type = "prob")

diff --git a/R/get_histogram_barplot.R → R/get_13histogram_barplot.R b/R/get_histogram_barplot.R → R/get_13histogram_barplot.R
diff --git a/R/get_compile_data.R → R/get_1compile_data.R b/R/get_compile_data.R → R/get_1compile_data.R
diff --git a/R/get_bw_score.R → R/get_2bw_score.R b/R/get_bw_score.R → R/get_2bw_score.R
diff --git a/R/get_livertobw_score.R → R/get_3livertobw_score.R b/R/get_livertobw_score.R → R/get_3livertobw_score.R
diff --git a/R/get_lb_score.R → R/get_4lb_score.R b/R/get_lb_score.R → R/get_4lb_score.R
diff --git a/R/get_mi_score.R → R/get_5mi_score.R b/R/get_mi_score.R → R/get_5mi_score.R
diff --git a/R/get_liver_om_lb_mi_tox_score_list.R → R/get_6liver_om_lb_mi_tox_score_list.R b/R/get_liver_om_lb_mi_tox_score_list.R → R/get_6liver_om_lb_mi_tox_score_list.R
diff --git a/R/get_col_harmonized_scores_df.R → R/get_7col_harmonized_scores_df.R b/R/get_col_harmonized_scores_df.R → R/get_7col_harmonized_scores_df.R
diff --git a/R/get_ml_data_and_tuned_hyperparameters.R → R/get_8ml_data_and_tuned_hyperparameters.R b/R/get_ml_data_and_tuned_hyperparameters.R → R/get_8ml_data_and_tuned_hyperparameters.R
@@ -36,7 +36,7 @@
 #'   Undersample = TRUE
 #' )
 
-get_ml_data_and_tuned_hyperparameters <- function(scores_df,
+get_ml_data_and_tuned_hyperparameters <- function(Data, # Data == "scores_df"
                                                   studyid_metadata,
                                                   Impute = FALSE,
                                                   Round =FALSE,
@@ -46,7 +46,9 @@ get_ml_data_and_tuned_hyperparameters <- function(scores_df,
                                                   hyperparameter_tuning = FALSE,
                                                   error_correction_method = NULL) { # # Choose: "Flip" or "Prune" or "None"
 
-  input_scores_df <- scores_df
+  input_scores_df <- Data #input_scores_df <- scores_df
+
+
   metadata_df <- studyid_metadata
 
 

diff --git a/R/get_rf_model_with_cv.R → R/get_9rf_model_with_cv.R b/R/get_rf_model_with_cv.R → R/get_9rf_model_with_cv.R
@@ -194,6 +194,59 @@ print(".........................................................................
     ))
 
 }
+
+# #Add a special case for testReps = 1 that directly splits data into train and test sets without looping or iterative sampling.
+# if (testReps == 1) {
+#   # Use a single random split (e.g., 70% train, 30% test)
+#   set.seed(123)  # Ensure reproducibility
+#   trainIndex <- sample(seq(nrow(rfData)), size = floor(0.7 * nrow(rfData)), replace = FALSE)
+#   testIndex <- setdiff(seq(nrow(rfData)), trainIndex)
+#
+#   train <- rfData[trainIndex, ]
+#   test <- rfData[testIndex, ]
+#
+# } else {
+#   # Multiple iterations (original logic)
+#   for (i in seq(testReps)) {
+#     if (i == 1) {
+#       sampleIndicies <- seq(nrow(rfData))
+#     }
+#     if (i < testReps) {
+#       ind <- sample(seq(nrow(rfData)), floor((nrow(rfData) / testReps) - 1), replace = FALSE)
+#       sampleIndicies <- sampleIndicies[which(sampleIndicies %ni% ind)]
+#     } else {
+#       ind <- sampleIndicies
+#     }
+#
+#     trainIndex <- which(seq(nrow(rfData)) %ni% ind)
+#     testIndex <- ind
+#
+#     train <- rfData[trainIndex, ]
+#     test <- rfData[testIndex, ]
+#   }
+# }
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 
 #--------------------------------------------------------------------------
 #--------------------------------------------------------------------------

diff --git a/R/get_rf_input_param_list_output_cv_imp.R → R/get_c2rf_input_param_list_output_cv_imp.R b/R/get_rf_input_param_list_output_cv_imp.R → R/get_c2rf_input_param_list_output_cv_imp.R
diff --git a/R/get_zone_exclusioned_rf_model_cv_imp.R → R/get_c3zone_exclusioned_rf_model_cv_imp.R b/R/get_zone_exclusioned_rf_model_cv_imp.R → R/get_c3zone_exclusioned_rf_model_cv_imp.R
diff --git a/R/get_Data_formatted_for_ml.R → R/get_c_1Data_formatted_for_ml_and_best.m.R b/R/get_Data_formatted_for_ml.R → R/get_c_1Data_formatted_for_ml_and_best.m.R
@@ -3,7 +3,7 @@
 
 
 
-get_Data_formatted_for_ml <- function(path_db,
+get_Data_formatted_for_ml_and_best.m <- function(path_db,
                                       rat_studies=FALSE,
                                       studyid_metadata=NULL,
                                       fake_study = FALSE,
@@ -134,7 +134,7 @@ get_Data_formatted_for_ml <- function(path_db,
 
 
 
-  rfData_and_best_m <- get_ml_data_and_tuned_hyperparameters( scores_df = column_harmonized_liverscr_df,
+  rfData_and_best_m <- get_ml_data_and_tuned_hyperparameters( Data = column_harmonized_liverscr_df,
                                                             studyid_metadata = studyid_metadata,
                                                             Impute = Impute,
                                                             Round = Round,
@@ -147,8 +147,9 @@ get_Data_formatted_for_ml <- function(path_db,
 
 
   rfData <- rfData_and_best_m[["rfData"]]
+  best.m <- rfData_and_best_m[["best.m"]]
 
-  # best.m input handling------------------------------------------------
+  # # best.m input handling------------------------------------------------
   # if(is.null(best.m)){
   #   best.m <- rfData_and_best_m[["best.m"]]
   #   } else {
@@ -157,6 +158,7 @@ get_Data_formatted_for_ml <- function(path_db,
 
 
 
-return(Data = rfData)
+return(list(Data = rfData,
+            best.m= best.m))
 
 }