diff --git a/R/get_repeat_dose_parallel_studyids.R b/R/get_0repeat_dose_parallel_studyids.R similarity index 100% rename from R/get_repeat_dose_parallel_studyids.R rename to R/get_0repeat_dose_parallel_studyids.R diff --git a/R/get_imp_features_from_rf_model_cv_imp.R b/R/get_10zone_exclusioned_rf_model_with_cv.R similarity index 85% rename from R/get_imp_features_from_rf_model_cv_imp.R rename to R/get_10zone_exclusioned_rf_model_with_cv.R index 3213a1e..6196177 100644 --- a/R/get_imp_features_from_rf_model_cv_imp.R +++ b/R/get_10zone_exclusioned_rf_model_with_cv.R @@ -1,14 +1,13 @@ -get_imp_features_from_rf_model_with_cv <- function(scores_df=NULL, - Undersample = FALSE, - best.m = NULL, # any numeric value or call function to get it - testReps, # testRps must be at least 2; - indeterminateUpper, - indeterminateLower, - Type, - nTopImportance) { - - rfData <- scores_df +get_zone_exclusioned_rf_model_with_cv <- function(Data=NULL, #scores_df + Undersample = FALSE, + best.m = NULL, # any numeric value or call function to get it + testReps, # testRps must be at least 2; + indeterminateUpper, + indeterminateLower, + Type) { +browser() + rfData <- Data #rfData <- scores_df #--------------------------------------------------------------------- # Initialize model performance metric trackers------------------------ #--------------------------------------------------------------------- @@ -91,7 +90,7 @@ get_imp_features_from_rf_model_with_cv <- function(scores_df=NULL, #train_data_two <- train -browser() + #model building with current iteration train data # Train Random Forest model-------------------------------------------- rf <- randomForest::randomForest(Target_Organ ~ ., data=train, mytry = best.m, @@ -146,12 +145,12 @@ browser() # Aggregate Gini importance scores - giniTmp <- randomForest::importance(rf, type = Type) - if (exists('gini')) { - gini <- cbind(gini, giniTmp) - } else { - gini <- giniTmp - } + # giniTmp <- randomForest::importance(rf, type = Type) + # if (exists('gini')) { + # gini <- cbind(gini, giniTmp) + # } else { + # gini <- giniTmp + # } } @@ -173,20 +172,20 @@ browser() # Feature Importance------------------------------------------------------ #------------------------------------------------------------------------- - print("Feature Importance (Mean Decrease):") - print(sort(rowMeans(gini), decreasing = T)) + # print("Feature Importance (Mean Decrease):") + # print(sort(rowMeans(gini), decreasing = T)) #------------------------------------------------------------------------- # Top Important Features-------------------------------------------------- #-------------------------------------------------------------------------- - imp <- as.matrix(rowMeans(gini)[1:nTopImportance]) - if (Type == 1) { - colnames(imp) <- 'MeanDecreaseAccuracy' - } else { - colnames(imp) <- 'MeanDecreaseGini' - } - ord <- order(imp[,1]) + # imp <- as.matrix(rowMeans(gini)[1:nTopImportance]) + # if (Type == 1) { + # colnames(imp) <- 'MeanDecreaseAccuracy' + # } else { + # colnames(imp) <- 'MeanDecreaseGini' + # } + # ord <- order(imp[,1]) # #------------------------------------------------------------------------ # # Dotchart for top Variable Importance @@ -202,12 +201,10 @@ print("......................................................................... return(list( performance_metrics = PerformanceSummary, # Aggregated performance metrics - feature_importance = imp, # Top n features by importance raw_results = list( # Raw data for debugging or extended analysis sensitivity = Sensitivity, specificity = Specificity, - accuracy = Accuracy, - gini_scores = gini + accuracy = Accuracy ) )) diff --git a/R/get_11imp_features_from_rf_model_with_cv.R b/R/get_11imp_features_from_rf_model_with_cv.R new file mode 100644 index 0000000..329be39 --- /dev/null +++ b/R/get_11imp_features_from_rf_model_with_cv.R @@ -0,0 +1,150 @@ + +get_imp_features_from_rf_model_with_cv <- function(Data=NULL, #scores_df + Undersample = FALSE, + best.m = NULL, # any numeric value or call function to get it + testReps, # testRps must be at least 2; + Type, + nTopImportance) { + + + rfData <- Data #rfData <- scores_df + #--------------------------------------------------------------------- + # Initialize model performance metric trackers------------------------ + #--------------------------------------------------------------------- + + # custom function definition + `%ni%` <- Negate('%in%') + + # Initialize model performance metric trackers + Sensitivity <- NULL + Specificity <- NULL + PPV <- NULL + NPV <- NULL + Prevalence <- NULL + Accuracy <- NULL + nRemoved <- NULL + + + #-----------------doing cross-validation-------------------------- + #----------------------------------------------------------------- + #------------------------------------------------------------------ + + #-----create and prepare "`rfTestData data` frame" for storing predictions---- + rfTestData <- rfData + + #replaces the existing column names with simple numeric identifiers + colnames(rfTestData) <- seq(ncol(rfTestData)) + + #emptying the data frame. + for (j in seq(ncol(rfTestData))) { + rfTestData[,j] <- NA + } + + #prepares rfTestData to maintain a consistent structure with the necessary + #columns for storing predictions in subsequent iterations of the loop + rfTestData <- rfTestData[,1:2] # Keep structure for predictions + + #remove 'gini' from the previous iteration + if (exists('gini')) {rm(gini)} + + + #------------------------------------------------------------------- + # model building and testing---------------------------------------- + #------------------------------------------------------------------- + + + # Perform cross-validation with test repetitions + # Iterate through test repetitions---------------------------------- + for (i in seq(testReps)) { + if (i == 1) { + sampleIndicies <- seq(nrow(rfData)) + } + if (i < testReps) { + ind <- sample(seq(nrow(rfData)), floor((nrow(rfData)/testReps)-1), replace = F) + sampleIndicies <- sampleIndicies[which(sampleIndicies %ni% ind)] + } else { + ind <- sampleIndicies + } + + trainIndex <- which(seq(nrow(rfData)) %ni% ind) + testIndex <- ind + + # Extract train and test data-------------------------------- + # ind <- sample(2, nrow(rfData), replace = T, prob = c((1- testHoldBack), testHoldBack)) + train <- rfData[trainIndex,] + test <- rfData[testIndex,] + + # rfAll <- randomForest::randomForest(Target_Organ ~ ., data=rfData, mytry = best.m, + # importance = F, ntree = 500, proximity = T) + + + # Perform under sampling if enabled + if (Undersample == T) { + posIndex <- which(train[,1] == 1) + nPos <- length(posIndex) + # trainIndex <- c(posIndex, sample(which(train[,1] == 0), nPos, replace = F)) + trainIndex <- c(posIndex, sample(which(train[,1] == 0), nPos, replace = T)) + train <- train[trainIndex,] + test <- rbind(train[-trainIndex,], test) + } + + + # Train Random Forest model with current iteration's train data + rf <- randomForest::randomForest(Target_Organ ~ ., data=train, mytry = best.m, + importance = T, ntree = 500, proximity = T) + + print(rf) + + + # Calculate Gini importance scores for the model + giniTmp <- randomForest::importance(rf, type = Type) + + # Aggregate Gini importance scores across iterations + if (exists('gini')) { + gini <- cbind(gini, giniTmp) + } else { + gini <- giniTmp + } + } + + + + #------------------------------------------------------------------------- + # Feature Importance------------------------------------------------------ + #------------------------------------------------------------------------- + + print("Feature Importance (Mean Decrease):") + print(sort(rowMeans(gini), decreasing = T)) + + + #------------------------------------------------------------------------- + # Top Important Features-------------------------------------------------- + #-------------------------------------------------------------------------- + # Get the top n important features based on Gini importance + imp <- as.matrix(rowMeans(gini)[1:nTopImportance]) + if (Type == 1) { + colnames(imp) <- 'MeanDecreaseAccuracy' + } else { + colnames(imp) <- 'MeanDecreaseGini' + } + ord <- order(imp[,1]) + + # #------------------------------------------------------------------------ + # # Dotchart for top Variable Importance + # #------------------------------------------------------------------------ + dotchart(imp[ord, 1], xlab = colnames(imp)[1], ylab = "", + main = paste0('Top ', nrow(imp), ' - Variable Importance'))#, xlim = c(xmin, max(imp[, i]))) + + # varImpPlot(rf, + # sort = T, + # n.var = 20, + # main = "Top 20 - Variable Importance") +print(".............................................................................") + + return(list( + + gini_scores = gini + ) + ) + + } diff --git a/R/get_auc_curve_with_rf_model.R b/R/get_12auc_curve_with_rf_model.R similarity index 96% rename from R/get_auc_curve_with_rf_model.R rename to R/get_12auc_curve_with_rf_model.R index df632ba..4e14ff3 100644 --- a/R/get_auc_curve_with_rf_model.R +++ b/R/get_12auc_curve_with_rf_model.R @@ -125,7 +125,7 @@ get_auc_curve_with_rf_model <- function(Data = NULL, # Input data frame for tra #Data <- column_harmonized_liverscr_df - rfData_and_best_m <- get_ml_data_and_tuned_hyperparameters( scores_df = column_harmonized_liverscr_df, + rfData_and_best_m <- get_ml_data_and_tuned_hyperparameters( Data = column_harmonized_liverscr_df, studyid_metadata = studyid_metadata, Impute = Impute, Round = Round, @@ -137,6 +137,7 @@ get_auc_curve_with_rf_model <- function(Data = NULL, # Input data frame for tra } + # reassignment of the data rfData <- rfData_and_best_m[["rfData"]] @@ -148,9 +149,8 @@ get_auc_curve_with_rf_model <- function(Data = NULL, # Input data frame for tra } - # Train a Random Forest model using the specified mtry value - rfAll <- randomForest::randomForest(Target_Organ ~ ., data = rfData, mytry = best.m, - importance = FALSE, ntree = 500, proximity = TRUE) + + # Predict probabilities and calculate AUC pred1 <- stats::predict(rfAll, type = "prob") diff --git a/R/get_histogram_barplot.R b/R/get_13histogram_barplot.R similarity index 100% rename from R/get_histogram_barplot.R rename to R/get_13histogram_barplot.R diff --git a/R/get_compile_data.R b/R/get_1compile_data.R similarity index 100% rename from R/get_compile_data.R rename to R/get_1compile_data.R diff --git a/R/get_bw_score.R b/R/get_2bw_score.R similarity index 100% rename from R/get_bw_score.R rename to R/get_2bw_score.R diff --git a/R/get_livertobw_score.R b/R/get_3livertobw_score.R similarity index 100% rename from R/get_livertobw_score.R rename to R/get_3livertobw_score.R diff --git a/R/get_lb_score.R b/R/get_4lb_score.R similarity index 100% rename from R/get_lb_score.R rename to R/get_4lb_score.R diff --git a/R/get_mi_score.R b/R/get_5mi_score.R similarity index 100% rename from R/get_mi_score.R rename to R/get_5mi_score.R diff --git a/R/get_liver_om_lb_mi_tox_score_list.R b/R/get_6liver_om_lb_mi_tox_score_list.R similarity index 100% rename from R/get_liver_om_lb_mi_tox_score_list.R rename to R/get_6liver_om_lb_mi_tox_score_list.R diff --git a/R/get_col_harmonized_scores_df.R b/R/get_7col_harmonized_scores_df.R similarity index 100% rename from R/get_col_harmonized_scores_df.R rename to R/get_7col_harmonized_scores_df.R diff --git a/R/get_ml_data_and_tuned_hyperparameters.R b/R/get_8ml_data_and_tuned_hyperparameters.R similarity index 98% rename from R/get_ml_data_and_tuned_hyperparameters.R rename to R/get_8ml_data_and_tuned_hyperparameters.R index 87324d1..c86e1b6 100644 --- a/R/get_ml_data_and_tuned_hyperparameters.R +++ b/R/get_8ml_data_and_tuned_hyperparameters.R @@ -36,7 +36,7 @@ #' Undersample = TRUE #' ) -get_ml_data_and_tuned_hyperparameters <- function(scores_df, +get_ml_data_and_tuned_hyperparameters <- function(Data, # Data == "scores_df" studyid_metadata, Impute = FALSE, Round =FALSE, @@ -46,7 +46,9 @@ get_ml_data_and_tuned_hyperparameters <- function(scores_df, hyperparameter_tuning = FALSE, error_correction_method = NULL) { # # Choose: "Flip" or "Prune" or "None" - input_scores_df <- scores_df + input_scores_df <- Data #input_scores_df <- scores_df + + metadata_df <- studyid_metadata diff --git a/R/get_rf_model_with_cv.R b/R/get_9rf_model_with_cv.R similarity index 88% rename from R/get_rf_model_with_cv.R rename to R/get_9rf_model_with_cv.R index fea7876..9120638 100644 --- a/R/get_rf_model_with_cv.R +++ b/R/get_9rf_model_with_cv.R @@ -194,6 +194,59 @@ print("......................................................................... )) } + +# #Add a special case for testReps = 1 that directly splits data into train and test sets without looping or iterative sampling. +# if (testReps == 1) { +# # Use a single random split (e.g., 70% train, 30% test) +# set.seed(123) # Ensure reproducibility +# trainIndex <- sample(seq(nrow(rfData)), size = floor(0.7 * nrow(rfData)), replace = FALSE) +# testIndex <- setdiff(seq(nrow(rfData)), trainIndex) +# +# train <- rfData[trainIndex, ] +# test <- rfData[testIndex, ] +# +# } else { +# # Multiple iterations (original logic) +# for (i in seq(testReps)) { +# if (i == 1) { +# sampleIndicies <- seq(nrow(rfData)) +# } +# if (i < testReps) { +# ind <- sample(seq(nrow(rfData)), floor((nrow(rfData) / testReps) - 1), replace = FALSE) +# sampleIndicies <- sampleIndicies[which(sampleIndicies %ni% ind)] +# } else { +# ind <- sampleIndicies +# } +# +# trainIndex <- which(seq(nrow(rfData)) %ni% ind) +# testIndex <- ind +# +# train <- rfData[trainIndex, ] +# test <- rfData[testIndex, ] +# } +# } + + + + + + + + + + + + + + + + + + + + + + #-------------------------------------------------------------------------- #-------------------------------------------------------------------------- diff --git a/R/get_rf_input_param_list_output_cv_imp.R b/R/get_c2rf_input_param_list_output_cv_imp.R similarity index 100% rename from R/get_rf_input_param_list_output_cv_imp.R rename to R/get_c2rf_input_param_list_output_cv_imp.R diff --git a/R/get_zone_exclusioned_rf_model_cv_imp.R b/R/get_c3zone_exclusioned_rf_model_cv_imp.R similarity index 100% rename from R/get_zone_exclusioned_rf_model_cv_imp.R rename to R/get_c3zone_exclusioned_rf_model_cv_imp.R diff --git a/R/get_Data_formatted_for_ml.R b/R/get_c_1Data_formatted_for_ml_and_best.m.R similarity index 94% rename from R/get_Data_formatted_for_ml.R rename to R/get_c_1Data_formatted_for_ml_and_best.m.R index 9e3be6b..750b707 100644 --- a/R/get_Data_formatted_for_ml.R +++ b/R/get_c_1Data_formatted_for_ml_and_best.m.R @@ -3,7 +3,7 @@ -get_Data_formatted_for_ml <- function(path_db, +get_Data_formatted_for_ml_and_best.m <- function(path_db, rat_studies=FALSE, studyid_metadata=NULL, fake_study = FALSE, @@ -134,7 +134,7 @@ get_Data_formatted_for_ml <- function(path_db, - rfData_and_best_m <- get_ml_data_and_tuned_hyperparameters( scores_df = column_harmonized_liverscr_df, + rfData_and_best_m <- get_ml_data_and_tuned_hyperparameters( Data = column_harmonized_liverscr_df, studyid_metadata = studyid_metadata, Impute = Impute, Round = Round, @@ -147,8 +147,9 @@ get_Data_formatted_for_ml <- function(path_db, rfData <- rfData_and_best_m[["rfData"]] + best.m <- rfData_and_best_m[["best.m"]] - # best.m input handling------------------------------------------------ + # # best.m input handling------------------------------------------------ # if(is.null(best.m)){ # best.m <- rfData_and_best_m[["best.m"]] # } else { @@ -157,6 +158,7 @@ get_Data_formatted_for_ml <- function(path_db, -return(Data = rfData) +return(list(Data = rfData, + best.m= best.m)) } diff --git a/R/get_reprtree_from_rf_model .R b/R/get_reprtree_from_rf_model .R new file mode 100644 index 0000000..5defa1a --- /dev/null +++ b/R/get_reprtree_from_rf_model .R @@ -0,0 +1,93 @@ + +get_reprtree_from_rf_model <- function ( Data=NULL, + path_db, + rat_studies=FALSE, + studyid_metadata=NULL, + fake_study = FALSE, + use_xpt_file = FALSE, + Round = FALSE, + Impute = FALSE, + reps, + holdback, + Undersample = FALSE, + hyperparameter_tuning = FALSE, + error_correction_method) { # = must be 'Flip' or "Prune' or 'None' + + browser() + if(is.null(Data)){ + data_and_best.m <- get_Data_formatted_for_ml_and_best.m(path_db=path_db, + rat_studies=rat_studies, + studyid_metadata=studyid_metadata, + fake_study = fake_study, + use_xpt_file = use_xpt_file, + Round = Round, + Impute = Impute, + reps=reps, + holdback=holdback, + Undersample = Undersample, + hyperparameter_tuning = hyperparameter_tuning, + error_correction_method=error_correction_method) # = must be 'Flip' or "Prune' or 'None' + + } + browser() + Data <- data_and_best.m[["Data"]] + best.m <- data_and_best.m[["best.m"]] + + # First way--------------------------------------------------------- + # Use a single random split (e.g., 70% train, 30% test) + set.seed(123) # Ensure reproducibility + trainIndex <- sample(seq(nrow(Data)), size = floor(0.7 * nrow(Data)), replace = FALSE) + testIndex <- setdiff(seq(nrow(Data)), trainIndex) + + train <- Data[trainIndex, ] + test <- Data[testIndex, ] + + if (Undersample == T) { + + posIndex <- which(train[,1] == 1) + nPos <- length(posIndex) + trainIndex <- c(posIndex, sample(which(train[,1] == 0), nPos, replace = T)) + train <- train[trainIndex,] + test <- rbind(train[-trainIndex,], test) + } + + # # Second wat---------------------------------------------------- + # # Use a single random split (e.g., 70% train, 30% test) + # set.seed(123) # Ensure reproducibility + # trainIndex <- sample(seq(nrow(rfData)), size = floor(0.7 * nrow(rfData)), replace = FALSE) + # testIndex <- setdiff(seq(nrow(rfData)), trainIndex) + # + # train <- rfData[trainIndex, ] + # test <- rfData[testIndex, ] + # + # # Handle undersampling if required + # if (Undersample == TRUE) { + # + # # Find indices of positive samples in the training data + # posIndex <- which(train[, 1] == 1) + # nPos <- length(posIndex) + # + # # Sample the same number of negative samples (undersampling) + # negIndex <- sample(which(train[, 1] == 0), nPos, replace = FALSE) + # + # # Create balanced training data + # balancedIndex <- c(posIndex, negIndex) + # train <- train[balancedIndex, ] + # } + + + # Train a Random Forest model using the specified mtry value + rfAll <- randomForest::randomForest(Target_Organ ~ ., + data = Data, + mytry = best.m, + importance = FALSE, + ntree = 500, + proximity = TRUE) + + ReprTree <- reprtree::ReprTree(rfAll, + train, + metric='d2') + + plot.reprtree(ReprTree(rfAll, train, metric='d2')) + +} diff --git a/inst/Needed_function/xx_get_11imp_features_from_rf_model_with_cv.R b/inst/Needed_function/xx_get_11imp_features_from_rf_model_with_cv.R new file mode 100644 index 0000000..84e2e17 --- /dev/null +++ b/inst/Needed_function/xx_get_11imp_features_from_rf_model_with_cv.R @@ -0,0 +1,215 @@ + +xxget_imp_features_from_rf_model_with_cv <- function(Data=NULL, #scores_df + Undersample = FALSE, + best.m = NULL, # any numeric value or call function to get it + testReps, # testRps must be at least 2; + indeterminateUpper, + indeterminateLower, + Type, + nTopImportance) { + + rfData <- Data #rfData <- scores_df + #--------------------------------------------------------------------- + # Initialize model performance metric trackers------------------------ + #--------------------------------------------------------------------- + + # custom function definition + `%ni%` <- Negate('%in%') + + # Initialize model performance metric trackers + Sensitivity <- NULL + Specificity <- NULL + PPV <- NULL + NPV <- NULL + Prevalence <- NULL + Accuracy <- NULL + nRemoved <- NULL + + + #-----------------doing cross-validation-------------------------- + #----------------------------------------------------------------- + #------------------------------------------------------------------ + + #-----create and prepare "`rfTestData data` frame" for storing predictions---- + rfTestData <- rfData + + #replaces the existing column names with simple numeric identifiers + colnames(rfTestData) <- seq(ncol(rfTestData)) + + #emptying the data frame. + for (j in seq(ncol(rfTestData))) { + rfTestData[,j] <- NA + } + + #prepares rfTestData to maintain a consistent structure with the necessary + #columns for storing predictions in subsequent iterations of the loop + rfTestData <- rfTestData[,1:2] # Keep structure for predictions + + #remove 'gini' from the previous iteration + if (exists('gini')) {rm(gini)} + + + #------------------------------------------------------------------- + # model building and testing---------------------------------------- + #------------------------------------------------------------------- + + + # Perform cross-validation with test repetitions + # Iterate through test repetitions---------------------------------- + for (i in seq(testReps)) { + if (i == 1) { + sampleIndicies <- seq(nrow(rfData)) + } + if (i < testReps) { + ind <- sample(seq(nrow(rfData)), floor((nrow(rfData)/testReps)-1), replace = F) + sampleIndicies <- sampleIndicies[which(sampleIndicies %ni% ind)] + } else { + ind <- sampleIndicies + } + + trainIndex <- which(seq(nrow(rfData)) %ni% ind) + testIndex <- ind + + # Extract train and test data-------------------------------- + # ind <- sample(2, nrow(rfData), replace = T, prob = c((1- testHoldBack), testHoldBack)) + train <- rfData[trainIndex,] + test <- rfData[testIndex,] + + # rfAll <- randomForest::randomForest(Target_Organ ~ ., data=rfData, mytry = best.m, + # importance = F, ntree = 500, proximity = T) + + + # Perform under sampling if enabled + if (Undersample == T) { + posIndex <- which(train[,1] == 1) + nPos <- length(posIndex) + # trainIndex <- c(posIndex, sample(which(train[,1] == 0), nPos, replace = F)) + trainIndex <- c(posIndex, sample(which(train[,1] == 0), nPos, replace = T)) + train <- train[trainIndex,] + test <- rbind(train[-trainIndex,], test) + } + +browser() + # Train Random Forest model with current iteration's train data + rf <- randomForest::randomForest(Target_Organ ~ ., data=train, mytry = best.m, + importance = T, ntree = 500, proximity = T) + + print(rf) + + # #---------------------------------------------------------------------- + # #predictions with current model with current test data + # # @___________________this_line_has_problems_______ + # # Predict probabilities on test data + # #---------------------------------------------------------------------- + # + # p2r <- stats::predict(rf, test, type = 'prob')[,1] + # + # #Store these predictions in a structured data frame + # rfTestData[names(p2r), i] <- as.numeric(p2r) + # + # + # #-------------------------------------------------------------------------- + # #-------------------------------------------------------------------------- + # #-------------------------------------------------------------------------- + # #Identifying Indeterminate Predictions (Tracking Indeterminate Predictions) + # #Keeps track of the proportion of indeterminate predictions in each iteration + # #Proportion Tracking + # #------------------------------------------------------------------------ + # #------------------------------------------------------------------------ + # + # indeterminateIndex <- which((p2r < indeterminateUpper)&(p2r > indeterminateLower)) + # + # #Calculating the Proportion of Indeterminate Predictions + # #Sets the indeterminate predictions to NA, effectively marking them + # #as missing or invalid. + # nRemoved <- c(nRemoved, length(indeterminateIndex)/length(p2r)) + # + # #Handling Indeterminate Predictions + # p2r[indeterminateIndex] <- NA + # + # #Rounding the Predictions: + # p2r <- round(p2r) + + + # # Compute confusion matrix and extract metrics using "caret" package---- + # + # Results <- caret::confusionMatrix(factor(p2r, levels = c(1, 0)), factor(test$Target_Organ, levels = c(1, 0))) + # Sensitivity <- c(Sensitivity, Results$byClass[['Sensitivity']]) + # Specificity <- c(Specificity, Results$byClass[['Specificity']]) + # PPV <- c(PPV, Results$byClass[['Pos Pred Value']]) + # NPV <- c(NPV, Results$byClass[['Neg Pred Value']]) + # Prevalence <- c(Prevalence, Results$byClass[['Prevalence']]) + # Accuracy <- c(Accuracy, Results$byClass[['Balanced Accuracy']]) + + + + # Calculate Gini importance scores for the model + giniTmp <- randomForest::importance(rf, type = Type) + + # Aggregate Gini importance scores across iterations + if (exists('gini')) { + gini <- cbind(gini, giniTmp) + } else { + gini <- giniTmp + } + } + + + #------------------------------------------------------------------------ + # Performance Summary + #------------------------------------------------------------------------- + + PerformanceMatrix <- cbind(Sensitivity, + Specificity, + PPV, + NPV, + Prevalence, + Accuracy, + nRemoved) + PerformanceSummary <- colMeans(PerformanceMatrix, na.rm = T) + print(PerformanceSummary) + + #------------------------------------------------------------------------- + # Feature Importance------------------------------------------------------ + #------------------------------------------------------------------------- + + print("Feature Importance (Mean Decrease):") + print(sort(rowMeans(gini), decreasing = T)) + + + #------------------------------------------------------------------------- + # Top Important Features-------------------------------------------------- + #-------------------------------------------------------------------------- + imp <- as.matrix(rowMeans(gini)[1:nTopImportance]) + if (Type == 1) { + colnames(imp) <- 'MeanDecreaseAccuracy' + } else { + colnames(imp) <- 'MeanDecreaseGini' + } + ord <- order(imp[,1]) + + # #------------------------------------------------------------------------ + # # Dotchart for top Variable Importance + # #------------------------------------------------------------------------ + dotchart(imp[ord, 1], xlab = colnames(imp)[1], ylab = "", + main = paste0('Top ', nrow(imp), ' - Variable Importance'))#, xlim = c(xmin, max(imp[, i]))) + + varImpPlot(rf, + sort = T, + n.var = 20, + main = "Top 20 - Variable Importance") +print(".............................................................................") + print(PerformanceSummary) + + return(list( + performance_metrics = PerformanceSummary, # Aggregated performance metrics + feature_importance = imp, # Top n features by importance + raw_results = list( # Raw data for debugging or extended analysis + sensitivity = Sensitivity, + specificity = Specificity, + accuracy = Accuracy, + gini_scores = gini + ) + )) + + } diff --git a/R/get_all_LB_TESTCD_score.R b/inst/R_move_to_R/get_all_LB_TESTCD_score.R similarity index 100% rename from R/get_all_LB_TESTCD_score.R rename to inst/R_move_to_R/get_all_LB_TESTCD_score.R diff --git a/R/get_treatment_group_&_dose.R b/inst/R_move_to_R/get_treatment_group_&_dose.R similarity index 100% rename from R/get_treatment_group_&_dose.R rename to inst/R_move_to_R/get_treatment_group_&_dose.R diff --git a/inst/needed_important/xx_org_train_random_forest_model.R b/inst/needed_important/xx_org_train_random_forest_model.R index 86a47e4..4275151 100644 --- a/inst/needed_important/xx_org_train_random_forest_model.R +++ b/inst/needed_important/xx_org_train_random_forest_model.R @@ -435,8 +435,8 @@ get_random_forest_model <- function(Liver_get_liver_om_lb_mi_tox_score_list, # ind <- sample(2, nrow(rfData), replace = T, prob = c((1- testHoldBack), testHoldBack)) train <- rfData[trainIndex,] - train_data_two <- train - print(dim(train_data_two)) + #train_data_two <- train + #print(dim(train_data_two)) test <- rfData[testIndex,] @@ -453,9 +453,9 @@ get_random_forest_model <- function(Liver_get_liver_om_lb_mi_tox_score_list, train <- train[trainIndex,] test <- rbind(train[-trainIndex,], test) } - - train_data_two <- train - print(dim(train_data_two)) +# +# train_data_two <- train +# print(dim(train_data_two)) #model building with current train data diff --git a/inst/test_get_imp_features_from_rf_model_with_cv.R b/inst/test_get_imp_features_from_rf_model_with_cv.R new file mode 100644 index 0000000..2485c2a --- /dev/null +++ b/inst/test_get_imp_features_from_rf_model_with_cv.R @@ -0,0 +1,41 @@ +rm(list = ls()) +devtools::load_all(".") + +# Initialize a connection to the SQLite database +path_db='C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_merged_liver_not_liver.db' + +#path_db='C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_xpt' +#studyid_or_studyids <- list.dirs(path_db , full.names = TRUE, recursive = FALSE) +#path_db = "C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/TestDB.db" +studyid_metadata <- read.csv("C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_80_MD.csv", + header = TRUE, sep = ",", stringsAsFactors = FALSE) + +#---------------------------------------------------- +# For this function we need ml format data +#---------------------------------------------------- +Data <- get_Data_formatted_for_ml(path_db=path_db, + rat_studies=FALSE, + studyid_metadata=studyid_metadata, + fake_study = TRUE, + use_xpt_file = FALSE, + Round = TRUE, + Impute = TRUE, + reps=1, + holdback=0.25, + Undersample =TRUE, + hyperparameter_tuning = FALSE, + error_correction_method = 'None') # = must be 'Flip' or "Prune' or 'None' + + + +gini_imp <- get_imp_features_from_rf_model_with_cv(Data=Data, #scores_df + Undersample = FALSE, + best.m = NULL, # any numeric value or call function to get it + testReps=2, # testRps must be at least 2; + Type=1, + nTopImportance=20) + + + + + diff --git a/inst/test_get_reprtree_from_rf_model .R b/inst/test_get_reprtree_from_rf_model .R new file mode 100644 index 0000000..ed2b262 --- /dev/null +++ b/inst/test_get_reprtree_from_rf_model .R @@ -0,0 +1,89 @@ +rm(list = ls()) +devtools::load_all(".") + +# Initialize a connection to the SQLite database +path_db='C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_merged_liver_not_liver.db' + +#path_db='C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_xpt' +#studyid_or_studyids <- list.dirs(path_db , full.names = TRUE, recursive = FALSE) +#path_db = "C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/TestDB.db" +studyid_metadata <- read.csv("C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_80_MD.csv", + header = TRUE, sep = ",", stringsAsFactors = FALSE) + +# ---------------------------------------------------- +# For this function we need ml format data +#---------------------------------------------------- +reprtreeddd <- get_reprtree_from_rf_model(Data=NULL, + path_db=path_db, + rat_studies=FALSE, + studyid_metadata=studyid_metadata, + fake_study = TRUE, + use_xpt_file = FALSE, + Round = TRUE, + Impute = TRUE, + reps=1, + holdback=0.25, + Undersample =TRUE, + hyperparameter_tuning = FALSE, + error_correction_method = 'None' # = must be 'Flip' or "Prune' or 'None' +) + + +# simple_rf_model <- get_rf_model_with_cv(Data = Data, +# Undersample = FALSE, +# best.m = NULL, # any numeric value or call function to get it +# testReps=2, # testRps must be at least 2; +# Type=1) + + + +# rf_with_intermediate <- get_imp_features_from_rf_model_with_cv(scores_df=Data, +# Undersample = TRUE, +# best.m = 4, # any numeric value or call function to get it +# testReps=2, # testRps must be at least 2; +# indeterminateUpper=0.75, +# indeterminateLower=0.25, +# Type=1, +# nTopImportance=20) + + + + + + + + +#rf_model <- get_random_forest_model_amin2(Data=rf_Data) + + + +# # Create a connection to the database +# dbtoken <- DBI::dbConnect(RSQLite::SQLite(), dbname = path_db) +# +# # Retrieve the STUDYID column from the dm table +# query <- "SELECT STUDYID FROM dm" +# studyid_data <- DBI::dbGetQuery(dbtoken, query) +# +# # Extract unique STUDYID values +# unique_studyids <- unique(studyid_data$STUDYID) +# +# # Disconnect from the database +# DBI::dbDisconnect(dbtoken) +# +# studyid_or_studyids <- unique_studyids + +#studyid_or_studyids <- list.dirs(path_db , full.names = TRUE, recursive = FALSE) + +# #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +# rm(list = ls()) +# devtools::load_all(".") +# path_db <- "C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_merged_liver_not_liver.db" +# studyid_metadata_path <- "C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_80_MD.csv" +# +# rfData_and_best_m <- get_rfData_and_best_m( +# path_db = path_db, +# studyid_metadata_path = studyid_metadata_path, +# fake_study = TRUE, +# Round = TRUE, +# Undersample = TRUE +# ) diff --git a/inst/test_get_zone_exclusioned_rf_model_with_cv.R b/inst/test_get_zone_exclusioned_rf_model_with_cv.R new file mode 100644 index 0000000..1bce923 --- /dev/null +++ b/inst/test_get_zone_exclusioned_rf_model_with_cv.R @@ -0,0 +1,110 @@ +rm(list = ls()) +devtools::load_all(".") + +# Initialize a connection to the SQLite database +path_db='C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_merged_liver_not_liver.db' + +#path_db='C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_xpt' +#studyid_or_studyids <- list.dirs(path_db , full.names = TRUE, recursive = FALSE) +#path_db = "C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/TestDB.db" +studyid_metadata <- read.csv("C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_80_MD.csv", + header = TRUE, sep = ",", stringsAsFactors = FALSE) + +# ---------------------------------------------------- +# For this function we need ml format data +#---------------------------------------------------- +Data <- get_Data_formatted_for_ml(path_db=path_db, + rat_studies=FALSE, + studyid_metadata=studyid_metadata, + fake_study = TRUE, + use_xpt_file = FALSE, + Round = TRUE, + Impute = TRUE, + reps=1, + holdback=0.25, + Undersample =TRUE, + hyperparameter_tuning = FALSE, + error_correction_method = 'None' # = must be 'Flip' or "Prune' or 'None' +) + +# +# simple_rf_model <- get_rf_model_with_cv(Data = Data, +# Undersample = FALSE, +# best.m = NULL, # any numeric value or call function to get it +# testReps=2, # testRps must be at least 2; +# Type=1) + + +get_zone_exclusioned_rf_model <- get_zone_exclusioned_rf_model_with_cv(Data=Data, #scores_df + Undersample = FALSE, + best.m = NULL, # any numeric value or call function to get it + testReps=2, # testRps must be at least 2; + indeterminateUpper=0.75, + indeterminateLower=0.25, + Type=1) + + + + + + + + + + + + + + + + +# rf_with_intermediate <- get_imp_features_from_rf_model_with_cv(scores_df=Data, +# Undersample = TRUE, +# best.m = 4, # any numeric value or call function to get it +# testReps=2, # testRps must be at least 2; +# indeterminateUpper=0.75, +# indeterminateLower=0.25, +# Type=1, +# nTopImportance=20) + + + + + + + + +#rf_model <- get_random_forest_model_amin2(Data=rf_Data) + + + +# # Create a connection to the database +# dbtoken <- DBI::dbConnect(RSQLite::SQLite(), dbname = path_db) +# +# # Retrieve the STUDYID column from the dm table +# query <- "SELECT STUDYID FROM dm" +# studyid_data <- DBI::dbGetQuery(dbtoken, query) +# +# # Extract unique STUDYID values +# unique_studyids <- unique(studyid_data$STUDYID) +# +# # Disconnect from the database +# DBI::dbDisconnect(dbtoken) +# +# studyid_or_studyids <- unique_studyids + +#studyid_or_studyids <- list.dirs(path_db , full.names = TRUE, recursive = FALSE) + +# #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +# rm(list = ls()) +# devtools::load_all(".") +# path_db <- "C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_merged_liver_not_liver.db" +# studyid_metadata_path <- "C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_80_MD.csv" +# +# rfData_and_best_m <- get_rfData_and_best_m( +# path_db = path_db, +# studyid_metadata_path = studyid_metadata_path, +# fake_study = TRUE, +# Round = TRUE, +# Undersample = TRUE +# ) diff --git a/inst/vvc_rf_functions/histo.R b/inst/vvc_rf_functions/histo.R new file mode 100644 index 0000000..e69de29 diff --git a/inst/vvc_rf_functions/visualization.R b/inst/vvc_rf_functions/visualization.R index 67fc320..ee8810c 100644 --- a/inst/vvc_rf_functions/visualization.R +++ b/inst/vvc_rf_functions/visualization.R @@ -4,17 +4,17 @@ ###-----------------------@ROC-Curve-and-AUC------------------------------------ # @ROC-Curve-and-AUC~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -pred1= stats::predict(rfAll,type = "prob") -perf = ROCR::prediction(pred1[,1], levels(rfData[,1])[rfData[,1]]) -# 1. Area under curve -auc = ROCR::performance(perf, "auc") -AUC <- auc@y.values[[1]] -print(AUC) -# 2. True Positive and Negative Rate -pred3 = ROCR::performance(perf, "tpr","fpr") # check the ROCR packge assignment here -# 3. Plot the ROC curve -plot(pred3,main=paste0("ROC Curve for Random Forest (AUC = ", round(AUC, digits = 3), ")"),col=2,lwd=2) -abline(a=0,b=1,lwd=2,lty=2,col="gray") +# pred1= stats::predict(rfAll,type = "prob") +# perf = ROCR::prediction(pred1[,1], levels(rfData[,1])[rfData[,1]]) +# # 1. Area under curve +# auc = ROCR::performance(perf, "auc") +# AUC <- auc@y.values[[1]] +# print(AUC) +# # 2. True Positive and Negative Rate +# pred3 = ROCR::performance(perf, "tpr","fpr") # check the ROCR packge assignment here +# # 3. Plot the ROC curve +# plot(pred3,main=paste0("ROC Curve for Random Forest (AUC = ", round(AUC, digits = 3), ")"),col=2,lwd=2) +# abline(a=0,b=1,lwd=2,lty=2,col="gray")