From 43c135cca2bc305fd1b12b61d70cfd33a83eb3ae Mon Sep 17 00:00:00 2001 From: Md Aminul Islam Prodhan Date: Sun, 29 Dec 2024 10:51:38 -0600 Subject: [PATCH] rf updated --- R/get_Data_formatted_for_ml.R | 162 +++++++++++++ R/get_imp_features_from_rf_model_cv_imp.R | 214 +++++++++++++++++ R/get_rf_input_param_list_output_cv_imp.R | 42 ++-- R/get_rf_model_with_cv.R | 219 ++++++++++++++++++ ...=> get_zone_exclusioned_rf_model_cv_imp.R} | 0 inst/test_get_Data_formatted_for_ml.R | 32 +++ inst/test_get_rf_model_with_cv.R | 88 +++++++ inst/test_train_eval_rf_wth_cv_imp.R | 66 ------ 8 files changed, 743 insertions(+), 80 deletions(-) create mode 100644 R/get_Data_formatted_for_ml.R create mode 100644 R/get_imp_features_from_rf_model_cv_imp.R create mode 100644 R/get_rf_model_with_cv.R rename R/{get_rf_model_cv_imp.R => get_zone_exclusioned_rf_model_cv_imp.R} (100%) create mode 100644 inst/test_get_Data_formatted_for_ml.R create mode 100644 inst/test_get_rf_model_with_cv.R delete mode 100644 inst/test_train_eval_rf_wth_cv_imp.R diff --git a/R/get_Data_formatted_for_ml.R b/R/get_Data_formatted_for_ml.R new file mode 100644 index 0000000..9e3be6b --- /dev/null +++ b/R/get_Data_formatted_for_ml.R @@ -0,0 +1,162 @@ + + + + + +get_Data_formatted_for_ml <- function(path_db, + rat_studies=FALSE, + studyid_metadata=NULL, + fake_study = FALSE, + use_xpt_file = FALSE, + Round = FALSE, + Impute = FALSE, + reps, + holdback, + Undersample = FALSE, + hyperparameter_tuning = FALSE, + error_correction_method # = must be 'Flip' or "Prune' or 'None' + ){ + + # Process the database to retrieve the vector of "STUDYIDs"------------- + if(use_xpt_file){ + + studyid_or_studyids <- list.dirs(path_db , full.names = TRUE, recursive = FALSE) + + } else { + + if (fake_study) { + # Helper function to fetch data from SQLite database + fetch_domain_data <- function(db_connection, domain_name) { + # Convert domain name to uppercase + domain_name <- toupper(domain_name) + # Create SQL query statement + query_statement <- paste0('SELECT * FROM ', domain_name) + # Execute query and fetch the data + query_result <- DBI::dbGetQuery(db_connection, statement = query_statement) + # Return the result + query_result + } + # Establish a connection to the SQLite database + db_connection <- DBI::dbConnect(RSQLite::SQLite(), dbname = path_db) + + # Fetch data for required domains + dm <- fetch_domain_data(db_connection, 'dm') + + # Close the database connection + DBI::dbDisconnect(db_connection) + + # get the studyids from the dm table + studyid_or_studyids <- as.vector(unique(dm$STUDYID)) # unique STUDYIDS from DM table + + # Filter the fake data for the "rat_studies" + if(rat_studies){ + + studyid_or_studyids <- studyid_or_studyids + } + + #-------------------------------------------------------------------- + #-----------we can set logic here for rat studies in "fake data"---- + #-------------------------------------------------------------------- + + } else { + # For the real data in sqlite database + # filter for the repeat-dose and parallel studyids + + studyid_or_studyids <- get_repeat_dose_parallel_studyids(path_db=path_db, + rat_studies = rat_studies) + + } + } + + + # process the database to get the "studyid_metadata"------------ + if(is.null(studyid_metadata)) { + if(fake_study) { + # Extract study ID metadata + studyid_metadata <- dm[, "STUDYID", drop=FALSE] + + # Remove duplicates based on STUDYID + studyid_metadata <- studyid_metadata[!duplicated(studyid_metadata$STUDYID), , drop =FALSE] + + # Add a new column for Target_Organ + 
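      # Reviewer note (sketch, not part of this patch): sample() with
      # prob = c(0.5, 0.5) below yields an *approximately* even split of
      # "Liver" / "not_Liver" labels. If an exactly balanced fake label set is
      # preferred, a shuffled rep() would do it, e.g.:
      #   studyid_metadata$Target_Organ <- sample(
      #     rep(c("Liver", "not_Liver"), length.out = nrow(studyid_metadata)))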
studyid_metadata$Target_Organ <- NA + + # assign "Target_Organ" column values randomly + # randomly 50% of the value is Liver and rest are not_Liver + set.seed(123) # Set seed for reproducibility + rows_number <- nrow(studyid_metadata) # Number of rows + + # Randomly sample 50% for "Liver" and rest for "not_Liver" + studyid_metadata$Target_Organ <- sample(c("Liver", "not_Liver"), size = rows_number, replace = TRUE, prob = c(0.5, 0.5)) + + # View the result + + } else { + + # create "studyid_metadata" data frame from "studyid_or_studyids" vector + studyid_metadata <- data.frame(STUDYID = studyid_or_studyids) + + # Remove duplicates based on STUDYID + studyid_metadata <- studyid_metadata[!duplicated(studyid_metadata$STUDYID), , drop = FALSE] + + # Add a new column for Target_Organ + studyid_metadata$Target_Organ <- NA + + # assign "Target_Organ" column values randomly + # randomly 50% of the value is Liver and rest are not_Liver + set.seed(123) # Set seed for reproducibility + rows_number <- nrow(studyid_metadata) # Number of rows + + # Randomly sample 50% for "Liver" and rest for "not_Liver" + studyid_metadata$Target_Organ <- sample(c("Liver", "not_Liver"), size = rows_number, replace = TRUE, prob = c(0.5, 0.5)) + + } + } + + + #----------------------------------------------------------------------- + # if studyid_metadata is not provided then use the data frame to + # creae a data frame with two columns "STUDYID" and "Target_Organ" + + #------------------------------------------------------------------------- + + # get_liver_om_lb_mi_tox_score_list( + calculated_liver_scores <- get_liver_om_lb_mi_tox_score_list(studyid_or_studyids = studyid_or_studyids, + path_db = path_db, + fake_study = fake_study, + use_xpt_file = use_xpt_file, + output_individual_scores = TRUE, + output_zscore_by_USUBJID = FALSE) + + # Harmonize the column + column_harmonized_liverscr_df <- get_col_harmonized_scores_df(liver_score_data_frame = calculated_liver_scores, + Round = Round) + + + + rfData_and_best_m <- get_ml_data_and_tuned_hyperparameters( scores_df = column_harmonized_liverscr_df, + studyid_metadata = studyid_metadata, + Impute = Impute, + Round = Round, + reps=reps, + holdback=holdback, + Undersample = Undersample, + hyperparameter_tuning = hyperparameter_tuning, + error_correction_method = error_correction_method) + + + + rfData <- rfData_and_best_m[["rfData"]] + + # best.m input handling------------------------------------------------ + # if(is.null(best.m)){ + # best.m <- rfData_and_best_m[["best.m"]] + # } else { + # best.m <- best.m + # } + + + +return(Data = rfData) + +} diff --git a/R/get_imp_features_from_rf_model_cv_imp.R b/R/get_imp_features_from_rf_model_cv_imp.R new file mode 100644 index 0000000..3213a1e --- /dev/null +++ b/R/get_imp_features_from_rf_model_cv_imp.R @@ -0,0 +1,214 @@ + +get_imp_features_from_rf_model_with_cv <- function(scores_df=NULL, + Undersample = FALSE, + best.m = NULL, # any numeric value or call function to get it + testReps, # testRps must be at least 2; + indeterminateUpper, + indeterminateLower, + Type, + nTopImportance) { + + rfData <- scores_df + #--------------------------------------------------------------------- + # Initialize model performance metric trackers------------------------ + #--------------------------------------------------------------------- + + # custom function definition + `%ni%` <- Negate('%in%') + + Sensitivity <- NULL + Specificity <- NULL + PPV <- NULL + NPV <- NULL + Prevalence <- NULL + Accuracy <- NULL + nRemoved <- NULL + + + 
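  # Reviewer note (assumed input layout, inferred from the code below): scores_df
  # is expected to be the rfData returned by get_Data_formatted_for_ml(), i.e. a
  # first column "Target_Organ" coded as a factor with levels c(1, 0)
  # (1 = liver positive) followed by numeric liver-score columns, one row per
  # STUDYID. A minimal toy example of that shape (score column names are
  # hypothetical):
  #   scores_df <- data.frame(
  #     Target_Organ  = factor(c(1, 0, 1, 0), levels = c(1, 0)),
  #     avg_alt_zscore = c(2.1, 0.3, 1.8, 0.1),
  #     avg_mi_score   = c(3, 0, 2, 0)
  #   )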
+  #-----------------doing cross-validation--------------------------
+  #------------------------------------------------------------------
+
+  #-----create and prepare the "rfTestData" data frame for storing predictions----
+  rfTestData <- rfData
+
+  # replace the existing column names with simple numeric identifiers
+  colnames(rfTestData) <- seq(ncol(rfTestData))
+
+  # empty the data frame
+  for (j in seq(ncol(rfTestData))) {
+    rfTestData[, j] <- NA
+  }
+
+  # keep only the first two columns so rfTestData retains a consistent
+  # structure for storing predictions in subsequent iterations of the loop
+  rfTestData <- rfTestData[, 1:2]
+
+  # remove 'gini' left over from a previous run
+  if (exists('gini')) {rm(gini)}
+
+  #-------------------------------------------------------------------
+  # model building and testing----------------------------------------
+  #-------------------------------------------------------------------
+
+  # Iterate through test repetitions----------------------------------
+  for (i in seq(testReps)) {
+    if (i == 1) {
+      sampleIndicies <- seq(nrow(rfData))
+    }
+    if (i < testReps) {
+      ind <- sample(seq(nrow(rfData)), floor((nrow(rfData)/testReps)-1), replace = F)
+      sampleIndicies <- sampleIndicies[which(sampleIndicies %ni% ind)]
+    } else {
+      ind <- sampleIndicies
+    }
+
+    trainIndex <- which(seq(nrow(rfData)) %ni% ind)
+    testIndex <- ind
+
+    # ind <- sample(2, nrow(rfData), replace = T, prob = c((1- testHoldBack), testHoldBack))
+    train <- rfData[trainIndex,]
+
+    test <- rfData[testIndex,]
+
+    # rfAll <- randomForest::randomForest(Target_Organ ~ ., data=rfData, mtry = best.m,
+    #                                     importance = F, ntree = 500, proximity = T)
+
+    # Perform undersampling if enabled
+    if (Undersample == T) {
+      posIndex <- which(train[,1] == 1)
+      nPos <- length(posIndex)
+      # trainIndex <- c(posIndex, sample(which(train[,1] == 0), nPos, replace = F))
+      trainIndex <- c(posIndex, sample(which(train[,1] == 0), nPos, replace = T))
+      train <- train[trainIndex,]
+      test <- rbind(train[-trainIndex,], test)
+    }
+
+    # model building with the current iteration's training data
+    # Train Random Forest model ("mtry" is the randomForest tuning parameter)----
+    rf <- randomForest::randomForest(Target_Organ ~ ., data=train, mtry = best.m,
+                                     importance = T, ntree = 500, proximity = T)
+
+    print(rf)
+
+    #----------------------------------------------------------------------
+    # Predict probabilities on the test data with the current model
+    # (NOTE: this prediction step was flagged as problematic during development)
+    #----------------------------------------------------------------------
+
+    p2r <- stats::predict(rf, test, type = 'prob')[,1]
+
+    # Store these predictions in a structured data frame
+    rfTestData[names(p2r), i] <- as.numeric(p2r)
+
+    #--------------------------------------------------------------------------
+    # Identify indeterminate predictions and track their proportion
+    # in each iteration
+    #--------------------------------------------------------------------------
+
+    indeterminateIndex <- which((p2r <
indeterminateUpper)&(p2r > indeterminateLower)) + + #Calculating the Proportion of Indeterminate Predictions + #Sets the indeterminate predictions to NA, effectively marking them + #as missing or invalid. + nRemoved <- c(nRemoved, length(indeterminateIndex)/length(p2r)) + + #Handling Indeterminate Predictions + p2r[indeterminateIndex] <- NA + + #Rounding the Predictions: + p2r <- round(p2r) + + + # Compute confusion matrix and extract metrics using "caret" package---- + + Results <- caret::confusionMatrix(factor(p2r, levels = c(1, 0)), factor(test$Target_Organ, levels = c(1, 0))) + Sensitivity <- c(Sensitivity, Results$byClass[['Sensitivity']]) + Specificity <- c(Specificity, Results$byClass[['Specificity']]) + PPV <- c(PPV, Results$byClass[['Pos Pred Value']]) + NPV <- c(NPV, Results$byClass[['Neg Pred Value']]) + Prevalence <- c(Prevalence, Results$byClass[['Prevalence']]) + Accuracy <- c(Accuracy, Results$byClass[['Balanced Accuracy']]) + + + # Aggregate Gini importance scores + giniTmp <- randomForest::importance(rf, type = Type) + if (exists('gini')) { + gini <- cbind(gini, giniTmp) + } else { + gini <- giniTmp + } + } + + + #------------------------------------------------------------------------ + # Performance Summary + #------------------------------------------------------------------------- + + PerformanceMatrix <- cbind(Sensitivity, + Specificity, + PPV, + NPV, + Prevalence, + Accuracy, + nRemoved) + PerformanceSummary <- colMeans(PerformanceMatrix, na.rm = T) + print(PerformanceSummary) + + #------------------------------------------------------------------------- + # Feature Importance------------------------------------------------------ + #------------------------------------------------------------------------- + + print("Feature Importance (Mean Decrease):") + print(sort(rowMeans(gini), decreasing = T)) + + + #------------------------------------------------------------------------- + # Top Important Features-------------------------------------------------- + #-------------------------------------------------------------------------- + imp <- as.matrix(rowMeans(gini)[1:nTopImportance]) + if (Type == 1) { + colnames(imp) <- 'MeanDecreaseAccuracy' + } else { + colnames(imp) <- 'MeanDecreaseGini' + } + ord <- order(imp[,1]) + + # #------------------------------------------------------------------------ + # # Dotchart for top Variable Importance + # #------------------------------------------------------------------------ + # dotchart(imp[ord, 1], xlab = colnames(imp)[1], ylab = "", + # main = paste0('Top ', nrow(imp), ' - Variable Importance'))#, xlim = c(xmin, max(imp[, i]))) + # # varImpPlot(rf, + # # sort = T, + # # n.var = 20, + # # main = "Top 20 - Variable Importance") +print(".............................................................................") + print(PerformanceSummary) + + return(list( + performance_metrics = PerformanceSummary, # Aggregated performance metrics + feature_importance = imp, # Top n features by importance + raw_results = list( # Raw data for debugging or extended analysis + sensitivity = Sensitivity, + specificity = Specificity, + accuracy = Accuracy, + gini_scores = gini + ) + )) + + } diff --git a/R/get_rf_input_param_list_output_cv_imp.R b/R/get_rf_input_param_list_output_cv_imp.R index 8f2ca12..b0bbfdf 100644 --- a/R/get_rf_input_param_list_output_cv_imp.R +++ b/R/get_rf_input_param_list_output_cv_imp.R @@ -15,6 +15,7 @@ get_rf_input_param_list_output_cv_imp <- function(path_db, Undersample = FALSE, hyperparameter_tuning = FALSE, 
error_correction_method, # = must be 'Flip' or "Prune' or 'None' + best.m = NULL, #rf mytr parameter testReps , # at least 2 indeterminateUpper, indeterminateLower, @@ -73,21 +74,27 @@ get_rf_input_param_list_output_cv_imp <- function(path_db, } } + #----------------------------------------------------------------------- + # if studyid_metadata is not provided then use the data frame to + # creae a data frame with two columns "STUDYID" and "Target_Organ" -calculated_liver_scores <- get_liver_om_lb_mi_tox_score_list(studyid_or_studyids = studyid_or_studyids, + #------------------------------------------------------------------------- + + # get_liver_om_lb_mi_tox_score_list( + calculated_liver_scores <- get_liver_om_lb_mi_tox_score_list(studyid_or_studyids = studyid_or_studyids, path_db = path_db, fake_study = fake_study, use_xpt_file = use_xpt_file, output_individual_scores = TRUE, output_zscore_by_USUBJID = FALSE) -# Harmonize the column -column_harmonized_liverscr_df <- get_col_harmonized_scores_df(liver_score_data_frame = calculated_liver_scores, + # Harmonize the column + column_harmonized_liverscr_df <- get_col_harmonized_scores_df(liver_score_data_frame = calculated_liver_scores, Round = Round) -rfData_and_best_m <- get_ml_data_and_tuned_hyperparameters( scores_df = column_harmonized_liverscr_df, + rfData_and_best_m <- get_ml_data_and_tuned_hyperparameters( scores_df = column_harmonized_liverscr_df, studyid_metadata = studyid_metadata, Impute = Impute, Round = Round, @@ -99,17 +106,24 @@ rfData_and_best_m <- get_ml_data_and_tuned_hyperparameters( scores_df = column_h -rfData <- rfData_and_best_m[["rfData"]] -best.m <- rfData_and_best_m[["best.m"]] + rfData <- rfData_and_best_m[["rfData"]] -train_and_evaluate_rf_model <- get_rf_model_output_cv_imp(scores_df = rfData, - Undersample = Undersample, - best.m = best.m , - testReps = testReps, - indeterminateUpper = indeterminateUpper, - indeterminateLower = indeterminateLower, - Type = Type , - nTopImportance = nTopImportance) + # best.m input handling------------------------------------------------ + if(is.null(best.m)){ + best.m <- rfData_and_best_m[["best.m"]] + } else { + best.m <- best.m + } + + + train_and_evaluate_rf_model <- get_rf_model_with_cv (scores_data_df = rfData, + Undersample = Undersample, + best.m = best.m , + testReps = testReps, + indeterminateUpper = indeterminateUpper, + indeterminateLower = indeterminateLower, + Type = Type , + nTopImportance = nTopImportance) diff --git a/R/get_rf_model_with_cv.R b/R/get_rf_model_with_cv.R new file mode 100644 index 0000000..fea7876 --- /dev/null +++ b/R/get_rf_model_with_cv.R @@ -0,0 +1,219 @@ + +get_rf_model_with_cv <- function(Data, + Undersample = FALSE, + best.m = NULL, # any numeric value or call function to get it + testReps, # testRps must be at least 2; + Type) { + + +# This functin must need a data input. +# This funcitno is is designed to work with the chanin way. 
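# Reviewer note (sketch, not part of this patch): best.m is the mtry value
# handed to randomForest(). When it is NULL, one option is to derive it from
# the out-of-bag error with randomForest::tuneRF, assuming the first column of
# Data is the Target_Organ response, e.g.:
#   if (is.null(best.m)) {
#     tuned  <- randomForest::tuneRF(x = Data[, -1], y = Data[, 1],
#                                    ntreeTry = 500, stepFactor = 1.5,
#                                    improve = 0.01, trace = FALSE, plot = FALSE)
#     best.m <- tuned[which.min(tuned[, "OOBError"]), "mtry"]
#   }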
+
+# A second function, get_rf_input_param_list_output_cv_imp(), accepts the full
+# list of input parameters and builds the Data argument internally.
+
+  rfData <- Data
+  #---------------------------------------------------------------------
+  # Initialize model performance metric trackers------------------------
+  #---------------------------------------------------------------------
+
+  # custom function definition
+  `%ni%` <- Negate('%in%')
+
+  Sensitivity <- NULL
+  Specificity <- NULL
+  PPV <- NULL
+  NPV <- NULL
+  Prevalence <- NULL
+  Accuracy <- NULL
+  #nRemoved <- NULL
+
+  #-----------------doing cross-validation--------------------------
+  #------------------------------------------------------------------
+
+  #-----create and prepare the "rfTestData" data frame for storing predictions----
+  rfTestData <- rfData
+
+  # replace the existing column names with simple numeric identifiers
+  colnames(rfTestData) <- seq(ncol(rfTestData))
+
+  # empty the data frame
+  for (j in seq(ncol(rfTestData))) {
+    rfTestData[, j] <- NA
+  }
+
+  # keep only the first two columns so rfTestData retains a consistent
+  # structure for storing predictions in subsequent iterations of the loop
+  rfTestData <- rfTestData[, 1:2]
+
+  # remove 'gini' from the previous iteration
+  #if (exists('gini')) {rm(gini)}
+
+  #-------------------------------------------------------------------
+  # model building and testing----------------------------------------
+  #-------------------------------------------------------------------
+
+  # Iterate through test repetitions----------------------------------
+  for (i in seq(testReps)) {
+    if (i == 1) {
+      sampleIndicies <- seq(nrow(rfData))
+    }
+    if (i < testReps) {
+      ind <- sample(seq(nrow(rfData)), floor((nrow(rfData)/testReps)-1), replace = F)
+      sampleIndicies <- sampleIndicies[which(sampleIndicies %ni% ind)]
+    } else {
+      ind <- sampleIndicies
+    }
+
+    trainIndex <- which(seq(nrow(rfData)) %ni% ind)
+    testIndex <- ind
+
+    # ind <- sample(2, nrow(rfData), replace = T, prob = c((1- testHoldBack), testHoldBack))
+    train <- rfData[trainIndex,]
+
+    test <- rfData[testIndex,]
+
+    # rfAll <- randomForest::randomForest(Target_Organ ~ ., data=rfData, mtry = best.m,
+    #                                     importance = F, ntree = 500, proximity = T)
+
+    # Perform undersampling if enabled
+    if (Undersample == T) {
+      posIndex <- which(train[,1] == 1)
+      nPos <- length(posIndex)
+      # trainIndex <- c(posIndex, sample(which(train[,1] == 0), nPos, replace = F))
+      trainIndex <- c(posIndex, sample(which(train[,1] == 0), nPos, replace = T))
+      train <- train[trainIndex,]
+      test <- rbind(train[-trainIndex,], test)
+    }
+
+    # model building with the current iteration's training data
+    # Train Random Forest model ("mtry" is the randomForest tuning parameter)----
+    rf <- randomForest::randomForest(Target_Organ ~ ., data=train, mtry = best.m,
+                                     importance = T, ntree = 500, proximity = T)
+
+    print(rf)
+
+    #----------------------------------------------------------------------
+    # Predict probabilities on the test data with the current model
+    # (NOTE: this prediction step was flagged as problematic during development)
+    #----------------------------------------------------------------------
+
+    p2r <- stats::predict(rf, test, type = 'prob')[,1]
+
+    # Store these predictions in a structured data frame
+    rfTestData[names(p2r), i] <- as.numeric(p2r)
+
+    # Round the predictions to 0/1 class calls:
+    p2r <- round(p2r)
+
+    # Compute confusion matrix and extract metrics using the "caret" package----
+
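    # Reviewer note: with factor levels c(1, 0), caret::confusionMatrix() treats
    # "1" (a positive liver call) as the positive class by default, so the
    # Sensitivity/PPV collected below refer to the liver-positive class and
    # Specificity/NPV to the negative class. Results$byClass is a named numeric
    # vector, so the extraction could equivalently be done in one step, e.g.:
    #   Results$byClass[c("Sensitivity", "Specificity", "Pos Pred Value",
    #                     "Neg Pred Value", "Prevalence", "Balanced Accuracy")]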
Results <- caret::confusionMatrix(factor(p2r, levels = c(1, 0)), factor(test$Target_Organ, levels = c(1, 0))) + Sensitivity <- c(Sensitivity, Results$byClass[['Sensitivity']]) + Specificity <- c(Specificity, Results$byClass[['Specificity']]) + PPV <- c(PPV, Results$byClass[['Pos Pred Value']]) + NPV <- c(NPV, Results$byClass[['Neg Pred Value']]) + Prevalence <- c(Prevalence, Results$byClass[['Prevalence']]) + Accuracy <- c(Accuracy, Results$byClass[['Balanced Accuracy']]) + + + # # Aggregate Gini importance scores + # giniTmp <- randomForest::importance(rf, type = Type) + # if (exists('gini')) { + # gini <- cbind(gini, giniTmp) + # } else { + # gini <- giniTmp + # } + } + + + #------------------------------------------------------------------------ + # Performance Summary + #------------------------------------------------------------------------- + + PerformanceMatrix <- cbind(Sensitivity, + Specificity, + PPV, + NPV, + Prevalence, + Accuracy) + PerformanceSummary <- colMeans(PerformanceMatrix, na.rm = T) + print(PerformanceSummary) + + # #------------------------------------------------------------------------- + # # Feature Importance------------------------------------------------------ + # #------------------------------------------------------------------------- + # + # print("Feature Importance (Mean Decrease):") + # print(sort(rowMeans(gini), decreasing = T)) + # + # + # #------------------------------------------------------------------------- + # # Top Important Features-------------------------------------------------- + # #-------------------------------------------------------------------------- + # imp <- as.matrix(rowMeans(gini)[1:nTopImportance]) + # if (Type == 1) { + # colnames(imp) <- 'MeanDecreaseAccuracy' + # } else { + # colnames(imp) <- 'MeanDecreaseGini' + # } + # ord <- order(imp[,1]) + + # #------------------------------------------------------------------------ + # # Dotchart for top Variable Importance + # #------------------------------------------------------------------------ + # dotchart(imp[ord, 1], xlab = colnames(imp)[1], ylab = "", + # main = paste0('Top ', nrow(imp), ' - Variable Importance'))#, xlim = c(xmin, max(imp[, i]))) + # # varImpPlot(rf, + # # sort = T, + # # n.var = 20, + # # main = "Top 20 - Variable Importance") +print(".............................................................................") + print(PerformanceSummary) + + return(list( + performance_metrics = PerformanceSummary, # Aggregated performance metrics + #feature_importance = imp, # Top n features by importance + raw_results = list( # Raw data for debugging or extended analysis + sensitivity = Sensitivity, + specificity = Specificity, + accuracy = Accuracy + + ) + )) + +} + +#-------------------------------------------------------------------------- +#-------------------------------------------------------------------------- +#-------------------------------------------------------------------------- +#Identifying Indeterminate Predictions (Tracking Indeterminate Predictions) +#Keeps track of the proportion of indeterminate predictions in each iteration +#Proportion Tracking +#------------------------------------------------------------------------ +#------------------------------------------------------------------------ + +# indeterminateIndex <- which((p2r < indeterminateUpper)&(p2r > indeterminateLower)) +# +# #Calculating the Proportion of Indeterminate Predictions +# #Sets the indeterminate predictions to NA, effectively marking them +# #as missing or invalid. 
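# Reviewer note: the commented-out block here is the "indeterminate zone"
# filter used by get_imp_features_from_rf_model_with_cv() and the
# zone-exclusioned variant: predicted probabilities strictly between
# indeterminateLower and indeterminateUpper are set to NA (and the fraction
# removed is tracked) before the remaining predictions are rounded to 0/1.
# A compact equivalent sketch:
#   keep <- p2r <= indeterminateLower | p2r >= indeterminateUpper
#   nRemoved <- c(nRemoved, mean(!keep))
#   p2r[!keep] <- NA
#   p2r <- round(p2r)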
+# nRemoved <- c(nRemoved, length(indeterminateIndex)/length(p2r)) +# +# #Handling Indeterminate Predictions +# p2r[indeterminateIndex] <- NA +# +# #Rounding the Predictions: +# p2r <- round(p2r) + diff --git a/R/get_rf_model_cv_imp.R b/R/get_zone_exclusioned_rf_model_cv_imp.R similarity index 100% rename from R/get_rf_model_cv_imp.R rename to R/get_zone_exclusioned_rf_model_cv_imp.R diff --git a/inst/test_get_Data_formatted_for_ml.R b/inst/test_get_Data_formatted_for_ml.R new file mode 100644 index 0000000..90b9182 --- /dev/null +++ b/inst/test_get_Data_formatted_for_ml.R @@ -0,0 +1,32 @@ +rm(list = ls()) +devtools::load_all(".") + +# Initialize a connection to the SQLite database +#path_db='C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_merged_liver_not_liver.db' + +#path_db='C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_xpt' + +#studyid_or_studyids <- list.dirs(path_db , full.names = TRUE, recursive = FALSE) + +path_db = "C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/TestDB.db" + +# studyid_metadata <- read.csv("C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_80_MD.csv", +# header = TRUE, sep = ",", stringsAsFactors = FALSE) + +# ---------------------------------------------------- +# For this function we need ml format data +#---------------------------------------------------- +Data <- get_Data_formatted_for_ml(path_db=path_db, + rat_studies=TRUE, + studyid_metadata=NULL, + fake_study = FALSE, + use_xpt_file = FALSE, + Round = TRUE, + Impute = TRUE, + reps=1, + holdback=0.25, + Undersample =TRUE, + hyperparameter_tuning = FALSE, + error_correction_method = 'None' # = must be 'Flip' or "Prune' or 'None' + ) + diff --git a/inst/test_get_rf_model_with_cv.R b/inst/test_get_rf_model_with_cv.R new file mode 100644 index 0000000..b9c9782 --- /dev/null +++ b/inst/test_get_rf_model_with_cv.R @@ -0,0 +1,88 @@ +rm(list = ls()) +devtools::load_all(".") + +# Initialize a connection to the SQLite database +path_db='C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_merged_liver_not_liver.db' + +#path_db='C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_xpt' +#studyid_or_studyids <- list.dirs(path_db , full.names = TRUE, recursive = FALSE) +#path_db = "C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/TestDB.db" +studyid_metadata <- read.csv("C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_80_MD.csv", + header = TRUE, sep = ",", stringsAsFactors = FALSE) + +# ---------------------------------------------------- +# For this function we need ml format data +#---------------------------------------------------- +Data <- get_Data_formatted_for_ml(path_db=path_db, + rat_studies=FALSE, + studyid_metadata=studyid_metadata, + fake_study = TRUE, + use_xpt_file = FALSE, + Round = TRUE, + Impute = TRUE, + reps=1, + holdback=0.25, + Undersample =TRUE, + hyperparameter_tuning = FALSE, + error_correction_method = 'None' # = must be 'Flip' or "Prune' or 'None' +) + + +simple_rf_model <- get_rf_model_with_cv(Data = Data, + Undersample = FALSE, + best.m = NULL, # any numeric value or call function to get it + testReps=2, # testRps must be at least 2; + Type=1) + + + +# rf_with_intermediate <- get_imp_features_from_rf_model_with_cv(scores_df=Data, +# Undersample = TRUE, +# best.m = 4, # any numeric value or call function to get it +# testReps=2, # testRps must be at least 2; +# indeterminateUpper=0.75, +# indeterminateLower=0.25, +# Type=1, +# 
nTopImportance=20) + + + + + + + + +#rf_model <- get_random_forest_model_amin2(Data=rf_Data) + + + +# # Create a connection to the database +# dbtoken <- DBI::dbConnect(RSQLite::SQLite(), dbname = path_db) +# +# # Retrieve the STUDYID column from the dm table +# query <- "SELECT STUDYID FROM dm" +# studyid_data <- DBI::dbGetQuery(dbtoken, query) +# +# # Extract unique STUDYID values +# unique_studyids <- unique(studyid_data$STUDYID) +# +# # Disconnect from the database +# DBI::dbDisconnect(dbtoken) +# +# studyid_or_studyids <- unique_studyids + +#studyid_or_studyids <- list.dirs(path_db , full.names = TRUE, recursive = FALSE) + +# #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +# rm(list = ls()) +# devtools::load_all(".") +# path_db <- "C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_merged_liver_not_liver.db" +# studyid_metadata_path <- "C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_80_MD.csv" +# +# rfData_and_best_m <- get_rfData_and_best_m( +# path_db = path_db, +# studyid_metadata_path = studyid_metadata_path, +# fake_study = TRUE, +# Round = TRUE, +# Undersample = TRUE +# ) diff --git a/inst/test_train_eval_rf_wth_cv_imp.R b/inst/test_train_eval_rf_wth_cv_imp.R deleted file mode 100644 index 3655434..0000000 --- a/inst/test_train_eval_rf_wth_cv_imp.R +++ /dev/null @@ -1,66 +0,0 @@ -rm(list = ls()) -devtools::load_all(".") - -# Initialize a connection to the SQLite database -path_db='C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_merged_liver_not_liver.db' - -#path_db='C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_xpt' -#studyid_or_studyids <- list.dirs(path_db , full.names = TRUE, recursive = FALSE) -#path_db = "C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/TestDB.db" -studyid_metadata <- read.csv("C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_80_MD.csv", - header = TRUE, sep = ",", stringsAsFactors = FALSE) - - -train_and_evaluate_rf_model <- train_eval_rf_with_cv_imp(scores_df=NULL, - path_db = path_db, # need when scores_df is NULL and need to calculate it - studyid_metadata=studyid_metadata, - use_xpt_file = FALSE, - fake_study= TRUE, - Impute = FALSE, - Round =TRUE, - reps=1, # from 0 to any numeric number - holdback=0.25, # either 1 or fraction value like 0.75 etc. 
- Undersample = FALSE, - hyperparameter_tuning = FALSE, - error_correction_method = 'None', - best.m = NULL, # any numeric value or call function to get it - testReps=1, # testRps must be at least 2; - indeterminateUpper = .75, - indeterminateLower = .25, - Type=1, - nTopImportance=20) - -#rf_model <- get_random_forest_model_amin2(Data=rf_Data) - - - -# # Create a connection to the database -# dbtoken <- DBI::dbConnect(RSQLite::SQLite(), dbname = path_db) -# -# # Retrieve the STUDYID column from the dm table -# query <- "SELECT STUDYID FROM dm" -# studyid_data <- DBI::dbGetQuery(dbtoken, query) -# -# # Extract unique STUDYID values -# unique_studyids <- unique(studyid_data$STUDYID) -# -# # Disconnect from the database -# DBI::dbDisconnect(dbtoken) -# -# studyid_or_studyids <- unique_studyids - -#studyid_or_studyids <- list.dirs(path_db , full.names = TRUE, recursive = FALSE) - -# #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ -# rm(list = ls()) -# devtools::load_all(".") -# path_db <- "C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_merged_liver_not_liver.db" -# studyid_metadata_path <- "C:/Users/MdAminulIsla.Prodhan/OneDrive - FDA/Documents/DATABASES/fake_80_MD.csv" -# -# rfData_and_best_m <- get_rfData_and_best_m( -# path_db = path_db, -# studyid_metadata_path = studyid_metadata_path, -# fake_study = TRUE, -# Round = TRUE, -# Undersample = TRUE -# )
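For reviewers, the end-to-end flow exercised by the new test scripts above condenses to the sketch below (the database path is a placeholder; argument values mirror inst/test_get_rf_model_with_cv.R):

devtools::load_all(".")

# placeholder path to a SEND SQLite database (the test script uses fake study data)
path_db <- "path/to/fake_merged_liver_not_liver.db"

# Step 1: build the ML-ready data frame (one row per STUDYID: Target_Organ + liver scores)
Data <- get_Data_formatted_for_ml(path_db = path_db,
                                  rat_studies = FALSE,
                                  studyid_metadata = NULL,   # generated internally when NULL
                                  fake_study = TRUE,
                                  use_xpt_file = FALSE,
                                  Round = TRUE,
                                  Impute = TRUE,
                                  reps = 1,
                                  holdback = 0.25,
                                  Undersample = TRUE,
                                  hyperparameter_tuning = FALSE,
                                  error_correction_method = "None")  # 'Flip', 'Prune' or 'None'

# Step 2: cross-validated random forest on the formatted data
simple_rf_model <- get_rf_model_with_cv(Data = Data,
                                        Undersample = FALSE,
                                        best.m = NULL,   # or a numeric mtry value
                                        testReps = 2,    # must be at least 2
                                        Type = 1)

# Averaged Sensitivity, Specificity, PPV, NPV, Prevalence and (balanced) Accuracy
simple_rf_model$performance_metrics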