Skip to content

Commit

Permalink
reprtree package installed
Browse files Browse the repository at this point in the history
  • Loading branch information
aminuldu07 committed Dec 30, 2024
1 parent 43c135c commit 8fb4e96
Show file tree
Hide file tree
Showing 27 changed files with 807 additions and 55 deletions.
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@

get_imp_features_from_rf_model_with_cv <- function(scores_df=NULL,
Undersample = FALSE,
best.m = NULL, # any numeric value or call function to get it
testReps, # testRps must be at least 2;
indeterminateUpper,
indeterminateLower,
Type,
nTopImportance) {

rfData <- scores_df
get_zone_exclusioned_rf_model_with_cv <- function(Data=NULL, #scores_df
Undersample = FALSE,
best.m = NULL, # any numeric value or call function to get it
testReps, # testRps must be at least 2;
indeterminateUpper,
indeterminateLower,
Type) {
browser()
rfData <- Data #rfData <- scores_df
#---------------------------------------------------------------------
# Initialize model performance metric trackers------------------------
#---------------------------------------------------------------------
Expand Down Expand Up @@ -91,7 +90,7 @@ get_imp_features_from_rf_model_with_cv <- function(scores_df=NULL,

#train_data_two <- train

browser()

#model building with current iteration train data
# Train Random Forest model--------------------------------------------
rf <- randomForest::randomForest(Target_Organ ~ ., data=train, mytry = best.m,
Expand Down Expand Up @@ -146,12 +145,12 @@ browser()


# Aggregate Gini importance scores
giniTmp <- randomForest::importance(rf, type = Type)
if (exists('gini')) {
gini <- cbind(gini, giniTmp)
} else {
gini <- giniTmp
}
# giniTmp <- randomForest::importance(rf, type = Type)
# if (exists('gini')) {
# gini <- cbind(gini, giniTmp)
# } else {
# gini <- giniTmp
# }
}


Expand All @@ -173,20 +172,20 @@ browser()
# Feature Importance------------------------------------------------------
#-------------------------------------------------------------------------

print("Feature Importance (Mean Decrease):")
print(sort(rowMeans(gini), decreasing = T))
# print("Feature Importance (Mean Decrease):")
# print(sort(rowMeans(gini), decreasing = T))


#-------------------------------------------------------------------------
# Top Important Features--------------------------------------------------
#--------------------------------------------------------------------------
imp <- as.matrix(rowMeans(gini)[1:nTopImportance])
if (Type == 1) {
colnames(imp) <- 'MeanDecreaseAccuracy'
} else {
colnames(imp) <- 'MeanDecreaseGini'
}
ord <- order(imp[,1])
# imp <- as.matrix(rowMeans(gini)[1:nTopImportance])
# if (Type == 1) {
# colnames(imp) <- 'MeanDecreaseAccuracy'
# } else {
# colnames(imp) <- 'MeanDecreaseGini'
# }
# ord <- order(imp[,1])

# #------------------------------------------------------------------------
# # Dotchart for top Variable Importance
Expand All @@ -202,12 +201,10 @@ print(".........................................................................

return(list(
performance_metrics = PerformanceSummary, # Aggregated performance metrics
feature_importance = imp, # Top n features by importance
raw_results = list( # Raw data for debugging or extended analysis
sensitivity = Sensitivity,
specificity = Specificity,
accuracy = Accuracy,
gini_scores = gini
accuracy = Accuracy
)
))

Expand Down
150 changes: 150 additions & 0 deletions R/get_11imp_features_from_rf_model_with_cv.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@

get_imp_features_from_rf_model_with_cv <- function(Data=NULL, #scores_df
Undersample = FALSE,
best.m = NULL, # any numeric value or call function to get it
testReps, # testRps must be at least 2;
Type,
nTopImportance) {


rfData <- Data #rfData <- scores_df
#---------------------------------------------------------------------
# Initialize model performance metric trackers------------------------
#---------------------------------------------------------------------

# custom function definition
`%ni%` <- Negate('%in%')

# Initialize model performance metric trackers
Sensitivity <- NULL
Specificity <- NULL
PPV <- NULL
NPV <- NULL
Prevalence <- NULL
Accuracy <- NULL
nRemoved <- NULL


#-----------------doing cross-validation--------------------------
#-----------------------------------------------------------------
#------------------------------------------------------------------

#-----create and prepare "`rfTestData data` frame" for storing predictions----
rfTestData <- rfData

#replaces the existing column names with simple numeric identifiers
colnames(rfTestData) <- seq(ncol(rfTestData))

#emptying the data frame.
for (j in seq(ncol(rfTestData))) {
rfTestData[,j] <- NA
}

#prepares rfTestData to maintain a consistent structure with the necessary
#columns for storing predictions in subsequent iterations of the loop
rfTestData <- rfTestData[,1:2] # Keep structure for predictions

#remove 'gini' from the previous iteration
if (exists('gini')) {rm(gini)}


#-------------------------------------------------------------------
# model building and testing----------------------------------------
#-------------------------------------------------------------------


# Perform cross-validation with test repetitions
# Iterate through test repetitions----------------------------------
for (i in seq(testReps)) {
if (i == 1) {
sampleIndicies <- seq(nrow(rfData))
}
if (i < testReps) {
ind <- sample(seq(nrow(rfData)), floor((nrow(rfData)/testReps)-1), replace = F)
sampleIndicies <- sampleIndicies[which(sampleIndicies %ni% ind)]
} else {
ind <- sampleIndicies
}

trainIndex <- which(seq(nrow(rfData)) %ni% ind)
testIndex <- ind

# Extract train and test data--------------------------------
# ind <- sample(2, nrow(rfData), replace = T, prob = c((1- testHoldBack), testHoldBack))
train <- rfData[trainIndex,]
test <- rfData[testIndex,]

# rfAll <- randomForest::randomForest(Target_Organ ~ ., data=rfData, mytry = best.m,
# importance = F, ntree = 500, proximity = T)


# Perform under sampling if enabled
if (Undersample == T) {
posIndex <- which(train[,1] == 1)
nPos <- length(posIndex)
# trainIndex <- c(posIndex, sample(which(train[,1] == 0), nPos, replace = F))
trainIndex <- c(posIndex, sample(which(train[,1] == 0), nPos, replace = T))
train <- train[trainIndex,]
test <- rbind(train[-trainIndex,], test)
}


# Train Random Forest model with current iteration's train data
rf <- randomForest::randomForest(Target_Organ ~ ., data=train, mytry = best.m,
importance = T, ntree = 500, proximity = T)

print(rf)


# Calculate Gini importance scores for the model
giniTmp <- randomForest::importance(rf, type = Type)

# Aggregate Gini importance scores across iterations
if (exists('gini')) {
gini <- cbind(gini, giniTmp)
} else {
gini <- giniTmp
}
}



#-------------------------------------------------------------------------
# Feature Importance------------------------------------------------------
#-------------------------------------------------------------------------

print("Feature Importance (Mean Decrease):")
print(sort(rowMeans(gini), decreasing = T))


#-------------------------------------------------------------------------
# Top Important Features--------------------------------------------------
#--------------------------------------------------------------------------
# Get the top n important features based on Gini importance
imp <- as.matrix(rowMeans(gini)[1:nTopImportance])
if (Type == 1) {
colnames(imp) <- 'MeanDecreaseAccuracy'
} else {
colnames(imp) <- 'MeanDecreaseGini'
}
ord <- order(imp[,1])

# #------------------------------------------------------------------------
# # Dotchart for top Variable Importance
# #------------------------------------------------------------------------
dotchart(imp[ord, 1], xlab = colnames(imp)[1], ylab = "",
main = paste0('Top ', nrow(imp), ' - Variable Importance'))#, xlim = c(xmin, max(imp[, i])))

# varImpPlot(rf,
# sort = T,
# n.var = 20,
# main = "Top 20 - Variable Importance")
print(".............................................................................")

return(list(

gini_scores = gini
)
)

}
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ get_auc_curve_with_rf_model <- function(Data = NULL, # Input data frame for tra

#Data <- column_harmonized_liverscr_df

rfData_and_best_m <- get_ml_data_and_tuned_hyperparameters( scores_df = column_harmonized_liverscr_df,
rfData_and_best_m <- get_ml_data_and_tuned_hyperparameters( Data = column_harmonized_liverscr_df,
studyid_metadata = studyid_metadata,
Impute = Impute,
Round = Round,
Expand All @@ -137,6 +137,7 @@ get_auc_curve_with_rf_model <- function(Data = NULL, # Input data frame for tra

}


# reassignment of the data
rfData <- rfData_and_best_m[["rfData"]]

Expand All @@ -148,9 +149,8 @@ get_auc_curve_with_rf_model <- function(Data = NULL, # Input data frame for tra
}


# Train a Random Forest model using the specified mtry value
rfAll <- randomForest::randomForest(Target_Organ ~ ., data = rfData, mytry = best.m,
importance = FALSE, ntree = 500, proximity = TRUE)



# Predict probabilities and calculate AUC
pred1 <- stats::predict(rfAll, type = "prob")
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
#' Undersample = TRUE
#' )

get_ml_data_and_tuned_hyperparameters <- function(scores_df,
get_ml_data_and_tuned_hyperparameters <- function(Data, # Data == "scores_df"
studyid_metadata,
Impute = FALSE,
Round =FALSE,
Expand All @@ -46,7 +46,9 @@ get_ml_data_and_tuned_hyperparameters <- function(scores_df,
hyperparameter_tuning = FALSE,
error_correction_method = NULL) { # # Choose: "Flip" or "Prune" or "None"

input_scores_df <- scores_df
input_scores_df <- Data #input_scores_df <- scores_df


metadata_df <- studyid_metadata


Expand Down
53 changes: 53 additions & 0 deletions R/get_rf_model_with_cv.R → R/get_9rf_model_with_cv.R
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,59 @@ print(".........................................................................
))

}

# #Add a special case for testReps = 1 that directly splits data into train and test sets without looping or iterative sampling.
# if (testReps == 1) {
# # Use a single random split (e.g., 70% train, 30% test)
# set.seed(123) # Ensure reproducibility
# trainIndex <- sample(seq(nrow(rfData)), size = floor(0.7 * nrow(rfData)), replace = FALSE)
# testIndex <- setdiff(seq(nrow(rfData)), trainIndex)
#
# train <- rfData[trainIndex, ]
# test <- rfData[testIndex, ]
#
# } else {
# # Multiple iterations (original logic)
# for (i in seq(testReps)) {
# if (i == 1) {
# sampleIndicies <- seq(nrow(rfData))
# }
# if (i < testReps) {
# ind <- sample(seq(nrow(rfData)), floor((nrow(rfData) / testReps) - 1), replace = FALSE)
# sampleIndicies <- sampleIndicies[which(sampleIndicies %ni% ind)]
# } else {
# ind <- sampleIndicies
# }
#
# trainIndex <- which(seq(nrow(rfData)) %ni% ind)
# testIndex <- ind
#
# train <- rfData[trainIndex, ]
# test <- rfData[testIndex, ]
# }
# }























#--------------------------------------------------------------------------
#--------------------------------------------------------------------------
Expand Down
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@



get_Data_formatted_for_ml <- function(path_db,
get_Data_formatted_for_ml_and_best.m <- function(path_db,
rat_studies=FALSE,
studyid_metadata=NULL,
fake_study = FALSE,
Expand Down Expand Up @@ -134,7 +134,7 @@ get_Data_formatted_for_ml <- function(path_db,



rfData_and_best_m <- get_ml_data_and_tuned_hyperparameters( scores_df = column_harmonized_liverscr_df,
rfData_and_best_m <- get_ml_data_and_tuned_hyperparameters( Data = column_harmonized_liverscr_df,
studyid_metadata = studyid_metadata,
Impute = Impute,
Round = Round,
Expand All @@ -147,8 +147,9 @@ get_Data_formatted_for_ml <- function(path_db,


rfData <- rfData_and_best_m[["rfData"]]
best.m <- rfData_and_best_m[["best.m"]]

# best.m input handling------------------------------------------------
# # best.m input handling------------------------------------------------
# if(is.null(best.m)){
# best.m <- rfData_and_best_m[["best.m"]]
# } else {
Expand All @@ -157,6 +158,7 @@ get_Data_formatted_for_ml <- function(path_db,



return(Data = rfData)
return(list(Data = rfData,
best.m= best.m))

}
Loading

0 comments on commit 8fb4e96

Please sign in to comment.