-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'master' of https://github.com/ncss-tech/dsm-focus
- Loading branch information
Showing
5 changed files
with
631 additions
and
2 deletions.
There are no files selected for viewing
350 changes: 350 additions & 0 deletions
350
google-cloud-platform/aws-migration/8-vic_scripts_modeling.R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,350 @@ | ||
# 8-VIC modeling | ||
|
||
|
||
# 10/31/24 | ||
|
||
# Dave White | ||
start.time <- Sys.time() | ||
|
||
# load and install packages | ||
required.packages <- c( "caret", "sf", "terra", "randomForest", "doParallel", "aqp", "parallel", "snow", "foreign", "foreach") | ||
new.packages <- required.packages[!(required.packages %in% installed.packages()[,"Package"])] | ||
if(length(new.packages)) install.packages(new.packages) | ||
lapply(required.packages, require, character.only=T) | ||
rm(required.packages, new.packages) | ||
|
||
|
||
|
||
# bring in covariate data and create a stack | ||
|
||
# set working directory to dem covs | ||
setwd("~/data/8-vic/cov") | ||
|
||
# read in raster file names as a list | ||
rList=list.files(getwd(), pattern="tif$", full.names = FALSE) | ||
|
||
# create a raster stack of dem covs | ||
rStack <- rast(rList) | ||
names(rStack) <- gsub(".tif", "",rList) | ||
names(rStack) <- gsub("-", "", names(rStack)) | ||
names(rStack) <- gsub("_", "", names(rStack)) | ||
names(rStack) | ||
|
||
|
||
# Bring in training data points | ||
# read in shapefile | ||
# bring in pedon data | ||
|
||
#set working directory to training data. | ||
setwd("~/data/8-vic/data") | ||
|
||
pts <- read_sf("pedons.shp") | ||
|
||
# extract raster covariate values at each training point for the all.pts dataset | ||
pts.sv <- terra::extract(rStack, pts, xy=F, bind=TRUE, na.rm=TRUE) | ||
|
||
# convert SpatVector to sf | ||
all.pts <- sf::st_as_sf(pts.sv) | ||
|
||
names(all.pts) | ||
|
||
# remove unwanted cols | ||
all.pts <- all.pts[-c(1:14)] | ||
colnames(all.pts)[1] <- "class" | ||
names(all.pts) | ||
all.pts$class <- as.factor(all.pts$class) | ||
names(all.pts) | ||
# | ||
levels(all.pts$class) | ||
|
||
|
||
# convert sf to a dataframe | ||
pts.all <- as.data.frame(all.pts) | ||
names(pts.all) | ||
|
||
comp <- pts.all[,!(colnames(pts.all)%in% c("geometry"))] | ||
names(comp) | ||
|
||
|
||
# remove any observations with NAs | ||
comp <- comp[complete.cases(comp),] | ||
|
||
# convert Class col to a factor | ||
#comp$Class <- as.factor(comp$Class) | ||
is.factor(comp$class) | ||
levels(comp$class) | ||
summary((comp$class)) | ||
|
||
# There are a few classes with < 2 observations remove those classes | ||
|
||
nlevels(comp$class) | ||
|
||
comp = droplevels(comp[comp$class %in% names(table(comp$class)) [table(comp$class) >2],]) | ||
|
||
nlevels(comp$class) | ||
|
||
levels(comp$class) | ||
summary(comp$class) | ||
|
||
|
||
length(levels(comp$class)) | ||
gc() | ||
|
||
#--------------------------------------------------------------------------- | ||
# Recursive Feature Selection (RFE) | ||
|
||
|
||
subsets <- seq(50, length(names(rStack)), 5) | ||
|
||
#set number and repeats for cross validation | ||
number = 10 | ||
repeats = 1 | ||
|
||
# set seeds to get reproducible results when running the process in parallel | ||
set.seed(12) | ||
seeds <- vector(mode = "list", length=(number*repeats+1)) | ||
for(i in 1:(number*repeats)) seeds[[i]] <- sample.int(1000, length(subsets)+1) | ||
seeds[[(number*repeats+1)]] <- sample.int(1000, 1) | ||
|
||
|
||
cl <- makeCluster(121) # base R only recognizes 128 cores and about 5 need to be left for OS | ||
registerDoParallel(cl) | ||
|
||
# set up the rfe control | ||
ctrl.RFE <- rfeControl(functions = rfFuncs, | ||
method = "repeatedcv", | ||
number = number, | ||
repeats = repeats, | ||
seeds = seeds, | ||
verbose = F) | ||
|
||
## highlight and run everything from c1 to stopCluster(c1) to run RFE | ||
rfe <- rfe(x = comp[,-c(1)], | ||
y = comp$class, | ||
sizes = subsets, | ||
rfeControl = ctrl.RFE, | ||
allowParallel = TRUE | ||
) | ||
stopCluster(cl) | ||
gc() | ||
|
||
|
||
# Look at the results | ||
rfe | ||
plot(rfe) | ||
|
||
|
||
|
||
rfe$fit$confusion | ||
rfe$results | ||
|
||
# see list of predictors | ||
predictors(rfe) | ||
|
||
# the rfe function retuns the covariates with the highest accuracy for the rf model | ||
# view the highest accuracy noted by the * | ||
rfe | ||
|
||
# take the accuracy and subtract the accuracySD. look in the results of rf.RFE and find the accuracy that is > or = to this value. this is the number of covariates to use below | ||
#predictors(rfeLand) # top number of covariates | ||
|
||
# look at the top number of covariates that are equal to greater than the accuracy minus the accuracySD | ||
#predictors(rf.RFE)[c(1:9,24:26)] | ||
|
||
# assign this to a variable | ||
a <- predictors(rfe)#[c(1:50)] | ||
|
||
|
||
# subset the covariate stack and the data frame by the selected covariates the variable a is your selected covariates | ||
|
||
# subsed the raser stack to the selected covariates | ||
rsm <- subset(rStack, a) | ||
|
||
|
||
# subset the data frame points with the number of covariates selected | ||
names(comp) | ||
comp.sub <- (comp[,c("class", a)]) | ||
|
||
|
||
names(comp.sub) | ||
is.factor(comp.sub$class) | ||
|
||
subsets <- seq(1, length(names(rsm)), 5) | ||
|
||
#set number and repeats for cross validation | ||
number = 10 | ||
repeats = 5 | ||
|
||
# set seeds to get reproducible results when running the process in parallel | ||
set.seed(12) | ||
seeds <- vector(mode = "list", length=(number*repeats+1)) | ||
for(i in 1:(number*repeats)) seeds[[i]] <- sample.int(1000, length(subsets)+1) | ||
seeds[[(number*repeats+1)]] <- sample.int(1000, 1) | ||
|
||
|
||
# set up the train control | ||
fitControl <- trainControl(method = "repeatedcv", | ||
number = number, | ||
repeats = repeats, | ||
p = 0.8, #30% used for test set, 70% used for training set | ||
selectionFunction = 'best', | ||
classProbs = T, | ||
savePredictions = T, | ||
returnResamp = 'final', | ||
search = "random", | ||
seeds = seeds) | ||
|
||
cl <- makeCluster(121) | ||
registerDoParallel(cl) | ||
# Random Forest - Parallel process | ||
rfm = train(x = comp.sub[,-c(1)], | ||
y = comp.sub$class, | ||
"rf", | ||
trControl = fitControl, | ||
ntree = 500, #number of trees default is 500, which seems to work best anyway. | ||
tuneLength=10, | ||
metric = 'Kappa', | ||
na.action=na.pass, | ||
keep.forest=TRUE, # added this line and the next for partial dependence plots | ||
importance=TRUE) | ||
stopCluster(cl) | ||
gc() | ||
|
||
|
||
# Inspect rfFit | ||
rfm$finalModel | ||
|
||
|
||
#look at overall accuracy, kappa, and SD associated with each mtry value | ||
print(rfm) | ||
rfm$results | ||
|
||
|
||
# Convert caret wrapper into randomforest model object for raster prediction | ||
rfm <- rfm$finalModel | ||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
# make predictions | ||
|
||
# set wd to store tiles for prediction - tiles are written to hard drive, make sure there is enough room | ||
setwd("~/data/8-vic/data/tiles/") | ||
|
||
|
||
#numTiles <- 11 # numTiles^2 should <= the number of cores - in this case I have 123 cores available and 11x11 would give 121 tiles | ||
numTiles <- (round(sqrt(121), digits = 0)) | ||
|
||
x <- rast(ncol=numTiles, nrow=numTiles, extent=ext(rsm)) | ||
|
||
tl <- makeTiles(rsm, x, overwrite=T, extend=T) | ||
tl <- list.files("~/data/8-vic/data/tiles/", pattern=".tif$", full.names=FALSE) | ||
|
||
gc() | ||
# read in raster file names as a list | ||
#rList=list.files(getwd(), pattern="tif$", full.names = FALSE) | ||
|
||
|
||
cl <- parallel::makeCluster(round(length(tl)/2)) | ||
registerDoParallel(cl) | ||
|
||
pred <- foreach(i = 1:length(tl), .packages = c("terra", "randomForest")) %dopar% { | ||
pred <- wrap(terra::predict(rast(tl[i]),rfm, na.rm=T))#wopt=list(steps=40) | ||
return(pred) | ||
} | ||
pred <- do.call(terra::merge,lapply(pred,terra::rast)) | ||
plot(pred) | ||
setwd("~/data/8-vic/results") | ||
# write rasters | ||
writeRaster(pred, overwrite = TRUE, filename = "class.tif", gdal=c("TFW=YES"),datatype='INT1U') | ||
# write raster attribute table | ||
#library(foreign) | ||
write.dbf(levels(pred)[[1]], file='class.tif.vat.dbf') # make sure the first part of the file name is exactly the same as the predicted raster | ||
stopCluster(cl) | ||
gc() | ||
|
||
|
||
|
||
|
||
#set directory to prob tiles | ||
setwd("~/data/8-vic/data/tiles/") | ||
|
||
# read in raster file names as a list | ||
tl=list.files(getwd(), pattern="tif$", full.names = TRUE) | ||
tlnames=list.files(getwd(), pattern="tif$", full.names = FALSE) | ||
tlnames <- gsub(".tif", "", tlnames) | ||
tlnames <- gsub("tile", "prob", tlnames) | ||
|
||
cl <- parallel::makeCluster(round(length(tl)/2)) | ||
registerDoParallel(cl) | ||
|
||
foreach(i = 1:length(tl), .packages = c("terra", "randomForest")) %dopar% { | ||
wrap(terra::predict(rast(tl[i]), rfm, na.rm=T, type="prob", | ||
filename = paste0("~/data/8-vic/results/prob/",tlnames[i],".tif"), | ||
gdal=c("TFW=YES"), | ||
overwrite=T)) | ||
} | ||
|
||
stopCluster(cl) | ||
gc() | ||
|
||
|
||
|
||
|
||
|
||
|
||
#set directory to prob tiles | ||
setwd("~/data/8-vic/results/prob") | ||
|
||
# read in raster file names as a list | ||
rList=list.files(getwd(), pattern="tif$", full.names = FALSE) | ||
|
||
|
||
|
||
cl <- parallel::makeCluster(round(length(tl)/2)) | ||
registerDoParallel(cl) | ||
shan <- foreach(i = 1:length(rList), | ||
.packages = c("terra", "aqp")) %dopar% { | ||
shan <- wrap(aqp::shannonEntropy(terra::rast(rList[i]))) | ||
return(shan) | ||
} | ||
shan <- do.call(terra::merge,lapply(shan, terra::rast)) | ||
plot(shan) | ||
|
||
gc() | ||
|
||
#normalized shan entropy | ||
b <- length(levels(pred)[[1]]) | ||
shanNorm <- foreach(i = 1:length(rList), | ||
.packages = c("terra", "aqp")) %dopar% { | ||
shanNorm1 <- wrap(aqp::shannonEntropy(terra::rast(rList[i]), b=b)) | ||
return(shanNorm1) | ||
} | ||
shanNorm <- do.call(terra::merge,lapply(shanNorm, terra::rast)) | ||
plot(shanNorm) | ||
|
||
setwd("~/data/8-vic/results") | ||
writeRaster(shan, overwrite = TRUE, filename = "shan.tif", gdal=c("TFW=YES")) | ||
writeRaster(shanNorm, overwrite = TRUE, filename = "shanNorm.tif", gdal=c("TFW=YES")) | ||
|
||
stopCluster(cl) | ||
|
||
# get confusion matrix from model | ||
cm <- confusionMatrix(rfm$predicted, rfm$y) | ||
|
||
|
||
end.time <- Sys.time() | ||
|
||
time.taken <- end.time - start.time | ||
|
||
time.taken | ||
saveRDS(time.taken, "timeTaken.rds") | ||
|
||
|
||
## Transfer the script to the storage bucket: | ||
## Copy the line below and run in the terminal | ||
## change the name of file and path as necessary | ||
##gsutil -m cp ~/data/8-vic/scripts/modeling.R gs://sbs-aws-migration-s2026-8-vic/8-vic/scripts/ |
Oops, something went wrong.