# 02 Dimention reduction.R
library(rsample)   # train/test splitting
library(recipes)   # preprocessing pipelines
library(yardstick) # performance metrics
library(caret)     # model tuning and cross-validation
# The data ----------------------------------------------------------------
data("Hitters", package = "ISLR")
Hitters <- na.omit(Hitters)
set.seed(1)
splits <- initial_split(Hitters, prop = 0.7)
Hitters_train <- training(splits)
Hitters_test <- testing(splits)
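# A quick sanity check on the split sizes (roughly 70/30):
nrow(Hitters_train)
nrow(Hitters_test)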
# PCR and PLS -------------------------------------------------------------
## (A) Preprocessing
rec <- recipe(Salary ~ ., data = Hitters_train) |>
  # Standardize numeric variables:
  step_center(all_numeric_predictors()) |>
  step_scale(all_numeric_predictors()) |>
  # Make one-hot encoded dummy variables:
  step_dummy(all_nominal_predictors(), one_hot = TRUE)
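# To peek at what the recipe produces (caret will prep() it internally when
# training - this is just for inspection):
rec |> prep() |> bake(new_data = NULL) |> head()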
## PCR -----------------------------------
# PCR is a method that uses PCA as a first step for predicting y.
# IMPORTANT - this is an UNSUPERVISED method for dimension reduction, since PCA
# does not use the response variable when building the components. There are
# other methods, which do use the response (hence, SUPERVISED), aimed at the
# same goal: predicting y from components.
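# A quick illustration of the "unsupervised" point: the components come from
# the predictors alone, so Salary plays no role here (demo only, not part of
# the pipeline below):
pca_demo <- prcomp(model.matrix(Salary ~ ., data = Hitters_train)[, -1],
                   center = TRUE, scale. = TRUE)
summary(pca_demo) # proportion of variance explained per component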
## (B) Tune:
tg <- expand.grid(
  ncomp = 1:15 # [1, p]
)
tc <- trainControl(method = "cv", number = 10)
set.seed(44)
PCR_fit <- train(
  x = rec,
  data = Hitters_train,
  method = "pcr",
  tuneGrid = tg,
  trControl = tc
)
PCR_fit$bestTune
plot(PCR_fit)
summary(PCR_fit$finalModel)
# Note these are back-transformed to the original predictor (X) space.
coef(PCR_fit$finalModel, ncomp = 7)
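# The loadings (the weights that define each component) can also be inspected;
# loadings() works on the {pls}-package object that powers method = "pcr":
loadings(PCR_fit$finalModel)[, 1:7]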
## PLS ---------------------------
# We will now address the same problem using another dimension reduction
# method: PLS. Importantly, PCR is an UNSUPERVISED method while PLS is a
# SUPERVISED one. PLS stands for *Partial Least Squares*.
## (B) Tune:
set.seed(44) # same seed as above, so the CV folds match the PCR fit
PLS_fit <- train(
  x = rec,
  data = Hitters_train,
  method = "pls", # change method
  tuneGrid = tg,  # same tune-grid
  trControl = tc
)
PLS_fit$bestTune
plot(PLS_fit)
summary(PLS_fit$finalModel)
coef(PLS_fit$finalModel, ncomp = 3)
## Compare ----------------
cbind(
  coef(PCR_fit$finalModel, ncomp = 7),
  coef(PLS_fit$finalModel, ncomp = 3)
)
Hitters_test$PCR_pred <- predict(PCR_fit, newdata = Hitters_test)
Hitters_test$PLS_pred <- predict(PLS_fit, newdata = Hitters_test)
rmse(Hitters_test, Salary, PCR_pred)
rmse(Hitters_test, Salary, PLS_pred)
rsq(Hitters_test, Salary, PCR_pred)
rsq(Hitters_test, Salary, PLS_pred)
# In this case the performance is nearly identical
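# The same comparison, more compactly, with yardstick's metric_set():
ms <- metric_set(rmse, rsq)
ms(Hitters_test, truth = Salary, estimate = PCR_pred)
ms(Hitters_test, truth = Salary, estimate = PLS_pred)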
# Using PCA in other methods w/ {recipe} ----------------------------------
# What if I want to use KNN??
# We can still use PCA as part of our recipe (however, the number of dimensions
# is no longer a tunable hyperparameter). This can be achieved with
# `step_pca()`. There are two arguments that can be used to control how many PCs
# to save:
# - num_comp: the number of components
# - threshold: what proportion of variance should be saved?
# * Note that the predictors should all be re-scaled _prior_ to the PCA step.
rec
# rec already has centering and scaling steps.
# If it didn't, we would have to add:
# step_pca(..., options = list(center = TRUE, scale. = TRUE))
?step_pca
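# The num_comp alternative fixes the number of PCs directly, e.g.:
rec_fixed_k <- rec |>
  step_pca(all_numeric_predictors(), num_comp = 5) # keep exactly 5 PCs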
rec_with_PCA <- rec |>
  step_pca(all_numeric_predictors(),
           threshold = 0.9) # keep the PCs that explain 90% of the total variance
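# To see how many PCs the 90% threshold actually keeps (diagnostic only -
# caret preps the recipe itself during training):
rec_with_PCA |> prep() |> bake(new_data = NULL) |> names()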
## (B) Tune:
tg <- expand.grid(
  k = c(1, 2, 5, 10, 20, 50, 100) # [1, N]
)
set.seed(44)
KNN_fit <- train(
  x = rec_with_PCA,
  data = Hitters_train,
  method = "knn",
  tuneGrid = tg,
  trControl = tc
)
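KNN_fit$bestTune # the chosen k, for symmetry with the fits above
plot(KNN_fit)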
PLS_fit$recipe |> bake(new_data = NULL) |> ncol() # 22 predictors (+ Salary)
KNN_fit$recipe |> bake(new_data = NULL) |> ncol() # only the 7 PCs (+ Salary)
## Compare -------------------
Hitters_test$KNN_pred <- predict(KNN_fit, newdata = Hitters_test)
rsq(Hitters_test, Salary, PCR_pred)
rsq(Hitters_test, Salary, PLS_pred)
rsq(Hitters_test, Salary, KNN_pred)
# KNN with PCA is better...
# How about KNN without PCA?
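# One possible sketch (the original leaves this as an exercise): reuse the
# recipe without the PCA step and compare on the test set.
set.seed(44)
KNN_no_PCA <- train(
  x = rec, # same preprocessing, but no step_pca()
  data = Hitters_train,
  method = "knn",
  tuneGrid = tg,
  trControl = tc
)
Hitters_test$KNN_noPCA_pred <- predict(KNN_no_PCA, newdata = Hitters_test)
rsq(Hitters_test, Salary, KNN_noPCA_pred)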