Fix deep learning q4 error #17

Merged
113 changes: 58 additions & 55 deletions 10-deep-learning.Rmd
@@ -46,12 +46,12 @@ library(ISLR2)
library(neuralnet)
library(sigmoid)
set.seed(5)
train <- sample(seq_len(nrow(ISLR2::Boston)), nrow(ISLR2::Boston) * 2/3)
train <- sample(seq_len(nrow(ISLR2::Boston)), nrow(ISLR2::Boston) * 2 / 3)

net <- neuralnet(crim ~ lstat + medv + ptratio + rm,
data = ISLR2::Boston[train, ],
act.fct = relu,
hidden = c(2, 3)
data = ISLR2::Boston[train, ],
act.fct = relu,
hidden = c(2, 3)
)
plot(net)
```
@@ -201,10 +201,13 @@ When we take the negative of this, it is equivalent to 10.14 for two classes
knitr::include_graphics("images/nn2.png")
```

Note that, because there is no boundary padding, the output of each
convolution filter will be a 28x28 array.
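
A quick check of that dimension (a minimal sketch; a 5x5 filter applied with
stride 1 and no padding):

```{r}
# Size of a "valid" (no padding) convolution output: input size - filter size + 1
32 - 5 + 1
```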

> b. How many parameters are in this model?

There are 5 convolution matrices each with 5x5 weights (plus 5 bias terms) to
estimate, therefore 130 parameters
There are 3 convolution matrices to estimate, each with 5x5 weights, plus 3
bias terms, giving $3 \times 5 \times 5 + 3 = 78$ parameters.
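
As a quick sanity check, we can build the corresponding convolution layer in
keras and count its parameters (a sketch, assuming a single-channel 32x32
input and three 5x5 filters with no padding):

```{r}
library(keras)

# One convolution layer: 3 filters of size 5x5 over a single-channel 32x32 input.
check <- keras_model_sequential() |>
  layer_conv_2d(
    filters = 3,
    kernel_size = c(5, 5),
    padding = "valid",
    input_shape = c(32, 32, 1)
  )

# Each filter has 5 * 5 weights plus a bias term: 3 * (5 * 5) + 3 = 78.
count_params(check)
```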

> c. Explain how this model can be thought of as an ordinary feed-forward
> neural network with the individual pixels as inputs, and with constraints on
@@ -222,9 +225,11 @@ connections to all other output nodes.
> d. If there were no constraints, then how many weights would there be in the
> ordinary feed-forward neural network in (c)?

With no constraints, we would connect each output pixel in our 5x32x32
convolution layer to each node in the 32x32 original image (plus 5 bias terms),
giving a total of 5,242,885 weights to estimate.
With no constraints, we would connect each input pixel in our original 32x32
image to each output pixel in each of the three 28x28 convolution outputs,
with a bias term for each output pixel. Each output pixel would therefore
require $32 \times 32$ weights plus 1 bias term, giving a total of
$(32 \times 32 + 1) \times 28 \times 28 \times 3 = 2,410,800$ parameters.
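
The same count can be verified directly:

```{r}
# (32*32 weights + 1 bias) for each of the 28 * 28 * 3 output pixels.
(32 * 32 + 1) * 28 * 28 * 3
```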

### Question 5

@@ -246,7 +251,7 @@ absolute error.
> a. Draw a graph of this function over the range $\beta \in [−6, 6]$.

```{r}
r <- function(x) sin(x) + x/10
r <- function(x) sin(x) + x / 10
x <- seq(-6, 6, 0.1)
plot(x, r(x), type = "l")
```
@@ -270,11 +275,11 @@ x^{m+1} = x^m - \rho (cos(x^m) + 1/10)
$$

```{r}
iter <- function(x, rho) x - rho*(cos(x) + 1/10)
iter <- function(x, rho) x - rho * (cos(x) + 1 / 10)
gd <- function(start, rho = 0.1) {
b <- start
v <- b
while(abs(b - iter(b, 0.1)) > 1e-8) {
while (abs(b - iter(b, 0.1)) > 1e-8) {
b <- iter(b, 0.1)
v <- c(v, b)
}
@@ -306,7 +311,7 @@ points(res, r(res), col = "red", pch = 19)
### Question 7

> Fit a neural network to the `Default` data. Use a single hidden layer with 10
> units, and dropout regularization. Have a look at Labs 10.9.1-10.9.2 for
> units, and dropout regularization. Have a look at Labs 10.9.1--10.9.2 for
> guidance. Compare the classification performance of your model with that of
> linear logistic regression.

@@ -331,15 +336,16 @@ nn <- keras_model_sequential() |>
layer_dropout(rate = 0.4) |>
layer_dense(units = 1)

compile(nn, loss = "mse",
optimizer = optimizer_rmsprop(),
metrics = list("mean_absolute_error")
compile(nn,
loss = "mse",
optimizer = optimizer_rmsprop(),
metrics = list("mean_absolute_error")
)

history <- fit(nn,
x[-testid, ], y[-testid],
epochs = 100,
batch_size = 26,
x[-testid, ], y[-testid],
epochs = 100,
batch_size = 26,
validation_data = list(x[testid, ], y[testid]),
verbose = 0
)
@@ -381,15 +387,15 @@ images <- list.files("images/animals")
x <- array(dim = c(length(images), 224, 224, 3))
for (i in seq_len(length(images))) {
img <- image_load(paste0("images/animals/", images[i]), target_size = c(224, 224))
x[i,,,] <- image_to_array(img)
x[i, , , ] <- image_to_array(img)
}

model <- application_resnet50(weights = "imagenet")

pred <- model |>
predict(x) |>
imagenet_decode_predictions(top = 5)

names(pred) <- images
print(pred)
```
@@ -405,7 +411,7 @@ Fitting the model as described in the text.
```{r}
library(tidyverse)
library(ISLR2)
xdata <- data.matrix(NYSE[, c("DJ_return", "log_volume","log_volatility")])
xdata <- data.matrix(NYSE[, c("DJ_return", "log_volume", "log_volatility")])
istrain <- NYSE[, "train"]
xdata <- scale(xdata)

@@ -416,8 +422,8 @@ lagm <- function(x, k = 1) {
}

arframe <- data.frame(
log_volume = xdata[, "log_volume"],
L1 = lagm(xdata, 1),
log_volume = xdata[, "log_volume"],
L1 = lagm(xdata, 1),
L2 = lagm(xdata, 2),
L3 = lagm(xdata, 3),
L4 = lagm(xdata, 4),
Expand All @@ -436,7 +442,7 @@ V0 <- var(arframe[!istrain, "log_volume"])
Now we add month (and work with tidyverse).

```{r}
arframe$month = as.factor(str_match(NYSE$date, "-(\\d+)-")[,2])[-(1:5)]
arframe$month <- as.factor(str_match(NYSE$date, "-(\\d+)-")[, 2])[-(1:5)]
arfit2 <- lm(log_volume ~ ., data = arframe[istrain, ])
arpred2 <- predict(arfit2, arframe[!istrain, ])
V0 <- var(arframe[!istrain, "log_volume"])
@@ -498,16 +504,16 @@ model |>

history <- model |>
fit(
xrnn[istrain,, ],
xrnn[istrain, , ],
arframe[istrain, "log_volume"],
batch_size = 64,
epochs = 200,
validation_data = list(xrnn[!istrain,, ], arframe[!istrain, "log_volume"]),
validation_data = list(xrnn[!istrain, , ], arframe[!istrain, "log_volume"]),
verbose = 0
)

plot(history, smooth = FALSE)
kpred <- predict(model, xrnn[!istrain,, ])
kpred <- predict(model, xrnn[!istrain, , ])
1 - mean((kpred - arframe[!istrain, "log_volume"])^2) / V0
```

@@ -536,33 +542,31 @@ From the book:

```{r, c10q11}
xfun::cache_rds({

model <- keras_model_sequential() |>
model <- keras_model_sequential() |>
layer_flatten(input_shape = c(5, 3)) |>
layer_dense(units = 32, activation = "relu") |>
layer_dropout(rate = 0.4) |>
layer_dropout(rate = 0.4) |>
layer_dense(units = 1)

model |> compile(
loss = "mse",
optimizer = optimizer_rmsprop(),
loss = "mse",
optimizer = optimizer_rmsprop(),
metrics = "mse"
)

history <- model |>
fit(
xrnn[istrain,, ],
xrnn[istrain, , ],
arframe[istrain, "log_volume"],
batch_size = 64,
epochs = 200,
validation_data = list(xrnn[!istrain,, ], arframe[!istrain, "log_volume"]),
validation_data = list(xrnn[!istrain, , ], arframe[!istrain, "log_volume"]),
verbose = 0
)

plot(history, smooth = FALSE, metrics = "mse")
kpred <- predict(model, xrnn[!istrain,, ])
kpred <- predict(model, xrnn[!istrain, , ])
1 - mean((kpred - arframe[!istrain, "log_volume"])^2) / V0

})
```

@@ -581,16 +585,16 @@ in the RNN. Thus, our input for each observation will be 4 x 5 (rather than
```{r, c10q12}
xfun::cache_rds({
xdata <- data.matrix(
NYSE[, c("day_of_week", "DJ_return", "log_volume","log_volatility")]
NYSE[, c("day_of_week", "DJ_return", "log_volume", "log_volatility")]
)
istrain <- NYSE[, "train"]
xdata <- scale(xdata)

arframe <- data.frame(
log_volume = xdata[, "log_volume"],
log_volume = xdata[, "log_volume"],
L1 = lagm(xdata, 1),
L2 = lagm(xdata, 2),
L3 = lagm(xdata, 3),
L3 = lagm(xdata, 3),
L4 = lagm(xdata, 4),
L5 = lagm(xdata, 5)
)
@@ -600,33 +604,33 @@ xfun::cache_rds({
n <- nrow(arframe)
xrnn <- data.matrix(arframe[, -1])
xrnn <- array(xrnn, c(n, 4, 5))
xrnn <- xrnn[,, 5:1]
xrnn <- xrnn[, , 5:1]
xrnn <- aperm(xrnn, c(1, 3, 2))
dim(xrnn)

model <- keras_model_sequential() |>
layer_simple_rnn(units = 12,
layer_simple_rnn(
units = 12,
input_shape = list(5, 4),
dropout = 0.1,
dropout = 0.1,
recurrent_dropout = 0.1
) |>
layer_dense(units = 1)

model |> compile(optimizer = optimizer_rmsprop(), loss = "mse")

history <- model |>
history <- model |>
fit(
xrnn[istrain,, ],
xrnn[istrain, , ],
arframe[istrain, "log_volume"],
batch_size = 64,
epochs = 200,
validation_data = list(xrnn[!istrain,, ], arframe[!istrain, "log_volume"]),
validation_data = list(xrnn[!istrain, , ], arframe[!istrain, "log_volume"]),
verbose = 0
)
)

kpred <- predict(model, xrnn[!istrain,, ])
kpred <- predict(model, xrnn[!istrain, , ])
1 - mean((kpred - arframe[!istrain, "log_volume"])^2) / V0

})
```

@@ -641,7 +645,7 @@ xfun::cache_rds({
xfun::cache_rds({
library(knitr)
accuracy <- c()
for(max_features in c(1000, 3000, 5000, 10000)) {
for (max_features in c(1000, 3000, 5000, 10000)) {
imdb <- dataset_imdb(num_words = max_features)
c(c(x_train, y_train), c(x_test, y_test)) %<-% imdb

@@ -656,13 +660,13 @@

model |> compile(
optimizer = "rmsprop",
loss = "binary_crossentropy",
loss = "binary_crossentropy",
metrics = "acc"
)

history <- fit(model, x_train, y_train,
epochs = 10,
batch_size = 128,
history <- fit(model, x_train, y_train,
epochs = 10,
batch_size = 128,
validation_data = list(x_test, y_test),
verbose = 0
)
@@ -676,7 +680,6 @@ xfun::cache_rds({
"Accuracy" = accuracy
) |>
kable()

})
```

Binary file modified images/nn2.png