Formatting updates #19

Merged
merged 1 commit on Jan 3, 2025
28 changes: 15 additions & 13 deletions 02-statistical-learning.Rmd
@@ -302,7 +302,7 @@ college$Elite <- factor(ifelse(college$Top10perc > 50, "Yes", "No"))
summary(college$Elite)
plot(college$Outstate ~ college$Elite, xlab = "Elite", ylab = "Outstate")

par(mfrow = c(2,2))
par(mfrow = c(2, 2))
for (n in c(5, 10, 20, 50)) {
hist(college$Enroll, breaks = n, main = paste("n =", n), xlab = "Enroll")
}
@@ -374,7 +374,7 @@ x[-(10:85), numeric] |>
> the relationships among the predictors. Comment on your findings.

```{r}
pairs(x[, numeric], cex = 0.2)
pairs(x[, numeric], cex = 0.2)
cor(x[, numeric]) |>
kable()

@@ -425,8 +425,10 @@ library(tidyverse)
```

```{r}
ggplot(Boston, aes(nox, rm)) + geom_point()
ggplot(Boston, aes(ptratio, rm)) + geom_point()
ggplot(Boston, aes(nox, rm)) +
geom_point()
ggplot(Boston, aes(ptratio, rm)) +
geom_point()
heatmap(cor(Boston, method = "spearman"), cexRow = 1.1, cexCol = 1.1)
```

@@ -440,12 +442,12 @@ Yes
> predictor.

```{r}
Boston |>
pivot_longer(cols = 1:13) |>
filter(name %in% c("crim", "tax", "ptratio")) |>
ggplot(aes(value)) +
geom_histogram(bins = 20) +
facet_wrap(~name, scales="free", ncol= 1)
Boston |>
pivot_longer(cols = 1:13) |>
filter(name %in% c("crim", "tax", "ptratio")) |>
ggplot(aes(value)) +
geom_histogram(bins = 20) +
facet_wrap(~name, scales = "free", ncol = 1)
```

Yes, particularly crime and tax rates.
@@ -496,9 +498,9 @@ Boston |>
select(-c(crim, zn)) |>
pivot_longer(!rm) |>
mutate(">8 rooms" = rm > 8) |>
ggplot(aes(`>8 rooms`, value)) +
geom_boxplot() +
facet_wrap(~name, scales = "free")
ggplot(aes(`>8 rooms`, value)) +
geom_boxplot() +
facet_wrap(~name, scales = "free")
```

Census tracts with large properties (more than eight rooms per dwelling, on average)
36 changes: 20 additions & 16 deletions 03-linear-regression.Rmd
@@ -67,11 +67,11 @@ library(plotly)
```{r}
model <- function(gpa, iq, level) {
50 +
gpa * 20 +
iq * 0.07 +
level * 35 +
gpa * iq * 0.01 +
gpa * level * -10
gpa * 20 +
iq * 0.07 +
level * 35 +
gpa * iq * 0.01 +
gpa * level * -10
}
x <- seq(1, 5, length = 10)
y <- seq(1, 200, length = 20)
@@ -82,15 +82,18 @@ plot_ly(x = x, y = y) |>
add_surface(
z = ~college,
colorscale = list(c(0, 1), c("rgb(107,184,214)", "rgb(0,90,124)")),
colorbar = list(title = "College")) |>
colorbar = list(title = "College")
) |>
add_surface(
z = ~high_school,
colorscale = list(c(0, 1), c("rgb(255,112,184)", "rgb(128,0,64)")),
colorbar = list(title = "High school")) |>
colorbar = list(title = "High school")
) |>
layout(scene = list(
xaxis = list(title = "GPA"),
yaxis = list(title = "IQ"),
zaxis = list(title = "Salary")))
zaxis = list(title = "Salary")
))
```

Option iii is correct.
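
A quick numerical check using the `model()` function defined above (the GPA and IQ values are arbitrary, chosen only for illustration, and `level = 1` is assumed to code a college graduate, as in the exercise): at a high GPA the predicted salary is larger for the high-school graduate, which is what option iii claims.

```{r}
# Illustrative check of option iii: compare predicted salaries at a high GPA.
# Assumes level = 1 codes "college" and level = 0 codes "high school".
model(gpa = 4.0, iq = 110, level = 1) # college graduate
model(gpa = 4.0, iq = 110, level = 0) # high school graduate (higher at GPA = 4)
```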
@@ -366,7 +369,7 @@ par(mfrow = c(2, 2))
plot(Auto$horsepower, Auto$mpg, cex = 0.2)
plot(log(Auto$horsepower), Auto$mpg, cex = 0.2)
plot(sqrt(Auto$horsepower), Auto$mpg, cex = 0.2)
plot(Auto$horsepower ^ 2, Auto$mpg, cex = 0.2)
plot(Auto$horsepower^2, Auto$mpg, cex = 0.2)

x <- subset(Auto, select = -name)
x$horsepower <- log(x$horsepower)
@@ -553,7 +556,7 @@ We can show this numerically in R by computing $t$ using the above equation.

```{r}
n <- length(x)
sqrt(n - 1) * sum(x * y) / sqrt(sum(x ^ 2) * sum(y ^ 2) - sum(x * y) ^ 2)
sqrt(n - 1) * sum(x * y) / sqrt(sum(x^2) * sum(y^2) - sum(x * y)^2)
```

> e. Using the results from (d), argue that the _t_-statistic for the
@@ -846,9 +849,9 @@ contributions.
> answers.

```{r}
x1 <- c(x1 , 0.1)
x2 <- c(x2 , 0.8)
y <- c(y ,6)
x1 <- c(x1, 0.1)
x2 <- c(x2, 0.8)
y <- c(y, 6)
summary(lm(y ~ x1 + x2))
summary(lm(y ~ x1))
summary(lm(y ~ x2))
@@ -929,9 +932,10 @@ The results from (b) show reduced significance compared to the models fit in
(a).

```{r}
plot(sapply(fits, function(x) coef(x)[2]), coef(mfit)[-1],
xlab = "Univariate regression",
ylab = "multiple regression")
plot(sapply(fits, function(x) coef(x)[2]), coef(mfit)[-1],
xlab = "Univariate regression",
ylab = "multiple regression"
)
```

The estimated coefficients differ (in particular the estimated coefficient for
45 changes: 22 additions & 23 deletions 04-classification.Rmd
@@ -23,11 +23,10 @@ $$
Letting $x = e^{\beta_0 + \beta_1X}$

\begin{align}
\frac{P(X)}{1-p(X)} &= \frac{\frac{x}{1 + x}}
{1 - \frac{x}{1 + x}} \\
&= \frac{\frac{x}{1 + x}}
{\frac{1}{1 + x}} \\
&= x
\frac{P(X)}{1-p(X)}
&= \frac{\frac{x}{1 + x}} {1 - \frac{x}{1 + x}} \\
&= \frac{\frac{x}{1 + x}} {\frac{1}{1 + x}} \\
&= x
\end{align}
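
A quick numerical check of this identity (the values of $\beta_0$, $\beta_1$, and $X$ below are arbitrary, for illustration only):

```{r}
# The odds p/(1 - p) should equal e^(beta0 + beta1 * X) exactly.
b0 <- -2; b1 <- 0.5; X <- 3
p <- exp(b0 + b1 * X) / (1 + exp(b0 + b1 * X))
c(odds = p / (1 - p), exp_linear = exp(b0 + b1 * X))
```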

### Question 2
@@ -65,7 +64,7 @@ therefore, we can consider maximizing $\log(p_K(X))$

$$
\log(p_k(x)) = \log(\pi_k) - \frac{1}{2\sigma^2}(x - \mu_k)^2 -
\log\left(\sum_{l=1}^k \pi_l \exp\left(-\frac{1}{2\sigma^2}(x - \mu_l)^2\right)\right)
\log\left(\sum_{l=1}^k \pi_l \exp\left(-\frac{1}{2\sigma^2}(x - \mu_l)^2\right)\right)
$$

Remember that we are maximizing over $k$, and since the last term does not
@@ -265,7 +264,7 @@ when $X_1 = 40$ and $X_2 = 3.5$, $p(X) = 0.38$
> chance of getting an A in the class?

We would like to solve for $X_1$ where $p(X) = 0.5$. Taking the first equation
above, we need to solve $0 = 6 + 0.05X_1 + 3.5$, so $X_1 = 50$ hours.
above, we need to solve $0 = -6 + 0.05X_1 + 3.5$, so $X_1 = 50$ hours.
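
A quick check of this arithmetic in R, using the coefficients given in the exercise ($\hat\beta_0 = -6$, $\hat\beta_1 = 0.05$, $\hat\beta_2 = 1$):

```{r}
# Probability of an A given hours studied (x1) and GPA (x2).
p <- function(x1, x2) plogis(-6 + 0.05 * x1 + 1 * x2)
p(40, 3.5) # ~0.38, as above
p(50, 3.5) # 0.5 at 50 hours of study
```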

### Question 7

@@ -305,7 +304,7 @@ p(D|v) &= \frac{p(v|D) p(D)}{p(v|D)p(D) + p(v|N)p(N)} \\
\end{align}

```{r}
exp(-0.5) * 0.8 / (exp(-0.5) * 0.8 + exp(-2/9) * 0.2)
exp(-0.5) * 0.8 / (exp(-0.5) * 0.8 + exp(-2 / 9) * 0.2)
```

### Question 8
@@ -412,7 +411,7 @@ $$
(\hat\alpha_{orange0} - \hat\alpha_{apple0}) + (\hat\alpha_{orange1} - \hat\alpha_{apple1})x
$$

> c. Suppose that in your model, $\hat\beta_0 = 2$ and $\hat\beta = 1$. What
> c. Suppose that in your model, $\hat\beta_0 = 2$ and $\hat\beta = -1$. What
> are the coefficient estimates in your friend's model? Be as specific as
> possible.

@@ -423,7 +422,7 @@ We are unable to know the specific value of each parameter, however.

> d. Now suppose that you and your friend fit the same two models on a different
> data set. This time, your friend gets the coefficient estimates
> $\hat\alpha_{orange0} = 1.2$, $\hat\alpha_{orange1} = 2$,
> $\hat\alpha_{orange0} = 1.2$, $\hat\alpha_{orange1} = -2$,
> $\hat\alpha_{apple0} = 3$, $\hat\alpha_{apple1} = 0.6$. What are the
> coefficient estimates in your model?
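
Assuming, as in part (c), that the single-pair coefficients equal the corresponding differences of the two-class coefficients ($\hat\beta_0 = \hat\alpha_{orange0} - \hat\alpha_{apple0}$, $\hat\beta_1 = \hat\alpha_{orange1} - \hat\alpha_{apple1}$), a quick worked computation is:

```{r}
# Worked arithmetic for (d): beta0_hat = 1.2 - 3, beta1_hat = -2 - 0.6.
c(beta0 = 1.2 - 3, beta1 = -2 - 0.6)
```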

@@ -571,7 +570,7 @@ fit <- glm(Direction ~ Lag3, data = Weekly[train, ], family = binomial)
pred <- predict(fit, Weekly[!train, ], type = "response") > 0.5
mean(ifelse(pred, "Up", "Down") == Weekly[!train, ]$Direction)

fit <- glm(Direction ~Lag4, data = Weekly[train, ], family = binomial)
fit <- glm(Direction ~ Lag4, data = Weekly[train, ], family = binomial)
pred <- predict(fit, Weekly[!train, ], type = "response") > 0.5
mean(ifelse(pred, "Up", "Down") == Weekly[!train, ]$Direction)

@@ -583,7 +582,7 @@ fit <- glm(Direction ~ Lag1 * Lag2 * Lag3 * Lag4, data = Weekly[train, ], family
pred <- predict(fit, Weekly[!train, ], type = "response") > 0.5
mean(ifelse(pred, "Up", "Down") == Weekly[!train, ]$Direction)

fit <- lda(Direction ~ Lag1 + Lag2 + Lag3 + Lag4,data = Weekly[train, ])
fit <- lda(Direction ~ Lag1 + Lag2 + Lag3 + Lag4, data = Weekly[train, ])
pred <- predict(fit, Weekly[!train, ], type = "response")$class
mean(pred == Weekly[!train, ]$Direction)

@@ -658,7 +657,7 @@ variables are collinear.

```{r}
set.seed(1)
train <- sample(seq_len(nrow(x)), nrow(x) * 2/3)
train <- sample(seq_len(nrow(x)), nrow(x) * 2 / 3)
```

> d. Perform LDA on the training data in order to predict `mpg01` using the
@@ -787,8 +786,8 @@ Power3 <- function(x, a) {
> `log = "y"`, or `log = "xy"` as arguments to the `plot()` function.

```{r}
plot(1:10, Power3(1:10, 2),
xlab = "x",
plot(1:10, Power3(1:10, 2),
xlab = "x",
ylab = expression(paste("x"^"2")),
log = "y"
)
@@ -806,7 +805,7 @@ plot(1:10, Power3(1:10, 2),
```{r}
PlotPower <- function(x, a, log = "y") {
plot(x, Power3(x, a),
xlab = "x",
xlab = "x",
ylab = substitute("x"^a, list(a = a)),
log = log
)
@@ -827,11 +826,11 @@ PlotPower(1:10, 3)

```{r}
x <- cbind(
ISLR2::Boston[, -1],
ISLR2::Boston[, -1],
data.frame("highcrim" = Boston$crim > median(Boston$crim))
)
set.seed(1)
train <- sample(seq_len(nrow(x)), nrow(x) * 2/3)
train <- sample(seq_len(nrow(x)), nrow(x) * 2 / 3)
```

We can find the most strongly associated variables by performing Wilcoxon rank-sum tests.
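
A minimal sketch of one way to build that ranking (an assumed implementation for illustration; it produces an `ord` vector of predictor names ordered by p-value, which the plotting code below relies on):

```{r}
# Sketch (assumed): rank each predictor by the p-value of a Wilcoxon
# rank-sum test of its training-set values split by `highcrim`.
xt <- x[train, ]
pvals <- sapply(setdiff(names(xt), "highcrim"), function(v) {
  wilcox.test(xt[xt$highcrim, v], xt[!xt$highcrim, v], exact = FALSE)$p.value
})
ord <- names(sort(pvals)) # strongest association (smallest p-value) first
```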
@@ -861,8 +860,8 @@ Let's look at univariate associations with `highcrim` (in the training data)
x[train, ] |>
pivot_longer(!highcrim) |>
mutate(name = factor(name, levels = ord)) |>
ggplot(aes(highcrim, value)) +
geom_boxplot() +
ggplot(aes(highcrim, value)) +
geom_boxplot() +
facet_wrap(~name, scale = "free")
```

@@ -902,9 +901,9 @@ res <- sapply(1:12, function(max) fit_models(1:max))
res <- as_tibble(t(res))
res$n_var <- 1:12
pivot_longer(res, cols = !n_var) |>
ggplot(aes(n_var, value, col = name)) +
geom_line() +
xlab("Number of predictors") +
ggplot(aes(n_var, value, col = name)) +
geom_line() +
xlab("Number of predictors") +
ylab("Error rate")
```

22 changes: 11 additions & 11 deletions 07-moving-beyond-linearity.Rmd
@@ -175,9 +175,9 @@ grid()

```{r}
x <- seq(-2, 6, length.out = 1000)
b1 <- function(x) I(0 <= x & x <= 2) - (x - 1) * I(1 <= x & x <= 2)
b2 <- function(x) (x - 3) * I(3 <= x & x <= 4) + I(4 < x & x <= 5)
f <- function(x) 1 + 1*b1(x) + 3*b2(x)
b1 <- function(x) I(0 <= x & x <= 2) - (x - 1) * I(1 <= x & x <= 2)
b2 <- function(x) (x - 3) * I(3 <= x & x <= 4) + I(4 < x & x <= 5)
f <- function(x) 1 + 1 * b1(x) + 3 * b2(x)
plot(x, f(x), type = "l")
grid()
```
@@ -364,7 +364,7 @@ err5 <- mean(replicate(10, {
c(err, err1, err2, err3, err4, err5)
anova(fit, fit1, fit2, fit3, fit4, fit5)

x <- seq(min(Auto$horsepower), max(Auto$horsepower), length.out=1000)
x <- seq(min(Auto$horsepower), max(Auto$horsepower), length.out = 1000)
pred <- data.frame(
x = x,
"Linear" = predict(fit, data.frame(horsepower = x)),
@@ -407,7 +407,7 @@ lines(x, predict(fit, data.frame(dis = x)), col = "red", lty = 2)
```{r}
fits <- lapply(1:10, function(i) glm(nox ~ poly(dis, i), data = Boston))

x <- seq(min(Boston$dis), max(Boston$dis), length.out=1000)
x <- seq(min(Boston$dis), max(Boston$dis), length.out = 1000)
pred <- data.frame(lapply(fits, function(fit) predict(fit, data.frame(dis = x))))
colnames(pred) <- 1:10
pred$x <- x
@@ -604,7 +604,7 @@ beta1 <- 20
> ```

```{r}
a <- y - beta1*x1
a <- y - beta1 * x1
beta2 <- lm(a ~ x2)$coef[2]
```

@@ -633,15 +633,15 @@ res <- matrix(NA, nrow = 1000, ncol = 3)
colnames(res) <- c("beta0", "beta1", "beta2")
beta1 <- 20
for (i in 1:1000) {
beta2 <- lm(y - beta1*x1 ~ x2)$coef[2]
beta1 <- lm(y - beta2*x2 ~ x1)$coef[2]
beta0 <- lm(y - beta2*x2 ~ x1)$coef[1]
beta2 <- lm(y - beta1 * x1 ~ x2)$coef[2]
beta1 <- lm(y - beta2 * x2 ~ x1)$coef[2]
beta0 <- lm(y - beta2 * x2 ~ x1)$coef[1]
res[i, ] <- c(beta0, beta1, beta2)
}
res <- as.data.frame(res)
res$Iteration <- 1:1000
res <- pivot_longer(res, !Iteration)
p <- ggplot(res, aes(x=Iteration, y=value, color=name)) +
p <- ggplot(res, aes(x = Iteration, y = value, color = name)) +
geom_line() +
geom_point() +
scale_x_continuous(trans = "log10")
@@ -682,7 +682,7 @@ n <- 1000

betas <- rnorm(p) * 5
x <- matrix(rnorm(n * p), ncol = p, nrow = n)
y <- (x %*% betas) + rnorm(n) # ignore beta0 for simplicity
y <- (x %*% betas) + rnorm(n) # ignore beta0 for simplicity

# multiple regression
fit <- lm(y ~ x - 1)