Formatting updates #19

Merged
merged 1 commit on Jan 3, 2025
28 changes: 15 additions & 13 deletions 02-statistical-learning.Rmd
@@ -302,7 +302,7 @@ college$Elite <- factor(ifelse(college$Top10perc > 50, "Yes", "No"))
summary(college$Elite)
plot(college$Outstate ~ college$Elite, xlab = "Elite", ylab = "Outstate")

par(mfrow = c(2,2))
par(mfrow = c(2, 2))
for (n in c(5, 10, 20, 50)) {
hist(college$Enroll, breaks = n, main = paste("n =", n), xlab = "Enroll")
}
@@ -374,7 +374,7 @@ x[-(10:85), numeric] |>
> the relationships among the predictors. Comment on your findings.

```{r}
pairs(x[, numeric], cex = 0.2)
pairs(x[, numeric], cex = 0.2)
cor(x[, numeric]) |>
kable()

@@ -425,8 +425,10 @@ library(tidyverse)
```

```{r}
ggplot(Boston, aes(nox, rm)) + geom_point()
ggplot(Boston, aes(ptratio, rm)) + geom_point()
ggplot(Boston, aes(nox, rm)) +
geom_point()
ggplot(Boston, aes(ptratio, rm)) +
geom_point()
heatmap(cor(Boston, method = "spearman"), cexRow = 1.1, cexCol = 1.1)
```

@@ -440,12 +442,12 @@ Yes
> predictor.

```{r}
Boston |>
pivot_longer(cols = 1:13) |>
filter(name %in% c("crim", "tax", "ptratio")) |>
ggplot(aes(value)) +
geom_histogram(bins = 20) +
facet_wrap(~name, scales="free", ncol= 1)
Boston |>
pivot_longer(cols = 1:13) |>
filter(name %in% c("crim", "tax", "ptratio")) |>
ggplot(aes(value)) +
geom_histogram(bins = 20) +
facet_wrap(~name, scales = "free", ncol = 1)
```

Yes, particularly crime and tax rates.
@@ -496,9 +498,9 @@ Boston |>
select(-c(crim, zn)) |>
pivot_longer(!rm) |>
mutate(">8 rooms" = rm > 8) |>
ggplot(aes(`>8 rooms`, value)) +
geom_boxplot() +
facet_wrap(~name, scales = "free")
ggplot(aes(`>8 rooms`, value)) +
geom_boxplot() +
facet_wrap(~name, scales = "free")
```

Census tracts with large properties (more than eight rooms per dwelling, on average)
36 changes: 20 additions & 16 deletions 03-linear-regression.Rmd
@@ -67,11 +67,11 @@ library(plotly)
```{r}
model <- function(gpa, iq, level) {
50 +
gpa * 20 +
iq * 0.07 +
level * 35 +
gpa * iq * 0.01 +
gpa * level * -10
gpa * 20 +
iq * 0.07 +
level * 35 +
gpa * iq * 0.01 +
gpa * level * -10
}
x <- seq(1, 5, length = 10)
y <- seq(1, 200, length = 20)
@@ -82,15 +82,18 @@ plot_ly(x = x, y = y) |>
add_surface(
z = ~college,
colorscale = list(c(0, 1), c("rgb(107,184,214)", "rgb(0,90,124)")),
colorbar = list(title = "College")) |>
colorbar = list(title = "College")
) |>
add_surface(
z = ~high_school,
colorscale = list(c(0, 1), c("rgb(255,112,184)", "rgb(128,0,64)")),
colorbar = list(title = "High school")) |>
colorbar = list(title = "High school")
) |>
layout(scene = list(
xaxis = list(title = "GPA"),
yaxis = list(title = "IQ"),
zaxis = list(title = "Salary")))
zaxis = list(title = "Salary")
))
```

Option iii is correct.
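
A quick numerical check using the `model()` function defined above (the GPA and IQ values are arbitrary, chosen only for illustration, and `level = 1` is assumed to code a college graduate, as in the exercise): at a high GPA the predicted salary is larger for the high-school graduate, which is what option iii claims.

```{r}
# Illustrative check of option iii: compare predicted salaries at a high GPA.
# Assumes level = 1 codes "college" and level = 0 codes "high school".
model(gpa = 4.0, iq = 110, level = 1) # college graduate
model(gpa = 4.0, iq = 110, level = 0) # high school graduate (higher at GPA = 4)
```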
@@ -366,7 +369,7 @@ par(mfrow = c(2, 2))
plot(Auto$horsepower, Auto$mpg, cex = 0.2)
plot(log(Auto$horsepower), Auto$mpg, cex = 0.2)
plot(sqrt(Auto$horsepower), Auto$mpg, cex = 0.2)
plot(Auto$horsepower ^ 2, Auto$mpg, cex = 0.2)
plot(Auto$horsepower^2, Auto$mpg, cex = 0.2)

x <- subset(Auto, select = -name)
x$horsepower <- log(x$horsepower)
@@ -553,7 +556,7 @@ We can show this numerically in R by computing $t$ using the above equation.

```{r}
n <- length(x)
sqrt(n - 1) * sum(x * y) / sqrt(sum(x ^ 2) * sum(y ^ 2) - sum(x * y) ^ 2)
sqrt(n - 1) * sum(x * y) / sqrt(sum(x^2) * sum(y^2) - sum(x * y)^2)
```

> e. Using the results from (d), argue that the _t_-statistic for the
@@ -846,9 +849,9 @@ contributions.
> answers.

```{r}
x1 <- c(x1 , 0.1)
x2 <- c(x2 , 0.8)
y <- c(y ,6)
x1 <- c(x1, 0.1)
x2 <- c(x2, 0.8)
y <- c(y, 6)
summary(lm(y ~ x1 + x2))
summary(lm(y ~ x1))
summary(lm(y ~ x2))
@@ -929,9 +932,10 @@ The results from (b) show reduced significance compared to the models fit in
(a).

```{r}
plot(sapply(fits, function(x) coef(x)[2]), coef(mfit)[-1],
xlab = "Univariate regression",
ylab = "multiple regression")
plot(sapply(fits, function(x) coef(x)[2]), coef(mfit)[-1],
xlab = "Univariate regression",
ylab = "multiple regression"
)
```

The estimated coefficients differ (in particular the estimated coefficient for
45 changes: 22 additions & 23 deletions 04-classification.Rmd
@@ -23,11 +23,10 @@ $$
Letting $x = e^{\beta_0 + \beta_1X}$

\begin{align}
\frac{P(X)}{1-p(X)} &= \frac{\frac{x}{1 + x}}
{1 - \frac{x}{1 + x}} \\
&= \frac{\frac{x}{1 + x}}
{\frac{1}{1 + x}} \\
&= x
\frac{P(X)}{1-p(X)}
&= \frac{\frac{x}{1 + x}} {1 - \frac{x}{1 + x}} \\
&= \frac{\frac{x}{1 + x}} {\frac{1}{1 + x}} \\
&= x
\end{align}
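
A quick numerical check of this identity (the values of $\beta_0$, $\beta_1$, and $X$ below are arbitrary, for illustration only):

```{r}
# The odds p/(1 - p) should equal e^(beta0 + beta1 * X) exactly.
b0 <- -2; b1 <- 0.5; X <- 3
p <- exp(b0 + b1 * X) / (1 + exp(b0 + b1 * X))
c(odds = p / (1 - p), exp_linear = exp(b0 + b1 * X))
```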

### Question 2
@@ -65,7 +64,7 @@ therefore, we can consider maximizing $\log(p_K(X))$

$$
\log(p_k(x)) = \log(\pi_k) - \frac{1}{2\sigma^2}(x - \mu_k)^2 -
\log\left(\sum_{l=1}^k \pi_l \exp\left(-\frac{1}{2\sigma^2}(x - \mu_l)^2\right)\right)
\log\left(\sum_{l=1}^k \pi_l \exp\left(-\frac{1}{2\sigma^2}(x - \mu_l)^2\right)\right)
$$

Remember that we are maximizing over $k$, and since the last term does not
@@ -265,7 +264,7 @@ when $X_1 = 40$ and $X_2 = 3.5$, $p(X) = 0.38$
> chance of getting an A in the class?

We would like to solve for $X_1$ where $p(X) = 0.5$. Taking the first equation
above, we need to solve $0 = 6 + 0.05X_1 + 3.5$, so $X_1 = 50$ hours.
above, we need to solve $0 = -6 + 0.05X_1 + 3.5$, so $X_1 = 50$ hours.
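
A quick check of this arithmetic in R, using the coefficients given in the exercise ($\hat\beta_0 = -6$, $\hat\beta_1 = 0.05$, $\hat\beta_2 = 1$):

```{r}
# Probability of an A given hours studied (x1) and GPA (x2).
p <- function(x1, x2) plogis(-6 + 0.05 * x1 + 1 * x2)
p(40, 3.5) # ~0.38, as above
p(50, 3.5) # 0.5 at 50 hours of study
```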

### Question 7

@@ -305,7 +304,7 @@ p(D|v) &= \frac{p(v|D) p(D)}{p(v|D)p(D) + p(v|N)p(N)} \\
\end{align}

```{r}
exp(-0.5) * 0.8 / (exp(-0.5) * 0.8 + exp(-2/9) * 0.2)
exp(-0.5) * 0.8 / (exp(-0.5) * 0.8 + exp(-2 / 9) * 0.2)
```

### Question 8
@@ -412,7 +411,7 @@ $$
(\hat\alpha_{orange0} - \hat\alpha_{apple0}) + (\hat\alpha_{orange1} - \hat\alpha_{apple1})x
$$

> c. Suppose that in your model, $\hat\beta_0 = 2$ and $\hat\beta = 1$. What
> c. Suppose that in your model, $\hat\beta_0 = 2$ and $\hat\beta = -1$. What
> are the coefficient estimates in your friend's model? Be as specific as
> possible.

@@ -423,7 +422,7 @@ We are unable to know the specific value of each parameter, however.

> d. Now suppose that you and your friend fit the same two models on a different
> data set. This time, your friend gets the coefficient estimates
> $\hat\alpha_{orange0} = 1.2$, $\hat\alpha_{orange1} = 2$,
> $\hat\alpha_{orange0} = 1.2$, $\hat\alpha_{orange1} = -2$,
> $\hat\alpha_{apple0} = 3$, $\hat\alpha_{apple1} = 0.6$. What are the
> coefficient estimates in your model?
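
Assuming, as in part (c), that the single-pair coefficients equal the corresponding differences of the two-class coefficients ($\hat\beta_0 = \hat\alpha_{orange0} - \hat\alpha_{apple0}$, $\hat\beta_1 = \hat\alpha_{orange1} - \hat\alpha_{apple1}$), a quick worked computation is:

```{r}
# Worked arithmetic for (d): beta0_hat = 1.2 - 3, beta1_hat = -2 - 0.6.
c(beta0 = 1.2 - 3, beta1 = -2 - 0.6)
```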

@@ -571,7 +570,7 @@ fit <- glm(Direction ~ Lag3, data = Weekly[train, ], family = binomial)
pred <- predict(fit, Weekly[!train, ], type = "response") > 0.5
mean(ifelse(pred, "Up", "Down") == Weekly[!train, ]$Direction)

fit <- glm(Direction ~Lag4, data = Weekly[train, ], family = binomial)
fit <- glm(Direction ~ Lag4, data = Weekly[train, ], family = binomial)
pred <- predict(fit, Weekly[!train, ], type = "response") > 0.5
mean(ifelse(pred, "Up", "Down") == Weekly[!train, ]$Direction)

@@ -583,7 +582,7 @@ fit <- glm(Direction ~ Lag1 * Lag2 * Lag3 * Lag4, data = Weekly[train, ], family
pred <- predict(fit, Weekly[!train, ], type = "response") > 0.5
mean(ifelse(pred, "Up", "Down") == Weekly[!train, ]$Direction)

fit <- lda(Direction ~ Lag1 + Lag2 + Lag3 + Lag4,data = Weekly[train, ])
fit <- lda(Direction ~ Lag1 + Lag2 + Lag3 + Lag4, data = Weekly[train, ])
pred <- predict(fit, Weekly[!train, ], type = "response")$class
mean(pred == Weekly[!train, ]$Direction)

@@ -658,7 +657,7 @@ variables are collinear.

```{r}
set.seed(1)
train <- sample(seq_len(nrow(x)), nrow(x) * 2/3)
train <- sample(seq_len(nrow(x)), nrow(x) * 2 / 3)
```

> d. Perform LDA on the training data in order to predict `mpg01` using the
@@ -787,8 +786,8 @@ Power3 <- function(x, a) {
> `log = "y"`, or `log = "xy"` as arguments to the `plot()` function.

```{r}
plot(1:10, Power3(1:10, 2),
xlab = "x",
plot(1:10, Power3(1:10, 2),
xlab = "x",
ylab = expression(paste("x"^"2")),
log = "y"
)
@@ -806,7 +805,7 @@ plot(1:10, Power3(1:10, 2),
```{r}
PlotPower <- function(x, a, log = "y") {
plot(x, Power3(x, a),
xlab = "x",
xlab = "x",
ylab = substitute("x"^a, list(a = a)),
log = log
)
@@ -827,11 +826,11 @@ PlotPower(1:10, 3)

```{r}
x <- cbind(
ISLR2::Boston[, -1],
ISLR2::Boston[, -1],
data.frame("highcrim" = Boston$crim > median(Boston$crim))
)
set.seed(1)
train <- sample(seq_len(nrow(x)), nrow(x) * 2/3)
train <- sample(seq_len(nrow(x)), nrow(x) * 2 / 3)
```

We can find the most strongly associated variables by performing Wilcoxon rank-sum tests.
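
A minimal sketch of one way to build that ranking (an assumed implementation for illustration; it produces an `ord` vector of predictor names ordered by p-value, which the plotting code below relies on):

```{r}
# Sketch (assumed): rank each predictor by the p-value of a Wilcoxon
# rank-sum test of its training-set values split by `highcrim`.
xt <- x[train, ]
pvals <- sapply(setdiff(names(xt), "highcrim"), function(v) {
  wilcox.test(xt[xt$highcrim, v], xt[!xt$highcrim, v], exact = FALSE)$p.value
})
ord <- names(sort(pvals)) # strongest association (smallest p-value) first
```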
@@ -861,8 +860,8 @@ Let's look at univariate associations with `highcrim` (in the training data)
x[train, ] |>
pivot_longer(!highcrim) |>
mutate(name = factor(name, levels = ord)) |>
ggplot(aes(highcrim, value)) +
geom_boxplot() +
ggplot(aes(highcrim, value)) +
geom_boxplot() +
facet_wrap(~name, scale = "free")
```

@@ -902,9 +901,9 @@ res <- sapply(1:12, function(max) fit_models(1:max))
res <- as_tibble(t(res))
res$n_var <- 1:12
pivot_longer(res, cols = !n_var) |>
ggplot(aes(n_var, value, col = name)) +
geom_line() +
xlab("Number of predictors") +
ggplot(aes(n_var, value, col = name)) +
geom_line() +
xlab("Number of predictors") +
ylab("Error rate")
```

22 changes: 11 additions & 11 deletions 07-moving-beyond-linearity.Rmd
@@ -175,9 +175,9 @@ grid()

```{r}
x <- seq(-2, 6, length.out = 1000)
b1 <- function(x) I(0 <= x & x <= 2) - (x - 1) * I(1 <= x & x <= 2)
b2 <- function(x) (x - 3) * I(3 <= x & x <= 4) + I(4 < x & x <= 5)
f <- function(x) 1 + 1*b1(x) + 3*b2(x)
b1 <- function(x) I(0 <= x & x <= 2) - (x - 1) * I(1 <= x & x <= 2)
b2 <- function(x) (x - 3) * I(3 <= x & x <= 4) + I(4 < x & x <= 5)
f <- function(x) 1 + 1 * b1(x) + 3 * b2(x)
plot(x, f(x), type = "l")
grid()
```
@@ -364,7 +364,7 @@ err5 <- mean(replicate(10, {
c(err, err1, err2, err3, err4, err5)
anova(fit, fit1, fit2, fit3, fit4, fit5)

x <- seq(min(Auto$horsepower), max(Auto$horsepower), length.out=1000)
x <- seq(min(Auto$horsepower), max(Auto$horsepower), length.out = 1000)
pred <- data.frame(
x = x,
"Linear" = predict(fit, data.frame(horsepower = x)),
@@ -407,7 +407,7 @@ lines(x, predict(fit, data.frame(dis = x)), col = "red", lty = 2)
```{r}
fits <- lapply(1:10, function(i) glm(nox ~ poly(dis, i), data = Boston))

x <- seq(min(Boston$dis), max(Boston$dis), length.out=1000)
x <- seq(min(Boston$dis), max(Boston$dis), length.out = 1000)
pred <- data.frame(lapply(fits, function(fit) predict(fit, data.frame(dis = x))))
colnames(pred) <- 1:10
pred$x <- x
@@ -604,7 +604,7 @@ beta1 <- 20
> ```

```{r}
a <- y - beta1*x1
a <- y - beta1 * x1
beta2 <- lm(a ~ x2)$coef[2]
```

@@ -633,15 +633,15 @@ res <- matrix(NA, nrow = 1000, ncol = 3)
colnames(res) <- c("beta0", "beta1", "beta2")
beta1 <- 20
for (i in 1:1000) {
beta2 <- lm(y - beta1*x1 ~ x2)$coef[2]
beta1 <- lm(y - beta2*x2 ~ x1)$coef[2]
beta0 <- lm(y - beta2*x2 ~ x1)$coef[1]
beta2 <- lm(y - beta1 * x1 ~ x2)$coef[2]
beta1 <- lm(y - beta2 * x2 ~ x1)$coef[2]
beta0 <- lm(y - beta2 * x2 ~ x1)$coef[1]
res[i, ] <- c(beta0, beta1, beta2)
}
res <- as.data.frame(res)
res$Iteration <- 1:1000
res <- pivot_longer(res, !Iteration)
p <- ggplot(res, aes(x=Iteration, y=value, color=name)) +
p <- ggplot(res, aes(x = Iteration, y = value, color = name)) +
geom_line() +
geom_point() +
scale_x_continuous(trans = "log10")
@@ -682,7 +682,7 @@ n <- 1000

betas <- rnorm(p) * 5
x <- matrix(rnorm(n * p), ncol = p, nrow = n)
y <- (x %*% betas) + rnorm(n) # ignore beta0 for simplicity
y <- (x %*% betas) + rnorm(n) # ignore beta0 for simplicity

# multiple regression
fit <- lm(y ~ x - 1)