-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathproject-template.Rmd
315 lines (258 loc) · 12.1 KB
/
project-template.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
---
title: "Your Project Title (be descriptive and creative!)"
author: "Group 35: Max Ming Yi Koh (1007972785) and Kevin Le (1007952805)"
date: March 31, 2022
output:
beamer_presentation:
theme: "Pittsburgh"
colortheme: "orchid"
fonttheme: "structurebold"
slide_level: 2
classoption: "aspectratio=169"
fontsize: 11pt
urlcolor: blue
---
```{r, echo=FALSE, message=FALSE, warning=FALSE}
# echo=FALSE will stop the code chunk from appearing in the knit document
# warning=FALSE and message=FALSE will stop R messages from appearing in the knit document
library(tidyverse)
library(dplyr)
# here is the data for this project
expedia_data <- read_csv("ExpediaSearchData.csv")
glimpse(expedia_data)
# see the Project Overview page for more information:
# https://q.utoronto.ca/courses/235890/pages/project-overview
```
## Overall Introductions
## Data Summary
## Research Question 1
Does urgency affect how much consumers are willing to pay?
---
# TODO: Add visualization
```{r}
expedia_data_q1 <- expedia_data %>%
select(is_trans1, is_trans2, is_trans3) %>%
mutate(is_trans1 = ifelse(!is.na(is_trans1) & is_trans1 == 1, 1, 0),
is_trans2 = ifelse(!is.na(is_trans2) & is_trans2 == 1, 1, 0),
is_trans3 = ifelse(!is.na(is_trans3) & is_trans3 == 1, 1, 0),
trans_made = ifelse(is_trans1 + is_trans2 + is_trans3 > 0, TRUE, FALSE))
ggplot(data = expedia_data_q1, aes(x=trans_made)) +
geom_bar(color="black",
fill="gray") +
labs(x="Transaction made", y="Property count",
title = "TODO: add title")
# expedia_data_q1 <- expedia_data %>%
# select(is_trans1, is_trans2, is_trans3, price_bucket1, price_bucket2, price_bucket3,
# checkin_date, search_timestamp, destination_id) %>%
# filter(is_trans1 + is_trans2 + is_trans3 == 1 &
# !is.na(checkin_date) & !is.na(search_timestamp) & !is.na(destination_id)) %>%
# mutate(
# price_bucket1 = ifelse(is.na(is_trans1) | is_trans1 == 0, 0, price_bucket1),
# price_bucket2 = ifelse(is.na(is_trans2) | is_trans2 == 0, 0, price_bucket2),
# price_bucket3 = ifelse(is.na(is_trans3) | is_trans3 == 0, 0, price_bucket2),
# price_bucket = (price_bucket1 + price_bucket2 + price_bucket3) / (is_trans1 + is_trans2 + is_trans3),
# search_timestamp = as.Date(as.character(search_timestamp)))
# expedia_data_q1 %>% ggplot(aes(x=checkin_date - search_timestamp, y=price_bucket)) +
# geom_point() +
# labs(x = "TODO: add x-axis label",
# y = "TODO: add y-axis label",
# title = "TODO: add title") +
# geom_smooth(method="lm", se=FALSE) +
# theme_minimal()
```
---
# TODO: Set up statistical model
```{r}
set.seed(15)
sample_means <- rep(NA, 1000)
for(i in 1:1000) {
# sample for new 25 rows each iteration
sample <- expedia_data_q1 %>% sample_n(size = 1000, replace = TRUE)
sample_means[i] <- as.numeric(sample %>% summarize(mean(trans_made)))
}
# convert into a tibble
sample_means = tibble(trans_proportion = sample_means)
quantile(sample_means$trans_proportion, c(0.025, 0.975))
# expedia_data_q2 <- expedia_data %>%
# select(is_trans1, is_trans2, is_trans3, price_bucket1, price_bucket2, price_bucket3,
# checkin_date, search_timestamp, destination_id) %>%
# filter(is_trans1 + is_trans2 + is_trans3 == 1 &
# !is.na(checkin_date) & !is.na(search_timestamp) & !is.na(destination_id)) %>%
# mutate(
# price_bucket1 = ifelse(is.na(is_trans1) | is_trans1 == 0, 0, price_bucket1),
# price_bucket2 = ifelse(is.na(is_trans2) | is_trans2 == 0, 0, price_bucket2),
# price_bucket3 = ifelse(is.na(is_trans3) | is_trans3 == 0, 0, price_bucket2),
# price_bucket = (price_bucket1 + price_bucket2 + price_bucket3) / (is_trans1 + is_trans2 + is_trans3),
# search_timestamp = as.Date(as.character(search_timestamp)))
# expedia_data_q1 %>% ggplot(aes(x=checkin_date - search_timestamp, y=price_bucket)) +
# geom_point() +
# labs(x = "TODO: add x-axis label",
# y = "TODO: add y-axis label",
# title = "TODO: add title") +
# geom_smooth(method="lm", se=FALSE) +
# theme_minimal()
```
---
# TODO: Analyze results from statistical model
- we are 95% confident that the number of people who transact one of the top 3 listings on Expedia is (0.003, 0.015). This means that if we were to repeat the experiment ...
## Research Question 2
Are properties with discounted prices more likely to be sold?
Motivation of research question:
---
# TODO: Add visualization
---
# TODO: Set up statistical model
```{r}
untidy_expedia_data_q2 <- expedia_data %>%
select(num_clicks1, is_drr1, num_clicks2, is_drr2, num_clicks3, is_drr3)
expedia_data_q2 <- tibble(num_clicks = c(NA), is_drr = c(NA))
for (i in 1:nrow(untidy_expedia_data_q2)) {
expedia_data_q2 <- add_row(expedia_data_q2,
num_clicks = as.numeric(untidy_expedia_data_q2[i,"num_clicks1"]),
is_drr = as.numeric(untidy_expedia_data_q2[i,"is_drr1"]))
expedia_data_q2 <- add_row(expedia_data_q2,
num_clicks = as.numeric(untidy_expedia_data_q2[i,"num_clicks2"]),
is_drr = as.numeric(untidy_expedia_data_q2[i,"is_drr2"]))
expedia_data_q2 <- add_row(expedia_data_q2,
num_clicks = as.numeric(untidy_expedia_data_q2[i,"num_clicks2"]),
is_drr = as.numeric(untidy_expedia_data_q2[i,"is_drr2"]))
}
expedia_data_q2 <- expedia_data_q2 %>%
filter(!is.na(num_clicks) & !is.na(is_drr))
is_drr_mean <- expedia_data_q2 %>%
group_by(is_drr) %>%
summarize(means = mean(num_clicks))
# diff is a function that calculates the difference between values in a vector
diff_means <- is_drr_mean %>%
summarize(test_stat = diff(means))
test_stat <- as.numeric(diff_means)
# sampling
set.seed(15)
simulated_values <- rep(NA, 1000)
for(i in 1:1000) {
simdata <- expedia_data_q2 %>%
mutate(is_drr = sample(is_drr))
sim_value <- simdata %>%
group_by(is_drr) %>%
summarize(mean_clicks=mean(num_clicks)) %>%
summarize(value = diff(mean_clicks))
simulated_values[i] <- as.numeric(sim_value)
}
# visualizing the sampling distribution
sim <- tibble(mean_diff = simulated_values)
# calculating the p-value
num_more_extreme <- sim %>%
filter(abs(mean_diff) >= abs(diff_means-test_stat)) %>%
summarize(n=n())
p_value <- as.numeric(num_more_extreme) / 1000
p_value
```
---
# TODO: Analyze results from statistical model
- p value
## Research Question 3
Do the number of children and adults affect the length of travel?
---
# TODO: Add visualization
```{r}
expedia_data_q3 <- expedia_data %>%
select(search_timestamp, adult_count, child_count, checkin_date) %>%
filter(!is.na(checkin_date) & !is.na(search_timestamp)) %>%
mutate(travel_length = as.numeric(as.Date(as.character(checkin_date)) - as.Date(as.character(search_timestamp))),
has_child = case_when(child_count > 0 ~ TRUE,
child_count == 0 ~ FALSE))
glimpse(expedia_data_q3)
expedia_data_q3 %>% ggplot(aes(x=child_count, y= travel_length)) +
geom_point() +
labs(x="number of children",
y="length of travel (days)",
title = "TODO: add title") +
theme_minimal()
expedia_data_q3 %>% ggplot(aes(x=adult_count, y= travel_length)) +
geom_point() +
geom_smooth(se=FALSE, method="lm") +
labs(x="number of adults",
y="length of travel (days)",
title = "TODO: add title") +
theme_minimal()
# model 3's starter code
model3 <- lm(travel_length ~ adult_count + has_child, data = expedia_data_q3)
expedia_data_q3 %>% ggplot(aes(x = adult_count, y = travel_length, colour = has_child)) +
labs(x = "number of adults", y = "length of travel (days)",
title = "TODO: add title") +
geom_smooth(method = "lm", se = FALSE) +
geom_point()
# model 4's starter code
library(broom)
model4 <- lm(travel_length ~ adult_count * has_child, data = expedia_data_q3)
augment(model3) # creates dataframe with predicted value in .fitted column
# (y_i hat). There is 1 row for each observation in the training data
expedia_data_q3 %>% ggplot(aes(x = adult_count, y = travel_length, color = has_child)) +
geom_point(alpha = 0.5) + # alpha controls transparency of points
geom_line(data = augment(model3), aes(y = .fitted, colour = has_child)) +
labs(x = "number of adults",y = "length of travel (days)",
title = "TODO: add title")
```
---
# TODO: Set up statistical model
```{r}
# 4 models (adult vs length, children vs length, adult + children vs length, adult * children vs length)
set.seed(12)
n <- nrow(expedia_data_q3)
training_indices <- sample(1:n, size=round(0.8 * n))
train <- expedia_data_q3 %>% filter(rowid %in% training_indices)
y_train <- train$travel_length
test <- expedia_data_q3 %>% filter(!(rowid %in% training_indices))
y_test <- test$travel_length
mod1_train <- lm(travel_length ~ adult_count, data = train)
mod2_train <- lm(travel_length ~ child_count, data = train)
mod3_train <- lm(travel_length ~ adult_count + child_count, data = train)
mod4_train <- lm(travel_length ~ adult_count * child_count, data = train)
yhat_mod1_test <- predict(mod1_train, newdata = test)
yhat_mod2_test <- predict(mod2_train, newdata = test)
yhat_mod3_test <- predict(mod3_train, newdata = test)
yhat_mod4_test <- predict(mod4_train, newdata = test)
yhat_mod1_train <- predict(mod1_train, newdata = train)
yhat_mod2_train <- predict(mod2_train, newdata = train)
yhat_mod3_train <- predict(mod3_train, newdata = train)
yhat_mod4_train <- predict(mod4_train, newdata = train)
mod1_test_RMSE <- sqrt(sum((y_test - yhat_mod1_test)^2 / nrow(test)))
mod2_test_RMSE <- sqrt(sum((y_test - yhat_mod2_test)^2 / nrow(test)))
mod3_test_RMSE <- sqrt(sum((y_test - yhat_mod3_test)^2 / nrow(test)))
mod4_test_RMSE <- sqrt(sum((y_test - yhat_mod4_test)^2 / nrow(test)))
mod1_train_RMSE <- sqrt(sum((y_train - yhat_mod1_train)^2 / nrow(train)))
mod2_train_RMSE <- sqrt(sum((y_train - yhat_mod2_train)^2 / nrow(train)))
mod3_train_RMSE <- sqrt(sum((y_train - yhat_mod3_train)^2 / nrow(train)))
mod4_train_RMSE <- sqrt(sum((y_train - yhat_mod4_train)^2 / nrow(train)))
my_table <- tibble(Model = c("1", "2", "3", "4"),
RMSE_testdata = c(mod1_test_RMSE, mod2_test_RMSE,
mod3_test_RMSE, mod4_test_RMSE),
RMSE_traindata = c(mod1_train_RMSE, mod2_train_RMSE,
mod3_train_RMSE, mod4_train_RMSE),
ratio_of_RMSEs = RMSE_testdata / RMSE_traindata)
my_table
summary(mod1_train)$coefficients
summary(mod2_train)$coefficients
summary(mod3_train)$coefficients
summary(mod4_train)$coefficients
```
---
# TODO: Analyze results from statistical model
- mention that 4 models are tested
- talk about model 1 being the best
* simple (in terms of # of predictors)
* RMSE ratio fairly similar in all 4 models (no overfitting)
* RMSE ratio for test data is fairly similar (equally accurate)
- based on p-value of 6.675124e-07, there is very strong evidence against the null hypothesis that ...
- based on the slope of 9.52, it means that on average for each increase in adult in a trip, the trip's length increases by 9.52 days.
- variation of data is weak as 90% of the dataset are with searchs with no children/ infants
- p-value for the child_count model is approximately 0.563 which suggests that there is weak evidence against the null hypothesis
- p-value for the adult_count model is relatively small (p-value < 5.393e-08) which suggests very strong evidience against the null hypothesis
- p-value for the infant_count model is relatively small (p-value < 1.393e-02) which suggests strong evidience against the null hypothesis
~ more onto the infant sample, it is noticed that only 2 searches have infants in their group which causes small generalizations to be yield here
- all R^2 values (coefficient of determination) yields a very small value for all three of the models which could be due to the fact of low variation in the data
- we can conclude that the number of adults will lead to greater travel lengths
## Limitations
- assume that the data filled in is accurate
- sample is representative of population (1000 people)
## Overall Conclusion