-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.R
183 lines (140 loc) · 6.88 KB
/
main.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
######################################## Import libraries ##########################################
library(data.table)
library(dplyr)
library(plotly)
library(zoo)
library(ggplot2)
library(tsbox)
library(ggfortify)
library(forecast)
library(tseries)
library(prophet)
##################################### Read Dataset ##############################################
dataset <- fread("owid-covid-data-USA.csv")
columns <- c("date","total_deaths","new_deaths","total_cases","new_cases") # Select necessary columns.
data2 <- select(dataset, colnames<-columns)
d_col <- as.Date(data2$date, format = "%m/%d/%Y") # Change Date column to date format.
day <- as.numeric(format(d_col, "%d"))
month <- as.numeric(format(d_col, "%m"))
year <- as.numeric(format(d_col, "%Y"))
data2 <- cbind(data2, d_col, day, month, year)
###################################### Missing values plot ######################################
new_columns = c("date","total_deaths","new_deaths","total_cases","new_cases","d_col", "day","month","year")
sum_na = colSums(is.na(data2))
temp = data.frame(sum_na)
temp %>%
plot_ly(x=new_columns,y=~sum_na, type='bar') %>%
layout(title = 'Missing values count')
######################################### Fill missing cells ########################################
data2$total_deaths = ifelse(is.na(data2$total_deaths),
0, data2$total_deaths)
data2$new_deaths = ifelse(is.na(data2$new_deaths),
0, data2$new_deaths)
data2$new_cases = ifelse(is.na(data2$new_cases),
0, data2$new_cases)
############################### Applying rolling mean for new deaths and cases #######################
data2 <- data2 %>%
dplyr::mutate(new_death_roll12 = zoo::rollmean(new_deaths,k=12, fill=NA)) %>%
dplyr::mutate(new_cases_roll12 = zoo::rollmean(new_cases,k=12, fill=NA))
############################################## EDA ##############################################
# This section is only for exploratory data analysis. Applied rolling mean to smoothen the line plot for better view.
temp = data2 %>%
select(new_death_roll12, new_cases_roll12)
row.names(temp) <- data2$d_col
n_days = 1:dim(temp)[1]
deathsRoll12=temp$new_death_roll12
casesRoll12 = temp$new_cases_roll12
rownames(data2) = data2$d_col
# Plotting number of deaths - rolling mean.
data2 %>%
ggplot(aes(x=d_col,y=deathsRoll12, color='smoker')) +
geom_line() +
ggtitle(" Rolling mean plot - New Deaths") +
xlab("Days Jan 2020 - Nov 2021") +
ylab("New Deaths")
# This section is for new cases.
data2 %>%
ggplot(aes(x=d_col, y=casesRoll12, color = 'smoker')) +
geom_line() +
ggtitle(" Rolling mean plot - New Cases") +
xlab("Days Jan 2020 - Nov 2021") +
ylab("New Cases")
################################# Prepare data for forecasting new deaths #########################
# Converting the dataset into time series. R packages expects data to be in time series format for forecasting.
new_data <- data2 %>%
select(new_deaths)
row.names(new_data) = data2$d_col
inds = seq(as.Date("2020-01-22"), as.Date("2021-11-28"), by="day")
# Making sure the time series is based on daily basis.
set.seed(123)
new_data_ts = ts(new_data, frequency = 365,
start = c(2020, as.numeric(format(inds[1],"%j"))),
end = c(2021, as.numeric(format(inds[677],"%j"))))
ts_ggplot(new_data_ts, xlab = "Timeline", ylab = "Deaths")
###################################### Making plot stationary ######################################
nsdiffs(new_data_ts)
data_diff <- diff(new_data_ts, lag=frequency(new_data_ts), differences=1)
plot(data_diff, type="l", main="Seasonally Differenced")
ndiffs(data_diff)
stationaryTS <- diff(data_diff, differences= 1)
plot(stationaryTS, type="l", main="Differenced and Stationary")
################################# Training and testing set #######################################
# Splitting the dataset into training and test set.
training_set = window(new_data_ts, start = c(2020, 22), end = c(2021, 302))
validation_set = window(new_data_ts, start = c(2021, 302), end = c(2021, 332))
# Perform dicky-fuller and Kwiatkowski–Phillips–Schmidt–Shin (KPSS) tests to find out the p-value.
acf(new_data_ts)
kpss.test(new_data_ts)
adf.test(training_set)
######################################## Arima model #####################################
AR = arima(training_set, order = c(5,1,7)) # Tune the order.
print(AR)
predict_AR = predict(AR, n.ahead = length(validation_set))
# Evaluation metrics for the Arima model.
library(Metrics)
print(paste("MAE:",mae(validation_set, predict_AR$pred)))
print(paste("MSE:",mse(validation_set, predict_AR$pred)))
print(paste("RMSE:",rmse(validation_set, predict_AR$pred)))
# Plot - Training set with forecast.
ts_ggplot(Training=ts(training_set[1:646], frequency = 365,
start = c(2020, as.numeric(format(inds[1],"%j"))),
end = c(2021, as.numeric(format(inds[646],"%j")))),
Forecast=predict_AR$pred,
title = " Time Series Forecasting - ARIMA",
ylab = "Deaths")
# Closer look at forecast (validation vs forecast)
ts_ggplot(Validation=validation_set,
Forecast=predict_AR$pred,
title = " Time Series Forecasting - November - Validation",
ylab = "Deaths")
########################################### Auto-Arima Model #######################################
# In this section we use auto-arima model which tells us which order(parameters) to start with.
arima_1 = auto.arima(training_set)
print(arima_1)
forecast_1 = forecast(arima_1, h=30)
acc_arima_1 = accuracy(f = forecast_1,x = validation_set)
summary(acc_arima_1)
autoplot(forecast_1,main = 'Forecast Test-Set',xlab = 'Deaths',ylab = 'Timeline')
########################################### Facebook Prophet model ##################################
fb_frame = data2 %>%
select(d_col, new_deaths)
# Facebook prophet model expects columns to be with specific names.
fb_frame = rename(fb_frame, c(ds=d_col, y=new_deaths))
# Split dataset into train and test set.
fb_train = fb_frame[1:647]
fb_test = fb_frame[647:677]
m <- prophet(fb_train, seasonality.prior.scale = 0.1,
yearly.seasonality=TRUE,
changepoint.prior.scale = 0.01)
future = make_future_dataframe(m, periods = 30)
fb_forecast = predict(m, future)
fb_forecast1 = fb_forecast[c('ds','yhat')]
plot(m, fb_forecast1, xlab = "Timeline",ylab = "Deaths")
# Plot forecast and test set.
ts_ggplot(Forecast=ts(fb_forecast1$yhat[647:677]), Testset=ts(fb_test$y))
# Plotting forecast components.
prophet_plot_components(m, fb_forecast)
# Evaluation metrics for Facebook prophet model.
print(paste("MAE:",mae(fb_test$y, fb_forecast1$yhat[647:677])))
print(paste("MSE:",mse(fb_test$y, fb_forecast1$yhat[647:677])))
print(paste("RMSE:",rmse(fb_test$y, fb_forecast1$yhat[647:677])))