-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTennis_Data_Code.R
163 lines (127 loc) · 6.7 KB
/
Tennis_Data_Code.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
Player_Data <- read.csv('Final_PlayerDataPerMatch.csv')
str(Player_Data)
Player_Data$Matches<-as.numeric(Player_Data$Matches)
Player_Data$Aces<-as.numeric(Player_Data$Aces)
Player_Data$Breakpoints_won<-as.numeric(Player_Data$Breakpoints_won)
Player_Data$double_faults<-as.numeric(Player_Data$double_faults)
Player_Data$first_serve_points_won<-as.numeric(Player_Data$first_serve_points_won)
Player_Data$first_serve_successful<-as.numeric(Player_Data$first_serve_successful)
Player_Data$Games_won<-as.numeric(Player_Data$Games_won)
Player_Data$Max_games_won_in_a_row<-as.numeric(Player_Data$Max_games_won_in_a_row)
Player_Data$Max_points_in_a_row<-as.numeric(Player_Data$Max_points_in_a_row)
Player_Data$Points_won<-as.numeric(Player_Data$Points_won)
Player_Data$Reciever_points_won<-as.numeric(Player_Data$Reciever_points_won)
Player_Data$second_serve_points_won<-as.numeric(Player_Data$second_serve_points_won)
Player_Data$second_serve_successful<-as.numeric(Player_Data$second_serve_successful)
Player_Data$service_games_won<-as.numeric(Player_Data$service_games_won)
Player_Data$service_points_won<-as.numeric(Player_Data$service_points_won)
Player_Data$tiebreaks_won<-as.numeric(Player_Data$tiebreaks_won)
DataQualityReport(Player_Data)
descrCor <- cor(Player_Data[,5:ncol(Player_Data)]) # correlation matrix
highCorr <- sum(abs(descrCor[upper.tri(descrCor)]) > .85) # number of Xs having a corr > some value
summary(descrCor[upper.tri(descrCor)])
# which columns in your correlation matrix have a correlation greater than some
# specified absolute cutoff?
highlyCorDescr <- findCorrelation(descrCor, cutoff = 0.85)
filteredDescr <- Player_Data[,5:ncol(Player_Data)][,-highlyCorDescr] # remove those specific columns from your dataset
descrCor2 <- cor(filteredDescr)
# summarize those correlations to see if all features are now within our range
summary(descrCor2[upper.tri(descrCor2)])
Player_Data_2 <- cbind(Player_Data[,c(1:4)],filteredDescr)
backup2 <- Player_Data_2
##Dropping Player Names, Gender, Matches
Player_Data_2 <- Player_Data_2[,-c(2:4)]
#Storing ID's
Player_Data_2_ID <- Player_Data_2$Player_ID
Player_Data_2$Player_ID <- NULL#Removing IDs from final dataset
#Y variables
names(Player_Data_2)[6] <- 'y'
Player_Data_TargetValue <- Player_Data_2$y #Target variable stored and remove
Player_Data_2$y <- NULL
#Histograms
hist(Player_Data_2$y, main="y", xlab="Ace", col="gold")
hist(Player_Data_2$Aces, main="Aces", xlab="Ace", col="gold")
hist(Player_Data_2$Breakpoints_won, main="Breakpoints_won", xlab="Ace", col="gold")
hist(Player_Data_2$double_faults, main="double_faults", xlab="Ace", col="gold")
hist(Player_Data_2$first_serve_points_won, main="first_serve_points_won", xlab="Ace", col="gold")
hist(Player_Data_2$first_serve_successful, main="first_serve_successful", xlab="Ace", col="gold")
hist(Player_Data_2$Max_games_won_in_a_row, main="Max_games_won_in_a_row", xlab="Ace", col="gold")
hist(Player_Data_2$Reciever_points_won, main="Reciever_points_won", xlab="Aces.y", col="gold")
hist(Player_Data_2$second_serve_points_won, main="second_serve_points_won", xlab="Aces.y", col="gold")
hist(Player_Data_2$second_serve_successful, main="second_serve_successful", xlab="Aces.y", col="gold")
hist(Player_Data_2$service_games_won, main="service_games_won", xlab="Aces.y", col="gold")
hist(Player_Data_2$tiebreaks_won, main="tiebreaks_won", xlab="Aces.y", col="gold")
#backup2 <- Player_Data_2
#Scaling all variables
preProcValues <- preProcess(Player_Data_2, method = c("range","YeoJohnson"))
Player_Data_2 <- predict(preProcValues, Player_Data_2)
################################################################################
# Data partitioning
################################################################################
set.seed(1234) # set a seed so you can replicate your results
Player_Data_2 <- cbind(Player_Data_TargetValue,Player_Data_2)
names(Player_Data_2)[1] <- 'y'
#identify records that will be used in the training set. Here we are doing a
#85% train/ 15% test split. You might modify this.
inTrain <- createDataPartition(y = Player_Data_2$y, # outcome variable
p = .85, # % of training data you want
list = F)
# create your partitions
train_dataset <- Player_Data_2[inTrain,] # training data set
validation <- Player_Data_2[-inTrain,] # test data set
#d1_score <- final_score
#using linear regression
ctrl <- trainControl(method="cv", # cross-validation set approach to use
number=3, # k number of times to do k-fold
classProbs = F,
summaryFunction = defaultSummary,
allowParallel=T)
lm_model <- train(y ~ .,
data = train_dataset,
method = "lm",
importance=T, # we add this in or it varImp cannot be computed
trControl = ctrl,
tuneLength = 10,
metric = "RMSE"
)
pred_val_lm <- predict(lm_model,validation)
postResample(pred_val_lm,validation$y)
summary(lm_model)
# automatic backward selection
library(leaps)
mb <- regsubsets(y ~ ., data=train
, nbest=1
, intercept=T
, method='backward'
, really.big=T
)
vars2keep <- data.frame(summary(mb)$which[which.max(summary(mb)$adjr2),])
names(vars2keep) <- c("keep")
head(vars2keep)
library(data.table)
vars2keep <- setDT(vars2keep, keep.rownames=T)[]
vars2keep <- c(vars2keep[which(vars2keep$keep==T & vars2keep$rn!="(Intercept)"),"rn"])[[1]]
# here are the final features found to be statistically significant
as.data.frame(vars2keep)
final_var <- c('y','Aces','Breakpoints_won','double_faults','first_serve_points_won',
'first_serve_successful','Reciever_points_won','second_serve_successful',
'service_games_won','tiebreaks_won')
train_new <- train[,final_var]
validation_new <- validation[,final_var]
#using linear regression again
ctrl <- trainControl(method="cv", # cross-validation set approach to use
number=3, # k number of times to do k-fold
classProbs = F,
summaryFunction = defaultSummary,
allowParallel=T)
lm_model_2 <- train(y ~ .,
data = train_new,
method = "lm",
importance=T, # we add this in or it varImp cannot be computed
trControl = ctrl,
tuneLength = 10,
metric = "RMSE"
)
pred_val_lm_2 <- predict(lm_model_2,validation)
postResample(pred_val_lm_2,validation$y)
summary(lm_model_2)