-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathProblemSet_02_LDA,QDA.R
162 lines (131 loc) · 4.04 KB
/
ProblemSet_02_LDA,QDA.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#####################################################################
### ProblemSet 02: LDA and QDA
#####################################################################
### Preface
# clearing workspace and set wd
rm(list=ls())
cat("\014")
dev.off()
#library(plm) # Panel Methods - regression models
#library(mvtnorm) # random draws from a multivariate normal distr.
library(MASS) # to fit LDA and QDA analysis
# Note: packages above require the data to be saved as a data frame
#####################################################################
### Exercise 1
# class 1
n1 = 300
mu1= c(-3,3)
# class 2
n2 = 500
mu2= c(5,5)
# Variance-Covariance Matrix
covmat=matrix(c(16,-2,-2,9), nrow=2, ncol=2)
#total number of observations
N=n1+n2
## a)
# DGP - for both classes
set.seed(123)
X1=mvrnorm(n = n1, mu = mu1, Sigma = covmat)
X2=mvrnorm(n = n2, mu = mu2, Sigma = covmat)
df1 <- data.frame(X1)
df1['class'] = 1
df2 <- data.frame(X2)
df2['class'] = 2
df <- merge(df1,df2, all=TRUE)
rm(df1, df2, X1, X2)
## b)
mod_lda = lda(class ~ X1 + X2, data=df)
summary(mod_lda)
class_lfit <- as.numeric(predict(mod_lda)$class)
mod_qda = qda(class ~ X1 + X2 , data=df)
summary(mod_qda)
class_qfit <- as.numeric(predict(mod_qda)$class)
## c)
# Note: since classes are ordinal scale we can not use MSE, due to the fact
# that the distance between class 1 and 3 are the same as between 1 and 2
# furthermore it is not appropriarte to use OLS
MTE_LDA=sum(class_lfit!=df$class)/N
MTE_QDA=sum(class_qfit!=df$class)/N
print(MTE_LDA)
print(MTE_QDA)
print(MTE_LDA-MTE_QDA)
#####################################################################
### Exercise 2 - Simulation Study
# a)
rm(list=ls())
cat("\014")
MCN=100
MSE=matrix(NaN,MCN,2)
n1 = 300
mu1= c(-3,3)
n2 = 500
mu2= c(5,5)
covmat=matrix(c(16,-2,-2,9), nrow=2, ncol=2)
N=n1+n2
set.seed(123)
for (i in 1:MCN){
X1=mvrnorm(n = n1, mu = mu1, Sigma = covmat)
X2=mvrnorm(n = n2, mu = mu2, Sigma = covmat)
df1 <- data.frame(X1)
df1['class'] = 1
df2 <- data.frame(X2)
df2['class'] = 2
df <- merge(df1,df2, all=TRUE)
mod_lda = lda(class ~ X1 + X2, data=df)
class_lfit <- as.numeric(predict(mod_lda)$class)
mod_qda = qda(class ~ X1 + X2 , data=df)
class_qfit <- as.numeric(predict(mod_qda)$class)
MSE[i,1]=sum(class_lfit!=df$class)/N
MSE[i,2]=sum(class_qfit!=df$class)/N
}
avg_MSE_LDA=mean(MSE[,1])
avg_MSE_QDA=mean(MSE[,2])
par(mfrow=c(1,2))
plot(MSE[,1], ylab="LDA")
abline(h=avg_MSE_LDA, col="red")
plot(MSE[,2], ylab="QDA")
abline(h=avg_MSE_QDA, col="red")
summary(avg_MSE_LDA-avg_MSE_QDA)
# b)
# Note: since we have different covariate matrixes sigma 1 and sigma 2
# QDA is more precise than LDA
# From a theoretical perspective:
# * if LDAs assumption that the K classes share a common covariance matrix
# is badly off, then LDA --> high bias
# * LDA is a much less flexible classifier than QDA --> lower variance
# Hence: Try with different covariance matrices
rm(list=ls())
cat("\014")
MCN=100
MSE=matrix(NaN,MCN,2)
n1 = 300
mu1= c(-3,3)
n2 = 500
mu2= c(5,5)
covmat_1=matrix(c(16,-2,-2,9), nrow=2, ncol=2)
covmat_2=matrix(c(10,-2,-2,5), nrow=2, ncol=2)
N=n1+n2
set.seed(123)
for (i in 1:MCN){
X1=mvrnorm(n = n1, mu = mu1, Sigma = covmat_1)
X2=mvrnorm(n = n2, mu = mu2, Sigma = covmat_2)
df1 <- data.frame(X1)
df1['class'] = 1
df2 <- data.frame(X2)
df2['class'] = 2
df <- merge(df1,df2, all=TRUE)
mod_lda = lda(class ~ X1 + X2, data=df)
class_lfit <- as.numeric(predict(mod_lda)$class)
mod_qda = qda(class ~ X1 + X2 , data=df)
class_qfit <- as.numeric(predict(mod_qda)$class)
MSE[i,1]=sum(class_lfit!=df$class)/N
MSE[i,2]=sum(class_qfit!=df$class)/N
}
avg_MSE_LDA=mean(MSE[,1])
avg_MSE_QDA=mean(MSE[,2])
par(mfrow=c(1,2))
plot(MSE[,1], ylab="LDA")
abline(h=avg_MSE_LDA, col="red")
plot(MSE[,2], ylab="QDA")
abline(h=avg_MSE_QDA, col="red")
summary(avg_MSE_LDA-avg_MSE_QDA)