-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfinal_cs156.R
158 lines (136 loc) · 6.76 KB
/
final_cs156.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
library(Matching)
library(foreign)
library(rbounds)
library(cluster)
setwd("C:/Users/DELL/Downloads")
x <- read.dta("basic.dta")
#covariates
set.seed(14432)
x <- na.omit(x) #Leaving out, specifically, the missing notwot values
num = 1:nrow(x)
sub1 = sample(num, 0.5*nrow(x))
train <- x[sub1,] #training set
sub2 = num[! num %in% sub1]
test <- x[sub2,] #test set
treat <- test$anygirls
#Did not pick x$perf because x$female should be more important in my reasoning #x$aauw, x$rtl, x$rgroup, x$statalph
#Xi <- cbind(test$female, test$white, test$repub, test$age, test$srvlng, test$demvote)
Xi <- cbind(test$female, test$white, test$repub, test$age, I(test$age ^2), test$srvlng, I(test$srvlng^2), test$demvote)
BalanceMat <- cbind(test$female, test$white, test$repub, test$age, I(test$age ^2), test$srvlng, I(test$srvlng^2), test$demvote)
Y <- test$nowtot
genout <- GenMatch(Tr=treat, X = Xi, BalanceMatrix=BalanceMat, estimand="ATT", M=1,
pop.size=500, max.generations=100, wait.generations=15, cluster = c("localhost", "localhost"))
mout <- Match(Y=Y, Tr=treat, X = Xi, estimand="ATT", Weight.matrix=genout)
summary(mout)
#summary(X)
#Check Balance
mb <- MatchBalance(treat~test$female + test$white + test$repub + test$age + I(test$age ^2) + test$srvlng + I(test$srvlng^2) + test$demvote,
match.out=mout, nboots=500)
#Run linear regression
basemod <- lm(nowtot~female + white + repub + age + I(age ^2) + srvlng + I(srvlng^2) + demvote, train)
plot(basemod)#plot regression line
#Root Mean Square
rmse1.0 <- sqrt(mean(residuals(basemod)^2)) #calculate RMSE
rmse1.0
basemod2 <- lm(test$nowtot~test$female + test$white + test$repub + test$age + I(test$age ^2) + test$srvlng + I(test$srvlng^2) + test$demvote, test)
rmse2.0 <- sqrt(mean(residuals(basemod2)^2)) #calculate RMSE
rmse2.0
#Calculating model's accuracy on itself
#2: confidence interval
train_pred1 <- predict(basemod, newdata = train, type = 'response', interval='predict')
accurate11 = 0
misclassified11 = 0
countz = 1
while (countz <= length(train$nowtot)){
if (!is.na(train_pred1[countz, 2])){
if ((train_pred1[countz, 3] >= train$nowtot[countz]) && (train_pred1[countz, 2] <= train$nowtot[countz])) {
accurate11 = accurate11 + 1
}
else{
misclassified11 = misclassified11 + 1
}
}
countz = countz + 1
}
accuracy11 = accurate11/(accurate11 + misclassified11)
print(paste('Accuracy:', accuracy11, 'or,', accuracy11 * 100, '%'))
#Out-of-sample prediction
test_pred1 <- predict(basemod, newdata = test, type = 'response', interval='predict')
accurate12 = 0
misclassified12 = 0
county = 1
while (county <= length(test$nowtot)){
if (!is.na(test_pred1[county, 2])){
if ((test_pred1[county, 3] >= test$nowtot[county]) && (test_pred1[county, 2] <= test$nowtot[county])) {
accurate12 = accurate12 + 1
}
else{
misclassified12 = misclassified12 + 1
}
}
county = county + 1
}
accuracy12 = accurate12/(accurate12 + misclassified12)
print(paste('Accuracy:', accuracy12, 'or,', accuracy12 * 100, '%'))
#Calculating counterfactuals
counter_test = test
counter_test$anygirls = 1- counter_test$anygirls
ctest_pred <- predict(basemod, newdata = counter_test, type = 'response', interval = 'predict')
#Calculation of Gower's Metric - meaurement of dissimilarity along the rows of x
Mat <- cbind(test$nowtot, test$anygirls, test$female, test$white, test$repub, test$age, I(test$age ^2), test$srvlng, I(test$srvlng^2), test$demvote)
Mat2 <- cbind(train$nowtot, train$anygirls, train$female, train$white, train$repub, train$age, I(train$age ^2), train$srvlng, I(train$srvlng^2), train$demvote)
Mat3 <- cbind(ctest_pred[,1],counter_test$anygirls, counter_test$female, counter_test$white, counter_test$repub, counter_test$age, I(counter_test$age ^2), counter_test$srvlng, I(counter_test$srvlng^2), counter_test$demvote)
Mat4 <- rbind(Mat, Mat3)
Mat5 <- rbind(Mat2, Mat3)
daisy.mat <- as.matrix(daisy(Mat, metric="gower"))
print(paste("Maximum Gower Metric (G^2) in the test data is", max(daisy.mat)))
daisy.mat2 <- as.matrix(daisy(Mat2, metric="gower"))
print(paste("Maximum Gower Metric (G^2) in the train data is", max(daisy.mat2)))
daisy.mat3 <- as.matrix(daisy(Mat3, metric="gower"))
print(paste("Maximum Gower Metric (G^2) in the counterfactuals is", max(daisy.mat3)))
daisy.mat4 <- as.matrix(daisy(Mat4, metric="gower"))
print(paste("Maximum Gower Metric (G^2) in the test data plus counterfactual is", max(daisy.mat4)))
daisy.mat5 <- as.matrix(daisy(Mat5, metric="gower"))
print(paste("Maximum Gower Metric (G^2) in the train data plus counterfactual is", max(daisy.mat5)))
#Compare with Genetic Matching Counterfactuals
gentreat <- test[mout$index.treated,]
gencontl <- test[mout$index.control,]
c_treat <- counter_test[mout$index.treated,]
c_contl <- ctest_pred[mout$index.treated,]
MatchRes <- cbind(gentreat$nowtot, gencontl$nowtot, c_contl)
colnames(MatchRes) <- c('Genetic Matching Treatment', 'Genetic Matching Control', 'Counterfactual Prediction for GenMatch Treat', 'Lower Confidence Interval', 'Upper Confidence Interval')
print(paste('Mean of Genetic Matching Treatment:', mean(gentreat$nowtot)))
print(paste('Mean of Genetic Matching Control:', mean(gencontl$nowtot)))
print(paste('Mean of Predicted Counterfactual for Genetic Matching Treatment:', mean(c_contl[,1])))
#Calculating Counterfactual Accuracy Against the Control Group of GenMatch
accurate= 0
misclassified = 0
count0 = 1
while (count0 <= nrow(MatchRes)){
if (!is.na(c_contl[count0, 2])){
if ((c_contl[count0, 3] >= gencontl$nowtot[count0]) && (c_contl[count0, 2] <= gencontl$nowtot[count0])) {
accurate = accurate + 1
}
else{
misclassified = misclassified + 1
}
}
count0 = count0 + 1
}
accuracy = accurate/(accurate + misclassified)
print(paste('Number of observations that fell into confidence interval:', accuracy, 'or,', accuracy * 100, '%'))
#Download Data and see the prediction and GenMatch control results
write.csv(MatchRes, file = 'PredictedCounterfactuals.csv')
parcoord(BalanceMat, col = c(1, 2), lty = 1, var.label = TRUE)
par(mfrow = c(1,2))
plot(density(rnorm(1000)))
hist(rnorm(1000))
plot(density(test$age[treat==1], bw = 2), lwd = 3, col = "red")
lines(density(test$age[treat==0], bw = 2), lwd = 3, col = "blue")
plot(density(test$demvote[treat==1], bw = 2), lwd = 3, col = "red")
lines(density(test$demvote[treat==0], bw = 2), lwd = 3, col = "blue")
#After
plot(density(test$age[mout$index.treated], bw = 2), lwd = 3, col = "red")
lines(density(test$age[mout$index.control], bw = 2), lwd = 3, col = "blue")
plot(density(test$demvote[mout$index.treated], bw = 2), lwd = 3, col = "red")
lines(density(test$demvote[mout$index.control], bw = 2), lwd = 3, col = "blue")