-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathData_Cleaning_2012.R
127 lines (99 loc) · 5.67 KB
/
Data_Cleaning_2012.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#Load the data
Change_2012 <- read.csv(file="./Data/2012_1YR_3YR_change.csv", header=TRUE, sep=",", stringsAsFactors = FALSE)
Coact_2012 <- read.csv(file="./Data/2012_COACT.csv", header=TRUE, sep=",", stringsAsFactors = FALSE)
DataMap_2012 <- read.csv(file="./Data/2012_data_map.csv", header=TRUE, sep=",", stringsAsFactors = FALSE)
Enrlworking_2012 <- read.csv(file="./Data/2012_enrl_working.csv", header=TRUE, sep=",", stringsAsFactors = FALSE)
FinalGrade_2012 <- read.csv(file="./Data/2012_final_grade.csv", header=TRUE, sep=",", stringsAsFactors = FALSE)
FRL_2012 <- read.csv(file="./Data/2012_k_12_FRL.csv", header=TRUE, sep=",", stringsAsFactors = FALSE)
Remediation_2012 <- read.csv(file="./Data/2012_remediation_HS.csv", header=TRUE, sep=",", stringsAsFactors = FALSE)
Address_2012 <- read.csv(file="./Data/2012_school_address.csv", header=TRUE, sep=",", stringsAsFactors = FALSE)
# Change_2012 : 1 & 3 year changes in academic acheivement, growth, overall grade.
# Coact_2012 : Ready for college or not based on ACT (Yes=1,No=2)
# DataMap_2012 :
# Enrlworking_2012 : Percentage of each ethnicity/race characteristics
# FinalGrade_2012 : Grades given for each school
# FRL_2012 : Percentage of students getting free/reduced price lunch
# Remediation_2012 : Percentage of students requiring remediation
# Address_2012 : Physical address of the schools
#------------------Combine all the data for this year----------------------------#
#Order : FinalGrade - Change - Coact - Enrlworking - FRL- remediation
#1. Combine FinalGrade and Change -> total
Change_2012_new <- Change_2012[,-which(names(Change_2012) %in%
c("Record_no","SPF_DIST_NUMBER","SPF_DISTRICT_NAME","SPF_SCHOOL_NAME","SPF_INCLUDED_EMH_FOR_A"))]
Total_2012 <- merge(FinalGrade_2012,Change_2012_new,
by.x=c("School.Code","EMH"),by.y=c("SPF_SCHOOL_NUMBER","SPF_EMH_CODE"),
all.x = TRUE)
#2. Combine total and Coact
h = dim(Coact_2012)[1]
EMH = rep("H",h)
Coact_2012_new <- Coact_2012[,0:8]
Coact_2012_new <- cbind(Coact_2012_new,EMH)
Coact_2012_new <- Coact_2012_new[,-which(names(Coact_2012_new) %in%
c("District.No","District.Name","School.Name"))]
Total_2012 <- merge(Total_2012,Coact_2012_new,
by.x=c("School.Code","EMH"),by.y=c("School.No","EMH"),
all.x = TRUE)
#3. Combine total and EnrlWorking
Enrlworking_2012_new <- Enrlworking_2012[,0:12]
Enrlworking_2012_new <- Enrlworking_2012_new[,-which(names(Enrlworking_2012_new) %in%
c("Organization.Code","Organization.Name","School.Name"))]
Total_2012 <- merge(Total_2012,Enrlworking_2012_new,
by.x="School.Code",by.y="School.Code",
all.x = TRUE)
#4. Combine total and FRL
FRL_2012_new <- FRL_2012[,0:5]
FRL_2012_new = FRL_2012_new[,-which(names(FRL_2012_new) %in%
c("DISTRICT.CODE","DISTRICT.NAME","SCHOOL.NAME"))]
Total_2012 <- merge(Total_2012,FRL_2012_new,
by.x="School.Code",by.y="SCHOOL.CODE",
all.x = TRUE)
#5. Combine Remediation
Remediation_2012_new <- Remediation_2012[,0:5]
h = dim(Remediation_2012_new)[1]
EMH = rep("H",h)
Remediation_2012_new <- cbind(Remediation_2012_new,EMH)
Remediation_2012_new <- Remediation_2012_new[,-which(names(Remediation_2012_new) %in%
c("School_Districte","School_Name"))]
Total_2012 <- merge(Total_2012,Remediation_2012_new,
by.x=c("School.Code","EMH"),by.y=c("Schoolnumber","EMH"),
all.x = TRUE)
h = dim(Total_2012)[1]
Dummy <- matrix(-1,h,8)
DummyNames <- rep("Dummy",8)
colnames(Dummy) <- DummyNames
Total_2012 <- cbind(Total_2012,Dummy)
Total_2012_Names <- colnames(Total_2012)
#--------FIGURE OUT INDEX-----------#
Index_2012 <- read.csv(file="./CleanedData/Index_2012.csv", header=FALSE, sep=",", stringsAsFactors = FALSE)
Total_2012 <- Total_2012[,unlist(Index_2012)]
Total_2012_Names <- colnames(Total_2012)
write.table(Total_2012_Names,file="./CleanedData/Total_2012_Names.csv", row.names=FALSE,col.names=FALSE,sep=",")
#-------------Clean the data-------------------#
#Get rid of unnecessary data
#Get rid of newly added variables
Data_2012 <- Total_2012[,-(44:48)]
#get rid of dummy variables
Data_2012 <- Data_2012[,-(7:14)]
#To match format of Data_2010
changeName <- read.csv(file="./CleanedData/Raw_Data_Names_2012.csv", header=TRUE, sep=",", stringsAsFactors = FALSE)
#Clean up the column names
require(plyr)
colnames(Data_2012) <- mapvalues(colnames(Data_2012), from = changeName[,1],to=changeName[,2])
Data_2012 <- as.data.frame(append(Data_2012, list(AlternativeSchool = 0), after = 6), stringsAsFactors = FALSE)
#Make the decimal values into percentage values
Data_2012[,7] <- as.integer(Data_2012[,7])
Data_2012[,19] <- as.numeric(Data_2012[,19])
Data_2012[,20] <- as.integer(Data_2012[,20])
Data_2012[,21] <- as.integer(Data_2012[,21])
Data_2012[,22] <- as.integer(Data_2012[,22])
Data_2012[,33] <- as.numeric(Data_2012[,33])
Data_2012[,34] <- as.numeric(Data_2012[,34])
Data_2012[,28:34] <- Data_2012[,28:34] * 100
Data_2012[,35] <- as.numeric(sub("%", "", Data_2012[,35]),na.rm = TRUE)
Data_2012[,36] <- as.numeric(sub("%", "", Data_2012[,36]),na.rm = TRUE)
Data_2012[,36] <- Data_2012[,36]/100.0
Data_2012_Names <- colnames(Data_2012)
Data_2012_Type = sapply(Data_2012, class)
write.table(Data_2012_Names,file="./CleanedData/Data_2012_Names.csv", row.names=FALSE,col.names=FALSE,sep=",")
write.table(Data_2012,file="./CleanedData/Data_2012.csv", row.names=FALSE,col.names=FALSE,sep=",")
write.table(Data_2012_Type,file="./CleanedData/Data_2012_Type.csv", row.names=FALSE,col.names=FALSE,sep=",")