forked from MinHyung-Kang/ColoradoEducation
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathData_Cleaning.R
87 lines (73 loc) · 4.44 KB
/
Data_Cleaning.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#Load the data
Change_2010 <- read.csv(file="./Data/2010_1YR_3YR_change.csv", header=TRUE, sep=",", stringsAsFactors = FALSE)
Coact_2010 <- read.csv(file="./Data/2010_COACT.csv", header=TRUE, sep=",", stringsAsFactors = FALSE)
DataMap_2010 <- read.csv(file="./Data/2010_data_map.csv", header=TRUE, sep=",", stringsAsFactors = FALSE)
Enrlworking_2010 <- read.csv(file="./Data/2010_enrl_working.csv", header=TRUE, sep=",", stringsAsFactors = FALSE)
FinalGrade_2010 <- read.csv(file="./Data/2010_final_grade.csv", header=TRUE, sep=",", stringsAsFactors = FALSE)
FRL_2010 <- read.csv(file="./Data/2010_k_12_FRL.csv", header=TRUE, sep=",", stringsAsFactors = FALSE)
Remediation_2010 <- read.csv(file="./Data/2010_remediation_HS.csv", header=TRUE, sep=",", stringsAsFactors = FALSE)
Address_2010 <- read.csv(file="./Data/2010_school_address.csv", header=TRUE, sep=",", stringsAsFactors = FALSE)
# Change_2010 : 1 & 3 year changes in academic acheivement, growth, overall grade.
# Coact_2010 : Ready for college or not based on ACT (Yes=1,No=2)
# DataMap_2010 :
# Enrlworking_2010 : Percentage of each ethnicity/race characteristics
# FinalGrade_2010 : Grades given for each school
# FRL_2010 : Percentage of students getting free/reduced price lunch
# Remediation_2010 : Percentage of students requiring remediation
# Address_2010 : Physical address of the schools
#------------------Combine all the data for this year----------------------------#
#Order : FinalGrade - Change - Coact - Enrlworking - FRL- remediation
#1. Combine FinalGrade and Change -> total
Change_2010_new <- Change_2010[,-which(names(Change_2010) %in%
c("School.Name","District.Number","District.Name","EMH_combined"))]
Total_2010 <- merge(FinalGrade_2010,Change_2010_new,
by.x=c("SchoolNumber","EMH"),by.y=c("School.Number","EMH"),
all.x = TRUE)
#2. Combine total and Coact
h = dim(Coact_2010)[1]
EMH = rep("H",h)
Coact_2010_new <- cbind(Coact_2010,EMH)
Coact_2010_new <- Coact_2010_new[,-which(names(Coact_2010_new) %in%
c("District.No","X2010.School.Name"))]
Total_2010 <- merge(Total_2010,Coact_2010_new,
by.x=c("SchoolNumber","EMH"),by.y=c("School.No","EMH"),
all.x = TRUE)
#3. Combine total and EnrlWorking
Enrlworking_2010_new <- Enrlworking_2010[,-which(names(Enrlworking_2010) %in%
c("Org..Code","Organization.Name","School.Name"))]
Total_2010 <- merge(Total_2010,Enrlworking_2010_new,
by.x="SchoolNumber",by.y="School.Code",
all.x = TRUE)
#4. Combine total and FRL
FRL_2010_new = FRL_2010[,-which(names(FRL_2010) %in%
c("DISTRICT.CODE","DISTRICT.NAME","SCHOOL.NAME"))]
Total_2010 <- merge(Total_2010,FRL_2010_new,
by.x="SchoolNumber",by.y="SCHOOL.CODE",
all.x = TRUE)
#5. Combine Remediation
h = dim(Remediation_2010)[1]
EMH = rep("H",h)
Remediation_2010_new <- cbind(Remediation_2010,EMH)
Remediation_2010_new <- Remediation_2010_new[,-which(names(Remediation_2010_new) %in%
c("School_District","SchoolName"))]
Total_2010 <- merge(Total_2010,Remediation_2010_new,
by.x=c("SchoolNumber","EMH"),by.y=c("SchoolNumber","EMH"),
all.x = TRUE)
Total_2010_Names <- colnames(Total_2010)
write.table(Total_2010_Names,file="Total_2010_Names.csv", row.names=FALSE,col.names=FALSE,sep=",")
write.table(Total_2010,file="Total_2010.csv", row.names=FALSE,col.names=FALSE,sep=",")
#-------------Clean the data-------------------#
#Get rid of unnecessary data
Data_2010 <- Total_2010[,-(8:15)]
changeName <- read.csv(file="./Raw_Data_Names.csv", header=TRUE, sep=",", stringsAsFactors = FALSE)
#Clean up the column names
require(plyr)
colnames(Data_2010) <- mapvalues(colnames(Data_2010), from = changeName[,1],to=changeName[,2])
#Make the decimal values into percentage values
Data_2010[,28:34] <- Data_2010[,28:34] * 100
Data_2010[,35] <- as.numeric(sub("%", "", Data_2010[,35]),na.rm = TRUE)
Data_2010_Names <- colnames(Data_2010)
Data_2010_Type = sapply(Data_2010, class)
write.table(Data_2010_Names,file="Data_2010_Names.csv", row.names=FALSE,col.names=FALSE,sep=",")
write.table(Data_2010,file="Data_2010.csv", row.names=FALSE,col.names=FALSE,sep=",")
write.table(Data_2010_Type,file="Data_2010_Type.csv", row.names=FALSE,col.names=FALSE,sep=",")