-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest.R
184 lines (127 loc) · 7.04 KB
/
test.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#### test file for sparkr. this code only handles a small state of AK.
#### this test temporarily partitions the top 100 rows of the dataset, for the
#### sake of the memory error..
library(SparkR)
## from https://stackoverflow.com/questions/38577939/not-able-to-retrieve-data-from-sparkr-created-dataframe
# Sys.setenv("SPARKR_SUBMIT_ARGS"="--master yarn-client sparkr-shell")
rm(list=ls())
PkgTest <- function(x) {
if (!require(x, character.only = TRUE)) {
install.packages(x, dep = TRUE)
if (!require(x, character.only = TRUE)) {
stop("Package not found")
}
}
}
## These lines load the required packages
packages <- c("readxl", "data.table", "rgdal", "sp")
lapply(packages, PkgTest)
sparkR.session()
# namedConfig <- sparkR.conf("spark.executor.memory", "4g")
# setLogLevel("ERROR")
dir <- "/gpfs01/hadoop/user/cng10/Erick/zillow/stores/"
dir2 <- paste(dir, "46/", sep = "")
#("../stores/", 'Layout.xlsx'), sheet = 1)
layoutZAsmt <- read_excel(file.path(dir, 'Layout.xlsx'), sheet = 1)
layoutZTrans <- read_excel(file.path(dir, 'Layout.xlsx'), sheet = 2, col_types = c("text", "text", "numeric", "text", "text"))
print("finished loading layout")
rows2load <- -1 # load everything
options(scipen = 999) # Do not print scientific notation
options(stringsAsFactors = FALSE) ## Do not load strings as factors
## Create property attribute table
# Need 3 tables
# 1) Main table in assessor database
# 2) Building table
# 3) BuildingAreas
col_namesMain <- t(layoutZAsmt[layoutZAsmt$TableName == 'utMain', 'FieldName'])
col_namesBldg <- t(layoutZAsmt[layoutZAsmt$TableName == 'utBuilding', 'FieldName'])
col_namesBldgA <- t(layoutZAsmt[layoutZAsmt$TableName == 'utBuildingAreas', 'FieldName'])
##################
#### BASE ####
##################
base <- read.table(file.path(dir2, "ZAsmt/Main.txt"),
nrows = rows2load,
sep = '|',
header = FALSE,
stringsAsFactors = FALSE,
skipNul = TRUE, # tells R to treat two ajacent delimiters as dividing a column
comment.char="", # tells R not to read any symbol as a comment
quote = "", # this tells R not to read quotation marks as a special symbol
col.names = col_namesMain
)
# base_s <- head(base, n = 100L)
print("finished reading table")
print(object.size(base))
print(dim(base))
# object.size()
#### using sparkr dataframe object
base1 <- as.DataFrame(base, numPartitions = 100)
#base1 <- as.DataFrame(as.matrix(base))
#base1 <- read.text(file.path(dir2, "ZAsmt/Main.txt"))
#base1 <- read.df(paste0(dir2, "ZAsmt/Main.txt"), source = "csv", delimiter = "|", header="true", inferSchema="true")
print("finished converting dataframe")
## select the useful cols
base1 <- select(base1, list("RowID", "ImportParcelID", "LoadID", "FIPS", "State", "County",
"PropertyFullStreetAddress", "PropertyHouseNumber",
"PropertyHouseNumberExt", "PropertyStreetPreDirectional",
"PropertyStreetName", "PropertyStreetSuffix", "PropertyStreetPostDirectional",
"PropertyCity", "PropertyState", "PropertyZip",
"PropertyBuildingNumber", "PropertyAddressUnitDesignator", "PropertyAddressUnitNumber",
"PropertyAddressLatitude", "PropertyAddressLongitude", "PropertyAddressCensusTractAndBlock",
"NoOfBuildings", "LotSizeAcres", "LotSizeSquareFeet", "TaxAmount", "TaxYear"))
print("finished selection")
##### TODO: convert this part into SparkR style.
if (length(unique(base$ImportParcelID)) != dim(base)[1] ) {
#Example: Print all entries for parcels with at least two records.
base[ImportParcelID %in% base[duplicated(ImportParcelID), ImportParcelID], ][order(ImportParcelID)]
setkeyv(base, c("ImportParcelID", "LoadID")) # Sets the index and also orders by ImportParcelID, then LoadID increasing
keepRows <- base[ , .I[.N], by = c("ImportParcelID")] # Creates a table where the 1st column is ImportParcelID and the second column
# gives the row number of the last row that ImportParcelID appears.
base <- base[keepRows[[2]], ] # Keeps only those rows identified in previous step
}
print(base1)
##################
#### BLDG ####
##################
######################################################################
#### Load most property attributes
bldg <- read.table(file.path(dir2, "ZAsmt/Building.txt"),
nrows = rows2load, # this is set just to test it out. Remove when code runs smoothly.
sep = '|',
header = FALSE,
stringsAsFactors = FALSE,
skipNul = TRUE, # tells R to treat two ajacent delimiters as dividing a column
comment.char="", # tells R not to read any symbol as a comment
quote = "", # this tells R not to read quotation marks as a special symbol
col.names = col_namesBldg
)
print("finished read.table")
# bldg_s <- head(bldg, n = 100L)
bldg <- as.DataFrame(bldg, numPartition = 100)
print("finished as.DataFrame")
bldg <- select(bldg, c("RowID", "NoOfUnits", "BuildingOrImprovementNumber",
"YearBuilt", "EffectiveYearBuilt", "YearRemodeled",
"NoOfStories", "StoryTypeStndCode", "TotalRooms", "TotalBedrooms",
"FullBath", "ThreeQuarterBath", "HalfBath", "QuarterBath",
"HeatingTypeorSystemStndCode",
"PropertyLandUseStndCode", "WaterStndCode"))
print("finished select")
# Reduce bldg dataset to Single-Family Residence, Condo's, Co-opts (or similar)
bldg <- filter(bldg, bldg$PropertyLandUseStndCode %in% c('RR101', # SFR
'RR999', # Inferred SFR
'RR104', # Townhouse
'RR105', # Cluster Home
'RR106', # Condominium
'RR107', # Cooperative
'RR108', # Row House
'RR109', # Planned Unit Development
'RR113', # Bungalow
'RR116', # Patio Home
'RR119', # Garden Home
'RR120')) # Landominium
print("finished reducing bldg dataset")
###### merging !!!!!
attr <- merge(base1, bldg, by = "RowID")
print("finished merging")
print(attr)
print("hello, world.")