nonparametric regression
zafercavdar committed Nov 21, 2017
1 parent 8cab03c commit 199a219
Showing 2 changed files with 277 additions and 0 deletions.
134 changes: 134 additions & 0 deletions 4-Non Parametric Regression/hw04_data_set.csv
@@ -0,0 +1,134 @@
"x","y"
26.4,-65.6
6.2,-2.7
20.2,-123.1
46.6,10.7
33.4,16
16.8,-77.7
10,-2.7
15.8,-21.5
13.6,-2.7
55,-2.7
16.2,-50.8
8.2,-2.7
26.2,-107.1
47.8,-14.7
11,-5.4
15.4,-32.1
16.8,-91.1
32.8,46.9
35.2,-54.9
43,14.7
16.4,-5.4
16,-26.8
19.2,-123.1
45,10.7
16.2,-21.5
38,10.7
25.4,-72.3
7.8,-2.7
28.4,-21.5
9.6,-2.7
14.6,-13.3
23.2,-123.1
8.8,-2.7
29.4,-17.4
39.4,-1.3
3.2,-2.7
15.6,-40.2
57.6,10.7
38,46.9
36.2,-37.5
23.4,-128.5
35.6,32.1
25.4,-44.3
24.2,-81.8
15.6,-21.5
33.8,45.6
21.2,-134
19.4,-72.3
13.8,0
17.8,-99.1
25,-64.4
17.6,-37.5
22,-123.1
4,-2.7
27.2,-24.2
31.2,8.1
17.6,-123.1
42.8,-10.7
17.6,-85.6
2.4,0
55.4,-2.7
17.6,-101.9
14.8,-2.7
13.2,-2.7
19.6,-127.2
52,10.7
40,-21.5
35.2,-16
47.8,-26.8
24.2,-95.1
35.4,69.6
32,54.9
55,10.7
30.2,36.2
16.6,-59
26,-5.4
15.4,-53.5
53.2,-14.7
15.8,-50.8
15.4,-54.9
10.6,-2.7
6.8,-1.3
2.6,-1.3
20.4,-117.9
6.6,-2.7
28.4,37.5
27,-16
10.2,-5.4
14.6,-5.4
31,75
11.4,0
14.6,-16
48.8,-13.3
18.6,-112.5
27.6,4
39.2,5.4
34.4,1.3
41.6,-10.7
44,-1.3
27.2,9.5
26.2,-21.5
19.4,-85.6
32,48.2
42.4,29.4
24.6,-53.5
36.2,22.8
3.6,0
21.4,-101.9
21.8,-108.4
14.6,-22.8
16.2,-61.7
16,-42.9
40.4,-13.3
14.6,-9.3
41.6,30.8
16.4,-80.4
44.4,0
25,-57.6
17.8,-104.4
25.6,-26.8
14.6,-5.4
8.8,-1.3
34.8,75
35.6,34.8
28.6,46.9
50.6,0
27.2,-45.6
18.6,-50.8
16.8,-71
24,-112.5
42.8,0
28.2,12
15.4,-22.8
143 changes: 143 additions & 0 deletions 4-Non Parametric Regression/nonparametric_regression.R
@@ -0,0 +1,143 @@
# Zafer Cavdar - COMP 421 Homework 4 - Nonparametric Regression

# read data into memory
data_set <- read.csv("hw04_data_set.csv")

# get x and y values
set.seed(521)
x_all <- data_set$x
y_all <- data_set$y
train_indices <- sample(length(x_all), 100)
x_train <- x_all[train_indices]
y_train <- y_all[train_indices]
x_test <- x_all[-train_indices]
y_test <- y_all[-train_indices]

# set bin width and borders
minimum_value <- floor(min(x_all)) - 2
maximum_value <- ceiling(max(x_all)) + 2
bin_width <- 3
grid_interval <- 0.01
data_interval <- seq(from = minimum_value, to = maximum_value, by = grid_interval)

# Regressogram
left_borders <- seq(from = minimum_value, to = maximum_value - bin_width, by = bin_width)
right_borders <- seq(from = minimum_value + bin_width, to = maximum_value, by = bin_width)
g_head <- sapply(1:length(left_borders), function(i) {
  bin <- y_train[left_borders[i] < x_train & x_train <= right_borders[i]]
  return(mean(bin))
})
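# The regressogram estimate for a bin is the average response of the training
# points that fall in that bin:
#   g_hat(x) = sum_i b(x, x_i) * y_i / sum_i b(x, x_i),
# where b(x, x_i) = 1 when x_i lies in the same bin as x and 0 otherwise.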

get_bin_no <- function(v) {
  return(ceiling((v - minimum_value) / bin_width))
}
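# Quick sanity check (an illustrative addition, not part of the original
# commit): a point half a bin-width above the left border should fall in bin 1.
stopifnot(get_bin_no(minimum_value + 0.5 * bin_width) == 1)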

plot(x_train, y_train, type = "p", pch = 19, col = "blue",
     ylim = c(min(y_train), max(y_train)), xlim = c(minimum_value, maximum_value),
     ylab = "y", xlab = "x", las = 1, main = sprintf("h = %g", bin_width))
points(x_test, y_test, type = "p", pch = 19, col = "red")
legend(55, 85, legend = c("training", "test"),
       col = c("blue", "red"), pch = 19, cex = 0.5, bty = "o")
for (b in 1:length(left_borders)) {
  lines(c(left_borders[b], right_borders[b]), c(g_head[b], g_head[b]), lwd = 2, col = "black")
  if (b < length(left_borders)) {
    lines(c(right_borders[b], right_borders[b]), c(g_head[b], g_head[b + 1]), lwd = 2, col = "black")
  }
}

# Calculate RMSE for regressogram
distances <- sapply(1:length(y_test), function(i) {
  x_test_i <- x_test[i]
  bin <- get_bin_no(x_test_i)
  y_estimated_i <- g_head[bin]
  y_test_i <- y_test[i]
  diff <- y_test_i - y_estimated_i
  return(diff^2)
})
RMSE <- sqrt(sum(distances) / length(distances))
sprintf("Regressogram => RMSE is %s when h is %s", RMSE, bin_width)

# Running mean smoother
g_head <- sapply(data_interval, function(x) {
  y_train_bin <- y_train[(x - 0.5 * bin_width) < x_train & x_train <= (x + 0.5 * bin_width)]
  return(mean(y_train_bin))
})
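# The running mean smoother is a regressogram whose bin is re-centered at every
# query point:
#   g_hat(x) = mean{ y_i : x - h/2 < x_i <= x + h/2 },
# evaluated here on a dense grid. A caveat (not from the original commit): near
# the grid edges a window may contain no training points, in which case
# mean(numeric(0)) returns NaN and lines() leaves a gap in the plot below.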

plot(x_train, y_train, type = "p", pch = 19, col = "blue",
     ylim = c(min(y_train), max(y_train)), xlim = c(minimum_value, maximum_value),
     ylab = "y", xlab = "x", las = 1, main = sprintf("h = %g", bin_width))
points(x_test, y_test, type = "p", pch = 19, col = "red")
legend(55, 85, legend = c("training", "test"),
       col = c("blue", "red"), pch = 19, cex = 0.5, bty = "o")
for (b in 1:(length(data_interval) - 1)) {
  lines(c(data_interval[b], data_interval[b + 1]), c(g_head[b], g_head[b]), lwd = 2, col = "black")
  lines(c(data_interval[b + 1], data_interval[b + 1]), c(g_head[b], g_head[b + 1]), lwd = 2, col = "black")
}

# Calculate RMSE for running mean smoother
get_interval_no <- function(v) {
  return(ceiling((v - minimum_value) / grid_interval))
}
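# Estimates at test points are read off the dense grid: get_interval_no() maps
# each x to a grid index whose grid point lies within one grid step (0.01) of x.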

distances <- sapply(1:length(y_test), function(i) {
  x_test_i <- x_test[i]
  box <- get_interval_no(x_test_i)
  y_estimated_i <- g_head[box]
  y_test_i <- y_test[i]
  diff <- y_test_i - y_estimated_i
  return(diff^2)
})
RMSE <- sqrt(sum(distances) / length(distances))
sprintf("Running Mean Smoother => RMSE is %s when h is %s", RMSE, bin_width)

# Kernel smoother
bin_width <- 1  # bandwidth h
gaussian_kernel <- function(u) {
  return((1 / sqrt(2 * pi)) * exp(-u^2 / 2))
}
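# Note: gaussian_kernel() is exactly the standard normal density, so base R's
# dnorm(u) would give the same values.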

g_head <- sapply(data_interval, function(x) {
  numerator <- sapply(1:length(x_train), function(i) {
    u <- (x - x_train[i]) / bin_width
    return(gaussian_kernel(u) * y_train[i])
  })
  denominator <- sapply(1:length(x_train), function(i) {
    u <- (x - x_train[i]) / bin_width
    return(gaussian_kernel(u))
  })
  return(sum(numerator) / sum(denominator))
})
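# The estimator above is Nadaraya-Watson kernel regression:
#   g_hat(x) = sum_i K((x - x_i) / h) * y_i / sum_i K((x - x_i) / h).
# The two inner sapply() calls can be collapsed into one vectorized pass; an
# equivalent sketch (not part of the original commit):
g_head_check <- sapply(data_interval, function(x) {
  weights <- gaussian_kernel((x - x_train) / bin_width)
  return(sum(weights * y_train) / sum(weights))
})
stopifnot(isTRUE(all.equal(g_head, g_head_check)))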

plot(x_train, y_train, type = "p", pch = 19, col = "blue",
     ylim = c(min(y_train), max(y_train)), xlim = c(minimum_value, maximum_value),
     ylab = "y", xlab = "x", las = 1, main = sprintf("h = %g", bin_width))
points(x_test, y_test, type = "p", pch = 19, col = "red")
legend(55, 85, legend = c("training", "test"),
       col = c("blue", "red"), pch = 19, cex = 0.5, bty = "o")
for (b in 1:(length(data_interval) - 1)) {
  lines(c(data_interval[b], data_interval[b + 1]), c(g_head[b], g_head[b]), lwd = 2, col = "black")
  lines(c(data_interval[b + 1], data_interval[b + 1]), c(g_head[b], g_head[b + 1]), lwd = 2, col = "black")
}

# Optional: draw a vertical segment from each test point to its kernel estimate
# for (i in 1:length(x_test)) {
#   lines(c(x_test[i], x_test[i]), c(y_test[i], g_head[get_interval_no(x_test[i])]), lwd = 2, col = "black")
# }

# Calculate RMSE for kernel smoother
distances <- sapply(1:length(y_test), function(i) {
  x_test_i <- x_test[i]
  box <- get_interval_no(x_test_i)
  y_estimated_i <- g_head[box]
  y_test_i <- y_test[i]
  diff <- y_test_i - y_estimated_i
  return(diff^2)
})
RMSE <- sqrt(sum(distances) / length(distances))
sprintf("Kernel Smoother => RMSE is %s when h is %s", RMSE, bin_width)
