Skip to content

Commit

Permalink
Fix error in treatment_corr() that is "All columns in a tibble must b…
Browse files Browse the repository at this point in the history
…e vectors." error. (#6, thanks to Cathy Tomson)
  • Loading branch information
choonghyunryu committed Sep 25, 2022
1 parent ae65157 commit 8812a9f
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 64 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: alookr
Type: Package
Title: Model Classifier for Binary Classification
Version: 0.3.7
Version: 0.3.8.9000
Authors@R: c(
person("Choonghyun", "Ryu",, "[email protected]", role = c("aut", "cre"))
)
Expand Down
18 changes: 15 additions & 3 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,14 +1,25 @@
CHANGES IN R VERSION 0.3.8:

BUG FIXES:

* Fix the "All columns in a tibble must be vectors." error in
treatment_corr(). (#6, thanks to Cathy Tomson)



CHANGES IN R VERSION 0.3.7:

MAJOR CHANGES:

* Removed plan(multiprocess) from the parallel-processing logic, because
plan(multiprocess) of the future package is deprecated. (#2, thanks to Henrik Bengtsson)

MINOR CHANGES:

* Remove the warning of "UNRELIABLE VALUE" with seed = TRUE in future function.

BUG FIXES:

* Fix the "replacement has length zero" error in run_performance().
(#5, thanks to Muhammad Fawad)

Expand All @@ -17,6 +28,7 @@ CHANGES IN R VERSION 0.3.7:
CHANGES IN R VERSION 0.3.6:

MINOR CHANGES:

* Implemented a function to replace the unbalanced package used
in the process of splitting data, because the unbalanced
package has been removed from CRAN. (#3)
Expand All @@ -26,7 +38,7 @@ CHANGES IN R VERSION 0.3.6:
CHANGES IN R VERSION 0.3.5:

BUG FIXES:

* Fix error in glmnet when run_predict() is performed with
test data that has more variables than train data.

Expand All @@ -45,7 +57,7 @@ CHANGES IN R VERSION 0.3.4:
CHANGES IN R VERSION 0.3.3:

BUG FIXES:

* Fix error in run_predict() when trying to predict on a dataset without
the response variable (thanks @shivakhanal, #1).

Expand Down
140 changes: 80 additions & 60 deletions R/preprocess.R
Original file line number Diff line number Diff line change
Expand Up @@ -251,30 +251,44 @@ cleanse.data.frame <- function(.data, uniq = TRUE, uniq_thres = 0.1, char = TRUE
#' @import dplyr
#' @export
treatment_corr <- function(.data, corr_thres = 0.8, treat = TRUE, verbose = TRUE) {
## Pearson correlation for numaric variables
corr <- .data %>%
dlookr::correlate() %>%
filter(abs(coef_corr) > corr_thres) %>%
filter(as.integer(var1) > as.integer(var2))

vars <- corr %>%
distinct(var2) %>%
pull %>%
as.character

if (nrow(corr) > 0) {
if (verbose) {
message(sprintf("* remove variables whose strong correlation (pearson >= %s)",
corr_thres))
message(paste(" - remove ", format(corr$var2), " : with ", corr$var1,
" (", round(corr$coef_corr, 4), ")\n", sep = ""))
}

if (treat) {
.data <- .data %>%
dplyr::select(-vars)
}
}
## Pearson correlation for numerical variables
n_numeric <- .data %>%
diagnose() %>%
filter(types %in% c("integer", "numeric")) %>%
filter(!variables %in% "TARGET") %>%
select(variables) %>%
pull() %>%
length()

if (n_numeric > 2) {
corr <- .data %>%
dlookr::correlate() %>%
filter(abs(coef_corr) > corr_thres) %>%
filter(as.integer(var1) > as.integer(var2))

vars <- corr %>%
distinct(var2) %>%
pull %>%
as.character

if (nrow(corr) > 0) {
if (verbose) {
message(sprintf("* remove variables whose strong correlation (pearson >= %s)",
corr_thres))
message(paste(" - remove ", format(corr$var2), " : with ", corr$var1,
" (", round(corr$coef_corr, 4), ")\n", sep = ""))
}

if (treat) {
.data <- .data %>%
dplyr::select(-vars)
}
}

n_corr <- nrow(corr)
} else {
n_corr <- 0
}

## Spearman correlation for categorical variables
vars <- .data %>%
Expand All @@ -284,43 +298,49 @@ treatment_corr <- function(.data, corr_thres = 0.8, treat = TRUE, verbose = TRUE
select(variables) %>%
pull

M <- .data %>%
select(vars) %>%
mutate_all(as.integer) %>%
cor(method = "spearman")

m <- as.vector(M)
tab <- tibble::as_tibble(expand.grid(var1 = row.names(M),
var2 = row.names(M)))
corr2 <- tibble::add_column(tab, coef_corr = m) %>%
filter(var1 != var2) %>%
filter(var1 %in% vars) %>%
filter(abs(coef_corr) > corr_thres) %>%
filter(as.integer(var1) > as.integer(var2))

vars <- corr2 %>%
distinct(var2) %>%
pull %>%
as.character

if (nrow(corr2) > 0) {
if (verbose) {
message(sprintf("* remove variables whose strong correlation (spearman >= %s)",
corr_thres))
message(paste(" - remove ", format(corr2$var2), " : with ", corr2$var1,
" (", round(corr2$coef_corr, 4), ")\n", sep = ""))
}

if (treat) {
.data <- .data %>%
dplyr::select(-vars)
}
}

if ((nrow(corr) + nrow(corr2)) == 0 & verbose) {
if (length(vars) > 2) {
M <- .data %>%
select(vars) %>%
mutate_all(as.integer) %>%
cor(method = "spearman")

m <- as.vector(M)
tab <- tibble::as_tibble(expand.grid(var1 = row.names(M),
var2 = row.names(M)))
corr2 <- tibble::add_column(tab, coef_corr = m) %>%
filter(var1 != var2) %>%
filter(var1 %in% vars) %>%
filter(abs(coef_corr) > corr_thres) %>%
filter(as.integer(var1) > as.integer(var2))

vars <- corr2 %>%
distinct(var2) %>%
pull %>%
as.character

if (nrow(corr2) > 0) {
if (verbose) {
message(sprintf("* remove variables whose strong correlation (spearman >= %s)",
corr_thres))
message(paste(" - remove ", format(corr2$var2), " : with ", corr2$var1,
" (", round(corr2$coef_corr, 4), ")\n", sep = ""))
}

if (treat) {
.data <- .data %>%
dplyr::select(-vars)
}
}

n_corr2 <- nrow(corr2)
} else {
n_corr2 <- 0
}

if ((n_corr + n_corr2) == 0 & verbose) {
message("All correlation coefficient is below threshold")
}

}
if (treat) {
.data
}
Expand Down

0 comments on commit 8812a9f

Please sign in to comment.