Skip to content

Commit

Permalink
Merge pull request #37 from larahabashy/main
Browse files Browse the repository at this point in the history
updating plot results
  • Loading branch information
HazelJJJ authored Nov 29, 2020
2 parents 41ba994 + 35a6d83 commit d9dc73e
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 5 deletions.
Binary file added results/correlation_plot.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added results/education_histogram.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
43 changes: 38 additions & 5 deletions src/eda_cred.r
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# author: Lara Habashy
# date: 2020-11-26

"Creates eda plots for the pre-processed training data from the Wisconsin breast cancer data (from http://mlr.cs.umass.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data).
Saves the plots as a pdf and png file.
Usage: src/eda_cred.r --train=<train> --out_dir=<out_dir>
"Creates eda plots for the pre-processed training data from the default of credit card clients data (from http://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients).
Saves the plots as a png file.
Usage: Rscript src/eda_cred.r --train=<train> --out_dir=<out_dir>
Options:
--train=<train> Path (including filename) to training data (which needs to be saved as a feather file)
Expand All @@ -12,9 +12,12 @@ Options:

library(feather)
library(tidyverse)
library(caret)
#library(caret)
library(docopt)
library(ggthemes)
#library (PerformanceAnalytics)
#library(graphics)

# Make ggplot2's minimal theme the default for every plot drawn below.
theme_set(theme_minimal())

# Parse the command-line arguments with docopt.
# NOTE(review): `doc` is presumably the usage/options string at the top of the
# script — confirm it is assigned before this line.
opt <- docopt(doc)
Expand All @@ -32,11 +35,41 @@ main <- function(train, out_dir) {
xlab("Credit Limit") +
ylab("Density") +
ggtitle("Density of Credit Limit")

ggsave(paste0(out_dir, "/density_plot.png"),
density_plot,
width = 8,
height = 10)

# Education histogram ----
# Collapse columns 23:29 into one factor: for each row, take the name of the
# column that holds the row maximum and upper-case it.
# NOTE(review): presumably columns 23:29 are the one-hot education indicators
# produced by pre-processing — confirm; positional indices break silently if
# the column order ever changes.
education_cols <- training_scaled[, 23:29]
# ties.method = "first" keeps the label reproducible; the default ("random")
# picks an arbitrary column on each run whenever a row has tied maxima.
top_col <- max.col(education_cols, ties.method = "first")
training_scaled$education <- as.factor(toupper(names(education_cols)[top_col]))

# Histogram of pay_1, filled by default status, one facet per education level.
# NOTE(review): the title says "Education Levels" but the x aesthetic is
# pay_1 — confirm which one is intended.
education_histogram <- ggplot(training_scaled, aes(pay_1, fill = default)) +
  geom_histogram(binwidth = 1) +
  ggtitle("Histogram of Education Levels") +
  facet_grid(. ~ education) +
  theme_fivethirtyeight()

ggsave(
  paste0(out_dir, "/education_histogram.png"),
  education_histogram,
  width = 8,
  height = 10
)

# Correlation plot ----
# Work on a copy so the plotting data frame used above is left untouched.
numeric_df <- training_scaled
numeric_df$default <- as.numeric(numeric_df$default)
numeric_df$age <- as.numeric(numeric_df$age)
# Drop the categorical descriptors before computing correlations.
numeric_df$sex <- NULL
numeric_df$education <- NULL
numeric_df$marriage <- NULL
# Keep only the numeric columns (base-R equivalent of select_if(is.numeric)).
numeric_df <- numeric_df[vapply(numeric_df, is.numeric, logical(1))]

# chart.Correlation() draws with base graphics and does not return a ggplot
# object, so ggsave() cannot write it. Render into a PNG device instead, using
# the same 8 x 10 inch size as the other plots (300 dpi matches ggsave's
# default), and always close the device, even if plotting fails.
grDevices::png(
  paste0(out_dir, "/correlation_plot.png"),
  width = 8,
  height = 10,
  units = "in",
  res = 300
)
tryCatch(
  PerformanceAnalytics::chart.Correlation(
    numeric_df,
    histogram = TRUE,
    method = "pearson",
    col = "blue",
    pch = 1,
    main = "all"
  ),
  finally = grDevices::dev.off()
)

}

# Script entry point: run main() with the parsed --train and --out_dir values.
main(opt[["--train"]], opt[["--out_dir"]])

0 comments on commit d9dc73e

Please sign in to comment.