diff --git a/Dockerfile b/Dockerfile index fcfd9ff..ac8af33 100644 --- a/Dockerfile +++ b/Dockerfile @@ -32,4 +32,15 @@ RUN apt-get update -qq && install2.r --error \ caret \ pacman \ feather \ - ggthemes + ggthemes \ + ggcorrplot \ + here \ + PerformanceAnalytics \ + GGally \ + FSinR \ + ggpubr \ + VSURF + + +# install janitor R package +RUN Rscript -e 'devtools::install_github("sfirke/janitor")' diff --git a/src/eda_cred.r b/src/eda_cred.r index f154b15..8492922 100644 --- a/src/eda_cred.r +++ b/src/eda_cred.r @@ -19,6 +19,7 @@ library(docopt) library(ggthemes) #library(graphics) library(ggcorrplot) +library(arrow) theme_set(theme_minimal()) @@ -35,8 +36,8 @@ main <- function(train, train_scaled, out_dir) { } # read data - training_data <- read_feather(train) - training_scaled <- read_feather(train_scaled) + training_data <- arrow::read_feather(train) + training_scaled <- arrow::read_feather(train_scaled) # density plot density_plot <- training_scaled %>% @@ -75,23 +76,6 @@ main <- function(train, train_scaled, out_dir) { width = 8, height = 10) - # target counts bar plot - count_plot <- training_data %>% - ggplot(aes(x=as.numeric(default), fill=default)) + - geom_bar() + - geom_text(stat='count', aes(label=..count..), vjust=3, hjust=0.9, color = 'black') + - ggtitle("Count of Defaulting Clients") + - xlab("Defaults") + - ylab("Count") - - count_plot <- prop_plot + guides(fill=guide_legend(title="Default Flag")) - count_plot <- prop_plot + scale_shape_discrete(labels = c("Non-Defaults", "Defaults")) + scale_fill_discrete(labels = c("Non-Defaults", "Defaults")) - - ggsave(paste0(out_dir, "/counts_plot.png"), - count_plot, - width = 8, - height = 10) - # proportions plot prop_plot <- training_data %>% ggplot(aes(x=as.numeric(default), y = ..prop.., fill = factor(..x..), group = 1)) + @@ -115,6 +99,25 @@ main <- function(train, train_scaled, out_dir) { numeric_df$default <- as.numeric(numeric_df$default) numeric_df$age <- as.numeric(numeric_df$age) + # target counts bar plot + count_plot <- training_data %>% + ggplot(aes(x=as.numeric(default), fill=default)) + + geom_bar() + + geom_text(stat='count', aes(label=..count..), vjust=3, hjust=0.9, color = 'black') + + ggtitle("Count of Defaulting Clients") + + xlab("Defaults") + + ylab("Count") + + count_plot <- prop_plot + guides(fill=guide_legend(title="Default Flag")) + count_plot <- prop_plot + scale_shape_discrete(labels = c("Non-Defaults", "Defaults")) + scale_fill_discrete(labels = c("Non-Defaults", "Defaults")) + + ggsave(paste0(out_dir, "/counts_plot.png"), + count_plot, + width = 8, + height = 10) + + + # correlation plot corr <- round(cor(numeric_df %>% select_if(is.numeric)), 1) correlation_plot <- ggcorrplot(corr, hc.order = TRUE, outline.col = "white") diff --git a/src/fit_predict_default_model.py b/src/fit_predict_default_model.py index 8160114..86d71a9 100644 --- a/src/fit_predict_default_model.py +++ b/src/fit_predict_default_model.py @@ -18,10 +18,12 @@ import os import numpy as np import pandas as pd +import matplotlib.pyplot as plt +from matplotlib import rcParams from sklearn.dummy import DummyClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LogisticRegression -from sklearn.metrics import f1_score, recall_score +from sklearn.metrics import f1_score, recall_score, plot_confusion_matrix from sklearn.model_selection import ( RandomizedSearchCV, cross_validate, @@ -169,6 +171,12 @@ def predict_test(X_train, y_train, X_test, y_test, best_model): y_pred = best_model.predict(X_test) test_score = f1_score(y_test, y_pred) print(f"Best test score: {test_score}") + + # Plot confusion matrix results on test + rcParams.update({'figure.autolayout': True}) + plot_confusion_matrix(best_model, X_test, y_test, display_labels=["default", "pay bill"], values_format="d", cmap="Purples") + plt.title("Confusion matrix on test set results") + plt.savefig('results/figures/confusionmtx.png') return test_score @@ -195,4 +203,4 @@ def store_results(model, scores, results_dict): } if __name__ == "__main__": - main(opt["--train_data"], opt["--test_data"], opt["--hp_out_dir"], opt["--prelim_results_dir"]) \ No newline at end of file + main(opt["--train_data"], opt["--test_data"], opt["--hp_out_dir"], opt["--prelim_results_dir"])