Merge branch 'main' of https://github.com/UBC-MDS/DSCI522_group_12 into main
HazelJJJ · Dec 13, 2020 · 8e62161 · 8e62161
2 parents 8f4f2b6 + a9cb189
commit 8e62161
Show file tree

Hide file tree

Showing 3 changed files with 44 additions and 22 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -32,4 +32,15 @@ RUN apt-get update -qq && install2.r --error \
     caret \
     pacman \
     feather \
-    ggthemes
+    ggthemes \
+    ggcorrplot \
+    here \
+    PerformanceAnalytics \
+    GGally \
+    FSinR \
+    ggpubr \
+    VSURF
+
+
+# Install the janitor R package from GitHub (sfirke/janitor)
+RUN Rscript -e 'devtools::install_github("sfirke/janitor")'
diff --git a/src/eda_cred.r b/src/eda_cred.r
@@ -19,6 +19,7 @@ library(docopt)
 library(ggthemes)
 #library(graphics)
 library(ggcorrplot)
+library(arrow)
 
 theme_set(theme_minimal())
 
@@ -35,8 +36,8 @@ main <- function(train, train_scaled, out_dir) {
   }
 
   # read data 
-  training_data <- read_feather(train)
-  training_scaled <- read_feather(train_scaled)
+  training_data <- arrow::read_feather(train)
+  training_scaled <- arrow::read_feather(train_scaled)
 
   # density plot
   density_plot <- training_scaled %>%
@@ -75,23 +76,6 @@ main <- function(train, train_scaled, out_dir) {
          width = 8, 
          height = 10)
 
-  # target counts bar plot 
-  count_plot <- training_data %>% 
-    ggplot(aes(x=as.numeric(default), fill=default)) +
-    geom_bar() +
-    geom_text(stat='count', aes(label=..count..), vjust=3, hjust=0.9, color = 'black') + 
-    ggtitle("Count of Defaulting Clients") +
-    xlab("Defaults") +
-    ylab("Count") 
-
-  count_plot <- prop_plot + guides(fill=guide_legend(title="Default Flag")) 
-  count_plot <- prop_plot + scale_shape_discrete(labels = c("Non-Defaults", "Defaults")) + scale_fill_discrete(labels = c("Non-Defaults", "Defaults"))
-
-  ggsave(paste0(out_dir, "/counts_plot.png"), 
-         count_plot,
-         width = 8, 
-         height = 10)
-
   # proportions plot
   prop_plot <- training_data %>% 
     ggplot(aes(x=as.numeric(default),  y = ..prop.., fill = factor(..x..), group = 1)) +
@@ -115,6 +99,25 @@ main <- function(train, train_scaled, out_dir) {
   numeric_df$default <- as.numeric(numeric_df$default)
   numeric_df$age <- as.numeric(numeric_df$age)
 
+  # target counts bar plot: one bar per value of `default`, labelled with its row count
+  count_plot <- training_data %>%
+    ggplot(aes(x=as.numeric(default), fill=default)) +
+    geom_bar() +
+    geom_text(stat='count', aes(label=..count..), vjust=3, hjust=0.9, color = 'black') +
+    ggtitle("Count of Defaulting Clients") +
+    xlab("Defaults") +
+    ylab("Count")
+  # build on count_plot (previously prop_plot) so the bar chart above is what gets styled and saved
+  count_plot <- count_plot + guides(fill=guide_legend(title="Default Flag"))
+  # NOTE(review): scale_fill_discrete() assumes `default` is a discrete column - confirm
+  count_plot <- count_plot + scale_shape_discrete(labels = c("Non-Defaults", "Defaults")) + scale_fill_discrete(labels = c("Non-Defaults", "Defaults"))
+
+  ggsave(paste0(out_dir, "/counts_plot.png"),
+         count_plot,
+         width = 8,
+         height = 10)
+
+
+
   # correlation plot
   corr <- round(cor(numeric_df %>% select_if(is.numeric)), 1)
   correlation_plot <- ggcorrplot(corr, hc.order = TRUE, outline.col = "white")

diff --git a/src/fit_predict_default_model.py b/src/fit_predict_default_model.py
@@ -18,10 +18,12 @@
 import os
 import numpy as np
 import pandas as pd
+import matplotlib.pyplot as plt
+from matplotlib import rcParams
 from sklearn.dummy import DummyClassifier
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.linear_model import LogisticRegression
-from sklearn.metrics import f1_score, recall_score
+from sklearn.metrics import f1_score, recall_score, plot_confusion_matrix
 from sklearn.model_selection import (
     RandomizedSearchCV,
     cross_validate,
@@ -169,6 +171,12 @@ def predict_test(X_train, y_train, X_test, y_test, best_model):
     y_pred = best_model.predict(X_test)
     test_score = f1_score(y_test, y_pred)
     print(f"Best test score: {test_score}")
+
+    # Plot confusion matrix results on test
+    rcParams.update({'figure.autolayout': True})
+    plot_confusion_matrix(best_model, X_test, y_test, display_labels=["default", "pay bill"], values_format="d", cmap="Purples")
+    plt.title("Confusion matrix on test set results")
+    plt.savefig('results/figures/confusionmtx.png')
     return test_score
 
 
@@ -195,4 +203,4 @@ def store_results(model, scores, results_dict):
     }
 
 if __name__ == "__main__":
-    main(opt["--train_data"], opt["--test_data"], opt["--hp_out_dir"], opt["--prelim_results_dir"])
+    main(opt["--train_data"], opt["--test_data"], opt["--hp_out_dir"], opt["--prelim_results_dir"])