Skip to content

Commit

Permalink
Merge branch 'main' of https://github.com/UBC-MDS/DSCI522_group_12 into main
Browse files Browse the repository at this point in the history
  • Loading branch information
HazelJJJ committed Dec 13, 2020
2 parents 8f4f2b6 + a9cb189 commit 8e62161
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 22 deletions.
13 changes: 12 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,15 @@ RUN apt-get update -qq && install2.r --error \
caret \
pacman \
feather \
ggthemes
ggthemes \
ggcorrplot \
here \
PerformanceAnalytics \
GGally \
FSinR \
ggpubr \
VSURF


# install janitor R package
RUN Rscript -e 'devtools::install_github("sfirke/janitor")'
41 changes: 22 additions & 19 deletions src/eda_cred.r
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ library(docopt)
library(ggthemes)
#library(graphics)
library(ggcorrplot)
library(arrow)

theme_set(theme_minimal())

Expand All @@ -35,8 +36,8 @@ main <- function(train, train_scaled, out_dir) {
}

# read data
training_data <- read_feather(train)
training_scaled <- read_feather(train_scaled)
training_data <- arrow::read_feather(train)
training_scaled <- arrow::read_feather(train_scaled)

# density plot
density_plot <- training_scaled %>%
Expand Down Expand Up @@ -75,23 +76,6 @@ main <- function(train, train_scaled, out_dir) {
width = 8,
height = 10)

# target counts bar plot
count_plot <- training_data %>%
ggplot(aes(x=as.numeric(default), fill=default)) +
geom_bar() +
geom_text(stat='count', aes(label=..count..), vjust=3, hjust=0.9, color = 'black') +
ggtitle("Count of Defaulting Clients") +
xlab("Defaults") +
ylab("Count")

count_plot <- prop_plot + guides(fill=guide_legend(title="Default Flag"))
count_plot <- prop_plot + scale_shape_discrete(labels = c("Non-Defaults", "Defaults")) + scale_fill_discrete(labels = c("Non-Defaults", "Defaults"))

ggsave(paste0(out_dir, "/counts_plot.png"),
count_plot,
width = 8,
height = 10)

# proportions plot
prop_plot <- training_data %>%
ggplot(aes(x=as.numeric(default), y = ..prop.., fill = factor(..x..), group = 1)) +
Expand All @@ -115,6 +99,25 @@ main <- function(train, train_scaled, out_dir) {
numeric_df$default <- as.numeric(numeric_df$default)
numeric_df$age <- as.numeric(numeric_df$age)

# target counts bar plot
# Bar chart of the number of rows per `default` value, written to
# <out_dir>/counts_plot.png.
#
# The legend title and legend labels are layered onto `count_plot` itself.
# Earlier, the two follow-up assignments started from `prop_plot`, which
# replaced the bar chart with the proportions plot before saving, and the
# second assignment also threw away the legend title set by the first.
count_plot <- training_data %>%
# factor() keeps the fill aesthetic discrete so scale_fill_discrete() applies
# NOTE(review): assumes `default` has exactly two values, ordered
# non-default then default, to match the legend labels below -- confirm
ggplot(aes(x = as.numeric(default), fill = factor(default))) +
geom_bar() +
# print each bar's count inside the bar
geom_text(stat = "count", aes(label = ..count..), vjust = 3, hjust = 0.9, color = "black") +
ggtitle("Count of Defaulting Clients") +
xlab("Defaults") +
ylab("Count") +
guides(fill = guide_legend(title = "Default Flag")) +
scale_fill_discrete(labels = c("Non-Defaults", "Defaults"))

ggsave(paste0(out_dir, "/counts_plot.png"),
count_plot,
width = 8,
height = 10)



# correlation plot
corr <- round(cor(numeric_df %>% select_if(is.numeric)), 1)
correlation_plot <- ggcorrplot(corr, hc.order = TRUE, outline.col = "white")
Expand Down
12 changes: 10 additions & 2 deletions src/fit_predict_default_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,12 @@
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, recall_score
from sklearn.metrics import f1_score, recall_score, plot_confusion_matrix
from sklearn.model_selection import (
RandomizedSearchCV,
cross_validate,
Expand Down Expand Up @@ -169,6 +171,12 @@ def predict_test(X_train, y_train, X_test, y_test, best_model):
y_pred = best_model.predict(X_test)
test_score = f1_score(y_test, y_pred)
print(f"Best test score: {test_score}")

# Plot confusion matrix results on test
rcParams.update({'figure.autolayout': True})
plot_confusion_matrix(best_model, X_test, y_test, display_labels=["default", "pay bill"], values_format="d", cmap="Purples")
plt.title("Confusion matrix on test set results")
plt.savefig('results/figures/confusionmtx.png')
return test_score


Expand All @@ -195,4 +203,4 @@ def store_results(model, scores, results_dict):
}

if __name__ == "__main__":
main(opt["--train_data"], opt["--test_data"], opt["--hp_out_dir"], opt["--prelim_results_dir"])
main(opt["--train_data"], opt["--test_data"], opt["--hp_out_dir"], opt["--prelim_results_dir"])

0 comments on commit 8e62161

Please sign in to comment.