Skip to content

Commit

Permalink
Create ML models with optimized parameters, reducing size by 13x-15x
Browse files Browse the repository at this point in the history
  • Loading branch information
johnisom committed Dec 12, 2023
1 parent 15f5ca3 commit 8ae83ae
Show file tree
Hide file tree
Showing 8 changed files with 27 additions and 15 deletions.
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
*.sqlite filter=lfs diff=lfs merge=lfs -text
*.csv filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
data-backups/
__pycache__/
joblib-objects/*.joblib
__pycache__/
3 changes: 0 additions & 3 deletions joblib-objects.zip

This file was deleted.

3 changes: 3 additions & 0 deletions joblib-objects/fipscode-labelencoder.joblib
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
4 changes: 3 additions & 1 deletion src/gui/welcome.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@ def __init__(self, *args, **kwargs):
'To filter the results, you can specify a year range, and also which region of the US to show data for. Additionally, there\'s the option to filter on a per-state basis.'
plots_text_label = ttk.Label(self, text=plots_text, wraplength=630, justify=LEFT)
predictions_subtitle = Subtitle(self, text='Predictions')
predictions_text = 'Here you\'ll be able to predict the likely cause of a fire by giving the machine learning model some information about the fire: final size, county location, and date of discovery.'
predictions_text = 'Here you\'ll be able to predict the likely cause of a fire by giving the machine learning model some information about the fire: final size, county location, and date of discovery.\n' \
'The accuracy is 60.93% for the Longitude/Latitude model and 57.64% for the State/County model with 13 labels to choose from.\n' \
'That is actually rather good (random chance would be 7.69%).'
predictions_text_label = ttk.Label(self, text=predictions_text, wraplength=630, justify=LEFT)
credits_subtitle = Subtitle(self, text='Credits / License')
credits_text = 'The credits and license pages have more information about the author of the project, the license used, and credits for others where credits are due.'
Expand Down
22 changes: 13 additions & 9 deletions src/oneshot/create_and_save_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from pathlib import Path
from ..prediction import FIPS_MODEL_PATH, FIPS_ENCODER_PATH, LONLAT_MODEL_PATH

DB_FILENAME = 'db/fires.sqlite'
DB_PATH = Path().parent.parent / 'db' / 'fires.sqlite'

# TASK:
# Given a fire’s location, start and end date, and size, predict the cause of the fire.
Expand All @@ -31,9 +31,11 @@ def create_trained_lonlat_model_and_datasets(fires_df):
y = df['stat_cause_code'].astype(int)

# Split into test and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

clf = RandomForestClassifier(n_estimators=100)
# These kwargs give the best tradeoff between model performance and size.
# Compared to default params, they reduce the size of the model by 13x yet only reduce performance by 1.6% (62.5% -> 60.93%)
clf = RandomForestClassifier(n_estimators=18, max_samples=0.33, n_jobs=-1)
clf.fit(X_train, y_train)
return clf, X_train, X_test, y_train, y_test

Expand All @@ -56,23 +58,25 @@ def create_trained_fipscode_model_and_encoder_and_datasets(fires_df):
y = df['stat_cause_code'].astype(int)

# Split into test and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

clf = RandomForestClassifier(n_estimators=100)
# These kwargs reduce the size of the model by >15x while actually improving test-data performance by ~1%;
# however, they reduce training-data performance.
clf = RandomForestClassifier(n_estimators=24, max_depth=20, max_samples=0.4, n_jobs=-1)
clf.fit(X_train, y_train)
return clf, combined_fips_code_le, X_train, X_test, y_train, y_test

def train_and_save_fipscode_model_and_encoder(fires_df):
clf, combined_fips_code_le, *_ = create_trained_fipscode_model_and_encoder_and_datasets(fires_df)
joblib_dump(clf, FIPS_MODEL_PATH)
joblib_dump(combined_fips_code_le, FIPS_ENCODER_PATH)
joblib_dump(clf, FIPS_MODEL_PATH, compress=9)
joblib_dump(combined_fips_code_le, FIPS_ENCODER_PATH, compress=9)

def train_and_save_lonlat_model(fires_df):
clf, *_ = create_trained_lonlat_model_and_datasets(fires_df)
joblib_dump(clf, LONLAT_MODEL_PATH)
joblib_dump(clf, LONLAT_MODEL_PATH, compress=9)

def train_and_save_both_models():
con = sqlite3.connect(f'file:{DB_FILENAME}?ro', uri=True)
con = sqlite3.connect(f'file:{DB_PATH}?ro', uri=True)
print('Loading all fires from database...')
fires_dataframe = pd.read_sql_query('SELECT * FROM fires', con)
con.close()
Expand Down

0 comments on commit 8ae83ae

Please sign in to comment.