Skip to content

Commit

Permalink
Create ML models with optimized parameters, reducing size by 13x-15x
Browse files Browse the repository at this point in the history
  • Loading branch information
johnisom committed Dec 12, 2023
1 parent 15f5ca3 commit 8ae83ae
Show file tree
Hide file tree
Showing 8 changed files with 27 additions and 15 deletions.
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
*.sqlite filter=lfs diff=lfs merge=lfs -text
*.csv filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
data-backups/
__pycache__/
joblib-objects/*.joblib
__pycache__/
3 changes: 0 additions & 3 deletions joblib-objects.zip

This file was deleted.

3 changes: 3 additions & 0 deletions joblib-objects/fipscode-labelencoder.joblib
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
4 changes: 3 additions & 1 deletion src/gui/welcome.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@ def __init__(self, *args, **kwargs):
'To filter the results, you can specify a year range, and also which region of the US to show data for. Additionally, there\'s the option to filter on a per-state basis.'
plots_text_label = ttk.Label(self, text=plots_text, wraplength=630, justify=LEFT)
predictions_subtitle = Subtitle(self, text='Predictions')
predictions_text = 'Here you\'ll be able to predict the likely cause of a fire by giving the machine learning model some information about the fire: final size, county location, and date of discovery.'
predictions_text = 'Here you\'ll be able to predict the likely cause of a fire by giving the machine learning model some information about the fire: final size, county location, and date of discovery.\n' \
'The accuracy is 60.93% for the Longitude/Latitude model and 57.64% for the State/County model with 13 labels to choose from.\n' \
'That is actually rather good (random chance would be 7.69%).'
predictions_text_label = ttk.Label(self, text=predictions_text, wraplength=630, justify=LEFT)
credits_subtitle = Subtitle(self, text='Credits / License')
credits_text = 'The credits and license pages have more information about the author of the project, the license used, and credits for others where credits are due.'
Expand Down
22 changes: 13 additions & 9 deletions src/oneshot/create_and_save_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from pathlib import Path
from ..prediction import FIPS_MODEL_PATH, FIPS_ENCODER_PATH, LONLAT_MODEL_PATH

DB_FILENAME = 'db/fires.sqlite'
DB_PATH = Path().parent.parent / 'db' / 'fires.sqlite'

# TASK:
# Given a fire’s location, start and end date, and size, predict the cause of the fire.
Expand All @@ -31,9 +31,11 @@ def create_trained_lonlat_model_and_datasets(fires_df):
y = df['stat_cause_code'].astype(int)

# Split into test and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

clf = RandomForestClassifier(n_estimators=100)
# These kwargs give the best tradeoff between model performance and size.
# Compared to default params, they reduce the size of the model by 13x yet only reduce performance by 1.6% (62.5% -> 60.93%)
clf = RandomForestClassifier(n_estimators=18, max_samples=0.33, n_jobs=-1)
clf.fit(X_train, y_train)
return clf, X_train, X_test, y_train, y_test

Expand All @@ -56,23 +58,25 @@ def create_trained_fipscode_model_and_encoder_and_datasets(fires_df):
y = df['stat_cause_code'].astype(int)

# Split into test and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

clf = RandomForestClassifier(n_estimators=100)
# These kwargs reduce the size of the model by >15x while actually improving test-data performance by ~1%;
# however, they reduce training-data performance.
clf = RandomForestClassifier(n_estimators=24, max_depth=20, max_samples=0.4, n_jobs=-1)
clf.fit(X_train, y_train)
return clf, combined_fips_code_le, X_train, X_test, y_train, y_test

def train_and_save_fipscode_model_and_encoder(fires_df):
clf, combined_fips_code_le, *_ = create_trained_fipscode_model_and_encoder_and_datasets(fires_df)
joblib_dump(clf, FIPS_MODEL_PATH)
joblib_dump(combined_fips_code_le, FIPS_ENCODER_PATH)
joblib_dump(clf, FIPS_MODEL_PATH, compress=9)
joblib_dump(combined_fips_code_le, FIPS_ENCODER_PATH, compress=9)

def train_and_save_lonlat_model(fires_df):
clf, *_ = create_trained_lonlat_model_and_datasets(fires_df)
joblib_dump(clf, LONLAT_MODEL_PATH)
joblib_dump(clf, LONLAT_MODEL_PATH, compress=9)

def train_and_save_both_models():
con = sqlite3.connect(f'file:{DB_FILENAME}?ro', uri=True)
con = sqlite3.connect(f'file:{DB_PATH}?ro', uri=True)
print('Loading all fires from database...')
fires_dataframe = pd.read_sql_query('SELECT * FROM fires', con)
con.close()
Expand Down

0 comments on commit 8ae83ae

Please sign in to comment.