Skip to content

Commit

Permalink
Replace LabelEncoder with OrdinalEncoder
Browse files: browse the repository at this point in the history
  • Loading branch information
johnisom committed Dec 13, 2023
1 parent ca2b40f commit f50b6dd
Show file tree
Hide file tree
Showing 7 changed files with 34 additions and 27 deletions.
Empty file removed joblib-objects/.gitkeep
Empty file.
3 changes: 0 additions & 3 deletions joblib-objects/fipscode-labelencoder.joblib

This file was deleted.

3 changes: 3 additions & 0 deletions joblib-objects/fipscode-ordinalencoder.joblib
Git LFS file not shown
Git LFS file not shown
40 changes: 25 additions & 15 deletions src/oneshot/create_and_save_models.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import sqlite3
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from joblib import dump as joblib_dump
Expand Down Expand Up @@ -42,13 +42,13 @@ def create_trained_lonlat_model_and_datasets(fires_df):
def create_trained_fipscode_model_and_encoder_and_datasets(fires_df):
"""
Given a dataframe of all data from the fires database, format and split the data for training and train a machine learning model.
Returns the model, the combined fips code label encoder, and the split training and testing data.
Returns the model, the combined fips code ordinal encoder, and the split training and testing data.
"""
df = fires_df[['fire_size', 'combined_fips_code', 'discovery_datetime', 'contained_datetime', 'stat_cause_code']]

# Encode data for the model
combined_fips_code_le = LabelEncoder()
df.loc[:, ['combined_fips_code']] = combined_fips_code_le.fit_transform(df['combined_fips_code'])
combined_fips_code_oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
df.loc[:, ['combined_fips_code']] = combined_fips_code_oe.fit_transform(df[['combined_fips_code']]).astype(int)
df.loc[:, ['stat_cause_code']] = df['stat_cause_code'] - 1 # change range from 1-13 to 0-12

# Only train with good data
Expand All @@ -64,23 +64,33 @@ def create_trained_fipscode_model_and_encoder_and_datasets(fires_df):
# however, it reduces training data performance.
clf = RandomForestClassifier(n_estimators=24, max_depth=20, max_samples=0.4, n_jobs=-1)
clf.fit(X_train, y_train)
return clf, combined_fips_code_le, X_train, X_test, y_train, y_test

def train_and_save_fipscode_model_and_encoder(fires_df):
clf, combined_fips_code_le, *_ = create_trained_fipscode_model_and_encoder_and_datasets(fires_df)
return clf, combined_fips_code_oe, X_train, X_test, y_train, y_test

def train_and_save_fipscode_model_and_encoder(fires_df=None):
if fires_df is None:
con = sqlite3.connect(f'file:{DB_PATH}?ro', uri=True)
print('Loading all fires from database...')
fires_df = pd.read_sql_query('SELECT * FROM fires', con)
con.close()
clf, combined_fips_code_oe, *_ = create_trained_fipscode_model_and_encoder_and_datasets(fires_df)
joblib_dump(clf, FIPS_MODEL_PATH, compress=9)
joblib_dump(combined_fips_code_le, FIPS_ENCODER_PATH, compress=9)

def train_and_save_lonlat_model(fires_df):
joblib_dump(combined_fips_code_oe, FIPS_ENCODER_PATH, compress=9)

def train_and_save_lonlat_model(fires_df=None):
    """
    Train the longitude/latitude model and save it to LONLAT_MODEL_PATH.
    fires_df is a dataframe of all data from the fires database; when it is None,
    every row of the `fires` table is loaded from the database at DB_PATH first.
    """
    if fires_df is None:
        # SQLite only opens a URI read-only when given `mode=ro`; a bare `?ro` is an
        # unknown query parameter that is ignored, leaving the database read-write.
        con = sqlite3.connect(f'file:{DB_PATH}?mode=ro', uri=True)
        try:
            print('Loading all fires from database...')
            fires_df = pd.read_sql_query('SELECT * FROM fires', con)
        finally:
            # Release the connection even if the query raises.
            con.close()
    # Only the trained classifier is needed here; the train/test splits are discarded.
    clf, *_ = create_trained_lonlat_model_and_datasets(fires_df)
    joblib_dump(clf, LONLAT_MODEL_PATH, compress=9)

def train_and_save_both_models():
con = sqlite3.connect(f'file:{DB_PATH}?ro', uri=True)
print('Loading all fires from database...')
fires_dataframe = pd.read_sql_query('SELECT * FROM fires', con)
fires_df = pd.read_sql_query('SELECT * FROM fires', con)
con.close()
print('Training and saving the FIPS code model and label encoder...')
train_and_save_fipscode_model_and_encoder(fires_dataframe)
print('Training and saving the FIPS code model and ordinal encoder...')
train_and_save_fipscode_model_and_encoder(fires_df)
print('Training and saving the longitude/latitude model...')
train_and_save_lonlat_model(fires_dataframe)
train_and_save_lonlat_model(fires_df)
2 changes: 1 addition & 1 deletion src/plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def plot_fipscode_model_confusion_matrix():
model = get_fips_model()
fires_df = get_fires_dataframe()
df = fires_df[['fire_size', 'combined_fips_code', 'discovery_datetime', 'contained_datetime', 'stat_cause_code']].dropna()
df.loc[:, ['combined_fips_code']] = encoder.transform(df['combined_fips_code'].astype(int))
df.loc[:, ['combined_fips_code']] = encoder.transform(df[['combined_fips_code']])[0].astype(int)
X = df.drop('stat_cause_code', axis=1)
y = df['stat_cause_code'] - 1
fig, ax = plt.subplots(figsize=[10, 10])
Expand Down
9 changes: 3 additions & 6 deletions src/prediction.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pandas as pd

FIPS_MODEL_PATH = Path().parent / 'joblib-objects' / 'firesize-fipscode-discoverycontaineddates-causecode-classifier.joblib'
FIPS_ENCODER_PATH = Path().parent / 'joblib-objects' / 'fipscode-labelencoder.joblib'
FIPS_ENCODER_PATH = Path().parent / 'joblib-objects' / 'fipscode-ordinalencoder.joblib'
LONLAT_MODEL_PATH = Path().parent / 'joblib-objects' / 'firesize-lonlat-discoverycontaineddates-causecode-classifier.joblib'
STAT_CAUSE_CODE_TO_DESCR = {
1: 'Lightning', 2: 'Equipment Use', 3: 'Smoking', 4: 'Campfire', 5: 'Debris Burning',
Expand Down Expand Up @@ -46,12 +46,9 @@ def get_lonlat_model():
return _lonlat_model

def run_fips_model_prediction(fire_size, combined_fips_code, discovery_datetime, contained_datetime):
fips_encoder = get_fips_encoder()
encoder = get_fips_encoder()
classifier = get_fips_model()
try:
encoded_fips_code = fips_encoder.transform([combined_fips_code])[0]
except ValueError:
encoded_fips_code = -1
encoded_fips_code = encoder.transform([[combined_fips_code]])[0].astype(int)[0]
df = pd.DataFrame(
data=[[fire_size, encoded_fips_code, discovery_datetime.timestamp(), contained_datetime.timestamp()]],
columns=['fire_size', 'combined_fips_code', 'discovery_datetime', 'contained_datetime']
Expand Down

0 comments on commit f50b6dd

Please sign in to comment.