WIP

johnisom · Dec 13, 2023 · ca2b40f · ca2b40f
1 parent edf0070
commit ca2b40f
Show file tree

Hide file tree

Showing 8 changed files with 50 additions and 41 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1 @@
-data-backups/
 __pycache__/
diff --git a/README.md b/README.md
@@ -1,6 +1,5 @@
 # Wildfire Analyzer
 
-
 ## Installation
 
 Install git, install git lfs, then clone github repo to the computer.
@@ -15,30 +14,19 @@ Make sure you have 5 GB of free memory first.
 
 ## About
 
-This project is a data analytics and prediction project that uses machine learning. It runs in jupyter notebooks.
-
-The dataset I'm using is https://www.kaggle.com/datasets/rtatman/188-million-us-wildfires/data. I cleaned and formatted the data and selected only
-that which I needed.
-Reverse geocoding is made possible through the `pygris` python package. It is a package that downloads and reads the US Census Bureau's TIGRIS file on the shapes and locations of all counties and county equivalents in the USA, and I used that to place the location of latitude and longitude coordinates in specific counties, as part of my cleaning and formatting the data.
-
-This is heavily work in progress and the requirements may change.
+This is a data analytics and prediction project that uses machine learning. It runs locally as a Tk desktop application.
 
 ## Structure
 
 stand alone app running on windows 10 with jupyter notebook and sqlite for data access.
 
-## Requirements
+## Credits
 
-make the following descriptive (analytical) methods:
-1. table showing ranking of counties that have the most fires and most acres burned by fires, per year.
-  a. alternatively, have it be a map with counties that are shaded according to number of fires (this is what i went with)
-2. a pie chart that shows causes of wildfires in the US. the user can select an individual state if they wish, or all of the USA.
-3. a confusion matrix to visualize the model's performance
+### Wildfires Dataset
 
-make the following non-descriptive (predictive) methods:
-1. model that predicts fire size class based off of some parameters
-  a. suggested params are location (probably granular to county), date, reporting agency
+The dataset I'm using is https://www.kaggle.com/datasets/rtatman/188-million-us-wildfires/data. I cleaned and formatted the data and kept only what I needed, greatly reducing its size.
 
-## TODOS
+### County-level Mapping
 
-There are 1,880,465 data points which need to be split into training and test sets.
+Reverse geocoding is made possible through the `pygris` Python package.
+It downloads and reads the US Census Bureau's TIGER/Line shapefiles, which describe the shapes and locations of all counties and county equivalents in the USA. I used them to place latitude and longitude coordinates in specific counties as part of cleaning and formatting the data. The package is also used to display the map under the "Plots" tab of this application.
diff --git a/src/bindings.py b/src/bindings.py
@@ -1,6 +1,6 @@
 from math import inf
 import matplotlib.pyplot as plt
-from .plotting import plot_causes_of_fires_by_number_of_fires, plot_causes_of_fires_by_total_area_burned, plot_counties_by_number_of_fires, plot_counties_by_total_area_burned
+from .plotting import plot_causes_of_fires_by_number_of_fires, plot_causes_of_fires_by_total_area_burned, plot_counties_by_number_of_fires, plot_counties_by_total_area_burned, plot_fipscode_model_confusion_matrix, plot_lonlat_model_confusion_matrix
 from .myutils import add_fire_count_to_counties, add_acres_burned_to_counties
 from .location_info import get_counties_geodf
 from .fires_info import get_fires_dataframe
@@ -74,3 +74,11 @@ def plot_everything(keys=('lower48',), year_start=None, year_end=None):
   plot_causes_of_fires_by_number_of_fires(fires_df, keys, fire_cause_counts_title)
   plot_counties_by_number_of_fires(counties_geo_df, keys, fire_counts_title)
   plt.show()
+
+def plot_lonlat_confusion_matrix():
+  plot_lonlat_model_confusion_matrix()
+  plt.show()
+
+def plot_fipscode_confusion_matrix():
+  plot_fipscode_model_confusion_matrix()
+  plt.show()
diff --git a/src/gui/app.py b/src/gui/app.py
@@ -4,7 +4,7 @@
 from .welcome import WelcomeFrame
 from .plots import PlotsFrame
 from .predictions import PredictionsFrame
-from .credits import CreditsFrame, LicenseFrame
+from .license import LicenseFrame
 from ..prediction import FIPS_ENCODER_PATH, FIPS_MODEL_PATH, LONLAT_MODEL_PATH
 from .custom_widgets import Title
 
@@ -20,7 +20,6 @@ def __init__(self, enable_predictions=True, **kwargs):
     notebook = ttk.Notebook(self, padding=2)
     welcome_frame = WelcomeFrame(notebook)
     plots_frame = PlotsFrame(notebook)
-    credits_frame = CreditsFrame(notebook)
     license_frame = LicenseFrame(notebook)
     notebook.add(welcome_frame, text=welcome_frame.title)
     notebook.add(plots_frame, text=plots_frame.title)
@@ -31,7 +30,6 @@ def __init__(self, enable_predictions=True, **kwargs):
       no_ml_message = 'The "joblib-objects" directory was unable to be found, or the files within don\'t match the registered name for the predictive models.\n' \
         'Make sure to have git-lfs installed and pull the latest version of the repository.'
       self.warn(title='Unable to load ML predictive models!', message=no_ml_message)
-    notebook.add(credits_frame, text=credits_frame.title)
     notebook.add(license_frame, text=license_frame.title)
 
     # Set up the grid

diff --git a/src/gui/credits.py → src/gui/license.py b/src/gui/credits.py → src/gui/license.py
@@ -1,18 +1,7 @@
 from pathlib import Path
 from tkinter import *
 from tkinter import ttk
-from .custom_widgets import NotebookFrame, Title, Subtitle
-
-class CreditsFrame(NotebookFrame):
-  title = 'Credits'
-
-  def __init__(self, *args, **kwargs):
-    super().__init__(*args, **kwargs)
-
-    title = Title(self, text='Credits')
-
-    # Set items on the grid
-    title.grid(row=0, column=0, sticky=(N, E, W))
+from .custom_widgets import NotebookFrame, Title
 
 class LicenseFrame(NotebookFrame):
   title = 'License'
@@ -31,4 +20,3 @@ def __init__(self, *args, **kwargs):
     # Set items on the grid
     title.grid(row=0, column=0, sticky=(N, E, W), pady='3')
     license.grid(row=1, column=0, sticky=(S, E, W))
-
diff --git a/src/gui/predictions.py b/src/gui/predictions.py
@@ -47,7 +47,7 @@ def __init__(self, *args, **kwargs):
     self.set_up_state_county_variables()
 
     title = Title(self, text='Predictions')
-    subframe = ttk.LabelFrame(self, text='Predict cause of fire in giving some parameters')
+    subframe = ttk.LabelFrame(self, text='Predict cause of fire by giving some parameters')
 
     input_frame = ttk.Frame(subframe, padding=5)
     discovery_datetime_label = ttk.Label(input_frame, text='Discovery datetime: ')

diff --git a/src/gui/welcome.py b/src/gui/welcome.py
@@ -10,7 +10,7 @@ def __init__(self, *args, **kwargs):
 
     title = Title(self, text='Welcome')
     intro_text = 'Welcome to Wildfire Analyzer! This application is a data analytics application for wildfires in the US.\n' \
-      'Other than this welcome page, there are 3 pages: Plots, Predictions, and Credits.'
+      'Other than this welcome page, there are 4 pages: Plots, Predictions, Credits, and License.'
     intro_text_label = ttk.Label(self, text=intro_text, wraplength=630, justify=LEFT)
     plots_subtitle = Subtitle(self, text='Plots')
     plots_text = 'In the plots page you can plot pie charts of the causes of wildfires (such as lightning or arson) by number of fires started or by number of acres burned.\n' \
@@ -19,8 +19,7 @@ def __init__(self, *args, **kwargs):
     plots_text_label = ttk.Label(self, text=plots_text, wraplength=630, justify=LEFT)
     predictions_subtitle = Subtitle(self, text='Predictions')
     predictions_text = 'Here you\'ll be able to predict the likely cause of a fire by giving the machine learning model some information about the fire: final size, county location, and date of discovery.\n' \
-      'The accuracy is 60.93% for the Longitude/Latitude model and 57.64% for the State/County model with 13 labels to choose from.\n' \
-      'That is actually rather good (random chance would be 7.69%).'
+      'The performance is 60.93% accuracy for the Longitude/Latitude model and 57.64% for the State/County model with 13 labels to choose from. That is actually rather good (random chance would be 7.69%).'
     predictions_text_label = ttk.Label(self, text=predictions_text, wraplength=630, justify=LEFT)
     credits_subtitle = Subtitle(self, text='Credits / License')
     credits_text = 'The credits and license pages have more information about the author of the project, the license used, and credits for others where credits are due.'

diff --git a/src/plotting.py b/src/plotting.py
@@ -1,6 +1,9 @@
 import matplotlib.pyplot as plt
 from matplotlib.colors import LogNorm
+from sklearn.metrics import ConfusionMatrixDisplay
+from .fires_info import get_fires_dataframe
 from .location_info import get_state_fips_codes
+from .prediction import get_fips_encoder, get_fips_model, get_lonlat_model, STAT_CAUSE_CODE_TO_DESCR
 
 def plot_counties_by_number_of_fires(counties_geo_df, keys, plot_title):
   fips_codes = get_state_fips_codes(keys)
@@ -57,3 +60,29 @@ def plot_causes_of_fires_by_total_area_burned(fires_df, keys, plot_title):
   fig.supxlabel(f'(Total: {total_sum:,.0f} acres)')
   fig.tight_layout()
   return fig, ax
+
+def plot_lonlat_model_confusion_matrix():
+  model = get_lonlat_model()
+  fires_df = get_fires_dataframe()
+  df = fires_df[['fire_size', 'longitude', 'latitude', 'discovery_datetime', 'contained_datetime', 'stat_cause_code']].dropna()
+  X = df.drop('stat_cause_code', axis=1)
+  y = df['stat_cause_code'] - 1
+  fig, ax = plt.subplots(figsize=[12, 8])
+  ConfusionMatrixDisplay.from_predictions(y_true=y, y_pred=model.predict(X), normalize='true', xticks_rotation='vertical', display_labels=STAT_CAUSE_CODE_TO_DESCR.values(), values_format='.2f', ax=ax)
+  ax.set_title('Confusion Matrix for the Longitude/Latitude prediction model.')
+  fig.tight_layout()
+  return fig, ax
+
+def plot_fipscode_model_confusion_matrix():
+  encoder = get_fips_encoder()
+  model = get_fips_model()
+  fires_df = get_fires_dataframe()
+  df = fires_df[['fire_size', 'combined_fips_code', 'discovery_datetime', 'contained_datetime', 'stat_cause_code']].dropna()
+  df.loc[:, ['combined_fips_code']] = encoder.transform(df['combined_fips_code'].astype(int))
+  X = df.drop('stat_cause_code', axis=1)
+  y = df['stat_cause_code'] - 1
+  fig, ax = plt.subplots(figsize=[10, 10])
+  ConfusionMatrixDisplay.from_predictions(y_true=y, y_pred=model.predict(X), normalize='true', xticks_rotation='vertical', display_labels=STAT_CAUSE_CODE_TO_DESCR.values(), values_format='.2f', ax=ax)
+  ax.set_title('Confusion Matrix for the State/County prediction model.')
+  fig.tight_layout()
+  return fig, ax