Moving code to fix tox and other warnings over to this branch

washingtonpost · Dec 13, 2024 · d0935dc · d0935dc
1 parent 3eed4c4
commit d0935dc
Show file tree

Hide file tree

Showing 8 changed files with 66 additions and 33 deletions.
diff --git a/setup.cfg b/setup.cfg
@@ -10,12 +10,12 @@ good-names= on, x, df, NonparametricElectionModel, GaussianElectionModel,
             BaseElectionModel, qr, X, y, f, LiveData, n, Featurizer, Estimandizer, fe, PreprocessedData, CombinedData,
             ModelResults, GaussianModel, MODEL_THRESHOLD, LOG, w, df_X, df_y, v, n, g, a, b
 disable=missing-function-docstring, missing-module-docstring, missing-class-docstring, #missing
-        too-many-arguments, too-many-locals, too-many-branches, too-many-instance-attributes, too-many-statements, #structure: too-many
+        too-many-arguments, too-many-locals, too-many-branches, too-many-instance-attributes, too-many-statements, too-many-positional-arguments, #structure: too-many
         too-few-public-methods, #structure: too-few
         cell-var-from-loop, function-redefined, attribute-defined-outside-init, arguments-differ, unnecessary-dict-index-lookup, unnecessary-lambda, #structure: other
         invalid-name, #naming
         redefined-outer-name, pointless-statement, no-member, dangerous-default-value, broad-exception-raised, inconsistent-return-statements, #testing specific
         R0801, #similar lines in two files
         pointless-string-statement, unused-argument, wrong-import-position, #readability
-        protected-access, useless-parent-delegation #other
-
+        protected-access, useless-parent-delegation, #other
+        logging-fstring-interpolation
diff --git a/src/elexmodel/client.py b/src/elexmodel/client.py
@@ -393,8 +393,7 @@ def get_estimates(
         minimum_reporting_units_max = 0
         for alpha in prediction_intervals:
             minimum_reporting_units = self.model.get_minimum_reporting_units(alpha)
-            if minimum_reporting_units > minimum_reporting_units_max:
-                minimum_reporting_units_max = minimum_reporting_units
+            minimum_reporting_units_max = max(minimum_reporting_units, minimum_reporting_units_max)
 
         if APP_ENV != "local" and self.save_results:
             data.write_data(self.election_id, self.office)

diff --git a/src/elexmodel/handlers/data/VersionedData.py b/src/elexmodel/handlers/data/VersionedData.py
@@ -1,3 +1,4 @@
+import warnings
 from datetime import datetime
 
 import numpy as np
@@ -64,10 +65,12 @@ def get_versioned_results(self, filepath=None):
 
         if self.election_id.startswith("2020-11-03_USA_G"):
             path = "elex-models-prod/2020-general/results/pres/current.csv"
-        elif self.election_id.startswith("2024-11-05_USA_G"):
-            path = f"{S3_FILE_PATH}/{self.election_id}/results/{self.office_id}/{self.geographic_unit_type}/current_counties.csv"
         else:
-            path = f"{S3_FILE_PATH}/{self.election_id}/results/{self.office_id}/{self.geographic_unit_type}/current.csv"
+            base_dir = f"{S3_FILE_PATH}/{self.election_id}/results/{self.office_id}/{self.geographic_unit_type}"
+            if self.election_id.startswith("2024-11-05_USA_G"):
+                path = base_dir + "/current_counties.csv"
+            else:
+                path = base_dir + "/current.csv"
 
         data = self.s3_client.get(path, self.sample)
         LOG.info("Loaded versioned results from S3")
@@ -124,7 +127,8 @@ def compute_estimated_margin(df):
                 casting="unsafe",
             )
 
-            # check if perc_expected_vote_corr is monotone increasing (if not, give up and don't try to estimate a margin)
+            # check if perc_expected_vote_corr is monotone increasing
+            # (if not, give up and don't try to estimate a margin)
             if not np.all(np.diff(perc_expected_vote_corr) >= 0):
                 return pd.DataFrame(
                     {
@@ -143,15 +147,18 @@ def compute_estimated_margin(df):
             # Compute batch_margin using NumPy
             # this is the difference in dem_votes - the difference in gop_votes divided by the difference in total votes
             # that is, this is the normalized margin in the batch of votes recorded between versions
-            batch_margin = (
-                np.diff(results_dem, append=results_dem[-1]) - np.diff(results_gop, append=results_gop[-1])
-            ) / np.diff(results_weights, append=results_weights[-1])
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore", RuntimeWarning)
+                batch_margin = (
+                    np.diff(results_dem, append=results_dem[-1]) - np.diff(results_gop, append=results_gop[-1])
+                ) / np.diff(results_weights, append=results_weights[-1])
 
             # nan values in batch_margin are due to div-by-zero since there's no change in votes
             batch_margin[np.isnan(batch_margin)] = 0  # Set NaN margins to 0
             df["batch_margin"] = batch_margin
 
-            # batch_margins should be between -1 and 1 (otherwise, there was a data entry issue and we will not use this unit)
+            # batch_margins should be between -1 and 1
+            # (otherwise, there was a data entry issue and we will not use this unit)
             if np.abs(batch_margin).max() > 1:
                 return pd.DataFrame(
                     {
@@ -208,7 +215,9 @@ def compute_estimated_margin(df):
                 }
             )
 
-        results = results.groupby("geographic_unit_fips").apply(compute_estimated_margin).reset_index()
+        results = (
+            results.groupby("geographic_unit_fips").apply(compute_estimated_margin, include_groups=False).reset_index()
+        )
 
         for error_type in sorted(set(results["error_type"])):
             if error_type == "none":
@@ -222,9 +231,8 @@ def get_versioned_predictions(self, filepath=None):
             return pd.read_csv(filepath)
 
         if self.election_id.startswith("2020-11-03_USA_G"):
-            path = "elex-models-prod/2020-general/prediction/pres/current.csv"
             raise ValueError("No versioned predictions available for this election.")
-        else:
-            path = f"{S3_FILE_PATH}/{self.election_id}/predictions/{self.office_id}/{self.geographic_unit_type}/current.csv"
+
+        path = f"{S3_FILE_PATH}/{self.election_id}/predictions/{self.office_id}/{self.geographic_unit_type}/current.csv"
 
         return self.s3_client.get(path, self.sample)
diff --git a/src/elexmodel/handlers/s3.py b/src/elexmodel/handlers/s3.py
@@ -43,9 +43,13 @@ def put(self, filename, data, **kwargs):
 
     def get_file_path(self, file_type, path_info):
         if file_type == "preprocessed":
-            file_path = f'{S3_FILE_PATH}/{path_info["election_id"]}/data/{path_info["office"]}/data_{path_info["geographic_unit_type"]}.csv'
+            csv_file = f'data_{path_info["geographic_unit_type"]}.csv'
+            file_path = f'{S3_FILE_PATH}/{path_info["election_id"]}/data/{path_info["office"]}/{csv_file}'
         elif file_type == "config":
             file_path = f'{S3_FILE_PATH}/{path_info["election_id"]}/config/{path_info["election_id"]}'
+        else:
+            LOG.warning("Unknown file type %s", file_type)
+            file_path = None
         return file_path
 
 
@@ -128,7 +132,7 @@ def wait_for_versions(self, q):
             try:
                 future.result()
                 yield version, data
-            except Exception as e:
+            except Exception as e:  # pylint: disable=broad-exception-caught
                 LOG.error(f"Error downloading {version['VersionId']}: {e}")
 
             q.task_done()

diff --git a/src/elexmodel/models/BootstrapElectionModel.py b/src/elexmodel/models/BootstrapElectionModel.py
@@ -105,8 +105,10 @@ def __init__(self, model_settings={}, versioned_data_handler=None, pres_predicti
         self.rng = np.random.default_rng(seed=self.seed)  # used for sampling
         self.ran_bootstrap = False
 
-        # these are the max/min values for called races. Ie. if a contest is called for LHS party then the prediction/intervals should be at least lhs_called_threshold
-        # if a contest is called for RHS party then the prediction/interval should be at most rhs_called_threshold (at most because the values are negative)
+        # these are the max/min values for called races. Ie.
+        # if a contest is called for LHS party then the prediction/intervals should be at least lhs_called_threshold
+        # if a contest is called for RHS party then the prediction/interval should be at most rhs_called_threshold
+        # (at most because the values are negative)
         self.lhs_called_threshold = 0.005
         self.rhs_called_threshold = -0.005
 
@@ -359,6 +361,9 @@ def _generate_nonreporting_bounds(
             # if 0 percent of the vote is in, the upper bound would be zero if we used the above
             # code. So instead we set it to the naive bound
             upper_bound[np.isclose(upper_bound, 0)] = unobserved_upper_bound
+        else:
+            LOG.warning("Unknown bootstrap estimand %s", bootstrap_estimand)
+            return None, None
 
         # if percent reporting is 0 or 1, don't try to compute anything and revert to naive bounds
         lower_bound[
@@ -639,7 +644,8 @@ def _get_epsilon_hat_std(residuals, epsilon):
                 1 - aggregate_indicator_train.sum(axis=0) / aggregate_indicator.sum(axis=0)
             ) / aggregate_indicator_train.sum(axis=0)
 
-            # where we have < 2 units in a contest, we set the variance to the variance of the observed epsilon_hat values
+            # where we have < 2 units in a contest,
+            # we set the variance to the variance of the observed epsilon_hat values
             var[np.isnan(var) | np.isinf(var)] = np.var(epsilon[np.nonzero(epsilon)[0]].T, ddof=1)
             return np.sqrt(var)
 
@@ -789,7 +795,8 @@ def _extrapolate_unit_margin(self, reporting_units: pd.DataFrame, nonreporting_u
         percent_expected_vote is too far away.
 
         4) The correction estimates (obtained using VersionedResultsHandler) are also np.nan when there are
-        irregularities in the reporting (e.g., there's a correction to the dem/gop vote totals that revises them downwards).
+        irregularities in the reporting
+        (e.g., there's a correction to the dem/gop vote totals that revises them downwards).
 
         5) We only run this method in states with at least self.min_extrapolating_units counties available.
         """
@@ -954,6 +961,9 @@ def compute_correction_statistics(df):
             prediction_std = (
                 nonreporting_units.est_correction_max.values - nonreporting_units.est_correction_min.values
             ).reshape(-1, 1)
+        else:
+            LOG.warning("Unknown extrapolate standard deviation method %s", self.extrapolate_std_method)
+            prediction_std = 0
 
         return prediction, prediction_std
 
@@ -1444,19 +1454,28 @@ def _format_called_contests(
         lhs_rhs_intersection = set(lhs_called_contests) & set(rhs_called_contests)
         if len(lhs_rhs_intersection) > 0:
             raise BootstrapElectionModelException(
-                f"You can only call a contest for one party, not for both. Currently these contests are called for both parties: {lhs_rhs_intersection}"
+                (
+                    "You can only call a contest for one party, not for both. "
+                    + f"Currently these contests are called for both parties: {lhs_rhs_intersection}"
+                )
             )
 
         lhs_difference_with_contests = set(lhs_called_contests) - set(contests)
         if len(lhs_difference_with_contests) > 0:
             raise BootstrapElectionModelException(
-                f"You can only call contests that are being run by the model. These LHS called contests do not exist: {lhs_difference_with_contests}"
+                (
+                    "You can only call contests that are being run by the model. "
+                    + f"These LHS called contests do not exist: {lhs_difference_with_contests}"
+                )
             )
 
         rhs_difference_with_contests = set(rhs_called_contests) - set(contests)
         if len(rhs_difference_with_contests) > 0:
             raise BootstrapElectionModelException(
-                f"You can only call contests that are being run by the model. These RHS called contests do not exist: {rhs_difference_with_contests}"
+                (
+                    "You can only call contests that are being run by the model. "
+                    + f"These RHS called contests do not exist: {rhs_difference_with_contests}"
+                )
             )
 
         # the order in called_contests need
@@ -1797,7 +1816,10 @@ def get_national_summary_estimates(self, nat_sum_data_dict: dict, base_to_add: i
         # (ie. the number of contests) then raise an exception
         if len(nat_sum_data_dict) != self.divided_error_B_1.shape[0]:
             raise BootstrapElectionModelException(
-                f"nat_sum_data_dict is of length {len(nat_sum_data_dict)} but there are {self.divided_error_B_1.shape[0]} contests"
+                (
+                    f"nat_sum_data_dict is of length {len(nat_sum_data_dict)} "
+                    + f"but there are {self.divided_error_B_1.shape[0]} contests"
+                )
             )
 
         # NOTE: This assumes that pd.get_dummies does alphabetical ordering

diff --git a/src/elexmodel/models/ConformalElectionModel.py b/src/elexmodel/models/ConformalElectionModel.py
@@ -22,7 +22,7 @@
 
 class ConformalElectionModel(BaseElectionModel.BaseElectionModel, ABC):
     def __init__(self, model_settings: dict):
-        super(ConformalElectionModel, self).__init__(model_settings)
+        super(ConformalElectionModel, self).__init__(model_settings)  # pylint: disable=super-with-arguments
         self.lambda_ = model_settings.get("lambda_", 0)
 
     @classmethod
@@ -207,5 +207,5 @@ def get_all_conformalization_data_agg(cls):
         """
         raise NotImplementedError
 
-    def get_national_summary_estimates(self, nat_sum_data_dict, called_states, base_to_add, alpha):
+    def get_national_summary_estimates(self, nat_sum_data_dict, base_to_add, alpha):
         raise NotImplementedError()
diff --git a/tests/distributions/test_gaussian_model.py b/tests/distributions/test_gaussian_model.py
@@ -390,7 +390,7 @@ def test_winsorized_intervals():
     lower_n = lower - lower_correction_n[0]
     upper_n = upper + upper_correction_n[0]
 
-    for i in range(len(lower_w)):
+    for i in range(len(lower_w)):  # pylint: disable=consider-using-enumerate
         assert lower_w[i] >= lower_n[i]
         assert upper_w[i] <= upper_n[i]
 

diff --git a/tests/models/test_bootstrap_election_model.py b/tests/models/test_bootstrap_election_model.py
@@ -1063,7 +1063,7 @@ def test_get_national_summary_estimates(bootstrap_election_model, rng):
     bootstrap_election_model.get_aggregate_predictions(
         reporting_units, nonreporting_units, unexpected_units, ["postal_code"], "margin"
     )  # race calling for aggregate prediction interval assumes that the point prediction has been set accordingly
-    lower, upper = bootstrap_election_model.get_aggregate_prediction_intervals(
+    _, _ = bootstrap_election_model.get_aggregate_prediction_intervals(
         reporting_units, nonreporting_units, unexpected_units, ["postal_code"], 0.95, None, None
     )
 
@@ -1084,7 +1084,7 @@ def test_get_national_summary_estimates(bootstrap_election_model, rng):
         "margin",
         lhs_called_contests=lhs_called_contests,
     )  # race calling for aggregate prediction interval assumes that the point prediction has been set accordingly
-    lower, upper = bootstrap_election_model.get_aggregate_prediction_intervals(
+    _, _ = bootstrap_election_model.get_aggregate_prediction_intervals(
         reporting_units,
         nonreporting_units,
         unexpected_units,
@@ -1115,7 +1115,7 @@ def test_get_national_summary_estimates(bootstrap_election_model, rng):
         lhs_called_contests=lhs_called_contests,
         rhs_called_contests=rhs_called_contests,
     )  # race calling for aggregate prediction interval assumes that the point prediction has been set accordingly
-    lower, upper = bootstrap_election_model.get_aggregate_prediction_intervals(
+    _, _ = bootstrap_election_model.get_aggregate_prediction_intervals(
         reporting_units,
         nonreporting_units,
         unexpected_units,