Skip to content

Commit

Permalink
Moving code to fix tox and other warnings over to this branch
Browse files Browse the repository at this point in the history
  • Loading branch information
dmnapolitano committed Dec 13, 2024
1 parent 3eed4c4 commit d0935dc
Show file tree
Hide file tree
Showing 8 changed files with 66 additions and 33 deletions.
6 changes: 3 additions & 3 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,12 @@ good-names= on, x, df, NonparametricElectionModel, GaussianElectionModel,
BaseElectionModel, qr, X, y, f, LiveData, n, Featurizer, Estimandizer, fe, PreprocessedData, CombinedData,
ModelResults, GaussianModel, MODEL_THRESHOLD, LOG, w, df_X, df_y, v, n, g, a, b
disable=missing-function-docstring, missing-module-docstring, missing-class-docstring, #missing
too-many-arguments, too-many-locals, too-many-branches, too-many-instance-attributes, too-many-statements, #structure: too-many
too-many-arguments, too-many-locals, too-many-branches, too-many-instance-attributes, too-many-statements, too-many-positional-arguments, #structure: too-many
too-few-public-methods, #structure: too-few
cell-var-from-loop, function-redefined, attribute-defined-outside-init, arguments-differ, unnecessary-dict-index-lookup, unnecessary-lambda, #structure: other
invalid-name, #naming
redefined-outer-name, pointless-statement, no-member, dangerous-default-value, broad-exception-raised, inconsistent-return-statements, #testing specific
R0801, #similar lines in two files
pointless-string-statement, unused-argument, wrong-import-position, #readability
protected-access, useless-parent-delegation #other

protected-access, useless-parent-delegation, #other
logging-fstring-interpolation
3 changes: 1 addition & 2 deletions src/elexmodel/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,8 +393,7 @@ def get_estimates(
minimum_reporting_units_max = 0
for alpha in prediction_intervals:
minimum_reporting_units = self.model.get_minimum_reporting_units(alpha)
if minimum_reporting_units > minimum_reporting_units_max:
minimum_reporting_units_max = minimum_reporting_units
minimum_reporting_units_max = max(minimum_reporting_units, minimum_reporting_units_max)

if APP_ENV != "local" and self.save_results:
data.write_data(self.election_id, self.office)
Expand Down
32 changes: 20 additions & 12 deletions src/elexmodel/handlers/data/VersionedData.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import warnings
from datetime import datetime

import numpy as np
Expand Down Expand Up @@ -64,10 +65,12 @@ def get_versioned_results(self, filepath=None):

if self.election_id.startswith("2020-11-03_USA_G"):
path = "elex-models-prod/2020-general/results/pres/current.csv"
elif self.election_id.startswith("2024-11-05_USA_G"):
path = f"{S3_FILE_PATH}/{self.election_id}/results/{self.office_id}/{self.geographic_unit_type}/current_counties.csv"
else:
path = f"{S3_FILE_PATH}/{self.election_id}/results/{self.office_id}/{self.geographic_unit_type}/current.csv"
base_dir = f"{S3_FILE_PATH}/{self.election_id}/results/{self.office_id}/{self.geographic_unit_type}"
if self.election_id.startswith("2024-11-05_USA_G"):
path = base_dir + "/current_counties.csv"
else:
path = base_dir + "/current.csv"

data = self.s3_client.get(path, self.sample)
LOG.info("Loaded versioned results from S3")
Expand Down Expand Up @@ -124,7 +127,8 @@ def compute_estimated_margin(df):
casting="unsafe",
)

# check if perc_expected_vote_corr is monotone increasing (if not, give up and don't try to estimate a margin)
# check if perc_expected_vote_corr is monotone increasing
# (if not, give up and don't try to estimate a margin)
if not np.all(np.diff(perc_expected_vote_corr) >= 0):
return pd.DataFrame(
{
Expand All @@ -143,15 +147,18 @@ def compute_estimated_margin(df):
# Compute batch_margin using NumPy
# this is (the difference in dem_votes minus the difference in gop_votes) divided by the difference in total votes
# that is, this is the normalized margin in the batch of votes recorded between versions
batch_margin = (
np.diff(results_dem, append=results_dem[-1]) - np.diff(results_gop, append=results_gop[-1])
) / np.diff(results_weights, append=results_weights[-1])
with warnings.catch_warnings():
warnings.simplefilter("ignore", RuntimeWarning)
batch_margin = (
np.diff(results_dem, append=results_dem[-1]) - np.diff(results_gop, append=results_gop[-1])
) / np.diff(results_weights, append=results_weights[-1])

# nan values in batch_margin are due to div-by-zero since there's no change in votes
batch_margin[np.isnan(batch_margin)] = 0 # Set NaN margins to 0
df["batch_margin"] = batch_margin

# batch_margins should be between -1 and 1 (otherwise, there was a data entry issue and we will not use this unit)
# batch_margins should be between -1 and 1
# (otherwise, there was a data entry issue and we will not use this unit)
if np.abs(batch_margin).max() > 1:
return pd.DataFrame(
{
Expand Down Expand Up @@ -208,7 +215,9 @@ def compute_estimated_margin(df):
}
)

results = results.groupby("geographic_unit_fips").apply(compute_estimated_margin).reset_index()
results = (
results.groupby("geographic_unit_fips").apply(compute_estimated_margin, include_groups=False).reset_index()
)

for error_type in sorted(set(results["error_type"])):
if error_type == "none":
Expand All @@ -222,9 +231,8 @@ def get_versioned_predictions(self, filepath=None):
return pd.read_csv(filepath)

if self.election_id.startswith("2020-11-03_USA_G"):
path = "elex-models-prod/2020-general/prediction/pres/current.csv"
raise ValueError("No versioned predictions available for this election.")
else:
path = f"{S3_FILE_PATH}/{self.election_id}/predictions/{self.office_id}/{self.geographic_unit_type}/current.csv"

path = f"{S3_FILE_PATH}/{self.election_id}/predictions/{self.office_id}/{self.geographic_unit_type}/current.csv"

return self.s3_client.get(path, self.sample)
8 changes: 6 additions & 2 deletions src/elexmodel/handlers/s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,13 @@ def put(self, filename, data, **kwargs):

def get_file_path(self, file_type, path_info):
if file_type == "preprocessed":
file_path = f'{S3_FILE_PATH}/{path_info["election_id"]}/data/{path_info["office"]}/data_{path_info["geographic_unit_type"]}.csv'
csv_file = f'data_{path_info["geographic_unit_type"]}.csv'
file_path = f'{S3_FILE_PATH}/{path_info["election_id"]}/data/{path_info["office"]}/{csv_file}'
elif file_type == "config":
file_path = f'{S3_FILE_PATH}/{path_info["election_id"]}/config/{path_info["election_id"]}'
else:
LOG.warning("Unknown file type %s", file_type)
file_path = None
return file_path


Expand Down Expand Up @@ -128,7 +132,7 @@ def wait_for_versions(self, q):
try:
future.result()
yield version, data
except Exception as e:
except Exception as e: # pylint: disable=broad-exception-caught
LOG.error(f"Error downloading {version['VersionId']}: {e}")

q.task_done()
Expand Down
38 changes: 30 additions & 8 deletions src/elexmodel/models/BootstrapElectionModel.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,10 @@ def __init__(self, model_settings={}, versioned_data_handler=None, pres_predicti
self.rng = np.random.default_rng(seed=self.seed) # used for sampling
self.ran_bootstrap = False

# these are the max/min values for called races. Ie. if a contest is called for LHS party then the prediction/intervals should be at least lhs_called_threshold
# if a contest is called for RHS party then the prediction/interval should be at most rhs_called_threshold (at most because the values are negative)
# these are the max/min values for called races, i.e.
# if a contest is called for LHS party then the prediction/intervals should be at least lhs_called_threshold
# if a contest is called for RHS party then the prediction/interval should be at most rhs_called_threshold
# (at most because the values are negative)
self.lhs_called_threshold = 0.005
self.rhs_called_threshold = -0.005

Expand Down Expand Up @@ -359,6 +361,9 @@ def _generate_nonreporting_bounds(
# if 0 percent of the vote is in, the upper bound would be zero if we used the above
# code. So instead we set it to the naive bound
upper_bound[np.isclose(upper_bound, 0)] = unobserved_upper_bound
else:
LOG.warning("Unknown bootstrap estimand %s", bootstrap_estimand)
return None, None

# if percent reporting is 0 or 1, don't try to compute anything and revert to naive bounds
lower_bound[
Expand Down Expand Up @@ -639,7 +644,8 @@ def _get_epsilon_hat_std(residuals, epsilon):
1 - aggregate_indicator_train.sum(axis=0) / aggregate_indicator.sum(axis=0)
) / aggregate_indicator_train.sum(axis=0)

# where we have < 2 units in a contest, we set the variance to the variance of the observed epsilon_hat values
# where we have < 2 units in a contest,
# we set the variance to the variance of the observed epsilon_hat values
var[np.isnan(var) | np.isinf(var)] = np.var(epsilon[np.nonzero(epsilon)[0]].T, ddof=1)
return np.sqrt(var)

Expand Down Expand Up @@ -789,7 +795,8 @@ def _extrapolate_unit_margin(self, reporting_units: pd.DataFrame, nonreporting_u
percent_expected_vote is too far away.
4) The correction estimates (obtained using VersionedResultsHandler) are also np.nan when there are
irregularities in the reporting (e.g., there's a correction to the dem/gop vote totals that revises them downwards).
irregularities in the reporting
(e.g., there's a correction to the dem/gop vote totals that revises them downwards).
5) We only run this method in states with at least self.min_extrapolating_units counties available.
"""
Expand Down Expand Up @@ -954,6 +961,9 @@ def compute_correction_statistics(df):
prediction_std = (
nonreporting_units.est_correction_max.values - nonreporting_units.est_correction_min.values
).reshape(-1, 1)
else:
LOG.warning("Unknown extrapolate standard deviation method %s", self.extrapolate_std_method)
prediction_std = 0

return prediction, prediction_std

Expand Down Expand Up @@ -1444,19 +1454,28 @@ def _format_called_contests(
lhs_rhs_intersection = set(lhs_called_contests) & set(rhs_called_contests)
if len(lhs_rhs_intersection) > 0:
raise BootstrapElectionModelException(
f"You can only call a contest for one party, not for both. Currently these contests are called for both parties: {lhs_rhs_intersection}"
(
"You can only call a contest for one party, not for both. "
+ f"Currently these contests are called for both parties: {lhs_rhs_intersection}"
)
)

lhs_difference_with_contests = set(lhs_called_contests) - set(contests)
if len(lhs_difference_with_contests) > 0:
raise BootstrapElectionModelException(
f"You can only call contests that are being run by the model. These LHS called contests do not exist: {lhs_difference_with_contests}"
(
"You can only call contests that are being run by the model. "
+ f"These LHS called contests do not exist: {lhs_difference_with_contests}"
)
)

rhs_difference_with_contests = set(rhs_called_contests) - set(contests)
if len(rhs_difference_with_contests) > 0:
raise BootstrapElectionModelException(
f"You can only call contests that are being run by the model. These RHS called contests do not exist: {rhs_difference_with_contests}"
(
"You can only call contests that are being run by the model. "
+ f"These RHS called contests do not exist: {rhs_difference_with_contests}"
)
)

# the order in called_contests need
Expand Down Expand Up @@ -1797,7 +1816,10 @@ def get_national_summary_estimates(self, nat_sum_data_dict: dict, base_to_add: i
# (ie. the number of contests) then raise an exception
if len(nat_sum_data_dict) != self.divided_error_B_1.shape[0]:
raise BootstrapElectionModelException(
f"nat_sum_data_dict is of length {len(nat_sum_data_dict)} but there are {self.divided_error_B_1.shape[0]} contests"
(
f"nat_sum_data_dict is of length {len(nat_sum_data_dict)} "
+ f"but there are {self.divided_error_B_1.shape[0]} contests"
)
)

# NOTE: This assumes that pd.get_dummies does alphabetical ordering
Expand Down
4 changes: 2 additions & 2 deletions src/elexmodel/models/ConformalElectionModel.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

class ConformalElectionModel(BaseElectionModel.BaseElectionModel, ABC):
def __init__(self, model_settings: dict):
super(ConformalElectionModel, self).__init__(model_settings)
super(ConformalElectionModel, self).__init__(model_settings) # pylint: disable=super-with-arguments
self.lambda_ = model_settings.get("lambda_", 0)

@classmethod
Expand Down Expand Up @@ -207,5 +207,5 @@ def get_all_conformalization_data_agg(cls):
"""
raise NotImplementedError

def get_national_summary_estimates(self, nat_sum_data_dict, called_states, base_to_add, alpha):
def get_national_summary_estimates(self, nat_sum_data_dict, base_to_add, alpha):
raise NotImplementedError()
2 changes: 1 addition & 1 deletion tests/distributions/test_gaussian_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,7 +390,7 @@ def test_winsorized_intervals():
lower_n = lower - lower_correction_n[0]
upper_n = upper + upper_correction_n[0]

for i in range(len(lower_w)):
for i in range(len(lower_w)): # pylint: disable=consider-using-enumerate
assert lower_w[i] >= lower_n[i]
assert upper_w[i] <= upper_n[i]

Expand Down
6 changes: 3 additions & 3 deletions tests/models/test_bootstrap_election_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -1063,7 +1063,7 @@ def test_get_national_summary_estimates(bootstrap_election_model, rng):
bootstrap_election_model.get_aggregate_predictions(
reporting_units, nonreporting_units, unexpected_units, ["postal_code"], "margin"
) # race calling for aggregate prediction interval assumes that the point prediction has been set accordingly
lower, upper = bootstrap_election_model.get_aggregate_prediction_intervals(
_, _ = bootstrap_election_model.get_aggregate_prediction_intervals(
reporting_units, nonreporting_units, unexpected_units, ["postal_code"], 0.95, None, None
)

Expand All @@ -1084,7 +1084,7 @@ def test_get_national_summary_estimates(bootstrap_election_model, rng):
"margin",
lhs_called_contests=lhs_called_contests,
) # race calling for aggregate prediction interval assumes that the point prediction has been set accordingly
lower, upper = bootstrap_election_model.get_aggregate_prediction_intervals(
_, _ = bootstrap_election_model.get_aggregate_prediction_intervals(
reporting_units,
nonreporting_units,
unexpected_units,
Expand Down Expand Up @@ -1115,7 +1115,7 @@ def test_get_national_summary_estimates(bootstrap_election_model, rng):
lhs_called_contests=lhs_called_contests,
rhs_called_contests=rhs_called_contests,
) # race calling for aggregate prediction interval assumes that the point prediction has been set accordingly
lower, upper = bootstrap_election_model.get_aggregate_prediction_intervals(
_, _ = bootstrap_election_model.get_aggregate_prediction_intervals(
reporting_units,
nonreporting_units,
unexpected_units,
Expand Down

0 comments on commit d0935dc

Please sign in to comment.