From b96dc02ff2861db5583d3139cd35aff356e92bec Mon Sep 17 00:00:00 2001 From: OpenDisclosure Deploybot Date: Thu, 9 Nov 2023 16:47:19 -0800 Subject: [PATCH 01/12] Run `make clean download import process` This is an automated update by travis-ci at Thu Nov 9 16:47:19 PST 2023 [skip ci] --- build/_data/elections/oakland/2022-11-08.json | 2 +- build/_data/stats.json | 2 +- build/_data/totals.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/build/_data/elections/oakland/2022-11-08.json b/build/_data/elections/oakland/2022-11-08.json index b906816a9..f71600820 100644 --- a/build/_data/elections/oakland/2022-11-08.json +++ b/build/_data/elections/oakland/2022-11-08.json @@ -8,7 +8,7 @@ "contributions_by_type": { "PTY": 15900.0, "Committee": 995101.14, - "Individual": 2507910.869999998, + "Individual": 2507910.870000002, "Unitemized": 135828.3, "Self Funding": 2711.0, "Other (includes Businesses)": 2638453.6399999997 diff --git a/build/_data/stats.json b/build/_data/stats.json index 732bfa410..47fbd546a 100644 --- a/build/_data/stats.json +++ b/build/_data/stats.json @@ -1,3 +1,3 @@ { - "date_processed": "2023-11-08 00:07:22 -0800" + "date_processed": "2023-11-09 00:07:21 -0800" } diff --git a/build/_data/totals.json b/build/_data/totals.json index e338c52aa..fa7c50955 100644 --- a/build/_data/totals.json +++ b/build/_data/totals.json @@ -788,7 +788,7 @@ "contributions_by_type": { "PTY": 15900.0, "Committee": 995101.14, - "Individual": 2507910.869999998, + "Individual": 2507910.870000002, "Unitemized": 135828.3, "Self Funding": 2711.0, "Other (includes Businesses)": 2638453.6399999997 From a87f06ef6ac9452be7b6c60c6f7fdd7b4712821c Mon Sep 17 00:00:00 2001 From: OpenDisclosure Deploybot Date: Fri, 10 Nov 2023 16:45:57 -0800 Subject: [PATCH 02/12] Run `make clean download import process` This is an automated update by travis-ci at Fri Nov 10 16:45:57 PST 2023 [skip ci] --- build/_data/stats.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/build/_data/stats.json b/build/_data/stats.json index 47fbd546a..69bbf130c 100644 --- a/build/_data/stats.json +++ b/build/_data/stats.json @@ -1,3 +1,3 @@ { - "date_processed": "2023-11-09 00:07:21 -0800" + "date_processed": "2023-11-10 00:07:18 -0800" } From b2bc9bfc7a42d4a652b3814306286bc066f71643 Mon Sep 17 00:00:00 2001 From: OpenDisclosure Deploybot Date: Sat, 11 Nov 2023 16:46:40 -0800 Subject: [PATCH 03/12] Run `make clean download import process` This is an automated update by travis-ci at Sat Nov 11 16:46:40 PST 2023 [skip ci] --- build/_data/stats.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/_data/stats.json b/build/_data/stats.json index 69bbf130c..dae176ea9 100644 --- a/build/_data/stats.json +++ b/build/_data/stats.json @@ -1,3 +1,3 @@ { - "date_processed": "2023-11-10 00:07:18 -0800" + "date_processed": "2023-11-11 00:07:00 -0800" } From ebc9cda034db144a9b0f761afa92494f1e8d5590 Mon Sep 17 00:00:00 2001 From: Chenglim Ear Date: Sat, 11 Nov 2023 22:51:04 -0800 Subject: [PATCH 04/12] add README and comments to provide info on how to check data using digests --- Makefile | 2 +- README.md | 15 ++++++++++++ bin/{create-digests.py => create-digests} | 30 ++++++++++++++++++----- build/_data/stats.json | 2 +- 4 files changed, 41 insertions(+), 8 deletions(-) rename bin/{create-digests.py => create-digests} (73%) diff --git a/Makefile b/Makefile index ba9a2ecf6..07bc81c64 100644 --- a/Makefile +++ b/Makefile @@ -16,7 +16,7 @@ process: process.rb # todo: remove RUBYOPT variable when activerecord fixes deprecation warnings echo 'delete from calculations;'| psql $(DATABASE_NAME) rm -rf build && RUBYOPT="-W:no-deprecated -W:no-experimental" bundle exec ruby process.rb - python bin/create-digests.py + bin/create-digests download-spreadsheets: downloads/csv/candidates.csv downloads/csv/committees.csv \ downloads/csv/referendums.csv downloads/csv/name_to_number.csv \ diff --git a/README.md b/README.md index 
61dfde294..1a7ac3e91 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,21 @@ If you want to serve the static JSON files via a local web server: $ make run ## Developing + +### Checking output data changes + +This repository is used to generate data files that are used by the website. After `make process` is run, a `build` directory is generated containing the data files. This directory is checked in to the repository and later checked out when generating the website. After making code changes, it is important to compare the generated `build` directory against the `build` directory generated before the code changes and verify that changes from the code changes are as expected. + +Because a strict comparison of all contents of the `build` directory will always include changes that occur independent of any code change, every developer has to know about these expected changes in order to perform this check. To remove the need for this, a specific file, `build/digests.json`, contains digests generated for JSON data in the `build` directory after excluding these expected changes. To look for changes that exclude these expected changes, simply look for a change in the `build/digests.json` file. + +Currently, these are the expected changes that occur independent of any code change: +* timestamps change for each run +* top contributors lists contain undefined ordering of contributors with the same contribution +* top spenders lists contain undefined ordering of spenders with the same spending +* rounding differences for floats + +The expected changes are excluded before generating digests for data in the `build` directory. The logic for this can be found in the function `clean_data`, found in the file `bin/create-digests`. After the code is modified such that an expected change no longer exists, the exclusion of that change can be removed from `clean_data`. 
For example, the rounding of floats is not consistently the same each time `make process` is run, due to differences in the environment. When the code is fixed so that the rounding of floats is the same as long as the data hasn't changed, the `round_floats` call in `clean_data` can be removed. + ### Adding a calculator Each metric about a candidate is calculated independently. A metric might be diff --git a/bin/create-digests.py b/bin/create-digests similarity index 73% rename from bin/create-digests.py rename to bin/create-digests index a4b3c50c8..2c300c30f 100644 --- a/bin/create-digests.py +++ b/bin/create-digests @@ -1,3 +1,5 @@ +#!/usr/bin/env python + import os import json import hashlib @@ -37,23 +39,35 @@ def sort_arrays(data): def redact(data): if type(data) == dict: if 'date_processed' in data: + # redact timestamps data['date_processed'] = '***' else: for key in data.keys(): if key.startswith('top_') : - # Redact names for items with duplicate amounts and last item in case the next - # was duplicated. We have to do this now because the ordering for these lists - # are undefined by the amounts are the same + # For top contributors or top spenders lists, items with + # the same amount can be the same except for the name. The ordering of these items + # with the same amount is undefined. By ignoring the name when comparing + # these lists, we hide the differences caused by the undefined ordering for + # items with the same amount. We ignore the name in this special case + # by redacting the name for items with duplicate amounts. + # Because the last item in the list has the potential to be a duplicate + # of the next item that did not make the list, we also always redact the name + # of the last item. 
last_item = None for item in data[key]: if 'name' in item: + # potentially redact the name if there's a name for an item if 'total_contributions' in item: + # for top contributors, this key is used for the amount amount_key = 'total_contributions' elif 'total_spending' in item: + # for top spenders, this key is used for the amount amount_key = 'total_spending' else: continue + # If there's a previous item, compare its amount with the + # current item and if they are the same, redact the name amount = item[amount_key] if last_item is not None: last_amount = last_item[amount_key] @@ -61,6 +75,7 @@ def redact(data): last_item['name'] = '***' item['name'] = '***' last_item = item + # always redact the name for the last item if (last_item is not None) and ('name' in last_item): last_item['name'] = '***' elif type(data[key]) == list: @@ -69,6 +84,11 @@ def redact(data): else: redact(data[key]) +def clean_data(data): + redact(data) + round_floats(data) + sort_arrays(data) + def collect_digests(digests, subdir, exclude=[]): filenames = os.listdir(subdir) for filename in filenames: @@ -82,9 +102,7 @@ def collect_digests(digests, subdir, exclude=[]): logging.info(filepath) data = json.load(fp) # clean data before generating digests - redact(data) - round_floats(data) - sort_arrays(data) + clean_data(data) # generate digests if type(data) == dict: for key in data: diff --git a/build/_data/stats.json b/build/_data/stats.json index 687aadd1b..d5ad591d1 100644 --- a/build/_data/stats.json +++ b/build/_data/stats.json @@ -1,3 +1,3 @@ { - "date_processed": "2023-11-08 08:07:22 +0000" + "date_processed": "2023-11-11 08:07:00 +0000" } From 53eda845bd8a67a70e4a4b53014819f9e77776c7 Mon Sep 17 00:00:00 2001 From: Chenglim Ear Date: Sun, 12 Nov 2023 00:24:21 -0800 Subject: [PATCH 05/12] try fixing path to create-digests to see if it runs under travis ci --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 07bc81c64..52348a7e5 100644 
--- a/Makefile +++ b/Makefile @@ -16,7 +16,7 @@ process: process.rb # todo: remove RUBYOPT variable when activerecord fixes deprecation warnings echo 'delete from calculations;'| psql $(DATABASE_NAME) rm -rf build && RUBYOPT="-W:no-deprecated -W:no-experimental" bundle exec ruby process.rb - bin/create-digests + ./bin/create-digests download-spreadsheets: downloads/csv/candidates.csv downloads/csv/committees.csv \ downloads/csv/referendums.csv downloads/csv/name_to_number.csv \ From e8cb2adca957a25b1b632aadcd629b104bdca429 Mon Sep 17 00:00:00 2001 From: Chenglim Ear Date: Sun, 12 Nov 2023 00:26:58 -0800 Subject: [PATCH 06/12] some debugging --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index 52348a7e5..b481fd06e 100644 --- a/Makefile +++ b/Makefile @@ -16,6 +16,7 @@ process: process.rb # todo: remove RUBYOPT variable when activerecord fixes deprecation warnings echo 'delete from calculations;'| psql $(DATABASE_NAME) rm -rf build && RUBYOPT="-W:no-deprecated -W:no-experimental" bundle exec ruby process.rb + ls -al /bin ./bin/create-digests download-spreadsheets: downloads/csv/candidates.csv downloads/csv/committees.csv \ From e100cc7c7eea0bd10d1024eaac4a3289b5ec01c3 Mon Sep 17 00:00:00 2001 From: Chenglim Ear Date: Sun, 12 Nov 2023 00:39:38 -0800 Subject: [PATCH 07/12] show diff for digests.json after generating it --- Makefile | 4 ++-- bin/{create-digests => create-digests.py} | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) rename bin/{create-digests => create-digests.py} (99%) diff --git a/Makefile b/Makefile index b481fd06e..9eb245b4f 100644 --- a/Makefile +++ b/Makefile @@ -16,8 +16,8 @@ process: process.rb # todo: remove RUBYOPT variable when activerecord fixes deprecation warnings echo 'delete from calculations;'| psql $(DATABASE_NAME) rm -rf build && RUBYOPT="-W:no-deprecated -W:no-experimental" bundle exec ruby process.rb - ls -al /bin - ./bin/create-digests + python bin/create-digests + git diff 
build/digests.json download-spreadsheets: downloads/csv/candidates.csv downloads/csv/committees.csv \ downloads/csv/referendums.csv downloads/csv/name_to_number.csv \ diff --git a/bin/create-digests b/bin/create-digests.py similarity index 99% rename from bin/create-digests rename to bin/create-digests.py index 2c300c30f..8afa10895 100644 --- a/bin/create-digests +++ b/bin/create-digests.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python - import os import json import hashlib From 31fb848d6627f696debb20e888f7a7e15459cd8f Mon Sep 17 00:00:00 2001 From: Chenglim Ear Date: Sun, 12 Nov 2023 09:46:03 -0800 Subject: [PATCH 08/12] fix name of file called in Makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 9eb245b4f..0de42df0c 100644 --- a/Makefile +++ b/Makefile @@ -16,7 +16,7 @@ process: process.rb # todo: remove RUBYOPT variable when activerecord fixes deprecation warnings echo 'delete from calculations;'| psql $(DATABASE_NAME) rm -rf build && RUBYOPT="-W:no-deprecated -W:no-experimental" bundle exec ruby process.rb - python bin/create-digests + python bin/create-digests.py git diff build/digests.json download-spreadsheets: downloads/csv/candidates.csv downloads/csv/committees.csv \ From 189fe27827fb2c9470b871ae079e037a35be2f87 Mon Sep 17 00:00:00 2001 From: Chenglim Ear Date: Sun, 12 Nov 2023 11:52:07 -0800 Subject: [PATCH 09/12] remove encoding param for logging --- bin/create-digests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/create-digests.py b/bin/create-digests.py index 8afa10895..323f7b847 100644 --- a/bin/create-digests.py +++ b/bin/create-digests.py @@ -3,7 +3,7 @@ import hashlib import logging -logging.basicConfig(encoding='utf-8', level=logging.INFO) +logging.basicConfig(level=logging.INFO) def round_floats(data): if type(data) == list: From b2fb864f80fff10a9e424419b41eef903d91ba3b Mon Sep 17 00:00:00 2001 From: Chenglim Ear Date: Sun, 12 Nov 2023 21:12:21 -0800 Subject: 
[PATCH 10/12] allow csvkit to pull in the correct agate dependencies and add script to trim whitespace for some columns --- Makefile | 3 ++- bin/remove-whitespace | 25 +++++++++++++++++++++++++ requirements.txt | 6 +----- 3 files changed, 28 insertions(+), 6 deletions(-) create mode 100644 bin/remove-whitespace diff --git a/Makefile b/Makefile index 5d0d90f99..c0080aa73 100644 --- a/Makefile +++ b/Makefile @@ -17,7 +17,7 @@ process: process.rb echo 'delete from calculations;'| psql $(DATABASE_NAME) rm -rf build && RUBYOPT="-W:no-deprecated -W:no-experimental" bundle exec ruby process.rb python bin/create-digests.py - git diff build/digests.json + git --no-pager diff build/digests.json download-spreadsheets: downloads/csv/candidates.csv downloads/csv/committees.csv \ downloads/csv/referendums.csv downloads/csv/name_to_number.csv \ @@ -79,6 +79,7 @@ do-import-spreadsheets: ./bin/create-table $(DATABASE_NAME) $(CSV_PATH) candidates csvsql --db postgresql:///$(DATABASE_NAME) --insert --no-create --no-inference $(CSV_PATH)/candidates.csv echo 'ALTER TABLE "candidates" ADD COLUMN id SERIAL PRIMARY KEY;' | psql $(DATABASE_NAME) + ./bin/remove-whitespace $(DATABASE_NAME) candidates Candidate echo 'DROP TABLE IF EXISTS referendums;' | psql $(DATABASE_NAME) ./bin/create-table $(DATABASE_NAME) $(CSV_PATH) referendums diff --git a/bin/remove-whitespace b/bin/remove-whitespace new file mode 100644 index 000000000..4717d1c9e --- /dev/null +++ b/bin/remove-whitespace @@ -0,0 +1,25 @@ +#!/bin/bash +# Usage: ./bin/remove-whitespace [database name] [table name] [column name] +# ./bin/remove-whitespace disclosure-backend candidates Candidate +set -euo pipefail + +if [ $# -ne 3 ]; then + echo 'Usage: ./bin/remove-whitespace [database name] [table name] [column name]' + exit 1 +fi + +database_name=$1 +table_name=$2 +column_name=$3 + +cat <<-QUERY | psql ${database_name} + \\set ON_ERROR_STOP on + UPDATE "$table_name" t SET "$column_name" = REGEXP_REPLACE("$column_name", '\s+$', ''); + 
+QUERY + +cat <<-QUERY | psql ${database_name} + \\set ON_ERROR_STOP on + UPDATE "$table_name" t SET "$column_name" = REGEXP_REPLACE("$column_name", '^\s+', ''); + +QUERY diff --git a/requirements.txt b/requirements.txt index 075265a3c..a2ad39df2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,3 @@ -agate==1.6.1 -agate-dbf==0.2.2 -agate-excel==0.2.2 -agate-sql==0.5.3 awesome-slugify==1.6.5 awscli>=1.16.89 Babel==2.9.1 @@ -21,6 +17,6 @@ pytimeparse==1.1.5 pytz==2016.10 regex==2016.12.27 six==1.10.0 -SQLAlchemy~=1.3.0 +SQLAlchemy~=2.0.23 Unidecode==0.4.19 xlrd==1.0.0 From 3c68082918a96f15d762f096c518c6154dde24ad Mon Sep 17 00:00:00 2001 From: Chenglim Ear Date: Sun, 12 Nov 2023 21:50:56 -0800 Subject: [PATCH 11/12] remove whitespace for some key columns --- Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile b/Makefile index c0080aa73..80f968815 100644 --- a/Makefile +++ b/Makefile @@ -80,11 +80,13 @@ do-import-spreadsheets: csvsql --db postgresql:///$(DATABASE_NAME) --insert --no-create --no-inference $(CSV_PATH)/candidates.csv echo 'ALTER TABLE "candidates" ADD COLUMN id SERIAL PRIMARY KEY;' | psql $(DATABASE_NAME) ./bin/remove-whitespace $(DATABASE_NAME) candidates Candidate + ./bin/remove-whitespace $(DATABASE_NAME) candidates Committee_Name echo 'DROP TABLE IF EXISTS referendums;' | psql $(DATABASE_NAME) ./bin/create-table $(DATABASE_NAME) $(CSV_PATH) referendums csvsql --db postgresql:///$(DATABASE_NAME) --insert --no-create --no-inference $(CSV_PATH)/referendums.csv echo 'ALTER TABLE "referendums" ADD COLUMN id SERIAL PRIMARY KEY;' | psql $(DATABASE_NAME) + ./bin/remove-whitespace $(DATABASE_NAME) referendums Short_Title echo 'DROP TABLE IF EXISTS name_to_number;' | psql $(DATABASE_NAME) ./bin/create-table $(DATABASE_NAME) $(CSV_PATH) name_to_number @@ -94,6 +96,7 @@ do-import-spreadsheets: ./bin/create-table $(DATABASE_NAME) $(CSV_PATH) committees csvsql --db postgresql:///$(DATABASE_NAME) --insert --no-create --no-inference 
$(CSV_PATH)/committees.csv echo 'ALTER TABLE "committees" ADD COLUMN id SERIAL PRIMARY KEY;' | psql $(DATABASE_NAME) + ./bin/remove-whitespace $(DATABASE_NAME) committees Filer_NamL echo 'DROP TABLE IF EXISTS office_elections;' | psql $(DATABASE_NAME) ./bin/create-table $(DATABASE_NAME) $(CSV_PATH) office_elections From 5288fbc590de93d8177c32d422173567baeb6bc7 Mon Sep 17 00:00:00 2001 From: Chenglim Ear Date: Mon, 13 Nov 2023 06:10:51 +0000 Subject: [PATCH 12/12] update build output --- build/_data/elections/oakland/2022-11-08.json | 2 +- build/_data/stats.json | 2 +- build/_data/totals.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/build/_data/elections/oakland/2022-11-08.json b/build/_data/elections/oakland/2022-11-08.json index 597fbd318..2b8601081 100644 --- a/build/_data/elections/oakland/2022-11-08.json +++ b/build/_data/elections/oakland/2022-11-08.json @@ -8,7 +8,7 @@ "contributions_by_type": { "PTY": 15900.0, "Committee": 995101.14, - "Individual": 2507910.870000002, + "Individual": 2507910.869999998, "Unitemized": 135828.3, "Self Funding": 2711.0, "Other (includes Businesses)": 2638453.6399999997 diff --git a/build/_data/stats.json b/build/_data/stats.json index d5ad591d1..76ec343ea 100644 --- a/build/_data/stats.json +++ b/build/_data/stats.json @@ -1,3 +1,3 @@ { - "date_processed": "2023-11-11 08:07:00 +0000" + "date_processed": "2023-11-12 08:07:02 +0000" } diff --git a/build/_data/totals.json b/build/_data/totals.json index accc946b9..d341766e6 100644 --- a/build/_data/totals.json +++ b/build/_data/totals.json @@ -788,7 +788,7 @@ "contributions_by_type": { "PTY": 15900.0, "Committee": 995101.14, - "Individual": 2507910.870000002, + "Individual": 2507910.869999998, "Unitemized": 135828.3, "Self Funding": 2711.0, "Other (includes Businesses)": 2638453.6399999997