diff --git a/Makefile b/Makefile
index afab8fd83..72c25a8ec 100644
--- a/Makefile
+++ b/Makefile
@@ -17,6 +17,7 @@ process: process.rb
 	echo 'delete from calculations;'| psql $(DATABASE_NAME)
 	rm -rf build && RUBYOPT="-W:no-deprecated -W:no-experimental" bundle exec ruby process.rb
 	python bin/create-digests.py
+	git --no-pager diff build/digests.json

 download-netfile-v2:
 	python download/main.py
@@ -82,11 +83,14 @@ do-import-spreadsheets:
 	./bin/create-table $(DATABASE_NAME) $(CSV_PATH) candidates
 	csvsql --db postgresql:///$(DATABASE_NAME) --insert --no-create --no-inference $(CSV_PATH)/candidates.csv
 	echo 'ALTER TABLE "candidates" ADD COLUMN id SERIAL PRIMARY KEY;' | psql $(DATABASE_NAME)
+	./bin/remove-whitespace $(DATABASE_NAME) candidates Candidate
+	./bin/remove-whitespace $(DATABASE_NAME) candidates Committee_Name

 	echo 'DROP TABLE IF EXISTS referendums;' | psql $(DATABASE_NAME)
 	./bin/create-table $(DATABASE_NAME) $(CSV_PATH) referendums
 	csvsql --db postgresql:///$(DATABASE_NAME) --insert --no-create --no-inference $(CSV_PATH)/referendums.csv
 	echo 'ALTER TABLE "referendums" ADD COLUMN id SERIAL PRIMARY KEY;' | psql $(DATABASE_NAME)
+	./bin/remove-whitespace $(DATABASE_NAME) referendums Short_Title

 	echo 'DROP TABLE IF EXISTS name_to_number;' | psql $(DATABASE_NAME)
 	./bin/create-table $(DATABASE_NAME) $(CSV_PATH) name_to_number
@@ -96,6 +100,7 @@ do-import-spreadsheets:
 	./bin/create-table $(DATABASE_NAME) $(CSV_PATH) committees
 	csvsql --db postgresql:///$(DATABASE_NAME) --insert --no-create --no-inference $(CSV_PATH)/committees.csv
 	echo 'ALTER TABLE "committees" ADD COLUMN id SERIAL PRIMARY KEY;' | psql $(DATABASE_NAME)
+	./bin/remove-whitespace $(DATABASE_NAME) committees Filer_NamL

 	echo 'DROP TABLE IF EXISTS office_elections;' | psql $(DATABASE_NAME)
 	./bin/create-table $(DATABASE_NAME) $(CSV_PATH) office_elections
diff --git a/README.md b/README.md
index 61dfde294..1a7ac3e91 100644
--- a/README.md
+++ b/README.md
@@ -75,6 +75,21 @@ If you want to serve the static JSON files via a local web server:
     $ make run

 ## Developing
+
+### Checking output data changes
+
+This repository is used to generate the data files that are used by the website. After `make process` is run, a `build` directory is generated containing those data files. This directory is checked in to the repository and later checked out when the website is generated. After making code changes, it is important to compare the newly generated `build` directory against the one generated before the changes and verify that any differences are expected.
+
+Because a strict comparison of all contents of the `build` directory will always include changes that occur independently of any code change, every developer would have to know about these expected changes in order to perform this check. To remove that need, a dedicated file, `build/digests.json`, contains digests generated from the JSON data in the `build` directory after these expected changes have been excluded. To check whether a code change altered the output data, simply look for a change in `build/digests.json`.
+
+Currently, these are the expected changes that occur independently of any code change:
+* timestamps change on every run
+* top contributor lists have undefined ordering for contributors with the same contribution amount
+* top spender lists have undefined ordering for spenders with the same spending amount
+* floats are subject to rounding differences
+
+The expected changes are excluded before the digests are generated for the data in the `build` directory. The logic for this lives in the `clean_data` function in `bin/create-digests.py`. Once the code is changed so that an expected difference no longer occurs, the corresponding exclusion can be removed from `clean_data`. For example, float rounding is not consistent across runs of `make process` due to differences in the environment; when the code is fixed so that rounding is stable whenever the data hasn't changed, the `round_floats` call in `clean_data` can be removed.
+
 ### Adding a calculator

 Each metric about a candidate is calculated independently. A metric might be
diff --git a/bin/create-digests.py b/bin/create-digests.py
index a4b3c50c8..323f7b847 100644
--- a/bin/create-digests.py
+++ b/bin/create-digests.py
@@ -3,7 +3,7 @@
 import hashlib
 import logging

-logging.basicConfig(encoding='utf-8', level=logging.INFO)
+logging.basicConfig(level=logging.INFO)

 def round_floats(data):
     if type(data) == list:
@@ -37,23 +37,35 @@ def sort_arrays(data):
 def redact(data):
     if type(data) == dict:
         if 'date_processed' in data:
+            # redact timestamps
             data['date_processed'] = '***'
         else:
             for key in data.keys():
                 if key.startswith('top_') :
-                    # Redact names for items with duplicate amounts and last item in case the next
-                    # was duplicated. We have to do this now because the ordering for these lists
-                    # are undefined by the amounts are the same
+                    # In top contributor and top spender lists, items with the
+                    # same amount may differ only in the name, and the ordering
+                    # of items with the same amount is undefined. By ignoring
+                    # the name when comparing these lists, we hide differences
+                    # caused by that undefined ordering. We ignore the name in
+                    # this special case by redacting it for items with
+                    # duplicate amounts. Because the last item in the list may
+                    # be a duplicate of the next item that did not make the
+                    # list, we also always redact the name of the last item.
                     last_item = None
                     for item in data[key]:
                         if 'name' in item:
+                            # potentially redact the name if the item has one
                             if 'total_contributions' in item:
+                                # for top contributors, this key is used for the amount
                                 amount_key = 'total_contributions'
                             elif 'total_spending' in item:
+                                # for top spenders, this key is used for the amount
                                 amount_key = 'total_spending'
                             else:
                                 continue

+                            # If there's a previous item, compare its amount with the
+                            # current item's; if they match, redact both names
                             amount = item[amount_key]
                             if last_item is not None:
                                 last_amount = last_item[amount_key]
@@ -61,6 +73,7 @@
                                     last_item['name'] = '***'
                                 item['name'] = '***'
                             last_item = item
+                    # always redact the name for the last item
                     if (last_item is not None) and ('name' in last_item):
                         last_item['name'] = '***'
                 elif type(data[key]) == list:
@@ -69,6 +82,11 @@
             else:
                 redact(data[key])

+def clean_data(data):
+    redact(data)
+    round_floats(data)
+    sort_arrays(data)
+
 def collect_digests(digests, subdir, exclude=[]):
     filenames = os.listdir(subdir)
     for filename in filenames:
@@ -82,9 +100,7 @@ def collect_digests(digests, subdir, exclude=[]):
             logging.info(filepath)
             data = json.load(fp)
             # clean data before generating digests
-            redact(data)
-            round_floats(data)
-            sort_arrays(data)
+            clean_data(data)
             # generate digests
             if type(data) == dict:
                 for key in data:
diff --git a/bin/remove-whitespace b/bin/remove-whitespace
new file mode 100644
index 000000000..4717d1c9e
--- /dev/null
+++ b/bin/remove-whitespace
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Usage: ./bin/remove-whitespace [database name] [table name] [column name]
+#        ./bin/remove-whitespace disclosure-backend candidates Candidate
+set -euo pipefail
+
+if [ $# -ne 3 ]; then
+  echo 'Usage: ./bin/remove-whitespace [database name] [table name] [column name]'
+  exit 1
+fi
+
+database_name=$1
+table_name=$2
+column_name=$3
+
+cat <<-QUERY | psql ${database_name}
+	\\set ON_ERROR_STOP on
+	UPDATE "$table_name" t SET "$column_name" = REGEXP_REPLACE("$column_name", '\s+$', '');
+
+QUERY
+
+cat <<-QUERY | psql ${database_name}
+	\\set ON_ERROR_STOP on
+	UPDATE "$table_name" t SET "$column_name" = REGEXP_REPLACE("$column_name", '^\s+', '');
+
+QUERY
diff --git a/build/_data/stats.json b/build/_data/stats.json
index 687aadd1b..76ec343ea 100644
--- a/build/_data/stats.json
+++ b/build/_data/stats.json
@@ -1,3 +1,3 @@
 {
-  "date_processed": "2023-11-08 08:07:22 +0000"
+  "date_processed": "2023-11-12 08:07:02 +0000"
 }
diff --git a/requirements.txt b/requirements.txt
index dccfb036d..2c2e5674a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,3 @@
-agate==1.6.1
-agate-dbf==0.2.2
-agate-excel==0.2.2
-agate-sql==0.5.3
 awesome-slugify==1.6.5
 awscli>=1.16.89
 Babel==2.13.1
@@ -23,6 +19,6 @@ pytimeparse==1.1.5
 pytz==2020.1
 regex==2016.12.27
 six==1.10.0
-SQLAlchemy~=1.3.0
+SQLAlchemy~=2.0.23
 Unidecode==0.4.19
 xlrd==1.0.0
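As a reference for the digest approach described in the README hunk above, here is a minimal sketch: normalize away the expected run-to-run differences, then hash the canonical JSON. The `clean()` helper, the SHA-256 choice, and the `sort_keys` canonicalization are illustrative assumptions only; the real `clean_data()` and digest layout live in `bin/create-digests.py`.

```python
# Minimal sketch of the digest idea, not the code in bin/create-digests.py.
import hashlib
import json


def clean(data):
    """Return a copy with expected, code-independent differences removed."""
    if isinstance(data, dict):
        return {
            # timestamps change on every run, so blank them out
            key: '***' if key == 'date_processed' else clean(value)
            for key, value in data.items()
        }
    if isinstance(data, list):
        return [clean(item) for item in data]
    if isinstance(data, float):
        # hide environment-dependent rounding differences
        return round(data, 2)
    return data


def digest(filepath):
    """Digest one build JSON file after cleaning it."""
    with open(filepath) as fp:
        data = json.load(fp)
    canonical = json.dumps(clean(data), sort_keys=True).encode('utf-8')
    return hashlib.sha256(canonical).hexdigest()
```

With digests computed along these lines and checked in, the `git --no-pager diff build/digests.json` step added to `make process` above is enough to show whether a code change altered the output data.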
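A small worked example of the name-redaction rule for `top_*` lists that the comments above describe (the item shapes here are assumed for illustration; the actual rule is `redact()` in `bin/create-digests.py`): items that tie on the amount have their names redacted, and the last item's name is always redacted because the next item that missed the cut could tie with it.

```python
# Illustration of the redaction rule for top_* lists (assumed data shapes).
top_contributors = [
    {'name': 'Alice', 'total_contributions': 500.0},
    {'name': 'Bob', 'total_contributions': 300.0},    # ties with Carol
    {'name': 'Carol', 'total_contributions': 300.0},  # ties with Bob
    {'name': 'Dan', 'total_contributions': 100.0},    # last item in the list
]

# After redaction, the digest no longer depends on how ties happen to be ordered:
# [
#     {'name': 'Alice', 'total_contributions': 500.0},
#     {'name': '***',   'total_contributions': 300.0},
#     {'name': '***',   'total_contributions': 300.0},
#     {'name': '***',   'total_contributions': 100.0},  # last name always redacted
# ]
```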