Merge branch 'upgrade-csvkit' into add-workflow-to-gen-data

caciviclab · Nov 13, 2023 · fb33f7b · fb33f7b
2 parents a1bfe9a + 05cee7c
commit fb33f7b
Show file tree

Hide file tree

Showing 6 changed files with 70 additions and 13 deletions.
diff --git a/Makefile b/Makefile
@@ -17,6 +17,7 @@ process: process.rb
 	echo 'delete from calculations;'| psql $(DATABASE_NAME)
 	rm -rf build && RUBYOPT="-W:no-deprecated -W:no-experimental" bundle exec ruby process.rb
 	python bin/create-digests.py
+	git --no-pager diff build/digests.json
 
 download-netfile-v2: 
 	python download/main.py
@@ -82,11 +83,14 @@ do-import-spreadsheets:
 	./bin/create-table $(DATABASE_NAME) $(CSV_PATH) candidates
 	csvsql --db postgresql:///$(DATABASE_NAME) --insert --no-create --no-inference $(CSV_PATH)/candidates.csv
 	echo 'ALTER TABLE "candidates" ADD COLUMN id SERIAL PRIMARY KEY;' | psql $(DATABASE_NAME)
+	./bin/remove-whitespace $(DATABASE_NAME) candidates Candidate
+	./bin/remove-whitespace $(DATABASE_NAME) candidates Committee_Name
 
 	echo 'DROP TABLE IF EXISTS referendums;' | psql $(DATABASE_NAME)
 	./bin/create-table $(DATABASE_NAME) $(CSV_PATH) referendums
 	csvsql --db postgresql:///$(DATABASE_NAME) --insert --no-create --no-inference $(CSV_PATH)/referendums.csv
 	echo 'ALTER TABLE "referendums" ADD COLUMN id SERIAL PRIMARY KEY;' | psql $(DATABASE_NAME)
+	./bin/remove-whitespace $(DATABASE_NAME) referendums Short_Title
 
 	echo 'DROP TABLE IF EXISTS name_to_number;' | psql $(DATABASE_NAME)
 	./bin/create-table $(DATABASE_NAME) $(CSV_PATH) name_to_number
@@ -96,6 +100,7 @@ do-import-spreadsheets:
 	./bin/create-table $(DATABASE_NAME) $(CSV_PATH) committees
 	csvsql --db postgresql:///$(DATABASE_NAME) --insert --no-create --no-inference $(CSV_PATH)/committees.csv
 	echo 'ALTER TABLE "committees" ADD COLUMN id SERIAL PRIMARY KEY;' | psql $(DATABASE_NAME)
+	./bin/remove-whitespace $(DATABASE_NAME) committees Filer_NamL
 
 	echo 'DROP TABLE IF EXISTS office_elections;' | psql $(DATABASE_NAME)
 	./bin/create-table $(DATABASE_NAME) $(CSV_PATH) office_elections

diff --git a/README.md b/README.md
@@ -75,6 +75,21 @@ If you want to serve the static JSON files via a local web server:
     $ make run
 
 ## Developing
+
+### Checking output data changes
+
+This repository is used to generate data files that are used by the website.  After `make process` is run, a `build` directory is generated containing the data files.  This directory is checked in to the repository and later checked out when generating the website.  After making code changes, it is important to compare the newly generated `build` directory against the `build` directory generated before the code changes and verify that the differences are the ones the code changes were expected to produce.
+
+Because a strict comparison of all contents of the `build` directory will always include changes that occur independent of any code change, every developer would have to know about these expected changes in order to perform this check.  To remove the need for this, a specific file, `build/digests.json`, is generated containing digests for the JSON data in the `build` directory after excluding these expected changes.  To look for changes other than the expected ones, simply look for a change in the `build/digests.json` file.
+
+Currently, these are the expected changes that occur independent of any code change:
+* timestamps change for each run
+* top contributors lists contain undefined ordering of contributors with the same contribution
+* top spenders lists contain undefined ordering of spenders with the same spending
+* rounding differences for floats
+
+The expected changes are excluded before generating digests for data in the `build` directory.  The logic for this can be found in the function `clean_data`, found in the file `bin/create-digests.py`.  After the code is modified such that an expected change no longer exists, the exclusion of that change can be removed from `clean_data`.  For example, the rounding of floats is not consistently the same each time `make process` is run, due to differences in the environment.  When the code is fixed so that the rounding of floats is the same as long as the data hasn't changed, the `round_floats` call in `clean_data` can be removed.
+
 ### Adding a calculator
 
 Each metric about a candidate is calculated independently. A metric might be

diff --git a/bin/create-digests.py b/bin/create-digests.py
@@ -3,7 +3,7 @@
 import hashlib
 import logging
 
-logging.basicConfig(encoding='utf-8', level=logging.INFO)
+logging.basicConfig(level=logging.INFO)
 
 def round_floats(data):
     if type(data) == list:
@@ -37,30 +37,43 @@ def sort_arrays(data):
 def redact(data):
     if type(data) == dict:
         if 'date_processed' in data:
+            # redact timestamps
             data['date_processed'] = '***'
         else:
             for key in data.keys():
                 if key.startswith('top_') :
-                    # Redact names for items with duplicate amounts and last item in case the next
-                    # was duplicated.  We have to do this now because the ordering for these lists
-                    # are undefined by the amounts are the same
+                    # For top contributors or top spenders lists, items with
+                    # the same amount can be the same except for the name.  The ordering of these items
+                    # with the same amount are undefined.  By ignoring the name when comparing
+                    # these lists, we hide the differences caused by the undefined ordering for
+                    # items with the same amount.  We ignore the name in this special case
+                    # by redacting the name for items with duplicate amounts.
+                    # Because the last item in the list has the potential to be a duplicate 
+                    # of the next item that did not make the list, we also always redact the name
+                    # of the last item.
                     last_item = None
                     for item in data[key]:
                         if 'name' in item:
+                            # potentially redact the name if there's a name for an item
                             if 'total_contributions' in item:
+                                # for top contributors, this key is used for the amount
                                 amount_key = 'total_contributions'
                             elif 'total_spending' in item:
+                                # for top spenders, this key is used for the amount
                                 amount_key = 'total_spending'
                             else:
                                 continue
 
+                            # If there's a previous item, compare its amount with the
+                            # current item and if they are the same, redact the name
                             amount = item[amount_key]
                             if last_item is not None:
                                 last_amount = last_item[amount_key]
                                 if amount == last_amount:
                                     last_item['name'] = '***'
                                     item['name'] = '***'
                         last_item = item
+                    # always redact the name for the last item
                     if (last_item is not None) and ('name' in last_item):
                         last_item['name'] = '***'
                 elif type(data[key]) == list:
@@ -69,6 +82,11 @@ def redact(data):
                 else:
                     redact(data[key])
 
+def clean_data(data):
+    redact(data)
+    round_floats(data)
+    sort_arrays(data)
+
 def collect_digests(digests, subdir, exclude=[]):
     filenames = os.listdir(subdir)
     for filename in filenames:
@@ -82,9 +100,7 @@ def collect_digests(digests, subdir, exclude=[]):
                 logging.info(filepath)
                 data = json.load(fp)
                 # clean data before generating digests
-                redact(data)
-                round_floats(data)
-                sort_arrays(data)
+                clean_data(data)
                 # generate digests
                 if type(data) == dict:
                     for key in data:

diff --git a/bin/remove-whitespace b/bin/remove-whitespace
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Usage: ./bin/remove-whitespace [database name] [table name] [column name]
+#        ./bin/remove-whitespace disclosure-backend candidates Candidate
+set -euo pipefail
+
+if [ $# -ne 3 ]; then
+  echo 'Usage: ./bin/remove-whitespace [database name] [table name] [column name]'
+  exit 1
+fi
+
+database_name=$1
+table_name=$2
+column_name=$3
+
+cat <<-QUERY | psql ${database_name}
+  \\set ON_ERROR_STOP on
+  UPDATE "$table_name" t SET "$column_name" = REGEXP_REPLACE("$column_name", '\s+$', '');
+
+QUERY
+
+cat <<-QUERY | psql ${database_name}
+  \\set ON_ERROR_STOP on
+  UPDATE "$table_name" t SET "$column_name" = REGEXP_REPLACE("$column_name", '^\s+', '');
+
+QUERY
diff --git a/build/_data/stats.json b/build/_data/stats.json
@@ -1,3 +1,3 @@
 {
-  "date_processed": "2023-11-08 08:07:22 +0000"
+  "date_processed": "2023-11-12 08:07:02 +0000"
 }
diff --git a/requirements.txt b/requirements.txt
@@ -1,7 +1,3 @@
-agate==1.6.1
-agate-dbf==0.2.2
-agate-excel==0.2.2
-agate-sql==0.5.3
 awesome-slugify==1.6.5
 awscli>=1.16.89
 Babel==2.13.1
@@ -23,6 +19,6 @@ pytimeparse==1.1.5
 pytz==2020.1
 regex==2016.12.27
 six==1.10.0
-SQLAlchemy~=1.3.0
+SQLAlchemy~=2.0.23
 Unidecode==0.4.19
 xlrd==1.0.0