From 671b751328b17d9188608c05a7c64454ff4c626f Mon Sep 17 00:00:00 2001 From: Jacob Heider Date: Mon, 21 Oct 2024 13:19:49 -0400 Subject: [PATCH] pipeline.sh improvements --- package_managers/homebrew/pipeline.sh | 183 ++++++++++++++++---------- 1 file changed, 113 insertions(+), 70 deletions(-) diff --git a/package_managers/homebrew/pipeline.sh b/package_managers/homebrew/pipeline.sh index b69dc25..6cf8487 100755 --- a/package_managers/homebrew/pipeline.sh +++ b/package_managers/homebrew/pipeline.sh @@ -1,86 +1,129 @@ #!/bin/bash +# Homebrew Pipeline Script +# This script fetches, transforms, and loads Homebrew package data into a PostgreSQL database. + +# Set bash options: +# -e: Exit immediately if a command exits with a non-zero status. +# -x: Print commands and their arguments as they are executed. +# -u: Treat unset variables as an error when substituting. +# -o pipefail: Return value of a pipeline is the status of the last command to exit with a non-zero status. set -exuo pipefail -# get all the required IDs and URLs from the database -IDS=$(psql "$CHAI_DATABASE_URL" -f sql/homebrew_vars.sql -t -A -F'|') +# Function to log messages with timestamps +log() { + echo "[$(date +'%Y-%m-%d %H:%M:%S')] $1" +} -# Parse the results and export variables -IFS='|' read -r PACKAGE_MANAGER_ID HOMEPAGE_URL_TYPE_ID SOURCE_URL_TYPE_ID \ - BUILD_DEPENDS_ON_TYPE_ID RUNTIME_DEPENDS_ON_TYPE_ID \ - RECOMMENDED_DEPENDS_ON_TYPE_ID OPTIONAL_DEPENDS_ON_TYPE_ID \ - TEST_DEPENDS_ON_TYPE_ID USES_FROM_MACOS_DEPENDS_ON_TYPE_ID <<< "$IDS" +log "Starting Homebrew pipeline script" -export PACKAGE_MANAGER_ID -export HOMEPAGE_URL_TYPE_ID -export SOURCE_URL_TYPE_ID -export BUILD_DEPENDS_ON_TYPE_ID -export RUNTIME_DEPENDS_ON_TYPE_ID -export RECOMMENDED_DEPENDS_ON_TYPE_ID -export OPTIONAL_DEPENDS_ON_TYPE_ID -export TEST_DEPENDS_ON_TYPE_ID -export USES_FROM_MACOS_DEPENDS_ON_TYPE_ID +# Fetch required IDs and URLs from the database +log "Fetching required IDs and URLs from the database" +IDS=$(psql "$CHAI_DATABASE_URL" -f sql/homebrew_vars.sql -t -A -F'|') -# if any of the IDs are empty, exit -if [ -z "$PACKAGE_MANAGER_ID" ] || [ -z "$HOMEPAGE_URL_TYPE_ID" ] || [ -z "$SOURCE_URL_TYPE_ID" ] || [ -z "$BUILD_DEPENDS_ON_TYPE_ID" ] || [ -z "$RUNTIME_DEPENDS_ON_TYPE_ID" ] || [ -z "$RECOMMENDED_DEPENDS_ON_TYPE_ID" ] || [ -z "$OPTIONAL_DEPENDS_ON_TYPE_ID" ] || [ -z "$TEST_DEPENDS_ON_TYPE_ID" ] || [ -z "$USES_FROM_MACOS_DEPENDS_ON_TYPE_ID" ]; then - echo "One or more IDs are empty. Exiting." - exit 1 -fi +# Parse the results +IFS='|' read -r \ + PACKAGE_MANAGER_ID \ + HOMEPAGE_URL_TYPE_ID \ + SOURCE_URL_TYPE_ID \ + BUILD_DEPENDS_ON_TYPE_ID \ + RUNTIME_DEPENDS_ON_TYPE_ID \ + RECOMMENDED_DEPENDS_ON_TYPE_ID \ + OPTIONAL_DEPENDS_ON_TYPE_ID \ + TEST_DEPENDS_ON_TYPE_ID \ + USES_FROM_MACOS_DEPENDS_ON_TYPE_ID <<< "$IDS" -# if you've already pulled the Homebrew data, you can `export FETCH=false` to skip the -# download, and just work off the latest symlink +# Validate that all required IDs are present and export them +required_vars=( + PACKAGE_MANAGER_ID + HOMEPAGE_URL_TYPE_ID + SOURCE_URL_TYPE_ID + BUILD_DEPENDS_ON_TYPE_ID + RUNTIME_DEPENDS_ON_TYPE_ID + RECOMMENDED_DEPENDS_ON_TYPE_ID + OPTIONAL_DEPENDS_ON_TYPE_ID + TEST_DEPENDS_ON_TYPE_ID + USES_FROM_MACOS_DEPENDS_ON_TYPE_ID +) -# > [!IMPORTANT] -# > -# > ONLY WORKS IF THE VOLUMES ARE MOUNTED +for var in "${required_vars[@]}"; do + if [ -z "${!var}" ]; then + log "ERROR: Required variable $var is empty or unset. Exiting." + exit 1 + fi + # shellcheck disable=SC2163 + export "$var" +done +# Data fetching and processing if [ "$FETCH" = true ]; then - NOW=$(date -u +"%Y-%m-%dT%H:%M:%SZ") - mkdir -p "$DATA_DIR"/"$NOW" + log "Fetching new data from Homebrew" - # extract - curl -s "$SOURCE" > "$DATA_DIR"/"$NOW"/source.json + # Create timestamped directory for this run + NOW=$(date -u +"%Y-%m-%dT%H:%M:%SZ") + mkdir -p "$DATA_DIR"/"$NOW" - # make a symlink called latest, pointing to $NOW - ln -sfn "$NOW" "$DATA_DIR"/latest + # Download source data + log "Downloading source data" + curl -s "$SOURCE" > "$DATA_DIR"/"$NOW"/source.json - # transform - for x in "$CODE_DIR"/jq/*.jq; do - filename=$(basename "$x" .jq) - # use the formulas defined in the jq folder for each data model - if [ "$filename" = "packages" ]; then - jq -f "$x" -r --arg package_manager_id "$PACKAGE_MANAGER_ID" "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql - elif [ "$filename" = "urls" ]; then - jq -f "$x" -r \ - --arg homepage_url_type_id "$HOMEPAGE_URL_TYPE_ID" \ - --arg source_url_type_id "$SOURCE_URL_TYPE_ID" \ - "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql - elif [ "$filename" = "versions" ]; then - jq -f "$x" -r \ - "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql - elif [ "$filename" = "package_url" ]; then - jq -f "$x" -r \ - --arg homepage_url_type_id "$HOMEPAGE_URL_TYPE_ID" \ - --arg source_url_type_id "$SOURCE_URL_TYPE_ID" \ - "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql - elif [ "$filename" = "dependencies" ]; then - jq -f "$x" -r \ - --arg build_deps_type_id "$BUILD_DEPENDS_ON_TYPE_ID" \ - --arg runtime_deps_type_id "$RUNTIME_DEPENDS_ON_TYPE_ID" \ - --arg recommended_deps_type_id "$RECOMMENDED_DEPENDS_ON_TYPE_ID" \ - --arg optional_deps_type_id "$OPTIONAL_DEPENDS_ON_TYPE_ID" \ - --arg test_deps_type_id "$TEST_DEPENDS_ON_TYPE_ID" \ - --arg uses_from_macos_type_id "$USES_FROM_MACOS_DEPENDS_ON_TYPE_ID" \ - "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql - else - echo "skipping $filename" - fi - done + # Update 'latest' symlink + ln -sfn "$NOW" "$DATA_DIR"/latest + + # Transform data using jq scripts + log "Transforming data" + for x in "$CODE_DIR"/jq/*.jq; do + filename=$(basename "$x" .jq) + log "Processing $filename" + case "$filename" in + packages) + jq -f "$x" -r \ + --arg package_manager_id "$PACKAGE_MANAGER_ID" \ + "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql + ;; + urls) + jq -f "$x" -r \ + --arg homepage_url_type_id "$HOMEPAGE_URL_TYPE_ID" \ + --arg source_url_type_id "$SOURCE_URL_TYPE_ID" \ + "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql + ;; + versions) + jq -f "$x" -r \ + "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql + ;; + package_url) + jq -f "$x" -r \ + --arg homepage_url_type_id "$HOMEPAGE_URL_TYPE_ID" \ + --arg source_url_type_id "$SOURCE_URL_TYPE_ID" \ + "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql + ;; + dependencies) + jq -f "$x" -r \ + --arg build_deps_type_id "$BUILD_DEPENDS_ON_TYPE_ID" \ + --arg runtime_deps_type_id "$RUNTIME_DEPENDS_ON_TYPE_ID" \ + --arg recommended_deps_type_id "$RECOMMENDED_DEPENDS_ON_TYPE_ID" \ + --arg optional_deps_type_id "$OPTIONAL_DEPENDS_ON_TYPE_ID" \ + --arg test_deps_type_id "$TEST_DEPENDS_ON_TYPE_ID" \ + --arg uses_from_macos_type_id "$USES_FROM_MACOS_DEPENDS_ON_TYPE_ID" \ + "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql + ;; + *) + log "Skipping unknown file: $filename" + ;; + esac + done +else + log "Skipping data fetch (FETCH=false)" fi -# load - order matters -psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/packages.sql -psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/urls.sql -psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/versions.sql -psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/package_url.sql -psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/dependencies.sql +# Load data into database +log "Loading data into database" +psql "$CHAI_DATABASE_URL" <