Skip to content

Commit

Permalink
pipeline.sh improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
jhheider committed Oct 21, 2024
1 parent 2b7fd8d commit 671b751
Showing 1 changed file with 113 additions and 70 deletions.
183 changes: 113 additions & 70 deletions package_managers/homebrew/pipeline.sh
Original file line number Diff line number Diff line change
@@ -1,86 +1,129 @@
#!/bin/bash

# Homebrew Pipeline Script
# This script fetches, transforms, and loads Homebrew package data into a PostgreSQL database.

# Strict mode, spelled with long option names so each flag documents itself:
#   errexit  - abort as soon as a command exits with a non-zero status
#   xtrace   - print each command and its expanded arguments before running it
#   nounset  - expanding an unset variable is an error
#   pipefail - a pipeline's status is that of the last command in it that
#              failed (zero only if every command succeeded)
# NOTE(review): xtrace prints expanded arguments, so the value of
# "$CHAI_DATABASE_URL" appears in the trace on every psql call -- confirm it
# carries no credentials, or that the trace output is not retained.
set -o errexit -o xtrace -o nounset -o pipefail

# Run sql/homebrew_vars.sql against the database and keep psql's raw output in
# IDS: tuples only (no header/footer), unaligned, fields separated by '|'.
IDS=$(
  psql "$CHAI_DATABASE_URL" \
    --file=sql/homebrew_vars.sql \
    --tuples-only \
    --no-align \
    --field-separator='|'
)
#######################################
# Print a message to stdout prefixed with a "[YYYY-MM-DD HH:MM:SS]" timestamp
# taken from `date`.
# Arguments: $1 - message text (optional; when omitted an empty message is
#                 logged instead of aborting under `set -u`)
# Outputs:   one line on stdout
#######################################
log() {
  # printf with a fixed format prints the message literally, whatever it holds.
  printf '[%s] %s\n' "$(date +'%Y-%m-%d %H:%M:%S')" "${1-}"
}

# Split the '|'-separated line held in IDS into one variable per ID.
# The IFS override applies to this read only.
IFS='|' read -r \
  PACKAGE_MANAGER_ID \
  HOMEPAGE_URL_TYPE_ID \
  SOURCE_URL_TYPE_ID \
  BUILD_DEPENDS_ON_TYPE_ID \
  RUNTIME_DEPENDS_ON_TYPE_ID \
  RECOMMENDED_DEPENDS_ON_TYPE_ID \
  OPTIONAL_DEPENDS_ON_TYPE_ID \
  TEST_DEPENDS_ON_TYPE_ID \
  USES_FROM_MACOS_DEPENDS_ON_TYPE_ID \
  <<< "$IDS"
log "Starting Homebrew pipeline script"

# Mark every parsed ID for export so that child processes inherit it.
export \
  PACKAGE_MANAGER_ID \
  HOMEPAGE_URL_TYPE_ID \
  SOURCE_URL_TYPE_ID \
  BUILD_DEPENDS_ON_TYPE_ID \
  RUNTIME_DEPENDS_ON_TYPE_ID \
  RECOMMENDED_DEPENDS_ON_TYPE_ID \
  OPTIONAL_DEPENDS_ON_TYPE_ID \
  TEST_DEPENDS_ON_TYPE_ID \
  USES_FROM_MACOS_DEPENDS_ON_TYPE_ID
# Ask the database for the IDs this pipeline needs. psql prints tuples only,
# unaligned, with '|' between fields; that raw output is captured in IDS.
log "Fetching required IDs and URLs from the database"
IDS=$(
  psql "$CHAI_DATABASE_URL" \
    --file=sql/homebrew_vars.sql \
    --tuples-only --no-align --field-separator='|'
)

# Abort if any ID is empty. The names are checked in order and the first empty
# one ends the script with status 1.
for id_name in \
  PACKAGE_MANAGER_ID \
  HOMEPAGE_URL_TYPE_ID \
  SOURCE_URL_TYPE_ID \
  BUILD_DEPENDS_ON_TYPE_ID \
  RUNTIME_DEPENDS_ON_TYPE_ID \
  RECOMMENDED_DEPENDS_ON_TYPE_ID \
  OPTIONAL_DEPENDS_ON_TYPE_ID \
  TEST_DEPENDS_ON_TYPE_ID \
  USES_FROM_MACOS_DEPENDS_ON_TYPE_ID; do
  # ${!id_name} reads the variable whose name is stored in id_name.
  if [ -z "${!id_name}" ]; then
    echo "One or more IDs are empty. Exiting."
    exit 1
  fi
done
# Split the first line of IDS on '|' into the nine ID variables, in the order
# listed. The IFS override applies to this read only.
IFS='|' read -r \
  PACKAGE_MANAGER_ID HOMEPAGE_URL_TYPE_ID SOURCE_URL_TYPE_ID \
  BUILD_DEPENDS_ON_TYPE_ID RUNTIME_DEPENDS_ON_TYPE_ID \
  RECOMMENDED_DEPENDS_ON_TYPE_ID OPTIONAL_DEPENDS_ON_TYPE_ID \
  TEST_DEPENDS_ON_TYPE_ID USES_FROM_MACOS_DEPENDS_ON_TYPE_ID \
  <<< "$IDS"

# if you've already pulled the Homebrew data, you can `export FETCH=false` to skip the
# download, and just work off the latest symlink
# Names (not values) of the ID variables that must be non-empty before the
# pipeline may continue.
required_vars=(
  PACKAGE_MANAGER_ID HOMEPAGE_URL_TYPE_ID SOURCE_URL_TYPE_ID
  BUILD_DEPENDS_ON_TYPE_ID RUNTIME_DEPENDS_ON_TYPE_ID
  RECOMMENDED_DEPENDS_ON_TYPE_ID OPTIONAL_DEPENDS_ON_TYPE_ID
  TEST_DEPENDS_ON_TYPE_ID USES_FROM_MACOS_DEPENDS_ON_TYPE_ID
)

# > [!IMPORTANT]
# >
# > ONLY WORKS IF THE VOLUMES ARE MOUNTED
# Check every name listed in required_vars: if the variable it names is empty
# or unset, log an error and exit 1; otherwise export that variable.
for var in "${required_vars[@]}"; do
  # ${!var-} reads the variable named by $var; the '-' default stops `set -u`
  # from aborting on an unset variable before the error below can be logged.
  if [[ -z "${!var-}" ]]; then
    log "ERROR: Required variable $var is empty or unset. Exiting."
    exit 1
  fi
  # Exporting the variable *named by* $var is intended here.
  # shellcheck disable=SC2163
  export "$var"
done

# Data fetching and processing
# When FETCH is exactly the string "true": download $SOURCE into a timestamped
# directory under $DATA_DIR, point the "latest" symlink at that directory, and
# render one .sql file per recognised jq script. Any other value skips all of
# that and only logs a message.
# NOTE(review): FETCH, DATA_DIR, SOURCE and CODE_DIR are not assigned in the
# visible part of this script, so they presumably come from the environment;
# under `set -u` an unset one aborts the run -- confirm the caller sets them.
# NOTE(review): several steps below (timestamp + mkdir, download, symlink,
# transform loop) appear twice in slightly different forms; this looks like the
# removed and added sides of a diff shown together -- confirm which copy is
# meant to remain.
if [ "$FETCH" = true ]; then
# UTC timestamp used as the name of this run's directory.
NOW=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
mkdir -p "$DATA_DIR"/"$NOW"
log "Fetching new data from Homebrew"

# extract
# NOTE(review): with -s alone curl is silent, and without --fail an HTTP error
# response is still written to source.json with exit status 0 -- verify that
# this is acceptable.
curl -s "$SOURCE" > "$DATA_DIR"/"$NOW"/source.json
# Create timestamped directory for this run
NOW=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
mkdir -p "$DATA_DIR"/"$NOW"

# make a symlink called latest, pointing to $NOW
# The link target is the bare directory name, so it resolves relative to
# $DATA_DIR; -f/-n replace an existing "latest" link instead of descending
# into the directory it points to.
ln -sfn "$NOW" "$DATA_DIR"/latest
# Download source data
log "Downloading source data"
curl -s "$SOURCE" > "$DATA_DIR"/"$NOW"/source.json

# transform
# Each *.jq file in $CODE_DIR/jq is run against latest/source.json with the
# --arg values listed for its name; raw (-r) output goes to latest/<name>.sql.
for x in "$CODE_DIR"/jq/*.jq; do
# Script name without directory or ".jq" suffix; selects the branch below.
filename=$(basename "$x" .jq)
# use the formulas defined in the jq folder for each data model
if [ "$filename" = "packages" ]; then
jq -f "$x" -r --arg package_manager_id "$PACKAGE_MANAGER_ID" "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql
elif [ "$filename" = "urls" ]; then
jq -f "$x" -r \
--arg homepage_url_type_id "$HOMEPAGE_URL_TYPE_ID" \
--arg source_url_type_id "$SOURCE_URL_TYPE_ID" \
"$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql
elif [ "$filename" = "versions" ]; then
jq -f "$x" -r \
"$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql
elif [ "$filename" = "package_url" ]; then
jq -f "$x" -r \
--arg homepage_url_type_id "$HOMEPAGE_URL_TYPE_ID" \
--arg source_url_type_id "$SOURCE_URL_TYPE_ID" \
"$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql
elif [ "$filename" = "dependencies" ]; then
jq -f "$x" -r \
--arg build_deps_type_id "$BUILD_DEPENDS_ON_TYPE_ID" \
--arg runtime_deps_type_id "$RUNTIME_DEPENDS_ON_TYPE_ID" \
--arg recommended_deps_type_id "$RECOMMENDED_DEPENDS_ON_TYPE_ID" \
--arg optional_deps_type_id "$OPTIONAL_DEPENDS_ON_TYPE_ID" \
--arg test_deps_type_id "$TEST_DEPENDS_ON_TYPE_ID" \
--arg uses_from_macos_type_id "$USES_FROM_MACOS_DEPENDS_ON_TYPE_ID" \
"$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql
else
# Any other script name produces no output file.
echo "skipping $filename"
fi
done
# Update 'latest' symlink
ln -sfn "$NOW" "$DATA_DIR"/latest

# Transform data using jq scripts
# Same per-script dispatch as the if/elif chain above, written as a case
# statement and reporting progress through log.
log "Transforming data"
for x in "$CODE_DIR"/jq/*.jq; do
filename=$(basename "$x" .jq)
log "Processing $filename"
case "$filename" in
packages)
jq -f "$x" -r \
--arg package_manager_id "$PACKAGE_MANAGER_ID" \
"$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql
;;
urls)
jq -f "$x" -r \
--arg homepage_url_type_id "$HOMEPAGE_URL_TYPE_ID" \
--arg source_url_type_id "$SOURCE_URL_TYPE_ID" \
"$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql
;;
versions)
jq -f "$x" -r \
"$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql
;;
package_url)
jq -f "$x" -r \
--arg homepage_url_type_id "$HOMEPAGE_URL_TYPE_ID" \
--arg source_url_type_id "$SOURCE_URL_TYPE_ID" \
"$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql
;;
dependencies)
jq -f "$x" -r \
--arg build_deps_type_id "$BUILD_DEPENDS_ON_TYPE_ID" \
--arg runtime_deps_type_id "$RUNTIME_DEPENDS_ON_TYPE_ID" \
--arg recommended_deps_type_id "$RECOMMENDED_DEPENDS_ON_TYPE_ID" \
--arg optional_deps_type_id "$OPTIONAL_DEPENDS_ON_TYPE_ID" \
--arg test_deps_type_id "$TEST_DEPENDS_ON_TYPE_ID" \
--arg uses_from_macos_type_id "$USES_FROM_MACOS_DEPENDS_ON_TYPE_ID" \
"$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql
;;
*)
# Unrecognised script names are logged and produce no output file.
log "Skipping unknown file: $filename"
;;
esac
done
else
# Reached for every FETCH value other than "true", not only "false".
log "Skipping data fetch (FETCH=false)"
fi

# load - order matters
# Each generated file is handed to its own psql invocation, in exactly the
# order listed here.
for sql_name in packages urls versions package_url dependencies; do
  psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/"${sql_name}".sql
done
# Load data into database
# The five generated files are included (\i) in this order within a single
# psql session. ON_ERROR_STOP=1 makes psql stop at the first failing statement
# and exit non-zero; without it psql carries on after SQL errors and exits 0,
# so `set -e` would never fire and the success message below would be logged
# even after a failed load.
log "Loading data into database"
psql -v ON_ERROR_STOP=1 "$CHAI_DATABASE_URL" <<EOSQL
\i $DATA_DIR/latest/packages.sql
\i $DATA_DIR/latest/urls.sql
\i $DATA_DIR/latest/versions.sql
\i $DATA_DIR/latest/package_url.sql
\i $DATA_DIR/latest/dependencies.sql
EOSQL

log "Homebrew pipeline completed successfully"

0 comments on commit 671b751

Please sign in to comment.