-
Notifications
You must be signed in to change notification settings - Fork 71
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
113 additions
and
70 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,86 +1,129 @@ | ||
#!/bin/bash | ||
|
||
# Homebrew Pipeline Script | ||
# This script fetches, transforms, and loads Homebrew package data into a PostgreSQL database. | ||
|
||
# Set bash options: | ||
# -e: Exit immediately if a command exits with a non-zero status. | ||
# -x: Print commands and their expanded arguments as they are executed (note: this includes the value of $CHAI_DATABASE_URL passed to psql). | ||
# -u: Treat unset variables as an error when substituting. | ||
# -o pipefail: A pipeline returns the status of the last (rightmost) command that exited with a non-zero status, or zero if every command succeeded. | ||
set -exuo pipefail | ||
|
||
# get all the required IDs and URLs from the database | ||
IDS=$(psql "$CHAI_DATABASE_URL" -f sql/homebrew_vars.sql -t -A -F'|') | ||
# Function to log messages with timestamps | ||
log() { | ||
echo "[$(date +'%Y-%m-%d %H:%M:%S')] $1" | ||
} | ||
|
||
# Parse the results and export variables | ||
IFS='|' read -r PACKAGE_MANAGER_ID HOMEPAGE_URL_TYPE_ID SOURCE_URL_TYPE_ID \ | ||
BUILD_DEPENDS_ON_TYPE_ID RUNTIME_DEPENDS_ON_TYPE_ID \ | ||
RECOMMENDED_DEPENDS_ON_TYPE_ID OPTIONAL_DEPENDS_ON_TYPE_ID \ | ||
TEST_DEPENDS_ON_TYPE_ID USES_FROM_MACOS_DEPENDS_ON_TYPE_ID <<< "$IDS" | ||
log "Starting Homebrew pipeline script" | ||
|
||
export PACKAGE_MANAGER_ID | ||
export HOMEPAGE_URL_TYPE_ID | ||
export SOURCE_URL_TYPE_ID | ||
export BUILD_DEPENDS_ON_TYPE_ID | ||
export RUNTIME_DEPENDS_ON_TYPE_ID | ||
export RECOMMENDED_DEPENDS_ON_TYPE_ID | ||
export OPTIONAL_DEPENDS_ON_TYPE_ID | ||
export TEST_DEPENDS_ON_TYPE_ID | ||
export USES_FROM_MACOS_DEPENDS_ON_TYPE_ID | ||
# Fetch required IDs and URLs from the database | ||
log "Fetching required IDs and URLs from the database" | ||
IDS=$(psql "$CHAI_DATABASE_URL" -f sql/homebrew_vars.sql -t -A -F'|') | ||
|
||
# if any of the IDs are empty, exit | ||
if [ -z "$PACKAGE_MANAGER_ID" ] || [ -z "$HOMEPAGE_URL_TYPE_ID" ] || [ -z "$SOURCE_URL_TYPE_ID" ] || [ -z "$BUILD_DEPENDS_ON_TYPE_ID" ] || [ -z "$RUNTIME_DEPENDS_ON_TYPE_ID" ] || [ -z "$RECOMMENDED_DEPENDS_ON_TYPE_ID" ] || [ -z "$OPTIONAL_DEPENDS_ON_TYPE_ID" ] || [ -z "$TEST_DEPENDS_ON_TYPE_ID" ] || [ -z "$USES_FROM_MACOS_DEPENDS_ON_TYPE_ID" ]; then | ||
echo "One or more IDs are empty. Exiting." | ||
exit 1 | ||
fi | ||
# Parse the results | ||
IFS='|' read -r \ | ||
PACKAGE_MANAGER_ID \ | ||
HOMEPAGE_URL_TYPE_ID \ | ||
SOURCE_URL_TYPE_ID \ | ||
BUILD_DEPENDS_ON_TYPE_ID \ | ||
RUNTIME_DEPENDS_ON_TYPE_ID \ | ||
RECOMMENDED_DEPENDS_ON_TYPE_ID \ | ||
OPTIONAL_DEPENDS_ON_TYPE_ID \ | ||
TEST_DEPENDS_ON_TYPE_ID \ | ||
USES_FROM_MACOS_DEPENDS_ON_TYPE_ID <<< "$IDS" | ||
|
||
# If you've already pulled the Homebrew data, you can `export FETCH=false` to skip the | ||
# download and work from whatever the existing `latest` symlink points to. | ||
# Validate that all required IDs are present and export them | ||
required_vars=( | ||
PACKAGE_MANAGER_ID | ||
HOMEPAGE_URL_TYPE_ID | ||
SOURCE_URL_TYPE_ID | ||
BUILD_DEPENDS_ON_TYPE_ID | ||
RUNTIME_DEPENDS_ON_TYPE_ID | ||
RECOMMENDED_DEPENDS_ON_TYPE_ID | ||
OPTIONAL_DEPENDS_ON_TYPE_ID | ||
TEST_DEPENDS_ON_TYPE_ID | ||
USES_FROM_MACOS_DEPENDS_ON_TYPE_ID | ||
) | ||
|
||
# > [!IMPORTANT] | ||
# > | ||
# > ONLY WORKS IF THE VOLUMES ARE MOUNTED | ||
for var in "${required_vars[@]}"; do | ||
if [ -z "${!var}" ]; then | ||
log "ERROR: Required variable $var is empty or unset. Exiting." | ||
exit 1 | ||
fi | ||
# shellcheck disable=SC2163 | ||
export "$var" | ||
done | ||
|
||
# Data fetching and processing | ||
if [ "$FETCH" = true ]; then | ||
NOW=$(date -u +"%Y-%m-%dT%H:%M:%SZ") | ||
mkdir -p "$DATA_DIR"/"$NOW" | ||
log "Fetching new data from Homebrew" | ||
|
||
# extract | ||
curl -s "$SOURCE" > "$DATA_DIR"/"$NOW"/source.json | ||
# Create timestamped directory for this run | ||
NOW=$(date -u +"%Y-%m-%dT%H:%M:%SZ") | ||
mkdir -p "$DATA_DIR"/"$NOW" | ||
|
||
# make a symlink called latest, pointing to $NOW | ||
ln -sfn "$NOW" "$DATA_DIR"/latest | ||
# Download source data | ||
log "Downloading source data" | ||
curl -s "$SOURCE" > "$DATA_DIR"/"$NOW"/source.json | ||
|
||
# transform | ||
for x in "$CODE_DIR"/jq/*.jq; do | ||
filename=$(basename "$x" .jq) | ||
# use the formulas defined in the jq folder for each data model | ||
if [ "$filename" = "packages" ]; then | ||
jq -f "$x" -r --arg package_manager_id "$PACKAGE_MANAGER_ID" "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql | ||
elif [ "$filename" = "urls" ]; then | ||
jq -f "$x" -r \ | ||
--arg homepage_url_type_id "$HOMEPAGE_URL_TYPE_ID" \ | ||
--arg source_url_type_id "$SOURCE_URL_TYPE_ID" \ | ||
"$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql | ||
elif [ "$filename" = "versions" ]; then | ||
jq -f "$x" -r \ | ||
"$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql | ||
elif [ "$filename" = "package_url" ]; then | ||
jq -f "$x" -r \ | ||
--arg homepage_url_type_id "$HOMEPAGE_URL_TYPE_ID" \ | ||
--arg source_url_type_id "$SOURCE_URL_TYPE_ID" \ | ||
"$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql | ||
elif [ "$filename" = "dependencies" ]; then | ||
jq -f "$x" -r \ | ||
--arg build_deps_type_id "$BUILD_DEPENDS_ON_TYPE_ID" \ | ||
--arg runtime_deps_type_id "$RUNTIME_DEPENDS_ON_TYPE_ID" \ | ||
--arg recommended_deps_type_id "$RECOMMENDED_DEPENDS_ON_TYPE_ID" \ | ||
--arg optional_deps_type_id "$OPTIONAL_DEPENDS_ON_TYPE_ID" \ | ||
--arg test_deps_type_id "$TEST_DEPENDS_ON_TYPE_ID" \ | ||
--arg uses_from_macos_type_id "$USES_FROM_MACOS_DEPENDS_ON_TYPE_ID" \ | ||
"$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql | ||
else | ||
echo "skipping $filename" | ||
fi | ||
done | ||
# Update 'latest' symlink | ||
ln -sfn "$NOW" "$DATA_DIR"/latest | ||
|
||
# Transform data using jq scripts | ||
log "Transforming data" | ||
for x in "$CODE_DIR"/jq/*.jq; do | ||
filename=$(basename "$x" .jq) | ||
log "Processing $filename" | ||
case "$filename" in | ||
packages) | ||
jq -f "$x" -r \ | ||
--arg package_manager_id "$PACKAGE_MANAGER_ID" \ | ||
"$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql | ||
;; | ||
urls) | ||
jq -f "$x" -r \ | ||
--arg homepage_url_type_id "$HOMEPAGE_URL_TYPE_ID" \ | ||
--arg source_url_type_id "$SOURCE_URL_TYPE_ID" \ | ||
"$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql | ||
;; | ||
versions) | ||
jq -f "$x" -r \ | ||
"$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql | ||
;; | ||
package_url) | ||
jq -f "$x" -r \ | ||
--arg homepage_url_type_id "$HOMEPAGE_URL_TYPE_ID" \ | ||
--arg source_url_type_id "$SOURCE_URL_TYPE_ID" \ | ||
"$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql | ||
;; | ||
dependencies) | ||
jq -f "$x" -r \ | ||
--arg build_deps_type_id "$BUILD_DEPENDS_ON_TYPE_ID" \ | ||
--arg runtime_deps_type_id "$RUNTIME_DEPENDS_ON_TYPE_ID" \ | ||
--arg recommended_deps_type_id "$RECOMMENDED_DEPENDS_ON_TYPE_ID" \ | ||
--arg optional_deps_type_id "$OPTIONAL_DEPENDS_ON_TYPE_ID" \ | ||
--arg test_deps_type_id "$TEST_DEPENDS_ON_TYPE_ID" \ | ||
--arg uses_from_macos_type_id "$USES_FROM_MACOS_DEPENDS_ON_TYPE_ID" \ | ||
"$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql | ||
;; | ||
*) | ||
log "Skipping unknown file: $filename" | ||
;; | ||
esac | ||
done | ||
else | ||
log "Skipping data fetch (FETCH=false)" | ||
fi | ||
|
||
# load - order matters | ||
psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/packages.sql | ||
psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/urls.sql | ||
psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/versions.sql | ||
psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/package_url.sql | ||
psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/dependencies.sql | ||
# Load data into database | ||
log "Loading data into database" | ||
psql "$CHAI_DATABASE_URL" <<EOSQL | ||
\i $DATA_DIR/latest/packages.sql | ||
\i $DATA_DIR/latest/urls.sql | ||
\i $DATA_DIR/latest/versions.sql | ||
\i $DATA_DIR/latest/package_url.sql | ||
\i $DATA_DIR/latest/dependencies.sql | ||
EOSQL | ||
|
||
log "Homebrew pipeline completed successfully" |