Skip to content

Commit

Permalink
jq corrections, pipeline.sh fixes, dockerfile
Browse files Browse the repository at this point in the history
  • Loading branch information
sanchitram1 committed Oct 21, 2024
1 parent 2a34bd8 commit 86c3e6e
Show file tree
Hide file tree
Showing 7 changed files with 74 additions and 71 deletions.
2 changes: 0 additions & 2 deletions package_managers/homebrew/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,4 @@ RUN apt-get update && \
COPY . .
WORKDIR /package_managers/homebrew
RUN chmod +x /package_managers/homebrew/pipeline.sh
# RUN pip install --no-cache-dir -r requirements.txt
CMD ["/package_managers/homebrew/pipeline.sh"]

6 changes: 3 additions & 3 deletions package_managers/homebrew/jq/dependencies.jq
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@
# and only look at the ones that are strings (TODO: some may be JSON objects - handle those?)
select(.depends_on | type == "string") |
# generate the sql statements!
"INSERT INTO dependencies (version_id, dependency_id, depends_on_type_id) VALUES (
"INSERT INTO dependencies (version_id, dependency_id, dependency_type_id) VALUES (
(SELECT id FROM versions WHERE import_id = '" + .package_name + "'),
(SELECT id FROM packages WHERE name = '" + .depends_on + "'),
'" + .depends_on_type + "');"
(SELECT id FROM packages WHERE import_id = '" + .depends_on + "'),
'" + .depends_on_type + "') ON CONFLICT DO NOTHING;"
] | join("\n")
3 changes: 2 additions & 1 deletion package_managers/homebrew/jq/package_url.jq
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@
# and here we say "for each url, generate an insert statement"
"INSERT INTO package_urls (package_id, url_id) VALUES (
(SELECT id FROM packages WHERE name = '" + .package_name + "'),
(SELECT id FROM urls WHERE url = '" + .url + "' AND url_type_id = '" + .type + "'));"
(SELECT id FROM urls WHERE url = '" + .url + "' AND url_type_id = '" + .type + "'))
ON CONFLICT DO NOTHING;"
] | join("\n")
2 changes: 1 addition & 1 deletion package_managers/homebrew/jq/packages.jq
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@
# the import_id is the same as the package name (used for joins)
.name + "', '" +
# the package manager ID is passed in as a variable
$package_manager_id + "');"
$package_manager_id + "') ON CONFLICT DO NOTHING;"
] | join("\n")
3 changes: 2 additions & 1 deletion package_managers/homebrew/jq/urls.jq
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@
# and here, we can generate our SQL statement!
"INSERT INTO urls (url, url_type_id) VALUES ('" +
.url + "', '" +
if .name == "source" then $source_url_type_id else $homepage_url_type_id end + "');"
if .name == "source" then $source_url_type_id else $homepage_url_type_id end + "')
ON CONFLICT DO NOTHING;"
] | join("\n")
8 changes: 5 additions & 3 deletions package_managers/homebrew/jq/versions.jq
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@
version: .versions.stable,
import_id: .name
} |
"INSERT INTO versions (version, package_id) VALUES ('" +
.version + "', '" +
.import_id + "');"
"INSERT INTO versions (version, import_id, package_id) VALUES (
'" + .version + "',
'" + .import_id + "',
(SELECT id FROM packages WHERE import_id = '" + .import_id + "')
) ON CONFLICT DO NOTHING;"
] | join("\n")
121 changes: 61 additions & 60 deletions package_managers/homebrew/pipeline.sh
Original file line number Diff line number Diff line change
@@ -1,29 +1,39 @@
#!/bin/bash

set -exu
set -exuo pipefail

# Set PSQL_FLAGS based on DEBUG environment variable
if [ "${DEBUG:-false}" = false ]; then
PSQL_FLAGS="-q"
else
PSQL_FLAGS=""
fi
# get all the required IDs and URLs from the database
# psql flags: -t prints tuples only (no header/footer), -A turns off column
# alignment, and -F'|' sets the field separator, so the output is a plain
# `|`-delimited record that `read` can split below.
IDS=$(psql "$CHAI_DATABASE_URL" -f sql/homebrew_vars.sql -t -A -F'|')

# Parse the results and export variables
# `read` consumes only the first line of $IDS and assigns the `|`-separated
# fields to the names in order; names without a matching field are set empty.
# NOTE(review): the name order must match the column order produced by
# sql/homebrew_vars.sql, which presumably returns a single row - confirm.
IFS='|' read -r PACKAGE_MANAGER_ID HOMEPAGE_URL_TYPE_ID SOURCE_URL_TYPE_ID \
BUILD_DEPENDS_ON_TYPE_ID RUNTIME_DEPENDS_ON_TYPE_ID \
RECOMMENDED_DEPENDS_ON_TYPE_ID OPTIONAL_DEPENDS_ON_TYPE_ID \
TEST_DEPENDS_ON_TYPE_ID USES_FROM_MACOS_DEPENDS_ON_TYPE_ID <<< "$IDS"

# get the ID for Homebrew from our database
HOMEBREW_ID=$(psql $PSQL_FLAGS "$CHAI_DATABASE_URL" -f homebrew_id.sql -v "ON_ERROR_STOP=1" -tA)
# Mark every ID variable for export in a single statement so that child
# processes started by this script can read them from the environment.
export \
  PACKAGE_MANAGER_ID \
  HOMEPAGE_URL_TYPE_ID \
  SOURCE_URL_TYPE_ID \
  BUILD_DEPENDS_ON_TYPE_ID \
  RUNTIME_DEPENDS_ON_TYPE_ID \
  RECOMMENDED_DEPENDS_ON_TYPE_ID \
  OPTIONAL_DEPENDS_ON_TYPE_ID \
  TEST_DEPENDS_ON_TYPE_ID \
  USES_FROM_MACOS_DEPENDS_ON_TYPE_ID

# fail if HOMEBREW_ID is empty
if [ -z "$HOMEBREW_ID" ]; then
echo "Error: Failed to retrieve Homebrew ID from the database."
# if any of the IDs are empty, exit
# Collect the names of all empty IDs first, so the error reports exactly which
# lookups came back empty rather than a generic message.
missing_ids=()
for id_name in \
  PACKAGE_MANAGER_ID HOMEPAGE_URL_TYPE_ID SOURCE_URL_TYPE_ID \
  BUILD_DEPENDS_ON_TYPE_ID RUNTIME_DEPENDS_ON_TYPE_ID \
  RECOMMENDED_DEPENDS_ON_TYPE_ID OPTIONAL_DEPENDS_ON_TYPE_ID \
  TEST_DEPENDS_ON_TYPE_ID USES_FROM_MACOS_DEPENDS_ON_TYPE_ID; do
  # ${!id_name:-} reads the variable named by $id_name; the `:-` default keeps
  # the check from tripping `set -u` if a name was never assigned.
  if [ -z "${!id_name:-}" ]; then
    missing_ids+=("$id_name")
  fi
done
if [ "${#missing_ids[@]}" -gt 0 ]; then
  # diagnostics go to stderr so they are not mixed into captured stdout
  echo "One or more IDs are empty (${missing_ids[*]}). Exiting." >&2
  exit 1
fi

# homebrew provides `source` and `homepage` url types - let's create them ahead of time
psql $PSQL_FLAGS "$CHAI_DATABASE_URL" -f create_url_types.sql

# if you've already pulled the Homebrew data, you can `export FETCH=false` to skip the
# download, and just work off the latest symlink
# Note that this only works if the volumes are mounted

# > [!IMPORTANT]
# >
# > ONLY WORKS IF THE VOLUMES ARE MOUNTED

if [ "$FETCH" = true ]; then
NOW=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
mkdir -p "$DATA_DIR"/"$NOW"
Expand All @@ -35,51 +45,42 @@ if [ "$FETCH" = true ]; then
ln -sfn "$NOW" "$DATA_DIR"/latest

# transform
echo "$JQ_DIR"
for x in "$JQ_DIR"/*.jq; do
for x in "$CODE_DIR"/jq/*.jq; do
filename=$(basename "$x" .jq)
# first jq line uses the formulas defined in the jq folder for each data model
# second jq line transforms the json into csv so we can use sed to prep psql stmts
jq -f "$x" "$DATA_DIR"/latest/source.json \
| jq -r '
(map(keys) | add | unique) as $cols |
map(. as $row | $cols | map($row[.])) as $rows |
$cols, $rows[] | @csv
' \
> "$DATA_DIR"/latest/"${filename}".csv
# Run the jq formula for this data model against the latest source.json and
# write the generated SQL to latest/<model>.sql. Each model gets only the
# --arg values its formula is given; unknown formulas are skipped.
case "$filename" in
  packages)
    jq -f "$x" -r \
      --arg package_manager_id "$PACKAGE_MANAGER_ID" \
      "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql
    ;;
  urls | package_url)
    # both formulas are passed the same pair of URL type IDs
    jq -f "$x" -r \
      --arg homepage_url_type_id "$HOMEPAGE_URL_TYPE_ID" \
      --arg source_url_type_id "$SOURCE_URL_TYPE_ID" \
      "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql
    ;;
  versions)
    jq -f "$x" -r \
      "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql
    ;;
  dependencies)
    jq -f "$x" -r \
      --arg build_deps_type_id "$BUILD_DEPENDS_ON_TYPE_ID" \
      --arg runtime_deps_type_id "$RUNTIME_DEPENDS_ON_TYPE_ID" \
      --arg recommended_deps_type_id "$RECOMMENDED_DEPENDS_ON_TYPE_ID" \
      --arg optional_deps_type_id "$OPTIONAL_DEPENDS_ON_TYPE_ID" \
      --arg test_deps_type_id "$TEST_DEPENDS_ON_TYPE_ID" \
      --arg uses_from_macos_type_id "$USES_FROM_MACOS_DEPENDS_ON_TYPE_ID" \
      "$DATA_DIR"/latest/source.json > "$DATA_DIR"/latest/"${filename}".sql
    ;;
  *)
    echo "skipping $filename"
    ;;
esac
done
fi

# load
# TODO: loop?

# packages
# pass HOMEBREW_ID to sed to replace the @@HOMEBREW_ID@@ placeholder
sed \
-f "$SED_DIR/packages.sed" "$DATA_DIR/latest/packages.csv" | \
sed "s/@@HOMEBREW_ID@@/$HOMEBREW_ID/" \
> "$DATA_DIR/latest/package_inserts.sql"
psql $PSQL_FLAGS "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/package_inserts.sql

# urls
sed \
-f "$SED_DIR/urls.sed" "$DATA_DIR/latest/urls.csv" \
> "$DATA_DIR/latest/url_inserts.sql"
psql $PSQL_FLAGS "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/url_inserts.sql

# versions
# TODO: licenses (license id is annoying)
# TODO: some random parsing errors happening in versions.csv
sed \
-f "$SED_DIR/versions.sed" "$DATA_DIR/latest/versions.csv" \
> "$DATA_DIR/latest/version_inserts.sql"
psql $PSQL_FLAGS "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/version_inserts.sql

# package_urls
# TODO: ERROR: more than one row returned by a subquery used as an expression
sed \
-f "$SED_DIR/package_url.sed" "$DATA_DIR/latest/package_url.csv" \
> "$DATA_DIR/latest/package_url_inserts.sql"
psql $PSQL_FLAGS "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/package_url_inserts.sql

# TODO: dependencies -> dependency_type is annoying
# load - order matters
# The generated INSERTs look up ids by sub-select (versions reads packages,
# package_url reads packages and urls, dependencies reads versions and
# packages), so each file is loaded only after the tables it reads from.
# NOTE(review): psql keeps going after a failed statement and still exits 0
# unless ON_ERROR_STOP is set, so `set -e` will not catch SQL errors here -
# confirm that this best-effort behavior is intended.
for model in packages urls versions package_url dependencies; do
  psql "$CHAI_DATABASE_URL" -f "$DATA_DIR"/latest/"$model".sql
done

0 comments on commit 86c3e6e

Please sign in to comment.