bug-1886021: Implement GCS storage classes for GCP
relud committed Apr 9, 2024
1 parent 89f3f05 commit 11f76be
Showing 17 changed files with 1,249 additions and 45 deletions.
41 changes: 41 additions & 0 deletions bin/gcs_cli.py
@@ -9,6 +9,7 @@
# Usage: ./bin/gcs_cli.py CMD

import os
from pathlib import Path

import click

@@ -126,6 +127,46 @@ def list_objects(bucket_name, details):
click.echo("No objects in bucket.")


@gcs_group.command("upload")
@click.argument("source")
@click.argument("destination")
def upload(source, destination):
    """Upload files to a bucket"""

    client = get_client()

    # remove protocol from destination if present
    destination = destination.split("://", 1)[-1]
    bucket_name, _, prefix = destination.partition("/")

    try:
        bucket = client.get_bucket(bucket_name)
    except NotFound:
        click.echo(f"GCS bucket {bucket_name!r} does not exist.", err=True)
        return

    source_path = Path(source)
    if not source_path.exists():
        click.echo(f"local path {source!r} does not exist.", err=True)
        return
    prefix_path = Path(prefix)
    if source_path.is_dir():
        sources = [p for p in source_path.rglob("*") if not p.is_dir()]
    else:
        sources = [source_path]
    if not sources:
        click.echo(f"No files in directory {source!r}.")
        return
    for path in sources:
        if path == source_path:
            # single-file upload: use the prefix itself as the object key
            key_path = prefix_path
        else:
            # directory upload: preserve the tree layout under the prefix
            key_path = prefix_path / path.relative_to(source_path)
        key = "/".join(key_path.parts)
        blob = bucket.blob(key)
        blob.upload_from_filename(path)
        click.echo(f"Uploaded gs://{bucket_name}/{key}")


def main(argv=None):
    argv = argv or []
    gcs_group(argv)
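
For a quick local check of the new subcommand, a click test-runner sketch along these lines should work; the gcs_cli import path and the ./crashdata directory are illustrative assumptions rather than part of this commit, and the dev-bucket must already exist in the emulator.

from click.testing import CliRunner

from gcs_cli import gcs_group  # assumed import path for bin/gcs_cli.py

runner = CliRunner()
# Upload a local directory into dev-bucket; the command strips the "gs://"
# protocol from the destination before resolving the bucket name.
result = runner.invoke(gcs_group, ["upload", "./crashdata", "gs://dev-bucket/"])
print(result.output)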
13 changes: 10 additions & 3 deletions bin/process_crashes.sh
@@ -47,9 +47,16 @@ mkdir "${DATADIR}" || echo "${DATADIR} already exists."
./socorro-cmd fetch_crash_data "${DATADIR}" $@

# Make the bucket and sync contents
./bin/socorro_aws_s3.sh mb s3://dev-bucket/
./bin/socorro_aws_s3.sh cp --recursive "${DATADIR}" "s3://${CRASHSTORAGE_S3_BUCKET}/"
./bin/socorro_aws_s3.sh ls --recursive "s3://${CRASHSTORAGE_S3_BUCKET}/"
# ^^ returns CLOUD_PROVIDER value as uppercase
if [[ "${CLOUD_PROVIDER^^}" == "GCP" ]]; then
./socorro-cmd gcs create "${CRASHSTORAGE_GCS_BUCKET}"
./socorro-cmd gcs upload "${DATADIR}" "${CRASHSTORAGE_GCS_BUCKET}"
./socorro-cmd gcs list_objects "${CRASHSTORAGE_GCS_BUCKET}"
else
./bin/socorro_aws_s3.sh mb "s3://${CRASHSTORAGE_S3_BUCKET}/"
./bin/socorro_aws_s3.sh cp --recursive "${DATADIR}" "s3://${CRASHSTORAGE_S3_BUCKET}/"
./bin/socorro_aws_s3.sh ls --recursive "s3://${CRASHSTORAGE_S3_BUCKET}/"
fi

# Add crash ids to queue
# ^^ returns CLOUD_PROVIDER value as uppercase
23 changes: 23 additions & 0 deletions bin/recreate_gcs_buckets.sh
@@ -0,0 +1,23 @@
#!/bin/bash

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# Usage: bin/recreate_gcs_buckets.sh
#
# Deletes and recreates the GCS buckets used for crash storage and telemetry
#
# Note: This should be called from inside a container.

set -euo pipefail

cd /app

echo "Dropping and recreating GCS crash bucket..."
/app/socorro-cmd gcs delete "${CRASHSTORAGE_GCS_BUCKET}"
/app/socorro-cmd gcs create "${CRASHSTORAGE_GCS_BUCKET}"

echo "Dropping and recreating GCS telemetry bucket..."
/app/socorro-cmd gcs delete "${TELEMETRY_GCS_BUCKET}"
/app/socorro-cmd gcs create "${TELEMETRY_GCS_BUCKET}"
3 changes: 3 additions & 0 deletions bin/setup_services.sh
@@ -20,6 +20,9 @@ set -euo pipefail
# Delete and create local S3 buckets
/app/bin/recreate_s3_buckets.sh

# Delete and create local GCS buckets
/app/bin/recreate_gcs_buckets.sh

# Delete and create Elasticsearch indices
/app/socorro-cmd es delete
/app/socorro-cmd es create
2 changes: 2 additions & 0 deletions bin/test.sh
@@ -54,6 +54,8 @@ echo ">>> run tests"

# Run socorro tests
"${PYTEST}"
# The default cloud provider is AWS; now configure GCP and run only the impacted tests
CLOUD_PROVIDER=GCP "${PYTEST}" -m gcp

# Collect static and then run pytest in the webapp
pushd webapp
2 changes: 2 additions & 0 deletions docker/config/local_dev.env
@@ -145,3 +145,5 @@ PUBSUB_REPROCESSING_SUBSCRIPTION_NAME=local-reprocessing-sub
# GCS
# ---
STORAGE_EMULATOR_HOST=http://gcs-emulator:8001
CRASHSTORAGE_GCS_BUCKET=dev-bucket
TELEMETRY_GCS_BUCKET=telemetry-bucket
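
As a sanity check of this emulator configuration, a client can be pointed at it roughly as in the sketch below; the google-cloud-storage client honors STORAGE_EMULATOR_HOST, and the project id used here is a placeholder rather than anything defined by this commit.

import os

from google.auth.credentials import AnonymousCredentials
from google.cloud import storage

# STORAGE_EMULATOR_HOST (set in local_dev.env) routes the client to the
# emulator instead of real GCS; anonymous credentials skip real auth.
os.environ.setdefault("STORAGE_EMULATOR_HOST", "http://gcs-emulator:8001")
client = storage.Client(project="local-dev", credentials=AnonymousCredentials())  # placeholder project id

bucket = client.bucket(os.environ.get("CRASHSTORAGE_GCS_BUCKET", "dev-bucket"))
print([blob.name for blob in client.list_blobs(bucket)])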
4 changes: 4 additions & 0 deletions docker/config/test.env
@@ -15,6 +15,10 @@ SQS_STANDARD_QUEUE=test-standard
SQS_PRIORITY_QUEUE=test-priority
SQS_REPROCESSING_QUEUE=test-reprocessing

# GCS
CRASHSTORAGE_GCS_BUCKET=crashstats-test
TELEMETRY_GCS_BUCKET=telemetry-test

# Pub/Sub
PUBSUB_PROJECT_ID=test
PUBSUB_STANDARD_TOPIC_NAME=test-standard
21 changes: 19 additions & 2 deletions docs/crashstorage.rst
@@ -252,8 +252,8 @@ Implements Radix Tree storage of crashes in a filesystem.
Use cases:

* For Mozilla use by the collectors.
* For other users, you can use this class as your primary storage instead of S3.
Be sure to implement this in collectors, crashmovers, processors and
* For other users, you can use this class as your primary storage instead of S3
or GCS. Be sure to implement this in collectors, crashmovers, processors and
middleware (depending on which components you use in your configuration).

.. Note::
@@ -286,3 +286,20 @@ The "directory" hierarchy of that bucket looks like this:
* ``{prefix}/v1/{name_of_thing}/{date}/{id}``: Raw crash data.
* ``{prefix}/v1/{name_of_thing}/{id}``: Processed crash data, dumps, dump_names,
and other things.


socorro.external.gcs: Google Cloud Storage
==========================================

The collector saves raw crash data to Google Cloud Storage.

The processor loads raw crash data from Google Cloud Storage, processes it, and
then saves the processed crash data back to Google Cloud Storage.

All of this is done in a single Google Cloud Storage bucket.

The "directory" hierarchy of that bucket looks like this:

* ``{prefix}/v1/{name_of_thing}/{date}/{id}``: Raw crash data.
* ``{prefix}/v1/{name_of_thing}/{id}``: Processed crash data, dumps, dump_names,
and other things.
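
Purely to illustrate that layout (the crash id, date, and object names below are made up, and this is not the GcsCrashStorage implementation added by the commit), fetching a stored crash with the plain google-cloud-storage client looks roughly like this:

from google.cloud import storage

# Illustrative values only; not taken from this commit.
crash_id = "0b794045-87ec-4649-9ce1-73ec10191120"
date = "20191120"

raw_key = f"v1/raw_crash/{date}/{crash_id}"       # {prefix}/v1/{name_of_thing}/{date}/{id}
processed_key = f"v1/processed_crash/{crash_id}"  # {prefix}/v1/{name_of_thing}/{id}

client = storage.Client()
bucket = client.bucket("dev-bucket")
raw_crash = bucket.blob(raw_key).download_as_bytes()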
33 changes: 21 additions & 12 deletions docs/dev.rst
@@ -71,7 +71,7 @@ Setup quickstart
That will build the app Docker image required for development.

5. Initialize Postgres, Elasticsearch, S3, and SQS.
5. Initialize Postgres, Elasticsearch, GCS, Pub/Sub, S3, and SQS.

Then you need to set up services. To do that, run:

@@ -92,7 +92,9 @@ Setup quickstart
For Elasticsearch, it sets up Super Search fields and the index for
processed crash data.

For S3, this creates the required buckets.
For S3 and GCS, this creates the required buckets.

For Pub/Sub, this creates topics and subscriptions.

For SQS, this creates queues.

@@ -651,8 +653,8 @@ Running the processor is pretty uninteresting since it'll just sit there until
you give it something to process.

In order to process something, you first need to acquire raw crash data, put the
data in the S3 container in the appropriate place, then you need to add the
crash id to the AWS SQS standard queue.
data in the cloud storage container in the appropriate place, then you need to
add the crash id to the standard queue.

We have helper scripts for these steps.

@@ -702,8 +704,8 @@ bin/process_crashes.sh
----------------------

You can use the ``bin/process_crashes.sh`` script which will fetch crash
data, sync it with the S3 bucket, and publish the crash ids to AWS SQS queue
for processing. If you have access to memory dumps and use a valid
data, sync it with the cloud storage emulator bucket, and publish the crash ids
to the queue for processing. If you have access to memory dumps and use a valid
`API token`_, then memory dumps will be fetched for processing as well.

It takes one or more crash ids as arguments.
@@ -857,7 +859,7 @@ For help:
.. Note::

Processing will fail unless the crash data is in the S3 container first!
Processing will fail unless the crash data is in the cloud storage container first!


Example using all the scripts
@@ -878,15 +880,22 @@ Let's process crashes for Firefox from yesterday. We'd do this:
# "crashdata" directory on the host
app@socorro:/app$ cat crashids.txt | socorro-cmd fetch_crash_data ./crashdata
# if using CLOUD_PROVIDER=AWS (default)
# Create a dev-bucket in localstack s3
app@socorro:/app$ bin/socorro_aws_s3.sh mb s3://dev-bucket/
# Copy that data from the host into the localstack s3 container
app@socorro:/app$ bin/socorro_aws_s3.sh sync ./crashdata s3://dev-bucket/
# Add all the crash ids to the queue
# Add all the crash ids to the sqs queue
app@socorro:/app$ cat crashids.txt | socorro-cmd sqs publish local-dev-standard
# or if using CLOUD_PROVIDER=GCP
# Create a dev-bucket in the GCS emulator
app@socorro:/app$ socorro-cmd gcs create dev-bucket
# Copy that data from the host into the GCS emulator
app@socorro:/app$ socorro-cmd gcs upload ./crashdata gs://dev-bucket/
# Add all the crash ids to the pubsub topic
app@socorro:/app$ cat crashids.txt | socorro-cmd pubsub publish local-standard-topic
# Then exit the container
app@socorro:/app$ exit
@@ -911,8 +920,8 @@ To run Antenna in the Socorro local dev environment, do::

It will listen on ``http://localhost:8888/`` for incoming crashes from a
breakpad crash reporter. It will save crash data to the ``dev-bucket`` in the
local S3 which is where the processor looks for it. It will publish the crash
ids to the AWS SQS standard queue.
local cloud storage which is where the processor looks for it. It will publish
the crash ids to the standard queue.


Connect to PostgreSQL database
7 changes: 6 additions & 1 deletion pytest.ini
@@ -2,7 +2,8 @@
# -rsxX - show skipped, failed, and passed tests
# --tb=native - print native traceback
# -p no:django - disable the pytest-django plugin for Socorro tests
addopts = -rsxX --tb=native -p no:django
# -m 'not gcp' - skip gcp tests unless explicitly requested
addopts = -rsxX --tb=native -p no:django -m 'not gcp'
norecursedirs = .git docs config docker __pycache__
testpaths = socorro/

@@ -21,3 +22,7 @@ filterwarnings =
# pubsub deprecated the return_immediately flag because it negatively impacts performance, but
# that performance cost is fine for our use case, especially in tests.
ignore:The return_immediately flag is deprecated and should be set to False.:DeprecationWarning:google.pubsub_v1

markers =
aws: tests that require aws backends to be configured in the environment. this is the default.
gcp: tests that require gcp backends to be configured in the environment. skipped unless explicitly requested.
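
A hypothetical test using the new marker might look like the sketch below; the test name and assertion are placeholders rather than code from this commit. It is skipped by the default addopts and picked up by CLOUD_PROVIDER=GCP pytest -m gcp as in bin/test.sh.

import os

import pytest


@pytest.mark.gcp
def test_crashstorage_gcs_bucket_is_configured():
    # Placeholder check: only meaningful with the GCP settings from
    # docker/config/test.env loaded into the environment.
    assert os.environ["CRASHSTORAGE_GCS_BUCKET"] == "crashstats-test"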
3 changes: 3 additions & 0 deletions socorro/external/gcs/__init__.py
@@ -0,0 +1,3 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.