bug-1886021: Implement GCS storage classes for GCP
relud committed Apr 9, 2024
1 parent 89f3f05 commit 11f76be
Showing 17 changed files with 1,249 additions and 45 deletions.
41 changes: 41 additions & 0 deletions bin/gcs_cli.py
@@ -9,6 +9,7 @@
# Usage: ./bin/gcs_cli.py CMD

import os
from pathlib import Path

import click

@@ -126,6 +127,46 @@ def list_objects(bucket_name, details):
click.echo("No objects in bucket.")


@gcs_group.command("upload")
@click.argument("source")
@click.argument("destination")
def upload(source, destination):
    """Upload files to a bucket"""

    client = get_client()

    # remove protocol from destination if present
    destination = destination.split("://", 1)[-1]
    bucket_name, _, prefix = destination.partition("/")

    try:
        bucket = client.get_bucket(bucket_name)
    except NotFound:
        click.echo(f"GCS bucket {bucket_name!r} does not exist.", err=True)
        return

    source_path = Path(source)
    if not source_path.exists():
        click.echo(f"local path {source!r} does not exist.", err=True)
        return
    prefix_path = Path(prefix)
    if source_path.is_dir():
        sources = [p for p in source_path.rglob("*") if not p.is_dir()]
    else:
        sources = [source_path]
    if not sources:
        click.echo(f"No files in directory {source!r}.")
        return
    for path in sources:
        if path == source_path:
            # single-file upload: use the prefix itself as the object key
            key_path = prefix_path
        else:
            # directory upload: preserve the tree layout under the prefix
            key_path = prefix_path / path.relative_to(source_path)
        key = "/".join(key_path.parts)
        blob = bucket.blob(key)
        blob.upload_from_filename(path)
        click.echo(f"Uploaded gs://{bucket_name}/{key}")


def main(argv=None):
    argv = argv or []
    gcs_group(argv)
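
For a quick local check of the new subcommand, a click test-runner sketch along these lines should work; the gcs_cli import path and the ./crashdata directory are illustrative assumptions rather than part of this commit, and the dev-bucket must already exist in the emulator.

from click.testing import CliRunner

from gcs_cli import gcs_group  # assumed import path for bin/gcs_cli.py

runner = CliRunner()
# Upload a local directory into dev-bucket; the command strips the "gs://"
# protocol from the destination before resolving the bucket name.
result = runner.invoke(gcs_group, ["upload", "./crashdata", "gs://dev-bucket/"])
print(result.output)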
13 changes: 10 additions & 3 deletions bin/process_crashes.sh
@@ -47,9 +47,16 @@ mkdir "${DATADIR}" || echo "${DATADIR} already exists."
./socorro-cmd fetch_crash_data "${DATADIR}" $@

# Make the bucket and sync contents
./bin/socorro_aws_s3.sh mb s3://dev-bucket/
./bin/socorro_aws_s3.sh cp --recursive "${DATADIR}" "s3://${CRASHSTORAGE_S3_BUCKET}/"
./bin/socorro_aws_s3.sh ls --recursive "s3://${CRASHSTORAGE_S3_BUCKET}/"
# ^^ returns CLOUD_PROVIDER value as uppercase
if [[ "${CLOUD_PROVIDER^^}" == "GCP" ]]; then
./socorro-cmd gcs create "${CRASHSTORAGE_GCS_BUCKET}"
./socorro-cmd gcs upload "${DATADIR}" "${CRASHSTORAGE_GCS_BUCKET}"
./socorro-cmd gcs list_objects "${CRASHSTORAGE_GCS_BUCKET}"
else
./bin/socorro_aws_s3.sh mb "s3://${CRASHSTORAGE_S3_BUCKET}/"
./bin/socorro_aws_s3.sh cp --recursive "${DATADIR}" "s3://${CRASHSTORAGE_S3_BUCKET}/"
./bin/socorro_aws_s3.sh ls --recursive "s3://${CRASHSTORAGE_S3_BUCKET}/"
fi

# Add crash ids to queue
# ^^ returns CLOUD_PROVIDER value as uppercase
23 changes: 23 additions & 0 deletions bin/recreate_gcs_buckets.sh
@@ -0,0 +1,23 @@
#!/bin/bash

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# Usage: bin/recreate_gcs_buckets.sh
#
# Deletes and recreates the GCS buckets used for crash storage and telemetry
#
# Note: This should be called from inside a container.

set -euo pipefail

cd /app

echo "Dropping and recreating GCS crash bucket..."
/app/socorro-cmd gcs delete "${CRASHSTORAGE_GCS_BUCKET}"
/app/socorro-cmd gcs create "${CRASHSTORAGE_GCS_BUCKET}"

echo "Dropping and recreating GCS telemetry bucket..."
/app/socorro-cmd gcs delete "${TELEMETRY_GCS_BUCKET}"
/app/socorro-cmd gcs create "${TELEMETRY_GCS_BUCKET}"
3 changes: 3 additions & 0 deletions bin/setup_services.sh
@@ -20,6 +20,9 @@ set -euo pipefail
# Delete and create local S3 buckets
/app/bin/recreate_s3_buckets.sh

# Delete and create local GCS buckets
/app/bin/recreate_gcs_buckets.sh

# Delete and create Elasticsearch indices
/app/socorro-cmd es delete
/app/socorro-cmd es create
2 changes: 2 additions & 0 deletions bin/test.sh
@@ -54,6 +54,8 @@ echo ">>> run tests"

# Run socorro tests
"${PYTEST}"
# The default cloud provider is AWS; now configure GCP and run only the impacted tests
CLOUD_PROVIDER=GCP "${PYTEST}" -m gcp

# Collect static and then run pytest in the webapp
pushd webapp
2 changes: 2 additions & 0 deletions docker/config/local_dev.env
@@ -145,3 +145,5 @@ PUBSUB_REPROCESSING_SUBSCRIPTION_NAME=local-reprocessing-sub
# GCS
# ---
STORAGE_EMULATOR_HOST=http://gcs-emulator:8001
CRASHSTORAGE_GCS_BUCKET=dev-bucket
TELEMETRY_GCS_BUCKET=telemetry-bucket
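
As a sanity check of this emulator configuration, a client can be pointed at it roughly as in the sketch below; the google-cloud-storage client honors STORAGE_EMULATOR_HOST, and the project id used here is a placeholder rather than anything defined by this commit.

import os

from google.auth.credentials import AnonymousCredentials
from google.cloud import storage

# STORAGE_EMULATOR_HOST (set in local_dev.env) routes the client to the
# emulator instead of real GCS; anonymous credentials skip real auth.
os.environ.setdefault("STORAGE_EMULATOR_HOST", "http://gcs-emulator:8001")
client = storage.Client(project="local-dev", credentials=AnonymousCredentials())  # placeholder project id

bucket = client.bucket(os.environ.get("CRASHSTORAGE_GCS_BUCKET", "dev-bucket"))
print([blob.name for blob in client.list_blobs(bucket)])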
4 changes: 4 additions & 0 deletions docker/config/test.env
@@ -15,6 +15,10 @@ SQS_STANDARD_QUEUE=test-standard
SQS_PRIORITY_QUEUE=test-priority
SQS_REPROCESSING_QUEUE=test-reprocessing

# GCS
CRASHSTORAGE_GCS_BUCKET=crashstats-test
TELEMETRY_GCS_BUCKET=telemetry-test

# Pub/Sub
PUBSUB_PROJECT_ID=test
PUBSUB_STANDARD_TOPIC_NAME=test-standard
21 changes: 19 additions & 2 deletions docs/crashstorage.rst
@@ -252,8 +252,8 @@ Implements Radix Tree storage of crashes in a filesystem.
Use cases:

* For Mozilla use by the collectors.
* For other users, you can use this class as your primary storage instead of S3.
Be sure to implement this in collectors, crashmovers, processors and
* For other users, you can use this class as your primary storage instead of S3
or GCS. Be sure to implement this in collectors, crashmovers, processors and
middleware (depending on which components you use in your configuration).

.. Note::
@@ -286,3 +286,20 @@ The "directory" hierarchy of that bucket looks like this:
* ``{prefix}/v1/{name_of_thing}/{date}/{id}``: Raw crash data.
* ``{prefix}/v1/{name_of_thing}/{id}``: Processed crash data, dumps, dump_names,
and other things.


socorro.external.gcs: Google Cloud Storage
==========================================

The collector saves raw crash data to Google Cloud Storage.

The processor loads raw crash data from Google Cloud Storage, processes it, and
then saves the processed crash data back to Google Cloud Storage.

All of this is done in a single Google Cloud Storage bucket.

The "directory" hierarchy of that bucket looks like this:

* ``{prefix}/v1/{name_of_thing}/{date}/{id}``: Raw crash data.
* ``{prefix}/v1/{name_of_thing}/{id}``: Processed crash data, dumps, dump_names,
and other things.
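
Purely to illustrate that layout (the crash id, date, and object names below are made up, and this is not the GcsCrashStorage implementation added by the commit), fetching a stored crash with the plain google-cloud-storage client looks roughly like this:

from google.cloud import storage

# Illustrative values only; not taken from this commit.
crash_id = "0b794045-87ec-4649-9ce1-73ec10191120"
date = "20191120"

raw_key = f"v1/raw_crash/{date}/{crash_id}"       # {prefix}/v1/{name_of_thing}/{date}/{id}
processed_key = f"v1/processed_crash/{crash_id}"  # {prefix}/v1/{name_of_thing}/{id}

client = storage.Client()
bucket = client.bucket("dev-bucket")
raw_crash = bucket.blob(raw_key).download_as_bytes()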
33 changes: 21 additions & 12 deletions docs/dev.rst
@@ -71,7 +71,7 @@ Setup quickstart
That will build the app Docker image required for development.

5. Initialize Postgres, Elasticsearch, S3, and SQS.
5. Initialize Postgres, Elasticsearch, GCS, Pub/Sub, S3, and SQS.

Then you need to set up services. To do that, run:

@@ -92,7 +92,9 @@ Setup quickstart
For Elasticsearch, it sets up Super Search fields and the index for
processed crash data.

For S3, this creates the required buckets.
For S3 and GCS, this creates the required buckets.

For Pub/Sub, this creates topics and subscriptions.

For SQS, this creates queues.

@@ -651,8 +653,8 @@ Running the processor is pretty uninteresting since it'll just sit there until
you give it something to process.

In order to process something, you first need to acquire raw crash data, put the
data in the S3 container in the appropriate place, then you need to add the
crash id to the AWS SQS standard queue.
data in the cloud storage container in the appropriate place, then you need to
add the crash id to the standard queue.

We have helper scripts for these steps.

@@ -702,8 +704,8 @@ bin/process_crashes.sh
----------------------

You can use the ``bin/process_crashes.sh`` script which will fetch crash
data, sync it with the S3 bucket, and publish the crash ids to AWS SQS queue
for processing. If you have access to memory dumps and use a valid
data, sync it with the cloud storage emulator bucket, and publish the crash ids
to the queue for processing. If you have access to memory dumps and use a valid
`API token`_, then memory dumps will be fetched for processing as well.

It takes one or more crash ids as arguments.
@@ -857,7 +859,7 @@ For help:
.. Note::

Processing will fail unless the crash data is in the S3 container first!
Processing will fail unless the crash data is in the cloud storage container first!


Example using all the scripts
@@ -878,15 +880,22 @@ Let's process crashes for Firefox from yesterday. We'd do this:
# "crashdata" directory on the host
app@socorro:/app$ cat crashids.txt | socorro-cmd fetch_crash_data ./crashdata
# if using CLOUD_PROVIDER=AWS (default)
# Create a dev-bucket in localstack s3
app@socorro:/app$ bin/socorro_aws_s3.sh mb s3://dev-bucket/
# Copy that data from the host into the localstack s3 container
app@socorro:/app$ bin/socorro_aws_s3.sh sync ./crashdata s3://dev-bucket/
# Add all the crash ids to the queue
# Add all the crash ids to the sqs queue
app@socorro:/app$ cat crashids.txt | socorro-cmd sqs publish local-dev-standard
# or if using CLOUD_PROVIDER=GCP
# Create a dev-bucket in the GCS emulator
app@socorro:/app$ socorro-cmd gcs create dev-bucket
# Copy that data from the host into the GCS emulator
app@socorro:/app$ socorro-cmd gcs upload ./crashdata gs://dev-bucket/
# Add all the crash ids to the pubsub topic
app@socorro:/app$ cat crashids.txt | socorro-cmd pubsub publish local-standard-topic
# Then exit the container
app@socorro:/app$ exit
@@ -911,8 +920,8 @@ To run Antenna in the Socorro local dev environment, do::

It will listen on ``http://localhost:8888/`` for incoming crashes from a
breakpad crash reporter. It will save crash data to the ``dev-bucket`` in the
local S3 which is where the processor looks for it. It will publish the crash
ids to the AWS SQS standard queue.
local cloud storage which is where the processor looks for it. It will publish
the crash ids to the standard queue.


Connect to PostgreSQL database
7 changes: 6 additions & 1 deletion pytest.ini
@@ -2,7 +2,8 @@
# -rsxX - show skipped, failed, and passed tests
# --tb=native - print native traceback
# -p no:django - disable the pytest-django plugin for Socorro tests
addopts = -rsxX --tb=native -p no:django
# -m 'not gcp' - skip gcp tests unless explicitly requested
addopts = -rsxX --tb=native -p no:django -m 'not gcp'
norecursedirs = .git docs config docker __pycache__
testpaths = socorro/

@@ -21,3 +22,7 @@ filterwarnings =
# pubsub deprecated the return_immediately flag because it negatively impacts performance, but
# that performance cost is fine for our use case, especially in tests.
ignore:The return_immediately flag is deprecated and should be set to False.:DeprecationWarning:google.pubsub_v1

markers =
aws: tests that require aws backends to be configured in the environment. this is the default.
gcp: tests that require gcp backends to be configured in the environment. skipped unless explicitly requested.
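
A hypothetical test using the new marker might look like the sketch below; the test name and assertion are placeholders rather than code from this commit. It is skipped by the default addopts and picked up by CLOUD_PROVIDER=GCP pytest -m gcp as in bin/test.sh.

import os

import pytest


@pytest.mark.gcp
def test_crashstorage_gcs_bucket_is_configured():
    # Placeholder check: only meaningful with the GCP settings from
    # docker/config/test.env loaded into the environment.
    assert os.environ["CRASHSTORAGE_GCS_BUCKET"] == "crashstats-test"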
3 changes: 3 additions & 0 deletions socorro/external/gcs/__init__.py
@@ -0,0 +1,3 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.