From fa603d6e761c41a7f6af3427f9eba415094d9012 Mon Sep 17 00:00:00 2001 From: r48b1t Date: Wed, 27 Jul 2022 09:48:33 +0530 Subject: [PATCH 1/2] Base --- .dockerignore | 3 + Dockerfile | 13 ++++ README.md | 132 ++++++++++++++++++++++++++++++++--------- deploy-airflow.sh | 114 +++++++++++++++++++++++++++++++++++ docs/Variables.md | 129 ++++++++++++++++++++++++++++++++++++++++ helm-values.yaml | 47 +++++++++++++++ variables.example.json | 26 ++++++++ 7 files changed, 435 insertions(+), 29 deletions(-) create mode 100644 .dockerignore create mode 100644 Dockerfile create mode 100755 deploy-airflow.sh create mode 100644 docs/Variables.md create mode 100644 helm-values.yaml create mode 100644 variables.example.json diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 000000000..42015dbc1 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,3 @@ +.git +.github +venv \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000..fd81b5bcb --- /dev/null +++ b/Dockerfile @@ -0,0 +1,13 @@ +FROM apache/airflow:1.10.15-python3.8 + +RUN python -m pip install --upgrade pip + +ENV DAGS_FOLDER=/opt/airflow/dags/repo/dags + +COPY elaborate-baton-357506-f9435b87997e.json /usr/sa.json + +USER airflow + +COPY requirements.txt requirements.txt + +RUN pip install -r requirements.txt diff --git a/README.md b/README.md index fa3bc2c7d..8cac242b9 100644 --- a/README.md +++ b/README.md @@ -1,49 +1,123 @@ # Ethereum ETL Airflow -Read this article: https://cloud.google.com/blog/products/data-analytics/ethereum-bigquery-how-we-built-dataset +Read this article: [https://cloud.google.com/blog/products/data-analytics/ethereum-bigquery-how-we-built-dataset](https://cloud.google.com/blog/products/data-analytics/ethereum-bigquery-how-we-built-dataset) -## Setting up Airflow DAGs using Google Cloud Composer +# Setting up Airflow DAGs -### Create BigQuery Datasets +## Google Cloud Composer -- Sign in to BigQuery https://bigquery.cloud.google.com/ -- Create new datasets called `crypto_ethereum`, `crypto_ethereum_raw`, `crypto_ethereum_temp` +Assumes that you have `gcloud` installed and configured. 
If not, [install Google Cloud CLI](https://cloud.google.com/sdk/docs/install-sdk)
-### Create Google Cloud Storage bucket
-
-- Create a new Google Storage bucket to store exported files https://console.cloud.google.com/storage/browser
-
-### Create Google Cloud Composer environment
 Create a new Cloud Composer environment:
 
 ```bash
 export ENVIRONMENT_NAME=ethereum-etl-0
-gcloud composer environments create $ENVIRONMENT_NAME --location=us-central1 --zone=us-central1-a \
-    --disk-size=50GB --machine-type=n1-standard-2 --node-count=3 --python-version=3 --image-version=composer-1.17.6-airflow-1.10.15 \
-    --network=default --subnetwork=default
+export NODE_TYPE=n1-standard-2
+export ZONE=us-central1-a
+gcloud composer environments create $ENVIRONMENT_NAME \
+    --location=us-central1 \
+    --zone=$ZONE \
+    --disk-size=50GB \
+    --machine-type=$NODE_TYPE \
+    --node-count=3 \
+    --python-version=3 \
+    --image-version=composer-1.17.6-airflow-1.10.15 \
+    --network=default \
+    --subnetwork=default
+
+gcloud composer environments update $ENVIRONMENT_NAME \
+    --location=us-central1 \
+    --update-pypi-package=ethereum-etl==1.7.2
+```
 
-gcloud composer environments update $ENVIRONMENT_NAME --location=us-central1 --update-pypi-package=ethereum-etl==1.7.2
+### Upload DAGs
+
+```bash
+> ./upload_dags.sh
 ```
 
-Create variables in Airflow (**Admin > Variables** in the UI):
+## AWS EKS
 
-| Variable | Description |
-|-----------------------------------------|-----------------------------------------|
-| ethereum_output_bucket | GCS bucket to store exported files |
-| ethereum_provider_uris | Comma separated URIs of Ethereum nodes |
-| ethereum_destination_dataset_project_id | Project ID of BigQuery datasets |
-| notification_emails | email for notifications |
+Assumes you have Docker, kubectl, Helm, eksctl and the AWS CLI installed and configured.
+The ECR repository is created if it does not exist.
 
-Check other variables in `dags/ethereumetl_airflow/variables.py`.
+Airflow comes with its own Postgres container as well, but for most purposes
+an external Postgres connection is recommended.
+Set one up beforehand; the deploy script below requires it (`--pg-url`).
 
-### Upload DAGs
 ```bash
-> ./upload_dags.sh
+export ENVIRONMENT_NAME=ethereum-etl-0
+export NODE_TYPE=m5.large
+export ZONE=ap-south-1
+eksctl create cluster \
+  --name $ENVIRONMENT_NAME \
+  --region $ZONE
+eksctl create nodegroup \
+  --cluster $ENVIRONMENT_NAME \
+  --region $ZONE \
+  --name $ENVIRONMENT_NAME-nodegroup \
+  --node-type $NODE_TYPE \
+  --nodes 3 \
+  --nodes-min 2 \
+  --nodes-max 10
+./deploy-airflow.sh \
+  -n $ENVIRONMENT_NAME \
+  --pg-url USER:PASSWORD@HOST:PORT/DB \
+  --ecs-host 289344454031.dkr.ecr.ap-south-1.amazonaws.com \
+  --image-name ethetl \
+  --image-tag latest \
+  --build-image \
+  --fernet-key 15NrZQ5lfysmX9HggBJgl8qlFVrsTys8-XJcK_qN4hQ=
 ```
 
-### Running Tests
+You might also want to change the reclaim policy of the airflow-worker-logs volume so that logs are retained when you redeploy. To do this, follow the [retain volume steps](https://kubernetes.io/docs/tasks/administer-cluster/change-pv-reclaim-policy/).
+
+To access the Airflow UI, enable port forwarding by running
+
+`kubectl port-forward svc/airflow-webserver 8080:8080 --namespace $ENVIRONMENT_NAME`.
+
+You can now log in at [http://localhost:8080](http://localhost:8080/).
+
+
+# Creating variables
+
+Create variables by following the steps in [Variables](docs/Variables.md) and importing them into the Airflow UI.
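+
+If you prefer the CLI over the UI, the same file can also be imported with `airflow variables --import`.
+The snippet below is only a sketch: the Composer data path and the `component=webserver` pod label are assumptions and may differ in your deployment.
+
+```bash
+# Cloud Composer: stage the file in the environment's data folder, then import it.
+gcloud composer environments storage data import \
+    --environment=$ENVIRONMENT_NAME --location=us-central1 --source=variables.json
+gcloud composer environments run $ENVIRONMENT_NAME --location=us-central1 \
+    variables -- --import /home/airflow/gcs/data/variables.json
+
+# AWS EKS: copy the file into the webserver pod and import it there.
+POD=$(kubectl get pods -n $ENVIRONMENT_NAME -l component=webserver -o jsonpath='{.items[0].metadata.name}')
+kubectl cp variables.json $ENVIRONMENT_NAME/$POD:/tmp/variables.json
+kubectl exec -n $ENVIRONMENT_NAME $POD -- airflow variables --import /tmp/variables.json
+```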
+
+# Creating Connections
+
+You will need to [create connections in the Airflow UI](https://airflow.apache.org/docs/apache-airflow/stable/howto/connection.html#creating-a-connection-with-the-ui) for connecting to the cloud
+provider of your choice.
+
+## GCP
+
+1. If you want to use GCP for storage and processing of the data, then
+create a service account for your GCP app.
+[Follow the instructions](https://cloud.google.com/docs/authentication/production).
+Store the JSON file somewhere secure. You will need its contents for the next step.
+2. You'll need to create the following connection IDs.
+[Refer here for the specifics](https://airflow.apache.org/docs/apache-airflow/1.10.13/howto/connection/gcp.html). Copy the contents of the above JSON file into the `Keyfile JSON` field.
+
+- `google_cloud_default`
+- `bigquery_default`
+
+# Starting DAGs
+
+Once in the Airflow UI, make sure to start the following DAGs:
+
+- airflow_monitoring
+- ethereum_amend_dag
+- ethereum_clean_dag
+- ethereum_export_dag
+- ethereum_load_dag
+- ethereum_partition_dag
+- ethereum_sessions_dag
+- ethereum_verify_streaming_dag
+
+There are 120+ other DAGs that parse contract-specific logic. You can optionally choose to start some, all, or none of them.
+
+# Running Tests
 
 ```bash
 pip install -r requirements.txt
@@ -51,12 +125,12 @@ export PYTHONPATH='dags'
 pytest -vv -s
 ```
 
-### Creating Table Definition Files for Parsing Events and Function Calls
+# Creating Table Definition Files for Parsing Events and Function Calls
 
-Read this article: https://medium.com/@medvedev1088/query-ens-and-0x-events-with-sql-in-google-bigquery-4d197206e644
+Read this article: [https://medium.com/@medvedev1088/query-ens-and-0x-events-with-sql-in-google-bigquery-4d197206e644](https://medium.com/@medvedev1088/query-ens-and-0x-events-with-sql-in-google-bigquery-4d197206e644)
 
-### More Information
+# More Information
 
-You can follow the instructions here for Polygon DAGs https://github.com/blockchain-etl/polygon-etl. The architecture
+You can follow the instructions here for Polygon DAGs [https://github.com/blockchain-etl/polygon-etl](https://github.com/blockchain-etl/polygon-etl). The architecture
 there is very similar to Ethereum so in most case substituting `polygon` for `ethereum` will work. Contributions to this
 README file for porting documentation from Polygon to Ethereum are welcome.
\ No newline at end of file
diff --git a/deploy-airflow.sh b/deploy-airflow.sh
new file mode 100755
index 000000000..bd27dfb3d
--- /dev/null
+++ b/deploy-airflow.sh
@@ -0,0 +1,114 @@
+#!/bin/bash
+while [[ $# -gt 0 ]]; do
+  case $1 in
+    -n|--namespace)
+      NAMESPACE="$2"
+      shift # past argument
+      shift # past value
+      ;;
+    -p|--pg-url)
+      PG_URL="$2"
+      shift # past argument
+      shift # past value
+      ;;
+    -f|--fernet-key)
+      FERNET_KEY="$2"
+      shift # past argument
+      shift # past value
+      ;;
+    -e|--ecs-host)
+      ECS_HOST="$2"
+      shift # past argument
+      shift # past value
+      ;;
+    --image-name)
+      IMAGE_NAME="$2"
+      shift # past argument
+      shift # past value
+      ;;
+    --image-tag)
+      IMAGE_TAG="$2"
+      shift # past argument
+      shift # past value
+      ;;
+    --build-image)
+      BUILD_IMAGE=true
+      shift # past argument
+      ;;
+    *)
+      echo "Unknown argument: $1"
+      exit 1
+      ;;
+  esac
+done
+
+# Check all required arguments
+if [[ -z "$NAMESPACE" || -z "$PG_URL" || -z "$ECS_HOST" || -z "$IMAGE_NAME" || -z "$IMAGE_TAG" || -z "$FERNET_KEY" ]];
+then
+  echo "Missing one or more required arguments."
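+  # Usage hint so the failure is actionable; the flag names below are taken from the parser above.
+  echo "Usage: $0 -n NAMESPACE -p PG_URL -e ECS_HOST -f FERNET_KEY --image-name NAME --image-tag TAG [--build-image]"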
+  exit 1
+fi
+
+# Prepare some arguments
+PROJECT_DIR=$(cd $(dirname $0);pwd)
+TEMP_DIR="$PROJECT_DIR"/.tmp
+HELM_VALUE_YAML="$TEMP_DIR"/value.yaml
+IMAGE_REPOSITORY="$ECS_HOST/$IMAGE_NAME"
+
+if [ ! -z "$BUILD_IMAGE" ]
+then
+  # Create the repo in ECR (a failure here is non-fatal if the repo already exists)
+  aws ecr create-repository --repository-name "$IMAGE_NAME"
+
+  # Login to ECR
+  aws ecr get-login-password --region ap-south-1 | docker login --username AWS --password-stdin "$ECS_HOST"
+
+  # Build and push the image
+  docker buildx build \
+    --platform linux/amd64 \
+    --file Dockerfile \
+    --no-cache \
+    --load \
+    -t "$IMAGE_REPOSITORY:$IMAGE_TAG" .
+
+  # Check if the build succeeded
+  if [ $? -ne 0 ]
+  then
+    echo "Docker Build failed. Not proceeding."
+    exit 1
+  fi
+
+  docker push "$IMAGE_REPOSITORY:$IMAGE_TAG"
+
+  # Check if the push succeeded
+  if [ $? -ne 0 ]
+  then
+    echo "Docker Push failed. Not proceeding."
+    exit 1
+  fi
+
+fi
+
+# Create temp folder and write helm values yaml to it.
+mkdir -p -- "$TEMP_DIR"
+
+# shellcheck disable=SC2002
+cat "$PROJECT_DIR"/helm-values.yaml | \
+  sed "s={{IMAGE_REPOSITORY}}=$IMAGE_REPOSITORY=" | \
+  sed "s={{IMAGE_TAG}}=$IMAGE_TAG=" | \
+  sed "s/{{FERNET_KEY}}/$FERNET_KEY/" > "$HELM_VALUE_YAML"
+
+# Recreate namespace and install all resources.
+kubectl delete namespace "$NAMESPACE"
+kubectl create namespace "$NAMESPACE"
+
+kubectl create secret generic airflow-database --from-literal=connection=postgresql+psycopg2://"$PG_URL" -n "$NAMESPACE"
+
+kubectl create secret generic airflow-result-database --from-literal=connection=db+postgresql://"$PG_URL" -n "$NAMESPACE"
+
+kubectl create secret generic airflow-webserver-secret --from-literal="webserver-secret-key=$(python3 -c 'import secrets; print(secrets.token_hex(16))')" -n "$NAMESPACE"
+
+# Assumes the Apache Airflow Helm repo has been added: helm repo add apache-airflow https://airflow.apache.org
+helm upgrade --install airflow apache-airflow/airflow --namespace "$NAMESPACE" --create-namespace -f "$HELM_VALUE_YAML" --debug
+
+# Clean up temp folder
+rm -rf "$TEMP_DIR"
diff --git a/docs/Variables.md b/docs/Variables.md
new file mode 100644
index 000000000..5356d4d05
--- /dev/null
+++ b/docs/Variables.md
@@ -0,0 +1,129 @@
+# Airflow Variables
+
+There are many variables that you need to set before the first successful run.
+A sample [variables.example.json](../variables.example.json) is provided. You should copy
+the file, rename it to `variables.json` and import it in the
+[airflow variables](http://localhost:8080/variable/list/) UI.
+
+This document describes the various keys of the JSON file, and the steps required to
+set them up.
+
+# Cloud Provider
+
+Regardless of where you have deployed your app (AWS or GCP), you can choose
+which platform will be used for storing and processing the ETL'd data.
+
+
+## ethereum_cloud_provider
+
+Ethereum-ETL supports two cloud providers - aws and gcp. The default is `gcp`.
+You can set this by
+
+```json
+    "ethereum_cloud_provider: "aws",
+```
+
+or
+
+```json
+    "ethereum_cloud_provider": "gcp",
+```
+
+## ethereum_destination_dataset_project_id
+
+Project ID of BigQuery or Redshift datasets.
+
+### BigQuery
+
+If you don't have a project created, create one.
+Within this project, create new datasets called
+`crypto_ethereum`, `crypto_ethereum_raw` and `crypto_ethereum_temp`.
+Copy the ID of this project to be used in this variable.
+
+### AWS
+
+eth-redshift.cjhr02tuz8yb.ap-south-1.redshift.amazonaws.com:5439/dev
+
+
+## ethereum_output_bucket
+
+GCS or S3 bucket to store exported files.
+
+Create a new Google Storage bucket to store exported files [https://console.cloud.google.com/storage/browser](https://console.cloud.google.com/storage/browser)
+
+Create a new AWS S3 bucket to store exported files.
+
+There are additional cloud-specific steps to
+ensure that the Airflow app is able to write to and read from this storage bucket.
+Those are not covered here.
+
+
+```json
+    "ethereum_output_bucket": "dev-etl-01",
+```
+
+# Web3 Providers
+
+## ethereum_provider_uris
+
+Comma-separated URIs of Ethereum nodes.
+
+```json
+    "ethereum_provider_uris": "https://eth-mainnet.g.alchemy.com/v2/ALCHEMY_API",
+```
+
+## ethereum_price_provider_key
+
+The API key of the price provider.
+
+```json
+    "ethereum_price_provider_key": "YOUR_KEY",
+```
+
+# Ethereum Configs
+
+These are optional and by default set to `false`. Set them to `true` as per your needs. The key names are self-explanatory.
+
+```json
+    "ethereum_export_daofork_traces_option": false,
+    "ethereum_export_genesis_traces_option": false,
+    "ethereum_export_blocks_and_transactions_toggle": false,
+    "ethereum_export_receipts_and_logs_toggle": false,
+    "ethereum_extract_contracts_toggle": false,
+    "ethereum_extract_tokens_toggle": false,
+    "ethereum_extract_token_transfers_toggle": false,
+    "ethereum_export_traces_toggle": false,
+    "ethereum_load_start_date": "2022-01-01"
+```
+
+# Notification
+
+## notification_emails
+
+Comma-separated emails for notifications
+
+```json
+    "notification_emails": "test@example.com,test2@example.com",
+```
+
+# Execution Options
+
+
+```json
+    "ethereum_max_lag_in_minutes": 1,
+    "ethereum_export_batch_size": 150,
+    "ethereum_export_max_active_runs": 2,
+    "ethereum_export_max_workers": 2,
+    "ethereum_export_prices_usd_toggle": "False",
+    "ethereum_export_retries": 1,
+    "ethereum_export_schedule_interval": "0 0 * * *",
+
+```
+
+# Optimisation
+
+## ethereum_load_all_partitions
+
+```json
+    "ethereum_load_all_partitions": false,
+```
\ No newline at end of file
diff --git a/helm-values.yaml b/helm-values.yaml
new file mode 100644
index 000000000..ba7428149
--- /dev/null
+++ b/helm-values.yaml
@@ -0,0 +1,47 @@
+# Default airflow tag to deploy
+defaultAirflowTag: "1.10.15"
+
+# Airflow version (Used to make some decisions based on Airflow Version being deployed)
+airflowVersion: "1.10.15"
+
+images:
+  airflow:
+    repository: {{IMAGE_REPOSITORY}}
+    tag: "{{IMAGE_TAG}}"
+
+# Airflow database & redis config
+data:
+  metadataSecretName: airflow-database
+  resultBackendSecretName: airflow-result-database
+
+# Flask secret key for Airflow Webserver: `[webserver] secret_key` in airflow.cfg
+webserverSecretKeySecretName: airflow-webserver-secret
+
+# Fernet key settings
+# Note: fernetKey can only be set during install, not upgrade
+fernetKey: {{FERNET_KEY}}
+
+# PgBouncer settings
+pgbouncer:
+  # Enable PgBouncer
+  enabled: true
+
+# Configuration for postgresql subchart
+# Not recommended for production
+postgresql:
+  enabled: false
+
+# Git sync
+dags:
+  gitSync:
+    enabled: true
+
+    # git repo clone url
+    # ssh examples ssh://git@github.com/apache/airflow.git
+    # git@github.com:apache/airflow.git
+    # https example: https://github.com/apache/airflow.git
+    repo: https://github.com/bitpack-me/ethereum-etl-airflow.git
+    branch: master
+    # subpath within the repo where dags are located
+    # should be "" if dags are at repo root
+    subPath: "/dags"
diff --git a/variables.example.json b/variables.example.json
new file mode 100644
index 000000000..6d55b7ba3
--- /dev/null
+++ b/variables.example.json
@@ -0,0 +1,26 @@
+{
"ethereum_cloud_provider": "gcp", + "ethereum_destination_dataset_project_id": "ethereum-destination-dataset", + "ethereum_parse_destination_dataset_project_id": "ethereum", + "ethereum_max_lag_in_minutes": 1, + "ethereum_export_batch_size": 150, + "ethereum_export_max_active_runs": 2, + "ethereum_export_max_workers": 2, + "ethereum_export_prices_usd_toggle": "False", + "ethereum_export_retries": 1, + "ethereum_export_schedule_interval": "0 0 * * *", + "ethereum_export_start_date": "2022-01-01", + "ethereum_output_bucket": "dev-etl-01", + "ethereum_price_provider_key": "KEY", + "ethereum_provider_uris": "https://eth-mainnet.g.alchemy.com/v2/API_KEY", + "ethereum_load_all_partitions": false, + "ethereum_export_daofork_traces_option": false, + "ethereum_export_genesis_traces_option": false, + "ethereum_export_blocks_and_transactions_toggle": false, + "ethereum_export_receipts_and_logs_toggle": false, + "ethereum_extract_contracts_toggle": false, + "ethereum_extract_tokens_toggle": false, + "ethereum_extract_token_transfers_toggle": false, + "ethereum_export_traces_toggle": false, + "ethereum_load_start_date": "2022-01-01" +} \ No newline at end of file From f731983a90a30d09845de1da2b85b6991d3612e9 Mon Sep 17 00:00:00 2001 From: r48b1t Date: Sun, 31 Jul 2022 20:57:41 +0530 Subject: [PATCH 2/2] More Updates --- README.md | 19 ++++++++++ dags/ethereumetl_airflow/variables.py | 2 +- docs/Variables.md | 52 ++++++++++++++++++++++++--- requirements.txt | 2 +- upload_dags.sh | 0 5 files changed, 68 insertions(+), 7 deletions(-) mode change 100644 => 100755 upload_dags.sh diff --git a/README.md b/README.md index 8cac242b9..3ce7a6df3 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,25 @@ Read this article: [https://cloud.google.com/blog/products/data-analytics/ethereum-bigquery-how-we-built-dataset](https://cloud.google.com/blog/products/data-analytics/ethereum-bigquery-how-we-built-dataset) +## Support + +The repo itself can be deployed on any compute cloud platform. +Certain DAGs are available only for GCP. + + +| Feature | GCP | AWS | +|---------|-------|-------| +| Can be deployed? | Yes | Yes | +| ethereum_amend_dag | Yes | No | +| ethereum_clean_dag | Yes ? | Yes ? | +| ethereum_export_dag | Yes (Provider → BigQuery)| Yes (Provider → S3) | +| ethereum_load_dag | Yes (BigQuery → Storage) | Yes (S3 → RedShift) | +| ethereum_parse_dag | Yes | No | +| ethereum_partition_dag | Yes | No | +| ethereum_sessions_dag | Yes | No | +------------------------------------ + + # Setting up Airflow DAGs ## Google Cloud Composer diff --git a/dags/ethereumetl_airflow/variables.py b/dags/ethereumetl_airflow/variables.py index 89aab6895..2d5b4f057 100644 --- a/dags/ethereumetl_airflow/variables.py +++ b/dags/ethereumetl_airflow/variables.py @@ -6,7 +6,7 @@ def read_export_dag_vars(var_prefix, **kwargs): export_start_date = read_var('export_start_date', var_prefix, True, **kwargs) export_start_date = datetime.strptime(export_start_date, '%Y-%m-%d') - + provider_uris = read_var('provider_uris', var_prefix, True, **kwargs) provider_uris = [uri.strip() for uri in provider_uris.split(',')] diff --git a/docs/Variables.md b/docs/Variables.md index 5356d4d05..53490f8d4 100644 --- a/docs/Variables.md +++ b/docs/Variables.md @@ -20,29 +20,71 @@ Ethereum-ETL Supports two cloud providres - aws and gcp. The default is `gcp`. 
You can set this by ```json - "ethereum_cloud_provider: "aws", + "cloud_provider": "aws", ``` or ```json - "ethereum_cloud_provider": "gcp", + "cloud_provider": "gcp", ``` ## ethereum_destination_dataset_project_id -Project ID of BigQuery or Redshift datasets. +Project ID of Google Cloud Project which has the datasets. -### BigQuery +### GCP If you don't have a project created, create one. Within this project, create new datasets called `crypto_ethereum`, `crypto_ethereum_raw` and `crypto_ethereum_temp`. Copy the ID of this project to be used in this variable. +```json + "ethereum_destination_dataset_project_id": "COPIED_ID_OF_GCP_PROJECT", +``` + +### AWS + +This is not supported. However, dummy values are required to be set for the DAG Imports to happen correctly. + +```json + "ethereum_destination_dataset_project_id": "dummy", +``` + +## ethereum_parse_destination_dataset_project_id + +### GCP + +Project ID of Google Cloud Project which has the datasets. + +```json + "ethereum_parse_destination_dataset_project_id": "ID_OF_GCP_PROJECT", +``` + ### AWS -eth-redshift.cjhr02tuz8yb.ap-south-1.redshift.amazonaws.com:5439/dev +This is not supported. However, dummy values are required to be set for the DAG Imports to happen correctly. + +```json + "ethereum_parse_destination_dataset_project_id": "dummy", +``` + +## ethereum_aws_access_key_id + +This is used only when `cloud_provider` is set to `aws`. + +```json + "ethereum_aws_access_key_id": "AWS_ACCESS_KEY_ID", +``` + +## ethereum_aws_secret_access_key + +This is used only when `cloud_provider` is set to `aws`. + +```json + "ethereum_aws_secret_access_key": "AWS_SECRET_KEY", +``` ## ethereum_output_bucket diff --git a/requirements.txt b/requirements.txt index 1b70de8bc..4fdd49f15 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,4 @@ apache-airflow[gcp]==1.10.15 ethereum-etl==1.7.2 google-cloud-bigquery pytest==5.4.1 -jsonschema==4.4.0 +jsonschema diff --git a/upload_dags.sh b/upload_dags.sh old mode 100644 new mode 100755