Swap sqlite for postgres, update Airflow to 2.9.0, Python to 3.12 (#66)
* Swap to using postgres as the backing DB

* Allow DB password to be defined via env file

* Simplify environment variables

* Remove unnecessary deps recipe, add init recipe

* Bump Airflow to 2.9.0, Python to 3.12

* Remove --user install

* Unpin pytest-mock since it's managed by constraints
AetherUnbound authored Apr 29, 2024
1 parent bad84fb commit 2889db2
Showing 7 changed files with 43 additions and 30 deletions.
4 changes: 2 additions & 2 deletions .env.template
@@ -19,7 +19,7 @@ AIRFLOW__CORE__HIDE_SENSITIVE_VAR_CONN_FIELDS=False
# CSRF key https://airflow.apache.org/docs/apache-airflow/stable/configurations-ref.html#secret-key
AIRFLOW__WEBSERVER__SECRET_KEY=sample-secret-key=
# Executor to use
AIRFLOW__CORE__EXECUTOR=SequentialExecutor
AIRFLOW__CORE__EXECUTOR=LocalExecutor
# Environment this instance is being run in
AIRFLOW_VAR_ENVIRONMENT=dev

@@ -28,7 +28,7 @@ AIRFLOW_VAR_ENVIRONMENT=dev
########################################################################################
# Airflow primary metadata database
# Change the following line in prod to use the appropriate DB
AIRFLOW__DATABASE__SQL_ALCHEMY_CONN=sqlite:////opt/airflow/db/airflow.db
AIRFLOW__DATABASE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres:5432/airflow
# Remote logging connection ID
# Replace "access_key" and "secret+key" with the real values. Secret key must be URL-encoded
AIRFLOW_CONN_AWS_DEFAULT=aws://test_key:test_secret@?region_name=us-east-1&endpoint_url=http://s3:5000
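The template's comment notes that the AWS secret key must be URL-encoded before it goes into AIRFLOW_CONN_AWS_DEFAULT. A minimal sketch of that step using the Python standard library; the secret value and variable names below are made-up placeholders, not values from the repo:

# Sketch: URL-encode a secret before embedding it in an Airflow connection URI.
from urllib.parse import quote_plus

raw_secret = "abc/def+ghi=="            # hypothetical raw AWS secret key
encoded = quote_plus(raw_secret)         # -> "abc%2Fdef%2Bghi%3D%3D"
conn = (
    f"aws://test_key:{encoded}@"
    "?region_name=us-east-1&endpoint_url=http://s3:5000"
)
print(conn)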
16 changes: 7 additions & 9 deletions Dockerfile
@@ -14,9 +14,8 @@ ENV DAGS_FOLDER=${AIRFLOW_HOME}/techbloc_airflow/dags
ENV PYTHONPATH=${DAGS_FOLDER}

# Container optimizations
ENV PIPNOCACHEDIR=1
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV PIP_NO_CACHE_DIR=1
ENV PIP_NO_COLOR=1

# Airflow/workflow configuration
@@ -32,13 +31,12 @@ ENV AIRFLOW__LOGGING__REMOTE_LOG_CONN_ID=aws_default
ENV AIRFLOW__LOGGING__REMOTE_BASE_LOG_FOLDER=s3://techbloc-airflow-logs


#RUN apt-get update && apt-get -yqq upgrade && apt-get -yqq install \
# build-essential \
# libpq-dev \
# libffi-dev \
# && apt-get autoremove -y \
# && rm -rf /var/lib/apt/lists/*
USER root
RUN apt-get update && apt-get -yqq install \
build-essential \
libpq-dev \
&& apt-get autoremove -y \
&& rm -rf /var/lib/apt/lists/*
RUN mkdir -p ${DATABASE_DIR} /home/airflow/.ipython/ /opt/ssh/ && \
chown -R airflow ${DATABASE_DIR} /home/airflow/.ipython/ /opt/ssh/
USER airflow
@@ -54,7 +52,7 @@ ARG PROJECT_AIRFLOW_VERSION
# https://airflow.apache.org/docs/apache-airflow/stable/installation/installing-from-pypi.html#constraints-files
ARG CONSTRAINTS_FILE="https://raw.githubusercontent.com/apache/airflow/constraints-${PROJECT_AIRFLOW_VERSION}/constraints-${PROJECT_PY_VERSION}.txt"

RUN pip install --user -r ${REQUIREMENTS_FILE} -c ${CONSTRAINTS_FILE}
RUN pip install -r ${REQUIREMENTS_FILE} -c ${CONSTRAINTS_FILE}

COPY entrypoint.sh /opt/airflow/entrypoint.sh

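The install step above leans on Airflow's published constraints files (linked in the comment). A small sketch of how the ARG-based URL is expected to expand for this commit, assuming PROJECT_AIRFLOW_VERSION=2.9.0 and PROJECT_PY_VERSION=3.12 from the commit title, with requirements_prod.txt standing in for the requirements file:

# Sketch: constraints URL for Airflow 2.9.0 on Python 3.12 (versions assumed from the commit title).
airflow_version = "2.9.0"
py_version = "3.12"
constraints = (
    "https://raw.githubusercontent.com/apache/airflow/"
    f"constraints-{airflow_version}/constraints-{py_version}.txt"
)
print(constraints)
# The image build then runs, in effect:
print(f"pip install -r requirements_prod.txt -c {constraints}")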
4 changes: 2 additions & 2 deletions docker-compose.dev.yml
@@ -7,6 +7,7 @@ x-airflow-common:
&airflow-common
restart: on-failure
depends_on:
- postgres
- s3
build:
context: .
@@ -56,8 +57,6 @@ services:
# Dev changes for the scheduler
scheduler:
<<: *airflow-common
depends_on:
- s3
environment:
_AIRFLOW_WWW_USER_CREATE: "true"
_AIRFLOW_WWW_USER_USERNAME: airflow
@@ -72,6 +71,7 @@ services:
<<: *airflow-common
depends_on:
- s3
- postgres
- scheduler

volumes:
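One detail worth noting about the depends_on changes above: the <<: *airflow-common merge key merges mappings, but an explicit depends_on on a service replaces the anchored list rather than extending it. That is likely why postgres is added to the shared anchor, the scheduler's own depends_on override is dropped, and the webserver (which still overrides) has to list postgres itself. A minimal sketch with PyYAML, not taken from the repo:

# Sketch: a service-level depends_on replaces, not extends, the list merged in via the anchor.
import yaml

compose_like = """
x-airflow-common: &airflow-common
  depends_on:
    - postgres
    - s3

scheduler:
  <<: *airflow-common

webserver:
  <<: *airflow-common
  depends_on:
    - s3
    - postgres
    - scheduler
"""

parsed = yaml.safe_load(compose_like)
print(parsed["scheduler"]["depends_on"])   # ['postgres', 's3'] -- inherited via the merge key
print(parsed["webserver"]["depends_on"])   # ['s3', 'postgres', 'scheduler'] -- override wins outright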
23 changes: 19 additions & 4 deletions docker-compose.yml
@@ -1,25 +1,41 @@
version: '3'

services:
postgres:
image: postgres:13
environment:
POSTGRES_USER: airflow
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-airflow}
POSTGRES_DB: airflow
volumes:
- airflow-db:/var/lib/postgresql/data
healthcheck:
test: ["CMD", "pg_isready", "-U", "airflow"]
interval: 10s
retries: 5
restart: always

scheduler:
image: ghcr.io/orcacollective/techbloc-airflow:${DOCKER_IMAGE_TAG:-latest}
depends_on:
- postgres
env_file: .env
restart: always
# Only necessary for the entrypoint, services run as "airflow"
user: root
environment:
# Upgrade the DB on startup
_AIRFLOW_DB_UPGRADE: "true"
_AIRFLOW_DB_MIGRATE: "true"
command: scheduler
expose:
- "8793" # Used for fetching logs
volumes:
- ./techbloc_airflow/dags:/opt/airflow/techbloc_airflow/dags
- db:/opt/airflow/db

webserver:
image: ghcr.io/orcacollective/techbloc-airflow:${DOCKER_IMAGE_TAG:-latest}
depends_on:
- postgres
env_file: .env
restart: always
# Only necessary for the entrypoint, services run as "airflow"
@@ -29,7 +45,6 @@ services:
- "${AIRFLOW_PORT}:8080"
volumes:
- ./techbloc_airflow/dags:/opt/airflow/techbloc_airflow/dags
- db:/opt/airflow/db

volumes:
db:
airflow-db:
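The new postgres service gates readiness with pg_isready. For reference, a rough Python-side equivalent using psycopg2 (pulled in by the postgres extra) with the same defaults as this compose file, including the "airflow" fallback when POSTGRES_PASSWORD is unset; this is an illustration, not something the stack runs:

# Sketch: a readiness probe roughly equivalent to the pg_isready healthcheck above.
import os
import time

import psycopg2

def wait_for_postgres(retries: int = 5, interval: float = 10.0) -> None:
    password = os.environ.get("POSTGRES_PASSWORD", "airflow")  # same fallback as the compose file
    for attempt in range(1, retries + 1):
        try:
            psycopg2.connect(
                host="postgres", port=5432,
                user="airflow", password=password, dbname="airflow",
            ).close()
            print("postgres is ready")
            return
        except psycopg2.OperationalError:
            print(f"attempt {attempt}/{retries}: postgres not ready yet")
            time.sleep(interval)
    raise SystemExit("postgres never became ready")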
20 changes: 10 additions & 10 deletions justfile
@@ -81,12 +81,8 @@ deploy:
lint:
pre-commit run --all-files

# Load any dependencies necessary for actions on the stack without running the webserver
_deps:
@just up "s3"

# Mount the tests directory and run a particular command
@_mount-tests command: _deps
@_mount-tests command:
# The test directory is mounted into the container only during testing
@just _dc run \
-v {{ justfile_directory() }}/tests:/opt/airflow/tests/ \
@@ -99,26 +95,30 @@ _deps:
test-session:
@just _mount-tests bash

# Run pytest using the webserver image
# Run pytest using the scheduler image
test *pytestargs:
@just _mount-tests "bash -c \'pytest {{ pytestargs }}\'"

# Open a shell into the webserver container
# Open a shell into the scheduler container
shell user="airflow": up
@just _dc exec -u {{ user }} {{ SERVICE }} /bin/bash

# Launch an IPython REPL using the webserver image
ipython: _deps
# Launch an IPython REPL using the scheduler image
ipython:
@just _dc run \
--rm \
-w /opt/airflow/techbloc_airflow/dags \
{{ SERVICE }} \
bash -c \'ipython\'

# Run a given command in bash using the scheduler image
run *args: _deps
run *args:
@just _dc run --rm {{ SERVICE }} bash -c \'{{ args }}\'

# Initialize the database
init:
@just run airflow db init

# Launch a pgcli shell on the postgres container (defaults to openledger) use "airflow" for airflow metastore
db-shell:
@just run bash -c 'sqlite3 /opt/airflow/db/airflow.db'
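The test and test-session recipes above mount ./tests into the scheduler image and run pytest there. As a purely illustrative example of the kind of test `just test` would collect — the file, function, and client names are hypothetical — using the mocker fixture that pytest-mock provides:

# Hypothetical tests/test_example.py; names are made up for illustration.
def fetch_greeting(client):
    return client.get("/greeting").json()["message"]

def test_fetch_greeting(mocker):
    client = mocker.Mock()  # mocker comes from the pytest-mock plugin
    client.get.return_value.json.return_value = {"message": "hello"}
    assert fetch_greeting(client) == "hello"
    client.get.assert_called_once_with("/greeting")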
2 changes: 1 addition & 1 deletion requirements_dev.txt
@@ -2,7 +2,7 @@

flaky==3.7.0
ipython
pytest-mock==3.11.1
pytest-mock
pytest-raises==0.11
pytest-sugar==0.9.7
pytest-xdist
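pytest-mock is left unpinned above because its version now comes from the Airflow constraints file passed to pip in the Dockerfile. A small sketch of checking what that file pins it to; the URL is assumed from the Dockerfile's pattern for Airflow 2.9.0 on Python 3.12:

# Sketch: look up the version the Airflow constraints file pins for pytest-mock.
import re
import urllib.request

url = (
    "https://raw.githubusercontent.com/apache/airflow/"
    "constraints-2.9.0/constraints-3.12.txt"
)
with urllib.request.urlopen(url) as resp:
    text = resp.read().decode()

match = re.search(r"^pytest-mock==(.+)$", text, flags=re.MULTILINE)
print(match.group(1) if match else "pytest-mock is not pinned in this file")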
4 changes: 2 additions & 2 deletions requirements_prod.txt
@@ -1,2 +1,2 @@
# PYTHON=3.10
apache-airflow[amazon,sqlite,http,ssh]==2.4.3
# PYTHON=3.12
apache-airflow[amazon,postgres,http,ssh]==2.9.0
