feat: Add airflow compose
1ambda committed Sep 2, 2023
1 parent 2d6da23 commit d88d0a0
Showing 5 changed files with 231 additions and 7 deletions.
37 changes: 37 additions & 0 deletions .envrc
@@ -0,0 +1,37 @@
use_python() {
if [ -n "$(which pyenv)" ]; then
local pyversion=$1
pyenv local ${pyversion}
fi
}

layout_virtualenv() {
local pyversion=$1
local pvenv=$2
if [ -n "$(which pyenv virtualenv)" ]; then
pyenv virtualenv --force --quiet ${pyversion} ${pvenv}-${pyversion}
fi
pyenv local --unset
}

layout_activate() {
if [ -n "$(which pyenv)" ]; then
source $(pyenv root)/versions/$1/bin/activate
fi
}

pyversion=3.10.13
pvenv=lakehouse

# Setup Python
use python ${pyversion}
layout virtualenv ${pyversion} ${pvenv}
layout activate ${pvenv}-${pyversion}
pip install -q -r requirements.txt

# Start
echo -e ""
echo -e ""
echo -e "[direnv] Lakehouse environment is ready"
echo -e ""
echo -e ""
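The `.envrc` above is a direnv file: `use python`, `layout virtualenv`, and `layout activate` dispatch to the `use_python`, `layout_virtualenv`, and `layout_activate` functions defined at the top. A minimal sketch of enabling it, assuming direnv plus pyenv with pyenv-virtualenv are already installed:

```bash
# Hook direnv into the shell once (bash shown; other shells are analogous),
# typically from ~/.bashrc:
eval "$(direnv hook bash)"

# From the repository root, approve this .envrc so direnv may execute it.
# direnv then builds/activates the lakehouse-3.10.13 pyenv virtualenv and
# installs requirements.txt whenever you enter the directory.
direnv allow .
```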
11 changes: 7 additions & 4 deletions README.md
@@ -1,27 +1,30 @@
# Lakehouse Playground

- [x] Spark 3.3 - 3.4 (Iceberg 1.3.1, Hudi 0.13.1)
- [x] Flink 1.16 (Iceberg 1.3.1, Hudi 0.13.1)
- [x] Spark 3.3 ~ 3.4 (Iceberg 1.3.1, Hudi 0.13.1)
- [x] Flink 1.16 ~ 1.17 (Iceberg 1.3.1, Hudi 0.13.1)
- [x] Trino 425
- [x] Airflow 2.7.0
- [x] Jupyterlab

## Getting Started

Execute compose containers first
Execute compose containers first.

```bash
# Use `COMPOSE_PROFILES` to select the profile
COMPOSE_PROFILES=trino docker-compose up;
COMPOSE_PROFILES=spark docker-compose up;
COMPOSE_PROFILES=flink docker-compose up;
COMPOSE_PROFILES=airflow docker-compose up;

# Combine multiple profiles
COMPOSE_PROFILES=trino,spark docker-compose up;
```

Then access the following services
Then access the lakehouse services.

- Trino: http://localhost:8889
- Airflow (`airflow` / `airflow`) : http://localhost:8080
- Local S3 Minio (`minio` / `minio123`): http://localhost:9000
- PySpark Jupyter Notebook (Iceberg): http://localhost:8900
- PySpark Jupyter Notebook (Hudi): http://localhost:8901
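Once a profile is up, a quick way to confirm the services respond is to hit their health endpoints. A hedged sketch, assuming the default ports listed above; the Airflow path mirrors the compose healthcheck in `docker-compose.yml`, and the MinIO/Trino paths are their standard probes:

```bash
# Airflow webserver health (same endpoint used by the compose healthcheck)
curl --fail http://localhost:8080/health

# MinIO liveness probe
curl --fail http://localhost:9000/minio/health/live

# Trino server info
curl --fail http://localhost:8889/v1/info
```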
Empty file added dags/.gitkeep
Empty file.
188 changes: 186 additions & 2 deletions docker-compose.yml
@@ -1,4 +1,42 @@
version: "3.9"

# Configure Modules
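# `x-airflow-common` defines a YAML anchor (&airflow-common); the Airflow
# services below merge it in via `<<: *airflow-common`, so the image,
# environment, volumes, and depends_on blocks are declared only once.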
x-airflow-common:
&airflow-common
image: apache/airflow:2.7.0-python3.10
environment:
&airflow-common-env
S3_ENDPOINT: http://minio:9000
S3_ACCESS_KEY: minio
S3_SECRET_KEY: minio123
S3_PATH_STYLE_ACCESS: "true"
AIRFLOW__CORE__EXECUTOR: CeleryExecutor
AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow
AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0
AIRFLOW__CORE__FERNET_KEY: ''
AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session'
AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true'
AIRFLOW_UID: '50000'
_AIRFLOW_WWW_USER_USERNAME: airflow
_AIRFLOW_WWW_USER_PASSWORD: airflow
_PIP_ADDITIONAL_REQUIREMENTS: dbt-core>=1.6.1 dbt-trino>=1.6.1 trino>=0.326.0
volumes:
- ./dags:/opt/airflow/dags
- ./docker/airflow/config:/opt/airflow/config
- ./docker/airflow/plugins:/opt/airflow/plugins
- ./docker/volume/airflow/logs:/opt/airflow/logs
user: "50000"
depends_on:
&airflow-common-depends-on
redis:
condition: service_healthy
postgres:
condition: service_healthy

services:

postgres:
@@ -42,7 +80,6 @@ services:
condition: service_healthy

minio:
profiles: [ "metastore" ]
container_name: minio
hostname: minio
image: 'minio/minio'
@@ -56,7 +93,6 @@
command: server /data --console-address ":9001"

minio-job:
profiles: [ "metastore" ]
image: 'minio/mc'
entrypoint: |
/bin/bash -c "
@@ -75,6 +111,12 @@
depends_on:
- minio

####################################################################################################
#
# Trino
#
####################################################################################################

trino:
profiles: [ "trino" ]
container_name: trino
@@ -103,6 +145,12 @@
trino:
condition: service_healthy

####################################################################################################
#
# Spark
#
####################################################################################################

spark-iceberg:
profiles: [ "spark" ]
build:
@@ -177,6 +225,11 @@
- ./docker/spark/spark-defaults-hudi.conf:/opt/spark/conf/spark-defaults.conf
- ./docker/spark/hudi-defaults.conf:/opt/hudi/conf/hudi-defaults.conf

####################################################################################################
#
# Flink
#
####################################################################################################

flink-jobmanager:
profiles: ["flink"]
@@ -258,6 +311,137 @@
depends_on:
- flink-jobmanager

####################################################################################################
#
# Airflow
#
####################################################################################################
airflow-webserver:
<<: *airflow-common
profiles: [ "airflow" ]
container_name: airflow-webserver
command: webserver
ports:
- "8080:8080"
healthcheck:
test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
interval: 30s
timeout: 10s
retries: 5
start_period: 30s
restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
condition: service_completed_successfully

airflow-scheduler:
<<: *airflow-common
profiles: [ "airflow" ]
container_name: airflow-scheduler
command: scheduler
healthcheck:
test: ["CMD", "curl", "--fail", "http://localhost:8974/health"]
interval: 30s
timeout: 10s
retries: 5
start_period: 30s
restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
condition: service_completed_successfully

airflow-worker:
<<: *airflow-common
profiles: [ "airflow" ]
container_name: airflow-worker
command: celery worker
healthcheck:
test:
- "CMD-SHELL"
- 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}" || celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"'
interval: 30s
timeout: 10s
retries: 5
start_period: 30s
environment:
<<: *airflow-common-env
DUMB_INIT_SETSID: "0"
restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
condition: service_completed_successfully

airflow-triggerer:
<<: *airflow-common
profiles: [ "airflow" ]
container_name: airflow-triggerer
command: triggerer
healthcheck:
test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"']
interval: 30s
timeout: 10s
retries: 5
start_period: 30s
restart: always
depends_on:
<<: *airflow-common-depends-on
airflow-init:
condition: service_completed_successfully

airflow-cli:
<<: *airflow-common
container_name: airflow-cli
profiles: [ "airflow-debug" ]
environment:
<<: *airflow-common-env
CONNECTION_CHECK_MAX_COUNT: "0"
# Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252
command:
- bash
- -c
- airflow

airflow-init:
<<: *airflow-common
profiles: [ "airflow" ]
container_name: airflow-init
entrypoint: /bin/bash
# yamllint disable rule:line-length
command:
- -c
- |
mkdir -p /sources/logs /sources/dags /sources/plugins
chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins}
exec /entrypoint airflow version
# yamllint enable rule:line-length
environment:
<<: *airflow-common-env
_AIRFLOW_DB_MIGRATE: 'true'
_AIRFLOW_WWW_USER_CREATE: 'true'
_PIP_ADDITIONAL_REQUIREMENTS: ''
user: "0:0"
volumes:
- ${AIRFLOW_PROJ_DIR:-.}:/sources

redis:
profiles: [ "airflow" ]
container_name: redis
hostname: redis
image: redis:latest
expose:
- 6379
healthcheck:
test: ["CMD", "redis-cli", "ping"]
interval: 10s
timeout: 30s
retries: 50
start_period: 30s
restart: always

# Configure Network
networks:
default:
name: lakehouse
2 changes: 1 addition & 1 deletion docker/postgres/init-database.sh
@@ -9,4 +9,4 @@ psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" --dbname "$POSTGRES_DB" <<-EOSQL
CREATE DATABASE airflow;
CREATE USER airflow WITH ENCRYPTED PASSWORD 'airflow';
GRANT ALL PRIVILEGES ON DATABASE metastore TO airflow;
EOSQL
EOSQL
