From 87c7a74d416263f7f3dc0341144ececa205e4b23 Mon Sep 17 00:00:00 2001 From: David Aronchick Date: Mon, 13 Jan 2025 16:51:57 -0800 Subject: [PATCH] updating spot, adding generator files, logging test now working --- .gitignore | 3 + .../.cspell/custom-dictionary.txt | 22 + .../bigquery-with-bacalhau/.gitignore | 40 + .../bigquery-with-bacalhau/LICENSE | 21 + .../bigquery-with-bacalhau/README.md | 411 +++ .../check_permissions.sh | 22 + .../config.yaml.example | 13 + .../duckdb_query_job.yaml | 28 + .../process_container/process.py | 326 ++ .../start-logging-container.yaml | 30 + .../utility_scripts/confirm_table.sh | 71 + .../utility_scripts/distribute_credentials.sh | 75 + .../utility_scripts/list_columns.sh | 16 + .../utility_scripts/setup.py | 481 +++ .../setup_aggregation_tables.sh | 50 + .../utility_scripts/setup_log_uploader.sh | 50 + .../utility_scripts/test_bigquery.py | 147 + .../prep_data/download_data_job.yaml | 7 +- .../prep_data/run_download_jobs.sh | 2 +- .../window_query_complex.sql | 26 +- scale-tester/.envrc | 4 - .../aws_spot/.cspell/custom-dictionary.txt | 3 + scale-tester/aws_spot/.gitignore | 47 + scale-tester/aws_spot/README.md | 190 +- .../files/docker/compose.yml} | 0 .../aws_spot/{ => ami/packer}/main.pkr.hcl | 0 .../{ => ami/packer}/variables.pkr.hcl | 0 .../{build-ami.sh => ami/scripts/build.sh} | 0 .../aws_spot/{ => ami/scripts}/setup.sh | 0 scale-tester/aws_spot/aws/config/env.sh | 47 + scale-tester/aws_spot/aws/keys/README.md | 29 + .../aws_spot/{ => aws/scripts}/setup-iam.sh | 0 .../{ => aws/scripts}/upload-to-ssm.sh | 0 .../aws_spot/files/bacalhau-startup.service | 14 - .../aws_spot/files/orchestrator-config.yaml | 17 - scale-tester/aws_spot/fleet/bin/spot-manager | 21 + .../aws_spot/fleet/examples/pusher/README.md | 44 + .../examples/pusher}/env_writer.yaml | 0 .../examples/pusher}/pusher-job.yaml | 0 .../fleet/examples/pusher/pusher_env.txt.b64 | 1 + .../aws_spot/{ => fleet}/scripts/startup.sh | 0 .../aws_spot/fleet/src/spot_manager.py | 2752 +++++++++++++++++ scale-tester/aws_spot/plan.md | 27 + scale-tester/aws_spot/pyproject.toml | 27 + scale-tester/aws_spot/requirements.txt | 5 + scale-tester/aws_spot/spot-instances.sh | 332 -- .../aws_spot/spot/config/aws-spot-env.sh | 42 + .../spot/config/aws-spot-env.sh.example | 32 + .../clean_up_nodes.py | 125 - .../.cspell/custom-dictionary.txt | 1 + .../all_locations.yaml | 176 -- .../deploy.py | 173 +- .../deploy_single.py | 642 ++++ .../locations.yaml | 68 +- .../locations/all_locations.json | 821 +++++ .../locations/all_locations.yaml | 546 ++++ .../main.tf | 10 +- .../instance/scripts/install_docker.sh | 97 +- .../modules/instance/scripts/startup.sh | 2 +- .../modules/instance/versions.tf | 2 +- .../modules/network/versions.tf | 4 +- .../modules/region/main.tf | 11 +- .../modules/region/versions.tf | 2 +- .../modules/securityGroup/versions.tf | 4 +- .../generate-all-locations-file.py | 119 + .../utility_scripts/generate-locations.sh | 54 - .../versions.tf | 3 +- .../instance/scripts/bacalhau-startup.service | 2 +- .../instance/scripts/healthz-web.service | 2 +- .../instance/scripts/install_docker.sh | 98 +- .../modules/instance/scripts/startup.sh | 2 +- .../utility_scripts/get_vm_list.py | 72 + .../utility_scripts/region_checker.py | 167 + ...ability_Standard_B2ms_20250109_134546.json | 176 ++ .../main.tf | 112 +- .../scripts/install_docker.sh | 97 +- .../scripts/startup.sh | 2 +- .../utility_scripts/all_locations.json | 624 ++++ .../utility_scripts/all_locations.yaml | 497 +++ 
.../generate_all_locations_file.py | 185 ++ .../terraform/clean_up_nodes.py | 132 - 81 files changed, 9439 insertions(+), 1064 deletions(-) create mode 100644 data-engineering/bigquery-with-bacalhau/.cspell/custom-dictionary.txt create mode 100644 data-engineering/bigquery-with-bacalhau/.gitignore create mode 100644 data-engineering/bigquery-with-bacalhau/LICENSE create mode 100644 data-engineering/bigquery-with-bacalhau/README.md create mode 100644 data-engineering/bigquery-with-bacalhau/check_permissions.sh create mode 100644 data-engineering/bigquery-with-bacalhau/config.yaml.example create mode 100644 data-engineering/bigquery-with-bacalhau/duckdb_query_job.yaml create mode 100644 data-engineering/bigquery-with-bacalhau/process_container/process.py create mode 100644 data-engineering/bigquery-with-bacalhau/start-logging-container.yaml create mode 100755 data-engineering/bigquery-with-bacalhau/utility_scripts/confirm_table.sh create mode 100755 data-engineering/bigquery-with-bacalhau/utility_scripts/distribute_credentials.sh create mode 100755 data-engineering/bigquery-with-bacalhau/utility_scripts/list_columns.sh create mode 100755 data-engineering/bigquery-with-bacalhau/utility_scripts/setup.py create mode 100755 data-engineering/bigquery-with-bacalhau/utility_scripts/setup_aggregation_tables.sh create mode 100755 data-engineering/bigquery-with-bacalhau/utility_scripts/setup_log_uploader.sh create mode 100755 data-engineering/bigquery-with-bacalhau/utility_scripts/test_bigquery.py create mode 100644 scale-tester/aws_spot/.gitignore rename scale-tester/aws_spot/{files/docker-compose.yml => ami/files/docker/compose.yml} (100%) rename scale-tester/aws_spot/{ => ami/packer}/main.pkr.hcl (100%) rename scale-tester/aws_spot/{ => ami/packer}/variables.pkr.hcl (100%) rename scale-tester/aws_spot/{build-ami.sh => ami/scripts/build.sh} (100%) rename scale-tester/aws_spot/{ => ami/scripts}/setup.sh (100%) create mode 100644 scale-tester/aws_spot/aws/config/env.sh create mode 100644 scale-tester/aws_spot/aws/keys/README.md rename scale-tester/aws_spot/{ => aws/scripts}/setup-iam.sh (100%) rename scale-tester/aws_spot/{ => aws/scripts}/upload-to-ssm.sh (100%) delete mode 100644 scale-tester/aws_spot/files/bacalhau-startup.service delete mode 100644 scale-tester/aws_spot/files/orchestrator-config.yaml create mode 100755 scale-tester/aws_spot/fleet/bin/spot-manager create mode 100644 scale-tester/aws_spot/fleet/examples/pusher/README.md rename scale-tester/aws_spot/{ => fleet/examples/pusher}/env_writer.yaml (100%) rename scale-tester/aws_spot/{ => fleet/examples/pusher}/pusher-job.yaml (100%) create mode 100644 scale-tester/aws_spot/fleet/examples/pusher/pusher_env.txt.b64 rename scale-tester/aws_spot/{ => fleet}/scripts/startup.sh (100%) create mode 100755 scale-tester/aws_spot/fleet/src/spot_manager.py create mode 100644 scale-tester/aws_spot/plan.md create mode 100644 scale-tester/aws_spot/pyproject.toml create mode 100644 scale-tester/aws_spot/requirements.txt delete mode 100755 scale-tester/aws_spot/spot-instances.sh create mode 100644 scale-tester/aws_spot/spot/config/aws-spot-env.sh create mode 100644 scale-tester/aws_spot/spot/config/aws-spot-env.sh.example delete mode 100644 scale-tester/bacalhau-dind-compute-node/clean_up_nodes.py delete mode 100644 setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/all_locations.yaml create mode 100755 setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/deploy_single.py create mode 100644 
setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/locations/all_locations.json create mode 100644 setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/locations/all_locations.yaml create mode 100755 setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/utility_scripts/generate-all-locations-file.py delete mode 100755 setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/utility_scripts/generate-locations.sh create mode 100755 setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/utility_scripts/get_vm_list.py create mode 100755 setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/utility_scripts/region_checker.py create mode 100644 setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/utility_scripts/vm_availability_Standard_B2ms_20250109_134546.json create mode 100644 setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/utility_scripts/all_locations.json create mode 100644 setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/utility_scripts/all_locations.yaml create mode 100755 setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/utility_scripts/generate_all_locations_file.py delete mode 100644 systems-engineering/duckdb-log-processing/terraform/clean_up_nodes.py diff --git a/.gitignore b/.gitignore index f9af9892..72896b47 100644 --- a/.gitignore +++ b/.gitignore @@ -28,6 +28,9 @@ ansible.cfg *.pem .env.json +**/MACHINES.json + + .terraform/ terraform.tfstate.d/ .terraform.lock.hcl diff --git a/data-engineering/bigquery-with-bacalhau/.cspell/custom-dictionary.txt b/data-engineering/bigquery-with-bacalhau/.cspell/custom-dictionary.txt new file mode 100644 index 00000000..90122436 --- /dev/null +++ b/data-engineering/bigquery-with-bacalhau/.cspell/custom-dictionary.txt @@ -0,0 +1,22 @@ +Bacalhau +bacalhauproject +buildx +cloudresourcemanager +creds +datacatering +duckdb +DUCKDB +INPUTFILE +INSTANCEID +IRUSR +IWUSR +listdir +makedirs +mwendler +natsorted +noninteractive +resourcemanager +tonistiigi +tpep +tripdata +TRUNC diff --git a/data-engineering/bigquery-with-bacalhau/.gitignore b/data-engineering/bigquery-with-bacalhau/.gitignore new file mode 100644 index 00000000..a504fc80 --- /dev/null +++ b/data-engineering/bigquery-with-bacalhau/.gitignore @@ -0,0 +1,40 @@ +# Credentials and configuration +.env +config.yaml +credentials.json +log_uploader_credentials.json +bg_reader_credentials.json +bq_reader_credentials.json +**/MACHINES.json + +# Logs +aperitivo_logs.log.* + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# IDE +.idea/ +.vscode/ +*.swp +*.swo \ No newline at end of file diff --git a/data-engineering/bigquery-with-bacalhau/LICENSE b/data-engineering/bigquery-with-bacalhau/LICENSE new file mode 100644 index 00000000..58275188 --- /dev/null +++ b/data-engineering/bigquery-with-bacalhau/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Bacalhau Project Contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is 
+furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/data-engineering/bigquery-with-bacalhau/README.md b/data-engineering/bigquery-with-bacalhau/README.md new file mode 100644 index 00000000..c9291bf7 --- /dev/null +++ b/data-engineering/bigquery-with-bacalhau/README.md @@ -0,0 +1,411 @@ +# Data Engineering with Bacalhau + +This repository demonstrates how to use Bacalhau for data engineering tasks, combining DuckDB for data processing and BigQuery for data storage. + +## Components + +1. **DuckDB Processing**: Process and analyze data using DuckDB's SQL capabilities +2. **BigQuery Integration**: Store processed results in Google BigQuery for further analysis + +## Prerequisites + +1. [Bacalhau client](https://docs.bacalhau.org/getting-started/installation) installed +2. Python 3.10 or higher +3. A Google Cloud Project with BigQuery enabled (or permissions to create one) +4. Service account credentials with appropriate permissions + +## Setup + +### 1. Install Dependencies + +Install the required Python packages: +```bash +pip install -r requirements.txt +``` + +### 2. Interactive Setup + +Run the interactive setup script: +```bash +./setup.py -i +``` + +The script will guide you through: +1. Project Configuration + - Enter existing project ID or create new + - Configure project settings + +2. Credentials Setup + - Create service account (browser will open) + - Download and configure credentials + - Set up necessary permissions + +3. BigQuery Configuration + - Configure dataset and table names + - Set up storage location + +The script will: +- Create/configure your Google Cloud project +- Set up service account credentials +- Create BigQuery dataset and table +- Save all settings to config.yaml + +### Manual Setup (Alternative) + +If you prefer manual setup: + +1. Create a service account in Google Cloud Console with these roles: + - BigQuery Data Editor + - BigQuery Job User + - Project Creator (if you want the script to create projects) + +2. Download the service account key file (JSON format) + +3. Create `config.yaml`: + ```yaml + project: + id: "your-project-id" # Your Google Cloud project ID + region: "US" # Default region for resources + create_if_missing: true # Whether to create the project if it doesn't exist + + credentials: + path: "credentials.json" # Path to your service account key + + bigquery: + dataset_name: "log_analytics" # Name of the BigQuery dataset + table_name: "log_results" # Name of the results table + location: "US" # Dataset location + ``` + +4. Run the setup script: + ```bash + ./setup.py + ``` + +### 3. Bacalhau Network Setup + +Follow the [standard Bacalhau network setup guide](https://docs.bacalhau.org/getting-started/create-private-network). + +## Usage + +### 1. Simple DuckDB Queries + +Run a simple DuckDB query: + +```bash +bacalhau docker run -e QUERY="select 1" docker.io/bacalhauproject/duckdb:latest +``` + +### 2. 
Processing Logs with BigQuery Integration + +Process log files and store results in BigQuery: + +```bash +bacalhau docker run \ + --input /path/to/logs:/var/log/logs_to_process \ + --volume /path/to/credentials.json:/bacalhau_node/credentials.json \ + ghcr.io/bacalhau-project/examples/bigquery-processor:latest \ + -- python process.py input.json "SELECT * FROM temp_log_data" +``` + +### 3. Using YAML Configuration + +For more complex setups, use the provided YAML configuration: + +```bash +bacalhau job run duckdb_query_job.yaml \ + --template-vars="filename=/bacalhau_data/data.parquet" \ + --template-vars="QUERY=$(cat your_query.sql)" +``` + +## Data Schema + +### BigQuery Table Schema + +The `log_results` table in BigQuery has the following schema: + +- `projectID`: STRING - Google Cloud project identifier +- `region`: STRING - Deployment region +- `nodeName`: STRING - Node name +- `syncTime`: STRING - Synchronization timestamp +- `remote_log_id`: STRING - Original log identifier +- `timestamp`: STRING - Event timestamp +- `version`: STRING - Log version +- `message`: STRING - Log message content + +## Example Queries + +### 1. Basic DuckDB Query + +```sql +-- simple_query.sql +SELECT COUNT(*) AS row_count FROM yellow_taxi_trips; +``` + +### 2. Time Window Analysis + +```sql +-- window_query.sql +SELECT + DATE_TRUNC('hour', tpep_pickup_datetime) + + INTERVAL (FLOOR(EXTRACT(MINUTE FROM tpep_pickup_datetime) / 5) * 5) MINUTE AS interval_start, + COUNT(*) AS ride_count +FROM + yellow_taxi_trips +GROUP BY + interval_start +ORDER BY + interval_start; +``` + +### 3. BigQuery Examples + +After setting up your BigQuery integration, you can run example queries using the provided script: + +```bash +./run_bigquery_query.py +``` + +This will run several example queries and show their results: + +1. Table Structure - Shows the schema of your log_results table +2. Total Row Count - Counts the total number of log entries +3. Recent Logs - Displays the 5 most recent log entries +4. Logs per Node - Shows how many logs each node has generated + +Example output: +``` +================================================================================ + +Querying BigQuery table: your-project-id.log_analytics.log_results + +================================================================================ + +Running query: Table Structure + +SQL: +SELECT + column_name, + data_type, + is_nullable +FROM your-project-id.log_analytics.INFORMATION_SCHEMA.COLUMNS +WHERE table_name = 'log_results' +ORDER BY ordinal_position + +Results: +------------------------------------------------------------ +column_name | data_type | is_nullable +------------------------------------------------------------ +projectID | STRING | YES +region | STRING | YES +nodeName | STRING | YES +syncTime | STRING | YES +remote_log_id| STRING | YES +timestamp | STRING | YES +version | STRING | YES +message | STRING | YES +------------------------------------------------------------ +Total rows: 8 + +... (more query results follow) +``` + +You can also run these queries directly in the BigQuery console: +1. Go to https://console.cloud.google.com/bigquery +2. Select your project +3. Click "Compose New Query" +4. Copy any of the SQL queries from the script output + +The script uses your config.yaml settings and service account credentials to connect to BigQuery. + +## Security Notes + +1. 
Credential Management: + - Never commit credentials to version control + - Mount credentials at runtime using Bacalhau volumes + - Use appropriate IAM roles and permissions + - Keep your config.yaml file secure and out of version control + +2. Data Access: + - Use principle of least privilege + - Regularly rotate service account keys + - Monitor BigQuery access logs + +## Environment Variables + +- `INPUTFILE`: Path to the input log file +- `QUERY`: DuckDB query to transform the data before sending to BigQuery + +## Directory Structure + +``` +. +├── container/ +│ ├── process.py # Main processing script +│ └── Dockerfile # Container definition +├── setup.py # Infrastructure setup script +├── requirements.txt # Python dependencies +├── config.yaml # Your configuration (not in version control) +├── .gitignore # Git ignore rules +└── README.md # This file +``` + +## Contributing + +1. Fork the repository +2. Create a feature branch +3. Submit a pull request + +## License + +This project is licensed under the MIT License - see the LICENSE file for details. + +## Demo Instructions + +### 1. Initial Setup +```bash +# Install dependencies +pip install -r requirements.txt + +# Run interactive setup +./setup.py -i + +# Make utility scripts executable +chmod +x utility_scripts/*.sh + +# Set up tables and service account +./utility_scripts/confirm_table.sh +./utility_scripts/setup_log_uploader.sh +./utility_scripts/setup_aggregation_tables.sh +``` + +### 2. Basic Log Processing + +```bash +# Process logs with basic configuration +bacalhau docker run \ + --input logs:/var/log/logs_to_process \ + --volume log_uploader_credentials.json:/var/logs/logs_to_process/log_uploader_credentials.json \ + ghcr.io/bacalhau-project/examples/bigquery-processor:latest \ + -- python process.py /var/log/logs_to_process/input.json "SELECT * FROM temp_log_data" +``` + +### 3. Advanced Features + +#### Track Cloud Provider +```bash +# Process logs with provider tracking +bacalhau docker run \ + -e CLOUD_PROVIDER=aws \ + --input logs:/var/log/logs_to_process \ + --volume log_uploader_credentials.json:/var/logs/logs_to_process/log_uploader_credentials.json \ + ghcr.io/bacalhau-project/examples/bigquery-processor:latest \ + -- python process.py /var/log/logs_to_process/input.json "SELECT * FROM temp_log_data" +``` + +#### Enable Log Aggregation +```bash +# Process logs with 5-minute window aggregation +bacalhau docker run \ + -e AGGREGATE_LOGS=true \ + --input logs:/var/log/logs_to_process \ + --volume log_uploader_credentials.json:/var/logs/logs_to_process/log_uploader_credentials.json \ + ghcr.io/bacalhau-project/examples/bigquery-processor:latest \ + -- python process.py /var/log/logs_to_process/input.json "SELECT * FROM temp_log_data" +``` + +### 4. Verify Results + +Check the results in BigQuery tables: + +1. Regular Logs: +```sql +SELECT * +FROM `your-project-id.log_analytics.log_results` +ORDER BY timestamp DESC +LIMIT 5 +``` + +2. Aggregated Logs (5-minute windows): +```sql +SELECT * +FROM `your-project-id.log_analytics.log_aggregates` +ORDER BY time_window DESC +LIMIT 5 +``` + +3. Emergency Events: +```sql +SELECT * +FROM `your-project-id.log_analytics.emergency_logs` +ORDER BY timestamp DESC +LIMIT 5 +``` + +### Security Features + +1. **Restricted Service Account**: + - Custom role with minimal permissions + - Can only write to specific BigQuery tables + - Cannot modify schema or read data + +2. 
**IP Address Sanitization**: + - IPv4: Last octet zeroed out + - IPv6: Last 64 bits zeroed out + - Automatic sanitization of public IPs + +3. **Secure Credential Handling**: + - Credentials mounted as volume + - Not exposed through environment variables + - Separate service account for log uploads + +### Environment Variables + +- `INPUTFILE`: Path to input log file (optional) +- `QUERY`: DuckDB query for data transformation (optional) +- `CLOUD_PROVIDER`: Cloud provider identifier (e.g., aws, gcp) +- `AGGREGATE_LOGS`: Enable 5-minute window aggregation (true/false) + +### Table Schemas + +1. **log_results** (Main Table): + - `project_id`: STRING + - `region`: STRING + - `nodeName`: STRING + - `timestamp`: TIMESTAMP + - `version`: STRING + - `message`: STRING + - `sync_time`: TIMESTAMP + - `remote_log_id`: STRING + - `hostname`: STRING + - `public_ip`: STRING + - `private_ip`: STRING + - `alert_level`: STRING + - `provider`: STRING + +2. **log_aggregates** (5-minute windows): + - `project_id`: STRING + - `region`: STRING + - `nodeName`: STRING + - `provider`: STRING + - `hostname`: STRING + - `time_window`: TIMESTAMP + - `log_count`: INT64 + - `messages`: ARRAY + +3. **emergency_logs** (Critical Events): + - `project_id`: STRING + - `region`: STRING + - `nodeName`: STRING + - `provider`: STRING + - `hostname`: STRING + - `timestamp`: TIMESTAMP + - `version`: STRING + - `message`: STRING + - `remote_log_id`: STRING + - `alert_level`: STRING + - `public_ip`: STRING + - `private_ip`: STRING +``` + +This provides a complete, accurate guide for demonstrating all features of the system, including setup, usage, and verification steps. \ No newline at end of file diff --git a/data-engineering/bigquery-with-bacalhau/check_permissions.sh b/data-engineering/bigquery-with-bacalhau/check_permissions.sh new file mode 100644 index 00000000..dcea4c44 --- /dev/null +++ b/data-engineering/bigquery-with-bacalhau/check_permissions.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# Read project ID from config.yaml +PROJECT_ID=$(python3 -c "import yaml; print(yaml.safe_load(open('config.yaml'))['project']['id'])") + +echo "Checking BigQuery permissions for project: $PROJECT_ID" +echo + +# Check if we can access the dataset +echo "Testing dataset access..." +bq show $PROJECT_ID:log_analytics + +# Check if we can modify the table +echo -e "\nChecking table permissions..." +bq show --format=prettyjson $PROJECT_ID:log_analytics.log_results + +# Check IAM permissions +echo -e "\nChecking IAM roles..." 
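# Note: the gcloud command below flattens each IAM binding into one row per member and
# filters to the active account reported by `gcloud config get account`; that account is
# assumed to be the same identity whose BigQuery access this script is verifying.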
+gcloud projects get-iam-policy $PROJECT_ID \ + --flatten="bindings[].members" \ + --format="table(bindings.role,bindings.members)" \ + --filter="bindings.members:$(gcloud config get account)" \ No newline at end of file diff --git a/data-engineering/bigquery-with-bacalhau/config.yaml.example b/data-engineering/bigquery-with-bacalhau/config.yaml.example new file mode 100644 index 00000000..cd91db72 --- /dev/null +++ b/data-engineering/bigquery-with-bacalhau/config.yaml.example @@ -0,0 +1,13 @@ +# BigQuery Configuration +project: + id: "bacalhau-and-bigquery" # Required: Your Google Cloud project ID + region: "US" # Optional: Default region for resources + create_if_missing: true # Whether to create the project if it doesn't exist + +credentials: + path: "credentials.json" # Path to service account credentials + +bigquery: + dataset_name: "log_analytics" # Name of the BigQuery dataset + table_name: "log_results" # Name of the results table + location: "US" # Dataset location \ No newline at end of file diff --git a/data-engineering/bigquery-with-bacalhau/duckdb_query_job.yaml b/data-engineering/bigquery-with-bacalhau/duckdb_query_job.yaml new file mode 100644 index 00000000..a3efa5b8 --- /dev/null +++ b/data-engineering/bigquery-with-bacalhau/duckdb_query_job.yaml @@ -0,0 +1,28 @@ +Tasks: + - Engine: + Params: + Image: docker.io/bacalhauproject/duckdb:latest + WorkingDirectory: "" + EnvironmentVariables: + - QUERY=WITH yellow_taxi_trips AS (SELECT * FROM read_parquet('{{ .filename }}')) {{ .query }} + Type: docker + Name: duckdb-query-job + InputSources: + - Source: + Type: "localDirectory" + Params: + SourcePath: "/bacalhau_data" + ReadWrite: true + Target: "/bacalhau_data" + Publisher: + Type: "local" + Params: + TargetPath: "/bacalhau_data" + Network: + Type: Full + Resources: + CPU: 2000m + Memory: 2048Mi + Timeouts: {} +Type: batch +Count: 1 diff --git a/data-engineering/bigquery-with-bacalhau/process_container/process.py b/data-engineering/bigquery-with-bacalhau/process_container/process.py new file mode 100644 index 00000000..c1b613e5 --- /dev/null +++ b/data-engineering/bigquery-with-bacalhau/process_container/process.py @@ -0,0 +1,326 @@ +#!/usr/bin/env uv run -s +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "duckdb", +# "requests", +# "natsort", +# "google-cloud-storage", +# "google-cloud-bigquery", +# "ipaddress", +# ] +# /// + +import argparse +import ipaddress +import json +import os +import tempfile +from datetime import datetime + +import duckdb +import requests +from google.cloud import bigquery +from google.oauth2 import service_account +from natsort import natsorted, ns + + +def sanitize_ip(ip_str): + """Sanitize IP address by zeroing out last octet for IPv4 or last 64 bits for IPv6.""" + try: + ip = ipaddress.ip_address(ip_str) + if isinstance(ip, ipaddress.IPv4Address): + # Convert to string and replace last octet with 0 + parts = str(ip).split(".") + parts[-1] = "0" + return ".".join(parts) + else: + # For IPv6, zero out the last 64 bits + parts = str(ip).split(":") + return ":".join(parts[:4]) + ":0:0:0:0" + except: + return None + + +def getInstanceMetadata(metadataName): + url = f"http://metadata.google.internal/computeMetadata/v1/instance/{metadataName}" + return getMetadata(url) + + +def getProjectMetadata(metadataName): + url = f"http://metadata.google.internal/computeMetadata/v1/project/{metadataName}" + return getMetadata(url) + + +def getMetadata(metadata_server_url): + metadata_server_token_url = ( + 
"http://metadata/computeMetadata/v1/instance/service-accounts/default/token" + ) + token_request_headers = {"Metadata-Flavor": "Google"} + token_response = requests.get( + metadata_server_token_url, headers=token_request_headers + ) + jwt = token_response.json()["access_token"] + + metadata_request_headers = { + "Metadata-Flavor": "Google", + "Authorization": f"Bearer {jwt}", + } + + return requests.get(metadata_server_url, headers=metadata_request_headers).text + + +def detect_cloud_provider(): + """Detect the cloud provider by trying metadata endpoints.""" + + def try_gcp(): + try: + headers = {"Metadata-Flavor": "Google"} + response = requests.get( + "http://metadata.google.internal/computeMetadata/v1/instance/id", + headers=headers, + timeout=1, + ) + if response.status_code == 200: + return "gcp" + except: + pass + return None + + def try_aws(): + try: + # Get IMDSv2 token first + token_headers = {"X-aws-ec2-metadata-token-ttl-seconds": "21600"} + token = requests.put( + "http://169.254.169.254/latest/api/token", + headers=token_headers, + timeout=1, + ).text + + headers = {"X-aws-ec2-metadata-token": token} + response = requests.get( + "http://169.254.169.254/latest/meta-data/instance-id", + headers=headers, + timeout=1, + ) + if response.status_code == 200: + return "aws" + except: + pass + return None + + def try_azure(): + try: + headers = {"Metadata": "true"} + response = requests.get( + "http://169.254.169.254/metadata/instance?api-version=2021-02-01", + headers=headers, + timeout=1, + ) + if response.status_code == 200: + return "azure" + except: + pass + return None + + # Try each provider + provider = try_gcp() or try_aws() or try_azure() or "unknown" + return provider + + +def main(input_file, query): + # Create an in-memory DuckDB database + con = duckdb.connect(database=":memory:", read_only=False) + + usingTempFile = False + # If file is .gz, decompress it into a temporary file + if input_file.endswith(".gz"): + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".log") as temp: + os.system(f"gunzip -c {input_file} > {temp.name}") + input_file = temp.name + usingTempFile = True + + # Load credentials from the mounted file + credentials = service_account.Credentials.from_service_account_file( + "/var/logs/logs_to_process/log_uploader_credentials.json", + scopes=["https://www.googleapis.com/auth/bigquery"], + ) + + # Create BigQuery client + bq_client = bigquery.Client(credentials=credentials) + + # Generate metadata + try: + projectID = getProjectMetadata("project-id") + region = getInstanceMetadata("zone").split("/")[3] + nodeName = getInstanceMetadata("name") + provider = detect_cloud_provider() + except: + # If metadata service is not available, use defaults + projectID = "unknown" + region = "unknown" + nodeName = "unknown" + provider = "unknown" + + syncTime = datetime.now().strftime("%Y%m%d%H%M%S") + + # Create a temporary table in DuckDB with the JSON data + columns = { + "id": "varchar", + "@timestamp": "varchar", + "@version": "varchar", + "message": "varchar", + } + + # First create a temporary table with the data + temp_table = "temp_log_data" + raw_query = f""" + CREATE TABLE {temp_table} AS + SELECT + '{projectID}' as project_id, + '{region}' as region, + '{nodeName}' as nodeName, + '{syncTime}' as sync_time, + id as remote_log_id, + CAST("@timestamp" AS TIMESTAMP) as timestamp, + "@version" as version, + message, + '{provider}' as provider, + hostname() as hostname, + CASE + WHEN message LIKE '%ERROR%' OR message LIKE '%FATAL%' THEN 'emergency' + 
WHEN message LIKE '%WARN%' THEN 'warning' + ELSE 'info' + END as alert_level + FROM read_json(?, auto_detect=true, columns={columns}) + """ + con.execute(raw_query, [input_file]) + + # Now apply the user's query to filter/transform the data + if query: + result_table = "filtered_results" + con.execute(f"CREATE TABLE {result_table} AS {query}") + else: + result_table = temp_table + + # Check if we should aggregate + should_aggregate = os.environ.get("AGGREGATE_LOGS", "false").lower() == "true" + + if should_aggregate: + # Create aggregated table + agg_table = "aggregated_results" + con.execute(f""" + CREATE TABLE {agg_table} AS + SELECT + project_id, + region, + nodeName, + provider, + hostname, + date_trunc('minute', timestamp) - + (date_part('minute', timestamp)::integer % 5) * interval '1 minute' as time_window, + COUNT(*) as log_count, + array_agg(message) as messages + FROM {result_table} + WHERE alert_level != 'emergency' + GROUP BY + project_id, region, nodeName, provider, hostname, + date_trunc('minute', timestamp) - + (date_part('minute', timestamp)::integer % 5) * interval '1 minute' + """) + + # Export aggregated results to BigQuery + agg_table_id = f"{projectID}.log_analytics.log_aggregates" + df_agg = con.execute(f"SELECT * FROM {agg_table}").df() + + job_config = bigquery.LoadJobConfig( + write_disposition=bigquery.WriteDisposition.WRITE_APPEND, + ) + + job = bq_client.load_table_from_dataframe( + df_agg, agg_table_id, job_config=job_config + ) + job.result() + print(f"Loaded {len(df_agg)} aggregated rows into {agg_table_id}") + + # Export emergency events separately + emergency_table = "emergency_results" + con.execute(f""" + CREATE TABLE {emergency_table} AS + SELECT * + FROM {result_table} + WHERE alert_level = 'emergency' + """) + + emergency_table_id = f"{projectID}.log_analytics.emergency_logs" + df_emergency = con.execute(f"SELECT * FROM {emergency_table}").df() + + if len(df_emergency) > 0: + job = bq_client.load_table_from_dataframe( + df_emergency, emergency_table_id, job_config=job_config + ) + job.result() + print( + f"Loaded {len(df_emergency)} emergency events into {emergency_table_id}" + ) + else: + # Export the regular results to BigQuery + table_id = f"{projectID}.log_analytics.log_results" + df = con.execute(f"SELECT * FROM {result_table}").df() + + # Sanitize IP addresses if present + if "public_ip" in df.columns: + df["public_ip"] = df["public_ip"].apply(sanitize_ip) + + job_config = bigquery.LoadJobConfig( + write_disposition=bigquery.WriteDisposition.WRITE_APPEND, + ) + + job = bq_client.load_table_from_dataframe(df, table_id, job_config=job_config) + job.result() + print(f"Loaded {len(df)} rows into {table_id}") + + # Cleanup + if usingTempFile: + os.unlink(input_file) + + +if __name__ == "__main__": + # Print a header to a list of files that are available to process + print("Files available to process (in /var/log/logs_to_process):") + print("--------------------") + + # Print all files in /var/log/logs_to_process to stdout with absolute paths. + # If there are no files, print a message that "No files are available to process." 
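    # Note: os.listdir() raises FileNotFoundError when /var/log/logs_to_process is not
    # mounted; the job specs in this example mount that path via InputSources, so the
    # directory is assumed to exist by the time this script runs.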
+ files = os.listdir("/var/log/logs_to_process") + if len(files) == 0: + print("No files are available to process.") + else: + f = natsorted(files, alg=ns.IGNORECASE) + for file in f: + print(f"/var/log/logs_to_process/{file}") + + print("\n") + + print("Environment Variables") + print(f"INPUTFILE = {os.environ.get('INPUTFILE')}") + print(f"QUERY = {os.environ.get('QUERY')}") + print(f"AGGREGATE_LOGS = {os.environ.get('AGGREGATE_LOGS', 'false')}") + + # If both INPUTFILE and QUERY are set, then use those + if os.environ.get("INPUTFILE") and os.environ.get("QUERY"): + print("Both INPUTFILE and QUERY are set, so using those") + args = argparse.Namespace( + input_file=os.environ.get("INPUTFILE"), query=os.environ.get("QUERY") + ) + else: + # Set up the argument parser + parser = argparse.ArgumentParser(description="Process log data") + parser.add_argument("input_file", help="Path to the input log file") + parser.add_argument("query", help="DuckDB query to execute") + + # Parse the command-line arguments + args = parser.parse_args() + + # Call the main function + main(args.input_file, args.query) diff --git a/data-engineering/bigquery-with-bacalhau/start-logging-container.yaml b/data-engineering/bigquery-with-bacalhau/start-logging-container.yaml new file mode 100644 index 00000000..fffe8cf3 --- /dev/null +++ b/data-engineering/bigquery-with-bacalhau/start-logging-container.yaml @@ -0,0 +1,30 @@ +Tasks: + - Engine: + Params: + Entrypoint: null + EnvironmentVariables: null + Image: docker.io/bacalhauproject/log-generator:2412171646 + Parameters: + - -d + - /var/log/app + - -n + - aperitivo + WorkingDirectory: "" + Type: docker + Name: sample-job + InputSources: + - Source: + Type: "localDirectory" + Params: + SourcePath: "/bacalhau_data" + ReadWrite: true + Target: "/var/log/logs_to_process" + Network: + Type: None + Publisher: + Type: "" + Resources: + CPU: 250m + Memory: 250m + Timeouts: {} +Type: daemon diff --git a/data-engineering/bigquery-with-bacalhau/utility_scripts/confirm_table.sh b/data-engineering/bigquery-with-bacalhau/utility_scripts/confirm_table.sh new file mode 100755 index 00000000..97c28580 --- /dev/null +++ b/data-engineering/bigquery-with-bacalhau/utility_scripts/confirm_table.sh @@ -0,0 +1,71 @@ +#!/bin/bash + +# Exit on error +set -e + +# Read project ID from config.yaml +PROJECT_ID=$(python3 -c "import yaml; print(yaml.safe_load(open('config.yaml'))['project']['id'])") + +echo "Ensuring all required columns exist in table: $PROJECT_ID.log_analytics.log_results" + +# First, drop the timestamp columns if they exist with wrong type +echo "Dropping timestamp columns to recreate with correct type..." +bq query --use_legacy_sql=false \ +"ALTER TABLE \`$PROJECT_ID.log_analytics.log_results\` +DROP COLUMN IF EXISTS timestamp, +DROP COLUMN IF EXISTS sync_time" + +# Add or modify columns to ensure correct schema +echo "Adding/updating columns..." 
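# Note: BigQuery's ALTER TABLE ... ADD COLUMN IF NOT EXISTS only adds NULLABLE columns and
# is a no-op for columns that already exist, which is why the timestamp and sync_time
# columns are dropped above before being re-added with the TIMESTAMP type.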
+bq query --use_legacy_sql=false \ +"ALTER TABLE \`$PROJECT_ID.log_analytics.log_results\` +ADD COLUMN IF NOT EXISTS region STRING, +ADD COLUMN IF NOT EXISTS nodeName STRING, +ADD COLUMN IF NOT EXISTS timestamp TIMESTAMP, +ADD COLUMN IF NOT EXISTS version STRING, +ADD COLUMN IF NOT EXISTS message STRING, +ADD COLUMN IF NOT EXISTS project_id STRING, +ADD COLUMN IF NOT EXISTS sync_time TIMESTAMP, +ADD COLUMN IF NOT EXISTS remote_log_id STRING, +ADD COLUMN IF NOT EXISTS hostname STRING, +ADD COLUMN IF NOT EXISTS public_ip STRING, +ADD COLUMN IF NOT EXISTS private_ip STRING, +ADD COLUMN IF NOT EXISTS alert_level STRING, +ADD COLUMN IF NOT EXISTS provider STRING" + +# Verify the columns and their types +echo -e "\nVerifying columns..." +bq query --use_legacy_sql=false --format=pretty \ +"WITH required_columns AS ( + SELECT column_name, data_type + FROM UNNEST([ + STRUCT('region' as column_name, 'STRING' as data_type), + ('nodeName', 'STRING'), + ('timestamp', 'TIMESTAMP'), + ('version', 'STRING'), + ('message', 'STRING'), + ('project_id', 'STRING'), + ('sync_time', 'TIMESTAMP'), + ('remote_log_id', 'STRING'), + ('hostname', 'STRING'), + ('public_ip', 'STRING'), + ('private_ip', 'STRING'), + ('alert_level', 'STRING'), + ('provider', 'STRING') + ]) +) +SELECT + c.column_name, + c.data_type as current_type, + r.data_type as required_type, + CASE + WHEN c.data_type = r.data_type THEN '✓' + ELSE '✗' + END as matches +FROM \`$PROJECT_ID.log_analytics\`.INFORMATION_SCHEMA.COLUMNS c +RIGHT JOIN required_columns r + ON c.column_name = r.column_name +WHERE c.table_name = 'log_results' +ORDER BY r.column_name" + +echo -e "\nDone. All required columns should now exist with correct types." \ No newline at end of file diff --git a/data-engineering/bigquery-with-bacalhau/utility_scripts/distribute_credentials.sh b/data-engineering/bigquery-with-bacalhau/utility_scripts/distribute_credentials.sh new file mode 100755 index 00000000..0971e42a --- /dev/null +++ b/data-engineering/bigquery-with-bacalhau/utility_scripts/distribute_credentials.sh @@ -0,0 +1,75 @@ +#!/bin/bash +set -e + +# Check if credentials file exists +if [ ! -f "log_uploader_credentials.json" ]; then + echo "Error: log_uploader_credentials.json not found in current directory" + exit 1 +fi + +# Read and encode the credentials +echo "Reading and encoding credentials..." 
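# Portability note (assumption about the shell environment): `base64 -i FILE` is the
# macOS/BSD form; GNU coreutils base64 wraps encoded output at 76 characters by default,
# so on Linux `base64 -w 0 log_uploader_credentials.json` may be needed to keep
# CREDS_B64 on a single line.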
+CREDS_B64=$(base64 -i log_uploader_credentials.json) + +# Create and encode the Python script directly in memory +SCRIPT_B64=$(cat << 'EOF' | base64 +#!/usr/bin/env python3 +import base64 +import os +import stat +import sys + +try: + # Get the credentials from environment variable + creds_b64 = os.environ.get('CREDS_B64') + if not creds_b64: + print("Error: CREDS_B64 environment variable not found") + sys.exit(1) + + # Decode the credentials + creds = base64.b64decode(creds_b64) + + # If DEBUG is set, list the contents of the directory + if os.environ.get('DEBUG'): + print("Listing contents of /var/log:") + print(os.listdir('/var/log')) + + # Write the credentials + creds_path = '/var/log/logs_to_process/log_uploader_credentials.json' + with open(creds_path, 'wb') as f: + f.write(creds) + + # Set permissions to be readable only by owner (600) + os.chmod(creds_path, stat.S_IRUSR | stat.S_IWUSR) + + # Verify the write + if not os.path.exists(creds_path): + print("Error: Failed to write credentials file") + sys.exit(1) + + # Verify the permissions + perms = oct(os.stat(creds_path).st_mode)[-3:] + if perms != '600': + print(f"Warning: Unexpected permissions: {perms}") + sys.exit(1) + + print(f"Successfully wrote credentials to {creds_path}") + print(f"File permissions: {perms}") + +except Exception as e: + print(f"Error: {str(e)}") + sys.exit(1) +EOF +) + +echo "Distributing credentials to all nodes..." +bacalhau docker run \ + -e SCRIPT_B64="$SCRIPT_B64" \ + -e CREDS_B64="$CREDS_B64" \ + -e DEBUG=true \ + --target all \ + --input file:///bacalhau_data,dst=/var/log/logs_to_process,opt=readwrite=true \ + python:3.11-slim \ + -- /bin/bash -c 'echo "$SCRIPT_B64" | base64 -d > /tmp/write_creds.py && python /tmp/write_creds.py' + +echo "Credentials distribution complete." 
\ No newline at end of file diff --git a/data-engineering/bigquery-with-bacalhau/utility_scripts/list_columns.sh b/data-engineering/bigquery-with-bacalhau/utility_scripts/list_columns.sh new file mode 100755 index 00000000..986c6a3d --- /dev/null +++ b/data-engineering/bigquery-with-bacalhau/utility_scripts/list_columns.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# Read project ID from config.yaml +PROJECT_ID=$(python3 -c "import yaml; print(yaml.safe_load(open('config.yaml'))['project']['id'])") + +echo "Listing columns for table: $PROJECT_ID.log_analytics.log_results" +echo + +bq query --use_legacy_sql=false --format=pretty \ +"SELECT + column_name, + data_type, + is_nullable +FROM \`$PROJECT_ID.log_analytics\`.INFORMATION_SCHEMA.COLUMNS +WHERE table_name = 'log_results' +ORDER BY ordinal_position" \ No newline at end of file diff --git a/data-engineering/bigquery-with-bacalhau/utility_scripts/setup.py b/data-engineering/bigquery-with-bacalhau/utility_scripts/setup.py new file mode 100755 index 00000000..81ea8c27 --- /dev/null +++ b/data-engineering/bigquery-with-bacalhau/utility_scripts/setup.py @@ -0,0 +1,481 @@ +#!/usr/bin/env uv run -s +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "duckdb", +# "requests", +# "natsort", +# "google-cloud-storage", +# "google-cloud-bigquery", +# "google-cloud-resource-manager", +# "google-cloud-iam", +# "google-cloud-service-usage", +# "google-auth", +# "pyyaml", +# "google-api-core", +# ] +# /// + +import argparse +import os +import shutil +import subprocess +import sys +import time +from datetime import datetime + +import yaml +from google.api_core import exceptions +from google.cloud import bigquery, resourcemanager, service_usage_v1 +from google.oauth2 import service_account + +DEFAULT_CONFIG = { + "project": { + "id": "your-project-id", + "region": "US", + "create_if_missing": True, + }, + "credentials": { + "path": "credentials.json", + }, + "bigquery": { + "dataset_name": "log_analytics", + "table_name": "log_results", + "location": "US", + }, +} + +REQUIRED_APIS = [ + "bigquery.googleapis.com", + "cloudresourcemanager.googleapis.com", + "iam.googleapis.com", +] + + +def prompt_yes_no(question, default="yes"): + """Ask a yes/no question and return the answer.""" + valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False} + if default is None: + prompt = " [y/n] " + elif default == "yes": + prompt = " [Y/n] " + elif default == "no": + prompt = " [y/N] " + else: + raise ValueError(f"Invalid default answer: '{default}'") + + while True: + sys.stdout.write(question + prompt) + choice = input().lower() + if default is not None and choice == "": + return valid[default] + elif choice in valid: + return valid[choice] + else: + sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n") + + +def generate_project_id(base_id: str) -> str: + """Generate a unique project ID with timestamp suffix.""" + timestamp = datetime.now().strftime("%y%m%d%H%M") + # Remove any existing timestamp suffix if present + base_id = base_id.split("-20")[0] # Remove any existing timestamp + + # Ensure the total length stays under 30 characters + # Format: base-id + "-" + timestamp = max 30 chars + max_base_length = 19 # 30 - 10 (timestamp) - 1 (hyphen) + if len(base_id) > max_base_length: + base_id = base_id[:max_base_length] + + return f"{base_id}-{timestamp}" + + +def create_project(project_id): + """Create a new GCP project.""" + try: + # Create project without credentials + client = resourcemanager.ProjectsClient() + + # Add timestamp to 
project ID only if it doesn't already have one + if not any(c.isdigit() for c in project_id): + project_id = generate_project_id(project_id) + + # Create project + project = resourcemanager.Project() + project.project_id = project_id + project.display_name = "Bacalhau BigQuery" + + print(f"\nCreating project {project_id}...") + operation = client.create_project(request={"project": project}) + result = operation.result() # Wait for operation to complete + + # Wait for project to be fully created and ready + print("Waiting for project to be ready...") + time.sleep(30) # Give time for project to propagate + + print(f"Project {project_id} created successfully!") + return project_id + except Exception as e: + print(f"Failed to create project: {e}") + return None + + +def check_api_enabled(project_id, api_name): + """Check if a specific API is enabled for the project.""" + import subprocess + + try: + result = subprocess.run( + [ + "gcloud", + "services", + "list", + "--project", + project_id, + "--filter", + f"config.name={api_name}", + "--format", + "value(state)", + ], + capture_output=True, + text=True, + check=True, + ) + return "ENABLED" in result.stdout.upper() + except subprocess.CalledProcessError: + return False + + +def enable_api_with_gcloud(project_id, api_name): + """Enable an API using gcloud command.""" + try: + print(f"Enabling {api_name}...") + subprocess.run( + ["gcloud", "services", "enable", api_name, f"--project={project_id}"], + check=True, + capture_output=True, + text=True, + ) + return True + except subprocess.CalledProcessError as e: + print(f"Failed to enable {api_name}: {e.stderr}") + return False + + +def enable_project_apis(project_id, credentials): + """Enable required APIs for the project using gcloud.""" + try: + # Check and enable Service Usage API first + if not check_api_enabled(project_id, "serviceusage.googleapis.com"): + if not enable_api_with_gcloud(project_id, "serviceusage.googleapis.com"): + print( + "\nFailed to enable Service Usage API. Please ensure you have the right permissions." + ) + print("Try running: gcloud auth login") + sys.exit(1) + + # Enable other required APIs using gcloud + for api in REQUIRED_APIS: + if not check_api_enabled(project_id, api): + if not enable_api_with_gcloud(project_id, api): + return False + else: + print(f"API {api} is already enabled.") + + print("All required APIs are enabled") + return True + except Exception as e: + print(f"Failed to enable APIs: {e}") + return False + + +def print_credentials_instructions(project_id: str): + """Print instructions for creating service account and credentials using gcloud CLI.""" + print("\nTo create service account and credentials, run these commands:") + print("\n1. Create service account:") + print("gcloud iam service-accounts create bacalhau-bigquery \\") + print(" --display-name='Bacalhau BigQuery Service Account' \\") + print(f" --project={project_id}") + + print("\n2. Grant necessary roles:") + print(f"gcloud projects add-iam-policy-binding {project_id} \\") + print( + f" --member='serviceAccount:bacalhau-bigquery@{project_id}.iam.gserviceaccount.com' \\" + ) + print(" --role='roles/bigquery.dataEditor'") + print(f"gcloud projects add-iam-policy-binding {project_id} \\") + print( + f" --member='serviceAccount:bacalhau-bigquery@{project_id}.iam.gserviceaccount.com' \\" + ) + print(" --role='roles/bigquery.jobUser'") + + print("\n3. 
Download service account key:") + print("gcloud iam service-accounts keys create credentials.json \\") + print( + f" --iam-account=bacalhau-bigquery@{project_id}.iam.gserviceaccount.com \\" + ) + print(f" --project={project_id}") + + print("\nAfter running these commands, run this script again to continue setup.") + + +def interactive_setup(): + """Guide the user through the setup process.""" + print("\n=== Bacalhau BigQuery Setup ===\n") + + # Check if config exists + if os.path.exists("config.yaml"): + if not prompt_yes_no("config.yaml already exists. Do you want to reconfigure?"): + return load_or_create_config("config.yaml") + + config = DEFAULT_CONFIG.copy() + + # Project configuration + print("\n1. Project Configuration") + print("-----------------------") + project_id = input( + "Enter your Google Cloud project ID (or press Enter to create new): " + ).strip() + + if not project_id: + # Generate a project ID with timestamp + base_id = "bq" # Shorter base name to allow for timestamp + default_id = generate_project_id(base_id) + project_id = input(f"Enter new project ID (default: {default_id}): ").strip() + project_id = project_id if project_id else default_id + + # Create the project + new_project_id = create_project(project_id) + if not new_project_id: + print("\nPlease create the project manually using:") + print(f"gcloud projects create {project_id}") + sys.exit(1) + + config["project"]["id"] = new_project_id + config["project"]["create_if_missing"] = False # Project is already created + + # Save configuration immediately after project creation + with open("config.yaml", "w") as f: + yaml.dump(config, f, default_flow_style=False) + print("\nConfiguration saved to config.yaml with project ID:", new_project_id) + + # Print instructions for creating service account + print_credentials_instructions(new_project_id) + sys.exit(0) + else: + config["project"]["id"] = project_id + config["project"]["create_if_missing"] = prompt_yes_no( + "Create project if it doesn't exist?" + ) + + # Credentials setup + print("\n2. Credentials Setup") + print("------------------") + while True: + creds_path = input( + "\nEnter the path to your credentials file (default: credentials.json): " + ).strip() + if not creds_path: + creds_path = "credentials.json" + + if os.path.exists(creds_path): + config["credentials"]["path"] = creds_path + break + else: + print(f"Error: File not found at {creds_path}") + print_credentials_instructions(project_id) + sys.exit(1) + + # BigQuery configuration + print("\n3. 
BigQuery Configuration") + print("----------------------") + dataset = input("Enter dataset name (default: log_analytics): ").strip() + if dataset: + config["bigquery"]["dataset_name"] = dataset + + table = input("Enter table name (default: log_results): ").strip() + if table: + config["bigquery"]["table_name"] = table + + # Save final configuration + with open("config.yaml", "w") as f: + yaml.dump(config, f, default_flow_style=False) + + print("\nConfiguration saved to config.yaml") + return config + + +def load_or_create_config(config_path): + """Load configuration from YAML file or create if doesn't exist.""" + if os.path.exists(config_path): + with open(config_path, "r") as f: + return yaml.safe_load(f) + + # If no config exists, run interactive setup + return interactive_setup() + + +def validate_config(config): + """Validate the configuration has all required fields.""" + required_fields = [ + ("project.id", lambda c: c.get("project", {}).get("id")), + ("credentials.path", lambda c: c.get("credentials", {}).get("path")), + ] + + for field, getter in required_fields: + if not getter(config) or getter(config) == DEFAULT_CONFIG["project"]["id"]: + print(f"Missing or invalid required field: {field}") + if prompt_yes_no("Would you like to run the interactive setup?"): + return interactive_setup() + sys.exit(1) + + +def setup_bigquery(config, credentials): + """Setup BigQuery resources.""" + client = bigquery.Client(credentials=credentials, project=config["project"]["id"]) + + # Create dataset if it doesn't exist + dataset_id = f"{config['project']['id']}.{config['bigquery']['dataset_name']}" + dataset = bigquery.Dataset(dataset_id) + dataset.location = config["bigquery"]["location"] + + try: + dataset = client.create_dataset(dataset, exists_ok=True) + print(f"Dataset {dataset_id} created or already exists.") + except Exception as e: + print(f"Error creating dataset: {e}") + return False + + # Create table if it doesn't exist + schema = [ + bigquery.SchemaField("projectID", "STRING"), + bigquery.SchemaField("region", "STRING"), + bigquery.SchemaField("nodeName", "STRING"), + bigquery.SchemaField("syncTime", "STRING"), + bigquery.SchemaField("remote_log_id", "STRING"), + bigquery.SchemaField("timestamp", "STRING"), + bigquery.SchemaField("version", "STRING"), + bigquery.SchemaField("message", "STRING"), + ] + + table_id = f"{dataset_id}.{config['bigquery']['table_name']}" + table = bigquery.Table(table_id, schema=schema) + try: + table = client.create_table(table, exists_ok=True) + print(f"Table {table_id} created or already exists.") + except Exception as e: + print(f"Error creating table: {e}") + return False + + return True + + +def main(): + parser = argparse.ArgumentParser(description="Setup BigQuery resources") + parser.add_argument( + "--config", default="config.yaml", help="Path to configuration file" + ) + parser.add_argument( + "--interactive", "-i", action="store_true", help="Run interactive setup" + ) + args = parser.parse_args() + + print("\nSetting up BigQuery integration...") + + # Try to load existing config first + config = None + if os.path.exists(args.config): + try: + with open(args.config, "r") as f: + config = yaml.safe_load(f) + except Exception as e: + print(f"Error reading config file: {e}") + + # If no config or interactive mode, run setup + if config is None or args.interactive: + config = interactive_setup() + + # Validate configuration + validate_config(config) + + project_id = config["project"]["id"] + + # First, ensure project exists + try: + # Try to create 
client without credentials first + client = resourcemanager.ProjectsClient() + project = client.get_project(name=f"projects/{project_id}") + print(f"Project {project_id} exists.") + except exceptions.NotFound: + print(f"\nProject {project_id} does not exist.") + if config["project"].get("create_if_missing", False): + new_project_id = create_project(project_id) + if not new_project_id: + sys.exit(1) + # Update config with new project ID + config["project"]["id"] = new_project_id + project_id = new_project_id + # Save updated config + with open(args.config, "w") as f: + yaml.dump(config, f, default_flow_style=False) + print(f"Updated config.yaml with new project ID: {new_project_id}") + else: + if prompt_yes_no("Would you like to create the project now?"): + new_project_id = create_project(project_id) + if not new_project_id: + sys.exit(1) + # Update config with new project ID + config["project"]["id"] = new_project_id + project_id = new_project_id + # Save updated config + with open(args.config, "w") as f: + yaml.dump(config, f, default_flow_style=False) + print(f"Updated config.yaml with new project ID: {new_project_id}") + else: + sys.exit(1) + except exceptions.PermissionDenied: + print(f"\nUnable to verify project {project_id} - insufficient permissions.") + print("Please run: gcloud auth login") + print("Then try again.") + sys.exit(1) + + # Now check for credentials + creds_path = os.path.expanduser(config["credentials"]["path"]) + if not os.path.exists(creds_path): + print(f"\nCredentials file not found at {creds_path}") + # Ensure config is saved before showing instructions + if not os.path.exists(args.config): + with open(args.config, "w") as f: + yaml.dump(config, f, default_flow_style=False) + print(f"Created config.yaml with project ID: {project_id}") + print_credentials_instructions(project_id) + sys.exit(1) + + # Enable required APIs using gcloud credentials (not service account) + print("\nEnabling required APIs using your gcloud credentials...") + if not enable_project_apis(project_id, None): + print( + "\nFailed to enable APIs. Please ensure you have the right permissions and try again." + ) + print("You may need to run: gcloud auth login") + sys.exit(1) + + # Setup credentials for BigQuery operations + credentials = service_account.Credentials.from_service_account_file( + creds_path, + scopes=["https://www.googleapis.com/auth/cloud-platform"], + ) + + # Setup BigQuery resources + if setup_bigquery(config, credentials): + print("\nBigQuery setup completed successfully!") + print( + f"\nYou can now use the following project ID in your queries: {project_id}" + ) + else: + print("\nBigQuery setup failed") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/data-engineering/bigquery-with-bacalhau/utility_scripts/setup_aggregation_tables.sh b/data-engineering/bigquery-with-bacalhau/utility_scripts/setup_aggregation_tables.sh new file mode 100755 index 00000000..72a98994 --- /dev/null +++ b/data-engineering/bigquery-with-bacalhau/utility_scripts/setup_aggregation_tables.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +# Exit on error +set -e + +# Read project ID from config.yaml +PROJECT_ID=$(python3 -c "import yaml; print(yaml.safe_load(open('config.yaml'))['project']['id'])") + +echo "Creating aggregation tables in project: $PROJECT_ID" + +# Create table for 5-minute aggregated logs +echo "Creating aggregated logs table..." 
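# Note: BigQuery DDL requires an element type for ARRAY columns; the bare `messages ARRAY`
# declaration below would be rejected and, to match the array_agg(message) output in
# process.py, would normally read something like `messages ARRAY<STRING>`.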
+bq query --use_legacy_sql=false \
+"CREATE TABLE IF NOT EXISTS \`$PROJECT_ID.log_analytics.log_aggregates\`
+(
+    project_id STRING,
+    region STRING,
+    nodeName STRING,
+    provider STRING,
+    hostname STRING,
+    time_window TIMESTAMP,
+    log_count INT64,
+    messages ARRAY<STRING>
+)"
+
+# Create table for emergency events
+echo "Creating emergency logs table..."
+bq query --use_legacy_sql=false \
+"CREATE TABLE IF NOT EXISTS \`$PROJECT_ID.log_analytics.emergency_logs\`
+(
+    project_id STRING,
+    region STRING,
+    nodeName STRING,
+    provider STRING,
+    hostname STRING,
+    timestamp TIMESTAMP,
+    version STRING,
+    message STRING,
+    remote_log_id STRING,
+    alert_level STRING,
+    public_ip STRING,
+    private_ip STRING
+)"
+
+echo "Done. Created tables:"
+echo "- $PROJECT_ID.log_analytics.log_aggregates (5-minute windows)"
+echo "- $PROJECT_ID.log_analytics.emergency_logs (immediate alerts)"
+echo
+echo "To use aggregation mode, set environment variable:"
+echo "AGGREGATE_LOGS=true"
\ No newline at end of file
diff --git a/data-engineering/bigquery-with-bacalhau/utility_scripts/setup_log_uploader.sh b/data-engineering/bigquery-with-bacalhau/utility_scripts/setup_log_uploader.sh
new file mode 100755
index 00000000..d203f9f9
--- /dev/null
+++ b/data-engineering/bigquery-with-bacalhau/utility_scripts/setup_log_uploader.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+
+# Exit on error
+set -e
+
+# Read project ID from config.yaml
+PROJECT_ID=$(python3 -c "import yaml; print(yaml.safe_load(open('../config.yaml'))['project']['id'])")
+
+echo "Setting up log uploader service account for project: $PROJECT_ID"
+
+# Create a service account specifically for log uploads
+SA_NAME="log-uploader"
+SA_EMAIL="$SA_NAME@$PROJECT_ID.iam.gserviceaccount.com"
+
+echo "Creating service account..."
+gcloud iam service-accounts create $SA_NAME \
+    --display-name="Log Uploader Service Account" \
+    --description="Restricted service account for uploading logs to BigQuery" \
+    --project=$PROJECT_ID
+
+# Create a custom role with minimal permissions
+echo "Creating custom role..."
+gcloud iam roles create logUploader \
+    --project=$PROJECT_ID \
+    --title="Log Uploader" \
+    --description="Custom role for uploading logs to BigQuery" \
+    --permissions=bigquery.tables.get,bigquery.tables.updateData \
+    --stage=GA
+
+# Bind the role to the service account
+echo "Binding role to service account..."
+gcloud projects add-iam-policy-binding $PROJECT_ID \
+    --member="serviceAccount:$SA_EMAIL" \
+    --role="projects/$PROJECT_ID/roles/logUploader"
+
+# Create and download a key
+echo "Creating service account key..."
+gcloud iam service-accounts keys create log-uploader-key.json \
+    --iam-account=$SA_EMAIL \
+    --project=$PROJECT_ID
+
+# Rename the key to the credentials filename expected by the uploader
+mv log-uploader-key.json log_uploader_credentials.json
+
+echo "Done. 
Service account key saved to ./log_uploader_credentials.json" +echo "This service account has minimal permissions:" +echo "- Can only write to BigQuery tables" +echo "- Cannot create/modify table schema" +echo "- Cannot read data from tables" +echo "- Cannot access any other GCP services" \ No newline at end of file diff --git a/data-engineering/bigquery-with-bacalhau/utility_scripts/test_bigquery.py b/data-engineering/bigquery-with-bacalhau/utility_scripts/test_bigquery.py new file mode 100755 index 00000000..65c722aa --- /dev/null +++ b/data-engineering/bigquery-with-bacalhau/utility_scripts/test_bigquery.py @@ -0,0 +1,147 @@ +#!/usr/bin/env uv run -s +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "google-cloud-bigquery", +# "google-oauth", +# "pyyaml", +# "google-api-core", +# ] +# /// + +import yaml +from google.cloud import bigquery +from google.oauth2 import service_account + +# Load the project-id from the config.yaml file +# If yaml does not exist, warn that they need to run setup.py and configure config.yaml + +try: + with open("config.yaml", "r") as file: + config = yaml.safe_load(file) + project_id = config.get("project", {}).get("id", None) + if not project_id: + print( + "config.yaml is missing the project.id field. Please run setup.py and configure config.yaml." + ) + exit(1) +except FileNotFoundError: + print("config.yaml not found. Please run setup.py and configure config.yaml.") + exit(1) + +# Setup credentials +credentials = service_account.Credentials.from_service_account_file( + "credentials.json", + scopes=["https://www.googleapis.com/auth/cloud-platform"], +) + + +def print_separator(): + print("\n" + "=" * 80 + "\n") + + +def run_query(client, query, title): + print(f"Running query: {title}") + print("\nSQL:") + print(query) + print("\nResults:") + + query_job = client.query(query) + results = query_job.result() + + # Print results in a tabular format + rows = list(results) + if not rows: + print("No results found") + return + + # Get column names + columns = [field.name for field in results.schema] + + # Calculate column widths + widths = {col: len(col) for col in columns} + for row in rows: + for col in columns: + widths[col] = max(widths[col], len(str(getattr(row, col)))) + + # Print header + header = " | ".join(col.ljust(widths[col]) for col in columns) + print("-" * len(header)) + print(header) + print("-" * len(header)) + + # Print rows + for row in rows: + print(" | ".join(str(getattr(row, col)).ljust(widths[col]) for col in columns)) + + print("-" * len(header)) + print(f"Total rows: {len(rows)}") + + +def main(): + # Load config + with open("config.yaml", "r") as f: + config = yaml.safe_load(f) + + # Setup credentials + credentials = service_account.Credentials.from_service_account_file( + config["credentials"]["path"], + scopes=["https://www.googleapis.com/auth/cloud-platform"], + ) + + # Create BigQuery client + client = bigquery.Client(credentials=credentials, project=config["project"]["id"]) + + project_id = config["project"]["id"] + dataset_id = f"{project_id}.{config['bigquery']['dataset_name']}" + table_id = f"{dataset_id}.{config['bigquery']['table_name']}" + + print_separator() + print(f"Querying BigQuery table: {table_id}") + print_separator() + + # Query 1: Show table structure + schema_query = f""" + SELECT + column_name, + data_type, + is_nullable + FROM {dataset_id}.INFORMATION_SCHEMA.COLUMNS + WHERE table_name = '{config["bigquery"]["table_name"]}' + ORDER BY ordinal_position + """ + run_query(client, schema_query, "Table 
Structure") + print_separator() + + # Query 2: Count total rows + count_query = f""" + SELECT COUNT(*) as total_rows + FROM `{table_id}` + """ + run_query(client, count_query, "Total Row Count") + print_separator() + + # Query 3: Sample recent logs + recent_logs_query = f""" + SELECT timestamp, nodeName, message + FROM `{table_id}` + ORDER BY timestamp DESC + LIMIT 5 + """ + run_query(client, recent_logs_query, "Recent Logs") + print_separator() + + # Query 4: Count logs by node + node_count_query = f""" + SELECT + nodeName, + COUNT(*) as log_count + FROM `{table_id}` + GROUP BY nodeName + ORDER BY log_count DESC + """ + run_query(client, node_count_query, "Logs per Node") + + +if __name__ == "__main__": + main() diff --git a/data-engineering/using-duckdb-with-bacalhau/prep_data/download_data_job.yaml b/data-engineering/using-duckdb-with-bacalhau/prep_data/download_data_job.yaml index d8b23432..765e80ce 100644 --- a/data-engineering/using-duckdb-with-bacalhau/prep_data/download_data_job.yaml +++ b/data-engineering/using-duckdb-with-bacalhau/prep_data/download_data_job.yaml @@ -8,7 +8,7 @@ Tasks: Parameters: - wget - -O - - /bacalhau_data/$(basename "{{ .url_to_download }}") + - /bacalhau_data/{{ .filename }} - "{{ .url_to_download }}" Type: docker Name: download-data-job @@ -29,5 +29,6 @@ Tasks: CPU: 250m Memory: 250m Timeouts: {} -Type: batch -Count: 4 + Constraits: + - INSTANCE_ID=7362301696299350469 +Type: daemon diff --git a/data-engineering/using-duckdb-with-bacalhau/prep_data/run_download_jobs.sh b/data-engineering/using-duckdb-with-bacalhau/prep_data/run_download_jobs.sh index 8e71c1c8..b1475290 100755 --- a/data-engineering/using-duckdb-with-bacalhau/prep_data/run_download_jobs.sh +++ b/data-engineering/using-duckdb-with-bacalhau/prep_data/run_download_jobs.sh @@ -21,7 +21,7 @@ for url in "${urls[@]}"; do echo "Processing URL: $url" # Run the bacalhau job with the URL as template variable - bacalhau job run download_data_job.yaml --template-vars="url_to_download=$url" + bacalhau job run download_data_job.yaml --template-vars="url_to_download=$url" --template-vars="filename=$(basename $url)" # Add a small delay between job submissions sleep 1 diff --git a/data-engineering/using-duckdb-with-bacalhau/window_query_complex.sql b/data-engineering/using-duckdb-with-bacalhau/window_query_complex.sql index a4708a20..a70b3156 100644 --- a/data-engineering/using-duckdb-with-bacalhau/window_query_complex.sql +++ b/data-engineering/using-duckdb-with-bacalhau/window_query_complex.sql @@ -1 +1,25 @@ -SELECT DATE_TRUNC('hour', tpep_pickup_datetime) + INTERVAL (FLOOR(EXTRACT(MINUTE FROM tpep_pickup_datetime) / 5) * 5) MINUTE AS interval_start, COUNT(*) AS ride_count FROM yellow_taxi_trips GROUP BY interval_start ORDER BY interval_start; \ No newline at end of file +WITH intervals AS ( + SELECT + DATE_TRUNC('hour', tpep_pickup_datetime) AS pickup_hour, + FLOOR(EXTRACT(MINUTE FROM tpep_pickup_datetime) / 5) * 5 AS pickup_minute + FROM + your_table_name +) +SELECT + pickup_hour + INTERVAL (pickup_minute) MINUTE AS interval_start, + AVG(ride_count) AS avg_rides_per_5min +FROM ( + SELECT + pickup_hour, + pickup_minute, + COUNT(*) AS ride_count + FROM + intervals + GROUP BY + pickup_hour, + pickup_minute +) AS ride_counts +GROUP BY + interval_start +ORDER BY + interval_start; \ No newline at end of file diff --git a/scale-tester/.envrc b/scale-tester/.envrc index abe7884f..0f210bb5 100644 --- a/scale-tester/.envrc +++ b/scale-tester/.envrc @@ -1,6 +1,2 @@ -# shellcheck disable=SC1090 -. 
<( flox activate; ); - - export GOPATH=$HOME/go export PATH=$PATH:$GOPATH/bin diff --git a/scale-tester/aws_spot/.cspell/custom-dictionary.txt b/scale-tester/aws_spot/.cspell/custom-dictionary.txt index d232366c..61a3241a 100644 --- a/scale-tester/aws_spot/.cspell/custom-dictionary.txt +++ b/scale-tester/aws_spot/.cspell/custom-dictionary.txt @@ -2,4 +2,7 @@ bacalhau bacalhauproject CPUS dind +levelname oneshot +pythonjsonlogger +vcpus diff --git a/scale-tester/aws_spot/.gitignore b/scale-tester/aws_spot/.gitignore new file mode 100644 index 00000000..3dd73fa4 --- /dev/null +++ b/scale-tester/aws_spot/.gitignore @@ -0,0 +1,47 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Virtual Environment +.env +.venv +env/ +venv/ +ENV/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo + +# AWS +config/aws-spot-env.sh +manifest.json + +# Logs +*.log +/tmp/ + +# OS +.DS_Store +Thumbs.db +.aider* diff --git a/scale-tester/aws_spot/README.md b/scale-tester/aws_spot/README.md index f3a067b2..1f7416fe 100644 --- a/scale-tester/aws_spot/README.md +++ b/scale-tester/aws_spot/README.md @@ -1,3 +1,187 @@ -packer init . -packer validate . -packer build . \ No newline at end of file +# Bacalhau Scale Tester + +A tool for testing the scalability of Bacalhau nodes on AWS spot instances. This tool can launch, manage, and monitor large numbers of Bacalhau nodes, with built-in health checking and stress testing capabilities. + +## Features + +- Launch and manage AWS spot instances running Bacalhau nodes +- Monitor instance health and status +- Run automated stress tests with configurable parameters +- Beautiful CLI interface with progress bars and live updates +- Comprehensive logging and debugging options + +## Prerequisites + +- Python 3.10 or higher +- AWS CLI configured with appropriate credentials +- [uv](https://github.com/astral-sh/uv) for dependency management (recommended) +- [Packer](https://www.packer.io/) for building AMIs + +## Project Structure + +``` +bacalhau-scale-tester/ +├── ami/ # AMI Creation Workflow +│ ├── packer/ # Packer configuration +│ │ ├── main.pkr.hcl +│ │ └── variables.pkr.hcl +│ ├── files/ # Files included in AMI +│ │ ├── bacalhau/ +│ │ │ ├── startup.service +│ │ │ └── config.yaml +│ │ └── docker/ +│ │ └── compose.yml +│ └── scripts/ # AMI build scripts +│ ├── build.sh +│ └── setup.sh +├── aws/ # AWS Resource Management +│ ├── config/ # AWS configurations +│ │ ├── env.sh.example +│ │ └── env.sh +│ ├── keys/ # SSH keys (gitignored) +│ │ └── README.md +│ └── scripts/ # AWS setup scripts +│ ├── setup-iam.sh +│ └── upload-ssm.sh +├── fleet/ # Spot Fleet Management +│ ├── bin/ # Command-line tools +│ │ └── spot-manager +│ ├── src/ # Python implementation +│ │ └── spot_manager.py +│ ├── scripts/ # Fleet management scripts +│ │ └── startup.sh +│ └── examples/ # Example jobs +│ └── pusher/ # Pusher job example +│ ├── job.yaml +│ ├── env.yaml +│ └── README.md +├── pyproject.toml # Python project config +├── requirements.txt # Python dependencies +└── README.md # Main documentation +``` + +## Workflows + +### 1. Creating a New AMI + +To create a new AMI for your Bacalhau nodes: + +```bash +# 1. Configure AMI settings +vim ami/packer/variables.pkr.hcl + +# 2. Build the AMI +cd ami +./scripts/build.sh +cd .. +``` + +### 2. Setting Up AWS Resources + +Before running spot instances, set up required AWS resources: + +```bash +# 1. 
Configure AWS environment +cp aws/config/env.sh.example aws/config/env.sh +vim aws/config/env.sh + +# 2. Create SSH key pair +cd aws/keys +aws ec2 create-key-pair --key-name BacalhauScaleTestKey --query 'KeyMaterial' --output text > BacalhauScaleTestKey.pem +chmod 600 BacalhauScaleTestKey.pem +cd ../.. + +# 3. Set up IAM roles and upload configs +cd aws/scripts +./setup-iam.sh +./upload-ssm.sh +cd ../.. +``` + +### 3. Managing Spot Fleet + +The spot manager provides a CLI for managing your fleet: + +```bash +# The spot-manager script handles environment setup +./fleet/bin/spot-manager --help + +# Launch instances +./fleet/bin/spot-manager launch --count 5 + +# List running instances +./fleet/bin/spot-manager list + +# Run stress test +./fleet/bin/spot-manager stress-test \ + --min-nodes 100 \ + --max-nodes 500 \ + --iterations 5 \ + --health-timeout 300 + +# Terminate all instances +./fleet/bin/spot-manager terminate-all +``` + +### Stress Test Options + +- `--min-nodes`: Minimum number of nodes per iteration (default: 250) +- `--max-nodes`: Maximum number of nodes per iteration (default: 750) +- `--iterations`: Number of test iterations (default: 10) +- `--health-timeout`: Timeout in seconds for health checks (default: 300) + +### Debug Mode + +Add `--debug` to any command to enable detailed logging: +```bash +./fleet/bin/spot-manager --debug launch --count 5 +``` + +## Development Setup + +1. Clone the repository: +```bash +git clone https://github.com/bacalhau-project/bacalhau-scale-tester.git +cd bacalhau-scale-tester +``` + +2. Set up the environment: +```bash +# Using uv (recommended) +uv venv +source .venv/bin/activate +uv pip install -r requirements.txt + +# Or using pip +python -m venv .venv +source .venv/bin/activate +pip install -r requirements.txt +``` + +## Example Jobs + +See the `fleet/examples/` directory for example job configurations: + +### Pusher Job +Located in `fleet/examples/pusher/`, this example demonstrates how to set up a job that pushes events to a monitoring system. See its README for detailed setup instructions. + +## Development + +The project uses: +- [Rich](https://rich.readthedocs.io/) for beautiful terminal output +- [Click](https://click.palletsprojects.com/) for CLI interface +- [Boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) for AWS interaction +- [aiohttp](https://docs.aiohttp.org/) for async health checks +- [Packer](https://www.packer.io/) for AMI building + +## Contributing + +1. Fork the repository +2. Create a feature branch +3. Commit your changes +4. Push to the branch +5. Create a Pull Request + +## License + +This project is licensed under the Apache License 2.0 - see the LICENSE file for details. 
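+
+## Appendix: Running the Manager Without the Wrapper
+
+The `fleet/bin/spot-manager` wrapper only sources `aws/config/env.sh` and then invokes the Python implementation, so the manager can also be run directly. A minimal sketch, assuming `uv` is installed and `aws/config/env.sh` has already been filled in:
+
+```bash
+# Load the same AWS environment the wrapper would load
+source aws/config/env.sh
+
+# spot_manager.py declares its dependencies in an inline uv script header,
+# so uv resolves rich, boto3, click, etc. on the fly
+uv run -s fleet/src/spot_manager.py --debug list
+```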
\ No newline at end of file diff --git a/scale-tester/aws_spot/files/docker-compose.yml b/scale-tester/aws_spot/ami/files/docker/compose.yml similarity index 100% rename from scale-tester/aws_spot/files/docker-compose.yml rename to scale-tester/aws_spot/ami/files/docker/compose.yml diff --git a/scale-tester/aws_spot/main.pkr.hcl b/scale-tester/aws_spot/ami/packer/main.pkr.hcl similarity index 100% rename from scale-tester/aws_spot/main.pkr.hcl rename to scale-tester/aws_spot/ami/packer/main.pkr.hcl diff --git a/scale-tester/aws_spot/variables.pkr.hcl b/scale-tester/aws_spot/ami/packer/variables.pkr.hcl similarity index 100% rename from scale-tester/aws_spot/variables.pkr.hcl rename to scale-tester/aws_spot/ami/packer/variables.pkr.hcl diff --git a/scale-tester/aws_spot/build-ami.sh b/scale-tester/aws_spot/ami/scripts/build.sh similarity index 100% rename from scale-tester/aws_spot/build-ami.sh rename to scale-tester/aws_spot/ami/scripts/build.sh diff --git a/scale-tester/aws_spot/setup.sh b/scale-tester/aws_spot/ami/scripts/setup.sh similarity index 100% rename from scale-tester/aws_spot/setup.sh rename to scale-tester/aws_spot/ami/scripts/setup.sh diff --git a/scale-tester/aws_spot/aws/config/env.sh b/scale-tester/aws_spot/aws/config/env.sh new file mode 100644 index 00000000..65dbf332 --- /dev/null +++ b/scale-tester/aws_spot/aws/config/env.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash +# aws-spot-env.sh +# +# This file sets environment variables used for launching +# 1,000 AWS Spot Instances with Docker installed. +# +# Usage: +# source ./aws-spot-env.sh + +# AWS CLI & Region +export AWS_REGION="us-west-2" + +# Key Pair +export KEY_NAME="BacalhauScaleTestKey" + +# Security Group +export SECURITY_GROUP_NAME="bacalhau-scale-test-group" +export SECURITY_GROUP_DESC="Security group for Bacalhau Scale Spot Instances" + +# Your public IP for SSH ingress (CIDR /32) +export MY_PUBLIC_IP=$(curl -s ifconfig.me) + +# Base AMI to use (Amazon Linux 2 example) +# aws ssm get-parameters --names /aws/service/ami-amazon-linux-latest/amzn2-ami-hvm-x86_64-gp2 --region us-east-1 +export BASE_AMI_ID="ami-07d9cf938edb0739b" +export CONFIGURED_AMI_ID="ami-06e47c5231eb29362" + +# Instance Type +export INSTANCE_TYPE="t3.micro" + +# Scaling Limits +export SPOT_INSTANCE_COUNT="4" +export MAX_INSTANCES="1000" +export MAX_INSTANCES_PER_LAUNCH="100" +export MIN_INSTANCES="1" +export MAX_TOTAL_VCPUS="10000" +export MAX_TOTAL_MEMORY="100000" # In GB + +# Custom AMI details (if building your own) +export CUSTOM_AMI_NAME="bacalhau-scale-test-ami" +export CUSTOM_AMI_DESCRIPTION="AMI with Docker and Bacalhau preinstalled" + +# Tags +export INSTANCE_TAG_KEY="Name" +export INSTANCE_TAG_VALUE="bacalhau-scale-test" + +echo "Environment variables for AWS Spot Instances set." diff --git a/scale-tester/aws_spot/aws/keys/README.md b/scale-tester/aws_spot/aws/keys/README.md new file mode 100644 index 00000000..7f1f98a9 --- /dev/null +++ b/scale-tester/aws_spot/aws/keys/README.md @@ -0,0 +1,29 @@ +# AWS SSH Keys + +This directory contains SSH key pairs for accessing AWS instances. These files are sensitive and should never be committed to version control. + +## Required Keys + +1. 
`BacalhauScaleTestKey.pem` - Main SSH key pair for accessing spot instances + - Generated when running setup scripts + - Must be kept private and secure + - Should have permissions set to 600 (`chmod 600 BacalhauScaleTestKey.pem`) + +## Security Notes + +- Never commit these keys to git +- Keep backups in a secure location +- Rotate keys regularly +- Ensure proper file permissions + +## Setup + +To create a new key pair: + +1. Use AWS Console: + ```bash + aws ec2 create-key-pair --key-name BacalhauScaleTestKey --query 'KeyMaterial' --output text > BacalhauScaleTestKey.pem + chmod 600 BacalhauScaleTestKey.pem + ``` + +2. Update the key name in `aws/config/env.sh` diff --git a/scale-tester/aws_spot/setup-iam.sh b/scale-tester/aws_spot/aws/scripts/setup-iam.sh similarity index 100% rename from scale-tester/aws_spot/setup-iam.sh rename to scale-tester/aws_spot/aws/scripts/setup-iam.sh diff --git a/scale-tester/aws_spot/upload-to-ssm.sh b/scale-tester/aws_spot/aws/scripts/upload-to-ssm.sh similarity index 100% rename from scale-tester/aws_spot/upload-to-ssm.sh rename to scale-tester/aws_spot/aws/scripts/upload-to-ssm.sh diff --git a/scale-tester/aws_spot/files/bacalhau-startup.service b/scale-tester/aws_spot/files/bacalhau-startup.service deleted file mode 100644 index ec988e93..00000000 --- a/scale-tester/aws_spot/files/bacalhau-startup.service +++ /dev/null @@ -1,14 +0,0 @@ -[Unit] -Description=Bacalhau Startup Script -After=docker.service network-online.target -Wants=network-online.target - -[Service] -Type=oneshot -ExecStart=/bacalhau_node/startup.sh -RemainAfterExit=yes -StandardOutput=journal -StandardError=journal - -[Install] -WantedBy=multi-user.target \ No newline at end of file diff --git a/scale-tester/aws_spot/files/orchestrator-config.yaml b/scale-tester/aws_spot/files/orchestrator-config.yaml deleted file mode 100644 index 82e1ead4..00000000 --- a/scale-tester/aws_spot/files/orchestrator-config.yaml +++ /dev/null @@ -1,17 +0,0 @@ -NameProvider: "uuid" -API: - Port: 1234 -Compute: - Enabled: true - Orchestrators: - - nats://ns101607.ip-147-135-16.us:4222 - Auth: - Token: 93182ba0-6a4a-4c5b-9554-deb0b19ee71f - AllowListedLocalPaths: - - /bacalhau_data:rw - Engine: - Resources: - CPU: 1 - Memory: 1GB -JobAdmissionControl: - AcceptNetworkedJobs: true diff --git a/scale-tester/aws_spot/fleet/bin/spot-manager b/scale-tester/aws_spot/fleet/bin/spot-manager new file mode 100755 index 00000000..3f0f8d9f --- /dev/null +++ b/scale-tester/aws_spot/fleet/bin/spot-manager @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +# +# Convenience wrapper for running the spot manager +# Ensures proper environment and paths are set up + +# Get the directory this script is in +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +PROJECT_ROOT="$( cd "$SCRIPT_DIR/../.." 
&& pwd )"
+
+# Source AWS environment if it exists
+AWS_ENV_FILE="$PROJECT_ROOT/aws/config/env.sh"
+if [ -f "$AWS_ENV_FILE" ]; then
+    source "$AWS_ENV_FILE"
+else
+    echo "Error: AWS environment file not found at $AWS_ENV_FILE"
+    echo "Please copy aws/config/env.sh.example to aws/config/env.sh and configure it"
+    exit 1
+fi
+
+# Run the spot manager
+"$PROJECT_ROOT/fleet/src/spot_manager.py" "$@"
\ No newline at end of file
diff --git a/scale-tester/aws_spot/fleet/examples/pusher/README.md b/scale-tester/aws_spot/fleet/examples/pusher/README.md
new file mode 100644
index 00000000..4c7f1924
--- /dev/null
+++ b/scale-tester/aws_spot/fleet/examples/pusher/README.md
@@ -0,0 +1,44 @@
+# Pusher Job Example
+
+This example demonstrates how to set up a job that pushes events to a monitoring system. It's useful for monitoring the health and performance of your Bacalhau nodes.
+
+## Files
+
+- `pusher-job.yaml` - Main job configuration for the event pusher
+- `env_writer.yaml` - Environment configuration for the pusher
+- `env.txt` - Environment variables (create from env.txt.example)
+- `pusher_env.txt.b64` - Base64 encoded environment variables
+
+## Setup
+
+1. Configure environment:
+```bash
+# Copy example config
+cp env.txt.example env.txt
+
+# Edit with your settings
+vim env.txt
+
+# Create base64 encoded version
+base64 env.txt > pusher_env.txt.b64
+```
+
+2. Deploy the job:
+```bash
+# Run the job
+bacalhau job run pusher-job.yaml
+
+# Verify it's running
+bacalhau job list
+```
+
+## Configuration
+
+The pusher job requires the following environment variables:
+
+- `PUSHER_ENDPOINT` - Endpoint to push events to
+- `PUSHER_TOKEN` - Authentication token
+- `PUSHER_INTERVAL` - Push interval in seconds
+- `PUSHER_BATCH_SIZE` - Number of events per batch
+
+See `env.txt.example` for a complete list of options.
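+
+## Example env.txt
+
+A minimal sketch of what `env.txt` could contain, based on the variables listed above (the endpoint and token values are placeholders, not real settings):
+
+```bash
+PUSHER_ENDPOINT=https://events.example.com/push
+PUSHER_TOKEN=replace-with-your-token
+PUSHER_INTERVAL=30
+PUSHER_BATCH_SIZE=100
+```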
diff --git a/scale-tester/aws_spot/env_writer.yaml b/scale-tester/aws_spot/fleet/examples/pusher/env_writer.yaml similarity index 100% rename from scale-tester/aws_spot/env_writer.yaml rename to scale-tester/aws_spot/fleet/examples/pusher/env_writer.yaml diff --git a/scale-tester/aws_spot/pusher-job.yaml b/scale-tester/aws_spot/fleet/examples/pusher/pusher-job.yaml similarity index 100% rename from scale-tester/aws_spot/pusher-job.yaml rename to scale-tester/aws_spot/fleet/examples/pusher/pusher-job.yaml diff --git a/scale-tester/aws_spot/fleet/examples/pusher/pusher_env.txt.b64 b/scale-tester/aws_spot/fleet/examples/pusher/pusher_env.txt.b64 new file mode 100644 index 00000000..8d75863c --- /dev/null +++ b/scale-tester/aws_spot/fleet/examples/pusher/pusher_env.txt.b64 @@ -0,0 +1 @@ +QVdTX0FDQ0VTU19LRVlfSUQ9QUtJQTNGTERZT0JGQ01XTjRTRkMKQVdTX1NFQ1JFVF9BQ0NFU1NfS0VZPWpsTlU0VE1FcUhkWGZhNWdyYWY4MXBkQWtORGE2RG9raUk3UzJNOEoKQVdTX1JFR0lPTj11cy1lYXN0LTIKU1FTX1FVRVVFX1VSTD1odHRwczovL3Nxcy51cy1lYXN0LTIuYW1hem9uYXdzLmNvbS83NjczOTc3NTI5MDYvc2NhbGUtdGVzdGVyLWV2ZW50cy5maWZvCkNPTE9SPSNGRjAzMjEK diff --git a/scale-tester/aws_spot/scripts/startup.sh b/scale-tester/aws_spot/fleet/scripts/startup.sh similarity index 100% rename from scale-tester/aws_spot/scripts/startup.sh rename to scale-tester/aws_spot/fleet/scripts/startup.sh diff --git a/scale-tester/aws_spot/fleet/src/spot_manager.py b/scale-tester/aws_spot/fleet/src/spot_manager.py new file mode 100755 index 00000000..9af52bda --- /dev/null +++ b/scale-tester/aws_spot/fleet/src/spot_manager.py @@ -0,0 +1,2752 @@ +#!/usr/bin/env uv run -s +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "rich", +# "boto3", +# "click", +# "aiohttp", +# "python-dotenv", +# "python-json-logger", +# ] +# /// + +import asyncio +import json +import logging +import os +import random +import re +import signal +import subprocess +import sys +import time +from collections import deque +from datetime import datetime +from functools import wraps +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +import aiohttp +import boto3 +import click +from click import Context +from pythonjsonlogger import jsonlogger +from rich.box import ROUNDED +from rich.console import Console +from rich.layout import Layout +from rich.live import Live +from rich.panel import Panel +from rich.progress import ( + BarColumn, + Progress, + SpinnerColumn, + TaskProgressColumn, + TextColumn, + TimeElapsedColumn, + TimeRemainingColumn, +) +from rich.table import Table +from rich.theme import Theme + +# Initialize Rich console with consistent theme +console = Console( + theme=Theme( + { + "info": "bold blue", + "warning": "bold yellow", + "error": "bold red", + "success": "bold green", + "highlight": "bold cyan", + "dim": "dim", + } + ) +) + +# Get the project root directory +SCRIPT_DIR = Path(__file__).resolve().parent +PROJECT_ROOT = SCRIPT_DIR.parent.parent +DEBUG_LOG = Path(os.getcwd()) / "debug.log" # Use caller's directory for debug log + + +def write_debug(message: str) -> None: + """Write debug information to debug.log in the caller's directory""" + try: + # 'a' mode is used to prevent multiple processes from truncating each other's output + with open(DEBUG_LOG, "a") as f: + f.write(f"{datetime.now().isoformat()} - {message}\n") + except Exception as e: + # If we can't write to the debug log, fall back to stderr + print(f"Failed to write to debug log: {str(e)}", file=sys.stderr) + + +# Truncate the debug log at startup +try: + DEBUG_LOG.write_text("") +except 
Exception as e: + print(f"Failed to truncate debug log: {str(e)}", file=sys.stderr) + +# Initialize global layout with consistent sizing +layout = Layout() +layout.split_column( + Layout(name="header", size=3, minimum_size=3), + Layout(name="body", ratio=2), + Layout(name="status", size=6, minimum_size=6), + Layout(name="progress", size=4, minimum_size=4), +) + +# Initialize global progress with more detailed columns and safer formatting +progress = Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(bar_width=None), + TaskProgressColumn(), + # Remove percentage for tasks that might not have a total + TimeElapsedColumn(), + TimeRemainingColumn(), + expand=True, + disable=False, +) + + +def safe_progress_update(task_id, **kwargs): + """Safely update progress without template errors""" + try: + if "total" in kwargs and kwargs["total"] is None: + # Don't show percentage for indeterminate progress + kwargs["visible"] = True + if "completed" in kwargs: + del kwargs["completed"] + progress.update(task_id, **kwargs) + except Exception as e: + # Fallback to basic progress display + try: + progress.update(task_id, description="Processing...", visible=True) + except: + pass # Suppress any errors in the fallback + + +progress_task = None +layout["progress"].update(progress) + +# Initialize global live display +live = Live(layout, refresh_per_second=4, auto_refresh=True) + + +def load_shell_env(env_path: Path) -> None: + """Load environment variables from a shell script""" + if not env_path.exists(): + console.print(f"[yellow]Warning: Environment file not found at {env_path}[/yellow]") + return + + content = env_path.read_text() + pattern = r'^(?:export\s+)?([A-Za-z_][A-Za-z0-9_]*)=["\']?([^"\'\n]*)["\']?$' + + for line in content.splitlines(): + line = line.strip() + if line and not line.startswith("#"): + match = re.match(pattern, line) + if match: + key, value = match.groups() + os.environ[key] = value + + +class RateLimiter: + """Rate limiter for AWS API calls""" + + def __init__(self, max_rate: float = 10, time_window: float = 1.0): + self.max_rate = max_rate + self.time_window = time_window + self.timestamps = deque(maxlen=max_rate) + self.lock = asyncio.Lock() + + async def wait(self): + """Wait until we can make another API call""" + async with self.lock: + now = time.time() + + # Remove old timestamps + while self.timestamps and now - self.timestamps[0] > self.time_window: + self.timestamps.popleft() + + if len(self.timestamps) >= self.max_rate: + # Calculate wait time + oldest = self.timestamps[0] + wait_time = self.time_window - (now - oldest) + if wait_time > 0: + await asyncio.sleep(wait_time) + now = time.time() + + self.timestamps.append(now) + + +def rate_limited(max_rate: float = 10, time_window: float = 1.0): + """Decorator to rate limit AWS API calls""" + + def decorator(func): + @wraps(func) + async def wrapper(self, *args, **kwargs): + await self.rate_limiter.wait() + return await func(self, *args, **kwargs) + + return wrapper + + return decorator + + +class SpotManager: + def __init__(self, debug: bool = False): + self.debug = debug + + # Initialize logging first + self._setup_logging() + + # Initialize AWS clients + self.region = os.getenv("AWS_REGION", "us-west-2") + self.ec2 = boto3.client("ec2", region_name=self.region) + self.ec2_resource = boto3.resource("ec2", region_name=self.region) + self._instance_type_cache = {} + + # Initialize rate limiter with conservative defaults + self.rate_limiter = RateLimiter(max_rate=8, 
time_window=1.0) # 8 requests per second + + # Initialize lifecycle tracking + self.lifecycle_events = {} # instance_id -> list of lifecycle events + self.lifecycle_lock = asyncio.Lock() # For thread-safe lifecycle updates + + # Health monitoring configuration + self.health_check_interval = 60 # Seconds between health checks + self.max_health_failures = 3 # Max consecutive failures before recovery + self.health_metrics = {} # instance_id -> health metrics + self.health_lock = asyncio.Lock() # For thread-safe health updates + self.monitoring_tasks = set() # Track active monitoring tasks + self.health_thresholds = { + "max_response_time": 5.0, # Seconds + "min_success_rate": 0.8, # 80% + "max_consecutive_failures": 3, + "max_error_rate": 0.2, # 20% + } + + # Error recovery configuration + self.max_retries = 5 # Max retries for recoverable errors + self.retry_delay = 1.0 # Initial retry delay in seconds + self.max_retry_delay = 30.0 # Max retry delay in seconds + self.recovery_actions = { + "InstanceLimitExceeded": self._handle_instance_limit_error, + "InsufficientInstanceCapacity": self._handle_capacity_error, + "SpotInstanceRequestLimitExceeded": self._handle_spot_limit_error, + "RequestLimitExceeded": self._handle_rate_limit_error, + "Unavailable": self._handle_service_unavailable, + "InternalError": self._handle_internal_error, + } + # Load environment variables + aws_env_path = PROJECT_ROOT / "aws" / "config" / "env.sh" + load_shell_env(aws_env_path) + + # Initialize scaling limits from environment or defaults + self.max_instances = int(os.getenv("MAX_INSTANCES", 1000)) + self.max_instances_per_launch = min( + int(os.getenv("MAX_INSTANCES_PER_LAUNCH", 100)), + 100, # AWS hard limit per launch request + ) + self.min_instances = int(os.getenv("MIN_INSTANCES", 1)) + self.max_total_vcpus = int(os.getenv("MAX_TOTAL_VCPUS", 10000)) + self.max_total_memory = int(os.getenv("MAX_TOTAL_MEMORY", 100000)) # In GB + + # Track resource usage + self.current_vcpus = 0 + self.current_memory = 0 # In GB + self.region = os.getenv("AWS_REGION", "us-west-2") + self.instance_type = os.getenv("INSTANCE_TYPE", "t3.micro") + self.key_name = os.getenv("KEY_NAME") + self.security_group_name = os.getenv("SECURITY_GROUP_NAME", "bacalhau-scale-test-sg") + self.configured_ami_id = os.getenv("CONFIGURED_AMI_ID") + + # Validate all configuration parameters + self._validate_configuration() + self._cleanup_tasks = set() # Track cleanup tasks + self._cleanup_lock = asyncio.Lock() # For thread-safe cleanup operations + self._created_resources = {"instances": set(), "security_groups": set(), "key_pairs": set()} + self.key_name = os.getenv("KEY_NAME") + self.security_group_name = os.getenv("SECURITY_GROUP_NAME", "bacalhau-scale-test-sg") + # Initialize tagging system + self.default_tags = { + "Name": os.getenv("INSTANCE_TAG_VALUE", "bacalhau-scale-test"), + "Project": "BacalhauScaleTest", + "Environment": "Test", + "ManagedBy": "SpotManager", + "CreationTime": datetime.now().isoformat(), + } + self.configured_ami_id = os.getenv("CONFIGURED_AMI_ID") + + # Instance state machine tracking + self.instance_states = {} # instance_id -> state info + self.state_lock = asyncio.Lock() # For thread-safe state updates + self.state_transitions = { + "pending": ["running", "terminated", "shutting-down"], + "running": ["stopping", "shutting-down", "terminated"], + "stopping": ["stopped", "terminated"], + "stopped": ["terminated", "pending"], + "shutting-down": ["terminated"], + "terminated": [], + } + + self.ec2 = boto3.client("ec2", 
region_name=self.region) + self.ec2_resource = boto3.resource("ec2", region_name=self.region) + self._instance_type_cache = {} + + # Use global progress task + global progress_task + self.progress_task = progress_task + + self._setup_logging() + self.log( + "info", + "SpotManager initialized", + region=self.region, + instance_type=self.instance_type, + security_group=self.security_group_name, + ) + + def _validate_configuration(self) -> None: + """Validate all configuration parameters""" + required_env_vars = [ + "AWS_REGION", + "KEY_NAME", + "SECURITY_GROUP_NAME", + "CONFIGURED_AMI_ID", + "INSTANCE_TYPE", + ] + + # Validate instance type + if not self.validate_instance_type(self.instance_type): + raise ValueError( + f"Instance type {self.instance_type} is not supported or available. " + f"Must support EBS optimization, HVM virtualization, and have at least 2 network interfaces" + ) + + missing_vars = [var for var in required_env_vars if not os.getenv(var)] + if missing_vars: + raise ValueError( + f"Missing required environment variables: {', '.join(missing_vars)}. " + f"Please check your aws/config/env.sh file" + ) + + # Validate instance type format + if not re.match(r"^[a-z0-9]+\.\w+$", self.instance_type): + raise ValueError( + f"Invalid instance type format: {self.instance_type}. " + f"Expected format like 't3.micro'" + ) + + # Validate AMI ID format + if not re.match(r"^ami-[0-9a-f]{17}$", self.configured_ami_id): + raise ValueError( + f"Invalid AMI ID format: {self.configured_ami_id}. " + f"Expected format like 'ami-0123456789abcdef0'" + ) + + # Validate security group name + if not re.match(r"^[a-zA-Z0-9_\-]{1,255}$", self.security_group_name): + raise ValueError( + f"Invalid security group name: {self.security_group_name}. " + f"Must be 1-255 alphanumeric characters, underscores or hyphens" + ) + + # Validate key pair name + if not re.match(r"^[a-zA-Z0-9_\-]{1,255}$", self.key_name): + raise ValueError( + f"Invalid key pair name: {self.key_name}. " + f"Must be 1-255 alphanumeric characters, underscores or hyphens" + ) + + # Validate region format + if not re.match(r"^[a-z]{2}-[a-z]+-\d+$", self.region): + raise ValueError( + f"Invalid AWS region format: {self.region}. Expected format like 'us-west-2'" + ) + + def _setup_logging(self): + """Setup logging to write to debug.log""" + self.logger = logging.getLogger("SpotManager") + self.logger.setLevel(logging.DEBUG if self.debug else logging.INFO) + + # Remove any existing handlers + self.logger.handlers = [] + + # Create file handler that writes to debug.log + file_handler = logging.FileHandler(DEBUG_LOG) + formatter = jsonlogger.JsonFormatter( + fmt="%(asctime)s %(levelname)s %(name)s %(message)s", + rename_fields={"asctime": "timestamp", "levelname": "level", "name": "logger"}, + ) + file_handler.setFormatter(formatter) + self.logger.addHandler(file_handler) + + def log(self, level: str, message: str, **kwargs): + """Log structured messages with additional context""" + log_method = getattr(self.logger, level, self.logger.info) + log_data = {"message": message, **kwargs} + log_method(log_data) + + def debug_log(self, message: str, **kwargs): + """Log debug messages to debug.log""" + if self.debug: + write_debug(message) + if kwargs: + write_debug(f"Additional context: {json.dumps(kwargs, indent=2)}") + + async def check_node_health( + self, ip_address: str, max_retries: int = 3, timeout: int = 5 + ) -> Dict[str, Any]: + """Check if a Bacalhau node is healthy by querying its API with retries + and collecting detailed metrics. 
+ + Args: + ip_address: IP address of node to check + max_retries: Maximum number of retry attempts + timeout: Timeout in seconds for each attempt + + Returns: + Dict containing health status and metrics: + { + "healthy": bool, + "response_time": float, # In seconds + "status_code": int, + "error": Optional[str], + "timestamp": str + } + """ + url = f"http://{ip_address}:1234" + retry_delay = 1 # Start with 1 second delay + start_time = time.time() + + for attempt in range(max_retries): + try: + async with aiohttp.ClientSession() as session: + async with session.get(url, timeout=timeout) as response: + response_time = time.time() - start_time + + # Collect metrics + metrics = { + "healthy": response.status == 200, + "response_time": response_time, + "status_code": response.status, + "error": None, + "timestamp": datetime.now().isoformat(), + } + + if response.status != 200: + self.debug_log( + f"Health check attempt {attempt + 1} failed for {ip_address}: " + f"Status {response.status}" + ) + metrics["error"] = f"HTTP {response.status}" + + return metrics + except asyncio.TimeoutError: + error_msg = f"Health check attempt {attempt + 1} timed out for {ip_address}" + self.debug_log(error_msg) + metrics = { + "healthy": False, + "response_time": time.time() - start_time, + "status_code": None, + "error": "Timeout", + "timestamp": datetime.now().isoformat(), + } + except Exception as e: + error_msg = f"Health check attempt {attempt + 1} failed for {ip_address}: {str(e)}" + self.debug_log(error_msg) + metrics = { + "healthy": False, + "response_time": time.time() - start_time, + "status_code": None, + "error": str(e), + "timestamp": datetime.now().isoformat(), + } + + # Exponential backoff between retries + if attempt < max_retries - 1: + await asyncio.sleep(retry_delay) + retry_delay = min(retry_delay * 2, 10) # Cap at 10 seconds + + return metrics + + async def start_health_monitoring(self) -> None: + """Start continuous health monitoring for all instances""" + while True: + try: + # Get all running instances + instance_ids = await asyncio.get_event_loop().run_in_executor( + None, self.get_all_instance_ids + ) + + if not instance_ids: + await asyncio.sleep(self.health_check_interval) + continue + + # Get instance IPs + instance_ips = await asyncio.get_event_loop().run_in_executor( + None, lambda: self.get_instance_ips(instance_ids) + ) + + # Check health of all nodes + health_results = await self.check_all_nodes_health(instance_ips) + + # Update health metrics + async with self.health_lock: + for ip, metrics in health_results.items(): + instance_id = next( + ( + id + for id, ip_addr in zip(instance_ids, instance_ips) + if ip_addr == ip + ), + None, + ) + if instance_id: + if instance_id not in self.health_metrics: + self.health_metrics[instance_id] = { + "history": [], + "consecutive_failures": 0, + "last_healthy": None, + } + + # Update metrics + self.health_metrics[instance_id]["history"].append(metrics) + if not metrics["healthy"]: + self.health_metrics[instance_id]["consecutive_failures"] += 1 + else: + self.health_metrics[instance_id]["consecutive_failures"] = 0 + self.health_metrics[instance_id]["last_healthy"] = datetime.now() + + # Check if instance needs recovery + if ( + self.health_metrics[instance_id]["consecutive_failures"] + >= self.health_thresholds["max_consecutive_failures"] + ): + await self.recover_instance(instance_id) + + await asyncio.sleep(self.health_check_interval) + + except Exception as e: + self.log("error", "Health monitoring error", error=str(e)) + await 
asyncio.sleep(self.health_check_interval) + + async def recover_instance(self, instance_id: str) -> None: + """Recover an unhealthy instance""" + self.log("warning", "Recovering unhealthy instance", instance_id=instance_id) + + try: + # Get instance details + instance = await asyncio.get_event_loop().run_in_executor( + None, + lambda: self.ec2.describe_instances(InstanceIds=[instance_id])["Reservations"][0][ + "Instances" + ][0], + ) + + # Terminate the unhealthy instance + await asyncio.get_event_loop().run_in_executor( + None, lambda: self.ec2.terminate_instances(InstanceIds=[instance_id]) + ) + + # Wait for termination to complete + waiter = self.ec2.get_waiter("instance_terminated") + await asyncio.get_event_loop().run_in_executor( + None, + lambda: waiter.wait( + InstanceIds=[instance_id], WaiterConfig={"Delay": 5, "MaxAttempts": 40} + ), + ) + + # Launch replacement instance + await self.launch_instances(1) + + # Cleanup health metrics + async with self.health_lock: + if instance_id in self.health_metrics: + del self.health_metrics[instance_id] + + self.log("info", "Instance recovery completed", instance_id=instance_id) + + except Exception as e: + self.log("error", "Instance recovery failed", instance_id=instance_id, error=str(e)) + raise + + async def check_all_nodes_health( + self, instance_ips: List[str], progress=None, timeout: int = 10 + ) -> Dict[str, Dict]: + """Check health of all nodes in parallel with timeout and collect metrics + + Args: + instance_ips: List of IP addresses to check + progress: Progress tracker for UI updates + timeout: Maximum time to wait for all checks + + Returns: + Dict mapping IP addresses to health metrics: + { + "healthy": bool, + "response_time": float, + "status_code": Optional[int], + "error": Optional[str], + "timestamp": str + } + """ + try: + # Create tasks with individual timeouts + tasks = [ + asyncio.wait_for(self.check_node_health(ip), timeout=timeout) for ip in instance_ips + ] + + # Run all checks in parallel + results = await asyncio.gather(*tasks, return_exceptions=True) + + # Process results + health_status = {} + for ip, result in zip(instance_ips, results): + if isinstance(result, Exception): + self.debug_log(f"Health check failed for {ip}: {str(result)}") + health_status[ip] = False + else: + health_status[ip] = result + + return health_status + + except asyncio.TimeoutError: + self.debug_log(f"Health checks timed out after {timeout} seconds") + return {ip: False for ip in instance_ips} + except Exception as e: + self.debug_log(f"Error checking node health: {str(e)}") + return {ip: False for ip in instance_ips} + + def get_instance_ips(self, instance_ids: List[str]) -> List[str]: + """Get public IPs for a list of instance IDs""" + response = self.ec2.describe_instances(InstanceIds=instance_ids) + return [ + instance["PublicIpAddress"] + for reservation in response["Reservations"] + for instance in reservation["Instances"] + if "PublicIpAddress" in instance + ] + + def ensure_security_group(self) -> str: + """Ensure security group exists and has correct rules""" + # Validate security group configuration before proceeding + if not self.security_group_name: + raise ValueError("Security group name is not configured") + + self.debug_log("Checking for existing security group...") + + try: + response = self.ec2.describe_security_groups( + Filters=[{"Name": "group-name", "Values": [self.security_group_name]}] + ) + + if response["SecurityGroups"]: + group_id = response["SecurityGroups"][0]["GroupId"] + self.debug_log(f"Found existing 
security group: {group_id}") + else: + response = self.ec2.create_security_group( + GroupName=self.security_group_name, + Description="Security group for Bacalhau scale testing", + ) + group_id = response["GroupId"] + self._created_resources["security_groups"].add(group_id) + self.debug_log(f"Created new security group: {group_id}") + + self._update_security_group_rules(group_id) + return group_id + + except Exception as e: + console.print(f"[red]Error ensuring security group: {str(e)}[/red]") + self._cleanup_resources() + raise + + def _update_security_group_rules(self, group_id: str): + """Update security group rules""" + try: + existing_rules = self.ec2.describe_security_group_rules( + Filters=[{"Name": "group-id", "Values": [group_id]}] + ) + + for rule in existing_rules.get("SecurityGroupRules", []): + if not rule.get("IsEgress", True): + try: + self.ec2.revoke_security_group_ingress( + GroupId=group_id, + SecurityGroupRuleIds=[rule["SecurityGroupRuleId"]], + ) + except Exception as e: + self.debug_log(f"Error removing rule: {str(e)}") + + self.ec2.authorize_security_group_ingress( + GroupId=group_id, + IpPermissions=[ + { + "IpProtocol": "tcp", + "FromPort": 22, + "ToPort": 22, + "IpRanges": [{"CidrIp": "0.0.0.0/0"}], + }, + { + "IpProtocol": "tcp", + "FromPort": 4222, + "ToPort": 4222, + "IpRanges": [{"CidrIp": "0.0.0.0/0"}], + }, + { + "IpProtocol": "tcp", + "FromPort": 1234, + "ToPort": 1234, + "IpRanges": [{"CidrIp": "0.0.0.0/0"}], + }, + ], + ) + except Exception as e: + self.debug_log(f"Error updating security group rules: {str(e)}") + raise + + async def _cleanup_instances(self, instance_ids: List[str]) -> None: + """Async cleanup handler for instances with improved state validation""" + async with self._cleanup_lock: + try: + self.log("info", "Starting instance cleanup", instance_count=len(instance_ids)) + + # Get current instance states + response = await asyncio.get_event_loop().run_in_executor( + None, lambda: self.ec2.describe_instances(InstanceIds=instance_ids) + ) + + # Filter out instances that are already terminated or shutting-down + active_instances = [ + instance["InstanceId"] + for reservation in response["Reservations"] + for instance in reservation["Instances"] + if instance["State"]["Name"] not in ["terminated", "shutting-down"] + ] + + if not active_instances: + self.log("info", "No active instances to terminate") + return + + # Calculate resources to release + released_vcpus = 0 + released_memory = 0 + for reservation in response["Reservations"]: + for instance in reservation["Instances"]: + if instance["InstanceId"] in active_instances: + instance_type = instance["InstanceType"] + instance_info = self.get_instance_type_info(instance_type) + released_vcpus += int(instance_info["vcpus"]) + released_memory += float(instance_info["memory"].replace("GB", "")) + + # Terminate active instances + await asyncio.get_event_loop().run_in_executor( + None, lambda: self.ec2.terminate_instances(InstanceIds=active_instances) + ) + self.log("info", "Instance termination requested", instance_ids=active_instances) + + # Wait for termination with state validation + max_attempts = 40 + attempt = 0 + while attempt < max_attempts: + # Get current states + status_response = await asyncio.get_event_loop().run_in_executor( + None, lambda: self.ec2.describe_instances(InstanceIds=active_instances) + ) + + # Check if all instances are terminated + terminated_count = sum( + 1 + for reservation in status_response["Reservations"] + for instance in reservation["Instances"] + if 
instance["State"]["Name"] in ["terminated", "shutting-down"] + ) + + if terminated_count == len(active_instances): + break + + attempt += 1 + await asyncio.sleep(5) + + if attempt >= max_attempts: + self.log( + "warning", + "Timeout waiting for instances to terminate", + instance_ids=active_instances, + ) + raise RuntimeError( + f"Timeout waiting for instances to terminate. " + f"Current states: {', '.join(set(instance['State']['Name'] for reservation in status_response['Reservations'] for instance in reservation['Instances']))}" + ) + # Update resource tracking + self.current_vcpus = max(0, self.current_vcpus - released_vcpus) + self.current_memory = max(0, self.current_memory - released_memory) + + self.log( + "info", + "Instances terminated", + instance_ids=instance_ids, + released_vcpus=released_vcpus, + released_memory=f"{released_memory}GB", + remaining_vcpus=self.current_vcpus, + remaining_memory=f"{self.current_memory}GB", + ) + + except Exception as e: + self.log("error", "Error during instance cleanup", error=str(e)) + raise + + @rate_limited(max_rate=5, time_window=1.0) # 5 requests per second for state updates + async def update_instance_states(self, instance_ids: List[str]) -> Dict[str, Dict]: + """Update and return current states for given instance IDs with state machine validation + and detailed lifecycle tracking""" + async with self.state_lock: + try: + # Get both instance details and status checks in parallel + instance_response, status_response = await asyncio.gather( + asyncio.get_event_loop().run_in_executor( + None, lambda: self.ec2.describe_instances(InstanceIds=instance_ids) + ), + asyncio.get_event_loop().run_in_executor( + None, + lambda: self.ec2.describe_instance_status( + InstanceIds=instance_ids, IncludeAllInstances=True + ), + ), + ) + + # Create status mapping for quick lookup + status_map = { + status["InstanceId"]: status + for status in status_response.get("InstanceStatuses", []) + } + + # Update our state tracking + for reservation in instance_response["Reservations"]: + for instance in reservation["Instances"]: + instance_id = instance["InstanceId"] + new_state = instance["State"]["Name"] + + # Get current state if exists + current_state = self.instance_states.get(instance_id, {}).get( + "state", "pending" + ) + + # Validate state transition + if not self.validate_state_transition(instance_id, new_state): + self.log( + "warning", + "Invalid state transition attempted", + instance_id=instance_id, + current_state=current_state, + new_state=new_state, + ) + continue + + # Get system health status + system_status = "unknown" + instance_status = "unknown" + if instance_id in status_map: + status = status_map[instance_id] + system_status = status["SystemStatus"]["Status"] + instance_status = status["InstanceStatus"]["Status"] + + # Record lifecycle event + # Get instance type info + instance_info = self.get_instance_type_info(instance["InstanceType"]) + + lifecycle_event = { + "timestamp": datetime.now().isoformat(), + "state": new_state, + "system_status": system_status, + "instance_status": instance_status, + "ip": instance.get("PublicIpAddress", ""), + "type": instance["InstanceType"], + "details": { + "cpu": instance_info["vcpus"], + "memory": instance_info["memory"], + "network": instance_info["network"], + "ebs_optimized": instance_info["ebs_optimized"], + }, + } + + async with self.lifecycle_lock: + if instance_id not in self.lifecycle_events: + self.lifecycle_events[instance_id] = [] + self.lifecycle_events[instance_id].append(lifecycle_event) + + # 
Update state tracking + self.instance_states[instance_id] = { + "state": new_state, + "ip": instance.get("PublicIpAddress", ""), + "type": instance["InstanceType"], + "launch_time": instance["LaunchTime"].isoformat(), + "system_status": system_status, + "instance_status": instance_status, + "state_history": self.instance_states.get(instance_id, {}).get( + "state_history", [] + ) + + [{"state": new_state, "timestamp": datetime.now().isoformat()}], + "lifecycle_events": self.lifecycle_events.get(instance_id, []), + } + + return self.instance_states + + except Exception as e: + self.debug_log(f"Error updating instance states: {str(e)}") + raise + + @rate_limited(max_rate=5, time_window=1.0) # 5 requests per second for polling + async def poll_instance_status( + self, + instance_ids: List[str], + initial_delay: float = 1.0, + max_delay: float = 30.0, + progress_tracker: Progress = None, + task: int = None, + ) -> None: + """Poll EC2 for the status of instances until all are running with exponential backoff""" + global progress, progress_task, layout + progress_task = progress.add_task( + "Waiting for instances to start...", total=len(instance_ids) + ) + + current_delay = initial_delay + max_attempts = 60 + attempt = 0 + + while attempt < max_attempts: + try: + # Get current states + states = await self.update_instance_states(instance_ids) + + running_count = sum(1 for state in states.values() if state["state"] == "running") + + terminated_count = sum( + 1 + for state in states.values() + if state["state"] in ["terminated", "shutting-down"] + ) + + if terminated_count > 0: + raise RuntimeError( + f"{terminated_count} instances were terminated while waiting for startup. This usually indicates insufficient spot capacity." + ) + + if progress_tracker and task is not None: + progress_tracker.update( + task, + completed=running_count, + description=f"[yellow]Running: {running_count}/{len(instance_ids)}[/yellow]", + ) + live.refresh() + layout["progress"].update(progress) + + if running_count == len(instance_ids): + break + + # Exponential backoff with jitter + await asyncio.sleep(current_delay) + current_delay = min(max_delay, current_delay * 1.5) * (1 + random.random()) + attempt += 1 + + except self.ec2.exceptions.ClientError as e: + if "InvalidInstanceID.NotFound" in str(e): + raise RuntimeError( + "Some instances disappeared while waiting for startup. This usually indicates insufficient spot capacity." 
+ ) + # Retry on throttling errors + if "RequestLimitExceeded" in str(e): + await asyncio.sleep(current_delay) + current_delay = min(max_delay, current_delay * 1.5) * (1 + random.random()) + attempt += 1 + continue + raise + except Exception as e: + # General error handling with retry + self.debug_log(f"Polling error: {str(e)}") + await asyncio.sleep(current_delay) + current_delay = min(max_delay, current_delay * 1.5) * (1 + random.random()) + attempt += 1 + continue + + if attempt >= max_attempts: + if progress_tracker and task is not None: + progress_tracker.update( + task, + description="[red]Timeout waiting for instances[/red]", + completed=running_count, + ) + live.refresh() + raise RuntimeError( + f"Timeout waiting for instances to start after {max_attempts} attempts" + ) + + def list_instances(self, filters: List[Dict] = None) -> None: + """List all running instances with optional filters""" + try: + # Default filter for managed instances + default_filters = [ + { + "Name": "tag:ManagedBy", + "Values": ["SpotManager"], + }, + {"Name": "instance-state-name", "Values": ["pending", "running"]}, + ] + + # Merge with any additional filters + if filters: + default_filters.extend(filters) + + response = self.ec2.describe_instances(Filters=default_filters) + + instances = [ + instance + for reservation in response["Reservations"] + for instance in reservation["Instances"] + ] + + # Update layout with consistent styling + layout["header"].update( + Panel( + "[bold blue]Bacalhau Spot Manager[/bold blue]\n" + f"[dim]Listing {len(instances)} instances[/dim]", + style="white on #28464B", + border_style="blue", + ) + ) + + # Create and display the table + table = self.create_instance_table(instances) + layout["body"].update(Panel(table, border_style="blue")) + layout["status"].update( + Panel( + "[green]✓ Instance list loaded successfully[/green]", + border_style="green", + ) + ) + live.refresh() + + except Exception as e: + console.print(f"[red]Error listing instances: {str(e)}[/red]") + raise + + @rate_limited(max_rate=2, time_window=1.0) # 2 requests per second for terminations + async def terminate_instances(self, instance_ids: List[str], batch_size: int = 50) -> None: + """Terminate specified instances in batches + + Args: + instance_ids: List of instance IDs to terminate + batch_size: Number of instances per batch (default: 50) + """ + if not instance_ids: + return + + global progress, progress_task, layout + progress_task = progress.add_task("Terminating instances...", total=len(instance_ids)) + layout["progress"].update(progress) + + try: + # Process instances in batches + batches = (len(instance_ids) + batch_size - 1) // batch_size + terminated_count = 0 + + for batch_num in range(batches): + start_idx = batch_num * batch_size + end_idx = min((batch_num + 1) * batch_size, len(instance_ids)) + batch_ids = instance_ids[start_idx:end_idx] + + # Update progress + progress.update( + progress_task, + description=f"Terminating batch {batch_num + 1}/{batches} ({len(batch_ids)} instances)...", + completed=terminated_count, + ) + layout["progress"].update(progress) + live.refresh() + + try: + # Terminate batch + await asyncio.get_event_loop().run_in_executor( + None, lambda: self.ec2.terminate_instances(InstanceIds=batch_ids) + ) + + # Wait for termination with async waiter + waiter = self.ec2.get_waiter("instance_terminated") + await asyncio.get_event_loop().run_in_executor( + None, + lambda: waiter.wait( + InstanceIds=batch_ids, + WaiterConfig={"Delay": 5, "MaxAttempts": 40}, + ), + ) + + 
terminated_count += len(batch_ids) + progress.update(progress_task, completed=terminated_count) + layout["progress"].update(progress) + live.refresh() + + # Update status after each batch + layout["status"].update( + Panel( + f"[green]Terminated {terminated_count}/{len(instance_ids)} instances[/green]", + border_style="green", + ) + ) + + except Exception as e: + layout["status"].update( + Panel( + f"[red]Error terminating batch {batch_num + 1}: {str(e)}[/red]", + border_style="red", + ) + ) + raise + + # Wait briefly between batches to avoid rate limits + if batch_num < batches - 1: + await asyncio.sleep(2) + + progress.update( + progress_task, + description="All instances terminated", + completed=len(instance_ids), + ) + layout["progress"].update(progress) + + except Exception as e: + console.print(f"[red]Error terminating instances: {str(e)}[/red]") + raise + + @rate_limited(max_rate=2, time_window=1.0) # 2 requests per second for launches + async def launch_instances(self, count: int, batch_size: int = 50) -> List[str]: + """Launch specified number of spot instances in batches with cleanup handler + and automatic error recovery + + Args: + count: Total number of instances to launch + batch_size: Number of instances per batch (default: 50) + + Returns: + List of launched instance IDs + + Raises: + ValueError: If launch configuration is invalid + RuntimeError: If launch fails + """ + global progress, progress_task, layout, live + + # Create detailed progress tracking + launch_progress = Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(bar_width=None), + TaskProgressColumn(), + TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), + TimeElapsedColumn(), + TimeRemainingColumn(), + expand=True, + ) + + # Create subtasks for each phase + main_task = launch_progress.add_task("[cyan]Launching instances...", total=100) + phase1 = launch_progress.add_task("[yellow]Validating configuration...", total=1) + phase2 = launch_progress.add_task("[yellow]Requesting instances...", total=1) + phase3 = launch_progress.add_task("[yellow]Waiting for instances...", total=1) + + # Update layout with detailed progress + layout["progress"].update( + Panel(launch_progress, title="Launch Progress", border_style="blue") + ) + live.refresh() + + # Update progress for validation phase + launch_progress.update(phase1, description="[green]Validating configuration...") + live.refresh() + + # Validate launch configuration + if not self.configured_ami_id: + launch_progress.update( + phase1, description="[red]Error: AMI ID not configured[/red]", completed=1 + ) + live.refresh() + raise ValueError("AMI ID is not configured") + if not self.key_name: + raise ValueError("Key pair name is not configured") + if not self.instance_type: + raise ValueError("Instance type is not configured") + + # Validate instance count against various limits + if count < self.min_instances: + raise ValueError( + f"Cannot launch fewer than {self.min_instances} instances. Requested: {count}" + ) + + if count > self.max_instances_per_launch: + raise ValueError( + f"Cannot launch more than {self.max_instances_per_launch} instances at once. " + f"Requested: {count}" + ) + + # Check current instance count + current_count = len(self.get_all_instance_ids()) + if current_count + count > self.max_instances: + raise ValueError( + f"Cannot launch {count} instances. Would exceed max limit of {self.max_instances}. 
" + f"Current instances: {current_count}" + ) + + # Check resource limits + instance_info = self.get_instance_type_info(self.instance_type) + requested_vcpus = int(instance_info["vcpus"]) * count + requested_memory = float(instance_info["memory"].replace("GB", "")) * count + + if self.current_vcpus + requested_vcpus > self.max_total_vcpus: + raise ValueError( + f"Cannot launch {count} instances. Would exceed vCPU limit of {self.max_total_vcpus}. " + f"Current vCPUs: {self.current_vcpus}, Requested: {requested_vcpus}" + ) + + if self.current_memory + requested_memory > self.max_total_memory: + raise ValueError( + f"Cannot launch {count} instances. Would exceed memory limit of {self.max_total_memory}GB. " + f"Current memory: {self.current_memory}GB, Requested: {requested_memory}GB" + ) + + if not self.configured_ami_id: + raise ValueError("No AMI ID configured. Please run build-ami.sh first.") + + security_group_id = self.ensure_security_group() + startup_script_path = PROJECT_ROOT / "fleet" / "scripts" / "startup.sh" + + if not startup_script_path.exists(): + raise FileNotFoundError(f"Startup script not found at {startup_script_path}") + + retry_count = 0 + while retry_count < self.max_retries: + try: + # Update progress for request phase + launch_progress.update( + phase2, + description=f"[green]Requesting instances (attempt {retry_count + 1}/{self.max_retries})...", + ) + live.refresh() + + # Update resource tracking + self.current_vcpus += requested_vcpus + self.current_memory += requested_memory + + # Update main progress + launch_progress.update(main_task, completed=25) + + # Update progress for instance creation + launch_progress.update(phase2, description="[green]Creating instances...") + live.refresh() + + # Calculate number of batches needed + batches = (count + batch_size - 1) // batch_size + instance_ids = [] + + # Process batches sequentially with error handling + for batch_num in range(batches): + batch_count = min(batch_size, count - (batch_num * batch_size)) + + # Update progress for batch + launch_progress.update( + phase2, + description=f"[green]Processing batch {batch_num + 1}/{batches} ({batch_count} instances)[/green]", + ) + live.refresh() + + # Track batch resources + batch_vcpus = int(instance_info["vcpus"]) * batch_count + batch_memory = float(instance_info["memory"].replace("GB", "")) * batch_count + + # Try batch with retries + batch_retries = 3 + for attempt in range(batch_retries): + try: + # Run batch asynchronously + response = await asyncio.get_event_loop().run_in_executor( + None, + lambda: self.ec2.run_instances( + ImageId=self.configured_ami_id, + InstanceType=self.instance_type, + KeyName=self.key_name, + SecurityGroupIds=[security_group_id], + TagSpecifications=[ + { + "ResourceType": "instance", + "Tags": [ + {"Key": key, "Value": value} + for key, value in self.default_tags.items() + ] + + [ + { + "Key": "LaunchGroup", + "Value": f"scale-test-{datetime.now().strftime('%Y%m%d-%H%M%S')}", + } + ], + } + ], + IamInstanceProfile={"Name": "BacalhauScaleTestRole"}, + UserData=startup_script_path.read_text(), + InstanceMarketOptions={ + "MarketType": "spot", + "SpotOptions": { + "SpotInstanceType": "one-time", + "InstanceInterruptionBehavior": "terminate", + }, + }, + MinCount=batch_count, + MaxCount=batch_count, + ), + ) + + # Collect instance IDs + batch_ids = [i["InstanceId"] for i in response["Instances"]] + instance_ids.extend(batch_ids) + + # Update resource tracking + self.current_vcpus += batch_vcpus + self.current_memory += batch_memory + + # Wait 
briefly between batches to avoid rate limits + if batch_num < batches - 1: + await asyncio.sleep(2) + + break # Success - exit retry loop + + except Exception as e: + # Handle batch failure + self.log( + "warning", + "Batch launch failed", + batch_num=batch_num, + attempt=attempt + 1, + error=str(e), + ) + + # Rollback resource tracking + self.current_vcpus -= batch_vcpus + self.current_memory -= batch_memory + + if attempt == batch_retries - 1: + raise RuntimeError( + f"Failed to launch batch {batch_num + 1} after {batch_retries} attempts" + ) + + # Exponential backoff before retry + await asyncio.sleep(2**attempt) + + self._created_resources["instances"].update(instance_ids) + + # Update progress + launch_progress.update( + phase2, + description=f"[green]Created {len(instance_ids)} instances[/green]", + completed=1, + ) + launch_progress.update(main_task, completed=50) + live.refresh() + + # Create cleanup task + cleanup_task = asyncio.create_task(self._cleanup_instances(instance_ids)) + self._cleanup_tasks.add(cleanup_task) + cleanup_task.add_done_callback(self._cleanup_tasks.discard) + + try: + # Update progress for waiting phase + launch_progress.update( + phase3, + description="[yellow]Waiting for instances to start...", + total=len(instance_ids), + ) + live.refresh() + + # Poll for instance status with progress updates + await self.poll_instance_status( + instance_ids, progress=launch_progress, task=phase3 + ) + + # Update main progress + launch_progress.update(main_task, completed=100) + launch_progress.update( + phase3, + description="[green]All instances running[/green]", + completed=len(instance_ids), + ) + live.refresh() + + return instance_ids + except Exception as e: + # If launch fails, ensure cleanup + await cleanup_task + raise + + except Exception as e: + # Handle recoverable errors + error_code = getattr(e, "response", {}).get("Error", {}).get("Code", "") + if error_code in self.recovery_actions: + recovery_action = self.recovery_actions[error_code] + await recovery_action(e, count, batch_size) + + # Exponential backoff before retry + await asyncio.sleep(self.retry_delay) + self.retry_delay = min(self.retry_delay * 2, self.max_retry_delay) + retry_count += 1 + continue + + # Non-recoverable error + console.print(f"[red]Error launching instances: {str(e)}[/red]") + raise + + def get_all_instance_ids(self, filters: List[Dict] = None) -> List[str]: + """Get all running instance IDs with optional filters""" + try: + # Default filter for managed instances + default_filters = [ + { + "Name": "tag:ManagedBy", + "Values": ["SpotManager"], + }, + {"Name": "instance-state-name", "Values": ["pending", "running"]}, + ] + + # Merge with any additional filters + if filters: + default_filters.extend(filters) + + response = self.ec2.describe_instances(Filters=default_filters) + + return [ + instance["InstanceId"] + for reservation in response["Reservations"] + for instance in reservation["Instances"] + ] + + except Exception as e: + console.print(f"[red]Error getting instance IDs: {str(e)}[/red]") + raise + + def wait_for_instances_running(self, instance_ids: List[str]) -> None: + """Wait for instances to be in running state""" + global progress, progress_task, layout + progress_task = progress.add_task("Waiting for instances...", total=len(instance_ids)) + layout["progress"].update(progress) + + max_attempts = 60 + attempt = 0 + + while attempt < max_attempts: + try: + response = self.ec2.describe_instances(InstanceIds=instance_ids) + running_count = sum( + 1 + for reservation in 
response["Reservations"] + for instance in reservation["Instances"] + if instance["State"]["Name"] == "running" + ) + terminated_count = sum( + 1 + for reservation in response["Reservations"] + for instance in reservation["Instances"] + if instance["State"]["Name"] in ["terminated", "shutting-down"] + ) + + if terminated_count > 0: + raise RuntimeError( + f"{terminated_count} instances were terminated while waiting for startup. This usually indicates insufficient spot capacity." + ) + + progress.update( + progress_task, + completed=running_count, + description=f"Running: {running_count}/{len(instance_ids)}", + ) + layout["progress"].update(progress) + + if running_count == len(instance_ids): + break + + time.sleep(5) + attempt += 1 + + except self.ec2.exceptions.ClientError as e: + if "InvalidInstanceID.NotFound" in str(e): + raise RuntimeError( + "Some instances disappeared while waiting for startup. This usually indicates insufficient spot capacity." + ) + raise + + if attempt >= max_attempts: + raise RuntimeError( + f"Timeout waiting for instances to start. Current states: {', '.join(set(instance['State']['Name'] for reservation in response['Reservations'] for instance in reservation['Instances']))}" + ) + + def verify_bacalhau_access(self) -> None: + """Verify that we can access Bacalhau CLI and have correct permissions""" + try: + result = subprocess.run( + ["bacalhau", "node", "list", "--output", "json"], + capture_output=True, + text=True, + check=True, + ) + nodes = json.loads(result.stdout) + + if isinstance(nodes, dict): + if "nodes" in nodes: + nodes = nodes["nodes"] + elif "data" in nodes: + nodes = nodes["data"] + else: + raise ValueError(f"Unexpected response structure: {list(nodes.keys())}") + + if not isinstance(nodes, list): + raise ValueError(f"Unexpected nodes type: {type(nodes)}") + + self.debug_log(f"Successfully verified Bacalhau access. Found {len(nodes)} nodes.") + + except json.JSONDecodeError as e: + raise ValueError(f"Failed to parse Bacalhau node list output as JSON: {str(e)}") + except subprocess.CalledProcessError as e: + raise RuntimeError(f"Failed to run 'bacalhau node list'. Error: {e.stderr}") + except FileNotFoundError: + raise RuntimeError( + "Bacalhau CLI not found. Please install Bacalhau CLI and ensure it's in your PATH." 
+ ) + + async def check_bacalhau_node_status( + self, instance_id: str, max_retries: int = 3, timeout: int = 10 + ) -> Tuple[bool, str]: + """Check if a Bacalhau node is healthy using bacalhau node list with retries + + Args: + instance_id: Instance ID to check + max_retries: Maximum number of retry attempts + timeout: Timeout in seconds for each attempt + + Returns: + Tuple of (health status, status message) + """ + retry_delay = 1 # Start with 1 second delay + + for attempt in range(max_retries): + try: + # Run bacalhau node list command with timeout + result = await asyncio.wait_for( + asyncio.get_event_loop().run_in_executor( + None, + lambda: subprocess.run( + ["bacalhau", "node", "list", "--output", "json"], + capture_output=True, + text=True, + check=True, + ), + ), + timeout=timeout, + ) + + # Parse JSON output + try: + nodes = json.loads(result.stdout) + if self.debug: + self.debug_log(f"Node check response type: {type(nodes)}") + self.debug_log(f"Node check response count: {len(nodes)}") + + # Ensure nodes is a list + if not isinstance(nodes, list): + return False, f"Invalid response type: {type(nodes)}" + + # Iterate through each node to find a match + for node in nodes: + if not isinstance(node, dict): + continue + + # Extract node info + info = node.get("Info", {}) + if not isinstance(info, dict): + continue + + # Extract labels and public IP + labels = info.get("Labels", {}) + if not isinstance(labels, dict): + continue + + found_instance_id = labels.get("INSTANCE_ID", "") + if not found_instance_id: + continue + + # Check if the IP matches + if found_instance_id == instance_id: + # Extract connection state + connection_state = node.get("ConnectionState", {}) + if not isinstance(connection_state, dict): + return False, "Invalid connection state format" + + # Determine node health based on connection status + status = connection_state.get("Status", "UNKNOWN") + if status == "CONNECTED": + return True, "Connected" + else: + # Include last error if available + last_error = connection_state.get("LastError", "") + return False, f"Status: {status}" + ( + f" ({last_error})" if last_error else "" + ) + + return False, "" + + except json.JSONDecodeError as e: + if self.debug: + self.debug_log(f"Raw response: {result.stdout}") + return False, f"Invalid JSON response: {str(e)}" + + except subprocess.CalledProcessError as e: + error_msg = e.stderr.decode() if isinstance(e.stderr, bytes) else e.stderr + self.debug_log(f"Error running bacalhau node list: {error_msg}") + if attempt < max_retries - 1: + await asyncio.sleep(retry_delay) + retry_delay = min(retry_delay * 2, 10) # Cap at 10 seconds + continue + return False, f"Command failed: {error_msg}" + except asyncio.TimeoutError: + self.debug_log(f"Node status check timed out for {instance_id}") + if attempt < max_retries - 1: + await asyncio.sleep(retry_delay) + retry_delay = min(retry_delay * 2, 10) # Cap at 10 seconds + continue + return False, "Timeout" + except Exception as e: + self.debug_log(f"Unexpected error checking node status: {str(e)}") + if attempt < max_retries - 1: + await asyncio.sleep(retry_delay) + retry_delay = min(retry_delay * 2, 10) # Cap at 10 seconds + continue + return False, str(e) + + async def check_all_nodes_status(self, instance_ids: List[str]) -> Dict[str, Tuple[bool, str]]: + """Check status of all Bacalhau nodes""" + global progress, progress_task + results = {} + for instance_id in instance_ids: + # Update the global progress task description + progress.update( + progress_task, description=f"Checking 
Bacalhau status of {instance_id}..." + ) + status, message = await self.check_bacalhau_node_status(instance_id) + results[instance_id] = (status, message) + return results + + def validate_instance_type(self, instance_type: str) -> bool: + """Validate if an instance type is supported and available""" + try: + # Check if instance type exists + response = self.ec2.describe_instance_types(InstanceTypes=[instance_type]) + if not response["InstanceTypes"]: + return False + + # Check instance type capabilities + instance_info = response["InstanceTypes"][0] + + # Must support HVM virtualization + if "hvm" not in instance_info.get("SupportedVirtualizationTypes", []): + return False + + # Must support at least 1 network interface + if instance_info.get("NetworkInfo", {}).get("MaximumNetworkInterfaces", 0) < 1: + return False + + return True + + except Exception as e: + self.debug_log(f"Error validating instance type: {str(e)}") + return False + + def get_instance_type_info(self, instance_type: str) -> Dict[str, str]: + """Get CPU and memory information for an instance type""" + if instance_type not in self._instance_type_cache: + try: + if not self.validate_instance_type(instance_type): + raise ValueError(f"Instance type {instance_type} is not supported") + + response = self.ec2.describe_instance_types(InstanceTypes=[instance_type]) + info = response["InstanceTypes"][0] + self._instance_type_cache[instance_type] = { + "vcpus": str(info["VCpuInfo"]["DefaultVCpus"]), + "memory": f"{info['MemoryInfo']['SizeInMiB'] / 1024:.1f}GB", + "type": instance_type, + "network": str(info["NetworkInfo"]["MaximumNetworkInterfaces"]), + "ebs_optimized": info["EbsInfo"]["EbsOptimizedSupport"] == "supported", + "supported_architectures": info["ProcessorInfo"]["SupportedArchitectures"], + } + except Exception as e: + self.debug_log(f"Error getting instance type info: {str(e)}") + self._instance_type_cache[instance_type] = { + "vcpus": "?", + "memory": "?", + "type": instance_type, + "network": "?", + "ebs_optimized": False, + "supported_architectures": [], + } + return self._instance_type_cache[instance_type] + + async def _handle_instance_limit_error(self, error, count, batch_size): + """Handle instance limit exceeded errors""" + self.log("warning", "Instance limit exceeded", error=str(error)) + + # Reduce requested count and retry + new_count = min(count, self.max_instances_per_launch // 2) + if new_count < self.min_instances: + raise RuntimeError("Cannot reduce instance count below minimum") + + self.log("info", f"Reducing instance count from {count} to {new_count}") + return await self.launch_instances(new_count, batch_size) + + async def _handle_capacity_error(self, error, count, batch_size): + """Handle insufficient capacity errors""" + self.log("warning", "Insufficient instance capacity", error=str(error)) + + # Try different instance type + alt_instance_type = self._get_alternative_instance_type() + if not alt_instance_type: + raise RuntimeError("No alternative instance types available") + + self.log("info", f"Trying alternative instance type: {alt_instance_type}") + original_type = self.instance_type + self.instance_type = alt_instance_type + try: + return await self.launch_instances(count, batch_size) + finally: + self.instance_type = original_type + + async def _handle_spot_limit_error(self, error, count, batch_size): + """Handle spot instance limit errors""" + self.log("warning", "Spot instance limit exceeded", error=str(error)) + + # Reduce batch size and retry + new_batch_size = max(1, batch_size // 2) + 
self.log("info", f"Reducing batch size from {batch_size} to {new_batch_size}") + return await self.launch_instances(count, new_batch_size) + + async def _handle_rate_limit_error(self, error, count, batch_size): + """Handle API rate limit errors""" + self.log("warning", "API rate limit exceeded", error=str(error)) + + # Wait and retry with reduced rate + self.rate_limiter.max_rate = max(1, self.rate_limiter.max_rate // 2) + self.log("info", f"Reduced API rate to {self.rate_limiter.max_rate} req/sec") + await asyncio.sleep(self.retry_delay) + return await self.launch_instances(count, batch_size) + + async def _handle_service_unavailable(self, error, count, batch_size): + """Handle service unavailable errors""" + self.log("warning", "Service unavailable", error=str(error)) + + # Wait and retry + await asyncio.sleep(self.retry_delay) + return await self.launch_instances(count, batch_size) + + async def _handle_internal_error(self, error, count, batch_size): + """Handle AWS internal errors""" + self.log("warning", "AWS internal error", error=str(error)) + + # Wait and retry + await asyncio.sleep(self.retry_delay) + return await self.launch_instances(count, batch_size) + + def _get_alternative_instance_type(self) -> Optional[str]: + """Get alternative instance type with similar specs""" + current_info = self.get_instance_type_info(self.instance_type) + alternatives = [ + "t3.micro", + "t3.small", + "t3.medium", # Burstable instances + "m5.large", + "m5.xlarge", # General purpose + "c5.large", + "c5.xlarge", # Compute optimized + ] + + # Try to find similar instance type + for alt_type in alternatives: + if alt_type == self.instance_type: + continue + + alt_info = self.get_instance_type_info(alt_type) + if ( + alt_info["vcpus"] >= current_info["vcpus"] + and alt_info["memory"] >= current_info["memory"] + ): + return alt_type + + return None + + def _cleanup_resources(self) -> None: + """Cleanup all created resources""" + self.log("info", "Starting resource cleanup") + + # Cleanup instances + if self._created_resources["instances"]: + try: + self.ec2.terminate_instances(InstanceIds=list(self._created_resources["instances"])) + self.log( + "info", + "Terminated instances", + instance_ids=list(self._created_resources["instances"]), + ) + except Exception as e: + self.log("error", "Error terminating instances", error=str(e)) + + # Cleanup security groups + if self._created_resources["security_groups"]: + for group_id in self._created_resources["security_groups"]: + try: + self.ec2.delete_security_group(GroupId=group_id) + self.log("info", "Deleted security group", group_id=group_id) + except Exception as e: + self.log( + "error", "Error deleting security group", group_id=group_id, error=str(e) + ) + + # Cleanup key pairs + if self._created_resources["key_pairs"]: + for key_name in self._created_resources["key_pairs"]: + try: + self.ec2.delete_key_pair(KeyName=key_name) + self.log("info", "Deleted key pair", key_name=key_name) + except Exception as e: + self.log("error", "Error deleting key pair", key_name=key_name, error=str(e)) + + self._created_resources = {"instances": set(), "security_groups": set(), "key_pairs": set()} + self.log("info", "Resource cleanup completed") + + async def _cleanup_all(self) -> None: + """Cleanup all resources""" + self.log("info", "Starting full cleanup") + + # Get all running instances + instance_ids = await asyncio.get_event_loop().run_in_executor( + None, self.get_all_instance_ids + ) + + # Cleanup instances + if instance_ids: + await 
self._cleanup_instances(instance_ids) + + # Cleanup other resources + self._cleanup_resources() + + self.log("info", "Full cleanup completed") + + async def run_stress_test( + self, + min_nodes: int = 250, + max_nodes: int = 750, + iterations: int = 10, + health_check_timeout: int = 300, + ) -> None: + """Run stress test with random node counts""" + # Validate stress test parameters + if min_nodes <= 0 or max_nodes <= 0: + raise ValueError("Node counts must be greater than 0") + + if min_nodes > max_nodes: + raise ValueError("min_nodes cannot be greater than max_nodes") + + if iterations <= 0: + raise ValueError("Iterations must be greater than 0") + + if health_check_timeout <= 0: + raise ValueError("Health check timeout must be greater than 0") + + # Check max nodes against instance limits + if max_nodes > self.max_instances: + raise ValueError( + f"max_nodes ({max_nodes}) exceeds maximum instance limit ({self.max_instances})" + ) + global progress, progress_task, layout, live + + # Initialize the status table with tighter column widths + status_table = Table( + show_header=True, + header_style="bold magenta", + title="Node Status", + title_style="bold blue", + expand=True, + ) + status_table.add_column("ID", style="cyan", no_wrap=True, width=20) + status_table.add_column("State", style="green", width=10) + status_table.add_column("CPU", style="yellow", justify="right", width=6) + status_table.add_column("Mem", style="yellow", justify="right", width=6) + status_table.add_column("InstType", style="yellow", width=10) + status_table.add_column("IP Address", style="blue", width=15) + status_table.add_column("🐟", style="cyan", width=4) + + # Start progress tracking + progress_task = progress.add_task("Waiting...", total=None) + + def update_layout_error(error_msg: str, details: str = ""): + """Helper to update layout in error state""" + current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + layout["header"].update( + Panel( + f"[bold blue]Bacalhau Scale Test[/bold blue]\n" + f"[red]Error State[/red]\n" + f"Time: {current_time}", + style="white on red", + ) + ) + layout["status"].update( + Panel( + f"[red bold]Error:[/red bold] {error_msg}", + style="red", + ) + ) + status_table.rows = [] + status_table.add_row("Error Type", "Failed", error_msg) + if details: + status_table.add_row("Details", "Info", details) + status_table.add_row( + "Recovery", "Action Required", "Please check AWS quotas and permissions" + ) + layout["body"].update(status_table) + + def cleanup_instances(): + """Helper to cleanup instances on exit""" + if "instance_ids" in locals(): + try: + progress.update(progress_task, description="Cleaning up instances...") + layout["progress"].update(progress) + status_table.add_row("Cleanup", "In Progress", "Terminating instances...") + layout["body"].update(status_table) + self.terminate_instances(instance_ids) + status_table.add_row( + "Cleanup", "[green]Complete[/green]", "All instances terminated" + ) + layout["body"].update(status_table) + except Exception as e: + status_table.add_row("Cleanup", "[red]Failed[/red]", f"Error: {str(e)}") + layout["body"].update(status_table) + + try: + # Register cleanup handler for Ctrl+C + loop = asyncio.get_event_loop() + loop.add_signal_handler(signal.SIGINT, lambda: asyncio.create_task(self._cleanup_all())) + + # Verify Bacalhau access before starting + try: + progress.update(progress_task, description="Verifying Bacalhau access...") + layout["progress"].update(progress) + self.verify_bacalhau_access() + except Exception as e: + raise 
RuntimeError(f"Bacalhau verification failed: {str(e)}") + + current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + layout["header"].update( + Panel( + f"[bold blue]Bacalhau Scale Test[/bold blue]\n" + f"Configuration: Nodes: {min_nodes}-{max_nodes} | Iterations: {iterations}\n" + f"Started: {current_time}", + style="white on #28464B", + ) + ) + live.refresh() + + try: + for iteration in range(iterations): + node_count = random.randint(min_nodes, max_nodes) + layout["status"].update( + Panel( + f"[bold]Iteration {iteration + 1}/{iterations}[/bold]\n" + f"Target Nodes: {node_count}\n" + f"Time: {datetime.now().strftime('%H:%M:%S')}", + style="yellow", + ) + ) + + # Reset progress for new iteration + progress.update(progress_task, total=None, completed=0) + progress.update( + progress_task, + description=f"Launching {node_count} instances...", + ) + layout["progress"].update(progress) + live.refresh() + + try: + # Clear and initialize status table + status_table.rows = [] + layout["body"].update(status_table) + live.refresh() + + # Launch instances asynchronously + instance_ids = await self.launch_instances(node_count) + + # Update progress for instance startup + progress.update(progress_task, total=len(instance_ids), completed=0) + progress.update( + progress_task, + description="Waiting for instances to start...", + ) + layout["progress"].update(progress) + live.refresh() + + # Wait for instances to start and join Bacalhau + running_count = 0 + joined_count = 0 + start_time = time.time() + node_states = {} # Track node states and info + + while running_count < len(instance_ids): + # Check if we've exceeded the timeout + if time.time() - start_time > 30: + raise RuntimeError( + "Timeout waiting for nodes to start and join Bacalhau cluster" + ) + + # Get instance states from AWS + response = await asyncio.get_event_loop().run_in_executor( + None, + lambda: self.ec2.describe_instances(InstanceIds=instance_ids), + ) + running_count = 0 + instance_rows = [] + + # Update instance states + for reservation in response["Reservations"]: + for instance in reservation["Instances"]: + instance_id = instance["InstanceId"] + state = instance["State"]["Name"] + instance_type = instance["InstanceType"] + specs = self.get_instance_type_info(instance_type) + ip = instance.get("PublicIpAddress", "pending...") + + # Update node state tracking + if instance_id not in node_states: + node_states[instance_id] = { + "state": state, + "ip": ip, + "specs": specs, + "bacalhau_joined": False, + "bacalhau_status": "❓", # Default status + } + else: + node_states[instance_id]["state"] = state + if ip != "pending...": + node_states[instance_id]["ip"] = ip + + if state == "running": + running_count += 1 + + # Check Bacalhau status for running nodes + if running_count > 0: + running_ips = [ + state["ip"] + for state in node_states.values() + if state["state"] == "running" and state["ip"] != "pending..." + ] + node_results = await self.check_all_nodes_status(running_ips) + + # Debug log Bacalhau API query + if self.debug: + self.debug_log( + f"Queried Bacalhau API. 
Nodes present: {len(node_results)}" + ) + + # Update Bacalhau status + for instance_id, state in node_states.items(): + if state["ip"] in node_results: + is_healthy, message = node_results[state["ip"]] + if is_healthy: + state["bacalhau_joined"] = True + state["bacalhau_status"] = "✅" + else: + state["bacalhau_status"] = f"❌ ({message})" + + # Build table rows with sort key + for instance_id, state in node_states.items(): + status = ( + "[green]running[/green]" + if state["state"] == "running" + else f"[yellow]{state['state']}[/yellow]" + ) + + # Sort key: pending first, then running not joined, then running and joined + sort_key = ( + 0 + if state["state"] != "running" + else 1 + if not state["bacalhau_joined"] + else 2 + ) + + instance_rows.append( + ( + sort_key, + instance_id, + status, + f"{state['specs']['vcpus']}", + state["specs"]["memory"], + state["specs"]["type"], + state["ip"], + state["bacalhau_status"], # Add Bacalhau status + ) + ) + + # Sort and update table + instance_rows.sort(key=lambda x: x[0]) + status_table.rows = [] + for _, *row in instance_rows: + status_table.add_row(*row) + + # Update progress + joined_count = sum( + 1 for state in node_states.values() if state["bacalhau_joined"] + ) + progress.update( + progress_task, + completed=joined_count, + total=len(instance_ids), + description=f"Running: {running_count}, Joined: {joined_count}/{len(instance_ids)}", + ) + + # Update layout + layout["body"].update(status_table) + layout["progress"].update(progress) + live.refresh() + + # Check if all nodes are running and joined + if running_count == len(instance_ids): + if joined_count == len(instance_ids): + break + # If all running but not all joined, wait a bit longer + await asyncio.sleep(2) + else: + await asyncio.sleep(5) + + # Verify all nodes joined + if joined_count < len(instance_ids): + raise RuntimeError( + f"Timeout waiting for nodes to join Bacalhau cluster. " + f"Only {joined_count}/{len(instance_ids)} nodes joined." + ) + + # After all nodes are provisioned, continue to monitor Bacalhau status + while True: + # Check Bacalhau status for all running nodes + running_ips = [ + state["ip"] + for state in node_states.values() + if state["state"] == "running" and state["ip"] != "pending..." + ] + node_results = await self.check_all_nodes_status(running_ips) + + # Debug log Bacalhau API query + if self.debug: + self.debug_log( + f"Queried Bacalhau API. 
Nodes present: {len(node_results)}" + ) + + # Update Bacalhau status + for instance_id, state in node_states.items(): + if state["ip"] in node_results: + is_healthy, message = node_results[state["ip"]] + if is_healthy: + state["bacalhau_joined"] = True + state["bacalhau_status"] = "✅" + else: + state["bacalhau_status"] = f"❌ ({message})" + + # Build table rows with sort key + instance_rows = [] + for instance_id, state in node_states.items(): + status = ( + "[green]running[/green]" + if state["state"] == "running" + else f"[yellow]{state['state']}[/yellow]" + ) + + # Sort key: pending first, then running not joined, then running and joined + sort_key = ( + 0 + if state["state"] != "running" + else 1 + if not state["bacalhau_joined"] + else 2 + ) + + instance_rows.append( + ( + sort_key, + instance_id, + status, + f"{state['specs']['vcpus']}", + state["specs"]["memory"], + state["specs"]["type"], + state["ip"], + state["bacalhau_status"], # Add Bacalhau status + ) + ) + + # Sort and update table + instance_rows.sort(key=lambda x: x[0]) + status_table.rows = [] + for _, *row in instance_rows: + status_table.add_row(*row) + + # Update progress + joined_count = sum( + 1 for state in node_states.values() if state["bacalhau_joined"] + ) + progress.update( + progress_task, + completed=joined_count, + total=len(instance_ids), + description=f"Running: {running_count}, Joined: {joined_count}/{len(instance_ids)}", + ) + + # Update layout + layout["body"].update(status_table) + layout["progress"].update(progress) + live.refresh() + + # Check if all nodes are running and joined + if joined_count == len(instance_ids): + break + + await asyncio.sleep(5) + + except Exception as e: + error_msg = str(e) + if "MaxSpotInstanceCountExceeded" in error_msg: + details = ( + "AWS Spot Instance quota exceeded.\n" + "Please request a quota increase in AWS Console:\n" + "EC2 > Limits > Spot Instance Requests" + ) + else: + details = f"Error occurred during iteration {iteration + 1}" + update_layout_error(error_msg, details) + live.refresh() + break + + finally: + cleanup_instances() + live.refresh() + + if iteration < iterations - 1: + await asyncio.sleep(10) + + except KeyboardInterrupt: + update_layout_error("Test interrupted by user", "Cleaning up resources...") + live.refresh() + cleanup_instances() + live.refresh() + + except Exception as e: + update_layout_error(str(e)) + live.refresh() + + def create_instance_table(self, instances: List[Dict[str, Any]]) -> Table: + """Create and populate a table with detailed instance status information + including lifecycle events.""" + table = Table( + show_header=True, + header_style="bold magenta", + box=ROUNDED, + border_style="blue", + title="Instance Status", + title_style="bold blue", + caption="Last updated: " + datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + caption_style="dim", + ) + + # Add columns with adjusted widths + table.add_column("ID", style="cyan", width=10) + table.add_column("Type", style="green", width=10) + table.add_column("State", style="yellow", width=8) + table.add_column("Zone", style="blue", width=12) + table.add_column("DNS", style="white", width=50) # Wide enough for full DNS + table.add_column("Age", style="green", width=8) + + for instance in instances: + # Get basic instance info + instance_id = instance.get("InstanceId", "N/A") + instance_type = instance.get("InstanceType", "N/A") + state = instance.get("State", {}).get("Name", "N/A") + zone = instance.get("Placement", {}).get("AvailabilityZone", "N/A") + public_dns = 
instance.get("PublicDnsName", "N/A") + + # Calculate instance age + launch_time = instance.get("LaunchTime") + age = "N/A" + if launch_time: + age_delta = datetime.now(launch_time.tzinfo) - launch_time + hours = age_delta.total_seconds() / 3600 + if hours < 24: + age = f"{hours:.1f}h" + else: + age = f"{hours / 24:.1f}d" + + table.add_row( + instance_id, + instance_type, + state, + zone, + public_dns, + age, + ) + + return table + + def terminate_all(self, ctx: Context) -> None: + """Terminate all running instances""" + if ctx is None: + raise ValueError("Context cannot be None") + + global progress, progress_task, layout, live + + # Get instance IDs first + instance_ids = self.get_all_instance_ids() + if not instance_ids: + layout["header"].update(Panel("[bold blue]Bacalhau Spot Manager[/bold blue]")) + layout["body"].update(Panel("[yellow]No instances found[/yellow]")) + layout["status"].update(Panel("[yellow]No instances to terminate[/yellow]")) + layout["progress"].update(progress) + return + + try: + # Create main termination task + progress_task = progress.add_task("Terminating instances...", total=len(instance_ids)) + layout["progress"].update(progress) + + # Update header + layout["header"].update( + Panel( + "[bold blue]Bacalhau Spot Manager[/bold blue]\n" + f"[dim]Terminating {len(instance_ids)} instances[/dim]", + style="white on #28464B", + ) + ) + + # Get instance details + response = self.ec2.describe_instances(InstanceIds=instance_ids) + instances = [ + instance + for reservation in response["Reservations"] + for instance in reservation["Instances"] + ] + + # Create and display initial table + table = self.create_instance_table(instances) + layout["body"].update(Panel(table, border_style="blue")) + layout["status"].update( + Panel( + f"[yellow]Terminating {len(instances)} instances...[/yellow]", + border_style="yellow", + ) + ) + + # Run the async termination in an event loop + asyncio.run(self.terminate_instances(instance_ids)) + + # Final update + layout["status"].update( + Panel( + "[green]✓ All instances terminated successfully[/green]", + border_style="green", + ) + ) + layout["body"].update( + Panel( + "[green]All instances have been terminated[/green]", + border_style="green", + ) + ) + + except Exception as e: + error_msg = f"Error during termination process: {str(e)}" + write_debug(error_msg) + layout["status"].update(Panel(f"[red]{error_msg}[/red]", border_style="red")) + layout["body"].update( + Panel("[red]Termination process failed[/red]", border_style="red") + ) + raise + + def get_running_instances(self): + """Get list of running instances""" + response = self.ec2.describe_instances( + Filters=[{"Name": "instance-state-name", "Values": ["running", "pending"]}] + ) + + instances = [] + for reservation in response["Reservations"]: + instances.extend(reservation["Instances"]) + return instances + + +@click.group() +@click.option( + "--debug/--no-debug", + default=False, + help="Enable debug logging (must be specified before command)", +) +@click.pass_context +def cli(ctx: Context, debug: bool) -> None: + """Manage AWS spot instances for Bacalhau scale testing + + Example usage: + ./spot-manager --debug launch --count 5 + ./spot-manager --debug stress-test + ./spot-manager --debug list + """ + global live + if ctx is None: + raise ValueError("Context cannot be None") + + ctx.obj = SpotManager(debug=debug) + # Start the live display before running commands + live.start() + + +@cli.result_callback() +def cleanup(ctx: Context, debug: bool) -> None: + """Cleanup after 
all commands are done""" + global live, progress, progress_task, layout + + try: + # Clear any progress display + if progress_task is not None: + try: + progress.update(progress_task, visible=False) + layout["progress"].update(progress) + except Exception as e: + write_debug(f"Error clearing progress: {str(e)}") + + # Clear the layout + try: + layout["header"].update("") + layout["body"].update("") + layout["status"].update("") + layout["progress"].update("") + except Exception as e: + write_debug(f"Error clearing layout: {str(e)}") + + # Stop the live display first + if live and live.is_started: + try: + live.stop() + except Exception as e: + write_debug(f"Error stopping live display: {str(e)}") + + # Then handle any remaining cleanup tasks + if ctx is not None and hasattr(ctx, "obj") and ctx.obj is not None: + manager = ctx.obj + if manager._cleanup_tasks: + try: + asyncio.run(asyncio.wait(manager._cleanup_tasks)) + except Exception as e: + write_debug(f"Error during task cleanup: {str(e)}") + + except Exception as e: + write_debug(f"Error during cleanup: {str(e)}") + + +@cli.command() +@click.option("--count", default=1, help="Number of instances to launch") +@click.pass_obj +def launch(manager: SpotManager, count: int): + """Launch spot instances""" + global progress, progress_task, layout, live + + # Update header with consistent styling + layout["header"].update( + Panel( + f"[bold blue]Bacalhau Spot Manager[/bold blue]\n[dim]Launching {count} instances[/dim]", + style="white on #28464B", + border_style="blue", + ) + ) + + progress_task = progress.add_task("Launching instances...", total=count) + layout["progress"].update(progress) + live.refresh() + + try: + instance_ids = asyncio.run(manager.launch_instances(count)) + manager.wait_for_instances_running(instance_ids) + + # Fetch instance details + response = manager.ec2.describe_instances(InstanceIds=instance_ids) + instances = [ + instance + for reservation in response["Reservations"] + for instance in reservation["Instances"] + ] + + # Create and display the table with consistent styling + table = manager.create_instance_table(instances) + layout["body"].update(Panel(table, border_style="blue")) + layout["status"].update( + Panel( + f"[green]✓ Successfully launched {len(instance_ids)} instances[/green]", + border_style="green", + ) + ) + layout["progress"].update(progress) + + # Show the layout + live.refresh() + time.sleep(2) # Give user time to see the final state + + except Exception as e: + layout["status"].update( + Panel( + f"[red]✗ Error launching instances: {str(e)}[/red]", + border_style="red", + ) + ) + live.refresh() + raise + + +@cli.command("list") +@click.option("--tag", multiple=True, help="Filter instances by tag (format: key=value)") +@click.pass_obj +def list_instances(manager: SpotManager, tag): + """List running instances with optional tag filtering""" + global progress, progress_task, layout, live + + try: + # Parse tag filters + filters = [] + if tag: + for t in tag: + if "=" in t: + key, value = t.split("=", 1) + filters.append({"Name": f"tag:{key}", "Values": [value]}) + else: + write_debug(f"Ignoring malformed tag filter: {t}") + console.print( + f"[yellow]Warning: Ignoring malformed tag filter '{t}' (expected key=value)[/yellow]" + ) + + response = manager.ec2.describe_instances(Filters=filters) + instances = [ + instance + for reservation in response["Reservations"] + for instance in reservation["Instances"] + ] + + # Check if no instances were found - print simple message and return early + if not 
instances: + console.print("[yellow]No instances found[/yellow]") + return + + # Create and update the table for instances that exist + progress_task = progress.add_task( + description="Listing instances...", total=None, visible=True + ) + layout["progress"].update(progress) + + try: + # Create and update the table + table = manager.create_instance_table(instances) + layout["body"].update(Panel(table, border_style="blue")) + layout["status"].update( + Panel(f"Found {len(instances)} running instances", style="green") + ) + safe_progress_update(progress_task, description="Complete", visible=False) + layout["progress"].update(progress) + + except Exception as e: + write_debug(f"Error in table creation/display: {str(e)}") + layout["body"].update(Panel("[red]Error creating table[/red]", border_style="red")) + layout["status"].update( + Panel("[red]Error displaying instance information[/red]", border_style="red") + ) + + except Exception as e: + # Ensure we catch and properly display any errors + error_msg = f"Error listing instances: {str(e)}" + write_debug(error_msg) + console.print(f"[red]{error_msg}[/red]") + raise + + +@cli.command() +@click.argument("instance-id") +@click.pass_obj +def terminate(manager: SpotManager, instance_id: str): + """Terminate a specific instance""" + global progress, progress_task, layout, live + progress_task = progress.add_task("Terminating instance...", total=1) + layout["progress"].update(progress) + + # Run the async termination in an event loop + asyncio.run(manager.terminate_instances([instance_id])) + + # Fetch instance details + response = manager.ec2.describe_instances(InstanceIds=[instance_id]) + instances = [ + instance + for reservation in response["Reservations"] + for instance in reservation["Instances"] + ] + + # Create and display the table + table = manager.create_instance_table(instances) + layout["body"].update(table) + layout["status"].update(Panel(f"[green]Successfully terminated instance {instance_id}[/green]")) + layout["progress"].update(progress) + + # Show the layout + live.refresh() + time.sleep(2) # Give user time to see the final state + + +@cli.command() +@click.pass_obj +def terminate_all(manager: SpotManager): + """Terminate all running instances""" + global progress, progress_task, layout, live + + progress_task = progress.add_task("Finding instances...", total=None) + layout["progress"].update(progress) + live.refresh() + + try: + instance_ids = manager.get_all_instance_ids() + if not instance_ids: + layout["header"].update( + Panel("[bold blue]Bacalhau Spot Manager[/bold blue]", border_style="blue") + ) + layout["body"].update( + Panel("[yellow]No instances found[/yellow]", border_style="yellow") + ) + layout["status"].update( + Panel("[yellow]No instances to terminate[/yellow]", border_style="yellow") + ) + layout["progress"].update(progress) + live.refresh() + return + + # Get initial instance details for comparison + initial_response = manager.ec2.describe_instances(InstanceIds=instance_ids) + initial_instances = [ + instance + for reservation in initial_response["Reservations"] + for instance in reservation["Instances"] + ] + + # Show current instances + table = manager.create_instance_table(initial_instances) + layout["header"].update( + Panel("[bold blue]Bacalhau Spot Manager[/bold blue]", border_style="blue") + ) + layout["body"].update(Panel(table, border_style="blue")) + layout["status"].update( + Panel( + f"[yellow]Terminating {len(initial_instances)} instances...[/yellow]", + border_style="yellow", + ) + ) + 
layout["progress"].update(progress) + live.refresh() + + # Run the async termination in an event loop + asyncio.run(manager.terminate_instances(instance_ids)) + + # Create summary table + summary_table = Table( + show_header=True, + header_style="bold magenta", + title="Termination Summary", + title_style="bold blue", + box=ROUNDED, + ) + summary_table.add_column("Instance ID", style="cyan") + summary_table.add_column("Type", style="green") + summary_table.add_column("Zone", style="blue") + summary_table.add_column("Launch Time", style="yellow") + summary_table.add_column("IP Address", style="white") + summary_table.add_column("State", style="red") + + # Add rows for each terminated instance + for instance in initial_instances: + summary_table.add_row( + instance.get("InstanceId", "N/A"), + instance.get("InstanceType", "N/A"), + instance.get("Placement", {}).get("AvailabilityZone", "N/A"), + instance.get("LaunchTime", "N/A").strftime("%Y-%m-%d %H:%M:%S") + if instance.get("LaunchTime") + else "N/A", + instance.get("PublicIpAddress", "N/A"), + "[red]Terminated[/red]", + ) + + # Update layout with summary + layout["header"].update( + Panel( + "[bold blue]Termination Complete[/bold blue]\n" + f"[green]Successfully terminated {len(initial_instances)} instances[/green]", + border_style="blue", + ) + ) + layout["body"].update(Panel(summary_table, border_style="blue")) + layout["status"].update( + Panel( + "[green]✓ All instances have been terminated[/green]\n" + f"Total instances terminated: {len(initial_instances)}", + border_style="green", + ) + ) + layout["progress"].update(progress) + live.refresh() + time.sleep(3) # Give user time to see the summary + + except Exception as e: + error_msg = f"Error during termination: {str(e)}" + write_debug(error_msg) + layout["status"].update(Panel(f"[red]{error_msg}[/red]", border_style="red")) + layout["body"].update(Panel("[red]Termination process failed[/red]", border_style="red")) + live.refresh() + raise + + +@cli.command() +@click.option("--min-nodes", default=250, help="Minimum number of nodes per iteration") +@click.option("--max-nodes", default=750, help="Maximum number of nodes per iteration") +@click.option("--iterations", default=10, help="Number of test iterations") +@click.option("--health-timeout", default=300, help="Timeout in seconds for health checks") +@click.pass_obj +def stress_test( + manager: SpotManager, + min_nodes: int, + max_nodes: int, + iterations: int, + health_timeout: int, +): + """Run stress test with random node counts + + Example usage: + ./spot-manager --debug stress-test --min-nodes 5 --max-nodes 10 + """ + asyncio.run( + manager.run_stress_test( + min_nodes=min_nodes, + max_nodes=max_nodes, + iterations=iterations, + health_check_timeout=health_timeout, + ) + ) + + +if __name__ == "__main__": + cli(obj={}) diff --git a/scale-tester/aws_spot/plan.md b/scale-tester/aws_spot/plan.md new file mode 100644 index 00000000..ca65d2a4 --- /dev/null +++ b/scale-tester/aws_spot/plan.md @@ -0,0 +1,27 @@ +# Spot Manager Improvement Plan + +1. Add proper error handling and retries for AWS API calls [DONE] +2. Implement exponential backoff for instance polling [DONE] +3. Add instance state tracking and validation [DONE] +4. Add comprehensive logging with structured format [DONE] +5. Implement proper async cleanup handlers [DONE] +6. Add validation for instance counts and limits [DONE] +7. Implement proper resource cleanup on errors [DONE] +8. Add proper instance tagging system [DONE] +9. Implement instance state machine [DONE] +10. 
Add configuration validation [DONE] +11. Add proper instance type validation [DONE] +12. Implement proper scaling limits and safeguards [DONE] +13. Improve UI presentation consistency [DONE] +14. Add progress tracking for large-scale operations [DONE] +15. Add batch processing for instance operations +16. Implement proper rate limiting for AWS API calls +17. Add health check timeouts and retries +18. Improve instance status reporting +19. Add proper instance lifecycle tracking +20. Implement proper error recovery mechanisms + +## First Steps to Execute + +1. Add proper error handling and retries for AWS API calls +2. Implement exponential backoff for instance polling diff --git a/scale-tester/aws_spot/pyproject.toml b/scale-tester/aws_spot/pyproject.toml new file mode 100644 index 00000000..e4d78133 --- /dev/null +++ b/scale-tester/aws_spot/pyproject.toml @@ -0,0 +1,27 @@ +[project] +name = "bacalhau-scale-tester" +version = "0.1.0" +description = "A tool for testing the scalability of Bacalhau nodes on AWS spot instances" +authors = [{ name = "Bacalhau Project", email = "info@bacalhau.org" }] +dependencies = [ + "rich>=13.7.0", + "boto3>=1.34.0", + "click>=8.1.7", + "aiohttp>=3.9.0", + "python-dotenv>=1.0.0", +] +requires-python = ">=3.10" +readme = "README.md" +license = { text = "Apache-2.0" } + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.ruff] +line-length = 100 +target-version = "py310" + +[tool.black] +line-length = 100 +target-version = ["py310"] diff --git a/scale-tester/aws_spot/requirements.txt b/scale-tester/aws_spot/requirements.txt new file mode 100644 index 00000000..353a4e4f --- /dev/null +++ b/scale-tester/aws_spot/requirements.txt @@ -0,0 +1,5 @@ +rich>=13.7.0 +boto3>=1.34.0 +click>=8.1.7 +aiohttp>=3.9.0 +python-dotenv>=1.0.0 \ No newline at end of file diff --git a/scale-tester/aws_spot/spot-instances.sh b/scale-tester/aws_spot/spot-instances.sh deleted file mode 100755 index 8e560623..00000000 --- a/scale-tester/aws_spot/spot-instances.sh +++ /dev/null @@ -1,332 +0,0 @@ -#!/usr/bin/env bash -# spot-instances.sh -# -# Script to launch and manage AWS spot instances -set -e - -source ./aws-spot-env.sh - -# Debug settings -DEBUG=false -DEBUG_FILE="/tmp/spot_instance_debug.txt" - -function debug() { - if [ "$DEBUG" = true ]; then - echo "[DEBUG] $(date '+%Y-%m-%d %H:%M:%S'): $1" >> "$DEBUG_FILE" - fi -} - -function list_instances() { - echo "Listing instances with tag '${INSTANCE_TAG_KEY}=${INSTANCE_TAG_VALUE}'..." - aws ec2 describe-instances \ - --region "$AWS_REGION" \ - --filters "Name=tag:${INSTANCE_TAG_KEY},Values=${INSTANCE_TAG_VALUE}" \ - "Name=instance-state-name,Values=pending,running" \ - --output json \ - --query 'Reservations[].Instances[].[InstanceId,InstanceType,State.Name,LaunchTime,PublicDnsName]' \ - | jq -r '.[] | @tsv' \ - | column -t -} - -function terminate_instance() { - local instance_id="$1" - echo "Terminating instance: $instance_id" - aws ec2 terminate-instances --output json \ - --region "$AWS_REGION" \ - --instance-ids "$instance_id" - - echo "Waiting for instance to terminate..." - while true; do - status=$(aws ec2 describe-instances \ - --region "$AWS_REGION" \ - --instance-ids "$instance_id" \ - --output json \ - --query 'Reservations[].Instances[].State.Name' | jq -r '.[0]') - - if [ "$status" = "terminated" ] || [ -z "$status" ]; then - break - fi - echo "Current status: $status" - sleep 5 - done - - echo "Instance $instance_id has been terminated." 
-} - -function terminate_all_instances() { - local instance_ids=$(aws ec2 describe-instances \ - --region "$AWS_REGION" \ - --filters "Name=tag:${INSTANCE_TAG_KEY},Values=${INSTANCE_TAG_VALUE}" \ - "Name=instance-state-name,Values=pending,running" \ - --output json \ - --query 'Reservations[].Instances[].InstanceId' \ - | jq -r '.[]') - - if [ -z "$instance_ids" ]; then - echo "No running instances found matching tag '${INSTANCE_TAG_KEY}=${INSTANCE_TAG_VALUE}'." - return - fi - - echo "The following instances will be terminated:" - list_instances - - read -p "Are you sure you want to terminate these instances? (y/N) " confirm - if [[ $confirm =~ ^[Yy]$ ]]; then - echo "Terminating instances..." - aws ec2 terminate-instances --output json \ - --region "$AWS_REGION" \ - --instance-ids $instance_ids - - echo "Waiting for all instances to terminate..." - while true; do - statuses=$(aws ec2 describe-instances \ - --region "$AWS_REGION" \ - --instance-ids $instance_ids \ - --output json \ - --query 'Reservations[].Instances[].State.Name' | jq -r '.[]') - - # Check if all instances are terminated - all_terminated=true - for status in $statuses; do - if [ "$status" != "terminated" ]; then - all_terminated=false - break - fi - done - - if [ "$all_terminated" = true ] || [ -z "$statuses" ]; then - break - fi - - echo "Current statuses: $statuses" - sleep 5 - done - - echo "All matching instances have been terminated." - else - echo "Operation cancelled." - fi -} - -function ensure_security_group() { - debug "Checking for existing security group..." - # First try to get existing group - local security_group_id="" - local result=$(aws ec2 describe-security-groups \ - --region "$AWS_REGION" \ - --filters "Name=group-name,Values=$SECURITY_GROUP_NAME" \ - --query 'SecurityGroups[0].GroupId' \ - --output text) - - debug "Query result: '$result'" - - if [ "$result" == "None" ] || [ -z "$result" ]; then - debug "Creating security group: $SECURITY_GROUP_NAME" - result=$(aws ec2 create-security-group \ - --region "$AWS_REGION" \ - --group-name "$SECURITY_GROUP_NAME" \ - --description "$SECURITY_GROUP_DESC" \ - --query 'GroupId' \ - --output text) - debug "Created security group ID: '$result'" - fi - - if [ -z "$result" ] || [ "$result" == "None" ]; then - debug "Error: Failed to get or create security group" - return 1 - fi - - security_group_id="$result" - debug "Using security group ID: '$security_group_id'" - - # Remove existing ingress rules - debug "Removing existing security group rules..." - local existing_rules=$(aws ec2 describe-security-group-rules \ - --region "$AWS_REGION" \ - --filters "Name=group-id,Values=$security_group_id" \ - --query 'SecurityGroupRules[?!IsEgress].SecurityGroupRuleId' \ - --output text) - - if [ ! 
-z "$existing_rules" ]; then - debug "Found existing rules: $existing_rules" - for rule_id in $existing_rules; do - debug "Revoking rule: $rule_id" - aws ec2 revoke-security-group-ingress \ - --region "$AWS_REGION" \ - --group-id "$security_group_id" \ - --security-group-rule-ids "$rule_id" > /dev/null || true - done - fi - - debug "Configuring security group rules for ID: $security_group_id" - # Add SSH access - aws ec2 authorize-security-group-ingress \ - --region "$AWS_REGION" \ - --group-id "$security_group_id" \ - --protocol tcp \ - --port 22 \ - --cidr "0.0.0.0/0" > /dev/null || { - debug "Failed to add SSH rule, might already exist" - } - - # Add port 4222 access - aws ec2 authorize-security-group-ingress \ - --region "$AWS_REGION" \ - --group-id "$security_group_id" \ - --protocol tcp \ - --port 4222 \ - --cidr "0.0.0.0/0" > /dev/null || { - debug "Failed to add port 4222 rule, might already exist" - } - - if [ -z "$security_group_id" ] || [ "$security_group_id" == "None" ]; then - debug "Error: Security group ID is empty or None" - return 1 - fi - - debug "Returning security group ID: '$security_group_id'" - echo "$security_group_id" -} - -function launch_instances() { - # Get security group ID - debug "Getting security group..." - local security_group_id - security_group_id=$(ensure_security_group) - debug "Received security group ID: '$security_group_id'" - - if [ -z "$security_group_id" ] || [ "$security_group_id" == "None" ]; then - echo "Error: Failed to get security group ID" - exit 1 - fi - - debug "Security group ID before launch: '$security_group_id'" - - # Use the configured AMI - fail if not available - local ami_id="$CONFIGURED_AMI_ID" - if [ -z "$ami_id" ] || [ "$ami_id" == "null" ]; then - echo "Error: No configured AMI found. Please run './build-ami.sh' first to create the AMI." - exit 1 - fi - - debug "Using configured AMI ID: $ami_id" - - # Launch a single instance - echo "Launching spot instance..." - debug "Command: aws ec2 run-instances with spot options" - - local aws_debug="" - if [ "$DEBUG" = true ]; then - aws_debug="--debug" - fi - - # Create a temporary file for error output - local error_file=$(mktemp) - - # Launch spot instance - local output - if ! output=$(aws ec2 run-instances \ - --region "$AWS_REGION" \ - --image-id "$ami_id" \ - --instance-type "$INSTANCE_TYPE" \ - --key-name "$KEY_NAME" \ - --security-group-ids "$security_group_id" \ - --tag-specifications "ResourceType=instance,Tags=[{Key=$INSTANCE_TAG_KEY,Value=$INSTANCE_TAG_VALUE}]" \ - --iam-instance-profile "Name=BacalhauScaleTestRole" \ - --user-data "file://scripts/startup.sh" \ - --instance-market-options '{"MarketType":"spot","SpotOptions":{"SpotInstanceType":"one-time","InstanceInterruptionBehavior":"terminate"}}' \ - --count "$SPOT_INSTANCE_COUNT" \ - --output json \ - $aws_debug 2>"$error_file"); then - - echo "Error launching spot instance:" - cat "$error_file" - rm "$error_file" - exit 1 - fi - - # Print launched instance details - local instance_ids=$(echo "$output" | jq -r '.Instances[].InstanceId') - echo "Successfully launched spot instances: $instance_ids" - - # Wait for instances to be running - echo "Waiting for instances to be running..." - while true; do - local statuses=$(aws ec2 describe-instances \ - --region "$AWS_REGION" \ - --instance-ids $instance_ids \ - --output json \ - --query 'Reservations[].Instances[].[InstanceId,State.Name]' | \ - jq -r '.[] | @tsv') - - echo "Instance statuses:" - echo "$statuses" | column -t - - if ! 
echo "$statuses" | grep -qE "pending|starting"; then - break - fi - sleep 5 - done - - echo "Spot instances are now running. Use './spot-instances.sh list' to see details." -} - -function show_usage() { - echo "Usage: $0 [COMMAND] [OPTIONS]" - echo "" - echo "Commands:" - echo " launch Launch new spot instances (count: $SPOT_INSTANCE_COUNT)" - echo " list List all running instances" - echo " terminate Terminate specific instance" - echo " terminate-all Terminate all instances" - echo " help Show this help message" - echo "" - echo "Options:" - echo " --debug Enable debug output to $DEBUG_FILE" -} - -# Main script logic -# Process command line arguments -COMMAND="" -for arg in "$@"; do - case "$arg" in - --debug) - DEBUG=true - # Clear debug file at start - > "$DEBUG_FILE" - ;; - *) - if [ -z "$COMMAND" ]; then - COMMAND="$arg" - fi - ;; - esac -done - -case "$COMMAND" in - launch) - launch_instances - ;; - list) - list_instances - ;; - terminate) - if [ -z "$2" ]; then - echo "Error: Instance ID required" - show_usage - exit 1 - fi - terminate_instance "$2" - ;; - terminate-all) - terminate_all_instances - ;; - help|--help|-h|"") - show_usage - ;; - *) - echo "Error: Unknown command '$1'" - show_usage - exit 1 - ;; -esac \ No newline at end of file diff --git a/scale-tester/aws_spot/spot/config/aws-spot-env.sh b/scale-tester/aws_spot/spot/config/aws-spot-env.sh new file mode 100644 index 00000000..6b1c69be --- /dev/null +++ b/scale-tester/aws_spot/spot/config/aws-spot-env.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +# aws-spot-env.sh +# +# This file sets environment variables used for launching +# 1,000 AWS Spot Instances with Docker installed. +# +# Usage: +# source ./aws-spot-env.sh + +# AWS CLI & Region +export AWS_REGION="us-west-2" + +# Key Pair +export KEY_NAME="BacalhauScaleTestKey" + +# Security Group +export SECURITY_GROUP_NAME="bacalhau-scale-test-group" +export SECURITY_GROUP_DESC="Security group for Bacalhau Scale Spot Instances" + +# Your public IP for SSH ingress (CIDR /32) +export MY_PUBLIC_IP=$(curl -s ifconfig.me) + +# Base AMI to use (Amazon Linux 2 example) +# aws ssm get-parameters --names /aws/service/ami-amazon-linux-latest/amzn2-ami-hvm-x86_64-gp2 --region us-east-1 +export BASE_AMI_ID="ami-07d9cf938edb0739b" +export CONFIGURED_AMI_ID="ami-03212e939b49e6f64" + +# Instance Type +export INSTANCE_TYPE="t3.micro" + +# Number of Spot Instances +export SPOT_INSTANCE_COUNT="100" + +# Custom AMI details (if building your own) +export CUSTOM_AMI_NAME="bacalhau-scale-test-ami" +export CUSTOM_AMI_DESCRIPTION="AMI with Docker and Bacalhau preinstalled" + +# Tags +export INSTANCE_TAG_KEY="Name" +export INSTANCE_TAG_VALUE="bacalhau-scale-test" + +echo "Environment variables for AWS Spot Instances set." diff --git a/scale-tester/aws_spot/spot/config/aws-spot-env.sh.example b/scale-tester/aws_spot/spot/config/aws-spot-env.sh.example new file mode 100644 index 00000000..02b66ad5 --- /dev/null +++ b/scale-tester/aws_spot/spot/config/aws-spot-env.sh.example @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +# aws-spot-env.sh +# +# Environment variables for Bacalhau scale testing on AWS spot instances. +# Copy this file to aws-spot-env.sh and edit with your settings. 
+ +# AWS CLI & Region +export AWS_REGION="us-west-2" + +# Key Pair (must exist in your AWS account) +export KEY_NAME="BacalhauScaleTestKey" + +# Security Group +export SECURITY_GROUP_NAME="bacalhau-scale-test-group" +export SECURITY_GROUP_DESC="Security group for Bacalhau Scale Spot Instances" + +# Instance Configuration +export INSTANCE_TYPE="t3.micro" +export SPOT_INSTANCE_COUNT="100" # Default count for non-stress-test launches + +# AMI Configuration +export CUSTOM_AMI_NAME="bacalhau-scale-test-ami" +export CUSTOM_AMI_DESCRIPTION="AMI with Docker and Bacalhau preinstalled" + +# The AMI ID will be populated by build-ami.sh +export CONFIGURED_AMI_ID="" + +# Instance Tags +export INSTANCE_TAG_KEY="Name" +export INSTANCE_TAG_VALUE="bacalhau-scale-test" + +echo "Environment variables for AWS Spot Instances loaded." \ No newline at end of file diff --git a/scale-tester/bacalhau-dind-compute-node/clean_up_nodes.py b/scale-tester/bacalhau-dind-compute-node/clean_up_nodes.py deleted file mode 100644 index ccb3cf23..00000000 --- a/scale-tester/bacalhau-dind-compute-node/clean_up_nodes.py +++ /dev/null @@ -1,125 +0,0 @@ -import argparse -import json -import os -import subprocess -import sys -import concurrent.futures -import threading - -import yaml - - -def get_nodes(api_host): - """Get list of all Bacalhau nodes.""" - try: - cmd = [ - "bacalhau", - "node", - "list", - "--output", - "json", - "-c", - f"API.Host={api_host}", - ] - - result = subprocess.run( - cmd, - capture_output=True, - text=True, - check=True, - ) - return json.loads(result.stdout) - except subprocess.CalledProcessError as e: - print(f"Error running bacalhau node list: {e}") - print(f"stdout: {e.stdout}") - print(f"stderr: {e.stderr}") - sys.exit(1) - except json.JSONDecodeError as e: - print(f"Error parsing JSON output: {e}") - sys.exit(1) - - -def delete_node(node_id, api_host, print_lock): - """Delete a specific node by ID.""" - try: - cmd = [ - "bacalhau", - "node", - "delete", - node_id, - "-c", - f"API.Host={api_host}", - ] - - result = subprocess.run(cmd, capture_output=True, check=True, text=True) - with print_lock: - print(f"Successfully deleted node: {node_id}") - return True - except subprocess.CalledProcessError as e: - with print_lock: - print(f"Failed to delete node {node_id}. 
Error: {e}") - return False - - -def main(): - parser = argparse.ArgumentParser(description="Delete disconnected Bacalhau nodes") - parser.add_argument("--api-host", help="API host to connect to") - parser.add_argument( - "--dry-run", - action="store_true", - help="Show what would be deleted without actually deleting", - ) - args = parser.parse_args() - - if not args.api_host: - print("API host is required") - sys.exit(1) - - print(f"\nConnecting to API host: {args.api_host}") - - # Get all nodes - nodes = get_nodes(args.api_host) - - # Filter disconnected compute nodes - disconnected_nodes = [ - node - for node in nodes - if ( - node["Connection"] == "DISCONNECTED" - and node["Info"]["NodeType"] == "Compute" - ) - ] - - if not disconnected_nodes: - print("No disconnected nodes found.") - return - - print(f"\nFound {len(disconnected_nodes)} disconnected node(s):") - for node in disconnected_nodes: - print(f" - {node['Info']['NodeID']}") - - if args.dry_run: - print("\nDry run - no nodes were deleted") - return - - print("\nDeleting nodes...") - deleted_count = 0 - print_lock = threading.Lock() - - with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor: - # Create a list to store the futures - future_to_node = { - executor.submit(delete_node, node["Info"]["NodeID"], args.api_host, print_lock): node - for node in disconnected_nodes - } - - # As each future completes, count the successful deletions - for future in concurrent.futures.as_completed(future_to_node): - if future.result(): - deleted_count += 1 - - print(f"\nDeleted {deleted_count} of {len(disconnected_nodes)} disconnected nodes") - - -if __name__ == "__main__": - main() diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/.cspell/custom-dictionary.txt b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/.cspell/custom-dictionary.txt index 17980d9a..a9484ec4 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/.cspell/custom-dictionary.txt +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/.cspell/custom-dictionary.txt @@ -16,6 +16,7 @@ keypair levelname NOPASSWD oneshot +puuid runcmd templatefile tfvars diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/all_locations.yaml b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/all_locations.yaml deleted file mode 100644 index d5d8f3f4..00000000 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/all_locations.yaml +++ /dev/null @@ -1,176 +0,0 @@ -# Auto-generated locations configuration -# Using Amazon Linux 2023 AMIs -- ap-south-2: - region: ap-south-2 - zone: ap-south-2a - instance_type: t3.medium - instance_ami: ami-0d1fc426c1b23bce0 - node_count: 1 -- ap-south-1: - region: ap-south-1 - zone: ap-south-1a - instance_type: t3.medium - instance_ami: ami-0fd05997b4dff7aac - node_count: 1 -- eu-south-1: - region: eu-south-1 - zone: eu-south-1a - instance_type: t3.medium - instance_ami: ami-0f529654669a607d1 - node_count: 1 -- eu-south-2: - region: eu-south-2 - zone: eu-south-2a - instance_type: t3.medium - instance_ami: ami-0d27757cc8327f88f - node_count: 1 -- me-central-1: - region: me-central-1 - zone: me-central-1a - instance_type: t3.medium - instance_ami: ami-0f334de647da2fc7d - node_count: 1 -- ca-central-1: - region: ca-central-1 - zone: ca-central-1a - instance_type: t3.medium - instance_ami: ami-0a590ca28046d073e - node_count: 1 -- eu-central-1: - region: eu-central-1 - zone: 
eu-central-1a - instance_type: t3.medium - instance_ami: ami-0e54671bdf3c8ed8d - node_count: 1 -- eu-central-2: - region: eu-central-2 - zone: eu-central-2a - instance_type: t3.medium - instance_ami: ami-001ae26aa9fa6e1e0 - node_count: 1 -- us-west-1: - region: us-west-1 - zone: us-west-1b - instance_type: t3.medium - instance_ami: ami-0aa117785d1c1bfe5 - node_count: 1 -- us-west-2: - region: us-west-2 - zone: us-west-2a - instance_type: t3.medium - instance_ami: ami-07d9cf938edb0739b - node_count: 1 -- af-south-1: - region: af-south-1 - zone: af-south-1a - instance_type: t3.medium - instance_ami: ami-09bb68fb3b90fe9f5 - node_count: 1 -- eu-north-1: - region: eu-north-1 - zone: eu-north-1a - instance_type: t3.medium - instance_ami: ami-02df5cb5ad97983ba - node_count: 1 -- eu-west-3: - region: eu-west-3 - zone: eu-west-3a - instance_type: t3.medium - instance_ami: ami-07dc1ccdcec3b4eab - node_count: 1 -- eu-west-2: - region: eu-west-2 - zone: eu-west-2a - instance_type: t3.medium - instance_ami: ami-019374baf467d6601 - node_count: 1 -- eu-west-1: - region: eu-west-1 - zone: eu-west-1a - instance_type: t3.medium - instance_ami: ami-0a094c309b87cc107 - node_count: 1 -- ap-northeast-3: - region: ap-northeast-3 - zone: ap-northeast-3a - instance_type: t3.medium - instance_ami: ami-0c8df088bd68958ff - node_count: 1 -- ap-northeast-2: - region: ap-northeast-2 - zone: ap-northeast-2a - instance_type: t3.medium - instance_ami: ami-049788618f07e189d - node_count: 1 -- me-south-1: - region: me-south-1 - zone: me-south-1a - instance_type: t3.medium - instance_ami: ami-064ca081dffe98dc2 - node_count: 1 -- ap-northeast-1: - region: ap-northeast-1 - zone: ap-northeast-1a - instance_type: t3.medium - instance_ami: ami-0ab02459752898a60 - node_count: 1 -- sa-east-1: - region: sa-east-1 - zone: sa-east-1a - instance_type: t3.medium - instance_ami: ami-03c4a8310002221c7 - node_count: 1 -- ap-east-1: - region: ap-east-1 - zone: ap-east-1a - instance_type: t3.medium - instance_ami: ami-0a7ea1800b4d7a034 - node_count: 1 -- ca-west-1: - region: ca-west-1 - zone: ca-west-1a - instance_type: t3.medium - instance_ami: ami-0ba32fa0d87c5d193 - node_count: 1 -- ap-southeast-1: - region: ap-southeast-1 - zone: ap-southeast-1a - instance_type: t3.medium - instance_ami: ami-0995922d49dc9a17d - node_count: 1 -- ap-southeast-2: - region: ap-southeast-2 - zone: ap-southeast-2a - instance_type: t3.medium - instance_ami: ami-0d6560f3176dc9ec0 - node_count: 1 -- ap-southeast-3: - region: ap-southeast-3 - zone: ap-southeast-3a - instance_type: t3.medium - instance_ami: ami-01ca3951ed2aa735e - node_count: 1 -- ap-southeast-4: - region: ap-southeast-4 - zone: ap-southeast-4a - instance_type: t3.medium - instance_ami: ami-069ddd2a970d5e293 - node_count: 1 -- us-east-1: - region: us-east-1 - zone: us-east-1a - instance_type: t3.medium - instance_ami: ami-01816d07b1128cd2d - node_count: 1 -- ap-southeast-5: - region: ap-southeast-5 - zone: ap-southeast-5a - instance_type: t3.medium - instance_ami: ami-0c4a807cb1a258810 - node_count: 1 -- us-east-2: - region: us-east-2 - zone: us-east-2a - instance_type: t3.medium - instance_ami: ami-0b4624933067d393a - node_count: 1 diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/deploy.py b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/deploy.py index 60bc4565..f7418a62 100755 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/deploy.py +++ 
b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/deploy.py @@ -13,8 +13,7 @@ import os import subprocess import sys -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, Optional, Tuple import yaml from rich import box @@ -154,20 +153,32 @@ def load_config() -> Dict[str, Any]: with open("locations.yaml", "r") as f: yaml_data = yaml.safe_load(f) if not isinstance(yaml_data, list): - raise ValueError("Expected a list of region configurations") + raise ValueError("Expected a list of zone configurations") - # Convert list of single-key dictionaries into a single dictionary + # Convert list of zone configurations into a dictionary config = {} - for region_dict in yaml_data: - if not isinstance(region_dict, dict): - raise ValueError("Each region configuration must be a dictionary") - if len(region_dict) != 1: + for zone_dict in yaml_data: + if not isinstance(zone_dict, dict): + raise ValueError("Each zone configuration must be a dictionary") + if len(zone_dict) != 1: raise ValueError( - "Each region configuration must have exactly one key" + "Each zone configuration must have exactly one key" ) - region = list(region_dict.keys())[0] - config[region] = region_dict[region] + zone_name = list(zone_dict.keys())[0] + zone_config = zone_dict[zone_name] + + # Create a unique key for this zone + zone_key = zone_name + + # Validate and set required fields + config[zone_key] = { + "instance_type": zone_config.get("instance_type"), + "instance_ami": zone_config.get("instance_ami"), + "node_count": zone_config.get("node_count", 1), + "region": zone_config.get("region"), + "zone": zone_config.get("zone", zone_name), + } # Validate the configuration validate_config(config) @@ -175,7 +186,7 @@ def load_config() -> Dict[str, Any]: except FileNotFoundError: print("Error: locations.yaml file not found") - print("Please create a locations.yaml file with your region configurations") + print("Please create a locations.yaml file with your zone configurations") sys.exit(1) except yaml.YAMLError as e: print(f"Error parsing locations.yaml: {e}") @@ -186,8 +197,8 @@ def load_config() -> Dict[str, Any]: sys.exit(1) -def update_machines_file(region: str, outputs: Dict[str, Any]) -> None: - """Update MACHINES.json with outputs from a region""" +def update_machines_file(region: str, zone: str, outputs: Dict[str, Any]) -> None: + """Update MACHINES.json with outputs from a region/zone""" machines_file = os.path.join( os.path.dirname(os.path.abspath(__file__)), "MACHINES.json" ) @@ -205,7 +216,7 @@ def update_machines_file(region: str, outputs: Dict[str, Any]) -> None: instance_ids = outputs.get("instance_ids", {}).get("value", []) # Log the raw values for debugging - logging.debug(f"Raw outputs for region {region}:") + logging.debug(f"Raw outputs for {region}/{zone}:") logging.debug(f"Public IPs: {public_ips}") logging.debug(f"Private IPs: {private_ips}") logging.debug(f"Instance IDs: {instance_ids}") @@ -253,7 +264,7 @@ def update_machines_file(region: str, outputs: Dict[str, Any]) -> None: else [] ) - # Create instances list for this region + # Create instances list for this zone instances = [] max_length = max(len(instance_ids), len(public_ips), len(private_ips)) @@ -263,18 +274,23 @@ def update_machines_file(region: str, outputs: Dict[str, Any]) -> None: "instance_id": instance_ids[i], "public_ip": public_ips[i] if i < len(public_ips) else None, "private_ip": private_ips[i] if i < len(private_ips) else None, + "zone": zone, } 
instances.append(instance) - # Update the region's data with the new structure - machines_data[region] = {"name": region, "instances": instances} + # Initialize region if it doesn't exist + if region not in machines_data: + machines_data[region] = {"name": region, "zones": {}} + + # Update the zone's data + machines_data[region]["zones"][zone] = {"name": zone, "instances": instances} # Write updated data back to file with open(machines_file, "w") as f: json.dump(machines_data, f, indent=2) logging.info( - f"Updated MACHINES.json with {len(instances)} instances for region {region}" + f"Updated MACHINES.json with {len(instances)} instances for {region}/{zone}" ) except Exception as e: logging.error(f"Error updating MACHINES.json: {str(e)}") @@ -303,10 +319,10 @@ def delete_machines_file() -> None: raise -def deploy(command, region, region_config): - """Deploys or destroys resources in a single region.""" +def deploy(command, zone, zone_config): + """Deploys or destroys resources in a single zone.""" terraform_command = "apply" if command == "create" else "destroy" - logging.info(f"Starting {command} operation for region {region}") + logging.info(f"Starting {command} operation for zone {zone}") # Get absolute path to env.tfvars.json workspace_dir = os.path.dirname(os.path.abspath(__file__)) @@ -318,23 +334,23 @@ def deploy(command, region, region_config): logging.error(f"Required file not found: {env_vars_file}") raise FileNotFoundError(f"Required file not found: {env_vars_file}") - logging.info(f"Region config: {json.dumps(region_config, indent=2)}") + logging.info(f"Zone config: {json.dumps(zone_config, indent=2)}") # For destroy command, get the current state before destroying destroyed_resources = {} if command == "destroy": try: - run_command(["terraform", "workspace", "select", "-or-create", region]) + run_command(["terraform", "workspace", "select", "-or-create", zone]) result = run_command(["terraform", "output", "-json"]) try: destroyed_resources = ( json.loads(result.stdout) if result.stdout.strip() else {} ) except json.JSONDecodeError: - logging.warning(f"Could not parse terraform output for region {region}") + logging.warning(f"Could not parse terraform output for zone {zone}") destroyed_resources = {} except Exception as e: - logging.warning(f"Could not get current state for region {region}: {e}") + logging.warning(f"Could not get current state for zone {zone}: {e}") # Even if we can't get the current state, we should still show what was in MACHINES.json destroyed_resources = {} @@ -346,30 +362,30 @@ def deploy(command, region, region_config): console=console, ) as progress: task = progress.add_task( - f"[cyan]{region}[/cyan] - {command.capitalize()}", total=3 + f"[cyan]{zone}[/cyan] - {command.capitalize()}", total=3 ) - # Select workspace for this region - logging.info(f"Selecting/creating workspace for region {region}") - run_command(["terraform", "workspace", "select", "-or-create", region]) + # Select workspace for this zone + logging.info(f"Selecting/creating workspace for zone {zone}") + run_command(["terraform", "workspace", "select", "-or-create", zone]) progress.update( - task, advance=1, description=f"[cyan]{region}[/cyan] - Initializing" + task, advance=1, description=f"[cyan]{zone}[/cyan] - Initializing" ) - logging.info(f"Running terraform init for region {region}") + logging.info(f"Running terraform init for zone {zone}") run_command(["terraform", "init", "-upgrade"]) progress.update( task, advance=1, - description=f"[cyan]{region}[/cyan] - 
{command.capitalize()}", + description=f"[cyan]{zone}[/cyan] - {command.capitalize()}", ) - logging.info(f"Running terraform {terraform_command} for region {region}") + logging.info(f"Running terraform {terraform_command} for zone {zone}") logging.info( - f"Command variables: region={region}, zone={region_config['zone']}, " - f"instance_ami={region_config['instance_ami']}, " - f"node_count={region_config['node_count']}, " - f"instance_type={region_config['instance_type']}" + f"Command variables: region={zone_config['region']}, zone={zone_config['zone']}, " + f"instance_ami={zone_config['instance_ami']}, " + f"node_count={zone_config['node_count']}, " + f"instance_type={zone_config['instance_type']}" ) try: logging.debug(f"Starting terraform {terraform_command}") @@ -378,11 +394,11 @@ def deploy(command, region, region_config): "terraform", terraform_command, "-auto-approve", - f"-var=region={region}", - f"-var=zone={region_config['zone']}", - f"-var=instance_ami={region_config['instance_ami']}", - f"-var=node_count={region_config['node_count']}", - f"-var=instance_type={region_config['instance_type']}", + f"-var=region={zone_config['region']}", + f"-var=zone={zone_config['zone']}", + f"-var=instance_ami={zone_config['instance_ami']}", + f"-var=node_count={zone_config['node_count']}", + f"-var=instance_type={zone_config['instance_type']}", f"-var-file={env_vars_file}", ] ) @@ -392,21 +408,19 @@ def deploy(command, region, region_config): if command == "create": outputs_result = run_command(["terraform", "output", "-json"]) outputs = json.loads(outputs_result.stdout) - update_machines_file(region, outputs) + update_machines_file(zone_config["region"], zone, outputs) logging.debug(f"Terraform {terraform_command} output:\n{result.stdout}") if result.stderr: logging.debug(f"Terraform {terraform_command} stderr:\n{result.stderr}") except Exception as e: - logging.error( - f"Error during {terraform_command} for region {region}: {str(e)}" - ) + logging.error(f"Error during {terraform_command} for zone {zone}: {str(e)}") raise progress.update( - task, advance=1, description=f"[cyan]{region}[/cyan] - ✓ Complete" + task, advance=1, description=f"[cyan]{zone}[/cyan] - ✓ Complete" ) - logging.info(f"Completed {command} operation for region {region}") + logging.info(f"Completed {command} operation for zone {zone}") return destroyed_resources if command == "destroy" else None @@ -512,11 +526,11 @@ def main(): console.print(f"\n[bold blue]Starting {command} operation...[/bold blue]\n") - # Deploy/destroy resources in each region sequentially - for region, region_config in config.items(): - result = deploy(command, region, region_config) + # Deploy/destroy resources in each zone sequentially + for zone, zone_config in config.items(): + result = deploy(command, zone, zone_config) if command == "destroy" and result: - destroyed_resources[region] = result + destroyed_resources[zone] = result # Display final summary console.clear() @@ -527,12 +541,15 @@ def main(): # and the actual destroyed resources console.print("[bold red]Resources Destroyed:[/bold red]\n") - for region in config.keys(): - console.print(f"[bold cyan]Region: {region}[/bold cyan]") + for zone in config.keys(): + region = config[zone]["region"] + console.print(f"[bold cyan]Zone: {zone} (Region: {region})[/bold cyan]") # Get data from both sources - saved_data = machines_data.get(region, {}) - destroyed_data = destroyed_resources.get(region, {}) + saved_data = ( + machines_data.get(region, {}).get("zones", {}).get(zone, {}) + ) + 
destroyed_data = destroyed_resources.get(zone, {}) # Display destroyed instances from MACHINES.json instances = saved_data.get("instances", []) @@ -585,9 +602,7 @@ def main(): # Only show "No resources" message if both sources are empty if not instances and not destroyed_data: - console.print( - " [dim]No resources were active in this region[/dim]" - ) + console.print(" [dim]No resources were active in this zone[/dim]") console.print() @@ -607,6 +622,9 @@ def main(): table.add_column( "Region", style="cyan", width=15, justify="left", no_wrap=True ) + table.add_column( + "Zone", style="green", width=15, justify="left", no_wrap=True + ) table.add_column( "Instance ID", style="yellow", width=25, justify="left", no_wrap=True ) @@ -618,17 +636,36 @@ def main(): ) # Add rows for each active instance from MACHINES.json - for region_data in machines_data.values(): - for instance in region_data["instances"]: - table.add_row( - region_data["name"], - instance["instance_id"], - instance["public_ip"] or "", - instance["private_ip"] or "", - ) + for region_name, region_data in machines_data.items(): + for zone_name, zone_data in region_data.get("zones", {}).items(): + for instance in zone_data.get("instances", []): + table.add_row( + region_name, + zone_name, + instance["instance_id"], + instance["public_ip"] or "", + instance["private_ip"] or "", + ) console.print(table) + # Print summary counts + total_instances = sum( + len(zone_data.get("instances", [])) + for region_data in machines_data.values() + for zone_data in region_data.get("zones", {}).values() + ) + total_regions = len(machines_data) + total_zones = sum( + len(region_data.get("zones", {})) + for region_data in machines_data.values() + ) + + console.print("\n[bold cyan]Summary:[/bold cyan]") + console.print(f"Total Regions: {total_regions}") + console.print(f"Total Zones: {total_zones}") + console.print(f"Total Instances: {total_instances}") + console.print("\n[bold green]Operation complete![/bold green]\n") except KeyboardInterrupt: diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/deploy_single.py b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/deploy_single.py new file mode 100755 index 00000000..b0affa7e --- /dev/null +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/deploy_single.py @@ -0,0 +1,642 @@ +#!/usr/bin/env -S uv run --script +# /// script +# requires-python = ">=3.11" +# dependencies = [ +# "pyyaml", +# "rich", +# ] +# /// + +import argparse +import json +import logging +import os +import subprocess +import sys +from typing import Any, Dict, Optional, Tuple + +import yaml +from rich import box +from rich.console import Console +from rich.progress import BarColumn, Progress, TimeRemainingColumn +from rich.table import Table + +# Set up argument parser before logging configuration +parser = argparse.ArgumentParser(description="Deploy or destroy infrastructure") +parser.add_argument("command", choices=["create", "destroy"], help="Action to perform") +parser.add_argument("--debug", action="store_true", help="Enable debug logging") + +args = parser.parse_args() + +# Set up logging with more detail +logging.basicConfig( + level=logging.DEBUG if args.debug else logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + handlers=[ + logging.FileHandler( + os.path.join(os.path.dirname(os.path.abspath(__file__)), "debug.log"), + mode="w", # Overwrite the file each run + ), + ], +) + +# Log the start of the script with a clear separator 
+logging.info("=" * 80) +logging.info("Starting new deployment operation") +if args.debug: + logging.info("Debug logging enabled") +logging.info("=" * 80) + +# Default configuration values +DEFAULTS = { + "instance_type": "t2.medium", + "node_count": 1, +} + +REQUIRED_FIELDS = { + "zone": str, + "instance_ami": str, +} + +console = Console() + + +def validate_config(config: Dict[str, Any]) -> None: + """Validate the configuration format and required fields""" + if not config: + raise ValueError("Empty configuration") + + for region, region_config in config.items(): + if not isinstance(region_config, dict): + raise ValueError(f"Invalid configuration for region {region}") + + # Check required fields + for field, field_type in REQUIRED_FIELDS.items(): + if field not in region_config: + raise ValueError( + f"Missing required field '{field}' for region {region}" + ) + if not isinstance(region_config[field], field_type): + raise ValueError( + f"Invalid type for field '{field}' in region {region}. " + f"Expected {field_type.__name__}" + ) + + # Apply defaults for optional fields + for field, default_value in DEFAULTS.items(): + if field not in region_config: + region_config[field] = default_value + + +def run_command( + cmd: list[str], cwd: Optional[str] = None +) -> subprocess.CompletedProcess: + """Run a command with proper error handling""" + try: + logging.debug(f"Executing command: {' '.join(cmd)}") + if cwd: + logging.debug(f"Working directory: {cwd}") + + # Get current environment + env = os.environ.copy() + + logging.debug("Starting command execution") + result = subprocess.run( + cmd, + check=True, + cwd=cwd, + capture_output=True, + text=True, + env=env, + ) + logging.debug("Command completed successfully") + if result.stdout: + logging.debug(f"Command stdout:\n{result.stdout}") + if result.stderr: + logging.debug(f"Command stderr:\n{result.stderr}") + return result + except subprocess.CalledProcessError as e: + error_msg = f"Command failed: {' '.join(cmd)}\n" + error_msg += f"Exit code: {e.returncode}\n" + if e.stdout: + error_msg += f"stdout:\n{e.stdout}\n" + if e.stderr: + error_msg += f"stderr:\n{e.stderr}\n" + logging.error(error_msg) + + # Print a user-friendly error message + console.print( + f"\n[red]Error: Command failed with exit code {e.returncode}[/red]" + ) + console.print(f"[red]Command: {' '.join(cmd)}[/red]") + if e.stderr: + console.print(f"[yellow]Error details:[/yellow]\n{e.stderr}") + + # Exit immediately on command failure + sys.exit(1) + except Exception as e: + error_msg = f"Unexpected error running command: {' '.join(cmd)}\n{str(e)}" + logging.error(error_msg) + + # Print a user-friendly error message + console.print("\n[red]Unexpected error:[/red]") + console.print(f"[red]Command: {' '.join(cmd)}[/red]") + console.print(f"[yellow]Error details:[/yellow]\n{str(e)}") + + # Exit immediately on any error + sys.exit(1) + + +def load_config() -> Dict[str, Any]: + """Load configuration from locations.yaml""" + try: + with open("locations.yaml", "r") as f: + yaml_data = yaml.safe_load(f) + if not isinstance(yaml_data, list): + raise ValueError("Expected a list of region configurations") + + # Convert list of single-key dictionaries into a single dictionary + config = {} + for region_dict in yaml_data: + if not isinstance(region_dict, dict): + raise ValueError("Each region configuration must be a dictionary") + if len(region_dict) != 1: + raise ValueError( + "Each region configuration must have exactly one key" + ) + + region = list(region_dict.keys())[0] + 
config[region] = region_dict[region] + + # Validate the configuration + validate_config(config) + return config + + except FileNotFoundError: + print("Error: locations.yaml file not found") + print("Please create a locations.yaml file with your region configurations") + sys.exit(1) + except yaml.YAMLError as e: + print(f"Error parsing locations.yaml: {e}") + print("Please ensure your YAML file is properly formatted") + sys.exit(1) + except ValueError as e: + print(f"Invalid configuration: {e}") + sys.exit(1) + + +def update_machines_file(region: str, outputs: Dict[str, Any]) -> None: + """Update MACHINES.json with outputs from a region""" + machines_file = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "MACHINES.json" + ) + + try: + if os.path.exists(machines_file): + with open(machines_file, "r") as f: + machines_data = json.load(f) + else: + machines_data = {} + + # Extract values from outputs, ensuring we get the raw values + public_ips = outputs.get("public_ips", {}).get("value", []) + private_ips = outputs.get("private_ips", {}).get("value", []) + instance_ids = outputs.get("instance_ids", {}).get("value", []) + + # Log the raw values for debugging + logging.debug(f"Raw outputs for region {region}:") + logging.debug(f"Public IPs: {public_ips}") + logging.debug(f"Private IPs: {private_ips}") + logging.debug(f"Instance IDs: {instance_ids}") + + # Handle nested lists (sometimes AWS returns nested arrays) + if ( + isinstance(public_ips, list) + and public_ips + and isinstance(public_ips[0], list) + ): + public_ips = public_ips[0] + if ( + isinstance(private_ips, list) + and private_ips + and isinstance(private_ips[0], list) + ): + private_ips = private_ips[0] + if ( + isinstance(instance_ids, list) + and instance_ids + and isinstance(instance_ids[0], list) + ): + instance_ids = instance_ids[0] + + # Ensure all lists are actually lists + public_ips = ( + public_ips + if isinstance(public_ips, list) + else [public_ips] + if public_ips + else [] + ) + private_ips = ( + private_ips + if isinstance(private_ips, list) + else [private_ips] + if private_ips + else [] + ) + instance_ids = ( + instance_ids + if isinstance(instance_ids, list) + else [instance_ids] + if instance_ids + else [] + ) + + # Create instances list for this region + instances = [] + max_length = max(len(instance_ids), len(public_ips), len(private_ips)) + + for i in range(max_length): + if i < len(instance_ids): # Only create instance if we have an ID + instance = { + "instance_id": instance_ids[i], + "public_ip": public_ips[i] if i < len(public_ips) else None, + "private_ip": private_ips[i] if i < len(private_ips) else None, + } + instances.append(instance) + + # Update the region's data with the new structure + machines_data[region] = {"name": region, "instances": instances} + + # Write updated data back to file + with open(machines_file, "w") as f: + json.dump(machines_data, f, indent=2) + + logging.info( + f"Updated MACHINES.json with {len(instances)} instances for region {region}" + ) + except Exception as e: + logging.error(f"Error updating MACHINES.json: {str(e)}") + raise + + +def check_machines_file() -> bool: + """Check if MACHINES.json exists and return True if it does""" + machines_file = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "MACHINES.json" + ) + return os.path.exists(machines_file) + + +def delete_machines_file() -> None: + """Delete MACHINES.json if it exists""" + machines_file = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "MACHINES.json" + ) + try: + if 
os.path.exists(machines_file): + os.remove(machines_file) + logging.info("Deleted MACHINES.json") + except Exception as e: + logging.error(f"Error deleting MACHINES.json: {str(e)}") + raise + + +def deploy(command, region, region_config): + """Deploys or destroys resources in a single region.""" + terraform_command = "apply" if command == "create" else "destroy" + logging.info(f"Starting {command} operation for region {region}") + + # Get absolute path to env.tfvars.json + workspace_dir = os.path.dirname(os.path.abspath(__file__)) + env_vars_file = os.path.join(workspace_dir, "env.tfvars.json") + logging.info(f"Using env vars file: {env_vars_file}") + + # Check if env.tfvars.json exists + if not os.path.exists(env_vars_file): + logging.error(f"Required file not found: {env_vars_file}") + raise FileNotFoundError(f"Required file not found: {env_vars_file}") + + logging.info(f"Region config: {json.dumps(region_config, indent=2)}") + + # For destroy command, get the current state before destroying + destroyed_resources = {} + if command == "destroy": + try: + run_command(["terraform", "workspace", "select", "-or-create", region]) + result = run_command(["terraform", "output", "-json"]) + try: + destroyed_resources = ( + json.loads(result.stdout) if result.stdout.strip() else {} + ) + except json.JSONDecodeError: + logging.warning(f"Could not parse terraform output for region {region}") + destroyed_resources = {} + except Exception as e: + logging.warning(f"Could not get current state for region {region}: {e}") + # Even if we can't get the current state, we should still show what was in MACHINES.json + destroyed_resources = {} + + with Progress( + "[progress.description]{task.description}", + BarColumn(), + "[progress.percentage]{task.percentage:>3.1f}%", + TimeRemainingColumn(), + console=console, + ) as progress: + task = progress.add_task( + f"[cyan]{region}[/cyan] - {command.capitalize()}", total=3 + ) + + # Select workspace for this region + logging.info(f"Selecting/creating workspace for region {region}") + run_command(["terraform", "workspace", "select", "-or-create", region]) + + progress.update( + task, advance=1, description=f"[cyan]{region}[/cyan] - Initializing" + ) + logging.info(f"Running terraform init for region {region}") + run_command(["terraform", "init", "-upgrade"]) + + progress.update( + task, + advance=1, + description=f"[cyan]{region}[/cyan] - {command.capitalize()}", + ) + logging.info(f"Running terraform {terraform_command} for region {region}") + logging.info( + f"Command variables: region={region}, zone={region_config['zone']}, " + f"instance_ami={region_config['instance_ami']}, " + f"node_count={region_config['node_count']}, " + f"instance_type={region_config['instance_type']}" + ) + try: + logging.debug(f"Starting terraform {terraform_command}") + result = run_command( + [ + "terraform", + terraform_command, + "-auto-approve", + f"-var=region={region}", + f"-var=zone={region_config['zone']}", + f"-var=instance_ami={region_config['instance_ami']}", + f"-var=node_count={region_config['node_count']}", + f"-var=instance_type={region_config['instance_type']}", + f"-var-file={env_vars_file}", + ] + ) + logging.info(f"Terraform {terraform_command} completed successfully") + + # After successful creation, update MACHINES.json + if command == "create": + outputs_result = run_command(["terraform", "output", "-json"]) + outputs = json.loads(outputs_result.stdout) + update_machines_file(region, outputs) + + logging.debug(f"Terraform {terraform_command} 
output:\n{result.stdout}") + if result.stderr: + logging.debug(f"Terraform {terraform_command} stderr:\n{result.stderr}") + except Exception as e: + logging.error( + f"Error during {terraform_command} for region {region}: {str(e)}" + ) + raise + + progress.update( + task, advance=1, description=f"[cyan]{region}[/cyan] - ✓ Complete" + ) + logging.info(f"Completed {command} operation for region {region}") + + return destroyed_resources if command == "destroy" else None + + +def validate_aws_credentials() -> Tuple[bool, str]: + """Validate AWS credentials are properly configured""" + logging.info("Validating AWS credentials...") + + try: + # Simply try to make an AWS API call + result = subprocess.run( + ["aws", "sts", "get-caller-identity"], + capture_output=True, + text=True, + check=True, + ) + identity = json.loads(result.stdout) + user_arn = identity.get("Arn", "Unknown") + account_id = identity.get("Account", "Unknown") + logging.info(f"AWS credentials valid - User: {user_arn}, Account: {account_id}") + return True, f"AWS credentials valid - Account: {account_id}" + except subprocess.CalledProcessError as e: + error_msg = "AWS credentials not found or invalid" + if e.stderr: + error_msg = f"AWS credential error: {e.stderr.strip()}" + logging.error(error_msg) + return False, error_msg + except Exception as e: + error_msg = f"Error validating AWS credentials: {str(e)}" + logging.error(error_msg) + return False, error_msg + + +def read_machines_file() -> Dict[str, Any]: + """Read and return the contents of MACHINES.json""" + machines_file = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "MACHINES.json" + ) + try: + if os.path.exists(machines_file): + with open(machines_file, "r") as f: + return json.load(f) + return {} + except Exception as e: + logging.error(f"Error reading MACHINES.json: {str(e)}") + raise + + +def main(): + try: + command = args.command + + # Check MACHINES.json status + if command == "create" and check_machines_file(): + console.print("\n[red]Error: MACHINES.json already exists[/red]") + console.print("This indicates that there might be existing infrastructure.") + console.print( + "Please run 'destroy' first or manually delete MACHINES.json if you're sure it's safe." + ) + sys.exit(1) + + # For destroy command, read the existing state before deleting + machines_data = {} + destroyed_resources = {} + if command == "destroy": + machines_data = read_machines_file() + delete_machines_file() + + # Validate AWS credentials before proceeding + credentials_valid, message = validate_aws_credentials() + if not credentials_valid: + console.print("\n[red]Error: AWS credentials are not valid[/red]") + console.print( + "Please configure your AWS credentials using one of these methods:" + ) + console.print("1. Set environment variables:") + console.print(" export AWS_ACCESS_KEY_ID='your-access-key'") + console.print(" export AWS_SECRET_ACCESS_KEY='your-secret-key'") + console.print("\n2. 
Or configure AWS CLI:") + console.print(" aws configure") + console.print("\nThen verify your credentials with:") + console.print(" aws sts get-caller-identity") + sys.exit(1) + else: + console.print(f"\n[green]{message}[/green]") + + # Get absolute path to env.tfvars.json + workspace_dir = os.path.dirname(os.path.abspath(__file__)) + env_vars_file = os.path.join(workspace_dir, "env.tfvars.json") + + # Check if env.tfvars.json exists before starting + if not os.path.exists(env_vars_file): + console.print( + f"\n[red]Error: Required file not found: {env_vars_file}[/red]" + ) + console.print( + "Please ensure env.tfvars.json exists in the same directory as deploy.py" + ) + sys.exit(1) + + # Load and validate configuration + config = load_config() + + console.print(f"\n[bold blue]Starting {command} operation...[/bold blue]\n") + + # Deploy/destroy resources in each region sequentially + for region, region_config in config.items(): + result = deploy(command, region, region_config) + if command == "destroy" and result: + destroyed_resources[region] = result + + # Display final summary + console.clear() + console.print("\n") + + if command == "destroy": + # Show summary of destroyed resources using both the saved machines_data + # and the actual destroyed resources + console.print("[bold red]Resources Destroyed:[/bold red]\n") + + for region in config.keys(): + console.print(f"[bold cyan]Region: {region}[/bold cyan]") + + # Get data from both sources + saved_data = machines_data.get(region, {}) + destroyed_data = destroyed_resources.get(region, {}) + + # Display destroyed instances from MACHINES.json + instances = saved_data.get("instances", []) + if instances: + console.print(" [yellow]Instances:[/yellow]") + for instance in instances: + console.print(f" [red]✗[/red] {instance['instance_id']}:") + console.print( + f" [dim]Public IP: {instance['public_ip']}[/dim]" + ) + console.print( + f" [dim]Private IP: {instance['private_ip']}[/dim]" + ) + + # Show VPCs and other AWS resources that were destroyed + if destroyed_data: + vpc_resources = [ + key for key in destroyed_data.keys() if "vpc" in key.lower() + ] + if vpc_resources: + console.print(" [yellow]VPC Resources:[/yellow]") + for resource in vpc_resources: + value = destroyed_data[resource].get("value") + if isinstance(value, list): + for v in value: + console.print(f" [red]✗[/red] {v}") + else: + console.print(f" [red]✗[/red] {value}") + + other_resources = [ + key + for key in destroyed_data.keys() + if key not in vpc_resources + and key + not in [ + "public_ips", + "private_ips", + "instance_ids", + ] + ] + if other_resources: + console.print(" [yellow]Other Resources:[/yellow]") + for resource in other_resources: + value = destroyed_data[resource].get("value") + if isinstance(value, list): + for v in value: + console.print(f" [red]✗[/red] {v}") + else: + console.print(f" [red]✗[/red] {value}") + + # Only show "No resources" message if both sources are empty + if not instances and not destroyed_data: + console.print( + " [dim]No resources were active in this region[/dim]" + ) + + console.print() + + else: + # Show active resources table using MACHINES.json + machines_data = read_machines_file() + + table = Table( + title="Active Deployments", + show_header=True, + header_style="bold", + padding=(0, 2), + box=box.DOUBLE, + ) + + # Columns for create operation + table.add_column( + "Region", style="cyan", width=15, justify="left", no_wrap=True + ) + table.add_column( + "Instance ID", style="yellow", width=25, justify="left", no_wrap=True + ) 
+ table.add_column( + "Public IP", style="blue", width=20, justify="left", no_wrap=True + ) + table.add_column( + "Private IP", style="magenta", width=20, justify="left", no_wrap=True + ) + + # Add rows for each active instance from MACHINES.json + for region_data in machines_data.values(): + for instance in region_data["instances"]: + table.add_row( + region_data["name"], + instance["instance_id"], + instance["public_ip"] or "", + instance["private_ip"] or "", + ) + + console.print(table) + + console.print("\n[bold green]Operation complete![/bold green]\n") + + except KeyboardInterrupt: + console.print("\n[yellow]Operation cancelled by user[/yellow]") + sys.exit(1) + except Exception as e: + console.print(f"\n[red]Unexpected error: {e}[/red]") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/locations.yaml b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/locations.yaml index 0c8511ae..f4ba1821 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/locations.yaml +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/locations.yaml @@ -1,39 +1,57 @@ # Because of the way AWS works, we cannot # deploy to all regions at once. Use the below list to name the # regions you want to deploy to. -- ap-south-1: - region: ap-south-1 - zone: ap-south-1a +- ap-south-2a: + instance_ami: ami-0a94a70b8a1454a4b instance_type: t3.medium - instance_ami: ami-0fd05997b4dff7aac node_count: 1 -- eu-south-1: - region: eu-south-1 - zone: eu-south-1a + region: ap-south-2 + zone: ap-south-2a +- ca-central-1d: + instance_ami: ami-0a474b3a85d51a5e5 instance_type: t3.medium - instance_ami: ami-0f529654669a607d1 - node_count: 2 -- eu-south-2: - region: eu-south-2 - zone: eu-south-2a + node_count: 1 + region: ca-central-1 + zone: ca-central-1d +- eu-west-1c: + instance_ami: ami-032a56ad5e480189c + instance_type: t3.medium + node_count: 1 + region: eu-west-1 + zone: eu-west-1c +- eu-west-3b: + instance_ami: ami-04a4acda26ca36de0 + instance_type: t3.medium + node_count: 1 + region: eu-west-3 + zone: eu-west-3b +- me-central-1b: + instance_ami: ami-07a33155d2d5abff0 instance_type: t3.medium - instance_ami: ami-0d27757cc8327f88f - node_count: 3 -- me-central-1: + node_count: 1 region: me-central-1 - zone: me-central-1a + zone: me-central-1b +- me-central-1c: + instance_ami: ami-07a33155d2d5abff0 instance_type: t3.medium - instance_ami: ami-0f334de647da2fc7d node_count: 1 -- ca-central-1: - region: ca-central-1 - zone: ca-central-1a + region: me-central-1 + zone: me-central-1c +- sa-east-1b: + instance_ami: ami-0780816dd7ce942fd + instance_type: t3.medium + node_count: 1 + region: sa-east-1 + zone: sa-east-1b +- us-east-2a: + instance_ami: ami-0884d2865dbe9de4b instance_type: t3.medium - instance_ami: ami-0a590ca28046d073e node_count: 1 -- eu-central-1: - region: eu-central-1 - zone: eu-central-1a + region: us-east-2 + zone: us-east-2a +- us-east-2b: + instance_ami: ami-0884d2865dbe9de4b instance_type: t3.medium - instance_ami: ami-0e54671bdf3c8ed8d node_count: 1 + region: us-east-2 + zone: us-east-2b diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/locations/all_locations.json b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/locations/all_locations.json new file mode 100644 index 00000000..18d0f2fe --- /dev/null +++ 
b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/locations/all_locations.json @@ -0,0 +1,821 @@ +[ + { + "af-south-1a": { + "region": "af-south-1", + "instance_type": "t3.medium", + "instance_ami": "ami-05577c8d07333d909", + "node_count": 1, + "zone": "af-south-1a" + } + }, + { + "af-south-1b": { + "region": "af-south-1", + "instance_type": "t3.medium", + "instance_ami": "ami-05577c8d07333d909", + "node_count": 1, + "zone": "af-south-1b" + } + }, + { + "af-south-1c": { + "region": "af-south-1", + "instance_type": "t3.medium", + "instance_ami": "ami-05577c8d07333d909", + "node_count": 1, + "zone": "af-south-1c" + } + }, + { + "ap-east-1a": { + "region": "ap-east-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0a0aa69b665a45b96", + "node_count": 1, + "zone": "ap-east-1a" + } + }, + { + "ap-east-1b": { + "region": "ap-east-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0a0aa69b665a45b96", + "node_count": 1, + "zone": "ap-east-1b" + } + }, + { + "ap-east-1c": { + "region": "ap-east-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0a0aa69b665a45b96", + "node_count": 1, + "zone": "ap-east-1c" + } + }, + { + "ap-northeast-1a": { + "region": "ap-northeast-1", + "instance_type": "t3.medium", + "instance_ami": "ami-08f191dd81ec3a3de", + "node_count": 1, + "zone": "ap-northeast-1a" + } + }, + { + "ap-northeast-1c": { + "region": "ap-northeast-1", + "instance_type": "t3.medium", + "instance_ami": "ami-08f191dd81ec3a3de", + "node_count": 1, + "zone": "ap-northeast-1c" + } + }, + { + "ap-northeast-1d": { + "region": "ap-northeast-1", + "instance_type": "t3.medium", + "instance_ami": "ami-08f191dd81ec3a3de", + "node_count": 1, + "zone": "ap-northeast-1d" + } + }, + { + "ap-northeast-2a": { + "region": "ap-northeast-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0077297a838d6761d", + "node_count": 1, + "zone": "ap-northeast-2a" + } + }, + { + "ap-northeast-2b": { + "region": "ap-northeast-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0077297a838d6761d", + "node_count": 1, + "zone": "ap-northeast-2b" + } + }, + { + "ap-northeast-2c": { + "region": "ap-northeast-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0077297a838d6761d", + "node_count": 1, + "zone": "ap-northeast-2c" + } + }, + { + "ap-northeast-2d": { + "region": "ap-northeast-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0077297a838d6761d", + "node_count": 1, + "zone": "ap-northeast-2d" + } + }, + { + "ap-northeast-3a": { + "region": "ap-northeast-3", + "instance_type": "t3.medium", + "instance_ami": "ami-0a0bcba223270ed99", + "node_count": 1, + "zone": "ap-northeast-3a" + } + }, + { + "ap-northeast-3b": { + "region": "ap-northeast-3", + "instance_type": "t3.medium", + "instance_ami": "ami-0a0bcba223270ed99", + "node_count": 1, + "zone": "ap-northeast-3b" + } + }, + { + "ap-northeast-3c": { + "region": "ap-northeast-3", + "instance_type": "t3.medium", + "instance_ami": "ami-0a0bcba223270ed99", + "node_count": 1, + "zone": "ap-northeast-3c" + } + }, + { + "ap-south-1a": { + "region": "ap-south-1", + "instance_type": "t3.medium", + "instance_ami": "ami-023a307f3d27ea427", + "node_count": 1, + "zone": "ap-south-1a" + } + }, + { + "ap-south-1b": { + "region": "ap-south-1", + "instance_type": "t3.medium", + "instance_ami": "ami-023a307f3d27ea427", + "node_count": 1, + "zone": "ap-south-1b" + } + }, + { + "ap-south-1c": { + "region": "ap-south-1", + "instance_type": "t3.medium", + "instance_ami": "ami-023a307f3d27ea427", + "node_count": 1, + "zone": 
"ap-south-1c" + } + }, + { + "ap-south-2a": { + "region": "ap-south-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0a94a70b8a1454a4b", + "node_count": 1, + "zone": "ap-south-2a" + } + }, + { + "ap-south-2b": { + "region": "ap-south-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0a94a70b8a1454a4b", + "node_count": 1, + "zone": "ap-south-2b" + } + }, + { + "ap-south-2c": { + "region": "ap-south-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0a94a70b8a1454a4b", + "node_count": 1, + "zone": "ap-south-2c" + } + }, + { + "ap-southeast-1a": { + "region": "ap-southeast-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0198a868663199764", + "node_count": 1, + "zone": "ap-southeast-1a" + } + }, + { + "ap-southeast-1b": { + "region": "ap-southeast-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0198a868663199764", + "node_count": 1, + "zone": "ap-southeast-1b" + } + }, + { + "ap-southeast-1c": { + "region": "ap-southeast-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0198a868663199764", + "node_count": 1, + "zone": "ap-southeast-1c" + } + }, + { + "ap-southeast-2a": { + "region": "ap-southeast-2", + "instance_type": "t3.medium", + "instance_ami": "ami-01e2093820bf84df1", + "node_count": 1, + "zone": "ap-southeast-2a" + } + }, + { + "ap-southeast-2b": { + "region": "ap-southeast-2", + "instance_type": "t3.medium", + "instance_ami": "ami-01e2093820bf84df1", + "node_count": 1, + "zone": "ap-southeast-2b" + } + }, + { + "ap-southeast-2c": { + "region": "ap-southeast-2", + "instance_type": "t3.medium", + "instance_ami": "ami-01e2093820bf84df1", + "node_count": 1, + "zone": "ap-southeast-2c" + } + }, + { + "ap-southeast-3a": { + "region": "ap-southeast-3", + "instance_type": "t3.medium", + "instance_ami": "ami-01d23da707abb0f2f", + "node_count": 1, + "zone": "ap-southeast-3a" + } + }, + { + "ap-southeast-3b": { + "region": "ap-southeast-3", + "instance_type": "t3.medium", + "instance_ami": "ami-01d23da707abb0f2f", + "node_count": 1, + "zone": "ap-southeast-3b" + } + }, + { + "ap-southeast-3c": { + "region": "ap-southeast-3", + "instance_type": "t3.medium", + "instance_ami": "ami-01d23da707abb0f2f", + "node_count": 1, + "zone": "ap-southeast-3c" + } + }, + { + "ap-southeast-4a": { + "region": "ap-southeast-4", + "instance_type": "t3.medium", + "instance_ami": "ami-05e0c17592d882511", + "node_count": 1, + "zone": "ap-southeast-4a" + } + }, + { + "ap-southeast-4b": { + "region": "ap-southeast-4", + "instance_type": "t3.medium", + "instance_ami": "ami-05e0c17592d882511", + "node_count": 1, + "zone": "ap-southeast-4b" + } + }, + { + "ap-southeast-4c": { + "region": "ap-southeast-4", + "instance_type": "t3.medium", + "instance_ami": "ami-05e0c17592d882511", + "node_count": 1, + "zone": "ap-southeast-4c" + } + }, + { + "ap-southeast-5a": { + "region": "ap-southeast-5", + "instance_type": "t3.medium", + "instance_ami": "ami-0a88b63550bbb2b1a", + "node_count": 1, + "zone": "ap-southeast-5a" + } + }, + { + "ap-southeast-5b": { + "region": "ap-southeast-5", + "instance_type": "t3.medium", + "instance_ami": "ami-0a88b63550bbb2b1a", + "node_count": 1, + "zone": "ap-southeast-5b" + } + }, + { + "ap-southeast-5c": { + "region": "ap-southeast-5", + "instance_type": "t3.medium", + "instance_ami": "ami-0a88b63550bbb2b1a", + "node_count": 1, + "zone": "ap-southeast-5c" + } + }, + { + "ca-central-1a": { + "region": "ca-central-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0a474b3a85d51a5e5", + "node_count": 1, + "zone": "ca-central-1a" + } + }, + 
{ + "ca-central-1b": { + "region": "ca-central-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0a474b3a85d51a5e5", + "node_count": 1, + "zone": "ca-central-1b" + } + }, + { + "ca-central-1d": { + "region": "ca-central-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0a474b3a85d51a5e5", + "node_count": 1, + "zone": "ca-central-1d" + } + }, + { + "ca-west-1a": { + "region": "ca-west-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0bd5d3965e2cc8c99", + "node_count": 1, + "zone": "ca-west-1a" + } + }, + { + "ca-west-1b": { + "region": "ca-west-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0bd5d3965e2cc8c99", + "node_count": 1, + "zone": "ca-west-1b" + } + }, + { + "ca-west-1c": { + "region": "ca-west-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0bd5d3965e2cc8c99", + "node_count": 1, + "zone": "ca-west-1c" + } + }, + { + "eu-central-1a": { + "region": "eu-central-1", + "instance_type": "t3.medium", + "instance_ami": "ami-03b3b5f65db7e5c6f", + "node_count": 1, + "zone": "eu-central-1a" + } + }, + { + "eu-central-1b": { + "region": "eu-central-1", + "instance_type": "t3.medium", + "instance_ami": "ami-03b3b5f65db7e5c6f", + "node_count": 1, + "zone": "eu-central-1b" + } + }, + { + "eu-central-1c": { + "region": "eu-central-1", + "instance_type": "t3.medium", + "instance_ami": "ami-03b3b5f65db7e5c6f", + "node_count": 1, + "zone": "eu-central-1c" + } + }, + { + "eu-central-2a": { + "region": "eu-central-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0039b5f24283949b4", + "node_count": 1, + "zone": "eu-central-2a" + } + }, + { + "eu-central-2b": { + "region": "eu-central-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0039b5f24283949b4", + "node_count": 1, + "zone": "eu-central-2b" + } + }, + { + "eu-central-2c": { + "region": "eu-central-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0039b5f24283949b4", + "node_count": 1, + "zone": "eu-central-2c" + } + }, + { + "eu-north-1a": { + "region": "eu-north-1", + "instance_type": "t3.medium", + "instance_ami": "ami-02e2af61198e99faf", + "node_count": 1, + "zone": "eu-north-1a" + } + }, + { + "eu-north-1b": { + "region": "eu-north-1", + "instance_type": "t3.medium", + "instance_ami": "ami-02e2af61198e99faf", + "node_count": 1, + "zone": "eu-north-1b" + } + }, + { + "eu-north-1c": { + "region": "eu-north-1", + "instance_type": "t3.medium", + "instance_ami": "ami-02e2af61198e99faf", + "node_count": 1, + "zone": "eu-north-1c" + } + }, + { + "eu-south-1a": { + "region": "eu-south-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0ded58aac79f90084", + "node_count": 1, + "zone": "eu-south-1a" + } + }, + { + "eu-south-1b": { + "region": "eu-south-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0ded58aac79f90084", + "node_count": 1, + "zone": "eu-south-1b" + } + }, + { + "eu-south-1c": { + "region": "eu-south-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0ded58aac79f90084", + "node_count": 1, + "zone": "eu-south-1c" + } + }, + { + "eu-south-2a": { + "region": "eu-south-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0895641da3c86443b", + "node_count": 1, + "zone": "eu-south-2a" + } + }, + { + "eu-south-2b": { + "region": "eu-south-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0895641da3c86443b", + "node_count": 1, + "zone": "eu-south-2b" + } + }, + { + "eu-south-2c": { + "region": "eu-south-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0895641da3c86443b", + "node_count": 1, + "zone": "eu-south-2c" + } + 
}, + { + "eu-west-1a": { + "region": "eu-west-1", + "instance_type": "t3.medium", + "instance_ami": "ami-032a56ad5e480189c", + "node_count": 1, + "zone": "eu-west-1a" + } + }, + { + "eu-west-1b": { + "region": "eu-west-1", + "instance_type": "t3.medium", + "instance_ami": "ami-032a56ad5e480189c", + "node_count": 1, + "zone": "eu-west-1b" + } + }, + { + "eu-west-1c": { + "region": "eu-west-1", + "instance_type": "t3.medium", + "instance_ami": "ami-032a56ad5e480189c", + "node_count": 1, + "zone": "eu-west-1c" + } + }, + { + "eu-west-2a": { + "region": "eu-west-2", + "instance_type": "t3.medium", + "instance_ami": "ami-06cff85354b67982b", + "node_count": 1, + "zone": "eu-west-2a" + } + }, + { + "eu-west-2b": { + "region": "eu-west-2", + "instance_type": "t3.medium", + "instance_ami": "ami-06cff85354b67982b", + "node_count": 1, + "zone": "eu-west-2b" + } + }, + { + "eu-west-2c": { + "region": "eu-west-2", + "instance_type": "t3.medium", + "instance_ami": "ami-06cff85354b67982b", + "node_count": 1, + "zone": "eu-west-2c" + } + }, + { + "eu-west-3a": { + "region": "eu-west-3", + "instance_type": "t3.medium", + "instance_ami": "ami-04a4acda26ca36de0", + "node_count": 1, + "zone": "eu-west-3a" + } + }, + { + "eu-west-3b": { + "region": "eu-west-3", + "instance_type": "t3.medium", + "instance_ami": "ami-04a4acda26ca36de0", + "node_count": 1, + "zone": "eu-west-3b" + } + }, + { + "eu-west-3c": { + "region": "eu-west-3", + "instance_type": "t3.medium", + "instance_ami": "ami-04a4acda26ca36de0", + "node_count": 1, + "zone": "eu-west-3c" + } + }, + { + "me-central-1a": { + "region": "me-central-1", + "instance_type": "t3.medium", + "instance_ami": "ami-07a33155d2d5abff0", + "node_count": 1, + "zone": "me-central-1a" + } + }, + { + "me-central-1b": { + "region": "me-central-1", + "instance_type": "t3.medium", + "instance_ami": "ami-07a33155d2d5abff0", + "node_count": 1, + "zone": "me-central-1b" + } + }, + { + "me-central-1c": { + "region": "me-central-1", + "instance_type": "t3.medium", + "instance_ami": "ami-07a33155d2d5abff0", + "node_count": 1, + "zone": "me-central-1c" + } + }, + { + "me-south-1a": { + "region": "me-south-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0b32599a51ef0ad90", + "node_count": 1, + "zone": "me-south-1a" + } + }, + { + "me-south-1b": { + "region": "me-south-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0b32599a51ef0ad90", + "node_count": 1, + "zone": "me-south-1b" + } + }, + { + "me-south-1c": { + "region": "me-south-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0b32599a51ef0ad90", + "node_count": 1, + "zone": "me-south-1c" + } + }, + { + "sa-east-1a": { + "region": "sa-east-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0780816dd7ce942fd", + "node_count": 1, + "zone": "sa-east-1a" + } + }, + { + "sa-east-1b": { + "region": "sa-east-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0780816dd7ce942fd", + "node_count": 1, + "zone": "sa-east-1b" + } + }, + { + "sa-east-1c": { + "region": "sa-east-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0780816dd7ce942fd", + "node_count": 1, + "zone": "sa-east-1c" + } + }, + { + "us-east-1a": { + "region": "us-east-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0e1bed4f06a3b463d", + "node_count": 1, + "zone": "us-east-1a" + } + }, + { + "us-east-1b": { + "region": "us-east-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0e1bed4f06a3b463d", + "node_count": 1, + "zone": "us-east-1b" + } + }, + { + "us-east-1c": { + "region": "us-east-1", + 
"instance_type": "t3.medium", + "instance_ami": "ami-0e1bed4f06a3b463d", + "node_count": 1, + "zone": "us-east-1c" + } + }, + { + "us-east-1d": { + "region": "us-east-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0e1bed4f06a3b463d", + "node_count": 1, + "zone": "us-east-1d" + } + }, + { + "us-east-1e": { + "region": "us-east-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0e1bed4f06a3b463d", + "node_count": 1, + "zone": "us-east-1e" + } + }, + { + "us-east-1f": { + "region": "us-east-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0e1bed4f06a3b463d", + "node_count": 1, + "zone": "us-east-1f" + } + }, + { + "us-east-2a": { + "region": "us-east-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0884d2865dbe9de4b", + "node_count": 1, + "zone": "us-east-2a" + } + }, + { + "us-east-2b": { + "region": "us-east-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0884d2865dbe9de4b", + "node_count": 1, + "zone": "us-east-2b" + } + }, + { + "us-east-2c": { + "region": "us-east-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0884d2865dbe9de4b", + "node_count": 1, + "zone": "us-east-2c" + } + }, + { + "us-west-1b": { + "region": "us-west-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0d413c682033e11fd", + "node_count": 1, + "zone": "us-west-1b" + } + }, + { + "us-west-1c": { + "region": "us-west-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0d413c682033e11fd", + "node_count": 1, + "zone": "us-west-1c" + } + }, + { + "us-west-2a": { + "region": "us-west-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0606dd43116f5ed57", + "node_count": 1, + "zone": "us-west-2a" + } + }, + { + "us-west-2b": { + "region": "us-west-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0606dd43116f5ed57", + "node_count": 1, + "zone": "us-west-2b" + } + }, + { + "us-west-2c": { + "region": "us-west-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0606dd43116f5ed57", + "node_count": 1, + "zone": "us-west-2c" + } + }, + { + "us-west-2d": { + "region": "us-west-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0606dd43116f5ed57", + "node_count": 1, + "zone": "us-west-2d" + } + } +] \ No newline at end of file diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/locations/all_locations.yaml b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/locations/all_locations.yaml new file mode 100644 index 00000000..2fafb242 --- /dev/null +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/locations/all_locations.yaml @@ -0,0 +1,546 @@ +- af-south-1a: + instance_ami: ami-05577c8d07333d909 + instance_type: t3.medium + node_count: 1 + region: af-south-1 + zone: af-south-1a +- af-south-1b: + instance_ami: ami-05577c8d07333d909 + instance_type: t3.medium + node_count: 1 + region: af-south-1 + zone: af-south-1b +- af-south-1c: + instance_ami: ami-05577c8d07333d909 + instance_type: t3.medium + node_count: 1 + region: af-south-1 + zone: af-south-1c +- ap-east-1a: + instance_ami: ami-0a0aa69b665a45b96 + instance_type: t3.medium + node_count: 1 + region: ap-east-1 + zone: ap-east-1a +- ap-east-1b: + instance_ami: ami-0a0aa69b665a45b96 + instance_type: t3.medium + node_count: 1 + region: ap-east-1 + zone: ap-east-1b +- ap-east-1c: + instance_ami: ami-0a0aa69b665a45b96 + instance_type: t3.medium + node_count: 1 + region: ap-east-1 + zone: ap-east-1c +- ap-northeast-1a: + instance_ami: ami-08f191dd81ec3a3de + instance_type: t3.medium + node_count: 1 + 
region: ap-northeast-1 + zone: ap-northeast-1a +- ap-northeast-1c: + instance_ami: ami-08f191dd81ec3a3de + instance_type: t3.medium + node_count: 1 + region: ap-northeast-1 + zone: ap-northeast-1c +- ap-northeast-1d: + instance_ami: ami-08f191dd81ec3a3de + instance_type: t3.medium + node_count: 1 + region: ap-northeast-1 + zone: ap-northeast-1d +- ap-northeast-2a: + instance_ami: ami-0077297a838d6761d + instance_type: t3.medium + node_count: 1 + region: ap-northeast-2 + zone: ap-northeast-2a +- ap-northeast-2b: + instance_ami: ami-0077297a838d6761d + instance_type: t3.medium + node_count: 1 + region: ap-northeast-2 + zone: ap-northeast-2b +- ap-northeast-2c: + instance_ami: ami-0077297a838d6761d + instance_type: t3.medium + node_count: 1 + region: ap-northeast-2 + zone: ap-northeast-2c +- ap-northeast-2d: + instance_ami: ami-0077297a838d6761d + instance_type: t3.medium + node_count: 1 + region: ap-northeast-2 + zone: ap-northeast-2d +- ap-northeast-3a: + instance_ami: ami-0a0bcba223270ed99 + instance_type: t3.medium + node_count: 1 + region: ap-northeast-3 + zone: ap-northeast-3a +- ap-northeast-3b: + instance_ami: ami-0a0bcba223270ed99 + instance_type: t3.medium + node_count: 1 + region: ap-northeast-3 + zone: ap-northeast-3b +- ap-northeast-3c: + instance_ami: ami-0a0bcba223270ed99 + instance_type: t3.medium + node_count: 1 + region: ap-northeast-3 + zone: ap-northeast-3c +- ap-south-1a: + instance_ami: ami-023a307f3d27ea427 + instance_type: t3.medium + node_count: 1 + region: ap-south-1 + zone: ap-south-1a +- ap-south-1b: + instance_ami: ami-023a307f3d27ea427 + instance_type: t3.medium + node_count: 1 + region: ap-south-1 + zone: ap-south-1b +- ap-south-1c: + instance_ami: ami-023a307f3d27ea427 + instance_type: t3.medium + node_count: 1 + region: ap-south-1 + zone: ap-south-1c +- ap-south-2a: + instance_ami: ami-0a94a70b8a1454a4b + instance_type: t3.medium + node_count: 1 + region: ap-south-2 + zone: ap-south-2a +- ap-south-2b: + instance_ami: ami-0a94a70b8a1454a4b + instance_type: t3.medium + node_count: 1 + region: ap-south-2 + zone: ap-south-2b +- ap-south-2c: + instance_ami: ami-0a94a70b8a1454a4b + instance_type: t3.medium + node_count: 1 + region: ap-south-2 + zone: ap-south-2c +- ap-southeast-1a: + instance_ami: ami-0198a868663199764 + instance_type: t3.medium + node_count: 1 + region: ap-southeast-1 + zone: ap-southeast-1a +- ap-southeast-1b: + instance_ami: ami-0198a868663199764 + instance_type: t3.medium + node_count: 1 + region: ap-southeast-1 + zone: ap-southeast-1b +- ap-southeast-1c: + instance_ami: ami-0198a868663199764 + instance_type: t3.medium + node_count: 1 + region: ap-southeast-1 + zone: ap-southeast-1c +- ap-southeast-2a: + instance_ami: ami-01e2093820bf84df1 + instance_type: t3.medium + node_count: 1 + region: ap-southeast-2 + zone: ap-southeast-2a +- ap-southeast-2b: + instance_ami: ami-01e2093820bf84df1 + instance_type: t3.medium + node_count: 1 + region: ap-southeast-2 + zone: ap-southeast-2b +- ap-southeast-2c: + instance_ami: ami-01e2093820bf84df1 + instance_type: t3.medium + node_count: 1 + region: ap-southeast-2 + zone: ap-southeast-2c +- ap-southeast-3a: + instance_ami: ami-01d23da707abb0f2f + instance_type: t3.medium + node_count: 1 + region: ap-southeast-3 + zone: ap-southeast-3a +- ap-southeast-3b: + instance_ami: ami-01d23da707abb0f2f + instance_type: t3.medium + node_count: 1 + region: ap-southeast-3 + zone: ap-southeast-3b +- ap-southeast-3c: + instance_ami: ami-01d23da707abb0f2f + instance_type: t3.medium + node_count: 1 + region: ap-southeast-3 + 
zone: ap-southeast-3c +- ap-southeast-4a: + instance_ami: ami-05e0c17592d882511 + instance_type: t3.medium + node_count: 1 + region: ap-southeast-4 + zone: ap-southeast-4a +- ap-southeast-4b: + instance_ami: ami-05e0c17592d882511 + instance_type: t3.medium + node_count: 1 + region: ap-southeast-4 + zone: ap-southeast-4b +- ap-southeast-4c: + instance_ami: ami-05e0c17592d882511 + instance_type: t3.medium + node_count: 1 + region: ap-southeast-4 + zone: ap-southeast-4c +- ap-southeast-5a: + instance_ami: ami-0a88b63550bbb2b1a + instance_type: t3.medium + node_count: 1 + region: ap-southeast-5 + zone: ap-southeast-5a +- ap-southeast-5b: + instance_ami: ami-0a88b63550bbb2b1a + instance_type: t3.medium + node_count: 1 + region: ap-southeast-5 + zone: ap-southeast-5b +- ap-southeast-5c: + instance_ami: ami-0a88b63550bbb2b1a + instance_type: t3.medium + node_count: 1 + region: ap-southeast-5 + zone: ap-southeast-5c +- ca-central-1a: + instance_ami: ami-0a474b3a85d51a5e5 + instance_type: t3.medium + node_count: 1 + region: ca-central-1 + zone: ca-central-1a +- ca-central-1b: + instance_ami: ami-0a474b3a85d51a5e5 + instance_type: t3.medium + node_count: 1 + region: ca-central-1 + zone: ca-central-1b +- ca-central-1d: + instance_ami: ami-0a474b3a85d51a5e5 + instance_type: t3.medium + node_count: 1 + region: ca-central-1 + zone: ca-central-1d +- ca-west-1a: + instance_ami: ami-0bd5d3965e2cc8c99 + instance_type: t3.medium + node_count: 1 + region: ca-west-1 + zone: ca-west-1a +- ca-west-1b: + instance_ami: ami-0bd5d3965e2cc8c99 + instance_type: t3.medium + node_count: 1 + region: ca-west-1 + zone: ca-west-1b +- ca-west-1c: + instance_ami: ami-0bd5d3965e2cc8c99 + instance_type: t3.medium + node_count: 1 + region: ca-west-1 + zone: ca-west-1c +- eu-central-1a: + instance_ami: ami-03b3b5f65db7e5c6f + instance_type: t3.medium + node_count: 1 + region: eu-central-1 + zone: eu-central-1a +- eu-central-1b: + instance_ami: ami-03b3b5f65db7e5c6f + instance_type: t3.medium + node_count: 1 + region: eu-central-1 + zone: eu-central-1b +- eu-central-1c: + instance_ami: ami-03b3b5f65db7e5c6f + instance_type: t3.medium + node_count: 1 + region: eu-central-1 + zone: eu-central-1c +- eu-central-2a: + instance_ami: ami-0039b5f24283949b4 + instance_type: t3.medium + node_count: 1 + region: eu-central-2 + zone: eu-central-2a +- eu-central-2b: + instance_ami: ami-0039b5f24283949b4 + instance_type: t3.medium + node_count: 1 + region: eu-central-2 + zone: eu-central-2b +- eu-central-2c: + instance_ami: ami-0039b5f24283949b4 + instance_type: t3.medium + node_count: 1 + region: eu-central-2 + zone: eu-central-2c +- eu-north-1a: + instance_ami: ami-02e2af61198e99faf + instance_type: t3.medium + node_count: 1 + region: eu-north-1 + zone: eu-north-1a +- eu-north-1b: + instance_ami: ami-02e2af61198e99faf + instance_type: t3.medium + node_count: 1 + region: eu-north-1 + zone: eu-north-1b +- eu-north-1c: + instance_ami: ami-02e2af61198e99faf + instance_type: t3.medium + node_count: 1 + region: eu-north-1 + zone: eu-north-1c +- eu-south-1a: + instance_ami: ami-0ded58aac79f90084 + instance_type: t3.medium + node_count: 1 + region: eu-south-1 + zone: eu-south-1a +- eu-south-1b: + instance_ami: ami-0ded58aac79f90084 + instance_type: t3.medium + node_count: 1 + region: eu-south-1 + zone: eu-south-1b +- eu-south-1c: + instance_ami: ami-0ded58aac79f90084 + instance_type: t3.medium + node_count: 1 + region: eu-south-1 + zone: eu-south-1c +- eu-south-2a: + instance_ami: ami-0895641da3c86443b + instance_type: t3.medium + node_count: 1 + 
region: eu-south-2 + zone: eu-south-2a +- eu-south-2b: + instance_ami: ami-0895641da3c86443b + instance_type: t3.medium + node_count: 1 + region: eu-south-2 + zone: eu-south-2b +- eu-south-2c: + instance_ami: ami-0895641da3c86443b + instance_type: t3.medium + node_count: 1 + region: eu-south-2 + zone: eu-south-2c +- eu-west-1a: + instance_ami: ami-032a56ad5e480189c + instance_type: t3.medium + node_count: 1 + region: eu-west-1 + zone: eu-west-1a +- eu-west-1b: + instance_ami: ami-032a56ad5e480189c + instance_type: t3.medium + node_count: 1 + region: eu-west-1 + zone: eu-west-1b +- eu-west-1c: + instance_ami: ami-032a56ad5e480189c + instance_type: t3.medium + node_count: 1 + region: eu-west-1 + zone: eu-west-1c +- eu-west-2a: + instance_ami: ami-06cff85354b67982b + instance_type: t3.medium + node_count: 1 + region: eu-west-2 + zone: eu-west-2a +- eu-west-2b: + instance_ami: ami-06cff85354b67982b + instance_type: t3.medium + node_count: 1 + region: eu-west-2 + zone: eu-west-2b +- eu-west-2c: + instance_ami: ami-06cff85354b67982b + instance_type: t3.medium + node_count: 1 + region: eu-west-2 + zone: eu-west-2c +- eu-west-3a: + instance_ami: ami-04a4acda26ca36de0 + instance_type: t3.medium + node_count: 1 + region: eu-west-3 + zone: eu-west-3a +- eu-west-3b: + instance_ami: ami-04a4acda26ca36de0 + instance_type: t3.medium + node_count: 1 + region: eu-west-3 + zone: eu-west-3b +- eu-west-3c: + instance_ami: ami-04a4acda26ca36de0 + instance_type: t3.medium + node_count: 1 + region: eu-west-3 + zone: eu-west-3c +- me-central-1a: + instance_ami: ami-07a33155d2d5abff0 + instance_type: t3.medium + node_count: 1 + region: me-central-1 + zone: me-central-1a +- me-central-1b: + instance_ami: ami-07a33155d2d5abff0 + instance_type: t3.medium + node_count: 1 + region: me-central-1 + zone: me-central-1b +- me-central-1c: + instance_ami: ami-07a33155d2d5abff0 + instance_type: t3.medium + node_count: 1 + region: me-central-1 + zone: me-central-1c +- me-south-1a: + instance_ami: ami-0b32599a51ef0ad90 + instance_type: t3.medium + node_count: 1 + region: me-south-1 + zone: me-south-1a +- me-south-1b: + instance_ami: ami-0b32599a51ef0ad90 + instance_type: t3.medium + node_count: 1 + region: me-south-1 + zone: me-south-1b +- me-south-1c: + instance_ami: ami-0b32599a51ef0ad90 + instance_type: t3.medium + node_count: 1 + region: me-south-1 + zone: me-south-1c +- sa-east-1a: + instance_ami: ami-0780816dd7ce942fd + instance_type: t3.medium + node_count: 1 + region: sa-east-1 + zone: sa-east-1a +- sa-east-1b: + instance_ami: ami-0780816dd7ce942fd + instance_type: t3.medium + node_count: 1 + region: sa-east-1 + zone: sa-east-1b +- sa-east-1c: + instance_ami: ami-0780816dd7ce942fd + instance_type: t3.medium + node_count: 1 + region: sa-east-1 + zone: sa-east-1c +- us-east-1a: + instance_ami: ami-0e1bed4f06a3b463d + instance_type: t3.medium + node_count: 1 + region: us-east-1 + zone: us-east-1a +- us-east-1b: + instance_ami: ami-0e1bed4f06a3b463d + instance_type: t3.medium + node_count: 1 + region: us-east-1 + zone: us-east-1b +- us-east-1c: + instance_ami: ami-0e1bed4f06a3b463d + instance_type: t3.medium + node_count: 1 + region: us-east-1 + zone: us-east-1c +- us-east-1d: + instance_ami: ami-0e1bed4f06a3b463d + instance_type: t3.medium + node_count: 1 + region: us-east-1 + zone: us-east-1d +- us-east-1e: + instance_ami: ami-0e1bed4f06a3b463d + instance_type: t3.medium + node_count: 1 + region: us-east-1 + zone: us-east-1e +- us-east-1f: + instance_ami: ami-0e1bed4f06a3b463d + instance_type: t3.medium + node_count: 1 + 
region: us-east-1 + zone: us-east-1f +- us-east-2a: + instance_ami: ami-0884d2865dbe9de4b + instance_type: t3.medium + node_count: 1 + region: us-east-2 + zone: us-east-2a +- us-east-2b: + instance_ami: ami-0884d2865dbe9de4b + instance_type: t3.medium + node_count: 1 + region: us-east-2 + zone: us-east-2b +- us-east-2c: + instance_ami: ami-0884d2865dbe9de4b + instance_type: t3.medium + node_count: 1 + region: us-east-2 + zone: us-east-2c +- us-west-1b: + instance_ami: ami-0d413c682033e11fd + instance_type: t3.medium + node_count: 1 + region: us-west-1 + zone: us-west-1b +- us-west-1c: + instance_ami: ami-0d413c682033e11fd + instance_type: t3.medium + node_count: 1 + region: us-west-1 + zone: us-west-1c +- us-west-2a: + instance_ami: ami-0606dd43116f5ed57 + instance_type: t3.medium + node_count: 1 + region: us-west-2 + zone: us-west-2a +- us-west-2b: + instance_ami: ami-0606dd43116f5ed57 + instance_type: t3.medium + node_count: 1 + region: us-west-2 + zone: us-west-2b +- us-west-2c: + instance_ami: ami-0606dd43116f5ed57 + instance_type: t3.medium + node_count: 1 + region: us-west-2 + zone: us-west-2c +- us-west-2d: + instance_ami: ami-0606dd43116f5ed57 + instance_type: t3.medium + node_count: 1 + region: us-west-2 + zone: us-west-2d diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/main.tf b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/main.tf index 8e01063c..ee42e8c2 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/main.tf +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/main.tf @@ -1,9 +1,15 @@ # Configure the AWS Provider for the current workspace/region + + provider "aws" { region = var.region shared_config_files = ["~/.aws/config"] shared_credentials_files = ["~/.aws/credentials"] - profile = "default" +} + +# Create a unique identifier for this zone deployment +locals { + zone_id = "${var.region}-${var.zone}" } module "region" { @@ -13,7 +19,7 @@ module "region" { zone = var.zone instance_ami = var.instance_ami node_count = var.node_count - app_tag = var.app_tag + app_tag = "${var.app_tag}-${local.zone_id}" aws_instance_type = var.instance_type public_key_path = var.public_key_path private_key_path = var.private_key_path diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/instance/scripts/install_docker.sh b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/instance/scripts/install_docker.sh index bc674c9b..c74e5d18 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/instance/scripts/install_docker.sh +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/instance/scripts/install_docker.sh @@ -1,5 +1,8 @@ #!/usr/bin/env bash +# Exit on error +set -e + # Detect OS if [ -f /etc/os-release ]; then . /etc/os-release @@ -8,23 +11,74 @@ fi echo "Detected OS: $OS" +# Function to retry commands +retry_command() { + local n=0 + local max=5 + local delay=15 + while true; do + "$@" && break || { + if [[ $n -lt $max ]]; then + ((n++)) + echo "Command failed. Attempt $n/$max. Retrying in $delay seconds..." + sleep $delay + else + echo "The command has failed after $n attempts." + return 1 + fi + } + done +} + # Install Docker based on available package manager if command -v apt-get >/dev/null 2>&1; then # Debian/Ubuntu installation - apt-get update - apt-get install -y ca-certificates curl gnupg + echo "Using apt package manager..." 
+ + # Update package list with retry + retry_command apt-get update + + # Install prerequisites with retry + retry_command apt-get install -y \ + ca-certificates \ + curl \ + gnupg \ + pigz \ + libltdl7 \ + libslirp0 \ + slirp4netns \ + apt-transport-https \ + software-properties-common + + # Setup Docker repository install -m 0755 -d /etc/apt/keyrings - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg + rm -f /etc/apt/keyrings/docker.gpg + retry_command curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg chmod a+r /etc/apt/keyrings/docker.gpg - echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null - apt-get update - apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin + + # Add Docker repository + echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ + $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ + tee /etc/apt/sources.list.d/docker.list > /dev/null + + # Update again with retry after adding Docker repository + retry_command apt-get update + + # Install Docker packages with retry + retry_command apt-get install -y \ + docker-ce \ + docker-ce-cli \ + containerd.io \ + docker-buildx-plugin \ + docker-compose-plugin elif command -v yum >/dev/null 2>&1; then # DNF-based systems (Amazon Linux 2023, Fedora, RHEL) - yum install docker -y + echo "Using yum package manager..." + retry_command yum install -y docker mkdir -p /usr/local/lib/docker/cli-plugins/ - curl -SL https://github.com/docker/compose/releases/download/v2.24.5/docker-compose-linux-x86_64 -o /usr/local/lib/docker/cli-plugins/docker-compose + retry_command curl -SL https://github.com/docker/compose/releases/download/v2.24.5/docker-compose-linux-x86_64 -o /usr/local/lib/docker/cli-plugins/docker-compose chmod +x /usr/local/lib/docker/cli-plugins/docker-compose else @@ -32,10 +86,27 @@ else exit 1 fi -# Start and enable Docker service -systemctl start docker -systemctl enable docker +# Start and enable Docker service with retry +echo "Starting Docker service..." +systemctl start docker || { + echo "Failed to start Docker service. Waiting 10 seconds and trying again..." + sleep 10 + systemctl start docker +} + +echo "Enabling Docker service..." +systemctl enable docker || { + echo "Failed to enable Docker service. Waiting 10 seconds and trying again..." + sleep 10 + systemctl enable docker +} # Verify installations -docker --version -docker compose version +echo "Verifying Docker installation..." 
+if command -v docker >/dev/null 2>&1; then + docker --version + docker compose version +else + echo "Docker installation verification failed" + exit 1 +fi diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/instance/scripts/startup.sh b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/instance/scripts/startup.sh index ca487235..5d094185 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/instance/scripts/startup.sh +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/instance/scripts/startup.sh @@ -37,7 +37,7 @@ get_cloud_metadata() { METADATA=$(curl -s -H "Metadata:true" "http://169.254.169.254/metadata/instance?api-version=2021-02-01") REGION=$(echo "$METADATA" | jq -r .compute.location) ZONE=$(echo "$METADATA" | jq -r .compute.zone) - PUBLIC_IP=$(curl -s -H "Metadata:true" "http://169.254.169.254/metadata/instance/network/interface/0/ipv4/ipAddress/0/publicIpAddress?api-version=2021-02-01&format=text") + PUBLIC_IP=$(curl -s ip.me) PRIVATE_IP=$(echo "$METADATA" | jq -r .network.interface[0].ipv4.ipAddress[0].privateIpAddress) INSTANCE_ID=$(echo "$METADATA" | jq -r .compute.vmId) INSTANCE_TYPE=$(echo "$METADATA" | jq -r .compute.vmSize) diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/instance/versions.tf b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/instance/versions.tf index edc22ffc..a9e3c714 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/instance/versions.tf +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/instance/versions.tf @@ -2,7 +2,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = "~> 4.0" + version = "~> 5.0" } random = { source = "hashicorp/random" diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/network/versions.tf b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/network/versions.tf index b3b846e1..a540207e 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/network/versions.tf +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/network/versions.tf @@ -2,8 +2,8 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = "~> 4.0" + version = "~> 5.0" configuration_aliases = [aws] } } -} +} diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/region/main.tf b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/region/main.tf index 3ff4178a..f4eff6bd 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/region/main.tf +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/region/main.tf @@ -5,9 +5,9 @@ module "networkModule" { region = var.region zone = var.zone - cidr_block_range = "10.0.0.0/16" - subnet1_cidr_block_range = "10.0.1.0/24" - subnet2_cidr_block_range = "10.0.2.0/24" + cidr_block_range = "10.${index(data.aws_availability_zones.available.names, var.zone)}.0.0/16" + subnet1_cidr_block_range = "10.${index(data.aws_availability_zones.available.names, var.zone)}.1.0/24" + subnet2_cidr_block_range = "10.${index(data.aws_availability_zones.available.names, var.zone)}.2.0/24" providers = { aws = aws @@ -50,3 +50,8 @@ module "instanceModule" { } } +# Get list of 
availability zones for CIDR block calculation +data "aws_availability_zones" "available" { + state = "available" +} + diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/region/versions.tf b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/region/versions.tf index 8c2e4cff..a540207e 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/region/versions.tf +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/region/versions.tf @@ -2,7 +2,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = "~> 4.0" + version = "~> 5.0" configuration_aliases = [aws] } } diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/securityGroup/versions.tf b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/securityGroup/versions.tf index b3b846e1..a540207e 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/securityGroup/versions.tf +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/securityGroup/versions.tf @@ -2,8 +2,8 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = "~> 4.0" + version = "~> 5.0" configuration_aliases = [aws] } } -} +} diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/utility_scripts/generate-all-locations-file.py b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/utility_scripts/generate-all-locations-file.py new file mode 100755 index 00000000..08e201cb --- /dev/null +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/utility_scripts/generate-all-locations-file.py @@ -0,0 +1,119 @@ +#!/usr/bin/env -S uv run --script +# /// script +# requires-python = ">=3.11" +# dependencies = [ +# "pyyaml", +# "boto3", +# ] +# /// + +import json +from pathlib import Path + +import boto3 +import yaml + + +def get_all_regions(): + """Get all AWS regions.""" + ec2 = boto3.client("ec2") + regions = [region["RegionName"] for region in ec2.describe_regions()["Regions"]] + return sorted(regions) + + +def get_region_zones(region): + """Get all availability zones for a given region.""" + ec2 = boto3.client("ec2", region_name=region) + zones = [ + zone["ZoneName"] + for zone in ec2.describe_availability_zones( + Filters=[{"Name": "state", "Values": ["available"]}] + )["AvailabilityZones"] + ] + return sorted(zones) + + +def get_latest_ubuntu_ami(region): + """Get the latest Ubuntu 22.04 LTS AMI ID for a region.""" + ec2 = boto3.client("ec2", region_name=region) + + try: + response = ec2.describe_images( + Filters=[ + { + "Name": "name", + "Values": [ + "ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-amd64-server-*" + ], + }, + {"Name": "state", "Values": ["available"]}, + {"Name": "architecture", "Values": ["x86_64"]}, + ], + Owners=["099720109477"], # Canonical's AWS account ID + ) + + # Sort images by creation date + images = sorted( + response["Images"], key=lambda x: x["CreationDate"], reverse=True + ) + if images: + return images[0]["ImageId"] + except Exception as e: + print(f"Warning: Could not get AMI for region {region}: {str(e)}") + + return None + + +def generate_locations_file(): + """Generate a YAML file with all AWS zones as top-level entries.""" + locations = [] + + for region in get_all_regions(): + try: + zones = get_region_zones(region) + ami_id = get_latest_ubuntu_ami(region) + + if ami_id: + # Create a 
zone-based configuration for each zone + for zone in zones: + zone_config = { + zone: { + "region": region, + "instance_type": "t3.medium", + "instance_ami": ami_id, + "node_count": 1, + "zone": zone, + } + } + locations.append(zone_config) + except Exception as e: + print(f"Warning: Could not process region {region}: {str(e)}") + + # Create the locations directory if it doesn't exist + output_dir = Path(__file__).parent.parent / "locations" + output_dir.mkdir(exist_ok=True) + + # Write YAML file + yaml_path = output_dir / "all_locations.yaml" + with open(yaml_path, "w") as f: + yaml.dump(locations, f, default_flow_style=False) + + # Write JSON file (as an alternative format) + json_path = output_dir / "all_locations.json" + with open(json_path, "w") as f: + json.dump(locations, f, indent=2) + + print("Generated files:") + print(f"YAML: {yaml_path}") + print(f"JSON: {json_path}") + + # Calculate total nodes + total_nodes = sum( + config[zone]["node_count"] for config in locations for zone in config + ) + print(f"\nTotal zones: {len(locations)}") + print(f"Total nodes: {total_nodes}") + + +if __name__ == "__main__": + generate_locations_file() diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/utility_scripts/generate-locations.sh b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/utility_scripts/generate-locations.sh deleted file mode 100755 index a518815d..00000000 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/utility_scripts/generate-locations.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash - -# Output file -OUTPUT_FILE="all_locations.yaml" - -# Truncate the output file if it exists -truncate -s 0 $OUTPUT_FILE - -# Initialize YAML file -echo "# Auto-generated locations configuration" > $OUTPUT_FILE -echo "# Using Amazon Linux 2023 AMIs" >> $OUTPUT_FILE - -# Get all available regions -REGIONS=$(aws ec2 describe-regions --query "Regions[].RegionName" --output text) - -for REGION in $REGIONS; do - # Get the latest Amazon Linux 2023 AMI - AMI=$(aws ec2 describe-images \ - --region "$REGION" \ - --owners amazon \ - --filters "Name=name,Values=al2023-ami-2023.*-x86_64" "Name=state,Values=available" \ - --query 'sort_by(Images, &CreationDate)[-1].ImageId' \ - --output text) - - # Skip if no AMI found - if [ "$AMI" == "None" ] || [ -z "$AMI" ]; then - echo "Skipping $REGION - No Amazon Linux 2023 AMI found" - continue - fi - - # Get the first availability zone - ZONE=$(aws ec2 describe-availability-zones \ - --region $REGION \ - --query "AvailabilityZones[0].ZoneName" \ - --output text) - - # Skip if no availability zone found - if [ "$ZONE" == "None" ] || [ -z "$ZONE" ]; then - echo "Skipping $REGION - No availability zone found" - continue - fi - - # Append to YAML file - REGION_BLOCK="- $REGION:" - REGION_BLOCK="$REGION_BLOCK\n region: $REGION" - REGION_BLOCK="$REGION_BLOCK\n zone: $ZONE" - REGION_BLOCK="$REGION_BLOCK\n instance_type: t3.medium" - REGION_BLOCK="$REGION_BLOCK\n instance_ami: $AMI" - REGION_BLOCK="$REGION_BLOCK\n node_count: 1" - echo -e "$REGION_BLOCK" >> $OUTPUT_FILE - echo -e "Added $REGION with AMI $AMI" -done - -echo "Generated locations.yaml successfully!" 
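Note: the Python generator added above supersedes the deleted generate-locations.sh — instead of recording only the first availability zone per region, it emits one entry per available zone and writes both YAML and JSON catalogs under locations/. A minimal usage sketch, assuming uv is installed and the active AWS credentials can call DescribeRegions, DescribeAvailabilityZones, and DescribeImages (the script's inline metadata pulls in boto3 and pyyaml on first run):

    # run from the AWS Terraform example directory
    cd setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS
    ./utility_scripts/generate-all-locations-file.py
    # expected outputs:
    #   locations/all_locations.yaml
    #   locations/all_locations.json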
diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/versions.tf b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/versions.tf index 5e0defad..e5c884a7 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/versions.tf +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/versions.tf @@ -3,7 +3,8 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = "~> 4.0" + version = "~> 5.0" } } } + diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/modules/instance/scripts/bacalhau-startup.service b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/modules/instance/scripts/bacalhau-startup.service index 7c8c29da..b94cbee9 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/modules/instance/scripts/bacalhau-startup.service +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/modules/instance/scripts/bacalhau-startup.service @@ -1,6 +1,6 @@ [Unit] Description=Bacalhau Startup Script -After=docker.service network-online.target +After=network-online.target Wants=network-online.target [Service] diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/modules/instance/scripts/healthz-web.service b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/modules/instance/scripts/healthz-web.service index 37ee82fe..562312ae 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/modules/instance/scripts/healthz-web.service +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/modules/instance/scripts/healthz-web.service @@ -1,6 +1,6 @@ [Unit] Description=Health Check Web Server -After=docker.service network-online.target +After=network-online.target Wants=network-online.target [Service] diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/modules/instance/scripts/install_docker.sh b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/modules/instance/scripts/install_docker.sh index bc674c9b..5ac2cede 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/modules/instance/scripts/install_docker.sh +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/modules/instance/scripts/install_docker.sh @@ -1,5 +1,8 @@ #!/usr/bin/env bash +# Exit on error +set -e + # Detect OS if [ -f /etc/os-release ]; then . /etc/os-release @@ -8,23 +11,75 @@ fi echo "Detected OS: $OS" +# Function to retry commands +retry_command() { + local n=0 + local max=5 + local delay=15 + while true; do + "$@" && break || { + if [[ $n -lt $max ]]; then + ((n++)) + echo "Command failed. Attempt $n/$max. Retrying in $delay seconds..." + sleep $delay + else + echo "The command has failed after $n attempts." + return 1 + fi + } + done +} + # Install Docker based on available package manager if command -v apt-get >/dev/null 2>&1; then # Debian/Ubuntu installation - apt-get update - apt-get install -y ca-certificates curl gnupg + echo "Using apt package manager..." 
+ + # Update package list with retry + retry_command apt-get update + + # Install prerequisites with retry + retry_command apt-get install -y \ + ca-certificates \ + curl \ + gnupg \ + pigz \ + jq \ + libltdl7 \ + libslirp0 \ + slirp4netns \ + apt-transport-https \ + software-properties-common + + # Setup Docker repository install -m 0755 -d /etc/apt/keyrings - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg + rm -f /etc/apt/keyrings/docker.gpg + retry_command curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg chmod a+r /etc/apt/keyrings/docker.gpg - echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null - apt-get update - apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin + + # Add Docker repository + echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ + $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ + tee /etc/apt/sources.list.d/docker.list > /dev/null + + # Update again with retry after adding Docker repository + retry_command apt-get update + + # Install Docker packages with retry + retry_command apt-get install -y \ + docker-ce \ + docker-ce-cli \ + containerd.io \ + docker-buildx-plugin \ + docker-compose-plugin elif command -v yum >/dev/null 2>&1; then # DNF-based systems (Amazon Linux 2023, Fedora, RHEL) - yum install docker -y + echo "Using yum package manager..." + retry_command yum install -y docker mkdir -p /usr/local/lib/docker/cli-plugins/ - curl -SL https://github.com/docker/compose/releases/download/v2.24.5/docker-compose-linux-x86_64 -o /usr/local/lib/docker/cli-plugins/docker-compose + retry_command curl -SL https://github.com/docker/compose/releases/download/v2.24.5/docker-compose-linux-x86_64 -o /usr/local/lib/docker/cli-plugins/docker-compose chmod +x /usr/local/lib/docker/cli-plugins/docker-compose else @@ -32,10 +87,27 @@ else exit 1 fi -# Start and enable Docker service -systemctl start docker -systemctl enable docker +# Start and enable Docker service with retry +echo "Starting Docker service..." +systemctl start docker || { + echo "Failed to start Docker service. Waiting 10 seconds and trying again..." + sleep 10 + systemctl start docker +} + +echo "Enabling Docker service..." +systemctl enable docker || { + echo "Failed to enable Docker service. Waiting 10 seconds and trying again..." + sleep 10 + systemctl enable docker +} # Verify installations -docker --version -docker compose version +echo "Verifying Docker installation..." 
+if command -v docker >/dev/null 2>&1; then + docker --version + docker compose version +else + echo "Docker installation verification failed" + exit 1 +fi diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/modules/instance/scripts/startup.sh b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/modules/instance/scripts/startup.sh index ca487235..5d094185 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/modules/instance/scripts/startup.sh +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/modules/instance/scripts/startup.sh @@ -37,7 +37,7 @@ get_cloud_metadata() { METADATA=$(curl -s -H "Metadata:true" "http://169.254.169.254/metadata/instance?api-version=2021-02-01") REGION=$(echo "$METADATA" | jq -r .compute.location) ZONE=$(echo "$METADATA" | jq -r .compute.zone) - PUBLIC_IP=$(curl -s -H "Metadata:true" "http://169.254.169.254/metadata/instance/network/interface/0/ipv4/ipAddress/0/publicIpAddress?api-version=2021-02-01&format=text") + PUBLIC_IP=$(curl -s ip.me) PRIVATE_IP=$(echo "$METADATA" | jq -r .network.interface[0].ipv4.ipAddress[0].privateIpAddress) INSTANCE_ID=$(echo "$METADATA" | jq -r .compute.vmId) INSTANCE_TYPE=$(echo "$METADATA" | jq -r .compute.vmSize) diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/utility_scripts/get_vm_list.py b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/utility_scripts/get_vm_list.py new file mode 100755 index 00000000..6bed2002 --- /dev/null +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/utility_scripts/get_vm_list.py @@ -0,0 +1,72 @@ +#!/usr/bin/env uv run -s +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# ] +# /// + +import json +import subprocess + +# Define the VM series to check +VM_SERIES = ["B", "A", "D", "F"] + + +def get_vm_skus(region): + # Run the Azure CLI command to get VM SKUs + result = subprocess.run( + [ + "az", + "vm", + "list-skus", + "--location", + region, + "--resource-type", + "virtualMachines", + "--output", + "json", + ], + capture_output=True, + text=True, + ) + if result.returncode != 0: + print("Error fetching VM SKUs:", result.stderr) + return [] + return json.loads(result.stdout) + + +def filter_vm_skus(skus, series): + # Filter for VM sizes in the specified series + filtered_skus = [] + for sku in skus: + if any(sku["name"].startswith(f"Standard_{s}") for s in series): + filtered_skus.append(sku) + return filtered_skus + + +def main(): + # Define the region to check + region = "eastus" + + # Get VM SKUs for the region + skus = get_vm_skus(region) + if not skus: + print("No VM SKUs found.") + return + + # Filter for VM sizes in the specified series + filtered_skus = filter_vm_skus(skus, VM_SERIES) + if not filtered_skus: + print(f"No VM sizes in series {VM_SERIES} found in {region}.") + return + + # Print the filtered VM sizes + print(f"Available VM sizes in {region}:") + for sku in filtered_skus: + print( + f"Name: {sku['name']}, Family: {sku['family']}, Locations: {sku['locationInfo'][0]['location']}" + ) + + +if __name__ == "__main__": + main() diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/utility_scripts/region_checker.py b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/utility_scripts/region_checker.py new file mode 100755 index 00000000..3766f821 --- /dev/null +++ 
b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/utility_scripts/region_checker.py @@ -0,0 +1,167 @@ +#!/usr/bin/env uv run -s +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "rich" +# ] +# /// +import argparse +import json +import subprocess +from concurrent.futures import ThreadPoolExecutor, as_completed +from datetime import datetime + +from rich.console import Console +from rich.live import Live +from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn +from rich.table import Table + +console = Console() + + +def create_status_table(regions, results=None): + """Create a table showing status for each region.""" + table = Table(title="Region VM Size Availability Check") + table.add_column("Region") + table.add_column("Display Name") + table.add_column("Status") + + results = results or {} + for region in sorted(regions, key=lambda x: x["name"]): + name = region["name"] + status = results.get(name, "⏳ Checking...") + table.add_row(name, region["displayName"], status) + return table + + +def check_region(region, vm_size): + """Check VM size availability for a single region.""" + query = f"[?name=='{vm_size}'].name" + cmd = [ + "az", + "vm", + "list-sizes", + "--location", + region["name"], + "--query", + query, + "-o", + "json", + ] + + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode == 0: + sizes = json.loads(result.stdout) + if sizes: # If the VM size is available in this region + return region["name"], "✅ Available" + return region["name"], "❌ Not Available" + + +def get_vm_availability(vm_size, debug=False): + """Get VM size availability for all regions in parallel.""" + try: + # First get all regions + query = "[].{name:name, displayName:displayName}" + cmd = [ + "az", + "account", + "list-locations", + "--query", + query, + "-o", + "json", + ] + if debug: + console.print("[yellow]Fetching regions...[/yellow]") + + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode != 0: + console.print(f"[red]Error: {result.stderr}[/red]") + return None + + regions = json.loads(result.stdout) + available_regions = [] + results = {} + + # Create and display the live table + with Live(create_status_table(regions, results), refresh_per_second=4) as live: + # Check regions in parallel + with ThreadPoolExecutor(max_workers=10) as executor: + future_to_region = { + executor.submit(check_region, region, vm_size): region + for region in regions + } + + for future in as_completed(future_to_region): + region = future_to_region[future] + try: + region_name, status = future.result() + results[region_name] = status + if status == "✅ Available": + available_regions.append(region) + live.update(create_status_table(regions, results)) + except Exception as e: + results[region["name"]] = f"❌ Error: {str(e)}" + live.update(create_status_table(regions, results)) + + return available_regions + except Exception as e: + console.print(f"[red]Error: {str(e)}[/red]") + return [] + + +def generate_locations_section(regions, vm_size): + """Generate locations section for available regions.""" + return { + region["name"]: {"machine_type": vm_size, "node_count": 1} for region in regions + } + + +def main(vm_size, debug=False): + console.print(f"[cyan]Checking availability for VM size: {vm_size}[/cyan]") + + # Get VM availability data + available_regions = get_vm_availability(vm_size, debug) + + if not available_regions: + console.print(f"\n[red]VM size {vm_size} not found in any region[/red]") + 
return + + # Generate the locations section + locations_section = generate_locations_section(available_regions, vm_size) + + # Print to console + console.print("\n[green]Available Regions Summary:[/green]") + for region in sorted(available_regions, key=lambda x: x["name"]): + console.print(f"✅ {region['name']} ({region['displayName']})") + + # Save to file + output_file = ( + f"vm_availability_{vm_size}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json" + ) + with open(output_file, "w") as f: + json.dump({"locations": locations_section}, f, indent=4) + + console.print( + f"\n[blue]Generated Locations Configuration saved to: {output_file}[/blue]" + ) + console.print(f"\n[cyan]Summary:[/cyan]") + console.print(f"Available regions: {len(available_regions)}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--vm-size", + type=str, + default="Standard_B2ms", + help="VM size to check (default: Standard_B2ms)", + ) + parser.add_argument( + "--debug", + action="store_true", + help="Print Azure CLI commands being executed", + ) + args = parser.parse_args() + + main(args.vm_size, args.debug) diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/utility_scripts/vm_availability_Standard_B2ms_20250109_134546.json b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/utility_scripts/vm_availability_Standard_B2ms_20250109_134546.json new file mode 100644 index 00000000..35658154 --- /dev/null +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/utility_scripts/vm_availability_Standard_B2ms_20250109_134546.json @@ -0,0 +1,176 @@ +{ + "locations": { + "northeurope": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "westus2": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "westeurope": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "westus3": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "australiaeast": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "eastus": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "southeastasia": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "southcentralus": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "uksouth": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "swedencentral": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "centralus": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "eastasia": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "centralindia": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "canadacentral": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "southafricanorth": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "koreacentral": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "francecentral": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "japaneast": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "germanywestcentral": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "newzealandnorth": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "italynorth": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "spaincentral": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "norwayeast": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "brazilsouth": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "switzerlandnorth": 
{ + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "mexicocentral": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "qatarcentral": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "polandcentral": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "uaenorth": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "israelcentral": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "eastus2": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "westus": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "westcentralus": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "northcentralus": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "jioindiawest": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "japanwest": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "australiacentral": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "koreasouth": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "australiasoutheast": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "westindia": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "canadaeast": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "ukwest": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "southindia": { + "machine_type": "Standard_B2ms", + "node_count": 1 + } + } +} \ No newline at end of file diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/main.tf b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/main.tf index d62468bf..00cbb525 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/main.tf +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/main.tf @@ -3,19 +3,9 @@ locals { timestamp = formatdate("YYMMDDHHmm", timestamp()) project_id = "${var.base_project_name}-${local.timestamp}" - # Function to sanitize label values - sanitize_label = replace( - lower( - replace( - replace(var.gcp_user_email, "@", "_at_"), - ".", "_" - ) - ), - "/[^a-z0-9_-]/", - "_" - ) - - sanitize_tag = replace(lower(var.app_tag), "/[^a-z0-9_-]/", "_") + # Simplified sanitize label function - just replace non-compliant chars with "-" + sanitize_label = replace(lower(var.gcp_user_email), "/[^a-z0-9-]/", "-") + sanitize_tag = replace(lower(var.app_tag), "/[^a-z0-9-]/", "-") # Define common tags with sanitized values common_tags = { @@ -24,6 +14,24 @@ locals { managed_by = "terraform" custom_tag = local.sanitize_tag } + + # Flatten the VM instances based on count per zone from locations variable + vm_instances = flatten([ + for zone_key, config in var.locations : [ + for i in range(lookup(config, "node_count", 1)) : { + zone_key = zone_key + index = i + zone = config.zone + machine_type = config.machine_type + } + ] + ]) + + # Convert to map with unique keys + vm_instances_map = { + for instance in local.vm_instances : + "${instance.zone_key}-${instance.index}" => instance + } } terraform { @@ -67,6 +75,12 @@ resource "google_project" "bacalhau_project" { } } +# Link billing account to project +resource "google_billing_project_info" "billing_info" { + project = google_project.bacalhau_project.project_id + billing_account = var.gcp_billing_account_id +} + # Enable required APIs in new project resource "google_project_service" "project_apis" { provider = google.bacalhau_cluster_project @@ -78,11 +92,25 @@ resource "google_project_service" 
"project_apis" { "billingbudgets.googleapis.com" ]) - project = google_project.bacalhau_project.project_id - service = each.value - + project = google_project.bacalhau_project.project_id + service = each.value disable_dependent_services = true disable_on_destroy = false + depends_on = [google_billing_project_info.billing_info] + + # Add timeouts to give more time for API enablement + timeouts { + create = "30m" + update = "40m" + } +} + +# Add explicit dependency on compute API +resource "time_sleep" "wait_for_apis" { + depends_on = [google_project_service.project_apis] + + # Wait for 2 minutes after enabling APIs + create_duration = "120s" } # Update the project_owner IAM binding to depend on APIs being enabled @@ -115,9 +143,17 @@ resource "google_project_iam_member" "member_role" { project = google_project.bacalhau_project.project_id } -data "cloudinit_config" "user_data" { +# Update random string to use the new map +resource "random_string" "vm_name" { + for_each = local.vm_instances_map + length = 8 + special = false + upper = false +} - for_each = var.locations +# Update the instance and cloud-init resources to use the new map +data "cloudinit_config" "user_data" { + for_each = local.vm_instances_map gzip = false base64_encode = false @@ -127,9 +163,9 @@ data "cloudinit_config" "user_data" { content_type = "text/cloud-config" content = templatefile("${path.root}/cloud-init/init-vm.yml", { - node_name : "${local.project_id}-${each.key}-vm", + node_name : "${replace(lower(each.value.zone), "/[^a-z0-9-]/", "-")}-${random_string.vm_name[each.key].result}-vm", username : var.username, - region : each.key, + region : each.value.zone_key, zone : each.value.zone, project_id : google_project.bacalhau_project.project_id, bacalhau_startup_service_file : filebase64("${path.root}/scripts/bacalhau-startup.service"), @@ -161,7 +197,7 @@ resource "google_compute_firewall" "allow_ssh_nats" { source_ranges = ["0.0.0.0/0"] target_tags = ["${local.project_id}-instance"] - depends_on = [google_project_service.project_apis] + depends_on = [time_sleep.wait_for_apis] # Wait for APIs to be fully enabled } resource "google_compute_instance" "gcp_instance" { @@ -169,9 +205,9 @@ resource "google_compute_instance" "gcp_instance" { project = google_project.bacalhau_project.project_id depends_on = [google_project_iam_member.member_role] - for_each = var.locations + for_each = local.vm_instances_map - name = "${local.project_id}-${each.key}-vm" + name = "${replace(lower(each.value.zone), "/[^a-z0-9-]/", "-")}-${random_string.vm_name[each.key].result}-vm" machine_type = each.value.machine_type zone = each.value.zone tags = ["${local.project_id}-instance"] @@ -202,33 +238,3 @@ resource "google_compute_instance" "gcp_instance" { labels = local.common_tags } - -data "http" "healthcheck" { - for_each = var.locations - - url = "http://${google_compute_instance.gcp_instance[each.key].network_interface[0].access_config[0].nat_ip}/healthz" - - retry { - attempts = 35 - min_delay_ms = 10000 # 10 seconds - max_delay_ms = 10000 # 10 seconds - } -} - -output "deployment_status" { - description = "Deployment status including health checks" - value = { - for k, v in google_compute_instance.gcp_instance : k => { - name = v.name - external_ip = v.network_interface[0].access_config[0].nat_ip - health_check = try(data.http.healthcheck[k].status_code == 200, false) ? 
"healthy" : "failed" - } - } -} - -# Add random string resource at the top level -resource "random_string" "suffix" { - length = 4 - special = false - upper = false -} diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/scripts/install_docker.sh b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/scripts/install_docker.sh index bc674c9b..c74e5d18 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/scripts/install_docker.sh +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/scripts/install_docker.sh @@ -1,5 +1,8 @@ #!/usr/bin/env bash +# Exit on error +set -e + # Detect OS if [ -f /etc/os-release ]; then . /etc/os-release @@ -8,23 +11,74 @@ fi echo "Detected OS: $OS" +# Function to retry commands +retry_command() { + local n=0 + local max=5 + local delay=15 + while true; do + "$@" && break || { + if [[ $n -lt $max ]]; then + ((n++)) + echo "Command failed. Attempt $n/$max. Retrying in $delay seconds..." + sleep $delay + else + echo "The command has failed after $n attempts." + return 1 + fi + } + done +} + # Install Docker based on available package manager if command -v apt-get >/dev/null 2>&1; then # Debian/Ubuntu installation - apt-get update - apt-get install -y ca-certificates curl gnupg + echo "Using apt package manager..." + + # Update package list with retry + retry_command apt-get update + + # Install prerequisites with retry + retry_command apt-get install -y \ + ca-certificates \ + curl \ + gnupg \ + pigz \ + libltdl7 \ + libslirp0 \ + slirp4netns \ + apt-transport-https \ + software-properties-common + + # Setup Docker repository install -m 0755 -d /etc/apt/keyrings - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg + rm -f /etc/apt/keyrings/docker.gpg + retry_command curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg chmod a+r /etc/apt/keyrings/docker.gpg - echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null - apt-get update - apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin + + # Add Docker repository + echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ + $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ + tee /etc/apt/sources.list.d/docker.list > /dev/null + + # Update again with retry after adding Docker repository + retry_command apt-get update + + # Install Docker packages with retry + retry_command apt-get install -y \ + docker-ce \ + docker-ce-cli \ + containerd.io \ + docker-buildx-plugin \ + docker-compose-plugin elif command -v yum >/dev/null 2>&1; then # DNF-based systems (Amazon Linux 2023, Fedora, RHEL) - yum install docker -y + echo "Using yum package manager..." 
+ retry_command yum install -y docker mkdir -p /usr/local/lib/docker/cli-plugins/ - curl -SL https://github.com/docker/compose/releases/download/v2.24.5/docker-compose-linux-x86_64 -o /usr/local/lib/docker/cli-plugins/docker-compose + retry_command curl -SL https://github.com/docker/compose/releases/download/v2.24.5/docker-compose-linux-x86_64 -o /usr/local/lib/docker/cli-plugins/docker-compose chmod +x /usr/local/lib/docker/cli-plugins/docker-compose else @@ -32,10 +86,27 @@ else exit 1 fi -# Start and enable Docker service -systemctl start docker -systemctl enable docker +# Start and enable Docker service with retry +echo "Starting Docker service..." +systemctl start docker || { + echo "Failed to start Docker service. Waiting 10 seconds and trying again..." + sleep 10 + systemctl start docker +} + +echo "Enabling Docker service..." +systemctl enable docker || { + echo "Failed to enable Docker service. Waiting 10 seconds and trying again..." + sleep 10 + systemctl enable docker +} # Verify installations -docker --version -docker compose version +echo "Verifying Docker installation..." +if command -v docker >/dev/null 2>&1; then + docker --version + docker compose version +else + echo "Docker installation verification failed" + exit 1 +fi diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/scripts/startup.sh b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/scripts/startup.sh index ca487235..5d094185 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/scripts/startup.sh +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/scripts/startup.sh @@ -37,7 +37,7 @@ get_cloud_metadata() { METADATA=$(curl -s -H "Metadata:true" "http://169.254.169.254/metadata/instance?api-version=2021-02-01") REGION=$(echo "$METADATA" | jq -r .compute.location) ZONE=$(echo "$METADATA" | jq -r .compute.zone) - PUBLIC_IP=$(curl -s -H "Metadata:true" "http://169.254.169.254/metadata/instance/network/interface/0/ipv4/ipAddress/0/publicIpAddress?api-version=2021-02-01&format=text") + PUBLIC_IP=$(curl -s ip.me) PRIVATE_IP=$(echo "$METADATA" | jq -r .network.interface[0].ipv4.ipAddress[0].privateIpAddress) INSTANCE_ID=$(echo "$METADATA" | jq -r .compute.vmId) INSTANCE_TYPE=$(echo "$METADATA" | jq -r .compute.vmSize) diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/utility_scripts/all_locations.json b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/utility_scripts/all_locations.json new file mode 100644 index 00000000..ac448a83 --- /dev/null +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/utility_scripts/all_locations.json @@ -0,0 +1,624 @@ +{ + "locations": { + "us_east1_b": { + "zone": "us-east1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_east1_c": { + "zone": "us-east1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_east1_d": { + "zone": "us-east1-d", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_east4_c": { + "zone": "us-east4-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_east4_b": { + "zone": "us-east4-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_east4_a": { + "zone": "us-east4-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_central1_c": { + "zone": "us-central1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_central1_a": { + "zone": "us-central1-a", + "machine_type": "e2-standard-4", + 
"node_count": 1 + }, + "us_central1_f": { + "zone": "us-central1-f", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_central1_b": { + "zone": "us-central1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_west1_b": { + "zone": "us-west1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_west1_c": { + "zone": "us-west1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_west1_a": { + "zone": "us-west1-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west4_a": { + "zone": "europe-west4-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west4_b": { + "zone": "europe-west4-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west4_c": { + "zone": "europe-west4-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west1_b": { + "zone": "europe-west1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west1_d": { + "zone": "europe-west1-d", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west1_c": { + "zone": "europe-west1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west3_c": { + "zone": "europe-west3-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west3_a": { + "zone": "europe-west3-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west3_b": { + "zone": "europe-west3-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west2_c": { + "zone": "europe-west2-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west2_b": { + "zone": "europe-west2-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west2_a": { + "zone": "europe-west2-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_east1_b": { + "zone": "asia-east1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_east1_a": { + "zone": "asia-east1-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_east1_c": { + "zone": "asia-east1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_southeast1_b": { + "zone": "asia-southeast1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_southeast1_a": { + "zone": "asia-southeast1-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_southeast1_c": { + "zone": "asia-southeast1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_northeast1_b": { + "zone": "asia-northeast1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_northeast1_c": { + "zone": "asia-northeast1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_northeast1_a": { + "zone": "asia-northeast1-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_south1_c": { + "zone": "asia-south1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_south1_b": { + "zone": "asia-south1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_south1_a": { + "zone": "asia-south1-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "australia_southeast1_b": { + "zone": "australia-southeast1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "australia_southeast1_c": { + "zone": "australia-southeast1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "australia_southeast1_a": { + "zone": "australia-southeast1-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "southamerica_east1_b": { + "zone": 
"southamerica-east1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "southamerica_east1_c": { + "zone": "southamerica-east1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "southamerica_east1_a": { + "zone": "southamerica-east1-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "africa_south1_a": { + "zone": "africa-south1-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "africa_south1_b": { + "zone": "africa-south1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "africa_south1_c": { + "zone": "africa-south1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_east2_a": { + "zone": "asia-east2-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_east2_b": { + "zone": "asia-east2-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_east2_c": { + "zone": "asia-east2-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_northeast2_a": { + "zone": "asia-northeast2-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_northeast2_b": { + "zone": "asia-northeast2-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_northeast2_c": { + "zone": "asia-northeast2-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_northeast3_a": { + "zone": "asia-northeast3-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_northeast3_b": { + "zone": "asia-northeast3-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_northeast3_c": { + "zone": "asia-northeast3-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_south2_a": { + "zone": "asia-south2-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_south2_b": { + "zone": "asia-south2-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_south2_c": { + "zone": "asia-south2-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_southeast2_a": { + "zone": "asia-southeast2-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_southeast2_b": { + "zone": "asia-southeast2-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_southeast2_c": { + "zone": "asia-southeast2-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "australia_southeast2_a": { + "zone": "australia-southeast2-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "australia_southeast2_b": { + "zone": "australia-southeast2-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "australia_southeast2_c": { + "zone": "australia-southeast2-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_central2_a": { + "zone": "europe-central2-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_central2_b": { + "zone": "europe-central2-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_central2_c": { + "zone": "europe-central2-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_north1_a": { + "zone": "europe-north1-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_north1_b": { + "zone": "europe-north1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_north1_c": { + "zone": "europe-north1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_southwest1_a": { + "zone": "europe-southwest1-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_southwest1_b": { + "zone": "europe-southwest1-b", + "machine_type": 
"e2-standard-4", + "node_count": 1 + }, + "europe_southwest1_c": { + "zone": "europe-southwest1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west10_a": { + "zone": "europe-west10-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west10_b": { + "zone": "europe-west10-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west10_c": { + "zone": "europe-west10-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west12_a": { + "zone": "europe-west12-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west12_b": { + "zone": "europe-west12-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west12_c": { + "zone": "europe-west12-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west6_a": { + "zone": "europe-west6-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west6_b": { + "zone": "europe-west6-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west6_c": { + "zone": "europe-west6-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west8_a": { + "zone": "europe-west8-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west8_b": { + "zone": "europe-west8-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west8_c": { + "zone": "europe-west8-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west9_a": { + "zone": "europe-west9-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west9_b": { + "zone": "europe-west9-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west9_c": { + "zone": "europe-west9-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "me_central1_a": { + "zone": "me-central1-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "me_central1_b": { + "zone": "me-central1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "me_central1_c": { + "zone": "me-central1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "me_central2_a": { + "zone": "me-central2-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "me_central2_b": { + "zone": "me-central2-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "me_central2_c": { + "zone": "me-central2-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "me_west1_a": { + "zone": "me-west1-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "me_west1_b": { + "zone": "me-west1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "me_west1_c": { + "zone": "me-west1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "northamerica_northeast1_a": { + "zone": "northamerica-northeast1-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "northamerica_northeast1_b": { + "zone": "northamerica-northeast1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "northamerica_northeast1_c": { + "zone": "northamerica-northeast1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "northamerica_northeast2_a": { + "zone": "northamerica-northeast2-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "northamerica_northeast2_b": { + "zone": "northamerica-northeast2-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "northamerica_northeast2_c": { + "zone": "northamerica-northeast2-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "northamerica_south1_a": { + "zone": 
"northamerica-south1-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "northamerica_south1_b": { + "zone": "northamerica-south1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "northamerica_south1_c": { + "zone": "northamerica-south1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "southamerica_west1_a": { + "zone": "southamerica-west1-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "southamerica_west1_b": { + "zone": "southamerica-west1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "southamerica_west1_c": { + "zone": "southamerica-west1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_east5_a": { + "zone": "us-east5-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_east5_b": { + "zone": "us-east5-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_east5_c": { + "zone": "us-east5-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_south1_a": { + "zone": "us-south1-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_south1_b": { + "zone": "us-south1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_south1_c": { + "zone": "us-south1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_west2_a": { + "zone": "us-west2-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_west2_b": { + "zone": "us-west2-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_west2_c": { + "zone": "us-west2-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_west3_a": { + "zone": "us-west3-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_west3_b": { + "zone": "us-west3-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_west3_c": { + "zone": "us-west3-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_west4_a": { + "zone": "us-west4-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_west4_b": { + "zone": "us-west4-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_west4_c": { + "zone": "us-west4-c", + "machine_type": "e2-standard-4", + "node_count": 1 + } + } +} \ No newline at end of file diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/utility_scripts/all_locations.yaml b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/utility_scripts/all_locations.yaml new file mode 100644 index 00000000..d9e14856 --- /dev/null +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/utility_scripts/all_locations.yaml @@ -0,0 +1,497 @@ +locations: + africa_south1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: africa-south1-a + africa_south1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: africa-south1-b + africa_south1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: africa-south1-c + asia_east1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-east1-a + asia_east1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-east1-b + asia_east1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-east1-c + asia_east2_a: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-east2-a + asia_east2_b: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-east2-b + asia_east2_c: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-east2-c + asia_northeast1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-northeast1-a + asia_northeast1_b: + machine_type: 
e2-standard-4 + node_count: 1 + zone: asia-northeast1-b + asia_northeast1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-northeast1-c + asia_northeast2_a: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-northeast2-a + asia_northeast2_b: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-northeast2-b + asia_northeast2_c: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-northeast2-c + asia_northeast3_a: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-northeast3-a + asia_northeast3_b: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-northeast3-b + asia_northeast3_c: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-northeast3-c + asia_south1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-south1-a + asia_south1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-south1-b + asia_south1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-south1-c + asia_south2_a: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-south2-a + asia_south2_b: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-south2-b + asia_south2_c: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-south2-c + asia_southeast1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-southeast1-a + asia_southeast1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-southeast1-b + asia_southeast1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-southeast1-c + asia_southeast2_a: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-southeast2-a + asia_southeast2_b: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-southeast2-b + asia_southeast2_c: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-southeast2-c + australia_southeast1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: australia-southeast1-a + australia_southeast1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: australia-southeast1-b + australia_southeast1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: australia-southeast1-c + australia_southeast2_a: + machine_type: e2-standard-4 + node_count: 1 + zone: australia-southeast2-a + australia_southeast2_b: + machine_type: e2-standard-4 + node_count: 1 + zone: australia-southeast2-b + australia_southeast2_c: + machine_type: e2-standard-4 + node_count: 1 + zone: australia-southeast2-c + europe_central2_a: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-central2-a + europe_central2_b: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-central2-b + europe_central2_c: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-central2-c + europe_north1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-north1-a + europe_north1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-north1-b + europe_north1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-north1-c + europe_southwest1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-southwest1-a + europe_southwest1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-southwest1-b + europe_southwest1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-southwest1-c + europe_west10_a: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west10-a + europe_west10_b: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west10-b + europe_west10_c: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west10-c + europe_west12_a: + machine_type: e2-standard-4 + 
node_count: 1 + zone: europe-west12-a + europe_west12_b: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west12-b + europe_west12_c: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west12-c + europe_west1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west1-b + europe_west1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west1-c + europe_west1_d: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west1-d + europe_west2_a: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west2-a + europe_west2_b: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west2-b + europe_west2_c: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west2-c + europe_west3_a: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west3-a + europe_west3_b: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west3-b + europe_west3_c: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west3-c + europe_west4_a: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west4-a + europe_west4_b: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west4-b + europe_west4_c: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west4-c + europe_west6_a: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west6-a + europe_west6_b: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west6-b + europe_west6_c: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west6-c + europe_west8_a: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west8-a + europe_west8_b: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west8-b + europe_west8_c: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west8-c + europe_west9_a: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west9-a + europe_west9_b: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west9-b + europe_west9_c: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west9-c + me_central1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: me-central1-a + me_central1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: me-central1-b + me_central1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: me-central1-c + me_central2_a: + machine_type: e2-standard-4 + node_count: 1 + zone: me-central2-a + me_central2_b: + machine_type: e2-standard-4 + node_count: 1 + zone: me-central2-b + me_central2_c: + machine_type: e2-standard-4 + node_count: 1 + zone: me-central2-c + me_west1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: me-west1-a + me_west1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: me-west1-b + me_west1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: me-west1-c + northamerica_northeast1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: northamerica-northeast1-a + northamerica_northeast1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: northamerica-northeast1-b + northamerica_northeast1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: northamerica-northeast1-c + northamerica_northeast2_a: + machine_type: e2-standard-4 + node_count: 1 + zone: northamerica-northeast2-a + northamerica_northeast2_b: + machine_type: e2-standard-4 + node_count: 1 + zone: northamerica-northeast2-b + northamerica_northeast2_c: + machine_type: e2-standard-4 + node_count: 1 + zone: northamerica-northeast2-c + northamerica_south1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: northamerica-south1-a + 
northamerica_south1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: northamerica-south1-b + northamerica_south1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: northamerica-south1-c + southamerica_east1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: southamerica-east1-a + southamerica_east1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: southamerica-east1-b + southamerica_east1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: southamerica-east1-c + southamerica_west1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: southamerica-west1-a + southamerica_west1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: southamerica-west1-b + southamerica_west1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: southamerica-west1-c + us_central1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: us-central1-a + us_central1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: us-central1-b + us_central1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: us-central1-c + us_central1_f: + machine_type: e2-standard-4 + node_count: 1 + zone: us-central1-f + us_east1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: us-east1-b + us_east1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: us-east1-c + us_east1_d: + machine_type: e2-standard-4 + node_count: 1 + zone: us-east1-d + us_east4_a: + machine_type: e2-standard-4 + node_count: 1 + zone: us-east4-a + us_east4_b: + machine_type: e2-standard-4 + node_count: 1 + zone: us-east4-b + us_east4_c: + machine_type: e2-standard-4 + node_count: 1 + zone: us-east4-c + us_east5_a: + machine_type: e2-standard-4 + node_count: 1 + zone: us-east5-a + us_east5_b: + machine_type: e2-standard-4 + node_count: 1 + zone: us-east5-b + us_east5_c: + machine_type: e2-standard-4 + node_count: 1 + zone: us-east5-c + us_south1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: us-south1-a + us_south1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: us-south1-b + us_south1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: us-south1-c + us_west1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: us-west1-a + us_west1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: us-west1-b + us_west1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: us-west1-c + us_west2_a: + machine_type: e2-standard-4 + node_count: 1 + zone: us-west2-a + us_west2_b: + machine_type: e2-standard-4 + node_count: 1 + zone: us-west2-b + us_west2_c: + machine_type: e2-standard-4 + node_count: 1 + zone: us-west2-c + us_west3_a: + machine_type: e2-standard-4 + node_count: 1 + zone: us-west3-a + us_west3_b: + machine_type: e2-standard-4 + node_count: 1 + zone: us-west3-b + us_west3_c: + machine_type: e2-standard-4 + node_count: 1 + zone: us-west3-c + us_west4_a: + machine_type: e2-standard-4 + node_count: 1 + zone: us-west4-a + us_west4_b: + machine_type: e2-standard-4 + node_count: 1 + zone: us-west4-b + us_west4_c: + machine_type: e2-standard-4 + node_count: 1 + zone: us-west4-c diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/utility_scripts/generate_all_locations_file.py b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/utility_scripts/generate_all_locations_file.py new file mode 100755 index 00000000..dc24ace8 --- /dev/null +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/utility_scripts/generate_all_locations_file.py @@ -0,0 +1,185 @@ +#!/usr/bin/env uv run -s +# /// script +# requires-python = 
">=3.10" +# dependencies = [ +# "google-cloud-compute", +# "pyyaml", +# "google-auth", +# ] +# /// + +import json +import os +import subprocess +import sys +from typing import Optional + +import yaml +from google.api_core import exceptions +from google.auth import default +from google.auth.exceptions import DefaultCredentialsError +from google.cloud import compute_v1 + + +def ensure_gcp_auth() -> Optional[str]: + """Ensure GCP authentication and return project ID.""" + try: + # Try to get credentials and project ID + credentials, project_id = default() + return project_id + except DefaultCredentialsError: + print( + "GCP credentials not found. Please authenticate using one of these methods:" + ) + print("1. Run: gcloud auth application-default login") + print("2. Set GOOGLE_APPLICATION_CREDENTIALS environment variable") + sys.exit(1) + + +def get_project_id() -> str: + """Get the GCP project ID.""" + # First try environment variable + project_id = os.getenv("GOOGLE_CLOUD_PROJECT") + if project_id: + return project_id + + # Then try gcloud config + try: + result = subprocess.run( + ["gcloud", "config", "get-value", "project"], + capture_output=True, + text=True, + check=True, + ) + project_id = result.stdout.strip() + if project_id and project_id != "(unset)": + return project_id + except (subprocess.CalledProcessError, FileNotFoundError): + pass + + # Finally, try to get it from application default credentials + project_id = ensure_gcp_auth() + if project_id: + return project_id + + print("Error: Could not determine GCP project ID. Please either:") + print("1. Set GOOGLE_CLOUD_PROJECT environment variable") + print("2. Run: gcloud config set project YOUR_PROJECT_ID") + print("3. Use application default credentials with a project") + sys.exit(1) + + +def check_zone_access( + client: compute_v1.ZonesClient, project_id: str, zone_name: str +) -> bool: + """Check if we have access to a specific zone.""" + try: + request = compute_v1.GetZoneRequest(project=project_id, zone=zone_name) + zone = client.get(request=request) + # Check if the zone is actually available for use + return zone.status == "UP" and "DEPRECATED" not in zone.deprecated + except exceptions.PermissionDenied: + print(f"⚠️ No permission to access zone {zone_name}") + return False + except exceptions.Forbidden: + print(f"⚠️ Access forbidden to zone {zone_name}") + return False + except Exception as e: + print(f"⚠️ Error checking zone {zone_name}: {str(e)}") + return False + + +def get_all_zones(project_id: str): + """Query GCP to get all available zones.""" + client = compute_v1.ZonesClient() + + try: + zones = [] + request = compute_v1.ListZonesRequest(project=project_id) + + print("Fetching available zones...") + for zone in client.list(request=request): + # Only include UP zones that aren't deprecated + if zone.status == "UP" and not zone.deprecated: + region = zone.name.rsplit("-", 1)[0] + zones.append( + { + "region": region, + "zone": zone.name, + } + ) + print(f"✓ Found zone: {zone.name}") + + if not zones: + print("\nNo available zones found. This could mean:") + print("1. You don't have the required permissions") + print("2. The Compute Engine API isn't enabled") + print("3. Your project isn't properly set up for Compute Engine") + print("\nTry running: gcloud services enable compute.googleapis.com") + sys.exit(1) + + return zones + except Exception as e: + print(f"Error fetching zones: {str(e)}") + print( + "Please ensure you have the necessary permissions and the Compute Engine API is enabled." 
+ ) + print( + "You can enable it by running: gcloud services enable compute.googleapis.com" + ) + sys.exit(1) + + +def generate_locations_config(zones): + """Generate the locations configuration.""" + locations = {} + + for zone_info in zones: + zone = zone_info["zone"] + + # Use zone as the key instead of region + zone_key = zone.replace( + "-", "_" + ) # Replace hyphens with underscores for valid keys + locations[zone_key] = { + "zone": zone, + "machine_type": "e2-standard-4", + "node_count": 1, + } + + return locations + + +def main(): + # Ensure authentication and get project ID + project_id = get_project_id() + print(f"Using GCP project: {project_id}") + + # Get all zones + print("Fetching zones from GCP...") + zones = get_all_zones(project_id) + + if not zones: + print( + "No zones found. Please check your permissions and project configuration." + ) + sys.exit(1) + + # Generate the locations configuration + locations = generate_locations_config(zones) + + # Save as YAML + with open("all_locations.yaml", "w") as yaml_file: + yaml.dump({"locations": locations}, yaml_file, default_flow_style=False) + + # Save as JSON (for env.json format) + with open("all_locations.json", "w") as json_file: + json.dump({"locations": locations}, json_file, indent=2) + + print(f"\nGenerated configurations with {len(locations)} zones:") + for zone_key in sorted(locations.keys()): + print(f" - {locations[zone_key]['zone']}") + + +if __name__ == "__main__": + main() diff --git a/systems-engineering/duckdb-log-processing/terraform/clean_up_nodes.py b/systems-engineering/duckdb-log-processing/terraform/clean_up_nodes.py deleted file mode 100644 index 1f3d87bd..00000000 --- a/systems-engineering/duckdb-log-processing/terraform/clean_up_nodes.py +++ /dev/null @@ -1,132 +0,0 @@ -import argparse -import json -import os -import subprocess -import sys - -import yaml - - -def load_config(config_path): - """Load configuration from YAML file.""" - try: - with open(config_path, 'r') as f: - config = yaml.safe_load(f) - - # Extract required values - compute = config.get('Compute', {}) - if not compute: - raise ValueError("No 'Compute' section in config") - - orchestrators = compute.get('Orchestrators', []) - if not orchestrators: - raise ValueError("No 'Orchestrators' specified in config") - - orchestrator = orchestrators[0] # Use first orchestrator - token = compute.get('Auth', {}).get('Token') - if not token: - raise ValueError("No 'Auth.Token' specified in config") - - # Extract hostname without nats:// prefix and port - if orchestrator.startswith('nats://'): - orchestrator = orchestrator[7:] - orchestrator = orchestrator.split(':')[0] # Remove port number - - return { - 'api_host': orchestrator, - 'token': token - } - except Exception as e: - print(f"Error loading config file: {e}") - sys.exit(1) - -def get_nodes(api_host, token): - """Get list of all Bacalhau nodes.""" - try: - cmd = [ - "bacalhau", - "node", - "list", - "--output", "json", - "-c", f"API.Host={api_host}" - ] - - result = subprocess.run( - cmd, - capture_output=True, - text=True, - check=True, - ) - return json.loads(result.stdout) - except subprocess.CalledProcessError as e: - print(f"Error running bacalhau node list: {e}") - print(f"stdout: {e.stdout}") - print(f"stderr: {e.stderr}") - sys.exit(1) - except json.JSONDecodeError as e: - print(f"Error parsing JSON output: {e}") - sys.exit(1) - -def delete_node(node_id, api_host, token): - """Delete a specific node by ID.""" - try: - cmd = [ - "bacalhau", - "node", - "delete", - node_id, - "-c", 
f"API.Host={api_host}" - ] - - subprocess.run(cmd, check=True) - print(f"Successfully deleted node: {node_id}") - return True - except subprocess.CalledProcessError as e: - print(f"Failed to delete node {node_id}. Error: {e}") - return False - -def main(): - parser = argparse.ArgumentParser(description="Delete disconnected Bacalhau nodes") - parser.add_argument('config', help='Path to config file') - parser.add_argument('--dry-run', action='store_true', help='Show what would be deleted without actually deleting') - args = parser.parse_args() - - # Load configuration - config = load_config(args.config) - - print(f"\nConnecting to API host: {config['api_host']}") - - # Get all nodes - nodes = get_nodes(config['api_host'], config['token']) - - # Filter disconnected compute nodes - disconnected_nodes = [ - node for node in nodes - if ( - node["Connection"] == "DISCONNECTED" and - node["Info"]["NodeType"] == "Compute" - ) - ] - - if not disconnected_nodes: - print("No disconnected nodes found.") - return - - print(f"\nFound {len(disconnected_nodes)} disconnected node(s):") - for node in disconnected_nodes: - print(f" - {node['Info']['NodeID']}") - - if args.dry_run: - print("\nDry run - no nodes were deleted") - return - - print("\nDeleting nodes...") - deleted_count = 0 - for node in disconnected_nodes: - if delete_node(node['Info']['NodeID'], config['api_host'], config['token']): - deleted_count += 1 - - print(f"\nDeleted {deleted_count} of {len(disconnected_nodes)} disconnected nodes") - -if __name__ == "__main__": - main() \ No newline at end of file