From 87c7a74d416263f7f3dc0341144ececa205e4b23 Mon Sep 17 00:00:00 2001 From: David Aronchick Date: Mon, 13 Jan 2025 16:51:57 -0800 Subject: [PATCH] updating spot, adding generator files, logging test now working --- .gitignore | 3 + .../.cspell/custom-dictionary.txt | 22 + .../bigquery-with-bacalhau/.gitignore | 40 + .../bigquery-with-bacalhau/LICENSE | 21 + .../bigquery-with-bacalhau/README.md | 411 +++ .../check_permissions.sh | 22 + .../config.yaml.example | 13 + .../duckdb_query_job.yaml | 28 + .../process_container/process.py | 326 ++ .../start-logging-container.yaml | 30 + .../utility_scripts/confirm_table.sh | 71 + .../utility_scripts/distribute_credentials.sh | 75 + .../utility_scripts/list_columns.sh | 16 + .../utility_scripts/setup.py | 481 +++ .../setup_aggregation_tables.sh | 50 + .../utility_scripts/setup_log_uploader.sh | 50 + .../utility_scripts/test_bigquery.py | 147 + .../prep_data/download_data_job.yaml | 7 +- .../prep_data/run_download_jobs.sh | 2 +- .../window_query_complex.sql | 26 +- scale-tester/.envrc | 4 - .../aws_spot/.cspell/custom-dictionary.txt | 3 + scale-tester/aws_spot/.gitignore | 47 + scale-tester/aws_spot/README.md | 190 +- .../files/docker/compose.yml} | 0 .../aws_spot/{ => ami/packer}/main.pkr.hcl | 0 .../{ => ami/packer}/variables.pkr.hcl | 0 .../{build-ami.sh => ami/scripts/build.sh} | 0 .../aws_spot/{ => ami/scripts}/setup.sh | 0 scale-tester/aws_spot/aws/config/env.sh | 47 + scale-tester/aws_spot/aws/keys/README.md | 29 + .../aws_spot/{ => aws/scripts}/setup-iam.sh | 0 .../{ => aws/scripts}/upload-to-ssm.sh | 0 .../aws_spot/files/bacalhau-startup.service | 14 - .../aws_spot/files/orchestrator-config.yaml | 17 - scale-tester/aws_spot/fleet/bin/spot-manager | 21 + .../aws_spot/fleet/examples/pusher/README.md | 44 + .../examples/pusher}/env_writer.yaml | 0 .../examples/pusher}/pusher-job.yaml | 0 .../fleet/examples/pusher/pusher_env.txt.b64 | 1 + .../aws_spot/{ => fleet}/scripts/startup.sh | 0 .../aws_spot/fleet/src/spot_manager.py | 2752 +++++++++++++++++ scale-tester/aws_spot/plan.md | 27 + scale-tester/aws_spot/pyproject.toml | 27 + scale-tester/aws_spot/requirements.txt | 5 + scale-tester/aws_spot/spot-instances.sh | 332 -- .../aws_spot/spot/config/aws-spot-env.sh | 42 + .../spot/config/aws-spot-env.sh.example | 32 + .../clean_up_nodes.py | 125 - .../.cspell/custom-dictionary.txt | 1 + .../all_locations.yaml | 176 -- .../deploy.py | 173 +- .../deploy_single.py | 642 ++++ .../locations.yaml | 68 +- .../locations/all_locations.json | 821 +++++ .../locations/all_locations.yaml | 546 ++++ .../main.tf | 10 +- .../instance/scripts/install_docker.sh | 97 +- .../modules/instance/scripts/startup.sh | 2 +- .../modules/instance/versions.tf | 2 +- .../modules/network/versions.tf | 4 +- .../modules/region/main.tf | 11 +- .../modules/region/versions.tf | 2 +- .../modules/securityGroup/versions.tf | 4 +- .../generate-all-locations-file.py | 119 + .../utility_scripts/generate-locations.sh | 54 - .../versions.tf | 3 +- .../instance/scripts/bacalhau-startup.service | 2 +- .../instance/scripts/healthz-web.service | 2 +- .../instance/scripts/install_docker.sh | 98 +- .../modules/instance/scripts/startup.sh | 2 +- .../utility_scripts/get_vm_list.py | 72 + .../utility_scripts/region_checker.py | 167 + ...ability_Standard_B2ms_20250109_134546.json | 176 ++ .../main.tf | 112 +- .../scripts/install_docker.sh | 97 +- .../scripts/startup.sh | 2 +- .../utility_scripts/all_locations.json | 624 ++++ .../utility_scripts/all_locations.yaml | 497 +++ 
.../generate_all_locations_file.py | 185 ++ .../terraform/clean_up_nodes.py | 132 - 81 files changed, 9439 insertions(+), 1064 deletions(-) create mode 100644 data-engineering/bigquery-with-bacalhau/.cspell/custom-dictionary.txt create mode 100644 data-engineering/bigquery-with-bacalhau/.gitignore create mode 100644 data-engineering/bigquery-with-bacalhau/LICENSE create mode 100644 data-engineering/bigquery-with-bacalhau/README.md create mode 100644 data-engineering/bigquery-with-bacalhau/check_permissions.sh create mode 100644 data-engineering/bigquery-with-bacalhau/config.yaml.example create mode 100644 data-engineering/bigquery-with-bacalhau/duckdb_query_job.yaml create mode 100644 data-engineering/bigquery-with-bacalhau/process_container/process.py create mode 100644 data-engineering/bigquery-with-bacalhau/start-logging-container.yaml create mode 100755 data-engineering/bigquery-with-bacalhau/utility_scripts/confirm_table.sh create mode 100755 data-engineering/bigquery-with-bacalhau/utility_scripts/distribute_credentials.sh create mode 100755 data-engineering/bigquery-with-bacalhau/utility_scripts/list_columns.sh create mode 100755 data-engineering/bigquery-with-bacalhau/utility_scripts/setup.py create mode 100755 data-engineering/bigquery-with-bacalhau/utility_scripts/setup_aggregation_tables.sh create mode 100755 data-engineering/bigquery-with-bacalhau/utility_scripts/setup_log_uploader.sh create mode 100755 data-engineering/bigquery-with-bacalhau/utility_scripts/test_bigquery.py create mode 100644 scale-tester/aws_spot/.gitignore rename scale-tester/aws_spot/{files/docker-compose.yml => ami/files/docker/compose.yml} (100%) rename scale-tester/aws_spot/{ => ami/packer}/main.pkr.hcl (100%) rename scale-tester/aws_spot/{ => ami/packer}/variables.pkr.hcl (100%) rename scale-tester/aws_spot/{build-ami.sh => ami/scripts/build.sh} (100%) rename scale-tester/aws_spot/{ => ami/scripts}/setup.sh (100%) create mode 100644 scale-tester/aws_spot/aws/config/env.sh create mode 100644 scale-tester/aws_spot/aws/keys/README.md rename scale-tester/aws_spot/{ => aws/scripts}/setup-iam.sh (100%) rename scale-tester/aws_spot/{ => aws/scripts}/upload-to-ssm.sh (100%) delete mode 100644 scale-tester/aws_spot/files/bacalhau-startup.service delete mode 100644 scale-tester/aws_spot/files/orchestrator-config.yaml create mode 100755 scale-tester/aws_spot/fleet/bin/spot-manager create mode 100644 scale-tester/aws_spot/fleet/examples/pusher/README.md rename scale-tester/aws_spot/{ => fleet/examples/pusher}/env_writer.yaml (100%) rename scale-tester/aws_spot/{ => fleet/examples/pusher}/pusher-job.yaml (100%) create mode 100644 scale-tester/aws_spot/fleet/examples/pusher/pusher_env.txt.b64 rename scale-tester/aws_spot/{ => fleet}/scripts/startup.sh (100%) create mode 100755 scale-tester/aws_spot/fleet/src/spot_manager.py create mode 100644 scale-tester/aws_spot/plan.md create mode 100644 scale-tester/aws_spot/pyproject.toml create mode 100644 scale-tester/aws_spot/requirements.txt delete mode 100755 scale-tester/aws_spot/spot-instances.sh create mode 100644 scale-tester/aws_spot/spot/config/aws-spot-env.sh create mode 100644 scale-tester/aws_spot/spot/config/aws-spot-env.sh.example delete mode 100644 scale-tester/bacalhau-dind-compute-node/clean_up_nodes.py delete mode 100644 setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/all_locations.yaml create mode 100755 setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/deploy_single.py create mode 100644 
setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/locations/all_locations.json create mode 100644 setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/locations/all_locations.yaml create mode 100755 setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/utility_scripts/generate-all-locations-file.py delete mode 100755 setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/utility_scripts/generate-locations.sh create mode 100755 setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/utility_scripts/get_vm_list.py create mode 100755 setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/utility_scripts/region_checker.py create mode 100644 setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/utility_scripts/vm_availability_Standard_B2ms_20250109_134546.json create mode 100644 setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/utility_scripts/all_locations.json create mode 100644 setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/utility_scripts/all_locations.yaml create mode 100755 setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/utility_scripts/generate_all_locations_file.py delete mode 100644 systems-engineering/duckdb-log-processing/terraform/clean_up_nodes.py diff --git a/.gitignore b/.gitignore index f9af9892..72896b47 100644 --- a/.gitignore +++ b/.gitignore @@ -28,6 +28,9 @@ ansible.cfg *.pem .env.json +**/MACHINES.json + + .terraform/ terraform.tfstate.d/ .terraform.lock.hcl diff --git a/data-engineering/bigquery-with-bacalhau/.cspell/custom-dictionary.txt b/data-engineering/bigquery-with-bacalhau/.cspell/custom-dictionary.txt new file mode 100644 index 00000000..90122436 --- /dev/null +++ b/data-engineering/bigquery-with-bacalhau/.cspell/custom-dictionary.txt @@ -0,0 +1,22 @@ +Bacalhau +bacalhauproject +buildx +cloudresourcemanager +creds +datacatering +duckdb +DUCKDB +INPUTFILE +INSTANCEID +IRUSR +IWUSR +listdir +makedirs +mwendler +natsorted +noninteractive +resourcemanager +tonistiigi +tpep +tripdata +TRUNC diff --git a/data-engineering/bigquery-with-bacalhau/.gitignore b/data-engineering/bigquery-with-bacalhau/.gitignore new file mode 100644 index 00000000..a504fc80 --- /dev/null +++ b/data-engineering/bigquery-with-bacalhau/.gitignore @@ -0,0 +1,40 @@ +# Credentials and configuration +.env +config.yaml +credentials.json +log_uploader_credentials.json +bg_reader_credentials.json +bq_reader_credentials.json +**/MACHINES.json + +# Logs +aperitivo_logs.log.* + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# IDE +.idea/ +.vscode/ +*.swp +*.swo \ No newline at end of file diff --git a/data-engineering/bigquery-with-bacalhau/LICENSE b/data-engineering/bigquery-with-bacalhau/LICENSE new file mode 100644 index 00000000..58275188 --- /dev/null +++ b/data-engineering/bigquery-with-bacalhau/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Bacalhau Project Contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is 
+furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/data-engineering/bigquery-with-bacalhau/README.md b/data-engineering/bigquery-with-bacalhau/README.md new file mode 100644 index 00000000..c9291bf7 --- /dev/null +++ b/data-engineering/bigquery-with-bacalhau/README.md @@ -0,0 +1,411 @@ +# Data Engineering with Bacalhau + +This repository demonstrates how to use Bacalhau for data engineering tasks, combining DuckDB for data processing and BigQuery for data storage. + +## Components + +1. **DuckDB Processing**: Process and analyze data using DuckDB's SQL capabilities +2. **BigQuery Integration**: Store processed results in Google BigQuery for further analysis + +## Prerequisites + +1. [Bacalhau client](https://docs.bacalhau.org/getting-started/installation) installed +2. Python 3.10 or higher +3. A Google Cloud Project with BigQuery enabled (or permissions to create one) +4. Service account credentials with appropriate permissions + +## Setup + +### 1. Install Dependencies + +Install the required Python packages: +```bash +pip install -r requirements.txt +``` + +### 2. Interactive Setup + +Run the interactive setup script: +```bash +./setup.py -i +``` + +The script will guide you through: +1. Project Configuration + - Enter existing project ID or create new + - Configure project settings + +2. Credentials Setup + - Create service account (browser will open) + - Download and configure credentials + - Set up necessary permissions + +3. BigQuery Configuration + - Configure dataset and table names + - Set up storage location + +The script will: +- Create/configure your Google Cloud project +- Set up service account credentials +- Create BigQuery dataset and table +- Save all settings to config.yaml + +### Manual Setup (Alternative) + +If you prefer manual setup: + +1. Create a service account in Google Cloud Console with these roles: + - BigQuery Data Editor + - BigQuery Job User + - Project Creator (if you want the script to create projects) + +2. Download the service account key file (JSON format) + +3. Create `config.yaml`: + ```yaml + project: + id: "your-project-id" # Your Google Cloud project ID + region: "US" # Default region for resources + create_if_missing: true # Whether to create the project if it doesn't exist + + credentials: + path: "credentials.json" # Path to your service account key + + bigquery: + dataset_name: "log_analytics" # Name of the BigQuery dataset + table_name: "log_results" # Name of the results table + location: "US" # Dataset location + ``` + +4. Run the setup script: + ```bash + ./setup.py + ``` + +### 3. Bacalhau Network Setup + +Follow the [standard Bacalhau network setup guide](https://docs.bacalhau.org/getting-started/create-private-network). + +## Usage + +### 1. Simple DuckDB Queries + +Run a simple DuckDB query: + +```bash +bacalhau docker run -e QUERY="select 1" docker.io/bacalhauproject/duckdb:latest +``` + +### 2. 
Processing Logs with BigQuery Integration + +Process log files and store results in BigQuery: + +```bash +bacalhau docker run \ + --input /path/to/logs:/var/log/logs_to_process \ + --volume /path/to/credentials.json:/bacalhau_node/credentials.json \ + ghcr.io/bacalhau-project/examples/bigquery-processor:latest \ + -- python process.py input.json "SELECT * FROM temp_log_data" +``` + +### 3. Using YAML Configuration + +For more complex setups, use the provided YAML configuration: + +```bash +bacalhau job run duckdb_query_job.yaml \ + --template-vars="filename=/bacalhau_data/data.parquet" \ + --template-vars="QUERY=$(cat your_query.sql)" +``` + +## Data Schema + +### BigQuery Table Schema + +The `log_results` table in BigQuery has the following schema: + +- `projectID`: STRING - Google Cloud project identifier +- `region`: STRING - Deployment region +- `nodeName`: STRING - Node name +- `syncTime`: STRING - Synchronization timestamp +- `remote_log_id`: STRING - Original log identifier +- `timestamp`: STRING - Event timestamp +- `version`: STRING - Log version +- `message`: STRING - Log message content + +## Example Queries + +### 1. Basic DuckDB Query + +```sql +-- simple_query.sql +SELECT COUNT(*) AS row_count FROM yellow_taxi_trips; +``` + +### 2. Time Window Analysis + +```sql +-- window_query.sql +SELECT + DATE_TRUNC('hour', tpep_pickup_datetime) + + INTERVAL (FLOOR(EXTRACT(MINUTE FROM tpep_pickup_datetime) / 5) * 5) MINUTE AS interval_start, + COUNT(*) AS ride_count +FROM + yellow_taxi_trips +GROUP BY + interval_start +ORDER BY + interval_start; +``` + +### 3. BigQuery Examples + +After setting up your BigQuery integration, you can run example queries using the provided script: + +```bash +./run_bigquery_query.py +``` + +This will run several example queries and show their results: + +1. Table Structure - Shows the schema of your log_results table +2. Total Row Count - Counts the total number of log entries +3. Recent Logs - Displays the 5 most recent log entries +4. Logs per Node - Shows how many logs each node has generated + +Example output: +``` +================================================================================ + +Querying BigQuery table: your-project-id.log_analytics.log_results + +================================================================================ + +Running query: Table Structure + +SQL: +SELECT + column_name, + data_type, + is_nullable +FROM your-project-id.log_analytics.INFORMATION_SCHEMA.COLUMNS +WHERE table_name = 'log_results' +ORDER BY ordinal_position + +Results: +------------------------------------------------------------ +column_name | data_type | is_nullable +------------------------------------------------------------ +projectID | STRING | YES +region | STRING | YES +nodeName | STRING | YES +syncTime | STRING | YES +remote_log_id| STRING | YES +timestamp | STRING | YES +version | STRING | YES +message | STRING | YES +------------------------------------------------------------ +Total rows: 8 + +... (more query results follow) +``` + +You can also run these queries directly in the BigQuery console: +1. Go to https://console.cloud.google.com/bigquery +2. Select your project +3. Click "Compose New Query" +4. Copy any of the SQL queries from the script output + +The script uses your config.yaml settings and service account credentials to connect to BigQuery. + +## Security Notes + +1. 
Credential Management: + - Never commit credentials to version control + - Mount credentials at runtime using Bacalhau volumes + - Use appropriate IAM roles and permissions + - Keep your config.yaml file secure and out of version control + +2. Data Access: + - Use principle of least privilege + - Regularly rotate service account keys + - Monitor BigQuery access logs + +## Environment Variables + +- `INPUTFILE`: Path to the input log file +- `QUERY`: DuckDB query to transform the data before sending to BigQuery + +## Directory Structure + +``` +. +├── container/ +│ ├── process.py # Main processing script +│ └── Dockerfile # Container definition +├── setup.py # Infrastructure setup script +├── requirements.txt # Python dependencies +├── config.yaml # Your configuration (not in version control) +├── .gitignore # Git ignore rules +└── README.md # This file +``` + +## Contributing + +1. Fork the repository +2. Create a feature branch +3. Submit a pull request + +## License + +This project is licensed under the MIT License - see the LICENSE file for details. + +## Demo Instructions + +### 1. Initial Setup +```bash +# Install dependencies +pip install -r requirements.txt + +# Run interactive setup +./setup.py -i + +# Make utility scripts executable +chmod +x utility_scripts/*.sh + +# Set up tables and service account +./utility_scripts/confirm_table.sh +./utility_scripts/setup_log_uploader.sh +./utility_scripts/setup_aggregation_tables.sh +``` + +### 2. Basic Log Processing + +```bash +# Process logs with basic configuration +bacalhau docker run \ + --input logs:/var/log/logs_to_process \ + --volume log_uploader_credentials.json:/var/logs/logs_to_process/log_uploader_credentials.json \ + ghcr.io/bacalhau-project/examples/bigquery-processor:latest \ + -- python process.py /var/log/logs_to_process/input.json "SELECT * FROM temp_log_data" +``` + +### 3. Advanced Features + +#### Track Cloud Provider +```bash +# Process logs with provider tracking +bacalhau docker run \ + -e CLOUD_PROVIDER=aws \ + --input logs:/var/log/logs_to_process \ + --volume log_uploader_credentials.json:/var/logs/logs_to_process/log_uploader_credentials.json \ + ghcr.io/bacalhau-project/examples/bigquery-processor:latest \ + -- python process.py /var/log/logs_to_process/input.json "SELECT * FROM temp_log_data" +``` + +#### Enable Log Aggregation +```bash +# Process logs with 5-minute window aggregation +bacalhau docker run \ + -e AGGREGATE_LOGS=true \ + --input logs:/var/log/logs_to_process \ + --volume log_uploader_credentials.json:/var/logs/logs_to_process/log_uploader_credentials.json \ + ghcr.io/bacalhau-project/examples/bigquery-processor:latest \ + -- python process.py /var/log/logs_to_process/input.json "SELECT * FROM temp_log_data" +``` + +### 4. Verify Results + +Check the results in BigQuery tables: + +1. Regular Logs: +```sql +SELECT * +FROM `your-project-id.log_analytics.log_results` +ORDER BY timestamp DESC +LIMIT 5 +``` + +2. Aggregated Logs (5-minute windows): +```sql +SELECT * +FROM `your-project-id.log_analytics.log_aggregates` +ORDER BY time_window DESC +LIMIT 5 +``` + +3. Emergency Events: +```sql +SELECT * +FROM `your-project-id.log_analytics.emergency_logs` +ORDER BY timestamp DESC +LIMIT 5 +``` + +### Security Features + +1. **Restricted Service Account**: + - Custom role with minimal permissions + - Can only write to specific BigQuery tables + - Cannot modify schema or read data + +2. 
**IP Address Sanitization**: + - IPv4: Last octet zeroed out + - IPv6: Last 64 bits zeroed out + - Automatic sanitization of public IPs + +3. **Secure Credential Handling**: + - Credentials mounted as volume + - Not exposed through environment variables + - Separate service account for log uploads + +### Environment Variables + +- `INPUTFILE`: Path to input log file (optional) +- `QUERY`: DuckDB query for data transformation (optional) +- `CLOUD_PROVIDER`: Cloud provider identifier (e.g., aws, gcp) +- `AGGREGATE_LOGS`: Enable 5-minute window aggregation (true/false) + +### Table Schemas + +1. **log_results** (Main Table): + - `project_id`: STRING + - `region`: STRING + - `nodeName`: STRING + - `timestamp`: TIMESTAMP + - `version`: STRING + - `message`: STRING + - `sync_time`: TIMESTAMP + - `remote_log_id`: STRING + - `hostname`: STRING + - `public_ip`: STRING + - `private_ip`: STRING + - `alert_level`: STRING + - `provider`: STRING + +2. **log_aggregates** (5-minute windows): + - `project_id`: STRING + - `region`: STRING + - `nodeName`: STRING + - `provider`: STRING + - `hostname`: STRING + - `time_window`: TIMESTAMP + - `log_count`: INT64 + - `messages`: ARRAY + +3. **emergency_logs** (Critical Events): + - `project_id`: STRING + - `region`: STRING + - `nodeName`: STRING + - `provider`: STRING + - `hostname`: STRING + - `timestamp`: TIMESTAMP + - `version`: STRING + - `message`: STRING + - `remote_log_id`: STRING + - `alert_level`: STRING + - `public_ip`: STRING + - `private_ip`: STRING +``` + +This provides a complete, accurate guide for demonstrating all features of the system, including setup, usage, and verification steps. \ No newline at end of file diff --git a/data-engineering/bigquery-with-bacalhau/check_permissions.sh b/data-engineering/bigquery-with-bacalhau/check_permissions.sh new file mode 100644 index 00000000..dcea4c44 --- /dev/null +++ b/data-engineering/bigquery-with-bacalhau/check_permissions.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# Read project ID from config.yaml +PROJECT_ID=$(python3 -c "import yaml; print(yaml.safe_load(open('config.yaml'))['project']['id'])") + +echo "Checking BigQuery permissions for project: $PROJECT_ID" +echo + +# Check if we can access the dataset +echo "Testing dataset access..." +bq show $PROJECT_ID:log_analytics + +# Check if we can modify the table +echo -e "\nChecking table permissions..." +bq show --format=prettyjson $PROJECT_ID:log_analytics.log_results + +# Check IAM permissions +echo -e "\nChecking IAM roles..." 
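# Note: the gcloud command below flattens each IAM binding into one row per member and
# filters to the active account reported by `gcloud config get account`; that account is
# assumed to be the same identity whose BigQuery access this script is verifying.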
+gcloud projects get-iam-policy $PROJECT_ID \ + --flatten="bindings[].members" \ + --format="table(bindings.role,bindings.members)" \ + --filter="bindings.members:$(gcloud config get account)" \ No newline at end of file diff --git a/data-engineering/bigquery-with-bacalhau/config.yaml.example b/data-engineering/bigquery-with-bacalhau/config.yaml.example new file mode 100644 index 00000000..cd91db72 --- /dev/null +++ b/data-engineering/bigquery-with-bacalhau/config.yaml.example @@ -0,0 +1,13 @@ +# BigQuery Configuration +project: + id: "bacalhau-and-bigquery" # Required: Your Google Cloud project ID + region: "US" # Optional: Default region for resources + create_if_missing: true # Whether to create the project if it doesn't exist + +credentials: + path: "credentials.json" # Path to service account credentials + +bigquery: + dataset_name: "log_analytics" # Name of the BigQuery dataset + table_name: "log_results" # Name of the results table + location: "US" # Dataset location \ No newline at end of file diff --git a/data-engineering/bigquery-with-bacalhau/duckdb_query_job.yaml b/data-engineering/bigquery-with-bacalhau/duckdb_query_job.yaml new file mode 100644 index 00000000..a3efa5b8 --- /dev/null +++ b/data-engineering/bigquery-with-bacalhau/duckdb_query_job.yaml @@ -0,0 +1,28 @@ +Tasks: + - Engine: + Params: + Image: docker.io/bacalhauproject/duckdb:latest + WorkingDirectory: "" + EnvironmentVariables: + - QUERY=WITH yellow_taxi_trips AS (SELECT * FROM read_parquet('{{ .filename }}')) {{ .query }} + Type: docker + Name: duckdb-query-job + InputSources: + - Source: + Type: "localDirectory" + Params: + SourcePath: "/bacalhau_data" + ReadWrite: true + Target: "/bacalhau_data" + Publisher: + Type: "local" + Params: + TargetPath: "/bacalhau_data" + Network: + Type: Full + Resources: + CPU: 2000m + Memory: 2048Mi + Timeouts: {} +Type: batch +Count: 1 diff --git a/data-engineering/bigquery-with-bacalhau/process_container/process.py b/data-engineering/bigquery-with-bacalhau/process_container/process.py new file mode 100644 index 00000000..c1b613e5 --- /dev/null +++ b/data-engineering/bigquery-with-bacalhau/process_container/process.py @@ -0,0 +1,326 @@ +#!/usr/bin/env uv run -s +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "duckdb", +# "requests", +# "natsort", +# "google-cloud-storage", +# "google-cloud-bigquery", +# "ipaddress", +# ] +# /// + +import argparse +import ipaddress +import json +import os +import tempfile +from datetime import datetime + +import duckdb +import requests +from google.cloud import bigquery +from google.oauth2 import service_account +from natsort import natsorted, ns + + +def sanitize_ip(ip_str): + """Sanitize IP address by zeroing out last octet for IPv4 or last 64 bits for IPv6.""" + try: + ip = ipaddress.ip_address(ip_str) + if isinstance(ip, ipaddress.IPv4Address): + # Convert to string and replace last octet with 0 + parts = str(ip).split(".") + parts[-1] = "0" + return ".".join(parts) + else: + # For IPv6, zero out the last 64 bits + parts = str(ip).split(":") + return ":".join(parts[:4]) + ":0:0:0:0" + except: + return None + + +def getInstanceMetadata(metadataName): + url = f"http://metadata.google.internal/computeMetadata/v1/instance/{metadataName}" + return getMetadata(url) + + +def getProjectMetadata(metadataName): + url = f"http://metadata.google.internal/computeMetadata/v1/project/{metadataName}" + return getMetadata(url) + + +def getMetadata(metadata_server_url): + metadata_server_token_url = ( + 
"http://metadata/computeMetadata/v1/instance/service-accounts/default/token" + ) + token_request_headers = {"Metadata-Flavor": "Google"} + token_response = requests.get( + metadata_server_token_url, headers=token_request_headers + ) + jwt = token_response.json()["access_token"] + + metadata_request_headers = { + "Metadata-Flavor": "Google", + "Authorization": f"Bearer {jwt}", + } + + return requests.get(metadata_server_url, headers=metadata_request_headers).text + + +def detect_cloud_provider(): + """Detect the cloud provider by trying metadata endpoints.""" + + def try_gcp(): + try: + headers = {"Metadata-Flavor": "Google"} + response = requests.get( + "http://metadata.google.internal/computeMetadata/v1/instance/id", + headers=headers, + timeout=1, + ) + if response.status_code == 200: + return "gcp" + except: + pass + return None + + def try_aws(): + try: + # Get IMDSv2 token first + token_headers = {"X-aws-ec2-metadata-token-ttl-seconds": "21600"} + token = requests.put( + "http://169.254.169.254/latest/api/token", + headers=token_headers, + timeout=1, + ).text + + headers = {"X-aws-ec2-metadata-token": token} + response = requests.get( + "http://169.254.169.254/latest/meta-data/instance-id", + headers=headers, + timeout=1, + ) + if response.status_code == 200: + return "aws" + except: + pass + return None + + def try_azure(): + try: + headers = {"Metadata": "true"} + response = requests.get( + "http://169.254.169.254/metadata/instance?api-version=2021-02-01", + headers=headers, + timeout=1, + ) + if response.status_code == 200: + return "azure" + except: + pass + return None + + # Try each provider + provider = try_gcp() or try_aws() or try_azure() or "unknown" + return provider + + +def main(input_file, query): + # Create an in-memory DuckDB database + con = duckdb.connect(database=":memory:", read_only=False) + + usingTempFile = False + # If file is .gz, decompress it into a temporary file + if input_file.endswith(".gz"): + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".log") as temp: + os.system(f"gunzip -c {input_file} > {temp.name}") + input_file = temp.name + usingTempFile = True + + # Load credentials from the mounted file + credentials = service_account.Credentials.from_service_account_file( + "/var/logs/logs_to_process/log_uploader_credentials.json", + scopes=["https://www.googleapis.com/auth/bigquery"], + ) + + # Create BigQuery client + bq_client = bigquery.Client(credentials=credentials) + + # Generate metadata + try: + projectID = getProjectMetadata("project-id") + region = getInstanceMetadata("zone").split("/")[3] + nodeName = getInstanceMetadata("name") + provider = detect_cloud_provider() + except: + # If metadata service is not available, use defaults + projectID = "unknown" + region = "unknown" + nodeName = "unknown" + provider = "unknown" + + syncTime = datetime.now().strftime("%Y%m%d%H%M%S") + + # Create a temporary table in DuckDB with the JSON data + columns = { + "id": "varchar", + "@timestamp": "varchar", + "@version": "varchar", + "message": "varchar", + } + + # First create a temporary table with the data + temp_table = "temp_log_data" + raw_query = f""" + CREATE TABLE {temp_table} AS + SELECT + '{projectID}' as project_id, + '{region}' as region, + '{nodeName}' as nodeName, + '{syncTime}' as sync_time, + id as remote_log_id, + CAST("@timestamp" AS TIMESTAMP) as timestamp, + "@version" as version, + message, + '{provider}' as provider, + hostname() as hostname, + CASE + WHEN message LIKE '%ERROR%' OR message LIKE '%FATAL%' THEN 'emergency' + 
WHEN message LIKE '%WARN%' THEN 'warning' + ELSE 'info' + END as alert_level + FROM read_json(?, auto_detect=true, columns={columns}) + """ + con.execute(raw_query, [input_file]) + + # Now apply the user's query to filter/transform the data + if query: + result_table = "filtered_results" + con.execute(f"CREATE TABLE {result_table} AS {query}") + else: + result_table = temp_table + + # Check if we should aggregate + should_aggregate = os.environ.get("AGGREGATE_LOGS", "false").lower() == "true" + + if should_aggregate: + # Create aggregated table + agg_table = "aggregated_results" + con.execute(f""" + CREATE TABLE {agg_table} AS + SELECT + project_id, + region, + nodeName, + provider, + hostname, + date_trunc('minute', timestamp) - + (date_part('minute', timestamp)::integer % 5) * interval '1 minute' as time_window, + COUNT(*) as log_count, + array_agg(message) as messages + FROM {result_table} + WHERE alert_level != 'emergency' + GROUP BY + project_id, region, nodeName, provider, hostname, + date_trunc('minute', timestamp) - + (date_part('minute', timestamp)::integer % 5) * interval '1 minute' + """) + + # Export aggregated results to BigQuery + agg_table_id = f"{projectID}.log_analytics.log_aggregates" + df_agg = con.execute(f"SELECT * FROM {agg_table}").df() + + job_config = bigquery.LoadJobConfig( + write_disposition=bigquery.WriteDisposition.WRITE_APPEND, + ) + + job = bq_client.load_table_from_dataframe( + df_agg, agg_table_id, job_config=job_config + ) + job.result() + print(f"Loaded {len(df_agg)} aggregated rows into {agg_table_id}") + + # Export emergency events separately + emergency_table = "emergency_results" + con.execute(f""" + CREATE TABLE {emergency_table} AS + SELECT * + FROM {result_table} + WHERE alert_level = 'emergency' + """) + + emergency_table_id = f"{projectID}.log_analytics.emergency_logs" + df_emergency = con.execute(f"SELECT * FROM {emergency_table}").df() + + if len(df_emergency) > 0: + job = bq_client.load_table_from_dataframe( + df_emergency, emergency_table_id, job_config=job_config + ) + job.result() + print( + f"Loaded {len(df_emergency)} emergency events into {emergency_table_id}" + ) + else: + # Export the regular results to BigQuery + table_id = f"{projectID}.log_analytics.log_results" + df = con.execute(f"SELECT * FROM {result_table}").df() + + # Sanitize IP addresses if present + if "public_ip" in df.columns: + df["public_ip"] = df["public_ip"].apply(sanitize_ip) + + job_config = bigquery.LoadJobConfig( + write_disposition=bigquery.WriteDisposition.WRITE_APPEND, + ) + + job = bq_client.load_table_from_dataframe(df, table_id, job_config=job_config) + job.result() + print(f"Loaded {len(df)} rows into {table_id}") + + # Cleanup + if usingTempFile: + os.unlink(input_file) + + +if __name__ == "__main__": + # Print a header to a list of files that are available to process + print("Files available to process (in /var/log/logs_to_process):") + print("--------------------") + + # Print all files in /var/log/logs_to_process to stdout with absolute paths. + # If there are no files, print a message that "No files are available to process." 
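    # Note: os.listdir() raises FileNotFoundError when /var/log/logs_to_process is not
    # mounted; the job specs in this example mount that path via InputSources, so the
    # directory is assumed to exist by the time this script runs.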
+ files = os.listdir("/var/log/logs_to_process") + if len(files) == 0: + print("No files are available to process.") + else: + f = natsorted(files, alg=ns.IGNORECASE) + for file in f: + print(f"/var/log/logs_to_process/{file}") + + print("\n") + + print("Environment Variables") + print(f"INPUTFILE = {os.environ.get('INPUTFILE')}") + print(f"QUERY = {os.environ.get('QUERY')}") + print(f"AGGREGATE_LOGS = {os.environ.get('AGGREGATE_LOGS', 'false')}") + + # If both INPUTFILE and QUERY are set, then use those + if os.environ.get("INPUTFILE") and os.environ.get("QUERY"): + print("Both INPUTFILE and QUERY are set, so using those") + args = argparse.Namespace( + input_file=os.environ.get("INPUTFILE"), query=os.environ.get("QUERY") + ) + else: + # Set up the argument parser + parser = argparse.ArgumentParser(description="Process log data") + parser.add_argument("input_file", help="Path to the input log file") + parser.add_argument("query", help="DuckDB query to execute") + + # Parse the command-line arguments + args = parser.parse_args() + + # Call the main function + main(args.input_file, args.query) diff --git a/data-engineering/bigquery-with-bacalhau/start-logging-container.yaml b/data-engineering/bigquery-with-bacalhau/start-logging-container.yaml new file mode 100644 index 00000000..fffe8cf3 --- /dev/null +++ b/data-engineering/bigquery-with-bacalhau/start-logging-container.yaml @@ -0,0 +1,30 @@ +Tasks: + - Engine: + Params: + Entrypoint: null + EnvironmentVariables: null + Image: docker.io/bacalhauproject/log-generator:2412171646 + Parameters: + - -d + - /var/log/app + - -n + - aperitivo + WorkingDirectory: "" + Type: docker + Name: sample-job + InputSources: + - Source: + Type: "localDirectory" + Params: + SourcePath: "/bacalhau_data" + ReadWrite: true + Target: "/var/log/logs_to_process" + Network: + Type: None + Publisher: + Type: "" + Resources: + CPU: 250m + Memory: 250m + Timeouts: {} +Type: daemon diff --git a/data-engineering/bigquery-with-bacalhau/utility_scripts/confirm_table.sh b/data-engineering/bigquery-with-bacalhau/utility_scripts/confirm_table.sh new file mode 100755 index 00000000..97c28580 --- /dev/null +++ b/data-engineering/bigquery-with-bacalhau/utility_scripts/confirm_table.sh @@ -0,0 +1,71 @@ +#!/bin/bash + +# Exit on error +set -e + +# Read project ID from config.yaml +PROJECT_ID=$(python3 -c "import yaml; print(yaml.safe_load(open('config.yaml'))['project']['id'])") + +echo "Ensuring all required columns exist in table: $PROJECT_ID.log_analytics.log_results" + +# First, drop the timestamp columns if they exist with wrong type +echo "Dropping timestamp columns to recreate with correct type..." +bq query --use_legacy_sql=false \ +"ALTER TABLE \`$PROJECT_ID.log_analytics.log_results\` +DROP COLUMN IF EXISTS timestamp, +DROP COLUMN IF EXISTS sync_time" + +# Add or modify columns to ensure correct schema +echo "Adding/updating columns..." 
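# Note: BigQuery's ALTER TABLE ... ADD COLUMN IF NOT EXISTS only adds NULLABLE columns and
# is a no-op for columns that already exist, which is why the timestamp and sync_time
# columns are dropped above before being re-added with the TIMESTAMP type.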
+bq query --use_legacy_sql=false \ +"ALTER TABLE \`$PROJECT_ID.log_analytics.log_results\` +ADD COLUMN IF NOT EXISTS region STRING, +ADD COLUMN IF NOT EXISTS nodeName STRING, +ADD COLUMN IF NOT EXISTS timestamp TIMESTAMP, +ADD COLUMN IF NOT EXISTS version STRING, +ADD COLUMN IF NOT EXISTS message STRING, +ADD COLUMN IF NOT EXISTS project_id STRING, +ADD COLUMN IF NOT EXISTS sync_time TIMESTAMP, +ADD COLUMN IF NOT EXISTS remote_log_id STRING, +ADD COLUMN IF NOT EXISTS hostname STRING, +ADD COLUMN IF NOT EXISTS public_ip STRING, +ADD COLUMN IF NOT EXISTS private_ip STRING, +ADD COLUMN IF NOT EXISTS alert_level STRING, +ADD COLUMN IF NOT EXISTS provider STRING" + +# Verify the columns and their types +echo -e "\nVerifying columns..." +bq query --use_legacy_sql=false --format=pretty \ +"WITH required_columns AS ( + SELECT column_name, data_type + FROM UNNEST([ + STRUCT('region' as column_name, 'STRING' as data_type), + ('nodeName', 'STRING'), + ('timestamp', 'TIMESTAMP'), + ('version', 'STRING'), + ('message', 'STRING'), + ('project_id', 'STRING'), + ('sync_time', 'TIMESTAMP'), + ('remote_log_id', 'STRING'), + ('hostname', 'STRING'), + ('public_ip', 'STRING'), + ('private_ip', 'STRING'), + ('alert_level', 'STRING'), + ('provider', 'STRING') + ]) +) +SELECT + c.column_name, + c.data_type as current_type, + r.data_type as required_type, + CASE + WHEN c.data_type = r.data_type THEN '✓' + ELSE '✗' + END as matches +FROM \`$PROJECT_ID.log_analytics\`.INFORMATION_SCHEMA.COLUMNS c +RIGHT JOIN required_columns r + ON c.column_name = r.column_name +WHERE c.table_name = 'log_results' +ORDER BY r.column_name" + +echo -e "\nDone. All required columns should now exist with correct types." \ No newline at end of file diff --git a/data-engineering/bigquery-with-bacalhau/utility_scripts/distribute_credentials.sh b/data-engineering/bigquery-with-bacalhau/utility_scripts/distribute_credentials.sh new file mode 100755 index 00000000..0971e42a --- /dev/null +++ b/data-engineering/bigquery-with-bacalhau/utility_scripts/distribute_credentials.sh @@ -0,0 +1,75 @@ +#!/bin/bash +set -e + +# Check if credentials file exists +if [ ! -f "log_uploader_credentials.json" ]; then + echo "Error: log_uploader_credentials.json not found in current directory" + exit 1 +fi + +# Read and encode the credentials +echo "Reading and encoding credentials..." 
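# Portability note (assumption about the shell environment): `base64 -i FILE` is the
# macOS/BSD form; GNU coreutils base64 wraps encoded output at 76 characters by default,
# so on Linux `base64 -w 0 log_uploader_credentials.json` may be needed to keep
# CREDS_B64 on a single line.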
+CREDS_B64=$(base64 -i log_uploader_credentials.json) + +# Create and encode the Python script directly in memory +SCRIPT_B64=$(cat << 'EOF' | base64 +#!/usr/bin/env python3 +import base64 +import os +import stat +import sys + +try: + # Get the credentials from environment variable + creds_b64 = os.environ.get('CREDS_B64') + if not creds_b64: + print("Error: CREDS_B64 environment variable not found") + sys.exit(1) + + # Decode the credentials + creds = base64.b64decode(creds_b64) + + # If DEBUG is set, list the contents of the directory + if os.environ.get('DEBUG'): + print("Listing contents of /var/log:") + print(os.listdir('/var/log')) + + # Write the credentials + creds_path = '/var/log/logs_to_process/log_uploader_credentials.json' + with open(creds_path, 'wb') as f: + f.write(creds) + + # Set permissions to be readable only by owner (600) + os.chmod(creds_path, stat.S_IRUSR | stat.S_IWUSR) + + # Verify the write + if not os.path.exists(creds_path): + print("Error: Failed to write credentials file") + sys.exit(1) + + # Verify the permissions + perms = oct(os.stat(creds_path).st_mode)[-3:] + if perms != '600': + print(f"Warning: Unexpected permissions: {perms}") + sys.exit(1) + + print(f"Successfully wrote credentials to {creds_path}") + print(f"File permissions: {perms}") + +except Exception as e: + print(f"Error: {str(e)}") + sys.exit(1) +EOF +) + +echo "Distributing credentials to all nodes..." +bacalhau docker run \ + -e SCRIPT_B64="$SCRIPT_B64" \ + -e CREDS_B64="$CREDS_B64" \ + -e DEBUG=true \ + --target all \ + --input file:///bacalhau_data,dst=/var/log/logs_to_process,opt=readwrite=true \ + python:3.11-slim \ + -- /bin/bash -c 'echo "$SCRIPT_B64" | base64 -d > /tmp/write_creds.py && python /tmp/write_creds.py' + +echo "Credentials distribution complete." 
\ No newline at end of file diff --git a/data-engineering/bigquery-with-bacalhau/utility_scripts/list_columns.sh b/data-engineering/bigquery-with-bacalhau/utility_scripts/list_columns.sh new file mode 100755 index 00000000..986c6a3d --- /dev/null +++ b/data-engineering/bigquery-with-bacalhau/utility_scripts/list_columns.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# Read project ID from config.yaml +PROJECT_ID=$(python3 -c "import yaml; print(yaml.safe_load(open('config.yaml'))['project']['id'])") + +echo "Listing columns for table: $PROJECT_ID.log_analytics.log_results" +echo + +bq query --use_legacy_sql=false --format=pretty \ +"SELECT + column_name, + data_type, + is_nullable +FROM \`$PROJECT_ID.log_analytics\`.INFORMATION_SCHEMA.COLUMNS +WHERE table_name = 'log_results' +ORDER BY ordinal_position" \ No newline at end of file diff --git a/data-engineering/bigquery-with-bacalhau/utility_scripts/setup.py b/data-engineering/bigquery-with-bacalhau/utility_scripts/setup.py new file mode 100755 index 00000000..81ea8c27 --- /dev/null +++ b/data-engineering/bigquery-with-bacalhau/utility_scripts/setup.py @@ -0,0 +1,481 @@ +#!/usr/bin/env uv run -s +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "duckdb", +# "requests", +# "natsort", +# "google-cloud-storage", +# "google-cloud-bigquery", +# "google-cloud-resource-manager", +# "google-cloud-iam", +# "google-cloud-service-usage", +# "google-auth", +# "pyyaml", +# "google-api-core", +# ] +# /// + +import argparse +import os +import shutil +import subprocess +import sys +import time +from datetime import datetime + +import yaml +from google.api_core import exceptions +from google.cloud import bigquery, resourcemanager, service_usage_v1 +from google.oauth2 import service_account + +DEFAULT_CONFIG = { + "project": { + "id": "your-project-id", + "region": "US", + "create_if_missing": True, + }, + "credentials": { + "path": "credentials.json", + }, + "bigquery": { + "dataset_name": "log_analytics", + "table_name": "log_results", + "location": "US", + }, +} + +REQUIRED_APIS = [ + "bigquery.googleapis.com", + "cloudresourcemanager.googleapis.com", + "iam.googleapis.com", +] + + +def prompt_yes_no(question, default="yes"): + """Ask a yes/no question and return the answer.""" + valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False} + if default is None: + prompt = " [y/n] " + elif default == "yes": + prompt = " [Y/n] " + elif default == "no": + prompt = " [y/N] " + else: + raise ValueError(f"Invalid default answer: '{default}'") + + while True: + sys.stdout.write(question + prompt) + choice = input().lower() + if default is not None and choice == "": + return valid[default] + elif choice in valid: + return valid[choice] + else: + sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n") + + +def generate_project_id(base_id: str) -> str: + """Generate a unique project ID with timestamp suffix.""" + timestamp = datetime.now().strftime("%y%m%d%H%M") + # Remove any existing timestamp suffix if present + base_id = base_id.split("-20")[0] # Remove any existing timestamp + + # Ensure the total length stays under 30 characters + # Format: base-id + "-" + timestamp = max 30 chars + max_base_length = 19 # 30 - 10 (timestamp) - 1 (hyphen) + if len(base_id) > max_base_length: + base_id = base_id[:max_base_length] + + return f"{base_id}-{timestamp}" + + +def create_project(project_id): + """Create a new GCP project.""" + try: + # Create project without credentials + client = resourcemanager.ProjectsClient() + + # Add timestamp to 
project ID only if it doesn't already have one + if not any(c.isdigit() for c in project_id): + project_id = generate_project_id(project_id) + + # Create project + project = resourcemanager.Project() + project.project_id = project_id + project.display_name = "Bacalhau BigQuery" + + print(f"\nCreating project {project_id}...") + operation = client.create_project(request={"project": project}) + result = operation.result() # Wait for operation to complete + + # Wait for project to be fully created and ready + print("Waiting for project to be ready...") + time.sleep(30) # Give time for project to propagate + + print(f"Project {project_id} created successfully!") + return project_id + except Exception as e: + print(f"Failed to create project: {e}") + return None + + +def check_api_enabled(project_id, api_name): + """Check if a specific API is enabled for the project.""" + import subprocess + + try: + result = subprocess.run( + [ + "gcloud", + "services", + "list", + "--project", + project_id, + "--filter", + f"config.name={api_name}", + "--format", + "value(state)", + ], + capture_output=True, + text=True, + check=True, + ) + return "ENABLED" in result.stdout.upper() + except subprocess.CalledProcessError: + return False + + +def enable_api_with_gcloud(project_id, api_name): + """Enable an API using gcloud command.""" + try: + print(f"Enabling {api_name}...") + subprocess.run( + ["gcloud", "services", "enable", api_name, f"--project={project_id}"], + check=True, + capture_output=True, + text=True, + ) + return True + except subprocess.CalledProcessError as e: + print(f"Failed to enable {api_name}: {e.stderr}") + return False + + +def enable_project_apis(project_id, credentials): + """Enable required APIs for the project using gcloud.""" + try: + # Check and enable Service Usage API first + if not check_api_enabled(project_id, "serviceusage.googleapis.com"): + if not enable_api_with_gcloud(project_id, "serviceusage.googleapis.com"): + print( + "\nFailed to enable Service Usage API. Please ensure you have the right permissions." + ) + print("Try running: gcloud auth login") + sys.exit(1) + + # Enable other required APIs using gcloud + for api in REQUIRED_APIS: + if not check_api_enabled(project_id, api): + if not enable_api_with_gcloud(project_id, api): + return False + else: + print(f"API {api} is already enabled.") + + print("All required APIs are enabled") + return True + except Exception as e: + print(f"Failed to enable APIs: {e}") + return False + + +def print_credentials_instructions(project_id: str): + """Print instructions for creating service account and credentials using gcloud CLI.""" + print("\nTo create service account and credentials, run these commands:") + print("\n1. Create service account:") + print("gcloud iam service-accounts create bacalhau-bigquery \\") + print(" --display-name='Bacalhau BigQuery Service Account' \\") + print(f" --project={project_id}") + + print("\n2. Grant necessary roles:") + print(f"gcloud projects add-iam-policy-binding {project_id} \\") + print( + f" --member='serviceAccount:bacalhau-bigquery@{project_id}.iam.gserviceaccount.com' \\" + ) + print(" --role='roles/bigquery.dataEditor'") + print(f"gcloud projects add-iam-policy-binding {project_id} \\") + print( + f" --member='serviceAccount:bacalhau-bigquery@{project_id}.iam.gserviceaccount.com' \\" + ) + print(" --role='roles/bigquery.jobUser'") + + print("\n3. 
Download service account key:") + print("gcloud iam service-accounts keys create credentials.json \\") + print( + f" --iam-account=bacalhau-bigquery@{project_id}.iam.gserviceaccount.com \\" + ) + print(f" --project={project_id}") + + print("\nAfter running these commands, run this script again to continue setup.") + + +def interactive_setup(): + """Guide the user through the setup process.""" + print("\n=== Bacalhau BigQuery Setup ===\n") + + # Check if config exists + if os.path.exists("config.yaml"): + if not prompt_yes_no("config.yaml already exists. Do you want to reconfigure?"): + return load_or_create_config("config.yaml") + + config = DEFAULT_CONFIG.copy() + + # Project configuration + print("\n1. Project Configuration") + print("-----------------------") + project_id = input( + "Enter your Google Cloud project ID (or press Enter to create new): " + ).strip() + + if not project_id: + # Generate a project ID with timestamp + base_id = "bq" # Shorter base name to allow for timestamp + default_id = generate_project_id(base_id) + project_id = input(f"Enter new project ID (default: {default_id}): ").strip() + project_id = project_id if project_id else default_id + + # Create the project + new_project_id = create_project(project_id) + if not new_project_id: + print("\nPlease create the project manually using:") + print(f"gcloud projects create {project_id}") + sys.exit(1) + + config["project"]["id"] = new_project_id + config["project"]["create_if_missing"] = False # Project is already created + + # Save configuration immediately after project creation + with open("config.yaml", "w") as f: + yaml.dump(config, f, default_flow_style=False) + print("\nConfiguration saved to config.yaml with project ID:", new_project_id) + + # Print instructions for creating service account + print_credentials_instructions(new_project_id) + sys.exit(0) + else: + config["project"]["id"] = project_id + config["project"]["create_if_missing"] = prompt_yes_no( + "Create project if it doesn't exist?" + ) + + # Credentials setup + print("\n2. Credentials Setup") + print("------------------") + while True: + creds_path = input( + "\nEnter the path to your credentials file (default: credentials.json): " + ).strip() + if not creds_path: + creds_path = "credentials.json" + + if os.path.exists(creds_path): + config["credentials"]["path"] = creds_path + break + else: + print(f"Error: File not found at {creds_path}") + print_credentials_instructions(project_id) + sys.exit(1) + + # BigQuery configuration + print("\n3. 
BigQuery Configuration") + print("----------------------") + dataset = input("Enter dataset name (default: log_analytics): ").strip() + if dataset: + config["bigquery"]["dataset_name"] = dataset + + table = input("Enter table name (default: log_results): ").strip() + if table: + config["bigquery"]["table_name"] = table + + # Save final configuration + with open("config.yaml", "w") as f: + yaml.dump(config, f, default_flow_style=False) + + print("\nConfiguration saved to config.yaml") + return config + + +def load_or_create_config(config_path): + """Load configuration from YAML file or create if doesn't exist.""" + if os.path.exists(config_path): + with open(config_path, "r") as f: + return yaml.safe_load(f) + + # If no config exists, run interactive setup + return interactive_setup() + + +def validate_config(config): + """Validate the configuration has all required fields.""" + required_fields = [ + ("project.id", lambda c: c.get("project", {}).get("id")), + ("credentials.path", lambda c: c.get("credentials", {}).get("path")), + ] + + for field, getter in required_fields: + if not getter(config) or getter(config) == DEFAULT_CONFIG["project"]["id"]: + print(f"Missing or invalid required field: {field}") + if prompt_yes_no("Would you like to run the interactive setup?"): + return interactive_setup() + sys.exit(1) + + +def setup_bigquery(config, credentials): + """Setup BigQuery resources.""" + client = bigquery.Client(credentials=credentials, project=config["project"]["id"]) + + # Create dataset if it doesn't exist + dataset_id = f"{config['project']['id']}.{config['bigquery']['dataset_name']}" + dataset = bigquery.Dataset(dataset_id) + dataset.location = config["bigquery"]["location"] + + try: + dataset = client.create_dataset(dataset, exists_ok=True) + print(f"Dataset {dataset_id} created or already exists.") + except Exception as e: + print(f"Error creating dataset: {e}") + return False + + # Create table if it doesn't exist + schema = [ + bigquery.SchemaField("projectID", "STRING"), + bigquery.SchemaField("region", "STRING"), + bigquery.SchemaField("nodeName", "STRING"), + bigquery.SchemaField("syncTime", "STRING"), + bigquery.SchemaField("remote_log_id", "STRING"), + bigquery.SchemaField("timestamp", "STRING"), + bigquery.SchemaField("version", "STRING"), + bigquery.SchemaField("message", "STRING"), + ] + + table_id = f"{dataset_id}.{config['bigquery']['table_name']}" + table = bigquery.Table(table_id, schema=schema) + try: + table = client.create_table(table, exists_ok=True) + print(f"Table {table_id} created or already exists.") + except Exception as e: + print(f"Error creating table: {e}") + return False + + return True + + +def main(): + parser = argparse.ArgumentParser(description="Setup BigQuery resources") + parser.add_argument( + "--config", default="config.yaml", help="Path to configuration file" + ) + parser.add_argument( + "--interactive", "-i", action="store_true", help="Run interactive setup" + ) + args = parser.parse_args() + + print("\nSetting up BigQuery integration...") + + # Try to load existing config first + config = None + if os.path.exists(args.config): + try: + with open(args.config, "r") as f: + config = yaml.safe_load(f) + except Exception as e: + print(f"Error reading config file: {e}") + + # If no config or interactive mode, run setup + if config is None or args.interactive: + config = interactive_setup() + + # Validate configuration + validate_config(config) + + project_id = config["project"]["id"] + + # First, ensure project exists + try: + # Try to create 
client without credentials first + client = resourcemanager.ProjectsClient() + project = client.get_project(name=f"projects/{project_id}") + print(f"Project {project_id} exists.") + except exceptions.NotFound: + print(f"\nProject {project_id} does not exist.") + if config["project"].get("create_if_missing", False): + new_project_id = create_project(project_id) + if not new_project_id: + sys.exit(1) + # Update config with new project ID + config["project"]["id"] = new_project_id + project_id = new_project_id + # Save updated config + with open(args.config, "w") as f: + yaml.dump(config, f, default_flow_style=False) + print(f"Updated config.yaml with new project ID: {new_project_id}") + else: + if prompt_yes_no("Would you like to create the project now?"): + new_project_id = create_project(project_id) + if not new_project_id: + sys.exit(1) + # Update config with new project ID + config["project"]["id"] = new_project_id + project_id = new_project_id + # Save updated config + with open(args.config, "w") as f: + yaml.dump(config, f, default_flow_style=False) + print(f"Updated config.yaml with new project ID: {new_project_id}") + else: + sys.exit(1) + except exceptions.PermissionDenied: + print(f"\nUnable to verify project {project_id} - insufficient permissions.") + print("Please run: gcloud auth login") + print("Then try again.") + sys.exit(1) + + # Now check for credentials + creds_path = os.path.expanduser(config["credentials"]["path"]) + if not os.path.exists(creds_path): + print(f"\nCredentials file not found at {creds_path}") + # Ensure config is saved before showing instructions + if not os.path.exists(args.config): + with open(args.config, "w") as f: + yaml.dump(config, f, default_flow_style=False) + print(f"Created config.yaml with project ID: {project_id}") + print_credentials_instructions(project_id) + sys.exit(1) + + # Enable required APIs using gcloud credentials (not service account) + print("\nEnabling required APIs using your gcloud credentials...") + if not enable_project_apis(project_id, None): + print( + "\nFailed to enable APIs. Please ensure you have the right permissions and try again." + ) + print("You may need to run: gcloud auth login") + sys.exit(1) + + # Setup credentials for BigQuery operations + credentials = service_account.Credentials.from_service_account_file( + creds_path, + scopes=["https://www.googleapis.com/auth/cloud-platform"], + ) + + # Setup BigQuery resources + if setup_bigquery(config, credentials): + print("\nBigQuery setup completed successfully!") + print( + f"\nYou can now use the following project ID in your queries: {project_id}" + ) + else: + print("\nBigQuery setup failed") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/data-engineering/bigquery-with-bacalhau/utility_scripts/setup_aggregation_tables.sh b/data-engineering/bigquery-with-bacalhau/utility_scripts/setup_aggregation_tables.sh new file mode 100755 index 00000000..72a98994 --- /dev/null +++ b/data-engineering/bigquery-with-bacalhau/utility_scripts/setup_aggregation_tables.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +# Exit on error +set -e + +# Read project ID from config.yaml +PROJECT_ID=$(python3 -c "import yaml; print(yaml.safe_load(open('config.yaml'))['project']['id'])") + +echo "Creating aggregation tables in project: $PROJECT_ID" + +# Create table for 5-minute aggregated logs +echo "Creating aggregated logs table..." 
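# Note: BigQuery DDL requires an element type for ARRAY columns; the bare `messages ARRAY`
# declaration below would be rejected and, to match the array_agg(message) output in
# process.py, would normally read something like `messages ARRAY<STRING>`.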
+bq query --use_legacy_sql=false \
+"CREATE TABLE IF NOT EXISTS \`$PROJECT_ID.log_analytics.log_aggregates\`
+(
+    project_id STRING,
+    region STRING,
+    nodeName STRING,
+    provider STRING,
+    hostname STRING,
+    time_window TIMESTAMP,
+    log_count INT64,
+    messages ARRAY<STRING>
+)"
+
+# Create table for emergency events
+echo "Creating emergency logs table..."
+bq query --use_legacy_sql=false \
+"CREATE TABLE IF NOT EXISTS \`$PROJECT_ID.log_analytics.emergency_logs\`
+(
+    project_id STRING,
+    region STRING,
+    nodeName STRING,
+    provider STRING,
+    hostname STRING,
+    timestamp TIMESTAMP,
+    version STRING,
+    message STRING,
+    remote_log_id STRING,
+    alert_level STRING,
+    public_ip STRING,
+    private_ip STRING
+)"
+
+echo "Done. Created tables:"
+echo "- $PROJECT_ID.log_analytics.log_aggregates (5-minute windows)"
+echo "- $PROJECT_ID.log_analytics.emergency_logs (immediate alerts)"
+echo
+echo "To use aggregation mode, set environment variable:"
+echo "AGGREGATE_LOGS=true"
\ No newline at end of file
diff --git a/data-engineering/bigquery-with-bacalhau/utility_scripts/setup_log_uploader.sh b/data-engineering/bigquery-with-bacalhau/utility_scripts/setup_log_uploader.sh
new file mode 100755
index 00000000..d203f9f9
--- /dev/null
+++ b/data-engineering/bigquery-with-bacalhau/utility_scripts/setup_log_uploader.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+
+# Exit on error
+set -e
+
+# Read project ID from config.yaml
+PROJECT_ID=$(python3 -c "import yaml; print(yaml.safe_load(open('../config.yaml'))['project']['id'])")
+
+echo "Setting up log uploader service account for project: $PROJECT_ID"
+
+# Create a service account specifically for log uploads
+SA_NAME="log-uploader"
+SA_EMAIL="$SA_NAME@$PROJECT_ID.iam.gserviceaccount.com"
+
+echo "Creating service account..."
+gcloud iam service-accounts create $SA_NAME \
+    --display-name="Log Uploader Service Account" \
+    --description="Restricted service account for uploading logs to BigQuery" \
+    --project=$PROJECT_ID
+
+# Create a custom role with minimal permissions
+echo "Creating custom role..."
+gcloud iam roles create logUploader \
+    --project=$PROJECT_ID \
+    --title="Log Uploader" \
+    --description="Custom role for uploading logs to BigQuery" \
+    --permissions=bigquery.tables.get,bigquery.tables.updateData \
+    --stage=GA
+
+# Bind the role to the service account
+echo "Binding role to service account..."
+gcloud projects add-iam-policy-binding $PROJECT_ID \
+    --member="serviceAccount:$SA_EMAIL" \
+    --role="projects/$PROJECT_ID/roles/logUploader"
+
+# Create and download a key
+echo "Creating service account key..."
+gcloud iam service-accounts keys create log-uploader-key.json \
+    --iam-account=$SA_EMAIL \
+    --project=$PROJECT_ID
+
+# Rename the key to the credentials filename expected by the uploader
+mv log-uploader-key.json log_uploader_credentials.json
+
+echo "Done. 
Service account key saved to ./log_uploader_credentials.json" +echo "This service account has minimal permissions:" +echo "- Can only write to BigQuery tables" +echo "- Cannot create/modify table schema" +echo "- Cannot read data from tables" +echo "- Cannot access any other GCP services" \ No newline at end of file diff --git a/data-engineering/bigquery-with-bacalhau/utility_scripts/test_bigquery.py b/data-engineering/bigquery-with-bacalhau/utility_scripts/test_bigquery.py new file mode 100755 index 00000000..65c722aa --- /dev/null +++ b/data-engineering/bigquery-with-bacalhau/utility_scripts/test_bigquery.py @@ -0,0 +1,147 @@ +#!/usr/bin/env uv run -s +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "google-cloud-bigquery", +# "google-oauth", +# "pyyaml", +# "google-api-core", +# ] +# /// + +import yaml +from google.cloud import bigquery +from google.oauth2 import service_account + +# Load the project-id from the config.yaml file +# If yaml does not exist, warn that they need to run setup.py and configure config.yaml + +try: + with open("config.yaml", "r") as file: + config = yaml.safe_load(file) + project_id = config.get("project", {}).get("id", None) + if not project_id: + print( + "config.yaml is missing the project.id field. Please run setup.py and configure config.yaml." + ) + exit(1) +except FileNotFoundError: + print("config.yaml not found. Please run setup.py and configure config.yaml.") + exit(1) + +# Setup credentials +credentials = service_account.Credentials.from_service_account_file( + "credentials.json", + scopes=["https://www.googleapis.com/auth/cloud-platform"], +) + + +def print_separator(): + print("\n" + "=" * 80 + "\n") + + +def run_query(client, query, title): + print(f"Running query: {title}") + print("\nSQL:") + print(query) + print("\nResults:") + + query_job = client.query(query) + results = query_job.result() + + # Print results in a tabular format + rows = list(results) + if not rows: + print("No results found") + return + + # Get column names + columns = [field.name for field in results.schema] + + # Calculate column widths + widths = {col: len(col) for col in columns} + for row in rows: + for col in columns: + widths[col] = max(widths[col], len(str(getattr(row, col)))) + + # Print header + header = " | ".join(col.ljust(widths[col]) for col in columns) + print("-" * len(header)) + print(header) + print("-" * len(header)) + + # Print rows + for row in rows: + print(" | ".join(str(getattr(row, col)).ljust(widths[col]) for col in columns)) + + print("-" * len(header)) + print(f"Total rows: {len(rows)}") + + +def main(): + # Load config + with open("config.yaml", "r") as f: + config = yaml.safe_load(f) + + # Setup credentials + credentials = service_account.Credentials.from_service_account_file( + config["credentials"]["path"], + scopes=["https://www.googleapis.com/auth/cloud-platform"], + ) + + # Create BigQuery client + client = bigquery.Client(credentials=credentials, project=config["project"]["id"]) + + project_id = config["project"]["id"] + dataset_id = f"{project_id}.{config['bigquery']['dataset_name']}" + table_id = f"{dataset_id}.{config['bigquery']['table_name']}" + + print_separator() + print(f"Querying BigQuery table: {table_id}") + print_separator() + + # Query 1: Show table structure + schema_query = f""" + SELECT + column_name, + data_type, + is_nullable + FROM {dataset_id}.INFORMATION_SCHEMA.COLUMNS + WHERE table_name = '{config["bigquery"]["table_name"]}' + ORDER BY ordinal_position + """ + run_query(client, schema_query, "Table 
Structure") + print_separator() + + # Query 2: Count total rows + count_query = f""" + SELECT COUNT(*) as total_rows + FROM `{table_id}` + """ + run_query(client, count_query, "Total Row Count") + print_separator() + + # Query 3: Sample recent logs + recent_logs_query = f""" + SELECT timestamp, nodeName, message + FROM `{table_id}` + ORDER BY timestamp DESC + LIMIT 5 + """ + run_query(client, recent_logs_query, "Recent Logs") + print_separator() + + # Query 4: Count logs by node + node_count_query = f""" + SELECT + nodeName, + COUNT(*) as log_count + FROM `{table_id}` + GROUP BY nodeName + ORDER BY log_count DESC + """ + run_query(client, node_count_query, "Logs per Node") + + +if __name__ == "__main__": + main() diff --git a/data-engineering/using-duckdb-with-bacalhau/prep_data/download_data_job.yaml b/data-engineering/using-duckdb-with-bacalhau/prep_data/download_data_job.yaml index d8b23432..765e80ce 100644 --- a/data-engineering/using-duckdb-with-bacalhau/prep_data/download_data_job.yaml +++ b/data-engineering/using-duckdb-with-bacalhau/prep_data/download_data_job.yaml @@ -8,7 +8,7 @@ Tasks: Parameters: - wget - -O - - /bacalhau_data/$(basename "{{ .url_to_download }}") + - /bacalhau_data/{{ .filename }} - "{{ .url_to_download }}" Type: docker Name: download-data-job @@ -29,5 +29,6 @@ Tasks: CPU: 250m Memory: 250m Timeouts: {} -Type: batch -Count: 4 + Constraits: + - INSTANCE_ID=7362301696299350469 +Type: daemon diff --git a/data-engineering/using-duckdb-with-bacalhau/prep_data/run_download_jobs.sh b/data-engineering/using-duckdb-with-bacalhau/prep_data/run_download_jobs.sh index 8e71c1c8..b1475290 100755 --- a/data-engineering/using-duckdb-with-bacalhau/prep_data/run_download_jobs.sh +++ b/data-engineering/using-duckdb-with-bacalhau/prep_data/run_download_jobs.sh @@ -21,7 +21,7 @@ for url in "${urls[@]}"; do echo "Processing URL: $url" # Run the bacalhau job with the URL as template variable - bacalhau job run download_data_job.yaml --template-vars="url_to_download=$url" + bacalhau job run download_data_job.yaml --template-vars="url_to_download=$url" --template-vars="filename=$(basename $url)" # Add a small delay between job submissions sleep 1 diff --git a/data-engineering/using-duckdb-with-bacalhau/window_query_complex.sql b/data-engineering/using-duckdb-with-bacalhau/window_query_complex.sql index a4708a20..a70b3156 100644 --- a/data-engineering/using-duckdb-with-bacalhau/window_query_complex.sql +++ b/data-engineering/using-duckdb-with-bacalhau/window_query_complex.sql @@ -1 +1,25 @@ -SELECT DATE_TRUNC('hour', tpep_pickup_datetime) + INTERVAL (FLOOR(EXTRACT(MINUTE FROM tpep_pickup_datetime) / 5) * 5) MINUTE AS interval_start, COUNT(*) AS ride_count FROM yellow_taxi_trips GROUP BY interval_start ORDER BY interval_start; \ No newline at end of file +WITH intervals AS ( + SELECT + DATE_TRUNC('hour', tpep_pickup_datetime) AS pickup_hour, + FLOOR(EXTRACT(MINUTE FROM tpep_pickup_datetime) / 5) * 5 AS pickup_minute + FROM + your_table_name +) +SELECT + pickup_hour + INTERVAL (pickup_minute) MINUTE AS interval_start, + AVG(ride_count) AS avg_rides_per_5min +FROM ( + SELECT + pickup_hour, + pickup_minute, + COUNT(*) AS ride_count + FROM + intervals + GROUP BY + pickup_hour, + pickup_minute +) AS ride_counts +GROUP BY + interval_start +ORDER BY + interval_start; \ No newline at end of file diff --git a/scale-tester/.envrc b/scale-tester/.envrc index abe7884f..0f210bb5 100644 --- a/scale-tester/.envrc +++ b/scale-tester/.envrc @@ -1,6 +1,2 @@ -# shellcheck disable=SC1090 -. 
<( flox activate; ); - - export GOPATH=$HOME/go export PATH=$PATH:$GOPATH/bin diff --git a/scale-tester/aws_spot/.cspell/custom-dictionary.txt b/scale-tester/aws_spot/.cspell/custom-dictionary.txt index d232366c..61a3241a 100644 --- a/scale-tester/aws_spot/.cspell/custom-dictionary.txt +++ b/scale-tester/aws_spot/.cspell/custom-dictionary.txt @@ -2,4 +2,7 @@ bacalhau bacalhauproject CPUS dind +levelname oneshot +pythonjsonlogger +vcpus diff --git a/scale-tester/aws_spot/.gitignore b/scale-tester/aws_spot/.gitignore new file mode 100644 index 00000000..3dd73fa4 --- /dev/null +++ b/scale-tester/aws_spot/.gitignore @@ -0,0 +1,47 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Virtual Environment +.env +.venv +env/ +venv/ +ENV/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo + +# AWS +config/aws-spot-env.sh +manifest.json + +# Logs +*.log +/tmp/ + +# OS +.DS_Store +Thumbs.db +.aider* diff --git a/scale-tester/aws_spot/README.md b/scale-tester/aws_spot/README.md index f3a067b2..1f7416fe 100644 --- a/scale-tester/aws_spot/README.md +++ b/scale-tester/aws_spot/README.md @@ -1,3 +1,187 @@ -packer init . -packer validate . -packer build . \ No newline at end of file +# Bacalhau Scale Tester + +A tool for testing the scalability of Bacalhau nodes on AWS spot instances. This tool can launch, manage, and monitor large numbers of Bacalhau nodes, with built-in health checking and stress testing capabilities. + +## Features + +- Launch and manage AWS spot instances running Bacalhau nodes +- Monitor instance health and status +- Run automated stress tests with configurable parameters +- Beautiful CLI interface with progress bars and live updates +- Comprehensive logging and debugging options + +## Prerequisites + +- Python 3.10 or higher +- AWS CLI configured with appropriate credentials +- [uv](https://github.com/astral-sh/uv) for dependency management (recommended) +- [Packer](https://www.packer.io/) for building AMIs + +## Project Structure + +``` +bacalhau-scale-tester/ +├── ami/ # AMI Creation Workflow +│ ├── packer/ # Packer configuration +│ │ ├── main.pkr.hcl +│ │ └── variables.pkr.hcl +│ ├── files/ # Files included in AMI +│ │ ├── bacalhau/ +│ │ │ ├── startup.service +│ │ │ └── config.yaml +│ │ └── docker/ +│ │ └── compose.yml +│ └── scripts/ # AMI build scripts +│ ├── build.sh +│ └── setup.sh +├── aws/ # AWS Resource Management +│ ├── config/ # AWS configurations +│ │ ├── env.sh.example +│ │ └── env.sh +│ ├── keys/ # SSH keys (gitignored) +│ │ └── README.md +│ └── scripts/ # AWS setup scripts +│ ├── setup-iam.sh +│ └── upload-ssm.sh +├── fleet/ # Spot Fleet Management +│ ├── bin/ # Command-line tools +│ │ └── spot-manager +│ ├── src/ # Python implementation +│ │ └── spot_manager.py +│ ├── scripts/ # Fleet management scripts +│ │ └── startup.sh +│ └── examples/ # Example jobs +│ └── pusher/ # Pusher job example +│ ├── job.yaml +│ ├── env.yaml +│ └── README.md +├── pyproject.toml # Python project config +├── requirements.txt # Python dependencies +└── README.md # Main documentation +``` + +## Workflows + +### 1. Creating a New AMI + +To create a new AMI for your Bacalhau nodes: + +```bash +# 1. Configure AMI settings +vim ami/packer/variables.pkr.hcl + +# 2. Build the AMI +cd ami +./scripts/build.sh +cd .. +``` + +### 2. Setting Up AWS Resources + +Before running spot instances, set up required AWS resources: + +```bash +# 1. 
Configure AWS environment +cp aws/config/env.sh.example aws/config/env.sh +vim aws/config/env.sh + +# 2. Create SSH key pair +cd aws/keys +aws ec2 create-key-pair --key-name BacalhauScaleTestKey --query 'KeyMaterial' --output text > BacalhauScaleTestKey.pem +chmod 600 BacalhauScaleTestKey.pem +cd ../.. + +# 3. Set up IAM roles and upload configs +cd aws/scripts +./setup-iam.sh +./upload-ssm.sh +cd ../.. +``` + +### 3. Managing Spot Fleet + +The spot manager provides a CLI for managing your fleet: + +```bash +# The spot-manager script handles environment setup +./fleet/bin/spot-manager --help + +# Launch instances +./fleet/bin/spot-manager launch --count 5 + +# List running instances +./fleet/bin/spot-manager list + +# Run stress test +./fleet/bin/spot-manager stress-test \ + --min-nodes 100 \ + --max-nodes 500 \ + --iterations 5 \ + --health-timeout 300 + +# Terminate all instances +./fleet/bin/spot-manager terminate-all +``` + +### Stress Test Options + +- `--min-nodes`: Minimum number of nodes per iteration (default: 250) +- `--max-nodes`: Maximum number of nodes per iteration (default: 750) +- `--iterations`: Number of test iterations (default: 10) +- `--health-timeout`: Timeout in seconds for health checks (default: 300) + +### Debug Mode + +Add `--debug` to any command to enable detailed logging: +```bash +./fleet/bin/spot-manager --debug launch --count 5 +``` + +## Development Setup + +1. Clone the repository: +```bash +git clone https://github.com/bacalhau-project/bacalhau-scale-tester.git +cd bacalhau-scale-tester +``` + +2. Set up the environment: +```bash +# Using uv (recommended) +uv venv +source .venv/bin/activate +uv pip install -r requirements.txt + +# Or using pip +python -m venv .venv +source .venv/bin/activate +pip install -r requirements.txt +``` + +## Example Jobs + +See the `fleet/examples/` directory for example job configurations: + +### Pusher Job +Located in `fleet/examples/pusher/`, this example demonstrates how to set up a job that pushes events to a monitoring system. See its README for detailed setup instructions. + +## Development + +The project uses: +- [Rich](https://rich.readthedocs.io/) for beautiful terminal output +- [Click](https://click.palletsprojects.com/) for CLI interface +- [Boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) for AWS interaction +- [aiohttp](https://docs.aiohttp.org/) for async health checks +- [Packer](https://www.packer.io/) for AMI building + +## Contributing + +1. Fork the repository +2. Create a feature branch +3. Commit your changes +4. Push to the branch +5. Create a Pull Request + +## License + +This project is licensed under the Apache License 2.0 - see the LICENSE file for details. 
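+
+## Appendix: Running the Manager Without the Wrapper
+
+The `fleet/bin/spot-manager` wrapper only sources `aws/config/env.sh` and then invokes the Python implementation, so the manager can also be run directly. A minimal sketch, assuming `uv` is installed and `aws/config/env.sh` has already been filled in:
+
+```bash
+# Load the same AWS environment the wrapper would load
+source aws/config/env.sh
+
+# spot_manager.py declares its dependencies in an inline uv script header,
+# so uv resolves rich, boto3, click, etc. on the fly
+uv run -s fleet/src/spot_manager.py --debug list
+```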
\ No newline at end of file diff --git a/scale-tester/aws_spot/files/docker-compose.yml b/scale-tester/aws_spot/ami/files/docker/compose.yml similarity index 100% rename from scale-tester/aws_spot/files/docker-compose.yml rename to scale-tester/aws_spot/ami/files/docker/compose.yml diff --git a/scale-tester/aws_spot/main.pkr.hcl b/scale-tester/aws_spot/ami/packer/main.pkr.hcl similarity index 100% rename from scale-tester/aws_spot/main.pkr.hcl rename to scale-tester/aws_spot/ami/packer/main.pkr.hcl diff --git a/scale-tester/aws_spot/variables.pkr.hcl b/scale-tester/aws_spot/ami/packer/variables.pkr.hcl similarity index 100% rename from scale-tester/aws_spot/variables.pkr.hcl rename to scale-tester/aws_spot/ami/packer/variables.pkr.hcl diff --git a/scale-tester/aws_spot/build-ami.sh b/scale-tester/aws_spot/ami/scripts/build.sh similarity index 100% rename from scale-tester/aws_spot/build-ami.sh rename to scale-tester/aws_spot/ami/scripts/build.sh diff --git a/scale-tester/aws_spot/setup.sh b/scale-tester/aws_spot/ami/scripts/setup.sh similarity index 100% rename from scale-tester/aws_spot/setup.sh rename to scale-tester/aws_spot/ami/scripts/setup.sh diff --git a/scale-tester/aws_spot/aws/config/env.sh b/scale-tester/aws_spot/aws/config/env.sh new file mode 100644 index 00000000..65dbf332 --- /dev/null +++ b/scale-tester/aws_spot/aws/config/env.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash +# aws-spot-env.sh +# +# This file sets environment variables used for launching +# 1,000 AWS Spot Instances with Docker installed. +# +# Usage: +# source ./aws-spot-env.sh + +# AWS CLI & Region +export AWS_REGION="us-west-2" + +# Key Pair +export KEY_NAME="BacalhauScaleTestKey" + +# Security Group +export SECURITY_GROUP_NAME="bacalhau-scale-test-group" +export SECURITY_GROUP_DESC="Security group for Bacalhau Scale Spot Instances" + +# Your public IP for SSH ingress (CIDR /32) +export MY_PUBLIC_IP=$(curl -s ifconfig.me) + +# Base AMI to use (Amazon Linux 2 example) +# aws ssm get-parameters --names /aws/service/ami-amazon-linux-latest/amzn2-ami-hvm-x86_64-gp2 --region us-east-1 +export BASE_AMI_ID="ami-07d9cf938edb0739b" +export CONFIGURED_AMI_ID="ami-06e47c5231eb29362" + +# Instance Type +export INSTANCE_TYPE="t3.micro" + +# Scaling Limits +export SPOT_INSTANCE_COUNT="4" +export MAX_INSTANCES="1000" +export MAX_INSTANCES_PER_LAUNCH="100" +export MIN_INSTANCES="1" +export MAX_TOTAL_VCPUS="10000" +export MAX_TOTAL_MEMORY="100000" # In GB + +# Custom AMI details (if building your own) +export CUSTOM_AMI_NAME="bacalhau-scale-test-ami" +export CUSTOM_AMI_DESCRIPTION="AMI with Docker and Bacalhau preinstalled" + +# Tags +export INSTANCE_TAG_KEY="Name" +export INSTANCE_TAG_VALUE="bacalhau-scale-test" + +echo "Environment variables for AWS Spot Instances set." diff --git a/scale-tester/aws_spot/aws/keys/README.md b/scale-tester/aws_spot/aws/keys/README.md new file mode 100644 index 00000000..7f1f98a9 --- /dev/null +++ b/scale-tester/aws_spot/aws/keys/README.md @@ -0,0 +1,29 @@ +# AWS SSH Keys + +This directory contains SSH key pairs for accessing AWS instances. These files are sensitive and should never be committed to version control. + +## Required Keys + +1. 
`BacalhauScaleTestKey.pem` - Main SSH key pair for accessing spot instances + - Generated when running setup scripts + - Must be kept private and secure + - Should have permissions set to 600 (`chmod 600 BacalhauScaleTestKey.pem`) + +## Security Notes + +- Never commit these keys to git +- Keep backups in a secure location +- Rotate keys regularly +- Ensure proper file permissions + +## Setup + +To create a new key pair: + +1. Use AWS Console: + ```bash + aws ec2 create-key-pair --key-name BacalhauScaleTestKey --query 'KeyMaterial' --output text > BacalhauScaleTestKey.pem + chmod 600 BacalhauScaleTestKey.pem + ``` + +2. Update the key name in `aws/config/env.sh` diff --git a/scale-tester/aws_spot/setup-iam.sh b/scale-tester/aws_spot/aws/scripts/setup-iam.sh similarity index 100% rename from scale-tester/aws_spot/setup-iam.sh rename to scale-tester/aws_spot/aws/scripts/setup-iam.sh diff --git a/scale-tester/aws_spot/upload-to-ssm.sh b/scale-tester/aws_spot/aws/scripts/upload-to-ssm.sh similarity index 100% rename from scale-tester/aws_spot/upload-to-ssm.sh rename to scale-tester/aws_spot/aws/scripts/upload-to-ssm.sh diff --git a/scale-tester/aws_spot/files/bacalhau-startup.service b/scale-tester/aws_spot/files/bacalhau-startup.service deleted file mode 100644 index ec988e93..00000000 --- a/scale-tester/aws_spot/files/bacalhau-startup.service +++ /dev/null @@ -1,14 +0,0 @@ -[Unit] -Description=Bacalhau Startup Script -After=docker.service network-online.target -Wants=network-online.target - -[Service] -Type=oneshot -ExecStart=/bacalhau_node/startup.sh -RemainAfterExit=yes -StandardOutput=journal -StandardError=journal - -[Install] -WantedBy=multi-user.target \ No newline at end of file diff --git a/scale-tester/aws_spot/files/orchestrator-config.yaml b/scale-tester/aws_spot/files/orchestrator-config.yaml deleted file mode 100644 index 82e1ead4..00000000 --- a/scale-tester/aws_spot/files/orchestrator-config.yaml +++ /dev/null @@ -1,17 +0,0 @@ -NameProvider: "uuid" -API: - Port: 1234 -Compute: - Enabled: true - Orchestrators: - - nats://ns101607.ip-147-135-16.us:4222 - Auth: - Token: 93182ba0-6a4a-4c5b-9554-deb0b19ee71f - AllowListedLocalPaths: - - /bacalhau_data:rw - Engine: - Resources: - CPU: 1 - Memory: 1GB -JobAdmissionControl: - AcceptNetworkedJobs: true diff --git a/scale-tester/aws_spot/fleet/bin/spot-manager b/scale-tester/aws_spot/fleet/bin/spot-manager new file mode 100755 index 00000000..3f0f8d9f --- /dev/null +++ b/scale-tester/aws_spot/fleet/bin/spot-manager @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +# +# Convenience wrapper for running the spot manager +# Ensures proper environment and paths are set up + +# Get the directory this script is in +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +PROJECT_ROOT="$( cd "$SCRIPT_DIR/../.." 
&& pwd )"
+
+# Source AWS environment if it exists
+AWS_ENV_FILE="$PROJECT_ROOT/aws/config/env.sh"
+if [ -f "$AWS_ENV_FILE" ]; then
+    source "$AWS_ENV_FILE"
+else
+    echo "Error: AWS environment file not found at $AWS_ENV_FILE"
+    echo "Please copy aws/config/env.sh.example to aws/config/env.sh and configure it"
+    exit 1
+fi
+
+# Run the spot manager
+"$PROJECT_ROOT/fleet/src/spot_manager.py" "$@"
\ No newline at end of file
diff --git a/scale-tester/aws_spot/fleet/examples/pusher/README.md b/scale-tester/aws_spot/fleet/examples/pusher/README.md
new file mode 100644
index 00000000..4c7f1924
--- /dev/null
+++ b/scale-tester/aws_spot/fleet/examples/pusher/README.md
@@ -0,0 +1,44 @@
+# Pusher Job Example
+
+This example demonstrates how to set up a job that pushes events to a monitoring system. It's useful for monitoring the health and performance of your Bacalhau nodes.
+
+## Files
+
+- `pusher-job.yaml` - Main job configuration for the event pusher
+- `env_writer.yaml` - Environment configuration for the pusher
+- `env.txt` - Environment variables (create from env.txt.example)
+- `pusher_env.txt.b64` - Base64 encoded environment variables
+
+## Setup
+
+1. Configure environment:
+```bash
+# Copy example config
+cp env.txt.example env.txt
+
+# Edit with your settings
+vim env.txt
+
+# Create base64 encoded version
+base64 env.txt > pusher_env.txt.b64
+```
+
+2. Deploy the job:
+```bash
+# Run the job
+bacalhau job run pusher-job.yaml
+
+# Verify it's running
+bacalhau job list
+```
+
+## Configuration
+
+The pusher job requires the following environment variables:
+
+- `PUSHER_ENDPOINT` - Endpoint to push events to
+- `PUSHER_TOKEN` - Authentication token
+- `PUSHER_INTERVAL` - Push interval in seconds
+- `PUSHER_BATCH_SIZE` - Number of events per batch
+
+See `env.txt.example` for a complete list of options.
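+
+## Example env.txt
+
+A minimal sketch of what `env.txt` could contain, based on the variables listed above (the endpoint and token values are placeholders, not real settings):
+
+```bash
+PUSHER_ENDPOINT=https://events.example.com/push
+PUSHER_TOKEN=replace-with-your-token
+PUSHER_INTERVAL=30
+PUSHER_BATCH_SIZE=100
+```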
diff --git a/scale-tester/aws_spot/env_writer.yaml b/scale-tester/aws_spot/fleet/examples/pusher/env_writer.yaml similarity index 100% rename from scale-tester/aws_spot/env_writer.yaml rename to scale-tester/aws_spot/fleet/examples/pusher/env_writer.yaml diff --git a/scale-tester/aws_spot/pusher-job.yaml b/scale-tester/aws_spot/fleet/examples/pusher/pusher-job.yaml similarity index 100% rename from scale-tester/aws_spot/pusher-job.yaml rename to scale-tester/aws_spot/fleet/examples/pusher/pusher-job.yaml diff --git a/scale-tester/aws_spot/fleet/examples/pusher/pusher_env.txt.b64 b/scale-tester/aws_spot/fleet/examples/pusher/pusher_env.txt.b64 new file mode 100644 index 00000000..8d75863c --- /dev/null +++ b/scale-tester/aws_spot/fleet/examples/pusher/pusher_env.txt.b64 @@ -0,0 +1 @@ +QVdTX0FDQ0VTU19LRVlfSUQ9QUtJQTNGTERZT0JGQ01XTjRTRkMKQVdTX1NFQ1JFVF9BQ0NFU1NfS0VZPWpsTlU0VE1FcUhkWGZhNWdyYWY4MXBkQWtORGE2RG9raUk3UzJNOEoKQVdTX1JFR0lPTj11cy1lYXN0LTIKU1FTX1FVRVVFX1VSTD1odHRwczovL3Nxcy51cy1lYXN0LTIuYW1hem9uYXdzLmNvbS83NjczOTc3NTI5MDYvc2NhbGUtdGVzdGVyLWV2ZW50cy5maWZvCkNPTE9SPSNGRjAzMjEK diff --git a/scale-tester/aws_spot/scripts/startup.sh b/scale-tester/aws_spot/fleet/scripts/startup.sh similarity index 100% rename from scale-tester/aws_spot/scripts/startup.sh rename to scale-tester/aws_spot/fleet/scripts/startup.sh diff --git a/scale-tester/aws_spot/fleet/src/spot_manager.py b/scale-tester/aws_spot/fleet/src/spot_manager.py new file mode 100755 index 00000000..9af52bda --- /dev/null +++ b/scale-tester/aws_spot/fleet/src/spot_manager.py @@ -0,0 +1,2752 @@ +#!/usr/bin/env uv run -s +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "rich", +# "boto3", +# "click", +# "aiohttp", +# "python-dotenv", +# "python-json-logger", +# ] +# /// + +import asyncio +import json +import logging +import os +import random +import re +import signal +import subprocess +import sys +import time +from collections import deque +from datetime import datetime +from functools import wraps +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +import aiohttp +import boto3 +import click +from click import Context +from pythonjsonlogger import jsonlogger +from rich.box import ROUNDED +from rich.console import Console +from rich.layout import Layout +from rich.live import Live +from rich.panel import Panel +from rich.progress import ( + BarColumn, + Progress, + SpinnerColumn, + TaskProgressColumn, + TextColumn, + TimeElapsedColumn, + TimeRemainingColumn, +) +from rich.table import Table +from rich.theme import Theme + +# Initialize Rich console with consistent theme +console = Console( + theme=Theme( + { + "info": "bold blue", + "warning": "bold yellow", + "error": "bold red", + "success": "bold green", + "highlight": "bold cyan", + "dim": "dim", + } + ) +) + +# Get the project root directory +SCRIPT_DIR = Path(__file__).resolve().parent +PROJECT_ROOT = SCRIPT_DIR.parent.parent +DEBUG_LOG = Path(os.getcwd()) / "debug.log" # Use caller's directory for debug log + + +def write_debug(message: str) -> None: + """Write debug information to debug.log in the caller's directory""" + try: + # 'a' mode is used to prevent multiple processes from truncating each other's output + with open(DEBUG_LOG, "a") as f: + f.write(f"{datetime.now().isoformat()} - {message}\n") + except Exception as e: + # If we can't write to the debug log, fall back to stderr + print(f"Failed to write to debug log: {str(e)}", file=sys.stderr) + + +# Truncate the debug log at startup +try: + DEBUG_LOG.write_text("") +except 
Exception as e: + print(f"Failed to truncate debug log: {str(e)}", file=sys.stderr) + +# Initialize global layout with consistent sizing +layout = Layout() +layout.split_column( + Layout(name="header", size=3, minimum_size=3), + Layout(name="body", ratio=2), + Layout(name="status", size=6, minimum_size=6), + Layout(name="progress", size=4, minimum_size=4), +) + +# Initialize global progress with more detailed columns and safer formatting +progress = Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(bar_width=None), + TaskProgressColumn(), + # Remove percentage for tasks that might not have a total + TimeElapsedColumn(), + TimeRemainingColumn(), + expand=True, + disable=False, +) + + +def safe_progress_update(task_id, **kwargs): + """Safely update progress without template errors""" + try: + if "total" in kwargs and kwargs["total"] is None: + # Don't show percentage for indeterminate progress + kwargs["visible"] = True + if "completed" in kwargs: + del kwargs["completed"] + progress.update(task_id, **kwargs) + except Exception as e: + # Fallback to basic progress display + try: + progress.update(task_id, description="Processing...", visible=True) + except: + pass # Suppress any errors in the fallback + + +progress_task = None +layout["progress"].update(progress) + +# Initialize global live display +live = Live(layout, refresh_per_second=4, auto_refresh=True) + + +def load_shell_env(env_path: Path) -> None: + """Load environment variables from a shell script""" + if not env_path.exists(): + console.print(f"[yellow]Warning: Environment file not found at {env_path}[/yellow]") + return + + content = env_path.read_text() + pattern = r'^(?:export\s+)?([A-Za-z_][A-Za-z0-9_]*)=["\']?([^"\'\n]*)["\']?$' + + for line in content.splitlines(): + line = line.strip() + if line and not line.startswith("#"): + match = re.match(pattern, line) + if match: + key, value = match.groups() + os.environ[key] = value + + +class RateLimiter: + """Rate limiter for AWS API calls""" + + def __init__(self, max_rate: float = 10, time_window: float = 1.0): + self.max_rate = max_rate + self.time_window = time_window + self.timestamps = deque(maxlen=max_rate) + self.lock = asyncio.Lock() + + async def wait(self): + """Wait until we can make another API call""" + async with self.lock: + now = time.time() + + # Remove old timestamps + while self.timestamps and now - self.timestamps[0] > self.time_window: + self.timestamps.popleft() + + if len(self.timestamps) >= self.max_rate: + # Calculate wait time + oldest = self.timestamps[0] + wait_time = self.time_window - (now - oldest) + if wait_time > 0: + await asyncio.sleep(wait_time) + now = time.time() + + self.timestamps.append(now) + + +def rate_limited(max_rate: float = 10, time_window: float = 1.0): + """Decorator to rate limit AWS API calls""" + + def decorator(func): + @wraps(func) + async def wrapper(self, *args, **kwargs): + await self.rate_limiter.wait() + return await func(self, *args, **kwargs) + + return wrapper + + return decorator + + +class SpotManager: + def __init__(self, debug: bool = False): + self.debug = debug + + # Initialize logging first + self._setup_logging() + + # Initialize AWS clients + self.region = os.getenv("AWS_REGION", "us-west-2") + self.ec2 = boto3.client("ec2", region_name=self.region) + self.ec2_resource = boto3.resource("ec2", region_name=self.region) + self._instance_type_cache = {} + + # Initialize rate limiter with conservative defaults + self.rate_limiter = RateLimiter(max_rate=8, 
time_window=1.0) # 8 requests per second + + # Initialize lifecycle tracking + self.lifecycle_events = {} # instance_id -> list of lifecycle events + self.lifecycle_lock = asyncio.Lock() # For thread-safe lifecycle updates + + # Health monitoring configuration + self.health_check_interval = 60 # Seconds between health checks + self.max_health_failures = 3 # Max consecutive failures before recovery + self.health_metrics = {} # instance_id -> health metrics + self.health_lock = asyncio.Lock() # For thread-safe health updates + self.monitoring_tasks = set() # Track active monitoring tasks + self.health_thresholds = { + "max_response_time": 5.0, # Seconds + "min_success_rate": 0.8, # 80% + "max_consecutive_failures": 3, + "max_error_rate": 0.2, # 20% + } + + # Error recovery configuration + self.max_retries = 5 # Max retries for recoverable errors + self.retry_delay = 1.0 # Initial retry delay in seconds + self.max_retry_delay = 30.0 # Max retry delay in seconds + self.recovery_actions = { + "InstanceLimitExceeded": self._handle_instance_limit_error, + "InsufficientInstanceCapacity": self._handle_capacity_error, + "SpotInstanceRequestLimitExceeded": self._handle_spot_limit_error, + "RequestLimitExceeded": self._handle_rate_limit_error, + "Unavailable": self._handle_service_unavailable, + "InternalError": self._handle_internal_error, + } + # Load environment variables + aws_env_path = PROJECT_ROOT / "aws" / "config" / "env.sh" + load_shell_env(aws_env_path) + + # Initialize scaling limits from environment or defaults + self.max_instances = int(os.getenv("MAX_INSTANCES", 1000)) + self.max_instances_per_launch = min( + int(os.getenv("MAX_INSTANCES_PER_LAUNCH", 100)), + 100, # AWS hard limit per launch request + ) + self.min_instances = int(os.getenv("MIN_INSTANCES", 1)) + self.max_total_vcpus = int(os.getenv("MAX_TOTAL_VCPUS", 10000)) + self.max_total_memory = int(os.getenv("MAX_TOTAL_MEMORY", 100000)) # In GB + + # Track resource usage + self.current_vcpus = 0 + self.current_memory = 0 # In GB + self.region = os.getenv("AWS_REGION", "us-west-2") + self.instance_type = os.getenv("INSTANCE_TYPE", "t3.micro") + self.key_name = os.getenv("KEY_NAME") + self.security_group_name = os.getenv("SECURITY_GROUP_NAME", "bacalhau-scale-test-sg") + self.configured_ami_id = os.getenv("CONFIGURED_AMI_ID") + + # Validate all configuration parameters + self._validate_configuration() + self._cleanup_tasks = set() # Track cleanup tasks + self._cleanup_lock = asyncio.Lock() # For thread-safe cleanup operations + self._created_resources = {"instances": set(), "security_groups": set(), "key_pairs": set()} + self.key_name = os.getenv("KEY_NAME") + self.security_group_name = os.getenv("SECURITY_GROUP_NAME", "bacalhau-scale-test-sg") + # Initialize tagging system + self.default_tags = { + "Name": os.getenv("INSTANCE_TAG_VALUE", "bacalhau-scale-test"), + "Project": "BacalhauScaleTest", + "Environment": "Test", + "ManagedBy": "SpotManager", + "CreationTime": datetime.now().isoformat(), + } + self.configured_ami_id = os.getenv("CONFIGURED_AMI_ID") + + # Instance state machine tracking + self.instance_states = {} # instance_id -> state info + self.state_lock = asyncio.Lock() # For thread-safe state updates + self.state_transitions = { + "pending": ["running", "terminated", "shutting-down"], + "running": ["stopping", "shutting-down", "terminated"], + "stopping": ["stopped", "terminated"], + "stopped": ["terminated", "pending"], + "shutting-down": ["terminated"], + "terminated": [], + } + + self.ec2 = boto3.client("ec2", 
region_name=self.region) + self.ec2_resource = boto3.resource("ec2", region_name=self.region) + self._instance_type_cache = {} + + # Use global progress task + global progress_task + self.progress_task = progress_task + + self._setup_logging() + self.log( + "info", + "SpotManager initialized", + region=self.region, + instance_type=self.instance_type, + security_group=self.security_group_name, + ) + + def _validate_configuration(self) -> None: + """Validate all configuration parameters""" + required_env_vars = [ + "AWS_REGION", + "KEY_NAME", + "SECURITY_GROUP_NAME", + "CONFIGURED_AMI_ID", + "INSTANCE_TYPE", + ] + + # Validate instance type + if not self.validate_instance_type(self.instance_type): + raise ValueError( + f"Instance type {self.instance_type} is not supported or available. " + f"Must support EBS optimization, HVM virtualization, and have at least 2 network interfaces" + ) + + missing_vars = [var for var in required_env_vars if not os.getenv(var)] + if missing_vars: + raise ValueError( + f"Missing required environment variables: {', '.join(missing_vars)}. " + f"Please check your aws/config/env.sh file" + ) + + # Validate instance type format + if not re.match(r"^[a-z0-9]+\.\w+$", self.instance_type): + raise ValueError( + f"Invalid instance type format: {self.instance_type}. " + f"Expected format like 't3.micro'" + ) + + # Validate AMI ID format + if not re.match(r"^ami-[0-9a-f]{17}$", self.configured_ami_id): + raise ValueError( + f"Invalid AMI ID format: {self.configured_ami_id}. " + f"Expected format like 'ami-0123456789abcdef0'" + ) + + # Validate security group name + if not re.match(r"^[a-zA-Z0-9_\-]{1,255}$", self.security_group_name): + raise ValueError( + f"Invalid security group name: {self.security_group_name}. " + f"Must be 1-255 alphanumeric characters, underscores or hyphens" + ) + + # Validate key pair name + if not re.match(r"^[a-zA-Z0-9_\-]{1,255}$", self.key_name): + raise ValueError( + f"Invalid key pair name: {self.key_name}. " + f"Must be 1-255 alphanumeric characters, underscores or hyphens" + ) + + # Validate region format + if not re.match(r"^[a-z]{2}-[a-z]+-\d+$", self.region): + raise ValueError( + f"Invalid AWS region format: {self.region}. Expected format like 'us-west-2'" + ) + + def _setup_logging(self): + """Setup logging to write to debug.log""" + self.logger = logging.getLogger("SpotManager") + self.logger.setLevel(logging.DEBUG if self.debug else logging.INFO) + + # Remove any existing handlers + self.logger.handlers = [] + + # Create file handler that writes to debug.log + file_handler = logging.FileHandler(DEBUG_LOG) + formatter = jsonlogger.JsonFormatter( + fmt="%(asctime)s %(levelname)s %(name)s %(message)s", + rename_fields={"asctime": "timestamp", "levelname": "level", "name": "logger"}, + ) + file_handler.setFormatter(formatter) + self.logger.addHandler(file_handler) + + def log(self, level: str, message: str, **kwargs): + """Log structured messages with additional context""" + log_method = getattr(self.logger, level, self.logger.info) + log_data = {"message": message, **kwargs} + log_method(log_data) + + def debug_log(self, message: str, **kwargs): + """Log debug messages to debug.log""" + if self.debug: + write_debug(message) + if kwargs: + write_debug(f"Additional context: {json.dumps(kwargs, indent=2)}") + + async def check_node_health( + self, ip_address: str, max_retries: int = 3, timeout: int = 5 + ) -> Dict[str, Any]: + """Check if a Bacalhau node is healthy by querying its API with retries + and collecting detailed metrics. 
+ + Args: + ip_address: IP address of node to check + max_retries: Maximum number of retry attempts + timeout: Timeout in seconds for each attempt + + Returns: + Dict containing health status and metrics: + { + "healthy": bool, + "response_time": float, # In seconds + "status_code": int, + "error": Optional[str], + "timestamp": str + } + """ + url = f"http://{ip_address}:1234" + retry_delay = 1 # Start with 1 second delay + start_time = time.time() + + for attempt in range(max_retries): + try: + async with aiohttp.ClientSession() as session: + async with session.get(url, timeout=timeout) as response: + response_time = time.time() - start_time + + # Collect metrics + metrics = { + "healthy": response.status == 200, + "response_time": response_time, + "status_code": response.status, + "error": None, + "timestamp": datetime.now().isoformat(), + } + + if response.status != 200: + self.debug_log( + f"Health check attempt {attempt + 1} failed for {ip_address}: " + f"Status {response.status}" + ) + metrics["error"] = f"HTTP {response.status}" + + return metrics + except asyncio.TimeoutError: + error_msg = f"Health check attempt {attempt + 1} timed out for {ip_address}" + self.debug_log(error_msg) + metrics = { + "healthy": False, + "response_time": time.time() - start_time, + "status_code": None, + "error": "Timeout", + "timestamp": datetime.now().isoformat(), + } + except Exception as e: + error_msg = f"Health check attempt {attempt + 1} failed for {ip_address}: {str(e)}" + self.debug_log(error_msg) + metrics = { + "healthy": False, + "response_time": time.time() - start_time, + "status_code": None, + "error": str(e), + "timestamp": datetime.now().isoformat(), + } + + # Exponential backoff between retries + if attempt < max_retries - 1: + await asyncio.sleep(retry_delay) + retry_delay = min(retry_delay * 2, 10) # Cap at 10 seconds + + return metrics + + async def start_health_monitoring(self) -> None: + """Start continuous health monitoring for all instances""" + while True: + try: + # Get all running instances + instance_ids = await asyncio.get_event_loop().run_in_executor( + None, self.get_all_instance_ids + ) + + if not instance_ids: + await asyncio.sleep(self.health_check_interval) + continue + + # Get instance IPs + instance_ips = await asyncio.get_event_loop().run_in_executor( + None, lambda: self.get_instance_ips(instance_ids) + ) + + # Check health of all nodes + health_results = await self.check_all_nodes_health(instance_ips) + + # Update health metrics + async with self.health_lock: + for ip, metrics in health_results.items(): + instance_id = next( + ( + id + for id, ip_addr in zip(instance_ids, instance_ips) + if ip_addr == ip + ), + None, + ) + if instance_id: + if instance_id not in self.health_metrics: + self.health_metrics[instance_id] = { + "history": [], + "consecutive_failures": 0, + "last_healthy": None, + } + + # Update metrics + self.health_metrics[instance_id]["history"].append(metrics) + if not metrics["healthy"]: + self.health_metrics[instance_id]["consecutive_failures"] += 1 + else: + self.health_metrics[instance_id]["consecutive_failures"] = 0 + self.health_metrics[instance_id]["last_healthy"] = datetime.now() + + # Check if instance needs recovery + if ( + self.health_metrics[instance_id]["consecutive_failures"] + >= self.health_thresholds["max_consecutive_failures"] + ): + await self.recover_instance(instance_id) + + await asyncio.sleep(self.health_check_interval) + + except Exception as e: + self.log("error", "Health monitoring error", error=str(e)) + await 
asyncio.sleep(self.health_check_interval) + + async def recover_instance(self, instance_id: str) -> None: + """Recover an unhealthy instance""" + self.log("warning", "Recovering unhealthy instance", instance_id=instance_id) + + try: + # Get instance details + instance = await asyncio.get_event_loop().run_in_executor( + None, + lambda: self.ec2.describe_instances(InstanceIds=[instance_id])["Reservations"][0][ + "Instances" + ][0], + ) + + # Terminate the unhealthy instance + await asyncio.get_event_loop().run_in_executor( + None, lambda: self.ec2.terminate_instances(InstanceIds=[instance_id]) + ) + + # Wait for termination to complete + waiter = self.ec2.get_waiter("instance_terminated") + await asyncio.get_event_loop().run_in_executor( + None, + lambda: waiter.wait( + InstanceIds=[instance_id], WaiterConfig={"Delay": 5, "MaxAttempts": 40} + ), + ) + + # Launch replacement instance + await self.launch_instances(1) + + # Cleanup health metrics + async with self.health_lock: + if instance_id in self.health_metrics: + del self.health_metrics[instance_id] + + self.log("info", "Instance recovery completed", instance_id=instance_id) + + except Exception as e: + self.log("error", "Instance recovery failed", instance_id=instance_id, error=str(e)) + raise + + async def check_all_nodes_health( + self, instance_ips: List[str], progress=None, timeout: int = 10 + ) -> Dict[str, Dict]: + """Check health of all nodes in parallel with timeout and collect metrics + + Args: + instance_ips: List of IP addresses to check + progress: Progress tracker for UI updates + timeout: Maximum time to wait for all checks + + Returns: + Dict mapping IP addresses to health metrics: + { + "healthy": bool, + "response_time": float, + "status_code": Optional[int], + "error": Optional[str], + "timestamp": str + } + """ + try: + # Create tasks with individual timeouts + tasks = [ + asyncio.wait_for(self.check_node_health(ip), timeout=timeout) for ip in instance_ips + ] + + # Run all checks in parallel + results = await asyncio.gather(*tasks, return_exceptions=True) + + # Process results + health_status = {} + for ip, result in zip(instance_ips, results): + if isinstance(result, Exception): + self.debug_log(f"Health check failed for {ip}: {str(result)}") + health_status[ip] = False + else: + health_status[ip] = result + + return health_status + + except asyncio.TimeoutError: + self.debug_log(f"Health checks timed out after {timeout} seconds") + return {ip: False for ip in instance_ips} + except Exception as e: + self.debug_log(f"Error checking node health: {str(e)}") + return {ip: False for ip in instance_ips} + + def get_instance_ips(self, instance_ids: List[str]) -> List[str]: + """Get public IPs for a list of instance IDs""" + response = self.ec2.describe_instances(InstanceIds=instance_ids) + return [ + instance["PublicIpAddress"] + for reservation in response["Reservations"] + for instance in reservation["Instances"] + if "PublicIpAddress" in instance + ] + + def ensure_security_group(self) -> str: + """Ensure security group exists and has correct rules""" + # Validate security group configuration before proceeding + if not self.security_group_name: + raise ValueError("Security group name is not configured") + + self.debug_log("Checking for existing security group...") + + try: + response = self.ec2.describe_security_groups( + Filters=[{"Name": "group-name", "Values": [self.security_group_name]}] + ) + + if response["SecurityGroups"]: + group_id = response["SecurityGroups"][0]["GroupId"] + self.debug_log(f"Found existing 
security group: {group_id}") + else: + response = self.ec2.create_security_group( + GroupName=self.security_group_name, + Description="Security group for Bacalhau scale testing", + ) + group_id = response["GroupId"] + self._created_resources["security_groups"].add(group_id) + self.debug_log(f"Created new security group: {group_id}") + + self._update_security_group_rules(group_id) + return group_id + + except Exception as e: + console.print(f"[red]Error ensuring security group: {str(e)}[/red]") + self._cleanup_resources() + raise + + def _update_security_group_rules(self, group_id: str): + """Update security group rules""" + try: + existing_rules = self.ec2.describe_security_group_rules( + Filters=[{"Name": "group-id", "Values": [group_id]}] + ) + + for rule in existing_rules.get("SecurityGroupRules", []): + if not rule.get("IsEgress", True): + try: + self.ec2.revoke_security_group_ingress( + GroupId=group_id, + SecurityGroupRuleIds=[rule["SecurityGroupRuleId"]], + ) + except Exception as e: + self.debug_log(f"Error removing rule: {str(e)}") + + self.ec2.authorize_security_group_ingress( + GroupId=group_id, + IpPermissions=[ + { + "IpProtocol": "tcp", + "FromPort": 22, + "ToPort": 22, + "IpRanges": [{"CidrIp": "0.0.0.0/0"}], + }, + { + "IpProtocol": "tcp", + "FromPort": 4222, + "ToPort": 4222, + "IpRanges": [{"CidrIp": "0.0.0.0/0"}], + }, + { + "IpProtocol": "tcp", + "FromPort": 1234, + "ToPort": 1234, + "IpRanges": [{"CidrIp": "0.0.0.0/0"}], + }, + ], + ) + except Exception as e: + self.debug_log(f"Error updating security group rules: {str(e)}") + raise + + async def _cleanup_instances(self, instance_ids: List[str]) -> None: + """Async cleanup handler for instances with improved state validation""" + async with self._cleanup_lock: + try: + self.log("info", "Starting instance cleanup", instance_count=len(instance_ids)) + + # Get current instance states + response = await asyncio.get_event_loop().run_in_executor( + None, lambda: self.ec2.describe_instances(InstanceIds=instance_ids) + ) + + # Filter out instances that are already terminated or shutting-down + active_instances = [ + instance["InstanceId"] + for reservation in response["Reservations"] + for instance in reservation["Instances"] + if instance["State"]["Name"] not in ["terminated", "shutting-down"] + ] + + if not active_instances: + self.log("info", "No active instances to terminate") + return + + # Calculate resources to release + released_vcpus = 0 + released_memory = 0 + for reservation in response["Reservations"]: + for instance in reservation["Instances"]: + if instance["InstanceId"] in active_instances: + instance_type = instance["InstanceType"] + instance_info = self.get_instance_type_info(instance_type) + released_vcpus += int(instance_info["vcpus"]) + released_memory += float(instance_info["memory"].replace("GB", "")) + + # Terminate active instances + await asyncio.get_event_loop().run_in_executor( + None, lambda: self.ec2.terminate_instances(InstanceIds=active_instances) + ) + self.log("info", "Instance termination requested", instance_ids=active_instances) + + # Wait for termination with state validation + max_attempts = 40 + attempt = 0 + while attempt < max_attempts: + # Get current states + status_response = await asyncio.get_event_loop().run_in_executor( + None, lambda: self.ec2.describe_instances(InstanceIds=active_instances) + ) + + # Check if all instances are terminated + terminated_count = sum( + 1 + for reservation in status_response["Reservations"] + for instance in reservation["Instances"] + if 
instance["State"]["Name"] in ["terminated", "shutting-down"] + ) + + if terminated_count == len(active_instances): + break + + attempt += 1 + await asyncio.sleep(5) + + if attempt >= max_attempts: + self.log( + "warning", + "Timeout waiting for instances to terminate", + instance_ids=active_instances, + ) + raise RuntimeError( + f"Timeout waiting for instances to terminate. " + f"Current states: {', '.join(set(instance['State']['Name'] for reservation in status_response['Reservations'] for instance in reservation['Instances']))}" + ) + # Update resource tracking + self.current_vcpus = max(0, self.current_vcpus - released_vcpus) + self.current_memory = max(0, self.current_memory - released_memory) + + self.log( + "info", + "Instances terminated", + instance_ids=instance_ids, + released_vcpus=released_vcpus, + released_memory=f"{released_memory}GB", + remaining_vcpus=self.current_vcpus, + remaining_memory=f"{self.current_memory}GB", + ) + + except Exception as e: + self.log("error", "Error during instance cleanup", error=str(e)) + raise + + @rate_limited(max_rate=5, time_window=1.0) # 5 requests per second for state updates + async def update_instance_states(self, instance_ids: List[str]) -> Dict[str, Dict]: + """Update and return current states for given instance IDs with state machine validation + and detailed lifecycle tracking""" + async with self.state_lock: + try: + # Get both instance details and status checks in parallel + instance_response, status_response = await asyncio.gather( + asyncio.get_event_loop().run_in_executor( + None, lambda: self.ec2.describe_instances(InstanceIds=instance_ids) + ), + asyncio.get_event_loop().run_in_executor( + None, + lambda: self.ec2.describe_instance_status( + InstanceIds=instance_ids, IncludeAllInstances=True + ), + ), + ) + + # Create status mapping for quick lookup + status_map = { + status["InstanceId"]: status + for status in status_response.get("InstanceStatuses", []) + } + + # Update our state tracking + for reservation in instance_response["Reservations"]: + for instance in reservation["Instances"]: + instance_id = instance["InstanceId"] + new_state = instance["State"]["Name"] + + # Get current state if exists + current_state = self.instance_states.get(instance_id, {}).get( + "state", "pending" + ) + + # Validate state transition + if not self.validate_state_transition(instance_id, new_state): + self.log( + "warning", + "Invalid state transition attempted", + instance_id=instance_id, + current_state=current_state, + new_state=new_state, + ) + continue + + # Get system health status + system_status = "unknown" + instance_status = "unknown" + if instance_id in status_map: + status = status_map[instance_id] + system_status = status["SystemStatus"]["Status"] + instance_status = status["InstanceStatus"]["Status"] + + # Record lifecycle event + # Get instance type info + instance_info = self.get_instance_type_info(instance["InstanceType"]) + + lifecycle_event = { + "timestamp": datetime.now().isoformat(), + "state": new_state, + "system_status": system_status, + "instance_status": instance_status, + "ip": instance.get("PublicIpAddress", ""), + "type": instance["InstanceType"], + "details": { + "cpu": instance_info["vcpus"], + "memory": instance_info["memory"], + "network": instance_info["network"], + "ebs_optimized": instance_info["ebs_optimized"], + }, + } + + async with self.lifecycle_lock: + if instance_id not in self.lifecycle_events: + self.lifecycle_events[instance_id] = [] + self.lifecycle_events[instance_id].append(lifecycle_event) + + # 
Update state tracking + self.instance_states[instance_id] = { + "state": new_state, + "ip": instance.get("PublicIpAddress", ""), + "type": instance["InstanceType"], + "launch_time": instance["LaunchTime"].isoformat(), + "system_status": system_status, + "instance_status": instance_status, + "state_history": self.instance_states.get(instance_id, {}).get( + "state_history", [] + ) + + [{"state": new_state, "timestamp": datetime.now().isoformat()}], + "lifecycle_events": self.lifecycle_events.get(instance_id, []), + } + + return self.instance_states + + except Exception as e: + self.debug_log(f"Error updating instance states: {str(e)}") + raise + + @rate_limited(max_rate=5, time_window=1.0) # 5 requests per second for polling + async def poll_instance_status( + self, + instance_ids: List[str], + initial_delay: float = 1.0, + max_delay: float = 30.0, + progress_tracker: Progress = None, + task: int = None, + ) -> None: + """Poll EC2 for the status of instances until all are running with exponential backoff""" + global progress, progress_task, layout + progress_task = progress.add_task( + "Waiting for instances to start...", total=len(instance_ids) + ) + + current_delay = initial_delay + max_attempts = 60 + attempt = 0 + + while attempt < max_attempts: + try: + # Get current states + states = await self.update_instance_states(instance_ids) + + running_count = sum(1 for state in states.values() if state["state"] == "running") + + terminated_count = sum( + 1 + for state in states.values() + if state["state"] in ["terminated", "shutting-down"] + ) + + if terminated_count > 0: + raise RuntimeError( + f"{terminated_count} instances were terminated while waiting for startup. This usually indicates insufficient spot capacity." + ) + + if progress_tracker and task is not None: + progress_tracker.update( + task, + completed=running_count, + description=f"[yellow]Running: {running_count}/{len(instance_ids)}[/yellow]", + ) + live.refresh() + layout["progress"].update(progress) + + if running_count == len(instance_ids): + break + + # Exponential backoff with jitter + await asyncio.sleep(current_delay) + current_delay = min(max_delay, current_delay * 1.5) * (1 + random.random()) + attempt += 1 + + except self.ec2.exceptions.ClientError as e: + if "InvalidInstanceID.NotFound" in str(e): + raise RuntimeError( + "Some instances disappeared while waiting for startup. This usually indicates insufficient spot capacity." 
+ ) + # Retry on throttling errors + if "RequestLimitExceeded" in str(e): + await asyncio.sleep(current_delay) + current_delay = min(max_delay, current_delay * 1.5) * (1 + random.random()) + attempt += 1 + continue + raise + except Exception as e: + # General error handling with retry + self.debug_log(f"Polling error: {str(e)}") + await asyncio.sleep(current_delay) + current_delay = min(max_delay, current_delay * 1.5) * (1 + random.random()) + attempt += 1 + continue + + if attempt >= max_attempts: + if progress_tracker and task is not None: + progress_tracker.update( + task, + description="[red]Timeout waiting for instances[/red]", + completed=running_count, + ) + live.refresh() + raise RuntimeError( + f"Timeout waiting for instances to start after {max_attempts} attempts" + ) + + def list_instances(self, filters: List[Dict] = None) -> None: + """List all running instances with optional filters""" + try: + # Default filter for managed instances + default_filters = [ + { + "Name": "tag:ManagedBy", + "Values": ["SpotManager"], + }, + {"Name": "instance-state-name", "Values": ["pending", "running"]}, + ] + + # Merge with any additional filters + if filters: + default_filters.extend(filters) + + response = self.ec2.describe_instances(Filters=default_filters) + + instances = [ + instance + for reservation in response["Reservations"] + for instance in reservation["Instances"] + ] + + # Update layout with consistent styling + layout["header"].update( + Panel( + "[bold blue]Bacalhau Spot Manager[/bold blue]\n" + f"[dim]Listing {len(instances)} instances[/dim]", + style="white on #28464B", + border_style="blue", + ) + ) + + # Create and display the table + table = self.create_instance_table(instances) + layout["body"].update(Panel(table, border_style="blue")) + layout["status"].update( + Panel( + "[green]✓ Instance list loaded successfully[/green]", + border_style="green", + ) + ) + live.refresh() + + except Exception as e: + console.print(f"[red]Error listing instances: {str(e)}[/red]") + raise + + @rate_limited(max_rate=2, time_window=1.0) # 2 requests per second for terminations + async def terminate_instances(self, instance_ids: List[str], batch_size: int = 50) -> None: + """Terminate specified instances in batches + + Args: + instance_ids: List of instance IDs to terminate + batch_size: Number of instances per batch (default: 50) + """ + if not instance_ids: + return + + global progress, progress_task, layout + progress_task = progress.add_task("Terminating instances...", total=len(instance_ids)) + layout["progress"].update(progress) + + try: + # Process instances in batches + batches = (len(instance_ids) + batch_size - 1) // batch_size + terminated_count = 0 + + for batch_num in range(batches): + start_idx = batch_num * batch_size + end_idx = min((batch_num + 1) * batch_size, len(instance_ids)) + batch_ids = instance_ids[start_idx:end_idx] + + # Update progress + progress.update( + progress_task, + description=f"Terminating batch {batch_num + 1}/{batches} ({len(batch_ids)} instances)...", + completed=terminated_count, + ) + layout["progress"].update(progress) + live.refresh() + + try: + # Terminate batch + await asyncio.get_event_loop().run_in_executor( + None, lambda: self.ec2.terminate_instances(InstanceIds=batch_ids) + ) + + # Wait for termination with async waiter + waiter = self.ec2.get_waiter("instance_terminated") + await asyncio.get_event_loop().run_in_executor( + None, + lambda: waiter.wait( + InstanceIds=batch_ids, + WaiterConfig={"Delay": 5, "MaxAttempts": 40}, + ), + ) + + 
terminated_count += len(batch_ids) + progress.update(progress_task, completed=terminated_count) + layout["progress"].update(progress) + live.refresh() + + # Update status after each batch + layout["status"].update( + Panel( + f"[green]Terminated {terminated_count}/{len(instance_ids)} instances[/green]", + border_style="green", + ) + ) + + except Exception as e: + layout["status"].update( + Panel( + f"[red]Error terminating batch {batch_num + 1}: {str(e)}[/red]", + border_style="red", + ) + ) + raise + + # Wait briefly between batches to avoid rate limits + if batch_num < batches - 1: + await asyncio.sleep(2) + + progress.update( + progress_task, + description="All instances terminated", + completed=len(instance_ids), + ) + layout["progress"].update(progress) + + except Exception as e: + console.print(f"[red]Error terminating instances: {str(e)}[/red]") + raise + + @rate_limited(max_rate=2, time_window=1.0) # 2 requests per second for launches + async def launch_instances(self, count: int, batch_size: int = 50) -> List[str]: + """Launch specified number of spot instances in batches with cleanup handler + and automatic error recovery + + Args: + count: Total number of instances to launch + batch_size: Number of instances per batch (default: 50) + + Returns: + List of launched instance IDs + + Raises: + ValueError: If launch configuration is invalid + RuntimeError: If launch fails + """ + global progress, progress_task, layout, live + + # Create detailed progress tracking + launch_progress = Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(bar_width=None), + TaskProgressColumn(), + TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), + TimeElapsedColumn(), + TimeRemainingColumn(), + expand=True, + ) + + # Create subtasks for each phase + main_task = launch_progress.add_task("[cyan]Launching instances...", total=100) + phase1 = launch_progress.add_task("[yellow]Validating configuration...", total=1) + phase2 = launch_progress.add_task("[yellow]Requesting instances...", total=1) + phase3 = launch_progress.add_task("[yellow]Waiting for instances...", total=1) + + # Update layout with detailed progress + layout["progress"].update( + Panel(launch_progress, title="Launch Progress", border_style="blue") + ) + live.refresh() + + # Update progress for validation phase + launch_progress.update(phase1, description="[green]Validating configuration...") + live.refresh() + + # Validate launch configuration + if not self.configured_ami_id: + launch_progress.update( + phase1, description="[red]Error: AMI ID not configured[/red]", completed=1 + ) + live.refresh() + raise ValueError("AMI ID is not configured") + if not self.key_name: + raise ValueError("Key pair name is not configured") + if not self.instance_type: + raise ValueError("Instance type is not configured") + + # Validate instance count against various limits + if count < self.min_instances: + raise ValueError( + f"Cannot launch fewer than {self.min_instances} instances. Requested: {count}" + ) + + if count > self.max_instances_per_launch: + raise ValueError( + f"Cannot launch more than {self.max_instances_per_launch} instances at once. " + f"Requested: {count}" + ) + + # Check current instance count + current_count = len(self.get_all_instance_ids()) + if current_count + count > self.max_instances: + raise ValueError( + f"Cannot launch {count} instances. Would exceed max limit of {self.max_instances}. 
" + f"Current instances: {current_count}" + ) + + # Check resource limits + instance_info = self.get_instance_type_info(self.instance_type) + requested_vcpus = int(instance_info["vcpus"]) * count + requested_memory = float(instance_info["memory"].replace("GB", "")) * count + + if self.current_vcpus + requested_vcpus > self.max_total_vcpus: + raise ValueError( + f"Cannot launch {count} instances. Would exceed vCPU limit of {self.max_total_vcpus}. " + f"Current vCPUs: {self.current_vcpus}, Requested: {requested_vcpus}" + ) + + if self.current_memory + requested_memory > self.max_total_memory: + raise ValueError( + f"Cannot launch {count} instances. Would exceed memory limit of {self.max_total_memory}GB. " + f"Current memory: {self.current_memory}GB, Requested: {requested_memory}GB" + ) + + if not self.configured_ami_id: + raise ValueError("No AMI ID configured. Please run build-ami.sh first.") + + security_group_id = self.ensure_security_group() + startup_script_path = PROJECT_ROOT / "fleet" / "scripts" / "startup.sh" + + if not startup_script_path.exists(): + raise FileNotFoundError(f"Startup script not found at {startup_script_path}") + + retry_count = 0 + while retry_count < self.max_retries: + try: + # Update progress for request phase + launch_progress.update( + phase2, + description=f"[green]Requesting instances (attempt {retry_count + 1}/{self.max_retries})...", + ) + live.refresh() + + # Update resource tracking + self.current_vcpus += requested_vcpus + self.current_memory += requested_memory + + # Update main progress + launch_progress.update(main_task, completed=25) + + # Update progress for instance creation + launch_progress.update(phase2, description="[green]Creating instances...") + live.refresh() + + # Calculate number of batches needed + batches = (count + batch_size - 1) // batch_size + instance_ids = [] + + # Process batches sequentially with error handling + for batch_num in range(batches): + batch_count = min(batch_size, count - (batch_num * batch_size)) + + # Update progress for batch + launch_progress.update( + phase2, + description=f"[green]Processing batch {batch_num + 1}/{batches} ({batch_count} instances)[/green]", + ) + live.refresh() + + # Track batch resources + batch_vcpus = int(instance_info["vcpus"]) * batch_count + batch_memory = float(instance_info["memory"].replace("GB", "")) * batch_count + + # Try batch with retries + batch_retries = 3 + for attempt in range(batch_retries): + try: + # Run batch asynchronously + response = await asyncio.get_event_loop().run_in_executor( + None, + lambda: self.ec2.run_instances( + ImageId=self.configured_ami_id, + InstanceType=self.instance_type, + KeyName=self.key_name, + SecurityGroupIds=[security_group_id], + TagSpecifications=[ + { + "ResourceType": "instance", + "Tags": [ + {"Key": key, "Value": value} + for key, value in self.default_tags.items() + ] + + [ + { + "Key": "LaunchGroup", + "Value": f"scale-test-{datetime.now().strftime('%Y%m%d-%H%M%S')}", + } + ], + } + ], + IamInstanceProfile={"Name": "BacalhauScaleTestRole"}, + UserData=startup_script_path.read_text(), + InstanceMarketOptions={ + "MarketType": "spot", + "SpotOptions": { + "SpotInstanceType": "one-time", + "InstanceInterruptionBehavior": "terminate", + }, + }, + MinCount=batch_count, + MaxCount=batch_count, + ), + ) + + # Collect instance IDs + batch_ids = [i["InstanceId"] for i in response["Instances"]] + instance_ids.extend(batch_ids) + + # Update resource tracking + self.current_vcpus += batch_vcpus + self.current_memory += batch_memory + + # Wait 
briefly between batches to avoid rate limits + if batch_num < batches - 1: + await asyncio.sleep(2) + + break # Success - exit retry loop + + except Exception as e: + # Handle batch failure + self.log( + "warning", + "Batch launch failed", + batch_num=batch_num, + attempt=attempt + 1, + error=str(e), + ) + + # Rollback resource tracking + self.current_vcpus -= batch_vcpus + self.current_memory -= batch_memory + + if attempt == batch_retries - 1: + raise RuntimeError( + f"Failed to launch batch {batch_num + 1} after {batch_retries} attempts" + ) + + # Exponential backoff before retry + await asyncio.sleep(2**attempt) + + self._created_resources["instances"].update(instance_ids) + + # Update progress + launch_progress.update( + phase2, + description=f"[green]Created {len(instance_ids)} instances[/green]", + completed=1, + ) + launch_progress.update(main_task, completed=50) + live.refresh() + + # Create cleanup task + cleanup_task = asyncio.create_task(self._cleanup_instances(instance_ids)) + self._cleanup_tasks.add(cleanup_task) + cleanup_task.add_done_callback(self._cleanup_tasks.discard) + + try: + # Update progress for waiting phase + launch_progress.update( + phase3, + description="[yellow]Waiting for instances to start...", + total=len(instance_ids), + ) + live.refresh() + + # Poll for instance status with progress updates + await self.poll_instance_status( + instance_ids, progress=launch_progress, task=phase3 + ) + + # Update main progress + launch_progress.update(main_task, completed=100) + launch_progress.update( + phase3, + description="[green]All instances running[/green]", + completed=len(instance_ids), + ) + live.refresh() + + return instance_ids + except Exception as e: + # If launch fails, ensure cleanup + await cleanup_task + raise + + except Exception as e: + # Handle recoverable errors + error_code = getattr(e, "response", {}).get("Error", {}).get("Code", "") + if error_code in self.recovery_actions: + recovery_action = self.recovery_actions[error_code] + await recovery_action(e, count, batch_size) + + # Exponential backoff before retry + await asyncio.sleep(self.retry_delay) + self.retry_delay = min(self.retry_delay * 2, self.max_retry_delay) + retry_count += 1 + continue + + # Non-recoverable error + console.print(f"[red]Error launching instances: {str(e)}[/red]") + raise + + def get_all_instance_ids(self, filters: List[Dict] = None) -> List[str]: + """Get all running instance IDs with optional filters""" + try: + # Default filter for managed instances + default_filters = [ + { + "Name": "tag:ManagedBy", + "Values": ["SpotManager"], + }, + {"Name": "instance-state-name", "Values": ["pending", "running"]}, + ] + + # Merge with any additional filters + if filters: + default_filters.extend(filters) + + response = self.ec2.describe_instances(Filters=default_filters) + + return [ + instance["InstanceId"] + for reservation in response["Reservations"] + for instance in reservation["Instances"] + ] + + except Exception as e: + console.print(f"[red]Error getting instance IDs: {str(e)}[/red]") + raise + + def wait_for_instances_running(self, instance_ids: List[str]) -> None: + """Wait for instances to be in running state""" + global progress, progress_task, layout + progress_task = progress.add_task("Waiting for instances...", total=len(instance_ids)) + layout["progress"].update(progress) + + max_attempts = 60 + attempt = 0 + + while attempt < max_attempts: + try: + response = self.ec2.describe_instances(InstanceIds=instance_ids) + running_count = sum( + 1 + for reservation in 
response["Reservations"] + for instance in reservation["Instances"] + if instance["State"]["Name"] == "running" + ) + terminated_count = sum( + 1 + for reservation in response["Reservations"] + for instance in reservation["Instances"] + if instance["State"]["Name"] in ["terminated", "shutting-down"] + ) + + if terminated_count > 0: + raise RuntimeError( + f"{terminated_count} instances were terminated while waiting for startup. This usually indicates insufficient spot capacity." + ) + + progress.update( + progress_task, + completed=running_count, + description=f"Running: {running_count}/{len(instance_ids)}", + ) + layout["progress"].update(progress) + + if running_count == len(instance_ids): + break + + time.sleep(5) + attempt += 1 + + except self.ec2.exceptions.ClientError as e: + if "InvalidInstanceID.NotFound" in str(e): + raise RuntimeError( + "Some instances disappeared while waiting for startup. This usually indicates insufficient spot capacity." + ) + raise + + if attempt >= max_attempts: + raise RuntimeError( + f"Timeout waiting for instances to start. Current states: {', '.join(set(instance['State']['Name'] for reservation in response['Reservations'] for instance in reservation['Instances']))}" + ) + + def verify_bacalhau_access(self) -> None: + """Verify that we can access Bacalhau CLI and have correct permissions""" + try: + result = subprocess.run( + ["bacalhau", "node", "list", "--output", "json"], + capture_output=True, + text=True, + check=True, + ) + nodes = json.loads(result.stdout) + + if isinstance(nodes, dict): + if "nodes" in nodes: + nodes = nodes["nodes"] + elif "data" in nodes: + nodes = nodes["data"] + else: + raise ValueError(f"Unexpected response structure: {list(nodes.keys())}") + + if not isinstance(nodes, list): + raise ValueError(f"Unexpected nodes type: {type(nodes)}") + + self.debug_log(f"Successfully verified Bacalhau access. Found {len(nodes)} nodes.") + + except json.JSONDecodeError as e: + raise ValueError(f"Failed to parse Bacalhau node list output as JSON: {str(e)}") + except subprocess.CalledProcessError as e: + raise RuntimeError(f"Failed to run 'bacalhau node list'. Error: {e.stderr}") + except FileNotFoundError: + raise RuntimeError( + "Bacalhau CLI not found. Please install Bacalhau CLI and ensure it's in your PATH." 
+ ) + + async def check_bacalhau_node_status( + self, instance_id: str, max_retries: int = 3, timeout: int = 10 + ) -> Tuple[bool, str]: + """Check if a Bacalhau node is healthy using bacalhau node list with retries + + Args: + instance_id: Instance ID to check + max_retries: Maximum number of retry attempts + timeout: Timeout in seconds for each attempt + + Returns: + Tuple of (health status, status message) + """ + retry_delay = 1 # Start with 1 second delay + + for attempt in range(max_retries): + try: + # Run bacalhau node list command with timeout + result = await asyncio.wait_for( + asyncio.get_event_loop().run_in_executor( + None, + lambda: subprocess.run( + ["bacalhau", "node", "list", "--output", "json"], + capture_output=True, + text=True, + check=True, + ), + ), + timeout=timeout, + ) + + # Parse JSON output + try: + nodes = json.loads(result.stdout) + if self.debug: + self.debug_log(f"Node check response type: {type(nodes)}") + self.debug_log(f"Node check response count: {len(nodes)}") + + # Ensure nodes is a list + if not isinstance(nodes, list): + return False, f"Invalid response type: {type(nodes)}" + + # Iterate through each node to find a match + for node in nodes: + if not isinstance(node, dict): + continue + + # Extract node info + info = node.get("Info", {}) + if not isinstance(info, dict): + continue + + # Extract labels and public IP + labels = info.get("Labels", {}) + if not isinstance(labels, dict): + continue + + found_instance_id = labels.get("INSTANCE_ID", "") + if not found_instance_id: + continue + + # Check if the IP matches + if found_instance_id == instance_id: + # Extract connection state + connection_state = node.get("ConnectionState", {}) + if not isinstance(connection_state, dict): + return False, "Invalid connection state format" + + # Determine node health based on connection status + status = connection_state.get("Status", "UNKNOWN") + if status == "CONNECTED": + return True, "Connected" + else: + # Include last error if available + last_error = connection_state.get("LastError", "") + return False, f"Status: {status}" + ( + f" ({last_error})" if last_error else "" + ) + + return False, "" + + except json.JSONDecodeError as e: + if self.debug: + self.debug_log(f"Raw response: {result.stdout}") + return False, f"Invalid JSON response: {str(e)}" + + except subprocess.CalledProcessError as e: + error_msg = e.stderr.decode() if isinstance(e.stderr, bytes) else e.stderr + self.debug_log(f"Error running bacalhau node list: {error_msg}") + if attempt < max_retries - 1: + await asyncio.sleep(retry_delay) + retry_delay = min(retry_delay * 2, 10) # Cap at 10 seconds + continue + return False, f"Command failed: {error_msg}" + except asyncio.TimeoutError: + self.debug_log(f"Node status check timed out for {instance_id}") + if attempt < max_retries - 1: + await asyncio.sleep(retry_delay) + retry_delay = min(retry_delay * 2, 10) # Cap at 10 seconds + continue + return False, "Timeout" + except Exception as e: + self.debug_log(f"Unexpected error checking node status: {str(e)}") + if attempt < max_retries - 1: + await asyncio.sleep(retry_delay) + retry_delay = min(retry_delay * 2, 10) # Cap at 10 seconds + continue + return False, str(e) + + async def check_all_nodes_status(self, instance_ids: List[str]) -> Dict[str, Tuple[bool, str]]: + """Check status of all Bacalhau nodes""" + global progress, progress_task + results = {} + for instance_id in instance_ids: + # Update the global progress task description + progress.update( + progress_task, description=f"Checking 
Bacalhau status of {instance_id}..." + ) + status, message = await self.check_bacalhau_node_status(instance_id) + results[instance_id] = (status, message) + return results + + def validate_instance_type(self, instance_type: str) -> bool: + """Validate if an instance type is supported and available""" + try: + # Check if instance type exists + response = self.ec2.describe_instance_types(InstanceTypes=[instance_type]) + if not response["InstanceTypes"]: + return False + + # Check instance type capabilities + instance_info = response["InstanceTypes"][0] + + # Must support HVM virtualization + if "hvm" not in instance_info.get("SupportedVirtualizationTypes", []): + return False + + # Must support at least 1 network interface + if instance_info.get("NetworkInfo", {}).get("MaximumNetworkInterfaces", 0) < 1: + return False + + return True + + except Exception as e: + self.debug_log(f"Error validating instance type: {str(e)}") + return False + + def get_instance_type_info(self, instance_type: str) -> Dict[str, str]: + """Get CPU and memory information for an instance type""" + if instance_type not in self._instance_type_cache: + try: + if not self.validate_instance_type(instance_type): + raise ValueError(f"Instance type {instance_type} is not supported") + + response = self.ec2.describe_instance_types(InstanceTypes=[instance_type]) + info = response["InstanceTypes"][0] + self._instance_type_cache[instance_type] = { + "vcpus": str(info["VCpuInfo"]["DefaultVCpus"]), + "memory": f"{info['MemoryInfo']['SizeInMiB'] / 1024:.1f}GB", + "type": instance_type, + "network": str(info["NetworkInfo"]["MaximumNetworkInterfaces"]), + "ebs_optimized": info["EbsInfo"]["EbsOptimizedSupport"] == "supported", + "supported_architectures": info["ProcessorInfo"]["SupportedArchitectures"], + } + except Exception as e: + self.debug_log(f"Error getting instance type info: {str(e)}") + self._instance_type_cache[instance_type] = { + "vcpus": "?", + "memory": "?", + "type": instance_type, + "network": "?", + "ebs_optimized": False, + "supported_architectures": [], + } + return self._instance_type_cache[instance_type] + + async def _handle_instance_limit_error(self, error, count, batch_size): + """Handle instance limit exceeded errors""" + self.log("warning", "Instance limit exceeded", error=str(error)) + + # Reduce requested count and retry + new_count = min(count, self.max_instances_per_launch // 2) + if new_count < self.min_instances: + raise RuntimeError("Cannot reduce instance count below minimum") + + self.log("info", f"Reducing instance count from {count} to {new_count}") + return await self.launch_instances(new_count, batch_size) + + async def _handle_capacity_error(self, error, count, batch_size): + """Handle insufficient capacity errors""" + self.log("warning", "Insufficient instance capacity", error=str(error)) + + # Try different instance type + alt_instance_type = self._get_alternative_instance_type() + if not alt_instance_type: + raise RuntimeError("No alternative instance types available") + + self.log("info", f"Trying alternative instance type: {alt_instance_type}") + original_type = self.instance_type + self.instance_type = alt_instance_type + try: + return await self.launch_instances(count, batch_size) + finally: + self.instance_type = original_type + + async def _handle_spot_limit_error(self, error, count, batch_size): + """Handle spot instance limit errors""" + self.log("warning", "Spot instance limit exceeded", error=str(error)) + + # Reduce batch size and retry + new_batch_size = max(1, batch_size // 2) + 
self.log("info", f"Reducing batch size from {batch_size} to {new_batch_size}") + return await self.launch_instances(count, new_batch_size) + + async def _handle_rate_limit_error(self, error, count, batch_size): + """Handle API rate limit errors""" + self.log("warning", "API rate limit exceeded", error=str(error)) + + # Wait and retry with reduced rate + self.rate_limiter.max_rate = max(1, self.rate_limiter.max_rate // 2) + self.log("info", f"Reduced API rate to {self.rate_limiter.max_rate} req/sec") + await asyncio.sleep(self.retry_delay) + return await self.launch_instances(count, batch_size) + + async def _handle_service_unavailable(self, error, count, batch_size): + """Handle service unavailable errors""" + self.log("warning", "Service unavailable", error=str(error)) + + # Wait and retry + await asyncio.sleep(self.retry_delay) + return await self.launch_instances(count, batch_size) + + async def _handle_internal_error(self, error, count, batch_size): + """Handle AWS internal errors""" + self.log("warning", "AWS internal error", error=str(error)) + + # Wait and retry + await asyncio.sleep(self.retry_delay) + return await self.launch_instances(count, batch_size) + + def _get_alternative_instance_type(self) -> Optional[str]: + """Get alternative instance type with similar specs""" + current_info = self.get_instance_type_info(self.instance_type) + alternatives = [ + "t3.micro", + "t3.small", + "t3.medium", # Burstable instances + "m5.large", + "m5.xlarge", # General purpose + "c5.large", + "c5.xlarge", # Compute optimized + ] + + # Try to find similar instance type + for alt_type in alternatives: + if alt_type == self.instance_type: + continue + + alt_info = self.get_instance_type_info(alt_type) + if ( + alt_info["vcpus"] >= current_info["vcpus"] + and alt_info["memory"] >= current_info["memory"] + ): + return alt_type + + return None + + def _cleanup_resources(self) -> None: + """Cleanup all created resources""" + self.log("info", "Starting resource cleanup") + + # Cleanup instances + if self._created_resources["instances"]: + try: + self.ec2.terminate_instances(InstanceIds=list(self._created_resources["instances"])) + self.log( + "info", + "Terminated instances", + instance_ids=list(self._created_resources["instances"]), + ) + except Exception as e: + self.log("error", "Error terminating instances", error=str(e)) + + # Cleanup security groups + if self._created_resources["security_groups"]: + for group_id in self._created_resources["security_groups"]: + try: + self.ec2.delete_security_group(GroupId=group_id) + self.log("info", "Deleted security group", group_id=group_id) + except Exception as e: + self.log( + "error", "Error deleting security group", group_id=group_id, error=str(e) + ) + + # Cleanup key pairs + if self._created_resources["key_pairs"]: + for key_name in self._created_resources["key_pairs"]: + try: + self.ec2.delete_key_pair(KeyName=key_name) + self.log("info", "Deleted key pair", key_name=key_name) + except Exception as e: + self.log("error", "Error deleting key pair", key_name=key_name, error=str(e)) + + self._created_resources = {"instances": set(), "security_groups": set(), "key_pairs": set()} + self.log("info", "Resource cleanup completed") + + async def _cleanup_all(self) -> None: + """Cleanup all resources""" + self.log("info", "Starting full cleanup") + + # Get all running instances + instance_ids = await asyncio.get_event_loop().run_in_executor( + None, self.get_all_instance_ids + ) + + # Cleanup instances + if instance_ids: + await 
self._cleanup_instances(instance_ids) + + # Cleanup other resources + self._cleanup_resources() + + self.log("info", "Full cleanup completed") + + async def run_stress_test( + self, + min_nodes: int = 250, + max_nodes: int = 750, + iterations: int = 10, + health_check_timeout: int = 300, + ) -> None: + """Run stress test with random node counts""" + # Validate stress test parameters + if min_nodes <= 0 or max_nodes <= 0: + raise ValueError("Node counts must be greater than 0") + + if min_nodes > max_nodes: + raise ValueError("min_nodes cannot be greater than max_nodes") + + if iterations <= 0: + raise ValueError("Iterations must be greater than 0") + + if health_check_timeout <= 0: + raise ValueError("Health check timeout must be greater than 0") + + # Check max nodes against instance limits + if max_nodes > self.max_instances: + raise ValueError( + f"max_nodes ({max_nodes}) exceeds maximum instance limit ({self.max_instances})" + ) + global progress, progress_task, layout, live + + # Initialize the status table with tighter column widths + status_table = Table( + show_header=True, + header_style="bold magenta", + title="Node Status", + title_style="bold blue", + expand=True, + ) + status_table.add_column("ID", style="cyan", no_wrap=True, width=20) + status_table.add_column("State", style="green", width=10) + status_table.add_column("CPU", style="yellow", justify="right", width=6) + status_table.add_column("Mem", style="yellow", justify="right", width=6) + status_table.add_column("InstType", style="yellow", width=10) + status_table.add_column("IP Address", style="blue", width=15) + status_table.add_column("🐟", style="cyan", width=4) + + # Start progress tracking + progress_task = progress.add_task("Waiting...", total=None) + + def update_layout_error(error_msg: str, details: str = ""): + """Helper to update layout in error state""" + current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + layout["header"].update( + Panel( + f"[bold blue]Bacalhau Scale Test[/bold blue]\n" + f"[red]Error State[/red]\n" + f"Time: {current_time}", + style="white on red", + ) + ) + layout["status"].update( + Panel( + f"[red bold]Error:[/red bold] {error_msg}", + style="red", + ) + ) + status_table.rows = [] + status_table.add_row("Error Type", "Failed", error_msg) + if details: + status_table.add_row("Details", "Info", details) + status_table.add_row( + "Recovery", "Action Required", "Please check AWS quotas and permissions" + ) + layout["body"].update(status_table) + + def cleanup_instances(): + """Helper to cleanup instances on exit""" + if "instance_ids" in locals(): + try: + progress.update(progress_task, description="Cleaning up instances...") + layout["progress"].update(progress) + status_table.add_row("Cleanup", "In Progress", "Terminating instances...") + layout["body"].update(status_table) + self.terminate_instances(instance_ids) + status_table.add_row( + "Cleanup", "[green]Complete[/green]", "All instances terminated" + ) + layout["body"].update(status_table) + except Exception as e: + status_table.add_row("Cleanup", "[red]Failed[/red]", f"Error: {str(e)}") + layout["body"].update(status_table) + + try: + # Register cleanup handler for Ctrl+C + loop = asyncio.get_event_loop() + loop.add_signal_handler(signal.SIGINT, lambda: asyncio.create_task(self._cleanup_all())) + + # Verify Bacalhau access before starting + try: + progress.update(progress_task, description="Verifying Bacalhau access...") + layout["progress"].update(progress) + self.verify_bacalhau_access() + except Exception as e: + raise 
RuntimeError(f"Bacalhau verification failed: {str(e)}") + + current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + layout["header"].update( + Panel( + f"[bold blue]Bacalhau Scale Test[/bold blue]\n" + f"Configuration: Nodes: {min_nodes}-{max_nodes} | Iterations: {iterations}\n" + f"Started: {current_time}", + style="white on #28464B", + ) + ) + live.refresh() + + try: + for iteration in range(iterations): + node_count = random.randint(min_nodes, max_nodes) + layout["status"].update( + Panel( + f"[bold]Iteration {iteration + 1}/{iterations}[/bold]\n" + f"Target Nodes: {node_count}\n" + f"Time: {datetime.now().strftime('%H:%M:%S')}", + style="yellow", + ) + ) + + # Reset progress for new iteration + progress.update(progress_task, total=None, completed=0) + progress.update( + progress_task, + description=f"Launching {node_count} instances...", + ) + layout["progress"].update(progress) + live.refresh() + + try: + # Clear and initialize status table + status_table.rows = [] + layout["body"].update(status_table) + live.refresh() + + # Launch instances asynchronously + instance_ids = await self.launch_instances(node_count) + + # Update progress for instance startup + progress.update(progress_task, total=len(instance_ids), completed=0) + progress.update( + progress_task, + description="Waiting for instances to start...", + ) + layout["progress"].update(progress) + live.refresh() + + # Wait for instances to start and join Bacalhau + running_count = 0 + joined_count = 0 + start_time = time.time() + node_states = {} # Track node states and info + + while running_count < len(instance_ids): + # Check if we've exceeded the timeout + if time.time() - start_time > 30: + raise RuntimeError( + "Timeout waiting for nodes to start and join Bacalhau cluster" + ) + + # Get instance states from AWS + response = await asyncio.get_event_loop().run_in_executor( + None, + lambda: self.ec2.describe_instances(InstanceIds=instance_ids), + ) + running_count = 0 + instance_rows = [] + + # Update instance states + for reservation in response["Reservations"]: + for instance in reservation["Instances"]: + instance_id = instance["InstanceId"] + state = instance["State"]["Name"] + instance_type = instance["InstanceType"] + specs = self.get_instance_type_info(instance_type) + ip = instance.get("PublicIpAddress", "pending...") + + # Update node state tracking + if instance_id not in node_states: + node_states[instance_id] = { + "state": state, + "ip": ip, + "specs": specs, + "bacalhau_joined": False, + "bacalhau_status": "❓", # Default status + } + else: + node_states[instance_id]["state"] = state + if ip != "pending...": + node_states[instance_id]["ip"] = ip + + if state == "running": + running_count += 1 + + # Check Bacalhau status for running nodes + if running_count > 0: + running_ips = [ + state["ip"] + for state in node_states.values() + if state["state"] == "running" and state["ip"] != "pending..." + ] + node_results = await self.check_all_nodes_status(running_ips) + + # Debug log Bacalhau API query + if self.debug: + self.debug_log( + f"Queried Bacalhau API. 
Nodes present: {len(node_results)}" + ) + + # Update Bacalhau status + for instance_id, state in node_states.items(): + if state["ip"] in node_results: + is_healthy, message = node_results[state["ip"]] + if is_healthy: + state["bacalhau_joined"] = True + state["bacalhau_status"] = "✅" + else: + state["bacalhau_status"] = f"❌ ({message})" + + # Build table rows with sort key + for instance_id, state in node_states.items(): + status = ( + "[green]running[/green]" + if state["state"] == "running" + else f"[yellow]{state['state']}[/yellow]" + ) + + # Sort key: pending first, then running not joined, then running and joined + sort_key = ( + 0 + if state["state"] != "running" + else 1 + if not state["bacalhau_joined"] + else 2 + ) + + instance_rows.append( + ( + sort_key, + instance_id, + status, + f"{state['specs']['vcpus']}", + state["specs"]["memory"], + state["specs"]["type"], + state["ip"], + state["bacalhau_status"], # Add Bacalhau status + ) + ) + + # Sort and update table + instance_rows.sort(key=lambda x: x[0]) + status_table.rows = [] + for _, *row in instance_rows: + status_table.add_row(*row) + + # Update progress + joined_count = sum( + 1 for state in node_states.values() if state["bacalhau_joined"] + ) + progress.update( + progress_task, + completed=joined_count, + total=len(instance_ids), + description=f"Running: {running_count}, Joined: {joined_count}/{len(instance_ids)}", + ) + + # Update layout + layout["body"].update(status_table) + layout["progress"].update(progress) + live.refresh() + + # Check if all nodes are running and joined + if running_count == len(instance_ids): + if joined_count == len(instance_ids): + break + # If all running but not all joined, wait a bit longer + await asyncio.sleep(2) + else: + await asyncio.sleep(5) + + # Verify all nodes joined + if joined_count < len(instance_ids): + raise RuntimeError( + f"Timeout waiting for nodes to join Bacalhau cluster. " + f"Only {joined_count}/{len(instance_ids)} nodes joined." + ) + + # After all nodes are provisioned, continue to monitor Bacalhau status + while True: + # Check Bacalhau status for all running nodes + running_ips = [ + state["ip"] + for state in node_states.values() + if state["state"] == "running" and state["ip"] != "pending..." + ] + node_results = await self.check_all_nodes_status(running_ips) + + # Debug log Bacalhau API query + if self.debug: + self.debug_log( + f"Queried Bacalhau API. 
Nodes present: {len(node_results)}" + ) + + # Update Bacalhau status + for instance_id, state in node_states.items(): + if state["ip"] in node_results: + is_healthy, message = node_results[state["ip"]] + if is_healthy: + state["bacalhau_joined"] = True + state["bacalhau_status"] = "✅" + else: + state["bacalhau_status"] = f"❌ ({message})" + + # Build table rows with sort key + instance_rows = [] + for instance_id, state in node_states.items(): + status = ( + "[green]running[/green]" + if state["state"] == "running" + else f"[yellow]{state['state']}[/yellow]" + ) + + # Sort key: pending first, then running not joined, then running and joined + sort_key = ( + 0 + if state["state"] != "running" + else 1 + if not state["bacalhau_joined"] + else 2 + ) + + instance_rows.append( + ( + sort_key, + instance_id, + status, + f"{state['specs']['vcpus']}", + state["specs"]["memory"], + state["specs"]["type"], + state["ip"], + state["bacalhau_status"], # Add Bacalhau status + ) + ) + + # Sort and update table + instance_rows.sort(key=lambda x: x[0]) + status_table.rows = [] + for _, *row in instance_rows: + status_table.add_row(*row) + + # Update progress + joined_count = sum( + 1 for state in node_states.values() if state["bacalhau_joined"] + ) + progress.update( + progress_task, + completed=joined_count, + total=len(instance_ids), + description=f"Running: {running_count}, Joined: {joined_count}/{len(instance_ids)}", + ) + + # Update layout + layout["body"].update(status_table) + layout["progress"].update(progress) + live.refresh() + + # Check if all nodes are running and joined + if joined_count == len(instance_ids): + break + + await asyncio.sleep(5) + + except Exception as e: + error_msg = str(e) + if "MaxSpotInstanceCountExceeded" in error_msg: + details = ( + "AWS Spot Instance quota exceeded.\n" + "Please request a quota increase in AWS Console:\n" + "EC2 > Limits > Spot Instance Requests" + ) + else: + details = f"Error occurred during iteration {iteration + 1}" + update_layout_error(error_msg, details) + live.refresh() + break + + finally: + cleanup_instances() + live.refresh() + + if iteration < iterations - 1: + await asyncio.sleep(10) + + except KeyboardInterrupt: + update_layout_error("Test interrupted by user", "Cleaning up resources...") + live.refresh() + cleanup_instances() + live.refresh() + + except Exception as e: + update_layout_error(str(e)) + live.refresh() + + def create_instance_table(self, instances: List[Dict[str, Any]]) -> Table: + """Create and populate a table with detailed instance status information + including lifecycle events.""" + table = Table( + show_header=True, + header_style="bold magenta", + box=ROUNDED, + border_style="blue", + title="Instance Status", + title_style="bold blue", + caption="Last updated: " + datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + caption_style="dim", + ) + + # Add columns with adjusted widths + table.add_column("ID", style="cyan", width=10) + table.add_column("Type", style="green", width=10) + table.add_column("State", style="yellow", width=8) + table.add_column("Zone", style="blue", width=12) + table.add_column("DNS", style="white", width=50) # Wide enough for full DNS + table.add_column("Age", style="green", width=8) + + for instance in instances: + # Get basic instance info + instance_id = instance.get("InstanceId", "N/A") + instance_type = instance.get("InstanceType", "N/A") + state = instance.get("State", {}).get("Name", "N/A") + zone = instance.get("Placement", {}).get("AvailabilityZone", "N/A") + public_dns = 
instance.get("PublicDnsName", "N/A") + + # Calculate instance age + launch_time = instance.get("LaunchTime") + age = "N/A" + if launch_time: + age_delta = datetime.now(launch_time.tzinfo) - launch_time + hours = age_delta.total_seconds() / 3600 + if hours < 24: + age = f"{hours:.1f}h" + else: + age = f"{hours / 24:.1f}d" + + table.add_row( + instance_id, + instance_type, + state, + zone, + public_dns, + age, + ) + + return table + + def terminate_all(self, ctx: Context) -> None: + """Terminate all running instances""" + if ctx is None: + raise ValueError("Context cannot be None") + + global progress, progress_task, layout, live + + # Get instance IDs first + instance_ids = self.get_all_instance_ids() + if not instance_ids: + layout["header"].update(Panel("[bold blue]Bacalhau Spot Manager[/bold blue]")) + layout["body"].update(Panel("[yellow]No instances found[/yellow]")) + layout["status"].update(Panel("[yellow]No instances to terminate[/yellow]")) + layout["progress"].update(progress) + return + + try: + # Create main termination task + progress_task = progress.add_task("Terminating instances...", total=len(instance_ids)) + layout["progress"].update(progress) + + # Update header + layout["header"].update( + Panel( + "[bold blue]Bacalhau Spot Manager[/bold blue]\n" + f"[dim]Terminating {len(instance_ids)} instances[/dim]", + style="white on #28464B", + ) + ) + + # Get instance details + response = self.ec2.describe_instances(InstanceIds=instance_ids) + instances = [ + instance + for reservation in response["Reservations"] + for instance in reservation["Instances"] + ] + + # Create and display initial table + table = self.create_instance_table(instances) + layout["body"].update(Panel(table, border_style="blue")) + layout["status"].update( + Panel( + f"[yellow]Terminating {len(instances)} instances...[/yellow]", + border_style="yellow", + ) + ) + + # Run the async termination in an event loop + asyncio.run(self.terminate_instances(instance_ids)) + + # Final update + layout["status"].update( + Panel( + "[green]✓ All instances terminated successfully[/green]", + border_style="green", + ) + ) + layout["body"].update( + Panel( + "[green]All instances have been terminated[/green]", + border_style="green", + ) + ) + + except Exception as e: + error_msg = f"Error during termination process: {str(e)}" + write_debug(error_msg) + layout["status"].update(Panel(f"[red]{error_msg}[/red]", border_style="red")) + layout["body"].update( + Panel("[red]Termination process failed[/red]", border_style="red") + ) + raise + + def get_running_instances(self): + """Get list of running instances""" + response = self.ec2.describe_instances( + Filters=[{"Name": "instance-state-name", "Values": ["running", "pending"]}] + ) + + instances = [] + for reservation in response["Reservations"]: + instances.extend(reservation["Instances"]) + return instances + + +@click.group() +@click.option( + "--debug/--no-debug", + default=False, + help="Enable debug logging (must be specified before command)", +) +@click.pass_context +def cli(ctx: Context, debug: bool) -> None: + """Manage AWS spot instances for Bacalhau scale testing + + Example usage: + ./spot-manager --debug launch --count 5 + ./spot-manager --debug stress-test + ./spot-manager --debug list + """ + global live + if ctx is None: + raise ValueError("Context cannot be None") + + ctx.obj = SpotManager(debug=debug) + # Start the live display before running commands + live.start() + + +@cli.result_callback() +def cleanup(ctx: Context, debug: bool) -> None: + """Cleanup after 
all commands are done""" + global live, progress, progress_task, layout + + try: + # Clear any progress display + if progress_task is not None: + try: + progress.update(progress_task, visible=False) + layout["progress"].update(progress) + except Exception as e: + write_debug(f"Error clearing progress: {str(e)}") + + # Clear the layout + try: + layout["header"].update("") + layout["body"].update("") + layout["status"].update("") + layout["progress"].update("") + except Exception as e: + write_debug(f"Error clearing layout: {str(e)}") + + # Stop the live display first + if live and live.is_started: + try: + live.stop() + except Exception as e: + write_debug(f"Error stopping live display: {str(e)}") + + # Then handle any remaining cleanup tasks + if ctx is not None and hasattr(ctx, "obj") and ctx.obj is not None: + manager = ctx.obj + if manager._cleanup_tasks: + try: + asyncio.run(asyncio.wait(manager._cleanup_tasks)) + except Exception as e: + write_debug(f"Error during task cleanup: {str(e)}") + + except Exception as e: + write_debug(f"Error during cleanup: {str(e)}") + + +@cli.command() +@click.option("--count", default=1, help="Number of instances to launch") +@click.pass_obj +def launch(manager: SpotManager, count: int): + """Launch spot instances""" + global progress, progress_task, layout, live + + # Update header with consistent styling + layout["header"].update( + Panel( + f"[bold blue]Bacalhau Spot Manager[/bold blue]\n[dim]Launching {count} instances[/dim]", + style="white on #28464B", + border_style="blue", + ) + ) + + progress_task = progress.add_task("Launching instances...", total=count) + layout["progress"].update(progress) + live.refresh() + + try: + instance_ids = asyncio.run(manager.launch_instances(count)) + manager.wait_for_instances_running(instance_ids) + + # Fetch instance details + response = manager.ec2.describe_instances(InstanceIds=instance_ids) + instances = [ + instance + for reservation in response["Reservations"] + for instance in reservation["Instances"] + ] + + # Create and display the table with consistent styling + table = manager.create_instance_table(instances) + layout["body"].update(Panel(table, border_style="blue")) + layout["status"].update( + Panel( + f"[green]✓ Successfully launched {len(instance_ids)} instances[/green]", + border_style="green", + ) + ) + layout["progress"].update(progress) + + # Show the layout + live.refresh() + time.sleep(2) # Give user time to see the final state + + except Exception as e: + layout["status"].update( + Panel( + f"[red]✗ Error launching instances: {str(e)}[/red]", + border_style="red", + ) + ) + live.refresh() + raise + + +@cli.command("list") +@click.option("--tag", multiple=True, help="Filter instances by tag (format: key=value)") +@click.pass_obj +def list_instances(manager: SpotManager, tag): + """List running instances with optional tag filtering""" + global progress, progress_task, layout, live + + try: + # Parse tag filters + filters = [] + if tag: + for t in tag: + if "=" in t: + key, value = t.split("=", 1) + filters.append({"Name": f"tag:{key}", "Values": [value]}) + else: + write_debug(f"Ignoring malformed tag filter: {t}") + console.print( + f"[yellow]Warning: Ignoring malformed tag filter '{t}' (expected key=value)[/yellow]" + ) + + response = manager.ec2.describe_instances(Filters=filters) + instances = [ + instance + for reservation in response["Reservations"] + for instance in reservation["Instances"] + ] + + # Check if no instances were found - print simple message and return early + if not 
instances: + console.print("[yellow]No instances found[/yellow]") + return + + # Create and update the table for instances that exist + progress_task = progress.add_task( + description="Listing instances...", total=None, visible=True + ) + layout["progress"].update(progress) + + try: + # Create and update the table + table = manager.create_instance_table(instances) + layout["body"].update(Panel(table, border_style="blue")) + layout["status"].update( + Panel(f"Found {len(instances)} running instances", style="green") + ) + safe_progress_update(progress_task, description="Complete", visible=False) + layout["progress"].update(progress) + + except Exception as e: + write_debug(f"Error in table creation/display: {str(e)}") + layout["body"].update(Panel("[red]Error creating table[/red]", border_style="red")) + layout["status"].update( + Panel("[red]Error displaying instance information[/red]", border_style="red") + ) + + except Exception as e: + # Ensure we catch and properly display any errors + error_msg = f"Error listing instances: {str(e)}" + write_debug(error_msg) + console.print(f"[red]{error_msg}[/red]") + raise + + +@cli.command() +@click.argument("instance-id") +@click.pass_obj +def terminate(manager: SpotManager, instance_id: str): + """Terminate a specific instance""" + global progress, progress_task, layout, live + progress_task = progress.add_task("Terminating instance...", total=1) + layout["progress"].update(progress) + + # Run the async termination in an event loop + asyncio.run(manager.terminate_instances([instance_id])) + + # Fetch instance details + response = manager.ec2.describe_instances(InstanceIds=[instance_id]) + instances = [ + instance + for reservation in response["Reservations"] + for instance in reservation["Instances"] + ] + + # Create and display the table + table = manager.create_instance_table(instances) + layout["body"].update(table) + layout["status"].update(Panel(f"[green]Successfully terminated instance {instance_id}[/green]")) + layout["progress"].update(progress) + + # Show the layout + live.refresh() + time.sleep(2) # Give user time to see the final state + + +@cli.command() +@click.pass_obj +def terminate_all(manager: SpotManager): + """Terminate all running instances""" + global progress, progress_task, layout, live + + progress_task = progress.add_task("Finding instances...", total=None) + layout["progress"].update(progress) + live.refresh() + + try: + instance_ids = manager.get_all_instance_ids() + if not instance_ids: + layout["header"].update( + Panel("[bold blue]Bacalhau Spot Manager[/bold blue]", border_style="blue") + ) + layout["body"].update( + Panel("[yellow]No instances found[/yellow]", border_style="yellow") + ) + layout["status"].update( + Panel("[yellow]No instances to terminate[/yellow]", border_style="yellow") + ) + layout["progress"].update(progress) + live.refresh() + return + + # Get initial instance details for comparison + initial_response = manager.ec2.describe_instances(InstanceIds=instance_ids) + initial_instances = [ + instance + for reservation in initial_response["Reservations"] + for instance in reservation["Instances"] + ] + + # Show current instances + table = manager.create_instance_table(initial_instances) + layout["header"].update( + Panel("[bold blue]Bacalhau Spot Manager[/bold blue]", border_style="blue") + ) + layout["body"].update(Panel(table, border_style="blue")) + layout["status"].update( + Panel( + f"[yellow]Terminating {len(initial_instances)} instances...[/yellow]", + border_style="yellow", + ) + ) + 
layout["progress"].update(progress) + live.refresh() + + # Run the async termination in an event loop + asyncio.run(manager.terminate_instances(instance_ids)) + + # Create summary table + summary_table = Table( + show_header=True, + header_style="bold magenta", + title="Termination Summary", + title_style="bold blue", + box=ROUNDED, + ) + summary_table.add_column("Instance ID", style="cyan") + summary_table.add_column("Type", style="green") + summary_table.add_column("Zone", style="blue") + summary_table.add_column("Launch Time", style="yellow") + summary_table.add_column("IP Address", style="white") + summary_table.add_column("State", style="red") + + # Add rows for each terminated instance + for instance in initial_instances: + summary_table.add_row( + instance.get("InstanceId", "N/A"), + instance.get("InstanceType", "N/A"), + instance.get("Placement", {}).get("AvailabilityZone", "N/A"), + instance.get("LaunchTime", "N/A").strftime("%Y-%m-%d %H:%M:%S") + if instance.get("LaunchTime") + else "N/A", + instance.get("PublicIpAddress", "N/A"), + "[red]Terminated[/red]", + ) + + # Update layout with summary + layout["header"].update( + Panel( + "[bold blue]Termination Complete[/bold blue]\n" + f"[green]Successfully terminated {len(initial_instances)} instances[/green]", + border_style="blue", + ) + ) + layout["body"].update(Panel(summary_table, border_style="blue")) + layout["status"].update( + Panel( + "[green]✓ All instances have been terminated[/green]\n" + f"Total instances terminated: {len(initial_instances)}", + border_style="green", + ) + ) + layout["progress"].update(progress) + live.refresh() + time.sleep(3) # Give user time to see the summary + + except Exception as e: + error_msg = f"Error during termination: {str(e)}" + write_debug(error_msg) + layout["status"].update(Panel(f"[red]{error_msg}[/red]", border_style="red")) + layout["body"].update(Panel("[red]Termination process failed[/red]", border_style="red")) + live.refresh() + raise + + +@cli.command() +@click.option("--min-nodes", default=250, help="Minimum number of nodes per iteration") +@click.option("--max-nodes", default=750, help="Maximum number of nodes per iteration") +@click.option("--iterations", default=10, help="Number of test iterations") +@click.option("--health-timeout", default=300, help="Timeout in seconds for health checks") +@click.pass_obj +def stress_test( + manager: SpotManager, + min_nodes: int, + max_nodes: int, + iterations: int, + health_timeout: int, +): + """Run stress test with random node counts + + Example usage: + ./spot-manager --debug stress-test --min-nodes 5 --max-nodes 10 + """ + asyncio.run( + manager.run_stress_test( + min_nodes=min_nodes, + max_nodes=max_nodes, + iterations=iterations, + health_check_timeout=health_timeout, + ) + ) + + +if __name__ == "__main__": + cli(obj={}) diff --git a/scale-tester/aws_spot/plan.md b/scale-tester/aws_spot/plan.md new file mode 100644 index 00000000..ca65d2a4 --- /dev/null +++ b/scale-tester/aws_spot/plan.md @@ -0,0 +1,27 @@ +# Spot Manager Improvement Plan + +1. Add proper error handling and retries for AWS API calls [DONE] +2. Implement exponential backoff for instance polling [DONE] +3. Add instance state tracking and validation [DONE] +4. Add comprehensive logging with structured format [DONE] +5. Implement proper async cleanup handlers [DONE] +6. Add validation for instance counts and limits [DONE] +7. Implement proper resource cleanup on errors [DONE] +8. Add proper instance tagging system [DONE] +9. Implement instance state machine [DONE] +10. 
Add configuration validation [DONE] +11. Add proper instance type validation [DONE] +12. Implement proper scaling limits and safeguards [DONE] +13. Improve UI presentation consistency [DONE] +14. Add progress tracking for large-scale operations [DONE] +15. Add batch processing for instance operations +16. Implement proper rate limiting for AWS API calls +17. Add health check timeouts and retries +18. Improve instance status reporting +19. Add proper instance lifecycle tracking +20. Implement proper error recovery mechanisms + +## First Steps to Execute + +1. Add proper error handling and retries for AWS API calls +2. Implement exponential backoff for instance polling diff --git a/scale-tester/aws_spot/pyproject.toml b/scale-tester/aws_spot/pyproject.toml new file mode 100644 index 00000000..e4d78133 --- /dev/null +++ b/scale-tester/aws_spot/pyproject.toml @@ -0,0 +1,27 @@ +[project] +name = "bacalhau-scale-tester" +version = "0.1.0" +description = "A tool for testing the scalability of Bacalhau nodes on AWS spot instances" +authors = [{ name = "Bacalhau Project", email = "info@bacalhau.org" }] +dependencies = [ + "rich>=13.7.0", + "boto3>=1.34.0", + "click>=8.1.7", + "aiohttp>=3.9.0", + "python-dotenv>=1.0.0", +] +requires-python = ">=3.10" +readme = "README.md" +license = { text = "Apache-2.0" } + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.ruff] +line-length = 100 +target-version = "py310" + +[tool.black] +line-length = 100 +target-version = ["py310"] diff --git a/scale-tester/aws_spot/requirements.txt b/scale-tester/aws_spot/requirements.txt new file mode 100644 index 00000000..353a4e4f --- /dev/null +++ b/scale-tester/aws_spot/requirements.txt @@ -0,0 +1,5 @@ +rich>=13.7.0 +boto3>=1.34.0 +click>=8.1.7 +aiohttp>=3.9.0 +python-dotenv>=1.0.0 \ No newline at end of file diff --git a/scale-tester/aws_spot/spot-instances.sh b/scale-tester/aws_spot/spot-instances.sh deleted file mode 100755 index 8e560623..00000000 --- a/scale-tester/aws_spot/spot-instances.sh +++ /dev/null @@ -1,332 +0,0 @@ -#!/usr/bin/env bash -# spot-instances.sh -# -# Script to launch and manage AWS spot instances -set -e - -source ./aws-spot-env.sh - -# Debug settings -DEBUG=false -DEBUG_FILE="/tmp/spot_instance_debug.txt" - -function debug() { - if [ "$DEBUG" = true ]; then - echo "[DEBUG] $(date '+%Y-%m-%d %H:%M:%S'): $1" >> "$DEBUG_FILE" - fi -} - -function list_instances() { - echo "Listing instances with tag '${INSTANCE_TAG_KEY}=${INSTANCE_TAG_VALUE}'..." - aws ec2 describe-instances \ - --region "$AWS_REGION" \ - --filters "Name=tag:${INSTANCE_TAG_KEY},Values=${INSTANCE_TAG_VALUE}" \ - "Name=instance-state-name,Values=pending,running" \ - --output json \ - --query 'Reservations[].Instances[].[InstanceId,InstanceType,State.Name,LaunchTime,PublicDnsName]' \ - | jq -r '.[] | @tsv' \ - | column -t -} - -function terminate_instance() { - local instance_id="$1" - echo "Terminating instance: $instance_id" - aws ec2 terminate-instances --output json \ - --region "$AWS_REGION" \ - --instance-ids "$instance_id" - - echo "Waiting for instance to terminate..." - while true; do - status=$(aws ec2 describe-instances \ - --region "$AWS_REGION" \ - --instance-ids "$instance_id" \ - --output json \ - --query 'Reservations[].Instances[].State.Name' | jq -r '.[0]') - - if [ "$status" = "terminated" ] || [ -z "$status" ]; then - break - fi - echo "Current status: $status" - sleep 5 - done - - echo "Instance $instance_id has been terminated." 
-} - -function terminate_all_instances() { - local instance_ids=$(aws ec2 describe-instances \ - --region "$AWS_REGION" \ - --filters "Name=tag:${INSTANCE_TAG_KEY},Values=${INSTANCE_TAG_VALUE}" \ - "Name=instance-state-name,Values=pending,running" \ - --output json \ - --query 'Reservations[].Instances[].InstanceId' \ - | jq -r '.[]') - - if [ -z "$instance_ids" ]; then - echo "No running instances found matching tag '${INSTANCE_TAG_KEY}=${INSTANCE_TAG_VALUE}'." - return - fi - - echo "The following instances will be terminated:" - list_instances - - read -p "Are you sure you want to terminate these instances? (y/N) " confirm - if [[ $confirm =~ ^[Yy]$ ]]; then - echo "Terminating instances..." - aws ec2 terminate-instances --output json \ - --region "$AWS_REGION" \ - --instance-ids $instance_ids - - echo "Waiting for all instances to terminate..." - while true; do - statuses=$(aws ec2 describe-instances \ - --region "$AWS_REGION" \ - --instance-ids $instance_ids \ - --output json \ - --query 'Reservations[].Instances[].State.Name' | jq -r '.[]') - - # Check if all instances are terminated - all_terminated=true - for status in $statuses; do - if [ "$status" != "terminated" ]; then - all_terminated=false - break - fi - done - - if [ "$all_terminated" = true ] || [ -z "$statuses" ]; then - break - fi - - echo "Current statuses: $statuses" - sleep 5 - done - - echo "All matching instances have been terminated." - else - echo "Operation cancelled." - fi -} - -function ensure_security_group() { - debug "Checking for existing security group..." - # First try to get existing group - local security_group_id="" - local result=$(aws ec2 describe-security-groups \ - --region "$AWS_REGION" \ - --filters "Name=group-name,Values=$SECURITY_GROUP_NAME" \ - --query 'SecurityGroups[0].GroupId' \ - --output text) - - debug "Query result: '$result'" - - if [ "$result" == "None" ] || [ -z "$result" ]; then - debug "Creating security group: $SECURITY_GROUP_NAME" - result=$(aws ec2 create-security-group \ - --region "$AWS_REGION" \ - --group-name "$SECURITY_GROUP_NAME" \ - --description "$SECURITY_GROUP_DESC" \ - --query 'GroupId' \ - --output text) - debug "Created security group ID: '$result'" - fi - - if [ -z "$result" ] || [ "$result" == "None" ]; then - debug "Error: Failed to get or create security group" - return 1 - fi - - security_group_id="$result" - debug "Using security group ID: '$security_group_id'" - - # Remove existing ingress rules - debug "Removing existing security group rules..." - local existing_rules=$(aws ec2 describe-security-group-rules \ - --region "$AWS_REGION" \ - --filters "Name=group-id,Values=$security_group_id" \ - --query 'SecurityGroupRules[?!IsEgress].SecurityGroupRuleId' \ - --output text) - - if [ ! 
-z "$existing_rules" ]; then - debug "Found existing rules: $existing_rules" - for rule_id in $existing_rules; do - debug "Revoking rule: $rule_id" - aws ec2 revoke-security-group-ingress \ - --region "$AWS_REGION" \ - --group-id "$security_group_id" \ - --security-group-rule-ids "$rule_id" > /dev/null || true - done - fi - - debug "Configuring security group rules for ID: $security_group_id" - # Add SSH access - aws ec2 authorize-security-group-ingress \ - --region "$AWS_REGION" \ - --group-id "$security_group_id" \ - --protocol tcp \ - --port 22 \ - --cidr "0.0.0.0/0" > /dev/null || { - debug "Failed to add SSH rule, might already exist" - } - - # Add port 4222 access - aws ec2 authorize-security-group-ingress \ - --region "$AWS_REGION" \ - --group-id "$security_group_id" \ - --protocol tcp \ - --port 4222 \ - --cidr "0.0.0.0/0" > /dev/null || { - debug "Failed to add port 4222 rule, might already exist" - } - - if [ -z "$security_group_id" ] || [ "$security_group_id" == "None" ]; then - debug "Error: Security group ID is empty or None" - return 1 - fi - - debug "Returning security group ID: '$security_group_id'" - echo "$security_group_id" -} - -function launch_instances() { - # Get security group ID - debug "Getting security group..." - local security_group_id - security_group_id=$(ensure_security_group) - debug "Received security group ID: '$security_group_id'" - - if [ -z "$security_group_id" ] || [ "$security_group_id" == "None" ]; then - echo "Error: Failed to get security group ID" - exit 1 - fi - - debug "Security group ID before launch: '$security_group_id'" - - # Use the configured AMI - fail if not available - local ami_id="$CONFIGURED_AMI_ID" - if [ -z "$ami_id" ] || [ "$ami_id" == "null" ]; then - echo "Error: No configured AMI found. Please run './build-ami.sh' first to create the AMI." - exit 1 - fi - - debug "Using configured AMI ID: $ami_id" - - # Launch a single instance - echo "Launching spot instance..." - debug "Command: aws ec2 run-instances with spot options" - - local aws_debug="" - if [ "$DEBUG" = true ]; then - aws_debug="--debug" - fi - - # Create a temporary file for error output - local error_file=$(mktemp) - - # Launch spot instance - local output - if ! output=$(aws ec2 run-instances \ - --region "$AWS_REGION" \ - --image-id "$ami_id" \ - --instance-type "$INSTANCE_TYPE" \ - --key-name "$KEY_NAME" \ - --security-group-ids "$security_group_id" \ - --tag-specifications "ResourceType=instance,Tags=[{Key=$INSTANCE_TAG_KEY,Value=$INSTANCE_TAG_VALUE}]" \ - --iam-instance-profile "Name=BacalhauScaleTestRole" \ - --user-data "file://scripts/startup.sh" \ - --instance-market-options '{"MarketType":"spot","SpotOptions":{"SpotInstanceType":"one-time","InstanceInterruptionBehavior":"terminate"}}' \ - --count "$SPOT_INSTANCE_COUNT" \ - --output json \ - $aws_debug 2>"$error_file"); then - - echo "Error launching spot instance:" - cat "$error_file" - rm "$error_file" - exit 1 - fi - - # Print launched instance details - local instance_ids=$(echo "$output" | jq -r '.Instances[].InstanceId') - echo "Successfully launched spot instances: $instance_ids" - - # Wait for instances to be running - echo "Waiting for instances to be running..." - while true; do - local statuses=$(aws ec2 describe-instances \ - --region "$AWS_REGION" \ - --instance-ids $instance_ids \ - --output json \ - --query 'Reservations[].Instances[].[InstanceId,State.Name]' | \ - jq -r '.[] | @tsv') - - echo "Instance statuses:" - echo "$statuses" | column -t - - if ! 
echo "$statuses" | grep -qE "pending|starting"; then - break - fi - sleep 5 - done - - echo "Spot instances are now running. Use './spot-instances.sh list' to see details." -} - -function show_usage() { - echo "Usage: $0 [COMMAND] [OPTIONS]" - echo "" - echo "Commands:" - echo " launch Launch new spot instances (count: $SPOT_INSTANCE_COUNT)" - echo " list List all running instances" - echo " terminate Terminate specific instance" - echo " terminate-all Terminate all instances" - echo " help Show this help message" - echo "" - echo "Options:" - echo " --debug Enable debug output to $DEBUG_FILE" -} - -# Main script logic -# Process command line arguments -COMMAND="" -for arg in "$@"; do - case "$arg" in - --debug) - DEBUG=true - # Clear debug file at start - > "$DEBUG_FILE" - ;; - *) - if [ -z "$COMMAND" ]; then - COMMAND="$arg" - fi - ;; - esac -done - -case "$COMMAND" in - launch) - launch_instances - ;; - list) - list_instances - ;; - terminate) - if [ -z "$2" ]; then - echo "Error: Instance ID required" - show_usage - exit 1 - fi - terminate_instance "$2" - ;; - terminate-all) - terminate_all_instances - ;; - help|--help|-h|"") - show_usage - ;; - *) - echo "Error: Unknown command '$1'" - show_usage - exit 1 - ;; -esac \ No newline at end of file diff --git a/scale-tester/aws_spot/spot/config/aws-spot-env.sh b/scale-tester/aws_spot/spot/config/aws-spot-env.sh new file mode 100644 index 00000000..6b1c69be --- /dev/null +++ b/scale-tester/aws_spot/spot/config/aws-spot-env.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +# aws-spot-env.sh +# +# This file sets environment variables used for launching +# 1,000 AWS Spot Instances with Docker installed. +# +# Usage: +# source ./aws-spot-env.sh + +# AWS CLI & Region +export AWS_REGION="us-west-2" + +# Key Pair +export KEY_NAME="BacalhauScaleTestKey" + +# Security Group +export SECURITY_GROUP_NAME="bacalhau-scale-test-group" +export SECURITY_GROUP_DESC="Security group for Bacalhau Scale Spot Instances" + +# Your public IP for SSH ingress (CIDR /32) +export MY_PUBLIC_IP=$(curl -s ifconfig.me) + +# Base AMI to use (Amazon Linux 2 example) +# aws ssm get-parameters --names /aws/service/ami-amazon-linux-latest/amzn2-ami-hvm-x86_64-gp2 --region us-east-1 +export BASE_AMI_ID="ami-07d9cf938edb0739b" +export CONFIGURED_AMI_ID="ami-03212e939b49e6f64" + +# Instance Type +export INSTANCE_TYPE="t3.micro" + +# Number of Spot Instances +export SPOT_INSTANCE_COUNT="100" + +# Custom AMI details (if building your own) +export CUSTOM_AMI_NAME="bacalhau-scale-test-ami" +export CUSTOM_AMI_DESCRIPTION="AMI with Docker and Bacalhau preinstalled" + +# Tags +export INSTANCE_TAG_KEY="Name" +export INSTANCE_TAG_VALUE="bacalhau-scale-test" + +echo "Environment variables for AWS Spot Instances set." diff --git a/scale-tester/aws_spot/spot/config/aws-spot-env.sh.example b/scale-tester/aws_spot/spot/config/aws-spot-env.sh.example new file mode 100644 index 00000000..02b66ad5 --- /dev/null +++ b/scale-tester/aws_spot/spot/config/aws-spot-env.sh.example @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +# aws-spot-env.sh +# +# Environment variables for Bacalhau scale testing on AWS spot instances. +# Copy this file to aws-spot-env.sh and edit with your settings. 
+ +# AWS CLI & Region +export AWS_REGION="us-west-2" + +# Key Pair (must exist in your AWS account) +export KEY_NAME="BacalhauScaleTestKey" + +# Security Group +export SECURITY_GROUP_NAME="bacalhau-scale-test-group" +export SECURITY_GROUP_DESC="Security group for Bacalhau Scale Spot Instances" + +# Instance Configuration +export INSTANCE_TYPE="t3.micro" +export SPOT_INSTANCE_COUNT="100" # Default count for non-stress-test launches + +# AMI Configuration +export CUSTOM_AMI_NAME="bacalhau-scale-test-ami" +export CUSTOM_AMI_DESCRIPTION="AMI with Docker and Bacalhau preinstalled" + +# The AMI ID will be populated by build-ami.sh +export CONFIGURED_AMI_ID="" + +# Instance Tags +export INSTANCE_TAG_KEY="Name" +export INSTANCE_TAG_VALUE="bacalhau-scale-test" + +echo "Environment variables for AWS Spot Instances loaded." \ No newline at end of file diff --git a/scale-tester/bacalhau-dind-compute-node/clean_up_nodes.py b/scale-tester/bacalhau-dind-compute-node/clean_up_nodes.py deleted file mode 100644 index ccb3cf23..00000000 --- a/scale-tester/bacalhau-dind-compute-node/clean_up_nodes.py +++ /dev/null @@ -1,125 +0,0 @@ -import argparse -import json -import os -import subprocess -import sys -import concurrent.futures -import threading - -import yaml - - -def get_nodes(api_host): - """Get list of all Bacalhau nodes.""" - try: - cmd = [ - "bacalhau", - "node", - "list", - "--output", - "json", - "-c", - f"API.Host={api_host}", - ] - - result = subprocess.run( - cmd, - capture_output=True, - text=True, - check=True, - ) - return json.loads(result.stdout) - except subprocess.CalledProcessError as e: - print(f"Error running bacalhau node list: {e}") - print(f"stdout: {e.stdout}") - print(f"stderr: {e.stderr}") - sys.exit(1) - except json.JSONDecodeError as e: - print(f"Error parsing JSON output: {e}") - sys.exit(1) - - -def delete_node(node_id, api_host, print_lock): - """Delete a specific node by ID.""" - try: - cmd = [ - "bacalhau", - "node", - "delete", - node_id, - "-c", - f"API.Host={api_host}", - ] - - result = subprocess.run(cmd, capture_output=True, check=True, text=True) - with print_lock: - print(f"Successfully deleted node: {node_id}") - return True - except subprocess.CalledProcessError as e: - with print_lock: - print(f"Failed to delete node {node_id}. 
Error: {e}") - return False - - -def main(): - parser = argparse.ArgumentParser(description="Delete disconnected Bacalhau nodes") - parser.add_argument("--api-host", help="API host to connect to") - parser.add_argument( - "--dry-run", - action="store_true", - help="Show what would be deleted without actually deleting", - ) - args = parser.parse_args() - - if not args.api_host: - print("API host is required") - sys.exit(1) - - print(f"\nConnecting to API host: {args.api_host}") - - # Get all nodes - nodes = get_nodes(args.api_host) - - # Filter disconnected compute nodes - disconnected_nodes = [ - node - for node in nodes - if ( - node["Connection"] == "DISCONNECTED" - and node["Info"]["NodeType"] == "Compute" - ) - ] - - if not disconnected_nodes: - print("No disconnected nodes found.") - return - - print(f"\nFound {len(disconnected_nodes)} disconnected node(s):") - for node in disconnected_nodes: - print(f" - {node['Info']['NodeID']}") - - if args.dry_run: - print("\nDry run - no nodes were deleted") - return - - print("\nDeleting nodes...") - deleted_count = 0 - print_lock = threading.Lock() - - with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor: - # Create a list to store the futures - future_to_node = { - executor.submit(delete_node, node["Info"]["NodeID"], args.api_host, print_lock): node - for node in disconnected_nodes - } - - # As each future completes, count the successful deletions - for future in concurrent.futures.as_completed(future_to_node): - if future.result(): - deleted_count += 1 - - print(f"\nDeleted {deleted_count} of {len(disconnected_nodes)} disconnected nodes") - - -if __name__ == "__main__": - main() diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/.cspell/custom-dictionary.txt b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/.cspell/custom-dictionary.txt index 17980d9a..a9484ec4 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/.cspell/custom-dictionary.txt +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/.cspell/custom-dictionary.txt @@ -16,6 +16,7 @@ keypair levelname NOPASSWD oneshot +puuid runcmd templatefile tfvars diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/all_locations.yaml b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/all_locations.yaml deleted file mode 100644 index d5d8f3f4..00000000 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/all_locations.yaml +++ /dev/null @@ -1,176 +0,0 @@ -# Auto-generated locations configuration -# Using Amazon Linux 2023 AMIs -- ap-south-2: - region: ap-south-2 - zone: ap-south-2a - instance_type: t3.medium - instance_ami: ami-0d1fc426c1b23bce0 - node_count: 1 -- ap-south-1: - region: ap-south-1 - zone: ap-south-1a - instance_type: t3.medium - instance_ami: ami-0fd05997b4dff7aac - node_count: 1 -- eu-south-1: - region: eu-south-1 - zone: eu-south-1a - instance_type: t3.medium - instance_ami: ami-0f529654669a607d1 - node_count: 1 -- eu-south-2: - region: eu-south-2 - zone: eu-south-2a - instance_type: t3.medium - instance_ami: ami-0d27757cc8327f88f - node_count: 1 -- me-central-1: - region: me-central-1 - zone: me-central-1a - instance_type: t3.medium - instance_ami: ami-0f334de647da2fc7d - node_count: 1 -- ca-central-1: - region: ca-central-1 - zone: ca-central-1a - instance_type: t3.medium - instance_ami: ami-0a590ca28046d073e - node_count: 1 -- eu-central-1: - region: eu-central-1 - zone: 
eu-central-1a - instance_type: t3.medium - instance_ami: ami-0e54671bdf3c8ed8d - node_count: 1 -- eu-central-2: - region: eu-central-2 - zone: eu-central-2a - instance_type: t3.medium - instance_ami: ami-001ae26aa9fa6e1e0 - node_count: 1 -- us-west-1: - region: us-west-1 - zone: us-west-1b - instance_type: t3.medium - instance_ami: ami-0aa117785d1c1bfe5 - node_count: 1 -- us-west-2: - region: us-west-2 - zone: us-west-2a - instance_type: t3.medium - instance_ami: ami-07d9cf938edb0739b - node_count: 1 -- af-south-1: - region: af-south-1 - zone: af-south-1a - instance_type: t3.medium - instance_ami: ami-09bb68fb3b90fe9f5 - node_count: 1 -- eu-north-1: - region: eu-north-1 - zone: eu-north-1a - instance_type: t3.medium - instance_ami: ami-02df5cb5ad97983ba - node_count: 1 -- eu-west-3: - region: eu-west-3 - zone: eu-west-3a - instance_type: t3.medium - instance_ami: ami-07dc1ccdcec3b4eab - node_count: 1 -- eu-west-2: - region: eu-west-2 - zone: eu-west-2a - instance_type: t3.medium - instance_ami: ami-019374baf467d6601 - node_count: 1 -- eu-west-1: - region: eu-west-1 - zone: eu-west-1a - instance_type: t3.medium - instance_ami: ami-0a094c309b87cc107 - node_count: 1 -- ap-northeast-3: - region: ap-northeast-3 - zone: ap-northeast-3a - instance_type: t3.medium - instance_ami: ami-0c8df088bd68958ff - node_count: 1 -- ap-northeast-2: - region: ap-northeast-2 - zone: ap-northeast-2a - instance_type: t3.medium - instance_ami: ami-049788618f07e189d - node_count: 1 -- me-south-1: - region: me-south-1 - zone: me-south-1a - instance_type: t3.medium - instance_ami: ami-064ca081dffe98dc2 - node_count: 1 -- ap-northeast-1: - region: ap-northeast-1 - zone: ap-northeast-1a - instance_type: t3.medium - instance_ami: ami-0ab02459752898a60 - node_count: 1 -- sa-east-1: - region: sa-east-1 - zone: sa-east-1a - instance_type: t3.medium - instance_ami: ami-03c4a8310002221c7 - node_count: 1 -- ap-east-1: - region: ap-east-1 - zone: ap-east-1a - instance_type: t3.medium - instance_ami: ami-0a7ea1800b4d7a034 - node_count: 1 -- ca-west-1: - region: ca-west-1 - zone: ca-west-1a - instance_type: t3.medium - instance_ami: ami-0ba32fa0d87c5d193 - node_count: 1 -- ap-southeast-1: - region: ap-southeast-1 - zone: ap-southeast-1a - instance_type: t3.medium - instance_ami: ami-0995922d49dc9a17d - node_count: 1 -- ap-southeast-2: - region: ap-southeast-2 - zone: ap-southeast-2a - instance_type: t3.medium - instance_ami: ami-0d6560f3176dc9ec0 - node_count: 1 -- ap-southeast-3: - region: ap-southeast-3 - zone: ap-southeast-3a - instance_type: t3.medium - instance_ami: ami-01ca3951ed2aa735e - node_count: 1 -- ap-southeast-4: - region: ap-southeast-4 - zone: ap-southeast-4a - instance_type: t3.medium - instance_ami: ami-069ddd2a970d5e293 - node_count: 1 -- us-east-1: - region: us-east-1 - zone: us-east-1a - instance_type: t3.medium - instance_ami: ami-01816d07b1128cd2d - node_count: 1 -- ap-southeast-5: - region: ap-southeast-5 - zone: ap-southeast-5a - instance_type: t3.medium - instance_ami: ami-0c4a807cb1a258810 - node_count: 1 -- us-east-2: - region: us-east-2 - zone: us-east-2a - instance_type: t3.medium - instance_ami: ami-0b4624933067d393a - node_count: 1 diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/deploy.py b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/deploy.py index 60bc4565..f7418a62 100755 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/deploy.py +++ 
b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/deploy.py @@ -13,8 +13,7 @@ import os import subprocess import sys -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, Optional, Tuple import yaml from rich import box @@ -154,20 +153,32 @@ def load_config() -> Dict[str, Any]: with open("locations.yaml", "r") as f: yaml_data = yaml.safe_load(f) if not isinstance(yaml_data, list): - raise ValueError("Expected a list of region configurations") + raise ValueError("Expected a list of zone configurations") - # Convert list of single-key dictionaries into a single dictionary + # Convert list of zone configurations into a dictionary config = {} - for region_dict in yaml_data: - if not isinstance(region_dict, dict): - raise ValueError("Each region configuration must be a dictionary") - if len(region_dict) != 1: + for zone_dict in yaml_data: + if not isinstance(zone_dict, dict): + raise ValueError("Each zone configuration must be a dictionary") + if len(zone_dict) != 1: raise ValueError( - "Each region configuration must have exactly one key" + "Each zone configuration must have exactly one key" ) - region = list(region_dict.keys())[0] - config[region] = region_dict[region] + zone_name = list(zone_dict.keys())[0] + zone_config = zone_dict[zone_name] + + # Create a unique key for this zone + zone_key = zone_name + + # Validate and set required fields + config[zone_key] = { + "instance_type": zone_config.get("instance_type"), + "instance_ami": zone_config.get("instance_ami"), + "node_count": zone_config.get("node_count", 1), + "region": zone_config.get("region"), + "zone": zone_config.get("zone", zone_name), + } # Validate the configuration validate_config(config) @@ -175,7 +186,7 @@ def load_config() -> Dict[str, Any]: except FileNotFoundError: print("Error: locations.yaml file not found") - print("Please create a locations.yaml file with your region configurations") + print("Please create a locations.yaml file with your zone configurations") sys.exit(1) except yaml.YAMLError as e: print(f"Error parsing locations.yaml: {e}") @@ -186,8 +197,8 @@ def load_config() -> Dict[str, Any]: sys.exit(1) -def update_machines_file(region: str, outputs: Dict[str, Any]) -> None: - """Update MACHINES.json with outputs from a region""" +def update_machines_file(region: str, zone: str, outputs: Dict[str, Any]) -> None: + """Update MACHINES.json with outputs from a region/zone""" machines_file = os.path.join( os.path.dirname(os.path.abspath(__file__)), "MACHINES.json" ) @@ -205,7 +216,7 @@ def update_machines_file(region: str, outputs: Dict[str, Any]) -> None: instance_ids = outputs.get("instance_ids", {}).get("value", []) # Log the raw values for debugging - logging.debug(f"Raw outputs for region {region}:") + logging.debug(f"Raw outputs for {region}/{zone}:") logging.debug(f"Public IPs: {public_ips}") logging.debug(f"Private IPs: {private_ips}") logging.debug(f"Instance IDs: {instance_ids}") @@ -253,7 +264,7 @@ def update_machines_file(region: str, outputs: Dict[str, Any]) -> None: else [] ) - # Create instances list for this region + # Create instances list for this zone instances = [] max_length = max(len(instance_ids), len(public_ips), len(private_ips)) @@ -263,18 +274,23 @@ def update_machines_file(region: str, outputs: Dict[str, Any]) -> None: "instance_id": instance_ids[i], "public_ip": public_ips[i] if i < len(public_ips) else None, "private_ip": private_ips[i] if i < len(private_ips) else None, + "zone": zone, } 
instances.append(instance) - # Update the region's data with the new structure - machines_data[region] = {"name": region, "instances": instances} + # Initialize region if it doesn't exist + if region not in machines_data: + machines_data[region] = {"name": region, "zones": {}} + + # Update the zone's data + machines_data[region]["zones"][zone] = {"name": zone, "instances": instances} # Write updated data back to file with open(machines_file, "w") as f: json.dump(machines_data, f, indent=2) logging.info( - f"Updated MACHINES.json with {len(instances)} instances for region {region}" + f"Updated MACHINES.json with {len(instances)} instances for {region}/{zone}" ) except Exception as e: logging.error(f"Error updating MACHINES.json: {str(e)}") @@ -303,10 +319,10 @@ def delete_machines_file() -> None: raise -def deploy(command, region, region_config): - """Deploys or destroys resources in a single region.""" +def deploy(command, zone, zone_config): + """Deploys or destroys resources in a single zone.""" terraform_command = "apply" if command == "create" else "destroy" - logging.info(f"Starting {command} operation for region {region}") + logging.info(f"Starting {command} operation for zone {zone}") # Get absolute path to env.tfvars.json workspace_dir = os.path.dirname(os.path.abspath(__file__)) @@ -318,23 +334,23 @@ def deploy(command, region, region_config): logging.error(f"Required file not found: {env_vars_file}") raise FileNotFoundError(f"Required file not found: {env_vars_file}") - logging.info(f"Region config: {json.dumps(region_config, indent=2)}") + logging.info(f"Zone config: {json.dumps(zone_config, indent=2)}") # For destroy command, get the current state before destroying destroyed_resources = {} if command == "destroy": try: - run_command(["terraform", "workspace", "select", "-or-create", region]) + run_command(["terraform", "workspace", "select", "-or-create", zone]) result = run_command(["terraform", "output", "-json"]) try: destroyed_resources = ( json.loads(result.stdout) if result.stdout.strip() else {} ) except json.JSONDecodeError: - logging.warning(f"Could not parse terraform output for region {region}") + logging.warning(f"Could not parse terraform output for zone {zone}") destroyed_resources = {} except Exception as e: - logging.warning(f"Could not get current state for region {region}: {e}") + logging.warning(f"Could not get current state for zone {zone}: {e}") # Even if we can't get the current state, we should still show what was in MACHINES.json destroyed_resources = {} @@ -346,30 +362,30 @@ def deploy(command, region, region_config): console=console, ) as progress: task = progress.add_task( - f"[cyan]{region}[/cyan] - {command.capitalize()}", total=3 + f"[cyan]{zone}[/cyan] - {command.capitalize()}", total=3 ) - # Select workspace for this region - logging.info(f"Selecting/creating workspace for region {region}") - run_command(["terraform", "workspace", "select", "-or-create", region]) + # Select workspace for this zone + logging.info(f"Selecting/creating workspace for zone {zone}") + run_command(["terraform", "workspace", "select", "-or-create", zone]) progress.update( - task, advance=1, description=f"[cyan]{region}[/cyan] - Initializing" + task, advance=1, description=f"[cyan]{zone}[/cyan] - Initializing" ) - logging.info(f"Running terraform init for region {region}") + logging.info(f"Running terraform init for zone {zone}") run_command(["terraform", "init", "-upgrade"]) progress.update( task, advance=1, - description=f"[cyan]{region}[/cyan] - 
{command.capitalize()}", + description=f"[cyan]{zone}[/cyan] - {command.capitalize()}", ) - logging.info(f"Running terraform {terraform_command} for region {region}") + logging.info(f"Running terraform {terraform_command} for zone {zone}") logging.info( - f"Command variables: region={region}, zone={region_config['zone']}, " - f"instance_ami={region_config['instance_ami']}, " - f"node_count={region_config['node_count']}, " - f"instance_type={region_config['instance_type']}" + f"Command variables: region={zone_config['region']}, zone={zone_config['zone']}, " + f"instance_ami={zone_config['instance_ami']}, " + f"node_count={zone_config['node_count']}, " + f"instance_type={zone_config['instance_type']}" ) try: logging.debug(f"Starting terraform {terraform_command}") @@ -378,11 +394,11 @@ def deploy(command, region, region_config): "terraform", terraform_command, "-auto-approve", - f"-var=region={region}", - f"-var=zone={region_config['zone']}", - f"-var=instance_ami={region_config['instance_ami']}", - f"-var=node_count={region_config['node_count']}", - f"-var=instance_type={region_config['instance_type']}", + f"-var=region={zone_config['region']}", + f"-var=zone={zone_config['zone']}", + f"-var=instance_ami={zone_config['instance_ami']}", + f"-var=node_count={zone_config['node_count']}", + f"-var=instance_type={zone_config['instance_type']}", f"-var-file={env_vars_file}", ] ) @@ -392,21 +408,19 @@ def deploy(command, region, region_config): if command == "create": outputs_result = run_command(["terraform", "output", "-json"]) outputs = json.loads(outputs_result.stdout) - update_machines_file(region, outputs) + update_machines_file(zone_config["region"], zone, outputs) logging.debug(f"Terraform {terraform_command} output:\n{result.stdout}") if result.stderr: logging.debug(f"Terraform {terraform_command} stderr:\n{result.stderr}") except Exception as e: - logging.error( - f"Error during {terraform_command} for region {region}: {str(e)}" - ) + logging.error(f"Error during {terraform_command} for zone {zone}: {str(e)}") raise progress.update( - task, advance=1, description=f"[cyan]{region}[/cyan] - ✓ Complete" + task, advance=1, description=f"[cyan]{zone}[/cyan] - ✓ Complete" ) - logging.info(f"Completed {command} operation for region {region}") + logging.info(f"Completed {command} operation for zone {zone}") return destroyed_resources if command == "destroy" else None @@ -512,11 +526,11 @@ def main(): console.print(f"\n[bold blue]Starting {command} operation...[/bold blue]\n") - # Deploy/destroy resources in each region sequentially - for region, region_config in config.items(): - result = deploy(command, region, region_config) + # Deploy/destroy resources in each zone sequentially + for zone, zone_config in config.items(): + result = deploy(command, zone, zone_config) if command == "destroy" and result: - destroyed_resources[region] = result + destroyed_resources[zone] = result # Display final summary console.clear() @@ -527,12 +541,15 @@ def main(): # and the actual destroyed resources console.print("[bold red]Resources Destroyed:[/bold red]\n") - for region in config.keys(): - console.print(f"[bold cyan]Region: {region}[/bold cyan]") + for zone in config.keys(): + region = config[zone]["region"] + console.print(f"[bold cyan]Zone: {zone} (Region: {region})[/bold cyan]") # Get data from both sources - saved_data = machines_data.get(region, {}) - destroyed_data = destroyed_resources.get(region, {}) + saved_data = ( + machines_data.get(region, {}).get("zones", {}).get(zone, {}) + ) + 
destroyed_data = destroyed_resources.get(zone, {}) # Display destroyed instances from MACHINES.json instances = saved_data.get("instances", []) @@ -585,9 +602,7 @@ def main(): # Only show "No resources" message if both sources are empty if not instances and not destroyed_data: - console.print( - " [dim]No resources were active in this region[/dim]" - ) + console.print(" [dim]No resources were active in this zone[/dim]") console.print() @@ -607,6 +622,9 @@ def main(): table.add_column( "Region", style="cyan", width=15, justify="left", no_wrap=True ) + table.add_column( + "Zone", style="green", width=15, justify="left", no_wrap=True + ) table.add_column( "Instance ID", style="yellow", width=25, justify="left", no_wrap=True ) @@ -618,17 +636,36 @@ def main(): ) # Add rows for each active instance from MACHINES.json - for region_data in machines_data.values(): - for instance in region_data["instances"]: - table.add_row( - region_data["name"], - instance["instance_id"], - instance["public_ip"] or "", - instance["private_ip"] or "", - ) + for region_name, region_data in machines_data.items(): + for zone_name, zone_data in region_data.get("zones", {}).items(): + for instance in zone_data.get("instances", []): + table.add_row( + region_name, + zone_name, + instance["instance_id"], + instance["public_ip"] or "", + instance["private_ip"] or "", + ) console.print(table) + # Print summary counts + total_instances = sum( + len(zone_data.get("instances", [])) + for region_data in machines_data.values() + for zone_data in region_data.get("zones", {}).values() + ) + total_regions = len(machines_data) + total_zones = sum( + len(region_data.get("zones", {})) + for region_data in machines_data.values() + ) + + console.print("\n[bold cyan]Summary:[/bold cyan]") + console.print(f"Total Regions: {total_regions}") + console.print(f"Total Zones: {total_zones}") + console.print(f"Total Instances: {total_instances}") + console.print("\n[bold green]Operation complete![/bold green]\n") except KeyboardInterrupt: diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/deploy_single.py b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/deploy_single.py new file mode 100755 index 00000000..b0affa7e --- /dev/null +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/deploy_single.py @@ -0,0 +1,642 @@ +#!/usr/bin/env -S uv run --script +# /// script +# requires-python = ">=3.11" +# dependencies = [ +# "pyyaml", +# "rich", +# ] +# /// + +import argparse +import json +import logging +import os +import subprocess +import sys +from typing import Any, Dict, Optional, Tuple + +import yaml +from rich import box +from rich.console import Console +from rich.progress import BarColumn, Progress, TimeRemainingColumn +from rich.table import Table + +# Set up argument parser before logging configuration +parser = argparse.ArgumentParser(description="Deploy or destroy infrastructure") +parser.add_argument("command", choices=["create", "destroy"], help="Action to perform") +parser.add_argument("--debug", action="store_true", help="Enable debug logging") + +args = parser.parse_args() + +# Set up logging with more detail +logging.basicConfig( + level=logging.DEBUG if args.debug else logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + handlers=[ + logging.FileHandler( + os.path.join(os.path.dirname(os.path.abspath(__file__)), "debug.log"), + mode="w", # Overwrite the file each run + ), + ], +) + +# Log the start of the script with a clear separator 
+logging.info("=" * 80) +logging.info("Starting new deployment operation") +if args.debug: + logging.info("Debug logging enabled") +logging.info("=" * 80) + +# Default configuration values +DEFAULTS = { + "instance_type": "t2.medium", + "node_count": 1, +} + +REQUIRED_FIELDS = { + "zone": str, + "instance_ami": str, +} + +console = Console() + + +def validate_config(config: Dict[str, Any]) -> None: + """Validate the configuration format and required fields""" + if not config: + raise ValueError("Empty configuration") + + for region, region_config in config.items(): + if not isinstance(region_config, dict): + raise ValueError(f"Invalid configuration for region {region}") + + # Check required fields + for field, field_type in REQUIRED_FIELDS.items(): + if field not in region_config: + raise ValueError( + f"Missing required field '{field}' for region {region}" + ) + if not isinstance(region_config[field], field_type): + raise ValueError( + f"Invalid type for field '{field}' in region {region}. " + f"Expected {field_type.__name__}" + ) + + # Apply defaults for optional fields + for field, default_value in DEFAULTS.items(): + if field not in region_config: + region_config[field] = default_value + + +def run_command( + cmd: list[str], cwd: Optional[str] = None +) -> subprocess.CompletedProcess: + """Run a command with proper error handling""" + try: + logging.debug(f"Executing command: {' '.join(cmd)}") + if cwd: + logging.debug(f"Working directory: {cwd}") + + # Get current environment + env = os.environ.copy() + + logging.debug("Starting command execution") + result = subprocess.run( + cmd, + check=True, + cwd=cwd, + capture_output=True, + text=True, + env=env, + ) + logging.debug("Command completed successfully") + if result.stdout: + logging.debug(f"Command stdout:\n{result.stdout}") + if result.stderr: + logging.debug(f"Command stderr:\n{result.stderr}") + return result + except subprocess.CalledProcessError as e: + error_msg = f"Command failed: {' '.join(cmd)}\n" + error_msg += f"Exit code: {e.returncode}\n" + if e.stdout: + error_msg += f"stdout:\n{e.stdout}\n" + if e.stderr: + error_msg += f"stderr:\n{e.stderr}\n" + logging.error(error_msg) + + # Print a user-friendly error message + console.print( + f"\n[red]Error: Command failed with exit code {e.returncode}[/red]" + ) + console.print(f"[red]Command: {' '.join(cmd)}[/red]") + if e.stderr: + console.print(f"[yellow]Error details:[/yellow]\n{e.stderr}") + + # Exit immediately on command failure + sys.exit(1) + except Exception as e: + error_msg = f"Unexpected error running command: {' '.join(cmd)}\n{str(e)}" + logging.error(error_msg) + + # Print a user-friendly error message + console.print("\n[red]Unexpected error:[/red]") + console.print(f"[red]Command: {' '.join(cmd)}[/red]") + console.print(f"[yellow]Error details:[/yellow]\n{str(e)}") + + # Exit immediately on any error + sys.exit(1) + + +def load_config() -> Dict[str, Any]: + """Load configuration from locations.yaml""" + try: + with open("locations.yaml", "r") as f: + yaml_data = yaml.safe_load(f) + if not isinstance(yaml_data, list): + raise ValueError("Expected a list of region configurations") + + # Convert list of single-key dictionaries into a single dictionary + config = {} + for region_dict in yaml_data: + if not isinstance(region_dict, dict): + raise ValueError("Each region configuration must be a dictionary") + if len(region_dict) != 1: + raise ValueError( + "Each region configuration must have exactly one key" + ) + + region = list(region_dict.keys())[0] + 
config[region] = region_dict[region] + + # Validate the configuration + validate_config(config) + return config + + except FileNotFoundError: + print("Error: locations.yaml file not found") + print("Please create a locations.yaml file with your region configurations") + sys.exit(1) + except yaml.YAMLError as e: + print(f"Error parsing locations.yaml: {e}") + print("Please ensure your YAML file is properly formatted") + sys.exit(1) + except ValueError as e: + print(f"Invalid configuration: {e}") + sys.exit(1) + + +def update_machines_file(region: str, outputs: Dict[str, Any]) -> None: + """Update MACHINES.json with outputs from a region""" + machines_file = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "MACHINES.json" + ) + + try: + if os.path.exists(machines_file): + with open(machines_file, "r") as f: + machines_data = json.load(f) + else: + machines_data = {} + + # Extract values from outputs, ensuring we get the raw values + public_ips = outputs.get("public_ips", {}).get("value", []) + private_ips = outputs.get("private_ips", {}).get("value", []) + instance_ids = outputs.get("instance_ids", {}).get("value", []) + + # Log the raw values for debugging + logging.debug(f"Raw outputs for region {region}:") + logging.debug(f"Public IPs: {public_ips}") + logging.debug(f"Private IPs: {private_ips}") + logging.debug(f"Instance IDs: {instance_ids}") + + # Handle nested lists (sometimes AWS returns nested arrays) + if ( + isinstance(public_ips, list) + and public_ips + and isinstance(public_ips[0], list) + ): + public_ips = public_ips[0] + if ( + isinstance(private_ips, list) + and private_ips + and isinstance(private_ips[0], list) + ): + private_ips = private_ips[0] + if ( + isinstance(instance_ids, list) + and instance_ids + and isinstance(instance_ids[0], list) + ): + instance_ids = instance_ids[0] + + # Ensure all lists are actually lists + public_ips = ( + public_ips + if isinstance(public_ips, list) + else [public_ips] + if public_ips + else [] + ) + private_ips = ( + private_ips + if isinstance(private_ips, list) + else [private_ips] + if private_ips + else [] + ) + instance_ids = ( + instance_ids + if isinstance(instance_ids, list) + else [instance_ids] + if instance_ids + else [] + ) + + # Create instances list for this region + instances = [] + max_length = max(len(instance_ids), len(public_ips), len(private_ips)) + + for i in range(max_length): + if i < len(instance_ids): # Only create instance if we have an ID + instance = { + "instance_id": instance_ids[i], + "public_ip": public_ips[i] if i < len(public_ips) else None, + "private_ip": private_ips[i] if i < len(private_ips) else None, + } + instances.append(instance) + + # Update the region's data with the new structure + machines_data[region] = {"name": region, "instances": instances} + + # Write updated data back to file + with open(machines_file, "w") as f: + json.dump(machines_data, f, indent=2) + + logging.info( + f"Updated MACHINES.json with {len(instances)} instances for region {region}" + ) + except Exception as e: + logging.error(f"Error updating MACHINES.json: {str(e)}") + raise + + +def check_machines_file() -> bool: + """Check if MACHINES.json exists and return True if it does""" + machines_file = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "MACHINES.json" + ) + return os.path.exists(machines_file) + + +def delete_machines_file() -> None: + """Delete MACHINES.json if it exists""" + machines_file = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "MACHINES.json" + ) + try: + if 
os.path.exists(machines_file): + os.remove(machines_file) + logging.info("Deleted MACHINES.json") + except Exception as e: + logging.error(f"Error deleting MACHINES.json: {str(e)}") + raise + + +def deploy(command, region, region_config): + """Deploys or destroys resources in a single region.""" + terraform_command = "apply" if command == "create" else "destroy" + logging.info(f"Starting {command} operation for region {region}") + + # Get absolute path to env.tfvars.json + workspace_dir = os.path.dirname(os.path.abspath(__file__)) + env_vars_file = os.path.join(workspace_dir, "env.tfvars.json") + logging.info(f"Using env vars file: {env_vars_file}") + + # Check if env.tfvars.json exists + if not os.path.exists(env_vars_file): + logging.error(f"Required file not found: {env_vars_file}") + raise FileNotFoundError(f"Required file not found: {env_vars_file}") + + logging.info(f"Region config: {json.dumps(region_config, indent=2)}") + + # For destroy command, get the current state before destroying + destroyed_resources = {} + if command == "destroy": + try: + run_command(["terraform", "workspace", "select", "-or-create", region]) + result = run_command(["terraform", "output", "-json"]) + try: + destroyed_resources = ( + json.loads(result.stdout) if result.stdout.strip() else {} + ) + except json.JSONDecodeError: + logging.warning(f"Could not parse terraform output for region {region}") + destroyed_resources = {} + except Exception as e: + logging.warning(f"Could not get current state for region {region}: {e}") + # Even if we can't get the current state, we should still show what was in MACHINES.json + destroyed_resources = {} + + with Progress( + "[progress.description]{task.description}", + BarColumn(), + "[progress.percentage]{task.percentage:>3.1f}%", + TimeRemainingColumn(), + console=console, + ) as progress: + task = progress.add_task( + f"[cyan]{region}[/cyan] - {command.capitalize()}", total=3 + ) + + # Select workspace for this region + logging.info(f"Selecting/creating workspace for region {region}") + run_command(["terraform", "workspace", "select", "-or-create", region]) + + progress.update( + task, advance=1, description=f"[cyan]{region}[/cyan] - Initializing" + ) + logging.info(f"Running terraform init for region {region}") + run_command(["terraform", "init", "-upgrade"]) + + progress.update( + task, + advance=1, + description=f"[cyan]{region}[/cyan] - {command.capitalize()}", + ) + logging.info(f"Running terraform {terraform_command} for region {region}") + logging.info( + f"Command variables: region={region}, zone={region_config['zone']}, " + f"instance_ami={region_config['instance_ami']}, " + f"node_count={region_config['node_count']}, " + f"instance_type={region_config['instance_type']}" + ) + try: + logging.debug(f"Starting terraform {terraform_command}") + result = run_command( + [ + "terraform", + terraform_command, + "-auto-approve", + f"-var=region={region}", + f"-var=zone={region_config['zone']}", + f"-var=instance_ami={region_config['instance_ami']}", + f"-var=node_count={region_config['node_count']}", + f"-var=instance_type={region_config['instance_type']}", + f"-var-file={env_vars_file}", + ] + ) + logging.info(f"Terraform {terraform_command} completed successfully") + + # After successful creation, update MACHINES.json + if command == "create": + outputs_result = run_command(["terraform", "output", "-json"]) + outputs = json.loads(outputs_result.stdout) + update_machines_file(region, outputs) + + logging.debug(f"Terraform {terraform_command} 
output:\n{result.stdout}") + if result.stderr: + logging.debug(f"Terraform {terraform_command} stderr:\n{result.stderr}") + except Exception as e: + logging.error( + f"Error during {terraform_command} for region {region}: {str(e)}" + ) + raise + + progress.update( + task, advance=1, description=f"[cyan]{region}[/cyan] - ✓ Complete" + ) + logging.info(f"Completed {command} operation for region {region}") + + return destroyed_resources if command == "destroy" else None + + +def validate_aws_credentials() -> Tuple[bool, str]: + """Validate AWS credentials are properly configured""" + logging.info("Validating AWS credentials...") + + try: + # Simply try to make an AWS API call + result = subprocess.run( + ["aws", "sts", "get-caller-identity"], + capture_output=True, + text=True, + check=True, + ) + identity = json.loads(result.stdout) + user_arn = identity.get("Arn", "Unknown") + account_id = identity.get("Account", "Unknown") + logging.info(f"AWS credentials valid - User: {user_arn}, Account: {account_id}") + return True, f"AWS credentials valid - Account: {account_id}" + except subprocess.CalledProcessError as e: + error_msg = "AWS credentials not found or invalid" + if e.stderr: + error_msg = f"AWS credential error: {e.stderr.strip()}" + logging.error(error_msg) + return False, error_msg + except Exception as e: + error_msg = f"Error validating AWS credentials: {str(e)}" + logging.error(error_msg) + return False, error_msg + + +def read_machines_file() -> Dict[str, Any]: + """Read and return the contents of MACHINES.json""" + machines_file = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "MACHINES.json" + ) + try: + if os.path.exists(machines_file): + with open(machines_file, "r") as f: + return json.load(f) + return {} + except Exception as e: + logging.error(f"Error reading MACHINES.json: {str(e)}") + raise + + +def main(): + try: + command = args.command + + # Check MACHINES.json status + if command == "create" and check_machines_file(): + console.print("\n[red]Error: MACHINES.json already exists[/red]") + console.print("This indicates that there might be existing infrastructure.") + console.print( + "Please run 'destroy' first or manually delete MACHINES.json if you're sure it's safe." + ) + sys.exit(1) + + # For destroy command, read the existing state before deleting + machines_data = {} + destroyed_resources = {} + if command == "destroy": + machines_data = read_machines_file() + delete_machines_file() + + # Validate AWS credentials before proceeding + credentials_valid, message = validate_aws_credentials() + if not credentials_valid: + console.print("\n[red]Error: AWS credentials are not valid[/red]") + console.print( + "Please configure your AWS credentials using one of these methods:" + ) + console.print("1. Set environment variables:") + console.print(" export AWS_ACCESS_KEY_ID='your-access-key'") + console.print(" export AWS_SECRET_ACCESS_KEY='your-secret-key'") + console.print("\n2. 
Or configure AWS CLI:") + console.print(" aws configure") + console.print("\nThen verify your credentials with:") + console.print(" aws sts get-caller-identity") + sys.exit(1) + else: + console.print(f"\n[green]{message}[/green]") + + # Get absolute path to env.tfvars.json + workspace_dir = os.path.dirname(os.path.abspath(__file__)) + env_vars_file = os.path.join(workspace_dir, "env.tfvars.json") + + # Check if env.tfvars.json exists before starting + if not os.path.exists(env_vars_file): + console.print( + f"\n[red]Error: Required file not found: {env_vars_file}[/red]" + ) + console.print( + "Please ensure env.tfvars.json exists in the same directory as deploy.py" + ) + sys.exit(1) + + # Load and validate configuration + config = load_config() + + console.print(f"\n[bold blue]Starting {command} operation...[/bold blue]\n") + + # Deploy/destroy resources in each region sequentially + for region, region_config in config.items(): + result = deploy(command, region, region_config) + if command == "destroy" and result: + destroyed_resources[region] = result + + # Display final summary + console.clear() + console.print("\n") + + if command == "destroy": + # Show summary of destroyed resources using both the saved machines_data + # and the actual destroyed resources + console.print("[bold red]Resources Destroyed:[/bold red]\n") + + for region in config.keys(): + console.print(f"[bold cyan]Region: {region}[/bold cyan]") + + # Get data from both sources + saved_data = machines_data.get(region, {}) + destroyed_data = destroyed_resources.get(region, {}) + + # Display destroyed instances from MACHINES.json + instances = saved_data.get("instances", []) + if instances: + console.print(" [yellow]Instances:[/yellow]") + for instance in instances: + console.print(f" [red]✗[/red] {instance['instance_id']}:") + console.print( + f" [dim]Public IP: {instance['public_ip']}[/dim]" + ) + console.print( + f" [dim]Private IP: {instance['private_ip']}[/dim]" + ) + + # Show VPCs and other AWS resources that were destroyed + if destroyed_data: + vpc_resources = [ + key for key in destroyed_data.keys() if "vpc" in key.lower() + ] + if vpc_resources: + console.print(" [yellow]VPC Resources:[/yellow]") + for resource in vpc_resources: + value = destroyed_data[resource].get("value") + if isinstance(value, list): + for v in value: + console.print(f" [red]✗[/red] {v}") + else: + console.print(f" [red]✗[/red] {value}") + + other_resources = [ + key + for key in destroyed_data.keys() + if key not in vpc_resources + and key + not in [ + "public_ips", + "private_ips", + "instance_ids", + ] + ] + if other_resources: + console.print(" [yellow]Other Resources:[/yellow]") + for resource in other_resources: + value = destroyed_data[resource].get("value") + if isinstance(value, list): + for v in value: + console.print(f" [red]✗[/red] {v}") + else: + console.print(f" [red]✗[/red] {value}") + + # Only show "No resources" message if both sources are empty + if not instances and not destroyed_data: + console.print( + " [dim]No resources were active in this region[/dim]" + ) + + console.print() + + else: + # Show active resources table using MACHINES.json + machines_data = read_machines_file() + + table = Table( + title="Active Deployments", + show_header=True, + header_style="bold", + padding=(0, 2), + box=box.DOUBLE, + ) + + # Columns for create operation + table.add_column( + "Region", style="cyan", width=15, justify="left", no_wrap=True + ) + table.add_column( + "Instance ID", style="yellow", width=25, justify="left", no_wrap=True + ) 
+ table.add_column( + "Public IP", style="blue", width=20, justify="left", no_wrap=True + ) + table.add_column( + "Private IP", style="magenta", width=20, justify="left", no_wrap=True + ) + + # Add rows for each active instance from MACHINES.json + for region_data in machines_data.values(): + for instance in region_data["instances"]: + table.add_row( + region_data["name"], + instance["instance_id"], + instance["public_ip"] or "", + instance["private_ip"] or "", + ) + + console.print(table) + + console.print("\n[bold green]Operation complete![/bold green]\n") + + except KeyboardInterrupt: + console.print("\n[yellow]Operation cancelled by user[/yellow]") + sys.exit(1) + except Exception as e: + console.print(f"\n[red]Unexpected error: {e}[/red]") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/locations.yaml b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/locations.yaml index 0c8511ae..f4ba1821 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/locations.yaml +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/locations.yaml @@ -1,39 +1,57 @@ # Because of the way AWS works, we cannot # deploy to all regions at once. Use the below list to name the # regions you want to deploy to. -- ap-south-1: - region: ap-south-1 - zone: ap-south-1a +- ap-south-2a: + instance_ami: ami-0a94a70b8a1454a4b instance_type: t3.medium - instance_ami: ami-0fd05997b4dff7aac node_count: 1 -- eu-south-1: - region: eu-south-1 - zone: eu-south-1a + region: ap-south-2 + zone: ap-south-2a +- ca-central-1d: + instance_ami: ami-0a474b3a85d51a5e5 instance_type: t3.medium - instance_ami: ami-0f529654669a607d1 - node_count: 2 -- eu-south-2: - region: eu-south-2 - zone: eu-south-2a + node_count: 1 + region: ca-central-1 + zone: ca-central-1d +- eu-west-1c: + instance_ami: ami-032a56ad5e480189c + instance_type: t3.medium + node_count: 1 + region: eu-west-1 + zone: eu-west-1c +- eu-west-3b: + instance_ami: ami-04a4acda26ca36de0 + instance_type: t3.medium + node_count: 1 + region: eu-west-3 + zone: eu-west-3b +- me-central-1b: + instance_ami: ami-07a33155d2d5abff0 instance_type: t3.medium - instance_ami: ami-0d27757cc8327f88f - node_count: 3 -- me-central-1: + node_count: 1 region: me-central-1 - zone: me-central-1a + zone: me-central-1b +- me-central-1c: + instance_ami: ami-07a33155d2d5abff0 instance_type: t3.medium - instance_ami: ami-0f334de647da2fc7d node_count: 1 -- ca-central-1: - region: ca-central-1 - zone: ca-central-1a + region: me-central-1 + zone: me-central-1c +- sa-east-1b: + instance_ami: ami-0780816dd7ce942fd + instance_type: t3.medium + node_count: 1 + region: sa-east-1 + zone: sa-east-1b +- us-east-2a: + instance_ami: ami-0884d2865dbe9de4b instance_type: t3.medium - instance_ami: ami-0a590ca28046d073e node_count: 1 -- eu-central-1: - region: eu-central-1 - zone: eu-central-1a + region: us-east-2 + zone: us-east-2a +- us-east-2b: + instance_ami: ami-0884d2865dbe9de4b instance_type: t3.medium - instance_ami: ami-0e54671bdf3c8ed8d node_count: 1 + region: us-east-2 + zone: us-east-2b diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/locations/all_locations.json b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/locations/all_locations.json new file mode 100644 index 00000000..18d0f2fe --- /dev/null +++ 
b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/locations/all_locations.json @@ -0,0 +1,821 @@ +[ + { + "af-south-1a": { + "region": "af-south-1", + "instance_type": "t3.medium", + "instance_ami": "ami-05577c8d07333d909", + "node_count": 1, + "zone": "af-south-1a" + } + }, + { + "af-south-1b": { + "region": "af-south-1", + "instance_type": "t3.medium", + "instance_ami": "ami-05577c8d07333d909", + "node_count": 1, + "zone": "af-south-1b" + } + }, + { + "af-south-1c": { + "region": "af-south-1", + "instance_type": "t3.medium", + "instance_ami": "ami-05577c8d07333d909", + "node_count": 1, + "zone": "af-south-1c" + } + }, + { + "ap-east-1a": { + "region": "ap-east-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0a0aa69b665a45b96", + "node_count": 1, + "zone": "ap-east-1a" + } + }, + { + "ap-east-1b": { + "region": "ap-east-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0a0aa69b665a45b96", + "node_count": 1, + "zone": "ap-east-1b" + } + }, + { + "ap-east-1c": { + "region": "ap-east-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0a0aa69b665a45b96", + "node_count": 1, + "zone": "ap-east-1c" + } + }, + { + "ap-northeast-1a": { + "region": "ap-northeast-1", + "instance_type": "t3.medium", + "instance_ami": "ami-08f191dd81ec3a3de", + "node_count": 1, + "zone": "ap-northeast-1a" + } + }, + { + "ap-northeast-1c": { + "region": "ap-northeast-1", + "instance_type": "t3.medium", + "instance_ami": "ami-08f191dd81ec3a3de", + "node_count": 1, + "zone": "ap-northeast-1c" + } + }, + { + "ap-northeast-1d": { + "region": "ap-northeast-1", + "instance_type": "t3.medium", + "instance_ami": "ami-08f191dd81ec3a3de", + "node_count": 1, + "zone": "ap-northeast-1d" + } + }, + { + "ap-northeast-2a": { + "region": "ap-northeast-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0077297a838d6761d", + "node_count": 1, + "zone": "ap-northeast-2a" + } + }, + { + "ap-northeast-2b": { + "region": "ap-northeast-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0077297a838d6761d", + "node_count": 1, + "zone": "ap-northeast-2b" + } + }, + { + "ap-northeast-2c": { + "region": "ap-northeast-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0077297a838d6761d", + "node_count": 1, + "zone": "ap-northeast-2c" + } + }, + { + "ap-northeast-2d": { + "region": "ap-northeast-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0077297a838d6761d", + "node_count": 1, + "zone": "ap-northeast-2d" + } + }, + { + "ap-northeast-3a": { + "region": "ap-northeast-3", + "instance_type": "t3.medium", + "instance_ami": "ami-0a0bcba223270ed99", + "node_count": 1, + "zone": "ap-northeast-3a" + } + }, + { + "ap-northeast-3b": { + "region": "ap-northeast-3", + "instance_type": "t3.medium", + "instance_ami": "ami-0a0bcba223270ed99", + "node_count": 1, + "zone": "ap-northeast-3b" + } + }, + { + "ap-northeast-3c": { + "region": "ap-northeast-3", + "instance_type": "t3.medium", + "instance_ami": "ami-0a0bcba223270ed99", + "node_count": 1, + "zone": "ap-northeast-3c" + } + }, + { + "ap-south-1a": { + "region": "ap-south-1", + "instance_type": "t3.medium", + "instance_ami": "ami-023a307f3d27ea427", + "node_count": 1, + "zone": "ap-south-1a" + } + }, + { + "ap-south-1b": { + "region": "ap-south-1", + "instance_type": "t3.medium", + "instance_ami": "ami-023a307f3d27ea427", + "node_count": 1, + "zone": "ap-south-1b" + } + }, + { + "ap-south-1c": { + "region": "ap-south-1", + "instance_type": "t3.medium", + "instance_ami": "ami-023a307f3d27ea427", + "node_count": 1, + "zone": 
"ap-south-1c" + } + }, + { + "ap-south-2a": { + "region": "ap-south-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0a94a70b8a1454a4b", + "node_count": 1, + "zone": "ap-south-2a" + } + }, + { + "ap-south-2b": { + "region": "ap-south-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0a94a70b8a1454a4b", + "node_count": 1, + "zone": "ap-south-2b" + } + }, + { + "ap-south-2c": { + "region": "ap-south-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0a94a70b8a1454a4b", + "node_count": 1, + "zone": "ap-south-2c" + } + }, + { + "ap-southeast-1a": { + "region": "ap-southeast-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0198a868663199764", + "node_count": 1, + "zone": "ap-southeast-1a" + } + }, + { + "ap-southeast-1b": { + "region": "ap-southeast-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0198a868663199764", + "node_count": 1, + "zone": "ap-southeast-1b" + } + }, + { + "ap-southeast-1c": { + "region": "ap-southeast-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0198a868663199764", + "node_count": 1, + "zone": "ap-southeast-1c" + } + }, + { + "ap-southeast-2a": { + "region": "ap-southeast-2", + "instance_type": "t3.medium", + "instance_ami": "ami-01e2093820bf84df1", + "node_count": 1, + "zone": "ap-southeast-2a" + } + }, + { + "ap-southeast-2b": { + "region": "ap-southeast-2", + "instance_type": "t3.medium", + "instance_ami": "ami-01e2093820bf84df1", + "node_count": 1, + "zone": "ap-southeast-2b" + } + }, + { + "ap-southeast-2c": { + "region": "ap-southeast-2", + "instance_type": "t3.medium", + "instance_ami": "ami-01e2093820bf84df1", + "node_count": 1, + "zone": "ap-southeast-2c" + } + }, + { + "ap-southeast-3a": { + "region": "ap-southeast-3", + "instance_type": "t3.medium", + "instance_ami": "ami-01d23da707abb0f2f", + "node_count": 1, + "zone": "ap-southeast-3a" + } + }, + { + "ap-southeast-3b": { + "region": "ap-southeast-3", + "instance_type": "t3.medium", + "instance_ami": "ami-01d23da707abb0f2f", + "node_count": 1, + "zone": "ap-southeast-3b" + } + }, + { + "ap-southeast-3c": { + "region": "ap-southeast-3", + "instance_type": "t3.medium", + "instance_ami": "ami-01d23da707abb0f2f", + "node_count": 1, + "zone": "ap-southeast-3c" + } + }, + { + "ap-southeast-4a": { + "region": "ap-southeast-4", + "instance_type": "t3.medium", + "instance_ami": "ami-05e0c17592d882511", + "node_count": 1, + "zone": "ap-southeast-4a" + } + }, + { + "ap-southeast-4b": { + "region": "ap-southeast-4", + "instance_type": "t3.medium", + "instance_ami": "ami-05e0c17592d882511", + "node_count": 1, + "zone": "ap-southeast-4b" + } + }, + { + "ap-southeast-4c": { + "region": "ap-southeast-4", + "instance_type": "t3.medium", + "instance_ami": "ami-05e0c17592d882511", + "node_count": 1, + "zone": "ap-southeast-4c" + } + }, + { + "ap-southeast-5a": { + "region": "ap-southeast-5", + "instance_type": "t3.medium", + "instance_ami": "ami-0a88b63550bbb2b1a", + "node_count": 1, + "zone": "ap-southeast-5a" + } + }, + { + "ap-southeast-5b": { + "region": "ap-southeast-5", + "instance_type": "t3.medium", + "instance_ami": "ami-0a88b63550bbb2b1a", + "node_count": 1, + "zone": "ap-southeast-5b" + } + }, + { + "ap-southeast-5c": { + "region": "ap-southeast-5", + "instance_type": "t3.medium", + "instance_ami": "ami-0a88b63550bbb2b1a", + "node_count": 1, + "zone": "ap-southeast-5c" + } + }, + { + "ca-central-1a": { + "region": "ca-central-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0a474b3a85d51a5e5", + "node_count": 1, + "zone": "ca-central-1a" + } + }, + 
{ + "ca-central-1b": { + "region": "ca-central-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0a474b3a85d51a5e5", + "node_count": 1, + "zone": "ca-central-1b" + } + }, + { + "ca-central-1d": { + "region": "ca-central-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0a474b3a85d51a5e5", + "node_count": 1, + "zone": "ca-central-1d" + } + }, + { + "ca-west-1a": { + "region": "ca-west-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0bd5d3965e2cc8c99", + "node_count": 1, + "zone": "ca-west-1a" + } + }, + { + "ca-west-1b": { + "region": "ca-west-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0bd5d3965e2cc8c99", + "node_count": 1, + "zone": "ca-west-1b" + } + }, + { + "ca-west-1c": { + "region": "ca-west-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0bd5d3965e2cc8c99", + "node_count": 1, + "zone": "ca-west-1c" + } + }, + { + "eu-central-1a": { + "region": "eu-central-1", + "instance_type": "t3.medium", + "instance_ami": "ami-03b3b5f65db7e5c6f", + "node_count": 1, + "zone": "eu-central-1a" + } + }, + { + "eu-central-1b": { + "region": "eu-central-1", + "instance_type": "t3.medium", + "instance_ami": "ami-03b3b5f65db7e5c6f", + "node_count": 1, + "zone": "eu-central-1b" + } + }, + { + "eu-central-1c": { + "region": "eu-central-1", + "instance_type": "t3.medium", + "instance_ami": "ami-03b3b5f65db7e5c6f", + "node_count": 1, + "zone": "eu-central-1c" + } + }, + { + "eu-central-2a": { + "region": "eu-central-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0039b5f24283949b4", + "node_count": 1, + "zone": "eu-central-2a" + } + }, + { + "eu-central-2b": { + "region": "eu-central-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0039b5f24283949b4", + "node_count": 1, + "zone": "eu-central-2b" + } + }, + { + "eu-central-2c": { + "region": "eu-central-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0039b5f24283949b4", + "node_count": 1, + "zone": "eu-central-2c" + } + }, + { + "eu-north-1a": { + "region": "eu-north-1", + "instance_type": "t3.medium", + "instance_ami": "ami-02e2af61198e99faf", + "node_count": 1, + "zone": "eu-north-1a" + } + }, + { + "eu-north-1b": { + "region": "eu-north-1", + "instance_type": "t3.medium", + "instance_ami": "ami-02e2af61198e99faf", + "node_count": 1, + "zone": "eu-north-1b" + } + }, + { + "eu-north-1c": { + "region": "eu-north-1", + "instance_type": "t3.medium", + "instance_ami": "ami-02e2af61198e99faf", + "node_count": 1, + "zone": "eu-north-1c" + } + }, + { + "eu-south-1a": { + "region": "eu-south-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0ded58aac79f90084", + "node_count": 1, + "zone": "eu-south-1a" + } + }, + { + "eu-south-1b": { + "region": "eu-south-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0ded58aac79f90084", + "node_count": 1, + "zone": "eu-south-1b" + } + }, + { + "eu-south-1c": { + "region": "eu-south-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0ded58aac79f90084", + "node_count": 1, + "zone": "eu-south-1c" + } + }, + { + "eu-south-2a": { + "region": "eu-south-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0895641da3c86443b", + "node_count": 1, + "zone": "eu-south-2a" + } + }, + { + "eu-south-2b": { + "region": "eu-south-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0895641da3c86443b", + "node_count": 1, + "zone": "eu-south-2b" + } + }, + { + "eu-south-2c": { + "region": "eu-south-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0895641da3c86443b", + "node_count": 1, + "zone": "eu-south-2c" + } + 
}, + { + "eu-west-1a": { + "region": "eu-west-1", + "instance_type": "t3.medium", + "instance_ami": "ami-032a56ad5e480189c", + "node_count": 1, + "zone": "eu-west-1a" + } + }, + { + "eu-west-1b": { + "region": "eu-west-1", + "instance_type": "t3.medium", + "instance_ami": "ami-032a56ad5e480189c", + "node_count": 1, + "zone": "eu-west-1b" + } + }, + { + "eu-west-1c": { + "region": "eu-west-1", + "instance_type": "t3.medium", + "instance_ami": "ami-032a56ad5e480189c", + "node_count": 1, + "zone": "eu-west-1c" + } + }, + { + "eu-west-2a": { + "region": "eu-west-2", + "instance_type": "t3.medium", + "instance_ami": "ami-06cff85354b67982b", + "node_count": 1, + "zone": "eu-west-2a" + } + }, + { + "eu-west-2b": { + "region": "eu-west-2", + "instance_type": "t3.medium", + "instance_ami": "ami-06cff85354b67982b", + "node_count": 1, + "zone": "eu-west-2b" + } + }, + { + "eu-west-2c": { + "region": "eu-west-2", + "instance_type": "t3.medium", + "instance_ami": "ami-06cff85354b67982b", + "node_count": 1, + "zone": "eu-west-2c" + } + }, + { + "eu-west-3a": { + "region": "eu-west-3", + "instance_type": "t3.medium", + "instance_ami": "ami-04a4acda26ca36de0", + "node_count": 1, + "zone": "eu-west-3a" + } + }, + { + "eu-west-3b": { + "region": "eu-west-3", + "instance_type": "t3.medium", + "instance_ami": "ami-04a4acda26ca36de0", + "node_count": 1, + "zone": "eu-west-3b" + } + }, + { + "eu-west-3c": { + "region": "eu-west-3", + "instance_type": "t3.medium", + "instance_ami": "ami-04a4acda26ca36de0", + "node_count": 1, + "zone": "eu-west-3c" + } + }, + { + "me-central-1a": { + "region": "me-central-1", + "instance_type": "t3.medium", + "instance_ami": "ami-07a33155d2d5abff0", + "node_count": 1, + "zone": "me-central-1a" + } + }, + { + "me-central-1b": { + "region": "me-central-1", + "instance_type": "t3.medium", + "instance_ami": "ami-07a33155d2d5abff0", + "node_count": 1, + "zone": "me-central-1b" + } + }, + { + "me-central-1c": { + "region": "me-central-1", + "instance_type": "t3.medium", + "instance_ami": "ami-07a33155d2d5abff0", + "node_count": 1, + "zone": "me-central-1c" + } + }, + { + "me-south-1a": { + "region": "me-south-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0b32599a51ef0ad90", + "node_count": 1, + "zone": "me-south-1a" + } + }, + { + "me-south-1b": { + "region": "me-south-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0b32599a51ef0ad90", + "node_count": 1, + "zone": "me-south-1b" + } + }, + { + "me-south-1c": { + "region": "me-south-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0b32599a51ef0ad90", + "node_count": 1, + "zone": "me-south-1c" + } + }, + { + "sa-east-1a": { + "region": "sa-east-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0780816dd7ce942fd", + "node_count": 1, + "zone": "sa-east-1a" + } + }, + { + "sa-east-1b": { + "region": "sa-east-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0780816dd7ce942fd", + "node_count": 1, + "zone": "sa-east-1b" + } + }, + { + "sa-east-1c": { + "region": "sa-east-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0780816dd7ce942fd", + "node_count": 1, + "zone": "sa-east-1c" + } + }, + { + "us-east-1a": { + "region": "us-east-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0e1bed4f06a3b463d", + "node_count": 1, + "zone": "us-east-1a" + } + }, + { + "us-east-1b": { + "region": "us-east-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0e1bed4f06a3b463d", + "node_count": 1, + "zone": "us-east-1b" + } + }, + { + "us-east-1c": { + "region": "us-east-1", + 
"instance_type": "t3.medium", + "instance_ami": "ami-0e1bed4f06a3b463d", + "node_count": 1, + "zone": "us-east-1c" + } + }, + { + "us-east-1d": { + "region": "us-east-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0e1bed4f06a3b463d", + "node_count": 1, + "zone": "us-east-1d" + } + }, + { + "us-east-1e": { + "region": "us-east-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0e1bed4f06a3b463d", + "node_count": 1, + "zone": "us-east-1e" + } + }, + { + "us-east-1f": { + "region": "us-east-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0e1bed4f06a3b463d", + "node_count": 1, + "zone": "us-east-1f" + } + }, + { + "us-east-2a": { + "region": "us-east-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0884d2865dbe9de4b", + "node_count": 1, + "zone": "us-east-2a" + } + }, + { + "us-east-2b": { + "region": "us-east-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0884d2865dbe9de4b", + "node_count": 1, + "zone": "us-east-2b" + } + }, + { + "us-east-2c": { + "region": "us-east-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0884d2865dbe9de4b", + "node_count": 1, + "zone": "us-east-2c" + } + }, + { + "us-west-1b": { + "region": "us-west-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0d413c682033e11fd", + "node_count": 1, + "zone": "us-west-1b" + } + }, + { + "us-west-1c": { + "region": "us-west-1", + "instance_type": "t3.medium", + "instance_ami": "ami-0d413c682033e11fd", + "node_count": 1, + "zone": "us-west-1c" + } + }, + { + "us-west-2a": { + "region": "us-west-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0606dd43116f5ed57", + "node_count": 1, + "zone": "us-west-2a" + } + }, + { + "us-west-2b": { + "region": "us-west-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0606dd43116f5ed57", + "node_count": 1, + "zone": "us-west-2b" + } + }, + { + "us-west-2c": { + "region": "us-west-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0606dd43116f5ed57", + "node_count": 1, + "zone": "us-west-2c" + } + }, + { + "us-west-2d": { + "region": "us-west-2", + "instance_type": "t3.medium", + "instance_ami": "ami-0606dd43116f5ed57", + "node_count": 1, + "zone": "us-west-2d" + } + } +] \ No newline at end of file diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/locations/all_locations.yaml b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/locations/all_locations.yaml new file mode 100644 index 00000000..2fafb242 --- /dev/null +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/locations/all_locations.yaml @@ -0,0 +1,546 @@ +- af-south-1a: + instance_ami: ami-05577c8d07333d909 + instance_type: t3.medium + node_count: 1 + region: af-south-1 + zone: af-south-1a +- af-south-1b: + instance_ami: ami-05577c8d07333d909 + instance_type: t3.medium + node_count: 1 + region: af-south-1 + zone: af-south-1b +- af-south-1c: + instance_ami: ami-05577c8d07333d909 + instance_type: t3.medium + node_count: 1 + region: af-south-1 + zone: af-south-1c +- ap-east-1a: + instance_ami: ami-0a0aa69b665a45b96 + instance_type: t3.medium + node_count: 1 + region: ap-east-1 + zone: ap-east-1a +- ap-east-1b: + instance_ami: ami-0a0aa69b665a45b96 + instance_type: t3.medium + node_count: 1 + region: ap-east-1 + zone: ap-east-1b +- ap-east-1c: + instance_ami: ami-0a0aa69b665a45b96 + instance_type: t3.medium + node_count: 1 + region: ap-east-1 + zone: ap-east-1c +- ap-northeast-1a: + instance_ami: ami-08f191dd81ec3a3de + instance_type: t3.medium + node_count: 1 + 
region: ap-northeast-1 + zone: ap-northeast-1a +- ap-northeast-1c: + instance_ami: ami-08f191dd81ec3a3de + instance_type: t3.medium + node_count: 1 + region: ap-northeast-1 + zone: ap-northeast-1c +- ap-northeast-1d: + instance_ami: ami-08f191dd81ec3a3de + instance_type: t3.medium + node_count: 1 + region: ap-northeast-1 + zone: ap-northeast-1d +- ap-northeast-2a: + instance_ami: ami-0077297a838d6761d + instance_type: t3.medium + node_count: 1 + region: ap-northeast-2 + zone: ap-northeast-2a +- ap-northeast-2b: + instance_ami: ami-0077297a838d6761d + instance_type: t3.medium + node_count: 1 + region: ap-northeast-2 + zone: ap-northeast-2b +- ap-northeast-2c: + instance_ami: ami-0077297a838d6761d + instance_type: t3.medium + node_count: 1 + region: ap-northeast-2 + zone: ap-northeast-2c +- ap-northeast-2d: + instance_ami: ami-0077297a838d6761d + instance_type: t3.medium + node_count: 1 + region: ap-northeast-2 + zone: ap-northeast-2d +- ap-northeast-3a: + instance_ami: ami-0a0bcba223270ed99 + instance_type: t3.medium + node_count: 1 + region: ap-northeast-3 + zone: ap-northeast-3a +- ap-northeast-3b: + instance_ami: ami-0a0bcba223270ed99 + instance_type: t3.medium + node_count: 1 + region: ap-northeast-3 + zone: ap-northeast-3b +- ap-northeast-3c: + instance_ami: ami-0a0bcba223270ed99 + instance_type: t3.medium + node_count: 1 + region: ap-northeast-3 + zone: ap-northeast-3c +- ap-south-1a: + instance_ami: ami-023a307f3d27ea427 + instance_type: t3.medium + node_count: 1 + region: ap-south-1 + zone: ap-south-1a +- ap-south-1b: + instance_ami: ami-023a307f3d27ea427 + instance_type: t3.medium + node_count: 1 + region: ap-south-1 + zone: ap-south-1b +- ap-south-1c: + instance_ami: ami-023a307f3d27ea427 + instance_type: t3.medium + node_count: 1 + region: ap-south-1 + zone: ap-south-1c +- ap-south-2a: + instance_ami: ami-0a94a70b8a1454a4b + instance_type: t3.medium + node_count: 1 + region: ap-south-2 + zone: ap-south-2a +- ap-south-2b: + instance_ami: ami-0a94a70b8a1454a4b + instance_type: t3.medium + node_count: 1 + region: ap-south-2 + zone: ap-south-2b +- ap-south-2c: + instance_ami: ami-0a94a70b8a1454a4b + instance_type: t3.medium + node_count: 1 + region: ap-south-2 + zone: ap-south-2c +- ap-southeast-1a: + instance_ami: ami-0198a868663199764 + instance_type: t3.medium + node_count: 1 + region: ap-southeast-1 + zone: ap-southeast-1a +- ap-southeast-1b: + instance_ami: ami-0198a868663199764 + instance_type: t3.medium + node_count: 1 + region: ap-southeast-1 + zone: ap-southeast-1b +- ap-southeast-1c: + instance_ami: ami-0198a868663199764 + instance_type: t3.medium + node_count: 1 + region: ap-southeast-1 + zone: ap-southeast-1c +- ap-southeast-2a: + instance_ami: ami-01e2093820bf84df1 + instance_type: t3.medium + node_count: 1 + region: ap-southeast-2 + zone: ap-southeast-2a +- ap-southeast-2b: + instance_ami: ami-01e2093820bf84df1 + instance_type: t3.medium + node_count: 1 + region: ap-southeast-2 + zone: ap-southeast-2b +- ap-southeast-2c: + instance_ami: ami-01e2093820bf84df1 + instance_type: t3.medium + node_count: 1 + region: ap-southeast-2 + zone: ap-southeast-2c +- ap-southeast-3a: + instance_ami: ami-01d23da707abb0f2f + instance_type: t3.medium + node_count: 1 + region: ap-southeast-3 + zone: ap-southeast-3a +- ap-southeast-3b: + instance_ami: ami-01d23da707abb0f2f + instance_type: t3.medium + node_count: 1 + region: ap-southeast-3 + zone: ap-southeast-3b +- ap-southeast-3c: + instance_ami: ami-01d23da707abb0f2f + instance_type: t3.medium + node_count: 1 + region: ap-southeast-3 + 
zone: ap-southeast-3c +- ap-southeast-4a: + instance_ami: ami-05e0c17592d882511 + instance_type: t3.medium + node_count: 1 + region: ap-southeast-4 + zone: ap-southeast-4a +- ap-southeast-4b: + instance_ami: ami-05e0c17592d882511 + instance_type: t3.medium + node_count: 1 + region: ap-southeast-4 + zone: ap-southeast-4b +- ap-southeast-4c: + instance_ami: ami-05e0c17592d882511 + instance_type: t3.medium + node_count: 1 + region: ap-southeast-4 + zone: ap-southeast-4c +- ap-southeast-5a: + instance_ami: ami-0a88b63550bbb2b1a + instance_type: t3.medium + node_count: 1 + region: ap-southeast-5 + zone: ap-southeast-5a +- ap-southeast-5b: + instance_ami: ami-0a88b63550bbb2b1a + instance_type: t3.medium + node_count: 1 + region: ap-southeast-5 + zone: ap-southeast-5b +- ap-southeast-5c: + instance_ami: ami-0a88b63550bbb2b1a + instance_type: t3.medium + node_count: 1 + region: ap-southeast-5 + zone: ap-southeast-5c +- ca-central-1a: + instance_ami: ami-0a474b3a85d51a5e5 + instance_type: t3.medium + node_count: 1 + region: ca-central-1 + zone: ca-central-1a +- ca-central-1b: + instance_ami: ami-0a474b3a85d51a5e5 + instance_type: t3.medium + node_count: 1 + region: ca-central-1 + zone: ca-central-1b +- ca-central-1d: + instance_ami: ami-0a474b3a85d51a5e5 + instance_type: t3.medium + node_count: 1 + region: ca-central-1 + zone: ca-central-1d +- ca-west-1a: + instance_ami: ami-0bd5d3965e2cc8c99 + instance_type: t3.medium + node_count: 1 + region: ca-west-1 + zone: ca-west-1a +- ca-west-1b: + instance_ami: ami-0bd5d3965e2cc8c99 + instance_type: t3.medium + node_count: 1 + region: ca-west-1 + zone: ca-west-1b +- ca-west-1c: + instance_ami: ami-0bd5d3965e2cc8c99 + instance_type: t3.medium + node_count: 1 + region: ca-west-1 + zone: ca-west-1c +- eu-central-1a: + instance_ami: ami-03b3b5f65db7e5c6f + instance_type: t3.medium + node_count: 1 + region: eu-central-1 + zone: eu-central-1a +- eu-central-1b: + instance_ami: ami-03b3b5f65db7e5c6f + instance_type: t3.medium + node_count: 1 + region: eu-central-1 + zone: eu-central-1b +- eu-central-1c: + instance_ami: ami-03b3b5f65db7e5c6f + instance_type: t3.medium + node_count: 1 + region: eu-central-1 + zone: eu-central-1c +- eu-central-2a: + instance_ami: ami-0039b5f24283949b4 + instance_type: t3.medium + node_count: 1 + region: eu-central-2 + zone: eu-central-2a +- eu-central-2b: + instance_ami: ami-0039b5f24283949b4 + instance_type: t3.medium + node_count: 1 + region: eu-central-2 + zone: eu-central-2b +- eu-central-2c: + instance_ami: ami-0039b5f24283949b4 + instance_type: t3.medium + node_count: 1 + region: eu-central-2 + zone: eu-central-2c +- eu-north-1a: + instance_ami: ami-02e2af61198e99faf + instance_type: t3.medium + node_count: 1 + region: eu-north-1 + zone: eu-north-1a +- eu-north-1b: + instance_ami: ami-02e2af61198e99faf + instance_type: t3.medium + node_count: 1 + region: eu-north-1 + zone: eu-north-1b +- eu-north-1c: + instance_ami: ami-02e2af61198e99faf + instance_type: t3.medium + node_count: 1 + region: eu-north-1 + zone: eu-north-1c +- eu-south-1a: + instance_ami: ami-0ded58aac79f90084 + instance_type: t3.medium + node_count: 1 + region: eu-south-1 + zone: eu-south-1a +- eu-south-1b: + instance_ami: ami-0ded58aac79f90084 + instance_type: t3.medium + node_count: 1 + region: eu-south-1 + zone: eu-south-1b +- eu-south-1c: + instance_ami: ami-0ded58aac79f90084 + instance_type: t3.medium + node_count: 1 + region: eu-south-1 + zone: eu-south-1c +- eu-south-2a: + instance_ami: ami-0895641da3c86443b + instance_type: t3.medium + node_count: 1 + 
region: eu-south-2 + zone: eu-south-2a +- eu-south-2b: + instance_ami: ami-0895641da3c86443b + instance_type: t3.medium + node_count: 1 + region: eu-south-2 + zone: eu-south-2b +- eu-south-2c: + instance_ami: ami-0895641da3c86443b + instance_type: t3.medium + node_count: 1 + region: eu-south-2 + zone: eu-south-2c +- eu-west-1a: + instance_ami: ami-032a56ad5e480189c + instance_type: t3.medium + node_count: 1 + region: eu-west-1 + zone: eu-west-1a +- eu-west-1b: + instance_ami: ami-032a56ad5e480189c + instance_type: t3.medium + node_count: 1 + region: eu-west-1 + zone: eu-west-1b +- eu-west-1c: + instance_ami: ami-032a56ad5e480189c + instance_type: t3.medium + node_count: 1 + region: eu-west-1 + zone: eu-west-1c +- eu-west-2a: + instance_ami: ami-06cff85354b67982b + instance_type: t3.medium + node_count: 1 + region: eu-west-2 + zone: eu-west-2a +- eu-west-2b: + instance_ami: ami-06cff85354b67982b + instance_type: t3.medium + node_count: 1 + region: eu-west-2 + zone: eu-west-2b +- eu-west-2c: + instance_ami: ami-06cff85354b67982b + instance_type: t3.medium + node_count: 1 + region: eu-west-2 + zone: eu-west-2c +- eu-west-3a: + instance_ami: ami-04a4acda26ca36de0 + instance_type: t3.medium + node_count: 1 + region: eu-west-3 + zone: eu-west-3a +- eu-west-3b: + instance_ami: ami-04a4acda26ca36de0 + instance_type: t3.medium + node_count: 1 + region: eu-west-3 + zone: eu-west-3b +- eu-west-3c: + instance_ami: ami-04a4acda26ca36de0 + instance_type: t3.medium + node_count: 1 + region: eu-west-3 + zone: eu-west-3c +- me-central-1a: + instance_ami: ami-07a33155d2d5abff0 + instance_type: t3.medium + node_count: 1 + region: me-central-1 + zone: me-central-1a +- me-central-1b: + instance_ami: ami-07a33155d2d5abff0 + instance_type: t3.medium + node_count: 1 + region: me-central-1 + zone: me-central-1b +- me-central-1c: + instance_ami: ami-07a33155d2d5abff0 + instance_type: t3.medium + node_count: 1 + region: me-central-1 + zone: me-central-1c +- me-south-1a: + instance_ami: ami-0b32599a51ef0ad90 + instance_type: t3.medium + node_count: 1 + region: me-south-1 + zone: me-south-1a +- me-south-1b: + instance_ami: ami-0b32599a51ef0ad90 + instance_type: t3.medium + node_count: 1 + region: me-south-1 + zone: me-south-1b +- me-south-1c: + instance_ami: ami-0b32599a51ef0ad90 + instance_type: t3.medium + node_count: 1 + region: me-south-1 + zone: me-south-1c +- sa-east-1a: + instance_ami: ami-0780816dd7ce942fd + instance_type: t3.medium + node_count: 1 + region: sa-east-1 + zone: sa-east-1a +- sa-east-1b: + instance_ami: ami-0780816dd7ce942fd + instance_type: t3.medium + node_count: 1 + region: sa-east-1 + zone: sa-east-1b +- sa-east-1c: + instance_ami: ami-0780816dd7ce942fd + instance_type: t3.medium + node_count: 1 + region: sa-east-1 + zone: sa-east-1c +- us-east-1a: + instance_ami: ami-0e1bed4f06a3b463d + instance_type: t3.medium + node_count: 1 + region: us-east-1 + zone: us-east-1a +- us-east-1b: + instance_ami: ami-0e1bed4f06a3b463d + instance_type: t3.medium + node_count: 1 + region: us-east-1 + zone: us-east-1b +- us-east-1c: + instance_ami: ami-0e1bed4f06a3b463d + instance_type: t3.medium + node_count: 1 + region: us-east-1 + zone: us-east-1c +- us-east-1d: + instance_ami: ami-0e1bed4f06a3b463d + instance_type: t3.medium + node_count: 1 + region: us-east-1 + zone: us-east-1d +- us-east-1e: + instance_ami: ami-0e1bed4f06a3b463d + instance_type: t3.medium + node_count: 1 + region: us-east-1 + zone: us-east-1e +- us-east-1f: + instance_ami: ami-0e1bed4f06a3b463d + instance_type: t3.medium + node_count: 1 + 
region: us-east-1 + zone: us-east-1f +- us-east-2a: + instance_ami: ami-0884d2865dbe9de4b + instance_type: t3.medium + node_count: 1 + region: us-east-2 + zone: us-east-2a +- us-east-2b: + instance_ami: ami-0884d2865dbe9de4b + instance_type: t3.medium + node_count: 1 + region: us-east-2 + zone: us-east-2b +- us-east-2c: + instance_ami: ami-0884d2865dbe9de4b + instance_type: t3.medium + node_count: 1 + region: us-east-2 + zone: us-east-2c +- us-west-1b: + instance_ami: ami-0d413c682033e11fd + instance_type: t3.medium + node_count: 1 + region: us-west-1 + zone: us-west-1b +- us-west-1c: + instance_ami: ami-0d413c682033e11fd + instance_type: t3.medium + node_count: 1 + region: us-west-1 + zone: us-west-1c +- us-west-2a: + instance_ami: ami-0606dd43116f5ed57 + instance_type: t3.medium + node_count: 1 + region: us-west-2 + zone: us-west-2a +- us-west-2b: + instance_ami: ami-0606dd43116f5ed57 + instance_type: t3.medium + node_count: 1 + region: us-west-2 + zone: us-west-2b +- us-west-2c: + instance_ami: ami-0606dd43116f5ed57 + instance_type: t3.medium + node_count: 1 + region: us-west-2 + zone: us-west-2c +- us-west-2d: + instance_ami: ami-0606dd43116f5ed57 + instance_type: t3.medium + node_count: 1 + region: us-west-2 + zone: us-west-2d diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/main.tf b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/main.tf index 8e01063c..ee42e8c2 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/main.tf +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/main.tf @@ -1,9 +1,15 @@ # Configure the AWS Provider for the current workspace/region + + provider "aws" { region = var.region shared_config_files = ["~/.aws/config"] shared_credentials_files = ["~/.aws/credentials"] - profile = "default" +} + +# Create a unique identifier for this zone deployment +locals { + zone_id = "${var.region}-${var.zone}" } module "region" { @@ -13,7 +19,7 @@ module "region" { zone = var.zone instance_ami = var.instance_ami node_count = var.node_count - app_tag = var.app_tag + app_tag = "${var.app_tag}-${local.zone_id}" aws_instance_type = var.instance_type public_key_path = var.public_key_path private_key_path = var.private_key_path diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/instance/scripts/install_docker.sh b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/instance/scripts/install_docker.sh index bc674c9b..c74e5d18 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/instance/scripts/install_docker.sh +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/instance/scripts/install_docker.sh @@ -1,5 +1,8 @@ #!/usr/bin/env bash +# Exit on error +set -e + # Detect OS if [ -f /etc/os-release ]; then . /etc/os-release @@ -8,23 +11,74 @@ fi echo "Detected OS: $OS" +# Function to retry commands +retry_command() { + local n=0 + local max=5 + local delay=15 + while true; do + "$@" && break || { + if [[ $n -lt $max ]]; then + ((n++)) + echo "Command failed. Attempt $n/$max. Retrying in $delay seconds..." + sleep $delay + else + echo "The command has failed after $n attempts." + return 1 + fi + } + done +} + # Install Docker based on available package manager if command -v apt-get >/dev/null 2>&1; then # Debian/Ubuntu installation - apt-get update - apt-get install -y ca-certificates curl gnupg + echo "Using apt package manager..." 
+ + # Update package list with retry + retry_command apt-get update + + # Install prerequisites with retry + retry_command apt-get install -y \ + ca-certificates \ + curl \ + gnupg \ + pigz \ + libltdl7 \ + libslirp0 \ + slirp4netns \ + apt-transport-https \ + software-properties-common + + # Setup Docker repository install -m 0755 -d /etc/apt/keyrings - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg + rm -f /etc/apt/keyrings/docker.gpg + retry_command curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg chmod a+r /etc/apt/keyrings/docker.gpg - echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null - apt-get update - apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin + + # Add Docker repository + echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ + $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ + tee /etc/apt/sources.list.d/docker.list > /dev/null + + # Update again with retry after adding Docker repository + retry_command apt-get update + + # Install Docker packages with retry + retry_command apt-get install -y \ + docker-ce \ + docker-ce-cli \ + containerd.io \ + docker-buildx-plugin \ + docker-compose-plugin elif command -v yum >/dev/null 2>&1; then # DNF-based systems (Amazon Linux 2023, Fedora, RHEL) - yum install docker -y + echo "Using yum package manager..." + retry_command yum install -y docker mkdir -p /usr/local/lib/docker/cli-plugins/ - curl -SL https://github.com/docker/compose/releases/download/v2.24.5/docker-compose-linux-x86_64 -o /usr/local/lib/docker/cli-plugins/docker-compose + retry_command curl -SL https://github.com/docker/compose/releases/download/v2.24.5/docker-compose-linux-x86_64 -o /usr/local/lib/docker/cli-plugins/docker-compose chmod +x /usr/local/lib/docker/cli-plugins/docker-compose else @@ -32,10 +86,27 @@ else exit 1 fi -# Start and enable Docker service -systemctl start docker -systemctl enable docker +# Start and enable Docker service with retry +echo "Starting Docker service..." +systemctl start docker || { + echo "Failed to start Docker service. Waiting 10 seconds and trying again..." + sleep 10 + systemctl start docker +} + +echo "Enabling Docker service..." +systemctl enable docker || { + echo "Failed to enable Docker service. Waiting 10 seconds and trying again..." + sleep 10 + systemctl enable docker +} # Verify installations -docker --version -docker compose version +echo "Verifying Docker installation..." 
+if command -v docker >/dev/null 2>&1; then + docker --version + docker compose version +else + echo "Docker installation verification failed" + exit 1 +fi diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/instance/scripts/startup.sh b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/instance/scripts/startup.sh index ca487235..5d094185 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/instance/scripts/startup.sh +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/instance/scripts/startup.sh @@ -37,7 +37,7 @@ get_cloud_metadata() { METADATA=$(curl -s -H "Metadata:true" "http://169.254.169.254/metadata/instance?api-version=2021-02-01") REGION=$(echo "$METADATA" | jq -r .compute.location) ZONE=$(echo "$METADATA" | jq -r .compute.zone) - PUBLIC_IP=$(curl -s -H "Metadata:true" "http://169.254.169.254/metadata/instance/network/interface/0/ipv4/ipAddress/0/publicIpAddress?api-version=2021-02-01&format=text") + PUBLIC_IP=$(curl -s ip.me) PRIVATE_IP=$(echo "$METADATA" | jq -r .network.interface[0].ipv4.ipAddress[0].privateIpAddress) INSTANCE_ID=$(echo "$METADATA" | jq -r .compute.vmId) INSTANCE_TYPE=$(echo "$METADATA" | jq -r .compute.vmSize) diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/instance/versions.tf b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/instance/versions.tf index edc22ffc..a9e3c714 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/instance/versions.tf +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/instance/versions.tf @@ -2,7 +2,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = "~> 4.0" + version = "~> 5.0" } random = { source = "hashicorp/random" diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/network/versions.tf b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/network/versions.tf index b3b846e1..a540207e 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/network/versions.tf +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/network/versions.tf @@ -2,8 +2,8 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = "~> 4.0" + version = "~> 5.0" configuration_aliases = [aws] } } -} +} diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/region/main.tf b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/region/main.tf index 3ff4178a..f4eff6bd 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/region/main.tf +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/region/main.tf @@ -5,9 +5,9 @@ module "networkModule" { region = var.region zone = var.zone - cidr_block_range = "10.0.0.0/16" - subnet1_cidr_block_range = "10.0.1.0/24" - subnet2_cidr_block_range = "10.0.2.0/24" + cidr_block_range = "10.${index(data.aws_availability_zones.available.names, var.zone)}.0.0/16" + subnet1_cidr_block_range = "10.${index(data.aws_availability_zones.available.names, var.zone)}.1.0/24" + subnet2_cidr_block_range = "10.${index(data.aws_availability_zones.available.names, var.zone)}.2.0/24" providers = { aws = aws @@ -50,3 +50,8 @@ module "instanceModule" { } } +# Get list of 
availability zones for CIDR block calculation +data "aws_availability_zones" "available" { + state = "available" +} + diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/region/versions.tf b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/region/versions.tf index 8c2e4cff..a540207e 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/region/versions.tf +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/region/versions.tf @@ -2,7 +2,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = "~> 4.0" + version = "~> 5.0" configuration_aliases = [aws] } } diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/securityGroup/versions.tf b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/securityGroup/versions.tf index b3b846e1..a540207e 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/securityGroup/versions.tf +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/modules/securityGroup/versions.tf @@ -2,8 +2,8 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = "~> 4.0" + version = "~> 5.0" configuration_aliases = [aws] } } -} +} diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/utility_scripts/generate-all-locations-file.py b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/utility_scripts/generate-all-locations-file.py new file mode 100755 index 00000000..08e201cb --- /dev/null +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/utility_scripts/generate-all-locations-file.py @@ -0,0 +1,119 @@ +#!/usr/bin/env -S uv run --script +# /// script +# requires-python = ">=3.11" +# dependencies = [ +# "pyyaml", +# "boto3", +# ] +# /// + +import json +from pathlib import Path + +import boto3 +import yaml + + +def get_all_regions(): + """Get all AWS regions.""" + ec2 = boto3.client("ec2") + regions = [region["RegionName"] for region in ec2.describe_regions()["Regions"]] + return sorted(regions) + + +def get_region_zones(region): + """Get all availability zones for a given region.""" + ec2 = boto3.client("ec2", region_name=region) + zones = [ + zone["ZoneName"] + for zone in ec2.describe_availability_zones( + Filters=[{"Name": "state", "Values": ["available"]}] + )["AvailabilityZones"] + ] + return sorted(zones) + + +def get_latest_ubuntu_ami(region): + """Get the latest Ubuntu 22.04 LTS AMI ID for a region.""" + ec2 = boto3.client("ec2", region_name=region) + + try: + response = ec2.describe_images( + Filters=[ + { + "Name": "name", + "Values": [ + "ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-amd64-server-*" + ], + }, + {"Name": "state", "Values": ["available"]}, + {"Name": "architecture", "Values": ["x86_64"]}, + ], + Owners=["099720109477"], # Canonical's AWS account ID + ) + + # Sort images by creation date + images = sorted( + response["Images"], key=lambda x: x["CreationDate"], reverse=True + ) + if images: + return images[0]["ImageId"] + except Exception as e: + print(f"Warning: Could not get AMI for region {region}: {str(e)}") + + return None + + +def generate_locations_file(): + """Generate a YAML file with all AWS zones as top-level entries.""" + locations = [] + + for region in get_all_regions(): + try: + zones = get_region_zones(region) + ami_id = get_latest_ubuntu_ami(region) + + if ami_id: + # Create a 
zone-based configuration for each zone + for zone in zones: + zone_config = { + zone: { + "region": region, + "instance_type": "t3.medium", + "instance_ami": ami_id, + "node_count": 1, + "zone": zone, + } + } + locations.append(zone_config) + except Exception as e: + print(f"Warning: Could not process region {region}: {str(e)}") + + # Create the locations directory if it doesn't exist + output_dir = Path(__file__).parent.parent / "locations" + output_dir.mkdir(exist_ok=True) + + # Write YAML file + yaml_path = output_dir / "all_locations.yaml" + with open(yaml_path, "w") as f: + yaml.dump(locations, f, default_flow_style=False) + + # Write JSON file (as an alternative format) + json_path = output_dir / "all_locations.json" + with open(json_path, "w") as f: + json.dump(locations, f, indent=2) + + print("Generated files:") + print(f"YAML: {yaml_path}") + print(f"JSON: {json_path}") + + # Calculate total nodes + total_nodes = sum( + config[zone]["node_count"] for config in locations for zone in config + ) + print(f"\nTotal zones: {len(locations)}") + print(f"Total nodes: {total_nodes}") + + +if __name__ == "__main__": + generate_locations_file() diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/utility_scripts/generate-locations.sh b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/utility_scripts/generate-locations.sh deleted file mode 100755 index a518815d..00000000 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/utility_scripts/generate-locations.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash - -# Output file -OUTPUT_FILE="all_locations.yaml" - -# Truncate the output file if it exists -truncate -s 0 $OUTPUT_FILE - -# Initialize YAML file -echo "# Auto-generated locations configuration" > $OUTPUT_FILE -echo "# Using Amazon Linux 2023 AMIs" >> $OUTPUT_FILE - -# Get all available regions -REGIONS=$(aws ec2 describe-regions --query "Regions[].RegionName" --output text) - -for REGION in $REGIONS; do - # Get the latest Amazon Linux 2023 AMI - AMI=$(aws ec2 describe-images \ - --region "$REGION" \ - --owners amazon \ - --filters "Name=name,Values=al2023-ami-2023.*-x86_64" "Name=state,Values=available" \ - --query 'sort_by(Images, &CreationDate)[-1].ImageId' \ - --output text) - - # Skip if no AMI found - if [ "$AMI" == "None" ] || [ -z "$AMI" ]; then - echo "Skipping $REGION - No Amazon Linux 2023 AMI found" - continue - fi - - # Get the first availability zone - ZONE=$(aws ec2 describe-availability-zones \ - --region $REGION \ - --query "AvailabilityZones[0].ZoneName" \ - --output text) - - # Skip if no availability zone found - if [ "$ZONE" == "None" ] || [ -z "$ZONE" ]; then - echo "Skipping $REGION - No availability zone found" - continue - fi - - # Append to YAML file - REGION_BLOCK="- $REGION:" - REGION_BLOCK="$REGION_BLOCK\n region: $REGION" - REGION_BLOCK="$REGION_BLOCK\n zone: $ZONE" - REGION_BLOCK="$REGION_BLOCK\n instance_type: t3.medium" - REGION_BLOCK="$REGION_BLOCK\n instance_ami: $AMI" - REGION_BLOCK="$REGION_BLOCK\n node_count: 1" - echo -e "$REGION_BLOCK" >> $OUTPUT_FILE - echo -e "Added $REGION with AMI $AMI" -done - -echo "Generated locations.yaml successfully!" 
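Note: the Python generator added above supersedes the deleted generate-locations.sh — instead of recording only the first availability zone per region, it emits one entry per available zone and writes both YAML and JSON catalogs under locations/. A minimal usage sketch, assuming uv is installed and the active AWS credentials can call DescribeRegions, DescribeAvailabilityZones, and DescribeImages (the script's inline metadata pulls in boto3 and pyyaml on first run):

    # run from the AWS Terraform example directory
    cd setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS
    ./utility_scripts/generate-all-locations-file.py
    # expected outputs:
    #   locations/all_locations.yaml
    #   locations/all_locations.json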
diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/versions.tf b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/versions.tf index 5e0defad..e5c884a7 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/versions.tf +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-AWS/versions.tf @@ -3,7 +3,8 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = "~> 4.0" + version = "~> 5.0" } } } + diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/modules/instance/scripts/bacalhau-startup.service b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/modules/instance/scripts/bacalhau-startup.service index 7c8c29da..b94cbee9 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/modules/instance/scripts/bacalhau-startup.service +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/modules/instance/scripts/bacalhau-startup.service @@ -1,6 +1,6 @@ [Unit] Description=Bacalhau Startup Script -After=docker.service network-online.target +After=network-online.target Wants=network-online.target [Service] diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/modules/instance/scripts/healthz-web.service b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/modules/instance/scripts/healthz-web.service index 37ee82fe..562312ae 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/modules/instance/scripts/healthz-web.service +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/modules/instance/scripts/healthz-web.service @@ -1,6 +1,6 @@ [Unit] Description=Health Check Web Server -After=docker.service network-online.target +After=network-online.target Wants=network-online.target [Service] diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/modules/instance/scripts/install_docker.sh b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/modules/instance/scripts/install_docker.sh index bc674c9b..5ac2cede 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/modules/instance/scripts/install_docker.sh +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/modules/instance/scripts/install_docker.sh @@ -1,5 +1,8 @@ #!/usr/bin/env bash +# Exit on error +set -e + # Detect OS if [ -f /etc/os-release ]; then . /etc/os-release @@ -8,23 +11,75 @@ fi echo "Detected OS: $OS" +# Function to retry commands +retry_command() { + local n=0 + local max=5 + local delay=15 + while true; do + "$@" && break || { + if [[ $n -lt $max ]]; then + ((n++)) + echo "Command failed. Attempt $n/$max. Retrying in $delay seconds..." + sleep $delay + else + echo "The command has failed after $n attempts." + return 1 + fi + } + done +} + # Install Docker based on available package manager if command -v apt-get >/dev/null 2>&1; then # Debian/Ubuntu installation - apt-get update - apt-get install -y ca-certificates curl gnupg + echo "Using apt package manager..." 
+ + # Update package list with retry + retry_command apt-get update + + # Install prerequisites with retry + retry_command apt-get install -y \ + ca-certificates \ + curl \ + gnupg \ + pigz \ + jq \ + libltdl7 \ + libslirp0 \ + slirp4netns \ + apt-transport-https \ + software-properties-common + + # Setup Docker repository install -m 0755 -d /etc/apt/keyrings - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg + rm -f /etc/apt/keyrings/docker.gpg + retry_command curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg chmod a+r /etc/apt/keyrings/docker.gpg - echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null - apt-get update - apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin + + # Add Docker repository + echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ + $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ + tee /etc/apt/sources.list.d/docker.list > /dev/null + + # Update again with retry after adding Docker repository + retry_command apt-get update + + # Install Docker packages with retry + retry_command apt-get install -y \ + docker-ce \ + docker-ce-cli \ + containerd.io \ + docker-buildx-plugin \ + docker-compose-plugin elif command -v yum >/dev/null 2>&1; then # DNF-based systems (Amazon Linux 2023, Fedora, RHEL) - yum install docker -y + echo "Using yum package manager..." + retry_command yum install -y docker mkdir -p /usr/local/lib/docker/cli-plugins/ - curl -SL https://github.com/docker/compose/releases/download/v2.24.5/docker-compose-linux-x86_64 -o /usr/local/lib/docker/cli-plugins/docker-compose + retry_command curl -SL https://github.com/docker/compose/releases/download/v2.24.5/docker-compose-linux-x86_64 -o /usr/local/lib/docker/cli-plugins/docker-compose chmod +x /usr/local/lib/docker/cli-plugins/docker-compose else @@ -32,10 +87,27 @@ else exit 1 fi -# Start and enable Docker service -systemctl start docker -systemctl enable docker +# Start and enable Docker service with retry +echo "Starting Docker service..." +systemctl start docker || { + echo "Failed to start Docker service. Waiting 10 seconds and trying again..." + sleep 10 + systemctl start docker +} + +echo "Enabling Docker service..." +systemctl enable docker || { + echo "Failed to enable Docker service. Waiting 10 seconds and trying again..." + sleep 10 + systemctl enable docker +} # Verify installations -docker --version -docker compose version +echo "Verifying Docker installation..." 
+if command -v docker >/dev/null 2>&1; then + docker --version + docker compose version +else + echo "Docker installation verification failed" + exit 1 +fi diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/modules/instance/scripts/startup.sh b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/modules/instance/scripts/startup.sh index ca487235..5d094185 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/modules/instance/scripts/startup.sh +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/modules/instance/scripts/startup.sh @@ -37,7 +37,7 @@ get_cloud_metadata() { METADATA=$(curl -s -H "Metadata:true" "http://169.254.169.254/metadata/instance?api-version=2021-02-01") REGION=$(echo "$METADATA" | jq -r .compute.location) ZONE=$(echo "$METADATA" | jq -r .compute.zone) - PUBLIC_IP=$(curl -s -H "Metadata:true" "http://169.254.169.254/metadata/instance/network/interface/0/ipv4/ipAddress/0/publicIpAddress?api-version=2021-02-01&format=text") + PUBLIC_IP=$(curl -s ip.me) PRIVATE_IP=$(echo "$METADATA" | jq -r .network.interface[0].ipv4.ipAddress[0].privateIpAddress) INSTANCE_ID=$(echo "$METADATA" | jq -r .compute.vmId) INSTANCE_TYPE=$(echo "$METADATA" | jq -r .compute.vmSize) diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/utility_scripts/get_vm_list.py b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/utility_scripts/get_vm_list.py new file mode 100755 index 00000000..6bed2002 --- /dev/null +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/utility_scripts/get_vm_list.py @@ -0,0 +1,72 @@ +#!/usr/bin/env uv run -s +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# ] +# /// + +import json +import subprocess + +# Define the VM series to check +VM_SERIES = ["B", "A", "D", "F"] + + +def get_vm_skus(region): + # Run the Azure CLI command to get VM SKUs + result = subprocess.run( + [ + "az", + "vm", + "list-skus", + "--location", + region, + "--resource-type", + "virtualMachines", + "--output", + "json", + ], + capture_output=True, + text=True, + ) + if result.returncode != 0: + print("Error fetching VM SKUs:", result.stderr) + return [] + return json.loads(result.stdout) + + +def filter_vm_skus(skus, series): + # Filter for VM sizes in the specified series + filtered_skus = [] + for sku in skus: + if any(sku["name"].startswith(f"Standard_{s}") for s in series): + filtered_skus.append(sku) + return filtered_skus + + +def main(): + # Define the region to check + region = "eastus" + + # Get VM SKUs for the region + skus = get_vm_skus(region) + if not skus: + print("No VM SKUs found.") + return + + # Filter for VM sizes in the specified series + filtered_skus = filter_vm_skus(skus, VM_SERIES) + if not filtered_skus: + print(f"No VM sizes in series {VM_SERIES} found in {region}.") + return + + # Print the filtered VM sizes + print(f"Available VM sizes in {region}:") + for sku in filtered_skus: + print( + f"Name: {sku['name']}, Family: {sku['family']}, Locations: {sku['locationInfo'][0]['location']}" + ) + + +if __name__ == "__main__": + main() diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/utility_scripts/region_checker.py b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/utility_scripts/region_checker.py new file mode 100755 index 00000000..3766f821 --- /dev/null +++ 
b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/utility_scripts/region_checker.py @@ -0,0 +1,167 @@ +#!/usr/bin/env uv run -s +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "rich" +# ] +# /// +import argparse +import json +import subprocess +from concurrent.futures import ThreadPoolExecutor, as_completed +from datetime import datetime + +from rich.console import Console +from rich.live import Live +from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn +from rich.table import Table + +console = Console() + + +def create_status_table(regions, results=None): + """Create a table showing status for each region.""" + table = Table(title="Region VM Size Availability Check") + table.add_column("Region") + table.add_column("Display Name") + table.add_column("Status") + + results = results or {} + for region in sorted(regions, key=lambda x: x["name"]): + name = region["name"] + status = results.get(name, "⏳ Checking...") + table.add_row(name, region["displayName"], status) + return table + + +def check_region(region, vm_size): + """Check VM size availability for a single region.""" + query = f"[?name=='{vm_size}'].name" + cmd = [ + "az", + "vm", + "list-sizes", + "--location", + region["name"], + "--query", + query, + "-o", + "json", + ] + + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode == 0: + sizes = json.loads(result.stdout) + if sizes: # If the VM size is available in this region + return region["name"], "✅ Available" + return region["name"], "❌ Not Available" + + +def get_vm_availability(vm_size, debug=False): + """Get VM size availability for all regions in parallel.""" + try: + # First get all regions + query = "[].{name:name, displayName:displayName}" + cmd = [ + "az", + "account", + "list-locations", + "--query", + query, + "-o", + "json", + ] + if debug: + console.print("[yellow]Fetching regions...[/yellow]") + + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode != 0: + console.print(f"[red]Error: {result.stderr}[/red]") + return None + + regions = json.loads(result.stdout) + available_regions = [] + results = {} + + # Create and display the live table + with Live(create_status_table(regions, results), refresh_per_second=4) as live: + # Check regions in parallel + with ThreadPoolExecutor(max_workers=10) as executor: + future_to_region = { + executor.submit(check_region, region, vm_size): region + for region in regions + } + + for future in as_completed(future_to_region): + region = future_to_region[future] + try: + region_name, status = future.result() + results[region_name] = status + if status == "✅ Available": + available_regions.append(region) + live.update(create_status_table(regions, results)) + except Exception as e: + results[region["name"]] = f"❌ Error: {str(e)}" + live.update(create_status_table(regions, results)) + + return available_regions + except Exception as e: + console.print(f"[red]Error: {str(e)}[/red]") + return [] + + +def generate_locations_section(regions, vm_size): + """Generate locations section for available regions.""" + return { + region["name"]: {"machine_type": vm_size, "node_count": 1} for region in regions + } + + +def main(vm_size, debug=False): + console.print(f"[cyan]Checking availability for VM size: {vm_size}[/cyan]") + + # Get VM availability data + available_regions = get_vm_availability(vm_size, debug) + + if not available_regions: + console.print(f"\n[red]VM size {vm_size} not found in any region[/red]") + 
return + + # Generate the locations section + locations_section = generate_locations_section(available_regions, vm_size) + + # Print to console + console.print("\n[green]Available Regions Summary:[/green]") + for region in sorted(available_regions, key=lambda x: x["name"]): + console.print(f"✅ {region['name']} ({region['displayName']})") + + # Save to file + output_file = ( + f"vm_availability_{vm_size}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json" + ) + with open(output_file, "w") as f: + json.dump({"locations": locations_section}, f, indent=4) + + console.print( + f"\n[blue]Generated Locations Configuration saved to: {output_file}[/blue]" + ) + console.print(f"\n[cyan]Summary:[/cyan]") + console.print(f"Available regions: {len(available_regions)}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--vm-size", + type=str, + default="Standard_B2ms", + help="VM size to check (default: Standard_B2ms)", + ) + parser.add_argument( + "--debug", + action="store_true", + help="Print Azure CLI commands being executed", + ) + args = parser.parse_args() + + main(args.vm_size, args.debug) diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/utility_scripts/vm_availability_Standard_B2ms_20250109_134546.json b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/utility_scripts/vm_availability_Standard_B2ms_20250109_134546.json new file mode 100644 index 00000000..35658154 --- /dev/null +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-Azure/utility_scripts/vm_availability_Standard_B2ms_20250109_134546.json @@ -0,0 +1,176 @@ +{ + "locations": { + "northeurope": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "westus2": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "westeurope": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "westus3": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "australiaeast": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "eastus": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "southeastasia": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "southcentralus": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "uksouth": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "swedencentral": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "centralus": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "eastasia": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "centralindia": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "canadacentral": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "southafricanorth": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "koreacentral": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "francecentral": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "japaneast": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "germanywestcentral": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "newzealandnorth": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "italynorth": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "spaincentral": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "norwayeast": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "brazilsouth": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "switzerlandnorth": 
{ + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "mexicocentral": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "qatarcentral": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "polandcentral": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "uaenorth": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "israelcentral": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "eastus2": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "westus": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "westcentralus": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "northcentralus": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "jioindiawest": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "japanwest": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "australiacentral": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "koreasouth": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "australiasoutheast": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "westindia": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "canadaeast": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "ukwest": { + "machine_type": "Standard_B2ms", + "node_count": 1 + }, + "southindia": { + "machine_type": "Standard_B2ms", + "node_count": 1 + } + } +} \ No newline at end of file diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/main.tf b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/main.tf index d62468bf..00cbb525 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/main.tf +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/main.tf @@ -3,19 +3,9 @@ locals { timestamp = formatdate("YYMMDDHHmm", timestamp()) project_id = "${var.base_project_name}-${local.timestamp}" - # Function to sanitize label values - sanitize_label = replace( - lower( - replace( - replace(var.gcp_user_email, "@", "_at_"), - ".", "_" - ) - ), - "/[^a-z0-9_-]/", - "_" - ) - - sanitize_tag = replace(lower(var.app_tag), "/[^a-z0-9_-]/", "_") + # Simplified sanitize label function - just replace non-compliant chars with "-" + sanitize_label = replace(lower(var.gcp_user_email), "/[^a-z0-9-]/", "-") + sanitize_tag = replace(lower(var.app_tag), "/[^a-z0-9-]/", "-") # Define common tags with sanitized values common_tags = { @@ -24,6 +14,24 @@ locals { managed_by = "terraform" custom_tag = local.sanitize_tag } + + # Flatten the VM instances based on count per zone from locations variable + vm_instances = flatten([ + for zone_key, config in var.locations : [ + for i in range(lookup(config, "node_count", 1)) : { + zone_key = zone_key + index = i + zone = config.zone + machine_type = config.machine_type + } + ] + ]) + + # Convert to map with unique keys + vm_instances_map = { + for instance in local.vm_instances : + "${instance.zone_key}-${instance.index}" => instance + } } terraform { @@ -67,6 +75,12 @@ resource "google_project" "bacalhau_project" { } } +# Link billing account to project +resource "google_billing_project_info" "billing_info" { + project = google_project.bacalhau_project.project_id + billing_account = var.gcp_billing_account_id +} + # Enable required APIs in new project resource "google_project_service" "project_apis" { provider = google.bacalhau_cluster_project @@ -78,11 +92,25 @@ resource "google_project_service" 
"project_apis" { "billingbudgets.googleapis.com" ]) - project = google_project.bacalhau_project.project_id - service = each.value - + project = google_project.bacalhau_project.project_id + service = each.value disable_dependent_services = true disable_on_destroy = false + depends_on = [google_billing_project_info.billing_info] + + # Add timeouts to give more time for API enablement + timeouts { + create = "30m" + update = "40m" + } +} + +# Add explicit dependency on compute API +resource "time_sleep" "wait_for_apis" { + depends_on = [google_project_service.project_apis] + + # Wait for 2 minutes after enabling APIs + create_duration = "120s" } # Update the project_owner IAM binding to depend on APIs being enabled @@ -115,9 +143,17 @@ resource "google_project_iam_member" "member_role" { project = google_project.bacalhau_project.project_id } -data "cloudinit_config" "user_data" { +# Update random string to use the new map +resource "random_string" "vm_name" { + for_each = local.vm_instances_map + length = 8 + special = false + upper = false +} - for_each = var.locations +# Update the instance and cloud-init resources to use the new map +data "cloudinit_config" "user_data" { + for_each = local.vm_instances_map gzip = false base64_encode = false @@ -127,9 +163,9 @@ data "cloudinit_config" "user_data" { content_type = "text/cloud-config" content = templatefile("${path.root}/cloud-init/init-vm.yml", { - node_name : "${local.project_id}-${each.key}-vm", + node_name : "${replace(lower(each.value.zone), "/[^a-z0-9-]/", "-")}-${random_string.vm_name[each.key].result}-vm", username : var.username, - region : each.key, + region : each.value.zone_key, zone : each.value.zone, project_id : google_project.bacalhau_project.project_id, bacalhau_startup_service_file : filebase64("${path.root}/scripts/bacalhau-startup.service"), @@ -161,7 +197,7 @@ resource "google_compute_firewall" "allow_ssh_nats" { source_ranges = ["0.0.0.0/0"] target_tags = ["${local.project_id}-instance"] - depends_on = [google_project_service.project_apis] + depends_on = [time_sleep.wait_for_apis] # Wait for APIs to be fully enabled } resource "google_compute_instance" "gcp_instance" { @@ -169,9 +205,9 @@ resource "google_compute_instance" "gcp_instance" { project = google_project.bacalhau_project.project_id depends_on = [google_project_iam_member.member_role] - for_each = var.locations + for_each = local.vm_instances_map - name = "${local.project_id}-${each.key}-vm" + name = "${replace(lower(each.value.zone), "/[^a-z0-9-]/", "-")}-${random_string.vm_name[each.key].result}-vm" machine_type = each.value.machine_type zone = each.value.zone tags = ["${local.project_id}-instance"] @@ -202,33 +238,3 @@ resource "google_compute_instance" "gcp_instance" { labels = local.common_tags } - -data "http" "healthcheck" { - for_each = var.locations - - url = "http://${google_compute_instance.gcp_instance[each.key].network_interface[0].access_config[0].nat_ip}/healthz" - - retry { - attempts = 35 - min_delay_ms = 10000 # 10 seconds - max_delay_ms = 10000 # 10 seconds - } -} - -output "deployment_status" { - description = "Deployment status including health checks" - value = { - for k, v in google_compute_instance.gcp_instance : k => { - name = v.name - external_ip = v.network_interface[0].access_config[0].nat_ip - health_check = try(data.http.healthcheck[k].status_code == 200, false) ? 
"healthy" : "failed" - } - } -} - -# Add random string resource at the top level -resource "random_string" "suffix" { - length = 4 - special = false - upper = false -} diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/scripts/install_docker.sh b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/scripts/install_docker.sh index bc674c9b..c74e5d18 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/scripts/install_docker.sh +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/scripts/install_docker.sh @@ -1,5 +1,8 @@ #!/usr/bin/env bash +# Exit on error +set -e + # Detect OS if [ -f /etc/os-release ]; then . /etc/os-release @@ -8,23 +11,74 @@ fi echo "Detected OS: $OS" +# Function to retry commands +retry_command() { + local n=0 + local max=5 + local delay=15 + while true; do + "$@" && break || { + if [[ $n -lt $max ]]; then + ((n++)) + echo "Command failed. Attempt $n/$max. Retrying in $delay seconds..." + sleep $delay + else + echo "The command has failed after $n attempts." + return 1 + fi + } + done +} + # Install Docker based on available package manager if command -v apt-get >/dev/null 2>&1; then # Debian/Ubuntu installation - apt-get update - apt-get install -y ca-certificates curl gnupg + echo "Using apt package manager..." + + # Update package list with retry + retry_command apt-get update + + # Install prerequisites with retry + retry_command apt-get install -y \ + ca-certificates \ + curl \ + gnupg \ + pigz \ + libltdl7 \ + libslirp0 \ + slirp4netns \ + apt-transport-https \ + software-properties-common + + # Setup Docker repository install -m 0755 -d /etc/apt/keyrings - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg + rm -f /etc/apt/keyrings/docker.gpg + retry_command curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg chmod a+r /etc/apt/keyrings/docker.gpg - echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null - apt-get update - apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin + + # Add Docker repository + echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ + $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ + tee /etc/apt/sources.list.d/docker.list > /dev/null + + # Update again with retry after adding Docker repository + retry_command apt-get update + + # Install Docker packages with retry + retry_command apt-get install -y \ + docker-ce \ + docker-ce-cli \ + containerd.io \ + docker-buildx-plugin \ + docker-compose-plugin elif command -v yum >/dev/null 2>&1; then # DNF-based systems (Amazon Linux 2023, Fedora, RHEL) - yum install docker -y + echo "Using yum package manager..." 
+ retry_command yum install -y docker mkdir -p /usr/local/lib/docker/cli-plugins/ - curl -SL https://github.com/docker/compose/releases/download/v2.24.5/docker-compose-linux-x86_64 -o /usr/local/lib/docker/cli-plugins/docker-compose + retry_command curl -SL https://github.com/docker/compose/releases/download/v2.24.5/docker-compose-linux-x86_64 -o /usr/local/lib/docker/cli-plugins/docker-compose chmod +x /usr/local/lib/docker/cli-plugins/docker-compose else @@ -32,10 +86,27 @@ else exit 1 fi -# Start and enable Docker service -systemctl start docker -systemctl enable docker +# Start and enable Docker service with retry +echo "Starting Docker service..." +systemctl start docker || { + echo "Failed to start Docker service. Waiting 10 seconds and trying again..." + sleep 10 + systemctl start docker +} + +echo "Enabling Docker service..." +systemctl enable docker || { + echo "Failed to enable Docker service. Waiting 10 seconds and trying again..." + sleep 10 + systemctl enable docker +} # Verify installations -docker --version -docker compose version +echo "Verifying Docker installation..." +if command -v docker >/dev/null 2>&1; then + docker --version + docker compose version +else + echo "Docker installation verification failed" + exit 1 +fi diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/scripts/startup.sh b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/scripts/startup.sh index ca487235..5d094185 100644 --- a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/scripts/startup.sh +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/scripts/startup.sh @@ -37,7 +37,7 @@ get_cloud_metadata() { METADATA=$(curl -s -H "Metadata:true" "http://169.254.169.254/metadata/instance?api-version=2021-02-01") REGION=$(echo "$METADATA" | jq -r .compute.location) ZONE=$(echo "$METADATA" | jq -r .compute.zone) - PUBLIC_IP=$(curl -s -H "Metadata:true" "http://169.254.169.254/metadata/instance/network/interface/0/ipv4/ipAddress/0/publicIpAddress?api-version=2021-02-01&format=text") + PUBLIC_IP=$(curl -s ip.me) PRIVATE_IP=$(echo "$METADATA" | jq -r .network.interface[0].ipv4.ipAddress[0].privateIpAddress) INSTANCE_ID=$(echo "$METADATA" | jq -r .compute.vmId) INSTANCE_TYPE=$(echo "$METADATA" | jq -r .compute.vmSize) diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/utility_scripts/all_locations.json b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/utility_scripts/all_locations.json new file mode 100644 index 00000000..ac448a83 --- /dev/null +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/utility_scripts/all_locations.json @@ -0,0 +1,624 @@ +{ + "locations": { + "us_east1_b": { + "zone": "us-east1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_east1_c": { + "zone": "us-east1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_east1_d": { + "zone": "us-east1-d", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_east4_c": { + "zone": "us-east4-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_east4_b": { + "zone": "us-east4-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_east4_a": { + "zone": "us-east4-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_central1_c": { + "zone": "us-central1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_central1_a": { + "zone": "us-central1-a", + "machine_type": "e2-standard-4", + 
"node_count": 1 + }, + "us_central1_f": { + "zone": "us-central1-f", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_central1_b": { + "zone": "us-central1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_west1_b": { + "zone": "us-west1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_west1_c": { + "zone": "us-west1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_west1_a": { + "zone": "us-west1-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west4_a": { + "zone": "europe-west4-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west4_b": { + "zone": "europe-west4-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west4_c": { + "zone": "europe-west4-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west1_b": { + "zone": "europe-west1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west1_d": { + "zone": "europe-west1-d", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west1_c": { + "zone": "europe-west1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west3_c": { + "zone": "europe-west3-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west3_a": { + "zone": "europe-west3-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west3_b": { + "zone": "europe-west3-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west2_c": { + "zone": "europe-west2-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west2_b": { + "zone": "europe-west2-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west2_a": { + "zone": "europe-west2-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_east1_b": { + "zone": "asia-east1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_east1_a": { + "zone": "asia-east1-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_east1_c": { + "zone": "asia-east1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_southeast1_b": { + "zone": "asia-southeast1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_southeast1_a": { + "zone": "asia-southeast1-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_southeast1_c": { + "zone": "asia-southeast1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_northeast1_b": { + "zone": "asia-northeast1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_northeast1_c": { + "zone": "asia-northeast1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_northeast1_a": { + "zone": "asia-northeast1-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_south1_c": { + "zone": "asia-south1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_south1_b": { + "zone": "asia-south1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_south1_a": { + "zone": "asia-south1-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "australia_southeast1_b": { + "zone": "australia-southeast1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "australia_southeast1_c": { + "zone": "australia-southeast1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "australia_southeast1_a": { + "zone": "australia-southeast1-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "southamerica_east1_b": { + "zone": 
"southamerica-east1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "southamerica_east1_c": { + "zone": "southamerica-east1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "southamerica_east1_a": { + "zone": "southamerica-east1-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "africa_south1_a": { + "zone": "africa-south1-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "africa_south1_b": { + "zone": "africa-south1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "africa_south1_c": { + "zone": "africa-south1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_east2_a": { + "zone": "asia-east2-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_east2_b": { + "zone": "asia-east2-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_east2_c": { + "zone": "asia-east2-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_northeast2_a": { + "zone": "asia-northeast2-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_northeast2_b": { + "zone": "asia-northeast2-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_northeast2_c": { + "zone": "asia-northeast2-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_northeast3_a": { + "zone": "asia-northeast3-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_northeast3_b": { + "zone": "asia-northeast3-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_northeast3_c": { + "zone": "asia-northeast3-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_south2_a": { + "zone": "asia-south2-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_south2_b": { + "zone": "asia-south2-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_south2_c": { + "zone": "asia-south2-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_southeast2_a": { + "zone": "asia-southeast2-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_southeast2_b": { + "zone": "asia-southeast2-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "asia_southeast2_c": { + "zone": "asia-southeast2-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "australia_southeast2_a": { + "zone": "australia-southeast2-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "australia_southeast2_b": { + "zone": "australia-southeast2-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "australia_southeast2_c": { + "zone": "australia-southeast2-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_central2_a": { + "zone": "europe-central2-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_central2_b": { + "zone": "europe-central2-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_central2_c": { + "zone": "europe-central2-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_north1_a": { + "zone": "europe-north1-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_north1_b": { + "zone": "europe-north1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_north1_c": { + "zone": "europe-north1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_southwest1_a": { + "zone": "europe-southwest1-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_southwest1_b": { + "zone": "europe-southwest1-b", + "machine_type": 
"e2-standard-4", + "node_count": 1 + }, + "europe_southwest1_c": { + "zone": "europe-southwest1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west10_a": { + "zone": "europe-west10-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west10_b": { + "zone": "europe-west10-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west10_c": { + "zone": "europe-west10-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west12_a": { + "zone": "europe-west12-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west12_b": { + "zone": "europe-west12-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west12_c": { + "zone": "europe-west12-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west6_a": { + "zone": "europe-west6-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west6_b": { + "zone": "europe-west6-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west6_c": { + "zone": "europe-west6-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west8_a": { + "zone": "europe-west8-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west8_b": { + "zone": "europe-west8-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west8_c": { + "zone": "europe-west8-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west9_a": { + "zone": "europe-west9-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west9_b": { + "zone": "europe-west9-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "europe_west9_c": { + "zone": "europe-west9-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "me_central1_a": { + "zone": "me-central1-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "me_central1_b": { + "zone": "me-central1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "me_central1_c": { + "zone": "me-central1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "me_central2_a": { + "zone": "me-central2-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "me_central2_b": { + "zone": "me-central2-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "me_central2_c": { + "zone": "me-central2-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "me_west1_a": { + "zone": "me-west1-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "me_west1_b": { + "zone": "me-west1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "me_west1_c": { + "zone": "me-west1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "northamerica_northeast1_a": { + "zone": "northamerica-northeast1-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "northamerica_northeast1_b": { + "zone": "northamerica-northeast1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "northamerica_northeast1_c": { + "zone": "northamerica-northeast1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "northamerica_northeast2_a": { + "zone": "northamerica-northeast2-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "northamerica_northeast2_b": { + "zone": "northamerica-northeast2-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "northamerica_northeast2_c": { + "zone": "northamerica-northeast2-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "northamerica_south1_a": { + "zone": 
"northamerica-south1-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "northamerica_south1_b": { + "zone": "northamerica-south1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "northamerica_south1_c": { + "zone": "northamerica-south1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "southamerica_west1_a": { + "zone": "southamerica-west1-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "southamerica_west1_b": { + "zone": "southamerica-west1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "southamerica_west1_c": { + "zone": "southamerica-west1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_east5_a": { + "zone": "us-east5-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_east5_b": { + "zone": "us-east5-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_east5_c": { + "zone": "us-east5-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_south1_a": { + "zone": "us-south1-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_south1_b": { + "zone": "us-south1-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_south1_c": { + "zone": "us-south1-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_west2_a": { + "zone": "us-west2-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_west2_b": { + "zone": "us-west2-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_west2_c": { + "zone": "us-west2-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_west3_a": { + "zone": "us-west3-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_west3_b": { + "zone": "us-west3-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_west3_c": { + "zone": "us-west3-c", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_west4_a": { + "zone": "us-west4-a", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_west4_b": { + "zone": "us-west4-b", + "machine_type": "e2-standard-4", + "node_count": 1 + }, + "us_west4_c": { + "zone": "us-west4-c", + "machine_type": "e2-standard-4", + "node_count": 1 + } + } +} \ No newline at end of file diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/utility_scripts/all_locations.yaml b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/utility_scripts/all_locations.yaml new file mode 100644 index 00000000..d9e14856 --- /dev/null +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/utility_scripts/all_locations.yaml @@ -0,0 +1,497 @@ +locations: + africa_south1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: africa-south1-a + africa_south1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: africa-south1-b + africa_south1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: africa-south1-c + asia_east1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-east1-a + asia_east1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-east1-b + asia_east1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-east1-c + asia_east2_a: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-east2-a + asia_east2_b: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-east2-b + asia_east2_c: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-east2-c + asia_northeast1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-northeast1-a + asia_northeast1_b: + machine_type: 
e2-standard-4 + node_count: 1 + zone: asia-northeast1-b + asia_northeast1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-northeast1-c + asia_northeast2_a: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-northeast2-a + asia_northeast2_b: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-northeast2-b + asia_northeast2_c: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-northeast2-c + asia_northeast3_a: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-northeast3-a + asia_northeast3_b: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-northeast3-b + asia_northeast3_c: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-northeast3-c + asia_south1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-south1-a + asia_south1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-south1-b + asia_south1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-south1-c + asia_south2_a: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-south2-a + asia_south2_b: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-south2-b + asia_south2_c: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-south2-c + asia_southeast1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-southeast1-a + asia_southeast1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-southeast1-b + asia_southeast1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-southeast1-c + asia_southeast2_a: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-southeast2-a + asia_southeast2_b: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-southeast2-b + asia_southeast2_c: + machine_type: e2-standard-4 + node_count: 1 + zone: asia-southeast2-c + australia_southeast1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: australia-southeast1-a + australia_southeast1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: australia-southeast1-b + australia_southeast1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: australia-southeast1-c + australia_southeast2_a: + machine_type: e2-standard-4 + node_count: 1 + zone: australia-southeast2-a + australia_southeast2_b: + machine_type: e2-standard-4 + node_count: 1 + zone: australia-southeast2-b + australia_southeast2_c: + machine_type: e2-standard-4 + node_count: 1 + zone: australia-southeast2-c + europe_central2_a: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-central2-a + europe_central2_b: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-central2-b + europe_central2_c: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-central2-c + europe_north1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-north1-a + europe_north1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-north1-b + europe_north1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-north1-c + europe_southwest1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-southwest1-a + europe_southwest1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-southwest1-b + europe_southwest1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-southwest1-c + europe_west10_a: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west10-a + europe_west10_b: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west10-b + europe_west10_c: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west10-c + europe_west12_a: + machine_type: e2-standard-4 + 
node_count: 1 + zone: europe-west12-a + europe_west12_b: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west12-b + europe_west12_c: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west12-c + europe_west1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west1-b + europe_west1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west1-c + europe_west1_d: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west1-d + europe_west2_a: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west2-a + europe_west2_b: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west2-b + europe_west2_c: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west2-c + europe_west3_a: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west3-a + europe_west3_b: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west3-b + europe_west3_c: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west3-c + europe_west4_a: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west4-a + europe_west4_b: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west4-b + europe_west4_c: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west4-c + europe_west6_a: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west6-a + europe_west6_b: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west6-b + europe_west6_c: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west6-c + europe_west8_a: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west8-a + europe_west8_b: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west8-b + europe_west8_c: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west8-c + europe_west9_a: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west9-a + europe_west9_b: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west9-b + europe_west9_c: + machine_type: e2-standard-4 + node_count: 1 + zone: europe-west9-c + me_central1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: me-central1-a + me_central1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: me-central1-b + me_central1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: me-central1-c + me_central2_a: + machine_type: e2-standard-4 + node_count: 1 + zone: me-central2-a + me_central2_b: + machine_type: e2-standard-4 + node_count: 1 + zone: me-central2-b + me_central2_c: + machine_type: e2-standard-4 + node_count: 1 + zone: me-central2-c + me_west1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: me-west1-a + me_west1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: me-west1-b + me_west1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: me-west1-c + northamerica_northeast1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: northamerica-northeast1-a + northamerica_northeast1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: northamerica-northeast1-b + northamerica_northeast1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: northamerica-northeast1-c + northamerica_northeast2_a: + machine_type: e2-standard-4 + node_count: 1 + zone: northamerica-northeast2-a + northamerica_northeast2_b: + machine_type: e2-standard-4 + node_count: 1 + zone: northamerica-northeast2-b + northamerica_northeast2_c: + machine_type: e2-standard-4 + node_count: 1 + zone: northamerica-northeast2-c + northamerica_south1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: northamerica-south1-a + 
northamerica_south1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: northamerica-south1-b + northamerica_south1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: northamerica-south1-c + southamerica_east1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: southamerica-east1-a + southamerica_east1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: southamerica-east1-b + southamerica_east1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: southamerica-east1-c + southamerica_west1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: southamerica-west1-a + southamerica_west1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: southamerica-west1-b + southamerica_west1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: southamerica-west1-c + us_central1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: us-central1-a + us_central1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: us-central1-b + us_central1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: us-central1-c + us_central1_f: + machine_type: e2-standard-4 + node_count: 1 + zone: us-central1-f + us_east1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: us-east1-b + us_east1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: us-east1-c + us_east1_d: + machine_type: e2-standard-4 + node_count: 1 + zone: us-east1-d + us_east4_a: + machine_type: e2-standard-4 + node_count: 1 + zone: us-east4-a + us_east4_b: + machine_type: e2-standard-4 + node_count: 1 + zone: us-east4-b + us_east4_c: + machine_type: e2-standard-4 + node_count: 1 + zone: us-east4-c + us_east5_a: + machine_type: e2-standard-4 + node_count: 1 + zone: us-east5-a + us_east5_b: + machine_type: e2-standard-4 + node_count: 1 + zone: us-east5-b + us_east5_c: + machine_type: e2-standard-4 + node_count: 1 + zone: us-east5-c + us_south1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: us-south1-a + us_south1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: us-south1-b + us_south1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: us-south1-c + us_west1_a: + machine_type: e2-standard-4 + node_count: 1 + zone: us-west1-a + us_west1_b: + machine_type: e2-standard-4 + node_count: 1 + zone: us-west1-b + us_west1_c: + machine_type: e2-standard-4 + node_count: 1 + zone: us-west1-c + us_west2_a: + machine_type: e2-standard-4 + node_count: 1 + zone: us-west2-a + us_west2_b: + machine_type: e2-standard-4 + node_count: 1 + zone: us-west2-b + us_west2_c: + machine_type: e2-standard-4 + node_count: 1 + zone: us-west2-c + us_west3_a: + machine_type: e2-standard-4 + node_count: 1 + zone: us-west3-a + us_west3_b: + machine_type: e2-standard-4 + node_count: 1 + zone: us-west3-b + us_west3_c: + machine_type: e2-standard-4 + node_count: 1 + zone: us-west3-c + us_west4_a: + machine_type: e2-standard-4 + node_count: 1 + zone: us-west4-a + us_west4_b: + machine_type: e2-standard-4 + node_count: 1 + zone: us-west4-b + us_west4_c: + machine_type: e2-standard-4 + node_count: 1 + zone: us-west4-c diff --git a/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/utility_scripts/generate_all_locations_file.py b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/utility_scripts/generate_all_locations_file.py new file mode 100755 index 00000000..dc24ace8 --- /dev/null +++ b/setting-up-bacalhau-cluster/setting-up-bacalhau-with-terraform-on-GCP/utility_scripts/generate_all_locations_file.py @@ -0,0 +1,185 @@ +#!/usr/bin/env uv run -s +# /// script +# requires-python = 
">=3.10" +# dependencies = [ +# "google-cloud-compute", +# "pyyaml", +# "google-auth", +# ] +# /// + +import json +import os +import subprocess +import sys +from typing import Optional + +import yaml +from google.api_core import exceptions +from google.auth import default +from google.auth.exceptions import DefaultCredentialsError +from google.cloud import compute_v1 + + +def ensure_gcp_auth() -> Optional[str]: + """Ensure GCP authentication and return project ID.""" + try: + # Try to get credentials and project ID + credentials, project_id = default() + return project_id + except DefaultCredentialsError: + print( + "GCP credentials not found. Please authenticate using one of these methods:" + ) + print("1. Run: gcloud auth application-default login") + print("2. Set GOOGLE_APPLICATION_CREDENTIALS environment variable") + sys.exit(1) + + +def get_project_id() -> str: + """Get the GCP project ID.""" + # First try environment variable + project_id = os.getenv("GOOGLE_CLOUD_PROJECT") + if project_id: + return project_id + + # Then try gcloud config + try: + result = subprocess.run( + ["gcloud", "config", "get-value", "project"], + capture_output=True, + text=True, + check=True, + ) + project_id = result.stdout.strip() + if project_id and project_id != "(unset)": + return project_id + except (subprocess.CalledProcessError, FileNotFoundError): + pass + + # Finally, try to get it from application default credentials + project_id = ensure_gcp_auth() + if project_id: + return project_id + + print("Error: Could not determine GCP project ID. Please either:") + print("1. Set GOOGLE_CLOUD_PROJECT environment variable") + print("2. Run: gcloud config set project YOUR_PROJECT_ID") + print("3. Use application default credentials with a project") + sys.exit(1) + + +def check_zone_access( + client: compute_v1.ZonesClient, project_id: str, zone_name: str +) -> bool: + """Check if we have access to a specific zone.""" + try: + request = compute_v1.GetZoneRequest(project=project_id, zone=zone_name) + zone = client.get(request=request) + # Check if the zone is actually available for use + return zone.status == "UP" and "DEPRECATED" not in zone.deprecated + except exceptions.PermissionDenied: + print(f"⚠️ No permission to access zone {zone_name}") + return False + except exceptions.Forbidden: + print(f"⚠️ Access forbidden to zone {zone_name}") + return False + except Exception as e: + print(f"⚠️ Error checking zone {zone_name}: {str(e)}") + return False + + +def get_all_zones(project_id: str): + """Query GCP to get all available zones.""" + client = compute_v1.ZonesClient() + + try: + zones = [] + request = compute_v1.ListZonesRequest(project=project_id) + + print("Fetching available zones...") + for zone in client.list(request=request): + # Only include UP zones that aren't deprecated + if zone.status == "UP" and not zone.deprecated: + region = zone.name.rsplit("-", 1)[0] + zones.append( + { + "region": region, + "zone": zone.name, + } + ) + print(f"✓ Found zone: {zone.name}") + + if not zones: + print("\nNo available zones found. This could mean:") + print("1. You don't have the required permissions") + print("2. The Compute Engine API isn't enabled") + print("3. Your project isn't properly set up for Compute Engine") + print("\nTry running: gcloud services enable compute.googleapis.com") + sys.exit(1) + + return zones + except Exception as e: + print(f"Error fetching zones: {str(e)}") + print( + "Please ensure you have the necessary permissions and the Compute Engine API is enabled." 
+ ) + print( + "You can enable it by running: gcloud services enable compute.googleapis.com" + ) + sys.exit(1) + + +def generate_locations_config(zones): + """Generate the locations configuration.""" + locations = {} + + for zone_info in zones: + zone = zone_info["zone"] + + # Use zone as the key instead of region + zone_key = zone.replace( + "-", "_" + ) # Replace hyphens with underscores for valid keys + locations[zone_key] = { + "zone": zone, + "machine_type": "e2-standard-4", + "node_count": 1, + } + + return locations + + +def main(): + # Ensure authentication and get project ID + project_id = get_project_id() + print(f"Using GCP project: {project_id}") + + # Get all zones + print("Fetching zones from GCP...") + zones = get_all_zones(project_id) + + if not zones: + print( + "No zones found. Please check your permissions and project configuration." + ) + sys.exit(1) + + # Generate the locations configuration + locations = generate_locations_config(zones) + + # Save as YAML + with open("all_locations.yaml", "w") as yaml_file: + yaml.dump({"locations": locations}, yaml_file, default_flow_style=False) + + # Save as JSON (for env.json format) + with open("all_locations.json", "w") as json_file: + json.dump({"locations": locations}, json_file, indent=2) + + print(f"\nGenerated configurations with {len(locations)} zones:") + for zone_key in sorted(locations.keys()): + print(f" - {locations[zone_key]['zone']}") + + +if __name__ == "__main__": + main() diff --git a/systems-engineering/duckdb-log-processing/terraform/clean_up_nodes.py b/systems-engineering/duckdb-log-processing/terraform/clean_up_nodes.py deleted file mode 100644 index 1f3d87bd..00000000 --- a/systems-engineering/duckdb-log-processing/terraform/clean_up_nodes.py +++ /dev/null @@ -1,132 +0,0 @@ -import argparse -import json -import os -import subprocess -import sys - -import yaml - - -def load_config(config_path): - """Load configuration from YAML file.""" - try: - with open(config_path, 'r') as f: - config = yaml.safe_load(f) - - # Extract required values - compute = config.get('Compute', {}) - if not compute: - raise ValueError("No 'Compute' section in config") - - orchestrators = compute.get('Orchestrators', []) - if not orchestrators: - raise ValueError("No 'Orchestrators' specified in config") - - orchestrator = orchestrators[0] # Use first orchestrator - token = compute.get('Auth', {}).get('Token') - if not token: - raise ValueError("No 'Auth.Token' specified in config") - - # Extract hostname without nats:// prefix and port - if orchestrator.startswith('nats://'): - orchestrator = orchestrator[7:] - orchestrator = orchestrator.split(':')[0] # Remove port number - - return { - 'api_host': orchestrator, - 'token': token - } - except Exception as e: - print(f"Error loading config file: {e}") - sys.exit(1) - -def get_nodes(api_host, token): - """Get list of all Bacalhau nodes.""" - try: - cmd = [ - "bacalhau", - "node", - "list", - "--output", "json", - "-c", f"API.Host={api_host}" - ] - - result = subprocess.run( - cmd, - capture_output=True, - text=True, - check=True, - ) - return json.loads(result.stdout) - except subprocess.CalledProcessError as e: - print(f"Error running bacalhau node list: {e}") - print(f"stdout: {e.stdout}") - print(f"stderr: {e.stderr}") - sys.exit(1) - except json.JSONDecodeError as e: - print(f"Error parsing JSON output: {e}") - sys.exit(1) - -def delete_node(node_id, api_host, token): - """Delete a specific node by ID.""" - try: - cmd = [ - "bacalhau", - "node", - "delete", - node_id, - "-c", 
f"API.Host={api_host}" - ] - - subprocess.run(cmd, check=True) - print(f"Successfully deleted node: {node_id}") - return True - except subprocess.CalledProcessError as e: - print(f"Failed to delete node {node_id}. Error: {e}") - return False - -def main(): - parser = argparse.ArgumentParser(description="Delete disconnected Bacalhau nodes") - parser.add_argument('config', help='Path to config file') - parser.add_argument('--dry-run', action='store_true', help='Show what would be deleted without actually deleting') - args = parser.parse_args() - - # Load configuration - config = load_config(args.config) - - print(f"\nConnecting to API host: {config['api_host']}") - - # Get all nodes - nodes = get_nodes(config['api_host'], config['token']) - - # Filter disconnected compute nodes - disconnected_nodes = [ - node for node in nodes - if ( - node["Connection"] == "DISCONNECTED" and - node["Info"]["NodeType"] == "Compute" - ) - ] - - if not disconnected_nodes: - print("No disconnected nodes found.") - return - - print(f"\nFound {len(disconnected_nodes)} disconnected node(s):") - for node in disconnected_nodes: - print(f" - {node['Info']['NodeID']}") - - if args.dry_run: - print("\nDry run - no nodes were deleted") - return - - print("\nDeleting nodes...") - deleted_count = 0 - for node in disconnected_nodes: - if delete_node(node['Info']['NodeID'], config['api_host'], config['token']): - deleted_count += 1 - - print(f"\nDeleted {deleted_count} of {len(disconnected_nodes)} disconnected nodes") - -if __name__ == "__main__": - main() \ No newline at end of file