Skip to content

Commit

Permalink
feat: add otel agent to collect bacalhau telemetry
Browse files Browse the repository at this point in the history
- agent currently logs telemetry. later it can be configured to publish to
  an endpoint: either one we host, or something like google stackdriver
  • Loading branch information
frrist authored and frrist committed Jan 24, 2024
1 parent 87db1db commit 28b0c1d
Show file tree
Hide file tree
Showing 6 changed files with 152 additions and 13 deletions.
20 changes: 19 additions & 1 deletion ops/tf/modules/cloud-init/cloud-init.yml
Original file line number Diff line number Diff line change
@@ -1,16 +1,34 @@
#cloud-config

# Files written to the instance at first boot. Every `content` value is a
# template variable that the calling Terraform module fills with the
# base64-encoded file body, hence `encoding: b64` on each entry.
write_files:
  # bacalhau config
  - path: /etc/config.yaml
    encoding: b64
    owner: root:root
    permissions: "0600"
    content: |
      ${bacalhau_config_file}
  # bacalhau service file
  - path: /etc/systemd/system/bacalhau.service
    encoding: b64
    owner: root:root
    permissions: "0600"
    content: |
      ${bacalhau_service_file}
  # otel config file (passed to the collector via --config in otel.service)
  - path: /etc/otel-collector.yaml
    encoding: b64
    owner: root:root
    permissions: "0600"
    content: |
      ${otel_config_file}
  # otel service file
  - path: /etc/systemd/system/otel.service
    encoding: b64
    owner: root:root
    permissions: "0600"
    content: |
      ${otel_service_file}
30 changes: 24 additions & 6 deletions ops/tf/modules/gcp/compute_instances/compute/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -67,12 +67,14 @@ locals {

// service env vars
bacalhau_env_vars = {
LOG_LEVEL = "debug"
BACALHAU_NODE_LOGGINGMODE = "default"
BACALHAU_DIR = "/data"
BACALHAU_ENVIRONMENT = "local"
AWS_ACCESS_KEY_ID = var.aws_access_key_id
AWS_SECRET_ACCESS_KEY = var.aws_secret_access_key
LOG_LEVEL = "debug"
BACALHAU_NODE_LOGGINGMODE = "default"
BACALHAU_DIR = "/data"
BACALHAU_ENVIRONMENT = "local"
// TODO make this a variable
OTEL_EXPORTER_OTLP_ENDPOINT = "http://localhost:4318"
AWS_ACCESS_KEY_ID = var.aws_access_key_id
AWS_SECRET_ACCESS_KEY = var.aws_secret_access_key
# Add more variables here as needed
}
# Convert the map to the required string format for the systemd service file
Expand Down Expand Up @@ -101,6 +103,20 @@ locals {
node_type = "compute"
// Add more arguments as needed
})

//
// templating otel config file
//
otel_config_content = templatefile("${path.module}/../../../instance_files/otel-collector.yaml", {
// add more arguments as needed
})

//
// templating otel service file
//
otel_service_content = templatefile("${path.module}/../../../instance_files/otel.service", {
// add more arguments as needed
})
}


Expand All @@ -116,6 +132,8 @@ data "cloudinit_config" "compute_cloud_init" {
content = templatefile("${path.module}/../../../cloud-init/cloud-init.yml", {
bacalhau_config_file : base64encode(local.compute_config_content),
bacalhau_service_file : base64encode(local.bacalhau_service_content),
otel_config_file : base64encode(local.otel_config_content)
otel_service_file : base64encode(local.otel_service_content),
requester_ip : var.requester_ip,
})
}
Expand Down
30 changes: 24 additions & 6 deletions ops/tf/modules/gcp/compute_instances/requester/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,14 @@ locals {

// service env vars
bacalhau_env_vars = {
LOG_LEVEL = "debug"
BACALHAU_NODE_LOGGINGMODE = "default"
BACALHAU_DIR = "/data"
BACALHAU_ENVIRONMENT = "local"
AWS_ACCESS_KEY_ID = var.aws_access_key_id
AWS_SECRET_ACCESS_KEY = var.aws_secret_access_key
LOG_LEVEL = "debug"
BACALHAU_NODE_LOGGINGMODE = "default"
BACALHAU_DIR = "/data"
BACALHAU_ENVIRONMENT = "local"
// TODO make this a variable
OTEL_EXPORTER_OTLP_ENDPOINT = "http://localhost:4318"
AWS_ACCESS_KEY_ID = var.aws_access_key_id
AWS_SECRET_ACCESS_KEY = var.aws_secret_access_key
# Add more variables here as needed
}
# Convert the map to the required string format for the systemd service file
Expand Down Expand Up @@ -80,6 +82,20 @@ locals {
node_type = "requester"
// Add more arguments as needed
})

//
// templating otel config file
//
otel_config_content = templatefile("${path.module}/../../../instance_files/otel-collector.yaml", {
// add more arguments as needed
})

//
// templating otel service file
//
otel_service_content = templatefile("${path.module}/../../../instance_files/otel.service", {
// add more arguments as needed
})
}


Expand All @@ -95,6 +111,8 @@ data "cloudinit_config" "requester_cloud_init" {
content = templatefile("${path.module}/../../../cloud-init/cloud-init.yml", {
bacalhau_config_file : base64encode(local.requester_config_content),
bacalhau_service_file : base64encode(local.bacalhau_service_content)
otel_config_file : base64encode(local.otel_config_content)
otel_service_file : base64encode(local.otel_service_content),
})
}
}
61 changes: 61 additions & 0 deletions ops/tf/modules/instance_files/otel-collector.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# OpenTelemetry collector configuration: gathers host, bacalhau, and
# collector self-telemetry and writes it to the collector's own log output.
receivers:
  # gather telemetry from host machine
  hostmetrics:
    scrapers:
      cpu:
      disk:
      load:
      filesystem:
      memory:
      network:
      paging:
  # gather telemetry from bacalhau process (from otel sdk)
  otlp:
    protocols:
      http:
  # gather telemetry from otel collector process
  prometheus:
    config:
      scrape_configs:
        - job_name: 'otel-collector'
          scrape_interval: 5s
          static_configs:
            - targets: [ '0.0.0.0:8888' ]

# modify the collection of telemetry data before exporting
processors:
  batch:
  memory_limiter:
    check_interval: 5s
    limit_mib: 4000
    spike_limit_mib: 500
  resourcedetection/gcp:
    detectors: [ env, gcp ]
    timeout: 2s
    override: false
  # tag everything leaving this collector with the bacalhau namespace
  resource:
    attributes:
      - key: service.namespace
        value: bacalhau
        action: insert
  # TODO why is this here?
  attributes/metrics:
    actions:
      - pattern: net\.sock.+
        action: delete

# export telemetry gathered to stderr of otel process
exporters:
  logging:
    # NOTE(review): newer collector releases appear to replace `loglevel`
    # with `verbosity` - confirm before upgrading the collector version.
    loglevel: debug

service:
  pipelines:
    traces:
      receivers: [otlp]
      processors: [memory_limiter, resourcedetection/gcp, resource, attributes/metrics, batch]
      exporters: [logging]
    metrics:
      # prometheus is listed here so the collector's self-metrics receiver
      # defined above is actually started; a receiver that is not referenced
      # by any pipeline is never run.
      receivers: [otlp, hostmetrics, prometheus]
      processors: [memory_limiter, resourcedetection/gcp, resource, attributes/metrics, batch]
      exporters: [logging]
14 changes: 14 additions & 0 deletions ops/tf/modules/instance_files/otel.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# systemd unit that runs the otel collector binary against
# /etc/otel-collector.yaml.
[Unit]
Description=otel collector
Documentation=https://opentelemetry.io/docs/collector/
# order this unit after network-online.target and pull that target in
After=network-online.target
Wants=network-online.target

[Service]
Type=simple
User=root
Group=root
ExecStart=/usr/local/bin/otelcol --config=/etc/otel-collector.yaml

[Install]
# enabled as part of the normal multi-user boot
WantedBy=multi-user.target
10 changes: 10 additions & 0 deletions ops/tf/modules/instance_files/start.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,19 +43,27 @@ function setup-bacalhau-config() {
sudo mv /etc/config.yaml /data/config.yaml
}

# Download the otelcol-contrib release tarball and install its binary as
# /usr/local/bin/otelcol (the path otel.service executes).
#
# The release asset is chosen from the host CPU architecture instead of
# always using the 32-bit linux_386 build. Work happens in a temporary
# directory that is removed afterwards.
#
# Returns non-zero if the architecture is unsupported or any step fails.
function install-otel-collector() {
  local version="0.92.0"
  local arch
  case "$(uname -m)" in
    x86_64 | amd64) arch="amd64" ;;
    aarch64 | arm64) arch="arm64" ;;
    i386 | i686) arch="386" ;;
    *)
      echo "install-otel-collector: unsupported architecture $(uname -m)" >&2
      return 1
      ;;
  esac

  local tarball="otelcol-contrib_${version}_linux_${arch}.tar.gz"
  local url="https://github.com/open-telemetry/opentelemetry-collector-releases/releases/download/v${version}/${tarball}"

  local workdir
  workdir="$(mktemp -d)" || return 1

  # Stop at the first failing step so a partial download is never installed.
  local status=0
  wget -O "${workdir}/${tarball}" "${url}" \
    && tar -xzf "${workdir}/${tarball}" -C "${workdir}" \
    && sudo install -m 0755 "${workdir}/otelcol-contrib" /usr/local/bin/otelcol \
    || status=$?

  rm -rf "${workdir}"
  return "${status}"
}

# Reload systemd unit definitions, then enable docker, the otel collector,
# and bacalhau, in that order.
function setup-services() {
  local unit

  echo "Loading systemctl services..."
  sudo systemctl daemon-reload

  echo "Enabling systemctl services..."
  for unit in docker otel.service bacalhau.service; do
    sudo systemctl enable "${unit}"
  done
}

# (Re)start docker, the otel collector, and bacalhau, in that order.
function start-services() {
  local unit

  echo "Starting systemctl services..."
  for unit in docker otel.service bacalhau.service; do
    sudo systemctl restart "${unit}"
  done
}

Expand All @@ -68,6 +76,8 @@ function start() {
setup-bacalhau-local-disk
fi

# TODO move this into the VMI, maybe?
install-otel-collector
setup-bacalhau-config
setup-services
start-services
Expand Down

0 comments on commit 28b0c1d

Please sign in to comment.