From 849dab55b6fd2e7b328dfa79cfb3cf20e59cde9c Mon Sep 17 00:00:00 2001 From: Forrest <6546409+frrist@users.noreply.github.com> Date: Thu, 25 Jan 2024 01:19:16 -0800 Subject: [PATCH] feat: implement local Telemetry suite (#3302) - closes #3301 --------- Co-authored-by: frrist --- ops/metrics/README.md | 28 +++ ops/metrics/docker-compose.yaml | 52 +++++ .../provisioning/dashboards/dashboard.json | 187 ++++++++++++++++++ .../provisioning/dashboards/dashboards.yml | 9 + .../provisioning/datasources/datasources.yml | 7 + ops/metrics/otel-collector-config,yaml | 0 ops/metrics/otel-collector-config.yaml | 53 +++++ ops/metrics/prometheus/prometheus.yml | 5 + 8 files changed, 341 insertions(+) create mode 100644 ops/metrics/README.md create mode 100644 ops/metrics/docker-compose.yaml create mode 100644 ops/metrics/grafana/provisioning/dashboards/dashboard.json create mode 100644 ops/metrics/grafana/provisioning/dashboards/dashboards.yml create mode 100644 ops/metrics/grafana/provisioning/datasources/datasources.yml create mode 100644 ops/metrics/otel-collector-config,yaml create mode 100644 ops/metrics/otel-collector-config.yaml create mode 100644 ops/metrics/prometheus/prometheus.yml diff --git a/ops/metrics/README.md b/ops/metrics/README.md new file mode 100644 index 0000000000..d3fe4908f4 --- /dev/null +++ b/ops/metrics/README.md @@ -0,0 +1,28 @@ +# Usage +**Start containers:** +```shell +docker-compose up +``` +**Export collection endpoint for bacalhau** +```shell +export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 + ``` +**Start Bacalhau** +```shell +bacalhau serve --node-type=compute,requester +``` +**Open Browser** +- Grafana: http://localhost:3000 + - Username: `admin` + - Password: `admin` +- Jaeger: http://localhost:16686 + +**Clean up** +- Remove volumes associated with containers to reset state. 
+ +**Saving Changes to a Grafana Dashboard** +- export dashboard data from grafana as json +- save it to file ./grafana/provisioning/dashboards/dashboard.json + +# Best Practices for Telemetry Collections +[OpenTelemetry In Bacalhau](../../docs/docs/dev/open_telemetry_in_bacalhau.md) \ No newline at end of file diff --git a/ops/metrics/docker-compose.yaml b/ops/metrics/docker-compose.yaml new file mode 100644 index 0000000000..56da2cd9fd --- /dev/null +++ b/ops/metrics/docker-compose.yaml @@ -0,0 +1,52 @@ +version: '3.5' + +services: + prometheus: + image: prom/prometheus:latest + volumes: + - ./prometheus/:/etc/prometheus/ + - prometheus-storage:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--web.console.libraries=/usr/share/prometheus/console_libraries' + - '--web.console.templates=/usr/share/prometheus/consoles' + ports: + - 9090:9090 + restart: always + + grafana: + image: grafana/grafana + depends_on: + - prometheus + volumes: + - ./grafana/provisioning/datasources:/etc/grafana/provisioning/datasources # Datasource provisioning + - ./grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards # Dashboard provisioning + + ports: + - 3000:3000 + restart: always + + opentelemetry-collector: + image: otel/opentelemetry-collector:latest + command: [ "--config=/etc/otel-collector-config.yaml" ] # Command to use the custom config + volumes: + - ./otel-collector-config.yaml:/etc/otel-collector-config.yaml + ports: + - 127.0.0.1:4318:4318 # HTTP + - 55681:55681 # OpenTelemetry protocol + depends_on: + - prometheus + + jaeger: + container_name: jaeger + image: jaegertracing/all-in-one:latest + ports: + - "6831:6831/udp" + - "5778:5778" + - "4316:4316" + - "16686:16686" + - "14268:14268" + +volumes: + prometheus-storage: {} diff --git a/ops/metrics/grafana/provisioning/dashboards/dashboard.json b/ops/metrics/grafana/provisioning/dashboards/dashboard.json new file mode 100644 index 
0000000000..52aa74cf85 --- /dev/null +++ b/ops/metrics/grafana/provisioning/dashboards/dashboard.json @@ -0,0 +1,187 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "bacalhau_jobs_received_total", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Jobs Received", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + 
"unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "bacalhau_jobs_completed_total", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Jobs Completed", + "type": "stat" + } + ], + "refresh": "", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Bacalhau Metrics", + "uid": "cbe6c668-d74b-4a27-be8b-431c19b2d4ca", + "version": 1, + "weekStart": "" +} \ No newline at end of file diff --git a/ops/metrics/grafana/provisioning/dashboards/dashboards.yml b/ops/metrics/grafana/provisioning/dashboards/dashboards.yml new file mode 100644 index 0000000000..7205625cf9 --- /dev/null +++ b/ops/metrics/grafana/provisioning/dashboards/dashboards.yml @@ -0,0 +1,9 @@ +apiVersion: 1 + +providers: + - name: 'default' # A unique name for this provider + orgId: 1 # Optional: specify organization ID, 1 is default + folder: '' # The folder to save dashboards in Grafana + type: file + options: + path: /etc/grafana/provisioning/dashboards # Path within the container diff --git a/ops/metrics/grafana/provisioning/datasources/datasources.yml b/ops/metrics/grafana/provisioning/datasources/datasources.yml new file mode 100644 index 0000000000..5b8876eb5d --- /dev/null +++ 
b/ops/metrics/grafana/provisioning/datasources/datasources.yml @@ -0,0 +1,7 @@ +apiVersion: 1 + +datasources: + - name: Prometheus OTEL + type: prometheus + access: proxy + url: http://prometheus:9090 diff --git a/ops/metrics/otel-collector-config,yaml b/ops/metrics/otel-collector-config,yaml new file mode 100644 index 0000000000..e69de29bb2 diff --git a/ops/metrics/otel-collector-config.yaml b/ops/metrics/otel-collector-config.yaml new file mode 100644 index 0000000000..58417b9fe1 --- /dev/null +++ b/ops/metrics/otel-collector-config.yaml @@ -0,0 +1,53 @@ +# receive telemetry data from bacalhau otel sdk. +receivers: + otlp: + protocols: + http: + endpoint: "0.0.0.0:4318" + +# batch process data and label it with 'otel' as the service collector +processors: + batch: + memory_limiter: + check_interval: 5s + limit_mib: 4000 + spike_limit_mib: 500 + resource: + attributes: + - key: service.collector + value: otel + action: insert + attributes/metrics: + actions: + - pattern: net\.sock.+ + action: delete + + +exporters: + # metrics are exported to prometheus + prometheus: + endpoint: "0.0.0.0:9095" + namespace: "bacalhau" + # uncomment for debugging, will print all metrics to stdout + #logging: + #loglevel: debug + # traces go to jaeger instance + otlp/jaeger: + endpoint: "jaeger:4317" + tls: + insecure: true + insecure_skip_verify: true + +service: + pipelines: + metrics: + receivers: [otlp] + processors: [memory_limiter, resource, attributes/metrics, batch] + exporters: [prometheus] + #exporters: [prometheus, logging] + traces: + receivers: [otlp] + processors: [memory_limiter, resource, attributes/metrics, batch] + exporters: [otlp/jaeger] + #exporters: [logging, otlp/jaeger] + diff --git a/ops/metrics/prometheus/prometheus.yml b/ops/metrics/prometheus/prometheus.yml new file mode 100644 index 0000000000..7e33e0d582 --- /dev/null +++ b/ops/metrics/prometheus/prometheus.yml @@ -0,0 +1,5 @@ +scrape_configs: + - job_name: 'otel-collector' + scrape_interval: 5s + 
static_configs: + - targets: ['opentelemetry-collector:9095']