diff --git a/ops/metrics/grafana/provisioning/dashboards/dashboard.json b/ops/metrics/grafana/provisioning/dashboards/dashboard.json index 52aa74cf85..e6b25d8cb9 100644 --- a/ops/metrics/grafana/provisioning/dashboards/dashboard.json +++ b/ops/metrics/grafana/provisioning/dashboards/dashboard.json @@ -31,6 +31,13 @@ "color": { "mode": "thresholds" }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, "mappings": [], "thresholds": { "mode": "absolute", @@ -50,11 +57,95 @@ "overrides": [] }, "gridPos": { - "h": 8, - "w": 24, + "h": 10, + "w": 5, "x": 0, "y": 0 }, + "id": 3, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 0, + "showHeader": true + }, + "pluginVersion": "10.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "bacalhau_bacalhau_node_info_total", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Node Metadata", + "transformations": [ + { + "id": "labelsToFields", + "options": { + "keepLabels": [ + "node_engines", + "node_id", + "node_is_compute", + "node_is_requester", + "node_network_transport", + "node_publishers", + "node_storages" + ], + "mode": "rows" + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 2, + "x": 5, + "y": 0 + }, "id": 2, "options": { "colorMode": "value", @@ -105,16 +196,83 @@ "mode": 
"thresholds" }, "mappings": [], + "noValue": "0", "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null - }, + } + ] + }, + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 2, + "x": 7, + "y": 0 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "bacalhau_jobs_accepted_total", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Jobs Accepted", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ { - "color": "red", - "value": 80 + "color": "green", + "value": null } ] }, @@ -123,12 +281,12 @@ "overrides": [] }, "gridPos": { - "h": 9, - "w": 24, - "x": 0, - "y": 8 + "h": 10, + "w": 2, + "x": 9, + "y": 0 }, - "id": 1, + "id": 5, "options": { "colorMode": "value", "graphMode": "area", @@ -166,22 +324,1062 @@ ], "title": "Jobs Completed", "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 2, + "x": 11, + 
"y": 0 + }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "bacalhau_docker_active_executions", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Active Docker Executions", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 2, + "x": 13, + "y": 0 + }, + "id": 7, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "bacalhau_wasm_active_executions", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Active WASM Executions", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, 
+ "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ms", + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "editorMode": "code", + "expr": "rate(bacalhau_job_duration_milliseconds_sum[5m])\n/\nrate(bacalhau_job_duration_milliseconds_count[5m])", + "instant": false, + "legendFormat": "{{task_engine}}", + "range": true, + "refId": "A" + } + ], + "title": "Average Job Duration over 5mins", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": 
false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms", + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "editorMode": "code", + "expr": "rate(bacalhau_http_server_duration_milliseconds_sum[5m])\n/\nrate(bacalhau_http_server_duration_milliseconds_count[5m])", + "instant": false, + "legendFormat": "{{http_route}}", + "range": true, + "refId": "A" + } + ], + "title": "Average HTTP Requests Duration over 5min", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { +
"mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes", + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 17, + "options": { + "legend": { + "calcs": [ + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "editorMode": "code", + "expr": "bacalhau_http_server_request_content_length_bytes_total", + "hide": false, + "instant": false, + "legendFormat": "{{http_route}}", + "range": true, + "refId": "B" + } + ], + "title": "HTTP Request Content Length", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes", + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 34 + }, + "id": 18, + "options": { + "legend": { + "calcs": [ 
+ "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "editorMode": "code", + "expr": "bacalhau_http_server_response_content_length_bytes_total", + "hide": false, + "instant": false, + "legendFormat": "{{http_route}}", + "range": true, + "refId": "B" + } + ], + "title": "HTTP Response Content Length", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 2, + "x": 0, + "y": 42 + }, + "id": 10, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "bacalhau_eval_broker_cancelable", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Evaluation Broker Cancelable", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", +
"value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 2, + "x": 2, + "y": 42 + }, + "id": 11, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "bacalhau_eval_broker_inflight", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Evaluation Broker Inflight", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 2, + "x": 4, + "y": 42 + }, + "id": 12, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "bacalhau_eval_broker_pending", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", +
"range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Evaluation Broker Pending", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 2, + "x": 6, + "y": 42 + }, + "id": 13, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "bacalhau_eval_broker_waiting", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Evaluation Broker Waiting", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 2, + "x": 8, + "y": 42 + }, + "id": 14, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", +
"wideLayout": true + }, + "pluginVersion": "10.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "bacalhau_worker_ack_faults_total", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Worker Ack Faults", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 2, + "x": 10, + "y": 42 + }, + "id": 15, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "bacalhau_worker_dequeue_faults_total", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Worker Dequeue Faults", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unitScale": true + }, + "overrides": [] + }, + "gridPos": { + 
"h": 10, + "w": 2, + "x": 12, + "y": 42 + }, + "id": 16, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "P6EBD7EB59B5FF381" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "bacalhau_worker_process_faults_total", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Worker Process Faults", + "type": "stat" } ], - "refresh": "", + "refresh": "10s", "schemaVersion": 39, "tags": [], "templating": { "list": [] }, "time": { - "from": "now-5m", + "from": "now-15m", "to": "now" }, "timepicker": {}, "timezone": "", "title": "Bacalhau Metrics", "uid": "cbe6c668-d74b-4a27-be8b-431c19b2d4ca", - "version": 1, + "version": 2, "weekStart": "" } \ No newline at end of file diff --git a/ops/metrics/otel-collector-config,yaml b/ops/metrics/otel-collector-config,yaml deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/ops/metrics/prometheus/prometheus.yml b/ops/metrics/prometheus/prometheus.yml index 7e33e0d582..1e7a9903d6 100644 --- a/ops/metrics/prometheus/prometheus.yml +++ b/ops/metrics/prometheus/prometheus.yml @@ -1,5 +1,5 @@ scrape_configs: - job_name: 'otel-collector' - scrape_interval: 5s + scrape_interval: 1s static_configs: - targets: ['opentelemetry-collector:9095'] diff --git a/pkg/compute/executor.go b/pkg/compute/executor.go index 37e2d16935..66d9ce33dd 100644 --- a/pkg/compute/executor.go +++ b/pkg/compute/executor.go @@ -11,6 +11,7 @@ import ( "github.com/rs/zerolog/log" "github.com/bacalhau-project/bacalhau/pkg/models" + "github.com/bacalhau-project/bacalhau/pkg/telemetry" 
"github.com/bacalhau-project/bacalhau/pkg/compute/store" "github.com/bacalhau-project/bacalhau/pkg/executor" @@ -298,11 +299,14 @@ func (e *BaseExecutor) Run(ctx context.Context, state store.LocalExecutionState) Str("execution", execution.ID). Logger().WithContext(ctx) + stopwatch := telemetry.NewTimer(jobDurationMilliseconds) + stopwatch.Start() operation := "Running" defer func() { if err != nil { e.handleFailure(ctx, state, err, operation) } + stopwatch.Stop(ctx, state.Execution.Job.MetricAttributes()...) }() res := e.Start(ctx, execution) diff --git a/pkg/compute/metrics.go b/pkg/compute/metrics.go index 398993f92c..3e453f5cfc 100644 --- a/pkg/compute/metrics.go +++ b/pkg/compute/metrics.go @@ -1,30 +1,37 @@ package compute import ( + "github.com/samber/lo" "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/metric" ) // Metrics for monitoring compute nodes: var ( - meter = otel.GetMeterProvider().Meter("compute") - jobsReceived, _ = meter.Int64Counter( + meter = otel.GetMeterProvider().Meter("compute") + jobsReceived = lo.Must(meter.Int64Counter( "jobs_received", metric.WithDescription("Number of jobs received by the compute node"), - ) + )) - jobsAccepted, _ = meter.Int64Counter( + jobsAccepted = lo.Must(meter.Int64Counter( "jobs_accepted", metric.WithDescription("Number of jobs bid on and accepted by the compute node"), - ) + )) - jobsCompleted, _ = meter.Int64Counter( + jobsCompleted = lo.Must(meter.Int64Counter( "jobs_completed", metric.WithDescription("Number of jobs completed by the compute node."), - ) + )) - jobsFailed, _ = meter.Int64Counter( + jobsFailed = lo.Must(meter.Int64Counter( "jobs_failed", metric.WithDescription("Number of jobs failed by the compute node."), - ) + )) + + jobDurationMilliseconds = lo.Must(meter.Int64Histogram( + "job_duration_milliseconds", + metric.WithDescription("Duration of a job on the compute node in milliseconds."), + metric.WithUnit("ms"), + )) ) diff --git a/pkg/models/execution.go b/pkg/models/execution.go index 
124388ca36..7b04eadba7 100644 --- a/pkg/models/execution.go +++ b/pkg/models/execution.go @@ -6,8 +6,9 @@ import ( "errors" "time" - "github.com/bacalhau-project/bacalhau/pkg/lib/validate" "github.com/hashicorp/go-multierror" + + "github.com/bacalhau-project/bacalhau/pkg/lib/validate" ) // ExecutionStateType The state of an execution. An execution represents a single attempt to execute a job on a node. diff --git a/pkg/node/metrics/node.go b/pkg/node/metrics/node.go new file mode 100644 index 0000000000..526f5b19ac --- /dev/null +++ b/pkg/node/metrics/node.go @@ -0,0 +1,20 @@ +package metrics + +import ( + "github.com/samber/lo" + "go.opentelemetry.io/otel" + + "github.com/bacalhau-project/bacalhau/pkg/telemetry" +) + +var ( + nodeMeter = otel.GetMeterProvider().Meter("bacalhau-node") +) + +var ( + NodeInfo = lo.Must(telemetry.NewCounter( + nodeMeter, + "bacalhau_node_info", + "A static metric with labels describing the bacalhau node", + )) +) diff --git a/pkg/node/node.go b/pkg/node/node.go index a8b2c8c494..ba619ae2cc 100644 --- a/pkg/node/node.go +++ b/pkg/node/node.go @@ -5,6 +5,12 @@ import ( "fmt" "time" + "github.com/hashicorp/go-multierror" + "github.com/imdario/mergo" + "github.com/labstack/echo/v4" + "github.com/libp2p/go-libp2p/core/host" + "go.opentelemetry.io/otel/attribute" + "github.com/bacalhau-project/bacalhau/pkg/authz" pkgconfig "github.com/bacalhau-project/bacalhau/pkg/config" "github.com/bacalhau-project/bacalhau/pkg/config/types" @@ -14,6 +20,7 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/model" "github.com/bacalhau-project/bacalhau/pkg/models" nats_transport "github.com/bacalhau-project/bacalhau/pkg/nats/transport" + "github.com/bacalhau-project/bacalhau/pkg/node/metrics" "github.com/bacalhau-project/bacalhau/pkg/publicapi" "github.com/bacalhau-project/bacalhau/pkg/publicapi/apimodels" "github.com/bacalhau-project/bacalhau/pkg/publicapi/endpoint/agent" @@ -24,10 +31,6 @@ import ( "github.com/bacalhau-project/bacalhau/pkg/system" 
"github.com/bacalhau-project/bacalhau/pkg/transport" "github.com/bacalhau-project/bacalhau/pkg/version" - "github.com/hashicorp/go-multierror" - "github.com/imdario/mergo" - "github.com/labstack/echo/v4" - "github.com/libp2p/go-libp2p/core/host" ) type FeatureConfig struct { @@ -365,6 +368,15 @@ func NewNode( return errors.ErrorOrNil() }) + metrics.NodeInfo.Add(ctx, 1, + attribute.String("node_id", config.NodeID), + attribute.String("node_network_transport", config.NetworkConfig.Type), + attribute.Bool("node_is_compute", config.IsComputeNode), + attribute.Bool("node_is_requester", config.IsRequesterNode), + attribute.StringSlice("node_engines", executors.Keys(ctx)), + attribute.StringSlice("node_publishers", publishers.Keys(ctx)), + attribute.StringSlice("node_storages", storageProviders.Keys(ctx)), + ) node := &Node{ ID: config.NodeID, CleanupManager: config.CleanupManager, diff --git a/pkg/telemetry/metrics.go b/pkg/telemetry/metrics.go index f602e457ab..35415d6dc2 100644 --- a/pkg/telemetry/metrics.go +++ b/pkg/telemetry/metrics.go @@ -29,6 +29,7 @@ func newMeterProvider() { return } + // TODO we can decrese the read rate from the default 30sec to something quicker by passing option here. reader := sdkmetric.NewPeriodicReader(exp) meterProvider = sdkmetric.NewMeterProvider( diff --git a/pkg/telemetry/timer.go b/pkg/telemetry/timer.go new file mode 100644 index 0000000000..9a8da613a2 --- /dev/null +++ b/pkg/telemetry/timer.go @@ -0,0 +1,40 @@ +package telemetry + +import ( + "context" + "time" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// Timer measures the duration of an event. +type Timer struct { + startTime time.Time + durationRecorder metric.Int64Histogram +} + +func NewTimer(durationRecorder metric.Int64Histogram) *Timer { + return &Timer{ + durationRecorder: durationRecorder, + } +} + +// Start begins the timer by recording the current time. 
+func (t *Timer) Start() { + t.startTime = time.Now() +} + +// Stop ends the timer and records the milliseconds elapsed since Start on the histogram, then resets the timer. +// `attrs` are optional attributes attached to the recorded measurement for additional context. +func (t *Timer) Stop(ctx context.Context, attrs ...attribute.KeyValue) { + if t.startTime.IsZero() { + // Nothing to record: Stop was called without a preceding Start (or a second time in a row). + return + } + + // Measure the elapsed time in whole milliseconds and record it with the caller-supplied attributes. + duration := time.Since(t.startTime).Milliseconds() + t.durationRecorder.Record(ctx, duration, metric.WithAttributes(attrs...)) + t.startTime = time.Time{} // Zero the start time so the timer can be reused. +}