98 changes: 98 additions & 0 deletions docker/README.md
@@ -0,0 +1,98 @@
# Prometheus & Grafana Setup Guide

This document explains how to run Prometheus and Grafana locally for two audiences:

1) **Node runners** who want to view dashboards alongside their nodes
2) **Citrea developers** who want to update dashboards locally and sync changes to production

---

## 1) For Node Runners: View Grafana Dashboards Next to Your Node

1. Go to the Docker directory:
- `<project_dir>/docker`

2. Start the user telemetry stack:
```bash
docker compose -f docker-compose.telemetry.user.yaml up
```

3. Open Grafana:
- `http://localhost:3000`
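   - Log in as `admin` with the password set via `GF_SECURITY_ADMIN_PASSWORD` in the compose file (`password` by default).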

4. Import a dashboard:
- Navigate to:
- `<project_dir>/resources/grafana/user`
- Copy the JSON for the node type you want.
- Import it into Grafana.

**Note:** Don’t forget to add the telemetry section to your node's `rollup_config.toml` so it exposes metrics for Prometheus to scrape.
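The exact key names depend on your node version, but a hypothetical sketch of such a section (names assumed here; check the example config shipped with your node) looks like:

```toml
# Hypothetical sketch: verify key names against your node's example config.
[telemetry]
bind_host = "0.0.0.0"  # expose the metrics endpoint to the Prometheus container
bind_port = 8001       # must match a scrape target in telemetry-user/prometheus.yml
```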

---

## 2) For Citrea Developers: Update Dashboards Locally and Sync to Production

### Run Prometheus + Grafana Locally

1. Go to the Docker directory:
- `<project_dir>/docker`

2. Start the developer telemetry stack:
```bash
docker compose -f docker-compose.telemetry.yaml up
```
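   This stack builds a local Prometheus image from `docker/prometheus/` (see the Dockerfile in this change set); on startup, `init.py` generates `/etc/prometheus/prometheus.yml` before Prometheus is launched.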

3. Open Grafana and make your dashboard changes:
- `http://localhost:3000`

---

### Update the Production Dashboard

4. When exporting your updated dashboard:
- Select **Export as code → Classic**
- Disable **Export for sharing externally**
- This ensures the local data source UID remains compatible with production.
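For reference, each panel in the exported JSON keeps a datasource reference whose `uid` should match the one provisioned in `docker/telemetry-user/grafana/datasources.yml`, roughly (excerpt; field layout may vary by Grafana version):

```json
"datasource": {
  "type": "prometheus",
  "uid": "eed8s4geh3myob"
}
```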

5. In production Grafana:
- Open the dashboard
- Click **Edit**
- Go to **Settings → JSON Model**
- Paste the updated JSON.

6. Save the updated production dashboard JSON under:
- `resources/grafana/prod`

---

### Update User Dashboards

7. After updating the production dashboard, generate the user version using:
```bash
python ./resources/grafana/remove_labels.py \
./resources/grafana/prod/<node_type>.dashboard.json \
> ./resources/grafana/user/<node_type>.dashboard.json
```
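The `remove_labels.py` script itself is not part of this change; presumably it strips the production-specific labels (such as the `net_name` and `env_name` labels added by `docker/prometheus/init.py`) from the panel queries so the dashboard works against a single local node.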

8. Validate:
- Run Prometheus/Grafana in **user mode**
- Import the newly generated user dashboard
- Confirm everything looks correct.

---

## Summary

- **Node runners** should use:
- `docker-compose.telemetry.user.yaml`
- User dashboards from `resources/grafana/user`

- **Developers** should:
- Update locally with `docker-compose.telemetry.yaml`
- Export carefully to preserve UID compatibility
- Paste into prod JSON Model
- Commit the updated JSON in:
- `resources/grafana/prod`
- Regenerate user dashboards using `remove_labels.py`

---
43 changes: 43 additions & 0 deletions docker/docker-compose.telemetry.user.yaml
@@ -0,0 +1,43 @@
services:
prometheus:
container_name: prometheus.citrea
image: prom/prometheus
ports:
- 9090:9090
volumes:
- ./telemetry-user/prometheus.yml:/etc/prometheus/prometheus.yml
networks:
- monitoring
extra_hosts:
- "host.docker.internal:host-gateway"
grafana:
image: grafana/grafana-enterprise
ports:
- 3000:3000
environment:
- GF_SECURITY_ADMIN_PASSWORD=password
volumes:
- grafana_data_user:/var/lib/grafana
- ./telemetry-user/grafana/datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml
networks:
- monitoring

cadvisor:
image: gcr.io/cadvisor/cadvisor
ports:
- 8080:8080
volumes:
- /:/rootfs:ro
- /var/run:/var/run:ro
- /sys:/sys:ro
- /var/lib/docker/:/var/lib/docker:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro # Mount the Docker socket only if your containers are running on macOS
networks:
- monitoring

networks:
monitoring:
driver: bridge

volumes:
grafana_data_user: {}
7 changes: 6 additions & 1 deletion docker/docker-compose.telemetry.yaml
@@ -1,6 +1,10 @@
services:
prometheus:
image: prom/prometheus
container_name: prometheus.citrea
build:
context: ./prometheus
dockerfile: Dockerfile
image: local-prometheus
ports:
- 9090:9090
volumes:
@@ -18,6 +22,7 @@ services:
- GF_SECURITY_ADMIN_PASSWORD=password
volumes:
- grafana_data:/var/lib/grafana
- ./telemetry/grafana/datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml
networks:
- monitoring

23 changes: 23 additions & 0 deletions docker/prometheus/Dockerfile
@@ -0,0 +1,23 @@
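# Copy the Prometheus binaries from the official image into an Ubuntu base that
# also ships Python, so init.py can generate the scrape config at container start.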
FROM prom/prometheus:v3.4.0 AS source

FROM ubuntu:24.04

RUN apt update && apt -y upgrade && apt -y install python3 python3-pip && apt -y autoremove && apt clean && rm -rf /var/lib/apt/lists/*

RUN pip3 install --break-system-packages pyyaml

COPY --from=source /bin/prometheus /bin/prometheus
COPY --from=source /bin/promtool /bin/promtool

RUN mkdir -p /mnt/task/prometheus-data && mkdir -p /etc/prometheus

RUN apt update && apt install telnet -y

WORKDIR /srv

COPY start.sh /srv/start.sh
COPY init.py /srv/init.py

RUN chmod +x /srv/start.sh

ENTRYPOINT ["/srv/start.sh"]
165 changes: 165 additions & 0 deletions docker/prometheus/init.py
@@ -0,0 +1,165 @@
from copy import deepcopy

import yaml


# Local defaults; the "SERVICE:<job_name>" keys map each scrape job to its target address.
LOCAL = {
"ENVIRONMENT": "CORE",
"NET_NAME": "DEV-NET",
"SERVICE:citrea-sequencer": "host.docker.internal:8001",
"SERVICE:citrea-full-node": "host.docker.internal:8002",
"SERVICE:citrea-prover": "host.docker.internal:8003",
"SERVICE:citrea-light-prover": "host.docker.internal:8004",
}


def main():
net_name = LOCAL["NET_NAME"]
env_name = LOCAL["ENVIRONMENT"]

services = [(k.split(":")[1], v) for k, v in LOCAL.items() if k.startswith("SERVICE:")]

prometheus_config_yaml = create_prometheus_config(net_name, env_name, services)

# print(prometheus_config_yaml)

with open("/etc/prometheus/prometheus.yml", "w") as f:
f.write(prometheus_config_yaml)


def create_prometheus_config(net_name: str, env_name: str, services: list[tuple[str, str]]):
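    """Build a Prometheus config with a self-scrape job plus one scrape job per
    service, tag every target with net_name/env_name labels, and return the
    result as a YAML string."""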
net_name = net_name.lower()
env_name = env_name.lower()

global_config = {
"scrape_interval": "15s",
"evaluation_interval": "15s"
}

general_relabel_configs = [
{
"source_labels": [
"__address__"
],
"target_label": "net_name",
"replacement": net_name
},
{
"source_labels": [
"__address__"
],
"target_label": "env_name",
"replacement": env_name
}
]

self_scrape_config = {
"job_name": "prometheus",
"static_configs": [
{
"targets": [
"127.0.0.1:9090"
]
}
],
"relabel_configs": [
*deepcopy(general_relabel_configs),
*replacement_builder(".*", "prometheus"),
]
}

service_grouped = {
k: [f"{x[1]}" for x in services if x[0] == k]
for k in set(x[0] for x in services)
}

service_configs = [
{
"job_name": k,
"scrape_interval": "1s",
"static_configs": [
{
"targets": v
}
],
"relabel_configs": [
*deepcopy(general_relabel_configs),
*replacement_builder("(.*)\.?:.*", "$1"),
*replacement_builder("(.*)\.citrea\.?:.*", "$1"),
*replacement_builder(f"{env_name}-{net_name}-(.*)\.citrea\.?:.*", "$1"),
]
}
for k, v in service_grouped.items()
]

scrape_configs = [
self_scrape_config,
*service_configs,
]


prometheus_config = {
"global": global_config,
"scrape_configs": scrape_configs
}

prometheus_config_yaml = yaml.dump(prometheus_config, default_flow_style=False)
return prometheus_config_yaml


def replacement_builder(regex: str, replacement: str):
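    """Return a pair of relabel rules that rewrite the `instance` and
    `service_name` labels of targets matching `regex`, prefixing `instance`
    with a short net/env code (e.g. "dnc" for DEV-NET CORE)."""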
net_name = LOCAL["NET_NAME"].lower()
env_name = LOCAL["ENVIRONMENT"].lower()

if "dev" in net_name:
short_prefix_start = "dn"
elif "test" in net_name:
short_prefix_start = "tn"
elif "main" in net_name:
short_prefix_start = "mn"
elif "general" in net_name:
short_prefix_start = "g"
else:
short_prefix_start = "x"

if "core" in env_name:
short_prefix_end = "c"
elif "web" in env_name:
short_prefix_end = "w"
elif "pop" in env_name:
short_prefix_end = "p"
if "eu" in env_name:
short_prefix_end = f"{short_prefix_end}eu"
elif "ap" in env_name:
short_prefix_end = f"{short_prefix_end}ap"
else:
short_prefix_end = f"{short_prefix_end}xx"
elif "common" in env_name:
short_prefix_end = "c"
else:
short_prefix_end = "x"

short_prefix = f"{short_prefix_start}{short_prefix_end}"

return deepcopy([
{
"source_labels": [
"__address__"
],
"regex": regex,
"target_label": "instance",
"replacement": f"{short_prefix}-{replacement}",
},
{
"source_labels": [
"__address__"
],
"regex": regex,
"target_label": "service_name",
"replacement": f"{replacement}",
}
])


if __name__ == "__main__":
main()
15 changes: 15 additions & 0 deletions docker/prometheus/start.sh
@@ -0,0 +1,15 @@
#!/bin/bash
set -e

# Generate /etc/prometheus/prometheus.yml from the settings in init.py
python3 ./init.py


# Run Prometheus in server mode
/bin/prometheus \
--config.file=/etc/prometheus/prometheus.yml \
--storage.tsdb.path=/mnt/task/prometheus-data \
--web.external-url= \
--web.page-title="DEV-NET CORE Prometheus" \
--web.enable-lifecycle \
--web.enable-admin-api \
--storage.tsdb.retention.time=30d
14 changes: 14 additions & 0 deletions docker/telemetry-user/grafana/datasources.yml
@@ -0,0 +1,14 @@
# NOTE: A specific UID is used to match the production environment's datasource UID
# to enable seamless dashboard import/export.

apiVersion: 1

datasources:
- name: prometheus
type: prometheus
access: proxy
uid: eed8s4geh3myob
url: http://prometheus.citrea:9090
isDefault: true
readOnly: false
editable: true