Skip to content

Commit 827aefd

Browse files
committed
feat(validator): add support to validate kepler metrics
This commit adds support to validate essential metrics produced by Kepler.
Signed-off-by: vprashar2929 <[email protected]>
1 parent 75b9533 commit 827aefd

File tree

4 files changed

+356
-0
lines changed

4 files changed

+356
-0
lines changed
+320
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,320 @@
1+
config:
2+
mapping:
3+
actual: latest
4+
predicted: dev
5+
6+
validations:
7+
# absolute power comparison
8+
- name: Total - absolute
  # rate() over a joules counter yields joules per second, i.e. Watts;
  # declared explicitly so this entry matches the sibling power validations.
  units: Watts
  latest: |
    sum(
      rate(
        kepler_process_joules_total{{
          job="{latest_job_name}",
        }}[{rate_interval}]
      )
    )
  dev: |
    sum(
      rate(
        kepler_process_joules_total{{
          job="{dev_job_name}",
        }}[{rate_interval}]
      )
    )

  max_mae: 2.0
27+
28+
- name: platform - absolute
  # rate() over a joules counter yields joules per second, i.e. Watts;
  # declared explicitly so this entry matches the sibling power validations.
  units: Watts
  latest: |
    sum(
      rate(
        kepler_process_platform_joules_total{{
          job="{latest_job_name}",
        }}[{rate_interval}]
      )
    )
  dev: |
    sum(
      rate(
        kepler_process_platform_joules_total{{
          job="{dev_job_name}",
        }}[{rate_interval}]
      )
    )

  max_mae: 2.0
47+
48+
- name: package - absolute
  # rate() over a joules counter yields joules per second, i.e. Watts;
  # declared explicitly so this entry matches the sibling power validations.
  units: Watts
  latest: |
    sum(
      rate(
        kepler_process_package_joules_total{{
          job="{latest_job_name}",
        }}[{rate_interval}]
      )
    )
  dev: |
    sum(
      rate(
        kepler_process_package_joules_total{{
          job="{dev_job_name}",
        }}[{rate_interval}]
      )
    )
  max_mae: 2.0
66+
67+
- name: dram - absolute
  # rate() over a joules counter yields joules per second, i.e. Watts;
  # declared explicitly so this entry matches the sibling power validations.
  units: Watts
  latest: |
    sum(
      rate(
        kepler_process_dram_joules_total{{
          job="{latest_job_name}",
        }}[{rate_interval}]
      )
    )
  dev: |
    sum(
      rate(
        kepler_process_dram_joules_total{{
          job="{dev_job_name}",
        }}[{rate_interval}]
      )
    )
  max_mae: 2.0
85+
86+
- name: core - absolute
87+
units: Watts
88+
latest: |
89+
sum(
90+
rate(
91+
kepler_process_core_joules_total{{
92+
job="{latest_job_name}",
93+
}}[{rate_interval}]
94+
)
95+
)
96+
dev: |
97+
sum(
98+
rate(
99+
kepler_process_core_joules_total{{
100+
job="{dev_job_name}",
101+
}}[{rate_interval}]
102+
)
103+
)
104+
max_mae: 2.0
105+
106+
- name: other - absolute
107+
units: Watts
108+
latest: |
109+
sum(
110+
rate(
111+
kepler_process_other_joules_total{{
112+
job="{latest_job_name}",
113+
}}[{rate_interval}]
114+
)
115+
)
116+
dev: |
117+
sum(
118+
rate(
119+
kepler_process_other_joules_total{{
120+
job="{dev_job_name}",
121+
}}[{rate_interval}]
122+
)
123+
)
124+
max_mae: 2.0
125+
126+
# CPU time comparison
127+
- name: cpu-time
128+
units: Milliseconds
129+
latest: |
130+
sum(
131+
rate(
132+
kepler_process_bpf_cpu_time_ms_total{{
133+
job="{latest_job_name}"
134+
}}[{rate_interval}]
135+
)
136+
)
137+
dev: |
138+
sum(
139+
rate(
140+
kepler_process_bpf_cpu_time_ms_total{{
141+
job="{dev_job_name}",
142+
}}[{rate_interval}]
143+
)
144+
)
145+
# max_mae: 20.0
146+
147+
- name: package - dynamic
148+
units: Watts
149+
latest: |
150+
sum(
151+
rate(
152+
kepler_process_package_joules_total{{
153+
job="{latest_job_name}",
154+
mode="dynamic",
155+
}}[{rate_interval}]
156+
)
157+
)
158+
dev: |
159+
sum(
160+
rate(
161+
kepler_process_package_joules_total{{
162+
job="{dev_job_name}",
163+
mode="dynamic",
164+
}}[{rate_interval}]
165+
)
166+
)
167+
max_mae: 2.0
168+
169+
- name: core - dynamic
170+
units: Watts
171+
latest: |
172+
sum(
173+
rate(
174+
kepler_process_core_joules_total{{
175+
job="{latest_job_name}",
176+
mode="dynamic",
177+
}}[{rate_interval}]
178+
)
179+
)
180+
dev: |
181+
sum(
182+
rate(
183+
kepler_process_core_joules_total{{
184+
job="{dev_job_name}",
185+
mode="dynamic",
186+
}}[{rate_interval}]
187+
)
188+
)
189+
max_mae: 2.0
190+
191+
- name: dram - dynamic
192+
units: Watts
193+
latest: |
194+
sum(
195+
rate(
196+
kepler_process_dram_joules_total{{
197+
job="{latest_job_name}",
198+
mode="dynamic",
199+
}}[{rate_interval}]
200+
)
201+
)
202+
dev: |
203+
sum(
204+
rate(
205+
kepler_process_dram_joules_total{{
206+
job="{dev_job_name}",
207+
mode="dynamic",
208+
}}[{rate_interval}]
209+
)
210+
)
211+
max_mae: 2.0
212+
213+
- name: other - dynamic
214+
units: Watts
215+
latest: |
216+
sum(
217+
rate(
218+
kepler_process_other_joules_total{{
219+
job="{latest_job_name}",
220+
mode="dynamic",
221+
}}[{rate_interval}]
222+
)
223+
)
224+
dev: |
225+
sum(
226+
rate(
227+
kepler_process_other_joules_total{{
228+
job="{dev_job_name}",
229+
mode="dynamic",
230+
}}[{rate_interval}]
231+
)
232+
)
233+
max_mae: 2.0
234+
235+
# Node comparison
236+
- name: node platform - dynamic
237+
units: Watts
238+
latest: |
239+
rate(kepler_node_platform_joules_total{{
240+
job="{latest_job_name}",
241+
mode="dynamic",
242+
}}[{rate_interval}]
243+
)
244+
245+
dev: |
246+
rate(kepler_node_platform_joules_total{{
247+
job="{dev_job_name}",
248+
mode="dynamic",
249+
}}[{rate_interval}]
250+
)
251+
max_mae: 2.0
252+
253+
- name: node package - dynamic
254+
units: Watts
255+
latest: |
256+
rate(kepler_node_package_joules_total{{
257+
job="{latest_job_name}",
258+
mode="dynamic",
259+
}}[{rate_interval}]
260+
)
261+
262+
dev: |
263+
rate(kepler_node_package_joules_total{{
264+
job="{dev_job_name}",
265+
mode="dynamic",
266+
}}[{rate_interval}]
267+
)
268+
max_mae: 2.0
269+
270+
- name: node core - dynamic
271+
units: Watts
272+
latest: |
273+
rate(kepler_node_core_joules_total{{
274+
job="{latest_job_name}",
275+
mode="dynamic",
276+
}}[{rate_interval}]
277+
)
278+
279+
dev: |
280+
rate(kepler_node_core_joules_total{{
281+
job="{dev_job_name}",
282+
mode="dynamic",
283+
}}[{rate_interval}]
284+
)
285+
max_mae: 2.0
286+
287+
- name: node dram - dynamic
288+
units: Watts
289+
latest: |
290+
rate(kepler_node_dram_joules_total{{
291+
job="{latest_job_name}",
292+
mode="dynamic",
293+
}}[{rate_interval}]
294+
)
295+
296+
dev: |
297+
rate(kepler_node_dram_joules_total{{
298+
job="{dev_job_name}",
299+
mode="dynamic",
300+
}}[{rate_interval}]
301+
)
302+
max_mae: 2.0
303+
304+
- name: node other - dynamic
305+
units: Watts
306+
latest: |
307+
rate(kepler_node_other_joules_total{{
308+
job="{latest_job_name}",
309+
mode="dynamic",
310+
}}[{rate_interval}]
311+
)
312+
313+
dev: |
314+
rate(kepler_node_other_joules_total{{
315+
job="{dev_job_name}",
316+
mode="dynamic",
317+
}}[{rate_interval}]
318+
)
319+
max_mae: 2.0
320+

e2e/tools/validator/src/validator/cli/__init__.py

+29
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import matplotlib.pyplot as plt
1717
import numpy as np
1818
import numpy.typing as npt
19+
from click.exceptions import Exit
1920
from matplotlib import ticker
2021
from matplotlib.dates import DateFormatter
2122

@@ -610,6 +611,34 @@ def validate_acpi(cfg: config.Validator, duration: datetime.timedelta, report_di
610611
return int(res.validations.passed)
611612

612613

614+
@validator.command()
@click.option("--duration", "-d", type=options.Duration(), required=True)
@click.option(
    "--report-dir",
    "-o",
    # `/tmp` is intentionally the default reporting directory. The suppression
    # is scoped to this line only; a `# ruff: noqa` comment would instead
    # disable S108 for the entire module.
    default="/tmp",  # noqa: S108
    type=click.Path(exists=True, dir_okay=True, writable=True),
    show_default=True,
)
@pass_config
def validate_metrics(cfg: config.Validator, duration: datetime.timedelta, report_dir: str):
    """Run the configured validations over the last DURATION and write a markdown report.

    Exits with status 0 when the validations passed and 1 otherwise.
    """
    results_dir, tag = create_report_dir(report_dir)
    res = TestResult(tag)

    # The validated window is the `duration` that ends at the moment of invocation.
    res.end_time = datetime.datetime.now(tz=datetime.UTC)
    res.start_time = res.end_time - duration

    click.secho("  * Generating build and node info ...", fg="green")
    res.build_info, res.node_info = get_build_and_node_info(cfg.prometheus)

    click.secho("  * Generating spec report ...", fg="green")
    res.host_spec = get_host_spec()

    script_result = ScriptResult(res.start_time, res.end_time)
    res.validations = run_validations(cfg, script_result, results_dir)

    click.secho("  * Generating validate metrics report file and dir", fg="green")
    write_md_report(results_dir, res)

    # Surface the outcome through the process exit code.
    raise Exit(0 if res.validations.passed else 1)
640+
641+
613642
def write_json_report(results_dir: str, res: TestResult):
614643
pattern = re.compile(r'[{]?(\w+)=("[^"]*"|[^,]+)[},]?')
615644

e2e/tools/validator/src/validator/config/__init__.py

+4
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ class Metal(NamedTuple):
3131
# Prometheus job names used to fill the `*_job_name` placeholders in the
# validation PromQL templates. Field order is part of the tuple's positional
# interface; the loader constructs it with keyword arguments.
class PrometheusJob(NamedTuple):
3232
# Exposed to queries as `metal_job_name`; the loader defaults it to "metal".
metal: str
3333
# Exposed to queries as `vm_job_name`; the loader defaults it to "vm".
vm: str
34+
# Exposed to queries as `dev_job_name`; the loader defaults it to "dev".
dev: str
35+
# Exposed to queries as `latest_job_name`; the loader defaults it to "latest".
latest: str
3436

3537

3638
class Prometheus(NamedTuple):
@@ -95,6 +97,8 @@ def load(config_file: str) -> Validator:
9597
job = PrometheusJob(
9698
metal=prom_job.get("metal", "metal"),
9799
vm=prom_job.get("vm", "vm"),
100+
latest=prom_job.get("latest", "latest"),
101+
dev=prom_job.get("dev", "dev"),
98102
)
99103

100104
prometheus = Prometheus(

e2e/tools/validator/src/validator/validations/__init__.py

+3
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ def validation_from_yaml(v: dict[str, Any]) -> Validation:
8888
predicted_label=predicted_label,
8989
units=v.get("units", ""),
9090
max_mape=v.get("max_mape"),
91+
max_mae=v.get("max_mae"),
9192
)
9293

9394
return [validation_from_yaml(v) for v in yml["validations"]]
@@ -112,6 +113,8 @@ def load(self) -> list[Validation]:
112113
promql_vars["rate_interval"] = prom.rate_interval
113114
promql_vars["metal_job_name"] = prom.job.metal
114115
promql_vars["vm_job_name"] = prom.job.vm
116+
promql_vars["latest_job_name"] = prom.job.latest
117+
promql_vars["dev_job_name"] = prom.job.dev
115118

116119
logger.debug("promql_vars: %s", promql_vars)
117120

0 commit comments

Comments
 (0)