Skip to content

Commit 28889fe

Browse files
committed
feat(validator): add support to validate essential metrics produced by Kepler
This commit introduces functionality to validate essential metrics produced by Kepler. The following comparisons are included: - Node Exporter Comparison - Validates `node_rapl_<package|core|dram>` metrics against `kepler_node_<package|core|dram>{dev}` - Kepler Process Comparison - Compares `kepler_process_<package|core|dram|platform|other|uncore>{latest}` metrics to `kepler_process_<package|core|dram|platform|other|uncore>{dev}` - Kepler Node Comparison - Validates `kepler_node_<package|core|dram|platform|other|uncore>{latest}` against `kepler_node_<package|core|dram|platform|other|uncore>{dev}` Additionally, the following changes are made to existing functionality: - Adds a new `metric_validations.yaml` file, which includes PromQL queries for the comparisons along with threshold values - Updates the existing `stressor.sh` script to support a few more parameters that make it more flexible - warmup time: time to wait before starting the stressor - cooldown time: time to wait after the stressor has finished - repeats: number of times to repeat the stressor (configurable because the regression test should not repeat the stressor multiple times) - Adds a new `validator-regression.yaml` file, which includes the configuration for the regression test Signed-off-by: vprashar2929 <[email protected]>
1 parent 021b544 commit 28889fe

File tree

9 files changed

+524
-13
lines changed

9 files changed

+524
-13
lines changed
+354
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,354 @@
1+
config:
2+
mapping:
3+
actual: latest
4+
predicted: dev
5+
6+
validations:
7+
# node rapl comparison
8+
- name: node-rapl - kepler-package
9+
units: Watts
10+
mapping:
11+
actual: node-rapl
12+
predicted: kepler-package
13+
14+
node-rapl: |
15+
sum(
16+
rate(
17+
node_rapl_package_joules_total[{rate_interval}]
18+
)
19+
)
20+
21+
kepler-package: |
22+
sum(
23+
rate(
24+
kepler_node_package_joules_total{{
25+
job="dev",
26+
mode="dynamic",
27+
}}[{rate_interval}]
28+
)
29+
)
30+
max_mae: 1.01
31+
32+
- name: node-rapl - kepler-core
33+
units: Watts
34+
mapping:
35+
actual: node-rapl
36+
predicted: kepler-core
37+
38+
node-rapl: |
39+
sum(
40+
rate(
41+
node_rapl_core_joules_total[{rate_interval}]
42+
)
43+
)
44+
45+
kepler-core: |
46+
sum(
47+
rate(
48+
kepler_node_core_joules_total{{
49+
job="dev",
50+
mode="dynamic",
51+
}}[{rate_interval}]
52+
)
53+
)
54+
max_mae: 1.01
55+
56+
- name: node-rapl - kepler-dram
57+
units: Watts
58+
mapping:
59+
actual: node-rapl
60+
predicted: kepler-dram
61+
62+
node-rapl: |
63+
sum(
64+
rate(
65+
node_rapl_dram_joules_total[{rate_interval}]
66+
)
67+
)
68+
69+
kepler-dram: |
70+
sum(
71+
rate(
72+
kepler_node_dram_joules_total{{
73+
job="dev",
74+
mode="dynamic",
75+
}}[{rate_interval}]
76+
)
77+
)
78+
max_mae: 1.01
79+
80+
# absolute power comparison
81+
- name: Total - absolute
82+
latest: |
83+
sum(
84+
rate(
85+
kepler_process_joules_total{{
86+
job="latest",
87+
}}[{rate_interval}]
88+
)
89+
)
90+
dev: |
91+
sum(
92+
rate(
93+
kepler_process_joules_total{{
94+
job="dev",
95+
}}[{rate_interval}]
96+
)
97+
)
98+
99+
max_mae: 2.01
100+
101+
# CPU time comparison
102+
- name: cpu-time
103+
units: Milliseconds
104+
latest: |
105+
sum(
106+
rate(
107+
kepler_process_bpf_cpu_time_ms_total{{
108+
job="latest"
109+
}}[{rate_interval}]
110+
)
111+
)
112+
dev: |
113+
sum(
114+
rate(
115+
kepler_process_bpf_cpu_time_ms_total{{
116+
job="dev",
117+
}}[{rate_interval}]
118+
)
119+
)
120+
# max_mae: 20.0
121+
122+
# process comparison
123+
- name: platform - dynamic
124+
latest: |
125+
sum(
126+
rate(
127+
kepler_process_platform_joules_total{{
128+
job="latest", mode="dynamic",
129+
}}[{rate_interval}]
130+
)
131+
)
132+
dev: |
133+
sum(
134+
rate(
135+
kepler_process_platform_joules_total{{
136+
job="dev", mode="dynamic",
137+
}}[{rate_interval}]
138+
)
139+
)
140+
141+
max_mae: 2.01
142+
143+
- name: package - dynamic
144+
units: Watts
145+
latest: |
146+
sum(
147+
rate(
148+
kepler_process_package_joules_total{{
149+
job="latest",
150+
mode="dynamic",
151+
}}[{rate_interval}]
152+
)
153+
)
154+
dev: |
155+
sum(
156+
rate(
157+
kepler_process_package_joules_total{{
158+
job="dev",
159+
mode="dynamic",
160+
}}[{rate_interval}]
161+
)
162+
)
163+
max_mae: 2.01
164+
165+
- name: core - dynamic
166+
units: Watts
167+
latest: |
168+
sum(
169+
rate(
170+
kepler_process_core_joules_total{{
171+
job="latest",
172+
mode="dynamic",
173+
}}[{rate_interval}]
174+
)
175+
)
176+
dev: |
177+
sum(
178+
rate(
179+
kepler_process_core_joules_total{{
180+
job="dev",
181+
mode="dynamic",
182+
}}[{rate_interval}]
183+
)
184+
)
185+
max_mae: 2.01
186+
187+
- name: dram - dynamic
188+
units: Watts
189+
latest: |
190+
sum(
191+
rate(
192+
kepler_process_dram_joules_total{{
193+
job="latest",
194+
mode="dynamic",
195+
}}[{rate_interval}]
196+
)
197+
)
198+
dev: |
199+
sum(
200+
rate(
201+
kepler_process_dram_joules_total{{
202+
job="dev",
203+
mode="dynamic",
204+
}}[{rate_interval}]
205+
)
206+
)
207+
max_mae: 2.01
208+
209+
- name: other - dynamic
210+
units: Watts
211+
latest: |
212+
sum(
213+
rate(
214+
kepler_process_other_joules_total{{
215+
job="latest",
216+
mode="dynamic",
217+
}}[{rate_interval}]
218+
)
219+
)
220+
dev: |
221+
sum(
222+
rate(
223+
kepler_process_other_joules_total{{
224+
job="dev",
225+
mode="dynamic",
226+
}}[{rate_interval}]
227+
)
228+
)
229+
max_mae: 2.01
230+
231+
- name: uncore - dynamic
232+
units: Watts
233+
latest: |
234+
sum(
235+
rate(
236+
kepler_process_uncore_joules_total{{
237+
job="latest",
238+
mode="dynamic",
239+
}}[{rate_interval}]
240+
)
241+
)
242+
dev: |
243+
sum(
244+
rate(
245+
kepler_process_uncore_joules_total{{
246+
job="dev",
247+
mode="dynamic",
248+
}}[{rate_interval}]
249+
)
250+
)
251+
max_mae: 2.01
252+
253+
# node comparison
254+
- name: node platform - dynamic
255+
units: Watts
256+
latest: |
257+
rate(kepler_node_platform_joules_total{{
258+
job="latest",
259+
mode="dynamic",
260+
}}[{rate_interval}]
261+
)
262+
263+
dev: |
264+
rate(kepler_node_platform_joules_total{{
265+
job="dev",
266+
mode="dynamic",
267+
}}[{rate_interval}]
268+
)
269+
max_mae: 2.01
270+
271+
- name: node package - dynamic
272+
units: Watts
273+
latest: |
274+
rate(kepler_node_package_joules_total{{
275+
job="latest",
276+
mode="dynamic",
277+
}}[{rate_interval}]
278+
)
279+
280+
dev: |
281+
rate(kepler_node_package_joules_total{{
282+
job="dev",
283+
mode="dynamic",
284+
}}[{rate_interval}]
285+
)
286+
max_mae: 2.01
287+
288+
- name: node core - dynamic
289+
units: Watts
290+
latest: |
291+
rate(kepler_node_core_joules_total{{
292+
job="latest",
293+
mode="dynamic",
294+
}}[{rate_interval}]
295+
)
296+
297+
dev: |
298+
rate(kepler_node_core_joules_total{{
299+
job="dev",
300+
mode="dynamic",
301+
}}[{rate_interval}]
302+
)
303+
max_mae: 2.01
304+
305+
- name: node dram - dynamic
306+
units: Watts
307+
latest: |
308+
rate(kepler_node_dram_joules_total{{
309+
job="latest",
310+
mode="dynamic",
311+
}}[{rate_interval}]
312+
)
313+
314+
dev: |
315+
rate(kepler_node_dram_joules_total{{
316+
job="dev",
317+
mode="dynamic",
318+
}}[{rate_interval}]
319+
)
320+
max_mae: 2.01
321+
322+
- name: node other - dynamic
323+
units: Watts
324+
latest: |
325+
rate(kepler_node_other_joules_total{{
326+
job="latest",
327+
mode="dynamic",
328+
}}[{rate_interval}]
329+
)
330+
331+
dev: |
332+
rate(kepler_node_other_joules_total{{
333+
job="dev",
334+
mode="dynamic",
335+
}}[{rate_interval}]
336+
)
337+
max_mae: 2.01
338+
339+
- name: node uncore - dynamic
340+
units: Watts
341+
latest: |
342+
rate(kepler_node_uncore_joules_total{{
343+
job="latest",
344+
mode="dynamic",
345+
}}[{rate_interval}]
346+
)
347+
348+
dev: |
349+
rate(kepler_node_uncore_joules_total{{
350+
job="dev",
351+
mode="dynamic",
352+
}}[{rate_interval}]
353+
)
354+
max_mae: 2.01

0 commit comments

Comments
 (0)