Skip to content

Commit f331454

Browse files
authored
Merge pull request #247 from coroot/psi_metrics
add per-container Pressure Stall Information (PSI) metrics
2 parents d7c8349 + 3a7cbbd commit f331454

File tree

8 files changed

+138
-3
lines changed

8 files changed

+138
-3
lines changed

cgroup/cpu.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ type CPUStat struct {
1616
LimitCores float64
1717
}
1818

19-
func (cg Cgroup) CpuStat() *CPUStat {
19+
func (cg *Cgroup) CpuStat() *CPUStat {
2020
cpu, cpuacct := cg.subsystems["cpu"], cg.subsystems["cpuacct"]
2121
if cpu == "" || cpuacct == "" {
2222
st, _ := cg.cpuStatV2()
@@ -26,7 +26,7 @@ func (cg Cgroup) CpuStat() *CPUStat {
2626
return st
2727
}
2828

29-
func (cg Cgroup) cpuStatV1() (*CPUStat, error) {
29+
func (cg *Cgroup) cpuStatV1() (*CPUStat, error) {
3030
if cg.subsystems["cpu"] == "" || cg.subsystems["cpuacct"] == "" {
3131
return nil, nil
3232
}
@@ -56,7 +56,7 @@ func (cg Cgroup) cpuStatV1() (*CPUStat, error) {
5656
return res, nil
5757
}
5858

59-
func (cg Cgroup) cpuStatV2() (*CPUStat, error) {
59+
func (cg *Cgroup) cpuStatV2() (*CPUStat, error) {
6060
if cg.subsystems[""] == "" {
6161
return nil, nil
6262
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
some avg10=0.00 avg60=0.00 avg300=0.00 total=465907442
2+
full avg10=0.00 avg60=0.00 avg300=0.00 total=463529433
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
some avg10=0.00 avg60=0.00 avg300=0.05 total=17657662684
2+
full avg10=0.00 avg60=0.00 avg300=0.05 total=17636951020
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
some avg10=0.00 avg60=0.00 avg300=0.00 total=6937313991
2+
full avg10=0.00 avg60=0.00 avg300=0.00 total=6934649214

cgroup/psi.go

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
package cgroup
2+
3+
import (
4+
"os"
5+
"path"
6+
"strconv"
7+
"strings"
8+
9+
"github.com/coroot/coroot-node-agent/common"
10+
"k8s.io/klog/v2"
11+
)
12+
13+
// PSIStats holds the cumulative Pressure Stall Information totals of a
// cgroup, in seconds, for the cpu, memory and io controllers.
//
// For each resource the "Some" field carries the total taken from the "some"
// line of the corresponding <controller>.pressure file and the "Full" field
// carries the total taken from the "full" line.
type PSIStats struct {
	// CPU stall totals (cpu.pressure).
	CPUSecondsSome, CPUSecondsFull float64
	// Memory stall totals (memory.pressure).
	MemorySecondsSome, MemorySecondsFull float64
	// I/O stall totals (io.pressure).
	IOSecondsSome, IOSecondsFull float64
}
21+
22+
// PressureTotals is the parsed content of a single <controller>.pressure
// file: the "total=" counters of its "some" and "full" lines, converted from
// microseconds to seconds.
type PressureTotals struct {
	SomeSecondsTotal, FullSecondsTotal float64
}
26+
27+
func (cg *Cgroup) PSI() *PSIStats {
28+
if cg.subsystems[""] == "" {
29+
return nil
30+
}
31+
stats := &PSIStats{}
32+
for _, controller := range []string{"cpu", "memory", "io"} {
33+
p, err := cg.readPressure(controller)
34+
if err != nil {
35+
if !common.IsNotExist(err) {
36+
klog.Warningln(err)
37+
}
38+
return nil
39+
}
40+
switch controller {
41+
case "cpu":
42+
stats.CPUSecondsSome = p.SomeSecondsTotal
43+
stats.CPUSecondsFull = p.FullSecondsTotal
44+
case "memory":
45+
stats.MemorySecondsSome = p.SomeSecondsTotal
46+
stats.MemorySecondsFull = p.FullSecondsTotal
47+
case "io":
48+
stats.IOSecondsSome = p.SomeSecondsTotal
49+
stats.IOSecondsFull = p.FullSecondsTotal
50+
}
51+
}
52+
return stats
53+
}
54+
55+
func (cg *Cgroup) readPressure(controller string) (*PressureTotals, error) {
56+
data, err := os.ReadFile(path.Join(cg2Root, cg.subsystems[""], controller+".pressure"))
57+
if err != nil {
58+
return nil, err
59+
}
60+
pressure := &PressureTotals{}
61+
for _, line := range strings.Split(strings.TrimSpace(string(data)), "\n") {
62+
parts := strings.Fields(line)
63+
if len(parts) == 0 {
64+
continue
65+
}
66+
kind := parts[0]
67+
for _, p := range parts[1:] {
68+
if strings.HasPrefix(p, "total=") {
69+
vStr := strings.TrimPrefix(p, "total=")
70+
v, err := strconv.ParseUint(vStr, 10, 64)
71+
if err != nil {
72+
return nil, err
73+
}
74+
switch kind {
75+
case "some":
76+
pressure.SomeSecondsTotal = float64(v) / 1e6 // microseconds to seconds
77+
case "full":
78+
pressure.FullSecondsTotal = float64(v) / 1e6
79+
}
80+
break
81+
}
82+
}
83+
}
84+
return pressure, nil
85+
}

cgroup/psi_test.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
package cgroup
2+
3+
import (
4+
"path"
5+
"testing"
6+
7+
"github.com/stretchr/testify/assert"
8+
"github.com/stretchr/testify/require"
9+
)
10+
11+
func TestCgroupPSI(t *testing.T) {
12+
cgRoot = "fixtures/cgroup"
13+
cg2Root = "fixtures/cgroup"
14+
15+
cg, _ := NewFromProcessCgroupFile(path.Join("fixtures/proc/400/cgroup"))
16+
stat := cg.PSI()
17+
require.NotNil(t, stat)
18+
assert.Equal(t, float64(465907442)/1e6, stat.CPUSecondsSome)
19+
assert.Equal(t, float64(463529433)/1e6, stat.CPUSecondsFull)
20+
assert.Equal(t, float64(6937313991)/1e6, stat.MemorySecondsSome)
21+
assert.Equal(t, float64(6934649214)/1e6, stat.MemorySecondsFull)
22+
assert.Equal(t, float64(17657662684)/1e6, stat.IOSecondsSome)
23+
assert.Equal(t, float64(17636951020)/1e6, stat.IOSecondsFull)
24+
25+
cg, _ = NewFromProcessCgroupFile(path.Join("fixtures/proc/100/cgroup"))
26+
assert.Nil(t, cg.PSI())
27+
}

containers/container.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,15 @@ func (c *Container) Collect(ch chan<- prometheus.Metric) {
265265
}
266266
}
267267

268+
if psi := c.cgroup.PSI(); psi != nil {
269+
ch <- counter(metrics.PsiCPU, psi.CPUSecondsSome, "some")
270+
ch <- counter(metrics.PsiCPU, psi.CPUSecondsFull, "full")
271+
ch <- counter(metrics.PsiMemory, psi.MemorySecondsSome, "some")
272+
ch <- counter(metrics.PsiMemory, psi.MemorySecondsFull, "full")
273+
ch <- counter(metrics.PsiIO, psi.IOSecondsSome, "some")
274+
ch <- counter(metrics.PsiIO, psi.IOSecondsFull, "full")
275+
}
276+
268277
if c.oomKills > 0 {
269278
ch <- counter(metrics.OOMKills, float64(c.oomKills))
270279
}

containers/metrics.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,10 @@ var metrics = struct {
1919
MemoryCache *prometheus.Desc
2020
OOMKills *prometheus.Desc
2121

22+
PsiCPU *prometheus.Desc
23+
PsiMemory *prometheus.Desc
24+
PsiIO *prometheus.Desc
25+
2226
DiskDelay *prometheus.Desc
2327
DiskSize *prometheus.Desc
2428
DiskUsed *prometheus.Desc
@@ -71,6 +75,10 @@ var metrics = struct {
7175
MemoryCache: metric("container_resources_memory_cache_bytes", "Amount of page cache memory allocated by the container"),
7276
OOMKills: metric("container_oom_kills_total", "Total number of times the container was terminated by the OOM killer"),
7377

78+
PsiCPU: metric("container_resources_cpu_pressure_waiting_seconds_total", "Total time in seconds tha the container were delayed due to CPU pressure", "kind"),
79+
PsiMemory: metric("container_resources_memory_pressure_waiting_seconds_total", "Total time in seconds that the container were delayed due to memory pressure", "kind"),
80+
PsiIO: metric("container_resources_io_pressure_waiting_seconds_total", "Total time in seconds that the container were delayed due to I/O pressure", "kind"),
81+
7482
DiskDelay: metric("container_resources_disk_delay_seconds_total", "Total time duration processes of the container have been waiting fot I/Os to complete"),
7583
DiskSize: metric("container_resources_disk_size_bytes", "Total capacity of the volume", "mount_point", "device", "volume"),
7684
DiskUsed: metric("container_resources_disk_used_bytes", "Used capacity of the volume", "mount_point", "device", "volume"),

0 commit comments

Comments
 (0)