Skip to content

Commit d7c8349

Browse files
authored
Merge pull request #246 from coroot/python_ebpf_map
read python stats directly from the ebpf map
2 parents aa81b30 + 611166e commit d7c8349

File tree

8 files changed

+111
-70
lines changed

8 files changed

+111
-70
lines changed

containers/container.go

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -134,9 +134,9 @@ type Container struct {
134134

135135
gpuStats map[string]*GpuUsage
136136

137-
oomKills int
138-
pythonThreadLockWaitTime time.Duration
139-
nodejsStats *ebpftracer.NodejsStats
137+
oomKills int
138+
nodejsStats *ebpftracer.NodejsStats
139+
pythonStats *ebpftracer.PythonStats
140140

141141
mounts map[string]proc.MountInfo
142142
seenMounts map[uint64]struct{}
@@ -399,8 +399,8 @@ func (c *Container) Collect(ch chan<- prometheus.Metric) {
399399
for appType := range appTypes {
400400
ch <- gauge(metrics.ApplicationType, 1, appType)
401401
}
402-
if c.pythonThreadLockWaitTime > 0 {
403-
ch <- counter(metrics.PythonThreadLockWaitTime, c.pythonThreadLockWaitTime.Seconds())
402+
if c.pythonStats != nil {
403+
ch <- counter(metrics.PythonThreadLockWaitTime, c.pythonStats.ThreadLockWaitTime.Seconds())
404404
}
405405
if c.nodejsStats != nil {
406406
ch <- counter(metrics.NodejsEventLoopBlockedTime, c.nodejsStats.EventLoopBlockedTime.Seconds())
@@ -850,6 +850,23 @@ func (c *Container) updateNodejsStats(s NodejsStatsUpdate) {
850850
p.nodejsPrevStats = &s.Stats
851851
}
852852

853+
func (c *Container) updatePythonStats(s PythonStatsUpdate) {
854+
c.lock.Lock()
855+
defer c.lock.Unlock()
856+
857+
p := c.processes[s.Pid]
858+
if p == nil || p.pythonPrevStats == nil {
859+
return
860+
}
861+
if delta := s.Stats.ThreadLockWaitTime - p.pythonPrevStats.ThreadLockWaitTime; delta > 0 {
862+
if c.pythonStats == nil {
863+
c.pythonStats = &ebpftracer.PythonStats{}
864+
}
865+
c.pythonStats.ThreadLockWaitTime += delta
866+
}
867+
p.pythonPrevStats = &s.Stats
868+
}
869+
853870
func (c *Container) getMounts() map[string]map[string]*proc.FSStat {
854871
if len(c.mounts) == 0 {
855872
return nil

containers/process.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ type Process struct {
4444
pythonGilChecked bool
4545
nodejsChecked bool
4646
nodejsPrevStats *ebpftracer.NodejsStats
47+
pythonPrevStats *ebpftracer.PythonStats
4748

4849
gpuUsageSamples []gpu.ProcessUsageSample
4950
}
@@ -113,6 +114,7 @@ func (p *Process) instrumentPython(cmdline []byte, tracer *ebpftracer.Tracer) {
113114
if !pythonCmd.Match(cmd) {
114115
return
115116
}
117+
p.pythonPrevStats = &ebpftracer.PythonStats{}
116118
p.uprobes = append(p.uprobes, tracer.AttachPythonThreadLockProbes(p.Pid)...)
117119
}
118120

containers/registry.go

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ type Registry struct {
6262
ebpfStatsLock sync.Mutex
6363
trafficStatsUpdateCh chan *TrafficStatsUpdate
6464
nodejsStatsUpdateCh chan *NodejsStatsUpdate
65+
pythonStatsUpdateCh chan *PythonStatsUpdate
6566

6667
gpuProcessUsageSampleChan chan gpu.ProcessUsageSample
6768
}
@@ -119,6 +120,7 @@ func NewRegistry(reg prometheus.Registerer, processInfoCh chan<- ProcessInfo, gp
119120

120121
trafficStatsUpdateCh: make(chan *TrafficStatsUpdate),
121122
nodejsStatsUpdateCh: make(chan *NodejsStatsUpdate),
123+
pythonStatsUpdateCh: make(chan *PythonStatsUpdate),
122124

123125
gpuProcessUsageSampleChan: gpuProcessUsageSampleChan,
124126
}
@@ -220,6 +222,13 @@ func (r *Registry) handleEvents(ch <-chan ebpftracer.Event) {
220222
if c := r.containersByPid[u.Pid]; c != nil {
221223
c.updateNodejsStats(*u)
222224
}
225+
case u := <-r.pythonStatsUpdateCh:
226+
if u == nil {
227+
continue
228+
}
229+
if c := r.containersByPid[u.Pid]; c != nil {
230+
c.updatePythonStats(*u)
231+
}
223232
case sample := <-r.gpuProcessUsageSampleChan:
224233
if c := r.containersByPid[sample.Pid]; c != nil {
225234
if p := c.processes[sample.Pid]; p != nil {
@@ -306,10 +315,6 @@ func (r *Registry) handleEvents(ch <-chan ebpftracer.Event) {
306315
}
307316
r.ip2fqdnLock.Unlock()
308317
}
309-
case ebpftracer.EventTypePythonThreadLock:
310-
if c := r.containersByPid[e.Pid]; c != nil {
311-
c.pythonThreadLockWaitTime += e.Duration
312-
}
313318
}
314319
}
315320
}
@@ -410,6 +415,7 @@ func (r *Registry) updateStatsFromEbpfMapsIfNecessary() {
410415

411416
r.updateTrafficStats()
412417
r.updateNodejsStats()
418+
r.updatePythonStats()
413419

414420
r.ebpfStatsLastUpdated = time.Now()
415421
}
@@ -447,6 +453,21 @@ func (r *Registry) updateNodejsStats() {
447453
r.nodejsStatsUpdateCh <- nil
448454
}
449455

456+
func (r *Registry) updatePythonStats() {
457+
iter := r.tracer.PythonStatsIterator()
458+
var pid uint64
459+
stats := ebpftracer.PythonStats{}
460+
461+
for iter.Next(&pid, &stats) {
462+
r.pythonStatsUpdateCh <- &PythonStatsUpdate{Pid: uint32(pid), Stats: stats}
463+
}
464+
465+
if err := iter.Err(); err != nil {
466+
klog.Warningln(err)
467+
}
468+
r.pythonStatsUpdateCh <- nil
469+
}
470+
450471
func (r *Registry) getDomain(ip netaddr.IP) *common.Domain {
451472
r.ip2fqdnLock.RLock()
452473
defer r.ip2fqdnLock.RUnlock()
@@ -563,3 +584,8 @@ type NodejsStatsUpdate struct {
563584
Pid uint32
564585
Stats ebpftracer.NodejsStats
565586
}
587+
588+
type PythonStatsUpdate struct {
589+
Pid uint32
590+
Stats ebpftracer.PythonStats
591+
}

ebpftracer/ebpf.go

Lines changed: 10 additions & 10 deletions
Large diffs are not rendered by default.

ebpftracer/ebpf/ebpf.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ struct trace_event_raw_sys_exit__stub {
4040
};
4141

4242
#include "nodejs.c"
43+
#include "python.c"
4344
#include "proc.c"
4445
#include "file.c"
4546
#include "tcp/conntrack.c"
@@ -48,6 +49,5 @@ struct trace_event_raw_sys_exit__stub {
4849
#include "l7/l7.c"
4950
#include "l7/gotls.c"
5051
#include "l7/openssl.c"
51-
#include "python.c"
5252

5353
char _license[] SEC("license") = "GPL";

ebpftracer/ebpf/proc.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ int sched_process_exit(struct trace_event_raw_sched_process_template__stub *args
6262
return 0;
6363
}
6464

65+
bpf_map_delete_elem(&python_stats, &pid);
6566
bpf_map_delete_elem(&nodejs_stats, &pid);
6667
bpf_map_delete_elem(&nodejs_prev_event_loop_iter, &pid);
6768
bpf_map_delete_elem(&nodejs_current_io_cb, &pid);

ebpftracer/ebpf/python.c

Lines changed: 23 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,14 @@
1+
struct python_proc_stats {
2+
__u64 thread_lock_wait_time;
3+
};
4+
15
struct {
2-
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
3-
__uint(key_size, sizeof(int));
4-
__uint(value_size, sizeof(int));
5-
} python_thread_events SEC(".maps");
6+
__uint(type, BPF_MAP_TYPE_HASH);
7+
__uint(key_size, sizeof(__u64));
8+
__uint(value_size, sizeof(struct python_proc_stats));
9+
__uint(max_entries, 10240);
10+
} python_stats SEC(".maps");
11+
612

713
struct {
814
__uint(type, BPF_MAP_TYPE_HASH);
@@ -19,24 +25,25 @@ int pthread_cond_timedwait_enter(struct pt_regs *ctx) {
1925
return 0;
2026
}
2127

22-
struct python_thread_event {
23-
__u32 type;
24-
__u32 pid;
25-
__u64 duration;
26-
};
27-
2828
SEC("uprobe/pthread_cond_timedwait_exit")
2929
int pthread_cond_timedwait_exit(struct pt_regs *ctx) {
3030
__u64 pid_tgid = bpf_get_current_pid_tgid();
3131
__u64 *timestamp = bpf_map_lookup_elem(&python_thread_locks, &pid_tgid);
3232
if (!timestamp) {
3333
return 0;
3434
}
35-
struct python_thread_event e = {
36-
.type = EVENT_TYPE_PYTHON_THREAD_LOCK,
37-
.pid = pid_tgid >> 32,
38-
.duration = bpf_ktime_get_ns()-*timestamp,
39-
};
40-
bpf_perf_event_output(ctx, &python_thread_events, BPF_F_CURRENT_CPU, &e, sizeof(e));
35+
__u64 duration = bpf_ktime_get_ns() - *timestamp;
36+
bpf_map_delete_elem(&python_thread_locks, &pid_tgid);
37+
__u64 pid = pid_tgid >> 32;
38+
struct python_proc_stats *stats = bpf_map_lookup_elem(&python_stats, &pid);
39+
if (!stats) {
40+
struct python_proc_stats s = {};
41+
bpf_map_update_elem(&python_stats, &pid, &s, BPF_ANY);
42+
stats = bpf_map_lookup_elem(&python_stats, &pid);
43+
if (!stats) {
44+
return 0;
45+
}
46+
}
47+
__sync_fetch_and_add(&stats->thread_lock_wait_time, duration);
4148
return 0;
4249
}

ebpftracer/tracer.go

Lines changed: 22 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -33,17 +33,16 @@ type EventType uint32
3333
type EventReason uint32
3434

3535
const (
36-
EventTypeProcessStart EventType = 1
37-
EventTypeProcessExit EventType = 2
38-
EventTypeConnectionOpen EventType = 3
39-
EventTypeConnectionClose EventType = 4
40-
EventTypeConnectionError EventType = 5
41-
EventTypeListenOpen EventType = 6
42-
EventTypeListenClose EventType = 7
43-
EventTypeFileOpen EventType = 8
44-
EventTypeTCPRetransmit EventType = 9
45-
EventTypeL7Request EventType = 10
46-
EventTypePythonThreadLock EventType = 11
36+
EventTypeProcessStart EventType = 1
37+
EventTypeProcessExit EventType = 2
38+
EventTypeConnectionOpen EventType = 3
39+
EventTypeConnectionClose EventType = 4
40+
EventTypeConnectionError EventType = 5
41+
EventTypeListenOpen EventType = 6
42+
EventTypeListenClose EventType = 7
43+
EventTypeFileOpen EventType = 8
44+
EventTypeTCPRetransmit EventType = 9
45+
EventTypeL7Request EventType = 10
4746

4847
EventReasonNone EventReason = 0
4948
EventReasonOOMKill EventReason = 1
@@ -73,11 +72,10 @@ type Event struct {
7372
type perfMapType uint8
7473

7574
const (
76-
perfMapTypeProcEvents perfMapType = 1
77-
perfMapTypeTCPEvents perfMapType = 2
78-
perfMapTypeFileEvents perfMapType = 3
79-
perfMapTypeL7Events perfMapType = 4
80-
perfMapTypePythonThreadEvents perfMapType = 5
75+
perfMapTypeProcEvents perfMapType = 1
76+
perfMapTypeTCPEvents perfMapType = 2
77+
perfMapTypeFileEvents perfMapType = 3
78+
perfMapTypeL7Events perfMapType = 4
8179
)
8280

8381
type Tracer struct {
@@ -139,10 +137,18 @@ func (t *Tracer) NodejsStatsIterator() *ebpf.MapIterator {
139137
return t.collection.Maps["nodejs_stats"].Iterate()
140138
}
141139

140+
func (t *Tracer) PythonStatsIterator() *ebpf.MapIterator {
141+
return t.collection.Maps["python_stats"].Iterate()
142+
}
143+
142144
type NodejsStats struct {
143145
EventLoopBlockedTime time.Duration
144146
}
145147

148+
type PythonStats struct {
149+
ThreadLockWaitTime time.Duration
150+
}
151+
146152
type ConnectionId struct {
147153
FD uint64
148154
PID uint32
@@ -230,7 +236,6 @@ func (t *Tracer) ebpf(ch chan<- Event) error {
230236
{name: "tcp_connect_events", typ: perfMapTypeTCPEvents, perCPUBufferSizePages: 8, readTimeout: 10 * time.Millisecond},
231237
{name: "tcp_retransmit_events", typ: perfMapTypeTCPEvents, perCPUBufferSizePages: 4},
232238
{name: "file_events", typ: perfMapTypeFileEvents, perCPUBufferSizePages: 4},
233-
{name: "python_thread_events", typ: perfMapTypePythonThreadEvents, perCPUBufferSizePages: 4},
234239
}
235240

236241
if !t.disableL7Tracing {
@@ -365,12 +370,6 @@ type l7Event struct {
365370
PayloadSize uint64
366371
}
367372

368-
type pythonThreadEvent struct {
369-
Type EventType
370-
Pid uint32
371-
Duration uint64
372-
}
373-
374373
func runEventsReader(name string, r *perf.Reader, ch chan<- Event, typ perfMapType, readTimeout time.Duration) {
375374
if readTimeout == 0 {
376375
readTimeout = 100 * time.Millisecond
@@ -450,17 +449,6 @@ func runEventsReader(name string, r *perf.Reader, ch chan<- Event, typ perfMapTy
450449
BytesReceived: v.BytesReceived,
451450
}
452451
}
453-
case perfMapTypePythonThreadEvents:
454-
v := &pythonThreadEvent{}
455-
if err := binary.Read(bytes.NewBuffer(rec.RawSample), binary.LittleEndian, v); err != nil {
456-
klog.Warningln("failed to read msg:", err)
457-
continue
458-
}
459-
event = Event{
460-
Type: v.Type,
461-
Pid: v.Pid,
462-
Duration: time.Duration(v.Duration),
463-
}
464452
default:
465453
continue
466454
}

0 commit comments

Comments
 (0)