BACKPORT: psi: Fix cpu.pressure for cpu.max and competing cgroups

For simplicity, cpu pressure is defined as having more than one
runnable task on a given CPU. This works on the system level, but it
has limitations in a cgrouped reality: When cpu.max is in use, it
doesn't capture the time in which a task is not executing on the CPU
due to throttling. Likewise, it doesn't capture the time in which a
competing cgroup is occupying the CPU - meaning it only reflects
cgroup-internal competitive pressure, not outside pressure.

Enable tracking of currently executing tasks, and then change the
definition of cpu pressure in a cgroup from

	NR_RUNNING > 1

to

	NR_RUNNING > ON_CPU

which will capture the effects of cpu.max as well as competition from
outside the cgroup.

After this patch, a cgroup running `stress -c 1` with a cpu.max
setting of 5000 10000 shows ~50% continuous CPU pressure.
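
The ~50% figure is simply the quota share: one always-runnable task
executes for quota/period of each period and is runnable but not
on-CPU for the rest. The stand-alone user-space sketch below works
through that arithmetic; it is illustrative only and not part of the
patch, and expected_cpu_some() is a made-up helper name.

	#include <stdio.h>

	/* Share of wall time a single always-runnable task spends runnable
	 * but not executing under a cpu.max quota. */
	static double expected_cpu_some(double quota_us, double period_us)
	{
		double oncpu_share = quota_us / period_us;

		if (oncpu_share >= 1.0)
			return 0.0;		/* never throttled */
		return 1.0 - oncpu_share;	/* runnable, waiting for quota */
	}

	int main(void)
	{
		/* cpu.max "5000 10000" from the test above. */
		printf("expected cpu.some share: %.0f%%\n",
		       expected_cpu_some(5000, 10000) * 100);
		return 0;
	}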

Signed-off-by: Johannes Weiner <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Aarqw12 <[email protected]>
Signed-off-by: prorooter007 <[email protected]>
Signed-off-by: Marco Zanin <[email protected]>
hnaz authored and reocat committed Jan 18, 2024
1 parent 9432ea4 commit b83c657
Showing 4 changed files with 45 additions and 7 deletions.

include/linux/psi_types.h (9 additions, 1 deletion)
@@ -14,13 +14,21 @@ enum psi_task_count {
 	NR_IOWAIT,
 	NR_MEMSTALL,
 	NR_RUNNING,
-	NR_PSI_TASK_COUNTS = 3,
+	/*
+	 * This can't have values other than 0 or 1 and could be
+	 * implemented as a bit flag. But for now we still have room
+	 * in the first cacheline of psi_group_cpu, and this way we
+	 * don't have to special case any state tracking for it.
+	 */
+	NR_ONCPU,
+	NR_PSI_TASK_COUNTS = 4,
 };
 
 /* Task state bitmasks */
 #define TSK_IOWAIT	(1 << NR_IOWAIT)
 #define TSK_MEMSTALL	(1 << NR_MEMSTALL)
 #define TSK_RUNNING	(1 << NR_RUNNING)
+#define TSK_ONCPU	(1 << NR_ONCPU)
 
 /* Resources that workloads could be stalled on */
 enum psi_res {
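
The new slot and flag feed the per-CPU tasks[] counters that the
pressure test reads. A minimal sketch of that bookkeeping, loosely
mirroring psi_group_change(); toy_group_change() and TOY_NR_COUNTS
are illustrative stand-ins, not kernel symbols.

	#define TOY_NR_COUNTS	4	/* mirrors NR_PSI_TASK_COUNTS above */

	/* Apply a task's clear/set bitmasks to the per-CPU counters. */
	static void toy_group_change(unsigned int tasks[TOY_NR_COUNTS],
				     unsigned int clear, unsigned int set)
	{
		int t;

		for (t = 0; t < TOY_NR_COUNTS; t++) {
			if (clear & (1 << t))
				tasks[t]--;	/* e.g. TSK_ONCPU cleared on preemption */
			if (set & (1 << t))
				tasks[t]++;	/* e.g. TSK_ONCPU set on switch-in */
		}
	}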

kernel/sched/core.c (1 addition, 1 deletion)
@@ -3641,7 +3641,7 @@ static void __sched notrace __schedule(bool preempt)
 		rq->curr = next;
 		++*switch_count;
 
-		set_task_last_switch_out(prev, wallclock);
+		psi_sched_switch(prev, next, !task_on_rq_queued(prev));
 
 		trace_sched_switch(preempt, prev, next);
 		rq = context_switch(rq, prev, next);	/* unlocks the rq */

kernel/sched/psi.c (7 additions, 5 deletions)
@@ -225,7 +225,7 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
 	case PSI_MEM_FULL:
 		return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
 	case PSI_CPU_SOME:
-		return tasks[NR_RUNNING] > 1;
+		return tasks[NR_RUNNING] > tasks[NR_ONCPU];
 	case PSI_NONIDLE:
 		return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
 			tasks[NR_RUNNING];
@@ -695,10 +695,10 @@ static u32 psi_group_change(struct psi_group *group, int cpu,
 		if (!(m & (1 << t)))
 			continue;
 		if (groupc->tasks[t] == 0 && !psi_bug) {
-			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n",
+			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
 					cpu, t, groupc->tasks[0],
 					groupc->tasks[1], groupc->tasks[2],
-					clear, set);
+					groupc->tasks[3], clear, set);
 			psi_bug = 1;
 		}
 		groupc->tasks[t]--;
@@ -916,9 +916,11 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
 
 	rq = task_rq_lock(task, &rf);
 
-	if (task_on_rq_queued(task))
+	if (task_on_rq_queued(task)) {
 		task_flags = TSK_RUNNING;
-	else if (task->in_iowait)
+		if (task_current(rq, task))
+			task_flags |= TSK_ONCPU;
+	} else if (task->in_iowait)
 		task_flags = TSK_IOWAIT;
 
 	if (task->flags & PF_MEMSTALL)
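
With an on-CPU counter available, the PSI_CPU_SOME test literally asks
whether some runnable task is not executing. Below is a stand-alone
sketch with example counter states; toy_cpu_some() and the scenarios
are illustrative, not taken from the patch.

	#include <stdbool.h>

	/*
	 * NR_RUNNING = 1, NR_ONCPU = 1  ->  false: the lone task is executing
	 * NR_RUNNING = 1, NR_ONCPU = 0  ->  true:  throttled by cpu.max, or the
	 *                                          CPU is held by another cgroup
	 * NR_RUNNING = 2, NR_ONCPU = 1  ->  true:  internal competition, same
	 *                                          as the old "> 1" rule
	 */
	static bool toy_cpu_some(unsigned int nr_running, unsigned int nr_oncpu)
	{
		return nr_running > nr_oncpu;
	}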

kernel/sched/stats.h (28 additions, 0 deletions)
@@ -85,6 +85,14 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
 		if (p->flags & PF_MEMSTALL)
 			clear |= TSK_MEMSTALL;
 	} else {
+		/*
+		 * When a task sleeps, schedule() dequeues it before
+		 * switching to the next one. Merge the clearing of
+		 * TSK_RUNNING and TSK_ONCPU to save an unnecessary
+		 * psi_task_change() call in psi_sched_switch().
+		 */
+		clear |= TSK_ONCPU;
+
 		if (p->in_iowait)
 			set |= TSK_IOWAIT;
 	}
@@ -118,6 +126,23 @@ static inline void psi_ttwu_dequeue(struct task_struct *p)
 	}
 }
 
+static inline void psi_sched_switch(struct task_struct *prev,
+				    struct task_struct *next,
+				    bool sleep)
+{
+	if (static_branch_likely(&psi_disabled))
+		return;
+
+	/*
+	 * Clear the TSK_ONCPU state if the task was preempted. If
+	 * it's a voluntary sleep, dequeue will have taken care of it.
+	 */
+	if (!sleep)
+		psi_task_change(prev, TSK_ONCPU, 0);
+
+	psi_task_change(next, 0, TSK_ONCPU);
+}
+
 static inline void psi_task_tick(struct rq *rq)
 {
 	if (static_branch_likely(&psi_disabled))
@@ -130,6 +155,9 @@ static inline void psi_task_tick(struct rq *rq)
 static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
 static inline void psi_dequeue(struct task_struct *p, bool sleep) {}
 static inline void psi_ttwu_dequeue(struct task_struct *p) {}
+static inline void psi_sched_switch(struct task_struct *prev,
+				    struct task_struct *next,
+				    bool sleep) {}
 static inline void psi_task_tick(struct rq *rq) {}
 #endif /* CONFIG_PSI */
 
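
Together, psi_dequeue() and psi_sched_switch() cover the two ways a
task stops running: a voluntary sleep clears TSK_RUNNING and TSK_ONCPU
in one dequeue call, while a preemption leaves the task runnable and
only hands TSK_ONCPU to the successor. The toy program below walks
both paths; it tracks per-task flag words instead of the real
per-cgroup counters, and every toy_* name is made up.

	#include <assert.h>
	#include <stdbool.h>

	#define TSK_RUNNING	(1 << 2)	/* same bit positions as the patch */
	#define TSK_ONCPU	(1 << 3)

	struct toy_task { unsigned int psi_flags; };

	/* Voluntary sleep: like the psi_dequeue() hunk, drop TSK_ONCPU together
	 * with TSK_RUNNING so the switch path below has nothing left to clear. */
	static void toy_dequeue(struct toy_task *p, bool sleep)
	{
		unsigned int clear = TSK_RUNNING;

		if (sleep)
			clear |= TSK_ONCPU;
		p->psi_flags &= ~clear;
	}

	/* Context switch: like psi_sched_switch(), only a preempted prev still
	 * owns TSK_ONCPU here; the incoming task always gains it. */
	static void toy_sched_switch(struct toy_task *prev, struct toy_task *next,
				     bool sleep)
	{
		if (!sleep)
			prev->psi_flags &= ~TSK_ONCPU;
		next->psi_flags |= TSK_ONCPU;
	}

	int main(void)
	{
		struct toy_task a = { TSK_RUNNING | TSK_ONCPU };
		struct toy_task b = { TSK_RUNNING };

		/* Preemption: a stays runnable but no longer executes, so it
		 * now counts toward CPU pressure (RUNNING without ONCPU). */
		toy_sched_switch(&a, &b, false);
		assert(a.psi_flags == TSK_RUNNING);
		assert(b.psi_flags == (TSK_RUNNING | TSK_ONCPU));

		/* Voluntary sleep: b is dequeued first, then the switch back
		 * to a only has to mark a as on-CPU again. */
		toy_dequeue(&b, true);
		toy_sched_switch(&b, &a, true);
		assert(b.psi_flags == 0);
		assert(a.psi_flags == (TSK_RUNNING | TSK_ONCPU));

		return 0;
	}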
