diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 0023052eab23..c22fe2e96d03 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -14,13 +14,21 @@ enum psi_task_count { NR_IOWAIT, NR_MEMSTALL, NR_RUNNING, - NR_PSI_TASK_COUNTS = 3, + /* + * This can't have values other than 0 or 1 and could be + * implemented as a bit flag. But for now we still have room + * in the first cacheline of psi_group_cpu, and this way we + * don't have to special case any state tracking for it. + */ + NR_ONCPU, + NR_PSI_TASK_COUNTS = 4, }; /* Task state bitmasks */ #define TSK_IOWAIT (1 << NR_IOWAIT) #define TSK_MEMSTALL (1 << NR_MEMSTALL) #define TSK_RUNNING (1 << NR_RUNNING) +#define TSK_ONCPU (1 << NR_ONCPU) /* Resources that workloads could be stalled on */ enum psi_res { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c9d38881a9cd..9dd4718949da 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3641,7 +3641,7 @@ static void __sched notrace __schedule(bool preempt) rq->curr = next; ++*switch_count; - set_task_last_switch_out(prev, wallclock); + psi_sched_switch(prev, next, !task_on_rq_queued(prev)); trace_sched_switch(preempt, prev, next); rq = context_switch(rq, prev, next); /* unlocks the rq */ diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index c1f34eb45733..54e81a280c01 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -225,7 +225,7 @@ static bool test_state(unsigned int *tasks, enum psi_states state) case PSI_MEM_FULL: return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]; case PSI_CPU_SOME: - return tasks[NR_RUNNING] > 1; + return tasks[NR_RUNNING] > tasks[NR_ONCPU]; case PSI_NONIDLE: return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || tasks[NR_RUNNING]; @@ -695,10 +695,10 @@ static u32 psi_group_change(struct psi_group *group, int cpu, if (!(m & (1 << t))) continue; if (groupc->tasks[t] == 0 && !psi_bug) { - printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u] clear=%x set=%x\n", + printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n", cpu, t, groupc->tasks[0], groupc->tasks[1], groupc->tasks[2], - clear, set); + groupc->tasks[3], clear, set); psi_bug = 1; } groupc->tasks[t]--; @@ -916,9 +916,11 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) rq = task_rq_lock(task, &rf); - if (task_on_rq_queued(task)) + if (task_on_rq_queued(task)) { task_flags = TSK_RUNNING; - else if (task->in_iowait) + if (task_current(rq, task)) + task_flags |= TSK_ONCPU; + } else if (task->in_iowait) task_flags = TSK_IOWAIT; if (task->flags & PF_MEMSTALL) diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 6693dfecb929..1c21baecc667 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -85,6 +85,14 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep) if (p->flags & PF_MEMSTALL) clear |= TSK_MEMSTALL; } else { + /* + * When a task sleeps, schedule() dequeues it before + * switching to the next one. Merge the clearing of + * TSK_RUNNING and TSK_ONCPU to save an unnecessary + * psi_task_change() call in psi_sched_switch(). + */ + clear |= TSK_ONCPU; + if (p->in_iowait) set |= TSK_IOWAIT; } @@ -118,6 +126,23 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) } } +static inline void psi_sched_switch(struct task_struct *prev, + struct task_struct *next, + bool sleep) +{ + if (static_branch_likely(&psi_disabled)) + return; + + /* + * Clear the TSK_ONCPU state if the task was preempted. If + * it's a voluntary sleep, dequeue will have taken care of it. + */ + if (!sleep) + psi_task_change(prev, TSK_ONCPU, 0); + + psi_task_change(next, 0, TSK_ONCPU); +} + static inline void psi_task_tick(struct rq *rq) { if (static_branch_likely(&psi_disabled)) @@ -130,6 +155,9 @@ static inline void psi_task_tick(struct rq *rq) static inline void psi_enqueue(struct task_struct *p, bool wakeup) {} static inline void psi_dequeue(struct task_struct *p, bool sleep) {} static inline void psi_ttwu_dequeue(struct task_struct *p) {} +static inline void psi_sched_switch(struct task_struct *prev, + struct task_struct *next, + bool sleep) {} static inline void psi_task_tick(struct rq *rq) {} #endif /* CONFIG_PSI */