diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5ff7d268c814..5f8481523d08 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -710,6 +710,10 @@ struct task_struct {
 	struct uclamp_se		uclamp[UCLAMP_CNT];
 #endif
 
+#ifdef CONFIG_HOTPLUG_CPU
+	struct list_head		percpu_kthread_node;
+#endif
+
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	/* List of struct preempt_notifier: */
 	struct hlist_head		preempt_notifiers;
diff --git a/include/linux/sched/hotplug.h b/include/linux/sched/hotplug.h
index 2aa088d220e0..89fb9b0278b0 100644
--- a/include/linux/sched/hotplug.h
+++ b/include/linux/sched/hotplug.h
@@ -11,6 +11,8 @@ extern int sched_cpu_activate(unsigned int cpu);
 extern int sched_cpus_activate(struct cpumask *cpus);
 extern int sched_cpu_deactivate(unsigned int cpu);
 extern int sched_cpus_deactivate_nosync(struct cpumask *cpus);
+extern int sched_cpu_drain_rq(unsigned int cpu);
+extern void sched_cpu_drain_rq_wait(unsigned int cpu);
 
 #ifdef CONFIG_HOTPLUG_CPU
 extern int sched_cpu_dying(unsigned int cpu);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f77bb1e07fe2..2bcc13dad8ea 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1095,6 +1095,34 @@ EXPORT_SYMBOL_GPL(remove_cpu);
 
 extern bool dl_cpu_busy(unsigned int cpu);
 
+int __pause_drain_rq(struct cpumask *cpus)
+{
+	unsigned int cpu;
+	int err = 0;
+
+	/*
+	 * Disabling preemption ensures that a stopper started from
+	 * sched_cpu_drain_rq() cannot stall the drain of the whole cpumask.
+	 */
+	preempt_disable();
+	for_each_cpu(cpu, cpus) {
+		err = sched_cpu_drain_rq(cpu);
+		if (err)
+			break;
+	}
+	preempt_enable();
+
+	return err;
+}
+
+void __wait_drain_rq(struct cpumask *cpus)
+{
+	unsigned int cpu;
+
+	for_each_cpu(cpu, cpus)
+		sched_cpu_drain_rq_wait(cpu);
+}
+
 int pause_cpus(struct cpumask *cpus)
 {
 	int err = 0;
@@ -1125,8 +1153,45 @@ int pause_cpus(struct cpumask *cpus)
 	if (cpumask_empty(cpus))
 		goto err_cpu_maps_update;
 
+	/*
+	 * Lazy migration:
+	 *
+	 * We do care about how fast a CPU can go idle and stay in this
+	 * state. If we tried to take cpus_write_lock() here, we would have
+	 * to wait for a few dozen ms, as this function might schedule.
+	 * However, we can, as a first step, flip the active mask and migrate
+	 * anything currently on the run-queue, to give the paused CPUs a
+	 * chance to reach an idle state quickly. Meanwhile, there is a risk
+	 * that another CPU observes an out-of-date active_mask or
+	 * incompletely updates a cpuset. Both problems are resolved later in
+	 * the slow path, which ensures active_mask synchronization, triggers
+	 * a cpuset rebuild and migrates any task that escaped the lazy
+	 * migration.
+	 */
+	for_each_cpu(cpu, cpus)
+		set_cpu_active(cpu, false);
+	err = __pause_drain_rq(cpus);
+	if (err) {
+		__wait_drain_rq(cpus);
+		for_each_cpu(cpu, cpus)
+			set_cpu_active(cpu, true);
+		goto err_cpu_maps_update;
+	}
+
+	/*
+	 * Slow path deactivation:
+	 *
+	 * Now that the paused CPUs are most likely idle, we can go through a
+	 * complete scheduler deactivation.
+	 *
+	 * Since cpu_active_mask has already been updated and
+	 * cpus_write_lock() calls synchronize_rcu(), we know that all
+	 * preempt-disabled and RCU users will observe the updated value.
+	 */
 	cpus_write_lock();
+	__wait_drain_rq(cpus);
+
 	cpuhp_tasks_frozen = 0;
 
 	if (sched_cpus_deactivate_nosync(cpus)) {
@@ -1134,6 +1199,14 @@ int pause_cpus(struct cpumask *cpus)
 		goto err_cpus_write_unlock;
 	}
 
+	err = __pause_drain_rq(cpus);
+	__wait_drain_rq(cpus);
+	if (err) {
+		for_each_cpu(cpu, cpus)
+			sched_cpu_activate(cpu);
+		goto err_cpus_write_unlock;
+	}
+
 	/*
 	 * Even if living on the side of the regular HP path, pause is using
 	 * one of the HP step (CPUHP_AP_ACTIVE). This should be reflected on the
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 30f344e687be..fc592fdcb7f9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6783,11 +6783,14 @@ static struct task_struct *__pick_migrate_task(struct rq *rq)
  * Called with rq->lock held even though we'er in stop_machine() and
  * there's no concurrency possible, we hold the required locks anyway
  * because of lock validation efforts.
+ *
+ * force: if false, the function will skip CPU-pinned kthreads.
  */
-static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
+static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf, bool force)
 {
 	struct rq *rq = dead_rq;
-	struct task_struct *next, *stop = rq->stop;
+	struct task_struct *next, *tmp, *stop = rq->stop;
+	LIST_HEAD(percpu_kthreads);
 	struct rq_flags orf = *rf;
 	int dest_cpu;
 
@@ -6819,6 +6822,18 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
 
 		next = __pick_migrate_task(rq);
 
+		/*
+		 * Argh ... there is no iterator for tasks, so we need to
+		 * remove the kthread from the run-queue to continue.
+		 */
+		if (!force && is_per_cpu_kthread(next)) {
+			INIT_LIST_HEAD(&next->percpu_kthread_node);
+			list_add(&next->percpu_kthread_node, &percpu_kthreads);
+			deactivate_task(rq, next,
+					DEQUEUE_NOCLOCK | DEQUEUE_SAVE);
+			continue;
+		}
+
 		/*
 		 * Rules for changing task_struct::cpus_mask are holding
 		 * both pi_lock and rq->lock, such that holding either
@@ -6837,7 +6852,14 @@
 		 * changed the task, WARN if weird stuff happened, because in
 		 * that case the above rq->lock drop is a fail too.
 		 */
-		if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
+		if (task_rq(next) != rq || !task_on_rq_queued(next)) {
+			/*
+			 * In the !force case, there is a hole between
+			 * rq_unlock() and rq_relock(), where another CPU might
+			 * not observe an up-to-date cpu_active_mask and try to
+			 * move tasks around.
+			 */
+			WARN_ON(force);
 			raw_spin_unlock(&next->pi_lock);
 			continue;
 		}
@@ -6854,6 +6876,12 @@
 		raw_spin_unlock(&next->pi_lock);
 	}
 
+	list_for_each_entry_safe(next, tmp, &percpu_kthreads,
+				 percpu_kthread_node) {
+		activate_task(rq, next, ENQUEUE_NOCLOCK | ENQUEUE_RESTORE);
+		list_del(&next->percpu_kthread_node);
+	}
+
 	rq->stop = stop;
 }
 #endif /* CONFIG_HOTPLUG_CPU */
@@ -6936,6 +6964,40 @@ static int cpuset_cpu_inactive(unsigned int cpu)
 	return 0;
 }
 
+static int drain_rq_cpu_stop(void *data)
+{
+	struct rq *rq = this_rq();
+	struct rq_flags rf;
+
+	rq_lock_irqsave(rq, &rf);
+	migrate_tasks(rq, &rf, false);
+	rq_unlock_irqrestore(rq, &rf);
+
+	return 0;
+}
+
+int sched_cpu_drain_rq(unsigned int cpu)
+{
+	struct cpu_stop_work *rq_drain = &(cpu_rq(cpu)->drain);
+	struct cpu_stop_done *rq_drain_done = &(cpu_rq(cpu)->drain_done);
+
+	if (idle_cpu(cpu)) {
+		rq_drain->done = NULL;
+		return 0;
+	}
+
+	return stop_one_cpu_async(cpu, drain_rq_cpu_stop, NULL, rq_drain,
+				  rq_drain_done);
+}
+
+void sched_cpu_drain_rq_wait(unsigned int cpu)
+{
+	struct cpu_stop_work *rq_drain = &(cpu_rq(cpu)->drain);
+
+	if (rq_drain->done)
+		cpu_stop_work_wait(rq_drain);
+}
+
 int sched_cpu_activate(unsigned int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -7087,7 +7149,7 @@ int sched_cpu_dying(unsigned int cpu)
 		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 		set_rq_offline(rq);
 	}
-	migrate_tasks(rq, &rf);
+	migrate_tasks(rq, &rf, true);
 	BUG_ON(rq->nr_running != 1);
 	rq_unlock_irqrestore(rq, &rf);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3a93dfa72d1d..5f14530e6344 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1051,6 +1051,11 @@ struct rq {
 	unsigned int		ttwu_local;
 #endif
 
+#ifdef CONFIG_HOTPLUG_CPU
+	struct cpu_stop_work	drain;
+	struct cpu_stop_done	drain_done;
+#endif
+
 #ifdef CONFIG_CPU_IDLE
 	/* Must be inspected within a rcu lock section */
 	struct cpuidle_state	*idle_state;
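
Note on the split drain API above: __pause_drain_rq() fires one stopper per
CPU without waiting on any of them (preemption stays disabled so a started
stopper cannot stall the loop), and __wait_drain_rq() then collects them.
Firing everything before waiting bounds the pause latency by the slowest
drain rather than by the sum of all drains. Below is a minimal userspace
sketch of this fire-all-then-wait pattern, with pthreads standing in for the
stopper machinery; all names in it are illustrative, not kernel API.

#include <pthread.h>
#include <stdio.h>

#define NR_CPUS	4

static void *drain_one(void *arg)
{
	/* Stand-in for drain_rq_cpu_stop() running on the target CPU. */
	printf("draining cpu %ld\n", (long)arg);
	return NULL;
}

int main(void)
{
	pthread_t work[NR_CPUS];
	long cpu;

	/* Phase 1: fire every drain, blocking on none (cf. __pause_drain_rq). */
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		pthread_create(&work[cpu], NULL, drain_one, (void *)cpu);

	/* Phase 2: wait for all of them to finish (cf. __wait_drain_rq). */
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		pthread_join(work[cpu], NULL);

	return 0;
}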
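Similarly, the requeue loop at the end of migrate_tasks() iterates with
list_for_each_entry_safe(), which is why the patch adds the extra *tmp
cursor: list_del() unlinks the current entry mid-walk, so the next pointer
has to be saved before the entry is removed. A self-contained sketch of that
safe-iteration idiom, using a minimal hand-rolled intrusive list in place of
the kernel's list.h (all types and names here are hypothetical):

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

/* Minimal intrusive list, standing in for the kernel's struct list_head. */
struct list_node {
	struct list_node *next;
};

struct kthread {
	long pid;
	struct list_node node;	/* mirrors task_struct::percpu_kthread_node */
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
	struct list_node head = { NULL };
	struct list_node *n, *tmp;
	long pid;

	/* Park three entries, as the !force path parks per-CPU kthreads. */
	for (pid = 1; pid <= 3; pid++) {
		struct kthread *k = malloc(sizeof(*k));

		k->pid = pid;
		k->node.next = head.next;	/* cf. list_add(): push front */
		head.next = &k->node;
	}

	/*
	 * Safe iteration: save the next pointer *before* unlinking the
	 * current entry. Without the extra cursor, n->next would be read
	 * from an entry that is no longer on the list.
	 */
	for (n = head.next; n; n = tmp) {
		struct kthread *k = container_of(n, struct kthread, node);

		tmp = n->next;
		printf("requeue kthread %ld\n", k->pid);
		head.next = tmp;	/* cf. list_del() */
		free(k);
	}

	return 0;
}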