ANDROID: sched: check on_rq in freezer_should_skip()
In aosp/1979327 we attempted to prevent tasks with pending signals and
PF_FREEZER_SKIP from being immediately rescheduled, because such tasks
would crash the kernel if run while no capable CPUs were online. This
was implemented by declining to immediately reschedule them unless
various conditions were met.

However, this ended up causing signals to go undelivered if the signal
was received while a task was processing a syscall, such as futex(2),
that blocks with PF_FREEZER_SKIP set, because the kernel relies on a
check for TIF_SIGPENDING after setting the task state to
TASK_INTERRUPTIBLE in order to deliver such a signal.

This patch is an alternative solution to the original problem that
avoids introducing the signal delivery bug. It works by changing how
freezer_should_skip() is implemented: instead of checking only
PF_FREEZER_SKIP, we also use the on_rq field to check whether the task
is off the runqueue. This ensures that a task that will be immediately
rescheduled does not return true from freezer_should_skip(), so the
task blocks the freezer unless it has actually been taken off the
runqueue.

Signed-off-by: Peter Collingbourne <pcc@google.com>
Bug: 202918514
Bug: 251700836
Change-Id: I3f9b705ce9ad2ca1d2df959f43cf05bef78560f8
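[Editor's note] For reference, the TIF_SIGPENDING check described above is the one __schedule() performs on behalf of the standard interruptible-sleep idiom. A minimal sketch of that idiom follows; it is illustrative only, the wait condition is hypothetical, and it is not the actual futex(2) wait path.

	/*
	 * Minimal sketch of the interruptible-sleep idiom (illustrative only;
	 * "condition_we_wait_for" is a hypothetical wait condition, not the
	 * real futex(2) code).
	 */
	set_current_state(TASK_INTERRUPTIBLE);
	if (!condition_we_wait_for) {
		/*
		 * If a signal arrived after the state write above, __schedule()
		 * notices it via signal_pending_state() and leaves the task
		 * runnable instead of dequeuing it; aosp/1979327 suppressed that
		 * for PF_FREEZER_SKIP tasks, so the signal was never acted on.
		 */
		schedule();
	}
	__set_current_state(TASK_RUNNING);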
@@ -8,6 +8,9 @@
 #include <linux/sched.h>
 #include <linux/wait.h>
 #include <linux/atomic.h>
+#if defined(CONFIG_ARM64) && !defined(__GENKSYMS__)
+#include <linux/mmu_context.h>
+#endif
 
 #ifdef CONFIG_FREEZER
 extern atomic_t system_freezing_cnt; /* nr of freezing conds in effect */

@@ -108,10 +111,15 @@ static inline bool cgroup_freezing(struct task_struct *task)
  * The caller shouldn't do anything which isn't allowed for a frozen task
  * until freezer_cont() is called. Usually, freezer[_do_not]_count() pair
  * wrap a scheduling operation and nothing much else.
+ *
+ * The write to current->flags uses release semantics to prevent a concurrent
+ * freezer_should_skip() from observing this write before a write to on_rq
+ * during a prior call to activate_task(), which may cause it to return true
+ * before deactivate_task() is called.
  */
 static inline void freezer_do_not_count(void)
 {
-	current->flags |= PF_FREEZER_SKIP;
+	smp_store_release(&current->flags, current->flags | PF_FREEZER_SKIP);
 }
 
 /**

@@ -161,7 +169,19 @@ static inline bool freezer_should_skip(struct task_struct *p)
 	 * clearing %PF_FREEZER_SKIP.
 	 */
 	smp_mb();
+#ifdef CONFIG_ARM64
+	return (p->flags & PF_FREEZER_SKIP) &&
+	       (!p->on_rq || task_cpu_possible_mask(p) == cpu_possible_mask);
+#else
+	/*
+	 * On non-aarch64, avoid depending on task_cpu_possible_mask(), which is
+	 * defined in <linux/mmu_context.h>, because including that header from
+	 * here exposes a tricky bug in the tracepoint headers on x86, and that
+	 * macro would end up being defined equal to cpu_possible_mask on other
+	 * architectures anyway.
+	 */
 	return p->flags & PF_FREEZER_SKIP;
+#endif
 }
 
 /*

@@ -83,7 +83,7 @@ static void __init handle_initrd(void)
 	 * In case that a resume from disk is carried out by linuxrc or one of
 	 * its children, we need to tell the freezer not to wait for us.
 	 */
-	current->flags |= PF_FREEZER_SKIP;
+	freezer_do_not_count();
 
 	info = call_usermodehelper_setup("/linuxrc", argv, envp_init,
 					 GFP_KERNEL, init_linuxrc, NULL, NULL);

@@ -23,7 +23,7 @@
 
 void lock_system_sleep(void)
 {
-	current->flags |= PF_FREEZER_SKIP;
+	freezer_do_not_count();
 	mutex_lock(&system_transition_mutex);
 }
 EXPORT_SYMBOL_GPL(lock_system_sleep);

@@ -6282,23 +6282,6 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 
 #endif /* CONFIG_SCHED_CORE */
 
-static bool __task_can_run(struct task_struct *prev)
-{
-	if (__fatal_signal_pending(prev))
-		return true;
-
-	if (!frozen_or_skipped(prev))
-		return true;
-
-	/*
-	 * We can't safely go back on the runqueue if we're an asymmetric
-	 * task skipping the freezer. Doing so can lead to migration failures
-	 * later on if there aren't any suitable CPUs left around for us to
-	 * move to.
-	 */
-	return task_cpu_possible_mask(prev) == cpu_possible_mask;
-}
-
 /*
  * Constants for the sched_mode argument of __schedule().
  *

@@ -6410,7 +6393,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 	 */
 	prev_state = READ_ONCE(prev->__state);
 	if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
-		if (signal_pending_state(prev_state, prev) && __task_can_run(prev)) {
+		if (signal_pending_state(prev_state, prev)) {
 			WRITE_ONCE(prev->__state, TASK_RUNNING);
 		} else {
 			prev->sched_contributes_to_load =
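[Editor's note] For context, freezer_do_not_count() and freezer_count() are meant to bracket exactly one blocking call, along the lines of the upstream freezable_schedule() helper sketched below (the exact helper in this tree may differ slightly). The new on_rq check distinguishes a task that really blocked inside that window from one that is about to run again.

	/*
	 * Sketch modeled on the upstream freezable_schedule() helper; the
	 * exact code in this tree may differ.
	 */
	static inline void freezable_schedule(void)
	{
		freezer_do_not_count();	/* set PF_FREEZER_SKIP (now via smp_store_release()) */
		schedule();		/* p->on_rq becomes 0 only if the task is truly dequeued */
		freezer_count();	/* clear PF_FREEZER_SKIP, then try_to_freeze() */
	}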