author:    Steven Rostedt (VMware) <rostedt@goodmis.org>  2018-04-26 16:36:39 +0800
committer: Tao Huang <huangtao@rock-chips.com>            2018-04-26 19:23:47 +0800
commit:    102d3ecfe1d47d093514eacaec35309db62ca22d (patch)
tree:      f936516d9527e121ff4a84cd606a699ec46afb6f
parent:    4b2e018dc102895d8b3b78b91c646f9b70269e5e (diff)
UPSTREAM: sched/core: Call __schedule() from do_idle() without enabling preemption
I finally got around to creating trampolines for dynamically allocated
ftrace_ops using synchronize_rcu_tasks(). For users of the ftrace function
hook callbacks, like perf, that allocate the ftrace_ops descriptor via
kmalloc() and friends, ftrace was not able to optimize the functions being
traced to use a trampoline because they would also need to be allocated
dynamically. The problem is that they cannot be freed when CONFIG_PREEMPT
is set, as there's no way to tell if a task was preempted on the
trampoline. That was before Paul McKenney implemented
synchronize_rcu_tasks() that would make sure all tasks (except idle) have
scheduled out or have entered user space.

While testing this, I triggered this bug:

 BUG: unable to handle kernel paging request at ffffffffa0230077
 ...
 RIP: 0010:0xffffffffa0230077
 ...
 Call Trace:
  schedule+0x5/0xe0
  schedule_preempt_disabled+0x18/0x30
  do_idle+0x172/0x220

What happened was that the idle task was preempted on the trampoline. As
synchronize_rcu_tasks() ignores the idle thread, there's nothing that lets
ftrace know that the idle task was preempted on a trampoline.

The idle task shouldn't need to ever enable preemption. The idle task is
simply a loop that calls schedule() or places the CPU into idle mode. In
fact, having preemption enabled is inefficient, because it can happen when
idle is just about to call schedule anyway, which would cause schedule to
be called twice: once when the interrupt came in and was returning back to
normal context, and then again in the normal path that the idle loop is
running in, which would be pointless, as it had already scheduled.

The only reason schedule_preempt_disabled() enables preemption is to be
able to call sched_submit_work(), which requires preemption enabled. As
this is a nop when the task is in the RUNNING state, and idle is always in
the running state, there's no reason that idle needs to enable preemption.
But that means it cannot use schedule_preempt_disabled(), as other callers
of that function require calling sched_submit_work().

Adding a new function local to kernel/sched/ that allows idle to call the
scheduler without enabling preemption fixes the synchronize_rcu_tasks()
issue, as well as removes the pointless spurious schedule calls caused by
interrupts happening in the brief window where preemption is enabled just
before it calls schedule.

Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170414084809.3dacde2a@gandalf.local.home
Signed-off-by: Ingo Molnar <mingo@kernel.org>

-----Shawn: trace on 4.4 for RK3308-------------------------
[ 151.389904] BUG: scheduling while atomic: swapper/0/0/0x00000000
[ 151.390478] Modules linked in:
[ 151.390813] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.4.126 #1327
[ 151.390830] Hardware name: Rockchip RK3308 evb digital-i2s mic board (DT)
[ 151.390844] Call trace:
[ 151.390868] [<ffffff800808731c>] dump_backtrace+0x0/0x1c4
[ 151.390883] [<ffffff80080874f4>] show_stack+0x14/0x1c
[ 151.390900] [<ffffff80081e4274>] dump_stack+0x94/0xbc
[ 151.390919] [<ffffff80080b4c6c>] __schedule_bug+0x3c/0x54
[ 151.390938] [<ffffff800857e978>] __schedule+0x88/0x45c
[ 151.390953] [<ffffff800857edc0>] schedule+0x74/0x94
[ 151.390971] [<ffffff800857f118>] schedule_preempt_disabled+0x20/0x38
[ 151.390987] [<ffffff80080c9d74>] cpu_startup_entry+0x44/0x204
[ 151.391007] [<ffffff800857cda0>] rest_init+0x80/0x8c
[ 151.391025] [<ffffff8008750b04>] start_kernel+0x31c/0x330
[ 151.391040] [<ffffff80087501c4>] __primary_switched+0x30/0x6c
-------------------------------------------------------

Change-Id: I12971dfe9c2039920162326aabe1df0ecaf79804
Signed-off-by: Shawn Lin <shawn.lin@rock-chips.com>
(cherry-picked from 8663effb24f9430394d3bf1ed2dac42a771421d1)
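For context, the pre-patch path described above looks roughly like this
(a paraphrased sketch of the v4.4-era kernel/sched/core.c, simplified and
not part of this diff): schedule_preempt_disabled() must briefly re-enable
preemption purely so that schedule() can run sched_submit_work() first:

	/* Sketch of the old path; see the actual tree for the full code. */
	asmlinkage __visible void __sched schedule(void)
	{
		struct task_struct *tsk = current;

		/* Nop when tsk->state == TASK_RUNNING, as it is for idle. */
		sched_submit_work(tsk);
		do {
			preempt_disable();
			__schedule(false);
			sched_preempt_enable_no_resched();
		} while (need_resched());
	}

	void __sched schedule_preempt_disabled(void)
	{
		/*
		 * Dropping the preempt count here opens the window where
		 * an interrupt can preempt idle right before it was going
		 * to schedule anyway.
		 */
		sched_preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}

The schedule_idle() added below sidesteps both issues by looping on
__schedule(false) directly, with preemption never enabled in between.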
 kernel/sched/core.c  | 25 +++++++++++++++++++++++++
 kernel/sched/idle.c  |  2 +-
 kernel/sched/sched.h |  2 ++
 3 files changed, 28 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0e6e03bcbad0..0ffaf3e814f6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3358,6 +3358,31 @@ asmlinkage __visible void __sched schedule(void)
 }
 EXPORT_SYMBOL(schedule);
 
+/*
+ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted
+ * state (have scheduled out non-voluntarily) by making sure that all
+ * tasks have either left the run queue or have gone into user space.
+ * As idle tasks do not do either, they must not ever be preempted
+ * (scheduled out non-voluntarily).
+ *
+ * schedule_idle() is similar to schedule_preempt_disabled() except that it
+ * never enables preemption because it does not call sched_submit_work().
+ */
+void __sched schedule_idle(void)
+{
+	/*
+	 * As this skips calling sched_submit_work(), which the idle task does
+	 * regardless because that function is a nop when the task is in a
+	 * TASK_RUNNING state, make sure this isn't used someplace that the
+	 * current task can be in any other state. Note, idle is always in the
+	 * TASK_RUNNING state.
+	 */
+	WARN_ON_ONCE(current->state);
+	do {
+		__schedule(false);
+	} while (need_resched());
+}
+
 #ifdef CONFIG_CONTEXT_TRACKING
 asmlinkage __visible void __sched schedule_user(void)
 {
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 917c94abf5bb..2d1f2c7efc4f 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -276,7 +276,7 @@ static void cpu_idle_loop(void)
 		smp_mb__after_atomic();
 
 		sched_ttwu_pending();
-		schedule_preempt_disabled();
+		schedule_idle();
 	}
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d9ad549591d8..bd4671aaa3d8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1397,6 +1397,8 @@ static inline int idle_get_state_idx(struct rq *rq)
 }
 #endif
 
+extern void schedule_idle(void);
+
 extern void sysrq_sched_debug_show(void);
 extern void sched_init_granularity(void);
 extern void update_max_interval(void);
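A note on the WARN_ON_ONCE(current->state) check in schedule_idle() above:
in this kernel generation, task states are plain integer flags with
TASK_RUNNING defined as 0, so any non-zero ->state means the caller might
actually need sched_submit_work() and must not take this fast path. For
reference, the relevant values (as defined in the v4.4-era
include/linux/sched.h, shown here only to explain the check):

	#define TASK_RUNNING		0
	#define TASK_INTERRUPTIBLE	1
	#define TASK_UNINTERRUPTIBLE	2
	#define __TASK_STOPPED		4
	#define __TASK_TRACED		8

Since the idle task never leaves TASK_RUNNING, the warning should never
fire at the call site patched in kernel/sched/idle.c.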