summaryrefslogtreecommitdiff
path: root/mm/oom_kill.c
diff options
context:
space:
mode:
authorRoman Gushchin <guro@fb.com>2018-08-21 21:53:54 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2018-08-22 10:52:45 -0700
commit3d8b38eb81cac81395f6a823f6bf401b327268e6 (patch)
treedc0f23ff3a3b8a3d13706fd35afbd522b8c4b0c6 /mm/oom_kill.c
parent5989ad7b5ede38d605c588981f634c08252abfc3 (diff)
mm, oom: introduce memory.oom.group
For some workloads an intervention from the OOM killer can be painful. Killing a random task can bring the workload into an inconsistent state. Historically, there are two common solutions for this problem: 1) enabling panic_on_oom, 2) using a userspace daemon to monitor OOMs and kill all outstanding processes. Both approaches have their downsides: rebooting on each OOM is an obvious waste of capacity, and handling all in userspace is tricky and requires a userspace agent, which will monitor all cgroups for OOMs. In most cases an in-kernel after-OOM cleaning-up mechanism can eliminate the necessity of enabling panic_on_oom. Also, it can simplify the cgroup management for userspace applications. This commit introduces a new knob for cgroup v2 memory controller: memory.oom.group. The knob determines whether the cgroup should be treated as an indivisible workload by the OOM killer. If set, all tasks belonging to the cgroup or to its descendants (if the memory cgroup is not a leaf cgroup) are killed together or not at all. To determine which cgroup has to be killed, we do traverse the cgroup hierarchy from the victim task's cgroup up to the OOMing cgroup (or root) and looking for the highest-level cgroup with memory.oom.group set. Tasks with the OOM protection (oom_score_adj set to -1000) are treated as an exception and are never killed. This patch doesn't change the OOM victim selection algorithm. Link: http://lkml.kernel.org/r/20180802003201.817-4-guro@fb.com Signed-off-by: Roman Gushchin <guro@fb.com> Acked-by: Michal Hocko <mhocko@suse.com> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Cc: David Rientjes <rientjes@google.com> Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> Cc: Tejun Heo <tj@kernel.org> Cc: Vladimir Davydov <vdavydov.dev@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/oom_kill.c')
-rw-r--r--mm/oom_kill.c30
1 files changed, 30 insertions, 0 deletions
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 330416c67ce5..0e10b864e074 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -908,6 +908,19 @@ static void __oom_kill_process(struct task_struct *victim)
}
#undef K
+/*
+ * Kill provided task unless it's secured by setting
+ * oom_score_adj to OOM_SCORE_ADJ_MIN.
+ */
+static int oom_kill_memcg_member(struct task_struct *task, void *unused)
+{
+ if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
+ get_task_struct(task);
+ __oom_kill_process(task);
+ }
+ return 0;
+}
+
static void oom_kill_process(struct oom_control *oc, const char *message)
{
struct task_struct *p = oc->chosen;
@@ -915,6 +928,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
struct task_struct *victim = p;
struct task_struct *child;
struct task_struct *t;
+ struct mem_cgroup *oom_group;
unsigned int victim_points = 0;
static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
@@ -968,7 +982,23 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
}
read_unlock(&tasklist_lock);
+ /*
+ * Do we need to kill the entire memory cgroup?
+ * Or even one of the ancestor memory cgroups?
+ * Check this out before killing the victim task.
+ */
+ oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);
+
__oom_kill_process(victim);
+
+ /*
+ * If necessary, kill all tasks in the selected memory cgroup.
+ */
+ if (oom_group) {
+ mem_cgroup_print_oom_group(oom_group);
+ mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, NULL);
+ mem_cgroup_put(oom_group);
+ }
}
/*