Diffstat (limited to 'kernel')
-rw-r--r--  kernel/acct.c | 2
-rw-r--r--  kernel/bpf/arraymap.c | 61
-rw-r--r--  kernel/bpf/core.c | 26
-rw-r--r--  kernel/bpf/hashtab.c | 2
-rw-r--r--  kernel/bpf/inode.c | 40
-rw-r--r--  kernel/bpf/offload.c | 15
-rw-r--r--  kernel/bpf/sockmap.c | 11
-rw-r--r--  kernel/bpf/syscall.c | 2
-rw-r--r--  kernel/bpf/verifier.c | 388
-rw-r--r--  kernel/cgroup/cgroup-v1.c | 6
-rw-r--r--  kernel/cgroup/cgroup.c | 21
-rw-r--r--  kernel/cgroup/debug.c | 4
-rw-r--r--  kernel/cgroup/stat.c | 8
-rw-r--r--  kernel/cpu.c | 26
-rw-r--r--  kernel/crash_core.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 2
-rw-r--r--  kernel/delayacct.c | 42
-rw-r--r--  kernel/events/core.c | 7
-rw-r--r--  kernel/exit.c | 9
-rw-r--r--  kernel/fork.c | 3
-rw-r--r--  kernel/futex.c | 90
-rw-r--r--  kernel/groups.c | 5
-rw-r--r--  kernel/irq/debug.h | 5
-rw-r--r--  kernel/irq/debugfs.c | 1
-rw-r--r--  kernel/irq/generic-chip.c | 11
-rw-r--r--  kernel/irq/internals.h | 2
-rw-r--r--  kernel/irq/irqdomain.c | 13
-rw-r--r--  kernel/irq/matrix.c | 24
-rw-r--r--  kernel/irq/msi.c | 64
-rw-r--r--  kernel/kallsyms.c | 8
-rw-r--r--  kernel/kcov.c | 4
-rw-r--r--  kernel/locking/lockdep.c | 651
-rw-r--r--  kernel/locking/rtmutex.c | 26
-rw-r--r--  kernel/locking/rtmutex_common.h | 1
-rw-r--r--  kernel/locking/spinlock.c | 13
-rw-r--r--  kernel/module.c | 6
-rw-r--r--  kernel/pid.c | 8
-rw-r--r--  kernel/printk/printk.c | 3
-rw-r--r--  kernel/sched/completion.c | 5
-rw-r--r--  kernel/sched/core.c | 28
-rw-r--r--  kernel/sched/cpufreq_schedutil.c | 2
-rw-r--r--  kernel/sched/fair.c | 102
-rw-r--r--  kernel/sched/membarrier.c | 2
-rw-r--r--  kernel/sched/rt.c | 8
-rw-r--r--  kernel/sched/wait.c | 2
-rw-r--r--  kernel/time/Kconfig | 1
-rw-r--r--  kernel/time/posix-timers.c | 29
-rw-r--r--  kernel/time/tick-sched.c | 32
-rw-r--r--  kernel/time/timer.c | 37
-rw-r--r--  kernel/trace/Kconfig | 3
-rw-r--r--  kernel/trace/blktrace.c | 30
-rw-r--r--  kernel/trace/bpf_trace.c | 27
-rw-r--r--  kernel/trace/ring_buffer.c | 79
-rw-r--r--  kernel/trace/trace.c | 54
-rw-r--r--  kernel/trace/trace_events.c | 16
-rw-r--r--  kernel/trace/trace_stack.c | 4
-rw-r--r--  kernel/uid16.c | 1
-rw-r--r--  kernel/workqueue.c | 46
58 files changed, 1043 insertions, 1077 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index d15c0ee4d955..addf7732fb56 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -102,7 +102,7 @@ static int check_free_space(struct bsd_acct_struct *acct)
{
struct kstatfs sbuf;
- if (time_is_before_jiffies(acct->needcheck))
+ if (time_is_after_jiffies(acct->needcheck))
goto out;
/* May block */
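The one-line fix above hinges on the wrap-safe jiffies comparison helpers: time_is_after_jiffies(t) is true while t is still in the future, which is exactly the "too early to re-check free space, skip the statfs" case this code wants. A rough userspace sketch of that comparison (illustrative only, not part of the patch; the real macros live in include/linux/jiffies.h):

#include <stdbool.h>
#include <stdio.h>

/* Wrap-safe "t is still in the future" check, in the spirit of
 * time_is_after_jiffies(); `now` stands in for the jiffies counter. */
static bool time_is_after(unsigned long t, unsigned long now)
{
	return (long)(now - t) < 0;
}

int main(void)
{
	unsigned long now = 1000;

	printf("%d\n", time_is_after(now + 50, now));	/* 1: deadline not reached, skip check */
	printf("%d\n", time_is_after(now - 50, now));	/* 0: deadline passed, do the statfs */
	return 0;
}
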
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 7c25426d3cf5..ab94d304a634 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -53,9 +53,10 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
int numa_node = bpf_map_attr_numa_node(attr);
+ u32 elem_size, index_mask, max_entries;
+ bool unpriv = !capable(CAP_SYS_ADMIN);
struct bpf_array *array;
- u64 array_size;
- u32 elem_size;
+ u64 array_size, mask64;
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
@@ -72,11 +73,32 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
elem_size = round_up(attr->value_size, 8);
+ max_entries = attr->max_entries;
+
+ /* On 32 bit archs roundup_pow_of_two() with max_entries that has
+ * upper most bit set in u32 space is undefined behavior due to
+ * resulting 1U << 32, so do it manually here in u64 space.
+ */
+ mask64 = fls_long(max_entries - 1);
+ mask64 = 1ULL << mask64;
+ mask64 -= 1;
+
+ index_mask = mask64;
+ if (unpriv) {
+ /* round up array size to nearest power of 2,
+ * since cpu will speculate within index_mask limits
+ */
+ max_entries = index_mask + 1;
+ /* Check for overflows. */
+ if (max_entries < attr->max_entries)
+ return ERR_PTR(-E2BIG);
+ }
+
array_size = sizeof(*array);
if (percpu)
- array_size += (u64) attr->max_entries * sizeof(void *);
+ array_size += (u64) max_entries * sizeof(void *);
else
- array_size += (u64) attr->max_entries * elem_size;
+ array_size += (u64) max_entries * elem_size;
/* make sure there is no u32 overflow later in round_up() */
if (array_size >= U32_MAX - PAGE_SIZE)
@@ -86,6 +108,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
array = bpf_map_area_alloc(array_size, numa_node);
if (!array)
return ERR_PTR(-ENOMEM);
+ array->index_mask = index_mask;
+ array->map.unpriv_array = unpriv;
/* copy mandatory map attributes */
array->map.map_type = attr->map_type;
@@ -121,12 +145,13 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
if (unlikely(index >= array->map.max_entries))
return NULL;
- return array->value + array->elem_size * index;
+ return array->value + array->elem_size * (index & array->index_mask);
}
/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_insn *insn = insn_buf;
u32 elem_size = round_up(map->value_size, 8);
const int ret = BPF_REG_0;
@@ -135,7 +160,12 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
- *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
+ if (map->unpriv_array) {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
+ } else {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
+ }
if (is_power_of_2(elem_size)) {
*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
@@ -157,7 +187,7 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
if (unlikely(index >= array->map.max_entries))
return NULL;
- return this_cpu_ptr(array->pptrs[index]);
+ return this_cpu_ptr(array->pptrs[index & array->index_mask]);
}
int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
@@ -177,7 +207,7 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
*/
size = round_up(map->value_size, 8);
rcu_read_lock();
- pptr = array->pptrs[index];
+ pptr = array->pptrs[index & array->index_mask];
for_each_possible_cpu(cpu) {
bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
off += size;
@@ -225,10 +255,11 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
return -EEXIST;
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
- memcpy(this_cpu_ptr(array->pptrs[index]),
+ memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
value, map->value_size);
else
- memcpy(array->value + array->elem_size * index,
+ memcpy(array->value +
+ array->elem_size * (index & array->index_mask),
value, map->value_size);
return 0;
}
@@ -262,7 +293,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
*/
size = round_up(map->value_size, 8);
rcu_read_lock();
- pptr = array->pptrs[index];
+ pptr = array->pptrs[index & array->index_mask];
for_each_possible_cpu(cpu) {
bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
off += size;
@@ -613,6 +644,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
static u32 array_of_map_gen_lookup(struct bpf_map *map,
struct bpf_insn *insn_buf)
{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 elem_size = round_up(map->value_size, 8);
struct bpf_insn *insn = insn_buf;
const int ret = BPF_REG_0;
@@ -621,7 +653,12 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map,
*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
- *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
+ if (map->unpriv_array) {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
+ } else {
+ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
+ }
if (is_power_of_2(elem_size))
*insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
else
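The arraymap changes above combine two ideas: compute a power-of-two index_mask in u64 space so that 1U << 32 is never evaluated on 32-bit architectures, and always AND the (possibly speculated) index with that mask so even a mispredicted bounds check cannot reach outside the allocation, which is rounded up to index_mask + 1 entries for unprivileged users. A small userspace sketch of both steps, with a compiler builtin standing in for fls_long() (illustrative only, not part of the patch):

#include <stdint.h>
#include <stdio.h>

/* Power-of-two mask computed in u64 space, mirroring the fls_long()
 * logic above; works even for max_entries = 0x80000000. */
static uint32_t index_mask_for(uint32_t max_entries)
{
	uint64_t n = (uint64_t)max_entries - 1;	/* max_entries >= 1 */
	unsigned int msb = n ? 64 - __builtin_clzll(n) : 0;

	return (uint32_t)((1ULL << msb) - 1);
}

/* Bounds check plus mask: the AND clamps the index even on a
 * speculatively executed out-of-bounds path. */
static const int *lookup(const int *values, uint32_t nr, uint32_t index)
{
	uint32_t mask = index_mask_for(nr);

	if (index >= nr)
		return NULL;
	return &values[index & mask];
}

int main(void)
{
	/* Allocation rounded up to index_mask + 1 slots, as the patch
	 * does for unprivileged maps; only the first 5 are "valid". */
	int values[8] = { 10, 20, 30, 40, 50 };

	printf("mask=%#x\n", (unsigned)index_mask_for(5));	/* 0x7 */
	printf("val=%d\n", *lookup(values, 5, 3));		/* 40 */
	return 0;
}
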
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b9f8686a84cf..7949e8b8f94e 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -767,6 +767,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
}
EXPORT_SYMBOL_GPL(__bpf_call_base);
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
/**
* __bpf_prog_run - run eBPF program on a given context
* @ctx: is the data we are operating on
@@ -955,7 +956,7 @@ select_insn:
DST = tmp;
CONT;
ALU_MOD_X:
- if (unlikely(SRC == 0))
+ if (unlikely((u32)SRC == 0))
return 0;
tmp = (u32) DST;
DST = do_div(tmp, (u32) SRC);
@@ -974,7 +975,7 @@ select_insn:
DST = div64_u64(DST, SRC);
CONT;
ALU_DIV_X:
- if (unlikely(SRC == 0))
+ if (unlikely((u32)SRC == 0))
return 0;
tmp = (u32) DST;
do_div(tmp, (u32) SRC);
@@ -1317,6 +1318,14 @@ EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
};
+#else
+static unsigned int __bpf_prog_ret0(const void *ctx,
+ const struct bpf_insn *insn)
+{
+ return 0;
+}
+#endif
+
bool bpf_prog_array_compatible(struct bpf_array *array,
const struct bpf_prog *fp)
{
@@ -1364,9 +1373,13 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
*/
struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
{
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
+#else
+ fp->bpf_func = __bpf_prog_ret0;
+#endif
/* eBPF JITs can rewrite the program in case constant
* blinding is active. However, in case of error during
@@ -1376,6 +1389,12 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
*/
if (!bpf_prog_is_dev_bound(fp->aux)) {
fp = bpf_int_jit_compile(fp);
+#ifdef CONFIG_BPF_JIT_ALWAYS_ON
+ if (!fp->jited) {
+ *err = -ENOTSUPP;
+ return fp;
+ }
+#endif
} else {
*err = bpf_prog_offload_compile(fp);
if (*err)
@@ -1447,7 +1466,8 @@ int bpf_prog_array_length(struct bpf_prog_array __rcu *progs)
rcu_read_lock();
prog = rcu_dereference(progs)->progs;
for (; *prog; prog++)
- cnt++;
+ if (*prog != &dummy_bpf_prog.prog)
+ cnt++;
rcu_read_unlock();
return cnt;
}
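The interpreter hunks above tighten the ALU_DIV_X/ALU_MOD_X zero check to the low 32 bits, because the 32-bit ops only divide by (u32) SRC. A minimal userspace illustration of the kind of value the old full-width check let through (illustrative only, not kernel code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t src = 0x100000000ULL;	/* nonzero as u64, zero as u32 */

	printf("src == 0        : %d\n", src == 0);		/* 0: old check passes it */
	printf("(uint32_t)src==0: %d\n", (uint32_t)src == 0);	/* 1: new check rejects it */
	return 0;
}
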
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index e469e05c8e83..3905d4bc5b80 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -114,6 +114,7 @@ static void htab_free_elems(struct bpf_htab *htab)
pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
htab->map.key_size);
free_percpu(pptr);
+ cond_resched();
}
free_elems:
bpf_map_area_free(htab->elems);
@@ -159,6 +160,7 @@ static int prealloc_init(struct bpf_htab *htab)
goto free_elems;
htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
pptr);
+ cond_resched();
}
skip_percpu_elems:
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 01aaef1a77c5..5bb5e49ef4c3 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -368,7 +368,45 @@ out:
putname(pname);
return ret;
}
-EXPORT_SYMBOL_GPL(bpf_obj_get_user);
+
+static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)
+{
+ struct bpf_prog *prog;
+ int ret = inode_permission(inode, MAY_READ | MAY_WRITE);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (inode->i_op == &bpf_map_iops)
+ return ERR_PTR(-EINVAL);
+ if (inode->i_op != &bpf_prog_iops)
+ return ERR_PTR(-EACCES);
+
+ prog = inode->i_private;
+
+ ret = security_bpf_prog(prog);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ if (!bpf_prog_get_ok(prog, &type, false))
+ return ERR_PTR(-EINVAL);
+
+ return bpf_prog_inc(prog);
+}
+
+struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type)
+{
+ struct bpf_prog *prog;
+ struct path path;
+ int ret = kern_path(name, LOOKUP_FOLLOW, &path);
+ if (ret)
+ return ERR_PTR(ret);
+ prog = __get_prog_inode(d_backing_inode(path.dentry), type);
+ if (!IS_ERR(prog))
+ touch_atime(&path);
+ path_put(&path);
+ return prog;
+}
+EXPORT_SYMBOL(bpf_prog_get_type_path);
static void bpf_evict_inode(struct inode *inode)
{
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 68ec884440b7..8455b89d1bbf 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -1,3 +1,18 @@
+/*
+ * Copyright (C) 2017 Netronome Systems, Inc.
+ *
+ * This software is licensed under the GNU General License Version 2,
+ * June 1991 as shown in the file COPYING in the top-level directory of this
+ * source tree.
+ *
+ * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS"
+ * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
+ * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE
+ * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME
+ * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+ */
+
#include <linux/bpf.h>
#include <linux/bpf_verifier.h>
#include <linux/bug.h>
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 5ee2e41893d9..1712d319c2d8 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -591,8 +591,15 @@ static void sock_map_free(struct bpf_map *map)
write_lock_bh(&sock->sk_callback_lock);
psock = smap_psock_sk(sock);
- smap_list_remove(psock, &stab->sock_map[i]);
- smap_release_sock(psock, sock);
+ /* This check handles a racing sock event that can get the
+ * sk_callback_lock before this case but after xchg happens
+ * causing the refcnt to hit zero and sock user data (psock)
+ * to be null and queued for garbage collection.
+ */
+ if (likely(psock)) {
+ smap_list_remove(psock, &stab->sock_map[i]);
+ smap_release_sock(psock, sock);
+ }
write_unlock_bh(&sock->sk_callback_lock);
}
rcu_read_unlock();
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2c4cfeaa8d5e..5cb783fc8224 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1057,7 +1057,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
}
EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
-static bool bpf_prog_get_ok(struct bpf_prog *prog,
+bool bpf_prog_get_ok(struct bpf_prog *prog,
enum bpf_prog_type *attach_type, bool attach_drv)
{
/* not an attachment, just a refcount inc, always allow */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d4593571c404..13551e623501 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -978,6 +978,13 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno);
}
+static bool is_ctx_reg(struct bpf_verifier_env *env, int regno)
+{
+ const struct bpf_reg_state *reg = cur_regs(env) + regno;
+
+ return reg->type == PTR_TO_CTX;
+}
+
static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg,
int off, int size, bool strict)
@@ -1059,6 +1066,11 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
break;
case PTR_TO_STACK:
pointer_desc = "stack ";
+ /* The stack spill tracking logic in check_stack_write()
+ * and check_stack_read() relies on stack accesses being
+ * aligned.
+ */
+ strict = true;
break;
default:
break;
@@ -1067,6 +1079,29 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
strict);
}
+/* truncate register to smaller size (in bytes)
+ * must be called with size < BPF_REG_SIZE
+ */
+static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
+{
+ u64 mask;
+
+ /* clear high bits in bit representation */
+ reg->var_off = tnum_cast(reg->var_off, size);
+
+ /* fix arithmetic bounds */
+ mask = ((u64)1 << (size * 8)) - 1;
+ if ((reg->umin_value & ~mask) == (reg->umax_value & ~mask)) {
+ reg->umin_value &= mask;
+ reg->umax_value &= mask;
+ } else {
+ reg->umin_value = 0;
+ reg->umax_value = mask;
+ }
+ reg->smin_value = reg->umin_value;
+ reg->smax_value = reg->umax_value;
+}
+
/* check whether memory at (regno + off) is accessible for t = (read | write)
* if t==write, value_regno is a register which value is stored into memory
* if t==read, value_regno is a register which will receive the value from memory
@@ -1200,9 +1235,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ &&
regs[value_regno].type == SCALAR_VALUE) {
/* b/h/w load zero-extends, mark upper bits as known 0 */
- regs[value_regno].var_off =
- tnum_cast(regs[value_regno].var_off, size);
- __update_reg_bounds(&regs[value_regno]);
+ coerce_reg_to_size(&regs[value_regno], size);
}
return err;
}
@@ -1232,6 +1265,12 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
return -EACCES;
}
+ if (is_ctx_reg(env, insn->dst_reg)) {
+ verbose(env, "BPF_XADD stores into R%d context is not allowed\n",
+ insn->dst_reg);
+ return -EACCES;
+ }
+
/* check whether atomic_add can read the memory */
err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_READ, -1);
@@ -1282,6 +1321,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off);
verbose(env, "invalid variable stack read R%d var_off=%s\n",
regno, tn_buf);
+ return -EACCES;
}
off = regs[regno].off + regs[regno].var_off.value;
if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
@@ -1674,7 +1714,13 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
return -EINVAL;
}
+ /* With LD_ABS/IND some JITs save/restore skb from r1. */
changes_data = bpf_helper_changes_pkt_data(fn->func);
+ if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) {
+ verbose(env, "kernel subsystem misconfigured func %s#%d: r1 != ctx\n",
+ func_id_name(func_id), func_id);
+ return -EINVAL;
+ }
memset(&meta, 0, sizeof(meta));
meta.pkt_access = fn->pkt_access;
@@ -1696,6 +1742,13 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta);
if (err)
return err;
+ if (func_id == BPF_FUNC_tail_call) {
+ if (meta.map_ptr == NULL) {
+ verbose(env, "verifier bug\n");
+ return -EINVAL;
+ }
+ env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr;
+ }
err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta);
if (err)
return err;
@@ -1766,14 +1819,6 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
return 0;
}
-static void coerce_reg_to_32(struct bpf_reg_state *reg)
-{
- /* clear high 32 bits */
- reg->var_off = tnum_cast(reg->var_off, 4);
- /* Update bounds */
- __update_reg_bounds(reg);
-}
-
static bool signed_add_overflows(s64 a, s64 b)
{
/* Do the add in u64, where overflow is well-defined */
@@ -1794,6 +1839,41 @@ static bool signed_sub_overflows(s64 a, s64 b)
return res > a;
}
+static bool check_reg_sane_offset(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
+ enum bpf_reg_type type)
+{
+ bool known = tnum_is_const(reg->var_off);
+ s64 val = reg->var_off.value;
+ s64 smin = reg->smin_value;
+
+ if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) {
+ verbose(env, "math between %s pointer and %lld is not allowed\n",
+ reg_type_str[type], val);
+ return false;
+ }
+
+ if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) {
+ verbose(env, "%s pointer offset %d is not allowed\n",
+ reg_type_str[type], reg->off);
+ return false;
+ }
+
+ if (smin == S64_MIN) {
+ verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n",
+ reg_type_str[type]);
+ return false;
+ }
+
+ if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) {
+ verbose(env, "value %lld makes %s pointer be out of bounds\n",
+ smin, reg_type_str[type]);
+ return false;
+ }
+
+ return true;
+}
+
/* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.
* Caller should also handle BPF_MOV case separately.
* If we return -EACCES, caller may want to try again treating pointer as a
@@ -1815,44 +1895,36 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
dst_reg = &regs[dst];
- if (WARN_ON_ONCE(known && (smin_val != smax_val))) {
- print_verifier_state(env, env->cur_state);
- verbose(env,
- "verifier internal error: known but bad sbounds\n");
- return -EINVAL;
- }
- if (WARN_ON_ONCE(known && (umin_val != umax_val))) {
- print_verifier_state(env, env->cur_state);
- verbose(env,
- "verifier internal error: known but bad ubounds\n");
- return -EINVAL;
+ if ((known && (smin_val != smax_val || umin_val != umax_val)) ||
+ smin_val > smax_val || umin_val > umax_val) {
+ /* Taint dst register if offset had invalid bounds derived from
+ * e.g. dead branches.
+ */
+ __mark_reg_unknown(dst_reg);
+ return 0;
}
if (BPF_CLASS(insn->code) != BPF_ALU64) {
/* 32-bit ALU ops on pointers produce (meaningless) scalars */
- if (!env->allow_ptr_leaks)
- verbose(env,
- "R%d 32-bit pointer arithmetic prohibited\n",
- dst);
+ verbose(env,
+ "R%d 32-bit pointer arithmetic prohibited\n",
+ dst);
return -EACCES;
}
if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
- if (!env->allow_ptr_leaks)
- verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
- dst);
+ verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
+ dst);
return -EACCES;
}
if (ptr_reg->type == CONST_PTR_TO_MAP) {
- if (!env->allow_ptr_leaks)
- verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
- dst);
+ verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
+ dst);
return -EACCES;
}
if (ptr_reg->type == PTR_TO_PACKET_END) {
- if (!env->allow_ptr_leaks)
- verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
- dst);
+ verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
+ dst);
return -EACCES;
}
@@ -1862,6 +1934,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
dst_reg->type = ptr_reg->type;
dst_reg->id = ptr_reg->id;
+ if (!check_reg_sane_offset(env, off_reg, ptr_reg->type) ||
+ !check_reg_sane_offset(env, ptr_reg, ptr_reg->type))
+ return -EINVAL;
+
switch (opcode) {
case BPF_ADD:
/* We can take a fixed offset as long as it doesn't overflow
@@ -1915,9 +1991,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
case BPF_SUB:
if (dst_reg == off_reg) {
/* scalar -= pointer. Creates an unknown scalar */
- if (!env->allow_ptr_leaks)
- verbose(env, "R%d tried to subtract pointer from scalar\n",
- dst);
+ verbose(env, "R%d tried to subtract pointer from scalar\n",
+ dst);
return -EACCES;
}
/* We don't allow subtraction from FP, because (according to
@@ -1925,9 +2000,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
* be able to deal with it.
*/
if (ptr_reg->type == PTR_TO_STACK) {
- if (!env->allow_ptr_leaks)
- verbose(env, "R%d subtraction from stack pointer prohibited\n",
- dst);
+ verbose(env, "R%d subtraction from stack pointer prohibited\n",
+ dst);
return -EACCES;
}
if (known && (ptr_reg->off - smin_val ==
@@ -1976,28 +2050,30 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
case BPF_AND:
case BPF_OR:
case BPF_XOR:
- /* bitwise ops on pointers are troublesome, prohibit for now.
- * (However, in principle we could allow some cases, e.g.
- * ptr &= ~3 which would reduce min_value by 3.)
- */
- if (!env->allow_ptr_leaks)
- verbose(env, "R%d bitwise operator %s on pointer prohibited\n",
- dst, bpf_alu_string[opcode >> 4]);
+ /* bitwise ops on pointers are troublesome, prohibit. */
+ verbose(env, "R%d bitwise operator %s on pointer prohibited\n",
+ dst, bpf_alu_string[opcode >> 4]);
return -EACCES;
default:
/* other operators (e.g. MUL,LSH) produce non-pointer results */
- if (!env->allow_ptr_leaks)
- verbose(env, "R%d pointer arithmetic with %s operator prohibited\n",
- dst, bpf_alu_string[opcode >> 4]);
+ verbose(env, "R%d pointer arithmetic with %s operator prohibited\n",
+ dst, bpf_alu_string[opcode >> 4]);
return -EACCES;
}
+ if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type))
+ return -EINVAL;
+
__update_reg_bounds(dst_reg);
__reg_deduce_bounds(dst_reg);
__reg_bound_offset(dst_reg);
return 0;
}
+/* WARNING: This function does calculations on 64-bit values, but the actual
+ * execution may occur on 32-bit values. Therefore, things like bitshifts
+ * need extra checks in the 32-bit case.
+ */
static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
struct bpf_insn *insn,
struct bpf_reg_state *dst_reg,
@@ -2008,12 +2084,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
bool src_known, dst_known;
s64 smin_val, smax_val;
u64 umin_val, umax_val;
+ u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32;
- if (BPF_CLASS(insn->code) != BPF_ALU64) {
- /* 32-bit ALU ops are (32,32)->64 */
- coerce_reg_to_32(dst_reg);
- coerce_reg_to_32(&src_reg);
- }
smin_val = src_reg.smin_value;
smax_val = src_reg.smax_value;
umin_val = src_reg.umin_value;
@@ -2021,6 +2093,21 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
src_known = tnum_is_const(src_reg.var_off);
dst_known = tnum_is_const(dst_reg->var_off);
+ if ((src_known && (smin_val != smax_val || umin_val != umax_val)) ||
+ smin_val > smax_val || umin_val > umax_val) {
+ /* Taint dst register if offset had invalid bounds derived from
+ * e.g. dead branches.
+ */
+ __mark_reg_unknown(dst_reg);
+ return 0;
+ }
+
+ if (!src_known &&
+ opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) {
+ __mark_reg_unknown(dst_reg);
+ return 0;
+ }
+
switch (opcode) {
case BPF_ADD:
if (signed_add_overflows(dst_reg->smin_value, smin_val) ||
@@ -2149,9 +2236,9 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
__update_reg_bounds(dst_reg);
break;
case BPF_LSH:
- if (umax_val > 63) {
- /* Shifts greater than 63 are undefined. This includes
- * shifts by a negative number.
+ if (umax_val >= insn_bitness) {
+ /* Shifts greater than 31 or 63 are undefined.
+ * This includes shifts by a negative number.
*/
mark_reg_unknown(env, regs, insn->dst_reg);
break;
@@ -2177,27 +2264,29 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
__update_reg_bounds(dst_reg);
break;
case BPF_RSH:
- if (umax_val > 63) {
- /* Shifts greater than 63 are undefined. This includes
- * shifts by a negative number.
+ if (umax_val >= insn_bitness) {
+ /* Shifts greater than 31 or 63 are undefined.
+ * This includes shifts by a negative number.
*/
mark_reg_unknown(env, regs, insn->dst_reg);
break;
}
- /* BPF_RSH is an unsigned shift, so make the appropriate casts */
- if (dst_reg->smin_value < 0) {
- if (umin_val) {
- /* Sign bit will be cleared */
- dst_reg->smin_value = 0;
- } else {
- /* Lost sign bit information */
- dst_reg->smin_value = S64_MIN;
- dst_reg->smax_value = S64_MAX;
- }
- } else {
- dst_reg->smin_value =
- (u64)(dst_reg->smin_value) >> umax_val;
- }
+ /* BPF_RSH is an unsigned shift. If the value in dst_reg might
+ * be negative, then either:
+ * 1) src_reg might be zero, so the sign bit of the result is
+ * unknown, so we lose our signed bounds
+ * 2) it's known negative, thus the unsigned bounds capture the
+ * signed bounds
+ * 3) the signed bounds cross zero, so they tell us nothing
+ * about the result
+ * If the value in dst_reg is known nonnegative, then again the
+ * unsigned bounds capture the signed bounds.
+ * Thus, in all cases it suffices to blow away our signed bounds
+ * and rely on inferring new ones from the unsigned bounds and
+ * var_off of the result.
+ */
+ dst_reg->smin_value = S64_MIN;
+ dst_reg->smax_value = S64_MAX;
if (src_known)
dst_reg->var_off = tnum_rshift(dst_reg->var_off,
umin_val);
@@ -2213,6 +2302,12 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
break;
}
+ if (BPF_CLASS(insn->code) != BPF_ALU64) {
+ /* 32-bit ALU ops are (32,32)->32 */
+ coerce_reg_to_size(dst_reg, 4);
+ coerce_reg_to_size(&src_reg, 4);
+ }
+
__reg_deduce_bounds(dst_reg);
__reg_bound_offset(dst_reg);
return 0;
@@ -2227,7 +2322,6 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg;
struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
u8 opcode = BPF_OP(insn->code);
- int rc;
dst_reg = &regs[insn->dst_reg];
src_reg = NULL;
@@ -2238,43 +2332,29 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
if (src_reg->type != SCALAR_VALUE) {
if (dst_reg->type != SCALAR_VALUE) {
/* Combining two pointers by any ALU op yields
- * an arbitrary scalar.
+ * an arbitrary scalar. Disallow all math except
+ * pointer subtraction
*/
- if (!env->allow_ptr_leaks) {
- verbose(env, "R%d pointer %s pointer prohibited\n",
- insn->dst_reg,
- bpf_alu_string[opcode >> 4]);
- return -EACCES;
+ if (opcode == BPF_SUB) {
+ mark_reg_unknown(env, regs, insn->dst_reg);
+ return 0;
}
- mark_reg_unknown(env, regs, insn->dst_reg);
- return 0;
+ verbose(env, "R%d pointer %s pointer prohibited\n",
+ insn->dst_reg,
+ bpf_alu_string[opcode >> 4]);
+ return -EACCES;
} else {
/* scalar += pointer
* This is legal, but we have to reverse our
* src/dest handling in computing the range
*/
- rc = adjust_ptr_min_max_vals(env, insn,
- src_reg, dst_reg);
- if (rc == -EACCES && env->allow_ptr_leaks) {
- /* scalar += unknown scalar */
- __mark_reg_unknown(&off_reg);
- return adjust_scalar_min_max_vals(
- env, insn,
- dst_reg, off_reg);
- }
- return rc;
+ return adjust_ptr_min_max_vals(env, insn,
+ src_reg, dst_reg);
}
} else if (ptr_reg) {
/* pointer += scalar */
- rc = adjust_ptr_min_max_vals(env, insn,
- dst_reg, src_reg);
- if (rc == -EACCES && env->allow_ptr_leaks) {
- /* unknown scalar += scalar */
- __mark_reg_unknown(dst_reg);
- return adjust_scalar_min_max_vals(
- env, insn, dst_reg, *src_reg);
- }
- return rc;
+ return adjust_ptr_min_max_vals(env, insn,
+ dst_reg, src_reg);
}
} else {
/* Pretend the src is a reg with a known value, since we only
@@ -2283,17 +2363,9 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
off_reg.type = SCALAR_VALUE;
__mark_reg_known(&off_reg, insn->imm);
src_reg = &off_reg;
- if (ptr_reg) { /* pointer += K */
- rc = adjust_ptr_min_max_vals(env, insn,
- ptr_reg, src_reg);
- if (rc == -EACCES && env->allow_ptr_leaks) {
- /* unknown scalar += K */
- __mark_reg_unknown(dst_reg);
- return adjust_scalar_min_max_vals(
- env, insn, dst_reg, off_reg);
- }
- return rc;
- }
+ if (ptr_reg) /* pointer += K */
+ return adjust_ptr_min_max_vals(env, insn,
+ ptr_reg, src_reg);
}
/* Got here implies adding two SCALAR_VALUEs */
@@ -2390,17 +2462,20 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EACCES;
}
mark_reg_unknown(env, regs, insn->dst_reg);
- /* high 32 bits are known zero. */
- regs[insn->dst_reg].var_off = tnum_cast(
- regs[insn->dst_reg].var_off, 4);
- __update_reg_bounds(&regs[insn->dst_reg]);
+ coerce_reg_to_size(&regs[insn->dst_reg], 4);
}
} else {
/* case: R = imm
* remember the value we stored into this reg
*/
regs[insn->dst_reg].type = SCALAR_VALUE;
- __mark_reg_known(regs + insn->dst_reg, insn->imm);
+ if (BPF_CLASS(insn->code) == BPF_ALU64) {
+ __mark_reg_known(regs + insn->dst_reg,
+ insn->imm);
+ } else {
+ __mark_reg_known(regs + insn->dst_reg,
+ (u32)insn->imm);
+ }
}
} else if (opcode > BPF_END) {
@@ -2436,6 +2511,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
+ if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) {
+ verbose(env, "BPF_ARSH not supported for 32 bit ALU\n");
+ return -EINVAL;
+ }
+
if ((opcode == BPF_LSH || opcode == BPF_RSH ||
opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) {
int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
@@ -3431,15 +3511,14 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
return range_within(rold, rcur) &&
tnum_in(rold->var_off, rcur->var_off);
} else {
- /* if we knew anything about the old value, we're not
- * equal, because we can't know anything about the
- * scalar value of the pointer in the new value.
+ /* We're trying to use a pointer in place of a scalar.
+ * Even if the scalar was unbounded, this could lead to
+ * pointer leaks because scalars are allowed to leak
+ * while pointers are not. We could make this safe in
+ * special cases if root is calling us, but it's
+ * probably not worth the hassle.
*/
- return rold->umin_value == 0 &&
- rold->umax_value == U64_MAX &&
- rold->smin_value == S64_MIN &&
- rold->smax_value == S64_MAX &&
- tnum_is_unknown(rold->var_off);
+ return false;
}
case PTR_TO_MAP_VALUE:
/* If the new min/max/var_off satisfy the old ones and
@@ -3932,6 +4011,12 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
+ if (is_ctx_reg(env, insn->dst_reg)) {
+ verbose(env, "BPF_ST stores into R%d context is not allowed\n",
+ insn->dst_reg);
+ return -EACCES;
+ }
+
/* check that memory (dst_reg + off) is writeable */
err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE,
@@ -4384,6 +4469,24 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
int i, cnt, delta = 0;
for (i = 0; i < insn_cnt; i++, insn++) {
+ if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
+ insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
+ /* due to JIT bugs clear upper 32-bits of src register
+ * before div/mod operation
+ */
+ insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg);
+ insn_buf[1] = *insn;
+ cnt = 2;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
if (insn->code != (BPF_JMP | BPF_CALL))
continue;
@@ -4407,6 +4510,35 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
*/
insn->imm = 0;
insn->code = BPF_JMP | BPF_TAIL_CALL;
+
+ /* instead of changing every JIT dealing with tail_call
+ * emit two extra insns:
+ * if (index >= max_entries) goto out;
+ * index &= array->index_mask;
+ * to avoid out-of-bounds cpu speculation
+ */
+ map_ptr = env->insn_aux_data[i + delta].map_ptr;
+ if (map_ptr == BPF_MAP_PTR_POISON) {
+ verbose(env, "tail_call abusing map_ptr\n");
+ return -EINVAL;
+ }
+ if (!map_ptr->unpriv_array)
+ continue;
+ insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
+ map_ptr->max_entries, 2);
+ insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
+ container_of(map_ptr,
+ struct bpf_array,
+ map)->index_mask);
+ insn_buf[2] = *insn;
+ cnt = 3;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
continue;
}
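Among the verifier changes above, coerce_reg_to_size() replaces the old 32-bit-only coercion and, unlike it, also fixes up the tracked numeric bounds rather than only the known-bits mask. A standalone sketch of the bounds part, using a simplified register struct (assumption: the real bpf_reg_state carries more state, notably var_off, which coerce_reg_to_size() truncates first):

#include <stdint.h>
#include <stdio.h>

struct bounds { uint64_t umin, umax; int64_t smin, smax; };

/* Truncate tracked bounds to `size` bytes (size < 8), mirroring the
 * arithmetic-bounds part of coerce_reg_to_size(). */
static void coerce_to_size(struct bounds *b, int size)
{
	uint64_t mask = ((uint64_t)1 << (size * 8)) - 1;

	if ((b->umin & ~mask) == (b->umax & ~mask)) {
		/* identical high bits: the low part stays an exact interval */
		b->umin &= mask;
		b->umax &= mask;
	} else {
		/* high bits differ: after truncation the value may be anything */
		b->umin = 0;
		b->umax = mask;
	}
	b->smin = (int64_t)b->umin;
	b->smax = (int64_t)b->umax;
}

int main(void)
{
	struct bounds b = { .umin = 0x100000005ULL, .umax = 0x100000007ULL };

	coerce_to_size(&b, 4);
	printf("umin=%llu umax=%llu\n",
	       (unsigned long long)b.umin, (unsigned long long)b.umax);	/* 5..7 */
	return 0;
}
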
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 024085daab1a..a2c05d2476ac 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -123,7 +123,11 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
*/
do {
css_task_iter_start(&from->self, 0, &it);
- task = css_task_iter_next(&it);
+
+ do {
+ task = css_task_iter_next(&it);
+ } while (task && (task->flags & PF_EXITING));
+
if (task)
get_task_struct(task);
css_task_iter_end(&it);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 0b1ffe147f24..7e4c44538119 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1397,7 +1397,7 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
cft->name);
else
- strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
+ strlcpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
return buf;
}
@@ -1864,9 +1864,9 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
root->flags = opts->flags;
if (opts->release_agent)
- strcpy(root->release_agent_path, opts->release_agent);
+ strlcpy(root->release_agent_path, opts->release_agent, PATH_MAX);
if (opts->name)
- strcpy(root->name, opts->name);
+ strlcpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
if (opts->cpuset_clone_children)
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
@@ -4125,26 +4125,24 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
static void css_task_iter_advance(struct css_task_iter *it)
{
- struct list_head *l = it->task_pos;
+ struct list_head *next;
lockdep_assert_held(&css_set_lock);
- WARN_ON_ONCE(!l);
-
repeat:
/*
* Advance iterator to find next entry. cset->tasks is consumed
* first and then ->mg_tasks. After ->mg_tasks, we move onto the
* next cset.
*/
- l = l->next;
+ next = it->task_pos->next;
- if (l == it->tasks_head)
- l = it->mg_tasks_head->next;
+ if (next == it->tasks_head)
+ next = it->mg_tasks_head->next;
- if (l == it->mg_tasks_head)
+ if (next == it->mg_tasks_head)
css_task_iter_advance_css_set(it);
else
- it->task_pos = l;
+ it->task_pos = next;
/* if PROCS, skip over tasks which aren't group leaders */
if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
@@ -4449,6 +4447,7 @@ static struct cftype cgroup_base_files[] = {
},
{
.name = "cgroup.threads",
+ .flags = CFTYPE_NS_DELEGATABLE,
.release = cgroup_procs_release,
.seq_start = cgroup_threads_start,
.seq_next = cgroup_procs_next,
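The strncpy() to strlcpy() conversions above are about guaranteed NUL termination when the source string is as long as, or longer than, the destination buffer. A userspace illustration with a hand-rolled strlcpy(), since glibc does not provide one (sketch only, not the kernel implementation):

#include <stdio.h>
#include <string.h>

static size_t my_strlcpy(char *dst, const char *src, size_t size)
{
	size_t len = strlen(src);

	if (size) {
		size_t copy = len < size ? len : size - 1;

		memcpy(dst, src, copy);
		dst[copy] = '\0';	/* always terminated */
	}
	return len;	/* length the caller tried to create */
}

int main(void)
{
	char a[4], b[4];

	strncpy(a, "abcdef", sizeof(a));	/* a is NOT NUL-terminated */
	my_strlcpy(b, "abcdef", sizeof(b));	/* b == "abc" */
	printf("%.4s / %s\n", a, b);
	return 0;
}
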
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index 5f780d8f6a9d..9caeda610249 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -50,7 +50,7 @@ static int current_css_set_read(struct seq_file *seq, void *v)
spin_lock_irq(&css_set_lock);
rcu_read_lock();
- cset = rcu_dereference(current->cgroups);
+ cset = task_css_set(current);
refcnt = refcount_read(&cset->refcount);
seq_printf(seq, "css_set %pK %d", cset, refcnt);
if (refcnt > cset->nr_tasks)
@@ -96,7 +96,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
spin_lock_irq(&css_set_lock);
rcu_read_lock();
- cset = rcu_dereference(current->cgroups);
+ cset = task_css_set(current);
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c
index 133b465691d6..1e111dd455c4 100644
--- a/kernel/cgroup/stat.c
+++ b/kernel/cgroup/stat.c
@@ -296,8 +296,12 @@ int cgroup_stat_init(struct cgroup *cgrp)
}
/* ->updated_children list is self terminated */
- for_each_possible_cpu(cpu)
- cgroup_cpu_stat(cgrp, cpu)->updated_children = cgrp;
+ for_each_possible_cpu(cpu) {
+ struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+
+ cstat->updated_children = cgrp;
+ u64_stats_init(&cstat->sync);
+ }
prev_cputime_init(&cgrp->stat.prev_cputime);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 04892a82f6ac..53f7dc65f9a3 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -80,19 +80,19 @@ static struct lockdep_map cpuhp_state_down_map =
STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map);
-static void inline cpuhp_lock_acquire(bool bringup)
+static inline void cpuhp_lock_acquire(bool bringup)
{
lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
}
-static void inline cpuhp_lock_release(bool bringup)
+static inline void cpuhp_lock_release(bool bringup)
{
lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
}
#else
-static void inline cpuhp_lock_acquire(bool bringup) { }
-static void inline cpuhp_lock_release(bool bringup) { }
+static inline void cpuhp_lock_acquire(bool bringup) { }
+static inline void cpuhp_lock_release(bool bringup) { }
#endif
@@ -780,8 +780,8 @@ static int takedown_cpu(unsigned int cpu)
BUG_ON(cpu_online(cpu));
/*
- * The CPUHP_AP_SCHED_MIGRATE_DYING callback will have removed all
- * runnable tasks from the cpu, there's only the idle task left now
+ * The teardown callback for CPUHP_AP_SCHED_STARTING will have removed
+ * all runnable tasks from the CPU, there's only the idle task left now
* that the migration thread is done doing the stop_machine thing.
*
* Wait for the stop thread to go away.
@@ -1277,9 +1277,9 @@ static struct cpuhp_step cpuhp_bp_states[] = {
* before blk_mq_queue_reinit_notify() from notify_dead(),
* otherwise a RCU stall occurs.
*/
- [CPUHP_TIMERS_DEAD] = {
+ [CPUHP_TIMERS_PREPARE] = {
.name = "timers:dead",
- .startup.single = NULL,
+ .startup.single = timers_prepare_cpu,
.teardown.single = timers_dead_cpu,
},
/* Kicks the plugged cpu into life */
@@ -1289,11 +1289,6 @@ static struct cpuhp_step cpuhp_bp_states[] = {
.teardown.single = NULL,
.cant_stop = true,
},
- [CPUHP_AP_SMPCFD_DYING] = {
- .name = "smpcfd:dying",
- .startup.single = NULL,
- .teardown.single = smpcfd_dying_cpu,
- },
/*
* Handled on control processor until the plugged processor manages
* this itself.
@@ -1335,6 +1330,11 @@ static struct cpuhp_step cpuhp_ap_states[] = {
.startup.single = NULL,
.teardown.single = rcutree_dying_cpu,
},
+ [CPUHP_AP_SMPCFD_DYING] = {
+ .name = "smpcfd:dying",
+ .startup.single = NULL,
+ .teardown.single = smpcfd_dying_cpu,
+ },
/* Entry state on starting. Interrupts enabled from here on. Transient
* state for synchronization */
[CPUHP_AP_ONLINE] = {
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index b3663896278e..4f63597c824d 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -410,7 +410,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_SYMBOL(contig_page_data);
#endif
#ifdef CONFIG_SPARSEMEM
- VMCOREINFO_SYMBOL(mem_section);
+ VMCOREINFO_SYMBOL_ARRAY(mem_section);
VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
VMCOREINFO_STRUCT_SIZE(mem_section);
VMCOREINFO_OFFSET(mem_section, section_mem_map);
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index e74be38245ad..ed5d34925ad0 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -350,7 +350,7 @@ poll_again:
}
kdb_printf("\n");
for (i = 0; i < count; i++) {
- if (kallsyms_symbol_next(p_tmp, i) < 0)
+ if (WARN_ON(!kallsyms_symbol_next(p_tmp, i)))
break;
kdb_printf("%s ", p_tmp);
*(p_tmp + len) = '\0';
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 4a1c33416b6a..e2764d767f18 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -51,16 +51,16 @@ void __delayacct_tsk_init(struct task_struct *tsk)
* Finish delay accounting for a statistic using its timestamps (@start),
* accumulator (@total) and @count
*/
-static void delayacct_end(u64 *start, u64 *total, u32 *count)
+static void delayacct_end(spinlock_t *lock, u64 *start, u64 *total, u32 *count)
{
s64 ns = ktime_get_ns() - *start;
unsigned long flags;
if (ns > 0) {
- spin_lock_irqsave(&current->delays->lock, flags);
+ spin_lock_irqsave(lock, flags);
*total += ns;
(*count)++;
- spin_unlock_irqrestore(&current->delays->lock, flags);
+ spin_unlock_irqrestore(lock, flags);
}
}
@@ -69,17 +69,25 @@ void __delayacct_blkio_start(void)
current->delays->blkio_start = ktime_get_ns();
}
-void __delayacct_blkio_end(void)
+/*
+ * We cannot rely on the `current` macro, as we haven't yet switched back to
+ * the process being woken.
+ */
+void __delayacct_blkio_end(struct task_struct *p)
{
- if (current->delays->flags & DELAYACCT_PF_SWAPIN)
- /* Swapin block I/O */
- delayacct_end(&current->delays->blkio_start,
- &current->delays->swapin_delay,
- &current->delays->swapin_count);
- else /* Other block I/O */
- delayacct_end(&current->delays->blkio_start,
- &current->delays->blkio_delay,
- &current->delays->blkio_count);
+ struct task_delay_info *delays = p->delays;
+ u64 *total;
+ u32 *count;
+
+ if (p->delays->flags & DELAYACCT_PF_SWAPIN) {
+ total = &delays->swapin_delay;
+ count = &delays->swapin_count;
+ } else {
+ total = &delays->blkio_delay;
+ count = &delays->blkio_count;
+ }
+
+ delayacct_end(&delays->lock, &delays->blkio_start, total, count);
}
int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
@@ -153,8 +161,10 @@ void __delayacct_freepages_start(void)
void __delayacct_freepages_end(void)
{
- delayacct_end(&current->delays->freepages_start,
- &current->delays->freepages_delay,
- &current->delays->freepages_count);
+ delayacct_end(
+ &current->delays->lock,
+ &current->delays->freepages_start,
+ &current->delays->freepages_delay,
+ &current->delays->freepages_count);
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 16beab4767e1..4df5b695bf0d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6639,6 +6639,7 @@ static void perf_event_namespaces_output(struct perf_event *event,
struct perf_namespaces_event *namespaces_event = data;
struct perf_output_handle handle;
struct perf_sample_data sample;
+ u16 header_size = namespaces_event->event_id.header.size;
int ret;
if (!perf_event_namespaces_match(event))
@@ -6649,7 +6650,7 @@ static void perf_event_namespaces_output(struct perf_event *event,
ret = perf_output_begin(&handle, event,
namespaces_event->event_id.header.size);
if (ret)
- return;
+ goto out;
namespaces_event->event_id.pid = perf_event_pid(event,
namespaces_event->task);
@@ -6661,6 +6662,8 @@ static void perf_event_namespaces_output(struct perf_event *event,
perf_event__output_id_sample(event, &handle, &sample);
perf_output_end(&handle);
+out:
+ namespaces_event->event_id.header.size = header_size;
}
static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
@@ -7987,11 +7990,11 @@ static void bpf_overflow_handler(struct perf_event *event,
{
struct bpf_perf_event_data_kern ctx = {
.data = data,
- .regs = regs,
.event = event,
};
int ret = 0;
+ ctx.regs = perf_arch_bpf_user_pt_regs(regs);
preempt_disable();
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
goto out;
diff --git a/kernel/exit.c b/kernel/exit.c
index 6b4298a41167..995453d9fb55 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1755,3 +1755,12 @@ Efault:
return -EFAULT;
}
#endif
+
+__weak void abort(void)
+{
+ BUG();
+
+ /* if that doesn't kill us, halt */
+ panic("Oops failed to kill thread");
+}
+EXPORT_SYMBOL(abort);
diff --git a/kernel/fork.c b/kernel/fork.c
index 432eadf6b58c..2295fc69717f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -721,8 +721,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
goto out;
}
/* a new mm has just been created */
- arch_dup_mmap(oldmm, mm);
- retval = 0;
+ retval = arch_dup_mmap(oldmm, mm);
out:
up_write(&mm->mmap_sem);
flush_tlb_mm(oldmm);
diff --git a/kernel/futex.c b/kernel/futex.c
index 76ed5921117a..8c5424dd5924 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1582,8 +1582,8 @@ static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
{
unsigned int op = (encoded_op & 0x70000000) >> 28;
unsigned int cmp = (encoded_op & 0x0f000000) >> 24;
- int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 12);
- int cmparg = sign_extend32(encoded_op & 0x00000fff, 12);
+ int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11);
+ int cmparg = sign_extend32(encoded_op & 0x00000fff, 11);
int oldval, ret;
if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
@@ -1878,6 +1878,9 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
struct futex_q *this, *next;
DEFINE_WAKE_Q(wake_q);
+ if (nr_wake < 0 || nr_requeue < 0)
+ return -EINVAL;
+
/*
* When PI not supported: return -ENOSYS if requeue_pi is true,
* consequently the compiler knows requeue_pi is always false past
@@ -2294,21 +2297,17 @@ static void unqueue_me_pi(struct futex_q *q)
spin_unlock(q->lock_ptr);
}
-/*
- * Fixup the pi_state owner with the new owner.
- *
- * Must be called with hash bucket lock held and mm->sem held for non
- * private futexes.
- */
static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
- struct task_struct *newowner)
+ struct task_struct *argowner)
{
- u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
struct futex_pi_state *pi_state = q->pi_state;
u32 uval, uninitialized_var(curval), newval;
- struct task_struct *oldowner;
+ struct task_struct *oldowner, *newowner;
+ u32 newtid;
int ret;
+ lockdep_assert_held(q->lock_ptr);
+
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
oldowner = pi_state->owner;
@@ -2317,11 +2316,17 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
newtid |= FUTEX_OWNER_DIED;
/*
- * We are here either because we stole the rtmutex from the
- * previous highest priority waiter or we are the highest priority
- * waiter but have failed to get the rtmutex the first time.
+ * We are here because either:
+ *
+ * - we stole the lock and pi_state->owner needs updating to reflect
+ * that (@argowner == current),
+ *
+ * or:
+ *
+ * - someone stole our lock and we need to fix things to point to the
+ * new owner (@argowner == NULL).
*
- * We have to replace the newowner TID in the user space variable.
+ * Either way, we have to replace the TID in the user space variable.
* This must be atomic as we have to preserve the owner died bit here.
*
* Note: We write the user space value _before_ changing the pi_state
@@ -2334,6 +2339,42 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
* in the PID check in lookup_pi_state.
*/
retry:
+ if (!argowner) {
+ if (oldowner != current) {
+ /*
+ * We raced against a concurrent self; things are
+ * already fixed up. Nothing to do.
+ */
+ ret = 0;
+ goto out_unlock;
+ }
+
+ if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
+ /* We got the lock after all, nothing to fix. */
+ ret = 0;
+ goto out_unlock;
+ }
+
+ /*
+ * Since we just failed the trylock; there must be an owner.
+ */
+ newowner = rt_mutex_owner(&pi_state->pi_mutex);
+ BUG_ON(!newowner);
+ } else {
+ WARN_ON_ONCE(argowner != current);
+ if (oldowner == current) {
+ /*
+ * We raced against a concurrent self; things are
+ * already fixed up. Nothing to do.
+ */
+ ret = 0;
+ goto out_unlock;
+ }
+ newowner = argowner;
+ }
+
+ newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
+
if (get_futex_value_locked(&uval, uaddr))
goto handle_fault;
@@ -2434,9 +2475,9 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
* Got the lock. We might not be the anticipated owner if we
* did a lock-steal - fix up the PI-state in that case:
*
- * We can safely read pi_state->owner without holding wait_lock
- * because we now own the rt_mutex, only the owner will attempt
- * to change it.
+ * Speculative pi_state->owner read (we don't hold wait_lock);
+ * since we own the lock pi_state->owner == current is the
+ * stable state, anything else needs more attention.
*/
if (q->pi_state->owner != current)
ret = fixup_pi_state_owner(uaddr, q, current);
@@ -2444,6 +2485,19 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
}
/*
+ * If we didn't get the lock; check if anybody stole it from us. In
+ * that case, we need to fix up the uval to point to them instead of
+ * us, otherwise bad things happen. [10]
+ *
+ * Another speculative read; pi_state->owner == current is unstable
+ * but needs our attention.
+ */
+ if (q->pi_state->owner == current) {
+ ret = fixup_pi_state_owner(uaddr, q, NULL);
+ goto out;
+ }
+
+ /*
* Paranoia check. If we did not take the lock, then we should not be
* the owner of the rt_mutex.
*/
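The futex_atomic_op_inuser() fix above passes 11 rather than 12 because sign_extend32() takes the bit index of the sign bit, not the width of the field; for a 12-bit operand the sign bit is bit 11, so passing 12 extended from the wrong bit and mangled negative operands. A quick sketch, with the helper reimplemented here for illustration (the kernel's version lives in include/linux/bitops.h):

#include <stdint.h>
#include <stdio.h>

static int32_t sign_extend32(uint32_t value, int index)
{
	uint8_t shift = 31 - index;

	return (int32_t)(value << shift) >> shift;
}

int main(void)
{
	uint32_t field = 0xFFF;	/* 12-bit all-ones, i.e. -1 */

	printf("index 11: %d\n", sign_extend32(field, 11));	/* -1   (correct)  */
	printf("index 12: %d\n", sign_extend32(field, 12));	/* 4095 (old bug)  */
	return 0;
}
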
diff --git a/kernel/groups.c b/kernel/groups.c
index e357bc800111..daae2f2dc6d4 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -86,11 +86,12 @@ static int gid_cmp(const void *_a, const void *_b)
return gid_gt(a, b) - gid_lt(a, b);
}
-static void groups_sort(struct group_info *group_info)
+void groups_sort(struct group_info *group_info)
{
sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid),
gid_cmp, NULL);
}
+EXPORT_SYMBOL(groups_sort);
/* a simple bsearch */
int groups_search(const struct group_info *group_info, kgid_t grp)
@@ -122,7 +123,6 @@ int groups_search(const struct group_info *group_info, kgid_t grp)
void set_groups(struct cred *new, struct group_info *group_info)
{
put_group_info(new->group_info);
- groups_sort(group_info);
get_group_info(group_info);
new->group_info = group_info;
}
@@ -206,6 +206,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
return retval;
}
+ groups_sort(group_info);
retval = set_current_groups(group_info);
put_group_info(group_info);
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
index 17f05ef8f575..e4d3819a91cc 100644
--- a/kernel/irq/debug.h
+++ b/kernel/irq/debug.h
@@ -12,6 +12,11 @@
static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
{
+ static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 5);
+
+ if (!__ratelimit(&ratelimit))
+ return;
+
printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
printk("->handle_irq(): %p, ", desc->handle_irq);
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index 7f608ac39653..acfaaef8672a 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -113,6 +113,7 @@ static const struct irq_bit_descr irqdata_states[] = {
BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING),
BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED),
BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN),
+ BIT_MASK_DESCR(IRQD_CAN_RESERVE),
BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU),
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index c26c5bb6b491..508c03dfef25 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -364,10 +364,11 @@ irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq)
EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip);
/*
- * Separate lockdep class for interrupt chip which can nest irq_desc
- * lock.
+ * Separate lockdep classes for interrupt chip which can nest irq_desc
+ * lock and request mutex.
*/
static struct lock_class_key irq_nested_lock_class;
+static struct lock_class_key irq_nested_request_class;
/*
* irq_map_generic_chip - Map a generic chip for an irq domain
@@ -409,7 +410,8 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
set_bit(idx, &gc->installed);
if (dgc->gc_flags & IRQ_GC_INIT_NESTED_LOCK)
- irq_set_lockdep_class(virq, &irq_nested_lock_class);
+ irq_set_lockdep_class(virq, &irq_nested_lock_class,
+ &irq_nested_request_class);
if (chip->irq_calc_mask)
chip->irq_calc_mask(data);
@@ -479,7 +481,8 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
continue;
if (flags & IRQ_GC_INIT_NESTED_LOCK)
- irq_set_lockdep_class(i, &irq_nested_lock_class);
+ irq_set_lockdep_class(i, &irq_nested_lock_class,
+ &irq_nested_request_class);
if (!(flags & IRQ_GC_NO_MASK)) {
struct irq_data *d = irq_get_irq_data(i);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 07d08ca701ec..ab19371eab9b 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -440,7 +440,7 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear)
#endif /* !CONFIG_GENERIC_PENDING_IRQ */
#if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY)
-static inline int irq_domain_activate_irq(struct irq_data *data, bool early)
+static inline int irq_domain_activate_irq(struct irq_data *data, bool reserve)
{
irqd_set_activated(data);
return 0;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 4f4f60015e8a..62068ad46930 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1693,7 +1693,7 @@ static void __irq_domain_deactivate_irq(struct irq_data *irq_data)
}
}
-static int __irq_domain_activate_irq(struct irq_data *irqd, bool early)
+static int __irq_domain_activate_irq(struct irq_data *irqd, bool reserve)
{
int ret = 0;
@@ -1702,9 +1702,9 @@ static int __irq_domain_activate_irq(struct irq_data *irqd, bool early)
if (irqd->parent_data)
ret = __irq_domain_activate_irq(irqd->parent_data,
- early);
+ reserve);
if (!ret && domain->ops->activate) {
- ret = domain->ops->activate(domain, irqd, early);
+ ret = domain->ops->activate(domain, irqd, reserve);
/* Rollback in case of error */
if (ret && irqd->parent_data)
__irq_domain_deactivate_irq(irqd->parent_data);
@@ -1716,17 +1716,18 @@ static int __irq_domain_activate_irq(struct irq_data *irqd, bool early)
/**
* irq_domain_activate_irq - Call domain_ops->activate recursively to activate
* interrupt
- * @irq_data: outermost irq_data associated with interrupt
+ * @irq_data: Outermost irq_data associated with interrupt
+ * @reserve: If set only reserve an interrupt vector instead of assigning one
*
* This is the second step to call domain_ops->activate to program interrupt
* controllers, so the interrupt could actually get delivered.
*/
-int irq_domain_activate_irq(struct irq_data *irq_data, bool early)
+int irq_domain_activate_irq(struct irq_data *irq_data, bool reserve)
{
int ret = 0;
if (!irqd_is_activated(irq_data))
- ret = __irq_domain_activate_irq(irq_data, early);
+ ret = __irq_domain_activate_irq(irq_data, reserve);
if (!ret)
irqd_set_activated(irq_data);
return ret;
diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index 7df2480005f8..5187dfe809ac 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -321,15 +321,23 @@ void irq_matrix_remove_reserved(struct irq_matrix *m)
int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk,
bool reserved, unsigned int *mapped_cpu)
{
- unsigned int cpu;
+ unsigned int cpu, best_cpu, maxavl = 0;
+ struct cpumap *cm;
+ unsigned int bit;
+ best_cpu = UINT_MAX;
for_each_cpu(cpu, msk) {
- struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
- unsigned int bit;
+ cm = per_cpu_ptr(m->maps, cpu);
- if (!cm->online)
+ if (!cm->online || cm->available <= maxavl)
continue;
+ best_cpu = cpu;
+ maxavl = cm->available;
+ }
+
+ if (maxavl) {
+ cm = per_cpu_ptr(m->maps, best_cpu);
bit = matrix_alloc_area(m, cm, 1, false);
if (bit < m->alloc_end) {
cm->allocated++;
@@ -338,8 +346,8 @@ int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk,
m->global_available--;
if (reserved)
m->global_reserved--;
- *mapped_cpu = cpu;
- trace_irq_matrix_alloc(bit, cpu, m, cm);
+ *mapped_cpu = best_cpu;
+ trace_irq_matrix_alloc(bit, best_cpu, m, cm);
return bit;
}
}
@@ -384,7 +392,9 @@ unsigned int irq_matrix_available(struct irq_matrix *m, bool cpudown)
{
struct cpumap *cm = this_cpu_ptr(m->maps);
- return (m->global_available - cpudown) ? cm->available : 0;
+ if (!cpudown)
+ return m->global_available;
+ return m->global_available - cm->available;
}
/**
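The irq_matrix_alloc() change above first scans the mask for the online CPU with the most available vectors and only then allocates from that CPU's map. A minimal standalone sketch of that selection step, using simplified stand-in types rather than the kernel's cpumask/per-cpu helpers (illustrative only):

#include <stdbool.h>

/* Illustrative model only -- not the kernel's struct cpumap. */
struct cpumap_model {
	bool online;
	unsigned int available;		/* free vectors on this CPU */
};

/* Return the online CPU with the most available vectors, or -1 if none. */
static int pick_best_cpu(const struct cpumap_model *maps, int nr_cpus)
{
	unsigned int maxavl = 0;
	int cpu, best_cpu = -1;

	for (cpu = 0; cpu < nr_cpus; cpu++) {
		/* Skip offline CPUs and anything not strictly better. */
		if (!maps[cpu].online || maps[cpu].available <= maxavl)
			continue;
		best_cpu = cpu;
		maxavl = maps[cpu].available;
	}
	return maxavl ? best_cpu : -1;
}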
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index edb987b2c58d..2f3c4f5382cc 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -339,6 +339,40 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
return ret;
}
+/*
+ * Carefully check whether the device can use reservation mode. If
+ * reservation mode is enabled then the early activation will assign a
+ * dummy vector to the device. If the PCI/MSI device does not support
+ * masking of the entry then this can result in spurious interrupts when
+ * the device driver is not absolutely careful. But even then a malfunction
+ * of the hardware could result in a spurious interrupt on the dummy vector
+ * and render the device unusable. If the entry can be masked then the core
+ * logic will prevent the spurious interrupt and reservation mode can be
+ * used. For now reservation mode is restricted to PCI/MSI.
+ */
+static bool msi_check_reservation_mode(struct irq_domain *domain,
+ struct msi_domain_info *info,
+ struct device *dev)
+{
+ struct msi_desc *desc;
+
+ if (domain->bus_token != DOMAIN_BUS_PCI_MSI)
+ return false;
+
+ if (!(info->flags & MSI_FLAG_MUST_REACTIVATE))
+ return false;
+
+ if (IS_ENABLED(CONFIG_PCI_MSI) && pci_msi_ignore_mask)
+ return false;
+
+ /*
+ * Checking the first MSI descriptor is sufficient. MSIX supports
+ * masking and MSI does so when the maskbit is set.
+ */
+ desc = first_msi_entry(dev);
+ return desc->msi_attrib.is_msix || desc->msi_attrib.maskbit;
+}
+
/**
* msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain
* @domain: The domain to allocate from
@@ -353,9 +387,11 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
{
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
- msi_alloc_info_t arg;
+ struct irq_data *irq_data;
struct msi_desc *desc;
+ msi_alloc_info_t arg;
int i, ret, virq;
+ bool can_reserve;
ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg);
if (ret)
@@ -385,6 +421,8 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
if (ops->msi_finish)
ops->msi_finish(&arg, 0);
+ can_reserve = msi_check_reservation_mode(domain, info, dev);
+
for_each_msi_entry(desc, dev) {
virq = desc->irq;
if (desc->nvec_used == 1)
@@ -397,15 +435,25 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
* the MSI entries before the PCI layer enables MSI in the
* card. Otherwise the card latches a random msi message.
*/
- if (info->flags & MSI_FLAG_ACTIVATE_EARLY) {
- struct irq_data *irq_data;
+ if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY))
+ continue;
+ irq_data = irq_domain_get_irq_data(domain, desc->irq);
+ if (!can_reserve)
+ irqd_clr_can_reserve(irq_data);
+ ret = irq_domain_activate_irq(irq_data, can_reserve);
+ if (ret)
+ goto cleanup;
+ }
+
+ /*
+ * If these interrupts use reservation mode, clear the activated bit
+ * so request_irq() will assign the final vector.
+ */
+ if (can_reserve) {
+ for_each_msi_entry(desc, dev) {
irq_data = irq_domain_get_irq_data(domain, desc->irq);
- ret = irq_domain_activate_irq(irq_data, true);
- if (ret)
- goto cleanup;
- if (info->flags & MSI_FLAG_MUST_REACTIVATE)
- irqd_clr_activated(irq_data);
+ irqd_clr_activated(irq_data);
}
}
return 0;
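Reservation mode, as checked by msi_check_reservation_mode() above, is only safe when every entry can be masked. A standalone model of that rule, with simplified boolean parameters standing in for the kernel structures (illustrative only, not a drop-in replacement):

#include <stdbool.h>

struct msi_entry_model {
	bool is_msix;	/* MSI-X entries are always maskable */
	bool maskbit;	/* plain MSI with a per-vector mask bit */
};

static bool can_use_reservation_mode(bool pci_msi_domain, bool must_reactivate,
				     bool mask_globally_ignored,
				     const struct msi_entry_model *e)
{
	/* Restricted to PCI/MSI domains that reactivate on request_irq(). */
	if (!pci_msi_domain || !must_reactivate)
		return false;
	/* If masking is globally disabled, a spurious interrupt on the
	 * dummy vector cannot be suppressed. */
	if (mask_globally_ignored)
		return false;
	return e->is_msix || e->maskbit;
}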
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 531ffa984bc2..d5fa4116688a 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -614,14 +614,14 @@ static void s_stop(struct seq_file *m, void *p)
static int s_show(struct seq_file *m, void *p)
{
- unsigned long value;
+ void *value;
struct kallsym_iter *iter = m->private;
/* Some debugging symbols have no name. Ignore them. */
if (!iter->name[0])
return 0;
- value = iter->show_value ? iter->value : 0;
+ value = iter->show_value ? (void *)iter->value : NULL;
if (iter->module_name[0]) {
char type;
@@ -632,10 +632,10 @@ static int s_show(struct seq_file *m, void *p)
*/
type = iter->exported ? toupper(iter->type) :
tolower(iter->type);
- seq_printf(m, KALLSYM_FMT " %c %s\t[%s]\n", value,
+ seq_printf(m, "%px %c %s\t[%s]\n", value,
type, iter->name, iter->module_name);
} else
- seq_printf(m, KALLSYM_FMT " %c %s\n", value,
+ seq_printf(m, "%px %c %s\n", value,
iter->type, iter->name);
return 0;
}
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 15f33faf4013..7594c033d98a 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -157,7 +157,7 @@ void notrace __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2)
}
EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2);
-void notrace __sanitizer_cov_trace_cmp4(u16 arg1, u16 arg2)
+void notrace __sanitizer_cov_trace_cmp4(u32 arg1, u32 arg2)
{
write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_);
}
@@ -183,7 +183,7 @@ void notrace __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2)
}
EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2);
-void notrace __sanitizer_cov_trace_const_cmp4(u16 arg1, u16 arg2)
+void notrace __sanitizer_cov_trace_const_cmp4(u32 arg1, u32 arg2)
{
write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2,
_RET_IP_);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 9776da8db180..5fa1324a4f29 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -57,10 +57,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/lock.h>
-#ifdef CONFIG_LOCKDEP_CROSSRELEASE
-#include <linux/slab.h>
-#endif
-
#ifdef CONFIG_PROVE_LOCKING
int prove_locking = 1;
module_param(prove_locking, int, 0644);
@@ -75,19 +71,6 @@ module_param(lock_stat, int, 0644);
#define lock_stat 0
#endif
-#ifdef CONFIG_BOOTPARAM_LOCKDEP_CROSSRELEASE_FULLSTACK
-static int crossrelease_fullstack = 1;
-#else
-static int crossrelease_fullstack;
-#endif
-static int __init allow_crossrelease_fullstack(char *str)
-{
- crossrelease_fullstack = 1;
- return 0;
-}
-
-early_param("crossrelease_fullstack", allow_crossrelease_fullstack);
-
/*
* lockdep_lock: protects the lockdep graph, the hashes and the
* class/list/hash allocators.
@@ -740,18 +723,6 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL);
}
-#ifdef CONFIG_LOCKDEP_CROSSRELEASE
-static void cross_init(struct lockdep_map *lock, int cross);
-static int cross_lock(struct lockdep_map *lock);
-static int lock_acquire_crosslock(struct held_lock *hlock);
-static int lock_release_crosslock(struct lockdep_map *lock);
-#else
-static inline void cross_init(struct lockdep_map *lock, int cross) {}
-static inline int cross_lock(struct lockdep_map *lock) { return 0; }
-static inline int lock_acquire_crosslock(struct held_lock *hlock) { return 2; }
-static inline int lock_release_crosslock(struct lockdep_map *lock) { return 2; }
-#endif
-
/*
* Register a lock's class in the hash-table, if the class is not present
* yet. Otherwise we look it up. We cache the result in the lock object
@@ -1151,41 +1122,22 @@ print_circular_lock_scenario(struct held_lock *src,
printk(KERN_CONT "\n\n");
}
- if (cross_lock(tgt->instance)) {
- printk(" Possible unsafe locking scenario by crosslock:\n\n");
- printk(" CPU0 CPU1\n");
- printk(" ---- ----\n");
- printk(" lock(");
- __print_lock_name(parent);
- printk(KERN_CONT ");\n");
- printk(" lock(");
- __print_lock_name(target);
- printk(KERN_CONT ");\n");
- printk(" lock(");
- __print_lock_name(source);
- printk(KERN_CONT ");\n");
- printk(" unlock(");
- __print_lock_name(target);
- printk(KERN_CONT ");\n");
- printk("\n *** DEADLOCK ***\n\n");
- } else {
- printk(" Possible unsafe locking scenario:\n\n");
- printk(" CPU0 CPU1\n");
- printk(" ---- ----\n");
- printk(" lock(");
- __print_lock_name(target);
- printk(KERN_CONT ");\n");
- printk(" lock(");
- __print_lock_name(parent);
- printk(KERN_CONT ");\n");
- printk(" lock(");
- __print_lock_name(target);
- printk(KERN_CONT ");\n");
- printk(" lock(");
- __print_lock_name(source);
- printk(KERN_CONT ");\n");
- printk("\n *** DEADLOCK ***\n\n");
- }
+ printk(" Possible unsafe locking scenario:\n\n");
+ printk(" CPU0 CPU1\n");
+ printk(" ---- ----\n");
+ printk(" lock(");
+ __print_lock_name(target);
+ printk(KERN_CONT ");\n");
+ printk(" lock(");
+ __print_lock_name(parent);
+ printk(KERN_CONT ");\n");
+ printk(" lock(");
+ __print_lock_name(target);
+ printk(KERN_CONT ");\n");
+ printk(" lock(");
+ __print_lock_name(source);
+ printk(KERN_CONT ");\n");
+ printk("\n *** DEADLOCK ***\n\n");
}
/*
@@ -1211,10 +1163,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
curr->comm, task_pid_nr(curr));
print_lock(check_src);
- if (cross_lock(check_tgt->instance))
- pr_warn("\nbut now in release context of a crosslock acquired at the following:\n");
- else
- pr_warn("\nbut task is already holding lock:\n");
+ pr_warn("\nbut task is already holding lock:\n");
print_lock(check_tgt);
pr_warn("\nwhich lock already depends on the new lock.\n\n");
@@ -1244,9 +1193,7 @@ static noinline int print_circular_bug(struct lock_list *this,
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return 0;
- if (cross_lock(check_tgt->instance))
- this->trace = *trace;
- else if (!save_trace(&this->trace))
+ if (!save_trace(&this->trace))
return 0;
depth = get_lock_depth(target);
@@ -1850,9 +1797,6 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
if (nest)
return 2;
- if (cross_lock(prev->instance))
- continue;
-
return print_deadlock_bug(curr, prev, next);
}
return 1;
@@ -2018,31 +1962,26 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
for (;;) {
int distance = curr->lockdep_depth - depth + 1;
hlock = curr->held_locks + depth - 1;
+
/*
- * Only non-crosslock entries get new dependencies added.
- * Crosslock entries will be added by commit later:
+ * Only non-recursive-read entries get new dependencies
+ * added:
*/
- if (!cross_lock(hlock->instance)) {
+ if (hlock->read != 2 && hlock->check) {
+ int ret = check_prev_add(curr, hlock, next, distance, &trace, save_trace);
+ if (!ret)
+ return 0;
+
/*
- * Only non-recursive-read entries get new dependencies
- * added:
+ * Stop after the first non-trylock entry,
+ * as non-trylock entries have added their
+ * own direct dependencies already, so this
+ * lock is connected to them indirectly:
*/
- if (hlock->read != 2 && hlock->check) {
- int ret = check_prev_add(curr, hlock, next,
- distance, &trace, save_trace);
- if (!ret)
- return 0;
-
- /*
- * Stop after the first non-trylock entry,
- * as non-trylock entries have added their
- * own direct dependencies already, so this
- * lock is connected to them indirectly:
- */
- if (!hlock->trylock)
- break;
- }
+ if (!hlock->trylock)
+ break;
}
+
depth--;
/*
* End of lock-stack?
@@ -3292,21 +3231,10 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name,
void lockdep_init_map(struct lockdep_map *lock, const char *name,
struct lock_class_key *key, int subclass)
{
- cross_init(lock, 0);
__lockdep_init_map(lock, name, key, subclass);
}
EXPORT_SYMBOL_GPL(lockdep_init_map);
-#ifdef CONFIG_LOCKDEP_CROSSRELEASE
-void lockdep_init_map_crosslock(struct lockdep_map *lock, const char *name,
- struct lock_class_key *key, int subclass)
-{
- cross_init(lock, 1);
- __lockdep_init_map(lock, name, key, subclass);
-}
-EXPORT_SYMBOL_GPL(lockdep_init_map_crosslock);
-#endif
-
struct lock_class_key __lockdep_no_validate__;
EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
@@ -3362,7 +3290,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
int chain_head = 0;
int class_idx;
u64 chain_key;
- int ret;
if (unlikely(!debug_locks))
return 0;
@@ -3411,8 +3338,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
class_idx = class - lock_classes + 1;
- /* TODO: nest_lock is not implemented for crosslock yet. */
- if (depth && !cross_lock(lock)) {
+ if (depth) {
hlock = curr->held_locks + depth - 1;
if (hlock->class_idx == class_idx && nest_lock) {
if (hlock->references) {
@@ -3500,14 +3426,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
return 0;
- ret = lock_acquire_crosslock(hlock);
- /*
- * 2 means normal acquire operations are needed. Otherwise, it's
- * ok just to return with '0:fail, 1:success'.
- */
- if (ret != 2)
- return ret;
-
curr->curr_chain_key = chain_key;
curr->lockdep_depth++;
check_chain_key(curr);
@@ -3745,19 +3663,11 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
struct task_struct *curr = current;
struct held_lock *hlock;
unsigned int depth;
- int ret, i;
+ int i;
if (unlikely(!debug_locks))
return 0;
- ret = lock_release_crosslock(lock);
- /*
- * 2 means normal release operations are needed. Otherwise, it's
- * ok just to return with '0:fail, 1:success'.
- */
- if (ret != 2)
- return ret;
-
depth = curr->lockdep_depth;
/*
* So we're all set to release this lock.. wait what lock? We don't
@@ -4675,494 +4585,3 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
dump_stack();
}
EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
-
-#ifdef CONFIG_LOCKDEP_CROSSRELEASE
-
-/*
- * Crossrelease works by recording a lock history for each thread and
- * connecting those historic locks that were taken after the
- * wait_for_completion() in the complete() context.
- *
- * Task-A Task-B
- *
- * mutex_lock(&A);
- * mutex_unlock(&A);
- *
- * wait_for_completion(&C);
- * lock_acquire_crosslock();
- * atomic_inc_return(&cross_gen_id);
- * |
- * | mutex_lock(&B);
- * | mutex_unlock(&B);
- * |
- * | complete(&C);
- * `-- lock_commit_crosslock();
- *
- * Which will then add a dependency between B and C.
- */
-
-#define xhlock(i) (current->xhlocks[(i) % MAX_XHLOCKS_NR])
-
-/*
- * Whenever a crosslock is held, cross_gen_id will be increased.
- */
-static atomic_t cross_gen_id; /* Can be wrapped */
-
-/*
- * Make an entry of the ring buffer invalid.
- */
-static inline void invalidate_xhlock(struct hist_lock *xhlock)
-{
- /*
- * Normally, xhlock->hlock.instance must be !NULL.
- */
- xhlock->hlock.instance = NULL;
-}
-
-/*
- * Lock history stacks; we have 2 nested lock history stacks:
- *
- * HARD(IRQ)
- * SOFT(IRQ)
- *
- * The thing is that once we complete a HARD/SOFT IRQ the future task locks
- * should not depend on any of the locks observed while running the IRQ. So
- * what we do is rewind the history buffer and erase all our knowledge of that
- * temporal event.
- */
-
-void crossrelease_hist_start(enum xhlock_context_t c)
-{
- struct task_struct *cur = current;
-
- if (!cur->xhlocks)
- return;
-
- cur->xhlock_idx_hist[c] = cur->xhlock_idx;
- cur->hist_id_save[c] = cur->hist_id;
-}
-
-void crossrelease_hist_end(enum xhlock_context_t c)
-{
- struct task_struct *cur = current;
-
- if (cur->xhlocks) {
- unsigned int idx = cur->xhlock_idx_hist[c];
- struct hist_lock *h = &xhlock(idx);
-
- cur->xhlock_idx = idx;
-
- /* Check if the ring was overwritten. */
- if (h->hist_id != cur->hist_id_save[c])
- invalidate_xhlock(h);
- }
-}
-
-/*
- * lockdep_invariant_state() is used to annotate independence inside a task, to
- * make one task look like multiple independent 'tasks'.
- *
- * Take for instance workqueues; each work is independent of the last. The
- * completion of a future work does not depend on the completion of a past work
- * (in general). Therefore we must not carry that (lock) dependency across
- * works.
- *
- * This is true for many things; pretty much all kthreads fall into this
- * pattern, where they have an invariant state and future completions do not
- * depend on past completions. Its just that since they all have the 'same'
- * form -- the kthread does the same over and over -- it doesn't typically
- * matter.
- *
- * The same is true for system-calls, once a system call is completed (we've
- * returned to userspace) the next system call does not depend on the lock
- * history of the previous system call.
- *
- * They key property for independence, this invariant state, is that it must be
- * a point where we hold no locks and have no history. Because if we were to
- * hold locks, the restore at _end() would not necessarily recover it's history
- * entry. Similarly, independence per-definition means it does not depend on
- * prior state.
- */
-void lockdep_invariant_state(bool force)
-{
- /*
- * We call this at an invariant point, no current state, no history.
- * Verify the former, enforce the latter.
- */
- WARN_ON_ONCE(!force && current->lockdep_depth);
- invalidate_xhlock(&xhlock(current->xhlock_idx));
-}
-
-static int cross_lock(struct lockdep_map *lock)
-{
- return lock ? lock->cross : 0;
-}
-
-/*
- * This is needed to decide the relationship between wrapable variables.
- */
-static inline int before(unsigned int a, unsigned int b)
-{
- return (int)(a - b) < 0;
-}
-
-static inline struct lock_class *xhlock_class(struct hist_lock *xhlock)
-{
- return hlock_class(&xhlock->hlock);
-}
-
-static inline struct lock_class *xlock_class(struct cross_lock *xlock)
-{
- return hlock_class(&xlock->hlock);
-}
-
-/*
- * Should we check a dependency with previous one?
- */
-static inline int depend_before(struct held_lock *hlock)
-{
- return hlock->read != 2 && hlock->check && !hlock->trylock;
-}
-
-/*
- * Should we check a dependency with next one?
- */
-static inline int depend_after(struct held_lock *hlock)
-{
- return hlock->read != 2 && hlock->check;
-}
-
-/*
- * Check if the xhlock is valid, which would be false if,
- *
- * 1. Has not used after initializaion yet.
- * 2. Got invalidated.
- *
- * Remind hist_lock is implemented as a ring buffer.
- */
-static inline int xhlock_valid(struct hist_lock *xhlock)
-{
- /*
- * xhlock->hlock.instance must be !NULL.
- */
- return !!xhlock->hlock.instance;
-}
-
-/*
- * Record a hist_lock entry.
- *
- * Irq disable is only required.
- */
-static void add_xhlock(struct held_lock *hlock)
-{
- unsigned int idx = ++current->xhlock_idx;
- struct hist_lock *xhlock = &xhlock(idx);
-
-#ifdef CONFIG_DEBUG_LOCKDEP
- /*
- * This can be done locklessly because they are all task-local
- * state, we must however ensure IRQs are disabled.
- */
- WARN_ON_ONCE(!irqs_disabled());
-#endif
-
- /* Initialize hist_lock's members */
- xhlock->hlock = *hlock;
- xhlock->hist_id = ++current->hist_id;
-
- xhlock->trace.nr_entries = 0;
- xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES;
- xhlock->trace.entries = xhlock->trace_entries;
-
- if (crossrelease_fullstack) {
- xhlock->trace.skip = 3;
- save_stack_trace(&xhlock->trace);
- } else {
- xhlock->trace.nr_entries = 1;
- xhlock->trace.entries[0] = hlock->acquire_ip;
- }
-}
-
-static inline int same_context_xhlock(struct hist_lock *xhlock)
-{
- return xhlock->hlock.irq_context == task_irq_context(current);
-}
-
-/*
- * This should be lockless as far as possible because this would be
- * called very frequently.
- */
-static void check_add_xhlock(struct held_lock *hlock)
-{
- /*
- * Record a hist_lock, only in case that acquisitions ahead
- * could depend on the held_lock. For example, if the held_lock
- * is trylock then acquisitions ahead never depends on that.
- * In that case, we don't need to record it. Just return.
- */
- if (!current->xhlocks || !depend_before(hlock))
- return;
-
- add_xhlock(hlock);
-}
-
-/*
- * For crosslock.
- */
-static int add_xlock(struct held_lock *hlock)
-{
- struct cross_lock *xlock;
- unsigned int gen_id;
-
- if (!graph_lock())
- return 0;
-
- xlock = &((struct lockdep_map_cross *)hlock->instance)->xlock;
-
- /*
- * When acquisitions for a crosslock are overlapped, we use
- * nr_acquire to perform commit for them, based on cross_gen_id
- * of the first acquisition, which allows to add additional
- * dependencies.
- *
- * Moreover, when no acquisition of a crosslock is in progress,
- * we should not perform commit because the lock might not exist
- * any more, which might cause incorrect memory access. So we
- * have to track the number of acquisitions of a crosslock.
- *
- * depend_after() is necessary to initialize only the first
- * valid xlock so that the xlock can be used on its commit.
- */
- if (xlock->nr_acquire++ && depend_after(&xlock->hlock))
- goto unlock;
-
- gen_id = (unsigned int)atomic_inc_return(&cross_gen_id);
- xlock->hlock = *hlock;
- xlock->hlock.gen_id = gen_id;
-unlock:
- graph_unlock();
- return 1;
-}
-
-/*
- * Called for both normal and crosslock acquires. Normal locks will be
- * pushed on the hist_lock queue. Cross locks will record state and
- * stop regular lock_acquire() to avoid being placed on the held_lock
- * stack.
- *
- * Return: 0 - failure;
- * 1 - crosslock, done;
- * 2 - normal lock, continue to held_lock[] ops.
- */
-static int lock_acquire_crosslock(struct held_lock *hlock)
-{
- /*
- * CONTEXT 1 CONTEXT 2
- * --------- ---------
- * lock A (cross)
- * X = atomic_inc_return(&cross_gen_id)
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- * Y = atomic_read_acquire(&cross_gen_id)
- * lock B
- *
- * atomic_read_acquire() is for ordering between A and B,
- * IOW, A happens before B, when CONTEXT 2 see Y >= X.
- *
- * Pairs with atomic_inc_return() in add_xlock().
- */
- hlock->gen_id = (unsigned int)atomic_read_acquire(&cross_gen_id);
-
- if (cross_lock(hlock->instance))
- return add_xlock(hlock);
-
- check_add_xhlock(hlock);
- return 2;
-}
-
-static int copy_trace(struct stack_trace *trace)
-{
- unsigned long *buf = stack_trace + nr_stack_trace_entries;
- unsigned int max_nr = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
- unsigned int nr = min(max_nr, trace->nr_entries);
-
- trace->nr_entries = nr;
- memcpy(buf, trace->entries, nr * sizeof(trace->entries[0]));
- trace->entries = buf;
- nr_stack_trace_entries += nr;
-
- if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
- if (!debug_locks_off_graph_unlock())
- return 0;
-
- print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
- dump_stack();
-
- return 0;
- }
-
- return 1;
-}
-
-static int commit_xhlock(struct cross_lock *xlock, struct hist_lock *xhlock)
-{
- unsigned int xid, pid;
- u64 chain_key;
-
- xid = xlock_class(xlock) - lock_classes;
- chain_key = iterate_chain_key((u64)0, xid);
- pid = xhlock_class(xhlock) - lock_classes;
- chain_key = iterate_chain_key(chain_key, pid);
-
- if (lookup_chain_cache(chain_key))
- return 1;
-
- if (!add_chain_cache_classes(xid, pid, xhlock->hlock.irq_context,
- chain_key))
- return 0;
-
- if (!check_prev_add(current, &xlock->hlock, &xhlock->hlock, 1,
- &xhlock->trace, copy_trace))
- return 0;
-
- return 1;
-}
-
-static void commit_xhlocks(struct cross_lock *xlock)
-{
- unsigned int cur = current->xhlock_idx;
- unsigned int prev_hist_id = xhlock(cur).hist_id;
- unsigned int i;
-
- if (!graph_lock())
- return;
-
- if (xlock->nr_acquire) {
- for (i = 0; i < MAX_XHLOCKS_NR; i++) {
- struct hist_lock *xhlock = &xhlock(cur - i);
-
- if (!xhlock_valid(xhlock))
- break;
-
- if (before(xhlock->hlock.gen_id, xlock->hlock.gen_id))
- break;
-
- if (!same_context_xhlock(xhlock))
- break;
-
- /*
- * Filter out the cases where the ring buffer was
- * overwritten and the current entry has a bigger
- * hist_id than the previous one, which is impossible
- * otherwise:
- */
- if (unlikely(before(prev_hist_id, xhlock->hist_id)))
- break;
-
- prev_hist_id = xhlock->hist_id;
-
- /*
- * commit_xhlock() returns 0 with graph_lock already
- * released if fail.
- */
- if (!commit_xhlock(xlock, xhlock))
- return;
- }
- }
-
- graph_unlock();
-}
-
-void lock_commit_crosslock(struct lockdep_map *lock)
-{
- struct cross_lock *xlock;
- unsigned long flags;
-
- if (unlikely(!debug_locks || current->lockdep_recursion))
- return;
-
- if (!current->xhlocks)
- return;
-
- /*
- * Do commit hist_locks with the cross_lock, only in case that
- * the cross_lock could depend on acquisitions after that.
- *
- * For example, if the cross_lock does not have the 'check' flag
- * then we don't need to check dependencies and commit for that.
- * Just skip it. In that case, of course, the cross_lock does
- * not depend on acquisitions ahead, either.
- *
- * WARNING: Don't do that in add_xlock() in advance. When an
- * acquisition context is different from the commit context,
- * invalid(skipped) cross_lock might be accessed.
- */
- if (!depend_after(&((struct lockdep_map_cross *)lock)->xlock.hlock))
- return;
-
- raw_local_irq_save(flags);
- check_flags(flags);
- current->lockdep_recursion = 1;
- xlock = &((struct lockdep_map_cross *)lock)->xlock;
- commit_xhlocks(xlock);
- current->lockdep_recursion = 0;
- raw_local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(lock_commit_crosslock);
-
-/*
- * Return: 0 - failure;
- * 1 - crosslock, done;
- * 2 - normal lock, continue to held_lock[] ops.
- */
-static int lock_release_crosslock(struct lockdep_map *lock)
-{
- if (cross_lock(lock)) {
- if (!graph_lock())
- return 0;
- ((struct lockdep_map_cross *)lock)->xlock.nr_acquire--;
- graph_unlock();
- return 1;
- }
- return 2;
-}
-
-static void cross_init(struct lockdep_map *lock, int cross)
-{
- if (cross)
- ((struct lockdep_map_cross *)lock)->xlock.nr_acquire = 0;
-
- lock->cross = cross;
-
- /*
- * Crossrelease assumes that the ring buffer size of xhlocks
- * is aligned with power of 2. So force it on build.
- */
- BUILD_BUG_ON(MAX_XHLOCKS_NR & (MAX_XHLOCKS_NR - 1));
-}
-
-void lockdep_init_task(struct task_struct *task)
-{
- int i;
-
- task->xhlock_idx = UINT_MAX;
- task->hist_id = 0;
-
- for (i = 0; i < XHLOCK_CTX_NR; i++) {
- task->xhlock_idx_hist[i] = UINT_MAX;
- task->hist_id_save[i] = 0;
- }
-
- task->xhlocks = kzalloc(sizeof(struct hist_lock) * MAX_XHLOCKS_NR,
- GFP_KERNEL);
-}
-
-void lockdep_free_task(struct task_struct *task)
-{
- if (task->xhlocks) {
- void *tmp = task->xhlocks;
- /* Diable crossrelease for current */
- task->xhlocks = NULL;
- kfree(tmp);
- }
-}
-#endif
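With crossrelease gone, check_prevs_add() above reduces to a single rule: walk the held-lock stack from the top, add a prev -> next dependency for every checked, non-recursive-read entry, and stop at the first non-trylock entry, whose own dependencies already connect the rest of the stack. A standalone model of that loop (simplified types, illustrative only):

/* Illustrative model only -- not the kernel's struct held_lock. */
struct held_model {
	int read;	/* 2 == recursive read */
	int check;
	int trylock;
};

static int model_check_prevs_add(struct held_model *stack, int depth,
				 int (*add_dep)(struct held_model *prev))
{
	while (depth > 0) {
		struct held_model *hlock = &stack[depth - 1];

		if (hlock->read != 2 && hlock->check) {
			if (!add_dep(hlock))
				return 0;
			/* Non-trylock entries terminate the walk. */
			if (!hlock->trylock)
				break;
		}
		depth--;
	}
	return 1;
}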
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 6f3dba6e4e9e..65cc0cb984e6 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1290,6 +1290,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
return ret;
}
+static inline int __rt_mutex_slowtrylock(struct rt_mutex *lock)
+{
+ int ret = try_to_take_rt_mutex(lock, current, NULL);
+
+ /*
+ * try_to_take_rt_mutex() sets the lock waiters bit
+ * unconditionally. Clean this up.
+ */
+ fixup_rt_mutex_waiters(lock);
+
+ return ret;
+}
+
/*
* Slow path try-lock function:
*/
@@ -1312,13 +1325,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
*/
raw_spin_lock_irqsave(&lock->wait_lock, flags);
- ret = try_to_take_rt_mutex(lock, current, NULL);
-
- /*
- * try_to_take_rt_mutex() sets the lock waiters bit
- * unconditionally. Clean this up.
- */
- fixup_rt_mutex_waiters(lock);
+ ret = __rt_mutex_slowtrylock(lock);
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
@@ -1505,6 +1512,11 @@ int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
return rt_mutex_slowtrylock(lock);
}
+int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock)
+{
+ return __rt_mutex_slowtrylock(lock);
+}
+
/**
* rt_mutex_timed_lock - lock a rt_mutex interruptible
* the timeout structure is provided
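The new __rt_mutex_slowtrylock()/__rt_mutex_futex_trylock() pair exists for callers that already hold lock->wait_lock and therefore cannot use rt_mutex_futex_trylock(), which takes wait_lock itself. A hypothetical caller sketch (not taken from this patch; the real user is on the futex side):

	unsigned long flags;
	int locked;

	raw_spin_lock_irqsave(&lock->wait_lock, flags);
	/* wait_lock already held: use the variant that does not take it. */
	locked = __rt_mutex_futex_trylock(lock);
	raw_spin_unlock_irqrestore(&lock->wait_lock, flags);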
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 124e98ca0b17..68686b3ec3c1 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -148,6 +148,7 @@ extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter);
extern int rt_mutex_futex_trylock(struct rt_mutex *l);
+extern int __rt_mutex_futex_trylock(struct rt_mutex *l);
extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 1fd1a7543cdd..936f3d14dd6b 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -66,12 +66,8 @@ void __lockfunc __raw_##op##_lock(locktype##_t *lock) \
break; \
preempt_enable(); \
\
- if (!(lock)->break_lock) \
- (lock)->break_lock = 1; \
- while ((lock)->break_lock) \
- arch_##op##_relax(&lock->raw_lock); \
+ arch_##op##_relax(&lock->raw_lock); \
} \
- (lock)->break_lock = 0; \
} \
\
unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
@@ -86,12 +82,9 @@ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
local_irq_restore(flags); \
preempt_enable(); \
\
- if (!(lock)->break_lock) \
- (lock)->break_lock = 1; \
- while ((lock)->break_lock) \
- arch_##op##_relax(&lock->raw_lock); \
+ arch_##op##_relax(&lock->raw_lock); \
} \
- (lock)->break_lock = 0; \
+ \
return flags; \
} \
\
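For reference, an approximate expansion of __raw_spin_lock() from the BUILD_LOCK_OPS() macro after the break_lock removal above (op = spin, locktype = raw_spinlock): the loop now simply retries the trylock with the architecture relax hint in between.

void __lockfunc __raw_spin_lock(raw_spinlock_t *lock)
{
	for (;;) {
		preempt_disable();
		if (likely(do_raw_spin_trylock(lock)))
			break;
		preempt_enable();

		/* Give the owner a chance to release the lock. */
		arch_spin_relax(&lock->raw_lock);
	}
}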
diff --git a/kernel/module.c b/kernel/module.c
index f0411a271765..dea01ac9cb74 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -4157,7 +4157,7 @@ static int m_show(struct seq_file *m, void *p)
{
struct module *mod = list_entry(p, struct module, list);
char buf[MODULE_FLAGS_BUF_SIZE];
- unsigned long value;
+ void *value;
/* We always ignore unformed modules. */
if (mod->state == MODULE_STATE_UNFORMED)
@@ -4173,8 +4173,8 @@ static int m_show(struct seq_file *m, void *p)
mod->state == MODULE_STATE_COMING ? "Loading" :
"Live");
/* Used by oprofile and other similar tools. */
- value = m->private ? 0 : (unsigned long)mod->core_layout.base;
- seq_printf(m, " 0x" KALLSYM_FMT, value);
+ value = m->private ? NULL : mod->core_layout.base;
+ seq_printf(m, " 0x%px", value);
/* Taints info */
if (mod->taints)
diff --git a/kernel/pid.c b/kernel/pid.c
index b13b624e2c49..1e8bb6550ec4 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -193,10 +193,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
}
if (unlikely(is_child_reaper(pid))) {
- if (pid_ns_prepare_proc(ns)) {
- disable_pid_allocation(ns);
+ if (pid_ns_prepare_proc(ns))
goto out_free;
- }
}
get_pid_ns(ns);
@@ -226,6 +224,10 @@ out_free:
while (++i <= ns->level)
idr_remove(&ns->idr, (pid->numbers + i)->nr);
+ /* On failure to allocate the first pid, reset the state */
+ if (ns->pid_allocated == PIDNS_ADDING)
+ idr_set_cursor(&ns->idr, 0);
+
spin_unlock_irq(&pidmap_lock);
kmem_cache_free(ns->pid_cachep, pid);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 5d81206a572d..b9006617710f 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3141,9 +3141,6 @@ void dump_stack_print_info(const char *log_lvl)
void show_regs_print_info(const char *log_lvl)
{
dump_stack_print_info(log_lvl);
-
- printk("%stask: %p task.stack: %p\n",
- log_lvl, current, task_stack_page(current));
}
#endif
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 2ddaec40956f..0926aef10dad 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -34,11 +34,6 @@ void complete(struct completion *x)
spin_lock_irqsave(&x->wait.lock, flags);
- /*
- * Perform commit of crossrelease here.
- */
- complete_release_commit(x);
-
if (x->done != UINT_MAX)
x->done++;
__wake_up_locked(&x->wait, TASK_NORMAL, 1);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 75554f366fd3..a7bf32aabfda 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2056,7 +2056,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
p->state = TASK_WAKING;
if (p->in_iowait) {
- delayacct_blkio_end();
+ delayacct_blkio_end(p);
atomic_dec(&task_rq(p)->nr_iowait);
}
@@ -2069,7 +2069,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
#else /* CONFIG_SMP */
if (p->in_iowait) {
- delayacct_blkio_end();
+ delayacct_blkio_end(p);
atomic_dec(&task_rq(p)->nr_iowait);
}
@@ -2122,7 +2122,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
if (!task_on_rq_queued(p)) {
if (p->in_iowait) {
- delayacct_blkio_end();
+ delayacct_blkio_end(p);
atomic_dec(&rq->nr_iowait);
}
ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
@@ -5097,17 +5097,6 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
return ret;
}
-/**
- * sys_sched_rr_get_interval - return the default timeslice of a process.
- * @pid: pid of the process.
- * @interval: userspace pointer to the timeslice value.
- *
- * this syscall writes the default timeslice value of a given process
- * into the user-space timespec buffer. A value of '0' means infinity.
- *
- * Return: On success, 0 and the timeslice is in @interval. Otherwise,
- * an error code.
- */
static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
{
struct task_struct *p;
@@ -5144,6 +5133,17 @@ out_unlock:
return retval;
}
+/**
+ * sys_sched_rr_get_interval - return the default timeslice of a process.
+ * @pid: pid of the process.
+ * @interval: userspace pointer to the timeslice value.
+ *
+ * this syscall writes the default timeslice value of a given process
+ * into the user-space timespec buffer. A value of '0' means infinity.
+ *
+ * Return: On success, 0 and the timeslice is in @interval. Otherwise,
+ * an error code.
+ */
SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
struct timespec __user *, interval)
{
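A userspace usage example for the syscall documented above (illustrative only, not part of the patch); per the kerneldoc, a returned timeslice of 0 means "infinity":

#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	/* pid 0 queries the calling process */
	if (sched_rr_get_interval(0, &ts) != 0) {
		perror("sched_rr_get_interval");
		return 1;
	}
	printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}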
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 2f52ec0f1539..d6717a3331a1 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -244,7 +244,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
#ifdef CONFIG_NO_HZ_COMMON
static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
{
- unsigned long idle_calls = tick_nohz_get_idle_calls();
+ unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
bool ret = idle_calls == sg_cpu->saved_idle_calls;
sg_cpu->saved_idle_calls = idle_calls;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4037e19bbca2..2fe3aa853e4d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3413,9 +3413,9 @@ void set_task_rq_fair(struct sched_entity *se,
* _IFF_ we look at the pure running and runnable sums. Because they
* represent the very same entity, just at different points in the hierarchy.
*
- *
- * Per the above update_tg_cfs_util() is trivial (and still 'wrong') and
- * simply copies the running sum over.
+ * Per the above update_tg_cfs_util() is trivial and simply copies the running
+ * sum over (but still wrong, because the group entity and group rq do not have
+ * their PELT windows aligned).
*
* However, update_tg_cfs_runnable() is more complex. So we have:
*
@@ -3424,11 +3424,11 @@ void set_task_rq_fair(struct sched_entity *se,
* And since, like util, the runnable part should be directly transferable,
* the following would _appear_ to be the straight forward approach:
*
- * grq->avg.load_avg = grq->load.weight * grq->avg.running_avg (3)
+ * grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg (3)
*
* And per (1) we have:
*
- * ge->avg.running_avg == grq->avg.running_avg
+ * ge->avg.runnable_avg == grq->avg.runnable_avg
*
* Which gives:
*
@@ -3447,27 +3447,28 @@ void set_task_rq_fair(struct sched_entity *se,
* to (shortly) return to us. This only works by keeping the weights as
* integral part of the sum. We therefore cannot decompose as per (3).
*
- * OK, so what then?
+ * Another reason this doesn't work is that runnable isn't a 0-sum entity.
+ * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the
+ * rq itself is runnable anywhere between 2/3 and 1 depending on how the
+ * runnable sections of these tasks overlap (or not). If they were to perfectly
+ * align, the rq as a whole would be runnable 2/3 of the time. If, however, we
+ * always have at least 1 runnable task, the rq as a whole is always runnable.
*
+ * So we'll have to approximate.. :/
*
- * Another way to look at things is:
+ * Given the constraint:
*
- * grq->avg.load_avg = \Sum se->avg.load_avg
+ * ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX
*
- * Therefore, per (2):
+ * We can construct a rule that adds runnable to a rq by assuming minimal
+ * overlap.
*
- * grq->avg.load_avg = \Sum se->load.weight * se->avg.runnable_avg
+ * On removal, we'll assume each task is equally runnable; which yields:
*
- * And the very thing we're propagating is a change in that sum (someone
- * joined/left). So we can easily know the runnable change, which would be, per
- * (2) the already tracked se->load_avg divided by the corresponding
- * se->weight.
+ * grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
*
- * Basically (4) but in differential form:
+ * XXX: only do this for the part of runnable > running ?
*
- * d(runnable_avg) += se->avg.load_avg / se->load.weight
- * (5)
- * ge->avg.load_avg += ge->load.weight * d(runnable_avg)
*/
static inline void
@@ -3479,6 +3480,14 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
if (!delta)
return;
+ /*
+	 * The relation between sum and avg is:
+	 *
+	 *   sum = avg * (LOAD_AVG_MAX - 1024 + sa->period_contrib)
+	 *
+	 * however, the PELT windows are not aligned between grq and gse.
+ */
+
/* Set new sched_entity's utilization */
se->avg.util_avg = gcfs_rq->avg.util_avg;
se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
@@ -3491,33 +3500,68 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
static inline void
update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- long runnable_sum = gcfs_rq->prop_runnable_sum;
- long runnable_load_avg, load_avg;
- s64 runnable_load_sum, load_sum;
+ long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
+ unsigned long runnable_load_avg, load_avg;
+ u64 runnable_load_sum, load_sum = 0;
+ s64 delta_sum;
if (!runnable_sum)
return;
gcfs_rq->prop_runnable_sum = 0;
+ if (runnable_sum >= 0) {
+ /*
+ * Add runnable; clip at LOAD_AVG_MAX. Reflects that until
+ * the CPU is saturated running == runnable.
+ */
+ runnable_sum += se->avg.load_sum;
+ runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
+ } else {
+ /*
+ * Estimate the new unweighted runnable_sum of the gcfs_rq by
+ * assuming all tasks are equally runnable.
+ */
+ if (scale_load_down(gcfs_rq->load.weight)) {
+ load_sum = div_s64(gcfs_rq->avg.load_sum,
+ scale_load_down(gcfs_rq->load.weight));
+ }
+
+ /* But make sure to not inflate se's runnable */
+ runnable_sum = min(se->avg.load_sum, load_sum);
+ }
+
+ /*
+	 * runnable_sum can't be lower than running_sum.
+	 * As running_sum is scaled with CPU capacity whereas runnable_sum
+	 * is not, we rescale running_sum first.
+ */
+ running_sum = se->avg.util_sum /
+ arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
+ runnable_sum = max(runnable_sum, running_sum);
+
load_sum = (s64)se_weight(se) * runnable_sum;
load_avg = div_s64(load_sum, LOAD_AVG_MAX);
- add_positive(&se->avg.load_sum, runnable_sum);
- add_positive(&se->avg.load_avg, load_avg);
+ delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
+ delta_avg = load_avg - se->avg.load_avg;
- add_positive(&cfs_rq->avg.load_avg, load_avg);
- add_positive(&cfs_rq->avg.load_sum, load_sum);
+ se->avg.load_sum = runnable_sum;
+ se->avg.load_avg = load_avg;
+ add_positive(&cfs_rq->avg.load_avg, delta_avg);
+ add_positive(&cfs_rq->avg.load_sum, delta_sum);
runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
+ delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
+ delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
- add_positive(&se->avg.runnable_load_sum, runnable_sum);
- add_positive(&se->avg.runnable_load_avg, runnable_load_avg);
+ se->avg.runnable_load_sum = runnable_sum;
+ se->avg.runnable_load_avg = runnable_load_avg;
if (se->on_rq) {
- add_positive(&cfs_rq->avg.runnable_load_avg, runnable_load_avg);
- add_positive(&cfs_rq->avg.runnable_load_sum, runnable_load_sum);
+ add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
+ add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
}
}
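The update_tg_cfs_runnable() rewrite above switches from adding raw sums into the parent to a "set the child, fold the delta into the parent" scheme (the kernel additionally clamps with add_positive() and clips runnable_sum at LOAD_AVG_MAX). A standalone model of that delta propagation, with simplified fields rather than struct sched_avg (illustrative only):

/* Illustrative model only -- simplified fields, not struct sched_avg. */
struct avg_model {
	long sum;
	long avg;
};

/*
 * Overwrite the child (se) with freshly computed values and fold only the
 * difference into the parent (cfs_rq), so the parent's aggregate stays
 * consistent even when the child's contribution shrinks.
 */
static void propagate_delta(struct avg_model *se, struct avg_model *cfs_rq,
			    long new_sum, long new_avg)
{
	long delta_sum = new_sum - se->sum;
	long delta_avg = new_avg - se->avg;

	se->sum = new_sum;
	se->avg = new_avg;

	cfs_rq->sum += delta_sum;	/* the kernel uses add_positive() here */
	cfs_rq->avg += delta_avg;
}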
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index dd7908743dab..9bcbacba82a8 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -89,7 +89,9 @@ static int membarrier_private_expedited(void)
rcu_read_unlock();
}
if (!fallback) {
+ preempt_disable();
smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+ preempt_enable();
free_cpumask_var(tmpmask);
}
cpus_read_unlock();
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 4056c19ca3f0..665ace2fc558 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2034,8 +2034,9 @@ static void pull_rt_task(struct rq *this_rq)
bool resched = false;
struct task_struct *p;
struct rq *src_rq;
+ int rt_overload_count = rt_overloaded(this_rq);
- if (likely(!rt_overloaded(this_rq)))
+ if (likely(!rt_overload_count))
return;
/*
@@ -2044,6 +2045,11 @@ static void pull_rt_task(struct rq *this_rq)
*/
smp_rmb();
+ /* If we are the only overloaded CPU do nothing */
+ if (rt_overload_count == 1 &&
+ cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
+ return;
+
#ifdef HAVE_RT_PUSH_IPI
if (sched_feat(RT_PUSH_IPI)) {
tell_cpu_to_push(this_rq);
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 98feab7933c7..929ecb7d6b78 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -27,7 +27,7 @@ void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq
wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE;
spin_lock_irqsave(&wq_head->lock, flags);
- __add_wait_queue_entry_tail(wq_head, wq_entry);
+ __add_wait_queue(wq_head, wq_entry);
spin_unlock_irqrestore(&wq_head->lock, flags);
}
EXPORT_SYMBOL(add_wait_queue);
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index e776fc8cc1df..f6b5f19223d6 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -95,6 +95,7 @@ config NO_HZ_FULL
select RCU_NOCB_CPU
select VIRT_CPU_ACCOUNTING_GEN
select IRQ_WORK
+ select CPU_ISOLATION
help
Adaptively try to shutdown the tick whenever possible, even when
the CPU is running tasks. Typically this requires running a single
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 13d6881f908b..ec999f32c840 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -434,17 +434,22 @@ static struct pid *good_sigevent(sigevent_t * event)
{
struct task_struct *rtn = current->group_leader;
- if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
- (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) ||
- !same_thread_group(rtn, current) ||
- (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL))
+ switch (event->sigev_notify) {
+ case SIGEV_SIGNAL | SIGEV_THREAD_ID:
+ rtn = find_task_by_vpid(event->sigev_notify_thread_id);
+ if (!rtn || !same_thread_group(rtn, current))
+ return NULL;
+ /* FALLTHRU */
+ case SIGEV_SIGNAL:
+ case SIGEV_THREAD:
+ if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX)
+ return NULL;
+ /* FALLTHRU */
+ case SIGEV_NONE:
+ return task_pid(rtn);
+ default:
return NULL;
-
- if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
- ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
- return NULL;
-
- return task_pid(rtn);
+ }
}
static struct k_itimer * alloc_posix_timer(void)
@@ -669,7 +674,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
struct timespec64 ts64;
bool sig_none;
- sig_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE;
+ sig_none = timr->it_sigev_notify == SIGEV_NONE;
iv = timr->it_interval;
/* interval timer ? */
@@ -856,7 +861,7 @@ int common_timer_set(struct k_itimer *timr, int flags,
timr->it_interval = timespec64_to_ktime(new_setting->it_interval);
expires = timespec64_to_ktime(new_setting->it_value);
- sigev_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE;
+ sigev_none = timr->it_sigev_notify == SIGEV_NONE;
kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none);
timr->it_active = !sigev_none;
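A userspace example that passes the rewritten good_sigevent() validation above (illustrative only): SIGEV_SIGNAL with a valid realtime signal. The SIGEV_SIGNAL | SIGEV_THREAD_ID variant additionally requires sigev_notify_thread_id to name a thread in the caller's thread group. Link with -lrt on older glibc.

#include <signal.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct sigevent sev = {
		.sigev_notify = SIGEV_SIGNAL,
		.sigev_signo  = SIGRTMIN,	/* must be in (0, SIGRTMAX] */
	};
	timer_t t;

	if (timer_create(CLOCK_MONOTONIC, &sev, &t) != 0) {
		perror("timer_create");
		return 1;
	}
	timer_delete(t);
	return 0;
}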
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 99578f06c8d4..f7cc7abfcf25 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -650,6 +650,11 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
ts->next_tick = 0;
}
+static inline bool local_timer_softirq_pending(void)
+{
+	return local_softirq_pending() & BIT(TIMER_SOFTIRQ);
+}
+
static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
ktime_t now, int cpu)
{
@@ -666,8 +671,18 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
} while (read_seqretry(&jiffies_lock, seq));
ts->last_jiffies = basejiff;
- if (rcu_needs_cpu(basemono, &next_rcu) ||
- arch_needs_cpu() || irq_work_needs_cpu()) {
+ /*
+ * Keep the periodic tick, when RCU, architecture or irq_work
+ * requests it.
+	 * Aside of that, check whether the local timer softirq is
+	 * pending. If so, it's a bad idea to call get_next_timer_interrupt()
+	 * because there is an already expired timer, so it will request
+	 * immediate expiry, which rearms the hardware timer with a
+ * minimal delta which brings us back to this place
+ * immediately. Lather, rinse and repeat...
+ */
+ if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() ||
+ irq_work_needs_cpu() || local_timer_softirq_pending()) {
next_tick = basemono + TICK_NSEC;
} else {
/*
@@ -986,6 +1001,19 @@ ktime_t tick_nohz_get_sleep_length(void)
}
/**
+ * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value
+ * for a particular CPU.
+ *
+ * Called from the schedutil frequency scaling governor in scheduler context.
+ */
+unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
+{
+ struct tick_sched *ts = tick_get_tick_sched(cpu);
+
+ return ts->idle_calls;
+}
+
+/**
* tick_nohz_get_idle_calls - return the current idle calls counter value
*
* Called from the schedutil frequency scaling governor in scheduler context.
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index ffebcf878fba..0bcf00e3ce48 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -823,11 +823,10 @@ static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu);
/*
- * If the timer is deferrable and nohz is active then we need to use
- * the deferrable base.
+ * If the timer is deferrable and NO_HZ_COMMON is set then we need
+ * to use the deferrable base.
*/
- if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
- (tflags & TIMER_DEFERRABLE))
+ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
return base;
}
@@ -837,11 +836,10 @@ static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
/*
- * If the timer is deferrable and nohz is active then we need to use
- * the deferrable base.
+ * If the timer is deferrable and NO_HZ_COMMON is set then we need
+ * to use the deferrable base.
*/
- if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
- (tflags & TIMER_DEFERRABLE))
+ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
base = this_cpu_ptr(&timer_bases[BASE_DEF]);
return base;
}
@@ -1009,8 +1007,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
if (!ret && (options & MOD_TIMER_PENDING_ONLY))
goto out_unlock;
- debug_activate(timer, expires);
-
new_base = get_target_base(base, timer->flags);
if (base != new_base) {
@@ -1034,6 +1030,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
}
}
+ debug_activate(timer, expires);
+
timer->expires = expires;
/*
* If 'idx' was calculated above and the base time did not advance
@@ -1684,7 +1682,7 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
base->must_forward_clk = false;
__run_timers(base);
- if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
+ if (IS_ENABLED(CONFIG_NO_HZ_COMMON))
__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
}
@@ -1698,7 +1696,7 @@ void run_local_timers(void)
hrtimer_run_queues();
/* Raise the softirq only if required. */
if (time_before(jiffies, base->clk)) {
- if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
+ if (!IS_ENABLED(CONFIG_NO_HZ_COMMON))
return;
/* CPU is awake, so check the deferrable base. */
base++;
@@ -1855,6 +1853,21 @@ static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *h
}
}
+int timers_prepare_cpu(unsigned int cpu)
+{
+ struct timer_base *base;
+ int b;
+
+ for (b = 0; b < NR_BASES; b++) {
+ base = per_cpu_ptr(&timer_bases[b], cpu);
+ base->clk = jiffies;
+ base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
+ base->is_idle = false;
+ base->must_forward_clk = true;
+ }
+ return 0;
+}
+
int timers_dead_cpu(unsigned int cpu)
{
struct timer_base *old_base;
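With the change above, a TIMER_DEFERRABLE timer always lands on the deferrable base when CONFIG_NO_HZ_COMMON is set, rather than depending on nohz_active. Illustrative in-kernel usage with hypothetical names (my_housekeeping_*), not part of this patch:

#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list my_housekeeping_timer;

static void my_housekeeping_fn(struct timer_list *t)
{
	/* periodic, non-urgent work; a deferrable timer will not wake an
	 * idle CPU just to run this */
	mod_timer(t, jiffies + 10 * HZ);
}

static void my_housekeeping_init(void)
{
	timer_setup(&my_housekeeping_timer, my_housekeeping_fn, TIMER_DEFERRABLE);
	mod_timer(&my_housekeeping_timer, jiffies + 10 * HZ);
}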
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index af7dad126c13..f54dc62b599c 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -164,6 +164,7 @@ config PREEMPTIRQ_EVENTS
bool "Enable trace events for preempt and irq disable/enable"
select TRACE_IRQFLAGS
depends on DEBUG_PREEMPT || !PROVE_LOCKING
+ depends on TRACING
default n
help
Enable tracing of disable and enable events for preemption and irqs.
@@ -354,7 +355,7 @@ config PROFILE_ANNOTATED_BRANCHES
on if you need to profile the system's use of these macros.
config PROFILE_ALL_BRANCHES
- bool "Profile all if conditionals"
+ bool "Profile all if conditionals" if !FORTIFY_SOURCE
select TRACE_BRANCH_PROFILING
help
This tracer profiles all branch conditions. Every if ()
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 206e0e2ace53..987d9a9ae283 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -591,7 +591,7 @@ static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
return ret;
if (copy_to_user(arg, &buts, sizeof(buts))) {
- blk_trace_remove(q);
+ __blk_trace_remove(q);
return -EFAULT;
}
return 0;
@@ -637,7 +637,7 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
return ret;
if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) {
- blk_trace_remove(q);
+ __blk_trace_remove(q);
return -EFAULT;
}
@@ -872,7 +872,7 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
*
**/
static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
- u32 what, int error, union kernfs_node_id *cgid)
+ u32 what, int error)
{
struct blk_trace *bt = q->blk_trace;
@@ -880,22 +880,21 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
return;
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio_op(bio), bio->bi_opf, what, error, 0, NULL, cgid);
+ bio_op(bio), bio->bi_opf, what, error, 0, NULL,
+ blk_trace_bio_get_cgid(q, bio));
}
static void blk_add_trace_bio_bounce(void *ignore,
struct request_queue *q, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0,
- blk_trace_bio_get_cgid(q, bio));
+ blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
}
static void blk_add_trace_bio_complete(void *ignore,
struct request_queue *q, struct bio *bio,
int error)
{
- blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error,
- blk_trace_bio_get_cgid(q, bio));
+ blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
}
static void blk_add_trace_bio_backmerge(void *ignore,
@@ -903,8 +902,7 @@ static void blk_add_trace_bio_backmerge(void *ignore,
struct request *rq,
struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0,
- blk_trace_bio_get_cgid(q, bio));
+ blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
}
static void blk_add_trace_bio_frontmerge(void *ignore,
@@ -912,15 +910,13 @@ static void blk_add_trace_bio_frontmerge(void *ignore,
struct request *rq,
struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0,
- blk_trace_bio_get_cgid(q, bio));
+ blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
}
static void blk_add_trace_bio_queue(void *ignore,
struct request_queue *q, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0,
- blk_trace_bio_get_cgid(q, bio));
+ blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
}
static void blk_add_trace_getrq(void *ignore,
@@ -928,8 +924,7 @@ static void blk_add_trace_getrq(void *ignore,
struct bio *bio, int rw)
{
if (bio)
- blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0,
- blk_trace_bio_get_cgid(q, bio));
+ blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
else {
struct blk_trace *bt = q->blk_trace;
@@ -945,8 +940,7 @@ static void blk_add_trace_sleeprq(void *ignore,
struct bio *bio, int rw)
{
if (bio)
- blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0,
- blk_trace_bio_get_cgid(q, bio));
+ blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
else {
struct blk_trace *bt = q->blk_trace;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 27d1f4ffa3de..40207c2a4113 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -343,14 +343,13 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
.arg4_type = ARG_CONST_SIZE,
};
-static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd);
+static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd);
static __always_inline u64
__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
- u64 flags, struct perf_raw_record *raw)
+ u64 flags, struct perf_sample_data *sd)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
- struct perf_sample_data *sd = this_cpu_ptr(&bpf_sd);
unsigned int cpu = smp_processor_id();
u64 index = flags & BPF_F_INDEX_MASK;
struct bpf_event_entry *ee;
@@ -373,8 +372,6 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
if (unlikely(event->oncpu != cpu))
return -EOPNOTSUPP;
- perf_sample_data_init(sd, 0, 0);
- sd->raw = raw;
perf_event_output(event, sd, regs);
return 0;
}
@@ -382,6 +379,7 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
u64, flags, void *, data, u64, size)
{
+ struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd);
struct perf_raw_record raw = {
.frag = {
.size = size,
@@ -392,7 +390,10 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
return -EINVAL;
- return __bpf_perf_event_output(regs, map, flags, &raw);
+ perf_sample_data_init(sd, 0, 0);
+ sd->raw = &raw;
+
+ return __bpf_perf_event_output(regs, map, flags, sd);
}
static const struct bpf_func_proto bpf_perf_event_output_proto = {
@@ -407,10 +408,12 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
};
static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
+static DEFINE_PER_CPU(struct perf_sample_data, bpf_misc_sd);
u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
{
+ struct perf_sample_data *sd = this_cpu_ptr(&bpf_misc_sd);
struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs);
struct perf_raw_frag frag = {
.copy = ctx_copy,
@@ -428,8 +431,10 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
};
perf_fetch_caller_regs(regs);
+ perf_sample_data_init(sd, 0, 0);
+ sd->raw = &raw;
- return __bpf_perf_event_output(regs, map, flags, &raw);
+ return __bpf_perf_event_output(regs, map, flags, sd);
}
BPF_CALL_0(bpf_get_current_task)
@@ -759,6 +764,8 @@ const struct bpf_prog_ops perf_event_prog_ops = {
static DEFINE_MUTEX(bpf_event_mutex);
+#define BPF_TRACE_MAX_PROGS 64
+
int perf_event_attach_bpf_prog(struct perf_event *event,
struct bpf_prog *prog)
{
@@ -772,6 +779,12 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
goto unlock;
old_array = event->tp_event->prog_array;
+ if (old_array &&
+ bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) {
+ ret = -E2BIG;
+ goto unlock;
+ }
+
ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array);
if (ret < 0)
goto unlock;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 91874a95060d..5af2842dea96 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -280,6 +280,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
/* Missed count stored at end */
#define RB_MISSED_STORED (1 << 30)
+#define RB_MISSED_FLAGS (RB_MISSED_EVENTS|RB_MISSED_STORED)
+
struct buffer_data_page {
u64 time_stamp; /* page time stamp */
local_t commit; /* write committed index */
@@ -331,7 +333,9 @@ static void rb_init_page(struct buffer_data_page *bpage)
*/
size_t ring_buffer_page_len(void *page)
{
- return local_read(&((struct buffer_data_page *)page)->commit)
+ struct buffer_data_page *bpage = page;
+
+ return (local_read(&bpage->commit) & ~RB_MISSED_FLAGS)
+ BUF_PAGE_HDR_SIZE;
}
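
The commit word doubles as storage for the missed-events flag bits, which is why the reworked ring_buffer_page_len() masks RB_MISSED_FLAGS before adding the header size. A small user-space sketch of that pack/mask arrangement; bit 30 is taken from the hunk above, while bit 31 for the events flag and the 16-byte header size are assumptions for the demo, not values copied from this diff:

#include <assert.h>
#include <stdio.h>

#define DEMO_MISSED_EVENTS  (1u << 31)  /* assumed position of RB_MISSED_EVENTS */
#define DEMO_MISSED_STORED  (1u << 30)  /* matches the hunk above */
#define DEMO_MISSED_FLAGS   (DEMO_MISSED_EVENTS | DEMO_MISSED_STORED)
#define DEMO_HDR_SIZE       16u         /* stand-in for BUF_PAGE_HDR_SIZE */

/* Page length = committed bytes + header size, with the flag bits
 * masked off the commit word first. */
static unsigned int demo_page_len(unsigned int commit)
{
        return (commit & ~DEMO_MISSED_FLAGS) + DEMO_HDR_SIZE;
}

int main(void)
{
        unsigned int commit = 100u | DEMO_MISSED_STORED; /* flag set in a high bit */

        assert(demo_page_len(commit) == 116u);
        /* Without the mask, the flag bit would corrupt the length: */
        assert((commit + DEMO_HDR_SIZE) != 116u);
        printf("masked page length: %u\n", demo_page_len(commit));
        return 0;
}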
@@ -1799,12 +1803,6 @@ void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val)
}
EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite);
-static __always_inline void *
-__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
-{
- return bpage->data + index;
-}
-
static __always_inline void *__rb_page_index(struct buffer_page *bpage, unsigned index)
{
return bpage->page->data + index;
@@ -2536,29 +2534,58 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
* The lock and unlock are done within a preempt disable section.
* The current_context per_cpu variable can only be modified
* by the current task between lock and unlock. But it can
- * be modified more than once via an interrupt. There are four
- * different contexts that we need to consider.
+ * be modified more than once via an interrupt. To pass this
+ * information from the lock to the unlock without having to
+ * access the 'in_interrupt()' functions again (which do show
+ * a bit of overhead in something as critical as function tracing),
+ * we use a bitmask trick.
+ *
+ * bit 0 = NMI context
+ * bit 1 = IRQ context
+ * bit 2 = SoftIRQ context
+ * bit 3 = normal context.
+ *
+ * This works because this is the order of contexts that can
+ * preempt other contexts. A SoftIRQ never preempts an IRQ
+ * context.
+ *
+ * When the context is determined, the corresponding bit is
+ * checked and set (if it was set, then a recursion of that context
+ * happened).
+ *
+ * On unlock, we need to clear this bit. To do so, just subtract
+ * 1 from the current_context and AND it to itself.
*
- * Normal context.
- * SoftIRQ context
- * IRQ context
- * NMI context
+ * (binary)
+ * 101 - 1 = 100
+ * 101 & 100 = 100 (clearing bit zero)
*
- * If for some reason the ring buffer starts to recurse, we
- * only allow that to happen at most 4 times (one for each
- * context). If it happens 5 times, then we consider this a
- * recusive loop and do not let it go further.
+ * 1010 - 1 = 1001
+ * 1010 & 1001 = 1000 (clearing bit 1)
+ *
+ * The least significant bit can be cleared this way, and it
+ * just so happens that it is the same bit corresponding to
+ * the current context.
*/
static __always_inline int
trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
{
- if (cpu_buffer->current_context >= 4)
+ unsigned int val = cpu_buffer->current_context;
+ unsigned long pc = preempt_count();
+ int bit;
+
+ if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
+ bit = RB_CTX_NORMAL;
+ else
+ bit = pc & NMI_MASK ? RB_CTX_NMI :
+ pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ;
+
+ if (unlikely(val & (1 << bit)))
return 1;
- cpu_buffer->current_context++;
- /* Interrupts must see this update */
- barrier();
+ val |= (1 << bit);
+ cpu_buffer->current_context = val;
return 0;
}
@@ -2566,9 +2593,7 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
static __always_inline void
trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
{
- /* Don't let the dec leak out */
- barrier();
- cpu_buffer->current_context--;
+ cpu_buffer->current_context &= cpu_buffer->current_context - 1;
}
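
The bit trick trace_recursive_unlock() relies on is the standard clear-lowest-set-bit idiom, x & (x - 1); under the bit ordering described in the comment above, the lowest set bit is the most deeply nested context. A minimal user-space sketch of the lock/unlock bookkeeping (the CTX_* values and the main() walkthrough are illustrative stand-ins for the RB_CTX_* constants, not copied from the kernel):

#include <assert.h>
#include <stdio.h>

/* Hypothetical stand-ins for RB_CTX_NMI/IRQ/SOFTIRQ/NORMAL. */
enum { CTX_NMI = 0, CTX_IRQ = 1, CTX_SOFTIRQ = 2, CTX_NORMAL = 3 };

static unsigned int current_context;

/* Returns 1 on recursion within the same context, 0 otherwise. */
static int ctx_lock(int bit)
{
        if (current_context & (1U << bit))
                return 1;
        current_context |= 1U << bit;
        return 0;
}

/* Clears the lowest set bit (the current, most deeply nested context). */
static void ctx_unlock(void)
{
        current_context &= current_context - 1;
}

int main(void)
{
        assert(ctx_lock(CTX_NORMAL) == 0);   /* normal context enters       */
        assert(ctx_lock(CTX_IRQ) == 0);      /* IRQ preempts normal         */
        assert(ctx_lock(CTX_IRQ) == 1);      /* same context again: refused */
        ctx_unlock();                        /* 1010 & 1001 = 1000: IRQ bit cleared */
        assert(current_context == 1U << CTX_NORMAL);
        ctx_unlock();                        /* clears the normal-context bit */
        assert(current_context == 0);
        printf("recursion bookkeeping OK\n");
        return 0;
}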
/**
@@ -4406,8 +4431,13 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
struct buffer_data_page *bpage = data;
+ struct page *page = virt_to_page(bpage);
unsigned long flags;
+ /* If the page is still in use someplace else, we can't reuse it */
+ if (page_ref_count(page) > 1)
+ goto out;
+
local_irq_save(flags);
arch_spin_lock(&cpu_buffer->lock);
@@ -4419,6 +4449,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data)
arch_spin_unlock(&cpu_buffer->lock);
local_irq_restore(flags);
+ out:
free_page((unsigned long)bpage);
}
EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
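
The new page_ref_count() check makes the reuse path conditional on being the last user of the page: if something else (for example a splice reader) still holds a reference, the page is not recycled. A rough user-space analogue using a plain atomic reference count (entirely illustrative, not the kernel's page refcounting API):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct shared_page {
        atomic_int refs;
        char data[64];
};

/* Only dispose of the page when we hold the last reference; otherwise
 * just drop our own reference and leave it alone, mirroring the
 * "if (page_ref_count(page) > 1) goto out" guard above. */
static void release_page(struct shared_page *p)
{
        if (atomic_load(&p->refs) > 1) {
                atomic_fetch_sub(&p->refs, 1);  /* someone else still uses it */
                return;
        }
        /* last user: in the kernel hunk this is where the page may be
         * recycled or freed */
        free(p);
}

int main(void)
{
        struct shared_page *p = malloc(sizeof(*p));

        if (!p)
                return 1;
        atomic_init(&p->refs, 2);   /* e.g. a splice() reader still holds a ref */
        release_page(p);            /* not freed: refs drops to 1 */
        release_page(p);            /* last reference: freed */
        puts("done");
        return 0;
}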
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 73e67b68c53b..2a8d8a294345 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -362,7 +362,7 @@ trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct
}
/**
- * trace_pid_filter_add_remove - Add or remove a task from a pid_list
+ * trace_pid_filter_add_remove_task - Add or remove a task from a pid_list
* @pid_list: The list to modify
* @self: The current task for fork or NULL for exit
* @task: The task to add or remove
@@ -925,7 +925,7 @@ static void tracing_snapshot_instance(struct trace_array *tr)
}
/**
- * trace_snapshot - take a snapshot of the current buffer.
+ * tracing_snapshot - take a snapshot of the current buffer.
*
* This causes a swap between the snapshot buffer and the current live
* tracing buffer. You can use this to take snapshots of the live
@@ -1004,9 +1004,9 @@ int tracing_alloc_snapshot(void)
EXPORT_SYMBOL_GPL(tracing_alloc_snapshot);
/**
- * trace_snapshot_alloc - allocate and take a snapshot of the current buffer.
+ * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer.
*
- * This is similar to trace_snapshot(), but it will allocate the
+ * This is similar to tracing_snapshot(), but it will allocate the
* snapshot buffer if it isn't already allocated. Use this only
* where it is safe to sleep, as the allocation may sleep.
*
@@ -1303,7 +1303,7 @@ unsigned long __read_mostly tracing_thresh;
/*
* Copy the new maximum trace into the separate maximum-trace
* structure. (this way the maximum trace is permanently saved,
- * for later retrieval via /sys/kernel/debug/tracing/latency_trace)
+ * for later retrieval via /sys/kernel/tracing/tracing_max_latency)
*/
static void
__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
@@ -2415,7 +2415,7 @@ trace_process_export(struct trace_export *export,
entry = ring_buffer_event_data(event);
size = ring_buffer_event_length(event);
- export->write(entry, size);
+ export->write(export, entry, size);
}
static DEFINE_MUTEX(ftrace_export_lock);
@@ -4178,37 +4178,30 @@ static const struct file_operations show_traces_fops = {
.llseek = seq_lseek,
};
-/*
- * The tracer itself will not take this lock, but still we want
- * to provide a consistent cpumask to user-space:
- */
-static DEFINE_MUTEX(tracing_cpumask_update_lock);
-
-/*
- * Temporary storage for the character representation of the
- * CPU bitmask (and one more byte for the newline):
- */
-static char mask_str[NR_CPUS + 1];
-
static ssize_t
tracing_cpumask_read(struct file *filp, char __user *ubuf,
size_t count, loff_t *ppos)
{
struct trace_array *tr = file_inode(filp)->i_private;
+ char *mask_str;
int len;
- mutex_lock(&tracing_cpumask_update_lock);
+ len = snprintf(NULL, 0, "%*pb\n",
+ cpumask_pr_args(tr->tracing_cpumask)) + 1;
+ mask_str = kmalloc(len, GFP_KERNEL);
+ if (!mask_str)
+ return -ENOMEM;
- len = snprintf(mask_str, count, "%*pb\n",
+ len = snprintf(mask_str, len, "%*pb\n",
cpumask_pr_args(tr->tracing_cpumask));
if (len >= count) {
count = -EINVAL;
goto out_err;
}
- count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1);
+ count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len);
out_err:
- mutex_unlock(&tracing_cpumask_update_lock);
+ kfree(mask_str);
return count;
}
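
The replacement code sizes the buffer with the standard two-pass snprintf() pattern: a first call with a NULL buffer and zero size returns the length the output would need, then the buffer is allocated and formatted for real. A user-space sketch of the same pattern, with malloc()/free() standing in for kmalloc()/kfree() and an arbitrary mask string in place of the cpumask formatting:

#include <stdio.h>
#include <stdlib.h>

/* Format a string into a heap buffer sized exactly for it, mirroring
 * the snprintf(NULL, 0, ...) + kmalloc() sequence above. */
static char *format_mask(const char *mask_repr)
{
        int len;
        char *buf;

        /* Pass 1: NULL/0 makes snprintf() only report the needed length. */
        len = snprintf(NULL, 0, "%s\n", mask_repr) + 1;  /* +1 for the NUL */
        buf = malloc(len);
        if (!buf)
                return NULL;

        /* Pass 2: format for real into the exactly-sized buffer. */
        snprintf(buf, len, "%s\n", mask_repr);
        return buf;
}

int main(void)
{
        char *s = format_mask("ff,ffffffff");  /* arbitrary example mask text */

        if (!s)
                return 1;
        fputs(s, stdout);
        free(s);
        return 0;
}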
@@ -4228,8 +4221,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
if (err)
goto err_unlock;
- mutex_lock(&tracing_cpumask_update_lock);
-
local_irq_disable();
arch_spin_lock(&tr->max_lock);
for_each_tracing_cpu(cpu) {
@@ -4252,8 +4243,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
local_irq_enable();
cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new);
-
- mutex_unlock(&tracing_cpumask_update_lock);
free_cpumask_var(tracing_cpumask_new);
return count;
@@ -6780,7 +6769,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
.spd_release = buffer_spd_release,
};
struct buffer_ref *ref;
- int entries, size, i;
+ int entries, i;
ssize_t ret = 0;
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -6834,14 +6823,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
break;
}
- /*
- * zero out any left over data, this is going to
- * user land.
- */
- size = ring_buffer_page_len(ref->page);
- if (size < PAGE_SIZE)
- memset(ref->page + size, 0, PAGE_SIZE - size);
-
page = virt_to_page(ref->page);
spd.pages[i] = page;
@@ -7599,6 +7580,7 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size
buf->data = alloc_percpu(struct trace_array_cpu);
if (!buf->data) {
ring_buffer_free(buf->buffer);
+ buf->buffer = NULL;
return -ENOMEM;
}
@@ -7622,7 +7604,9 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
allocate_snapshot ? size : 1);
if (WARN_ON(ret)) {
ring_buffer_free(tr->trace_buffer.buffer);
+ tr->trace_buffer.buffer = NULL;
free_percpu(tr->trace_buffer.data);
+ tr->trace_buffer.data = NULL;
return -ENOMEM;
}
tr->allocated_snapshot = allocate_snapshot;
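
Both error paths above now clear the pointers they have just freed, so a later teardown of the partially built trace buffers cannot double-free or dereference stale memory. A generic sketch of the free-and-NULL idiom (the struct and names are illustrative only):

#include <stdlib.h>

struct demo_buf {
        void *buffer;
        void *data;
};

/* Free both members and leave the struct in a state that is safe to
 * release again, mirroring the buf->buffer = NULL assignments above. */
static void demo_buf_release(struct demo_buf *buf)
{
        free(buf->buffer);
        buf->buffer = NULL;     /* a second release is now a harmless no-op */
        free(buf->data);
        buf->data = NULL;
}

int main(void)
{
        struct demo_buf b = { malloc(32), malloc(64) };

        demo_buf_release(&b);
        demo_buf_release(&b);   /* safe: free(NULL) does nothing */
        return 0;
}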
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index ec0f9aa4e151..1b87157edbff 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2213,6 +2213,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
{
struct trace_event_call *call, *p;
const char *last_system = NULL;
+ bool first = false;
int last_i;
int i;
@@ -2220,15 +2221,28 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
list_for_each_entry_safe(call, p, &ftrace_events, list) {
/* events are usually grouped together with systems */
if (!last_system || call->class->system != last_system) {
+ first = true;
last_i = 0;
last_system = call->class->system;
}
+ /*
+ * Since calls are grouped by systems, the likelihood that the
+ * next call in the iteration belongs to the same system as the
+ * previous call is high. As an optimization, we skip searching
+ * for a map[] that matches the call's system if the last call
+ * was from the same system. That's what last_i is for. If the
+ * call has the same system as the previous call, then last_i
+ * will be the index of the first map[] that has a matching
+ * system.
+ */
for (i = last_i; i < len; i++) {
if (call->class->system == map[i]->system) {
/* Save the first system if need be */
- if (!last_i)
+ if (first) {
last_i = i;
+ first = false;
+ }
update_event_printk(call, map[i]);
}
}
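
A stripped-down user-space model of the last_i/first bookkeeping the new comment describes, where entries grouped by system let the inner search resume from the first matching map of the previous group (all names and data here are illustrative):

#include <stdio.h>
#include <string.h>

/* Toy stand-ins: both "calls" and "maps" carry a system name, and the
 * calls arrive grouped by system, as the comment above assumes. */
static const char *calls[] = { "sched", "sched", "irq", "irq", "irq" };
static const char *maps[]  = { "sched", "sched", "irq", "irq" };

int main(void)
{
        const char *last_system = NULL;
        size_t last_i = 0;
        int first = 0, matches = 0;

        for (size_t c = 0; c < sizeof(calls) / sizeof(calls[0]); c++) {
                if (!last_system || strcmp(calls[c], last_system) != 0) {
                        first = 1;          /* new group: restart the map search */
                        last_i = 0;
                        last_system = calls[c];
                }
                for (size_t i = last_i; i < sizeof(maps) / sizeof(maps[0]); i++) {
                        if (strcmp(calls[c], maps[i]) == 0) {
                                if (first) {    /* remember where this group's maps start */
                                        last_i = i;
                                        first = 0;
                                }
                                matches++;
                        }
                }
        }
        printf("matches: %d\n", matches);   /* 2*2 for sched + 3*2 for irq = 10 */
        return 0;
}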
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 734accc02418..3c7bfc4bf5e9 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -209,6 +209,10 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
if (__this_cpu_read(disable_stack_tracer) != 1)
goto out;
+ /* If RCU is not watching, then saving the stack trace can fail */
+ if (!rcu_is_watching())
+ goto out;
+
ip += MCOUNT_INSN_SIZE;
check_stack(ip, &stack);
diff --git a/kernel/uid16.c b/kernel/uid16.c
index ce74a4901d2b..ef1da2a5f9bd 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -192,6 +192,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
return retval;
}
+ groups_sort(group_info);
retval = set_current_groups(group_info);
put_group_info(group_info);
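
The added groups_sort() call matters because, once installed, the group list is consulted by binary search, which only works on sorted data. A user-space sketch of that sort-before-bsearch contract (the types and helpers are stand-ins, not the kernel API):

#include <stdio.h>
#include <stdlib.h>

/* Plain stand-in for a group id; the kernel uses kgid_t. */
typedef unsigned int demo_gid_t;

static int gid_cmp(const void *a, const void *b)
{
        demo_gid_t ga = *(const demo_gid_t *)a, gb = *(const demo_gid_t *)b;

        return (ga > gb) - (ga < gb);
}

/* Membership test by binary search: only valid on a sorted array,
 * which is why the list must be sorted before it is installed. */
static int in_group(const demo_gid_t *gids, size_t n, demo_gid_t gid)
{
        return bsearch(&gid, gids, n, sizeof(*gids), gid_cmp) != NULL;
}

int main(void)
{
        demo_gid_t gids[] = { 1000, 4, 27, 100 };   /* as supplied by user space */
        size_t n = sizeof(gids) / sizeof(gids[0]);

        qsort(gids, n, sizeof(*gids), gid_cmp);     /* the groups_sort() step */
        printf("27 member? %d\n", in_group(gids, n, 27));   /* 1 */
        printf("28 member? %d\n", in_group(gids, n, 28));   /* 0 */
        return 0;
}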
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8fdb710bfdd7..f699122dab32 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -38,7 +38,6 @@
#include <linux/hardirq.h>
#include <linux/mempolicy.h>
#include <linux/freezer.h>
-#include <linux/kallsyms.h>
#include <linux/debug_locks.h>
#include <linux/lockdep.h>
#include <linux/idr.h>
@@ -48,6 +47,8 @@
#include <linux/nodemask.h>
#include <linux/moduleparam.h>
#include <linux/uaccess.h>
+#include <linux/sched/isolation.h>
+#include <linux/nmi.h>
#include "workqueue_internal.h"
@@ -1634,7 +1635,7 @@ static void worker_enter_idle(struct worker *worker)
mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
/*
- * Sanity check nr_running. Because wq_unbind_fn() releases
+ * Sanity check nr_running. Because unbind_workers() releases
* pool->lock between setting %WORKER_UNBOUND and zapping
* nr_running, the warning may trigger spuriously. Check iff
* unbind is not in progress.
@@ -4463,6 +4464,12 @@ void show_workqueue_state(void)
if (pwq->nr_active || !list_empty(&pwq->delayed_works))
show_pwq(pwq);
spin_unlock_irqrestore(&pwq->pool->lock, flags);
+ /*
+ * We could be printing a lot from atomic context, e.g.
+ * sysrq-t -> show_workqueue_state(). Avoid triggering
+ * hard lockup.
+ */
+ touch_nmi_watchdog();
}
}
@@ -4490,6 +4497,12 @@ void show_workqueue_state(void)
pr_cont("\n");
next_pool:
spin_unlock_irqrestore(&pool->lock, flags);
+ /*
+ * We could be printing a lot from atomic context, e.g.
+ * sysrq-t -> show_workqueue_state(). Avoid triggering
+ * hard lockup.
+ */
+ touch_nmi_watchdog();
}
rcu_read_unlock_sched();
@@ -4510,9 +4523,8 @@ void show_workqueue_state(void)
* cpu comes back online.
*/
-static void wq_unbind_fn(struct work_struct *work)
+static void unbind_workers(int cpu)
{
- int cpu = smp_processor_id();
struct worker_pool *pool;
struct worker *worker;
@@ -4589,16 +4601,6 @@ static void rebind_workers(struct worker_pool *pool)
spin_lock_irq(&pool->lock);
- /*
- * XXX: CPU hotplug notifiers are weird and can call DOWN_FAILED
- * w/o preceding DOWN_PREPARE. Work around it. CPU hotplug is
- * being reworked and this can go away in time.
- */
- if (!(pool->flags & POOL_DISASSOCIATED)) {
- spin_unlock_irq(&pool->lock);
- return;
- }
-
pool->flags &= ~POOL_DISASSOCIATED;
for_each_pool_worker(worker, pool) {
@@ -4709,12 +4711,13 @@ int workqueue_online_cpu(unsigned int cpu)
int workqueue_offline_cpu(unsigned int cpu)
{
- struct work_struct unbind_work;
struct workqueue_struct *wq;
/* unbinding per-cpu workers should happen on the local CPU */
- INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
- queue_work_on(cpu, system_highpri_wq, &unbind_work);
+ if (WARN_ON(cpu != smp_processor_id()))
+ return -1;
+
+ unbind_workers(cpu);
/* update NUMA affinity of unbound workqueues */
mutex_lock(&wq_pool_mutex);
@@ -4722,9 +4725,6 @@ int workqueue_offline_cpu(unsigned int cpu)
wq_update_unbound_numa(wq, cpu, false);
mutex_unlock(&wq_pool_mutex);
- /* wait for per-cpu unbinding to finish */
- flush_work(&unbind_work);
- destroy_work_on_stack(&unbind_work);
return 0;
}
@@ -4957,6 +4957,10 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL))
return -ENOMEM;
+ /*
+ * Not excluding isolated CPUs on purpose.
+ * If the user wishes to include them, we allow that.
+ */
cpumask_and(cpumask, cpumask, cpu_possible_mask);
if (!cpumask_empty(cpumask)) {
apply_wqattrs_lock();
@@ -5555,7 +5559,7 @@ int __init workqueue_init_early(void)
WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
- cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
+ cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN));
pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);