summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorHuang, Tao <huangtao@rock-chips.com>2016-11-04 14:23:25 +0800
committerHuang, Tao <huangtao@rock-chips.com>2016-11-04 14:30:24 +0800
commitf9ae5d202b3953b5d69e860e540a6f53df7015b5 (patch)
tree92585302aa605d32b2e4782c7b4f700b1a5c2c35 /kernel
parent1429ad62bef1098f1cec26447cdc12934879a23c (diff)
parent79df8fa79b6a2aced892ad2b2c9832e7d9bdea6b (diff)
Merge branch 'linux-linaro-lsk-v4.4-android' of git://git.linaro.org/kernel/linux-linaro-stable.git
* linux-linaro-lsk-v4.4-android: (1362 commits) Linux 4.4.30 Revert "fix minor infoleak in get_user_ex()" Revert "x86/mm: Expand the exception table logic to allow new handling options" Linux 4.4.29 ARM: pxa: pxa_cplds: fix interrupt handling powerpc/nvram: Fix an incorrect partition merge mpt3sas: Don't spam logs if logging level is 0 perf symbols: Fixup symbol sizes before picking best ones perf symbols: Check symbol_conf.allow_aliases for kallsyms loading too perf hists browser: Fix event group display clk: divider: Fix clk_divider_round_rate() to use clk_readl() clk: qoriq: fix a register offset error s390/con3270: fix insufficient space padding s390/con3270: fix use of uninitialised data s390/cio: fix accidental interrupt enabling during resume x86/mm: Expand the exception table logic to allow new handling options dmaengine: ipu: remove bogus NO_IRQ reference power: bq24257: Fix use of uninitialized pointer bq->charger staging: r8188eu: Fix scheduling while atomic splat ASoC: dapm: Fix kcontrol creation for output driver widget ...
Diffstat (limited to 'kernel')
-rw-r--r--kernel/audit.c8
-rw-r--r--kernel/audit_watch.c8
-rw-r--r--kernel/auditsc.c344
-rw-r--r--kernel/capability.c46
-rw-r--r--kernel/cgroup.c67
-rw-r--r--kernel/configs/tiny.config8
-rw-r--r--kernel/cpu.c11
-rw-r--r--kernel/cpuset.c65
-rw-r--r--kernel/cred.c2
-rw-r--r--kernel/events/core.c23
-rw-r--r--kernel/events/uprobes.c8
-rw-r--r--kernel/exit.c5
-rw-r--r--kernel/fork.c37
-rw-r--r--kernel/futex.c2
-rw-r--r--kernel/irq/generic-chip.c21
-rw-r--r--kernel/irq/msi.c20
-rw-r--r--kernel/kexec_file.c3
-rw-r--r--kernel/locking/percpu-rwsem.c229
-rw-r--r--kernel/module.c13
-rw-r--r--kernel/panic.c8
-rw-r--r--kernel/power/hibernate.c4
-rw-r--r--kernel/power/snapshot.c10
-rw-r--r--kernel/printk/braille.c4
-rw-r--r--kernel/rcu/sync.c13
-rw-r--r--kernel/sched/Makefile3
-rw-r--r--kernel/sched/core.c209
-rw-r--r--kernel/sched/cpufreq_sched.c164
-rw-r--r--kernel/sched/cputime.c31
-rw-r--r--kernel/sched/fair.c629
-rw-r--r--kernel/sched/features.h4
-rw-r--r--kernel/sched/idle.c1
-rw-r--r--kernel/sched/rt.c57
-rw-r--r--kernel/sched/sched.h119
-rw-r--r--kernel/sched/stop_task.c3
-rw-r--r--kernel/sched/tune.c466
-rw-r--r--kernel/sched/tune.h26
-rw-r--r--kernel/sched/walt.c1168
-rw-r--r--kernel/sched/walt.h62
-rw-r--r--kernel/sysctl.c103
-rw-r--r--kernel/time/clocksource.c52
-rw-r--r--kernel/time/hrtimer.c7
-rw-r--r--kernel/time/ntp.c20
-rw-r--r--kernel/time/posix-cpu-timers.c1
-rw-r--r--kernel/time/timekeeping.c35
-rw-r--r--kernel/time/timekeeping_debug.c9
-rw-r--r--kernel/trace/Makefile4
-rw-r--r--kernel/trace/trace.c29
47 files changed, 3391 insertions, 770 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index 5ffcbd354a52..34f690b9213a 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -870,6 +870,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
return err;
}
if (s.mask & AUDIT_STATUS_PID) {
+ /* NOTE: we are using task_tgid_vnr() below because
+ * the s.pid value is relative to the namespace
+ * of the caller; at present this doesn't matter
+ * much since you can really only run auditd
+ * from the initial pid namespace, but something
+ * to keep in mind if this changes */
int new_pid = s.pid;
if ((!new_pid) && (task_tgid_vnr(current) != audit_pid))
@@ -1896,7 +1902,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
" euid=%u suid=%u fsuid=%u"
" egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
task_ppid_nr(tsk),
- task_pid_nr(tsk),
+ task_tgid_nr(tsk),
from_kuid(&init_user_ns, audit_get_loginuid(tsk)),
from_kuid(&init_user_ns, cred->uid),
from_kgid(&init_user_ns, cred->gid),
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 656c7e93ac0d..939945a5649c 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -19,6 +19,7 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/audit.h>
#include <linux/kthread.h>
@@ -544,10 +545,11 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark)
unsigned long ino;
dev_t dev;
- rcu_read_lock();
- exe_file = rcu_dereference(tsk->mm->exe_file);
+ exe_file = get_task_exe_file(tsk);
+ if (!exe_file)
+ return 0;
ino = exe_file->f_inode->i_ino;
dev = exe_file->f_inode->i_sb->s_dev;
- rcu_read_unlock();
+ fput(exe_file);
return audit_mark_compare(mark, ino, dev);
}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b86cc04959de..63f0e495f517 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -73,6 +73,7 @@
#include <linux/compat.h>
#include <linux/ctype.h>
#include <linux/string.h>
+#include <linux/uaccess.h>
#include <uapi/linux/limits.h>
#include "audit.h"
@@ -82,7 +83,8 @@
#define AUDITSC_SUCCESS 1
#define AUDITSC_FAILURE 2
-/* no execve audit message should be longer than this (userspace limits) */
+/* no execve audit message should be longer than this (userspace limits),
+ * see the note near the top of audit_log_execve_info() about this value */
#define MAX_EXECVE_AUDIT_LEN 7500
/* max length to print of cmdline/proctitle value during audit */
@@ -456,7 +458,7 @@ static int audit_filter_rules(struct task_struct *tsk,
switch (f->type) {
case AUDIT_PID:
- pid = task_pid_nr(tsk);
+ pid = task_tgid_nr(tsk);
result = audit_comparator(pid, f->op, f->val);
break;
case AUDIT_PPID:
@@ -988,184 +990,178 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
return rc;
}
-/*
- * to_send and len_sent accounting are very loose estimates. We aren't
- * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being
- * within about 500 bytes (next page boundary)
- *
- * why snprintf? an int is up to 12 digits long. if we just assumed when
- * logging that a[%d]= was going to be 16 characters long we would be wasting
- * space in every audit message. In one 7500 byte message we can log up to
- * about 1000 min size arguments. That comes down to about 50% waste of space
- * if we didn't do the snprintf to find out how long arg_num_len was.
- */
-static int audit_log_single_execve_arg(struct audit_context *context,
- struct audit_buffer **ab,
- int arg_num,
- size_t *len_sent,
- const char __user *p,
- char *buf)
+static void audit_log_execve_info(struct audit_context *context,
+ struct audit_buffer **ab)
{
- char arg_num_len_buf[12];
- const char __user *tmp_p = p;
- /* how many digits are in arg_num? 5 is the length of ' a=""' */
- size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5;
- size_t len, len_left, to_send;
- size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
- unsigned int i, has_cntl = 0, too_long = 0;
- int ret;
-
- /* strnlen_user includes the null we don't want to send */
- len_left = len = strnlen_user(p, MAX_ARG_STRLEN) - 1;
-
- /*
- * We just created this mm, if we can't find the strings
- * we just copied into it something is _very_ wrong. Similar
- * for strings that are too long, we should not have created
- * any.
- */
- if (WARN_ON_ONCE(len < 0 || len > MAX_ARG_STRLEN - 1)) {
- send_sig(SIGKILL, current, 0);
- return -1;
+ long len_max;
+ long len_rem;
+ long len_full;
+ long len_buf;
+ long len_abuf;
+ long len_tmp;
+ bool require_data;
+ bool encode;
+ unsigned int iter;
+ unsigned int arg;
+ char *buf_head;
+ char *buf;
+ const char __user *p = (const char __user *)current->mm->arg_start;
+
+ /* NOTE: this buffer needs to be large enough to hold all the non-arg
+ * data we put in the audit record for this argument (see the
+ * code below) ... at this point in time 96 is plenty */
+ char abuf[96];
+
+ /* NOTE: we set MAX_EXECVE_AUDIT_LEN to a rather arbitrary limit, the
+ * current value of 7500 is not as important as the fact that it
+ * is less than 8k, a setting of 7500 gives us plenty of wiggle
+ * room if we go over a little bit in the logging below */
+ WARN_ON_ONCE(MAX_EXECVE_AUDIT_LEN > 7500);
+ len_max = MAX_EXECVE_AUDIT_LEN;
+
+ /* scratch buffer to hold the userspace args */
+ buf_head = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL);
+ if (!buf_head) {
+ audit_panic("out of memory for argv string");
+ return;
}
+ buf = buf_head;
- /* walk the whole argument looking for non-ascii chars */
+ audit_log_format(*ab, "argc=%d", context->execve.argc);
+
+ len_rem = len_max;
+ len_buf = 0;
+ len_full = 0;
+ require_data = true;
+ encode = false;
+ iter = 0;
+ arg = 0;
do {
- if (len_left > MAX_EXECVE_AUDIT_LEN)
- to_send = MAX_EXECVE_AUDIT_LEN;
- else
- to_send = len_left;
- ret = copy_from_user(buf, tmp_p, to_send);
- /*
- * There is no reason for this copy to be short. We just
- * copied them here, and the mm hasn't been exposed to user-
- * space yet.
- */
- if (ret) {
- WARN_ON(1);
- send_sig(SIGKILL, current, 0);
- return -1;
- }
- buf[to_send] = '\0';
- has_cntl = audit_string_contains_control(buf, to_send);
- if (has_cntl) {
- /*
- * hex messages get logged as 2 bytes, so we can only
- * send half as much in each message
- */
- max_execve_audit_len = MAX_EXECVE_AUDIT_LEN / 2;
- break;
- }
- len_left -= to_send;
- tmp_p += to_send;
- } while (len_left > 0);
-
- len_left = len;
-
- if (len > max_execve_audit_len)
- too_long = 1;
-
- /* rewalk the argument actually logging the message */
- for (i = 0; len_left > 0; i++) {
- int room_left;
-
- if (len_left > max_execve_audit_len)
- to_send = max_execve_audit_len;
- else
- to_send = len_left;
-
- /* do we have space left to send this argument in this ab? */
- room_left = MAX_EXECVE_AUDIT_LEN - arg_num_len - *len_sent;
- if (has_cntl)
- room_left -= (to_send * 2);
- else
- room_left -= to_send;
- if (room_left < 0) {
- *len_sent = 0;
- audit_log_end(*ab);
- *ab = audit_log_start(context, GFP_KERNEL, AUDIT_EXECVE);
- if (!*ab)
- return 0;
- }
+ /* NOTE: we don't ever want to trust this value for anything
+ * serious, but the audit record format insists we
+ * provide an argument length for really long arguments,
+ * e.g. > MAX_EXECVE_AUDIT_LEN, so we have no choice but
+ * to use strncpy_from_user() to obtain this value for
+ * recording in the log, although we don't use it
+ * anywhere here to avoid a double-fetch problem */
+ if (len_full == 0)
+ len_full = strnlen_user(p, MAX_ARG_STRLEN) - 1;
+
+ /* read more data from userspace */
+ if (require_data) {
+ /* can we make more room in the buffer? */
+ if (buf != buf_head) {
+ memmove(buf_head, buf, len_buf);
+ buf = buf_head;
+ }
+
+ /* fetch as much as we can of the argument */
+ len_tmp = strncpy_from_user(&buf_head[len_buf], p,
+ len_max - len_buf);
+ if (len_tmp == -EFAULT) {
+ /* unable to copy from userspace */
+ send_sig(SIGKILL, current, 0);
+ goto out;
+ } else if (len_tmp == (len_max - len_buf)) {
+ /* buffer is not large enough */
+ require_data = true;
+ /* NOTE: if we are going to span multiple
+ * buffers force the encoding so we stand
+ * a chance at a sane len_full value and
+ * consistent record encoding */
+ encode = true;
+ len_full = len_full * 2;
+ p += len_tmp;
+ } else {
+ require_data = false;
+ if (!encode)
+ encode = audit_string_contains_control(
+ buf, len_tmp);
+ /* try to use a trusted value for len_full */
+ if (len_full < len_max)
+ len_full = (encode ?
+ len_tmp * 2 : len_tmp);
+ p += len_tmp + 1;
+ }
+ len_buf += len_tmp;
+ buf_head[len_buf] = '\0';
- /*
- * first record needs to say how long the original string was
- * so we can be sure nothing was lost.
- */
- if ((i == 0) && (too_long))
- audit_log_format(*ab, " a%d_len=%zu", arg_num,
- has_cntl ? 2*len : len);
-
- /*
- * normally arguments are small enough to fit and we already
- * filled buf above when we checked for control characters
- * so don't bother with another copy_from_user
- */
- if (len >= max_execve_audit_len)
- ret = copy_from_user(buf, p, to_send);
- else
- ret = 0;
- if (ret) {
- WARN_ON(1);
- send_sig(SIGKILL, current, 0);
- return -1;
+ /* length of the buffer in the audit record? */
+ len_abuf = (encode ? len_buf * 2 : len_buf + 2);
}
- buf[to_send] = '\0';
-
- /* actually log it */
- audit_log_format(*ab, " a%d", arg_num);
- if (too_long)
- audit_log_format(*ab, "[%d]", i);
- audit_log_format(*ab, "=");
- if (has_cntl)
- audit_log_n_hex(*ab, buf, to_send);
- else
- audit_log_string(*ab, buf);
-
- p += to_send;
- len_left -= to_send;
- *len_sent += arg_num_len;
- if (has_cntl)
- *len_sent += to_send * 2;
- else
- *len_sent += to_send;
- }
- /* include the null we didn't log */
- return len + 1;
-}
-static void audit_log_execve_info(struct audit_context *context,
- struct audit_buffer **ab)
-{
- int i, len;
- size_t len_sent = 0;
- const char __user *p;
- char *buf;
+ /* write as much as we can to the audit log */
+ if (len_buf > 0) {
+ /* NOTE: some magic numbers here - basically if we
+ * can't fit a reasonable amount of data into the
+ * existing audit buffer, flush it and start with
+ * a new buffer */
+ if ((sizeof(abuf) + 8) > len_rem) {
+ len_rem = len_max;
+ audit_log_end(*ab);
+ *ab = audit_log_start(context,
+ GFP_KERNEL, AUDIT_EXECVE);
+ if (!*ab)
+ goto out;
+ }
- p = (const char __user *)current->mm->arg_start;
+ /* create the non-arg portion of the arg record */
+ len_tmp = 0;
+ if (require_data || (iter > 0) ||
+ ((len_abuf + sizeof(abuf)) > len_rem)) {
+ if (iter == 0) {
+ len_tmp += snprintf(&abuf[len_tmp],
+ sizeof(abuf) - len_tmp,
+ " a%d_len=%lu",
+ arg, len_full);
+ }
+ len_tmp += snprintf(&abuf[len_tmp],
+ sizeof(abuf) - len_tmp,
+ " a%d[%d]=", arg, iter++);
+ } else
+ len_tmp += snprintf(&abuf[len_tmp],
+ sizeof(abuf) - len_tmp,
+ " a%d=", arg);
+ WARN_ON(len_tmp >= sizeof(abuf));
+ abuf[sizeof(abuf) - 1] = '\0';
+
+ /* log the arg in the audit record */
+ audit_log_format(*ab, "%s", abuf);
+ len_rem -= len_tmp;
+ len_tmp = len_buf;
+ if (encode) {
+ if (len_abuf > len_rem)
+ len_tmp = len_rem / 2; /* encoding */
+ audit_log_n_hex(*ab, buf, len_tmp);
+ len_rem -= len_tmp * 2;
+ len_abuf -= len_tmp * 2;
+ } else {
+ if (len_abuf > len_rem)
+ len_tmp = len_rem - 2; /* quotes */
+ audit_log_n_string(*ab, buf, len_tmp);
+ len_rem -= len_tmp + 2;
+ /* don't subtract the "2" because we still need
+ * to add quotes to the remaining string */
+ len_abuf -= len_tmp;
+ }
+ len_buf -= len_tmp;
+ buf += len_tmp;
+ }
- audit_log_format(*ab, "argc=%d", context->execve.argc);
+ /* ready to move to the next argument? */
+ if ((len_buf == 0) && !require_data) {
+ arg++;
+ iter = 0;
+ len_full = 0;
+ require_data = true;
+ encode = false;
+ }
+ } while (arg < context->execve.argc);
- /*
- * we need some kernel buffer to hold the userspace args. Just
- * allocate one big one rather than allocating one of the right size
- * for every single argument inside audit_log_single_execve_arg()
- * should be <8k allocation so should be pretty safe.
- */
- buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL);
- if (!buf) {
- audit_panic("out of memory for argv string");
- return;
- }
+ /* NOTE: the caller handles the final audit_log_end() call */
- for (i = 0; i < context->execve.argc; i++) {
- len = audit_log_single_execve_arg(context, ab, i,
- &len_sent, p, buf);
- if (len <= 0)
- break;
- p += len;
- }
- kfree(buf);
+out:
+ kfree(buf_head);
}
static void show_special(struct audit_context *context, int *call_panic)
@@ -1991,7 +1987,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
if (!ab)
return;
- audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid);
+ audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid);
audit_log_task_context(ab);
audit_log_format(ab, " old-auid=%u auid=%u old-ses=%u ses=%u res=%d",
oldloginuid, loginuid, oldsessionid, sessionid, !rc);
@@ -2216,7 +2212,7 @@ void __audit_ptrace(struct task_struct *t)
{
struct audit_context *context = current->audit_context;
- context->target_pid = task_pid_nr(t);
+ context->target_pid = task_tgid_nr(t);
context->target_auid = audit_get_loginuid(t);
context->target_uid = task_uid(t);
context->target_sessionid = audit_get_sessionid(t);
@@ -2241,7 +2237,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
if (audit_pid && t->tgid == audit_pid) {
if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
- audit_sig_pid = task_pid_nr(tsk);
+ audit_sig_pid = task_tgid_nr(tsk);
if (uid_valid(tsk->loginuid))
audit_sig_uid = tsk->loginuid;
else
@@ -2341,7 +2337,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
void __audit_log_capset(const struct cred *new, const struct cred *old)
{
struct audit_context *context = current->audit_context;
- context->capset.pid = task_pid_nr(current);
+ context->capset.pid = task_tgid_nr(current);
context->capset.cap.effective = new->cap_effective;
context->capset.cap.inheritable = new->cap_effective;
context->capset.cap.permitted = new->cap_permitted;
@@ -2373,7 +2369,7 @@ static void audit_log_task(struct audit_buffer *ab)
from_kgid(&init_user_ns, gid),
sessionid);
audit_log_task_context(ab);
- audit_log_format(ab, " pid=%d comm=", task_pid_nr(current));
+ audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current));
audit_log_untrustedstring(ab, get_task_comm(comm, current));
audit_log_d_path_exe(ab, current->mm);
}
diff --git a/kernel/capability.c b/kernel/capability.c
index 45432b54d5c6..00411c82dac5 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -361,6 +361,24 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
return has_ns_capability_noaudit(t, &init_user_ns, cap);
}
+static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit)
+{
+ int capable;
+
+ if (unlikely(!cap_valid(cap))) {
+ pr_crit("capable() called with invalid cap=%u\n", cap);
+ BUG();
+ }
+
+ capable = audit ? security_capable(current_cred(), ns, cap) :
+ security_capable_noaudit(current_cred(), ns, cap);
+ if (capable == 0) {
+ current->flags |= PF_SUPERPRIV;
+ return true;
+ }
+ return false;
+}
+
/**
* ns_capable - Determine if the current task has a superior capability in effect
* @ns: The usernamespace we want the capability in
@@ -374,19 +392,27 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
*/
bool ns_capable(struct user_namespace *ns, int cap)
{
- if (unlikely(!cap_valid(cap))) {
- pr_crit("capable() called with invalid cap=%u\n", cap);
- BUG();
- }
-
- if (security_capable(current_cred(), ns, cap) == 0) {
- current->flags |= PF_SUPERPRIV;
- return true;
- }
- return false;
+ return ns_capable_common(ns, cap, true);
}
EXPORT_SYMBOL(ns_capable);
+/**
+ * ns_capable_noaudit - Determine if the current task has a superior capability
+ * (unaudited) in effect
+ * @ns: The usernamespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+bool ns_capable_noaudit(struct user_namespace *ns, int cap)
+{
+ return ns_capable_common(ns, cap, false);
+}
+EXPORT_SYMBOL(ns_capable_noaudit);
/**
* capable - Determine if the current task has a superior capability in effect
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 47851dbc893d..e4552a3cbf41 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2671,45 +2671,6 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
return ret;
}
-int subsys_cgroup_allow_attach(struct cgroup_taskset *tset)
-{
- const struct cred *cred = current_cred(), *tcred;
- struct task_struct *task;
- struct cgroup_subsys_state *css;
-
- if (capable(CAP_SYS_NICE))
- return 0;
-
- cgroup_taskset_for_each(task, css, tset) {
- tcred = __task_cred(task);
-
- if (current != task && !uid_eq(cred->euid, tcred->uid) &&
- !uid_eq(cred->euid, tcred->suid))
- return -EACCES;
- }
-
- return 0;
-}
-
-static int cgroup_allow_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
-{
- struct cgroup_subsys_state *css;
- int i;
- int ret;
-
- for_each_css(css, i, cgrp) {
- if (css->ss->allow_attach) {
- ret = css->ss->allow_attach(tset);
- if (ret)
- return ret;
- } else {
- return -EACCES;
- }
- }
-
- return 0;
-}
-
static int cgroup_procs_write_permission(struct task_struct *task,
struct cgroup *dst_cgrp,
struct kernfs_open_file *of)
@@ -2724,24 +2685,9 @@ static int cgroup_procs_write_permission(struct task_struct *task,
*/
if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
!uid_eq(cred->euid, tcred->uid) &&
- !uid_eq(cred->euid, tcred->suid)) {
- /*
- * if the default permission check fails, give each
- * cgroup a chance to extend the permission check
- */
- struct cgroup_taskset tset = {
- .src_csets = LIST_HEAD_INIT(tset.src_csets),
- .dst_csets = LIST_HEAD_INIT(tset.dst_csets),
- .csets = &tset.src_csets,
- };
- struct css_set *cset;
- cset = task_css_set(task);
- list_add(&cset->mg_node, &tset.src_csets);
- ret = cgroup_allow_attach(dst_cgrp, &tset);
- list_del(&tset.src_csets);
- if (ret)
- ret = -EACCES;
- }
+ !uid_eq(cred->euid, tcred->suid) &&
+ !ns_capable(tcred->user_ns, CAP_SYS_RESOURCE))
+ ret = -EACCES;
if (!ret && cgroup_on_dfl(dst_cgrp)) {
struct super_block *sb = of->file->f_path.dentry->d_sb;
@@ -4848,6 +4794,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
memset(css, 0, sizeof(*css));
css->cgroup = cgrp;
css->ss = ss;
+ css->id = -1;
INIT_LIST_HEAD(&css->sibling);
INIT_LIST_HEAD(&css->children);
css->serial_nr = css_serial_nr_next++;
@@ -5379,6 +5326,12 @@ int __init cgroup_init(void)
BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
+ /*
+ * The latency of the synchronize_sched() is too high for cgroups,
+ * avoid it at the cost of forcing all readers into the slow path.
+ */
+ rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
+
mutex_lock(&cgroup_mutex);
/* Add init_css_set to the hash table */
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config
index c2de56ab0fce..7fa0c4ae6394 100644
--- a/kernel/configs/tiny.config
+++ b/kernel/configs/tiny.config
@@ -1,4 +1,12 @@
+# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+# CONFIG_KERNEL_GZIP is not set
+# CONFIG_KERNEL_BZIP2 is not set
+# CONFIG_KERNEL_LZMA is not set
CONFIG_KERNEL_XZ=y
+# CONFIG_KERNEL_LZO is not set
+# CONFIG_KERNEL_LZ4 is not set
CONFIG_OPTIMIZE_INLINING=y
+# CONFIG_SLAB is not set
+# CONFIG_SLUB is not set
CONFIG_SLOB=y
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 9ced7c751648..c8a1751be224 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -185,10 +185,17 @@ void cpu_hotplug_disable(void)
}
EXPORT_SYMBOL_GPL(cpu_hotplug_disable);
+static void __cpu_hotplug_enable(void)
+{
+ if (WARN_ONCE(!cpu_hotplug_disabled, "Unbalanced cpu hotplug enable\n"))
+ return;
+ cpu_hotplug_disabled--;
+}
+
void cpu_hotplug_enable(void)
{
cpu_maps_update_begin();
- WARN_ON(--cpu_hotplug_disabled < 0);
+ __cpu_hotplug_enable();
cpu_maps_update_done();
}
EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
@@ -631,7 +638,7 @@ void enable_nonboot_cpus(void)
/* Allow everyone to use the CPU hotplug again */
cpu_maps_update_begin();
- WARN_ON(--cpu_hotplug_disabled < 0);
+ __cpu_hotplug_enable();
if (cpumask_empty(frozen_cpus))
goto out;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 11eaf14b52c2..3f9db31c5d04 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -98,6 +98,7 @@ struct cpuset {
/* user-configured CPUs and Memory Nodes allow to tasks */
cpumask_var_t cpus_allowed;
+ cpumask_var_t cpus_requested;
nodemask_t mems_allowed;
/* effective CPUs and Memory Nodes allow to tasks */
@@ -324,8 +325,7 @@ static struct file_system_type cpuset_fs_type = {
/*
* Return in pmask the portion of a cpusets's cpus_allowed that
* are online. If none are online, walk up the cpuset hierarchy
- * until we find one that does have some online cpus. The top
- * cpuset always has some cpus online.
+ * until we find one that does have some online cpus.
*
* One way or another, we guarantee to return some non-empty subset
* of cpu_online_mask.
@@ -334,8 +334,20 @@ static struct file_system_type cpuset_fs_type = {
*/
static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
{
- while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
+ while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
cs = parent_cs(cs);
+ if (unlikely(!cs)) {
+ /*
+ * The top cpuset doesn't have any online cpu as a
+ * consequence of a race between cpuset_hotplug_work
+ * and cpu hotplug notifier. But we know the top
+ * cpuset's effective_cpus is on its way to to be
+ * identical to cpu_online_mask.
+ */
+ cpumask_copy(pmask, cpu_online_mask);
+ return;
+ }
+ }
cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
}
@@ -386,7 +398,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
- return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
+ return cpumask_subset(p->cpus_requested, q->cpus_requested) &&
nodes_subset(p->mems_allowed, q->mems_allowed) &&
is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
is_mem_exclusive(p) <= is_mem_exclusive(q);
@@ -486,7 +498,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
cpuset_for_each_child(c, css, par) {
if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
c != cur &&
- cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
+ cpumask_intersects(trial->cpus_requested, c->cpus_requested))
goto out;
if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
c != cur &&
@@ -945,17 +957,18 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (!*buf) {
cpumask_clear(trialcs->cpus_allowed);
} else {
- retval = cpulist_parse(buf, trialcs->cpus_allowed);
+ retval = cpulist_parse(buf, trialcs->cpus_requested);
if (retval < 0)
return retval;
- if (!cpumask_subset(trialcs->cpus_allowed,
- top_cpuset.cpus_allowed))
+ if (!cpumask_subset(trialcs->cpus_requested, cpu_present_mask))
return -EINVAL;
+
+ cpumask_and(trialcs->cpus_allowed, trialcs->cpus_requested, cpu_active_mask);
}
/* Nothing to do if the cpus didn't change */
- if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
+ if (cpumask_equal(cs->cpus_requested, trialcs->cpus_requested))
return 0;
retval = validate_change(cs, trialcs);
@@ -964,6 +977,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
+ cpumask_copy(cs->cpus_requested, trialcs->cpus_requested);
spin_unlock_irq(&callback_lock);
/* use trialcs->cpus_allowed as a temp variable */
@@ -1754,7 +1768,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
switch (type) {
case FILE_CPULIST:
- seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_requested));
break;
case FILE_MEMLIST:
seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
@@ -1943,11 +1957,14 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
return ERR_PTR(-ENOMEM);
if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
goto free_cs;
+ if (!alloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL))
+ goto free_allowed;
if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
- goto free_cpus;
+ goto free_requested;
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpumask_clear(cs->cpus_allowed);
+ cpumask_clear(cs->cpus_requested);
nodes_clear(cs->mems_allowed);
cpumask_clear(cs->effective_cpus);
nodes_clear(cs->effective_mems);
@@ -1956,7 +1973,9 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
return &cs->css;
-free_cpus:
+free_requested:
+ free_cpumask_var(cs->cpus_requested);
+free_allowed:
free_cpumask_var(cs->cpus_allowed);
free_cs:
kfree(cs);
@@ -2019,6 +2038,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cs->mems_allowed = parent->mems_allowed;
cs->effective_mems = parent->mems_allowed;
cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
+ cpumask_copy(cs->cpus_requested, parent->cpus_requested);
cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
spin_unlock_irq(&callback_lock);
out_unlock:
@@ -2053,6 +2073,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
free_cpumask_var(cs->effective_cpus);
free_cpumask_var(cs->cpus_allowed);
+ free_cpumask_var(cs->cpus_requested);
kfree(cs);
}
@@ -2074,6 +2095,20 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
mutex_unlock(&cpuset_mutex);
}
+/*
+ * Make sure the new task conform to the current state of its parent,
+ * which could have been changed by cpuset just after it inherits the
+ * state from the parent and before it sits on the cgroup's task list.
+ */
+void cpuset_fork(struct task_struct *task, void *priv)
+{
+ if (task_css_is_root(task, cpuset_cgrp_id))
+ return;
+
+ set_cpus_allowed_ptr(task, &current->cpus_allowed);
+ task->mems_allowed = current->mems_allowed;
+}
+
struct cgroup_subsys cpuset_cgrp_subsys = {
.css_alloc = cpuset_css_alloc,
.css_online = cpuset_css_online,
@@ -2084,6 +2119,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
.attach = cpuset_attach,
.post_attach = cpuset_post_attach,
.bind = cpuset_bind,
+ .fork = cpuset_fork,
.legacy_cftypes = files,
.early_init = 1,
};
@@ -2102,8 +2138,11 @@ int __init cpuset_init(void)
BUG();
if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
BUG();
+ if (!alloc_cpumask_var(&top_cpuset.cpus_requested, GFP_KERNEL))
+ BUG();
cpumask_setall(top_cpuset.cpus_allowed);
+ cpumask_setall(top_cpuset.cpus_requested);
nodes_setall(top_cpuset.mems_allowed);
cpumask_setall(top_cpuset.effective_cpus);
nodes_setall(top_cpuset.effective_mems);
@@ -2237,7 +2276,7 @@ retry:
goto retry;
}
- cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
+ cpumask_and(&new_cpus, cs->cpus_requested, parent_cs(cs)->effective_cpus);
nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
diff --git a/kernel/cred.c b/kernel/cred.c
index 71179a09c1d6..ff8606f77d90 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -689,6 +689,8 @@ EXPORT_SYMBOL(set_security_override_from_ctx);
*/
int set_create_files_as(struct cred *new, struct inode *inode)
{
+ if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid))
+ return -EINVAL;
new->fsuid = inode->i_uid;
new->fsgid = inode->i_gid;
return security_kernel_create_files_as(new, inode);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 662f2b23632f..f4fdaff76f6d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1544,12 +1544,33 @@ static int __init perf_workqueue_init(void)
core_initcall(perf_workqueue_init);
-static inline int pmu_filter_match(struct perf_event *event)
+static inline int __pmu_filter_match(struct perf_event *event)
{
struct pmu *pmu = event->pmu;
return pmu->filter_match ? pmu->filter_match(event) : 1;
}
+/*
+ * Check whether we should attempt to schedule an event group based on
+ * PMU-specific filtering. An event group can consist of HW and SW events,
+ * potentially with a SW leader, so we must check all the filters, to
+ * determine whether a group is schedulable:
+ */
+static inline int pmu_filter_match(struct perf_event *event)
+{
+ struct perf_event *child;
+
+ if (!__pmu_filter_match(event))
+ return 0;
+
+ list_for_each_entry(child, &event->sibling_list, group_entry) {
+ if (!__pmu_filter_match(child))
+ return 0;
+ }
+
+ return 1;
+}
+
static inline int
event_filter_match(struct perf_event *event)
{
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 7dad84913abf..7b1b772ab1ce 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -171,8 +171,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
err = -EAGAIN;
ptep = page_check_address(page, mm, addr, &ptl, 0);
- if (!ptep)
+ if (!ptep) {
+ mem_cgroup_cancel_charge(kpage, memcg);
goto unlock;
+ }
get_page(kpage);
page_add_new_anon_rmap(kpage, vma, addr);
@@ -199,7 +201,6 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
err = 0;
unlock:
- mem_cgroup_cancel_charge(kpage, memcg);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
unlock_page(page);
return err;
@@ -1692,8 +1693,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
int result;
pagefault_disable();
- result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
- sizeof(opcode));
+ result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr);
pagefault_enable();
if (likely(result == 0))
diff --git a/kernel/exit.c b/kernel/exit.c
index ffba5df4abd5..62c4bd4abd3a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -54,6 +54,8 @@
#include <linux/writeback.h>
#include <linux/shm.h>
+#include "sched/tune.h"
+
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include <asm/pgtable.h>
@@ -699,6 +701,9 @@ void do_exit(long code)
}
exit_signals(tsk); /* sets PF_EXITING */
+
+ schedtune_exit_task(tsk);
+
/*
* tsk->flags are checked in the futex code to protect against
* an exiting task cleaning up the robust pi futexes.
diff --git a/kernel/fork.c b/kernel/fork.c
index 7ec6e9939b2c..18a5cb17035a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -764,6 +764,29 @@ struct file *get_mm_exe_file(struct mm_struct *mm)
EXPORT_SYMBOL(get_mm_exe_file);
/**
+ * get_task_exe_file - acquire a reference to the task's executable file
+ *
+ * Returns %NULL if task's mm (if any) has no associated executable file or
+ * this is a kernel thread with borrowed mm (see the comment above get_task_mm).
+ * User must release file via fput().
+ */
+struct file *get_task_exe_file(struct task_struct *task)
+{
+ struct file *exe_file = NULL;
+ struct mm_struct *mm;
+
+ task_lock(task);
+ mm = task->mm;
+ if (mm) {
+ if (!(task->flags & PF_KTHREAD))
+ exe_file = get_mm_exe_file(mm);
+ }
+ task_unlock(task);
+ return exe_file;
+}
+EXPORT_SYMBOL(get_task_exe_file);
+
+/**
* get_task_mm - acquire a reference to the task's mm
*
* Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning
@@ -879,14 +902,12 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
deactivate_mm(tsk, mm);
/*
- * If we're exiting normally, clear a user-space tid field if
- * requested. We leave this alone when dying by signal, to leave
- * the value intact in a core dump, and to save the unnecessary
- * trouble, say, a killed vfork parent shouldn't touch this mm.
- * Userland only wants this done for a sys_exit.
+ * Signal userspace if we're not exiting with a core dump
+ * because we want to leave the value intact for debugging
+ * purposes.
*/
if (tsk->clear_child_tid) {
- if (!(tsk->flags & PF_SIGNALED) &&
+ if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) &&
atomic_read(&mm->mm_users) > 1) {
/*
* We don't check the error code - if userspace has
@@ -1370,7 +1391,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->real_start_time = ktime_get_boot_ns();
p->io_context = NULL;
p->audit_context = NULL;
- threadgroup_change_begin(current);
cgroup_fork(p);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
@@ -1522,6 +1542,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL;
+ threadgroup_change_begin(current);
/*
* Ensure that the cgroup subsystem policies allow the new process to be
* forked. It should be noted the the new process's css_set can be changed
@@ -1622,6 +1643,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
bad_fork_cancel_cgroup:
cgroup_cancel_fork(p, cgrp_ss_priv);
bad_fork_free_pid:
+ threadgroup_change_end(current);
if (pid != &init_struct_pid)
free_pid(pid);
bad_fork_cleanup_io:
@@ -1652,7 +1674,6 @@ bad_fork_cleanup_policy:
mpol_put(p->mempolicy);
bad_fork_cleanup_threadgroup_lock:
#endif
- threadgroup_change_end(current);
delayacct_tsk_free(p);
bad_fork_cleanup_count:
atomic_dec(&p->cred->user->processes);
diff --git a/kernel/futex.c b/kernel/futex.c
index 9d8163afd87c..e8af73cc51a7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -681,7 +681,7 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from)
int ret;
pagefault_disable();
- ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
+ ret = __get_user(*dest, from);
pagefault_enable();
return ret ? -EFAULT : 0;
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index abd286afbd27..a4775f3451b9 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -411,8 +411,29 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
}
EXPORT_SYMBOL_GPL(irq_map_generic_chip);
+static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq)
+{
+ struct irq_data *data = irq_domain_get_irq_data(d, virq);
+ struct irq_domain_chip_generic *dgc = d->gc;
+ unsigned int hw_irq = data->hwirq;
+ struct irq_chip_generic *gc;
+ int irq_idx;
+
+ gc = irq_get_domain_generic_chip(d, hw_irq);
+ if (!gc)
+ return;
+
+ irq_idx = hw_irq % dgc->irqs_per_chip;
+
+ clear_bit(irq_idx, &gc->installed);
+ irq_domain_set_info(d, virq, hw_irq, &no_irq_chip, NULL, NULL, NULL,
+ NULL);
+
+}
+
struct irq_domain_ops irq_generic_chip_ops = {
.map = irq_map_generic_chip,
+ .unmap = irq_unmap_generic_chip,
.xlate = irq_domain_xlate_onetwocell,
};
EXPORT_SYMBOL_GPL(irq_generic_chip_ops);
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 6b0c0b74a2a1..cd6009006510 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -268,7 +268,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
struct msi_domain_ops *ops = info->ops;
msi_alloc_info_t arg;
struct msi_desc *desc;
- int i, ret, virq = -1;
+ int i, ret, virq;
ret = ops->msi_check(domain, info, dev);
if (ret == 0)
@@ -278,12 +278,8 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
for_each_msi_entry(desc, dev) {
ops->set_desc(&arg, desc);
- if (info->flags & MSI_FLAG_IDENTITY_MAP)
- virq = (int)ops->get_hwirq(info, &arg);
- else
- virq = -1;
- virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used,
+ virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used,
dev_to_node(dev), &arg, false);
if (virq < 0) {
ret = -ENOSPC;
@@ -302,11 +298,23 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
ops->msi_finish(&arg, 0);
for_each_msi_entry(desc, dev) {
+ virq = desc->irq;
if (desc->nvec_used == 1)
dev_dbg(dev, "irq %d for MSI\n", virq);
else
dev_dbg(dev, "irq [%d-%d] for MSI\n",
virq, virq + desc->nvec_used - 1);
+ /*
+ * This flag is set by the PCI layer as we need to activate
+ * the MSI entries before the PCI layer enables MSI in the
+ * card. Otherwise the card latches a random msi message.
+ */
+ if (info->flags & MSI_FLAG_ACTIVATE_EARLY) {
+ struct irq_data *irq_data;
+
+ irq_data = irq_domain_get_irq_data(domain, desc->irq);
+ irq_domain_activate_irq(irq_data);
+ }
}
return 0;
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index b70ada0028d2..6030efd4a188 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -934,7 +934,10 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min,
return 0;
out:
vfree(pi->sechdrs);
+ pi->sechdrs = NULL;
+
vfree(pi->purgatory_buf);
+ pi->purgatory_buf = NULL;
return ret;
}
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index f231e0bb311c..ce182599cf2e 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -8,151 +8,186 @@
#include <linux/sched.h>
#include <linux/errno.h>
-int __percpu_init_rwsem(struct percpu_rw_semaphore *brw,
+int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
const char *name, struct lock_class_key *rwsem_key)
{
- brw->fast_read_ctr = alloc_percpu(int);
- if (unlikely(!brw->fast_read_ctr))
+ sem->read_count = alloc_percpu(int);
+ if (unlikely(!sem->read_count))
return -ENOMEM;
/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
- __init_rwsem(&brw->rw_sem, name, rwsem_key);
- rcu_sync_init(&brw->rss, RCU_SCHED_SYNC);
- atomic_set(&brw->slow_read_ctr, 0);
- init_waitqueue_head(&brw->write_waitq);
+ rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
+ __init_rwsem(&sem->rw_sem, name, rwsem_key);
+ init_waitqueue_head(&sem->writer);
+ sem->readers_block = 0;
return 0;
}
EXPORT_SYMBOL_GPL(__percpu_init_rwsem);
-void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
+void percpu_free_rwsem(struct percpu_rw_semaphore *sem)
{
/*
* XXX: temporary kludge. The error path in alloc_super()
* assumes that percpu_free_rwsem() is safe after kzalloc().
*/
- if (!brw->fast_read_ctr)
+ if (!sem->read_count)
return;
- rcu_sync_dtor(&brw->rss);
- free_percpu(brw->fast_read_ctr);
- brw->fast_read_ctr = NULL; /* catch use after free bugs */
+ rcu_sync_dtor(&sem->rss);
+ free_percpu(sem->read_count);
+ sem->read_count = NULL; /* catch use after free bugs */
}
+EXPORT_SYMBOL_GPL(percpu_free_rwsem);
-/*
- * This is the fast-path for down_read/up_read. If it succeeds we rely
- * on the barriers provided by rcu_sync_enter/exit; see the comments in
- * percpu_down_write() and percpu_up_write().
- *
- * If this helper fails the callers rely on the normal rw_semaphore and
- * atomic_dec_and_test(), so in this case we have the necessary barriers.
- */
-static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
+int __percpu_down_read(struct percpu_rw_semaphore *sem, int try)
{
- bool success;
+ /*
+ * Due to having preemption disabled the decrement happens on
+ * the same CPU as the increment, avoiding the
+ * increment-on-one-CPU-and-decrement-on-another problem.
+ *
+ * If the reader misses the writer's assignment of readers_block, then
+ * the writer is guaranteed to see the reader's increment.
+ *
+ * Conversely, any readers that increment their sem->read_count after
+ * the writer looks are guaranteed to see the readers_block value,
+ * which in turn means that they are guaranteed to immediately
+ * decrement their sem->read_count, so that it doesn't matter that the
+ * writer missed them.
+ */
- preempt_disable();
- success = rcu_sync_is_idle(&brw->rss);
- if (likely(success))
- __this_cpu_add(*brw->fast_read_ctr, val);
- preempt_enable();
+ smp_mb(); /* A matches D */
- return success;
-}
+ /*
+ * If !readers_block the critical section starts here, matched by the
+ * release in percpu_up_write().
+ */
+ if (likely(!smp_load_acquire(&sem->readers_block)))
+ return 1;
-/*
- * Like the normal down_read() this is not recursive, the writer can
- * come after the first percpu_down_read() and create the deadlock.
- *
- * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep,
- * percpu_up_read() does rwsem_release(). This pairs with the usage
- * of ->rw_sem in percpu_down/up_write().
- */
-void percpu_down_read(struct percpu_rw_semaphore *brw)
-{
- might_sleep();
- rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_);
+ /*
+ * Per the above comment; we still have preemption disabled and
+ * will thus decrement on the same CPU as we incremented.
+ */
+ __percpu_up_read(sem);
- if (likely(update_fast_ctr(brw, +1)))
- return;
+ if (try)
+ return 0;
- /* Avoid rwsem_acquire_read() and rwsem_release() */
- __down_read(&brw->rw_sem);
- atomic_inc(&brw->slow_read_ctr);
- __up_read(&brw->rw_sem);
-}
-EXPORT_SYMBOL_GPL(percpu_down_read);
+ /*
+ * We either call schedule() in the wait, or we'll fall through
+ * and reschedule on the preempt_enable() in percpu_down_read().
+ */
+ preempt_enable_no_resched();
-int percpu_down_read_trylock(struct percpu_rw_semaphore *brw)
-{
- if (unlikely(!update_fast_ctr(brw, +1))) {
- if (!__down_read_trylock(&brw->rw_sem))
- return 0;
- atomic_inc(&brw->slow_read_ctr);
- __up_read(&brw->rw_sem);
- }
-
- rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_);
+ /*
+ * Avoid lockdep for the down/up_read() we already have them.
+ */
+ __down_read(&sem->rw_sem);
+ this_cpu_inc(*sem->read_count);
+ __up_read(&sem->rw_sem);
+
+ preempt_disable();
return 1;
}
+EXPORT_SYMBOL_GPL(__percpu_down_read);
-void percpu_up_read(struct percpu_rw_semaphore *brw)
+void __percpu_up_read(struct percpu_rw_semaphore *sem)
{
- rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_);
-
- if (likely(update_fast_ctr(brw, -1)))
- return;
+ smp_mb(); /* B matches C */
+ /*
+ * In other words, if they see our decrement (presumably to aggregate
+ * zero, as that is the only time it matters) they will also see our
+ * critical section.
+ */
+ __this_cpu_dec(*sem->read_count);
- /* false-positive is possible but harmless */
- if (atomic_dec_and_test(&brw->slow_read_ctr))
- wake_up_all(&brw->write_waitq);
+ /* Prod writer to recheck readers_active */
+ wake_up(&sem->writer);
}
-EXPORT_SYMBOL_GPL(percpu_up_read);
+EXPORT_SYMBOL_GPL(__percpu_up_read);
+
+#define per_cpu_sum(var) \
+({ \
+ typeof(var) __sum = 0; \
+ int cpu; \
+ compiletime_assert_atomic_type(__sum); \
+ for_each_possible_cpu(cpu) \
+ __sum += per_cpu(var, cpu); \
+ __sum; \
+})
-static int clear_fast_ctr(struct percpu_rw_semaphore *brw)
+/*
+ * Return true if the modular sum of the sem->read_count per-CPU variable is
+ * zero. If this sum is zero, then it is stable due to the fact that if any
+ * newly arriving readers increment a given counter, they will immediately
+ * decrement that same counter.
+ */
+static bool readers_active_check(struct percpu_rw_semaphore *sem)
{
- unsigned int sum = 0;
- int cpu;
+ if (per_cpu_sum(*sem->read_count) != 0)
+ return false;
+
+ /*
+ * If we observed the decrement; ensure we see the entire critical
+ * section.
+ */
- for_each_possible_cpu(cpu) {
- sum += per_cpu(*brw->fast_read_ctr, cpu);
- per_cpu(*brw->fast_read_ctr, cpu) = 0;
- }
+ smp_mb(); /* C matches B */
- return sum;
+ return true;
}
-void percpu_down_write(struct percpu_rw_semaphore *brw)
+void percpu_down_write(struct percpu_rw_semaphore *sem)
{
+ /* Notify readers to take the slow path. */
+ rcu_sync_enter(&sem->rss);
+
+ down_write(&sem->rw_sem);
+
/*
- * Make rcu_sync_is_idle() == F and thus disable the fast-path in
- * percpu_down_read() and percpu_up_read(), and wait for gp pass.
- *
- * The latter synchronises us with the preceding readers which used
- * the fast-past, so we can not miss the result of __this_cpu_add()
- * or anything else inside their criticial sections.
+ * Notify new readers to block; up until now, and thus throughout the
+ * longish rcu_sync_enter() above, new readers could still come in.
*/
- rcu_sync_enter(&brw->rss);
+ WRITE_ONCE(sem->readers_block, 1);
- /* exclude other writers, and block the new readers completely */
- down_write(&brw->rw_sem);
+ smp_mb(); /* D matches A */
- /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */
- atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr);
+ /*
+ * If they don't see our writer of readers_block, then we are
+ * guaranteed to see their sem->read_count increment, and therefore
+ * will wait for them.
+ */
- /* wait for all readers to complete their percpu_up_read() */
- wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr));
+ /* Wait for all now active readers to complete. */
+ wait_event(sem->writer, readers_active_check(sem));
}
EXPORT_SYMBOL_GPL(percpu_down_write);
-void percpu_up_write(struct percpu_rw_semaphore *brw)
+void percpu_up_write(struct percpu_rw_semaphore *sem)
{
- /* release the lock, but the readers can't use the fast-path */
- up_write(&brw->rw_sem);
/*
- * Enable the fast-path in percpu_down_read() and percpu_up_read()
- * but only after another gp pass; this adds the necessary barrier
- * to ensure the reader can't miss the changes done by us.
+ * Signal the writer is done, no fast path yet.
+ *
+ * One reason that we cannot just immediately flip to readers_fast is
+ * that new readers might fail to see the results of this writer's
+ * critical section.
+ *
+ * Therefore we force it through the slow path which guarantees an
+ * acquire and thereby guarantees the critical section's consistency.
+ */
+ smp_store_release(&sem->readers_block, 0);
+
+ /*
+ * Release the write lock, this will allow readers back in the game.
+ */
+ up_write(&sem->rw_sem);
+
+ /*
+ * Once this completes (at least one RCU-sched grace period hence) the
+ * reader fast path will be available again. Safe to use outside the
+ * exclusive write lock because its counting.
*/
- rcu_sync_exit(&brw->rss);
+ rcu_sync_exit(&sem->rss);
}
EXPORT_SYMBOL_GPL(percpu_up_write);
diff --git a/kernel/module.c b/kernel/module.c
index 0e5c71195f18..b14a4f31221f 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2606,13 +2606,18 @@ static inline void kmemleak_load_module(const struct module *mod,
#endif
#ifdef CONFIG_MODULE_SIG
-static int module_sig_check(struct load_info *info)
+static int module_sig_check(struct load_info *info, int flags)
{
int err = -ENOKEY;
const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
const void *mod = info->hdr;
- if (info->len > markerlen &&
+ /*
+ * Require flags == 0, as a module with version information
+ * removed is no longer the module that was signed
+ */
+ if (flags == 0 &&
+ info->len > markerlen &&
memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
/* We truncate the module to discard the signature */
info->len -= markerlen;
@@ -2631,7 +2636,7 @@ static int module_sig_check(struct load_info *info)
return err;
}
#else /* !CONFIG_MODULE_SIG */
-static int module_sig_check(struct load_info *info)
+static int module_sig_check(struct load_info *info, int flags)
{
return 0;
}
@@ -3444,7 +3449,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
long err;
char *after_dashes;
- err = module_sig_check(info);
+ err = module_sig_check(info, flags);
if (err)
goto free_copy;
diff --git a/kernel/panic.c b/kernel/panic.c
index 223564d3e1f8..41e2b54f36b5 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -28,9 +28,6 @@
#define PANIC_TIMER_STEP 100
#define PANIC_BLINK_SPD 18
-/* Machine specific panic information string */
-char *mach_panic_string;
-
int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE;
static unsigned long tainted_mask;
static int pause_on_oops;
@@ -415,11 +412,6 @@ late_initcall(init_oops_id);
void print_oops_end_marker(void)
{
init_oops_id();
-
- if (mach_panic_string)
- printk(KERN_WARNING "Board Information: %s\n",
- mach_panic_string);
-
pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id);
}
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index b7dd5718836e..3124cebaec31 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -299,12 +299,12 @@ static int create_image(int platform_mode)
save_processor_state();
trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true);
error = swsusp_arch_suspend();
+ /* Restore control flow magically appears here */
+ restore_processor_state();
trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false);
if (error)
printk(KERN_ERR "PM: Error %d creating hibernation image\n",
error);
- /* Restore control flow magically appears here */
- restore_processor_state();
if (!in_suspend)
events_check_enabled = false;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 3a970604308f..f155c62f1f2c 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -765,9 +765,9 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
*/
static bool rtree_next_node(struct memory_bitmap *bm)
{
- bm->cur.node = list_entry(bm->cur.node->list.next,
- struct rtree_node, list);
- if (&bm->cur.node->list != &bm->cur.zone->leaves) {
+ if (!list_is_last(&bm->cur.node->list, &bm->cur.zone->leaves)) {
+ bm->cur.node = list_entry(bm->cur.node->list.next,
+ struct rtree_node, list);
bm->cur.node_pfn += BM_BITS_PER_BLOCK;
bm->cur.node_bit = 0;
touch_softlockup_watchdog();
@@ -775,9 +775,9 @@ static bool rtree_next_node(struct memory_bitmap *bm)
}
/* No more nodes, goto next zone */
- bm->cur.zone = list_entry(bm->cur.zone->list.next,
+ if (!list_is_last(&bm->cur.zone->list, &bm->zones)) {
+ bm->cur.zone = list_entry(bm->cur.zone->list.next,
struct mem_zone_bm_rtree, list);
- if (&bm->cur.zone->list != &bm->zones) {
bm->cur.node = list_entry(bm->cur.zone->leaves.next,
struct rtree_node, list);
bm->cur.node_pfn = 0;
diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c
index 276762f3a460..d5760c42f042 100644
--- a/kernel/printk/braille.c
+++ b/kernel/printk/braille.c
@@ -9,10 +9,10 @@
char *_braille_console_setup(char **str, char **brl_options)
{
- if (!memcmp(*str, "brl,", 4)) {
+ if (!strncmp(*str, "brl,", 4)) {
*brl_options = "";
*str += 4;
- } else if (!memcmp(str, "brl=", 4)) {
+ } else if (!strncmp(*str, "brl=", 4)) {
*brl_options = *str + 4;
*str = strchr(*brl_options, ',');
if (!*str)
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index be922c9f3d37..b49cf3ac2d47 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -68,6 +68,7 @@ void rcu_sync_lockdep_assert(struct rcu_sync *rsp)
RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(),
"suspicious rcu_sync_is_idle() usage");
}
+EXPORT_SYMBOL_GPL(rcu_sync_lockdep_assert);
#endif
/**
@@ -83,6 +84,18 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type)
}
/**
+ * Must be called after rcu_sync_init() and before first use.
+ *
+ * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}()
+ * pairs turn into NO-OPs.
+ */
+void rcu_sync_enter_start(struct rcu_sync *rsp)
+{
+ rsp->gp_count++;
+ rsp->gp_state = GP_PASSED;
+}
+
+/**
* rcu_sync_enter() - Force readers onto slowpath
* @rsp: Pointer to rcu_sync structure to use for synchronization
*
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index c6a85f813dfd..dc93f1cffc6e 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -14,7 +14,8 @@ endif
obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o energy.o
obj-y += wait.o completion.o idle.o
-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
+obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o energy.o
+obj-$(CONFIG_SCHED_WALT) += walt.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a94f6424103c..1df6da0094f0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -89,6 +89,7 @@
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
+#include "walt.h"
DEFINE_MUTEX(sched_domains_mutex);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -287,6 +288,18 @@ int sysctl_sched_rt_runtime = 950000;
/* cpus with isolated domains */
cpumask_var_t cpu_isolated_map;
+struct rq *
+lock_rq_of(struct task_struct *p, unsigned long *flags)
+{
+ return task_rq_lock(p, flags);
+}
+
+void
+unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags)
+{
+ task_rq_unlock(rq, p, flags);
+}
+
/*
* this_rq_lock - lock this runqueue and disable interrupts.
*/
@@ -627,7 +640,10 @@ int get_nohz_timer_target(void)
rcu_read_lock();
for_each_domain(cpu, sd) {
for_each_cpu(i, sched_domain_span(sd)) {
- if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) {
+ if (cpu == i)
+ continue;
+
+ if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
cpu = i;
goto unlock;
}
@@ -1073,7 +1089,9 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new
dequeue_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_MIGRATING;
+ double_lock_balance(rq, cpu_rq(new_cpu));
set_task_cpu(p, new_cpu);
+ double_unlock_balance(rq, cpu_rq(new_cpu));
raw_spin_unlock(&rq->lock);
rq = cpu_rq(new_cpu);
@@ -1297,6 +1315,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
p->sched_class->migrate_task_rq(p);
p->se.nr_migrations++;
perf_event_task_migrate(p);
+
+ walt_fixup_busy_time(p, new_cpu);
}
__set_task_cpu(p, new_cpu);
@@ -1925,6 +1945,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
unsigned long flags;
int cpu, success = 0;
+#ifdef CONFIG_SMP
+ struct rq *rq;
+ u64 wallclock;
+#endif
/*
* If we are going to wake up a thread waiting for CONDITION we
@@ -1942,6 +1966,28 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
success = 1; /* we're going to change ->state */
cpu = task_cpu(p);
+ /*
+ * Ensure we load p->on_rq _after_ p->state, otherwise it would
+ * be possible to, falsely, observe p->on_rq == 0 and get stuck
+ * in smp_cond_load_acquire() below.
+ *
+ * sched_ttwu_pending() try_to_wake_up()
+ * [S] p->on_rq = 1; [L] P->state
+ * UNLOCK rq->lock -----.
+ * \
+ * +--- RMB
+ * schedule() /
+ * LOCK rq->lock -----'
+ * UNLOCK rq->lock
+ *
+ * [task p]
+ * [S] p->state = UNINTERRUPTIBLE [L] p->on_rq
+ *
+ * Pairs with the UNLOCK+LOCK on rq->lock from the
+ * last wakeup of our task and the schedule that got our task
+ * current.
+ */
+ smp_rmb();
if (p->on_rq && ttwu_remote(p, wake_flags))
goto stat;
@@ -1982,6 +2028,14 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
*/
smp_rmb();
+ rq = cpu_rq(task_cpu(p));
+
+ raw_spin_lock(&rq->lock);
+ wallclock = walt_ktime_clock();
+ walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+ walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
+ raw_spin_unlock(&rq->lock);
+
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
@@ -1989,10 +2043,12 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
p->sched_class->task_waking(p);
cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
+
if (task_cpu(p) != cpu) {
wake_flags |= WF_MIGRATED;
set_task_cpu(p, cpu);
}
+
#endif /* CONFIG_SMP */
ttwu_queue(p, cpu);
@@ -2041,8 +2097,13 @@ static void try_to_wake_up_local(struct task_struct *p)
trace_sched_waking(p);
- if (!task_on_rq_queued(p))
+ if (!task_on_rq_queued(p)) {
+ u64 wallclock = walt_ktime_clock();
+
+ walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+ walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+ }
ttwu_do_wakeup(rq, p, 0);
ttwu_stat(p, smp_processor_id(), 0);
@@ -2108,6 +2169,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.nr_migrations = 0;
p->se.vruntime = 0;
INIT_LIST_HEAD(&p->se.group_node);
+ walt_init_new_task_load(p);
#ifdef CONFIG_SCHEDSTATS
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
@@ -2375,6 +2437,9 @@ void wake_up_new_task(struct task_struct *p)
struct rq *rq;
raw_spin_lock_irqsave(&p->pi_lock, flags);
+
+ walt_init_new_task_load(p);
+
/* Initialize new task's runnable average */
init_entity_runnable_average(&p->se);
#ifdef CONFIG_SMP
@@ -2387,6 +2452,7 @@ void wake_up_new_task(struct task_struct *p)
#endif
rq = __task_rq_lock(p);
+ walt_mark_task_starting(p);
activate_task(rq, p, ENQUEUE_WAKEUP_NEW);
p->on_rq = TASK_ON_RQ_QUEUED;
trace_sched_wakeup_new(p);
@@ -2768,6 +2834,36 @@ unsigned long nr_iowait_cpu(int cpu)
return atomic_read(&this->nr_iowait);
}
+#ifdef CONFIG_CPU_QUIET
+u64 nr_running_integral(unsigned int cpu)
+{
+ unsigned int seqcnt;
+ u64 integral;
+ struct rq *q;
+
+ if (cpu >= nr_cpu_ids)
+ return 0;
+
+ q = cpu_rq(cpu);
+
+ /*
+ * Update average to avoid reading stalled value if there were
+ * no run-queue changes for a long time. On the other hand if
+ * the changes are happening right now, just read current value
+ * directly.
+ */
+
+ seqcnt = read_seqcount_begin(&q->ave_seqcnt);
+ integral = do_nr_running_integral(q);
+ if (read_seqcount_retry(&q->ave_seqcnt, seqcnt)) {
+ read_seqcount_begin(&q->ave_seqcnt);
+ integral = q->nr_running_integral;
+ }
+
+ return integral;
+}
+#endif
+
void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
{
struct rq *rq = this_rq();
@@ -2855,20 +2951,76 @@ unsigned long long task_sched_runtime(struct task_struct *p)
}
#ifdef CONFIG_CPU_FREQ_GOV_SCHED
-static unsigned long sum_capacity_reqs(unsigned long cfs_cap,
- struct sched_capacity_reqs *scr)
+
+static inline
+unsigned long add_capacity_margin(unsigned long cpu_capacity)
{
- unsigned long total = cfs_cap + scr->rt;
+ cpu_capacity = cpu_capacity * capacity_margin;
+ cpu_capacity /= SCHED_CAPACITY_SCALE;
+ return cpu_capacity;
+}
- total = total * capacity_margin;
- total /= SCHED_CAPACITY_SCALE;
- total += scr->dl;
- return total;
+static inline
+unsigned long sum_capacity_reqs(unsigned long cfs_cap,
+ struct sched_capacity_reqs *scr)
+{
+ unsigned long total = add_capacity_margin(cfs_cap + scr->rt);
+ return total += scr->dl;
}
-static void sched_freq_tick(int cpu)
+static void sched_freq_tick_pelt(int cpu)
{
+ unsigned long cpu_utilization = capacity_max;
+ unsigned long capacity_curr = capacity_curr_of(cpu);
struct sched_capacity_reqs *scr;
+
+ scr = &per_cpu(cpu_sched_capacity_reqs, cpu);
+ if (sum_capacity_reqs(cpu_utilization, scr) < capacity_curr)
+ return;
+
+ /*
+ * To make free room for a task that is building up its "real"
+ * utilization and to harm its performance the least, request
+ * a jump to a higher OPP as soon as the margin of free capacity
+ * is impacted (specified by capacity_margin).
+ */
+ set_cfs_cpu_capacity(cpu, true, cpu_utilization);
+}
+
+#ifdef CONFIG_SCHED_WALT
+static void sched_freq_tick_walt(int cpu)
+{
+ unsigned long cpu_utilization = cpu_util(cpu);
+ unsigned long capacity_curr = capacity_curr_of(cpu);
+
+ if (walt_disabled || !sysctl_sched_use_walt_cpu_util)
+ return sched_freq_tick_pelt(cpu);
+
+ /*
+ * Add a margin to the WALT utilization.
+ * NOTE: WALT tracks a single CPU signal for all the scheduling
+ * classes, thus this margin is going to be added to the DL class as
+ * well, which is something we do not do in sched_freq_tick_pelt case.
+ */
+ cpu_utilization = add_capacity_margin(cpu_utilization);
+ if (cpu_utilization <= capacity_curr)
+ return;
+
+ /*
+ * It is likely that the load is growing so we
+ * keep the added margin in our request as an
+ * extra boost.
+ */
+ set_cfs_cpu_capacity(cpu, true, cpu_utilization);
+
+}
+#define _sched_freq_tick(cpu) sched_freq_tick_walt(cpu)
+#else
+#define _sched_freq_tick(cpu) sched_freq_tick_pelt(cpu)
+#endif /* CONFIG_SCHED_WALT */
+
+static void sched_freq_tick(int cpu)
+{
unsigned long capacity_orig, capacity_curr;
if (!sched_freq())
@@ -2879,19 +3031,11 @@ static void sched_freq_tick(int cpu)
if (capacity_curr == capacity_orig)
return;
- /*
- * To make free room for a task that is building up its "real"
- * utilization and to harm its performance the least, request
- * a jump to max OPP as soon as the margin of free capacity is
- * impacted (specified by capacity_margin).
- */
- scr = &per_cpu(cpu_sched_capacity_reqs, cpu);
- if (capacity_curr < sum_capacity_reqs(cpu_util(cpu), scr))
- set_cfs_cpu_capacity(cpu, true, capacity_max);
+ _sched_freq_tick(cpu);
}
#else
static inline void sched_freq_tick(int cpu) { }
-#endif
+#endif /* CONFIG_CPU_FREQ_GOV_SCHED */
/*
* This function gets called by the timer code, with HZ frequency.
@@ -2906,9 +3050,12 @@ void scheduler_tick(void)
sched_clock_tick();
raw_spin_lock(&rq->lock);
+ walt_set_window_start(rq);
update_rq_clock(rq);
curr->sched_class->task_tick(rq, curr, 0);
update_cpu_load_active(rq);
+ walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
+ walt_ktime_clock(), 0);
calc_global_load_tick(rq);
sched_freq_tick(cpu);
raw_spin_unlock(&rq->lock);
@@ -3147,6 +3294,7 @@ static void __sched notrace __schedule(bool preempt)
unsigned long *switch_count;
struct rq *rq;
int cpu;
+ u64 wallclock;
cpu = smp_processor_id();
rq = cpu_rq(cpu);
@@ -3208,6 +3356,9 @@ static void __sched notrace __schedule(bool preempt)
update_rq_clock(rq);
next = pick_next_task(rq, prev);
+ wallclock = walt_ktime_clock();
+ walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
+ walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
rq->clock_skip_update = 0;
@@ -4991,14 +5142,16 @@ void show_state_filter(unsigned long state_filter)
/*
* reset the NMI-timeout, listing all files on a slow
* console might take a lot of time:
+ * Also, reset softlockup watchdogs on all CPUs, because
+ * another CPU might be blocked waiting for us to process
+ * an IPI.
*/
touch_nmi_watchdog();
+ touch_all_softlockup_watchdogs();
if (!state_filter || (p->state & state_filter))
sched_show_task(p);
}
- touch_all_softlockup_watchdogs();
-
#ifdef CONFIG_SCHED_DEBUG
sysrq_sched_debug_show();
#endif
@@ -5032,6 +5185,7 @@ void init_idle(struct task_struct *idle, int cpu)
raw_spin_lock(&rq->lock);
__sched_fork(0, idle);
+
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
@@ -5628,6 +5782,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ walt_set_window_start(rq);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
rq->calc_load_update = calc_load_update;
account_reset_rq(rq);
break;
@@ -5648,6 +5805,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
sched_ttwu_pending();
/* Update our root-domain */
raw_spin_lock_irqsave(&rq->lock, flags);
+ walt_migrate_sync_cpu(cpu);
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
@@ -7492,6 +7650,7 @@ void __init sched_init_smp(void)
{
cpumask_var_t non_isolated_cpus;
+ walt_init_cpu_efficiency();
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
@@ -7669,6 +7828,11 @@ void __init sched_init(void)
rq->idle_stamp = 0;
rq->avg_idle = 2*sysctl_sched_migration_cost;
rq->max_idle_balance_cost = sysctl_sched_migration_cost;
+#ifdef CONFIG_SCHED_WALT
+ rq->cur_irqload = 0;
+ rq->avg_irqload = 0;
+ rq->irqload_ts = 0;
+#endif
INIT_LIST_HEAD(&rq->cfs_tasks);
@@ -8792,7 +8956,6 @@ struct cgroup_subsys cpu_cgrp_subsys = {
.fork = cpu_cgroup_fork,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
- .allow_attach = subsys_cgroup_allow_attach,
.legacy_cftypes = cpu_files,
.early_init = 1,
};
diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c
index e1d208e101ed..f6f9b9b3a4a8 100644
--- a/kernel/sched/cpufreq_sched.c
+++ b/kernel/sched/cpufreq_sched.c
@@ -19,7 +19,8 @@
#include "sched.h"
-#define THROTTLE_NSEC 50000000 /* 50ms default */
+#define THROTTLE_DOWN_NSEC 50000000 /* 50ms default */
+#define THROTTLE_UP_NSEC 500000 /* 500us default */
struct static_key __read_mostly __sched_freq = STATIC_KEY_INIT_FALSE;
static bool __read_mostly cpufreq_driver_slow;
@@ -33,8 +34,10 @@ DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs);
/**
* gov_data - per-policy data internal to the governor
- * @throttle: next throttling period expiry. Derived from throttle_nsec
- * @throttle_nsec: throttle period length in nanoseconds
+ * @up_throttle: next throttling period expiry if increasing OPP
+ * @down_throttle: next throttling period expiry if decreasing OPP
+ * @up_throttle_nsec: throttle period length in nanoseconds if increasing OPP
+ * @down_throttle_nsec: throttle period length in nanoseconds if decreasing OPP
* @task: worker thread for dvfs transition that may block/sleep
* @irq_work: callback used to wake up worker thread
* @requested_freq: last frequency requested by the sched governor
@@ -48,8 +51,10 @@ DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs);
* call down_write(policy->rwsem).
*/
struct gov_data {
- ktime_t throttle;
- unsigned int throttle_nsec;
+ ktime_t up_throttle;
+ ktime_t down_throttle;
+ unsigned int up_throttle_nsec;
+ unsigned int down_throttle_nsec;
struct task_struct *task;
struct irq_work irq_work;
unsigned int requested_freq;
@@ -66,25 +71,29 @@ static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy,
__cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L);
- gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec);
+ gd->up_throttle = ktime_add_ns(ktime_get(), gd->up_throttle_nsec);
+ gd->down_throttle = ktime_add_ns(ktime_get(), gd->down_throttle_nsec);
up_write(&policy->rwsem);
}
-static bool finish_last_request(struct gov_data *gd)
+static bool finish_last_request(struct gov_data *gd, unsigned int cur_freq)
{
ktime_t now = ktime_get();
- if (ktime_after(now, gd->throttle))
+ ktime_t throttle = gd->requested_freq < cur_freq ?
+ gd->down_throttle : gd->up_throttle;
+
+ if (ktime_after(now, throttle))
return false;
while (1) {
- int usec_left = ktime_to_ns(ktime_sub(gd->throttle, now));
+ int usec_left = ktime_to_ns(ktime_sub(throttle, now));
usec_left /= NSEC_PER_USEC;
trace_cpufreq_sched_throttled(usec_left);
usleep_range(usec_left, usec_left + 100);
now = ktime_get();
- if (ktime_after(now, gd->throttle))
+ if (ktime_after(now, throttle))
return true;
}
}
@@ -122,13 +131,15 @@ static int cpufreq_sched_thread(void *data)
new_request = gd->requested_freq;
if (new_request == last_request) {
set_current_state(TASK_INTERRUPTIBLE);
+ if (kthread_should_stop())
+ break;
schedule();
} else {
/*
* if the frequency thread sleeps while waiting to be
* unthrottled, start over to check for a newer request
*/
- if (finish_last_request(gd))
+ if (finish_last_request(gd, policy->cur))
continue;
last_request = new_request;
cpufreq_sched_try_driver_target(policy, new_request);
@@ -190,9 +201,14 @@ static void update_fdomain_capacity_request(int cpu)
goto out;
freq_new = policy->freq_table[index_new].frequency;
+ if (freq_new > policy->max)
+ freq_new = policy->max;
+
+ if (freq_new < policy->min)
+ freq_new = policy->min;
+
trace_cpufreq_sched_request_opp(cpu, capacity, freq_new,
gd->requested_freq);
-
if (freq_new == gd->requested_freq)
goto out;
@@ -246,10 +262,17 @@ static inline void clear_sched_freq(void)
static_key_slow_dec(&__sched_freq);
}
+static struct attribute_group sched_attr_group_gov_pol;
+static struct attribute_group *get_sysfs_attr(void)
+{
+ return &sched_attr_group_gov_pol;
+}
+
static int cpufreq_sched_policy_init(struct cpufreq_policy *policy)
{
struct gov_data *gd;
int cpu;
+ int rc;
for_each_cpu(cpu, policy->cpus)
memset(&per_cpu(cpu_sched_capacity_reqs, cpu), 0,
@@ -259,12 +282,20 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy)
if (!gd)
return -ENOMEM;
- gd->throttle_nsec = policy->cpuinfo.transition_latency ?
+ gd->up_throttle_nsec = policy->cpuinfo.transition_latency ?
policy->cpuinfo.transition_latency :
- THROTTLE_NSEC;
+ THROTTLE_UP_NSEC;
+ gd->down_throttle_nsec = THROTTLE_DOWN_NSEC;
pr_debug("%s: throttle threshold = %u [ns]\n",
- __func__, gd->throttle_nsec);
+ __func__, gd->up_throttle_nsec);
+
+ rc = sysfs_create_group(get_governor_parent_kobj(policy), get_sysfs_attr());
+ if (rc) {
+ pr_err("%s: couldn't create sysfs attributes: %d\n", __func__, rc);
+ goto err;
+ }
+ policy->governor_data = gd;
if (cpufreq_driver_is_slow()) {
cpufreq_driver_slow = true;
gd->task = kthread_create(cpufreq_sched_thread, policy,
@@ -281,12 +312,12 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy)
init_irq_work(&gd->irq_work, cpufreq_sched_irq_work);
}
- policy->governor_data = gd;
set_sched_freq();
return 0;
err:
+ policy->governor_data = NULL;
kfree(gd);
return -ENOMEM;
}
@@ -301,6 +332,8 @@ static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy)
put_task_struct(gd->task);
}
+ sysfs_remove_group(get_governor_parent_kobj(policy), get_sysfs_attr());
+
policy->governor_data = NULL;
kfree(gd);
@@ -317,6 +350,21 @@ static int cpufreq_sched_start(struct cpufreq_policy *policy)
return 0;
}
+static void cpufreq_sched_limits(struct cpufreq_policy *policy)
+{
+ unsigned int clamp_freq;
+ struct gov_data *gd = policy->governor_data;;
+
+ pr_debug("limit event for cpu %u: %u - %u kHz, currently %u kHz\n",
+ policy->cpu, policy->min, policy->max,
+ policy->cur);
+
+ clamp_freq = clamp(gd->requested_freq, policy->min, policy->max);
+
+ if (policy->cur != clamp_freq)
+ __cpufreq_driver_target(policy, clamp_freq, CPUFREQ_RELATION_L);
+}
+
static int cpufreq_sched_stop(struct cpufreq_policy *policy)
{
int cpu;
@@ -340,11 +388,95 @@ static int cpufreq_sched_setup(struct cpufreq_policy *policy,
case CPUFREQ_GOV_STOP:
return cpufreq_sched_stop(policy);
case CPUFREQ_GOV_LIMITS:
+ cpufreq_sched_limits(policy);
break;
}
return 0;
}
+/* Tunables */
+static ssize_t show_up_throttle_nsec(struct gov_data *gd, char *buf)
+{
+ return sprintf(buf, "%u\n", gd->up_throttle_nsec);
+}
+
+static ssize_t store_up_throttle_nsec(struct gov_data *gd,
+ const char *buf, size_t count)
+{
+ int ret;
+ long unsigned int val;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret < 0)
+ return ret;
+ gd->up_throttle_nsec = val;
+ return count;
+}
+
+static ssize_t show_down_throttle_nsec(struct gov_data *gd, char *buf)
+{
+ return sprintf(buf, "%u\n", gd->down_throttle_nsec);
+}
+
+static ssize_t store_down_throttle_nsec(struct gov_data *gd,
+ const char *buf, size_t count)
+{
+ int ret;
+ long unsigned int val;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret < 0)
+ return ret;
+ gd->down_throttle_nsec = val;
+ return count;
+}
+
+/*
+ * Create show/store routines
+ * - sys: One governor instance for complete SYSTEM
+ * - pol: One governor instance per struct cpufreq_policy
+ */
+#define show_gov_pol_sys(file_name) \
+static ssize_t show_##file_name##_gov_pol \
+(struct cpufreq_policy *policy, char *buf) \
+{ \
+ return show_##file_name(policy->governor_data, buf); \
+}
+
+#define store_gov_pol_sys(file_name) \
+static ssize_t store_##file_name##_gov_pol \
+(struct cpufreq_policy *policy, const char *buf, size_t count) \
+{ \
+ return store_##file_name(policy->governor_data, buf, count); \
+}
+
+#define gov_pol_attr_rw(_name) \
+ static struct freq_attr _name##_gov_pol = \
+ __ATTR(_name, 0644, show_##_name##_gov_pol, store_##_name##_gov_pol)
+
+#define show_store_gov_pol_sys(file_name) \
+ show_gov_pol_sys(file_name); \
+ store_gov_pol_sys(file_name)
+#define tunable_handlers(file_name) \
+ show_gov_pol_sys(file_name); \
+ store_gov_pol_sys(file_name); \
+ gov_pol_attr_rw(file_name)
+
+tunable_handlers(down_throttle_nsec);
+tunable_handlers(up_throttle_nsec);
+
+/* Per policy governor instance */
+static struct attribute *sched_attributes_gov_pol[] = {
+ &up_throttle_nsec_gov_pol.attr,
+ &down_throttle_nsec_gov_pol.attr,
+ NULL,
+};
+
+static struct attribute_group sched_attr_group_gov_pol = {
+ .attrs = sched_attributes_gov_pol,
+ .name = "sched",
+};
+
#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED
static
#endif
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f74ea89e77a8..acde1d7c763c 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -5,6 +5,7 @@
#include <linux/static_key.h>
#include <linux/context_tracking.h>
#include "sched.h"
+#include "walt.h"
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -49,6 +50,10 @@ void irqtime_account_irq(struct task_struct *curr)
unsigned long flags;
s64 delta;
int cpu;
+#ifdef CONFIG_SCHED_WALT
+ u64 wallclock;
+ bool account = true;
+#endif
if (!sched_clock_irqtime)
return;
@@ -56,6 +61,9 @@ void irqtime_account_irq(struct task_struct *curr)
local_irq_save(flags);
cpu = smp_processor_id();
+#ifdef CONFIG_SCHED_WALT
+ wallclock = sched_clock_cpu(cpu);
+#endif
delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
__this_cpu_add(irq_start_time, delta);
@@ -70,8 +78,16 @@ void irqtime_account_irq(struct task_struct *curr)
__this_cpu_add(cpu_hardirq_time, delta);
else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
__this_cpu_add(cpu_softirq_time, delta);
+#ifdef CONFIG_SCHED_WALT
+ else
+ account = false;
+#endif
irq_time_write_end();
+#ifdef CONFIG_SCHED_WALT
+ if (account)
+ walt_account_irqtime(cpu, curr, delta, wallclock);
+#endif
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);
@@ -600,19 +616,25 @@ static void cputime_adjust(struct task_cputime *curr,
stime = curr->stime;
utime = curr->utime;
- if (utime == 0) {
- stime = rtime;
+ /*
+ * If either stime or both stime and utime are 0, assume all runtime is
+ * userspace. Once a task gets some ticks, the monotonicy code at
+ * 'update' will ensure things converge to the observed ratio.
+ */
+ if (stime == 0) {
+ utime = rtime;
goto update;
}
- if (stime == 0) {
- utime = rtime;
+ if (utime == 0) {
+ stime = rtime;
goto update;
}
stime = scale_stime((__force u64)stime, (__force u64)rtime,
(__force u64)(stime + utime));
+update:
/*
* Make sure stime doesn't go backwards; this preserves monotonicity
* for utime because rtime is monotonic.
@@ -635,7 +657,6 @@ static void cputime_adjust(struct task_cputime *curr,
stime = rtime - utime;
}
-update:
prev->stime = stime;
prev->utime = utime;
out:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3217494e326e..d5965cfbd3d7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -30,11 +30,13 @@
#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/task_work.h>
+#include <linux/module.h>
#include <trace/events/sched.h>
#include "sched.h"
#include "tune.h"
+#include "walt.h"
/*
* Targeted preemption latency for CPU-bound tasks:
@@ -51,6 +53,17 @@
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;
+unsigned int sysctl_sched_is_big_little = 0;
+unsigned int sysctl_sched_sync_hint_enable = 1;
+unsigned int sysctl_sched_initial_task_util = 0;
+unsigned int sysctl_sched_cstate_aware = 1;
+
+#ifdef CONFIG_SCHED_WALT
+unsigned int sysctl_sched_use_walt_cpu_util = 1;
+unsigned int sysctl_sched_use_walt_task_util = 1;
+__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload =
+ (10 * NSEC_PER_MSEC);
+#endif
/*
* The initial- and re-scaling of tunables is configurable
* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
@@ -683,13 +696,13 @@ void init_entity_runnable_average(struct sched_entity *se)
sa->period_contrib = 1023;
sa->load_avg = scale_load_down(se->load.weight);
sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
- sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
+ sa->util_avg = sched_freq() ?
+ sysctl_sched_initial_task_util :
+ scale_load_down(SCHED_LOAD_SCALE);
sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
-static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
#else
void init_entity_runnable_average(struct sched_entity *se)
{
@@ -1194,8 +1207,6 @@ static void task_numa_assign(struct task_numa_env *env,
{
if (env->best_task)
put_task_struct(env->best_task);
- if (p)
- get_task_struct(p);
env->best_task = p;
env->best_imp = imp;
@@ -1263,20 +1274,30 @@ static void task_numa_compare(struct task_numa_env *env,
long imp = env->p->numa_group ? groupimp : taskimp;
long moveimp = imp;
int dist = env->dist;
+ bool assigned = false;
rcu_read_lock();
raw_spin_lock_irq(&dst_rq->lock);
cur = dst_rq->curr;
/*
- * No need to move the exiting task, and this ensures that ->curr
- * wasn't reaped and thus get_task_struct() in task_numa_assign()
- * is safe under RCU read lock.
- * Note that rcu_read_lock() itself can't protect from the final
- * put_task_struct() after the last schedule().
+ * No need to move the exiting task or idle task.
*/
if ((cur->flags & PF_EXITING) || is_idle_task(cur))
cur = NULL;
+ else {
+ /*
+ * The task_struct must be protected here to protect the
+ * p->numa_faults access in the task_weight since the
+ * numa_faults could already be freed in the following path:
+ * finish_task_switch()
+ * --> put_task_struct()
+ * --> __put_task_struct()
+ * --> task_numa_free()
+ */
+ get_task_struct(cur);
+ }
+
raw_spin_unlock_irq(&dst_rq->lock);
/*
@@ -1360,6 +1381,7 @@ balance:
*/
if (!load_too_imbalanced(src_load, dst_load, env)) {
imp = moveimp - 1;
+ put_task_struct(cur);
cur = NULL;
goto assign;
}
@@ -1385,9 +1407,16 @@ balance:
env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
assign:
+ assigned = true;
task_numa_assign(env, cur, imp);
unlock:
rcu_read_unlock();
+ /*
+ * The dst_rq->curr isn't assigned. The protection for task_struct is
+ * finished.
+ */
+ if (cur && !assigned)
+ put_task_struct(cur);
}
static void task_numa_find_cpu(struct task_numa_env *env,
@@ -4168,8 +4197,14 @@ static inline void hrtick_update(struct rq *rq)
}
#endif
+#ifdef CONFIG_SMP
+static bool cpu_overutilized(int cpu);
static inline unsigned long boosted_cpu_util(int cpu);
+#else
+#define boosted_cpu_util(cpu) cpu_util(cpu)
+#endif
+#ifdef CONFIG_SMP
static void update_capacity_of(int cpu)
{
unsigned long req_cap;
@@ -4182,8 +4217,7 @@ static void update_capacity_of(int cpu)
req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu);
set_cfs_cpu_capacity(cpu, true, req_cap);
}
-
-static bool cpu_overutilized(int cpu);
+#endif
/*
* The enqueue_task method is called before nr_running is
@@ -4195,8 +4229,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
+#ifdef CONFIG_SMP
int task_new = flags & ENQUEUE_WAKEUP_NEW;
int task_wakeup = flags & ENQUEUE_WAKEUP;
+#endif
for_each_sched_entity(se) {
if (se->on_rq)
@@ -4213,6 +4249,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running++;
+ walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
flags = ENQUEUE_WAKEUP;
}
@@ -4220,6 +4257,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running++;
+ walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
if (cfs_rq_throttled(cfs_rq))
break;
@@ -4228,13 +4266,37 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_cfs_shares(cfs_rq);
}
- if (!se) {
+ if (!se)
add_nr_running(rq, 1);
+
+#ifdef CONFIG_SMP
+
+ /*
+ * Update SchedTune accounting.
+ *
+ * We do it before updating the CPU capacity to ensure the
+ * boost value of the current task is accounted for in the
+ * selection of the OPP.
+ *
+ * We do it also in the case where we enqueue a throttled task;
+ * we could argue that a throttled task should not boost a CPU,
+ * however:
+ * a) properly implementing CPU boosting considering throttled
+ * tasks will increase a lot the complexity of the solution
+ * b) it's not easy to quantify the benefits introduced by
+ * such a more complex solution.
+ * Thus, for the time being we go for the simple solution and boost
+ * also for throttled RQs.
+ */
+ schedtune_enqueue_task(p, cpu_of(rq));
+
+ if (!se) {
+ walt_inc_cumulative_runnable_avg(rq, p);
if (!task_new && !rq->rd->overutilized &&
- cpu_overutilized(rq->cpu))
+ cpu_overutilized(rq->cpu)) {
rq->rd->overutilized = true;
-
- schedtune_enqueue_task(p, cpu_of(rq));
+ trace_sched_overutilized(true);
+ }
/*
* We want to potentially trigger a freq switch
@@ -4246,6 +4308,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (task_new || task_wakeup)
update_capacity_of(cpu_of(rq));
}
+
+#endif /* CONFIG_SMP */
hrtick_update(rq);
}
@@ -4275,6 +4339,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running--;
+ walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
@@ -4295,6 +4360,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running--;
+ walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
if (cfs_rq_throttled(cfs_rq))
break;
@@ -4303,9 +4369,22 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_cfs_shares(cfs_rq);
}
- if (!se) {
+ if (!se)
sub_nr_running(rq, 1);
- schedtune_dequeue_task(p, cpu_of(rq));
+
+#ifdef CONFIG_SMP
+
+ /*
+ * Update SchedTune accounting
+ *
+ * We do it before updating the CPU capacity to ensure the
+ * boost value of the current task is accounted for in the
+ * selection of the OPP.
+ */
+ schedtune_dequeue_task(p, cpu_of(rq));
+
+ if (!se) {
+ walt_dec_cumulative_runnable_avg(rq, p);
/*
* We want to potentially trigger a freq switch
@@ -4322,6 +4401,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
set_cfs_cpu_capacity(cpu_of(rq), false, 0);
}
}
+
+#endif /* CONFIG_SMP */
+
hrtick_update(rq);
}
@@ -4660,19 +4742,24 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
return wl;
for_each_sched_entity(se) {
- long w, W;
+ struct cfs_rq *cfs_rq = se->my_q;
+ long W, w = cfs_rq_load_avg(cfs_rq);
- tg = se->my_q->tg;
+ tg = cfs_rq->tg;
/*
* W = @wg + \Sum rw_j
*/
- W = wg + calc_tg_weight(tg, se->my_q);
+ W = wg + atomic_long_read(&tg->load_avg);
+
+ /* Ensure \Sum rw_j >= rw_i */
+ W -= cfs_rq->tg_load_avg_contrib;
+ W += w;
/*
* w = rw_i + @wl
*/
- w = cfs_rq_load_avg(se->my_q) + wl;
+ w += wl;
/*
* wl = S * s'_i; see (2)
@@ -4954,6 +5041,7 @@ static int sched_group_energy(struct energy_env *eenv)
} while (sg = sg->next, sg != sd->groups);
}
next_cpu:
+ cpumask_clear_cpu(cpu, &visit_cpus);
continue;
}
@@ -4966,44 +5054,6 @@ static inline bool cpu_in_sg(struct sched_group *sg, int cpu)
return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg));
}
-#ifdef CONFIG_SCHED_TUNE
-static int energy_diff_evaluate(struct energy_env *eenv)
-{
- unsigned int boost;
- int nrg_delta;
-
- /* Return energy diff when boost margin is 0 */
-#ifdef CONFIG_CGROUP_SCHEDTUNE
- boost = schedtune_task_boost(eenv->task);
-#else
- boost = get_sysctl_sched_cfs_boost();
-#endif
- if (boost == 0)
- return eenv->nrg.diff;
-
- /* Compute normalized energy diff */
- nrg_delta = schedtune_normalize_energy(eenv->nrg.diff);
- eenv->nrg.delta = nrg_delta;
-
- eenv->payoff = schedtune_accept_deltas(
- eenv->nrg.delta,
- eenv->cap.delta,
- eenv->task);
-
- /*
- * When SchedTune is enabled, the energy_diff() function will return
- * the computed energy payoff value. Since the energy_diff() return
- * value is expected to be negative by its callers, this evaluation
- * function return a negative value each time the evaluation return a
- * positive payoff, which is the condition for the acceptance of
- * a scheduling decision
- */
- return -eenv->payoff;
-}
-#else /* CONFIG_SCHED_TUNE */
-#define energy_diff_evaluate(eenv) eenv->nrg.diff
-#endif
-
/*
* energy_diff(): Estimate the energy impact of changing the utilization
* distribution. eenv specifies the change: utilisation amount, source, and
@@ -5011,7 +5061,7 @@ static int energy_diff_evaluate(struct energy_env *eenv)
* utilization is removed from or added to the system (e.g. task wake-up). If
* both are specified, the utilization is migrated.
*/
-static int energy_diff(struct energy_env *eenv)
+static inline int __energy_diff(struct energy_env *eenv)
{
struct sched_domain *sd;
struct sched_group *sg;
@@ -5059,9 +5109,86 @@ static int energy_diff(struct energy_env *eenv)
eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before;
eenv->payoff = 0;
- return energy_diff_evaluate(eenv);
+ trace_sched_energy_diff(eenv->task,
+ eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
+ eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
+ eenv->cap.before, eenv->cap.after, eenv->cap.delta,
+ eenv->nrg.delta, eenv->payoff);
+
+ return eenv->nrg.diff;
}
+#ifdef CONFIG_SCHED_TUNE
+
+struct target_nrg schedtune_target_nrg;
+
+/*
+ * System energy normalization
+ * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE],
+ * corresponding to the specified energy variation.
+ */
+static inline int
+normalize_energy(int energy_diff)
+{
+ u32 normalized_nrg;
+#ifdef CONFIG_SCHED_DEBUG
+ int max_delta;
+
+ /* Check for boundaries */
+ max_delta = schedtune_target_nrg.max_power;
+ max_delta -= schedtune_target_nrg.min_power;
+ WARN_ON(abs(energy_diff) >= max_delta);
+#endif
+
+ /* Do scaling using positive numbers to increase the range */
+ normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff;
+
+ /* Scale by energy magnitude */
+ normalized_nrg <<= SCHED_LOAD_SHIFT;
+
+ /* Normalize on max energy for target platform */
+ normalized_nrg = reciprocal_divide(
+ normalized_nrg, schedtune_target_nrg.rdiv);
+
+ return (energy_diff < 0) ? -normalized_nrg : normalized_nrg;
+}
+
+static inline int
+energy_diff(struct energy_env *eenv)
+{
+ int boost = schedtune_task_boost(eenv->task);
+ int nrg_delta;
+
+ /* Conpute "absolute" energy diff */
+ __energy_diff(eenv);
+
+ /* Return energy diff when boost margin is 0 */
+ if (boost == 0)
+ return eenv->nrg.diff;
+
+ /* Compute normalized energy diff */
+ nrg_delta = normalize_energy(eenv->nrg.diff);
+ eenv->nrg.delta = nrg_delta;
+
+ eenv->payoff = schedtune_accept_deltas(
+ eenv->nrg.delta,
+ eenv->cap.delta,
+ eenv->task);
+
+ /*
+ * When SchedTune is enabled, the energy_diff() function will return
+ * the computed energy payoff value. Since the energy_diff() return
+ * value is expected to be negative by its callers, this evaluation
+ * function return a negative value each time the evaluation return a
+ * positive payoff, which is the condition for the acceptance of
+ * a scheduling decision
+ */
+ return -eenv->payoff;
+}
+#else /* CONFIG_SCHED_TUNE */
+#define energy_diff(eenv) __energy_diff(eenv)
+#endif
+
/*
* Detect M:N waker/wakee relationships via a switching-frequency heuristic.
* A waker of many should wake a different task than the one last awakened
@@ -5155,6 +5282,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
static inline unsigned long task_util(struct task_struct *p)
{
+#ifdef CONFIG_SCHED_WALT
+ if (!walt_disabled && sysctl_sched_use_walt_task_util) {
+ unsigned long demand = p->ravg.demand;
+ return (demand << 10) / walt_ravg_window;
+ }
+#endif
return p->se.avg.util_avg;
}
@@ -5197,22 +5330,25 @@ static bool cpu_overutilized(int cpu)
#ifdef CONFIG_SCHED_TUNE
-static unsigned long
-schedtune_margin(unsigned long signal, unsigned long boost)
+static long
+schedtune_margin(unsigned long signal, long boost)
{
- unsigned long long margin = 0;
+ long long margin = 0;
/*
* Signal proportional compensation (SPC)
*
* The Boost (B) value is used to compute a Margin (M) which is
* proportional to the complement of the original Signal (S):
- * M = B * (SCHED_LOAD_SCALE - S)
+ * M = B * (SCHED_LOAD_SCALE - S), if B is positive
+ * M = B * S, if B is negative
* The obtained M could be used by the caller to "boost" S.
*/
- margin = SCHED_LOAD_SCALE - signal;
- margin *= boost;
-
+ if (boost >= 0) {
+ margin = SCHED_LOAD_SCALE - signal;
+ margin *= boost;
+ } else
+ margin = -signal * boost;
/*
* Fast integer division by constant:
* Constant : (C) = 100
@@ -5228,37 +5364,29 @@ schedtune_margin(unsigned long signal, unsigned long boost)
margin *= 1311;
margin >>= 17;
+ if (boost < 0)
+ margin *= -1;
return margin;
}
-static inline unsigned int
+static inline int
schedtune_cpu_margin(unsigned long util, int cpu)
{
- unsigned int boost;
+ int boost = schedtune_cpu_boost(cpu);
-#ifdef CONFIG_CGROUP_SCHEDTUNE
- boost = schedtune_cpu_boost(cpu);
-#else
- boost = get_sysctl_sched_cfs_boost();
-#endif
if (boost == 0)
return 0;
return schedtune_margin(util, boost);
}
-static inline unsigned long
+static inline long
schedtune_task_margin(struct task_struct *task)
{
- unsigned int boost;
+ int boost = schedtune_task_boost(task);
unsigned long util;
- unsigned long margin;
+ long margin;
-#ifdef CONFIG_CGROUP_SCHEDTUNE
- boost = schedtune_task_boost(task);
-#else
- boost = get_sysctl_sched_cfs_boost();
-#endif
if (boost == 0)
return 0;
@@ -5270,13 +5398,13 @@ schedtune_task_margin(struct task_struct *task)
#else /* CONFIG_SCHED_TUNE */
-static inline unsigned int
+static inline int
schedtune_cpu_margin(unsigned long util, int cpu)
{
return 0;
}
-static inline unsigned int
+static inline int
schedtune_task_margin(struct task_struct *task)
{
return 0;
@@ -5288,7 +5416,7 @@ static inline unsigned long
boosted_cpu_util(int cpu)
{
unsigned long util = cpu_util(cpu);
- unsigned long margin = schedtune_cpu_margin(util, cpu);
+ long margin = schedtune_cpu_margin(util, cpu);
trace_sched_boost_cpu(cpu, util, margin);
@@ -5299,7 +5427,9 @@ static inline unsigned long
boosted_task_util(struct task_struct *task)
{
unsigned long util = task_util(task);
- unsigned long margin = schedtune_task_margin(task);
+ long margin = schedtune_task_margin(task);
+
+ trace_sched_boost_task(task, util, margin);
return util + margin;
}
@@ -5455,15 +5585,20 @@ static int select_idle_sibling(struct task_struct *p, int target)
struct sched_domain *sd;
struct sched_group *sg;
int i = task_cpu(p);
+ int best_idle = -1;
+ int best_idle_cstate = -1;
+ int best_idle_capacity = INT_MAX;
- if (idle_cpu(target))
- return target;
+ if (!sysctl_sched_cstate_aware) {
+ if (idle_cpu(target))
+ return target;
- /*
- * If the prevous cpu is cache affine and idle, don't be stupid.
- */
- if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
- return i;
+ /*
+ * If the prevous cpu is cache affine and idle, don't be stupid.
+ */
+ if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
+ return i;
+ }
/*
* Otherwise, iterate the domains and find an elegible idle cpu.
@@ -5476,30 +5611,165 @@ static int select_idle_sibling(struct task_struct *p, int target)
tsk_cpus_allowed(p)))
goto next;
- for_each_cpu(i, sched_group_cpus(sg)) {
- if (i == target || !idle_cpu(i))
- goto next;
- }
+ if (sysctl_sched_cstate_aware) {
+ for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
+ struct rq *rq = cpu_rq(i);
+ int idle_idx = idle_get_state_idx(rq);
+ unsigned long new_usage = boosted_task_util(p);
+ unsigned long capacity_orig = capacity_orig_of(i);
+ if (new_usage > capacity_orig || !idle_cpu(i))
+ goto next;
+
+ if (i == target && new_usage <= capacity_curr_of(target))
+ return target;
+
+ if (best_idle < 0 || (idle_idx < best_idle_cstate && capacity_orig <= best_idle_capacity)) {
+ best_idle = i;
+ best_idle_cstate = idle_idx;
+ best_idle_capacity = capacity_orig;
+ }
+ }
+ } else {
+ for_each_cpu(i, sched_group_cpus(sg)) {
+ if (i == target || !idle_cpu(i))
+ goto next;
+ }
- target = cpumask_first_and(sched_group_cpus(sg),
+ target = cpumask_first_and(sched_group_cpus(sg),
tsk_cpus_allowed(p));
- goto done;
+ goto done;
+ }
next:
sg = sg->next;
} while (sg != sd->groups);
}
+ if (best_idle > 0)
+ target = best_idle;
+
done:
return target;
}
-static int energy_aware_wake_cpu(struct task_struct *p, int target)
+static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle)
+{
+ int iter_cpu;
+ int target_cpu = -1;
+ int target_util = 0;
+ int backup_capacity = 0;
+ int best_idle_cpu = -1;
+ int best_idle_cstate = INT_MAX;
+ int backup_cpu = -1;
+ unsigned long task_util_boosted, new_util;
+
+ task_util_boosted = boosted_task_util(p);
+ for (iter_cpu = 0; iter_cpu < NR_CPUS; iter_cpu++) {
+ int cur_capacity;
+ struct rq *rq;
+ int idle_idx;
+
+ /*
+ * Iterate from higher cpus for boosted tasks.
+ */
+ int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu;
+
+ if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p)))
+ continue;
+
+ /*
+ * p's blocked utilization is still accounted for on prev_cpu
+ * so prev_cpu will receive a negative bias due to the double
+ * accounting. However, the blocked utilization may be zero.
+ */
+ new_util = cpu_util(i) + task_util_boosted;
+
+ /*
+ * Ensure minimum capacity to grant the required boost.
+ * The target CPU can be already at a capacity level higher
+ * than the one required to boost the task.
+ */
+ if (new_util > capacity_orig_of(i))
+ continue;
+
+#ifdef CONFIG_SCHED_WALT
+ if (walt_cpu_high_irqload(i))
+ continue;
+#endif
+ /*
+ * Unconditionally favoring tasks that prefer idle cpus to
+ * improve latency.
+ */
+ if (idle_cpu(i) && prefer_idle) {
+ if (best_idle_cpu < 0)
+ best_idle_cpu = i;
+ continue;
+ }
+
+ cur_capacity = capacity_curr_of(i);
+ rq = cpu_rq(i);
+ idle_idx = idle_get_state_idx(rq);
+
+ if (new_util < cur_capacity) {
+ if (cpu_rq(i)->nr_running) {
+ if (prefer_idle) {
+ /* Find a target cpu with highest
+ * utilization.
+ */
+ if (target_util == 0 ||
+ target_util < new_util) {
+ target_cpu = i;
+ target_util = new_util;
+ }
+ } else {
+ /* Find a target cpu with lowest
+ * utilization.
+ */
+ if (target_util == 0 ||
+ target_util > new_util) {
+ target_cpu = i;
+ target_util = new_util;
+ }
+ }
+ } else if (!prefer_idle) {
+ if (best_idle_cpu < 0 ||
+ (sysctl_sched_cstate_aware &&
+ best_idle_cstate > idle_idx)) {
+ best_idle_cstate = idle_idx;
+ best_idle_cpu = i;
+ }
+ }
+ } else if (backup_capacity == 0 ||
+ backup_capacity > cur_capacity) {
+ // Find a backup cpu with least capacity.
+ backup_capacity = cur_capacity;
+ backup_cpu = i;
+ }
+ }
+
+ if (prefer_idle && best_idle_cpu >= 0)
+ target_cpu = best_idle_cpu;
+ else if (target_cpu < 0)
+ target_cpu = best_idle_cpu >= 0 ? best_idle_cpu : backup_cpu;
+
+ return target_cpu;
+}
+
+static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync)
{
struct sched_domain *sd;
struct sched_group *sg, *sg_target;
int target_max_cap = INT_MAX;
int target_cpu = task_cpu(p);
+ unsigned long task_util_boosted, new_util;
int i;
+ if (sysctl_sched_sync_hint_enable && sync) {
+ int cpu = smp_processor_id();
+ cpumask_t search_cpus;
+ cpumask_and(&search_cpus, tsk_cpus_allowed(p), cpu_online_mask);
+ if (cpumask_test_cpu(cpu, &search_cpus))
+ return cpu;
+ }
+
sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p)));
if (!sd)
@@ -5508,50 +5778,76 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target)
sg = sd->groups;
sg_target = sg;
- /*
- * Find group with sufficient capacity. We only get here if no cpu is
- * overutilized. We may end up overutilizing a cpu by adding the task,
- * but that should not be any worse than select_idle_sibling().
- * load_balance() should sort it out later as we get above the tipping
- * point.
- */
- do {
- /* Assuming all cpus are the same in group */
- int max_cap_cpu = group_first_cpu(sg);
+ if (sysctl_sched_is_big_little) {
/*
- * Assume smaller max capacity means more energy-efficient.
- * Ideally we should query the energy model for the right
- * answer but it easily ends up in an exhaustive search.
+ * Find group with sufficient capacity. We only get here if no cpu is
+ * overutilized. We may end up overutilizing a cpu by adding the task,
+ * but that should not be any worse than select_idle_sibling().
+ * load_balance() should sort it out later as we get above the tipping
+ * point.
*/
- if (capacity_of(max_cap_cpu) < target_max_cap &&
- task_fits_max(p, max_cap_cpu)) {
- sg_target = sg;
- target_max_cap = capacity_of(max_cap_cpu);
- }
- } while (sg = sg->next, sg != sd->groups);
+ do {
+ /* Assuming all cpus are the same in group */
+ int max_cap_cpu = group_first_cpu(sg);
- /* Find cpu with sufficient capacity */
- for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) {
- /*
- * p's blocked utilization is still accounted for on prev_cpu
- * so prev_cpu will receive a negative bias due to the double
- * accounting. However, the blocked utilization may be zero.
- */
- int new_util = cpu_util(i) + boosted_task_util(p);
+ /*
+ * Assume smaller max capacity means more energy-efficient.
+ * Ideally we should query the energy model for the right
+ * answer but it easily ends up in an exhaustive search.
+ */
+ if (capacity_of(max_cap_cpu) < target_max_cap &&
+ task_fits_max(p, max_cap_cpu)) {
+ sg_target = sg;
+ target_max_cap = capacity_of(max_cap_cpu);
+ }
+ } while (sg = sg->next, sg != sd->groups);
- if (new_util > capacity_orig_of(i))
- continue;
+ task_util_boosted = boosted_task_util(p);
+ /* Find cpu with sufficient capacity */
+ for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) {
+ /*
+ * p's blocked utilization is still accounted for on prev_cpu
+ * so prev_cpu will receive a negative bias due to the double
+ * accounting. However, the blocked utilization may be zero.
+ */
+ new_util = cpu_util(i) + task_util_boosted;
- if (new_util < capacity_curr_of(i)) {
- target_cpu = i;
- if (cpu_rq(i)->nr_running)
- break;
- }
+ /*
+ * Ensure minimum capacity to grant the required boost.
+ * The target CPU can be already at a capacity level higher
+ * than the one required to boost the task.
+ */
+ if (new_util > capacity_orig_of(i))
+ continue;
+
+ if (new_util < capacity_curr_of(i)) {
+ target_cpu = i;
+ if (cpu_rq(i)->nr_running)
+ break;
+ }
- /* cpu has capacity at higher OPP, keep it as fallback */
- if (target_cpu == task_cpu(p))
- target_cpu = i;
+ /* cpu has capacity at higher OPP, keep it as fallback */
+ if (target_cpu == task_cpu(p))
+ target_cpu = i;
+ }
+ } else {
+ /*
+ * Find a cpu with sufficient capacity
+ */
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+ bool boosted = schedtune_task_boost(p) > 0;
+ bool prefer_idle = schedtune_prefer_idle(p) > 0;
+#else
+ bool boosted = 0;
+ bool prefer_idle = 0;
+#endif
+ int tmp_target = find_best_target(p, boosted, prefer_idle);
+ if (tmp_target >= 0) {
+ target_cpu = tmp_target;
+ if ((boosted || prefer_idle) && idle_cpu(target_cpu))
+ return target_cpu;
+ }
}
if (target_cpu != task_cpu(p)) {
@@ -5628,7 +5924,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (!sd) {
if (energy_aware() && !cpu_rq(cpu)->rd->overutilized)
- new_cpu = energy_aware_wake_cpu(p, prev_cpu);
+ new_cpu = energy_aware_wake_cpu(p, prev_cpu, sync);
else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
new_cpu = select_idle_sibling(p, new_cpu);
@@ -5699,6 +5995,8 @@ static void task_dead_fair(struct task_struct *p)
{
remove_entity_load_avg(&p->se);
}
+#else
+#define task_fits_max(p, cpu) true
#endif /* CONFIG_SMP */
static unsigned long
@@ -6404,7 +6702,9 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
deactivate_task(env->src_rq, p, 0);
p->on_rq = TASK_ON_RQ_MIGRATING;
+ double_lock_balance(env->src_rq, env->dst_rq);
set_task_cpu(p, env->dst_cpu);
+ double_unlock_balance(env->src_rq, env->dst_rq);
}
/*
@@ -6825,7 +7125,10 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
mcc->cpu = cpu;
#ifdef CONFIG_SCHED_DEBUG
raw_spin_unlock_irqrestore(&mcc->lock, flags);
- //pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity);
+/*
+ printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n",
+ cpu, capacity);
+*/
goto skip_unlock;
#endif
}
@@ -7048,7 +7351,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
bool *overload, bool *overutilized)
{
unsigned long load;
- int i;
+ int i, nr_running;
memset(sgs, 0, sizeof(*sgs));
@@ -7065,7 +7368,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_util += cpu_util(i);
sgs->sum_nr_running += rq->cfs.h_nr_running;
- if (rq->nr_running > 1)
+ nr_running = rq->nr_running;
+ if (nr_running > 1)
*overload = true;
#ifdef CONFIG_NUMA_BALANCING
@@ -7073,7 +7377,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->nr_preferred_running += rq->nr_preferred_running;
#endif
sgs->sum_weighted_load += weighted_cpuload(i);
- if (idle_cpu(i))
+ /*
+ * No need to call idle_cpu() if nr_running is not 0
+ */
+ if (!nr_running && idle_cpu(i))
sgs->idle_cpus++;
if (cpu_overutilized(i)) {
@@ -7279,12 +7586,17 @@ next_group:
env->dst_rq->rd->overload = overload;
/* Update over-utilization (tipping point, U >= 0) indicator */
- if (env->dst_rq->rd->overutilized != overutilized)
+ if (env->dst_rq->rd->overutilized != overutilized) {
env->dst_rq->rd->overutilized = overutilized;
+ trace_sched_overutilized(overutilized);
+ }
} else {
- if (!env->dst_rq->rd->overutilized && overutilized)
+ if (!env->dst_rq->rd->overutilized && overutilized) {
env->dst_rq->rd->overutilized = true;
+ trace_sched_overutilized(true);
+ }
}
+
}
/**
@@ -8078,6 +8390,7 @@ static int idle_balance(struct rq *this_rq)
struct sched_domain *sd;
int pulled_task = 0;
u64 curr_cost = 0;
+ long removed_util=0;
idle_enter_fair(this_rq);
@@ -8101,6 +8414,17 @@ static int idle_balance(struct rq *this_rq)
raw_spin_unlock(&this_rq->lock);
+ /*
+ * If removed_util_avg is !0 we most probably migrated some task away
+ * from this_cpu. In this case we might be willing to trigger an OPP
+ * update, but we want to do so if we don't find anybody else to pull
+ * here (we will trigger an OPP update with the pulled task's enqueue
+ * anyway).
+ *
+ * Record removed_util before calling update_blocked_averages, and use
+ * it below (before returning) to see if an OPP update is required.
+ */
+ removed_util = atomic_long_read(&(this_rq->cfs).removed_util_avg);
update_blocked_averages(this_cpu);
rcu_read_lock();
for_each_domain(this_cpu, sd) {
@@ -8165,6 +8489,12 @@ out:
if (pulled_task) {
idle_exit_fair(this_rq);
this_rq->idle_stamp = 0;
+ } else if (removed_util) {
+ /*
+ * No task pulled and someone has been migrated away.
+ * Good case to trigger an OPP update.
+ */
+ update_capacity_of(this_cpu);
}
return pulled_task;
@@ -8723,10 +9053,15 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
- if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr)))
+#ifdef CONFIG_SMP
+ if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
rq->rd->overutilized = true;
+ trace_sched_overutilized(true);
+ }
rq->misfit_task = !task_fits_max(curr, rq->cpu);
+#endif
+
}
/*
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index b634151ce286..55e461055332 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -73,4 +73,8 @@ SCHED_FEAT(ATTACH_AGE_LOAD, true)
* Energy aware scheduling. Use platform energy model to guide scheduling
* decisions optimizing for energy efficiency.
*/
+#ifdef CONFIG_DEFAULT_USE_ENERGY_AWARE
+SCHED_FEAT(ENERGY_AWARE, true)
+#else
SCHED_FEAT(ENERGY_AWARE, false)
+#endif
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index cbc130efbc5b..917c94abf5bb 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -220,6 +220,7 @@ static void cpu_idle_loop(void)
*/
__current_set_polling();
+ quiet_vmstat();
tick_nohz_idle_enter();
while (!need_resched()) {
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 9694204660b7..8a16cba968c4 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -8,6 +8,8 @@
#include <linux/slab.h>
#include <linux/irq_work.h>
+#include "walt.h"
+
int sched_rr_timeslice = RR_TIMESLICE;
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
@@ -889,6 +891,51 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
return rt_task_of(rt_se)->prio;
}
+static void dump_throttled_rt_tasks(struct rt_rq *rt_rq)
+{
+ struct rt_prio_array *array = &rt_rq->active;
+ struct sched_rt_entity *rt_se;
+ char buf[500];
+ char *pos = buf;
+ char *end = buf + sizeof(buf);
+ int idx;
+
+ pos += snprintf(pos, sizeof(buf),
+ "sched: RT throttling activated for rt_rq %p (cpu %d)\n",
+ rt_rq, cpu_of(rq_of_rt_rq(rt_rq)));
+
+ if (bitmap_empty(array->bitmap, MAX_RT_PRIO))
+ goto out;
+
+ pos += snprintf(pos, end - pos, "potential CPU hogs:\n");
+ idx = sched_find_first_bit(array->bitmap);
+ while (idx < MAX_RT_PRIO) {
+ list_for_each_entry(rt_se, array->queue + idx, run_list) {
+ struct task_struct *p;
+
+ if (!rt_entity_is_task(rt_se))
+ continue;
+
+ p = rt_task_of(rt_se);
+ if (pos < end)
+ pos += snprintf(pos, end - pos, "\t%s (%d)\n",
+ p->comm, p->pid);
+ }
+ idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx + 1);
+ }
+out:
+#ifdef CONFIG_PANIC_ON_RT_THROTTLING
+ /*
+ * Use pr_err() in the BUG() case since printk_sched() will
+ * not get flushed and deadlock is not a concern.
+ */
+ pr_err("%s", buf);
+ BUG();
+#else
+ printk_deferred("%s", buf);
+#endif
+}
+
static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{
u64 runtime = sched_rt_runtime(rt_rq);
@@ -912,8 +959,14 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
* but accrue some time due to boosting.
*/
if (likely(rt_b->rt_runtime)) {
+ static bool once = false;
+
rt_rq->rt_throttled = 1;
- printk_deferred_once("sched: RT throttling activated\n");
+
+ if (!once) {
+ once = true;
+ dump_throttled_rt_tasks(rt_rq);
+ }
} else {
/*
* In case we did anyway, make it go away,
@@ -1261,6 +1314,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
rt_se->timeout = 0;
enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
+ walt_inc_cumulative_runnable_avg(rq, p);
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
@@ -1272,6 +1326,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
update_curr_rt(rq);
dequeue_rt_entity(rt_se);
+ walt_dec_cumulative_runnable_avg(rq, p);
dequeue_pushable_task(rq, p);
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a537f1864dd0..2f2b959ad244 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -410,6 +410,10 @@ struct cfs_rq {
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */
+#ifdef CONFIG_SCHED_WALT
+ u64 cumulative_runnable_avg;
+#endif
+
#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
u64 runtime_expires;
@@ -594,6 +598,14 @@ struct rq {
#ifdef CONFIG_NO_HZ_FULL
unsigned long last_sched_tick;
#endif
+
+#ifdef CONFIG_CPU_QUIET
+ /* time-based average load */
+ u64 nr_last_stamp;
+ u64 nr_running_integral;
+ seqcount_t ave_seqcnt;
+#endif
+
/* capture load from *all* tasks on this cpu: */
struct load_weight load;
unsigned long nr_load_updates;
@@ -655,6 +667,30 @@ struct rq {
u64 max_idle_balance_cost;
#endif
+#ifdef CONFIG_SCHED_WALT
+ /*
+ * max_freq = user or thermal defined maximum
+ * max_possible_freq = maximum supported by hardware
+ */
+ unsigned int cur_freq, max_freq, min_freq, max_possible_freq;
+ struct cpumask freq_domain_cpumask;
+
+ u64 cumulative_runnable_avg;
+ int efficiency; /* Differentiate cpus with different IPC capability */
+ int load_scale_factor;
+ int capacity;
+ int max_possible_capacity;
+ u64 window_start;
+ u64 curr_runnable_sum;
+ u64 prev_runnable_sum;
+ u64 nt_curr_runnable_sum;
+ u64 nt_prev_runnable_sum;
+ u64 cur_irqload;
+ u64 avg_irqload;
+ u64 irqload_ts;
+#endif /* CONFIG_SCHED_WALT */
+
+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
#endif
@@ -1269,6 +1305,7 @@ extern const struct sched_class idle_sched_class;
#ifdef CONFIG_SMP
+extern void init_max_cpu_capacity(struct max_cpu_capacity *mcc);
extern void update_group_capacity(struct sched_domain *sd, int cpu);
extern void trigger_load_balance(struct rq *rq);
@@ -1351,9 +1388,7 @@ unsigned long to_ratio(u64 period, u64 runtime);
extern void init_entity_runnable_average(struct sched_entity *se);
-extern void init_max_cpu_capacity(struct max_cpu_capacity *mcc);
-
-static inline void add_nr_running(struct rq *rq, unsigned count)
+static inline void __add_nr_running(struct rq *rq, unsigned count)
{
unsigned prev_nr = rq->nr_running;
@@ -1381,11 +1416,48 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
}
}
-static inline void sub_nr_running(struct rq *rq, unsigned count)
+static inline void __sub_nr_running(struct rq *rq, unsigned count)
{
rq->nr_running -= count;
}
+#ifdef CONFIG_CPU_QUIET
+#define NR_AVE_SCALE(x) ((x) << FSHIFT)
+static inline u64 do_nr_running_integral(struct rq *rq)
+{
+ s64 nr, deltax;
+ u64 nr_running_integral = rq->nr_running_integral;
+
+ deltax = rq->clock_task - rq->nr_last_stamp;
+ nr = NR_AVE_SCALE(rq->nr_running);
+
+ nr_running_integral += nr * deltax;
+
+ return nr_running_integral;
+}
+
+static inline void add_nr_running(struct rq *rq, unsigned count)
+{
+ write_seqcount_begin(&rq->ave_seqcnt);
+ rq->nr_running_integral = do_nr_running_integral(rq);
+ rq->nr_last_stamp = rq->clock_task;
+ __add_nr_running(rq, count);
+ write_seqcount_end(&rq->ave_seqcnt);
+}
+
+static inline void sub_nr_running(struct rq *rq, unsigned count)
+{
+ write_seqcount_begin(&rq->ave_seqcnt);
+ rq->nr_running_integral = do_nr_running_integral(rq);
+ rq->nr_last_stamp = rq->clock_task;
+ __sub_nr_running(rq, count);
+ write_seqcount_end(&rq->ave_seqcnt);
+}
+#else
+#define add_nr_running __add_nr_running
+#define sub_nr_running __sub_nr_running
+#endif
+
static inline void rq_last_tick_reset(struct rq *rq)
{
#ifdef CONFIG_NO_HZ_FULL
@@ -1469,6 +1541,10 @@ static inline unsigned long capacity_orig_of(int cpu)
return cpu_rq(cpu)->cpu_capacity_orig;
}
+extern unsigned int sysctl_sched_use_walt_cpu_util;
+extern unsigned int walt_ravg_window;
+extern unsigned int walt_disabled;
+
/*
* cpu_util returns the amount of capacity of a CPU that is used by CFS
* tasks. The unit of the return value must be the one of capacity so we can
@@ -1500,6 +1576,12 @@ static inline unsigned long __cpu_util(int cpu, int delta)
unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
unsigned long capacity = capacity_orig_of(cpu);
+#ifdef CONFIG_SCHED_WALT
+ if (!walt_disabled && sysctl_sched_use_walt_cpu_util) {
+ util = cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT;
+ do_div(util, walt_ravg_window);
+ }
+#endif
delta += util;
if (delta < 0)
return 0;
@@ -1530,8 +1612,27 @@ void update_cpu_capacity_request(int cpu, bool request);
static inline void set_cfs_cpu_capacity(int cpu, bool request,
unsigned long capacity)
{
- if (per_cpu(cpu_sched_capacity_reqs, cpu).cfs != capacity) {
- per_cpu(cpu_sched_capacity_reqs, cpu).cfs = capacity;
+ struct sched_capacity_reqs *scr = &per_cpu(cpu_sched_capacity_reqs, cpu);
+
+#ifdef CONFIG_SCHED_WALT
+ if (!walt_disabled && sysctl_sched_use_walt_cpu_util) {
+ int rtdl = scr->rt + scr->dl;
+ /*
+ * WALT tracks the utilization of a CPU considering the load
+ * generated by all the scheduling classes.
+ * Since the following call to:
+ * update_cpu_capacity
+ * is already adding the RT and DL utilizations let's remove
+ * these contributions from the WALT signal.
+ */
+ if (capacity > rtdl)
+ capacity -= rtdl;
+ else
+ capacity = 0;
+ }
+#endif
+ if (scr->cfs != capacity) {
+ scr->cfs = capacity;
update_cpu_capacity_request(cpu, request);
}
}
@@ -1657,6 +1758,9 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
}
+extern struct rq *lock_rq_of(struct task_struct *p, unsigned long *flags);
+extern void unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags);
+
#ifdef CONFIG_SMP
#ifdef CONFIG_PREEMPT
@@ -1729,7 +1833,8 @@ static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
__releases(busiest->lock)
{
- raw_spin_unlock(&busiest->lock);
+ if (this_rq != busiest)
+ raw_spin_unlock(&busiest->lock);
lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
}
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index cbc67da10954..61f852d46858 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -1,4 +1,5 @@
#include "sched.h"
+#include "walt.h"
/*
* stop-task scheduling class.
@@ -42,12 +43,14 @@ static void
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
add_nr_running(rq, 1);
+ walt_inc_cumulative_runnable_avg(rq, p);
}
static void
dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
sub_nr_running(rq, 1);
+ walt_dec_cumulative_runnable_avg(rq, p);
}
static void yield_task_stop(struct rq *rq)
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
index 9d8eeb438140..68a24a044b0a 100644
--- a/kernel/sched/tune.c
+++ b/kernel/sched/tune.c
@@ -3,24 +3,21 @@
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/printk.h>
-#include <linux/reciprocal_div.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <trace/events/sched.h>
#include "sched.h"
+#include "tune.h"
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+static bool schedtune_initialized = false;
+#endif
unsigned int sysctl_sched_cfs_boost __read_mostly;
-/*
- * System energy normalization constants
- */
-static struct target_nrg {
- unsigned long min_power;
- unsigned long max_power;
- struct reciprocal_value rdiv;
-} schedtune_target_nrg;
+extern struct target_nrg schedtune_target_nrg;
/* Performance Boost region (B) threshold params */
static int perf_boost_idx;
@@ -41,16 +38,16 @@ struct threshold_params {
*/
static struct threshold_params
threshold_gains[] = {
- { 0, 4 }, /* >= 0% */
- { 0, 4 }, /* >= 10% */
- { 1, 4 }, /* >= 20% */
- { 2, 4 }, /* >= 30% */
- { 3, 4 }, /* >= 40% */
- { 4, 3 }, /* >= 50% */
- { 4, 2 }, /* >= 60% */
- { 4, 1 }, /* >= 70% */
- { 4, 0 }, /* >= 80% */
- { 4, 0 } /* >= 90% */
+ { 0, 5 }, /* < 10% */
+ { 1, 5 }, /* < 20% */
+ { 2, 5 }, /* < 30% */
+ { 3, 5 }, /* < 40% */
+ { 4, 5 }, /* < 50% */
+ { 5, 4 }, /* < 60% */
+ { 5, 3 }, /* < 70% */
+ { 5, 2 }, /* < 80% */
+ { 5, 1 }, /* < 90% */
+ { 5, 0 } /* <= 100% */
};
static int
@@ -58,36 +55,51 @@ __schedtune_accept_deltas(int nrg_delta, int cap_delta,
int perf_boost_idx, int perf_constrain_idx)
{
int payoff = -INT_MAX;
+ int gain_idx = -1;
/* Performance Boost (B) region */
- if (nrg_delta > 0 && cap_delta > 0) {
- /*
- * Evaluate "Performance Boost" vs "Energy Increase"
- * payoff criteria:
- * cap_delta / nrg_delta < cap_gain / nrg_gain
- * which is:
- * nrg_delta * cap_gain > cap_delta * nrg_gain
- */
- payoff = nrg_delta * threshold_gains[perf_boost_idx].cap_gain;
- payoff -= cap_delta * threshold_gains[perf_boost_idx].nrg_gain;
- return payoff;
- }
-
+ if (nrg_delta >= 0 && cap_delta > 0)
+ gain_idx = perf_boost_idx;
/* Performance Constraint (C) region */
- if (nrg_delta < 0 && cap_delta < 0) {
- /*
- * Evaluate "Performance Boost" vs "Energy Increase"
- * payoff criteria:
- * cap_delta / nrg_delta > cap_gain / nrg_gain
- * which is:
- * cap_delta * nrg_gain > nrg_delta * cap_gain
- */
- payoff = cap_delta * threshold_gains[perf_constrain_idx].nrg_gain;
- payoff -= nrg_delta * threshold_gains[perf_constrain_idx].cap_gain;
- return payoff;
- }
+ else if (nrg_delta < 0 && cap_delta <= 0)
+ gain_idx = perf_constrain_idx;
/* Default: reject schedule candidate */
+ if (gain_idx == -1)
+ return payoff;
+
+ /*
+ * Evaluate "Performance Boost" vs "Energy Increase"
+ *
+ * - Performance Boost (B) region
+ *
+ * Condition: nrg_delta > 0 && cap_delta > 0
+ * Payoff criteria:
+ * cap_gain / nrg_gain < cap_delta / nrg_delta =
+ * cap_gain * nrg_delta < cap_delta * nrg_gain
+ * Note that since both nrg_gain and nrg_delta are positive, the
+ * inequality does not change. Thus:
+ *
+ * payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta)
+ *
+ * - Performance Constraint (C) region
+ *
+ * Condition: nrg_delta < 0 && cap_delta < 0
+ * payoff criteria:
+ * cap_gain / nrg_gain > cap_delta / nrg_delta =
+ * cap_gain * nrg_delta < cap_delta * nrg_gain
+ * Note that since nrg_gain > 0 while nrg_delta < 0, the
+ * inequality change. Thus:
+ *
+ * payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta)
+ *
+ * This means that, in case of same positive defined {cap,nrg}_gain
+ * for both the B and C regions, we can use the same payoff formula
+ * where a positive value represents the accept condition.
+ */
+ payoff = cap_delta * threshold_gains[gain_idx].nrg_gain;
+ payoff -= nrg_delta * threshold_gains[gain_idx].cap_gain;
+
return payoff;
}
@@ -113,6 +125,10 @@ struct schedtune {
/* Performance Constraint (C) region threshold params */
int perf_constrain_idx;
+
+ /* Hint to bias scheduling of tasks on that SchedTune CGroup
+ * towards idle CPUs */
+ int prefer_idle;
};
static inline struct schedtune *css_st(struct cgroup_subsys_state *css)
@@ -144,6 +160,7 @@ root_schedtune = {
.boost = 0,
.perf_boost_idx = 0,
.perf_constrain_idx = 0,
+ .prefer_idle = 0,
};
int
@@ -155,12 +172,16 @@ schedtune_accept_deltas(int nrg_delta, int cap_delta,
int perf_constrain_idx;
/* Optimal (O) region */
- if (nrg_delta < 0 && cap_delta > 0)
+ if (nrg_delta < 0 && cap_delta > 0) {
+ trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0);
return INT_MAX;
+ }
/* Suboptimal (S) region */
- if (nrg_delta > 0 && cap_delta < 0)
+ if (nrg_delta > 0 && cap_delta < 0) {
+ trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5);
return -INT_MAX;
+ }
/* Get task specific perf Boost/Constraints indexes */
rcu_read_lock();
@@ -202,13 +223,16 @@ static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = {
*/
struct boost_groups {
/* Maximum boost value for all RUNNABLE tasks on a CPU */
- unsigned boost_max;
+ bool idle;
+ int boost_max;
struct {
/* The boost for tasks on that boost group */
- unsigned boost;
+ int boost;
/* Count of RUNNABLE tasks on that boost group */
unsigned tasks;
} group[BOOSTGROUPS_COUNT];
+ /* CPU's boost group locking */
+ raw_spinlock_t lock;
};
/* Boost groups affecting each CPU in the system */
@@ -218,7 +242,7 @@ static void
schedtune_cpu_update(int cpu)
{
struct boost_groups *bg;
- unsigned boost_max;
+ int boost_max;
int idx;
bg = &per_cpu(cpu_boost_groups, cpu);
@@ -232,9 +256,13 @@ schedtune_cpu_update(int cpu)
*/
if (bg->group[idx].tasks == 0)
continue;
+
boost_max = max(boost_max, bg->group[idx].boost);
}
-
+ /* Ensures boost_max is non-negative when all cgroup boost values
+ * are neagtive. Avoids under-accounting of cpu capacity which may cause
+ * task stacking and frequency spikes.*/
+ boost_max = max(boost_max, 0);
bg->boost_max = boost_max;
}
@@ -281,28 +309,24 @@ schedtune_boostgroup_update(int idx, int boost)
return 0;
}
+#define ENQUEUE_TASK 1
+#define DEQUEUE_TASK -1
+
static inline void
schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
{
- struct boost_groups *bg;
- int tasks;
-
- bg = &per_cpu(cpu_boost_groups, cpu);
+ struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+ int tasks = bg->group[idx].tasks + task_count;
/* Update boosted tasks count while avoiding to make it negative */
- if (task_count < 0 && bg->group[idx].tasks <= -task_count)
- bg->group[idx].tasks = 0;
- else
- bg->group[idx].tasks += task_count;
-
- /* Boost group activation or deactivation on that RQ */
- tasks = bg->group[idx].tasks;
- if (tasks == 1 || tasks == 0)
- schedtune_cpu_update(cpu);
+ bg->group[idx].tasks = max(0, tasks);
trace_sched_tune_tasks_update(p, cpu, tasks, idx,
bg->group[idx].boost, bg->boost_max);
+ /* Boost group activation or deactivation on that RQ */
+ if (tasks == 1 || tasks == 0)
+ schedtune_cpu_update(cpu);
}
/*
@@ -310,9 +334,14 @@ schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
*/
void schedtune_enqueue_task(struct task_struct *p, int cpu)
{
+ struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+ unsigned long irq_flags;
struct schedtune *st;
int idx;
+ if (!unlikely(schedtune_initialized))
+ return;
+
/*
* When a task is marked PF_EXITING by do_exit() it's going to be
* dequeued and enqueued multiple times in the exit path.
@@ -322,13 +351,103 @@ void schedtune_enqueue_task(struct task_struct *p, int cpu)
if (p->flags & PF_EXITING)
return;
- /* Get task boost group */
+ /*
+ * Boost group accouting is protected by a per-cpu lock and requires
+ * interrupt to be disabled to avoid race conditions for example on
+ * do_exit()::cgroup_exit() and task migration.
+ */
+ raw_spin_lock_irqsave(&bg->lock, irq_flags);
rcu_read_lock();
+
st = task_schedtune(p);
idx = st->idx;
+
+ schedtune_tasks_update(p, cpu, idx, ENQUEUE_TASK);
+
rcu_read_unlock();
+ raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
+}
+
+int schedtune_can_attach(struct cgroup_taskset *tset)
+{
+ struct task_struct *task;
+ struct cgroup_subsys_state *css;
+ struct boost_groups *bg;
+ unsigned long irq_flags;
+ unsigned int cpu;
+ struct rq *rq;
+ int src_bg; /* Source boost group index */
+ int dst_bg; /* Destination boost group index */
+ int tasks;
+
+ if (!unlikely(schedtune_initialized))
+ return 0;
+
+
+ cgroup_taskset_for_each(task, css, tset) {
+
+ /*
+ * Lock the CPU's RQ the task is enqueued to avoid race
+ * conditions with migration code while the task is being
+ * accounted
+ */
+ rq = lock_rq_of(task, &irq_flags);
+
+ if (!task->on_rq) {
+ unlock_rq_of(rq, task, &irq_flags);
+ continue;
+ }
+
+ /*
+ * Boost group accouting is protected by a per-cpu lock and requires
+ * interrupt to be disabled to avoid race conditions on...
+ */
+ cpu = cpu_of(rq);
+ bg = &per_cpu(cpu_boost_groups, cpu);
+ raw_spin_lock(&bg->lock);
+
+ dst_bg = css_st(css)->idx;
+ src_bg = task_schedtune(task)->idx;
+
+ /*
+ * Current task is not changing boostgroup, which can
+ * happen when the new hierarchy is in use.
+ */
+ if (unlikely(dst_bg == src_bg)) {
+ raw_spin_unlock(&bg->lock);
+ unlock_rq_of(rq, task, &irq_flags);
+ continue;
+ }
+
+ /*
+ * This is the case of a RUNNABLE task which is switching its
+ * current boost group.
+ */
+
+ /* Move task from src to dst boost group */
+ tasks = bg->group[src_bg].tasks - 1;
+ bg->group[src_bg].tasks = max(0, tasks);
+ bg->group[dst_bg].tasks += 1;
+
+ raw_spin_unlock(&bg->lock);
+ unlock_rq_of(rq, task, &irq_flags);
+
+ /* Update CPU boost group */
+ if (bg->group[src_bg].tasks == 0 || bg->group[dst_bg].tasks == 1)
+ schedtune_cpu_update(task_cpu(task));
+
+ }
- schedtune_tasks_update(p, cpu, idx, 1);
+ return 0;
+}
+
+void schedtune_cancel_attach(struct cgroup_taskset *tset)
+{
+ /* This can happen only if SchedTune controller is mounted with
+ * other hierarchies ane one of them fails. Since usually SchedTune is
+ * mouted on its own hierarcy, for the time being we do not implement
+ * a proper rollback mechanism */
+ WARN(1, "SchedTune cancel attach not implemented");
}
/*
@@ -336,26 +455,62 @@ void schedtune_enqueue_task(struct task_struct *p, int cpu)
*/
void schedtune_dequeue_task(struct task_struct *p, int cpu)
{
+ struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+ unsigned long irq_flags;
struct schedtune *st;
int idx;
+ if (!unlikely(schedtune_initialized))
+ return;
+
/*
* When a task is marked PF_EXITING by do_exit() it's going to be
* dequeued and enqueued multiple times in the exit path.
* Thus we avoid any further update, since we do not want to change
* CPU boosting while the task is exiting.
- * The last dequeue will be done by cgroup exit() callback.
+ * The last dequeue is already enforce by the do_exit() code path
+ * via schedtune_exit_task().
*/
if (p->flags & PF_EXITING)
return;
- /* Get task boost group */
+ /*
+ * Boost group accouting is protected by a per-cpu lock and requires
+ * interrupt to be disabled to avoid race conditions on...
+ */
+ raw_spin_lock_irqsave(&bg->lock, irq_flags);
rcu_read_lock();
+
st = task_schedtune(p);
idx = st->idx;
+
+ schedtune_tasks_update(p, cpu, idx, DEQUEUE_TASK);
+
rcu_read_unlock();
+ raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
+}
+
+void schedtune_exit_task(struct task_struct *tsk)
+{
+ struct schedtune *st;
+ unsigned long irq_flags;
+ unsigned int cpu;
+ struct rq *rq;
+ int idx;
+
+ if (!unlikely(schedtune_initialized))
+ return;
+
+ rq = lock_rq_of(tsk, &irq_flags);
+ rcu_read_lock();
- schedtune_tasks_update(p, cpu, idx, -1);
+ cpu = cpu_of(rq);
+ st = task_schedtune(tsk);
+ idx = st->idx;
+ schedtune_tasks_update(tsk, cpu, idx, DEQUEUE_TASK);
+
+ rcu_read_unlock();
+ unlock_rq_of(rq, tsk, &irq_flags);
}
int schedtune_cpu_boost(int cpu)
@@ -380,7 +535,39 @@ int schedtune_task_boost(struct task_struct *p)
return task_boost;
}
+int schedtune_prefer_idle(struct task_struct *p)
+{
+ struct schedtune *st;
+ int prefer_idle;
+
+ /* Get prefer_idle value */
+ rcu_read_lock();
+ st = task_schedtune(p);
+ prefer_idle = st->prefer_idle;
+ rcu_read_unlock();
+
+ return prefer_idle;
+}
+
static u64
+prefer_idle_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ struct schedtune *st = css_st(css);
+
+ return st->prefer_idle;
+}
+
+static int
+prefer_idle_write(struct cgroup_subsys_state *css, struct cftype *cft,
+ u64 prefer_idle)
+{
+ struct schedtune *st = css_st(css);
+ st->prefer_idle = prefer_idle;
+
+ return 0;
+}
+
+static s64
boost_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
struct schedtune *st = css_st(css);
@@ -390,16 +577,32 @@ boost_read(struct cgroup_subsys_state *css, struct cftype *cft)
static int
boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
- u64 boost)
+ s64 boost)
{
struct schedtune *st = css_st(css);
+ unsigned threshold_idx;
+ int boost_pct;
- if (boost < 0 || boost > 100)
+ if (boost < -100 || boost > 100)
return -EINVAL;
+ boost_pct = boost;
+
+ /*
+ * Update threshold params for Performance Boost (B)
+ * and Performance Constraint (C) regions.
+ * The current implementatio uses the same cuts for both
+ * B and C regions.
+ */
+ threshold_idx = clamp(boost_pct, 0, 99) / 10;
+ st->perf_boost_idx = threshold_idx;
+ st->perf_constrain_idx = threshold_idx;
st->boost = boost;
- if (css == &root_schedtune.css)
+ if (css == &root_schedtune.css) {
sysctl_sched_cfs_boost = boost;
+ perf_boost_idx = threshold_idx;
+ perf_constrain_idx = threshold_idx;
+ }
/* Update CPU boost */
schedtune_boostgroup_update(st->idx, st->boost);
@@ -412,8 +615,13 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
static struct cftype files[] = {
{
.name = "boost",
- .read_u64 = boost_read,
- .write_u64 = boost_write,
+ .read_s64 = boost_read,
+ .write_s64 = boost_write,
+ },
+ {
+ .name = "prefer_idle",
+ .read_u64 = prefer_idle_read,
+ .write_u64 = prefer_idle_write,
},
{ } /* terminate */
};
@@ -437,33 +645,14 @@ schedtune_boostgroup_init(struct schedtune *st)
return 0;
}
-static int
-schedtune_init(void)
-{
- struct boost_groups *bg;
- int cpu;
-
- /* Initialize the per CPU boost groups */
- for_each_possible_cpu(cpu) {
- bg = &per_cpu(cpu_boost_groups, cpu);
- memset(bg, 0, sizeof(struct boost_groups));
- }
-
- pr_info(" schedtune configured to support %d boost groups\n",
- BOOSTGROUPS_COUNT);
- return 0;
-}
-
static struct cgroup_subsys_state *
schedtune_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct schedtune *st;
int idx;
- if (!parent_css) {
- schedtune_init();
+ if (!parent_css)
return &root_schedtune.css;
- }
/* Allow only single level hierachies */
if (parent_css != &root_schedtune.css) {
@@ -520,10 +709,30 @@ schedtune_css_free(struct cgroup_subsys_state *css)
struct cgroup_subsys schedtune_cgrp_subsys = {
.css_alloc = schedtune_css_alloc,
.css_free = schedtune_css_free,
+ .can_attach = schedtune_can_attach,
+ .cancel_attach = schedtune_cancel_attach,
.legacy_cftypes = files,
.early_init = 1,
};
+static inline void
+schedtune_init_cgroups(void)
+{
+ struct boost_groups *bg;
+ int cpu;
+
+ /* Initialize the per CPU boost groups */
+ for_each_possible_cpu(cpu) {
+ bg = &per_cpu(cpu_boost_groups, cpu);
+ memset(bg, 0, sizeof(struct boost_groups));
+ }
+
+ pr_info("schedtune: configured to support %d boost groups\n",
+ BOOSTGROUPS_COUNT);
+
+ schedtune_initialized = true;
+}
+
#else /* CONFIG_CGROUP_SCHEDTUNE */
int
@@ -531,12 +740,16 @@ schedtune_accept_deltas(int nrg_delta, int cap_delta,
struct task_struct *task)
{
/* Optimal (O) region */
- if (nrg_delta < 0 && cap_delta > 0)
+ if (nrg_delta < 0 && cap_delta > 0) {
+ trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0);
return INT_MAX;
+ }
/* Suboptimal (S) region */
- if (nrg_delta > 0 && cap_delta < 0)
+ if (nrg_delta > 0 && cap_delta < 0) {
+ trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5);
return -INT_MAX;
+ }
return __schedtune_accept_deltas(nrg_delta, cap_delta,
perf_boost_idx, perf_constrain_idx);
@@ -550,52 +763,29 @@ sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write,
loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ unsigned threshold_idx;
+ int boost_pct;
if (ret || !write)
return ret;
- /* Performance Boost (B) region threshold params */
- perf_boost_idx = sysctl_sched_cfs_boost;
- perf_boost_idx /= 10;
+ if (sysctl_sched_cfs_boost < -100 || sysctl_sched_cfs_boost > 100)
+ return -EINVAL;
+ boost_pct = sysctl_sched_cfs_boost;
- /* Performance Constraint (C) region threshold params */
- perf_constrain_idx = 100 - sysctl_sched_cfs_boost;
- perf_constrain_idx /= 10;
+ /*
+ * Update threshold params for Performance Boost (B)
+ * and Performance Constraint (C) regions.
+ * The current implementatio uses the same cuts for both
+ * B and C regions.
+ */
+ threshold_idx = clamp(boost_pct, 0, 99) / 10;
+ perf_boost_idx = threshold_idx;
+ perf_constrain_idx = threshold_idx;
return 0;
}
-/*
- * System energy normalization
- * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE],
- * corresponding to the specified energy variation.
- */
-int
-schedtune_normalize_energy(int energy_diff)
-{
- u32 normalized_nrg;
- int max_delta;
-
-#ifdef CONFIG_SCHED_DEBUG
- /* Check for boundaries */
- max_delta = schedtune_target_nrg.max_power;
- max_delta -= schedtune_target_nrg.min_power;
- WARN_ON(abs(energy_diff) >= max_delta);
-#endif
-
- /* Do scaling using positive numbers to increase the range */
- normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff;
-
- /* Scale by energy magnitude */
- normalized_nrg <<= SCHED_LOAD_SHIFT;
-
- /* Normalize on max energy for target platform */
- normalized_nrg = reciprocal_divide(
- normalized_nrg, schedtune_target_nrg.rdiv);
-
- return (energy_diff < 0) ? -normalized_nrg : normalized_nrg;
-}
-
#ifdef CONFIG_SCHED_DEBUG
static void
schedtune_test_nrg(unsigned long delta_pwr)
@@ -698,7 +888,7 @@ schedtune_add_cluster_nrg(
* that bind the EM to the topology information.
*/
static int
-schedtune_init_late(void)
+schedtune_init(void)
{
struct target_nrg *ste = &schedtune_target_nrg;
unsigned long delta_pwr = 0;
@@ -738,11 +928,17 @@ schedtune_init_late(void)
ste->rdiv.m, ste->rdiv.sh1, ste->rdiv.sh2);
schedtune_test_nrg(delta_pwr);
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+ schedtune_init_cgroups();
+#else
+ pr_info("schedtune: configured to support global boosting only\n");
+#endif
+
return 0;
nodata:
rcu_read_unlock();
return -EINVAL;
}
-late_initcall(schedtune_init_late);
-
+postcore_initcall(schedtune_init);
diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h
index f7273a5d994a..4f6441771e4c 100644
--- a/kernel/sched/tune.h
+++ b/kernel/sched/tune.h
@@ -1,16 +1,36 @@
#ifdef CONFIG_SCHED_TUNE
+#include <linux/reciprocal_div.h>
+
+/*
+ * System energy normalization constants
+ */
+struct target_nrg {
+ unsigned long min_power;
+ unsigned long max_power;
+ struct reciprocal_value rdiv;
+};
+
#ifdef CONFIG_CGROUP_SCHEDTUNE
int schedtune_cpu_boost(int cpu);
int schedtune_task_boost(struct task_struct *tsk);
+int schedtune_prefer_idle(struct task_struct *tsk);
+
+void schedtune_exit_task(struct task_struct *tsk);
+
void schedtune_enqueue_task(struct task_struct *p, int cpu);
void schedtune_dequeue_task(struct task_struct *p, int cpu);
#else /* CONFIG_CGROUP_SCHEDTUNE */
+#define schedtune_cpu_boost(cpu) get_sysctl_sched_cfs_boost()
+#define schedtune_task_boost(tsk) get_sysctl_sched_cfs_boost()
+
+#define schedtune_exit_task(task) do { } while (0)
+
#define schedtune_enqueue_task(task, cpu) do { } while (0)
#define schedtune_dequeue_task(task, cpu) do { } while (0)
@@ -22,10 +42,14 @@ int schedtune_accept_deltas(int nrg_delta, int cap_delta,
#else /* CONFIG_SCHED_TUNE */
+#define schedtune_cpu_boost(cpu) 0
+#define schedtune_task_boost(tsk) 0
+
+#define schedtune_exit_task(task) do { } while (0)
+
#define schedtune_enqueue_task(task, cpu) do { } while (0)
#define schedtune_dequeue_task(task, cpu) do { } while (0)
-#define schedtune_normalize_energy(energy) energy
#define schedtune_accept_deltas(nrg_delta, cap_delta, task) nrg_delta
#endif /* CONFIG_SCHED_TUNE */
diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c
new file mode 100644
index 000000000000..2ffb1680b380
--- /dev/null
+++ b/kernel/sched/walt.c
@@ -0,0 +1,1168 @@
+/*
+ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ *
+ * Window Assisted Load Tracking (WALT) implementation credits:
+ * Srivatsa Vaddagiri, Steve Muckle, Syed Rameez Mustafa, Joonwoo Park,
+ * Pavan Kumar Kondeti, Olav Haugan
+ *
+ * 2016-03-06: Integration with EAS/refactoring by Vikram Mulukutla
+ * and Todd Kjos
+ */
+
+#include <linux/syscore_ops.h>
+#include <linux/cpufreq.h>
+#include <trace/events/sched.h>
+#include "sched.h"
+#include "walt.h"
+
+#define WINDOW_STATS_RECENT 0
+#define WINDOW_STATS_MAX 1
+#define WINDOW_STATS_MAX_RECENT_AVG 2
+#define WINDOW_STATS_AVG 3
+#define WINDOW_STATS_INVALID_POLICY 4
+
+#define EXITING_TASK_MARKER 0xdeaddead
+
+static __read_mostly unsigned int walt_ravg_hist_size = 5;
+static __read_mostly unsigned int walt_window_stats_policy =
+ WINDOW_STATS_MAX_RECENT_AVG;
+static __read_mostly unsigned int walt_account_wait_time = 1;
+static __read_mostly unsigned int walt_freq_account_wait_time = 0;
+static __read_mostly unsigned int walt_io_is_busy = 0;
+
+unsigned int sysctl_sched_walt_init_task_load_pct = 15;
+
+/* 1 -> use PELT based load stats, 0 -> use window-based load stats */
+unsigned int __read_mostly walt_disabled = 0;
+
+static unsigned int max_possible_efficiency = 1024;
+static unsigned int min_possible_efficiency = 1024;
+
+/*
+ * Maximum possible frequency across all cpus. Task demand and cpu
+ * capacity (cpu_power) metrics are scaled in reference to it.
+ */
+static unsigned int max_possible_freq = 1;
+
+/*
+ * Minimum possible max_freq across all cpus. This will be same as
+ * max_possible_freq on homogeneous systems and could be different from
+ * max_possible_freq on heterogenous systems. min_max_freq is used to derive
+ * capacity (cpu_power) of cpus.
+ */
+static unsigned int min_max_freq = 1;
+
+static unsigned int max_capacity = 1024;
+static unsigned int min_capacity = 1024;
+static unsigned int max_load_scale_factor = 1024;
+static unsigned int max_possible_capacity = 1024;
+
+/* Mask of all CPUs that have max_possible_capacity */
+static cpumask_t mpc_mask = CPU_MASK_ALL;
+
+/* Window size (in ns) */
+__read_mostly unsigned int walt_ravg_window = 20000000;
+
+/* Min window size (in ns) = 10ms */
+#define MIN_SCHED_RAVG_WINDOW 10000000
+
+/* Max window size (in ns) = 1s */
+#define MAX_SCHED_RAVG_WINDOW 1000000000
+
+static unsigned int sync_cpu;
+static ktime_t ktime_last;
+static bool walt_ktime_suspended;
+
+static unsigned int task_load(struct task_struct *p)
+{
+ return p->ravg.demand;
+}
+
+void
+walt_inc_cumulative_runnable_avg(struct rq *rq,
+ struct task_struct *p)
+{
+ rq->cumulative_runnable_avg += p->ravg.demand;
+}
+
+void
+walt_dec_cumulative_runnable_avg(struct rq *rq,
+ struct task_struct *p)
+{
+ rq->cumulative_runnable_avg -= p->ravg.demand;
+ BUG_ON((s64)rq->cumulative_runnable_avg < 0);
+}
+
+static void
+fixup_cumulative_runnable_avg(struct rq *rq,
+ struct task_struct *p, s64 task_load_delta)
+{
+ rq->cumulative_runnable_avg += task_load_delta;
+ if ((s64)rq->cumulative_runnable_avg < 0)
+ panic("cra less than zero: tld: %lld, task_load(p) = %u\n",
+ task_load_delta, task_load(p));
+}
+
+u64 walt_ktime_clock(void)
+{
+ if (unlikely(walt_ktime_suspended))
+ return ktime_to_ns(ktime_last);
+ return ktime_get_ns();
+}
+
+static void walt_resume(void)
+{
+ walt_ktime_suspended = false;
+}
+
+static int walt_suspend(void)
+{
+ ktime_last = ktime_get();
+ walt_ktime_suspended = true;
+ return 0;
+}
+
+static struct syscore_ops walt_syscore_ops = {
+ .resume = walt_resume,
+ .suspend = walt_suspend
+};
+
+static int __init walt_init_ops(void)
+{
+ register_syscore_ops(&walt_syscore_ops);
+ return 0;
+}
+late_initcall(walt_init_ops);
+
+void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq,
+ struct task_struct *p)
+{
+ cfs_rq->cumulative_runnable_avg += p->ravg.demand;
+}
+
+void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq,
+ struct task_struct *p)
+{
+ cfs_rq->cumulative_runnable_avg -= p->ravg.demand;
+}
+
+static int exiting_task(struct task_struct *p)
+{
+ if (p->flags & PF_EXITING) {
+ if (p->ravg.sum_history[0] != EXITING_TASK_MARKER) {
+ p->ravg.sum_history[0] = EXITING_TASK_MARKER;
+ }
+ return 1;
+ }
+ return 0;
+}
+
+static int __init set_walt_ravg_window(char *str)
+{
+ get_option(&str, &walt_ravg_window);
+
+ walt_disabled = (walt_ravg_window < MIN_SCHED_RAVG_WINDOW ||
+ walt_ravg_window > MAX_SCHED_RAVG_WINDOW);
+ return 0;
+}
+
+early_param("walt_ravg_window", set_walt_ravg_window);
+
+static void
+update_window_start(struct rq *rq, u64 wallclock)
+{
+ s64 delta;
+ int nr_windows;
+
+ delta = wallclock - rq->window_start;
+ /* If the MPM global timer is cleared, set delta as 0 to avoid kernel BUG happening */
+ if (delta < 0) {
+ delta = 0;
+ WARN_ONCE(1, "WALT wallclock appears to have gone backwards or reset\n");
+ }
+
+ if (delta < walt_ravg_window)
+ return;
+
+ nr_windows = div64_u64(delta, walt_ravg_window);
+ rq->window_start += (u64)nr_windows * (u64)walt_ravg_window;
+}
+
+static u64 scale_exec_time(u64 delta, struct rq *rq)
+{
+ unsigned int cur_freq = rq->cur_freq;
+ int sf;
+
+ if (unlikely(cur_freq > max_possible_freq))
+ cur_freq = rq->max_possible_freq;
+
+ /* round up div64 */
+ delta = div64_u64(delta * cur_freq + max_possible_freq - 1,
+ max_possible_freq);
+
+ sf = DIV_ROUND_UP(rq->efficiency * 1024, max_possible_efficiency);
+
+ delta *= sf;
+ delta >>= 10;
+
+ return delta;
+}
+
+static int cpu_is_waiting_on_io(struct rq *rq)
+{
+ if (!walt_io_is_busy)
+ return 0;
+
+ return atomic_read(&rq->nr_iowait);
+}
+
+void walt_account_irqtime(int cpu, struct task_struct *curr,
+ u64 delta, u64 wallclock)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags, nr_windows;
+ u64 cur_jiffies_ts;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+
+ /*
+ * cputime (wallclock) uses sched_clock so use the same here for
+ * consistency.
+ */
+ delta += sched_clock() - wallclock;
+ cur_jiffies_ts = get_jiffies_64();
+
+ if (is_idle_task(curr))
+ walt_update_task_ravg(curr, rq, IRQ_UPDATE, walt_ktime_clock(),
+ delta);
+
+ nr_windows = cur_jiffies_ts - rq->irqload_ts;
+
+ if (nr_windows) {
+ if (nr_windows < 10) {
+ /* Decay CPU's irqload by 3/4 for each window. */
+ rq->avg_irqload *= (3 * nr_windows);
+ rq->avg_irqload = div64_u64(rq->avg_irqload,
+ 4 * nr_windows);
+ } else {
+ rq->avg_irqload = 0;
+ }
+ rq->avg_irqload += rq->cur_irqload;
+ rq->cur_irqload = 0;
+ }
+
+ rq->cur_irqload += delta;
+ rq->irqload_ts = cur_jiffies_ts;
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+
+#define WALT_HIGH_IRQ_TIMEOUT 3
+
+u64 walt_irqload(int cpu) {
+ struct rq *rq = cpu_rq(cpu);
+ s64 delta;
+ delta = get_jiffies_64() - rq->irqload_ts;
+
+ /*
+ * Current context can be preempted by irq and rq->irqload_ts can be
+ * updated by irq context so that delta can be negative.
+ * But this is okay and we can safely return as this means there
+ * was recent irq occurrence.
+ */
+
+ if (delta < WALT_HIGH_IRQ_TIMEOUT)
+ return rq->avg_irqload;
+ else
+ return 0;
+}
+
+int walt_cpu_high_irqload(int cpu) {
+ return walt_irqload(cpu) >= sysctl_sched_walt_cpu_high_irqload;
+}
+
+static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p,
+ u64 irqtime, int event)
+{
+ if (is_idle_task(p)) {
+ /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */
+ if (event == PICK_NEXT_TASK)
+ return 0;
+
+ /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */
+ return irqtime || cpu_is_waiting_on_io(rq);
+ }
+
+ if (event == TASK_WAKE)
+ return 0;
+
+ if (event == PUT_PREV_TASK || event == IRQ_UPDATE ||
+ event == TASK_UPDATE)
+ return 1;
+
+ /* Only TASK_MIGRATE && PICK_NEXT_TASK left */
+ return walt_freq_account_wait_time;
+}
+
+/*
+ * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
+ */
+static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock, u64 irqtime)
+{
+ int new_window, nr_full_windows = 0;
+ int p_is_curr_task = (p == rq->curr);
+ u64 mark_start = p->ravg.mark_start;
+ u64 window_start = rq->window_start;
+ u32 window_size = walt_ravg_window;
+ u64 delta;
+
+ new_window = mark_start < window_start;
+ if (new_window) {
+ nr_full_windows = div64_u64((window_start - mark_start),
+ window_size);
+ if (p->ravg.active_windows < USHRT_MAX)
+ p->ravg.active_windows++;
+ }
+
+ /* Handle per-task window rollover. We don't care about the idle
+ * task or exiting tasks. */
+ if (new_window && !is_idle_task(p) && !exiting_task(p)) {
+ u32 curr_window = 0;
+
+ if (!nr_full_windows)
+ curr_window = p->ravg.curr_window;
+
+ p->ravg.prev_window = curr_window;
+ p->ravg.curr_window = 0;
+ }
+
+ if (!account_busy_for_cpu_time(rq, p, irqtime, event)) {
+ /* account_busy_for_cpu_time() = 0, so no update to the
+ * task's current window needs to be made. This could be
+ * for example
+ *
+ * - a wakeup event on a task within the current
+ * window (!new_window below, no action required),
+ * - switching to a new task from idle (PICK_NEXT_TASK)
+ * in a new window where irqtime is 0 and we aren't
+ * waiting on IO */
+
+ if (!new_window)
+ return;
+
+ /* A new window has started. The RQ demand must be rolled
+ * over if p is the current task. */
+ if (p_is_curr_task) {
+ u64 prev_sum = 0;
+
+ /* p is either idle task or an exiting task */
+ if (!nr_full_windows) {
+ prev_sum = rq->curr_runnable_sum;
+ }
+
+ rq->prev_runnable_sum = prev_sum;
+ rq->curr_runnable_sum = 0;
+ }
+
+ return;
+ }
+
+ if (!new_window) {
+ /* account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. No rollover
+ * since we didn't start a new window. An example of this is
+ * when a task starts execution and then sleeps within the
+ * same window. */
+
+ if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq))
+ delta = wallclock - mark_start;
+ else
+ delta = irqtime;
+ delta = scale_exec_time(delta, rq);
+ rq->curr_runnable_sum += delta;
+ if (!is_idle_task(p) && !exiting_task(p))
+ p->ravg.curr_window += delta;
+
+ return;
+ }
+
+ if (!p_is_curr_task) {
+ /* account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. A new window
+ * has also started, but p is not the current task, so the
+ * window is not rolled over - just split up and account
+ * as necessary into curr and prev. The window is only
+ * rolled over when a new window is processed for the current
+ * task.
+ *
+ * Irqtime can't be accounted by a task that isn't the
+ * currently running task. */
+
+ if (!nr_full_windows) {
+ /* A full window hasn't elapsed, account partial
+ * contribution to previous completed window. */
+ delta = scale_exec_time(window_start - mark_start, rq);
+ if (!exiting_task(p))
+ p->ravg.prev_window += delta;
+ } else {
+ /* Since at least one full window has elapsed,
+ * the contribution to the previous window is the
+ * full window (window_size). */
+ delta = scale_exec_time(window_size, rq);
+ if (!exiting_task(p))
+ p->ravg.prev_window = delta;
+ }
+ rq->prev_runnable_sum += delta;
+
+ /* Account piece of busy time in the current window. */
+ delta = scale_exec_time(wallclock - window_start, rq);
+ rq->curr_runnable_sum += delta;
+ if (!exiting_task(p))
+ p->ravg.curr_window = delta;
+
+ return;
+ }
+
+ if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
+ /* account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. A new window
+ * has started and p is the current task so rollover is
+ * needed. If any of these three above conditions are true
+ * then this busy time can't be accounted as irqtime.
+ *
+ * Busy time for the idle task or exiting tasks need not
+ * be accounted.
+ *
+ * An example of this would be a task that starts execution
+ * and then sleeps once a new window has begun. */
+
+ if (!nr_full_windows) {
+ /* A full window hasn't elapsed, account partial
+ * contribution to previous completed window. */
+ delta = scale_exec_time(window_start - mark_start, rq);
+ if (!is_idle_task(p) && !exiting_task(p))
+ p->ravg.prev_window += delta;
+
+ delta += rq->curr_runnable_sum;
+ } else {
+ /* Since at least one full window has elapsed,
+ * the contribution to the previous window is the
+ * full window (window_size). */
+ delta = scale_exec_time(window_size, rq);
+ if (!is_idle_task(p) && !exiting_task(p))
+ p->ravg.prev_window = delta;
+
+ }
+ /*
+ * Rollover for normal runnable sum is done here by overwriting
+ * the values in prev_runnable_sum and curr_runnable_sum.
+ * Rollover for new task runnable sum has completed by previous
+ * if-else statement.
+ */
+ rq->prev_runnable_sum = delta;
+
+ /* Account piece of busy time in the current window. */
+ delta = scale_exec_time(wallclock - window_start, rq);
+ rq->curr_runnable_sum = delta;
+ if (!is_idle_task(p) && !exiting_task(p))
+ p->ravg.curr_window = delta;
+
+ return;
+ }
+
+ if (irqtime) {
+ /* account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. A new window
+ * has started and p is the current task so rollover is
+ * needed. The current task must be the idle task because
+ * irqtime is not accounted for any other task.
+ *
+ * Irqtime will be accounted each time we process IRQ activity
+ * after a period of idleness, so we know the IRQ busy time
+ * started at wallclock - irqtime. */
+
+ BUG_ON(!is_idle_task(p));
+ mark_start = wallclock - irqtime;
+
+ /* Roll window over. If IRQ busy time was just in the current
+ * window then that is all that need be accounted. */
+ rq->prev_runnable_sum = rq->curr_runnable_sum;
+ if (mark_start > window_start) {
+ rq->curr_runnable_sum = scale_exec_time(irqtime, rq);
+ return;
+ }
+
+ /* The IRQ busy time spanned multiple windows. Process the
+ * busy time preceding the current window start first. */
+ delta = window_start - mark_start;
+ if (delta > window_size)
+ delta = window_size;
+ delta = scale_exec_time(delta, rq);
+ rq->prev_runnable_sum += delta;
+
+ /* Process the remaining IRQ busy time in the current window. */
+ delta = wallclock - window_start;
+ rq->curr_runnable_sum = scale_exec_time(delta, rq);
+
+ return;
+ }
+
+ BUG();
+}
+
+static int account_busy_for_task_demand(struct task_struct *p, int event)
+{
+ /* No need to bother updating task demand for exiting tasks
+ * or the idle task. */
+ if (exiting_task(p) || is_idle_task(p))
+ return 0;
+
+ /* When a task is waking up it is completing a segment of non-busy
+ * time. Likewise, if wait time is not treated as busy time, then
+ * when a task begins to run or is migrated, it is not running and
+ * is completing a segment of non-busy time. */
+ if (event == TASK_WAKE || (!walt_account_wait_time &&
+ (event == PICK_NEXT_TASK || event == TASK_MIGRATE)))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * Called when new window is starting for a task, to record cpu usage over
+ * recently concluded window(s). Normally 'samples' should be 1. It can be > 1
+ * when, say, a real-time task runs without preemption for several windows at a
+ * stretch.
+ */
+static void update_history(struct rq *rq, struct task_struct *p,
+ u32 runtime, int samples, int event)
+{
+ u32 *hist = &p->ravg.sum_history[0];
+ int ridx, widx;
+ u32 max = 0, avg, demand;
+ u64 sum = 0;
+
+ /* Ignore windows where task had no activity */
+ if (!runtime || is_idle_task(p) || exiting_task(p) || !samples)
+ goto done;
+
+ /* Push new 'runtime' value onto stack */
+ widx = walt_ravg_hist_size - 1;
+ ridx = widx - samples;
+ for (; ridx >= 0; --widx, --ridx) {
+ hist[widx] = hist[ridx];
+ sum += hist[widx];
+ if (hist[widx] > max)
+ max = hist[widx];
+ }
+
+ for (widx = 0; widx < samples && widx < walt_ravg_hist_size; widx++) {
+ hist[widx] = runtime;
+ sum += hist[widx];
+ if (hist[widx] > max)
+ max = hist[widx];
+ }
+
+ p->ravg.sum = 0;
+
+ if (walt_window_stats_policy == WINDOW_STATS_RECENT) {
+ demand = runtime;
+ } else if (walt_window_stats_policy == WINDOW_STATS_MAX) {
+ demand = max;
+ } else {
+ avg = div64_u64(sum, walt_ravg_hist_size);
+ if (walt_window_stats_policy == WINDOW_STATS_AVG)
+ demand = avg;
+ else
+ demand = max(avg, runtime);
+ }
+
+ /*
+ * A throttled deadline sched class task gets dequeued without
+ * changing p->on_rq. Since the dequeue decrements hmp stats
+ * avoid decrementing it here again.
+ */
+ if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
+ !p->dl.dl_throttled))
+ fixup_cumulative_runnable_avg(rq, p, demand);
+
+ p->ravg.demand = demand;
+
+done:
+ trace_walt_update_history(rq, p, runtime, samples, event);
+ return;
+}
+
+static void add_to_task_demand(struct rq *rq, struct task_struct *p,
+ u64 delta)
+{
+ delta = scale_exec_time(delta, rq);
+ p->ravg.sum += delta;
+ if (unlikely(p->ravg.sum > walt_ravg_window))
+ p->ravg.sum = walt_ravg_window;
+}
+
+/*
+ * Account cpu demand of task and/or update task's cpu demand history
+ *
+ * ms = p->ravg.mark_start;
+ * wc = wallclock
+ * ws = rq->window_start
+ *
+ * Three possibilities:
+ *
+ * a) Task event is contained within one window.
+ * window_start < mark_start < wallclock
+ *
+ * ws ms wc
+ * | | |
+ * V V V
+ * |---------------|
+ *
+ * In this case, p->ravg.sum is updated *iff* event is appropriate
+ * (ex: event == PUT_PREV_TASK)
+ *
+ * b) Task event spans two windows.
+ * mark_start < window_start < wallclock
+ *
+ * ms ws wc
+ * | | |
+ * V V V
+ * -----|-------------------
+ *
+ * In this case, p->ravg.sum is updated with (ws - ms) *iff* event
+ * is appropriate, then a new window sample is recorded followed
+ * by p->ravg.sum being set to (wc - ws) *iff* event is appropriate.
+ *
+ * c) Task event spans more than two windows.
+ *
+ * ms ws_tmp ws wc
+ * | | | |
+ * V V V V
+ * ---|-------|-------|-------|-------|------
+ * | |
+ * |<------ nr_full_windows ------>|
+ *
+ * In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff*
+ * event is appropriate, window sample of p->ravg.sum is recorded,
+ * 'nr_full_window' samples of window_size is also recorded *iff*
+ * event is appropriate and finally p->ravg.sum is set to (wc - ws)
+ * *iff* event is appropriate.
+ *
+ * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time()
+ * depends on it!
+ */
+static void update_task_demand(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock)
+{
+ u64 mark_start = p->ravg.mark_start;
+ u64 delta, window_start = rq->window_start;
+ int new_window, nr_full_windows;
+ u32 window_size = walt_ravg_window;
+
+ new_window = mark_start < window_start;
+ if (!account_busy_for_task_demand(p, event)) {
+ if (new_window)
+ /* If the time accounted isn't being accounted as
+ * busy time, and a new window started, only the
+ * previous window need be closed out with the
+ * pre-existing demand. Multiple windows may have
+ * elapsed, but since empty windows are dropped,
+ * it is not necessary to account those. */
+ update_history(rq, p, p->ravg.sum, 1, event);
+ return;
+ }
+
+ if (!new_window) {
+ /* The simple case - busy time contained within the existing
+ * window. */
+ add_to_task_demand(rq, p, wallclock - mark_start);
+ return;
+ }
+
+ /* Busy time spans at least two windows. Temporarily rewind
+ * window_start to first window boundary after mark_start. */
+ delta = window_start - mark_start;
+ nr_full_windows = div64_u64(delta, window_size);
+ window_start -= (u64)nr_full_windows * (u64)window_size;
+
+ /* Process (window_start - mark_start) first */
+ add_to_task_demand(rq, p, window_start - mark_start);
+
+ /* Push new sample(s) into task's demand history */
+ update_history(rq, p, p->ravg.sum, 1, event);
+ if (nr_full_windows)
+ update_history(rq, p, scale_exec_time(window_size, rq),
+ nr_full_windows, event);
+
+ /* Roll window_start back to current to process any remainder
+ * in current window. */
+ window_start += (u64)nr_full_windows * (u64)window_size;
+
+ /* Process (wallclock - window_start) next */
+ mark_start = window_start;
+ add_to_task_demand(rq, p, wallclock - mark_start);
+}
+
+/* Reflect task activity on its demand and cpu's busy time statistics */
+void walt_update_task_ravg(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock, u64 irqtime)
+{
+ if (walt_disabled || !rq->window_start)
+ return;
+
+ lockdep_assert_held(&rq->lock);
+
+ update_window_start(rq, wallclock);
+
+ if (!p->ravg.mark_start)
+ goto done;
+
+ update_task_demand(p, rq, event, wallclock);
+ update_cpu_busy_time(p, rq, event, wallclock, irqtime);
+
+done:
+ trace_walt_update_task_ravg(p, rq, event, wallclock, irqtime);
+
+ p->ravg.mark_start = wallclock;
+}
+
+unsigned long __weak arch_get_cpu_efficiency(int cpu)
+{
+ return SCHED_LOAD_SCALE;
+}
+
+void walt_init_cpu_efficiency(void)
+{
+ int i, efficiency;
+ unsigned int max = 0, min = UINT_MAX;
+
+ for_each_possible_cpu(i) {
+ efficiency = arch_get_cpu_efficiency(i);
+ cpu_rq(i)->efficiency = efficiency;
+
+ if (efficiency > max)
+ max = efficiency;
+ if (efficiency < min)
+ min = efficiency;
+ }
+
+ if (max)
+ max_possible_efficiency = max;
+
+ if (min)
+ min_possible_efficiency = min;
+}
+
+static void reset_task_stats(struct task_struct *p)
+{
+ u32 sum = 0;
+
+ if (exiting_task(p))
+ sum = EXITING_TASK_MARKER;
+
+ memset(&p->ravg, 0, sizeof(struct ravg));
+ /* Retain EXITING_TASK marker */
+ p->ravg.sum_history[0] = sum;
+}
+
+void walt_mark_task_starting(struct task_struct *p)
+{
+ u64 wallclock;
+ struct rq *rq = task_rq(p);
+
+ if (!rq->window_start) {
+ reset_task_stats(p);
+ return;
+ }
+
+ wallclock = walt_ktime_clock();
+ p->ravg.mark_start = wallclock;
+}
+
+void walt_set_window_start(struct rq *rq)
+{
+ int cpu = cpu_of(rq);
+ struct rq *sync_rq = cpu_rq(sync_cpu);
+
+ if (rq->window_start)
+ return;
+
+ if (cpu == sync_cpu) {
+ rq->window_start = walt_ktime_clock();
+ } else {
+ raw_spin_unlock(&rq->lock);
+ double_rq_lock(rq, sync_rq);
+ rq->window_start = cpu_rq(sync_cpu)->window_start;
+ rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
+ raw_spin_unlock(&sync_rq->lock);
+ }
+
+ rq->curr->ravg.mark_start = rq->window_start;
+}
+
+void walt_migrate_sync_cpu(int cpu)
+{
+ if (cpu == sync_cpu)
+ sync_cpu = smp_processor_id();
+}
+
+void walt_fixup_busy_time(struct task_struct *p, int new_cpu)
+{
+ struct rq *src_rq = task_rq(p);
+ struct rq *dest_rq = cpu_rq(new_cpu);
+ u64 wallclock;
+
+ if (!p->on_rq && p->state != TASK_WAKING)
+ return;
+
+ if (exiting_task(p)) {
+ return;
+ }
+
+ if (p->state == TASK_WAKING)
+ double_rq_lock(src_rq, dest_rq);
+
+ wallclock = walt_ktime_clock();
+
+ walt_update_task_ravg(task_rq(p)->curr, task_rq(p),
+ TASK_UPDATE, wallclock, 0);
+ walt_update_task_ravg(dest_rq->curr, dest_rq,
+ TASK_UPDATE, wallclock, 0);
+
+ walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0);
+
+ if (p->ravg.curr_window) {
+ src_rq->curr_runnable_sum -= p->ravg.curr_window;
+ dest_rq->curr_runnable_sum += p->ravg.curr_window;
+ }
+
+ if (p->ravg.prev_window) {
+ src_rq->prev_runnable_sum -= p->ravg.prev_window;
+ dest_rq->prev_runnable_sum += p->ravg.prev_window;
+ }
+
+ if ((s64)src_rq->prev_runnable_sum < 0) {
+ src_rq->prev_runnable_sum = 0;
+ WARN_ON(1);
+ }
+ if ((s64)src_rq->curr_runnable_sum < 0) {
+ src_rq->curr_runnable_sum = 0;
+ WARN_ON(1);
+ }
+
+ trace_walt_migration_update_sum(src_rq, p);
+ trace_walt_migration_update_sum(dest_rq, p);
+
+ if (p->state == TASK_WAKING)
+ double_rq_unlock(src_rq, dest_rq);
+}
+
+/* Keep track of max/min capacity possible across CPUs "currently" */
+static void __update_min_max_capacity(void)
+{
+ int i;
+ int max = 0, min = INT_MAX;
+
+ for_each_online_cpu(i) {
+ if (cpu_rq(i)->capacity > max)
+ max = cpu_rq(i)->capacity;
+ if (cpu_rq(i)->capacity < min)
+ min = cpu_rq(i)->capacity;
+ }
+
+ max_capacity = max;
+ min_capacity = min;
+}
+
+static void update_min_max_capacity(void)
+{
+ unsigned long flags;
+ int i;
+
+ local_irq_save(flags);
+ for_each_possible_cpu(i)
+ raw_spin_lock(&cpu_rq(i)->lock);
+
+ __update_min_max_capacity();
+
+ for_each_possible_cpu(i)
+ raw_spin_unlock(&cpu_rq(i)->lock);
+ local_irq_restore(flags);
+}
+
+/*
+ * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that
+ * least efficient cpu gets capacity of 1024
+ */
+static unsigned long capacity_scale_cpu_efficiency(int cpu)
+{
+ return (1024 * cpu_rq(cpu)->efficiency) / min_possible_efficiency;
+}
+
+/*
+ * Return 'capacity' of a cpu in reference to cpu with lowest max_freq
+ * (min_max_freq), such that one with lowest max_freq gets capacity of 1024.
+ */
+static unsigned long capacity_scale_cpu_freq(int cpu)
+{
+ return (1024 * cpu_rq(cpu)->max_freq) / min_max_freq;
+}
+
+/*
+ * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so
+ * that "most" efficient cpu gets a load_scale_factor of 1
+ */
+static unsigned long load_scale_cpu_efficiency(int cpu)
+{
+ return DIV_ROUND_UP(1024 * max_possible_efficiency,
+ cpu_rq(cpu)->efficiency);
+}
+
+/*
+ * Return load_scale_factor of a cpu in reference to cpu with best max_freq
+ * (max_possible_freq), so that one with best max_freq gets a load_scale_factor
+ * of 1.
+ */
+static unsigned long load_scale_cpu_freq(int cpu)
+{
+ return DIV_ROUND_UP(1024 * max_possible_freq, cpu_rq(cpu)->max_freq);
+}
+
+static int compute_capacity(int cpu)
+{
+ int capacity = 1024;
+
+ capacity *= capacity_scale_cpu_efficiency(cpu);
+ capacity >>= 10;
+
+ capacity *= capacity_scale_cpu_freq(cpu);
+ capacity >>= 10;
+
+ return capacity;
+}
+
+static int compute_load_scale_factor(int cpu)
+{
+ int load_scale = 1024;
+
+ /*
+ * load_scale_factor accounts for the fact that task load
+ * is in reference to "best" performing cpu. Task's load will need to be
+ * scaled (up) by a factor to determine suitability to be placed on a
+ * (little) cpu.
+ */
+ load_scale *= load_scale_cpu_efficiency(cpu);
+ load_scale >>= 10;
+
+ load_scale *= load_scale_cpu_freq(cpu);
+ load_scale >>= 10;
+
+ return load_scale;
+}
+
+static int cpufreq_notifier_policy(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
+ int i, update_max = 0;
+ u64 highest_mpc = 0, highest_mplsf = 0;
+ const struct cpumask *cpus = policy->related_cpus;
+ unsigned int orig_min_max_freq = min_max_freq;
+ unsigned int orig_max_possible_freq = max_possible_freq;
+ /* Initialized to policy->max in case policy->related_cpus is empty! */
+ unsigned int orig_max_freq = policy->max;
+
+ if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY &&
+ val != CPUFREQ_CREATE_POLICY)
+ return 0;
+
+ if (val == CPUFREQ_REMOVE_POLICY || val == CPUFREQ_CREATE_POLICY) {
+ update_min_max_capacity();
+ return 0;
+ }
+
+ for_each_cpu(i, policy->related_cpus) {
+ cpumask_copy(&cpu_rq(i)->freq_domain_cpumask,
+ policy->related_cpus);
+ orig_max_freq = cpu_rq(i)->max_freq;
+ cpu_rq(i)->min_freq = policy->min;
+ cpu_rq(i)->max_freq = policy->max;
+ cpu_rq(i)->cur_freq = policy->cur;
+ cpu_rq(i)->max_possible_freq = policy->cpuinfo.max_freq;
+ }
+
+ max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq);
+ if (min_max_freq == 1)
+ min_max_freq = UINT_MAX;
+ min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq);
+ BUG_ON(!min_max_freq);
+ BUG_ON(!policy->max);
+
+ /* Changes to policy other than max_freq don't require any updates */
+ if (orig_max_freq == policy->max)
+ return 0;
+
+ /*
+ * A changed min_max_freq or max_possible_freq (possible during bootup)
+ * needs to trigger re-computation of load_scale_factor and capacity for
+ * all possible cpus (even those offline). It also needs to trigger
+ * re-computation of nr_big_task count on all online cpus.
+ *
+ * A changed rq->max_freq otoh needs to trigger re-computation of
+ * load_scale_factor and capacity for just the cluster of cpus involved.
+ * Since small task definition depends on max_load_scale_factor, a
+ * changed load_scale_factor of one cluster could influence
+ * classification of tasks in another cluster. Hence a changed
+ * rq->max_freq will need to trigger re-computation of nr_big_task
+ * count on all online cpus.
+ *
+ * While it should be sufficient for nr_big_tasks to be
+ * re-computed for only online cpus, we have inadequate context
+ * information here (in policy notifier) with regard to hotplug-safety
+ * context in which notification is issued. As a result, we can't use
+ * get_online_cpus() here, as it can lead to deadlock. Until cpufreq is
+ * fixed up to issue notification always in hotplug-safe context,
+ * re-compute nr_big_task for all possible cpus.
+ */
+
+ if (orig_min_max_freq != min_max_freq ||
+ orig_max_possible_freq != max_possible_freq) {
+ cpus = cpu_possible_mask;
+ update_max = 1;
+ }
+
+ /*
+ * Changed load_scale_factor can trigger reclassification of tasks as
+ * big or small. Make this change "atomic" so that tasks are accounted
+ * properly due to changed load_scale_factor
+ */
+ for_each_cpu(i, cpus) {
+ struct rq *rq = cpu_rq(i);
+
+ rq->capacity = compute_capacity(i);
+ rq->load_scale_factor = compute_load_scale_factor(i);
+
+ if (update_max) {
+ u64 mpc, mplsf;
+
+ mpc = div_u64(((u64) rq->capacity) *
+ rq->max_possible_freq, rq->max_freq);
+ rq->max_possible_capacity = (int) mpc;
+
+ mplsf = div_u64(((u64) rq->load_scale_factor) *
+ rq->max_possible_freq, rq->max_freq);
+
+ if (mpc > highest_mpc) {
+ highest_mpc = mpc;
+ cpumask_clear(&mpc_mask);
+ cpumask_set_cpu(i, &mpc_mask);
+ } else if (mpc == highest_mpc) {
+ cpumask_set_cpu(i, &mpc_mask);
+ }
+
+ if (mplsf > highest_mplsf)
+ highest_mplsf = mplsf;
+ }
+ }
+
+ if (update_max) {
+ max_possible_capacity = highest_mpc;
+ max_load_scale_factor = highest_mplsf;
+ }
+
+ __update_min_max_capacity();
+
+ return 0;
+}
+
+static int cpufreq_notifier_trans(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data;
+ unsigned int cpu = freq->cpu, new_freq = freq->new;
+ unsigned long flags;
+ int i;
+
+ if (val != CPUFREQ_POSTCHANGE)
+ return 0;
+
+ BUG_ON(!new_freq);
+
+ if (cpu_rq(cpu)->cur_freq == new_freq)
+ return 0;
+
+ for_each_cpu(i, &cpu_rq(cpu)->freq_domain_cpumask) {
+ struct rq *rq = cpu_rq(i);
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
+ walt_ktime_clock(), 0);
+ rq->cur_freq = new_freq;
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ }
+
+ return 0;
+}
+
+static struct notifier_block notifier_policy_block = {
+ .notifier_call = cpufreq_notifier_policy
+};
+
+static struct notifier_block notifier_trans_block = {
+ .notifier_call = cpufreq_notifier_trans
+};
+
+static int register_sched_callback(void)
+{
+ int ret;
+
+ ret = cpufreq_register_notifier(&notifier_policy_block,
+ CPUFREQ_POLICY_NOTIFIER);
+
+ if (!ret)
+ ret = cpufreq_register_notifier(&notifier_trans_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+
+ return 0;
+}
+
+/*
+ * cpufreq callbacks can be registered at core_initcall or later time.
+ * Any registration done prior to that is "forgotten" by cpufreq. See
+ * initialization of variable init_cpufreq_transition_notifier_list_called
+ * for further information.
+ */
+core_initcall(register_sched_callback);
+
+void walt_init_new_task_load(struct task_struct *p)
+{
+ int i;
+ u32 init_load_windows =
+ div64_u64((u64)sysctl_sched_walt_init_task_load_pct *
+ (u64)walt_ravg_window, 100);
+ u32 init_load_pct = current->init_load_pct;
+
+ p->init_load_pct = 0;
+ memset(&p->ravg, 0, sizeof(struct ravg));
+
+ if (init_load_pct) {
+ init_load_windows = div64_u64((u64)init_load_pct *
+ (u64)walt_ravg_window, 100);
+ }
+
+ p->ravg.demand = init_load_windows;
+ for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i)
+ p->ravg.sum_history[i] = init_load_windows;
+}
diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h
new file mode 100644
index 000000000000..e181c87a928d
--- /dev/null
+++ b/kernel/sched/walt.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __WALT_H
+#define __WALT_H
+
+#ifdef CONFIG_SCHED_WALT
+
+void walt_update_task_ravg(struct task_struct *p, struct rq *rq, int event,
+ u64 wallclock, u64 irqtime);
+void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p);
+void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p);
+void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+ struct task_struct *p);
+void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+ struct task_struct *p);
+void walt_fixup_busy_time(struct task_struct *p, int new_cpu);
+void walt_init_new_task_load(struct task_struct *p);
+void walt_mark_task_starting(struct task_struct *p);
+void walt_set_window_start(struct rq *rq);
+void walt_migrate_sync_cpu(int cpu);
+void walt_init_cpu_efficiency(void);
+u64 walt_ktime_clock(void);
+void walt_account_irqtime(int cpu, struct task_struct *curr, u64 delta,
+ u64 wallclock);
+
+u64 walt_irqload(int cpu);
+int walt_cpu_high_irqload(int cpu);
+
+#else /* CONFIG_SCHED_WALT */
+
+static inline void walt_update_task_ravg(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock, u64 irqtime) { }
+static inline void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { }
+static inline void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { }
+static inline void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+ struct task_struct *p) { }
+static inline void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+ struct task_struct *p) { }
+static inline void walt_fixup_busy_time(struct task_struct *p, int new_cpu) { }
+static inline void walt_init_new_task_load(struct task_struct *p) { }
+static inline void walt_mark_task_starting(struct task_struct *p) { }
+static inline void walt_set_window_start(struct rq *rq) { }
+static inline void walt_migrate_sync_cpu(int cpu) { }
+static inline void walt_init_cpu_efficiency(void) { }
+static inline u64 walt_ktime_clock(void) { return 0; }
+
+#endif /* CONFIG_SCHED_WALT */
+
+extern unsigned int walt_disabled;
+
+#endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 46822df92c50..abb795e8a6f1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -305,6 +305,64 @@ static struct ctl_table kern_table[] = {
.extra2 = &max_sched_granularity_ns,
},
{
+ .procname = "sched_is_big_little",
+ .data = &sysctl_sched_is_big_little,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#ifdef CONFIG_SCHED_WALT
+ {
+ .procname = "sched_use_walt_cpu_util",
+ .data = &sysctl_sched_use_walt_cpu_util,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sched_use_walt_task_util",
+ .data = &sysctl_sched_use_walt_task_util,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sched_walt_init_task_load_pct",
+ .data = &sysctl_sched_walt_init_task_load_pct,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sched_walt_cpu_high_irqload",
+ .data = &sysctl_sched_walt_cpu_high_irqload,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+ {
+ .procname = "sched_sync_hint_enable",
+ .data = &sysctl_sched_sync_hint_enable,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sched_initial_task_util",
+ .data = &sysctl_sched_initial_task_util,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sched_cstate_aware",
+ .data = &sysctl_sched_cstate_aware,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "sched_wakeup_granularity_ns",
.data = &sysctl_sched_wakeup_granularity,
.maxlen = sizeof(unsigned int),
@@ -2097,6 +2155,21 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
return 0;
}
+static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+{
+ if (write) {
+ if (*negp)
+ return -EINVAL;
+ *valp = *lvalp;
+ } else {
+ unsigned int val = *valp;
+ *lvalp = (unsigned long)val;
+ }
+ return 0;
+}
+
static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
@@ -2224,8 +2297,27 @@ static int do_proc_dointvec(struct ctl_table *table, int write,
int proc_dointvec(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
- return do_proc_dointvec(table,write,buffer,lenp,ppos,
- NULL,NULL);
+ return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL);
+}
+
+/**
+ * proc_douintvec - read a vector of unsigned integers
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * Returns 0 on success.
+ */
+int proc_douintvec(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return do_proc_dointvec(table, write, buffer, lenp, ppos,
+ do_proc_douintvec_conv, NULL);
}
/*
@@ -2838,6 +2930,12 @@ int proc_dointvec(struct ctl_table *table, int write,
return -ENOSYS;
}
+int proc_douintvec(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
int proc_dointvec_minmax(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -2883,6 +2981,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
* exception granted :-)
*/
EXPORT_SYMBOL(proc_dointvec);
+EXPORT_SYMBOL(proc_douintvec);
EXPORT_SYMBOL(proc_dointvec_jiffies);
EXPORT_SYMBOL(proc_dointvec_minmax);
EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 1347882d131e..b98810d2f3b4 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -323,13 +323,42 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
/* cs is a watchdog. */
if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
+ }
+ spin_unlock_irqrestore(&watchdog_lock, flags);
+}
+
+static void clocksource_select_watchdog(bool fallback)
+{
+ struct clocksource *cs, *old_wd;
+ unsigned long flags;
+
+ spin_lock_irqsave(&watchdog_lock, flags);
+ /* save current watchdog */
+ old_wd = watchdog;
+ if (fallback)
+ watchdog = NULL;
+
+ list_for_each_entry(cs, &clocksource_list, list) {
+ /* cs is a clocksource to be watched. */
+ if (cs->flags & CLOCK_SOURCE_MUST_VERIFY)
+ continue;
+
+ /* Skip current if we were requested for a fallback. */
+ if (fallback && cs == old_wd)
+ continue;
+
/* Pick the best watchdog. */
- if (!watchdog || cs->rating > watchdog->rating) {
+ if (!watchdog || cs->rating > watchdog->rating)
watchdog = cs;
- /* Reset watchdog cycles */
- clocksource_reset_watchdog();
- }
}
+ /* If we failed to find a fallback restore the old one. */
+ if (!watchdog)
+ watchdog = old_wd;
+
+ /* If we changed the watchdog we need to reset cycles. */
+ if (watchdog != old_wd)
+ clocksource_reset_watchdog();
+
/* Check if the watchdog timer needs to be started. */
clocksource_start_watchdog();
spin_unlock_irqrestore(&watchdog_lock, flags);
@@ -404,6 +433,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
}
+static void clocksource_select_watchdog(bool fallback) { }
static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
static inline void clocksource_resume_watchdog(void) { }
static inline int __clocksource_watchdog_kthread(void) { return 0; }
@@ -736,6 +766,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
clocksource_enqueue(cs);
clocksource_enqueue_watchdog(cs);
clocksource_select();
+ clocksource_select_watchdog(false);
mutex_unlock(&clocksource_mutex);
return 0;
}
@@ -758,6 +789,7 @@ void clocksource_change_rating(struct clocksource *cs, int rating)
mutex_lock(&clocksource_mutex);
__clocksource_change_rating(cs, rating);
clocksource_select();
+ clocksource_select_watchdog(false);
mutex_unlock(&clocksource_mutex);
}
EXPORT_SYMBOL(clocksource_change_rating);
@@ -767,12 +799,12 @@ EXPORT_SYMBOL(clocksource_change_rating);
*/
static int clocksource_unbind(struct clocksource *cs)
{
- /*
- * I really can't convince myself to support this on hardware
- * designed by lobotomized monkeys.
- */
- if (clocksource_is_watchdog(cs))
- return -EBUSY;
+ if (clocksource_is_watchdog(cs)) {
+ /* Select and try to install a replacement watchdog. */
+ clocksource_select_watchdog(true);
+ if (clocksource_is_watchdog(cs))
+ return -EBUSY;
+ }
if (cs == curr_clocksource) {
/* Select and try to install a replacement clock source */
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 58a321c34cfb..405536b22c0c 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -94,6 +94,9 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
};
static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
+ /* Make sure we catch unsupported clockids */
+ [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES,
+
[CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
[CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
@@ -102,7 +105,9 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
static inline int hrtimer_clockid_to_base(clockid_t clock_id)
{
- return hrtimer_clock_to_base_table[clock_id];
+ int base = hrtimer_clock_to_base_table[clock_id];
+ BUG_ON(base == HRTIMER_MAX_CLOCK_BASES);
+ return base;
}
/*
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 149cc8086aea..ab861771e37f 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -674,8 +674,24 @@ int ntp_validate_timex(struct timex *txc)
return -EINVAL;
}
- if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME)))
- return -EPERM;
+ if (txc->modes & ADJ_SETOFFSET) {
+ /* In order to inject time, you gotta be super-user! */
+ if (!capable(CAP_SYS_TIME))
+ return -EPERM;
+
+ if (txc->modes & ADJ_NANO) {
+ struct timespec ts;
+
+ ts.tv_sec = txc->time.tv_sec;
+ ts.tv_nsec = txc->time.tv_usec;
+ if (!timespec_inject_offset_valid(&ts))
+ return -EINVAL;
+
+ } else {
+ if (!timeval_inject_offset_valid(&txc->time))
+ return -EINVAL;
+ }
+ }
/*
* Check for potential multiplication overflows that can
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index f5e86d282d52..80016b329d94 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -808,6 +808,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
timer->it.cpu.expires = 0;
sample_to_timespec(timer->it_clock, timer->it.cpu.expires,
&itp->it_value);
+ return;
} else {
cpu_timer_sample_group(timer->it_clock, p, &now);
unlock_task_sighand(p, &flags);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 99188ee5d9d0..445601c580d6 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -298,17 +298,34 @@ u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset;
static inline u32 arch_gettimeoffset(void) { return 0; }
#endif
+static inline s64 timekeeping_delta_to_ns(struct tk_read_base *tkr,
+ cycle_t delta)
+{
+ s64 nsec;
+
+ nsec = delta * tkr->mult + tkr->xtime_nsec;
+ nsec >>= tkr->shift;
+
+ /* If arch requires, add in get_arch_timeoffset() */
+ return nsec + arch_gettimeoffset();
+}
+
static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
{
cycle_t delta;
- s64 nsec;
delta = timekeeping_get_delta(tkr);
+ return timekeeping_delta_to_ns(tkr, delta);
+}
- nsec = (delta * tkr->mult + tkr->xtime_nsec) >> tkr->shift;
+static inline s64 timekeeping_cycles_to_ns(struct tk_read_base *tkr,
+ cycle_t cycles)
+{
+ cycle_t delta;
- /* If arch requires, add in get_arch_timeoffset() */
- return nsec + arch_gettimeoffset();
+ /* calculate the delta since the last update_wall_time */
+ delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask);
+ return timekeeping_delta_to_ns(tkr, delta);
}
/**
@@ -383,7 +400,13 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
do {
seq = raw_read_seqcount_latch(&tkf->seq);
tkr = tkf->base + (seq & 0x01);
- now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr);
+ now = ktime_to_ns(tkr->base);
+
+ now += timekeeping_delta_to_ns(tkr,
+ clocksource_delta(
+ tkr->read(tkr->clock),
+ tkr->cycle_last,
+ tkr->mask));
} while (read_seqcount_retry(&tkf->seq, seq));
return now;
@@ -958,7 +981,7 @@ int timekeeping_inject_offset(struct timespec *ts)
struct timespec64 ts64, tmp;
int ret = 0;
- if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
+ if (!timespec_inject_offset_valid(ts))
return -EINVAL;
ts64 = timespec_to_timespec64(*ts);
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index f6bd65236712..107310a6f36f 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -23,7 +23,9 @@
#include "timekeeping_internal.h"
-static unsigned int sleep_time_bin[32] = {0};
+#define NUM_BINS 32
+
+static unsigned int sleep_time_bin[NUM_BINS] = {0};
static int tk_debug_show_sleep_time(struct seq_file *s, void *data)
{
@@ -69,6 +71,9 @@ late_initcall(tk_debug_sleep_time_init);
void tk_debug_account_sleep_time(struct timespec64 *t)
{
- sleep_time_bin[fls(t->tv_sec)]++;
+ /* Cap bin index so we don't overflow the array */
+ int bin = min(fls(t->tv_sec), NUM_BINS-1);
+
+ sleep_time_bin[bin]++;
}
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index ba04bf0c2653..a9bba37fab5a 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -1,4 +1,8 @@
+# We are fully aware of the dangers of __builtin_return_address()
+FRAME_CFLAGS := $(call cc-disable-warning,frame-address)
+KBUILD_CFLAGS += $(FRAME_CFLAGS)
+
# Do not instrument the tracer itself:
ifdef CONFIG_FUNCTION_TRACER
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dd4d86ae8e21..bff7d4c69274 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4816,19 +4816,20 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
struct trace_iterator *iter = filp->private_data;
ssize_t sret;
- /* return any leftover data */
- sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
- if (sret != -EBUSY)
- return sret;
-
- trace_seq_init(&iter->seq);
-
/*
* Avoid more than one consumer on a single file descriptor
* This is just a matter of traces coherency, the ring buffer itself
* is protected.
*/
mutex_lock(&iter->mutex);
+
+ /* return any leftover data */
+ sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
+ if (sret != -EBUSY)
+ goto out;
+
+ trace_seq_init(&iter->seq);
+
if (iter->trace->read) {
sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
if (sret)
@@ -5855,9 +5856,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
return -EBUSY;
#endif
- if (splice_grow_spd(pipe, &spd))
- return -ENOMEM;
-
if (*ppos & (PAGE_SIZE - 1))
return -EINVAL;
@@ -5867,6 +5865,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
len &= PAGE_MASK;
}
+ if (splice_grow_spd(pipe, &spd))
+ return -ENOMEM;
+
again:
trace_access_lock(iter->cpu_file);
entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
@@ -5924,19 +5925,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
/* did we read anything? */
if (!spd.nr_pages) {
if (ret)
- return ret;
+ goto out;
+ ret = -EAGAIN;
if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
- return -EAGAIN;
+ goto out;
ret = wait_on_pipe(iter, true);
if (ret)
- return ret;
+ goto out;
goto again;
}
ret = splice_to_pipe(pipe, &spd);
+out:
splice_shrink_spd(&spd);
return ret;