author    | Tao Huang <huangtao@rock-chips.com> | 2017-12-01 11:04:13 +0800
committer | Tao Huang <huangtao@rock-chips.com> | 2017-12-01 11:04:13 +0800
commit    | afd240d16841b1964db94a19d89d988ad80773ee (patch)
tree      | bbec7e663b66a4278e7fc4e7b14158147ac3c21f /kernel
parent    | 7b1e4fbf3b2f50012f996f520099f4075963e3c2 (diff)
parent    | 0f646885c342e74051e3d40acd40af310469526d (diff)
Merge branch 'linux-linaro-lsk-v4.4-android' of git://git.linaro.org/kernel/linux-linaro-stable.git
* linux-linaro-lsk-v4.4-android: (510 commits)
Linux 4.4.103
Revert "sctp: do not peel off an assoc from one netns to another one"
xen: xenbus driver must not accept invalid transaction ids
s390/kbuild: enable modversions for symbols exported from asm
ASoC: wm_adsp: Don't overrun firmware file buffer when reading region data
btrfs: return the actual error value from from btrfs_uuid_tree_iterate
ASoC: rsnd: don't double free kctrl
netfilter: nf_tables: fix oob access
netfilter: nft_queue: use raw_smp_processor_id()
spi: SPI_FSL_DSPI should depend on HAS_DMA
staging: iio: cdc: fix improper return value
iio: light: fix improper return value
mac80211: Suppress NEW_PEER_CANDIDATE event if no room
mac80211: Remove invalid flag operations in mesh TSF synchronization
drm: Apply range restriction after color adjustment when allocation
ALSA: hda - Apply ALC269_FIXUP_NO_SHUTUP on HDA_FIXUP_ACT_PROBE
ath10k: set CTS protection VDEV param only if VDEV is up
ath10k: fix potential memory leak in ath10k_wmi_tlv_op_pull_fw_stats()
ath10k: ignore configuring the incorrect board_id
ath10k: fix incorrect txpower set by P2P_DEVICE interface
...
Conflicts:
drivers/media/v4l2-core/v4l2-ctrls.c
kernel/sched/fair.c
Change-Id: I48152b2a0ab1f9f07e1da7823119b94f9b9e1751
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/bpf/verifier.c           |   3
-rw-r--r-- | kernel/events/core.c            |   3
-rw-r--r-- | kernel/kexec.c                  | 109
-rw-r--r-- | kernel/kexec_core.c             |   9
-rw-r--r-- | kernel/kexec_file.c             |   8
-rw-r--r-- | kernel/locking/lockdep.c        |  11
-rw-r--r-- | kernel/sched/Makefile           |   1
-rw-r--r-- | kernel/sched/auto_group.c       |  23
-rw-r--r-- | kernel/sched/core.c             | 263
-rw-r--r-- | kernel/sched/cpufreq_sched.c    | 525
-rw-r--r-- | kernel/sched/cpufreq_schedutil.c |  3
-rw-r--r-- | kernel/sched/deadline.c         |   7
-rw-r--r-- | kernel/sched/fair.c             | 575
-rw-r--r-- | kernel/sched/idle_task.c        |   3
-rw-r--r-- | kernel/sched/rt.c               | 295
-rw-r--r-- | kernel/sched/sched.h            | 142
-rw-r--r-- | kernel/sched/stop_task.c        |   3
-rw-r--r-- | kernel/sched/walt.c             | 411
-rw-r--r-- | kernel/sched/walt.h             |   2
-rw-r--r-- | kernel/sysctl.c                 |   7
-rw-r--r-- | kernel/workqueue.c              |  37
-rw-r--r-- | kernel/workqueue_internal.h     |   3
22 files changed, 843 insertions, 1600 deletions
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c97bce6a0e0e..eb759f5008b8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1044,7 +1044,8 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) } } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0 || - (insn->imm != 16 && insn->imm != 32 && insn->imm != 64)) { + (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) || + BPF_CLASS(insn->code) == BPF_ALU64) { verbose("BPF_END uses reserved fields\n"); return -EINVAL; } diff --git a/kernel/events/core.c b/kernel/events/core.c index 42338667f910..2cd263d3ad14 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7238,6 +7238,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) } event->tp_event->prog = prog; + event->tp_event->bpf_prog_owner = event; return 0; } @@ -7250,7 +7251,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event) return; prog = event->tp_event->prog; - if (prog) { + if (prog && event->tp_event->bpf_prog_owner == event) { event->tp_event->prog = NULL; bpf_prog_put_rcu(prog); } diff --git a/kernel/kexec.c b/kernel/kexec.c index d873b64fbddc..e1acab9c8260 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -103,6 +103,65 @@ out_free_image: return ret; } +static int do_kexec_load(unsigned long entry, unsigned long nr_segments, + struct kexec_segment __user *segments, unsigned long flags) +{ + struct kimage **dest_image, *image; + unsigned long i; + int ret; + + if (flags & KEXEC_ON_CRASH) { + dest_image = &kexec_crash_image; + if (kexec_crash_image) + arch_kexec_unprotect_crashkres(); + } else { + dest_image = &kexec_image; + } + + if (nr_segments == 0) { + /* Uninstall image */ + kimage_free(xchg(dest_image, NULL)); + return 0; + } + if (flags & KEXEC_ON_CRASH) { + /* + * Loading another kernel to switch to if this one + * crashes. Free any current crash dump kernel before + * we corrupt it. + */ + kimage_free(xchg(&kexec_crash_image, NULL)); + } + + ret = kimage_alloc_init(&image, entry, nr_segments, segments, flags); + if (ret) + return ret; + + if (flags & KEXEC_PRESERVE_CONTEXT) + image->preserve_context = 1; + + ret = machine_kexec_prepare(image); + if (ret) + goto out; + + for (i = 0; i < nr_segments; i++) { + ret = kimage_load_segment(image, &image->segment[i]); + if (ret) + goto out; + } + + kimage_terminate(image); + + /* Install the new kernel and uninstall the old */ + image = xchg(dest_image, image); + +out: + if ((flags & KEXEC_ON_CRASH) && kexec_crash_image) + arch_kexec_protect_crashkres(); + + kimage_free(image); + return ret; +} + /* * Exec Kernel system call: for obvious reasons only root may call it. * @@ -127,7 +186,6 @@ out_free_image: SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, struct kexec_segment __user *, segments, unsigned long, flags) { - struct kimage **dest_image, *image; int result; /* We only trust the superuser with rebooting the system. 
*/ @@ -152,9 +210,6 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, if (nr_segments > KEXEC_SEGMENT_MAX) return -EINVAL; - image = NULL; - result = 0; - /* Because we write directly to the reserved memory * region when loading crash kernels we need a mutex here to * prevent multiple crash kernels from attempting to load @@ -166,53 +221,9 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, if (!mutex_trylock(&kexec_mutex)) return -EBUSY; - dest_image = &kexec_image; - if (flags & KEXEC_ON_CRASH) - dest_image = &kexec_crash_image; - if (nr_segments > 0) { - unsigned long i; - - if (flags & KEXEC_ON_CRASH) { - /* - * Loading another kernel to switch to if this one - * crashes. Free any current crash dump kernel before - * we corrupt it. - */ - - kimage_free(xchg(&kexec_crash_image, NULL)); - result = kimage_alloc_init(&image, entry, nr_segments, - segments, flags); - crash_map_reserved_pages(); - } else { - /* Loading another kernel to reboot into. */ - - result = kimage_alloc_init(&image, entry, nr_segments, - segments, flags); - } - if (result) - goto out; - - if (flags & KEXEC_PRESERVE_CONTEXT) - image->preserve_context = 1; - result = machine_kexec_prepare(image); - if (result) - goto out; - - for (i = 0; i < nr_segments; i++) { - result = kimage_load_segment(image, &image->segment[i]); - if (result) - goto out; - } - kimage_terminate(image); - if (flags & KEXEC_ON_CRASH) - crash_unmap_reserved_pages(); - } - /* Install the new kernel, and Uninstall the old */ - image = xchg(dest_image, image); + result = do_kexec_load(entry, nr_segments, segments, flags); -out: mutex_unlock(&kexec_mutex); - kimage_free(image); return result; } diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 11b64a63c0f8..aa88c32b54f0 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -926,7 +926,6 @@ int crash_shrink_memory(unsigned long new_size) start = roundup(start, KEXEC_CRASH_MEM_ALIGN); end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN); - crash_map_reserved_pages(); crash_free_reserved_phys_range(end, crashk_res.end); if ((start == end) && (crashk_res.parent != NULL)) @@ -940,7 +939,6 @@ int crash_shrink_memory(unsigned long new_size) crashk_res.end = end - 1; insert_resource(&iomem_resource, ram_res); - crash_unmap_reserved_pages(); unlock: mutex_unlock(&kexec_mutex); @@ -1522,13 +1520,14 @@ int kernel_kexec(void) } /* - * Add and remove page tables for crashkernel memory + * Protection mechanism for crashkernel reserved memory after + * the kdump kernel is loaded. 
* * Provide an empty default implementation here -- architecture * code may override this */ -void __weak crash_map_reserved_pages(void) +void __weak arch_kexec_protect_crashkres(void) {} -void __weak crash_unmap_reserved_pages(void) +void __weak arch_kexec_unprotect_crashkres(void) {} diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 6030efd4a188..ef2cf637f840 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -327,8 +327,11 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, return -EBUSY; dest_image = &kexec_image; - if (flags & KEXEC_FILE_ON_CRASH) + if (flags & KEXEC_FILE_ON_CRASH) { dest_image = &kexec_crash_image; + if (kexec_crash_image) + arch_kexec_unprotect_crashkres(); + } if (flags & KEXEC_FILE_UNLOAD) goto exchange; @@ -377,6 +380,9 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, exchange: image = xchg(dest_image, image); out: + if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image) + arch_kexec_protect_crashkres(); + mutex_unlock(&kexec_mutex); kimage_free(image); return ret; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 60ace56618f6..0e2c4911ba61 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3128,10 +3128,17 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (depth) { hlock = curr->held_locks + depth - 1; if (hlock->class_idx == class_idx && nest_lock) { - if (hlock->references) + if (hlock->references) { + /* + * Check: unsigned int references:12, overflow. + */ + if (DEBUG_LOCKS_WARN_ON(hlock->references == (1 << 12)-1)) + return 0; + hlock->references++; - else + } else { hlock->references = 2; + } return 1; } diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index ca0d94096170..d7ec4f7dd0d9 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -22,5 +22,4 @@ obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_SCHED_TUNE) += tune.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o obj-$(CONFIG_CPU_FREQ) += cpufreq.o -obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 750ed601ddf7..8620fd01b3d0 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -111,14 +111,11 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) { if (tg != &root_task_group) return false; - /* - * We can only assume the task group can't go away on us if - * autogroup_move_group() can see us on ->thread_group list. + * If we race with autogroup_move_group() the caller can use the old + * value of signal->autogroup but in this case sched_move_task() will + * be called again before autogroup_kref_put(). */ - if (p->flags & PF_EXITING) - return false; - return true; } @@ -138,13 +135,17 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) } p->signal->autogroup = autogroup_kref_get(ag); - - if (!READ_ONCE(sysctl_sched_autogroup_enabled)) - goto out; - + /* + * We can't avoid sched_move_task() after we changed signal->autogroup, + * this process can already run with task_group() == prev->tg or we can + * race with cgroup code which can read autogroup = prev under rq->lock. + * In the latter case for_each_thread() can not miss a migrating thread, + * cpu_cgroup_attach() must not be possible after cgroup_exit() and it + * can't be removed from thread list, we hold ->siglock. 
+ */ for_each_thread(p, t) sched_move_task(t); -out: + unlock_task_sighand(p, &flags); autogroup_kref_put(prev); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 31a636085ac8..d3e77586657a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -547,6 +547,8 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) return; + head->count++; + get_task_struct(task); /* @@ -556,6 +558,10 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) head->lastp = &node->next; } +static int +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags, + int sibling_count_hint); + void wake_up_q(struct wake_q_head *head) { struct wake_q_node *node = head->first; @@ -570,10 +576,10 @@ void wake_up_q(struct wake_q_head *head) task->wake_q.next = NULL; /* - * wake_up_process() implies a wmb() to pair with the queueing + * try_to_wake_up() implies a wmb() to pair with the queueing * in wake_q_add() so as not to miss wakeups. */ - wake_up_process(task); + try_to_wake_up(task, TASK_NORMAL, 0, head->count); put_task_struct(task); } } @@ -614,8 +620,7 @@ void resched_cpu(int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - if (!raw_spin_trylock_irqsave(&rq->lock, flags)) - return; + raw_spin_lock_irqsave(&rq->lock, flags); resched_curr(rq); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -1332,7 +1337,9 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) dst_rq = cpu_rq(cpu); deactivate_task(src_rq, p, 0); + p->on_rq = TASK_ON_RQ_MIGRATING; set_task_cpu(p, cpu); + p->on_rq = TASK_ON_RQ_QUEUED; activate_task(dst_rq, p, 0); check_preempt_curr(dst_rq, p, 0); } else { @@ -1641,12 +1648,14 @@ out: * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. */ static inline -int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) +int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags, + int sibling_count_hint) { lockdep_assert_held(&p->pi_lock); if (p->nr_cpus_allowed > 1) - cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); + cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags, + sibling_count_hint); /* * In order not to call set_task_cpu() on a blocking task we need @@ -1931,6 +1940,8 @@ static void ttwu_queue(struct task_struct *p, int cpu) * @p: the thread to be awakened * @state: the mask of task states that can be woken * @wake_flags: wake modifier flags (WF_*) + * @sibling_count_hint: A hint at the number of threads that are being woken up + * in this event. * * Put it on the run-queue if it's not already there. The "current" * thread is always on the run-queue (except when the actual @@ -1942,7 +1953,8 @@ static void ttwu_queue(struct task_struct *p, int cpu) * or @state didn't match @p's state. 
*/ static int -try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags, + int sibling_count_hint) { unsigned long flags; int cpu, success = 0; @@ -2043,8 +2055,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (p->sched_class->task_waking) p->sched_class->task_waking(p); - cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); - + cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags, + sibling_count_hint); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; set_task_cpu(p, cpu); @@ -2126,13 +2138,13 @@ out: */ int wake_up_process(struct task_struct *p) { - return try_to_wake_up(p, TASK_NORMAL, 0); + return try_to_wake_up(p, TASK_NORMAL, 0, 1); } EXPORT_SYMBOL(wake_up_process); int wake_up_state(struct task_struct *p, unsigned int state) { - return try_to_wake_up(p, state, 0); + return try_to_wake_up(p, state, 0, 1); } /* @@ -2169,9 +2181,17 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; +#ifdef CONFIG_SCHED_WALT + p->last_sleep_ts = 0; +#endif + INIT_LIST_HEAD(&p->se.group_node); walt_init_new_task_load(p); +#ifdef CONFIG_FAIR_GROUP_SCHED + p->se.cfs_rq = NULL; +#endif + #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif @@ -2254,11 +2274,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) __sched_fork(clone_flags, p); /* - * We mark the process as running here. This guarantees that + * We mark the process as NEW here. This guarantees that * nobody will actually run it, and a signal or other external * event cannot wake it up and insert it on the runqueue either. */ - p->state = TASK_RUNNING; + p->state = TASK_NEW; /* * Make sure we do not leak PI boosting priority to the child. @@ -2295,8 +2315,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->sched_class = &fair_sched_class; } - if (p->sched_class->task_fork) - p->sched_class->task_fork(p); + init_entity_runnable_average(&p->se); /* * The child is not yet in the pid-hash so no cgroup attach races, @@ -2306,7 +2325,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) * Silence PROVE_RCU. */ raw_spin_lock_irqsave(&p->pi_lock, flags); - set_task_cpu(p, cpu); + /* + * We're setting the cpu for the first time, we don't migrate, + * so use __set_task_cpu(). + */ + __set_task_cpu(p, cpu); + if (p->sched_class->task_fork) + p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); #ifdef CONFIG_SCHED_INFO @@ -2438,6 +2463,7 @@ void wake_up_new_task(struct task_struct *p) struct rq *rq; raw_spin_lock_irqsave(&p->pi_lock, flags); + p->state = TASK_RUNNING; walt_init_new_task_load(p); @@ -2448,10 +2474,14 @@ void wake_up_new_task(struct task_struct *p) * Fork balancing, do it here and not earlier because: * - cpus_allowed can change in the fork path * - any previously selected cpu might disappear through hotplug + * + * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, + * as we're not fully set-up yet. 
*/ - set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); + __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1)); #endif rq = __task_rq_lock(p); + update_rq_clock(rq); post_init_entity_util_avg(&p->se); walt_mark_task_starting(p); @@ -2886,7 +2916,7 @@ void sched_exec(void) int dest_cpu; raw_spin_lock_irqsave(&p->pi_lock, flags); - dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); + dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0, 1); if (dest_cpu == smp_processor_id()) goto unlock; @@ -2952,94 +2982,6 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } -#ifdef CONFIG_CPU_FREQ_GOV_SCHED - -static inline -unsigned long add_capacity_margin(unsigned long cpu_capacity) -{ - cpu_capacity = cpu_capacity * capacity_margin; - cpu_capacity /= SCHED_CAPACITY_SCALE; - return cpu_capacity; -} - -static inline -unsigned long sum_capacity_reqs(unsigned long cfs_cap, - struct sched_capacity_reqs *scr) -{ - unsigned long total = add_capacity_margin(cfs_cap + scr->rt); - return total += scr->dl; -} - -unsigned long boosted_cpu_util(int cpu); -static void sched_freq_tick_pelt(int cpu) -{ - unsigned long cpu_utilization = boosted_cpu_util(cpu); - unsigned long capacity_curr = capacity_curr_of(cpu); - struct sched_capacity_reqs *scr; - - scr = &per_cpu(cpu_sched_capacity_reqs, cpu); - if (sum_capacity_reqs(cpu_utilization, scr) < capacity_curr) - return; - - /* - * To make free room for a task that is building up its "real" - * utilization and to harm its performance the least, request - * a jump to a higher OPP as soon as the margin of free capacity - * is impacted (specified by capacity_margin). - */ - set_cfs_cpu_capacity(cpu, true, cpu_utilization); -} - -#ifdef CONFIG_SCHED_WALT -static void sched_freq_tick_walt(int cpu) -{ - unsigned long cpu_utilization = cpu_util_freq(cpu); - unsigned long capacity_curr = capacity_curr_of(cpu); - - if (walt_disabled || !sysctl_sched_use_walt_cpu_util) - return sched_freq_tick_pelt(cpu); - - /* - * Add a margin to the WALT utilization to check if we will need to - * increase frequency. - * NOTE: WALT tracks a single CPU signal for all the scheduling - * classes, thus this margin is going to be added to the DL class as - * well, which is something we do not do in sched_freq_tick_pelt case. - */ - if (add_capacity_margin(cpu_utilization) <= capacity_curr) - return; - - /* - * It is likely that the load is growing so we - * keep the added margin in our request as an - * extra boost. - */ - set_cfs_cpu_capacity(cpu, true, cpu_utilization); - -} -#define _sched_freq_tick(cpu) sched_freq_tick_walt(cpu) -#else -#define _sched_freq_tick(cpu) sched_freq_tick_pelt(cpu) -#endif /* CONFIG_SCHED_WALT */ - -static void sched_freq_tick(int cpu) -{ - unsigned long capacity_orig, capacity_curr; - - if (!sched_freq()) - return; - - capacity_orig = capacity_orig_of(cpu); - capacity_curr = capacity_curr_of(cpu); - if (capacity_curr == capacity_orig) - return; - - _sched_freq_tick(cpu); -} -#else -static inline void sched_freq_tick(int cpu) { } -#endif /* CONFIG_CPU_FREQ_GOV_SCHED */ - /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. 
@@ -3054,13 +2996,12 @@ void scheduler_tick(void) raw_spin_lock(&rq->lock); walt_set_window_start(rq); + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, + walt_ktime_clock(), 0); update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); update_cpu_load_active(rq); - walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, - walt_ktime_clock(), 0); calc_global_load_tick(rq); - sched_freq_tick(cpu); raw_spin_unlock(&rq->lock); perf_event_task_tick(); @@ -3070,6 +3011,9 @@ void scheduler_tick(void) trigger_load_balance(rq); #endif rq_last_tick_reset(rq); + + if (curr->sched_class == &fair_sched_class) + check_for_migration(rq, curr); } #ifdef CONFIG_NO_HZ_FULL @@ -3367,6 +3311,10 @@ static void __sched notrace __schedule(bool preempt) rq->clock_skip_update = 0; if (likely(prev != next)) { +#ifdef CONFIG_SCHED_WALT + if (!prev->on_rq) + prev->last_sleep_ts = wallclock; +#endif rq->nr_switches++; rq->curr = next; ++*switch_count; @@ -3540,7 +3488,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, void *key) { - return try_to_wake_up(curr->private, mode, wake_flags); + return try_to_wake_up(curr->private, mode, wake_flags, 1); } EXPORT_SYMBOL(default_wake_function); @@ -3566,6 +3514,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) BUG_ON(prio > MAX_PRIO); rq = __task_rq_lock(p); + update_rq_clock(rq); /* * Idle task boosting is a nono in general. There is one @@ -3657,6 +3606,8 @@ void set_user_nice(struct task_struct *p, long nice) * the task might be in the middle of scheduling on another CPU. */ rq = task_rq_lock(p, &flags); + update_rq_clock(rq); + /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected @@ -4083,6 +4034,7 @@ recheck: * runqueue lock must be held. */ rq = task_rq_lock(p, &flags); + update_rq_clock(rq); /* * Changing the policy of the stop threads its a very bad idea @@ -6156,6 +6108,12 @@ static int init_rootdomain(struct root_domain *rd) if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) goto free_dlo_mask; +#ifdef HAVE_RT_PUSH_IPI + rd->rto_cpu = -1; + raw_spin_lock_init(&rd->rto_lock); + init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); +#endif + init_dl_bw(&rd->dl_bw); if (cpudl_init(&rd->cpudl) != 0) goto free_dlo_mask; @@ -7703,7 +7661,6 @@ void __init sched_init_smp(void) { cpumask_var_t non_isolated_cpus; - walt_init_cpu_efficiency(); alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); alloc_cpumask_var(&fallback_doms, GFP_KERNEL); @@ -7877,6 +7834,7 @@ void __init sched_init(void) rq->active_balance = 0; rq->next_balance = jiffies; rq->push_cpu = 0; + rq->push_task = NULL; rq->cpu = i; rq->online = 0; rq->idle_stamp = 0; @@ -8183,27 +8141,9 @@ void sched_offline_group(struct task_group *tg) spin_unlock_irqrestore(&task_group_lock, flags); } -/* change task's runqueue when it moves between groups. - * The caller of this function should have put the task in its new group - * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to - * reflect its new group. 
- */ -void sched_move_task(struct task_struct *tsk) +static void sched_change_group(struct task_struct *tsk, int type) { struct task_group *tg; - int queued, running; - unsigned long flags; - struct rq *rq; - - rq = task_rq_lock(tsk, &flags); - - running = task_current(rq, tsk); - queued = task_on_rq_queued(tsk); - - if (queued) - dequeue_task(rq, tsk, DEQUEUE_SAVE); - if (unlikely(running)) - put_prev_task(rq, tsk); /* * All callers are synchronized by task_rq_lock(); we do not use RCU @@ -8216,11 +8156,37 @@ void sched_move_task(struct task_struct *tsk) tsk->sched_task_group = tg; #ifdef CONFIG_FAIR_GROUP_SCHED - if (tsk->sched_class->task_move_group) - tsk->sched_class->task_move_group(tsk); + if (tsk->sched_class->task_change_group) + tsk->sched_class->task_change_group(tsk, type); else #endif set_task_rq(tsk, task_cpu(tsk)); +} + +/* + * Change task's runqueue when it moves between groups. + * + * The caller of this function should have put the task in its new group by + * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect + * its new group. + */ +void sched_move_task(struct task_struct *tsk) +{ + int queued, running; + unsigned long flags; + struct rq *rq; + + rq = task_rq_lock(tsk, &flags); + + running = task_current(rq, tsk); + queued = task_on_rq_queued(tsk); + + if (queued) + dequeue_task(rq, tsk, DEQUEUE_SAVE); + if (unlikely(running)) + put_prev_task(rq, tsk); + + sched_change_group(tsk, TASK_MOVE_GROUP); if (unlikely(running)) tsk->sched_class->set_curr_task(rq); @@ -8657,15 +8623,28 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) sched_free_group(tg); } +/* + * This is called before wake_up_new_task(), therefore we really only + * have to set its group bits, all the other stuff does not apply. + */ static void cpu_cgroup_fork(struct task_struct *task, void *private) { - sched_move_task(task); + unsigned long flags; + struct rq *rq; + + rq = task_rq_lock(task, &flags); + + update_rq_clock(rq); + sched_change_group(task, TASK_SET_GROUP); + + task_rq_unlock(rq, task, &flags); } static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct cgroup_subsys_state *css; + int ret = 0; cgroup_taskset_for_each(task, css, tset) { #ifdef CONFIG_RT_GROUP_SCHED @@ -8676,8 +8655,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) if (task->sched_class != &fair_sched_class) return -EINVAL; #endif + /* + * Serialize against wake_up_new_task() such that if its + * running, we're sure to observe its full state. + */ + raw_spin_lock_irq(&task->pi_lock); + /* + * Avoid calling sched_move_task() before wake_up_new_task() + * has happened. This would lead to problems with PELT, due to + * move wanting to detach+attach while we're not attached yet. + */ + if (task->state == TASK_NEW) + ret = -EINVAL; + raw_spin_unlock_irq(&task->pi_lock); + + if (ret) + break; } - return 0; + return ret; } static void cpu_cgroup_attach(struct cgroup_taskset *tset) diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c deleted file mode 100644 index 6ffb23adbcef..000000000000 --- a/kernel/sched/cpufreq_sched.c +++ /dev/null @@ -1,525 +0,0 @@ -/* - * Copyright (C) 2015 Michael Turquette <mturquette@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include <linux/cpufreq.h> -#include <linux/module.h> -#include <linux/kthread.h> -#include <linux/percpu.h> -#include <linux/irq_work.h> -#include <linux/delay.h> -#include <linux/string.h> - -#define CREATE_TRACE_POINTS -#include <trace/events/cpufreq_sched.h> - -#include "sched.h" - -#define THROTTLE_DOWN_NSEC 50000000 /* 50ms default */ -#define THROTTLE_UP_NSEC 500000 /* 500us default */ - -struct static_key __read_mostly __sched_freq = STATIC_KEY_INIT_FALSE; -static bool __read_mostly cpufreq_driver_slow; - -#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED -static struct cpufreq_governor cpufreq_gov_sched; -#endif - -static DEFINE_PER_CPU(unsigned long, enabled); -DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); - -struct gov_tunables { - struct gov_attr_set attr_set; - unsigned int up_throttle_nsec; - unsigned int down_throttle_nsec; -}; - -/** - * gov_data - per-policy data internal to the governor - * @up_throttle: next throttling period expiry if increasing OPP - * @down_throttle: next throttling period expiry if decreasing OPP - * @up_throttle_nsec: throttle period length in nanoseconds if increasing OPP - * @down_throttle_nsec: throttle period length in nanoseconds if decreasing OPP - * @task: worker thread for dvfs transition that may block/sleep - * @irq_work: callback used to wake up worker thread - * @requested_freq: last frequency requested by the sched governor - * - * struct gov_data is the per-policy cpufreq_sched-specific data structure. A - * per-policy instance of it is created when the cpufreq_sched governor receives - * the CPUFREQ_GOV_START condition and a pointer to it exists in the gov_data - * member of struct cpufreq_policy. - * - * Readers of this data must call down_read(policy->rwsem). Writers must - * call down_write(policy->rwsem). - */ -struct gov_data { - ktime_t up_throttle; - ktime_t down_throttle; - struct gov_tunables *tunables; - struct list_head tunables_hook; - struct task_struct *task; - struct irq_work irq_work; - unsigned int requested_freq; -}; - -static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy, - unsigned int freq) -{ - struct gov_data *gd = policy->governor_data; - - /* avoid race with cpufreq_sched_stop */ - if (!down_write_trylock(&policy->rwsem)) - return; - - __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L); - - gd->up_throttle = ktime_add_ns(ktime_get(), - gd->tunables->up_throttle_nsec); - gd->down_throttle = ktime_add_ns(ktime_get(), - gd->tunables->down_throttle_nsec); - up_write(&policy->rwsem); -} - -static bool finish_last_request(struct gov_data *gd, unsigned int cur_freq) -{ - ktime_t now = ktime_get(); - - ktime_t throttle = gd->requested_freq < cur_freq ? - gd->down_throttle : gd->up_throttle; - - if (ktime_after(now, throttle)) - return false; - - while (1) { - int usec_left = ktime_to_ns(ktime_sub(throttle, now)); - - usec_left /= NSEC_PER_USEC; - trace_cpufreq_sched_throttled(usec_left); - usleep_range(usec_left, usec_left + 100); - now = ktime_get(); - if (ktime_after(now, throttle)) - return true; - } -} - -/* - * we pass in struct cpufreq_policy. 
This is safe because changing out the - * policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP), - * which tears down all of the data structures and __cpufreq_governor(policy, - * CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the - * new policy pointer - */ -static int cpufreq_sched_thread(void *data) -{ - struct sched_param param; - struct cpufreq_policy *policy; - struct gov_data *gd; - unsigned int new_request = 0; - unsigned int last_request = 0; - int ret; - - policy = (struct cpufreq_policy *) data; - gd = policy->governor_data; - - param.sched_priority = 50; - ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, ¶m); - if (ret) { - pr_warn("%s: failed to set SCHED_FIFO\n", __func__); - do_exit(-EINVAL); - } else { - pr_debug("%s: kthread (%d) set to SCHED_FIFO\n", - __func__, gd->task->pid); - } - - do { - new_request = gd->requested_freq; - if (new_request == last_request) { - set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop()) - break; - schedule(); - } else { - /* - * if the frequency thread sleeps while waiting to be - * unthrottled, start over to check for a newer request - */ - if (finish_last_request(gd, policy->cur)) - continue; - last_request = new_request; - cpufreq_sched_try_driver_target(policy, new_request); - } - } while (!kthread_should_stop()); - - return 0; -} - -static void cpufreq_sched_irq_work(struct irq_work *irq_work) -{ - struct gov_data *gd; - - gd = container_of(irq_work, struct gov_data, irq_work); - if (!gd) - return; - - wake_up_process(gd->task); -} - -static void update_fdomain_capacity_request(int cpu) -{ - unsigned int freq_new, index_new, cpu_tmp; - struct cpufreq_policy *policy; - struct gov_data *gd; - unsigned long capacity = 0; - - /* - * Avoid grabbing the policy if possible. A test is still - * required after locking the CPU's policy to avoid racing - * with the governor changing. - */ - if (!per_cpu(enabled, cpu)) - return; - - policy = cpufreq_cpu_get(cpu); - if (IS_ERR_OR_NULL(policy)) - return; - - if (policy->governor != &cpufreq_gov_sched || - !policy->governor_data) - goto out; - - gd = policy->governor_data; - - /* find max capacity requested by cpus in this policy */ - for_each_cpu(cpu_tmp, policy->cpus) { - struct sched_capacity_reqs *scr; - - scr = &per_cpu(cpu_sched_capacity_reqs, cpu_tmp); - capacity = max(capacity, scr->total); - } - - /* Convert the new maximum capacity request into a cpu frequency */ - freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT; - if (cpufreq_frequency_table_target(policy, policy->freq_table, - freq_new, CPUFREQ_RELATION_L, - &index_new)) - goto out; - freq_new = policy->freq_table[index_new].frequency; - - if (freq_new > policy->max) - freq_new = policy->max; - - if (freq_new < policy->min) - freq_new = policy->min; - - trace_cpufreq_sched_request_opp(cpu, capacity, freq_new, - gd->requested_freq); - if (freq_new == gd->requested_freq) - goto out; - - gd->requested_freq = freq_new; - - /* - * Throttling is not yet supported on platforms with fast cpufreq - * drivers. 
- */ - if (cpufreq_driver_slow) - irq_work_queue_on(&gd->irq_work, cpu); - else - cpufreq_sched_try_driver_target(policy, freq_new); - -out: - cpufreq_cpu_put(policy); -} - -#ifdef CONFIG_SCHED_WALT -static inline unsigned long -requested_capacity(struct sched_capacity_reqs *scr) -{ - if (!walt_disabled && sysctl_sched_use_walt_cpu_util) - return scr->cfs; - return scr->cfs + scr->rt; -} -#else -#define requested_capacity(scr) (scr->cfs + scr->rt) -#endif - -void update_cpu_capacity_request(int cpu, bool request) -{ - unsigned long new_capacity; - struct sched_capacity_reqs *scr; - - /* The rq lock serializes access to the CPU's sched_capacity_reqs. */ - lockdep_assert_held(&cpu_rq(cpu)->lock); - - scr = &per_cpu(cpu_sched_capacity_reqs, cpu); - - new_capacity = requested_capacity(scr); - new_capacity = new_capacity * capacity_margin - / SCHED_CAPACITY_SCALE; - new_capacity += scr->dl; - - if (new_capacity == scr->total) - return; - - trace_cpufreq_sched_update_capacity(cpu, request, scr, new_capacity); - - scr->total = new_capacity; - if (request) - update_fdomain_capacity_request(cpu); -} - -static inline void set_sched_freq(void) -{ - static_key_slow_inc(&__sched_freq); -} - -static inline void clear_sched_freq(void) -{ - static_key_slow_dec(&__sched_freq); -} - -/* Tunables */ -static struct gov_tunables *global_tunables; - -static inline struct gov_tunables *to_tunables(struct gov_attr_set *attr_set) -{ - return container_of(attr_set, struct gov_tunables, attr_set); -} - -static ssize_t up_throttle_nsec_show(struct gov_attr_set *attr_set, char *buf) -{ - struct gov_tunables *tunables = to_tunables(attr_set); - - return sprintf(buf, "%u\n", tunables->up_throttle_nsec); -} - -static ssize_t up_throttle_nsec_store(struct gov_attr_set *attr_set, - const char *buf, size_t count) -{ - struct gov_tunables *tunables = to_tunables(attr_set); - int ret; - long unsigned int val; - - ret = kstrtoul(buf, 0, &val); - if (ret < 0) - return ret; - tunables->up_throttle_nsec = val; - return count; -} - -static ssize_t down_throttle_nsec_show(struct gov_attr_set *attr_set, char *buf) -{ - struct gov_tunables *tunables = to_tunables(attr_set); - - return sprintf(buf, "%u\n", tunables->down_throttle_nsec); -} - -static ssize_t down_throttle_nsec_store(struct gov_attr_set *attr_set, - const char *buf, size_t count) -{ - struct gov_tunables *tunables = to_tunables(attr_set); - int ret; - long unsigned int val; - - ret = kstrtoul(buf, 0, &val); - if (ret < 0) - return ret; - tunables->down_throttle_nsec = val; - return count; -} - -static struct governor_attr up_throttle_nsec = __ATTR_RW(up_throttle_nsec); -static struct governor_attr down_throttle_nsec = __ATTR_RW(down_throttle_nsec); - -static struct attribute *schedfreq_attributes[] = { - &up_throttle_nsec.attr, - &down_throttle_nsec.attr, - NULL -}; - -static struct kobj_type tunables_ktype = { - .default_attrs = schedfreq_attributes, - .sysfs_ops = &governor_sysfs_ops, -}; - -static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) -{ - struct gov_data *gd; - int cpu; - int rc; - - for_each_cpu(cpu, policy->cpus) - memset(&per_cpu(cpu_sched_capacity_reqs, cpu), 0, - sizeof(struct sched_capacity_reqs)); - - gd = kzalloc(sizeof(*gd), GFP_KERNEL); - if (!gd) - return -ENOMEM; - - policy->governor_data = gd; - - if (!global_tunables) { - gd->tunables = kzalloc(sizeof(*gd->tunables), GFP_KERNEL); - if (!gd->tunables) - goto free_gd; - - gd->tunables->up_throttle_nsec = - policy->cpuinfo.transition_latency ? 
- policy->cpuinfo.transition_latency : - THROTTLE_UP_NSEC; - gd->tunables->down_throttle_nsec = - THROTTLE_DOWN_NSEC; - - rc = kobject_init_and_add(&gd->tunables->attr_set.kobj, - &tunables_ktype, - get_governor_parent_kobj(policy), - "%s", cpufreq_gov_sched.name); - if (rc) - goto free_tunables; - - gov_attr_set_init(&gd->tunables->attr_set, - &gd->tunables_hook); - - pr_debug("%s: throttle_threshold = %u [ns]\n", - __func__, gd->tunables->up_throttle_nsec); - - if (!have_governor_per_policy()) - global_tunables = gd->tunables; - } else { - gd->tunables = global_tunables; - gov_attr_set_get(&global_tunables->attr_set, - &gd->tunables_hook); - } - - policy->governor_data = gd; - if (cpufreq_driver_is_slow()) { - cpufreq_driver_slow = true; - gd->task = kthread_create(cpufreq_sched_thread, policy, - "kschedfreq:%d", - cpumask_first(policy->related_cpus)); - if (IS_ERR_OR_NULL(gd->task)) { - pr_err("%s: failed to create kschedfreq thread\n", - __func__); - goto free_tunables; - } - get_task_struct(gd->task); - kthread_bind_mask(gd->task, policy->related_cpus); - wake_up_process(gd->task); - init_irq_work(&gd->irq_work, cpufreq_sched_irq_work); - } - - set_sched_freq(); - - return 0; - -free_tunables: - kfree(gd->tunables); -free_gd: - policy->governor_data = NULL; - kfree(gd); - return -ENOMEM; -} - -static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy) -{ - unsigned int count; - struct gov_data *gd = policy->governor_data; - - clear_sched_freq(); - if (cpufreq_driver_slow) { - kthread_stop(gd->task); - put_task_struct(gd->task); - } - - count = gov_attr_set_put(&gd->tunables->attr_set, &gd->tunables_hook); - if (!count) { - if (!have_governor_per_policy()) - global_tunables = NULL; - kfree(gd->tunables); - } - - policy->governor_data = NULL; - - kfree(gd); - return 0; -} - -static int cpufreq_sched_start(struct cpufreq_policy *policy) -{ - int cpu; - - for_each_cpu(cpu, policy->cpus) - per_cpu(enabled, cpu) = 1; - - return 0; -} - -static void cpufreq_sched_limits(struct cpufreq_policy *policy) -{ - unsigned int clamp_freq; - struct gov_data *gd = policy->governor_data;; - - pr_debug("limit event for cpu %u: %u - %u kHz, currently %u kHz\n", - policy->cpu, policy->min, policy->max, - policy->cur); - - clamp_freq = clamp(gd->requested_freq, policy->min, policy->max); - - if (policy->cur != clamp_freq) - __cpufreq_driver_target(policy, clamp_freq, CPUFREQ_RELATION_L); -} - -static int cpufreq_sched_stop(struct cpufreq_policy *policy) -{ - int cpu; - - for_each_cpu(cpu, policy->cpus) - per_cpu(enabled, cpu) = 0; - - return 0; -} - -static int cpufreq_sched_setup(struct cpufreq_policy *policy, - unsigned int event) -{ - switch (event) { - case CPUFREQ_GOV_POLICY_INIT: - return cpufreq_sched_policy_init(policy); - case CPUFREQ_GOV_POLICY_EXIT: - return cpufreq_sched_policy_exit(policy); - case CPUFREQ_GOV_START: - return cpufreq_sched_start(policy); - case CPUFREQ_GOV_STOP: - return cpufreq_sched_stop(policy); - case CPUFREQ_GOV_LIMITS: - cpufreq_sched_limits(policy); - break; - } - return 0; -} - - -#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED -static -#endif -struct cpufreq_governor cpufreq_gov_sched = { - .name = "sched", - .governor = cpufreq_sched_setup, - .owner = THIS_MODULE, -}; - -static int __init cpufreq_sched_init(void) -{ - int cpu; - - for_each_cpu(cpu, cpu_possible_mask) - per_cpu(enabled, cpu) = 0; - return cpufreq_register_governor(&cpufreq_gov_sched); -} - -/* Try to make this the default governor */ -fs_initcall(cpufreq_sched_init); diff --git 
a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 28977799017b..d3765f0cb699 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -216,8 +216,9 @@ static void sugov_get_util(unsigned long *util, unsigned long *max, u64 time) *util = boosted_cpu_util(cpu); if (likely(use_pelt())) - *util = min((*util + rt), max_cap); + *util = *util + rt; + *util = min(*util, max_cap); *max = max_cap; } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 2aae8b8b68e8..cdf2a0b97611 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1070,7 +1070,8 @@ static void yield_task_dl(struct rq *rq) static int find_later_rq(struct task_struct *task); static int -select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags, + int sibling_count_hint) { struct task_struct *curr; struct rq *rq; @@ -1587,7 +1588,9 @@ retry: deactivate_task(rq, next_task, 0); clear_average_bw(&next_task->dl, &rq->dl); + next_task->on_rq = TASK_ON_RQ_MIGRATING; set_task_cpu(next_task, later_rq->cpu); + next_task->on_rq = TASK_ON_RQ_QUEUED; add_average_bw(&next_task->dl, &later_rq->dl); activate_task(later_rq, next_task, 0); ret = 1; @@ -1677,7 +1680,9 @@ static void pull_dl_task(struct rq *this_rq) deactivate_task(src_rq, p, 0); clear_average_bw(&p->dl, &src_rq->dl); + p->on_rq = TASK_ON_RQ_MIGRATING; set_task_cpu(p, this_cpu); + p->on_rq = TASK_ON_RQ_QUEUED; add_average_bw(&p->dl, &this_rq->dl); activate_task(this_rq, p, 0); dmin = p->dl.deadline; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 460681232b0a..4133355b66cd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -54,7 +54,6 @@ unsigned int sysctl_sched_latency = 6000000ULL; unsigned int normalized_sysctl_sched_latency = 6000000ULL; unsigned int sysctl_sched_sync_hint_enable = 1; -unsigned int sysctl_sched_initial_task_util = 0; unsigned int sysctl_sched_cstate_aware = 1; #ifdef CONFIG_SCHED_WALT @@ -750,9 +749,7 @@ void init_entity_runnable_average(struct sched_entity *se) sa->load_sum = sa->load_avg * LOAD_AVG_MAX; /* * In previous Android versions, we used to have: - * sa->util_avg = sched_freq() ? - * sysctl_sched_initial_task_util : - * scale_load_down(SCHED_LOAD_SCALE); + * sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); * sa->util_sum = sa->util_avg * LOAD_AVG_MAX; * However, that functionality has been moved to enqueue. * It is unclear if we should restore this in enqueue. 
@@ -766,7 +763,9 @@ void init_entity_runnable_average(struct sched_entity *se) } static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); +static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); static void attach_entity_cfs_rq(struct sched_entity *se); +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); /* * With new tasks being created, their initial util_avgs are extrapolated @@ -837,7 +836,7 @@ void post_init_entity_util_avg(struct sched_entity *se) attach_entity_cfs_rq(se); } -#else +#else /* !CONFIG_SMP */ void init_entity_runnable_average(struct sched_entity *se) { } @@ -3312,11 +3311,14 @@ void remove_entity_load_avg(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); /* - * Newly created task or never used group entity should not be removed - * from its (source) cfs_rq + * tasks cannot exit without having gone through wake_up_new_task() -> + * post_init_entity_util_avg() which will have added things to the + * cfs_rq, so we can remove unconditionally. + * + * Similarly for groups, they will have passed through + * post_init_entity_util_avg() before unregister_sched_fair_group() + * calls this. */ - if (se->avg.last_update_time == 0) - return; sync_entity_load_avg(se); atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); @@ -4663,21 +4665,6 @@ unsigned long boosted_cpu_util(int cpu); #define boosted_cpu_util(cpu) cpu_util_freq(cpu) #endif -#ifdef CONFIG_SMP -static void update_capacity_of(int cpu) -{ - unsigned long req_cap; - - if (!sched_freq()) - return; - - /* Convert scale-invariant capacity to cpu. */ - req_cap = boosted_cpu_util(cpu); - req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); - set_cfs_cpu_capacity(cpu, true, req_cap); -} -#endif - /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -4690,7 +4677,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; #ifdef CONFIG_SMP int task_new = flags & ENQUEUE_WAKEUP_NEW; - int task_wakeup = flags & ENQUEUE_WAKEUP; #endif /* @@ -4712,7 +4698,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * * note: in the case of encountering a throttled cfs_rq we will * post the final h_nr_running increment below. - */ + */ if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; @@ -4764,16 +4750,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) rq->rd->overutilized = true; trace_sched_overutilized(true); } - - /* - * We want to potentially trigger a freq switch - * request only for tasks that are waking up; this is - * because we get here also during load balancing, but - * in these cases it seems wise to trigger as single - * request after load balancing is done. - */ - if (task_new || task_wakeup) - update_capacity_of(cpu_of(rq)); } #endif /* CONFIG_SMP */ @@ -4849,25 +4825,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) */ schedtune_dequeue_task(p, cpu_of(rq)); - if (!se) { + if (!se) walt_dec_cumulative_runnable_avg(rq, p); - - /* - * We want to potentially trigger a freq switch - * request only for tasks that are going to sleep; - * this is because we get here also during load - * balancing, but in these cases it seems wise to - * trigger as single request after load balancing is - * done. 
- */ - if (task_sleep) { - if (rq->cfs.nr_running) - update_capacity_of(cpu_of(rq)); - else if (sched_freq()) - set_cfs_cpu_capacity(cpu_of(rq), false, 0); - } - } - #endif /* CONFIG_SMP */ hrtick_update(rq); @@ -5292,6 +5251,7 @@ struct energy_env { int util_delta; int src_cpu; int dst_cpu; + int trg_cpu; int energy; int payoff; struct task_struct *task; @@ -5308,11 +5268,14 @@ struct energy_env { } cap; }; +static int cpu_util_wake(int cpu, struct task_struct *p); + /* * __cpu_norm_util() returns the cpu util relative to a specific capacity, - * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE] which is useful for - * energy calculations. Using the scale-invariant util returned by - * cpu_util() and approximating scale-invariant util by: + * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE], which is useful for + * energy calculations. + * + * Since util is a scale-invariant utilization defined as: * * util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time * @@ -5322,34 +5285,32 @@ struct energy_env { * * norm_util = running_time/time ~ util/capacity */ -static unsigned long __cpu_norm_util(int cpu, unsigned long capacity, int delta) +static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity) { - int util = __cpu_util(cpu, delta); - if (util >= capacity) return SCHED_CAPACITY_SCALE; return (util << SCHED_CAPACITY_SHIFT)/capacity; } -static int calc_util_delta(struct energy_env *eenv, int cpu) +static unsigned long group_max_util(struct energy_env *eenv) { - if (cpu == eenv->src_cpu) - return -eenv->util_delta; - if (cpu == eenv->dst_cpu) - return eenv->util_delta; - return 0; -} - -static -unsigned long group_max_util(struct energy_env *eenv) -{ - int i, delta; unsigned long max_util = 0; + unsigned long util; + int cpu; + + for_each_cpu(cpu, sched_group_cpus(eenv->sg_cap)) { + util = cpu_util_wake(cpu, eenv->task); + + /* + * If we are looking at the target CPU specified by the eenv, + * then we should add the (estimated) utilization of the task + * assuming we will wake it up on that CPU. + */ + if (unlikely(cpu == eenv->trg_cpu)) + util += eenv->util_delta; - for_each_cpu(i, sched_group_cpus(eenv->sg_cap)) { - delta = calc_util_delta(eenv, i); - max_util = max(max_util, __cpu_util(i, delta)); + max_util = max(max_util, util); } return max_util; @@ -5357,44 +5318,56 @@ unsigned long group_max_util(struct energy_env *eenv) /* * group_norm_util() returns the approximated group util relative to it's - * current capacity (busy ratio) in the range [0..SCHED_LOAD_SCALE] for use in - * energy calculations. Since task executions may or may not overlap in time in - * the group the true normalized util is between max(cpu_norm_util(i)) and - * sum(cpu_norm_util(i)) when iterating over all cpus in the group, i. The - * latter is used as the estimate as it leads to a more pessimistic energy + * current capacity (busy ratio), in the range [0..SCHED_LOAD_SCALE], for use + * in energy calculations. + * + * Since task executions may or may not overlap in time in the group the true + * normalized util is between MAX(cpu_norm_util(i)) and SUM(cpu_norm_util(i)) + * when iterating over all CPUs in the group. + * The latter estimate is used as it leads to a more pessimistic energy * estimate (more busy). 
*/ static unsigned long group_norm_util(struct energy_env *eenv, struct sched_group *sg) { - int i, delta; - unsigned long util_sum = 0; unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap; + unsigned long util, util_sum = 0; + int cpu; - for_each_cpu(i, sched_group_cpus(sg)) { - delta = calc_util_delta(eenv, i); - util_sum += __cpu_norm_util(i, capacity, delta); + for_each_cpu(cpu, sched_group_cpus(sg)) { + util = cpu_util_wake(cpu, eenv->task); + + /* + * If we are looking at the target CPU specified by the eenv, + * then we should add the (estimated) utilization of the task + * assuming we will wake it up on that CPU. + */ + if (unlikely(cpu == eenv->trg_cpu)) + util += eenv->util_delta; + + util_sum += __cpu_norm_util(util, capacity); } - if (util_sum > SCHED_CAPACITY_SCALE) - return SCHED_CAPACITY_SCALE; - return util_sum; + return min_t(unsigned long, util_sum, SCHED_CAPACITY_SCALE); } static int find_new_capacity(struct energy_env *eenv, const struct sched_group_energy * const sge) { - int idx; + int idx, max_idx = sge->nr_cap_states - 1; unsigned long util = group_max_util(eenv); + /* default is max_cap if we don't find a match */ + eenv->cap_idx = max_idx; + for (idx = 0; idx < sge->nr_cap_states; idx++) { - if (sge->cap_states[idx].cap >= util) + if (sge->cap_states[idx].cap >= util) { + eenv->cap_idx = idx; break; + } } - eenv->cap_idx = idx; - - return idx; + return eenv->cap_idx; } static int group_idle_state(struct energy_env *eenv, struct sched_group *sg) @@ -5410,13 +5383,6 @@ static int group_idle_state(struct energy_env *eenv, struct sched_group *sg) /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */ state++; - /* - * Try to estimate if a deeper idle state is - * achievable when we move the task. - */ - for_each_cpu(i, sched_group_cpus(sg)) - grp_util += cpu_util(i); - src_in_grp = cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg)); dst_in_grp = cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg)); if (src_in_grp == dst_in_grp) { @@ -5425,10 +5391,16 @@ static int group_idle_state(struct energy_env *eenv, struct sched_group *sg) */ goto end; } - /* add or remove util as appropriate to indicate what group util - * will be (worst case - no concurrent execution) after moving the task + + /* + * Try to estimate if a deeper idle state is + * achievable when we move the task. */ - grp_util += src_in_grp ? 
-eenv->util_delta : eenv->util_delta; + for_each_cpu(i, sched_group_cpus(sg)) { + grp_util += cpu_util_wake(i, eenv->task); + if (unlikely(i == eenv->trg_cpu)) + grp_util += eenv->util_delta; + } if (grp_util <= ((long)sg->sgc->max_capacity * (int)sg->group_weight)) { @@ -5515,13 +5487,13 @@ static int sched_group_energy(struct energy_env *eenv) if (sg->group_weight == 1) { /* Remove capacity of src CPU (before task move) */ - if (eenv->util_delta == 0 && + if (eenv->trg_cpu == eenv->src_cpu && cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) { eenv->cap.before = sg->sge->cap_states[cap_idx].cap; eenv->cap.delta -= eenv->cap.before; } /* Add capacity of dst CPU (after task move) */ - if (eenv->util_delta != 0 && + if (eenv->trg_cpu == eenv->dst_cpu && cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) { eenv->cap.after = sg->sge->cap_states[cap_idx].cap; eenv->cap.delta += eenv->cap.after; @@ -5567,6 +5539,8 @@ static inline bool cpu_in_sg(struct sched_group *sg, int cpu) return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg)); } +static inline unsigned long task_util(struct task_struct *p); + /* * energy_diff(): Estimate the energy impact of changing the utilization * distribution. eenv specifies the change: utilisation amount, source, and @@ -5582,11 +5556,13 @@ static inline int __energy_diff(struct energy_env *eenv) int diff, margin; struct energy_env eenv_before = { - .util_delta = 0, + .util_delta = task_util(eenv->task), .src_cpu = eenv->src_cpu, .dst_cpu = eenv->dst_cpu, + .trg_cpu = eenv->src_cpu, .nrg = { 0, 0, 0, 0}, .cap = { 0, 0, 0 }, + .task = eenv->task, }; if (eenv->src_cpu == eenv->dst_cpu) @@ -5645,9 +5621,11 @@ static inline int __energy_diff(struct energy_env *eenv) #ifdef CONFIG_SCHED_TUNE struct target_nrg schedtune_target_nrg; + #ifdef CONFIG_CGROUP_SCHEDTUNE extern bool schedtune_initialized; -#endif +#endif /* CONFIG_CGROUP_SCHEDTUNE */ + /* * System energy normalization * Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE], @@ -5662,7 +5640,7 @@ normalize_energy(int energy_diff) /* during early setup, we don't know the extents */ if (unlikely(!schedtune_initialized)) return energy_diff < 0 ? -1 : 1 ; -#endif +#endif /* CONFIG_CGROUP_SCHEDTUNE */ #ifdef CONFIG_SCHED_DEBUG { @@ -5698,8 +5676,14 @@ energy_diff(struct energy_env *eenv) __energy_diff(eenv); /* Return energy diff when boost margin is 0 */ - if (boost == 0) + if (boost == 0) { + trace_sched_energy_diff(eenv->task, + eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, + eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, + eenv->cap.before, eenv->cap.after, eenv->cap.delta, + 0, -eenv->nrg.diff); return eenv->nrg.diff; + } /* Compute normalized energy diff */ nrg_delta = normalize_energy(eenv->nrg.diff); @@ -5742,15 +5726,18 @@ energy_diff(struct energy_env *eenv) * being client/server, worker/dispatcher, interrupt source or whatever is * irrelevant, spread criteria is apparent partner count exceeds socket size. 
*/ -static int wake_wide(struct task_struct *p) +static int wake_wide(struct task_struct *p, int sibling_count_hint) { unsigned int master = current->wakee_flips; unsigned int slave = p->wakee_flips; - int factor = this_cpu_read(sd_llc_size); + int llc_size = this_cpu_read(sd_llc_size); + + if (sibling_count_hint >= llc_size) + return 1; if (master < slave) swap(master, slave); - if (slave < factor || master < slave * factor) + if (slave < llc_size || master < slave * llc_size) return 0; return 1; } @@ -5962,8 +5949,6 @@ boosted_task_util(struct task_struct *task) return util + margin; } -static int cpu_util_wake(int cpu, struct task_struct *p); - static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) { return capacity_orig_of(cpu) - cpu_util_wake(cpu, p); @@ -5972,6 +5957,8 @@ static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) /* * find_idlest_group finds and returns the least busy CPU group within the * domain. + * + * Assumes p is allowed on at least one CPU in sd. */ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, @@ -5979,7 +5966,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, { struct sched_group *idlest = NULL, *group = sd->groups; struct sched_group *most_spare_sg = NULL; - unsigned long min_load = ULONG_MAX, this_load = 0; + unsigned long min_load = ULONG_MAX, this_load = ULONG_MAX; unsigned long most_spare = 0, this_spare = 0; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; @@ -6047,23 +6034,31 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, * utilized systems if we require spare_capacity > task_util(p), * so we allow for some task stuffing by using * spare_capacity > task_util(p)/2. + * + * Spare capacity can't be used for fork because the utilization has + * not been set yet, we must first select a rq to compute the initial + * utilization. */ + if (sd_flag & SD_BALANCE_FORK) + goto skip_spare; + if (this_spare > task_util(p) / 2 && imbalance*this_spare > 100*most_spare) return NULL; else if (most_spare > task_util(p) / 2) return most_spare_sg; +skip_spare: if (!idlest || 100*this_load < imbalance*min_load) return NULL; return idlest; } /* - * find_idlest_cpu - find the idlest cpu among the cpus in group. + * find_idlest_group_cpu - find the idlest cpu among the cpus in group. */ static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; unsigned int min_exit_latency = UINT_MAX; @@ -6110,6 +6105,68 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) } return shallowest_idle_cpu != -1 ? 
shallowest_idle_cpu : least_loaded_cpu; + } + +static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, + int cpu, int prev_cpu, int sd_flag) +{ + int new_cpu = cpu; + int wu = sd_flag & SD_BALANCE_WAKE; + int cas_cpu = -1; + + if (wu) { + schedstat_inc(p, se.statistics.nr_wakeups_cas_attempts); + schedstat_inc(this_rq(), eas_stats.cas_attempts); + } + + if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed)) + return prev_cpu; + + while (sd) { + struct sched_group *group; + struct sched_domain *tmp; + int weight; + + if (wu) + schedstat_inc(sd, eas_stats.cas_attempts); + + if (!(sd->flags & sd_flag)) { + sd = sd->child; + continue; + } + + group = find_idlest_group(sd, p, cpu, sd_flag); + if (!group) { + sd = sd->child; + continue; + } + + new_cpu = find_idlest_group_cpu(group, p, cpu); + if (new_cpu == cpu) { + /* Now try balancing at a lower domain level of cpu */ + sd = sd->child; + continue; + } + + /* Now try balancing at a lower domain level of new_cpu */ + cpu = cas_cpu = new_cpu; + weight = sd->span_weight; + sd = NULL; + for_each_domain(cpu, tmp) { + if (weight <= tmp->span_weight) + break; + if (tmp->flags & sd_flag) + sd = tmp; + } + /* while loop will break here if sd == NULL */ + } + + if (wu && (cas_cpu >= 0)) { + schedstat_inc(p, se.statistics.nr_wakeups_cas_count); + schedstat_inc(this_rq(), eas_stats.cas_count); + } + + return new_cpu; } /* @@ -6208,7 +6265,9 @@ done: /* * cpu_util_wake: Compute cpu utilization with any contributions from - * the waking task p removed. + * the waking task p removed. check_for_migration() looks for a better CPU of + * rq->curr. For that case we should return cpu util with contributions from + * currently running task p removed. */ static int cpu_util_wake(int cpu, struct task_struct *p) { @@ -6221,7 +6280,8 @@ static int cpu_util_wake(int cpu, struct task_struct *p) * utilization from cpu utilization. Instead just use * cpu_util for this case. */ - if (!walt_disabled && sysctl_sched_use_walt_cpu_util) + if (!walt_disabled && sysctl_sched_use_walt_cpu_util && + p->state == TASK_WAKING) return cpu_util(cpu); #endif /* Task has no contribution or is new */ @@ -6238,9 +6298,6 @@ static int start_cpu(bool boosted) { struct root_domain *rd = cpu_rq(smp_processor_id())->rd; - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), - "sched RCU must be held"); - return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu; } @@ -6252,7 +6309,6 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, unsigned long target_capacity = ULONG_MAX; unsigned long min_wake_util = ULONG_MAX; unsigned long target_max_spare_cap = 0; - unsigned long target_util = ULONG_MAX; unsigned long best_active_util = ULONG_MAX; int best_idle_cstate = INT_MAX; struct sched_domain *sd; @@ -6391,6 +6447,19 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, } /* + * Enforce EAS mode + * + * For non latency sensitive tasks, skip CPUs that + * will be overutilized by moving the task there. + * + * The goal here is to remain in EAS mode as long as + * possible at least for !prefer_idle tasks. + */ + if ((new_util * capacity_margin) > + (capacity_orig * SCHED_CAPACITY_SCALE)) + continue; + + /* * Case B) Non latency sensitive tasks on IDLE CPUs. 
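A note on the overutilization guard added just above: (new_util * capacity_margin) > (capacity_orig * SCHED_CAPACITY_SCALE) rejects a CPU whose utilization after the placement would exceed capacity_orig * 1024 / capacity_margin. Assuming the commonly used capacity_margin of 1280, that is roughly 80% of the CPU's original capacity. A small worked example of the arithmetic (plain C, illustrative only):

#include <assert.h>

#define SCHED_CAPACITY_SCALE	1024UL

/* Mirrors the check above; capacity_margin = 1280 (~80%) is an assumption. */
static int would_overutilize(unsigned long new_util, unsigned long capacity_orig,
			     unsigned long capacity_margin)
{
	return new_util * capacity_margin > capacity_orig * SCHED_CAPACITY_SCALE;
}

int main(void)
{
	/* On a 1024-capacity CPU the break-even point is 819 (~80%). */
	assert(!would_overutilize(819, 1024, 1280));
	assert(would_overutilize(820, 1024, 1280));
	return 0;
}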
* * Find an optimal backup IDLE CPU for non latency @@ -6468,7 +6537,6 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, target_max_spare_cap = capacity_orig - new_util; target_capacity = capacity_orig; - target_util = new_util; target_cpu = i; } @@ -6589,6 +6657,7 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync .src_cpu = prev_cpu, .dst_cpu = target_cpu, .task = p, + .trg_cpu = target_cpu, }; @@ -6607,7 +6676,10 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync /* No energy saving for target_cpu, try backup */ target_cpu = tmp_backup; eenv.dst_cpu = target_cpu; - if (tmp_backup < 0 || energy_diff(&eenv) >= 0) { + eenv.trg_cpu = target_cpu; + if (tmp_backup < 0 || + tmp_backup == prev_cpu || + energy_diff(&eenv) >= 0) { schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav); schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav); target_cpu = prev_cpu; @@ -6642,7 +6714,8 @@ unlock: * preempt must be disabled. */ static int -select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) +select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags, + int sibling_count_hint) { struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); @@ -6650,9 +6723,12 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int want_affine = 0; int sync = wake_flags & WF_SYNC; - if (sd_flag & SD_BALANCE_WAKE) - want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) - && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + if (sd_flag & SD_BALANCE_WAKE) { + record_wakee(p); + want_affine = !wake_wide(p, sibling_count_hint) && + !wake_cap(p, cpu, prev_cpu) && + cpumask_test_cpu(cpu, &p->cpus_allowed); + } if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized)) return select_energy_cpu_brute(p, prev_cpu, sync); @@ -6684,61 +6760,21 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f new_cpu = cpu; } + if (sd && !(sd_flag & SD_BALANCE_FORK)) { + /* + * We're going to need the task's util for capacity_spare_wake + * in find_idlest_group. Sync it up to prev_cpu's + * last_update_time. + */ + sync_entity_load_avg(&p->se); + } + if (!sd) { if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? 
*/ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); } else { - int wu = sd_flag & SD_BALANCE_WAKE; - int cas_cpu = -1; - - if (wu) { - schedstat_inc(p, se.statistics.nr_wakeups_cas_attempts); - schedstat_inc(this_rq(), eas_stats.cas_attempts); - } - - while (sd) { - struct sched_group *group; - int weight; - - if (wu) - schedstat_inc(sd, eas_stats.cas_attempts); - - if (!(sd->flags & sd_flag)) { - sd = sd->child; - continue; - } - - group = find_idlest_group(sd, p, cpu, sd_flag); - if (!group) { - sd = sd->child; - continue; - } - - new_cpu = find_idlest_cpu(group, p, cpu); - if (new_cpu == -1 || new_cpu == cpu) { - /* Now try balancing at a lower domain level of cpu */ - sd = sd->child; - continue; - } - - /* Now try balancing at a lower domain level of new_cpu */ - cpu = cas_cpu = new_cpu; - weight = sd->span_weight; - sd = NULL; - for_each_domain(cpu, tmp) { - if (weight <= tmp->span_weight) - break; - if (tmp->flags & sd_flag) - sd = tmp; - } - /* while loop will break here if sd == NULL */ - } - - if (wu && (cas_cpu >= 0)) { - schedstat_inc(p, se.statistics.nr_wakeups_cas_count); - schedstat_inc(this_rq(), eas_stats.cas_count); - } + new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); } rcu_read_unlock(); @@ -7627,10 +7663,6 @@ static void attach_one_task(struct rq *rq, struct task_struct *p) { raw_spin_lock(&rq->lock); attach_task(rq, p); - /* - * We want to potentially raise target_cpu's OPP. - */ - update_capacity_of(cpu_of(rq)); raw_spin_unlock(&rq->lock); } @@ -7652,11 +7684,6 @@ static void attach_tasks(struct lb_env *env) attach_task(env->dst_rq, p); } - /* - * We want to potentially raise env.dst_cpu's OPP. - */ - update_capacity_of(env->dst_cpu); - raw_spin_unlock(&env->dst_rq->lock); } @@ -8700,8 +8727,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (busiest->group_type == group_imbalanced) goto force_balance; - /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ - if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) && + /* + * When dst_cpu is idle, prevent SMP nice and/or asymmetric group + * capacities from resulting in underutilization due to avg_load. + */ + if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) && busiest->group_no_capacity) goto force_balance; @@ -8869,6 +8899,7 @@ static int need_active_balance(struct lb_env *env) } if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) && + ((capacity_orig_of(env->src_cpu) < capacity_orig_of(env->dst_cpu))) && env->src_rq->cfs.h_nr_running == 1 && cpu_overutilized(env->src_cpu) && !cpu_overutilized(env->dst_cpu)) { @@ -8990,17 +9021,13 @@ redo: more_balance: raw_spin_lock_irqsave(&busiest->lock, flags); + update_rq_clock(busiest); /* * cur_ld_moved - load moved in current iteration * ld_moved - cumulative load moved across iterations */ cur_ld_moved = detach_tasks(&env); - /* - * We want to potentially lower env.src_cpu's OPP. - */ - if (cur_ld_moved) - update_capacity_of(env.src_cpu); /* * We've detached some tasks from busiest_rq. Every @@ -9225,7 +9252,6 @@ static int idle_balance(struct rq *this_rq) struct sched_domain *sd; int pulled_task = 0; u64 curr_cost = 0; - long removed_util=0; idle_enter_fair(this_rq); @@ -9249,17 +9275,6 @@ static int idle_balance(struct rq *this_rq) raw_spin_unlock(&this_rq->lock); - /* - * If removed_util_avg is !0 we most probably migrated some task away - * from this_cpu. 
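A note on the need_active_balance() change above: the extra capacity_orig_of() comparison makes the up-migration path fire only when the destination is a genuinely bigger CPU type, not merely a CPU whose momentary capacity happens to be higher (for example because the source is loaded with RT or IRQ work). A condensed predicate sketch of that trigger, with the inputs passed in as plain values:

/* cap_* are momentary capacities (after RT/IRQ pressure),
 * cap_orig_* the CPUs' full original capacities. Illustrative only. */
static int should_kick_upmigration(unsigned long cap_src, unsigned long cap_dst,
				   unsigned long cap_orig_src, unsigned long cap_orig_dst,
				   unsigned int src_nr_cfs_running,
				   int src_overutilized, int dst_overutilized)
{
	return cap_src < cap_dst &&
	       cap_orig_src < cap_orig_dst &&	/* dst must be a bigger CPU type */
	       src_nr_cfs_running == 1 &&	/* exactly one task to push */
	       src_overutilized && !dst_overutilized;
}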
In this case we might be willing to trigger an OPP - * update, but we want to do so if we don't find anybody else to pull - * here (we will trigger an OPP update with the pulled task's enqueue - * anyway). - * - * Record removed_util before calling update_blocked_averages, and use - * it below (before returning) to see if an OPP update is required. - */ - removed_util = atomic_long_read(&(this_rq->cfs).removed_util_avg); update_blocked_averages(this_cpu); rcu_read_lock(); for_each_domain(this_cpu, sd) { @@ -9324,12 +9339,6 @@ out: if (pulled_task) { idle_exit_fair(this_rq); this_rq->idle_stamp = 0; - } else if (removed_util) { - /* - * No task pulled and someone has been migrated away. - * Good case to trigger an OPP update. - */ - update_capacity_of(this_cpu); } return pulled_task; @@ -9347,8 +9356,18 @@ static int active_load_balance_cpu_stop(void *data) int busiest_cpu = cpu_of(busiest_rq); int target_cpu = busiest_rq->push_cpu; struct rq *target_rq = cpu_rq(target_cpu); - struct sched_domain *sd; + struct sched_domain *sd = NULL; struct task_struct *p = NULL; + struct task_struct *push_task; + int push_task_detached = 0; + struct lb_env env = { + .sd = sd, + .dst_cpu = target_cpu, + .dst_rq = target_rq, + .src_cpu = busiest_rq->cpu, + .src_rq = busiest_rq, + .idle = CPU_IDLE, + }; raw_spin_lock_irq(&busiest_rq->lock); @@ -9368,6 +9387,17 @@ static int active_load_balance_cpu_stop(void *data) */ BUG_ON(busiest_rq == target_rq); + push_task = busiest_rq->push_task; + if (push_task) { + if (task_on_rq_queued(push_task) && + task_cpu(push_task) == busiest_cpu && + cpu_online(target_cpu)) { + detach_task(push_task, &env); + push_task_detached = 1; + } + goto out_unlock; + } + /* Search for an sd spanning us and the target CPU. */ rcu_read_lock(); for_each_domain(target_cpu, sd) { @@ -9377,33 +9407,31 @@ static int active_load_balance_cpu_stop(void *data) } if (likely(sd)) { - struct lb_env env = { - .sd = sd, - .dst_cpu = target_cpu, - .dst_rq = target_rq, - .src_cpu = busiest_rq->cpu, - .src_rq = busiest_rq, - .idle = CPU_IDLE, - }; - + env.sd = sd; schedstat_inc(sd, alb_count); + update_rq_clock(busiest_rq); p = detach_one_task(&env); - if (p) { + if (p) schedstat_inc(sd, alb_pushed); - /* - * We want to potentially lower env.src_cpu's OPP. 
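The active-balance stopper above now recognises an explicit rq->push_task (set by kick_active_balance() later in this patch) and short-circuits the normal load-balance scan. A condensed reading of the ordering, based only on the hunks in this file (names abbreviated, not additional code):

/*
 *   kick_active_balance(rq, p, new_cpu):
 *       lock(rq); rq->active_balance = 1; rq->push_cpu = new_cpu;
 *       get_task_struct(p); rq->push_task = p; unlock(rq);
 *       stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, ...);
 *
 *   active_load_balance_cpu_stop(busiest_rq):
 *       lock(busiest_rq);
 *       if (push_task is still queued on busiest_cpu and target is online)
 *               detach_task(push_task);          // under the busiest lock
 *       busiest_rq->active_balance = 0;
 *       busiest_rq->push_task = NULL;
 *       unlock(busiest_rq);
 *       if (detached)
 *               attach_one_task(target_rq, push_task);  // takes target lock
 *       put_task_struct(push_task);              // drop the reference taken above
 */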
- */ - update_capacity_of(env.src_cpu); - } else schedstat_inc(sd, alb_failed); } rcu_read_unlock(); out_unlock: busiest_rq->active_balance = 0; + + if (push_task) + busiest_rq->push_task = NULL; + raw_spin_unlock(&busiest_rq->lock); + if (push_task) { + if (push_task_detached) + attach_one_task(target_rq, push_task); + put_task_struct(push_task); + } + if (p) attach_one_task(target_rq, p); @@ -9868,6 +9896,47 @@ static void rq_offline_fair(struct rq *rq) unthrottle_offline_cfs_rqs(rq); } +static inline int +kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu) +{ + int rc = 0; + + /* Invoke active balance to force migrate currently running task */ + raw_spin_lock(&rq->lock); + if (!rq->active_balance) { + rq->active_balance = 1; + rq->push_cpu = new_cpu; + get_task_struct(p); + rq->push_task = p; + rc = 1; + } + raw_spin_unlock(&rq->lock); + + return rc; +} + +void check_for_migration(struct rq *rq, struct task_struct *p) +{ + int new_cpu; + int active_balance; + int cpu = task_cpu(p); + + if (rq->misfit_task) { + if (rq->curr->state != TASK_RUNNING || + rq->curr->nr_cpus_allowed == 1) + return; + + new_cpu = select_energy_cpu_brute(p, cpu, 0); + if (capacity_orig_of(new_cpu) > capacity_orig_of(cpu)) { + active_balance = kick_active_balance(rq, p, new_cpu); + if (active_balance) + stop_one_cpu_nowait(cpu, + active_load_balance_cpu_stop, + rq, &rq->active_balance_work); + } + } +} + #endif /* CONFIG_SMP */ /* @@ -9906,31 +9975,17 @@ static void task_fork_fair(struct task_struct *p) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se, *curr; - int this_cpu = smp_processor_id(); struct rq *rq = this_rq(); - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock(&rq->lock); update_rq_clock(rq); cfs_rq = task_cfs_rq(current); curr = cfs_rq->curr; - - /* - * Not only the cpu but also the task_group of the parent might have - * been changed after parent->se.parent,cfs_rq were copied to - * child->se.parent,cfs_rq. So call __set_task_cpu() to make those - * of child point to valid ones. - */ - rcu_read_lock(); - __set_task_cpu(p, this_cpu); - rcu_read_unlock(); - - update_curr(cfs_rq); - - if (curr) + if (curr) { + update_curr(cfs_rq); se->vruntime = curr->vruntime; + } place_entity(cfs_rq, se, 1); if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { @@ -9943,8 +9998,7 @@ static void task_fork_fair(struct task_struct *p) } se->vruntime -= cfs_rq->min_vruntime; - - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock(&rq->lock); } /* @@ -10136,6 +10190,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) } #ifdef CONFIG_FAIR_GROUP_SCHED +static void task_set_group_fair(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + + set_task_rq(p, task_cpu(p)); + se->depth = se->parent ? 
se->parent->depth + 1 : 0; +} + static void task_move_group_fair(struct task_struct *p) { detach_task_cfs_rq(p); @@ -10148,6 +10210,19 @@ static void task_move_group_fair(struct task_struct *p) attach_task_cfs_rq(p); } +static void task_change_group_fair(struct task_struct *p, int type) +{ + switch (type) { + case TASK_SET_GROUP: + task_set_group_fair(p); + break; + + case TASK_MOVE_GROUP: + task_move_group_fair(p); + break; + } +} + void free_fair_sched_group(struct task_group *tg) { int i; @@ -10374,7 +10449,7 @@ const struct sched_class fair_sched_class = { .update_curr = update_curr_fair, #ifdef CONFIG_FAIR_GROUP_SCHED - .task_move_group = task_move_group_fair, + .task_change_group = task_change_group_fair, #endif }; diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index c4ae0f1fdf9b..33d7003fa1b8 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -9,7 +9,8 @@ #ifdef CONFIG_SMP static int -select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags, + int sibling_count_hint) { return task_cpu(p); /* IDLE tasks as never migrated */ } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 6bb51d62dca4..0af072f64e52 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -9,6 +9,7 @@ #include <linux/irq_work.h> #include "walt.h" +#include "tune.h" int sched_rr_timeslice = RR_TIMESLICE; @@ -66,10 +67,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) raw_spin_unlock(&rt_b->rt_runtime_lock); } -#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI) -static void push_irq_work_func(struct irq_work *work); -#endif - void init_rt_rq(struct rt_rq *rt_rq) { struct rt_prio_array *array; @@ -89,13 +86,6 @@ void init_rt_rq(struct rt_rq *rt_rq) rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; plist_head_init(&rt_rq->pushable_tasks); - -#ifdef HAVE_RT_PUSH_IPI - rt_rq->push_flags = 0; - rt_rq->push_cpu = nr_cpu_ids; - raw_spin_lock_init(&rt_rq->push_lock); - init_irq_work(&rt_rq->push_work, push_irq_work_func); -#endif #endif /* CONFIG_SMP */ /* We start is dequeued state, because no RT tasks are queued */ rt_rq->rt_queued = 0; @@ -1321,6 +1311,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); + + schedtune_enqueue_task(p, cpu_of(rq)); } static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) @@ -1332,6 +1324,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) walt_dec_cumulative_runnable_avg(rq, p); dequeue_pushable_task(rq, p); + schedtune_dequeue_task(p, cpu_of(rq)); } /* @@ -1372,7 +1365,8 @@ static void yield_task_rt(struct rq *rq) static int find_lowest_rq(struct task_struct *task); static int -select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags, + int sibling_count_hint) { struct task_struct *curr; struct rq *rq; @@ -1484,41 +1478,6 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag #endif } -#ifdef CONFIG_SMP -static void sched_rt_update_capacity_req(struct rq *rq) -{ - u64 total, used, age_stamp, avg; - s64 delta; - - if (!sched_freq()) - return; - - sched_avg_update(rq); - /* - * Since we're reading these variables without serialization make sure - * we read them once before doing sanity checks on them. 
- */ - age_stamp = READ_ONCE(rq->age_stamp); - avg = READ_ONCE(rq->rt_avg); - delta = rq_clock(rq) - age_stamp; - - if (unlikely(delta < 0)) - delta = 0; - - total = sched_avg_period() + delta; - - used = div_u64(avg, total); - if (unlikely(used > SCHED_CAPACITY_SCALE)) - used = SCHED_CAPACITY_SCALE; - - set_rt_cpu_capacity(rq->cpu, 1, (unsigned long)(used)); -} -#else -static inline void sched_rt_update_capacity_req(struct rq *rq) -{ } - -#endif - static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, struct rt_rq *rt_rq) { @@ -1587,17 +1546,8 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) if (prev->sched_class == &rt_sched_class) update_curr_rt(rq); - if (!rt_rq->rt_queued) { - /* - * The next task to be picked on this rq will have a lower - * priority than rt tasks so we can spend some time to update - * the capacity used by rt tasks based on the last activity. - * This value will be the used as an estimation of the next - * activity. - */ - sched_rt_update_capacity_req(rq); + if (!rt_rq->rt_queued) return NULL; - } put_prev_task(rq, prev); @@ -1882,7 +1832,9 @@ retry: } deactivate_task(rq, next_task, 0); + next_task->on_rq = TASK_ON_RQ_MIGRATING; set_task_cpu(next_task, lowest_rq->cpu); + next_task->on_rq = TASK_ON_RQ_QUEUED; activate_task(lowest_rq, next_task, 0); ret = 1; @@ -1904,160 +1856,166 @@ static void push_rt_tasks(struct rq *rq) } #ifdef HAVE_RT_PUSH_IPI + /* - * The search for the next cpu always starts at rq->cpu and ends - * when we reach rq->cpu again. It will never return rq->cpu. - * This returns the next cpu to check, or nr_cpu_ids if the loop - * is complete. + * When a high priority task schedules out from a CPU and a lower priority + * task is scheduled in, a check is made to see if there's any RT tasks + * on other CPUs that are waiting to run because a higher priority RT task + * is currently running on its CPU. In this case, the CPU with multiple RT + * tasks queued on it (overloaded) needs to be notified that a CPU has opened + * up that may be able to run one of its non-running queued RT tasks. + * + * All CPUs with overloaded RT tasks need to be notified as there is currently + * no way to know which of these CPUs have the highest priority task waiting + * to run. Instead of trying to take a spinlock on each of these CPUs, + * which has shown to cause large latency when done on machines with many + * CPUs, sending an IPI to the CPUs to have them push off the overloaded + * RT tasks waiting to run. + * + * Just sending an IPI to each of the CPUs is also an issue, as on large + * count CPU machines, this can cause an IPI storm on a CPU, especially + * if its the only CPU with multiple RT tasks queued, and a large number + * of CPUs scheduling a lower priority task at the same time. + * + * Each root domain has its own irq work function that can iterate over + * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT + * tassk must be checked if there's one or many CPUs that are lowering + * their priority, there's a single irq work iterator that will try to + * push off RT tasks that are waiting to run. + * + * When a CPU schedules a lower priority task, it will kick off the + * irq work iterator that will jump to each CPU with overloaded RT tasks. + * As it only takes the first CPU that schedules a lower priority task + * to start the process, the rto_start variable is incremented and if + * the atomic result is one, then that CPU will try to take the rto_lock. 
+ * This prevents high contention on the lock as the process handles all + * CPUs scheduling lower priority tasks. + * + * All CPUs that are scheduling a lower priority task will increment the + * rt_loop_next variable. This will make sure that the irq work iterator + * checks all RT overloaded CPUs whenever a CPU schedules a new lower + * priority task, even if the iterator is in the middle of a scan. Incrementing + * the rt_loop_next will cause the iterator to perform another scan. * - * rq->rt.push_cpu holds the last cpu returned by this function, - * or if this is the first instance, it must hold rq->cpu. */ static int rto_next_cpu(struct rq *rq) { - int prev_cpu = rq->rt.push_cpu; + struct root_domain *rd = rq->rd; + int next; int cpu; - cpu = cpumask_next(prev_cpu, rq->rd->rto_mask); - /* - * If the previous cpu is less than the rq's CPU, then it already - * passed the end of the mask, and has started from the beginning. - * We end if the next CPU is greater or equal to rq's CPU. + * When starting the IPI RT pushing, the rto_cpu is set to -1, + * rt_next_cpu() will simply return the first CPU found in + * the rto_mask. + * + * If rto_next_cpu() is called with rto_cpu is a valid cpu, it + * will return the next CPU found in the rto_mask. + * + * If there are no more CPUs left in the rto_mask, then a check is made + * against rto_loop and rto_loop_next. rto_loop is only updated with + * the rto_lock held, but any CPU may increment the rto_loop_next + * without any locking. */ - if (prev_cpu < rq->cpu) { - if (cpu >= rq->cpu) - return nr_cpu_ids; + for (;;) { - } else if (cpu >= nr_cpu_ids) { - /* - * We passed the end of the mask, start at the beginning. - * If the result is greater or equal to the rq's CPU, then - * the loop is finished. - */ - cpu = cpumask_first(rq->rd->rto_mask); - if (cpu >= rq->cpu) - return nr_cpu_ids; - } - rq->rt.push_cpu = cpu; + /* When rto_cpu is -1 this acts like cpumask_first() */ + cpu = cpumask_next(rd->rto_cpu, rd->rto_mask); - /* Return cpu to let the caller know if the loop is finished or not */ - return cpu; -} + rd->rto_cpu = cpu; -static int find_next_push_cpu(struct rq *rq) -{ - struct rq *next_rq; - int cpu; + if (cpu < nr_cpu_ids) + return cpu; - while (1) { - cpu = rto_next_cpu(rq); - if (cpu >= nr_cpu_ids) - break; - next_rq = cpu_rq(cpu); + rd->rto_cpu = -1; + + /* + * ACQUIRE ensures we see the @rto_mask changes + * made prior to the @next value observed. + * + * Matches WMB in rt_set_overload(). + */ + next = atomic_read_acquire(&rd->rto_loop_next); - /* Make sure the next rq can push to this rq */ - if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr) + if (rd->rto_loop == next) break; + + rd->rto_loop = next; } - return cpu; + return -1; } -#define RT_PUSH_IPI_EXECUTING 1 -#define RT_PUSH_IPI_RESTART 2 +static inline bool rto_start_trylock(atomic_t *v) +{ + return !atomic_cmpxchg_acquire(v, 0, 1); +} -static void tell_cpu_to_push(struct rq *rq) +static inline void rto_start_unlock(atomic_t *v) { - int cpu; + atomic_set_release(v, 0); +} - if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { - raw_spin_lock(&rq->rt.push_lock); - /* Make sure it's still executing */ - if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { - /* - * Tell the IPI to restart the loop as things have - * changed since it started. 
- */ - rq->rt.push_flags |= RT_PUSH_IPI_RESTART; - raw_spin_unlock(&rq->rt.push_lock); - return; - } - raw_spin_unlock(&rq->rt.push_lock); - } +static void tell_cpu_to_push(struct rq *rq) +{ + int cpu = -1; - /* When here, there's no IPI going around */ + /* Keep the loop going if the IPI is currently active */ + atomic_inc(&rq->rd->rto_loop_next); - rq->rt.push_cpu = rq->cpu; - cpu = find_next_push_cpu(rq); - if (cpu >= nr_cpu_ids) + /* Only one CPU can initiate a loop at a time */ + if (!rto_start_trylock(&rq->rd->rto_loop_start)) return; - rq->rt.push_flags = RT_PUSH_IPI_EXECUTING; + raw_spin_lock(&rq->rd->rto_lock); - irq_work_queue_on(&rq->rt.push_work, cpu); + /* + * The rto_cpu is updated under the lock, if it has a valid cpu + * then the IPI is still running and will continue due to the + * update to loop_next, and nothing needs to be done here. + * Otherwise it is finishing up and an ipi needs to be sent. + */ + if (rq->rd->rto_cpu < 0) + cpu = rto_next_cpu(rq); + + raw_spin_unlock(&rq->rd->rto_lock); + + rto_start_unlock(&rq->rd->rto_loop_start); + + if (cpu >= 0) + irq_work_queue_on(&rq->rd->rto_push_work, cpu); } /* Called from hardirq context */ -static void try_to_push_tasks(void *arg) +void rto_push_irq_work_func(struct irq_work *work) { - struct rt_rq *rt_rq = arg; - struct rq *rq, *src_rq; - int this_cpu; + struct rq *rq; int cpu; - this_cpu = rt_rq->push_cpu; - - /* Paranoid check */ - BUG_ON(this_cpu != smp_processor_id()); - - rq = cpu_rq(this_cpu); - src_rq = rq_of_rt_rq(rt_rq); + rq = this_rq(); -again: + /* + * We do not need to grab the lock to check for has_pushable_tasks. + * When it gets updated, a check is made if a push is possible. + */ if (has_pushable_tasks(rq)) { raw_spin_lock(&rq->lock); - push_rt_task(rq); + push_rt_tasks(rq); raw_spin_unlock(&rq->lock); } - /* Pass the IPI to the next rt overloaded queue */ - raw_spin_lock(&rt_rq->push_lock); - /* - * If the source queue changed since the IPI went out, - * we need to restart the search from that CPU again. - */ - if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) { - rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART; - rt_rq->push_cpu = src_rq->cpu; - } + raw_spin_lock(&rq->rd->rto_lock); - cpu = find_next_push_cpu(src_rq); + /* Pass the IPI to the next rt overloaded queue */ + cpu = rto_next_cpu(rq); - if (cpu >= nr_cpu_ids) - rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING; - raw_spin_unlock(&rt_rq->push_lock); + raw_spin_unlock(&rq->rd->rto_lock); - if (cpu >= nr_cpu_ids) + if (cpu < 0) return; - /* - * It is possible that a restart caused this CPU to be - * chosen again. Don't bother with an IPI, just see if we - * have more to push. 
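The per-root-domain iterator above replaces the old per-rq push_flags state machine with three pieces of shared state: rto_loop_start (at most one CPU kicks off the IPI chain), rto_loop_next (bumped by every CPU that lowers its priority so an in-flight scan reruns), and rto_cpu/rto_loop (the cursor, only touched under rto_lock). A standalone model of that coordination pattern using C11 atomics (not the kernel code; memory-ordering refinements omitted):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int loop_start;	/* like rto_loop_start: one starter at a time */
static atomic_int loop_next;	/* like rto_loop_next: bumped by producers    */
static int loop;		/* like rto_loop: consumed by the iterator     */

static bool start_trylock(void)
{
	int zero = 0;
	return atomic_compare_exchange_strong(&loop_start, &zero, 1);
}

static void start_unlock(void)
{
	atomic_store(&loop_start, 0);
}

/* Producer side: a CPU that just scheduled a lower-priority task. */
static bool producer_kick(void)
{
	atomic_fetch_add(&loop_next, 1);	/* keep an in-flight scan going */
	if (!start_trylock())
		return false;			/* someone else owns the chain */
	/* ...pick the first overloaded CPU and queue work there (elided)... */
	start_unlock();
	return true;
}

/* Iterator side: rescan while anyone bumped loop_next since we started. */
static bool another_pass_needed(void)
{
	int next = atomic_load(&loop_next);

	if (loop == next)
		return false;
	loop = next;
	return true;
}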
- */ - if (unlikely(cpu == rq->cpu)) - goto again; - /* Try the next RT overloaded CPU */ - irq_work_queue_on(&rt_rq->push_work, cpu); -} - -static void push_irq_work_func(struct irq_work *work) -{ - struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work); - - try_to_push_tasks(rt_rq); + irq_work_queue_on(&rq->rd->rto_push_work, cpu); } #endif /* HAVE_RT_PUSH_IPI */ @@ -2136,7 +2094,9 @@ static void pull_rt_task(struct rq *this_rq) resched = true; deactivate_task(src_rq, p, 0); + p->on_rq = TASK_ON_RQ_MIGRATING; set_task_cpu(p, this_cpu); + p->on_rq = TASK_ON_RQ_QUEUED; activate_task(this_rq, p, 0); /* * We continue with the search, just in @@ -2313,9 +2273,6 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) update_curr_rt(rq); - if (rq->rt.rt_nr_running) - sched_rt_update_capacity_req(rq); - watchdog(rq, p); /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d4613c5be81d..3db67eb07bdd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -31,8 +31,10 @@ extern long calc_load_fold_active(struct rq *this_rq); #ifdef CONFIG_SMP extern void update_cpu_load_active(struct rq *this_rq); +extern void check_for_migration(struct rq *rq, struct task_struct *p); #else static inline void update_cpu_load_active(struct rq *this_rq) { } +static inline void check_for_migration(struct rq *rq, struct task_struct *p) { } #endif /* @@ -335,7 +337,15 @@ extern void sched_move_task(struct task_struct *tsk); #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); -#endif + +#ifdef CONFIG_SMP +extern void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next); +#else /* !CONFIG_SMP */ +static inline void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next) { } +#endif /* CONFIG_SMP */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ #else /* CONFIG_CGROUP_SCHED */ @@ -434,7 +444,7 @@ static inline int rt_bandwidth_enabled(void) } /* RT IPI pull logic requires IRQ_WORK */ -#ifdef CONFIG_IRQ_WORK +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP) # define HAVE_RT_PUSH_IPI #endif @@ -455,12 +465,6 @@ struct rt_rq { unsigned long rt_nr_total; int overloaded; struct plist_head pushable_tasks; -#ifdef HAVE_RT_PUSH_IPI - int push_flags; - int push_cpu; - struct irq_work push_work; - raw_spinlock_t push_lock; -#endif #endif /* CONFIG_SMP */ int rt_queued; @@ -553,6 +557,19 @@ struct root_domain { struct dl_bw dl_bw; struct cpudl cpudl; +#ifdef HAVE_RT_PUSH_IPI + /* + * For IPI pull requests, loop across the rto_mask. + */ + struct irq_work rto_push_work; + raw_spinlock_t rto_lock; + /* These are only updated and read within rto_lock */ + int rto_loop; + int rto_cpu; + /* These atomics are updated outside of a lock */ + atomic_t rto_loop_next; + atomic_t rto_loop_start; +#endif /* * The "RT overload" flag: it gets set if a CPU has more than * one runnable RT task. 
@@ -569,6 +586,9 @@ struct root_domain { extern struct root_domain def_root_domain; +#ifdef HAVE_RT_PUSH_IPI +extern void rto_push_irq_work_func(struct irq_work *work); +#endif #endif /* CONFIG_SMP */ /* @@ -656,6 +676,7 @@ struct rq { /* For active balancing */ int active_balance; int push_cpu; + struct task_struct *push_task; struct cpu_stop_work active_balance_work; /* cpu of this runqueue: */ int cpu; @@ -673,18 +694,7 @@ struct rq { #endif #ifdef CONFIG_SCHED_WALT - /* - * max_freq = user or thermal defined maximum - * max_possible_freq = maximum supported by hardware - */ - unsigned int cur_freq, max_freq, min_freq, max_possible_freq; - struct cpumask freq_domain_cpumask; - u64 cumulative_runnable_avg; - int efficiency; /* Differentiate cpus with different IPC capability */ - int load_scale_factor; - int capacity; - int max_possible_capacity; u64 window_start; u64 curr_runnable_sum; u64 prev_runnable_sum; @@ -693,6 +703,7 @@ struct rq { u64 cur_irqload; u64 avg_irqload; u64 irqload_ts; + u64 cum_window_demand; #endif /* CONFIG_SCHED_WALT */ @@ -998,6 +1009,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #endif #ifdef CONFIG_FAIR_GROUP_SCHED + set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]); p->se.cfs_rq = tg->cfs_rq[cpu]; p->se.parent = tg->se[cpu]; #endif @@ -1262,7 +1274,8 @@ struct sched_class { void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP - int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); + int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags, + int subling_count_hint); void (*migrate_task_rq)(struct task_struct *p); void (*task_waking) (struct task_struct *task); @@ -1295,8 +1308,11 @@ struct sched_class { void (*update_curr) (struct rq *rq); +#define TASK_SET_GROUP 0 +#define TASK_MOVE_GROUP 1 + #ifdef CONFIG_FAIR_GROUP_SCHED - void (*task_move_group) (struct task_struct *p); + void (*task_change_group)(struct task_struct *p, int type); #endif }; @@ -1557,7 +1573,7 @@ static inline unsigned long capacity_orig_of(int cpu) extern unsigned int sysctl_sched_use_walt_cpu_util; extern unsigned int walt_ravg_window; -extern unsigned int walt_disabled; +extern bool walt_disabled; /* * cpu_util returns the amount of capacity of a CPU that is used by CFS @@ -1622,77 +1638,6 @@ static inline unsigned long cpu_util_freq(int cpu) #endif -#ifdef CONFIG_CPU_FREQ_GOV_SCHED -#define capacity_max SCHED_CAPACITY_SCALE -extern unsigned int capacity_margin; -extern struct static_key __sched_freq; - -static inline bool sched_freq(void) -{ - return static_key_false(&__sched_freq); -} - -DECLARE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); -void update_cpu_capacity_request(int cpu, bool request); - -static inline void set_cfs_cpu_capacity(int cpu, bool request, - unsigned long capacity) -{ - struct sched_capacity_reqs *scr = &per_cpu(cpu_sched_capacity_reqs, cpu); - -#ifdef CONFIG_SCHED_WALT - if (!walt_disabled && sysctl_sched_use_walt_cpu_util) { - int rtdl = scr->rt + scr->dl; - /* - * WALT tracks the utilization of a CPU considering the load - * generated by all the scheduling classes. - * Since the following call to: - * update_cpu_capacity - * is already adding the RT and DL utilizations let's remove - * these contributions from the WALT signal. 
- */ - if (capacity > rtdl) - capacity -= rtdl; - else - capacity = 0; - } -#endif - if (scr->cfs != capacity) { - scr->cfs = capacity; - update_cpu_capacity_request(cpu, request); - } -} - -static inline void set_rt_cpu_capacity(int cpu, bool request, - unsigned long capacity) -{ - if (per_cpu(cpu_sched_capacity_reqs, cpu).rt != capacity) { - per_cpu(cpu_sched_capacity_reqs, cpu).rt = capacity; - update_cpu_capacity_request(cpu, request); - } -} - -static inline void set_dl_cpu_capacity(int cpu, bool request, - unsigned long capacity) -{ - if (per_cpu(cpu_sched_capacity_reqs, cpu).dl != capacity) { - per_cpu(cpu_sched_capacity_reqs, cpu).dl = capacity; - update_cpu_capacity_request(cpu, request); - } -} -#else -static inline bool sched_freq(void) { return false; } -static inline void set_cfs_cpu_capacity(int cpu, bool request, - unsigned long capacity) -{ } -static inline void set_rt_cpu_capacity(int cpu, bool request, - unsigned long capacity) -{ } -static inline void set_dl_cpu_capacity(int cpu, bool request, - unsigned long capacity) -{ } -#endif - static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); @@ -2096,6 +2041,17 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {} #endif /* CONFIG_CPU_FREQ */ +#ifdef CONFIG_SCHED_WALT + +static inline bool +walt_task_in_cum_window_demand(struct rq *rq, struct task_struct *p) +{ + return cpu_of(rq) == task_cpu(p) && + (p->on_rq || p->last_sleep_ts >= rq->window_start); +} + +#endif /* CONFIG_SCHED_WALT */ + #ifdef arch_scale_freq_capacity #ifndef arch_scale_freq_invariant #define arch_scale_freq_invariant() (true) diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 61f852d46858..a5567ccd8803 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -12,7 +12,8 @@ #ifdef CONFIG_SMP static int -select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags, + int sibling_count_hint) { return task_cpu(p); /* stop tasks as never migrate */ } diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 28e999554463..8d25ffbe4fed 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -20,7 +20,6 @@ */ #include <linux/syscore_ops.h> -#include <linux/cpufreq.h> #include <trace/events/sched.h> #include "sched.h" #include "walt.h" @@ -42,48 +41,17 @@ static __read_mostly unsigned int walt_io_is_busy = 0; unsigned int sysctl_sched_walt_init_task_load_pct = 15; -/* 1 -> use PELT based load stats, 0 -> use window-based load stats */ -unsigned int __read_mostly walt_disabled = 0; +/* true -> use PELT based load stats, false -> use window-based load stats */ +bool __read_mostly walt_disabled = false; -static unsigned int max_possible_efficiency = 1024; -static unsigned int min_possible_efficiency = 1024; - -/* - * Maximum possible frequency across all cpus. Task demand and cpu - * capacity (cpu_power) metrics are scaled in reference to it. - */ -static unsigned int max_possible_freq = 1; - -/* - * Minimum possible max_freq across all cpus. This will be same as - * max_possible_freq on homogeneous systems and could be different from - * max_possible_freq on heterogenous systems. min_max_freq is used to derive - * capacity (cpu_power) of cpus. 
- */ -static unsigned int min_max_freq = 1; - -static unsigned int max_load_scale_factor = 1024; -static unsigned int max_possible_capacity = 1024; - -/* Mask of all CPUs that have max_possible_capacity */ -static cpumask_t mpc_mask = CPU_MASK_ALL; - -/* Window size (in ns) */ -__read_mostly unsigned int walt_ravg_window = 20000000; - -/* Min window size (in ns) = 10ms */ -#ifdef CONFIG_HZ_300 /* - * Tick interval becomes to 3333333 due to - * rounding error when HZ=300. + * Window size (in ns). Adjust for the tick size so that the window + * rollover occurs just before the tick boundary. */ -#define MIN_SCHED_RAVG_WINDOW (3333333 * 6) -#else -#define MIN_SCHED_RAVG_WINDOW 10000000 -#endif - -/* Max window size (in ns) = 1s */ -#define MAX_SCHED_RAVG_WINDOW 1000000000 +__read_mostly unsigned int walt_ravg_window = + (20000000 / TICK_NSEC) * TICK_NSEC; +#define MIN_SCHED_RAVG_WINDOW ((10000000 / TICK_NSEC) * TICK_NSEC) +#define MAX_SCHED_RAVG_WINDOW ((1000000000 / TICK_NSEC) * TICK_NSEC) static unsigned int sync_cpu; static ktime_t ktime_last; @@ -94,11 +62,28 @@ static unsigned int task_load(struct task_struct *p) return p->ravg.demand; } +static inline void fixup_cum_window_demand(struct rq *rq, s64 delta) +{ + rq->cum_window_demand += delta; + if (unlikely((s64)rq->cum_window_demand < 0)) + rq->cum_window_demand = 0; +} + void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { rq->cumulative_runnable_avg += p->ravg.demand; + + /* + * Add a task's contribution to the cumulative window demand when + * + * (1) task is enqueued with on_rq = 1 i.e migration, + * prio/cgroup/class change. + * (2) task is waking for the first time in this window. + */ + if (p->on_rq || (p->last_sleep_ts < rq->window_start)) + fixup_cum_window_demand(rq, p->ravg.demand); } void @@ -107,6 +92,14 @@ walt_dec_cumulative_runnable_avg(struct rq *rq, { rq->cumulative_runnable_avg -= p->ravg.demand; BUG_ON((s64)rq->cumulative_runnable_avg < 0); + + /* + * on_rq will be 1 for sleeping tasks. So check if the task + * is migrating or dequeuing in RUNNING state to change the + * prio/cgroup/class. 
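On the walt_ravg_window definition above: the default is now rounded down to a whole number of ticks so a window boundary never lands mid-tick, and the same rounding is applied to the boot-time override further down. A worked example of the arithmetic, assuming HZ=300 (TICK_NSEC ≈ 3333333 ns):

#include <stdio.h>

int main(void)
{
	unsigned long long tick_nsec = 3333333ULL;	/* ~1s/300 for HZ=300 */
	unsigned long long want = 20000000ULL;		/* 20 ms default      */
	unsigned long long window = (want / tick_nsec) * tick_nsec;

	/* 6 ticks -> 19999998 ns, just under the requested 20 ms */
	printf("%llu ticks -> window = %llu ns\n", want / tick_nsec, window);
	return 0;
}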
+ */ + if (task_on_rq_migrating(p) || p->state == TASK_RUNNING) + fixup_cum_window_demand(rq, -(s64)p->ravg.demand); } static void @@ -119,6 +112,8 @@ fixup_cumulative_runnable_avg(struct rq *rq, if ((s64)rq->cumulative_runnable_avg < 0) panic("cra less than zero: tld: %lld, task_load(p) = %u\n", task_load_delta, task_load(p)); + + fixup_cum_window_demand(rq, task_load_delta); } u64 walt_ktime_clock(void) @@ -177,10 +172,28 @@ static int exiting_task(struct task_struct *p) static int __init set_walt_ravg_window(char *str) { + unsigned int adj_window; + bool no_walt = walt_disabled; + get_option(&str, &walt_ravg_window); - walt_disabled = (walt_ravg_window < MIN_SCHED_RAVG_WINDOW || - walt_ravg_window > MAX_SCHED_RAVG_WINDOW); + /* Adjust for CONFIG_HZ */ + adj_window = (walt_ravg_window / TICK_NSEC) * TICK_NSEC; + + /* Warn if we're a bit too far away from the expected window size */ + WARN(adj_window < walt_ravg_window - NSEC_PER_MSEC, + "tick-adjusted window size %u, original was %u\n", adj_window, + walt_ravg_window); + + walt_ravg_window = adj_window; + + walt_disabled = walt_disabled || + (walt_ravg_window < MIN_SCHED_RAVG_WINDOW || + walt_ravg_window > MAX_SCHED_RAVG_WINDOW); + + WARN(!no_walt && walt_disabled, + "invalid window size, disabling WALT\n"); + return 0; } @@ -204,26 +217,20 @@ update_window_start(struct rq *rq, u64 wallclock) nr_windows = div64_u64(delta, walt_ravg_window); rq->window_start += (u64)nr_windows * (u64)walt_ravg_window; + + rq->cum_window_demand = rq->cumulative_runnable_avg; } +/* + * Translate absolute delta time accounted on a CPU + * to a scale where 1024 is the capacity of the most + * capable CPU running at FMAX + */ static u64 scale_exec_time(u64 delta, struct rq *rq) { - unsigned int cur_freq = rq->cur_freq; - int sf; - - if (unlikely(cur_freq > max_possible_freq)) - cur_freq = rq->max_possible_freq; - - /* round up div64 */ - delta = div64_u64(delta * cur_freq + max_possible_freq - 1, - max_possible_freq); + unsigned long capcurr = capacity_curr_of(cpu_of(rq)); - sf = DIV_ROUND_UP(rq->efficiency * 1024, max_possible_efficiency); - - delta *= sf; - delta >>= 10; - - return delta; + return (delta * capcurr) >> SCHED_CAPACITY_SHIFT; } static int cpu_is_waiting_on_io(struct rq *rq) @@ -600,10 +607,20 @@ static void update_history(struct rq *rq, struct task_struct *p, * A throttled deadline sched class task gets dequeued without * changing p->on_rq. Since the dequeue decrements hmp stats * avoid decrementing it here again. + * + * When window is rolled over, the cumulative window demand + * is reset to the cumulative runnable average (contribution from + * the tasks on the runqueue). If the current task is dequeued + * already, it's demand is not included in the cumulative runnable + * average. So add the task demand separately to cumulative window + * demand. 
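On the scale_exec_time() change earlier in this hunk: raw runtime is now normalised by capacity_curr_of(), a single 0..1024 number that already folds in current frequency and microarchitecture, instead of being re-derived from cur_freq and an efficiency factor. A small worked example of the scaling (standalone C, not the kernel helper):

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10	/* SCHED_CAPACITY_SCALE == 1024 */

static unsigned long long scale_exec_time_model(unsigned long long delta_ns,
						unsigned long capcurr)
{
	return (delta_ns * capcurr) >> SCHED_CAPACITY_SHIFT;
}

int main(void)
{
	/* 4 ms on a little CPU currently worth 384/1024 capacity counts the
	 * same as 1.5 ms on a big CPU running at full capacity. */
	printf("%llu\n", scale_exec_time_model(4000000ULL, 384));	/* 1500000 */
	printf("%llu\n", scale_exec_time_model(1500000ULL, 1024));	/* 1500000 */
	return 0;
}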
*/ - if (task_on_rq_queued(p) && (!task_has_dl_policy(p) || - !p->dl.dl_throttled)) - fixup_cumulative_runnable_avg(rq, p, demand); + if (!task_has_dl_policy(p) || !p->dl.dl_throttled) { + if (task_on_rq_queued(p)) + fixup_cumulative_runnable_avg(rq, p, demand); + else if (rq->curr == p) + fixup_cum_window_demand(rq, demand); + } p->ravg.demand = demand; @@ -746,33 +763,6 @@ done: p->ravg.mark_start = wallclock; } -unsigned long __weak arch_get_cpu_efficiency(int cpu) -{ - return SCHED_LOAD_SCALE; -} - -void walt_init_cpu_efficiency(void) -{ - int i, efficiency; - unsigned int max = 0, min = UINT_MAX; - - for_each_possible_cpu(i) { - efficiency = arch_get_cpu_efficiency(i); - cpu_rq(i)->efficiency = efficiency; - - if (efficiency > max) - max = efficiency; - if (efficiency < min) - min = efficiency; - } - - if (max) - max_possible_efficiency = max; - - if (min) - min_possible_efficiency = min; -} - static void reset_task_stats(struct task_struct *p) { u32 sum = 0; @@ -851,6 +841,17 @@ void walt_fixup_busy_time(struct task_struct *p, int new_cpu) walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0); + /* + * When a task is migrating during the wakeup, adjust + * the task's contribution towards cumulative window + * demand. + */ + if (p->state == TASK_WAKING && + p->last_sleep_ts >= src_rq->window_start) { + fixup_cum_window_demand(src_rq, -(s64)p->ravg.demand); + fixup_cum_window_demand(dest_rq, p->ravg.demand); + } + if (p->ravg.curr_window) { src_rq->curr_runnable_sum -= p->ravg.curr_window; dest_rq->curr_runnable_sum += p->ravg.curr_window; @@ -877,242 +878,6 @@ void walt_fixup_busy_time(struct task_struct *p, int new_cpu) double_rq_unlock(src_rq, dest_rq); } -/* - * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that - * least efficient cpu gets capacity of 1024 - */ -static unsigned long capacity_scale_cpu_efficiency(int cpu) -{ - return (1024 * cpu_rq(cpu)->efficiency) / min_possible_efficiency; -} - -/* - * Return 'capacity' of a cpu in reference to cpu with lowest max_freq - * (min_max_freq), such that one with lowest max_freq gets capacity of 1024. - */ -static unsigned long capacity_scale_cpu_freq(int cpu) -{ - return (1024 * cpu_rq(cpu)->max_freq) / min_max_freq; -} - -/* - * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so - * that "most" efficient cpu gets a load_scale_factor of 1 - */ -static unsigned long load_scale_cpu_efficiency(int cpu) -{ - return DIV_ROUND_UP(1024 * max_possible_efficiency, - cpu_rq(cpu)->efficiency); -} - -/* - * Return load_scale_factor of a cpu in reference to cpu with best max_freq - * (max_possible_freq), so that one with best max_freq gets a load_scale_factor - * of 1. - */ -static unsigned long load_scale_cpu_freq(int cpu) -{ - return DIV_ROUND_UP(1024 * max_possible_freq, cpu_rq(cpu)->max_freq); -} - -static int compute_capacity(int cpu) -{ - int capacity = 1024; - - capacity *= capacity_scale_cpu_efficiency(cpu); - capacity >>= 10; - - capacity *= capacity_scale_cpu_freq(cpu); - capacity >>= 10; - - return capacity; -} - -static int compute_load_scale_factor(int cpu) -{ - int load_scale = 1024; - - /* - * load_scale_factor accounts for the fact that task load - * is in reference to "best" performing cpu. Task's load will need to be - * scaled (up) by a factor to determine suitability to be placed on a - * (little) cpu. 
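The walt_fixup_busy_time() hunk above also shifts the task's demand between the two runqueues' cumulative window demand, but only when the task is migrating as part of a wakeup and it last went to sleep inside the source CPU's current window (otherwise its demand was never counted in that window). A compact model of that rule (plain C types, illustrative only):

#include <stdbool.h>
#include <stdint.h>

struct rq_model   { int64_t cum_window_demand; uint64_t window_start; };
struct task_model { uint64_t demand, last_sleep_ts; bool waking; };

static void fixup_cum_window_demand(struct rq_model *rq, int64_t delta)
{
	rq->cum_window_demand += delta;
	if (rq->cum_window_demand < 0)	/* clamp, as in the hunk above */
		rq->cum_window_demand = 0;
}

/* Wakeup migration: move demand from src to dst only if the task's sleep
 * started inside src's current window. */
static void wakeup_migrate_fixup(struct rq_model *src, struct rq_model *dst,
				 const struct task_model *p)
{
	if (p->waking && p->last_sleep_ts >= src->window_start) {
		fixup_cum_window_demand(src, -(int64_t)p->demand);
		fixup_cum_window_demand(dst, (int64_t)p->demand);
	}
}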
- */ - load_scale *= load_scale_cpu_efficiency(cpu); - load_scale >>= 10; - - load_scale *= load_scale_cpu_freq(cpu); - load_scale >>= 10; - - return load_scale; -} - -static int cpufreq_notifier_policy(struct notifier_block *nb, - unsigned long val, void *data) -{ - struct cpufreq_policy *policy = (struct cpufreq_policy *)data; - int i, update_max = 0; - u64 highest_mpc = 0, highest_mplsf = 0; - const struct cpumask *cpus = policy->related_cpus; - unsigned int orig_min_max_freq = min_max_freq; - unsigned int orig_max_possible_freq = max_possible_freq; - /* Initialized to policy->max in case policy->related_cpus is empty! */ - unsigned int orig_max_freq = policy->max; - - if (val != CPUFREQ_NOTIFY) - return 0; - - for_each_cpu(i, policy->related_cpus) { - cpumask_copy(&cpu_rq(i)->freq_domain_cpumask, - policy->related_cpus); - orig_max_freq = cpu_rq(i)->max_freq; - cpu_rq(i)->min_freq = policy->min; - cpu_rq(i)->max_freq = policy->max; - cpu_rq(i)->cur_freq = policy->cur; - cpu_rq(i)->max_possible_freq = policy->cpuinfo.max_freq; - } - - max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq); - if (min_max_freq == 1) - min_max_freq = UINT_MAX; - min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq); - BUG_ON(!min_max_freq); - BUG_ON(!policy->max); - - /* Changes to policy other than max_freq don't require any updates */ - if (orig_max_freq == policy->max) - return 0; - - /* - * A changed min_max_freq or max_possible_freq (possible during bootup) - * needs to trigger re-computation of load_scale_factor and capacity for - * all possible cpus (even those offline). It also needs to trigger - * re-computation of nr_big_task count on all online cpus. - * - * A changed rq->max_freq otoh needs to trigger re-computation of - * load_scale_factor and capacity for just the cluster of cpus involved. - * Since small task definition depends on max_load_scale_factor, a - * changed load_scale_factor of one cluster could influence - * classification of tasks in another cluster. Hence a changed - * rq->max_freq will need to trigger re-computation of nr_big_task - * count on all online cpus. - * - * While it should be sufficient for nr_big_tasks to be - * re-computed for only online cpus, we have inadequate context - * information here (in policy notifier) with regard to hotplug-safety - * context in which notification is issued. As a result, we can't use - * get_online_cpus() here, as it can lead to deadlock. Until cpufreq is - * fixed up to issue notification always in hotplug-safe context, - * re-compute nr_big_task for all possible cpus. - */ - - if (orig_min_max_freq != min_max_freq || - orig_max_possible_freq != max_possible_freq) { - cpus = cpu_possible_mask; - update_max = 1; - } - - /* - * Changed load_scale_factor can trigger reclassification of tasks as - * big or small. 
Make this change "atomic" so that tasks are accounted - * properly due to changed load_scale_factor - */ - for_each_cpu(i, cpus) { - struct rq *rq = cpu_rq(i); - - rq->capacity = compute_capacity(i); - rq->load_scale_factor = compute_load_scale_factor(i); - - if (update_max) { - u64 mpc, mplsf; - - mpc = div_u64(((u64) rq->capacity) * - rq->max_possible_freq, rq->max_freq); - rq->max_possible_capacity = (int) mpc; - - mplsf = div_u64(((u64) rq->load_scale_factor) * - rq->max_possible_freq, rq->max_freq); - - if (mpc > highest_mpc) { - highest_mpc = mpc; - cpumask_clear(&mpc_mask); - cpumask_set_cpu(i, &mpc_mask); - } else if (mpc == highest_mpc) { - cpumask_set_cpu(i, &mpc_mask); - } - - if (mplsf > highest_mplsf) - highest_mplsf = mplsf; - } - } - - if (update_max) { - max_possible_capacity = highest_mpc; - max_load_scale_factor = highest_mplsf; - } - - return 0; -} - -static int cpufreq_notifier_trans(struct notifier_block *nb, - unsigned long val, void *data) -{ - struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data; - unsigned int cpu = freq->cpu, new_freq = freq->new; - unsigned long flags; - int i; - - if (val != CPUFREQ_POSTCHANGE) - return 0; - - BUG_ON(!new_freq); - - if (cpu_rq(cpu)->cur_freq == new_freq) - return 0; - - for_each_cpu(i, &cpu_rq(cpu)->freq_domain_cpumask) { - struct rq *rq = cpu_rq(i); - - raw_spin_lock_irqsave(&rq->lock, flags); - walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, - walt_ktime_clock(), 0); - rq->cur_freq = new_freq; - raw_spin_unlock_irqrestore(&rq->lock, flags); - } - - return 0; -} - -static struct notifier_block notifier_policy_block = { - .notifier_call = cpufreq_notifier_policy -}; - -static struct notifier_block notifier_trans_block = { - .notifier_call = cpufreq_notifier_trans -}; - -static int register_sched_callback(void) -{ - int ret; - - ret = cpufreq_register_notifier(¬ifier_policy_block, - CPUFREQ_POLICY_NOTIFIER); - - if (!ret) - ret = cpufreq_register_notifier(¬ifier_trans_block, - CPUFREQ_TRANSITION_NOTIFIER); - - return 0; -} - -/* - * cpufreq callbacks can be registered at core_initcall or later time. - * Any registration done prior to that is "forgotten" by cpufreq. See - * initialization of variable init_cpufreq_transition_notifier_list_called - * for further information. - */ -core_initcall(register_sched_callback); - void walt_init_new_task_load(struct task_struct *p) { int i; diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h index f56c4da16d0b..de7edac43674 100644 --- a/kernel/sched/walt.h +++ b/kernel/sched/walt.h @@ -59,6 +59,6 @@ static inline u64 walt_ktime_clock(void) { return 0; } #endif /* CONFIG_SCHED_WALT */ -extern unsigned int walt_disabled; +extern bool walt_disabled; #endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 55caf81a833f..4e2f98dd2052 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -343,13 +343,6 @@ static struct ctl_table kern_table[] = { }, #endif { - .procname = "sched_initial_task_util", - .data = &sysctl_sched_initial_task_util, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { .procname = "sched_cstate_aware", .data = &sysctl_sched_cstate_aware, .maxlen = sizeof(unsigned int), diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 23231237f2e2..95cc76785a12 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -68,6 +68,7 @@ enum { * attach_mutex to avoid changing binding state while * worker_attach_to_pool() is in progress. 
*/ + POOL_MANAGER_ACTIVE = 1 << 0, /* being managed */ POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ /* worker flags */ @@ -163,7 +164,6 @@ struct worker_pool { /* L: hash of busy workers */ /* see manage_workers() for details on the two manager mutexes */ - struct mutex manager_arb; /* manager arbitration */ struct worker *manager; /* L: purely informational */ struct mutex attach_mutex; /* attach/detach exclusion */ struct list_head workers; /* A: attached workers */ @@ -295,6 +295,7 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ +static DECLARE_WAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */ static LIST_HEAD(workqueues); /* PR: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? */ @@ -808,7 +809,7 @@ static bool need_to_create_worker(struct worker_pool *pool) /* Do we have too many workers and should some go away? */ static bool too_many_workers(struct worker_pool *pool) { - bool managing = mutex_is_locked(&pool->manager_arb); + bool managing = pool->flags & POOL_MANAGER_ACTIVE; int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ int nr_busy = pool->nr_workers - nr_idle; @@ -1952,24 +1953,17 @@ static bool manage_workers(struct worker *worker) { struct worker_pool *pool = worker->pool; - /* - * Anyone who successfully grabs manager_arb wins the arbitration - * and becomes the manager. mutex_trylock() on pool->manager_arb - * failure while holding pool->lock reliably indicates that someone - * else is managing the pool and the worker which failed trylock - * can proceed to executing work items. This means that anyone - * grabbing manager_arb is responsible for actually performing - * manager duties. If manager_arb is grabbed and released without - * actual management, the pool may stall indefinitely. - */ - if (!mutex_trylock(&pool->manager_arb)) + if (pool->flags & POOL_MANAGER_ACTIVE) return false; + + pool->flags |= POOL_MANAGER_ACTIVE; pool->manager = worker; maybe_create_worker(pool); pool->manager = NULL; - mutex_unlock(&pool->manager_arb); + pool->flags &= ~POOL_MANAGER_ACTIVE; + wake_up(&wq_manager_wait); return true; } @@ -3119,7 +3113,6 @@ static int init_worker_pool(struct worker_pool *pool) setup_timer(&pool->mayday_timer, pool_mayday_timeout, (unsigned long)pool); - mutex_init(&pool->manager_arb); mutex_init(&pool->attach_mutex); INIT_LIST_HEAD(&pool->workers); @@ -3189,13 +3182,15 @@ static void put_unbound_pool(struct worker_pool *pool) hash_del(&pool->hash_node); /* - * Become the manager and destroy all workers. Grabbing - * manager_arb prevents @pool's workers from blocking on - * attach_mutex. + * Become the manager and destroy all workers. This prevents + * @pool's workers from blocking on attach_mutex. We're the last + * manager and @pool gets freed with the flag set. 
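The manager_arb mutex is replaced here by a flag that is only set and cleared under pool->lock, plus a global waitqueue so put_unbound_pool() can wait for an in-flight manager before destroying workers. A toy userspace model of the two sides, using pthreads purely for illustration (the kernel uses pool->lock, wq_manager_wait and wait_event_lock_irq()):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;	/* pool->lock          */
static pthread_cond_t manager_wait = PTHREAD_COND_INITIALIZER;	/* wq_manager_wait     */
static bool manager_active;					/* POOL_MANAGER_ACTIVE */

static bool manage_workers_model(void)
{
	pthread_mutex_lock(&lock);
	if (manager_active) {			/* someone else is managing */
		pthread_mutex_unlock(&lock);
		return false;
	}
	manager_active = true;
	/* ...maybe_create_worker() runs here, dropping/retaking the lock... */
	manager_active = false;
	pthread_cond_broadcast(&manager_wait);	/* wake_up(&wq_manager_wait) */
	pthread_mutex_unlock(&lock);
	return true;
}

static void put_unbound_pool_model(void)
{
	pthread_mutex_lock(&lock);
	while (manager_active)			/* wait_event_lock_irq(...)  */
		pthread_cond_wait(&manager_wait, &lock);
	manager_active = true;			/* become the last manager   */
	/* ...destroy idle workers, then tear the pool down... */
	pthread_mutex_unlock(&lock);
}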
*/ - mutex_lock(&pool->manager_arb); - spin_lock_irq(&pool->lock); + wait_event_lock_irq(wq_manager_wait, + !(pool->flags & POOL_MANAGER_ACTIVE), pool->lock); + pool->flags |= POOL_MANAGER_ACTIVE; + while ((worker = first_idle_worker(pool))) destroy_worker(worker); WARN_ON(pool->nr_workers || pool->nr_idle); @@ -3209,8 +3204,6 @@ static void put_unbound_pool(struct worker_pool *pool) if (pool->detach_completion) wait_for_completion(pool->detach_completion); - mutex_unlock(&pool->manager_arb); - /* shut down the timers */ del_timer_sync(&pool->idle_timer); del_timer_sync(&pool->mayday_timer); diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 45215870ac6c..3fa9c146fccb 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -9,6 +9,7 @@ #include <linux/workqueue.h> #include <linux/kthread.h> +#include <linux/preempt.h> struct worker_pool; @@ -59,7 +60,7 @@ struct worker { */ static inline struct worker *current_wq_worker(void) { - if (current->flags & PF_WQ_WORKER) + if (in_task() && (current->flags & PF_WQ_WORKER)) return kthread_data(current); return NULL; } |
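On the final workqueue_internal.h hunk: the added in_task() test means kthread_data() is consulted only in plain process context, so an interrupt that fires while a worker is on the CPU is not mistaken for workqueue context even though current still points at the worker. A minimal illustration of the guard, with the context test modelled as an explicit argument (flag value illustrative, not taken from this tree):

#include <stdbool.h>
#include <stddef.h>

#define PF_WQ_WORKER	0x00000020	/* illustrative bit value */

struct task { unsigned int flags; void *kthread_data; };

/* in_task() in the kernel means "not in NMI/hardirq/softirq context";
 * modelled here as a boolean parameter. */
static void *current_wq_worker_model(const struct task *current_task, bool in_task)
{
	if (in_task && (current_task->flags & PF_WQ_WORKER))
		return current_task->kthread_data;
	return NULL;	/* e.g. an IRQ handler that interrupted a worker */
}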