1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
4 *
5 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
6 *
7 * Interactivity improvements by Mike Galbraith
8 * (C) 2007 Mike Galbraith <efault@gmx.de>
9 *
10 * Various enhancements by Dmitry Adamushko.
11 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
12 *
13 * Group scheduling enhancements by Srivatsa Vaddagiri
14 * Copyright IBM Corporation, 2007
15 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
16 *
17 * Scaled math optimizations by Thomas Gleixner
18 * Copyright (C) 2007, Linutronix GmbH, Thomas Gleixner <tglx@kernel.org>
19 *
20 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
21 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
22 */
23 #include <linux/energy_model.h>
24 #include <linux/mmap_lock.h>
25 #include <linux/hugetlb_inline.h>
26 #include <linux/jiffies.h>
27 #include <linux/mm_api.h>
28 #include <linux/highmem.h>
29 #include <linux/spinlock_api.h>
30 #include <linux/cpumask_api.h>
31 #include <linux/lockdep_api.h>
32 #include <linux/softirq.h>
33 #include <linux/refcount_api.h>
34 #include <linux/topology.h>
35 #include <linux/sched/clock.h>
36 #include <linux/sched/cond_resched.h>
37 #include <linux/sched/cputime.h>
38 #include <linux/sched/isolation.h>
39 #include <linux/sched/nohz.h>
40 #include <linux/sched/prio.h>
41
42 #include <linux/cpuidle.h>
43 #include <linux/interrupt.h>
44 #include <linux/memory-tiers.h>
45 #include <linux/mempolicy.h>
46 #include <linux/mutex_api.h>
47 #include <linux/profile.h>
48 #include <linux/psi.h>
49 #include <linux/ratelimit.h>
50 #include <linux/task_work.h>
51 #include <linux/rbtree_augmented.h>
52
53 #include <asm/switch_to.h>
54
55 #include <uapi/linux/sched/types.h>
56
57 #include "sched.h"
58 #include "stats.h"
59 #include "autogroup.h"
60
/*
 * The initial- and re-scaling of tunables is configurable
 *
 * Options are:
 *
 *   SCHED_TUNABLESCALING_NONE - unscaled, always *1
 *   SCHED_TUNABLESCALING_LOG - scaled logarithmically, *(1+ilog2(ncpus))
 *   SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
 *
 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog2(ncpus)))
 *
 * Note that get_update_sysctl_factor() caps ncpus at 8 before scaling.
 */
unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
73
/*
 * Minimal preemption granularity for CPU-bound tasks:
 *
 * (default: 0.70 msec * (1 + ilog2(ncpus)), units: nanoseconds)
 *
 * normalized_sysctl_sched_base_slice keeps the unscaled value; update_sysctl()
 * multiplies it by the tunable-scaling factor to obtain
 * sysctl_sched_base_slice.
 */
unsigned int sysctl_sched_base_slice			= 700000ULL;
static unsigned int normalized_sysctl_sched_base_slice	= 700000ULL;
81
/*
 * NOTE(review): no unit is stated here. By analogy with the base slice above
 * this is presumably nanoseconds (i.e. 0.5 msec) -- confirm against the users
 * of sysctl_sched_migration_cost.
 */
__read_mostly unsigned int sysctl_sched_migration_cost	= 500000UL;
83
/*
 * Handler for the "sched_thermal_decay_shift=" boot parameter. The argument
 * is not parsed: a deprecation warning is logged and 1 is returned.
 */
static int __init setup_sched_thermal_decay_shift(char *str)
{
	pr_warn("Ignoring the deprecated sched_thermal_decay_shift= option\n");
	return 1;
}
__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
90
/*
 * For asym packing, by default the lower numbered CPU has higher priority:
 * the priority returned is simply the negated CPU number.
 *
 * The definition is __weak, so another definition of the same symbol can
 * replace it at link time.
 */
int __weak arch_asym_cpu_priority(int cpu)
{
	return -cpu;
}
98
/*
 * The margin used when comparing utilization with CPU capacity.
 *
 * 'cap' fits 'max' only while it stays strictly below 1024/1280 (80%) of it,
 * which leaves a margin of roughly 20%.
 */
#define fits_capacity(cap, max)	((max) * 1024 > (cap) * 1280)
105
/*
 * The margin used when comparing CPU capacities:
 * is 'cap1' noticeably greater than 'cap2'?
 *
 * 'cap1' has to exceed 'cap2' by more than 1078/1024, i.e. by roughly 5%.
 */
#define capacity_greater(cap1, cap2)	((cap2) * 1078 < (cap1) * 1024)
113
#ifdef CONFIG_CFS_BANDWIDTH
/*
 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 * each time a cfs_rq requests quota.
 *
 * Note: in the case that the slice exceeds the runtime remaining (either due
 * to consumption or the quota being specified to be smaller than the slice)
 * we will always only issue the remaining available time.
 *
 * (default: 5 msec, units: microseconds)
 *
 * With CONFIG_SYSCTL this is exposed by sched_fair_sysctls[] as
 * "sched_cfs_bandwidth_slice_us", with a lower bound of 1.
 */
static unsigned int sysctl_sched_cfs_bandwidth_slice		= 5000UL;
#endif

#ifdef CONFIG_NUMA_BALANCING
/*
 * Restrict the NUMA promotion throughput (MB/s) for each target node.
 *
 * With CONFIG_SYSCTL this is exposed by sched_fair_sysctls[] as
 * "numa_balancing_promote_rate_limit_MBps", with a lower bound of 0.
 */
static unsigned int sysctl_numa_balancing_promote_rate_limit	= 65536;
#endif
132
133 #ifdef CONFIG_SYSCTL
/*
 * Fair-class sysctl entries, registered under "kernel" by
 * sched_fair_sysctl_init(). Each entry is only present when its feature is
 * configured in.
 *
 * NOTE(review): both entries use the int handler proc_dointvec_minmax on
 * 'unsigned int' storage and set only a lower bound (.extra1) -- confirm that
 * this is intended.
 */
static const struct ctl_table sched_fair_sysctls[] = {
#ifdef CONFIG_CFS_BANDWIDTH
	{
		/* Values below 1 are rejected (SYSCTL_ONE). */
		.procname       = "sched_cfs_bandwidth_slice_us",
		.data           = &sysctl_sched_cfs_bandwidth_slice,
		.maxlen         = sizeof(unsigned int),
		.mode           = 0644,
		.proc_handler   = proc_dointvec_minmax,
		.extra1         = SYSCTL_ONE,
	},
#endif
#ifdef CONFIG_NUMA_BALANCING
	{
		/* Values below 0 are rejected (SYSCTL_ZERO). */
		.procname	= "numa_balancing_promote_rate_limit_MBps",
		.data		= &sysctl_numa_balancing_promote_rate_limit,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
	},
#endif /* CONFIG_NUMA_BALANCING */
};
156
/*
 * Register sched_fair_sysctls[] under the "kernel" sysctl directory.
 * Runs as a late initcall and always returns 0.
 */
static int __init sched_fair_sysctl_init(void)
{
	register_sysctl_init("kernel", sched_fair_sysctls);
	return 0;
}
late_initcall(sched_fair_sysctl_init);
163 #endif /* CONFIG_SYSCTL */
164
update_load_add(struct load_weight * lw,unsigned long inc)165 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
166 {
167 lw->weight += inc;
168 lw->inv_weight = 0;
169 }
170
update_load_sub(struct load_weight * lw,unsigned long dec)171 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
172 {
173 lw->weight -= dec;
174 lw->inv_weight = 0;
175 }
176
update_load_set(struct load_weight * lw,unsigned long w)177 static inline void update_load_set(struct load_weight *lw, unsigned long w)
178 {
179 lw->weight = w;
180 lw->inv_weight = 0;
181 }
182
183 /*
184 * Increase the granularity value when there are more CPUs,
185 * because with more CPUs the 'effective latency' as visible
186 * to users decreases. But the relationship is not linear,
187 * so pick a second-best guess by going with the log2 of the
188 * number of CPUs.
189 *
190 * This idea comes from the SD scheduler of Con Kolivas:
191 */
get_update_sysctl_factor(void)192 static unsigned int get_update_sysctl_factor(void)
193 {
194 unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
195 unsigned int factor;
196
197 switch (sysctl_sched_tunable_scaling) {
198 case SCHED_TUNABLESCALING_NONE:
199 factor = 1;
200 break;
201 case SCHED_TUNABLESCALING_LINEAR:
202 factor = cpus;
203 break;
204 case SCHED_TUNABLESCALING_LOG:
205 default:
206 factor = 1 + ilog2(cpus);
207 break;
208 }
209
210 return factor;
211 }
212
update_sysctl(void)213 static void update_sysctl(void)
214 {
215 unsigned int factor = get_update_sysctl_factor();
216
217 #define SET_SYSCTL(name) \
218 (sysctl_##name = (factor) * normalized_sysctl_##name)
219 SET_SYSCTL(sched_base_slice);
220 #undef SET_SYSCTL
221 }
222
/*
 * Boot-time entry point: apply the tunable scaling once via update_sysctl().
 */
void __init sched_init_granularity(void)
{
	update_sysctl();
}
227
/*
 * Fixed-point parameters for inverse weights: __update_inv_weight() stores
 * WMULT_CONST / weight (WMULT_CONST being the largest 32-bit value), and
 * __calc_delta() shifts the product right by WMULT_SHIFT bits.
 */
#define WMULT_CONST	(~0U)
#define WMULT_SHIFT	32
230
__update_inv_weight(struct load_weight * lw)231 static void __update_inv_weight(struct load_weight *lw)
232 {
233 unsigned long w;
234
235 if (likely(lw->inv_weight))
236 return;
237
238 w = scale_load_down(lw->weight);
239
240 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
241 lw->inv_weight = 1;
242 else if (unlikely(!w))
243 lw->inv_weight = WMULT_CONST;
244 else
245 lw->inv_weight = WMULT_CONST / w;
246 }
247
/*
 * delta_exec * weight / lw.weight
 *   OR
 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
 *
 * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
 * we're guaranteed shift stays positive because inv_weight is guaranteed to
 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
 *
 * Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus
 * weight/lw.weight <= 1, and therefore our shift will also be positive.
 *
 * The multiplier 'fact' is kept within 32 bits at every step: whenever it
 * overflows into the upper half, its low bits are dropped and 'shift' is
 * reduced by the same amount to compensate.
 */
static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
	u64 fact = scale_load_down(weight);
	u32 fact_hi = (u32)(fact >> 32);
	int shift = WMULT_SHIFT;
	int fs;

	/* Make sure lw->inv_weight is populated before it is used below. */
	__update_inv_weight(lw);

	/* The scaled weight itself does not fit 32 bits: pre-shift it. */
	if (unlikely(fact_hi)) {
		fs = fls(fact_hi);
		shift -= fs;
		fact >>= fs;
	}

	fact = mul_u32_u32(fact, lw->inv_weight);

	/* weight * inv_weight overflowed 32 bits: shift it back down. */
	fact_hi = (u32)(fact >> 32);
	if (fact_hi) {
		fs = fls(fact_hi);
		shift -= fs;
		fact >>= fs;
	}

	return mul_u64_u32_shr(delta_exec, fact, shift);
}
286
287 /*
288 * delta /= w
289 */
calc_delta_fair(u64 delta,struct sched_entity * se)290 static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
291 {
292 if (unlikely(se->load.weight != NICE_0_LOAD))
293 delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
294
295 return delta;
296 }
297
/*
 * Declaration without initializer, so the class can be referenced here.
 * NOTE(review): the initialized definition is presumably provided further
 * down in this file -- confirm.
 */
const struct sched_class fair_sched_class;
299
300 /**************************************************************
301 * CFS operations on generic schedulable entities:
302 */
303
304 #ifdef CONFIG_FAIR_GROUP_SCHED
305
/*
 * Walk up scheduling entities hierarchy.
 *
 * Iterates from the current value of @se through ->parent links until NULL.
 * @se is modified by the loop and must therefore be an lvalue.
 */
#define for_each_sched_entity(se) \
		for (; se; se = se->parent)
309
/*
 * Put @cfs_rq on its rq's leaf_cfs_rq_list, keeping children ahead of their
 * parents. A cfs_rq that is already on the list is left where it is.
 *
 * Returns true when rq->tmp_alone_branch points at the list head on exit,
 * i.e. no partially built branch is pending. Returns false when @cfs_rq was
 * added as the new beginning of a branch whose parent is not yet listed.
 */
static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	struct rq *rq = rq_of(cfs_rq);
	int cpu = cpu_of(rq);

	if (cfs_rq->on_list)
		return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;

	cfs_rq->on_list = 1;

	/*
	 * Ensure we either appear before our parent (if already
	 * enqueued) or force our parent to appear after us when it is
	 * enqueued. The fact that we always enqueue bottom-up
	 * reduces this to two cases and a special case for the root
	 * cfs_rq. Furthermore, it also means that we will always reset
	 * tmp_alone_branch either when the branch is connected
	 * to a tree or when we reach the top of the tree.
	 */
	if (cfs_rq->tg->parent &&
	    cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
		/*
		 * If parent is already on the list, we add the child
		 * just before. Thanks to the circular linked property of
		 * the list, this means putting the child at the tail
		 * of the list that starts at the parent.
		 */
		list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
			&(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
		/*
		 * The branch is now connected to its tree so we can
		 * reset tmp_alone_branch to the beginning of the
		 * list.
		 */
		rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
		return true;
	}

	if (!cfs_rq->tg->parent) {
		/*
		 * A cfs_rq without a parent should be put
		 * at the tail of the list.
		 */
		list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
			&rq->leaf_cfs_rq_list);
		/*
		 * We have reached the top of a tree so we can reset
		 * tmp_alone_branch to the beginning of the list.
		 */
		rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
		return true;
	}

	/*
	 * The parent has not already been added so we want to
	 * make sure that it will be put after us.
	 * tmp_alone_branch points to the beginning of the branch
	 * where we will add the parent.
	 */
	list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
	/*
	 * Update tmp_alone_branch to point to the new beginning
	 * of the branch.
	 */
	rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
	return false;
}
377
list_del_leaf_cfs_rq(struct cfs_rq * cfs_rq)378 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
379 {
380 if (cfs_rq->on_list) {
381 struct rq *rq = rq_of(cfs_rq);
382
383 /*
384 * With cfs_rq being unthrottled/throttled during an enqueue,
385 * it can happen the tmp_alone_branch points to the leaf that
386 * we finally want to delete. In this case, tmp_alone_branch moves
387 * to the prev element but it will point to rq->leaf_cfs_rq_list
388 * at the end of the enqueue.
389 */
390 if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
391 rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
392
393 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
394 cfs_rq->on_list = 0;
395 }
396 }
397
/*
 * Warn (once) if rq->tmp_alone_branch does not point at the head of the leaf
 * list, i.e. if a branch was left only partially connected.
 */
static inline void assert_list_leaf_cfs_rq(struct rq *rq)
{
	WARN_ON_ONCE(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
}
402
/*
 * Iterate through all leaf cfs_rq's on a runqueue.
 *
 * @pos is the lookahead cursor required by list_for_each_entry_safe().
 */
#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)			\
	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,	\
				 leaf_cfs_rq_list)
407
408 /* Do the two (enqueued) entities belong to the same group ? */
409 static inline struct cfs_rq *
is_same_group(struct sched_entity * se,struct sched_entity * pse)410 is_same_group(struct sched_entity *se, struct sched_entity *pse)
411 {
412 if (se->cfs_rq == pse->cfs_rq)
413 return se->cfs_rq;
414
415 return NULL;
416 }
417
/* Return the entity one level up in the hierarchy (se->parent). */
static inline struct sched_entity *parent_entity(const struct sched_entity *se)
{
	return se->parent;
}
422
423 static void
find_matching_se(struct sched_entity ** se,struct sched_entity ** pse)424 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
425 {
426 int se_depth, pse_depth;
427
428 /*
429 * preemption test can be made between sibling entities who are in the
430 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
431 * both tasks until we find their ancestors who are siblings of common
432 * parent.
433 */
434
435 /* First walk up until both entities are at same depth */
436 se_depth = (*se)->depth;
437 pse_depth = (*pse)->depth;
438
439 while (se_depth > pse_depth) {
440 se_depth--;
441 *se = parent_entity(*se);
442 }
443
444 while (pse_depth > se_depth) {
445 pse_depth--;
446 *pse = parent_entity(*pse);
447 }
448
449 while (!is_same_group(*se, *pse)) {
450 *se = parent_entity(*se);
451 *pse = parent_entity(*pse);
452 }
453 }
454
/* Non-zero when the task group's idle setting is positive. */
static int tg_is_idle(struct task_group *tg)
{
	return tg->idle > 0;
}
459
/* Non-zero when the cfs_rq's idle setting is positive. */
static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
{
	return cfs_rq->idle > 0;
}
464
/*
 * Is @se idle? A task entity is judged by the task's idle policy, a group
 * entity by the cfs_rq it owns.
 */
static int se_is_idle(struct sched_entity *se)
{
	if (!entity_is_task(se))
		return cfs_rq_is_idle(group_cfs_rq(se));

	return task_has_idle_policy(task_of(se));
}
471
472 #else /* !CONFIG_FAIR_GROUP_SCHED: */
473
/*
 * Without group scheduling there is no hierarchy: the body runs once for a
 * non-NULL @se, which is then set to NULL to end the loop.
 */
#define for_each_sched_entity(se) \
		for (; se; se = NULL)
476
/* !CONFIG_FAIR_GROUP_SCHED stub: no leaf list is kept; always returns true. */
static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	return true;
}
481
/* !CONFIG_FAIR_GROUP_SCHED stub: nothing to remove. */
static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}
485
/* !CONFIG_FAIR_GROUP_SCHED stub: nothing to check. */
static inline void assert_list_leaf_cfs_rq(struct rq *rq)
{
}
489
/*
 * Without group scheduling the only cfs_rq is rq->cfs: the body runs exactly
 * once, since @pos is NULL and becomes the next value of @cfs_rq.
 */
#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\
		for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
492
/* !CONFIG_FAIR_GROUP_SCHED stub: entities have no parent. */
static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return NULL;
}
497
/* !CONFIG_FAIR_GROUP_SCHED stub: *se and *pse are left untouched. */
static inline void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}
502
/* !CONFIG_FAIR_GROUP_SCHED stub: always reports "not idle". */
static inline int tg_is_idle(struct task_group *tg)
{
	return 0;
}
507
/* !CONFIG_FAIR_GROUP_SCHED stub: always reports "not idle". */
static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
{
	return 0;
}
512
/*
 * !CONFIG_FAIR_GROUP_SCHED variant: @se is converted with task_of() without
 * any check, and the task's idle policy decides.
 */
static int se_is_idle(struct sched_entity *se)
{
	return task_has_idle_policy(task_of(se));
}
517
518 #endif /* !CONFIG_FAIR_GROUP_SCHED */
519
/*
 * Forward declaration; the definition is not in this part of the file.
 * NOTE(review): presumably charges @delta_exec against @cfs_rq's bandwidth
 * runtime -- confirm at the definition.
 */
static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
522
523 /**************************************************************
524 * Scheduling class tree data structure manipulation methods:
525 */
526
/* Deliberately left undefined, see vruntime_cmp(). */
extern void __BUILD_BUG_vruntime_cmp(void);

/*
 * vruntime_cmp(A, "<op>", B): compare two vruntime values.
 *
 * The comparison is done on the difference (A - B) cast to s64 rather than on
 * the raw values. CMP_STR must be one of "<", "<=", ">" or ">=".
 *
 * An unknown operator leaves a call to the undefined function
 * __BUILD_BUG_vruntime_cmp() behind, which fails at link time.
 *
 * Use __builtin_strcmp() because of __HAVE_ARCH_STRCMP:
 */

#define vruntime_cmp(A, CMP_STR, B) ({				\
	int __res = 0;						\
								\
	if (!__builtin_strcmp(CMP_STR, "<")) {			\
		__res = ((s64)((A)-(B)) < 0);			\
	} else if (!__builtin_strcmp(CMP_STR, "<=")) {		\
		__res = ((s64)((A)-(B)) <= 0);			\
	} else if (!__builtin_strcmp(CMP_STR, ">")) {		\
		__res = ((s64)((A)-(B)) > 0);			\
	} else if (!__builtin_strcmp(CMP_STR, ">=")) {		\
		__res = ((s64)((A)-(B)) >= 0);			\
	} else {						\
		/* Unknown operator throws linker error: */	\
		__BUILD_BUG_vruntime_cmp();			\
	}							\
								\
	__res;							\
})
549
/* Deliberately left undefined, see vruntime_op(). */
extern void __BUILD_BUG_vruntime_op(void);

/*
 * vruntime_op(A, "<op>", B): arithmetic on vruntime values, yielding an s64.
 *
 * Only "-" is supported: the result is (A - B) cast to s64. Any other
 * operator leaves a call to the undefined function __BUILD_BUG_vruntime_op()
 * behind, which fails at link time.
 */
#define vruntime_op(A, OP_STR, B) ({				\
	s64 __res = 0;						\
								\
	if (!__builtin_strcmp(OP_STR, "-")) {			\
		__res = (s64)((A)-(B));				\
	} else {						\
		/* Unknown operator throws linker error: */	\
		__BUILD_BUG_vruntime_op();			\
	}							\
								\
	__res;						\
})
564
565
max_vruntime(u64 max_vruntime,u64 vruntime)566 static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime)
567 {
568 if (vruntime_cmp(vruntime, ">", max_vruntime))
569 max_vruntime = vruntime;
570
571 return max_vruntime;
572 }
573
min_vruntime(u64 min_vruntime,u64 vruntime)574 static inline __maybe_unused u64 min_vruntime(u64 min_vruntime, u64 vruntime)
575 {
576 if (vruntime_cmp(vruntime, "<", min_vruntime))
577 min_vruntime = vruntime;
578
579 return min_vruntime;
580 }
581
/*
 * Ordering predicate for the timeline: true when @a's virtual deadline is
 * strictly earlier than @b's.
 */
static inline bool entity_before(const struct sched_entity *a,
				 const struct sched_entity *b)
{
	/*
	 * Tiebreak on vruntime seems unnecessary since it can
	 * hardly happen.
	 */
	return vruntime_cmp(a->deadline, "<", b->deadline);
}
591
/*
 * Per avg_vruntime() below, cfs_rq::zero_vruntime is only slightly stale
 * and this value should be no more than two lag bounds. Which puts it in the
 * general order of:
 *
 *	(slice + TICK_NSEC) << NICE_0_LOAD_SHIFT
 *
 * which is around 44 bits in size (on 64bit); that is 20 for
 * NICE_0_LOAD_SHIFT, another 20 for NSEC_PER_MSEC and then a handful for
 * however many msec the actual slice+tick ends up being.
 *
 * (disregarding the actual divide-by-weight part makes for the worst case
 * weight of 2, which nicely cancels vs the fuzz in zero_vruntime not actually
 * being the zero-lag point).
 *
 * Returns the signed distance of @se's vruntime from cfs_rq->zero_vruntime.
 */
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return vruntime_op(se->vruntime, "-", cfs_rq->zero_vruntime);
}
611
/* Map an rb_node embedded as sched_entity::run_node back to its entity. */
#define __node_2_se(node) \
	rb_entry((node), struct sched_entity, run_node)
614
615 /*
616 * Compute virtual time from the per-task service numbers:
617 *
618 * Fair schedulers conserve lag:
619 *
620 * \Sum lag_i = 0
621 *
622 * Where lag_i is given by:
623 *
624 * lag_i = S - s_i = w_i * (V - v_i)
625 *
 * Where S is the ideal service time and V is its virtual time counterpart.
627 * Therefore:
628 *
629 * \Sum lag_i = 0
630 * \Sum w_i * (V - v_i) = 0
631 * \Sum (w_i * V - w_i * v_i) = 0
632 *
633 * From which we can solve an expression for V in v_i (which we have in
634 * se->vruntime):
635 *
636 * \Sum v_i * w_i \Sum v_i * w_i
637 * V = -------------- = --------------
638 * \Sum w_i W
639 *
640 * Specifically, this is the weighted average of all entity virtual runtimes.
641 *
642 * [[ NOTE: this is only equal to the ideal scheduler under the condition
643 * that join/leave operations happen at lag_i = 0, otherwise the
644 * virtual time has non-contiguous motion equivalent to:
645 *
646 * V +-= lag_i / W
647 *
648 * Also see the comment in place_entity() that deals with this. ]]
649 *
650 * However, since v_i is u64, and the multiplication could easily overflow
651 * transform it into a relative form that uses smaller quantities:
652 *
653 * Substitute: v_i == (v_i - v0) + v0
654 *
655 * \Sum ((v_i - v0) + v0) * w_i \Sum (v_i - v0) * w_i
656 * V = ---------------------------- = --------------------- + v0
657 * W W
658 *
659 * Which we track using:
660 *
661 * v0 := cfs_rq->zero_vruntime
662 * \Sum (v_i - v0) * w_i := cfs_rq->sum_w_vruntime
663 * \Sum w_i := cfs_rq->sum_weight
664 *
665 * Since zero_vruntime closely tracks the per-task service, these
666 * deltas: (v_i - v0), will be in the order of the maximal (virtual) lag
667 * induced in the system due to quantisation.
668 *
669 * Also, we use scale_load_down() to reduce the size.
670 *
671 * As measured, the max (key * weight) value was ~44 bits for a kernel build.
672 */
673 static void
sum_w_vruntime_add(struct cfs_rq * cfs_rq,struct sched_entity * se)674 sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
675 {
676 unsigned long weight = scale_load_down(se->load.weight);
677 s64 key = entity_key(cfs_rq, se);
678
679 cfs_rq->sum_w_vruntime += key * weight;
680 cfs_rq->sum_weight += weight;
681 }
682
683 static void
sum_w_vruntime_sub(struct cfs_rq * cfs_rq,struct sched_entity * se)684 sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
685 {
686 unsigned long weight = scale_load_down(se->load.weight);
687 s64 key = entity_key(cfs_rq, se);
688
689 cfs_rq->sum_w_vruntime -= key * weight;
690 cfs_rq->sum_weight -= weight;
691 }
692
693 static inline
update_zero_vruntime(struct cfs_rq * cfs_rq,s64 delta)694 void update_zero_vruntime(struct cfs_rq *cfs_rq, s64 delta)
695 {
696 /*
697 * v' = v + d ==> sum_w_vruntime' = sum_w_vruntime - d*sum_weight
698 */
699 cfs_rq->sum_w_vruntime -= cfs_rq->sum_weight * delta;
700 cfs_rq->zero_vruntime += delta;
701 }
702
703 /*
704 * Specifically: avg_vruntime() + 0 must result in entity_eligible() := true
705 * For this to be so, the result of this function must have a left bias.
706 *
707 * Called in:
708 * - place_entity() -- before enqueue
709 * - update_entity_lag() -- before dequeue
710 * - update_deadline() -- slice expiration
711 *
712 * This means it is one entry 'behind' but that puts it close enough to where
713 * the bound on entity_key() is at most two lag bounds.
714 */
/*
 * Compute the weighted average vruntime of @cfs_rq, re-base
 * cfs_rq->zero_vruntime onto it and return the new zero_vruntime.
 *
 * Note that this is not a pure getter: update_zero_vruntime() adjusts both
 * zero_vruntime and sum_w_vruntime on every call.
 */
u64 avg_vruntime(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	long weight = cfs_rq->sum_weight;
	s64 delta = 0;

	/* Only a curr that is still on the runqueue takes part in the average. */
	if (curr && !curr->on_rq)
		curr = NULL;

	if (weight) {
		s64 runtime = cfs_rq->sum_w_vruntime;

		/* curr is added explicitly on top of the tracked sums. */
		if (curr) {
			unsigned long w = scale_load_down(curr->load.weight);

			runtime += entity_key(cfs_rq, curr) * w;
			weight += w;
		}

		/* sign flips effective floor / ceiling */
		if (runtime < 0)
			runtime -= (weight - 1);

		delta = div_s64(runtime, weight);
	} else if (curr) {
		/*
		 * When there is but one element, it is the average.
		 */
		delta = curr->vruntime - cfs_rq->zero_vruntime;
	}

	/* With neither queued weight nor curr, delta stays 0. */
	update_zero_vruntime(cfs_rq, delta);

	return cfs_rq->zero_vruntime;
}
750
/* Forward declaration: used by update_entity_lag() ahead of its definition. */
static inline u64 cfs_rq_max_slice(struct cfs_rq *cfs_rq);
752
753 /*
754 * lag_i = S - s_i = w_i * (V - v_i)
755 *
756 * However, since V is approximated by the weighted average of all entities it
757 * is possible -- by addition/removal/reweight to the tree -- to move V around
758 * and end up with a larger lag than we started with.
759 *
 * Limit this to the largest slice on the runqueue plus TICK_NSEC (since that
 * is the timing granularity), scaled to virtual time by the entity's weight.
762 *
763 * EEVDF gives the following limit for a steady state system:
764 *
765 * -r_max < lag < max(r_max, q)
766 */
update_entity_lag(struct cfs_rq * cfs_rq,struct sched_entity * se)767 static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
768 {
769 u64 max_slice = cfs_rq_max_slice(cfs_rq) + TICK_NSEC;
770 s64 vlag, limit;
771
772 WARN_ON_ONCE(!se->on_rq);
773
774 vlag = avg_vruntime(cfs_rq) - se->vruntime;
775 limit = calc_delta_fair(max_slice, se);
776
777 se->vlag = clamp(vlag, -limit, limit);
778 }
779
780 /*
781 * Entity is eligible once it received less service than it ought to have,
782 * eg. lag >= 0.
783 *
784 * lag_i = S - s_i = w_i*(V - v_i)
785 *
786 * lag_i >= 0 -> V >= v_i
787 *
788 * \Sum (v_i - v)*w_i
789 * V = ------------------ + v
790 * \Sum w_i
791 *
792 * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i)
793 *
794 * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due
795 * to the loss in precision caused by the division.
796 */
static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
{
	struct sched_entity *curr = cfs_rq->curr;
	s64 sum = cfs_rq->sum_w_vruntime;
	long total = cfs_rq->sum_weight;
	s64 key;

	/* A curr that is still on the runqueue is added on top of the sums. */
	if (curr && curr->on_rq) {
		unsigned long w = scale_load_down(curr->load.weight);

		sum += entity_key(cfs_rq, curr) * w;
		total += w;
	}

	key = vruntime_op(vruntime, "-", cfs_rq->zero_vruntime);

	/* Compare cross-multiplied to avoid the precision loss of dividing. */
	return sum >= key * total;
}
812
/* Is @se eligible on @cfs_rq? Applies vruntime_eligible() to se->vruntime. */
int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return vruntime_eligible(cfs_rq, se->vruntime);
}
817
cfs_rq_min_slice(struct cfs_rq * cfs_rq)818 static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
819 {
820 struct sched_entity *root = __pick_root_entity(cfs_rq);
821 struct sched_entity *curr = cfs_rq->curr;
822 u64 min_slice = ~0ULL;
823
824 if (curr && curr->on_rq)
825 min_slice = curr->slice;
826
827 if (root)
828 min_slice = min(min_slice, root->min_slice);
829
830 return min_slice;
831 }
832
cfs_rq_max_slice(struct cfs_rq * cfs_rq)833 static inline u64 cfs_rq_max_slice(struct cfs_rq *cfs_rq)
834 {
835 struct sched_entity *root = __pick_root_entity(cfs_rq);
836 struct sched_entity *curr = cfs_rq->curr;
837 u64 max_slice = 0ULL;
838
839 if (curr && curr->on_rq)
840 max_slice = curr->slice;
841
842 if (root)
843 max_slice = max(max_slice, root->max_slice);
844
845 return max_slice;
846 }
847
/* rb-tree "less" callback: orders nodes by entity_before(), i.e. deadline. */
static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
{
	return entity_before(__node_2_se(a), __node_2_se(b));
}
852
__min_vruntime_update(struct sched_entity * se,struct rb_node * node)853 static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node *node)
854 {
855 if (node) {
856 struct sched_entity *rse = __node_2_se(node);
857
858 if (vruntime_cmp(se->min_vruntime, ">", rse->min_vruntime))
859 se->min_vruntime = rse->min_vruntime;
860 }
861 }
862
__min_slice_update(struct sched_entity * se,struct rb_node * node)863 static inline void __min_slice_update(struct sched_entity *se, struct rb_node *node)
864 {
865 if (node) {
866 struct sched_entity *rse = __node_2_se(node);
867 if (rse->min_slice < se->min_slice)
868 se->min_slice = rse->min_slice;
869 }
870 }
871
__max_slice_update(struct sched_entity * se,struct rb_node * node)872 static inline void __max_slice_update(struct sched_entity *se, struct rb_node *node)
873 {
874 if (node) {
875 struct sched_entity *rse = __node_2_se(node);
876 if (rse->max_slice > se->max_slice)
877 se->max_slice = rse->max_slice;
878 }
879 }
880
881 /*
882 * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime)
883 */
min_vruntime_update(struct sched_entity * se,bool exit)884 static inline bool min_vruntime_update(struct sched_entity *se, bool exit)
885 {
886 u64 old_min_vruntime = se->min_vruntime;
887 u64 old_min_slice = se->min_slice;
888 u64 old_max_slice = se->max_slice;
889 struct rb_node *node = &se->run_node;
890
891 se->min_vruntime = se->vruntime;
892 __min_vruntime_update(se, node->rb_right);
893 __min_vruntime_update(se, node->rb_left);
894
895 se->min_slice = se->slice;
896 __min_slice_update(se, node->rb_right);
897 __min_slice_update(se, node->rb_left);
898
899 se->max_slice = se->slice;
900 __max_slice_update(se, node->rb_right);
901 __max_slice_update(se, node->rb_left);
902
903 return se->min_vruntime == old_min_vruntime &&
904 se->min_slice == old_min_slice &&
905 se->max_slice == old_max_slice;
906 }
907
/*
 * Generate the augmented rb-tree callback set 'min_vruntime_cb' for
 * sched_entity::run_node, with min_vruntime_update() as the compute step.
 */
RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
		     run_node, min_vruntime, min_vruntime_update);
910
911 /*
912 * Enqueue an entity into the rb-tree:
913 */
__enqueue_entity(struct cfs_rq * cfs_rq,struct sched_entity * se)914 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
915 {
916 sum_w_vruntime_add(cfs_rq, se);
917 se->min_vruntime = se->vruntime;
918 se->min_slice = se->slice;
919 rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
920 __entity_less, &min_vruntime_cb);
921 }
922
/*
 * Remove an entity from the rb-tree and take its contribution out of the
 * cfs_rq's weighted vruntime sums.
 */
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
				  &min_vruntime_cb);
	sum_w_vruntime_sub(cfs_rq, se);
}
929
__pick_root_entity(struct cfs_rq * cfs_rq)930 struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq)
931 {
932 struct rb_node *root = cfs_rq->tasks_timeline.rb_root.rb_node;
933
934 if (!root)
935 return NULL;
936
937 return __node_2_se(root);
938 }
939
__pick_first_entity(struct cfs_rq * cfs_rq)940 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
941 {
942 struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
943
944 if (!left)
945 return NULL;
946
947 return __node_2_se(left);
948 }
949
950 /*
951 * Set the vruntime up to which an entity can run before looking
952 * for another entity to pick.
953 * In case of run to parity, we use the shortest slice of the enqueued
954 * entities to set the protected period.
955 * When run to parity is disabled, we give a minimum quantum to the running
956 * entity to ensure progress.
957 */
set_protect_slice(struct cfs_rq * cfs_rq,struct sched_entity * se)958 static inline void set_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
959 {
960 u64 slice = normalized_sysctl_sched_base_slice;
961 u64 vprot = se->deadline;
962
963 if (sched_feat(RUN_TO_PARITY))
964 slice = cfs_rq_min_slice(cfs_rq);
965
966 slice = min(slice, se->slice);
967 if (slice != se->slice)
968 vprot = min_vruntime(vprot, se->vruntime + calc_delta_fair(slice, se));
969
970 se->vprot = vprot;
971 }
972
update_protect_slice(struct cfs_rq * cfs_rq,struct sched_entity * se)973 static inline void update_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
974 {
975 u64 slice = cfs_rq_min_slice(cfs_rq);
976
977 se->vprot = min_vruntime(se->vprot, se->vruntime + calc_delta_fair(slice, se));
978 }
979
/* True while @se's vruntime is still before its protected point se->vprot. */
static inline bool protect_slice(struct sched_entity *se)
{
	return vruntime_cmp(se->vruntime, "<", se->vprot);
}
984
cancel_protect_slice(struct sched_entity * se)985 static inline void cancel_protect_slice(struct sched_entity *se)
986 {
987 if (protect_slice(se))
988 se->vprot = se->vruntime;
989 }
990
991 /*
992 * Earliest Eligible Virtual Deadline First
993 *
994 * In order to provide latency guarantees for different request sizes
995 * EEVDF selects the best runnable task from two criteria:
996 *
997 * 1) the task must be eligible (must be owed service)
998 *
999 * 2) from those tasks that meet 1), we select the one
1000 * with the earliest virtual deadline.
1001 *
1002 * We can do this in O(log n) time due to an augmented RB-tree. The
1003 * tree keeps the entries sorted on deadline, but also functions as a
1004 * heap based on the vruntime by keeping:
1005 *
1006 * se->min_vruntime = min(se->vruntime, se->{left,right}->min_vruntime)
1007 *
1008 * Which allows tree pruning through eligibility.
1009 */
/*
 * Pick the entity to run on @cfs_rq.
 *
 * @protect: when true, an eligible curr that is still inside its protected
 *           slice (protect_slice()) is returned without searching the tree.
 *
 * Returns the chosen entity. This can be NULL when neither the tree search
 * nor curr yields an eligible entity.
 */
static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
{
	struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
	struct sched_entity *se = __pick_first_entity(cfs_rq);
	struct sched_entity *curr = cfs_rq->curr;
	struct sched_entity *best = NULL;

	/*
	 * We can safely skip eligibility check if there is only one entity
	 * in this cfs_rq, saving some cycles.
	 */
	if (cfs_rq->nr_queued == 1)
		return curr && curr->on_rq ? curr : se;

	/*
	 * Picking the ->next buddy will affect latency but not fairness.
	 */
	if (sched_feat(PICK_BUDDY) &&
	    cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
		/* ->next will never be delayed */
		WARN_ON_ONCE(cfs_rq->next->sched_delayed);
		return cfs_rq->next;
	}

	/* curr only competes while it is on the runqueue and eligible. */
	if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
		curr = NULL;

	if (curr && protect && protect_slice(curr))
		return curr;

	/* Pick the leftmost entity if it's eligible */
	if (se && entity_eligible(cfs_rq, se)) {
		best = se;
		goto found;
	}

	/* Heap search for the EEVD entity */
	while (node) {
		struct rb_node *left = node->rb_left;

		/*
		 * Eligible entities in left subtree are always better
		 * choices, since they have earlier deadlines.
		 */
		if (left && vruntime_eligible(cfs_rq,
					__node_2_se(left)->min_vruntime)) {
			node = left;
			continue;
		}

		se = __node_2_se(node);

		/*
		 * The left subtree either is empty or has no eligible
		 * entity, so check the current node since it is the one
		 * with earliest deadline that might be eligible.
		 */
		if (entity_eligible(cfs_rq, se)) {
			best = se;
			break;
		}

		node = node->rb_right;
	}
found:
	/* curr is not in the tree: prefer it when its deadline is earlier. */
	if (!best || (curr && entity_before(curr, best)))
		best = curr;

	return best;
}
1080
pick_eevdf(struct cfs_rq * cfs_rq)1081 static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
1082 {
1083 return __pick_eevdf(cfs_rq, true);
1084 }
1085
__pick_last_entity(struct cfs_rq * cfs_rq)1086 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
1087 {
1088 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
1089
1090 if (!last)
1091 return NULL;
1092
1093 return __node_2_se(last);
1094 }
1095
1096 /**************************************************************
1097 * Scheduling class statistics methods:
1098 */
sched_update_scaling(void)1099 int sched_update_scaling(void)
1100 {
1101 unsigned int factor = get_update_sysctl_factor();
1102
1103 #define WRT_SYSCTL(name) \
1104 (normalized_sysctl_##name = sysctl_##name / (factor))
1105 WRT_SYSCTL(sched_base_slice);
1106 #undef WRT_SYSCTL
1107
1108 return 0;
1109 }
1110
1111 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
1112
1113 /*
1114 * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i
1115 * this is probably good enough.
1116 */
update_deadline(struct cfs_rq * cfs_rq,struct sched_entity * se)1117 static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se)
1118 {
1119 if (vruntime_cmp(se->vruntime, "<", se->deadline))
1120 return false;
1121
1122 /*
1123 * For EEVDF the virtual time slope is determined by w_i (iow.
1124 * nice) while the request time r_i is determined by
1125 * sysctl_sched_base_slice.
1126 */
1127 if (!se->custom_slice)
1128 se->slice = sysctl_sched_base_slice;
1129
1130 /*
1131 * EEVDF: vd_i = ve_i + r_i / w_i
1132 */
1133 se->deadline = se->vruntime + calc_delta_fair(se->slice, se);
1134 avg_vruntime(cfs_rq);
1135
1136 /*
1137 * The task has consumed its request, reschedule.
1138 */
1139 return true;
1140 }
1141
1142 #include "pelt.h"
1143
1144 static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
1145 static unsigned long task_h_load(struct task_struct *p);
1146 static unsigned long capacity_of(int cpu);
1147
1148 /* Give new sched_entity start runnable values to heavy its load in infant time */
init_entity_runnable_average(struct sched_entity * se)1149 void init_entity_runnable_average(struct sched_entity *se)
1150 {
1151 struct sched_avg *sa = &se->avg;
1152
1153 memset(sa, 0, sizeof(*sa));
1154
1155 /*
1156 * Tasks are initialized with full load to be seen as heavy tasks until
1157 * they get a chance to stabilize to their real load level.
1158 * Group entities are initialized with zero load to reflect the fact that
1159 * nothing has been attached to the task group yet.
1160 */
1161 if (entity_is_task(se))
1162 sa->load_avg = scale_load_down(se->load.weight);
1163
1164 /* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
1165 }
1166
1167 /*
1168 * With new tasks being created, their initial util_avgs are extrapolated
1169 * based on the cfs_rq's current util_avg:
1170 *
1171 * util_avg = cfs_rq->avg.util_avg / (cfs_rq->avg.load_avg + 1)
1172 * * se_weight(se)
1173 *
1174 * However, in many cases, the above util_avg does not give a desired
1175 * value. Moreover, the sum of the util_avgs may be divergent, such
1176 * as when the series is a harmonic series.
1177 *
1178 * To solve this problem, we also cap the util_avg of successive tasks to
1179 * only 1/2 of the left utilization budget:
1180 *
1181 * util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
1182 *
1183 * where n denotes the nth task and cpu_scale the CPU capacity.
1184 *
1185 * For example, for a CPU with 1024 of capacity, a simplest series from
1186 * the beginning would be like:
1187 *
1188 * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
1189 * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
1190 *
1191 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
1192 * if util_avg > util_avg_cap.
1193 */
post_init_entity_util_avg(struct task_struct * p)1194 void post_init_entity_util_avg(struct task_struct *p)
1195 {
1196 struct sched_entity *se = &p->se;
1197 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1198 struct sched_avg *sa = &se->avg;
1199 long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
1200 long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
1201
1202 if (p->sched_class != &fair_sched_class) {
1203 /*
1204 * For !fair tasks do:
1205 *
1206 update_cfs_rq_load_avg(now, cfs_rq);
1207 attach_entity_load_avg(cfs_rq, se);
1208 switched_from_fair(rq, p);
1209 *
1210 * such that the next switched_to_fair() has the
1211 * expected state.
1212 */
1213 se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
1214 return;
1215 }
1216
1217 if (cap > 0) {
1218 if (cfs_rq->avg.util_avg != 0) {
1219 sa->util_avg = cfs_rq->avg.util_avg * se_weight(se);
1220 sa->util_avg /= (cfs_rq->avg.load_avg + 1);
1221
1222 if (sa->util_avg > cap)
1223 sa->util_avg = cap;
1224 } else {
1225 sa->util_avg = cap;
1226 }
1227 }
1228
1229 sa->runnable_avg = sa->util_avg;
1230 }
1231
/*
 * Charge the time elapsed since se->exec_start to @se and restart its
 * measurement at the current rq task clock.
 *
 * Returns the elapsed time. A value <= 0 means nothing was accounted and
 * se->exec_start was left untouched.
 */
static s64 update_se(struct rq *rq, struct sched_entity *se)
{
	u64 now = rq_clock_task(rq);
	s64 delta_exec;

	delta_exec = now - se->exec_start;
	if (unlikely(delta_exec <= 0))
		return delta_exec;

	se->exec_start = now;
	if (entity_is_task(se)) {
		struct task_struct *donor = task_of(se);
		struct task_struct *running = rq->curr;
		/*
		 * If se is a task, we account the time against the running
		 * task, as w/ proxy-exec they may not be the same.
		 */
		running->se.exec_start = now;
		running->se.sum_exec_runtime += delta_exec;

		trace_sched_stat_runtime(running, delta_exec);
		account_group_exec_runtime(running, delta_exec);

		/* cgroup time is always accounted against the donor */
		cgroup_account_cputime(donor, delta_exec);
	} else {
		/* If not task, account the time against donor se */
		se->sum_exec_runtime += delta_exec;
	}

	/* Track the longest single accounting delta seen for this entity. */
	if (schedstat_enabled()) {
		struct sched_statistics *stats;

		stats = __schedstats_from_se(se);
		__schedstat_set(stats->exec_max,
				max(delta_exec, stats->exec_max));
	}

	return delta_exec;
}
1272
1273 static void set_next_buddy(struct sched_entity *se);
1274
1275 /*
1276 * Used by other classes to account runtime.
1277 */
update_curr_common(struct rq * rq)1278 s64 update_curr_common(struct rq *rq)
1279 {
1280 return update_se(rq, &rq->donor->se);
1281 }
1282
/*
 * Update the current task's runtime statistics.
 *
 * Accounts the elapsed runtime of cfs_rq->curr, advances its vruntime and
 * deadline, feeds the fair server and cfs_rq runtime accounting, and
 * requests a lazy reschedule when the entity used up its request or is no
 * longer slice-protected.
 */
static void update_curr(struct cfs_rq *cfs_rq)
{
	/*
	 * Note: cfs_rq->curr corresponds to the task picked to
	 * run (ie: rq->donor.se) which due to proxy-exec may
	 * not necessarily be the actual task running
	 * (rq->curr.se). This is easy to confuse!
	 */
	struct sched_entity *curr = cfs_rq->curr;
	struct rq *rq = rq_of(cfs_rq);
	s64 delta_exec;
	bool resched;

	if (unlikely(!curr))
		return;

	/* Nothing elapsed (or the clock went backwards): nothing to do. */
	delta_exec = update_se(rq, curr);
	if (unlikely(delta_exec <= 0))
		return;

	curr->vruntime += calc_delta_fair(delta_exec, curr);
	resched = update_deadline(cfs_rq, curr);

	if (entity_is_task(curr)) {
		/*
		 * If the fair_server is active, we need to account for the
		 * fair_server time whether or not the task is running on
		 * behalf of fair_server or not:
		 *  - If the task is running on behalf of fair_server, we need
		 *    to limit its time based on the assigned runtime.
		 *  - Fair task that runs outside of fair_server should account
		 *    against fair_server such that it can account for this time
		 *    and possibly avoid running this period.
		 */
		dl_server_update(&rq->fair_server, delta_exec);
	}

	account_cfs_rq_runtime(cfs_rq, delta_exec);

	/* A lone entity has nobody to be preempted by. */
	if (cfs_rq->nr_queued == 1)
		return;

	if (resched || !protect_slice(curr)) {
		resched_curr_lazy(rq);
		clear_buddies(cfs_rq, curr);
	}
}
1333
update_curr_fair(struct rq * rq)1334 static void update_curr_fair(struct rq *rq)
1335 {
1336 update_curr(cfs_rq_of(&rq->donor->se));
1337 }
1338
1339 static inline void
update_stats_wait_start_fair(struct cfs_rq * cfs_rq,struct sched_entity * se)1340 update_stats_wait_start_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
1341 {
1342 struct sched_statistics *stats;
1343 struct task_struct *p = NULL;
1344
1345 if (!schedstat_enabled())
1346 return;
1347
1348 stats = __schedstats_from_se(se);
1349
1350 if (entity_is_task(se))
1351 p = task_of(se);
1352
1353 __update_stats_wait_start(rq_of(cfs_rq), p, stats);
1354 }
1355
1356 static inline void
update_stats_wait_end_fair(struct cfs_rq * cfs_rq,struct sched_entity * se)1357 update_stats_wait_end_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
1358 {
1359 struct sched_statistics *stats;
1360 struct task_struct *p = NULL;
1361
1362 if (!schedstat_enabled())
1363 return;
1364
1365 stats = __schedstats_from_se(se);
1366
1367 /*
1368 * When the sched_schedstat changes from 0 to 1, some sched se
1369 * maybe already in the runqueue, the se->statistics.wait_start
1370 * will be 0.So it will let the delta wrong. We need to avoid this
1371 * scenario.
1372 */
1373 if (unlikely(!schedstat_val(stats->wait_start)))
1374 return;
1375
1376 if (entity_is_task(se))
1377 p = task_of(se);
1378
1379 __update_stats_wait_end(rq_of(cfs_rq), p, stats);
1380 }
1381
1382 static inline void
update_stats_enqueue_sleeper_fair(struct cfs_rq * cfs_rq,struct sched_entity * se)1383 update_stats_enqueue_sleeper_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
1384 {
1385 struct sched_statistics *stats;
1386 struct task_struct *tsk = NULL;
1387
1388 if (!schedstat_enabled())
1389 return;
1390
1391 stats = __schedstats_from_se(se);
1392
1393 if (entity_is_task(se))
1394 tsk = task_of(se);
1395
1396 __update_stats_enqueue_sleeper(rq_of(cfs_rq), tsk, stats);
1397 }
1398
1399 /*
1400 * Task is being enqueued - update stats:
1401 */
1402 static inline void
update_stats_enqueue_fair(struct cfs_rq * cfs_rq,struct sched_entity * se,int flags)1403 update_stats_enqueue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1404 {
1405 if (!schedstat_enabled())
1406 return;
1407
1408 /*
1409 * Are we enqueueing a waiting task? (for current tasks
1410 * a dequeue/enqueue event is a NOP)
1411 */
1412 if (se != cfs_rq->curr)
1413 update_stats_wait_start_fair(cfs_rq, se);
1414
1415 if (flags & ENQUEUE_WAKEUP)
1416 update_stats_enqueue_sleeper_fair(cfs_rq, se);
1417 }
1418
1419 static inline void
update_stats_dequeue_fair(struct cfs_rq * cfs_rq,struct sched_entity * se,int flags)1420 update_stats_dequeue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1421 {
1422
1423 if (!schedstat_enabled())
1424 return;
1425
1426 /*
1427 * Mark the end of the wait period if dequeueing a
1428 * waiting task:
1429 */
1430 if (se != cfs_rq->curr)
1431 update_stats_wait_end_fair(cfs_rq, se);
1432
1433 if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
1434 struct task_struct *tsk = task_of(se);
1435 unsigned int state;
1436
1437 /* XXX racy against TTWU */
1438 state = READ_ONCE(tsk->__state);
1439 if (state & TASK_INTERRUPTIBLE)
1440 __schedstat_set(tsk->stats.sleep_start,
1441 rq_clock(rq_of(cfs_rq)));
1442 if (state & TASK_UNINTERRUPTIBLE)
1443 __schedstat_set(tsk->stats.block_start,
1444 rq_clock(rq_of(cfs_rq)));
1445 }
1446 }
1447
1448 /*
1449 * We are picking a new current task - update its stats:
1450 */
1451 static inline void
update_stats_curr_start(struct cfs_rq * cfs_rq,struct sched_entity * se)1452 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
1453 {
1454 /*
1455 * We are starting a new run period:
1456 */
1457 se->exec_start = rq_clock_task(rq_of(cfs_rq));
1458 }
1459
1460 /**************************************************
1461 * Scheduling class queueing methods:
1462 */
1463
/*
 * Report whether every SMT sibling of @cpu, other than @cpu itself, is
 * idle. Without CONFIG_SCHED_SMT there are no siblings to check and the
 * answer is always true.
 */
static inline bool is_core_idle(int cpu)
{
#ifdef CONFIG_SCHED_SMT
	int sibling;

	for_each_cpu(sibling, cpu_smt_mask(cpu)) {
		/* @cpu's own state does not matter here. */
		if (sibling != cpu && !idle_cpu(sibling))
			return false;
	}
#endif

	return true;
}
1480
1481 #ifdef CONFIG_NUMA
#define NUMA_IMBALANCE_MIN 2

/*
 * Decide how much of a NUMA @imbalance should be acted upon.
 *
 * Once the destination runs more tasks than @imb_numa_nr, the imbalance is
 * reported unchanged: above that threshold individual tasks may contend
 * for memory bandwidth and shared HT resources. This is an approximation,
 * as the number of running tasks need not match the number of busy CPUs
 * (sched_setaffinity).
 *
 * At or below the threshold, an imbalance of up to NUMA_IMBALANCE_MIN is
 * tolerated (returns 0) so that a simple pair of communicating tasks can
 * stay local on a lightly loaded destination.
 */
static inline long
adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr)
{
	if (dst_running <= imb_numa_nr && imbalance <= NUMA_IMBALANCE_MIN)
		return 0;

	return imbalance;
}
1506 #endif /* CONFIG_NUMA */
1507
1508 #ifdef CONFIG_NUMA_BALANCING
/*
 * Approximate time to scan a full NUMA task in ms. The task scan period is
 * calculated based on the task's virtual memory size and
 * numa_balancing_scan_size.
 *
 * task_scan_min() and task_scan_max() divide these bounds by the number of
 * scan windows of the task.
 */
unsigned int sysctl_numa_balancing_scan_period_min = 1000;
unsigned int sysctl_numa_balancing_scan_period_max = 60000;

/* Portion of address space to scan in MB */
unsigned int sysctl_numa_balancing_scan_size = 256;

/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;

/* The page with hint page fault latency < threshold in ms is considered hot */
unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC;
1525
/*
 * Fault statistics of a NUMA group; tasks reference their group through
 * task_struct::numa_group.
 */
struct numa_group {
	/* Read by task_scan_start()/task_scan_max() to scale the scan period. */
	refcount_t refcount;

	spinlock_t lock; /* nr_tasks, tasks */
	int nr_tasks;
	/* Identifier reported by task_numa_group_id(). */
	pid_t gid;
	int active_nodes;

	struct rcu_head rcu;
	/* Denominator used by group_weight(). */
	unsigned long total_faults;
	/* Reference value for numa_is_active_node(). */
	unsigned long max_faults_cpu;
	/*
	 * faults[] array is split into two regions: faults_mem and faults_cpu.
	 *
	 * Faults_cpu is used to decide whether memory should move
	 * towards the CPU. As a consequence, these stats are weighted
	 * more by CPU use than by memory faults.
	 *
	 * Entries are addressed with task_faults_idx().
	 */
	unsigned long faults[];
};
1546
/*
 * For functions that can be called in multiple contexts that permit reading
 * ->numa_group (see struct task_struct for locking rules).
 *
 * The condition handed to rcu_dereference_check() accepts either @p being
 * the current task, or the lock returned by __rq_lockp(task_rq(p)) being
 * held while p->on_cpu reads as zero.
 */
static struct numa_group *deref_task_numa_group(struct task_struct *p)
{
	return rcu_dereference_check(p->numa_group, p == current ||
		(lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu)));
}
1556
/*
 * Read ->numa_group of @p for callers operating on the current task; the
 * protection condition passed along is simply p == current.
 */
static struct numa_group *deref_curr_numa_group(struct task_struct *p)
{
	return rcu_dereference_protected(p->numa_group, p == current);
}
1561
1562 static inline unsigned long group_faults_priv(struct numa_group *ng);
1563 static inline unsigned long group_faults_shared(struct numa_group *ng);
1564
task_nr_scan_windows(struct task_struct * p)1565 static unsigned int task_nr_scan_windows(struct task_struct *p)
1566 {
1567 unsigned long rss = 0;
1568 unsigned long nr_scan_pages;
1569
1570 /*
1571 * Calculations based on RSS as non-present and empty pages are skipped
1572 * by the PTE scanner and NUMA hinting faults should be trapped based
1573 * on resident pages
1574 */
1575 nr_scan_pages = MB_TO_PAGES(sysctl_numa_balancing_scan_size);
1576 rss = get_mm_rss(p->mm);
1577 if (!rss)
1578 rss = nr_scan_pages;
1579
1580 rss = round_up(rss, nr_scan_pages);
1581 return rss / nr_scan_pages;
1582 }
1583
1584 /* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
1585 #define MAX_SCAN_WINDOW 2560
1586
task_scan_min(struct task_struct * p)1587 static unsigned int task_scan_min(struct task_struct *p)
1588 {
1589 unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
1590 unsigned int scan, floor;
1591 unsigned int windows = 1;
1592
1593 if (scan_size < MAX_SCAN_WINDOW)
1594 windows = MAX_SCAN_WINDOW / scan_size;
1595 floor = 1000 / windows;
1596
1597 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
1598 return max_t(unsigned int, floor, scan);
1599 }
1600
task_scan_start(struct task_struct * p)1601 static unsigned int task_scan_start(struct task_struct *p)
1602 {
1603 unsigned long smin = task_scan_min(p);
1604 unsigned long period = smin;
1605 struct numa_group *ng;
1606
1607 /* Scale the maximum scan period with the amount of shared memory. */
1608 rcu_read_lock();
1609 ng = rcu_dereference_all(p->numa_group);
1610 if (ng) {
1611 unsigned long shared = group_faults_shared(ng);
1612 unsigned long private = group_faults_priv(ng);
1613
1614 period *= refcount_read(&ng->refcount);
1615 period *= shared + 1;
1616 period /= private + shared + 1;
1617 }
1618 rcu_read_unlock();
1619
1620 return max(smin, period);
1621 }
1622
task_scan_max(struct task_struct * p)1623 static unsigned int task_scan_max(struct task_struct *p)
1624 {
1625 unsigned long smin = task_scan_min(p);
1626 unsigned long smax;
1627 struct numa_group *ng;
1628
1629 /* Watch for min being lower than max due to floor calculations */
1630 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
1631
1632 /* Scale the maximum scan period with the amount of shared memory. */
1633 ng = deref_curr_numa_group(p);
1634 if (ng) {
1635 unsigned long shared = group_faults_shared(ng);
1636 unsigned long private = group_faults_priv(ng);
1637 unsigned long period = smax;
1638
1639 period *= refcount_read(&ng->refcount);
1640 period *= shared + 1;
1641 period /= private + shared + 1;
1642
1643 smax = max(smax, period);
1644 }
1645
1646 return max(smin, smax);
1647 }
1648
account_numa_enqueue(struct rq * rq,struct task_struct * p)1649 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1650 {
1651 rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
1652 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1653 }
1654
account_numa_dequeue(struct rq * rq,struct task_struct * p)1655 static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1656 {
1657 rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
1658 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1659 }
1660
/*
 * Shared or private faults. This is also the per-node stride used by
 * task_faults_idx(), whose priv argument (0 or 1) selects the slot.
 */
#define NR_NUMA_HINT_FAULT_TYPES 2

/* Memory and CPU locality */
#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)

/* Averaged statistics, and temporary buffers. */
#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1669
task_numa_group_id(struct task_struct * p)1670 pid_t task_numa_group_id(struct task_struct *p)
1671 {
1672 struct numa_group *ng;
1673 pid_t gid = 0;
1674
1675 rcu_read_lock();
1676 ng = rcu_dereference_all(p->numa_group);
1677 if (ng)
1678 gid = ng->gid;
1679 rcu_read_unlock();
1680
1681 return gid;
1682 }
1683
1684 /*
1685 * The averaged statistics, shared & private, memory & CPU,
1686 * occupy the first half of the array. The second half of the
1687 * array is for current counters, which are averaged into the
1688 * first set by task_numa_placement.
1689 */
task_faults_idx(enum numa_faults_stats s,int nid,int priv)1690 static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
1691 {
1692 return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
1693 }
1694
task_faults(struct task_struct * p,int nid)1695 static inline unsigned long task_faults(struct task_struct *p, int nid)
1696 {
1697 if (!p->numa_faults)
1698 return 0;
1699
1700 return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1701 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
1702 }
1703
group_faults(struct task_struct * p,int nid)1704 static inline unsigned long group_faults(struct task_struct *p, int nid)
1705 {
1706 struct numa_group *ng = deref_task_numa_group(p);
1707
1708 if (!ng)
1709 return 0;
1710
1711 return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1712 ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
1713 }
1714
group_faults_cpu(struct numa_group * group,int nid)1715 static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1716 {
1717 return group->faults[task_faults_idx(NUMA_CPU, nid, 0)] +
1718 group->faults[task_faults_idx(NUMA_CPU, nid, 1)];
1719 }
1720
group_faults_priv(struct numa_group * ng)1721 static inline unsigned long group_faults_priv(struct numa_group *ng)
1722 {
1723 unsigned long faults = 0;
1724 int node;
1725
1726 for_each_online_node(node) {
1727 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
1728 }
1729
1730 return faults;
1731 }
1732
group_faults_shared(struct numa_group * ng)1733 static inline unsigned long group_faults_shared(struct numa_group *ng)
1734 {
1735 unsigned long faults = 0;
1736 int node;
1737
1738 for_each_online_node(node) {
1739 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
1740 }
1741
1742 return faults;
1743 }
1744
1745 /*
1746 * A node triggering more than 1/3 as many NUMA faults as the maximum is
1747 * considered part of a numa group's pseudo-interleaving set. Migrations
1748 * between these nodes are slowed down, to allow things to settle down.
1749 */
1750 #define ACTIVE_NODE_FRACTION 3
1751
numa_is_active_node(int nid,struct numa_group * ng)1752 static bool numa_is_active_node(int nid, struct numa_group *ng)
1753 {
1754 return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
1755 }
1756
1757 /* Handle placement on systems where not all nodes are directly connected. */
score_nearby_nodes(struct task_struct * p,int nid,int lim_dist,bool task)1758 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1759 int lim_dist, bool task)
1760 {
1761 unsigned long score = 0;
1762 int node, max_dist;
1763
1764 /*
1765 * All nodes are directly connected, and the same distance
1766 * from each other. No need for fancy placement algorithms.
1767 */
1768 if (sched_numa_topology_type == NUMA_DIRECT)
1769 return 0;
1770
1771 /* sched_max_numa_distance may be changed in parallel. */
1772 max_dist = READ_ONCE(sched_max_numa_distance);
1773 /*
1774 * This code is called for each node, introducing N^2 complexity,
1775 * which should be OK given the number of nodes rarely exceeds 8.
1776 */
1777 for_each_online_node(node) {
1778 unsigned long faults;
1779 int dist = node_distance(nid, node);
1780
1781 /*
1782 * The furthest away nodes in the system are not interesting
1783 * for placement; nid was already counted.
1784 */
1785 if (dist >= max_dist || node == nid)
1786 continue;
1787
1788 /*
1789 * On systems with a backplane NUMA topology, compare groups
1790 * of nodes, and move tasks towards the group with the most
1791 * memory accesses. When comparing two nodes at distance
1792 * "hoplimit", only nodes closer by than "hoplimit" are part
1793 * of each group. Skip other nodes.
1794 */
1795 if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist)
1796 continue;
1797
1798 /* Add up the faults from nearby nodes. */
1799 if (task)
1800 faults = task_faults(p, node);
1801 else
1802 faults = group_faults(p, node);
1803
1804 /*
1805 * On systems with a glueless mesh NUMA topology, there are
1806 * no fixed "groups of nodes". Instead, nodes that are not
1807 * directly connected bounce traffic through intermediate
1808 * nodes; a numa_group can occupy any set of nodes.
1809 * The further away a node is, the less the faults count.
1810 * This seems to result in good task placement.
1811 */
1812 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1813 faults *= (max_dist - dist);
1814 faults /= (max_dist - LOCAL_DISTANCE);
1815 }
1816
1817 score += faults;
1818 }
1819
1820 return score;
1821 }
1822
1823 /*
1824 * These return the fraction of accesses done by a particular task, or
1825 * task group, on a particular numa node. The group weight is given a
1826 * larger multiplier, in order to group tasks together that are almost
1827 * evenly spread out between numa nodes.
1828 */
task_weight(struct task_struct * p,int nid,int dist)1829 static inline unsigned long task_weight(struct task_struct *p, int nid,
1830 int dist)
1831 {
1832 unsigned long faults, total_faults;
1833
1834 if (!p->numa_faults)
1835 return 0;
1836
1837 total_faults = p->total_numa_faults;
1838
1839 if (!total_faults)
1840 return 0;
1841
1842 faults = task_faults(p, nid);
1843 faults += score_nearby_nodes(p, nid, dist, true);
1844
1845 return 1000 * faults / total_faults;
1846 }
1847
group_weight(struct task_struct * p,int nid,int dist)1848 static inline unsigned long group_weight(struct task_struct *p, int nid,
1849 int dist)
1850 {
1851 struct numa_group *ng = deref_task_numa_group(p);
1852 unsigned long faults, total_faults;
1853
1854 if (!ng)
1855 return 0;
1856
1857 total_faults = ng->total_faults;
1858
1859 if (!total_faults)
1860 return 0;
1861
1862 faults = group_faults(p, nid);
1863 faults += score_nearby_nodes(p, nid, dist, false);
1864
1865 return 1000 * faults / total_faults;
1866 }
1867
1868 /*
1869 * If memory tiering mode is enabled, cpupid of slow memory page is
1870 * used to record scan time instead of CPU and PID. When tiering mode
1871 * is disabled at run time, the scan time (in cpupid) will be
1872 * interpreted as CPU and PID. So CPU needs to be checked to avoid to
1873 * access out of array bound.
1874 */
cpupid_valid(int cpupid)1875 static inline bool cpupid_valid(int cpupid)
1876 {
1877 return cpupid_to_cpu(cpupid) < nr_cpu_ids;
1878 }
1879
1880 /*
1881 * For memory tiering mode, if there are enough free pages (more than
1882 * enough watermark defined here) in fast memory node, to take full
1883 * advantage of fast memory capacity, all recently accessed slow
1884 * memory pages will be migrated to fast memory node without
1885 * considering hot threshold.
1886 */
pgdat_free_space_enough(struct pglist_data * pgdat)1887 static bool pgdat_free_space_enough(struct pglist_data *pgdat)
1888 {
1889 int z;
1890 unsigned long enough_wmark;
1891
1892 enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT,
1893 pgdat->node_present_pages >> 4);
1894 for (z = pgdat->nr_zones - 1; z >= 0; z--) {
1895 struct zone *zone = pgdat->node_zones + z;
1896
1897 if (!populated_zone(zone))
1898 continue;
1899
1900 if (zone_watermark_ok(zone, 0,
1901 promo_wmark_pages(zone) + enough_wmark,
1902 ZONE_MOVABLE, 0))
1903 return true;
1904 }
1905 return false;
1906 }
1907
1908 /*
1909 * For memory tiering mode, when page tables are scanned, the scan
1910 * time will be recorded in struct page in addition to make page
1911 * PROT_NONE for slow memory page. So when the page is accessed, in
1912 * hint page fault handler, the hint page fault latency is calculated
1913 * via,
1914 *
1915 * hint page fault latency = hint page fault time - scan time
1916 *
1917 * The smaller the hint page fault latency, the higher the possibility
1918 * for the page to be hot.
1919 */
numa_hint_fault_latency(struct folio * folio)1920 static int numa_hint_fault_latency(struct folio *folio)
1921 {
1922 int last_time, time;
1923
1924 time = jiffies_to_msecs(jiffies);
1925 last_time = folio_xchg_access_time(folio, time);
1926
1927 return (time - last_time) & PAGE_ACCESS_TIME_MASK;
1928 }
1929
1930 /*
1931 * For memory tiering mode, too high promotion/demotion throughput may
1932 * hurt application latency. So we provide a mechanism to rate limit
1933 * the number of pages that are tried to be promoted.
1934 */
numa_promotion_rate_limit(struct pglist_data * pgdat,unsigned long rate_limit,int nr)1935 static bool numa_promotion_rate_limit(struct pglist_data *pgdat,
1936 unsigned long rate_limit, int nr)
1937 {
1938 unsigned long nr_cand;
1939 unsigned int now, start;
1940
1941 now = jiffies_to_msecs(jiffies);
1942 mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
1943 nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
1944 start = pgdat->nbp_rl_start;
1945 if (now - start > MSEC_PER_SEC &&
1946 cmpxchg(&pgdat->nbp_rl_start, start, now) == start)
1947 pgdat->nbp_rl_nr_cand = nr_cand;
1948 if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit)
1949 return true;
1950 return false;
1951 }
1952
1953 #define NUMA_MIGRATION_ADJUST_STEPS 16
1954
numa_promotion_adjust_threshold(struct pglist_data * pgdat,unsigned long rate_limit,unsigned int ref_th)1955 static void numa_promotion_adjust_threshold(struct pglist_data *pgdat,
1956 unsigned long rate_limit,
1957 unsigned int ref_th)
1958 {
1959 unsigned int now, start, th_period, unit_th, th;
1960 unsigned long nr_cand, ref_cand, diff_cand;
1961
1962 now = jiffies_to_msecs(jiffies);
1963 th_period = sysctl_numa_balancing_scan_period_max;
1964 start = pgdat->nbp_th_start;
1965 if (now - start > th_period &&
1966 cmpxchg(&pgdat->nbp_th_start, start, now) == start) {
1967 ref_cand = rate_limit *
1968 sysctl_numa_balancing_scan_period_max / MSEC_PER_SEC;
1969 nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
1970 diff_cand = nr_cand - pgdat->nbp_th_nr_cand;
1971 unit_th = ref_th * 2 / NUMA_MIGRATION_ADJUST_STEPS;
1972 th = pgdat->nbp_threshold ? : ref_th;
1973 if (diff_cand > ref_cand * 11 / 10)
1974 th = max(th - unit_th, unit_th);
1975 else if (diff_cand < ref_cand * 9 / 10)
1976 th = min(th + unit_th, ref_th * 2);
1977 pgdat->nbp_th_nr_cand = nr_cand;
1978 pgdat->nbp_threshold = th;
1979 }
1980 }
1981
should_numa_migrate_memory(struct task_struct * p,struct folio * folio,int src_nid,int dst_cpu)1982 bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio,
1983 int src_nid, int dst_cpu)
1984 {
1985 struct numa_group *ng = deref_curr_numa_group(p);
1986 int dst_nid = cpu_to_node(dst_cpu);
1987 int last_cpupid, this_cpupid;
1988
1989 /*
1990 * Cannot migrate to memoryless nodes.
1991 */
1992 if (!node_state(dst_nid, N_MEMORY))
1993 return false;
1994
1995 /*
1996 * The pages in slow memory node should be migrated according
1997 * to hot/cold instead of private/shared.
1998 */
1999 if (folio_use_access_time(folio)) {
2000 struct pglist_data *pgdat;
2001 unsigned long rate_limit;
2002 unsigned int latency, th, def_th;
2003 long nr = folio_nr_pages(folio);
2004
2005 pgdat = NODE_DATA(dst_nid);
2006 if (pgdat_free_space_enough(pgdat)) {
2007 /* workload changed, reset hot threshold */
2008 pgdat->nbp_threshold = 0;
2009 mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE_NRL, nr);
2010 return true;
2011 }
2012
2013 def_th = sysctl_numa_balancing_hot_threshold;
2014 rate_limit = MB_TO_PAGES(sysctl_numa_balancing_promote_rate_limit);
2015 numa_promotion_adjust_threshold(pgdat, rate_limit, def_th);
2016
2017 th = pgdat->nbp_threshold ? : def_th;
2018 latency = numa_hint_fault_latency(folio);
2019 if (latency >= th)
2020 return false;
2021
2022 return !numa_promotion_rate_limit(pgdat, rate_limit, nr);
2023 }
2024
2025 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
2026 last_cpupid = folio_xchg_last_cpupid(folio, this_cpupid);
2027
2028 if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
2029 !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid))
2030 return false;
2031
2032 /*
2033 * Allow first faults or private faults to migrate immediately early in
2034 * the lifetime of a task. The magic number 4 is based on waiting for
2035 * two full passes of the "multi-stage node selection" test that is
2036 * executed below.
2037 */
2038 if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
2039 (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
2040 return true;
2041
2042 /*
2043 * Multi-stage node selection is used in conjunction with a periodic
2044 * migration fault to build a temporal task<->page relation. By using
2045 * a two-stage filter we remove short/unlikely relations.
2046 *
2047 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
2048 * a task's usage of a particular page (n_p) per total usage of this
2049 * page (n_t) (in a given time-span) to a probability.
2050 *
2051 * Our periodic faults will sample this probability and getting the
2052 * same result twice in a row, given these samples are fully
2053 * independent, is then given by P(n)^2, provided our sample period
2054 * is sufficiently short compared to the usage pattern.
2055 *
2056 * This quadric squishes small probabilities, making it less likely we
2057 * act on an unlikely task<->page relation.
2058 */
2059 if (!cpupid_pid_unset(last_cpupid) &&
2060 cpupid_to_nid(last_cpupid) != dst_nid)
2061 return false;
2062
2063 /* Always allow migrate on private faults */
2064 if (cpupid_match_pid(p, last_cpupid))
2065 return true;
2066
2067 /* A shared fault, but p->numa_group has not been set up yet. */
2068 if (!ng)
2069 return true;
2070
2071 /*
2072 * Destination node is much more heavily used than the source
2073 * node? Allow migration.
2074 */
2075 if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
2076 ACTIVE_NODE_FRACTION)
2077 return true;
2078
2079 /*
2080 * Distribute memory according to CPU & memory use on each node,
2081 * with 3/4 hysteresis to avoid unnecessary memory migrations:
2082 *
2083 * faults_cpu(dst) 3 faults_cpu(src)
2084 * --------------- * - > ---------------
2085 * faults_mem(dst) 4 faults_mem(src)
2086 */
2087 return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
2088 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
2089 }
2090
/*
 * Load classification of a NUMA node, as observed at the moment a
 * balancing decision is taken.
 */
enum numa_type {
	/* Spare capacity is left: the node can run more tasks. */
	node_has_spare = 0,
	/*
	 * The node is fully used, yet tasks do not compete for more CPU
	 * cycles. Some tasks might nevertheless wait before running.
	 */
	node_fully_busy = 1,
	/*
	 * The node is overloaded: it cannot provide the expected CPU
	 * cycles to all of its tasks.
	 */
	node_overloaded = 2,
};
2108
/*
 * Cached statistics for all CPUs within a node.
 *
 * Filled in by update_numa_stats(), which zeroes the structure and then
 * accumulates the per-CPU values over every CPU of the node.
 */
struct numa_stats {
	/* Sum of cpu_load() over the node's runqueues */
	unsigned long load;
	/* Sum of cpu_runnable() over the node's runqueues */
	unsigned long runnable;
	/* Sum of cpu_util_cfs() over the node's CPUs */
	unsigned long util;
	/* Total compute capacity of CPUs on a node */
	unsigned long compute_capacity;
	/* Sum of rq->cfs.h_nr_runnable over the node's runqueues */
	unsigned int nr_running;
	/* Number of CPUs in the node's cpumask */
	unsigned int weight;
	/* Classification computed by numa_classify() from the fields above */
	enum numa_type node_type;
	/* An idle CPU (preferably an idle core) found on the node, or -1 */
	int idle_cpu;
};
2121
/*
 * Working state for one NUMA placement attempt of a task, shared between
 * task_numa_migrate() and its helpers.
 */
struct task_numa_env {
	/* The task a better placement is searched for */
	struct task_struct *p;

	/* Where the task currently is, and the candidate being evaluated */
	int src_cpu, src_nid;
	int dst_cpu, dst_nid;
	/* Copied from the sd_numa domain's imb_numa_nr */
	int imb_numa_nr;

	/* Node statistics gathered by update_numa_stats() */
	struct numa_stats src_stats, dst_stats;

	/* Percentage handed to numa_classify() when classifying nodes */
	int imbalance_pct;
	/* node_distance() used for the weight calculations */
	int dist;

	/*
	 * Best candidate found so far: the task to swap with (NULL for a
	 * plain move; a reference is held while set), the improvement it
	 * yields and the CPU to move to (-1 while nothing has been found).
	 */
	struct task_struct *best_task;
	long best_imp;
	int best_cpu;
};
2138
2139 static unsigned long cpu_load(struct rq *rq);
2140 static unsigned long cpu_runnable(struct rq *rq);
2141
2142 static inline enum
numa_classify(unsigned int imbalance_pct,struct numa_stats * ns)2143 numa_type numa_classify(unsigned int imbalance_pct,
2144 struct numa_stats *ns)
2145 {
2146 if ((ns->nr_running > ns->weight) &&
2147 (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
2148 ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
2149 return node_overloaded;
2150
2151 if ((ns->nr_running < ns->weight) ||
2152 (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
2153 ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
2154 return node_has_spare;
2155
2156 return node_fully_busy;
2157 }
2158
#ifdef CONFIG_SCHED_SMT
/* Forward declarations of select_idle_sibling helpers */
static inline bool test_idle_cores(int cpu);

/*
 * Return @cpu if no idle core has been recorded yet in @idle_core and @cpu
 * belongs to a fully idle core; otherwise return @idle_core unchanged.
 */
static inline int numa_idle_core(int idle_core, int cpu)
{
	/* Nothing to do without SMT or once an idle core is known. */
	if (!static_branch_likely(&sched_smt_present))
		return idle_core;
	if (idle_core >= 0)
		return idle_core;
	if (!test_idle_cores(cpu))
		return idle_core;

	/*
	 * Prefer whole idle cores over packing HT siblings, which
	 * would trigger load balancing later on.
	 */
	return is_core_idle(cpu) ? cpu : idle_core;
}
#else /* !CONFIG_SCHED_SMT: */
/* Without SMT there are no cores to prefer: keep @idle_core as is. */
static inline int numa_idle_core(int idle_core, int cpu)
{
	return idle_core;
}
#endif /* !CONFIG_SCHED_SMT */
2183
2184 /*
2185 * Gather all necessary information to make NUMA balancing placement
2186 * decisions that are compatible with standard load balancer. This
2187 * borrows code and logic from update_sg_lb_stats but sharing a
2188 * common implementation is impractical.
2189 */
update_numa_stats(struct task_numa_env * env,struct numa_stats * ns,int nid,bool find_idle)2190 static void update_numa_stats(struct task_numa_env *env,
2191 struct numa_stats *ns, int nid,
2192 bool find_idle)
2193 {
2194 int cpu, idle_core = -1;
2195
2196 memset(ns, 0, sizeof(*ns));
2197 ns->idle_cpu = -1;
2198
2199 rcu_read_lock();
2200 for_each_cpu(cpu, cpumask_of_node(nid)) {
2201 struct rq *rq = cpu_rq(cpu);
2202
2203 ns->load += cpu_load(rq);
2204 ns->runnable += cpu_runnable(rq);
2205 ns->util += cpu_util_cfs(cpu);
2206 ns->nr_running += rq->cfs.h_nr_runnable;
2207 ns->compute_capacity += capacity_of(cpu);
2208
2209 if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) {
2210 if (READ_ONCE(rq->numa_migrate_on) ||
2211 !cpumask_test_cpu(cpu, env->p->cpus_ptr))
2212 continue;
2213
2214 if (ns->idle_cpu == -1)
2215 ns->idle_cpu = cpu;
2216
2217 idle_core = numa_idle_core(idle_core, cpu);
2218 }
2219 }
2220 rcu_read_unlock();
2221
2222 ns->weight = cpumask_weight(cpumask_of_node(nid));
2223
2224 ns->node_type = numa_classify(env->imbalance_pct, ns);
2225
2226 if (idle_core >= 0)
2227 ns->idle_cpu = idle_core;
2228 }
2229
task_numa_assign(struct task_numa_env * env,struct task_struct * p,long imp)2230 static void task_numa_assign(struct task_numa_env *env,
2231 struct task_struct *p, long imp)
2232 {
2233 struct rq *rq = cpu_rq(env->dst_cpu);
2234
2235 /* Check if run-queue part of active NUMA balance. */
2236 if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) {
2237 int cpu;
2238 int start = env->dst_cpu;
2239
2240 /* Find alternative idle CPU. */
2241 for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start + 1) {
2242 if (cpu == env->best_cpu || !idle_cpu(cpu) ||
2243 !cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
2244 continue;
2245 }
2246
2247 env->dst_cpu = cpu;
2248 rq = cpu_rq(env->dst_cpu);
2249 if (!xchg(&rq->numa_migrate_on, 1))
2250 goto assign;
2251 }
2252
2253 /* Failed to find an alternative idle CPU */
2254 return;
2255 }
2256
2257 assign:
2258 /*
2259 * Clear previous best_cpu/rq numa-migrate flag, since task now
2260 * found a better CPU to move/swap.
2261 */
2262 if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) {
2263 rq = cpu_rq(env->best_cpu);
2264 WRITE_ONCE(rq->numa_migrate_on, 0);
2265 }
2266
2267 if (env->best_task)
2268 put_task_struct(env->best_task);
2269 if (p)
2270 get_task_struct(p);
2271
2272 env->best_task = p;
2273 env->best_imp = imp;
2274 env->best_cpu = env->dst_cpu;
2275 }
2276
load_too_imbalanced(long src_load,long dst_load,struct task_numa_env * env)2277 static bool load_too_imbalanced(long src_load, long dst_load,
2278 struct task_numa_env *env)
2279 {
2280 long imb, old_imb;
2281 long orig_src_load, orig_dst_load;
2282 long src_capacity, dst_capacity;
2283
2284 /*
2285 * The load is corrected for the CPU capacity available on each node.
2286 *
2287 * src_load dst_load
2288 * ------------ vs ---------
2289 * src_capacity dst_capacity
2290 */
2291 src_capacity = env->src_stats.compute_capacity;
2292 dst_capacity = env->dst_stats.compute_capacity;
2293
2294 imb = abs(dst_load * src_capacity - src_load * dst_capacity);
2295
2296 orig_src_load = env->src_stats.load;
2297 orig_dst_load = env->dst_stats.load;
2298
2299 old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
2300
2301 /* Would this change make things worse? */
2302 return (imb > old_imb);
2303 }
2304
/*
 * Maximum NUMA importance can be 1998 (2*999);
 * SMALLIMP @ 30 would be close to 1998/64.
 * Used to deter task migration.
 */
#define SMALLIMP 30

/*
 * This checks if the overall compute and NUMA accesses of the system would
 * be improved if the source task was migrated to the target dst_cpu, taking
 * into account that it might be best if the task running on the dst_cpu
 * should be exchanged with the source task.
 *
 * @env:	placement state; env->dst_cpu is the CPU being evaluated and
 *		the best_* fields are updated through task_numa_assign()
 * @taskimp:	improvement for the source task, used when it has no group
 * @groupimp:	improvement used when the source task is in a numa_group
 * @maymove:	whether simply moving the source task (no swap) is allowed
 *
 * Return: true if the caller should stop searching further CPUs.
 */
static bool task_numa_compare(struct task_numa_env *env,
			      long taskimp, long groupimp, bool maymove)
{
	struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
	struct rq *dst_rq = cpu_rq(env->dst_cpu);
	long imp = p_ng ? groupimp : taskimp;
	struct task_struct *cur;
	long src_load, dst_load;
	int dist = env->dist;
	long moveimp = imp;
	long load;
	bool stopsearch = false;

	/* The destination run-queue is already claimed by a NUMA migration. */
	if (READ_ONCE(dst_rq->numa_migrate_on))
		return false;

	rcu_read_lock();
	cur = rcu_dereference_all(dst_rq->curr);
	/* Exiting tasks, kernel threads and tasks without mm are no swap candidates. */
	if (cur && ((cur->flags & (PF_EXITING | PF_KTHREAD)) ||
		    !cur->mm))
		cur = NULL;

	/*
	 * Because we have preemption enabled we can get migrated around and
	 * end up trying to select ourselves (current == env->p) as a swap
	 * candidate.
	 */
	if (cur == env->p) {
		stopsearch = true;
		goto unlock;
	}

	/* No swap candidate: only a plain move can be considered. */
	if (!cur) {
		if (maymove && moveimp >= env->best_imp)
			goto assign;
		else
			goto unlock;
	}

	/* Skip this swap candidate if it cannot move to the source CPU. */
	if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
		goto unlock;

	/*
	 * Skip this swap candidate if it is not moving to its preferred
	 * node and the best task is.
	 */
	if (env->best_task &&
	    env->best_task->numa_preferred_nid == env->src_nid &&
	    cur->numa_preferred_nid != env->src_nid) {
		goto unlock;
	}

	/*
	 * "imp" is the fault differential for the source task between the
	 * source and destination node. Calculate the total differential for
	 * the source task and potential destination task. The more negative
	 * the value is, the more remote accesses that would be expected to
	 * be incurred if the tasks were swapped.
	 *
	 * If dst and source tasks are in the same NUMA group, or not
	 * in any group then look only at task weights.
	 */
	cur_ng = rcu_dereference_all(cur->numa_group);
	if (cur_ng == p_ng) {
		/*
		 * Do not swap within a group or between tasks that have
		 * no group if there is spare capacity. Swapping does
		 * not address the load imbalance and helps one task at
		 * the cost of punishing another.
		 */
		if (env->dst_stats.node_type == node_has_spare)
			goto unlock;

		imp = taskimp + task_weight(cur, env->src_nid, dist) -
		      task_weight(cur, env->dst_nid, dist);
		/*
		 * Add some hysteresis to prevent swapping the
		 * tasks within a group over tiny differences.
		 */
		if (cur_ng)
			imp -= imp / 16;
	} else {
		/*
		 * Compare the group weights. If a task is all by itself
		 * (not part of a group), use the task weight instead.
		 */
		if (cur_ng && p_ng)
			imp += group_weight(cur, env->src_nid, dist) -
			       group_weight(cur, env->dst_nid, dist);
		else
			imp += task_weight(cur, env->src_nid, dist) -
			       task_weight(cur, env->dst_nid, dist);
	}

	/* Discourage picking a task already on its preferred node */
	if (cur->numa_preferred_nid == env->dst_nid)
		imp -= imp / 16;

	/*
	 * Encourage picking a task that moves to its preferred node.
	 * This potentially makes imp larger than its maximum of
	 * 1998 (see SMALLIMP and task_weight for why) but in this
	 * case, it does not matter.
	 */
	if (cur->numa_preferred_nid == env->src_nid)
		imp += imp / 8;

	/* A plain move beats both this swap and the best choice so far. */
	if (maymove && moveimp > imp && moveimp > env->best_imp) {
		imp = moveimp;
		cur = NULL;
		goto assign;
	}

	/*
	 * Prefer swapping with a task moving to its preferred node over a
	 * task that is not.
	 */
	if (env->best_task && cur->numa_preferred_nid == env->src_nid &&
	    env->best_task->numa_preferred_nid != env->src_nid) {
		goto assign;
	}

	/*
	 * If the NUMA importance is less than SMALLIMP,
	 * task migration might only result in ping pong
	 * of tasks and also hurt performance due to cache
	 * misses.
	 */
	if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
		goto unlock;

	/*
	 * In the overloaded case, try and keep the load balanced.
	 */
	load = task_h_load(env->p) - task_h_load(cur);
	/* Equal loads: the swap cannot change the balance. */
	if (!load)
		goto assign;

	dst_load = env->dst_stats.load + load;
	src_load = env->src_stats.load - load;

	if (load_too_imbalanced(src_load, dst_load, env))
		goto unlock;

assign:
	/* Evaluate an idle CPU for a task numa move. */
	if (!cur) {
		int cpu = env->dst_stats.idle_cpu;

		/* Nothing cached so current CPU went idle since the search. */
		if (cpu < 0)
			cpu = env->dst_cpu;

		/*
		 * If the CPU is no longer truly idle and the previous best CPU
		 * is, keep using it.
		 */
		if (!idle_cpu(cpu) && env->best_cpu >= 0 &&
		    idle_cpu(env->best_cpu)) {
			cpu = env->best_cpu;
		}

		env->dst_cpu = cpu;
	}

	task_numa_assign(env, cur, imp);

	/*
	 * If a move to idle is allowed because there is capacity or load
	 * balance improves then stop the search. While a better swap
	 * candidate may exist, a search is not free.
	 */
	if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu))
		stopsearch = true;

	/*
	 * If a swap candidate must be identified and the current best task
	 * moves to its preferred node then stop the search.
	 */
	if (!maymove && env->best_task &&
	    env->best_task->numa_preferred_nid == env->src_nid) {
		stopsearch = true;
	}
unlock:
	rcu_read_unlock();

	return stopsearch;
}
2506
task_numa_find_cpu(struct task_numa_env * env,long taskimp,long groupimp)2507 static void task_numa_find_cpu(struct task_numa_env *env,
2508 long taskimp, long groupimp)
2509 {
2510 bool maymove = false;
2511 int cpu;
2512
2513 /*
2514 * If dst node has spare capacity, then check if there is an
2515 * imbalance that would be overruled by the load balancer.
2516 */
2517 if (env->dst_stats.node_type == node_has_spare) {
2518 unsigned int imbalance;
2519 int src_running, dst_running;
2520
2521 /*
2522 * Would movement cause an imbalance? Note that if src has
2523 * more running tasks that the imbalance is ignored as the
2524 * move improves the imbalance from the perspective of the
2525 * CPU load balancer.
2526 * */
2527 src_running = env->src_stats.nr_running - 1;
2528 dst_running = env->dst_stats.nr_running + 1;
2529 imbalance = max(0, dst_running - src_running);
2530 imbalance = adjust_numa_imbalance(imbalance, dst_running,
2531 env->imb_numa_nr);
2532
2533 /* Use idle CPU if there is no imbalance */
2534 if (!imbalance) {
2535 maymove = true;
2536 if (env->dst_stats.idle_cpu >= 0) {
2537 env->dst_cpu = env->dst_stats.idle_cpu;
2538 task_numa_assign(env, NULL, 0);
2539 return;
2540 }
2541 }
2542 } else {
2543 long src_load, dst_load, load;
2544 /*
2545 * If the improvement from just moving env->p direction is better
2546 * than swapping tasks around, check if a move is possible.
2547 */
2548 load = task_h_load(env->p);
2549 dst_load = env->dst_stats.load + load;
2550 src_load = env->src_stats.load - load;
2551 maymove = !load_too_imbalanced(src_load, dst_load, env);
2552 }
2553
2554 /* Skip CPUs if the source task cannot migrate */
2555 for_each_cpu_and(cpu, cpumask_of_node(env->dst_nid), env->p->cpus_ptr) {
2556 env->dst_cpu = cpu;
2557 if (task_numa_compare(env, taskimp, groupimp, maymove))
2558 break;
2559 }
2560 }
2561
/*
 * Try to move @p to a better CPU from a NUMA point of view, either by
 * migrating it or by swapping it with a task running elsewhere.
 *
 * The preferred node is tried first; other nodes with CPUs are considered
 * when nothing was found there or when the task's numa_group is active on
 * more than one node.
 *
 * Return: -EINVAL if the source CPU has no sd_numa domain, -EAGAIN if no
 * better CPU was found, otherwise the result of migrate_task_to() or
 * migrate_swap().
 */
static int task_numa_migrate(struct task_struct *p)
{
	struct task_numa_env env = {
		.p = p,

		.src_cpu = task_cpu(p),
		.src_nid = task_node(p),

		.imbalance_pct = 112,

		.best_task = NULL,
		.best_imp = 0,
		.best_cpu = -1,
	};
	unsigned long taskweight, groupweight;
	struct sched_domain *sd;
	long taskimp, groupimp;
	struct numa_group *ng;
	struct rq *best_rq;
	int nid, ret, dist;

	/*
	 * Pick the lowest SD_NUMA domain, as that would have the smallest
	 * imbalance and would be the first to start moving tasks about.
	 *
	 * And we want to avoid any moving of tasks about, as that would create
	 * random movement of tasks -- counter the numa conditions we're trying
	 * to satisfy here.
	 */
	rcu_read_lock();
	sd = rcu_dereference_all(per_cpu(sd_numa, env.src_cpu));
	if (sd) {
		/* Use half of the domain's imbalance margin. */
		env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
		env.imb_numa_nr = sd->imb_numa_nr;
	}
	rcu_read_unlock();

	/*
	 * Cpusets can break the scheduler domain tree into smaller
	 * balance domains, some of which do not cross NUMA boundaries.
	 * Tasks that are "trapped" in such domains cannot be migrated
	 * elsewhere, so there is no point in (re)trying.
	 */
	/* Below, sd is only tested for NULL; it is not dereferenced. */
	if (unlikely(!sd)) {
		sched_setnuma(p, task_node(p));
		return -EINVAL;
	}

	env.dst_nid = p->numa_preferred_nid;
	dist = env.dist = node_distance(env.src_nid, env.dst_nid);
	taskweight = task_weight(p, env.src_nid, dist);
	groupweight = group_weight(p, env.src_nid, dist);
	update_numa_stats(&env, &env.src_stats, env.src_nid, false);
	taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
	groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
	update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);

	/* Try to find a spot on the preferred nid. */
	task_numa_find_cpu(&env, taskimp, groupimp);

	/*
	 * Look at other nodes in these cases:
	 * - there is no space available on the preferred_nid
	 * - the task is part of a numa_group that is interleaved across
	 *   multiple NUMA nodes; in order to better consolidate the group,
	 *   we need to check other locations.
	 */
	ng = deref_curr_numa_group(p);
	if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
		for_each_node_state(nid, N_CPU) {
			if (nid == env.src_nid || nid == p->numa_preferred_nid)
				continue;

			/*
			 * NOTE(review): the distance is computed against
			 * env.dst_nid, i.e. the destination evaluated last,
			 * not against the loop's nid, which only becomes
			 * env.dst_nid further down. Confirm this is intended.
			 */
			dist = node_distance(env.src_nid, env.dst_nid);
			if (sched_numa_topology_type == NUMA_BACKPLANE &&
						dist != env.dist) {
				taskweight = task_weight(p, env.src_nid, dist);
				groupweight = group_weight(p, env.src_nid, dist);
			}

			/* Only consider nodes where both task and groups benefit */
			taskimp = task_weight(p, nid, dist) - taskweight;
			groupimp = group_weight(p, nid, dist) - groupweight;
			if (taskimp < 0 && groupimp < 0)
				continue;

			env.dist = dist;
			env.dst_nid = nid;
			update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
			task_numa_find_cpu(&env, taskimp, groupimp);
		}
	}

	/*
	 * If the task is part of a workload that spans multiple NUMA nodes,
	 * and is migrating into one of the workload's active nodes, remember
	 * this node as the task's preferred numa node, so the workload can
	 * settle down.
	 * A task that migrated to a second choice node will be better off
	 * trying for a better one later. Do not set the preferred node here.
	 */
	if (ng) {
		if (env.best_cpu == -1)
			nid = env.src_nid;
		else
			nid = cpu_to_node(env.best_cpu);

		if (nid != p->numa_preferred_nid)
			sched_setnuma(p, nid);
	}

	/* No better CPU than the current one was found. */
	if (env.best_cpu == -1) {
		trace_sched_stick_numa(p, env.src_cpu, NULL, -1);
		return -EAGAIN;
	}

	best_rq = cpu_rq(env.best_cpu);
	/* No swap partner: plain migration to the best CPU. */
	if (env.best_task == NULL) {
		ret = migrate_task_to(p, env.best_cpu);
		/* Release the claim taken in task_numa_assign(). */
		WRITE_ONCE(best_rq->numa_migrate_on, 0);
		if (ret != 0)
			trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu);
		return ret;
	}

	ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
	WRITE_ONCE(best_rq->numa_migrate_on, 0);

	if (ret != 0)
		trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu);
	/* Drop the reference taken in task_numa_assign(). */
	put_task_struct(env.best_task);
	return ret;
}
2696
2697 /* Attempt to migrate a task to a CPU on the preferred node. */
numa_migrate_preferred(struct task_struct * p)2698 static void numa_migrate_preferred(struct task_struct *p)
2699 {
2700 unsigned long interval = HZ;
2701
2702 /* This task has no NUMA fault statistics yet */
2703 if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
2704 return;
2705
2706 /* Periodically retry migrating the task to the preferred node */
2707 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
2708 p->numa_migrate_retry = jiffies + interval;
2709
2710 /* Success if task is already running on preferred CPU */
2711 if (task_node(p) == p->numa_preferred_nid)
2712 return;
2713
2714 /* Otherwise, try migrate to a CPU on the preferred node */
2715 task_numa_migrate(p);
2716 }
2717
2718 /*
2719 * Find out how many nodes the workload is actively running on. Do this by
2720 * tracking the nodes from which NUMA hinting faults are triggered. This can
2721 * be different from the set of nodes where the workload's memory is currently
2722 * located.
2723 */
numa_group_count_active_nodes(struct numa_group * numa_group)2724 static void numa_group_count_active_nodes(struct numa_group *numa_group)
2725 {
2726 unsigned long faults, max_faults = 0;
2727 int nid, active_nodes = 0;
2728
2729 for_each_node_state(nid, N_CPU) {
2730 faults = group_faults_cpu(numa_group, nid);
2731 if (faults > max_faults)
2732 max_faults = faults;
2733 }
2734
2735 for_each_node_state(nid, N_CPU) {
2736 faults = group_faults_cpu(numa_group, nid);
2737 if (faults * ACTIVE_NODE_FRACTION > max_faults)
2738 active_nodes++;
2739 }
2740
2741 numa_group->max_faults_cpu = max_faults;
2742 numa_group->active_nodes = active_nodes;
2743 }
2744
/*
 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
 * increments. The more local the fault statistics are, the higher the scan
 * period will be for the next scan window. If local/(local+remote) ratio is
 * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
 * the scan period will decrease. Aim for 70% local accesses.
 */
#define NUMA_PERIOD_SLOTS 10
#define NUMA_PERIOD_THRESHOLD 7

/*
 * Increase the scan period (slow down scanning) if the majority of
 * our memory is already on our local node, or if the majority of
 * the page accesses are shared with other processes.
 * Otherwise, decrease the scan period.
 *
 * @p:		task whose numa_scan_period is adapted
 * @shared:	number of shared faults in the last scan window
 * @private:	number of private faults in the last scan window
 *
 * The local, remote and failed-migration counts are taken from
 * p->numa_faults_locality[1], [0] and [2] respectively. The locality
 * counters are cleared once the new period has been computed; the
 * early-return path below leaves them untouched.
 */
static void update_task_scan_period(struct task_struct *p,
			unsigned long shared, unsigned long private)
{
	unsigned int period_slot;
	int lr_ratio, ps_ratio;
	int diff;

	unsigned long remote = p->numa_faults_locality[0];
	unsigned long local = p->numa_faults_locality[1];

	/*
	 * If there were no recorded hinting faults then either the task is
	 * completely idle or all activity is in areas that are not of interest
	 * to automatic numa balancing. Related to that, if there were failed
	 * migrations then it implies we are migrating too quickly or the local
	 * node is overloaded. In either case, scan slower.
	 */
	if (local + shared == 0 || p->numa_faults_locality[2]) {
		/* Double the period, capped at the task's maximum. */
		p->numa_scan_period = min(p->numa_scan_period_max,
			p->numa_scan_period << 1);

		p->mm->numa_next_scan = jiffies +
			msecs_to_jiffies(p->numa_scan_period);

		return;
	}

	/*
	 * Prepare to scale scan period relative to the current period.
	 *	 == NUMA_PERIOD_THRESHOLD scan period stays the same
	 *       <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
	 *	 >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
	 *
	 * NOTE(review): in the code below a ratio exactly equal to the
	 * threshold yields slot = 1, i.e. the period grows by one slot
	 * rather than staying the same - confirm which is intended.
	 */
	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
	lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
	ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);

	if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
		/*
		 * Most memory accesses are local. There is no need to
		 * do fast NUMA scanning, since memory is already local.
		 *
		 * NOTE(review): this branch is selected by ps_ratio, the
		 * private/(private+shared) ratio, whereas the wording above
		 * talks about locality - confirm which is intended.
		 */
		int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
		if (!slot)
			slot = 1;
		diff = slot * period_slot;
	} else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
		/*
		 * Most memory accesses are shared with other tasks.
		 * There is no point in continuing fast NUMA scanning,
		 * since other tasks may just move the memory elsewhere.
		 *
		 * NOTE(review): this branch is selected by lr_ratio, the
		 * local/(local+remote) ratio, whereas the wording above
		 * talks about shared accesses - confirm which is intended.
		 */
		int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
		if (!slot)
			slot = 1;
		diff = slot * period_slot;
	} else {
		/*
		 * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
		 * yet they are not on the local NUMA node. Speed up
		 * NUMA scanning to get the memory moved over.
		 */
		int ratio = max(lr_ratio, ps_ratio);
		/* Negative diff: the further below the threshold, the faster. */
		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
	}

	p->numa_scan_period = clamp(p->numa_scan_period + diff,
			task_scan_min(p), task_scan_max(p));
	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
}
2831
2832 /*
2833 * Get the fraction of time the task has been running since the last
2834 * NUMA placement cycle. The scheduler keeps similar statistics, but
2835 * decays those on a 32ms period, which is orders of magnitude off
2836 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
2837 * stats only if the task is so new there are no NUMA statistics yet.
2838 */
numa_get_avg_runtime(struct task_struct * p,u64 * period)2839 static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
2840 {
2841 u64 runtime, delta, now;
2842 /* Use the start of this time slice to avoid calculations. */
2843 now = p->se.exec_start;
2844 runtime = p->se.sum_exec_runtime;
2845
2846 if (p->last_task_numa_placement) {
2847 delta = runtime - p->last_sum_exec_runtime;
2848 *period = now - p->last_task_numa_placement;
2849
2850 /* Avoid time going backwards, prevent potential divide error: */
2851 if (unlikely((s64)*period < 0))
2852 *period = 0;
2853 } else {
2854 delta = p->se.avg.load_sum;
2855 *period = LOAD_AVG_MAX;
2856 }
2857
2858 p->last_sum_exec_runtime = runtime;
2859 p->last_task_numa_placement = now;
2860
2861 return delta;
2862 }
2863
/*
 * Determine the preferred nid for a task in a numa_group. This needs to
 * be done in a way that produces consistent results with group_weight,
 * otherwise workloads might not converge.
 *
 * @p:		task whose group statistics are consulted
 * @nid:	candidate node; returned as is on directly connected
 *		topologies and used as the fallback result otherwise
 *
 * Return: the node id to prefer for @p's numa_group.
 */
static int preferred_group_nid(struct task_struct *p, int nid)
{
	nodemask_t nodes;
	int dist;

	/* Direct connections between all NUMA nodes. */
	if (sched_numa_topology_type == NUMA_DIRECT)
		return nid;

	/*
	 * On a system with glueless mesh NUMA topology, group_weight
	 * scores nodes according to the number of NUMA hinting faults on
	 * both the node itself, and on nearby nodes.
	 */
	if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
		unsigned long score, max_score = 0;
		int node, max_node = nid;

		dist = sched_max_numa_distance;

		/* Pick the node with the highest group_weight score. */
		for_each_node_state(node, N_CPU) {
			score = group_weight(p, node, dist);
			if (score > max_score) {
				max_score = score;
				max_node = node;
			}
		}
		return max_node;
	}

	/*
	 * Finding the preferred nid in a system with NUMA backplane
	 * interconnect topology is more involved. The goal is to locate
	 * tasks from numa_groups near each other in the system, and
	 * untangle workloads from different sides of the system. This requires
	 * searching down the hierarchy of node groups, recursively searching
	 * inside the highest scoring group of nodes. The nodemask tricks
	 * keep the complexity of the search down.
	 */
	nodes = node_states[N_CPU];
	for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
		unsigned long max_faults = 0;
		nodemask_t max_group = NODE_MASK_NONE;
		int a, b;

		/* Are there nodes at this distance from each other? */
		if (!find_numa_distance(dist))
			continue;

		for_each_node_mask(a, nodes) {
			unsigned long faults = 0;
			nodemask_t this_group;
			nodes_clear(this_group);

			/* Sum group's NUMA faults; includes a==b case. */
			for_each_node_mask(b, nodes) {
				if (node_distance(a, b) < dist) {
					faults += group_faults(p, b);
					node_set(b, this_group);
					/*
					 * Remove b from the candidates so it
					 * is not counted in another group
					 * during this round.
					 */
					node_clear(b, nodes);
				}
			}

			/* Remember the top group. */
			if (faults > max_faults) {
				max_faults = faults;
				max_group = this_group;
				/*
				 * subtle: at the smallest distance there is
				 * just one node left in each "group", the
				 * winner is the preferred nid.
				 */
				nid = a;
			}
		}
		/* Next round, evaluate the nodes within max_group. */
		/* No faults in any group: keep the nid found so far. */
		if (!max_faults)
			break;
		nodes = max_group;
	}
	return nid;
}
2951
/*
 * Fold the hinting faults gathered since the last scan into @p's decayed
 * per-node statistics (and into its numa_group's statistics, if it has one),
 * then choose the node with the most faults as the preferred node.
 *
 * Returns early, without touching any statistics, when p->mm->numa_scan_seq
 * has not changed since the previous call. If @p belongs to a numa_group,
 * the group's lock is held with IRQs disabled while the counters are updated.
 */
static void task_numa_placement(struct task_struct *p)
	__context_unsafe(/* conditional locking */)
{
	int seq, nid, max_nid = NUMA_NO_NODE;
	unsigned long max_faults = 0;
	unsigned long fault_types[2] = { 0, 0 };
	unsigned long total_faults;
	u64 runtime, period;
	spinlock_t *group_lock = NULL;
	struct numa_group *ng;

	/*
	 * The p->mm->numa_scan_seq field gets updated without
	 * exclusive access. Use READ_ONCE() here to ensure
	 * that the field is read in a single access:
	 */
	seq = READ_ONCE(p->mm->numa_scan_seq);
	if (p->numa_scan_seq == seq)
		return;
	p->numa_scan_seq = seq;
	p->numa_scan_period_max = task_scan_max(p);

	total_faults = p->numa_faults_locality[0] +
		       p->numa_faults_locality[1];
	runtime = numa_get_avg_runtime(p, &period);

	/* If the task is part of a group prevent parallel updates to group stats */
	ng = deref_curr_numa_group(p);
	if (ng) {
		group_lock = &ng->lock;
		spin_lock_irq(group_lock);
	}

	/* Find the node with the highest number of faults */
	for_each_online_node(nid) {
		/* Keep track of the offsets in numa_faults array */
		int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
		unsigned long faults = 0, group_faults = 0;
		int priv;

		for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
			long diff, f_diff, f_weight;

			mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
			membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
			cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
			cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);

			/*
			 * Decay existing window, copy faults since last scan.
			 * diff is "new samples minus half of the old value",
			 * so adding it below halves the old value and adds
			 * the buffered samples in one step.
			 */
			diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
			fault_types[priv] += p->numa_faults[membuf_idx];
			p->numa_faults[membuf_idx] = 0;

			/*
			 * Normalize the faults_from, so all tasks in a group
			 * count according to CPU use, instead of by the raw
			 * number of faults. Tasks with little runtime have
			 * little over-all impact on throughput, and thus their
			 * faults are less important.
			 *
			 * Both divisors have 1 added, which keeps them
			 * non-zero.
			 */
			f_weight = div64_u64(runtime << 16, period + 1);
			f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
				   (total_faults + 1);
			f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
			p->numa_faults[cpubuf_idx] = 0;

			p->numa_faults[mem_idx] += diff;
			p->numa_faults[cpu_idx] += f_diff;
			faults += p->numa_faults[mem_idx];
			p->total_numa_faults += diff;
			if (ng) {
				/*
				 * safe because we can only change our own group
				 *
				 * mem_idx represents the offset for a given
				 * nid and priv in a specific region because it
				 * is at the beginning of the numa_faults array.
				 */
				ng->faults[mem_idx] += diff;
				ng->faults[cpu_idx] += f_diff;
				ng->total_faults += diff;
				group_faults += ng->faults[mem_idx];
			}
		}

		/*
		 * Rank nodes by the group's memory faults when the task is
		 * in a group, otherwise by the task's own memory faults.
		 */
		if (!ng) {
			if (faults > max_faults) {
				max_faults = faults;
				max_nid = nid;
			}
		} else if (group_faults > max_faults) {
			max_faults = group_faults;
			max_nid = nid;
		}
	}

	/* Cannot migrate task to CPU-less node */
	max_nid = numa_nearest_node(max_nid, N_CPU);

	if (ng) {
		numa_group_count_active_nodes(ng);
		/* preferred_group_nid() is called after dropping the lock. */
		spin_unlock_irq(group_lock);
		max_nid = preferred_group_nid(p, max_nid);
	}

	/* Only act on the result if at least one node recorded faults. */
	if (max_faults) {
		/* Set the new preferred node */
		if (max_nid != p->numa_preferred_nid)
			sched_setnuma(p, max_nid);
	}

	/* fault_types[] holds the buffered faults summed per priv index. */
	update_task_scan_period(p, fault_types[0], fault_types[1]);
}
3065
get_numa_group(struct numa_group * grp)3066 static inline int get_numa_group(struct numa_group *grp)
3067 {
3068 return refcount_inc_not_zero(&grp->refcount);
3069 }
3070
put_numa_group(struct numa_group * grp)3071 static inline void put_numa_group(struct numa_group *grp)
3072 {
3073 if (refcount_dec_and_test(&grp->refcount))
3074 kfree_rcu(grp, rcu);
3075 }
3076
/*
 * Make sure @p has a numa_group, then consider moving @p into the group of
 * the task currently running on the CPU encoded in @cpupid.
 *
 * @p:      task whose group membership is being evaluated.
 * @cpupid: encoded cpu/pid value (the caller in this file passes the
 *          faulting page's last_cpupid); the CPU is extracted with
 *          cpupid_to_cpu() and the pid checked with cpupid_match_pid().
 * @flags:  TNF_* fault flags; only TNF_SHARED is examined here.
 * @priv:   in/out: overwritten with !join once the candidate group has
 *          passed the size and tie-break checks; left untouched on the
 *          earlier exits.
 *
 * If @p has no group yet and allocating one fails, the function returns
 * without doing anything.
 */
static void task_numa_group(struct task_struct *p, int cpupid, int flags,
			int *priv)
{
	struct numa_group *grp, *my_grp;
	struct task_struct *tsk;
	bool join = false;
	int cpu = cpupid_to_cpu(cpupid);
	int i;

	/* First time through: give the task a group of its own. */
	if (unlikely(!deref_curr_numa_group(p))) {
		/* Header plus one counter per fault stat per node. */
		unsigned int size = sizeof(struct numa_group) +
				    NR_NUMA_HINT_FAULT_STATS *
				    nr_node_ids * sizeof(unsigned long);

		grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
		if (!grp)
			return;

		refcount_set(&grp->refcount, 1);
		grp->active_nodes = 1;
		grp->max_faults_cpu = 0;
		spin_lock_init(&grp->lock);
		grp->gid = p->pid;

		/* Seed the group statistics with the task's own. */
		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
			grp->faults[i] = p->numa_faults[i];

		grp->total_faults = p->total_numa_faults;

		grp->nr_tasks++;
		rcu_assign_pointer(p->numa_group, grp);
	}

	rcu_read_lock();
	tsk = READ_ONCE(cpu_rq(cpu)->curr);

	/* The task now running on that CPU is not the one named by cpupid. */
	if (!cpupid_match_pid(tsk, cpupid))
		goto no_join;

	grp = rcu_dereference_all(tsk->numa_group);
	if (!grp)
		goto no_join;

	my_grp = deref_curr_numa_group(p);
	if (grp == my_grp)
		goto no_join;

	/*
	 * Only join the other group if it's bigger; if we're the bigger group,
	 * the other task will join us.
	 */
	if (my_grp->nr_tasks > grp->nr_tasks)
		goto no_join;

	/*
	 * Tie-break on the grp address.
	 */
	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
		goto no_join;

	/* Always join threads in the same process. */
	if (tsk->mm == current->mm)
		join = true;

	/* Simple filter to avoid false positives due to PID collisions */
	if (flags & TNF_SHARED)
		join = true;

	/* Update priv based on whether false sharing was detected */
	*priv = !join;

	/* Pin the target group before leaving the RCU read-side section. */
	if (join && !get_numa_group(grp))
		goto no_join;

	rcu_read_unlock();

	if (!join)
		return;

	WARN_ON_ONCE(irqs_disabled());
	double_lock_irq(&my_grp->lock, &grp->lock);

	/* Move this task's statistics from the old group to the new one. */
	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
		my_grp->faults[i] -= p->numa_faults[i];
		grp->faults[i] += p->numa_faults[i];
	}
	my_grp->total_faults -= p->total_numa_faults;
	grp->total_faults += p->total_numa_faults;

	my_grp->nr_tasks--;
	grp->nr_tasks++;

	spin_unlock(&my_grp->lock);
	spin_unlock_irq(&grp->lock);

	rcu_assign_pointer(p->numa_group, grp);

	/* Drop the reference the task held on its previous group. */
	put_numa_group(my_grp);
	return;

no_join:
	rcu_read_unlock();
	return;
}
3181
3182 /*
3183 * Get rid of NUMA statistics associated with a task (either current or dead).
3184 * If @final is set, the task is dead and has reached refcount zero, so we can
3185 * safely free all relevant data structures. Otherwise, there might be
3186 * concurrent reads from places like load balancing and procfs, and we should
3187 * reset the data back to default state without freeing ->numa_faults.
3188 */
task_numa_free(struct task_struct * p,bool final)3189 void task_numa_free(struct task_struct *p, bool final)
3190 {
3191 /* safe: p either is current or is being freed by current */
3192 struct numa_group *grp = rcu_dereference_raw(p->numa_group);
3193 unsigned long *numa_faults = p->numa_faults;
3194 unsigned long flags;
3195 int i;
3196
3197 if (!numa_faults)
3198 return;
3199
3200 if (grp) {
3201 spin_lock_irqsave(&grp->lock, flags);
3202 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
3203 grp->faults[i] -= p->numa_faults[i];
3204 grp->total_faults -= p->total_numa_faults;
3205
3206 grp->nr_tasks--;
3207 spin_unlock_irqrestore(&grp->lock, flags);
3208 RCU_INIT_POINTER(p->numa_group, NULL);
3209 put_numa_group(grp);
3210 }
3211
3212 if (final) {
3213 p->numa_faults = NULL;
3214 kfree(numa_faults);
3215 } else {
3216 p->total_numa_faults = 0;
3217 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
3218 numa_faults[i] = 0;
3219 }
3220 }
3221
/*
 * Got a PROT_NONE fault for a page on @mem_node.
 *
 * @last_cpupid: cpupid value passed in by the fault path; it is compared
 *               against the "no previous access" value and against current's
 *               pid to classify the fault as private or shared.
 * @mem_node:    node of the faulting memory.
 * @pages:       number of pages to account for this fault.
 * @flags:       TNF_* flags describing the fault.
 *
 * Accounts the fault for current in the per-node buffer counters and in
 * numa_faults_locality[]. Nothing is recorded when NUMA balancing is
 * disabled, when current has no mm, when the fault is filtered out by the
 * memory-tiering check, or when the fault buffer cannot be allocated.
 */
void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
{
	struct task_struct *p = current;
	bool migrated = flags & TNF_MIGRATED;
	/* Sampled up front, before any of the calls below run. */
	int cpu_node = task_node(current);
	int local = !!(flags & TNF_FAULT_LOCAL);
	struct numa_group *ng;
	int priv;

	if (!static_branch_likely(&sched_numa_balancing))
		return;

	/* for example, ksmd faulting in a user's mm */
	if (!p->mm)
		return;

	/*
	 * NUMA faults statistics are unnecessary for the slow memory
	 * node for memory tiering mode.
	 */
	if (!node_is_toptier(mem_node) &&
	    (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ||
	     !cpupid_valid(last_cpupid)))
		return;

	/* Allocate buffer to track faults on a per-node basis */
	if (unlikely(!p->numa_faults)) {
		int size = sizeof(*p->numa_faults) *
			   NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;

		p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
		if (!p->numa_faults)
			return;

		p->total_numa_faults = 0;
		memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
	}

	/*
	 * First accesses are treated as private, otherwise consider accesses
	 * to be private if the accessing pid has not changed
	 */
	if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
		priv = 1;
	} else {
		priv = cpupid_match_pid(p, last_cpupid);
		/* Shared fault: task_numa_group() may update priv. */
		if (!priv && !(flags & TNF_NO_GROUP))
			task_numa_group(p, last_cpupid, flags, &priv);
	}

	/*
	 * If a workload spans multiple NUMA nodes, a shared fault that
	 * occurs wholly within the set of nodes that the workload is
	 * actively using should be counted as local. This allows the
	 * scan rate to slow down when a workload has settled down.
	 */
	ng = deref_curr_numa_group(p);
	if (!priv && !local && ng && ng->active_nodes > 1 &&
				numa_is_active_node(cpu_node, ng) &&
				numa_is_active_node(mem_node, ng))
		local = 1;

	/*
	 * Retry to migrate task to preferred node periodically, in case it
	 * previously failed, or the scheduler moved us.
	 */
	if (time_after(jiffies, p->numa_migrate_retry)) {
		task_numa_placement(p);
		numa_migrate_preferred(p);
	}

	if (migrated)
		p->numa_pages_migrated += pages;
	/* Slot 2 of numa_faults_locality[] counts failed migrations. */
	if (flags & TNF_MIGRATE_FAIL)
		p->numa_faults_locality[2] += pages;

	/*
	 * Record the fault in the buffer counters, indexed by memory node and
	 * by the CPU node sampled at entry, and in the locality slot selected
	 * by 'local' (0 or 1).
	 */
	p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
	p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
	p->numa_faults_locality[local] += pages;
}
3305
reset_ptenuma_scan(struct task_struct * p)3306 static void reset_ptenuma_scan(struct task_struct *p)
3307 {
3308 /*
3309 * We only did a read acquisition of the mmap sem, so
3310 * p->mm->numa_scan_seq is written to without exclusive access
3311 * and the update is not guaranteed to be atomic. That's not
3312 * much of an issue though, since this is just used for
3313 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
3314 * expensive, to avoid any form of compiler optimizations:
3315 */
3316 WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
3317 p->mm->numa_scan_offset = 0;
3318 }
3319
vma_is_accessed(struct mm_struct * mm,struct vm_area_struct * vma)3320 static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
3321 {
3322 unsigned long pids;
3323 /*
3324 * Allow unconditional access first two times, so that all the (pages)
3325 * of VMAs get prot_none fault introduced irrespective of accesses.
3326 * This is also done to avoid any side effect of task scanning
3327 * amplifying the unfairness of disjoint set of VMAs' access.
3328 */
3329 if ((READ_ONCE(current->mm->numa_scan_seq) - vma->numab_state->start_scan_seq) < 2)
3330 return true;
3331
3332 pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1];
3333 if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids))
3334 return true;
3335
3336 /*
3337 * Complete a scan that has already started regardless of PID access, or
3338 * some VMAs may never be scanned in multi-threaded applications:
3339 */
3340 if (mm->numa_scan_offset > vma->vm_start) {
3341 trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_IGNORE_PID);
3342 return true;
3343 }
3344
3345 /*
3346 * This vma has not been accessed for a while, and if the number
3347 * the threads in the same process is low, which means no other
3348 * threads can help scan this vma, force a vma scan.
3349 */
3350 if (READ_ONCE(mm->numa_scan_seq) >
3351 (vma->numab_state->prev_scan_seq + get_nr_threads(current)))
3352 return true;
3353
3354 return false;
3355 }
3356
/*
 * Period between resets of a VMA's pids_active tracking: four times the scan
 * delay sysctl. Users pass the value to msecs_to_jiffies(), so it is in
 * milliseconds.
 */
#define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay)
3358
/*
 * The expensive part of numa migration is done from task_work context.
 * Triggered from task_tick_numa().
 *
 * @work: the callback_head embedded in current's task_struct (numa_work).
 *
 * Walks current->mm's VMAs starting at mm->numa_scan_offset and calls
 * change_prot_numa() on eligible ranges, bounded per invocation by
 * sysctl_numa_balancing_scan_size. Returns without scanning when the task is
 * exiting, when cpusets pin memory to a single node, when the next scan is
 * not yet due, when another caller updated mm->numa_next_scan first, when
 * the scan size is zero, or when mmap_read_trylock() fails.
 */
static void task_numa_work(struct callback_head *work)
{
	unsigned long migrate, next_scan, now = jiffies;
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	/* Snapshot used at the end to measure the runtime spent in here. */
	u64 runtime = p->se.sum_exec_runtime;
	struct vm_area_struct *vma;
	unsigned long start, end;
	unsigned long nr_pte_updates = 0;
	long pages, virtpages;
	struct vma_iterator vmi;
	bool vma_pids_skipped;
	bool vma_pids_forced = false;

	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));

	/* Point the work at itself again; task_tick_numa() tests work->next. */
	work->next = work;
	/*
	 * Who cares about NUMA placement when they're dying.
	 *
	 * NOTE: make sure not to dereference p->mm before this check,
	 * exit_task_work() happens _after_ exit_mm() so we could be called
	 * without p->mm even though we still had it when we enqueued this
	 * work.
	 */
	if (p->flags & PF_EXITING)
		return;

	/*
	 * Memory is pinned to only one NUMA node via cpuset.mems, naturally
	 * no page can be migrated.
	 */
	if (cpusets_enabled() && nodes_weight(cpuset_current_mems_allowed) == 1) {
		trace_sched_skip_cpuset_numa(current, &cpuset_current_mems_allowed);
		return;
	}

	if (!mm->numa_next_scan) {
		mm->numa_next_scan = now +
			msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
	}

	/*
	 * Enforce maximal scan/migration frequency..
	 */
	migrate = mm->numa_next_scan;
	if (time_before(now, migrate))
		return;

	if (p->numa_scan_period == 0) {
		p->numa_scan_period_max = task_scan_max(p);
		p->numa_scan_period = task_scan_start(p);
	}

	/* Only the caller whose cmpxchg succeeds goes on to scan. */
	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
	if (!try_cmpxchg(&mm->numa_next_scan, &migrate, next_scan))
		return;

	/*
	 * Delay this task enough that another task of this mm will likely win
	 * the next time around.
	 */
	p->node_stamp += 2 * TICK_NSEC;

	pages = sysctl_numa_balancing_scan_size;
	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
	virtpages = pages * 8;	   /* Scan up to this much virtual space */
	if (!pages)
		return;


	if (!mmap_read_trylock(mm))
		return;

	/*
	 * VMAs are skipped if the current PID has not trapped a fault within
	 * the VMA recently. Allow scanning to be forced if there is no
	 * suitable VMA remaining.
	 */
	vma_pids_skipped = false;

retry_pids:
	start = mm->numa_scan_offset;
	vma_iter_init(&vmi, mm, start);
	vma = vma_next(&vmi);
	if (!vma) {
		/* Nothing at or after the offset: restart from address 0. */
		reset_ptenuma_scan(p);
		start = 0;
		vma_iter_set(&vmi, start);
		vma = vma_next(&vmi);
	}

	for (; vma; vma = vma_next(&vmi)) {
		if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
			is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE);
			continue;
		}

		/*
		 * Shared library pages mapped by multiple processes are not
		 * migrated as it is expected they are cache replicated. Avoid
		 * hinting faults in read-only file-backed mappings or the vDSO
		 * as migrating the pages will be of marginal benefit.
		 */
		if (!vma->vm_mm ||
		    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) {
			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SHARED_RO);
			continue;
		}

		/*
		 * Skip inaccessible VMAs to avoid any confusion between
		 * PROT_NONE and NUMA hinting PTEs
		 */
		if (!vma_is_accessible(vma)) {
			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE);
			continue;
		}

		/* Initialise new per-VMA NUMAB state. */
		if (!vma->numab_state) {
			struct vma_numab_state *ptr;

			ptr = kzalloc_obj(*ptr);
			if (!ptr)
				continue;

			/* Someone else installed a state first: drop ours. */
			if (cmpxchg(&vma->numab_state, NULL, ptr)) {
				kfree(ptr);
				continue;
			}

			vma->numab_state->start_scan_seq = mm->numa_scan_seq;

			vma->numab_state->next_scan = now +
				msecs_to_jiffies(sysctl_numa_balancing_scan_delay);

			/* Reset happens after 4 times scan delay of scan start */
			vma->numab_state->pids_active_reset =  vma->numab_state->next_scan +
				msecs_to_jiffies(VMA_PID_RESET_PERIOD);

			/*
			 * Ensure prev_scan_seq does not match numa_scan_seq,
			 * to prevent VMAs being skipped prematurely on the
			 * first scan:
			 */
			 vma->numab_state->prev_scan_seq = mm->numa_scan_seq - 1;
		}

		/*
		 * Scanning the VMAs of short lived tasks add more overhead. So
		 * delay the scan for new VMAs.
		 */
		if (mm->numa_scan_seq && time_before(jiffies,
						vma->numab_state->next_scan)) {
			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SCAN_DELAY);
			continue;
		}

		/*
		 * RESET access PIDs regularly for old VMAs: slot 1 is copied
		 * into slot 0 and then cleared.
		 */
		if (mm->numa_scan_seq &&
				time_after(jiffies, vma->numab_state->pids_active_reset)) {
			vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset +
				msecs_to_jiffies(VMA_PID_RESET_PERIOD);
			vma->numab_state->pids_active[0] = READ_ONCE(vma->numab_state->pids_active[1]);
			vma->numab_state->pids_active[1] = 0;
		}

		/* Do not rescan VMAs twice within the same sequence. */
		if (vma->numab_state->prev_scan_seq == mm->numa_scan_seq) {
			mm->numa_scan_offset = vma->vm_end;
			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SEQ_COMPLETED);
			continue;
		}

		/*
		 * Do not scan the VMA if task has not accessed it, unless no other
		 * VMA candidate exists.
		 */
		if (!vma_pids_forced && !vma_is_accessed(mm, vma)) {
			vma_pids_skipped = true;
			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE);
			continue;
		}

		do {
			start = max(start, vma->vm_start);
			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
			end = min(end, vma->vm_end);
			nr_pte_updates = change_prot_numa(vma, start, end);

			/*
			 * Try to scan sysctl_numa_balancing_size worth of
			 * hpages that have at least one present PTE that
			 * is not already PTE-numa. If the VMA contains
			 * areas that are unused or already full of prot_numa
			 * PTEs, scan up to virtpages, to skip through those
			 * areas faster.
			 */
			if (nr_pte_updates)
				pages -= (end - start) >> PAGE_SHIFT;
			virtpages -= (end - start) >> PAGE_SHIFT;

			start = end;
			/* Budget exhausted: stop here, 'start' marks the resume point. */
			if (pages <= 0 || virtpages <= 0)
				goto out;

			cond_resched();
		} while (end != vma->vm_end);

		/* VMA scan is complete, do not scan until next sequence. */
		vma->numab_state->prev_scan_seq = mm->numa_scan_seq;

		/*
		 * Only force scan within one VMA at a time, to limit the
		 * cost of scanning a potentially uninteresting VMA.
		 */
		if (vma_pids_forced)
			break;
	}

	/*
	 * If no VMAs are remaining and VMAs were skipped due to the PID
	 * not accessing the VMA previously, then force a scan to ensure
	 * forward progress:
	 */
	if (!vma && !vma_pids_forced && vma_pids_skipped) {
		vma_pids_forced = true;
		goto retry_pids;
	}

out:
	/*
	 * It is possible to reach the end of the VMA list but the last few
	 * VMAs are not guaranteed to be vma_migratable. If they are not, we
	 * would find the !migratable VMA on the next scan but not reset the
	 * scanner to the start so check it now.
	 */
	if (vma)
		mm->numa_scan_offset = start;
	else
		reset_ptenuma_scan(p);
	mmap_read_unlock(mm);

	/*
	 * Make sure tasks use at least 32x as much time to run other code
	 * than they used here, to limit NUMA PTE scanning overhead to 3% max.
	 * Usually update_task_scan_period slows down scanning enough; on an
	 * overloaded system we need to limit overhead on a per task basis.
	 */
	if (unlikely(p->se.sum_exec_runtime != runtime)) {
		u64 diff = p->se.sum_exec_runtime - runtime;
		p->node_stamp += 32 * diff;
	}
}
3619
/*
 * Initialise the NUMA balancing state of a newly created task.
 *
 * @clone_flags: clone flags of the new task; only CLONE_VM is examined.
 * @p:           the new task.
 */
void init_numa_balancing(u64 clone_flags, struct task_struct *p)
{
	struct mm_struct *mm = p->mm;
	int mm_users = 0;

	p->numa_scan_seq = 0;
	if (mm) {
		mm_users = atomic_read(&mm->mm_users);
		/* Sole user of this mm: restart the mm's scan bookkeeping. */
		if (mm_users == 1) {
			mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
			mm->numa_scan_seq = 0;
		}
		p->numa_scan_seq = mm->numa_scan_seq;
	}

	/* Scan timing. */
	p->node_stamp = 0;
	p->numa_scan_period = sysctl_numa_balancing_scan_delay;
	p->numa_migrate_retry = 0;

	/* Protect against double add, see task_tick_numa and task_numa_work */
	p->numa_work.next = &p->numa_work;

	/* Fault statistics and group membership start out empty. */
	p->numa_faults = NULL;
	p->numa_pages_migrated = 0;
	p->total_numa_faults = 0;
	RCU_INIT_POINTER(p->numa_group, NULL);
	p->last_task_numa_placement = 0;
	p->last_sum_exec_runtime = 0;

	init_task_work(&p->numa_work, task_numa_work);

	/* New address space, reset the preferred nid */
	if (!(clone_flags & CLONE_VM)) {
		p->numa_preferred_nid = NUMA_NO_NODE;
		return;
	}

	/*
	 * New thread: the existing numa_preferred_nid is kept (it should have
	 * been copied already by arch_dup_task_struct), but the start of
	 * scanning is staggered.
	 */
	if (!mm)
		return;

	{
		unsigned int delay;

		delay = min_t(unsigned int, task_scan_max(current),
			current->numa_scan_period * mm_users * NSEC_PER_MSEC);
		delay += 2 * TICK_NSEC;
		p->node_stamp = delay;
	}
}
3666
3667 /*
3668 * Drive the periodic memory faults..
3669 */
task_tick_numa(struct rq * rq,struct task_struct * curr)3670 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
3671 {
3672 struct callback_head *work = &curr->numa_work;
3673 u64 period, now;
3674
3675 /*
3676 * We don't care about NUMA placement if we don't have memory.
3677 */
3678 if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
3679 return;
3680
3681 /*
3682 * Using runtime rather than walltime has the dual advantage that
3683 * we (mostly) drive the selection from busy threads and that the
3684 * task needs to have done some actual work before we bother with
3685 * NUMA placement.
3686 */
3687 now = curr->se.sum_exec_runtime;
3688 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
3689
3690 if (now > curr->node_stamp + period) {
3691 if (!curr->node_stamp)
3692 curr->numa_scan_period = task_scan_start(curr);
3693 curr->node_stamp += period;
3694
3695 if (!time_before(jiffies, curr->mm->numa_next_scan))
3696 task_work_add(curr, work, TWA_RESUME);
3697 }
3698 }
3699
update_scan_period(struct task_struct * p,int new_cpu)3700 static void update_scan_period(struct task_struct *p, int new_cpu)
3701 {
3702 int src_nid = cpu_to_node(task_cpu(p));
3703 int dst_nid = cpu_to_node(new_cpu);
3704
3705 if (!static_branch_likely(&sched_numa_balancing))
3706 return;
3707
3708 if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
3709 return;
3710
3711 if (src_nid == dst_nid)
3712 return;
3713
3714 /*
3715 * Allow resets if faults have been trapped before one scan
3716 * has completed. This is most likely due to a new task that
3717 * is pulled cross-node due to wakeups or load balancing.
3718 */
3719 if (p->numa_scan_seq) {
3720 /*
3721 * Avoid scan adjustments if moving to the preferred
3722 * node or if the task was not previously running on
3723 * the preferred node.
3724 */
3725 if (dst_nid == p->numa_preferred_nid ||
3726 (p->numa_preferred_nid != NUMA_NO_NODE &&
3727 src_nid != p->numa_preferred_nid))
3728 return;
3729 }
3730
3731 p->numa_scan_period = task_scan_start(p);
3732 }
3733
3734 #else /* !CONFIG_NUMA_BALANCING: */
3735
/* !CONFIG_NUMA_BALANCING stub: nothing to do on the tick. */
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
}
3739
/* !CONFIG_NUMA_BALANCING stub: no NUMA accounting on enqueue. */
static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
}
3743
/* !CONFIG_NUMA_BALANCING stub: no NUMA accounting on dequeue. */
static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
}
3747
/* !CONFIG_NUMA_BALANCING stub: there is no scan period to update. */
static inline void update_scan_period(struct task_struct *p, int new_cpu)
{
}
3751
3752 #endif /* !CONFIG_NUMA_BALANCING */
3753
3754 static void
account_entity_enqueue(struct cfs_rq * cfs_rq,struct sched_entity * se)3755 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
3756 {
3757 update_load_add(&cfs_rq->load, se->load.weight);
3758 if (entity_is_task(se)) {
3759 struct rq *rq = rq_of(cfs_rq);
3760
3761 account_numa_enqueue(rq, task_of(se));
3762 list_add(&se->group_node, &rq->cfs_tasks);
3763 }
3764 cfs_rq->nr_queued++;
3765 }
3766
3767 static void
account_entity_dequeue(struct cfs_rq * cfs_rq,struct sched_entity * se)3768 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
3769 {
3770 update_load_sub(&cfs_rq->load, se->load.weight);
3771 if (entity_is_task(se)) {
3772 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
3773 list_del_init(&se->group_node);
3774 }
3775 cfs_rq->nr_queued--;
3776 }
3777
/*
 * Signed add and clamp on underflow.
 *
 * @_ptr: pointer to the value to update.
 * @_val: delta to add; it is converted to the signed counterpart of *_ptr's
 *        type, so it may be negative.
 *
 * If a negative delta makes the result come out larger than the old value
 * (i.e. the subtraction wrapped around), the result is clamped to 0.
 *
 * Explicitly do a load-store to ensure the intermediate value never hits
 * memory. This allows lockless observations without ever seeing the negative
 * values.
 */
#define add_positive(_ptr, _val) do {				\
	typeof(_ptr) ptr = (_ptr);				\
	__signed_scalar_typeof(*ptr) val = (_val);		\
	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
								\
	res = var + val;					\
								\
	if (val < 0 && res > var)				\
		res = 0;					\
								\
	WRITE_ONCE(*ptr, res);					\
} while (0)
3797
/*
 * Remove and clamp on negative, from a local variable.
 *
 * @_ptr: pointer to the value to reduce.
 * @_val: amount to remove.
 *
 * The amount actually subtracted is min(*_ptr, _val), evaluated in *_ptr's
 * type, so the result stops at 0 instead of wrapping around.
 *
 * A variant of sub_positive(), which does not use explicit load-store
 * and is thus optimized for local variable updates.
 */
#define lsub_positive(_ptr, _val) do {				\
	typeof(_ptr) ptr = (_ptr);				\
	*ptr -= min_t(typeof(*ptr), *ptr, _val);		\
} while (0)
3808
3809
/*
 * Apply @delta_avg to (sa)->name##_avg and @delta_sum to (sa)->name##_sum
 * using add_positive(), then raise name##_sum to at least
 * name##_avg * PELT_MIN_DIVIDER.
 *
 * Because of rounding, se->util_sum might end up being +1 more than
 * cfs->util_sum. Although this is not a problem by itself, detaching
 * a lot of tasks with the rounding problem between 2 updates of
 * util_avg (~1ms) can make cfs->util_sum become null whereas
 * cfs_util_avg is not.
 *
 * Check that util_sum is still above its lower bound for the new
 * util_avg. Given that period_contrib might have moved since the last
 * sync, we are only sure that util_sum must be above or equal to
 *    util_avg * minimum possible divider
 */
#define __update_sa(sa, name, delta_avg, delta_sum) do {	\
	add_positive(&(sa)->name##_avg, delta_avg);		\
	add_positive(&(sa)->name##_sum, delta_sum);		\
	(sa)->name##_sum = max_t(typeof((sa)->name##_sum),	\
			       (sa)->name##_sum,		\
			       (sa)->name##_avg * PELT_MIN_DIVIDER); \
} while (0)
3829
/*
 * Add @se's load contribution to @cfs_rq->avg: load_avg grows by
 * se->avg.load_avg and load_sum by se_weight(se) * se->avg.load_sum.
 */
static inline void
enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	__update_sa(&cfs_rq->avg, load, se->avg.load_avg,
		    se_weight(se) * se->avg.load_sum);
}
3836
/*
 * Remove @se's load contribution from @cfs_rq->avg by passing the negated
 * deltas to __update_sa().
 */
static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	__update_sa(&cfs_rq->avg, load, -se->avg.load_avg,
		    se_weight(se) * -se->avg.load_sum);
}
3843
/* Forward declaration: reweight_entity() below calls place_entity(). */
static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags);

/*
 * Change the load weight of @se on @cfs_rq to @weight.
 *
 * If @se is queued (se->on_rq), it is first taken out of the runqueue
 * accounting (and out of the tree, unless it is the current entity), its
 * vlag, relative deadline and relative vprot are rescaled from the old weight
 * to the new one, and it is then put back via place_entity(). The entity's
 * load_avg is recomputed for the new weight in either case.
 */
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
			    unsigned long weight)
{
	bool curr = cfs_rq->curr == se;
	bool rel_vprot = false;
	u64 vprot;

	if (se->on_rq) {
		/* commit outstanding execution time */
		update_curr(cfs_rq);
		update_entity_lag(cfs_rq, se);
		/* Keep the deadline relative to vruntime so it can be rescaled below. */
		se->deadline -= se->vruntime;
		se->rel_deadline = 1;
		if (curr && protect_slice(se)) {
			/* Same for vprot; restored after place_entity() below. */
			vprot = se->vprot - se->vruntime;
			rel_vprot = true;
		}

		cfs_rq->nr_queued--;
		if (!curr)
			__dequeue_entity(cfs_rq, se);
		update_load_sub(&cfs_rq->load, se->load.weight);
	}
	dequeue_load_avg(cfs_rq, se);

	/*
	 * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
	 * we need to scale se->vlag when w_i changes.
	 */
	se->vlag = div_s64(se->vlag * se->load.weight, weight);
	if (se->rel_deadline)
		se->deadline = div_s64(se->deadline * se->load.weight, weight);

	if (rel_vprot)
		vprot = div_s64(vprot * se->load.weight, weight);

	update_load_set(&se->load, weight);

	/* Recompute load_avg from load_sum using the new weight. */
	do {
		u32 divider = get_pelt_divider(&se->avg);

		se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
	} while (0);

	enqueue_load_avg(cfs_rq, se);
	if (se->on_rq) {
		place_entity(cfs_rq, se, 0);
		if (rel_vprot)
			se->vprot = se->vruntime + vprot;
		update_load_add(&cfs_rq->load, se->load.weight);
		if (!curr)
			__enqueue_entity(cfs_rq, se);
		cfs_rq->nr_queued++;
	}
}
3901
reweight_task_fair(struct rq * rq,struct task_struct * p,const struct load_weight * lw)3902 static void reweight_task_fair(struct rq *rq, struct task_struct *p,
3903 const struct load_weight *lw)
3904 {
3905 struct sched_entity *se = &p->se;
3906 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3907 struct load_weight *load = &se->load;
3908
3909 reweight_entity(cfs_rq, se, lw->weight);
3910 load->inv_weight = lw->inv_weight;
3911 }
3912
3913 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
3914
3915 #ifdef CONFIG_FAIR_GROUP_SCHED
3916 /*
3917 * All this does is approximate the hierarchical proportion which includes that
3918 * global sum we all love to hate.
3919 *
3920 * That is, the weight of a group entity, is the proportional share of the
3921 * group weight based on the group runqueue weights. That is:
3922 *
3923 * tg->weight * grq->load.weight
3924 * ge->load.weight = ----------------------------- (1)
3925 * \Sum grq->load.weight
3926 *
 * Now, because that sum is prohibitively expensive to compute (been there,
 * done that) we approximate it with this average stuff. The average moves
 * slower and therefore the approximation is cheaper and more stable.
3930 *
3931 * So instead of the above, we substitute:
3932 *
3933 * grq->load.weight -> grq->avg.load_avg (2)
3934 *
3935 * which yields the following:
3936 *
3937 * tg->weight * grq->avg.load_avg
3938 * ge->load.weight = ------------------------------ (3)
3939 * tg->load_avg
3940 *
3941 * Where: tg->load_avg ~= \Sum grq->avg.load_avg
3942 *
3943 * That is shares_avg, and it is right (given the approximation (2)).
3944 *
3945 * The problem with it is that because the average is slow -- it was designed
3946 * to be exactly that of course -- this leads to transients in boundary
3947 * conditions. In specific, the case where the group was idle and we start the
3948 * one task. It takes time for our CPU's grq->avg.load_avg to build up,
3949 * yielding bad latency etc..
3950 *
3951 * Now, in that special case (1) reduces to:
3952 *
3953 * tg->weight * grq->load.weight
3954 * ge->load.weight = ----------------------------- = tg->weight (4)
 *                        grq->load.weight
3956 *
3957 * That is, the sum collapses because all other CPUs are idle; the UP scenario.
3958 *
3959 * So what we do is modify our approximation (3) to approach (4) in the (near)
3960 * UP case, like:
3961 *
3962 * ge->load.weight =
3963 *
3964 * tg->weight * grq->load.weight
3965 * --------------------------------------------------- (5)
3966 * tg->load_avg - grq->avg.load_avg + grq->load.weight
3967 *
3968 * But because grq->load.weight can drop to 0, resulting in a divide by zero,
3969 * we need to use grq->avg.load_avg as its lower bound, which then gives:
3970 *
3971 *
3972 * tg->weight * grq->load.weight
3973 * ge->load.weight = ----------------------------- (6)
3974 * tg_load_avg'
3975 *
3976 * Where:
3977 *
3978 * tg_load_avg' = tg->load_avg - grq->avg.load_avg +
3979 * max(grq->load.weight, grq->avg.load_avg)
3980 *
3981 * And that is shares_weight and is icky. In the (near) UP case it approaches
3982 * (4) while in the normal case it approaches (3). It consistently
3983 * overestimates the ge->load.weight and therefore:
3984 *
3985 * \Sum ge->load.weight >= tg->weight
3986 *
3987 * hence icky!
3988 */
calc_group_shares(struct cfs_rq * cfs_rq)3989 static long calc_group_shares(struct cfs_rq *cfs_rq)
3990 {
3991 long tg_weight, tg_shares, load, shares;
3992 struct task_group *tg = cfs_rq->tg;
3993
3994 tg_shares = READ_ONCE(tg->shares);
3995
3996 load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
3997
3998 tg_weight = atomic_long_read(&tg->load_avg);
3999
4000 /* Ensure tg_weight >= load */
4001 tg_weight -= cfs_rq->tg_load_avg_contrib;
4002 tg_weight += load;
4003
4004 shares = (tg_shares * load);
4005 if (tg_weight)
4006 shares /= tg_weight;
4007
4008 /*
4009 * MIN_SHARES has to be unscaled here to support per-CPU partitioning
4010 * of a group with small tg->shares value. It is a floor value which is
4011 * assigned as a minimum load.weight to the sched_entity representing
4012 * the group on a CPU.
4013 *
4014 * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024
4015 * on an 8-core system with 8 tasks each runnable on one CPU shares has
4016 * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In
4017 * case no task is runnable on a CPU MIN_SHARES=2 should be returned
4018 * instead of 0.
4019 */
4020 return clamp_t(long, shares, MIN_SHARES, tg_shares);
4021 }
4022
4023 /*
4024 * Recomputes the group entity based on the current state of its group
4025 * runqueue.
4026 */
update_cfs_group(struct sched_entity * se)4027 static void update_cfs_group(struct sched_entity *se)
4028 {
4029 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
4030 long shares;
4031
4032 /*
4033 * When a group becomes empty, preserve its weight. This matters for
4034 * DELAY_DEQUEUE.
4035 */
4036 if (!gcfs_rq || !gcfs_rq->load.weight)
4037 return;
4038
4039 shares = calc_group_shares(gcfs_rq);
4040 if (unlikely(se->load.weight != shares))
4041 reweight_entity(cfs_rq_of(se), se, shares);
4042 }
4043
4044 #else /* !CONFIG_FAIR_GROUP_SCHED: */
/* Stub for !CONFIG_FAIR_GROUP_SCHED builds: does nothing. */
static inline void update_cfs_group(struct sched_entity *se)
{
}
4048 #endif /* !CONFIG_FAIR_GROUP_SCHED */
4049
cfs_rq_util_change(struct cfs_rq * cfs_rq,int flags)4050 static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
4051 {
4052 struct rq *rq = rq_of(cfs_rq);
4053
4054 if (&rq->cfs == cfs_rq) {
4055 /*
4056 * There are a few boundary cases this might miss but it should
4057 * get called often enough that that should (hopefully) not be
4058 * a real problem.
4059 *
4060 * It will not get called when we go idle, because the idle
4061 * thread is a different class (!fair), nor will the utilization
4062 * number include things like RT tasks.
4063 *
4064 * As is, the util number is not freq-invariant (we'd have to
4065 * implement arch_scale_freq_capacity() for that).
4066 *
4067 * See cpu_util_cfs().
4068 */
4069 cpufreq_update_util(rq, flags);
4070 }
4071 }
4072
load_avg_is_decayed(struct sched_avg * sa)4073 static inline bool load_avg_is_decayed(struct sched_avg *sa)
4074 {
4075 if (sa->load_sum)
4076 return false;
4077
4078 if (sa->util_sum)
4079 return false;
4080
4081 if (sa->runnable_sum)
4082 return false;
4083
4084 /*
4085 * _avg must be null when _sum are null because _avg = _sum / divider
4086 * Make sure that rounding and/or propagation of PELT values never
4087 * break this.
4088 */
4089 WARN_ON_ONCE(sa->load_avg ||
4090 sa->util_avg ||
4091 sa->runnable_avg);
4092
4093 return true;
4094 }
4095
cfs_rq_last_update_time(struct cfs_rq * cfs_rq)4096 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
4097 {
4098 return u64_u32_load_copy(cfs_rq->avg.last_update_time,
4099 cfs_rq->last_update_time_copy);
4100 }
4101 #ifdef CONFIG_FAIR_GROUP_SCHED
4102 /*
4103 * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list
4104 * immediately before a parent cfs_rq, and cfs_rqs are removed from the list
4105 * bottom-up, we only have to test whether the cfs_rq before us on the list
4106 * is our child.
4107 * If cfs_rq is not on the list, test whether a child needs its to be added to
4108 * connect a branch to the tree * (see list_add_leaf_cfs_rq() for details).
4109 */
child_cfs_rq_on_list(struct cfs_rq * cfs_rq)4110 static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq)
4111 {
4112 struct cfs_rq *prev_cfs_rq;
4113 struct list_head *prev;
4114 struct rq *rq = rq_of(cfs_rq);
4115
4116 if (cfs_rq->on_list) {
4117 prev = cfs_rq->leaf_cfs_rq_list.prev;
4118 } else {
4119 prev = rq->tmp_alone_branch;
4120 }
4121
4122 if (prev == &rq->leaf_cfs_rq_list)
4123 return false;
4124
4125 prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list);
4126
4127 return (prev_cfs_rq->tg->parent == cfs_rq->tg);
4128 }
4129
cfs_rq_is_decayed(struct cfs_rq * cfs_rq)4130 static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
4131 {
4132 if (cfs_rq->load.weight)
4133 return false;
4134
4135 if (!load_avg_is_decayed(&cfs_rq->avg))
4136 return false;
4137
4138 if (child_cfs_rq_on_list(cfs_rq))
4139 return false;
4140
4141 if (cfs_rq->tg_load_avg_contrib)
4142 return false;
4143
4144 return true;
4145 }
4146
4147 /**
4148 * update_tg_load_avg - update the tg's load avg
4149 * @cfs_rq: the cfs_rq whose avg changed
4150 *
4151 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
4152 * However, because tg->load_avg is a global value there are performance
4153 * considerations.
4154 *
4155 * In order to avoid having to look at the other cfs_rq's, we use a
4156 * differential update where we store the last value we propagated. This in
4157 * turn allows skipping updates if the differential is 'small'.
4158 *
4159 * Updating tg's load_avg is necessary before update_cfs_share().
4160 */
update_tg_load_avg(struct cfs_rq * cfs_rq)4161 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
4162 {
4163 long delta;
4164 u64 now;
4165
4166 /*
4167 * No need to update load_avg for root_task_group as it is not used.
4168 */
4169 if (cfs_rq->tg == &root_task_group)
4170 return;
4171
4172 /* rq has been offline and doesn't contribute to the share anymore: */
4173 if (!cpu_active(cpu_of(rq_of(cfs_rq))))
4174 return;
4175
4176 /*
4177 * For migration heavy workloads, access to tg->load_avg can be
4178 * unbound. Limit the update rate to at most once per ms.
4179 */
4180 now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
4181 if (now - cfs_rq->last_update_tg_load_avg < NSEC_PER_MSEC)
4182 return;
4183
4184 delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
4185 if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
4186 atomic_long_add(delta, &cfs_rq->tg->load_avg);
4187 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
4188 cfs_rq->last_update_tg_load_avg = now;
4189 }
4190 }
4191
clear_tg_load_avg(struct cfs_rq * cfs_rq)4192 static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq)
4193 {
4194 long delta;
4195 u64 now;
4196
4197 /*
4198 * No need to update load_avg for root_task_group, as it is not used.
4199 */
4200 if (cfs_rq->tg == &root_task_group)
4201 return;
4202
4203 now = sched_clock_cpu(cpu_of(rq_of(cfs_rq)));
4204 delta = 0 - cfs_rq->tg_load_avg_contrib;
4205 atomic_long_add(delta, &cfs_rq->tg->load_avg);
4206 cfs_rq->tg_load_avg_contrib = 0;
4207 cfs_rq->last_update_tg_load_avg = now;
4208 }
4209
4210 /* CPU offline callback: */
clear_tg_offline_cfs_rqs(struct rq * rq)4211 static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq)
4212 {
4213 struct task_group *tg;
4214
4215 lockdep_assert_rq_held(rq);
4216
4217 /*
4218 * The rq clock has already been updated in
4219 * set_rq_offline(), so we should skip updating
4220 * the rq clock again in unthrottle_cfs_rq().
4221 */
4222 rq_clock_start_loop_update(rq);
4223
4224 rcu_read_lock();
4225 list_for_each_entry_rcu(tg, &task_groups, list) {
4226 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4227
4228 clear_tg_load_avg(cfs_rq);
4229 }
4230 rcu_read_unlock();
4231
4232 rq_clock_stop_loop_update(rq);
4233 }
4234
4235 /*
4236 * Called within set_task_rq() right before setting a task's CPU. The
4237 * caller only guarantees p->pi_lock is held; no other assumptions,
4238 * including the state of rq->lock, should be made.
4239 */
set_task_rq_fair(struct sched_entity * se,struct cfs_rq * prev,struct cfs_rq * next)4240 void set_task_rq_fair(struct sched_entity *se,
4241 struct cfs_rq *prev, struct cfs_rq *next)
4242 {
4243 u64 p_last_update_time;
4244 u64 n_last_update_time;
4245
4246 if (!sched_feat(ATTACH_AGE_LOAD))
4247 return;
4248
4249 /*
4250 * We are supposed to update the task to "current" time, then its up to
4251 * date and ready to go to new CPU/cfs_rq. But we have difficulty in
4252 * getting what current time is, so simply throw away the out-of-date
4253 * time. This will result in the wakee task is less decayed, but giving
4254 * the wakee more load sounds not bad.
4255 */
4256 if (!(se->avg.last_update_time && prev))
4257 return;
4258
4259 p_last_update_time = cfs_rq_last_update_time(prev);
4260 n_last_update_time = cfs_rq_last_update_time(next);
4261
4262 __update_load_avg_blocked_se(p_last_update_time, se);
4263 se->avg.last_update_time = n_last_update_time;
4264 }
4265
4266 /*
4267 * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
4268 * propagate its contribution. The key to this propagation is the invariant
4269 * that for each group:
4270 *
4271 * ge->avg == grq->avg (1)
4272 *
4273 * _IFF_ we look at the pure running and runnable sums. Because they
4274 * represent the very same entity, just at different points in the hierarchy.
4275 *
4276 * Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial
4277 * and simply copies the running/runnable sum over (but still wrong, because
4278 * the group entity and group rq do not have their PELT windows aligned).
4279 *
4280 * However, update_tg_cfs_load() is more complex. So we have:
4281 *
4282 * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2)
4283 *
4284 * And since, like util, the runnable part should be directly transferable,
4285 * the following would _appear_ to be the straight forward approach:
4286 *
4287 * grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg (3)
4288 *
4289 * And per (1) we have:
4290 *
4291 * ge->avg.runnable_avg == grq->avg.runnable_avg
4292 *
4293 * Which gives:
4294 *
4295 * ge->load.weight * grq->avg.load_avg
4296 * ge->avg.load_avg = ----------------------------------- (4)
4297 * grq->load.weight
4298 *
4299 * Except that is wrong!
4300 *
4301 * Because while for entities historical weight is not important and we
4302 * really only care about our future and therefore can consider a pure
4303 * runnable sum, runqueues can NOT do this.
4304 *
4305 * We specifically want runqueues to have a load_avg that includes
4306 * historical weights. Those represent the blocked load, the load we expect
4307 * to (shortly) return to us. This only works by keeping the weights as
4308 * integral part of the sum. We therefore cannot decompose as per (3).
4309 *
4310 * Another reason this doesn't work is that runnable isn't a 0-sum entity.
4311 * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the
4312 * rq itself is runnable anywhere between 2/3 and 1 depending on how the
4313 * runnable section of these tasks overlap (or not). If they were to perfectly
4314 * align the rq as a whole would be runnable 2/3 of the time. If however we
4315 * always have at least 1 runnable task, the rq as a whole is always runnable.
4316 *
4317 * So we'll have to approximate.. :/
4318 *
4319 * Given the constraint:
4320 *
4321 * ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX
4322 *
4323 * We can construct a rule that adds runnable to a rq by assuming minimal
4324 * overlap.
4325 *
4326 * On removal, we'll assume each task is equally runnable; which yields:
4327 *
4328 * grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
4329 *
4330 * XXX: only do this for the part of runnable > running ?
4331 *
4332 */
4333 static inline void
update_tg_cfs_util(struct cfs_rq * cfs_rq,struct sched_entity * se,struct cfs_rq * gcfs_rq)4334 update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
4335 {
4336 long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg;
4337 u32 new_sum, divider;
4338
4339 /* Nothing to update */
4340 if (!delta_avg)
4341 return;
4342
4343 /*
4344 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
4345 * See ___update_load_avg() for details.
4346 */
4347 divider = get_pelt_divider(&cfs_rq->avg);
4348
4349 /* Set new sched_entity's utilization */
4350 se->avg.util_avg = gcfs_rq->avg.util_avg;
4351 new_sum = se->avg.util_avg * divider;
4352 delta_sum = (long)new_sum - (long)se->avg.util_sum;
4353 se->avg.util_sum = new_sum;
4354
4355 /* Update parent cfs_rq utilization */
4356 __update_sa(&cfs_rq->avg, util, delta_avg, delta_sum);
4357 }
4358
4359 static inline void
update_tg_cfs_runnable(struct cfs_rq * cfs_rq,struct sched_entity * se,struct cfs_rq * gcfs_rq)4360 update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
4361 {
4362 long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
4363 u32 new_sum, divider;
4364
4365 /* Nothing to update */
4366 if (!delta_avg)
4367 return;
4368
4369 /*
4370 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
4371 * See ___update_load_avg() for details.
4372 */
4373 divider = get_pelt_divider(&cfs_rq->avg);
4374
4375 /* Set new sched_entity's runnable */
4376 se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
4377 new_sum = se->avg.runnable_avg * divider;
4378 delta_sum = (long)new_sum - (long)se->avg.runnable_sum;
4379 se->avg.runnable_sum = new_sum;
4380
4381 /* Update parent cfs_rq runnable */
4382 __update_sa(&cfs_rq->avg, runnable, delta_avg, delta_sum);
4383 }
4384
/*
 * Fold the runnable sum queued on @gcfs_rq (prop_runnable_sum) into the load
 * of its group entity @se and apply the resulting change to @cfs_rq->avg.
 *
 * The pending sum is consumed (reset to 0) as soon as it is found non-zero,
 * even if the recomputed load_avg turns out to be unchanged, in which case
 * neither @se nor @cfs_rq is written.
 */
static inline void
update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
	long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
	unsigned long load_avg;
	u64 load_sum = 0;
	s64 delta_sum;
	u32 divider;

	/* Nothing pending to propagate */
	if (!runnable_sum)
		return;

	gcfs_rq->prop_runnable_sum = 0;

	/*
	 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
	 * See ___update_load_avg() for details.
	 */
	divider = get_pelt_divider(&cfs_rq->avg);

	/* runnable_sum is non-zero here, so this branch is the positive case */
	if (runnable_sum >= 0) {
		/*
		 * Add runnable; clip at LOAD_AVG_MAX. Reflects that until
		 * the CPU is saturated running == runnable.
		 */
		runnable_sum += se->avg.load_sum;
		runnable_sum = min_t(long, runnable_sum, divider);
	} else {
		/*
		 * Estimate the new unweighted runnable_sum of the gcfs_rq by
		 * assuming all tasks are equally runnable.
		 */
		/* load_sum stays 0 when the group runqueue has no weight */
		if (scale_load_down(gcfs_rq->load.weight)) {
			load_sum = div_u64(gcfs_rq->avg.load_sum,
				scale_load_down(gcfs_rq->load.weight));
		}

		/* But make sure to not inflate se's runnable */
		runnable_sum = min(se->avg.load_sum, load_sum);
	}

	/*
	 * runnable_sum can't be lower than running_sum
	 * Rescale running sum to be in the same range as runnable sum
	 * running_sum is in [0 : LOAD_AVG_MAX <<  SCHED_CAPACITY_SHIFT]
	 * runnable_sum is in [0 : LOAD_AVG_MAX]
	 */
	running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
	runnable_sum = max(runnable_sum, running_sum);

	/* Weighted sum and average the entity would have with that runnable */
	load_sum = se_weight(se) * runnable_sum;
	load_avg = div_u64(load_sum, divider);

	delta_avg = load_avg - se->avg.load_avg;
	if (!delta_avg)
		return;

	/* se->avg.load_sum is unweighted; weight it before taking the delta */
	delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;

	se->avg.load_sum = runnable_sum;
	se->avg.load_avg = load_avg;
	__update_sa(&cfs_rq->avg, load, delta_avg, delta_sum);
}
4448
add_tg_cfs_propagate(struct cfs_rq * cfs_rq,long runnable_sum)4449 static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
4450 {
4451 cfs_rq->propagate = 1;
4452 cfs_rq->prop_runnable_sum += runnable_sum;
4453 }
4454
4455 /* Update task and its cfs_rq load average */
propagate_entity_load_avg(struct sched_entity * se)4456 static inline int propagate_entity_load_avg(struct sched_entity *se)
4457 {
4458 struct cfs_rq *cfs_rq, *gcfs_rq;
4459
4460 if (entity_is_task(se))
4461 return 0;
4462
4463 gcfs_rq = group_cfs_rq(se);
4464 if (!gcfs_rq->propagate)
4465 return 0;
4466
4467 gcfs_rq->propagate = 0;
4468
4469 cfs_rq = cfs_rq_of(se);
4470
4471 add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
4472
4473 update_tg_cfs_util(cfs_rq, se, gcfs_rq);
4474 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
4475 update_tg_cfs_load(cfs_rq, se, gcfs_rq);
4476
4477 trace_pelt_cfs_tp(cfs_rq);
4478 trace_pelt_se_tp(se);
4479
4480 return 1;
4481 }
4482
4483 /*
4484 * Check if we need to update the load and the utilization of a blocked
4485 * group_entity:
4486 */
skip_blocked_update(struct sched_entity * se)4487 static inline bool skip_blocked_update(struct sched_entity *se)
4488 {
4489 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
4490
4491 /*
4492 * If sched_entity still have not zero load or utilization, we have to
4493 * decay it:
4494 */
4495 if (se->avg.load_avg || se->avg.util_avg)
4496 return false;
4497
4498 /*
4499 * If there is a pending propagation, we have to update the load and
4500 * the utilization of the sched_entity:
4501 */
4502 if (gcfs_rq->propagate)
4503 return false;
4504
4505 /*
4506 * Otherwise, the load and the utilization of the sched_entity is
4507 * already zero and there is no pending propagation, so it will be a
4508 * waste of time to try to decay it:
4509 */
4510 return true;
4511 }
4512
4513 #else /* !CONFIG_FAIR_GROUP_SCHED: */
4514
/* !CONFIG_FAIR_GROUP_SCHED stub: does nothing. */
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}

/* !CONFIG_FAIR_GROUP_SCHED stub: does nothing. */
static inline void clear_tg_offline_cfs_rqs(struct rq *rq) {}

/* !CONFIG_FAIR_GROUP_SCHED stub: never propagates, always returns 0. */
static inline int propagate_entity_load_avg(struct sched_entity *se)
{
	return 0;
}

/* !CONFIG_FAIR_GROUP_SCHED stub: the request is dropped. */
static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
4525
4526 #endif /* !CONFIG_FAIR_GROUP_SCHED */
4527
4528 #ifdef CONFIG_NO_HZ_COMMON
/*
 * Age @se's PELT signals up to an estimate of the current PELT time of the
 * cfs_rq it belongs to, computed from values read without taking any lock in
 * this function.
 *
 * Nothing is done when @se's signals are already fully decayed, when the
 * rq's current task is not the idle task, or (CONFIG_CFS_BANDWIDTH) when
 * throttled_pelt_idle reads U64_MAX.
 */
static inline void migrate_se_pelt_lag(struct sched_entity *se)
{
	u64 throttled = 0, now, lut;
	struct cfs_rq *cfs_rq;
	struct rq *rq;
	bool is_idle;

	/* Nothing left to decay */
	if (load_avg_is_decayed(&se->avg))
		return;

	cfs_rq = cfs_rq_of(se);
	rq = rq_of(cfs_rq);

	/* rq->curr is only sampled under RCU; the result may be stale */
	rcu_read_lock();
	is_idle = is_idle_task(rcu_dereference_all(rq->curr));
	rcu_read_unlock();

	/*
	 * The lag estimation comes with a cost we don't want to pay all the
	 * time. Hence, limiting to the case where the source CPU is idle and
	 * we know we are at the greatest risk to have an outdated clock.
	 */
	if (!is_idle)
		return;

	/*
	 * Estimated "now" is: last_update_time + cfs_idle_lag + rq_idle_lag, where:
	 *
	 *   last_update_time (the cfs_rq's last_update_time)
	 *	= cfs_rq_clock_pelt()@cfs_rq_idle
	 *      = rq_clock_pelt()@cfs_rq_idle
	 *        - cfs->throttled_clock_pelt_time@cfs_rq_idle
	 *
	 *   cfs_idle_lag (delta between rq's update and cfs_rq's update)
	 *      = rq_clock_pelt()@rq_idle - rq_clock_pelt()@cfs_rq_idle
	 *
	 *   rq_idle_lag (delta between now and rq's update)
	 *      = sched_clock_cpu() - rq_clock()@rq_idle
	 *
	 * We can then write:
	 *
	 *    now = rq_clock_pelt()@rq_idle - cfs->throttled_clock_pelt_time +
	 *          sched_clock_cpu() - rq_clock()@rq_idle
	 * Where:
	 *      rq_clock_pelt()@rq_idle is rq->clock_pelt_idle
	 *      rq_clock()@rq_idle      is rq->clock_idle
	 *      cfs->throttled_clock_pelt_time@cfs_rq_idle
	 *                              is cfs_rq->throttled_pelt_idle
	 */

#ifdef CONFIG_CFS_BANDWIDTH
	throttled = u64_u32_load(cfs_rq->throttled_pelt_idle);
	/* The clock has been stopped for throttling */
	if (throttled == U64_MAX)
		return;
#endif
	now = u64_u32_load(rq->clock_pelt_idle);
	/*
	 * Paired with _update_idle_rq_clock_pelt(). It ensures at the worst case
	 * is observed the old clock_pelt_idle value and the new clock_idle,
	 * which lead to an underestimation. The opposite would lead to an
	 * overestimation.
	 */
	smp_rmb();
	lut = cfs_rq_last_update_time(cfs_rq);

	/* throttled stays 0 when CONFIG_CFS_BANDWIDTH is not set */
	now -= throttled;
	if (now < lut)
		/*
		 * cfs_rq->avg.last_update_time is more recent than our
		 * estimation, let's use it.
		 */
		now = lut;
	else
		now += sched_clock_cpu(cpu_of(rq)) - u64_u32_load(rq->clock_idle);

	__update_load_avg_blocked_se(now, se);
}
4607 #else /* !CONFIG_NO_HZ_COMMON: */
/*
 * !CONFIG_NO_HZ_COMMON stub: does nothing. Declared inline to match the
 * CONFIG_NO_HZ_COMMON definition and the other config stubs in this file.
 */
static inline void migrate_se_pelt_lag(struct sched_entity *se) {}
4609 #endif /* !CONFIG_NO_HZ_COMMON */
4610
/**
 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
 * @now: current time, as per cfs_rq_clock_pelt()
 * @cfs_rq: cfs_rq to update
 *
 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
 * avg. The immediate corollary is that all (fair) tasks must be attached.
 *
 * cfs_rq->avg is used for task_h_load() and calc_group_shares() for example.
 *
 * Entities queued for removal through cfs_rq->removed are subtracted first;
 * the regular PELT update of the cfs_rq follows.
 *
 * Return: true if the load decayed or we removed load.
 *
 * Since both these conditions indicate a changed cfs_rq->avg.load we should
 * call update_tg_load_avg() when this function returns true.
 */
static inline int
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
	unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
	struct sched_avg *sa = &cfs_rq->avg;
	int decayed = 0;

	/* removed.nr is checked without the lock; the sums are taken under it */
	if (cfs_rq->removed.nr) {
		unsigned long r;
		u32 divider = get_pelt_divider(&cfs_rq->avg);

		/*
		 * The locals start at 0, so each swap() grabs the accumulated
		 * value and leaves 0 behind in cfs_rq->removed.
		 */
		raw_spin_lock(&cfs_rq->removed.lock);
		swap(cfs_rq->removed.util_avg, removed_util);
		swap(cfs_rq->removed.load_avg, removed_load);
		swap(cfs_rq->removed.runnable_avg, removed_runnable);
		cfs_rq->removed.nr = 0;
		raw_spin_unlock(&cfs_rq->removed.lock);

		/* Each _sum delta is approximated as the removed _avg * divider */
		r = removed_load;
		__update_sa(sa, load, -r, -r*divider);

		r = removed_util;
		__update_sa(sa, util, -r, -r*divider);

		r = removed_runnable;
		__update_sa(sa, runnable, -r, -r*divider);

		/*
		 * removed_runnable is the unweighted version of removed_load so we
		 * can use it to estimate removed_load_sum.
		 *
		 * Note: the cast and the negation bind tighter than >>, so it is
		 * the negated value that gets shifted.
		 */
		add_tg_cfs_propagate(cfs_rq,
			-(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT);

		decayed = 1;
	}

	decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
	/*
	 * Store last_update_time together with its last_update_time_copy
	 * companion, the pair read back by cfs_rq_last_update_time().
	 */
	u64_u32_store_copy(sa->last_update_time,
			   cfs_rq->last_update_time_copy,
			   sa->last_update_time);
	return decayed;
}
4669
4670 /**
4671 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
4672 * @cfs_rq: cfs_rq to attach to
4673 * @se: sched_entity to attach
4674 *
4675 * Must call update_cfs_rq_load_avg() before this, since we rely on
4676 * cfs_rq->avg.last_update_time being current.
4677 */
attach_entity_load_avg(struct cfs_rq * cfs_rq,struct sched_entity * se)4678 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
4679 {
4680 /*
4681 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
4682 * See ___update_load_avg() for details.
4683 */
4684 u32 divider = get_pelt_divider(&cfs_rq->avg);
4685
4686 /*
4687 * When we attach the @se to the @cfs_rq, we must align the decay
4688 * window because without that, really weird and wonderful things can
4689 * happen.
4690 *
4691 * XXX illustrate
4692 */
4693 se->avg.last_update_time = cfs_rq->avg.last_update_time;
4694 se->avg.period_contrib = cfs_rq->avg.period_contrib;
4695
4696 /*
4697 * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
4698 * period_contrib. This isn't strictly correct, but since we're
4699 * entirely outside of the PELT hierarchy, nobody cares if we truncate
4700 * _sum a little.
4701 */
4702 se->avg.util_sum = se->avg.util_avg * divider;
4703
4704 se->avg.runnable_sum = se->avg.runnable_avg * divider;
4705
4706 se->avg.load_sum = se->avg.load_avg * divider;
4707 if (se_weight(se) < se->avg.load_sum)
4708 se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se));
4709 else
4710 se->avg.load_sum = 1;
4711
4712 enqueue_load_avg(cfs_rq, se);
4713 cfs_rq->avg.util_avg += se->avg.util_avg;
4714 cfs_rq->avg.util_sum += se->avg.util_sum;
4715 cfs_rq->avg.runnable_avg += se->avg.runnable_avg;
4716 cfs_rq->avg.runnable_sum += se->avg.runnable_sum;
4717
4718 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
4719
4720 cfs_rq_util_change(cfs_rq, 0);
4721
4722 trace_pelt_cfs_tp(cfs_rq);
4723 }
4724
4725 /**
4726 * detach_entity_load_avg - detach this entity from its cfs_rq load avg
4727 * @cfs_rq: cfs_rq to detach from
4728 * @se: sched_entity to detach
4729 *
4730 * Must call update_cfs_rq_load_avg() before this, since we rely on
4731 * cfs_rq->avg.last_update_time being current.
4732 */
detach_entity_load_avg(struct cfs_rq * cfs_rq,struct sched_entity * se)4733 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
4734 {
4735 dequeue_load_avg(cfs_rq, se);
4736 __update_sa(&cfs_rq->avg, util, -se->avg.util_avg, -se->avg.util_sum);
4737 __update_sa(&cfs_rq->avg, runnable, -se->avg.runnable_avg, -se->avg.runnable_sum);
4738
4739 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
4740
4741 cfs_rq_util_change(cfs_rq, 0);
4742
4743 trace_pelt_cfs_tp(cfs_rq);
4744 }
4745
/*
 * Optional action to be done while updating the load average
 * (bit flags for the @flags argument of update_load_avg()).
 */
/* Also call update_tg_load_avg() when the update reports a decay */
#define UPDATE_TG	0x1
/* Do not age the entity itself (skips __update_load_avg_se()) */
#define SKIP_AGE_LOAD	0x2
/* Attach the entity to the cfs_rq if it has no last_update_time */
#define DO_ATTACH	0x4
/* Detach the entity from the cfs_rq */
#define DO_DETACH	0x8
4753
4754 /* Update task and its cfs_rq load average */
update_load_avg(struct cfs_rq * cfs_rq,struct sched_entity * se,int flags)4755 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4756 {
4757 u64 now = cfs_rq_clock_pelt(cfs_rq);
4758 int decayed;
4759
4760 /*
4761 * Track task load average for carrying it to new CPU after migrated, and
4762 * track group sched_entity load average for task_h_load calculation in migration
4763 */
4764 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
4765 __update_load_avg_se(now, cfs_rq, se);
4766
4767 decayed = update_cfs_rq_load_avg(now, cfs_rq);
4768 decayed |= propagate_entity_load_avg(se);
4769
4770 if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
4771
4772 /*
4773 * DO_ATTACH means we're here from enqueue_entity().
4774 * !last_update_time means we've passed through
4775 * migrate_task_rq_fair() indicating we migrated.
4776 *
4777 * IOW we're enqueueing a task on a new CPU.
4778 */
4779 attach_entity_load_avg(cfs_rq, se);
4780 update_tg_load_avg(cfs_rq);
4781
4782 } else if (flags & DO_DETACH) {
4783 /*
4784 * DO_DETACH means we're here from dequeue_entity()
4785 * and we are migrating task out of the CPU.
4786 */
4787 detach_entity_load_avg(cfs_rq, se);
4788 update_tg_load_avg(cfs_rq);
4789 } else if (decayed) {
4790 cfs_rq_util_change(cfs_rq, 0);
4791
4792 if (flags & UPDATE_TG)
4793 update_tg_load_avg(cfs_rq);
4794 }
4795 }
4796
4797 /*
4798 * Synchronize entity load avg of dequeued entity without locking
4799 * the previous rq.
4800 */
sync_entity_load_avg(struct sched_entity * se)4801 static void sync_entity_load_avg(struct sched_entity *se)
4802 {
4803 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4804 u64 last_update_time;
4805
4806 last_update_time = cfs_rq_last_update_time(cfs_rq);
4807 __update_load_avg_blocked_se(last_update_time, se);
4808 }
4809
4810 /*
4811 * Task first catches up with cfs_rq, and then subtract
4812 * itself from the cfs_rq (task must be off the queue now).
4813 */
remove_entity_load_avg(struct sched_entity * se)4814 static void remove_entity_load_avg(struct sched_entity *se)
4815 {
4816 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4817 unsigned long flags;
4818
4819 /*
4820 * tasks cannot exit without having gone through wake_up_new_task() ->
4821 * enqueue_task_fair() which will have added things to the cfs_rq,
4822 * so we can remove unconditionally.
4823 */
4824
4825 sync_entity_load_avg(se);
4826
4827 raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
4828 ++cfs_rq->removed.nr;
4829 cfs_rq->removed.util_avg += se->avg.util_avg;
4830 cfs_rq->removed.load_avg += se->avg.load_avg;
4831 cfs_rq->removed.runnable_avg += se->avg.runnable_avg;
4832 raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
4833 }
4834
cfs_rq_runnable_avg(struct cfs_rq * cfs_rq)4835 static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq)
4836 {
4837 return cfs_rq->avg.runnable_avg;
4838 }
4839
cfs_rq_load_avg(struct cfs_rq * cfs_rq)4840 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
4841 {
4842 return cfs_rq->avg.load_avg;
4843 }
4844
4845 static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
4846 __must_hold(__rq_lockp(this_rq));
4847
task_util(struct task_struct * p)4848 static inline unsigned long task_util(struct task_struct *p)
4849 {
4850 return READ_ONCE(p->se.avg.util_avg);
4851 }
4852
task_runnable(struct task_struct * p)4853 static inline unsigned long task_runnable(struct task_struct *p)
4854 {
4855 return READ_ONCE(p->se.avg.runnable_avg);
4856 }
4857
_task_util_est(struct task_struct * p)4858 static inline unsigned long _task_util_est(struct task_struct *p)
4859 {
4860 return READ_ONCE(p->se.avg.util_est) & ~UTIL_AVG_UNCHANGED;
4861 }
4862
/* Return the larger of task @p's util_avg and its estimated utilization. */
static inline unsigned long task_util_est(struct task_struct *p)
{
	unsigned long util = task_util(p);
	unsigned long est = _task_util_est(p);

	return max(util, est);
}
4867
util_est_enqueue(struct cfs_rq * cfs_rq,struct task_struct * p)4868 static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
4869 struct task_struct *p)
4870 {
4871 unsigned int enqueued;
4872
4873 if (!sched_feat(UTIL_EST))
4874 return;
4875
4876 /* Update root cfs_rq's estimated utilization */
4877 enqueued = cfs_rq->avg.util_est;
4878 enqueued += _task_util_est(p);
4879 WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
4880
4881 trace_sched_util_est_cfs_tp(cfs_rq);
4882 }
4883
util_est_dequeue(struct cfs_rq * cfs_rq,struct task_struct * p)4884 static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
4885 struct task_struct *p)
4886 {
4887 unsigned int enqueued;
4888
4889 if (!sched_feat(UTIL_EST))
4890 return;
4891
4892 /* Update root cfs_rq's estimated utilization */
4893 enqueued = cfs_rq->avg.util_est;
4894 enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
4895 WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
4896
4897 trace_sched_util_est_cfs_tp(cfs_rq);
4898 }
4899
4900 #define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
4901
/*
 * util_est_update - refresh @p's utilization estimate at dequeue time
 * @cfs_rq: runqueue the task is leaving (not read by this function)
 * @p: task whose se.avg.util_est gets updated
 * @task_sleep: true when the dequeue completes an activation
 *
 * Nothing happens when the UTIL_EST feature is off, when the task has not
 * completed an activation (e.g. it is being migrated), or when the stored
 * estimate still carries UTIL_AVG_UNCHANGED, meaning the PELT values have
 * not changed since enqueue time.
 *
 * Otherwise the estimate is recomputed, flagged UTIL_AVG_UNCHANGED again,
 * written back with WRITE_ONCE() and traced.
 */
static inline void util_est_update(struct cfs_rq *cfs_rq,
				   struct task_struct *p,
				   bool task_sleep)
{
	unsigned int est, util_now, drop;

	if (!sched_feat(UTIL_EST) || !task_sleep)
		return;

	/* Current estimate; bail out if PELT did not move since enqueue. */
	est = READ_ONCE(p->se.avg.util_est);
	if (est & UTIL_AVG_UNCHANGED)
		return;

	/* Utilization at dequeue. */
	util_now = task_util(p);

	if (est <= util_now) {
		/*
		 * Utilization went up (or stayed put): reset the estimate,
		 * the moving average only smooths utilization decreases.
		 */
		est = util_now;
	} else {
		drop = est - util_now;

		/*
		 * Fold the new sample in only when both hold:
		 *  - the estimate is not already within ~1% of the last
		 *    activation value, and
		 *  - the task got all the CPU time it wanted (util is not
		 *    lagging behind runnable), to avoid underestimating it.
		 *
		 * The update is an Exponential Weighted Moving Average:
		 *
		 *  ewma(t) = w *  task_util(p) + (1-w) * ewma(t-1)
		 *          = w *  task_util(p) +         ewma(t-1)  - w * ewma(t-1)
		 *          = w * (task_util(p) -         ewma(t-1)) +     ewma(t-1)
		 *          = w * (      -last_ewma_diff           ) +     ewma(t-1)
		 *          = w * (-last_ewma_diff +  ewma(t-1) / w)
		 *
		 * where 'w', the weight of new samples, is 0.25, i.e. w = 1/4
		 * ( >>= UTIL_EST_WEIGHT_SHIFT).
		 */
		if (drop >= UTIL_EST_MARGIN &&
		    (util_now + UTIL_EST_MARGIN) >= task_runnable(p)) {
			est <<= UTIL_EST_WEIGHT_SHIFT;
			est -= drop;
			est >>= UTIL_EST_WEIGHT_SHIFT;
		}
	}

	est |= UTIL_AVG_UNCHANGED;
	WRITE_ONCE(p->se.avg.util_est, est);

	trace_sched_util_est_se_tp(&p->se);
}
4981
/*
 * Return @cpu's arch-scaled capacity reduced by the larger of its HW load
 * average and its cpufreq pressure.
 */
static inline unsigned long get_actual_cpu_capacity(int cpu)
{
	const unsigned long full = arch_scale_cpu_capacity(cpu);

	return full - max(hw_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu));
}
4990
util_fits_cpu(unsigned long util,unsigned long uclamp_min,unsigned long uclamp_max,int cpu)4991 static inline int util_fits_cpu(unsigned long util,
4992 unsigned long uclamp_min,
4993 unsigned long uclamp_max,
4994 int cpu)
4995 {
4996 unsigned long capacity = capacity_of(cpu);
4997 unsigned long capacity_orig;
4998 bool fits, uclamp_max_fits;
4999
5000 /*
5001 * Check if the real util fits without any uclamp boost/cap applied.
5002 */
5003 fits = fits_capacity(util, capacity);
5004
5005 if (!uclamp_is_used())
5006 return fits;
5007
5008 /*
5009 * We must use arch_scale_cpu_capacity() for comparing against uclamp_min and
5010 * uclamp_max. We only care about capacity pressure (by using
5011 * capacity_of()) for comparing against the real util.
5012 *
5013 * If a task is boosted to 1024 for example, we don't want a tiny
5014 * pressure to skew the check whether it fits a CPU or not.
5015 *
5016 * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it
5017 * should fit a little cpu even if there's some pressure.
5018 *
5019 * Only exception is for HW or cpufreq pressure since it has a direct impact
5020 * on available OPP of the system.
5021 *
5022 * We honour it for uclamp_min only as a drop in performance level
5023 * could result in not getting the requested minimum performance level.
5024 *
5025 * For uclamp_max, we can tolerate a drop in performance level as the
5026 * goal is to cap the task. So it's okay if it's getting less.
5027 */
5028 capacity_orig = arch_scale_cpu_capacity(cpu);
5029
5030 /*
5031 * We want to force a task to fit a cpu as implied by uclamp_max.
5032 * But we do have some corner cases to cater for..
5033 *
5034 *
5035 * C=z
5036 * | ___
5037 * | C=y | |
5038 * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
5039 * | C=x | | | |
5040 * | ___ | | | |
5041 * | | | | | | | (util somewhere in this region)
5042 * | | | | | | |
5043 * | | | | | | |
5044 * +----------------------------------------
5045 * CPU0 CPU1 CPU2
5046 *
5047 * In the above example if a task is capped to a specific performance
5048 * point, y, then when:
5049 *
5050 * * util = 80% of x then it does not fit on CPU0 and should migrate
5051 * to CPU1
5052 * * util = 80% of y then it is forced to fit on CPU1 to honour
5053 * uclamp_max request.
5054 *
5055 * which is what we're enforcing here. A task always fits if
5056 * uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig,
5057 * the normal upmigration rules should withhold still.
5058 *
5059 * Only exception is when we are on max capacity, then we need to be
5060 * careful not to block overutilized state. This is so because:
5061 *
5062 * 1. There's no concept of capping at max_capacity! We can't go
5063 * beyond this performance level anyway.
5064 * 2. The system is being saturated when we're operating near
5065 * max capacity, it doesn't make sense to block overutilized.
5066 */
5067 uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
5068 uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
5069 fits = fits || uclamp_max_fits;
5070
5071 /*
5072 *
5073 * C=z
5074 * | ___ (region a, capped, util >= uclamp_max)
5075 * | C=y | |
5076 * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
5077 * | C=x | | | |
5078 * | ___ | | | | (region b, uclamp_min <= util <= uclamp_max)
5079 * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min
5080 * | | | | | | |
5081 * | | | | | | | (region c, boosted, util < uclamp_min)
5082 * +----------------------------------------
5083 * CPU0 CPU1 CPU2
5084 *
5085 * a) If util > uclamp_max, then we're capped, we don't care about
5086 * actual fitness value here. We only care if uclamp_max fits
5087 * capacity without taking margin/pressure into account.
5088 * See comment above.
5089 *
5090 * b) If uclamp_min <= util <= uclamp_max, then the normal
5091 * fits_capacity() rules apply. Except we need to ensure that we
5092 * enforce we remain within uclamp_max, see comment above.
5093 *
5094 * c) If util < uclamp_min, then we are boosted. Same as (b) but we
5095 * need to take into account the boosted value fits the CPU without
5096 * taking margin/pressure into account.
5097 *
5098 * Cases (a) and (b) are handled in the 'fits' variable already. We
5099 * just need to consider an extra check for case (c) after ensuring we
5100 * handle the case uclamp_min > uclamp_max.
5101 */
5102 uclamp_min = min(uclamp_min, uclamp_max);
5103 if (fits && (util < uclamp_min) &&
5104 (uclamp_min > get_actual_cpu_capacity(cpu)))
5105 return -1;
5106
5107 return fits;
5108 }
5109
task_fits_cpu(struct task_struct * p,int cpu)5110 static inline int task_fits_cpu(struct task_struct *p, int cpu)
5111 {
5112 unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
5113 unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
5114 unsigned long util = task_util_est(p);
5115 /*
5116 * Return true only if the cpu fully fits the task requirements, which
5117 * include the utilization but also the performance hints.
5118 */
5119 return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0);
5120 }
5121
update_misfit_status(struct task_struct * p,struct rq * rq)5122 static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
5123 {
5124 int cpu = cpu_of(rq);
5125
5126 if (!sched_asym_cpucap_active())
5127 return;
5128
5129 /*
5130 * Affinity allows us to go somewhere higher? Or are we on biggest
5131 * available CPU already? Or do we fit into this CPU ?
5132 */
5133 if (!p || (p->nr_cpus_allowed == 1) ||
5134 (arch_scale_cpu_capacity(cpu) == p->max_allowed_capacity) ||
5135 task_fits_cpu(p, cpu)) {
5136
5137 rq->misfit_task_load = 0;
5138 return;
5139 }
5140
5141 /*
5142 * Make sure that misfit_task_load will not be null even if
5143 * task_h_load() returns 0.
5144 */
5145 rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
5146 }
5147
__setparam_fair(struct task_struct * p,const struct sched_attr * attr)5148 void __setparam_fair(struct task_struct *p, const struct sched_attr *attr)
5149 {
5150 struct sched_entity *se = &p->se;
5151
5152 p->static_prio = NICE_TO_PRIO(attr->sched_nice);
5153 if (attr->sched_runtime) {
5154 se->custom_slice = 1;
5155 se->slice = clamp_t(u64, attr->sched_runtime,
5156 NSEC_PER_MSEC/10, /* HZ=1000 * 10 */
5157 NSEC_PER_MSEC*100); /* HZ=100 / 10 */
5158 } else {
5159 se->custom_slice = 0;
5160 se->slice = sysctl_sched_base_slice;
5161 }
5162 }
5163
5164 static void
place_entity(struct cfs_rq * cfs_rq,struct sched_entity * se,int flags)5165 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
5166 {
5167 u64 vslice, vruntime = avg_vruntime(cfs_rq);
5168 s64 lag = 0;
5169
5170 if (!se->custom_slice)
5171 se->slice = sysctl_sched_base_slice;
5172 vslice = calc_delta_fair(se->slice, se);
5173
5174 /*
5175 * Due to how V is constructed as the weighted average of entities,
5176 * adding tasks with positive lag, or removing tasks with negative lag
5177 * will move 'time' backwards, this can screw around with the lag of
5178 * other tasks.
5179 *
5180 * EEVDF: placement strategy #1 / #2
5181 */
5182 if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
5183 struct sched_entity *curr = cfs_rq->curr;
5184 unsigned long load;
5185
5186 lag = se->vlag;
5187
5188 /*
5189 * If we want to place a task and preserve lag, we have to
5190 * consider the effect of the new entity on the weighted
5191 * average and compensate for this, otherwise lag can quickly
5192 * evaporate.
5193 *
5194 * Lag is defined as:
5195 *
5196 * lag_i = S - s_i = w_i * (V - v_i)
5197 *
5198 * To avoid the 'w_i' term all over the place, we only track
5199 * the virtual lag:
5200 *
5201 * vl_i = V - v_i <=> v_i = V - vl_i
5202 *
5203 * And we take V to be the weighted average of all v:
5204 *
5205 * V = (\Sum w_j*v_j) / W
5206 *
5207 * Where W is: \Sum w_j
5208 *
5209 * Then, the weighted average after adding an entity with lag
5210 * vl_i is given by:
5211 *
5212 * V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
5213 * = (W*V + w_i*(V - vl_i)) / (W + w_i)
5214 * = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
5215 * = (V*(W + w_i) - w_i*vl_i) / (W + w_i)
5216 * = V - w_i*vl_i / (W + w_i)
5217 *
5218 * And the actual lag after adding an entity with vl_i is:
5219 *
5220 * vl'_i = V' - v_i
5221 * = V - w_i*vl_i / (W + w_i) - (V - vl_i)
5222 * = vl_i - w_i*vl_i / (W + w_i)
5223 *
5224 * Which is strictly less than vl_i. So in order to preserve lag
5225 * we should inflate the lag before placement such that the
5226 * effective lag after placement comes out right.
5227 *
5228 * As such, invert the above relation for vl'_i to get the vl_i
5229 * we need to use such that the lag after placement is the lag
5230 * we computed before dequeue.
5231 *
5232 * vl'_i = vl_i - w_i*vl_i / (W + w_i)
5233 * = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i)
5234 *
5235 * (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i
5236 * = W*vl_i
5237 *
5238 * vl_i = (W + w_i)*vl'_i / W
5239 */
5240 load = cfs_rq->sum_weight;
5241 if (curr && curr->on_rq)
5242 load += scale_load_down(curr->load.weight);
5243
5244 lag *= load + scale_load_down(se->load.weight);
5245 if (WARN_ON_ONCE(!load))
5246 load = 1;
5247 lag = div_s64(lag, load);
5248 }
5249
5250 se->vruntime = vruntime - lag;
5251
5252 if (se->rel_deadline) {
5253 se->deadline += se->vruntime;
5254 se->rel_deadline = 0;
5255 return;
5256 }
5257
5258 /*
5259 * When joining the competition; the existing tasks will be,
5260 * on average, halfway through their slice, as such start tasks
5261 * off with half a slice to ease into the competition.
5262 */
5263 if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL))
5264 vslice /= 2;
5265
5266 /*
5267 * EEVDF: vd_i = ve_i + r_i/w_i
5268 */
5269 se->deadline = se->vruntime + vslice;
5270 }
5271
5272 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
5273 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
5274
5275 static void
5276 requeue_delayed_entity(struct sched_entity *se);
5277
5278 static void
enqueue_entity(struct cfs_rq * cfs_rq,struct sched_entity * se,int flags)5279 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
5280 {
5281 bool curr = cfs_rq->curr == se;
5282
5283 /*
5284 * If we're the current task, we must renormalise before calling
5285 * update_curr().
5286 */
5287 if (curr)
5288 place_entity(cfs_rq, se, flags);
5289
5290 update_curr(cfs_rq);
5291
5292 /*
5293 * When enqueuing a sched_entity, we must:
5294 * - Update loads to have both entity and cfs_rq synced with now.
5295 * - For group_entity, update its runnable_weight to reflect the new
5296 * h_nr_runnable of its group cfs_rq.
5297 * - For group_entity, update its weight to reflect the new share of
5298 * its group cfs_rq
5299 * - Add its new weight to cfs_rq->load.weight
5300 */
5301 update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
5302 se_update_runnable(se);
5303 /*
5304 * XXX update_load_avg() above will have attached us to the pelt sum;
5305 * but update_cfs_group() here will re-adjust the weight and have to
5306 * undo/redo all that. Seems wasteful.
5307 */
5308 update_cfs_group(se);
5309
5310 /*
5311 * XXX now that the entity has been re-weighted, and it's lag adjusted,
5312 * we can place the entity.
5313 */
5314 if (!curr)
5315 place_entity(cfs_rq, se, flags);
5316
5317 account_entity_enqueue(cfs_rq, se);
5318
5319 /* Entity has migrated, no longer consider this task hot */
5320 if (flags & ENQUEUE_MIGRATED)
5321 se->exec_start = 0;
5322
5323 check_schedstat_required();
5324 update_stats_enqueue_fair(cfs_rq, se, flags);
5325 if (!curr)
5326 __enqueue_entity(cfs_rq, se);
5327 se->on_rq = 1;
5328
5329 if (cfs_rq->nr_queued == 1) {
5330 check_enqueue_throttle(cfs_rq);
5331 list_add_leaf_cfs_rq(cfs_rq);
5332 #ifdef CONFIG_CFS_BANDWIDTH
5333 if (cfs_rq->pelt_clock_throttled) {
5334 struct rq *rq = rq_of(cfs_rq);
5335
5336 cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
5337 cfs_rq->throttled_clock_pelt;
5338 cfs_rq->pelt_clock_throttled = 0;
5339 }
5340 #endif
5341 }
5342 }
5343
__clear_buddies_next(struct sched_entity * se)5344 static void __clear_buddies_next(struct sched_entity *se)
5345 {
5346 for_each_sched_entity(se) {
5347 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5348 if (cfs_rq->next != se)
5349 break;
5350
5351 cfs_rq->next = NULL;
5352 }
5353 }
5354
clear_buddies(struct cfs_rq * cfs_rq,struct sched_entity * se)5355 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
5356 {
5357 if (cfs_rq->next == se)
5358 __clear_buddies_next(se);
5359 }
5360
5361 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
5362
set_delayed(struct sched_entity * se)5363 static void set_delayed(struct sched_entity *se)
5364 {
5365 se->sched_delayed = 1;
5366
5367 /*
5368 * Delayed se of cfs_rq have no tasks queued on them.
5369 * Do not adjust h_nr_runnable since dequeue_entities()
5370 * will account it for blocked tasks.
5371 */
5372 if (!entity_is_task(se))
5373 return;
5374
5375 for_each_sched_entity(se) {
5376 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5377
5378 cfs_rq->h_nr_runnable--;
5379 }
5380 }
5381
clear_delayed(struct sched_entity * se)5382 static void clear_delayed(struct sched_entity *se)
5383 {
5384 se->sched_delayed = 0;
5385
5386 /*
5387 * Delayed se of cfs_rq have no tasks queued on them.
5388 * Do not adjust h_nr_runnable since a dequeue has
5389 * already accounted for it or an enqueue of a task
5390 * below it will account for it in enqueue_task_fair().
5391 */
5392 if (!entity_is_task(se))
5393 return;
5394
5395 for_each_sched_entity(se) {
5396 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5397
5398 cfs_rq->h_nr_runnable++;
5399 }
5400 }
5401
finish_delayed_dequeue_entity(struct sched_entity * se)5402 static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
5403 {
5404 clear_delayed(se);
5405 if (sched_feat(DELAY_ZERO) && se->vlag > 0)
5406 se->vlag = 0;
5407 }
5408
5409 static bool
dequeue_entity(struct cfs_rq * cfs_rq,struct sched_entity * se,int flags)5410 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
5411 {
5412 bool sleep = flags & DEQUEUE_SLEEP;
5413 int action = UPDATE_TG;
5414
5415 update_curr(cfs_rq);
5416 clear_buddies(cfs_rq, se);
5417
5418 if (flags & DEQUEUE_DELAYED) {
5419 WARN_ON_ONCE(!se->sched_delayed);
5420 } else {
5421 bool delay = sleep;
5422 /*
5423 * DELAY_DEQUEUE relies on spurious wakeups, special task
5424 * states must not suffer spurious wakeups, excempt them.
5425 */
5426 if (flags & (DEQUEUE_SPECIAL | DEQUEUE_THROTTLE))
5427 delay = false;
5428
5429 WARN_ON_ONCE(delay && se->sched_delayed);
5430
5431 if (sched_feat(DELAY_DEQUEUE) && delay &&
5432 !entity_eligible(cfs_rq, se)) {
5433 update_load_avg(cfs_rq, se, 0);
5434 set_delayed(se);
5435 return false;
5436 }
5437 }
5438
5439 if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
5440 action |= DO_DETACH;
5441
5442 /*
5443 * When dequeuing a sched_entity, we must:
5444 * - Update loads to have both entity and cfs_rq synced with now.
5445 * - For group_entity, update its runnable_weight to reflect the new
5446 * h_nr_runnable of its group cfs_rq.
5447 * - Subtract its previous weight from cfs_rq->load.weight.
5448 * - For group entity, update its weight to reflect the new share
5449 * of its group cfs_rq.
5450 */
5451 update_load_avg(cfs_rq, se, action);
5452 se_update_runnable(se);
5453
5454 update_stats_dequeue_fair(cfs_rq, se, flags);
5455
5456 update_entity_lag(cfs_rq, se);
5457 if (sched_feat(PLACE_REL_DEADLINE) && !sleep) {
5458 se->deadline -= se->vruntime;
5459 se->rel_deadline = 1;
5460 }
5461
5462 if (se != cfs_rq->curr)
5463 __dequeue_entity(cfs_rq, se);
5464 se->on_rq = 0;
5465 account_entity_dequeue(cfs_rq, se);
5466
5467 /* return excess runtime on last dequeue */
5468 return_cfs_rq_runtime(cfs_rq);
5469
5470 update_cfs_group(se);
5471
5472 if (flags & DEQUEUE_DELAYED)
5473 finish_delayed_dequeue_entity(se);
5474
5475 if (cfs_rq->nr_queued == 0) {
5476 update_idle_cfs_rq_clock_pelt(cfs_rq);
5477 #ifdef CONFIG_CFS_BANDWIDTH
5478 if (throttled_hierarchy(cfs_rq)) {
5479 struct rq *rq = rq_of(cfs_rq);
5480
5481 list_del_leaf_cfs_rq(cfs_rq);
5482 cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
5483 cfs_rq->pelt_clock_throttled = 1;
5484 }
5485 #endif
5486 }
5487
5488 return true;
5489 }
5490
5491 static void
set_next_entity(struct cfs_rq * cfs_rq,struct sched_entity * se,bool first)5492 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, bool first)
5493 {
5494 clear_buddies(cfs_rq, se);
5495
5496 /* 'current' is not kept within the tree. */
5497 if (se->on_rq) {
5498 /*
5499 * Any task has to be enqueued before it get to execute on
5500 * a CPU. So account for the time it spent waiting on the
5501 * runqueue.
5502 */
5503 update_stats_wait_end_fair(cfs_rq, se);
5504 __dequeue_entity(cfs_rq, se);
5505 update_load_avg(cfs_rq, se, UPDATE_TG);
5506
5507 if (first)
5508 set_protect_slice(cfs_rq, se);
5509 }
5510
5511 update_stats_curr_start(cfs_rq, se);
5512 WARN_ON_ONCE(cfs_rq->curr);
5513 cfs_rq->curr = se;
5514
5515 /*
5516 * Track our maximum slice length, if the CPU's load is at
5517 * least twice that of our own weight (i.e. don't track it
5518 * when there are only lesser-weight tasks around):
5519 */
5520 if (schedstat_enabled() &&
5521 rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
5522 struct sched_statistics *stats;
5523
5524 stats = __schedstats_from_se(se);
5525 __schedstat_set(stats->slice_max,
5526 max((u64)stats->slice_max,
5527 se->sum_exec_runtime - se->prev_sum_exec_runtime));
5528 }
5529
5530 se->prev_sum_exec_runtime = se->sum_exec_runtime;
5531 }
5532
5533 static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags);
5534
5535 /*
5536 * Pick the next process, keeping these things in mind, in this order:
5537 * 1) keep things fair between processes/task groups
5538 * 2) pick the "next" process, since someone really wants that to run
5539 * 3) pick the "last" process, for cache locality
5540 * 4) do not run the "skip" process, if something else is available
5541 */
5542 static struct sched_entity *
pick_next_entity(struct rq * rq,struct cfs_rq * cfs_rq)5543 pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
5544 {
5545 struct sched_entity *se;
5546
5547 se = pick_eevdf(cfs_rq);
5548 if (se->sched_delayed) {
5549 dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
5550 /*
5551 * Must not reference @se again, see __block_task().
5552 */
5553 return NULL;
5554 }
5555 return se;
5556 }
5557
5558 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
5559
put_prev_entity(struct cfs_rq * cfs_rq,struct sched_entity * prev)5560 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
5561 {
5562 /*
5563 * If still on the runqueue then deactivate_task()
5564 * was not called and update_curr() has to be done:
5565 */
5566 if (prev->on_rq)
5567 update_curr(cfs_rq);
5568
5569 /* throttle cfs_rqs exceeding runtime */
5570 check_cfs_rq_runtime(cfs_rq);
5571
5572 if (prev->on_rq) {
5573 update_stats_wait_start_fair(cfs_rq, prev);
5574 /* Put 'current' back into the tree. */
5575 __enqueue_entity(cfs_rq, prev);
5576 /* in !on_rq case, update occurred at dequeue */
5577 update_load_avg(cfs_rq, prev, 0);
5578 }
5579 WARN_ON_ONCE(cfs_rq->curr != prev);
5580 cfs_rq->curr = NULL;
5581 }
5582
5583 static void
entity_tick(struct cfs_rq * cfs_rq,struct sched_entity * curr,int queued)5584 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
5585 {
5586 /*
5587 * Update run-time statistics of the 'current'.
5588 */
5589 update_curr(cfs_rq);
5590
5591 /*
5592 * Ensure that runnable average is periodically updated.
5593 */
5594 update_load_avg(cfs_rq, curr, UPDATE_TG);
5595 update_cfs_group(curr);
5596
5597 #ifdef CONFIG_SCHED_HRTICK
5598 /*
5599 * queued ticks are scheduled to match the slice, so don't bother
5600 * validating it and just reschedule.
5601 */
5602 if (queued) {
5603 resched_curr_lazy(rq_of(cfs_rq));
5604 return;
5605 }
5606 #endif
5607 }
5608
5609
5610 /**************************************************
5611 * CFS bandwidth control machinery
5612 */
5613
5614 #ifdef CONFIG_CFS_BANDWIDTH
5615
5616 #ifdef CONFIG_JUMP_LABEL
5617 static struct static_key __cfs_bandwidth_used;
5618
cfs_bandwidth_used(void)5619 static inline bool cfs_bandwidth_used(void)
5620 {
5621 return static_key_false(&__cfs_bandwidth_used);
5622 }
5623
cfs_bandwidth_usage_inc(void)5624 void cfs_bandwidth_usage_inc(void)
5625 {
5626 static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
5627 }
5628
cfs_bandwidth_usage_dec(void)5629 void cfs_bandwidth_usage_dec(void)
5630 {
5631 static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
5632 }
5633 #else /* !CONFIG_JUMP_LABEL: */
cfs_bandwidth_used(void)5634 static bool cfs_bandwidth_used(void)
5635 {
5636 return true;
5637 }
5638
cfs_bandwidth_usage_inc(void)5639 void cfs_bandwidth_usage_inc(void) {}
cfs_bandwidth_usage_dec(void)5640 void cfs_bandwidth_usage_dec(void) {}
5641 #endif /* !CONFIG_JUMP_LABEL */
5642
sched_cfs_bandwidth_slice(void)5643 static inline u64 sched_cfs_bandwidth_slice(void)
5644 {
5645 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
5646 }
5647
5648 /*
5649 * Replenish runtime according to assigned quota. We use sched_clock_cpu
5650 * directly instead of rq->clock to avoid adding additional synchronization
5651 * around rq->lock.
5652 *
5653 * requires cfs_b->lock
5654 */
__refill_cfs_bandwidth_runtime(struct cfs_bandwidth * cfs_b)5655 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
5656 {
5657 s64 runtime;
5658
5659 if (unlikely(cfs_b->quota == RUNTIME_INF))
5660 return;
5661
5662 cfs_b->runtime += cfs_b->quota;
5663 runtime = cfs_b->runtime_snap - cfs_b->runtime;
5664 if (runtime > 0) {
5665 cfs_b->burst_time += runtime;
5666 cfs_b->nr_burst++;
5667 }
5668
5669 cfs_b->runtime = min(cfs_b->runtime, cfs_b->quota + cfs_b->burst);
5670 cfs_b->runtime_snap = cfs_b->runtime;
5671 }
5672
tg_cfs_bandwidth(struct task_group * tg)5673 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
5674 {
5675 return &tg->cfs_bandwidth;
5676 }
5677
5678 /* returns 0 on failure to allocate runtime */
__assign_cfs_rq_runtime(struct cfs_bandwidth * cfs_b,struct cfs_rq * cfs_rq,u64 target_runtime)5679 static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
5680 struct cfs_rq *cfs_rq, u64 target_runtime)
5681 {
5682 u64 min_amount, amount = 0;
5683
5684 lockdep_assert_held(&cfs_b->lock);
5685
5686 /* note: this is a positive sum as runtime_remaining <= 0 */
5687 min_amount = target_runtime - cfs_rq->runtime_remaining;
5688
5689 if (cfs_b->quota == RUNTIME_INF)
5690 amount = min_amount;
5691 else {
5692 start_cfs_bandwidth(cfs_b);
5693
5694 if (cfs_b->runtime > 0) {
5695 amount = min(cfs_b->runtime, min_amount);
5696 cfs_b->runtime -= amount;
5697 cfs_b->idle = 0;
5698 }
5699 }
5700
5701 cfs_rq->runtime_remaining += amount;
5702
5703 return cfs_rq->runtime_remaining > 0;
5704 }
5705
5706 /* returns 0 on failure to allocate runtime */
assign_cfs_rq_runtime(struct cfs_rq * cfs_rq)5707 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5708 {
5709 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5710 int ret;
5711
5712 raw_spin_lock(&cfs_b->lock);
5713 ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
5714 raw_spin_unlock(&cfs_b->lock);
5715
5716 return ret;
5717 }
5718
/*
 * Charge @delta_exec against @cfs_rq's local runtime. When the local
 * runtime is used up and the cfs_rq is not already throttled, try to get
 * more from the pool; if that fails and an entity is running, request a
 * reschedule.
 */
static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
	/* dock delta_exec before expiring quota (as it could span periods) */
	cfs_rq->runtime_remaining -= delta_exec;

	/* Still runtime left, or already throttled: nothing more to do. */
	if (likely(cfs_rq->runtime_remaining > 0) || cfs_rq->throttled)
		return;

	if (assign_cfs_rq_runtime(cfs_rq))
		return;

	/*
	 * The runtime could not be extended: resched so that the active
	 * hierarchy can be throttled.
	 */
	if (likely(cfs_rq->curr))
		resched_curr(rq_of(cfs_rq));
}
5736
5737 static __always_inline
account_cfs_rq_runtime(struct cfs_rq * cfs_rq,u64 delta_exec)5738 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
5739 {
5740 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
5741 return;
5742
5743 __account_cfs_rq_runtime(cfs_rq, delta_exec);
5744 }
5745
cfs_rq_throttled(struct cfs_rq * cfs_rq)5746 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
5747 {
5748 return cfs_bandwidth_used() && cfs_rq->throttled;
5749 }
5750
cfs_rq_pelt_clock_throttled(struct cfs_rq * cfs_rq)5751 static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq)
5752 {
5753 return cfs_bandwidth_used() && cfs_rq->pelt_clock_throttled;
5754 }
5755
5756 /* check whether cfs_rq, or any parent, is throttled */
throttled_hierarchy(struct cfs_rq * cfs_rq)5757 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
5758 {
5759 return cfs_bandwidth_used() && cfs_rq->throttle_count;
5760 }
5761
lb_throttled_hierarchy(struct task_struct * p,int dst_cpu)5762 static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu)
5763 {
5764 return throttled_hierarchy(task_group(p)->cfs_rq[dst_cpu]);
5765 }
5766
task_is_throttled(struct task_struct * p)5767 static inline bool task_is_throttled(struct task_struct *p)
5768 {
5769 return cfs_bandwidth_used() && p->throttled;
5770 }
5771
5772 static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags);
throttle_cfs_rq_work(struct callback_head * work)5773 static void throttle_cfs_rq_work(struct callback_head *work)
5774 {
5775 struct task_struct *p = container_of(work, struct task_struct, sched_throttle_work);
5776 struct sched_entity *se;
5777 struct cfs_rq *cfs_rq;
5778 struct rq *rq;
5779
5780 WARN_ON_ONCE(p != current);
5781 p->sched_throttle_work.next = &p->sched_throttle_work;
5782
5783 /*
5784 * If task is exiting, then there won't be a return to userspace, so we
5785 * don't have to bother with any of this.
5786 */
5787 if ((p->flags & PF_EXITING))
5788 return;
5789
5790 scoped_guard(task_rq_lock, p) {
5791 se = &p->se;
5792 cfs_rq = cfs_rq_of(se);
5793
5794 /* Raced, forget */
5795 if (p->sched_class != &fair_sched_class)
5796 return;
5797
5798 /*
5799 * If not in limbo, then either replenish has happened or this
5800 * task got migrated out of the throttled cfs_rq, move along.
5801 */
5802 if (!cfs_rq->throttle_count)
5803 return;
5804 rq = scope.rq;
5805 update_rq_clock(rq);
5806 WARN_ON_ONCE(p->throttled || !list_empty(&p->throttle_node));
5807 dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_THROTTLE);
5808 list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
5809 /*
5810 * Must not set throttled before dequeue or dequeue will
5811 * mistakenly regard this task as an already throttled one.
5812 */
5813 p->throttled = true;
5814 resched_curr(rq);
5815 }
5816 }
5817
init_cfs_throttle_work(struct task_struct * p)5818 void init_cfs_throttle_work(struct task_struct *p)
5819 {
5820 init_task_work(&p->sched_throttle_work, throttle_cfs_rq_work);
5821 /* Protect against double add, see throttle_cfs_rq() and throttle_cfs_rq_work() */
5822 p->sched_throttle_work.next = &p->sched_throttle_work;
5823 INIT_LIST_HEAD(&p->throttle_node);
5824 }
5825
5826 /*
5827 * Task is throttled and someone wants to dequeue it again:
5828 * it could be sched/core when core needs to do things like
5829 * task affinity change, task group change, task sched class
5830 * change etc. and in these cases, DEQUEUE_SLEEP is not set;
5831 * or the task is blocked after throttled due to freezer etc.
5832 * and in these cases, DEQUEUE_SLEEP is set.
5833 */
5834 static void detach_task_cfs_rq(struct task_struct *p);
dequeue_throttled_task(struct task_struct * p,int flags)5835 static void dequeue_throttled_task(struct task_struct *p, int flags)
5836 {
5837 WARN_ON_ONCE(p->se.on_rq);
5838 list_del_init(&p->throttle_node);
5839
5840 /* task blocked after throttled */
5841 if (flags & DEQUEUE_SLEEP) {
5842 p->throttled = false;
5843 return;
5844 }
5845
5846 /*
5847 * task is migrating off its old cfs_rq, detach
5848 * the task's load from its old cfs_rq.
5849 */
5850 if (task_on_rq_migrating(p))
5851 detach_task_cfs_rq(p);
5852 }
5853
enqueue_throttled_task(struct task_struct * p)5854 static bool enqueue_throttled_task(struct task_struct *p)
5855 {
5856 struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
5857
5858 /* @p should have gone through dequeue_throttled_task() first */
5859 WARN_ON_ONCE(!list_empty(&p->throttle_node));
5860
5861 /*
5862 * If the throttled task @p is enqueued to a throttled cfs_rq,
5863 * take the fast path by directly putting the task on the
5864 * target cfs_rq's limbo list.
5865 *
5866 * Do not do that when @p is current because the following race can
5867 * cause @p's group_node to be incorectly re-insterted in its rq's
5868 * cfs_tasks list, despite being throttled:
5869 *
5870 * cpuX cpuY
5871 * p ret2user
5872 * throttle_cfs_rq_work() sched_move_task(p)
5873 * LOCK task_rq_lock
5874 * dequeue_task_fair(p)
5875 * UNLOCK task_rq_lock
5876 * LOCK task_rq_lock
5877 * task_current_donor(p) == true
5878 * task_on_rq_queued(p) == true
5879 * dequeue_task(p)
5880 * put_prev_task(p)
5881 * sched_change_group()
5882 * enqueue_task(p) -> p's new cfs_rq
5883 * is throttled, go
5884 * fast path and skip
5885 * actual enqueue
5886 * set_next_task(p)
5887 * list_move(&se->group_node, &rq->cfs_tasks); // bug
5888 * schedule()
5889 *
5890 * In the above race case, @p current cfs_rq is in the same rq as
5891 * its previous cfs_rq because sched_move_task() only moves a task
5892 * to a different group from the same rq, so we can use its current
5893 * cfs_rq to derive rq and test if the task is current.
5894 */
5895 if (throttled_hierarchy(cfs_rq) &&
5896 !task_current_donor(rq_of(cfs_rq), p)) {
5897 list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list);
5898 return true;
5899 }
5900
5901 /* we can't take the fast path, do an actual enqueue*/
5902 p->throttled = false;
5903 return false;
5904 }
5905
5906 static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags);
tg_unthrottle_up(struct task_group * tg,void * data)5907 static int tg_unthrottle_up(struct task_group *tg, void *data)
5908 {
5909 struct rq *rq = data;
5910 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5911 struct task_struct *p, *tmp;
5912
5913 if (--cfs_rq->throttle_count)
5914 return 0;
5915
5916 if (cfs_rq->pelt_clock_throttled) {
5917 cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
5918 cfs_rq->throttled_clock_pelt;
5919 cfs_rq->pelt_clock_throttled = 0;
5920 }
5921
5922 if (cfs_rq->throttled_clock_self) {
5923 u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
5924
5925 cfs_rq->throttled_clock_self = 0;
5926
5927 if (WARN_ON_ONCE((s64)delta < 0))
5928 delta = 0;
5929
5930 cfs_rq->throttled_clock_self_time += delta;
5931 }
5932
5933 /* Re-enqueue the tasks that have been throttled at this level. */
5934 list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) {
5935 list_del_init(&p->throttle_node);
5936 p->throttled = false;
5937 enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP);
5938 }
5939
5940 /* Add cfs_rq with load or one or more already running entities to the list */
5941 if (!cfs_rq_is_decayed(cfs_rq))
5942 list_add_leaf_cfs_rq(cfs_rq);
5943
5944 return 0;
5945 }
5946
task_has_throttle_work(struct task_struct * p)5947 static inline bool task_has_throttle_work(struct task_struct *p)
5948 {
5949 return p->sched_throttle_work.next != &p->sched_throttle_work;
5950 }
5951
task_throttle_setup_work(struct task_struct * p)5952 static inline void task_throttle_setup_work(struct task_struct *p)
5953 {
5954 if (task_has_throttle_work(p))
5955 return;
5956
5957 /*
5958 * Kthreads and exiting tasks don't return to userspace, so adding the
5959 * work is pointless
5960 */
5961 if ((p->flags & (PF_EXITING | PF_KTHREAD)))
5962 return;
5963
5964 task_work_add(p, &p->sched_throttle_work, TWA_RESUME);
5965 }
5966
record_throttle_clock(struct cfs_rq * cfs_rq)5967 static void record_throttle_clock(struct cfs_rq *cfs_rq)
5968 {
5969 struct rq *rq = rq_of(cfs_rq);
5970
5971 if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
5972 cfs_rq->throttled_clock = rq_clock(rq);
5973
5974 if (!cfs_rq->throttled_clock_self)
5975 cfs_rq->throttled_clock_self = rq_clock(rq);
5976 }
5977
tg_throttle_down(struct task_group * tg,void * data)5978 static int tg_throttle_down(struct task_group *tg, void *data)
5979 {
5980 struct rq *rq = data;
5981 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5982
5983 if (cfs_rq->throttle_count++)
5984 return 0;
5985
5986 /*
5987 * For cfs_rqs that still have entities enqueued, PELT clock
5988 * stop happens at dequeue time when all entities are dequeued.
5989 */
5990 if (!cfs_rq->nr_queued) {
5991 list_del_leaf_cfs_rq(cfs_rq);
5992 cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
5993 cfs_rq->pelt_clock_throttled = 1;
5994 }
5995
5996 WARN_ON_ONCE(cfs_rq->throttled_clock_self);
5997 WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list));
5998 return 0;
5999 }
6000
throttle_cfs_rq(struct cfs_rq * cfs_rq)6001 static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
6002 {
6003 struct rq *rq = rq_of(cfs_rq);
6004 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
6005 int dequeue = 1;
6006
6007 raw_spin_lock(&cfs_b->lock);
6008 /* This will start the period timer if necessary */
6009 if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
6010 /*
6011 * We have raced with bandwidth becoming available, and if we
6012 * actually throttled the timer might not unthrottle us for an
6013 * entire period. We additionally needed to make sure that any
6014 * subsequent check_cfs_rq_runtime calls agree not to throttle
6015 * us, as we may commit to do cfs put_prev+pick_next, so we ask
6016 * for 1ns of runtime rather than just check cfs_b.
6017 */
6018 dequeue = 0;
6019 } else {
6020 list_add_tail_rcu(&cfs_rq->throttled_list,
6021 &cfs_b->throttled_cfs_rq);
6022 }
6023 raw_spin_unlock(&cfs_b->lock);
6024
6025 if (!dequeue)
6026 return false; /* Throttle no longer required. */
6027
6028 /* freeze hierarchy runnable averages while throttled */
6029 rcu_read_lock();
6030 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
6031 rcu_read_unlock();
6032
6033 /*
6034 * Note: distribution will already see us throttled via the
6035 * throttled-list. rq->lock protects completion.
6036 */
6037 cfs_rq->throttled = 1;
6038 WARN_ON_ONCE(cfs_rq->throttled_clock);
6039 return true;
6040 }
6041
unthrottle_cfs_rq(struct cfs_rq * cfs_rq)6042 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
6043 {
6044 struct rq *rq = rq_of(cfs_rq);
6045 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
6046 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
6047
6048 /*
6049 * It's possible we are called with runtime_remaining < 0 due to things
6050 * like async unthrottled us with a positive runtime_remaining but other
6051 * still running entities consumed those runtime before we reached here.
6052 *
6053 * We can't unthrottle this cfs_rq without any runtime remaining because
6054 * any enqueue in tg_unthrottle_up() will immediately trigger a throttle,
6055 * which is not supposed to happen on unthrottle path.
6056 */
6057 if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0)
6058 return;
6059
6060 cfs_rq->throttled = 0;
6061
6062 update_rq_clock(rq);
6063
6064 raw_spin_lock(&cfs_b->lock);
6065 if (cfs_rq->throttled_clock) {
6066 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
6067 cfs_rq->throttled_clock = 0;
6068 }
6069 list_del_rcu(&cfs_rq->throttled_list);
6070 raw_spin_unlock(&cfs_b->lock);
6071
6072 /* update hierarchical throttle state */
6073 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
6074
6075 if (!cfs_rq->load.weight) {
6076 if (!cfs_rq->on_list)
6077 return;
6078 /*
6079 * Nothing to run but something to decay (on_list)?
6080 * Complete the branch.
6081 */
6082 for_each_sched_entity(se) {
6083 if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
6084 break;
6085 }
6086 }
6087
6088 assert_list_leaf_cfs_rq(rq);
6089
6090 /* Determine whether we need to wake up potentially idle CPU: */
6091 if (rq->curr == rq->idle && rq->cfs.nr_queued)
6092 resched_curr(rq);
6093 }
6094
__cfsb_csd_unthrottle(void * arg)6095 static void __cfsb_csd_unthrottle(void *arg)
6096 {
6097 struct cfs_rq *cursor, *tmp;
6098 struct rq *rq = arg;
6099 struct rq_flags rf;
6100
6101 rq_lock(rq, &rf);
6102
6103 /*
6104 * Iterating over the list can trigger several call to
6105 * update_rq_clock() in unthrottle_cfs_rq().
6106 * Do it once and skip the potential next ones.
6107 */
6108 update_rq_clock(rq);
6109 rq_clock_start_loop_update(rq);
6110
6111 /*
6112 * Since we hold rq lock we're safe from concurrent manipulation of
6113 * the CSD list. However, this RCU critical section annotates the
6114 * fact that we pair with sched_free_group_rcu(), so that we cannot
6115 * race with group being freed in the window between removing it
6116 * from the list and advancing to the next entry in the list.
6117 */
6118 rcu_read_lock();
6119
6120 list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list,
6121 throttled_csd_list) {
6122 list_del_init(&cursor->throttled_csd_list);
6123
6124 if (cfs_rq_throttled(cursor))
6125 unthrottle_cfs_rq(cursor);
6126 }
6127
6128 rcu_read_unlock();
6129
6130 rq_clock_stop_loop_update(rq);
6131 rq_unlock(rq, &rf);
6132 }
6133
__unthrottle_cfs_rq_async(struct cfs_rq * cfs_rq)6134 static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
6135 {
6136 struct rq *rq = rq_of(cfs_rq);
6137 bool first;
6138
6139 if (rq == this_rq()) {
6140 unthrottle_cfs_rq(cfs_rq);
6141 return;
6142 }
6143
6144 /* Already enqueued */
6145 if (WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_csd_list)))
6146 return;
6147
6148 first = list_empty(&rq->cfsb_csd_list);
6149 list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
6150 if (first)
6151 smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
6152 }
6153
unthrottle_cfs_rq_async(struct cfs_rq * cfs_rq)6154 static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
6155 {
6156 lockdep_assert_rq_held(rq_of(cfs_rq));
6157
6158 if (WARN_ON_ONCE(!cfs_rq_throttled(cfs_rq) ||
6159 cfs_rq->runtime_remaining <= 0))
6160 return;
6161
6162 __unthrottle_cfs_rq_async(cfs_rq);
6163 }
6164
distribute_cfs_runtime(struct cfs_bandwidth * cfs_b)6165 static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
6166 {
6167 int this_cpu = smp_processor_id();
6168 u64 runtime, remaining = 1;
6169 bool throttled = false;
6170 struct cfs_rq *cfs_rq, *tmp;
6171 struct rq_flags rf;
6172 struct rq *rq;
6173 LIST_HEAD(local_unthrottle);
6174
6175 rcu_read_lock();
6176 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
6177 throttled_list) {
6178 rq = rq_of(cfs_rq);
6179
6180 if (!remaining) {
6181 throttled = true;
6182 break;
6183 }
6184
6185 rq_lock_irqsave(rq, &rf);
6186 if (!cfs_rq_throttled(cfs_rq))
6187 goto next;
6188
6189 /* Already queued for async unthrottle */
6190 if (!list_empty(&cfs_rq->throttled_csd_list))
6191 goto next;
6192
6193 /* By the above checks, this should never be true */
6194 WARN_ON_ONCE(cfs_rq->runtime_remaining > 0);
6195
6196 raw_spin_lock(&cfs_b->lock);
6197 runtime = -cfs_rq->runtime_remaining + 1;
6198 if (runtime > cfs_b->runtime)
6199 runtime = cfs_b->runtime;
6200 cfs_b->runtime -= runtime;
6201 remaining = cfs_b->runtime;
6202 raw_spin_unlock(&cfs_b->lock);
6203
6204 cfs_rq->runtime_remaining += runtime;
6205
6206 /* we check whether we're throttled above */
6207 if (cfs_rq->runtime_remaining > 0) {
6208 if (cpu_of(rq) != this_cpu) {
6209 unthrottle_cfs_rq_async(cfs_rq);
6210 } else {
6211 /*
6212 * We currently only expect to be unthrottling
6213 * a single cfs_rq locally.
6214 */
6215 WARN_ON_ONCE(!list_empty(&local_unthrottle));
6216 list_add_tail(&cfs_rq->throttled_csd_list,
6217 &local_unthrottle);
6218 }
6219 } else {
6220 throttled = true;
6221 }
6222
6223 next:
6224 rq_unlock_irqrestore(rq, &rf);
6225 }
6226
6227 list_for_each_entry_safe(cfs_rq, tmp, &local_unthrottle,
6228 throttled_csd_list) {
6229 struct rq *rq = rq_of(cfs_rq);
6230
6231 rq_lock_irqsave(rq, &rf);
6232
6233 list_del_init(&cfs_rq->throttled_csd_list);
6234
6235 if (cfs_rq_throttled(cfs_rq))
6236 unthrottle_cfs_rq(cfs_rq);
6237
6238 rq_unlock_irqrestore(rq, &rf);
6239 }
6240 WARN_ON_ONCE(!list_empty(&local_unthrottle));
6241
6242 rcu_read_unlock();
6243
6244 return throttled;
6245 }
6246
6247 /*
6248 * Responsible for refilling a task_group's bandwidth and unthrottling its
6249 * cfs_rqs as appropriate. If there has been no activity within the last
6250 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
6251 * used to track this state.
6252 */
do_sched_cfs_period_timer(struct cfs_bandwidth * cfs_b,int overrun,unsigned long flags)6253 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
6254 __must_hold(&cfs_b->lock)
6255 {
6256 int throttled;
6257
6258 /* no need to continue the timer with no bandwidth constraint */
6259 if (cfs_b->quota == RUNTIME_INF)
6260 goto out_deactivate;
6261
6262 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
6263 cfs_b->nr_periods += overrun;
6264
6265 /* Refill extra burst quota even if cfs_b->idle */
6266 __refill_cfs_bandwidth_runtime(cfs_b);
6267
6268 /*
6269 * idle depends on !throttled (for the case of a large deficit), and if
6270 * we're going inactive then everything else can be deferred
6271 */
6272 if (cfs_b->idle && !throttled)
6273 goto out_deactivate;
6274
6275 if (!throttled) {
6276 /* mark as potentially idle for the upcoming period */
6277 cfs_b->idle = 1;
6278 return 0;
6279 }
6280
6281 /* account preceding periods in which throttling occurred */
6282 cfs_b->nr_throttled += overrun;
6283
6284 /*
6285 * This check is repeated as we release cfs_b->lock while we unthrottle.
6286 */
6287 while (throttled && cfs_b->runtime > 0) {
6288 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
6289 /* we can't nest cfs_b->lock while distributing bandwidth */
6290 throttled = distribute_cfs_runtime(cfs_b);
6291 raw_spin_lock_irqsave(&cfs_b->lock, flags);
6292 }
6293
6294 /*
6295 * While we are ensured activity in the period following an
6296 * unthrottle, this also covers the case in which the new bandwidth is
6297 * insufficient to cover the existing bandwidth deficit. (Forcing the
6298 * timer to remain active while there are any throttled entities.)
6299 */
6300 cfs_b->idle = 0;
6301
6302 return 0;
6303
6304 out_deactivate:
6305 return 1;
6306 }
6307
6308 /* a cfs_rq won't donate quota below this amount */
6309 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
6310 /* minimum remaining period time to redistribute slack quota */
6311 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
6312 /* how long we wait to gather additional slack before distributing */
6313 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
6314
6315 /*
6316 * Are we near the end of the current quota period?
6317 *
6318 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
6319 * hrtimer base being cleared by hrtimer_start. In the case of
6320 * migrate_hrtimers, base is never cleared, so we are fine.
6321 */
runtime_refresh_within(struct cfs_bandwidth * cfs_b,u64 min_expire)6322 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
6323 {
6324 struct hrtimer *refresh_timer = &cfs_b->period_timer;
6325 s64 remaining;
6326
6327 /* if the call-back is running a quota refresh is already occurring */
6328 if (hrtimer_callback_running(refresh_timer))
6329 return 1;
6330
6331 /* is a quota refresh about to occur? */
6332 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
6333 if (remaining < (s64)min_expire)
6334 return 1;
6335
6336 return 0;
6337 }
6338
start_cfs_slack_bandwidth(struct cfs_bandwidth * cfs_b)6339 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
6340 {
6341 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
6342
6343 /* if there's a quota refresh soon don't bother with slack */
6344 if (runtime_refresh_within(cfs_b, min_left))
6345 return;
6346
6347 /* don't push forwards an existing deferred unthrottle */
6348 if (cfs_b->slack_started)
6349 return;
6350 cfs_b->slack_started = true;
6351
6352 hrtimer_start(&cfs_b->slack_timer,
6353 ns_to_ktime(cfs_bandwidth_slack_period),
6354 HRTIMER_MODE_REL);
6355 }
6356
6357 /* we know any runtime found here is valid as update_curr() precedes return */
__return_cfs_rq_runtime(struct cfs_rq * cfs_rq)6358 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
6359 {
6360 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
6361 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
6362
6363 if (slack_runtime <= 0)
6364 return;
6365
6366 raw_spin_lock(&cfs_b->lock);
6367 if (cfs_b->quota != RUNTIME_INF) {
6368 cfs_b->runtime += slack_runtime;
6369
6370 /* we are under rq->lock, defer unthrottling using a timer */
6371 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
6372 !list_empty(&cfs_b->throttled_cfs_rq))
6373 start_cfs_slack_bandwidth(cfs_b);
6374 }
6375 raw_spin_unlock(&cfs_b->lock);
6376
6377 /* even if it's not valid for return we don't want to try again */
6378 cfs_rq->runtime_remaining -= slack_runtime;
6379 }
6380
return_cfs_rq_runtime(struct cfs_rq * cfs_rq)6381 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
6382 {
6383 if (!cfs_bandwidth_used())
6384 return;
6385
6386 if (!cfs_rq->runtime_enabled || cfs_rq->nr_queued)
6387 return;
6388
6389 __return_cfs_rq_runtime(cfs_rq);
6390 }
6391
6392 /*
6393 * This is done with a timer (instead of inline with bandwidth return) since
6394 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
6395 */
do_sched_cfs_slack_timer(struct cfs_bandwidth * cfs_b)6396 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
6397 {
6398 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
6399 unsigned long flags;
6400
6401 /* confirm we're still not at a refresh boundary */
6402 raw_spin_lock_irqsave(&cfs_b->lock, flags);
6403 cfs_b->slack_started = false;
6404
6405 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
6406 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
6407 return;
6408 }
6409
6410 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
6411 runtime = cfs_b->runtime;
6412
6413 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
6414
6415 if (!runtime)
6416 return;
6417
6418 distribute_cfs_runtime(cfs_b);
6419 }
6420
6421 /*
6422 * When a group wakes up we want to make sure that its quota is not already
6423 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
6424 * runtime as update_curr() throttling can not trigger until it's on-rq.
6425 */
check_enqueue_throttle(struct cfs_rq * cfs_rq)6426 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
6427 {
6428 if (!cfs_bandwidth_used())
6429 return;
6430
6431 /* an active group must be handled by the update_curr()->put() path */
6432 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
6433 return;
6434
6435 /* ensure the group is not already throttled */
6436 if (cfs_rq_throttled(cfs_rq))
6437 return;
6438
6439 /* update runtime allocation */
6440 account_cfs_rq_runtime(cfs_rq, 0);
6441 if (cfs_rq->runtime_remaining <= 0)
6442 throttle_cfs_rq(cfs_rq);
6443 }
6444
sync_throttle(struct task_group * tg,int cpu)6445 static void sync_throttle(struct task_group *tg, int cpu)
6446 {
6447 struct cfs_rq *pcfs_rq, *cfs_rq;
6448
6449 if (!cfs_bandwidth_used())
6450 return;
6451
6452 if (!tg->parent)
6453 return;
6454
6455 cfs_rq = tg->cfs_rq[cpu];
6456 pcfs_rq = tg->parent->cfs_rq[cpu];
6457
6458 cfs_rq->throttle_count = pcfs_rq->throttle_count;
6459 cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu));
6460
6461 /*
6462 * It is not enough to sync the "pelt_clock_throttled" indicator
6463 * with the parent cfs_rq when the hierarchy is not queued.
6464 * Always join a throttled hierarchy with PELT clock throttled
6465 * and leaf it to the first enqueue, or distribution to
6466 * unthrottle the PELT clock.
6467 */
6468 if (cfs_rq->throttle_count)
6469 cfs_rq->pelt_clock_throttled = 1;
6470 }
6471
6472 /* conditionally throttle active cfs_rq's from put_prev_entity() */
check_cfs_rq_runtime(struct cfs_rq * cfs_rq)6473 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
6474 {
6475 if (!cfs_bandwidth_used())
6476 return false;
6477
6478 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
6479 return false;
6480
6481 /*
6482 * it's possible for a throttled entity to be forced into a running
6483 * state (e.g. set_curr_task), in this case we're finished.
6484 */
6485 if (cfs_rq_throttled(cfs_rq))
6486 return true;
6487
6488 return throttle_cfs_rq(cfs_rq);
6489 }
6490
sched_cfs_slack_timer(struct hrtimer * timer)6491 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
6492 {
6493 struct cfs_bandwidth *cfs_b =
6494 container_of(timer, struct cfs_bandwidth, slack_timer);
6495
6496 do_sched_cfs_slack_timer(cfs_b);
6497
6498 return HRTIMER_NORESTART;
6499 }
6500
sched_cfs_period_timer(struct hrtimer * timer)6501 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
6502 {
6503 struct cfs_bandwidth *cfs_b =
6504 container_of(timer, struct cfs_bandwidth, period_timer);
6505 unsigned long flags;
6506 int overrun;
6507 int idle = 0;
6508 int count = 0;
6509
6510 raw_spin_lock_irqsave(&cfs_b->lock, flags);
6511 for (;;) {
6512 overrun = hrtimer_forward_now(timer, cfs_b->period);
6513 if (!overrun)
6514 break;
6515
6516 idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
6517
6518 if (++count > 3) {
6519 u64 new, old = ktime_to_ns(cfs_b->period);
6520
6521 /*
6522 * Grow period by a factor of 2 to avoid losing precision.
6523 * Precision loss in the quota/period ratio can cause __cfs_schedulable
6524 * to fail.
6525 */
6526 new = old * 2;
6527 if (new < max_bw_quota_period_us * NSEC_PER_USEC) {
6528 cfs_b->period = ns_to_ktime(new);
6529 cfs_b->quota *= 2;
6530 cfs_b->burst *= 2;
6531
6532 pr_warn_ratelimited(
6533 "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
6534 smp_processor_id(),
6535 div_u64(new, NSEC_PER_USEC),
6536 div_u64(cfs_b->quota, NSEC_PER_USEC));
6537 } else {
6538 pr_warn_ratelimited(
6539 "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
6540 smp_processor_id(),
6541 div_u64(old, NSEC_PER_USEC),
6542 div_u64(cfs_b->quota, NSEC_PER_USEC));
6543 }
6544
6545 /* reset count so we don't come right back in here */
6546 count = 0;
6547 }
6548 }
6549 if (idle)
6550 cfs_b->period_active = 0;
6551 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
6552
6553 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
6554 }
6555
init_cfs_bandwidth(struct cfs_bandwidth * cfs_b,struct cfs_bandwidth * parent)6556 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent)
6557 {
6558 raw_spin_lock_init(&cfs_b->lock);
6559 cfs_b->runtime = 0;
6560 cfs_b->quota = RUNTIME_INF;
6561 cfs_b->period = us_to_ktime(default_bw_period_us());
6562 cfs_b->burst = 0;
6563 cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF;
6564
6565 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
6566 hrtimer_setup(&cfs_b->period_timer, sched_cfs_period_timer, CLOCK_MONOTONIC,
6567 HRTIMER_MODE_ABS_PINNED);
6568
6569 /* Add a random offset so that timers interleave */
6570 hrtimer_set_expires(&cfs_b->period_timer,
6571 get_random_u32_below(cfs_b->period));
6572 hrtimer_setup(&cfs_b->slack_timer, sched_cfs_slack_timer, CLOCK_MONOTONIC,
6573 HRTIMER_MODE_REL);
6574 cfs_b->slack_started = false;
6575 }
6576
init_cfs_rq_runtime(struct cfs_rq * cfs_rq)6577 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
6578 {
6579 cfs_rq->runtime_enabled = 0;
6580 INIT_LIST_HEAD(&cfs_rq->throttled_list);
6581 INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
6582 INIT_LIST_HEAD(&cfs_rq->throttled_limbo_list);
6583 }
6584
start_cfs_bandwidth(struct cfs_bandwidth * cfs_b)6585 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
6586 {
6587 lockdep_assert_held(&cfs_b->lock);
6588
6589 if (cfs_b->period_active)
6590 return;
6591
6592 cfs_b->period_active = 1;
6593 hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
6594 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
6595 }
6596
destroy_cfs_bandwidth(struct cfs_bandwidth * cfs_b)6597 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
6598 {
6599 int __maybe_unused i;
6600
6601 /* init_cfs_bandwidth() was not called */
6602 if (!cfs_b->throttled_cfs_rq.next)
6603 return;
6604
6605 hrtimer_cancel(&cfs_b->period_timer);
6606 hrtimer_cancel(&cfs_b->slack_timer);
6607
6608 /*
6609 * It is possible that we still have some cfs_rq's pending on a CSD
6610 * list, though this race is very rare. In order for this to occur, we
6611 * must have raced with the last task leaving the group while there
6612 * exist throttled cfs_rq(s), and the period_timer must have queued the
6613 * CSD item but the remote cpu has not yet processed it. To handle this,
6614 * we can simply flush all pending CSD work inline here. We're
6615 * guaranteed at this point that no additional cfs_rq of this group can
6616 * join a CSD list.
6617 */
6618 for_each_possible_cpu(i) {
6619 struct rq *rq = cpu_rq(i);
6620 unsigned long flags;
6621
6622 if (list_empty(&rq->cfsb_csd_list))
6623 continue;
6624
6625 local_irq_save(flags);
6626 __cfsb_csd_unthrottle(rq);
6627 local_irq_restore(flags);
6628 }
6629 }
6630
6631 /*
6632 * Both these CPU hotplug callbacks race against unregister_fair_sched_group()
6633 *
6634 * The race is harmless, since modifying bandwidth settings of unhooked group
6635 * bits doesn't do much.
6636 */
6637
6638 /* cpu online callback */
update_runtime_enabled(struct rq * rq)6639 static void __maybe_unused update_runtime_enabled(struct rq *rq)
6640 {
6641 struct task_group *tg;
6642
6643 lockdep_assert_rq_held(rq);
6644
6645 rcu_read_lock();
6646 list_for_each_entry_rcu(tg, &task_groups, list) {
6647 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
6648 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
6649
6650 raw_spin_lock(&cfs_b->lock);
6651 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
6652 raw_spin_unlock(&cfs_b->lock);
6653 }
6654 rcu_read_unlock();
6655 }
6656
6657 /* cpu offline callback */
unthrottle_offline_cfs_rqs(struct rq * rq)6658 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
6659 {
6660 struct task_group *tg;
6661
6662 lockdep_assert_rq_held(rq);
6663
6664 // Do not unthrottle for an active CPU
6665 if (cpumask_test_cpu(cpu_of(rq), cpu_active_mask))
6666 return;
6667
6668 /*
6669 * The rq clock has already been updated in the
6670 * set_rq_offline(), so we should skip updating
6671 * the rq clock again in unthrottle_cfs_rq().
6672 */
6673 rq_clock_start_loop_update(rq);
6674
6675 rcu_read_lock();
6676 list_for_each_entry_rcu(tg, &task_groups, list) {
6677 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
6678
6679 if (!cfs_rq->runtime_enabled)
6680 continue;
6681
6682 /*
6683 * Offline rq is schedulable till CPU is completely disabled
6684 * in take_cpu_down(), so we prevent new cfs throttling here.
6685 */
6686 cfs_rq->runtime_enabled = 0;
6687
6688 if (!cfs_rq_throttled(cfs_rq))
6689 continue;
6690
6691 /*
6692 * clock_task is not advancing so we just need to make sure
6693 * there's some valid quota amount
6694 */
6695 cfs_rq->runtime_remaining = 1;
6696 unthrottle_cfs_rq(cfs_rq);
6697 }
6698 rcu_read_unlock();
6699
6700 rq_clock_stop_loop_update(rq);
6701 }
6702
cfs_task_bw_constrained(struct task_struct * p)6703 bool cfs_task_bw_constrained(struct task_struct *p)
6704 {
6705 struct cfs_rq *cfs_rq = task_cfs_rq(p);
6706
6707 if (!cfs_bandwidth_used())
6708 return false;
6709
6710 if (cfs_rq->runtime_enabled ||
6711 tg_cfs_bandwidth(cfs_rq->tg)->hierarchical_quota != RUNTIME_INF)
6712 return true;
6713
6714 return false;
6715 }
6716
6717 #ifdef CONFIG_NO_HZ_FULL
6718 /* called from pick_next_task_fair() */
sched_fair_update_stop_tick(struct rq * rq,struct task_struct * p)6719 static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
6720 {
6721 int cpu = cpu_of(rq);
6722
6723 if (!cfs_bandwidth_used())
6724 return;
6725
6726 if (!tick_nohz_full_cpu(cpu))
6727 return;
6728
6729 if (rq->nr_running != 1)
6730 return;
6731
6732 /*
6733 * We know there is only one task runnable and we've just picked it. The
6734 * normal enqueue path will have cleared TICK_DEP_BIT_SCHED if we will
6735 * be otherwise able to stop the tick. Just need to check if we are using
6736 * bandwidth control.
6737 */
6738 if (cfs_task_bw_constrained(p))
6739 tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
6740 }
6741 #endif /* CONFIG_NO_HZ_FULL */
6742
6743 #else /* !CONFIG_CFS_BANDWIDTH: */
6744
/*
 * Stubs for builds without CONFIG_CFS_BANDWIDTH: the hooks do nothing and
 * the predicates report "not throttled" / "not constrained".
 */
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static inline void sync_throttle(struct task_group *tg, int cpu) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static void task_throttle_setup_work(struct task_struct *p) {}
static bool task_is_throttled(struct task_struct *p) { return false; }
static void dequeue_throttled_task(struct task_struct *p, int flags) {}
static bool enqueue_throttled_task(struct task_struct *p) { return false; }
static void record_throttle_clock(struct cfs_rq *cfs_rq) {}

/* Without bandwidth control a cfs_rq is never throttled. */
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
	return 0;
}

/* ... and neither is its PELT clock. */
static inline bool cfs_rq_pelt_clock_throttled(struct cfs_rq *cfs_rq)
{
	return false;
}

/* No hierarchy is ever throttled either. */
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
	return 0;
}

/* Always 0 in this configuration, whatever @p and @dst_cpu are. */
static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu)
{
	return 0;
}

#ifdef CONFIG_FAIR_GROUP_SCHED
/* Empty initialisers, only provided with CONFIG_FAIR_GROUP_SCHED. */
void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) {}
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
#endif

/* There is no bandwidth pool to hand out: always NULL. */
static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
{
	return NULL;
}
static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
static inline void update_runtime_enabled(struct rq *rq) {}
static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
#ifdef CONFIG_CGROUP_SCHED
/* No task is bandwidth constrained when bandwidth control is compiled out. */
bool cfs_task_bw_constrained(struct task_struct *p)
{
	return false;
}
#endif
6794 #endif /* !CONFIG_CFS_BANDWIDTH */
6795
6796 #if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL)
/* No-op unless both CONFIG_CFS_BANDWIDTH and CONFIG_NO_HZ_FULL are enabled. */
static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {}
6798 #endif
6799
6800 /**************************************************
6801 * CFS operations on tasks:
6802 */
6803
6804 #ifdef CONFIG_SCHED_HRTICK
hrtick_start_fair(struct rq * rq,struct task_struct * p)6805 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
6806 {
6807 struct sched_entity *se = &p->se;
6808
6809 WARN_ON_ONCE(task_rq(p) != rq);
6810
6811 if (rq->cfs.h_nr_queued > 1) {
6812 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
6813 u64 slice = se->slice;
6814 s64 delta = slice - ran;
6815
6816 if (delta < 0) {
6817 if (task_current_donor(rq, p))
6818 resched_curr(rq);
6819 return;
6820 }
6821 hrtick_start(rq, delta);
6822 }
6823 }
6824
6825 /*
6826 * called from enqueue/dequeue and updates the hrtick when the
6827 * current task is from our class and nr_running is low enough
6828 * to matter.
6829 */
hrtick_update(struct rq * rq)6830 static void hrtick_update(struct rq *rq)
6831 {
6832 struct task_struct *donor = rq->donor;
6833
6834 if (!hrtick_enabled_fair(rq) || donor->sched_class != &fair_sched_class)
6835 return;
6836
6837 hrtick_start_fair(rq, donor);
6838 }
6839 #else /* !CONFIG_SCHED_HRTICK: */
/* !CONFIG_SCHED_HRTICK: both hrtick helpers compile down to nothing. */
static inline void hrtick_start_fair(struct rq *rq, struct task_struct *p) { }

static inline void hrtick_update(struct rq *rq) { }
6848 #endif /* !CONFIG_SCHED_HRTICK */
6849
cpu_overutilized(int cpu)6850 static inline bool cpu_overutilized(int cpu)
6851 {
6852 unsigned long rq_util_min, rq_util_max;
6853
6854 if (!sched_energy_enabled())
6855 return false;
6856
6857 rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
6858 rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
6859
6860 /* Return true only if the utilization doesn't fit CPU's capacity */
6861 return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
6862 }
6863
6864 /*
6865 * overutilized value make sense only if EAS is enabled
6866 */
is_rd_overutilized(struct root_domain * rd)6867 static inline bool is_rd_overutilized(struct root_domain *rd)
6868 {
6869 return !sched_energy_enabled() || READ_ONCE(rd->overutilized);
6870 }
6871
/*
 * Store @flag as the root domain's overutilized state and emit the matching
 * tracepoint. Does nothing when sched_energy_enabled() is false.
 */
static inline void set_rd_overutilized(struct root_domain *rd, bool flag)
{
	if (sched_energy_enabled()) {
		WRITE_ONCE(rd->overutilized, flag);
		trace_sched_overutilized_tp(rd, flag);
	}
}
6880
check_update_overutilized_status(struct rq * rq)6881 static inline void check_update_overutilized_status(struct rq *rq)
6882 {
6883 /*
6884 * overutilized field is used for load balancing decisions only
6885 * if energy aware scheduler is being used
6886 */
6887
6888 if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu))
6889 set_rd_overutilized(rq->rd, 1);
6890 }
6891
6892 /* Runqueue only has SCHED_IDLE tasks enqueued */
sched_idle_rq(struct rq * rq)6893 static int sched_idle_rq(struct rq *rq)
6894 {
6895 return unlikely(rq->nr_running == rq->cfs.h_nr_idle &&
6896 rq->nr_running);
6897 }
6898
/* Same test as sched_idle_rq(), applied to the runqueue of @cpu. */
static int sched_idle_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	return sched_idle_rq(rq);
}
6903
/*
 * Bring a delayed-dequeue entity back into normal service without a full
 * dequeue/enqueue cycle: optionally zero out positive lag, refresh the
 * entity's load average and clear its delayed state.
 */
static void
requeue_delayed_entity(struct sched_entity *se)
{
	struct cfs_rq *cfs_rq = cfs_rq_of(se);

	/*
	 * se->sched_delayed should imply: se->on_rq == 1.
	 * Because a delayed entity is one that is still on
	 * the runqueue competing until eligibility.
	 */
	WARN_ON_ONCE(!se->sched_delayed);
	WARN_ON_ONCE(!se->on_rq);

	if (sched_feat(DELAY_ZERO)) {
		update_entity_lag(cfs_rq, se);
		/* Only positive lag is reset; zero or negative lag is kept. */
		if (se->vlag > 0) {
			/*
			 * Take the entity out of the queue accounting (and out
			 * of the tree unless it is cfs_rq->curr), re-place it
			 * with zero lag, then put it back.
			 */
			cfs_rq->nr_queued--;
			if (se != cfs_rq->curr)
				__dequeue_entity(cfs_rq, se);
			se->vlag = 0;
			place_entity(cfs_rq, se, 0);
			if (se != cfs_rq->curr)
				__enqueue_entity(cfs_rq, se);
			cfs_rq->nr_queued++;
		}
	}

	update_load_avg(cfs_rq, se, 0);
	clear_delayed(se);
}
6934
/*
 * The enqueue_task method is called before nr_running is
 * increased. Here we update the fair scheduling stats and
 * then put the task into the rbtree:
 *
 * @rq:    runqueue the task is being added to
 * @p:     task to enqueue
 * @flags: ENQUEUE_* flags; ENQUEUE_WAKEUP and ENQUEUE_DELAYED are the ones
 *         inspected directly here
 *
 * The function walks @p's entity hierarchy twice: the first loop enqueues
 * every level that is not yet on a runqueue, the second loop only refreshes
 * the statistics of the remaining (already queued) ancestors.
 */
static void
enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;
	int h_nr_idle = task_has_idle_policy(p);
	int h_nr_runnable = 1;
	/* Sampled up front: @flags is overwritten inside the first loop. */
	int task_new = !(flags & ENQUEUE_WAKEUP);
	/* Root-level count before this enqueue, used for dl_server_start(). */
	int rq_h_nr_queued = rq->cfs.h_nr_queued;
	u64 slice = 0;

	/* A throttled task may be handled entirely by enqueue_throttled_task(). */
	if (task_is_throttled(p) && enqueue_throttled_task(p))
		return;

	/*
	 * The code below (indirectly) updates schedutil which looks at
	 * the cfs_rq utilization to select a frequency.
	 * Let's add the task's estimated utilization to the cfs_rq's
	 * estimated utilization, before we update schedutil.
	 */
	if (!p->se.sched_delayed || (flags & ENQUEUE_DELAYED))
		util_est_enqueue(&rq->cfs, p);

	/* ENQUEUE_DELAYED: only requeue the delayed entity, nothing else. */
	if (flags & ENQUEUE_DELAYED) {
		requeue_delayed_entity(se);
		return;
	}

	/*
	 * If in_iowait is set, the code below may not trigger any cpufreq
	 * utilization updates, so do it here explicitly with the IOWAIT flag
	 * passed.
	 */
	if (p->in_iowait)
		cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);

	/* A non-wakeup enqueue of a delayed task adds nothing to h_nr_runnable. */
	if (task_new && se->sched_delayed)
		h_nr_runnable = 0;

	for_each_sched_entity(se) {
		/* First level that is already queued ends the enqueue walk. */
		if (se->on_rq) {
			if (se->sched_delayed)
				requeue_delayed_entity(se);
			break;
		}
		cfs_rq = cfs_rq_of(se);

		/*
		 * Basically set the slice of group entries to the min_slice of
		 * their respective cfs_rq. This ensures the group can service
		 * its entities in the desired time-frame.
		 */
		if (slice) {
			se->slice = slice;
			se->custom_slice = 1;
		}
		enqueue_entity(cfs_rq, se, flags);
		slice = cfs_rq_min_slice(cfs_rq);

		cfs_rq->h_nr_runnable += h_nr_runnable;
		cfs_rq->h_nr_queued++;
		cfs_rq->h_nr_idle += h_nr_idle;

		/* Above an idle cfs_rq the task counts as idle at every level. */
		if (cfs_rq_is_idle(cfs_rq))
			h_nr_idle = 1;

		/* Levels above the first one are enqueued as plain wakeups. */
		flags = ENQUEUE_WAKEUP;
	}

	/* Remaining ancestors are already queued: update their stats only. */
	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);

		update_load_avg(cfs_rq, se, UPDATE_TG);
		se_update_runnable(se);
		update_cfs_group(se);

		se->slice = slice;
		if (se != cfs_rq->curr)
			min_vruntime_cb_propagate(&se->run_node, NULL);
		slice = cfs_rq_min_slice(cfs_rq);

		cfs_rq->h_nr_runnable += h_nr_runnable;
		cfs_rq->h_nr_queued++;
		cfs_rq->h_nr_idle += h_nr_idle;

		if (cfs_rq_is_idle(cfs_rq))
			h_nr_idle = 1;
	}

	/* First fair task on this runqueue: start the fair deadline server. */
	if (!rq_h_nr_queued && rq->cfs.h_nr_queued)
		dl_server_start(&rq->fair_server);

	/* At this point se is NULL and we are at root level */
	add_nr_running(rq, 1);

	/*
	 * Since new tasks are assigned an initial util_avg equal to
	 * half of the spare capacity of their CPU, tiny tasks have the
	 * ability to cross the overutilized threshold, which will
	 * result in the load balancer ruining all the task placement
	 * done by EAS. As a way to mitigate that effect, do not account
	 * for the first enqueue operation of new tasks during the
	 * overutilized flag detection.
	 *
	 * A better way of solving this problem would be to wait for
	 * the PELT signals of tasks to converge before taking them
	 * into account, but that is not straightforward to implement,
	 * and the following generally works well enough in practice.
	 */
	if (!task_new)
		check_update_overutilized_status(rq);

	assert_list_leaf_cfs_rq(rq);

	hrtick_update(rq);
}
7056
/*
 * Basically dequeue_task_fair(), except it can deal with dequeue_entity()
 * failing half-way through and resume the dequeue later.
 *
 * @rq:    runqueue the entity is removed from
 * @se:    entity to start from; may be a task or a group entity
 * @flags: DEQUEUE_* flags
 *
 * Returns:
 * -1 - dequeue delayed (dequeue_entity() refused the task's own entity)
 *  1 - dequeue complete
 *
 * NOTE(review): this header used to list "0 - dequeue throttled" as well,
 * but no path in this function returns 0.
 */
static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
{
	bool was_sched_idle = sched_idle_rq(rq);
	bool task_sleep = flags & DEQUEUE_SLEEP;
	bool task_delayed = flags & DEQUEUE_DELAYED;
	bool task_throttled = flags & DEQUEUE_THROTTLE;
	struct task_struct *p = NULL;
	int h_nr_idle = 0;
	int h_nr_queued = 0;
	int h_nr_runnable = 0;
	struct cfs_rq *cfs_rq;
	u64 slice = 0;

	/* Hierarchical counters only change when a task (not a group) leaves. */
	if (entity_is_task(se)) {
		p = task_of(se);
		h_nr_queued = 1;
		h_nr_idle = task_has_idle_policy(p);
		if (task_sleep || task_delayed || !se->sched_delayed)
			h_nr_runnable = 1;
	}

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);

		if (!dequeue_entity(cfs_rq, se, flags)) {
			/* The task itself was not dequeued: report "delayed". */
			if (p && &p->se == se)
				return -1;

			/* A group entity stayed queued: stop the dequeue walk. */
			slice = cfs_rq_min_slice(cfs_rq);
			break;
		}

		cfs_rq->h_nr_runnable -= h_nr_runnable;
		cfs_rq->h_nr_queued -= h_nr_queued;
		cfs_rq->h_nr_idle -= h_nr_idle;

		if (cfs_rq_is_idle(cfs_rq))
			h_nr_idle = h_nr_queued;

		if (throttled_hierarchy(cfs_rq) && task_throttled)
			record_throttle_clock(cfs_rq);

		/* Don't dequeue parent if it has other entities besides us */
		if (cfs_rq->load.weight) {
			slice = cfs_rq_min_slice(cfs_rq);

			/* Avoid re-evaluating load for this entity: */
			se = parent_entity(se);
			/*
			 * Bias pick_next to pick a task from this cfs_rq, as
			 * p is sleeping when it is within its sched_slice.
			 */
			if (task_sleep && se)
				set_next_buddy(se);
			break;
		}
		/* Parent levels are dequeued as a plain sleep. */
		flags |= DEQUEUE_SLEEP;
		flags &= ~(DEQUEUE_DELAYED | DEQUEUE_SPECIAL);
	}

	/* Remaining ancestors stay queued: update their stats only. */
	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);

		update_load_avg(cfs_rq, se, UPDATE_TG);
		se_update_runnable(se);
		update_cfs_group(se);

		se->slice = slice;
		if (se != cfs_rq->curr)
			min_vruntime_cb_propagate(&se->run_node, NULL);
		slice = cfs_rq_min_slice(cfs_rq);

		cfs_rq->h_nr_runnable -= h_nr_runnable;
		cfs_rq->h_nr_queued -= h_nr_queued;
		cfs_rq->h_nr_idle -= h_nr_idle;

		if (cfs_rq_is_idle(cfs_rq))
			h_nr_idle = h_nr_queued;

		if (throttled_hierarchy(cfs_rq) && task_throttled)
			record_throttle_clock(cfs_rq);
	}

	sub_nr_running(rq, h_nr_queued);

	/* balance early to pull high priority tasks */
	if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
		rq->next_balance = jiffies;

	if (p && task_delayed) {
		WARN_ON_ONCE(!task_sleep);
		WARN_ON_ONCE(p->on_rq != 1);

		/* Fix-up what dequeue_task_fair() skipped */
		hrtick_update(rq);

		/*
		 * Fix-up what block_task() skipped.
		 *
		 * Must be last, @p might not be valid after this.
		 */
		__block_task(rq, p);
	}

	return 1;
}
7172
7173 /*
7174 * The dequeue_task method is called before nr_running is
7175 * decreased. We remove the task from the rbtree and
7176 * update the fair scheduling stats:
7177 */
dequeue_task_fair(struct rq * rq,struct task_struct * p,int flags)7178 static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
7179 {
7180 if (task_is_throttled(p)) {
7181 dequeue_throttled_task(p, flags);
7182 return true;
7183 }
7184
7185 if (!p->se.sched_delayed)
7186 util_est_dequeue(&rq->cfs, p);
7187
7188 util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
7189 if (dequeue_entities(rq, &p->se, flags) < 0)
7190 return false;
7191
7192 /*
7193 * Must not reference @p after dequeue_entities(DEQUEUE_DELAYED).
7194 */
7195
7196 hrtick_update(rq);
7197 return true;
7198 }
7199
cfs_h_nr_delayed(struct rq * rq)7200 static inline unsigned int cfs_h_nr_delayed(struct rq *rq)
7201 {
7202 return (rq->cfs.h_nr_queued - rq->cfs.h_nr_runnable);
7203 }
7204
/* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */
static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
/* Per-CPU scratch cpumask used by the idle-CPU selection helpers below. */
static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
/* NOTE(review): presumably scratch space for should_we_balance() - confirm. */
static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
7209
7210 #ifdef CONFIG_NO_HZ_COMMON
7211
/* File-local NOHZ bookkeeping; only built with CONFIG_NO_HZ_COMMON. */
static struct {
	cpumask_var_t idle_cpus_mask;
	int has_blocked_load;		/* Idle CPUs have blocked load */
	int needs_update;		/* Newly idle CPUs need their next_balance collated */
	unsigned long next_balance;	/* in jiffy units */
	unsigned long next_blocked;	/* Next update of blocked load in jiffies */
} nohz ____cacheline_aligned;
7219
7220 #endif /* CONFIG_NO_HZ_COMMON */
7221
cpu_load(struct rq * rq)7222 static unsigned long cpu_load(struct rq *rq)
7223 {
7224 return cfs_rq_load_avg(&rq->cfs);
7225 }
7226
7227 /*
7228 * cpu_load_without - compute CPU load without any contributions from *p
7229 * @cpu: the CPU which load is requested
7230 * @p: the task which load should be discounted
7231 *
7232 * The load of a CPU is defined by the load of tasks currently enqueued on that
7233 * CPU as well as tasks which are currently sleeping after an execution on that
7234 * CPU.
7235 *
7236 * This method returns the load of the specified CPU by discounting the load of
7237 * the specified task, whenever the task is currently contributing to the CPU
7238 * load.
7239 */
cpu_load_without(struct rq * rq,struct task_struct * p)7240 static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
7241 {
7242 struct cfs_rq *cfs_rq;
7243 unsigned int load;
7244
7245 /* Task has no contribution or is new */
7246 if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
7247 return cpu_load(rq);
7248
7249 cfs_rq = &rq->cfs;
7250 load = READ_ONCE(cfs_rq->avg.load_avg);
7251
7252 /* Discount task's util from CPU's util */
7253 lsub_positive(&load, task_h_load(p));
7254
7255 return load;
7256 }
7257
cpu_runnable(struct rq * rq)7258 static unsigned long cpu_runnable(struct rq *rq)
7259 {
7260 return cfs_rq_runnable_avg(&rq->cfs);
7261 }
7262
cpu_runnable_without(struct rq * rq,struct task_struct * p)7263 static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p)
7264 {
7265 struct cfs_rq *cfs_rq;
7266 unsigned int runnable;
7267
7268 /* Task has no contribution or is new */
7269 if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
7270 return cpu_runnable(rq);
7271
7272 cfs_rq = &rq->cfs;
7273 runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
7274
7275 /* Discount task's runnable from CPU's runnable */
7276 lsub_positive(&runnable, p->se.avg.runnable_avg);
7277
7278 return runnable;
7279 }
7280
capacity_of(int cpu)7281 static unsigned long capacity_of(int cpu)
7282 {
7283 return cpu_rq(cpu)->cpu_capacity;
7284 }
7285
record_wakee(struct task_struct * p)7286 static void record_wakee(struct task_struct *p)
7287 {
7288 /*
7289 * Only decay a single time; tasks that have less then 1 wakeup per
7290 * jiffy will not have built up many flips.
7291 */
7292 if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
7293 current->wakee_flips >>= 1;
7294 current->wakee_flip_decay_ts = jiffies;
7295 }
7296
7297 if (current->last_wakee != p) {
7298 current->last_wakee = p;
7299 current->wakee_flips++;
7300 }
7301 }
7302
7303 /*
7304 * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
7305 *
7306 * A waker of many should wake a different task than the one last awakened
7307 * at a frequency roughly N times higher than one of its wakees.
7308 *
7309 * In order to determine whether we should let the load spread vs consolidating
7310 * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
7311 * partner, and a factor of lls_size higher frequency in the other.
7312 *
7313 * With both conditions met, we can be relatively sure that the relationship is
7314 * non-monogamous, with partner count exceeding socket size.
7315 *
7316 * Waker/wakee being client/server, worker/dispatcher, interrupt source or
7317 * whatever is irrelevant, spread criteria is apparent partner count exceeds
7318 * socket size.
7319 */
wake_wide(struct task_struct * p)7320 static int wake_wide(struct task_struct *p)
7321 {
7322 unsigned int master = current->wakee_flips;
7323 unsigned int slave = p->wakee_flips;
7324 int factor = __this_cpu_read(sd_llc_size);
7325
7326 if (master < slave)
7327 swap(master, slave);
7328 if (slave < factor || master < slave * factor)
7329 return 0;
7330 return 1;
7331 }
7332
7333 /*
7334 * The purpose of wake_affine() is to quickly determine on which CPU we can run
7335 * soonest. For the purpose of speed we only consider the waking and previous
7336 * CPU.
7337 *
7338 * wake_affine_idle() - only considers 'now', it check if the waking CPU is
7339 * cache-affine and is (or will be) idle.
7340 *
7341 * wake_affine_weight() - considers the weight to reflect the average
7342 * scheduling latency of the CPUs. This seems to work
7343 * for the overloaded case.
7344 */
7345 static int
wake_affine_idle(int this_cpu,int prev_cpu,int sync)7346 wake_affine_idle(int this_cpu, int prev_cpu, int sync)
7347 {
7348 /*
7349 * If this_cpu is idle, it implies the wakeup is from interrupt
7350 * context. Only allow the move if cache is shared. Otherwise an
7351 * interrupt intensive workload could force all tasks onto one
7352 * node depending on the IO topology or IRQ affinity settings.
7353 *
7354 * If the prev_cpu is idle and cache affine then avoid a migration.
7355 * There is no guarantee that the cache hot data from an interrupt
7356 * is more important than cache hot data on the prev_cpu and from
7357 * a cpufreq perspective, it's better to have higher utilisation
7358 * on one CPU.
7359 */
7360 if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
7361 return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
7362
7363 if (sync) {
7364 struct rq *rq = cpu_rq(this_cpu);
7365
7366 if ((rq->nr_running - cfs_h_nr_delayed(rq)) == 1)
7367 return this_cpu;
7368 }
7369
7370 if (available_idle_cpu(prev_cpu))
7371 return prev_cpu;
7372
7373 return nr_cpumask_bits;
7374 }
7375
7376 static int
wake_affine_weight(struct sched_domain * sd,struct task_struct * p,int this_cpu,int prev_cpu,int sync)7377 wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
7378 int this_cpu, int prev_cpu, int sync)
7379 {
7380 s64 this_eff_load, prev_eff_load;
7381 unsigned long task_load;
7382
7383 this_eff_load = cpu_load(cpu_rq(this_cpu));
7384
7385 if (sync) {
7386 unsigned long current_load = task_h_load(current);
7387
7388 if (current_load > this_eff_load)
7389 return this_cpu;
7390
7391 this_eff_load -= current_load;
7392 }
7393
7394 task_load = task_h_load(p);
7395
7396 this_eff_load += task_load;
7397 if (sched_feat(WA_BIAS))
7398 this_eff_load *= 100;
7399 this_eff_load *= capacity_of(prev_cpu);
7400
7401 prev_eff_load = cpu_load(cpu_rq(prev_cpu));
7402 prev_eff_load -= task_load;
7403 if (sched_feat(WA_BIAS))
7404 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
7405 prev_eff_load *= capacity_of(this_cpu);
7406
7407 /*
7408 * If sync, adjust the weight of prev_eff_load such that if
7409 * prev_eff == this_eff that select_idle_sibling() will consider
7410 * stacking the wakee on top of the waker if no other CPU is
7411 * idle.
7412 */
7413 if (sync)
7414 prev_eff_load += 1;
7415
7416 return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
7417 }
7418
wake_affine(struct sched_domain * sd,struct task_struct * p,int this_cpu,int prev_cpu,int sync)7419 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
7420 int this_cpu, int prev_cpu, int sync)
7421 {
7422 int target = nr_cpumask_bits;
7423
7424 if (sched_feat(WA_IDLE))
7425 target = wake_affine_idle(this_cpu, prev_cpu, sync);
7426
7427 if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
7428 target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
7429
7430 schedstat_inc(p->stats.nr_wakeups_affine_attempts);
7431 if (target != this_cpu)
7432 return prev_cpu;
7433
7434 schedstat_inc(sd->ttwu_move_affine);
7435 schedstat_inc(p->stats.nr_wakeups_affine);
7436 return target;
7437 }
7438
/* Forward declaration: needed by sched_balance_find_dst_cpu() below. */
static struct sched_group *
sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
7441
7442 /*
7443 * sched_balance_find_dst_group_cpu - find the idlest CPU among the CPUs in the group.
7444 */
7445 static int
sched_balance_find_dst_group_cpu(struct sched_group * group,struct task_struct * p,int this_cpu)7446 sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
7447 {
7448 unsigned long load, min_load = ULONG_MAX;
7449 unsigned int min_exit_latency = UINT_MAX;
7450 u64 latest_idle_timestamp = 0;
7451 int least_loaded_cpu = this_cpu;
7452 int shallowest_idle_cpu = -1;
7453 int i;
7454
7455 /* Check if we have any choice: */
7456 if (group->group_weight == 1)
7457 return cpumask_first(sched_group_span(group));
7458
7459 /* Traverse only the allowed CPUs */
7460 for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
7461 struct rq *rq = cpu_rq(i);
7462
7463 if (!sched_core_cookie_match(rq, p))
7464 continue;
7465
7466 if (sched_idle_cpu(i))
7467 return i;
7468
7469 if (available_idle_cpu(i)) {
7470 struct cpuidle_state *idle = idle_get_state(rq);
7471 if (idle && idle->exit_latency < min_exit_latency) {
7472 /*
7473 * We give priority to a CPU whose idle state
7474 * has the smallest exit latency irrespective
7475 * of any idle timestamp.
7476 */
7477 min_exit_latency = idle->exit_latency;
7478 latest_idle_timestamp = rq->idle_stamp;
7479 shallowest_idle_cpu = i;
7480 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
7481 rq->idle_stamp > latest_idle_timestamp) {
7482 /*
7483 * If equal or no active idle state, then
7484 * the most recently idled CPU might have
7485 * a warmer cache.
7486 */
7487 latest_idle_timestamp = rq->idle_stamp;
7488 shallowest_idle_cpu = i;
7489 }
7490 } else if (shallowest_idle_cpu == -1) {
7491 load = cpu_load(cpu_rq(i));
7492 if (load < min_load) {
7493 min_load = load;
7494 least_loaded_cpu = i;
7495 }
7496 }
7497 }
7498
7499 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
7500 }
7501
/*
 * Walk down the sched-domain hierarchy starting at @sd, at each level that
 * has @sd_flag set picking a destination group and then a CPU within it.
 *
 * @sd:       domain to start from
 * @p:        task being placed
 * @cpu:      CPU the search starts from
 * @prev_cpu: returned as-is when @sd's span has no CPU allowed for @p
 * @sd_flag:  SD_BALANCE_* flag selecting which domains take part
 *
 * Returns the CPU selected by the last successful group/CPU lookup, or @cpu
 * when no level produced a different choice.
 */
static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct task_struct *p,
				  int cpu, int prev_cpu, int sd_flag)
{
	int new_cpu = cpu;

	if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
		return prev_cpu;

	/*
	 * We need task's util for cpu_util_without, sync it up to
	 * prev_cpu's last_update_time.
	 */
	if (!(sd_flag & SD_BALANCE_FORK))
		sync_entity_load_avg(&p->se);

	while (sd) {
		struct sched_group *group;
		struct sched_domain *tmp;
		int weight;

		/* Domain does not take part in this kind of balancing. */
		if (!(sd->flags & sd_flag)) {
			sd = sd->child;
			continue;
		}

		group = sched_balance_find_dst_group(sd, p, cpu);
		if (!group) {
			sd = sd->child;
			continue;
		}

		new_cpu = sched_balance_find_dst_group_cpu(group, p, cpu);
		if (new_cpu == cpu) {
			/* Now try balancing at a lower domain level of 'cpu': */
			sd = sd->child;
			continue;
		}

		/* Now try balancing at a lower domain level of 'new_cpu': */
		cpu = new_cpu;
		weight = sd->span_weight;
		sd = NULL;
		/*
		 * Among new_cpu's domains narrower than the one just used,
		 * keep the last one visited that has @sd_flag set.
		 */
		for_each_domain(cpu, tmp) {
			if (weight <= tmp->span_weight)
				break;
			if (tmp->flags & sd_flag)
				sd = tmp;
		}
	}

	return new_cpu;
}
7554
/*
 * Return @cpu if it is available-idle or sched-idle and its runqueue passes
 * the core-cookie check for @p; -1 otherwise.
 */
static inline int __select_idle_cpu(int cpu, struct task_struct *p)
{
	if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
		return -1;

	if (!sched_cpu_cookie_match(cpu_rq(cpu), p))
		return -1;

	return cpu;
}
7563
7564 #ifdef CONFIG_SCHED_SMT
/* Static key, defined false by default and exported to GPL modules. */
DEFINE_STATIC_KEY_FALSE(sched_smt_present);
EXPORT_SYMBOL_GPL(sched_smt_present);
7567
set_idle_cores(int cpu,int val)7568 static inline void set_idle_cores(int cpu, int val)
7569 {
7570 struct sched_domain_shared *sds;
7571
7572 sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
7573 if (sds)
7574 WRITE_ONCE(sds->has_idle_cores, val);
7575 }
7576
test_idle_cores(int cpu)7577 static inline bool test_idle_cores(int cpu)
7578 {
7579 struct sched_domain_shared *sds;
7580
7581 sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
7582 if (sds)
7583 return READ_ONCE(sds->has_idle_cores);
7584
7585 return false;
7586 }
7587
7588 /*
7589 * Scans the local SMT mask to see if the entire core is idle, and records this
7590 * information in sd_llc_shared->has_idle_cores.
7591 *
7592 * Since SMT siblings share all cache levels, inspecting this limited remote
7593 * state should be fairly cheap.
7594 */
__update_idle_core(struct rq * rq)7595 void __update_idle_core(struct rq *rq)
7596 {
7597 int core = cpu_of(rq);
7598 int cpu;
7599
7600 rcu_read_lock();
7601 if (test_idle_cores(core))
7602 goto unlock;
7603
7604 for_each_cpu(cpu, cpu_smt_mask(core)) {
7605 if (cpu == core)
7606 continue;
7607
7608 if (!available_idle_cpu(cpu))
7609 goto unlock;
7610 }
7611
7612 set_idle_cores(core, 1);
7613 unlock:
7614 rcu_read_unlock();
7615 }
7616
/*
 * Scan the entire LLC domain for idle cores; this dynamically switches off if
 * there are no idle cores left in the system; tracked through
 * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
 *
 * This helper examines one core: it returns @core when every CPU in
 * cpu_smt_mask(@core) is available-idle. Otherwise it removes that SMT mask
 * from @cpus and returns -1.
 *
 * While scanning, and only while *@idle_cpu is still -1, the first sibling
 * that is in @cpus and is either available-idle or sched-idle is stored in
 * *@idle_cpu as a fallback candidate for the caller.
 */
static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
{
	bool idle = true;
	int cpu;

	for_each_cpu(cpu, cpu_smt_mask(core)) {
		if (!available_idle_cpu(cpu)) {
			/* One busy sibling is enough: the core is not idle. */
			idle = false;
			if (*idle_cpu == -1) {
				/* Still looking for a fallback: accept a sched-idle CPU. */
				if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) {
					*idle_cpu = cpu;
					break;
				}
				continue;
			}
			/* Fallback already known; no reason to scan further. */
			break;
		}
		if (*idle_cpu == -1 && cpumask_test_cpu(cpu, cpus))
			*idle_cpu = cpu;
	}

	if (idle)
		return core;

	/* Drop this core's siblings so the caller does not visit them again. */
	cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
	return -1;
}
7649
7650 /*
7651 * Scan the local SMT mask for idle CPUs.
7652 */
select_idle_smt(struct task_struct * p,struct sched_domain * sd,int target)7653 static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
7654 {
7655 int cpu;
7656
7657 for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) {
7658 if (cpu == target)
7659 continue;
7660 /*
7661 * Check if the CPU is in the LLC scheduling domain of @target.
7662 * Due to isolcpus, there is no guarantee that all the siblings are in the domain.
7663 */
7664 if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
7665 continue;
7666 if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
7667 return cpu;
7668 }
7669
7670 return -1;
7671 }
7672
7673 #else /* !CONFIG_SCHED_SMT: */
7674
/* !CONFIG_SCHED_SMT: there is no idle-core state to record. */
static inline void set_idle_cores(int cpu, int val) { }
7678
/* !CONFIG_SCHED_SMT: idle cores are never reported. */
static inline bool test_idle_cores(int cpu)
{
	return false;
}
7683
/*
 * !CONFIG_SCHED_SMT: a "core" is a single CPU, so this reduces to
 * __select_idle_cpu() on @core; @cpus and @idle_cpu are unused.
 */
static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
{
	int cpu = __select_idle_cpu(core, p);

	return cpu;
}
7688
/* !CONFIG_SCHED_SMT: there are no SMT siblings to scan, always -1. */
static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
	return -1;
}
7693
7694 #endif /* !CONFIG_SCHED_SMT */
7695
/*
 * Scan the LLC domain for idle CPUs; this is dynamically regulated by
 * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
 * average idle time for this rq (as found in rq->avg_idle).
 *
 * @p:             task being placed
 * @sd:            domain whose span is scanned
 * @has_idle_core: scan whole cores via select_idle_core() instead of
 *                 single CPUs via __select_idle_cpu()
 * @target:        CPU the wrap-around scan starts after
 *
 * Returns the selected CPU, or -1 when none was found or the scan budget
 * ran out.
 */
static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target)
{
	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
	int i, cpu, idle_cpu = -1, nr = INT_MAX;
	struct sched_domain_shared *sd_share;

	/* Candidates: CPUs of the domain that @p is allowed to run on. */
	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);

	if (sched_feat(SIS_UTIL)) {
		sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, target));
		if (sd_share) {
			/* because !--nr is the condition to stop scan */
			nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
			/* overloaded LLC is unlikely to have idle cpu/core */
			if (nr == 1)
				return -1;
		}
	}

	if (static_branch_unlikely(&sched_cluster_active)) {
		struct sched_group *sg = sd->groups;

		/* Scan the first group first when it is flagged SD_CLUSTER. */
		if (sg->flags & SD_CLUSTER) {
			for_each_cpu_wrap(cpu, sched_group_span(sg), target + 1) {
				if (!cpumask_test_cpu(cpu, cpus))
					continue;

				if (has_idle_core) {
					i = select_idle_core(p, cpu, cpus, &idle_cpu);
					if ((unsigned int)i < nr_cpumask_bits)
						return i;
				} else {
					if (--nr <= 0)
						return -1;
					idle_cpu = __select_idle_cpu(cpu, p);
					if ((unsigned int)idle_cpu < nr_cpumask_bits)
						return idle_cpu;
				}
			}
			/* That group has been scanned: exclude it from the main scan. */
			cpumask_andnot(cpus, cpus, sched_group_span(sg));
		}
	}

	for_each_cpu_wrap(cpu, cpus, target + 1) {
		if (has_idle_core) {
			i = select_idle_core(p, cpu, cpus, &idle_cpu);
			if ((unsigned int)i < nr_cpumask_bits)
				return i;

		} else {
			if (--nr <= 0)
				return -1;
			idle_cpu = __select_idle_cpu(cpu, p);
			if ((unsigned int)idle_cpu < nr_cpumask_bits)
				break;
		}
	}

	/* No fully idle core was returned above: clear the idle-cores hint. */
	if (has_idle_core)
		set_idle_cores(target, false);

	return idle_cpu;
}
7764
7765 /*
7766 * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
7767 * the task fits. If no CPU is big enough, but there are idle ones, try to
7768 * maximize capacity.
7769 */
7770 static int
select_idle_capacity(struct task_struct * p,struct sched_domain * sd,int target)7771 select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
7772 {
7773 unsigned long task_util, util_min, util_max, best_cap = 0;
7774 int fits, best_fits = 0;
7775 int cpu, best_cpu = -1;
7776 struct cpumask *cpus;
7777
7778 cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
7779 cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
7780
7781 task_util = task_util_est(p);
7782 util_min = uclamp_eff_value(p, UCLAMP_MIN);
7783 util_max = uclamp_eff_value(p, UCLAMP_MAX);
7784
7785 for_each_cpu_wrap(cpu, cpus, target) {
7786 unsigned long cpu_cap = capacity_of(cpu);
7787
7788 if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
7789 continue;
7790
7791 fits = util_fits_cpu(task_util, util_min, util_max, cpu);
7792
7793 /* This CPU fits with all requirements */
7794 if (fits > 0)
7795 return cpu;
7796 /*
7797 * Only the min performance hint (i.e. uclamp_min) doesn't fit.
7798 * Look for the CPU with best capacity.
7799 */
7800 else if (fits < 0)
7801 cpu_cap = get_actual_cpu_capacity(cpu);
7802
7803 /*
7804 * First, select CPU which fits better (-1 being better than 0).
7805 * Then, select the one with best capacity at same level.
7806 */
7807 if ((fits < best_fits) ||
7808 ((fits == best_fits) && (cpu_cap > best_cap))) {
7809 best_cap = cpu_cap;
7810 best_cpu = cpu;
7811 best_fits = fits;
7812 }
7813 }
7814
7815 return best_cpu;
7816 }
7817
asym_fits_cpu(unsigned long util,unsigned long util_min,unsigned long util_max,int cpu)7818 static inline bool asym_fits_cpu(unsigned long util,
7819 unsigned long util_min,
7820 unsigned long util_max,
7821 int cpu)
7822 {
7823 if (sched_asym_cpucap_active())
7824 /*
7825 * Return true only if the cpu fully fits the task requirements
7826 * which include the utilization and the performance hints.
7827 */
7828 return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
7829
7830 return true;
7831 }
7832
7833 /*
7834 * Try and locate an idle core/thread in the LLC cache domain.
7835 */
select_idle_sibling(struct task_struct * p,int prev,int target)7836 static int select_idle_sibling(struct task_struct *p, int prev, int target)
7837 {
7838 bool has_idle_core = false;
7839 struct sched_domain *sd;
7840 unsigned long task_util, util_min, util_max;
7841 int i, recent_used_cpu, prev_aff = -1;
7842
7843 /*
7844 * On asymmetric system, update task utilization because we will check
7845 * that the task fits with CPU's capacity.
7846 */
7847 if (sched_asym_cpucap_active()) {
7848 sync_entity_load_avg(&p->se);
7849 task_util = task_util_est(p);
7850 util_min = uclamp_eff_value(p, UCLAMP_MIN);
7851 util_max = uclamp_eff_value(p, UCLAMP_MAX);
7852 }
7853
7854 /*
7855 * per-cpu select_rq_mask usage
7856 */
7857 lockdep_assert_irqs_disabled();
7858
7859 if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
7860 asym_fits_cpu(task_util, util_min, util_max, target))
7861 return target;
7862
7863 /*
7864 * If the previous CPU is cache affine and idle, don't be stupid:
7865 */
7866 if (prev != target && cpus_share_cache(prev, target) &&
7867 (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
7868 asym_fits_cpu(task_util, util_min, util_max, prev)) {
7869
7870 if (!static_branch_unlikely(&sched_cluster_active) ||
7871 cpus_share_resources(prev, target))
7872 return prev;
7873
7874 prev_aff = prev;
7875 }
7876
7877 /*
7878 * Allow a per-cpu kthread to stack with the wakee if the
7879 * kworker thread and the tasks previous CPUs are the same.
7880 * The assumption is that the wakee queued work for the
7881 * per-cpu kthread that is now complete and the wakeup is
7882 * essentially a sync wakeup. An obvious example of this
7883 * pattern is IO completions.
7884 */
7885 if (is_per_cpu_kthread(current) &&
7886 in_task() &&
7887 prev == smp_processor_id() &&
7888 this_rq()->nr_running <= 1 &&
7889 asym_fits_cpu(task_util, util_min, util_max, prev)) {
7890 return prev;
7891 }
7892
7893 /* Check a recently used CPU as a potential idle candidate: */
7894 recent_used_cpu = p->recent_used_cpu;
7895 p->recent_used_cpu = prev;
7896 if (recent_used_cpu != prev &&
7897 recent_used_cpu != target &&
7898 cpus_share_cache(recent_used_cpu, target) &&
7899 (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
7900 cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
7901 asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
7902
7903 if (!static_branch_unlikely(&sched_cluster_active) ||
7904 cpus_share_resources(recent_used_cpu, target))
7905 return recent_used_cpu;
7906
7907 } else {
7908 recent_used_cpu = -1;
7909 }
7910
7911 /*
7912 * For asymmetric CPU capacity systems, our domain of interest is
7913 * sd_asym_cpucapacity rather than sd_llc.
7914 */
7915 if (sched_asym_cpucap_active()) {
7916 sd = rcu_dereference_all(per_cpu(sd_asym_cpucapacity, target));
7917 /*
7918 * On an asymmetric CPU capacity system where an exclusive
7919 * cpuset defines a symmetric island (i.e. one unique
7920 * capacity_orig value through the cpuset), the key will be set
7921 * but the CPUs within that cpuset will not have a domain with
7922 * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
7923 * capacity path.
7924 */
7925 if (sd) {
7926 i = select_idle_capacity(p, sd, target);
7927 return ((unsigned)i < nr_cpumask_bits) ? i : target;
7928 }
7929 }
7930
7931 sd = rcu_dereference_all(per_cpu(sd_llc, target));
7932 if (!sd)
7933 return target;
7934
7935 if (sched_smt_active()) {
7936 has_idle_core = test_idle_cores(target);
7937
7938 if (!has_idle_core && cpus_share_cache(prev, target)) {
7939 i = select_idle_smt(p, sd, prev);
7940 if ((unsigned int)i < nr_cpumask_bits)
7941 return i;
7942 }
7943 }
7944
7945 i = select_idle_cpu(p, sd, has_idle_core, target);
7946 if ((unsigned)i < nr_cpumask_bits)
7947 return i;
7948
7949 /*
7950 * For cluster machines which have lower sharing cache like L2 or
7951 * LLC Tag, we tend to find an idle CPU in the target's cluster
7952 * first. But prev_cpu or recent_used_cpu may also be a good candidate,
7953 * use them if possible when no idle CPU found in select_idle_cpu().
7954 */
7955 if ((unsigned int)prev_aff < nr_cpumask_bits)
7956 return prev_aff;
7957 if ((unsigned int)recent_used_cpu < nr_cpumask_bits)
7958 return recent_used_cpu;
7959
7960 return target;
7961 }
7962
7963 /**
7964 * cpu_util() - Estimates the amount of CPU capacity used by CFS tasks.
7965 * @cpu: the CPU to get the utilization for
7966 * @p: task for which the CPU utilization should be predicted or NULL
7967 * @dst_cpu: CPU @p migrates to, -1 if @p moves from @cpu or @p == NULL
7968 * @boost: 1 to enable boosting, otherwise 0
7969 *
7970 * The unit of the return value must be the same as the one of CPU capacity
7971 * so that CPU utilization can be compared with CPU capacity.
7972 *
7973 * CPU utilization is the sum of running time of runnable tasks plus the
7974 * recent utilization of currently non-runnable tasks on that CPU.
7975 * It represents the amount of CPU capacity currently used by CFS tasks in
7976 * the range [0..max CPU capacity] with max CPU capacity being the CPU
7977 * capacity at f_max.
7978 *
7979 * The estimated CPU utilization is defined as the maximum between CPU
7980 * utilization and sum of the estimated utilization of the currently
7981 * runnable tasks on that CPU. It preserves a utilization "snapshot" of
7982 * previously-executed tasks, which helps better deduce how busy a CPU will
7983 * be when a long-sleeping task wakes up. The contribution to CPU utilization
7984 * of such a task would be significantly decayed at this point of time.
7985 *
7986 * Boosted CPU utilization is defined as max(CPU runnable, CPU utilization).
7987 * CPU contention for CFS tasks can be detected by CPU runnable > CPU
7988 * utilization. Boosting is implemented in cpu_util() so that internal
7989 * users (e.g. EAS) can use it next to external users (e.g. schedutil),
7990 * latter via cpu_util_cfs_boost().
7991 *
7992 * CPU utilization can be higher than the current CPU capacity
7993 * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because
7994 * of rounding errors as well as task migrations or wakeups of new tasks.
7995 * CPU utilization has to be capped to fit into the [0..max CPU capacity]
7996 * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%)
7997 * could be seen as over-utilized even though CPU1 has 20% of spare CPU
7998 * capacity. CPU utilization is allowed to overshoot current CPU capacity
7999 * though since this is useful for predicting the CPU capacity required
8000 * after task migrations (scheduler-driven DVFS).
8001 *
8002 * Return: (Boosted) (estimated) utilization for the specified CPU.
8003 */
8004 static unsigned long
cpu_util(int cpu,struct task_struct * p,int dst_cpu,int boost)8005 cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
8006 {
8007 struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
8008 unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
8009 unsigned long runnable;
8010
8011 if (boost) {
8012 runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
8013 util = max(util, runnable);
8014 }
8015
8016 /*
8017 * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its
8018 * contribution. If @p migrates from another CPU to @cpu add its
8019 * contribution. In all the other cases @cpu is not impacted by the
8020 * migration so its util_avg is already correct.
8021 */
8022 if (p && task_cpu(p) == cpu && dst_cpu != cpu)
8023 lsub_positive(&util, task_util(p));
8024 else if (p && task_cpu(p) != cpu && dst_cpu == cpu)
8025 util += task_util(p);
8026
8027 if (sched_feat(UTIL_EST)) {
8028 unsigned long util_est;
8029
8030 util_est = READ_ONCE(cfs_rq->avg.util_est);
8031
8032 /*
8033 * During wake-up @p isn't enqueued yet and doesn't contribute
8034 * to any cpu_rq(cpu)->cfs.avg.util_est.
8035 * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p
8036 * has been enqueued.
8037 *
8038 * During exec (@dst_cpu = -1) @p is enqueued and does
8039 * contribute to cpu_rq(cpu)->cfs.util_est.
8040 * Remove it to "simulate" cpu_util without @p's contribution.
8041 *
8042 * Despite the task_on_rq_queued(@p) check there is still a
8043 * small window for a possible race when an exec
8044 * select_task_rq_fair() races with LB's detach_task().
8045 *
8046 * detach_task()
8047 * deactivate_task()
8048 * p->on_rq = TASK_ON_RQ_MIGRATING;
8049 * -------------------------------- A
8050 * dequeue_task() \
8051 * dequeue_task_fair() + Race Time
8052 * util_est_dequeue() /
8053 * -------------------------------- B
8054 *
8055 * The additional check "current == p" is required to further
8056 * reduce the race window.
8057 */
8058 if (dst_cpu == cpu)
8059 util_est += _task_util_est(p);
8060 else if (p && unlikely(task_on_rq_queued(p) || current == p))
8061 lsub_positive(&util_est, _task_util_est(p));
8062
8063 util = max(util, util_est);
8064 }
8065
8066 return min(util, arch_scale_cpu_capacity(cpu));
8067 }
8068
/*
 * cpu_util_cfs - CFS utilization of @cpu as computed by cpu_util(), with no
 * task adjustment (NULL task, @dst_cpu = -1) and boosting disabled.
 */
unsigned long cpu_util_cfs(int cpu)
{
	unsigned long util = cpu_util(cpu, NULL, -1, 0);

	return util;
}
8073
/*
 * cpu_util_cfs_boost - CFS utilization of @cpu as computed by cpu_util(),
 * with no task adjustment (NULL task, @dst_cpu = -1) and boosting enabled.
 */
unsigned long cpu_util_cfs_boost(int cpu)
{
	unsigned long boosted = cpu_util(cpu, NULL, -1, 1);

	return boosted;
}
8078
8079 /*
8080 * cpu_util_without: compute cpu utilization without any contributions from *p
8081 * @cpu: the CPU which utilization is requested
8082 * @p: the task which utilization should be discounted
8083 *
8084 * The utilization of a CPU is defined by the utilization of tasks currently
8085 * enqueued on that CPU as well as tasks which are currently sleeping after an
8086 * execution on that CPU.
8087 *
8088 * This method returns the utilization of the specified CPU by discounting the
8089 * utilization of the specified task, whenever the task is currently
8090 * contributing to the CPU utilization.
8091 */
cpu_util_without(int cpu,struct task_struct * p)8092 static unsigned long cpu_util_without(int cpu, struct task_struct *p)
8093 {
8094 /* Task has no contribution or is new */
8095 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
8096 p = NULL;
8097
8098 return cpu_util(cpu, p, -1, 0);
8099 }
8100
8101 /*
8102 * This function computes an effective utilization for the given CPU, to be
8103 * used for frequency selection given the linear relation: f = u * f_max.
8104 *
8105 * The scheduler tracks the following metrics:
8106 *
8107 * cpu_util_{cfs,rt,dl,irq}()
8108 * cpu_bw_dl()
8109 *
8110 * Where the cfs,rt and dl util numbers are tracked with the same metric and
8111 * synchronized windows and are thus directly comparable.
8112 *
8113 * The cfs,rt,dl utilization are the running times measured with rq->clock_task
8114 * which excludes things like IRQ and steal-time. These latter are then accrued
8115 * in the IRQ utilization.
8116 *
8117 * The DL bandwidth number OTOH is not a measured metric but a value computed
8118 * based on the task model parameters and gives the minimal utilization
8119 * required to meet deadlines.
8120 */
effective_cpu_util(int cpu,unsigned long util_cfs,unsigned long * min,unsigned long * max)8121 unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
8122 unsigned long *min,
8123 unsigned long *max)
8124 {
8125 unsigned long util, irq, scale;
8126 struct rq *rq = cpu_rq(cpu);
8127
8128 scale = arch_scale_cpu_capacity(cpu);
8129
8130 /*
8131 * Early check to see if IRQ/steal time saturates the CPU, can be
8132 * because of inaccuracies in how we track these -- see
8133 * update_irq_load_avg().
8134 */
8135 irq = cpu_util_irq(rq);
8136 if (unlikely(irq >= scale)) {
8137 if (min)
8138 *min = scale;
8139 if (max)
8140 *max = scale;
8141 return scale;
8142 }
8143
8144 if (min) {
8145 /*
8146 * The minimum utilization returns the highest level between:
8147 * - the computed DL bandwidth needed with the IRQ pressure which
8148 * steals time to the deadline task.
8149 * - The minimum performance requirement for CFS and/or RT.
8150 */
8151 *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
8152
8153 /*
8154 * When an RT task is runnable and uclamp is not used, we must
8155 * ensure that the task will run at maximum compute capacity.
8156 */
8157 if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
8158 *min = max(*min, scale);
8159 }
8160
8161 /*
8162 * Because the time spend on RT/DL tasks is visible as 'lost' time to
8163 * CFS tasks and we use the same metric to track the effective
8164 * utilization (PELT windows are synchronized) we can directly add them
8165 * to obtain the CPU's actual utilization.
8166 */
8167 util = util_cfs + cpu_util_rt(rq);
8168 util += cpu_util_dl(rq);
8169
8170 /*
8171 * The maximum hint is a soft bandwidth requirement, which can be lower
8172 * than the actual utilization because of uclamp_max requirements.
8173 */
8174 if (max)
8175 *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
8176
8177 if (util >= scale)
8178 return scale;
8179
8180 /*
8181 * There is still idle time; further improve the number by using the
8182 * IRQ metric. Because IRQ/steal time is hidden from the task clock we
8183 * need to scale the task numbers:
8184 *
8185 * max - irq
8186 * U' = irq + --------- * U
8187 * max
8188 */
8189 util = scale_irq_capacity(util, irq, scale);
8190 util += irq;
8191
8192 return min(scale, util);
8193 }
8194
/*
 * sched_cpu_util - effective utilization of @cpu, computed from its CFS
 * utilization; the min/max outputs of effective_cpu_util() are not requested.
 */
unsigned long sched_cpu_util(int cpu)
{
	unsigned long util_cfs = cpu_util_cfs(cpu);

	return effective_cpu_util(cpu, util_cfs, NULL, NULL);
}
8199
/*
 * energy_env - Utilization landscape for energy estimation.
 * @task_busy_time:	Utilization contribution by the task for which we test the
 *			placement. Given by eenv_task_busy_time().
 * @pd_busy_time:	Utilization of the whole perf domain without the task
 *			contribution. Given by eenv_pd_busy_time().
 * @cpu_cap:		Maximum CPU capacity for the perf domain.
 * @pd_cap:		Entire perf domain capacity. (pd->nr_cpus * cpu_cap).
 */
struct energy_env {
	/* Busy time of the task under test and of the perf domain. */
	unsigned long task_busy_time, pd_busy_time;
	/* Capacity of one CPU and of the whole perf domain. */
	unsigned long cpu_cap, pd_cap;
};
8215
8216 /*
8217 * Compute the task busy time for compute_energy(). This time cannot be
8218 * injected directly into effective_cpu_util() because of the IRQ scaling.
8219 * The latter only makes sense with the most recent CPUs where the task has
8220 * run.
8221 */
eenv_task_busy_time(struct energy_env * eenv,struct task_struct * p,int prev_cpu)8222 static inline void eenv_task_busy_time(struct energy_env *eenv,
8223 struct task_struct *p, int prev_cpu)
8224 {
8225 unsigned long busy_time, max_cap = arch_scale_cpu_capacity(prev_cpu);
8226 unsigned long irq = cpu_util_irq(cpu_rq(prev_cpu));
8227
8228 if (unlikely(irq >= max_cap))
8229 busy_time = max_cap;
8230 else
8231 busy_time = scale_irq_capacity(task_util_est(p), irq, max_cap);
8232
8233 eenv->task_busy_time = busy_time;
8234 }
8235
8236 /*
8237 * Compute the perf_domain (PD) busy time for compute_energy(). Based on the
8238 * utilization for each @pd_cpus, it however doesn't take into account
8239 * clamping since the ratio (utilization / cpu_capacity) is already enough to
8240 * scale the EM reported power consumption at the (eventually clamped)
8241 * cpu_capacity.
8242 *
8243 * The contribution of the task @p for which we want to estimate the
8244 * energy cost is removed (by cpu_util()) and must be calculated
8245 * separately (see eenv_task_busy_time). This ensures:
8246 *
8247 * - A stable PD utilization, no matter which CPU of that PD we want to place
8248 * the task on.
8249 *
8250 * - A fair comparison between CPUs as the task contribution (task_util())
8251 * will always be the same no matter which CPU utilization we rely on
8252 * (util_avg or util_est).
8253 *
8254 * Set @eenv busy time for the PD that spans @pd_cpus. This busy time can't
8255 * exceed @eenv->pd_cap.
8256 */
eenv_pd_busy_time(struct energy_env * eenv,struct cpumask * pd_cpus,struct task_struct * p)8257 static inline void eenv_pd_busy_time(struct energy_env *eenv,
8258 struct cpumask *pd_cpus,
8259 struct task_struct *p)
8260 {
8261 unsigned long busy_time = 0;
8262 int cpu;
8263
8264 for_each_cpu(cpu, pd_cpus) {
8265 unsigned long util = cpu_util(cpu, p, -1, 0);
8266
8267 busy_time += effective_cpu_util(cpu, util, NULL, NULL);
8268 }
8269
8270 eenv->pd_busy_time = min(eenv->pd_cap, busy_time);
8271 }
8272
8273 /*
8274 * Compute the maximum utilization for compute_energy() when the task @p
8275 * is placed on the cpu @dst_cpu.
8276 *
8277 * Returns the maximum utilization among @eenv->cpus. This utilization can't
8278 * exceed @eenv->cpu_cap.
8279 */
8280 static inline unsigned long
eenv_pd_max_util(struct energy_env * eenv,struct cpumask * pd_cpus,struct task_struct * p,int dst_cpu)8281 eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
8282 struct task_struct *p, int dst_cpu)
8283 {
8284 unsigned long max_util = 0;
8285 int cpu;
8286
8287 for_each_cpu(cpu, pd_cpus) {
8288 struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
8289 unsigned long util = cpu_util(cpu, p, dst_cpu, 1);
8290 unsigned long eff_util, min, max;
8291
8292 /*
8293 * Performance domain frequency: utilization clamping
8294 * must be considered since it affects the selection
8295 * of the performance domain frequency.
8296 * NOTE: in case RT tasks are running, by default the min
8297 * utilization can be max OPP.
8298 */
8299 eff_util = effective_cpu_util(cpu, util, &min, &max);
8300
8301 /* Task's uclamp can modify min and max value */
8302 if (tsk && uclamp_is_used()) {
8303 min = max(min, uclamp_eff_value(p, UCLAMP_MIN));
8304
8305 /*
8306 * If there is no active max uclamp constraint,
8307 * directly use task's one, otherwise keep max.
8308 */
8309 if (uclamp_rq_is_idle(cpu_rq(cpu)))
8310 max = uclamp_eff_value(p, UCLAMP_MAX);
8311 else
8312 max = max(max, uclamp_eff_value(p, UCLAMP_MAX));
8313 }
8314
8315 eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max);
8316 max_util = max(max_util, eff_util);
8317 }
8318
8319 return min(max_util, eenv->cpu_cap);
8320 }
8321
8322 /*
8323 * compute_energy(): Use the Energy Model to estimate the energy that @pd would
8324 * consume for a given utilization landscape @eenv. When @dst_cpu < 0, the task
8325 * contribution is ignored.
8326 */
8327 static inline unsigned long
compute_energy(struct energy_env * eenv,struct perf_domain * pd,struct cpumask * pd_cpus,struct task_struct * p,int dst_cpu)8328 compute_energy(struct energy_env *eenv, struct perf_domain *pd,
8329 struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu)
8330 {
8331 unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
8332 unsigned long busy_time = eenv->pd_busy_time;
8333 unsigned long energy;
8334
8335 if (dst_cpu >= 0)
8336 busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time);
8337
8338 energy = em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
8339
8340 trace_sched_compute_energy_tp(p, dst_cpu, energy, max_util, busy_time);
8341
8342 return energy;
8343 }
8344
/*
 * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
 * waking task. find_energy_efficient_cpu() looks for the CPU with maximum
 * spare capacity in each performance domain and uses it as a potential
 * candidate to execute the task. Then, it uses the Energy Model to figure
 * out which of the CPU candidates is the most energy-efficient.
 *
 * The rationale for this heuristic is as follows. In a performance domain,
 * all the most energy efficient CPU candidates (according to the Energy
 * Model) are those for which we'll request a low frequency. When there are
 * several CPUs for which the frequency request will be the same, we don't
 * have enough data to break the tie between them, because the Energy Model
 * only includes active power costs. With this model, if we assume that
 * frequency requests follow utilization (e.g. using schedutil), the CPU with
 * the maximum spare capacity in a performance domain is guaranteed to be among
 * the best candidates of the performance domain.
 *
 * In practice, it could be preferable from an energy standpoint to pack
 * small tasks on a CPU in order to let other CPUs go in deeper idle states,
 * but that could also hurt our chances to go cluster idle, and we have no
 * ways to tell with the current Energy Model if this is actually a good
 * idea or not. So, find_energy_efficient_cpu() basically favors
 * cluster-packing, and spreading inside a cluster. That should at least be
 * a good thing for latency, and this is consistent with the idea that most
 * of the energy savings of EAS come from the asymmetry of the system, and
 * not so much from breaking the tie between identical CPUs. That's also the
 * reason why EAS is enabled in the topology code only for systems where
 * SD_ASYM_CPUCAPACITY is set.
 *
 * NOTE: Forkees are not accepted in the energy-aware wake-up path because
 * they don't have any useful utilization data yet and it's not possible to
 * forecast their impact on energy consumption. Consequently, they will be
 * placed by sched_balance_find_dst_cpu() on the least loaded CPU, which might turn out
 * to be energy-inefficient in some use-cases. The alternative would be to
 * bias new tasks towards specific types of CPUs first, or to try to infer
 * their util_avg from the parent task, but those heuristics could hurt
 * other use-cases too. So, until someone finds a better way to solve this,
 * let's keep things simple by re-using the existing slow path.
 *
 * Return: -1 when there is no perf domain or no suitable asymmetric
 * sched_domain spanning @prev_cpu; otherwise @prev_cpu, or the best candidate
 * found when it compares favourably against @prev_cpu (see the final test).
 */
static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
{
	/* Per-cpu scratch mask, reused for every perf domain below. */
	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
	unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
	/* Without uclamp the task is treated as unclamped: [0..1024]. */
	unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0;
	unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
	struct root_domain *rd = this_rq()->rd;
	int cpu, best_energy_cpu, target = -1;
	int prev_fits = -1, best_fits = -1;
	unsigned long best_actual_cap = 0;
	unsigned long prev_actual_cap = 0;
	struct sched_domain *sd;
	struct perf_domain *pd;
	struct energy_env eenv;

	rcu_read_lock();
	pd = rcu_dereference_all(rd->pd);
	if (!pd)
		goto unlock;

	/*
	 * Energy-aware wake-up happens on the lowest sched_domain starting
	 * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
	 */
	sd = rcu_dereference_all(*this_cpu_ptr(&sd_asym_cpucapacity));
	while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
		sd = sd->parent;
	if (!sd)
		goto unlock;

	/* From here on, every bail-out keeps the task on prev_cpu. */
	target = prev_cpu;

	sync_entity_load_avg(&p->se);
	/* Nothing to estimate for a task with no utilization and no min hint. */
	if (!task_util_est(p) && p_util_min == 0)
		goto unlock;

	eenv_task_busy_time(&eenv, p, prev_cpu);

	for (; pd; pd = pd->next) {
		/*
		 * NOTE(review): util_min/util_max are initialized once per
		 * perf domain and only rewritten for CPUs whose rq has active
		 * clamps, so a CPU with an idle uclamp rq is tested with the
		 * values left by a previously scanned CPU -- confirm intent.
		 */
		unsigned long util_min = p_util_min, util_max = p_util_max;
		unsigned long cpu_cap, cpu_actual_cap, util;
		long prev_spare_cap = -1, max_spare_cap = -1;
		unsigned long rq_util_min, rq_util_max;
		unsigned long cur_delta, base_energy;
		int max_spare_cap_cpu = -1;
		int fits, max_fits = -1;

		/* Skip perf domains with no online CPU. */
		if (!cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask))
			continue;

		/* Account external pressure for the energy estimation */
		cpu = cpumask_first(cpus);
		cpu_actual_cap = get_actual_cpu_capacity(cpu);

		eenv.cpu_cap = cpu_actual_cap;
		eenv.pd_cap = 0;

		for_each_cpu(cpu, cpus) {
			struct rq *rq = cpu_rq(cpu);

			/* Every online CPU of the pd counts towards pd_cap ... */
			eenv.pd_cap += cpu_actual_cap;

			/* ... but only allowed CPUs within sd are candidates. */
			if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
				continue;

			if (!cpumask_test_cpu(cpu, p->cpus_ptr))
				continue;

			/* Utilization of @cpu as if @p were placed on it. */
			util = cpu_util(cpu, p, cpu, 0);
			cpu_cap = capacity_of(cpu);

			/*
			 * Skip CPUs that cannot satisfy the capacity request.
			 * IOW, placing the task there would make the CPU
			 * overutilized. Take uclamp into account to see how
			 * much capacity we can get out of the CPU; this is
			 * aligned with sched_cpu_util().
			 */
			if (uclamp_is_used() && !uclamp_rq_is_idle(rq)) {
				/*
				 * Open code uclamp_rq_util_with() except for
				 * the clamp() part. I.e.: apply max aggregation
				 * only. util_fits_cpu() logic requires to
				 * operate on non clamped util but must use the
				 * max-aggregated uclamp_{min, max}.
				 */
				rq_util_min = uclamp_rq_get(rq, UCLAMP_MIN);
				rq_util_max = uclamp_rq_get(rq, UCLAMP_MAX);

				util_min = max(rq_util_min, p_util_min);
				util_max = max(rq_util_max, p_util_max);
			}

			fits = util_fits_cpu(util, util_min, util_max, cpu);
			if (!fits)
				continue;

			/* cpu_cap now holds the spare capacity left with @p. */
			lsub_positive(&cpu_cap, util);

			if (cpu == prev_cpu) {
				/* Always use prev_cpu as a candidate. */
				prev_spare_cap = cpu_cap;
				prev_fits = fits;
			} else if ((fits > max_fits) ||
				   ((fits == max_fits) && ((long)cpu_cap > max_spare_cap))) {
				/*
				 * Find the CPU with the maximum spare capacity
				 * among the remaining CPUs in the performance
				 * domain.
				 */
				max_spare_cap = cpu_cap;
				max_spare_cap_cpu = cpu;
				max_fits = fits;
			}
		}

		/* No candidate at all in this perf domain. */
		if (max_spare_cap_cpu < 0 && prev_spare_cap < 0)
			continue;

		eenv_pd_busy_time(&eenv, cpus, p);
		/* Compute the 'base' energy of the pd, without @p */
		base_energy = compute_energy(&eenv, pd, cpus, p, -1);

		/* Evaluate the energy impact of using prev_cpu. */
		if (prev_spare_cap > -1) {
			prev_delta = compute_energy(&eenv, pd, cpus, p,
						    prev_cpu);
			/* CPU utilization has changed */
			if (prev_delta < base_energy)
				goto unlock;
			prev_delta -= base_energy;
			prev_actual_cap = cpu_actual_cap;
			best_delta = min(best_delta, prev_delta);
		}

		/* Evaluate the energy impact of using max_spare_cap_cpu. */
		if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) {
			/* Current best energy cpu fits better */
			if (max_fits < best_fits)
				continue;

			/*
			 * Both don't fit performance hint (i.e. uclamp_min)
			 * but best energy cpu has better capacity.
			 */
			if ((max_fits < 0) &&
			    (cpu_actual_cap <= best_actual_cap))
				continue;

			cur_delta = compute_energy(&eenv, pd, cpus, p,
						   max_spare_cap_cpu);
			/* CPU utilization has changed */
			if (cur_delta < base_energy)
				goto unlock;
			cur_delta -= base_energy;

			/*
			 * Both fit for the task but best energy cpu has lower
			 * energy impact.
			 */
			if ((max_fits > 0) && (best_fits > 0) &&
			    (cur_delta >= best_delta))
				continue;

			/* Record this CPU as the new best candidate. */
			best_delta = cur_delta;
			best_energy_cpu = max_spare_cap_cpu;
			best_fits = max_fits;
			best_actual_cap = cpu_actual_cap;
		}
	}
	rcu_read_unlock();

	/*
	 * Leave prev_cpu only if the best candidate fits better, or fits
	 * fully with a lower energy delta, or (when it only misses the
	 * uclamp_min hint) offers more actual capacity than prev_cpu.
	 */
	if ((best_fits > prev_fits) ||
	    ((best_fits > 0) && (best_delta < prev_delta)) ||
	    ((best_fits < 0) && (best_actual_cap > prev_actual_cap)))
		target = best_energy_cpu;

	return target;

unlock:
	rcu_read_unlock();

	return target;
}
8568
8569 /*
8570 * select_task_rq_fair: Select target runqueue for the waking task in domains
8571 * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
8572 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
8573 *
8574 * Balances load by selecting the idlest CPU in the idlest group, or under
8575 * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
8576 *
8577 * Returns the target CPU number.
8578 */
8579 static int
select_task_rq_fair(struct task_struct * p,int prev_cpu,int wake_flags)8580 select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
8581 {
8582 int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
8583 struct sched_domain *tmp, *sd = NULL;
8584 int cpu = smp_processor_id();
8585 int new_cpu = prev_cpu;
8586 int want_affine = 0;
8587 /* SD_flags and WF_flags share the first nibble */
8588 int sd_flag = wake_flags & 0xF;
8589
8590 /*
8591 * required for stable ->cpus_allowed
8592 */
8593 lockdep_assert_held(&p->pi_lock);
8594 if (wake_flags & WF_TTWU) {
8595 record_wakee(p);
8596
8597 if ((wake_flags & WF_CURRENT_CPU) &&
8598 cpumask_test_cpu(cpu, p->cpus_ptr))
8599 return cpu;
8600
8601 if (!is_rd_overutilized(this_rq()->rd)) {
8602 new_cpu = find_energy_efficient_cpu(p, prev_cpu);
8603 if (new_cpu >= 0)
8604 return new_cpu;
8605 new_cpu = prev_cpu;
8606 }
8607
8608 want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
8609 }
8610
8611 rcu_read_lock();
8612 for_each_domain(cpu, tmp) {
8613 /*
8614 * If both 'cpu' and 'prev_cpu' are part of this domain,
8615 * cpu is a valid SD_WAKE_AFFINE target.
8616 */
8617 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
8618 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
8619 if (cpu != prev_cpu)
8620 new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
8621
8622 sd = NULL; /* Prefer wake_affine over balance flags */
8623 break;
8624 }
8625
8626 /*
8627 * Usually only true for WF_EXEC and WF_FORK, as sched_domains
8628 * usually do not have SD_BALANCE_WAKE set. That means wakeup
8629 * will usually go to the fast path.
8630 */
8631 if (tmp->flags & sd_flag)
8632 sd = tmp;
8633 else if (!want_affine)
8634 break;
8635 }
8636
8637 if (unlikely(sd)) {
8638 /* Slow path */
8639 new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag);
8640 } else if (wake_flags & WF_TTWU) { /* XXX always ? */
8641 /* Fast path */
8642 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
8643 }
8644 rcu_read_unlock();
8645
8646 return new_cpu;
8647 }
8648
8649 /*
8650 * Called immediately before a task is migrated to a new CPU; task_cpu(p) and
8651 * cfs_rq_of(p) references at time of call are still valid and identify the
8652 * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
8653 */
migrate_task_rq_fair(struct task_struct * p,int new_cpu)8654 static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
8655 {
8656 struct sched_entity *se = &p->se;
8657
8658 if (!task_on_rq_migrating(p)) {
8659 remove_entity_load_avg(se);
8660
8661 /*
8662 * Here, the task's PELT values have been updated according to
8663 * the current rq's clock. But if that clock hasn't been
8664 * updated in a while, a substantial idle time will be missed,
8665 * leading to an inflation after wake-up on the new rq.
8666 *
8667 * Estimate the missing time from the cfs_rq last_update_time
8668 * and update sched_avg to improve the PELT continuity after
8669 * migration.
8670 */
8671 migrate_se_pelt_lag(se);
8672 }
8673
8674 /* Tell new CPU we are migrated */
8675 se->avg.last_update_time = 0;
8676
8677 update_scan_period(p, new_cpu);
8678 }
8679
task_dead_fair(struct task_struct * p)8680 static void task_dead_fair(struct task_struct *p)
8681 {
8682 struct sched_entity *se = &p->se;
8683
8684 if (se->sched_delayed) {
8685 struct rq_flags rf;
8686 struct rq *rq;
8687
8688 rq = task_rq_lock(p, &rf);
8689 if (se->sched_delayed) {
8690 update_rq_clock(rq);
8691 dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
8692 }
8693 task_rq_unlock(rq, p, &rf);
8694 }
8695
8696 remove_entity_load_avg(se);
8697 }
8698
8699 /*
8700 * Set the max capacity the task is allowed to run at for misfit detection.
8701 */
set_task_max_allowed_capacity(struct task_struct * p)8702 static void set_task_max_allowed_capacity(struct task_struct *p)
8703 {
8704 struct asym_cap_data *entry;
8705
8706 if (!sched_asym_cpucap_active())
8707 return;
8708
8709 rcu_read_lock();
8710 list_for_each_entry_rcu(entry, &asym_cap_list, link) {
8711 cpumask_t *cpumask;
8712
8713 cpumask = cpu_capacity_span(entry);
8714 if (!cpumask_intersects(p->cpus_ptr, cpumask))
8715 continue;
8716
8717 p->max_allowed_capacity = entry->capacity;
8718 break;
8719 }
8720 rcu_read_unlock();
8721 }
8722
/*
 * set_cpus_allowed_fair - fair-class handler for an affinity change.
 *
 * Applies the common affinity update described by @ctx to @p, then refreshes
 * the task's max allowed capacity, which depends on the CPUs @p may run on.
 */
static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx)
{
	set_cpus_allowed_common(p, ctx);
	/* Must follow the mask update: it reads p->cpus_ptr. */
	set_task_max_allowed_capacity(p);
}
8728
set_next_buddy(struct sched_entity * se)8729 static void set_next_buddy(struct sched_entity *se)
8730 {
8731 for_each_sched_entity(se) {
8732 if (WARN_ON_ONCE(!se->on_rq))
8733 return;
8734 if (se_is_idle(se))
8735 return;
8736 cfs_rq_of(se)->next = se;
8737 }
8738 }
8739
/* Decision taken by the wakeup-preemption path for a newly woken entity. */
enum preempt_wakeup_action {
	/* No preemption. */
	PREEMPT_WAKEUP_NONE	= 0,
	/* Ignore slice protection. */
	PREEMPT_WAKEUP_SHORT	= 1,
	/* Let __pick_eevdf() decide. */
	PREEMPT_WAKEUP_PICK	= 2,
	/* Force reschedule. */
	PREEMPT_WAKEUP_RESCHED	= 3,
};
8746
8747 static inline bool
set_preempt_buddy(struct cfs_rq * cfs_rq,int wake_flags,struct sched_entity * pse,struct sched_entity * se)8748 set_preempt_buddy(struct cfs_rq *cfs_rq, int wake_flags,
8749 struct sched_entity *pse, struct sched_entity *se)
8750 {
8751 /*
8752 * Keep existing buddy if the deadline is sooner than pse.
8753 * The older buddy may be cache cold and completely unrelated
8754 * to the current wakeup but that is unpredictable where as
8755 * obeying the deadline is more in line with EEVDF objectives.
8756 */
8757 if (cfs_rq->next && entity_before(cfs_rq->next, pse))
8758 return false;
8759
8760 set_next_buddy(pse);
8761 return true;
8762 }
8763
8764 /*
8765 * WF_SYNC|WF_TTWU indicates the waker expects to sleep but it is not
8766 * strictly enforced because the hint is either misunderstood or
8767 * multiple tasks must be woken up.
8768 */
8769 static inline enum preempt_wakeup_action
preempt_sync(struct rq * rq,int wake_flags,struct sched_entity * pse,struct sched_entity * se)8770 preempt_sync(struct rq *rq, int wake_flags,
8771 struct sched_entity *pse, struct sched_entity *se)
8772 {
8773 u64 threshold, delta;
8774
8775 /*
8776 * WF_SYNC without WF_TTWU is not expected so warn if it happens even
8777 * though it is likely harmless.
8778 */
8779 WARN_ON_ONCE(!(wake_flags & WF_TTWU));
8780
8781 threshold = sysctl_sched_migration_cost;
8782 delta = rq_clock_task(rq) - se->exec_start;
8783 if ((s64)delta < 0)
8784 delta = 0;
8785
8786 /*
8787 * WF_RQ_SELECTED implies the tasks are stacking on a CPU when they
8788 * could run on other CPUs. Reduce the threshold before preemption is
8789 * allowed to an arbitrary lower value as it is more likely (but not
8790 * guaranteed) the waker requires the wakee to finish.
8791 */
8792 if (wake_flags & WF_RQ_SELECTED)
8793 threshold >>= 2;
8794
8795 /*
8796 * As WF_SYNC is not strictly obeyed, allow some runtime for batch
8797 * wakeups to be issued.
8798 */
8799 if (entity_before(pse, se) && delta >= threshold)
8800 return PREEMPT_WAKEUP_RESCHED;
8801
8802 return PREEMPT_WAKEUP_NONE;
8803 }
8804
/*
 * Preempt the current task with a newly woken task if needed.
 *
 * @rq:         runqueue whose donor/current task may be preempted
 * @p:          the task that was just woken or enqueued
 * @wake_flags: WF_* flags describing the wakeup
 *
 * The function ends in one of three ways:
 *  - a plain return, leaving the current task alone;
 *  - the "pick" label, where __pick_eevdf() decides whether @p's entity
 *    should run, and otherwise (with RUN_TO_PARITY) the donor's slice
 *    protection is refreshed;
 *  - the "preempt" label, which calls resched_curr_lazy() after optionally
 *    cancelling the donor's slice protection.
 */
static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_flags)
{
	enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK;
	struct task_struct *donor = rq->donor;
	struct sched_entity *se = &donor->se, *pse = &p->se;
	struct cfs_rq *cfs_rq = task_cfs_rq(donor);
	int cse_is_idle, pse_is_idle;

	/*
	 * XXX Getting preempted by higher class, try and find idle CPU?
	 */
	if (p->sched_class != &fair_sched_class)
		return;

	/* The woken task is the donor itself: nothing to compare. */
	if (unlikely(se == pse))
		return;

	/*
	 * This is possible from callers such as attach_tasks(), in which we
	 * unconditionally wakeup_preempt() after an enqueue (which may have
	 * led to a throttle). This both saves work and prevents false
	 * next-buddy nomination below.
	 */
	if (task_is_throttled(p))
		return;

	/*
	 * We can come here with TIF_NEED_RESCHED already set from new task
	 * wake up path.
	 *
	 * Note: this also catches the edge-case of curr being in a throttled
	 * group (e.g. via set_curr_task), since update_curr() (in the
	 * enqueue of curr) will have resulted in resched being set. This
	 * prevents us from potentially nominating it as a false LAST_BUDDY
	 * below.
	 */
	if (test_tsk_need_resched(rq->curr))
		return;

	if (!sched_feat(WAKEUP_PREEMPTION))
		return;

	/*
	 * From here on @se and @pse are the entities returned by
	 * find_matching_se(), not necessarily the tasks' own entities.
	 */
	find_matching_se(&se, &pse);
	WARN_ON_ONCE(!pse);

	cse_is_idle = se_is_idle(se);
	pse_is_idle = se_is_idle(pse);

	/*
	 * Preempt an idle entity in favor of a non-idle entity (and don't preempt
	 * in the inverse case).
	 */
	if (cse_is_idle && !pse_is_idle) {
		/*
		 * When a non-idle entity preempts an idle entity,
		 * don't give the idle entity slice protection.
		 */
		preempt_action = PREEMPT_WAKEUP_SHORT;
		goto preempt;
	}

	/* Only the "current non-idle, wakee idle" case is left here. */
	if (cse_is_idle != pse_is_idle)
		return;

	/*
	 * BATCH and IDLE tasks do not preempt others.
	 */
	if (unlikely(!normal_policy(p->policy)))
		return;

	cfs_rq = cfs_rq_of(se);
	update_curr(cfs_rq);
	/*
	 * If @p has a shorter slice than current, override current's slice
	 * protection in order to allow preemption. Whether @p actually gets
	 * chosen is still decided by __pick_eevdf() at the pick label.
	 */
	if (sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice)) {
		preempt_action = PREEMPT_WAKEUP_SHORT;
		goto pick;
	}

	/*
	 * Ignore wakee preemption on WF_FORK as it is less likely that
	 * there is shared data as exec often follows fork. Do not
	 * preempt for tasks that are sched_delayed as it would violate
	 * EEVDF to forcibly queue an ineligible task.
	 */
	if ((wake_flags & WF_FORK) || pse->sched_delayed)
		return;

	/* Prefer picking wakee soon if appropriate. */
	if (sched_feat(NEXT_BUDDY) &&
	    set_preempt_buddy(cfs_rq, wake_flags, pse, se)) {

		/*
		 * Decide whether to obey WF_SYNC hint for a new buddy. Old
		 * buddies are ignored as they may not be relevant to the
		 * waker and less likely to be cache hot.
		 */
		if (wake_flags & WF_SYNC)
			preempt_action = preempt_sync(rq, wake_flags, pse, se);
	}

	/*
	 * preempt_action is still the initial PREEMPT_WAKEUP_PICK here unless
	 * preempt_sync() replaced it above.
	 */
	switch (preempt_action) {
	case PREEMPT_WAKEUP_NONE:
		return;
	case PREEMPT_WAKEUP_RESCHED:
		goto preempt;
	case PREEMPT_WAKEUP_SHORT:
		fallthrough;
	case PREEMPT_WAKEUP_PICK:
		break;
	}

pick:
	/*
	 * If @p has become the most eligible task, force preemption.
	 * The second argument is false only for PREEMPT_WAKEUP_SHORT.
	 */
	if (__pick_eevdf(cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT) == pse)
		goto preempt;

	/* Not preempting: refresh the slice protection of @se. */
	if (sched_feat(RUN_TO_PARITY))
		update_protect_slice(cfs_rq, se);

	return;

preempt:
	/* A "short" preemption also drops @se's slice protection. */
	if (preempt_action == PREEMPT_WAKEUP_SHORT)
		cancel_protect_slice(se);

	resched_curr_lazy(rq);
}
8940
/*
 * Pick the next fair task to run on @rq without switching to it.
 *
 * Starting at the root cfs_rq, pick_next_entity() is applied level by level,
 * descending through group_cfs_rq() until an entity with no group cfs_rq is
 * reached; that entity's task is returned.
 *
 * Returns NULL when the root cfs_rq has nothing queued. If a level yields no
 * entity, the whole walk restarts from the root. When check_cfs_rq_runtime()
 * returned non-zero for any level of the final walk, throttle work is set up
 * on the picked task via task_throttle_setup_work().
 *
 * @rf is not used by this function.
 */
static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf)
{
	struct sched_entity *se;
	struct cfs_rq *cfs_rq;
	struct task_struct *p;
	bool throttled;

again:
	cfs_rq = &rq->cfs;
	if (!cfs_rq->nr_queued)
		return NULL;

	/* Reset on every (re)start of the walk. */
	throttled = false;

	do {
		/* Might not have done put_prev_entity() */
		if (cfs_rq->curr && cfs_rq->curr->on_rq)
			update_curr(cfs_rq);

		/* Remember whether any level on the path reported this. */
		throttled |= check_cfs_rq_runtime(cfs_rq);

		se = pick_next_entity(rq, cfs_rq);
		if (!se)
			goto again;
		cfs_rq = group_cfs_rq(se);
	} while (cfs_rq);

	p = task_of(se);
	if (unlikely(throttled))
		task_throttle_setup_work(p);
	return p;
}
8973
/*
 * Forward declarations of static helpers; __set_next_task_fair() is called
 * by pick_next_task_fair() below before its definition appears.
 */
static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
8976
/*
 * Pick the next fair task for @rq and perform the put-prev/set-next
 * transition from @prev to it.
 *
 * Returns the picked task, NULL when there is nothing to run, or RETRY_TASK
 * when sched_balance_newidle() returned a negative value.
 *
 * When no task is found and @rf is non-NULL, sched_balance_newidle() is
 * tried; a positive result restarts the pick. With a NULL @rf the idle path
 * returns NULL directly.
 *
 * With CONFIG_FAIR_GROUP_SCHED and a fair-class @prev, only the part of the
 * entity hierarchy that differs between @prev and the picked task is put and
 * set; otherwise put_prev_set_next_task() handles the transition.
 */
struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
	__must_hold(__rq_lockp(rq))
{
	struct sched_entity *se;
	struct task_struct *p;
	int new_tasks;

again:
	p = pick_task_fair(rq, rf);
	if (!p)
		goto idle;
	se = &p->se;

#ifdef CONFIG_FAIR_GROUP_SCHED
	if (prev->sched_class != &fair_sched_class)
		goto simple;

	__put_prev_set_next_dl_server(rq, prev, p);

	/*
	 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
	 * likely that a next task is from the same cgroup as the current.
	 *
	 * Therefore attempt to avoid putting and setting the entire cgroup
	 * hierarchy, only change the part that actually changes.
	 *
	 * Since we haven't yet done put_prev_entity and if the selected task
	 * is a different task than we started out with, try and touch the
	 * least amount of cfs_rqs.
	 */
	if (prev != p) {
		struct sched_entity *pse = &prev->se;
		struct cfs_rq *cfs_rq;

		/*
		 * Climb both hierarchies until se and pse sit in the same
		 * group: the deeper side moves up one level per iteration,
		 * and both move when they are at equal depth.
		 */
		while (!(cfs_rq = is_same_group(se, pse))) {
			int se_depth = se->depth;
			int pse_depth = pse->depth;

			if (se_depth <= pse_depth) {
				put_prev_entity(cfs_rq_of(pse), pse);
				pse = parent_entity(pse);
			}
			if (se_depth >= pse_depth) {
				set_next_entity(cfs_rq_of(se), se, true);
				se = parent_entity(se);
			}
		}

		/* Finish the switch on the common cfs_rq. */
		put_prev_entity(cfs_rq, pse);
		set_next_entity(cfs_rq, se, true);

		__set_next_task_fair(rq, p, true);
	}

	return p;

simple:
#endif /* CONFIG_FAIR_GROUP_SCHED */
	put_prev_set_next_task(rq, prev, p);
	return p;

idle:
	if (rf) {
		new_tasks = sched_balance_newidle(rq, rf);

		/*
		 * Because sched_balance_newidle() releases (and re-acquires)
		 * rq->lock, it is possible for any higher priority task to
		 * appear. In that case we must re-start the pick_next_entity()
		 * loop.
		 */
		if (new_tasks < 0)
			return RETRY_TASK;

		if (new_tasks > 0)
			goto again;
	}

	return NULL;
}
9058
/*
 * Pick callback registered with dl_server_init() by fair_server_init():
 * picks a fair task from the runqueue @dl_se is attached to (dl_se->rq).
 */
static struct task_struct *
fair_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf)
{
	return pick_task_fair(dl_se->rq, rf);
}
9064
fair_server_init(struct rq * rq)9065 void fair_server_init(struct rq *rq)
9066 {
9067 struct sched_dl_entity *dl_se = &rq->fair_server;
9068
9069 init_dl_entity(dl_se);
9070
9071 dl_server_init(dl_se, rq, fair_server_pick_task);
9072 }
9073
9074 /*
9075 * Account for a descheduled task:
9076 */
put_prev_task_fair(struct rq * rq,struct task_struct * prev,struct task_struct * next)9077 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct task_struct *next)
9078 {
9079 struct sched_entity *se = &prev->se;
9080 struct cfs_rq *cfs_rq;
9081
9082 for_each_sched_entity(se) {
9083 cfs_rq = cfs_rq_of(se);
9084 put_prev_entity(cfs_rq, se);
9085 }
9086 }
9087
9088 /*
9089 * sched_yield() is very simple
9090 */
yield_task_fair(struct rq * rq)9091 static void yield_task_fair(struct rq *rq)
9092 {
9093 struct task_struct *curr = rq->donor;
9094 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
9095 struct sched_entity *se = &curr->se;
9096
9097 /*
9098 * Are we the only task in the tree?
9099 */
9100 if (unlikely(rq->nr_running == 1))
9101 return;
9102
9103 clear_buddies(cfs_rq, se);
9104
9105 update_rq_clock(rq);
9106 /*
9107 * Update run-time statistics of the 'current'.
9108 */
9109 update_curr(cfs_rq);
9110 /*
9111 * Tell update_rq_clock() that we've just updated,
9112 * so we don't do microscopic update in schedule()
9113 * and double the fastpath cost.
9114 */
9115 rq_clock_skip_update(rq);
9116
9117 /*
9118 * Forfeit the remaining vruntime, only if the entity is eligible. This
9119 * condition is necessary because in core scheduling we prefer to run
9120 * ineligible tasks rather than force idling. If this happens we may
9121 * end up in a loop where the core scheduler picks the yielding task,
9122 * which yields immediately again; without the condition the vruntime
9123 * ends up quickly running away.
9124 */
9125 if (entity_eligible(cfs_rq, se)) {
9126 se->vruntime = se->deadline;
9127 update_deadline(cfs_rq, se);
9128 }
9129 }
9130
yield_to_task_fair(struct rq * rq,struct task_struct * p)9131 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
9132 {
9133 struct sched_entity *se = &p->se;
9134
9135 /* !se->on_rq also covers throttled task */
9136 if (!se->on_rq)
9137 return false;
9138
9139 /* Tell the scheduler that we'd really like se to run next. */
9140 set_next_buddy(se);
9141
9142 yield_task_fair(rq);
9143
9144 return true;
9145 }
9146
9147 /**************************************************
9148 * Fair scheduling class load-balancing methods.
9149 *
9150 * BASICS
9151 *
9152 * The purpose of load-balancing is to achieve the same basic fairness the
9153 * per-CPU scheduler provides, namely provide a proportional amount of compute
9154 * time to each task. This is expressed in the following equation:
9155 *
 *   W_i,n/C_i == W_j,n/C_j for all i,j                               (1)
9157 *
9158 * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight
9159 * W_i,0 is defined as:
9160 *
9161 * W_i,0 = \Sum_j w_i,j (2)
9162 *
9163 * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight
9164 * is derived from the nice value as per sched_prio_to_weight[].
9165 *
9166 * The weight average is an exponential decay average of the instantaneous
9167 * weight:
9168 *
9169 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
9170 *
9171 * C_i is the compute capacity of CPU i, typically it is the
9172 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
9173 * can also include other factors [XXX].
9174 *
9175 * To achieve this balance we define a measure of imbalance which follows
9176 * directly from (1):
9177 *
9178 * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
9179 *
 * We then move tasks around to minimize the imbalance. In the continuous
9181 * function space it is obvious this converges, in the discrete case we get
9182 * a few fun cases generally called infeasible weight scenarios.
9183 *
9184 * [XXX expand on:
9185 * - infeasible weights;
9186 * - local vs global optima in the discrete case. ]
9187 *
9188 *
9189 * SCHED DOMAINS
9190 *
9191 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
9192 * for all i,j solution, we create a tree of CPUs that follows the hardware
9193 * topology where each level pairs two lower groups (or better). This results
9194 * in O(log n) layers. Furthermore we reduce the number of CPUs going up the
9195 * tree to only the first of the previous level and we decrease the frequency
9196 * of load-balance at each level inversely proportional to the number of CPUs in
9197 * the groups.
9198 *
9199 * This yields:
9200 *
 *  log_2 n      1     n
 *   \Sum     { --- * --- * 2^i } = O(n)                            (5)
 *   i = 0      2^i   2^i
 *                           `- size of each group
 *     |         |     `- number of CPUs doing load-balance
 *     |         `- freq
 *     `- sum over all levels
9208 *
9209 * Coupled with a limit on how many tasks we can migrate every balance pass,
9210 * this makes (5) the runtime complexity of the balancer.
9211 *
9212 * An important property here is that each CPU is still (indirectly) connected
9213 * to every other CPU in at most O(log n) steps:
9214 *
9215 * The adjacency matrix of the resulting graph is given by:
9216 *
 *          log_2 n
 *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
 *           k = 0
9220 *
9221 * And you'll find that:
9222 *
9223 * A^(log_2 n)_i,j != 0 for all i,j (7)
9224 *
9225 * Showing there's indeed a path between every CPU in at most O(log n) steps.
9226 * The task movement gives a factor of O(m), giving a convergence complexity
9227 * of:
9228 *
9229 * O(nm log n), n := nr_cpus, m := nr_tasks (8)
9230 *
9231 *
9232 * WORK CONSERVING
9233 *
9234 * In order to avoid CPUs going idle while there's still work to do, new idle
9235 * balancing is more aggressive and has the newly idle CPU iterate up the domain
9236 * tree itself instead of relying on other CPUs to bring it work.
9237 *
9238 * This adds some complexity to both (5) and (8) but it reduces the total idle
9239 * time.
9240 *
9241 * [XXX more?]
9242 *
9243 *
9244 * CGROUPS
9245 *
9246 * Cgroups make a horror show out of (2), instead of a simple sum we get:
9247 *
 *                                s_k,i
 *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
 *                                 S_k
9251 *
9252 * Where
9253 *
9254 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
9255 *
9256 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i.
9257 *
 * The big problem is S_k, it's a global sum needed to compute a local (W_i)
9259 * property.
9260 *
9261 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
9262 * rewrite all of this once again.]
9263 */
9264
/*
 * Initialised to HZ/10, i.e. a tenth of a second expressed in jiffies.
 * NOTE(review): presumably an upper bound on the load-balance interval; the
 * users of this variable are outside this part of the file - confirm there.
 */
static unsigned long __read_mostly max_load_balance_interval = HZ/10;
9266
/*
 * Classification stored in lb_env::fbq_type.
 * NOTE(review): the meaning of each value is defined by its users, which are
 * outside this part of the file.
 */
enum fbq_type {
	regular	= 0,
	remote	= 1,
	all	= 2,
};
9268
/*
 * 'group_type' describes the group of CPUs at the moment of load balancing.
 *
 * The values are ordered by pulling priority, lowest priority first, so two
 * group_type values can simply be compared when selecting the busiest
 * group. See update_sd_pick_busiest(). The explicit numbers below only spell
 * out that ordering.
 */
enum group_type {
	/* The group has spare capacity that can be used to run more tasks. */
	group_has_spare		= 0,

	/*
	 * The group is fully used and the tasks don't compete for more CPU
	 * cycles. Nevertheless, some tasks might wait before running.
	 */
	group_fully_busy	= 1,

	/*
	 * One task doesn't fit with CPU's capacity and must be migrated to a
	 * more powerful CPU.
	 */
	group_misfit_task	= 2,

	/*
	 * Balance SMT group that's fully busy. Can benefit from migrating
	 * a task on SMT with busy sibling to another CPU on idle core.
	 */
	group_smt_balance	= 3,

	/*
	 * SD_ASYM_PACKING only: One local CPU with higher capacity is
	 * available, and the task should be migrated to it instead of
	 * running on the current CPU.
	 */
	group_asym_packing	= 4,

	/*
	 * The tasks' affinity constraints previously prevented the scheduler
	 * from balancing the load across the system.
	 */
	group_imbalanced	= 5,

	/*
	 * The CPU is overloaded and can't provide expected CPU cycles to all
	 * tasks.
	 */
	group_overloaded	= 6,
};
9311
/*
 * Unit in which lb_env::imbalance is expressed, and therefore what
 * detach_tasks() subtracts from it for each detached task.
 */
enum migration_type {
	/* Imbalance is load; reduced by the task's task_h_load(). */
	migrate_load	= 0,
	/* Imbalance is utilization; reduced by the task's task_util_est(). */
	migrate_util	= 1,
	/* Imbalance is a task count; reduced by one per task. */
	migrate_task	= 2,
	/* Move a task that does not fit the source CPU; imbalance drops to 0. */
	migrate_misfit	= 3,
};
9318
/*
 * Bits of lb_env::flags.
 *
 * LBF_ALL_PINNED:  cleared by can_migrate_task() once a task that may run on
 *                  dst_cpu is seen, and by detach_tasks() when the source rq
 *                  has at most one running task.
 * LBF_NEED_BREAK:  set by detach_tasks() when the loop counter passes
 *                  loop_break, before it bails out of the scan.
 * LBF_DST_PINNED:  set by can_migrate_task() together with new_dst_cpu when
 *                  an alternative destination CPU was found for a task whose
 *                  affinity excludes dst_cpu.
 * LBF_SOME_PINNED: set by can_migrate_task() when a task's affinity excludes
 *                  dst_cpu.
 * LBF_ACTIVE_LB:   the pass is an active balance; can_migrate_task() then
 *                  skips the locality and cache-hotness checks.
 */
#define LBF_ALL_PINNED	0x01
#define LBF_NEED_BREAK	0x02
#define LBF_DST_PINNED	0x04
#define LBF_SOME_PINNED	0x08
#define LBF_ACTIVE_LB	0x10
9324
/*
 * Parameters and running state of one load-balancing pass: tasks are
 * detached from src_rq/src_cpu and later attached to dst_rq/dst_cpu.
 */
struct lb_env {
	/* Sched domain being balanced; its flags and failure counters are consulted. */
	struct sched_domain	*sd;

	/* Runqueue/CPU tasks are pulled from. */
	struct rq		*src_rq;
	int			src_cpu;

	/* CPU/runqueue tasks are moved to. */
	int			dst_cpu;
	struct rq		*dst_rq;

	/* Mask searched, together with @cpus, for an alternative destination. */
	struct cpumask		*dst_grpmask;
	/* Alternative destination recorded along with LBF_DST_PINNED. */
	int			new_dst_cpu;
	enum cpu_idle_type	idle;
	/* Amount still to be moved, in units given by @migration_type. */
	long			imbalance;
	/* The set of CPUs under consideration for load-balancing */
	struct cpumask		*cpus;

	/* LBF_* bits. */
	unsigned int		flags;

	/*
	 * Scan bookkeeping for detach_tasks(): @loop counts examined tasks,
	 * the scan pauses once it exceeds @loop_break and stops for good
	 * once it exceeds @loop_max.
	 */
	unsigned int		loop;
	unsigned int		loop_break;
	unsigned int		loop_max;

	enum fbq_type		fbq_type;
	enum migration_type	migration_type;
	/* Tasks detached by detach_tasks(), consumed by attach_tasks(). */
	struct list_head	tasks;
};
9351
9352 /*
9353 * Is this task likely cache-hot:
9354 */
task_hot(struct task_struct * p,struct lb_env * env)9355 static int task_hot(struct task_struct *p, struct lb_env *env)
9356 {
9357 s64 delta;
9358
9359 lockdep_assert_rq_held(env->src_rq);
9360
9361 if (p->sched_class != &fair_sched_class)
9362 return 0;
9363
9364 if (unlikely(task_has_idle_policy(p)))
9365 return 0;
9366
9367 /* SMT siblings share cache */
9368 if (env->sd->flags & SD_SHARE_CPUCAPACITY)
9369 return 0;
9370
9371 /*
9372 * Buddy candidates are cache hot:
9373 */
9374 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
9375 (&p->se == cfs_rq_of(&p->se)->next))
9376 return 1;
9377
9378 if (sysctl_sched_migration_cost == -1)
9379 return 1;
9380
9381 /*
9382 * Don't migrate task if the task's cookie does not match
9383 * with the destination CPU's core cookie.
9384 */
9385 if (!sched_core_cookie_match(cpu_rq(env->dst_cpu), p))
9386 return 1;
9387
9388 if (sysctl_sched_migration_cost == 0)
9389 return 0;
9390
9391 delta = rq_clock_task(env->src_rq) - p->se.exec_start;
9392
9393 return delta < (s64)sysctl_sched_migration_cost;
9394 }
9395
9396 #ifdef CONFIG_NUMA_BALANCING
9397 /*
9398 * Returns a positive value, if task migration degrades locality.
9399 * Returns 0, if task migration is not affected by locality.
9400 * Returns a negative value, if task migration improves locality i.e migration preferred.
9401 */
migrate_degrades_locality(struct task_struct * p,struct lb_env * env)9402 static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
9403 {
9404 struct numa_group *numa_group = rcu_dereference_all(p->numa_group);
9405 unsigned long src_weight, dst_weight;
9406 int src_nid, dst_nid, dist;
9407
9408 if (!static_branch_likely(&sched_numa_balancing))
9409 return 0;
9410
9411 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
9412 return 0;
9413
9414 src_nid = cpu_to_node(env->src_cpu);
9415 dst_nid = cpu_to_node(env->dst_cpu);
9416
9417 if (src_nid == dst_nid)
9418 return 0;
9419
9420 /* Migrating away from the preferred node is always bad. */
9421 if (src_nid == p->numa_preferred_nid) {
9422 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
9423 return 1;
9424 else
9425 return 0;
9426 }
9427
9428 /* Encourage migration to the preferred node. */
9429 if (dst_nid == p->numa_preferred_nid)
9430 return -1;
9431
9432 /* Leaving a core idle is often worse than degrading locality. */
9433 if (env->idle == CPU_IDLE)
9434 return 0;
9435
9436 dist = node_distance(src_nid, dst_nid);
9437 if (numa_group) {
9438 src_weight = group_weight(p, src_nid, dist);
9439 dst_weight = group_weight(p, dst_nid, dist);
9440 } else {
9441 src_weight = task_weight(p, src_nid, dist);
9442 dst_weight = task_weight(p, dst_nid, dist);
9443 }
9444
9445 return src_weight - dst_weight;
9446 }
9447
9448 #else /* !CONFIG_NUMA_BALANCING: */
/* Without CONFIG_NUMA_BALANCING migration never affects locality: always 0. */
static inline long migrate_degrades_locality(struct task_struct *p,
					     struct lb_env *env)
{
	return 0;
}
9454 #endif /* !CONFIG_NUMA_BALANCING */
9455
9456 /*
9457 * Check whether the task is ineligible on the destination cpu
9458 *
9459 * When the PLACE_LAG scheduling feature is enabled and
9460 * dst_cfs_rq->nr_queued is greater than 1, if the task
9461 * is ineligible, it will also be ineligible when
9462 * it is migrated to the destination cpu.
9463 */
task_is_ineligible_on_dst_cpu(struct task_struct * p,int dest_cpu)9464 static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_cpu)
9465 {
9466 struct cfs_rq *dst_cfs_rq;
9467
9468 #ifdef CONFIG_FAIR_GROUP_SCHED
9469 dst_cfs_rq = task_group(p)->cfs_rq[dest_cpu];
9470 #else
9471 dst_cfs_rq = &cpu_rq(dest_cpu)->cfs;
9472 #endif
9473 if (sched_feat(PLACE_LAG) && dst_cfs_rq->nr_queued &&
9474 !entity_eligible(task_cfs_rq(p), &p->se))
9475 return 1;
9476
9477 return 0;
9478 }
9479
/*
 * can_migrate_task - may task @p on env->src_rq be migrated to env->dst_cpu?
 *
 * Returns 1 if the task may be migrated, 0 otherwise. Must be called with
 * env->src_rq held.
 *
 * Side effects: p->sched_task_hot is cleared on entry and set again when a
 * cache-hot task is allowed to migrate anyway; env->flags and
 * env->new_dst_cpu are updated to report affinity problems; the
 * nr_failed_migrations_* schedstats are bumped on the matching failures.
 */
static
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
	long degrades, hot;

	lockdep_assert_rq_held(env->src_rq);
	if (p->sched_task_hot)
		p->sched_task_hot = 0;

	/*
	 * We do not migrate tasks that are:
	 * 1) delayed dequeued unless we migrate load, or
	 * 2) target cfs_rq is in throttled hierarchy, or
	 * 3) cannot be migrated to this CPU due to cpus_ptr, or
	 * 4) running (obviously), or
	 * 5) are cache-hot on their current CPU, or
	 * 6) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled)
	 *
	 * Ineligible tasks and per-CPU kthreads are filtered out as well,
	 * see below.
	 */
	if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
		return 0;

	if (lb_throttled_hierarchy(p, env->dst_cpu))
		return 0;

	/*
	 * We want to prioritize the migration of eligible tasks.
	 * For ineligible tasks we soft-limit them and only allow
	 * them to migrate when nr_balance_failed is non-zero to
	 * avoid load-balancing trying very hard to balance the load.
	 */
	if (!env->sd->nr_balance_failed &&
	    task_is_ineligible_on_dst_cpu(p, env->dst_cpu))
		return 0;

	/* Disregard percpu kthreads; they are where they need to be. */
	if (kthread_is_per_cpu(p))
		return 0;

	if (task_is_blocked(p))
		return 0;

	if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
		int cpu;

		schedstat_inc(p->stats.nr_failed_migrations_affine);

		env->flags |= LBF_SOME_PINNED;

		/*
		 * Remember if this task can be migrated to any other CPU in
		 * our sched_group. We may want to revisit it if we couldn't
		 * meet load balance goals by pulling other tasks on src_cpu.
		 *
		 * Avoid computing new_dst_cpu
		 * - for NEWLY_IDLE
		 * - if we have already computed one in current iteration
		 * - if it's an active balance
		 */
		if (env->idle == CPU_NEWLY_IDLE ||
		    env->flags & (LBF_DST_PINNED | LBF_ACTIVE_LB))
			return 0;

		/*
		 * Look for an alternative destination among the CPUs common
		 * to dst_grpmask, env->cpus and the task's affinity mask.
		 */
		cpu = cpumask_first_and_and(env->dst_grpmask, env->cpus, p->cpus_ptr);

		if (cpu < nr_cpu_ids) {
			env->flags |= LBF_DST_PINNED;
			env->new_dst_cpu = cpu;
		}

		return 0;
	}

	/* Record that we found at least one task that could run on dst_cpu */
	env->flags &= ~LBF_ALL_PINNED;

	if (task_on_cpu(env->src_rq, p) ||
	    task_current_donor(env->src_rq, p)) {
		schedstat_inc(p->stats.nr_failed_migrations_running);
		return 0;
	}

	/*
	 * Aggressive migration if:
	 * 1) active balance
	 * 2) destination numa is preferred
	 * 3) task is cache cold, or
	 * 4) too many balance attempts have failed.
	 */
	if (env->flags & LBF_ACTIVE_LB)
		return 1;

	/*
	 * A non-zero locality verdict decides hotness on its own; only a
	 * neutral verdict falls back to the cache-hotness heuristic.
	 */
	degrades = migrate_degrades_locality(p, env);
	if (!degrades)
		hot = task_hot(p, env);
	else
		hot = degrades > 0;

	if (!hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
		/* Hot but migrated anyway: flag it for detach_task()'s stats. */
		if (hot)
			p->sched_task_hot = 1;
		return 1;
	}

	schedstat_inc(p->stats.nr_failed_migrations_hot);
	return 0;
}
9590
/*
 * detach_task() -- detach the task for the migration specified in env
 *
 * Must be called with env->src_rq held. Dequeues @p from env->src_rq and
 * points it at env->dst_cpu via set_task_cpu(); the task is not enqueued on
 * the destination here.
 */
static void detach_task(struct task_struct *p, struct lb_env *env)
{
	lockdep_assert_rq_held(env->src_rq);

	/*
	 * sched_task_hot marks a task that can_migrate_task() allowed to
	 * migrate although it was considered hot; account it as a forced
	 * migration and clear the mark.
	 */
	if (p->sched_task_hot) {
		p->sched_task_hot = 0;
		schedstat_inc(env->sd->lb_hot_gained[env->idle]);
		schedstat_inc(p->stats.nr_forced_migrations);
	}

	/* The source rq's current task or donor must never be detached. */
	WARN_ON(task_current(env->src_rq, p));
	WARN_ON(task_current_donor(env->src_rq, p));

	/* NOTE(review): DEQUEUE_NOCLOCK presumably relies on an up-to-date rq clock - confirm. */
	deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
	set_task_cpu(p, env->dst_cpu);
}
9610
9611 /*
9612 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
9613 * part of active balancing operations within "domain".
9614 *
9615 * Returns a task if successful and NULL otherwise.
9616 */
detach_one_task(struct lb_env * env)9617 static struct task_struct *detach_one_task(struct lb_env *env)
9618 {
9619 struct task_struct *p;
9620
9621 lockdep_assert_rq_held(env->src_rq);
9622
9623 list_for_each_entry_reverse(p,
9624 &env->src_rq->cfs_tasks, se.group_node) {
9625 if (!can_migrate_task(p, env))
9626 continue;
9627
9628 detach_task(p, env);
9629
9630 /*
9631 * Right now, this is only the second place where
9632 * lb_gained[env->idle] is updated (other is detach_tasks)
9633 * so we can safely collect stats here rather than
9634 * inside detach_tasks().
9635 */
9636 schedstat_inc(env->sd->lb_gained[env->idle]);
9637 return p;
9638 }
9639 return NULL;
9640 }
9641
/*
 * detach_tasks() -- tries to detach up to imbalance load/util/tasks from
 * busiest_rq, as part of a balancing operation within domain "sd".
 *
 * Candidates are taken from the tail of env->src_rq->cfs_tasks. A rejected
 * task is moved to the head of that list so the scan makes progress; an
 * accepted task is detached and queued on env->tasks for attach_tasks().
 *
 * Must be called with env->src_rq held.
 *
 * Returns number of detached tasks if successful and 0 otherwise.
 */
static int detach_tasks(struct lb_env *env)
{
	struct list_head *tasks = &env->src_rq->cfs_tasks;
	unsigned long util, load;
	struct task_struct *p;
	int detached = 0;

	lockdep_assert_rq_held(env->src_rq);

	/*
	 * Source run queue has been emptied by another CPU, clear
	 * LBF_ALL_PINNED flag as we will not test any task.
	 */
	if (env->src_rq->nr_running <= 1) {
		env->flags &= ~LBF_ALL_PINNED;
		return 0;
	}

	if (env->imbalance <= 0)
		return 0;

	while (!list_empty(tasks)) {
		/*
		 * We don't want to steal all, otherwise we may be treated likewise,
		 * which could at worst lead to a livelock crash.
		 */
		if (env->idle && env->src_rq->nr_running <= 1)
			break;

		env->loop++;
		/* We've more or less seen every task there is, call it quits */
		if (env->loop > env->loop_max)
			break;

		/* take a breather every nr_migrate tasks */
		if (env->loop > env->loop_break) {
			env->loop_break += SCHED_NR_MIGRATE_BREAK;
			env->flags |= LBF_NEED_BREAK;
			break;
		}

		p = list_last_entry(tasks, struct task_struct, se.group_node);

		if (!can_migrate_task(p, env))
			goto next;

		switch (env->migration_type) {
		case migrate_load:
			/*
			 * Depending on the number of CPUs and tasks and the
			 * cgroup hierarchy, task_h_load() can return zero.
			 * Make sure that env->imbalance decreases,
			 * otherwise detach_tasks() will stop only after
			 * detaching up to loop_max tasks.
			 */
			load = max_t(unsigned long, task_h_load(p), 1);

			if (sched_feat(LB_MIN) &&
			    load < 16 && !env->sd->nr_balance_failed)
				goto next;

			/*
			 * Make sure that we don't migrate too much load.
			 * Nevertheless, relax the constraint if the
			 * scheduler fails to find a good waiting task to
			 * migrate.
			 */
			if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance)
				goto next;

			env->imbalance -= load;
			break;

		case migrate_util:
			util = task_util_est(p);

			/* Same relaxation as for load, applied to utilization. */
			if (shr_bound(util, env->sd->nr_balance_failed) > env->imbalance)
				goto next;

			env->imbalance -= util;
			break;

		case migrate_task:
			env->imbalance--;
			break;

		case migrate_misfit:
			/* This is not a misfit task */
			if (task_fits_cpu(p, env->src_cpu))
				goto next;

			/* One misfit task is enough: stop after this one. */
			env->imbalance = 0;
			break;
		}

		detach_task(p, env);
		list_add(&p->se.group_node, &env->tasks);

		detached++;

#ifdef CONFIG_PREEMPTION
		/*
		 * NEWIDLE balancing is a source of latency, so preemptible
		 * kernels will stop after the first task is detached to minimize
		 * the critical section.
		 */
		if (env->idle == CPU_NEWLY_IDLE)
			break;
#endif

		/*
		 * We only want to steal up to the prescribed amount of
		 * load/util/tasks.
		 */
		if (env->imbalance <= 0)
			break;

		continue;
next:
		/*
		 * can_migrate_task() accepted this hot task but the
		 * migration-type checks above rejected it afterwards.
		 */
		if (p->sched_task_hot)
			schedstat_inc(p->stats.nr_failed_migrations_hot);

		/* Rotate the rejected task to the head so the tail moves on. */
		list_move(&p->se.group_node, tasks);
	}

	/*
	 * Right now, this is one of only two places we collect this stat
	 * so we can safely collect detach_one_task() stats here rather
	 * than inside detach_one_task().
	 */
	schedstat_add(env->sd->lb_gained[env->idle], detached);

	return detached;
}
9782
/*
 * attach_task() -- attach the task detached by detach_task() to its new rq.
 *
 * Must be called with @rq held. Warns once if @p does not already point at
 * @rq. The task is enqueued with ENQUEUE_NOCLOCK (the callers in this file
 * update the rq clock first) and wakeup_preempt() is then given the chance
 * to preempt the task currently running on @rq.
 */
static void attach_task(struct rq *rq, struct task_struct *p)
{
	lockdep_assert_rq_held(rq);

	WARN_ON_ONCE(task_rq(p) != rq);
	activate_task(rq, p, ENQUEUE_NOCLOCK);
	wakeup_preempt(rq, p, 0);
}
9794
9795 /*
9796 * attach_one_task() -- attaches the task returned from detach_one_task() to
9797 * its new rq.
9798 */
attach_one_task(struct rq * rq,struct task_struct * p)9799 static void attach_one_task(struct rq *rq, struct task_struct *p)
9800 {
9801 struct rq_flags rf;
9802
9803 rq_lock(rq, &rf);
9804 update_rq_clock(rq);
9805 attach_task(rq, p);
9806 rq_unlock(rq, &rf);
9807 }
9808
9809 /*
9810 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
9811 * new rq.
9812 */
attach_tasks(struct lb_env * env)9813 static void attach_tasks(struct lb_env *env)
9814 {
9815 struct list_head *tasks = &env->tasks;
9816 struct task_struct *p;
9817 struct rq_flags rf;
9818
9819 rq_lock(env->dst_rq, &rf);
9820 update_rq_clock(env->dst_rq);
9821
9822 while (!list_empty(tasks)) {
9823 p = list_first_entry(tasks, struct task_struct, se.group_node);
9824 list_del_init(&p->se.group_node);
9825
9826 attach_task(env->dst_rq, p);
9827 }
9828
9829 rq_unlock(env->dst_rq, &rf);
9830 }
9831
9832 #ifdef CONFIG_NO_HZ_COMMON
cfs_rq_has_blocked_load(struct cfs_rq * cfs_rq)9833 static inline bool cfs_rq_has_blocked_load(struct cfs_rq *cfs_rq)
9834 {
9835 if (cfs_rq->avg.load_avg)
9836 return true;
9837
9838 if (cfs_rq->avg.util_avg)
9839 return true;
9840
9841 return false;
9842 }
9843
/*
 * Return true if any of the non-CFS signals of @rq (RT, DL, HW pressure,
 * IRQ) is still non-zero. The checks are made in that order and stop at the
 * first non-zero signal.
 */
static inline bool others_have_blocked(struct rq *rq)
{
	return cpu_util_rt(rq) ||
	       cpu_util_dl(rq) ||
	       hw_load_avg(rq) ||
	       cpu_util_irq(rq);
}
9860
update_blocked_load_tick(struct rq * rq)9861 static inline void update_blocked_load_tick(struct rq *rq)
9862 {
9863 WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies);
9864 }
9865
update_has_blocked_load_status(struct rq * rq,bool has_blocked_load)9866 static inline void update_has_blocked_load_status(struct rq *rq, bool has_blocked_load)
9867 {
9868 if (!has_blocked_load)
9869 rq->has_blocked_load = 0;
9870 }
9871 #else /* !CONFIG_NO_HZ_COMMON: */
/* !CONFIG_NO_HZ_COMMON stub: never reports blocked load. */
static inline bool cfs_rq_has_blocked_load(struct cfs_rq *cfs_rq)
{
	return false;
}
/* !CONFIG_NO_HZ_COMMON stub: never reports blocked load. */
static inline bool others_have_blocked(struct rq *rq)
{
	return false;
}
/* !CONFIG_NO_HZ_COMMON stub: nothing to record. */
static inline void update_blocked_load_tick(struct rq *rq)
{
}
/* !CONFIG_NO_HZ_COMMON stub: there is no status to update. */
static inline void update_has_blocked_load_status(struct rq *rq, bool has_blocked_load)
{
}
9876 #endif /* !CONFIG_NO_HZ_COMMON */
9877
/*
 * Update the non-CFS load averages of @rq.
 *
 * @done is set to false when others_have_blocked() still reports a
 * non-zero signal afterwards; it is never set to true here.
 *
 * Returns the result of update_other_load_avgs().
 */
static bool __update_blocked_others(struct rq *rq, bool *done)
{
	/*
	 * update_load_avg() can call cpufreq_update_util(). Make sure that RT,
	 * DL and IRQ signals have been updated before updating CFS.
	 */
	bool decayed = update_other_load_avgs(rq);

	if (others_have_blocked(rq))
		*done = false;

	return decayed;
}
9893
9894 #ifdef CONFIG_FAIR_GROUP_SCHED
9895
__update_blocked_fair(struct rq * rq,bool * done)9896 static bool __update_blocked_fair(struct rq *rq, bool *done)
9897 {
9898 struct cfs_rq *cfs_rq, *pos;
9899 bool decayed = false;
9900 int cpu = cpu_of(rq);
9901
9902 /*
9903 * Iterates the task_group tree in a bottom up fashion, see
9904 * list_add_leaf_cfs_rq() for details.
9905 */
9906 for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
9907 struct sched_entity *se;
9908
9909 if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
9910 update_tg_load_avg(cfs_rq);
9911
9912 if (cfs_rq->nr_queued == 0)
9913 update_idle_cfs_rq_clock_pelt(cfs_rq);
9914
9915 if (cfs_rq == &rq->cfs)
9916 decayed = true;
9917 }
9918
9919 /* Propagate pending load changes to the parent, if any: */
9920 se = cfs_rq->tg->se[cpu];
9921 if (se && !skip_blocked_update(se))
9922 update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
9923
9924 /*
9925 * There can be a lot of idle CPU cgroups. Don't let fully
9926 * decayed cfs_rqs linger on the list.
9927 */
9928 if (cfs_rq_is_decayed(cfs_rq))
9929 list_del_leaf_cfs_rq(cfs_rq);
9930
9931 /* Don't need periodic decay once load/util_avg are null */
9932 if (cfs_rq_has_blocked_load(cfs_rq))
9933 *done = false;
9934 }
9935
9936 return decayed;
9937 }
9938
9939 /*
9940 * Compute the hierarchical load factor for cfs_rq and all its ascendants.
9941 * This needs to be done in a top-down fashion because the load of a child
9942 * group is a fraction of its parents load.
9943 */
update_cfs_rq_h_load(struct cfs_rq * cfs_rq)9944 static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
9945 {
9946 struct rq *rq = rq_of(cfs_rq);
9947 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
9948 unsigned long now = jiffies;
9949 unsigned long load;
9950
9951 if (cfs_rq->last_h_load_update == now)
9952 return;
9953
9954 WRITE_ONCE(cfs_rq->h_load_next, NULL);
9955 for_each_sched_entity(se) {
9956 cfs_rq = cfs_rq_of(se);
9957 WRITE_ONCE(cfs_rq->h_load_next, se);
9958 if (cfs_rq->last_h_load_update == now)
9959 break;
9960 }
9961
9962 if (!se) {
9963 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
9964 cfs_rq->last_h_load_update = now;
9965 }
9966
9967 while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
9968 load = cfs_rq->h_load;
9969 load = div64_ul(load * se->avg.load_avg,
9970 cfs_rq_load_avg(cfs_rq) + 1);
9971 cfs_rq = group_cfs_rq(se);
9972 cfs_rq->h_load = load;
9973 cfs_rq->last_h_load_update = now;
9974 }
9975 }
9976
task_h_load(struct task_struct * p)9977 static unsigned long task_h_load(struct task_struct *p)
9978 {
9979 struct cfs_rq *cfs_rq = task_cfs_rq(p);
9980
9981 update_cfs_rq_h_load(cfs_rq);
9982 return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
9983 cfs_rq_load_avg(cfs_rq) + 1);
9984 }
9985 #else /* !CONFIG_FAIR_GROUP_SCHED: */
__update_blocked_fair(struct rq * rq,bool * done)9986 static bool __update_blocked_fair(struct rq *rq, bool *done)
9987 {
9988 struct cfs_rq *cfs_rq = &rq->cfs;
9989 bool decayed;
9990
9991 decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
9992 if (cfs_rq_has_blocked_load(cfs_rq))
9993 *done = false;
9994
9995 return decayed;
9996 }
9997
task_h_load(struct task_struct * p)9998 static unsigned long task_h_load(struct task_struct *p)
9999 {
10000 return p->se.avg.load_avg;
10001 }
10002 #endif /* !CONFIG_FAIR_GROUP_SCHED */
10003
/*
 * Refresh the blocked load averages of @rq: first the non-CFS signals, then
 * the CFS ones. The has_blocked_load status is updated afterwards, and
 * cpufreq is notified if either step reported a change.
 */
static void __sched_balance_update_blocked_averages(struct rq *rq)
{
	bool others_decayed, fair_decayed;
	bool done = true;

	update_blocked_load_tick(rq);

	others_decayed = __update_blocked_others(rq, &done);
	fair_decayed = __update_blocked_fair(rq, &done);

	/* "done" was cleared if any signal still has blocked load. */
	update_has_blocked_load_status(rq, !done);

	if (others_decayed || fair_decayed)
		cpufreq_update_util(rq, 0);
}
10017
sched_balance_update_blocked_averages(int cpu)10018 static void sched_balance_update_blocked_averages(int cpu)
10019 {
10020 struct rq *rq = cpu_rq(cpu);
10021
10022 guard(rq_lock_irqsave)(rq);
10023 update_rq_clock(rq);
10024 __sched_balance_update_blocked_averages(rq);
10025 }
10026
10027 /********** Helpers for sched_balance_find_src_group ************************/
10028
10029 /*
10030 * sg_lb_stats - stats of a sched_group required for load-balancing:
10031 */
10032 struct sg_lb_stats {
10033 unsigned long avg_load; /* Avg load over the CPUs of the group */
10034 unsigned long group_load; /* Total load over the CPUs of the group */
10035 unsigned long group_capacity; /* Capacity over the CPUs of the group */
10036 unsigned long group_util; /* Total utilization over the CPUs of the group */
10037 unsigned long group_runnable; /* Total runnable time over the CPUs of the group */
10038 unsigned int sum_nr_running; /* Nr of all tasks running in the group */
10039 unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
10040 unsigned int idle_cpus; /* Nr of idle CPUs in the group */
10041 unsigned int group_weight;
10042 enum group_type group_type;
10043 unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
10044 unsigned int group_smt_balance; /* Task on busy SMT be moved */
10045 unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
10046 #ifdef CONFIG_NUMA_BALANCING
10047 unsigned int nr_numa_running;
10048 unsigned int nr_preferred_running;
10049 #endif
10050 };
10051
10052 /*
10053 * sd_lb_stats - stats of a sched_domain required for load-balancing:
10054 */
10055 struct sd_lb_stats {
10056 struct sched_group *busiest; /* Busiest group in this sd */
10057 struct sched_group *local; /* Local group in this sd */
10058 unsigned long total_load; /* Total load of all groups in sd */
10059 unsigned long total_capacity; /* Total capacity of all groups in sd */
10060 unsigned long avg_load; /* Average load across all groups in sd */
10061 unsigned int prefer_sibling; /* Tasks should go to sibling first */
10062
10063 struct sg_lb_stats busiest_stat; /* Statistics of the busiest group */
10064 struct sg_lb_stats local_stat; /* Statistics of the local group */
10065 };
10066
init_sd_lb_stats(struct sd_lb_stats * sds)10067 static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
10068 {
10069 /*
10070 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
10071 * local_stat because update_sg_lb_stats() does a full clear/assignment.
10072 * We must however set busiest_stat::group_type and
10073 * busiest_stat::idle_cpus to the worst busiest group because
10074 * update_sd_pick_busiest() reads these before assignment.
10075 */
10076 *sds = (struct sd_lb_stats){
10077 .busiest = NULL,
10078 .local = NULL,
10079 .total_load = 0UL,
10080 .total_capacity = 0UL,
10081 .busiest_stat = {
10082 .idle_cpus = UINT_MAX,
10083 .group_type = group_has_spare,
10084 },
10085 };
10086 }
10087
/*
 * Return the capacity of @cpu that is left once IRQ, RT and DL utilization
 * are accounted for. Returns 1 when IRQ utilization alone, or RT plus DL
 * utilization, reaches the actual capacity of the CPU.
 */
static unsigned long scale_rt_capacity(int cpu)
{
	unsigned long max = get_actual_cpu_capacity(cpu);
	struct rq *rq = cpu_rq(cpu);
	unsigned long irq = cpu_util_irq(rq);
	unsigned long used;

	if (unlikely(irq >= max))
		return 1;

	/*
	 * avg_rt.util_avg and avg_dl.util_avg track binary signals
	 * (running and not running) with weights 0 and 1024 respectively.
	 */
	used = cpu_util_rt(rq) + cpu_util_dl(rq);
	if (unlikely(used >= max))
		return 1;

	return scale_irq_capacity(max - used, irq, max);
}
10114
update_cpu_capacity(struct sched_domain * sd,int cpu)10115 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
10116 {
10117 unsigned long capacity = scale_rt_capacity(cpu);
10118 struct sched_group *sdg = sd->groups;
10119
10120 if (!capacity)
10121 capacity = 1;
10122
10123 cpu_rq(cpu)->cpu_capacity = capacity;
10124 trace_sched_cpu_capacity_tp(cpu_rq(cpu));
10125
10126 sdg->sgc->capacity = capacity;
10127 sdg->sgc->min_capacity = capacity;
10128 sdg->sgc->max_capacity = capacity;
10129 }
10130
update_group_capacity(struct sched_domain * sd,int cpu)10131 void update_group_capacity(struct sched_domain *sd, int cpu)
10132 {
10133 struct sched_domain *child = sd->child;
10134 struct sched_group *group, *sdg = sd->groups;
10135 unsigned long capacity, min_capacity, max_capacity;
10136 unsigned long interval;
10137
10138 interval = msecs_to_jiffies(sd->balance_interval);
10139 interval = clamp(interval, 1UL, max_load_balance_interval);
10140 sdg->sgc->next_update = jiffies + interval;
10141
10142 if (!child) {
10143 update_cpu_capacity(sd, cpu);
10144 return;
10145 }
10146
10147 capacity = 0;
10148 min_capacity = ULONG_MAX;
10149 max_capacity = 0;
10150
10151 if (child->flags & SD_NUMA) {
10152 /*
10153 * SD_NUMA domains cannot assume that child groups
10154 * span the current group.
10155 */
10156
10157 for_each_cpu(cpu, sched_group_span(sdg)) {
10158 unsigned long cpu_cap = capacity_of(cpu);
10159
10160 capacity += cpu_cap;
10161 min_capacity = min(cpu_cap, min_capacity);
10162 max_capacity = max(cpu_cap, max_capacity);
10163 }
10164 } else {
10165 /*
10166 * !SD_NUMA domains can assume that child groups
10167 * span the current group.
10168 */
10169
10170 group = child->groups;
10171 do {
10172 struct sched_group_capacity *sgc = group->sgc;
10173
10174 capacity += sgc->capacity;
10175 min_capacity = min(sgc->min_capacity, min_capacity);
10176 max_capacity = max(sgc->max_capacity, max_capacity);
10177 group = group->next;
10178 } while (group != child->groups);
10179 }
10180
10181 sdg->sgc->capacity = capacity;
10182 sdg->sgc->min_capacity = min_capacity;
10183 sdg->sgc->max_capacity = max_capacity;
10184 }
10185
10186 /*
10187 * Check whether the capacity of the rq has been noticeably reduced by side
10188 * activity. The imbalance_pct is used for the threshold.
10189 * Return true is the capacity is reduced
10190 */
10191 static inline int
check_cpu_capacity(struct rq * rq,struct sched_domain * sd)10192 check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
10193 {
10194 return ((rq->cpu_capacity * sd->imbalance_pct) <
10195 (arch_scale_cpu_capacity(cpu_of(rq)) * 100));
10196 }
10197
10198 /* Check if the rq has a misfit task */
check_misfit_status(struct rq * rq)10199 static inline bool check_misfit_status(struct rq *rq)
10200 {
10201 return rq->misfit_task_load;
10202 }
10203
10204 /*
10205 * Group imbalance indicates (and tries to solve) the problem where balancing
10206 * groups is inadequate due to ->cpus_ptr constraints.
10207 *
10208 * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
10209 * cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
10210 * Something like:
10211 *
10212 * { 0 1 2 3 } { 4 5 6 7 }
10213 * * * * *
10214 *
10215 * If we were to balance group-wise we'd place two tasks in the first group and
10216 * two tasks in the second group. Clearly this is undesired as it will overload
10217 * cpu 3 and leave one of the CPUs in the second group unused.
10218 *
10219 * The current solution to this issue is detecting the skew in the first group
10220 * by noticing the lower domain failed to reach balance and had difficulty
10221 * moving tasks due to affinity constraints.
10222 *
 * When this is detected, the group becomes a candidate for busiest; see
 * update_sd_pick_busiest(). In addition, calculate_imbalance() and
 * sched_balance_find_src_group() skip some of the usual balance conditions
 * so that an effective group imbalance can be created.
10227 *
10228 * This is a somewhat tricky proposition since the next run might not find the
10229 * group imbalance and decide the groups need to be balanced again. A most
10230 * subtle and fragile situation.
10231 */
10232
sg_imbalanced(struct sched_group * group)10233 static inline int sg_imbalanced(struct sched_group *group)
10234 {
10235 return group->sgc->imbalance;
10236 }
10237
10238 /*
10239 * group_has_capacity returns true if the group has spare capacity that could
10240 * be used by some tasks.
10241 * We consider that a group has spare capacity if the number of task is
10242 * smaller than the number of CPUs or if the utilization is lower than the
10243 * available capacity for CFS tasks.
10244 * For the latter, we use a threshold to stabilize the state, to take into
10245 * account the variance of the tasks' load and to return true if the available
10246 * capacity in meaningful for the load balancer.
10247 * As an example, an available capacity of 1% can appear but it doesn't make
10248 * any benefit for the load balance.
10249 */
10250 static inline bool
group_has_capacity(unsigned int imbalance_pct,struct sg_lb_stats * sgs)10251 group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
10252 {
10253 if (sgs->sum_nr_running < sgs->group_weight)
10254 return true;
10255
10256 if ((sgs->group_capacity * imbalance_pct) <
10257 (sgs->group_runnable * 100))
10258 return false;
10259
10260 if ((sgs->group_capacity * 100) >
10261 (sgs->group_util * imbalance_pct))
10262 return true;
10263
10264 return false;
10265 }
10266
10267 /*
10268 * group_is_overloaded returns true if the group has more tasks than it can
10269 * handle.
10270 * group_is_overloaded is not equals to !group_has_capacity because a group
10271 * with the exact right number of tasks, has no more spare capacity but is not
10272 * overloaded so both group_has_capacity and group_is_overloaded return
10273 * false.
10274 */
10275 static inline bool
group_is_overloaded(unsigned int imbalance_pct,struct sg_lb_stats * sgs)10276 group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
10277 {
10278 if (sgs->sum_nr_running <= sgs->group_weight)
10279 return false;
10280
10281 if ((sgs->group_capacity * 100) <
10282 (sgs->group_util * imbalance_pct))
10283 return true;
10284
10285 if ((sgs->group_capacity * imbalance_pct) <
10286 (sgs->group_runnable * 100))
10287 return true;
10288
10289 return false;
10290 }
10291
10292 static inline enum
group_classify(unsigned int imbalance_pct,struct sched_group * group,struct sg_lb_stats * sgs)10293 group_type group_classify(unsigned int imbalance_pct,
10294 struct sched_group *group,
10295 struct sg_lb_stats *sgs)
10296 {
10297 if (group_is_overloaded(imbalance_pct, sgs))
10298 return group_overloaded;
10299
10300 if (sg_imbalanced(group))
10301 return group_imbalanced;
10302
10303 if (sgs->group_asym_packing)
10304 return group_asym_packing;
10305
10306 if (sgs->group_smt_balance)
10307 return group_smt_balance;
10308
10309 if (sgs->group_misfit_task_load)
10310 return group_misfit_task;
10311
10312 if (!group_has_capacity(imbalance_pct, sgs))
10313 return group_fully_busy;
10314
10315 return group_has_spare;
10316 }
10317
10318 /**
10319 * sched_use_asym_prio - Check whether asym_packing priority must be used
10320 * @sd: The scheduling domain of the load balancing
10321 * @cpu: A CPU
10322 *
10323 * Always use CPU priority when balancing load between SMT siblings. When
10324 * balancing load between cores, it is not sufficient that @cpu is idle. Only
10325 * use CPU priority if the whole core is idle.
10326 *
10327 * Returns: True if the priority of @cpu must be followed. False otherwise.
10328 */
sched_use_asym_prio(struct sched_domain * sd,int cpu)10329 static bool sched_use_asym_prio(struct sched_domain *sd, int cpu)
10330 {
10331 if (!(sd->flags & SD_ASYM_PACKING))
10332 return false;
10333
10334 if (!sched_smt_active())
10335 return true;
10336
10337 return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu);
10338 }
10339
/*
 * Return true if @dst_cpu can do asym_packing load balance in @sd and has
 * higher priority than @src_cpu.
 */
static inline bool sched_asym(struct sched_domain *sd, int dst_cpu, int src_cpu)
{
	/* First check if @dst_cpu can do asym_packing load balance at all. */
	if (!sched_use_asym_prio(sd, dst_cpu))
		return false;

	/* Only do it if it has higher priority than @src_cpu. */
	return sched_asym_prefer(dst_cpu, src_cpu);
}
10349
10350 /**
10351 * sched_group_asym - Check if the destination CPU can do asym_packing balance
10352 * @env: The load balancing environment
10353 * @sgs: Load-balancing statistics of the candidate busiest group
10354 * @group: The candidate busiest group
10355 *
10356 * @env::dst_cpu can do asym_packing if it has higher priority than the
10357 * preferred CPU of @group.
10358 *
10359 * Return: true if @env::dst_cpu can do with asym_packing load balance. False
10360 * otherwise.
10361 */
10362 static inline bool
sched_group_asym(struct lb_env * env,struct sg_lb_stats * sgs,struct sched_group * group)10363 sched_group_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group *group)
10364 {
10365 /*
10366 * CPU priorities do not make sense for SMT cores with more than one
10367 * busy sibling.
10368 */
10369 if ((group->flags & SD_SHARE_CPUCAPACITY) &&
10370 (sgs->group_weight - sgs->idle_cpus != 1))
10371 return false;
10372
10373 return sched_asym(env->sd, env->dst_cpu, READ_ONCE(group->asym_prefer_cpu));
10374 }
10375
10376 /* One group has more than one SMT CPU while the other group does not */
smt_vs_nonsmt_groups(struct sched_group * sg1,struct sched_group * sg2)10377 static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1,
10378 struct sched_group *sg2)
10379 {
10380 if (!sg1 || !sg2)
10381 return false;
10382
10383 return (sg1->flags & SD_SHARE_CPUCAPACITY) !=
10384 (sg2->flags & SD_SHARE_CPUCAPACITY);
10385 }
10386
smt_balance(struct lb_env * env,struct sg_lb_stats * sgs,struct sched_group * group)10387 static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs,
10388 struct sched_group *group)
10389 {
10390 if (!env->idle)
10391 return false;
10392
10393 /*
10394 * For SMT source group, it is better to move a task
10395 * to a CPU that doesn't have multiple tasks sharing its CPU capacity.
10396 * Note that if a group has a single SMT, SD_SHARE_CPUCAPACITY
10397 * will not be on.
10398 */
10399 if (group->flags & SD_SHARE_CPUCAPACITY &&
10400 sgs->sum_h_nr_running > 1)
10401 return true;
10402
10403 return false;
10404 }
10405
sibling_imbalance(struct lb_env * env,struct sd_lb_stats * sds,struct sg_lb_stats * busiest,struct sg_lb_stats * local)10406 static inline long sibling_imbalance(struct lb_env *env,
10407 struct sd_lb_stats *sds,
10408 struct sg_lb_stats *busiest,
10409 struct sg_lb_stats *local)
10410 {
10411 int ncores_busiest, ncores_local;
10412 long imbalance;
10413
10414 if (!env->idle || !busiest->sum_nr_running)
10415 return 0;
10416
10417 ncores_busiest = sds->busiest->cores;
10418 ncores_local = sds->local->cores;
10419
10420 if (ncores_busiest == ncores_local) {
10421 imbalance = busiest->sum_nr_running;
10422 lsub_positive(&imbalance, local->sum_nr_running);
10423 return imbalance;
10424 }
10425
10426 /* Balance such that nr_running/ncores ratio are same on both groups */
10427 imbalance = ncores_local * busiest->sum_nr_running;
10428 lsub_positive(&imbalance, ncores_busiest * local->sum_nr_running);
10429 /* Normalize imbalance and do rounding on normalization */
10430 imbalance = 2 * imbalance + ncores_local + ncores_busiest;
10431 imbalance /= ncores_local + ncores_busiest;
10432
10433 /* Take advantage of resource in an empty sched group */
10434 if (imbalance <= 1 && local->sum_nr_running == 0 &&
10435 busiest->sum_nr_running > 1)
10436 imbalance = 2;
10437
10438 return imbalance;
10439 }
10440
10441 static inline bool
sched_reduced_capacity(struct rq * rq,struct sched_domain * sd)10442 sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
10443 {
10444 /*
10445 * When there is more than 1 task, the group_overloaded case already
10446 * takes care of cpu with reduced capacity
10447 */
10448 if (rq->cfs.h_nr_runnable != 1)
10449 return false;
10450
10451 return check_cpu_capacity(rq, sd);
10452 }
10453
10454 /**
10455 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
10456 * @env: The load balancing environment.
10457 * @sds: Load-balancing data with statistics of the local group.
10458 * @group: sched_group whose statistics are to be updated.
10459 * @sgs: variable to hold the statistics for this group.
10460 * @sg_overloaded: sched_group is overloaded
10461 * @sg_overutilized: sched_group is overutilized
10462 */
update_sg_lb_stats(struct lb_env * env,struct sd_lb_stats * sds,struct sched_group * group,struct sg_lb_stats * sgs,bool * sg_overloaded,bool * sg_overutilized)10463 static inline void update_sg_lb_stats(struct lb_env *env,
10464 struct sd_lb_stats *sds,
10465 struct sched_group *group,
10466 struct sg_lb_stats *sgs,
10467 bool *sg_overloaded,
10468 bool *sg_overutilized)
10469 {
10470 int i, nr_running, local_group, sd_flags = env->sd->flags;
10471 bool balancing_at_rd = !env->sd->parent;
10472
10473 memset(sgs, 0, sizeof(*sgs));
10474
10475 local_group = group == sds->local;
10476
10477 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
10478 struct rq *rq = cpu_rq(i);
10479 unsigned long load = cpu_load(rq);
10480
10481 sgs->group_load += load;
10482 sgs->group_util += cpu_util_cfs(i);
10483 sgs->group_runnable += cpu_runnable(rq);
10484 sgs->sum_h_nr_running += rq->cfs.h_nr_runnable;
10485
10486 nr_running = rq->nr_running;
10487 sgs->sum_nr_running += nr_running;
10488
10489 if (cpu_overutilized(i))
10490 *sg_overutilized = 1;
10491
10492 /*
10493 * No need to call idle_cpu() if nr_running is not 0
10494 */
10495 if (!nr_running && idle_cpu(i)) {
10496 sgs->idle_cpus++;
10497 /* Idle cpu can't have misfit task */
10498 continue;
10499 }
10500
10501 /* Overload indicator is only updated at root domain */
10502 if (balancing_at_rd && nr_running > 1)
10503 *sg_overloaded = 1;
10504
10505 #ifdef CONFIG_NUMA_BALANCING
10506 /* Only fbq_classify_group() uses this to classify NUMA groups */
10507 if (sd_flags & SD_NUMA) {
10508 sgs->nr_numa_running += rq->nr_numa_running;
10509 sgs->nr_preferred_running += rq->nr_preferred_running;
10510 }
10511 #endif
10512 if (local_group)
10513 continue;
10514
10515 if (sd_flags & SD_ASYM_CPUCAPACITY) {
10516 /* Check for a misfit task on the cpu */
10517 if (sgs->group_misfit_task_load < rq->misfit_task_load) {
10518 sgs->group_misfit_task_load = rq->misfit_task_load;
10519 *sg_overloaded = 1;
10520 }
10521 } else if (env->idle && sched_reduced_capacity(rq, env->sd)) {
10522 /* Check for a task running on a CPU with reduced capacity */
10523 if (sgs->group_misfit_task_load < load)
10524 sgs->group_misfit_task_load = load;
10525 }
10526 }
10527
10528 sgs->group_capacity = group->sgc->capacity;
10529
10530 sgs->group_weight = group->group_weight;
10531
10532 /* Check if dst CPU is idle and preferred to this group */
10533 if (!local_group && env->idle && sgs->sum_h_nr_running &&
10534 sched_group_asym(env, sgs, group))
10535 sgs->group_asym_packing = 1;
10536
10537 /* Check for loaded SMT group to be balanced to dst CPU */
10538 if (!local_group && smt_balance(env, sgs, group))
10539 sgs->group_smt_balance = 1;
10540
10541 sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
10542
10543 /* Computing avg_load makes sense only when group is overloaded */
10544 if (sgs->group_type == group_overloaded)
10545 sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
10546 sgs->group_capacity;
10547 }
10548
10549 /**
10550 * update_sd_pick_busiest - return 1 on busiest group
10551 * @env: The load balancing environment.
10552 * @sds: sched_domain statistics
10553 * @sg: sched_group candidate to be checked for being the busiest
10554 * @sgs: sched_group statistics
10555 *
10556 * Determine if @sg is a busier group than the previously selected
10557 * busiest group.
10558 *
10559 * Return: %true if @sg is a busier group than the previously selected
10560 * busiest group. %false otherwise.
10561 */
update_sd_pick_busiest(struct lb_env * env,struct sd_lb_stats * sds,struct sched_group * sg,struct sg_lb_stats * sgs)10562 static bool update_sd_pick_busiest(struct lb_env *env,
10563 struct sd_lb_stats *sds,
10564 struct sched_group *sg,
10565 struct sg_lb_stats *sgs)
10566 {
10567 struct sg_lb_stats *busiest = &sds->busiest_stat;
10568
10569 /* Make sure that there is at least one task to pull */
10570 if (!sgs->sum_h_nr_running)
10571 return false;
10572
10573 /*
10574 * Don't try to pull misfit tasks we can't help.
10575 * We can use max_capacity here as reduction in capacity on some
10576 * CPUs in the group should either be possible to resolve
10577 * internally or be covered by avg_load imbalance (eventually).
10578 */
10579 if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
10580 (sgs->group_type == group_misfit_task) &&
10581 (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
10582 sds->local_stat.group_type != group_has_spare))
10583 return false;
10584
10585 if (sgs->group_type > busiest->group_type)
10586 return true;
10587
10588 if (sgs->group_type < busiest->group_type)
10589 return false;
10590
10591 /*
10592 * The candidate and the current busiest group are the same type of
10593 * group. Let check which one is the busiest according to the type.
10594 */
10595
10596 switch (sgs->group_type) {
10597 case group_overloaded:
10598 /* Select the overloaded group with highest avg_load. */
10599 return sgs->avg_load > busiest->avg_load;
10600
10601 case group_imbalanced:
10602 /*
10603 * Select the 1st imbalanced group as we don't have any way to
10604 * choose one more than another.
10605 */
10606 return false;
10607
10608 case group_asym_packing:
10609 /* Prefer to move from lowest priority CPU's work */
10610 return sched_asym_prefer(READ_ONCE(sds->busiest->asym_prefer_cpu),
10611 READ_ONCE(sg->asym_prefer_cpu));
10612
10613 case group_misfit_task:
10614 /*
10615 * If we have more than one misfit sg go with the biggest
10616 * misfit.
10617 */
10618 return sgs->group_misfit_task_load > busiest->group_misfit_task_load;
10619
10620 case group_smt_balance:
10621 /*
10622 * Check if we have spare CPUs on either SMT group to
10623 * choose has spare or fully busy handling.
10624 */
10625 if (sgs->idle_cpus != 0 || busiest->idle_cpus != 0)
10626 goto has_spare;
10627
10628 fallthrough;
10629
10630 case group_fully_busy:
10631 /*
10632 * Select the fully busy group with highest avg_load. In
10633 * theory, there is no need to pull task from such kind of
10634 * group because tasks have all compute capacity that they need
10635 * but we can still improve the overall throughput by reducing
10636 * contention when accessing shared HW resources.
10637 *
10638 * XXX for now avg_load is not computed and always 0 so we
10639 * select the 1st one, except if @sg is composed of SMT
10640 * siblings.
10641 */
10642
10643 if (sgs->avg_load < busiest->avg_load)
10644 return false;
10645
10646 if (sgs->avg_load == busiest->avg_load) {
10647 /*
10648 * SMT sched groups need more help than non-SMT groups.
10649 * If @sg happens to also be SMT, either choice is good.
10650 */
10651 if (sds->busiest->flags & SD_SHARE_CPUCAPACITY)
10652 return false;
10653 }
10654
10655 break;
10656
10657 case group_has_spare:
10658 /*
10659 * Do not pick sg with SMT CPUs over sg with pure CPUs,
10660 * as we do not want to pull task off SMT core with one task
10661 * and make the core idle.
10662 */
10663 if (smt_vs_nonsmt_groups(sds->busiest, sg)) {
10664 if (sg->flags & SD_SHARE_CPUCAPACITY && sgs->sum_h_nr_running <= 1)
10665 return false;
10666 else
10667 return true;
10668 }
10669 has_spare:
10670
10671 /*
10672 * Select not overloaded group with lowest number of idle CPUs
10673 * and highest number of running tasks. We could also compare
10674 * the spare capacity which is more stable but it can end up
10675 * that the group has less spare capacity but finally more idle
10676 * CPUs which means less opportunity to pull tasks.
10677 */
10678 if (sgs->idle_cpus > busiest->idle_cpus)
10679 return false;
10680 else if ((sgs->idle_cpus == busiest->idle_cpus) &&
10681 (sgs->sum_nr_running <= busiest->sum_nr_running))
10682 return false;
10683
10684 break;
10685 }
10686
10687 /*
10688 * Candidate sg has no more than one task per CPU and has higher
10689 * per-CPU capacity. Migrating tasks to less capable CPUs may harm
10690 * throughput. Maximize throughput, power/energy consequences are not
10691 * considered.
10692 */
10693 if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
10694 (sgs->group_type <= group_fully_busy) &&
10695 (capacity_greater(sg->sgc->min_capacity, capacity_of(env->dst_cpu))))
10696 return false;
10697
10698 return true;
10699 }
10700
10701 #ifdef CONFIG_NUMA_BALANCING
fbq_classify_group(struct sg_lb_stats * sgs)10702 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
10703 {
10704 if (sgs->sum_h_nr_running > sgs->nr_numa_running)
10705 return regular;
10706 if (sgs->sum_h_nr_running > sgs->nr_preferred_running)
10707 return remote;
10708 return all;
10709 }
10710
fbq_classify_rq(struct rq * rq)10711 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
10712 {
10713 if (rq->nr_running > rq->nr_numa_running)
10714 return regular;
10715 if (rq->nr_running > rq->nr_preferred_running)
10716 return remote;
10717 return all;
10718 }
10719 #else /* !CONFIG_NUMA_BALANCING: */
/*
 * !CONFIG_NUMA_BALANCING stub: every group is classified as 'all',
 * independently of @sgs.
 */
static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
{
	return all;
}
10724
/*
 * !CONFIG_NUMA_BALANCING stub: every runqueue is classified as 'regular',
 * independently of @rq.
 */
static inline enum fbq_type fbq_classify_rq(struct rq *rq)
{
	return regular;
}
10729 #endif /* !CONFIG_NUMA_BALANCING */
10730
10731
/*
 * Forward declaration.
 * NOTE(review): the functions just above already dereference
 * struct sg_lb_stats, so this looks redundant - confirm before removing.
 */
struct sg_lb_stats;
10733
10734 /*
10735 * task_running_on_cpu - return 1 if @p is running on @cpu.
10736 */
10737
task_running_on_cpu(int cpu,struct task_struct * p)10738 static unsigned int task_running_on_cpu(int cpu, struct task_struct *p)
10739 {
10740 /* Task has no contribution or is new */
10741 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
10742 return 0;
10743
10744 if (task_on_rq_queued(p))
10745 return 1;
10746
10747 return 0;
10748 }
10749
10750 /**
10751 * idle_cpu_without - would a given CPU be idle without p ?
10752 * @cpu: the processor on which idleness is tested.
10753 * @p: task which should be ignored.
10754 *
10755 * Return: 1 if the CPU would be idle. 0 otherwise.
10756 */
idle_cpu_without(int cpu,struct task_struct * p)10757 static int idle_cpu_without(int cpu, struct task_struct *p)
10758 {
10759 struct rq *rq = cpu_rq(cpu);
10760
10761 if (rq->curr != rq->idle && rq->curr != p)
10762 return 0;
10763
10764 /*
10765 * rq->nr_running can't be used but an updated version without the
10766 * impact of p on cpu must be used instead. The updated nr_running
10767 * be computed and tested before calling idle_cpu_without().
10768 */
10769
10770 if (rq->ttwu_pending)
10771 return 0;
10772
10773 return 1;
10774 }
10775
10776 /*
10777 * update_sg_wakeup_stats - Update sched_group's statistics for wakeup.
10778 * @sd: The sched_domain level to look for idlest group.
10779 * @group: sched_group whose statistics are to be updated.
10780 * @sgs: variable to hold the statistics for this group.
10781 * @p: The task for which we look for the idlest group/CPU.
10782 */
update_sg_wakeup_stats(struct sched_domain * sd,struct sched_group * group,struct sg_lb_stats * sgs,struct task_struct * p)10783 static inline void update_sg_wakeup_stats(struct sched_domain *sd,
10784 struct sched_group *group,
10785 struct sg_lb_stats *sgs,
10786 struct task_struct *p)
10787 {
10788 int i, nr_running;
10789
10790 memset(sgs, 0, sizeof(*sgs));
10791
10792 /* Assume that task can't fit any CPU of the group */
10793 if (sd->flags & SD_ASYM_CPUCAPACITY)
10794 sgs->group_misfit_task_load = 1;
10795
10796 for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
10797 struct rq *rq = cpu_rq(i);
10798 unsigned int local;
10799
10800 sgs->group_load += cpu_load_without(rq, p);
10801 sgs->group_util += cpu_util_without(i, p);
10802 sgs->group_runnable += cpu_runnable_without(rq, p);
10803 local = task_running_on_cpu(i, p);
10804 sgs->sum_h_nr_running += rq->cfs.h_nr_runnable - local;
10805
10806 nr_running = rq->nr_running - local;
10807 sgs->sum_nr_running += nr_running;
10808
10809 /*
10810 * No need to call idle_cpu_without() if nr_running is not 0
10811 */
10812 if (!nr_running && idle_cpu_without(i, p))
10813 sgs->idle_cpus++;
10814
10815 /* Check if task fits in the CPU */
10816 if (sd->flags & SD_ASYM_CPUCAPACITY &&
10817 sgs->group_misfit_task_load &&
10818 task_fits_cpu(p, i))
10819 sgs->group_misfit_task_load = 0;
10820
10821 }
10822
10823 sgs->group_capacity = group->sgc->capacity;
10824
10825 sgs->group_weight = group->group_weight;
10826
10827 sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);
10828
10829 /*
10830 * Computing avg_load makes sense only when group is fully busy or
10831 * overloaded
10832 */
10833 if (sgs->group_type == group_fully_busy ||
10834 sgs->group_type == group_overloaded)
10835 sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
10836 sgs->group_capacity;
10837 }
10838
update_pick_idlest(struct sched_group * idlest,struct sg_lb_stats * idlest_sgs,struct sched_group * group,struct sg_lb_stats * sgs)10839 static bool update_pick_idlest(struct sched_group *idlest,
10840 struct sg_lb_stats *idlest_sgs,
10841 struct sched_group *group,
10842 struct sg_lb_stats *sgs)
10843 {
10844 if (sgs->group_type < idlest_sgs->group_type)
10845 return true;
10846
10847 if (sgs->group_type > idlest_sgs->group_type)
10848 return false;
10849
10850 /*
10851 * The candidate and the current idlest group are the same type of
10852 * group. Let check which one is the idlest according to the type.
10853 */
10854
10855 switch (sgs->group_type) {
10856 case group_overloaded:
10857 case group_fully_busy:
10858 /* Select the group with lowest avg_load. */
10859 if (idlest_sgs->avg_load <= sgs->avg_load)
10860 return false;
10861 break;
10862
10863 case group_imbalanced:
10864 case group_asym_packing:
10865 case group_smt_balance:
10866 /* Those types are not used in the slow wakeup path */
10867 return false;
10868
10869 case group_misfit_task:
10870 /* Select group with the highest max capacity */
10871 if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
10872 return false;
10873 break;
10874
10875 case group_has_spare:
10876 /* Select group with most idle CPUs */
10877 if (idlest_sgs->idle_cpus > sgs->idle_cpus)
10878 return false;
10879
10880 /* Select group with lowest group_util */
10881 if (idlest_sgs->idle_cpus == sgs->idle_cpus &&
10882 idlest_sgs->group_util <= sgs->group_util)
10883 return false;
10884
10885 break;
10886 }
10887
10888 return true;
10889 }
10890
/*
 * sched_balance_find_dst_group() finds and returns the least busy CPU group
 * within the domain.
 *
 * @sd:       sched_domain whose groups are scanned.
 * @p:        task being placed.
 * @this_cpu: CPU used to identify the local group.
 *
 * Assumes p is allowed on at least one CPU in sd.
 *
 * Return: the selected non-local group, or NULL when no such group was
 * found or when the local group is judged the better place for @p.
 */
static struct sched_group *
sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
{
	struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
	struct sg_lb_stats local_sgs, tmp_sgs;
	struct sg_lb_stats *sgs;
	unsigned long imbalance;
	/*
	 * Worst possible starting point, so that the first non-local group
	 * evaluated can replace it.
	 */
	struct sg_lb_stats idlest_sgs = {
			.avg_load = UINT_MAX,
			.group_type = group_overloaded,
	};

	/*
	 * Walk the circular list of groups. Note that 'continue' in a
	 * do/while jumps to the loop condition, which is what advances
	 * @group to the next entry.
	 */
	do {
		int local_group;

		/* Skip over this group if it has no CPUs allowed */
		if (!cpumask_intersects(sched_group_span(group),
					p->cpus_ptr))
			continue;

		/* Skip over this group if no cookie matched */
		if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group))
			continue;

		local_group = cpumask_test_cpu(this_cpu,
					       sched_group_span(group));

		/* Local stats are kept aside; other groups share a scratch copy. */
		if (local_group) {
			sgs = &local_sgs;
			local = group;
		} else {
			sgs = &tmp_sgs;
		}

		update_sg_wakeup_stats(sd, group, sgs, p);

		if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) {
			idlest = group;
			idlest_sgs = *sgs;
		}

	} while (group = group->next, group != sd->groups);


	/* There is no idlest group to push tasks to */
	if (!idlest)
		return NULL;

	/* The local group has been skipped because of CPU affinity */
	if (!local)
		return idlest;

	/*
	 * If the local group is idler than the selected idlest group
	 * don't try and push the task.
	 */
	if (local_sgs.group_type < idlest_sgs.group_type)
		return NULL;

	/*
	 * If the local group is busier than the selected idlest group
	 * try and push the task.
	 */
	if (local_sgs.group_type > idlest_sgs.group_type)
		return idlest;

	/* Both groups have the same type: compare them per type. */
	switch (local_sgs.group_type) {
	case group_overloaded:
	case group_fully_busy:

		/* Calculate allowed imbalance based on load */
		imbalance = scale_load_down(NICE_0_LOAD) *
				(sd->imbalance_pct-100) / 100;

		/*
		 * When comparing groups across NUMA domains, it's possible for
		 * the local domain to be very lightly loaded relative to the
		 * remote domains but "imbalance" skews the comparison making
		 * remote CPUs look much more favourable. When considering
		 * cross-domain, add imbalance to the load on the remote node
		 * and consider staying local.
		 */

		if ((sd->flags & SD_NUMA) &&
		    ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load))
			return NULL;

		/*
		 * If the local group is less loaded than the selected
		 * idlest group don't try and push any tasks.
		 */
		if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance))
			return NULL;

		/*
		 * Also stay local unless the local load exceeds the idlest
		 * load by more than the imbalance_pct ratio.
		 */
		if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load)
			return NULL;
		break;

	case group_imbalanced:
	case group_asym_packing:
	case group_smt_balance:
		/* Those types are not used in the slow wakeup path */
		return NULL;

	case group_misfit_task:
		/* Select group with the highest max capacity */
		if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
			return NULL;
		break;

	case group_has_spare:
#ifdef CONFIG_NUMA
		if (sd->flags & SD_NUMA) {
			int imb_numa_nr = sd->imb_numa_nr;
#ifdef CONFIG_NUMA_BALANCING
			int idlest_cpu;
			/*
			 * If there is spare capacity at NUMA, try to select
			 * the preferred node
			 */
			if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
				return NULL;

			idlest_cpu = cpumask_first(sched_group_span(idlest));
			if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
				return idlest;
#endif /* CONFIG_NUMA_BALANCING */
			/*
			 * Otherwise, keep the task close to the wakeup source
			 * and improve locality if the number of running tasks
			 * would remain below threshold where an imbalance is
			 * allowed while accounting for the possibility the
			 * task is pinned to a subset of CPUs. If there is a
			 * real need of migration, periodic load balance will
			 * take care of it.
			 */
			if (p->nr_cpus_allowed != NR_CPUS) {
				unsigned int w = cpumask_weight_and(p->cpus_ptr,
								sched_group_span(local));
				imb_numa_nr = min(w, sd->imb_numa_nr);
			}

			/* A zero adjusted imbalance means: stay local. */
			imbalance = abs(local_sgs.idle_cpus - idlest_sgs.idle_cpus);
			if (!adjust_numa_imbalance(imbalance,
						   local_sgs.sum_nr_running + 1,
						   imb_numa_nr)) {
				return NULL;
			}
		}
#endif /* CONFIG_NUMA */

		/*
		 * Select group with highest number of idle CPUs. We could also
		 * compare the utilization which is more stable but it can end
		 * up that the group has less spare capacity but finally more
		 * idle CPUs which means more opportunity to run task.
		 */
		if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus)
			return NULL;
		break;
	}

	return idlest;
}
11061
update_idle_cpu_scan(struct lb_env * env,unsigned long sum_util)11062 static void update_idle_cpu_scan(struct lb_env *env,
11063 unsigned long sum_util)
11064 {
11065 struct sched_domain_shared *sd_share;
11066 int llc_weight, pct;
11067 u64 x, y, tmp;
11068 /*
11069 * Update the number of CPUs to scan in LLC domain, which could
11070 * be used as a hint in select_idle_cpu(). The update of sd_share
11071 * could be expensive because it is within a shared cache line.
11072 * So the write of this hint only occurs during periodic load
11073 * balancing, rather than CPU_NEWLY_IDLE, because the latter
11074 * can fire way more frequently than the former.
11075 */
11076 if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE)
11077 return;
11078
11079 llc_weight = per_cpu(sd_llc_size, env->dst_cpu);
11080 if (env->sd->span_weight != llc_weight)
11081 return;
11082
11083 sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, env->dst_cpu));
11084 if (!sd_share)
11085 return;
11086
11087 /*
11088 * The number of CPUs to search drops as sum_util increases, when
11089 * sum_util hits 85% or above, the scan stops.
11090 * The reason to choose 85% as the threshold is because this is the
11091 * imbalance_pct(117) when a LLC sched group is overloaded.
11092 *
11093 * let y = SCHED_CAPACITY_SCALE - p * x^2 [1]
11094 * and y'= y / SCHED_CAPACITY_SCALE
11095 *
11096 * x is the ratio of sum_util compared to the CPU capacity:
11097 * x = sum_util / (llc_weight * SCHED_CAPACITY_SCALE)
11098 * y' is the ratio of CPUs to be scanned in the LLC domain,
11099 * and the number of CPUs to scan is calculated by:
11100 *
11101 * nr_scan = llc_weight * y' [2]
11102 *
11103 * When x hits the threshold of overloaded, AKA, when
11104 * x = 100 / pct, y drops to 0. According to [1],
11105 * p should be SCHED_CAPACITY_SCALE * pct^2 / 10000
11106 *
11107 * Scale x by SCHED_CAPACITY_SCALE:
11108 * x' = sum_util / llc_weight; [3]
11109 *
11110 * and finally [1] becomes:
11111 * y = SCHED_CAPACITY_SCALE -
11112 * x'^2 * pct^2 / (10000 * SCHED_CAPACITY_SCALE) [4]
11113 *
11114 */
11115 /* equation [3] */
11116 x = sum_util;
11117 do_div(x, llc_weight);
11118
11119 /* equation [4] */
11120 pct = env->sd->imbalance_pct;
11121 tmp = x * x * pct * pct;
11122 do_div(tmp, 10000 * SCHED_CAPACITY_SCALE);
11123 tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE);
11124 y = SCHED_CAPACITY_SCALE - tmp;
11125
11126 /* equation [2] */
11127 y *= llc_weight;
11128 do_div(y, SCHED_CAPACITY_SCALE);
11129 if ((int)y != sd_share->nr_idle_scan)
11130 WRITE_ONCE(sd_share->nr_idle_scan, (int)y);
11131 }
11132
11133 /**
11134 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
11135 * @env: The load balancing environment.
11136 * @sds: variable to hold the statistics for this sched_domain.
11137 */
11138
update_sd_lb_stats(struct lb_env * env,struct sd_lb_stats * sds)11139 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
11140 {
11141 struct sched_group *sg = env->sd->groups;
11142 struct sg_lb_stats *local = &sds->local_stat;
11143 struct sg_lb_stats tmp_sgs;
11144 unsigned long sum_util = 0;
11145 bool sg_overloaded = 0, sg_overutilized = 0;
11146
11147 do {
11148 struct sg_lb_stats *sgs = &tmp_sgs;
11149 int local_group;
11150
11151 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
11152 if (local_group) {
11153 sds->local = sg;
11154 sgs = local;
11155
11156 if (env->idle != CPU_NEWLY_IDLE ||
11157 time_after_eq(jiffies, sg->sgc->next_update))
11158 update_group_capacity(env->sd, env->dst_cpu);
11159 }
11160
11161 update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized);
11162
11163 if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
11164 sds->busiest = sg;
11165 sds->busiest_stat = *sgs;
11166 }
11167
11168 /* Now, start updating sd_lb_stats */
11169 sds->total_load += sgs->group_load;
11170 sds->total_capacity += sgs->group_capacity;
11171
11172 sum_util += sgs->group_util;
11173 sg = sg->next;
11174 } while (sg != env->sd->groups);
11175
11176 /*
11177 * Indicate that the child domain of the busiest group prefers tasks
11178 * go to a child's sibling domains first. NB the flags of a sched group
11179 * are those of the child domain.
11180 */
11181 if (sds->busiest)
11182 sds->prefer_sibling = !!(sds->busiest->flags & SD_PREFER_SIBLING);
11183
11184
11185 if (env->sd->flags & SD_NUMA)
11186 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
11187
11188 if (!env->sd->parent) {
11189 /* update overload indicator if we are at root domain */
11190 set_rd_overloaded(env->dst_rq->rd, sg_overloaded);
11191
11192 /* Update over-utilization (tipping point, U >= 0) indicator */
11193 set_rd_overutilized(env->dst_rq->rd, sg_overutilized);
11194 } else if (sg_overutilized) {
11195 set_rd_overutilized(env->dst_rq->rd, sg_overutilized);
11196 }
11197
11198 update_idle_cpu_scan(env, sum_util);
11199 }
11200
/**
 * calculate_imbalance - Calculate the amount of imbalance present within the
 *			 groups of a given sched_domain during load balance.
 * @env: load balance environment
 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
 *
 * The result is stored in @env->imbalance, in the unit implied by
 * @env->migration_type (tasks, utilization, load or misfit), which is set
 * here as well on every path that reports a non-zero imbalance.
 */
static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
	struct sg_lb_stats *local, *busiest;

	local = &sds->local_stat;
	busiest = &sds->busiest_stat;

	if (busiest->group_type == group_misfit_task) {
		if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
			/* Set imbalance to allow misfit tasks to be balanced. */
			env->migration_type = migrate_misfit;
			env->imbalance = 1;
		} else {
			/*
			 * Set load imbalance to allow moving task from cpu
			 * with reduced capacity.
			 */
			env->migration_type = migrate_load;
			env->imbalance = busiest->group_misfit_task_load;
		}
		return;
	}

	if (busiest->group_type == group_asym_packing) {
		/*
		 * In case of asym capacity, we will try to migrate all load to
		 * the preferred CPU.
		 */
		env->migration_type = migrate_task;
		env->imbalance = busiest->sum_h_nr_running;
		return;
	}

	if (busiest->group_type == group_smt_balance) {
		/* Reduce number of tasks sharing CPU capacity */
		env->migration_type = migrate_task;
		env->imbalance = 1;
		return;
	}

	if (busiest->group_type == group_imbalanced) {
		/*
		 * In the group_imb case we cannot rely on group-wide averages
		 * to ensure CPU-load equilibrium, try to move any task to fix
		 * the imbalance. The next load balance will take care of
		 * balancing back the system.
		 */
		env->migration_type = migrate_task;
		env->imbalance = 1;
		return;
	}

	/*
	 * Try to use spare capacity of local group without overloading it or
	 * emptying busiest.
	 */
	if (local->group_type == group_has_spare) {
		if ((busiest->group_type > group_fully_busy) &&
		    !(env->sd->flags & SD_SHARE_LLC)) {
			/*
			 * If busiest is overloaded, try to fill spare
			 * capacity. This might end up creating spare capacity
			 * in busiest or busiest still being overloaded but
			 * there is no simple way to directly compute the
			 * amount of load to migrate in order to balance the
			 * system.
			 */
			env->migration_type = migrate_util;
			/* max() keeps the subtraction from going below zero. */
			env->imbalance = max(local->group_capacity, local->group_util) -
					 local->group_util;

			/*
			 * In some cases, the group's utilization is max or even
			 * higher than capacity because of migrations but the
			 * local CPU is (newly) idle. There is at least one
			 * waiting task in this overloaded busiest group. Let's
			 * try to pull it.
			 */
			if (env->idle && env->imbalance == 0) {
				env->migration_type = migrate_task;
				env->imbalance = 1;
			}

			return;
		}

		if (busiest->group_weight == 1 || sds->prefer_sibling) {
			/*
			 * When prefer sibling, evenly spread running tasks on
			 * groups.
			 */
			env->migration_type = migrate_task;
			env->imbalance = sibling_imbalance(env, sds, busiest, local);
		} else {

			/*
			 * If there is no overload, we just want to even the number of
			 * idle CPUs.
			 */
			env->migration_type = migrate_task;
			env->imbalance = max_t(long, 0,
					       (local->idle_cpus - busiest->idle_cpus));
		}

#ifdef CONFIG_NUMA
		/* Consider allowing a small imbalance between NUMA groups */
		if (env->sd->flags & SD_NUMA) {
			env->imbalance = adjust_numa_imbalance(env->imbalance,
							       local->sum_nr_running + 1,
							       env->sd->imb_numa_nr);
		}
#endif

		/* Number of tasks to move to restore balance */
		env->imbalance >>= 1;

		return;
	}

	/*
	 * Local is fully busy but has to take more load to relieve the
	 * busiest group
	 */
	if (local->group_type < group_overloaded) {
		/*
		 * Local will become overloaded so the avg_load metrics are
		 * finally needed.
		 */

		local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
				  local->group_capacity;

		/*
		 * If the local group is more loaded than the selected
		 * busiest group don't try to pull any tasks.
		 */
		if (local->avg_load >= busiest->avg_load) {
			env->imbalance = 0;
			return;
		}

		sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
				sds->total_capacity;

		/*
		 * If the local group is more loaded than the average system
		 * load, don't try to pull any tasks.
		 */
		if (local->avg_load >= sds->avg_load) {
			env->imbalance = 0;
			return;
		}

	}

	/*
	 * Both groups are or will become overloaded and we're trying to get all
	 * the CPUs to the average_load, so we don't want to push ourselves
	 * above the average load, nor do we wish to reduce the max loaded CPU
	 * below the average load. At the same time, we also don't want to
	 * reduce the group load below the group capacity. Thus we look for
	 * the minimum possible imbalance.
	 *
	 * NOTE(review): when local is already group_overloaded the block
	 * above is skipped, so sds->avg_load and local->avg_load must have
	 * been computed before this function is called - confirm against
	 * the caller.
	 */
	env->migration_type = migrate_load;
	env->imbalance = min(
		(busiest->avg_load - sds->avg_load) * busiest->group_capacity,
		(sds->avg_load - local->avg_load) * local->group_capacity
	) / SCHED_CAPACITY_SCALE;
}
11376
11377 /******* sched_balance_find_src_group() helpers end here *********************/
11378
11379 /*
11380 * Decision matrix according to the local and busiest group type:
11381 *
11382 * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
11383 * has_spare nr_idle balanced N/A N/A balanced balanced
11384 * fully_busy nr_idle nr_idle N/A N/A balanced balanced
11385 * misfit_task force N/A N/A N/A N/A N/A
11386 * asym_packing force force N/A N/A force force
11387 * imbalanced force force N/A N/A force force
11388 * overloaded force force N/A N/A force avg_load
11389 *
11390 * N/A : Not Applicable because already filtered while updating
11391 * statistics.
11392 * balanced : The system is balanced for these 2 groups.
11393 * force : Calculate the imbalance as load migration is probably needed.
11394 * avg_load : Only if imbalance is significant enough.
11395 * nr_idle : dst_cpu is not busy and the number of idle CPUs is quite
11396 * different in groups.
11397 */
11398
/**
 * sched_balance_find_src_group - Returns the busiest group within the sched_domain
 * if there is an imbalance.
 * @env: The load balancing environment.
 *
 * Also calculates the amount of runnable load which should be moved
 * to restore balance; the result is left in @env->imbalance, which is
 * set to 0 when the domain is considered balanced.
 *
 * Return:	- The busiest group if imbalance exists, NULL otherwise.
 */
static struct sched_group *sched_balance_find_src_group(struct lb_env *env)
{
	struct sg_lb_stats *local, *busiest;
	struct sd_lb_stats sds;

	init_sd_lb_stats(&sds);

	/*
	 * Compute the various statistics relevant for load balancing at
	 * this level.
	 */
	update_sd_lb_stats(env, &sds);

	/* There is no busy sibling group to pull tasks from */
	if (!sds.busiest)
		goto out_balanced;

	busiest = &sds.busiest_stat;

	/* Misfit tasks should be dealt with regardless of the avg load */
	if (busiest->group_type == group_misfit_task)
		goto force_balance;

	/*
	 * Do not balance while the root domain has a perf domain attached
	 * and is not overutilized.
	 * NOTE(review): presumably this leaves task placement to
	 * energy-aware scheduling - confirm.
	 */
	if (!is_rd_overutilized(env->dst_rq->rd) &&
	    rcu_dereference_all(env->dst_rq->rd->pd))
		goto out_balanced;

	/* ASYM feature bypasses nice load balance check */
	if (busiest->group_type == group_asym_packing)
		goto force_balance;

	/*
	 * If the busiest group is imbalanced the below checks don't
	 * work because they assume all things are equal, which typically
	 * isn't true due to cpus_ptr constraints and the like.
	 */
	if (busiest->group_type == group_imbalanced)
		goto force_balance;

	local = &sds.local_stat;
	/*
	 * If the local group is busier than the selected busiest group
	 * don't try and pull any tasks.
	 */
	if (local->group_type > busiest->group_type)
		goto out_balanced;

	/*
	 * When groups are overloaded, use the avg_load to ensure fairness
	 * between tasks.
	 */
	if (local->group_type == group_overloaded) {
		/*
		 * If the local group is more loaded than the selected
		 * busiest group don't try to pull any tasks.
		 */
		if (local->avg_load >= busiest->avg_load)
			goto out_balanced;

		/* XXX broken for overlapping NUMA groups */
		sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
				sds.total_capacity;

		/*
		 * Don't pull any tasks if this group is already above the
		 * domain average load.
		 */
		if (local->avg_load >= sds.avg_load)
			goto out_balanced;

		/*
		 * If the busiest group is more loaded, use imbalance_pct to be
		 * conservative.
		 */
		if (100 * busiest->avg_load <=
				env->sd->imbalance_pct * local->avg_load)
			goto out_balanced;
	}

	/*
	 * Try to move all excess tasks to a sibling domain of the busiest
	 * group's child domain.
	 */
	if (sds.prefer_sibling && local->group_type == group_has_spare &&
	    sibling_imbalance(env, &sds, busiest, local) > 1)
		goto force_balance;

	if (busiest->group_type != group_overloaded) {
		if (!env->idle) {
			/*
			 * If the busiest group is not overloaded (and as a
			 * result the local one too) but this CPU is already
			 * busy, let another idle CPU try to pull task.
			 */
			goto out_balanced;
		}

		if (busiest->group_type == group_smt_balance &&
		    smt_vs_nonsmt_groups(sds.local, sds.busiest)) {
			/* Let non SMT CPU pull from SMT CPU sharing with sibling */
			goto force_balance;
		}

		if (busiest->group_weight > 1 &&
		    local->idle_cpus <= (busiest->idle_cpus + 1)) {
			/*
			 * If the busiest group is not overloaded
			 * and there is no imbalance between this and busiest
			 * group wrt idle CPUs, it is balanced. The imbalance
			 * becomes significant if the diff is greater than 1
			 * otherwise we might end up to just move the imbalance
			 * on another group. Of course this applies only if
			 * there is more than 1 CPU per group.
			 */
			goto out_balanced;
		}

		if (busiest->sum_h_nr_running == 1) {
			/*
			 * busiest doesn't have any tasks waiting to run
			 */
			goto out_balanced;
		}
	}

force_balance:
	/* Looks like there is an imbalance. Compute it */
	calculate_imbalance(env, &sds);
	/* A computed imbalance of 0 still means "balanced". */
	return env->imbalance ? sds.busiest : NULL;

out_balanced:
	env->imbalance = 0;
	return NULL;
}
11543
/*
 * sched_balance_find_src_rq - find the busiest runqueue among the CPUs in the group.
 * @env:   load balancing environment; env->migration_type selects the
 *         metric used to rank runqueues and env->cpus restricts the
 *         CPUs that are considered.
 * @group: sched_group whose CPUs are scanned.
 *
 * Return: the selected runqueue, or NULL if no CPU qualified.
 */
static struct rq *sched_balance_find_src_rq(struct lb_env *env,
				     struct sched_group *group)
{
	struct rq *busiest = NULL, *rq;
	/*
	 * Running maxima, one per metric. With busiest_load == 0 and
	 * busiest_capacity == 1, any CPU with a non-zero load wins the
	 * first migrate_load comparison below.
	 */
	unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
	unsigned int busiest_nr = 0;
	int i;

	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
		unsigned long capacity, load, util;
		unsigned int nr_running;
		enum fbq_type rt;

		rq = cpu_rq(i);
		rt = fbq_classify_rq(rq);

		/*
		 * We classify groups/runqueues into three groups:
		 *  - regular: there are !numa tasks
		 *  - remote:  there are numa tasks that run on the 'wrong' node
		 *  - all:     there is no distinction
		 *
		 * In order to avoid migrating ideally placed numa tasks,
		 * ignore those when there's better options.
		 *
		 * If we ignore the actual busiest queue to migrate another
		 * task, the next balance pass can still reduce the busiest
		 * queue by moving tasks around inside the node.
		 *
		 * If we cannot move enough load due to this classification
		 * the next pass will adjust the group classification and
		 * allow migration of more tasks.
		 *
		 * Both cases only affect the total convergence complexity.
		 */
		if (rt > env->fbq_type)
			continue;

		/* Nothing to pull from a CPU without runnable CFS tasks. */
		nr_running = rq->cfs.h_nr_runnable;
		if (!nr_running)
			continue;

		capacity = capacity_of(i);

		/*
		 * For ASYM_CPUCAPACITY domains, don't pick a CPU that could
		 * eventually lead to active_balancing high->low capacity.
		 * Higher per-CPU capacity is considered better than balancing
		 * average load.
		 */
		if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
		    !capacity_greater(capacity_of(env->dst_cpu), capacity) &&
		    nr_running == 1)
			continue;

		/*
		 * Make sure we only pull tasks from a CPU of lower priority
		 * when balancing between SMT siblings.
		 *
		 * If balancing between cores, let lower priority CPUs help
		 * SMT cores with more than one busy sibling.
		 */
		if (sched_asym(env->sd, i, env->dst_cpu) && nr_running == 1)
			continue;

		/*
		 * Rank this runqueue with the metric matching the migration
		 * type. Inside the switch, 'break' only leaves the switch
		 * while 'continue' moves on to the next CPU; both end the
		 * handling of this CPU since nothing follows the switch.
		 */
		switch (env->migration_type) {
		case migrate_load:
			/*
			 * When comparing with load imbalance, use cpu_load()
			 * which is not scaled with the CPU capacity.
			 */
			load = cpu_load(rq);

			/*
			 * Skip a CPU whose single task carries more load than
			 * the imbalance, unless check_cpu_capacity() flags it.
			 */
			if (nr_running == 1 && load > env->imbalance &&
			    !check_cpu_capacity(rq, env->sd))
				break;

			/*
			 * For the load comparisons with the other CPUs,
			 * consider the cpu_load() scaled with the CPU
			 * capacity, so that the load can be moved away
			 * from the CPU that is potentially running at a
			 * lower capacity.
			 *
			 * Thus we're looking for max(load_i / capacity_i),
			 * crosswise multiplication to rid ourselves of the
			 * division works out to:
			 *   load_i * capacity_j > load_j * capacity_i;
			 * where j is our previous maximum.
			 */
			if (load * busiest_capacity > busiest_load * capacity) {
				busiest_load = load;
				busiest_capacity = capacity;
				busiest = rq;
			}
			break;

		case migrate_util:
			util = cpu_util_cfs_boost(i);

			/*
			 * Don't try to pull utilization from a CPU with one
			 * running task. Whatever its utilization, we will fail
			 * to detach the task.
			 */
			if (nr_running <= 1)
				continue;

			if (busiest_util < util) {
				busiest_util = util;
				busiest = rq;
			}
			break;

		case migrate_task:
			/* Pick the CPU with the most runnable CFS tasks. */
			if (busiest_nr < nr_running) {
				busiest_nr = nr_running;
				busiest = rq;
			}
			break;

		case migrate_misfit:
			/*
			 * For ASYM_CPUCAPACITY domains with misfit tasks we
			 * simply seek the "biggest" misfit task.
			 */
			if (rq->misfit_task_load > busiest_load) {
				busiest_load = rq->misfit_task_load;
				busiest = rq;
			}

			break;

		}
	}

	return busiest;
}
11685
/*
 * Max backoff if we encounter pinned tasks. The value is fairly arbitrary;
 * it only needs to be large enough.
 */
#define MAX_PINNED_INTERVAL	512
11691
11692 static inline bool
asym_active_balance(struct lb_env * env)11693 asym_active_balance(struct lb_env *env)
11694 {
11695 /*
11696 * ASYM_PACKING needs to force migrate tasks from busy but lower
11697 * priority CPUs in order to pack all tasks in the highest priority
11698 * CPUs. When done between cores, do it only if the whole core if the
11699 * whole core is idle.
11700 *
11701 * If @env::src_cpu is an SMT core with busy siblings, let
11702 * the lower priority @env::dst_cpu help it. Do not follow
11703 * CPU priority.
11704 */
11705 return env->idle && sched_use_asym_prio(env->sd, env->dst_cpu) &&
11706 (sched_asym_prefer(env->dst_cpu, env->src_cpu) ||
11707 !sched_use_asym_prio(env->sd, env->src_cpu));
11708 }
11709
11710 static inline bool
imbalanced_active_balance(struct lb_env * env)11711 imbalanced_active_balance(struct lb_env *env)
11712 {
11713 struct sched_domain *sd = env->sd;
11714
11715 /*
11716 * The imbalanced case includes the case of pinned tasks preventing a fair
11717 * distribution of the load on the system but also the even distribution of the
11718 * threads on a system with spare capacity
11719 */
11720 if ((env->migration_type == migrate_task) &&
11721 (sd->nr_balance_failed > sd->cache_nice_tries+2))
11722 return 1;
11723
11724 return 0;
11725 }
11726
need_active_balance(struct lb_env * env)11727 static int need_active_balance(struct lb_env *env)
11728 {
11729 struct sched_domain *sd = env->sd;
11730
11731 if (asym_active_balance(env))
11732 return 1;
11733
11734 if (imbalanced_active_balance(env))
11735 return 1;
11736
11737 /*
11738 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
11739 * It's worth migrating the task if the src_cpu's capacity is reduced
11740 * because of other sched_class or IRQs if more capacity stays
11741 * available on dst_cpu.
11742 */
11743 if (env->idle &&
11744 (env->src_rq->cfs.h_nr_runnable == 1)) {
11745 if ((check_cpu_capacity(env->src_rq, sd)) &&
11746 (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
11747 return 1;
11748 }
11749
11750 if (env->migration_type == migrate_misfit)
11751 return 1;
11752
11753 return 0;
11754 }
11755
/*
 * Forward declaration: the stopper callback is defined further down in this
 * file but is already handed to stop_one_cpu_nowait() by sched_balance_rq().
 */
static int active_load_balance_cpu_stop(void *data);
11757
should_we_balance(struct lb_env * env)11758 static int should_we_balance(struct lb_env *env)
11759 {
11760 struct cpumask *swb_cpus = this_cpu_cpumask_var_ptr(should_we_balance_tmpmask);
11761 struct sched_group *sg = env->sd->groups;
11762 int cpu, idle_smt = -1;
11763
11764 /*
11765 * Ensure the balancing environment is consistent; can happen
11766 * when the softirq triggers 'during' hotplug.
11767 */
11768 if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
11769 return 0;
11770
11771 /*
11772 * In the newly idle case, we will allow all the CPUs
11773 * to do the newly idle load balance.
11774 *
11775 * However, we bail out if we already have tasks or a wakeup pending,
11776 * to optimize wakeup latency.
11777 */
11778 if (env->idle == CPU_NEWLY_IDLE) {
11779 if (env->dst_rq->nr_running > 0 || env->dst_rq->ttwu_pending)
11780 return 0;
11781 return 1;
11782 }
11783
11784 cpumask_copy(swb_cpus, group_balance_mask(sg));
11785 /* Try to find first idle CPU */
11786 for_each_cpu_and(cpu, swb_cpus, env->cpus) {
11787 if (!idle_cpu(cpu))
11788 continue;
11789
11790 /*
11791 * Don't balance to idle SMT in busy core right away when
11792 * balancing cores, but remember the first idle SMT CPU for
11793 * later consideration. Find CPU on an idle core first.
11794 */
11795 if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) {
11796 if (idle_smt == -1)
11797 idle_smt = cpu;
11798 /*
11799 * If the core is not idle, and first SMT sibling which is
11800 * idle has been found, then its not needed to check other
11801 * SMT siblings for idleness:
11802 */
11803 #ifdef CONFIG_SCHED_SMT
11804 cpumask_andnot(swb_cpus, swb_cpus, cpu_smt_mask(cpu));
11805 #endif
11806 continue;
11807 }
11808
11809 /*
11810 * Are we the first idle core in a non-SMT domain or higher,
11811 * or the first idle CPU in a SMT domain?
11812 */
11813 return cpu == env->dst_cpu;
11814 }
11815
11816 /* Are we the first idle CPU with busy siblings? */
11817 if (idle_smt != -1)
11818 return idle_smt == env->dst_cpu;
11819
11820 /* Are we the first CPU of this group ? */
11821 return group_balance_cpu(sg) == env->dst_cpu;
11822 }
11823
update_lb_imbalance_stat(struct lb_env * env,struct sched_domain * sd,enum cpu_idle_type idle)11824 static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd,
11825 enum cpu_idle_type idle)
11826 {
11827 if (!schedstat_enabled())
11828 return;
11829
11830 switch (env->migration_type) {
11831 case migrate_load:
11832 __schedstat_add(sd->lb_imbalance_load[idle], env->imbalance);
11833 break;
11834 case migrate_util:
11835 __schedstat_add(sd->lb_imbalance_util[idle], env->imbalance);
11836 break;
11837 case migrate_task:
11838 __schedstat_add(sd->lb_imbalance_task[idle], env->imbalance);
11839 break;
11840 case migrate_misfit:
11841 __schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance);
11842 break;
11843 }
11844 }
11845
/*
 * This flag serializes load-balancing passes over large domains
 * (above the NODE topology level) - only one load-balancing instance
 * may run at a time, to reduce overhead on very large systems with
 * lots of CPUs and large NUMA distances.
 *
 * - Note that load-balancing passes triggered while another one
 *   is executing are skipped and not re-tried.
 *
 * - Also note that this does not serialize rebalance_domains()
 *   execution, as non-SD_SERIALIZE domains will still be
 *   load-balanced in parallel.
 *
 * sched_balance_rq() takes it with a 0 -> 1 cmpxchg (acquire) for
 * SD_SERIALIZE domains and drops it with a release store of 0 on exit.
 */
static atomic_t sched_balance_running = ATOMIC_INIT(0);
11860
/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 *
 * @this_cpu:		CPU the balance is run for; initial env.dst_cpu.
 * @this_rq:		runqueue of @this_cpu; initial env.dst_rq.
 * @sd:			sched domain to balance.
 * @idle:		idle state of @this_cpu; also indexes the schedstats.
 * @continue_balancing:	set to 0 when should_we_balance() rejects this CPU.
 *
 * Returns the accumulated detach_tasks() result, or 0 when leaving through
 * the out_balanced, out_all_pinned or out_one_pinned labels.
 */
static int sched_balance_rq(int this_cpu, struct rq *this_rq,
			struct sched_domain *sd, enum cpu_idle_type idle,
			int *continue_balancing)
{
	int ld_moved, cur_ld_moved, active_balance = 0;
	struct sched_domain *sd_parent = sd->parent;
	struct sched_group *group;
	struct rq *busiest;
	struct rq_flags rf;
	struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
	struct lb_env env = {
		.sd		= sd,
		.dst_cpu	= this_cpu,
		.dst_rq		= this_rq,
		.dst_grpmask    = group_balance_mask(sd->groups),
		.idle		= idle,
		.loop_break	= SCHED_NR_MIGRATE_BREAK,
		.cpus		= cpus,
		.fbq_type	= all,
		.tasks		= LIST_HEAD_INIT(env.tasks),
	};
	/* True once this pass owns sched_balance_running. */
	bool need_unlock = false;

	/* Only active CPUs within the domain span are considered. */
	cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);

	schedstat_inc(sd->lb_count[idle]);

redo:
	if (!should_we_balance(&env)) {
		*continue_balancing = 0;
		goto out_balanced;
	}

	/*
	 * SD_SERIALIZE domains: take the global flag once per call (a "redo"
	 * pass keeps it); if someone else holds it, skip this pass.
	 */
	if (!need_unlock && (sd->flags & SD_SERIALIZE)) {
		int zero = 0;
		if (!atomic_try_cmpxchg_acquire(&sched_balance_running, &zero, 1))
			goto out_balanced;

		need_unlock = true;
	}

	group = sched_balance_find_src_group(&env);
	if (!group) {
		schedstat_inc(sd->lb_nobusyg[idle]);
		goto out_balanced;
	}

	busiest = sched_balance_find_src_rq(&env, group);
	if (!busiest) {
		schedstat_inc(sd->lb_nobusyq[idle]);
		goto out_balanced;
	}

	WARN_ON_ONCE(busiest == env.dst_rq);

	update_lb_imbalance_stat(&env, sd, idle);

	env.src_cpu = busiest->cpu;
	env.src_rq = busiest;

	ld_moved = 0;
	/* Clear this flag as soon as we find a pullable task */
	env.flags |= LBF_ALL_PINNED;
	if (busiest->nr_running > 1) {
		/*
		 * Attempt to move tasks. If sched_balance_find_src_group has found
		 * an imbalance but busiest->nr_running <= 1, the group is
		 * still unbalanced. ld_moved simply stays zero, so it is
		 * correctly treated as an imbalance.
		 */
		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);

more_balance:
		rq_lock_irqsave(busiest, &rf);
		update_rq_clock(busiest);

		/*
		 * cur_ld_moved - load moved in current iteration
		 * ld_moved     - cumulative load moved across iterations
		 */
		cur_ld_moved = detach_tasks(&env);

		/*
		 * We've detached some tasks from busiest_rq. Every
		 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
		 * unlock busiest->lock, and we are able to be sure
		 * that nobody can manipulate the tasks in parallel.
		 * See task_rq_lock() family for the details.
		 */

		rq_unlock(busiest, &rf);

		if (cur_ld_moved) {
			attach_tasks(&env);
			ld_moved += cur_ld_moved;
		}

		/*
		 * The IRQ state saved by rq_lock_irqsave() above is only
		 * restored here, after the attach.
		 */
		local_irq_restore(rf.flags);

		if (env.flags & LBF_NEED_BREAK) {
			env.flags &= ~LBF_NEED_BREAK;
			goto more_balance;
		}

		/*
		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
		 * us and move them to an alternate dst_cpu in our sched_group
		 * where they can run. The upper limit on how many times we
		 * iterate on same src_cpu is dependent on number of CPUs in our
		 * sched_group.
		 *
		 * This changes load balance semantics a bit on who can move
		 * load to a given_cpu. In addition to the given_cpu itself
		 * (or a ilb_cpu acting on its behalf where given_cpu is
		 * nohz-idle), we now have balance_cpu in a position to move
		 * load to given_cpu. In rare situations, this may cause
		 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
		 * _independently_ and at _same_ time to move some load to
		 * given_cpu) causing excess load to be moved to given_cpu.
		 * This however should not happen so much in practice and
		 * moreover subsequent load balance cycles should correct the
		 * excess load moved.
		 */
		if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {

			/* Prevent to re-select dst_cpu via env's CPUs */
			__cpumask_clear_cpu(env.dst_cpu, env.cpus);

			env.dst_rq	 = cpu_rq(env.new_dst_cpu);
			env.dst_cpu	 = env.new_dst_cpu;
			env.flags	&= ~LBF_DST_PINNED;
			env.loop	 = 0;
			env.loop_break	 = SCHED_NR_MIGRATE_BREAK;

			/*
			 * Go back to "more_balance" rather than "redo" since we
			 * need to continue with same src_cpu.
			 */
			goto more_balance;
		}

		/*
		 * We failed to reach balance because of affinity. Flag the
		 * imbalance on the parent domain's first group.
		 */
		if (sd_parent) {
			int *group_imbalance = &sd_parent->groups->sgc->imbalance;

			if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
				*group_imbalance = 1;
		}

		/* All tasks on this runqueue were pinned by CPU affinity */
		if (unlikely(env.flags & LBF_ALL_PINNED)) {
			__cpumask_clear_cpu(cpu_of(busiest), cpus);
			/*
			 * Attempting to continue load balancing at the current
			 * sched_domain level only makes sense if there are
			 * active CPUs remaining as possible busiest CPUs to
			 * pull load from which are not contained within the
			 * destination group that is receiving any migrated
			 * load.
			 */
			if (!cpumask_subset(cpus, env.dst_grpmask)) {
				env.loop = 0;
				env.loop_break = SCHED_NR_MIGRATE_BREAK;
				goto redo;
			}
			goto out_all_pinned;
		}
	}

	if (!ld_moved) {
		schedstat_inc(sd->lb_failed[idle]);
		/*
		 * Increment the failure counter only on periodic balance.
		 * We do not want newidle balance, which can be very
		 * frequent, pollute the failure counter causing
		 * excessive cache_hot migrations and active balances.
		 *
		 * Similarly for migration_misfit which is not related to
		 * load/util migration, don't pollute nr_balance_failed.
		 */
		if (idle != CPU_NEWLY_IDLE &&
		    env.migration_type != migrate_misfit)
			sd->nr_balance_failed++;

		if (need_active_balance(&env)) {
			unsigned long flags;

			raw_spin_rq_lock_irqsave(busiest, flags);

			/*
			 * Don't kick the active_load_balance_cpu_stop,
			 * if the curr task on busiest CPU can't be
			 * moved to this_cpu:
			 */
			if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
				raw_spin_rq_unlock_irqrestore(busiest, flags);
				goto out_one_pinned;
			}

			/* Record that we found at least one task that could run on this_cpu */
			env.flags &= ~LBF_ALL_PINNED;

			/*
			 * ->active_balance synchronizes accesses to
			 * ->active_balance_work.  Once set, it's cleared
			 * only after active load balance is finished.
			 */
			if (!busiest->active_balance) {
				busiest->active_balance = 1;
				busiest->push_cpu = this_cpu;
				active_balance = 1;
			}

			/*
			 * Preemption is disabled before the rq lock is
			 * dropped and stays off until the stopper work has
			 * been queued.
			 */
			preempt_disable();
			raw_spin_rq_unlock_irqrestore(busiest, flags);
			if (active_balance) {
				stop_one_cpu_nowait(cpu_of(busiest),
					active_load_balance_cpu_stop, busiest,
					&busiest->active_balance_work);
			}
			preempt_enable();
		}
	} else {
		sd->nr_balance_failed = 0;
	}

	if (likely(!active_balance) || need_active_balance(&env)) {
		/* We were unbalanced, so reset the balancing interval */
		sd->balance_interval = sd->min_interval;
	}

	goto out;

out_balanced:
	/*
	 * We reach balance although we may have faced some affinity
	 * constraints. Clear the imbalance flag only if other tasks got
	 * a chance to move and fix the imbalance.
	 */
	if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
		int *group_imbalance = &sd_parent->groups->sgc->imbalance;

		if (*group_imbalance)
			*group_imbalance = 0;
	}

out_all_pinned:
	/*
	 * We reach balance because all tasks are pinned at this level so
	 * we can't migrate them. Let the imbalance flag set so parent level
	 * can try to migrate them.
	 */
	schedstat_inc(sd->lb_balanced[idle]);

	sd->nr_balance_failed = 0;

out_one_pinned:
	/* Every exit through these labels reports that nothing was moved. */
	ld_moved = 0;

	/*
	 * sched_balance_newidle() disregards balance intervals, so we could
	 * repeatedly reach this code, which would lead to balance_interval
	 * skyrocketing in a short amount of time. Skip the balance_interval
	 * increase logic to avoid that.
	 *
	 * Similarly misfit migration which is not necessarily an indication of
	 * the system being busy and requires lb to backoff to let it settle
	 * down.
	 */
	if (env.idle == CPU_NEWLY_IDLE ||
	    env.migration_type == migrate_misfit)
		goto out;

	/* tune up the balancing interval */
	if ((env.flags & LBF_ALL_PINNED &&
	     sd->balance_interval < MAX_PINNED_INTERVAL) ||
	    sd->balance_interval < sd->max_interval)
		sd->balance_interval *= 2;
out:
	/* Drop the serialization flag if this pass took it. */
	if (need_unlock)
		atomic_set_release(&sched_balance_running, 0);

	return ld_moved;
}
12151
12152 static inline unsigned long
get_sd_balance_interval(struct sched_domain * sd,int cpu_busy)12153 get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
12154 {
12155 unsigned long interval = sd->balance_interval;
12156
12157 if (cpu_busy)
12158 interval *= sd->busy_factor;
12159
12160 /* scale ms to jiffies */
12161 interval = msecs_to_jiffies(interval);
12162
12163 /*
12164 * Reduce likelihood of busy balancing at higher domains racing with
12165 * balancing at lower domains by preventing their balancing periods
12166 * from being multiples of each other.
12167 */
12168 if (cpu_busy)
12169 interval -= 1;
12170
12171 interval = clamp(interval, 1UL, max_load_balance_interval);
12172
12173 return interval;
12174 }
12175
12176 static inline void
update_next_balance(struct sched_domain * sd,unsigned long * next_balance)12177 update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
12178 {
12179 unsigned long interval, next;
12180
12181 /* used by idle balance, so cpu_busy = 0 */
12182 interval = get_sd_balance_interval(sd, 0);
12183 next = sd->last_balance + interval;
12184
12185 if (time_after(*next_balance, next))
12186 *next_balance = next;
12187 }
12188
/*
 * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
 * running tasks off the busiest CPU onto idle CPUs. It requires at
 * least 1 task to be running on each physical CPU where possible, and
 * avoids physical / logical imbalances.
 *
 * @data: the busiest runqueue (struct rq *), as passed to
 *        stop_one_cpu_nowait(); its ->push_cpu names the target CPU.
 *
 * Always returns 0. ->active_balance of the busiest runqueue is cleared on
 * every path, whether or not a task was moved.
 */
static int active_load_balance_cpu_stop(void *data)
{
	struct rq *busiest_rq = data;
	int busiest_cpu = cpu_of(busiest_rq);
	int target_cpu = busiest_rq->push_cpu;
	struct rq *target_rq = cpu_rq(target_cpu);
	struct sched_domain *sd;
	/* Task detached from busiest_rq, if any; attached after unlocking. */
	struct task_struct *p = NULL;
	struct rq_flags rf;

	rq_lock_irq(busiest_rq, &rf);
	/*
	 * Between queueing the stop-work and running it is a hole in which
	 * CPUs can become inactive. We should not move tasks from or to
	 * inactive CPUs.
	 */
	if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
		goto out_unlock;

	/* Make sure the requested CPU hasn't gone down in the meantime: */
	if (unlikely(busiest_cpu != smp_processor_id() ||
		     !busiest_rq->active_balance))
		goto out_unlock;

	/* Is there any task to move? */
	if (busiest_rq->nr_running <= 1)
		goto out_unlock;

	/*
	 * This condition is "impossible", if it occurs
	 * we need to fix it. Originally reported by
	 * Bjorn Helgaas on a 128-CPU setup.
	 */
	WARN_ON_ONCE(busiest_rq == target_rq);

	/* Search for an sd spanning us and the target CPU. */
	rcu_read_lock();
	for_each_domain(target_cpu, sd) {
		if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
			break;
	}

	if (likely(sd)) {
		struct lb_env env = {
			.sd		= sd,
			.dst_cpu	= target_cpu,
			.dst_rq		= target_rq,
			.src_cpu	= busiest_rq->cpu,
			.src_rq		= busiest_rq,
			.idle		= CPU_IDLE,
			.flags		= LBF_ACTIVE_LB,
		};

		schedstat_inc(sd->alb_count);
		update_rq_clock(busiest_rq);

		p = detach_one_task(&env);
		if (p) {
			schedstat_inc(sd->alb_pushed);
			/* Active balancing done, reset the failure counter. */
			sd->nr_balance_failed = 0;
		} else {
			schedstat_inc(sd->alb_failed);
		}
	}
	rcu_read_unlock();
out_unlock:
	busiest_rq->active_balance = 0;
	rq_unlock(busiest_rq, &rf);

	/* The attach is done after busiest_rq's lock has been dropped. */
	if (p)
		attach_one_task(target_rq, p);

	/* Pairs with the IRQ-disabling rq_lock_irq() at the top. */
	local_irq_enable();

	return 0;
}
12272
12273 /*
12274 * Scale the max sched_balance_rq interval with the number of CPUs in the system.
12275 * This trades load-balance latency on larger machines for less cross talk.
12276 */
update_max_interval(void)12277 void update_max_interval(void)
12278 {
12279 max_load_balance_interval = HZ*num_online_cpus()/10;
12280 }
12281
update_newidle_stats(struct sched_domain * sd,unsigned int success)12282 static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success)
12283 {
12284 sd->newidle_call++;
12285 sd->newidle_success += success;
12286
12287 if (sd->newidle_call >= 1024) {
12288 sd->newidle_ratio = sd->newidle_success;
12289 sd->newidle_call /= 2;
12290 sd->newidle_success /= 2;
12291 }
12292 }
12293
12294 static inline bool
update_newidle_cost(struct sched_domain * sd,u64 cost,unsigned int success)12295 update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success)
12296 {
12297 unsigned long next_decay = sd->last_decay_max_lb_cost + HZ;
12298 unsigned long now = jiffies;
12299
12300 if (cost)
12301 update_newidle_stats(sd, success);
12302
12303 if (cost > sd->max_newidle_lb_cost) {
12304 /*
12305 * Track max cost of a domain to make sure to not delay the
12306 * next wakeup on the CPU.
12307 */
12308 sd->max_newidle_lb_cost = cost;
12309 sd->last_decay_max_lb_cost = now;
12310
12311 } else if (time_after(now, next_decay)) {
12312 /*
12313 * Decay the newidle max times by ~1% per second to ensure that
12314 * it is not outdated and the current max cost is actually
12315 * shorter.
12316 */
12317 sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256;
12318 sd->last_decay_max_lb_cost = now;
12319 return true;
12320 }
12321
12322 return false;
12323 }
12324
/*
 * It checks each scheduling domain to see if it is due to be balanced,
 * and initiates a balancing operation if so.
 *
 * Balancing parameters are set up in init_sched_domains.
 *
 * @rq:   runqueue whose domains (those of rq->cpu) are walked.
 * @idle: idle state passed on to sched_balance_rq(); re-evaluated through
 *        idle_cpu() after a pass that moved tasks.
 *
 * Besides balancing, the walk decays each domain's max newidle cost and,
 * if any domain is present, updates rq->next_balance.
 */
static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
{
	int continue_balancing = 1;
	int cpu = rq->cpu;
	int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
	unsigned long interval;
	struct sched_domain *sd;
	/* Earliest time when we have to do rebalance again */
	unsigned long next_balance = jiffies + 60*HZ;
	int update_next_balance = 0;
	int need_decay = 0;
	/* Sum of max_newidle_lb_cost over the domains visited. */
	u64 max_cost = 0;

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		/*
		 * Decay the newidle max times here because this is a regular
		 * visit to all the domains.
		 *
		 * Note: need_decay is overwritten on every iteration, so it
		 * reflects the most recently visited domain only.
		 */
		need_decay = update_newidle_cost(sd, 0, 0);
		max_cost += sd->max_newidle_lb_cost;

		/*
		 * Stop the load balance at this level. There is another
		 * CPU in our sched group which is doing load balancing more
		 * actively.
		 */
		if (!continue_balancing) {
			/* Keep walking (without balancing) while decay is pending. */
			if (need_decay)
				continue;
			break;
		}

		interval = get_sd_balance_interval(sd, busy);
		if (time_after_eq(jiffies, sd->last_balance + interval)) {
			if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) {
				/*
				 * The LBF_DST_PINNED logic could have changed
				 * env->dst_cpu, so we can't know our idle
				 * state even if we migrated tasks. Update it.
				 */
				idle = idle_cpu(cpu);
				busy = !idle && !sched_idle_cpu(cpu);
			}
			sd->last_balance = jiffies;
			/* Recompute: busy may have changed just above. */
			interval = get_sd_balance_interval(sd, busy);
		}
		if (time_after(next_balance, sd->last_balance + interval)) {
			next_balance = sd->last_balance + interval;
			update_next_balance = 1;
		}
	}
	if (need_decay) {
		/*
		 * Ensure the rq-wide value also decays but keep it at a
		 * reasonable floor to avoid funnies with rq->avg_idle.
		 */
		rq->max_idle_balance_cost =
			max((u64)sysctl_sched_migration_cost, max_cost);
	}
	rcu_read_unlock();

	/*
	 * next_balance will be updated only when there is a need.
	 * When the cpu is attached to null domain for ex, it will not be
	 * updated.
	 */
	if (likely(update_next_balance))
		rq->next_balance = next_balance;

}
12402
on_null_domain(struct rq * rq)12403 static inline int on_null_domain(struct rq *rq)
12404 {
12405 return unlikely(!rcu_dereference_sched(rq->sd));
12406 }
12407
12408 #ifdef CONFIG_NO_HZ_COMMON
12409 /*
12410 * NOHZ idle load balancing (ILB) details:
12411 *
12412 * - When one of the busy CPUs notices that there may be an idle rebalancing
12413 * needed, they will kick the idle load balancer, which then does idle
12414 * load balancing for all the idle CPUs.
12415 */
find_new_ilb(void)12416 static inline int find_new_ilb(void)
12417 {
12418 const struct cpumask *hk_mask;
12419 int ilb_cpu;
12420
12421 hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);
12422
12423 for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {
12424
12425 if (ilb_cpu == smp_processor_id())
12426 continue;
12427
12428 if (idle_cpu(ilb_cpu))
12429 return ilb_cpu;
12430 }
12431
12432 return -1;
12433 }
12434
12435 /*
12436 * Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU
12437 * SMP function call (IPI).
12438 *
12439 * We pick the first idle CPU in the HK_TYPE_KERNEL_NOISE housekeeping set
12440 * (if there is one).
12441 */
kick_ilb(unsigned int flags)12442 static void kick_ilb(unsigned int flags)
12443 {
12444 int ilb_cpu;
12445
12446 /*
12447 * Increase nohz.next_balance only when if full ilb is triggered but
12448 * not if we only update stats.
12449 */
12450 if (flags & NOHZ_BALANCE_KICK)
12451 nohz.next_balance = jiffies+1;
12452
12453 ilb_cpu = find_new_ilb();
12454 if (ilb_cpu < 0)
12455 return;
12456
12457 /*
12458 * Don't bother if no new NOHZ balance work items for ilb_cpu,
12459 * i.e. all bits in flags are already set in ilb_cpu.
12460 */
12461 if ((atomic_read(nohz_flags(ilb_cpu)) & flags) == flags)
12462 return;
12463
12464 /*
12465 * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets
12466 * the first flag owns it; cleared by nohz_csd_func().
12467 */
12468 flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
12469 if (flags & NOHZ_KICK_MASK)
12470 return;
12471
12472 /*
12473 * This way we generate an IPI on the target CPU which
12474 * is idle, and the softirq performing NOHZ idle load balancing
12475 * will be run before returning from the IPI.
12476 */
12477 smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
12478 }
12479
/*
 * Current decision point for kicking the idle load balancer in the presence
 * of idle CPUs in the system.
 *
 * @rq: runqueue of the CPU making the decision.
 *
 * Collects NOHZ_* bits in a local 'flags' and calls kick_ilb() if any ended
 * up set. Two paths return without kicking at all: rq->idle_balance being
 * set, and nohz.idle_cpus_mask being empty once a balance is due (in the
 * latter case a NOHZ_STATS_KICK chosen earlier is dropped as well).
 */
static void nohz_balancer_kick(struct rq *rq)
{
	unsigned long now = jiffies;
	struct sched_domain_shared *sds;
	struct sched_domain *sd;
	int nr_busy, i, cpu = rq->cpu;
	unsigned int flags = 0;

	if (unlikely(rq->idle_balance))
		return;

	/*
	 * We may be recently in ticked or tickless idle mode. At the first
	 * busy tick after returning from idle, we will update the busy stats.
	 */
	nohz_balance_exit_idle(rq);

	/* A blocked-load update is due: ask for a stats-only kick. */
	if (READ_ONCE(nohz.has_blocked_load) &&
	    time_after(now, READ_ONCE(nohz.next_blocked)))
		flags = NOHZ_STATS_KICK;

	/*
	 * Most of the time system is not 100% busy. i.e nohz.nr_cpus > 0
	 * Skip the read if time is not due.
	 *
	 * If none are in tickless mode, there maybe a narrow window
	 * (28 jiffies, HZ=1000) where flags maybe set and kick_ilb called.
	 * But idle load balancing is not done as find_new_ilb fails.
	 * That's very rare. So read nohz.nr_cpus only if time is due.
	 */
	if (time_before(now, nohz.next_balance))
		goto out;

	/*
	 * None are in tickless mode and hence no need for NOHZ idle load
	 * balancing
	 */
	if (unlikely(cpumask_empty(nohz.idle_cpus_mask)))
		return;

	/* More than one runnable task here: request a full balance. */
	if (rq->nr_running >= 2) {
		flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
		goto out;
	}

	rcu_read_lock();

	sd = rcu_dereference_all(rq->sd);
	if (sd) {
		/*
		 * If there's a runnable CFS task and the current CPU has reduced
		 * capacity, kick the ILB to see if there's a better CPU to run on:
		 */
		if (rq->cfs.h_nr_runnable >= 1 && check_cpu_capacity(rq, sd)) {
			flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
			goto unlock;
		}
	}

	sd = rcu_dereference_all(per_cpu(sd_asym_packing, cpu));
	if (sd) {
		/*
		 * When ASYM_PACKING; see if there's a more preferred CPU
		 * currently idle; in which case, kick the ILB to move tasks
		 * around.
		 *
		 * When balancing between cores, all the SMT siblings of the
		 * preferred CPU must be idle.
		 */
		for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
			if (sched_asym(sd, i, cpu)) {
				flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
				goto unlock;
			}
		}
	}

	sd = rcu_dereference_all(per_cpu(sd_asym_cpucapacity, cpu));
	if (sd) {
		/*
		 * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
		 * to run the misfit task on.
		 */
		if (check_misfit_status(rq)) {
			flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
			goto unlock;
		}

		/*
		 * For asymmetric systems, we do not want to nicely balance
		 * cache use, instead we want to embrace asymmetry and only
		 * ensure tasks have enough CPU capacity.
		 *
		 * Skip the LLC logic because it's not relevant in that case.
		 */
		goto unlock;
	}

	sds = rcu_dereference_all(per_cpu(sd_llc_shared, cpu));
	if (sds) {
		/*
		 * If there is an imbalance between LLC domains (IOW we could
		 * increase the overall cache utilization), we need a less-loaded LLC
		 * domain to pull some load from. Likewise, we may need to spread
		 * load within the current LLC domain (e.g. packed SMT cores but
		 * other CPUs are idle). We can't really know from here how busy
		 * the others are - so just get a NOHZ balance going if it looks
		 * like this LLC domain has tasks we could move.
		 */
		nr_busy = atomic_read(&sds->nr_busy_cpus);
		if (nr_busy > 1) {
			flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
			goto unlock;
		}
	}
unlock:
	rcu_read_unlock();
out:
	/* Independently of the above, propagate a pending needs_update. */
	if (READ_ONCE(nohz.needs_update))
		flags |= NOHZ_NEXT_KICK;

	if (flags)
		kick_ilb(flags);
}
12608
set_cpu_sd_state_busy(int cpu)12609 static void set_cpu_sd_state_busy(int cpu)
12610 {
12611 struct sched_domain *sd;
12612
12613 rcu_read_lock();
12614 sd = rcu_dereference_all(per_cpu(sd_llc, cpu));
12615
12616 if (!sd || !sd->nohz_idle)
12617 goto unlock;
12618 sd->nohz_idle = 0;
12619
12620 atomic_inc(&sd->shared->nr_busy_cpus);
12621 unlock:
12622 rcu_read_unlock();
12623 }
12624
nohz_balance_exit_idle(struct rq * rq)12625 void nohz_balance_exit_idle(struct rq *rq)
12626 {
12627 WARN_ON_ONCE(rq != this_rq());
12628
12629 if (likely(!rq->nohz_tick_stopped))
12630 return;
12631
12632 rq->nohz_tick_stopped = 0;
12633 cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
12634
12635 set_cpu_sd_state_busy(rq->cpu);
12636 }
12637
set_cpu_sd_state_idle(int cpu)12638 static void set_cpu_sd_state_idle(int cpu)
12639 {
12640 struct sched_domain *sd;
12641
12642 rcu_read_lock();
12643 sd = rcu_dereference_all(per_cpu(sd_llc, cpu));
12644
12645 if (!sd || sd->nohz_idle)
12646 goto unlock;
12647 sd->nohz_idle = 1;
12648
12649 atomic_dec(&sd->shared->nr_busy_cpus);
12650 unlock:
12651 rcu_read_unlock();
12652 }
12653
/*
 * This routine will record that the CPU is going idle with tick stopped.
 * This info will be used in performing idle load balancing in the future.
 *
 * @cpu: the CPU entering idle; expected to be the current CPU (a warning
 *       is emitted once otherwise).
 */
void nohz_balance_enter_idle(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	WARN_ON_ONCE(cpu != smp_processor_id());

	/* If this CPU is going down, then nothing needs to be done: */
	if (!cpu_active(cpu))
		return;

	/*
	 * Can be set safely without rq->lock held
	 * If a clear happens, it will have evaluated last additions because
	 * rq->lock is held during the check and the clear
	 */
	rq->has_blocked_load = 1;

	/*
	 * The tick is still stopped but load could have been added in the
	 * meantime. We set the nohz.has_blocked_load flag to trig a check of the
	 * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
	 * of nohz.has_blocked_load can only happen after checking the new load
	 */
	if (rq->nohz_tick_stopped)
		goto out;

	/* If we're a completely isolated CPU, we don't play: */
	if (on_null_domain(rq))
		return;

	rq->nohz_tick_stopped = 1;

	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);

	/*
	 * Ensures that if nohz_idle_balance() fails to observe our
	 * @idle_cpus_mask store, it must observe the @has_blocked_load
	 * and @needs_update stores.
	 */
	smp_mb__after_atomic();

	set_cpu_sd_state_idle(cpu);

	WRITE_ONCE(nohz.needs_update, 1);
out:
	/*
	 * Each time a cpu enter idle, we assume that it has blocked load and
	 * enable the periodic update of the load of idle CPUs
	 */
	WRITE_ONCE(nohz.has_blocked_load, 1);
}
12709
update_nohz_stats(struct rq * rq)12710 static bool update_nohz_stats(struct rq *rq)
12711 {
12712 unsigned int cpu = rq->cpu;
12713
12714 if (!rq->has_blocked_load)
12715 return false;
12716
12717 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
12718 return false;
12719
12720 if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
12721 return true;
12722
12723 sched_balance_update_blocked_averages(cpu);
12724
12725 return rq->has_blocked_load;
12726 }
12727
/*
 * Internal function that runs load balance for all idle CPUs. The load balance
 * can be a simple update of blocked load or a complete load balance with
 * tasks movement depending on flags.
 *
 * @this_rq: runqueue of the CPU doing the work on behalf of the idle CPUs
 * @flags:   NOHZ_*_KICK bits selecting the work: NOHZ_STATS_KICK updates the
 *           blocked load, NOHZ_BALANCE_KICK runs sched_balance_domains() for
 *           the idle CPUs, NOHZ_NEXT_KICK handles nohz.needs_update.
 */
static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
{
	/* Earliest time when we have to do rebalance again */
	unsigned long now = jiffies;
	unsigned long next_balance = now + 60*HZ;
	bool has_blocked_load = false;
	int update_next_balance = 0;
	int this_cpu = this_rq->cpu;
	int balance_cpu;
	struct rq *rq;

	/* NOHZ_BALANCE_KICK alone (within NOHZ_KICK_MASK) is not expected. */
	WARN_ON_ONCE((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);

	/*
	 * We assume there will be no idle load after this update and clear
	 * the has_blocked_load flag. If a CPU enters idle in the meantime, it
	 * will set the has_blocked_load flag and trigger another update of idle
	 * load. Because a CPU that becomes idle is added to idle_cpus_mask
	 * before setting the flag, we are sure to not clear the state and not
	 * check the load of an idle CPU.
	 *
	 * Same applies to idle_cpus_mask vs needs_update.
	 */
	if (flags & NOHZ_STATS_KICK)
		WRITE_ONCE(nohz.has_blocked_load, 0);
	if (flags & NOHZ_NEXT_KICK)
		WRITE_ONCE(nohz.needs_update, 0);

	/*
	 * Ensures that if we miss the CPU, we must see the has_blocked_load
	 * store from nohz_balance_enter_idle().
	 */
	smp_mb();

	/*
	 * Start with the next CPU after this_cpu so we will end with this_cpu
	 * and leave a chance for other idle CPUs to pull load.
	 */
	for_each_cpu_wrap(balance_cpu, nohz.idle_cpus_mask, this_cpu+1) {
		if (!idle_cpu(balance_cpu))
			continue;

		/*
		 * If this CPU gets work to do, stop the load balancing
		 * work being done for other CPUs. Next load
		 * balancing owner will pick it up.
		 *
		 * The flags cleared above are re-armed so the interrupted
		 * work is not lost.
		 */
		if (!idle_cpu(this_cpu) && need_resched()) {
			if (flags & NOHZ_STATS_KICK)
				has_blocked_load = true;
			if (flags & NOHZ_NEXT_KICK)
				WRITE_ONCE(nohz.needs_update, 1);
			goto abort;
		}

		rq = cpu_rq(balance_cpu);

		if (flags & NOHZ_STATS_KICK)
			has_blocked_load |= update_nohz_stats(rq);

		/*
		 * If time for next balance is due,
		 * do the balance.
		 *
		 * The rq clock is refreshed whenever the balance is due; the
		 * balance itself only runs with NOHZ_BALANCE_KICK.
		 */
		if (time_after_eq(jiffies, rq->next_balance)) {
			struct rq_flags rf;

			rq_lock_irqsave(rq, &rf);
			update_rq_clock(rq);
			rq_unlock_irqrestore(rq, &rf);

			if (flags & NOHZ_BALANCE_KICK)
				sched_balance_domains(rq, CPU_IDLE);
		}

		/* Track the earliest next_balance among the visited CPUs. */
		if (time_after(next_balance, rq->next_balance)) {
			next_balance = rq->next_balance;
			update_next_balance = 1;
		}
	}

	/*
	 * next_balance will be updated only when there is a need.
	 * When the CPU is attached to null domain for ex, it will not be
	 * updated.
	 */
	if (likely(update_next_balance))
		nohz.next_balance = next_balance;

	if (flags & NOHZ_STATS_KICK)
		WRITE_ONCE(nohz.next_blocked,
			   now + msecs_to_jiffies(LOAD_AVG_PERIOD));

	/*
	 * The abort path lands here, skipping the nohz.next_balance and
	 * nohz.next_blocked updates above.
	 */
abort:
	/* There is still blocked load, enable periodic update */
	if (has_blocked_load)
		WRITE_ONCE(nohz.has_blocked_load, 1);
}
12831
12832 /*
12833 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
12834 * rebalancing for all the CPUs for whom scheduler ticks are stopped.
12835 */
nohz_idle_balance(struct rq * this_rq,enum cpu_idle_type idle)12836 static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
12837 {
12838 unsigned int flags = this_rq->nohz_idle_balance;
12839
12840 if (!flags)
12841 return false;
12842
12843 this_rq->nohz_idle_balance = 0;
12844
12845 if (idle != CPU_IDLE)
12846 return false;
12847
12848 _nohz_idle_balance(this_rq, flags);
12849
12850 return true;
12851 }
12852
12853 /*
12854 * Check if we need to directly run the ILB for updating blocked load before
12855 * entering idle state. Here we run ILB directly without issuing IPIs.
12856 *
12857 * Note that when this function is called, the tick may not yet be stopped on
12858 * this CPU yet. nohz.idle_cpus_mask is updated only when tick is stopped and
12859 * cleared on the next busy tick. In other words, nohz.idle_cpus_mask updates
12860 * don't align with CPUs enter/exit idle to avoid bottlenecks due to high idle
12861 * entry/exit rate (usec). So it is possible that _nohz_idle_balance() is
12862 * called from this function on (this) CPU that's not yet in the mask. That's
12863 * OK because the goal of nohz_run_idle_balance() is to run ILB only for
12864 * updating the blocked load of already idle CPUs without waking up one of
12865 * those idle CPUs and outside the preempt disable / IRQ off phase of the local
12866 * cpu about to enter idle, because it can take a long time.
12867 */
nohz_run_idle_balance(int cpu)12868 void nohz_run_idle_balance(int cpu)
12869 {
12870 unsigned int flags;
12871
12872 flags = atomic_fetch_andnot(NOHZ_NEWILB_KICK, nohz_flags(cpu));
12873
12874 /*
12875 * Update the blocked load only if no SCHED_SOFTIRQ is about to happen
12876 * (i.e. NOHZ_STATS_KICK set) and will do the same.
12877 */
12878 if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
12879 _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK);
12880 }
12881
nohz_newidle_balance(struct rq * this_rq)12882 static void nohz_newidle_balance(struct rq *this_rq)
12883 {
12884 int this_cpu = this_rq->cpu;
12885
12886 /* Will wake up very soon. No time for doing anything else*/
12887 if (this_rq->avg_idle < sysctl_sched_migration_cost)
12888 return;
12889
12890 /* Don't need to update blocked load of idle CPUs*/
12891 if (!READ_ONCE(nohz.has_blocked_load) ||
12892 time_before(jiffies, READ_ONCE(nohz.next_blocked)))
12893 return;
12894
12895 /*
12896 * Set the need to trigger ILB in order to update blocked load
12897 * before entering idle state.
12898 */
12899 atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu));
12900 }
12901
12902 #else /* !CONFIG_NO_HZ_COMMON: */
/* !CONFIG_NO_HZ_COMMON: no NOHZ balancer to kick. */
static inline void nohz_balancer_kick(struct rq *rq)
{
}
12904
nohz_idle_balance(struct rq * this_rq,enum cpu_idle_type idle)12905 static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
12906 {
12907 return false;
12908 }
12909
/* !CONFIG_NO_HZ_COMMON: nothing to request on newidle. */
static inline void nohz_newidle_balance(struct rq *this_rq)
{
}
12911 #endif /* !CONFIG_NO_HZ_COMMON */
12912
/*
 * sched_balance_newidle is called by schedule() if this_cpu is about to become
 * idle. Attempts to pull tasks from other CPUs.
 *
 * The rq lock of @this_rq is held on entry and on return, but is dropped and
 * re-acquired while the domains are walked.
 *
 * Returns:
 *   < 0 - we released the lock and there are !fair tasks present
 *     0 - failed, no new tasks
 *   > 0 - success, new (fair) tasks present
 */
static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
	__must_hold(__rq_lockp(this_rq))
{
	/* Default: next balance at most one second (HZ jiffies) from now. */
	unsigned long next_balance = jiffies + HZ;
	int this_cpu = this_rq->cpu;
	int continue_balancing = 1;
	u64 t0, t1, curr_cost = 0;
	struct sched_domain *sd;
	int pulled_task = 0;

	update_misfit_status(NULL, this_rq);

	/*
	 * There is a task waiting to run. No need to search for one.
	 * Return 0; the task will be enqueued when switching to idle.
	 */
	if (this_rq->ttwu_pending)
		return 0;

	/*
	 * We must set idle_stamp _before_ calling sched_balance_rq()
	 * for CPU_NEWLY_IDLE, such that we measure this duration
	 * as idle time.
	 */
	this_rq->idle_stamp = rq_clock(this_rq);

	/*
	 * Do not pull tasks towards !active CPUs...
	 */
	if (!cpu_active(this_cpu))
		return 0;

	/*
	 * This is OK, because current is on_cpu, which avoids it being picked
	 * for load-balance and preemption/IRQs are still disabled avoiding
	 * further scheduler activity on it and we're being very careful to
	 * re-start the picking loop.
	 */
	rq_unpin_lock(this_rq, rf);

	sd = rcu_dereference_sched_domain(this_rq->sd);
	if (!sd)
		goto out;

	/*
	 * Skip the balance when the root domain is not overloaded or when the
	 * expected idle time is shorter than the cost of balancing the first
	 * domain; only the next balance time is refreshed.
	 */
	if (!get_rd_overloaded(this_rq->rd) ||
	    this_rq->avg_idle < sd->max_newidle_lb_cost) {

		update_next_balance(sd, &next_balance);
		goto out;
	}

	/*
	 * Include sched_balance_update_blocked_averages() in the cost
	 * calculation because it can be quite costly -- this ensures we skip
	 * it when avg_idle gets to be very low.
	 */
	t0 = sched_clock_cpu(this_cpu);
	__sched_balance_update_blocked_averages(this_rq);

	/*
	 * Paired with rq_modified_above() below, which checks for changes made
	 * while the rq lock is dropped.
	 */
	rq_modified_begin(this_rq, &fair_sched_class);
	raw_spin_rq_unlock(this_rq);

	for_each_domain(this_cpu, sd) {
		u64 domain_cost;

		update_next_balance(sd, &next_balance);

		/* Stop once the accumulated cost would exceed the idle time. */
		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
			break;

		if (sd->flags & SD_BALANCE_NEWIDLE) {
			unsigned int weight = 1;

			if (sched_feat(NI_RANDOM)) {
				/*
				 * Throw a 1k sided dice; and only run
				 * newidle_balance according to the success
				 * rate.
				 */
				u32 d1k = sched_rng() % 1024;
				weight = 1 + sd->newidle_ratio;
				if (d1k > weight) {
					update_newidle_stats(sd, 0);
					continue;
				}
				/* Rounded 1024 / weight. */
				weight = (1024 + weight/2) / weight;
			}

			pulled_task = sched_balance_rq(this_cpu, this_rq,
						       sd, CPU_NEWLY_IDLE,
						       &continue_balancing);

			t1 = sched_clock_cpu(this_cpu);
			domain_cost = t1 - t0;
			curr_cost += domain_cost;
			t0 = t1;

			/*
			 * Track max cost of a domain to make sure to not delay the
			 * next wakeup on the CPU.
			 */
			update_newidle_cost(sd, domain_cost, weight * !!pulled_task);
		}

		/*
		 * Stop searching for tasks to pull if there are
		 * now runnable tasks on this rq.
		 */
		if (pulled_task || !continue_balancing)
			break;
	}

	raw_spin_rq_lock(this_rq);

	if (curr_cost > this_rq->max_idle_balance_cost)
		this_rq->max_idle_balance_cost = curr_cost;

	/*
	 * While browsing the domains, we released the rq lock, a task could
	 * have been enqueued in the meantime. Since we're not going idle,
	 * pretend we pulled a task.
	 */
	if (this_rq->cfs.h_nr_queued && !pulled_task)
		pulled_task = 1;

	/* If a higher prio class was modified, restart the pick */
	if (rq_modified_above(this_rq, &fair_sched_class))
		pulled_task = -1;

out:
	/* Move the next balance forward */
	if (time_after(this_rq->next_balance, next_balance))
		this_rq->next_balance = next_balance;

	/*
	 * Not going idle after all: drop the idle stamp. Otherwise let the
	 * NOHZ code decide whether a blocked load update must be requested.
	 */
	if (pulled_task)
		this_rq->idle_stamp = 0;
	else
		nohz_newidle_balance(this_rq);

	rq_repin_lock(this_rq, rf);

	return pulled_task;
}
13065
13066 /*
13067 * This softirq handler is triggered via SCHED_SOFTIRQ from two places:
13068 *
13069 * - directly from the local sched_tick() for periodic load balancing
13070 *
13071 * - indirectly from a remote sched_tick() for NOHZ idle balancing
13072 * through the SMP cross-call nohz_csd_func()
13073 */
sched_balance_softirq(void)13074 static __latent_entropy void sched_balance_softirq(void)
13075 {
13076 struct rq *this_rq = this_rq();
13077 enum cpu_idle_type idle = this_rq->idle_balance;
13078 /*
13079 * If this CPU has a pending NOHZ_BALANCE_KICK, then do the
13080 * balancing on behalf of the other idle CPUs whose ticks are
13081 * stopped. Do nohz_idle_balance *before* sched_balance_domains to
13082 * give the idle CPUs a chance to load balance. Else we may
13083 * load balance only within the local sched_domain hierarchy
13084 * and abort nohz_idle_balance altogether if we pull some load.
13085 */
13086 if (nohz_idle_balance(this_rq, idle))
13087 return;
13088
13089 /* normal load balance */
13090 sched_balance_update_blocked_averages(this_rq->cpu);
13091 sched_balance_domains(this_rq, idle);
13092 }
13093
13094 /*
13095 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
13096 */
sched_balance_trigger(struct rq * rq)13097 void sched_balance_trigger(struct rq *rq)
13098 {
13099 /*
13100 * Don't need to rebalance while attached to NULL domain or
13101 * runqueue CPU is not active
13102 */
13103 if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq))))
13104 return;
13105
13106 if (time_after_eq(jiffies, rq->next_balance))
13107 raise_softirq(SCHED_SOFTIRQ);
13108
13109 nohz_balancer_kick(rq);
13110 }
13111
/*
 * Fair-class handling for @rq coming online: calls update_sysctl(), then
 * update_runtime_enabled() for @rq.
 */
static void rq_online_fair(struct rq *rq)
{
	update_sysctl();
	update_runtime_enabled(rq);
}
13118
/*
 * Fair-class handling for @rq going offline: calls update_sysctl(), then
 * unthrottles and clears the offline cfs_rqs of @rq.
 */
static void rq_offline_fair(struct rq *rq)
{
	update_sysctl();

	/* Throttled groups must stay reachable by pick_next_task. */
	unthrottle_offline_cfs_rqs(rq);

	/* Remove this rq's contribution to the group share. */
	clear_tg_offline_cfs_rqs(rq);
}
13129
13130 #ifdef CONFIG_SCHED_CORE
13131 static inline bool
__entity_slice_used(struct sched_entity * se,int min_nr_tasks)13132 __entity_slice_used(struct sched_entity *se, int min_nr_tasks)
13133 {
13134 u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
13135 u64 slice = se->slice;
13136
13137 return (rtime * min_nr_tasks > slice);
13138 }
13139
13140 #define MIN_NR_TASKS_DURING_FORCEIDLE 2
task_tick_core(struct rq * rq,struct task_struct * curr)13141 static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
13142 {
13143 if (!sched_core_enabled(rq))
13144 return;
13145
13146 /*
13147 * If runqueue has only one task which used up its slice and
13148 * if the sibling is forced idle, then trigger schedule to
13149 * give forced idle task a chance.
13150 *
13151 * sched_slice() considers only this active rq and it gets the
13152 * whole slice. But during force idle, we have siblings acting
13153 * like a single runqueue and hence we need to consider runnable
13154 * tasks on this CPU and the forced idle CPU. Ideally, we should
13155 * go through the forced idle rq, but that would be a perf hit.
13156 * We can assume that the forced idle CPU has at least
13157 * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
13158 * if we need to give up the CPU.
13159 */
13160 if (rq->core->core_forceidle_count && rq->cfs.nr_queued == 1 &&
13161 __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
13162 resched_curr(rq);
13163 }
13164
13165 /*
13166 * Consider any infeasible weight scenario. Take for instance two tasks,
13167 * each bound to their respective sibling, one with weight 1 and one with
13168 * weight 2. Then the lower weight task will run ahead of the higher weight
13169 * task without bound.
13170 *
13171 * This utterly destroys the concept of a shared time base.
13172 *
13173 * Remember; all this is about a proportionally fair scheduling, where each
13174 * tasks receives:
13175 *
 *              w_i
 *   dt_i = ---------- dt                                     (1)
 *          \Sum_j w_j
13179 *
13180 * which we do by tracking a virtual time, s_i:
13181 *
 *          1
 *   s_i = --- d[t]_i                                         (2)
 *         w_i
13185 *
13186 * Where d[t] is a delta of discrete time, while dt is an infinitesimal.
13187 * The immediate corollary is that the ideal schedule S, where (2) to use
13188 * an infinitesimal delta, is:
13189 *
 *           1
 *   S = ---------- dt                                        (3)
 *       \Sum_i w_i
13193 *
13194 * From which we can define the lag, or deviation from the ideal, as:
13195 *
13196 * lag(i) = S - s_i (4)
13197 *
13198 * And since the one and only purpose is to approximate S, we get that:
13199 *
13200 * \Sum_i w_i lag(i) := 0 (5)
13201 *
13202 * If this were not so, we no longer converge to S, and we can no longer
13203 * claim our scheduler has any of the properties we derive from S. This is
13204 * exactly what you did above, you broke it!
13205 *
13206 *
13207 * Let's continue for a while though; to see if there is anything useful to
13208 * be learned. We can combine (1)-(3) or (4)-(5) and express S in s_i:
13209 *
 *       \Sum_i w_i s_i
 *   S = --------------                                       (6)
 *         \Sum_i w_i
13213 *
13214 * Which gives us a way to compute S, given our s_i. Now, if you've read
13215 * our code, you know that we do not in fact do this, the reason for this
13216 * is two-fold. Firstly, computing S in that way requires a 64bit division
13217 * for every time we'd use it (see 12), and secondly, this only describes
13218 * the steady-state, it doesn't handle dynamics.
13219 *
13220 * Anyway, in (6): s_i -> x + (s_i - x), to get:
13221 *
 *           \Sum_i w_i (s_i - x)
 *   S - x = --------------------                             (7)
 *                \Sum_i w_i
13225 *
13226 * Which shows that S and s_i transform alike (which makes perfect sense
13227 * given that S is basically the (weighted) average of s_i).
13228 *
13229 * So the thing to remember is that the above is strictly UP. It is
13230 * possible to generalize to multiple runqueues -- however it gets really
13231 * yuck when you have to add affinity support, as illustrated by our very
13232 * first counter-example.
13233 *
13234 * Luckily I think we can avoid needing a full multi-queue variant for
13235 * core-scheduling (or load-balancing). The crucial observation is that we
13236 * only actually need this comparison in the presence of forced-idle; only
13237 * then do we need to tell if the stalled rq has higher priority over the
13238 * other.
13239 *
13240 * [XXX assumes SMT2; better consider the more general case, I suspect
13241 * it'll work out because our comparison is always between 2 rqs and the
13242 * answer is only interesting if one of them is forced-idle]
13243 *
13244 * And (under assumption of SMT2) when there is forced-idle, there is only
13245 * a single queue, so everything works like normal.
13246 *
13247 * Let, for our runqueue 'k':
13248 *
13249 * T_k = \Sum_i w_i s_i
13250 * W_k = \Sum_i w_i ; for all i of k (8)
13251 *
13252 * Then we can write (6) like:
13253 *
 *         T_k
 *   S_k = ---                                                (9)
 *         W_k
13257 *
13258 * From which immediately follows that:
13259 *
 *           T_k + T_l
 *   S_k+l = ---------                                       (10)
 *           W_k + W_l
13263 *
13264 * On which we can define a combined lag:
13265 *
13266 * lag_k+l(i) := S_k+l - s_i (11)
13267 *
13268 * And that gives us the tools to compare tasks across a combined runqueue.
13269 *
13270 *
13271 * Combined this gives the following:
13272 *
 *  a) when a runqueue enters force-idle, sync it against its sibling rq(s)
13274 * using (7); this only requires storing single 'time'-stamps.
13275 *
13276 * b) when comparing tasks between 2 runqueues of which one is forced-idle,
13277 * compare the combined lag, per (11).
13278 *
13279 * Now, of course cgroups (I so hate them) make this more interesting in
13280 * that a) seems to suggest we need to iterate all cgroup on a CPU at such
13281 * boundaries, but I think we can avoid that. The force-idle is for the
 * whole CPU, all its rqs. So we can mark it in the root and lazily
13283 * propagate downward on demand.
13284 */
13285
13286 /*
13287 * So this sync is basically a relative reset of S to 0.
13288 *
13289 * So with 2 queues, when one goes idle, we drop them both to 0 and one
13290 * then increases due to not being idle, and the idle one builds up lag to
13291 * get re-elected. So far so simple, right?
13292 *
13293 * When there's 3, we can have the situation where 2 run and one is idle,
13294 * we sync to 0 and let the idle one build up lag to get re-election. Now
13295 * suppose another one also drops idle. At this point dropping all to 0
13296 * again would destroy the built-up lag from the queue that was already
13297 * idle, not good.
13298 *
13299 * So instead of syncing everything, we can:
13300 *
13301 * less := !((s64)(s_a - s_b) <= 0)
13302 *
13303 * (v_a - S_a) - (v_b - S_b) == v_a - v_b - S_a + S_b
13304 * == v_a - (v_b - S_a + S_b)
13305 *
13306 * IOW, we can recast the (lag) comparison to a one-sided difference.
13307 * So if then, instead of syncing the whole queue, sync the idle queue
13308 * against the active queue with S_a + S_b at the point where we sync.
13309 *
13310 * (XXX consider the implication of living in a cyclic group: N / 2^n N)
13311 *
13312 * This gives us means of syncing single queues against the active queue,
 * and for already idle queues to preserve their built-up lag.
13314 *
13315 * Of course, then we get the situation where there's 2 active and one
13316 * going idle, who do we pick to sync against? Theory would have us sync
13317 * against the combined S, but as we've already demonstrated, there is no
13318 * such thing in infeasible weight scenarios.
13319 *
13320 * One thing I've considered; and this is where that core_active rudiment
13321 * came from, is having active queues sync up between themselves after
13322 * every tick. This limits the observed divergence due to the work
13323 * conservancy.
13324 *
13325 * On top of that, we can improve upon things by employing (10) here.
13326 */
13327
13328 /*
13329 * se_fi_update - Update the cfs_rq->zero_vruntime_fi in a CFS hierarchy if needed.
13330 */
se_fi_update(const struct sched_entity * se,unsigned int fi_seq,bool forceidle)13331 static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq,
13332 bool forceidle)
13333 {
13334 for_each_sched_entity(se) {
13335 struct cfs_rq *cfs_rq = cfs_rq_of(se);
13336
13337 if (forceidle) {
13338 if (cfs_rq->forceidle_seq == fi_seq)
13339 break;
13340 cfs_rq->forceidle_seq = fi_seq;
13341 }
13342
13343 cfs_rq->zero_vruntime_fi = cfs_rq->zero_vruntime;
13344 }
13345 }
13346
/*
 * Refresh the zero_vruntime_fi snapshots of @p's CFS hierarchy using the core's
 * current force-idle sequence. Tasks outside the fair class are ignored.
 */
void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi)
{
	if (p->sched_class == &fair_sched_class)
		se_fi_update(&p->se, rq->core->core_forceidle_seq, in_fi);
}
13356
/*
 * Compare fair tasks @a and @b across runqueues of the same core (a one-time
 * warning fires when their rqs do not share a core).
 *
 * Returns true when delta > 0, where delta combines the vruntime difference of
 * the two compared entities with the difference of their cfs_rqs'
 * zero_vruntime_fi snapshots.
 * NOTE(review): presumably vruntime_op(x, "-", y) yields a signed difference,
 * making delta == (a - zero_fi_a) - (b - zero_fi_b); confirm against the
 * vruntime_op() definition.
 *
 * @in_fi is passed through to se_fi_update() as its forceidle argument.
 */
bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
			bool in_fi)
{
	struct rq *rq = task_rq(a);
	const struct sched_entity *sea = &a->se;
	const struct sched_entity *seb = &b->se;
	struct cfs_rq *cfs_rqa;
	struct cfs_rq *cfs_rqb;
	s64 delta;

	WARN_ON_ONCE(task_rq(b)->core != rq->core);

#ifdef CONFIG_FAIR_GROUP_SCHED
	/*
	 * Find an se in the hierarchy for tasks a and b, such that the se's
	 * are immediate siblings.
	 *
	 * The deeper entity is moved up one level per iteration; at equal
	 * depth both are moved up.
	 */
	while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
		int sea_depth = sea->depth;
		int seb_depth = seb->depth;

		if (sea_depth >= seb_depth)
			sea = parent_entity(sea);
		if (sea_depth <= seb_depth)
			seb = parent_entity(seb);
	}

	se_fi_update(sea, rq->core->core_forceidle_seq, in_fi);
	se_fi_update(seb, rq->core->core_forceidle_seq, in_fi);

	cfs_rqa = sea->cfs_rq;
	cfs_rqb = seb->cfs_rq;
#else /* !CONFIG_FAIR_GROUP_SCHED: */
	/* Without group scheduling, compare against the root cfs_rqs. */
	cfs_rqa = &task_rq(a)->cfs;
	cfs_rqb = &task_rq(b)->cfs;
#endif /* !CONFIG_FAIR_GROUP_SCHED */

	/*
	 * Find delta after normalizing se's vruntime with its cfs_rq's
	 * zero_vruntime_fi, which would have been updated in prior calls
	 * to se_fi_update().
	 */
	delta = vruntime_op(sea->vruntime, "-", seb->vruntime) +
		vruntime_op(cfs_rqb->zero_vruntime_fi, "-", cfs_rqa->zero_vruntime_fi);

	return delta > 0;
}
13404
task_is_throttled_fair(struct task_struct * p,int cpu)13405 static int task_is_throttled_fair(struct task_struct *p, int cpu)
13406 {
13407 struct cfs_rq *cfs_rq;
13408
13409 #ifdef CONFIG_FAIR_GROUP_SCHED
13410 cfs_rq = task_group(p)->cfs_rq[cpu];
13411 #else
13412 cfs_rq = &cpu_rq(cpu)->cfs;
13413 #endif
13414 return throttled_hierarchy(cfs_rq);
13415 }
13416 #else /* !CONFIG_SCHED_CORE: */
/* !CONFIG_SCHED_CORE: no core-scheduling work on tick. */
static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
{
}
13418 #endif /* !CONFIG_SCHED_CORE */
13419
13420 /*
13421 * scheduler tick hitting a task of our scheduling class.
13422 *
13423 * NOTE: This function can be called remotely by the tick offload that
13424 * goes along full dynticks. Therefore no local assumption can be made
13425 * and everything must be accessed through the @rq and @curr passed in
13426 * parameters.
13427 */
task_tick_fair(struct rq * rq,struct task_struct * curr,int queued)13428 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
13429 {
13430 struct cfs_rq *cfs_rq;
13431 struct sched_entity *se = &curr->se;
13432
13433 for_each_sched_entity(se) {
13434 cfs_rq = cfs_rq_of(se);
13435 entity_tick(cfs_rq, se, queued);
13436 }
13437
13438 if (queued) {
13439 if (!need_resched())
13440 hrtick_start_fair(rq, curr);
13441 return;
13442 }
13443
13444 if (static_branch_unlikely(&sched_numa_balancing))
13445 task_tick_numa(rq, curr);
13446
13447 update_misfit_status(curr, rq);
13448 check_update_overutilized_status(task_rq(curr));
13449
13450 task_tick_core(rq, curr);
13451 }
13452
/*
 * Fork hook, called from the parent's context with the child task as argument:
 *  - the child is not on the tasklist yet
 *  - preemption is disabled
 */
static void task_fork_fair(struct task_struct *p)
{
	set_task_max_allowed_capacity(p);
}
13462
13463 /*
13464 * Priority of the task has changed. Check to see if we preempt
13465 * the current task.
13466 */
13467 static void
prio_changed_fair(struct rq * rq,struct task_struct * p,u64 oldprio)13468 prio_changed_fair(struct rq *rq, struct task_struct *p, u64 oldprio)
13469 {
13470 if (!task_on_rq_queued(p))
13471 return;
13472
13473 if (p->prio == oldprio)
13474 return;
13475
13476 if (rq->cfs.nr_queued == 1)
13477 return;
13478
13479 /*
13480 * Reschedule if we are currently running on this runqueue and
13481 * our priority decreased, or if we are not currently running on
13482 * this runqueue and our priority is higher than the current's
13483 */
13484 if (task_current_donor(rq, p)) {
13485 if (p->prio > oldprio)
13486 resched_curr(rq);
13487 } else {
13488 wakeup_preempt(rq, p, 0);
13489 }
13490 }
13491
13492 #ifdef CONFIG_FAIR_GROUP_SCHED
13493 /*
13494 * Propagate the changes of the sched_entity across the tg tree to make it
13495 * visible to the root
13496 */
propagate_entity_cfs_rq(struct sched_entity * se)13497 static void propagate_entity_cfs_rq(struct sched_entity *se)
13498 {
13499 struct cfs_rq *cfs_rq = cfs_rq_of(se);
13500
13501 /*
13502 * If a task gets attached to this cfs_rq and before being queued,
13503 * it gets migrated to another CPU due to reasons like affinity
13504 * change, make sure this cfs_rq stays on leaf cfs_rq list to have
13505 * that removed load decayed or it can cause faireness problem.
13506 */
13507 if (!cfs_rq_pelt_clock_throttled(cfs_rq))
13508 list_add_leaf_cfs_rq(cfs_rq);
13509
13510 /* Start to propagate at parent */
13511 se = se->parent;
13512
13513 for_each_sched_entity(se) {
13514 cfs_rq = cfs_rq_of(se);
13515
13516 update_load_avg(cfs_rq, se, UPDATE_TG);
13517
13518 if (!cfs_rq_pelt_clock_throttled(cfs_rq))
13519 list_add_leaf_cfs_rq(cfs_rq);
13520 }
13521
13522 assert_list_leaf_cfs_rq(rq_of(cfs_rq));
13523 }
13524 #else /* !CONFIG_FAIR_GROUP_SCHED: */
/* !CONFIG_FAIR_GROUP_SCHED: there is no tg tree to propagate through. */
static void propagate_entity_cfs_rq(struct sched_entity *se)
{
}
13526 #endif /* !CONFIG_FAIR_GROUP_SCHED */
13527
detach_entity_cfs_rq(struct sched_entity * se)13528 static void detach_entity_cfs_rq(struct sched_entity *se)
13529 {
13530 struct cfs_rq *cfs_rq = cfs_rq_of(se);
13531
13532 /*
13533 * In case the task sched_avg hasn't been attached:
13534 * - A forked task which hasn't been woken up by wake_up_new_task().
13535 * - A task which has been woken up by try_to_wake_up() but is
13536 * waiting for actually being woken up by sched_ttwu_pending().
13537 */
13538 if (!se->avg.last_update_time)
13539 return;
13540
13541 /* Catch up with the cfs_rq and remove our load when we leave */
13542 update_load_avg(cfs_rq, se, 0);
13543 detach_entity_load_avg(cfs_rq, se);
13544 update_tg_load_avg(cfs_rq);
13545 propagate_entity_cfs_rq(se);
13546 }
13547
attach_entity_cfs_rq(struct sched_entity * se)13548 static void attach_entity_cfs_rq(struct sched_entity *se)
13549 {
13550 struct cfs_rq *cfs_rq = cfs_rq_of(se);
13551
13552 /* Synchronize entity with its cfs_rq */
13553 update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
13554 attach_entity_load_avg(cfs_rq, se);
13555 update_tg_load_avg(cfs_rq);
13556 propagate_entity_cfs_rq(se);
13557 }
13558
detach_task_cfs_rq(struct task_struct * p)13559 static void detach_task_cfs_rq(struct task_struct *p)
13560 {
13561 struct sched_entity *se = &p->se;
13562
13563 detach_entity_cfs_rq(se);
13564 }
13565
attach_task_cfs_rq(struct task_struct * p)13566 static void attach_task_cfs_rq(struct task_struct *p)
13567 {
13568 struct sched_entity *se = &p->se;
13569
13570 attach_entity_cfs_rq(se);
13571 }
13572
switching_from_fair(struct rq * rq,struct task_struct * p)13573 static void switching_from_fair(struct rq *rq, struct task_struct *p)
13574 {
13575 if (p->se.sched_delayed)
13576 dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
13577 }
13578
/* @p has left the fair class: detach it from its cfs_rq. */
static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
	detach_task_cfs_rq(p);
}
13583
switched_to_fair(struct rq * rq,struct task_struct * p)13584 static void switched_to_fair(struct rq *rq, struct task_struct *p)
13585 {
13586 WARN_ON_ONCE(p->se.sched_delayed);
13587
13588 attach_task_cfs_rq(p);
13589
13590 set_task_max_allowed_capacity(p);
13591
13592 if (task_on_rq_queued(p)) {
13593 /*
13594 * We were most likely switched from sched_rt, so
13595 * kick off the schedule if running, otherwise just see
13596 * if we can still preempt the current task.
13597 */
13598 if (task_current_donor(rq, p))
13599 resched_curr(rq);
13600 else
13601 wakeup_preempt(rq, p, 0);
13602 }
13603 }
13604
/*
 * Task-level part of selecting @p as the next task on @rq. The MRU list update
 * always happens for a queued task; the remaining work only when @first is
 * set.
 */
static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
{
	struct sched_entity *se = &p->se;

	/*
	 * Move the next running task to the front of the list, so our
	 * cfs_tasks list becomes MRU one.
	 */
	if (task_on_rq_queued(p))
		list_move(&se->group_node, &rq->cfs_tasks);

	if (first) {
		WARN_ON_ONCE(se->sched_delayed);

		if (hrtick_enabled_fair(rq))
			hrtick_start_fair(rq, p);

		update_misfit_status(p, rq);
		sched_fair_update_stop_tick(rq, p);
	}
}
13627
13628 /*
13629 * Account for a task changing its policy or group.
13630 *
13631 * This routine is mostly called to set cfs_rq->curr field when a task
13632 * migrates between groups/classes.
13633 */
set_next_task_fair(struct rq * rq,struct task_struct * p,bool first)13634 static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
13635 {
13636 struct sched_entity *se = &p->se;
13637
13638 for_each_sched_entity(se) {
13639 struct cfs_rq *cfs_rq = cfs_rq_of(se);
13640
13641 set_next_entity(cfs_rq, se, first);
13642 /* ensure bandwidth has been allocated on our new cfs_rq */
13643 account_cfs_rq_runtime(cfs_rq, 0);
13644 }
13645
13646 __set_next_task_fair(rq, p, first);
13647 }
13648
init_cfs_rq(struct cfs_rq * cfs_rq)13649 void init_cfs_rq(struct cfs_rq *cfs_rq)
13650 {
13651 cfs_rq->tasks_timeline = RB_ROOT_CACHED;
13652 cfs_rq->zero_vruntime = (u64)(-(1LL << 20));
13653 raw_spin_lock_init(&cfs_rq->removed.lock);
13654 }
13655
13656 #ifdef CONFIG_FAIR_GROUP_SCHED
task_change_group_fair(struct task_struct * p)13657 static void task_change_group_fair(struct task_struct *p)
13658 {
13659 /*
13660 * We couldn't detach or attach a forked task which
13661 * hasn't been woken up by wake_up_new_task().
13662 */
13663 if (READ_ONCE(p->__state) == TASK_NEW)
13664 return;
13665
13666 detach_task_cfs_rq(p);
13667
13668 /* Tell se's cfs_rq has been changed -- migrated */
13669 p->se.avg.last_update_time = 0;
13670 set_task_rq(p, task_cpu(p));
13671 attach_task_cfs_rq(p);
13672 }
13673
free_fair_sched_group(struct task_group * tg)13674 void free_fair_sched_group(struct task_group *tg)
13675 {
13676 int i;
13677
13678 for_each_possible_cpu(i) {
13679 if (tg->cfs_rq)
13680 kfree(tg->cfs_rq[i]);
13681 if (tg->se)
13682 kfree(tg->se[i]);
13683 }
13684
13685 kfree(tg->cfs_rq);
13686 kfree(tg->se);
13687 }
13688
alloc_fair_sched_group(struct task_group * tg,struct task_group * parent)13689 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
13690 {
13691 struct sched_entity *se;
13692 struct cfs_rq *cfs_rq;
13693 int i;
13694
13695 tg->cfs_rq = kzalloc_objs(cfs_rq, nr_cpu_ids);
13696 if (!tg->cfs_rq)
13697 goto err;
13698 tg->se = kzalloc_objs(se, nr_cpu_ids);
13699 if (!tg->se)
13700 goto err;
13701
13702 tg->shares = NICE_0_LOAD;
13703
13704 init_cfs_bandwidth(tg_cfs_bandwidth(tg), tg_cfs_bandwidth(parent));
13705
13706 for_each_possible_cpu(i) {
13707 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
13708 GFP_KERNEL, cpu_to_node(i));
13709 if (!cfs_rq)
13710 goto err;
13711
13712 se = kzalloc_node(sizeof(struct sched_entity_stats),
13713 GFP_KERNEL, cpu_to_node(i));
13714 if (!se)
13715 goto err_free_rq;
13716
13717 init_cfs_rq(cfs_rq);
13718 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
13719 init_entity_runnable_average(se);
13720 }
13721
13722 return 1;
13723
13724 err_free_rq:
13725 kfree(cfs_rq);
13726 err:
13727 return 0;
13728 }
13729
online_fair_sched_group(struct task_group * tg)13730 void online_fair_sched_group(struct task_group *tg)
13731 {
13732 struct sched_entity *se;
13733 struct rq_flags rf;
13734 struct rq *rq;
13735 int i;
13736
13737 for_each_possible_cpu(i) {
13738 rq = cpu_rq(i);
13739 se = tg->se[i];
13740 rq_lock_irq(rq, &rf);
13741 update_rq_clock(rq);
13742 attach_entity_cfs_rq(se);
13743 sync_throttle(tg, i);
13744 rq_unlock_irq(rq, &rf);
13745 }
13746 }
13747
/*
 * Tear down the fair-scheduling state of @tg before it is freed: destroy
 * its CFS bandwidth state, then for each possible CPU dequeue a group
 * entity that is still marked sched_delayed, call remove_entity_load_avg()
 * on the entity and unlink the group's cfs_rq from the leaf list.
 */
void unregister_fair_sched_group(struct task_group *tg)
{
	int cpu;

	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));

	for_each_possible_cpu(cpu) {
		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
		struct sched_entity *se = tg->se[cpu];
		struct rq *rq = cpu_rq(cpu);

		if (se) {
			/*
			 * sched_delayed is tested once without the rq lock and
			 * again with it held before the entity is dequeued.
			 */
			if (se->sched_delayed) {
				/* Scope-based lock: released at the end of this block. */
				guard(rq_lock_irqsave)(rq);
				if (se->sched_delayed) {
					update_rq_clock(rq);
					dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
				}
				list_del_leaf_cfs_rq(cfs_rq);
			}
			remove_entity_load_avg(se);
		}

		/*
		 * Only empty task groups can be destroyed; so we can speculatively
		 * check on_list without danger of it being re-added.
		 */
		if (cfs_rq->on_list) {
			guard(rq_lock_irqsave)(rq);
			list_del_leaf_cfs_rq(cfs_rq);
		}
	}
}
13781
init_tg_cfs_entry(struct task_group * tg,struct cfs_rq * cfs_rq,struct sched_entity * se,int cpu,struct sched_entity * parent)13782 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
13783 struct sched_entity *se, int cpu,
13784 struct sched_entity *parent)
13785 {
13786 struct rq *rq = cpu_rq(cpu);
13787
13788 cfs_rq->tg = tg;
13789 cfs_rq->rq = rq;
13790 init_cfs_rq_runtime(cfs_rq);
13791
13792 tg->cfs_rq[cpu] = cfs_rq;
13793 tg->se[cpu] = se;
13794
13795 /* se could be NULL for root_task_group */
13796 if (!se)
13797 return;
13798
13799 if (!parent) {
13800 se->cfs_rq = &rq->cfs;
13801 se->depth = 0;
13802 } else {
13803 se->cfs_rq = parent->my_q;
13804 se->depth = parent->depth + 1;
13805 }
13806
13807 se->my_q = cfs_rq;
13808 /* guarantee group entities always have weight */
13809 update_load_set(&se->load, NICE_0_LOAD);
13810 se->parent = parent;
13811 }
13812
/*
 * Serializes changes to a task group's shares and idle state: taken by
 * sched_group_set_shares() and sched_group_set_idle(), and asserted held
 * in __sched_group_set_shares().
 */
static DEFINE_MUTEX(shares_mutex);
13814
__sched_group_set_shares(struct task_group * tg,unsigned long shares)13815 static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)
13816 {
13817 int i;
13818
13819 lockdep_assert_held(&shares_mutex);
13820
13821 /*
13822 * We can't change the weight of the root cgroup.
13823 */
13824 if (!tg->se[0])
13825 return -EINVAL;
13826
13827 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
13828
13829 if (tg->shares == shares)
13830 return 0;
13831
13832 tg->shares = shares;
13833 for_each_possible_cpu(i) {
13834 struct rq *rq = cpu_rq(i);
13835 struct sched_entity *se = tg->se[i];
13836 struct rq_flags rf;
13837
13838 /* Propagate contribution to hierarchy */
13839 rq_lock_irqsave(rq, &rf);
13840 update_rq_clock(rq);
13841 for_each_sched_entity(se) {
13842 update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
13843 update_cfs_group(se);
13844 }
13845 rq_unlock_irqrestore(rq, &rf);
13846 }
13847
13848 return 0;
13849 }
13850
sched_group_set_shares(struct task_group * tg,unsigned long shares)13851 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
13852 {
13853 int ret;
13854
13855 mutex_lock(&shares_mutex);
13856 if (tg_is_idle(tg))
13857 ret = -EINVAL;
13858 else
13859 ret = __sched_group_set_shares(tg, shares);
13860 mutex_unlock(&shares_mutex);
13861
13862 return ret;
13863 }
13864
sched_group_set_idle(struct task_group * tg,long idle)13865 int sched_group_set_idle(struct task_group *tg, long idle)
13866 {
13867 int i;
13868
13869 if (tg == &root_task_group)
13870 return -EINVAL;
13871
13872 if (idle < 0 || idle > 1)
13873 return -EINVAL;
13874
13875 mutex_lock(&shares_mutex);
13876
13877 if (tg->idle == idle) {
13878 mutex_unlock(&shares_mutex);
13879 return 0;
13880 }
13881
13882 tg->idle = idle;
13883
13884 for_each_possible_cpu(i) {
13885 struct rq *rq = cpu_rq(i);
13886 struct sched_entity *se = tg->se[i];
13887 struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i];
13888 bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
13889 long idle_task_delta;
13890 struct rq_flags rf;
13891
13892 rq_lock_irqsave(rq, &rf);
13893
13894 grp_cfs_rq->idle = idle;
13895 if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
13896 goto next_cpu;
13897
13898 idle_task_delta = grp_cfs_rq->h_nr_queued -
13899 grp_cfs_rq->h_nr_idle;
13900 if (!cfs_rq_is_idle(grp_cfs_rq))
13901 idle_task_delta *= -1;
13902
13903 for_each_sched_entity(se) {
13904 struct cfs_rq *cfs_rq = cfs_rq_of(se);
13905
13906 if (!se->on_rq)
13907 break;
13908
13909 cfs_rq->h_nr_idle += idle_task_delta;
13910
13911 /* Already accounted at parent level and above. */
13912 if (cfs_rq_is_idle(cfs_rq))
13913 break;
13914 }
13915
13916 next_cpu:
13917 rq_unlock_irqrestore(rq, &rf);
13918 }
13919
13920 /* Idle groups have minimum weight. */
13921 if (tg_is_idle(tg))
13922 __sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO));
13923 else
13924 __sched_group_set_shares(tg, NICE_0_LOAD);
13925
13926 mutex_unlock(&shares_mutex);
13927 return 0;
13928 }
13929
13930 #endif /* CONFIG_FAIR_GROUP_SCHED */
13931
13932
get_rr_interval_fair(struct rq * rq,struct task_struct * task)13933 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
13934 {
13935 struct sched_entity *se = &task->se;
13936 unsigned int rr_interval = 0;
13937
13938 /*
13939 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
13940 * idle runqueue:
13941 */
13942 if (rq->cfs.load.weight)
13943 rr_interval = NS_TO_JIFFIES(se->slice);
13944
13945 return rr_interval;
13946 }
13947
/*
 * All the scheduling class methods:
 *
 * Method table of the fair class; each hook points at its *_fair
 * implementation. The task_change_group, task_is_throttled and
 * uclamp_enabled entries exist only when the matching CONFIG_* option
 * is enabled.
 */
DEFINE_SCHED_CLASS(fair) = {
	/* Queueing and yielding */
	.enqueue_task		= enqueue_task_fair,
	.dequeue_task		= dequeue_task_fair,
	.yield_task		= yield_task_fair,
	.yield_to_task		= yield_to_task_fair,

	.wakeup_preempt		= wakeup_preempt_fair,

	/* Picking and switching the running task */
	.pick_task		= pick_task_fair,
	.pick_next_task		= pick_next_task_fair,
	.put_prev_task		= put_prev_task_fair,
	.set_next_task		= set_next_task_fair,

	/* CPU selection and migration */
	.select_task_rq		= select_task_rq_fair,
	.migrate_task_rq	= migrate_task_rq_fair,

	.rq_online		= rq_online_fair,
	.rq_offline		= rq_offline_fair,

	.task_dead		= task_dead_fair,
	.set_cpus_allowed	= set_cpus_allowed_fair,

	.task_tick		= task_tick_fair,
	.task_fork		= task_fork_fair,

	/* Priority, weight and class changes */
	.reweight_task		= reweight_task_fair,
	.prio_changed		= prio_changed_fair,
	.switching_from		= switching_from_fair,
	.switched_from		= switched_from_fair,
	.switched_to		= switched_to_fair,

	.get_rr_interval	= get_rr_interval_fair,

	.update_curr		= update_curr_fair,

#ifdef CONFIG_FAIR_GROUP_SCHED
	.task_change_group	= task_change_group_fair,
#endif

#ifdef CONFIG_SCHED_CORE
	.task_is_throttled	= task_is_throttled_fair,
#endif

#ifdef CONFIG_UCLAMP_TASK
	.uclamp_enabled		= 1,
#endif
};
13998
print_cfs_stats(struct seq_file * m,int cpu)13999 void print_cfs_stats(struct seq_file *m, int cpu)
14000 {
14001 struct cfs_rq *cfs_rq, *pos;
14002
14003 rcu_read_lock();
14004 for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
14005 print_cfs_rq(m, cpu, cfs_rq);
14006 rcu_read_unlock();
14007 }
14008
14009 #ifdef CONFIG_NUMA_BALANCING
show_numa_stats(struct task_struct * p,struct seq_file * m)14010 void show_numa_stats(struct task_struct *p, struct seq_file *m)
14011 {
14012 int node;
14013 unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
14014 struct numa_group *ng;
14015
14016 rcu_read_lock();
14017 ng = rcu_dereference_all(p->numa_group);
14018 for_each_online_node(node) {
14019 if (p->numa_faults) {
14020 tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
14021 tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
14022 }
14023 if (ng) {
14024 gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)],
14025 gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
14026 }
14027 print_numa_stats(m, node, tsf, tpf, gsf, gpf);
14028 }
14029 rcu_read_unlock();
14030 }
14031 #endif /* CONFIG_NUMA_BALANCING */
14032
init_sched_fair_class(void)14033 __init void init_sched_fair_class(void)
14034 {
14035 int i;
14036
14037 for_each_possible_cpu(i) {
14038 zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i));
14039 zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i));
14040 zalloc_cpumask_var_node(&per_cpu(should_we_balance_tmpmask, i),
14041 GFP_KERNEL, cpu_to_node(i));
14042
14043 #ifdef CONFIG_CFS_BANDWIDTH
14044 INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i));
14045 INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list);
14046 #endif
14047 }
14048
14049 open_softirq(SCHED_SOFTIRQ, sched_balance_softirq);
14050
14051 #ifdef CONFIG_NO_HZ_COMMON
14052 nohz.next_balance = jiffies;
14053 nohz.next_blocked = jiffies;
14054 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
14055 #endif
14056 }
14057