whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

fair.c (291134B)


      1 // SPDX-License-Identifier: GPL-2.0
      2 /*
      3  * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
      4  *
      5  *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
      6  *
      7  *  Interactivity improvements by Mike Galbraith
      8  *  (C) 2007 Mike Galbraith <efault@gmx.de>
      9  *
     10  *  Various enhancements by Dmitry Adamushko.
     11  *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
     12  *
     13  *  Group scheduling enhancements by Srivatsa Vaddagiri
     14  *  Copyright IBM Corporation, 2007
     15  *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
     16  *
     17  *  Scaled math optimizations by Thomas Gleixner
     18  *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
     19  *
     20  *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
     21  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
     22  */
     23 #include "sched.h"
     24 
     25 #include <trace/events/sched.h>
     26 
     27 /*
     28  * Targeted preemption latency for CPU-bound tasks:
     29  *
     30  * NOTE: this latency value is not the same as the concept of
     31  * 'timeslice length' - timeslices in CFS are of variable length
     32  * and have no persistent notion like in traditional, time-slice
     33  * based scheduling concepts.
     34  *
     35  * (to see the precise effective timeslice length of your workload,
     36  *  run vmstat and monitor the context-switches (cs) field)
     37  *
     38  * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
     39  */
     40 unsigned int sysctl_sched_latency			= 6000000ULL;
     41 static unsigned int normalized_sysctl_sched_latency	= 6000000ULL;
     42 
     43 /*
     44  * The initial- and re-scaling of tunables is configurable
     45  *
     46  * Options are:
     47  *
     48  *   SCHED_TUNABLESCALING_NONE - unscaled, always *1
     49  *   SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
     50  *   SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
     51  *
     52  * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
     53  */
     54 enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
     55 
     56 /*
     57  * Minimal preemption granularity for CPU-bound tasks:
     58  *
     59  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
     60  */
     61 unsigned int sysctl_sched_min_granularity			= 750000ULL;
     62 static unsigned int normalized_sysctl_sched_min_granularity	= 750000ULL;
     63 
     64 /*
     65  * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
     66  */
     67 static unsigned int sched_nr_latency = 8;
     68 
     69 /*
     70  * After fork, child runs first. If set to 0 (default) then
     71  * parent will (try to) run first.
     72  */
     73 unsigned int sysctl_sched_child_runs_first __read_mostly;
     74 
     75 /*
     76  * SCHED_OTHER wake-up granularity.
     77  *
     78  * This option delays the preemption effects of decoupled workloads
     79  * and reduces their over-scheduling. Synchronous workloads will still
     80  * have immediate wakeup/sleep latencies.
     81  *
     82  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
     83  */
     84 unsigned int sysctl_sched_wakeup_granularity			= 1000000UL;
     85 static unsigned int normalized_sysctl_sched_wakeup_granularity	= 1000000UL;
     86 
     87 const_debug unsigned int sysctl_sched_migration_cost	= 500000UL;
     88 
     89 #ifdef CONFIG_SMP
     90 /*
     91  * For asym packing, by default the lower numbered CPU has higher priority.
     92  */
     93 int __weak arch_asym_cpu_priority(int cpu)
     94 {
     95 	return -cpu;
     96 }
     97 
     98 /*
     99  * The margin used when comparing utilization with CPU capacity:
    100  * util * margin < capacity * 1024
    101  *
    102  * (default: ~20%)
    103  */
    104 static unsigned int capacity_margin			= 1280;
    105 #endif
    106 
    107 #ifdef CONFIG_CFS_BANDWIDTH
    108 /*
    109  * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
    110  * each time a cfs_rq requests quota.
    111  *
    112  * Note: in the case that the slice exceeds the runtime remaining (either due
    113  * to consumption or the quota being specified to be smaller than the slice)
    114  * we will always only issue the remaining available time.
    115  *
    116  * (default: 5 msec, units: microseconds)
    117  */
    118 unsigned int sysctl_sched_cfs_bandwidth_slice		= 5000UL;
    119 #endif
    120 
    121 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
    122 {
    123 	lw->weight += inc;
    124 	lw->inv_weight = 0;
    125 }
    126 
    127 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
    128 {
    129 	lw->weight -= dec;
    130 	lw->inv_weight = 0;
    131 }
    132 
    133 static inline void update_load_set(struct load_weight *lw, unsigned long w)
    134 {
    135 	lw->weight = w;
    136 	lw->inv_weight = 0;
    137 }
    138 
    139 /*
    140  * Increase the granularity value when there are more CPUs,
    141  * because with more CPUs the 'effective latency' as visible
    142  * to users decreases. But the relationship is not linear,
    143  * so pick a second-best guess by going with the log2 of the
    144  * number of CPUs.
    145  *
    146  * This idea comes from the SD scheduler of Con Kolivas:
    147  */
    148 static unsigned int get_update_sysctl_factor(void)
    149 {
    150 	unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
    151 	unsigned int factor;
    152 
    153 	switch (sysctl_sched_tunable_scaling) {
    154 	case SCHED_TUNABLESCALING_NONE:
    155 		factor = 1;
    156 		break;
    157 	case SCHED_TUNABLESCALING_LINEAR:
    158 		factor = cpus;
    159 		break;
    160 	case SCHED_TUNABLESCALING_LOG:
    161 	default:
    162 		factor = 1 + ilog2(cpus);
    163 		break;
    164 	}
    165 
    166 	return factor;
    167 }
    168 
    169 static void update_sysctl(void)
    170 {
    171 	unsigned int factor = get_update_sysctl_factor();
    172 
    173 #define SET_SYSCTL(name) \
    174 	(sysctl_##name = (factor) * normalized_sysctl_##name)
    175 	SET_SYSCTL(sched_min_granularity);
    176 	SET_SYSCTL(sched_latency);
    177 	SET_SYSCTL(sched_wakeup_granularity);
    178 #undef SET_SYSCTL
    179 }
    180 
    181 void sched_init_granularity(void)
    182 {
    183 	update_sysctl();
    184 }
    185 
    186 #define WMULT_CONST	(~0U)
    187 #define WMULT_SHIFT	32
    188 
    189 static void __update_inv_weight(struct load_weight *lw)
    190 {
    191 	unsigned long w;
    192 
    193 	if (likely(lw->inv_weight))
    194 		return;
    195 
    196 	w = scale_load_down(lw->weight);
    197 
    198 	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
    199 		lw->inv_weight = 1;
    200 	else if (unlikely(!w))
    201 		lw->inv_weight = WMULT_CONST;
    202 	else
    203 		lw->inv_weight = WMULT_CONST / w;
    204 }
    205 
    206 /*
    207  * delta_exec * weight / lw.weight
    208  *   OR
    209  * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
    210  *
    211  * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
    212  * we're guaranteed shift stays positive because inv_weight is guaranteed to
    213  * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
    214  *
    215  * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
    216  * weight/lw.weight <= 1, and therefore our shift will also be positive.
    217  */
    218 static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
    219 {
    220 	u64 fact = scale_load_down(weight);
    221 	int shift = WMULT_SHIFT;
    222 
    223 	__update_inv_weight(lw);
    224 
    225 	if (unlikely(fact >> 32)) {
    226 		while (fact >> 32) {
    227 			fact >>= 1;
    228 			shift--;
    229 		}
    230 	}
    231 
    232 	/* hint to use a 32x32->64 mul */
    233 	fact = (u64)(u32)fact * lw->inv_weight;
    234 
    235 	while (fact >> 32) {
    236 		fact >>= 1;
    237 		shift--;
    238 	}
    239 
    240 	return mul_u64_u32_shr(delta_exec, fact, shift);
    241 }
    242 
    243 
    244 const struct sched_class fair_sched_class;
    245 
    246 /**************************************************************
    247  * CFS operations on generic schedulable entities:
    248  */
    249 
    250 #ifdef CONFIG_FAIR_GROUP_SCHED
    251 static inline struct task_struct *task_of(struct sched_entity *se)
    252 {
    253 	SCHED_WARN_ON(!entity_is_task(se));
    254 	return container_of(se, struct task_struct, se);
    255 }
    256 
    257 /* Walk up scheduling entities hierarchy */
    258 #define for_each_sched_entity(se) \
    259 		for (; se; se = se->parent)
    260 
    261 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
    262 {
    263 	return p->se.cfs_rq;
    264 }
    265 
    266 /* runqueue on which this entity is (to be) queued */
    267 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
    268 {
    269 	return se->cfs_rq;
    270 }
    271 
    272 /* runqueue "owned" by this group */
    273 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
    274 {
    275 	return grp->my_q;
    276 }
    277 
    278 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
    279 {
    280 	struct rq *rq = rq_of(cfs_rq);
    281 	int cpu = cpu_of(rq);
    282 
    283 	if (cfs_rq->on_list)
    284 		return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
    285 
    286 	cfs_rq->on_list = 1;
    287 
    288 	/*
    289 	 * Ensure we either appear before our parent (if already
    290 	 * enqueued) or force our parent to appear after us when it is
    291 	 * enqueued. The fact that we always enqueue bottom-up
    292 	 * reduces this to two cases and a special case for the root
    293 	 * cfs_rq. Furthermore, it also means that we will always reset
    294 	 * tmp_alone_branch either when the branch is connected
    295 	 * to a tree or when we reach the top of the tree
    296 	 */
    297 	if (cfs_rq->tg->parent &&
    298 	    cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
    299 		/*
    300 		 * If parent is already on the list, we add the child
    301 		 * just before. Thanks to circular linked property of
    302 		 * the list, this means to put the child at the tail
    303 		 * of the list that starts by parent.
    304 		 */
    305 		list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
    306 			&(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
    307 		/*
    308 		 * The branch is now connected to its tree so we can
    309 		 * reset tmp_alone_branch to the beginning of the
    310 		 * list.
    311 		 */
    312 		rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
    313 		return true;
    314 	}
    315 
    316 	if (!cfs_rq->tg->parent) {
    317 		/*
    318 		 * cfs rq without parent should be put
    319 		 * at the tail of the list.
    320 		 */
    321 		list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
    322 			&rq->leaf_cfs_rq_list);
    323 		/*
    324 		 * We have reach the top of a tree so we can reset
    325 		 * tmp_alone_branch to the beginning of the list.
    326 		 */
    327 		rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
    328 		return true;
    329 	}
    330 
    331 	/*
    332 	 * The parent has not already been added so we want to
    333 	 * make sure that it will be put after us.
    334 	 * tmp_alone_branch points to the begin of the branch
    335 	 * where we will add parent.
    336 	 */
    337 	list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
    338 	/*
    339 	 * update tmp_alone_branch to points to the new begin
    340 	 * of the branch
    341 	 */
    342 	rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
    343 	return false;
    344 }
    345 
    346 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
    347 {
    348 	if (cfs_rq->on_list) {
    349 		struct rq *rq = rq_of(cfs_rq);
    350 
    351 		/*
    352 		 * With cfs_rq being unthrottled/throttled during an enqueue,
    353 		 * it can happen the tmp_alone_branch points the a leaf that
    354 		 * we finally want to del. In this case, tmp_alone_branch moves
    355 		 * to the prev element but it will point to rq->leaf_cfs_rq_list
    356 		 * at the end of the enqueue.
    357 		 */
    358 		if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
    359 			rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
    360 
    361 		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
    362 		cfs_rq->on_list = 0;
    363 	}
    364 }
    365 
    366 static inline void assert_list_leaf_cfs_rq(struct rq *rq)
    367 {
    368 	SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
    369 }
    370 
    371 /* Iterate thr' all leaf cfs_rq's on a runqueue */
    372 #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)			\
    373 	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,	\
    374 				 leaf_cfs_rq_list)
    375 
    376 /* Do the two (enqueued) entities belong to the same group ? */
    377 static inline struct cfs_rq *
    378 is_same_group(struct sched_entity *se, struct sched_entity *pse)
    379 {
    380 	if (se->cfs_rq == pse->cfs_rq)
    381 		return se->cfs_rq;
    382 
    383 	return NULL;
    384 }
    385 
    386 static inline struct sched_entity *parent_entity(struct sched_entity *se)
    387 {
    388 	return se->parent;
    389 }
    390 
    391 static void
    392 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
    393 {
    394 	int se_depth, pse_depth;
    395 
    396 	/*
    397 	 * preemption test can be made between sibling entities who are in the
    398 	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
    399 	 * both tasks until we find their ancestors who are siblings of common
    400 	 * parent.
    401 	 */
    402 
    403 	/* First walk up until both entities are at same depth */
    404 	se_depth = (*se)->depth;
    405 	pse_depth = (*pse)->depth;
    406 
    407 	while (se_depth > pse_depth) {
    408 		se_depth--;
    409 		*se = parent_entity(*se);
    410 	}
    411 
    412 	while (pse_depth > se_depth) {
    413 		pse_depth--;
    414 		*pse = parent_entity(*pse);
    415 	}
    416 
    417 	while (!is_same_group(*se, *pse)) {
    418 		*se = parent_entity(*se);
    419 		*pse = parent_entity(*pse);
    420 	}
    421 }
    422 
    423 #else	/* !CONFIG_FAIR_GROUP_SCHED */
    424 
    425 static inline struct task_struct *task_of(struct sched_entity *se)
    426 {
    427 	return container_of(se, struct task_struct, se);
    428 }
    429 
    430 #define for_each_sched_entity(se) \
    431 		for (; se; se = NULL)
    432 
    433 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
    434 {
    435 	return &task_rq(p)->cfs;
    436 }
    437 
    438 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
    439 {
    440 	struct task_struct *p = task_of(se);
    441 	struct rq *rq = task_rq(p);
    442 
    443 	return &rq->cfs;
    444 }
    445 
    446 /* runqueue "owned" by this group */
    447 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
    448 {
    449 	return NULL;
    450 }
    451 
    452 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
    453 {
    454 	return true;
    455 }
    456 
    457 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
    458 {
    459 }
    460 
    461 static inline void assert_list_leaf_cfs_rq(struct rq *rq)
    462 {
    463 }
    464 
    465 #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\
    466 		for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
    467 
    468 static inline struct sched_entity *parent_entity(struct sched_entity *se)
    469 {
    470 	return NULL;
    471 }
    472 
    473 static inline void
    474 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
    475 {
    476 }
    477 
    478 #endif	/* CONFIG_FAIR_GROUP_SCHED */
    479 
    480 static __always_inline
    481 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
    482 
    483 /**************************************************************
    484  * Scheduling class tree data structure manipulation methods:
    485  */
    486 
    487 static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
    488 {
    489 	s64 delta = (s64)(vruntime - max_vruntime);
    490 	if (delta > 0)
    491 		max_vruntime = vruntime;
    492 
    493 	return max_vruntime;
    494 }
    495 
    496 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
    497 {
    498 	s64 delta = (s64)(vruntime - min_vruntime);
    499 	if (delta < 0)
    500 		min_vruntime = vruntime;
    501 
    502 	return min_vruntime;
    503 }
    504 
    505 static inline int entity_before(struct sched_entity *a,
    506 				struct sched_entity *b)
    507 {
    508 	return (s64)(a->vruntime - b->vruntime) < 0;
    509 }
    510 
    511 static void update_min_vruntime(struct cfs_rq *cfs_rq)
    512 {
    513 	struct sched_entity *curr = cfs_rq->curr;
    514 	struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
    515 
    516 	u64 vruntime = cfs_rq->min_vruntime;
    517 
    518 	if (curr) {
    519 		if (curr->on_rq)
    520 			vruntime = curr->vruntime;
    521 		else
    522 			curr = NULL;
    523 	}
    524 
    525 	if (leftmost) { /* non-empty tree */
    526 		struct sched_entity *se;
    527 		se = rb_entry(leftmost, struct sched_entity, run_node);
    528 
    529 		if (!curr)
    530 			vruntime = se->vruntime;
    531 		else
    532 			vruntime = min_vruntime(vruntime, se->vruntime);
    533 	}
    534 
    535 	/* ensure we never gain time by being placed backwards. */
    536 	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
    537 #ifndef CONFIG_64BIT
    538 	smp_wmb();
    539 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
    540 #endif
    541 }
    542 
    543 /*
    544  * Enqueue an entity into the rb-tree:
    545  */
    546 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
    547 {
    548 	struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
    549 	struct rb_node *parent = NULL;
    550 	struct sched_entity *entry;
    551 	bool leftmost = true;
    552 
    553 	/*
    554 	 * Find the right place in the rbtree:
    555 	 */
    556 	while (*link) {
    557 		parent = *link;
    558 		entry = rb_entry(parent, struct sched_entity, run_node);
    559 		/*
    560 		 * We dont care about collisions. Nodes with
    561 		 * the same key stay together.
    562 		 */
    563 		if (entity_before(se, entry)) {
    564 			link = &parent->rb_left;
    565 		} else {
    566 			link = &parent->rb_right;
    567 			leftmost = false;
    568 		}
    569 	}
    570 
    571 	rb_link_node(&se->run_node, parent, link);
    572 	rb_insert_color_cached(&se->run_node,
    573 			       &cfs_rq->tasks_timeline, leftmost);
    574 }
    575 
    576 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
    577 {
    578 	rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
    579 }
    580 
    581 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
    582 {
    583 	struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
    584 
    585 	if (!left)
    586 		return NULL;
    587 
    588 	return rb_entry(left, struct sched_entity, run_node);
    589 }
    590 
    591 static struct sched_entity *__pick_next_entity(struct sched_entity *se)
    592 {
    593 	struct rb_node *next = rb_next(&se->run_node);
    594 
    595 	if (!next)
    596 		return NULL;
    597 
    598 	return rb_entry(next, struct sched_entity, run_node);
    599 }
    600 
    601 #ifdef CONFIG_SCHED_DEBUG
    602 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
    603 {
    604 	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
    605 
    606 	if (!last)
    607 		return NULL;
    608 
    609 	return rb_entry(last, struct sched_entity, run_node);
    610 }
    611 
    612 /**************************************************************
    613  * Scheduling class statistics methods:
    614  */
    615 
    616 int sched_proc_update_handler(struct ctl_table *table, int write,
    617 		void __user *buffer, size_t *lenp,
    618 		loff_t *ppos)
    619 {
    620 	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
    621 	unsigned int factor = get_update_sysctl_factor();
    622 
    623 	if (ret || !write)
    624 		return ret;
    625 
    626 	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
    627 					sysctl_sched_min_granularity);
    628 
    629 #define WRT_SYSCTL(name) \
    630 	(normalized_sysctl_##name = sysctl_##name / (factor))
    631 	WRT_SYSCTL(sched_min_granularity);
    632 	WRT_SYSCTL(sched_latency);
    633 	WRT_SYSCTL(sched_wakeup_granularity);
    634 #undef WRT_SYSCTL
    635 
    636 	return 0;
    637 }
    638 #endif
    639 
    640 /*
    641  * delta /= w
    642  */
    643 static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
    644 {
    645 	if (unlikely(se->load.weight != NICE_0_LOAD))
    646 		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
    647 
    648 	return delta;
    649 }
    650 
    651 /*
    652  * The idea is to set a period in which each task runs once.
    653  *
    654  * When there are too many tasks (sched_nr_latency) we have to stretch
    655  * this period because otherwise the slices get too small.
    656  *
    657  * p = (nr <= nl) ? l : l*nr/nl
    658  */
    659 static u64 __sched_period(unsigned long nr_running)
    660 {
    661 	if (unlikely(nr_running > sched_nr_latency))
    662 		return nr_running * sysctl_sched_min_granularity;
    663 	else
    664 		return sysctl_sched_latency;
    665 }
    666 
    667 /*
    668  * We calculate the wall-time slice from the period by taking a part
    669  * proportional to the weight.
    670  *
    671  * s = p*P[w/rw]
    672  */
    673 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
    674 {
    675 	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
    676 
    677 	for_each_sched_entity(se) {
    678 		struct load_weight *load;
    679 		struct load_weight lw;
    680 
    681 		cfs_rq = cfs_rq_of(se);
    682 		load = &cfs_rq->load;
    683 
    684 		if (unlikely(!se->on_rq)) {
    685 			lw = cfs_rq->load;
    686 
    687 			update_load_add(&lw, se->load.weight);
    688 			load = &lw;
    689 		}
    690 		slice = __calc_delta(slice, se->load.weight, load);
    691 	}
    692 	return slice;
    693 }
    694 
    695 /*
    696  * We calculate the vruntime slice of a to-be-inserted task.
    697  *
    698  * vs = s/w
    699  */
    700 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
    701 {
    702 	return calc_delta_fair(sched_slice(cfs_rq, se), se);
    703 }
    704 
    705 #include "pelt.h"
    706 #ifdef CONFIG_SMP
    707 
    708 static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
    709 static unsigned long task_h_load(struct task_struct *p);
    710 static unsigned long capacity_of(int cpu);
    711 
    712 /* Give new sched_entity start runnable values to heavy its load in infant time */
    713 void init_entity_runnable_average(struct sched_entity *se)
    714 {
    715 	struct sched_avg *sa = &se->avg;
    716 
    717 	memset(sa, 0, sizeof(*sa));
    718 
    719 	/*
    720 	 * Tasks are initialized with full load to be seen as heavy tasks until
    721 	 * they get a chance to stabilize to their real load level.
    722 	 * Group entities are initialized with zero load to reflect the fact that
    723 	 * nothing has been attached to the task group yet.
    724 	 */
    725 	if (entity_is_task(se))
    726 		sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
    727 
    728 	se->runnable_weight = se->load.weight;
    729 
    730 	/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
    731 }
    732 
    733 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
    734 static void attach_entity_cfs_rq(struct sched_entity *se);
    735 
    736 /*
    737  * With new tasks being created, their initial util_avgs are extrapolated
    738  * based on the cfs_rq's current util_avg:
    739  *
    740  *   util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
    741  *
    742  * However, in many cases, the above util_avg does not give a desired
    743  * value. Moreover, the sum of the util_avgs may be divergent, such
    744  * as when the series is a harmonic series.
    745  *
    746  * To solve this problem, we also cap the util_avg of successive tasks to
    747  * only 1/2 of the left utilization budget:
    748  *
    749  *   util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
    750  *
    751  * where n denotes the nth task and cpu_scale the CPU capacity.
    752  *
    753  * For example, for a CPU with 1024 of capacity, a simplest series from
    754  * the beginning would be like:
    755  *
    756  *  task  util_avg: 512, 256, 128,  64,  32,   16,    8, ...
    757  * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
    758  *
    759  * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
    760  * if util_avg > util_avg_cap.
    761  */
    762 void post_init_entity_util_avg(struct task_struct *p)
    763 {
    764 	struct sched_entity *se = &p->se;
    765 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
    766 	struct sched_avg *sa = &se->avg;
    767 	long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
    768 	long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
    769 
    770 	if (cap > 0) {
    771 		if (cfs_rq->avg.util_avg != 0) {
    772 			sa->util_avg  = cfs_rq->avg.util_avg * se->load.weight;
    773 			sa->util_avg /= (cfs_rq->avg.load_avg + 1);
    774 
    775 			if (sa->util_avg > cap)
    776 				sa->util_avg = cap;
    777 		} else {
    778 			sa->util_avg = cap;
    779 		}
    780 	}
    781 
    782 	if (p->sched_class != &fair_sched_class) {
    783 		/*
    784 		 * For !fair tasks do:
    785 		 *
    786 		update_cfs_rq_load_avg(now, cfs_rq);
    787 		attach_entity_load_avg(cfs_rq, se, 0);
    788 		switched_from_fair(rq, p);
    789 		 *
    790 		 * such that the next switched_to_fair() has the
    791 		 * expected state.
    792 		 */
    793 		se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
    794 		return;
    795 	}
    796 
    797 	attach_entity_cfs_rq(se);
    798 }
    799 
    800 #else /* !CONFIG_SMP */
    801 void init_entity_runnable_average(struct sched_entity *se)
    802 {
    803 }
    804 void post_init_entity_util_avg(struct task_struct *p)
    805 {
    806 }
    807 static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
    808 {
    809 }
    810 #endif /* CONFIG_SMP */
    811 
    812 /*
    813  * Update the current task's runtime statistics.
    814  */
    815 static void update_curr(struct cfs_rq *cfs_rq)
    816 {
    817 	struct sched_entity *curr = cfs_rq->curr;
    818 	u64 now = rq_clock_task(rq_of(cfs_rq));
    819 	u64 delta_exec;
    820 
    821 	if (unlikely(!curr))
    822 		return;
    823 
    824 	delta_exec = now - curr->exec_start;
    825 	if (unlikely((s64)delta_exec <= 0))
    826 		return;
    827 
    828 	curr->exec_start = now;
    829 
    830 	schedstat_set(curr->statistics.exec_max,
    831 		      max(delta_exec, curr->statistics.exec_max));
    832 
    833 	curr->sum_exec_runtime += delta_exec;
    834 	schedstat_add(cfs_rq->exec_clock, delta_exec);
    835 
    836 	curr->vruntime += calc_delta_fair(delta_exec, curr);
    837 	update_min_vruntime(cfs_rq);
    838 
    839 	if (entity_is_task(curr)) {
    840 		struct task_struct *curtask = task_of(curr);
    841 
    842 		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
    843 		cgroup_account_cputime(curtask, delta_exec);
    844 		account_group_exec_runtime(curtask, delta_exec);
    845 	}
    846 
    847 	account_cfs_rq_runtime(cfs_rq, delta_exec);
    848 }
    849 
    850 static void update_curr_fair(struct rq *rq)
    851 {
    852 	update_curr(cfs_rq_of(&rq->curr->se));
    853 }
    854 
    855 static inline void
    856 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
    857 {
    858 	u64 wait_start, prev_wait_start;
    859 
    860 	if (!schedstat_enabled())
    861 		return;
    862 
    863 	wait_start = rq_clock(rq_of(cfs_rq));
    864 	prev_wait_start = schedstat_val(se->statistics.wait_start);
    865 
    866 	if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
    867 	    likely(wait_start > prev_wait_start))
    868 		wait_start -= prev_wait_start;
    869 
    870 	__schedstat_set(se->statistics.wait_start, wait_start);
    871 }
    872 
    873 static inline void
    874 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
    875 {
    876 	struct task_struct *p;
    877 	u64 delta;
    878 
    879 	if (!schedstat_enabled())
    880 		return;
    881 
    882 	delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
    883 
    884 	if (entity_is_task(se)) {
    885 		p = task_of(se);
    886 		if (task_on_rq_migrating(p)) {
    887 			/*
    888 			 * Preserve migrating task's wait time so wait_start
    889 			 * time stamp can be adjusted to accumulate wait time
    890 			 * prior to migration.
    891 			 */
    892 			__schedstat_set(se->statistics.wait_start, delta);
    893 			return;
    894 		}
    895 		trace_sched_stat_wait(p, delta);
    896 	}
    897 
    898 	__schedstat_set(se->statistics.wait_max,
    899 		      max(schedstat_val(se->statistics.wait_max), delta));
    900 	__schedstat_inc(se->statistics.wait_count);
    901 	__schedstat_add(se->statistics.wait_sum, delta);
    902 	__schedstat_set(se->statistics.wait_start, 0);
    903 }
    904 
    905 static inline void
    906 update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
    907 {
    908 	struct task_struct *tsk = NULL;
    909 	u64 sleep_start, block_start;
    910 
    911 	if (!schedstat_enabled())
    912 		return;
    913 
    914 	sleep_start = schedstat_val(se->statistics.sleep_start);
    915 	block_start = schedstat_val(se->statistics.block_start);
    916 
    917 	if (entity_is_task(se))
    918 		tsk = task_of(se);
    919 
    920 	if (sleep_start) {
    921 		u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
    922 
    923 		if ((s64)delta < 0)
    924 			delta = 0;
    925 
    926 		if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
    927 			__schedstat_set(se->statistics.sleep_max, delta);
    928 
    929 		__schedstat_set(se->statistics.sleep_start, 0);
    930 		__schedstat_add(se->statistics.sum_sleep_runtime, delta);
    931 
    932 		if (tsk) {
    933 			account_scheduler_latency(tsk, delta >> 10, 1);
    934 			trace_sched_stat_sleep(tsk, delta);
    935 		}
    936 	}
    937 	if (block_start) {
    938 		u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
    939 
    940 		if ((s64)delta < 0)
    941 			delta = 0;
    942 
    943 		if (unlikely(delta > schedstat_val(se->statistics.block_max)))
    944 			__schedstat_set(se->statistics.block_max, delta);
    945 
    946 		__schedstat_set(se->statistics.block_start, 0);
    947 		__schedstat_add(se->statistics.sum_sleep_runtime, delta);
    948 
    949 		if (tsk) {
    950 			if (tsk->in_iowait) {
    951 				__schedstat_add(se->statistics.iowait_sum, delta);
    952 				__schedstat_inc(se->statistics.iowait_count);
    953 				trace_sched_stat_iowait(tsk, delta);
    954 			}
    955 
    956 			trace_sched_stat_blocked(tsk, delta);
    957 
    958 			/*
    959 			 * Blocking time is in units of nanosecs, so shift by
    960 			 * 20 to get a milliseconds-range estimation of the
    961 			 * amount of time that the task spent sleeping:
    962 			 */
    963 			if (unlikely(prof_on == SLEEP_PROFILING)) {
    964 				profile_hits(SLEEP_PROFILING,
    965 						(void *)get_wchan(tsk),
    966 						delta >> 20);
    967 			}
    968 			account_scheduler_latency(tsk, delta >> 10, 0);
    969 		}
    970 	}
    971 }
    972 
    973 /*
    974  * Task is being enqueued - update stats:
    975  */
    976 static inline void
    977 update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
    978 {
    979 	if (!schedstat_enabled())
    980 		return;
    981 
    982 	/*
    983 	 * Are we enqueueing a waiting task? (for current tasks
    984 	 * a dequeue/enqueue event is a NOP)
    985 	 */
    986 	if (se != cfs_rq->curr)
    987 		update_stats_wait_start(cfs_rq, se);
    988 
    989 	if (flags & ENQUEUE_WAKEUP)
    990 		update_stats_enqueue_sleeper(cfs_rq, se);
    991 }
    992 
    993 static inline void
    994 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
    995 {
    996 
    997 	if (!schedstat_enabled())
    998 		return;
    999 
   1000 	/*
   1001 	 * Mark the end of the wait period if dequeueing a
   1002 	 * waiting task:
   1003 	 */
   1004 	if (se != cfs_rq->curr)
   1005 		update_stats_wait_end(cfs_rq, se);
   1006 
   1007 	if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
   1008 		struct task_struct *tsk = task_of(se);
   1009 
   1010 		if (tsk->state & TASK_INTERRUPTIBLE)
   1011 			__schedstat_set(se->statistics.sleep_start,
   1012 				      rq_clock(rq_of(cfs_rq)));
   1013 		if (tsk->state & TASK_UNINTERRUPTIBLE)
   1014 			__schedstat_set(se->statistics.block_start,
   1015 				      rq_clock(rq_of(cfs_rq)));
   1016 	}
   1017 }
   1018 
   1019 /*
   1020  * We are picking a new current task - update its stats:
   1021  */
   1022 static inline void
   1023 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
   1024 {
   1025 	/*
   1026 	 * We are starting a new run period:
   1027 	 */
   1028 	se->exec_start = rq_clock_task(rq_of(cfs_rq));
   1029 }
   1030 
   1031 /**************************************************
   1032  * Scheduling class queueing methods:
   1033  */
   1034 
   1035 #ifdef CONFIG_NUMA_BALANCING
   1036 /*
   1037  * Approximate time to scan a full NUMA task in ms. The task scan period is
   1038  * calculated based on the tasks virtual memory size and
   1039  * numa_balancing_scan_size.
   1040  */
   1041 unsigned int sysctl_numa_balancing_scan_period_min = 1000;
   1042 unsigned int sysctl_numa_balancing_scan_period_max = 60000;
   1043 
   1044 /* Portion of address space to scan in MB */
   1045 unsigned int sysctl_numa_balancing_scan_size = 256;
   1046 
   1047 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
   1048 unsigned int sysctl_numa_balancing_scan_delay = 1000;
   1049 
   1050 struct numa_group {
   1051 	refcount_t refcount;
   1052 
   1053 	spinlock_t lock; /* nr_tasks, tasks */
   1054 	int nr_tasks;
   1055 	pid_t gid;
   1056 	int active_nodes;
   1057 
   1058 	struct rcu_head rcu;
   1059 	unsigned long total_faults;
   1060 	unsigned long max_faults_cpu;
   1061 	/*
   1062 	 * Faults_cpu is used to decide whether memory should move
   1063 	 * towards the CPU. As a consequence, these stats are weighted
   1064 	 * more by CPU use than by memory faults.
   1065 	 */
   1066 	unsigned long *faults_cpu;
   1067 	unsigned long faults[0];
   1068 };
   1069 
   1070 static inline unsigned long group_faults_priv(struct numa_group *ng);
   1071 static inline unsigned long group_faults_shared(struct numa_group *ng);
   1072 
   1073 static unsigned int task_nr_scan_windows(struct task_struct *p)
   1074 {
   1075 	unsigned long rss = 0;
   1076 	unsigned long nr_scan_pages;
   1077 
   1078 	/*
   1079 	 * Calculations based on RSS as non-present and empty pages are skipped
   1080 	 * by the PTE scanner and NUMA hinting faults should be trapped based
   1081 	 * on resident pages
   1082 	 */
   1083 	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
   1084 	rss = get_mm_rss(p->mm);
   1085 	if (!rss)
   1086 		rss = nr_scan_pages;
   1087 
   1088 	rss = round_up(rss, nr_scan_pages);
   1089 	return rss / nr_scan_pages;
   1090 }
   1091 
   1092 /* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
   1093 #define MAX_SCAN_WINDOW 2560
   1094 
   1095 static unsigned int task_scan_min(struct task_struct *p)
   1096 {
   1097 	unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
   1098 	unsigned int scan, floor;
   1099 	unsigned int windows = 1;
   1100 
   1101 	if (scan_size < MAX_SCAN_WINDOW)
   1102 		windows = MAX_SCAN_WINDOW / scan_size;
   1103 	floor = 1000 / windows;
   1104 
   1105 	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
   1106 	return max_t(unsigned int, floor, scan);
   1107 }
   1108 
   1109 static unsigned int task_scan_start(struct task_struct *p)
   1110 {
   1111 	unsigned long smin = task_scan_min(p);
   1112 	unsigned long period = smin;
   1113 
   1114 	/* Scale the maximum scan period with the amount of shared memory. */
   1115 	if (p->numa_group) {
   1116 		struct numa_group *ng = p->numa_group;
   1117 		unsigned long shared = group_faults_shared(ng);
   1118 		unsigned long private = group_faults_priv(ng);
   1119 
   1120 		period *= refcount_read(&ng->refcount);
   1121 		period *= shared + 1;
   1122 		period /= private + shared + 1;
   1123 	}
   1124 
   1125 	return max(smin, period);
   1126 }
   1127 
   1128 static unsigned int task_scan_max(struct task_struct *p)
   1129 {
   1130 	unsigned long smin = task_scan_min(p);
   1131 	unsigned long smax;
   1132 
   1133 	/* Watch for min being lower than max due to floor calculations */
   1134 	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
   1135 
   1136 	/* Scale the maximum scan period with the amount of shared memory. */
   1137 	if (p->numa_group) {
   1138 		struct numa_group *ng = p->numa_group;
   1139 		unsigned long shared = group_faults_shared(ng);
   1140 		unsigned long private = group_faults_priv(ng);
   1141 		unsigned long period = smax;
   1142 
   1143 		period *= refcount_read(&ng->refcount);
   1144 		period *= shared + 1;
   1145 		period /= private + shared + 1;
   1146 
   1147 		smax = max(smax, period);
   1148 	}
   1149 
   1150 	return max(smin, smax);
   1151 }
   1152 
   1153 void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
   1154 {
   1155 	int mm_users = 0;
   1156 	struct mm_struct *mm = p->mm;
   1157 
   1158 	if (mm) {
   1159 		mm_users = atomic_read(&mm->mm_users);
   1160 		if (mm_users == 1) {
   1161 			mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
   1162 			mm->numa_scan_seq = 0;
   1163 		}
   1164 	}
   1165 	p->node_stamp			= 0;
   1166 	p->numa_scan_seq		= mm ? mm->numa_scan_seq : 0;
   1167 	p->numa_scan_period		= sysctl_numa_balancing_scan_delay;
   1168 	p->numa_work.next		= &p->numa_work;
   1169 	p->numa_faults			= NULL;
   1170 	p->numa_group			= NULL;
   1171 	p->last_task_numa_placement	= 0;
   1172 	p->last_sum_exec_runtime	= 0;
   1173 
   1174 	/* New address space, reset the preferred nid */
   1175 	if (!(clone_flags & CLONE_VM)) {
   1176 		p->numa_preferred_nid = NUMA_NO_NODE;
   1177 		return;
   1178 	}
   1179 
   1180 	/*
   1181 	 * New thread, keep existing numa_preferred_nid which should be copied
   1182 	 * already by arch_dup_task_struct but stagger when scans start.
   1183 	 */
   1184 	if (mm) {
   1185 		unsigned int delay;
   1186 
   1187 		delay = min_t(unsigned int, task_scan_max(current),
   1188 			current->numa_scan_period * mm_users * NSEC_PER_MSEC);
   1189 		delay += 2 * TICK_NSEC;
   1190 		p->node_stamp = delay;
   1191 	}
   1192 }
   1193 
   1194 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
   1195 {
   1196 	rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
   1197 	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
   1198 }
   1199 
   1200 static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
   1201 {
   1202 	rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
   1203 	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
   1204 }
   1205 
   1206 /* Shared or private faults. */
   1207 #define NR_NUMA_HINT_FAULT_TYPES 2
   1208 
   1209 /* Memory and CPU locality */
   1210 #define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
   1211 
   1212 /* Averaged statistics, and temporary buffers. */
   1213 #define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
   1214 
   1215 pid_t task_numa_group_id(struct task_struct *p)
   1216 {
   1217 	return p->numa_group ? p->numa_group->gid : 0;
   1218 }
   1219 
   1220 /*
   1221  * The averaged statistics, shared & private, memory & CPU,
   1222  * occupy the first half of the array. The second half of the
   1223  * array is for current counters, which are averaged into the
   1224  * first set by task_numa_placement.
   1225  */
   1226 static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
   1227 {
   1228 	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
   1229 }
   1230 
   1231 static inline unsigned long task_faults(struct task_struct *p, int nid)
   1232 {
   1233 	if (!p->numa_faults)
   1234 		return 0;
   1235 
   1236 	return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
   1237 		p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
   1238 }
   1239 
   1240 static inline unsigned long group_faults(struct task_struct *p, int nid)
   1241 {
   1242 	if (!p->numa_group)
   1243 		return 0;
   1244 
   1245 	return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
   1246 		p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
   1247 }
   1248 
   1249 static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
   1250 {
   1251 	return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
   1252 		group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
   1253 }
   1254 
   1255 static inline unsigned long group_faults_priv(struct numa_group *ng)
   1256 {
   1257 	unsigned long faults = 0;
   1258 	int node;
   1259 
   1260 	for_each_online_node(node) {
   1261 		faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
   1262 	}
   1263 
   1264 	return faults;
   1265 }
   1266 
   1267 static inline unsigned long group_faults_shared(struct numa_group *ng)
   1268 {
   1269 	unsigned long faults = 0;
   1270 	int node;
   1271 
   1272 	for_each_online_node(node) {
   1273 		faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
   1274 	}
   1275 
   1276 	return faults;
   1277 }
   1278 
   1279 /*
   1280  * A node triggering more than 1/3 as many NUMA faults as the maximum is
   1281  * considered part of a numa group's pseudo-interleaving set. Migrations
   1282  * between these nodes are slowed down, to allow things to settle down.
   1283  */
   1284 #define ACTIVE_NODE_FRACTION 3
   1285 
   1286 static bool numa_is_active_node(int nid, struct numa_group *ng)
   1287 {
   1288 	return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
   1289 }
   1290 
   1291 /* Handle placement on systems where not all nodes are directly connected. */
   1292 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
   1293 					int maxdist, bool task)
   1294 {
   1295 	unsigned long score = 0;
   1296 	int node;
   1297 
   1298 	/*
   1299 	 * All nodes are directly connected, and the same distance
   1300 	 * from each other. No need for fancy placement algorithms.
   1301 	 */
   1302 	if (sched_numa_topology_type == NUMA_DIRECT)
   1303 		return 0;
   1304 
   1305 	/*
   1306 	 * This code is called for each node, introducing N^2 complexity,
   1307 	 * which should be ok given the number of nodes rarely exceeds 8.
   1308 	 */
   1309 	for_each_online_node(node) {
   1310 		unsigned long faults;
   1311 		int dist = node_distance(nid, node);
   1312 
   1313 		/*
   1314 		 * The furthest away nodes in the system are not interesting
   1315 		 * for placement; nid was already counted.
   1316 		 */
   1317 		if (dist == sched_max_numa_distance || node == nid)
   1318 			continue;
   1319 
   1320 		/*
   1321 		 * On systems with a backplane NUMA topology, compare groups
   1322 		 * of nodes, and move tasks towards the group with the most
   1323 		 * memory accesses. When comparing two nodes at distance
   1324 		 * "hoplimit", only nodes closer by than "hoplimit" are part
   1325 		 * of each group. Skip other nodes.
   1326 		 */
   1327 		if (sched_numa_topology_type == NUMA_BACKPLANE &&
   1328 					dist >= maxdist)
   1329 			continue;
   1330 
   1331 		/* Add up the faults from nearby nodes. */
   1332 		if (task)
   1333 			faults = task_faults(p, node);
   1334 		else
   1335 			faults = group_faults(p, node);
   1336 
   1337 		/*
   1338 		 * On systems with a glueless mesh NUMA topology, there are
   1339 		 * no fixed "groups of nodes". Instead, nodes that are not
   1340 		 * directly connected bounce traffic through intermediate
   1341 		 * nodes; a numa_group can occupy any set of nodes.
   1342 		 * The further away a node is, the less the faults count.
   1343 		 * This seems to result in good task placement.
   1344 		 */
   1345 		if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
   1346 			faults *= (sched_max_numa_distance - dist);
   1347 			faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
   1348 		}
   1349 
   1350 		score += faults;
   1351 	}
   1352 
   1353 	return score;
   1354 }
   1355 
   1356 /*
   1357  * These return the fraction of accesses done by a particular task, or
   1358  * task group, on a particular numa node.  The group weight is given a
   1359  * larger multiplier, in order to group tasks together that are almost
   1360  * evenly spread out between numa nodes.
   1361  */
   1362 static inline unsigned long task_weight(struct task_struct *p, int nid,
   1363 					int dist)
   1364 {
   1365 	unsigned long faults, total_faults;
   1366 
   1367 	if (!p->numa_faults)
   1368 		return 0;
   1369 
   1370 	total_faults = p->total_numa_faults;
   1371 
   1372 	if (!total_faults)
   1373 		return 0;
   1374 
   1375 	faults = task_faults(p, nid);
   1376 	faults += score_nearby_nodes(p, nid, dist, true);
   1377 
   1378 	return 1000 * faults / total_faults;
   1379 }
   1380 
   1381 static inline unsigned long group_weight(struct task_struct *p, int nid,
   1382 					 int dist)
   1383 {
   1384 	unsigned long faults, total_faults;
   1385 
   1386 	if (!p->numa_group)
   1387 		return 0;
   1388 
   1389 	total_faults = p->numa_group->total_faults;
   1390 
   1391 	if (!total_faults)
   1392 		return 0;
   1393 
   1394 	faults = group_faults(p, nid);
   1395 	faults += score_nearby_nodes(p, nid, dist, false);
   1396 
   1397 	return 1000 * faults / total_faults;
   1398 }
   1399 
   1400 bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
   1401 				int src_nid, int dst_cpu)
   1402 {
   1403 	struct numa_group *ng = p->numa_group;
   1404 	int dst_nid = cpu_to_node(dst_cpu);
   1405 	int last_cpupid, this_cpupid;
   1406 
   1407 	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
   1408 	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
   1409 
   1410 	/*
   1411 	 * Allow first faults or private faults to migrate immediately early in
   1412 	 * the lifetime of a task. The magic number 4 is based on waiting for
   1413 	 * two full passes of the "multi-stage node selection" test that is
   1414 	 * executed below.
   1415 	 */
   1416 	if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
   1417 	    (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
   1418 		return true;
   1419 
   1420 	/*
   1421 	 * Multi-stage node selection is used in conjunction with a periodic
   1422 	 * migration fault to build a temporal task<->page relation. By using
   1423 	 * a two-stage filter we remove short/unlikely relations.
   1424 	 *
   1425 	 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
   1426 	 * a task's usage of a particular page (n_p) per total usage of this
   1427 	 * page (n_t) (in a given time-span) to a probability.
   1428 	 *
   1429 	 * Our periodic faults will sample this probability and getting the
   1430 	 * same result twice in a row, given these samples are fully
   1431 	 * independent, is then given by P(n)^2, provided our sample period
   1432 	 * is sufficiently short compared to the usage pattern.
   1433 	 *
   1434 	 * This quadric squishes small probabilities, making it less likely we
   1435 	 * act on an unlikely task<->page relation.
   1436 	 */
   1437 	if (!cpupid_pid_unset(last_cpupid) &&
   1438 				cpupid_to_nid(last_cpupid) != dst_nid)
   1439 		return false;
   1440 
   1441 	/* Always allow migrate on private faults */
   1442 	if (cpupid_match_pid(p, last_cpupid))
   1443 		return true;
   1444 
   1445 	/* A shared fault, but p->numa_group has not been set up yet. */
   1446 	if (!ng)
   1447 		return true;
   1448 
   1449 	/*
   1450 	 * Destination node is much more heavily used than the source
   1451 	 * node? Allow migration.
   1452 	 */
   1453 	if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
   1454 					ACTIVE_NODE_FRACTION)
   1455 		return true;
   1456 
   1457 	/*
   1458 	 * Distribute memory according to CPU & memory use on each node,
   1459 	 * with 3/4 hysteresis to avoid unnecessary memory migrations:
   1460 	 *
   1461 	 * faults_cpu(dst)   3   faults_cpu(src)
   1462 	 * --------------- * - > ---------------
   1463 	 * faults_mem(dst)   4   faults_mem(src)
   1464 	 */
   1465 	return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
   1466 	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
   1467 }
   1468 
   1469 static unsigned long weighted_cpuload(struct rq *rq);
   1470 static unsigned long source_load(int cpu, int type);
   1471 static unsigned long target_load(int cpu, int type);
   1472 
   1473 /* Cached statistics for all CPUs within a node */
   1474 struct numa_stats {
   1475 	unsigned long load;
   1476 
   1477 	/* Total compute capacity of CPUs on a node */
   1478 	unsigned long compute_capacity;
   1479 };
   1480 
   1481 /*
   1482  * XXX borrowed from update_sg_lb_stats
   1483  */
   1484 static void update_numa_stats(struct numa_stats *ns, int nid)
   1485 {
   1486 	int cpu;
   1487 
   1488 	memset(ns, 0, sizeof(*ns));
   1489 	for_each_cpu(cpu, cpumask_of_node(nid)) {
   1490 		struct rq *rq = cpu_rq(cpu);
   1491 
   1492 		ns->load += weighted_cpuload(rq);
   1493 		ns->compute_capacity += capacity_of(cpu);
   1494 	}
   1495 
   1496 }
   1497 
   1498 struct task_numa_env {
   1499 	struct task_struct *p;
   1500 
   1501 	int src_cpu, src_nid;
   1502 	int dst_cpu, dst_nid;
   1503 
   1504 	struct numa_stats src_stats, dst_stats;
   1505 
   1506 	int imbalance_pct;
   1507 	int dist;
   1508 
   1509 	struct task_struct *best_task;
   1510 	long best_imp;
   1511 	int best_cpu;
   1512 };
   1513 
   1514 static void task_numa_assign(struct task_numa_env *env,
   1515 			     struct task_struct *p, long imp)
   1516 {
   1517 	struct rq *rq = cpu_rq(env->dst_cpu);
   1518 
   1519 	/* Bail out if run-queue part of active NUMA balance. */
   1520 	if (xchg(&rq->numa_migrate_on, 1))
   1521 		return;
   1522 
   1523 	/*
   1524 	 * Clear previous best_cpu/rq numa-migrate flag, since task now
   1525 	 * found a better CPU to move/swap.
   1526 	 */
   1527 	if (env->best_cpu != -1) {
   1528 		rq = cpu_rq(env->best_cpu);
   1529 		WRITE_ONCE(rq->numa_migrate_on, 0);
   1530 	}
   1531 
   1532 	if (env->best_task)
   1533 		put_task_struct(env->best_task);
   1534 	if (p)
   1535 		get_task_struct(p);
   1536 
   1537 	env->best_task = p;
   1538 	env->best_imp = imp;
   1539 	env->best_cpu = env->dst_cpu;
   1540 }
   1541 
   1542 static bool load_too_imbalanced(long src_load, long dst_load,
   1543 				struct task_numa_env *env)
   1544 {
   1545 	long imb, old_imb;
   1546 	long orig_src_load, orig_dst_load;
   1547 	long src_capacity, dst_capacity;
   1548 
   1549 	/*
   1550 	 * The load is corrected for the CPU capacity available on each node.
   1551 	 *
   1552 	 * src_load        dst_load
   1553 	 * ------------ vs ---------
   1554 	 * src_capacity    dst_capacity
   1555 	 */
   1556 	src_capacity = env->src_stats.compute_capacity;
   1557 	dst_capacity = env->dst_stats.compute_capacity;
   1558 
   1559 	imb = abs(dst_load * src_capacity - src_load * dst_capacity);
   1560 
   1561 	orig_src_load = env->src_stats.load;
   1562 	orig_dst_load = env->dst_stats.load;
   1563 
   1564 	old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
   1565 
   1566 	/* Would this change make things worse? */
   1567 	return (imb > old_imb);
   1568 }
   1569 
   1570 /*
   1571  * Maximum NUMA importance can be 1998 (2*999);
   1572  * SMALLIMP @ 30 would be close to 1998/64.
   1573  * Used to deter task migration.
   1574  */
   1575 #define SMALLIMP	30
   1576 
   1577 /*
   1578  * This checks if the overall compute and NUMA accesses of the system would
   1579  * be improved if the source tasks was migrated to the target dst_cpu taking
   1580  * into account that it might be best if task running on the dst_cpu should
   1581  * be exchanged with the source task
   1582  */
   1583 static void task_numa_compare(struct task_numa_env *env,
   1584 			      long taskimp, long groupimp, bool maymove)
   1585 {
   1586 	struct rq *dst_rq = cpu_rq(env->dst_cpu);
   1587 	struct task_struct *cur;
   1588 	long src_load, dst_load;
   1589 	long load;
   1590 	long imp = env->p->numa_group ? groupimp : taskimp;
   1591 	long moveimp = imp;
   1592 	int dist = env->dist;
   1593 
   1594 	if (READ_ONCE(dst_rq->numa_migrate_on))
   1595 		return;
   1596 
   1597 	rcu_read_lock();
   1598 	cur = task_rcu_dereference(&dst_rq->curr);
   1599 	if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
   1600 		cur = NULL;
   1601 
   1602 	/*
   1603 	 * Because we have preemption enabled we can get migrated around and
   1604 	 * end try selecting ourselves (current == env->p) as a swap candidate.
   1605 	 */
   1606 	if (cur == env->p)
   1607 		goto unlock;
   1608 
   1609 	if (!cur) {
   1610 		if (maymove && moveimp >= env->best_imp)
   1611 			goto assign;
   1612 		else
   1613 			goto unlock;
   1614 	}
   1615 
   1616 	/*
   1617 	 * "imp" is the fault differential for the source task between the
   1618 	 * source and destination node. Calculate the total differential for
   1619 	 * the source task and potential destination task. The more negative
   1620 	 * the value is, the more remote accesses that would be expected to
   1621 	 * be incurred if the tasks were swapped.
   1622 	 */
   1623 	/* Skip this swap candidate if cannot move to the source cpu */
   1624 	if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
   1625 		goto unlock;
   1626 
   1627 	/*
   1628 	 * If dst and source tasks are in the same NUMA group, or not
   1629 	 * in any group then look only at task weights.
   1630 	 */
   1631 	if (cur->numa_group == env->p->numa_group) {
   1632 		imp = taskimp + task_weight(cur, env->src_nid, dist) -
   1633 		      task_weight(cur, env->dst_nid, dist);
   1634 		/*
   1635 		 * Add some hysteresis to prevent swapping the
   1636 		 * tasks within a group over tiny differences.
   1637 		 */
   1638 		if (cur->numa_group)
   1639 			imp -= imp / 16;
   1640 	} else {
   1641 		/*
   1642 		 * Compare the group weights. If a task is all by itself
   1643 		 * (not part of a group), use the task weight instead.
   1644 		 */
   1645 		if (cur->numa_group && env->p->numa_group)
   1646 			imp += group_weight(cur, env->src_nid, dist) -
   1647 			       group_weight(cur, env->dst_nid, dist);
   1648 		else
   1649 			imp += task_weight(cur, env->src_nid, dist) -
   1650 			       task_weight(cur, env->dst_nid, dist);
   1651 	}
   1652 
   1653 	if (maymove && moveimp > imp && moveimp > env->best_imp) {
   1654 		imp = moveimp;
   1655 		cur = NULL;
   1656 		goto assign;
   1657 	}
   1658 
   1659 	/*
   1660 	 * If the NUMA importance is less than SMALLIMP,
   1661 	 * task migration might only result in ping pong
   1662 	 * of tasks and also hurt performance due to cache
   1663 	 * misses.
   1664 	 */
   1665 	if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
   1666 		goto unlock;
   1667 
   1668 	/*
   1669 	 * In the overloaded case, try and keep the load balanced.
   1670 	 */
   1671 	load = task_h_load(env->p) - task_h_load(cur);
   1672 	if (!load)
   1673 		goto assign;
   1674 
   1675 	dst_load = env->dst_stats.load + load;
   1676 	src_load = env->src_stats.load - load;
   1677 
   1678 	if (load_too_imbalanced(src_load, dst_load, env))
   1679 		goto unlock;
   1680 
   1681 assign:
   1682 	/*
   1683 	 * One idle CPU per node is evaluated for a task numa move.
   1684 	 * Call select_idle_sibling to maybe find a better one.
   1685 	 */
   1686 	if (!cur) {
   1687 		/*
   1688 		 * select_idle_siblings() uses an per-CPU cpumask that
   1689 		 * can be used from IRQ context.
   1690 		 */
   1691 		local_irq_disable();
   1692 		env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
   1693 						   env->dst_cpu);
   1694 		local_irq_enable();
   1695 	}
   1696 
   1697 	task_numa_assign(env, cur, imp);
   1698 unlock:
   1699 	rcu_read_unlock();
   1700 }
   1701 
   1702 static void task_numa_find_cpu(struct task_numa_env *env,
   1703 				long taskimp, long groupimp)
   1704 {
   1705 	long src_load, dst_load, load;
   1706 	bool maymove = false;
   1707 	int cpu;
   1708 
   1709 	load = task_h_load(env->p);
   1710 	dst_load = env->dst_stats.load + load;
   1711 	src_load = env->src_stats.load - load;
   1712 
   1713 	/*
   1714 	 * If the improvement from just moving env->p direction is better
   1715 	 * than swapping tasks around, check if a move is possible.
   1716 	 */
   1717 	maymove = !load_too_imbalanced(src_load, dst_load, env);
   1718 
   1719 	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
   1720 		/* Skip this CPU if the source task cannot migrate */
   1721 		if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
   1722 			continue;
   1723 
   1724 		env->dst_cpu = cpu;
   1725 		task_numa_compare(env, taskimp, groupimp, maymove);
   1726 	}
   1727 }
   1728 
   1729 static int task_numa_migrate(struct task_struct *p)
   1730 {
   1731 	struct task_numa_env env = {
   1732 		.p = p,
   1733 
   1734 		.src_cpu = task_cpu(p),
   1735 		.src_nid = task_node(p),
   1736 
   1737 		.imbalance_pct = 112,
   1738 
   1739 		.best_task = NULL,
   1740 		.best_imp = 0,
   1741 		.best_cpu = -1,
   1742 	};
   1743 	struct sched_domain *sd;
   1744 	struct rq *best_rq;
   1745 	unsigned long taskweight, groupweight;
   1746 	int nid, ret, dist;
   1747 	long taskimp, groupimp;
   1748 
   1749 	/*
   1750 	 * Pick the lowest SD_NUMA domain, as that would have the smallest
   1751 	 * imbalance and would be the first to start moving tasks about.
   1752 	 *
   1753 	 * And we want to avoid any moving of tasks about, as that would create
   1754 	 * random movement of tasks -- counter the numa conditions we're trying
   1755 	 * to satisfy here.
   1756 	 */
   1757 	rcu_read_lock();
   1758 	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
   1759 	if (sd)
   1760 		env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
   1761 	rcu_read_unlock();
   1762 
   1763 	/*
   1764 	 * Cpusets can break the scheduler domain tree into smaller
   1765 	 * balance domains, some of which do not cross NUMA boundaries.
   1766 	 * Tasks that are "trapped" in such domains cannot be migrated
   1767 	 * elsewhere, so there is no point in (re)trying.
   1768 	 */
   1769 	if (unlikely(!sd)) {
   1770 		sched_setnuma(p, task_node(p));
   1771 		return -EINVAL;
   1772 	}
   1773 
   1774 	env.dst_nid = p->numa_preferred_nid;
   1775 	dist = env.dist = node_distance(env.src_nid, env.dst_nid);
   1776 	taskweight = task_weight(p, env.src_nid, dist);
   1777 	groupweight = group_weight(p, env.src_nid, dist);
   1778 	update_numa_stats(&env.src_stats, env.src_nid);
   1779 	taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
   1780 	groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
   1781 	update_numa_stats(&env.dst_stats, env.dst_nid);
   1782 
   1783 	/* Try to find a spot on the preferred nid. */
   1784 	task_numa_find_cpu(&env, taskimp, groupimp);
   1785 
   1786 	/*
   1787 	 * Look at other nodes in these cases:
   1788 	 * - there is no space available on the preferred_nid
   1789 	 * - the task is part of a numa_group that is interleaved across
   1790 	 *   multiple NUMA nodes; in order to better consolidate the group,
   1791 	 *   we need to check other locations.
   1792 	 */
   1793 	if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
   1794 		for_each_online_node(nid) {
   1795 			if (nid == env.src_nid || nid == p->numa_preferred_nid)
   1796 				continue;
   1797 
   1798 			dist = node_distance(env.src_nid, env.dst_nid);
   1799 			if (sched_numa_topology_type == NUMA_BACKPLANE &&
   1800 						dist != env.dist) {
   1801 				taskweight = task_weight(p, env.src_nid, dist);
   1802 				groupweight = group_weight(p, env.src_nid, dist);
   1803 			}
   1804 
   1805 			/* Only consider nodes where both task and groups benefit */
   1806 			taskimp = task_weight(p, nid, dist) - taskweight;
   1807 			groupimp = group_weight(p, nid, dist) - groupweight;
   1808 			if (taskimp < 0 && groupimp < 0)
   1809 				continue;
   1810 
   1811 			env.dist = dist;
   1812 			env.dst_nid = nid;
   1813 			update_numa_stats(&env.dst_stats, env.dst_nid);
   1814 			task_numa_find_cpu(&env, taskimp, groupimp);
   1815 		}
   1816 	}
   1817 
   1818 	/*
   1819 	 * If the task is part of a workload that spans multiple NUMA nodes,
   1820 	 * and is migrating into one of the workload's active nodes, remember
   1821 	 * this node as the task's preferred numa node, so the workload can
   1822 	 * settle down.
   1823 	 * A task that migrated to a second choice node will be better off
   1824 	 * trying for a better one later. Do not set the preferred node here.
   1825 	 */
   1826 	if (p->numa_group) {
   1827 		if (env.best_cpu == -1)
   1828 			nid = env.src_nid;
   1829 		else
   1830 			nid = cpu_to_node(env.best_cpu);
   1831 
   1832 		if (nid != p->numa_preferred_nid)
   1833 			sched_setnuma(p, nid);
   1834 	}
   1835 
   1836 	/* No better CPU than the current one was found. */
   1837 	if (env.best_cpu == -1)
   1838 		return -EAGAIN;
   1839 
   1840 	best_rq = cpu_rq(env.best_cpu);
   1841 	if (env.best_task == NULL) {
   1842 		ret = migrate_task_to(p, env.best_cpu);
   1843 		WRITE_ONCE(best_rq->numa_migrate_on, 0);
   1844 		if (ret != 0)
   1845 			trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
   1846 		return ret;
   1847 	}
   1848 
   1849 	ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
   1850 	WRITE_ONCE(best_rq->numa_migrate_on, 0);
   1851 
   1852 	if (ret != 0)
   1853 		trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
   1854 	put_task_struct(env.best_task);
   1855 	return ret;
   1856 }
   1857 
   1858 /* Attempt to migrate a task to a CPU on the preferred node. */
   1859 static void numa_migrate_preferred(struct task_struct *p)
   1860 {
   1861 	unsigned long interval = HZ;
   1862 
   1863 	/* This task has no NUMA fault statistics yet */
   1864 	if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
   1865 		return;
   1866 
   1867 	/* Periodically retry migrating the task to the preferred node */
   1868 	interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
   1869 	p->numa_migrate_retry = jiffies + interval;
   1870 
   1871 	/* Success if task is already running on preferred CPU */
   1872 	if (task_node(p) == p->numa_preferred_nid)
   1873 		return;
   1874 
   1875 	/* Otherwise, try migrate to a CPU on the preferred node */
   1876 	task_numa_migrate(p);
   1877 }
   1878 
   1879 /*
   1880  * Find out how many nodes on the workload is actively running on. Do this by
   1881  * tracking the nodes from which NUMA hinting faults are triggered. This can
   1882  * be different from the set of nodes where the workload's memory is currently
   1883  * located.
   1884  */
   1885 static void numa_group_count_active_nodes(struct numa_group *numa_group)
   1886 {
   1887 	unsigned long faults, max_faults = 0;
   1888 	int nid, active_nodes = 0;
   1889 
   1890 	for_each_online_node(nid) {
   1891 		faults = group_faults_cpu(numa_group, nid);
   1892 		if (faults > max_faults)
   1893 			max_faults = faults;
   1894 	}
   1895 
   1896 	for_each_online_node(nid) {
   1897 		faults = group_faults_cpu(numa_group, nid);
   1898 		if (faults * ACTIVE_NODE_FRACTION > max_faults)
   1899 			active_nodes++;
   1900 	}
   1901 
   1902 	numa_group->max_faults_cpu = max_faults;
   1903 	numa_group->active_nodes = active_nodes;
   1904 }
   1905 
   1906 /*
   1907  * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
   1908  * increments. The more local the fault statistics are, the higher the scan
   1909  * period will be for the next scan window. If local/(local+remote) ratio is
   1910  * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
   1911  * the scan period will decrease. Aim for 70% local accesses.
   1912  */
   1913 #define NUMA_PERIOD_SLOTS 10
   1914 #define NUMA_PERIOD_THRESHOLD 7
   1915 
   1916 /*
   1917  * Increase the scan period (slow down scanning) if the majority of
   1918  * our memory is already on our local node, or if the majority of
   1919  * the page accesses are shared with other processes.
   1920  * Otherwise, decrease the scan period.
   1921  */
   1922 static void update_task_scan_period(struct task_struct *p,
   1923 			unsigned long shared, unsigned long private)
   1924 {
   1925 	unsigned int period_slot;
   1926 	int lr_ratio, ps_ratio;
   1927 	int diff;
   1928 
   1929 	unsigned long remote = p->numa_faults_locality[0];
   1930 	unsigned long local = p->numa_faults_locality[1];
   1931 
   1932 	/*
   1933 	 * If there were no record hinting faults then either the task is
   1934 	 * completely idle or all activity is areas that are not of interest
   1935 	 * to automatic numa balancing. Related to that, if there were failed
   1936 	 * migration then it implies we are migrating too quickly or the local
   1937 	 * node is overloaded. In either case, scan slower
   1938 	 */
   1939 	if (local + shared == 0 || p->numa_faults_locality[2]) {
   1940 		p->numa_scan_period = min(p->numa_scan_period_max,
   1941 			p->numa_scan_period << 1);
   1942 
   1943 		p->mm->numa_next_scan = jiffies +
   1944 			msecs_to_jiffies(p->numa_scan_period);
   1945 
   1946 		return;
   1947 	}
   1948 
   1949 	/*
   1950 	 * Prepare to scale scan period relative to the current period.
   1951 	 *	 == NUMA_PERIOD_THRESHOLD scan period stays the same
   1952 	 *       <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
   1953 	 *	 >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
   1954 	 */
   1955 	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
   1956 	lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
   1957 	ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
   1958 
   1959 	if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
   1960 		/*
   1961 		 * Most memory accesses are local. There is no need to
   1962 		 * do fast NUMA scanning, since memory is already local.
   1963 		 */
   1964 		int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
   1965 		if (!slot)
   1966 			slot = 1;
   1967 		diff = slot * period_slot;
   1968 	} else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
   1969 		/*
   1970 		 * Most memory accesses are shared with other tasks.
   1971 		 * There is no point in continuing fast NUMA scanning,
   1972 		 * since other tasks may just move the memory elsewhere.
   1973 		 */
   1974 		int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
   1975 		if (!slot)
   1976 			slot = 1;
   1977 		diff = slot * period_slot;
   1978 	} else {
   1979 		/*
   1980 		 * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
   1981 		 * yet they are not on the local NUMA node. Speed up
   1982 		 * NUMA scanning to get the memory moved over.
   1983 		 */
   1984 		int ratio = max(lr_ratio, ps_ratio);
   1985 		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
   1986 	}
   1987 
   1988 	p->numa_scan_period = clamp(p->numa_scan_period + diff,
   1989 			task_scan_min(p), task_scan_max(p));
   1990 	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
   1991 }
   1992 
   1993 /*
   1994  * Get the fraction of time the task has been running since the last
   1995  * NUMA placement cycle. The scheduler keeps similar statistics, but
   1996  * decays those on a 32ms period, which is orders of magnitude off
   1997  * from the dozens-of-seconds NUMA balancing period. Use the scheduler
   1998  * stats only if the task is so new there are no NUMA statistics yet.
   1999  */
   2000 static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
   2001 {
   2002 	u64 runtime, delta, now;
   2003 	/* Use the start of this time slice to avoid calculations. */
   2004 	now = p->se.exec_start;
   2005 	runtime = p->se.sum_exec_runtime;
   2006 
   2007 	if (p->last_task_numa_placement) {
   2008 		delta = runtime - p->last_sum_exec_runtime;
   2009 		*period = now - p->last_task_numa_placement;
   2010 	} else {
   2011 		delta = p->se.avg.load_sum;
   2012 		*period = LOAD_AVG_MAX;
   2013 	}
   2014 
   2015 	p->last_sum_exec_runtime = runtime;
   2016 	p->last_task_numa_placement = now;
   2017 
   2018 	return delta;
   2019 }
   2020 
   2021 /*
   2022  * Determine the preferred nid for a task in a numa_group. This needs to
   2023  * be done in a way that produces consistent results with group_weight,
   2024  * otherwise workloads might not converge.
   2025  */
   2026 static int preferred_group_nid(struct task_struct *p, int nid)
   2027 {
   2028 	nodemask_t nodes;
   2029 	int dist;
   2030 
   2031 	/* Direct connections between all NUMA nodes. */
   2032 	if (sched_numa_topology_type == NUMA_DIRECT)
   2033 		return nid;
   2034 
   2035 	/*
   2036 	 * On a system with glueless mesh NUMA topology, group_weight
   2037 	 * scores nodes according to the number of NUMA hinting faults on
   2038 	 * both the node itself, and on nearby nodes.
   2039 	 */
   2040 	if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
   2041 		unsigned long score, max_score = 0;
   2042 		int node, max_node = nid;
   2043 
   2044 		dist = sched_max_numa_distance;
   2045 
   2046 		for_each_online_node(node) {
   2047 			score = group_weight(p, node, dist);
   2048 			if (score > max_score) {
   2049 				max_score = score;
   2050 				max_node = node;
   2051 			}
   2052 		}
   2053 		return max_node;
   2054 	}
   2055 
   2056 	/*
   2057 	 * Finding the preferred nid in a system with NUMA backplane
   2058 	 * interconnect topology is more involved. The goal is to locate
   2059 	 * tasks from numa_groups near each other in the system, and
   2060 	 * untangle workloads from different sides of the system. This requires
   2061 	 * searching down the hierarchy of node groups, recursively searching
   2062 	 * inside the highest scoring group of nodes. The nodemask tricks
   2063 	 * keep the complexity of the search down.
   2064 	 */
   2065 	nodes = node_online_map;
   2066 	for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
   2067 		unsigned long max_faults = 0;
   2068 		nodemask_t max_group = NODE_MASK_NONE;
   2069 		int a, b;
   2070 
   2071 		/* Are there nodes at this distance from each other? */
   2072 		if (!find_numa_distance(dist))
   2073 			continue;
   2074 
   2075 		for_each_node_mask(a, nodes) {
   2076 			unsigned long faults = 0;
   2077 			nodemask_t this_group;
   2078 			nodes_clear(this_group);
   2079 
   2080 			/* Sum group's NUMA faults; includes a==b case. */
   2081 			for_each_node_mask(b, nodes) {
   2082 				if (node_distance(a, b) < dist) {
   2083 					faults += group_faults(p, b);
   2084 					node_set(b, this_group);
   2085 					node_clear(b, nodes);
   2086 				}
   2087 			}
   2088 
   2089 			/* Remember the top group. */
   2090 			if (faults > max_faults) {
   2091 				max_faults = faults;
   2092 				max_group = this_group;
   2093 				/*
   2094 				 * subtle: at the smallest distance there is
   2095 				 * just one node left in each "group", the
   2096 				 * winner is the preferred nid.
   2097 				 */
   2098 				nid = a;
   2099 			}
   2100 		}
   2101 		/* Next round, evaluate the nodes within max_group. */
   2102 		if (!max_faults)
   2103 			break;
   2104 		nodes = max_group;
   2105 	}
   2106 	return nid;
   2107 }
   2108 
   2109 static void task_numa_placement(struct task_struct *p)
   2110 {
   2111 	int seq, nid, max_nid = NUMA_NO_NODE;
   2112 	unsigned long max_faults = 0;
   2113 	unsigned long fault_types[2] = { 0, 0 };
   2114 	unsigned long total_faults;
   2115 	u64 runtime, period;
   2116 	spinlock_t *group_lock = NULL;
   2117 
   2118 	/*
   2119 	 * The p->mm->numa_scan_seq field gets updated without
   2120 	 * exclusive access. Use READ_ONCE() here to ensure
   2121 	 * that the field is read in a single access:
   2122 	 */
   2123 	seq = READ_ONCE(p->mm->numa_scan_seq);
   2124 	if (p->numa_scan_seq == seq)
   2125 		return;
   2126 	p->numa_scan_seq = seq;
   2127 	p->numa_scan_period_max = task_scan_max(p);
   2128 
   2129 	total_faults = p->numa_faults_locality[0] +
   2130 		       p->numa_faults_locality[1];
   2131 	runtime = numa_get_avg_runtime(p, &period);
   2132 
   2133 	/* If the task is part of a group prevent parallel updates to group stats */
   2134 	if (p->numa_group) {
   2135 		group_lock = &p->numa_group->lock;
   2136 		spin_lock_irq(group_lock);
   2137 	}
   2138 
   2139 	/* Find the node with the highest number of faults */
   2140 	for_each_online_node(nid) {
   2141 		/* Keep track of the offsets in numa_faults array */
   2142 		int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
   2143 		unsigned long faults = 0, group_faults = 0;
   2144 		int priv;
   2145 
   2146 		for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
   2147 			long diff, f_diff, f_weight;
   2148 
   2149 			mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
   2150 			membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
   2151 			cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
   2152 			cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
   2153 
   2154 			/* Decay existing window, copy faults since last scan */
   2155 			diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
   2156 			fault_types[priv] += p->numa_faults[membuf_idx];
   2157 			p->numa_faults[membuf_idx] = 0;
   2158 
   2159 			/*
   2160 			 * Normalize the faults_from, so all tasks in a group
   2161 			 * count according to CPU use, instead of by the raw
   2162 			 * number of faults. Tasks with little runtime have
   2163 			 * little over-all impact on throughput, and thus their
   2164 			 * faults are less important.
   2165 			 */
   2166 			f_weight = div64_u64(runtime << 16, period + 1);
   2167 			f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
   2168 				   (total_faults + 1);
   2169 			f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
   2170 			p->numa_faults[cpubuf_idx] = 0;
   2171 
   2172 			p->numa_faults[mem_idx] += diff;
   2173 			p->numa_faults[cpu_idx] += f_diff;
   2174 			faults += p->numa_faults[mem_idx];
   2175 			p->total_numa_faults += diff;
   2176 			if (p->numa_group) {
   2177 				/*
   2178 				 * safe because we can only change our own group
   2179 				 *
   2180 				 * mem_idx represents the offset for a given
   2181 				 * nid and priv in a specific region because it
   2182 				 * is at the beginning of the numa_faults array.
   2183 				 */
   2184 				p->numa_group->faults[mem_idx] += diff;
   2185 				p->numa_group->faults_cpu[mem_idx] += f_diff;
   2186 				p->numa_group->total_faults += diff;
   2187 				group_faults += p->numa_group->faults[mem_idx];
   2188 			}
   2189 		}
   2190 
   2191 		if (!p->numa_group) {
   2192 			if (faults > max_faults) {
   2193 				max_faults = faults;
   2194 				max_nid = nid;
   2195 			}
   2196 		} else if (group_faults > max_faults) {
   2197 			max_faults = group_faults;
   2198 			max_nid = nid;
   2199 		}
   2200 	}
   2201 
   2202 	if (p->numa_group) {
   2203 		numa_group_count_active_nodes(p->numa_group);
   2204 		spin_unlock_irq(group_lock);
   2205 		max_nid = preferred_group_nid(p, max_nid);
   2206 	}
   2207 
   2208 	if (max_faults) {
   2209 		/* Set the new preferred node */
   2210 		if (max_nid != p->numa_preferred_nid)
   2211 			sched_setnuma(p, max_nid);
   2212 	}
   2213 
   2214 	update_task_scan_period(p, fault_types[0], fault_types[1]);
   2215 }
   2216 
   2217 static inline int get_numa_group(struct numa_group *grp)
   2218 {
   2219 	return refcount_inc_not_zero(&grp->refcount);
   2220 }
   2221 
   2222 static inline void put_numa_group(struct numa_group *grp)
   2223 {
   2224 	if (refcount_dec_and_test(&grp->refcount))
   2225 		kfree_rcu(grp, rcu);
   2226 }
   2227 
   2228 static void task_numa_group(struct task_struct *p, int cpupid, int flags,
   2229 			int *priv)
   2230 {
   2231 	struct numa_group *grp, *my_grp;
   2232 	struct task_struct *tsk;
   2233 	bool join = false;
   2234 	int cpu = cpupid_to_cpu(cpupid);
   2235 	int i;
   2236 
   2237 	if (unlikely(!p->numa_group)) {
   2238 		unsigned int size = sizeof(struct numa_group) +
   2239 				    4*nr_node_ids*sizeof(unsigned long);
   2240 
   2241 		grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
   2242 		if (!grp)
   2243 			return;
   2244 
   2245 		refcount_set(&grp->refcount, 1);
   2246 		grp->active_nodes = 1;
   2247 		grp->max_faults_cpu = 0;
   2248 		spin_lock_init(&grp->lock);
   2249 		grp->gid = p->pid;
   2250 		/* Second half of the array tracks nids where faults happen */
   2251 		grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
   2252 						nr_node_ids;
   2253 
   2254 		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
   2255 			grp->faults[i] = p->numa_faults[i];
   2256 
   2257 		grp->total_faults = p->total_numa_faults;
   2258 
   2259 		grp->nr_tasks++;
   2260 		rcu_assign_pointer(p->numa_group, grp);
   2261 	}
   2262 
   2263 	rcu_read_lock();
   2264 	tsk = READ_ONCE(cpu_rq(cpu)->curr);
   2265 
   2266 	if (!cpupid_match_pid(tsk, cpupid))
   2267 		goto no_join;
   2268 
   2269 	grp = rcu_dereference(tsk->numa_group);
   2270 	if (!grp)
   2271 		goto no_join;
   2272 
   2273 	my_grp = p->numa_group;
   2274 	if (grp == my_grp)
   2275 		goto no_join;
   2276 
   2277 	/*
   2278 	 * Only join the other group if its bigger; if we're the bigger group,
   2279 	 * the other task will join us.
   2280 	 */
   2281 	if (my_grp->nr_tasks > grp->nr_tasks)
   2282 		goto no_join;
   2283 
   2284 	/*
   2285 	 * Tie-break on the grp address.
   2286 	 */
   2287 	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
   2288 		goto no_join;
   2289 
   2290 	/* Always join threads in the same process. */
   2291 	if (tsk->mm == current->mm)
   2292 		join = true;
   2293 
   2294 	/* Simple filter to avoid false positives due to PID collisions */
   2295 	if (flags & TNF_SHARED)
   2296 		join = true;
   2297 
   2298 	/* Update priv based on whether false sharing was detected */
   2299 	*priv = !join;
   2300 
   2301 	if (join && !get_numa_group(grp))
   2302 		goto no_join;
   2303 
   2304 	rcu_read_unlock();
   2305 
   2306 	if (!join)
   2307 		return;
   2308 
   2309 	BUG_ON(irqs_disabled());
   2310 	double_lock_irq(&my_grp->lock, &grp->lock);
   2311 
   2312 	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
   2313 		my_grp->faults[i] -= p->numa_faults[i];
   2314 		grp->faults[i] += p->numa_faults[i];
   2315 	}
   2316 	my_grp->total_faults -= p->total_numa_faults;
   2317 	grp->total_faults += p->total_numa_faults;
   2318 
   2319 	my_grp->nr_tasks--;
   2320 	grp->nr_tasks++;
   2321 
   2322 	spin_unlock(&my_grp->lock);
   2323 	spin_unlock_irq(&grp->lock);
   2324 
   2325 	rcu_assign_pointer(p->numa_group, grp);
   2326 
   2327 	put_numa_group(my_grp);
   2328 	return;
   2329 
   2330 no_join:
   2331 	rcu_read_unlock();
   2332 	return;
   2333 }
   2334 
   2335 void task_numa_free(struct task_struct *p)
   2336 {
   2337 	struct numa_group *grp = p->numa_group;
   2338 	void *numa_faults = p->numa_faults;
   2339 	unsigned long flags;
   2340 	int i;
   2341 
   2342 	if (grp) {
   2343 		spin_lock_irqsave(&grp->lock, flags);
   2344 		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
   2345 			grp->faults[i] -= p->numa_faults[i];
   2346 		grp->total_faults -= p->total_numa_faults;
   2347 
   2348 		grp->nr_tasks--;
   2349 		spin_unlock_irqrestore(&grp->lock, flags);
   2350 		RCU_INIT_POINTER(p->numa_group, NULL);
   2351 		put_numa_group(grp);
   2352 	}
   2353 
   2354 	p->numa_faults = NULL;
   2355 	kfree(numa_faults);
   2356 }
   2357 
   2358 /*
   2359  * Got a PROT_NONE fault for a page on @node.
   2360  */
   2361 void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
   2362 {
   2363 	struct task_struct *p = current;
   2364 	bool migrated = flags & TNF_MIGRATED;
   2365 	int cpu_node = task_node(current);
   2366 	int local = !!(flags & TNF_FAULT_LOCAL);
   2367 	struct numa_group *ng;
   2368 	int priv;
   2369 
   2370 	if (!static_branch_likely(&sched_numa_balancing))
   2371 		return;
   2372 
   2373 	/* for example, ksmd faulting in a user's mm */
   2374 	if (!p->mm)
   2375 		return;
   2376 
   2377 	/* Allocate buffer to track faults on a per-node basis */
   2378 	if (unlikely(!p->numa_faults)) {
   2379 		int size = sizeof(*p->numa_faults) *
   2380 			   NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
   2381 
   2382 		p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
   2383 		if (!p->numa_faults)
   2384 			return;
   2385 
   2386 		p->total_numa_faults = 0;
   2387 		memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
   2388 	}
   2389 
   2390 	/*
   2391 	 * First accesses are treated as private, otherwise consider accesses
   2392 	 * to be private if the accessing pid has not changed
   2393 	 */
   2394 	if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
   2395 		priv = 1;
   2396 	} else {
   2397 		priv = cpupid_match_pid(p, last_cpupid);
   2398 		if (!priv && !(flags & TNF_NO_GROUP))
   2399 			task_numa_group(p, last_cpupid, flags, &priv);
   2400 	}
   2401 
   2402 	/*
   2403 	 * If a workload spans multiple NUMA nodes, a shared fault that
   2404 	 * occurs wholly within the set of nodes that the workload is
   2405 	 * actively using should be counted as local. This allows the
   2406 	 * scan rate to slow down when a workload has settled down.
   2407 	 */
   2408 	ng = p->numa_group;
   2409 	if (!priv && !local && ng && ng->active_nodes > 1 &&
   2410 				numa_is_active_node(cpu_node, ng) &&
   2411 				numa_is_active_node(mem_node, ng))
   2412 		local = 1;
   2413 
   2414 	/*
   2415 	 * Retry to migrate task to preferred node periodically, in case it
   2416 	 * previously failed, or the scheduler moved us.
   2417 	 */
   2418 	if (time_after(jiffies, p->numa_migrate_retry)) {
   2419 		task_numa_placement(p);
   2420 		numa_migrate_preferred(p);
   2421 	}
   2422 
   2423 	if (migrated)
   2424 		p->numa_pages_migrated += pages;
   2425 	if (flags & TNF_MIGRATE_FAIL)
   2426 		p->numa_faults_locality[2] += pages;
   2427 
   2428 	p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
   2429 	p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
   2430 	p->numa_faults_locality[local] += pages;
   2431 }
   2432 
   2433 static void reset_ptenuma_scan(struct task_struct *p)
   2434 {
   2435 	/*
   2436 	 * We only did a read acquisition of the mmap sem, so
   2437 	 * p->mm->numa_scan_seq is written to without exclusive access
   2438 	 * and the update is not guaranteed to be atomic. That's not
   2439 	 * much of an issue though, since this is just used for
   2440 	 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
   2441 	 * expensive, to avoid any form of compiler optimizations:
   2442 	 */
   2443 	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
   2444 	p->mm->numa_scan_offset = 0;
   2445 }
   2446 
   2447 /*
   2448  * The expensive part of numa migration is done from task_work context.
   2449  * Triggered from task_tick_numa().
   2450  */
   2451 void task_numa_work(struct callback_head *work)
   2452 {
   2453 	unsigned long migrate, next_scan, now = jiffies;
   2454 	struct task_struct *p = current;
   2455 	struct mm_struct *mm = p->mm;
   2456 	u64 runtime = p->se.sum_exec_runtime;
   2457 	struct vm_area_struct *vma;
   2458 	unsigned long start, end;
   2459 	unsigned long nr_pte_updates = 0;
   2460 	long pages, virtpages;
   2461 
   2462 	SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
   2463 
   2464 	work->next = work; /* protect against double add */
   2465 	/*
   2466 	 * Who cares about NUMA placement when they're dying.
   2467 	 *
   2468 	 * NOTE: make sure not to dereference p->mm before this check,
   2469 	 * exit_task_work() happens _after_ exit_mm() so we could be called
   2470 	 * without p->mm even though we still had it when we enqueued this
   2471 	 * work.
   2472 	 */
   2473 	if (p->flags & PF_EXITING)
   2474 		return;
   2475 
   2476 	if (!mm->numa_next_scan) {
   2477 		mm->numa_next_scan = now +
   2478 			msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
   2479 	}
   2480 
   2481 	/*
   2482 	 * Enforce maximal scan/migration frequency..
   2483 	 */
   2484 	migrate = mm->numa_next_scan;
   2485 	if (time_before(now, migrate))
   2486 		return;
   2487 
   2488 	if (p->numa_scan_period == 0) {
   2489 		p->numa_scan_period_max = task_scan_max(p);
   2490 		p->numa_scan_period = task_scan_start(p);
   2491 	}
   2492 
   2493 	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
   2494 	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
   2495 		return;
   2496 
   2497 	/*
   2498 	 * Delay this task enough that another task of this mm will likely win
   2499 	 * the next time around.
   2500 	 */
   2501 	p->node_stamp += 2 * TICK_NSEC;
   2502 
   2503 	start = mm->numa_scan_offset;
   2504 	pages = sysctl_numa_balancing_scan_size;
   2505 	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
   2506 	virtpages = pages * 8;	   /* Scan up to this much virtual space */
   2507 	if (!pages)
   2508 		return;
   2509 
   2510 
   2511 	if (!down_read_trylock(&mm->mmap_sem))
   2512 		return;
   2513 	vma = find_vma(mm, start);
   2514 	if (!vma) {
   2515 		reset_ptenuma_scan(p);
   2516 		start = 0;
   2517 		vma = mm->mmap;
   2518 	}
   2519 	for (; vma; vma = vma->vm_next) {
   2520 		if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
   2521 			is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
   2522 			continue;
   2523 		}
   2524 
   2525 		/*
   2526 		 * Shared library pages mapped by multiple processes are not
   2527 		 * migrated as it is expected they are cache replicated. Avoid
   2528 		 * hinting faults in read-only file-backed mappings or the vdso
   2529 		 * as migrating the pages will be of marginal benefit.
   2530 		 */
   2531 		if (!vma->vm_mm ||
   2532 		    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
   2533 			continue;
   2534 
   2535 		/*
   2536 		 * Skip inaccessible VMAs to avoid any confusion between
   2537 		 * PROT_NONE and NUMA hinting ptes
   2538 		 */
   2539 		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
   2540 			continue;
   2541 
   2542 		do {
   2543 			start = max(start, vma->vm_start);
   2544 			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
   2545 			end = min(end, vma->vm_end);
   2546 			nr_pte_updates = change_prot_numa(vma, start, end);
   2547 
   2548 			/*
   2549 			 * Try to scan sysctl_numa_balancing_size worth of
   2550 			 * hpages that have at least one present PTE that
   2551 			 * is not already pte-numa. If the VMA contains
   2552 			 * areas that are unused or already full of prot_numa
   2553 			 * PTEs, scan up to virtpages, to skip through those
   2554 			 * areas faster.
   2555 			 */
   2556 			if (nr_pte_updates)
   2557 				pages -= (end - start) >> PAGE_SHIFT;
   2558 			virtpages -= (end - start) >> PAGE_SHIFT;
   2559 
   2560 			start = end;
   2561 			if (pages <= 0 || virtpages <= 0)
   2562 				goto out;
   2563 
   2564 			cond_resched();
   2565 		} while (end != vma->vm_end);
   2566 	}
   2567 
   2568 out:
   2569 	/*
   2570 	 * It is possible to reach the end of the VMA list but the last few
   2571 	 * VMAs are not guaranteed to the vma_migratable. If they are not, we
   2572 	 * would find the !migratable VMA on the next scan but not reset the
   2573 	 * scanner to the start so check it now.
   2574 	 */
   2575 	if (vma)
   2576 		mm->numa_scan_offset = start;
   2577 	else
   2578 		reset_ptenuma_scan(p);
   2579 	up_read(&mm->mmap_sem);
   2580 
   2581 	/*
   2582 	 * Make sure tasks use at least 32x as much time to run other code
   2583 	 * than they used here, to limit NUMA PTE scanning overhead to 3% max.
   2584 	 * Usually update_task_scan_period slows down scanning enough; on an
   2585 	 * overloaded system we need to limit overhead on a per task basis.
   2586 	 */
   2587 	if (unlikely(p->se.sum_exec_runtime != runtime)) {
   2588 		u64 diff = p->se.sum_exec_runtime - runtime;
   2589 		p->node_stamp += 32 * diff;
   2590 	}
   2591 }
   2592 
   2593 /*
   2594  * Drive the periodic memory faults..
   2595  */
   2596 void task_tick_numa(struct rq *rq, struct task_struct *curr)
   2597 {
   2598 	struct callback_head *work = &curr->numa_work;
   2599 	u64 period, now;
   2600 
   2601 	/*
   2602 	 * We don't care about NUMA placement if we don't have memory.
   2603 	 */
   2604 	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
   2605 		return;
   2606 
   2607 	/*
   2608 	 * Using runtime rather than walltime has the dual advantage that
   2609 	 * we (mostly) drive the selection from busy threads and that the
   2610 	 * task needs to have done some actual work before we bother with
   2611 	 * NUMA placement.
   2612 	 */
   2613 	now = curr->se.sum_exec_runtime;
   2614 	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
   2615 
   2616 	if (now > curr->node_stamp + period) {
   2617 		if (!curr->node_stamp)
   2618 			curr->numa_scan_period = task_scan_start(curr);
   2619 		curr->node_stamp += period;
   2620 
   2621 		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
   2622 			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
   2623 			task_work_add(curr, work, true);
   2624 		}
   2625 	}
   2626 }
   2627 
   2628 static void update_scan_period(struct task_struct *p, int new_cpu)
   2629 {
   2630 	int src_nid = cpu_to_node(task_cpu(p));
   2631 	int dst_nid = cpu_to_node(new_cpu);
   2632 
   2633 	if (!static_branch_likely(&sched_numa_balancing))
   2634 		return;
   2635 
   2636 	if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
   2637 		return;
   2638 
   2639 	if (src_nid == dst_nid)
   2640 		return;
   2641 
   2642 	/*
   2643 	 * Allow resets if faults have been trapped before one scan
   2644 	 * has completed. This is most likely due to a new task that
   2645 	 * is pulled cross-node due to wakeups or load balancing.
   2646 	 */
   2647 	if (p->numa_scan_seq) {
   2648 		/*
   2649 		 * Avoid scan adjustments if moving to the preferred
   2650 		 * node or if the task was not previously running on
   2651 		 * the preferred node.
   2652 		 */
   2653 		if (dst_nid == p->numa_preferred_nid ||
   2654 		    (p->numa_preferred_nid != NUMA_NO_NODE &&
   2655 			src_nid != p->numa_preferred_nid))
   2656 			return;
   2657 	}
   2658 
   2659 	p->numa_scan_period = task_scan_start(p);
   2660 }
   2661 
   2662 #else
   2663 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
   2664 {
   2665 }
   2666 
   2667 static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
   2668 {
   2669 }
   2670 
   2671 static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
   2672 {
   2673 }
   2674 
   2675 static inline void update_scan_period(struct task_struct *p, int new_cpu)
   2676 {
   2677 }
   2678 
   2679 #endif /* CONFIG_NUMA_BALANCING */
   2680 
   2681 static void
   2682 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
   2683 {
   2684 	update_load_add(&cfs_rq->load, se->load.weight);
   2685 	if (!parent_entity(se))
   2686 		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
   2687 #ifdef CONFIG_SMP
   2688 	if (entity_is_task(se)) {
   2689 		struct rq *rq = rq_of(cfs_rq);
   2690 
   2691 		account_numa_enqueue(rq, task_of(se));
   2692 		list_add(&se->group_node, &rq->cfs_tasks);
   2693 	}
   2694 #endif
   2695 	cfs_rq->nr_running++;
   2696 }
   2697 
   2698 static void
   2699 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
   2700 {
   2701 	update_load_sub(&cfs_rq->load, se->load.weight);
   2702 	if (!parent_entity(se))
   2703 		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
   2704 #ifdef CONFIG_SMP
   2705 	if (entity_is_task(se)) {
   2706 		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
   2707 		list_del_init(&se->group_node);
   2708 	}
   2709 #endif
   2710 	cfs_rq->nr_running--;
   2711 }
   2712 
   2713 /*
   2714  * Signed add and clamp on underflow.
   2715  *
   2716  * Explicitly do a load-store to ensure the intermediate value never hits
   2717  * memory. This allows lockless observations without ever seeing the negative
   2718  * values.
   2719  */
   2720 #define add_positive(_ptr, _val) do {                           \
   2721 	typeof(_ptr) ptr = (_ptr);                              \
   2722 	typeof(_val) val = (_val);                              \
   2723 	typeof(*ptr) res, var = READ_ONCE(*ptr);                \
   2724 								\
   2725 	res = var + val;                                        \
   2726 								\
   2727 	if (val < 0 && res > var)                               \
   2728 		res = 0;                                        \
   2729 								\
   2730 	WRITE_ONCE(*ptr, res);                                  \
   2731 } while (0)
   2732 
   2733 /*
   2734  * Unsigned subtract and clamp on underflow.
   2735  *
   2736  * Explicitly do a load-store to ensure the intermediate value never hits
   2737  * memory. This allows lockless observations without ever seeing the negative
   2738  * values.
   2739  */
   2740 #define sub_positive(_ptr, _val) do {				\
   2741 	typeof(_ptr) ptr = (_ptr);				\
   2742 	typeof(*ptr) val = (_val);				\
   2743 	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
   2744 	res = var - val;					\
   2745 	if (res > var)						\
   2746 		res = 0;					\
   2747 	WRITE_ONCE(*ptr, res);					\
   2748 } while (0)
   2749 
   2750 /*
   2751  * Remove and clamp on negative, from a local variable.
   2752  *
   2753  * A variant of sub_positive(), which does not use explicit load-store
   2754  * and is thus optimized for local variable updates.
   2755  */
   2756 #define lsub_positive(_ptr, _val) do {				\
   2757 	typeof(_ptr) ptr = (_ptr);				\
   2758 	*ptr -= min_t(typeof(*ptr), *ptr, _val);		\
   2759 } while (0)
   2760 
   2761 #ifdef CONFIG_SMP
   2762 static inline void
   2763 enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
   2764 {
   2765 	cfs_rq->runnable_weight += se->runnable_weight;
   2766 
   2767 	cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
   2768 	cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
   2769 }
   2770 
   2771 static inline void
   2772 dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
   2773 {
   2774 	cfs_rq->runnable_weight -= se->runnable_weight;
   2775 
   2776 	sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
   2777 	sub_positive(&cfs_rq->avg.runnable_load_sum,
   2778 		     se_runnable(se) * se->avg.runnable_load_sum);
   2779 }
   2780 
   2781 static inline void
   2782 enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
   2783 {
   2784 	cfs_rq->avg.load_avg += se->avg.load_avg;
   2785 	cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
   2786 }
   2787 
   2788 static inline void
   2789 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
   2790 {
   2791 	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
   2792 	sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
   2793 }
   2794 #else
   2795 static inline void
   2796 enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
   2797 static inline void
   2798 dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
   2799 static inline void
   2800 enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
   2801 static inline void
   2802 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
   2803 #endif
   2804 
   2805 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
   2806 			    unsigned long weight, unsigned long runnable)
   2807 {
   2808 	if (se->on_rq) {
   2809 		/* commit outstanding execution time */
   2810 		if (cfs_rq->curr == se)
   2811 			update_curr(cfs_rq);
   2812 		account_entity_dequeue(cfs_rq, se);
   2813 		dequeue_runnable_load_avg(cfs_rq, se);
   2814 	}
   2815 	dequeue_load_avg(cfs_rq, se);
   2816 
   2817 	se->runnable_weight = runnable;
   2818 	update_load_set(&se->load, weight);
   2819 
   2820 #ifdef CONFIG_SMP
   2821 	do {
   2822 		u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
   2823 
   2824 		se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
   2825 		se->avg.runnable_load_avg =
   2826 			div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
   2827 	} while (0);
   2828 #endif
   2829 
   2830 	enqueue_load_avg(cfs_rq, se);
   2831 	if (se->on_rq) {
   2832 		account_entity_enqueue(cfs_rq, se);
   2833 		enqueue_runnable_load_avg(cfs_rq, se);
   2834 	}
   2835 }
   2836 
   2837 void reweight_task(struct task_struct *p, int prio)
   2838 {
   2839 	struct sched_entity *se = &p->se;
   2840 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
   2841 	struct load_weight *load = &se->load;
   2842 	unsigned long weight = scale_load(sched_prio_to_weight[prio]);
   2843 
   2844 	reweight_entity(cfs_rq, se, weight, weight);
   2845 	load->inv_weight = sched_prio_to_wmult[prio];
   2846 }
   2847 
   2848 #ifdef CONFIG_FAIR_GROUP_SCHED
   2849 #ifdef CONFIG_SMP
   2850 /*
   2851  * All this does is approximate the hierarchical proportion which includes that
   2852  * global sum we all love to hate.
   2853  *
   2854  * That is, the weight of a group entity, is the proportional share of the
   2855  * group weight based on the group runqueue weights. That is:
   2856  *
   2857  *                     tg->weight * grq->load.weight
   2858  *   ge->load.weight = -----------------------------               (1)
   2859  *			  \Sum grq->load.weight
   2860  *
   2861  * Now, because computing that sum is prohibitively expensive to compute (been
   2862  * there, done that) we approximate it with this average stuff. The average
   2863  * moves slower and therefore the approximation is cheaper and more stable.
   2864  *
   2865  * So instead of the above, we substitute:
   2866  *
   2867  *   grq->load.weight -> grq->avg.load_avg                         (2)
   2868  *
   2869  * which yields the following:
   2870  *
   2871  *                     tg->weight * grq->avg.load_avg
   2872  *   ge->load.weight = ------------------------------              (3)
   2873  *				tg->load_avg
   2874  *
   2875  * Where: tg->load_avg ~= \Sum grq->avg.load_avg
   2876  *
   2877  * That is shares_avg, and it is right (given the approximation (2)).
   2878  *
   2879  * The problem with it is that because the average is slow -- it was designed
   2880  * to be exactly that of course -- this leads to transients in boundary
   2881  * conditions. In specific, the case where the group was idle and we start the
   2882  * one task. It takes time for our CPU's grq->avg.load_avg to build up,
   2883  * yielding bad latency etc..
   2884  *
   2885  * Now, in that special case (1) reduces to:
   2886  *
   2887  *                     tg->weight * grq->load.weight
   2888  *   ge->load.weight = ----------------------------- = tg->weight   (4)
   2889  *			    grp->load.weight
   2890  *
   2891  * That is, the sum collapses because all other CPUs are idle; the UP scenario.
   2892  *
   2893  * So what we do is modify our approximation (3) to approach (4) in the (near)
   2894  * UP case, like:
   2895  *
   2896  *   ge->load.weight =
   2897  *
   2898  *              tg->weight * grq->load.weight
   2899  *     ---------------------------------------------------         (5)
   2900  *     tg->load_avg - grq->avg.load_avg + grq->load.weight
   2901  *
   2902  * But because grq->load.weight can drop to 0, resulting in a divide by zero,
   2903  * we need to use grq->avg.load_avg as its lower bound, which then gives:
   2904  *
   2905  *
   2906  *                     tg->weight * grq->load.weight
   2907  *   ge->load.weight = -----------------------------		   (6)
   2908  *				tg_load_avg'
   2909  *
   2910  * Where:
   2911  *
   2912  *   tg_load_avg' = tg->load_avg - grq->avg.load_avg +
   2913  *                  max(grq->load.weight, grq->avg.load_avg)
   2914  *
   2915  * And that is shares_weight and is icky. In the (near) UP case it approaches
   2916  * (4) while in the normal case it approaches (3). It consistently
   2917  * overestimates the ge->load.weight and therefore:
   2918  *
   2919  *   \Sum ge->load.weight >= tg->weight
   2920  *
   2921  * hence icky!
   2922  */
   2923 static long calc_group_shares(struct cfs_rq *cfs_rq)
   2924 {
   2925 	long tg_weight, tg_shares, load, shares;
   2926 	struct task_group *tg = cfs_rq->tg;
   2927 
   2928 	tg_shares = READ_ONCE(tg->shares);
   2929 
   2930 	load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
   2931 
   2932 	tg_weight = atomic_long_read(&tg->load_avg);
   2933 
   2934 	/* Ensure tg_weight >= load */
   2935 	tg_weight -= cfs_rq->tg_load_avg_contrib;
   2936 	tg_weight += load;
   2937 
   2938 	shares = (tg_shares * load);
   2939 	if (tg_weight)
   2940 		shares /= tg_weight;
   2941 
   2942 	/*
   2943 	 * MIN_SHARES has to be unscaled here to support per-CPU partitioning
   2944 	 * of a group with small tg->shares value. It is a floor value which is
   2945 	 * assigned as a minimum load.weight to the sched_entity representing
   2946 	 * the group on a CPU.
   2947 	 *
   2948 	 * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024
   2949 	 * on an 8-core system with 8 tasks each runnable on one CPU shares has
   2950 	 * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In
   2951 	 * case no task is runnable on a CPU MIN_SHARES=2 should be returned
   2952 	 * instead of 0.
   2953 	 */
   2954 	return clamp_t(long, shares, MIN_SHARES, tg_shares);
   2955 }
   2956 
   2957 /*
   2958  * This calculates the effective runnable weight for a group entity based on
   2959  * the group entity weight calculated above.
   2960  *
   2961  * Because of the above approximation (2), our group entity weight is
   2962  * an load_avg based ratio (3). This means that it includes blocked load and
   2963  * does not represent the runnable weight.
   2964  *
   2965  * Approximate the group entity's runnable weight per ratio from the group
   2966  * runqueue:
   2967  *
   2968  *					     grq->avg.runnable_load_avg
   2969  *   ge->runnable_weight = ge->load.weight * -------------------------- (7)
   2970  *						 grq->avg.load_avg
   2971  *
   2972  * However, analogous to above, since the avg numbers are slow, this leads to
   2973  * transients in the from-idle case. Instead we use:
   2974  *
   2975  *   ge->runnable_weight = ge->load.weight *
   2976  *
   2977  *		max(grq->avg.runnable_load_avg, grq->runnable_weight)
   2978  *		-----------------------------------------------------	(8)
   2979  *		      max(grq->avg.load_avg, grq->load.weight)
   2980  *
   2981  * Where these max() serve both to use the 'instant' values to fix the slow
   2982  * from-idle and avoid the /0 on to-idle, similar to (6).
   2983  */
   2984 static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
   2985 {
   2986 	long runnable, load_avg;
   2987 
   2988 	load_avg = max(cfs_rq->avg.load_avg,
   2989 		       scale_load_down(cfs_rq->load.weight));
   2990 
   2991 	runnable = max(cfs_rq->avg.runnable_load_avg,
   2992 		       scale_load_down(cfs_rq->runnable_weight));
   2993 
   2994 	runnable *= shares;
   2995 	if (load_avg)
   2996 		runnable /= load_avg;
   2997 
   2998 	return clamp_t(long, runnable, MIN_SHARES, shares);
   2999 }
   3000 #endif /* CONFIG_SMP */
   3001 
   3002 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
   3003 
   3004 /*
   3005  * Recomputes the group entity based on the current state of its group
   3006  * runqueue.
   3007  */
   3008 static void update_cfs_group(struct sched_entity *se)
   3009 {
   3010 	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
   3011 	long shares, runnable;
   3012 
   3013 	if (!gcfs_rq)
   3014 		return;
   3015 
   3016 	if (throttled_hierarchy(gcfs_rq))
   3017 		return;
   3018 
   3019 #ifndef CONFIG_SMP
   3020 	runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
   3021 
   3022 	if (likely(se->load.weight == shares))
   3023 		return;
   3024 #else
   3025 	shares   = calc_group_shares(gcfs_rq);
   3026 	runnable = calc_group_runnable(gcfs_rq, shares);
   3027 #endif
   3028 
   3029 	reweight_entity(cfs_rq_of(se), se, shares, runnable);
   3030 }
   3031 
   3032 #else /* CONFIG_FAIR_GROUP_SCHED */
   3033 static inline void update_cfs_group(struct sched_entity *se)
   3034 {
   3035 }
   3036 #endif /* CONFIG_FAIR_GROUP_SCHED */
   3037 
   3038 static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
   3039 {
   3040 	struct rq *rq = rq_of(cfs_rq);
   3041 
   3042 	if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
   3043 		/*
   3044 		 * There are a few boundary cases this might miss but it should
   3045 		 * get called often enough that that should (hopefully) not be
   3046 		 * a real problem.
   3047 		 *
   3048 		 * It will not get called when we go idle, because the idle
   3049 		 * thread is a different class (!fair), nor will the utilization
   3050 		 * number include things like RT tasks.
   3051 		 *
   3052 		 * As is, the util number is not freq-invariant (we'd have to
   3053 		 * implement arch_scale_freq_capacity() for that).
   3054 		 *
   3055 		 * See cpu_util().
   3056 		 */
   3057 		cpufreq_update_util(rq, flags);
   3058 	}
   3059 }
   3060 
   3061 #ifdef CONFIG_SMP
   3062 #ifdef CONFIG_FAIR_GROUP_SCHED
   3063 /**
   3064  * update_tg_load_avg - update the tg's load avg
   3065  * @cfs_rq: the cfs_rq whose avg changed
   3066  * @force: update regardless of how small the difference
   3067  *
   3068  * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
   3069  * However, because tg->load_avg is a global value there are performance
   3070  * considerations.
   3071  *
   3072  * In order to avoid having to look at the other cfs_rq's, we use a
   3073  * differential update where we store the last value we propagated. This in
   3074  * turn allows skipping updates if the differential is 'small'.
   3075  *
   3076  * Updating tg's load_avg is necessary before update_cfs_share().
   3077  */
   3078 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
   3079 {
   3080 	long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
   3081 
   3082 	/*
   3083 	 * No need to update load_avg for root_task_group as it is not used.
   3084 	 */
   3085 	if (cfs_rq->tg == &root_task_group)
   3086 		return;
   3087 
   3088 	if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
   3089 		atomic_long_add(delta, &cfs_rq->tg->load_avg);
   3090 		cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
   3091 	}
   3092 }
   3093 
   3094 /*
   3095  * Called within set_task_rq() right before setting a task's CPU. The
   3096  * caller only guarantees p->pi_lock is held; no other assumptions,
   3097  * including the state of rq->lock, should be made.
   3098  */
   3099 void set_task_rq_fair(struct sched_entity *se,
   3100 		      struct cfs_rq *prev, struct cfs_rq *next)
   3101 {
   3102 	u64 p_last_update_time;
   3103 	u64 n_last_update_time;
   3104 
   3105 	if (!sched_feat(ATTACH_AGE_LOAD))
   3106 		return;
   3107 
   3108 	/*
   3109 	 * We are supposed to update the task to "current" time, then its up to
   3110 	 * date and ready to go to new CPU/cfs_rq. But we have difficulty in
   3111 	 * getting what current time is, so simply throw away the out-of-date
   3112 	 * time. This will result in the wakee task is less decayed, but giving
   3113 	 * the wakee more load sounds not bad.
   3114 	 */
   3115 	if (!(se->avg.last_update_time && prev))
   3116 		return;
   3117 
   3118 #ifndef CONFIG_64BIT
   3119 	{
   3120 		u64 p_last_update_time_copy;
   3121 		u64 n_last_update_time_copy;
   3122 
   3123 		do {
   3124 			p_last_update_time_copy = prev->load_last_update_time_copy;
   3125 			n_last_update_time_copy = next->load_last_update_time_copy;
   3126 
   3127 			smp_rmb();
   3128 
   3129 			p_last_update_time = prev->avg.last_update_time;
   3130 			n_last_update_time = next->avg.last_update_time;
   3131 
   3132 		} while (p_last_update_time != p_last_update_time_copy ||
   3133 			 n_last_update_time != n_last_update_time_copy);
   3134 	}
   3135 #else
   3136 	p_last_update_time = prev->avg.last_update_time;
   3137 	n_last_update_time = next->avg.last_update_time;
   3138 #endif
   3139 	__update_load_avg_blocked_se(p_last_update_time, se);
   3140 	se->avg.last_update_time = n_last_update_time;
   3141 }
   3142 
   3143 
   3144 /*
   3145  * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
   3146  * propagate its contribution. The key to this propagation is the invariant
   3147  * that for each group:
   3148  *
   3149  *   ge->avg == grq->avg						(1)
   3150  *
   3151  * _IFF_ we look at the pure running and runnable sums. Because they
   3152  * represent the very same entity, just at different points in the hierarchy.
   3153  *
   3154  * Per the above update_tg_cfs_util() is trivial and simply copies the running
   3155  * sum over (but still wrong, because the group entity and group rq do not have
   3156  * their PELT windows aligned).
   3157  *
   3158  * However, update_tg_cfs_runnable() is more complex. So we have:
   3159  *
   3160  *   ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg		(2)
   3161  *
   3162  * And since, like util, the runnable part should be directly transferable,
   3163  * the following would _appear_ to be the straight forward approach:
   3164  *
   3165  *   grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg	(3)
   3166  *
   3167  * And per (1) we have:
   3168  *
   3169  *   ge->avg.runnable_avg == grq->avg.runnable_avg
   3170  *
   3171  * Which gives:
   3172  *
   3173  *                      ge->load.weight * grq->avg.load_avg
   3174  *   ge->avg.load_avg = -----------------------------------		(4)
   3175  *                               grq->load.weight
   3176  *
   3177  * Except that is wrong!
   3178  *
   3179  * Because while for entities historical weight is not important and we
   3180  * really only care about our future and therefore can consider a pure
   3181  * runnable sum, runqueues can NOT do this.
   3182  *
   3183  * We specifically want runqueues to have a load_avg that includes
   3184  * historical weights. Those represent the blocked load, the load we expect
   3185  * to (shortly) return to us. This only works by keeping the weights as
   3186  * integral part of the sum. We therefore cannot decompose as per (3).
   3187  *
   3188  * Another reason this doesn't work is that runnable isn't a 0-sum entity.
   3189  * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the
   3190  * rq itself is runnable anywhere between 2/3 and 1 depending on how the
   3191  * runnable section of these tasks overlap (or not). If they were to perfectly
   3192  * align the rq as a whole would be runnable 2/3 of the time. If however we
   3193  * always have at least 1 runnable task, the rq as a whole is always runnable.
   3194  *
   3195  * So we'll have to approximate.. :/
   3196  *
   3197  * Given the constraint:
   3198  *
   3199  *   ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX
   3200  *
   3201  * We can construct a rule that adds runnable to a rq by assuming minimal
   3202  * overlap.
   3203  *
   3204  * On removal, we'll assume each task is equally runnable; which yields:
   3205  *
   3206  *   grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
   3207  *
   3208  * XXX: only do this for the part of runnable > running ?
   3209  *
   3210  */
   3211 
   3212 static inline void
   3213 update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
   3214 {
   3215 	long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
   3216 
   3217 	/* Nothing to update */
   3218 	if (!delta)
   3219 		return;
   3220 
   3221 	/*
   3222 	 * The relation between sum and avg is:
   3223 	 *
   3224 	 *   LOAD_AVG_MAX - 1024 + sa->period_contrib
   3225 	 *
   3226 	 * however, the PELT windows are not aligned between grq and gse.
   3227 	 */
   3228 
   3229 	/* Set new sched_entity's utilization */
   3230 	se->avg.util_avg = gcfs_rq->avg.util_avg;
   3231 	se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
   3232 
   3233 	/* Update parent cfs_rq utilization */
   3234 	add_positive(&cfs_rq->avg.util_avg, delta);
   3235 	cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
   3236 }
   3237 
   3238 static inline void
   3239 update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
   3240 {
   3241 	long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
   3242 	unsigned long runnable_load_avg, load_avg;
   3243 	u64 runnable_load_sum, load_sum = 0;
   3244 	s64 delta_sum;
   3245 
   3246 	if (!runnable_sum)
   3247 		return;
   3248 
   3249 	gcfs_rq->prop_runnable_sum = 0;
   3250 
   3251 	if (runnable_sum >= 0) {
   3252 		/*
   3253 		 * Add runnable; clip at LOAD_AVG_MAX. Reflects that until
   3254 		 * the CPU is saturated running == runnable.
   3255 		 */
   3256 		runnable_sum += se->avg.load_sum;
   3257 		runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
   3258 	} else {
   3259 		/*
   3260 		 * Estimate the new unweighted runnable_sum of the gcfs_rq by
   3261 		 * assuming all tasks are equally runnable.
   3262 		 */
   3263 		if (scale_load_down(gcfs_rq->load.weight)) {
   3264 			load_sum = div_s64(gcfs_rq->avg.load_sum,
   3265 				scale_load_down(gcfs_rq->load.weight));
   3266 		}
   3267 
   3268 		/* But make sure to not inflate se's runnable */
   3269 		runnable_sum = min(se->avg.load_sum, load_sum);
   3270 	}
   3271 
   3272 	/*
   3273 	 * runnable_sum can't be lower than running_sum
   3274 	 * Rescale running sum to be in the same range as runnable sum
   3275 	 * running_sum is in [0 : LOAD_AVG_MAX <<  SCHED_CAPACITY_SHIFT]
   3276 	 * runnable_sum is in [0 : LOAD_AVG_MAX]
   3277 	 */
   3278 	running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
   3279 	runnable_sum = max(runnable_sum, running_sum);
   3280 
   3281 	load_sum = (s64)se_weight(se) * runnable_sum;
   3282 	load_avg = div_s64(load_sum, LOAD_AVG_MAX);
   3283 
   3284 	delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
   3285 	delta_avg = load_avg - se->avg.load_avg;
   3286 
   3287 	se->avg.load_sum = runnable_sum;
   3288 	se->avg.load_avg = load_avg;
   3289 	add_positive(&cfs_rq->avg.load_avg, delta_avg);
   3290 	add_positive(&cfs_rq->avg.load_sum, delta_sum);
   3291 
   3292 	runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
   3293 	runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
   3294 	delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
   3295 	delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
   3296 
   3297 	se->avg.runnable_load_sum = runnable_sum;
   3298 	se->avg.runnable_load_avg = runnable_load_avg;
   3299 
   3300 	if (se->on_rq) {
   3301 		add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
   3302 		add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
   3303 	}
   3304 }
   3305 
   3306 static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
   3307 {
   3308 	cfs_rq->propagate = 1;
   3309 	cfs_rq->prop_runnable_sum += runnable_sum;
   3310 }
   3311 
   3312 /* Update task and its cfs_rq load average */
   3313 static inline int propagate_entity_load_avg(struct sched_entity *se)
   3314 {
   3315 	struct cfs_rq *cfs_rq, *gcfs_rq;
   3316 
   3317 	if (entity_is_task(se))
   3318 		return 0;
   3319 
   3320 	gcfs_rq = group_cfs_rq(se);
   3321 	if (!gcfs_rq->propagate)
   3322 		return 0;
   3323 
   3324 	gcfs_rq->propagate = 0;
   3325 
   3326 	cfs_rq = cfs_rq_of(se);
   3327 
   3328 	add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
   3329 
   3330 	update_tg_cfs_util(cfs_rq, se, gcfs_rq);
   3331 	update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
   3332 
   3333 	return 1;
   3334 }
   3335 
   3336 /*
   3337  * Check if we need to update the load and the utilization of a blocked
   3338  * group_entity:
   3339  */
   3340 static inline bool skip_blocked_update(struct sched_entity *se)
   3341 {
   3342 	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
   3343 
   3344 	/*
   3345 	 * If sched_entity still have not zero load or utilization, we have to
   3346 	 * decay it:
   3347 	 */
   3348 	if (se->avg.load_avg || se->avg.util_avg)
   3349 		return false;
   3350 
   3351 	/*
   3352 	 * If there is a pending propagation, we have to update the load and
   3353 	 * the utilization of the sched_entity:
   3354 	 */
   3355 	if (gcfs_rq->propagate)
   3356 		return false;
   3357 
   3358 	/*
   3359 	 * Otherwise, the load and the utilization of the sched_entity is
   3360 	 * already zero and there is no pending propagation, so it will be a
   3361 	 * waste of time to try to decay it:
   3362 	 */
   3363 	return true;
   3364 }
   3365 
   3366 #else /* CONFIG_FAIR_GROUP_SCHED */
   3367 
   3368 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
   3369 
   3370 static inline int propagate_entity_load_avg(struct sched_entity *se)
   3371 {
   3372 	return 0;
   3373 }
   3374 
   3375 static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
   3376 
   3377 #endif /* CONFIG_FAIR_GROUP_SCHED */
   3378 
   3379 /**
   3380  * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
   3381  * @now: current time, as per cfs_rq_clock_pelt()
   3382  * @cfs_rq: cfs_rq to update
   3383  *
   3384  * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
   3385  * avg. The immediate corollary is that all (fair) tasks must be attached, see
   3386  * post_init_entity_util_avg().
   3387  *
   3388  * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
   3389  *
   3390  * Returns true if the load decayed or we removed load.
   3391  *
   3392  * Since both these conditions indicate a changed cfs_rq->avg.load we should
   3393  * call update_tg_load_avg() when this function returns true.
   3394  */
   3395 static inline int
   3396 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
   3397 {
   3398 	unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
   3399 	struct sched_avg *sa = &cfs_rq->avg;
   3400 	int decayed = 0;
   3401 
   3402 	if (cfs_rq->removed.nr) {
   3403 		unsigned long r;
   3404 		u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
   3405 
   3406 		raw_spin_lock(&cfs_rq->removed.lock);
   3407 		swap(cfs_rq->removed.util_avg, removed_util);
   3408 		swap(cfs_rq->removed.load_avg, removed_load);
   3409 		swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
   3410 		cfs_rq->removed.nr = 0;
   3411 		raw_spin_unlock(&cfs_rq->removed.lock);
   3412 
   3413 		r = removed_load;
   3414 		sub_positive(&sa->load_avg, r);
   3415 		sub_positive(&sa->load_sum, r * divider);
   3416 
   3417 		r = removed_util;
   3418 		sub_positive(&sa->util_avg, r);
   3419 		sub_positive(&sa->util_sum, r * divider);
   3420 
   3421 		add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
   3422 
   3423 		decayed = 1;
   3424 	}
   3425 
   3426 	decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
   3427 
   3428 #ifndef CONFIG_64BIT
   3429 	smp_wmb();
   3430 	cfs_rq->load_last_update_time_copy = sa->last_update_time;
   3431 #endif
   3432 
   3433 	if (decayed)
   3434 		cfs_rq_util_change(cfs_rq, 0);
   3435 
   3436 	return decayed;
   3437 }
   3438 
   3439 /**
   3440  * attach_entity_load_avg - attach this entity to its cfs_rq load avg
   3441  * @cfs_rq: cfs_rq to attach to
   3442  * @se: sched_entity to attach
   3443  * @flags: migration hints
   3444  *
   3445  * Must call update_cfs_rq_load_avg() before this, since we rely on
   3446  * cfs_rq->avg.last_update_time being current.
   3447  */
   3448 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
   3449 {
   3450 	u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
   3451 
   3452 	/*
   3453 	 * When we attach the @se to the @cfs_rq, we must align the decay
   3454 	 * window because without that, really weird and wonderful things can
   3455 	 * happen.
   3456 	 *
   3457 	 * XXX illustrate
   3458 	 */
   3459 	se->avg.last_update_time = cfs_rq->avg.last_update_time;
   3460 	se->avg.period_contrib = cfs_rq->avg.period_contrib;
   3461 
   3462 	/*
   3463 	 * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
   3464 	 * period_contrib. This isn't strictly correct, but since we're
   3465 	 * entirely outside of the PELT hierarchy, nobody cares if we truncate
   3466 	 * _sum a little.
   3467 	 */
   3468 	se->avg.util_sum = se->avg.util_avg * divider;
   3469 
   3470 	se->avg.load_sum = divider;
   3471 	if (se_weight(se)) {
   3472 		se->avg.load_sum =
   3473 			div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
   3474 	}
   3475 
   3476 	se->avg.runnable_load_sum = se->avg.load_sum;
   3477 
   3478 	enqueue_load_avg(cfs_rq, se);
   3479 	cfs_rq->avg.util_avg += se->avg.util_avg;
   3480 	cfs_rq->avg.util_sum += se->avg.util_sum;
   3481 
   3482 	add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
   3483 
   3484 	cfs_rq_util_change(cfs_rq, flags);
   3485 }
   3486 
   3487 /**
   3488  * detach_entity_load_avg - detach this entity from its cfs_rq load avg
   3489  * @cfs_rq: cfs_rq to detach from
   3490  * @se: sched_entity to detach
   3491  *
   3492  * Must call update_cfs_rq_load_avg() before this, since we rely on
   3493  * cfs_rq->avg.last_update_time being current.
   3494  */
   3495 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
   3496 {
   3497 	dequeue_load_avg(cfs_rq, se);
   3498 	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
   3499 	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
   3500 
   3501 	add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
   3502 
   3503 	cfs_rq_util_change(cfs_rq, 0);
   3504 }
   3505 
   3506 /*
   3507  * Optional action to be done while updating the load average
   3508  */
   3509 #define UPDATE_TG	0x1
   3510 #define SKIP_AGE_LOAD	0x2
   3511 #define DO_ATTACH	0x4
   3512 
   3513 /* Update task and its cfs_rq load average */
   3514 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
   3515 {
   3516 	u64 now = cfs_rq_clock_pelt(cfs_rq);
   3517 	int decayed;
   3518 
   3519 	/*
   3520 	 * Track task load average for carrying it to new CPU after migrated, and
   3521 	 * track group sched_entity load average for task_h_load calc in migration
   3522 	 */
   3523 	if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
   3524 		__update_load_avg_se(now, cfs_rq, se);
   3525 
   3526 	decayed  = update_cfs_rq_load_avg(now, cfs_rq);
   3527 	decayed |= propagate_entity_load_avg(se);
   3528 
   3529 	if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
   3530 
   3531 		/*
   3532 		 * DO_ATTACH means we're here from enqueue_entity().
   3533 		 * !last_update_time means we've passed through
   3534 		 * migrate_task_rq_fair() indicating we migrated.
   3535 		 *
   3536 		 * IOW we're enqueueing a task on a new CPU.
   3537 		 */
   3538 		attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
   3539 		update_tg_load_avg(cfs_rq, 0);
   3540 
   3541 	} else if (decayed && (flags & UPDATE_TG))
   3542 		update_tg_load_avg(cfs_rq, 0);
   3543 }
   3544 
   3545 #ifndef CONFIG_64BIT
   3546 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
   3547 {
   3548 	u64 last_update_time_copy;
   3549 	u64 last_update_time;
   3550 
   3551 	do {
   3552 		last_update_time_copy = cfs_rq->load_last_update_time_copy;
   3553 		smp_rmb();
   3554 		last_update_time = cfs_rq->avg.last_update_time;
   3555 	} while (last_update_time != last_update_time_copy);
   3556 
   3557 	return last_update_time;
   3558 }
   3559 #else
   3560 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
   3561 {
   3562 	return cfs_rq->avg.last_update_time;
   3563 }
   3564 #endif
   3565 
   3566 /*
   3567  * Synchronize entity load avg of dequeued entity without locking
   3568  * the previous rq.
   3569  */
   3570 void sync_entity_load_avg(struct sched_entity *se)
   3571 {
   3572 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
   3573 	u64 last_update_time;
   3574 
   3575 	last_update_time = cfs_rq_last_update_time(cfs_rq);
   3576 	__update_load_avg_blocked_se(last_update_time, se);
   3577 }
   3578 
   3579 /*
   3580  * Task first catches up with cfs_rq, and then subtract
   3581  * itself from the cfs_rq (task must be off the queue now).
   3582  */
   3583 void remove_entity_load_avg(struct sched_entity *se)
   3584 {
   3585 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
   3586 	unsigned long flags;
   3587 
   3588 	/*
   3589 	 * tasks cannot exit without having gone through wake_up_new_task() ->
   3590 	 * post_init_entity_util_avg() which will have added things to the
   3591 	 * cfs_rq, so we can remove unconditionally.
   3592 	 */
   3593 
   3594 	sync_entity_load_avg(se);
   3595 
   3596 	raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
   3597 	++cfs_rq->removed.nr;
   3598 	cfs_rq->removed.util_avg	+= se->avg.util_avg;
   3599 	cfs_rq->removed.load_avg	+= se->avg.load_avg;
   3600 	cfs_rq->removed.runnable_sum	+= se->avg.load_sum; /* == runnable_sum */
   3601 	raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
   3602 }
   3603 
   3604 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
   3605 {
   3606 	return cfs_rq->avg.runnable_load_avg;
   3607 }
   3608 
   3609 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
   3610 {
   3611 	return cfs_rq->avg.load_avg;
   3612 }
   3613 
   3614 static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
   3615 
   3616 static inline unsigned long task_util(struct task_struct *p)
   3617 {
   3618 	return READ_ONCE(p->se.avg.util_avg);
   3619 }
   3620 
   3621 static inline unsigned long _task_util_est(struct task_struct *p)
   3622 {
   3623 	struct util_est ue = READ_ONCE(p->se.avg.util_est);
   3624 
   3625 	return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED);
   3626 }
   3627 
   3628 static inline unsigned long task_util_est(struct task_struct *p)
   3629 {
   3630 	return max(task_util(p), _task_util_est(p));
   3631 }
   3632 
   3633 static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
   3634 				    struct task_struct *p)
   3635 {
   3636 	unsigned int enqueued;
   3637 
   3638 	if (!sched_feat(UTIL_EST))
   3639 		return;
   3640 
   3641 	/* Update root cfs_rq's estimated utilization */
   3642 	enqueued  = cfs_rq->avg.util_est.enqueued;
   3643 	enqueued += _task_util_est(p);
   3644 	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
   3645 }
   3646 
   3647 /*
   3648  * Check if a (signed) value is within a specified (unsigned) margin,
   3649  * based on the observation that:
   3650  *
   3651  *     abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
   3652  *
   3653  * NOTE: this only works when value + maring < INT_MAX.
   3654  */
   3655 static inline bool within_margin(int value, int margin)
   3656 {
   3657 	return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
   3658 }
   3659 
   3660 static void
   3661 util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
   3662 {
   3663 	long last_ewma_diff;
   3664 	struct util_est ue;
   3665 	int cpu;
   3666 
   3667 	if (!sched_feat(UTIL_EST))
   3668 		return;
   3669 
   3670 	/* Update root cfs_rq's estimated utilization */
   3671 	ue.enqueued  = cfs_rq->avg.util_est.enqueued;
   3672 	ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p));
   3673 	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
   3674 
   3675 	/*
   3676 	 * Skip update of task's estimated utilization when the task has not
   3677 	 * yet completed an activation, e.g. being migrated.
   3678 	 */
   3679 	if (!task_sleep)
   3680 		return;
   3681 
   3682 	/*
   3683 	 * If the PELT values haven't changed since enqueue time,
   3684 	 * skip the util_est update.
   3685 	 */
   3686 	ue = p->se.avg.util_est;
   3687 	if (ue.enqueued & UTIL_AVG_UNCHANGED)
   3688 		return;
   3689 
   3690 	/*
   3691 	 * Skip update of task's estimated utilization when its EWMA is
   3692 	 * already ~1% close to its last activation value.
   3693 	 */
   3694 	ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
   3695 	last_ewma_diff = ue.enqueued - ue.ewma;
   3696 	if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
   3697 		return;
   3698 
   3699 	/*
   3700 	 * To avoid overestimation of actual task utilization, skip updates if
   3701 	 * we cannot grant there is idle time in this CPU.
   3702 	 */
   3703 	cpu = cpu_of(rq_of(cfs_rq));
   3704 	if (task_util(p) > capacity_orig_of(cpu))
   3705 		return;
   3706 
   3707 	/*
   3708 	 * Update Task's estimated utilization
   3709 	 *
   3710 	 * When *p completes an activation we can consolidate another sample
   3711 	 * of the task size. This is done by storing the current PELT value
   3712 	 * as ue.enqueued and by using this value to update the Exponential
   3713 	 * Weighted Moving Average (EWMA):
   3714 	 *
   3715 	 *  ewma(t) = w *  task_util(p) + (1-w) * ewma(t-1)
   3716 	 *          = w *  task_util(p) +         ewma(t-1)  - w * ewma(t-1)
   3717 	 *          = w * (task_util(p) -         ewma(t-1)) +     ewma(t-1)
   3718 	 *          = w * (      last_ewma_diff            ) +     ewma(t-1)
   3719 	 *          = w * (last_ewma_diff  +  ewma(t-1) / w)
   3720 	 *
   3721 	 * Where 'w' is the weight of new samples, which is configured to be
   3722 	 * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
   3723 	 */
   3724 	ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
   3725 	ue.ewma  += last_ewma_diff;
   3726 	ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
   3727 	WRITE_ONCE(p->se.avg.util_est, ue);
   3728 }
   3729 
   3730 static inline int task_fits_capacity(struct task_struct *p, long capacity)
   3731 {
   3732 	return capacity * 1024 > task_util_est(p) * capacity_margin;
   3733 }
   3734 
   3735 static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
   3736 {
   3737 	if (!static_branch_unlikely(&sched_asym_cpucapacity))
   3738 		return;
   3739 
   3740 	if (!p) {
   3741 		rq->misfit_task_load = 0;
   3742 		return;
   3743 	}
   3744 
   3745 	if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
   3746 		rq->misfit_task_load = 0;
   3747 		return;
   3748 	}
   3749 
   3750 	rq->misfit_task_load = task_h_load(p);
   3751 }
   3752 
   3753 #else /* CONFIG_SMP */
   3754 
   3755 #define UPDATE_TG	0x0
   3756 #define SKIP_AGE_LOAD	0x0
   3757 #define DO_ATTACH	0x0
   3758 
   3759 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
   3760 {
   3761 	cfs_rq_util_change(cfs_rq, 0);
   3762 }
   3763 
   3764 static inline void remove_entity_load_avg(struct sched_entity *se) {}
   3765 
   3766 static inline void
   3767 attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
   3768 static inline void
   3769 detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
   3770 
   3771 static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
   3772 {
   3773 	return 0;
   3774 }
   3775 
   3776 static inline void
   3777 util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
   3778 
   3779 static inline void
   3780 util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
   3781 		 bool task_sleep) {}
   3782 static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
   3783 
   3784 #endif /* CONFIG_SMP */
   3785 
   3786 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
   3787 {
   3788 #ifdef CONFIG_SCHED_DEBUG
   3789 	s64 d = se->vruntime - cfs_rq->min_vruntime;
   3790 
   3791 	if (d < 0)
   3792 		d = -d;
   3793 
   3794 	if (d > 3*sysctl_sched_latency)
   3795 		schedstat_inc(cfs_rq->nr_spread_over);
   3796 #endif
   3797 }
   3798 
   3799 static void
   3800 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
   3801 {
   3802 	u64 vruntime = cfs_rq->min_vruntime;
   3803 
   3804 	/*
   3805 	 * The 'current' period is already promised to the current tasks,
   3806 	 * however the extra weight of the new task will slow them down a
   3807 	 * little, place the new task so that it fits in the slot that
   3808 	 * stays open at the end.
   3809 	 */
   3810 	if (initial && sched_feat(START_DEBIT))
   3811 		vruntime += sched_vslice(cfs_rq, se);
   3812 
   3813 	/* sleeps up to a single latency don't count. */
   3814 	if (!initial) {
   3815 		unsigned long thresh = sysctl_sched_latency;
   3816 
   3817 		/*
   3818 		 * Halve their sleep time's effect, to allow
   3819 		 * for a gentler effect of sleepers:
   3820 		 */
   3821 		if (sched_feat(GENTLE_FAIR_SLEEPERS))
   3822 			thresh >>= 1;
   3823 
   3824 		vruntime -= thresh;
   3825 	}
   3826 
   3827 	/* ensure we never gain time by being placed backwards. */
   3828 	se->vruntime = max_vruntime(se->vruntime, vruntime);
   3829 }
   3830 
   3831 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
   3832 
   3833 static inline void check_schedstat_required(void)
   3834 {
   3835 #ifdef CONFIG_SCHEDSTATS
   3836 	if (schedstat_enabled())
   3837 		return;
   3838 
   3839 	/* Force schedstat enabled if a dependent tracepoint is active */
   3840 	if (trace_sched_stat_wait_enabled()    ||
   3841 			trace_sched_stat_sleep_enabled()   ||
   3842 			trace_sched_stat_iowait_enabled()  ||
   3843 			trace_sched_stat_blocked_enabled() ||
   3844 			trace_sched_stat_runtime_enabled())  {
   3845 		printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
   3846 			     "stat_blocked and stat_runtime require the "
   3847 			     "kernel parameter schedstats=enable or "
   3848 			     "kernel.sched_schedstats=1\n");
   3849 	}
   3850 #endif
   3851 }
   3852 
   3853 
   3854 /*
   3855  * MIGRATION
   3856  *
   3857  *	dequeue
   3858  *	  update_curr()
   3859  *	    update_min_vruntime()
   3860  *	  vruntime -= min_vruntime
   3861  *
   3862  *	enqueue
   3863  *	  update_curr()
   3864  *	    update_min_vruntime()
   3865  *	  vruntime += min_vruntime
   3866  *
   3867  * this way the vruntime transition between RQs is done when both
   3868  * min_vruntime are up-to-date.
   3869  *
   3870  * WAKEUP (remote)
   3871  *
   3872  *	->migrate_task_rq_fair() (p->state == TASK_WAKING)
   3873  *	  vruntime -= min_vruntime
   3874  *
   3875  *	enqueue
   3876  *	  update_curr()
   3877  *	    update_min_vruntime()
   3878  *	  vruntime += min_vruntime
   3879  *
   3880  * this way we don't have the most up-to-date min_vruntime on the originating
   3881  * CPU and an up-to-date min_vruntime on the destination CPU.
   3882  */
   3883 
   3884 static void
   3885 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
   3886 {
   3887 	bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
   3888 	bool curr = cfs_rq->curr == se;
   3889 
   3890 	/*
   3891 	 * If we're the current task, we must renormalise before calling
   3892 	 * update_curr().
   3893 	 */
   3894 	if (renorm && curr)
   3895 		se->vruntime += cfs_rq->min_vruntime;
   3896 
   3897 	update_curr(cfs_rq);
   3898 
   3899 	/*
   3900 	 * Otherwise, renormalise after, such that we're placed at the current
   3901 	 * moment in time, instead of some random moment in the past. Being
   3902 	 * placed in the past could significantly boost this task to the
   3903 	 * fairness detriment of existing tasks.
   3904 	 */
   3905 	if (renorm && !curr)
   3906 		se->vruntime += cfs_rq->min_vruntime;
   3907 
   3908 	/*
   3909 	 * When enqueuing a sched_entity, we must:
   3910 	 *   - Update loads to have both entity and cfs_rq synced with now.
   3911 	 *   - Add its load to cfs_rq->runnable_avg
   3912 	 *   - For group_entity, update its weight to reflect the new share of
   3913 	 *     its group cfs_rq
   3914 	 *   - Add its new weight to cfs_rq->load.weight
   3915 	 */
   3916 	update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
   3917 	update_cfs_group(se);
   3918 	enqueue_runnable_load_avg(cfs_rq, se);
   3919 	account_entity_enqueue(cfs_rq, se);
   3920 
   3921 	if (flags & ENQUEUE_WAKEUP)
   3922 		place_entity(cfs_rq, se, 0);
   3923 
   3924 	check_schedstat_required();
   3925 	update_stats_enqueue(cfs_rq, se, flags);
   3926 	check_spread(cfs_rq, se);
   3927 	if (!curr)
   3928 		__enqueue_entity(cfs_rq, se);
   3929 	se->on_rq = 1;
   3930 
   3931 	if (cfs_rq->nr_running == 1) {
   3932 		list_add_leaf_cfs_rq(cfs_rq);
   3933 		check_enqueue_throttle(cfs_rq);
   3934 	}
   3935 }
   3936 
   3937 static void __clear_buddies_last(struct sched_entity *se)
   3938 {
   3939 	for_each_sched_entity(se) {
   3940 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
   3941 		if (cfs_rq->last != se)
   3942 			break;
   3943 
   3944 		cfs_rq->last = NULL;
   3945 	}
   3946 }
   3947 
   3948 static void __clear_buddies_next(struct sched_entity *se)
   3949 {
   3950 	for_each_sched_entity(se) {
   3951 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
   3952 		if (cfs_rq->next != se)
   3953 			break;
   3954 
   3955 		cfs_rq->next = NULL;
   3956 	}
   3957 }
   3958 
   3959 static void __clear_buddies_skip(struct sched_entity *se)
   3960 {
   3961 	for_each_sched_entity(se) {
   3962 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
   3963 		if (cfs_rq->skip != se)
   3964 			break;
   3965 
   3966 		cfs_rq->skip = NULL;
   3967 	}
   3968 }
   3969 
   3970 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
   3971 {
   3972 	if (cfs_rq->last == se)
   3973 		__clear_buddies_last(se);
   3974 
   3975 	if (cfs_rq->next == se)
   3976 		__clear_buddies_next(se);
   3977 
   3978 	if (cfs_rq->skip == se)
   3979 		__clear_buddies_skip(se);
   3980 }
   3981 
   3982 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
   3983 
   3984 static void
   3985 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
   3986 {
   3987 	/*
   3988 	 * Update run-time statistics of the 'current'.
   3989 	 */
   3990 	update_curr(cfs_rq);
   3991 
   3992 	/*
   3993 	 * When dequeuing a sched_entity, we must:
   3994 	 *   - Update loads to have both entity and cfs_rq synced with now.
   3995 	 *   - Subtract its load from the cfs_rq->runnable_avg.
   3996 	 *   - Subtract its previous weight from cfs_rq->load.weight.
   3997 	 *   - For group entity, update its weight to reflect the new share
   3998 	 *     of its group cfs_rq.
   3999 	 */
   4000 	update_load_avg(cfs_rq, se, UPDATE_TG);
   4001 	dequeue_runnable_load_avg(cfs_rq, se);
   4002 
   4003 	update_stats_dequeue(cfs_rq, se, flags);
   4004 
   4005 	clear_buddies(cfs_rq, se);
   4006 
   4007 	if (se != cfs_rq->curr)
   4008 		__dequeue_entity(cfs_rq, se);
   4009 	se->on_rq = 0;
   4010 	account_entity_dequeue(cfs_rq, se);
   4011 
   4012 	/*
   4013 	 * Normalize after update_curr(); which will also have moved
   4014 	 * min_vruntime if @se is the one holding it back. But before doing
   4015 	 * update_min_vruntime() again, which will discount @se's position and
   4016 	 * can move min_vruntime forward still more.
   4017 	 */
   4018 	if (!(flags & DEQUEUE_SLEEP))
   4019 		se->vruntime -= cfs_rq->min_vruntime;
   4020 
   4021 	/* return excess runtime on last dequeue */
   4022 	return_cfs_rq_runtime(cfs_rq);
   4023 
   4024 	update_cfs_group(se);
   4025 
   4026 	/*
   4027 	 * Now advance min_vruntime if @se was the entity holding it back,
   4028 	 * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
   4029 	 * put back on, and if we advance min_vruntime, we'll be placed back
   4030 	 * further than we started -- ie. we'll be penalized.
   4031 	 */
   4032 	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
   4033 		update_min_vruntime(cfs_rq);
   4034 }
   4035 
   4036 /*
   4037  * Preempt the current task with a newly woken task if needed:
   4038  */
   4039 static void
   4040 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
   4041 {
   4042 	unsigned long ideal_runtime, delta_exec;
   4043 	struct sched_entity *se;
   4044 	s64 delta;
   4045 
   4046 	ideal_runtime = sched_slice(cfs_rq, curr);
   4047 	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
   4048 	if (delta_exec > ideal_runtime) {
   4049 		resched_curr(rq_of(cfs_rq));
   4050 		/*
   4051 		 * The current task ran long enough, ensure it doesn't get
   4052 		 * re-elected due to buddy favours.
   4053 		 */
   4054 		clear_buddies(cfs_rq, curr);
   4055 		return;
   4056 	}
   4057 
   4058 	/*
   4059 	 * Ensure that a task that missed wakeup preemption by a
   4060 	 * narrow margin doesn't have to wait for a full slice.
   4061 	 * This also mitigates buddy induced latencies under load.
   4062 	 */
   4063 	if (delta_exec < sysctl_sched_min_granularity)
   4064 		return;
   4065 
   4066 	se = __pick_first_entity(cfs_rq);
   4067 	delta = curr->vruntime - se->vruntime;
   4068 
   4069 	if (delta < 0)
   4070 		return;
   4071 
   4072 	if (delta > ideal_runtime)
   4073 		resched_curr(rq_of(cfs_rq));
   4074 }
   4075 
   4076 static void
   4077 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
   4078 {
   4079 	/* 'current' is not kept within the tree. */
   4080 	if (se->on_rq) {
   4081 		/*
   4082 		 * Any task has to be enqueued before it get to execute on
   4083 		 * a CPU. So account for the time it spent waiting on the
   4084 		 * runqueue.
   4085 		 */
   4086 		update_stats_wait_end(cfs_rq, se);
   4087 		__dequeue_entity(cfs_rq, se);
   4088 		update_load_avg(cfs_rq, se, UPDATE_TG);
   4089 	}
   4090 
   4091 	update_stats_curr_start(cfs_rq, se);
   4092 	cfs_rq->curr = se;
   4093 
   4094 	/*
   4095 	 * Track our maximum slice length, if the CPU's load is at
   4096 	 * least twice that of our own weight (i.e. dont track it
   4097 	 * when there are only lesser-weight tasks around):
   4098 	 */
   4099 	if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
   4100 		schedstat_set(se->statistics.slice_max,
   4101 			max((u64)schedstat_val(se->statistics.slice_max),
   4102 			    se->sum_exec_runtime - se->prev_sum_exec_runtime));
   4103 	}
   4104 
   4105 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
   4106 }
   4107 
   4108 static int
   4109 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
   4110 
   4111 /*
   4112  * Pick the next process, keeping these things in mind, in this order:
   4113  * 1) keep things fair between processes/task groups
   4114  * 2) pick the "next" process, since someone really wants that to run
   4115  * 3) pick the "last" process, for cache locality
   4116  * 4) do not run the "skip" process, if something else is available
   4117  */
   4118 static struct sched_entity *
   4119 pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
   4120 {
   4121 	struct sched_entity *left = __pick_first_entity(cfs_rq);
   4122 	struct sched_entity *se;
   4123 
   4124 	/*
   4125 	 * If curr is set we have to see if its left of the leftmost entity
   4126 	 * still in the tree, provided there was anything in the tree at all.
   4127 	 */
   4128 	if (!left || (curr && entity_before(curr, left)))
   4129 		left = curr;
   4130 
   4131 	se = left; /* ideally we run the leftmost entity */
   4132 
   4133 	/*
   4134 	 * Avoid running the skip buddy, if running something else can
   4135 	 * be done without getting too unfair.
   4136 	 */
   4137 	if (cfs_rq->skip == se) {
   4138 		struct sched_entity *second;
   4139 
   4140 		if (se == curr) {
   4141 			second = __pick_first_entity(cfs_rq);
   4142 		} else {
   4143 			second = __pick_next_entity(se);
   4144 			if (!second || (curr && entity_before(curr, second)))
   4145 				second = curr;
   4146 		}
   4147 
   4148 		if (second && wakeup_preempt_entity(second, left) < 1)
   4149 			se = second;
   4150 	}
   4151 
   4152 	/*
   4153 	 * Prefer last buddy, try to return the CPU to a preempted task.
   4154 	 */
   4155 	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
   4156 		se = cfs_rq->last;
   4157 
   4158 	/*
   4159 	 * Someone really wants this to run. If it's not unfair, run it.
   4160 	 */
   4161 	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
   4162 		se = cfs_rq->next;
   4163 
   4164 	clear_buddies(cfs_rq, se);
   4165 
   4166 	return se;
   4167 }
   4168 
   4169 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
   4170 
   4171 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
   4172 {
   4173 	/*
   4174 	 * If still on the runqueue then deactivate_task()
   4175 	 * was not called and update_curr() has to be done:
   4176 	 */
   4177 	if (prev->on_rq)
   4178 		update_curr(cfs_rq);
   4179 
   4180 	/* throttle cfs_rqs exceeding runtime */
   4181 	check_cfs_rq_runtime(cfs_rq);
   4182 
   4183 	check_spread(cfs_rq, prev);
   4184 
   4185 	if (prev->on_rq) {
   4186 		update_stats_wait_start(cfs_rq, prev);
   4187 		/* Put 'current' back into the tree. */
   4188 		__enqueue_entity(cfs_rq, prev);
   4189 		/* in !on_rq case, update occurred at dequeue */
   4190 		update_load_avg(cfs_rq, prev, 0);
   4191 	}
   4192 	cfs_rq->curr = NULL;
   4193 }
   4194 
   4195 static void
   4196 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
   4197 {
   4198 	/*
   4199 	 * Update run-time statistics of the 'current'.
   4200 	 */
   4201 	update_curr(cfs_rq);
   4202 
   4203 	/*
   4204 	 * Ensure that runnable average is periodically updated.
   4205 	 */
   4206 	update_load_avg(cfs_rq, curr, UPDATE_TG);
   4207 	update_cfs_group(curr);
   4208 
   4209 #ifdef CONFIG_SCHED_HRTICK
   4210 	/*
   4211 	 * queued ticks are scheduled to match the slice, so don't bother
   4212 	 * validating it and just reschedule.
   4213 	 */
   4214 	if (queued) {
   4215 		resched_curr(rq_of(cfs_rq));
   4216 		return;
   4217 	}
   4218 	/*
   4219 	 * don't let the period tick interfere with the hrtick preemption
   4220 	 */
   4221 	if (!sched_feat(DOUBLE_TICK) &&
   4222 			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
   4223 		return;
   4224 #endif
   4225 
   4226 	if (cfs_rq->nr_running > 1)
   4227 		check_preempt_tick(cfs_rq, curr);
   4228 }
   4229 
   4230 
   4231 /**************************************************
   4232  * CFS bandwidth control machinery
   4233  */
   4234 
   4235 #ifdef CONFIG_CFS_BANDWIDTH
   4236 
   4237 #ifdef CONFIG_JUMP_LABEL
   4238 static struct static_key __cfs_bandwidth_used;
   4239 
   4240 static inline bool cfs_bandwidth_used(void)
   4241 {
   4242 	return static_key_false(&__cfs_bandwidth_used);
   4243 }
   4244 
   4245 void cfs_bandwidth_usage_inc(void)
   4246 {
   4247 	static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
   4248 }
   4249 
   4250 void cfs_bandwidth_usage_dec(void)
   4251 {
   4252 	static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
   4253 }
   4254 #else /* CONFIG_JUMP_LABEL */
   4255 static bool cfs_bandwidth_used(void)
   4256 {
   4257 	return true;
   4258 }
   4259 
   4260 void cfs_bandwidth_usage_inc(void) {}
   4261 void cfs_bandwidth_usage_dec(void) {}
   4262 #endif /* CONFIG_JUMP_LABEL */
   4263 
   4264 /*
   4265  * default period for cfs group bandwidth.
   4266  * default: 0.1s, units: nanoseconds
   4267  */
   4268 static inline u64 default_cfs_period(void)
   4269 {
   4270 	return 100000000ULL;
   4271 }
   4272 
   4273 static inline u64 sched_cfs_bandwidth_slice(void)
   4274 {
   4275 	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
   4276 }
   4277 
   4278 /*
   4279  * Replenish runtime according to assigned quota and update expiration time.
   4280  * We use sched_clock_cpu directly instead of rq->clock to avoid adding
   4281  * additional synchronization around rq->lock.
   4282  *
   4283  * requires cfs_b->lock
   4284  */
   4285 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
   4286 {
   4287 	u64 now;
   4288 
   4289 	if (cfs_b->quota == RUNTIME_INF)
   4290 		return;
   4291 
   4292 	now = sched_clock_cpu(smp_processor_id());
   4293 	cfs_b->runtime = cfs_b->quota;
   4294 	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
   4295 	cfs_b->expires_seq++;
   4296 }
   4297 
   4298 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
   4299 {
   4300 	return &tg->cfs_bandwidth;
   4301 }
   4302 
   4303 /* rq->task_clock normalized against any time this cfs_rq has spent throttled */
   4304 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
   4305 {
   4306 	if (unlikely(cfs_rq->throttle_count))
   4307 		return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
   4308 
   4309 	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
   4310 }
   4311 
   4312 /* returns 0 on failure to allocate runtime */
   4313 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
   4314 {
   4315 	struct task_group *tg = cfs_rq->tg;
   4316 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
   4317 	u64 amount = 0, min_amount, expires;
   4318 	int expires_seq;
   4319 
   4320 	/* note: this is a positive sum as runtime_remaining <= 0 */
   4321 	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
   4322 
   4323 	raw_spin_lock(&cfs_b->lock);
   4324 	if (cfs_b->quota == RUNTIME_INF)
   4325 		amount = min_amount;
   4326 	else {
   4327 		start_cfs_bandwidth(cfs_b);
   4328 
   4329 		if (cfs_b->runtime > 0) {
   4330 			amount = min(cfs_b->runtime, min_amount);
   4331 			cfs_b->runtime -= amount;
   4332 			cfs_b->idle = 0;
   4333 		}
   4334 	}
   4335 	expires_seq = cfs_b->expires_seq;
   4336 	expires = cfs_b->runtime_expires;
   4337 	raw_spin_unlock(&cfs_b->lock);
   4338 
   4339 	cfs_rq->runtime_remaining += amount;
   4340 	/*
   4341 	 * we may have advanced our local expiration to account for allowed
   4342 	 * spread between our sched_clock and the one on which runtime was
   4343 	 * issued.
   4344 	 */
   4345 	if (cfs_rq->expires_seq != expires_seq) {
   4346 		cfs_rq->expires_seq = expires_seq;
   4347 		cfs_rq->runtime_expires = expires;
   4348 	}
   4349 
   4350 	return cfs_rq->runtime_remaining > 0;
   4351 }
   4352 
   4353 /*
   4354  * Note: This depends on the synchronization provided by sched_clock and the
   4355  * fact that rq->clock snapshots this value.
   4356  */
   4357 static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
   4358 {
   4359 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
   4360 
   4361 	/* if the deadline is ahead of our clock, nothing to do */
   4362 	if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
   4363 		return;
   4364 
   4365 	if (cfs_rq->runtime_remaining < 0)
   4366 		return;
   4367 
   4368 	/*
   4369 	 * If the local deadline has passed we have to consider the
   4370 	 * possibility that our sched_clock is 'fast' and the global deadline
   4371 	 * has not truly expired.
   4372 	 *
   4373 	 * Fortunately we can check determine whether this the case by checking
   4374 	 * whether the global deadline(cfs_b->expires_seq) has advanced.
   4375 	 */
   4376 	if (cfs_rq->expires_seq == cfs_b->expires_seq) {
   4377 		/* extend local deadline, drift is bounded above by 2 ticks */
   4378 		cfs_rq->runtime_expires += TICK_NSEC;
   4379 	} else {
   4380 		/* global deadline is ahead, expiration has passed */
   4381 		cfs_rq->runtime_remaining = 0;
   4382 	}
   4383 }
   4384 
   4385 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
   4386 {
   4387 	/* dock delta_exec before expiring quota (as it could span periods) */
   4388 	cfs_rq->runtime_remaining -= delta_exec;
   4389 	expire_cfs_rq_runtime(cfs_rq);
   4390 
   4391 	if (likely(cfs_rq->runtime_remaining > 0))
   4392 		return;
   4393 
   4394 	/*
   4395 	 * if we're unable to extend our runtime we resched so that the active
   4396 	 * hierarchy can be throttled
   4397 	 */
   4398 	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
   4399 		resched_curr(rq_of(cfs_rq));
   4400 }
   4401 
   4402 static __always_inline
   4403 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
   4404 {
   4405 	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
   4406 		return;
   4407 
   4408 	__account_cfs_rq_runtime(cfs_rq, delta_exec);
   4409 }
   4410 
   4411 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
   4412 {
   4413 	return cfs_bandwidth_used() && cfs_rq->throttled;
   4414 }
   4415 
   4416 /* check whether cfs_rq, or any parent, is throttled */
   4417 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
   4418 {
   4419 	return cfs_bandwidth_used() && cfs_rq->throttle_count;
   4420 }
   4421 
   4422 /*
   4423  * Ensure that neither of the group entities corresponding to src_cpu or
   4424  * dest_cpu are members of a throttled hierarchy when performing group
   4425  * load-balance operations.
   4426  */
   4427 static inline int throttled_lb_pair(struct task_group *tg,
   4428 				    int src_cpu, int dest_cpu)
   4429 {
   4430 	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
   4431 
   4432 	src_cfs_rq = tg->cfs_rq[src_cpu];
   4433 	dest_cfs_rq = tg->cfs_rq[dest_cpu];
   4434 
   4435 	return throttled_hierarchy(src_cfs_rq) ||
   4436 	       throttled_hierarchy(dest_cfs_rq);
   4437 }
   4438 
   4439 static int tg_unthrottle_up(struct task_group *tg, void *data)
   4440 {
   4441 	struct rq *rq = data;
   4442 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
   4443 
   4444 	cfs_rq->throttle_count--;
   4445 	if (!cfs_rq->throttle_count) {
   4446 		/* adjust cfs_rq_clock_task() */
   4447 		cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
   4448 					     cfs_rq->throttled_clock_task;
   4449 
   4450 		/* Add cfs_rq with already running entity in the list */
   4451 		if (cfs_rq->nr_running >= 1)
   4452 			list_add_leaf_cfs_rq(cfs_rq);
   4453 	}
   4454 
   4455 	return 0;
   4456 }
   4457 
   4458 static int tg_throttle_down(struct task_group *tg, void *data)
   4459 {
   4460 	struct rq *rq = data;
   4461 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
   4462 
   4463 	/* group is entering throttled state, stop time */
   4464 	if (!cfs_rq->throttle_count) {
   4465 		cfs_rq->throttled_clock_task = rq_clock_task(rq);
   4466 		list_del_leaf_cfs_rq(cfs_rq);
   4467 	}
   4468 	cfs_rq->throttle_count++;
   4469 
   4470 	return 0;
   4471 }
   4472 
   4473 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
   4474 {
   4475 	struct rq *rq = rq_of(cfs_rq);
   4476 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
   4477 	struct sched_entity *se;
   4478 	long task_delta, dequeue = 1;
   4479 	bool empty;
   4480 
   4481 	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
   4482 
   4483 	/* freeze hierarchy runnable averages while throttled */
   4484 	rcu_read_lock();
   4485 	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
   4486 	rcu_read_unlock();
   4487 
   4488 	task_delta = cfs_rq->h_nr_running;
   4489 	for_each_sched_entity(se) {
   4490 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
   4491 		/* throttled entity or throttle-on-deactivate */
   4492 		if (!se->on_rq)
   4493 			break;
   4494 
   4495 		if (dequeue)
   4496 			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
   4497 		qcfs_rq->h_nr_running -= task_delta;
   4498 
   4499 		if (qcfs_rq->load.weight)
   4500 			dequeue = 0;
   4501 	}
   4502 
   4503 	if (!se)
   4504 		sub_nr_running(rq, task_delta);
   4505 
   4506 	cfs_rq->throttled = 1;
   4507 	cfs_rq->throttled_clock = rq_clock(rq);
   4508 	raw_spin_lock(&cfs_b->lock);
   4509 	empty = list_empty(&cfs_b->throttled_cfs_rq);
   4510 
   4511 	/*
   4512 	 * Add to the _head_ of the list, so that an already-started
   4513 	 * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is
   4514 	 * not running add to the tail so that later runqueues don't get starved.
   4515 	 */
   4516 	if (cfs_b->distribute_running)
   4517 		list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
   4518 	else
   4519 		list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
   4520 
   4521 	/*
   4522 	 * If we're the first throttled task, make sure the bandwidth
   4523 	 * timer is running.
   4524 	 */
   4525 	if (empty)
   4526 		start_cfs_bandwidth(cfs_b);
   4527 
   4528 	raw_spin_unlock(&cfs_b->lock);
   4529 }
   4530 
   4531 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
   4532 {
   4533 	struct rq *rq = rq_of(cfs_rq);
   4534 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
   4535 	struct sched_entity *se;
   4536 	int enqueue = 1;
   4537 	long task_delta;
   4538 
   4539 	se = cfs_rq->tg->se[cpu_of(rq)];
   4540 
   4541 	cfs_rq->throttled = 0;
   4542 
   4543 	update_rq_clock(rq);
   4544 
   4545 	raw_spin_lock(&cfs_b->lock);
   4546 	cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
   4547 	list_del_rcu(&cfs_rq->throttled_list);
   4548 	raw_spin_unlock(&cfs_b->lock);
   4549 
   4550 	/* update hierarchical throttle state */
   4551 	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
   4552 
   4553 	if (!cfs_rq->load.weight)
   4554 		return;
   4555 
   4556 	task_delta = cfs_rq->h_nr_running;
   4557 	for_each_sched_entity(se) {
   4558 		if (se->on_rq)
   4559 			enqueue = 0;
   4560 
   4561 		cfs_rq = cfs_rq_of(se);
   4562 		if (enqueue)
   4563 			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
   4564 		cfs_rq->h_nr_running += task_delta;
   4565 
   4566 		if (cfs_rq_throttled(cfs_rq))
   4567 			break;
   4568 	}
   4569 
   4570 	assert_list_leaf_cfs_rq(rq);
   4571 
   4572 	if (!se)
   4573 		add_nr_running(rq, task_delta);
   4574 
   4575 	/* Determine whether we need to wake up potentially idle CPU: */
   4576 	if (rq->curr == rq->idle && rq->cfs.nr_running)
   4577 		resched_curr(rq);
   4578 }
   4579 
   4580 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
   4581 		u64 remaining, u64 expires)
   4582 {
   4583 	struct cfs_rq *cfs_rq;
   4584 	u64 runtime;
   4585 	u64 starting_runtime = remaining;
   4586 
   4587 	rcu_read_lock();
   4588 	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
   4589 				throttled_list) {
   4590 		struct rq *rq = rq_of(cfs_rq);
   4591 		struct rq_flags rf;
   4592 
   4593 		rq_lock_irqsave(rq, &rf);
   4594 		if (!cfs_rq_throttled(cfs_rq))
   4595 			goto next;
   4596 
   4597 		runtime = -cfs_rq->runtime_remaining + 1;
   4598 		if (runtime > remaining)
   4599 			runtime = remaining;
   4600 		remaining -= runtime;
   4601 
   4602 		cfs_rq->runtime_remaining += runtime;
   4603 		cfs_rq->runtime_expires = expires;
   4604 
   4605 		/* we check whether we're throttled above */
   4606 		if (cfs_rq->runtime_remaining > 0)
   4607 			unthrottle_cfs_rq(cfs_rq);
   4608 
   4609 next:
   4610 		rq_unlock_irqrestore(rq, &rf);
   4611 
   4612 		if (!remaining)
   4613 			break;
   4614 	}
   4615 	rcu_read_unlock();
   4616 
   4617 	return starting_runtime - remaining;
   4618 }
   4619 
   4620 /*
   4621  * Responsible for refilling a task_group's bandwidth and unthrottling its
   4622  * cfs_rqs as appropriate. If there has been no activity within the last
   4623  * period the timer is deactivated until scheduling resumes; cfs_b->idle is
   4624  * used to track this state.
   4625  */
   4626 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
   4627 {
   4628 	u64 runtime, runtime_expires;
   4629 	int throttled;
   4630 
   4631 	/* no need to continue the timer with no bandwidth constraint */
   4632 	if (cfs_b->quota == RUNTIME_INF)
   4633 		goto out_deactivate;
   4634 
   4635 	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
   4636 	cfs_b->nr_periods += overrun;
   4637 
   4638 	/*
   4639 	 * idle depends on !throttled (for the case of a large deficit), and if
   4640 	 * we're going inactive then everything else can be deferred
   4641 	 */
   4642 	if (cfs_b->idle && !throttled)
   4643 		goto out_deactivate;
   4644 
   4645 	__refill_cfs_bandwidth_runtime(cfs_b);
   4646 
   4647 	if (!throttled) {
   4648 		/* mark as potentially idle for the upcoming period */
   4649 		cfs_b->idle = 1;
   4650 		return 0;
   4651 	}
   4652 
   4653 	/* account preceding periods in which throttling occurred */
   4654 	cfs_b->nr_throttled += overrun;
   4655 
   4656 	runtime_expires = cfs_b->runtime_expires;
   4657 
   4658 	/*
   4659 	 * This check is repeated as we are holding onto the new bandwidth while
   4660 	 * we unthrottle. This can potentially race with an unthrottled group
   4661 	 * trying to acquire new bandwidth from the global pool. This can result
   4662 	 * in us over-using our runtime if it is all used during this loop, but
   4663 	 * only by limited amounts in that extreme case.
   4664 	 */
   4665 	while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
   4666 		runtime = cfs_b->runtime;
   4667 		cfs_b->distribute_running = 1;
   4668 		raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
   4669 		/* we can't nest cfs_b->lock while distributing bandwidth */
   4670 		runtime = distribute_cfs_runtime(cfs_b, runtime,
   4671 						 runtime_expires);
   4672 		raw_spin_lock_irqsave(&cfs_b->lock, flags);
   4673 
   4674 		cfs_b->distribute_running = 0;
   4675 		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
   4676 
   4677 		lsub_positive(&cfs_b->runtime, runtime);
   4678 	}
   4679 
   4680 	/*
   4681 	 * While we are ensured activity in the period following an
   4682 	 * unthrottle, this also covers the case in which the new bandwidth is
   4683 	 * insufficient to cover the existing bandwidth deficit.  (Forcing the
   4684 	 * timer to remain active while there are any throttled entities.)
   4685 	 */
   4686 	cfs_b->idle = 0;
   4687 
   4688 	return 0;
   4689 
   4690 out_deactivate:
   4691 	return 1;
   4692 }
   4693 
   4694 /* a cfs_rq won't donate quota below this amount */
   4695 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
   4696 /* minimum remaining period time to redistribute slack quota */
   4697 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
   4698 /* how long we wait to gather additional slack before distributing */
   4699 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
   4700 
   4701 /*
   4702  * Are we near the end of the current quota period?
   4703  *
   4704  * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
   4705  * hrtimer base being cleared by hrtimer_start. In the case of
   4706  * migrate_hrtimers, base is never cleared, so we are fine.
   4707  */
   4708 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
   4709 {
   4710 	struct hrtimer *refresh_timer = &cfs_b->period_timer;
   4711 	u64 remaining;
   4712 
   4713 	/* if the call-back is running a quota refresh is already occurring */
   4714 	if (hrtimer_callback_running(refresh_timer))
   4715 		return 1;
   4716 
   4717 	/* is a quota refresh about to occur? */
   4718 	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
   4719 	if (remaining < min_expire)
   4720 		return 1;
   4721 
   4722 	return 0;
   4723 }
   4724 
   4725 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
   4726 {
   4727 	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
   4728 
   4729 	/* if there's a quota refresh soon don't bother with slack */
   4730 	if (runtime_refresh_within(cfs_b, min_left))
   4731 		return;
   4732 
   4733 	hrtimer_start(&cfs_b->slack_timer,
   4734 			ns_to_ktime(cfs_bandwidth_slack_period),
   4735 			HRTIMER_MODE_REL);
   4736 }
   4737 
   4738 /* we know any runtime found here is valid as update_curr() precedes return */
   4739 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
   4740 {
   4741 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
   4742 	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
   4743 
   4744 	if (slack_runtime <= 0)
   4745 		return;
   4746 
   4747 	raw_spin_lock(&cfs_b->lock);
   4748 	if (cfs_b->quota != RUNTIME_INF &&
   4749 	    cfs_rq->runtime_expires == cfs_b->runtime_expires) {
   4750 		cfs_b->runtime += slack_runtime;
   4751 
   4752 		/* we are under rq->lock, defer unthrottling using a timer */
   4753 		if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
   4754 		    !list_empty(&cfs_b->throttled_cfs_rq))
   4755 			start_cfs_slack_bandwidth(cfs_b);
   4756 	}
   4757 	raw_spin_unlock(&cfs_b->lock);
   4758 
   4759 	/* even if it's not valid for return we don't want to try again */
   4760 	cfs_rq->runtime_remaining -= slack_runtime;
   4761 }
   4762 
   4763 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
   4764 {
   4765 	if (!cfs_bandwidth_used())
   4766 		return;
   4767 
   4768 	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
   4769 		return;
   4770 
   4771 	__return_cfs_rq_runtime(cfs_rq);
   4772 }
   4773 
   4774 /*
   4775  * This is done with a timer (instead of inline with bandwidth return) since
   4776  * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
   4777  */
   4778 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
   4779 {
   4780 	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
   4781 	unsigned long flags;
   4782 	u64 expires;
   4783 
   4784 	/* confirm we're still not at a refresh boundary */
   4785 	raw_spin_lock_irqsave(&cfs_b->lock, flags);
   4786 	if (cfs_b->distribute_running) {
   4787 		raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
   4788 		return;
   4789 	}
   4790 
   4791 	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
   4792 		raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
   4793 		return;
   4794 	}
   4795 
   4796 	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
   4797 		runtime = cfs_b->runtime;
   4798 
   4799 	expires = cfs_b->runtime_expires;
   4800 	if (runtime)
   4801 		cfs_b->distribute_running = 1;
   4802 
   4803 	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
   4804 
   4805 	if (!runtime)
   4806 		return;
   4807 
   4808 	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
   4809 
   4810 	raw_spin_lock_irqsave(&cfs_b->lock, flags);
   4811 	if (expires == cfs_b->runtime_expires)
   4812 		lsub_positive(&cfs_b->runtime, runtime);
   4813 	cfs_b->distribute_running = 0;
   4814 	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
   4815 }
   4816 
   4817 /*
   4818  * When a group wakes up we want to make sure that its quota is not already
   4819  * expired/exceeded, otherwise it may be allowed to steal additional ticks of
   4820  * runtime as update_curr() throttling can not not trigger until it's on-rq.
   4821  */
   4822 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
   4823 {
   4824 	if (!cfs_bandwidth_used())
   4825 		return;
   4826 
   4827 	/* an active group must be handled by the update_curr()->put() path */
   4828 	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
   4829 		return;
   4830 
   4831 	/* ensure the group is not already throttled */
   4832 	if (cfs_rq_throttled(cfs_rq))
   4833 		return;
   4834 
   4835 	/* update runtime allocation */
   4836 	account_cfs_rq_runtime(cfs_rq, 0);
   4837 	if (cfs_rq->runtime_remaining <= 0)
   4838 		throttle_cfs_rq(cfs_rq);
   4839 }
   4840 
   4841 static void sync_throttle(struct task_group *tg, int cpu)
   4842 {
   4843 	struct cfs_rq *pcfs_rq, *cfs_rq;
   4844 
   4845 	if (!cfs_bandwidth_used())
   4846 		return;
   4847 
   4848 	if (!tg->parent)
   4849 		return;
   4850 
   4851 	cfs_rq = tg->cfs_rq[cpu];
   4852 	pcfs_rq = tg->parent->cfs_rq[cpu];
   4853 
   4854 	cfs_rq->throttle_count = pcfs_rq->throttle_count;
   4855 	cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
   4856 }
   4857 
   4858 /* conditionally throttle active cfs_rq's from put_prev_entity() */
   4859 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
   4860 {
   4861 	if (!cfs_bandwidth_used())
   4862 		return false;
   4863 
   4864 	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
   4865 		return false;
   4866 
   4867 	/*
   4868 	 * it's possible for a throttled entity to be forced into a running
   4869 	 * state (e.g. set_curr_task), in this case we're finished.
   4870 	 */
   4871 	if (cfs_rq_throttled(cfs_rq))
   4872 		return true;
   4873 
   4874 	throttle_cfs_rq(cfs_rq);
   4875 	return true;
   4876 }
   4877 
   4878 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
   4879 {
   4880 	struct cfs_bandwidth *cfs_b =
   4881 		container_of(timer, struct cfs_bandwidth, slack_timer);
   4882 
   4883 	do_sched_cfs_slack_timer(cfs_b);
   4884 
   4885 	return HRTIMER_NORESTART;
   4886 }
   4887 
   4888 extern const u64 max_cfs_quota_period;
   4889 
   4890 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
   4891 {
   4892 	struct cfs_bandwidth *cfs_b =
   4893 		container_of(timer, struct cfs_bandwidth, period_timer);
   4894 	unsigned long flags;
   4895 	int overrun;
   4896 	int idle = 0;
   4897 	int count = 0;
   4898 
   4899 	raw_spin_lock_irqsave(&cfs_b->lock, flags);
   4900 	for (;;) {
   4901 		overrun = hrtimer_forward_now(timer, cfs_b->period);
   4902 		if (!overrun)
   4903 			break;
   4904 
   4905 		if (++count > 3) {
   4906 			u64 new, old = ktime_to_ns(cfs_b->period);
   4907 
   4908 			new = (old * 147) / 128; /* ~115% */
   4909 			new = min(new, max_cfs_quota_period);
   4910 
   4911 			cfs_b->period = ns_to_ktime(new);
   4912 
   4913 			/* since max is 1s, this is limited to 1e9^2, which fits in u64 */
   4914 			cfs_b->quota *= new;
   4915 			cfs_b->quota = div64_u64(cfs_b->quota, old);
   4916 
   4917 			pr_warn_ratelimited(
   4918 	"cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
   4919 				smp_processor_id(),
   4920 				div_u64(new, NSEC_PER_USEC),
   4921 				div_u64(cfs_b->quota, NSEC_PER_USEC));
   4922 
   4923 			/* reset count so we don't come right back in here */
   4924 			count = 0;
   4925 		}
   4926 
   4927 		idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
   4928 	}
   4929 	if (idle)
   4930 		cfs_b->period_active = 0;
   4931 	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
   4932 
   4933 	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
   4934 }
   4935 
   4936 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
   4937 {
   4938 	raw_spin_lock_init(&cfs_b->lock);
   4939 	cfs_b->runtime = 0;
   4940 	cfs_b->quota = RUNTIME_INF;
   4941 	cfs_b->period = ns_to_ktime(default_cfs_period());
   4942 
   4943 	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
   4944 	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
   4945 	cfs_b->period_timer.function = sched_cfs_period_timer;
   4946 	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
   4947 	cfs_b->slack_timer.function = sched_cfs_slack_timer;
   4948 	cfs_b->distribute_running = 0;
   4949 }
   4950 
   4951 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
   4952 {
   4953 	cfs_rq->runtime_enabled = 0;
   4954 	INIT_LIST_HEAD(&cfs_rq->throttled_list);
   4955 }
   4956 
   4957 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
   4958 {
   4959 	u64 overrun;
   4960 
   4961 	lockdep_assert_held(&cfs_b->lock);
   4962 
   4963 	if (cfs_b->period_active)
   4964 		return;
   4965 
   4966 	cfs_b->period_active = 1;
   4967 	overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
   4968 	cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period);
   4969 	cfs_b->expires_seq++;
   4970 	hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
   4971 }
   4972 
   4973 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
   4974 {
   4975 	/* init_cfs_bandwidth() was not called */
   4976 	if (!cfs_b->throttled_cfs_rq.next)
   4977 		return;
   4978 
   4979 	hrtimer_cancel(&cfs_b->period_timer);
   4980 	hrtimer_cancel(&cfs_b->slack_timer);
   4981 }
   4982 
   4983 /*
   4984  * Both these CPU hotplug callbacks race against unregister_fair_sched_group()
   4985  *
   4986  * The race is harmless, since modifying bandwidth settings of unhooked group
   4987  * bits doesn't do much.
   4988  */
   4989 
   4990 /* cpu online calback */
   4991 static void __maybe_unused update_runtime_enabled(struct rq *rq)
   4992 {
   4993 	struct task_group *tg;
   4994 
   4995 	lockdep_assert_held(&rq->lock);
   4996 
   4997 	rcu_read_lock();
   4998 	list_for_each_entry_rcu(tg, &task_groups, list) {
   4999 		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
   5000 		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
   5001 
   5002 		raw_spin_lock(&cfs_b->lock);
   5003 		cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
   5004 		raw_spin_unlock(&cfs_b->lock);
   5005 	}
   5006 	rcu_read_unlock();
   5007 }
   5008 
   5009 /* cpu offline callback */
   5010 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
   5011 {
   5012 	struct task_group *tg;
   5013 
   5014 	lockdep_assert_held(&rq->lock);
   5015 
   5016 	rcu_read_lock();
   5017 	list_for_each_entry_rcu(tg, &task_groups, list) {
   5018 		struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
   5019 
   5020 		if (!cfs_rq->runtime_enabled)
   5021 			continue;
   5022 
   5023 		/*
   5024 		 * clock_task is not advancing so we just need to make sure
   5025 		 * there's some valid quota amount
   5026 		 */
   5027 		cfs_rq->runtime_remaining = 1;
   5028 		/*
   5029 		 * Offline rq is schedulable till CPU is completely disabled
   5030 		 * in take_cpu_down(), so we prevent new cfs throttling here.
   5031 		 */
   5032 		cfs_rq->runtime_enabled = 0;
   5033 
   5034 		if (cfs_rq_throttled(cfs_rq))
   5035 			unthrottle_cfs_rq(cfs_rq);
   5036 	}
   5037 	rcu_read_unlock();
   5038 }
   5039 
   5040 #else /* CONFIG_CFS_BANDWIDTH */
   5041 
   5042 static inline bool cfs_bandwidth_used(void)
   5043 {
   5044 	return false;
   5045 }
   5046 
   5047 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
   5048 {
   5049 	return rq_clock_task(rq_of(cfs_rq));
   5050 }
   5051 
   5052 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
   5053 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
   5054 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
   5055 static inline void sync_throttle(struct task_group *tg, int cpu) {}
   5056 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
   5057 
   5058 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
   5059 {
   5060 	return 0;
   5061 }
   5062 
   5063 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
   5064 {
   5065 	return 0;
   5066 }
   5067 
   5068 static inline int throttled_lb_pair(struct task_group *tg,
   5069 				    int src_cpu, int dest_cpu)
   5070 {
   5071 	return 0;
   5072 }
   5073 
   5074 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
   5075 
   5076 #ifdef CONFIG_FAIR_GROUP_SCHED
   5077 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
   5078 #endif
   5079 
   5080 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
   5081 {
   5082 	return NULL;
   5083 }
   5084 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
   5085 static inline void update_runtime_enabled(struct rq *rq) {}
   5086 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
   5087 
   5088 #endif /* CONFIG_CFS_BANDWIDTH */
   5089 
   5090 /**************************************************
   5091  * CFS operations on tasks:
   5092  */
   5093 
   5094 #ifdef CONFIG_SCHED_HRTICK
   5095 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
   5096 {
   5097 	struct sched_entity *se = &p->se;
   5098 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
   5099 
   5100 	SCHED_WARN_ON(task_rq(p) != rq);
   5101 
   5102 	if (rq->cfs.h_nr_running > 1) {
   5103 		u64 slice = sched_slice(cfs_rq, se);
   5104 		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
   5105 		s64 delta = slice - ran;
   5106 
   5107 		if (delta < 0) {
   5108 			if (rq->curr == p)
   5109 				resched_curr(rq);
   5110 			return;
   5111 		}
   5112 		hrtick_start(rq, delta);
   5113 	}
   5114 }
   5115 
   5116 /*
   5117  * called from enqueue/dequeue and updates the hrtick when the
   5118  * current task is from our class and nr_running is low enough
   5119  * to matter.
   5120  */
   5121 static void hrtick_update(struct rq *rq)
   5122 {
   5123 	struct task_struct *curr = rq->curr;
   5124 
   5125 	if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
   5126 		return;
   5127 
   5128 	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
   5129 		hrtick_start_fair(rq, curr);
   5130 }
   5131 #else /* !CONFIG_SCHED_HRTICK */
   5132 static inline void
   5133 hrtick_start_fair(struct rq *rq, struct task_struct *p)
   5134 {
   5135 }
   5136 
   5137 static inline void hrtick_update(struct rq *rq)
   5138 {
   5139 }
   5140 #endif
   5141 
   5142 #ifdef CONFIG_SMP
   5143 static inline unsigned long cpu_util(int cpu);
   5144 static unsigned long capacity_of(int cpu);
   5145 
   5146 static inline bool cpu_overutilized(int cpu)
   5147 {
   5148 	return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
   5149 }
   5150 
   5151 static inline void update_overutilized_status(struct rq *rq)
   5152 {
   5153 	if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu))
   5154 		WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
   5155 }
   5156 #else
   5157 static inline void update_overutilized_status(struct rq *rq) { }
   5158 #endif
   5159 
   5160 /*
   5161  * The enqueue_task method is called before nr_running is
   5162  * increased. Here we update the fair scheduling stats and
   5163  * then put the task into the rbtree:
   5164  */
   5165 static void
   5166 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
   5167 {
   5168 	struct cfs_rq *cfs_rq;
   5169 	struct sched_entity *se = &p->se;
   5170 
   5171 	/*
   5172 	 * The code below (indirectly) updates schedutil which looks at
   5173 	 * the cfs_rq utilization to select a frequency.
   5174 	 * Let's add the task's estimated utilization to the cfs_rq's
   5175 	 * estimated utilization, before we update schedutil.
   5176 	 */
   5177 	util_est_enqueue(&rq->cfs, p);
   5178 
   5179 	/*
   5180 	 * If in_iowait is set, the code below may not trigger any cpufreq
   5181 	 * utilization updates, so do it here explicitly with the IOWAIT flag
   5182 	 * passed.
   5183 	 */
   5184 	if (p->in_iowait)
   5185 		cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
   5186 
   5187 	for_each_sched_entity(se) {
   5188 		if (se->on_rq)
   5189 			break;
   5190 		cfs_rq = cfs_rq_of(se);
   5191 		enqueue_entity(cfs_rq, se, flags);
   5192 
   5193 		/*
   5194 		 * end evaluation on encountering a throttled cfs_rq
   5195 		 *
   5196 		 * note: in the case of encountering a throttled cfs_rq we will
   5197 		 * post the final h_nr_running increment below.
   5198 		 */
   5199 		if (cfs_rq_throttled(cfs_rq))
   5200 			break;
   5201 		cfs_rq->h_nr_running++;
   5202 
   5203 		flags = ENQUEUE_WAKEUP;
   5204 	}
   5205 
   5206 	for_each_sched_entity(se) {
   5207 		cfs_rq = cfs_rq_of(se);
   5208 		cfs_rq->h_nr_running++;
   5209 
   5210 		if (cfs_rq_throttled(cfs_rq))
   5211 			break;
   5212 
   5213 		update_load_avg(cfs_rq, se, UPDATE_TG);
   5214 		update_cfs_group(se);
   5215 	}
   5216 
   5217 	if (!se) {
   5218 		add_nr_running(rq, 1);
   5219 		/*
   5220 		 * Since new tasks are assigned an initial util_avg equal to
   5221 		 * half of the spare capacity of their CPU, tiny tasks have the
   5222 		 * ability to cross the overutilized threshold, which will
   5223 		 * result in the load balancer ruining all the task placement
   5224 		 * done by EAS. As a way to mitigate that effect, do not account
   5225 		 * for the first enqueue operation of new tasks during the
   5226 		 * overutilized flag detection.
   5227 		 *
   5228 		 * A better way of solving this problem would be to wait for
   5229 		 * the PELT signals of tasks to converge before taking them
   5230 		 * into account, but that is not straightforward to implement,
   5231 		 * and the following generally works well enough in practice.
   5232 		 */
   5233 		if (flags & ENQUEUE_WAKEUP)
   5234 			update_overutilized_status(rq);
   5235 
   5236 	}
   5237 
   5238 	if (cfs_bandwidth_used()) {
   5239 		/*
   5240 		 * When bandwidth control is enabled; the cfs_rq_throttled()
   5241 		 * breaks in the above iteration can result in incomplete
   5242 		 * leaf list maintenance, resulting in triggering the assertion
   5243 		 * below.
   5244 		 */
   5245 		for_each_sched_entity(se) {
   5246 			cfs_rq = cfs_rq_of(se);
   5247 
   5248 			if (list_add_leaf_cfs_rq(cfs_rq))
   5249 				break;
   5250 		}
   5251 	}
   5252 
   5253 	assert_list_leaf_cfs_rq(rq);
   5254 
   5255 	hrtick_update(rq);
   5256 }
   5257 
   5258 static void set_next_buddy(struct sched_entity *se);
   5259 
   5260 /*
   5261  * The dequeue_task method is called before nr_running is
   5262  * decreased. We remove the task from the rbtree and
   5263  * update the fair scheduling stats:
   5264  */
   5265 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
   5266 {
   5267 	struct cfs_rq *cfs_rq;
   5268 	struct sched_entity *se = &p->se;
   5269 	int task_sleep = flags & DEQUEUE_SLEEP;
   5270 
   5271 	for_each_sched_entity(se) {
   5272 		cfs_rq = cfs_rq_of(se);
   5273 		dequeue_entity(cfs_rq, se, flags);
   5274 
   5275 		/*
   5276 		 * end evaluation on encountering a throttled cfs_rq
   5277 		 *
   5278 		 * note: in the case of encountering a throttled cfs_rq we will
   5279 		 * post the final h_nr_running decrement below.
   5280 		*/
   5281 		if (cfs_rq_throttled(cfs_rq))
   5282 			break;
   5283 		cfs_rq->h_nr_running--;
   5284 
   5285 		/* Don't dequeue parent if it has other entities besides us */
   5286 		if (cfs_rq->load.weight) {
   5287 			/* Avoid re-evaluating load for this entity: */
   5288 			se = parent_entity(se);
   5289 			/*
   5290 			 * Bias pick_next to pick a task from this cfs_rq, as
   5291 			 * p is sleeping when it is within its sched_slice.
   5292 			 */
   5293 			if (task_sleep && se && !throttled_hierarchy(cfs_rq))
   5294 				set_next_buddy(se);
   5295 			break;
   5296 		}
   5297 		flags |= DEQUEUE_SLEEP;
   5298 	}
   5299 
   5300 	for_each_sched_entity(se) {
   5301 		cfs_rq = cfs_rq_of(se);
   5302 		cfs_rq->h_nr_running--;
   5303 
   5304 		if (cfs_rq_throttled(cfs_rq))
   5305 			break;
   5306 
   5307 		update_load_avg(cfs_rq, se, UPDATE_TG);
   5308 		update_cfs_group(se);
   5309 	}
   5310 
   5311 	if (!se)
   5312 		sub_nr_running(rq, 1);
   5313 
   5314 	util_est_dequeue(&rq->cfs, p, task_sleep);
   5315 	hrtick_update(rq);
   5316 }
   5317 
   5318 #ifdef CONFIG_SMP
   5319 
   5320 /* Working cpumask for: load_balance, load_balance_newidle. */
   5321 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
   5322 DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
   5323 
   5324 #ifdef CONFIG_NO_HZ_COMMON
   5325 /*
   5326  * per rq 'load' arrray crap; XXX kill this.
   5327  */
   5328 
   5329 /*
   5330  * The exact cpuload calculated at every tick would be:
   5331  *
   5332  *   load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
   5333  *
   5334  * If a CPU misses updates for n ticks (as it was idle) and update gets
   5335  * called on the n+1-th tick when CPU may be busy, then we have:
   5336  *
   5337  *   load_n   = (1 - 1/2^i)^n * load_0
   5338  *   load_n+1 = (1 - 1/2^i)   * load_n + (1/2^i) * cur_load
   5339  *
   5340  * decay_load_missed() below does efficient calculation of
   5341  *
   5342  *   load' = (1 - 1/2^i)^n * load
   5343  *
   5344  * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
   5345  * This allows us to precompute the above in said factors, thereby allowing the
   5346  * reduction of an arbitrary n in O(log_2 n) steps. (See also
   5347  * fixed_power_int())
   5348  *
   5349  * The calculation is approximated on a 128 point scale.
   5350  */
   5351 #define DEGRADE_SHIFT		7
   5352 
   5353 static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
   5354 static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
   5355 	{   0,   0,  0,  0,  0,  0, 0, 0 },
   5356 	{  64,  32,  8,  0,  0,  0, 0, 0 },
   5357 	{  96,  72, 40, 12,  1,  0, 0, 0 },
   5358 	{ 112,  98, 75, 43, 15,  1, 0, 0 },
   5359 	{ 120, 112, 98, 76, 45, 16, 2, 0 }
   5360 };
   5361 
   5362 /*
   5363  * Update cpu_load for any missed ticks, due to tickless idle. The backlog
   5364  * would be when CPU is idle and so we just decay the old load without
   5365  * adding any new load.
   5366  */
   5367 static unsigned long
   5368 decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
   5369 {
   5370 	int j = 0;
   5371 
   5372 	if (!missed_updates)
   5373 		return load;
   5374 
   5375 	if (missed_updates >= degrade_zero_ticks[idx])
   5376 		return 0;
   5377 
   5378 	if (idx == 1)
   5379 		return load >> missed_updates;
   5380 
   5381 	while (missed_updates) {
   5382 		if (missed_updates % 2)
   5383 			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
   5384 
   5385 		missed_updates >>= 1;
   5386 		j++;
   5387 	}
   5388 	return load;
   5389 }
   5390 
   5391 static struct {
   5392 	cpumask_var_t idle_cpus_mask;
   5393 	atomic_t nr_cpus;
   5394 	int has_blocked;		/* Idle CPUS has blocked load */
   5395 	unsigned long next_balance;     /* in jiffy units */
   5396 	unsigned long next_blocked;	/* Next update of blocked load in jiffies */
   5397 } nohz ____cacheline_aligned;
   5398 
   5399 #endif /* CONFIG_NO_HZ_COMMON */
   5400 
   5401 /**
   5402  * __cpu_load_update - update the rq->cpu_load[] statistics
   5403  * @this_rq: The rq to update statistics for
   5404  * @this_load: The current load
   5405  * @pending_updates: The number of missed updates
   5406  *
   5407  * Update rq->cpu_load[] statistics. This function is usually called every
   5408  * scheduler tick (TICK_NSEC).
   5409  *
   5410  * This function computes a decaying average:
   5411  *
   5412  *   load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
   5413  *
   5414  * Because of NOHZ it might not get called on every tick which gives need for
   5415  * the @pending_updates argument.
   5416  *
   5417  *   load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
   5418  *             = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
   5419  *             = A * (A * load[i]_n-2 + B) + B
   5420  *             = A * (A * (A * load[i]_n-3 + B) + B) + B
   5421  *             = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
   5422  *             = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
   5423  *             = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
   5424  *             = (1 - 1/2^i)^n * (load[i]_0 - load) + load
   5425  *
   5426  * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
   5427  * any change in load would have resulted in the tick being turned back on.
   5428  *
   5429  * For regular NOHZ, this reduces to:
   5430  *
   5431  *   load[i]_n = (1 - 1/2^i)^n * load[i]_0
   5432  *
   5433  * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
   5434  * term.
   5435  */
   5436 static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
   5437 			    unsigned long pending_updates)
   5438 {
   5439 	unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
   5440 	int i, scale;
   5441 
   5442 	this_rq->nr_load_updates++;
   5443 
   5444 	/* Update our load: */
   5445 	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
   5446 	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
   5447 		unsigned long old_load, new_load;
   5448 
   5449 		/* scale is effectively 1 << i now, and >> i divides by scale */
   5450 
   5451 		old_load = this_rq->cpu_load[i];
   5452 #ifdef CONFIG_NO_HZ_COMMON
   5453 		old_load = decay_load_missed(old_load, pending_updates - 1, i);
   5454 		if (tickless_load) {
   5455 			old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
   5456 			/*
   5457 			 * old_load can never be a negative value because a
   5458 			 * decayed tickless_load cannot be greater than the
   5459 			 * original tickless_load.
   5460 			 */
   5461 			old_load += tickless_load;
   5462 		}
   5463 #endif
   5464 		new_load = this_load;
   5465 		/*
   5466 		 * Round up the averaging division if load is increasing. This
   5467 		 * prevents us from getting stuck on 9 if the load is 10, for
   5468 		 * example.
   5469 		 */
   5470 		if (new_load > old_load)
   5471 			new_load += scale - 1;
   5472 
   5473 		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
   5474 	}
   5475 }
   5476 
   5477 /* Used instead of source_load when we know the type == 0 */
   5478 static unsigned long weighted_cpuload(struct rq *rq)
   5479 {
   5480 	return cfs_rq_runnable_load_avg(&rq->cfs);
   5481 }
   5482 
   5483 #ifdef CONFIG_NO_HZ_COMMON
   5484 /*
   5485  * There is no sane way to deal with nohz on smp when using jiffies because the
   5486  * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading
   5487  * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
   5488  *
   5489  * Therefore we need to avoid the delta approach from the regular tick when
   5490  * possible since that would seriously skew the load calculation. This is why we
   5491  * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
   5492  * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
   5493  * loop exit, nohz_idle_balance, nohz full exit...)
   5494  *
   5495  * This means we might still be one tick off for nohz periods.
   5496  */
   5497 
   5498 static void cpu_load_update_nohz(struct rq *this_rq,
   5499 				 unsigned long curr_jiffies,
   5500 				 unsigned long load)
   5501 {
   5502 	unsigned long pending_updates;
   5503 
   5504 	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
   5505 	if (pending_updates) {
   5506 		this_rq->last_load_update_tick = curr_jiffies;
   5507 		/*
   5508 		 * In the regular NOHZ case, we were idle, this means load 0.
   5509 		 * In the NOHZ_FULL case, we were non-idle, we should consider
   5510 		 * its weighted load.
   5511 		 */
   5512 		cpu_load_update(this_rq, load, pending_updates);
   5513 	}
   5514 }
   5515 
   5516 /*
   5517  * Called from nohz_idle_balance() to update the load ratings before doing the
   5518  * idle balance.
   5519  */
   5520 static void cpu_load_update_idle(struct rq *this_rq)
   5521 {
   5522 	/*
   5523 	 * bail if there's load or we're actually up-to-date.
   5524 	 */
   5525 	if (weighted_cpuload(this_rq))
   5526 		return;
   5527 
   5528 	cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
   5529 }
   5530 
   5531 /*
   5532  * Record CPU load on nohz entry so we know the tickless load to account
   5533  * on nohz exit. cpu_load[0] happens then to be updated more frequently
   5534  * than other cpu_load[idx] but it should be fine as cpu_load readers
   5535  * shouldn't rely into synchronized cpu_load[*] updates.
   5536  */
   5537 void cpu_load_update_nohz_start(void)
   5538 {
   5539 	struct rq *this_rq = this_rq();
   5540 
   5541 	/*
   5542 	 * This is all lockless but should be fine. If weighted_cpuload changes
   5543 	 * concurrently we'll exit nohz. And cpu_load write can race with
   5544 	 * cpu_load_update_idle() but both updater would be writing the same.
   5545 	 */
   5546 	this_rq->cpu_load[0] = weighted_cpuload(this_rq);
   5547 }
   5548 
   5549 /*
   5550  * Account the tickless load in the end of a nohz frame.
   5551  */
   5552 void cpu_load_update_nohz_stop(void)
   5553 {
   5554 	unsigned long curr_jiffies = READ_ONCE(jiffies);
   5555 	struct rq *this_rq = this_rq();
   5556 	unsigned long load;
   5557 	struct rq_flags rf;
   5558 
   5559 	if (curr_jiffies == this_rq->last_load_update_tick)
   5560 		return;
   5561 
   5562 	load = weighted_cpuload(this_rq);
   5563 	rq_lock(this_rq, &rf);
   5564 	update_rq_clock(this_rq);
   5565 	cpu_load_update_nohz(this_rq, curr_jiffies, load);
   5566 	rq_unlock(this_rq, &rf);
   5567 }
   5568 #else /* !CONFIG_NO_HZ_COMMON */
   5569 static inline void cpu_load_update_nohz(struct rq *this_rq,
   5570 					unsigned long curr_jiffies,
   5571 					unsigned long load) { }
   5572 #endif /* CONFIG_NO_HZ_COMMON */
   5573 
   5574 static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
   5575 {
   5576 #ifdef CONFIG_NO_HZ_COMMON
   5577 	/* See the mess around cpu_load_update_nohz(). */
   5578 	this_rq->last_load_update_tick = READ_ONCE(jiffies);
   5579 #endif
   5580 	cpu_load_update(this_rq, load, 1);
   5581 }
   5582 
   5583 /*
   5584  * Called from scheduler_tick()
   5585  */
   5586 void cpu_load_update_active(struct rq *this_rq)
   5587 {
   5588 	unsigned long load = weighted_cpuload(this_rq);
   5589 
   5590 	if (tick_nohz_tick_stopped())
   5591 		cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
   5592 	else
   5593 		cpu_load_update_periodic(this_rq, load);
   5594 }
   5595 
   5596 /*
   5597  * Return a low guess at the load of a migration-source CPU weighted
   5598  * according to the scheduling class and "nice" value.
   5599  *
   5600  * We want to under-estimate the load of migration sources, to
   5601  * balance conservatively.
   5602  */
   5603 static unsigned long source_load(int cpu, int type)
   5604 {
   5605 	struct rq *rq = cpu_rq(cpu);
   5606 	unsigned long total = weighted_cpuload(rq);
   5607 
   5608 	if (type == 0 || !sched_feat(LB_BIAS))
   5609 		return total;
   5610 
   5611 	return min(rq->cpu_load[type-1], total);
   5612 }
   5613 
   5614 /*
   5615  * Return a high guess at the load of a migration-target CPU weighted
   5616  * according to the scheduling class and "nice" value.
   5617  */
   5618 static unsigned long target_load(int cpu, int type)
   5619 {
   5620 	struct rq *rq = cpu_rq(cpu);
   5621 	unsigned long total = weighted_cpuload(rq);
   5622 
   5623 	if (type == 0 || !sched_feat(LB_BIAS))
   5624 		return total;
   5625 
   5626 	return max(rq->cpu_load[type-1], total);
   5627 }
   5628 
   5629 static unsigned long capacity_of(int cpu)
   5630 {
   5631 	return cpu_rq(cpu)->cpu_capacity;
   5632 }
   5633 
   5634 static unsigned long cpu_avg_load_per_task(int cpu)
   5635 {
   5636 	struct rq *rq = cpu_rq(cpu);
   5637 	unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
   5638 	unsigned long load_avg = weighted_cpuload(rq);
   5639 
   5640 	if (nr_running)
   5641 		return load_avg / nr_running;
   5642 
   5643 	return 0;
   5644 }
   5645 
   5646 static void record_wakee(struct task_struct *p)
   5647 {
   5648 	/*
   5649 	 * Only decay a single time; tasks that have less then 1 wakeup per
   5650 	 * jiffy will not have built up many flips.
   5651 	 */
   5652 	if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
   5653 		current->wakee_flips >>= 1;
   5654 		current->wakee_flip_decay_ts = jiffies;
   5655 	}
   5656 
   5657 	if (current->last_wakee != p) {
   5658 		current->last_wakee = p;
   5659 		current->wakee_flips++;
   5660 	}
   5661 }
   5662 
   5663 /*
   5664  * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
   5665  *
   5666  * A waker of many should wake a different task than the one last awakened
   5667  * at a frequency roughly N times higher than one of its wakees.
   5668  *
   5669  * In order to determine whether we should let the load spread vs consolidating
   5670  * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
   5671  * partner, and a factor of lls_size higher frequency in the other.
   5672  *
   5673  * With both conditions met, we can be relatively sure that the relationship is
   5674  * non-monogamous, with partner count exceeding socket size.
   5675  *
   5676  * Waker/wakee being client/server, worker/dispatcher, interrupt source or
   5677  * whatever is irrelevant, spread criteria is apparent partner count exceeds
   5678  * socket size.
   5679  */
   5680 static int wake_wide(struct task_struct *p)
   5681 {
   5682 	unsigned int master = current->wakee_flips;
   5683 	unsigned int slave = p->wakee_flips;
   5684 	int factor = this_cpu_read(sd_llc_size);
   5685 
   5686 	if (master < slave)
   5687 		swap(master, slave);
   5688 	if (slave < factor || master < slave * factor)
   5689 		return 0;
   5690 	return 1;
   5691 }
   5692 
   5693 /*
   5694  * The purpose of wake_affine() is to quickly determine on which CPU we can run
   5695  * soonest. For the purpose of speed we only consider the waking and previous
   5696  * CPU.
   5697  *
   5698  * wake_affine_idle() - only considers 'now', it check if the waking CPU is
   5699  *			cache-affine and is (or	will be) idle.
   5700  *
   5701  * wake_affine_weight() - considers the weight to reflect the average
   5702  *			  scheduling latency of the CPUs. This seems to work
   5703  *			  for the overloaded case.
   5704  */
   5705 static int
   5706 wake_affine_idle(int this_cpu, int prev_cpu, int sync)
   5707 {
   5708 	/*
   5709 	 * If this_cpu is idle, it implies the wakeup is from interrupt
   5710 	 * context. Only allow the move if cache is shared. Otherwise an
   5711 	 * interrupt intensive workload could force all tasks onto one
   5712 	 * node depending on the IO topology or IRQ affinity settings.
   5713 	 *
   5714 	 * If the prev_cpu is idle and cache affine then avoid a migration.
   5715 	 * There is no guarantee that the cache hot data from an interrupt
   5716 	 * is more important than cache hot data on the prev_cpu and from
   5717 	 * a cpufreq perspective, it's better to have higher utilisation
   5718 	 * on one CPU.
   5719 	 */
   5720 	if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
   5721 		return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
   5722 
   5723 	if (sync && cpu_rq(this_cpu)->nr_running == 1)
   5724 		return this_cpu;
   5725 
   5726 	return nr_cpumask_bits;
   5727 }
   5728 
   5729 static int
   5730 wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
   5731 		   int this_cpu, int prev_cpu, int sync)
   5732 {
   5733 	s64 this_eff_load, prev_eff_load;
   5734 	unsigned long task_load;
   5735 
   5736 	this_eff_load = target_load(this_cpu, sd->wake_idx);
   5737 
   5738 	if (sync) {
   5739 		unsigned long current_load = task_h_load(current);
   5740 
   5741 		if (current_load > this_eff_load)
   5742 			return this_cpu;
   5743 
   5744 		this_eff_load -= current_load;
   5745 	}
   5746 
   5747 	task_load = task_h_load(p);
   5748 
   5749 	this_eff_load += task_load;
   5750 	if (sched_feat(WA_BIAS))
   5751 		this_eff_load *= 100;
   5752 	this_eff_load *= capacity_of(prev_cpu);
   5753 
   5754 	prev_eff_load = source_load(prev_cpu, sd->wake_idx);
   5755 	prev_eff_load -= task_load;
   5756 	if (sched_feat(WA_BIAS))
   5757 		prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
   5758 	prev_eff_load *= capacity_of(this_cpu);
   5759 
   5760 	/*
   5761 	 * If sync, adjust the weight of prev_eff_load such that if
   5762 	 * prev_eff == this_eff that select_idle_sibling() will consider
   5763 	 * stacking the wakee on top of the waker if no other CPU is
   5764 	 * idle.
   5765 	 */
   5766 	if (sync)
   5767 		prev_eff_load += 1;
   5768 
   5769 	return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
   5770 }
   5771 
   5772 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
   5773 		       int this_cpu, int prev_cpu, int sync)
   5774 {
   5775 	int target = nr_cpumask_bits;
   5776 
   5777 	if (sched_feat(WA_IDLE))
   5778 		target = wake_affine_idle(this_cpu, prev_cpu, sync);
   5779 
   5780 	if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
   5781 		target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
   5782 
   5783 	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
   5784 	if (target == nr_cpumask_bits)
   5785 		return prev_cpu;
   5786 
   5787 	schedstat_inc(sd->ttwu_move_affine);
   5788 	schedstat_inc(p->se.statistics.nr_wakeups_affine);
   5789 	return target;
   5790 }
   5791 
   5792 static unsigned long cpu_util_without(int cpu, struct task_struct *p);
   5793 
   5794 static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
   5795 {
   5796 	return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
   5797 }
   5798 
   5799 /*
   5800  * find_idlest_group finds and returns the least busy CPU group within the
   5801  * domain.
   5802  *
   5803  * Assumes p is allowed on at least one CPU in sd.
   5804  */
   5805 static struct sched_group *
   5806 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
   5807 		  int this_cpu, int sd_flag)
   5808 {
   5809 	struct sched_group *idlest = NULL, *group = sd->groups;
   5810 	struct sched_group *most_spare_sg = NULL;
   5811 	unsigned long min_runnable_load = ULONG_MAX;
   5812 	unsigned long this_runnable_load = ULONG_MAX;
   5813 	unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
   5814 	unsigned long most_spare = 0, this_spare = 0;
   5815 	int load_idx = sd->forkexec_idx;
   5816 	int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
   5817 	unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
   5818 				(sd->imbalance_pct-100) / 100;
   5819 
   5820 	if (sd_flag & SD_BALANCE_WAKE)
   5821 		load_idx = sd->wake_idx;
   5822 
   5823 	do {
   5824 		unsigned long load, avg_load, runnable_load;
   5825 		unsigned long spare_cap, max_spare_cap;
   5826 		int local_group;
   5827 		int i;
   5828 
   5829 		/* Skip over this group if it has no CPUs allowed */
   5830 		if (!cpumask_intersects(sched_group_span(group),
   5831 					&p->cpus_allowed))
   5832 			continue;
   5833 
   5834 		local_group = cpumask_test_cpu(this_cpu,
   5835 					       sched_group_span(group));
   5836 
   5837 		/*
   5838 		 * Tally up the load of all CPUs in the group and find
   5839 		 * the group containing the CPU with most spare capacity.
   5840 		 */
   5841 		avg_load = 0;
   5842 		runnable_load = 0;
   5843 		max_spare_cap = 0;
   5844 
   5845 		for_each_cpu(i, sched_group_span(group)) {
   5846 			/* Bias balancing toward CPUs of our domain */
   5847 			if (local_group)
   5848 				load = source_load(i, load_idx);
   5849 			else
   5850 				load = target_load(i, load_idx);
   5851 
   5852 			runnable_load += load;
   5853 
   5854 			avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
   5855 
   5856 			spare_cap = capacity_spare_without(i, p);
   5857 
   5858 			if (spare_cap > max_spare_cap)
   5859 				max_spare_cap = spare_cap;
   5860 		}
   5861 
   5862 		/* Adjust by relative CPU capacity of the group */
   5863 		avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
   5864 					group->sgc->capacity;
   5865 		runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
   5866 					group->sgc->capacity;
   5867 
   5868 		if (local_group) {
   5869 			this_runnable_load = runnable_load;
   5870 			this_avg_load = avg_load;
   5871 			this_spare = max_spare_cap;
   5872 		} else {
   5873 			if (min_runnable_load > (runnable_load + imbalance)) {
   5874 				/*
   5875 				 * The runnable load is significantly smaller
   5876 				 * so we can pick this new CPU:
   5877 				 */
   5878 				min_runnable_load = runnable_load;
   5879 				min_avg_load = avg_load;
   5880 				idlest = group;
   5881 			} else if ((runnable_load < (min_runnable_load + imbalance)) &&
   5882 				   (100*min_avg_load > imbalance_scale*avg_load)) {
   5883 				/*
   5884 				 * The runnable loads are close so take the
   5885 				 * blocked load into account through avg_load:
   5886 				 */
   5887 				min_avg_load = avg_load;
   5888 				idlest = group;
   5889 			}
   5890 
   5891 			if (most_spare < max_spare_cap) {
   5892 				most_spare = max_spare_cap;
   5893 				most_spare_sg = group;
   5894 			}
   5895 		}
   5896 	} while (group = group->next, group != sd->groups);
   5897 
   5898 	/*
   5899 	 * The cross-over point between using spare capacity or least load
   5900 	 * is too conservative for high utilization tasks on partially
   5901 	 * utilized systems if we require spare_capacity > task_util(p),
   5902 	 * so we allow for some task stuffing by using
   5903 	 * spare_capacity > task_util(p)/2.
   5904 	 *
   5905 	 * Spare capacity can't be used for fork because the utilization has
   5906 	 * not been set yet, we must first select a rq to compute the initial
   5907 	 * utilization.
   5908 	 */
   5909 	if (sd_flag & SD_BALANCE_FORK)
   5910 		goto skip_spare;
   5911 
   5912 	if (this_spare > task_util(p) / 2 &&
   5913 	    imbalance_scale*this_spare > 100*most_spare)
   5914 		return NULL;
   5915 
   5916 	if (most_spare > task_util(p) / 2)
   5917 		return most_spare_sg;
   5918 
   5919 skip_spare:
   5920 	if (!idlest)
   5921 		return NULL;
   5922 
   5923 	/*
   5924 	 * When comparing groups across NUMA domains, it's possible for the
   5925 	 * local domain to be very lightly loaded relative to the remote
   5926 	 * domains but "imbalance" skews the comparison making remote CPUs
   5927 	 * look much more favourable. When considering cross-domain, add
   5928 	 * imbalance to the runnable load on the remote node and consider
   5929 	 * staying local.
   5930 	 */
   5931 	if ((sd->flags & SD_NUMA) &&
   5932 	    min_runnable_load + imbalance >= this_runnable_load)
   5933 		return NULL;
   5934 
   5935 	if (min_runnable_load > (this_runnable_load + imbalance))
   5936 		return NULL;
   5937 
   5938 	if ((this_runnable_load < (min_runnable_load + imbalance)) &&
   5939 	     (100*this_avg_load < imbalance_scale*min_avg_load))
   5940 		return NULL;
   5941 
   5942 	return idlest;
   5943 }
   5944 
   5945 /*
   5946  * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
   5947  */
   5948 static int
   5949 find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
   5950 {
   5951 	unsigned long load, min_load = ULONG_MAX;
   5952 	unsigned int min_exit_latency = UINT_MAX;
   5953 	u64 latest_idle_timestamp = 0;
   5954 	int least_loaded_cpu = this_cpu;
   5955 	int shallowest_idle_cpu = -1;
   5956 	int i;
   5957 
   5958 	/* Check if we have any choice: */
   5959 	if (group->group_weight == 1)
   5960 		return cpumask_first(sched_group_span(group));
   5961 
   5962 	/* Traverse only the allowed CPUs */
   5963 	for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
   5964 		if (available_idle_cpu(i)) {
   5965 			struct rq *rq = cpu_rq(i);
   5966 			struct cpuidle_state *idle = idle_get_state(rq);
   5967 			if (idle && idle->exit_latency < min_exit_latency) {
   5968 				/*
   5969 				 * We give priority to a CPU whose idle state
   5970 				 * has the smallest exit latency irrespective
   5971 				 * of any idle timestamp.
   5972 				 */
   5973 				min_exit_latency = idle->exit_latency;
   5974 				latest_idle_timestamp = rq->idle_stamp;
   5975 				shallowest_idle_cpu = i;
   5976 			} else if ((!idle || idle->exit_latency == min_exit_latency) &&
   5977 				   rq->idle_stamp > latest_idle_timestamp) {
   5978 				/*
   5979 				 * If equal or no active idle state, then
   5980 				 * the most recently idled CPU might have
   5981 				 * a warmer cache.
   5982 				 */
   5983 				latest_idle_timestamp = rq->idle_stamp;
   5984 				shallowest_idle_cpu = i;
   5985 			}
   5986 		} else if (shallowest_idle_cpu == -1) {
   5987 			load = weighted_cpuload(cpu_rq(i));
   5988 			if (load < min_load) {
   5989 				min_load = load;
   5990 				least_loaded_cpu = i;
   5991 			}
   5992 		}
   5993 	}
   5994 
   5995 	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
   5996 }
   5997 
   5998 static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
   5999 				  int cpu, int prev_cpu, int sd_flag)
   6000 {
   6001 	int new_cpu = cpu;
   6002 
   6003 	if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
   6004 		return prev_cpu;
   6005 
   6006 	/*
   6007 	 * We need task's util for capacity_spare_without, sync it up to
   6008 	 * prev_cpu's last_update_time.
   6009 	 */
   6010 	if (!(sd_flag & SD_BALANCE_FORK))
   6011 		sync_entity_load_avg(&p->se);
   6012 
   6013 	while (sd) {
   6014 		struct sched_group *group;
   6015 		struct sched_domain *tmp;
   6016 		int weight;
   6017 
   6018 		if (!(sd->flags & sd_flag)) {
   6019 			sd = sd->child;
   6020 			continue;
   6021 		}
   6022 
   6023 		group = find_idlest_group(sd, p, cpu, sd_flag);
   6024 		if (!group) {
   6025 			sd = sd->child;
   6026 			continue;
   6027 		}
   6028 
   6029 		new_cpu = find_idlest_group_cpu(group, p, cpu);
   6030 		if (new_cpu == cpu) {
   6031 			/* Now try balancing at a lower domain level of 'cpu': */
   6032 			sd = sd->child;
   6033 			continue;
   6034 		}
   6035 
   6036 		/* Now try balancing at a lower domain level of 'new_cpu': */
   6037 		cpu = new_cpu;
   6038 		weight = sd->span_weight;
   6039 		sd = NULL;
   6040 		for_each_domain(cpu, tmp) {
   6041 			if (weight <= tmp->span_weight)
   6042 				break;
   6043 			if (tmp->flags & sd_flag)
   6044 				sd = tmp;
   6045 		}
   6046 	}
   6047 
   6048 	return new_cpu;
   6049 }
   6050 
   6051 #ifdef CONFIG_SCHED_SMT
   6052 DEFINE_STATIC_KEY_FALSE(sched_smt_present);
   6053 EXPORT_SYMBOL_GPL(sched_smt_present);
   6054 
   6055 static inline void set_idle_cores(int cpu, int val)
   6056 {
   6057 	struct sched_domain_shared *sds;
   6058 
   6059 	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
   6060 	if (sds)
   6061 		WRITE_ONCE(sds->has_idle_cores, val);
   6062 }
   6063 
   6064 static inline bool test_idle_cores(int cpu, bool def)
   6065 {
   6066 	struct sched_domain_shared *sds;
   6067 
   6068 	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
   6069 	if (sds)
   6070 		return READ_ONCE(sds->has_idle_cores);
   6071 
   6072 	return def;
   6073 }
   6074 
   6075 /*
   6076  * Scans the local SMT mask to see if the entire core is idle, and records this
   6077  * information in sd_llc_shared->has_idle_cores.
   6078  *
   6079  * Since SMT siblings share all cache levels, inspecting this limited remote
   6080  * state should be fairly cheap.
   6081  */
   6082 void __update_idle_core(struct rq *rq)
   6083 {
   6084 	int core = cpu_of(rq);
   6085 	int cpu;
   6086 
   6087 	rcu_read_lock();
   6088 	if (test_idle_cores(core, true))
   6089 		goto unlock;
   6090 
   6091 	for_each_cpu(cpu, cpu_smt_mask(core)) {
   6092 		if (cpu == core)
   6093 			continue;
   6094 
   6095 		if (!available_idle_cpu(cpu))
   6096 			goto unlock;
   6097 	}
   6098 
   6099 	set_idle_cores(core, 1);
   6100 unlock:
   6101 	rcu_read_unlock();
   6102 }
   6103 
   6104 /*
   6105  * Scan the entire LLC domain for idle cores; this dynamically switches off if
   6106  * there are no idle cores left in the system; tracked through
   6107  * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
   6108  */
   6109 static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
   6110 {
   6111 	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
   6112 	int core, cpu;
   6113 
   6114 	if (!static_branch_likely(&sched_smt_present))
   6115 		return -1;
   6116 
   6117 	if (!test_idle_cores(target, false))
   6118 		return -1;
   6119 
   6120 	cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
   6121 
   6122 	for_each_cpu_wrap(core, cpus, target) {
   6123 		bool idle = true;
   6124 
   6125 		for_each_cpu(cpu, cpu_smt_mask(core)) {
   6126 			__cpumask_clear_cpu(cpu, cpus);
   6127 			if (!available_idle_cpu(cpu))
   6128 				idle = false;
   6129 		}
   6130 
   6131 		if (idle)
   6132 			return core;
   6133 	}
   6134 
   6135 	/*
   6136 	 * Failed to find an idle core; stop looking for one.
   6137 	 */
   6138 	set_idle_cores(target, 0);
   6139 
   6140 	return -1;
   6141 }
   6142 
   6143 /*
   6144  * Scan the local SMT mask for idle CPUs.
   6145  */
   6146 static int select_idle_smt(struct task_struct *p, int target)
   6147 {
   6148 	int cpu;
   6149 
   6150 	if (!static_branch_likely(&sched_smt_present))
   6151 		return -1;
   6152 
   6153 	for_each_cpu(cpu, cpu_smt_mask(target)) {
   6154 		if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
   6155 			continue;
   6156 		if (available_idle_cpu(cpu))
   6157 			return cpu;
   6158 	}
   6159 
   6160 	return -1;
   6161 }
   6162 
   6163 #else /* CONFIG_SCHED_SMT */
   6164 
   6165 static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
   6166 {
   6167 	return -1;
   6168 }
   6169 
   6170 static inline int select_idle_smt(struct task_struct *p, int target)
   6171 {
   6172 	return -1;
   6173 }
   6174 
   6175 #endif /* CONFIG_SCHED_SMT */
   6176 
   6177 /*
   6178  * Scan the LLC domain for idle CPUs; this is dynamically regulated by
   6179  * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
   6180  * average idle time for this rq (as found in rq->avg_idle).
   6181  */
   6182 static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
   6183 {
   6184 	struct sched_domain *this_sd;
   6185 	u64 avg_cost, avg_idle;
   6186 	u64 time, cost;
   6187 	s64 delta;
   6188 	int cpu, nr = INT_MAX;
   6189 
   6190 	this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
   6191 	if (!this_sd)
   6192 		return -1;
   6193 
   6194 	/*
   6195 	 * Due to large variance we need a large fuzz factor; hackbench in
   6196 	 * particularly is sensitive here.
   6197 	 */
   6198 	avg_idle = this_rq()->avg_idle / 512;
   6199 	avg_cost = this_sd->avg_scan_cost + 1;
   6200 
   6201 	if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
   6202 		return -1;
   6203 
   6204 	if (sched_feat(SIS_PROP)) {
   6205 		u64 span_avg = sd->span_weight * avg_idle;
   6206 		if (span_avg > 4*avg_cost)
   6207 			nr = div_u64(span_avg, avg_cost);
   6208 		else
   6209 			nr = 4;
   6210 	}
   6211 
   6212 	time = local_clock();
   6213 
   6214 	for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
   6215 		if (!--nr)
   6216 			return -1;
   6217 		if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
   6218 			continue;
   6219 		if (available_idle_cpu(cpu))
   6220 			break;
   6221 	}
   6222 
   6223 	time = local_clock() - time;
   6224 	cost = this_sd->avg_scan_cost;
   6225 	delta = (s64)(time - cost) / 8;
   6226 	this_sd->avg_scan_cost += delta;
   6227 
   6228 	return cpu;
   6229 }
   6230 
   6231 /*
   6232  * Try and locate an idle core/thread in the LLC cache domain.
   6233  */
   6234 static int select_idle_sibling(struct task_struct *p, int prev, int target)
   6235 {
   6236 	struct sched_domain *sd;
   6237 	int i, recent_used_cpu;
   6238 
   6239 	if (available_idle_cpu(target))
   6240 		return target;
   6241 
   6242 	/*
   6243 	 * If the previous CPU is cache affine and idle, don't be stupid:
   6244 	 */
   6245 	if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
   6246 		return prev;
   6247 
   6248 	/* Check a recently used CPU as a potential idle candidate: */
   6249 	recent_used_cpu = p->recent_used_cpu;
   6250 	if (recent_used_cpu != prev &&
   6251 	    recent_used_cpu != target &&
   6252 	    cpus_share_cache(recent_used_cpu, target) &&
   6253 	    available_idle_cpu(recent_used_cpu) &&
   6254 	    cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
   6255 		/*
   6256 		 * Replace recent_used_cpu with prev as it is a potential
   6257 		 * candidate for the next wake:
   6258 		 */
   6259 		p->recent_used_cpu = prev;
   6260 		return recent_used_cpu;
   6261 	}
   6262 
   6263 	sd = rcu_dereference(per_cpu(sd_llc, target));
   6264 	if (!sd)
   6265 		return target;
   6266 
   6267 	i = select_idle_core(p, sd, target);
   6268 	if ((unsigned)i < nr_cpumask_bits)
   6269 		return i;
   6270 
   6271 	i = select_idle_cpu(p, sd, target);
   6272 	if ((unsigned)i < nr_cpumask_bits)
   6273 		return i;
   6274 
   6275 	i = select_idle_smt(p, target);
   6276 	if ((unsigned)i < nr_cpumask_bits)
   6277 		return i;
   6278 
   6279 	return target;
   6280 }
   6281 
   6282 /**
   6283  * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks
   6284  * @cpu: the CPU to get the utilization of
   6285  *
   6286  * The unit of the return value must be the one of capacity so we can compare
   6287  * the utilization with the capacity of the CPU that is available for CFS task
   6288  * (ie cpu_capacity).
   6289  *
   6290  * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
   6291  * recent utilization of currently non-runnable tasks on a CPU. It represents
   6292  * the amount of utilization of a CPU in the range [0..capacity_orig] where
   6293  * capacity_orig is the cpu_capacity available at the highest frequency
   6294  * (arch_scale_freq_capacity()).
   6295  * The utilization of a CPU converges towards a sum equal to or less than the
   6296  * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
   6297  * the running time on this CPU scaled by capacity_curr.
   6298  *
   6299  * The estimated utilization of a CPU is defined to be the maximum between its
   6300  * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
   6301  * currently RUNNABLE on that CPU.
   6302  * This allows to properly represent the expected utilization of a CPU which
   6303  * has just got a big task running since a long sleep period. At the same time
   6304  * however it preserves the benefits of the "blocked utilization" in
   6305  * describing the potential for other tasks waking up on the same CPU.
   6306  *
   6307  * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
   6308  * higher than capacity_orig because of unfortunate rounding in
   6309  * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
   6310  * the average stabilizes with the new running time. We need to check that the
   6311  * utilization stays within the range of [0..capacity_orig] and cap it if
   6312  * necessary. Without utilization capping, a group could be seen as overloaded
   6313  * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
   6314  * available capacity. We allow utilization to overshoot capacity_curr (but not
   6315  * capacity_orig) as it useful for predicting the capacity required after task
   6316  * migrations (scheduler-driven DVFS).
   6317  *
   6318  * Return: the (estimated) utilization for the specified CPU
   6319  */
   6320 static inline unsigned long cpu_util(int cpu)
   6321 {
   6322 	struct cfs_rq *cfs_rq;
   6323 	unsigned int util;
   6324 
   6325 	cfs_rq = &cpu_rq(cpu)->cfs;
   6326 	util = READ_ONCE(cfs_rq->avg.util_avg);
   6327 
   6328 	if (sched_feat(UTIL_EST))
   6329 		util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
   6330 
   6331 	return min_t(unsigned long, util, capacity_orig_of(cpu));
   6332 }
   6333 
   6334 /*
   6335  * cpu_util_without: compute cpu utilization without any contributions from *p
   6336  * @cpu: the CPU which utilization is requested
   6337  * @p: the task which utilization should be discounted
   6338  *
   6339  * The utilization of a CPU is defined by the utilization of tasks currently
   6340  * enqueued on that CPU as well as tasks which are currently sleeping after an
   6341  * execution on that CPU.
   6342  *
   6343  * This method returns the utilization of the specified CPU by discounting the
   6344  * utilization of the specified task, whenever the task is currently
   6345  * contributing to the CPU utilization.
   6346  */
   6347 static unsigned long cpu_util_without(int cpu, struct task_struct *p)
   6348 {
   6349 	struct cfs_rq *cfs_rq;
   6350 	unsigned int util;
   6351 
   6352 	/* Task has no contribution or is new */
   6353 	if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
   6354 		return cpu_util(cpu);
   6355 
   6356 	cfs_rq = &cpu_rq(cpu)->cfs;
   6357 	util = READ_ONCE(cfs_rq->avg.util_avg);
   6358 
   6359 	/* Discount task's util from CPU's util */
   6360 	lsub_positive(&util, task_util(p));
   6361 
   6362 	/*
   6363 	 * Covered cases:
   6364 	 *
   6365 	 * a) if *p is the only task sleeping on this CPU, then:
   6366 	 *      cpu_util (== task_util) > util_est (== 0)
   6367 	 *    and thus we return:
   6368 	 *      cpu_util_without = (cpu_util - task_util) = 0
   6369 	 *
   6370 	 * b) if other tasks are SLEEPING on this CPU, which is now exiting
   6371 	 *    IDLE, then:
   6372 	 *      cpu_util >= task_util
   6373 	 *      cpu_util > util_est (== 0)
   6374 	 *    and thus we discount *p's blocked utilization to return:
   6375 	 *      cpu_util_without = (cpu_util - task_util) >= 0
   6376 	 *
   6377 	 * c) if other tasks are RUNNABLE on that CPU and
   6378 	 *      util_est > cpu_util
   6379 	 *    then we use util_est since it returns a more restrictive
   6380 	 *    estimation of the spare capacity on that CPU, by just
   6381 	 *    considering the expected utilization of tasks already
   6382 	 *    runnable on that CPU.
   6383 	 *
   6384 	 * Cases a) and b) are covered by the above code, while case c) is
   6385 	 * covered by the following code when estimated utilization is
   6386 	 * enabled.
   6387 	 */
   6388 	if (sched_feat(UTIL_EST)) {
   6389 		unsigned int estimated =
   6390 			READ_ONCE(cfs_rq->avg.util_est.enqueued);
   6391 
   6392 		/*
   6393 		 * Despite the following checks we still have a small window
   6394 		 * for a possible race, when an execl's select_task_rq_fair()
   6395 		 * races with LB's detach_task():
   6396 		 *
   6397 		 *   detach_task()
   6398 		 *     p->on_rq = TASK_ON_RQ_MIGRATING;
   6399 		 *     ---------------------------------- A
   6400 		 *     deactivate_task()                   \
   6401 		 *       dequeue_task()                     + RaceTime
   6402 		 *         util_est_dequeue()              /
   6403 		 *     ---------------------------------- B
   6404 		 *
   6405 		 * The additional check on "current == p" it's required to
   6406 		 * properly fix the execl regression and it helps in further
   6407 		 * reducing the chances for the above race.
   6408 		 */
   6409 		if (unlikely(task_on_rq_queued(p) || current == p))
   6410 			lsub_positive(&estimated, _task_util_est(p));
   6411 
   6412 		util = max(util, estimated);
   6413 	}
   6414 
   6415 	/*
   6416 	 * Utilization (estimated) can exceed the CPU capacity, thus let's
   6417 	 * clamp to the maximum CPU capacity to ensure consistency with
   6418 	 * the cpu_util call.
   6419 	 */
   6420 	return min_t(unsigned long, util, capacity_orig_of(cpu));
   6421 }
   6422 
   6423 /*
   6424  * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
   6425  * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
   6426  *
   6427  * In that case WAKE_AFFINE doesn't make sense and we'll let
   6428  * BALANCE_WAKE sort things out.
   6429  */
   6430 static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
   6431 {
   6432 	long min_cap, max_cap;
   6433 
   6434 	if (!static_branch_unlikely(&sched_asym_cpucapacity))
   6435 		return 0;
   6436 
   6437 	min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
   6438 	max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
   6439 
   6440 	/* Minimum capacity is close to max, no need to abort wake_affine */
   6441 	if (max_cap - min_cap < max_cap >> 3)
   6442 		return 0;
   6443 
   6444 	/* Bring task utilization in sync with prev_cpu */
   6445 	sync_entity_load_avg(&p->se);
   6446 
   6447 	return !task_fits_capacity(p, min_cap);
   6448 }
   6449 
   6450 /*
   6451  * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
   6452  * to @dst_cpu.
   6453  */
   6454 static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
   6455 {
   6456 	struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
   6457 	unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
   6458 
   6459 	/*
   6460 	 * If @p migrates from @cpu to another, remove its contribution. Or,
   6461 	 * if @p migrates from another CPU to @cpu, add its contribution. In
   6462 	 * the other cases, @cpu is not impacted by the migration, so the
   6463 	 * util_avg should already be correct.
   6464 	 */
   6465 	if (task_cpu(p) == cpu && dst_cpu != cpu)
   6466 		sub_positive(&util, task_util(p));
   6467 	else if (task_cpu(p) != cpu && dst_cpu == cpu)
   6468 		util += task_util(p);
   6469 
   6470 	if (sched_feat(UTIL_EST)) {
   6471 		util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
   6472 
   6473 		/*
   6474 		 * During wake-up, the task isn't enqueued yet and doesn't
   6475 		 * appear in the cfs_rq->avg.util_est.enqueued of any rq,
   6476 		 * so just add it (if needed) to "simulate" what will be
   6477 		 * cpu_util() after the task has been enqueued.
   6478 		 */
   6479 		if (dst_cpu == cpu)
   6480 			util_est += _task_util_est(p);
   6481 
   6482 		util = max(util, util_est);
   6483 	}
   6484 
   6485 	return min(util, capacity_orig_of(cpu));
   6486 }
   6487 
   6488 /*
   6489  * compute_energy(): Estimates the energy that would be consumed if @p was
   6490  * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
   6491  * landscape of the * CPUs after the task migration, and uses the Energy Model
   6492  * to compute what would be the energy if we decided to actually migrate that
   6493  * task.
   6494  */
   6495 static long
   6496 compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
   6497 {
   6498 	long util, max_util, sum_util, energy = 0;
   6499 	int cpu;
   6500 
   6501 	for (; pd; pd = pd->next) {
   6502 		max_util = sum_util = 0;
   6503 		/*
   6504 		 * The capacity state of CPUs of the current rd can be driven by
   6505 		 * CPUs of another rd if they belong to the same performance
   6506 		 * domain. So, account for the utilization of these CPUs too
   6507 		 * by masking pd with cpu_online_mask instead of the rd span.
   6508 		 *
   6509 		 * If an entire performance domain is outside of the current rd,
   6510 		 * it will not appear in its pd list and will not be accounted
   6511 		 * by compute_energy().
   6512 		 */
   6513 		for_each_cpu_and(cpu, perf_domain_span(pd), cpu_online_mask) {
   6514 			util = cpu_util_next(cpu, p, dst_cpu);
   6515 			util = schedutil_energy_util(cpu, util);
   6516 			max_util = max(util, max_util);
   6517 			sum_util += util;
   6518 		}
   6519 
   6520 		energy += em_pd_energy(pd->em_pd, max_util, sum_util);
   6521 	}
   6522 
   6523 	return energy;
   6524 }
   6525 
   6526 /*
   6527  * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
   6528  * waking task. find_energy_efficient_cpu() looks for the CPU with maximum
   6529  * spare capacity in each performance domain and uses it as a potential
   6530  * candidate to execute the task. Then, it uses the Energy Model to figure
   6531  * out which of the CPU candidates is the most energy-efficient.
   6532  *
   6533  * The rationale for this heuristic is as follows. In a performance domain,
   6534  * all the most energy efficient CPU candidates (according to the Energy
   6535  * Model) are those for which we'll request a low frequency. When there are
   6536  * several CPUs for which the frequency request will be the same, we don't
   6537  * have enough data to break the tie between them, because the Energy Model
   6538  * only includes active power costs. With this model, if we assume that
   6539  * frequency requests follow utilization (e.g. using schedutil), the CPU with
   6540  * the maximum spare capacity in a performance domain is guaranteed to be among
   6541  * the best candidates of the performance domain.
   6542  *
   6543  * In practice, it could be preferable from an energy standpoint to pack
   6544  * small tasks on a CPU in order to let other CPUs go in deeper idle states,
   6545  * but that could also hurt our chances to go cluster idle, and we have no
   6546  * ways to tell with the current Energy Model if this is actually a good
   6547  * idea or not. So, find_energy_efficient_cpu() basically favors
   6548  * cluster-packing, and spreading inside a cluster. That should at least be
   6549  * a good thing for latency, and this is consistent with the idea that most
   6550  * of the energy savings of EAS come from the asymmetry of the system, and
   6551  * not so much from breaking the tie between identical CPUs. That's also the
   6552  * reason why EAS is enabled in the topology code only for systems where
   6553  * SD_ASYM_CPUCAPACITY is set.
   6554  *
   6555  * NOTE: Forkees are not accepted in the energy-aware wake-up path because
   6556  * they don't have any useful utilization data yet and it's not possible to
   6557  * forecast their impact on energy consumption. Consequently, they will be
   6558  * placed by find_idlest_cpu() on the least loaded CPU, which might turn out
   6559  * to be energy-inefficient in some use-cases. The alternative would be to
   6560  * bias new tasks towards specific types of CPUs first, or to try to infer
   6561  * their util_avg from the parent task, but those heuristics could hurt
   6562  * other use-cases too. So, until someone finds a better way to solve this,
   6563  * let's keep things simple by re-using the existing slow path.
   6564  */
   6565 
   6566 static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
   6567 {
   6568 	unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX;
   6569 	struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
   6570 	int cpu, best_energy_cpu = prev_cpu;
   6571 	struct perf_domain *head, *pd;
   6572 	unsigned long cpu_cap, util;
   6573 	struct sched_domain *sd;
   6574 
   6575 	rcu_read_lock();
   6576 	pd = rcu_dereference(rd->pd);
   6577 	if (!pd || READ_ONCE(rd->overutilized))
   6578 		goto fail;
   6579 	head = pd;
   6580 
   6581 	/*
   6582 	 * Energy-aware wake-up happens on the lowest sched_domain starting
   6583 	 * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
   6584 	 */
   6585 	sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
   6586 	while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
   6587 		sd = sd->parent;
   6588 	if (!sd)
   6589 		goto fail;
   6590 
   6591 	sync_entity_load_avg(&p->se);
   6592 	if (!task_util_est(p))
   6593 		goto unlock;
   6594 
   6595 	for (; pd; pd = pd->next) {
   6596 		unsigned long cur_energy, spare_cap, max_spare_cap = 0;
   6597 		int max_spare_cap_cpu = -1;
   6598 
   6599 		for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
   6600 			if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
   6601 				continue;
   6602 
   6603 			/* Skip CPUs that will be overutilized. */
   6604 			util = cpu_util_next(cpu, p, cpu);
   6605 			cpu_cap = capacity_of(cpu);
   6606 			if (cpu_cap * 1024 < util * capacity_margin)
   6607 				continue;
   6608 
   6609 			/* Always use prev_cpu as a candidate. */
   6610 			if (cpu == prev_cpu) {
   6611 				prev_energy = compute_energy(p, prev_cpu, head);
   6612 				best_energy = min(best_energy, prev_energy);
   6613 				continue;
   6614 			}
   6615 
   6616 			/*
   6617 			 * Find the CPU with the maximum spare capacity in
   6618 			 * the performance domain
   6619 			 */
   6620 			spare_cap = cpu_cap - util;
   6621 			if (spare_cap > max_spare_cap) {
   6622 				max_spare_cap = spare_cap;
   6623 				max_spare_cap_cpu = cpu;
   6624 			}
   6625 		}
   6626 
   6627 		/* Evaluate the energy impact of using this CPU. */
   6628 		if (max_spare_cap_cpu >= 0) {
   6629 			cur_energy = compute_energy(p, max_spare_cap_cpu, head);
   6630 			if (cur_energy < best_energy) {
   6631 				best_energy = cur_energy;
   6632 				best_energy_cpu = max_spare_cap_cpu;
   6633 			}
   6634 		}
   6635 	}
   6636 unlock:
   6637 	rcu_read_unlock();
   6638 
   6639 	/*
   6640 	 * Pick the best CPU if prev_cpu cannot be used, or if it saves at
   6641 	 * least 6% of the energy used by prev_cpu.
   6642 	 */
   6643 	if (prev_energy == ULONG_MAX)
   6644 		return best_energy_cpu;
   6645 
   6646 	if ((prev_energy - best_energy) > (prev_energy >> 4))
   6647 		return best_energy_cpu;
   6648 
   6649 	return prev_cpu;
   6650 
   6651 fail:
   6652 	rcu_read_unlock();
   6653 
   6654 	return -1;
   6655 }
   6656 
   6657 /*
   6658  * select_task_rq_fair: Select target runqueue for the waking task in domains
   6659  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
   6660  * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
   6661  *
   6662  * Balances load by selecting the idlest CPU in the idlest group, or under
   6663  * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
   6664  *
   6665  * Returns the target CPU number.
   6666  *
   6667  * preempt must be disabled.
   6668  */
   6669 static int
   6670 select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
   6671 {
   6672 	struct sched_domain *tmp, *sd = NULL;
   6673 	int cpu = smp_processor_id();
   6674 	int new_cpu = prev_cpu;
   6675 	int want_affine = 0;
   6676 	int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
   6677 
   6678 	if (sd_flag & SD_BALANCE_WAKE) {
   6679 		record_wakee(p);
   6680 
   6681 		if (sched_energy_enabled()) {
   6682 			new_cpu = find_energy_efficient_cpu(p, prev_cpu);
   6683 			if (new_cpu >= 0)
   6684 				return new_cpu;
   6685 			new_cpu = prev_cpu;
   6686 		}
   6687 
   6688 		want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
   6689 			      cpumask_test_cpu(cpu, &p->cpus_allowed);
   6690 	}
   6691 
   6692 	rcu_read_lock();
   6693 	for_each_domain(cpu, tmp) {
   6694 		if (!(tmp->flags & SD_LOAD_BALANCE))
   6695 			break;
   6696 
   6697 		/*
   6698 		 * If both 'cpu' and 'prev_cpu' are part of this domain,
   6699 		 * cpu is a valid SD_WAKE_AFFINE target.
   6700 		 */
   6701 		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
   6702 		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
   6703 			if (cpu != prev_cpu)
   6704 				new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
   6705 
   6706 			sd = NULL; /* Prefer wake_affine over balance flags */
   6707 			break;
   6708 		}
   6709 
   6710 		if (tmp->flags & sd_flag)
   6711 			sd = tmp;
   6712 		else if (!want_affine)
   6713 			break;
   6714 	}
   6715 
   6716 	if (unlikely(sd)) {
   6717 		/* Slow path */
   6718 		new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
   6719 	} else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
   6720 		/* Fast path */
   6721 
   6722 		new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
   6723 
   6724 		if (want_affine)
   6725 			current->recent_used_cpu = cpu;
   6726 	}
   6727 	rcu_read_unlock();
   6728 
   6729 	return new_cpu;
   6730 }
   6731 
   6732 static void detach_entity_cfs_rq(struct sched_entity *se);
   6733 
   6734 /*
   6735  * Called immediately before a task is migrated to a new CPU; task_cpu(p) and
   6736  * cfs_rq_of(p) references at time of call are still valid and identify the
   6737  * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
   6738  */
   6739 static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
   6740 {
   6741 	/*
   6742 	 * As blocked tasks retain absolute vruntime the migration needs to
   6743 	 * deal with this by subtracting the old and adding the new
   6744 	 * min_vruntime -- the latter is done by enqueue_entity() when placing
   6745 	 * the task on the new runqueue.
   6746 	 */
   6747 	if (p->state == TASK_WAKING) {
   6748 		struct sched_entity *se = &p->se;
   6749 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
   6750 		u64 min_vruntime;
   6751 
   6752 #ifndef CONFIG_64BIT
   6753 		u64 min_vruntime_copy;
   6754 
   6755 		do {
   6756 			min_vruntime_copy = cfs_rq->min_vruntime_copy;
   6757 			smp_rmb();
   6758 			min_vruntime = cfs_rq->min_vruntime;
   6759 		} while (min_vruntime != min_vruntime_copy);
   6760 #else
   6761 		min_vruntime = cfs_rq->min_vruntime;
   6762 #endif
   6763 
   6764 		se->vruntime -= min_vruntime;
   6765 	}
   6766 
   6767 	if (p->on_rq == TASK_ON_RQ_MIGRATING) {
   6768 		/*
   6769 		 * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
   6770 		 * rq->lock and can modify state directly.
   6771 		 */
   6772 		lockdep_assert_held(&task_rq(p)->lock);
   6773 		detach_entity_cfs_rq(&p->se);
   6774 
   6775 	} else {
   6776 		/*
   6777 		 * We are supposed to update the task to "current" time, then
   6778 		 * its up to date and ready to go to new CPU/cfs_rq. But we
   6779 		 * have difficulty in getting what current time is, so simply
   6780 		 * throw away the out-of-date time. This will result in the
   6781 		 * wakee task is less decayed, but giving the wakee more load
   6782 		 * sounds not bad.
   6783 		 */
   6784 		remove_entity_load_avg(&p->se);
   6785 	}
   6786 
   6787 	/* Tell new CPU we are migrated */
   6788 	p->se.avg.last_update_time = 0;
   6789 
   6790 	/* We have migrated, no longer consider this task hot */
   6791 	p->se.exec_start = 0;
   6792 
   6793 	update_scan_period(p, new_cpu);
   6794 }
   6795 
   6796 static void task_dead_fair(struct task_struct *p)
   6797 {
   6798 	remove_entity_load_avg(&p->se);
   6799 }
   6800 #endif /* CONFIG_SMP */
   6801 
   6802 static unsigned long wakeup_gran(struct sched_entity *se)
   6803 {
   6804 	unsigned long gran = sysctl_sched_wakeup_granularity;
   6805 
   6806 	/*
   6807 	 * Since its curr running now, convert the gran from real-time
   6808 	 * to virtual-time in his units.
   6809 	 *
   6810 	 * By using 'se' instead of 'curr' we penalize light tasks, so
   6811 	 * they get preempted easier. That is, if 'se' < 'curr' then
   6812 	 * the resulting gran will be larger, therefore penalizing the
   6813 	 * lighter, if otoh 'se' > 'curr' then the resulting gran will
   6814 	 * be smaller, again penalizing the lighter task.
   6815 	 *
   6816 	 * This is especially important for buddies when the leftmost
   6817 	 * task is higher priority than the buddy.
   6818 	 */
   6819 	return calc_delta_fair(gran, se);
   6820 }
   6821 
   6822 /*
   6823  * Should 'se' preempt 'curr'.
   6824  *
   6825  *             |s1
   6826  *        |s2
   6827  *   |s3
   6828  *         g
   6829  *      |<--->|c
   6830  *
   6831  *  w(c, s1) = -1
   6832  *  w(c, s2) =  0
   6833  *  w(c, s3) =  1
   6834  *
   6835  */
   6836 static int
   6837 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
   6838 {
   6839 	s64 gran, vdiff = curr->vruntime - se->vruntime;
   6840 
   6841 	if (vdiff <= 0)
   6842 		return -1;
   6843 
   6844 	gran = wakeup_gran(se);
   6845 	if (vdiff > gran)
   6846 		return 1;
   6847 
   6848 	return 0;
   6849 }
   6850 
   6851 static void set_last_buddy(struct sched_entity *se)
   6852 {
   6853 	if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
   6854 		return;
   6855 
   6856 	for_each_sched_entity(se) {
   6857 		if (SCHED_WARN_ON(!se->on_rq))
   6858 			return;
   6859 		cfs_rq_of(se)->last = se;
   6860 	}
   6861 }
   6862 
   6863 static void set_next_buddy(struct sched_entity *se)
   6864 {
   6865 	if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
   6866 		return;
   6867 
   6868 	for_each_sched_entity(se) {
   6869 		if (SCHED_WARN_ON(!se->on_rq))
   6870 			return;
   6871 		cfs_rq_of(se)->next = se;
   6872 	}
   6873 }
   6874 
   6875 static void set_skip_buddy(struct sched_entity *se)
   6876 {
   6877 	for_each_sched_entity(se)
   6878 		cfs_rq_of(se)->skip = se;
   6879 }
   6880 
   6881 /*
   6882  * Preempt the current task with a newly woken task if needed:
   6883  */
   6884 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
   6885 {
   6886 	struct task_struct *curr = rq->curr;
   6887 	struct sched_entity *se = &curr->se, *pse = &p->se;
   6888 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
   6889 	int scale = cfs_rq->nr_running >= sched_nr_latency;
   6890 	int next_buddy_marked = 0;
   6891 
   6892 	if (unlikely(se == pse))
   6893 		return;
   6894 
   6895 	/*
   6896 	 * This is possible from callers such as attach_tasks(), in which we
   6897 	 * unconditionally check_prempt_curr() after an enqueue (which may have
   6898 	 * lead to a throttle).  This both saves work and prevents false
   6899 	 * next-buddy nomination below.
   6900 	 */
   6901 	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
   6902 		return;
   6903 
   6904 	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
   6905 		set_next_buddy(pse);
   6906 		next_buddy_marked = 1;
   6907 	}
   6908 
   6909 	/*
   6910 	 * We can come here with TIF_NEED_RESCHED already set from new task
   6911 	 * wake up path.
   6912 	 *
   6913 	 * Note: this also catches the edge-case of curr being in a throttled
   6914 	 * group (e.g. via set_curr_task), since update_curr() (in the
   6915 	 * enqueue of curr) will have resulted in resched being set.  This
   6916 	 * prevents us from potentially nominating it as a false LAST_BUDDY
   6917 	 * below.
   6918 	 */
   6919 	if (test_tsk_need_resched(curr))
   6920 		return;
   6921 
   6922 	/* Idle tasks are by definition preempted by non-idle tasks. */
   6923 	if (unlikely(task_has_idle_policy(curr)) &&
   6924 	    likely(!task_has_idle_policy(p)))
   6925 		goto preempt;
   6926 
   6927 	/*
   6928 	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
   6929 	 * is driven by the tick):
   6930 	 */
   6931 	if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
   6932 		return;
   6933 
   6934 	find_matching_se(&se, &pse);
   6935 	update_curr(cfs_rq_of(se));
   6936 	BUG_ON(!pse);
   6937 	if (wakeup_preempt_entity(se, pse) == 1) {
   6938 		/*
   6939 		 * Bias pick_next to pick the sched entity that is
   6940 		 * triggering this preemption.
   6941 		 */
   6942 		if (!next_buddy_marked)
   6943 			set_next_buddy(pse);
   6944 		goto preempt;
   6945 	}
   6946 
   6947 	return;
   6948 
   6949 preempt:
   6950 	resched_curr(rq);
   6951 	/*
   6952 	 * Only set the backward buddy when the current task is still
   6953 	 * on the rq. This can happen when a wakeup gets interleaved
   6954 	 * with schedule on the ->pre_schedule() or idle_balance()
   6955 	 * point, either of which can * drop the rq lock.
   6956 	 *
   6957 	 * Also, during early boot the idle thread is in the fair class,
   6958 	 * for obvious reasons its a bad idea to schedule back to it.
   6959 	 */
   6960 	if (unlikely(!se->on_rq || curr == rq->idle))
   6961 		return;
   6962 
   6963 	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
   6964 		set_last_buddy(se);
   6965 }
   6966 
   6967 static struct task_struct *
   6968 pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
   6969 {
   6970 	struct cfs_rq *cfs_rq = &rq->cfs;
   6971 	struct sched_entity *se;
   6972 	struct task_struct *p;
   6973 	int new_tasks;
   6974 
   6975 again:
   6976 	if (!cfs_rq->nr_running)
   6977 		goto idle;
   6978 
   6979 #ifdef CONFIG_FAIR_GROUP_SCHED
   6980 	if (prev->sched_class != &fair_sched_class)
   6981 		goto simple;
   6982 
   6983 	/*
   6984 	 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
   6985 	 * likely that a next task is from the same cgroup as the current.
   6986 	 *
   6987 	 * Therefore attempt to avoid putting and setting the entire cgroup
   6988 	 * hierarchy, only change the part that actually changes.
   6989 	 */
   6990 
   6991 	do {
   6992 		struct sched_entity *curr = cfs_rq->curr;
   6993 
   6994 		/*
   6995 		 * Since we got here without doing put_prev_entity() we also
   6996 		 * have to consider cfs_rq->curr. If it is still a runnable
   6997 		 * entity, update_curr() will update its vruntime, otherwise
   6998 		 * forget we've ever seen it.
   6999 		 */
   7000 		if (curr) {
   7001 			if (curr->on_rq)
   7002 				update_curr(cfs_rq);
   7003 			else
   7004 				curr = NULL;
   7005 
   7006 			/*
   7007 			 * This call to check_cfs_rq_runtime() will do the
   7008 			 * throttle and dequeue its entity in the parent(s).
   7009 			 * Therefore the nr_running test will indeed
   7010 			 * be correct.
   7011 			 */
   7012 			if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
   7013 				cfs_rq = &rq->cfs;
   7014 
   7015 				if (!cfs_rq->nr_running)
   7016 					goto idle;
   7017 
   7018 				goto simple;
   7019 			}
   7020 		}
   7021 
   7022 		se = pick_next_entity(cfs_rq, curr);
   7023 		cfs_rq = group_cfs_rq(se);
   7024 	} while (cfs_rq);
   7025 
   7026 	p = task_of(se);
   7027 
   7028 	/*
   7029 	 * Since we haven't yet done put_prev_entity and if the selected task
   7030 	 * is a different task than we started out with, try and touch the
   7031 	 * least amount of cfs_rqs.
   7032 	 */
   7033 	if (prev != p) {
   7034 		struct sched_entity *pse = &prev->se;
   7035 
   7036 		while (!(cfs_rq = is_same_group(se, pse))) {
   7037 			int se_depth = se->depth;
   7038 			int pse_depth = pse->depth;
   7039 
   7040 			if (se_depth <= pse_depth) {
   7041 				put_prev_entity(cfs_rq_of(pse), pse);
   7042 				pse = parent_entity(pse);
   7043 			}
   7044 			if (se_depth >= pse_depth) {
   7045 				set_next_entity(cfs_rq_of(se), se);
   7046 				se = parent_entity(se);
   7047 			}
   7048 		}
   7049 
   7050 		put_prev_entity(cfs_rq, pse);
   7051 		set_next_entity(cfs_rq, se);
   7052 	}
   7053 
   7054 	goto done;
   7055 simple:
   7056 #endif
   7057 
   7058 	put_prev_task(rq, prev);
   7059 
   7060 	do {
   7061 		se = pick_next_entity(cfs_rq, NULL);
   7062 		set_next_entity(cfs_rq, se);
   7063 		cfs_rq = group_cfs_rq(se);
   7064 	} while (cfs_rq);
   7065 
   7066 	p = task_of(se);
   7067 
   7068 done: __maybe_unused;
   7069 #ifdef CONFIG_SMP
   7070 	/*
   7071 	 * Move the next running task to the front of
   7072 	 * the list, so our cfs_tasks list becomes MRU
   7073 	 * one.
   7074 	 */
   7075 	list_move(&p->se.group_node, &rq->cfs_tasks);
   7076 #endif
   7077 
   7078 	if (hrtick_enabled(rq))
   7079 		hrtick_start_fair(rq, p);
   7080 
   7081 	update_misfit_status(p, rq);
   7082 
   7083 	return p;
   7084 
   7085 idle:
   7086 	update_misfit_status(NULL, rq);
   7087 	new_tasks = idle_balance(rq, rf);
   7088 
   7089 	/*
   7090 	 * Because idle_balance() releases (and re-acquires) rq->lock, it is
   7091 	 * possible for any higher priority task to appear. In that case we
   7092 	 * must re-start the pick_next_entity() loop.
   7093 	 */
   7094 	if (new_tasks < 0)
   7095 		return RETRY_TASK;
   7096 
   7097 	if (new_tasks > 0)
   7098 		goto again;
   7099 
   7100 	/*
   7101 	 * rq is about to be idle, check if we need to update the
   7102 	 * lost_idle_time of clock_pelt
   7103 	 */
   7104 	update_idle_rq_clock_pelt(rq);
   7105 
   7106 	return NULL;
   7107 }
   7108 
   7109 /*
   7110  * Account for a descheduled task:
   7111  */
   7112 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
   7113 {
   7114 	struct sched_entity *se = &prev->se;
   7115 	struct cfs_rq *cfs_rq;
   7116 
   7117 	for_each_sched_entity(se) {
   7118 		cfs_rq = cfs_rq_of(se);
   7119 		put_prev_entity(cfs_rq, se);
   7120 	}
   7121 }
   7122 
   7123 /*
   7124  * sched_yield() is very simple
   7125  *
   7126  * The magic of dealing with the ->skip buddy is in pick_next_entity.
   7127  */
   7128 static void yield_task_fair(struct rq *rq)
   7129 {
   7130 	struct task_struct *curr = rq->curr;
   7131 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
   7132 	struct sched_entity *se = &curr->se;
   7133 
   7134 	/*
   7135 	 * Are we the only task in the tree?
   7136 	 */
   7137 	if (unlikely(rq->nr_running == 1))
   7138 		return;
   7139 
   7140 	clear_buddies(cfs_rq, se);
   7141 
   7142 	if (curr->policy != SCHED_BATCH) {
   7143 		update_rq_clock(rq);
   7144 		/*
   7145 		 * Update run-time statistics of the 'current'.
   7146 		 */
   7147 		update_curr(cfs_rq);
   7148 		/*
   7149 		 * Tell update_rq_clock() that we've just updated,
   7150 		 * so we don't do microscopic update in schedule()
   7151 		 * and double the fastpath cost.
   7152 		 */
   7153 		rq_clock_skip_update(rq);
   7154 	}
   7155 
   7156 	set_skip_buddy(se);
   7157 }
   7158 
   7159 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
   7160 {
   7161 	struct sched_entity *se = &p->se;
   7162 
   7163 	/* throttled hierarchies are not runnable */
   7164 	if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
   7165 		return false;
   7166 
   7167 	/* Tell the scheduler that we'd really like pse to run next. */
   7168 	set_next_buddy(se);
   7169 
   7170 	yield_task_fair(rq);
   7171 
   7172 	return true;
   7173 }
   7174 
   7175 #ifdef CONFIG_SMP
   7176 /**************************************************
   7177  * Fair scheduling class load-balancing methods.
   7178  *
   7179  * BASICS
   7180  *
   7181  * The purpose of load-balancing is to achieve the same basic fairness the
   7182  * per-CPU scheduler provides, namely provide a proportional amount of compute
   7183  * time to each task. This is expressed in the following equation:
   7184  *
   7185  *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1)
   7186  *
   7187  * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight
   7188  * W_i,0 is defined as:
   7189  *
   7190  *   W_i,0 = \Sum_j w_i,j                                             (2)
   7191  *
   7192  * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight
   7193  * is derived from the nice value as per sched_prio_to_weight[].
   7194  *
   7195  * The weight average is an exponential decay average of the instantaneous
   7196  * weight:
   7197  *
   7198  *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
   7199  *
   7200  * C_i is the compute capacity of CPU i, typically it is the
   7201  * fraction of 'recent' time available for SCHED_OTHER task execution. But it
   7202  * can also include other factors [XXX].
   7203  *
   7204  * To achieve this balance we define a measure of imbalance which follows
   7205  * directly from (1):
   7206  *
   7207  *   imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j }    (4)
   7208  *
   7209  * We them move tasks around to minimize the imbalance. In the continuous
   7210  * function space it is obvious this converges, in the discrete case we get
   7211  * a few fun cases generally called infeasible weight scenarios.
   7212  *
   7213  * [XXX expand on:
   7214  *     - infeasible weights;
   7215  *     - local vs global optima in the discrete case. ]
   7216  *
   7217  *
   7218  * SCHED DOMAINS
   7219  *
   7220  * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
   7221  * for all i,j solution, we create a tree of CPUs that follows the hardware
   7222  * topology where each level pairs two lower groups (or better). This results
   7223  * in O(log n) layers. Furthermore we reduce the number of CPUs going up the
   7224  * tree to only the first of the previous level and we decrease the frequency
   7225  * of load-balance at each level inv. proportional to the number of CPUs in
   7226  * the groups.
   7227  *
   7228  * This yields:
   7229  *
   7230  *     log_2 n     1     n
   7231  *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
   7232  *     i = 0      2^i   2^i
   7233  *                               `- size of each group
   7234  *         |         |     `- number of CPUs doing load-balance
   7235  *         |         `- freq
   7236  *         `- sum over all levels
   7237  *
   7238  * Coupled with a limit on how many tasks we can migrate every balance pass,
   7239  * this makes (5) the runtime complexity of the balancer.
   7240  *
   7241  * An important property here is that each CPU is still (indirectly) connected
   7242  * to every other CPU in at most O(log n) steps:
   7243  *
   7244  * The adjacency matrix of the resulting graph is given by:
   7245  *
   7246  *             log_2 n
   7247  *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
   7248  *             k = 0
   7249  *
   7250  * And you'll find that:
   7251  *
   7252  *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
   7253  *
   7254  * Showing there's indeed a path between every CPU in at most O(log n) steps.
   7255  * The task movement gives a factor of O(m), giving a convergence complexity
   7256  * of:
   7257  *
   7258  *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8)
   7259  *
   7260  *
   7261  * WORK CONSERVING
   7262  *
   7263  * In order to avoid CPUs going idle while there's still work to do, new idle
   7264  * balancing is more aggressive and has the newly idle CPU iterate up the domain
   7265  * tree itself instead of relying on other CPUs to bring it work.
   7266  *
   7267  * This adds some complexity to both (5) and (8) but it reduces the total idle
   7268  * time.
   7269  *
   7270  * [XXX more?]
   7271  *
   7272  *
   7273  * CGROUPS
   7274  *
   7275  * Cgroups make a horror show out of (2), instead of a simple sum we get:
   7276  *
   7277  *                                s_k,i
   7278  *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
   7279  *                                 S_k
   7280  *
   7281  * Where
   7282  *
   7283  *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
   7284  *
   7285  * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i.
   7286  *
   7287  * The big problem is S_k, its a global sum needed to compute a local (W_i)
   7288  * property.
   7289  *
   7290  * [XXX write more on how we solve this.. _after_ merging pjt's patches that
   7291  *      rewrite all of this once again.]
   7292  */
   7293 
   7294 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
   7295 
   7296 enum fbq_type { regular, remote, all };
   7297 
   7298 enum group_type {
   7299 	group_other = 0,
   7300 	group_misfit_task,
   7301 	group_imbalanced,
   7302 	group_overloaded,
   7303 };
   7304 
   7305 #define LBF_ALL_PINNED	0x01
   7306 #define LBF_NEED_BREAK	0x02
   7307 #define LBF_DST_PINNED  0x04
   7308 #define LBF_SOME_PINNED	0x08
   7309 #define LBF_NOHZ_STATS	0x10
   7310 #define LBF_NOHZ_AGAIN	0x20
   7311 
   7312 struct lb_env {
   7313 	struct sched_domain	*sd;
   7314 
   7315 	struct rq		*src_rq;
   7316 	int			src_cpu;
   7317 
   7318 	int			dst_cpu;
   7319 	struct rq		*dst_rq;
   7320 
   7321 	struct cpumask		*dst_grpmask;
   7322 	int			new_dst_cpu;
   7323 	enum cpu_idle_type	idle;
   7324 	long			imbalance;
   7325 	/* The set of CPUs under consideration for load-balancing */
   7326 	struct cpumask		*cpus;
   7327 
   7328 	unsigned int		flags;
   7329 
   7330 	unsigned int		loop;
   7331 	unsigned int		loop_break;
   7332 	unsigned int		loop_max;
   7333 
   7334 	enum fbq_type		fbq_type;
   7335 	enum group_type		src_grp_type;
   7336 	struct list_head	tasks;
   7337 };
   7338 
   7339 /*
   7340  * Is this task likely cache-hot:
   7341  */
   7342 static int task_hot(struct task_struct *p, struct lb_env *env)
   7343 {
   7344 	s64 delta;
   7345 
   7346 	lockdep_assert_held(&env->src_rq->lock);
   7347 
   7348 	if (p->sched_class != &fair_sched_class)
   7349 		return 0;
   7350 
   7351 	if (unlikely(task_has_idle_policy(p)))
   7352 		return 0;
   7353 
   7354 	/*
   7355 	 * Buddy candidates are cache hot:
   7356 	 */
   7357 	if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
   7358 			(&p->se == cfs_rq_of(&p->se)->next ||
   7359 			 &p->se == cfs_rq_of(&p->se)->last))
   7360 		return 1;
   7361 
   7362 	if (sysctl_sched_migration_cost == -1)
   7363 		return 1;
   7364 	if (sysctl_sched_migration_cost == 0)
   7365 		return 0;
   7366 
   7367 	delta = rq_clock_task(env->src_rq) - p->se.exec_start;
   7368 
   7369 	return delta < (s64)sysctl_sched_migration_cost;
   7370 }
   7371 
   7372 #ifdef CONFIG_NUMA_BALANCING
   7373 /*
   7374  * Returns 1, if task migration degrades locality
   7375  * Returns 0, if task migration improves locality i.e migration preferred.
   7376  * Returns -1, if task migration is not affected by locality.
   7377  */
   7378 static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
   7379 {
   7380 	struct numa_group *numa_group = rcu_dereference(p->numa_group);
   7381 	unsigned long src_weight, dst_weight;
   7382 	int src_nid, dst_nid, dist;
   7383 
   7384 	if (!static_branch_likely(&sched_numa_balancing))
   7385 		return -1;
   7386 
   7387 	if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
   7388 		return -1;
   7389 
   7390 	src_nid = cpu_to_node(env->src_cpu);
   7391 	dst_nid = cpu_to_node(env->dst_cpu);
   7392 
   7393 	if (src_nid == dst_nid)
   7394 		return -1;
   7395 
   7396 	/* Migrating away from the preferred node is always bad. */
   7397 	if (src_nid == p->numa_preferred_nid) {
   7398 		if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
   7399 			return 1;
   7400 		else
   7401 			return -1;
   7402 	}
   7403 
   7404 	/* Encourage migration to the preferred node. */
   7405 	if (dst_nid == p->numa_preferred_nid)
   7406 		return 0;
   7407 
   7408 	/* Leaving a core idle is often worse than degrading locality. */
   7409 	if (env->idle == CPU_IDLE)
   7410 		return -1;
   7411 
   7412 	dist = node_distance(src_nid, dst_nid);
   7413 	if (numa_group) {
   7414 		src_weight = group_weight(p, src_nid, dist);
   7415 		dst_weight = group_weight(p, dst_nid, dist);
   7416 	} else {
   7417 		src_weight = task_weight(p, src_nid, dist);
   7418 		dst_weight = task_weight(p, dst_nid, dist);
   7419 	}
   7420 
   7421 	return dst_weight < src_weight;
   7422 }
   7423 
   7424 #else
   7425 static inline int migrate_degrades_locality(struct task_struct *p,
   7426 					     struct lb_env *env)
   7427 {
   7428 	return -1;
   7429 }
   7430 #endif
   7431 
   7432 /*
   7433  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
   7434  */
   7435 static
   7436 int can_migrate_task(struct task_struct *p, struct lb_env *env)
   7437 {
   7438 	int tsk_cache_hot;
   7439 
   7440 	lockdep_assert_held(&env->src_rq->lock);
   7441 
   7442 	/*
   7443 	 * We do not migrate tasks that are:
   7444 	 * 1) throttled_lb_pair, or
   7445 	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
   7446 	 * 3) running (obviously), or
   7447 	 * 4) are cache-hot on their current CPU.
   7448 	 */
   7449 	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
   7450 		return 0;
   7451 
   7452 	if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) {
   7453 		int cpu;
   7454 
   7455 		schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
   7456 
   7457 		env->flags |= LBF_SOME_PINNED;
   7458 
   7459 		/*
   7460 		 * Remember if this task can be migrated to any other CPU in
   7461 		 * our sched_group. We may want to revisit it if we couldn't
   7462 		 * meet load balance goals by pulling other tasks on src_cpu.
   7463 		 *
   7464 		 * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
   7465 		 * already computed one in current iteration.
   7466 		 */
   7467 		if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
   7468 			return 0;
   7469 
   7470 		/* Prevent to re-select dst_cpu via env's CPUs: */
   7471 		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
   7472 			if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
   7473 				env->flags |= LBF_DST_PINNED;
   7474 				env->new_dst_cpu = cpu;
   7475 				break;
   7476 			}
   7477 		}
   7478 
   7479 		return 0;
   7480 	}
   7481 
   7482 	/* Record that we found atleast one task that could run on dst_cpu */
   7483 	env->flags &= ~LBF_ALL_PINNED;
   7484 
   7485 	if (task_running(env->src_rq, p)) {
   7486 		schedstat_inc(p->se.statistics.nr_failed_migrations_running);
   7487 		return 0;
   7488 	}
   7489 
   7490 	/*
   7491 	 * Aggressive migration if:
   7492 	 * 1) destination numa is preferred
   7493 	 * 2) task is cache cold, or
   7494 	 * 3) too many balance attempts have failed.
   7495 	 */
   7496 	tsk_cache_hot = migrate_degrades_locality(p, env);
   7497 	if (tsk_cache_hot == -1)
   7498 		tsk_cache_hot = task_hot(p, env);
   7499 
   7500 	if (tsk_cache_hot <= 0 ||
   7501 	    env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
   7502 		if (tsk_cache_hot == 1) {
   7503 			schedstat_inc(env->sd->lb_hot_gained[env->idle]);
   7504 			schedstat_inc(p->se.statistics.nr_forced_migrations);
   7505 		}
   7506 		return 1;
   7507 	}
   7508 
   7509 	schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
   7510 	return 0;
   7511 }
   7512 
   7513 /*
   7514  * detach_task() -- detach the task for the migration specified in env
   7515  */
   7516 static void detach_task(struct task_struct *p, struct lb_env *env)
   7517 {
   7518 	lockdep_assert_held(&env->src_rq->lock);
   7519 
   7520 	p->on_rq = TASK_ON_RQ_MIGRATING;
   7521 	deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
   7522 	set_task_cpu(p, env->dst_cpu);
   7523 }
   7524 
   7525 /*
   7526  * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
   7527  * part of active balancing operations within "domain".
   7528  *
   7529  * Returns a task if successful and NULL otherwise.
   7530  */
   7531 static struct task_struct *detach_one_task(struct lb_env *env)
   7532 {
   7533 	struct task_struct *p;
   7534 
   7535 	lockdep_assert_held(&env->src_rq->lock);
   7536 
   7537 	list_for_each_entry_reverse(p,
   7538 			&env->src_rq->cfs_tasks, se.group_node) {
   7539 		if (!can_migrate_task(p, env))
   7540 			continue;
   7541 
   7542 		detach_task(p, env);
   7543 
   7544 		/*
   7545 		 * Right now, this is only the second place where
   7546 		 * lb_gained[env->idle] is updated (other is detach_tasks)
   7547 		 * so we can safely collect stats here rather than
   7548 		 * inside detach_tasks().
   7549 		 */
   7550 		schedstat_inc(env->sd->lb_gained[env->idle]);
   7551 		return p;
   7552 	}
   7553 	return NULL;
   7554 }
   7555 
   7556 static const unsigned int sched_nr_migrate_break = 32;
   7557 
   7558 /*
   7559  * detach_tasks() -- tries to detach up to imbalance weighted load from
   7560  * busiest_rq, as part of a balancing operation within domain "sd".
   7561  *
   7562  * Returns number of detached tasks if successful and 0 otherwise.
   7563  */
   7564 static int detach_tasks(struct lb_env *env)
   7565 {
   7566 	struct list_head *tasks = &env->src_rq->cfs_tasks;
   7567 	struct task_struct *p;
   7568 	unsigned long load;
   7569 	int detached = 0;
   7570 
   7571 	lockdep_assert_held(&env->src_rq->lock);
   7572 
   7573 	if (env->imbalance <= 0)
   7574 		return 0;
   7575 
   7576 	while (!list_empty(tasks)) {
   7577 		/*
   7578 		 * We don't want to steal all, otherwise we may be treated likewise,
   7579 		 * which could at worst lead to a livelock crash.
   7580 		 */
   7581 		if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
   7582 			break;
   7583 
   7584 		p = list_last_entry(tasks, struct task_struct, se.group_node);
   7585 
   7586 		env->loop++;
   7587 		/* We've more or less seen every task there is, call it quits */
   7588 		if (env->loop > env->loop_max)
   7589 			break;
   7590 
   7591 		/* take a breather every nr_migrate tasks */
   7592 		if (env->loop > env->loop_break) {
   7593 			env->loop_break += sched_nr_migrate_break;
   7594 			env->flags |= LBF_NEED_BREAK;
   7595 			break;
   7596 		}
   7597 
   7598 		if (!can_migrate_task(p, env))
   7599 			goto next;
   7600 
   7601 		load = task_h_load(p);
   7602 
   7603 		if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
   7604 			goto next;
   7605 
   7606 		if ((load / 2) > env->imbalance)
   7607 			goto next;
   7608 
   7609 		detach_task(p, env);
   7610 		list_add(&p->se.group_node, &env->tasks);
   7611 
   7612 		detached++;
   7613 		env->imbalance -= load;
   7614 
   7615 #ifdef CONFIG_PREEMPT
   7616 		/*
   7617 		 * NEWIDLE balancing is a source of latency, so preemptible
   7618 		 * kernels will stop after the first task is detached to minimize
   7619 		 * the critical section.
   7620 		 */
   7621 		if (env->idle == CPU_NEWLY_IDLE)
   7622 			break;
   7623 #endif
   7624 
   7625 		/*
   7626 		 * We only want to steal up to the prescribed amount of
   7627 		 * weighted load.
   7628 		 */
   7629 		if (env->imbalance <= 0)
   7630 			break;
   7631 
   7632 		continue;
   7633 next:
   7634 		list_move(&p->se.group_node, tasks);
   7635 	}
   7636 
   7637 	/*
   7638 	 * Right now, this is one of only two places we collect this stat
   7639 	 * so we can safely collect detach_one_task() stats here rather
   7640 	 * than inside detach_one_task().
   7641 	 */
   7642 	schedstat_add(env->sd->lb_gained[env->idle], detached);
   7643 
   7644 	return detached;
   7645 }
   7646 
   7647 /*
   7648  * attach_task() -- attach the task detached by detach_task() to its new rq.
   7649  */
   7650 static void attach_task(struct rq *rq, struct task_struct *p)
   7651 {
   7652 	lockdep_assert_held(&rq->lock);
   7653 
   7654 	BUG_ON(task_rq(p) != rq);
   7655 	activate_task(rq, p, ENQUEUE_NOCLOCK);
   7656 	p->on_rq = TASK_ON_RQ_QUEUED;
   7657 	check_preempt_curr(rq, p, 0);
   7658 }
   7659 
   7660 /*
   7661  * attach_one_task() -- attaches the task returned from detach_one_task() to
   7662  * its new rq.
   7663  */
   7664 static void attach_one_task(struct rq *rq, struct task_struct *p)
   7665 {
   7666 	struct rq_flags rf;
   7667 
   7668 	rq_lock(rq, &rf);
   7669 	update_rq_clock(rq);
   7670 	attach_task(rq, p);
   7671 	rq_unlock(rq, &rf);
   7672 }
   7673 
   7674 /*
   7675  * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
   7676  * new rq.
   7677  */
   7678 static void attach_tasks(struct lb_env *env)
   7679 {
   7680 	struct list_head *tasks = &env->tasks;
   7681 	struct task_struct *p;
   7682 	struct rq_flags rf;
   7683 
   7684 	rq_lock(env->dst_rq, &rf);
   7685 	update_rq_clock(env->dst_rq);
   7686 
   7687 	while (!list_empty(tasks)) {
   7688 		p = list_first_entry(tasks, struct task_struct, se.group_node);
   7689 		list_del_init(&p->se.group_node);
   7690 
   7691 		attach_task(env->dst_rq, p);
   7692 	}
   7693 
   7694 	rq_unlock(env->dst_rq, &rf);
   7695 }
   7696 
   7697 static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
   7698 {
   7699 	if (cfs_rq->avg.load_avg)
   7700 		return true;
   7701 
   7702 	if (cfs_rq->avg.util_avg)
   7703 		return true;
   7704 
   7705 	return false;
   7706 }
   7707 
   7708 static inline bool others_have_blocked(struct rq *rq)
   7709 {
   7710 	if (READ_ONCE(rq->avg_rt.util_avg))
   7711 		return true;
   7712 
   7713 	if (READ_ONCE(rq->avg_dl.util_avg))
   7714 		return true;
   7715 
   7716 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
   7717 	if (READ_ONCE(rq->avg_irq.util_avg))
   7718 		return true;
   7719 #endif
   7720 
   7721 	return false;
   7722 }
   7723 
   7724 #ifdef CONFIG_FAIR_GROUP_SCHED
   7725 
   7726 static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
   7727 {
   7728 	if (cfs_rq->load.weight)
   7729 		return false;
   7730 
   7731 	if (cfs_rq->avg.load_sum)
   7732 		return false;
   7733 
   7734 	if (cfs_rq->avg.util_sum)
   7735 		return false;
   7736 
   7737 	if (cfs_rq->avg.runnable_load_sum)
   7738 		return false;
   7739 
   7740 	return true;
   7741 }
   7742 
   7743 static void update_blocked_averages(int cpu)
   7744 {
   7745 	struct rq *rq = cpu_rq(cpu);
   7746 	struct cfs_rq *cfs_rq, *pos;
   7747 	const struct sched_class *curr_class;
   7748 	struct rq_flags rf;
   7749 	bool done = true;
   7750 
   7751 	rq_lock_irqsave(rq, &rf);
   7752 	update_rq_clock(rq);
   7753 
   7754 	/*
   7755 	 * Iterates the task_group tree in a bottom up fashion, see
   7756 	 * list_add_leaf_cfs_rq() for details.
   7757 	 */
   7758 	for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
   7759 		struct sched_entity *se;
   7760 
   7761 		if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
   7762 			update_tg_load_avg(cfs_rq, 0);
   7763 
   7764 		/* Propagate pending load changes to the parent, if any: */
   7765 		se = cfs_rq->tg->se[cpu];
   7766 		if (se && !skip_blocked_update(se))
   7767 			update_load_avg(cfs_rq_of(se), se, 0);
   7768 
   7769 		/*
   7770 		 * There can be a lot of idle CPU cgroups.  Don't let fully
   7771 		 * decayed cfs_rqs linger on the list.
   7772 		 */
   7773 		if (cfs_rq_is_decayed(cfs_rq))
   7774 			list_del_leaf_cfs_rq(cfs_rq);
   7775 
   7776 		/* Don't need periodic decay once load/util_avg are null */
   7777 		if (cfs_rq_has_blocked(cfs_rq))
   7778 			done = false;
   7779 	}
   7780 
   7781 	curr_class = rq->curr->sched_class;
   7782 	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
   7783 	update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
   7784 	update_irq_load_avg(rq, 0);
   7785 	/* Don't need periodic decay once load/util_avg are null */
   7786 	if (others_have_blocked(rq))
   7787 		done = false;
   7788 
   7789 #ifdef CONFIG_NO_HZ_COMMON
   7790 	rq->last_blocked_load_update_tick = jiffies;
   7791 	if (done)
   7792 		rq->has_blocked_load = 0;
   7793 #endif
   7794 	rq_unlock_irqrestore(rq, &rf);
   7795 }
   7796 
   7797 /*
   7798  * Compute the hierarchical load factor for cfs_rq and all its ascendants.
   7799  * This needs to be done in a top-down fashion because the load of a child
   7800  * group is a fraction of its parents load.
   7801  */
   7802 static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
   7803 {
   7804 	struct rq *rq = rq_of(cfs_rq);
   7805 	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
   7806 	unsigned long now = jiffies;
   7807 	unsigned long load;
   7808 
   7809 	if (cfs_rq->last_h_load_update == now)
   7810 		return;
   7811 
   7812 	WRITE_ONCE(cfs_rq->h_load_next, NULL);
   7813 	for_each_sched_entity(se) {
   7814 		cfs_rq = cfs_rq_of(se);
   7815 		WRITE_ONCE(cfs_rq->h_load_next, se);
   7816 		if (cfs_rq->last_h_load_update == now)
   7817 			break;
   7818 	}
   7819 
   7820 	if (!se) {
   7821 		cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
   7822 		cfs_rq->last_h_load_update = now;
   7823 	}
   7824 
   7825 	while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
   7826 		load = cfs_rq->h_load;
   7827 		load = div64_ul(load * se->avg.load_avg,
   7828 			cfs_rq_load_avg(cfs_rq) + 1);
   7829 		cfs_rq = group_cfs_rq(se);
   7830 		cfs_rq->h_load = load;
   7831 		cfs_rq->last_h_load_update = now;
   7832 	}
   7833 }
   7834 
   7835 static unsigned long task_h_load(struct task_struct *p)
   7836 {
   7837 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
   7838 
   7839 	update_cfs_rq_h_load(cfs_rq);
   7840 	return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
   7841 			cfs_rq_load_avg(cfs_rq) + 1);
   7842 }
   7843 #else
   7844 static inline void update_blocked_averages(int cpu)
   7845 {
   7846 	struct rq *rq = cpu_rq(cpu);
   7847 	struct cfs_rq *cfs_rq = &rq->cfs;
   7848 	const struct sched_class *curr_class;
   7849 	struct rq_flags rf;
   7850 
   7851 	rq_lock_irqsave(rq, &rf);
   7852 	update_rq_clock(rq);
   7853 	update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
   7854 
   7855 	curr_class = rq->curr->sched_class;
   7856 	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
   7857 	update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
   7858 	update_irq_load_avg(rq, 0);
   7859 #ifdef CONFIG_NO_HZ_COMMON
   7860 	rq->last_blocked_load_update_tick = jiffies;
   7861 	if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq))
   7862 		rq->has_blocked_load = 0;
   7863 #endif
   7864 	rq_unlock_irqrestore(rq, &rf);
   7865 }
   7866 
   7867 static unsigned long task_h_load(struct task_struct *p)
   7868 {
   7869 	return p->se.avg.load_avg;
   7870 }
   7871 #endif
   7872 
   7873 /********** Helpers for find_busiest_group ************************/
   7874 
   7875 /*
   7876  * sg_lb_stats - stats of a sched_group required for load_balancing
   7877  */
   7878 struct sg_lb_stats {
   7879 	unsigned long avg_load; /*Avg load across the CPUs of the group */
   7880 	unsigned long group_load; /* Total load over the CPUs of the group */
   7881 	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
   7882 	unsigned long load_per_task;
   7883 	unsigned long group_capacity;
   7884 	unsigned long group_util; /* Total utilization of the group */
   7885 	unsigned int sum_nr_running; /* Nr tasks running in the group */
   7886 	unsigned int idle_cpus;
   7887 	unsigned int group_weight;
   7888 	enum group_type group_type;
   7889 	int group_no_capacity;
   7890 	unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
   7891 #ifdef CONFIG_NUMA_BALANCING
   7892 	unsigned int nr_numa_running;
   7893 	unsigned int nr_preferred_running;
   7894 #endif
   7895 };
   7896 
   7897 /*
   7898  * sd_lb_stats - Structure to store the statistics of a sched_domain
   7899  *		 during load balancing.
   7900  */
   7901 struct sd_lb_stats {
   7902 	struct sched_group *busiest;	/* Busiest group in this sd */
   7903 	struct sched_group *local;	/* Local group in this sd */
   7904 	unsigned long total_running;
   7905 	unsigned long total_load;	/* Total load of all groups in sd */
   7906 	unsigned long total_capacity;	/* Total capacity of all groups in sd */
   7907 	unsigned long avg_load;	/* Average load across all groups in sd */
   7908 
   7909 	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
   7910 	struct sg_lb_stats local_stat;	/* Statistics of the local group */
   7911 };
   7912 
   7913 static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
   7914 {
   7915 	/*
   7916 	 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
   7917 	 * local_stat because update_sg_lb_stats() does a full clear/assignment.
   7918 	 * We must however clear busiest_stat::avg_load because
   7919 	 * update_sd_pick_busiest() reads this before assignment.
   7920 	 */
   7921 	*sds = (struct sd_lb_stats){
   7922 		.busiest = NULL,
   7923 		.local = NULL,
   7924 		.total_running = 0UL,
   7925 		.total_load = 0UL,
   7926 		.total_capacity = 0UL,
   7927 		.busiest_stat = {
   7928 			.avg_load = 0UL,
   7929 			.sum_nr_running = 0,
   7930 			.group_type = group_other,
   7931 		},
   7932 	};
   7933 }
   7934 
   7935 /**
   7936  * get_sd_load_idx - Obtain the load index for a given sched domain.
   7937  * @sd: The sched_domain whose load_idx is to be obtained.
   7938  * @idle: The idle status of the CPU for whose sd load_idx is obtained.
   7939  *
   7940  * Return: The load index.
   7941  */
   7942 static inline int get_sd_load_idx(struct sched_domain *sd,
   7943 					enum cpu_idle_type idle)
   7944 {
   7945 	int load_idx;
   7946 
   7947 	switch (idle) {
   7948 	case CPU_NOT_IDLE:
   7949 		load_idx = sd->busy_idx;
   7950 		break;
   7951 
   7952 	case CPU_NEWLY_IDLE:
   7953 		load_idx = sd->newidle_idx;
   7954 		break;
   7955 	default:
   7956 		load_idx = sd->idle_idx;
   7957 		break;
   7958 	}
   7959 
   7960 	return load_idx;
   7961 }
   7962 
   7963 static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
   7964 {
   7965 	struct rq *rq = cpu_rq(cpu);
   7966 	unsigned long max = arch_scale_cpu_capacity(sd, cpu);
   7967 	unsigned long used, free;
   7968 	unsigned long irq;
   7969 
   7970 	irq = cpu_util_irq(rq);
   7971 
   7972 	if (unlikely(irq >= max))
   7973 		return 1;
   7974 
   7975 	used = READ_ONCE(rq->avg_rt.util_avg);
   7976 	used += READ_ONCE(rq->avg_dl.util_avg);
   7977 
   7978 	if (unlikely(used >= max))
   7979 		return 1;
   7980 
   7981 	free = max - used;
   7982 
   7983 	return scale_irq_capacity(free, irq, max);
   7984 }
   7985 
   7986 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
   7987 {
   7988 	unsigned long capacity = scale_rt_capacity(sd, cpu);
   7989 	struct sched_group *sdg = sd->groups;
   7990 
   7991 	cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu);
   7992 
   7993 	if (!capacity)
   7994 		capacity = 1;
   7995 
   7996 	cpu_rq(cpu)->cpu_capacity = capacity;
   7997 	sdg->sgc->capacity = capacity;
   7998 	sdg->sgc->min_capacity = capacity;
   7999 	sdg->sgc->max_capacity = capacity;
   8000 }
   8001 
   8002 void update_group_capacity(struct sched_domain *sd, int cpu)
   8003 {
   8004 	struct sched_domain *child = sd->child;
   8005 	struct sched_group *group, *sdg = sd->groups;
   8006 	unsigned long capacity, min_capacity, max_capacity;
   8007 	unsigned long interval;
   8008 
   8009 	interval = msecs_to_jiffies(sd->balance_interval);
   8010 	interval = clamp(interval, 1UL, max_load_balance_interval);
   8011 	sdg->sgc->next_update = jiffies + interval;
   8012 
   8013 	if (!child) {
   8014 		update_cpu_capacity(sd, cpu);
   8015 		return;
   8016 	}
   8017 
   8018 	capacity = 0;
   8019 	min_capacity = ULONG_MAX;
   8020 	max_capacity = 0;
   8021 
   8022 	if (child->flags & SD_OVERLAP) {
   8023 		/*
   8024 		 * SD_OVERLAP domains cannot assume that child groups
   8025 		 * span the current group.
   8026 		 */
   8027 
   8028 		for_each_cpu(cpu, sched_group_span(sdg)) {
   8029 			struct sched_group_capacity *sgc;
   8030 			struct rq *rq = cpu_rq(cpu);
   8031 
   8032 			/*
   8033 			 * build_sched_domains() -> init_sched_groups_capacity()
   8034 			 * gets here before we've attached the domains to the
   8035 			 * runqueues.
   8036 			 *
   8037 			 * Use capacity_of(), which is set irrespective of domains
   8038 			 * in update_cpu_capacity().
   8039 			 *
   8040 			 * This avoids capacity from being 0 and
   8041 			 * causing divide-by-zero issues on boot.
   8042 			 */
   8043 			if (unlikely(!rq->sd)) {
   8044 				capacity += capacity_of(cpu);
   8045 			} else {
   8046 				sgc = rq->sd->groups->sgc;
   8047 				capacity += sgc->capacity;
   8048 			}
   8049 
   8050 			min_capacity = min(capacity, min_capacity);
   8051 			max_capacity = max(capacity, max_capacity);
   8052 		}
   8053 	} else  {
   8054 		/*
   8055 		 * !SD_OVERLAP domains can assume that child groups
   8056 		 * span the current group.
   8057 		 */
   8058 
   8059 		group = child->groups;
   8060 		do {
   8061 			struct sched_group_capacity *sgc = group->sgc;
   8062 
   8063 			capacity += sgc->capacity;
   8064 			min_capacity = min(sgc->min_capacity, min_capacity);
   8065 			max_capacity = max(sgc->max_capacity, max_capacity);
   8066 			group = group->next;
   8067 		} while (group != child->groups);
   8068 	}
   8069 
   8070 	sdg->sgc->capacity = capacity;
   8071 	sdg->sgc->min_capacity = min_capacity;
   8072 	sdg->sgc->max_capacity = max_capacity;
   8073 }
   8074 
   8075 /*
   8076  * Check whether the capacity of the rq has been noticeably reduced by side
   8077  * activity. The imbalance_pct is used for the threshold.
   8078  * Return true is the capacity is reduced
   8079  */
   8080 static inline int
   8081 check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
   8082 {
   8083 	return ((rq->cpu_capacity * sd->imbalance_pct) <
   8084 				(rq->cpu_capacity_orig * 100));
   8085 }
   8086 
   8087 /*
   8088  * Check whether a rq has a misfit task and if it looks like we can actually
   8089  * help that task: we can migrate the task to a CPU of higher capacity, or
   8090  * the task's current CPU is heavily pressured.
   8091  */
   8092 static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
   8093 {
   8094 	return rq->misfit_task_load &&
   8095 		(rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
   8096 		 check_cpu_capacity(rq, sd));
   8097 }
   8098 
   8099 /*
   8100  * Group imbalance indicates (and tries to solve) the problem where balancing
   8101  * groups is inadequate due to ->cpus_allowed constraints.
   8102  *
   8103  * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
   8104  * cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
   8105  * Something like:
   8106  *
   8107  *	{ 0 1 2 3 } { 4 5 6 7 }
   8108  *	        *     * * *
   8109  *
   8110  * If we were to balance group-wise we'd place two tasks in the first group and
   8111  * two tasks in the second group. Clearly this is undesired as it will overload
   8112  * cpu 3 and leave one of the CPUs in the second group unused.
   8113  *
   8114  * The current solution to this issue is detecting the skew in the first group
   8115  * by noticing the lower domain failed to reach balance and had difficulty
   8116  * moving tasks due to affinity constraints.
   8117  *
   8118  * When this is so detected; this group becomes a candidate for busiest; see
   8119  * update_sd_pick_busiest(). And calculate_imbalance() and
   8120  * find_busiest_group() avoid some of the usual balance conditions to allow it
   8121  * to create an effective group imbalance.
   8122  *
   8123  * This is a somewhat tricky proposition since the next run might not find the
   8124  * group imbalance and decide the groups need to be balanced again. A most
   8125  * subtle and fragile situation.
   8126  */
   8127 
   8128 static inline int sg_imbalanced(struct sched_group *group)
   8129 {
   8130 	return group->sgc->imbalance;
   8131 }
   8132 
   8133 /*
   8134  * group_has_capacity returns true if the group has spare capacity that could
   8135  * be used by some tasks.
   8136  * We consider that a group has spare capacity if the  * number of task is
   8137  * smaller than the number of CPUs or if the utilization is lower than the
   8138  * available capacity for CFS tasks.
   8139  * For the latter, we use a threshold to stabilize the state, to take into
   8140  * account the variance of the tasks' load and to return true if the available
   8141  * capacity in meaningful for the load balancer.
   8142  * As an example, an available capacity of 1% can appear but it doesn't make
   8143  * any benefit for the load balance.
   8144  */
   8145 static inline bool
   8146 group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
   8147 {
   8148 	if (sgs->sum_nr_running < sgs->group_weight)
   8149 		return true;
   8150 
   8151 	if ((sgs->group_capacity * 100) >
   8152 			(sgs->group_util * env->sd->imbalance_pct))
   8153 		return true;
   8154 
   8155 	return false;
   8156 }
   8157 
   8158 /*
   8159  *  group_is_overloaded returns true if the group has more tasks than it can
   8160  *  handle.
   8161  *  group_is_overloaded is not equals to !group_has_capacity because a group
   8162  *  with the exact right number of tasks, has no more spare capacity but is not
   8163  *  overloaded so both group_has_capacity and group_is_overloaded return
   8164  *  false.
   8165  */
   8166 static inline bool
   8167 group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
   8168 {
   8169 	if (sgs->sum_nr_running <= sgs->group_weight)
   8170 		return false;
   8171 
   8172 	if ((sgs->group_capacity * 100) <
   8173 			(sgs->group_util * env->sd->imbalance_pct))
   8174 		return true;
   8175 
   8176 	return false;
   8177 }
   8178 
   8179 /*
   8180  * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller
   8181  * per-CPU capacity than sched_group ref.
   8182  */
   8183 static inline bool
   8184 group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
   8185 {
   8186 	return sg->sgc->min_capacity * capacity_margin <
   8187 						ref->sgc->min_capacity * 1024;
   8188 }
   8189 
   8190 /*
   8191  * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller
   8192  * per-CPU capacity_orig than sched_group ref.
   8193  */
   8194 static inline bool
   8195 group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
   8196 {
   8197 	return sg->sgc->max_capacity * capacity_margin <
   8198 						ref->sgc->max_capacity * 1024;
   8199 }
   8200 
   8201 static inline enum
   8202 group_type group_classify(struct sched_group *group,
   8203 			  struct sg_lb_stats *sgs)
   8204 {
   8205 	if (sgs->group_no_capacity)
   8206 		return group_overloaded;
   8207 
   8208 	if (sg_imbalanced(group))
   8209 		return group_imbalanced;
   8210 
   8211 	if (sgs->group_misfit_task_load)
   8212 		return group_misfit_task;
   8213 
   8214 	return group_other;
   8215 }
   8216 
   8217 static bool update_nohz_stats(struct rq *rq, bool force)
   8218 {
   8219 #ifdef CONFIG_NO_HZ_COMMON
   8220 	unsigned int cpu = rq->cpu;
   8221 
   8222 	if (!rq->has_blocked_load)
   8223 		return false;
   8224 
   8225 	if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
   8226 		return false;
   8227 
   8228 	if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
   8229 		return true;
   8230 
   8231 	update_blocked_averages(cpu);
   8232 
   8233 	return rq->has_blocked_load;
   8234 #else
   8235 	return false;
   8236 #endif
   8237 }
   8238 
   8239 /**
   8240  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
   8241  * @env: The load balancing environment.
   8242  * @group: sched_group whose statistics are to be updated.
   8243  * @sgs: variable to hold the statistics for this group.
   8244  * @sg_status: Holds flag indicating the status of the sched_group
   8245  */
   8246 static inline void update_sg_lb_stats(struct lb_env *env,
   8247 				      struct sched_group *group,
   8248 				      struct sg_lb_stats *sgs,
   8249 				      int *sg_status)
   8250 {
   8251 	int local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
   8252 	int load_idx = get_sd_load_idx(env->sd, env->idle);
   8253 	unsigned long load;
   8254 	int i, nr_running;
   8255 
   8256 	memset(sgs, 0, sizeof(*sgs));
   8257 
   8258 	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
   8259 		struct rq *rq = cpu_rq(i);
   8260 
   8261 		if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
   8262 			env->flags |= LBF_NOHZ_AGAIN;
   8263 
   8264 		/* Bias balancing toward CPUs of our domain: */
   8265 		if (local_group)
   8266 			load = target_load(i, load_idx);
   8267 		else
   8268 			load = source_load(i, load_idx);
   8269 
   8270 		sgs->group_load += load;
   8271 		sgs->group_util += cpu_util(i);
   8272 		sgs->sum_nr_running += rq->cfs.h_nr_running;
   8273 
   8274 		nr_running = rq->nr_running;
   8275 		if (nr_running > 1)
   8276 			*sg_status |= SG_OVERLOAD;
   8277 
   8278 		if (cpu_overutilized(i))
   8279 			*sg_status |= SG_OVERUTILIZED;
   8280 
   8281 #ifdef CONFIG_NUMA_BALANCING
   8282 		sgs->nr_numa_running += rq->nr_numa_running;
   8283 		sgs->nr_preferred_running += rq->nr_preferred_running;
   8284 #endif
   8285 		sgs->sum_weighted_load += weighted_cpuload(rq);
   8286 		/*
   8287 		 * No need to call idle_cpu() if nr_running is not 0
   8288 		 */
   8289 		if (!nr_running && idle_cpu(i))
   8290 			sgs->idle_cpus++;
   8291 
   8292 		if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
   8293 		    sgs->group_misfit_task_load < rq->misfit_task_load) {
   8294 			sgs->group_misfit_task_load = rq->misfit_task_load;
   8295 			*sg_status |= SG_OVERLOAD;
   8296 		}
   8297 	}
   8298 
   8299 	/* Adjust by relative CPU capacity of the group */
   8300 	sgs->group_capacity = group->sgc->capacity;
   8301 	sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
   8302 
   8303 	if (sgs->sum_nr_running)
   8304 		sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
   8305 
   8306 	sgs->group_weight = group->group_weight;
   8307 
   8308 	sgs->group_no_capacity = group_is_overloaded(env, sgs);
   8309 	sgs->group_type = group_classify(group, sgs);
   8310 }
   8311 
   8312 /**
   8313  * update_sd_pick_busiest - return 1 on busiest group
   8314  * @env: The load balancing environment.
   8315  * @sds: sched_domain statistics
   8316  * @sg: sched_group candidate to be checked for being the busiest
   8317  * @sgs: sched_group statistics
   8318  *
   8319  * Determine if @sg is a busier group than the previously selected
   8320  * busiest group.
   8321  *
   8322  * Return: %true if @sg is a busier group than the previously selected
   8323  * busiest group. %false otherwise.
   8324  */
   8325 static bool update_sd_pick_busiest(struct lb_env *env,
   8326 				   struct sd_lb_stats *sds,
   8327 				   struct sched_group *sg,
   8328 				   struct sg_lb_stats *sgs)
   8329 {
   8330 	struct sg_lb_stats *busiest = &sds->busiest_stat;
   8331 
   8332 	/*
   8333 	 * Don't try to pull misfit tasks we can't help.
   8334 	 * We can use max_capacity here as reduction in capacity on some
   8335 	 * CPUs in the group should either be possible to resolve
   8336 	 * internally or be covered by avg_load imbalance (eventually).
   8337 	 */
   8338 	if (sgs->group_type == group_misfit_task &&
   8339 	    (!group_smaller_max_cpu_capacity(sg, sds->local) ||
   8340 	     !group_has_capacity(env, &sds->local_stat)))
   8341 		return false;
   8342 
   8343 	if (sgs->group_type > busiest->group_type)
   8344 		return true;
   8345 
   8346 	if (sgs->group_type < busiest->group_type)
   8347 		return false;
   8348 
   8349 	if (sgs->avg_load <= busiest->avg_load)
   8350 		return false;
   8351 
   8352 	if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
   8353 		goto asym_packing;
   8354 
   8355 	/*
   8356 	 * Candidate sg has no more than one task per CPU and
   8357 	 * has higher per-CPU capacity. Migrating tasks to less
   8358 	 * capable CPUs may harm throughput. Maximize throughput,
   8359 	 * power/energy consequences are not considered.
   8360 	 */
   8361 	if (sgs->sum_nr_running <= sgs->group_weight &&
   8362 	    group_smaller_min_cpu_capacity(sds->local, sg))
   8363 		return false;
   8364 
   8365 	/*
   8366 	 * If we have more than one misfit sg go with the biggest misfit.
   8367 	 */
   8368 	if (sgs->group_type == group_misfit_task &&
   8369 	    sgs->group_misfit_task_load < busiest->group_misfit_task_load)
   8370 		return false;
   8371 
   8372 asym_packing:
   8373 	/* This is the busiest node in its class. */
   8374 	if (!(env->sd->flags & SD_ASYM_PACKING))
   8375 		return true;
   8376 
   8377 	/* No ASYM_PACKING if target CPU is already busy */
   8378 	if (env->idle == CPU_NOT_IDLE)
   8379 		return true;
   8380 	/*
   8381 	 * ASYM_PACKING needs to move all the work to the highest
   8382 	 * prority CPUs in the group, therefore mark all groups
   8383 	 * of lower priority than ourself as busy.
   8384 	 */
   8385 	if (sgs->sum_nr_running &&
   8386 	    sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
   8387 		if (!sds->busiest)
   8388 			return true;
   8389 
   8390 		/* Prefer to move from lowest priority CPU's work */
   8391 		if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
   8392 				      sg->asym_prefer_cpu))
   8393 			return true;
   8394 	}
   8395 
   8396 	return false;
   8397 }
   8398 
   8399 #ifdef CONFIG_NUMA_BALANCING
   8400 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
   8401 {
   8402 	if (sgs->sum_nr_running > sgs->nr_numa_running)
   8403 		return regular;
   8404 	if (sgs->sum_nr_running > sgs->nr_preferred_running)
   8405 		return remote;
   8406 	return all;
   8407 }
   8408 
   8409 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
   8410 {
   8411 	if (rq->nr_running > rq->nr_numa_running)
   8412 		return regular;
   8413 	if (rq->nr_running > rq->nr_preferred_running)
   8414 		return remote;
   8415 	return all;
   8416 }
   8417 #else
   8418 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
   8419 {
   8420 	return all;
   8421 }
   8422 
   8423 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
   8424 {
   8425 	return regular;
   8426 }
   8427 #endif /* CONFIG_NUMA_BALANCING */
   8428 
   8429 /**
   8430  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
   8431  * @env: The load balancing environment.
   8432  * @sds: variable to hold the statistics for this sched_domain.
   8433  */
   8434 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
   8435 {
   8436 	struct sched_domain *child = env->sd->child;
   8437 	struct sched_group *sg = env->sd->groups;
   8438 	struct sg_lb_stats *local = &sds->local_stat;
   8439 	struct sg_lb_stats tmp_sgs;
   8440 	bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
   8441 	int sg_status = 0;
   8442 
   8443 #ifdef CONFIG_NO_HZ_COMMON
   8444 	if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
   8445 		env->flags |= LBF_NOHZ_STATS;
   8446 #endif
   8447 
   8448 	do {
   8449 		struct sg_lb_stats *sgs = &tmp_sgs;
   8450 		int local_group;
   8451 
   8452 		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
   8453 		if (local_group) {
   8454 			sds->local = sg;
   8455 			sgs = local;
   8456 
   8457 			if (env->idle != CPU_NEWLY_IDLE ||
   8458 			    time_after_eq(jiffies, sg->sgc->next_update))
   8459 				update_group_capacity(env->sd, env->dst_cpu);
   8460 		}
   8461 
   8462 		update_sg_lb_stats(env, sg, sgs, &sg_status);
   8463 
   8464 		if (local_group)
   8465 			goto next_group;
   8466 
   8467 		/*
   8468 		 * In case the child domain prefers tasks go to siblings
   8469 		 * first, lower the sg capacity so that we'll try
   8470 		 * and move all the excess tasks away. We lower the capacity
   8471 		 * of a group only if the local group has the capacity to fit
   8472 		 * these excess tasks. The extra check prevents the case where
   8473 		 * you always pull from the heaviest group when it is already
   8474 		 * under-utilized (possible with a large weight task outweighs
   8475 		 * the tasks on the system).
   8476 		 */
   8477 		if (prefer_sibling && sds->local &&
   8478 		    group_has_capacity(env, local) &&
   8479 		    (sgs->sum_nr_running > local->sum_nr_running + 1)) {
   8480 			sgs->group_no_capacity = 1;
   8481 			sgs->group_type = group_classify(sg, sgs);
   8482 		}
   8483 
   8484 		if (update_sd_pick_busiest(env, sds, sg, sgs)) {
   8485 			sds->busiest = sg;
   8486 			sds->busiest_stat = *sgs;
   8487 		}
   8488 
   8489 next_group:
   8490 		/* Now, start updating sd_lb_stats */
   8491 		sds->total_running += sgs->sum_nr_running;
   8492 		sds->total_load += sgs->group_load;
   8493 		sds->total_capacity += sgs->group_capacity;
   8494 
   8495 		sg = sg->next;
   8496 	} while (sg != env->sd->groups);
   8497 
   8498 #ifdef CONFIG_NO_HZ_COMMON
   8499 	if ((env->flags & LBF_NOHZ_AGAIN) &&
   8500 	    cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
   8501 
   8502 		WRITE_ONCE(nohz.next_blocked,
   8503 			   jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
   8504 	}
   8505 #endif
   8506 
   8507 	if (env->sd->flags & SD_NUMA)
   8508 		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
   8509 
   8510 	if (!env->sd->parent) {
   8511 		struct root_domain *rd = env->dst_rq->rd;
   8512 
   8513 		/* update overload indicator if we are at root domain */
   8514 		WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
   8515 
   8516 		/* Update over-utilization (tipping point, U >= 0) indicator */
   8517 		WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
   8518 	} else if (sg_status & SG_OVERUTILIZED) {
   8519 		WRITE_ONCE(env->dst_rq->rd->overutilized, SG_OVERUTILIZED);
   8520 	}
   8521 }
   8522 
   8523 /**
   8524  * check_asym_packing - Check to see if the group is packed into the
   8525  *			sched domain.
   8526  *
   8527  * This is primarily intended to used at the sibling level.  Some
   8528  * cores like POWER7 prefer to use lower numbered SMT threads.  In the
   8529  * case of POWER7, it can move to lower SMT modes only when higher
   8530  * threads are idle.  When in lower SMT modes, the threads will
   8531  * perform better since they share less core resources.  Hence when we
   8532  * have idle threads, we want them to be the higher ones.
   8533  *
   8534  * This packing function is run on idle threads.  It checks to see if
   8535  * the busiest CPU in this domain (core in the P7 case) has a higher
   8536  * CPU number than the packing function is being run on.  Here we are
   8537  * assuming lower CPU number will be equivalent to lower a SMT thread
   8538  * number.
   8539  *
   8540  * Return: 1 when packing is required and a task should be moved to
   8541  * this CPU.  The amount of the imbalance is returned in env->imbalance.
   8542  *
   8543  * @env: The load balancing environment.
   8544  * @sds: Statistics of the sched_domain which is to be packed
   8545  */
   8546 static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
   8547 {
   8548 	int busiest_cpu;
   8549 
   8550 	if (!(env->sd->flags & SD_ASYM_PACKING))
   8551 		return 0;
   8552 
   8553 	if (env->idle == CPU_NOT_IDLE)
   8554 		return 0;
   8555 
   8556 	if (!sds->busiest)
   8557 		return 0;
   8558 
   8559 	busiest_cpu = sds->busiest->asym_prefer_cpu;
   8560 	if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
   8561 		return 0;
   8562 
   8563 	env->imbalance = sds->busiest_stat.group_load;
   8564 
   8565 	return 1;
   8566 }
   8567 
   8568 /**
   8569  * fix_small_imbalance - Calculate the minor imbalance that exists
   8570  *			amongst the groups of a sched_domain, during
   8571  *			load balancing.
   8572  * @env: The load balancing environment.
   8573  * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
   8574  */
   8575 static inline
   8576 void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
   8577 {
   8578 	unsigned long tmp, capa_now = 0, capa_move = 0;
   8579 	unsigned int imbn = 2;
   8580 	unsigned long scaled_busy_load_per_task;
   8581 	struct sg_lb_stats *local, *busiest;
   8582 
   8583 	local = &sds->local_stat;
   8584 	busiest = &sds->busiest_stat;
   8585 
   8586 	if (!local->sum_nr_running)
   8587 		local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
   8588 	else if (busiest->load_per_task > local->load_per_task)
   8589 		imbn = 1;
   8590 
   8591 	scaled_busy_load_per_task =
   8592 		(busiest->load_per_task * SCHED_CAPACITY_SCALE) /
   8593 		busiest->group_capacity;
   8594 
   8595 	if (busiest->avg_load + scaled_busy_load_per_task >=
   8596 	    local->avg_load + (scaled_busy_load_per_task * imbn)) {
   8597 		env->imbalance = busiest->load_per_task;
   8598 		return;
   8599 	}
   8600 
   8601 	/*
   8602 	 * OK, we don't have enough imbalance to justify moving tasks,
   8603 	 * however we may be able to increase total CPU capacity used by
   8604 	 * moving them.
   8605 	 */
   8606 
   8607 	capa_now += busiest->group_capacity *
   8608 			min(busiest->load_per_task, busiest->avg_load);
   8609 	capa_now += local->group_capacity *
   8610 			min(local->load_per_task, local->avg_load);
   8611 	capa_now /= SCHED_CAPACITY_SCALE;
   8612 
   8613 	/* Amount of load we'd subtract */
   8614 	if (busiest->avg_load > scaled_busy_load_per_task) {
   8615 		capa_move += busiest->group_capacity *
   8616 			    min(busiest->load_per_task,
   8617 				busiest->avg_load - scaled_busy_load_per_task);
   8618 	}
   8619 
   8620 	/* Amount of load we'd add */
   8621 	if (busiest->avg_load * busiest->group_capacity <
   8622 	    busiest->load_per_task * SCHED_CAPACITY_SCALE) {
   8623 		tmp = (busiest->avg_load * busiest->group_capacity) /
   8624 		      local->group_capacity;
   8625 	} else {
   8626 		tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
   8627 		      local->group_capacity;
   8628 	}
   8629 	capa_move += local->group_capacity *
   8630 		    min(local->load_per_task, local->avg_load + tmp);
   8631 	capa_move /= SCHED_CAPACITY_SCALE;
   8632 
   8633 	/* Move if we gain throughput */
   8634 	if (capa_move > capa_now)
   8635 		env->imbalance = busiest->load_per_task;
   8636 }
   8637 
   8638 /**
   8639  * calculate_imbalance - Calculate the amount of imbalance present within the
   8640  *			 groups of a given sched_domain during load balance.
   8641  * @env: load balance environment
   8642  * @sds: statistics of the sched_domain whose imbalance is to be calculated.
   8643  */
   8644 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
   8645 {
   8646 	unsigned long max_pull, load_above_capacity = ~0UL;
   8647 	struct sg_lb_stats *local, *busiest;
   8648 
   8649 	local = &sds->local_stat;
   8650 	busiest = &sds->busiest_stat;
   8651 
   8652 	if (busiest->group_type == group_imbalanced) {
   8653 		/*
   8654 		 * In the group_imb case we cannot rely on group-wide averages
   8655 		 * to ensure CPU-load equilibrium, look at wider averages. XXX
   8656 		 */
   8657 		busiest->load_per_task =
   8658 			min(busiest->load_per_task, sds->avg_load);
   8659 	}
   8660 
   8661 	/*
   8662 	 * Avg load of busiest sg can be less and avg load of local sg can
   8663 	 * be greater than avg load across all sgs of sd because avg load
   8664 	 * factors in sg capacity and sgs with smaller group_type are
   8665 	 * skipped when updating the busiest sg:
   8666 	 */
   8667 	if (busiest->group_type != group_misfit_task &&
   8668 	    (busiest->avg_load <= sds->avg_load ||
   8669 	     local->avg_load >= sds->avg_load)) {
   8670 		env->imbalance = 0;
   8671 		return fix_small_imbalance(env, sds);
   8672 	}
   8673 
   8674 	/*
   8675 	 * If there aren't any idle CPUs, avoid creating some.
   8676 	 */
   8677 	if (busiest->group_type == group_overloaded &&
   8678 	    local->group_type   == group_overloaded) {
   8679 		load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
   8680 		if (load_above_capacity > busiest->group_capacity) {
   8681 			load_above_capacity -= busiest->group_capacity;
   8682 			load_above_capacity *= scale_load_down(NICE_0_LOAD);
   8683 			load_above_capacity /= busiest->group_capacity;
   8684 		} else
   8685 			load_above_capacity = ~0UL;
   8686 	}
   8687 
   8688 	/*
   8689 	 * We're trying to get all the CPUs to the average_load, so we don't
   8690 	 * want to push ourselves above the average load, nor do we wish to
   8691 	 * reduce the max loaded CPU below the average load. At the same time,
   8692 	 * we also don't want to reduce the group load below the group
   8693 	 * capacity. Thus we look for the minimum possible imbalance.
   8694 	 */
   8695 	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
   8696 
   8697 	/* How much load to actually move to equalise the imbalance */
   8698 	env->imbalance = min(
   8699 		max_pull * busiest->group_capacity,
   8700 		(sds->avg_load - local->avg_load) * local->group_capacity
   8701 	) / SCHED_CAPACITY_SCALE;
   8702 
   8703 	/* Boost imbalance to allow misfit task to be balanced. */
   8704 	if (busiest->group_type == group_misfit_task) {
   8705 		env->imbalance = max_t(long, env->imbalance,
   8706 				       busiest->group_misfit_task_load);
   8707 	}
   8708 
   8709 	/*
   8710 	 * if *imbalance is less than the average load per runnable task
   8711 	 * there is no guarantee that any tasks will be moved so we'll have
   8712 	 * a think about bumping its value to force at least one task to be
   8713 	 * moved
   8714 	 */
   8715 	if (env->imbalance < busiest->load_per_task)
   8716 		return fix_small_imbalance(env, sds);
   8717 }
   8718 
   8719 /******* find_busiest_group() helpers end here *********************/
   8720 
   8721 /**
   8722  * find_busiest_group - Returns the busiest group within the sched_domain
   8723  * if there is an imbalance.
   8724  *
   8725  * Also calculates the amount of weighted load which should be moved
   8726  * to restore balance.
   8727  *
   8728  * @env: The load balancing environment.
   8729  *
   8730  * Return:	- The busiest group if imbalance exists.
   8731  */
   8732 static struct sched_group *find_busiest_group(struct lb_env *env)
   8733 {
   8734 	struct sg_lb_stats *local, *busiest;
   8735 	struct sd_lb_stats sds;
   8736 
   8737 	init_sd_lb_stats(&sds);
   8738 
   8739 	/*
   8740 	 * Compute the various statistics relavent for load balancing at
   8741 	 * this level.
   8742 	 */
   8743 	update_sd_lb_stats(env, &sds);
   8744 
   8745 	if (sched_energy_enabled()) {
   8746 		struct root_domain *rd = env->dst_rq->rd;
   8747 
   8748 		if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
   8749 			goto out_balanced;
   8750 	}
   8751 
   8752 	local = &sds.local_stat;
   8753 	busiest = &sds.busiest_stat;
   8754 
   8755 	/* ASYM feature bypasses nice load balance check */
   8756 	if (check_asym_packing(env, &sds))
   8757 		return sds.busiest;
   8758 
   8759 	/* There is no busy sibling group to pull tasks from */
   8760 	if (!sds.busiest || busiest->sum_nr_running == 0)
   8761 		goto out_balanced;
   8762 
   8763 	/* XXX broken for overlapping NUMA groups */
   8764 	sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
   8765 						/ sds.total_capacity;
   8766 
   8767 	/*
   8768 	 * If the busiest group is imbalanced the below checks don't
   8769 	 * work because they assume all things are equal, which typically
   8770 	 * isn't true due to cpus_allowed constraints and the like.
   8771 	 */
   8772 	if (busiest->group_type == group_imbalanced)
   8773 		goto force_balance;
   8774 
   8775 	/*
   8776 	 * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
   8777 	 * capacities from resulting in underutilization due to avg_load.
   8778 	 */
   8779 	if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
   8780 	    busiest->group_no_capacity)
   8781 		goto force_balance;
   8782 
   8783 	/* Misfit tasks should be dealt with regardless of the avg load */
   8784 	if (busiest->group_type == group_misfit_task)
   8785 		goto force_balance;
   8786 
   8787 	/*
   8788 	 * If the local group is busier than the selected busiest group
   8789 	 * don't try and pull any tasks.
   8790 	 */
   8791 	if (local->avg_load >= busiest->avg_load)
   8792 		goto out_balanced;
   8793 
   8794 	/*
   8795 	 * Don't pull any tasks if this group is already above the domain
   8796 	 * average load.
   8797 	 */
   8798 	if (local->avg_load >= sds.avg_load)
   8799 		goto out_balanced;
   8800 
   8801 	if (env->idle == CPU_IDLE) {
   8802 		/*
   8803 		 * This CPU is idle. If the busiest group is not overloaded
   8804 		 * and there is no imbalance between this and busiest group
   8805 		 * wrt idle CPUs, it is balanced. The imbalance becomes
   8806 		 * significant if the diff is greater than 1 otherwise we
   8807 		 * might end up to just move the imbalance on another group
   8808 		 */
   8809 		if ((busiest->group_type != group_overloaded) &&
   8810 				(local->idle_cpus <= (busiest->idle_cpus + 1)))
   8811 			goto out_balanced;
   8812 	} else {
   8813 		/*
   8814 		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
   8815 		 * imbalance_pct to be conservative.
   8816 		 */
   8817 		if (100 * busiest->avg_load <=
   8818 				env->sd->imbalance_pct * local->avg_load)
   8819 			goto out_balanced;
   8820 	}
   8821 
   8822 force_balance:
   8823 	/* Looks like there is an imbalance. Compute it */
   8824 	env->src_grp_type = busiest->group_type;
   8825 	calculate_imbalance(env, &sds);
   8826 	return env->imbalance ? sds.busiest : NULL;
   8827 
   8828 out_balanced:
   8829 	env->imbalance = 0;
   8830 	return NULL;
   8831 }
   8832 
   8833 /*
   8834  * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
   8835  */
   8836 static struct rq *find_busiest_queue(struct lb_env *env,
   8837 				     struct sched_group *group)
   8838 {
   8839 	struct rq *busiest = NULL, *rq;
   8840 	unsigned long busiest_load = 0, busiest_capacity = 1;
   8841 	int i;
   8842 
   8843 	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
   8844 		unsigned long capacity, wl;
   8845 		enum fbq_type rt;
   8846 
   8847 		rq = cpu_rq(i);
   8848 		rt = fbq_classify_rq(rq);
   8849 
   8850 		/*
   8851 		 * We classify groups/runqueues into three groups:
   8852 		 *  - regular: there are !numa tasks
   8853 		 *  - remote:  there are numa tasks that run on the 'wrong' node
   8854 		 *  - all:     there is no distinction
   8855 		 *
   8856 		 * In order to avoid migrating ideally placed numa tasks,
   8857 		 * ignore those when there's better options.
   8858 		 *
   8859 		 * If we ignore the actual busiest queue to migrate another
   8860 		 * task, the next balance pass can still reduce the busiest
   8861 		 * queue by moving tasks around inside the node.
   8862 		 *
   8863 		 * If we cannot move enough load due to this classification
   8864 		 * the next pass will adjust the group classification and
   8865 		 * allow migration of more tasks.
   8866 		 *
   8867 		 * Both cases only affect the total convergence complexity.
   8868 		 */
   8869 		if (rt > env->fbq_type)
   8870 			continue;
   8871 
   8872 		/*
   8873 		 * For ASYM_CPUCAPACITY domains with misfit tasks we simply
   8874 		 * seek the "biggest" misfit task.
   8875 		 */
   8876 		if (env->src_grp_type == group_misfit_task) {
   8877 			if (rq->misfit_task_load > busiest_load) {
   8878 				busiest_load = rq->misfit_task_load;
   8879 				busiest = rq;
   8880 			}
   8881 
   8882 			continue;
   8883 		}
   8884 
   8885 		capacity = capacity_of(i);
   8886 
   8887 		/*
   8888 		 * For ASYM_CPUCAPACITY domains, don't pick a CPU that could
   8889 		 * eventually lead to active_balancing high->low capacity.
   8890 		 * Higher per-CPU capacity is considered better than balancing
   8891 		 * average load.
   8892 		 */
   8893 		if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
   8894 		    capacity_of(env->dst_cpu) < capacity &&
   8895 		    rq->nr_running == 1)
   8896 			continue;
   8897 
   8898 		wl = weighted_cpuload(rq);
   8899 
   8900 		/*
   8901 		 * When comparing with imbalance, use weighted_cpuload()
   8902 		 * which is not scaled with the CPU capacity.
   8903 		 */
   8904 
   8905 		if (rq->nr_running == 1 && wl > env->imbalance &&
   8906 		    !check_cpu_capacity(rq, env->sd))
   8907 			continue;
   8908 
   8909 		/*
   8910 		 * For the load comparisons with the other CPU's, consider
   8911 		 * the weighted_cpuload() scaled with the CPU capacity, so
   8912 		 * that the load can be moved away from the CPU that is
   8913 		 * potentially running at a lower capacity.
   8914 		 *
   8915 		 * Thus we're looking for max(wl_i / capacity_i), crosswise
   8916 		 * multiplication to rid ourselves of the division works out
   8917 		 * to: wl_i * capacity_j > wl_j * capacity_i;  where j is
   8918 		 * our previous maximum.
   8919 		 */
   8920 		if (wl * busiest_capacity > busiest_load * capacity) {
   8921 			busiest_load = wl;
   8922 			busiest_capacity = capacity;
   8923 			busiest = rq;
   8924 		}
   8925 	}
   8926 
   8927 	return busiest;
   8928 }
   8929 
   8930 /*
   8931  * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
   8932  * so long as it is large enough.
   8933  */
   8934 #define MAX_PINNED_INTERVAL	512
   8935 
   8936 static inline bool
   8937 asym_active_balance(struct lb_env *env)
   8938 {
   8939 	/*
   8940 	 * ASYM_PACKING needs to force migrate tasks from busy but
   8941 	 * lower priority CPUs in order to pack all tasks in the
   8942 	 * highest priority CPUs.
   8943 	 */
   8944 	return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
   8945 	       sched_asym_prefer(env->dst_cpu, env->src_cpu);
   8946 }
   8947 
   8948 static inline bool
   8949 voluntary_active_balance(struct lb_env *env)
   8950 {
   8951 	struct sched_domain *sd = env->sd;
   8952 
   8953 	if (asym_active_balance(env))
   8954 		return 1;
   8955 
   8956 	/*
   8957 	 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
   8958 	 * It's worth migrating the task if the src_cpu's capacity is reduced
   8959 	 * because of other sched_class or IRQs if more capacity stays
   8960 	 * available on dst_cpu.
   8961 	 */
   8962 	if ((env->idle != CPU_NOT_IDLE) &&
   8963 	    (env->src_rq->cfs.h_nr_running == 1)) {
   8964 		if ((check_cpu_capacity(env->src_rq, sd)) &&
   8965 		    (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
   8966 			return 1;
   8967 	}
   8968 
   8969 	if (env->src_grp_type == group_misfit_task)
   8970 		return 1;
   8971 
   8972 	return 0;
   8973 }
   8974 
   8975 static int need_active_balance(struct lb_env *env)
   8976 {
   8977 	struct sched_domain *sd = env->sd;
   8978 
   8979 	if (voluntary_active_balance(env))
   8980 		return 1;
   8981 
   8982 	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
   8983 }
   8984 
   8985 static int active_load_balance_cpu_stop(void *data);
   8986 
   8987 static int should_we_balance(struct lb_env *env)
   8988 {
   8989 	struct sched_group *sg = env->sd->groups;
   8990 	int cpu, balance_cpu = -1;
   8991 
   8992 	/*
   8993 	 * Ensure the balancing environment is consistent; can happen
   8994 	 * when the softirq triggers 'during' hotplug.
   8995 	 */
   8996 	if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
   8997 		return 0;
   8998 
   8999 	/*
   9000 	 * In the newly idle case, we will allow all the CPUs
   9001 	 * to do the newly idle load balance.
   9002 	 */
   9003 	if (env->idle == CPU_NEWLY_IDLE)
   9004 		return 1;
   9005 
   9006 	/* Try to find first idle CPU */
   9007 	for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
   9008 		if (!idle_cpu(cpu))
   9009 			continue;
   9010 
   9011 		balance_cpu = cpu;
   9012 		break;
   9013 	}
   9014 
   9015 	if (balance_cpu == -1)
   9016 		balance_cpu = group_balance_cpu(sg);
   9017 
   9018 	/*
   9019 	 * First idle CPU or the first CPU(busiest) in this sched group
   9020 	 * is eligible for doing load balancing at this and above domains.
   9021 	 */
   9022 	return balance_cpu == env->dst_cpu;
   9023 }
   9024 
   9025 /*
   9026  * Check this_cpu to ensure it is balanced within domain. Attempt to move
   9027  * tasks if there is an imbalance.
   9028  */
   9029 static int load_balance(int this_cpu, struct rq *this_rq,
   9030 			struct sched_domain *sd, enum cpu_idle_type idle,
   9031 			int *continue_balancing)
   9032 {
   9033 	int ld_moved, cur_ld_moved, active_balance = 0;
   9034 	struct sched_domain *sd_parent = sd->parent;
   9035 	struct sched_group *group;
   9036 	struct rq *busiest;
   9037 	struct rq_flags rf;
   9038 	struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
   9039 
   9040 	struct lb_env env = {
   9041 		.sd		= sd,
   9042 		.dst_cpu	= this_cpu,
   9043 		.dst_rq		= this_rq,
   9044 		.dst_grpmask    = sched_group_span(sd->groups),
   9045 		.idle		= idle,
   9046 		.loop_break	= sched_nr_migrate_break,
   9047 		.cpus		= cpus,
   9048 		.fbq_type	= all,
   9049 		.tasks		= LIST_HEAD_INIT(env.tasks),
   9050 	};
   9051 
   9052 	cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
   9053 
   9054 	schedstat_inc(sd->lb_count[idle]);
   9055 
   9056 redo:
   9057 	if (!should_we_balance(&env)) {
   9058 		*continue_balancing = 0;
   9059 		goto out_balanced;
   9060 	}
   9061 
   9062 	group = find_busiest_group(&env);
   9063 	if (!group) {
   9064 		schedstat_inc(sd->lb_nobusyg[idle]);
   9065 		goto out_balanced;
   9066 	}
   9067 
   9068 	busiest = find_busiest_queue(&env, group);
   9069 	if (!busiest) {
   9070 		schedstat_inc(sd->lb_nobusyq[idle]);
   9071 		goto out_balanced;
   9072 	}
   9073 
   9074 	BUG_ON(busiest == env.dst_rq);
   9075 
   9076 	schedstat_add(sd->lb_imbalance[idle], env.imbalance);
   9077 
   9078 	env.src_cpu = busiest->cpu;
   9079 	env.src_rq = busiest;
   9080 
   9081 	ld_moved = 0;
   9082 	if (busiest->nr_running > 1) {
   9083 		/*
   9084 		 * Attempt to move tasks. If find_busiest_group has found
   9085 		 * an imbalance but busiest->nr_running <= 1, the group is
   9086 		 * still unbalanced. ld_moved simply stays zero, so it is
   9087 		 * correctly treated as an imbalance.
   9088 		 */
   9089 		env.flags |= LBF_ALL_PINNED;
   9090 		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
   9091 
   9092 more_balance:
   9093 		rq_lock_irqsave(busiest, &rf);
   9094 		update_rq_clock(busiest);
   9095 
   9096 		/*
   9097 		 * cur_ld_moved - load moved in current iteration
   9098 		 * ld_moved     - cumulative load moved across iterations
   9099 		 */
   9100 		cur_ld_moved = detach_tasks(&env);
   9101 
   9102 		/*
   9103 		 * We've detached some tasks from busiest_rq. Every
   9104 		 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
   9105 		 * unlock busiest->lock, and we are able to be sure
   9106 		 * that nobody can manipulate the tasks in parallel.
   9107 		 * See task_rq_lock() family for the details.
   9108 		 */
   9109 
   9110 		rq_unlock(busiest, &rf);
   9111 
   9112 		if (cur_ld_moved) {
   9113 			attach_tasks(&env);
   9114 			ld_moved += cur_ld_moved;
   9115 		}
   9116 
   9117 		local_irq_restore(rf.flags);
   9118 
   9119 		if (env.flags & LBF_NEED_BREAK) {
   9120 			env.flags &= ~LBF_NEED_BREAK;
   9121 			goto more_balance;
   9122 		}
   9123 
   9124 		/*
   9125 		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
   9126 		 * us and move them to an alternate dst_cpu in our sched_group
   9127 		 * where they can run. The upper limit on how many times we
   9128 		 * iterate on same src_cpu is dependent on number of CPUs in our
   9129 		 * sched_group.
   9130 		 *
   9131 		 * This changes load balance semantics a bit on who can move
   9132 		 * load to a given_cpu. In addition to the given_cpu itself
   9133 		 * (or a ilb_cpu acting on its behalf where given_cpu is
   9134 		 * nohz-idle), we now have balance_cpu in a position to move
   9135 		 * load to given_cpu. In rare situations, this may cause
   9136 		 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
   9137 		 * _independently_ and at _same_ time to move some load to
   9138 		 * given_cpu) causing exceess load to be moved to given_cpu.
   9139 		 * This however should not happen so much in practice and
   9140 		 * moreover subsequent load balance cycles should correct the
   9141 		 * excess load moved.
   9142 		 */
   9143 		if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
   9144 
   9145 			/* Prevent to re-select dst_cpu via env's CPUs */
   9146 			__cpumask_clear_cpu(env.dst_cpu, env.cpus);
   9147 
   9148 			env.dst_rq	 = cpu_rq(env.new_dst_cpu);
   9149 			env.dst_cpu	 = env.new_dst_cpu;
   9150 			env.flags	&= ~LBF_DST_PINNED;
   9151 			env.loop	 = 0;
   9152 			env.loop_break	 = sched_nr_migrate_break;
   9153 
   9154 			/*
   9155 			 * Go back to "more_balance" rather than "redo" since we
   9156 			 * need to continue with same src_cpu.
   9157 			 */
   9158 			goto more_balance;
   9159 		}
   9160 
   9161 		/*
   9162 		 * We failed to reach balance because of affinity.
   9163 		 */
   9164 		if (sd_parent) {
   9165 			int *group_imbalance = &sd_parent->groups->sgc->imbalance;
   9166 
   9167 			if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
   9168 				*group_imbalance = 1;
   9169 		}
   9170 
   9171 		/* All tasks on this runqueue were pinned by CPU affinity */
   9172 		if (unlikely(env.flags & LBF_ALL_PINNED)) {
   9173 			__cpumask_clear_cpu(cpu_of(busiest), cpus);
   9174 			/*
   9175 			 * Attempting to continue load balancing at the current
   9176 			 * sched_domain level only makes sense if there are
   9177 			 * active CPUs remaining as possible busiest CPUs to
   9178 			 * pull load from which are not contained within the
   9179 			 * destination group that is receiving any migrated
   9180 			 * load.
   9181 			 */
   9182 			if (!cpumask_subset(cpus, env.dst_grpmask)) {
   9183 				env.loop = 0;
   9184 				env.loop_break = sched_nr_migrate_break;
   9185 				goto redo;
   9186 			}
   9187 			goto out_all_pinned;
   9188 		}
   9189 	}
   9190 
   9191 	if (!ld_moved) {
   9192 		schedstat_inc(sd->lb_failed[idle]);
   9193 		/*
   9194 		 * Increment the failure counter only on periodic balance.
   9195 		 * We do not want newidle balance, which can be very
   9196 		 * frequent, pollute the failure counter causing
   9197 		 * excessive cache_hot migrations and active balances.
   9198 		 */
   9199 		if (idle != CPU_NEWLY_IDLE)
   9200 			sd->nr_balance_failed++;
   9201 
   9202 		if (need_active_balance(&env)) {
   9203 			unsigned long flags;
   9204 
   9205 			raw_spin_lock_irqsave(&busiest->lock, flags);
   9206 
   9207 			/*
   9208 			 * Don't kick the active_load_balance_cpu_stop,
   9209 			 * if the curr task on busiest CPU can't be
   9210 			 * moved to this_cpu:
   9211 			 */
   9212 			if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
   9213 				raw_spin_unlock_irqrestore(&busiest->lock,
   9214 							    flags);
   9215 				env.flags |= LBF_ALL_PINNED;
   9216 				goto out_one_pinned;
   9217 			}
   9218 
   9219 			/*
   9220 			 * ->active_balance synchronizes accesses to
   9221 			 * ->active_balance_work.  Once set, it's cleared
   9222 			 * only after active load balance is finished.
   9223 			 */
   9224 			if (!busiest->active_balance) {
   9225 				busiest->active_balance = 1;
   9226 				busiest->push_cpu = this_cpu;
   9227 				active_balance = 1;
   9228 			}
   9229 			raw_spin_unlock_irqrestore(&busiest->lock, flags);
   9230 
   9231 			if (active_balance) {
   9232 				stop_one_cpu_nowait(cpu_of(busiest),
   9233 					active_load_balance_cpu_stop, busiest,
   9234 					&busiest->active_balance_work);
   9235 			}
   9236 
   9237 			/* We've kicked active balancing, force task migration. */
   9238 			sd->nr_balance_failed = sd->cache_nice_tries+1;
   9239 		}
   9240 	} else
   9241 		sd->nr_balance_failed = 0;
   9242 
   9243 	if (likely(!active_balance) || voluntary_active_balance(&env)) {
   9244 		/* We were unbalanced, so reset the balancing interval */
   9245 		sd->balance_interval = sd->min_interval;
   9246 	} else {
   9247 		/*
   9248 		 * If we've begun active balancing, start to back off. This
   9249 		 * case may not be covered by the all_pinned logic if there
   9250 		 * is only 1 task on the busy runqueue (because we don't call
   9251 		 * detach_tasks).
   9252 		 */
   9253 		if (sd->balance_interval < sd->max_interval)
   9254 			sd->balance_interval *= 2;
   9255 	}
   9256 
   9257 	goto out;
   9258 
   9259 out_balanced:
   9260 	/*
   9261 	 * We reach balance although we may have faced some affinity
   9262 	 * constraints. Clear the imbalance flag if it was set.
   9263 	 */
   9264 	if (sd_parent) {
   9265 		int *group_imbalance = &sd_parent->groups->sgc->imbalance;
   9266 
   9267 		if (*group_imbalance)
   9268 			*group_imbalance = 0;
   9269 	}
   9270 
   9271 out_all_pinned:
   9272 	/*
   9273 	 * We reach balance because all tasks are pinned at this level so
   9274 	 * we can't migrate them. Let the imbalance flag set so parent level
   9275 	 * can try to migrate them.
   9276 	 */
   9277 	schedstat_inc(sd->lb_balanced[idle]);
   9278 
   9279 	sd->nr_balance_failed = 0;
   9280 
   9281 out_one_pinned:
   9282 	ld_moved = 0;
   9283 
   9284 	/*
   9285 	 * idle_balance() disregards balance intervals, so we could repeatedly
   9286 	 * reach this code, which would lead to balance_interval skyrocketting
   9287 	 * in a short amount of time. Skip the balance_interval increase logic
   9288 	 * to avoid that.
   9289 	 */
   9290 	if (env.idle == CPU_NEWLY_IDLE)
   9291 		goto out;
   9292 
   9293 	/* tune up the balancing interval */
   9294 	if ((env.flags & LBF_ALL_PINNED &&
   9295 	     sd->balance_interval < MAX_PINNED_INTERVAL) ||
   9296 	    sd->balance_interval < sd->max_interval)
   9297 		sd->balance_interval *= 2;
   9298 out:
   9299 	return ld_moved;
   9300 }
   9301 
   9302 static inline unsigned long
   9303 get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
   9304 {
   9305 	unsigned long interval = sd->balance_interval;
   9306 
   9307 	if (cpu_busy)
   9308 		interval *= sd->busy_factor;
   9309 
   9310 	/* scale ms to jiffies */
   9311 	interval = msecs_to_jiffies(interval);
   9312 	interval = clamp(interval, 1UL, max_load_balance_interval);
   9313 
   9314 	return interval;
   9315 }
   9316 
   9317 static inline void
   9318 update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
   9319 {
   9320 	unsigned long interval, next;
   9321 
   9322 	/* used by idle balance, so cpu_busy = 0 */
   9323 	interval = get_sd_balance_interval(sd, 0);
   9324 	next = sd->last_balance + interval;
   9325 
   9326 	if (time_after(*next_balance, next))
   9327 		*next_balance = next;
   9328 }
   9329 
   9330 /*
   9331  * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
   9332  * running tasks off the busiest CPU onto idle CPUs. It requires at
   9333  * least 1 task to be running on each physical CPU where possible, and
   9334  * avoids physical / logical imbalances.
   9335  */
   9336 static int active_load_balance_cpu_stop(void *data)
   9337 {
   9338 	struct rq *busiest_rq = data;
   9339 	int busiest_cpu = cpu_of(busiest_rq);
   9340 	int target_cpu = busiest_rq->push_cpu;
   9341 	struct rq *target_rq = cpu_rq(target_cpu);
   9342 	struct sched_domain *sd;
   9343 	struct task_struct *p = NULL;
   9344 	struct rq_flags rf;
   9345 
   9346 	rq_lock_irq(busiest_rq, &rf);
   9347 	/*
   9348 	 * Between queueing the stop-work and running it is a hole in which
   9349 	 * CPUs can become inactive. We should not move tasks from or to
   9350 	 * inactive CPUs.
   9351 	 */
   9352 	if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
   9353 		goto out_unlock;
   9354 
   9355 	/* Make sure the requested CPU hasn't gone down in the meantime: */
   9356 	if (unlikely(busiest_cpu != smp_processor_id() ||
   9357 		     !busiest_rq->active_balance))
   9358 		goto out_unlock;
   9359 
   9360 	/* Is there any task to move? */
   9361 	if (busiest_rq->nr_running <= 1)
   9362 		goto out_unlock;
   9363 
   9364 	/*
   9365 	 * This condition is "impossible", if it occurs
   9366 	 * we need to fix it. Originally reported by
   9367 	 * Bjorn Helgaas on a 128-CPU setup.
   9368 	 */
   9369 	BUG_ON(busiest_rq == target_rq);
   9370 
   9371 	/* Search for an sd spanning us and the target CPU. */
   9372 	rcu_read_lock();
   9373 	for_each_domain(target_cpu, sd) {
   9374 		if ((sd->flags & SD_LOAD_BALANCE) &&
   9375 		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
   9376 				break;
   9377 	}
   9378 
   9379 	if (likely(sd)) {
   9380 		struct lb_env env = {
   9381 			.sd		= sd,
   9382 			.dst_cpu	= target_cpu,
   9383 			.dst_rq		= target_rq,
   9384 			.src_cpu	= busiest_rq->cpu,
   9385 			.src_rq		= busiest_rq,
   9386 			.idle		= CPU_IDLE,
   9387 			/*
   9388 			 * can_migrate_task() doesn't need to compute new_dst_cpu
   9389 			 * for active balancing. Since we have CPU_IDLE, but no
   9390 			 * @dst_grpmask we need to make that test go away with lying
   9391 			 * about DST_PINNED.
   9392 			 */
   9393 			.flags		= LBF_DST_PINNED,
   9394 		};
   9395 
   9396 		schedstat_inc(sd->alb_count);
   9397 		update_rq_clock(busiest_rq);
   9398 
   9399 		p = detach_one_task(&env);
   9400 		if (p) {
   9401 			schedstat_inc(sd->alb_pushed);
   9402 			/* Active balancing done, reset the failure counter. */
   9403 			sd->nr_balance_failed = 0;
   9404 		} else {
   9405 			schedstat_inc(sd->alb_failed);
   9406 		}
   9407 	}
   9408 	rcu_read_unlock();
   9409 out_unlock:
   9410 	busiest_rq->active_balance = 0;
   9411 	rq_unlock(busiest_rq, &rf);
   9412 
   9413 	if (p)
   9414 		attach_one_task(target_rq, p);
   9415 
   9416 	local_irq_enable();
   9417 
   9418 	return 0;
   9419 }
   9420 
   9421 static DEFINE_SPINLOCK(balancing);
   9422 
   9423 /*
   9424  * Scale the max load_balance interval with the number of CPUs in the system.
   9425  * This trades load-balance latency on larger machines for less cross talk.
   9426  */
   9427 void update_max_interval(void)
   9428 {
   9429 	max_load_balance_interval = HZ*num_online_cpus()/10;
   9430 }
   9431 
   9432 /*
   9433  * It checks each scheduling domain to see if it is due to be balanced,
   9434  * and initiates a balancing operation if so.
   9435  *
   9436  * Balancing parameters are set up in init_sched_domains.
   9437  */
   9438 static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
   9439 {
   9440 	int continue_balancing = 1;
   9441 	int cpu = rq->cpu;
   9442 	unsigned long interval;
   9443 	struct sched_domain *sd;
   9444 	/* Earliest time when we have to do rebalance again */
   9445 	unsigned long next_balance = jiffies + 60*HZ;
   9446 	int update_next_balance = 0;
   9447 	int need_serialize, need_decay = 0;
   9448 	u64 max_cost = 0;
   9449 
   9450 	rcu_read_lock();
   9451 	for_each_domain(cpu, sd) {
   9452 		/*
   9453 		 * Decay the newidle max times here because this is a regular
   9454 		 * visit to all the domains. Decay ~1% per second.
   9455 		 */
   9456 		if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
   9457 			sd->max_newidle_lb_cost =
   9458 				(sd->max_newidle_lb_cost * 253) / 256;
   9459 			sd->next_decay_max_lb_cost = jiffies + HZ;
   9460 			need_decay = 1;
   9461 		}
   9462 		max_cost += sd->max_newidle_lb_cost;
   9463 
   9464 		if (!(sd->flags & SD_LOAD_BALANCE))
   9465 			continue;
   9466 
   9467 		/*
   9468 		 * Stop the load balance at this level. There is another
   9469 		 * CPU in our sched group which is doing load balancing more
   9470 		 * actively.
   9471 		 */
   9472 		if (!continue_balancing) {
   9473 			if (need_decay)
   9474 				continue;
   9475 			break;
   9476 		}
   9477 
   9478 		interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
   9479 
   9480 		need_serialize = sd->flags & SD_SERIALIZE;
   9481 		if (need_serialize) {
   9482 			if (!spin_trylock(&balancing))
   9483 				goto out;
   9484 		}
   9485 
   9486 		if (time_after_eq(jiffies, sd->last_balance + interval)) {
   9487 			if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
   9488 				/*
   9489 				 * The LBF_DST_PINNED logic could have changed
   9490 				 * env->dst_cpu, so we can't know our idle
   9491 				 * state even if we migrated tasks. Update it.
   9492 				 */
   9493 				idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
   9494 			}
   9495 			sd->last_balance = jiffies;
   9496 			interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
   9497 		}
   9498 		if (need_serialize)
   9499 			spin_unlock(&balancing);
   9500 out:
   9501 		if (time_after(next_balance, sd->last_balance + interval)) {
   9502 			next_balance = sd->last_balance + interval;
   9503 			update_next_balance = 1;
   9504 		}
   9505 	}
   9506 	if (need_decay) {
   9507 		/*
   9508 		 * Ensure the rq-wide value also decays but keep it at a
   9509 		 * reasonable floor to avoid funnies with rq->avg_idle.
   9510 		 */
   9511 		rq->max_idle_balance_cost =
   9512 			max((u64)sysctl_sched_migration_cost, max_cost);
   9513 	}
   9514 	rcu_read_unlock();
   9515 
   9516 	/*
   9517 	 * next_balance will be updated only when there is a need.
   9518 	 * When the cpu is attached to null domain for ex, it will not be
   9519 	 * updated.
   9520 	 */
   9521 	if (likely(update_next_balance)) {
   9522 		rq->next_balance = next_balance;
   9523 
   9524 #ifdef CONFIG_NO_HZ_COMMON
   9525 		/*
   9526 		 * If this CPU has been elected to perform the nohz idle
   9527 		 * balance. Other idle CPUs have already rebalanced with
   9528 		 * nohz_idle_balance() and nohz.next_balance has been
   9529 		 * updated accordingly. This CPU is now running the idle load
   9530 		 * balance for itself and we need to update the
   9531 		 * nohz.next_balance accordingly.
   9532 		 */
   9533 		if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
   9534 			nohz.next_balance = rq->next_balance;
   9535 #endif
   9536 	}
   9537 }
   9538 
   9539 static inline int on_null_domain(struct rq *rq)
   9540 {
   9541 	return unlikely(!rcu_dereference_sched(rq->sd));
   9542 }
   9543 
   9544 #ifdef CONFIG_NO_HZ_COMMON
   9545 /*
   9546  * idle load balancing details
   9547  * - When one of the busy CPUs notice that there may be an idle rebalancing
   9548  *   needed, they will kick the idle load balancer, which then does idle
   9549  *   load balancing for all the idle CPUs.
   9550  */
   9551 
   9552 static inline int find_new_ilb(void)
   9553 {
   9554 	int ilb = cpumask_first(nohz.idle_cpus_mask);
   9555 
   9556 	if (ilb < nr_cpu_ids && idle_cpu(ilb))
   9557 		return ilb;
   9558 
   9559 	return nr_cpu_ids;
   9560 }
   9561 
   9562 /*
   9563  * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
   9564  * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
   9565  * CPU (if there is one).
   9566  */
   9567 static void kick_ilb(unsigned int flags)
   9568 {
   9569 	int ilb_cpu;
   9570 
   9571 	nohz.next_balance++;
   9572 
   9573 	ilb_cpu = find_new_ilb();
   9574 
   9575 	if (ilb_cpu >= nr_cpu_ids)
   9576 		return;
   9577 
   9578 	flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
   9579 	if (flags & NOHZ_KICK_MASK)
   9580 		return;
   9581 
   9582 	/*
   9583 	 * Use smp_send_reschedule() instead of resched_cpu().
   9584 	 * This way we generate a sched IPI on the target CPU which
   9585 	 * is idle. And the softirq performing nohz idle load balance
   9586 	 * will be run before returning from the IPI.
   9587 	 */
   9588 	smp_send_reschedule(ilb_cpu);
   9589 }
   9590 
   9591 /*
   9592  * Current decision point for kicking the idle load balancer in the presence
   9593  * of idle CPUs in the system.
   9594  */
   9595 static void nohz_balancer_kick(struct rq *rq)
   9596 {
   9597 	unsigned long now = jiffies;
   9598 	struct sched_domain_shared *sds;
   9599 	struct sched_domain *sd;
   9600 	int nr_busy, i, cpu = rq->cpu;
   9601 	unsigned int flags = 0;
   9602 
   9603 	if (unlikely(rq->idle_balance))
   9604 		return;
   9605 
   9606 	/*
   9607 	 * We may be recently in ticked or tickless idle mode. At the first
   9608 	 * busy tick after returning from idle, we will update the busy stats.
   9609 	 */
   9610 	nohz_balance_exit_idle(rq);
   9611 
   9612 	/*
   9613 	 * None are in tickless mode and hence no need for NOHZ idle load
   9614 	 * balancing.
   9615 	 */
   9616 	if (likely(!atomic_read(&nohz.nr_cpus)))
   9617 		return;
   9618 
   9619 	if (READ_ONCE(nohz.has_blocked) &&
   9620 	    time_after(now, READ_ONCE(nohz.next_blocked)))
   9621 		flags = NOHZ_STATS_KICK;
   9622 
   9623 	if (time_before(now, nohz.next_balance))
   9624 		goto out;
   9625 
   9626 	if (rq->nr_running >= 2) {
   9627 		flags = NOHZ_KICK_MASK;
   9628 		goto out;
   9629 	}
   9630 
   9631 	rcu_read_lock();
   9632 
   9633 	sd = rcu_dereference(rq->sd);
   9634 	if (sd) {
   9635 		/*
   9636 		 * If there's a CFS task and the current CPU has reduced
   9637 		 * capacity; kick the ILB to see if there's a better CPU to run
   9638 		 * on.
   9639 		 */
   9640 		if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
   9641 			flags = NOHZ_KICK_MASK;
   9642 			goto unlock;
   9643 		}
   9644 	}
   9645 
   9646 	sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
   9647 	if (sd) {
   9648 		/*
   9649 		 * When ASYM_PACKING; see if there's a more preferred CPU
   9650 		 * currently idle; in which case, kick the ILB to move tasks
   9651 		 * around.
   9652 		 */
   9653 		for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
   9654 			if (sched_asym_prefer(i, cpu)) {
   9655 				flags = NOHZ_KICK_MASK;
   9656 				goto unlock;
   9657 			}
   9658 		}
   9659 	}
   9660 
   9661 	sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
   9662 	if (sd) {
   9663 		/*
   9664 		 * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
   9665 		 * to run the misfit task on.
   9666 		 */
   9667 		if (check_misfit_status(rq, sd)) {
   9668 			flags = NOHZ_KICK_MASK;
   9669 			goto unlock;
   9670 		}
   9671 
   9672 		/*
   9673 		 * For asymmetric systems, we do not want to nicely balance
   9674 		 * cache use, instead we want to embrace asymmetry and only
   9675 		 * ensure tasks have enough CPU capacity.
   9676 		 *
   9677 		 * Skip the LLC logic because it's not relevant in that case.
   9678 		 */
   9679 		goto unlock;
   9680 	}
   9681 
   9682 	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
   9683 	if (sds) {
   9684 		/*
   9685 		 * If there is an imbalance between LLC domains (IOW we could
   9686 		 * increase the overall cache use), we need some less-loaded LLC
   9687 		 * domain to pull some load. Likewise, we may need to spread
   9688 		 * load within the current LLC domain (e.g. packed SMT cores but
   9689 		 * other CPUs are idle). We can't really know from here how busy
   9690 		 * the others are - so just get a nohz balance going if it looks
   9691 		 * like this LLC domain has tasks we could move.
   9692 		 */
   9693 		nr_busy = atomic_read(&sds->nr_busy_cpus);
   9694 		if (nr_busy > 1) {
   9695 			flags = NOHZ_KICK_MASK;
   9696 			goto unlock;
   9697 		}
   9698 	}
   9699 unlock:
   9700 	rcu_read_unlock();
   9701 out:
   9702 	if (flags)
   9703 		kick_ilb(flags);
   9704 }
   9705 
   9706 static void set_cpu_sd_state_busy(int cpu)
   9707 {
   9708 	struct sched_domain *sd;
   9709 
   9710 	rcu_read_lock();
   9711 	sd = rcu_dereference(per_cpu(sd_llc, cpu));
   9712 
   9713 	if (!sd || !sd->nohz_idle)
   9714 		goto unlock;
   9715 	sd->nohz_idle = 0;
   9716 
   9717 	atomic_inc(&sd->shared->nr_busy_cpus);
   9718 unlock:
   9719 	rcu_read_unlock();
   9720 }
   9721 
   9722 void nohz_balance_exit_idle(struct rq *rq)
   9723 {
   9724 	SCHED_WARN_ON(rq != this_rq());
   9725 
   9726 	if (likely(!rq->nohz_tick_stopped))
   9727 		return;
   9728 
   9729 	rq->nohz_tick_stopped = 0;
   9730 	cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
   9731 	atomic_dec(&nohz.nr_cpus);
   9732 
   9733 	set_cpu_sd_state_busy(rq->cpu);
   9734 }
   9735 
   9736 static void set_cpu_sd_state_idle(int cpu)
   9737 {
   9738 	struct sched_domain *sd;
   9739 
   9740 	rcu_read_lock();
   9741 	sd = rcu_dereference(per_cpu(sd_llc, cpu));
   9742 
   9743 	if (!sd || sd->nohz_idle)
   9744 		goto unlock;
   9745 	sd->nohz_idle = 1;
   9746 
   9747 	atomic_dec(&sd->shared->nr_busy_cpus);
   9748 unlock:
   9749 	rcu_read_unlock();
   9750 }
   9751 
   9752 /*
   9753  * This routine will record that the CPU is going idle with tick stopped.
   9754  * This info will be used in performing idle load balancing in the future.
   9755  */
   9756 void nohz_balance_enter_idle(int cpu)
   9757 {
   9758 	struct rq *rq = cpu_rq(cpu);
   9759 
   9760 	SCHED_WARN_ON(cpu != smp_processor_id());
   9761 
   9762 	/* If this CPU is going down, then nothing needs to be done: */
   9763 	if (!cpu_active(cpu))
   9764 		return;
   9765 
   9766 	/* Spare idle load balancing on CPUs that don't want to be disturbed: */
   9767 	if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
   9768 		return;
   9769 
   9770 	/*
   9771 	 * Can be set safely without rq->lock held
   9772 	 * If a clear happens, it will have evaluated last additions because
   9773 	 * rq->lock is held during the check and the clear
   9774 	 */
   9775 	rq->has_blocked_load = 1;
   9776 
   9777 	/*
   9778 	 * The tick is still stopped but load could have been added in the
   9779 	 * meantime. We set the nohz.has_blocked flag to trig a check of the
   9780 	 * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
   9781 	 * of nohz.has_blocked can only happen after checking the new load
   9782 	 */
   9783 	if (rq->nohz_tick_stopped)
   9784 		goto out;
   9785 
   9786 	/* If we're a completely isolated CPU, we don't play: */
   9787 	if (on_null_domain(rq))
   9788 		return;
   9789 
   9790 	rq->nohz_tick_stopped = 1;
   9791 
   9792 	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
   9793 	atomic_inc(&nohz.nr_cpus);
   9794 
   9795 	/*
   9796 	 * Ensures that if nohz_idle_balance() fails to observe our
   9797 	 * @idle_cpus_mask store, it must observe the @has_blocked
   9798 	 * store.
   9799 	 */
   9800 	smp_mb__after_atomic();
   9801 
   9802 	set_cpu_sd_state_idle(cpu);
   9803 
   9804 out:
   9805 	/*
   9806 	 * Each time a cpu enter idle, we assume that it has blocked load and
   9807 	 * enable the periodic update of the load of idle cpus
   9808 	 */
   9809 	WRITE_ONCE(nohz.has_blocked, 1);
   9810 }
   9811 
   9812 /*
   9813  * Internal function that runs load balance for all idle cpus. The load balance
   9814  * can be a simple update of blocked load or a complete load balance with
   9815  * tasks movement depending of flags.
   9816  * The function returns false if the loop has stopped before running
   9817  * through all idle CPUs.
   9818  */
   9819 static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
   9820 			       enum cpu_idle_type idle)
   9821 {
   9822 	/* Earliest time when we have to do rebalance again */
   9823 	unsigned long now = jiffies;
   9824 	unsigned long next_balance = now + 60*HZ;
   9825 	bool has_blocked_load = false;
   9826 	int update_next_balance = 0;
   9827 	int this_cpu = this_rq->cpu;
   9828 	int balance_cpu;
   9829 	int ret = false;
   9830 	struct rq *rq;
   9831 
   9832 	SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
   9833 
   9834 	/*
   9835 	 * We assume there will be no idle load after this update and clear
   9836 	 * the has_blocked flag. If a cpu enters idle in the mean time, it will
   9837 	 * set the has_blocked flag and trig another update of idle load.
   9838 	 * Because a cpu that becomes idle, is added to idle_cpus_mask before
   9839 	 * setting the flag, we are sure to not clear the state and not
   9840 	 * check the load of an idle cpu.
   9841 	 */
   9842 	WRITE_ONCE(nohz.has_blocked, 0);
   9843 
   9844 	/*
   9845 	 * Ensures that if we miss the CPU, we must see the has_blocked
   9846 	 * store from nohz_balance_enter_idle().
   9847 	 */
   9848 	smp_mb();
   9849 
   9850 	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
   9851 		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
   9852 			continue;
   9853 
   9854 		/*
   9855 		 * If this CPU gets work to do, stop the load balancing
   9856 		 * work being done for other CPUs. Next load
   9857 		 * balancing owner will pick it up.
   9858 		 */
   9859 		if (need_resched()) {
   9860 			has_blocked_load = true;
   9861 			goto abort;
   9862 		}
   9863 
   9864 		rq = cpu_rq(balance_cpu);
   9865 
   9866 		has_blocked_load |= update_nohz_stats(rq, true);
   9867 
   9868 		/*
   9869 		 * If time for next balance is due,
   9870 		 * do the balance.
   9871 		 */
   9872 		if (time_after_eq(jiffies, rq->next_balance)) {
   9873 			struct rq_flags rf;
   9874 
   9875 			rq_lock_irqsave(rq, &rf);
   9876 			update_rq_clock(rq);
   9877 			cpu_load_update_idle(rq);
   9878 			rq_unlock_irqrestore(rq, &rf);
   9879 
   9880 			if (flags & NOHZ_BALANCE_KICK)
   9881 				rebalance_domains(rq, CPU_IDLE);
   9882 		}
   9883 
   9884 		if (time_after(next_balance, rq->next_balance)) {
   9885 			next_balance = rq->next_balance;
   9886 			update_next_balance = 1;
   9887 		}
   9888 	}
   9889 
   9890 	/* Newly idle CPU doesn't need an update */
   9891 	if (idle != CPU_NEWLY_IDLE) {
   9892 		update_blocked_averages(this_cpu);
   9893 		has_blocked_load |= this_rq->has_blocked_load;
   9894 	}
   9895 
   9896 	if (flags & NOHZ_BALANCE_KICK)
   9897 		rebalance_domains(this_rq, CPU_IDLE);
   9898 
   9899 	WRITE_ONCE(nohz.next_blocked,
   9900 		now + msecs_to_jiffies(LOAD_AVG_PERIOD));
   9901 
   9902 	/* The full idle balance loop has been done */
   9903 	ret = true;
   9904 
   9905 abort:
   9906 	/* There is still blocked load, enable periodic update */
   9907 	if (has_blocked_load)
   9908 		WRITE_ONCE(nohz.has_blocked, 1);
   9909 
   9910 	/*
   9911 	 * next_balance will be updated only when there is a need.
   9912 	 * When the CPU is attached to null domain for ex, it will not be
   9913 	 * updated.
   9914 	 */
   9915 	if (likely(update_next_balance))
   9916 		nohz.next_balance = next_balance;
   9917 
   9918 	return ret;
   9919 }
   9920 
   9921 /*
   9922  * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
   9923  * rebalancing for all the cpus for whom scheduler ticks are stopped.
   9924  */
   9925 static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
   9926 {
   9927 	int this_cpu = this_rq->cpu;
   9928 	unsigned int flags;
   9929 
   9930 	if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
   9931 		return false;
   9932 
   9933 	if (idle != CPU_IDLE) {
   9934 		atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
   9935 		return false;
   9936 	}
   9937 
   9938 	/* could be _relaxed() */
   9939 	flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
   9940 	if (!(flags & NOHZ_KICK_MASK))
   9941 		return false;
   9942 
   9943 	_nohz_idle_balance(this_rq, flags, idle);
   9944 
   9945 	return true;
   9946 }
   9947 
   9948 static void nohz_newidle_balance(struct rq *this_rq)
   9949 {
   9950 	int this_cpu = this_rq->cpu;
   9951 
   9952 	/*
   9953 	 * This CPU doesn't want to be disturbed by scheduler
   9954 	 * housekeeping
   9955 	 */
   9956 	if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
   9957 		return;
   9958 
   9959 	/* Will wake up very soon. No time for doing anything else*/
   9960 	if (this_rq->avg_idle < sysctl_sched_migration_cost)
   9961 		return;
   9962 
   9963 	/* Don't need to update blocked load of idle CPUs*/
   9964 	if (!READ_ONCE(nohz.has_blocked) ||
   9965 	    time_before(jiffies, READ_ONCE(nohz.next_blocked)))
   9966 		return;
   9967 
   9968 	raw_spin_unlock(&this_rq->lock);
   9969 	/*
   9970 	 * This CPU is going to be idle and blocked load of idle CPUs
   9971 	 * need to be updated. Run the ilb locally as it is a good
   9972 	 * candidate for ilb instead of waking up another idle CPU.
   9973 	 * Kick an normal ilb if we failed to do the update.
   9974 	 */
   9975 	if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
   9976 		kick_ilb(NOHZ_STATS_KICK);
   9977 	raw_spin_lock(&this_rq->lock);
   9978 }
   9979 
   9980 #else /* !CONFIG_NO_HZ_COMMON */
   9981 static inline void nohz_balancer_kick(struct rq *rq) { }
   9982 
   9983 static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
   9984 {
   9985 	return false;
   9986 }
   9987 
   9988 static inline void nohz_newidle_balance(struct rq *this_rq) { }
   9989 #endif /* CONFIG_NO_HZ_COMMON */
   9990 
   9991 /*
   9992  * idle_balance is called by schedule() if this_cpu is about to become
   9993  * idle. Attempts to pull tasks from other CPUs.
   9994  */
   9995 static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
   9996 {
   9997 	unsigned long next_balance = jiffies + HZ;
   9998 	int this_cpu = this_rq->cpu;
   9999 	struct sched_domain *sd;
  10000 	int pulled_task = 0;
  10001 	u64 curr_cost = 0;
  10002 
  10003 	/*
  10004 	 * We must set idle_stamp _before_ calling idle_balance(), such that we
  10005 	 * measure the duration of idle_balance() as idle time.
  10006 	 */
  10007 	this_rq->idle_stamp = rq_clock(this_rq);
  10008 
  10009 	/*
  10010 	 * Do not pull tasks towards !active CPUs...
  10011 	 */
  10012 	if (!cpu_active(this_cpu))
  10013 		return 0;
  10014 
  10015 	/*
  10016 	 * This is OK, because current is on_cpu, which avoids it being picked
  10017 	 * for load-balance and preemption/IRQs are still disabled avoiding
  10018 	 * further scheduler activity on it and we're being very careful to
  10019 	 * re-start the picking loop.
  10020 	 */
  10021 	rq_unpin_lock(this_rq, rf);
  10022 
  10023 	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
  10024 	    !READ_ONCE(this_rq->rd->overload)) {
  10025 
  10026 		rcu_read_lock();
  10027 		sd = rcu_dereference_check_sched_domain(this_rq->sd);
  10028 		if (sd)
  10029 			update_next_balance(sd, &next_balance);
  10030 		rcu_read_unlock();
  10031 
  10032 		nohz_newidle_balance(this_rq);
  10033 
  10034 		goto out;
  10035 	}
  10036 
  10037 	raw_spin_unlock(&this_rq->lock);
  10038 
  10039 	update_blocked_averages(this_cpu);
  10040 	rcu_read_lock();
  10041 	for_each_domain(this_cpu, sd) {
  10042 		int continue_balancing = 1;
  10043 		u64 t0, domain_cost;
  10044 
  10045 		if (!(sd->flags & SD_LOAD_BALANCE))
  10046 			continue;
  10047 
  10048 		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
  10049 			update_next_balance(sd, &next_balance);
  10050 			break;
  10051 		}
  10052 
  10053 		if (sd->flags & SD_BALANCE_NEWIDLE) {
  10054 			t0 = sched_clock_cpu(this_cpu);
  10055 
  10056 			pulled_task = load_balance(this_cpu, this_rq,
  10057 						   sd, CPU_NEWLY_IDLE,
  10058 						   &continue_balancing);
  10059 
  10060 			domain_cost = sched_clock_cpu(this_cpu) - t0;
  10061 			if (domain_cost > sd->max_newidle_lb_cost)
  10062 				sd->max_newidle_lb_cost = domain_cost;
  10063 
  10064 			curr_cost += domain_cost;
  10065 		}
  10066 
  10067 		update_next_balance(sd, &next_balance);
  10068 
  10069 		/*
  10070 		 * Stop searching for tasks to pull if there are
  10071 		 * now runnable tasks on this rq.
  10072 		 */
  10073 		if (pulled_task || this_rq->nr_running > 0)
  10074 			break;
  10075 	}
  10076 	rcu_read_unlock();
  10077 
  10078 	raw_spin_lock(&this_rq->lock);
  10079 
  10080 	if (curr_cost > this_rq->max_idle_balance_cost)
  10081 		this_rq->max_idle_balance_cost = curr_cost;
  10082 
  10083 out:
  10084 	/*
  10085 	 * While browsing the domains, we released the rq lock, a task could
  10086 	 * have been enqueued in the meantime. Since we're not going idle,
  10087 	 * pretend we pulled a task.
  10088 	 */
  10089 	if (this_rq->cfs.h_nr_running && !pulled_task)
  10090 		pulled_task = 1;
  10091 
  10092 	/* Move the next balance forward */
  10093 	if (time_after(this_rq->next_balance, next_balance))
  10094 		this_rq->next_balance = next_balance;
  10095 
  10096 	/* Is there a task of a high priority class? */
  10097 	if (this_rq->nr_running != this_rq->cfs.h_nr_running)
  10098 		pulled_task = -1;
  10099 
  10100 	if (pulled_task)
  10101 		this_rq->idle_stamp = 0;
  10102 
  10103 	rq_repin_lock(this_rq, rf);
  10104 
  10105 	return pulled_task;
  10106 }
  10107 
  10108 /*
  10109  * run_rebalance_domains is triggered when needed from the scheduler tick.
  10110  * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
  10111  */
  10112 static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
  10113 {
  10114 	struct rq *this_rq = this_rq();
  10115 	enum cpu_idle_type idle = this_rq->idle_balance ?
  10116 						CPU_IDLE : CPU_NOT_IDLE;
  10117 
  10118 	/*
  10119 	 * If this CPU has a pending nohz_balance_kick, then do the
  10120 	 * balancing on behalf of the other idle CPUs whose ticks are
  10121 	 * stopped. Do nohz_idle_balance *before* rebalance_domains to
  10122 	 * give the idle CPUs a chance to load balance. Else we may
  10123 	 * load balance only within the local sched_domain hierarchy
  10124 	 * and abort nohz_idle_balance altogether if we pull some load.
  10125 	 */
  10126 	if (nohz_idle_balance(this_rq, idle))
  10127 		return;
  10128 
  10129 	/* normal load balance */
  10130 	update_blocked_averages(this_rq->cpu);
  10131 	rebalance_domains(this_rq, idle);
  10132 }
  10133 
  10134 /*
  10135  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
  10136  */
  10137 void trigger_load_balance(struct rq *rq)
  10138 {
  10139 	/* Don't need to rebalance while attached to NULL domain */
  10140 	if (unlikely(on_null_domain(rq)))
  10141 		return;
  10142 
  10143 	if (time_after_eq(jiffies, rq->next_balance))
  10144 		raise_softirq(SCHED_SOFTIRQ);
  10145 
  10146 	nohz_balancer_kick(rq);
  10147 }
  10148 
  10149 static void rq_online_fair(struct rq *rq)
  10150 {
  10151 	update_sysctl();
  10152 
  10153 	update_runtime_enabled(rq);
  10154 }
  10155 
  10156 static void rq_offline_fair(struct rq *rq)
  10157 {
  10158 	update_sysctl();
  10159 
  10160 	/* Ensure any throttled groups are reachable by pick_next_task */
  10161 	unthrottle_offline_cfs_rqs(rq);
  10162 }
  10163 
  10164 #endif /* CONFIG_SMP */
  10165 
  10166 /*
  10167  * scheduler tick hitting a task of our scheduling class.
  10168  *
  10169  * NOTE: This function can be called remotely by the tick offload that
  10170  * goes along full dynticks. Therefore no local assumption can be made
  10171  * and everything must be accessed through the @rq and @curr passed in
  10172  * parameters.
  10173  */
  10174 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
  10175 {
  10176 	struct cfs_rq *cfs_rq;
  10177 	struct sched_entity *se = &curr->se;
  10178 
  10179 	for_each_sched_entity(se) {
  10180 		cfs_rq = cfs_rq_of(se);
  10181 		entity_tick(cfs_rq, se, queued);
  10182 	}
  10183 
  10184 	if (static_branch_unlikely(&sched_numa_balancing))
  10185 		task_tick_numa(rq, curr);
  10186 
  10187 	update_misfit_status(curr, rq);
  10188 	update_overutilized_status(task_rq(curr));
  10189 }
  10190 
  10191 /*
  10192  * called on fork with the child task as argument from the parent's context
  10193  *  - child not yet on the tasklist
  10194  *  - preemption disabled
  10195  */
  10196 static void task_fork_fair(struct task_struct *p)
  10197 {
  10198 	struct cfs_rq *cfs_rq;
  10199 	struct sched_entity *se = &p->se, *curr;
  10200 	struct rq *rq = this_rq();
  10201 	struct rq_flags rf;
  10202 
  10203 	rq_lock(rq, &rf);
  10204 	update_rq_clock(rq);
  10205 
  10206 	cfs_rq = task_cfs_rq(current);
  10207 	curr = cfs_rq->curr;
  10208 	if (curr) {
  10209 		update_curr(cfs_rq);
  10210 		se->vruntime = curr->vruntime;
  10211 	}
  10212 	place_entity(cfs_rq, se, 1);
  10213 
  10214 	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
  10215 		/*
  10216 		 * Upon rescheduling, sched_class::put_prev_task() will place
  10217 		 * 'current' within the tree based on its new key value.
  10218 		 */
  10219 		swap(curr->vruntime, se->vruntime);
  10220 		resched_curr(rq);
  10221 	}
  10222 
  10223 	se->vruntime -= cfs_rq->min_vruntime;
  10224 	rq_unlock(rq, &rf);
  10225 }
  10226 
  10227 /*
  10228  * Priority of the task has changed. Check to see if we preempt
  10229  * the current task.
  10230  */
  10231 static void
  10232 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
  10233 {
  10234 	if (!task_on_rq_queued(p))
  10235 		return;
  10236 
  10237 	/*
  10238 	 * Reschedule if we are currently running on this runqueue and
  10239 	 * our priority decreased, or if we are not currently running on
  10240 	 * this runqueue and our priority is higher than the current's
  10241 	 */
  10242 	if (rq->curr == p) {
  10243 		if (p->prio > oldprio)
  10244 			resched_curr(rq);
  10245 	} else
  10246 		check_preempt_curr(rq, p, 0);
  10247 }
  10248 
  10249 static inline bool vruntime_normalized(struct task_struct *p)
  10250 {
  10251 	struct sched_entity *se = &p->se;
  10252 
  10253 	/*
  10254 	 * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
  10255 	 * the dequeue_entity(.flags=0) will already have normalized the
  10256 	 * vruntime.
  10257 	 */
  10258 	if (p->on_rq)
  10259 		return true;
  10260 
  10261 	/*
  10262 	 * When !on_rq, vruntime of the task has usually NOT been normalized.
  10263 	 * But there are some cases where it has already been normalized:
  10264 	 *
  10265 	 * - A forked child which is waiting for being woken up by
  10266 	 *   wake_up_new_task().
  10267 	 * - A task which has been woken up by try_to_wake_up() and
  10268 	 *   waiting for actually being woken up by sched_ttwu_pending().
  10269 	 */
  10270 	if (!se->sum_exec_runtime ||
  10271 	    (p->state == TASK_WAKING && p->sched_remote_wakeup))
  10272 		return true;
  10273 
  10274 	return false;
  10275 }
  10276 
  10277 #ifdef CONFIG_FAIR_GROUP_SCHED
  10278 /*
  10279  * Propagate the changes of the sched_entity across the tg tree to make it
  10280  * visible to the root
  10281  */
  10282 static void propagate_entity_cfs_rq(struct sched_entity *se)
  10283 {
  10284 	struct cfs_rq *cfs_rq;
  10285 
  10286 	/* Start to propagate at parent */
  10287 	se = se->parent;
  10288 
  10289 	for_each_sched_entity(se) {
  10290 		cfs_rq = cfs_rq_of(se);
  10291 
  10292 		if (cfs_rq_throttled(cfs_rq))
  10293 			break;
  10294 
  10295 		update_load_avg(cfs_rq, se, UPDATE_TG);
  10296 	}
  10297 }
  10298 #else
  10299 static void propagate_entity_cfs_rq(struct sched_entity *se) { }
  10300 #endif
  10301 
  10302 static void detach_entity_cfs_rq(struct sched_entity *se)
  10303 {
  10304 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
  10305 
  10306 	/* Catch up with the cfs_rq and remove our load when we leave */
  10307 	update_load_avg(cfs_rq, se, 0);
  10308 	detach_entity_load_avg(cfs_rq, se);
  10309 	update_tg_load_avg(cfs_rq, false);
  10310 	propagate_entity_cfs_rq(se);
  10311 }
  10312 
  10313 static void attach_entity_cfs_rq(struct sched_entity *se)
  10314 {
  10315 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
  10316 
  10317 #ifdef CONFIG_FAIR_GROUP_SCHED
  10318 	/*
  10319 	 * Since the real-depth could have been changed (only FAIR
  10320 	 * class maintain depth value), reset depth properly.
  10321 	 */
  10322 	se->depth = se->parent ? se->parent->depth + 1 : 0;
  10323 #endif
  10324 
  10325 	/* Synchronize entity with its cfs_rq */
  10326 	update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
  10327 	attach_entity_load_avg(cfs_rq, se, 0);
  10328 	update_tg_load_avg(cfs_rq, false);
  10329 	propagate_entity_cfs_rq(se);
  10330 }
  10331 
  10332 static void detach_task_cfs_rq(struct task_struct *p)
  10333 {
  10334 	struct sched_entity *se = &p->se;
  10335 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
  10336 
  10337 	if (!vruntime_normalized(p)) {
  10338 		/*
  10339 		 * Fix up our vruntime so that the current sleep doesn't
  10340 		 * cause 'unlimited' sleep bonus.
  10341 		 */
  10342 		place_entity(cfs_rq, se, 0);
  10343 		se->vruntime -= cfs_rq->min_vruntime;
  10344 	}
  10345 
  10346 	detach_entity_cfs_rq(se);
  10347 }
  10348 
  10349 static void attach_task_cfs_rq(struct task_struct *p)
  10350 {
  10351 	struct sched_entity *se = &p->se;
  10352 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
  10353 
  10354 	attach_entity_cfs_rq(se);
  10355 
  10356 	if (!vruntime_normalized(p))
  10357 		se->vruntime += cfs_rq->min_vruntime;
  10358 }
  10359 
  10360 static void switched_from_fair(struct rq *rq, struct task_struct *p)
  10361 {
  10362 	detach_task_cfs_rq(p);
  10363 }
  10364 
  10365 static void switched_to_fair(struct rq *rq, struct task_struct *p)
  10366 {
  10367 	attach_task_cfs_rq(p);
  10368 
  10369 	if (task_on_rq_queued(p)) {
  10370 		/*
  10371 		 * We were most likely switched from sched_rt, so
  10372 		 * kick off the schedule if running, otherwise just see
  10373 		 * if we can still preempt the current task.
  10374 		 */
  10375 		if (rq->curr == p)
  10376 			resched_curr(rq);
  10377 		else
  10378 			check_preempt_curr(rq, p, 0);
  10379 	}
  10380 }
  10381 
  10382 /* Account for a task changing its policy or group.
  10383  *
  10384  * This routine is mostly called to set cfs_rq->curr field when a task
  10385  * migrates between groups/classes.
  10386  */
  10387 static void set_curr_task_fair(struct rq *rq)
  10388 {
  10389 	struct sched_entity *se = &rq->curr->se;
  10390 
  10391 	for_each_sched_entity(se) {
  10392 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
  10393 
  10394 		set_next_entity(cfs_rq, se);
  10395 		/* ensure bandwidth has been allocated on our new cfs_rq */
  10396 		account_cfs_rq_runtime(cfs_rq, 0);
  10397 	}
  10398 }
  10399 
  10400 void init_cfs_rq(struct cfs_rq *cfs_rq)
  10401 {
  10402 	cfs_rq->tasks_timeline = RB_ROOT_CACHED;
  10403 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
  10404 #ifndef CONFIG_64BIT
  10405 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
  10406 #endif
  10407 #ifdef CONFIG_SMP
  10408 	raw_spin_lock_init(&cfs_rq->removed.lock);
  10409 #endif
  10410 }
  10411 
  10412 #ifdef CONFIG_FAIR_GROUP_SCHED
  10413 static void task_set_group_fair(struct task_struct *p)
  10414 {
  10415 	struct sched_entity *se = &p->se;
  10416 
  10417 	set_task_rq(p, task_cpu(p));
  10418 	se->depth = se->parent ? se->parent->depth + 1 : 0;
  10419 }
  10420 
  10421 static void task_move_group_fair(struct task_struct *p)
  10422 {
  10423 	detach_task_cfs_rq(p);
  10424 	set_task_rq(p, task_cpu(p));
  10425 
  10426 #ifdef CONFIG_SMP
  10427 	/* Tell se's cfs_rq has been changed -- migrated */
  10428 	p->se.avg.last_update_time = 0;
  10429 #endif
  10430 	attach_task_cfs_rq(p);
  10431 }
  10432 
  10433 static void task_change_group_fair(struct task_struct *p, int type)
  10434 {
  10435 	switch (type) {
  10436 	case TASK_SET_GROUP:
  10437 		task_set_group_fair(p);
  10438 		break;
  10439 
  10440 	case TASK_MOVE_GROUP:
  10441 		task_move_group_fair(p);
  10442 		break;
  10443 	}
  10444 }
  10445 
  10446 void free_fair_sched_group(struct task_group *tg)
  10447 {
  10448 	int i;
  10449 
  10450 	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
  10451 
  10452 	for_each_possible_cpu(i) {
  10453 		if (tg->cfs_rq)
  10454 			kfree(tg->cfs_rq[i]);
  10455 		if (tg->se)
  10456 			kfree(tg->se[i]);
  10457 	}
  10458 
  10459 	kfree(tg->cfs_rq);
  10460 	kfree(tg->se);
  10461 }
  10462 
  10463 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
  10464 {
  10465 	struct sched_entity *se;
  10466 	struct cfs_rq *cfs_rq;
  10467 	int i;
  10468 
  10469 	tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
  10470 	if (!tg->cfs_rq)
  10471 		goto err;
  10472 	tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL);
  10473 	if (!tg->se)
  10474 		goto err;
  10475 
  10476 	tg->shares = NICE_0_LOAD;
  10477 
  10478 	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
  10479 
  10480 	for_each_possible_cpu(i) {
  10481 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
  10482 				      GFP_KERNEL, cpu_to_node(i));
  10483 		if (!cfs_rq)
  10484 			goto err;
  10485 
  10486 		se = kzalloc_node(sizeof(struct sched_entity),
  10487 				  GFP_KERNEL, cpu_to_node(i));
  10488 		if (!se)
  10489 			goto err_free_rq;
  10490 
  10491 		init_cfs_rq(cfs_rq);
  10492 		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
  10493 		init_entity_runnable_average(se);
  10494 	}
  10495 
  10496 	return 1;
  10497 
  10498 err_free_rq:
  10499 	kfree(cfs_rq);
  10500 err:
  10501 	return 0;
  10502 }
  10503 
  10504 void online_fair_sched_group(struct task_group *tg)
  10505 {
  10506 	struct sched_entity *se;
  10507 	struct rq *rq;
  10508 	int i;
  10509 
  10510 	for_each_possible_cpu(i) {
  10511 		rq = cpu_rq(i);
  10512 		se = tg->se[i];
  10513 
  10514 		raw_spin_lock_irq(&rq->lock);
  10515 		update_rq_clock(rq);
  10516 		attach_entity_cfs_rq(se);
  10517 		sync_throttle(tg, i);
  10518 		raw_spin_unlock_irq(&rq->lock);
  10519 	}
  10520 }
  10521 
  10522 void unregister_fair_sched_group(struct task_group *tg)
  10523 {
  10524 	unsigned long flags;
  10525 	struct rq *rq;
  10526 	int cpu;
  10527 
  10528 	for_each_possible_cpu(cpu) {
  10529 		if (tg->se[cpu])
  10530 			remove_entity_load_avg(tg->se[cpu]);
  10531 
  10532 		/*
  10533 		 * Only empty task groups can be destroyed; so we can speculatively
  10534 		 * check on_list without danger of it being re-added.
  10535 		 */
  10536 		if (!tg->cfs_rq[cpu]->on_list)
  10537 			continue;
  10538 
  10539 		rq = cpu_rq(cpu);
  10540 
  10541 		raw_spin_lock_irqsave(&rq->lock, flags);
  10542 		list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
  10543 		raw_spin_unlock_irqrestore(&rq->lock, flags);
  10544 	}
  10545 }
  10546 
  10547 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
  10548 			struct sched_entity *se, int cpu,
  10549 			struct sched_entity *parent)
  10550 {
  10551 	struct rq *rq = cpu_rq(cpu);
  10552 
  10553 	cfs_rq->tg = tg;
  10554 	cfs_rq->rq = rq;
  10555 	init_cfs_rq_runtime(cfs_rq);
  10556 
  10557 	tg->cfs_rq[cpu] = cfs_rq;
  10558 	tg->se[cpu] = se;
  10559 
  10560 	/* se could be NULL for root_task_group */
  10561 	if (!se)
  10562 		return;
  10563 
  10564 	if (!parent) {
  10565 		se->cfs_rq = &rq->cfs;
  10566 		se->depth = 0;
  10567 	} else {
  10568 		se->cfs_rq = parent->my_q;
  10569 		se->depth = parent->depth + 1;
  10570 	}
  10571 
  10572 	se->my_q = cfs_rq;
  10573 	/* guarantee group entities always have weight */
  10574 	update_load_set(&se->load, NICE_0_LOAD);
  10575 	se->parent = parent;
  10576 }
  10577 
  10578 static DEFINE_MUTEX(shares_mutex);
  10579 
  10580 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
  10581 {
  10582 	int i;
  10583 
  10584 	/*
  10585 	 * We can't change the weight of the root cgroup.
  10586 	 */
  10587 	if (!tg->se[0])
  10588 		return -EINVAL;
  10589 
  10590 	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
  10591 
  10592 	mutex_lock(&shares_mutex);
  10593 	if (tg->shares == shares)
  10594 		goto done;
  10595 
  10596 	tg->shares = shares;
  10597 	for_each_possible_cpu(i) {
  10598 		struct rq *rq = cpu_rq(i);
  10599 		struct sched_entity *se = tg->se[i];
  10600 		struct rq_flags rf;
  10601 
  10602 		/* Propagate contribution to hierarchy */
  10603 		rq_lock_irqsave(rq, &rf);
  10604 		update_rq_clock(rq);
  10605 		for_each_sched_entity(se) {
  10606 			update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
  10607 			update_cfs_group(se);
  10608 		}
  10609 		rq_unlock_irqrestore(rq, &rf);
  10610 	}
  10611 
  10612 done:
  10613 	mutex_unlock(&shares_mutex);
  10614 	return 0;
  10615 }
  10616 #else /* CONFIG_FAIR_GROUP_SCHED */
  10617 
  10618 void free_fair_sched_group(struct task_group *tg) { }
  10619 
  10620 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
  10621 {
  10622 	return 1;
  10623 }
  10624 
  10625 void online_fair_sched_group(struct task_group *tg) { }
  10626 
  10627 void unregister_fair_sched_group(struct task_group *tg) { }
  10628 
  10629 #endif /* CONFIG_FAIR_GROUP_SCHED */
  10630 
  10631 
  10632 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
  10633 {
  10634 	struct sched_entity *se = &task->se;
  10635 	unsigned int rr_interval = 0;
  10636 
  10637 	/*
  10638 	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
  10639 	 * idle runqueue:
  10640 	 */
  10641 	if (rq->cfs.load.weight)
  10642 		rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
  10643 
  10644 	return rr_interval;
  10645 }
  10646 
  10647 /*
  10648  * All the scheduling class methods:
  10649  */
  10650 const struct sched_class fair_sched_class = {
  10651 	.next			= &idle_sched_class,
  10652 	.enqueue_task		= enqueue_task_fair,
  10653 	.dequeue_task		= dequeue_task_fair,
  10654 	.yield_task		= yield_task_fair,
  10655 	.yield_to_task		= yield_to_task_fair,
  10656 
  10657 	.check_preempt_curr	= check_preempt_wakeup,
  10658 
  10659 	.pick_next_task		= pick_next_task_fair,
  10660 	.put_prev_task		= put_prev_task_fair,
  10661 
  10662 #ifdef CONFIG_SMP
  10663 	.select_task_rq		= select_task_rq_fair,
  10664 	.migrate_task_rq	= migrate_task_rq_fair,
  10665 
  10666 	.rq_online		= rq_online_fair,
  10667 	.rq_offline		= rq_offline_fair,
  10668 
  10669 	.task_dead		= task_dead_fair,
  10670 	.set_cpus_allowed	= set_cpus_allowed_common,
  10671 #endif
  10672 
  10673 	.set_curr_task          = set_curr_task_fair,
  10674 	.task_tick		= task_tick_fair,
  10675 	.task_fork		= task_fork_fair,
  10676 
  10677 	.prio_changed		= prio_changed_fair,
  10678 	.switched_from		= switched_from_fair,
  10679 	.switched_to		= switched_to_fair,
  10680 
  10681 	.get_rr_interval	= get_rr_interval_fair,
  10682 
  10683 	.update_curr		= update_curr_fair,
  10684 
  10685 #ifdef CONFIG_FAIR_GROUP_SCHED
  10686 	.task_change_group	= task_change_group_fair,
  10687 #endif
  10688 };
  10689 
  10690 #ifdef CONFIG_SCHED_DEBUG
  10691 void print_cfs_stats(struct seq_file *m, int cpu)
  10692 {
  10693 	struct cfs_rq *cfs_rq, *pos;
  10694 
  10695 	rcu_read_lock();
  10696 	for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
  10697 		print_cfs_rq(m, cpu, cfs_rq);
  10698 	rcu_read_unlock();
  10699 }
  10700 
  10701 #ifdef CONFIG_NUMA_BALANCING
  10702 void show_numa_stats(struct task_struct *p, struct seq_file *m)
  10703 {
  10704 	int node;
  10705 	unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
  10706 
  10707 	for_each_online_node(node) {
  10708 		if (p->numa_faults) {
  10709 			tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
  10710 			tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
  10711 		}
  10712 		if (p->numa_group) {
  10713 			gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
  10714 			gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
  10715 		}
  10716 		print_numa_stats(m, node, tsf, tpf, gsf, gpf);
  10717 	}
  10718 }
  10719 #endif /* CONFIG_NUMA_BALANCING */
  10720 #endif /* CONFIG_SCHED_DEBUG */
  10721 
  10722 __init void init_sched_fair_class(void)
  10723 {
  10724 #ifdef CONFIG_SMP
  10725 	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
  10726 
  10727 #ifdef CONFIG_NO_HZ_COMMON
  10728 	nohz.next_balance = jiffies;
  10729 	nohz.next_blocked = jiffies;
  10730 	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
  10731 #endif
  10732 #endif /* SMP */
  10733 
  10734 }