whiterose

linux unikernel
git clone https://git.ne02ptzero.me/git/whiterose

core.c (179089B)


      1 /*
      2  *  kernel/sched/core.c
      3  *
      4  *  Core kernel scheduler code and related syscalls
      5  *
      6  *  Copyright (C) 1991-2002  Linus Torvalds
      7  */
      8 #include "sched.h"
      9 
     10 #include <linux/nospec.h>
     11 
     12 #include <linux/kcov.h>
     13 
     14 #include <asm/switch_to.h>
     15 #include <asm/tlb.h>
     16 
     17 #include "../workqueue_internal.h"
     18 #include "../smpboot.h"
     19 
     20 #include "pelt.h"
     21 
     22 #define CREATE_TRACE_POINTS
     23 #include <trace/events/sched.h>
     24 
     25 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
     26 
     27 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL)
     28 /*
     29  * Debugging: various feature bits
     30  *
     31  * If SCHED_DEBUG is disabled, each compilation unit has its own copy of
     32  * sysctl_sched_features, defined in sched.h, to allow constants propagation
     33  * at compile time and compiler optimization based on features default.
     34  */
     35 #define SCHED_FEAT(name, enabled)	\
     36 	(1UL << __SCHED_FEAT_##name) * enabled |
     37 const_debug unsigned int sysctl_sched_features =
     38 #include "features.h"
     39 	0;
     40 #undef SCHED_FEAT
     41 #endif
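         /*
          * For illustration: with two entries of the kind features.h typically
          * carries, e.g. SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) and
          * SCHED_FEAT(START_DEBIT, true), the #include in the block above
          * expands to roughly:
          *
          *	const_debug unsigned int sysctl_sched_features =
          *		(1UL << __SCHED_FEAT_GENTLE_FAIR_SLEEPERS) * true |
          *		(1UL << __SCHED_FEAT_START_DEBIT) * true |
          *		0;
          *
          * Each enabled feature contributes its bit to the default mask and
          * the trailing 0 terminates the '|' chain.
          */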
     42 
     43 /*
     44  * Number of tasks to iterate in a single balance run.
     45  * Limited because this is done with IRQs disabled.
     46  */
     47 const_debug unsigned int sysctl_sched_nr_migrate = 32;
     48 
     49 /*
     50  * period over which we measure -rt task CPU usage in us.
     51  * default: 1s
     52  */
     53 unsigned int sysctl_sched_rt_period = 1000000;
     54 
     55 __read_mostly int scheduler_running;
     56 
     57 /*
     58  * part of the period that we allow rt tasks to run in us.
     59  * default: 0.95s
     60  */
     61 int sysctl_sched_rt_runtime = 950000;
     62 
     63 /*
     64  * __task_rq_lock - lock the rq @p resides on.
     65  */
     66 struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
     67 	__acquires(rq->lock)
     68 {
     69 	struct rq *rq;
     70 
     71 	lockdep_assert_held(&p->pi_lock);
     72 
     73 	for (;;) {
     74 		rq = task_rq(p);
     75 		raw_spin_lock(&rq->lock);
     76 		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
     77 			rq_pin_lock(rq, rf);
     78 			return rq;
     79 		}
     80 		raw_spin_unlock(&rq->lock);
     81 
     82 		while (unlikely(task_on_rq_migrating(p)))
     83 			cpu_relax();
     84 	}
     85 }
     86 
     87 /*
     88  * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
     89  */
     90 struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
     91 	__acquires(p->pi_lock)
     92 	__acquires(rq->lock)
     93 {
     94 	struct rq *rq;
     95 
     96 	for (;;) {
     97 		raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
     98 		rq = task_rq(p);
     99 		raw_spin_lock(&rq->lock);
    100 		/*
    101 		 *	move_queued_task()		task_rq_lock()
    102 		 *
    103 		 *	ACQUIRE (rq->lock)
    104 		 *	[S] ->on_rq = MIGRATING		[L] rq = task_rq()
    105 		 *	WMB (__set_task_cpu())		ACQUIRE (rq->lock);
    106 		 *	[S] ->cpu = new_cpu		[L] task_rq()
    107 		 *					[L] ->on_rq
    108 		 *	RELEASE (rq->lock)
    109 		 *
    110 		 * If we observe the old CPU in task_rq_lock(), the acquire of
    111 		 * the old rq->lock will fully serialize against the stores.
    112 		 *
    113 		 * If we observe the new CPU in task_rq_lock(), the address
    114 		 * dependency headed by '[L] rq = task_rq()' and the acquire
    115 		 * will pair with the WMB to ensure we then also see migrating.
    116 		 */
    117 		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
    118 			rq_pin_lock(rq, rf);
    119 			return rq;
    120 		}
    121 		raw_spin_unlock(&rq->lock);
    122 		raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
    123 
    124 		while (unlikely(task_on_rq_migrating(p)))
    125 			cpu_relax();
    126 	}
    127 }
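         /*
          * Typical usage, sketched after the pattern the rest of this file
          * follows (see e.g. __set_cpus_allowed_ptr()):
          *
          *	struct rq_flags rf;
          *	struct rq *rq;
          *
          *	rq = task_rq_lock(p, &rf);
          *	update_rq_clock(rq);
          *	...	// p cannot migrate while both locks are held
          *	task_rq_unlock(rq, p, &rf);
          */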
    128 
    129 /*
    130  * RQ-clock updating methods:
    131  */
    132 
    133 static void update_rq_clock_task(struct rq *rq, s64 delta)
    134 {
    135 /*
     136  * In theory, the compiler should just see 0 here, and optimize out the call
    137  * to sched_rt_avg_update. But I don't trust it...
    138  */
    139 	s64 __maybe_unused steal = 0, irq_delta = 0;
    140 
    141 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
    142 	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
    143 
    144 	/*
    145 	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
    146 	 * this case when a previous update_rq_clock() happened inside a
    147 	 * {soft,}irq region.
    148 	 *
    149 	 * When this happens, we stop ->clock_task and only update the
    150 	 * prev_irq_time stamp to account for the part that fit, so that a next
    151 	 * update will consume the rest. This ensures ->clock_task is
    152 	 * monotonic.
    153 	 *
     154 	 * It does, however, cause some slight misattribution of {soft,}irq
     155 	 * time; a more accurate solution would be to update the irq_time using
    156 	 * the current rq->clock timestamp, except that would require using
    157 	 * atomic ops.
    158 	 */
    159 	if (irq_delta > delta)
    160 		irq_delta = delta;
    161 
    162 	rq->prev_irq_time += irq_delta;
    163 	delta -= irq_delta;
    164 #endif
    165 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
    166 	if (static_key_false((&paravirt_steal_rq_enabled))) {
    167 		steal = paravirt_steal_clock(cpu_of(rq));
    168 		steal -= rq->prev_steal_time_rq;
    169 
    170 		if (unlikely(steal > delta))
    171 			steal = delta;
    172 
    173 		rq->prev_steal_time_rq += steal;
    174 		delta -= steal;
    175 	}
    176 #endif
    177 
    178 	rq->clock_task += delta;
    179 
    180 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
    181 	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
    182 		update_irq_load_avg(rq, irq_delta + steal);
    183 #endif
    184 	update_rq_clock_pelt(rq, delta);
    185 }
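         /*
          * Worked example of the irq_delta clamp in update_rq_clock_task():
          * if 3ms of rq clock elapsed (delta = 3000000) but irq_time advanced
          * by 4ms since the last update (irq_delta = 4000000), irq_delta is
          * clamped to 3ms, ->clock_task does not advance this round, and the
          * extra 1ms stays out of prev_irq_time so the next update sees and
          * consumes it. That is what keeps ->clock_task monotonic.
          */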
    186 
    187 void update_rq_clock(struct rq *rq)
    188 {
    189 	s64 delta;
    190 
    191 	lockdep_assert_held(&rq->lock);
    192 
    193 	if (rq->clock_update_flags & RQCF_ACT_SKIP)
    194 		return;
    195 
    196 #ifdef CONFIG_SCHED_DEBUG
    197 	if (sched_feat(WARN_DOUBLE_CLOCK))
    198 		SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
    199 	rq->clock_update_flags |= RQCF_UPDATED;
    200 #endif
    201 
    202 	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
    203 	if (delta < 0)
    204 		return;
    205 	rq->clock += delta;
    206 	update_rq_clock_task(rq, delta);
    207 }
    208 
    209 
    210 #ifdef CONFIG_SCHED_HRTICK
    211 /*
    212  * Use HR-timers to deliver accurate preemption points.
    213  */
    214 
    215 static void hrtick_clear(struct rq *rq)
    216 {
    217 	if (hrtimer_active(&rq->hrtick_timer))
    218 		hrtimer_cancel(&rq->hrtick_timer);
    219 }
    220 
    221 /*
    222  * High-resolution timer tick.
    223  * Runs from hardirq context with interrupts disabled.
    224  */
    225 static enum hrtimer_restart hrtick(struct hrtimer *timer)
    226 {
    227 	struct rq *rq = container_of(timer, struct rq, hrtick_timer);
    228 	struct rq_flags rf;
    229 
    230 	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
    231 
    232 	rq_lock(rq, &rf);
    233 	update_rq_clock(rq);
    234 	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
    235 	rq_unlock(rq, &rf);
    236 
    237 	return HRTIMER_NORESTART;
    238 }
    239 
    240 #ifdef CONFIG_SMP
    241 
    242 static void __hrtick_restart(struct rq *rq)
    243 {
    244 	struct hrtimer *timer = &rq->hrtick_timer;
    245 
    246 	hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
    247 }
    248 
    249 /*
    250  * called from hardirq (IPI) context
    251  */
    252 static void __hrtick_start(void *arg)
    253 {
    254 	struct rq *rq = arg;
    255 	struct rq_flags rf;
    256 
    257 	rq_lock(rq, &rf);
    258 	__hrtick_restart(rq);
    259 	rq->hrtick_csd_pending = 0;
    260 	rq_unlock(rq, &rf);
    261 }
    262 
    263 /*
    264  * Called to set the hrtick timer state.
    265  *
    266  * called with rq->lock held and irqs disabled
    267  */
    268 void hrtick_start(struct rq *rq, u64 delay)
    269 {
    270 	struct hrtimer *timer = &rq->hrtick_timer;
    271 	ktime_t time;
    272 	s64 delta;
    273 
    274 	/*
    275 	 * Don't schedule slices shorter than 10000ns, that just
    276 	 * doesn't make sense and can cause timer DoS.
    277 	 */
    278 	delta = max_t(s64, delay, 10000LL);
    279 	time = ktime_add_ns(timer->base->get_time(), delta);
    280 
    281 	hrtimer_set_expires(timer, time);
    282 
    283 	if (rq == this_rq()) {
    284 		__hrtick_restart(rq);
    285 	} else if (!rq->hrtick_csd_pending) {
    286 		smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
    287 		rq->hrtick_csd_pending = 1;
    288 	}
    289 }
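         /*
          * E.g. a requested slice of 3000ns is stretched to the 10000ns
          * minimum before the expiry time is computed; the !SMP variant
          * below applies the same clamp.
          */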
    290 
    291 #else
    292 /*
    293  * Called to set the hrtick timer state.
    294  *
    295  * called with rq->lock held and irqs disabled
    296  */
    297 void hrtick_start(struct rq *rq, u64 delay)
    298 {
    299 	/*
    300 	 * Don't schedule slices shorter than 10000ns, that just
    301 	 * doesn't make sense. Rely on vruntime for fairness.
    302 	 */
    303 	delay = max_t(u64, delay, 10000LL);
    304 	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
    305 		      HRTIMER_MODE_REL_PINNED);
    306 }
    307 #endif /* CONFIG_SMP */
    308 
    309 static void hrtick_rq_init(struct rq *rq)
    310 {
    311 #ifdef CONFIG_SMP
    312 	rq->hrtick_csd_pending = 0;
    313 
    314 	rq->hrtick_csd.flags = 0;
    315 	rq->hrtick_csd.func = __hrtick_start;
    316 	rq->hrtick_csd.info = rq;
    317 #endif
    318 
    319 	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
    320 	rq->hrtick_timer.function = hrtick;
    321 }
    322 #else	/* CONFIG_SCHED_HRTICK */
    323 static inline void hrtick_clear(struct rq *rq)
    324 {
    325 }
    326 
    327 static inline void hrtick_rq_init(struct rq *rq)
    328 {
    329 }
    330 #endif	/* CONFIG_SCHED_HRTICK */
    331 
    332 /*
    333  * cmpxchg based fetch_or, macro so it works for different integer types
    334  */
    335 #define fetch_or(ptr, mask)						\
    336 	({								\
    337 		typeof(ptr) _ptr = (ptr);				\
    338 		typeof(mask) _mask = (mask);				\
    339 		typeof(*_ptr) _old, _val = *_ptr;			\
    340 									\
    341 		for (;;) {						\
    342 			_old = cmpxchg(_ptr, _val, _val | _mask);	\
    343 			if (_old == _val)				\
    344 				break;					\
    345 			_val = _old;					\
    346 		}							\
    347 	_old;								\
    348 })
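         /*
          * Like atomic_fetch_or(), but for any integer type: the mask is set
          * atomically and the *old* value is returned. E.g. with *(ptr) == 0x1
          * and mask == 0x4, fetch_or() leaves 0x5 behind and evaluates to 0x1.
          * set_nr_and_not_polling() below relies on that old value to learn
          * whether _TIF_POLLING_NRFLAG was set when it set _TIF_NEED_RESCHED.
          */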
    349 
    350 #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
    351 /*
    352  * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
    353  * this avoids any races wrt polling state changes and thereby avoids
    354  * spurious IPIs.
    355  */
    356 static bool set_nr_and_not_polling(struct task_struct *p)
    357 {
    358 	struct thread_info *ti = task_thread_info(p);
    359 	return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
    360 }
    361 
    362 /*
    363  * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
    364  *
    365  * If this returns true, then the idle task promises to call
    366  * sched_ttwu_pending() and reschedule soon.
    367  */
    368 static bool set_nr_if_polling(struct task_struct *p)
    369 {
    370 	struct thread_info *ti = task_thread_info(p);
    371 	typeof(ti->flags) old, val = READ_ONCE(ti->flags);
    372 
    373 	for (;;) {
    374 		if (!(val & _TIF_POLLING_NRFLAG))
    375 			return false;
    376 		if (val & _TIF_NEED_RESCHED)
    377 			return true;
    378 		old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
    379 		if (old == val)
    380 			break;
    381 		val = old;
    382 	}
    383 	return true;
    384 }
    385 
    386 #else
    387 static bool set_nr_and_not_polling(struct task_struct *p)
    388 {
    389 	set_tsk_need_resched(p);
    390 	return true;
    391 }
    392 
    393 #ifdef CONFIG_SMP
    394 static bool set_nr_if_polling(struct task_struct *p)
    395 {
    396 	return false;
    397 }
    398 #endif
    399 #endif
    400 
    401 static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
    402 {
    403 	struct wake_q_node *node = &task->wake_q;
    404 
    405 	/*
     406 	 * Atomically grab the task; if ->wake_q is !nil already, it means
     407 	 * it's already queued (either by us or someone else) and will get the
    408 	 * wakeup due to that.
    409 	 *
    410 	 * In order to ensure that a pending wakeup will observe our pending
    411 	 * state, even in the failed case, an explicit smp_mb() must be used.
    412 	 */
    413 	smp_mb__before_atomic();
    414 	if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
    415 		return false;
    416 
    417 	/*
    418 	 * The head is context local, there can be no concurrency.
    419 	 */
    420 	*head->lastp = node;
    421 	head->lastp = &node->next;
    422 	return true;
    423 }
    424 
    425 /**
    426  * wake_q_add() - queue a wakeup for 'later' waking.
    427  * @head: the wake_q_head to add @task to
    428  * @task: the task to queue for 'later' wakeup
    429  *
    430  * Queue a task for later wakeup, most likely by the wake_up_q() call in the
    431  * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
    432  * instantly.
    433  *
    434  * This function must be used as-if it were wake_up_process(); IOW the task
    435  * must be ready to be woken at this location.
    436  */
    437 void wake_q_add(struct wake_q_head *head, struct task_struct *task)
    438 {
    439 	if (__wake_q_add(head, task))
    440 		get_task_struct(task);
    441 }
    442 
    443 /**
    444  * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
    445  * @head: the wake_q_head to add @task to
    446  * @task: the task to queue for 'later' wakeup
    447  *
    448  * Queue a task for later wakeup, most likely by the wake_up_q() call in the
    449  * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
    450  * instantly.
    451  *
    452  * This function must be used as-if it were wake_up_process(); IOW the task
    453  * must be ready to be woken at this location.
    454  *
    455  * This function is essentially a task-safe equivalent to wake_q_add(). Callers
    456  * that already hold reference to @task can call the 'safe' version and trust
    457  * wake_q to do the right thing depending whether or not the @task is already
    458  * queued for wakeup.
    459  */
    460 void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
    461 {
    462 	if (!__wake_q_add(head, task))
    463 		put_task_struct(task);
    464 }
    465 
    466 void wake_up_q(struct wake_q_head *head)
    467 {
    468 	struct wake_q_node *node = head->first;
    469 
    470 	while (node != WAKE_Q_TAIL) {
    471 		struct task_struct *task;
    472 
    473 		task = container_of(node, struct task_struct, wake_q);
    474 		BUG_ON(!task);
    475 		/* Task can safely be re-inserted now: */
    476 		node = node->next;
    477 		task->wake_q.next = NULL;
    478 
    479 		/*
    480 		 * wake_up_process() executes a full barrier, which pairs with
    481 		 * the queueing in wake_q_add() so as not to miss wakeups.
    482 		 */
    483 		wake_up_process(task);
    484 		put_task_struct(task);
    485 	}
    486 }
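         /*
          * The intended pattern, sketched after users such as the futex code
          * (DEFINE_WAKE_Q comes from <linux/sched/wake_q.h>; 'some_lock' and
          * 'task' are placeholders):
          *
          *	DEFINE_WAKE_Q(wake_q);
          *
          *	spin_lock(&some_lock);
          *	...
          *	wake_q_add(&wake_q, task);	// queue only, no wakeup yet
          *	...
          *	spin_unlock(&some_lock);
          *	wake_up_q(&wake_q);		// wakeups, with the lock dropped
          *
          * This keeps the wake_up_process() calls outside the lock that
          * decided whom to wake.
          */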
    487 
    488 /*
    489  * resched_curr - mark rq's current task 'to be rescheduled now'.
    490  *
    491  * On UP this means the setting of the need_resched flag, on SMP it
    492  * might also involve a cross-CPU call to trigger the scheduler on
    493  * the target CPU.
    494  */
    495 void resched_curr(struct rq *rq)
    496 {
    497 	struct task_struct *curr = rq->curr;
    498 	int cpu;
    499 
    500 	lockdep_assert_held(&rq->lock);
    501 
    502 	if (test_tsk_need_resched(curr))
    503 		return;
    504 
    505 	cpu = cpu_of(rq);
    506 
    507 	if (cpu == smp_processor_id()) {
    508 		set_tsk_need_resched(curr);
    509 		set_preempt_need_resched();
    510 		return;
    511 	}
    512 
    513 	if (set_nr_and_not_polling(curr))
    514 		smp_send_reschedule(cpu);
    515 	else
    516 		trace_sched_wake_idle_without_ipi(cpu);
    517 }
    518 
    519 void resched_cpu(int cpu)
    520 {
    521 	struct rq *rq = cpu_rq(cpu);
    522 	unsigned long flags;
    523 
    524 	raw_spin_lock_irqsave(&rq->lock, flags);
    525 	if (cpu_online(cpu) || cpu == smp_processor_id())
    526 		resched_curr(rq);
    527 	raw_spin_unlock_irqrestore(&rq->lock, flags);
    528 }
    529 
    530 #ifdef CONFIG_SMP
    531 #ifdef CONFIG_NO_HZ_COMMON
    532 /*
    533  * In the semi idle case, use the nearest busy CPU for migrating timers
    534  * from an idle CPU.  This is good for power-savings.
    535  *
    536  * We don't do similar optimization for completely idle system, as
    537  * selecting an idle CPU will add more delays to the timers than intended
    538  * (as that CPU's timer base may not be uptodate wrt jiffies etc).
    539  */
    540 int get_nohz_timer_target(void)
    541 {
    542 	int i, cpu = smp_processor_id();
    543 	struct sched_domain *sd;
    544 
    545 	if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
    546 		return cpu;
    547 
    548 	rcu_read_lock();
    549 	for_each_domain(cpu, sd) {
    550 		for_each_cpu(i, sched_domain_span(sd)) {
    551 			if (cpu == i)
    552 				continue;
    553 
    554 			if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
    555 				cpu = i;
    556 				goto unlock;
    557 			}
    558 		}
    559 	}
    560 
    561 	if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
    562 		cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
    563 unlock:
    564 	rcu_read_unlock();
    565 	return cpu;
    566 }
    567 
    568 /*
    569  * When add_timer_on() enqueues a timer into the timer wheel of an
    570  * idle CPU then this timer might expire before the next timer event
    571  * which is scheduled to wake up that CPU. In case of a completely
    572  * idle system the next event might even be infinite time into the
    573  * future. wake_up_idle_cpu() ensures that the CPU is woken up and
    574  * leaves the inner idle loop so the newly added timer is taken into
    575  * account when the CPU goes back to idle and evaluates the timer
    576  * wheel for the next timer event.
    577  */
    578 static void wake_up_idle_cpu(int cpu)
    579 {
    580 	struct rq *rq = cpu_rq(cpu);
    581 
    582 	if (cpu == smp_processor_id())
    583 		return;
    584 
    585 	if (set_nr_and_not_polling(rq->idle))
    586 		smp_send_reschedule(cpu);
    587 	else
    588 		trace_sched_wake_idle_without_ipi(cpu);
    589 }
    590 
    591 static bool wake_up_full_nohz_cpu(int cpu)
    592 {
    593 	/*
    594 	 * We just need the target to call irq_exit() and re-evaluate
    595 	 * the next tick. The nohz full kick at least implies that.
    596 	 * If needed we can still optimize that later with an
    597 	 * empty IRQ.
    598 	 */
    599 	if (cpu_is_offline(cpu))
    600 		return true;  /* Don't try to wake offline CPUs. */
    601 	if (tick_nohz_full_cpu(cpu)) {
    602 		if (cpu != smp_processor_id() ||
    603 		    tick_nohz_tick_stopped())
    604 			tick_nohz_full_kick_cpu(cpu);
    605 		return true;
    606 	}
    607 
    608 	return false;
    609 }
    610 
    611 /*
    612  * Wake up the specified CPU.  If the CPU is going offline, it is the
    613  * caller's responsibility to deal with the lost wakeup, for example,
    614  * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
    615  */
    616 void wake_up_nohz_cpu(int cpu)
    617 {
    618 	if (!wake_up_full_nohz_cpu(cpu))
    619 		wake_up_idle_cpu(cpu);
    620 }
    621 
    622 static inline bool got_nohz_idle_kick(void)
    623 {
    624 	int cpu = smp_processor_id();
    625 
    626 	if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK))
    627 		return false;
    628 
    629 	if (idle_cpu(cpu) && !need_resched())
    630 		return true;
    631 
    632 	/*
     633 	 * We can't run the idle load balance on this CPU at this time, so we
    634 	 * cancel it and clear NOHZ_BALANCE_KICK
    635 	 */
    636 	atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
    637 	return false;
    638 }
    639 
    640 #else /* CONFIG_NO_HZ_COMMON */
    641 
    642 static inline bool got_nohz_idle_kick(void)
    643 {
    644 	return false;
    645 }
    646 
    647 #endif /* CONFIG_NO_HZ_COMMON */
    648 
    649 #ifdef CONFIG_NO_HZ_FULL
    650 bool sched_can_stop_tick(struct rq *rq)
    651 {
    652 	int fifo_nr_running;
    653 
    654 	/* Deadline tasks, even if single, need the tick */
    655 	if (rq->dl.dl_nr_running)
    656 		return false;
    657 
    658 	/*
     659 	 * If there is more than one RR task, we need the tick to effect the
    660 	 * actual RR behaviour.
    661 	 */
    662 	if (rq->rt.rr_nr_running) {
    663 		if (rq->rt.rr_nr_running == 1)
    664 			return true;
    665 		else
    666 			return false;
    667 	}
    668 
    669 	/*
     670 	 * If there are no RR tasks but there are FIFO tasks, we can skip the
     671 	 * tick; there is no forced preemption between FIFO tasks.
    672 	 */
    673 	fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
    674 	if (fifo_nr_running)
    675 		return true;
    676 
    677 	/*
     678 	 * If there are no DL, RR or FIFO tasks, there must only be CFS tasks left;
    679 	 * if there's more than one we need the tick for involuntary
    680 	 * preemption.
    681 	 */
    682 	if (rq->nr_running > 1)
    683 		return false;
    684 
    685 	return true;
    686 }
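         /*
          * Decision summary: the tick must stay on whenever a deadline task
          * is queued or more than one RR task is queued; a single RR task, or
          * FIFO tasks with no RR, let it stop; otherwise (pure CFS) it can
          * stop only while at most one task is runnable.
          */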
    687 #endif /* CONFIG_NO_HZ_FULL */
    688 #endif /* CONFIG_SMP */
    689 
    690 #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
    691 			(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
    692 /*
    693  * Iterate task_group tree rooted at *from, calling @down when first entering a
    694  * node and @up when leaving it for the final time.
    695  *
    696  * Caller must hold rcu_lock or sufficient equivalent.
    697  */
    698 int walk_tg_tree_from(struct task_group *from,
    699 			     tg_visitor down, tg_visitor up, void *data)
    700 {
    701 	struct task_group *parent, *child;
    702 	int ret;
    703 
    704 	parent = from;
    705 
    706 down:
    707 	ret = (*down)(parent, data);
    708 	if (ret)
    709 		goto out;
    710 	list_for_each_entry_rcu(child, &parent->children, siblings) {
    711 		parent = child;
    712 		goto down;
    713 
    714 up:
    715 		continue;
    716 	}
    717 	ret = (*up)(parent, data);
    718 	if (ret || parent == from)
    719 		goto out;
    720 
    721 	child = parent;
    722 	parent = parent->parent;
    723 	if (parent)
    724 		goto up;
    725 out:
    726 	return ret;
    727 }
    728 
    729 int tg_nop(struct task_group *tg, void *data)
    730 {
    731 	return 0;
    732 }
    733 #endif
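         /*
          * A minimal sketch of driving the walk above; tg_print() is a
          * made-up visitor for illustration and tg_nop() fills the direction
          * a caller does not care about:
          *
          *	static int tg_print(struct task_group *tg, void *data)
          *	{
          *		pr_info("visiting task_group %p\n", tg);
          *		return 0;	// non-zero would abort the walk
          *	}
          *
          *	rcu_read_lock();
          *	walk_tg_tree_from(&root_task_group, tg_print, tg_nop, NULL);
          *	rcu_read_unlock();
          */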
    734 
    735 static void set_load_weight(struct task_struct *p, bool update_load)
    736 {
    737 	int prio = p->static_prio - MAX_RT_PRIO;
    738 	struct load_weight *load = &p->se.load;
    739 
    740 	/*
    741 	 * SCHED_IDLE tasks get minimal weight:
    742 	 */
    743 	if (task_has_idle_policy(p)) {
    744 		load->weight = scale_load(WEIGHT_IDLEPRIO);
    745 		load->inv_weight = WMULT_IDLEPRIO;
    746 		p->se.runnable_weight = load->weight;
    747 		return;
    748 	}
    749 
    750 	/*
    751 	 * SCHED_OTHER tasks have to update their load when changing their
    752 	 * weight
    753 	 */
    754 	if (update_load && p->sched_class == &fair_sched_class) {
    755 		reweight_task(p, prio);
    756 	} else {
    757 		load->weight = scale_load(sched_prio_to_weight[prio]);
    758 		load->inv_weight = sched_prio_to_wmult[prio];
    759 		p->se.runnable_weight = load->weight;
    760 	}
    761 }
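         /*
          * Worked example: a nice-0 SCHED_OTHER task has static_prio 120, so
          * prio = 120 - MAX_RT_PRIO (100) = 20 and sched_prio_to_weight[20]
          * is 1024, the nice-0 reference weight; each nice level scales that
          * by roughly 1.25x, i.e. about a 10% CPU share difference per level.
          */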
    762 
    763 static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
    764 {
    765 	if (!(flags & ENQUEUE_NOCLOCK))
    766 		update_rq_clock(rq);
    767 
    768 	if (!(flags & ENQUEUE_RESTORE)) {
    769 		sched_info_queued(rq, p);
    770 		psi_enqueue(p, flags & ENQUEUE_WAKEUP);
    771 	}
    772 
    773 	p->sched_class->enqueue_task(rq, p, flags);
    774 }
    775 
    776 static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
    777 {
    778 	if (!(flags & DEQUEUE_NOCLOCK))
    779 		update_rq_clock(rq);
    780 
    781 	if (!(flags & DEQUEUE_SAVE)) {
    782 		sched_info_dequeued(rq, p);
    783 		psi_dequeue(p, flags & DEQUEUE_SLEEP);
    784 	}
    785 
    786 	p->sched_class->dequeue_task(rq, p, flags);
    787 }
    788 
    789 void activate_task(struct rq *rq, struct task_struct *p, int flags)
    790 {
    791 	if (task_contributes_to_load(p))
    792 		rq->nr_uninterruptible--;
    793 
    794 	enqueue_task(rq, p, flags);
    795 }
    796 
    797 void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
    798 {
    799 	if (task_contributes_to_load(p))
    800 		rq->nr_uninterruptible++;
    801 
    802 	dequeue_task(rq, p, flags);
    803 }
    804 
    805 /*
    806  * __normal_prio - return the priority that is based on the static prio
    807  */
    808 static inline int __normal_prio(struct task_struct *p)
    809 {
    810 	return p->static_prio;
    811 }
    812 
    813 /*
    814  * Calculate the expected normal priority: i.e. priority
    815  * without taking RT-inheritance into account. Might be
    816  * boosted by interactivity modifiers. Changes upon fork,
    817  * setprio syscalls, and whenever the interactivity
    818  * estimator recalculates.
    819  */
    820 static inline int normal_prio(struct task_struct *p)
    821 {
    822 	int prio;
    823 
    824 	if (task_has_dl_policy(p))
    825 		prio = MAX_DL_PRIO-1;
    826 	else if (task_has_rt_policy(p))
    827 		prio = MAX_RT_PRIO-1 - p->rt_priority;
    828 	else
    829 		prio = __normal_prio(p);
    830 	return prio;
    831 }
    832 
    833 /*
    834  * Calculate the current priority, i.e. the priority
    835  * taken into account by the scheduler. This value might
    836  * be boosted by RT tasks, or might be boosted by
    837  * interactivity modifiers. Will be RT if the task got
    838  * RT-boosted. If not then it returns p->normal_prio.
    839  */
    840 static int effective_prio(struct task_struct *p)
    841 {
    842 	p->normal_prio = normal_prio(p);
    843 	/*
    844 	 * If we are RT tasks or we were boosted to RT priority,
    845 	 * keep the priority unchanged. Otherwise, update priority
    846 	 * to the normal priority:
    847 	 */
    848 	if (!rt_prio(p->prio))
    849 		return p->normal_prio;
    850 	return p->prio;
    851 }
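         /*
          * Worked examples of the resulting values: a deadline task gets
          * prio = MAX_DL_PRIO - 1 = -1, a FIFO/RR task with rt_priority 50
          * gets 99 - 50 = 49, and a nice-0 SCHED_OTHER task keeps its
          * static_prio of 120. Lower numbers mean higher effective priority
          * throughout the scheduler.
          */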
    852 
    853 /**
    854  * task_curr - is this task currently executing on a CPU?
    855  * @p: the task in question.
    856  *
    857  * Return: 1 if the task is currently executing. 0 otherwise.
    858  */
    859 inline int task_curr(const struct task_struct *p)
    860 {
    861 	return cpu_curr(task_cpu(p)) == p;
    862 }
    863 
    864 /*
    865  * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
    866  * use the balance_callback list if you want balancing.
    867  *
    868  * this means any call to check_class_changed() must be followed by a call to
    869  * balance_callback().
    870  */
    871 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
    872 				       const struct sched_class *prev_class,
    873 				       int oldprio)
    874 {
    875 	if (prev_class != p->sched_class) {
    876 		if (prev_class->switched_from)
    877 			prev_class->switched_from(rq, p);
    878 
    879 		p->sched_class->switched_to(rq, p);
    880 	} else if (oldprio != p->prio || dl_task(p))
    881 		p->sched_class->prio_changed(rq, p, oldprio);
    882 }
    883 
    884 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
    885 {
    886 	const struct sched_class *class;
    887 
    888 	if (p->sched_class == rq->curr->sched_class) {
    889 		rq->curr->sched_class->check_preempt_curr(rq, p, flags);
    890 	} else {
    891 		for_each_class(class) {
    892 			if (class == rq->curr->sched_class)
    893 				break;
    894 			if (class == p->sched_class) {
    895 				resched_curr(rq);
    896 				break;
    897 			}
    898 		}
    899 	}
    900 
    901 	/*
    902 	 * A queue event has occurred, and we're going to schedule.  In
    903 	 * this case, we can save a useless back to back clock update.
    904 	 */
    905 	if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
    906 		rq_clock_skip_update(rq);
    907 }
    908 
    909 #ifdef CONFIG_SMP
    910 
    911 static inline bool is_per_cpu_kthread(struct task_struct *p)
    912 {
    913 	if (!(p->flags & PF_KTHREAD))
    914 		return false;
    915 
    916 	if (p->nr_cpus_allowed != 1)
    917 		return false;
    918 
    919 	return true;
    920 }
    921 
    922 /*
     923  * Per-CPU kthreads are allowed to run on !active && online CPUs, see
    924  * __set_cpus_allowed_ptr() and select_fallback_rq().
    925  */
    926 static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
    927 {
    928 	if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
    929 		return false;
    930 
    931 	if (is_per_cpu_kthread(p))
    932 		return cpu_online(cpu);
    933 
    934 	return cpu_active(cpu);
    935 }
    936 
    937 /*
    938  * This is how migration works:
    939  *
    940  * 1) we invoke migration_cpu_stop() on the target CPU using
    941  *    stop_one_cpu().
    942  * 2) stopper starts to run (implicitly forcing the migrated thread
    943  *    off the CPU)
    944  * 3) it checks whether the migrated task is still in the wrong runqueue.
    945  * 4) if it's in the wrong runqueue then the migration thread removes
    946  *    it and puts it into the right queue.
    947  * 5) stopper completes and stop_one_cpu() returns and the migration
    948  *    is done.
    949  */
    950 
    951 /*
    952  * move_queued_task - move a queued task to new rq.
    953  *
    954  * Returns (locked) new rq. Old rq's lock is released.
    955  */
    956 static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
    957 				   struct task_struct *p, int new_cpu)
    958 {
    959 	lockdep_assert_held(&rq->lock);
    960 
    961 	WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
    962 	dequeue_task(rq, p, DEQUEUE_NOCLOCK);
    963 	set_task_cpu(p, new_cpu);
    964 	rq_unlock(rq, rf);
    965 
    966 	rq = cpu_rq(new_cpu);
    967 
    968 	rq_lock(rq, rf);
    969 	BUG_ON(task_cpu(p) != new_cpu);
    970 	enqueue_task(rq, p, 0);
    971 	p->on_rq = TASK_ON_RQ_QUEUED;
    972 	check_preempt_curr(rq, p, 0);
    973 
    974 	return rq;
    975 }
    976 
    977 struct migration_arg {
    978 	struct task_struct *task;
    979 	int dest_cpu;
    980 };
    981 
    982 /*
     983  * Move a (non-current) task off this CPU, onto the destination CPU. We're
     984  * doing this because either it can't run here any more (set_cpus_allowed()
     985  * moved it away from this CPU, or the CPU is going down), or because we're
     986  * attempting to rebalance this task on exec (sched_exec).
    987  *
    988  * So we race with normal scheduler movements, but that's OK, as long
    989  * as the task is no longer on this CPU.
    990  */
    991 static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
    992 				 struct task_struct *p, int dest_cpu)
    993 {
    994 	/* Affinity changed (again). */
    995 	if (!is_cpu_allowed(p, dest_cpu))
    996 		return rq;
    997 
    998 	update_rq_clock(rq);
    999 	rq = move_queued_task(rq, rf, p, dest_cpu);
   1000 
   1001 	return rq;
   1002 }
   1003 
   1004 /*
   1005  * migration_cpu_stop - this will be executed by a highprio stopper thread
    1006  * and performs thread migration by bumping the thread off its CPU and
    1007  * then 'pushing' it onto another runqueue.
   1008  */
   1009 static int migration_cpu_stop(void *data)
   1010 {
   1011 	struct migration_arg *arg = data;
   1012 	struct task_struct *p = arg->task;
   1013 	struct rq *rq = this_rq();
   1014 	struct rq_flags rf;
   1015 
   1016 	/*
   1017 	 * The original target CPU might have gone down and we might
   1018 	 * be on another CPU but it doesn't matter.
   1019 	 */
   1020 	local_irq_disable();
   1021 	/*
   1022 	 * We need to explicitly wake pending tasks before running
   1023 	 * __migrate_task() such that we will not miss enforcing cpus_allowed
   1024 	 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
   1025 	 */
   1026 	sched_ttwu_pending();
   1027 
   1028 	raw_spin_lock(&p->pi_lock);
   1029 	rq_lock(rq, &rf);
   1030 	/*
   1031 	 * If task_rq(p) != rq, it cannot be migrated here, because we're
   1032 	 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
   1033 	 * we're holding p->pi_lock.
   1034 	 */
   1035 	if (task_rq(p) == rq) {
   1036 		if (task_on_rq_queued(p))
   1037 			rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
   1038 		else
   1039 			p->wake_cpu = arg->dest_cpu;
   1040 	}
   1041 	rq_unlock(rq, &rf);
   1042 	raw_spin_unlock(&p->pi_lock);
   1043 
   1044 	local_irq_enable();
   1045 	return 0;
   1046 }
   1047 
   1048 /*
   1049  * sched_class::set_cpus_allowed must do the below, but is not required to
   1050  * actually call this function.
   1051  */
   1052 void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
   1053 {
   1054 	cpumask_copy(&p->cpus_allowed, new_mask);
   1055 	p->nr_cpus_allowed = cpumask_weight(new_mask);
   1056 }
   1057 
   1058 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
   1059 {
   1060 	struct rq *rq = task_rq(p);
   1061 	bool queued, running;
   1062 
   1063 	lockdep_assert_held(&p->pi_lock);
   1064 
   1065 	queued = task_on_rq_queued(p);
   1066 	running = task_current(rq, p);
   1067 
   1068 	if (queued) {
   1069 		/*
   1070 		 * Because __kthread_bind() calls this on blocked tasks without
   1071 		 * holding rq->lock.
   1072 		 */
   1073 		lockdep_assert_held(&rq->lock);
   1074 		dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
   1075 	}
   1076 	if (running)
   1077 		put_prev_task(rq, p);
   1078 
   1079 	p->sched_class->set_cpus_allowed(p, new_mask);
   1080 
   1081 	if (queued)
   1082 		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
   1083 	if (running)
   1084 		set_curr_task(rq, p);
   1085 }
   1086 
   1087 /*
   1088  * Change a given task's CPU affinity. Migrate the thread to a
   1089  * proper CPU and schedule it away if the CPU it's executing on
   1090  * is removed from the allowed bitmask.
   1091  *
   1092  * NOTE: the caller must have a valid reference to the task, the
   1093  * task must not exit() & deallocate itself prematurely. The
   1094  * call is not atomic; no spinlocks may be held.
   1095  */
   1096 static int __set_cpus_allowed_ptr(struct task_struct *p,
   1097 				  const struct cpumask *new_mask, bool check)
   1098 {
   1099 	const struct cpumask *cpu_valid_mask = cpu_active_mask;
   1100 	unsigned int dest_cpu;
   1101 	struct rq_flags rf;
   1102 	struct rq *rq;
   1103 	int ret = 0;
   1104 
   1105 	rq = task_rq_lock(p, &rf);
   1106 	update_rq_clock(rq);
   1107 
   1108 	if (p->flags & PF_KTHREAD) {
   1109 		/*
   1110 		 * Kernel threads are allowed on online && !active CPUs
   1111 		 */
   1112 		cpu_valid_mask = cpu_online_mask;
   1113 	}
   1114 
   1115 	/*
   1116 	 * Must re-check here, to close a race against __kthread_bind(),
   1117 	 * sched_setaffinity() is not guaranteed to observe the flag.
   1118 	 */
   1119 	if (check && (p->flags & PF_NO_SETAFFINITY)) {
   1120 		ret = -EINVAL;
   1121 		goto out;
   1122 	}
   1123 
   1124 	if (cpumask_equal(&p->cpus_allowed, new_mask))
   1125 		goto out;
   1126 
   1127 	if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
   1128 		ret = -EINVAL;
   1129 		goto out;
   1130 	}
   1131 
   1132 	do_set_cpus_allowed(p, new_mask);
   1133 
   1134 	if (p->flags & PF_KTHREAD) {
   1135 		/*
   1136 		 * For kernel threads that do indeed end up on online &&
   1137 		 * !active we want to ensure they are strict per-CPU threads.
    1138 		 * !active CPUs we want to ensure they are strict per-CPU threads.
   1139 		WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
   1140 			!cpumask_intersects(new_mask, cpu_active_mask) &&
   1141 			p->nr_cpus_allowed != 1);
   1142 	}
   1143 
   1144 	/* Can the task run on the task's current CPU? If so, we're done */
   1145 	if (cpumask_test_cpu(task_cpu(p), new_mask))
   1146 		goto out;
   1147 
   1148 	dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
   1149 	if (task_running(rq, p) || p->state == TASK_WAKING) {
   1150 		struct migration_arg arg = { p, dest_cpu };
   1151 		/* Need help from migration thread: drop lock and wait. */
   1152 		task_rq_unlock(rq, p, &rf);
   1153 		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
   1154 		tlb_migrate_finish(p->mm);
   1155 		return 0;
   1156 	} else if (task_on_rq_queued(p)) {
   1157 		/*
   1158 		 * OK, since we're going to drop the lock immediately
   1159 		 * afterwards anyway.
   1160 		 */
   1161 		rq = move_queued_task(rq, &rf, p, dest_cpu);
   1162 	}
   1163 out:
   1164 	task_rq_unlock(rq, p, &rf);
   1165 
   1166 	return ret;
   1167 }
   1168 
   1169 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
   1170 {
   1171 	return __set_cpus_allowed_ptr(p, new_mask, false);
   1172 }
   1173 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
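         /*
          * Typical external usage, as a sketch (not a call site in this file;
          * 'tsk' is a placeholder task):
          *
          *	int ret = set_cpus_allowed_ptr(tsk, cpumask_of(2));
          *	if (ret)
          *		pr_warn("could not pin to CPU2: %d\n", ret);
          *
          * Kernel threads that must never have their affinity changed from
          * userspace should use kthread_bind() instead, which also sets
          * PF_NO_SETAFFINITY.
          */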
   1174 
   1175 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
   1176 {
   1177 #ifdef CONFIG_SCHED_DEBUG
   1178 	/*
   1179 	 * We should never call set_task_cpu() on a blocked task,
   1180 	 * ttwu() will sort out the placement.
   1181 	 */
   1182 	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
   1183 			!p->on_rq);
   1184 
   1185 	/*
   1186 	 * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
   1187 	 * because schedstat_wait_{start,end} rebase migrating task's wait_start
   1188 	 * time relying on p->on_rq.
   1189 	 */
   1190 	WARN_ON_ONCE(p->state == TASK_RUNNING &&
   1191 		     p->sched_class == &fair_sched_class &&
   1192 		     (p->on_rq && !task_on_rq_migrating(p)));
   1193 
   1194 #ifdef CONFIG_LOCKDEP
   1195 	/*
   1196 	 * The caller should hold either p->pi_lock or rq->lock, when changing
   1197 	 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
   1198 	 *
   1199 	 * sched_move_task() holds both and thus holding either pins the cgroup,
   1200 	 * see task_group().
   1201 	 *
   1202 	 * Furthermore, all task_rq users should acquire both locks, see
   1203 	 * task_rq_lock().
   1204 	 */
   1205 	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
   1206 				      lockdep_is_held(&task_rq(p)->lock)));
   1207 #endif
   1208 	/*
   1209 	 * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
   1210 	 */
   1211 	WARN_ON_ONCE(!cpu_online(new_cpu));
   1212 #endif
   1213 
   1214 	trace_sched_migrate_task(p, new_cpu);
   1215 
   1216 	if (task_cpu(p) != new_cpu) {
   1217 		if (p->sched_class->migrate_task_rq)
   1218 			p->sched_class->migrate_task_rq(p, new_cpu);
   1219 		p->se.nr_migrations++;
   1220 		rseq_migrate(p);
   1221 		perf_event_task_migrate(p);
   1222 	}
   1223 
   1224 	__set_task_cpu(p, new_cpu);
   1225 }
   1226 
   1227 #ifdef CONFIG_NUMA_BALANCING
   1228 static void __migrate_swap_task(struct task_struct *p, int cpu)
   1229 {
   1230 	if (task_on_rq_queued(p)) {
   1231 		struct rq *src_rq, *dst_rq;
   1232 		struct rq_flags srf, drf;
   1233 
   1234 		src_rq = task_rq(p);
   1235 		dst_rq = cpu_rq(cpu);
   1236 
   1237 		rq_pin_lock(src_rq, &srf);
   1238 		rq_pin_lock(dst_rq, &drf);
   1239 
   1240 		p->on_rq = TASK_ON_RQ_MIGRATING;
   1241 		deactivate_task(src_rq, p, 0);
   1242 		set_task_cpu(p, cpu);
   1243 		activate_task(dst_rq, p, 0);
   1244 		p->on_rq = TASK_ON_RQ_QUEUED;
   1245 		check_preempt_curr(dst_rq, p, 0);
   1246 
   1247 		rq_unpin_lock(dst_rq, &drf);
   1248 		rq_unpin_lock(src_rq, &srf);
   1249 
   1250 	} else {
   1251 		/*
   1252 		 * Task isn't running anymore; make it appear like we migrated
   1253 		 * it before it went to sleep. This means on wakeup we make the
   1254 		 * previous CPU our target instead of where it really is.
   1255 		 */
   1256 		p->wake_cpu = cpu;
   1257 	}
   1258 }
   1259 
   1260 struct migration_swap_arg {
   1261 	struct task_struct *src_task, *dst_task;
   1262 	int src_cpu, dst_cpu;
   1263 };
   1264 
   1265 static int migrate_swap_stop(void *data)
   1266 {
   1267 	struct migration_swap_arg *arg = data;
   1268 	struct rq *src_rq, *dst_rq;
   1269 	int ret = -EAGAIN;
   1270 
   1271 	if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
   1272 		return -EAGAIN;
   1273 
   1274 	src_rq = cpu_rq(arg->src_cpu);
   1275 	dst_rq = cpu_rq(arg->dst_cpu);
   1276 
   1277 	double_raw_lock(&arg->src_task->pi_lock,
   1278 			&arg->dst_task->pi_lock);
   1279 	double_rq_lock(src_rq, dst_rq);
   1280 
   1281 	if (task_cpu(arg->dst_task) != arg->dst_cpu)
   1282 		goto unlock;
   1283 
   1284 	if (task_cpu(arg->src_task) != arg->src_cpu)
   1285 		goto unlock;
   1286 
   1287 	if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed))
   1288 		goto unlock;
   1289 
   1290 	if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed))
   1291 		goto unlock;
   1292 
   1293 	__migrate_swap_task(arg->src_task, arg->dst_cpu);
   1294 	__migrate_swap_task(arg->dst_task, arg->src_cpu);
   1295 
   1296 	ret = 0;
   1297 
   1298 unlock:
   1299 	double_rq_unlock(src_rq, dst_rq);
   1300 	raw_spin_unlock(&arg->dst_task->pi_lock);
   1301 	raw_spin_unlock(&arg->src_task->pi_lock);
   1302 
   1303 	return ret;
   1304 }
   1305 
   1306 /*
   1307  * Cross migrate two tasks
   1308  */
   1309 int migrate_swap(struct task_struct *cur, struct task_struct *p,
   1310 		int target_cpu, int curr_cpu)
   1311 {
   1312 	struct migration_swap_arg arg;
   1313 	int ret = -EINVAL;
   1314 
   1315 	arg = (struct migration_swap_arg){
   1316 		.src_task = cur,
   1317 		.src_cpu = curr_cpu,
   1318 		.dst_task = p,
   1319 		.dst_cpu = target_cpu,
   1320 	};
   1321 
   1322 	if (arg.src_cpu == arg.dst_cpu)
   1323 		goto out;
   1324 
   1325 	/*
   1326 	 * These three tests are all lockless; this is OK since all of them
   1327 	 * will be re-checked with proper locks held further down the line.
   1328 	 */
   1329 	if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
   1330 		goto out;
   1331 
   1332 	if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed))
   1333 		goto out;
   1334 
   1335 	if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed))
   1336 		goto out;
   1337 
   1338 	trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
   1339 	ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
   1340 
   1341 out:
   1342 	return ret;
   1343 }
   1344 #endif /* CONFIG_NUMA_BALANCING */
   1345 
   1346 /*
   1347  * wait_task_inactive - wait for a thread to unschedule.
   1348  *
   1349  * If @match_state is nonzero, it's the @p->state value just checked and
   1350  * not expected to change.  If it changes, i.e. @p might have woken up,
   1351  * then return zero.  When we succeed in waiting for @p to be off its CPU,
   1352  * we return a positive number (its total switch count).  If a second call
   1353  * a short while later returns the same number, the caller can be sure that
   1354  * @p has remained unscheduled the whole time.
   1355  *
   1356  * The caller must ensure that the task *will* unschedule sometime soon,
   1357  * else this function might spin for a *long* time. This function can't
   1358  * be called with interrupts off, or it may introduce deadlock with
   1359  * smp_call_function() if an IPI is sent by the same process we are
   1360  * waiting to become inactive.
   1361  */
   1362 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
   1363 {
   1364 	int running, queued;
   1365 	struct rq_flags rf;
   1366 	unsigned long ncsw;
   1367 	struct rq *rq;
   1368 
   1369 	for (;;) {
   1370 		/*
   1371 		 * We do the initial early heuristics without holding
   1372 		 * any task-queue locks at all. We'll only try to get
   1373 		 * the runqueue lock when things look like they will
   1374 		 * work out!
   1375 		 */
   1376 		rq = task_rq(p);
   1377 
   1378 		/*
   1379 		 * If the task is actively running on another CPU
   1380 		 * still, just relax and busy-wait without holding
   1381 		 * any locks.
   1382 		 *
   1383 		 * NOTE! Since we don't hold any locks, it's not
   1384 		 * even sure that "rq" stays as the right runqueue!
   1385 		 * But we don't care, since "task_running()" will
   1386 		 * return false if the runqueue has changed and p
   1387 		 * is actually now running somewhere else!
   1388 		 */
   1389 		while (task_running(rq, p)) {
   1390 			if (match_state && unlikely(p->state != match_state))
   1391 				return 0;
   1392 			cpu_relax();
   1393 		}
   1394 
   1395 		/*
   1396 		 * Ok, time to look more closely! We need the rq
   1397 		 * lock now, to be *sure*. If we're wrong, we'll
   1398 		 * just go back and repeat.
   1399 		 */
   1400 		rq = task_rq_lock(p, &rf);
   1401 		trace_sched_wait_task(p);
   1402 		running = task_running(rq, p);
   1403 		queued = task_on_rq_queued(p);
   1404 		ncsw = 0;
   1405 		if (!match_state || p->state == match_state)
   1406 			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
   1407 		task_rq_unlock(rq, p, &rf);
   1408 
   1409 		/*
   1410 		 * If it changed from the expected state, bail out now.
   1411 		 */
   1412 		if (unlikely(!ncsw))
   1413 			break;
   1414 
   1415 		/*
   1416 		 * Was it really running after all now that we
   1417 		 * checked with the proper locks actually held?
   1418 		 *
   1419 		 * Oops. Go back and try again..
   1420 		 */
   1421 		if (unlikely(running)) {
   1422 			cpu_relax();
   1423 			continue;
   1424 		}
   1425 
   1426 		/*
   1427 		 * It's not enough that it's not actively running,
   1428 		 * it must be off the runqueue _entirely_, and not
   1429 		 * preempted!
   1430 		 *
   1431 		 * So if it was still runnable (but just not actively
   1432 		 * running right now), it's preempted, and we should
   1433 		 * yield - it could be a while.
   1434 		 */
   1435 		if (unlikely(queued)) {
   1436 			ktime_t to = NSEC_PER_SEC / HZ;
   1437 
   1438 			set_current_state(TASK_UNINTERRUPTIBLE);
   1439 			schedule_hrtimeout(&to, HRTIMER_MODE_REL);
   1440 			continue;
   1441 		}
   1442 
   1443 		/*
   1444 		 * Ahh, all good. It wasn't running, and it wasn't
   1445 		 * runnable, which means that it will never become
   1446 		 * running in the future either. We're all done!
   1447 		 */
   1448 		break;
   1449 	}
   1450 
   1451 	return ncsw;
   1452 }
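         /*
          * The calling convention described above, as a sketch ('match_state'
          * stands for whatever state the caller just observed):
          *
          *	unsigned long ncsw = wait_task_inactive(p, match_state);
          *	if (!ncsw)
          *		return -ESRCH;		// @p woke up / changed state
          *	...
          *	if (wait_task_inactive(p, match_state) != ncsw)
          *		goto retry;		// @p ran in between
          */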
   1453 
   1454 /***
   1455  * kick_process - kick a running thread to enter/exit the kernel
   1456  * @p: the to-be-kicked thread
   1457  *
   1458  * Cause a process which is running on another CPU to enter
   1459  * kernel-mode, without any delay. (to get signals handled.)
   1460  *
   1461  * NOTE: this function doesn't have to take the runqueue lock,
   1462  * because all it wants to ensure is that the remote task enters
   1463  * the kernel. If the IPI races and the task has been migrated
   1464  * to another CPU then no harm is done and the purpose has been
   1465  * achieved as well.
   1466  */
   1467 void kick_process(struct task_struct *p)
   1468 {
   1469 	int cpu;
   1470 
   1471 	preempt_disable();
   1472 	cpu = task_cpu(p);
   1473 	if ((cpu != smp_processor_id()) && task_curr(p))
   1474 		smp_send_reschedule(cpu);
   1475 	preempt_enable();
   1476 }
   1477 EXPORT_SYMBOL_GPL(kick_process);
   1478 
   1479 /*
   1480  * ->cpus_allowed is protected by both rq->lock and p->pi_lock
   1481  *
   1482  * A few notes on cpu_active vs cpu_online:
   1483  *
   1484  *  - cpu_active must be a subset of cpu_online
   1485  *
   1486  *  - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
   1487  *    see __set_cpus_allowed_ptr(). At this point the newly online
   1488  *    CPU isn't yet part of the sched domains, and balancing will not
   1489  *    see it.
   1490  *
   1491  *  - on CPU-down we clear cpu_active() to mask the sched domains and
   1492  *    avoid the load balancer to place new tasks on the to be removed
   1493  *    CPU. Existing tasks will remain running there and will be taken
   1494  *    off.
   1495  *
   1496  * This means that fallback selection must not select !active CPUs.
   1497  * And can assume that any active CPU must be online. Conversely
   1498  * select_task_rq() below may allow selection of !active CPUs in order
   1499  * to satisfy the above rules.
   1500  */
   1501 static int select_fallback_rq(int cpu, struct task_struct *p)
   1502 {
   1503 	int nid = cpu_to_node(cpu);
   1504 	const struct cpumask *nodemask = NULL;
   1505 	enum { cpuset, possible, fail } state = cpuset;
   1506 	int dest_cpu;
   1507 
   1508 	/*
   1509 	 * If the node that the CPU is on has been offlined, cpu_to_node()
   1510 	 * will return -1. There is no CPU on the node, and we should
   1511 	 * select the CPU on the other node.
   1512 	 */
   1513 	if (nid != -1) {
   1514 		nodemask = cpumask_of_node(nid);
   1515 
   1516 		/* Look for allowed, online CPU in same node. */
   1517 		for_each_cpu(dest_cpu, nodemask) {
   1518 			if (!cpu_active(dest_cpu))
   1519 				continue;
   1520 			if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
   1521 				return dest_cpu;
   1522 		}
   1523 	}
   1524 
   1525 	for (;;) {
   1526 		/* Any allowed, online CPU? */
   1527 		for_each_cpu(dest_cpu, &p->cpus_allowed) {
   1528 			if (!is_cpu_allowed(p, dest_cpu))
   1529 				continue;
   1530 
   1531 			goto out;
   1532 		}
   1533 
   1534 		/* No more Mr. Nice Guy. */
   1535 		switch (state) {
   1536 		case cpuset:
   1537 			if (IS_ENABLED(CONFIG_CPUSETS)) {
   1538 				cpuset_cpus_allowed_fallback(p);
   1539 				state = possible;
   1540 				break;
   1541 			}
   1542 			/* Fall-through */
   1543 		case possible:
   1544 			do_set_cpus_allowed(p, cpu_possible_mask);
   1545 			state = fail;
   1546 			break;
   1547 
   1548 		case fail:
   1549 			BUG();
   1550 			break;
   1551 		}
   1552 	}
   1553 
   1554 out:
   1555 	if (state != cpuset) {
   1556 		/*
   1557 		 * Don't tell them about moving exiting tasks or
   1558 		 * kernel threads (both mm NULL), since they never
   1559 		 * leave kernel.
   1560 		 */
   1561 		if (p->mm && printk_ratelimit()) {
   1562 			printk_deferred("process %d (%s) no longer affine to cpu%d\n",
   1563 					task_pid_nr(p), p->comm, cpu);
   1564 		}
   1565 	}
   1566 
   1567 	return dest_cpu;
   1568 }
   1569 
   1570 /*
   1571  * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
   1572  */
   1573 static inline
   1574 int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
   1575 {
   1576 	lockdep_assert_held(&p->pi_lock);
   1577 
   1578 	if (p->nr_cpus_allowed > 1)
   1579 		cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
   1580 	else
   1581 		cpu = cpumask_any(&p->cpus_allowed);
   1582 
   1583 	/*
   1584 	 * In order not to call set_task_cpu() on a blocking task we need
   1585 	 * to rely on ttwu() to place the task on a valid ->cpus_allowed
   1586 	 * CPU.
   1587 	 *
   1588 	 * Since this is common to all placement strategies, this lives here.
   1589 	 *
   1590 	 * [ this allows ->select_task() to simply return task_cpu(p) and
   1591 	 *   not worry about this generic constraint ]
   1592 	 */
   1593 	if (unlikely(!is_cpu_allowed(p, cpu)))
   1594 		cpu = select_fallback_rq(task_cpu(p), p);
   1595 
   1596 	return cpu;
   1597 }
   1598 
   1599 static void update_avg(u64 *avg, u64 sample)
   1600 {
   1601 	s64 diff = sample - *avg;
   1602 	*avg += diff >> 3;
   1603 }
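         /*
          * An exponentially weighted moving average with a 1/8 weight for the
          * new sample: avg' = avg + (sample - avg)/8. E.g. avg = 1000 and
          * sample = 2000 gives avg' = 1125. Used below to track rq->avg_idle.
          */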
   1604 
   1605 void sched_set_stop_task(int cpu, struct task_struct *stop)
   1606 {
   1607 	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
   1608 	struct task_struct *old_stop = cpu_rq(cpu)->stop;
   1609 
   1610 	if (stop) {
   1611 		/*
    1612 		 * Make it appear like a SCHED_FIFO task; it's something
   1613 		 * userspace knows about and won't get confused about.
   1614 		 *
   1615 		 * Also, it will make PI more or less work without too
   1616 		 * much confusion -- but then, stop work should not
   1617 		 * rely on PI working anyway.
   1618 		 */
   1619 		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
   1620 
   1621 		stop->sched_class = &stop_sched_class;
   1622 	}
   1623 
   1624 	cpu_rq(cpu)->stop = stop;
   1625 
   1626 	if (old_stop) {
   1627 		/*
   1628 		 * Reset it back to a normal scheduling class so that
   1629 		 * it can die in pieces.
   1630 		 */
   1631 		old_stop->sched_class = &rt_sched_class;
   1632 	}
   1633 }
   1634 
   1635 #else
   1636 
   1637 static inline int __set_cpus_allowed_ptr(struct task_struct *p,
   1638 					 const struct cpumask *new_mask, bool check)
   1639 {
   1640 	return set_cpus_allowed_ptr(p, new_mask);
   1641 }
   1642 
   1643 #endif /* CONFIG_SMP */
   1644 
   1645 static void
   1646 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
   1647 {
   1648 	struct rq *rq;
   1649 
   1650 	if (!schedstat_enabled())
   1651 		return;
   1652 
   1653 	rq = this_rq();
   1654 
   1655 #ifdef CONFIG_SMP
   1656 	if (cpu == rq->cpu) {
   1657 		__schedstat_inc(rq->ttwu_local);
   1658 		__schedstat_inc(p->se.statistics.nr_wakeups_local);
   1659 	} else {
   1660 		struct sched_domain *sd;
   1661 
   1662 		__schedstat_inc(p->se.statistics.nr_wakeups_remote);
   1663 		rcu_read_lock();
   1664 		for_each_domain(rq->cpu, sd) {
   1665 			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
   1666 				__schedstat_inc(sd->ttwu_wake_remote);
   1667 				break;
   1668 			}
   1669 		}
   1670 		rcu_read_unlock();
   1671 	}
   1672 
   1673 	if (wake_flags & WF_MIGRATED)
   1674 		__schedstat_inc(p->se.statistics.nr_wakeups_migrate);
   1675 #endif /* CONFIG_SMP */
   1676 
   1677 	__schedstat_inc(rq->ttwu_count);
   1678 	__schedstat_inc(p->se.statistics.nr_wakeups);
   1679 
   1680 	if (wake_flags & WF_SYNC)
   1681 		__schedstat_inc(p->se.statistics.nr_wakeups_sync);
   1682 }
   1683 
   1684 static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
   1685 {
   1686 	activate_task(rq, p, en_flags);
   1687 	p->on_rq = TASK_ON_RQ_QUEUED;
   1688 
   1689 	/* If a worker is waking up, notify the workqueue: */
   1690 	if (p->flags & PF_WQ_WORKER)
   1691 		wq_worker_waking_up(p, cpu_of(rq));
   1692 }
   1693 
   1694 /*
   1695  * Mark the task runnable and perform wakeup-preemption.
   1696  */
   1697 static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
   1698 			   struct rq_flags *rf)
   1699 {
   1700 	check_preempt_curr(rq, p, wake_flags);
   1701 	p->state = TASK_RUNNING;
   1702 	trace_sched_wakeup(p);
   1703 
   1704 #ifdef CONFIG_SMP
   1705 	if (p->sched_class->task_woken) {
   1706 		/*
    1707 		 * Our task @p is fully woken up and running, so it's safe to
    1708 		 * drop the rq->lock; hereafter rq is only used for statistics.
   1709 		 */
   1710 		rq_unpin_lock(rq, rf);
   1711 		p->sched_class->task_woken(rq, p);
   1712 		rq_repin_lock(rq, rf);
   1713 	}
   1714 
   1715 	if (rq->idle_stamp) {
   1716 		u64 delta = rq_clock(rq) - rq->idle_stamp;
   1717 		u64 max = 2*rq->max_idle_balance_cost;
   1718 
   1719 		update_avg(&rq->avg_idle, delta);
   1720 
   1721 		if (rq->avg_idle > max)
   1722 			rq->avg_idle = max;
   1723 
   1724 		rq->idle_stamp = 0;
   1725 	}
   1726 #endif
   1727 }
   1728 
   1729 static void
   1730 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
   1731 		 struct rq_flags *rf)
   1732 {
   1733 	int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
   1734 
   1735 	lockdep_assert_held(&rq->lock);
   1736 
   1737 #ifdef CONFIG_SMP
   1738 	if (p->sched_contributes_to_load)
   1739 		rq->nr_uninterruptible--;
   1740 
   1741 	if (wake_flags & WF_MIGRATED)
   1742 		en_flags |= ENQUEUE_MIGRATED;
   1743 #endif
   1744 
   1745 	ttwu_activate(rq, p, en_flags);
   1746 	ttwu_do_wakeup(rq, p, wake_flags, rf);
   1747 }
   1748 
   1749 /*
    1750  * Called in case the task @p isn't fully descheduled from its runqueue;
    1751  * in this case we must do a remote wakeup. It's a 'light' wakeup though,
    1752  * since all we need to do is flip p->state to TASK_RUNNING -- the task
    1753  * is still ->on_rq.
   1754  */
   1755 static int ttwu_remote(struct task_struct *p, int wake_flags)
   1756 {
   1757 	struct rq_flags rf;
   1758 	struct rq *rq;
   1759 	int ret = 0;
   1760 
   1761 	rq = __task_rq_lock(p, &rf);
   1762 	if (task_on_rq_queued(p)) {
   1763 		/* check_preempt_curr() may use rq clock */
   1764 		update_rq_clock(rq);
   1765 		ttwu_do_wakeup(rq, p, wake_flags, &rf);
   1766 		ret = 1;
   1767 	}
   1768 	__task_rq_unlock(rq, &rf);
   1769 
   1770 	return ret;
   1771 }
   1772 
   1773 #ifdef CONFIG_SMP
   1774 void sched_ttwu_pending(void)
   1775 {
   1776 	struct rq *rq = this_rq();
   1777 	struct llist_node *llist = llist_del_all(&rq->wake_list);
   1778 	struct task_struct *p, *t;
   1779 	struct rq_flags rf;
   1780 
   1781 	if (!llist)
   1782 		return;
   1783 
   1784 	rq_lock_irqsave(rq, &rf);
   1785 	update_rq_clock(rq);
   1786 
   1787 	llist_for_each_entry_safe(p, t, llist, wake_entry)
   1788 		ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
   1789 
   1790 	rq_unlock_irqrestore(rq, &rf);
   1791 }
   1792 
   1793 void scheduler_ipi(void)
   1794 {
   1795 	/*
   1796 	 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
   1797 	 * TIF_NEED_RESCHED remotely (for the first time) will also send
   1798 	 * this IPI.
   1799 	 */
   1800 	preempt_fold_need_resched();
   1801 
   1802 	if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
   1803 		return;
   1804 
   1805 	/*
   1806 	 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
   1807 	 * traditionally all their work was done from the interrupt return
   1808 	 * path. Now that we actually do some work, we need to make sure
   1809 	 * we do call them.
   1810 	 *
   1811 	 * Some archs already do call them, luckily irq_enter/exit nest
   1812 	 * properly.
   1813 	 *
   1814 	 * Arguably we should visit all archs and update all handlers,
    1815 	 * however a fair share of IPIs are still resched-only, so this would
   1816 	 * somewhat pessimize the simple resched case.
   1817 	 */
   1818 	irq_enter();
   1819 	sched_ttwu_pending();
   1820 
   1821 	/*
   1822 	 * Check if someone kicked us for doing the nohz idle load balance.
   1823 	 */
   1824 	if (unlikely(got_nohz_idle_kick())) {
   1825 		this_rq()->idle_balance = 1;
   1826 		raise_softirq_irqoff(SCHED_SOFTIRQ);
   1827 	}
   1828 	irq_exit();
   1829 }
   1830 
   1831 static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
   1832 {
   1833 	struct rq *rq = cpu_rq(cpu);
   1834 
   1835 	p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
   1836 
   1837 	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
   1838 		if (!set_nr_if_polling(rq->idle))
   1839 			smp_send_reschedule(cpu);
   1840 		else
   1841 			trace_sched_wake_idle_without_ipi(cpu);
   1842 	}
   1843 }
   1844 
   1845 void wake_up_if_idle(int cpu)
   1846 {
   1847 	struct rq *rq = cpu_rq(cpu);
   1848 	struct rq_flags rf;
   1849 
   1850 	rcu_read_lock();
   1851 
   1852 	if (!is_idle_task(rcu_dereference(rq->curr)))
   1853 		goto out;
   1854 
   1855 	if (set_nr_if_polling(rq->idle)) {
   1856 		trace_sched_wake_idle_without_ipi(cpu);
   1857 	} else {
   1858 		rq_lock_irqsave(rq, &rf);
   1859 		if (is_idle_task(rq->curr))
   1860 			smp_send_reschedule(cpu);
   1861 		/* Else CPU is not idle, do nothing here: */
   1862 		rq_unlock_irqrestore(rq, &rf);
   1863 	}
   1864 
   1865 out:
   1866 	rcu_read_unlock();
   1867 }
   1868 
   1869 bool cpus_share_cache(int this_cpu, int that_cpu)
   1870 {
   1871 	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
   1872 }
   1873 #endif /* CONFIG_SMP */
   1874 
   1875 static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
   1876 {
   1877 	struct rq *rq = cpu_rq(cpu);
   1878 	struct rq_flags rf;
   1879 
   1880 #if defined(CONFIG_SMP)
   1881 	if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
   1882 		sched_clock_cpu(cpu); /* Sync clocks across CPUs */
   1883 		ttwu_queue_remote(p, cpu, wake_flags);
   1884 		return;
   1885 	}
   1886 #endif
   1887 
   1888 	rq_lock(rq, &rf);
   1889 	update_rq_clock(rq);
   1890 	ttwu_do_activate(rq, p, wake_flags, &rf);
   1891 	rq_unlock(rq, &rf);
   1892 }
   1893 
   1894 /*
   1895  * Notes on Program-Order guarantees on SMP systems.
   1896  *
   1897  *  MIGRATION
   1898  *
   1899  * The basic program-order guarantee on SMP systems is that when a task [t]
   1900  * migrates, all its activity on its old CPU [c0] happens-before any subsequent
   1901  * execution on its new CPU [c1].
   1902  *
   1903  * For migration (of runnable tasks) this is provided by the following means:
   1904  *
   1905  *  A) UNLOCK of the rq(c0)->lock scheduling out task t
   1906  *  B) migration for t is required to synchronize *both* rq(c0)->lock and
   1907  *     rq(c1)->lock (if not at the same time, then in that order).
   1908  *  C) LOCK of the rq(c1)->lock scheduling in task
   1909  *
   1910  * Release/acquire chaining guarantees that B happens after A and C after B.
   1911  * Note: the CPU doing B need not be c0 or c1
   1912  *
   1913  * Example:
   1914  *
   1915  *   CPU0            CPU1            CPU2
   1916  *
   1917  *   LOCK rq(0)->lock
   1918  *   sched-out X
   1919  *   sched-in Y
   1920  *   UNLOCK rq(0)->lock
   1921  *
   1922  *                                   LOCK rq(0)->lock // orders against CPU0
   1923  *                                   dequeue X
   1924  *                                   UNLOCK rq(0)->lock
   1925  *
   1926  *                                   LOCK rq(1)->lock
   1927  *                                   enqueue X
   1928  *                                   UNLOCK rq(1)->lock
   1929  *
   1930  *                   LOCK rq(1)->lock // orders against CPU2
   1931  *                   sched-out Z
   1932  *                   sched-in X
   1933  *                   UNLOCK rq(1)->lock
   1934  *
   1935  *
   1936  *  BLOCKING -- aka. SLEEP + WAKEUP
   1937  *
   1938  * For blocking we (obviously) need to provide the same guarantee as for
   1939  * migration. However the means are completely different as there is no lock
   1940  * chain to provide order. Instead we do:
   1941  *
   1942  *   1) smp_store_release(X->on_cpu, 0)
   1943  *   2) smp_cond_load_acquire(!X->on_cpu)
   1944  *
   1945  * Example:
   1946  *
   1947  *   CPU0 (schedule)  CPU1 (try_to_wake_up) CPU2 (schedule)
   1948  *
   1949  *   LOCK rq(0)->lock LOCK X->pi_lock
   1950  *   dequeue X
   1951  *   sched-out X
   1952  *   smp_store_release(X->on_cpu, 0);
   1953  *
   1954  *                    smp_cond_load_acquire(&X->on_cpu, !VAL);
   1955  *                    X->state = WAKING
   1956  *                    set_task_cpu(X,2)
   1957  *
   1958  *                    LOCK rq(2)->lock
   1959  *                    enqueue X
   1960  *                    X->state = RUNNING
   1961  *                    UNLOCK rq(2)->lock
   1962  *
   1963  *                                          LOCK rq(2)->lock // orders against CPU1
   1964  *                                          sched-out Z
   1965  *                                          sched-in X
   1966  *                                          UNLOCK rq(2)->lock
   1967  *
   1968  *                    UNLOCK X->pi_lock
   1969  *   UNLOCK rq(0)->lock
   1970  *
   1971  *
   1972  * However, for wakeups there is a second guarantee we must provide, namely we
   1973  * must ensure that CONDITION=1 done by the caller can not be reordered with
   1974  * accesses to the task state; see try_to_wake_up() and set_current_state().
   1975  */
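
         /*
          * Illustrative pairing for that second guarantee (not part of the
          * scheduler itself; CONDITION stands for whatever flag the waiter
          * polls on):
          *
          *   Waiter:                                 Waker:
          *
          *     for (;;) {
          *             set_current_state(TASK_UNINTERRUPTIBLE);
          *             if (CONDITION)                  CONDITION = 1;
          *                     break;                  wake_up_process(waiter);
          *             schedule();
          *     }
          *     __set_current_state(TASK_RUNNING);
          *
          * The full barrier try_to_wake_up() executes before reading ->state,
          * together with the barrier in set_current_state(), is what prevents
          * the CONDITION store from being reordered with the ->state accesses.
          */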
   1976 
   1977 /**
   1978  * try_to_wake_up - wake up a thread
   1979  * @p: the thread to be awakened
   1980  * @state: the mask of task states that can be woken
   1981  * @wake_flags: wake modifier flags (WF_*)
   1982  *
   1983  * If (@state & @p->state) @p->state = TASK_RUNNING.
   1984  *
   1985  * If the task was not queued/runnable, also place it back on a runqueue.
   1986  *
   1987  * Atomic against schedule() which would dequeue a task, also see
   1988  * set_current_state().
   1989  *
   1990  * This function executes a full memory barrier before accessing the task
   1991  * state; see set_current_state().
   1992  *
   1993  * Return: %true if @p->state changes (an actual wakeup was done),
   1994  *	   %false otherwise.
   1995  */
   1996 static int
   1997 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
   1998 {
   1999 	unsigned long flags;
   2000 	int cpu, success = 0;
   2001 
   2002 	/*
   2003 	 * If we are going to wake up a thread waiting for CONDITION we
   2004 	 * need to ensure that CONDITION=1 done by the caller can not be
    2005 	 * reordered with the p->state check below. This pairs with the mb() in
    2006 	 * set_current_state() done by the waiting thread.
   2007 	 */
   2008 	raw_spin_lock_irqsave(&p->pi_lock, flags);
   2009 	smp_mb__after_spinlock();
   2010 	if (!(p->state & state))
   2011 		goto out;
   2012 
   2013 	trace_sched_waking(p);
   2014 
   2015 	/* We're going to change ->state: */
   2016 	success = 1;
   2017 	cpu = task_cpu(p);
   2018 
   2019 	/*
   2020 	 * Ensure we load p->on_rq _after_ p->state, otherwise it would
   2021 	 * be possible to, falsely, observe p->on_rq == 0 and get stuck
   2022 	 * in smp_cond_load_acquire() below.
   2023 	 *
   2024 	 * sched_ttwu_pending()			try_to_wake_up()
   2025 	 *   STORE p->on_rq = 1			  LOAD p->state
   2026 	 *   UNLOCK rq->lock
   2027 	 *
   2028 	 * __schedule() (switch to task 'p')
   2029 	 *   LOCK rq->lock			  smp_rmb();
   2030 	 *   smp_mb__after_spinlock();
   2031 	 *   UNLOCK rq->lock
   2032 	 *
   2033 	 * [task p]
   2034 	 *   STORE p->state = UNINTERRUPTIBLE	  LOAD p->on_rq
   2035 	 *
   2036 	 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
   2037 	 * __schedule().  See the comment for smp_mb__after_spinlock().
   2038 	 */
   2039 	smp_rmb();
   2040 	if (p->on_rq && ttwu_remote(p, wake_flags))
   2041 		goto stat;
   2042 
   2043 #ifdef CONFIG_SMP
   2044 	/*
   2045 	 * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
   2046 	 * possible to, falsely, observe p->on_cpu == 0.
   2047 	 *
   2048 	 * One must be running (->on_cpu == 1) in order to remove oneself
   2049 	 * from the runqueue.
   2050 	 *
   2051 	 * __schedule() (switch to task 'p')	try_to_wake_up()
   2052 	 *   STORE p->on_cpu = 1		  LOAD p->on_rq
   2053 	 *   UNLOCK rq->lock
   2054 	 *
   2055 	 * __schedule() (put 'p' to sleep)
   2056 	 *   LOCK rq->lock			  smp_rmb();
   2057 	 *   smp_mb__after_spinlock();
   2058 	 *   STORE p->on_rq = 0			  LOAD p->on_cpu
   2059 	 *
   2060 	 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
   2061 	 * __schedule().  See the comment for smp_mb__after_spinlock().
   2062 	 */
   2063 	smp_rmb();
   2064 
   2065 	/*
   2066 	 * If the owning (remote) CPU is still in the middle of schedule() with
    2067 	 * this task as prev, wait until it's done referencing the task.
   2068 	 *
   2069 	 * Pairs with the smp_store_release() in finish_task().
   2070 	 *
   2071 	 * This ensures that tasks getting woken will be fully ordered against
   2072 	 * their previous state and preserve Program Order.
   2073 	 */
   2074 	smp_cond_load_acquire(&p->on_cpu, !VAL);
   2075 
   2076 	p->sched_contributes_to_load = !!task_contributes_to_load(p);
   2077 	p->state = TASK_WAKING;
   2078 
   2079 	if (p->in_iowait) {
   2080 		delayacct_blkio_end(p);
   2081 		atomic_dec(&task_rq(p)->nr_iowait);
   2082 	}
   2083 
   2084 	cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
   2085 	if (task_cpu(p) != cpu) {
   2086 		wake_flags |= WF_MIGRATED;
   2087 		psi_ttwu_dequeue(p);
   2088 		set_task_cpu(p, cpu);
   2089 	}
   2090 
   2091 #else /* CONFIG_SMP */
   2092 
   2093 	if (p->in_iowait) {
   2094 		delayacct_blkio_end(p);
   2095 		atomic_dec(&task_rq(p)->nr_iowait);
   2096 	}
   2097 
   2098 #endif /* CONFIG_SMP */
   2099 
   2100 	ttwu_queue(p, cpu, wake_flags);
   2101 stat:
   2102 	ttwu_stat(p, cpu, wake_flags);
   2103 out:
   2104 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
   2105 
   2106 	return success;
   2107 }
   2108 
   2109 /**
   2110  * try_to_wake_up_local - try to wake up a local task with rq lock held
   2111  * @p: the thread to be awakened
    2112  * @rf: runqueue flags, used for rq lock pinning
   2113  *
   2114  * Put @p on the run-queue if it's not already there. The caller must
   2115  * ensure that this_rq() is locked, @p is bound to this_rq() and not
   2116  * the current task.
   2117  */
   2118 static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
   2119 {
   2120 	struct rq *rq = task_rq(p);
   2121 
   2122 	if (WARN_ON_ONCE(rq != this_rq()) ||
   2123 	    WARN_ON_ONCE(p == current))
   2124 		return;
   2125 
   2126 	lockdep_assert_held(&rq->lock);
   2127 
   2128 	if (!raw_spin_trylock(&p->pi_lock)) {
   2129 		/*
    2130 		 * This is OK because current is on_cpu, which avoids it being
    2131 		 * picked for load-balance; preemption/IRQs are still disabled,
    2132 		 * avoiding further scheduler activity on it; and we've not yet
    2133 		 * picked a replacement task.
   2134 		 */
   2135 		rq_unlock(rq, rf);
   2136 		raw_spin_lock(&p->pi_lock);
   2137 		rq_relock(rq, rf);
   2138 	}
   2139 
   2140 	if (!(p->state & TASK_NORMAL))
   2141 		goto out;
   2142 
   2143 	trace_sched_waking(p);
   2144 
   2145 	if (!task_on_rq_queued(p)) {
   2146 		if (p->in_iowait) {
   2147 			delayacct_blkio_end(p);
   2148 			atomic_dec(&rq->nr_iowait);
   2149 		}
   2150 		ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
   2151 	}
   2152 
   2153 	ttwu_do_wakeup(rq, p, 0, rf);
   2154 	ttwu_stat(p, smp_processor_id(), 0);
   2155 out:
   2156 	raw_spin_unlock(&p->pi_lock);
   2157 }
   2158 
   2159 /**
   2160  * wake_up_process - Wake up a specific process
   2161  * @p: The process to be woken up.
   2162  *
   2163  * Attempt to wake up the nominated process and move it to the set of runnable
   2164  * processes.
   2165  *
   2166  * Return: 1 if the process was woken up, 0 if it was already running.
   2167  *
   2168  * This function executes a full memory barrier before accessing the task state.
   2169  */
   2170 int wake_up_process(struct task_struct *p)
   2171 {
   2172 	return try_to_wake_up(p, TASK_NORMAL, 0);
   2173 }
   2174 EXPORT_SYMBOL(wake_up_process);
   2175 
   2176 int wake_up_state(struct task_struct *p, unsigned int state)
   2177 {
   2178 	return try_to_wake_up(p, state, 0);
   2179 }
   2180 
   2181 /*
   2182  * Perform scheduler related setup for a newly forked process p.
   2183  * p is forked by current.
   2184  *
   2185  * __sched_fork() is basic setup used by init_idle() too:
   2186  */
   2187 static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
   2188 {
   2189 	p->on_rq			= 0;
   2190 
   2191 	p->se.on_rq			= 0;
   2192 	p->se.exec_start		= 0;
   2193 	p->se.sum_exec_runtime		= 0;
   2194 	p->se.prev_sum_exec_runtime	= 0;
   2195 	p->se.nr_migrations		= 0;
   2196 	p->se.vruntime			= 0;
   2197 	INIT_LIST_HEAD(&p->se.group_node);
   2198 
   2199 #ifdef CONFIG_FAIR_GROUP_SCHED
   2200 	p->se.cfs_rq			= NULL;
   2201 #endif
   2202 
   2203 #ifdef CONFIG_SCHEDSTATS
   2204 	/* Even if schedstat is disabled, there should not be garbage */
   2205 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
   2206 #endif
   2207 
   2208 	RB_CLEAR_NODE(&p->dl.rb_node);
   2209 	init_dl_task_timer(&p->dl);
   2210 	init_dl_inactive_task_timer(&p->dl);
   2211 	__dl_clear_params(p);
   2212 
   2213 	INIT_LIST_HEAD(&p->rt.run_list);
   2214 	p->rt.timeout		= 0;
   2215 	p->rt.time_slice	= sched_rr_timeslice;
   2216 	p->rt.on_rq		= 0;
   2217 	p->rt.on_list		= 0;
   2218 
   2219 #ifdef CONFIG_PREEMPT_NOTIFIERS
   2220 	INIT_HLIST_HEAD(&p->preempt_notifiers);
   2221 #endif
   2222 
   2223 #ifdef CONFIG_COMPACTION
   2224 	p->capture_control = NULL;
   2225 #endif
   2226 	init_numa_balancing(clone_flags, p);
   2227 }
   2228 
   2229 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
   2230 
   2231 #ifdef CONFIG_NUMA_BALANCING
   2232 
   2233 void set_numabalancing_state(bool enabled)
   2234 {
   2235 	if (enabled)
   2236 		static_branch_enable(&sched_numa_balancing);
   2237 	else
   2238 		static_branch_disable(&sched_numa_balancing);
   2239 }
   2240 
   2241 #ifdef CONFIG_PROC_SYSCTL
   2242 int sysctl_numa_balancing(struct ctl_table *table, int write,
   2243 			 void __user *buffer, size_t *lenp, loff_t *ppos)
   2244 {
   2245 	struct ctl_table t;
   2246 	int err;
   2247 	int state = static_branch_likely(&sched_numa_balancing);
   2248 
   2249 	if (write && !capable(CAP_SYS_ADMIN))
   2250 		return -EPERM;
   2251 
   2252 	t = *table;
   2253 	t.data = &state;
   2254 	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
   2255 	if (err < 0)
   2256 		return err;
   2257 	if (write)
   2258 		set_numabalancing_state(state);
   2259 	return err;
   2260 }
   2261 #endif
   2262 #endif
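
         /*
          * Example (illustrative, assuming CONFIG_NUMA_BALANCING and
          * CONFIG_PROC_SYSCTL are enabled): NUMA balancing can be toggled at
          * runtime with
          *
          *   sysctl kernel.numa_balancing=1
          *
          * which lands in sysctl_numa_balancing() above and flips the
          * sched_numa_balancing static key via set_numabalancing_state().
          */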
   2263 
   2264 #ifdef CONFIG_SCHEDSTATS
   2265 
   2266 DEFINE_STATIC_KEY_FALSE(sched_schedstats);
   2267 static bool __initdata __sched_schedstats = false;
   2268 
   2269 static void set_schedstats(bool enabled)
   2270 {
   2271 	if (enabled)
   2272 		static_branch_enable(&sched_schedstats);
   2273 	else
   2274 		static_branch_disable(&sched_schedstats);
   2275 }
   2276 
   2277 void force_schedstat_enabled(void)
   2278 {
   2279 	if (!schedstat_enabled()) {
   2280 		pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
   2281 		static_branch_enable(&sched_schedstats);
   2282 	}
   2283 }
   2284 
   2285 static int __init setup_schedstats(char *str)
   2286 {
   2287 	int ret = 0;
   2288 	if (!str)
   2289 		goto out;
   2290 
   2291 	/*
   2292 	 * This code is called before jump labels have been set up, so we can't
   2293 	 * change the static branch directly just yet.  Instead set a temporary
   2294 	 * variable so init_schedstats() can do it later.
   2295 	 */
   2296 	if (!strcmp(str, "enable")) {
   2297 		__sched_schedstats = true;
   2298 		ret = 1;
   2299 	} else if (!strcmp(str, "disable")) {
   2300 		__sched_schedstats = false;
   2301 		ret = 1;
   2302 	}
   2303 out:
   2304 	if (!ret)
   2305 		pr_warn("Unable to parse schedstats=\n");
   2306 
   2307 	return ret;
   2308 }
   2309 __setup("schedstats=", setup_schedstats);
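
         /*
          * Example (illustrative): schedstats can be enabled either at boot,
          *
          *   schedstats=enable          (kernel command line, parsed above)
          *
          * or at runtime via the sysctl handled below,
          *
          *   sysctl kernel.sched_schedstats=1
          *
          * both of which end up flipping the sched_schedstats static key.
          */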
   2310 
   2311 static void __init init_schedstats(void)
   2312 {
   2313 	set_schedstats(__sched_schedstats);
   2314 }
   2315 
   2316 #ifdef CONFIG_PROC_SYSCTL
   2317 int sysctl_schedstats(struct ctl_table *table, int write,
   2318 			 void __user *buffer, size_t *lenp, loff_t *ppos)
   2319 {
   2320 	struct ctl_table t;
   2321 	int err;
   2322 	int state = static_branch_likely(&sched_schedstats);
   2323 
   2324 	if (write && !capable(CAP_SYS_ADMIN))
   2325 		return -EPERM;
   2326 
   2327 	t = *table;
   2328 	t.data = &state;
   2329 	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
   2330 	if (err < 0)
   2331 		return err;
   2332 	if (write)
   2333 		set_schedstats(state);
   2334 	return err;
   2335 }
   2336 #endif /* CONFIG_PROC_SYSCTL */
   2337 #else  /* !CONFIG_SCHEDSTATS */
   2338 static inline void init_schedstats(void) {}
   2339 #endif /* CONFIG_SCHEDSTATS */
   2340 
   2341 /*
   2342  * fork()/clone()-time setup:
   2343  */
   2344 int sched_fork(unsigned long clone_flags, struct task_struct *p)
   2345 {
   2346 	unsigned long flags;
   2347 
   2348 	__sched_fork(clone_flags, p);
   2349 	/*
   2350 	 * We mark the process as NEW here. This guarantees that
   2351 	 * nobody will actually run it, and a signal or other external
   2352 	 * event cannot wake it up and insert it on the runqueue either.
   2353 	 */
   2354 	p->state = TASK_NEW;
   2355 
   2356 	/*
   2357 	 * Make sure we do not leak PI boosting priority to the child.
   2358 	 */
   2359 	p->prio = current->normal_prio;
   2360 
   2361 	/*
   2362 	 * Revert to default priority/policy on fork if requested.
   2363 	 */
   2364 	if (unlikely(p->sched_reset_on_fork)) {
   2365 		if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
   2366 			p->policy = SCHED_NORMAL;
   2367 			p->static_prio = NICE_TO_PRIO(0);
   2368 			p->rt_priority = 0;
   2369 		} else if (PRIO_TO_NICE(p->static_prio) < 0)
   2370 			p->static_prio = NICE_TO_PRIO(0);
   2371 
   2372 		p->prio = p->normal_prio = __normal_prio(p);
   2373 		set_load_weight(p, false);
   2374 
   2375 		/*
   2376 		 * We don't need the reset flag anymore after the fork. It has
   2377 		 * fulfilled its duty:
   2378 		 */
   2379 		p->sched_reset_on_fork = 0;
   2380 	}
   2381 
   2382 	if (dl_prio(p->prio))
   2383 		return -EAGAIN;
   2384 	else if (rt_prio(p->prio))
   2385 		p->sched_class = &rt_sched_class;
   2386 	else
   2387 		p->sched_class = &fair_sched_class;
   2388 
   2389 	init_entity_runnable_average(&p->se);
   2390 
   2391 	/*
    2392 	 * The child is not yet in the pid-hash so no cgroup attach races,
    2393 	 * and the cgroup is pinned to this child because cgroup_fork()
    2394 	 * runs before sched_fork().
   2395 	 *
   2396 	 * Silence PROVE_RCU.
   2397 	 */
   2398 	raw_spin_lock_irqsave(&p->pi_lock, flags);
   2399 	/*
    2400 	 * We're setting the CPU for the first time and we don't migrate,
   2401 	 * so use __set_task_cpu().
   2402 	 */
   2403 	__set_task_cpu(p, smp_processor_id());
   2404 	if (p->sched_class->task_fork)
   2405 		p->sched_class->task_fork(p);
   2406 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
   2407 
   2408 #ifdef CONFIG_SCHED_INFO
   2409 	if (likely(sched_info_on()))
   2410 		memset(&p->sched_info, 0, sizeof(p->sched_info));
   2411 #endif
   2412 #if defined(CONFIG_SMP)
   2413 	p->on_cpu = 0;
   2414 #endif
   2415 	init_task_preempt_count(p);
   2416 #ifdef CONFIG_SMP
   2417 	plist_node_init(&p->pushable_tasks, MAX_PRIO);
   2418 	RB_CLEAR_NODE(&p->pushable_dl_tasks);
   2419 #endif
   2420 	return 0;
   2421 }
   2422 
   2423 unsigned long to_ratio(u64 period, u64 runtime)
   2424 {
   2425 	if (runtime == RUNTIME_INF)
   2426 		return BW_UNIT;
   2427 
   2428 	/*
   2429 	 * Doing this here saves a lot of checks in all
   2430 	 * the calling paths, and returning zero seems
   2431 	 * safe for them anyway.
   2432 	 */
   2433 	if (period == 0)
   2434 		return 0;
   2435 
   2436 	return div64_u64(runtime << BW_SHIFT, period);
   2437 }
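
         /*
          * Worked example (assuming BW_SHIFT == 20, i.e. BW_UNIT == 1 << 20, as
          * defined in sched.h): a runtime of 950000us over a period of
          * 1000000us (the usual 95% rt bandwidth default) maps to
          *
          *   div64_u64(950000 << 20, 1000000) == 996147 ~= 0.95 * BW_UNIT
          *
          * so callers get the utilization as a fixed-point fraction of BW_UNIT.
          */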
   2438 
   2439 /*
   2440  * wake_up_new_task - wake up a newly created task for the first time.
   2441  *
   2442  * This function will do some initial scheduler statistics housekeeping
   2443  * that must be done for every newly created context, then puts the task
   2444  * on the runqueue and wakes it.
   2445  */
   2446 void wake_up_new_task(struct task_struct *p)
   2447 {
   2448 	struct rq_flags rf;
   2449 	struct rq *rq;
   2450 
   2451 	raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
   2452 	p->state = TASK_RUNNING;
   2453 #ifdef CONFIG_SMP
   2454 	/*
   2455 	 * Fork balancing, do it here and not earlier because:
   2456 	 *  - cpus_allowed can change in the fork path
   2457 	 *  - any previously selected CPU might disappear through hotplug
   2458 	 *
   2459 	 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
   2460 	 * as we're not fully set-up yet.
   2461 	 */
   2462 	p->recent_used_cpu = task_cpu(p);
   2463 	__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
   2464 #endif
   2465 	rq = __task_rq_lock(p, &rf);
   2466 	update_rq_clock(rq);
   2467 	post_init_entity_util_avg(p);
   2468 
   2469 	activate_task(rq, p, ENQUEUE_NOCLOCK);
   2470 	p->on_rq = TASK_ON_RQ_QUEUED;
   2471 	trace_sched_wakeup_new(p);
   2472 	check_preempt_curr(rq, p, WF_FORK);
   2473 #ifdef CONFIG_SMP
   2474 	if (p->sched_class->task_woken) {
   2475 		/*
    2476 		 * Nothing relies on rq->lock after this, so it's fine to
   2477 		 * drop it.
   2478 		 */
   2479 		rq_unpin_lock(rq, &rf);
   2480 		p->sched_class->task_woken(rq, p);
   2481 		rq_repin_lock(rq, &rf);
   2482 	}
   2483 #endif
   2484 	task_rq_unlock(rq, p, &rf);
   2485 }
   2486 
   2487 #ifdef CONFIG_PREEMPT_NOTIFIERS
   2488 
   2489 static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
   2490 
   2491 void preempt_notifier_inc(void)
   2492 {
   2493 	static_branch_inc(&preempt_notifier_key);
   2494 }
   2495 EXPORT_SYMBOL_GPL(preempt_notifier_inc);
   2496 
   2497 void preempt_notifier_dec(void)
   2498 {
   2499 	static_branch_dec(&preempt_notifier_key);
   2500 }
   2501 EXPORT_SYMBOL_GPL(preempt_notifier_dec);
   2502 
   2503 /**
   2504  * preempt_notifier_register - tell me when current is being preempted & rescheduled
   2505  * @notifier: notifier struct to register
   2506  */
   2507 void preempt_notifier_register(struct preempt_notifier *notifier)
   2508 {
   2509 	if (!static_branch_unlikely(&preempt_notifier_key))
   2510 		WARN(1, "registering preempt_notifier while notifiers disabled\n");
   2511 
   2512 	hlist_add_head(&notifier->link, &current->preempt_notifiers);
   2513 }
   2514 EXPORT_SYMBOL_GPL(preempt_notifier_register);
   2515 
   2516 /**
   2517  * preempt_notifier_unregister - no longer interested in preemption notifications
   2518  * @notifier: notifier struct to unregister
   2519  *
   2520  * This is *not* safe to call from within a preemption notifier.
   2521  */
   2522 void preempt_notifier_unregister(struct preempt_notifier *notifier)
   2523 {
   2524 	hlist_del(&notifier->link);
   2525 }
   2526 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
   2527 
   2528 static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
   2529 {
   2530 	struct preempt_notifier *notifier;
   2531 
   2532 	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
   2533 		notifier->ops->sched_in(notifier, raw_smp_processor_id());
   2534 }
   2535 
   2536 static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
   2537 {
   2538 	if (static_branch_unlikely(&preempt_notifier_key))
   2539 		__fire_sched_in_preempt_notifiers(curr);
   2540 }
   2541 
   2542 static void
   2543 __fire_sched_out_preempt_notifiers(struct task_struct *curr,
   2544 				   struct task_struct *next)
   2545 {
   2546 	struct preempt_notifier *notifier;
   2547 
   2548 	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
   2549 		notifier->ops->sched_out(notifier, next);
   2550 }
   2551 
   2552 static __always_inline void
   2553 fire_sched_out_preempt_notifiers(struct task_struct *curr,
   2554 				 struct task_struct *next)
   2555 {
   2556 	if (static_branch_unlikely(&preempt_notifier_key))
   2557 		__fire_sched_out_preempt_notifiers(curr, next);
   2558 }
   2559 
   2560 #else /* !CONFIG_PREEMPT_NOTIFIERS */
   2561 
   2562 static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
   2563 {
   2564 }
   2565 
   2566 static inline void
   2567 fire_sched_out_preempt_notifiers(struct task_struct *curr,
   2568 				 struct task_struct *next)
   2569 {
   2570 }
   2571 
   2572 #endif /* CONFIG_PREEMPT_NOTIFIERS */
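
         /*
          * Illustrative sketch of the preempt-notifier API above (the callbacks
          * and names are hypothetical, not from this file); a user such as a
          * hypervisor would do roughly:
          *
          *   static void my_sched_in(struct preempt_notifier *pn, int cpu) { ... }
          *   static void my_sched_out(struct preempt_notifier *pn,
          *                            struct task_struct *next) { ... }
          *
          *   static struct preempt_ops my_ops = {
          *           .sched_in  = my_sched_in,
          *           .sched_out = my_sched_out,
          *   };
          *   static struct preempt_notifier my_notifier;
          *
          *   preempt_notifier_inc();
          *   preempt_notifier_init(&my_notifier, &my_ops);
          *   preempt_notifier_register(&my_notifier);   (registers on current)
          *   ...
          *   preempt_notifier_unregister(&my_notifier);
          *   preempt_notifier_dec();
          */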
   2573 
   2574 static inline void prepare_task(struct task_struct *next)
   2575 {
   2576 #ifdef CONFIG_SMP
   2577 	/*
    2578 	 * Claim the task as running; we do this before switching to it
    2579 	 * so that any running task will have this set.
   2580 	 */
   2581 	next->on_cpu = 1;
   2582 #endif
   2583 }
   2584 
   2585 static inline void finish_task(struct task_struct *prev)
   2586 {
   2587 #ifdef CONFIG_SMP
   2588 	/*
   2589 	 * After ->on_cpu is cleared, the task can be moved to a different CPU.
   2590 	 * We must ensure this doesn't happen until the switch is completely
   2591 	 * finished.
   2592 	 *
   2593 	 * In particular, the load of prev->state in finish_task_switch() must
   2594 	 * happen before this.
   2595 	 *
   2596 	 * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
   2597 	 */
   2598 	smp_store_release(&prev->on_cpu, 0);
   2599 #endif
   2600 }
   2601 
   2602 static inline void
   2603 prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf)
   2604 {
   2605 	/*
    2606 	 * The runqueue lock will be released by the next
    2607 	 * task (which is an invalid locking op but in the case
    2608 	 * of the scheduler it's an obvious special case), so we
    2609 	 * do an early lockdep release here:
   2610 	 */
   2611 	rq_unpin_lock(rq, rf);
   2612 	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
   2613 #ifdef CONFIG_DEBUG_SPINLOCK
   2614 	/* this is a valid case when another task releases the spinlock */
   2615 	rq->lock.owner = next;
   2616 #endif
   2617 }
   2618 
   2619 static inline void finish_lock_switch(struct rq *rq)
   2620 {
   2621 	/*
   2622 	 * If we are tracking spinlock dependencies then we have to
   2623 	 * fix up the runqueue lock - which gets 'carried over' from
   2624 	 * prev into current:
   2625 	 */
   2626 	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
   2627 	raw_spin_unlock_irq(&rq->lock);
   2628 }
   2629 
   2630 /*
   2631  * NOP if the arch has not defined these:
   2632  */
   2633 
   2634 #ifndef prepare_arch_switch
   2635 # define prepare_arch_switch(next)	do { } while (0)
   2636 #endif
   2637 
   2638 #ifndef finish_arch_post_lock_switch
   2639 # define finish_arch_post_lock_switch()	do { } while (0)
   2640 #endif
   2641 
   2642 /**
   2643  * prepare_task_switch - prepare to switch tasks
   2644  * @rq: the runqueue preparing to switch
   2645  * @prev: the current task that is being switched out
   2646  * @next: the task we are going to switch to.
   2647  *
   2648  * This is called with the rq lock held and interrupts off. It must
   2649  * be paired with a subsequent finish_task_switch after the context
   2650  * switch.
   2651  *
   2652  * prepare_task_switch sets up locking and calls architecture specific
   2653  * hooks.
   2654  */
   2655 static inline void
   2656 prepare_task_switch(struct rq *rq, struct task_struct *prev,
   2657 		    struct task_struct *next)
   2658 {
   2659 	kcov_prepare_switch(prev);
   2660 	sched_info_switch(rq, prev, next);
   2661 	perf_event_task_sched_out(prev, next);
   2662 	rseq_preempt(prev);
   2663 	fire_sched_out_preempt_notifiers(prev, next);
   2664 	prepare_task(next);
   2665 	prepare_arch_switch(next);
   2666 }
   2667 
   2668 /**
   2669  * finish_task_switch - clean up after a task-switch
   2670  * @prev: the thread we just switched away from.
   2671  *
   2672  * finish_task_switch must be called after the context switch, paired
   2673  * with a prepare_task_switch call before the context switch.
   2674  * finish_task_switch will reconcile locking set up by prepare_task_switch,
   2675  * and do any other architecture-specific cleanup actions.
   2676  *
   2677  * Note that we may have delayed dropping an mm in context_switch(). If
   2678  * so, we finish that here outside of the runqueue lock. (Doing it
   2679  * with the lock held can cause deadlocks; see schedule() for
   2680  * details.)
   2681  *
    2682  * The context switch has flipped the stack from under us and restored the
   2683  * local variables which were saved when this task called schedule() in the
   2684  * past. prev == current is still correct but we need to recalculate this_rq
   2685  * because prev may have moved to another CPU.
   2686  */
   2687 static struct rq *finish_task_switch(struct task_struct *prev)
   2688 	__releases(rq->lock)
   2689 {
   2690 	struct rq *rq = this_rq();
   2691 	struct mm_struct *mm = rq->prev_mm;
   2692 	long prev_state;
   2693 
   2694 	/*
   2695 	 * The previous task will have left us with a preempt_count of 2
   2696 	 * because it left us after:
   2697 	 *
   2698 	 *	schedule()
   2699 	 *	  preempt_disable();			// 1
   2700 	 *	  __schedule()
   2701 	 *	    raw_spin_lock_irq(&rq->lock)	// 2
   2702 	 *
   2703 	 * Also, see FORK_PREEMPT_COUNT.
   2704 	 */
   2705 	if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
   2706 		      "corrupted preempt_count: %s/%d/0x%x\n",
   2707 		      current->comm, current->pid, preempt_count()))
   2708 		preempt_count_set(FORK_PREEMPT_COUNT);
   2709 
   2710 	rq->prev_mm = NULL;
   2711 
   2712 	/*
   2713 	 * A task struct has one reference for the use as "current".
   2714 	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
   2715 	 * schedule one last time. The schedule call will never return, and
   2716 	 * the scheduled task must drop that reference.
   2717 	 *
   2718 	 * We must observe prev->state before clearing prev->on_cpu (in
   2719 	 * finish_task), otherwise a concurrent wakeup can get prev
    2720 	 * running on another CPU and we could race with its RUNNING -> DEAD
   2721 	 * transition, resulting in a double drop.
   2722 	 */
   2723 	prev_state = prev->state;
   2724 	vtime_task_switch(prev);
   2725 	perf_event_task_sched_in(prev, current);
   2726 	finish_task(prev);
   2727 	finish_lock_switch(rq);
   2728 	finish_arch_post_lock_switch();
   2729 	kcov_finish_switch(current);
   2730 
   2731 	fire_sched_in_preempt_notifiers(current);
   2732 	/*
   2733 	 * When switching through a kernel thread, the loop in
   2734 	 * membarrier_{private,global}_expedited() may have observed that
   2735 	 * kernel thread and not issued an IPI. It is therefore possible to
    2736 	 * schedule between user->kernel->user threads without passing through
   2737 	 * switch_mm(). Membarrier requires a barrier after storing to
   2738 	 * rq->curr, before returning to userspace, so provide them here:
   2739 	 *
   2740 	 * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
   2741 	 *   provided by mmdrop(),
   2742 	 * - a sync_core for SYNC_CORE.
   2743 	 */
   2744 	if (mm) {
   2745 		membarrier_mm_sync_core_before_usermode(mm);
   2746 		mmdrop(mm);
   2747 	}
   2748 	if (unlikely(prev_state == TASK_DEAD)) {
   2749 		if (prev->sched_class->task_dead)
   2750 			prev->sched_class->task_dead(prev);
   2751 
   2752 		/*
   2753 		 * Remove function-return probe instances associated with this
   2754 		 * task and put them back on the free list.
   2755 		 */
   2756 		kprobe_flush_task(prev);
   2757 
   2758 		/* Task is done with its stack. */
   2759 		put_task_stack(prev);
   2760 
   2761 		put_task_struct(prev);
   2762 	}
   2763 
   2764 	tick_nohz_task_switch();
   2765 	return rq;
   2766 }
   2767 
   2768 #ifdef CONFIG_SMP
   2769 
   2770 /* rq->lock is NOT held, but preemption is disabled */
   2771 static void __balance_callback(struct rq *rq)
   2772 {
   2773 	struct callback_head *head, *next;
   2774 	void (*func)(struct rq *rq);
   2775 	unsigned long flags;
   2776 
   2777 	raw_spin_lock_irqsave(&rq->lock, flags);
   2778 	head = rq->balance_callback;
   2779 	rq->balance_callback = NULL;
   2780 	while (head) {
   2781 		func = (void (*)(struct rq *))head->func;
   2782 		next = head->next;
   2783 		head->next = NULL;
   2784 		head = next;
   2785 
   2786 		func(rq);
   2787 	}
   2788 	raw_spin_unlock_irqrestore(&rq->lock, flags);
   2789 }
   2790 
   2791 static inline void balance_callback(struct rq *rq)
   2792 {
   2793 	if (unlikely(rq->balance_callback))
   2794 		__balance_callback(rq);
   2795 }
   2796 
   2797 #else
   2798 
   2799 static inline void balance_callback(struct rq *rq)
   2800 {
   2801 }
   2802 
   2803 #endif
   2804 
   2805 /**
   2806  * schedule_tail - first thing a freshly forked thread must call.
   2807  * @prev: the thread we just switched away from.
   2808  */
   2809 asmlinkage __visible void schedule_tail(struct task_struct *prev)
   2810 	__releases(rq->lock)
   2811 {
   2812 	struct rq *rq;
   2813 
   2814 	/*
   2815 	 * New tasks start with FORK_PREEMPT_COUNT, see there and
   2816 	 * finish_task_switch() for details.
   2817 	 *
    2818 	 * finish_task_switch() will drop rq->lock and lower preempt_count
   2819 	 * and the preempt_enable() will end up enabling preemption (on
   2820 	 * PREEMPT_COUNT kernels).
   2821 	 */
   2822 
   2823 	rq = finish_task_switch(prev);
   2824 	balance_callback(rq);
   2825 	preempt_enable();
   2826 
   2827 	if (current->set_child_tid)
   2828 		put_user(task_pid_vnr(current), current->set_child_tid);
   2829 
   2830 	calculate_sigpending();
   2831 }
   2832 
   2833 /*
   2834  * context_switch - switch to the new MM and the new thread's register state.
   2835  */
   2836 static __always_inline struct rq *
   2837 context_switch(struct rq *rq, struct task_struct *prev,
   2838 	       struct task_struct *next, struct rq_flags *rf)
   2839 {
   2840 	struct mm_struct *mm, *oldmm;
   2841 
   2842 	prepare_task_switch(rq, prev, next);
   2843 
   2844 	mm = next->mm;
   2845 	oldmm = prev->active_mm;
   2846 	/*
   2847 	 * For paravirt, this is coupled with an exit in switch_to to
   2848 	 * combine the page table reload and the switch backend into
   2849 	 * one hypercall.
   2850 	 */
   2851 	arch_start_context_switch(prev);
   2852 
   2853 	/*
   2854 	 * If mm is non-NULL, we pass through switch_mm(). If mm is
   2855 	 * NULL, we will pass through mmdrop() in finish_task_switch().
   2856 	 * Both of these contain the full memory barrier required by
   2857 	 * membarrier after storing to rq->curr, before returning to
   2858 	 * user-space.
   2859 	 */
   2860 	if (!mm) {
   2861 		next->active_mm = oldmm;
   2862 		mmgrab(oldmm);
   2863 		enter_lazy_tlb(oldmm, next);
   2864 	} else
   2865 		switch_mm_irqs_off(oldmm, mm, next);
   2866 
   2867 	if (!prev->mm) {
   2868 		prev->active_mm = NULL;
   2869 		rq->prev_mm = oldmm;
   2870 	}
   2871 
   2872 	rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
   2873 
   2874 	prepare_lock_switch(rq, next, rf);
   2875 
   2876 	/* Here we just switch the register state and the stack. */
   2877 	switch_to(prev, next, prev);
   2878 	barrier();
   2879 
   2880 	return finish_task_switch(prev);
   2881 }
   2882 
   2883 /*
   2884  * nr_running and nr_context_switches:
   2885  *
   2886  * externally visible scheduler statistics: current number of runnable
   2887  * threads, total number of context switches performed since bootup.
   2888  */
   2889 unsigned long nr_running(void)
   2890 {
   2891 	unsigned long i, sum = 0;
   2892 
   2893 	for_each_online_cpu(i)
   2894 		sum += cpu_rq(i)->nr_running;
   2895 
   2896 	return sum;
   2897 }
   2898 
   2899 /*
   2900  * Check if only the current task is running on the CPU.
   2901  *
   2902  * Caution: this function does not check that the caller has disabled
   2903  * preemption, thus the result might have a time-of-check-to-time-of-use
    2904  * race.  The caller is responsible for using it correctly, for example:
   2905  *
   2906  * - from a non-preemptible section (of course)
   2907  *
   2908  * - from a thread that is bound to a single CPU
   2909  *
   2910  * - in a loop with very short iterations (e.g. a polling loop)
   2911  */
   2912 bool single_task_running(void)
   2913 {
   2914 	return raw_rq()->nr_running == 1;
   2915 }
   2916 EXPORT_SYMBOL(single_task_running);
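
         /*
          * Illustrative use (not from this file): a halt-polling style loop can
          * bail out early once it is no longer alone on the CPU, with
          * condition_met() standing in for whatever the poller waits on:
          *
          *   while (!condition_met()) {
          *           if (need_resched() || !single_task_running())
          *                   break;
          *           cpu_relax();
          *   }
          */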
   2917 
   2918 unsigned long long nr_context_switches(void)
   2919 {
   2920 	int i;
   2921 	unsigned long long sum = 0;
   2922 
   2923 	for_each_possible_cpu(i)
   2924 		sum += cpu_rq(i)->nr_switches;
   2925 
   2926 	return sum;
   2927 }
   2928 
   2929 /*
   2930  * Consumers of these two interfaces, like for example the cpuidle menu
    2931  * governor, are using nonsensical data: they prefer shallow idle state selection
    2932  * for a CPU that has IO-wait, which might not even end up running the task when
    2933  * it does become runnable.
   2934  */
   2935 
   2936 unsigned long nr_iowait_cpu(int cpu)
   2937 {
   2938 	return atomic_read(&cpu_rq(cpu)->nr_iowait);
   2939 }
   2940 
   2941 /*
    2942  * IO-wait accounting, and how it's mostly bollocks (on SMP).
   2943  *
    2944  * The idea behind IO-wait accounting is to account the idle time that we could
    2945  * have spent running if it were not for IO. That is, if we were to improve the
   2946  * storage performance, we'd have a proportional reduction in IO-wait time.
   2947  *
   2948  * This all works nicely on UP, where, when a task blocks on IO, we account
   2949  * idle time as IO-wait, because if the storage were faster, it could've been
   2950  * running and we'd not be idle.
   2951  *
   2952  * This has been extended to SMP, by doing the same for each CPU. This however
   2953  * is broken.
   2954  *
    2955  * Imagine for instance the case where two tasks block on one CPU: only that one
    2956  * CPU will have IO-wait accounted, while the other has regular idle -- even
    2957  * though, if the storage were faster, both could've run at the same time,
   2958  * utilising both CPUs.
   2959  *
    2960  * This means that, when looking globally, the current IO-wait accounting on
    2961  * SMP is a lower bound, due to under-accounting.
   2962  *
   2963  * Worse, since the numbers are provided per CPU, they are sometimes
   2964  * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly
    2965  * associated with any one particular CPU; it can wake up on a different CPU than
    2966  * the one it blocked on. This means the per-CPU IO-wait number is meaningless.
   2967  *
   2968  * Task CPU affinities can make all that even more 'interesting'.
   2969  */
   2970 
   2971 unsigned long nr_iowait(void)
   2972 {
   2973 	unsigned long i, sum = 0;
   2974 
   2975 	for_each_possible_cpu(i)
   2976 		sum += nr_iowait_cpu(i);
   2977 
   2978 	return sum;
   2979 }
   2980 
   2981 #ifdef CONFIG_SMP
   2982 
   2983 /*
   2984  * sched_exec - execve() is a valuable balancing opportunity, because at
   2985  * this point the task has the smallest effective memory and cache footprint.
   2986  */
   2987 void sched_exec(void)
   2988 {
   2989 	struct task_struct *p = current;
   2990 	unsigned long flags;
   2991 	int dest_cpu;
   2992 
   2993 	raw_spin_lock_irqsave(&p->pi_lock, flags);
   2994 	dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
   2995 	if (dest_cpu == smp_processor_id())
   2996 		goto unlock;
   2997 
   2998 	if (likely(cpu_active(dest_cpu))) {
   2999 		struct migration_arg arg = { p, dest_cpu };
   3000 
   3001 		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
   3002 		stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
   3003 		return;
   3004 	}
   3005 unlock:
   3006 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
   3007 }
   3008 
   3009 #endif
   3010 
   3011 DEFINE_PER_CPU(struct kernel_stat, kstat);
   3012 DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
   3013 
   3014 EXPORT_PER_CPU_SYMBOL(kstat);
   3015 EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
   3016 
   3017 /*
   3018  * The function fair_sched_class.update_curr accesses the struct curr
   3019  * and its field curr->exec_start; when called from task_sched_runtime(),
   3020  * we observe a high rate of cache misses in practice.
   3021  * Prefetching this data results in improved performance.
   3022  */
   3023 static inline void prefetch_curr_exec_start(struct task_struct *p)
   3024 {
   3025 #ifdef CONFIG_FAIR_GROUP_SCHED
   3026 	struct sched_entity *curr = (&p->se)->cfs_rq->curr;
   3027 #else
   3028 	struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
   3029 #endif
   3030 	prefetch(curr);
   3031 	prefetch(&curr->exec_start);
   3032 }
   3033 
   3034 /*
   3035  * Return accounted runtime for the task.
   3036  * In case the task is currently running, return the runtime plus current's
    3037  * pending runtime that has not been accounted yet.
   3038  */
   3039 unsigned long long task_sched_runtime(struct task_struct *p)
   3040 {
   3041 	struct rq_flags rf;
   3042 	struct rq *rq;
   3043 	u64 ns;
   3044 
   3045 #if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
   3046 	/*
   3047 	 * 64-bit doesn't need locks to atomically read a 64-bit value.
    3048 	 * So we have an optimization chance when the task's delta_exec is 0.
   3049 	 * Reading ->on_cpu is racy, but this is ok.
   3050 	 *
   3051 	 * If we race with it leaving CPU, we'll take a lock. So we're correct.
   3052 	 * If we race with it entering CPU, unaccounted time is 0. This is
   3053 	 * indistinguishable from the read occurring a few cycles earlier.
   3054 	 * If we see ->on_cpu without ->on_rq, the task is leaving, and has
   3055 	 * been accounted, so we're correct here as well.
   3056 	 */
   3057 	if (!p->on_cpu || !task_on_rq_queued(p))
   3058 		return p->se.sum_exec_runtime;
   3059 #endif
   3060 
   3061 	rq = task_rq_lock(p, &rf);
   3062 	/*
   3063 	 * Must be ->curr _and_ ->on_rq.  If dequeued, we would
   3064 	 * project cycles that may never be accounted to this
   3065 	 * thread, breaking clock_gettime().
   3066 	 */
   3067 	if (task_current(rq, p) && task_on_rq_queued(p)) {
   3068 		prefetch_curr_exec_start(p);
   3069 		update_rq_clock(rq);
   3070 		p->sched_class->update_curr(rq);
   3071 	}
   3072 	ns = p->se.sum_exec_runtime;
   3073 	task_rq_unlock(rq, p, &rf);
   3074 
   3075 	return ns;
   3076 }
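
         /*
          * The value computed above is what ultimately backs per-thread
          * CPU-time clocks; e.g. a userspace reader (illustrative) doing
          *
          *   struct timespec ts;
          *   clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
          *
          * sees the pending runtime folded in by the update_curr() call above,
          * which is what keeps such reads advancing between ticks while the
          * task keeps running.
          */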
   3077 
   3078 /*
   3079  * This function gets called by the timer code, with HZ frequency.
   3080  * We call it with interrupts disabled.
   3081  */
   3082 void scheduler_tick(void)
   3083 {
   3084 	int cpu = smp_processor_id();
   3085 	struct rq *rq = cpu_rq(cpu);
   3086 	struct task_struct *curr = rq->curr;
   3087 	struct rq_flags rf;
   3088 
   3089 	sched_clock_tick();
   3090 
   3091 	rq_lock(rq, &rf);
   3092 
   3093 	update_rq_clock(rq);
   3094 	curr->sched_class->task_tick(rq, curr, 0);
   3095 	cpu_load_update_active(rq);
   3096 	calc_global_load_tick(rq);
   3097 	psi_task_tick(rq);
   3098 
   3099 	rq_unlock(rq, &rf);
   3100 
   3101 	perf_event_task_tick();
   3102 
   3103 #ifdef CONFIG_SMP
   3104 	rq->idle_balance = idle_cpu(cpu);
   3105 	trigger_load_balance(rq);
   3106 #endif
   3107 }
   3108 
   3109 #ifdef CONFIG_NO_HZ_FULL
   3110 
   3111 struct tick_work {
   3112 	int			cpu;
   3113 	struct delayed_work	work;
   3114 };
   3115 
   3116 static struct tick_work __percpu *tick_work_cpu;
   3117 
   3118 static void sched_tick_remote(struct work_struct *work)
   3119 {
   3120 	struct delayed_work *dwork = to_delayed_work(work);
   3121 	struct tick_work *twork = container_of(dwork, struct tick_work, work);
   3122 	int cpu = twork->cpu;
   3123 	struct rq *rq = cpu_rq(cpu);
   3124 	struct task_struct *curr;
   3125 	struct rq_flags rf;
   3126 	u64 delta;
   3127 
   3128 	/*
   3129 	 * Handle the tick only if it appears the remote CPU is running in full
   3130 	 * dynticks mode. The check is racy by nature, but missing a tick or
    3131 	 * having one too many is no big deal because the scheduler tick updates
   3132 	 * statistics and checks timeslices in a time-independent way, regardless
   3133 	 * of when exactly it is running.
   3134 	 */
   3135 	if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu))
   3136 		goto out_requeue;
   3137 
   3138 	rq_lock_irq(rq, &rf);
   3139 	curr = rq->curr;
   3140 	if (is_idle_task(curr))
   3141 		goto out_unlock;
   3142 
   3143 	update_rq_clock(rq);
   3144 	delta = rq_clock_task(rq) - curr->se.exec_start;
   3145 
   3146 	/*
   3147 	 * Make sure the next tick runs within a reasonable
   3148 	 * amount of time.
   3149 	 */
   3150 	WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
   3151 	curr->sched_class->task_tick(rq, curr, 0);
   3152 
   3153 out_unlock:
   3154 	rq_unlock_irq(rq, &rf);
   3155 
   3156 out_requeue:
   3157 	/*
    3158 	 * Run the remote tick once per second (1Hz). This arbitrary
    3159 	 * frequency is low enough to avoid overload but frequent enough
    3160 	 * to keep scheduler-internal stats reasonably up to date.
   3161 	 */
   3162 	queue_delayed_work(system_unbound_wq, dwork, HZ);
   3163 }
   3164 
   3165 static void sched_tick_start(int cpu)
   3166 {
   3167 	struct tick_work *twork;
   3168 
   3169 	if (housekeeping_cpu(cpu, HK_FLAG_TICK))
   3170 		return;
   3171 
   3172 	WARN_ON_ONCE(!tick_work_cpu);
   3173 
   3174 	twork = per_cpu_ptr(tick_work_cpu, cpu);
   3175 	twork->cpu = cpu;
   3176 	INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
   3177 	queue_delayed_work(system_unbound_wq, &twork->work, HZ);
   3178 }
   3179 
   3180 #ifdef CONFIG_HOTPLUG_CPU
   3181 static void sched_tick_stop(int cpu)
   3182 {
   3183 	struct tick_work *twork;
   3184 
   3185 	if (housekeeping_cpu(cpu, HK_FLAG_TICK))
   3186 		return;
   3187 
   3188 	WARN_ON_ONCE(!tick_work_cpu);
   3189 
   3190 	twork = per_cpu_ptr(tick_work_cpu, cpu);
   3191 	cancel_delayed_work_sync(&twork->work);
   3192 }
   3193 #endif /* CONFIG_HOTPLUG_CPU */
   3194 
   3195 int __init sched_tick_offload_init(void)
   3196 {
   3197 	tick_work_cpu = alloc_percpu(struct tick_work);
   3198 	BUG_ON(!tick_work_cpu);
   3199 
   3200 	return 0;
   3201 }
   3202 
   3203 #else /* !CONFIG_NO_HZ_FULL */
   3204 static inline void sched_tick_start(int cpu) { }
   3205 static inline void sched_tick_stop(int cpu) { }
   3206 #endif
   3207 
   3208 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
   3209 				defined(CONFIG_TRACE_PREEMPT_TOGGLE))
   3210 /*
   3211  * If the value passed in is equal to the current preempt count
   3212  * then we just disabled preemption. Start timing the latency.
   3213  */
   3214 static inline void preempt_latency_start(int val)
   3215 {
   3216 	if (preempt_count() == val) {
   3217 		unsigned long ip = get_lock_parent_ip();
   3218 #ifdef CONFIG_DEBUG_PREEMPT
   3219 		current->preempt_disable_ip = ip;
   3220 #endif
   3221 		trace_preempt_off(CALLER_ADDR0, ip);
   3222 	}
   3223 }
   3224 
   3225 void preempt_count_add(int val)
   3226 {
   3227 #ifdef CONFIG_DEBUG_PREEMPT
   3228 	/*
   3229 	 * Underflow?
   3230 	 */
   3231 	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
   3232 		return;
   3233 #endif
   3234 	__preempt_count_add(val);
   3235 #ifdef CONFIG_DEBUG_PREEMPT
   3236 	/*
   3237 	 * Spinlock count overflowing soon?
   3238 	 */
   3239 	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
   3240 				PREEMPT_MASK - 10);
   3241 #endif
   3242 	preempt_latency_start(val);
   3243 }
   3244 EXPORT_SYMBOL(preempt_count_add);
   3245 NOKPROBE_SYMBOL(preempt_count_add);
   3246 
   3247 /*
    3248  * If the value passed in is equal to the current preempt count
   3249  * then we just enabled preemption. Stop timing the latency.
   3250  */
   3251 static inline void preempt_latency_stop(int val)
   3252 {
   3253 	if (preempt_count() == val)
   3254 		trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
   3255 }
   3256 
   3257 void preempt_count_sub(int val)
   3258 {
   3259 #ifdef CONFIG_DEBUG_PREEMPT
   3260 	/*
   3261 	 * Underflow?
   3262 	 */
   3263 	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
   3264 		return;
   3265 	/*
   3266 	 * Is the spinlock portion underflowing?
   3267 	 */
   3268 	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
   3269 			!(preempt_count() & PREEMPT_MASK)))
   3270 		return;
   3271 #endif
   3272 
   3273 	preempt_latency_stop(val);
   3274 	__preempt_count_sub(val);
   3275 }
   3276 EXPORT_SYMBOL(preempt_count_sub);
   3277 NOKPROBE_SYMBOL(preempt_count_sub);
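
         /*
          * For reference (assuming this block is built in, i.e. CONFIG_PREEMPT
          * plus CONFIG_DEBUG_PREEMPT or CONFIG_TRACE_PREEMPT_TOGGLE):
          *
          *   preempt_disable();           -> preempt_count_add(1)
          *   ... critical section ...        (starts preempt-off latency timing)
          *   preempt_enable();            -> preempt_count_sub(1)
          *                                   (stops the latency timing)
          *
          * Without those options the inline __preempt_count_{add,sub}() helpers
          * are used directly and no preempt-off latency timing is done from here.
          */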
   3278 
   3279 #else
   3280 static inline void preempt_latency_start(int val) { }
   3281 static inline void preempt_latency_stop(int val) { }
   3282 #endif
   3283 
   3284 static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
   3285 {
   3286 #ifdef CONFIG_DEBUG_PREEMPT
   3287 	return p->preempt_disable_ip;
   3288 #else
   3289 	return 0;
   3290 #endif
   3291 }
   3292 
   3293 /*
   3294  * Print scheduling while atomic bug:
   3295  */
   3296 static noinline void __schedule_bug(struct task_struct *prev)
   3297 {
   3298 	/* Save this before calling printk(), since that will clobber it */
   3299 	unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
   3300 
   3301 	if (oops_in_progress)
   3302 		return;
   3303 
   3304 	printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
   3305 		prev->comm, prev->pid, preempt_count());
   3306 
   3307 	debug_show_held_locks(prev);
   3308 	print_modules();
   3309 	if (irqs_disabled())
   3310 		print_irqtrace_events(prev);
   3311 	if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
   3312 	    && in_atomic_preempt_off()) {
   3313 		pr_err("Preemption disabled at:");
   3314 		print_ip_sym(preempt_disable_ip);
   3315 		pr_cont("\n");
   3316 	}
   3317 	if (panic_on_warn)
   3318 		panic("scheduling while atomic\n");
   3319 
   3320 	dump_stack();
   3321 	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
   3322 }
   3323 
   3324 /*
   3325  * Various schedule()-time debugging checks and statistics:
   3326  */
   3327 static inline void schedule_debug(struct task_struct *prev)
   3328 {
   3329 #ifdef CONFIG_SCHED_STACK_END_CHECK
   3330 	if (task_stack_end_corrupted(prev))
   3331 		panic("corrupted stack end detected inside scheduler\n");
   3332 #endif
   3333 
   3334 	if (unlikely(in_atomic_preempt_off())) {
   3335 		__schedule_bug(prev);
   3336 		preempt_count_set(PREEMPT_DISABLED);
   3337 	}
   3338 	rcu_sleep_check();
   3339 
   3340 	profile_hit(SCHED_PROFILING, __builtin_return_address(0));
   3341 
   3342 	schedstat_inc(this_rq()->sched_count);
   3343 }
   3344 
   3345 /*
   3346  * Pick up the highest-prio task:
   3347  */
   3348 static inline struct task_struct *
   3349 pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
   3350 {
   3351 	const struct sched_class *class;
   3352 	struct task_struct *p;
   3353 
   3354 	/*
   3355 	 * Optimization: we know that if all tasks are in the fair class we can
   3356 	 * call that function directly, but only if the @prev task wasn't of a
    3357 	 * higher scheduling class, because otherwise those lose the
   3358 	 * opportunity to pull in more work from other CPUs.
   3359 	 */
   3360 	if (likely((prev->sched_class == &idle_sched_class ||
   3361 		    prev->sched_class == &fair_sched_class) &&
   3362 		   rq->nr_running == rq->cfs.h_nr_running)) {
   3363 
   3364 		p = fair_sched_class.pick_next_task(rq, prev, rf);
   3365 		if (unlikely(p == RETRY_TASK))
   3366 			goto again;
   3367 
   3368 		/* Assumes fair_sched_class->next == idle_sched_class */
   3369 		if (unlikely(!p))
   3370 			p = idle_sched_class.pick_next_task(rq, prev, rf);
   3371 
   3372 		return p;
   3373 	}
   3374 
   3375 again:
   3376 	for_each_class(class) {
   3377 		p = class->pick_next_task(rq, prev, rf);
   3378 		if (p) {
   3379 			if (unlikely(p == RETRY_TASK))
   3380 				goto again;
   3381 			return p;
   3382 		}
   3383 	}
   3384 
   3385 	/* The idle class should always have a runnable task: */
   3386 	BUG();
   3387 }
   3388 
   3389 /*
   3390  * __schedule() is the main scheduler function.
   3391  *
   3392  * The main means of driving the scheduler and thus entering this function are:
   3393  *
   3394  *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
   3395  *
   3396  *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
   3397  *      paths. For example, see arch/x86/entry_64.S.
   3398  *
   3399  *      To drive preemption between tasks, the scheduler sets the flag in timer
   3400  *      interrupt handler scheduler_tick().
   3401  *
   3402  *   3. Wakeups don't really cause entry into schedule(). They add a
   3403  *      task to the run-queue and that's it.
   3404  *
   3405  *      Now, if the new task added to the run-queue preempts the current
   3406  *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
   3407  *      called on the nearest possible occasion:
   3408  *
   3409  *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
   3410  *
   3411  *         - in syscall or exception context, at the next outermost
   3412  *           preempt_enable(). (this might be as soon as the wake_up()'s
   3413  *           spin_unlock()!)
   3414  *
   3415  *         - in IRQ context, return from interrupt-handler to
   3416  *           preemptible context
   3417  *
   3418  *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
   3419  *         then at the next:
   3420  *
   3421  *          - cond_resched() call
   3422  *          - explicit schedule() call
   3423  *          - return from syscall or exception to user-space
   3424  *          - return from interrupt-handler to user-space
   3425  *
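         * A minimal sketch of the voluntary-blocking pattern behind case 1
         * above, assuming a hypothetical 'condition' flag that a waker sets
         * before calling wake_up_process():
         *
         *	set_current_state(TASK_INTERRUPTIBLE);
         *	if (!condition)
         *		schedule();
         *	__set_current_state(TASK_RUNNING);
         *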
   3426  * WARNING: must be called with preemption disabled!
   3427  */
   3428 static void __sched notrace __schedule(bool preempt)
   3429 {
   3430 	struct task_struct *prev, *next;
   3431 	unsigned long *switch_count;
   3432 	struct rq_flags rf;
   3433 	struct rq *rq;
   3434 	int cpu;
   3435 
   3436 	cpu = smp_processor_id();
   3437 	rq = cpu_rq(cpu);
   3438 	prev = rq->curr;
   3439 
   3440 	schedule_debug(prev);
   3441 
   3442 	if (sched_feat(HRTICK))
   3443 		hrtick_clear(rq);
   3444 
   3445 	local_irq_disable();
   3446 	rcu_note_context_switch(preempt);
   3447 
   3448 	/*
   3449 	 * Make sure that signal_pending_state()->signal_pending() below
   3450 	 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
   3451 	 * done by the caller to avoid the race with signal_wake_up().
   3452 	 *
   3453 	 * The membarrier system call requires a full memory barrier
   3454 	 * after coming from user-space, before storing to rq->curr.
   3455 	 */
   3456 	rq_lock(rq, &rf);
   3457 	smp_mb__after_spinlock();
   3458 
   3459 	/* Promote REQ to ACT */
   3460 	rq->clock_update_flags <<= 1;
   3461 	update_rq_clock(rq);
   3462 
   3463 	switch_count = &prev->nivcsw;
   3464 	if (!preempt && prev->state) {
   3465 		if (signal_pending_state(prev->state, prev)) {
   3466 			prev->state = TASK_RUNNING;
   3467 		} else {
   3468 			deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
   3469 			prev->on_rq = 0;
   3470 
   3471 			if (prev->in_iowait) {
   3472 				atomic_inc(&rq->nr_iowait);
   3473 				delayacct_blkio_start();
   3474 			}
   3475 
   3476 			/*
   3477 			 * If a worker went to sleep, notify and ask workqueue
   3478 			 * whether it wants to wake up a task to maintain
   3479 			 * concurrency.
   3480 			 */
   3481 			if (prev->flags & PF_WQ_WORKER) {
   3482 				struct task_struct *to_wakeup;
   3483 
   3484 				to_wakeup = wq_worker_sleeping(prev);
   3485 				if (to_wakeup)
   3486 					try_to_wake_up_local(to_wakeup, &rf);
   3487 			}
   3488 		}
   3489 		switch_count = &prev->nvcsw;
   3490 	}
   3491 
   3492 	next = pick_next_task(rq, prev, &rf);
   3493 	clear_tsk_need_resched(prev);
   3494 	clear_preempt_need_resched();
   3495 
   3496 	if (likely(prev != next)) {
   3497 		rq->nr_switches++;
   3498 		rq->curr = next;
   3499 		/*
   3500 		 * The membarrier system call requires each architecture
   3501 		 * to have a full memory barrier after updating
   3502 		 * rq->curr, before returning to user-space.
   3503 		 *
   3504 		 * Here are the schemes providing that barrier on the
   3505 		 * various architectures:
   3506 		 * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
   3507 		 *   switch_mm() relies on membarrier_arch_switch_mm() on PowerPC.
   3508 		 * - finish_lock_switch() for weakly-ordered
   3509 		 *   architectures where spin_unlock is a full barrier,
   3510 		 * - switch_to() for arm64 (weakly-ordered, spin_unlock
   3511 		 *   is a RELEASE barrier),
   3512 		 */
   3513 		++*switch_count;
   3514 
   3515 		trace_sched_switch(preempt, prev, next);
   3516 
   3517 		/* Also unlocks the rq: */
   3518 		rq = context_switch(rq, prev, next, &rf);
   3519 	} else {
   3520 		rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
   3521 		rq_unlock_irq(rq, &rf);
   3522 	}
   3523 
   3524 	balance_callback(rq);
   3525 }
   3526 
   3527 void __noreturn do_task_dead(void)
   3528 {
   3529 	/* Causes final put_task_struct in finish_task_switch(): */
   3530 	set_special_state(TASK_DEAD);
   3531 
   3532 	/* Tell freezer to ignore us: */
   3533 	current->flags |= PF_NOFREEZE;
   3534 
   3535 	__schedule(false);
   3536 	BUG();
   3537 
   3538 	/* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */
   3539 	for (;;)
   3540 		cpu_relax();
   3541 }
   3542 
   3543 static inline void sched_submit_work(struct task_struct *tsk)
   3544 {
   3545 	if (!tsk->state || tsk_is_pi_blocked(tsk))
   3546 		return;
   3547 	/*
   3548 	 * If we are going to sleep and we have plugged IO queued,
   3549 	 * make sure to submit it to avoid deadlocks.
   3550 	 */
   3551 	if (blk_needs_flush_plug(tsk))
   3552 		blk_schedule_flush_plug(tsk);
   3553 }
   3554 
   3555 asmlinkage __visible void __sched schedule(void)
   3556 {
   3557 	struct task_struct *tsk = current;
   3558 
   3559 	sched_submit_work(tsk);
   3560 	do {
   3561 		preempt_disable();
   3562 		__schedule(false);
   3563 		sched_preempt_enable_no_resched();
   3564 	} while (need_resched());
   3565 }
   3566 EXPORT_SYMBOL(schedule);
   3567 
   3568 /*
   3569  * synchronize_rcu_tasks() makes sure that no task is stuck in preempted
   3570  * state (i.e. has scheduled out non-voluntarily) by making sure that all
   3571  * tasks have either left the run queue or have gone into user space.
   3572  * As idle tasks do not do either, they must not ever be preempted
   3573  * (scheduled out non-voluntarily).
   3574  *
   3575  * schedule_idle() is similar to schedule_preempt_disabled() except that it
   3576  * never enables preemption because it does not call sched_submit_work().
   3577  */
   3578 void __sched schedule_idle(void)
   3579 {
   3580 	/*
   3581 	 * As this skips calling sched_submit_work(), which the idle task does
   3582 	 * regardless because that function is a nop when the task is in a
   3583 	 * TASK_RUNNING state, make sure this isn't used someplace where the
   3584 	 * current task can be in any other state. Note, idle is always in the
   3585 	 * TASK_RUNNING state.
   3586 	 */
   3587 	WARN_ON_ONCE(current->state);
   3588 	do {
   3589 		__schedule(false);
   3590 	} while (need_resched());
   3591 }
   3592 
   3593 #ifdef CONFIG_CONTEXT_TRACKING
   3594 asmlinkage __visible void __sched schedule_user(void)
   3595 {
   3596 	/*
   3597 	 * If we come here after a random call to set_need_resched(),
   3598 	 * or we have been woken up remotely but the IPI has not yet arrived,
   3599 	 * we haven't yet exited the RCU idle mode. Do it here manually until
   3600 	 * we find a better solution.
   3601 	 *
   3602 	 * NB: There are buggy callers of this function.  Ideally we
   3603 	 * should warn if prev_state != CONTEXT_USER, but that will trigger
   3604 	 * too frequently to make sense yet.
   3605 	 */
   3606 	enum ctx_state prev_state = exception_enter();
   3607 	schedule();
   3608 	exception_exit(prev_state);
   3609 }
   3610 #endif
   3611 
   3612 /**
   3613  * schedule_preempt_disabled - called with preemption disabled
   3614  *
   3615  * Returns with preemption disabled. Note: preempt_count must be 1
   3616  */
   3617 void __sched schedule_preempt_disabled(void)
   3618 {
   3619 	sched_preempt_enable_no_resched();
   3620 	schedule();
   3621 	preempt_disable();
   3622 }
   3623 
   3624 static void __sched notrace preempt_schedule_common(void)
   3625 {
   3626 	do {
   3627 		/*
   3628 		 * Because the function tracer can trace preempt_count_sub()
   3629 		 * and it also uses preempt_enable/disable_notrace(), if
   3630 		 * NEED_RESCHED is set, the preempt_enable_notrace() called
   3631 		 * by the function tracer will call this function again and
   3632 		 * cause infinite recursion.
   3633 		 *
   3634 		 * Preemption must be disabled here before the function
   3635 		 * tracer can trace. Break up preempt_disable() into two
   3636 		 * calls. One to disable preemption without fear of being
   3637 		 * traced. The other to still record the preemption latency,
   3638 		 * which can also be traced by the function tracer.
   3639 		 */
   3640 		preempt_disable_notrace();
   3641 		preempt_latency_start(1);
   3642 		__schedule(true);
   3643 		preempt_latency_stop(1);
   3644 		preempt_enable_no_resched_notrace();
   3645 
   3646 		/*
   3647 		 * Check again in case we missed a preemption opportunity
   3648 		 * between schedule and now.
   3649 		 */
   3650 	} while (need_resched());
   3651 }
   3652 
   3653 #ifdef CONFIG_PREEMPT
   3654 /*
   3655  * this is the entry point to schedule() from in-kernel preemption
   3656  * off of preempt_enable(). Kernel preemption off of return-from-interrupt
   3657  * occurs in the interrupt-return path, which calls schedule() directly.
   3658  */
   3659 asmlinkage __visible void __sched notrace preempt_schedule(void)
   3660 {
   3661 	/*
   3662 	 * If there is a non-zero preempt_count or interrupts are disabled,
   3663 	 * we do not want to preempt the current task. Just return..
   3664 	 */
   3665 	if (likely(!preemptible()))
   3666 		return;
   3667 
   3668 	preempt_schedule_common();
   3669 }
   3670 NOKPROBE_SYMBOL(preempt_schedule);
   3671 EXPORT_SYMBOL(preempt_schedule);
   3672 
   3673 /**
   3674  * preempt_schedule_notrace - preempt_schedule called by tracing
   3675  *
   3676  * The tracing infrastructure uses preempt_enable_notrace to prevent
   3677  * recursion and tracing preempt enabling caused by the tracing
   3678  * infrastructure itself. But as tracing can happen in areas coming
   3679  * from userspace or just about to enter userspace, a preempt enable
   3680  * can occur before user_exit() is called. This will cause the scheduler
   3681  * to be called when the system is still in usermode.
   3682  *
   3683  * To prevent this, the preempt_enable_notrace will use this function
   3684  * instead of preempt_schedule() to exit user context if needed before
   3685  * calling the scheduler.
   3686  */
   3687 asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
   3688 {
   3689 	enum ctx_state prev_ctx;
   3690 
   3691 	if (likely(!preemptible()))
   3692 		return;
   3693 
   3694 	do {
   3695 		/*
   3696 		 * Because the function tracer can trace preempt_count_sub()
   3697 		 * and it also uses preempt_enable/disable_notrace(), if
   3698 		 * NEED_RESCHED is set, the preempt_enable_notrace() called
   3699 		 * by the function tracer will call this function again and
   3700 		 * cause infinite recursion.
   3701 		 *
   3702 		 * Preemption must be disabled here before the function
   3703 		 * tracer can trace. Break up preempt_disable() into two
   3704 		 * calls. One to disable preemption without fear of being
   3705 		 * traced. The other to still record the preemption latency,
   3706 		 * which can also be traced by the function tracer.
   3707 		 */
   3708 		preempt_disable_notrace();
   3709 		preempt_latency_start(1);
   3710 		/*
   3711 		 * Needs preempt disabled in case user_exit() is traced
   3712 		 * and the tracer calls preempt_enable_notrace() causing
   3713 		 * an infinite recursion.
   3714 		 */
   3715 		prev_ctx = exception_enter();
   3716 		__schedule(true);
   3717 		exception_exit(prev_ctx);
   3718 
   3719 		preempt_latency_stop(1);
   3720 		preempt_enable_no_resched_notrace();
   3721 	} while (need_resched());
   3722 }
   3723 EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
   3724 
   3725 #endif /* CONFIG_PREEMPT */
   3726 
   3727 /*
   3728  * this is the entry point to schedule() from kernel preemption
   3729  * off of irq context.
   3730  * Note that this is called and returns with IRQs disabled. This will
   3731  * protect us against recursive calls from IRQ context.
   3732  */
   3733 asmlinkage __visible void __sched preempt_schedule_irq(void)
   3734 {
   3735 	enum ctx_state prev_state;
   3736 
   3737 	/* Catch callers which need to be fixed */
   3738 	BUG_ON(preempt_count() || !irqs_disabled());
   3739 
   3740 	prev_state = exception_enter();
   3741 
   3742 	do {
   3743 		preempt_disable();
   3744 		local_irq_enable();
   3745 		__schedule(true);
   3746 		local_irq_disable();
   3747 		sched_preempt_enable_no_resched();
   3748 	} while (need_resched());
   3749 
   3750 	exception_exit(prev_state);
   3751 }
   3752 
   3753 int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
   3754 			  void *key)
   3755 {
   3756 	return try_to_wake_up(curr->private, mode, wake_flags);
   3757 }
   3758 EXPORT_SYMBOL(default_wake_function);
   3759 
   3760 #ifdef CONFIG_RT_MUTEXES
   3761 
   3762 static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
   3763 {
   3764 	if (pi_task)
   3765 		prio = min(prio, pi_task->prio);
   3766 
   3767 	return prio;
   3768 }
   3769 
   3770 static inline int rt_effective_prio(struct task_struct *p, int prio)
   3771 {
   3772 	struct task_struct *pi_task = rt_mutex_get_top_task(p);
   3773 
   3774 	return __rt_effective_prio(pi_task, prio);
   3775 }
   3776 
   3777 /*
   3778  * rt_mutex_setprio - set the current priority of a task
   3779  * @p: task to boost
   3780  * @pi_task: donor task
   3781  *
   3782  * This function changes the 'effective' priority of a task. It does
   3783  * not touch ->normal_prio like __setscheduler().
   3784  *
   3785  * Used by the rt_mutex code to implement priority inheritance
   3786  * logic. The call site only calls this if the priority of the task changed.
   3787  */
   3788 void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
   3789 {
   3790 	int prio, oldprio, queued, running, queue_flag =
   3791 		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
   3792 	const struct sched_class *prev_class;
   3793 	struct rq_flags rf;
   3794 	struct rq *rq;
   3795 
   3796 	/* XXX used to be waiter->prio, not waiter->task->prio */
   3797 	prio = __rt_effective_prio(pi_task, p->normal_prio);
   3798 
   3799 	/*
   3800 	 * If nothing changed; bail early.
   3801 	 */
   3802 	if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
   3803 		return;
   3804 
   3805 	rq = __task_rq_lock(p, &rf);
   3806 	update_rq_clock(rq);
   3807 	/*
   3808 	 * Set under pi_lock && rq->lock, such that the value can be used under
   3809 	 * either lock.
   3810 	 *
   3811 	 * Note that it takes a fair amount of trickery to make this pointer cache work
   3812 	 * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
   3813 	 * ensure a task is de-boosted (pi_task is set to NULL) before the
   3814 	 * task is allowed to run again (and can exit). This ensures the pointer
   3815 	 * points to a blocked task -- which guarantees the task is present.
   3816 	 */
   3817 	p->pi_top_task = pi_task;
   3818 
   3819 	/*
   3820 	 * For FIFO/RR we only need to set prio, if that matches we're done.
   3821 	 */
   3822 	if (prio == p->prio && !dl_prio(prio))
   3823 		goto out_unlock;
   3824 
   3825 	/*
   3826 	 * Idle task boosting is a no-no in general. There is one
   3827 	 * exception, when PREEMPT_RT and NOHZ are active:
   3828 	 *
   3829 	 * The idle task calls get_next_timer_interrupt() and holds
   3830 	 * the timer wheel base->lock on the CPU and another CPU wants
   3831 	 * to access the timer (probably to cancel it). We can safely
   3832 	 * ignore the boosting request, as the idle CPU runs this code
   3833 	 * with interrupts disabled and will complete the lock
   3834 	 * protected section without being interrupted. So there is no
   3835 	 * real need to boost.
   3836 	 */
   3837 	if (unlikely(p == rq->idle)) {
   3838 		WARN_ON(p != rq->curr);
   3839 		WARN_ON(p->pi_blocked_on);
   3840 		goto out_unlock;
   3841 	}
   3842 
   3843 	trace_sched_pi_setprio(p, pi_task);
   3844 	oldprio = p->prio;
   3845 
   3846 	if (oldprio == prio)
   3847 		queue_flag &= ~DEQUEUE_MOVE;
   3848 
   3849 	prev_class = p->sched_class;
   3850 	queued = task_on_rq_queued(p);
   3851 	running = task_current(rq, p);
   3852 	if (queued)
   3853 		dequeue_task(rq, p, queue_flag);
   3854 	if (running)
   3855 		put_prev_task(rq, p);
   3856 
   3857 	/*
   3858 	 * Boosting conditions are:
   3859 	 * 1. -rt task is running and holds mutex A
   3860 	 *      --> -dl task blocks on mutex A
   3861 	 *
   3862 	 * 2. -dl task is running and holds mutex A
   3863 	 *      --> -dl task blocks on mutex A and could preempt the
   3864 	 *          running task
   3865 	 */
   3866 	if (dl_prio(prio)) {
   3867 		if (!dl_prio(p->normal_prio) ||
   3868 		    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
   3869 			p->dl.dl_boosted = 1;
   3870 			queue_flag |= ENQUEUE_REPLENISH;
   3871 		} else
   3872 			p->dl.dl_boosted = 0;
   3873 		p->sched_class = &dl_sched_class;
   3874 	} else if (rt_prio(prio)) {
   3875 		if (dl_prio(oldprio))
   3876 			p->dl.dl_boosted = 0;
   3877 		if (oldprio < prio)
   3878 			queue_flag |= ENQUEUE_HEAD;
   3879 		p->sched_class = &rt_sched_class;
   3880 	} else {
   3881 		if (dl_prio(oldprio))
   3882 			p->dl.dl_boosted = 0;
   3883 		if (rt_prio(oldprio))
   3884 			p->rt.timeout = 0;
   3885 		p->sched_class = &fair_sched_class;
   3886 	}
   3887 
   3888 	p->prio = prio;
   3889 
   3890 	if (queued)
   3891 		enqueue_task(rq, p, queue_flag);
   3892 	if (running)
   3893 		set_curr_task(rq, p);
   3894 
   3895 	check_class_changed(rq, p, prev_class, oldprio);
   3896 out_unlock:
   3897 	/* Avoid rq from going away on us: */
   3898 	preempt_disable();
   3899 	__task_rq_unlock(rq, &rf);
   3900 
   3901 	balance_callback(rq);
   3902 	preempt_enable();
   3903 }
   3904 #else
   3905 static inline int rt_effective_prio(struct task_struct *p, int prio)
   3906 {
   3907 	return prio;
   3908 }
   3909 #endif
   3910 
   3911 void set_user_nice(struct task_struct *p, long nice)
   3912 {
   3913 	bool queued, running;
   3914 	int old_prio, delta;
   3915 	struct rq_flags rf;
   3916 	struct rq *rq;
   3917 
   3918 	if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
   3919 		return;
   3920 	/*
   3921 	 * We have to be careful, if called from sys_setpriority(),
   3922 	 * the task might be in the middle of scheduling on another CPU.
   3923 	 */
   3924 	rq = task_rq_lock(p, &rf);
   3925 	update_rq_clock(rq);
   3926 
   3927 	/*
   3928 	 * The RT priorities are set via sched_setscheduler(), but we still
   3929 	 * allow the 'normal' nice value to be set - but as expected
   3930 	 * it won't have any effect on scheduling while the task remains
   3931 	 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
   3932 	 */
   3933 	if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
   3934 		p->static_prio = NICE_TO_PRIO(nice);
   3935 		goto out_unlock;
   3936 	}
   3937 	queued = task_on_rq_queued(p);
   3938 	running = task_current(rq, p);
   3939 	if (queued)
   3940 		dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
   3941 	if (running)
   3942 		put_prev_task(rq, p);
   3943 
   3944 	p->static_prio = NICE_TO_PRIO(nice);
   3945 	set_load_weight(p, true);
   3946 	old_prio = p->prio;
   3947 	p->prio = effective_prio(p);
   3948 	delta = p->prio - old_prio;
   3949 
   3950 	if (queued) {
   3951 		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
   3952 		/*
   3953 		 * If the task increased its priority or is running and
   3954 		 * lowered its priority, then reschedule its CPU:
   3955 		 */
   3956 		if (delta < 0 || (delta > 0 && task_running(rq, p)))
   3957 			resched_curr(rq);
   3958 	}
   3959 	if (running)
   3960 		set_curr_task(rq, p);
   3961 out_unlock:
   3962 	task_rq_unlock(rq, p, &rf);
   3963 }
   3964 EXPORT_SYMBOL(set_user_nice);
   3965 
   3966 /*
   3967  * can_nice - check if a task can reduce its nice value
   3968  * @p: task
   3969  * @nice: nice value
   3970  */
   3971 int can_nice(const struct task_struct *p, const int nice)
   3972 {
   3973 	/* Convert nice value [19,-20] to rlimit style value [1,40]: */
   3974 	int nice_rlim = nice_to_rlimit(nice);
   3975 
   3976 	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
   3977 		capable(CAP_SYS_NICE));
   3978 }
   3979 
   3980 #ifdef __ARCH_WANT_SYS_NICE
   3981 
   3982 /*
   3983  * sys_nice - change the priority of the current process.
   3984  * @increment: priority increment
   3985  *
   3986  * sys_setpriority is a more generic, but much slower function that
   3987  * does similar things.
   3988  */
   3989 SYSCALL_DEFINE1(nice, int, increment)
   3990 {
   3991 	long nice, retval;
   3992 
   3993 	/*
   3994 	 * Setpriority might change our priority at the same moment.
   3995 	 * We don't have to worry. Conceptually one call occurs first
   3996 	 * and we have a single winner.
   3997 	 */
   3998 	increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
   3999 	nice = task_nice(current) + increment;
   4000 
   4001 	nice = clamp_val(nice, MIN_NICE, MAX_NICE);
   4002 	if (increment < 0 && !can_nice(current, nice))
   4003 		return -EPERM;
   4004 
   4005 	retval = security_task_setnice(current, nice);
   4006 	if (retval)
   4007 		return retval;
   4008 
   4009 	set_user_nice(current, nice);
   4010 	return 0;
   4011 }
   4012 
   4013 #endif
   4014 
   4015 /**
   4016  * task_prio - return the priority value of a given task.
   4017  * @p: the task in question.
   4018  *
   4019  * Return: The priority value as seen by users in /proc.
   4020  * RT tasks are offset by -100 and report negative values; normal
   4021  * tasks report values in the range 0..39 (nice value + 20).
   4022  */
   4023 int task_prio(const struct task_struct *p)
   4024 {
   4025 	return p->prio - MAX_RT_PRIO;
   4026 }
   4027 
   4028 /**
   4029  * idle_cpu - is a given CPU idle currently?
   4030  * @cpu: the processor in question.
   4031  *
   4032  * Return: 1 if the CPU is currently idle. 0 otherwise.
   4033  */
   4034 int idle_cpu(int cpu)
   4035 {
   4036 	struct rq *rq = cpu_rq(cpu);
   4037 
   4038 	if (rq->curr != rq->idle)
   4039 		return 0;
   4040 
   4041 	if (rq->nr_running)
   4042 		return 0;
   4043 
   4044 #ifdef CONFIG_SMP
   4045 	if (!llist_empty(&rq->wake_list))
   4046 		return 0;
   4047 #endif
   4048 
   4049 	return 1;
   4050 }
   4051 
   4052 /**
   4053  * available_idle_cpu - is a given CPU idle for enqueuing work.
   4054  * @cpu: the CPU in question.
   4055  *
   4056  * Return: 1 if the CPU is currently idle. 0 otherwise.
   4057  */
   4058 int available_idle_cpu(int cpu)
   4059 {
   4060 	if (!idle_cpu(cpu))
   4061 		return 0;
   4062 
   4063 	if (vcpu_is_preempted(cpu))
   4064 		return 0;
   4065 
   4066 	return 1;
   4067 }
   4068 
   4069 /**
   4070  * idle_task - return the idle task for a given CPU.
   4071  * @cpu: the processor in question.
   4072  *
   4073  * Return: The idle task for the CPU @cpu.
   4074  */
   4075 struct task_struct *idle_task(int cpu)
   4076 {
   4077 	return cpu_rq(cpu)->idle;
   4078 }
   4079 
   4080 /**
   4081  * find_process_by_pid - find a process with a matching PID value.
   4082  * @pid: the pid in question.
   4083  *
   4084  * The task of @pid, if found. %NULL otherwise.
   4085  */
   4086 static struct task_struct *find_process_by_pid(pid_t pid)
   4087 {
   4088 	return pid ? find_task_by_vpid(pid) : current;
   4089 }
   4090 
   4091 /*
   4092  * sched_setparam() passes in -1 for its policy, to let the functions
   4093  * it calls know not to change it.
   4094  */
   4095 #define SETPARAM_POLICY	-1
   4096 
   4097 static void __setscheduler_params(struct task_struct *p,
   4098 		const struct sched_attr *attr)
   4099 {
   4100 	int policy = attr->sched_policy;
   4101 
   4102 	if (policy == SETPARAM_POLICY)
   4103 		policy = p->policy;
   4104 
   4105 	p->policy = policy;
   4106 
   4107 	if (dl_policy(policy))
   4108 		__setparam_dl(p, attr);
   4109 	else if (fair_policy(policy))
   4110 		p->static_prio = NICE_TO_PRIO(attr->sched_nice);
   4111 
   4112 	/*
   4113 	 * __sched_setscheduler() ensures attr->sched_priority == 0 when
   4114 	 * !rt_policy. Always setting this ensures that things like
   4115 	 * getparam()/getattr() don't report silly values for !rt tasks.
   4116 	 */
   4117 	p->rt_priority = attr->sched_priority;
   4118 	p->normal_prio = normal_prio(p);
   4119 	set_load_weight(p, true);
   4120 }
   4121 
   4122 /* Actually do priority change: must hold pi & rq lock. */
   4123 static void __setscheduler(struct rq *rq, struct task_struct *p,
   4124 			   const struct sched_attr *attr, bool keep_boost)
   4125 {
   4126 	__setscheduler_params(p, attr);
   4127 
   4128 	/*
   4129 	 * Keep a potential priority boosting if called from
   4130 	 * sched_setscheduler().
   4131 	 */
   4132 	p->prio = normal_prio(p);
   4133 	if (keep_boost)
   4134 		p->prio = rt_effective_prio(p, p->prio);
   4135 
   4136 	if (dl_prio(p->prio))
   4137 		p->sched_class = &dl_sched_class;
   4138 	else if (rt_prio(p->prio))
   4139 		p->sched_class = &rt_sched_class;
   4140 	else
   4141 		p->sched_class = &fair_sched_class;
   4142 }
   4143 
   4144 /*
   4145  * Check the target process has a UID that matches the current process's:
   4146  */
   4147 static bool check_same_owner(struct task_struct *p)
   4148 {
   4149 	const struct cred *cred = current_cred(), *pcred;
   4150 	bool match;
   4151 
   4152 	rcu_read_lock();
   4153 	pcred = __task_cred(p);
   4154 	match = (uid_eq(cred->euid, pcred->euid) ||
   4155 		 uid_eq(cred->euid, pcred->uid));
   4156 	rcu_read_unlock();
   4157 	return match;
   4158 }
   4159 
   4160 static int __sched_setscheduler(struct task_struct *p,
   4161 				const struct sched_attr *attr,
   4162 				bool user, bool pi)
   4163 {
   4164 	int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
   4165 		      MAX_RT_PRIO - 1 - attr->sched_priority;
   4166 	int retval, oldprio, oldpolicy = -1, queued, running;
   4167 	int new_effective_prio, policy = attr->sched_policy;
   4168 	const struct sched_class *prev_class;
   4169 	struct rq_flags rf;
   4170 	int reset_on_fork;
   4171 	int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
   4172 	struct rq *rq;
   4173 
   4174 	/* The pi code expects interrupts enabled */
   4175 	BUG_ON(pi && in_interrupt());
   4176 recheck:
   4177 	/* Double check policy once rq lock held: */
   4178 	if (policy < 0) {
   4179 		reset_on_fork = p->sched_reset_on_fork;
   4180 		policy = oldpolicy = p->policy;
   4181 	} else {
   4182 		reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
   4183 
   4184 		if (!valid_policy(policy))
   4185 			return -EINVAL;
   4186 	}
   4187 
   4188 	if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
   4189 		return -EINVAL;
   4190 
   4191 	/*
   4192 	 * Valid priorities for SCHED_FIFO and SCHED_RR are
   4193 	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
   4194 	 * SCHED_BATCH and SCHED_IDLE is 0.
   4195 	 */
   4196 	if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
   4197 	    (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
   4198 		return -EINVAL;
   4199 	if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
   4200 	    (rt_policy(policy) != (attr->sched_priority != 0)))
   4201 		return -EINVAL;
   4202 
   4203 	/*
   4204 	 * Allow unprivileged RT tasks to decrease priority:
   4205 	 */
   4206 	if (user && !capable(CAP_SYS_NICE)) {
   4207 		if (fair_policy(policy)) {
   4208 			if (attr->sched_nice < task_nice(p) &&
   4209 			    !can_nice(p, attr->sched_nice))
   4210 				return -EPERM;
   4211 		}
   4212 
   4213 		if (rt_policy(policy)) {
   4214 			unsigned long rlim_rtprio =
   4215 					task_rlimit(p, RLIMIT_RTPRIO);
   4216 
   4217 			/* Can't set/change the rt policy: */
   4218 			if (policy != p->policy && !rlim_rtprio)
   4219 				return -EPERM;
   4220 
   4221 			/* Can't increase priority: */
   4222 			if (attr->sched_priority > p->rt_priority &&
   4223 			    attr->sched_priority > rlim_rtprio)
   4224 				return -EPERM;
   4225 		}
   4226 
   4227 		 /*
   4228 		  * Can't set/change SCHED_DEADLINE policy at all for now
   4229 		  * (safest behavior); in the future we would like to allow
   4230 		  * unprivileged DL tasks to increase their relative deadline
   4231 		  * or reduce their runtime (both ways reducing utilization)
   4232 		  */
   4233 		if (dl_policy(policy))
   4234 			return -EPERM;
   4235 
   4236 		/*
   4237 		 * Treat SCHED_IDLE as nice 20. Only allow a switch to
   4238 		 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
   4239 		 */
   4240 		if (task_has_idle_policy(p) && !idle_policy(policy)) {
   4241 			if (!can_nice(p, task_nice(p)))
   4242 				return -EPERM;
   4243 		}
   4244 
   4245 		/* Can't change other user's priorities: */
   4246 		if (!check_same_owner(p))
   4247 			return -EPERM;
   4248 
   4249 		/* Normal users shall not reset the sched_reset_on_fork flag: */
   4250 		if (p->sched_reset_on_fork && !reset_on_fork)
   4251 			return -EPERM;
   4252 	}
   4253 
   4254 	if (user) {
   4255 		if (attr->sched_flags & SCHED_FLAG_SUGOV)
   4256 			return -EINVAL;
   4257 
   4258 		retval = security_task_setscheduler(p);
   4259 		if (retval)
   4260 			return retval;
   4261 	}
   4262 
   4263 	/*
   4264 	 * Make sure no PI-waiters arrive (or leave) while we are
   4265 	 * changing the priority of the task:
   4266 	 *
   4267 	 * To be able to change p->policy safely, the appropriate
   4268 	 * runqueue lock must be held.
   4269 	 */
   4270 	rq = task_rq_lock(p, &rf);
   4271 	update_rq_clock(rq);
   4272 
   4273 	/*
   4274 	 * Changing the policy of the stop threads is a very bad idea:
   4275 	 */
   4276 	if (p == rq->stop) {
   4277 		task_rq_unlock(rq, p, &rf);
   4278 		return -EINVAL;
   4279 	}
   4280 
   4281 	/*
   4282 	 * If not changing anything there's no need to proceed further,
   4283 	 * but store a possible modification of reset_on_fork.
   4284 	 */
   4285 	if (unlikely(policy == p->policy)) {
   4286 		if (fair_policy(policy) && attr->sched_nice != task_nice(p))
   4287 			goto change;
   4288 		if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
   4289 			goto change;
   4290 		if (dl_policy(policy) && dl_param_changed(p, attr))
   4291 			goto change;
   4292 
   4293 		p->sched_reset_on_fork = reset_on_fork;
   4294 		task_rq_unlock(rq, p, &rf);
   4295 		return 0;
   4296 	}
   4297 change:
   4298 
   4299 	if (user) {
   4300 #ifdef CONFIG_RT_GROUP_SCHED
   4301 		/*
   4302 		 * Do not allow realtime tasks into groups that have no runtime
   4303 		 * assigned.
   4304 		 */
   4305 		if (rt_bandwidth_enabled() && rt_policy(policy) &&
   4306 				task_group(p)->rt_bandwidth.rt_runtime == 0 &&
   4307 				!task_group_is_autogroup(task_group(p))) {
   4308 			task_rq_unlock(rq, p, &rf);
   4309 			return -EPERM;
   4310 		}
   4311 #endif
   4312 #ifdef CONFIG_SMP
   4313 		if (dl_bandwidth_enabled() && dl_policy(policy) &&
   4314 				!(attr->sched_flags & SCHED_FLAG_SUGOV)) {
   4315 			cpumask_t *span = rq->rd->span;
   4316 
   4317 			/*
   4318 			 * Don't allow tasks with an affinity mask smaller than
   4319 			 * the entire root_domain to become SCHED_DEADLINE. We
   4320 			 * will also fail if there's no bandwidth available.
   4321 			 */
   4322 			if (!cpumask_subset(span, &p->cpus_allowed) ||
   4323 			    rq->rd->dl_bw.bw == 0) {
   4324 				task_rq_unlock(rq, p, &rf);
   4325 				return -EPERM;
   4326 			}
   4327 		}
   4328 #endif
   4329 	}
   4330 
   4331 	/* Re-check policy now with rq lock held: */
   4332 	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
   4333 		policy = oldpolicy = -1;
   4334 		task_rq_unlock(rq, p, &rf);
   4335 		goto recheck;
   4336 	}
   4337 
   4338 	/*
   4339 	 * If setscheduling to SCHED_DEADLINE (or changing the parameters
   4340 	 * of a SCHED_DEADLINE task) we need to check if enough bandwidth
   4341 	 * is available.
   4342 	 */
   4343 	if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
   4344 		task_rq_unlock(rq, p, &rf);
   4345 		return -EBUSY;
   4346 	}
   4347 
   4348 	p->sched_reset_on_fork = reset_on_fork;
   4349 	oldprio = p->prio;
   4350 
   4351 	if (pi) {
   4352 		/*
   4353 		 * Take priority boosted tasks into account. If the new
   4354 		 * effective priority is unchanged, we just store the new
   4355 		 * the runqueue. This will be done when the task deboosts
   4356 		 * the runqueue. This will be done when the task deboost
   4357 		 * itself.
   4358 		 */
   4359 		new_effective_prio = rt_effective_prio(p, newprio);
   4360 		if (new_effective_prio == oldprio)
   4361 			queue_flags &= ~DEQUEUE_MOVE;
   4362 	}
   4363 
   4364 	queued = task_on_rq_queued(p);
   4365 	running = task_current(rq, p);
   4366 	if (queued)
   4367 		dequeue_task(rq, p, queue_flags);
   4368 	if (running)
   4369 		put_prev_task(rq, p);
   4370 
   4371 	prev_class = p->sched_class;
   4372 	__setscheduler(rq, p, attr, pi);
   4373 
   4374 	if (queued) {
   4375 		/*
   4376 		 * We enqueue to tail when the priority of a task is
   4377 		 * increased (user space view).
   4378 		 */
   4379 		if (oldprio < p->prio)
   4380 			queue_flags |= ENQUEUE_HEAD;
   4381 
   4382 		enqueue_task(rq, p, queue_flags);
   4383 	}
   4384 	if (running)
   4385 		set_curr_task(rq, p);
   4386 
   4387 	check_class_changed(rq, p, prev_class, oldprio);
   4388 
   4389 	/* Avoid rq from going away on us: */
   4390 	preempt_disable();
   4391 	task_rq_unlock(rq, p, &rf);
   4392 
   4393 	if (pi)
   4394 		rt_mutex_adjust_pi(p);
   4395 
   4396 	/* Run balance callbacks after we've adjusted the PI chain: */
   4397 	balance_callback(rq);
   4398 	preempt_enable();
   4399 
   4400 	return 0;
   4401 }
   4402 
   4403 static int _sched_setscheduler(struct task_struct *p, int policy,
   4404 			       const struct sched_param *param, bool check)
   4405 {
   4406 	struct sched_attr attr = {
   4407 		.sched_policy   = policy,
   4408 		.sched_priority = param->sched_priority,
   4409 		.sched_nice	= PRIO_TO_NICE(p->static_prio),
   4410 	};
   4411 
   4412 	/* Fixup the legacy SCHED_RESET_ON_FORK hack. */
   4413 	if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
   4414 		attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
   4415 		policy &= ~SCHED_RESET_ON_FORK;
   4416 		attr.sched_policy = policy;
   4417 	}
   4418 
   4419 	return __sched_setscheduler(p, &attr, check, true);
   4420 }
   4421 /**
   4422  * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
   4423  * @p: the task in question.
   4424  * @policy: new policy.
   4425  * @param: structure containing the new RT priority.
   4426  *
   4427  * Return: 0 on success. An error code otherwise.
   4428  *
   4429  * NOTE that the task may already be dead.
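         *
         * A minimal illustrative call (a sketch only; 'p' stands for a task the
         * caller already holds a reference to, and the priority value is arbitrary):
         *
         *	struct sched_param sp = { .sched_priority = 10 };
         *
         *	if (sched_setscheduler(p, SCHED_FIFO, &sp))
         *		pr_warn("failed to switch %s to SCHED_FIFO\n", p->comm);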
   4430  */
   4431 int sched_setscheduler(struct task_struct *p, int policy,
   4432 		       const struct sched_param *param)
   4433 {
   4434 	return _sched_setscheduler(p, policy, param, true);
   4435 }
   4436 EXPORT_SYMBOL_GPL(sched_setscheduler);
   4437 
   4438 int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
   4439 {
   4440 	return __sched_setscheduler(p, attr, true, true);
   4441 }
   4442 EXPORT_SYMBOL_GPL(sched_setattr);
   4443 
   4444 int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
   4445 {
   4446 	return __sched_setscheduler(p, attr, false, true);
   4447 }
   4448 
   4449 /**
   4450  * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
   4451  * @p: the task in question.
   4452  * @policy: new policy.
   4453  * @param: structure containing the new RT priority.
   4454  *
   4455  * Just like sched_setscheduler, only don't bother checking if the
   4456  * current context has permission.  For example, this is needed in
   4457  * stop_machine(): we create temporary high priority worker threads,
   4458  * but our caller might not have that capability.
   4459  *
   4460  * Return: 0 on success. An error code otherwise.
   4461  */
   4462 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
   4463 			       const struct sched_param *param)
   4464 {
   4465 	return _sched_setscheduler(p, policy, param, false);
   4466 }
   4467 EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
   4468 
   4469 static int
   4470 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
   4471 {
   4472 	struct sched_param lparam;
   4473 	struct task_struct *p;
   4474 	int retval;
   4475 
   4476 	if (!param || pid < 0)
   4477 		return -EINVAL;
   4478 	if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
   4479 		return -EFAULT;
   4480 
   4481 	rcu_read_lock();
   4482 	retval = -ESRCH;
   4483 	p = find_process_by_pid(pid);
   4484 	if (p != NULL)
   4485 		retval = sched_setscheduler(p, policy, &lparam);
   4486 	rcu_read_unlock();
   4487 
   4488 	return retval;
   4489 }
   4490 
   4491 /*
   4492  * Mimics kernel/events/core.c perf_copy_attr().
   4493  */
   4494 static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
   4495 {
   4496 	u32 size;
   4497 	int ret;
   4498 
   4499 	if (!access_ok(uattr, SCHED_ATTR_SIZE_VER0))
   4500 		return -EFAULT;
   4501 
   4502 	/* Zero the full structure, so that a short copy will be nice: */
   4503 	memset(attr, 0, sizeof(*attr));
   4504 
   4505 	ret = get_user(size, &uattr->size);
   4506 	if (ret)
   4507 		return ret;
   4508 
   4509 	/* Bail out on silly large: */
   4510 	if (size > PAGE_SIZE)
   4511 		goto err_size;
   4512 
   4513 	/* ABI compatibility quirk: */
   4514 	if (!size)
   4515 		size = SCHED_ATTR_SIZE_VER0;
   4516 
   4517 	if (size < SCHED_ATTR_SIZE_VER0)
   4518 		goto err_size;
   4519 
   4520 	/*
   4521 	 * If we're handed a bigger struct than we know of,
   4522 	 * ensure all the unknown bits are 0 - i.e. new
   4523 	 * user-space does not rely on any kernel feature
   4524 	 * extensions we don't know about yet.
   4525 	 */
   4526 	if (size > sizeof(*attr)) {
   4527 		unsigned char __user *addr;
   4528 		unsigned char __user *end;
   4529 		unsigned char val;
   4530 
   4531 		addr = (void __user *)uattr + sizeof(*attr);
   4532 		end  = (void __user *)uattr + size;
   4533 
   4534 		for (; addr < end; addr++) {
   4535 			ret = get_user(val, addr);
   4536 			if (ret)
   4537 				return ret;
   4538 			if (val)
   4539 				goto err_size;
   4540 		}
   4541 		size = sizeof(*attr);
   4542 	}
   4543 
   4544 	ret = copy_from_user(attr, uattr, size);
   4545 	if (ret)
   4546 		return -EFAULT;
   4547 
   4548 	/*
   4549 	 * XXX: Do we want to be lenient like existing syscalls; or do we want
   4550 	 * to be strict and return an error on out-of-bounds values?
   4551 	 */
   4552 	attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
   4553 
   4554 	return 0;
   4555 
   4556 err_size:
   4557 	put_user(sizeof(*attr), &uattr->size);
   4558 	return -E2BIG;
   4559 }
   4560 
   4561 /**
   4562  * sys_sched_setscheduler - set/change the scheduler policy and RT priority
   4563  * @pid: the pid in question.
   4564  * @policy: new policy.
   4565  * @param: structure containing the new RT priority.
   4566  *
   4567  * Return: 0 on success. An error code otherwise.
   4568  */
   4569 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
   4570 {
   4571 	if (policy < 0)
   4572 		return -EINVAL;
   4573 
   4574 	return do_sched_setscheduler(pid, policy, param);
   4575 }
   4576 
   4577 /**
   4578  * sys_sched_setparam - set/change the RT priority of a thread
   4579  * @pid: the pid in question.
   4580  * @param: structure containing the new RT priority.
   4581  *
   4582  * Return: 0 on success. An error code otherwise.
   4583  */
   4584 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
   4585 {
   4586 	return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
   4587 }
   4588 
   4589 /**
   4590  * sys_sched_setattr - same as above, but with extended sched_attr
   4591  * @pid: the pid in question.
   4592  * @uattr: structure containing the extended parameters.
   4593  * @flags: for future extension.
   4594  */
   4595 SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
   4596 			       unsigned int, flags)
   4597 {
   4598 	struct sched_attr attr;
   4599 	struct task_struct *p;
   4600 	int retval;
   4601 
   4602 	if (!uattr || pid < 0 || flags)
   4603 		return -EINVAL;
   4604 
   4605 	retval = sched_copy_attr(uattr, &attr);
   4606 	if (retval)
   4607 		return retval;
   4608 
   4609 	if ((int)attr.sched_policy < 0)
   4610 		return -EINVAL;
   4611 
   4612 	rcu_read_lock();
   4613 	retval = -ESRCH;
   4614 	p = find_process_by_pid(pid);
   4615 	if (p != NULL)
   4616 		retval = sched_setattr(p, &attr);
   4617 	rcu_read_unlock();
   4618 
   4619 	return retval;
   4620 }
   4621 
   4622 /**
   4623  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
   4624  * @pid: the pid in question.
   4625  *
   4626  * Return: On success, the policy of the thread. Otherwise, a negative error
   4627  * code.
   4628  */
   4629 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
   4630 {
   4631 	struct task_struct *p;
   4632 	int retval;
   4633 
   4634 	if (pid < 0)
   4635 		return -EINVAL;
   4636 
   4637 	retval = -ESRCH;
   4638 	rcu_read_lock();
   4639 	p = find_process_by_pid(pid);
   4640 	if (p) {
   4641 		retval = security_task_getscheduler(p);
   4642 		if (!retval)
   4643 			retval = p->policy
   4644 				| (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
   4645 	}
   4646 	rcu_read_unlock();
   4647 	return retval;
   4648 }
   4649 
   4650 /**
   4651  * sys_sched_getparam - get the RT priority of a thread
   4652  * @pid: the pid in question.
   4653  * @param: structure containing the RT priority.
   4654  *
   4655  * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
   4656  * code.
   4657  */
   4658 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
   4659 {
   4660 	struct sched_param lp = { .sched_priority = 0 };
   4661 	struct task_struct *p;
   4662 	int retval;
   4663 
   4664 	if (!param || pid < 0)
   4665 		return -EINVAL;
   4666 
   4667 	rcu_read_lock();
   4668 	p = find_process_by_pid(pid);
   4669 	retval = -ESRCH;
   4670 	if (!p)
   4671 		goto out_unlock;
   4672 
   4673 	retval = security_task_getscheduler(p);
   4674 	if (retval)
   4675 		goto out_unlock;
   4676 
   4677 	if (task_has_rt_policy(p))
   4678 		lp.sched_priority = p->rt_priority;
   4679 	rcu_read_unlock();
   4680 
   4681 	/*
   4682 	 * This one might sleep, we cannot do it with a spinlock held ...
   4683 	 */
   4684 	retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
   4685 
   4686 	return retval;
   4687 
   4688 out_unlock:
   4689 	rcu_read_unlock();
   4690 	return retval;
   4691 }
   4692 
   4693 static int sched_read_attr(struct sched_attr __user *uattr,
   4694 			   struct sched_attr *attr,
   4695 			   unsigned int usize)
   4696 {
   4697 	int ret;
   4698 
   4699 	if (!access_ok(uattr, usize))
   4700 		return -EFAULT;
   4701 
   4702 	/*
   4703 	 * If we're handed a smaller struct than we know of,
   4704 	 * ensure all the unknown bits are 0 - i.e. old
   4705 	 * user-space does not get incomplete information.
   4706 	 */
   4707 	if (usize < sizeof(*attr)) {
   4708 		unsigned char *addr;
   4709 		unsigned char *end;
   4710 
   4711 		addr = (void *)attr + usize;
   4712 		end  = (void *)attr + sizeof(*attr);
   4713 
   4714 		for (; addr < end; addr++) {
   4715 			if (*addr)
   4716 				return -EFBIG;
   4717 		}
   4718 
   4719 		attr->size = usize;
   4720 	}
   4721 
   4722 	ret = copy_to_user(uattr, attr, attr->size);
   4723 	if (ret)
   4724 		return -EFAULT;
   4725 
   4726 	return 0;
   4727 }
   4728 
   4729 /**
   4730  * sys_sched_getattr - similar to sched_getparam, but with sched_attr
   4731  * @pid: the pid in question.
   4732  * @uattr: structure containing the extended parameters.
   4733  * @size: sizeof(attr) for fwd/bwd comp.
   4734  * @flags: for future extension.
   4735  */
   4736 SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
   4737 		unsigned int, size, unsigned int, flags)
   4738 {
   4739 	struct sched_attr attr = {
   4740 		.size = sizeof(struct sched_attr),
   4741 	};
   4742 	struct task_struct *p;
   4743 	int retval;
   4744 
   4745 	if (!uattr || pid < 0 || size > PAGE_SIZE ||
   4746 	    size < SCHED_ATTR_SIZE_VER0 || flags)
   4747 		return -EINVAL;
   4748 
   4749 	rcu_read_lock();
   4750 	p = find_process_by_pid(pid);
   4751 	retval = -ESRCH;
   4752 	if (!p)
   4753 		goto out_unlock;
   4754 
   4755 	retval = security_task_getscheduler(p);
   4756 	if (retval)
   4757 		goto out_unlock;
   4758 
   4759 	attr.sched_policy = p->policy;
   4760 	if (p->sched_reset_on_fork)
   4761 		attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
   4762 	if (task_has_dl_policy(p))
   4763 		__getparam_dl(p, &attr);
   4764 	else if (task_has_rt_policy(p))
   4765 		attr.sched_priority = p->rt_priority;
   4766 	else
   4767 		attr.sched_nice = task_nice(p);
   4768 
   4769 	rcu_read_unlock();
   4770 
   4771 	retval = sched_read_attr(uattr, &attr, size);
   4772 	return retval;
   4773 
   4774 out_unlock:
   4775 	rcu_read_unlock();
   4776 	return retval;
   4777 }
   4778 
   4779 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
   4780 {
   4781 	cpumask_var_t cpus_allowed, new_mask;
   4782 	struct task_struct *p;
   4783 	int retval;
   4784 
   4785 	rcu_read_lock();
   4786 
   4787 	p = find_process_by_pid(pid);
   4788 	if (!p) {
   4789 		rcu_read_unlock();
   4790 		return -ESRCH;
   4791 	}
   4792 
   4793 	/* Prevent p going away */
   4794 	get_task_struct(p);
   4795 	rcu_read_unlock();
   4796 
   4797 	if (p->flags & PF_NO_SETAFFINITY) {
   4798 		retval = -EINVAL;
   4799 		goto out_put_task;
   4800 	}
   4801 	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
   4802 		retval = -ENOMEM;
   4803 		goto out_put_task;
   4804 	}
   4805 	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
   4806 		retval = -ENOMEM;
   4807 		goto out_free_cpus_allowed;
   4808 	}
   4809 	retval = -EPERM;
   4810 	if (!check_same_owner(p)) {
   4811 		rcu_read_lock();
   4812 		if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
   4813 			rcu_read_unlock();
   4814 			goto out_free_new_mask;
   4815 		}
   4816 		rcu_read_unlock();
   4817 	}
   4818 
   4819 	retval = security_task_setscheduler(p);
   4820 	if (retval)
   4821 		goto out_free_new_mask;
   4822 
   4823 
   4824 	cpuset_cpus_allowed(p, cpus_allowed);
   4825 	cpumask_and(new_mask, in_mask, cpus_allowed);
   4826 
   4827 	/*
   4828 	 * Since bandwidth control happens on a root_domain basis,
   4829 	 * if the admission test is enabled, we only admit -deadline
   4830 	 * tasks that are allowed to run on all the CPUs in the task's
   4831 	 * root_domain.
   4832 	 */
   4833 #ifdef CONFIG_SMP
   4834 	if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
   4835 		rcu_read_lock();
   4836 		if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
   4837 			retval = -EBUSY;
   4838 			rcu_read_unlock();
   4839 			goto out_free_new_mask;
   4840 		}
   4841 		rcu_read_unlock();
   4842 	}
   4843 #endif
   4844 again:
   4845 	retval = __set_cpus_allowed_ptr(p, new_mask, true);
   4846 
   4847 	if (!retval) {
   4848 		cpuset_cpus_allowed(p, cpus_allowed);
   4849 		if (!cpumask_subset(new_mask, cpus_allowed)) {
   4850 			/*
   4851 			 * We must have raced with a concurrent cpuset
   4852 			 * update. Just reset the cpus_allowed to the
   4853 			 * cpuset's cpus_allowed
   4854 			 */
   4855 			cpumask_copy(new_mask, cpus_allowed);
   4856 			goto again;
   4857 		}
   4858 	}
   4859 out_free_new_mask:
   4860 	free_cpumask_var(new_mask);
   4861 out_free_cpus_allowed:
   4862 	free_cpumask_var(cpus_allowed);
   4863 out_put_task:
   4864 	put_task_struct(p);
   4865 	return retval;
   4866 }
   4867 
   4868 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
   4869 			     struct cpumask *new_mask)
   4870 {
   4871 	if (len < cpumask_size())
   4872 		cpumask_clear(new_mask);
   4873 	else if (len > cpumask_size())
   4874 		len = cpumask_size();
   4875 
   4876 	return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
   4877 }
   4878 
   4879 /**
   4880  * sys_sched_setaffinity - set the CPU affinity of a process
   4881  * @pid: pid of the process
   4882  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
   4883  * @user_mask_ptr: user-space pointer to the new CPU mask
   4884  *
   4885  * Return: 0 on success. An error code otherwise.
   4886  */
   4887 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
   4888 		unsigned long __user *, user_mask_ptr)
   4889 {
   4890 	cpumask_var_t new_mask;
   4891 	int retval;
   4892 
   4893 	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
   4894 		return -ENOMEM;
   4895 
   4896 	retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
   4897 	if (retval == 0)
   4898 		retval = sched_setaffinity(pid, new_mask);
   4899 	free_cpumask_var(new_mask);
   4900 	return retval;
   4901 }
   4902 
   4903 long sched_getaffinity(pid_t pid, struct cpumask *mask)
   4904 {
   4905 	struct task_struct *p;
   4906 	unsigned long flags;
   4907 	int retval;
   4908 
   4909 	rcu_read_lock();
   4910 
   4911 	retval = -ESRCH;
   4912 	p = find_process_by_pid(pid);
   4913 	if (!p)
   4914 		goto out_unlock;
   4915 
   4916 	retval = security_task_getscheduler(p);
   4917 	if (retval)
   4918 		goto out_unlock;
   4919 
   4920 	raw_spin_lock_irqsave(&p->pi_lock, flags);
   4921 	cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
   4922 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
   4923 
   4924 out_unlock:
   4925 	rcu_read_unlock();
   4926 
   4927 	return retval;
   4928 }
   4929 
   4930 /**
   4931  * sys_sched_getaffinity - get the CPU affinity of a process
   4932  * @pid: pid of the process
   4933  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
   4934  * @user_mask_ptr: user-space pointer to hold the current CPU mask
   4935  *
   4936  * Return: size of CPU mask copied to user_mask_ptr on success. An
   4937  * error code otherwise.
   4938  */
   4939 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
   4940 		unsigned long __user *, user_mask_ptr)
   4941 {
   4942 	int ret;
   4943 	cpumask_var_t mask;
   4944 
   4945 	if ((len * BITS_PER_BYTE) < nr_cpu_ids)
   4946 		return -EINVAL;
   4947 	if (len & (sizeof(unsigned long)-1))
   4948 		return -EINVAL;
   4949 
   4950 	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
   4951 		return -ENOMEM;
   4952 
   4953 	ret = sched_getaffinity(pid, mask);
   4954 	if (ret == 0) {
   4955 		unsigned int retlen = min(len, cpumask_size());
   4956 
   4957 		if (copy_to_user(user_mask_ptr, mask, retlen))
   4958 			ret = -EFAULT;
   4959 		else
   4960 			ret = retlen;
   4961 	}
   4962 	free_cpumask_var(mask);
   4963 
   4964 	return ret;
   4965 }
   4966 
   4967 /**
   4968  * sys_sched_yield - yield the current processor to other threads.
   4969  *
   4970  * This function yields the current CPU to other tasks. If there are no
   4971  * other threads running on this CPU then this function will return.
   4972  *
   4973  * Return: 0.
   4974  */
   4975 static void do_sched_yield(void)
   4976 {
   4977 	struct rq_flags rf;
   4978 	struct rq *rq;
   4979 
   4980 	rq = this_rq_lock_irq(&rf);
   4981 
   4982 	schedstat_inc(rq->yld_count);
   4983 	current->sched_class->yield_task(rq);
   4984 
   4985 	/*
   4986 	 * Since we are going to call schedule() anyway, there's
   4987 	 * no need to preempt or enable interrupts:
   4988 	 */
   4989 	preempt_disable();
   4990 	rq_unlock(rq, &rf);
   4991 	sched_preempt_enable_no_resched();
   4992 
   4993 	schedule();
   4994 }
   4995 
   4996 SYSCALL_DEFINE0(sched_yield)
   4997 {
   4998 	do_sched_yield();
   4999 	return 0;
   5000 }
   5001 
   5002 #ifndef CONFIG_PREEMPT
   5003 int __sched _cond_resched(void)
   5004 {
   5005 	if (should_resched(0)) {
   5006 		preempt_schedule_common();
   5007 		return 1;
   5008 	}
   5009 	rcu_all_qs();
   5010 	return 0;
   5011 }
   5012 EXPORT_SYMBOL(_cond_resched);
   5013 #endif
   5014 
   5015 /*
   5016  * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
   5017  * call schedule, and on return reacquire the lock.
   5018  *
   5019  * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
   5020  * operations here to prevent schedule() from being called twice (once via
   5021  * spin_unlock(), once by hand).
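         *
         * A sketch of typical use via the cond_resched_lock() wrapper; more_work()
         * and do_a_chunk() are hypothetical, and the caller must allow for the lock
         * having been dropped and re-taken across the call:
         *
         *	spin_lock(&lock);
         *	while (more_work()) {
         *		do_a_chunk();
         *		cond_resched_lock(&lock);
         *	}
         *	spin_unlock(&lock);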
   5022  */
   5023 int __cond_resched_lock(spinlock_t *lock)
   5024 {
   5025 	int resched = should_resched(PREEMPT_LOCK_OFFSET);
   5026 	int ret = 0;
   5027 
   5028 	lockdep_assert_held(lock);
   5029 
   5030 	if (spin_needbreak(lock) || resched) {
   5031 		spin_unlock(lock);
   5032 		if (resched)
   5033 			preempt_schedule_common();
   5034 		else
   5035 			cpu_relax();
   5036 		ret = 1;
   5037 		spin_lock(lock);
   5038 	}
   5039 	return ret;
   5040 }
   5041 EXPORT_SYMBOL(__cond_resched_lock);
   5042 
   5043 /**
   5044  * yield - yield the current processor to other threads.
   5045  *
   5046  * Do not ever use this function, there's a 99% chance you're doing it wrong.
   5047  *
   5048  * The scheduler is at all times free to pick the calling task as the most
   5049  * eligible task to run; if removing the yield() call from your code breaks
   5050  * it, it's already broken.
   5051  *
   5052  * Typical broken usage is:
   5053  *
   5054  * while (!event)
   5055  *	yield();
   5056  *
   5057  * where one assumes that yield() will let 'the other' process run that will
   5058  * make event true. If the current task is a SCHED_FIFO task that will never
   5059  * happen. Never use yield() as a progress guarantee!!
   5060  *
   5061  * If you want to use yield() to wait for something, use wait_event().
   5062  * If you want to use yield() to be 'nice' for others, use cond_resched().
   5063  * If you still want to use yield(), do not!
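         *
         * A sketch of the usual replacement for the broken loop above, assuming a
         * wait_queue_head_t 'wq' that the producer of 'event' wakes up:
         *
         *	wait_event(wq, event);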
   5064  */
   5065 void __sched yield(void)
   5066 {
   5067 	set_current_state(TASK_RUNNING);
   5068 	do_sched_yield();
   5069 }
   5070 EXPORT_SYMBOL(yield);
   5071 
   5072 /**
   5073  * yield_to - yield the current processor to another thread in
   5074  * your thread group, or accelerate that thread toward the
   5075  * processor it's on.
   5076  * @p: target task
   5077  * @preempt: whether task preemption is allowed or not
   5078  *
   5079  * It's the caller's job to ensure that the target task struct
   5080  * can't go away on us before we can do any checks.
   5081  *
   5082  * Return:
   5083  *	true (>0) if we indeed boosted the target task.
   5084  *	false (0) if we failed to boost the target.
   5085  *	-ESRCH if there's no task to yield to.
   5086  */
   5087 int __sched yield_to(struct task_struct *p, bool preempt)
   5088 {
   5089 	struct task_struct *curr = current;
   5090 	struct rq *rq, *p_rq;
   5091 	unsigned long flags;
   5092 	int yielded = 0;
   5093 
   5094 	local_irq_save(flags);
   5095 	rq = this_rq();
   5096 
   5097 again:
   5098 	p_rq = task_rq(p);
   5099 	/*
   5100 	 * If we're the only runnable task on the rq and target rq also
   5101 	 * has only one task, there's absolutely no point in yielding.
   5102 	 */
   5103 	if (rq->nr_running == 1 && p_rq->nr_running == 1) {
   5104 		yielded = -ESRCH;
   5105 		goto out_irq;
   5106 	}
   5107 
   5108 	double_rq_lock(rq, p_rq);
   5109 	if (task_rq(p) != p_rq) {
   5110 		double_rq_unlock(rq, p_rq);
   5111 		goto again;
   5112 	}
   5113 
   5114 	if (!curr->sched_class->yield_to_task)
   5115 		goto out_unlock;
   5116 
   5117 	if (curr->sched_class != p->sched_class)
   5118 		goto out_unlock;
   5119 
   5120 	if (task_running(p_rq, p) || p->state)
   5121 		goto out_unlock;
   5122 
   5123 	yielded = curr->sched_class->yield_to_task(rq, p, preempt);
   5124 	if (yielded) {
   5125 		schedstat_inc(rq->yld_count);
   5126 		/*
   5127 		 * Make p's CPU reschedule; pick_next_entity takes care of
   5128 		 * fairness.
   5129 		 */
   5130 		if (preempt && rq != p_rq)
   5131 			resched_curr(p_rq);
   5132 	}
   5133 
   5134 out_unlock:
   5135 	double_rq_unlock(rq, p_rq);
   5136 out_irq:
   5137 	local_irq_restore(flags);
   5138 
   5139 	if (yielded > 0)
   5140 		schedule();
   5141 
   5142 	return yielded;
   5143 }
   5144 EXPORT_SYMBOL_GPL(yield_to);
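/*
 * Illustrative sketch (hypothetical caller, not from this file): a directed
 * yield user, e.g. a virtualization layer boosting a lock-holder vCPU,
 * would typically inspect the tri-state return value documented above.
 *
 *	ret = yield_to(target_task, true);
 *	if (ret > 0)
 *		;	// boosted @target_task and rescheduled ourselves
 *	else if (ret == 0)
 *		;	// boost failed, try another candidate
 *	else
 *		;	// -ESRCH: nobody worth yielding to
 */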
   5145 
   5146 int io_schedule_prepare(void)
   5147 {
   5148 	int old_iowait = current->in_iowait;
   5149 
   5150 	current->in_iowait = 1;
   5151 	blk_schedule_flush_plug(current);
   5152 
   5153 	return old_iowait;
   5154 }
   5155 
   5156 void io_schedule_finish(int token)
   5157 {
   5158 	current->in_iowait = token;
   5159 }
   5160 
   5161 /*
   5162  * This task is about to go to sleep on IO. Increment rq->nr_iowait so
   5163  * that process accounting knows that this is a task in IO wait state.
   5164  */
   5165 long __sched io_schedule_timeout(long timeout)
   5166 {
   5167 	int token;
   5168 	long ret;
   5169 
   5170 	token = io_schedule_prepare();
   5171 	ret = schedule_timeout(timeout);
   5172 	io_schedule_finish(token);
   5173 
   5174 	return ret;
   5175 }
   5176 EXPORT_SYMBOL(io_schedule_timeout);
   5177 
   5178 void io_schedule(void)
   5179 {
   5180 	int token;
   5181 
   5182 	token = io_schedule_prepare();
   5183 	schedule();
   5184 	io_schedule_finish(token);
   5185 }
   5186 EXPORT_SYMBOL(io_schedule);
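/*
 * Illustrative sketch (assumption): io_schedule_prepare()/io_schedule_finish()
 * can bracket a custom blocking primitive so the sleep is accounted as iowait,
 * mirroring what io_schedule() and io_schedule_timeout() do above. foo_mutex
 * is a hypothetical lock whose wait should count as I/O wait.
 *
 *	int tok = io_schedule_prepare();
 *	mutex_lock(&foo_mutex);
 *	io_schedule_finish(tok);
 */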
   5187 
   5188 /**
   5189  * sys_sched_get_priority_max - return maximum RT priority.
   5190  * @policy: scheduling class.
   5191  *
   5192  * Return: On success, this syscall returns the maximum
   5193  * rt_priority that can be used by a given scheduling class.
   5194  * On failure, a negative error code is returned.
   5195  */
   5196 SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
   5197 {
   5198 	int ret = -EINVAL;
   5199 
   5200 	switch (policy) {
   5201 	case SCHED_FIFO:
   5202 	case SCHED_RR:
   5203 		ret = MAX_USER_RT_PRIO-1;
   5204 		break;
   5205 	case SCHED_DEADLINE:
   5206 	case SCHED_NORMAL:
   5207 	case SCHED_BATCH:
   5208 	case SCHED_IDLE:
   5209 		ret = 0;
   5210 		break;
   5211 	}
   5212 	return ret;
   5213 }
   5214 
   5215 /**
   5216  * sys_sched_get_priority_min - return minimum RT priority.
   5217  * @policy: scheduling class.
   5218  *
   5219  * Return: On success, this syscall returns the minimum
   5220  * rt_priority that can be used by a given scheduling class.
   5221  * On failure, a negative error code is returned.
   5222  */
   5223 SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
   5224 {
   5225 	int ret = -EINVAL;
   5226 
   5227 	switch (policy) {
   5228 	case SCHED_FIFO:
   5229 	case SCHED_RR:
   5230 		ret = 1;
   5231 		break;
   5232 	case SCHED_DEADLINE:
   5233 	case SCHED_NORMAL:
   5234 	case SCHED_BATCH:
   5235 	case SCHED_IDLE:
   5236 		ret = 0;
   5237 	}
   5238 	return ret;
   5239 }
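/*
 * Illustrative userspace sketch (assumption): with the default
 * MAX_USER_RT_PRIO of 100, the two syscalls above report a priority range
 * of 1..99 for SCHED_FIFO and SCHED_RR, and 0..0 for the other policies.
 *
 *	int max = sched_get_priority_max(SCHED_FIFO);	// 99 on a default build
 *	int min = sched_get_priority_min(SCHED_FIFO);	// 1
 */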
   5240 
   5241 static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
   5242 {
   5243 	struct task_struct *p;
   5244 	unsigned int time_slice;
   5245 	struct rq_flags rf;
   5246 	struct rq *rq;
   5247 	int retval;
   5248 
   5249 	if (pid < 0)
   5250 		return -EINVAL;
   5251 
   5252 	retval = -ESRCH;
   5253 	rcu_read_lock();
   5254 	p = find_process_by_pid(pid);
   5255 	if (!p)
   5256 		goto out_unlock;
   5257 
   5258 	retval = security_task_getscheduler(p);
   5259 	if (retval)
   5260 		goto out_unlock;
   5261 
   5262 	rq = task_rq_lock(p, &rf);
   5263 	time_slice = 0;
   5264 	if (p->sched_class->get_rr_interval)
   5265 		time_slice = p->sched_class->get_rr_interval(rq, p);
   5266 	task_rq_unlock(rq, p, &rf);
   5267 
   5268 	rcu_read_unlock();
   5269 	jiffies_to_timespec64(time_slice, t);
   5270 	return 0;
   5271 
   5272 out_unlock:
   5273 	rcu_read_unlock();
   5274 	return retval;
   5275 }
   5276 
   5277 /**
   5278  * sys_sched_rr_get_interval - return the default timeslice of a process.
   5279  * @pid: pid of the process.
   5280  * @interval: userspace pointer to the timeslice value.
   5281  *
   5282  * This syscall writes the default timeslice value of a given process
   5283  * into the user-space timespec buffer. A value of '0' means infinity.
   5284  *
   5285  * Return: On success, 0 and the timeslice is in @interval. Otherwise,
   5286  * an error code.
   5287  */
   5288 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
   5289 		struct __kernel_timespec __user *, interval)
   5290 {
   5291 	struct timespec64 t;
   5292 	int retval = sched_rr_get_interval(pid, &t);
   5293 
   5294 	if (retval == 0)
   5295 		retval = put_timespec64(&t, interval);
   5296 
   5297 	return retval;
   5298 }
   5299 
   5300 #ifdef CONFIG_COMPAT_32BIT_TIME
   5301 SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
   5302 		struct old_timespec32 __user *, interval)
   5303 {
   5304 	struct timespec64 t;
   5305 	int retval = sched_rr_get_interval(pid, &t);
   5306 
   5307 	if (retval == 0)
   5308 		retval = put_old_timespec32(&t, interval);
   5309 	return retval;
   5310 }
   5311 #endif
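/*
 * Illustrative userspace sketch (assumption): querying the round-robin
 * timeslice of the calling task via the syscall wrapper.
 *
 *	struct timespec ts;
 *
 *	if (sched_rr_get_interval(0, &ts) == 0)
 *		printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
 *	// 0.000000000 means "infinity", e.g. for non-SCHED_RR policies
 */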
   5312 
   5313 void sched_show_task(struct task_struct *p)
   5314 {
   5315 	unsigned long free = 0;
   5316 	int ppid;
   5317 
   5318 	if (!try_get_task_stack(p))
   5319 		return;
   5320 
   5321 	printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
   5322 
   5323 	if (p->state == TASK_RUNNING)
   5324 		printk(KERN_CONT "  running task    ");
   5325 #ifdef CONFIG_DEBUG_STACK_USAGE
   5326 	free = stack_not_used(p);
   5327 #endif
   5328 	ppid = 0;
   5329 	rcu_read_lock();
   5330 	if (pid_alive(p))
   5331 		ppid = task_pid_nr(rcu_dereference(p->real_parent));
   5332 	rcu_read_unlock();
   5333 	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
   5334 		task_pid_nr(p), ppid,
   5335 		(unsigned long)task_thread_info(p)->flags);
   5336 
   5337 	print_worker_info(KERN_INFO, p);
   5338 	show_stack(p, NULL);
   5339 	put_task_stack(p);
   5340 }
   5341 EXPORT_SYMBOL_GPL(sched_show_task);
   5342 
   5343 static inline bool
   5344 state_filter_match(unsigned long state_filter, struct task_struct *p)
   5345 {
   5346 	/* no filter, everything matches */
   5347 	if (!state_filter)
   5348 		return true;
   5349 
   5350 	/* filter, but doesn't match */
   5351 	if (!(p->state & state_filter))
   5352 		return false;
   5353 
   5354 	/*
   5355 	 * When looking for TASK_UNINTERRUPTIBLE, skip TASK_IDLE (this allows
   5356 	 * TASK_KILLABLE).
   5357 	 */
   5358 	if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE)
   5359 		return false;
   5360 
   5361 	return true;
   5362 }
   5363 
   5364 
   5365 void show_state_filter(unsigned long state_filter)
   5366 {
   5367 	struct task_struct *g, *p;
   5368 
   5369 #if BITS_PER_LONG == 32
   5370 	printk(KERN_INFO
   5371 		"  task                PC stack   pid father\n");
   5372 #else
   5373 	printk(KERN_INFO
   5374 		"  task                        PC stack   pid father\n");
   5375 #endif
   5376 	rcu_read_lock();
   5377 	for_each_process_thread(g, p) {
   5378 		/*
   5379 		 * Reset the NMI watchdog timeout; listing all tasks on a slow
   5380 		 * console might take a lot of time.
   5381 		 * Also, reset softlockup watchdogs on all CPUs, because
   5382 		 * another CPU might be blocked waiting for us to process
   5383 		 * an IPI.
   5384 		 */
   5385 		touch_nmi_watchdog();
   5386 		touch_all_softlockup_watchdogs();
   5387 		if (state_filter_match(state_filter, p))
   5388 			sched_show_task(p);
   5389 	}
   5390 
   5391 #ifdef CONFIG_SCHED_DEBUG
   5392 	if (!state_filter)
   5393 		sysrq_sched_debug_show();
   5394 #endif
   5395 	rcu_read_unlock();
   5396 	/*
   5397 	 * Only show locks if all tasks are dumped:
   5398 	 */
   5399 	if (!state_filter)
   5400 		debug_show_all_locks();
   5401 }
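/*
 * Illustrative note (assumption): the usual callers are the sysrq handlers,
 * roughly:
 *
 *	show_state_filter(0);				// sysrq-t: dump every task
 *	show_state_filter(TASK_UNINTERRUPTIBLE);	// sysrq-w: blocked tasks only
 */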
   5402 
   5403 /**
   5404  * init_idle - set up an idle thread for a given CPU
   5405  * @idle: task in question
   5406  * @cpu: CPU the idle task belongs to
   5407  *
   5408  * NOTE: this function does not set the idle thread's NEED_RESCHED
   5409  * flag, to make booting more robust.
   5410  */
   5411 void init_idle(struct task_struct *idle, int cpu)
   5412 {
   5413 	struct rq *rq = cpu_rq(cpu);
   5414 	unsigned long flags;
   5415 
   5416 	raw_spin_lock_irqsave(&idle->pi_lock, flags);
   5417 	raw_spin_lock(&rq->lock);
   5418 
   5419 	__sched_fork(0, idle);
   5420 	idle->state = TASK_RUNNING;
   5421 	idle->se.exec_start = sched_clock();
   5422 	idle->flags |= PF_IDLE;
   5423 
   5424 	kasan_unpoison_task_stack(idle);
   5425 
   5426 #ifdef CONFIG_SMP
   5427 	/*
   5428 	 * It's possible that init_idle() gets called multiple times on a task;
   5429 	 * in that case do_set_cpus_allowed() will not do the right thing.
   5430 	 *
   5431 	 * And since this is boot, we can forgo the serialization.
   5432 	 */
   5433 	set_cpus_allowed_common(idle, cpumask_of(cpu));
   5434 #endif
   5435 	/*
   5436 	 * We have a chicken-and-egg problem: even though we are
   5437 	 * holding rq->lock, the task's CPU isn't yet set to this one, so the
   5438 	 * lockdep check in task_group() will fail.
   5439 	 *
   5440 	 * This is similar to the sched_fork() case. Alternatively, we could
   5441 	 * use task_rq_lock() here and obtain the other rq->lock.
   5442 	 *
   5443 	 * Silence PROVE_RCU
   5444 	 */
   5445 	rcu_read_lock();
   5446 	__set_task_cpu(idle, cpu);
   5447 	rcu_read_unlock();
   5448 
   5449 	rq->curr = rq->idle = idle;
   5450 	idle->on_rq = TASK_ON_RQ_QUEUED;
   5451 #ifdef CONFIG_SMP
   5452 	idle->on_cpu = 1;
   5453 #endif
   5454 	raw_spin_unlock(&rq->lock);
   5455 	raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
   5456 
   5457 	/* Set the preempt count _outside_ the spinlocks! */
   5458 	init_idle_preempt_count(idle, cpu);
   5459 
   5460 	/*
   5461 	 * The idle tasks have their own, simple scheduling class:
   5462 	 */
   5463 	idle->sched_class = &idle_sched_class;
   5464 	ftrace_graph_init_idle_task(idle, cpu);
   5465 	vtime_init_idle(idle, cpu);
   5466 #ifdef CONFIG_SMP
   5467 	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
   5468 #endif
   5469 }
   5470 
   5471 #ifdef CONFIG_SMP
   5472 
   5473 int cpuset_cpumask_can_shrink(const struct cpumask *cur,
   5474 			      const struct cpumask *trial)
   5475 {
   5476 	int ret = 1;
   5477 
   5478 	if (!cpumask_weight(cur))
   5479 		return ret;
   5480 
   5481 	ret = dl_cpuset_cpumask_can_shrink(cur, trial);
   5482 
   5483 	return ret;
   5484 }
   5485 
   5486 int task_can_attach(struct task_struct *p,
   5487 		    const struct cpumask *cs_cpus_allowed)
   5488 {
   5489 	int ret = 0;
   5490 
   5491 	/*
   5492 	 * Kthreads which disallow setaffinity shouldn't be moved
   5493 	 * to a new cpuset; we don't want to change their CPU
   5494 	 * affinity and isolating such threads by their set of
   5495 	 * allowed nodes is unnecessary.  Thus, cpusets are not
   5496 	 * applicable for such threads.  This also means we need not check
   5497 	 * set_cpus_allowed_ptr() for success on all attached tasks
   5498 	 * before cpus_allowed may be changed.
   5499 	 */
   5500 	if (p->flags & PF_NO_SETAFFINITY) {
   5501 		ret = -EINVAL;
   5502 		goto out;
   5503 	}
   5504 
   5505 	if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
   5506 					      cs_cpus_allowed))
   5507 		ret = dl_task_can_attach(p, cs_cpus_allowed);
   5508 
   5509 out:
   5510 	return ret;
   5511 }
   5512 
   5513 bool sched_smp_initialized __read_mostly;
   5514 
   5515 #ifdef CONFIG_NUMA_BALANCING
   5516 /* Migrate current task p to target_cpu */
   5517 int migrate_task_to(struct task_struct *p, int target_cpu)
   5518 {
   5519 	struct migration_arg arg = { p, target_cpu };
   5520 	int curr_cpu = task_cpu(p);
   5521 
   5522 	if (curr_cpu == target_cpu)
   5523 		return 0;
   5524 
   5525 	if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed))
   5526 		return -EINVAL;
   5527 
   5528 	/* TODO: This is not properly updating schedstats */
   5529 
   5530 	trace_sched_move_numa(p, curr_cpu, target_cpu);
   5531 	return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
   5532 }
   5533 
   5534 /*
   5535  * Requeue a task on a given node and accurately track the number of NUMA
   5536  * tasks on the runqueues
   5537  */
   5538 void sched_setnuma(struct task_struct *p, int nid)
   5539 {
   5540 	bool queued, running;
   5541 	struct rq_flags rf;
   5542 	struct rq *rq;
   5543 
   5544 	rq = task_rq_lock(p, &rf);
   5545 	queued = task_on_rq_queued(p);
   5546 	running = task_current(rq, p);
   5547 
   5548 	if (queued)
   5549 		dequeue_task(rq, p, DEQUEUE_SAVE);
   5550 	if (running)
   5551 		put_prev_task(rq, p);
   5552 
   5553 	p->numa_preferred_nid = nid;
   5554 
   5555 	if (queued)
   5556 		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
   5557 	if (running)
   5558 		set_curr_task(rq, p);
   5559 	task_rq_unlock(rq, p, &rf);
   5560 }
   5561 #endif /* CONFIG_NUMA_BALANCING */
   5562 
   5563 #ifdef CONFIG_HOTPLUG_CPU
   5564 /*
   5565  * Ensure that the idle task is using init_mm right before its CPU goes
   5566  * offline.
   5567  */
   5568 void idle_task_exit(void)
   5569 {
   5570 	struct mm_struct *mm = current->active_mm;
   5571 
   5572 	BUG_ON(cpu_online(smp_processor_id()));
   5573 
   5574 	if (mm != &init_mm) {
   5575 		switch_mm(mm, &init_mm, current);
   5576 		current->active_mm = &init_mm;
   5577 		finish_arch_post_lock_switch();
   5578 	}
   5579 	mmdrop(mm);
   5580 }
   5581 
   5582 /*
   5583  * Since this CPU is going 'away' for a while, fold any nr_active delta
   5584  * we might have. Assumes we're called after migrate_tasks() so that the
   5585  * nr_active count is stable. We need to take the teardown thread which
   5586  * is calling this into account, so we hand in adjust = 1 to the load
   5587  * calculation.
   5588  *
   5589  * Also see the comment "Global load-average calculations".
   5590  */
   5591 static void calc_load_migrate(struct rq *rq)
   5592 {
   5593 	long delta = calc_load_fold_active(rq, 1);
   5594 	if (delta)
   5595 		atomic_long_add(delta, &calc_load_tasks);
   5596 }
   5597 
   5598 static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
   5599 {
   5600 }
   5601 
   5602 static const struct sched_class fake_sched_class = {
   5603 	.put_prev_task = put_prev_task_fake,
   5604 };
   5605 
   5606 static struct task_struct fake_task = {
   5607 	/*
   5608 	 * Avoid pull_{rt,dl}_task()
   5609 	 */
   5610 	.prio = MAX_PRIO + 1,
   5611 	.sched_class = &fake_sched_class,
   5612 };
   5613 
   5614 /*
   5615  * Migrate all tasks from the rq; sleeping tasks will be migrated by
   5616  * try_to_wake_up()->select_task_rq().
   5617  *
   5618  * Called with rq->lock held even though we're in stop_machine() and
   5619  * there's no concurrency possible; we hold the required locks anyway
   5620  * because of lock validation efforts.
   5621  */
   5622 static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
   5623 {
   5624 	struct rq *rq = dead_rq;
   5625 	struct task_struct *next, *stop = rq->stop;
   5626 	struct rq_flags orf = *rf;
   5627 	int dest_cpu;
   5628 
   5629 	/*
   5630 	 * Fudge the rq selection such that the below task selection loop
   5631 	 * doesn't get stuck on the currently eligible stop task.
   5632 	 *
   5633 	 * We're currently inside stop_machine() and the rq is either stuck
   5634 	 * in the stop_machine_cpu_stop() loop, or we're executing this code;
   5635 	 * either way we should never end up calling schedule() until we're
   5636 	 * done here.
   5637 	 */
   5638 	rq->stop = NULL;
   5639 
   5640 	/*
   5641 	 * The put_prev_task() and pick_next_task() sched
   5642 	 * class methods both need to have an up-to-date
   5643 	 * value of rq->clock[_task].
   5644 	 */
   5645 	update_rq_clock(rq);
   5646 
   5647 	for (;;) {
   5648 		/*
   5649 		 * There's this thread running; bail when that's the only
   5650 		 * remaining thread:
   5651 		 */
   5652 		if (rq->nr_running == 1)
   5653 			break;
   5654 
   5655 		/*
   5656 		 * pick_next_task() assumes pinned rq->lock:
   5657 		 */
   5658 		next = pick_next_task(rq, &fake_task, rf);
   5659 		BUG_ON(!next);
   5660 		put_prev_task(rq, next);
   5661 
   5662 		/*
   5663 		 * The rules for changing task_struct::cpus_allowed require holding
   5664 		 * both pi_lock and rq->lock, such that holding either
   5665 		 * stabilizes the mask.
   5666 		 *
   5667 		 * Dropping rq->lock is not quite as disastrous as it usually is,
   5668 		 * because the CPU is !cpu_active at this point, which means
   5669 		 * load-balance will not interfere. Also, we're inside stop-machine.
   5670 		 */
   5671 		rq_unlock(rq, rf);
   5672 		raw_spin_lock(&next->pi_lock);
   5673 		rq_relock(rq, rf);
   5674 
   5675 		/*
   5676 		 * Since we're inside stop-machine, _nothing_ should have
   5677 		 * changed the task; WARN if weird stuff happened, because in
   5678 		 * that case the above rq->lock drop is a fail too.
   5679 		 */
   5680 		if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
   5681 			raw_spin_unlock(&next->pi_lock);
   5682 			continue;
   5683 		}
   5684 
   5685 		/* Find suitable destination for @next, with force if needed. */
   5686 		dest_cpu = select_fallback_rq(dead_rq->cpu, next);
   5687 		rq = __migrate_task(rq, rf, next, dest_cpu);
   5688 		if (rq != dead_rq) {
   5689 			rq_unlock(rq, rf);
   5690 			rq = dead_rq;
   5691 			*rf = orf;
   5692 			rq_relock(rq, rf);
   5693 		}
   5694 		raw_spin_unlock(&next->pi_lock);
   5695 	}
   5696 
   5697 	rq->stop = stop;
   5698 }
   5699 #endif /* CONFIG_HOTPLUG_CPU */
   5700 
   5701 void set_rq_online(struct rq *rq)
   5702 {
   5703 	if (!rq->online) {
   5704 		const struct sched_class *class;
   5705 
   5706 		cpumask_set_cpu(rq->cpu, rq->rd->online);
   5707 		rq->online = 1;
   5708 
   5709 		for_each_class(class) {
   5710 			if (class->rq_online)
   5711 				class->rq_online(rq);
   5712 		}
   5713 	}
   5714 }
   5715 
   5716 void set_rq_offline(struct rq *rq)
   5717 {
   5718 	if (rq->online) {
   5719 		const struct sched_class *class;
   5720 
   5721 		for_each_class(class) {
   5722 			if (class->rq_offline)
   5723 				class->rq_offline(rq);
   5724 		}
   5725 
   5726 		cpumask_clear_cpu(rq->cpu, rq->rd->online);
   5727 		rq->online = 0;
   5728 	}
   5729 }
   5730 
   5731 /*
   5732  * used to mark begin/end of suspend/resume:
   5733  */
   5734 static int num_cpus_frozen;
   5735 
   5736 /*
   5737  * Update cpusets according to cpu_active mask.  If cpusets are
   5738  * disabled, cpuset_update_active_cpus() becomes a simple wrapper
   5739  * around partition_sched_domains().
   5740  *
   5741  * If we come here as part of a suspend/resume, don't touch cpusets because we
   5742  * want to restore it back to its original state upon resume anyway.
   5743  */
   5744 static void cpuset_cpu_active(void)
   5745 {
   5746 	if (cpuhp_tasks_frozen) {
   5747 		/*
   5748 		 * num_cpus_frozen tracks how many CPUs are involved in the
   5749 		 * suspend/resume sequence. As long as this is not the last online
   5750 		 * operation in the resume sequence, just build a single sched
   5751 		 * domain, ignoring cpusets.
   5752 		 */
   5753 		partition_sched_domains(1, NULL, NULL);
   5754 		if (--num_cpus_frozen)
   5755 			return;
   5756 		/*
   5757 		 * This is the last CPU online operation. So fall through and
   5758 		 * restore the original sched domains by considering the
   5759 		 * cpuset configurations.
   5760 		 */
   5761 		cpuset_force_rebuild();
   5762 	}
   5763 	cpuset_update_active_cpus();
   5764 }
   5765 
   5766 static int cpuset_cpu_inactive(unsigned int cpu)
   5767 {
   5768 	if (!cpuhp_tasks_frozen) {
   5769 		if (dl_cpu_busy(cpu))
   5770 			return -EBUSY;
   5771 		cpuset_update_active_cpus();
   5772 	} else {
   5773 		num_cpus_frozen++;
   5774 		partition_sched_domains(1, NULL, NULL);
   5775 	}
   5776 	return 0;
   5777 }
   5778 
   5779 int sched_cpu_activate(unsigned int cpu)
   5780 {
   5781 	struct rq *rq = cpu_rq(cpu);
   5782 	struct rq_flags rf;
   5783 
   5784 #ifdef CONFIG_SCHED_SMT
   5785 	/*
   5786 	 * When going up, increment the number of cores with SMT present.
   5787 	 */
   5788 	if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
   5789 		static_branch_inc_cpuslocked(&sched_smt_present);
   5790 #endif
   5791 	set_cpu_active(cpu, true);
   5792 
   5793 	if (sched_smp_initialized) {
   5794 		sched_domains_numa_masks_set(cpu);
   5795 		cpuset_cpu_active();
   5796 	}
   5797 
   5798 	/*
   5799 	 * Put the rq online, if not already. This happens:
   5800 	 *
   5801 	 * 1) In the early boot process, because we build the real domains
   5802 	 *    after all CPUs have been brought up.
   5803 	 *
   5804 	 * 2) At runtime, if cpuset_cpu_active() fails to rebuild the
   5805 	 *    domains.
   5806 	 */
   5807 	rq_lock_irqsave(rq, &rf);
   5808 	if (rq->rd) {
   5809 		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
   5810 		set_rq_online(rq);
   5811 	}
   5812 	rq_unlock_irqrestore(rq, &rf);
   5813 
   5814 	update_max_interval();
   5815 
   5816 	return 0;
   5817 }
   5818 
   5819 int sched_cpu_deactivate(unsigned int cpu)
   5820 {
   5821 	int ret;
   5822 
   5823 	set_cpu_active(cpu, false);
   5824 	/*
   5825 	 * We've cleared cpu_active_mask; wait for all preempt-disabled and RCU
   5826 	 * users of this state to go away such that all new such users will
   5827 	 * observe it.
   5828 	 *
   5829 	 * Synchronize before parking the smpboot threads to handle the RCU boost case.
   5830 	 */
   5831 	synchronize_rcu();
   5832 
   5833 #ifdef CONFIG_SCHED_SMT
   5834 	/*
   5835 	 * When going down, decrement the number of cores with SMT present.
   5836 	 */
   5837 	if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
   5838 		static_branch_dec_cpuslocked(&sched_smt_present);
   5839 #endif
   5840 
   5841 	if (!sched_smp_initialized)
   5842 		return 0;
   5843 
   5844 	ret = cpuset_cpu_inactive(cpu);
   5845 	if (ret) {
   5846 		set_cpu_active(cpu, true);
   5847 		return ret;
   5848 	}
   5849 	sched_domains_numa_masks_clear(cpu);
   5850 	return 0;
   5851 }
   5852 
   5853 static void sched_rq_cpu_starting(unsigned int cpu)
   5854 {
   5855 	struct rq *rq = cpu_rq(cpu);
   5856 
   5857 	rq->calc_load_update = calc_load_update;
   5858 	update_max_interval();
   5859 }
   5860 
   5861 int sched_cpu_starting(unsigned int cpu)
   5862 {
   5863 	sched_rq_cpu_starting(cpu);
   5864 	sched_tick_start(cpu);
   5865 	return 0;
   5866 }
   5867 
   5868 #ifdef CONFIG_HOTPLUG_CPU
   5869 int sched_cpu_dying(unsigned int cpu)
   5870 {
   5871 	struct rq *rq = cpu_rq(cpu);
   5872 	struct rq_flags rf;
   5873 
   5874 	/* Handle pending wakeups and then migrate everything off */
   5875 	sched_ttwu_pending();
   5876 	sched_tick_stop(cpu);
   5877 
   5878 	rq_lock_irqsave(rq, &rf);
   5879 	if (rq->rd) {
   5880 		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
   5881 		set_rq_offline(rq);
   5882 	}
   5883 	migrate_tasks(rq, &rf);
   5884 	BUG_ON(rq->nr_running != 1);
   5885 	rq_unlock_irqrestore(rq, &rf);
   5886 
   5887 	calc_load_migrate(rq);
   5888 	update_max_interval();
   5889 	nohz_balance_exit_idle(rq);
   5890 	hrtick_clear(rq);
   5891 	return 0;
   5892 }
   5893 #endif
   5894 
   5895 void __init sched_init_smp(void)
   5896 {
   5897 	sched_init_numa();
   5898 
   5899 	/*
   5900 	 * There's no userspace yet to cause hotplug operations; hence all the
   5901 	 * CPU masks are stable and all blatant races in the below code cannot
   5902 	 * happen.
   5903 	 */
   5904 	mutex_lock(&sched_domains_mutex);
   5905 	sched_init_domains(cpu_active_mask);
   5906 	mutex_unlock(&sched_domains_mutex);
   5907 
   5908 	/* Move init over to a non-isolated CPU */
   5909 	if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
   5910 		BUG();
   5911 	sched_init_granularity();
   5912 
   5913 	init_sched_rt_class();
   5914 	init_sched_dl_class();
   5915 
   5916 	sched_smp_initialized = true;
   5917 }
   5918 
   5919 static int __init migration_init(void)
   5920 {
   5921 	sched_rq_cpu_starting(smp_processor_id());
   5922 	return 0;
   5923 }
   5924 early_initcall(migration_init);
   5925 
   5926 #else
   5927 void __init sched_init_smp(void)
   5928 {
   5929 	sched_init_granularity();
   5930 }
   5931 #endif /* CONFIG_SMP */
   5932 
   5933 int in_sched_functions(unsigned long addr)
   5934 {
   5935 	return in_lock_functions(addr) ||
   5936 		(addr >= (unsigned long)__sched_text_start
   5937 		&& addr < (unsigned long)__sched_text_end);
   5938 }
   5939 
   5940 #ifdef CONFIG_CGROUP_SCHED
   5941 /*
   5942  * Default task group.
   5943  * Every task in system belongs to this group at bootup.
   5944  */
   5945 struct task_group root_task_group;
   5946 LIST_HEAD(task_groups);
   5947 
   5948 /* Cacheline aligned slab cache for task_group */
   5949 static struct kmem_cache *task_group_cache __read_mostly;
   5950 #endif
   5951 
   5952 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
   5953 DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
   5954 
   5955 void __init sched_init(void)
   5956 {
   5957 	int i, j;
   5958 	unsigned long alloc_size = 0, ptr;
   5959 
   5960 	wait_bit_init();
   5961 
   5962 #ifdef CONFIG_FAIR_GROUP_SCHED
   5963 	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
   5964 #endif
   5965 #ifdef CONFIG_RT_GROUP_SCHED
   5966 	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
   5967 #endif
   5968 	if (alloc_size) {
   5969 		ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
   5970 
   5971 #ifdef CONFIG_FAIR_GROUP_SCHED
   5972 		root_task_group.se = (struct sched_entity **)ptr;
   5973 		ptr += nr_cpu_ids * sizeof(void **);
   5974 
   5975 		root_task_group.cfs_rq = (struct cfs_rq **)ptr;
   5976 		ptr += nr_cpu_ids * sizeof(void **);
   5977 
   5978 #endif /* CONFIG_FAIR_GROUP_SCHED */
   5979 #ifdef CONFIG_RT_GROUP_SCHED
   5980 		root_task_group.rt_se = (struct sched_rt_entity **)ptr;
   5981 		ptr += nr_cpu_ids * sizeof(void **);
   5982 
   5983 		root_task_group.rt_rq = (struct rt_rq **)ptr;
   5984 		ptr += nr_cpu_ids * sizeof(void **);
   5985 
   5986 #endif /* CONFIG_RT_GROUP_SCHED */
   5987 	}
   5988 #ifdef CONFIG_CPUMASK_OFFSTACK
   5989 	for_each_possible_cpu(i) {
   5990 		per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
   5991 			cpumask_size(), GFP_KERNEL, cpu_to_node(i));
   5992 		per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
   5993 			cpumask_size(), GFP_KERNEL, cpu_to_node(i));
   5994 	}
   5995 #endif /* CONFIG_CPUMASK_OFFSTACK */
   5996 
   5997 	init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
   5998 	init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime());
   5999 
   6000 #ifdef CONFIG_SMP
   6001 	init_defrootdomain();
   6002 #endif
   6003 
   6004 #ifdef CONFIG_RT_GROUP_SCHED
   6005 	init_rt_bandwidth(&root_task_group.rt_bandwidth,
   6006 			global_rt_period(), global_rt_runtime());
   6007 #endif /* CONFIG_RT_GROUP_SCHED */
   6008 
   6009 #ifdef CONFIG_CGROUP_SCHED
   6010 	task_group_cache = KMEM_CACHE(task_group, 0);
   6011 
   6012 	list_add(&root_task_group.list, &task_groups);
   6013 	INIT_LIST_HEAD(&root_task_group.children);
   6014 	INIT_LIST_HEAD(&root_task_group.siblings);
   6015 	autogroup_init(&init_task);
   6016 #endif /* CONFIG_CGROUP_SCHED */
   6017 
   6018 	for_each_possible_cpu(i) {
   6019 		struct rq *rq;
   6020 
   6021 		rq = cpu_rq(i);
   6022 		raw_spin_lock_init(&rq->lock);
   6023 		rq->nr_running = 0;
   6024 		rq->calc_load_active = 0;
   6025 		rq->calc_load_update = jiffies + LOAD_FREQ;
   6026 		init_cfs_rq(&rq->cfs);
   6027 		init_rt_rq(&rq->rt);
   6028 		init_dl_rq(&rq->dl);
   6029 #ifdef CONFIG_FAIR_GROUP_SCHED
   6030 		root_task_group.shares = ROOT_TASK_GROUP_LOAD;
   6031 		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
   6032 		rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
   6033 		/*
   6034 		 * How much CPU bandwidth does root_task_group get?
   6035 		 *
   6036 		 * In the case of task groups formed through the cgroup filesystem, it
   6037 		 * gets 100% of the CPU resources in the system. This overall
   6038 		 * system CPU resource is divided among the tasks of
   6039 		 * root_task_group and its child task-groups in a fair manner,
   6040 		 * based on each entity's (task or task-group's) weight
   6041 		 * (se->load.weight).
   6042 		 *
   6043 		 * In other words, if root_task_group has 10 tasks of weight
   6044 		 * 1024 and two child groups A0 and A1 (of weight 1024 each),
   6045 		 * then A0's share of the CPU resource is:
   6046 		 *
   6047 		 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
   6048 		 *
   6049 		 * We achieve this by letting root_task_group's tasks sit
   6050 		 * directly in rq->cfs (i.e. root_task_group->se[] = NULL).
   6051 		 */
   6052 		init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
   6053 		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
   6054 #endif /* CONFIG_FAIR_GROUP_SCHED */
   6055 
   6056 		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
   6057 #ifdef CONFIG_RT_GROUP_SCHED
   6058 		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
   6059 #endif
   6060 
   6061 		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
   6062 			rq->cpu_load[j] = 0;
   6063 
   6064 #ifdef CONFIG_SMP
   6065 		rq->sd = NULL;
   6066 		rq->rd = NULL;
   6067 		rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
   6068 		rq->balance_callback = NULL;
   6069 		rq->active_balance = 0;
   6070 		rq->next_balance = jiffies;
   6071 		rq->push_cpu = 0;
   6072 		rq->cpu = i;
   6073 		rq->online = 0;
   6074 		rq->idle_stamp = 0;
   6075 		rq->avg_idle = 2*sysctl_sched_migration_cost;
   6076 		rq->max_idle_balance_cost = sysctl_sched_migration_cost;
   6077 
   6078 		INIT_LIST_HEAD(&rq->cfs_tasks);
   6079 
   6080 		rq_attach_root(rq, &def_root_domain);
   6081 #ifdef CONFIG_NO_HZ_COMMON
   6082 		rq->last_load_update_tick = jiffies;
   6083 		rq->last_blocked_load_update_tick = jiffies;
   6084 		atomic_set(&rq->nohz_flags, 0);
   6085 #endif
   6086 #endif /* CONFIG_SMP */
   6087 		hrtick_rq_init(rq);
   6088 		atomic_set(&rq->nr_iowait, 0);
   6089 	}
   6090 
   6091 	set_load_weight(&init_task, false);
   6092 
   6093 	/*
   6094 	 * The boot idle thread does lazy MMU switching as well:
   6095 	 */
   6096 	mmgrab(&init_mm);
   6097 	enter_lazy_tlb(&init_mm, current);
   6098 
   6099 	/*
   6100 	 * Make us the idle thread. Technically, schedule() should not be
   6101 	 * called from this thread; however, somewhere below it might be,
   6102 	 * but because we are the idle thread, we just pick up running again
   6103 	 * when this runqueue becomes "idle".
   6104 	 */
   6105 	init_idle(current, smp_processor_id());
   6106 
   6107 	calc_load_update = jiffies + LOAD_FREQ;
   6108 
   6109 #ifdef CONFIG_SMP
   6110 	idle_thread_set_boot_cpu();
   6111 #endif
   6112 	init_sched_fair_class();
   6113 
   6114 	init_schedstats();
   6115 
   6116 	psi_init();
   6117 
   6118 	scheduler_running = 1;
   6119 }
   6120 
   6121 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
   6122 static inline int preempt_count_equals(int preempt_offset)
   6123 {
   6124 	int nested = preempt_count() + rcu_preempt_depth();
   6125 
   6126 	return (nested == preempt_offset);
   6127 }
   6128 
   6129 void __might_sleep(const char *file, int line, int preempt_offset)
   6130 {
   6131 	/*
   6132 	 * Blocking primitives will set (and therefore destroy) current->state.
   6133 	 * Since we will exit with TASK_RUNNING, make sure we enter with it;
   6134 	 * otherwise we will destroy state.
   6135 	 */
   6136 	WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
   6137 			"do not call blocking ops when !TASK_RUNNING; "
   6138 			"state=%lx set at [<%p>] %pS\n",
   6139 			current->state,
   6140 			(void *)current->task_state_change,
   6141 			(void *)current->task_state_change);
   6142 
   6143 	___might_sleep(file, line, preempt_offset);
   6144 }
   6145 EXPORT_SYMBOL(__might_sleep);
   6146 
   6147 void ___might_sleep(const char *file, int line, int preempt_offset)
   6148 {
   6149 	/* Ratelimiting timestamp: */
   6150 	static unsigned long prev_jiffy;
   6151 
   6152 	unsigned long preempt_disable_ip;
   6153 
   6154 	/* WARN_ON_ONCE() by default, no rate limit required: */
   6155 	rcu_sleep_check();
   6156 
   6157 	if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
   6158 	     !is_idle_task(current)) ||
   6159 	    system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
   6160 	    oops_in_progress)
   6161 		return;
   6162 
   6163 	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
   6164 		return;
   6165 	prev_jiffy = jiffies;
   6166 
   6167 	/* Save this before calling printk(), since that will clobber it: */
   6168 	preempt_disable_ip = get_preempt_disable_ip(current);
   6169 
   6170 	printk(KERN_ERR
   6171 		"BUG: sleeping function called from invalid context at %s:%d\n",
   6172 			file, line);
   6173 	printk(KERN_ERR
   6174 		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
   6175 			in_atomic(), irqs_disabled(),
   6176 			current->pid, current->comm);
   6177 
   6178 	if (task_stack_end_corrupted(current))
   6179 		printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
   6180 
   6181 	debug_show_held_locks(current);
   6182 	if (irqs_disabled())
   6183 		print_irqtrace_events(current);
   6184 	if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
   6185 	    && !preempt_count_equals(preempt_offset)) {
   6186 		pr_err("Preemption disabled at:");
   6187 		print_ip_sym(preempt_disable_ip);
   6188 		pr_cont("\n");
   6189 	}
   6190 	dump_stack();
   6191 	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
   6192 }
   6193 EXPORT_SYMBOL(___might_sleep);
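/*
 * Illustrative sketch (assumption): the classic way to trigger the splat
 * printed above is sleeping inside an atomic section; foo_lock is a
 * hypothetical spinlock.
 *
 *	spin_lock(&foo_lock);
 *	msleep(10);	// -> "BUG: sleeping function called from invalid context"
 *	spin_unlock(&foo_lock);
 */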
   6194 
   6195 void __cant_sleep(const char *file, int line, int preempt_offset)
   6196 {
   6197 	static unsigned long prev_jiffy;
   6198 
   6199 	if (irqs_disabled())
   6200 		return;
   6201 
   6202 	if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
   6203 		return;
   6204 
   6205 	if (preempt_count() > preempt_offset)
   6206 		return;
   6207 
   6208 	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
   6209 		return;
   6210 	prev_jiffy = jiffies;
   6211 
   6212 	printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
   6213 	printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
   6214 			in_atomic(), irqs_disabled(),
   6215 			current->pid, current->comm);
   6216 
   6217 	debug_show_held_locks(current);
   6218 	dump_stack();
   6219 	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
   6220 }
   6221 EXPORT_SYMBOL_GPL(__cant_sleep);
   6222 #endif
   6223 
   6224 #ifdef CONFIG_MAGIC_SYSRQ
   6225 void normalize_rt_tasks(void)
   6226 {
   6227 	struct task_struct *g, *p;
   6228 	struct sched_attr attr = {
   6229 		.sched_policy = SCHED_NORMAL,
   6230 	};
   6231 
   6232 	read_lock(&tasklist_lock);
   6233 	for_each_process_thread(g, p) {
   6234 		/*
   6235 		 * Only normalize user tasks:
   6236 		 */
   6237 		if (p->flags & PF_KTHREAD)
   6238 			continue;
   6239 
   6240 		p->se.exec_start = 0;
   6241 		schedstat_set(p->se.statistics.wait_start,  0);
   6242 		schedstat_set(p->se.statistics.sleep_start, 0);
   6243 		schedstat_set(p->se.statistics.block_start, 0);
   6244 
   6245 		if (!dl_task(p) && !rt_task(p)) {
   6246 			/*
   6247 			 * Renice negative nice level userspace
   6248 			 * tasks back to 0:
   6249 			 */
   6250 			if (task_nice(p) < 0)
   6251 				set_user_nice(p, 0);
   6252 			continue;
   6253 		}
   6254 
   6255 		__sched_setscheduler(p, &attr, false, false);
   6256 	}
   6257 	read_unlock(&tasklist_lock);
   6258 }
   6259 
   6260 #endif /* CONFIG_MAGIC_SYSRQ */
   6261 
   6262 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
   6263 /*
   6264  * These functions are only useful for the IA64 MCA handling, or kdb.
   6265  *
   6266  * They can only be called when the whole system has been
   6267  * stopped - every CPU needs to be quiescent, and no scheduling
   6268  * activity can take place. Using them for anything else would
   6269  * be a serious bug, and as a result, they aren't even visible
   6270  * under any other configuration.
   6271  */
   6272 
   6273 /**
   6274  * curr_task - return the current task for a given CPU.
   6275  * @cpu: the processor in question.
   6276  *
   6277  * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
   6278  *
   6279  * Return: The current task for @cpu.
   6280  */
   6281 struct task_struct *curr_task(int cpu)
   6282 {
   6283 	return cpu_curr(cpu);
   6284 }
   6285 
   6286 #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
   6287 
   6288 #ifdef CONFIG_IA64
   6289 /**
   6290  * set_curr_task - set the current task for a given CPU.
   6291  * @cpu: the processor in question.
   6292  * @p: the task pointer to set.
   6293  *
   6294  * Description: This function must only be used when non-maskable interrupts
   6295  * are serviced on a separate stack. It allows the architecture to switch the
   6296  * notion of the current task on a CPU in a non-blocking manner. This function
   6297  * must be called with all CPUs synchronized and interrupts disabled; the
   6298  * caller must save the original value of the current task (see
   6299  * curr_task() above) and restore that value before re-enabling interrupts and
   6300  * restarting the system.
   6301  *
   6302  * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
   6303  */
   6304 void ia64_set_curr_task(int cpu, struct task_struct *p)
   6305 {
   6306 	cpu_curr(cpu) = p;
   6307 }
   6308 
   6309 #endif
   6310 
   6311 #ifdef CONFIG_CGROUP_SCHED
   6312 /* task_group_lock serializes the addition/removal of task groups */
   6313 static DEFINE_SPINLOCK(task_group_lock);
   6314 
   6315 static void sched_free_group(struct task_group *tg)
   6316 {
   6317 	free_fair_sched_group(tg);
   6318 	free_rt_sched_group(tg);
   6319 	autogroup_free(tg);
   6320 	kmem_cache_free(task_group_cache, tg);
   6321 }
   6322 
   6323 /* allocate runqueue etc for a new task group */
   6324 struct task_group *sched_create_group(struct task_group *parent)
   6325 {
   6326 	struct task_group *tg;
   6327 
   6328 	tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
   6329 	if (!tg)
   6330 		return ERR_PTR(-ENOMEM);
   6331 
   6332 	if (!alloc_fair_sched_group(tg, parent))
   6333 		goto err;
   6334 
   6335 	if (!alloc_rt_sched_group(tg, parent))
   6336 		goto err;
   6337 
   6338 	return tg;
   6339 
   6340 err:
   6341 	sched_free_group(tg);
   6342 	return ERR_PTR(-ENOMEM);
   6343 }
   6344 
   6345 void sched_online_group(struct task_group *tg, struct task_group *parent)
   6346 {
   6347 	unsigned long flags;
   6348 
   6349 	spin_lock_irqsave(&task_group_lock, flags);
   6350 	list_add_rcu(&tg->list, &task_groups);
   6351 
   6352 	/* Root should already exist: */
   6353 	WARN_ON(!parent);
   6354 
   6355 	tg->parent = parent;
   6356 	INIT_LIST_HEAD(&tg->children);
   6357 	list_add_rcu(&tg->siblings, &parent->children);
   6358 	spin_unlock_irqrestore(&task_group_lock, flags);
   6359 
   6360 	online_fair_sched_group(tg);
   6361 }
   6362 
   6363 /* rcu callback to free various structures associated with a task group */
   6364 static void sched_free_group_rcu(struct rcu_head *rhp)
   6365 {
   6366 	/* Now it should be safe to free those cfs_rqs: */
   6367 	sched_free_group(container_of(rhp, struct task_group, rcu));
   6368 }
   6369 
   6370 void sched_destroy_group(struct task_group *tg)
   6371 {
   6372 	/* Wait for possible concurrent references to cfs_rqs to complete: */
   6373 	call_rcu(&tg->rcu, sched_free_group_rcu);
   6374 }
   6375 
   6376 void sched_offline_group(struct task_group *tg)
   6377 {
   6378 	unsigned long flags;
   6379 
   6380 	/* End participation in shares distribution: */
   6381 	unregister_fair_sched_group(tg);
   6382 
   6383 	spin_lock_irqsave(&task_group_lock, flags);
   6384 	list_del_rcu(&tg->list);
   6385 	list_del_rcu(&tg->siblings);
   6386 	spin_unlock_irqrestore(&task_group_lock, flags);
   6387 }
   6388 
   6389 static void sched_change_group(struct task_struct *tsk, int type)
   6390 {
   6391 	struct task_group *tg;
   6392 
   6393 	/*
   6394 	 * All callers are synchronized by task_rq_lock(); we do not use RCU,
   6395 	 * which would be pointless here. Thus, we pass "true" to task_css_check()
   6396 	 * to prevent lockdep warnings.
   6397 	 */
   6398 	tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
   6399 			  struct task_group, css);
   6400 	tg = autogroup_task_group(tsk, tg);
   6401 	tsk->sched_task_group = tg;
   6402 
   6403 #ifdef CONFIG_FAIR_GROUP_SCHED
   6404 	if (tsk->sched_class->task_change_group)
   6405 		tsk->sched_class->task_change_group(tsk, type);
   6406 	else
   6407 #endif
   6408 		set_task_rq(tsk, task_cpu(tsk));
   6409 }
   6410 
   6411 /*
   6412  * Change task's runqueue when it moves between groups.
   6413  *
   6414  * The caller of this function should have put the task in its new group by
   6415  * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
   6416  * its new group.
   6417  */
   6418 void sched_move_task(struct task_struct *tsk)
   6419 {
   6420 	int queued, running, queue_flags =
   6421 		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
   6422 	struct rq_flags rf;
   6423 	struct rq *rq;
   6424 
   6425 	rq = task_rq_lock(tsk, &rf);
   6426 	update_rq_clock(rq);
   6427 
   6428 	running = task_current(rq, tsk);
   6429 	queued = task_on_rq_queued(tsk);
   6430 
   6431 	if (queued)
   6432 		dequeue_task(rq, tsk, queue_flags);
   6433 	if (running)
   6434 		put_prev_task(rq, tsk);
   6435 
   6436 	sched_change_group(tsk, TASK_MOVE_GROUP);
   6437 
   6438 	if (queued)
   6439 		enqueue_task(rq, tsk, queue_flags);
   6440 	if (running)
   6441 		set_curr_task(rq, tsk);
   6442 
   6443 	task_rq_unlock(rq, tsk, &rf);
   6444 }
   6445 
   6446 static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
   6447 {
   6448 	return css ? container_of(css, struct task_group, css) : NULL;
   6449 }
   6450 
   6451 static struct cgroup_subsys_state *
   6452 cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
   6453 {
   6454 	struct task_group *parent = css_tg(parent_css);
   6455 	struct task_group *tg;
   6456 
   6457 	if (!parent) {
   6458 		/* This is early initialization for the top cgroup */
   6459 		return &root_task_group.css;
   6460 	}
   6461 
   6462 	tg = sched_create_group(parent);
   6463 	if (IS_ERR(tg))
   6464 		return ERR_PTR(-ENOMEM);
   6465 
   6466 	return &tg->css;
   6467 }
   6468 
   6469 /* Expose task group only after completing cgroup initialization */
   6470 static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
   6471 {
   6472 	struct task_group *tg = css_tg(css);
   6473 	struct task_group *parent = css_tg(css->parent);
   6474 
   6475 	if (parent)
   6476 		sched_online_group(tg, parent);
   6477 	return 0;
   6478 }
   6479 
   6480 static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
   6481 {
   6482 	struct task_group *tg = css_tg(css);
   6483 
   6484 	sched_offline_group(tg);
   6485 }
   6486 
   6487 static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
   6488 {
   6489 	struct task_group *tg = css_tg(css);
   6490 
   6491 	/*
   6492 	 * Relies on the RCU grace period between css_released() and this.
   6493 	 */
   6494 	sched_free_group(tg);
   6495 }
   6496 
   6497 /*
   6498  * This is called before wake_up_new_task(); therefore we really only
   6499  * have to set its group bits, and all the other stuff does not apply.
   6500  */
   6501 static void cpu_cgroup_fork(struct task_struct *task)
   6502 {
   6503 	struct rq_flags rf;
   6504 	struct rq *rq;
   6505 
   6506 	rq = task_rq_lock(task, &rf);
   6507 
   6508 	update_rq_clock(rq);
   6509 	sched_change_group(task, TASK_SET_GROUP);
   6510 
   6511 	task_rq_unlock(rq, task, &rf);
   6512 }
   6513 
   6514 static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
   6515 {
   6516 	struct task_struct *task;
   6517 	struct cgroup_subsys_state *css;
   6518 	int ret = 0;
   6519 
   6520 	cgroup_taskset_for_each(task, css, tset) {
   6521 #ifdef CONFIG_RT_GROUP_SCHED
   6522 		if (!sched_rt_can_attach(css_tg(css), task))
   6523 			return -EINVAL;
   6524 #else
   6525 		/* We don't support RT-tasks being in separate groups */
   6526 		if (task->sched_class != &fair_sched_class)
   6527 			return -EINVAL;
   6528 #endif
   6529 		/*
   6530 		 * Serialize against wake_up_new_task() such that if it's
   6531 		 * running, we're sure to observe its full state.
   6532 		 */
   6533 		raw_spin_lock_irq(&task->pi_lock);
   6534 		/*
   6535 		 * Avoid calling sched_move_task() before wake_up_new_task()
   6536 		 * has happened. This would lead to problems with PELT, due to
   6537 		 * move wanting to detach+attach while we're not attached yet.
   6538 		 */
   6539 		if (task->state == TASK_NEW)
   6540 			ret = -EINVAL;
   6541 		raw_spin_unlock_irq(&task->pi_lock);
   6542 
   6543 		if (ret)
   6544 			break;
   6545 	}
   6546 	return ret;
   6547 }
   6548 
   6549 static void cpu_cgroup_attach(struct cgroup_taskset *tset)
   6550 {
   6551 	struct task_struct *task;
   6552 	struct cgroup_subsys_state *css;
   6553 
   6554 	cgroup_taskset_for_each(task, css, tset)
   6555 		sched_move_task(task);
   6556 }
   6557 
   6558 #ifdef CONFIG_FAIR_GROUP_SCHED
   6559 static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
   6560 				struct cftype *cftype, u64 shareval)
   6561 {
   6562 	return sched_group_set_shares(css_tg(css), scale_load(shareval));
   6563 }
   6564 
   6565 static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
   6566 			       struct cftype *cft)
   6567 {
   6568 	struct task_group *tg = css_tg(css);
   6569 
   6570 	return (u64) scale_load_down(tg->shares);
   6571 }
   6572 
   6573 #ifdef CONFIG_CFS_BANDWIDTH
   6574 static DEFINE_MUTEX(cfs_constraints_mutex);
   6575 
   6576 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
   6577 const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
   6578 
   6579 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
   6580 
   6581 static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
   6582 {
   6583 	int i, ret = 0, runtime_enabled, runtime_was_enabled;
   6584 	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
   6585 
   6586 	if (tg == &root_task_group)
   6587 		return -EINVAL;
   6588 
   6589 	/*
   6590 	 * Ensure we have some amount of bandwidth every period.  This is
   6591 	 * to prevent reaching a state of large arrears when throttled via
   6592 	 * entity_tick() resulting in prolonged exit starvation.
   6593 	 */
   6594 	if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
   6595 		return -EINVAL;
   6596 
   6597 	/*
   6598 	 * Likewise, bound things on the other side by preventing insane quota
   6599 	 * periods.  This also allows us to normalize when computing quota
   6600 	 * feasibility.
   6601 	 */
   6602 	if (period > max_cfs_quota_period)
   6603 		return -EINVAL;
   6604 
   6605 	/*
   6606 	 * Prevent race between setting of cfs_rq->runtime_enabled and
   6607 	 * unthrottle_offline_cfs_rqs().
   6608 	 */
   6609 	get_online_cpus();
   6610 	mutex_lock(&cfs_constraints_mutex);
   6611 	ret = __cfs_schedulable(tg, period, quota);
   6612 	if (ret)
   6613 		goto out_unlock;
   6614 
   6615 	runtime_enabled = quota != RUNTIME_INF;
   6616 	runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
   6617 	/*
   6618 	 * If we need to toggle cfs_bandwidth_used, off->on must occur
   6619 	 * before making related changes, and on->off must occur afterwards.
   6620 	 */
   6621 	if (runtime_enabled && !runtime_was_enabled)
   6622 		cfs_bandwidth_usage_inc();
   6623 	raw_spin_lock_irq(&cfs_b->lock);
   6624 	cfs_b->period = ns_to_ktime(period);
   6625 	cfs_b->quota = quota;
   6626 
   6627 	__refill_cfs_bandwidth_runtime(cfs_b);
   6628 
   6629 	/* Restart the period timer (if active) to handle new period expiry: */
   6630 	if (runtime_enabled)
   6631 		start_cfs_bandwidth(cfs_b);
   6632 
   6633 	raw_spin_unlock_irq(&cfs_b->lock);
   6634 
   6635 	for_each_online_cpu(i) {
   6636 		struct cfs_rq *cfs_rq = tg->cfs_rq[i];
   6637 		struct rq *rq = cfs_rq->rq;
   6638 		struct rq_flags rf;
   6639 
   6640 		rq_lock_irq(rq, &rf);
   6641 		cfs_rq->runtime_enabled = runtime_enabled;
   6642 		cfs_rq->runtime_remaining = 0;
   6643 
   6644 		if (cfs_rq->throttled)
   6645 			unthrottle_cfs_rq(cfs_rq);
   6646 		rq_unlock_irq(rq, &rf);
   6647 	}
   6648 	if (runtime_was_enabled && !runtime_enabled)
   6649 		cfs_bandwidth_usage_dec();
   6650 out_unlock:
   6651 	mutex_unlock(&cfs_constraints_mutex);
   6652 	put_online_cpus();
   6653 
   6654 	return ret;
   6655 }
   6656 
   6657 int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
   6658 {
   6659 	u64 quota, period;
   6660 
   6661 	period = ktime_to_ns(tg->cfs_bandwidth.period);
   6662 	if (cfs_quota_us < 0)
   6663 		quota = RUNTIME_INF;
   6664 	else
   6665 		quota = (u64)cfs_quota_us * NSEC_PER_USEC;
   6666 
   6667 	return tg_set_cfs_bandwidth(tg, period, quota);
   6668 }
   6669 
   6670 long tg_get_cfs_quota(struct task_group *tg)
   6671 {
   6672 	u64 quota_us;
   6673 
   6674 	if (tg->cfs_bandwidth.quota == RUNTIME_INF)
   6675 		return -1;
   6676 
   6677 	quota_us = tg->cfs_bandwidth.quota;
   6678 	do_div(quota_us, NSEC_PER_USEC);
   6679 
   6680 	return quota_us;
   6681 }
   6682 
   6683 int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
   6684 {
   6685 	u64 quota, period;
   6686 
   6687 	period = (u64)cfs_period_us * NSEC_PER_USEC;
   6688 	quota = tg->cfs_bandwidth.quota;
   6689 
   6690 	return tg_set_cfs_bandwidth(tg, period, quota);
   6691 }
   6692 
   6693 long tg_get_cfs_period(struct task_group *tg)
   6694 {
   6695 	u64 cfs_period_us;
   6696 
   6697 	cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
   6698 	do_div(cfs_period_us, NSEC_PER_USEC);
   6699 
   6700 	return cfs_period_us;
   6701 }
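/*
 * Illustrative sketch (assumption): limiting a group to half of one CPU via
 * the helpers above, as the cgroup1 knobs below would do.
 *
 *	tg_set_cfs_period(tg, 100000);	// period = 100ms
 *	tg_set_cfs_quota(tg, 50000);	// quota  =  50ms -> 50% of one CPU
 *	tg_set_cfs_quota(tg, -1);	// back to RUNTIME_INF (no limit)
 */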
   6702 
   6703 static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
   6704 				  struct cftype *cft)
   6705 {
   6706 	return tg_get_cfs_quota(css_tg(css));
   6707 }
   6708 
   6709 static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
   6710 				   struct cftype *cftype, s64 cfs_quota_us)
   6711 {
   6712 	return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
   6713 }
   6714 
   6715 static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
   6716 				   struct cftype *cft)
   6717 {
   6718 	return tg_get_cfs_period(css_tg(css));
   6719 }
   6720 
   6721 static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
   6722 				    struct cftype *cftype, u64 cfs_period_us)
   6723 {
   6724 	return tg_set_cfs_period(css_tg(css), cfs_period_us);
   6725 }
   6726 
   6727 struct cfs_schedulable_data {
   6728 	struct task_group *tg;
   6729 	u64 period, quota;
   6730 };
   6731 
   6732 /*
   6733  * normalize group quota/period to be quota/max_period
   6734  * note: units are usecs
   6735  */
   6736 static u64 normalize_cfs_quota(struct task_group *tg,
   6737 			       struct cfs_schedulable_data *d)
   6738 {
   6739 	u64 quota, period;
   6740 
   6741 	if (tg == d->tg) {
   6742 		period = d->period;
   6743 		quota = d->quota;
   6744 	} else {
   6745 		period = tg_get_cfs_period(tg);
   6746 		quota = tg_get_cfs_quota(tg);
   6747 	}
   6748 
   6749 	/* note: these should typically be equivalent */
   6750 	if (quota == RUNTIME_INF || quota == -1)
   6751 		return RUNTIME_INF;
   6752 
   6753 	return to_ratio(period, quota);
   6754 }
   6755 
   6756 static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
   6757 {
   6758 	struct cfs_schedulable_data *d = data;
   6759 	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
   6760 	s64 quota = 0, parent_quota = -1;
   6761 
   6762 	if (!tg->parent) {
   6763 		quota = RUNTIME_INF;
   6764 	} else {
   6765 		struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
   6766 
   6767 		quota = normalize_cfs_quota(tg, d);
   6768 		parent_quota = parent_b->hierarchical_quota;
   6769 
   6770 		/*
   6771 		 * Ensure max(child_quota) <= parent_quota.  On cgroup2,
   6772 		 * always take the min.  On cgroup1, only inherit when no
   6773 		 * limit is set:
   6774 		 */
   6775 		if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) {
   6776 			quota = min(quota, parent_quota);
   6777 		} else {
   6778 			if (quota == RUNTIME_INF)
   6779 				quota = parent_quota;
   6780 			else if (parent_quota != RUNTIME_INF && quota > parent_quota)
   6781 				return -EINVAL;
   6782 		}
   6783 	}
   6784 	cfs_b->hierarchical_quota = quota;
   6785 
   6786 	return 0;
   6787 }
   6788 
   6789 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
   6790 {
   6791 	int ret;
   6792 	struct cfs_schedulable_data data = {
   6793 		.tg = tg,
   6794 		.period = period,
   6795 		.quota = quota,
   6796 	};
   6797 
   6798 	if (quota != RUNTIME_INF) {
   6799 		do_div(data.period, NSEC_PER_USEC);
   6800 		do_div(data.quota, NSEC_PER_USEC);
   6801 	}
   6802 
   6803 	rcu_read_lock();
   6804 	ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
   6805 	rcu_read_unlock();
   6806 
   6807 	return ret;
   6808 }
   6809 
   6810 static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
   6811 {
   6812 	struct task_group *tg = css_tg(seq_css(sf));
   6813 	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
   6814 
   6815 	seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
   6816 	seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
   6817 	seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
   6818 
   6819 	if (schedstat_enabled() && tg != &root_task_group) {
   6820 		u64 ws = 0;
   6821 		int i;
   6822 
   6823 		for_each_possible_cpu(i)
   6824 			ws += schedstat_val(tg->se[i]->statistics.wait_sum);
   6825 
   6826 		seq_printf(sf, "wait_sum %llu\n", ws);
   6827 	}
   6828 
   6829 	return 0;
   6830 }
   6831 #endif /* CONFIG_CFS_BANDWIDTH */
   6832 #endif /* CONFIG_FAIR_GROUP_SCHED */
   6833 
   6834 #ifdef CONFIG_RT_GROUP_SCHED
   6835 static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
   6836 				struct cftype *cft, s64 val)
   6837 {
   6838 	return sched_group_set_rt_runtime(css_tg(css), val);
   6839 }
   6840 
   6841 static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
   6842 			       struct cftype *cft)
   6843 {
   6844 	return sched_group_rt_runtime(css_tg(css));
   6845 }
   6846 
   6847 static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
   6848 				    struct cftype *cftype, u64 rt_period_us)
   6849 {
   6850 	return sched_group_set_rt_period(css_tg(css), rt_period_us);
   6851 }
   6852 
   6853 static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
   6854 				   struct cftype *cft)
   6855 {
   6856 	return sched_group_rt_period(css_tg(css));
   6857 }
   6858 #endif /* CONFIG_RT_GROUP_SCHED */
   6859 
   6860 static struct cftype cpu_legacy_files[] = {
   6861 #ifdef CONFIG_FAIR_GROUP_SCHED
   6862 	{
   6863 		.name = "shares",
   6864 		.read_u64 = cpu_shares_read_u64,
   6865 		.write_u64 = cpu_shares_write_u64,
   6866 	},
   6867 #endif
   6868 #ifdef CONFIG_CFS_BANDWIDTH
   6869 	{
   6870 		.name = "cfs_quota_us",
   6871 		.read_s64 = cpu_cfs_quota_read_s64,
   6872 		.write_s64 = cpu_cfs_quota_write_s64,
   6873 	},
   6874 	{
   6875 		.name = "cfs_period_us",
   6876 		.read_u64 = cpu_cfs_period_read_u64,
   6877 		.write_u64 = cpu_cfs_period_write_u64,
   6878 	},
   6879 	{
   6880 		.name = "stat",
   6881 		.seq_show = cpu_cfs_stat_show,
   6882 	},
   6883 #endif
   6884 #ifdef CONFIG_RT_GROUP_SCHED
   6885 	{
   6886 		.name = "rt_runtime_us",
   6887 		.read_s64 = cpu_rt_runtime_read,
   6888 		.write_s64 = cpu_rt_runtime_write,
   6889 	},
   6890 	{
   6891 		.name = "rt_period_us",
   6892 		.read_u64 = cpu_rt_period_read_uint,
   6893 		.write_u64 = cpu_rt_period_write_uint,
   6894 	},
   6895 #endif
   6896 	{ }	/* Terminate */
   6897 };
   6898 
   6899 static int cpu_extra_stat_show(struct seq_file *sf,
   6900 			       struct cgroup_subsys_state *css)
   6901 {
   6902 #ifdef CONFIG_CFS_BANDWIDTH
   6903 	{
   6904 		struct task_group *tg = css_tg(css);
   6905 		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
   6906 		u64 throttled_usec;
   6907 
   6908 		throttled_usec = cfs_b->throttled_time;
   6909 		do_div(throttled_usec, NSEC_PER_USEC);
   6910 
   6911 		seq_printf(sf, "nr_periods %d\n"
   6912 			   "nr_throttled %d\n"
   6913 			   "throttled_usec %llu\n",
   6914 			   cfs_b->nr_periods, cfs_b->nr_throttled,
   6915 			   throttled_usec);
   6916 	}
   6917 #endif
   6918 	return 0;
   6919 }
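/*
 * Illustrative sketch: unlike the legacy handler above, the extra cgroup v2
 * cpu.stat lines report throttled time in microseconds, so a
 * cfs_b->throttled_time of 1892000000 ns is printed as
 * "throttled_usec 1892000" after the do_div() by NSEC_PER_USEC.
 */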
   6920 
   6921 #ifdef CONFIG_FAIR_GROUP_SCHED
   6922 static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
   6923 			       struct cftype *cft)
   6924 {
   6925 	struct task_group *tg = css_tg(css);
   6926 	u64 weight = scale_load_down(tg->shares);
   6927 
   6928 	return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
   6929 }
   6930 
   6931 static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
   6932 				struct cftype *cft, u64 weight)
   6933 {
   6934 	/*
   6935 	 * cgroup weight knobs should use the common MIN, DFL and MAX
   6936 	 * values which are 1, 100 and 10000 respectively.  While it loses
   6937 	 * a bit of range on both ends, it maps pretty well onto the shares
    6938 	 * value used by the scheduler and the round-trip conversions preserve
   6939 	 * the original value over the entire range.
   6940 	 */
   6941 	if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
   6942 		return -ERANGE;
   6943 
   6944 	weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
   6945 
   6946 	return sched_group_set_shares(css_tg(css), scale_load(weight));
   6947 }
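/*
 * Worked example of the round-trip described above, using the common weight
 * constants quoted there (MIN 1, DFL 100, MAX 10000):
 *
 *	write 1     -> shares DIV_ROUND_CLOSEST(1     * 1024, 100) = 10
 *	write 100   -> shares DIV_ROUND_CLOSEST(100   * 1024, 100) = 1024
 *	write 10000 -> shares DIV_ROUND_CLOSEST(10000 * 1024, 100) = 102400
 *
 * and reading back through cpu_weight_read_u64():
 *
 *	10     -> DIV_ROUND_CLOSEST(10     * 100, 1024) = 1
 *	1024   -> DIV_ROUND_CLOSEST(1024   * 100, 1024) = 100
 *	102400 -> DIV_ROUND_CLOSEST(102400 * 100, 1024) = 10000
 *
 * so every weight in [1, 10000] survives the conversion unchanged.
 */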
   6948 
   6949 static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
   6950 				    struct cftype *cft)
   6951 {
   6952 	unsigned long weight = scale_load_down(css_tg(css)->shares);
   6953 	int last_delta = INT_MAX;
   6954 	int prio, delta;
   6955 
   6956 	/* find the closest nice value to the current weight */
   6957 	for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) {
   6958 		delta = abs(sched_prio_to_weight[prio] - weight);
   6959 		if (delta >= last_delta)
   6960 			break;
   6961 		last_delta = delta;
   6962 	}
   6963 
   6964 	return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO);
   6965 }
   6966 
   6967 static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
   6968 				     struct cftype *cft, s64 nice)
   6969 {
   6970 	unsigned long weight;
   6971 	int idx;
   6972 
   6973 	if (nice < MIN_NICE || nice > MAX_NICE)
   6974 		return -ERANGE;
   6975 
   6976 	idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO;
   6977 	idx = array_index_nospec(idx, 40);
   6978 	weight = sched_prio_to_weight[idx];
   6979 
   6980 	return sched_group_set_shares(css_tg(css), scale_load(weight));
   6981 }
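/*
 * Worked example of the mapping above, assuming NICE_TO_PRIO(nice) expands
 * to MAX_RT_PRIO + 20 + nice, so that idx == nice + 20:
 *
 *	nice -20 -> idx  0 -> weight 88761
 *	nice   0 -> idx 20 -> weight  1024
 *	nice  19 -> idx 39 -> weight    15
 *
 * The read side walks the same table and reports the nice level whose
 * weight is closest to the group's current shares.
 */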
   6982 #endif
   6983 
   6984 static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
   6985 						  long period, long quota)
   6986 {
   6987 	if (quota < 0)
   6988 		seq_puts(sf, "max");
   6989 	else
   6990 		seq_printf(sf, "%ld", quota);
   6991 
   6992 	seq_printf(sf, " %ld\n", period);
   6993 }
   6994 
   6995 /* caller should put the current value in *@periodp before calling */
   6996 static int __maybe_unused cpu_period_quota_parse(char *buf,
   6997 						 u64 *periodp, u64 *quotap)
   6998 {
   6999 	char tok[21];	/* U64_MAX */
   7000 
   7001 	if (sscanf(buf, "%20s %llu", tok, periodp) < 1)
   7002 		return -EINVAL;
   7003 
   7004 	*periodp *= NSEC_PER_USEC;
   7005 
   7006 	if (sscanf(tok, "%llu", quotap))
   7007 		*quotap *= NSEC_PER_USEC;
   7008 	else if (!strcmp(tok, "max"))
   7009 		*quotap = RUNTIME_INF;
   7010 	else
   7011 		return -EINVAL;
   7012 
   7013 	return 0;
   7014 }
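/*
 * Illustrative inputs for the parser above, with *periodp pre-loaded with
 * the current period in microseconds as the comment requires:
 *
 *	"max"		-> quota RUNTIME_INF, period left at the pre-loaded value
 *	"max 100000"	-> quota RUNTIME_INF, period 100000 us (100 ms)
 *	"50000 100000"	-> quota 50 ms, period 100 ms
 *	""		-> -EINVAL (no token to parse)
 *
 * The period, and any numeric quota, are scaled to nanoseconds before
 * returning; "max" maps to RUNTIME_INF unscaled.
 */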
   7015 
   7016 #ifdef CONFIG_CFS_BANDWIDTH
   7017 static int cpu_max_show(struct seq_file *sf, void *v)
   7018 {
   7019 	struct task_group *tg = css_tg(seq_css(sf));
   7020 
   7021 	cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
   7022 	return 0;
   7023 }
   7024 
   7025 static ssize_t cpu_max_write(struct kernfs_open_file *of,
   7026 			     char *buf, size_t nbytes, loff_t off)
   7027 {
   7028 	struct task_group *tg = css_tg(of_css(of));
   7029 	u64 period = tg_get_cfs_period(tg);
   7030 	u64 quota;
   7031 	int ret;
   7032 
   7033 	ret = cpu_period_quota_parse(buf, &period, &quota);
   7034 	if (!ret)
   7035 		ret = tg_set_cfs_bandwidth(tg, period, quota);
   7036 	return ret ?: nbytes;
   7037 }
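/*
 * Userspace sketch (not kernel code, hence the #if 0) showing how the
 * handlers above are driven on a cgroup v2 mount: writing "50000 100000"
 * to a group's cpu.max caps it at 50 ms of CPU time per 100 ms period.
 * The mount point and group name below are only examples.
 */
#if 0
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int set_cpu_max(const char *path, const char *limit)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, limit, strlen(limit)) != (ssize_t)strlen(limit)) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	/* "quota_us period_us", parsed by cpu_max_write() above */
	return set_cpu_max("/sys/fs/cgroup/example/cpu.max", "50000 100000");
}
#endif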
   7038 #endif
   7039 
   7040 static struct cftype cpu_files[] = {
   7041 #ifdef CONFIG_FAIR_GROUP_SCHED
   7042 	{
   7043 		.name = "weight",
   7044 		.flags = CFTYPE_NOT_ON_ROOT,
   7045 		.read_u64 = cpu_weight_read_u64,
   7046 		.write_u64 = cpu_weight_write_u64,
   7047 	},
   7048 	{
   7049 		.name = "weight.nice",
   7050 		.flags = CFTYPE_NOT_ON_ROOT,
   7051 		.read_s64 = cpu_weight_nice_read_s64,
   7052 		.write_s64 = cpu_weight_nice_write_s64,
   7053 	},
   7054 #endif
   7055 #ifdef CONFIG_CFS_BANDWIDTH
   7056 	{
   7057 		.name = "max",
   7058 		.flags = CFTYPE_NOT_ON_ROOT,
   7059 		.seq_show = cpu_max_show,
   7060 		.write = cpu_max_write,
   7061 	},
   7062 #endif
   7063 	{ }	/* terminate */
   7064 };
   7065 
   7066 struct cgroup_subsys cpu_cgrp_subsys = {
   7067 	.css_alloc	= cpu_cgroup_css_alloc,
   7068 	.css_online	= cpu_cgroup_css_online,
   7069 	.css_released	= cpu_cgroup_css_released,
   7070 	.css_free	= cpu_cgroup_css_free,
   7071 	.css_extra_stat_show = cpu_extra_stat_show,
   7072 	.fork		= cpu_cgroup_fork,
   7073 	.can_attach	= cpu_cgroup_can_attach,
   7074 	.attach		= cpu_cgroup_attach,
   7075 	.legacy_cftypes	= cpu_legacy_files,
   7076 	.dfl_cftypes	= cpu_files,
   7077 	.early_init	= true,
   7078 	.threaded	= true,
   7079 };
   7080 
   7081 #endif	/* CONFIG_CGROUP_SCHED */
   7082 
   7083 void dump_cpu_task(int cpu)
   7084 {
   7085 	pr_info("Task dump for CPU %d:\n", cpu);
   7086 	sched_show_task(cpu_curr(cpu));
   7087 }
   7088 
   7089 /*
   7090  * Nice levels are multiplicative, with a gentle 10% change for every
   7091  * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
   7092  * nice 1, it will get ~10% less CPU time than another CPU-bound task
   7093  * that remained on nice 0.
   7094  *
   7095  * The "10% effect" is relative and cumulative: from _any_ nice level,
   7096  * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
   7097  * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
   7098  * If a task goes up by ~10% and another task goes down by ~10% then
   7099  * the relative distance between them is ~25%.)
   7100  */
   7101 const int sched_prio_to_weight[40] = {
   7102  /* -20 */     88761,     71755,     56483,     46273,     36291,
   7103  /* -15 */     29154,     23254,     18705,     14949,     11916,
   7104  /* -10 */      9548,      7620,      6100,      4904,      3906,
   7105  /*  -5 */      3121,      2501,      1991,      1586,      1277,
   7106  /*   0 */      1024,       820,       655,       526,       423,
   7107  /*   5 */       335,       272,       215,       172,       137,
   7108  /*  10 */       110,        87,        70,        56,        45,
   7109  /*  15 */        36,        29,        23,        18,        15,
   7110 };
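/*
 * Worked example of the ~10% effect documented above: adjacent entries
 * differ by a factor of about 1.25 (e.g. 1024 / 820 ~= 1.25), so two
 * CPU-bound tasks at nice 0 and nice 1 split the CPU roughly as
 *
 *	nice 0: 1024 / (1024 + 820) ~= 55.5%
 *	nice 1:  820 / (1024 + 820) ~= 44.5%
 *
 * i.e. about 10 percentage points apart, or ~25% relative to each other.
 */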
   7111 
   7112 /*
   7113  * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
   7114  *
   7115  * In cases where the weight does not change often, we can use the
    7116 	 * precalculated inverse to speed up arithmetic by turning divisions
   7117  * into multiplications:
   7118  */
   7119 const u32 sched_prio_to_wmult[40] = {
   7120  /* -20 */     48388,     59856,     76040,     92818,    118348,
   7121  /* -15 */    147320,    184698,    229616,    287308,    360437,
   7122  /* -10 */    449829,    563644,    704093,    875809,   1099582,
   7123  /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
   7124  /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
   7125  /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
   7126  /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
   7127  /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
   7128 };
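/*
 * Worked example of the inverse-weight trick: for the nice-0 weight,
 * sched_prio_to_wmult[20] == 4194304 == 2^32 / 1024, so
 *
 *	delta / 1024  ~=  (delta * 4194304) >> 32
 *
 * turning a division by a task's weight into a multiply and a shift
 * (the in-kernel users add overflow handling on top of this).
 */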
   7129 
   7130 #undef CREATE_TRACE_POINTS