whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit 45802da05e666a81b421422d3e302930c0e24e77
parent 203b6609e0ede49eb0b97008b1150c69e9d2ffd3
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Wed,  6 Mar 2019 08:14:05 -0800

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
 "The main changes in this cycle were:

   - refcount conversions

   - Solve the rq->leaf_cfs_rq_list can of worms for real.

   - improve power-aware scheduling

   - add sysctl knob for Energy Aware Scheduling

   - documentation updates

   - misc other changes"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (34 commits)
  kthread: Do not use TIMER_IRQSAFE
  kthread: Convert worker lock to raw spinlock
  sched/fair: Use non-atomic cpumask_{set,clear}_cpu()
  sched/fair: Remove unused 'sd' parameter from select_idle_smt()
  sched/wait: Use freezable_schedule() when possible
  sched/fair: Prune, fix and simplify the nohz_balancer_kick() comment block
  sched/fair: Explain LLC nohz kick condition
  sched/fair: Simplify nohz_balancer_kick()
  sched/topology: Fix percpu data types in struct sd_data & struct s_data
  sched/fair: Simplify post_init_entity_util_avg() by calling it with a task_struct pointer argument
  sched/fair: Fix O(nr_cgroups) in the load balancing path
  sched/fair: Optimize update_blocked_averages()
  sched/fair: Fix insertion in rq->leaf_cfs_rq_list
  sched/fair: Add tmp_alone_branch assertion
  sched/core: Use READ_ONCE()/WRITE_ONCE() in move_queued_task()/task_rq_lock()
  sched/debug: Initialize sd_sysctl_cpus if !CONFIG_CPUMASK_OFFSTACK
  sched/pelt: Skip updating util_est when utilization is higher than CPU's capacity
  sched/fair: Update scale invariance of PELT
  sched/fair: Move the rq_of() helper function
  sched/core: Convert task_struct.stack_refcount to refcount_t
  ...

Diffstat:
ADocumentation/power/energy-model.txt | 144+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
ADocumentation/scheduler/sched-energy.txt | 425+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
MDocumentation/sysctl/kernel.txt | 12++++++++++++
MMAINTAINERS | 9+--------
Mfs/exec.c | 4++--
Mfs/proc/task_nommu.c | 2+-
Minclude/linux/init_task.h | 1+
Minclude/linux/kthread.h | 9++++-----
Minclude/linux/sched.h | 33++++++++++++---------------------
Minclude/linux/sched/signal.h | 5+++--
Minclude/linux/sched/sysctl.h | 7+++++++
Minclude/linux/sched/task.h | 4++--
Minclude/linux/sched/task_stack.h | 2+-
Minclude/linux/sched/topology.h | 8++++----
Minclude/linux/wait.h | 6+++---
Minit/init_task.c | 6+++---
Mkernel/fork.c | 24++++++++++++------------
Mkernel/kthread.c | 43++++++++++++++++++++++---------------------
Mkernel/sched/core.c | 12+++++++-----
Mkernel/sched/deadline.c | 6+++---
Mkernel/sched/debug.c | 4++++
Mkernel/sched/fair.c | 458++++++++++++++++++++++++++++++++++++++++++++++---------------------------------
Mkernel/sched/isolation.c | 2+-
Mkernel/sched/pelt.c | 45++++++++++++++++++++++++---------------------
Mkernel/sched/pelt.h | 114++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mkernel/sched/rt.c | 6+++---
Mkernel/sched/sched.h | 54++++++++++++++++++++++++++++++++++++++++++++----------
Mkernel/sched/topology.c | 33+++++++++++++++++++++++++++++++--
Mkernel/sysctl.c | 11+++++++++++
29 files changed, 1165 insertions(+), 324 deletions(-)

diff --git a/Documentation/power/energy-model.txt b/Documentation/power/energy-model.txt @@ -0,0 +1,144 @@ + ==================== + Energy Model of CPUs + ==================== + +1. Overview +----------- + +The Energy Model (EM) framework serves as an interface between drivers knowing +the power consumed by CPUs at various performance levels, and the kernel +subsystems willing to use that information to make energy-aware decisions. + +The source of the information about the power consumed by CPUs can vary greatly +from one platform to another. These power costs can be estimated using +devicetree data in some cases. In others, the firmware will know better. +Alternatively, userspace might be best positioned. And so on. In order to avoid +each and every client subsystem to re-implement support for each and every +possible source of information on its own, the EM framework intervenes as an +abstraction layer which standardizes the format of power cost tables in the +kernel, hence enabling to avoid redundant work. + +The figure below depicts an example of drivers (Arm-specific here, but the +approach is applicable to any architecture) providing power costs to the EM +framework, and interested clients reading the data from it. + + +---------------+ +-----------------+ +---------------+ + | Thermal (IPA) | | Scheduler (EAS) | | Other | + +---------------+ +-----------------+ +---------------+ + | | em_pd_energy() | + | | em_cpu_get() | + +---------+ | +---------+ + | | | + v v v + +---------------------+ + | Energy Model | + | Framework | + +---------------------+ + ^ ^ ^ + | | | em_register_perf_domain() + +----------+ | +---------+ + | | | + +---------------+ +---------------+ +--------------+ + | cpufreq-dt | | arm_scmi | | Other | + +---------------+ +---------------+ +--------------+ + ^ ^ ^ + | | | + +--------------+ +---------------+ +--------------+ + | Device Tree | | Firmware | | ? | + +--------------+ +---------------+ +--------------+ + +The EM framework manages power cost tables per 'performance domain' in the +system. A performance domain is a group of CPUs whose performance is scaled +together. Performance domains generally have a 1-to-1 mapping with CPUFreq +policies. All CPUs in a performance domain are required to have the same +micro-architecture. CPUs in different performance domains can have different +micro-architectures. + + +2. Core APIs +------------ + + 2.1 Config options + +CONFIG_ENERGY_MODEL must be enabled to use the EM framework. + + + 2.2 Registration of performance domains + +Drivers are expected to register performance domains into the EM framework by +calling the following API: + + int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, + struct em_data_callback *cb); + +Drivers must specify the CPUs of the performance domains using the cpumask +argument, and provide a callback function returning <frequency, power> tuples +for each capacity state. The callback function provided by the driver is free +to fetch data from any relevant location (DT, firmware, ...), and by any mean +deemed necessary. See Section 3. for an example of driver implementing this +callback, and kernel/power/energy_model.c for further documentation on this +API. + + + 2.3 Accessing performance domains + +Subsystems interested in the energy model of a CPU can retrieve it using the +em_cpu_get() API. The energy model tables are allocated once upon creation of +the performance domains, and kept in memory untouched. + +The energy consumed by a performance domain can be estimated using the +em_pd_energy() API. The estimation is performed assuming that the schedutil +CPUfreq governor is in use. + +More details about the above APIs can be found in include/linux/energy_model.h. + + +3. Example driver +----------------- + +This section provides a simple example of a CPUFreq driver registering a +performance domain in the Energy Model framework using the (fake) 'foo' +protocol. The driver implements an est_power() function to be provided to the +EM framework. + + -> drivers/cpufreq/foo_cpufreq.c + +01 static int est_power(unsigned long *mW, unsigned long *KHz, int cpu) +02 { +03 long freq, power; +04 +05 /* Use the 'foo' protocol to ceil the frequency */ +06 freq = foo_get_freq_ceil(cpu, *KHz); +07 if (freq < 0); +08 return freq; +09 +10 /* Estimate the power cost for the CPU at the relevant freq. */ +11 power = foo_estimate_power(cpu, freq); +12 if (power < 0); +13 return power; +14 +15 /* Return the values to the EM framework */ +16 *mW = power; +17 *KHz = freq; +18 +19 return 0; +20 } +21 +22 static int foo_cpufreq_init(struct cpufreq_policy *policy) +23 { +24 struct em_data_callback em_cb = EM_DATA_CB(est_power); +25 int nr_opp, ret; +26 +27 /* Do the actual CPUFreq init work ... */ +28 ret = do_foo_cpufreq_init(policy); +29 if (ret) +30 return ret; +31 +32 /* Find the number of OPPs for this policy */ +33 nr_opp = foo_get_nr_opp(policy); +34 +35 /* And register the new performance domain */ +36 em_register_perf_domain(policy->cpus, nr_opp, &em_cb); +37 +38 return 0; +39 } diff --git a/Documentation/scheduler/sched-energy.txt b/Documentation/scheduler/sched-energy.txt @@ -0,0 +1,425 @@ + ======================= + Energy Aware Scheduling + ======================= + +1. Introduction +--------------- + +Energy Aware Scheduling (or EAS) gives the scheduler the ability to predict +the impact of its decisions on the energy consumed by CPUs. EAS relies on an +Energy Model (EM) of the CPUs to select an energy efficient CPU for each task, +with a minimal impact on throughput. This document aims at providing an +introduction on how EAS works, what are the main design decisions behind it, and +details what is needed to get it to run. + +Before going any further, please note that at the time of writing: + + /!\ EAS does not support platforms with symmetric CPU topologies /!\ + +EAS operates only on heterogeneous CPU topologies (such as Arm big.LITTLE) +because this is where the potential for saving energy through scheduling is +the highest. + +The actual EM used by EAS is _not_ maintained by the scheduler, but by a +dedicated framework. For details about this framework and what it provides, +please refer to its documentation (see Documentation/power/energy-model.txt). + + +2. Background and Terminology +----------------------------- + +To make it clear from the start: + - energy = [joule] (resource like a battery on powered devices) + - power = energy/time = [joule/second] = [watt] + +The goal of EAS is to minimize energy, while still getting the job done. That +is, we want to maximize: + + performance [inst/s] + -------------------- + power [W] + +which is equivalent to minimizing: + + energy [J] + ----------- + instruction + +while still getting 'good' performance. It is essentially an alternative +optimization objective to the current performance-only objective for the +scheduler. This alternative considers two objectives: energy-efficiency and +performance. + +The idea behind introducing an EM is to allow the scheduler to evaluate the +implications of its decisions rather than blindly applying energy-saving +techniques that may have positive effects only on some platforms. At the same +time, the EM must be as simple as possible to minimize the scheduler latency +impact. + +In short, EAS changes the way CFS tasks are assigned to CPUs. When it is time +for the scheduler to decide where a task should run (during wake-up), the EM +is used to break the tie between several good CPU candidates and pick the one +that is predicted to yield the best energy consumption without harming the +system's throughput. The predictions made by EAS rely on specific elements of +knowledge about the platform's topology, which include the 'capacity' of CPUs, +and their respective energy costs. + + +3. Topology information +----------------------- + +EAS (as well as the rest of the scheduler) uses the notion of 'capacity' to +differentiate CPUs with different computing throughput. The 'capacity' of a CPU +represents the amount of work it can absorb when running at its highest +frequency compared to the most capable CPU of the system. Capacity values are +normalized in a 1024 range, and are comparable with the utilization signals of +tasks and CPUs computed by the Per-Entity Load Tracking (PELT) mechanism. Thanks +to capacity and utilization values, EAS is able to estimate how big/busy a +task/CPU is, and to take this into consideration when evaluating performance vs +energy trade-offs. The capacity of CPUs is provided via arch-specific code +through the arch_scale_cpu_capacity() callback. + +The rest of platform knowledge used by EAS is directly read from the Energy +Model (EM) framework. The EM of a platform is composed of a power cost table +per 'performance domain' in the system (see Documentation/power/energy-model.txt +for futher details about performance domains). + +The scheduler manages references to the EM objects in the topology code when the +scheduling domains are built, or re-built. For each root domain (rd), the +scheduler maintains a singly linked list of all performance domains intersecting +the current rd->span. Each node in the list contains a pointer to a struct +em_perf_domain as provided by the EM framework. + +The lists are attached to the root domains in order to cope with exclusive +cpuset configurations. Since the boundaries of exclusive cpusets do not +necessarily match those of performance domains, the lists of different root +domains can contain duplicate elements. + +Example 1. + Let us consider a platform with 12 CPUs, split in 3 performance domains + (pd0, pd4 and pd8), organized as follows: + + CPUs: 0 1 2 3 4 5 6 7 8 9 10 11 + PDs: |--pd0--|--pd4--|---pd8---| + RDs: |----rd1----|-----rd2-----| + + Now, consider that userspace decided to split the system with two + exclusive cpusets, hence creating two independent root domains, each + containing 6 CPUs. The two root domains are denoted rd1 and rd2 in the + above figure. Since pd4 intersects with both rd1 and rd2, it will be + present in the linked list '->pd' attached to each of them: + * rd1->pd: pd0 -> pd4 + * rd2->pd: pd4 -> pd8 + + Please note that the scheduler will create two duplicate list nodes for + pd4 (one for each list). However, both just hold a pointer to the same + shared data structure of the EM framework. + +Since the access to these lists can happen concurrently with hotplug and other +things, they are protected by RCU, like the rest of topology structures +manipulated by the scheduler. + +EAS also maintains a static key (sched_energy_present) which is enabled when at +least one root domain meets all conditions for EAS to start. Those conditions +are summarized in Section 6. + + +4. Energy-Aware task placement +------------------------------ + +EAS overrides the CFS task wake-up balancing code. It uses the EM of the +platform and the PELT signals to choose an energy-efficient target CPU during +wake-up balance. When EAS is enabled, select_task_rq_fair() calls +find_energy_efficient_cpu() to do the placement decision. This function looks +for the CPU with the highest spare capacity (CPU capacity - CPU utilization) in +each performance domain since it is the one which will allow us to keep the +frequency the lowest. Then, the function checks if placing the task there could +save energy compared to leaving it on prev_cpu, i.e. the CPU where the task ran +in its previous activation. + +find_energy_efficient_cpu() uses compute_energy() to estimate what will be the +energy consumed by the system if the waking task was migrated. compute_energy() +looks at the current utilization landscape of the CPUs and adjusts it to +'simulate' the task migration. The EM framework provides the em_pd_energy() API +which computes the expected energy consumption of each performance domain for +the given utilization landscape. + +An example of energy-optimized task placement decision is detailed below. + +Example 2. + Let us consider a (fake) platform with 2 independent performance domains + composed of two CPUs each. CPU0 and CPU1 are little CPUs; CPU2 and CPU3 + are big. + + The scheduler must decide where to place a task P whose util_avg = 200 + and prev_cpu = 0. + + The current utilization landscape of the CPUs is depicted on the graph + below. CPUs 0-3 have a util_avg of 400, 100, 600 and 500 respectively + Each performance domain has three Operating Performance Points (OPPs). + The CPU capacity and power cost associated with each OPP is listed in + the Energy Model table. The util_avg of P is shown on the figures + below as 'PP'. + + CPU util. + 1024 - - - - - - - Energy Model + +-----------+-------------+ + | Little | Big | + 768 ============= +-----+-----+------+------+ + | Cap | Pwr | Cap | Pwr | + +-----+-----+------+------+ + 512 =========== - ##- - - - - | 170 | 50 | 512 | 400 | + ## ## | 341 | 150 | 768 | 800 | + 341 -PP - - - - ## ## | 512 | 300 | 1024 | 1700 | + PP ## ## +-----+-----+------+------+ + 170 -## - - - - ## ## + ## ## ## ## + ------------ ------------- + CPU0 CPU1 CPU2 CPU3 + + Current OPP: ===== Other OPP: - - - util_avg (100 each): ## + + + find_energy_efficient_cpu() will first look for the CPUs with the + maximum spare capacity in the two performance domains. In this example, + CPU1 and CPU3. Then it will estimate the energy of the system if P was + placed on either of them, and check if that would save some energy + compared to leaving P on CPU0. EAS assumes that OPPs follow utilization + (which is coherent with the behaviour of the schedutil CPUFreq + governor, see Section 6. for more details on this topic). + + Case 1. P is migrated to CPU1 + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + 1024 - - - - - - - + + Energy calculation: + 768 ============= * CPU0: 200 / 341 * 150 = 88 + * CPU1: 300 / 341 * 150 = 131 + * CPU2: 600 / 768 * 800 = 625 + 512 - - - - - - - ##- - - - - * CPU3: 500 / 768 * 800 = 520 + ## ## => total_energy = 1364 + 341 =========== ## ## + PP ## ## + 170 -## - - PP- ## ## + ## ## ## ## + ------------ ------------- + CPU0 CPU1 CPU2 CPU3 + + + Case 2. P is migrated to CPU3 + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + 1024 - - - - - - - + + Energy calculation: + 768 ============= * CPU0: 200 / 341 * 150 = 88 + * CPU1: 100 / 341 * 150 = 43 + PP * CPU2: 600 / 768 * 800 = 625 + 512 - - - - - - - ##- - -PP - * CPU3: 700 / 768 * 800 = 729 + ## ## => total_energy = 1485 + 341 =========== ## ## + ## ## + 170 -## - - - - ## ## + ## ## ## ## + ------------ ------------- + CPU0 CPU1 CPU2 CPU3 + + + Case 3. P stays on prev_cpu / CPU 0 + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + 1024 - - - - - - - + + Energy calculation: + 768 ============= * CPU0: 400 / 512 * 300 = 234 + * CPU1: 100 / 512 * 300 = 58 + * CPU2: 600 / 768 * 800 = 625 + 512 =========== - ##- - - - - * CPU3: 500 / 768 * 800 = 520 + ## ## => total_energy = 1437 + 341 -PP - - - - ## ## + PP ## ## + 170 -## - - - - ## ## + ## ## ## ## + ------------ ------------- + CPU0 CPU1 CPU2 CPU3 + + + From these calculations, the Case 1 has the lowest total energy. So CPU 1 + is be the best candidate from an energy-efficiency standpoint. + +Big CPUs are generally more power hungry than the little ones and are thus used +mainly when a task doesn't fit the littles. However, little CPUs aren't always +necessarily more energy-efficient than big CPUs. For some systems, the high OPPs +of the little CPUs can be less energy-efficient than the lowest OPPs of the +bigs, for example. So, if the little CPUs happen to have enough utilization at +a specific point in time, a small task waking up at that moment could be better +of executing on the big side in order to save energy, even though it would fit +on the little side. + +And even in the case where all OPPs of the big CPUs are less energy-efficient +than those of the little, using the big CPUs for a small task might still, under +specific conditions, save energy. Indeed, placing a task on a little CPU can +result in raising the OPP of the entire performance domain, and that will +increase the cost of the tasks already running there. If the waking task is +placed on a big CPU, its own execution cost might be higher than if it was +running on a little, but it won't impact the other tasks of the little CPUs +which will keep running at a lower OPP. So, when considering the total energy +consumed by CPUs, the extra cost of running that one task on a big core can be +smaller than the cost of raising the OPP on the little CPUs for all the other +tasks. + +The examples above would be nearly impossible to get right in a generic way, and +for all platforms, without knowing the cost of running at different OPPs on all +CPUs of the system. Thanks to its EM-based design, EAS should cope with them +correctly without too many troubles. However, in order to ensure a minimal +impact on throughput for high-utilization scenarios, EAS also implements another +mechanism called 'over-utilization'. + + +5. Over-utilization +------------------- + +From a general standpoint, the use-cases where EAS can help the most are those +involving a light/medium CPU utilization. Whenever long CPU-bound tasks are +being run, they will require all of the available CPU capacity, and there isn't +much that can be done by the scheduler to save energy without severly harming +throughput. In order to avoid hurting performance with EAS, CPUs are flagged as +'over-utilized' as soon as they are used at more than 80% of their compute +capacity. As long as no CPUs are over-utilized in a root domain, load balancing +is disabled and EAS overridess the wake-up balancing code. EAS is likely to load +the most energy efficient CPUs of the system more than the others if that can be +done without harming throughput. So, the load-balancer is disabled to prevent +it from breaking the energy-efficient task placement found by EAS. It is safe to +do so when the system isn't overutilized since being below the 80% tipping point +implies that: + + a. there is some idle time on all CPUs, so the utilization signals used by + EAS are likely to accurately represent the 'size' of the various tasks + in the system; + b. all tasks should already be provided with enough CPU capacity, + regardless of their nice values; + c. since there is spare capacity all tasks must be blocking/sleeping + regularly and balancing at wake-up is sufficient. + +As soon as one CPU goes above the 80% tipping point, at least one of the three +assumptions above becomes incorrect. In this scenario, the 'overutilized' flag +is raised for the entire root domain, EAS is disabled, and the load-balancer is +re-enabled. By doing so, the scheduler falls back onto load-based algorithms for +wake-up and load balance under CPU-bound conditions. This provides a better +respect of the nice values of tasks. + +Since the notion of overutilization largely relies on detecting whether or not +there is some idle time in the system, the CPU capacity 'stolen' by higher +(than CFS) scheduling classes (as well as IRQ) must be taken into account. As +such, the detection of overutilization accounts for the capacity used not only +by CFS tasks, but also by the other scheduling classes and IRQ. + + +6. Dependencies and requirements for EAS +---------------------------------------- + +Energy Aware Scheduling depends on the CPUs of the system having specific +hardware properties and on other features of the kernel being enabled. This +section lists these dependencies and provides hints as to how they can be met. + + + 6.1 - Asymmetric CPU topology + +As mentioned in the introduction, EAS is only supported on platforms with +asymmetric CPU topologies for now. This requirement is checked at run-time by +looking for the presence of the SD_ASYM_CPUCAPACITY flag when the scheduling +domains are built. + +The flag is set/cleared automatically by the scheduler topology code whenever +there are CPUs with different capacities in a root domain. The capacities of +CPUs are provided by arch-specific code through the arch_scale_cpu_capacity() +callback. As an example, arm and arm64 share an implementation of this callback +which uses a combination of CPUFreq data and device-tree bindings to compute the +capacity of CPUs (see drivers/base/arch_topology.c for more details). + +So, in order to use EAS on your platform your architecture must implement the +arch_scale_cpu_capacity() callback, and some of the CPUs must have a lower +capacity than others. + +Please note that EAS is not fundamentally incompatible with SMP, but no +significant savings on SMP platforms have been observed yet. This restriction +could be amended in the future if proven otherwise. + + + 6.2 - Energy Model presence + +EAS uses the EM of a platform to estimate the impact of scheduling decisions on +energy. So, your platform must provide power cost tables to the EM framework in +order to make EAS start. To do so, please refer to documentation of the +independent EM framework in Documentation/power/energy-model.txt. + +Please also note that the scheduling domains need to be re-built after the +EM has been registered in order to start EAS. + + + 6.3 - Energy Model complexity + +The task wake-up path is very latency-sensitive. When the EM of a platform is +too complex (too many CPUs, too many performance domains, too many performance +states, ...), the cost of using it in the wake-up path can become prohibitive. +The energy-aware wake-up algorithm has a complexity of: + + C = Nd * (Nc + Ns) + +with: Nd the number of performance domains; Nc the number of CPUs; and Ns the +total number of OPPs (ex: for two perf. domains with 4 OPPs each, Ns = 8). + +A complexity check is performed at the root domain level, when scheduling +domains are built. EAS will not start on a root domain if its C happens to be +higher than the completely arbitrary EM_MAX_COMPLEXITY threshold (2048 at the +time of writing). + +If you really want to use EAS but the complexity of your platform's Energy +Model is too high to be used with a single root domain, you're left with only +two possible options: + + 1. split your system into separate, smaller, root domains using exclusive + cpusets and enable EAS locally on each of them. This option has the + benefit to work out of the box but the drawback of preventing load + balance between root domains, which can result in an unbalanced system + overall; + 2. submit patches to reduce the complexity of the EAS wake-up algorithm, + hence enabling it to cope with larger EMs in reasonable time. + + + 6.4 - Schedutil governor + +EAS tries to predict at which OPP will the CPUs be running in the close future +in order to estimate their energy consumption. To do so, it is assumed that OPPs +of CPUs follow their utilization. + +Although it is very difficult to provide hard guarantees regarding the accuracy +of this assumption in practice (because the hardware might not do what it is +told to do, for example), schedutil as opposed to other CPUFreq governors at +least _requests_ frequencies calculated using the utilization signals. +Consequently, the only sane governor to use together with EAS is schedutil, +because it is the only one providing some degree of consistency between +frequency requests and energy predictions. + +Using EAS with any other governor than schedutil is not supported. + + + 6.5 Scale-invariant utilization signals + +In order to make accurate prediction across CPUs and for all performance +states, EAS needs frequency-invariant and CPU-invariant PELT signals. These can +be obtained using the architecture-defined arch_scale{cpu,freq}_capacity() +callbacks. + +Using EAS on a platform that doesn't implement these two callbacks is not +supported. + + + 6.6 Multithreading (SMT) + +EAS in its current form is SMT unaware and is not able to leverage +multithreaded hardware to save energy. EAS considers threads as independent +CPUs, which can actually be counter-productive for both performance and energy. + +EAS on SMT is not supported. diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt @@ -79,6 +79,7 @@ show up in /proc/sys/kernel: - reboot-cmd [ SPARC only ] - rtsig-max - rtsig-nr +- sched_energy_aware - seccomp/ ==> Documentation/userspace-api/seccomp_filter.rst - sem - sem_next_id [ sysv ipc ] @@ -890,6 +891,17 @@ rtsig-nr shows the number of RT signals currently queued. ============================================================== +sched_energy_aware: + +Enables/disables Energy Aware Scheduling (EAS). EAS starts +automatically on platforms where it can run (that is, +platforms with asymmetric CPU topologies and having an Energy +Model available). If your platform happens to meet the +requirements for EAS but you do not want to use it, change +this value to 0. + +============================================================== + sched_schedstats: Enables/disables scheduler statistics. Enabling this feature diff --git a/MAINTAINERS b/MAINTAINERS @@ -12280,14 +12280,6 @@ S: Maintained F: drivers/net/ppp/pptp.c W: http://sourceforge.net/projects/accel-pptp -PREEMPTIBLE KERNEL -M: Robert Love <rml@tech9.net> -L: kpreempt-tech@lists.sourceforge.net -W: https://www.kernel.org/pub/linux/kernel/people/rml/preempt-kernel -S: Supported -F: Documentation/preempt-locking.txt -F: include/linux/preempt.h - PRINTK M: Petr Mladek <pmladek@suse.com> M: Sergey Senozhatsky <sergey.senozhatsky@gmail.com> @@ -13525,6 +13517,7 @@ F: kernel/sched/ F: include/linux/sched.h F: include/uapi/linux/sched.h F: include/linux/wait.h +F: include/linux/preempt.h SCR24X CHIP CARD INTERFACE DRIVER M: Lubomir Rintel <lkundrak@v3.sk> diff --git a/fs/exec.c b/fs/exec.c @@ -1189,7 +1189,7 @@ no_thread_group: flush_itimer_signals(); #endif - if (atomic_read(&oldsighand->count) != 1) { + if (refcount_read(&oldsighand->count) != 1) { struct sighand_struct *newsighand; /* * This ->sighand is shared with the CLONE_SIGHAND @@ -1199,7 +1199,7 @@ no_thread_group: if (!newsighand) return -ENOMEM; - atomic_set(&newsighand->count, 1); + refcount_set(&newsighand->count, 1); memcpy(newsighand->action, oldsighand->action, sizeof(newsighand->action)); diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c @@ -64,7 +64,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) else bytes += kobjsize(current->files); - if (current->sighand && atomic_read(&current->sighand->count) > 1) + if (current->sighand && refcount_read(&current->sighand->count) > 1) sbytes += kobjsize(current->sighand); else bytes += kobjsize(current->sighand); diff --git a/include/linux/init_task.h b/include/linux/init_task.h @@ -13,6 +13,7 @@ #include <linux/securebits.h> #include <linux/seqlock.h> #include <linux/rbtree.h> +#include <linux/refcount.h> #include <linux/sched/autogroup.h> #include <net/net_namespace.h> #include <linux/sched/rt.h> diff --git a/include/linux/kthread.h b/include/linux/kthread.h @@ -86,7 +86,7 @@ enum { struct kthread_worker { unsigned int flags; - spinlock_t lock; + raw_spinlock_t lock; struct list_head work_list; struct list_head delayed_work_list; struct task_struct *task; @@ -107,7 +107,7 @@ struct kthread_delayed_work { }; #define KTHREAD_WORKER_INIT(worker) { \ - .lock = __SPIN_LOCK_UNLOCKED((worker).lock), \ + .lock = __RAW_SPIN_LOCK_UNLOCKED((worker).lock), \ .work_list = LIST_HEAD_INIT((worker).work_list), \ .delayed_work_list = LIST_HEAD_INIT((worker).delayed_work_list),\ } @@ -165,9 +165,8 @@ extern void __kthread_init_worker(struct kthread_worker *worker, #define kthread_init_delayed_work(dwork, fn) \ do { \ kthread_init_work(&(dwork)->work, (fn)); \ - __init_timer(&(dwork)->timer, \ - kthread_delayed_work_timer_fn, \ - TIMER_IRQSAFE); \ + timer_setup(&(dwork)->timer, \ + kthread_delayed_work_timer_fn, 0); \ } while (0) int kthread_worker_fn(void *worker_ptr); diff --git a/include/linux/sched.h b/include/linux/sched.h @@ -21,6 +21,7 @@ #include <linux/seccomp.h> #include <linux/nodemask.h> #include <linux/rcupdate.h> +#include <linux/refcount.h> #include <linux/resource.h> #include <linux/latencytop.h> #include <linux/sched/prio.h> @@ -356,12 +357,6 @@ struct util_est { * For cfs_rq, it is the aggregated load_avg of all runnable and * blocked sched_entities. * - * load_avg may also take frequency scaling into account: - * - * load_avg = runnable% * scale_load_down(load) * freq% - * - * where freq% is the CPU frequency normalized to the highest frequency. - * * [util_avg definition] * * util_avg = running% * SCHED_CAPACITY_SCALE @@ -370,17 +365,14 @@ struct util_est { * a CPU. For cfs_rq, it is the aggregated util_avg of all runnable * and blocked sched_entities. * - * util_avg may also factor frequency scaling and CPU capacity scaling: - * - * util_avg = running% * SCHED_CAPACITY_SCALE * freq% * capacity% - * - * where freq% is the same as above, and capacity% is the CPU capacity - * normalized to the greatest capacity (due to uarch differences, etc). + * load_avg and util_avg don't direcly factor frequency scaling and CPU + * capacity scaling. The scaling is done through the rq_clock_pelt that + * is used for computing those signals (see update_rq_clock_pelt()) * - * N.B., the above ratios (runnable%, running%, freq%, and capacity%) - * themselves are in the range of [0, 1]. To do fixed point arithmetics, - * we therefore scale them to as large a range as necessary. This is for - * example reflected by util_avg's SCHED_CAPACITY_SCALE. + * N.B., the above ratios (runnable% and running%) themselves are in the + * range of [0, 1]. To do fixed point arithmetics, we therefore scale them + * to as large a range as necessary. This is for example reflected by + * util_avg's SCHED_CAPACITY_SCALE. * * [Overflow issue] * @@ -607,7 +599,7 @@ struct task_struct { randomized_struct_fields_start void *stack; - atomic_t usage; + refcount_t usage; /* Per task flags (PF_*), defined further below: */ unsigned int flags; unsigned int ptrace; @@ -1187,7 +1179,7 @@ struct task_struct { #endif #ifdef CONFIG_THREAD_INFO_IN_TASK /* A live task holds one reference: */ - atomic_t stack_refcount; + refcount_t stack_refcount; #endif #ifdef CONFIG_LIVEPATCH int patch_state; @@ -1403,7 +1395,6 @@ extern struct pid *cad_pid; #define PF_UMH 0x02000000 /* I'm an Usermodehelper process */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ -#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ @@ -1753,9 +1744,9 @@ static __always_inline bool need_resched(void) static inline unsigned int task_cpu(const struct task_struct *p) { #ifdef CONFIG_THREAD_INFO_IN_TASK - return p->cpu; + return READ_ONCE(p->cpu); #else - return task_thread_info(p)->cpu; + return READ_ONCE(task_thread_info(p)->cpu); #endif } diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h @@ -8,13 +8,14 @@ #include <linux/sched/jobctl.h> #include <linux/sched/task.h> #include <linux/cred.h> +#include <linux/refcount.h> /* * Types defining task->signal and task->sighand and APIs using them: */ struct sighand_struct { - atomic_t count; + refcount_t count; struct k_sigaction action[_NSIG]; spinlock_t siglock; wait_queue_head_t signalfd_wqh; @@ -82,7 +83,7 @@ struct multiprocess_signals { * the locking of signal_struct. */ struct signal_struct { - atomic_t sigcnt; + refcount_t sigcnt; atomic_t live; int nr_threads; struct list_head thread_head; diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h @@ -83,4 +83,11 @@ extern int sysctl_schedstats(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) +extern unsigned int sysctl_sched_energy_aware; +extern int sched_energy_aware_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); +#endif + #endif /* _LINUX_SCHED_SYSCTL_H */ diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h @@ -88,13 +88,13 @@ extern void sched_exec(void); #define sched_exec() {} #endif -#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) +#define get_task_struct(tsk) do { refcount_inc(&(tsk)->usage); } while(0) extern void __put_task_struct(struct task_struct *t); static inline void put_task_struct(struct task_struct *t) { - if (atomic_dec_and_test(&t->usage)) + if (refcount_dec_and_test(&t->usage)) __put_task_struct(t); } diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h @@ -61,7 +61,7 @@ static inline unsigned long *end_of_stack(struct task_struct *p) #ifdef CONFIG_THREAD_INFO_IN_TASK static inline void *try_get_task_stack(struct task_struct *tsk) { - return atomic_inc_not_zero(&tsk->stack_refcount) ? + return refcount_inc_not_zero(&tsk->stack_refcount) ? task_stack_page(tsk) : NULL; } diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h @@ -176,10 +176,10 @@ typedef int (*sched_domain_flags_f)(void); #define SDTL_OVERLAP 0x01 struct sd_data { - struct sched_domain **__percpu sd; - struct sched_domain_shared **__percpu sds; - struct sched_group **__percpu sg; - struct sched_group_capacity **__percpu sgc; + struct sched_domain *__percpu *sd; + struct sched_domain_shared *__percpu *sds; + struct sched_group *__percpu *sg; + struct sched_group_capacity *__percpu *sgc; }; struct sched_domain_topology_level { diff --git a/include/linux/wait.h b/include/linux/wait.h @@ -308,7 +308,7 @@ do { \ #define __wait_event_freezable(wq_head, condition) \ ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0, \ - schedule(); try_to_freeze()) + freezable_schedule()) /** * wait_event_freezable - sleep (or freeze) until a condition gets true @@ -367,7 +367,7 @@ do { \ #define __wait_event_freezable_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_INTERRUPTIBLE, 0, timeout, \ - __ret = schedule_timeout(__ret); try_to_freeze()) + __ret = freezable_schedule_timeout(__ret)) /* * like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE to avoid @@ -588,7 +588,7 @@ do { \ #define __wait_event_freezable_exclusive(wq, condition) \ ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \ - schedule(); try_to_freeze()) + freezable_schedule()) #define wait_event_freezable_exclusive(wq, condition) \ ({ \ diff --git a/init/init_task.c b/init/init_task.c @@ -44,7 +44,7 @@ static struct signal_struct init_signals = { }; static struct sighand_struct init_sighand = { - .count = ATOMIC_INIT(1), + .count = REFCOUNT_INIT(1), .action = { { { .sa_handler = SIG_DFL, } }, }, .siglock = __SPIN_LOCK_UNLOCKED(init_sighand.siglock), .signalfd_wqh = __WAIT_QUEUE_HEAD_INITIALIZER(init_sighand.signalfd_wqh), @@ -61,11 +61,11 @@ struct task_struct init_task = { #ifdef CONFIG_THREAD_INFO_IN_TASK .thread_info = INIT_THREAD_INFO(init_task), - .stack_refcount = ATOMIC_INIT(1), + .stack_refcount = REFCOUNT_INIT(1), #endif .state = 0, .stack = init_stack, - .usage = ATOMIC_INIT(2), + .usage = REFCOUNT_INIT(2), .flags = PF_KTHREAD, .prio = MAX_PRIO - 20, .static_prio = MAX_PRIO - 20, diff --git a/kernel/fork.c b/kernel/fork.c @@ -429,7 +429,7 @@ static void release_task_stack(struct task_struct *tsk) #ifdef CONFIG_THREAD_INFO_IN_TASK void put_task_stack(struct task_struct *tsk) { - if (atomic_dec_and_test(&tsk->stack_refcount)) + if (refcount_dec_and_test(&tsk->stack_refcount)) release_task_stack(tsk); } #endif @@ -447,7 +447,7 @@ void free_task(struct task_struct *tsk) * If the task had a separate stack allocation, it should be gone * by now. */ - WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0); + WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0); #endif rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); @@ -710,14 +710,14 @@ static inline void free_signal_struct(struct signal_struct *sig) static inline void put_signal_struct(struct signal_struct *sig) { - if (atomic_dec_and_test(&sig->sigcnt)) + if (refcount_dec_and_test(&sig->sigcnt)) free_signal_struct(sig); } void __put_task_struct(struct task_struct *tsk) { WARN_ON(!tsk->exit_state); - WARN_ON(atomic_read(&tsk->usage)); + WARN_ON(refcount_read(&tsk->usage)); WARN_ON(tsk == current); cgroup_free(tsk); @@ -867,7 +867,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->stack_vm_area = stack_vm_area; #endif #ifdef CONFIG_THREAD_INFO_IN_TASK - atomic_set(&tsk->stack_refcount, 1); + refcount_set(&tsk->stack_refcount, 1); #endif if (err) @@ -896,7 +896,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) * One for us, one for whoever does the "release_task()" (usually * parent) */ - atomic_set(&tsk->usage, 2); + refcount_set(&tsk->usage, 2); #ifdef CONFIG_BLK_DEV_IO_TRACE tsk->btrace_seq = 0; #endif @@ -1463,7 +1463,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) struct sighand_struct *sig; if (clone_flags & CLONE_SIGHAND) { - atomic_inc(&current->sighand->count); + refcount_inc(&current->sighand->count); return 0; } sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); @@ -1471,7 +1471,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) if (!sig) return -ENOMEM; - atomic_set(&sig->count, 1); + refcount_set(&sig->count, 1); spin_lock_irq(&current->sighand->siglock); memcpy(sig->action, current->sighand->action, sizeof(sig->action)); spin_unlock_irq(&current->sighand->siglock); @@ -1480,7 +1480,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) void __cleanup_sighand(struct sighand_struct *sighand) { - if (atomic_dec_and_test(&sighand->count)) { + if (refcount_dec_and_test(&sighand->count)) { signalfd_cleanup(sighand); /* * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it @@ -1527,7 +1527,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->nr_threads = 1; atomic_set(&sig->live, 1); - atomic_set(&sig->sigcnt, 1); + refcount_set(&sig->sigcnt, 1); /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */ sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node); @@ -2082,7 +2082,7 @@ static __latent_entropy struct task_struct *copy_process( } else { current->signal->nr_threads++; atomic_inc(&current->signal->live); - atomic_inc(&current->signal->sigcnt); + refcount_inc(&current->signal->sigcnt); task_join_group_stop(p); list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); @@ -2439,7 +2439,7 @@ static int check_unshare_flags(unsigned long unshare_flags) return -EINVAL; } if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) { - if (atomic_read(&current->sighand->count) > 1) + if (refcount_read(&current->sighand->count) > 1) return -EINVAL; } if (unshare_flags & CLONE_VM) { diff --git a/kernel/kthread.c b/kernel/kthread.c @@ -605,7 +605,7 @@ void __kthread_init_worker(struct kthread_worker *worker, struct lock_class_key *key) { memset(worker, 0, sizeof(struct kthread_worker)); - spin_lock_init(&worker->lock); + raw_spin_lock_init(&worker->lock); lockdep_set_class_and_name(&worker->lock, key, name); INIT_LIST_HEAD(&worker->work_list); INIT_LIST_HEAD(&worker->delayed_work_list); @@ -647,21 +647,21 @@ repeat: if (kthread_should_stop()) { __set_current_state(TASK_RUNNING); - spin_lock_irq(&worker->lock); + raw_spin_lock_irq(&worker->lock); worker->task = NULL; - spin_unlock_irq(&worker->lock); + raw_spin_unlock_irq(&worker->lock); return 0; } work = NULL; - spin_lock_irq(&worker->lock); + raw_spin_lock_irq(&worker->lock); if (!list_empty(&worker->work_list)) { work = list_first_entry(&worker->work_list, struct kthread_work, node); list_del_init(&work->node); } worker->current_work = work; - spin_unlock_irq(&worker->lock); + raw_spin_unlock_irq(&worker->lock); if (work) { __set_current_state(TASK_RUNNING); @@ -818,12 +818,12 @@ bool kthread_queue_work(struct kthread_worker *worker, bool ret = false; unsigned long flags; - spin_lock_irqsave(&worker->lock, flags); + raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { kthread_insert_work(worker, work, &worker->work_list); ret = true; } - spin_unlock_irqrestore(&worker->lock, flags); + raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_work); @@ -841,6 +841,7 @@ void kthread_delayed_work_timer_fn(struct timer_list *t) struct kthread_delayed_work *dwork = from_timer(dwork, t, timer); struct kthread_work *work = &dwork->work; struct kthread_worker *worker = work->worker; + unsigned long flags; /* * This might happen when a pending work is reinitialized. @@ -849,7 +850,7 @@ void kthread_delayed_work_timer_fn(struct timer_list *t) if (WARN_ON_ONCE(!worker)) return; - spin_lock(&worker->lock); + raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); @@ -858,7 +859,7 @@ void kthread_delayed_work_timer_fn(struct timer_list *t) list_del_init(&work->node); kthread_insert_work(worker, work, &worker->work_list); - spin_unlock(&worker->lock); + raw_spin_unlock_irqrestore(&worker->lock, flags); } EXPORT_SYMBOL(kthread_delayed_work_timer_fn); @@ -914,14 +915,14 @@ bool kthread_queue_delayed_work(struct kthread_worker *worker, unsigned long flags; bool ret = false; - spin_lock_irqsave(&worker->lock, flags); + raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { __kthread_queue_delayed_work(worker, dwork, delay); ret = true; } - spin_unlock_irqrestore(&worker->lock, flags); + raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_delayed_work); @@ -957,7 +958,7 @@ void kthread_flush_work(struct kthread_work *work) if (!worker) return; - spin_lock_irq(&worker->lock); + raw_spin_lock_irq(&worker->lock); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); @@ -969,7 +970,7 @@ void kthread_flush_work(struct kthread_work *work) else noop = true; - spin_unlock_irq(&worker->lock); + raw_spin_unlock_irq(&worker->lock); if (!noop) wait_for_completion(&fwork.done); @@ -1002,9 +1003,9 @@ static bool __kthread_cancel_work(struct kthread_work *work, bool is_dwork, * any queuing is blocked by setting the canceling counter. */ work->canceling++; - spin_unlock_irqrestore(&worker->lock, *flags); + raw_spin_unlock_irqrestore(&worker->lock, *flags); del_timer_sync(&dwork->timer); - spin_lock_irqsave(&worker->lock, *flags); + raw_spin_lock_irqsave(&worker->lock, *flags); work->canceling--; } @@ -1051,7 +1052,7 @@ bool kthread_mod_delayed_work(struct kthread_worker *worker, unsigned long flags; int ret = false; - spin_lock_irqsave(&worker->lock, flags); + raw_spin_lock_irqsave(&worker->lock, flags); /* Do not bother with canceling when never queued. */ if (!work->worker) @@ -1068,7 +1069,7 @@ bool kthread_mod_delayed_work(struct kthread_worker *worker, fast_queue: __kthread_queue_delayed_work(worker, dwork, delay); out: - spin_unlock_irqrestore(&worker->lock, flags); + raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_mod_delayed_work); @@ -1082,7 +1083,7 @@ static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) if (!worker) goto out; - spin_lock_irqsave(&worker->lock, flags); + raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); @@ -1096,13 +1097,13 @@ static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) * In the meantime, block any queuing by setting the canceling counter. */ work->canceling++; - spin_unlock_irqrestore(&worker->lock, flags); + raw_spin_unlock_irqrestore(&worker->lock, flags); kthread_flush_work(work); - spin_lock_irqsave(&worker->lock, flags); + raw_spin_lock_irqsave(&worker->lock, flags); work->canceling--; out_fast: - spin_unlock_irqrestore(&worker->lock, flags); + raw_spin_unlock_irqrestore(&worker->lock, flags); out: return ret; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c @@ -107,11 +107,12 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) * [L] ->on_rq * RELEASE (rq->lock) * - * If we observe the old CPU in task_rq_lock, the acquire of + * If we observe the old CPU in task_rq_lock(), the acquire of * the old rq->lock will fully serialize against the stores. * - * If we observe the new CPU in task_rq_lock, the acquire will - * pair with the WMB to ensure we must then also see migrating. + * If we observe the new CPU in task_rq_lock(), the address + * dependency headed by '[L] rq = task_rq()' and the acquire + * will pair with the WMB to ensure we then also see migrating. */ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { rq_pin_lock(rq, rf); @@ -180,6 +181,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) update_irq_load_avg(rq, irq_delta + steal); #endif + update_rq_clock_pelt(rq, delta); } void update_rq_clock(struct rq *rq) @@ -956,7 +958,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, { lockdep_assert_held(&rq->lock); - p->on_rq = TASK_ON_RQ_MIGRATING; + WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); dequeue_task(rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, new_cpu); rq_unlock(rq, rf); @@ -2459,7 +2461,7 @@ void wake_up_new_task(struct task_struct *p) #endif rq = __task_rq_lock(p, &rf); update_rq_clock(rq); - post_init_entity_util_avg(&p->se); + post_init_entity_util_avg(p); activate_task(rq, p, ENQUEUE_NOCLOCK); p->on_rq = TASK_ON_RQ_QUEUED; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c @@ -1767,7 +1767,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) deadline_queue_push_tasks(rq); if (rq->curr->sched_class != &dl_sched_class) - update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); return p; } @@ -1776,7 +1776,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p) { update_curr_dl(rq); - update_dl_rq_load_avg(rq_clock_task(rq), rq, 1); + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1); if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); } @@ -1793,7 +1793,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) { update_curr_dl(rq); - update_dl_rq_load_avg(rq_clock_task(rq), rq, 1); + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1); /* * Even when we have runtime, update_curr_dl() might have resulted in us * not being the leftmost task anymore. In that case NEED_RESCHED will diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c @@ -315,6 +315,7 @@ void register_sched_domain_sysctl(void) { static struct ctl_table *cpu_entries; static struct ctl_table **cpu_idx; + static bool init_done = false; char buf[32]; int i; @@ -344,7 +345,10 @@ void register_sched_domain_sysctl(void) if (!cpumask_available(sd_sysctl_cpus)) { if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) return; + } + if (!init_done) { + init_done = true; /* init to possible to not have holes in @cpu_entries */ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c @@ -248,13 +248,6 @@ const struct sched_class fair_sched_class; */ #ifdef CONFIG_FAIR_GROUP_SCHED - -/* cpu runqueue to which this cfs_rq is attached */ -static inline struct rq *rq_of(struct cfs_rq *cfs_rq) -{ - return cfs_rq->rq; -} - static inline struct task_struct *task_of(struct sched_entity *se) { SCHED_WARN_ON(!entity_is_task(se)); @@ -282,79 +275,103 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return grp->my_q; } -static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) +static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { - if (!cfs_rq->on_list) { - struct rq *rq = rq_of(cfs_rq); - int cpu = cpu_of(rq); + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); + + if (cfs_rq->on_list) + return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list; + + cfs_rq->on_list = 1; + + /* + * Ensure we either appear before our parent (if already + * enqueued) or force our parent to appear after us when it is + * enqueued. The fact that we always enqueue bottom-up + * reduces this to two cases and a special case for the root + * cfs_rq. Furthermore, it also means that we will always reset + * tmp_alone_branch either when the branch is connected + * to a tree or when we reach the top of the tree + */ + if (cfs_rq->tg->parent && + cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { /* - * Ensure we either appear before our parent (if already - * enqueued) or force our parent to appear after us when it is - * enqueued. The fact that we always enqueue bottom-up - * reduces this to two cases and a special case for the root - * cfs_rq. Furthermore, it also means that we will always reset - * tmp_alone_branch either when the branch is connected - * to a tree or when we reach the beg of the tree + * If parent is already on the list, we add the child + * just before. Thanks to circular linked property of + * the list, this means to put the child at the tail + * of the list that starts by parent. */ - if (cfs_rq->tg->parent && - cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { - /* - * If parent is already on the list, we add the child - * just before. Thanks to circular linked property of - * the list, this means to put the child at the tail - * of the list that starts by parent. - */ - list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, - &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); - /* - * The branch is now connected to its tree so we can - * reset tmp_alone_branch to the beginning of the - * list. - */ - rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; - } else if (!cfs_rq->tg->parent) { - /* - * cfs rq without parent should be put - * at the tail of the list. - */ - list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq->leaf_cfs_rq_list); - /* - * We have reach the beg of a tree so we can reset - * tmp_alone_branch to the beginning of the list. - */ - rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; - } else { - /* - * The parent has not already been added so we want to - * make sure that it will be put after us. - * tmp_alone_branch points to the beg of the branch - * where we will add parent. - */ - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, - rq->tmp_alone_branch); - /* - * update tmp_alone_branch to points to the new beg - * of the branch - */ - rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list; - } + list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, + &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); + /* + * The branch is now connected to its tree so we can + * reset tmp_alone_branch to the beginning of the + * list. + */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + return true; + } - cfs_rq->on_list = 1; + if (!cfs_rq->tg->parent) { + /* + * cfs rq without parent should be put + * at the tail of the list. + */ + list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, + &rq->leaf_cfs_rq_list); + /* + * We have reach the top of a tree so we can reset + * tmp_alone_branch to the beginning of the list. + */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + return true; } + + /* + * The parent has not already been added so we want to + * make sure that it will be put after us. + * tmp_alone_branch points to the begin of the branch + * where we will add parent. + */ + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch); + /* + * update tmp_alone_branch to points to the new begin + * of the branch + */ + rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list; + return false; } static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (cfs_rq->on_list) { + struct rq *rq = rq_of(cfs_rq); + + /* + * With cfs_rq being unthrottled/throttled during an enqueue, + * it can happen the tmp_alone_branch points the a leaf that + * we finally want to del. In this case, tmp_alone_branch moves + * to the prev element but it will point to rq->leaf_cfs_rq_list + * at the end of the enqueue. + */ + if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list) + rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev; + list_del_rcu(&cfs_rq->leaf_cfs_rq_list); cfs_rq->on_list = 0; } } -/* Iterate through all leaf cfs_rq's on a runqueue: */ -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) +static inline void assert_list_leaf_cfs_rq(struct rq *rq) +{ + SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list); +} + +/* Iterate thr' all leaf cfs_rq's on a runqueue */ +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ + list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \ + leaf_cfs_rq_list) /* Do the two (enqueued) entities belong to the same group ? */ static inline struct cfs_rq * @@ -410,12 +427,6 @@ static inline struct task_struct *task_of(struct sched_entity *se) return container_of(se, struct task_struct, se); } -static inline struct rq *rq_of(struct cfs_rq *cfs_rq) -{ - return container_of(cfs_rq, struct rq, cfs); -} - - #define for_each_sched_entity(se) \ for (; se; se = NULL) @@ -438,16 +449,21 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return NULL; } -static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) +static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { + return true; } static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) { } -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) +static inline void assert_list_leaf_cfs_rq(struct rq *rq) +{ +} + +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ + for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos) static inline struct sched_entity *parent_entity(struct sched_entity *se) { @@ -686,9 +702,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) return calc_delta_fair(sched_slice(cfs_rq, se), se); } -#ifdef CONFIG_SMP #include "pelt.h" -#include "sched-pelt.h" +#ifdef CONFIG_SMP static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); @@ -744,8 +759,9 @@ static void attach_entity_cfs_rq(struct sched_entity *se); * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap) * if util_avg > util_avg_cap. */ -void post_init_entity_util_avg(struct sched_entity *se) +void post_init_entity_util_avg(struct task_struct *p) { + struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); struct sched_avg *sa = &se->avg; long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); @@ -763,22 +779,19 @@ void post_init_entity_util_avg(struct sched_entity *se) } } - if (entity_is_task(se)) { - struct task_struct *p = task_of(se); - if (p->sched_class != &fair_sched_class) { - /* - * For !fair tasks do: - * - update_cfs_rq_load_avg(now, cfs_rq); - attach_entity_load_avg(cfs_rq, se, 0); - switched_from_fair(rq, p); - * - * such that the next switched_to_fair() has the - * expected state. - */ - se->avg.last_update_time = cfs_rq_clock_task(cfs_rq); - return; - } + if (p->sched_class != &fair_sched_class) { + /* + * For !fair tasks do: + * + update_cfs_rq_load_avg(now, cfs_rq); + attach_entity_load_avg(cfs_rq, se, 0); + switched_from_fair(rq, p); + * + * such that the next switched_to_fair() has the + * expected state. + */ + se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq); + return; } attach_entity_cfs_rq(se); @@ -788,7 +801,7 @@ void post_init_entity_util_avg(struct sched_entity *se) void init_entity_runnable_average(struct sched_entity *se) { } -void post_init_entity_util_avg(struct sched_entity *se) +void post_init_entity_util_avg(struct task_struct *p) { } static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) @@ -1035,7 +1048,7 @@ unsigned int sysctl_numa_balancing_scan_size = 256; unsigned int sysctl_numa_balancing_scan_delay = 1000; struct numa_group { - atomic_t refcount; + refcount_t refcount; spinlock_t lock; /* nr_tasks, tasks */ int nr_tasks; @@ -1104,7 +1117,7 @@ static unsigned int task_scan_start(struct task_struct *p) unsigned long shared = group_faults_shared(ng); unsigned long private = group_faults_priv(ng); - period *= atomic_read(&ng->refcount); + period *= refcount_read(&ng->refcount); period *= shared + 1; period /= private + shared + 1; } @@ -1127,7 +1140,7 @@ static unsigned int task_scan_max(struct task_struct *p) unsigned long private = group_faults_priv(ng); unsigned long period = smax; - period *= atomic_read(&ng->refcount); + period *= refcount_read(&ng->refcount); period *= shared + 1; period /= private + shared + 1; @@ -2203,12 +2216,12 @@ static void task_numa_placement(struct task_struct *p) static inline int get_numa_group(struct numa_group *grp) { - return atomic_inc_not_zero(&grp->refcount); + return refcount_inc_not_zero(&grp->refcount); } static inline void put_numa_group(struct numa_group *grp) { - if (atomic_dec_and_test(&grp->refcount)) + if (refcount_dec_and_test(&grp->refcount)) kfree_rcu(grp, rcu); } @@ -2229,7 +2242,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (!grp) return; - atomic_set(&grp->refcount, 1); + refcount_set(&grp->refcount, 1); grp->active_nodes = 1; grp->max_faults_cpu = 0; spin_lock_init(&grp->lock); @@ -3122,7 +3135,7 @@ void set_task_rq_fair(struct sched_entity *se, p_last_update_time = prev->avg.last_update_time; n_last_update_time = next->avg.last_update_time; #endif - __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se); + __update_load_avg_blocked_se(p_last_update_time, se); se->avg.last_update_time = n_last_update_time; } @@ -3257,11 +3270,11 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf /* * runnable_sum can't be lower than running_sum - * As running sum is scale with CPU capacity wehreas the runnable sum - * is not we rescale running_sum 1st + * Rescale running sum to be in the same range as runnable sum + * running_sum is in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT] + * runnable_sum is in [0 : LOAD_AVG_MAX] */ - running_sum = se->avg.util_sum / - arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); + running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT; runnable_sum = max(runnable_sum, running_sum); load_sum = (s64)se_weight(se) * runnable_sum; @@ -3364,7 +3377,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum /** * update_cfs_rq_load_avg - update the cfs_rq's load/util averages - * @now: current time, as per cfs_rq_clock_task() + * @now: current time, as per cfs_rq_clock_pelt() * @cfs_rq: cfs_rq to update * * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) @@ -3409,7 +3422,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) decayed = 1; } - decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq); + decayed |= __update_load_avg_cfs_rq(now, cfs_rq); #ifndef CONFIG_64BIT smp_wmb(); @@ -3499,9 +3512,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s /* Update task and its cfs_rq load average */ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - u64 now = cfs_rq_clock_task(cfs_rq); - struct rq *rq = rq_of(cfs_rq); - int cpu = cpu_of(rq); + u64 now = cfs_rq_clock_pelt(cfs_rq); int decayed; /* @@ -3509,7 +3520,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s * track group sched_entity load average for task_h_load calc in migration */ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) - __update_load_avg_se(now, cpu, cfs_rq, se); + __update_load_avg_se(now, cfs_rq, se); decayed = update_cfs_rq_load_avg(now, cfs_rq); decayed |= propagate_entity_load_avg(se); @@ -3561,7 +3572,7 @@ void sync_entity_load_avg(struct sched_entity *se) u64 last_update_time; last_update_time = cfs_rq_last_update_time(cfs_rq); - __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se); + __update_load_avg_blocked_se(last_update_time, se); } /* @@ -3577,10 +3588,6 @@ void remove_entity_load_avg(struct sched_entity *se) * tasks cannot exit without having gone through wake_up_new_task() -> * post_init_entity_util_avg() which will have added things to the * cfs_rq, so we can remove unconditionally. - * - * Similarly for groups, they will have passed through - * post_init_entity_util_avg() before unregister_sched_fair_group() - * calls this. */ sync_entity_load_avg(se); @@ -3654,6 +3661,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) { long last_ewma_diff; struct util_est ue; + int cpu; if (!sched_feat(UTIL_EST)) return; @@ -3688,6 +3696,14 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) return; /* + * To avoid overestimation of actual task utilization, skip updates if + * we cannot grant there is idle time in this CPU. + */ + cpu = cpu_of(rq_of(cfs_rq)); + if (task_util(p) > capacity_orig_of(cpu)) + return; + + /* * Update Task's estimated utilization * * When *p completes an activation we can consolidate another sample @@ -4429,6 +4445,10 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) /* adjust cfs_rq_clock_task() */ cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - cfs_rq->throttled_clock_task; + + /* Add cfs_rq with already running entity in the list */ + if (cfs_rq->nr_running >= 1) + list_add_leaf_cfs_rq(cfs_rq); } return 0; @@ -4440,8 +4460,10 @@ static int tg_throttle_down(struct task_group *tg, void *data) struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; /* group is entering throttled state, stop time */ - if (!cfs_rq->throttle_count) + if (!cfs_rq->throttle_count) { cfs_rq->throttled_clock_task = rq_clock_task(rq); + list_del_leaf_cfs_rq(cfs_rq); + } cfs_rq->throttle_count++; return 0; @@ -4544,6 +4566,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) break; } + assert_list_leaf_cfs_rq(rq); + if (!se) add_nr_running(rq, task_delta); @@ -4565,7 +4589,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, struct rq *rq = rq_of(cfs_rq); struct rq_flags rf; - rq_lock(rq, &rf); + rq_lock_irqsave(rq, &rf); if (!cfs_rq_throttled(cfs_rq)) goto next; @@ -4582,7 +4606,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, unthrottle_cfs_rq(cfs_rq); next: - rq_unlock(rq, &rf); + rq_unlock_irqrestore(rq, &rf); if (!remaining) break; @@ -4598,7 +4622,7 @@ next: * period the timer is deactivated until scheduling resumes; cfs_b->idle is * used to track this state. */ -static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) +static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags) { u64 runtime, runtime_expires; int throttled; @@ -4640,11 +4664,11 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) { runtime = cfs_b->runtime; cfs_b->distribute_running = 1; - raw_spin_unlock(&cfs_b->lock); + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); /* we can't nest cfs_b->lock while distributing bandwidth */ runtime = distribute_cfs_runtime(cfs_b, runtime, runtime_expires); - raw_spin_lock(&cfs_b->lock); + raw_spin_lock_irqsave(&cfs_b->lock, flags); cfs_b->distribute_running = 0; throttled = !list_empty(&cfs_b->throttled_cfs_rq); @@ -4753,17 +4777,18 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) { u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); + unsigned long flags; u64 expires; /* confirm we're still not at a refresh boundary */ - raw_spin_lock(&cfs_b->lock); + raw_spin_lock_irqsave(&cfs_b->lock, flags); if (cfs_b->distribute_running) { - raw_spin_unlock(&cfs_b->lock); + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); return; } if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { - raw_spin_unlock(&cfs_b->lock); + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); return; } @@ -4774,18 +4799,18 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (runtime) cfs_b->distribute_running = 1; - raw_spin_unlock(&cfs_b->lock); + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); if (!runtime) return; runtime = distribute_cfs_runtime(cfs_b, runtime, expires); - raw_spin_lock(&cfs_b->lock); + raw_spin_lock_irqsave(&cfs_b->lock, flags); if (expires == cfs_b->runtime_expires) lsub_positive(&cfs_b->runtime, runtime); cfs_b->distribute_running = 0; - raw_spin_unlock(&cfs_b->lock); + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); } /* @@ -4863,20 +4888,21 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b = container_of(timer, struct cfs_bandwidth, period_timer); + unsigned long flags; int overrun; int idle = 0; - raw_spin_lock(&cfs_b->lock); + raw_spin_lock_irqsave(&cfs_b->lock, flags); for (;;) { overrun = hrtimer_forward_now(timer, cfs_b->period); if (!overrun) break; - idle = do_sched_cfs_period_timer(cfs_b, overrun); + idle = do_sched_cfs_period_timer(cfs_b, overrun, flags); } if (idle) cfs_b->period_active = 0; - raw_spin_unlock(&cfs_b->lock); + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; } @@ -4986,6 +5012,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) } #else /* CONFIG_CFS_BANDWIDTH */ + +static inline bool cfs_bandwidth_used(void) +{ + return false; +} + static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) { return rq_clock_task(rq_of(cfs_rq)); @@ -5177,6 +5209,23 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) } + if (cfs_bandwidth_used()) { + /* + * When bandwidth control is enabled; the cfs_rq_throttled() + * breaks in the above iteration can result in incomplete + * leaf list maintenance, resulting in triggering the assertion + * below. + */ + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + if (list_add_leaf_cfs_rq(cfs_rq)) + break; + } + } + + assert_list_leaf_cfs_rq(rq); + hrtick_update(rq); } @@ -5556,11 +5605,6 @@ static unsigned long capacity_of(int cpu) return cpu_rq(cpu)->cpu_capacity; } -static unsigned long capacity_orig_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_orig; -} - static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -6053,7 +6097,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int bool idle = true; for_each_cpu(cpu, cpu_smt_mask(core)) { - cpumask_clear_cpu(cpu, cpus); + __cpumask_clear_cpu(cpu, cpus); if (!available_idle_cpu(cpu)) idle = false; } @@ -6073,7 +6117,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int /* * Scan the local SMT mask for idle CPUs. */ -static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +static int select_idle_smt(struct task_struct *p, int target) { int cpu; @@ -6097,7 +6141,7 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s return -1; } -static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +static inline int select_idle_smt(struct task_struct *p, int target) { return -1; } @@ -6202,7 +6246,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if ((unsigned)i < nr_cpumask_bits) return i; - i = select_idle_smt(p, sd, target); + i = select_idle_smt(p, target); if ((unsigned)i < nr_cpumask_bits) return i; @@ -6608,7 +6652,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); - if (static_branch_unlikely(&sched_energy_present)) { + if (sched_energy_enabled()) { new_cpu = find_energy_efficient_cpu(p, prev_cpu); if (new_cpu >= 0) return new_cpu; @@ -7027,6 +7071,12 @@ idle: if (new_tasks > 0) goto again; + /* + * rq is about to be idle, check if we need to update the + * lost_idle_time of clock_pelt + */ + update_idle_rq_clock_pelt(rq); + return NULL; } @@ -7647,10 +7697,27 @@ static inline bool others_have_blocked(struct rq *rq) #ifdef CONFIG_FAIR_GROUP_SCHED +static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->load.weight) + return false; + + if (cfs_rq->avg.load_sum) + return false; + + if (cfs_rq->avg.util_sum) + return false; + + if (cfs_rq->avg.runnable_load_sum) + return false; + + return true; +} + static void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq, *pos; const struct sched_class *curr_class; struct rq_flags rf; bool done = true; @@ -7662,14 +7729,10 @@ static void update_blocked_averages(int cpu) * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details. */ - for_each_leaf_cfs_rq(rq, cfs_rq) { + for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) { struct sched_entity *se; - /* throttled entities do not contribute to load */ - if (throttled_hierarchy(cfs_rq)) - continue; - - if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq)) + if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) update_tg_load_avg(cfs_rq, 0); /* Propagate pending load changes to the parent, if any: */ @@ -7677,14 +7740,21 @@ static void update_blocked_averages(int cpu) if (se && !skip_blocked_update(se)) update_load_avg(cfs_rq_of(se), se, 0); + /* + * There can be a lot of idle CPU cgroups. Don't let fully + * decayed cfs_rqs linger on the list. + */ + if (cfs_rq_is_decayed(cfs_rq)) + list_del_leaf_cfs_rq(cfs_rq); + /* Don't need periodic decay once load/util_avg are null */ if (cfs_rq_has_blocked(cfs_rq)) done = false; } curr_class = rq->curr->sched_class; - update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class); - update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class); + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); update_irq_load_avg(rq, 0); /* Don't need periodic decay once load/util_avg are null */ if (others_have_blocked(rq)) @@ -7754,11 +7824,11 @@ static inline void update_blocked_averages(int cpu) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); - update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); + update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); curr_class = rq->curr->sched_class; - update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class); - update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class); + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); update_irq_load_avg(rq, 0); #ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; @@ -8452,9 +8522,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) if (sched_asym_prefer(busiest_cpu, env->dst_cpu)) return 0; - env->imbalance = DIV_ROUND_CLOSEST( - sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity, - SCHED_CAPACITY_SCALE); + env->imbalance = sds->busiest_stat.group_load; return 1; } @@ -8636,7 +8704,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) */ update_sd_lb_stats(env, &sds); - if (static_branch_unlikely(&sched_energy_present)) { + if (sched_energy_enabled()) { struct root_domain *rd = env->dst_rq->rd; if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) @@ -8827,21 +8895,25 @@ static struct rq *find_busiest_queue(struct lb_env *env, */ #define MAX_PINNED_INTERVAL 512 -static int need_active_balance(struct lb_env *env) +static inline bool +asym_active_balance(struct lb_env *env) { - struct sched_domain *sd = env->sd; + /* + * ASYM_PACKING needs to force migrate tasks from busy but + * lower priority CPUs in order to pack all tasks in the + * highest priority CPUs. + */ + return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) && + sched_asym_prefer(env->dst_cpu, env->src_cpu); +} - if (env->idle == CPU_NEWLY_IDLE) { +static inline bool +voluntary_active_balance(struct lb_env *env) +{ + struct sched_domain *sd = env->sd; - /* - * ASYM_PACKING needs to force migrate tasks from busy but - * lower priority CPUs in order to pack all tasks in the - * highest priority CPUs. - */ - if ((sd->flags & SD_ASYM_PACKING) && - sched_asym_prefer(env->dst_cpu, env->src_cpu)) - return 1; - } + if (asym_active_balance(env)) + return 1; /* * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. @@ -8859,6 +8931,16 @@ static int need_active_balance(struct lb_env *env) if (env->src_grp_type == group_misfit_task) return 1; + return 0; +} + +static int need_active_balance(struct lb_env *env) +{ + struct sched_domain *sd = env->sd; + + if (voluntary_active_balance(env)) + return 1; + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } @@ -9023,7 +9105,7 @@ more_balance: if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { /* Prevent to re-select dst_cpu via env's CPUs */ - cpumask_clear_cpu(env.dst_cpu, env.cpus); + __cpumask_clear_cpu(env.dst_cpu, env.cpus); env.dst_rq = cpu_rq(env.new_dst_cpu); env.dst_cpu = env.new_dst_cpu; @@ -9050,7 +9132,7 @@ more_balance: /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { - cpumask_clear_cpu(cpu_of(busiest), cpus); + __cpumask_clear_cpu(cpu_of(busiest), cpus); /* * Attempting to continue load balancing at the current * sched_domain level only makes sense if there are @@ -9120,7 +9202,7 @@ more_balance: } else sd->nr_balance_failed = 0; - if (likely(!active_balance)) { + if (likely(!active_balance) || voluntary_active_balance(&env)) { /* We were unbalanced, so reset the balancing interval */ sd->balance_interval = sd->min_interval; } else { @@ -9469,15 +9551,8 @@ static void kick_ilb(unsigned int flags) } /* - * Current heuristic for kicking the idle load balancer in the presence - * of an idle cpu in the system. - * - This rq has more than one task. - * - This rq has at least one CFS task and the capacity of the CPU is - * significantly reduced because of RT tasks or IRQs. - * - At parent of LLC scheduler domain level, this cpu's scheduler group has - * multiple busy cpu. - * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler - * domain span are idle. + * Current decision point for kicking the idle load balancer in the presence + * of idle CPUs in the system. */ static void nohz_balancer_kick(struct rq *rq) { @@ -9519,8 +9594,13 @@ static void nohz_balancer_kick(struct rq *rq) sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); if (sds) { /* - * XXX: write a coherent comment on why we do this. - * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com + * If there is an imbalance between LLC domains (IOW we could + * increase the overall cache use), we need some less-loaded LLC + * domain to pull some load. Likewise, we may need to spread + * load within the current LLC domain (e.g. packed SMT cores but + * other CPUs are idle). We can't really know from here how busy + * the others are - so just get a nohz balance going if it looks + * like this LLC domain has tasks we could move. */ nr_busy = atomic_read(&sds->nr_busy_cpus); if (nr_busy > 1) { @@ -9533,7 +9613,7 @@ static void nohz_balancer_kick(struct rq *rq) sd = rcu_dereference(rq->sd); if (sd) { if ((rq->cfs.h_nr_running >= 1) && - check_cpu_capacity(rq, sd)) { + check_cpu_capacity(rq, sd)) { flags = NOHZ_KICK_MASK; goto unlock; } @@ -9541,11 +9621,7 @@ static void nohz_balancer_kick(struct rq *rq) sd = rcu_dereference(per_cpu(sd_asym_packing, cpu)); if (sd) { - for_each_cpu(i, sched_domain_span(sd)) { - if (i == cpu || - !cpumask_test_cpu(i, nohz.idle_cpus_mask)) - continue; - + for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { if (sched_asym_prefer(i, cpu)) { flags = NOHZ_KICK_MASK; goto unlock; @@ -10546,10 +10622,10 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_SCHED_DEBUG void print_cfs_stats(struct seq_file *m, int cpu) { - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq, *pos; rcu_read_lock(); - for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) + for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos) print_cfs_rq(m, cpu, cfs_rq); rcu_read_unlock(); } diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c @@ -80,7 +80,7 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags) cpumask_andnot(housekeeping_mask, cpu_possible_mask, non_housekeeping_mask); if (cpumask_empty(housekeeping_mask)) - cpumask_set_cpu(smp_processor_id(), housekeeping_mask); + __cpumask_set_cpu(smp_processor_id(), housekeeping_mask); } else { cpumask_var_t tmp; diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c @@ -26,7 +26,6 @@ #include <linux/sched.h> #include "sched.h" -#include "sched-pelt.h" #include "pelt.h" /* @@ -106,16 +105,12 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) * n=1 */ static __always_inline u32 -accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, +accumulate_sum(u64 delta, struct sched_avg *sa, unsigned long load, unsigned long runnable, int running) { - unsigned long scale_freq, scale_cpu; u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ u64 periods; - scale_freq = arch_scale_freq_capacity(cpu); - scale_cpu = arch_scale_cpu_capacity(NULL, cpu); - delta += sa->period_contrib; periods = delta / 1024; /* A period is 1024us (~1ms) */ @@ -137,13 +132,12 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, } sa->period_contrib = delta; - contrib = cap_scale(contrib, scale_freq); if (load) sa->load_sum += load * contrib; if (runnable) sa->runnable_load_sum += runnable * contrib; if (running) - sa->util_sum += contrib * scale_cpu; + sa->util_sum += contrib << SCHED_CAPACITY_SHIFT; return periods; } @@ -177,7 +171,7 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] */ static __always_inline int -___update_load_sum(u64 now, int cpu, struct sched_avg *sa, +___update_load_sum(u64 now, struct sched_avg *sa, unsigned long load, unsigned long runnable, int running) { u64 delta; @@ -221,7 +215,7 @@ ___update_load_sum(u64 now, int cpu, struct sched_avg *sa, * Step 1: accumulate *_sum since last_update_time. If we haven't * crossed period boundaries, finish. */ - if (!accumulate_sum(delta, cpu, sa, load, runnable, running)) + if (!accumulate_sum(delta, sa, load, runnable, running)) return 0; return 1; @@ -267,9 +261,9 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna * runnable_load_avg = \Sum se->avg.runable_load_avg */ -int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) +int __update_load_avg_blocked_se(u64 now, struct sched_entity *se) { - if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) { + if (___update_load_sum(now, &se->avg, 0, 0, 0)) { ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); return 1; } @@ -277,9 +271,9 @@ int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) return 0; } -int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) +int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq, + if (___update_load_sum(now, &se->avg, !!se->on_rq, !!se->on_rq, cfs_rq->curr == se)) { ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); @@ -290,9 +284,9 @@ int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_e return 0; } -int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) +int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq) { - if (___update_load_sum(now, cpu, &cfs_rq->avg, + if (___update_load_sum(now, &cfs_rq->avg, scale_load_down(cfs_rq->load.weight), scale_load_down(cfs_rq->runnable_weight), cfs_rq->curr != NULL)) { @@ -317,7 +311,7 @@ int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) { - if (___update_load_sum(now, rq->cpu, &rq->avg_rt, + if (___update_load_sum(now, &rq->avg_rt, running, running, running)) { @@ -340,7 +334,7 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) { - if (___update_load_sum(now, rq->cpu, &rq->avg_dl, + if (___update_load_sum(now, &rq->avg_dl, running, running, running)) { @@ -365,22 +359,31 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) int update_irq_load_avg(struct rq *rq, u64 running) { int ret = 0; + + /* + * We can't use clock_pelt because irq time is not accounted in + * clock_task. Instead we directly scale the running time to + * reflect the real amount of computation + */ + running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq))); + running = cap_scale(running, arch_scale_cpu_capacity(NULL, cpu_of(rq))); + /* * We know the time that has been used by interrupt since last update * but we don't when. Let be pessimistic and assume that interrupt has * happened just before the update. This is not so far from reality * because interrupt will most probably wake up task and trig an update - * of rq clock during which the metric si updated. + * of rq clock during which the metric is updated. * We start to decay with normal context time and then we add the * interrupt context time. * We can safely remove running from rq->clock because * rq->clock += delta with delta >= running */ - ret = ___update_load_sum(rq->clock - running, rq->cpu, &rq->avg_irq, + ret = ___update_load_sum(rq->clock - running, &rq->avg_irq, 0, 0, 0); - ret += ___update_load_sum(rq->clock, rq->cpu, &rq->avg_irq, + ret += ___update_load_sum(rq->clock, &rq->avg_irq, 1, 1, 1); diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h @@ -1,8 +1,9 @@ #ifdef CONFIG_SMP +#include "sched-pelt.h" -int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se); -int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se); -int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq); +int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); +int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se); +int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); @@ -42,6 +43,101 @@ static inline void cfs_se_util_change(struct sched_avg *avg) WRITE_ONCE(avg->util_est.enqueued, enqueued); } +/* + * The clock_pelt scales the time to reflect the effective amount of + * computation done during the running delta time but then sync back to + * clock_task when rq is idle. + * + * + * absolute time | 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12|13|14|15|16 + * @ max capacity ------******---------------******--------------- + * @ half capacity ------************---------************--------- + * clock pelt | 1| 2| 3| 4| 7| 8| 9| 10| 11|14|15|16 + * + */ +static inline void update_rq_clock_pelt(struct rq *rq, s64 delta) +{ + if (unlikely(is_idle_task(rq->curr))) { + /* The rq is idle, we can sync to clock_task */ + rq->clock_pelt = rq_clock_task(rq); + return; + } + + /* + * When a rq runs at a lower compute capacity, it will need + * more time to do the same amount of work than at max + * capacity. In order to be invariant, we scale the delta to + * reflect how much work has been really done. + * Running longer results in stealing idle time that will + * disturb the load signal compared to max capacity. This + * stolen idle time will be automatically reflected when the + * rq will be idle and the clock will be synced with + * rq_clock_task. + */ + + /* + * Scale the elapsed time to reflect the real amount of + * computation + */ + delta = cap_scale(delta, arch_scale_cpu_capacity(NULL, cpu_of(rq))); + delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq))); + + rq->clock_pelt += delta; +} + +/* + * When rq becomes idle, we have to check if it has lost idle time + * because it was fully busy. A rq is fully used when the /Sum util_sum + * is greater or equal to: + * (LOAD_AVG_MAX - 1024 + rq->cfs.avg.period_contrib) << SCHED_CAPACITY_SHIFT; + * For optimization and computing rounding purpose, we don't take into account + * the position in the current window (period_contrib) and we use the higher + * bound of util_sum to decide. + */ +static inline void update_idle_rq_clock_pelt(struct rq *rq) +{ + u32 divider = ((LOAD_AVG_MAX - 1024) << SCHED_CAPACITY_SHIFT) - LOAD_AVG_MAX; + u32 util_sum = rq->cfs.avg.util_sum; + util_sum += rq->avg_rt.util_sum; + util_sum += rq->avg_dl.util_sum; + + /* + * Reflecting stolen time makes sense only if the idle + * phase would be present at max capacity. As soon as the + * utilization of a rq has reached the maximum value, it is + * considered as an always runnig rq without idle time to + * steal. This potential idle time is considered as lost in + * this case. We keep track of this lost idle time compare to + * rq's clock_task. + */ + if (util_sum >= divider) + rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt; +} + +static inline u64 rq_clock_pelt(struct rq *rq) +{ + lockdep_assert_held(&rq->lock); + assert_clock_updated(rq); + + return rq->clock_pelt - rq->lost_idle_time; +} + +#ifdef CONFIG_CFS_BANDWIDTH +/* rq->task_clock normalized against any time this cfs_rq has spent throttled */ +static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) +{ + if (unlikely(cfs_rq->throttle_count)) + return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; + + return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; +} +#else +static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) +{ + return rq_clock_pelt(rq_of(cfs_rq)); +} +#endif + #else static inline int @@ -67,6 +163,18 @@ update_irq_load_avg(struct rq *rq, u64 running) { return 0; } + +static inline u64 rq_clock_pelt(struct rq *rq) +{ + return rq_clock_task(rq); +} + +static inline void +update_rq_clock_pelt(struct rq *rq, s64 delta) { } + +static inline void +update_idle_rq_clock_pelt(struct rq *rq) { } + #endif diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c @@ -1587,7 +1587,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * rt task */ if (rq->curr->sched_class != &rt_sched_class) - update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); return p; } @@ -1596,7 +1596,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) { update_curr_rt(rq); - update_rt_rq_load_avg(rq_clock_task(rq), rq, 1); + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1); /* * The previous task needs to be made eligible for pushing @@ -2325,7 +2325,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) struct sched_rt_entity *rt_se = &p->rt; update_curr_rt(rq); - update_rt_rq_load_avg(rq_clock_task(rq), rq, 1); + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1); watchdog(rq, p); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h @@ -861,7 +861,10 @@ struct rq { unsigned int clock_update_flags; u64 clock; - u64 clock_task; + /* Ensure that all clocks are in the same cache line */ + u64 clock_task ____cacheline_aligned; + u64 clock_pelt; + unsigned long lost_idle_time; atomic_t nr_iowait; @@ -951,6 +954,22 @@ struct rq { #endif }; +#ifdef CONFIG_FAIR_GROUP_SCHED + +/* CPU runqueue to which this cfs_rq is attached */ +static inline struct rq *rq_of(struct cfs_rq *cfs_rq) +{ + return cfs_rq->rq; +} + +#else + +static inline struct rq *rq_of(struct cfs_rq *cfs_rq) +{ + return container_of(cfs_rq, struct rq, cfs); +} +#endif + static inline int cpu_of(struct rq *rq) { #ifdef CONFIG_SMP @@ -1460,9 +1479,9 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) */ smp_wmb(); #ifdef CONFIG_THREAD_INFO_IN_TASK - p->cpu = cpu; + WRITE_ONCE(p->cpu, cpu); #else - task_thread_info(p)->cpu = cpu; + WRITE_ONCE(task_thread_info(p)->cpu, cpu); #endif p->wake_cpu = cpu; #endif @@ -1563,7 +1582,7 @@ static inline int task_on_rq_queued(struct task_struct *p) static inline int task_on_rq_migrating(struct task_struct *p) { - return p->on_rq == TASK_ON_RQ_MIGRATING; + return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; } /* @@ -1781,7 +1800,7 @@ extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); -extern void post_init_entity_util_avg(struct sched_entity *se); +extern void post_init_entity_util_avg(struct task_struct *p); #ifdef CONFIG_NO_HZ_FULL extern bool sched_can_stop_tick(struct rq *rq); @@ -2211,6 +2230,13 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} # define arch_scale_freq_invariant() false #endif +#ifdef CONFIG_SMP +static inline unsigned long capacity_orig_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig; +} +#endif + #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL /** * enum schedutil_type - CPU utilization type @@ -2299,11 +2325,19 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned #endif #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) + #define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus))) -#else + +DECLARE_STATIC_KEY_FALSE(sched_energy_present); + +static inline bool sched_energy_enabled(void) +{ + return static_branch_unlikely(&sched_energy_present); +} + +#else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ + #define perf_domain_span(pd) NULL -#endif +static inline bool sched_energy_enabled(void) { return false; } -#ifdef CONFIG_SMP -extern struct static_key_false sched_energy_present; -#endif +#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c @@ -201,11 +201,37 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) return 1; } -DEFINE_STATIC_KEY_FALSE(sched_energy_present); #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) +DEFINE_STATIC_KEY_FALSE(sched_energy_present); +unsigned int sysctl_sched_energy_aware = 1; DEFINE_MUTEX(sched_energy_mutex); bool sched_energy_update; +#ifdef CONFIG_PROC_SYSCTL +int sched_energy_aware_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret, state; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (!ret && write) { + state = static_branch_unlikely(&sched_energy_present); + if (state != sysctl_sched_energy_aware) { + mutex_lock(&sched_energy_mutex); + sched_energy_update = 1; + rebuild_sched_domains(); + sched_energy_update = 0; + mutex_unlock(&sched_energy_mutex); + } + } + + return ret; +} +#endif + static void free_pd(struct perf_domain *pd) { struct perf_domain *tmp; @@ -322,6 +348,9 @@ static bool build_perf_domains(const struct cpumask *cpu_map) struct cpufreq_policy *policy; struct cpufreq_governor *gov; + if (!sysctl_sched_energy_aware) + goto free; + /* EAS is enabled for asymmetric CPU capacity topologies. */ if (!per_cpu(sd_asym_cpucapacity, cpu)) { if (sched_debug()) { @@ -676,7 +705,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) } struct s_data { - struct sched_domain ** __percpu sd; + struct sched_domain * __percpu *sd; struct root_domain *rd; }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c @@ -472,6 +472,17 @@ static struct ctl_table kern_table[] = { .extra1 = &one, }, #endif +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) + { + .procname = "sched_energy_aware", + .data = &sysctl_sched_energy_aware, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_energy_aware_handler, + .extra1 = &zero, + .extra2 = &one, + }, +#endif #ifdef CONFIG_PROVE_LOCKING { .procname = "prove_locking",