whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit 3717f613f48df0222311f974cf8a06c8a6c97bae
parent b1b988a6a035212f5ea205155c49ce449beedee8
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Tue,  5 Mar 2019 14:49:11 -0800

Merge branch 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull RCU updates from Ingo Molnar:
 "The main RCU related changes in this cycle were:

   - Additional cleanups after RCU flavor consolidation

   - Grace-period forward-progress cleanups and improvements

   - Documentation updates

   - Miscellaneous fixes

   - spin_is_locked() conversions to lockdep

   - SPDX changes to RCU source and header files

   - SRCU updates

   - Torture-test updates, including nolibc updates and moving nolibc to
     tools/include"

* 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (71 commits)
  locking/locktorture: Convert to SPDX license identifier
  linux/torture: Convert to SPDX license identifier
  torture: Convert to SPDX license identifier
  linux/srcu: Convert to SPDX license identifier
  linux/rcutree: Convert to SPDX license identifier
  linux/rcutiny: Convert to SPDX license identifier
  linux/rcu_sync: Convert to SPDX license identifier
  linux/rcu_segcblist: Convert to SPDX license identifier
  linux/rcupdate: Convert to SPDX license identifier
  linux/rcu_node_tree: Convert to SPDX license identifier
  rcu/update: Convert to SPDX license identifier
  rcu/tree: Convert to SPDX license identifier
  rcu/tiny: Convert to SPDX license identifier
  rcu/sync: Convert to SPDX license identifier
  rcu/srcu: Convert to SPDX license identifier
  rcu/rcutorture: Convert to SPDX license identifier
  rcu/rcu_segcblist: Convert to SPDX license identifier
  rcu/rcuperf: Convert to SPDX license identifier
  rcu/rcu.h: Convert to SPDX license identifier
  RCU/torture.txt: Remove section MODULE PARAMETERS
  ...

Diffstat:
MDocumentation/RCU/Design/Expedited-Grace-Periods/ExpSchedFlow.svg | 18+++++++++++-------
MDocumentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html | 26+++++++++++++-------------
MDocumentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.html | 6+++---
MDocumentation/RCU/Design/Memory-Ordering/TreeRCU-callback-invocation.svg | 2+-
MDocumentation/RCU/Design/Memory-Ordering/TreeRCU-gp.svg | 8++++----
MDocumentation/RCU/Design/Memory-Ordering/TreeRCU-qs.svg | 6+++---
MDocumentation/RCU/Design/Requirements/Requirements.html | 20+++++++++++++-------
MDocumentation/RCU/stallwarn.txt | 15++++++++-------
MDocumentation/RCU/torture.txt | 169+------------------------------------------------------------------------------
MDocumentation/RCU/whatisRCU.txt | 4++--
MDocumentation/admin-guide/kernel-parameters.txt | 32+++++++++++++++++++-------------
MMAINTAINERS | 6++++++
Minclude/asm-generic/bug.h | 3---
Minclude/linux/rcu_node_tree.h | 17++---------------
Minclude/linux/rcu_segcblist.h | 17++---------------
Minclude/linux/rcu_sync.h | 15+--------------
Minclude/linux/rcupdate.h | 91++++++++++++-------------------------------------------------------------------
Minclude/linux/rcutiny.h | 17++---------------
Minclude/linux/rcutree.h | 19+++----------------
Minclude/linux/srcu.h | 18+++---------------
Minclude/linux/srcutiny.h | 17++---------------
Minclude/linux/srcutree.h | 20++++----------------
Minclude/linux/torture.h | 20++++----------------
Mkernel/locking/locktorture.c | 21++++-----------------
Mkernel/rcu/rcu.h | 21+++------------------
Mkernel/rcu/rcu_segcblist.c | 17++---------------
Mkernel/rcu/rcu_segcblist.h | 17++---------------
Mkernel/rcu/rcuperf.c | 27++++++++++-----------------
Mkernel/rcu/rcutorture.c | 59+++++++++++++++++++++++++++++++++++------------------------
Mkernel/rcu/srcutiny.c | 17++---------------
Mkernel/rcu/srcutree.c | 72++++++++++++++++++++++++++----------------------------------------------
Mkernel/rcu/sync.c | 15+--------------
Mkernel/rcu/tiny.c | 19+++----------------
Mkernel/rcu/tree.c | 267++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------
Mkernel/rcu/tree.h | 53++++++++++++++---------------------------------------
Mkernel/rcu/tree_exp.h | 201++++++++++++++++++++++++++++---------------------------------------------------
Mkernel/rcu/tree_plugin.h | 238+++++++++++++++++++------------------------------------------------------------
Mkernel/rcu/update.c | 17++---------------
Mkernel/sched/cpufreq.c | 4++--
Mkernel/sched/cpufreq_schedutil.c | 2+-
Mkernel/sched/sched.h | 2+-
Mkernel/sched/topology.c | 4++--
Mkernel/time/timer.c | 2+-
Mkernel/torture.c | 25++++++++-----------------
Atools/include/nolibc/nolibc.h | 2263+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atools/memory-model/.gitignore | 1+
Mtools/memory-model/README | 2++
Mtools/memory-model/linux-kernel.bell | 3++-
Mtools/memory-model/linux-kernel.cat | 4+++-
Mtools/memory-model/linux-kernel.def | 1+
Atools/memory-model/scripts/README | 70++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtools/memory-model/scripts/checkalllitmus.sh | 53+++++++++++++++++++++++------------------------------
Atools/memory-model/scripts/checkghlitmus.sh | 65+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtools/memory-model/scripts/checklitmus.sh | 74+++++++++++---------------------------------------------------------------
Atools/memory-model/scripts/checklitmushist.sh | 60++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atools/memory-model/scripts/cmplitmushist.sh | 87+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atools/memory-model/scripts/initlitmushist.sh | 68++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atools/memory-model/scripts/judgelitmus.sh | 78++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atools/memory-model/scripts/newlitmushist.sh | 61+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atools/memory-model/scripts/parseargs.sh | 136+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atools/memory-model/scripts/runlitmushist.sh | 87+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtools/testing/selftests/rcutorture/bin/mkinitrd.sh | 27+++++++++++++++++----------
Dtools/testing/selftests/rcutorture/bin/nolibc.h | 2197-------------------------------------------------------------------------------
Mvirt/kvm/kvm_main.c | 2+-
64 files changed, 3566 insertions(+), 3439 deletions(-)

diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/ExpSchedFlow.svg b/Documentation/RCU/Design/Expedited-Grace-Periods/ExpSchedFlow.svg @@ -328,13 +328,13 @@ inkscape:window-height="1148" id="namedview90" showgrid="true" - inkscape:zoom="0.80021373" - inkscape:cx="462.49289" - inkscape:cy="473.6718" + inkscape:zoom="0.69092787" + inkscape:cx="476.34085" + inkscape:cy="712.80957" inkscape:window-x="770" inkscape:window-y="24" inkscape:window-maximized="0" - inkscape:current-layer="g4114-9-3-9" + inkscape:current-layer="g4" inkscape:snap-grids="false" fit-margin-top="5" fit-margin-right="5" @@ -813,14 +813,18 @@ <text sodipodi:linespacing="125%" id="text4110-5-7-6-2-4-0" - y="841.88086" + y="670.74316" x="1460.1007" style="font-size:267.24359131px;font-style:normal;font-weight:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans" xml:space="preserve"><tspan - y="841.88086" + y="670.74316" + x="1460.1007" + sodipodi:role="line" + id="tspan4925-1-2-4-5">Request</tspan><tspan + y="1004.7976" x="1460.1007" sodipodi:role="line" - id="tspan4925-1-2-4-5">reched_cpu()</tspan></text> + id="tspan3100">context switch</tspan></text> </g> </g> </svg> diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html @@ -72,10 +72,10 @@ will ignore it because idle and offline CPUs are already residing in quiescent states. Otherwise, the expedited grace period will use <tt>smp_call_function_single()</tt> to send the CPU an IPI, which -is handled by <tt>sync_rcu_exp_handler()</tt>. +is handled by <tt>rcu_exp_handler()</tt>. <p> -However, because this is preemptible RCU, <tt>sync_rcu_exp_handler()</tt> +However, because this is preemptible RCU, <tt>rcu_exp_handler()</tt> can check to see if the CPU is currently running in an RCU read-side critical section. If not, the handler can immediately report a quiescent state. @@ -145,19 +145,18 @@ expedited grace period is shown in the following diagram: <p><img src="ExpSchedFlow.svg" alt="ExpSchedFlow.svg" width="55%"> <p> -As with RCU-preempt's <tt>synchronize_rcu_expedited()</tt>, +As with RCU-preempt, RCU-sched's <tt>synchronize_sched_expedited()</tt> ignores offline and idle CPUs, again because they are in remotely detectable quiescent states. -However, the <tt>synchronize_rcu_expedited()</tt> handler -is <tt>sync_sched_exp_handler()</tt>, and because the +However, because the <tt>rcu_read_lock_sched()</tt> and <tt>rcu_read_unlock_sched()</tt> leave no trace of their invocation, in general it is not possible to tell whether or not the current CPU is in an RCU read-side critical section. -The best that <tt>sync_sched_exp_handler()</tt> can do is to check +The best that RCU-sched's <tt>rcu_exp_handler()</tt> can do is to check for idle, on the off-chance that the CPU went idle while the IPI was in flight. -If the CPU is idle, then <tt>sync_sched_exp_handler()</tt> reports +If the CPU is idle, then <tt>rcu_exp_handler()</tt> reports the quiescent state. <p> Otherwise, the handler forces a future context switch by setting the @@ -298,19 +297,18 @@ Instead, the task pushing the grace period forward will include the idle CPUs in the mask passed to <tt>rcu_report_exp_cpu_mult()</tt>. <p> -For RCU-sched, there is an additional check for idle in the IPI -handler, <tt>sync_sched_exp_handler()</tt>. +For RCU-sched, there is an additional check: If the IPI has interrupted the idle loop, then -<tt>sync_sched_exp_handler()</tt> invokes <tt>rcu_report_exp_rdp()</tt> +<tt>rcu_exp_handler()</tt> invokes <tt>rcu_report_exp_rdp()</tt> to report the corresponding quiescent state. <p> For RCU-preempt, there is no specific check for idle in the -IPI handler (<tt>sync_rcu_exp_handler()</tt>), but because +IPI handler (<tt>rcu_exp_handler()</tt>), but because RCU read-side critical sections are not permitted within the -idle loop, if <tt>sync_rcu_exp_handler()</tt> sees that the CPU is within +idle loop, if <tt>rcu_exp_handler()</tt> sees that the CPU is within RCU read-side critical section, the CPU cannot possibly be idle. -Otherwise, <tt>sync_rcu_exp_handler()</tt> invokes +Otherwise, <tt>rcu_exp_handler()</tt> invokes <tt>rcu_report_exp_rdp()</tt> to report the corresponding quiescent state, regardless of whether or not that quiescent state was due to the CPU being idle. @@ -625,6 +623,8 @@ checks, but only during the mid-boot dead zone. <p> With this refinement, synchronous grace periods can now be used from task context pretty much any time during the life of the kernel. +That is, aside from some points in the suspend, hibernate, or shutdown +code path. <h3><a name="Summary"> Summary</a></h3> diff --git a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.html b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.html @@ -485,13 +485,13 @@ section that the grace period must wait on. noted by <tt>rcu_node_context_switch()</tt> on the left. On the other hand, if the CPU takes a scheduler-clock interrupt while executing in usermode, a quiescent state will be noted by -<tt>rcu_check_callbacks()</tt> on the right. +<tt>rcu_sched_clock_irq()</tt> on the right. Either way, the passage through a quiescent state will be noted in a per-CPU variable. <p>The next time an <tt>RCU_SOFTIRQ</tt> handler executes on this CPU (for example, after the next scheduler-clock -interrupt), <tt>__rcu_process_callbacks()</tt> will invoke +interrupt), <tt>rcu_core()</tt> will invoke <tt>rcu_check_quiescent_state()</tt>, which will notice the recorded quiescent state, and invoke <tt>rcu_report_qs_rdp()</tt>. @@ -651,7 +651,7 @@ to end. These callbacks are identified by <tt>rcu_advance_cbs()</tt>, which is usually invoked by <tt>__note_gp_changes()</tt>. As shown in the diagram below, this invocation can be triggered by -the scheduling-clock interrupt (<tt>rcu_check_callbacks()</tt> on +the scheduling-clock interrupt (<tt>rcu_sched_clock_irq()</tt> on the left) or by idle entry (<tt>rcu_cleanup_after_idle()</tt> on the right, but only for kernels build with <tt>CONFIG_RCU_FAST_NO_HZ=y</tt>). diff --git a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-callback-invocation.svg b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-callback-invocation.svg @@ -349,7 +349,7 @@ font-weight="bold" font-size="192" id="text202-7-5" - style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_check_callbacks()</text> + style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_sched_clock_irq()</text> <rect x="7069.6187" y="5087.4678" diff --git a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp.svg b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp.svg @@ -3902,7 +3902,7 @@ font-style="normal" y="-4418.6582" x="3745.7725" - xml:space="preserve">rcu_check_callbacks()</text> + xml:space="preserve">rcu_sched_clock_irq()</text> </g> <g transform="translate(-850.30204,55463.106)" @@ -3924,7 +3924,7 @@ font-style="normal" y="-4418.6582" x="3745.7725" - xml:space="preserve">rcu_process_callbacks()</text> + xml:space="preserve">rcu_core()</text> <text style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier" id="text202-7-5-3-27-0" @@ -3933,7 +3933,7 @@ font-style="normal" y="-4165.7954" x="3745.7725" - xml:space="preserve">rcu_check_quiescent_state())</text> + xml:space="preserve">rcu_check_quiescent_state()</text> <text style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier" id="text202-7-5-3-27-0-9" @@ -4968,7 +4968,7 @@ font-weight="bold" font-size="192" id="text202-7-5-19" - style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_check_callbacks()</text> + style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_sched_clock_irq()</text> <rect x="5314.2671" y="82817.688" diff --git a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-qs.svg b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-qs.svg @@ -775,7 +775,7 @@ font-style="normal" y="-4418.6582" x="3745.7725" - xml:space="preserve">rcu_check_callbacks()</text> + xml:space="preserve">rcu_sched_clock_irq()</text> </g> <g transform="translate(399.7744,828.86448)" @@ -797,7 +797,7 @@ font-style="normal" y="-4418.6582" x="3745.7725" - xml:space="preserve">rcu_process_callbacks()</text> + xml:space="preserve">rcu_core()</text> <text style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier" id="text202-7-5-3-27-0" @@ -806,7 +806,7 @@ font-style="normal" y="-4165.7954" x="3745.7725" - xml:space="preserve">rcu_check_quiescent_state())</text> + xml:space="preserve">rcu_check_quiescent_state()</text> <text style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier" id="text202-7-5-3-27-0-9" diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html @@ -3099,7 +3099,7 @@ If you block forever in one of a given domain's SRCU read-side critical sections, then that domain's grace periods will also be blocked forever. Of course, one good way to block forever is to deadlock, which can happen if any operation in a given domain's SRCU read-side critical -section can block waiting, either directly or indirectly, for that domain's +section can wait, either directly or indirectly, for that domain's grace period to elapse. For example, this results in a self-deadlock: @@ -3139,12 +3139,18 @@ API, which, in combination with <tt>srcu_read_unlock()</tt>, guarantees a full memory barrier. <p> -Also unlike other RCU flavors, SRCU's callbacks-wait function -<tt>srcu_barrier()</tt> may be invoked from CPU-hotplug notifiers, -though this is not necessarily a good idea. -The reason that this is possible is that SRCU is insensitive -to whether or not a CPU is online, which means that <tt>srcu_barrier()</tt> -need not exclude CPU-hotplug operations. +Also unlike other RCU flavors, <tt>synchronize_srcu()</tt> may <b>not</b> +be invoked from CPU-hotplug notifiers, due to the fact that SRCU grace +periods make use of timers and the possibility of timers being temporarily +&ldquo;stranded&rdquo; on the outgoing CPU. +This stranding of timers means that timers posted to the outgoing CPU +will not fire until late in the CPU-hotplug process. +The problem is that if a notifier is waiting on an SRCU grace period, +that grace period is waiting on a timer, and that timer is stranded on the +outgoing CPU, then the notifier will never be awakened, in other words, +deadlock has occurred. +This same situation of course also prohibits <tt>srcu_barrier()</tt> +from being invoked from CPU-hotplug notifiers. <p> SRCU also differs from other RCU flavors in that SRCU's expedited and diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt @@ -219,17 +219,18 @@ an estimate of the total number of RCU callbacks queued across all CPUs In kernels with CONFIG_RCU_FAST_NO_HZ, more information is printed for each CPU: - 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 nonlazy_posted: 25 .D + 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 Nonlazy posted: ..D The "last_accelerate:" prints the low-order 16 bits (in hex) of the jiffies counter when this CPU last invoked rcu_try_advance_all_cbs() from rcu_needs_cpu() or last invoked rcu_accelerate_cbs() from -rcu_prepare_for_idle(). The "nonlazy_posted:" prints the number -of non-lazy callbacks posted since the last call to rcu_needs_cpu(). -Finally, an "L" indicates that there are currently no non-lazy callbacks -("." is printed otherwise, as shown above) and "D" indicates that -dyntick-idle processing is enabled ("." is printed otherwise, for example, -if disabled via the "nohz=" kernel boot parameter). +rcu_prepare_for_idle(). The "Nonlazy posted:" indicates lazy-callback +status, so that an "l" indicates that all callbacks were lazy at the start +of the last idle period and an "L" indicates that there are currently +no non-lazy callbacks (in both cases, "." is printed otherwise, as +shown above) and "D" indicates that dyntick-idle processing is enabled +("." is printed otherwise, for example, if disabled via the "nohz=" +kernel boot parameter). If the grace period ends just as the stall warning starts printing, there will be a spurious stall-warning message, which will include diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt @@ -10,173 +10,8 @@ status messages via printk(), which can be examined via the dmesg command (perhaps grepping for "torture"). The test is started when the module is loaded, and stops when the module is unloaded. - -MODULE PARAMETERS - -This module has the following parameters: - -fqs_duration Duration (in microseconds) of artificially induced bursts - of force_quiescent_state() invocations. In RCU - implementations having force_quiescent_state(), these - bursts help force races between forcing a given grace - period and that grace period ending on its own. - -fqs_holdoff Holdoff time (in microseconds) between consecutive calls - to force_quiescent_state() within a burst. - -fqs_stutter Wait time (in seconds) between consecutive bursts - of calls to force_quiescent_state(). - -gp_normal Make the fake writers use normal synchronous grace-period - primitives. - -gp_exp Make the fake writers use expedited synchronous grace-period - primitives. If both gp_normal and gp_exp are set, or - if neither gp_normal nor gp_exp are set, then randomly - choose the primitive so that about 50% are normal and - 50% expedited. By default, neither are set, which - gives best overall test coverage. - -irqreader Says to invoke RCU readers from irq level. This is currently - done via timers. Defaults to "1" for variants of RCU that - permit this. (Or, more accurately, variants of RCU that do - -not- permit this know to ignore this variable.) - -n_barrier_cbs If this is nonzero, RCU barrier testing will be conducted, - in which case n_barrier_cbs specifies the number of - RCU callbacks (and corresponding kthreads) to use for - this testing. The value cannot be negative. If you - specify this to be non-zero when torture_type indicates a - synchronous RCU implementation (one for which a member of - the synchronize_rcu() rather than the call_rcu() family is - used -- see the documentation for torture_type below), an - error will be reported and no testing will be carried out. - -nfakewriters This is the number of RCU fake writer threads to run. Fake - writer threads repeatedly use the synchronous "wait for - current readers" function of the interface selected by - torture_type, with a delay between calls to allow for various - different numbers of writers running in parallel. - nfakewriters defaults to 4, which provides enough parallelism - to trigger special cases caused by multiple writers, such as - the synchronize_srcu() early return optimization. - -nreaders This is the number of RCU reading threads supported. - The default is twice the number of CPUs. Why twice? - To properly exercise RCU implementations with preemptible - read-side critical sections. - -onoff_interval - The number of seconds between each attempt to execute a - randomly selected CPU-hotplug operation. Defaults to - zero, which disables CPU hotplugging. In HOTPLUG_CPU=n - kernels, rcutorture will silently refuse to do any - CPU-hotplug operations regardless of what value is - specified for onoff_interval. - -onoff_holdoff The number of seconds to wait until starting CPU-hotplug - operations. This would normally only be used when - rcutorture was built into the kernel and started - automatically at boot time, in which case it is useful - in order to avoid confusing boot-time code with CPUs - coming and going. - -shuffle_interval - The number of seconds to keep the test threads affinitied - to a particular subset of the CPUs, defaults to 3 seconds. - Used in conjunction with test_no_idle_hz. - -shutdown_secs The number of seconds to run the test before terminating - the test and powering off the system. The default is - zero, which disables test termination and system shutdown. - This capability is useful for automated testing. - -stall_cpu The number of seconds that a CPU should be stalled while - within both an rcu_read_lock() and a preempt_disable(). - This stall happens only once per rcutorture run. - If you need multiple stalls, use modprobe and rmmod to - repeatedly run rcutorture. The default for stall_cpu - is zero, which prevents rcutorture from stalling a CPU. - - Note that attempts to rmmod rcutorture while the stall - is ongoing will hang, so be careful what value you - choose for this module parameter! In addition, too-large - values for stall_cpu might well induce failures and - warnings in other parts of the kernel. You have been - warned! - -stall_cpu_holdoff - The number of seconds to wait after rcutorture starts - before stalling a CPU. Defaults to 10 seconds. - -stat_interval The number of seconds between output of torture - statistics (via printk()). Regardless of the interval, - statistics are printed when the module is unloaded. - Setting the interval to zero causes the statistics to - be printed -only- when the module is unloaded, and this - is the default. - -stutter The length of time to run the test before pausing for this - same period of time. Defaults to "stutter=5", so as - to run and pause for (roughly) five-second intervals. - Specifying "stutter=0" causes the test to run continuously - without pausing, which is the old default behavior. - -test_boost Whether or not to test the ability of RCU to do priority - boosting. Defaults to "test_boost=1", which performs - RCU priority-inversion testing only if the selected - RCU implementation supports priority boosting. Specifying - "test_boost=0" never performs RCU priority-inversion - testing. Specifying "test_boost=2" performs RCU - priority-inversion testing even if the selected RCU - implementation does not support RCU priority boosting, - which can be used to test rcutorture's ability to - carry out RCU priority-inversion testing. - -test_boost_interval - The number of seconds in an RCU priority-inversion test - cycle. Defaults to "test_boost_interval=7". It is - usually wise for this value to be relatively prime to - the value selected for "stutter". - -test_boost_duration - The number of seconds to do RCU priority-inversion testing - within any given "test_boost_interval". Defaults to - "test_boost_duration=4". - -test_no_idle_hz Whether or not to test the ability of RCU to operate in - a kernel that disables the scheduling-clock interrupt to - idle CPUs. Boolean parameter, "1" to test, "0" otherwise. - Defaults to omitting this test. - -torture_type The type of RCU to test, with string values as follows: - - "rcu": rcu_read_lock(), rcu_read_unlock() and call_rcu(), - along with expedited, synchronous, and polling - variants. - - "rcu_bh": rcu_read_lock_bh(), rcu_read_unlock_bh(), and - call_rcu_bh(), along with expedited and synchronous - variants. - - "rcu_busted": This tests an intentionally incorrect version - of RCU in order to help test rcutorture itself. - - "srcu": srcu_read_lock(), srcu_read_unlock() and - call_srcu(), along with expedited and - synchronous variants. - - "sched": preempt_disable(), preempt_enable(), and - call_rcu_sched(), along with expedited, - synchronous, and polling variants. - - "tasks": voluntary context switch and call_rcu_tasks(), - along with expedited and synchronous variants. - - Defaults to "rcu". - -verbose Enable debug printk()s. Default is disabled. - +Module parameters are prefixed by "rcutorture." in +Documentation/admin-guide/kernel-parameters.txt. OUTPUT diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt @@ -302,7 +302,7 @@ rcu_dereference() must prohibit. The rcu_dereference_protected() variant takes a lockdep expression to indicate which locks must be acquired by the caller. If the indicated protection is not provided, - a lockdep splat is emitted. See RCU/Design/Requirements.html + a lockdep splat is emitted. See RCU/Design/Requirements/Requirements.html and the API's code comments for more details and example usage. The following diagram shows how each API communicates among the @@ -560,7 +560,7 @@ presents two such "toy" implementations of RCU, one that is implemented in terms of familiar locking primitives, and another that more closely resembles "classic" RCU. Both are way too simple for real-world use, lacking both functionality and performance. However, they are useful -in getting a feel for how RCU works. See kernel/rcupdate.c for a +in getting a feel for how RCU works. See kernel/rcu/update.c for a production-quality implementation, and see: http://www.rdrop.com/users/paulmck/RCU diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt @@ -3658,19 +3658,6 @@ latencies, which will choose a value aligned with the appropriate hardware boundaries. - rcutree.jiffies_till_sched_qs= [KNL] - Set required age in jiffies for a - given grace period before RCU starts - soliciting quiescent-state help from - rcu_note_context_switch(). If not specified, the - kernel will calculate a value based on the most - recent settings of rcutree.jiffies_till_first_fqs - and rcutree.jiffies_till_next_fqs. - This calculated value may be viewed in - rcutree.jiffies_to_sched_qs. Any attempt to - set rcutree.jiffies_to_sched_qs will be - cheerfully overwritten. - rcutree.jiffies_till_first_fqs= [KNL] Set delay from grace-period initialization to first attempt to force quiescent states. @@ -3682,6 +3669,20 @@ quiescent states. Units are jiffies, minimum value is one, and maximum value is HZ. + rcutree.jiffies_till_sched_qs= [KNL] + Set required age in jiffies for a + given grace period before RCU starts + soliciting quiescent-state help from + rcu_note_context_switch() and cond_resched(). + If not specified, the kernel will calculate + a value based on the most recent settings + of rcutree.jiffies_till_first_fqs + and rcutree.jiffies_till_next_fqs. + This calculated value may be viewed in + rcutree.jiffies_to_sched_qs. Any attempt to set + rcutree.jiffies_to_sched_qs will be cheerfully + overwritten. + rcutree.kthread_prio= [KNL,BOOT] Set the SCHED_FIFO priority of the RCU per-CPU kthreads (rcuc/N). This value is also used for @@ -3725,6 +3726,11 @@ This wake_up() will be accompanied by a WARN_ONCE() splat and an ftrace_dump(). + rcutree.sysrq_rcu= [KNL] + Commandeer a sysrq key to dump out Tree RCU's + rcu_node tree with an eye towards determining + why a new grace period has not yet started. + rcuperf.gp_async= [KNL] Measure performance of asynchronous grace-period primitives such as call_rcu(). diff --git a/MAINTAINERS b/MAINTAINERS @@ -10827,6 +10827,12 @@ F: drivers/power/supply/bq27xxx_battery_i2c.c F: drivers/power/supply/isp1704_charger.c F: drivers/power/supply/rx51_battery.c +NOLIBC HEADER FILE +M: Willy Tarreau <w@1wt.eu> +S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/wtarreau/nolibc.git +F: tools/include/nolibc/ + NTB AMD DRIVER M: Shyam Sundar S K <Shyam-sundar.S-k@amd.com> L: linux-ntb@googlegroups.com diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h @@ -211,9 +211,6 @@ void __warn(const char *file, int line, void *caller, unsigned taint, /* * WARN_ON_SMP() is for cases that the warning is either * meaningless for !SMP or may even cause failures. - * This is usually used for cases that we have - * WARN_ON(!spin_is_locked(&lock)) checks, as spin_is_locked() - * returns 0 for uniprocessor settings. * It can also be used with values that are only defined * on SMP: * diff --git a/include/linux/rcu_node_tree.h b/include/linux/rcu_node_tree.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * RCU node combining tree definitions. These are used to compute * global attributes while avoiding common-case global contention. A key @@ -11,23 +12,9 @@ * because the size of the TREE SRCU srcu_struct structure depends * on these definitions. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2017 * - * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Author: Paul E. McKenney <paulmck@linux.ibm.com> */ #ifndef __LINUX_RCU_NODE_TREE_H diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * RCU segmented callback lists * @@ -5,23 +6,9 @@ * because the size of the TREE SRCU srcu_struct structure depends * on these definitions. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2017 * - * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Authors: Paul E. McKenney <paulmck@linux.net.ibm.com> */ #ifndef __INCLUDE_LINUX_RCU_SEGCBLIST_H diff --git a/include/linux/rcu_sync.h b/include/linux/rcu_sync.h @@ -1,20 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * RCU-based infrastructure for lightweight reader-writer locking * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (c) 2015, Red Hat, Inc. * * Author: Oleg Nesterov <oleg@redhat.com> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h @@ -1,25 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Read-Copy Update mechanism for mutual exclusion * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2001 * * Author: Dipankar Sarma <dipankar@in.ibm.com> * - * Based on the original work by Paul McKenney <paulmck@us.ibm.com> + * Based on the original work by Paul McKenney <paulmck@vnet.ibm.com> * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * Papers: * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf @@ -89,7 +76,7 @@ static inline int rcu_preempt_depth(void) /* Internal to kernel */ void rcu_init(void); extern int rcu_scheduler_active __read_mostly; -void rcu_check_callbacks(int user); +void rcu_sched_clock_irq(int user); void rcu_report_dead(unsigned int cpu); void rcutree_migrate_callbacks(int cpu); @@ -309,16 +296,16 @@ static inline void rcu_preempt_sleep_check(void) { } */ #ifdef __CHECKER__ -#define rcu_dereference_sparse(p, space) \ +#define rcu_check_sparse(p, space) \ ((void)(((typeof(*p) space *)p) == p)) #else /* #ifdef __CHECKER__ */ -#define rcu_dereference_sparse(p, space) +#define rcu_check_sparse(p, space) #endif /* #else #ifdef __CHECKER__ */ #define __rcu_access_pointer(p, space) \ ({ \ typeof(*p) *_________p1 = (typeof(*p) *__force)READ_ONCE(p); \ - rcu_dereference_sparse(p, space); \ + rcu_check_sparse(p, space); \ ((typeof(*p) __force __kernel *)(_________p1)); \ }) #define __rcu_dereference_check(p, c, space) \ @@ -326,13 +313,13 @@ static inline void rcu_preempt_sleep_check(void) { } /* Dependency order vs. p above. */ \ typeof(*p) *________p1 = (typeof(*p) *__force)READ_ONCE(p); \ RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \ - rcu_dereference_sparse(p, space); \ + rcu_check_sparse(p, space); \ ((typeof(*p) __force __kernel *)(________p1)); \ }) #define __rcu_dereference_protected(p, c, space) \ ({ \ RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_protected() usage"); \ - rcu_dereference_sparse(p, space); \ + rcu_check_sparse(p, space); \ ((typeof(*p) __force __kernel *)(p)); \ }) #define rcu_dereference_raw(p) \ @@ -382,6 +369,7 @@ static inline void rcu_preempt_sleep_check(void) { } #define rcu_assign_pointer(p, v) \ ({ \ uintptr_t _r_a_p__v = (uintptr_t)(v); \ + rcu_check_sparse(p, __rcu); \ \ if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL) \ WRITE_ONCE((p), (typeof(p))(_r_a_p__v)); \ @@ -785,7 +773,7 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) */ #define RCU_INIT_POINTER(p, v) \ do { \ - rcu_dereference_sparse(p, __rcu); \ + rcu_check_sparse(p, __rcu); \ WRITE_ONCE(p, RCU_INITIALIZER(v)); \ } while (0) @@ -859,7 +847,7 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) /* Has the specified rcu_head structure been handed to call_rcu()? */ -/* +/** * rcu_head_init - Initialize rcu_head for rcu_head_after_call_rcu() * @rhp: The rcu_head structure to initialize. * @@ -874,10 +862,10 @@ static inline void rcu_head_init(struct rcu_head *rhp) rhp->func = (rcu_callback_t)~0L; } -/* +/** * rcu_head_after_call_rcu - Has this rcu_head been passed to call_rcu()? * @rhp: The rcu_head structure to test. - * @func: The function passed to call_rcu() along with @rhp. + * @f: The function passed to call_rcu() along with @rhp. * * Returns @true if the @rhp has been passed to call_rcu() with @func, * and @false otherwise. Emits a warning in any other case, including @@ -896,57 +884,4 @@ rcu_head_after_call_rcu(struct rcu_head *rhp, rcu_callback_t f) return false; } - -/* Transitional pre-consolidation compatibility definitions. */ - -static inline void synchronize_rcu_bh(void) -{ - synchronize_rcu(); -} - -static inline void synchronize_rcu_bh_expedited(void) -{ - synchronize_rcu_expedited(); -} - -static inline void call_rcu_bh(struct rcu_head *head, rcu_callback_t func) -{ - call_rcu(head, func); -} - -static inline void rcu_barrier_bh(void) -{ - rcu_barrier(); -} - -static inline void synchronize_sched(void) -{ - synchronize_rcu(); -} - -static inline void synchronize_sched_expedited(void) -{ - synchronize_rcu_expedited(); -} - -static inline void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) -{ - call_rcu(head, func); -} - -static inline void rcu_barrier_sched(void) -{ - rcu_barrier(); -} - -static inline unsigned long get_state_synchronize_sched(void) -{ - return get_state_synchronize_rcu(); -} - -static inline void cond_synchronize_sched(unsigned long oldstate) -{ - cond_synchronize_rcu(oldstate); -} - #endif /* __LINUX_RCUPDATE_H */ diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2008 * - * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Author: Paul E. McKenney <paulmck@linux.ibm.com> * * For detailed explanation of Read-Copy Update mechanism see - * Documentation/RCU diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h @@ -1,26 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Read-Copy Update mechanism for mutual exclusion (tree-based version) * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2008 * * Author: Dipankar Sarma <dipankar@in.ibm.com> - * Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical algorithm + * Paul E. McKenney <paulmck@linux.ibm.com> Hierarchical algorithm * - * Based on the original work by Paul McKenney <paulmck@us.ibm.com> + * Based on the original work by Paul McKenney <paulmck@linux.ibm.com> * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * * For detailed explanation of Read-Copy Update mechanism see - diff --git a/include/linux/srcu.h b/include/linux/srcu.h @@ -1,24 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Sleepable Read-Copy Update mechanism for mutual exclusion * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (C) IBM Corporation, 2006 * Copyright (C) Fujitsu, 2012 * - * Author: Paul McKenney <paulmck@us.ibm.com> + * Author: Paul McKenney <paulmck@linux.ibm.com> * Lai Jiangshan <laijs@cn.fujitsu.com> * * For detailed explanation of Read-Copy Update mechanism see - @@ -223,6 +210,7 @@ srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp) static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp) { + WARN_ON_ONCE(idx & ~0x1); rcu_lock_release(&(ssp)->dep_map); __srcu_read_unlock(ssp, idx); } diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h @@ -1,24 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Sleepable Read-Copy Update mechanism for mutual exclusion, * tiny variant. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (C) IBM Corporation, 2017 * - * Author: Paul McKenney <paulmck@us.ibm.com> + * Author: Paul McKenney <paulmck@linux.ibm.com> */ #ifndef _LINUX_SRCU_TINY_H diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h @@ -1,24 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Sleepable Read-Copy Update mechanism for mutual exclusion, * tree variant. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (C) IBM Corporation, 2017 * - * Author: Paul McKenney <paulmck@us.ibm.com> + * Author: Paul McKenney <paulmck@linux.ibm.com> */ #ifndef _LINUX_SRCU_TREE_H @@ -45,7 +32,8 @@ struct srcu_data { unsigned long srcu_gp_seq_needed; /* Furthest future GP needed. */ unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ bool srcu_cblist_invoking; /* Invoking these CBs? */ - struct delayed_work work; /* Context for CB invoking. */ + struct timer_list delay_work; /* Delay for CB invoking */ + struct work_struct work; /* Context for CB invoking. */ struct rcu_head srcu_barrier_head; /* For srcu_barrier() use. */ struct srcu_node *mynode; /* Leaf srcu_node. */ unsigned long grpmask; /* Mask for leaf srcu_node */ diff --git a/include/linux/torture.h b/include/linux/torture.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Common functions for in-kernel torture tests. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2014 * - * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Author: Paul E. McKenney <paulmck@linux.ibm.com> */ #ifndef __LINUX_TORTURE_H @@ -50,11 +37,12 @@ do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! %s\n", torture_type, s); } while (0) /* Definitions for online/offline exerciser. */ +typedef void torture_ofl_func(void); bool torture_offline(int cpu, long *n_onl_attempts, long *n_onl_successes, unsigned long *sum_offl, int *min_onl, int *max_onl); bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, unsigned long *sum_onl, int *min_onl, int *max_onl); -int torture_onoff_init(long ooholdoff, long oointerval); +int torture_onoff_init(long ooholdoff, long oointerval, torture_ofl_func *f); void torture_onoff_stats(void); bool torture_onoff_failures(void); diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c @@ -1,23 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Module-based torture test facility for locking * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (C) IBM Corporation, 2014 * - * Authors: Paul E. McKenney <paulmck@us.ibm.com> + * Authors: Paul E. McKenney <paulmck@linux.ibm.com> * Davidlohr Bueso <dave@stgolabs.net> * Based on kernel/rcu/torture.c. */ @@ -45,7 +32,7 @@ #include <linux/torture.h> MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads"); @@ -970,7 +957,7 @@ static int __init lock_torture_init(void) /* Prepare torture context. */ if (onoff_interval > 0) { firsterr = torture_onoff_init(onoff_holdoff * HZ, - onoff_interval * HZ); + onoff_interval * HZ, NULL); if (firsterr) goto unwind; } diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Read-Copy Update definitions shared among RCU implementations. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2011 * - * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Author: Paul E. McKenney <paulmck@linux.ibm.com> */ #ifndef __LINUX_RCU_H @@ -30,7 +17,7 @@ #define RCU_TRACE(stmt) #endif /* #else #ifdef CONFIG_RCU_TRACE */ -/* Offset to allow for unmatched rcu_irq_{enter,exit}(). */ +/* Offset to allow distinguishing irq vs. task-based idle entry/exit. */ #define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1) @@ -462,8 +449,6 @@ void rcu_request_urgent_qs_task(struct task_struct *t); enum rcutorture_type { RCU_FLAVOR, - RCU_BH_FLAVOR, - RCU_SCHED_FLAVOR, RCU_TASKS_FLAVOR, SRCU_FLAVOR, INVALID_RCU_FLAVOR diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c @@ -1,23 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * RCU segmented callback lists, function definitions * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2017 * - * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Authors: Paul E. McKenney <paulmck@linux.ibm.com> */ #include <linux/types.h> diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * RCU segmented callback lists, internal-to-rcu header file * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2017 * - * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Authors: Paul E. McKenney <paulmck@linux.ibm.com> */ #include <linux/rcu_segcblist.h> diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c @@ -1,23 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Read-Copy Update module-based performance-test facility * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (C) IBM Corporation, 2015 * - * Authors: Paul E. McKenney <paulmck@us.ibm.com> + * Authors: Paul E. McKenney <paulmck@linux.ibm.com> */ #define pr_fmt(fmt) fmt @@ -54,7 +41,7 @@ #include "rcu.h" MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); #define PERF_FLAG "-perf:" #define PERFOUT_STRING(s) \ @@ -83,13 +70,19 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); * Various other use cases may of course be specified. */ +#ifdef MODULE +# define RCUPERF_SHUTDOWN 0 +#else +# define RCUPERF_SHUTDOWN 1 +#endif + torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); -torture_param(bool, shutdown, !IS_ENABLED(MODULE), +torture_param(bool, shutdown, RCUPERF_SHUTDOWN, "Shutdown at end of performance tests."); torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c @@ -1,23 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Read-Copy Update module-based torture test facility * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (C) IBM Corporation, 2005, 2006 * - * Authors: Paul E. McKenney <paulmck@us.ibm.com> + * Authors: Paul E. McKenney <paulmck@linux.ibm.com> * Josh Triplett <josh@joshtriplett.org> * * See also: Documentation/RCU/torture.txt @@ -61,7 +48,7 @@ #include "rcu.h" MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); /* Bits for ->extendables field, extendables param, and related definitions. */ @@ -1630,21 +1617,34 @@ static bool rcu_fwd_emergency_stop; #define MIN_FWD_CB_LAUNDERS 3 /* This many CB invocations to count. */ #define MIN_FWD_CBS_LAUNDERED 100 /* Number of counted CBs. */ #define FWD_CBS_HIST_DIV 10 /* Histogram buckets/second. */ -static long n_launders_hist[2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV)]; +struct rcu_launder_hist { + long n_launders; + unsigned long launder_gp_seq; +}; +#define N_LAUNDERS_HIST (2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV)) +static struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST]; +static unsigned long rcu_launder_gp_seq_start; static void rcu_torture_fwd_cb_hist(void) { + unsigned long gps; + unsigned long gps_old; int i; int j; for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--) - if (n_launders_hist[i] > 0) + if (n_launders_hist[i].n_launders > 0) break; pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):", __func__, jiffies - rcu_fwd_startat); - for (j = 0; j <= i; j++) - pr_cont(" %ds/%d: %ld", - j + 1, FWD_CBS_HIST_DIV, n_launders_hist[j]); + gps_old = rcu_launder_gp_seq_start; + for (j = 0; j <= i; j++) { + gps = n_launders_hist[j].launder_gp_seq; + pr_cont(" %ds/%d: %ld:%ld", + j + 1, FWD_CBS_HIST_DIV, n_launders_hist[j].n_launders, + rcutorture_seq_diff(gps, gps_old)); + gps_old = gps; + } pr_cont("\n"); } @@ -1666,7 +1666,8 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) i = ((jiffies - rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV)); if (i >= ARRAY_SIZE(n_launders_hist)) i = ARRAY_SIZE(n_launders_hist) - 1; - n_launders_hist[i]++; + n_launders_hist[i].n_launders++; + n_launders_hist[i].launder_gp_seq = cur_ops->get_gp_seq(); spin_unlock_irqrestore(&rcu_fwd_lock, flags); } @@ -1786,9 +1787,10 @@ static void rcu_torture_fwd_prog_cr(void) n_max_cbs = 0; n_max_gps = 0; for (i = 0; i < ARRAY_SIZE(n_launders_hist); i++) - n_launders_hist[i] = 0; + n_launders_hist[i].n_launders = 0; cver = READ_ONCE(rcu_torture_current_version); gps = cur_ops->get_gp_seq(); + rcu_launder_gp_seq_start = gps; while (time_before(jiffies, stopat) && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { rfcp = READ_ONCE(rcu_fwd_cb_head); @@ -2228,6 +2230,14 @@ static void rcu_test_debug_objects(void) #endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ } +static void rcutorture_sync(void) +{ + static unsigned long n; + + if (cur_ops->sync && !(++n & 0xfff)) + cur_ops->sync(); +} + static int __init rcu_torture_init(void) { @@ -2389,7 +2399,8 @@ rcu_torture_init(void) firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); if (firsterr) goto unwind; - firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval); + firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval, + rcutorture_sync); if (firsterr) goto unwind; firsterr = rcu_torture_stall_init(); diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c @@ -1,24 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Sleepable Read-Copy Update mechanism for mutual exclusion, * tiny version for non-preemptible single-CPU use. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (C) IBM Corporation, 2017 * - * Author: Paul McKenney <paulmck@us.ibm.com> + * Author: Paul McKenney <paulmck@linux.ibm.com> */ #include <linux/export.h> diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c @@ -1,24 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Sleepable Read-Copy Update mechanism for mutual exclusion. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (C) IBM Corporation, 2006 * Copyright (C) Fujitsu, 2012 * - * Author: Paul McKenney <paulmck@us.ibm.com> + * Author: Paul McKenney <paulmck@linux.ibm.com> * Lai Jiangshan <laijs@cn.fujitsu.com> * * For detailed explanation of Read-Copy Update mechanism see - @@ -58,6 +45,7 @@ static bool __read_mostly srcu_init_done; static void srcu_invoke_callbacks(struct work_struct *work); static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay); static void process_srcu(struct work_struct *work); +static void srcu_delay_timer(struct timer_list *t); /* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */ #define spin_lock_rcu_node(p) \ @@ -156,7 +144,8 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static) snp->grphi = cpu; } sdp->cpu = cpu; - INIT_DELAYED_WORK(&sdp->work, srcu_invoke_callbacks); + INIT_WORK(&sdp->work, srcu_invoke_callbacks); + timer_setup(&sdp->delay_work, srcu_delay_timer, 0); sdp->ssp = ssp; sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); if (is_static) @@ -386,13 +375,19 @@ void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) } else { flush_delayed_work(&ssp->work); } - for_each_possible_cpu(cpu) + for_each_possible_cpu(cpu) { + struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); + if (quiesced) { - if (WARN_ON(delayed_work_pending(&per_cpu_ptr(ssp->sda, cpu)->work))) + if (WARN_ON(timer_pending(&sdp->delay_work))) + return; /* Just leak it! */ + if (WARN_ON(work_pending(&sdp->work))) return; /* Just leak it! */ } else { - flush_delayed_work(&per_cpu_ptr(ssp->sda, cpu)->work); + del_timer_sync(&sdp->delay_work); + flush_work(&sdp->work); } + } if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) || WARN_ON(srcu_readers_active(ssp))) { pr_info("%s: Active srcu_struct %p state: %d\n", @@ -463,39 +458,23 @@ static void srcu_gp_start(struct srcu_struct *ssp) WARN_ON_ONCE(state != SRCU_STATE_SCAN1); } -/* - * Track online CPUs to guide callback workqueue placement. - */ -DEFINE_PER_CPU(bool, srcu_online); -void srcu_online_cpu(unsigned int cpu) +static void srcu_delay_timer(struct timer_list *t) { - WRITE_ONCE(per_cpu(srcu_online, cpu), true); -} + struct srcu_data *sdp = container_of(t, struct srcu_data, delay_work); -void srcu_offline_cpu(unsigned int cpu) -{ - WRITE_ONCE(per_cpu(srcu_online, cpu), false); + queue_work_on(sdp->cpu, rcu_gp_wq, &sdp->work); } -/* - * Place the workqueue handler on the specified CPU if online, otherwise - * just run it whereever. This is useful for placing workqueue handlers - * that are to invoke the specified CPU's callbacks. - */ -static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq, - struct delayed_work *dwork, +static void srcu_queue_delayed_work_on(struct srcu_data *sdp, unsigned long delay) { - bool ret; + if (!delay) { + queue_work_on(sdp->cpu, rcu_gp_wq, &sdp->work); + return; + } - preempt_disable(); - if (READ_ONCE(per_cpu(srcu_online, cpu))) - ret = queue_delayed_work_on(cpu, wq, dwork, delay); - else - ret = queue_delayed_work(wq, dwork, delay); - preempt_enable(); - return ret; + timer_reduce(&sdp->delay_work, jiffies + delay); } /* @@ -504,7 +483,7 @@ static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq, */ static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) { - srcu_queue_delayed_work_on(sdp->cpu, rcu_gp_wq, &sdp->work, delay); + srcu_queue_delayed_work_on(sdp, delay); } /* @@ -1186,7 +1165,8 @@ static void srcu_invoke_callbacks(struct work_struct *work) struct srcu_data *sdp; struct srcu_struct *ssp; - sdp = container_of(work, struct srcu_data, work.work); + sdp = container_of(work, struct srcu_data, work); + ssp = sdp->ssp; rcu_cblist_init(&ready_cbs); spin_lock_irq_rcu_node(sdp); diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c @@ -1,20 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * RCU-based infrastructure for lightweight reader-writer locking * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (c) 2015, Red Hat, Inc. * * Author: Oleg Nesterov <oleg@redhat.com> diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c @@ -1,23 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2008 * - * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Author: Paul E. McKenney <paulmck@linux.ibm.com> * * For detailed explanation of Read-Copy Update mechanism see - * Documentation/RCU @@ -76,7 +63,7 @@ void rcu_qs(void) * be called from hardirq context. It is normally called from the * scheduling-clock interrupt. */ -void rcu_check_callbacks(int user) +void rcu_sched_clock_irq(int user) { if (user) { rcu_qs(); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c @@ -1,27 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Read-Copy Update mechanism for mutual exclusion * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2008 * * Authors: Dipankar Sarma <dipankar@in.ibm.com> * Manfred Spraul <manfred@colorfullife.com> - * Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version + * Paul E. McKenney <paulmck@linux.ibm.com> Hierarchical version * - * Based on the original work by Paul McKenney <paulmck@us.ibm.com> + * Based on the original work by Paul McKenney <paulmck@linux.ibm.com> * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * * For detailed explanation of Read-Copy Update mechanism see - @@ -62,6 +49,7 @@ #include <linux/suspend.h> #include <linux/ftrace.h> #include <linux/tick.h> +#include <linux/sysrq.h> #include "tree.h" #include "rcu.h" @@ -115,6 +103,9 @@ int num_rcu_lvl[] = NUM_RCU_LVL_INIT; int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ /* panic() on RCU Stall sysctl. */ int sysctl_panic_on_rcu_stall __read_mostly; +/* Commandeer a sysrq key to dump RCU's tree. */ +static bool sysrq_rcu; +module_param(sysrq_rcu, bool, 0444); /* * The rcu_scheduler_active variable is initialized to the value @@ -479,7 +470,6 @@ module_param_cb(jiffies_till_next_fqs, &next_fqs_jiffies_ops, &jiffies_till_next module_param(rcu_kick_kthreads, bool, 0644); static void force_qs_rnp(int (*f)(struct rcu_data *rdp)); -static void force_quiescent_state(void); static int rcu_pending(void); /* @@ -504,13 +494,12 @@ unsigned long rcu_exp_batches_completed(void) EXPORT_SYMBOL_GPL(rcu_exp_batches_completed); /* - * Force a quiescent state. + * Return the root node of the rcu_state structure. */ -void rcu_force_quiescent_state(void) +static struct rcu_node *rcu_get_root(void) { - force_quiescent_state(); + return &rcu_state.node[0]; } -EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); /* * Convert a ->gp_state value to a character string. @@ -529,19 +518,30 @@ void show_rcu_gp_kthreads(void) { int cpu; unsigned long j; + unsigned long ja; + unsigned long jr; + unsigned long jw; struct rcu_data *rdp; struct rcu_node *rnp; - j = jiffies - READ_ONCE(rcu_state.gp_activity); - pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %ld\n", + j = jiffies; + ja = j - READ_ONCE(rcu_state.gp_activity); + jr = j - READ_ONCE(rcu_state.gp_req_activity); + jw = j - READ_ONCE(rcu_state.gp_wake_time); + pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n", rcu_state.name, gp_state_getname(rcu_state.gp_state), - rcu_state.gp_state, rcu_state.gp_kthread->state, j); + rcu_state.gp_state, + rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL, + ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq), + (long)READ_ONCE(rcu_state.gp_seq), + (long)READ_ONCE(rcu_get_root()->gp_seq_needed), + READ_ONCE(rcu_state.gp_flags)); rcu_for_each_node_breadth_first(rnp) { if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed)) continue; - pr_info("\trcu_node %d:%d ->gp_seq %lu ->gp_seq_needed %lu\n", - rnp->grplo, rnp->grphi, rnp->gp_seq, - rnp->gp_seq_needed); + pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n", + rnp->grplo, rnp->grphi, (long)rnp->gp_seq, + (long)rnp->gp_seq_needed); if (!rcu_is_leaf_node(rnp)) continue; for_each_leaf_node_possible_cpu(rnp, cpu) { @@ -550,14 +550,35 @@ void show_rcu_gp_kthreads(void) ULONG_CMP_GE(rcu_state.gp_seq, rdp->gp_seq_needed)) continue; - pr_info("\tcpu %d ->gp_seq_needed %lu\n", - cpu, rdp->gp_seq_needed); + pr_info("\tcpu %d ->gp_seq_needed %ld\n", + cpu, (long)rdp->gp_seq_needed); } } /* sched_show_task(rcu_state.gp_kthread); */ } EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); +/* Dump grace-period-request information due to commandeered sysrq. */ +static void sysrq_show_rcu(int key) +{ + show_rcu_gp_kthreads(); +} + +static struct sysrq_key_op sysrq_rcudump_op = { + .handler = sysrq_show_rcu, + .help_msg = "show-rcu(y)", + .action_msg = "Show RCU tree", + .enable_mask = SYSRQ_ENABLE_DUMP, +}; + +static int __init rcu_sysrq_init(void) +{ + if (sysrq_rcu) + return register_sysrq_key('y', &sysrq_rcudump_op); + return 0; +} +early_initcall(rcu_sysrq_init); + /* * Send along grace-period-related data for rcutorture diagnostics. */ @@ -566,8 +587,6 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, { switch (test_type) { case RCU_FLAVOR: - case RCU_BH_FLAVOR: - case RCU_SCHED_FLAVOR: *flags = READ_ONCE(rcu_state.gp_flags); *gp_seq = rcu_seq_current(&rcu_state.gp_seq); break; @@ -578,14 +597,6 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); /* - * Return the root node of the rcu_state structure. - */ -static struct rcu_node *rcu_get_root(void) -{ - return &rcu_state.node[0]; -} - -/* * Enter an RCU extended quiescent state, which can be either the * idle loop or adaptive-tickless usermode execution. * @@ -701,7 +712,6 @@ static __always_inline void rcu_nmi_exit_common(bool irq) /** * rcu_nmi_exit - inform RCU of exit from NMI context - * @irq: Is this call from rcu_irq_exit? * * If you add or remove a call to rcu_nmi_exit(), be sure to test * with CONFIG_RCU_EQS_DEBUG=y. @@ -1115,7 +1125,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) } /* - * NO_HZ_FULL CPUs can run in-kernel without rcu_check_callbacks! + * NO_HZ_FULL CPUs can run in-kernel without rcu_sched_clock_irq! * The above code handles this, but only for straight cond_resched(). * And some in-kernel loops check need_resched() before calling * cond_resched(), which defeats the above code for CPUs that are @@ -1181,7 +1191,7 @@ static void rcu_check_gp_kthread_starvation(void) pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", rcu_state.name, j, (long)rcu_seq_current(&rcu_state.gp_seq), - rcu_state.gp_flags, + READ_ONCE(rcu_state.gp_flags), gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1); if (gpk) { @@ -1310,7 +1320,7 @@ static void print_other_cpu_stall(unsigned long gp_seq) panic_on_rcu_stall(); - force_quiescent_state(); /* Kick them all. */ + rcu_force_quiescent_state(); /* Kick them all. */ } static void print_cpu_stall(void) @@ -1557,17 +1567,28 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp) } /* - * Awaken the grace-period kthread. Don't do a self-awaken, and don't - * bother awakening when there is nothing for the grace-period kthread - * to do (as in several CPUs raced to awaken, and we lost), and finally - * don't try to awaken a kthread that has not yet been created. + * Awaken the grace-period kthread. Don't do a self-awaken (unless in + * an interrupt or softirq handler), and don't bother awakening when there + * is nothing for the grace-period kthread to do (as in several CPUs raced + * to awaken, and we lost), and finally don't try to awaken a kthread that + * has not yet been created. If all those checks are passed, track some + * debug information and awaken. + * + * So why do the self-wakeup when in an interrupt or softirq handler + * in the grace-period kthread's context? Because the kthread might have + * been interrupted just as it was going to sleep, and just after the final + * pre-sleep check of the awaken condition. In this case, a wakeup really + * is required, and is therefore supplied. */ static void rcu_gp_kthread_wake(void) { - if (current == rcu_state.gp_kthread || + if ((current == rcu_state.gp_kthread && + !in_interrupt() && !in_serving_softirq()) || !READ_ONCE(rcu_state.gp_flags) || !rcu_state.gp_kthread) return; + WRITE_ONCE(rcu_state.gp_wake_time, jiffies); + WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq)); swake_up_one(&rcu_state.gp_wq); } @@ -1711,7 +1732,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) zero_cpu_stall_ticks(rdp); } rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ - if (ULONG_CMP_GE(rnp->gp_seq_needed, rdp->gp_seq_needed) || rdp->gpwrap) + if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap) rdp->gp_seq_needed = rnp->gp_seq_needed; WRITE_ONCE(rdp->gpwrap, false); rcu_gpnum_ovf(rnp, rdp); @@ -1939,7 +1960,7 @@ static void rcu_gp_fqs_loop(void) if (!ret) { rcu_state.jiffies_force_qs = jiffies + j; WRITE_ONCE(rcu_state.jiffies_kick_kthreads, - jiffies + 3 * j); + jiffies + (j ? 3 * j : 2)); } trace_rcu_grace_period(rcu_state.name, READ_ONCE(rcu_state.gp_seq), @@ -2497,14 +2518,14 @@ static void rcu_do_batch(struct rcu_data *rdp) } /* - * Check to see if this CPU is in a non-context-switch quiescent state - * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). - * Also schedule RCU core processing. - * - * This function must be called from hardirq context. It is normally - * invoked from the scheduling-clock interrupt. + * This function is invoked from each scheduling-clock interrupt, + * and checks to see if this CPU is in a non-context-switch quiescent + * state, for example, user mode or idle loop. It also schedules RCU + * core processing. If the current grace period has gone on too long, + * it will ask the scheduler to manufacture a context switch for the sole + * purpose of providing a providing the needed quiescent state. */ -void rcu_check_callbacks(int user) +void rcu_sched_clock_irq(int user) { trace_rcu_utilization(TPS("Start scheduler-tick")); raw_cpu_inc(rcu_data.ticks_this_gp); @@ -2517,7 +2538,7 @@ void rcu_check_callbacks(int user) } __this_cpu_write(rcu_data.rcu_urgent_qs, false); } - rcu_flavor_check_callbacks(user); + rcu_flavor_sched_clock_irq(user); if (rcu_pending()) invoke_rcu_core(); @@ -2578,7 +2599,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) * Force quiescent states on reluctant CPUs, and also detect which * CPUs are in dyntick-idle mode. */ -static void force_quiescent_state(void) +void rcu_force_quiescent_state(void) { unsigned long flags; bool ret; @@ -2610,6 +2631,7 @@ static void force_quiescent_state(void) raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); rcu_gp_kthread_wake(); } +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); /* * This function checks for grace-period requests that fail to motivate @@ -2657,16 +2679,11 @@ rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } - pr_alert("%s: g%ld->%ld gar:%lu ga:%lu f%#x gs:%d %s->state:%#lx\n", - __func__, (long)READ_ONCE(rcu_state.gp_seq), - (long)READ_ONCE(rnp_root->gp_seq_needed), - j - rcu_state.gp_req_activity, j - rcu_state.gp_activity, - rcu_state.gp_flags, rcu_state.gp_state, rcu_state.name, - rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL); WARN_ON(1); if (rnp_root != rnp) raw_spin_unlock_rcu_node(rnp_root); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + show_rcu_gp_kthreads(); } /* @@ -2711,12 +2728,8 @@ void rcu_fwd_progress_check(unsigned long j) } EXPORT_SYMBOL_GPL(rcu_fwd_progress_check); -/* - * This does the RCU core processing work for the specified rcu_data - * structures. This may be called only from the CPU to whom the rdp - * belongs. - */ -static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) +/* Perform RCU core processing work for the current CPU. */ +static __latent_entropy void rcu_core(struct softirq_action *unused) { unsigned long flags; struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); @@ -2801,9 +2814,9 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head, /* * Force the grace period if too many callbacks or too long waiting. - * Enforce hysteresis, and don't invoke force_quiescent_state() + * Enforce hysteresis, and don't invoke rcu_force_quiescent_state() * if some other CPU has recently done so. Also, don't bother - * invoking force_quiescent_state() if the newly enqueued callback + * invoking rcu_force_quiescent_state() if the newly enqueued callback * is the only one waiting for a grace period to complete. */ if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) > @@ -2820,7 +2833,7 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head, rdp->blimit = LONG_MAX; if (rcu_state.n_force_qs == rdp->n_force_qs_snap && rcu_segcblist_first_pend_cb(&rdp->cblist) != head) - force_quiescent_state(); + rcu_force_quiescent_state(); rdp->n_force_qs_snap = rcu_state.n_force_qs; rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist); } @@ -2889,9 +2902,6 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) rcu_segcblist_init(&rdp->cblist); } rcu_segcblist_enqueue(&rdp->cblist, head, lazy); - if (!lazy) - rcu_idle_count_callbacks_posted(); - if (__is_kfree_rcu_offset((unsigned long)func)) trace_rcu_kfree_callback(rcu_state.name, head, (unsigned long)func, @@ -2961,6 +2971,79 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) } EXPORT_SYMBOL_GPL(kfree_call_rcu); +/* + * During early boot, any blocking grace-period wait automatically + * implies a grace period. Later on, this is never the case for PREEMPT. + * + * Howevr, because a context switch is a grace period for !PREEMPT, any + * blocking grace-period wait automatically implies a grace period if + * there is only one CPU online at any point time during execution of + * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to + * occasionally incorrectly indicate that there are multiple CPUs online + * when there was in fact only one the whole time, as this just adds some + * overhead: RCU still operates correctly. + */ +static int rcu_blocking_is_gp(void) +{ + int ret; + + if (IS_ENABLED(CONFIG_PREEMPT)) + return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE; + might_sleep(); /* Check for RCU read-side critical section. */ + preempt_disable(); + ret = num_online_cpus() <= 1; + preempt_enable(); + return ret; +} + +/** + * synchronize_rcu - wait until a grace period has elapsed. + * + * Control will return to the caller some time after a full grace + * period has elapsed, in other words after all currently executing RCU + * read-side critical sections have completed. Note, however, that + * upon return from synchronize_rcu(), the caller might well be executing + * concurrently with new RCU read-side critical sections that began while + * synchronize_rcu() was waiting. RCU read-side critical sections are + * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. + * In addition, regions of code across which interrupts, preemption, or + * softirqs have been disabled also serve as RCU read-side critical + * sections. This includes hardware interrupt handlers, softirq handlers, + * and NMI handlers. + * + * Note that this guarantee implies further memory-ordering guarantees. + * On systems with more than one CPU, when synchronize_rcu() returns, + * each CPU is guaranteed to have executed a full memory barrier since + * the end of its last RCU read-side critical section whose beginning + * preceded the call to synchronize_rcu(). In addition, each CPU having + * an RCU read-side critical section that extends beyond the return from + * synchronize_rcu() is guaranteed to have executed a full memory barrier + * after the beginning of synchronize_rcu() and before the beginning of + * that RCU read-side critical section. Note that these guarantees include + * CPUs that are offline, idle, or executing in user mode, as well as CPUs + * that are executing in the kernel. + * + * Furthermore, if CPU A invoked synchronize_rcu(), which returned + * to its caller on CPU B, then both CPU A and CPU B are guaranteed + * to have executed a full memory barrier during the execution of + * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but + * again only if the system has more than one CPU). + */ +void synchronize_rcu(void) +{ + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu() in RCU read-side critical section"); + if (rcu_blocking_is_gp()) + return; + if (rcu_gp_is_expedited()) + synchronize_rcu_expedited(); + else + wait_rcu_gp(call_rcu); +} +EXPORT_SYMBOL_GPL(synchronize_rcu); + /** * get_state_synchronize_rcu - Snapshot current RCU state * @@ -3049,28 +3132,6 @@ static int rcu_pending(void) } /* - * Return true if the specified CPU has any callback. If all_lazy is - * non-NULL, store an indication of whether all callbacks are lazy. - * (If there are no callbacks, all of them are deemed to be lazy.) - */ -static bool rcu_cpu_has_callbacks(bool *all_lazy) -{ - bool al = true; - bool hc = false; - struct rcu_data *rdp; - - rdp = this_cpu_ptr(&rcu_data); - if (!rcu_segcblist_empty(&rdp->cblist)) { - hc = true; - if (rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)) - al = false; - } - if (all_lazy) - *all_lazy = al; - return hc; -} - -/* * Helper function for rcu_barrier() tracing. If tracing is disabled, * the compiler is expected to optimize this away. */ @@ -3299,7 +3360,7 @@ int rcutree_prepare_cpu(unsigned int cpu) trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rcu_prepare_kthreads(cpu); - rcu_spawn_all_nocb_kthreads(cpu); + rcu_spawn_cpu_nocb_kthread(cpu); return 0; } @@ -3329,8 +3390,6 @@ int rcutree_online_cpu(unsigned int cpu) raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->ffmask |= rdp->grpmask; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - if (IS_ENABLED(CONFIG_TREE_SRCU)) - srcu_online_cpu(cpu); if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) return 0; /* Too early in boot for scheduler work. */ sync_sched_exp_online_cleanup(cpu); @@ -3355,8 +3414,6 @@ int rcutree_offline_cpu(unsigned int cpu) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rcutree_affinity_setting(cpu, cpu); - if (IS_ENABLED(CONFIG_TREE_SRCU)) - srcu_offline_cpu(cpu); return 0; } @@ -3777,7 +3834,7 @@ void __init rcu_init(void) rcu_init_one(); if (dump_tree) rcu_dump_rcu_node_tree(); - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); + open_softirq(RCU_SOFTIRQ, rcu_core); /* * We don't need protection against CPU-hotplug here because diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h @@ -1,25 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Read-Copy Update mechanism for mutual exclusion (tree-based version) * Internal non-public definitions. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2008 * * Author: Ingo Molnar <mingo@elte.hu> - * Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Paul E. McKenney <paulmck@linux.ibm.com> */ #include <linux/cache.h> @@ -36,7 +23,6 @@ /* Communicate arguments to a workqueue handler. */ struct rcu_exp_work { - smp_call_func_t rew_func; unsigned long rew_s; struct work_struct rew_work; }; @@ -194,10 +180,7 @@ struct rcu_data { bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */ bool rcu_urgent_qs; /* GP old need light quiescent state. */ #ifdef CONFIG_RCU_FAST_NO_HZ - bool all_lazy; /* Are all CPU's CBs lazy? */ - unsigned long nonlazy_posted; /* # times non-lazy CB posted to CPU. */ - unsigned long nonlazy_posted_snap; - /* Nonlazy_posted snapshot. */ + bool all_lazy; /* All CPU's CBs lazy at idle start? */ unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */ unsigned long last_advance_all; /* Last jiffy CBs were all advanced. */ int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ @@ -234,7 +217,13 @@ struct rcu_data { /* Leader CPU takes GP-end wakeups. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ - /* 6) Diagnostic data, including RCU CPU stall warnings. */ + /* 6) RCU priority boosting. */ + struct task_struct *rcu_cpu_kthread_task; + /* rcuc per-CPU kthread or NULL. */ + unsigned int rcu_cpu_kthread_status; + char rcu_cpu_has_work; + + /* 7) Diagnostic data, including RCU CPU stall warnings. */ unsigned int softirq_snap; /* Snapshot of softirq activity. */ /* ->rcu_iw* fields protected by leaf rcu_node ->lock. */ struct irq_work rcu_iw; /* Check for non-irq activity. */ @@ -303,6 +292,8 @@ struct rcu_state { struct swait_queue_head gp_wq; /* Where GP task waits. */ short gp_flags; /* Commands for GP task. */ short gp_state; /* GP kthread sleep state. */ + unsigned long gp_wake_time; /* Last GP kthread wake. */ + unsigned long gp_wake_seq; /* ->gp_seq at ^^^. */ /* End of fields guarded by root rcu_node's lock. */ @@ -402,13 +393,6 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name; int rcu_dynticks_snap(struct rcu_data *rdp); -#ifdef CONFIG_RCU_BOOST -DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); -DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu); -DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); -DECLARE_PER_CPU(char, rcu_cpu_has_work); -#endif /* #ifdef CONFIG_RCU_BOOST */ - /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); static void rcu_qs(void); @@ -420,7 +404,7 @@ static void rcu_print_detail_task_stall(void); static int rcu_print_task_stall(struct rcu_node *rnp); static int rcu_print_task_exp_stall(struct rcu_node *rnp); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); -static void rcu_flavor_check_callbacks(int user); +static void rcu_flavor_sched_clock_irq(int user); void call_rcu(struct rcu_head *head, rcu_callback_t func); static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); @@ -431,7 +415,6 @@ static void __init rcu_spawn_boost_kthreads(void); static void rcu_prepare_kthreads(int cpu); static void rcu_cleanup_after_idle(void); static void rcu_prepare_for_idle(void); -static void rcu_idle_count_callbacks_posted(void); static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static bool rcu_preempt_need_deferred_qs(struct task_struct *t); static void rcu_preempt_deferred_qs(struct task_struct *t); @@ -451,7 +434,7 @@ static bool rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); static void do_nocb_deferred_wakeup(struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); -static void rcu_spawn_all_nocb_kthreads(int cpu); +static void rcu_spawn_cpu_nocb_kthread(int cpu); static void __init rcu_spawn_nocb_kthreads(void); #ifdef CONFIG_RCU_NOCB_CPU static void __init rcu_organize_nocb_kthreads(void); @@ -462,11 +445,3 @@ static void rcu_bind_gp_kthread(void); static bool rcu_nohz_full_cpu(void); static void rcu_dynticks_task_enter(void); static void rcu_dynticks_task_exit(void); - -#ifdef CONFIG_SRCU -void srcu_online_cpu(unsigned int cpu); -void srcu_offline_cpu(unsigned int cpu); -#else /* #ifdef CONFIG_SRCU */ -void srcu_online_cpu(unsigned int cpu) { } -void srcu_offline_cpu(unsigned int cpu) { } -#endif /* #else #ifdef CONFIG_SRCU */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h @@ -1,27 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * RCU expedited grace periods * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2016 * - * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Authors: Paul E. McKenney <paulmck@linux.ibm.com> */ #include <linux/lockdep.h> +static void rcu_exp_handler(void *unused); + /* * Record the start of an expedited grace period. */ @@ -344,7 +333,6 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) { int cpu; unsigned long flags; - smp_call_func_t func; unsigned long mask_ofl_test; unsigned long mask_ofl_ipi; int ret; @@ -352,7 +340,6 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) container_of(wp, struct rcu_exp_work, rew_work); struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew); - func = rewp->rew_func; raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Each pass checks a CPU for identity, offline, and idle. */ @@ -396,7 +383,7 @@ retry_ipi: mask_ofl_test |= mask; continue; } - ret = smp_call_function_single(cpu, func, NULL, 0); + ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); if (!ret) { mask_ofl_ipi &= ~mask; continue; @@ -426,7 +413,7 @@ retry_ipi: * Select the nodes that the upcoming expedited grace period needs * to wait for. */ -static void sync_rcu_exp_select_cpus(smp_call_func_t func) +static void sync_rcu_exp_select_cpus(void) { int cpu; struct rcu_node *rnp; @@ -440,7 +427,6 @@ static void sync_rcu_exp_select_cpus(smp_call_func_t func) rnp->exp_need_flush = false; if (!READ_ONCE(rnp->expmask)) continue; /* Avoid early boot non-existent wq. */ - rnp->rew.rew_func = func; if (!READ_ONCE(rcu_par_gp_wq) || rcu_scheduler_active != RCU_SCHEDULER_RUNNING || rcu_is_last_leaf_node(rnp)) { @@ -449,7 +435,6 @@ static void sync_rcu_exp_select_cpus(smp_call_func_t func) continue; } INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); - preempt_disable(); cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1); /* If all offline, queue the work on an unbound CPU. */ if (unlikely(cpu > rnp->grphi - rnp->grplo)) @@ -457,7 +442,6 @@ static void sync_rcu_exp_select_cpus(smp_call_func_t func) else cpu += rnp->grplo; queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work); - preempt_enable(); rnp->exp_need_flush = true; } @@ -580,10 +564,10 @@ static void rcu_exp_wait_wake(unsigned long s) * Common code to drive an expedited grace period forward, used by * workqueues and mid-boot-time tasks. */ -static void rcu_exp_sel_wait_wake(smp_call_func_t func, unsigned long s) +static void rcu_exp_sel_wait_wake(unsigned long s) { /* Initialize the rcu_node tree in preparation for the wait. */ - sync_rcu_exp_select_cpus(func); + sync_rcu_exp_select_cpus(); /* Wait and clean up, including waking everyone. */ rcu_exp_wait_wake(s); @@ -597,52 +581,7 @@ static void wait_rcu_exp_gp(struct work_struct *wp) struct rcu_exp_work *rewp; rewp = container_of(wp, struct rcu_exp_work, rew_work); - rcu_exp_sel_wait_wake(rewp->rew_func, rewp->rew_s); -} - -/* - * Given a smp_call_function() handler, kick off the specified - * implementation of expedited grace period. - */ -static void _synchronize_rcu_expedited(smp_call_func_t func) -{ - struct rcu_data *rdp; - struct rcu_exp_work rew; - struct rcu_node *rnp; - unsigned long s; - - /* If expedited grace periods are prohibited, fall back to normal. */ - if (rcu_gp_is_normal()) { - wait_rcu_gp(call_rcu); - return; - } - - /* Take a snapshot of the sequence number. */ - s = rcu_exp_gp_seq_snap(); - if (exp_funnel_lock(s)) - return; /* Someone else did our work for us. */ - - /* Ensure that load happens before action based on it. */ - if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) { - /* Direct call during scheduler init and early_initcalls(). */ - rcu_exp_sel_wait_wake(func, s); - } else { - /* Marshall arguments & schedule the expedited grace period. */ - rew.rew_func = func; - rew.rew_s = s; - INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); - queue_work(rcu_gp_wq, &rew.rew_work); - } - - /* Wait for expedited grace period to complete. */ - rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id()); - rnp = rcu_get_root(); - wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], - sync_exp_work_done(s)); - smp_mb(); /* Workqueue actions happen before return. */ - - /* Let the next expedited grace period start. */ - mutex_unlock(&rcu_state.exp_mutex); + rcu_exp_sel_wait_wake(rewp->rew_s); } #ifdef CONFIG_PREEMPT_RCU @@ -654,7 +593,7 @@ static void _synchronize_rcu_expedited(smp_call_func_t func) * ->expmask fields in the rcu_node tree. Otherwise, immediately * report the quiescent state. */ -static void sync_rcu_exp_handler(void *unused) +static void rcu_exp_handler(void *unused) { unsigned long flags; struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -697,6 +636,7 @@ static void sync_rcu_exp_handler(void *unused) WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, true); } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; } /* @@ -730,43 +670,10 @@ static void sync_sched_exp_online_cleanup(int cpu) { } -/** - * synchronize_rcu_expedited - Brute-force RCU grace period - * - * Wait for an RCU-preempt grace period, but expedite it. The basic - * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler - * checks whether the CPU is in an RCU-preempt critical section, and - * if so, it sets a flag that causes the outermost rcu_read_unlock() - * to report the quiescent state. On the other hand, if the CPU is - * not in an RCU read-side critical section, the IPI handler reports - * the quiescent state immediately. - * - * Although this is a greate improvement over previous expedited - * implementations, it is still unfriendly to real-time workloads, so is - * thus not recommended for any sort of common-case code. In fact, if - * you are using synchronize_rcu_expedited() in a loop, please restructure - * your code to batch your updates, and then Use a single synchronize_rcu() - * instead. - * - * This has the same semantics as (but is more brutal than) synchronize_rcu(). - */ -void synchronize_rcu_expedited(void) -{ - RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || - lock_is_held(&rcu_lock_map) || - lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_rcu_expedited() in RCU read-side critical section"); - - if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) - return; - _synchronize_rcu_expedited(sync_rcu_exp_handler); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); - #else /* #ifdef CONFIG_PREEMPT_RCU */ /* Invoked on each online non-idle CPU for expedited quiescent state. */ -static void sync_sched_exp_handler(void *unused) +static void rcu_exp_handler(void *unused) { struct rcu_data *rdp; struct rcu_node *rnp; @@ -798,44 +705,78 @@ static void sync_sched_exp_online_cleanup(int cpu) rnp = rdp->mynode; if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) return; - ret = smp_call_function_single(cpu, sync_sched_exp_handler, NULL, 0); + ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); WARN_ON_ONCE(ret); } -/* - * Because a context switch is a grace period for !PREEMPT, any - * blocking grace-period wait automatically implies a grace period if - * there is only one CPU online at any point time during execution of - * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to - * occasionally incorrectly indicate that there are multiple CPUs online - * when there was in fact only one the whole time, as this just adds some - * overhead: RCU still operates correctly. - */ -static int rcu_blocking_is_gp(void) -{ - int ret; - - might_sleep(); /* Check for RCU read-side critical section. */ - preempt_disable(); - ret = num_online_cpus() <= 1; - preempt_enable(); - return ret; -} +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ -/* PREEMPT=n implementation of synchronize_rcu_expedited(). */ +/** + * synchronize_rcu_expedited - Brute-force RCU grace period + * + * Wait for an RCU grace period, but expedite it. The basic idea is to + * IPI all non-idle non-nohz online CPUs. The IPI handler checks whether + * the CPU is in an RCU critical section, and if so, it sets a flag that + * causes the outermost rcu_read_unlock() to report the quiescent state + * for RCU-preempt or asks the scheduler for help for RCU-sched. On the + * other hand, if the CPU is not in an RCU read-side critical section, + * the IPI handler reports the quiescent state immediately. + * + * Although this is a greate improvement over previous expedited + * implementations, it is still unfriendly to real-time workloads, so is + * thus not recommended for any sort of common-case code. In fact, if + * you are using synchronize_rcu_expedited() in a loop, please restructure + * your code to batch your updates, and then Use a single synchronize_rcu() + * instead. + * + * This has the same semantics as (but is more brutal than) synchronize_rcu(). + */ void synchronize_rcu_expedited(void) { + struct rcu_data *rdp; + struct rcu_exp_work rew; + struct rcu_node *rnp; + unsigned long s; + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_rcu_expedited() in RCU read-side critical section"); - /* If only one CPU, this is automatically a grace period. */ + /* Is the state is such that the call is a grace period? */ if (rcu_blocking_is_gp()) return; - _synchronize_rcu_expedited(sync_sched_exp_handler); + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(call_rcu); + return; + } + + /* Take a snapshot of the sequence number. */ + s = rcu_exp_gp_seq_snap(); + if (exp_funnel_lock(s)) + return; /* Someone else did our work for us. */ + + /* Ensure that load happens before action based on it. */ + if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) { + /* Direct call during scheduler init and early_initcalls(). */ + rcu_exp_sel_wait_wake(s); + } else { + /* Marshall arguments & schedule the expedited grace period. */ + rew.rew_s = s; + INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); + queue_work(rcu_gp_wq, &rew.rew_work); + } + + /* Wait for expedited grace period to complete. */ + rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id()); + rnp = rcu_get_root(); + wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], + sync_exp_work_done(s)); + smp_mb(); /* Workqueue actions happen before return. */ + + /* Let the next expedited grace period start. */ + mutex_unlock(&rcu_state.exp_mutex); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); - -#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h @@ -1,27 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Read-Copy Update mechanism for mutual exclusion (tree-based version) * Internal non-public definitions that provide either classic * or preemptible semantics. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright Red Hat, 2009 * Copyright IBM Corporation, 2009 * * Author: Ingo Molnar <mingo@elte.hu> - * Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Paul E. McKenney <paulmck@linux.ibm.com> */ #include <linux/delay.h> @@ -34,17 +21,7 @@ #include "../time/tick-internal.h" #ifdef CONFIG_RCU_BOOST - #include "../locking/rtmutex_common.h" - -/* - * Control variables for per-CPU and per-rcu_node kthreads. - */ -static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); -DEFINE_PER_CPU(char, rcu_cpu_has_work); - #else /* #ifdef CONFIG_RCU_BOOST */ /* @@ -307,7 +284,7 @@ static void rcu_qs(void) __this_cpu_read(rcu_data.gp_seq), TPS("cpuqs")); __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false); - barrier(); /* Coordinate with rcu_flavor_check_callbacks(). */ + barrier(); /* Coordinate with rcu_flavor_sched_clock_irq(). */ current->rcu_read_unlock_special.b.need_qs = false; } } @@ -788,13 +765,13 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) } /* - * Check for a quiescent state from the current CPU. When a task blocks, - * the task is recorded in the corresponding CPU's rcu_node structure, - * which is checked elsewhere. - * - * Caller must disable hard irqs. + * Check for a quiescent state from the current CPU, including voluntary + * context switches for Tasks RCU. When a task blocks, the task is + * recorded in the corresponding CPU's rcu_node structure, which is checked + * elsewhere, hence this function need only check for quiescent states + * related to the current CPU, not to those related to tasks. */ -static void rcu_flavor_check_callbacks(int user) +static void rcu_flavor_sched_clock_irq(int user) { struct task_struct *t = current; @@ -825,54 +802,6 @@ static void rcu_flavor_check_callbacks(int user) t->rcu_read_unlock_special.b.need_qs = true; } -/** - * synchronize_rcu - wait until a grace period has elapsed. - * - * Control will return to the caller some time after a full grace - * period has elapsed, in other words after all currently executing RCU - * read-side critical sections have completed. Note, however, that - * upon return from synchronize_rcu(), the caller might well be executing - * concurrently with new RCU read-side critical sections that began while - * synchronize_rcu() was waiting. RCU read-side critical sections are - * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. - * In addition, regions of code across which interrupts, preemption, or - * softirqs have been disabled also serve as RCU read-side critical - * sections. This includes hardware interrupt handlers, softirq handlers, - * and NMI handlers. - * - * Note that this guarantee implies further memory-ordering guarantees. - * On systems with more than one CPU, when synchronize_rcu() returns, - * each CPU is guaranteed to have executed a full memory barrier since - * the end of its last RCU read-side critical section whose beginning - * preceded the call to synchronize_rcu(). In addition, each CPU having - * an RCU read-side critical section that extends beyond the return from - * synchronize_rcu() is guaranteed to have executed a full memory barrier - * after the beginning of synchronize_rcu() and before the beginning of - * that RCU read-side critical section. Note that these guarantees include - * CPUs that are offline, idle, or executing in user mode, as well as CPUs - * that are executing in the kernel. - * - * Furthermore, if CPU A invoked synchronize_rcu(), which returned - * to its caller on CPU B, then both CPU A and CPU B are guaranteed - * to have executed a full memory barrier during the execution of - * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but - * again only if the system has more than one CPU). - */ -void synchronize_rcu(void) -{ - RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || - lock_is_held(&rcu_lock_map) || - lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_rcu() in RCU read-side critical section"); - if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) - return; - if (rcu_gp_is_expedited()) - synchronize_rcu_expedited(); - else - wait_rcu_gp(call_rcu); -} -EXPORT_SYMBOL_GPL(synchronize_rcu); - /* * Check for a task exiting while in a preemptible-RCU read-side * critical section, clean up if so. No need to issue warnings, @@ -1088,14 +1017,10 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) } /* - * Check to see if this CPU is in a non-context-switch quiescent state - * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). - * Also schedule RCU core processing. - * - * This function must be called from hardirq context. It is normally - * invoked from the scheduling-clock interrupt. + * Check to see if this CPU is in a non-context-switch quiescent state, + * namely user mode and idle loop. */ -static void rcu_flavor_check_callbacks(int user) +static void rcu_flavor_sched_clock_irq(int user) { if (user || rcu_is_cpu_rrupt_from_idle()) { @@ -1115,22 +1040,6 @@ static void rcu_flavor_check_callbacks(int user) } } -/* PREEMPT=n implementation of synchronize_rcu(). */ -void synchronize_rcu(void) -{ - RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || - lock_is_held(&rcu_lock_map) || - lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_rcu() in RCU read-side critical section"); - if (rcu_blocking_is_gp()) - return; - if (rcu_gp_is_expedited()) - synchronize_rcu_expedited(); - else - wait_rcu_gp(call_rcu); -} -EXPORT_SYMBOL_GPL(synchronize_rcu); - /* * Because preemptible RCU does not exist, tasks cannot possibly exit * while in preemptible RCU read-side critical sections. @@ -1307,11 +1216,11 @@ static void invoke_rcu_callbacks_kthread(void) unsigned long flags; local_irq_save(flags); - __this_cpu_write(rcu_cpu_has_work, 1); - if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && - current != __this_cpu_read(rcu_cpu_kthread_task)) { - rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task), - __this_cpu_read(rcu_cpu_kthread_status)); + __this_cpu_write(rcu_data.rcu_cpu_has_work, 1); + if (__this_cpu_read(rcu_data.rcu_cpu_kthread_task) != NULL && + current != __this_cpu_read(rcu_data.rcu_cpu_kthread_task)) { + rcu_wake_cond(__this_cpu_read(rcu_data.rcu_cpu_kthread_task), + __this_cpu_read(rcu_data.rcu_cpu_kthread_status)); } local_irq_restore(flags); } @@ -1322,7 +1231,7 @@ static void invoke_rcu_callbacks_kthread(void) */ static bool rcu_is_callbacks_kthread(void) { - return __this_cpu_read(rcu_cpu_kthread_task) == current; + return __this_cpu_read(rcu_data.rcu_cpu_kthread_task) == current; } #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) @@ -1369,11 +1278,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp) return 0; } -static void rcu_kthread_do_work(void) -{ - rcu_do_batch(this_cpu_ptr(&rcu_data)); -} - static void rcu_cpu_kthread_setup(unsigned int cpu) { struct sched_param sp; @@ -1384,12 +1288,12 @@ static void rcu_cpu_kthread_setup(unsigned int cpu) static void rcu_cpu_kthread_park(unsigned int cpu) { - per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; + per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; } static int rcu_cpu_kthread_should_run(unsigned int cpu) { - return __this_cpu_read(rcu_cpu_has_work); + return __this_cpu_read(rcu_data.rcu_cpu_has_work); } /* @@ -1399,21 +1303,20 @@ static int rcu_cpu_kthread_should_run(unsigned int cpu) */ static void rcu_cpu_kthread(unsigned int cpu) { - unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status); - char work, *workp = this_cpu_ptr(&rcu_cpu_has_work); + unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status); + char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work); int spincnt; for (spincnt = 0; spincnt < 10; spincnt++) { trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); local_bh_disable(); *statusp = RCU_KTHREAD_RUNNING; - this_cpu_inc(rcu_cpu_kthread_loops); local_irq_disable(); work = *workp; *workp = 0; local_irq_enable(); if (work) - rcu_kthread_do_work(); + rcu_do_batch(this_cpu_ptr(&rcu_data)); local_bh_enable(); if (*workp == 0) { trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); @@ -1459,7 +1362,7 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) } static struct smp_hotplug_thread rcu_cpu_thread_spec = { - .store = &rcu_cpu_kthread_task, + .store = &rcu_data.rcu_cpu_kthread_task, .thread_should_run = rcu_cpu_kthread_should_run, .thread_fn = rcu_cpu_kthread, .thread_comm = "rcuc/%u", @@ -1476,7 +1379,7 @@ static void __init rcu_spawn_boost_kthreads(void) int cpu; for_each_possible_cpu(cpu) - per_cpu(rcu_cpu_has_work, cpu) = 0; + per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0; if (WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), "%s: Could not start rcub kthread, OOM is now expected behavior\n", __func__)) return; rcu_for_each_leaf_node(rnp) @@ -1543,7 +1446,7 @@ static void rcu_prepare_kthreads(int cpu) int rcu_needs_cpu(u64 basemono, u64 *nextevt) { *nextevt = KTIME_MAX; - return rcu_cpu_has_callbacks(NULL); + return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist); } /* @@ -1562,14 +1465,6 @@ static void rcu_prepare_for_idle(void) { } -/* - * Don't bother keeping a running count of the number of RCU callbacks - * posted because CONFIG_RCU_FAST_NO_HZ=n. - */ -static void rcu_idle_count_callbacks_posted(void) -{ -} - #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ /* @@ -1652,11 +1547,8 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) lockdep_assert_irqs_disabled(); - /* Snapshot to detect later posting of non-lazy callback. */ - rdp->nonlazy_posted_snap = rdp->nonlazy_posted; - /* If no callbacks, RCU doesn't need the CPU. */ - if (!rcu_cpu_has_callbacks(&rdp->all_lazy)) { + if (rcu_segcblist_empty(&rdp->cblist)) { *nextevt = KTIME_MAX; return 0; } @@ -1670,11 +1562,12 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) rdp->last_accelerate = jiffies; /* Request timer delay depending on laziness, and round. */ - if (!rdp->all_lazy) { + rdp->all_lazy = !rcu_segcblist_n_nonlazy_cbs(&rdp->cblist); + if (rdp->all_lazy) { + dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; + } else { dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies; - } else { - dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; } *nextevt = basemono + dj * TICK_NSEC; return 0; @@ -1704,7 +1597,7 @@ static void rcu_prepare_for_idle(void) /* Handle nohz enablement switches conservatively. */ tne = READ_ONCE(tick_nohz_active); if (tne != rdp->tick_nohz_enabled_snap) { - if (rcu_cpu_has_callbacks(NULL)) + if (!rcu_segcblist_empty(&rdp->cblist)) invoke_rcu_core(); /* force nohz to see update. */ rdp->tick_nohz_enabled_snap = tne; return; @@ -1717,10 +1610,8 @@ static void rcu_prepare_for_idle(void) * callbacks, invoke RCU core for the side-effect of recalculating * idle duration on re-entry to idle. */ - if (rdp->all_lazy && - rdp->nonlazy_posted != rdp->nonlazy_posted_snap) { + if (rdp->all_lazy && rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)) { rdp->all_lazy = false; - rdp->nonlazy_posted_snap = rdp->nonlazy_posted; invoke_rcu_core(); return; } @@ -1756,19 +1647,6 @@ static void rcu_cleanup_after_idle(void) invoke_rcu_core(); } -/* - * Keep a running count of the number of non-lazy callbacks posted - * on this CPU. This running counter (which is never decremented) allows - * rcu_prepare_for_idle() to detect when something out of the idle loop - * posts a callback, even if an equal number of callbacks are invoked. - * Of course, callbacks should only be posted from within a trace event - * designed to be called from idle or from within RCU_NONIDLE(). - */ -static void rcu_idle_count_callbacks_posted(void) -{ - __this_cpu_add(rcu_data.nonlazy_posted, 1); -} - #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ #ifdef CONFIG_RCU_FAST_NO_HZ @@ -1776,13 +1654,12 @@ static void rcu_idle_count_callbacks_posted(void) static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - unsigned long nlpd = rdp->nonlazy_posted - rdp->nonlazy_posted_snap; - sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c", + sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c", rdp->last_accelerate & 0xffff, jiffies & 0xffff, - ulong2long(nlpd), - rdp->all_lazy ? 'L' : '.', - rdp->tick_nohz_enabled_snap ? '.' : 'D'); + ".l"[rdp->all_lazy], + ".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)], + ".D"[!rdp->tick_nohz_enabled_snap]); } #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ @@ -1868,22 +1745,24 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp) /* * Offload callback processing from the boot-time-specified set of CPUs - * specified by rcu_nocb_mask. For each CPU in the set, there is a - * kthread created that pulls the callbacks from the corresponding CPU, - * waits for a grace period to elapse, and invokes the callbacks. - * The no-CBs CPUs do a wake_up() on their kthread when they insert - * a callback into any empty list, unless the rcu_nocb_poll boot parameter - * has been specified, in which case each kthread actively polls its - * CPU. (Which isn't so great for energy efficiency, but which does - * reduce RCU's overhead on that CPU.) + * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads + * created that pull the callbacks from the corresponding CPU, wait for + * a grace period to elapse, and invoke the callbacks. These kthreads + * are organized into leaders, which manage incoming callbacks, wait for + * grace periods, and awaken followers, and the followers, which only + * invoke callbacks. Each leader is its own follower. The no-CBs CPUs + * do a wake_up() on their kthread when they insert a callback into any + * empty list, unless the rcu_nocb_poll boot parameter has been specified, + * in which case each kthread actively polls its CPU. (Which isn't so great + * for energy efficiency, but which does reduce RCU's overhead on that CPU.) * * This is intended to be used in conjunction with Frederic Weisbecker's * adaptive-idle work, which would seriously reduce OS jitter on CPUs * running CPU-bound user-mode computations. * - * Offloading of callback processing could also in theory be used as - * an energy-efficiency measure because CPUs with no RCU callbacks - * queued are more aggressive about entering dyntick-idle mode. + * Offloading of callbacks can also be used as an energy-efficiency + * measure because CPUs with no RCU callbacks queued are more aggressive + * about entering dyntick-idle mode. */ @@ -1987,10 +1866,7 @@ static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype, raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); } -/* - * Does the specified CPU need an RCU callback for this invocation - * of rcu_barrier()? - */ +/* Does rcu_barrier need to queue an RCU callback on the specified CPU? */ static bool rcu_nocb_cpu_needs_barrier(int cpu) { struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); @@ -2006,8 +1882,8 @@ static bool rcu_nocb_cpu_needs_barrier(int cpu) * callbacks would be posted. In the worst case, the first * barrier in rcu_barrier() suffices (but the caller cannot * necessarily rely on this, not a substitute for the caller - * getting the concurrency design right!). There must also be - * a barrier between the following load an posting of a callback + * getting the concurrency design right!). There must also be a + * barrier between the following load and posting of a callback * (if a callback is in fact needed). This is associated with an * atomic_inc() in the caller. */ @@ -2517,9 +2393,9 @@ static void rcu_spawn_one_nocb_kthread(int cpu) /* * If the specified CPU is a no-CBs CPU that does not already have its - * rcuo kthreads, spawn them. + * rcuo kthread, spawn it. */ -static void rcu_spawn_all_nocb_kthreads(int cpu) +static void rcu_spawn_cpu_nocb_kthread(int cpu) { if (rcu_scheduler_fully_active) rcu_spawn_one_nocb_kthread(cpu); @@ -2536,7 +2412,7 @@ static void __init rcu_spawn_nocb_kthreads(void) int cpu; for_each_online_cpu(cpu) - rcu_spawn_all_nocb_kthreads(cpu); + rcu_spawn_cpu_nocb_kthread(cpu); } /* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */ @@ -2670,7 +2546,7 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) { } -static void rcu_spawn_all_nocb_kthreads(int cpu) +static void rcu_spawn_cpu_nocb_kthread(int cpu) { } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c @@ -1,26 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Read-Copy Update mechanism for mutual exclusion * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2001 * * Authors: Dipankar Sarma <dipankar@in.ibm.com> * Manfred Spraul <manfred@colorfullife.com> * - * Based on the original work by Paul McKenney <paulmck@us.ibm.com> + * Based on the original work by Paul McKenney <paulmck@linux.ibm.com> * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * Papers: * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c @@ -48,8 +48,8 @@ EXPORT_SYMBOL_GPL(cpufreq_add_update_util_hook); * * Clear the update_util_data pointer for the given CPU. * - * Callers must use RCU-sched callbacks to free any memory that might be - * accessed via the old update_util_data pointer or invoke synchronize_sched() + * Callers must use RCU callbacks to free any memory that might be + * accessed via the old update_util_data pointer or invoke synchronize_rcu() * right after this function to avoid use-after-free. */ void cpufreq_remove_update_util_hook(int cpu) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c @@ -859,7 +859,7 @@ static void sugov_stop(struct cpufreq_policy *policy) for_each_cpu(cpu, policy->cpus) cpufreq_remove_update_util_hook(cpu); - synchronize_sched(); + synchronize_rcu(); if (!policy->fast_switch_enabled) { irq_work_sync(&sg_policy->irq_work); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h @@ -1260,7 +1260,7 @@ extern void sched_ttwu_pending(void); /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. - * See detach_destroy_domains: synchronize_sched for details. + * See destroy_sched_domains: call_rcu for details. * * The domain tree of any CPU may only be accessed from within * preempt-disabled sections. diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c @@ -442,7 +442,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) raw_spin_unlock_irqrestore(&rq->lock, flags); if (old_rd) - call_rcu_sched(&old_rd->rcu, free_rootdomain); + call_rcu(&old_rd->rcu, free_rootdomain); } void sched_get_rd(struct root_domain *rd) @@ -455,7 +455,7 @@ void sched_put_rd(struct root_domain *rd) if (!atomic_dec_and_test(&rd->refcount)) return; - call_rcu_sched(&rd->rcu, free_rootdomain); + call_rcu(&rd->rcu, free_rootdomain); } static int init_rootdomain(struct root_domain *rd) diff --git a/kernel/time/timer.c b/kernel/time/timer.c @@ -1632,7 +1632,7 @@ void update_process_times(int user_tick) /* Note: this timer irq context must be accounted for as well. */ account_process_tick(p, user_tick); run_local_timers(); - rcu_check_callbacks(user_tick); + rcu_sched_clock_irq(user_tick); #ifdef CONFIG_IRQ_WORK if (in_irq()) irq_work_tick(); diff --git a/kernel/torture.c b/kernel/torture.c @@ -1,23 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Common functions for in-kernel torture tests. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (C) IBM Corporation, 2014 * - * Author: Paul E. McKenney <paulmck@us.ibm.com> + * Author: Paul E. McKenney <paulmck@linux.ibm.com> * Based on kernel/rcu/torture.c. */ @@ -53,7 +40,7 @@ #include "rcu/rcu.h" MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); static char *torture_type; static int verbose; @@ -75,6 +62,7 @@ static DEFINE_MUTEX(fullstop_mutex); static struct task_struct *onoff_task; static long onoff_holdoff; static long onoff_interval; +static torture_ofl_func *onoff_f; static long n_offline_attempts; static long n_offline_successes; static unsigned long sum_offline; @@ -118,6 +106,8 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, pr_alert("%s" TORTURE_FLAG "torture_onoff task: offlined %d\n", torture_type, cpu); + if (onoff_f) + onoff_f(); (*n_offl_successes)++; delta = jiffies - starttime; *sum_offl += delta; @@ -243,11 +233,12 @@ stop: /* * Initiate online-offline handling. */ -int torture_onoff_init(long ooholdoff, long oointerval) +int torture_onoff_init(long ooholdoff, long oointerval, torture_ofl_func *f) { #ifdef CONFIG_HOTPLUG_CPU onoff_holdoff = ooholdoff; onoff_interval = oointerval; + onoff_f = f; if (onoff_interval <= 0) return 0; return torture_create_kthread(torture_onoff, NULL, onoff_task); diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h @@ -0,0 +1,2263 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* nolibc.h + * Copyright (C) 2017-2018 Willy Tarreau <w@1wt.eu> + */ + +/* + * This file is designed to be used as a libc alternative for minimal programs + * with very limited requirements. It consists of a small number of syscall and + * type definitions, and the minimal startup code needed to call main(). + * All syscalls are declared as static functions so that they can be optimized + * away by the compiler when not used. + * + * Syscalls are split into 3 levels: + * - The lower level is the arch-specific syscall() definition, consisting in + * assembly code in compound expressions. These are called my_syscall0() to + * my_syscall6() depending on the number of arguments. The MIPS + * implementation is limited to 5 arguments. All input arguments are cast + * to a long stored in a register. These expressions always return the + * syscall's return value as a signed long value which is often either a + * pointer or the negated errno value. + * + * - The second level is mostly architecture-independent. It is made of + * static functions called sys_<name>() which rely on my_syscallN() + * depending on the syscall definition. These functions are responsible + * for exposing the appropriate types for the syscall arguments (int, + * pointers, etc) and for setting the appropriate return type (often int). + * A few of them are architecture-specific because the syscalls are not all + * mapped exactly the same among architectures. For example, some archs do + * not implement select() and need pselect6() instead, so the sys_select() + * function will have to abstract this. + * + * - The third level is the libc call definition. It exposes the lower raw + * sys_<name>() calls in a way that looks like what a libc usually does, + * takes care of specific input values, and of setting errno upon error. + * There can be minor variations compared to standard libc calls. For + * example the open() call always takes 3 args here. + * + * The errno variable is declared static and unused. This way it can be + * optimized away if not used. However this means that a program made of + * multiple C files may observe different errno values (one per C file). For + * the type of programs this project targets it usually is not a problem. The + * resulting program may even be reduced by defining the NOLIBC_IGNORE_ERRNO + * macro, in which case the errno value will never be assigned. + * + * Some stdint-like integer types are defined. These are valid on all currently + * supported architectures, because signs are enforced, ints are assumed to be + * 32 bits, longs the size of a pointer and long long 64 bits. If more + * architectures have to be supported, this may need to be adapted. + * + * Some macro definitions like the O_* values passed to open(), and some + * structures like the sys_stat struct depend on the architecture. + * + * The definitions start with the architecture-specific parts, which are picked + * based on what the compiler knows about the target architecture, and are + * completed with the generic code. Since it is the compiler which sets the + * target architecture, cross-compiling normally works out of the box without + * having to specify anything. + * + * Finally some very common libc-level functions are provided. It is the case + * for a few functions usually found in string.h, ctype.h, or stdlib.h. Nothing + * is currently provided regarding stdio emulation. + * + * The macro NOLIBC is always defined, so that it is possible for a program to + * check this macro to know if it is being built against and decide to disable + * some features or simply not to include some standard libc files. + * + * Ideally this file should be split in multiple files for easier long term + * maintenance, but provided as a single file as it is now, it's quite + * convenient to use. Maybe some variations involving a set of includes at the + * top could work. + * + * A simple static executable may be built this way : + * $ gcc -fno-asynchronous-unwind-tables -fno-ident -s -Os -nostdlib \ + * -static -include nolibc.h -lgcc -o hello hello.c + * + * A very useful calling convention table may be found here : + * http://man7.org/linux/man-pages/man2/syscall.2.html + * + * This doc is quite convenient though not necessarily up to date : + * https://w3challs.com/syscalls/ + * + */ + +/* Some archs (at least aarch64) don't expose the regular syscalls anymore by + * default, either because they have an "_at" replacement, or because there are + * more modern alternatives. For now we'd rather still use them. + */ +#define __ARCH_WANT_SYSCALL_NO_AT +#define __ARCH_WANT_SYSCALL_NO_FLAGS +#define __ARCH_WANT_SYSCALL_DEPRECATED + +#include <asm/unistd.h> +#include <asm/ioctls.h> +#include <asm/errno.h> +#include <linux/fs.h> +#include <linux/loop.h> + +#define NOLIBC + +/* this way it will be removed if unused */ +static int errno; + +#ifndef NOLIBC_IGNORE_ERRNO +#define SET_ERRNO(v) do { errno = (v); } while (0) +#else +#define SET_ERRNO(v) do { } while (0) +#endif + +/* errno codes all ensure that they will not conflict with a valid pointer + * because they all correspond to the highest addressable memry page. + */ +#define MAX_ERRNO 4095 + +/* Declare a few quite common macros and types that usually are in stdlib.h, + * stdint.h, ctype.h, unistd.h and a few other common locations. + */ + +#define NULL ((void *)0) + +/* stdint types */ +typedef unsigned char uint8_t; +typedef signed char int8_t; +typedef unsigned short uint16_t; +typedef signed short int16_t; +typedef unsigned int uint32_t; +typedef signed int int32_t; +typedef unsigned long long uint64_t; +typedef signed long long int64_t; +typedef unsigned long size_t; +typedef signed long ssize_t; +typedef unsigned long uintptr_t; +typedef signed long intptr_t; +typedef signed long ptrdiff_t; + +/* for stat() */ +typedef unsigned int dev_t; +typedef unsigned long ino_t; +typedef unsigned int mode_t; +typedef signed int pid_t; +typedef unsigned int uid_t; +typedef unsigned int gid_t; +typedef unsigned long nlink_t; +typedef signed long off_t; +typedef signed long blksize_t; +typedef signed long blkcnt_t; +typedef signed long time_t; + +/* for poll() */ +struct pollfd { + int fd; + short int events; + short int revents; +}; + +/* for select() */ +struct timeval { + long tv_sec; + long tv_usec; +}; + +/* for pselect() */ +struct timespec { + long tv_sec; + long tv_nsec; +}; + +/* for gettimeofday() */ +struct timezone { + int tz_minuteswest; + int tz_dsttime; +}; + +/* for getdents64() */ +struct linux_dirent64 { + uint64_t d_ino; + int64_t d_off; + unsigned short d_reclen; + unsigned char d_type; + char d_name[]; +}; + +/* commonly an fd_set represents 256 FDs */ +#define FD_SETSIZE 256 +typedef struct { uint32_t fd32[FD_SETSIZE/32]; } fd_set; + +/* needed by wait4() */ +struct rusage { + struct timeval ru_utime; + struct timeval ru_stime; + long ru_maxrss; + long ru_ixrss; + long ru_idrss; + long ru_isrss; + long ru_minflt; + long ru_majflt; + long ru_nswap; + long ru_inblock; + long ru_oublock; + long ru_msgsnd; + long ru_msgrcv; + long ru_nsignals; + long ru_nvcsw; + long ru_nivcsw; +}; + +/* stat flags (WARNING, octal here) */ +#define S_IFDIR 0040000 +#define S_IFCHR 0020000 +#define S_IFBLK 0060000 +#define S_IFREG 0100000 +#define S_IFIFO 0010000 +#define S_IFLNK 0120000 +#define S_IFSOCK 0140000 +#define S_IFMT 0170000 + +#define S_ISDIR(mode) (((mode) & S_IFDIR) == S_IFDIR) +#define S_ISCHR(mode) (((mode) & S_IFCHR) == S_IFCHR) +#define S_ISBLK(mode) (((mode) & S_IFBLK) == S_IFBLK) +#define S_ISREG(mode) (((mode) & S_IFREG) == S_IFREG) +#define S_ISFIFO(mode) (((mode) & S_IFIFO) == S_IFIFO) +#define S_ISLNK(mode) (((mode) & S_IFLNK) == S_IFLNK) +#define S_ISSOCK(mode) (((mode) & S_IFSOCK) == S_IFSOCK) + +#define DT_UNKNOWN 0 +#define DT_FIFO 1 +#define DT_CHR 2 +#define DT_DIR 4 +#define DT_BLK 6 +#define DT_REG 8 +#define DT_LNK 10 +#define DT_SOCK 12 + +/* all the *at functions */ +#ifndef AT_FDWCD +#define AT_FDCWD -100 +#endif + +/* lseek */ +#define SEEK_SET 0 +#define SEEK_CUR 1 +#define SEEK_END 2 + +/* reboot */ +#define LINUX_REBOOT_MAGIC1 0xfee1dead +#define LINUX_REBOOT_MAGIC2 0x28121969 +#define LINUX_REBOOT_CMD_HALT 0xcdef0123 +#define LINUX_REBOOT_CMD_POWER_OFF 0x4321fedc +#define LINUX_REBOOT_CMD_RESTART 0x01234567 +#define LINUX_REBOOT_CMD_SW_SUSPEND 0xd000fce2 + + +/* The format of the struct as returned by the libc to the application, which + * significantly differs from the format returned by the stat() syscall flavours. + */ +struct stat { + dev_t st_dev; /* ID of device containing file */ + ino_t st_ino; /* inode number */ + mode_t st_mode; /* protection */ + nlink_t st_nlink; /* number of hard links */ + uid_t st_uid; /* user ID of owner */ + gid_t st_gid; /* group ID of owner */ + dev_t st_rdev; /* device ID (if special file) */ + off_t st_size; /* total size, in bytes */ + blksize_t st_blksize; /* blocksize for file system I/O */ + blkcnt_t st_blocks; /* number of 512B blocks allocated */ + time_t st_atime; /* time of last access */ + time_t st_mtime; /* time of last modification */ + time_t st_ctime; /* time of last status change */ +}; + +#define WEXITSTATUS(status) (((status) & 0xff00) >> 8) +#define WIFEXITED(status) (((status) & 0x7f) == 0) + + +/* Below comes the architecture-specific code. For each architecture, we have + * the syscall declarations and the _start code definition. This is the only + * global part. On all architectures the kernel puts everything in the stack + * before jumping to _start just above us, without any return address (_start + * is not a function but an entry pint). So at the stack pointer we find argc. + * Then argv[] begins, and ends at the first NULL. Then we have envp which + * starts and ends with a NULL as well. So envp=argv+argc+1. + */ + +#if defined(__x86_64__) +/* Syscalls for x86_64 : + * - registers are 64-bit + * - syscall number is passed in rax + * - arguments are in rdi, rsi, rdx, r10, r8, r9 respectively + * - the system call is performed by calling the syscall instruction + * - syscall return comes in rax + * - rcx and r8..r11 may be clobbered, others are preserved. + * - the arguments are cast to long and assigned into the target registers + * which are then simply passed as registers to the asm code, so that we + * don't have to experience issues with register constraints. + * - the syscall number is always specified last in order to allow to force + * some registers before (gcc refuses a %-register at the last position). + */ + +#define my_syscall0(num) \ +({ \ + long _ret; \ + register long _num asm("rax") = (num); \ + \ + asm volatile ( \ + "syscall\n" \ + : "=a" (_ret) \ + : "0"(_num) \ + : "rcx", "r8", "r9", "r10", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall1(num, arg1) \ +({ \ + long _ret; \ + register long _num asm("rax") = (num); \ + register long _arg1 asm("rdi") = (long)(arg1); \ + \ + asm volatile ( \ + "syscall\n" \ + : "=a" (_ret) \ + : "r"(_arg1), \ + "0"(_num) \ + : "rcx", "r8", "r9", "r10", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall2(num, arg1, arg2) \ +({ \ + long _ret; \ + register long _num asm("rax") = (num); \ + register long _arg1 asm("rdi") = (long)(arg1); \ + register long _arg2 asm("rsi") = (long)(arg2); \ + \ + asm volatile ( \ + "syscall\n" \ + : "=a" (_ret) \ + : "r"(_arg1), "r"(_arg2), \ + "0"(_num) \ + : "rcx", "r8", "r9", "r10", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall3(num, arg1, arg2, arg3) \ +({ \ + long _ret; \ + register long _num asm("rax") = (num); \ + register long _arg1 asm("rdi") = (long)(arg1); \ + register long _arg2 asm("rsi") = (long)(arg2); \ + register long _arg3 asm("rdx") = (long)(arg3); \ + \ + asm volatile ( \ + "syscall\n" \ + : "=a" (_ret) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ + "0"(_num) \ + : "rcx", "r8", "r9", "r10", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall4(num, arg1, arg2, arg3, arg4) \ +({ \ + long _ret; \ + register long _num asm("rax") = (num); \ + register long _arg1 asm("rdi") = (long)(arg1); \ + register long _arg2 asm("rsi") = (long)(arg2); \ + register long _arg3 asm("rdx") = (long)(arg3); \ + register long _arg4 asm("r10") = (long)(arg4); \ + \ + asm volatile ( \ + "syscall\n" \ + : "=a" (_ret), "=r"(_arg4) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ + "0"(_num) \ + : "rcx", "r8", "r9", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ +({ \ + long _ret; \ + register long _num asm("rax") = (num); \ + register long _arg1 asm("rdi") = (long)(arg1); \ + register long _arg2 asm("rsi") = (long)(arg2); \ + register long _arg3 asm("rdx") = (long)(arg3); \ + register long _arg4 asm("r10") = (long)(arg4); \ + register long _arg5 asm("r8") = (long)(arg5); \ + \ + asm volatile ( \ + "syscall\n" \ + : "=a" (_ret), "=r"(_arg4), "=r"(_arg5) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "0"(_num) \ + : "rcx", "r9", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \ +({ \ + long _ret; \ + register long _num asm("rax") = (num); \ + register long _arg1 asm("rdi") = (long)(arg1); \ + register long _arg2 asm("rsi") = (long)(arg2); \ + register long _arg3 asm("rdx") = (long)(arg3); \ + register long _arg4 asm("r10") = (long)(arg4); \ + register long _arg5 asm("r8") = (long)(arg5); \ + register long _arg6 asm("r9") = (long)(arg6); \ + \ + asm volatile ( \ + "syscall\n" \ + : "=a" (_ret), "=r"(_arg4), "=r"(_arg5) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "r"(_arg6), "0"(_num) \ + : "rcx", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +/* startup code */ +asm(".section .text\n" + ".global _start\n" + "_start:\n" + "pop %rdi\n" // argc (first arg, %rdi) + "mov %rsp, %rsi\n" // argv[] (second arg, %rsi) + "lea 8(%rsi,%rdi,8),%rdx\n" // then a NULL then envp (third arg, %rdx) + "and $-16, %rsp\n" // x86 ABI : esp must be 16-byte aligned when + "sub $8, %rsp\n" // entering the callee + "call main\n" // main() returns the status code, we'll exit with it. + "movzb %al, %rdi\n" // retrieve exit code from 8 lower bits + "mov $60, %rax\n" // NR_exit == 60 + "syscall\n" // really exit + "hlt\n" // ensure it does not return + ""); + +/* fcntl / open */ +#define O_RDONLY 0 +#define O_WRONLY 1 +#define O_RDWR 2 +#define O_CREAT 0x40 +#define O_EXCL 0x80 +#define O_NOCTTY 0x100 +#define O_TRUNC 0x200 +#define O_APPEND 0x400 +#define O_NONBLOCK 0x800 +#define O_DIRECTORY 0x10000 + +/* The struct returned by the stat() syscall, equivalent to stat64(). The + * syscall returns 116 bytes and stops in the middle of __unused. + */ +struct sys_stat_struct { + unsigned long st_dev; + unsigned long st_ino; + unsigned long st_nlink; + unsigned int st_mode; + unsigned int st_uid; + + unsigned int st_gid; + unsigned int __pad0; + unsigned long st_rdev; + long st_size; + long st_blksize; + + long st_blocks; + unsigned long st_atime; + unsigned long st_atime_nsec; + unsigned long st_mtime; + + unsigned long st_mtime_nsec; + unsigned long st_ctime; + unsigned long st_ctime_nsec; + long __unused[3]; +}; + +#elif defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) +/* Syscalls for i386 : + * - mostly similar to x86_64 + * - registers are 32-bit + * - syscall number is passed in eax + * - arguments are in ebx, ecx, edx, esi, edi, ebp respectively + * - all registers are preserved (except eax of course) + * - the system call is performed by calling int $0x80 + * - syscall return comes in eax + * - the arguments are cast to long and assigned into the target registers + * which are then simply passed as registers to the asm code, so that we + * don't have to experience issues with register constraints. + * - the syscall number is always specified last in order to allow to force + * some registers before (gcc refuses a %-register at the last position). + * + * Also, i386 supports the old_select syscall if newselect is not available + */ +#define __ARCH_WANT_SYS_OLD_SELECT + +#define my_syscall0(num) \ +({ \ + long _ret; \ + register long _num asm("eax") = (num); \ + \ + asm volatile ( \ + "int $0x80\n" \ + : "=a" (_ret) \ + : "0"(_num) \ + : "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall1(num, arg1) \ +({ \ + long _ret; \ + register long _num asm("eax") = (num); \ + register long _arg1 asm("ebx") = (long)(arg1); \ + \ + asm volatile ( \ + "int $0x80\n" \ + : "=a" (_ret) \ + : "r"(_arg1), \ + "0"(_num) \ + : "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall2(num, arg1, arg2) \ +({ \ + long _ret; \ + register long _num asm("eax") = (num); \ + register long _arg1 asm("ebx") = (long)(arg1); \ + register long _arg2 asm("ecx") = (long)(arg2); \ + \ + asm volatile ( \ + "int $0x80\n" \ + : "=a" (_ret) \ + : "r"(_arg1), "r"(_arg2), \ + "0"(_num) \ + : "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall3(num, arg1, arg2, arg3) \ +({ \ + long _ret; \ + register long _num asm("eax") = (num); \ + register long _arg1 asm("ebx") = (long)(arg1); \ + register long _arg2 asm("ecx") = (long)(arg2); \ + register long _arg3 asm("edx") = (long)(arg3); \ + \ + asm volatile ( \ + "int $0x80\n" \ + : "=a" (_ret) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ + "0"(_num) \ + : "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall4(num, arg1, arg2, arg3, arg4) \ +({ \ + long _ret; \ + register long _num asm("eax") = (num); \ + register long _arg1 asm("ebx") = (long)(arg1); \ + register long _arg2 asm("ecx") = (long)(arg2); \ + register long _arg3 asm("edx") = (long)(arg3); \ + register long _arg4 asm("esi") = (long)(arg4); \ + \ + asm volatile ( \ + "int $0x80\n" \ + : "=a" (_ret) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ + "0"(_num) \ + : "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ +({ \ + long _ret; \ + register long _num asm("eax") = (num); \ + register long _arg1 asm("ebx") = (long)(arg1); \ + register long _arg2 asm("ecx") = (long)(arg2); \ + register long _arg3 asm("edx") = (long)(arg3); \ + register long _arg4 asm("esi") = (long)(arg4); \ + register long _arg5 asm("edi") = (long)(arg5); \ + \ + asm volatile ( \ + "int $0x80\n" \ + : "=a" (_ret) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "0"(_num) \ + : "memory", "cc" \ + ); \ + _ret; \ +}) + +/* startup code */ +asm(".section .text\n" + ".global _start\n" + "_start:\n" + "pop %eax\n" // argc (first arg, %eax) + "mov %esp, %ebx\n" // argv[] (second arg, %ebx) + "lea 4(%ebx,%eax,4),%ecx\n" // then a NULL then envp (third arg, %ecx) + "and $-16, %esp\n" // x86 ABI : esp must be 16-byte aligned when + "push %ecx\n" // push all registers on the stack so that we + "push %ebx\n" // support both regparm and plain stack modes + "push %eax\n" + "call main\n" // main() returns the status code in %eax + "movzbl %al, %ebx\n" // retrieve exit code from lower 8 bits + "movl $1, %eax\n" // NR_exit == 1 + "int $0x80\n" // exit now + "hlt\n" // ensure it does not + ""); + +/* fcntl / open */ +#define O_RDONLY 0 +#define O_WRONLY 1 +#define O_RDWR 2 +#define O_CREAT 0x40 +#define O_EXCL 0x80 +#define O_NOCTTY 0x100 +#define O_TRUNC 0x200 +#define O_APPEND 0x400 +#define O_NONBLOCK 0x800 +#define O_DIRECTORY 0x10000 + +/* The struct returned by the stat() syscall, 32-bit only, the syscall returns + * exactly 56 bytes (stops before the unused array). + */ +struct sys_stat_struct { + unsigned long st_dev; + unsigned long st_ino; + unsigned short st_mode; + unsigned short st_nlink; + unsigned short st_uid; + unsigned short st_gid; + + unsigned long st_rdev; + unsigned long st_size; + unsigned long st_blksize; + unsigned long st_blocks; + + unsigned long st_atime; + unsigned long st_atime_nsec; + unsigned long st_mtime; + unsigned long st_mtime_nsec; + + unsigned long st_ctime; + unsigned long st_ctime_nsec; + unsigned long __unused[2]; +}; + +#elif defined(__ARM_EABI__) +/* Syscalls for ARM in ARM or Thumb modes : + * - registers are 32-bit + * - stack is 8-byte aligned + * ( http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.faqs/ka4127.html) + * - syscall number is passed in r7 + * - arguments are in r0, r1, r2, r3, r4, r5 + * - the system call is performed by calling svc #0 + * - syscall return comes in r0. + * - only lr is clobbered. + * - the arguments are cast to long and assigned into the target registers + * which are then simply passed as registers to the asm code, so that we + * don't have to experience issues with register constraints. + * - the syscall number is always specified last in order to allow to force + * some registers before (gcc refuses a %-register at the last position). + * + * Also, ARM supports the old_select syscall if newselect is not available + */ +#define __ARCH_WANT_SYS_OLD_SELECT + +#define my_syscall0(num) \ +({ \ + register long _num asm("r7") = (num); \ + register long _arg1 asm("r0"); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_num) \ + : "memory", "cc", "lr" \ + ); \ + _arg1; \ +}) + +#define my_syscall1(num, arg1) \ +({ \ + register long _num asm("r7") = (num); \ + register long _arg1 asm("r0") = (long)(arg1); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), \ + "r"(_num) \ + : "memory", "cc", "lr" \ + ); \ + _arg1; \ +}) + +#define my_syscall2(num, arg1, arg2) \ +({ \ + register long _num asm("r7") = (num); \ + register long _arg1 asm("r0") = (long)(arg1); \ + register long _arg2 asm("r1") = (long)(arg2); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), \ + "r"(_num) \ + : "memory", "cc", "lr" \ + ); \ + _arg1; \ +}) + +#define my_syscall3(num, arg1, arg2, arg3) \ +({ \ + register long _num asm("r7") = (num); \ + register long _arg1 asm("r0") = (long)(arg1); \ + register long _arg2 asm("r1") = (long)(arg2); \ + register long _arg3 asm("r2") = (long)(arg3); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ + "r"(_num) \ + : "memory", "cc", "lr" \ + ); \ + _arg1; \ +}) + +#define my_syscall4(num, arg1, arg2, arg3, arg4) \ +({ \ + register long _num asm("r7") = (num); \ + register long _arg1 asm("r0") = (long)(arg1); \ + register long _arg2 asm("r1") = (long)(arg2); \ + register long _arg3 asm("r2") = (long)(arg3); \ + register long _arg4 asm("r3") = (long)(arg4); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ + "r"(_num) \ + : "memory", "cc", "lr" \ + ); \ + _arg1; \ +}) + +#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ +({ \ + register long _num asm("r7") = (num); \ + register long _arg1 asm("r0") = (long)(arg1); \ + register long _arg2 asm("r1") = (long)(arg2); \ + register long _arg3 asm("r2") = (long)(arg3); \ + register long _arg4 asm("r3") = (long)(arg4); \ + register long _arg5 asm("r4") = (long)(arg5); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r" (_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "r"(_num) \ + : "memory", "cc", "lr" \ + ); \ + _arg1; \ +}) + +/* startup code */ +asm(".section .text\n" + ".global _start\n" + "_start:\n" +#if defined(__THUMBEB__) || defined(__THUMBEL__) + /* We enter here in 32-bit mode but if some previous functions were in + * 16-bit mode, the assembler cannot know, so we need to tell it we're in + * 32-bit now, then switch to 16-bit (is there a better way to do it than + * adding 1 by hand ?) and tell the asm we're now in 16-bit mode so that + * it generates correct instructions. Note that we do not support thumb1. + */ + ".code 32\n" + "add r0, pc, #1\n" + "bx r0\n" + ".code 16\n" +#endif + "pop {%r0}\n" // argc was in the stack + "mov %r1, %sp\n" // argv = sp + "add %r2, %r1, %r0, lsl #2\n" // envp = argv + 4*argc ... + "add %r2, %r2, $4\n" // ... + 4 + "and %r3, %r1, $-8\n" // AAPCS : sp must be 8-byte aligned in the + "mov %sp, %r3\n" // callee, an bl doesn't push (lr=pc) + "bl main\n" // main() returns the status code, we'll exit with it. + "and %r0, %r0, $0xff\n" // limit exit code to 8 bits + "movs r7, $1\n" // NR_exit == 1 + "svc $0x00\n" + ""); + +/* fcntl / open */ +#define O_RDONLY 0 +#define O_WRONLY 1 +#define O_RDWR 2 +#define O_CREAT 0x40 +#define O_EXCL 0x80 +#define O_NOCTTY 0x100 +#define O_TRUNC 0x200 +#define O_APPEND 0x400 +#define O_NONBLOCK 0x800 +#define O_DIRECTORY 0x4000 + +/* The struct returned by the stat() syscall, 32-bit only, the syscall returns + * exactly 56 bytes (stops before the unused array). In big endian, the format + * differs as devices are returned as short only. + */ +struct sys_stat_struct { +#if defined(__ARMEB__) + unsigned short st_dev; + unsigned short __pad1; +#else + unsigned long st_dev; +#endif + unsigned long st_ino; + unsigned short st_mode; + unsigned short st_nlink; + unsigned short st_uid; + unsigned short st_gid; +#if defined(__ARMEB__) + unsigned short st_rdev; + unsigned short __pad2; +#else + unsigned long st_rdev; +#endif + unsigned long st_size; + unsigned long st_blksize; + unsigned long st_blocks; + unsigned long st_atime; + unsigned long st_atime_nsec; + unsigned long st_mtime; + unsigned long st_mtime_nsec; + unsigned long st_ctime; + unsigned long st_ctime_nsec; + unsigned long __unused[2]; +}; + +#elif defined(__aarch64__) +/* Syscalls for AARCH64 : + * - registers are 64-bit + * - stack is 16-byte aligned + * - syscall number is passed in x8 + * - arguments are in x0, x1, x2, x3, x4, x5 + * - the system call is performed by calling svc 0 + * - syscall return comes in x0. + * - the arguments are cast to long and assigned into the target registers + * which are then simply passed as registers to the asm code, so that we + * don't have to experience issues with register constraints. + * + * On aarch64, select() is not implemented so we have to use pselect6(). + */ +#define __ARCH_WANT_SYS_PSELECT6 + +#define my_syscall0(num) \ +({ \ + register long _num asm("x8") = (num); \ + register long _arg1 asm("x0"); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall1(num, arg1) \ +({ \ + register long _num asm("x8") = (num); \ + register long _arg1 asm("x0") = (long)(arg1); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall2(num, arg1, arg2) \ +({ \ + register long _num asm("x8") = (num); \ + register long _arg1 asm("x0") = (long)(arg1); \ + register long _arg2 asm("x1") = (long)(arg2); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall3(num, arg1, arg2, arg3) \ +({ \ + register long _num asm("x8") = (num); \ + register long _arg1 asm("x0") = (long)(arg1); \ + register long _arg2 asm("x1") = (long)(arg2); \ + register long _arg3 asm("x2") = (long)(arg3); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall4(num, arg1, arg2, arg3, arg4) \ +({ \ + register long _num asm("x8") = (num); \ + register long _arg1 asm("x0") = (long)(arg1); \ + register long _arg2 asm("x1") = (long)(arg2); \ + register long _arg3 asm("x2") = (long)(arg3); \ + register long _arg4 asm("x3") = (long)(arg4); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ +({ \ + register long _num asm("x8") = (num); \ + register long _arg1 asm("x0") = (long)(arg1); \ + register long _arg2 asm("x1") = (long)(arg2); \ + register long _arg3 asm("x2") = (long)(arg3); \ + register long _arg4 asm("x3") = (long)(arg4); \ + register long _arg5 asm("x4") = (long)(arg5); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r" (_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \ +({ \ + register long _num asm("x8") = (num); \ + register long _arg1 asm("x0") = (long)(arg1); \ + register long _arg2 asm("x1") = (long)(arg2); \ + register long _arg3 asm("x2") = (long)(arg3); \ + register long _arg4 asm("x3") = (long)(arg4); \ + register long _arg5 asm("x4") = (long)(arg5); \ + register long _arg6 asm("x5") = (long)(arg6); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r" (_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "r"(_arg6), "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +/* startup code */ +asm(".section .text\n" + ".global _start\n" + "_start:\n" + "ldr x0, [sp]\n" // argc (x0) was in the stack + "add x1, sp, 8\n" // argv (x1) = sp + "lsl x2, x0, 3\n" // envp (x2) = 8*argc ... + "add x2, x2, 8\n" // + 8 (skip null) + "add x2, x2, x1\n" // + argv + "and sp, x1, -16\n" // sp must be 16-byte aligned in the callee + "bl main\n" // main() returns the status code, we'll exit with it. + "and x0, x0, 0xff\n" // limit exit code to 8 bits + "mov x8, 93\n" // NR_exit == 93 + "svc #0\n" + ""); + +/* fcntl / open */ +#define O_RDONLY 0 +#define O_WRONLY 1 +#define O_RDWR 2 +#define O_CREAT 0x40 +#define O_EXCL 0x80 +#define O_NOCTTY 0x100 +#define O_TRUNC 0x200 +#define O_APPEND 0x400 +#define O_NONBLOCK 0x800 +#define O_DIRECTORY 0x4000 + +/* The struct returned by the newfstatat() syscall. Differs slightly from the + * x86_64's stat one by field ordering, so be careful. + */ +struct sys_stat_struct { + unsigned long st_dev; + unsigned long st_ino; + unsigned int st_mode; + unsigned int st_nlink; + unsigned int st_uid; + unsigned int st_gid; + + unsigned long st_rdev; + unsigned long __pad1; + long st_size; + int st_blksize; + int __pad2; + + long st_blocks; + long st_atime; + unsigned long st_atime_nsec; + long st_mtime; + + unsigned long st_mtime_nsec; + long st_ctime; + unsigned long st_ctime_nsec; + unsigned int __unused[2]; +}; + +#elif defined(__mips__) && defined(_ABIO32) +/* Syscalls for MIPS ABI O32 : + * - WARNING! there's always a delayed slot! + * - WARNING again, the syntax is different, registers take a '$' and numbers + * do not. + * - registers are 32-bit + * - stack is 8-byte aligned + * - syscall number is passed in v0 (starts at 0xfa0). + * - arguments are in a0, a1, a2, a3, then the stack. The caller needs to + * leave some room in the stack for the callee to save a0..a3 if needed. + * - Many registers are clobbered, in fact only a0..a2 and s0..s8 are + * preserved. See: https://www.linux-mips.org/wiki/Syscall as well as + * scall32-o32.S in the kernel sources. + * - the system call is performed by calling "syscall" + * - syscall return comes in v0, and register a3 needs to be checked to know + * if an error occured, in which case errno is in v0. + * - the arguments are cast to long and assigned into the target registers + * which are then simply passed as registers to the asm code, so that we + * don't have to experience issues with register constraints. + */ + +#define my_syscall0(num) \ +({ \ + register long _num asm("v0") = (num); \ + register long _arg4 asm("a3"); \ + \ + asm volatile ( \ + "addiu $sp, $sp, -32\n" \ + "syscall\n" \ + "addiu $sp, $sp, 32\n" \ + : "=r"(_num), "=r"(_arg4) \ + : "r"(_num) \ + : "memory", "cc", "at", "v1", "hi", "lo", \ + "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ + ); \ + _arg4 ? -_num : _num; \ +}) + +#define my_syscall1(num, arg1) \ +({ \ + register long _num asm("v0") = (num); \ + register long _arg1 asm("a0") = (long)(arg1); \ + register long _arg4 asm("a3"); \ + \ + asm volatile ( \ + "addiu $sp, $sp, -32\n" \ + "syscall\n" \ + "addiu $sp, $sp, 32\n" \ + : "=r"(_num), "=r"(_arg4) \ + : "0"(_num), \ + "r"(_arg1) \ + : "memory", "cc", "at", "v1", "hi", "lo", \ + "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ + ); \ + _arg4 ? -_num : _num; \ +}) + +#define my_syscall2(num, arg1, arg2) \ +({ \ + register long _num asm("v0") = (num); \ + register long _arg1 asm("a0") = (long)(arg1); \ + register long _arg2 asm("a1") = (long)(arg2); \ + register long _arg4 asm("a3"); \ + \ + asm volatile ( \ + "addiu $sp, $sp, -32\n" \ + "syscall\n" \ + "addiu $sp, $sp, 32\n" \ + : "=r"(_num), "=r"(_arg4) \ + : "0"(_num), \ + "r"(_arg1), "r"(_arg2) \ + : "memory", "cc", "at", "v1", "hi", "lo", \ + "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ + ); \ + _arg4 ? -_num : _num; \ +}) + +#define my_syscall3(num, arg1, arg2, arg3) \ +({ \ + register long _num asm("v0") = (num); \ + register long _arg1 asm("a0") = (long)(arg1); \ + register long _arg2 asm("a1") = (long)(arg2); \ + register long _arg3 asm("a2") = (long)(arg3); \ + register long _arg4 asm("a3"); \ + \ + asm volatile ( \ + "addiu $sp, $sp, -32\n" \ + "syscall\n" \ + "addiu $sp, $sp, 32\n" \ + : "=r"(_num), "=r"(_arg4) \ + : "0"(_num), \ + "r"(_arg1), "r"(_arg2), "r"(_arg3) \ + : "memory", "cc", "at", "v1", "hi", "lo", \ + "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ + ); \ + _arg4 ? -_num : _num; \ +}) + +#define my_syscall4(num, arg1, arg2, arg3, arg4) \ +({ \ + register long _num asm("v0") = (num); \ + register long _arg1 asm("a0") = (long)(arg1); \ + register long _arg2 asm("a1") = (long)(arg2); \ + register long _arg3 asm("a2") = (long)(arg3); \ + register long _arg4 asm("a3") = (long)(arg4); \ + \ + asm volatile ( \ + "addiu $sp, $sp, -32\n" \ + "syscall\n" \ + "addiu $sp, $sp, 32\n" \ + : "=r" (_num), "=r"(_arg4) \ + : "0"(_num), \ + "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4) \ + : "memory", "cc", "at", "v1", "hi", "lo", \ + "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ + ); \ + _arg4 ? -_num : _num; \ +}) + +#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ +({ \ + register long _num asm("v0") = (num); \ + register long _arg1 asm("a0") = (long)(arg1); \ + register long _arg2 asm("a1") = (long)(arg2); \ + register long _arg3 asm("a2") = (long)(arg3); \ + register long _arg4 asm("a3") = (long)(arg4); \ + register long _arg5 = (long)(arg5); \ + \ + asm volatile ( \ + "addiu $sp, $sp, -32\n" \ + "sw %7, 16($sp)\n" \ + "syscall\n " \ + "addiu $sp, $sp, 32\n" \ + : "=r" (_num), "=r"(_arg4) \ + : "0"(_num), \ + "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5) \ + : "memory", "cc", "at", "v1", "hi", "lo", \ + "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ + ); \ + _arg4 ? -_num : _num; \ +}) + +/* startup code, note that it's called __start on MIPS */ +asm(".section .text\n" + ".set nomips16\n" + ".global __start\n" + ".set noreorder\n" + ".option pic0\n" + ".ent __start\n" + "__start:\n" + "lw $a0,($sp)\n" // argc was in the stack + "addiu $a1, $sp, 4\n" // argv = sp + 4 + "sll $a2, $a0, 2\n" // a2 = argc * 4 + "add $a2, $a2, $a1\n" // envp = argv + 4*argc ... + "addiu $a2, $a2, 4\n" // ... + 4 + "li $t0, -8\n" + "and $sp, $sp, $t0\n" // sp must be 8-byte aligned + "addiu $sp,$sp,-16\n" // the callee expects to save a0..a3 there! + "jal main\n" // main() returns the status code, we'll exit with it. + "nop\n" // delayed slot + "and $a0, $v0, 0xff\n" // limit exit code to 8 bits + "li $v0, 4001\n" // NR_exit == 4001 + "syscall\n" + ".end __start\n" + ""); + +/* fcntl / open */ +#define O_RDONLY 0 +#define O_WRONLY 1 +#define O_RDWR 2 +#define O_APPEND 0x0008 +#define O_NONBLOCK 0x0080 +#define O_CREAT 0x0100 +#define O_TRUNC 0x0200 +#define O_EXCL 0x0400 +#define O_NOCTTY 0x0800 +#define O_DIRECTORY 0x10000 + +/* The struct returned by the stat() syscall. 88 bytes are returned by the + * syscall. + */ +struct sys_stat_struct { + unsigned int st_dev; + long st_pad1[3]; + unsigned long st_ino; + unsigned int st_mode; + unsigned int st_nlink; + unsigned int st_uid; + unsigned int st_gid; + unsigned int st_rdev; + long st_pad2[2]; + long st_size; + long st_pad3; + long st_atime; + long st_atime_nsec; + long st_mtime; + long st_mtime_nsec; + long st_ctime; + long st_ctime_nsec; + long st_blksize; + long st_blocks; + long st_pad4[14]; +}; + +#endif + + +/* Below are the C functions used to declare the raw syscalls. They try to be + * architecture-agnostic, and return either a success or -errno. Declaring them + * static will lead to them being inlined in most cases, but it's still possible + * to reference them by a pointer if needed. + */ +static __attribute__((unused)) +void *sys_brk(void *addr) +{ + return (void *)my_syscall1(__NR_brk, addr); +} + +static __attribute__((noreturn,unused)) +void sys_exit(int status) +{ + my_syscall1(__NR_exit, status & 255); + while(1); // shut the "noreturn" warnings. +} + +static __attribute__((unused)) +int sys_chdir(const char *path) +{ + return my_syscall1(__NR_chdir, path); +} + +static __attribute__((unused)) +int sys_chmod(const char *path, mode_t mode) +{ +#ifdef __NR_fchmodat + return my_syscall4(__NR_fchmodat, AT_FDCWD, path, mode, 0); +#else + return my_syscall2(__NR_chmod, path, mode); +#endif +} + +static __attribute__((unused)) +int sys_chown(const char *path, uid_t owner, gid_t group) +{ +#ifdef __NR_fchownat + return my_syscall5(__NR_fchownat, AT_FDCWD, path, owner, group, 0); +#else + return my_syscall3(__NR_chown, path, owner, group); +#endif +} + +static __attribute__((unused)) +int sys_chroot(const char *path) +{ + return my_syscall1(__NR_chroot, path); +} + +static __attribute__((unused)) +int sys_close(int fd) +{ + return my_syscall1(__NR_close, fd); +} + +static __attribute__((unused)) +int sys_dup(int fd) +{ + return my_syscall1(__NR_dup, fd); +} + +static __attribute__((unused)) +int sys_dup2(int old, int new) +{ + return my_syscall2(__NR_dup2, old, new); +} + +static __attribute__((unused)) +int sys_execve(const char *filename, char *const argv[], char *const envp[]) +{ + return my_syscall3(__NR_execve, filename, argv, envp); +} + +static __attribute__((unused)) +pid_t sys_fork(void) +{ + return my_syscall0(__NR_fork); +} + +static __attribute__((unused)) +int sys_fsync(int fd) +{ + return my_syscall1(__NR_fsync, fd); +} + +static __attribute__((unused)) +int sys_getdents64(int fd, struct linux_dirent64 *dirp, int count) +{ + return my_syscall3(__NR_getdents64, fd, dirp, count); +} + +static __attribute__((unused)) +pid_t sys_getpgrp(void) +{ + return my_syscall0(__NR_getpgrp); +} + +static __attribute__((unused)) +pid_t sys_getpid(void) +{ + return my_syscall0(__NR_getpid); +} + +static __attribute__((unused)) +int sys_gettimeofday(struct timeval *tv, struct timezone *tz) +{ + return my_syscall2(__NR_gettimeofday, tv, tz); +} + +static __attribute__((unused)) +int sys_ioctl(int fd, unsigned long req, void *value) +{ + return my_syscall3(__NR_ioctl, fd, req, value); +} + +static __attribute__((unused)) +int sys_kill(pid_t pid, int signal) +{ + return my_syscall2(__NR_kill, pid, signal); +} + +static __attribute__((unused)) +int sys_link(const char *old, const char *new) +{ +#ifdef __NR_linkat + return my_syscall5(__NR_linkat, AT_FDCWD, old, AT_FDCWD, new, 0); +#else + return my_syscall2(__NR_link, old, new); +#endif +} + +static __attribute__((unused)) +off_t sys_lseek(int fd, off_t offset, int whence) +{ + return my_syscall3(__NR_lseek, fd, offset, whence); +} + +static __attribute__((unused)) +int sys_mkdir(const char *path, mode_t mode) +{ +#ifdef __NR_mkdirat + return my_syscall3(__NR_mkdirat, AT_FDCWD, path, mode); +#else + return my_syscall2(__NR_mkdir, path, mode); +#endif +} + +static __attribute__((unused)) +long sys_mknod(const char *path, mode_t mode, dev_t dev) +{ +#ifdef __NR_mknodat + return my_syscall4(__NR_mknodat, AT_FDCWD, path, mode, dev); +#else + return my_syscall3(__NR_mknod, path, mode, dev); +#endif +} + +static __attribute__((unused)) +int sys_mount(const char *src, const char *tgt, const char *fst, + unsigned long flags, const void *data) +{ + return my_syscall5(__NR_mount, src, tgt, fst, flags, data); +} + +static __attribute__((unused)) +int sys_open(const char *path, int flags, mode_t mode) +{ +#ifdef __NR_openat + return my_syscall4(__NR_openat, AT_FDCWD, path, flags, mode); +#else + return my_syscall3(__NR_open, path, flags, mode); +#endif +} + +static __attribute__((unused)) +int sys_pivot_root(const char *new, const char *old) +{ + return my_syscall2(__NR_pivot_root, new, old); +} + +static __attribute__((unused)) +int sys_poll(struct pollfd *fds, int nfds, int timeout) +{ + return my_syscall3(__NR_poll, fds, nfds, timeout); +} + +static __attribute__((unused)) +ssize_t sys_read(int fd, void *buf, size_t count) +{ + return my_syscall3(__NR_read, fd, buf, count); +} + +static __attribute__((unused)) +ssize_t sys_reboot(int magic1, int magic2, int cmd, void *arg) +{ + return my_syscall4(__NR_reboot, magic1, magic2, cmd, arg); +} + +static __attribute__((unused)) +int sys_sched_yield(void) +{ + return my_syscall0(__NR_sched_yield); +} + +static __attribute__((unused)) +int sys_select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeval *timeout) +{ +#if defined(__ARCH_WANT_SYS_OLD_SELECT) && !defined(__NR__newselect) + struct sel_arg_struct { + unsigned long n; + fd_set *r, *w, *e; + struct timeval *t; + } arg = { .n = nfds, .r = rfds, .w = wfds, .e = efds, .t = timeout }; + return my_syscall1(__NR_select, &arg); +#elif defined(__ARCH_WANT_SYS_PSELECT6) && defined(__NR_pselect6) + struct timespec t; + + if (timeout) { + t.tv_sec = timeout->tv_sec; + t.tv_nsec = timeout->tv_usec * 1000; + } + return my_syscall6(__NR_pselect6, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL); +#else +#ifndef __NR__newselect +#define __NR__newselect __NR_select +#endif + return my_syscall5(__NR__newselect, nfds, rfds, wfds, efds, timeout); +#endif +} + +static __attribute__((unused)) +int sys_setpgid(pid_t pid, pid_t pgid) +{ + return my_syscall2(__NR_setpgid, pid, pgid); +} + +static __attribute__((unused)) +pid_t sys_setsid(void) +{ + return my_syscall0(__NR_setsid); +} + +static __attribute__((unused)) +int sys_stat(const char *path, struct stat *buf) +{ + struct sys_stat_struct stat; + long ret; + +#ifdef __NR_newfstatat + /* only solution for arm64 */ + ret = my_syscall4(__NR_newfstatat, AT_FDCWD, path, &stat, 0); +#else + ret = my_syscall2(__NR_stat, path, &stat); +#endif + buf->st_dev = stat.st_dev; + buf->st_ino = stat.st_ino; + buf->st_mode = stat.st_mode; + buf->st_nlink = stat.st_nlink; + buf->st_uid = stat.st_uid; + buf->st_gid = stat.st_gid; + buf->st_rdev = stat.st_rdev; + buf->st_size = stat.st_size; + buf->st_blksize = stat.st_blksize; + buf->st_blocks = stat.st_blocks; + buf->st_atime = stat.st_atime; + buf->st_mtime = stat.st_mtime; + buf->st_ctime = stat.st_ctime; + return ret; +} + + +static __attribute__((unused)) +int sys_symlink(const char *old, const char *new) +{ +#ifdef __NR_symlinkat + return my_syscall3(__NR_symlinkat, old, AT_FDCWD, new); +#else + return my_syscall2(__NR_symlink, old, new); +#endif +} + +static __attribute__((unused)) +mode_t sys_umask(mode_t mode) +{ + return my_syscall1(__NR_umask, mode); +} + +static __attribute__((unused)) +int sys_umount2(const char *path, int flags) +{ + return my_syscall2(__NR_umount2, path, flags); +} + +static __attribute__((unused)) +int sys_unlink(const char *path) +{ +#ifdef __NR_unlinkat + return my_syscall3(__NR_unlinkat, AT_FDCWD, path, 0); +#else + return my_syscall1(__NR_unlink, path); +#endif +} + +static __attribute__((unused)) +pid_t sys_wait4(pid_t pid, int *status, int options, struct rusage *rusage) +{ + return my_syscall4(__NR_wait4, pid, status, options, rusage); +} + +static __attribute__((unused)) +pid_t sys_waitpid(pid_t pid, int *status, int options) +{ + return sys_wait4(pid, status, options, 0); +} + +static __attribute__((unused)) +pid_t sys_wait(int *status) +{ + return sys_waitpid(-1, status, 0); +} + +static __attribute__((unused)) +ssize_t sys_write(int fd, const void *buf, size_t count) +{ + return my_syscall3(__NR_write, fd, buf, count); +} + + +/* Below are the libc-compatible syscalls which return x or -1 and set errno. + * They rely on the functions above. Similarly they're marked static so that it + * is possible to assign pointers to them if needed. + */ + +static __attribute__((unused)) +int brk(void *addr) +{ + void *ret = sys_brk(addr); + + if (!ret) { + SET_ERRNO(ENOMEM); + return -1; + } + return 0; +} + +static __attribute__((noreturn,unused)) +void exit(int status) +{ + sys_exit(status); +} + +static __attribute__((unused)) +int chdir(const char *path) +{ + int ret = sys_chdir(path); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int chmod(const char *path, mode_t mode) +{ + int ret = sys_chmod(path, mode); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int chown(const char *path, uid_t owner, gid_t group) +{ + int ret = sys_chown(path, owner, group); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int chroot(const char *path) +{ + int ret = sys_chroot(path); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int close(int fd) +{ + int ret = sys_close(fd); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int dup2(int old, int new) +{ + int ret = sys_dup2(old, new); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int execve(const char *filename, char *const argv[], char *const envp[]) +{ + int ret = sys_execve(filename, argv, envp); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +pid_t fork(void) +{ + pid_t ret = sys_fork(); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int fsync(int fd) +{ + int ret = sys_fsync(fd); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int getdents64(int fd, struct linux_dirent64 *dirp, int count) +{ + int ret = sys_getdents64(fd, dirp, count); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +pid_t getpgrp(void) +{ + pid_t ret = sys_getpgrp(); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +pid_t getpid(void) +{ + pid_t ret = sys_getpid(); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int gettimeofday(struct timeval *tv, struct timezone *tz) +{ + int ret = sys_gettimeofday(tv, tz); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int ioctl(int fd, unsigned long req, void *value) +{ + int ret = sys_ioctl(fd, req, value); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int kill(pid_t pid, int signal) +{ + int ret = sys_kill(pid, signal); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int link(const char *old, const char *new) +{ + int ret = sys_link(old, new); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +off_t lseek(int fd, off_t offset, int whence) +{ + off_t ret = sys_lseek(fd, offset, whence); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int mkdir(const char *path, mode_t mode) +{ + int ret = sys_mkdir(path, mode); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int mknod(const char *path, mode_t mode, dev_t dev) +{ + int ret = sys_mknod(path, mode, dev); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int mount(const char *src, const char *tgt, + const char *fst, unsigned long flags, + const void *data) +{ + int ret = sys_mount(src, tgt, fst, flags, data); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int open(const char *path, int flags, mode_t mode) +{ + int ret = sys_open(path, flags, mode); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int pivot_root(const char *new, const char *old) +{ + int ret = sys_pivot_root(new, old); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int poll(struct pollfd *fds, int nfds, int timeout) +{ + int ret = sys_poll(fds, nfds, timeout); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +ssize_t read(int fd, void *buf, size_t count) +{ + ssize_t ret = sys_read(fd, buf, count); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int reboot(int cmd) +{ + int ret = sys_reboot(LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2, cmd, 0); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +void *sbrk(intptr_t inc) +{ + void *ret; + + /* first call to find current end */ + if ((ret = sys_brk(0)) && (sys_brk(ret + inc) == ret + inc)) + return ret + inc; + + SET_ERRNO(ENOMEM); + return (void *)-1; +} + +static __attribute__((unused)) +int sched_yield(void) +{ + int ret = sys_sched_yield(); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeval *timeout) +{ + int ret = sys_select(nfds, rfds, wfds, efds, timeout); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int setpgid(pid_t pid, pid_t pgid) +{ + int ret = sys_setpgid(pid, pgid); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +pid_t setsid(void) +{ + pid_t ret = sys_setsid(); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +unsigned int sleep(unsigned int seconds) +{ + struct timeval my_timeval = { seconds, 0 }; + + if (sys_select(0, 0, 0, 0, &my_timeval) < 0) + return my_timeval.tv_sec + !!my_timeval.tv_usec; + else + return 0; +} + +static __attribute__((unused)) +int stat(const char *path, struct stat *buf) +{ + int ret = sys_stat(path, buf); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int symlink(const char *old, const char *new) +{ + int ret = sys_symlink(old, new); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int tcsetpgrp(int fd, pid_t pid) +{ + return ioctl(fd, TIOCSPGRP, &pid); +} + +static __attribute__((unused)) +mode_t umask(mode_t mode) +{ + return sys_umask(mode); +} + +static __attribute__((unused)) +int umount2(const char *path, int flags) +{ + int ret = sys_umount2(path, flags); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int unlink(const char *path) +{ + int ret = sys_unlink(path); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +pid_t wait4(pid_t pid, int *status, int options, struct rusage *rusage) +{ + pid_t ret = sys_wait4(pid, status, options, rusage); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +pid_t waitpid(pid_t pid, int *status, int options) +{ + pid_t ret = sys_waitpid(pid, status, options); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +pid_t wait(int *status) +{ + pid_t ret = sys_wait(status); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +ssize_t write(int fd, const void *buf, size_t count) +{ + ssize_t ret = sys_write(fd, buf, count); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +/* some size-optimized reimplementations of a few common str* and mem* + * functions. They're marked static, except memcpy() and raise() which are used + * by libgcc on ARM, so they are marked weak instead in order not to cause an + * error when building a program made of multiple files (not recommended). + */ + +static __attribute__((unused)) +void *memmove(void *dst, const void *src, size_t len) +{ + ssize_t pos = (dst <= src) ? -1 : (long)len; + void *ret = dst; + + while (len--) { + pos += (dst <= src) ? 1 : -1; + ((char *)dst)[pos] = ((char *)src)[pos]; + } + return ret; +} + +static __attribute__((unused)) +void *memset(void *dst, int b, size_t len) +{ + char *p = dst; + + while (len--) + *(p++) = b; + return dst; +} + +static __attribute__((unused)) +int memcmp(const void *s1, const void *s2, size_t n) +{ + size_t ofs = 0; + char c1 = 0; + + while (ofs < n && !(c1 = ((char *)s1)[ofs] - ((char *)s2)[ofs])) { + ofs++; + } + return c1; +} + +static __attribute__((unused)) +char *strcpy(char *dst, const char *src) +{ + char *ret = dst; + + while ((*dst++ = *src++)); + return ret; +} + +static __attribute__((unused)) +char *strchr(const char *s, int c) +{ + while (*s) { + if (*s == (char)c) + return (char *)s; + s++; + } + return NULL; +} + +static __attribute__((unused)) +char *strrchr(const char *s, int c) +{ + const char *ret = NULL; + + while (*s) { + if (*s == (char)c) + ret = s; + s++; + } + return (char *)ret; +} + +static __attribute__((unused)) +size_t nolibc_strlen(const char *str) +{ + size_t len; + + for (len = 0; str[len]; len++); + return len; +} + +#define strlen(str) ({ \ + __builtin_constant_p((str)) ? \ + __builtin_strlen((str)) : \ + nolibc_strlen((str)); \ +}) + +static __attribute__((unused)) +int isdigit(int c) +{ + return (unsigned int)(c - '0') <= 9; +} + +static __attribute__((unused)) +long atol(const char *s) +{ + unsigned long ret = 0; + unsigned long d; + int neg = 0; + + if (*s == '-') { + neg = 1; + s++; + } + + while (1) { + d = (*s++) - '0'; + if (d > 9) + break; + ret *= 10; + ret += d; + } + + return neg ? -ret : ret; +} + +static __attribute__((unused)) +int atoi(const char *s) +{ + return atol(s); +} + +static __attribute__((unused)) +const char *ltoa(long in) +{ + /* large enough for -9223372036854775808 */ + static char buffer[21]; + char *pos = buffer + sizeof(buffer) - 1; + int neg = in < 0; + unsigned long n = neg ? -in : in; + + *pos-- = '\0'; + do { + *pos-- = '0' + n % 10; + n /= 10; + if (pos < buffer) + return pos + 1; + } while (n); + + if (neg) + *pos-- = '-'; + return pos + 1; +} + +__attribute__((weak,unused)) +void *memcpy(void *dst, const void *src, size_t len) +{ + return memmove(dst, src, len); +} + +/* needed by libgcc for divide by zero */ +__attribute__((weak,unused)) +int raise(int signal) +{ + return kill(getpid(), signal); +} + +/* Here come a few helper functions */ + +static __attribute__((unused)) +void FD_ZERO(fd_set *set) +{ + memset(set, 0, sizeof(*set)); +} + +static __attribute__((unused)) +void FD_SET(int fd, fd_set *set) +{ + if (fd < 0 || fd >= FD_SETSIZE) + return; + set->fd32[fd / 32] |= 1 << (fd & 31); +} + +/* WARNING, it only deals with the 4096 first majors and 256 first minors */ +static __attribute__((unused)) +dev_t makedev(unsigned int major, unsigned int minor) +{ + return ((major & 0xfff) << 8) | (minor & 0xff); +} diff --git a/tools/memory-model/.gitignore b/tools/memory-model/.gitignore @@ -0,0 +1 @@ +litmus diff --git a/tools/memory-model/README b/tools/memory-model/README @@ -156,6 +156,8 @@ lock.cat README This file. +scripts Various scripts, see scripts/README. + =========== LIMITATIONS diff --git a/tools/memory-model/linux-kernel.bell b/tools/memory-model/linux-kernel.bell @@ -29,7 +29,8 @@ enum Barriers = 'wmb (*smp_wmb*) || 'sync-rcu (*synchronize_rcu*) || 'before-atomic (*smp_mb__before_atomic*) || 'after-atomic (*smp_mb__after_atomic*) || - 'after-spinlock (*smp_mb__after_spinlock*) + 'after-spinlock (*smp_mb__after_spinlock*) || + 'after-unlock-lock (*smp_mb__after_unlock_lock*) instructions F[Barriers] (* Compute matching pairs of nested Rcu-lock and Rcu-unlock *) diff --git a/tools/memory-model/linux-kernel.cat b/tools/memory-model/linux-kernel.cat @@ -30,7 +30,9 @@ let wmb = [W] ; fencerel(Wmb) ; [W] let mb = ([M] ; fencerel(Mb) ; [M]) | ([M] ; fencerel(Before-atomic) ; [RMW] ; po? ; [M]) | ([M] ; po? ; [RMW] ; fencerel(After-atomic) ; [M]) | - ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) + ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) | + ([M] ; po ; [UL] ; (co | po) ; [LKW] ; + fencerel(After-unlock-lock) ; [M]) let gp = po ; [Sync-rcu] ; po? let strong-fence = mb | gp diff --git a/tools/memory-model/linux-kernel.def b/tools/memory-model/linux-kernel.def @@ -23,6 +23,7 @@ smp_wmb() { __fence{wmb}; } smp_mb__before_atomic() { __fence{before-atomic}; } smp_mb__after_atomic() { __fence{after-atomic}; } smp_mb__after_spinlock() { __fence{after-spinlock}; } +smp_mb__after_unlock_lock() { __fence{after-unlock-lock}; } // Exchange xchg(X,V) __xchg{mb}(X,V) diff --git a/tools/memory-model/scripts/README b/tools/memory-model/scripts/README @@ -0,0 +1,70 @@ + ============ + LKMM SCRIPTS + ============ + + +These scripts are run from the tools/memory-model directory. + +checkalllitmus.sh + + Run all litmus tests in the litmus-tests directory, checking + the results against the expected results recorded in the + "Result:" comment lines. + +checkghlitmus.sh + + Run all litmus tests in the https://github.com/paulmckrcu/litmus + archive that are C-language and that have "Result:" comment lines + documenting expected results, comparing the actual results to + those expected. + +checklitmushist.sh + + Run all litmus tests having .litmus.out files from previous + initlitmushist.sh or newlitmushist.sh runs, comparing the + herd output to that of the original runs. + +checklitmus.sh + + Check a single litmus test against its "Result:" expected result. + +cmplitmushist.sh + + Compare output from two different runs of the same litmus tests, + with the absolute pathnames of the tests to run provided one + name per line on standard input. Not normally run manually, + provided instead for use by other scripts. + +initlitmushist.sh + + Run all litmus tests having no more than the specified number + of processes given a specified timeout, recording the results + in .litmus.out files. + +judgelitmus.sh + + Given a .litmus file and its .litmus.out herd output, check the + .litmus.out file against the .litmus file's "Result:" comment to + judge whether the test ran correctly. Not normally run manually, + provided instead for use by other scripts. + +newlitmushist.sh + + For all new or updated litmus tests having no more than the + specified number of processes given a specified timeout, run + and record the results in .litmus.out files. + +parseargs.sh + + Parse command-line arguments. Not normally run manually, + provided instead for use by other scripts. + +runlitmushist.sh + + Run the litmus tests whose absolute pathnames are provided one + name per line on standard input. Not normally run manually, + provided instead for use by other scripts. + +README + + This file diff --git a/tools/memory-model/scripts/checkalllitmus.sh b/tools/memory-model/scripts/checkalllitmus.sh @@ -1,42 +1,27 @@ #!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ # -# Run herd tests on all .litmus files in the specified directory (which -# defaults to litmus-tests) and check each file's result against a "Result:" -# comment within that litmus test. If the verification result does not -# match that specified in the litmus test, this script prints an error -# message prefixed with "^^^". It also outputs verification results to -# a file whose name is that of the specified litmus test, but with ".out" -# appended. +# Run herd tests on all .litmus files in the litmus-tests directory +# and check each file's result against a "Result:" comment within that +# litmus test. If the verification result does not match that specified +# in the litmus test, this script prints an error message prefixed with +# "^^^". It also outputs verification results to a file whose name is +# that of the specified litmus test, but with ".out" appended. # # Usage: -# checkalllitmus.sh [ directory ] +# checkalllitmus.sh # -# The LINUX_HERD_OPTIONS environment variable may be used to specify -# arguments to herd, whose default is defined by the checklitmus.sh script. -# Thus, one would normally run this in the directory containing the memory -# model, specifying the pathname of the litmus test to check. +# Run this in the directory containing the memory model. # # This script makes no attempt to run the litmus tests concurrently. # -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, you can access it online at -# http://www.gnu.org/licenses/gpl-2.0.html. -# # Copyright IBM Corporation, 2018 # # Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> -litmusdir=${1-litmus-tests} +. scripts/parseargs.sh + +litmusdir=litmus-tests if test -d "$litmusdir" -a -r "$litmusdir" -a -x "$litmusdir" then : @@ -45,6 +30,14 @@ else exit 255 fi +# Create any new directories that have appeared in the github litmus +# repo since the last run. +if test "$LKMM_DESTDIR" != "." +then + find $litmusdir -type d -print | + ( cd "$LKMM_DESTDIR"; sed -e 's/^/mkdir -p /' | sh ) +fi + # Find the checklitmus script. If it is not where we expect it, then # assume that the caller has the PATH environment variable set # appropriately. @@ -57,7 +50,7 @@ fi # Run the script on all the litmus tests in the specified directory ret=0 -for i in litmus-tests/*.litmus +for i in $litmusdir/*.litmus do if ! $clscript $i then @@ -66,8 +59,8 @@ do done if test "$ret" -ne 0 then - echo " ^^^ VERIFICATION MISMATCHES" + echo " ^^^ VERIFICATION MISMATCHES" 1>&2 else - echo All litmus tests verified as was expected. + echo All litmus tests verified as was expected. 1>&2 fi exit $ret diff --git a/tools/memory-model/scripts/checkghlitmus.sh b/tools/memory-model/scripts/checkghlitmus.sh @@ -0,0 +1,65 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# Runs the C-language litmus tests having a maximum number of processes +# to run, defaults to 6. +# +# sh checkghlitmus.sh +# +# Run from the Linux kernel tools/memory-model directory. See the +# parseargs.sh scripts for arguments. + +. scripts/parseargs.sh + +T=/tmp/checkghlitmus.sh.$$ +trap 'rm -rf $T' 0 +mkdir $T + +# Clone the repository if it is not already present. +if test -d litmus +then + : +else + git clone https://github.com/paulmckrcu/litmus + ( cd litmus; git checkout origin/master ) +fi + +# Create any new directories that have appeared in the github litmus +# repo since the last run. +if test "$LKMM_DESTDIR" != "." +then + find litmus -type d -print | + ( cd "$LKMM_DESTDIR"; sed -e 's/^/mkdir -p /' | sh ) +fi + +# Create a list of the C-language litmus tests previously run. +( cd $LKMM_DESTDIR; find litmus -name '*.litmus.out' -print ) | + sed -e 's/\.out$//' | + xargs -r egrep -l '^ \* Result: (Never|Sometimes|Always|DEADLOCK)' | + xargs -r grep -L "^P${LKMM_PROCS}"> $T/list-C-already + +# Create a list of C-language litmus tests with "Result:" commands and +# no more than the specified number of processes. +find litmus -name '*.litmus' -exec grep -l -m 1 "^C " {} \; > $T/list-C +xargs < $T/list-C -r egrep -l '^ \* Result: (Never|Sometimes|Always|DEADLOCK)' > $T/list-C-result +xargs < $T/list-C-result -r grep -L "^P${LKMM_PROCS}" > $T/list-C-result-short + +# Form list of tests without corresponding .litmus.out files +sort $T/list-C-already $T/list-C-result-short | uniq -u > $T/list-C-needed + +# Run any needed tests. +if scripts/runlitmushist.sh < $T/list-C-needed > $T/run.stdout 2> $T/run.stderr +then + errs= +else + errs=1 +fi + +sed < $T/list-C-result-short -e 's,^,scripts/judgelitmus.sh ,' | + sh > $T/judge.stdout 2> $T/judge.stderr + +if test -n "$errs" +then + cat $T/run.stderr 1>&2 +fi +grep '!!!' $T/judge.stdout diff --git a/tools/memory-model/scripts/checklitmus.sh b/tools/memory-model/scripts/checklitmus.sh @@ -1,40 +1,24 @@ #!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ # -# Run a herd test and check the result against a "Result:" comment within -# the litmus test. If the verification result does not match that specified -# in the litmus test, this script prints an error message prefixed with -# "^^^" and exits with a non-zero status. It also outputs verification +# Run a herd test and invokes judgelitmus.sh to check the result against +# a "Result:" comment within the litmus test. It also outputs verification # results to a file whose name is that of the specified litmus test, but # with ".out" appended. # # Usage: # checklitmus.sh file.litmus # -# The LINUX_HERD_OPTIONS environment variable may be used to specify -# arguments to herd, which default to "-conf linux-kernel.cfg". Thus, -# one would normally run this in the directory containing the memory model, -# specifying the pathname of the litmus test to check. -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, you can access it online at -# http://www.gnu.org/licenses/gpl-2.0.html. +# Run this in the directory containing the memory model, specifying the +# pathname of the litmus test to check. The caller is expected to have +# properly set up the LKMM environment variables. # # Copyright IBM Corporation, 2018 # # Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> litmus=$1 -herdoptions=${LINUX_HERD_OPTIONS--conf linux-kernel.cfg} +herdoptions=${LKMM_HERD_OPTIONS--conf linux-kernel.cfg} if test -f "$litmus" -a -r "$litmus" then @@ -43,44 +27,8 @@ else echo ' --- ' error: \"$litmus\" is not a readable file exit 255 fi -if grep -q '^ \* Result: ' $litmus -then - outcome=`grep -m 1 '^ \* Result: ' $litmus | awk '{ print $3 }'` -else - outcome=specified -fi -echo Herd options: $herdoptions > $litmus.out -/usr/bin/time herd7 -o ~/tmp $herdoptions $litmus >> $litmus.out 2>&1 -grep "Herd options:" $litmus.out -grep '^Observation' $litmus.out -if grep -q '^Observation' $litmus.out -then - : -else - cat $litmus.out - echo ' ^^^ Verification error' - echo ' ^^^ Verification error' >> $litmus.out 2>&1 - exit 255 -fi -if test "$outcome" = DEADLOCK -then - echo grep 3 and 4 - if grep '^Observation' $litmus.out | grep -q 'Never 0 0$' - then - ret=0 - else - echo " ^^^ Unexpected non-$outcome verification" - echo " ^^^ Unexpected non-$outcome verification" >> $litmus.out 2>&1 - ret=1 - fi -elif grep '^Observation' $litmus.out | grep -q $outcome || test "$outcome" = Maybe -then - ret=0 -else - echo " ^^^ Unexpected non-$outcome verification" - echo " ^^^ Unexpected non-$outcome verification" >> $litmus.out 2>&1 - ret=1 -fi -tail -2 $litmus.out | head -1 -exit $ret +echo Herd options: $herdoptions > $LKMM_DESTDIR/$litmus.out +/usr/bin/time $LKMM_TIMEOUT_CMD herd7 $herdoptions $litmus >> $LKMM_DESTDIR/$litmus.out 2>&1 + +scripts/judgelitmus.sh $litmus diff --git a/tools/memory-model/scripts/checklitmushist.sh b/tools/memory-model/scripts/checklitmushist.sh @@ -0,0 +1,60 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# Reruns the C-language litmus tests previously run that match the +# specified criteria, and compares the result to that of the previous +# runs from initlitmushist.sh and/or newlitmushist.sh. +# +# sh checklitmushist.sh +# +# Run from the Linux kernel tools/memory-model directory. +# See scripts/parseargs.sh for list of arguments. +# +# Copyright IBM Corporation, 2018 +# +# Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + +. scripts/parseargs.sh + +T=/tmp/checklitmushist.sh.$$ +trap 'rm -rf $T' 0 +mkdir $T + +if test -d litmus +then + : +else + echo Run scripts/initlitmushist.sh first, need litmus repo. + exit 1 +fi + +# Create the results directory and populate it with subdirectories. +# The initial output is created here to avoid clobbering the output +# generated earlier. +mkdir $T/results +find litmus -type d -print | ( cd $T/results; sed -e 's/^/mkdir -p /' | sh ) + +# Create the list of litmus tests already run, then remove those that +# are excluded by this run's --procs argument. +( cd $LKMM_DESTDIR; find litmus -name '*.litmus.out' -print ) | + sed -e 's/\.out$//' | + xargs -r grep -L "^P${LKMM_PROCS}"> $T/list-C-already +xargs < $T/list-C-already -r grep -L "^P${LKMM_PROCS}" > $T/list-C-short + +# Redirect output, run tests, then restore destination directory. +destdir="$LKMM_DESTDIR" +LKMM_DESTDIR=$T/results; export LKMM_DESTDIR +scripts/runlitmushist.sh < $T/list-C-short > $T/runlitmushist.sh.out 2>&1 +LKMM_DESTDIR="$destdir"; export LKMM_DESTDIR + +# Move the newly generated .litmus.out files to .litmus.out.new files +# in the destination directory. +cdir=`pwd` +ddir=`awk -v c="$cdir" -v d="$LKMM_DESTDIR" \ + 'END { if (d ~ /^\//) print d; else print c "/" d; }' < /dev/null` +( cd $T/results; find litmus -type f -name '*.litmus.out' -print | + sed -e 's,^.*$,cp & '"$ddir"'/&.new,' | sh ) + +sed < $T/list-C-short -e 's,^,'"$LKMM_DESTDIR/"',' | + sh scripts/cmplitmushist.sh +exit $? diff --git a/tools/memory-model/scripts/cmplitmushist.sh b/tools/memory-model/scripts/cmplitmushist.sh @@ -0,0 +1,87 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# Compares .out and .out.new files for each name on standard input, +# one full pathname per line. Outputs comparison results followed by +# a summary. +# +# sh cmplitmushist.sh + +T=/tmp/cmplitmushist.sh.$$ +trap 'rm -rf $T' 0 +mkdir $T + +# comparetest oldpath newpath +perfect=0 +obsline=0 +noobsline=0 +obsresult=0 +badcompare=0 +comparetest () { + grep -v 'maxresident)k\|minor)pagefaults\|^Time' $1 > $T/oldout + grep -v 'maxresident)k\|minor)pagefaults\|^Time' $2 > $T/newout + if cmp -s $T/oldout $T/newout && grep -q '^Observation' $1 + then + echo Exact output match: $2 + perfect=`expr "$perfect" + 1` + return 0 + fi + + grep '^Observation' $1 > $T/oldout + grep '^Observation' $2 > $T/newout + if test -s $T/oldout -o -s $T/newout + then + if cmp -s $T/oldout $T/newout + then + echo Matching Observation result and counts: $2 + obsline=`expr "$obsline" + 1` + return 0 + fi + else + echo Missing Observation line "(e.g., herd7 timeout)": $2 + noobsline=`expr "$noobsline" + 1` + return 0 + fi + + grep '^Observation' $1 | awk '{ print $3 }' > $T/oldout + grep '^Observation' $2 | awk '{ print $3 }' > $T/newout + if cmp -s $T/oldout $T/newout + then + echo Matching Observation Always/Sometimes/Never result: $2 + obsresult=`expr "$obsresult" + 1` + return 0 + fi + echo ' !!!' Result changed: $2 + badcompare=`expr "$badcompare" + 1` + return 1 +} + +sed -e 's/^.*$/comparetest &.out &.out.new/' > $T/cmpscript +. $T/cmpscript > $T/cmpscript.out +cat $T/cmpscript.out + +echo ' ---' Summary: 1>&2 +grep '!!!' $T/cmpscript.out 1>&2 +if test "$perfect" -ne 0 +then + echo Exact output matches: $perfect 1>&2 +fi +if test "$obsline" -ne 0 +then + echo Matching Observation result and counts: $obsline 1>&2 +fi +if test "$noobsline" -ne 0 +then + echo Missing Observation line "(e.g., herd7 timeout)": $noobsline 1>&2 +fi +if test "$obsresult" -ne 0 +then + echo Matching Observation Always/Sometimes/Never result: $obsresult 1>&2 +fi +if test "$badcompare" -ne 0 +then + echo "!!!" Result changed: $badcompare 1>&2 + exit 1 +fi + +exit 0 diff --git a/tools/memory-model/scripts/initlitmushist.sh b/tools/memory-model/scripts/initlitmushist.sh @@ -0,0 +1,68 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# Runs the C-language litmus tests matching the specified criteria. +# Generates the output for each .litmus file into a corresponding +# .litmus.out file, and does not judge the result. +# +# sh initlitmushist.sh +# +# Run from the Linux kernel tools/memory-model directory. +# See scripts/parseargs.sh for list of arguments. +# +# This script can consume significant wallclock time and CPU, especially as +# the value of --procs rises. On a four-core (eight hardware threads) +# 2.5GHz x86 with a one-minute per-run timeout: +# +# --procs wallclock CPU timeouts tests +# 1 0m11.241s 0m1.086s 0 19 +# 2 1m12.598s 2m8.459s 2 393 +# 3 1m30.007s 6m2.479s 4 2291 +# 4 3m26.042s 18m5.139s 9 3217 +# 5 4m26.661s 23m54.128s 13 3784 +# 6 4m41.900s 26m4.721s 13 4352 +# 7 5m51.463s 35m50.868s 13 4626 +# 8 10m5.235s 68m43.672s 34 5117 +# 9 15m57.80s 105m58.101s 69 5156 +# 10 16m14.13s 103m35.009s 69 5165 +# 20 27m48.55s 198m3.286s 156 5269 +# +# Increasing the timeout on the 20-process run to five minutes increases +# the runtime to about 90 minutes with the CPU time rising to about +# 10 hours. On the other hand, it decreases the number of timeouts to 101. +# +# Note that there are historical tests for which herd7 will fail +# completely, for example, litmus/manual/atomic/C-unlock-wait-00.litmus +# contains a call to spin_unlock_wait(), which no longer exists in either +# the kernel or LKMM. + +. scripts/parseargs.sh + +T=/tmp/initlitmushist.sh.$$ +trap 'rm -rf $T' 0 +mkdir $T + +if test -d litmus +then + : +else + git clone https://github.com/paulmckrcu/litmus + ( cd litmus; git checkout origin/master ) +fi + +# Create any new directories that have appeared in the github litmus +# repo since the last run. +if test "$LKMM_DESTDIR" != "." +then + find litmus -type d -print | + ( cd "$LKMM_DESTDIR"; sed -e 's/^/mkdir -p /' | sh ) +fi + +# Create a list of the C-language litmus tests with no more than the +# specified number of processes (per the --procs argument). +find litmus -name '*.litmus' -exec grep -l -m 1 "^C " {} \; > $T/list-C +xargs < $T/list-C -r grep -L "^P${LKMM_PROCS}" > $T/list-C-short + +scripts/runlitmushist.sh < $T/list-C-short + +exit 0 diff --git a/tools/memory-model/scripts/judgelitmus.sh b/tools/memory-model/scripts/judgelitmus.sh @@ -0,0 +1,78 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# Given a .litmus test and the corresponding .litmus.out file, check +# the .litmus.out file against the "Result:" comment to judge whether +# the test ran correctly. +# +# Usage: +# judgelitmus.sh file.litmus +# +# Run this in the directory containing the memory model, specifying the +# pathname of the litmus test to check. +# +# Copyright IBM Corporation, 2018 +# +# Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + +litmus=$1 + +if test -f "$litmus" -a -r "$litmus" +then + : +else + echo ' --- ' error: \"$litmus\" is not a readable file + exit 255 +fi +if test -f "$LKMM_DESTDIR/$litmus".out -a -r "$LKMM_DESTDIR/$litmus".out +then + : +else + echo ' --- ' error: \"$LKMM_DESTDIR/$litmus\".out is not a readable file + exit 255 +fi +if grep -q '^ \* Result: ' $litmus +then + outcome=`grep -m 1 '^ \* Result: ' $litmus | awk '{ print $3 }'` +else + outcome=specified +fi + +grep '^Observation' $LKMM_DESTDIR/$litmus.out +if grep -q '^Observation' $LKMM_DESTDIR/$litmus.out +then + : +else + echo ' !!! Verification error' $litmus + if ! grep -q '!!!' $LKMM_DESTDIR/$litmus.out + then + echo ' !!! Verification error' >> $LKMM_DESTDIR/$litmus.out 2>&1 + fi + exit 255 +fi +if test "$outcome" = DEADLOCK +then + if grep '^Observation' $LKMM_DESTDIR/$litmus.out | grep -q 'Never 0 0$' + then + ret=0 + else + echo " !!! Unexpected non-$outcome verification" $litmus + if ! grep -q '!!!' $LKMM_DESTDIR/$litmus.out + then + echo " !!! Unexpected non-$outcome verification" >> $LKMM_DESTDIR/$litmus.out 2>&1 + fi + ret=1 + fi +elif grep '^Observation' $LKMM_DESTDIR/$litmus.out | grep -q $outcome || test "$outcome" = Maybe +then + ret=0 +else + echo " !!! Unexpected non-$outcome verification" $litmus + if ! grep -q '!!!' $LKMM_DESTDIR/$litmus.out + then + echo " !!! Unexpected non-$outcome verification" >> $LKMM_DESTDIR/$litmus.out 2>&1 + fi + ret=1 +fi +tail -2 $LKMM_DESTDIR/$litmus.out | head -1 +exit $ret diff --git a/tools/memory-model/scripts/newlitmushist.sh b/tools/memory-model/scripts/newlitmushist.sh @@ -0,0 +1,61 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# Runs the C-language litmus tests matching the specified criteria +# that do not already have a corresponding .litmus.out file, and does +# not judge the result. +# +# sh newlitmushist.sh +# +# Run from the Linux kernel tools/memory-model directory. +# See scripts/parseargs.sh for list of arguments. +# +# Copyright IBM Corporation, 2018 +# +# Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + +. scripts/parseargs.sh + +T=/tmp/newlitmushist.sh.$$ +trap 'rm -rf $T' 0 +mkdir $T + +if test -d litmus +then + : +else + echo Run scripts/initlitmushist.sh first, need litmus repo. + exit 1 +fi + +# Create any new directories that have appeared in the github litmus +# repo since the last run. +if test "$LKMM_DESTDIR" != "." +then + find litmus -type d -print | + ( cd "$LKMM_DESTDIR"; sed -e 's/^/mkdir -p /' | sh ) +fi + +# Create a list of the C-language litmus tests previously run. +( cd $LKMM_DESTDIR; find litmus -name '*.litmus.out' -print ) | + sed -e 's/\.out$//' | + xargs -r grep -L "^P${LKMM_PROCS}"> $T/list-C-already + +# Form full list of litmus tests with no more than the specified +# number of processes (per the --procs argument). +find litmus -name '*.litmus' -exec grep -l -m 1 "^C " {} \; > $T/list-C-all +xargs < $T/list-C-all -r grep -L "^P${LKMM_PROCS}" > $T/list-C-short + +# Form list of new tests. Note: This does not handle litmus-test deletion! +sort $T/list-C-already $T/list-C-short | uniq -u > $T/list-C-new + +# Form list of litmus tests that have changed since the last run. +sed < $T/list-C-short -e 's,^.*$,if test & -nt '"$LKMM_DESTDIR"'/&.out; then echo &; fi,' > $T/list-C-script +sh $T/list-C-script > $T/list-C-newer + +# Merge the list of new and of updated litmus tests: These must be (re)run. +sort -u $T/list-C-new $T/list-C-newer > $T/list-C-needed + +scripts/runlitmushist.sh < $T/list-C-needed + +exit 0 diff --git a/tools/memory-model/scripts/parseargs.sh b/tools/memory-model/scripts/parseargs.sh @@ -0,0 +1,136 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# the corresponding .litmus.out file, and does not judge the result. +# +# . scripts/parseargs.sh +# +# Include into other Linux kernel tools/memory-model scripts. +# +# Copyright IBM Corporation, 2018 +# +# Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + +T=/tmp/parseargs.sh.$$ +mkdir $T + +# Initialize one parameter: initparam name default +initparam () { + echo if test -z '"$'$1'"' > $T/s + echo then >> $T/s + echo $1='"'$2'"' >> $T/s + echo export $1 >> $T/s + echo fi >> $T/s + echo $1_DEF='$'$1 >> $T/s + . $T/s +} + +initparam LKMM_DESTDIR "." +initparam LKMM_HERD_OPTIONS "-conf linux-kernel.cfg" +initparam LKMM_JOBS `getconf _NPROCESSORS_ONLN` +initparam LKMM_PROCS "3" +initparam LKMM_TIMEOUT "1m" + +scriptname=$0 + +usagehelp () { + echo "Usage $scriptname [ arguments ]" + echo " --destdir path (place for .litmus.out, default by .litmus)" + echo " --herdopts -conf linux-kernel.cfg ..." + echo " --jobs N (number of jobs, default one per CPU)" + echo " --procs N (litmus tests with at most this many processes)" + echo " --timeout N (herd7 timeout (e.g., 10s, 1m, 2hr, 1d, '')" + echo "Defaults: --destdir '$LKMM_DESTDIR_DEF' --herdopts '$LKMM_HERD_OPTIONS_DEF' --jobs '$LKMM_JOBS_DEF' --procs '$LKMM_PROCS_DEF' --timeout '$LKMM_TIMEOUT_DEF'" + exit 1 +} + +usage () { + usagehelp 1>&2 +} + +# checkarg --argname argtype $# arg mustmatch cannotmatch +checkarg () { + if test $3 -le 1 + then + echo $1 needs argument $2 matching \"$5\" + usage + fi + if echo "$4" | grep -q -e "$5" + then + : + else + echo $1 $2 \"$4\" must match \"$5\" + usage + fi + if echo "$4" | grep -q -e "$6" + then + echo $1 $2 \"$4\" must not match \"$6\" + usage + fi +} + +while test $# -gt 0 +do + case "$1" in + --destdir) + checkarg --destdir "(path to directory)" "$#" "$2" '.\+' '^--' + LKMM_DESTDIR="$2" + mkdir $LKMM_DESTDIR > /dev/null 2>&1 + if ! test -e "$LKMM_DESTDIR" + then + echo "Cannot create directory --destdir '$LKMM_DESTDIR'" + usage + fi + if test -d "$LKMM_DESTDIR" -a -w "$LKMM_DESTDIR" -a -x "$LKMM_DESTDIR" + then + : + else + echo "Directory --destdir '$LKMM_DESTDIR' insufficient permissions to create files" + usage + fi + shift + ;; + --herdopts|--herdopt) + checkarg --destdir "(herd options)" "$#" "$2" '.*' '^--' + LKMM_HERD_OPTIONS="$2" + shift + ;; + -j[1-9]*) + njobs="`echo $1 | sed -e 's/^-j//'`" + trailchars="`echo $njobs | sed -e 's/[0-9]\+\(.*\)$/\1/'`" + if test -n "$trailchars" + then + echo $1 trailing characters "'$trailchars'" + usagehelp + fi + LKMM_JOBS="`echo $njobs | sed -e 's/^\([0-9]\+\).*$/\1/'`" + ;; + --jobs|--job|-j) + checkarg --jobs "(number)" "$#" "$2" '^[1-9][0-9]\+$' '^--' + LKMM_JOBS="$2" + shift + ;; + --procs|--proc) + checkarg --procs "(number)" "$#" "$2" '^[0-9]\+$' '^--' + LKMM_PROCS="$2" + shift + ;; + --timeout) + checkarg --timeout "(timeout spec)" "$#" "$2" '^\([0-9]\+[smhd]\?\|\)$' '^--' + LKMM_TIMEOUT="$2" + shift + ;; + *) + echo Unknown argument $1 + usage + ;; + esac + shift +done +if test -z "$LKMM_TIMEOUT" +then + LKMM_TIMEOUT_CMD=""; export LKMM_TIMEOUT_CMD +else + LKMM_TIMEOUT_CMD="timeout $LKMM_TIMEOUT"; export LKMM_TIMEOUT_CMD +fi +rm -rf $T diff --git a/tools/memory-model/scripts/runlitmushist.sh b/tools/memory-model/scripts/runlitmushist.sh @@ -0,0 +1,87 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Runs the C-language litmus tests specified on standard input, using up +# to the specified number of CPUs (defaulting to all of them) and placing +# the results in the specified directory (defaulting to the same place +# the litmus test came from). +# +# sh runlitmushist.sh +# +# Run from the Linux kernel tools/memory-model directory. +# This script uses environment variables produced by parseargs.sh. +# +# Copyright IBM Corporation, 2018 +# +# Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + +T=/tmp/runlitmushist.sh.$$ +trap 'rm -rf $T' 0 +mkdir $T + +if test -d litmus +then + : +else + echo Directory \"litmus\" missing, aborting run. + exit 1 +fi + +# Prefixes for per-CPU scripts +for ((i=0;i<$LKMM_JOBS;i++)) +do + echo dir="$LKMM_DESTDIR" > $T/$i.sh + echo T=$T >> $T/$i.sh + echo herdoptions=\"$LKMM_HERD_OPTIONS\" >> $T/$i.sh + cat << '___EOF___' >> $T/$i.sh + runtest () { + echo ' ... ' /usr/bin/time $LKMM_TIMEOUT_CMD herd7 $herdoptions $1 '>' $dir/$1.out '2>&1' + if /usr/bin/time $LKMM_TIMEOUT_CMD herd7 $herdoptions $1 > $dir/$1.out 2>&1 + then + if ! grep -q '^Observation ' $dir/$1.out + then + echo ' !!! Herd failed, no Observation:' $1 + fi + else + exitcode=$? + if test "$exitcode" -eq 124 + then + exitmsg="timed out" + else + exitmsg="failed, exit code $exitcode" + fi + echo ' !!! Herd' ${exitmsg}: $1 + fi + } +___EOF___ +done + +awk -v q="'" -v b='\\' ' +{ + print "echo `grep " q "^P[0-9]" b "+(" q " " $0 " | tail -1 | sed -e " q "s/^P" b "([0-9]" b "+" b ")(.*$/" b "1/" q "` " $0 +}' | bash | +sort -k1n | +awk -v ncpu=$LKMM_JOBS -v t=$T ' +{ + print "runtest " $2 >> t "/" NR % ncpu ".sh"; +} + +END { + for (i = 0; i < ncpu; i++) { + print "sh " t "/" i ".sh > " t "/" i ".sh.out 2>&1 &"; + close(t "/" i ".sh"); + } + print "wait"; +}' | sh +cat $T/*.sh.out +if grep -q '!!!' $T/*.sh.out +then + echo ' ---' Summary: 1>&2 + grep '!!!' $T/*.sh.out 1>&2 + nfail="`grep '!!!' $T/*.sh.out | wc -l`" + echo 'Number of failed herd runs (e.g., timeout): ' $nfail 1>&2 + exit 1 +else + echo All runs completed successfully. 1>&2 + exit 0 +fi diff --git a/tools/testing/selftests/rcutorture/bin/mkinitrd.sh b/tools/testing/selftests/rcutorture/bin/mkinitrd.sh @@ -40,17 +40,24 @@ mkdir $T cat > $T/init << '__EOF___' #!/bin/sh # Run in userspace a few milliseconds every second. This helps to -# exercise the NO_HZ_FULL portions of RCU. +# exercise the NO_HZ_FULL portions of RCU. The 192 instances of "a" was +# empirically shown to give a nice multi-millisecond burst of user-mode +# execution on a 2GHz CPU, as desired. Modern CPUs will vary from a +# couple of milliseconds up to perhaps 100 milliseconds, which is an +# acceptable range. +# +# Why not calibrate an exact delay? Because within this initrd, we +# are restricted to Bourne-shell builtins, which as far as I know do not +# provide any means of obtaining a fine-grained timestamp. + +a4="a a a a" +a16="$a4 $a4 $a4 $a4" +a64="$a16 $a16 $a16 $a16" +a192="$a64 $a64 $a64" while : do q= - for i in \ - a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a \ - a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a \ - a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a \ - a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a \ - a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a \ - a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + for i in $a192 do q="$q $i" done @@ -124,8 +131,8 @@ if echo -e "#if __x86_64__||__i386__||__i486__||__i586__||__i686__" \ | grep -q '^yes'; then # architecture supported by nolibc ${CROSS_COMPILE}gcc -fno-asynchronous-unwind-tables -fno-ident \ - -nostdlib -include ../bin/nolibc.h -lgcc -s -static -Os \ - -o init init.c + -nostdlib -include ../../../../include/nolibc/nolibc.h \ + -lgcc -s -static -Os -o init init.c else ${CROSS_COMPILE}gcc -s -static -Os -o init init.c fi diff --git a/tools/testing/selftests/rcutorture/bin/nolibc.h b/tools/testing/selftests/rcutorture/bin/nolibc.h @@ -1,2197 +0,0 @@ -/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ -/* nolibc.h - * Copyright (C) 2017-2018 Willy Tarreau <w@1wt.eu> - */ - -/* some archs (at least aarch64) don't expose the regular syscalls anymore by - * default, either because they have an "_at" replacement, or because there are - * more modern alternatives. For now we'd rather still use them. - */ -#define __ARCH_WANT_SYSCALL_NO_AT -#define __ARCH_WANT_SYSCALL_NO_FLAGS -#define __ARCH_WANT_SYSCALL_DEPRECATED - -#include <asm/unistd.h> -#include <asm/ioctls.h> -#include <asm/errno.h> -#include <linux/fs.h> -#include <linux/loop.h> - -#define NOLIBC - -/* Build a static executable this way : - * $ gcc -fno-asynchronous-unwind-tables -fno-ident -s -Os -nostdlib \ - * -static -include nolibc.h -lgcc -o hello hello.c - * - * Useful calling convention table found here : - * http://man7.org/linux/man-pages/man2/syscall.2.html - * - * This doc is even better : - * https://w3challs.com/syscalls/ - */ - - -/* this way it will be removed if unused */ -static int errno; - -#ifndef NOLIBC_IGNORE_ERRNO -#define SET_ERRNO(v) do { errno = (v); } while (0) -#else -#define SET_ERRNO(v) do { } while (0) -#endif - -/* errno codes all ensure that they will not conflict with a valid pointer - * because they all correspond to the highest addressable memry page. - */ -#define MAX_ERRNO 4095 - -/* Declare a few quite common macros and types that usually are in stdlib.h, - * stdint.h, ctype.h, unistd.h and a few other common locations. - */ - -#define NULL ((void *)0) - -/* stdint types */ -typedef unsigned char uint8_t; -typedef signed char int8_t; -typedef unsigned short uint16_t; -typedef signed short int16_t; -typedef unsigned int uint32_t; -typedef signed int int32_t; -typedef unsigned long long uint64_t; -typedef signed long long int64_t; -typedef unsigned long size_t; -typedef signed long ssize_t; -typedef unsigned long uintptr_t; -typedef signed long intptr_t; -typedef signed long ptrdiff_t; - -/* for stat() */ -typedef unsigned int dev_t; -typedef unsigned long ino_t; -typedef unsigned int mode_t; -typedef signed int pid_t; -typedef unsigned int uid_t; -typedef unsigned int gid_t; -typedef unsigned long nlink_t; -typedef signed long off_t; -typedef signed long blksize_t; -typedef signed long blkcnt_t; -typedef signed long time_t; - -/* for poll() */ -struct pollfd { - int fd; - short int events; - short int revents; -}; - -/* for select() */ -struct timeval { - long tv_sec; - long tv_usec; -}; - -/* for pselect() */ -struct timespec { - long tv_sec; - long tv_nsec; -}; - -/* for gettimeofday() */ -struct timezone { - int tz_minuteswest; - int tz_dsttime; -}; - -/* for getdents64() */ -struct linux_dirent64 { - uint64_t d_ino; - int64_t d_off; - unsigned short d_reclen; - unsigned char d_type; - char d_name[]; -}; - -/* commonly an fd_set represents 256 FDs */ -#define FD_SETSIZE 256 -typedef struct { uint32_t fd32[FD_SETSIZE/32]; } fd_set; - -/* needed by wait4() */ -struct rusage { - struct timeval ru_utime; - struct timeval ru_stime; - long ru_maxrss; - long ru_ixrss; - long ru_idrss; - long ru_isrss; - long ru_minflt; - long ru_majflt; - long ru_nswap; - long ru_inblock; - long ru_oublock; - long ru_msgsnd; - long ru_msgrcv; - long ru_nsignals; - long ru_nvcsw; - long ru_nivcsw; -}; - -/* stat flags (WARNING, octal here) */ -#define S_IFDIR 0040000 -#define S_IFCHR 0020000 -#define S_IFBLK 0060000 -#define S_IFREG 0100000 -#define S_IFIFO 0010000 -#define S_IFLNK 0120000 -#define S_IFSOCK 0140000 -#define S_IFMT 0170000 - -#define S_ISDIR(mode) (((mode) & S_IFDIR) == S_IFDIR) -#define S_ISCHR(mode) (((mode) & S_IFCHR) == S_IFCHR) -#define S_ISBLK(mode) (((mode) & S_IFBLK) == S_IFBLK) -#define S_ISREG(mode) (((mode) & S_IFREG) == S_IFREG) -#define S_ISFIFO(mode) (((mode) & S_IFIFO) == S_IFIFO) -#define S_ISLNK(mode) (((mode) & S_IFLNK) == S_IFLNK) -#define S_ISSOCK(mode) (((mode) & S_IFSOCK) == S_IFSOCK) - -#define DT_UNKNOWN 0 -#define DT_FIFO 1 -#define DT_CHR 2 -#define DT_DIR 4 -#define DT_BLK 6 -#define DT_REG 8 -#define DT_LNK 10 -#define DT_SOCK 12 - -/* all the *at functions */ -#ifndef AT_FDWCD -#define AT_FDCWD -100 -#endif - -/* lseek */ -#define SEEK_SET 0 -#define SEEK_CUR 1 -#define SEEK_END 2 - -/* reboot */ -#define LINUX_REBOOT_MAGIC1 0xfee1dead -#define LINUX_REBOOT_MAGIC2 0x28121969 -#define LINUX_REBOOT_CMD_HALT 0xcdef0123 -#define LINUX_REBOOT_CMD_POWER_OFF 0x4321fedc -#define LINUX_REBOOT_CMD_RESTART 0x01234567 -#define LINUX_REBOOT_CMD_SW_SUSPEND 0xd000fce2 - - -/* The format of the struct as returned by the libc to the application, which - * significantly differs from the format returned by the stat() syscall flavours. - */ -struct stat { - dev_t st_dev; /* ID of device containing file */ - ino_t st_ino; /* inode number */ - mode_t st_mode; /* protection */ - nlink_t st_nlink; /* number of hard links */ - uid_t st_uid; /* user ID of owner */ - gid_t st_gid; /* group ID of owner */ - dev_t st_rdev; /* device ID (if special file) */ - off_t st_size; /* total size, in bytes */ - blksize_t st_blksize; /* blocksize for file system I/O */ - blkcnt_t st_blocks; /* number of 512B blocks allocated */ - time_t st_atime; /* time of last access */ - time_t st_mtime; /* time of last modification */ - time_t st_ctime; /* time of last status change */ -}; - -#define WEXITSTATUS(status) (((status) & 0xff00) >> 8) -#define WIFEXITED(status) (((status) & 0x7f) == 0) - - -/* Below comes the architecture-specific code. For each architecture, we have - * the syscall declarations and the _start code definition. This is the only - * global part. On all architectures the kernel puts everything in the stack - * before jumping to _start just above us, without any return address (_start - * is not a function but an entry pint). So at the stack pointer we find argc. - * Then argv[] begins, and ends at the first NULL. Then we have envp which - * starts and ends with a NULL as well. So envp=argv+argc+1. - */ - -#if defined(__x86_64__) -/* Syscalls for x86_64 : - * - registers are 64-bit - * - syscall number is passed in rax - * - arguments are in rdi, rsi, rdx, r10, r8, r9 respectively - * - the system call is performed by calling the syscall instruction - * - syscall return comes in rax - * - rcx and r8..r11 may be clobbered, others are preserved. - * - the arguments are cast to long and assigned into the target registers - * which are then simply passed as registers to the asm code, so that we - * don't have to experience issues with register constraints. - * - the syscall number is always specified last in order to allow to force - * some registers before (gcc refuses a %-register at the last position). - */ - -#define my_syscall0(num) \ -({ \ - long _ret; \ - register long _num asm("rax") = (num); \ - \ - asm volatile ( \ - "syscall\n" \ - : "=a" (_ret) \ - : "0"(_num) \ - : "rcx", "r8", "r9", "r10", "r11", "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall1(num, arg1) \ -({ \ - long _ret; \ - register long _num asm("rax") = (num); \ - register long _arg1 asm("rdi") = (long)(arg1); \ - \ - asm volatile ( \ - "syscall\n" \ - : "=a" (_ret) \ - : "r"(_arg1), \ - "0"(_num) \ - : "rcx", "r8", "r9", "r10", "r11", "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall2(num, arg1, arg2) \ -({ \ - long _ret; \ - register long _num asm("rax") = (num); \ - register long _arg1 asm("rdi") = (long)(arg1); \ - register long _arg2 asm("rsi") = (long)(arg2); \ - \ - asm volatile ( \ - "syscall\n" \ - : "=a" (_ret) \ - : "r"(_arg1), "r"(_arg2), \ - "0"(_num) \ - : "rcx", "r8", "r9", "r10", "r11", "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall3(num, arg1, arg2, arg3) \ -({ \ - long _ret; \ - register long _num asm("rax") = (num); \ - register long _arg1 asm("rdi") = (long)(arg1); \ - register long _arg2 asm("rsi") = (long)(arg2); \ - register long _arg3 asm("rdx") = (long)(arg3); \ - \ - asm volatile ( \ - "syscall\n" \ - : "=a" (_ret) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ - "0"(_num) \ - : "rcx", "r8", "r9", "r10", "r11", "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall4(num, arg1, arg2, arg3, arg4) \ -({ \ - long _ret; \ - register long _num asm("rax") = (num); \ - register long _arg1 asm("rdi") = (long)(arg1); \ - register long _arg2 asm("rsi") = (long)(arg2); \ - register long _arg3 asm("rdx") = (long)(arg3); \ - register long _arg4 asm("r10") = (long)(arg4); \ - \ - asm volatile ( \ - "syscall\n" \ - : "=a" (_ret), "=r"(_arg4) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ - "0"(_num) \ - : "rcx", "r8", "r9", "r11", "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ -({ \ - long _ret; \ - register long _num asm("rax") = (num); \ - register long _arg1 asm("rdi") = (long)(arg1); \ - register long _arg2 asm("rsi") = (long)(arg2); \ - register long _arg3 asm("rdx") = (long)(arg3); \ - register long _arg4 asm("r10") = (long)(arg4); \ - register long _arg5 asm("r8") = (long)(arg5); \ - \ - asm volatile ( \ - "syscall\n" \ - : "=a" (_ret), "=r"(_arg4), "=r"(_arg5) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ - "0"(_num) \ - : "rcx", "r9", "r11", "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \ -({ \ - long _ret; \ - register long _num asm("rax") = (num); \ - register long _arg1 asm("rdi") = (long)(arg1); \ - register long _arg2 asm("rsi") = (long)(arg2); \ - register long _arg3 asm("rdx") = (long)(arg3); \ - register long _arg4 asm("r10") = (long)(arg4); \ - register long _arg5 asm("r8") = (long)(arg5); \ - register long _arg6 asm("r9") = (long)(arg6); \ - \ - asm volatile ( \ - "syscall\n" \ - : "=a" (_ret), "=r"(_arg4), "=r"(_arg5) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ - "r"(_arg6), "0"(_num) \ - : "rcx", "r11", "memory", "cc" \ - ); \ - _ret; \ -}) - -/* startup code */ -asm(".section .text\n" - ".global _start\n" - "_start:\n" - "pop %rdi\n" // argc (first arg, %rdi) - "mov %rsp, %rsi\n" // argv[] (second arg, %rsi) - "lea 8(%rsi,%rdi,8),%rdx\n" // then a NULL then envp (third arg, %rdx) - "and $-16, %rsp\n" // x86 ABI : esp must be 16-byte aligned when - "sub $8, %rsp\n" // entering the callee - "call main\n" // main() returns the status code, we'll exit with it. - "movzb %al, %rdi\n" // retrieve exit code from 8 lower bits - "mov $60, %rax\n" // NR_exit == 60 - "syscall\n" // really exit - "hlt\n" // ensure it does not return - ""); - -/* fcntl / open */ -#define O_RDONLY 0 -#define O_WRONLY 1 -#define O_RDWR 2 -#define O_CREAT 0x40 -#define O_EXCL 0x80 -#define O_NOCTTY 0x100 -#define O_TRUNC 0x200 -#define O_APPEND 0x400 -#define O_NONBLOCK 0x800 -#define O_DIRECTORY 0x10000 - -/* The struct returned by the stat() syscall, equivalent to stat64(). The - * syscall returns 116 bytes and stops in the middle of __unused. - */ -struct sys_stat_struct { - unsigned long st_dev; - unsigned long st_ino; - unsigned long st_nlink; - unsigned int st_mode; - unsigned int st_uid; - - unsigned int st_gid; - unsigned int __pad0; - unsigned long st_rdev; - long st_size; - long st_blksize; - - long st_blocks; - unsigned long st_atime; - unsigned long st_atime_nsec; - unsigned long st_mtime; - - unsigned long st_mtime_nsec; - unsigned long st_ctime; - unsigned long st_ctime_nsec; - long __unused[3]; -}; - -#elif defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) -/* Syscalls for i386 : - * - mostly similar to x86_64 - * - registers are 32-bit - * - syscall number is passed in eax - * - arguments are in ebx, ecx, edx, esi, edi, ebp respectively - * - all registers are preserved (except eax of course) - * - the system call is performed by calling int $0x80 - * - syscall return comes in eax - * - the arguments are cast to long and assigned into the target registers - * which are then simply passed as registers to the asm code, so that we - * don't have to experience issues with register constraints. - * - the syscall number is always specified last in order to allow to force - * some registers before (gcc refuses a %-register at the last position). - * - * Also, i386 supports the old_select syscall if newselect is not available - */ -#define __ARCH_WANT_SYS_OLD_SELECT - -#define my_syscall0(num) \ -({ \ - long _ret; \ - register long _num asm("eax") = (num); \ - \ - asm volatile ( \ - "int $0x80\n" \ - : "=a" (_ret) \ - : "0"(_num) \ - : "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall1(num, arg1) \ -({ \ - long _ret; \ - register long _num asm("eax") = (num); \ - register long _arg1 asm("ebx") = (long)(arg1); \ - \ - asm volatile ( \ - "int $0x80\n" \ - : "=a" (_ret) \ - : "r"(_arg1), \ - "0"(_num) \ - : "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall2(num, arg1, arg2) \ -({ \ - long _ret; \ - register long _num asm("eax") = (num); \ - register long _arg1 asm("ebx") = (long)(arg1); \ - register long _arg2 asm("ecx") = (long)(arg2); \ - \ - asm volatile ( \ - "int $0x80\n" \ - : "=a" (_ret) \ - : "r"(_arg1), "r"(_arg2), \ - "0"(_num) \ - : "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall3(num, arg1, arg2, arg3) \ -({ \ - long _ret; \ - register long _num asm("eax") = (num); \ - register long _arg1 asm("ebx") = (long)(arg1); \ - register long _arg2 asm("ecx") = (long)(arg2); \ - register long _arg3 asm("edx") = (long)(arg3); \ - \ - asm volatile ( \ - "int $0x80\n" \ - : "=a" (_ret) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ - "0"(_num) \ - : "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall4(num, arg1, arg2, arg3, arg4) \ -({ \ - long _ret; \ - register long _num asm("eax") = (num); \ - register long _arg1 asm("ebx") = (long)(arg1); \ - register long _arg2 asm("ecx") = (long)(arg2); \ - register long _arg3 asm("edx") = (long)(arg3); \ - register long _arg4 asm("esi") = (long)(arg4); \ - \ - asm volatile ( \ - "int $0x80\n" \ - : "=a" (_ret) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ - "0"(_num) \ - : "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ -({ \ - long _ret; \ - register long _num asm("eax") = (num); \ - register long _arg1 asm("ebx") = (long)(arg1); \ - register long _arg2 asm("ecx") = (long)(arg2); \ - register long _arg3 asm("edx") = (long)(arg3); \ - register long _arg4 asm("esi") = (long)(arg4); \ - register long _arg5 asm("edi") = (long)(arg5); \ - \ - asm volatile ( \ - "int $0x80\n" \ - : "=a" (_ret) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ - "0"(_num) \ - : "memory", "cc" \ - ); \ - _ret; \ -}) - -/* startup code */ -asm(".section .text\n" - ".global _start\n" - "_start:\n" - "pop %eax\n" // argc (first arg, %eax) - "mov %esp, %ebx\n" // argv[] (second arg, %ebx) - "lea 4(%ebx,%eax,4),%ecx\n" // then a NULL then envp (third arg, %ecx) - "and $-16, %esp\n" // x86 ABI : esp must be 16-byte aligned when - "push %ecx\n" // push all registers on the stack so that we - "push %ebx\n" // support both regparm and plain stack modes - "push %eax\n" - "call main\n" // main() returns the status code in %eax - "movzbl %al, %ebx\n" // retrieve exit code from lower 8 bits - "movl $1, %eax\n" // NR_exit == 1 - "int $0x80\n" // exit now - "hlt\n" // ensure it does not - ""); - -/* fcntl / open */ -#define O_RDONLY 0 -#define O_WRONLY 1 -#define O_RDWR 2 -#define O_CREAT 0x40 -#define O_EXCL 0x80 -#define O_NOCTTY 0x100 -#define O_TRUNC 0x200 -#define O_APPEND 0x400 -#define O_NONBLOCK 0x800 -#define O_DIRECTORY 0x10000 - -/* The struct returned by the stat() syscall, 32-bit only, the syscall returns - * exactly 56 bytes (stops before the unused array). - */ -struct sys_stat_struct { - unsigned long st_dev; - unsigned long st_ino; - unsigned short st_mode; - unsigned short st_nlink; - unsigned short st_uid; - unsigned short st_gid; - - unsigned long st_rdev; - unsigned long st_size; - unsigned long st_blksize; - unsigned long st_blocks; - - unsigned long st_atime; - unsigned long st_atime_nsec; - unsigned long st_mtime; - unsigned long st_mtime_nsec; - - unsigned long st_ctime; - unsigned long st_ctime_nsec; - unsigned long __unused[2]; -}; - -#elif defined(__ARM_EABI__) -/* Syscalls for ARM in ARM or Thumb modes : - * - registers are 32-bit - * - stack is 8-byte aligned - * ( http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.faqs/ka4127.html) - * - syscall number is passed in r7 - * - arguments are in r0, r1, r2, r3, r4, r5 - * - the system call is performed by calling svc #0 - * - syscall return comes in r0. - * - only lr is clobbered. - * - the arguments are cast to long and assigned into the target registers - * which are then simply passed as registers to the asm code, so that we - * don't have to experience issues with register constraints. - * - the syscall number is always specified last in order to allow to force - * some registers before (gcc refuses a %-register at the last position). - * - * Also, ARM supports the old_select syscall if newselect is not available - */ -#define __ARCH_WANT_SYS_OLD_SELECT - -#define my_syscall0(num) \ -({ \ - register long _num asm("r7") = (num); \ - register long _arg1 asm("r0"); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_num) \ - : "memory", "cc", "lr" \ - ); \ - _arg1; \ -}) - -#define my_syscall1(num, arg1) \ -({ \ - register long _num asm("r7") = (num); \ - register long _arg1 asm("r0") = (long)(arg1); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_arg1), \ - "r"(_num) \ - : "memory", "cc", "lr" \ - ); \ - _arg1; \ -}) - -#define my_syscall2(num, arg1, arg2) \ -({ \ - register long _num asm("r7") = (num); \ - register long _arg1 asm("r0") = (long)(arg1); \ - register long _arg2 asm("r1") = (long)(arg2); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_arg1), "r"(_arg2), \ - "r"(_num) \ - : "memory", "cc", "lr" \ - ); \ - _arg1; \ -}) - -#define my_syscall3(num, arg1, arg2, arg3) \ -({ \ - register long _num asm("r7") = (num); \ - register long _arg1 asm("r0") = (long)(arg1); \ - register long _arg2 asm("r1") = (long)(arg2); \ - register long _arg3 asm("r2") = (long)(arg3); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ - "r"(_num) \ - : "memory", "cc", "lr" \ - ); \ - _arg1; \ -}) - -#define my_syscall4(num, arg1, arg2, arg3, arg4) \ -({ \ - register long _num asm("r7") = (num); \ - register long _arg1 asm("r0") = (long)(arg1); \ - register long _arg2 asm("r1") = (long)(arg2); \ - register long _arg3 asm("r2") = (long)(arg3); \ - register long _arg4 asm("r3") = (long)(arg4); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ - "r"(_num) \ - : "memory", "cc", "lr" \ - ); \ - _arg1; \ -}) - -#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ -({ \ - register long _num asm("r7") = (num); \ - register long _arg1 asm("r0") = (long)(arg1); \ - register long _arg2 asm("r1") = (long)(arg2); \ - register long _arg3 asm("r2") = (long)(arg3); \ - register long _arg4 asm("r3") = (long)(arg4); \ - register long _arg5 asm("r4") = (long)(arg5); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r" (_arg1) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ - "r"(_num) \ - : "memory", "cc", "lr" \ - ); \ - _arg1; \ -}) - -/* startup code */ -asm(".section .text\n" - ".global _start\n" - "_start:\n" -#if defined(__THUMBEB__) || defined(__THUMBEL__) - /* We enter here in 32-bit mode but if some previous functions were in - * 16-bit mode, the assembler cannot know, so we need to tell it we're in - * 32-bit now, then switch to 16-bit (is there a better way to do it than - * adding 1 by hand ?) and tell the asm we're now in 16-bit mode so that - * it generates correct instructions. Note that we do not support thumb1. - */ - ".code 32\n" - "add r0, pc, #1\n" - "bx r0\n" - ".code 16\n" -#endif - "pop {%r0}\n" // argc was in the stack - "mov %r1, %sp\n" // argv = sp - "add %r2, %r1, %r0, lsl #2\n" // envp = argv + 4*argc ... - "add %r2, %r2, $4\n" // ... + 4 - "and %r3, %r1, $-8\n" // AAPCS : sp must be 8-byte aligned in the - "mov %sp, %r3\n" // callee, an bl doesn't push (lr=pc) - "bl main\n" // main() returns the status code, we'll exit with it. - "and %r0, %r0, $0xff\n" // limit exit code to 8 bits - "movs r7, $1\n" // NR_exit == 1 - "svc $0x00\n" - ""); - -/* fcntl / open */ -#define O_RDONLY 0 -#define O_WRONLY 1 -#define O_RDWR 2 -#define O_CREAT 0x40 -#define O_EXCL 0x80 -#define O_NOCTTY 0x100 -#define O_TRUNC 0x200 -#define O_APPEND 0x400 -#define O_NONBLOCK 0x800 -#define O_DIRECTORY 0x4000 - -/* The struct returned by the stat() syscall, 32-bit only, the syscall returns - * exactly 56 bytes (stops before the unused array). In big endian, the format - * differs as devices are returned as short only. - */ -struct sys_stat_struct { -#if defined(__ARMEB__) - unsigned short st_dev; - unsigned short __pad1; -#else - unsigned long st_dev; -#endif - unsigned long st_ino; - unsigned short st_mode; - unsigned short st_nlink; - unsigned short st_uid; - unsigned short st_gid; -#if defined(__ARMEB__) - unsigned short st_rdev; - unsigned short __pad2; -#else - unsigned long st_rdev; -#endif - unsigned long st_size; - unsigned long st_blksize; - unsigned long st_blocks; - unsigned long st_atime; - unsigned long st_atime_nsec; - unsigned long st_mtime; - unsigned long st_mtime_nsec; - unsigned long st_ctime; - unsigned long st_ctime_nsec; - unsigned long __unused[2]; -}; - -#elif defined(__aarch64__) -/* Syscalls for AARCH64 : - * - registers are 64-bit - * - stack is 16-byte aligned - * - syscall number is passed in x8 - * - arguments are in x0, x1, x2, x3, x4, x5 - * - the system call is performed by calling svc 0 - * - syscall return comes in x0. - * - the arguments are cast to long and assigned into the target registers - * which are then simply passed as registers to the asm code, so that we - * don't have to experience issues with register constraints. - * - * On aarch64, select() is not implemented so we have to use pselect6(). - */ -#define __ARCH_WANT_SYS_PSELECT6 - -#define my_syscall0(num) \ -({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0"); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall1(num, arg1) \ -({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0") = (long)(arg1); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_arg1), \ - "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall2(num, arg1, arg2) \ -({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0") = (long)(arg1); \ - register long _arg2 asm("x1") = (long)(arg2); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_arg1), "r"(_arg2), \ - "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall3(num, arg1, arg2, arg3) \ -({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0") = (long)(arg1); \ - register long _arg2 asm("x1") = (long)(arg2); \ - register long _arg3 asm("x2") = (long)(arg3); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ - "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall4(num, arg1, arg2, arg3, arg4) \ -({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0") = (long)(arg1); \ - register long _arg2 asm("x1") = (long)(arg2); \ - register long _arg3 asm("x2") = (long)(arg3); \ - register long _arg4 asm("x3") = (long)(arg4); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ - "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ -({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0") = (long)(arg1); \ - register long _arg2 asm("x1") = (long)(arg2); \ - register long _arg3 asm("x2") = (long)(arg3); \ - register long _arg4 asm("x3") = (long)(arg4); \ - register long _arg5 asm("x4") = (long)(arg5); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r" (_arg1) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ - "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \ -({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0") = (long)(arg1); \ - register long _arg2 asm("x1") = (long)(arg2); \ - register long _arg3 asm("x2") = (long)(arg3); \ - register long _arg4 asm("x3") = (long)(arg4); \ - register long _arg5 asm("x4") = (long)(arg5); \ - register long _arg6 asm("x5") = (long)(arg6); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r" (_arg1) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ - "r"(_arg6), "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -/* startup code */ -asm(".section .text\n" - ".global _start\n" - "_start:\n" - "ldr x0, [sp]\n" // argc (x0) was in the stack - "add x1, sp, 8\n" // argv (x1) = sp - "lsl x2, x0, 3\n" // envp (x2) = 8*argc ... - "add x2, x2, 8\n" // + 8 (skip null) - "add x2, x2, x1\n" // + argv - "and sp, x1, -16\n" // sp must be 16-byte aligned in the callee - "bl main\n" // main() returns the status code, we'll exit with it. - "and x0, x0, 0xff\n" // limit exit code to 8 bits - "mov x8, 93\n" // NR_exit == 93 - "svc #0\n" - ""); - -/* fcntl / open */ -#define O_RDONLY 0 -#define O_WRONLY 1 -#define O_RDWR 2 -#define O_CREAT 0x40 -#define O_EXCL 0x80 -#define O_NOCTTY 0x100 -#define O_TRUNC 0x200 -#define O_APPEND 0x400 -#define O_NONBLOCK 0x800 -#define O_DIRECTORY 0x4000 - -/* The struct returned by the newfstatat() syscall. Differs slightly from the - * x86_64's stat one by field ordering, so be careful. - */ -struct sys_stat_struct { - unsigned long st_dev; - unsigned long st_ino; - unsigned int st_mode; - unsigned int st_nlink; - unsigned int st_uid; - unsigned int st_gid; - - unsigned long st_rdev; - unsigned long __pad1; - long st_size; - int st_blksize; - int __pad2; - - long st_blocks; - long st_atime; - unsigned long st_atime_nsec; - long st_mtime; - - unsigned long st_mtime_nsec; - long st_ctime; - unsigned long st_ctime_nsec; - unsigned int __unused[2]; -}; - -#elif defined(__mips__) && defined(_ABIO32) -/* Syscalls for MIPS ABI O32 : - * - WARNING! there's always a delayed slot! - * - WARNING again, the syntax is different, registers take a '$' and numbers - * do not. - * - registers are 32-bit - * - stack is 8-byte aligned - * - syscall number is passed in v0 (starts at 0xfa0). - * - arguments are in a0, a1, a2, a3, then the stack. The caller needs to - * leave some room in the stack for the callee to save a0..a3 if needed. - * - Many registers are clobbered, in fact only a0..a2 and s0..s8 are - * preserved. See: https://www.linux-mips.org/wiki/Syscall as well as - * scall32-o32.S in the kernel sources. - * - the system call is performed by calling "syscall" - * - syscall return comes in v0, and register a3 needs to be checked to know - * if an error occured, in which case errno is in v0. - * - the arguments are cast to long and assigned into the target registers - * which are then simply passed as registers to the asm code, so that we - * don't have to experience issues with register constraints. - */ - -#define my_syscall0(num) \ -({ \ - register long _num asm("v0") = (num); \ - register long _arg4 asm("a3"); \ - \ - asm volatile ( \ - "addiu $sp, $sp, -32\n" \ - "syscall\n" \ - "addiu $sp, $sp, 32\n" \ - : "=r"(_num), "=r"(_arg4) \ - : "r"(_num) \ - : "memory", "cc", "at", "v1", "hi", "lo", \ - \ - ); \ - _arg4 ? -_num : _num; \ -}) - -#define my_syscall1(num, arg1) \ -({ \ - register long _num asm("v0") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg4 asm("a3"); \ - \ - asm volatile ( \ - "addiu $sp, $sp, -32\n" \ - "syscall\n" \ - "addiu $sp, $sp, 32\n" \ - : "=r"(_num), "=r"(_arg4) \ - : "0"(_num), \ - "r"(_arg1) \ - : "memory", "cc", "at", "v1", "hi", "lo", \ - \ - ); \ - _arg4 ? -_num : _num; \ -}) - -#define my_syscall2(num, arg1, arg2) \ -({ \ - register long _num asm("v0") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - register long _arg4 asm("a3"); \ - \ - asm volatile ( \ - "addiu $sp, $sp, -32\n" \ - "syscall\n" \ - "addiu $sp, $sp, 32\n" \ - : "=r"(_num), "=r"(_arg4) \ - : "0"(_num), \ - "r"(_arg1), "r"(_arg2) \ - : "memory", "cc", "at", "v1", "hi", "lo", \ - \ - ); \ - _arg4 ? -_num : _num; \ -}) - -#define my_syscall3(num, arg1, arg2, arg3) \ -({ \ - register long _num asm("v0") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - register long _arg3 asm("a2") = (long)(arg3); \ - register long _arg4 asm("a3"); \ - \ - asm volatile ( \ - "addiu $sp, $sp, -32\n" \ - "syscall\n" \ - "addiu $sp, $sp, 32\n" \ - : "=r"(_num), "=r"(_arg4) \ - : "0"(_num), \ - "r"(_arg1), "r"(_arg2), "r"(_arg3) \ - : "memory", "cc", "at", "v1", "hi", "lo", \ - \ - ); \ - _arg4 ? -_num : _num; \ -}) - -#define my_syscall4(num, arg1, arg2, arg3, arg4) \ -({ \ - register long _num asm("v0") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - register long _arg3 asm("a2") = (long)(arg3); \ - register long _arg4 asm("a3") = (long)(arg4); \ - \ - asm volatile ( \ - "addiu $sp, $sp, -32\n" \ - "syscall\n" \ - "addiu $sp, $sp, 32\n" \ - : "=r" (_num), "=r"(_arg4) \ - : "0"(_num), \ - "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4) \ - : "memory", "cc", "at", "v1", "hi", "lo", \ - \ - ); \ - _arg4 ? -_num : _num; \ -}) - -#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ -({ \ - register long _num asm("v0") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - register long _arg3 asm("a2") = (long)(arg3); \ - register long _arg4 asm("a3") = (long)(arg4); \ - register long _arg5 = (long)(arg5); \ - \ - asm volatile ( \ - "addiu $sp, $sp, -32\n" \ - "sw %7, 16($sp)\n" \ - "syscall\n " \ - "addiu $sp, $sp, 32\n" \ - : "=r" (_num), "=r"(_arg4) \ - : "0"(_num), \ - "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5) \ - : "memory", "cc", "at", "v1", "hi", "lo", \ - \ - ); \ - _arg4 ? -_num : _num; \ -}) - -/* startup code, note that it's called __start on MIPS */ -asm(".section .text\n" - ".set nomips16\n" - ".global __start\n" - ".set noreorder\n" - ".option pic0\n" - ".ent __start\n" - "__start:\n" - "lw $a0,($sp)\n" // argc was in the stack - "addiu $a1, $sp, 4\n" // argv = sp + 4 - "sll $a2, $a0, 2\n" // a2 = argc * 4 - "add $a2, $a2, $a1\n" // envp = argv + 4*argc ... - "addiu $a2, $a2, 4\n" // ... + 4 - "li $t0, -8\n" - "and $sp, $sp, $t0\n" // sp must be 8-byte aligned - "addiu $sp,$sp,-16\n" // the callee expects to save a0..a3 there! - "jal main\n" // main() returns the status code, we'll exit with it. - "nop\n" // delayed slot - "and $a0, $v0, 0xff\n" // limit exit code to 8 bits - "li $v0, 4001\n" // NR_exit == 4001 - "syscall\n" - ".end __start\n" - ""); - -/* fcntl / open */ -#define O_RDONLY 0 -#define O_WRONLY 1 -#define O_RDWR 2 -#define O_APPEND 0x0008 -#define O_NONBLOCK 0x0080 -#define O_CREAT 0x0100 -#define O_TRUNC 0x0200 -#define O_EXCL 0x0400 -#define O_NOCTTY 0x0800 -#define O_DIRECTORY 0x10000 - -/* The struct returned by the stat() syscall. 88 bytes are returned by the - * syscall. - */ -struct sys_stat_struct { - unsigned int st_dev; - long st_pad1[3]; - unsigned long st_ino; - unsigned int st_mode; - unsigned int st_nlink; - unsigned int st_uid; - unsigned int st_gid; - unsigned int st_rdev; - long st_pad2[2]; - long st_size; - long st_pad3; - long st_atime; - long st_atime_nsec; - long st_mtime; - long st_mtime_nsec; - long st_ctime; - long st_ctime_nsec; - long st_blksize; - long st_blocks; - long st_pad4[14]; -}; - -#endif - - -/* Below are the C functions used to declare the raw syscalls. They try to be - * architecture-agnostic, and return either a success or -errno. Declaring them - * static will lead to them being inlined in most cases, but it's still possible - * to reference them by a pointer if needed. - */ -static __attribute__((unused)) -void *sys_brk(void *addr) -{ - return (void *)my_syscall1(__NR_brk, addr); -} - -static __attribute__((noreturn,unused)) -void sys_exit(int status) -{ - my_syscall1(__NR_exit, status & 255); - while(1); // shut the "noreturn" warnings. -} - -static __attribute__((unused)) -int sys_chdir(const char *path) -{ - return my_syscall1(__NR_chdir, path); -} - -static __attribute__((unused)) -int sys_chmod(const char *path, mode_t mode) -{ -#ifdef __NR_fchmodat - return my_syscall4(__NR_fchmodat, AT_FDCWD, path, mode, 0); -#else - return my_syscall2(__NR_chmod, path, mode); -#endif -} - -static __attribute__((unused)) -int sys_chown(const char *path, uid_t owner, gid_t group) -{ -#ifdef __NR_fchownat - return my_syscall5(__NR_fchownat, AT_FDCWD, path, owner, group, 0); -#else - return my_syscall3(__NR_chown, path, owner, group); -#endif -} - -static __attribute__((unused)) -int sys_chroot(const char *path) -{ - return my_syscall1(__NR_chroot, path); -} - -static __attribute__((unused)) -int sys_close(int fd) -{ - return my_syscall1(__NR_close, fd); -} - -static __attribute__((unused)) -int sys_dup(int fd) -{ - return my_syscall1(__NR_dup, fd); -} - -static __attribute__((unused)) -int sys_dup2(int old, int new) -{ - return my_syscall2(__NR_dup2, old, new); -} - -static __attribute__((unused)) -int sys_execve(const char *filename, char *const argv[], char *const envp[]) -{ - return my_syscall3(__NR_execve, filename, argv, envp); -} - -static __attribute__((unused)) -pid_t sys_fork(void) -{ - return my_syscall0(__NR_fork); -} - -static __attribute__((unused)) -int sys_fsync(int fd) -{ - return my_syscall1(__NR_fsync, fd); -} - -static __attribute__((unused)) -int sys_getdents64(int fd, struct linux_dirent64 *dirp, int count) -{ - return my_syscall3(__NR_getdents64, fd, dirp, count); -} - -static __attribute__((unused)) -pid_t sys_getpgrp(void) -{ - return my_syscall0(__NR_getpgrp); -} - -static __attribute__((unused)) -pid_t sys_getpid(void) -{ - return my_syscall0(__NR_getpid); -} - -static __attribute__((unused)) -int sys_gettimeofday(struct timeval *tv, struct timezone *tz) -{ - return my_syscall2(__NR_gettimeofday, tv, tz); -} - -static __attribute__((unused)) -int sys_ioctl(int fd, unsigned long req, void *value) -{ - return my_syscall3(__NR_ioctl, fd, req, value); -} - -static __attribute__((unused)) -int sys_kill(pid_t pid, int signal) -{ - return my_syscall2(__NR_kill, pid, signal); -} - -static __attribute__((unused)) -int sys_link(const char *old, const char *new) -{ -#ifdef __NR_linkat - return my_syscall5(__NR_linkat, AT_FDCWD, old, AT_FDCWD, new, 0); -#else - return my_syscall2(__NR_link, old, new); -#endif -} - -static __attribute__((unused)) -off_t sys_lseek(int fd, off_t offset, int whence) -{ - return my_syscall3(__NR_lseek, fd, offset, whence); -} - -static __attribute__((unused)) -int sys_mkdir(const char *path, mode_t mode) -{ -#ifdef __NR_mkdirat - return my_syscall3(__NR_mkdirat, AT_FDCWD, path, mode); -#else - return my_syscall2(__NR_mkdir, path, mode); -#endif -} - -static __attribute__((unused)) -long sys_mknod(const char *path, mode_t mode, dev_t dev) -{ -#ifdef __NR_mknodat - return my_syscall4(__NR_mknodat, AT_FDCWD, path, mode, dev); -#else - return my_syscall3(__NR_mknod, path, mode, dev); -#endif -} - -static __attribute__((unused)) -int sys_mount(const char *src, const char *tgt, const char *fst, - unsigned long flags, const void *data) -{ - return my_syscall5(__NR_mount, src, tgt, fst, flags, data); -} - -static __attribute__((unused)) -int sys_open(const char *path, int flags, mode_t mode) -{ -#ifdef __NR_openat - return my_syscall4(__NR_openat, AT_FDCWD, path, flags, mode); -#else - return my_syscall3(__NR_open, path, flags, mode); -#endif -} - -static __attribute__((unused)) -int sys_pivot_root(const char *new, const char *old) -{ - return my_syscall2(__NR_pivot_root, new, old); -} - -static __attribute__((unused)) -int sys_poll(struct pollfd *fds, int nfds, int timeout) -{ - return my_syscall3(__NR_poll, fds, nfds, timeout); -} - -static __attribute__((unused)) -ssize_t sys_read(int fd, void *buf, size_t count) -{ - return my_syscall3(__NR_read, fd, buf, count); -} - -static __attribute__((unused)) -ssize_t sys_reboot(int magic1, int magic2, int cmd, void *arg) -{ - return my_syscall4(__NR_reboot, magic1, magic2, cmd, arg); -} - -static __attribute__((unused)) -int sys_sched_yield(void) -{ - return my_syscall0(__NR_sched_yield); -} - -static __attribute__((unused)) -int sys_select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeval *timeout) -{ -#if defined(__ARCH_WANT_SYS_OLD_SELECT) && !defined(__NR__newselect) - struct sel_arg_struct { - unsigned long n; - fd_set *r, *w, *e; - struct timeval *t; - } arg = { .n = nfds, .r = rfds, .w = wfds, .e = efds, .t = timeout }; - return my_syscall1(__NR_select, &arg); -#elif defined(__ARCH_WANT_SYS_PSELECT6) && defined(__NR_pselect6) - struct timespec t; - - if (timeout) { - t.tv_sec = timeout->tv_sec; - t.tv_nsec = timeout->tv_usec * 1000; - } - return my_syscall6(__NR_pselect6, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL); -#else -#ifndef __NR__newselect -#define __NR__newselect __NR_select -#endif - return my_syscall5(__NR__newselect, nfds, rfds, wfds, efds, timeout); -#endif -} - -static __attribute__((unused)) -int sys_setpgid(pid_t pid, pid_t pgid) -{ - return my_syscall2(__NR_setpgid, pid, pgid); -} - -static __attribute__((unused)) -pid_t sys_setsid(void) -{ - return my_syscall0(__NR_setsid); -} - -static __attribute__((unused)) -int sys_stat(const char *path, struct stat *buf) -{ - struct sys_stat_struct stat; - long ret; - -#ifdef __NR_newfstatat - /* only solution for arm64 */ - ret = my_syscall4(__NR_newfstatat, AT_FDCWD, path, &stat, 0); -#else - ret = my_syscall2(__NR_stat, path, &stat); -#endif - buf->st_dev = stat.st_dev; - buf->st_ino = stat.st_ino; - buf->st_mode = stat.st_mode; - buf->st_nlink = stat.st_nlink; - buf->st_uid = stat.st_uid; - buf->st_gid = stat.st_gid; - buf->st_rdev = stat.st_rdev; - buf->st_size = stat.st_size; - buf->st_blksize = stat.st_blksize; - buf->st_blocks = stat.st_blocks; - buf->st_atime = stat.st_atime; - buf->st_mtime = stat.st_mtime; - buf->st_ctime = stat.st_ctime; - return ret; -} - - -static __attribute__((unused)) -int sys_symlink(const char *old, const char *new) -{ -#ifdef __NR_symlinkat - return my_syscall3(__NR_symlinkat, old, AT_FDCWD, new); -#else - return my_syscall2(__NR_symlink, old, new); -#endif -} - -static __attribute__((unused)) -mode_t sys_umask(mode_t mode) -{ - return my_syscall1(__NR_umask, mode); -} - -static __attribute__((unused)) -int sys_umount2(const char *path, int flags) -{ - return my_syscall2(__NR_umount2, path, flags); -} - -static __attribute__((unused)) -int sys_unlink(const char *path) -{ -#ifdef __NR_unlinkat - return my_syscall3(__NR_unlinkat, AT_FDCWD, path, 0); -#else - return my_syscall1(__NR_unlink, path); -#endif -} - -static __attribute__((unused)) -pid_t sys_wait4(pid_t pid, int *status, int options, struct rusage *rusage) -{ - return my_syscall4(__NR_wait4, pid, status, options, rusage); -} - -static __attribute__((unused)) -pid_t sys_waitpid(pid_t pid, int *status, int options) -{ - return sys_wait4(pid, status, options, 0); -} - -static __attribute__((unused)) -pid_t sys_wait(int *status) -{ - return sys_waitpid(-1, status, 0); -} - -static __attribute__((unused)) -ssize_t sys_write(int fd, const void *buf, size_t count) -{ - return my_syscall3(__NR_write, fd, buf, count); -} - - -/* Below are the libc-compatible syscalls which return x or -1 and set errno. - * They rely on the functions above. Similarly they're marked static so that it - * is possible to assign pointers to them if needed. - */ - -static __attribute__((unused)) -int brk(void *addr) -{ - void *ret = sys_brk(addr); - - if (!ret) { - SET_ERRNO(ENOMEM); - return -1; - } - return 0; -} - -static __attribute__((noreturn,unused)) -void exit(int status) -{ - sys_exit(status); -} - -static __attribute__((unused)) -int chdir(const char *path) -{ - int ret = sys_chdir(path); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int chmod(const char *path, mode_t mode) -{ - int ret = sys_chmod(path, mode); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int chown(const char *path, uid_t owner, gid_t group) -{ - int ret = sys_chown(path, owner, group); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int chroot(const char *path) -{ - int ret = sys_chroot(path); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int close(int fd) -{ - int ret = sys_close(fd); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int dup2(int old, int new) -{ - int ret = sys_dup2(old, new); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int execve(const char *filename, char *const argv[], char *const envp[]) -{ - int ret = sys_execve(filename, argv, envp); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -pid_t fork(void) -{ - pid_t ret = sys_fork(); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int fsync(int fd) -{ - int ret = sys_fsync(fd); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int getdents64(int fd, struct linux_dirent64 *dirp, int count) -{ - int ret = sys_getdents64(fd, dirp, count); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -pid_t getpgrp(void) -{ - pid_t ret = sys_getpgrp(); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -pid_t getpid(void) -{ - pid_t ret = sys_getpid(); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int gettimeofday(struct timeval *tv, struct timezone *tz) -{ - int ret = sys_gettimeofday(tv, tz); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int ioctl(int fd, unsigned long req, void *value) -{ - int ret = sys_ioctl(fd, req, value); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int kill(pid_t pid, int signal) -{ - int ret = sys_kill(pid, signal); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int link(const char *old, const char *new) -{ - int ret = sys_link(old, new); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -off_t lseek(int fd, off_t offset, int whence) -{ - off_t ret = sys_lseek(fd, offset, whence); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int mkdir(const char *path, mode_t mode) -{ - int ret = sys_mkdir(path, mode); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int mknod(const char *path, mode_t mode, dev_t dev) -{ - int ret = sys_mknod(path, mode, dev); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int mount(const char *src, const char *tgt, - const char *fst, unsigned long flags, - const void *data) -{ - int ret = sys_mount(src, tgt, fst, flags, data); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int open(const char *path, int flags, mode_t mode) -{ - int ret = sys_open(path, flags, mode); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int pivot_root(const char *new, const char *old) -{ - int ret = sys_pivot_root(new, old); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int poll(struct pollfd *fds, int nfds, int timeout) -{ - int ret = sys_poll(fds, nfds, timeout); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -ssize_t read(int fd, void *buf, size_t count) -{ - ssize_t ret = sys_read(fd, buf, count); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int reboot(int cmd) -{ - int ret = sys_reboot(LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2, cmd, 0); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -void *sbrk(intptr_t inc) -{ - void *ret; - - /* first call to find current end */ - if ((ret = sys_brk(0)) && (sys_brk(ret + inc) == ret + inc)) - return ret + inc; - - SET_ERRNO(ENOMEM); - return (void *)-1; -} - -static __attribute__((unused)) -int sched_yield(void) -{ - int ret = sys_sched_yield(); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeval *timeout) -{ - int ret = sys_select(nfds, rfds, wfds, efds, timeout); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int setpgid(pid_t pid, pid_t pgid) -{ - int ret = sys_setpgid(pid, pgid); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -pid_t setsid(void) -{ - pid_t ret = sys_setsid(); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -unsigned int sleep(unsigned int seconds) -{ - struct timeval my_timeval = { seconds, 0 }; - - if (sys_select(0, 0, 0, 0, &my_timeval) < 0) - return my_timeval.tv_sec + !!my_timeval.tv_usec; - else - return 0; -} - -static __attribute__((unused)) -int stat(const char *path, struct stat *buf) -{ - int ret = sys_stat(path, buf); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int symlink(const char *old, const char *new) -{ - int ret = sys_symlink(old, new); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int tcsetpgrp(int fd, pid_t pid) -{ - return ioctl(fd, TIOCSPGRP, &pid); -} - -static __attribute__((unused)) -mode_t umask(mode_t mode) -{ - return sys_umask(mode); -} - -static __attribute__((unused)) -int umount2(const char *path, int flags) -{ - int ret = sys_umount2(path, flags); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int unlink(const char *path) -{ - int ret = sys_unlink(path); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -pid_t wait4(pid_t pid, int *status, int options, struct rusage *rusage) -{ - pid_t ret = sys_wait4(pid, status, options, rusage); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -pid_t waitpid(pid_t pid, int *status, int options) -{ - pid_t ret = sys_waitpid(pid, status, options); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -pid_t wait(int *status) -{ - pid_t ret = sys_wait(status); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -ssize_t write(int fd, const void *buf, size_t count) -{ - ssize_t ret = sys_write(fd, buf, count); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -/* some size-optimized reimplementations of a few common str* and mem* - * functions. They're marked static, except memcpy() and raise() which are used - * by libgcc on ARM, so they are marked weak instead in order not to cause an - * error when building a program made of multiple files (not recommended). - */ - -static __attribute__((unused)) -void *memmove(void *dst, const void *src, size_t len) -{ - ssize_t pos = (dst <= src) ? -1 : (long)len; - void *ret = dst; - - while (len--) { - pos += (dst <= src) ? 1 : -1; - ((char *)dst)[pos] = ((char *)src)[pos]; - } - return ret; -} - -static __attribute__((unused)) -void *memset(void *dst, int b, size_t len) -{ - char *p = dst; - - while (len--) - *(p++) = b; - return dst; -} - -static __attribute__((unused)) -int memcmp(const void *s1, const void *s2, size_t n) -{ - size_t ofs = 0; - char c1 = 0; - - while (ofs < n && !(c1 = ((char *)s1)[ofs] - ((char *)s2)[ofs])) { - ofs++; - } - return c1; -} - -static __attribute__((unused)) -char *strcpy(char *dst, const char *src) -{ - char *ret = dst; - - while ((*dst++ = *src++)); - return ret; -} - -static __attribute__((unused)) -char *strchr(const char *s, int c) -{ - while (*s) { - if (*s == (char)c) - return (char *)s; - s++; - } - return NULL; -} - -static __attribute__((unused)) -char *strrchr(const char *s, int c) -{ - const char *ret = NULL; - - while (*s) { - if (*s == (char)c) - ret = s; - s++; - } - return (char *)ret; -} - -static __attribute__((unused)) -size_t nolibc_strlen(const char *str) -{ - size_t len; - - for (len = 0; str[len]; len++); - return len; -} - -#define strlen(str) ({ \ - __builtin_constant_p((str)) ? \ - __builtin_strlen((str)) : \ - nolibc_strlen((str)); \ -}) - -static __attribute__((unused)) -int isdigit(int c) -{ - return (unsigned int)(c - '0') <= 9; -} - -static __attribute__((unused)) -long atol(const char *s) -{ - unsigned long ret = 0; - unsigned long d; - int neg = 0; - - if (*s == '-') { - neg = 1; - s++; - } - - while (1) { - d = (*s++) - '0'; - if (d > 9) - break; - ret *= 10; - ret += d; - } - - return neg ? -ret : ret; -} - -static __attribute__((unused)) -int atoi(const char *s) -{ - return atol(s); -} - -static __attribute__((unused)) -const char *ltoa(long in) -{ - /* large enough for -9223372036854775808 */ - static char buffer[21]; - char *pos = buffer + sizeof(buffer) - 1; - int neg = in < 0; - unsigned long n = neg ? -in : in; - - *pos-- = '\0'; - do { - *pos-- = '0' + n % 10; - n /= 10; - if (pos < buffer) - return pos + 1; - } while (n); - - if (neg) - *pos-- = '-'; - return pos + 1; -} - -__attribute__((weak,unused)) -void *memcpy(void *dst, const void *src, size_t len) -{ - return memmove(dst, src, len); -} - -/* needed by libgcc for divide by zero */ -__attribute__((weak,unused)) -int raise(int signal) -{ - return kill(getpid(), signal); -} - -/* Here come a few helper functions */ - -static __attribute__((unused)) -void FD_ZERO(fd_set *set) -{ - memset(set, 0, sizeof(*set)); -} - -static __attribute__((unused)) -void FD_SET(int fd, fd_set *set) -{ - if (fd < 0 || fd >= FD_SETSIZE) - return; - set->fd32[fd / 32] |= 1 << (fd & 31); -} - -/* WARNING, it only deals with the 4096 first majors and 256 first minors */ -static __attribute__((unused)) -dev_t makedev(unsigned int major, unsigned int minor) -{ - return ((major & 0xfff) << 8) | (minor & 0xff); -} diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c @@ -4084,7 +4084,7 @@ static int kvm_suspend(void) static void kvm_resume(void) { if (kvm_usage_count) { - WARN_ON(raw_spin_is_locked(&kvm_count_lock)); + lockdep_assert_held(&kvm_count_lock); hardware_enable_nolock(NULL); } }