whiterose

linux unikernel
Log | Files | Refs | README | LICENSE | git clone https://git.ne02ptzero.me/git/whiterose

commit 0d1e8b8d2bcd3150d51754d8d0fdbf44dc88b0d3
parent 83c4087ce468601501ecde4d0ec5b2abd5f57c31
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Thu, 25 Oct 2018 17:57:35 -0700

Merge tag 'kvm-4.20-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM updates from Radim Krčmář:
 "ARM:
   - Improved guest IPA space support (32 to 52 bits)

   - RAS event delivery for 32bit

   - PMU fixes

   - Guest entry hardening

   - Various cleanups

   - Port of dirty_log_test selftest

  PPC:
   - Nested HV KVM support for radix guests on POWER9. The performance
     is much better than with PR KVM. Migration and arbitrary level of
     nesting is supported.

   - Disable nested HV-KVM on early POWER9 chips that need a particular
     hardware bug workaround

   - One VM per core mode to prevent potential data leaks

   - PCI pass-through optimization

   - merge ppc-kvm topic branch and kvm-ppc-fixes to get a better base

  s390:
   - Initial version of AP crypto virtualization via vfio-mdev

   - Improvement for vfio-ap

   - Set the host program identifier

   - Optimize page table locking

  x86:
   - Enable nested virtualization by default

   - Implement Hyper-V IPI hypercalls

   - Improve #PF and #DB handling

   - Allow guests to use Enlightened VMCS

   - Add migration selftests for VMCS and Enlightened VMCS

   - Allow coalesced PIO accesses

   - Add an option to perform nested VMCS host state consistency check
     through hardware

   - Automatic tuning of lapic_timer_advance_ns

   - Many fixes, minor improvements, and cleanups"

* tag 'kvm-4.20-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (204 commits)
  KVM/nVMX: Do not validate that posted_intr_desc_addr is page aligned
  Revert "kvm: x86: optimize dr6 restore"
  KVM: PPC: Optimize clearing TCEs for sparse tables
  x86/kvm/nVMX: tweak shadow fields
  selftests/kvm: add missing executables to .gitignore
  KVM: arm64: Safety check PSTATE when entering guest and handle IL
  KVM: PPC: Book3S HV: Don't use streamlined entry path on early POWER9 chips
  arm/arm64: KVM: Enable 32 bits kvm vcpu events support
  arm/arm64: KVM: Rename function kvm_arch_dev_ioctl_check_extension()
  KVM: arm64: Fix caching of host MDCR_EL2 value
  KVM: VMX: enable nested virtualization by default
  KVM/x86: Use 32bit xor to clear registers in svm.c
  kvm: x86: Introduce KVM_CAP_EXCEPTION_PAYLOAD
  kvm: vmx: Defer setting of DR6 until #DB delivery
  kvm: x86: Defer setting of CR2 until #PF delivery
  kvm: x86: Add payload operands to kvm_multiple_exception
  kvm: x86: Add exception payload fields to kvm_vcpu_events
  kvm: x86: Add has_payload and payload to kvm_queued_exception
  KVM: Documentation: Fix omission in struct kvm_vcpu_events
  KVM: selftests: add Enlightened VMCS test
  ...

Diffstat:
ADocumentation/s390/vfio-ap.txt | 837+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
MDocumentation/virtual/kvm/api.txt | 135+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
MMAINTAINERS | 12++++++++++++
March/arm/include/asm/kvm_arm.h | 3+--
March/arm/include/asm/kvm_host.h | 13++++++++++++-
March/arm/include/asm/kvm_mmu.h | 15++++++++++-----
March/arm/include/asm/stage2_pgtable.h | 54++++++++++++++++++++++++++++++++----------------------
March/arm64/include/asm/cpufeature.h | 21+++++++++++++++++++++
March/arm64/include/asm/kvm_arm.h | 155+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
March/arm64/include/asm/kvm_asm.h | 3+--
March/arm64/include/asm/kvm_host.h | 18+++++++++---------
March/arm64/include/asm/kvm_hyp.h | 10++++++++++
March/arm64/include/asm/kvm_mmu.h | 42++++++++++++++++++++++++++++++++++--------
March/arm64/include/asm/ptrace.h | 3+++
Darch/arm64/include/asm/stage2_pgtable-nopmd.h | 42------------------------------------------
Darch/arm64/include/asm/stage2_pgtable-nopud.h | 39---------------------------------------
March/arm64/include/asm/stage2_pgtable.h | 236+++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
March/arm64/kvm/guest.c | 6+++---
March/arm64/kvm/handle_exit.c | 7+++++++
March/arm64/kvm/hyp/Makefile | 1-
March/arm64/kvm/hyp/hyp-entry.S | 16+++++++++++++++-
Darch/arm64/kvm/hyp/s2-setup.c | 90-------------------------------------------------------------------------------
March/arm64/kvm/hyp/switch.c | 4++--
March/arm64/kvm/hyp/sysreg-sr.c | 19++++++++++++++++++-
March/arm64/kvm/hyp/tlb.c | 4++--
March/arm64/kvm/reset.c | 108++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
March/powerpc/include/asm/asm-prototypes.h | 21+++++++++++++++++++++
March/powerpc/include/asm/book3s/64/mmu-hash.h | 12++++++++++++
March/powerpc/include/asm/book3s/64/tlbflush-radix.h | 1+
March/powerpc/include/asm/hvcall.h | 41+++++++++++++++++++++++++++++++++++++++++
March/powerpc/include/asm/iommu.h | 2+-
March/powerpc/include/asm/kvm_asm.h | 4+---
March/powerpc/include/asm/kvm_book3s.h | 45++++++++++++++++++++++++++++++++++++++++-----
March/powerpc/include/asm/kvm_book3s_64.h | 118+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
March/powerpc/include/asm/kvm_book3s_asm.h | 3+++
March/powerpc/include/asm/kvm_booke.h | 4++--
March/powerpc/include/asm/kvm_host.h | 16++++++++++++++--
March/powerpc/include/asm/kvm_ppc.h | 8+++++---
March/powerpc/include/asm/ppc-opcode.h | 1+
March/powerpc/include/asm/reg.h | 2++
March/powerpc/include/uapi/asm/kvm.h | 1+
March/powerpc/kernel/asm-offsets.c | 5+++--
March/powerpc/kernel/cpu_setup_power.S | 4++--
March/powerpc/kvm/Makefile | 3++-
March/powerpc/kvm/book3s.c | 46++++++++++++++++++++++++++++++++++++----------
March/powerpc/kvm/book3s_64_mmu_hv.c | 7+++----
March/powerpc/kvm/book3s_64_mmu_radix.c | 718+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------
March/powerpc/kvm/book3s_64_vio.c | 94++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------
March/powerpc/kvm/book3s_64_vio_hv.c | 87++++++++++++++++++++++++++++++++++++++++++++-----------------------------------
March/powerpc/kvm/book3s_emulate.c | 13++++++-------
March/powerpc/kvm/book3s_hv.c | 873++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
March/powerpc/kvm/book3s_hv_builtin.c | 92++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
March/powerpc/kvm/book3s_hv_interrupts.S | 95+++++++++++++++++++++++++++++++++++++++++--------------------------------------
Aarch/powerpc/kvm/book3s_hv_nested.c | 1291+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
March/powerpc/kvm/book3s_hv_ras.c | 10++++++++++
March/powerpc/kvm/book3s_hv_rm_xics.c | 13++++++++++---
March/powerpc/kvm/book3s_hv_rmhandlers.S | 823++++++++++++++++++++++++++++++++++++++++++++-----------------------------------
March/powerpc/kvm/book3s_hv_tm.c | 6+++---
March/powerpc/kvm/book3s_hv_tm_builtin.c | 5+++--
March/powerpc/kvm/book3s_pr.c | 5++---
March/powerpc/kvm/book3s_xics.c | 14++++++--------
March/powerpc/kvm/book3s_xive.c | 63+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
March/powerpc/kvm/book3s_xive_template.c | 8--------
March/powerpc/kvm/bookehv_interrupts.S | 8++++----
March/powerpc/kvm/emulate_loadstore.c | 1-
March/powerpc/kvm/powerpc.c | 15++++++++++++++-
March/powerpc/kvm/tm.S | 250+++++++++++++++++++++++++++++++++++++++++++------------------------------------
March/powerpc/kvm/trace_book3s.h | 1-
March/powerpc/mm/tlb-radix.c | 9+++++++++
March/s390/Kconfig | 11+++++++++++
March/s390/include/asm/kvm_host.h | 15++++++++++++++-
March/s390/include/uapi/asm/kvm.h | 2++
March/s390/kvm/kvm-s390.c | 184+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
March/s390/kvm/kvm-s390.h | 1+
March/s390/kvm/vsie.c | 210++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
March/s390/mm/gmap.c | 10++++++++--
March/s390/tools/gen_facilities.c | 2++
March/x86/include/asm/kvm_host.h | 70+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------
March/x86/include/asm/virtext.h | 2+-
March/x86/include/asm/vmx.h | 13-------------
March/x86/include/uapi/asm/kvm.h | 8++++++--
March/x86/kvm/hyperv.c | 280++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------
March/x86/kvm/hyperv.h | 4++++
March/x86/kvm/lapic.c | 45++++++++++++++++++++++++++++++++++++++-------
March/x86/kvm/lapic.h | 2+-
March/x86/kvm/mmu.c | 393+++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
March/x86/kvm/mmu.h | 13++++---------
March/x86/kvm/mmu_audit.c | 12++++++------
March/x86/kvm/paging_tmpl.h | 15++++++++-------
March/x86/kvm/svm.c | 64+++++++++++++++++++++++++++++++++++-----------------------------
March/x86/kvm/trace.h | 42++++++++++++++++++++++++++++++++++++++++++
March/x86/kvm/vmx.c | 2363++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------
March/x86/kvm/vmx_shadow_fields.h | 5+----
March/x86/kvm/x86.c | 244+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
March/x86/kvm/x86.h | 2++
Mdrivers/iommu/Kconfig | 8++++++++
Mdrivers/s390/crypto/Makefile | 4++++
Adrivers/s390/crypto/vfio_ap_drv.c | 157+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adrivers/s390/crypto/vfio_ap_ops.c | 939+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adrivers/s390/crypto/vfio_ap_private.h | 88+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mdrivers/vfio/vfio_iommu_spapr_tce.c | 23+++++++++++++++++++++--
Minclude/linux/irqchip/arm-gic-v3.h | 5+++++
Minclude/uapi/linux/kvm.h | 26++++++++++++++++++++++++--
Minclude/uapi/linux/vfio.h | 2++
Mtools/arch/x86/include/uapi/asm/kvm.h | 10++++++++--
Mtools/include/uapi/linux/kvm.h | 5+++++
Mtools/perf/arch/powerpc/util/book3s_hv_exits.h | 1-
Mtools/testing/selftests/kvm/.gitignore | 14++++++++------
Mtools/testing/selftests/kvm/Makefile | 39++++++++++++++++++++++++++-------------
Dtools/testing/selftests/kvm/cr4_cpuid_sync_test.c | 113-------------------------------------------------------------------------------
Mtools/testing/selftests/kvm/dirty_log_test.c | 374++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------
Atools/testing/selftests/kvm/include/aarch64/processor.h | 55+++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atools/testing/selftests/kvm/include/evmcs.h | 1098+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtools/testing/selftests/kvm/include/kvm_util.h | 169+++++++++++++++++++++++++++++++++++++-------------------------------------------
Mtools/testing/selftests/kvm/include/sparsebit.h | 6+++---
Mtools/testing/selftests/kvm/include/test_util.h | 6+++---
Dtools/testing/selftests/kvm/include/vmx.h | 552-------------------------------------------------------------------------------
Dtools/testing/selftests/kvm/include/x86.h | 1047-------------------------------------------------------------------------------
Atools/testing/selftests/kvm/include/x86_64/processor.h | 1065+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atools/testing/selftests/kvm/include/x86_64/vmx.h | 581+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atools/testing/selftests/kvm/lib/aarch64/processor.c | 311+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtools/testing/selftests/kvm/lib/assert.c | 2+-
Mtools/testing/selftests/kvm/lib/kvm_util.c | 564++++++++++++++++++++++++++-----------------------------------------------------
Mtools/testing/selftests/kvm/lib/kvm_util_internal.h | 33+++++++++++++++++----------------
Atools/testing/selftests/kvm/lib/ucall.c | 144+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Dtools/testing/selftests/kvm/lib/vmx.c | 283-------------------------------------------------------------------------------
Dtools/testing/selftests/kvm/lib/x86.c | 888-------------------------------------------------------------------------------
Atools/testing/selftests/kvm/lib/x86_64/processor.c | 1133+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atools/testing/selftests/kvm/lib/x86_64/vmx.c | 312+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Dtools/testing/selftests/kvm/platform_info_test.c | 110-------------------------------------------------------------------------------
Dtools/testing/selftests/kvm/set_sregs_test.c | 54------------------------------------------------------
Dtools/testing/selftests/kvm/state_test.c | 196-------------------------------------------------------------------------------
Dtools/testing/selftests/kvm/sync_regs_test.c | 237-------------------------------------------------------------------------------
Dtools/testing/selftests/kvm/vmx_tsc_adjust_test.c | 175-------------------------------------------------------------------------------
Atools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c | 113+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atools/testing/selftests/kvm/x86_64/evmcs_test.c | 160+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atools/testing/selftests/kvm/x86_64/platform_info_test.c | 110+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atools/testing/selftests/kvm/x86_64/set_sregs_test.c | 54++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atools/testing/selftests/kvm/x86_64/state_test.c | 199+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atools/testing/selftests/kvm/x86_64/sync_regs_test.c | 237+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c | 175+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mvirt/kvm/arm/arm.c | 26++++++++++++--------------
Mvirt/kvm/arm/mmu.c | 128+++++++++++++++++++++++++++++++++++++++++++------------------------------------
Mvirt/kvm/arm/vgic/vgic-its.c | 36++++++++++--------------------------
Mvirt/kvm/arm/vgic/vgic-kvm-device.c | 2+-
Mvirt/kvm/arm/vgic/vgic-mmio-v3.c | 2--
Mvirt/kvm/coalesced_mmio.c | 12+++++++++---
Mvirt/kvm/kvm_main.c | 39++++++++++++++++++++++++---------------
148 files changed, 16031 insertions(+), 6834 deletions(-)

diff --git a/Documentation/s390/vfio-ap.txt b/Documentation/s390/vfio-ap.txt @@ -0,0 +1,837 @@ +Introduction: +============ +The Adjunct Processor (AP) facility is an IBM Z cryptographic facility comprised +of three AP instructions and from 1 up to 256 PCIe cryptographic adapter cards. +The AP devices provide cryptographic functions to all CPUs assigned to a +linux system running in an IBM Z system LPAR. + +The AP adapter cards are exposed via the AP bus. The motivation for vfio-ap +is to make AP cards available to KVM guests using the VFIO mediated device +framework. This implementation relies considerably on the s390 virtualization +facilities which do most of the hard work of providing direct access to AP +devices. + +AP Architectural Overview: +========================= +To facilitate the comprehension of the design, let's start with some +definitions: + +* AP adapter + + An AP adapter is an IBM Z adapter card that can perform cryptographic + functions. There can be from 0 to 256 adapters assigned to an LPAR. Adapters + assigned to the LPAR in which a linux host is running will be available to + the linux host. Each adapter is identified by a number from 0 to 255; however, + the maximum adapter number is determined by machine model and/or adapter type. + When installed, an AP adapter is accessed by AP instructions executed by any + CPU. + + The AP adapter cards are assigned to a given LPAR via the system's Activation + Profile which can be edited via the HMC. When the linux host system is IPL'd + in the LPAR, the AP bus detects the AP adapter cards assigned to the LPAR and + creates a sysfs device for each assigned adapter. For example, if AP adapters + 4 and 10 (0x0a) are assigned to the LPAR, the AP bus will create the following + sysfs device entries: + + /sys/devices/ap/card04 + /sys/devices/ap/card0a + + Symbolic links to these devices will also be created in the AP bus devices + sub-directory: + + /sys/bus/ap/devices/[card04] + /sys/bus/ap/devices/[card04] + +* AP domain + + An adapter is partitioned into domains. An adapter can hold up to 256 domains + depending upon the adapter type and hardware configuration. A domain is + identified by a number from 0 to 255; however, the maximum domain number is + determined by machine model and/or adapter type.. A domain can be thought of + as a set of hardware registers and memory used for processing AP commands. A + domain can be configured with a secure private key used for clear key + encryption. A domain is classified in one of two ways depending upon how it + may be accessed: + + * Usage domains are domains that are targeted by an AP instruction to + process an AP command. + + * Control domains are domains that are changed by an AP command sent to a + usage domain; for example, to set the secure private key for the control + domain. + + The AP usage and control domains are assigned to a given LPAR via the system's + Activation Profile which can be edited via the HMC. When a linux host system + is IPL'd in the LPAR, the AP bus module detects the AP usage and control + domains assigned to the LPAR. The domain number of each usage domain and + adapter number of each AP adapter are combined to create AP queue devices + (see AP Queue section below). The domain number of each control domain will be + represented in a bitmask and stored in a sysfs file + /sys/bus/ap/ap_control_domain_mask. The bits in the mask, from most to least + significant bit, correspond to domains 0-255. + +* AP Queue + + An AP queue is the means by which an AP command is sent to a usage domain + inside a specific adapter. An AP queue is identified by a tuple + comprised of an AP adapter ID (APID) and an AP queue index (APQI). The + APQI corresponds to a given usage domain number within the adapter. This tuple + forms an AP Queue Number (APQN) uniquely identifying an AP queue. AP + instructions include a field containing the APQN to identify the AP queue to + which the AP command is to be sent for processing. + + The AP bus will create a sysfs device for each APQN that can be derived from + the cross product of the AP adapter and usage domain numbers detected when the + AP bus module is loaded. For example, if adapters 4 and 10 (0x0a) and usage + domains 6 and 71 (0x47) are assigned to the LPAR, the AP bus will create the + following sysfs entries: + + /sys/devices/ap/card04/04.0006 + /sys/devices/ap/card04/04.0047 + /sys/devices/ap/card0a/0a.0006 + /sys/devices/ap/card0a/0a.0047 + + The following symbolic links to these devices will be created in the AP bus + devices subdirectory: + + /sys/bus/ap/devices/[04.0006] + /sys/bus/ap/devices/[04.0047] + /sys/bus/ap/devices/[0a.0006] + /sys/bus/ap/devices/[0a.0047] + +* AP Instructions: + + There are three AP instructions: + + * NQAP: to enqueue an AP command-request message to a queue + * DQAP: to dequeue an AP command-reply message from a queue + * PQAP: to administer the queues + + AP instructions identify the domain that is targeted to process the AP + command; this must be one of the usage domains. An AP command may modify a + domain that is not one of the usage domains, but the modified domain + must be one of the control domains. + +AP and SIE: +========== +Let's now take a look at how AP instructions executed on a guest are interpreted +by the hardware. + +A satellite control block called the Crypto Control Block (CRYCB) is attached to +our main hardware virtualization control block. The CRYCB contains three fields +to identify the adapters, usage domains and control domains assigned to the KVM +guest: + +* The AP Mask (APM) field is a bit mask that identifies the AP adapters assigned + to the KVM guest. Each bit in the mask, from left to right (i.e. from most + significant to least significant bit in big endian order), corresponds to + an APID from 0-255. If a bit is set, the corresponding adapter is valid for + use by the KVM guest. + +* The AP Queue Mask (AQM) field is a bit mask identifying the AP usage domains + assigned to the KVM guest. Each bit in the mask, from left to right (i.e. from + most significant to least significant bit in big endian order), corresponds to + an AP queue index (APQI) from 0-255. If a bit is set, the corresponding queue + is valid for use by the KVM guest. + +* The AP Domain Mask field is a bit mask that identifies the AP control domains + assigned to the KVM guest. The ADM bit mask controls which domains can be + changed by an AP command-request message sent to a usage domain from the + guest. Each bit in the mask, from left to right (i.e. from most significant to + least significant bit in big endian order), corresponds to a domain from + 0-255. If a bit is set, the corresponding domain can be modified by an AP + command-request message sent to a usage domain. + +If you recall from the description of an AP Queue, AP instructions include +an APQN to identify the AP queue to which an AP command-request message is to be +sent (NQAP and PQAP instructions), or from which a command-reply message is to +be received (DQAP instruction). The validity of an APQN is defined by the matrix +calculated from the APM and AQM; it is the cross product of all assigned adapter +numbers (APM) with all assigned queue indexes (AQM). For example, if adapters 1 +and 2 and usage domains 5 and 6 are assigned to a guest, the APQNs (1,5), (1,6), +(2,5) and (2,6) will be valid for the guest. + +The APQNs can provide secure key functionality - i.e., a private key is stored +on the adapter card for each of its domains - so each APQN must be assigned to +at most one guest or to the linux host. + + Example 1: Valid configuration: + ------------------------------ + Guest1: adapters 1,2 domains 5,6 + Guest2: adapter 1,2 domain 7 + + This is valid because both guests have a unique set of APQNs: + Guest1 has APQNs (1,5), (1,6), (2,5), (2,6); + Guest2 has APQNs (1,7), (2,7) + + Example 2: Valid configuration: + ------------------------------ + Guest1: adapters 1,2 domains 5,6 + Guest2: adapters 3,4 domains 5,6 + + This is also valid because both guests have a unique set of APQNs: + Guest1 has APQNs (1,5), (1,6), (2,5), (2,6); + Guest2 has APQNs (3,5), (3,6), (4,5), (4,6) + + Example 3: Invalid configuration: + -------------------------------- + Guest1: adapters 1,2 domains 5,6 + Guest2: adapter 1 domains 6,7 + + This is an invalid configuration because both guests have access to + APQN (1,6). + +The Design: +=========== +The design introduces three new objects: + +1. AP matrix device +2. VFIO AP device driver (vfio_ap.ko) +3. VFIO AP mediated matrix pass-through device + +The VFIO AP device driver +------------------------- +The VFIO AP (vfio_ap) device driver serves the following purposes: + +1. Provides the interfaces to secure APQNs for exclusive use of KVM guests. + +2. Sets up the VFIO mediated device interfaces to manage a mediated matrix + device and creates the sysfs interfaces for assigning adapters, usage + domains, and control domains comprising the matrix for a KVM guest. + +3. Configures the APM, AQM and ADM in the CRYCB referenced by a KVM guest's + SIE state description to grant the guest access to a matrix of AP devices + +Reserve APQNs for exclusive use of KVM guests +--------------------------------------------- +The following block diagram illustrates the mechanism by which APQNs are +reserved: + + +------------------+ + 7 remove | | + +--------------------> cex4queue driver | + | | | + | +------------------+ + | + | + | +------------------+ +-----------------+ + | 5 register driver | | 3 create | | + | +----------------> Device core +----------> matrix device | + | | | | | | + | | +--------^---------+ +-----------------+ + | | | + | | +-------------------+ + | | +-----------------------------------+ | + | | | 4 register AP driver | | 2 register device + | | | | | ++--------+---+-v---+ +--------+-------+-+ +| | | | +| ap_bus +--------------------- > vfio_ap driver | +| | 8 probe | | ++--------^---------+ +--^--^------------+ +6 edit | | | + apmask | +-----------------------------+ | 9 mdev create + aqmask | | 1 modprobe | ++--------+-----+---+ +----------------+-+ +------------------+ +| | | |8 create | mediated | +| admin | | VFIO device core |---------> matrix | +| + | | | device | ++------+-+---------+ +--------^---------+ +--------^---------+ + | | | | + | | 9 create vfio_ap-passthrough | | + | +------------------------------+ | + +-------------------------------------------------------------+ + 10 assign adapter/domain/control domain + +The process for reserving an AP queue for use by a KVM guest is: + +1. The administrator loads the vfio_ap device driver +2. The vfio-ap driver during its initialization will register a single 'matrix' + device with the device core. This will serve as the parent device for + all mediated matrix devices used to configure an AP matrix for a guest. +3. The /sys/devices/vfio_ap/matrix device is created by the device core +4 The vfio_ap device driver will register with the AP bus for AP queue devices + of type 10 and higher (CEX4 and newer). The driver will provide the vfio_ap + driver's probe and remove callback interfaces. Devices older than CEX4 queues + are not supported to simplify the implementation by not needlessly + complicating the design by supporting older devices that will go out of + service in the relatively near future, and for which there are few older + systems around on which to test. +5. The AP bus registers the vfio_ap device driver with the device core +6. The administrator edits the AP adapter and queue masks to reserve AP queues + for use by the vfio_ap device driver. +7. The AP bus removes the AP queues reserved for the vfio_ap driver from the + default zcrypt cex4queue driver. +8. The AP bus probes the vfio_ap device driver to bind the queues reserved for + it. +9. The administrator creates a passthrough type mediated matrix device to be + used by a guest +10 The administrator assigns the adapters, usage domains and control domains + to be exclusively used by a guest. + +Set up the VFIO mediated device interfaces +------------------------------------------ +The VFIO AP device driver utilizes the common interface of the VFIO mediated +device core driver to: +* Register an AP mediated bus driver to add a mediated matrix device to and + remove it from a VFIO group. +* Create and destroy a mediated matrix device +* Add a mediated matrix device to and remove it from the AP mediated bus driver +* Add a mediated matrix device to and remove it from an IOMMU group + +The following high-level block diagram shows the main components and interfaces +of the VFIO AP mediated matrix device driver: + + +-------------+ + | | + | +---------+ | mdev_register_driver() +--------------+ + | | Mdev | +<-----------------------+ | + | | bus | | | vfio_mdev.ko | + | | driver | +----------------------->+ |<-> VFIO user + | +---------+ | probe()/remove() +--------------+ APIs + | | + | MDEV CORE | + | MODULE | + | mdev.ko | + | +---------+ | mdev_register_device() +--------------+ + | |Physical | +<-----------------------+ | + | | device | | | vfio_ap.ko |<-> matrix + | |interface| +----------------------->+ | device + | +---------+ | callback +--------------+ + +-------------+ + +During initialization of the vfio_ap module, the matrix device is registered +with an 'mdev_parent_ops' structure that provides the sysfs attribute +structures, mdev functions and callback interfaces for managing the mediated +matrix device. + +* sysfs attribute structures: + * supported_type_groups + The VFIO mediated device framework supports creation of user-defined + mediated device types. These mediated device types are specified + via the 'supported_type_groups' structure when a device is registered + with the mediated device framework. The registration process creates the + sysfs structures for each mediated device type specified in the + 'mdev_supported_types' sub-directory of the device being registered. Along + with the device type, the sysfs attributes of the mediated device type are + provided. + + The VFIO AP device driver will register one mediated device type for + passthrough devices: + /sys/devices/vfio_ap/matrix/mdev_supported_types/vfio_ap-passthrough + Only the read-only attributes required by the VFIO mdev framework will + be provided: + ... name + ... device_api + ... available_instances + ... device_api + Where: + * name: specifies the name of the mediated device type + * device_api: the mediated device type's API + * available_instances: the number of mediated matrix passthrough devices + that can be created + * device_api: specifies the VFIO API + * mdev_attr_groups + This attribute group identifies the user-defined sysfs attributes of the + mediated device. When a device is registered with the VFIO mediated device + framework, the sysfs attribute files identified in the 'mdev_attr_groups' + structure will be created in the mediated matrix device's directory. The + sysfs attributes for a mediated matrix device are: + * assign_adapter: + * unassign_adapter: + Write-only attributes for assigning/unassigning an AP adapter to/from the + mediated matrix device. To assign/unassign an adapter, the APID of the + adapter is echoed to the respective attribute file. + * assign_domain: + * unassign_domain: + Write-only attributes for assigning/unassigning an AP usage domain to/from + the mediated matrix device. To assign/unassign a domain, the domain + number of the the usage domain is echoed to the respective attribute + file. + * matrix: + A read-only file for displaying the APQNs derived from the cross product + of the adapter and domain numbers assigned to the mediated matrix device. + * assign_control_domain: + * unassign_control_domain: + Write-only attributes for assigning/unassigning an AP control domain + to/from the mediated matrix device. To assign/unassign a control domain, + the ID of the domain to be assigned/unassigned is echoed to the respective + attribute file. + * control_domains: + A read-only file for displaying the control domain numbers assigned to the + mediated matrix device. + +* functions: + * create: + allocates the ap_matrix_mdev structure used by the vfio_ap driver to: + * Store the reference to the KVM structure for the guest using the mdev + * Store the AP matrix configuration for the adapters, domains, and control + domains assigned via the corresponding sysfs attributes files + * remove: + deallocates the mediated matrix device's ap_matrix_mdev structure. This will + be allowed only if a running guest is not using the mdev. + +* callback interfaces + * open: + The vfio_ap driver uses this callback to register a + VFIO_GROUP_NOTIFY_SET_KVM notifier callback function for the mdev matrix + device. The open is invoked when QEMU connects the VFIO iommu group + for the mdev matrix device to the MDEV bus. Access to the KVM structure used + to configure the KVM guest is provided via this callback. The KVM structure, + is used to configure the guest's access to the AP matrix defined via the + mediated matrix device's sysfs attribute files. + * release: + unregisters the VFIO_GROUP_NOTIFY_SET_KVM notifier callback function for the + mdev matrix device and deconfigures the guest's AP matrix. + +Configure the APM, AQM and ADM in the CRYCB: +------------------------------------------- +Configuring the AP matrix for a KVM guest will be performed when the +VFIO_GROUP_NOTIFY_SET_KVM notifier callback is invoked. The notifier +function is called when QEMU connects to KVM. The guest's AP matrix is +configured via it's CRYCB by: +* Setting the bits in the APM corresponding to the APIDs assigned to the + mediated matrix device via its 'assign_adapter' interface. +* Setting the bits in the AQM corresponding to the domains assigned to the + mediated matrix device via its 'assign_domain' interface. +* Setting the bits in the ADM corresponding to the domain dIDs assigned to the + mediated matrix device via its 'assign_control_domains' interface. + +The CPU model features for AP +----------------------------- +The AP stack relies on the presence of the AP instructions as well as two +facilities: The AP Facilities Test (APFT) facility; and the AP Query +Configuration Information (QCI) facility. These features/facilities are made +available to a KVM guest via the following CPU model features: + +1. ap: Indicates whether the AP instructions are installed on the guest. This + feature will be enabled by KVM only if the AP instructions are installed + on the host. + +2. apft: Indicates the APFT facility is available on the guest. This facility + can be made available to the guest only if it is available on the host (i.e., + facility bit 15 is set). + +3. apqci: Indicates the AP QCI facility is available on the guest. This facility + can be made available to the guest only if it is available on the host (i.e., + facility bit 12 is set). + +Note: If the user chooses to specify a CPU model different than the 'host' +model to QEMU, the CPU model features and facilities need to be turned on +explicitly; for example: + + /usr/bin/qemu-system-s390x ... -cpu z13,ap=on,apqci=on,apft=on + +A guest can be precluded from using AP features/facilities by turning them off +explicitly; for example: + + /usr/bin/qemu-system-s390x ... -cpu host,ap=off,apqci=off,apft=off + +Note: If the APFT facility is turned off (apft=off) for the guest, the guest +will not see any AP devices. The zcrypt device drivers that register for type 10 +and newer AP devices - i.e., the cex4card and cex4queue device drivers - need +the APFT facility to ascertain the facilities installed on a given AP device. If +the APFT facility is not installed on the guest, then the probe of device +drivers will fail since only type 10 and newer devices can be configured for +guest use. + +Example: +======= +Let's now provide an example to illustrate how KVM guests may be given +access to AP facilities. For this example, we will show how to configure +three guests such that executing the lszcrypt command on the guests would +look like this: + +Guest1 +------ +CARD.DOMAIN TYPE MODE +------------------------------ +05 CEX5C CCA-Coproc +05.0004 CEX5C CCA-Coproc +05.00ab CEX5C CCA-Coproc +06 CEX5A Accelerator +06.0004 CEX5A Accelerator +06.00ab CEX5C CCA-Coproc + +Guest2 +------ +CARD.DOMAIN TYPE MODE +------------------------------ +05 CEX5A Accelerator +05.0047 CEX5A Accelerator +05.00ff CEX5A Accelerator + +Guest2 +------ +CARD.DOMAIN TYPE MODE +------------------------------ +06 CEX5A Accelerator +06.0047 CEX5A Accelerator +06.00ff CEX5A Accelerator + +These are the steps: + +1. Install the vfio_ap module on the linux host. The dependency chain for the + vfio_ap module is: + * iommu + * s390 + * zcrypt + * vfio + * vfio_mdev + * vfio_mdev_device + * KVM + + To build the vfio_ap module, the kernel build must be configured with the + following Kconfig elements selected: + * IOMMU_SUPPORT + * S390 + * ZCRYPT + * S390_AP_IOMMU + * VFIO + * VFIO_MDEV + * VFIO_MDEV_DEVICE + * KVM + + If using make menuconfig select the following to build the vfio_ap module: + -> Device Drivers + -> IOMMU Hardware Support + select S390 AP IOMMU Support + -> VFIO Non-Privileged userspace driver framework + -> Mediated device driver frramework + -> VFIO driver for Mediated devices + -> I/O subsystem + -> VFIO support for AP devices + +2. Secure the AP queues to be used by the three guests so that the host can not + access them. To secure them, there are two sysfs files that specify + bitmasks marking a subset of the APQN range as 'usable by the default AP + queue device drivers' or 'not usable by the default device drivers' and thus + available for use by the vfio_ap device driver'. The location of the sysfs + files containing the masks are: + + /sys/bus/ap/apmask + /sys/bus/ap/aqmask + + The 'apmask' is a 256-bit mask that identifies a set of AP adapter IDs + (APID). Each bit in the mask, from left to right (i.e., from most significant + to least significant bit in big endian order), corresponds to an APID from + 0-255. If a bit is set, the APID is marked as usable only by the default AP + queue device drivers; otherwise, the APID is usable by the vfio_ap + device driver. + + The 'aqmask' is a 256-bit mask that identifies a set of AP queue indexes + (APQI). Each bit in the mask, from left to right (i.e., from most significant + to least significant bit in big endian order), corresponds to an APQI from + 0-255. If a bit is set, the APQI is marked as usable only by the default AP + queue device drivers; otherwise, the APQI is usable by the vfio_ap device + driver. + + Take, for example, the following mask: + + 0x7dffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff + + It indicates: + + 1, 2, 3, 4, 5, and 7-255 belong to the default drivers' pool, and 0 and 6 + belong to the vfio_ap device driver's pool. + + The APQN of each AP queue device assigned to the linux host is checked by the + AP bus against the set of APQNs derived from the cross product of APIDs + and APQIs marked as usable only by the default AP queue device drivers. If a + match is detected, only the default AP queue device drivers will be probed; + otherwise, the vfio_ap device driver will be probed. + + By default, the two masks are set to reserve all APQNs for use by the default + AP queue device drivers. There are two ways the default masks can be changed: + + 1. The sysfs mask files can be edited by echoing a string into the + respective sysfs mask file in one of two formats: + + * An absolute hex string starting with 0x - like "0x12345678" - sets + the mask. If the given string is shorter than the mask, it is padded + with 0s on the right; for example, specifying a mask value of 0x41 is + the same as specifying: + + 0x4100000000000000000000000000000000000000000000000000000000000000 + + Keep in mind that the mask reads from left to right (i.e., most + significant to least significant bit in big endian order), so the mask + above identifies device numbers 1 and 7 (01000001). + + If the string is longer than the mask, the operation is terminated with + an error (EINVAL). + + * Individual bits in the mask can be switched on and off by specifying + each bit number to be switched in a comma separated list. Each bit + number string must be prepended with a ('+') or minus ('-') to indicate + the corresponding bit is to be switched on ('+') or off ('-'). Some + valid values are: + + "+0" switches bit 0 on + "-13" switches bit 13 off + "+0x41" switches bit 65 on + "-0xff" switches bit 255 off + + The following example: + +0,-6,+0x47,-0xf0 + + Switches bits 0 and 71 (0x47) on + Switches bits 6 and 240 (0xf0) off + + Note that the bits not specified in the list remain as they were before + the operation. + + 2. The masks can also be changed at boot time via parameters on the kernel + command line like this: + + ap.apmask=0xffff ap.aqmask=0x40 + + This would create the following masks: + + apmask: + 0xffff000000000000000000000000000000000000000000000000000000000000 + + aqmask: + 0x4000000000000000000000000000000000000000000000000000000000000000 + + Resulting in these two pools: + + default drivers pool: adapter 0-15, domain 1 + alternate drivers pool: adapter 16-255, domains 0, 2-255 + + Securing the APQNs for our example: + ---------------------------------- + To secure the AP queues 05.0004, 05.0047, 05.00ab, 05.00ff, 06.0004, 06.0047, + 06.00ab, and 06.00ff for use by the vfio_ap device driver, the corresponding + APQNs can either be removed from the default masks: + + echo -5,-6 > /sys/bus/ap/apmask + + echo -4,-0x47,-0xab,-0xff > /sys/bus/ap/aqmask + + Or the masks can be set as follows: + + echo 0xf9ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff \ + > apmask + + echo 0xf7fffffffffffffffeffffffffffffffffffffffffeffffffffffffffffffffe \ + > aqmask + + This will result in AP queues 05.0004, 05.0047, 05.00ab, 05.00ff, 06.0004, + 06.0047, 06.00ab, and 06.00ff getting bound to the vfio_ap device driver. The + sysfs directory for the vfio_ap device driver will now contain symbolic links + to the AP queue devices bound to it: + + /sys/bus/ap + ... [drivers] + ...... [vfio_ap] + ......... [05.0004] + ......... [05.0047] + ......... [05.00ab] + ......... [05.00ff] + ......... [06.0004] + ......... [06.0047] + ......... [06.00ab] + ......... [06.00ff] + + Keep in mind that only type 10 and newer adapters (i.e., CEX4 and later) + can be bound to the vfio_ap device driver. The reason for this is to + simplify the implementation by not needlessly complicating the design by + supporting older devices that will go out of service in the relatively near + future and for which there are few older systems on which to test. + + The administrator, therefore, must take care to secure only AP queues that + can be bound to the vfio_ap device driver. The device type for a given AP + queue device can be read from the parent card's sysfs directory. For example, + to see the hardware type of the queue 05.0004: + + cat /sys/bus/ap/devices/card05/hwtype + + The hwtype must be 10 or higher (CEX4 or newer) in order to be bound to the + vfio_ap device driver. + +3. Create the mediated devices needed to configure the AP matrixes for the + three guests and to provide an interface to the vfio_ap driver for + use by the guests: + + /sys/devices/vfio_ap/matrix/ + --- [mdev_supported_types] + ------ [vfio_ap-passthrough] (passthrough mediated matrix device type) + --------- create + --------- [devices] + + To create the mediated devices for the three guests: + + uuidgen > create + uuidgen > create + uuidgen > create + + or + + echo $uuid1 > create + echo $uuid2 > create + echo $uuid3 > create + + This will create three mediated devices in the [devices] subdirectory named + after the UUID written to the create attribute file. We call them $uuid1, + $uuid2 and $uuid3 and this is the sysfs directory structure after creation: + + /sys/devices/vfio_ap/matrix/ + --- [mdev_supported_types] + ------ [vfio_ap-passthrough] + --------- [devices] + ------------ [$uuid1] + --------------- assign_adapter + --------------- assign_control_domain + --------------- assign_domain + --------------- matrix + --------------- unassign_adapter + --------------- unassign_control_domain + --------------- unassign_domain + + ------------ [$uuid2] + --------------- assign_adapter + --------------- assign_control_domain + --------------- assign_domain + --------------- matrix + --------------- unassign_adapter + ----------------unassign_control_domain + ----------------unassign_domain + + ------------ [$uuid3] + --------------- assign_adapter + --------------- assign_control_domain + --------------- assign_domain + --------------- matrix + --------------- unassign_adapter + ----------------unassign_control_domain + ----------------unassign_domain + +4. The administrator now needs to configure the matrixes for the mediated + devices $uuid1 (for Guest1), $uuid2 (for Guest2) and $uuid3 (for Guest3). + + This is how the matrix is configured for Guest1: + + echo 5 > assign_adapter + echo 6 > assign_adapter + echo 4 > assign_domain + echo 0xab > assign_domain + + Control domains can similarly be assigned using the assign_control_domain + sysfs file. + + If a mistake is made configuring an adapter, domain or control domain, + you can use the unassign_xxx files to unassign the adapter, domain or + control domain. + + To display the matrix configuration for Guest1: + + cat matrix + + This is how the matrix is configured for Guest2: + + echo 5 > assign_adapter + echo 0x47 > assign_domain + echo 0xff > assign_domain + + This is how the matrix is configured for Guest3: + + echo 6 > assign_adapter + echo 0x47 > assign_domain + echo 0xff > assign_domain + + In order to successfully assign an adapter: + + * The adapter number specified must represent a value from 0 up to the + maximum adapter number configured for the system. If an adapter number + higher than the maximum is specified, the operation will terminate with + an error (ENODEV). + + * All APQNs that can be derived from the adapter ID and the IDs of + the previously assigned domains must be bound to the vfio_ap device + driver. If no domains have yet been assigned, then there must be at least + one APQN with the specified APID bound to the vfio_ap driver. If no such + APQNs are bound to the driver, the operation will terminate with an + error (EADDRNOTAVAIL). + + No APQN that can be derived from the adapter ID and the IDs of the + previously assigned domains can be assigned to another mediated matrix + device. If an APQN is assigned to another mediated matrix device, the + operation will terminate with an error (EADDRINUSE). + + In order to successfully assign a domain: + + * The domain number specified must represent a value from 0 up to the + maximum domain number configured for the system. If a domain number + higher than the maximum is specified, the operation will terminate with + an error (ENODEV). + + * All APQNs that can be derived from the domain ID and the IDs of + the previously assigned adapters must be bound to the vfio_ap device + driver. If no domains have yet been assigned, then there must be at least + one APQN with the specified APQI bound to the vfio_ap driver. If no such + APQNs are bound to the driver, the operation will terminate with an + error (EADDRNOTAVAIL). + + No APQN that can be derived from the domain ID and the IDs of the + previously assigned adapters can be assigned to another mediated matrix + device. If an APQN is assigned to another mediated matrix device, the + operation will terminate with an error (EADDRINUSE). + + In order to successfully assign a control domain, the domain number + specified must represent a value from 0 up to the maximum domain number + configured for the system. If a control domain number higher than the maximum + is specified, the operation will terminate with an error (ENODEV). + +5. Start Guest1: + + /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \ + -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid1 ... + +7. Start Guest2: + + /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \ + -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid2 ... + +7. Start Guest3: + + /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \ + -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid3 ... + +When the guest is shut down, the mediated matrix devices may be removed. + +Using our example again, to remove the mediated matrix device $uuid1: + + /sys/devices/vfio_ap/matrix/ + --- [mdev_supported_types] + ------ [vfio_ap-passthrough] + --------- [devices] + ------------ [$uuid1] + --------------- remove + + + echo 1 > remove + + This will remove all of the mdev matrix device's sysfs structures including + the mdev device itself. To recreate and reconfigure the mdev matrix device, + all of the steps starting with step 3 will have to be performed again. Note + that the remove will fail if a guest using the mdev is still running. + + It is not necessary to remove an mdev matrix device, but one may want to + remove it if no guest will use it during the remaining lifetime of the linux + host. If the mdev matrix device is removed, one may want to also reconfigure + the pool of adapters and queues reserved for use by the default drivers. + +Limitations +=========== +* The KVM/kernel interfaces do not provide a way to prevent restoring an APQN + to the default drivers pool of a queue that is still assigned to a mediated + device in use by a guest. It is incumbent upon the administrator to + ensure there is no mediated device in use by a guest to which the APQN is + assigned lest the host be given access to the private data of the AP queue + device such as a private key configured specifically for the guest. + +* Dynamically modifying the AP matrix for a running guest (which would amount to + hot(un)plug of AP devices for the guest) is currently not supported + +* Live guest migration is not supported for guests using AP devices. diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt @@ -123,6 +123,37 @@ memory layout to fit in user mode), check KVM_CAP_MIPS_VZ and use the flag KVM_VM_MIPS_VZ. +On arm64, the physical address size for a VM (IPA Size limit) is limited +to 40bits by default. The limit can be configured if the host supports the +extension KVM_CAP_ARM_VM_IPA_SIZE. When supported, use +KVM_VM_TYPE_ARM_IPA_SIZE(IPA_Bits) to set the size in the machine type +identifier, where IPA_Bits is the maximum width of any physical +address used by the VM. The IPA_Bits is encoded in bits[7-0] of the +machine type identifier. + +e.g, to configure a guest to use 48bit physical address size : + + vm_fd = ioctl(dev_fd, KVM_CREATE_VM, KVM_VM_TYPE_ARM_IPA_SIZE(48)); + +The requested size (IPA_Bits) must be : + 0 - Implies default size, 40bits (for backward compatibility) + + or + + N - Implies N bits, where N is a positive integer such that, + 32 <= N <= Host_IPA_Limit + +Host_IPA_Limit is the maximum possible value for IPA_Bits on the host and +is dependent on the CPU capability and the kernel configuration. The limit can +be retrieved using KVM_CAP_ARM_VM_IPA_SIZE of the KVM_CHECK_EXTENSION +ioctl() at run-time. + +Please note that configuring the IPA size does not affect the capability +exposed by the guest CPUs in ID_AA64MMFR0_EL1[PARange]. It only affects +size of the address translated by the stage2 level (guest physical to +host physical address translations). + + 4.3 KVM_GET_MSR_INDEX_LIST, KVM_GET_MSR_FEATURE_INDEX_LIST Capability: basic, KVM_CAP_GET_MSR_FEATURES for KVM_GET_MSR_FEATURE_INDEX_LIST @@ -850,7 +881,7 @@ struct kvm_vcpu_events { __u8 injected; __u8 nr; __u8 has_error_code; - __u8 pad; + __u8 pending; __u32 error_code; } exception; struct { @@ -873,15 +904,23 @@ struct kvm_vcpu_events { __u8 smm_inside_nmi; __u8 latched_init; } smi; + __u8 reserved[27]; + __u8 exception_has_payload; + __u64 exception_payload; }; -Only two fields are defined in the flags field: +The following bits are defined in the flags field: -- KVM_VCPUEVENT_VALID_SHADOW may be set in the flags field to signal that +- KVM_VCPUEVENT_VALID_SHADOW may be set to signal that interrupt.shadow contains a valid state. -- KVM_VCPUEVENT_VALID_SMM may be set in the flags field to signal that - smi contains a valid state. +- KVM_VCPUEVENT_VALID_SMM may be set to signal that smi contains a + valid state. + +- KVM_VCPUEVENT_VALID_PAYLOAD may be set to signal that the + exception_has_payload, exception_payload, and exception.pending + fields contain a valid state. This bit will be set whenever + KVM_CAP_EXCEPTION_PAYLOAD is enabled. ARM/ARM64: @@ -961,6 +1000,11 @@ shall be written into the VCPU. KVM_VCPUEVENT_VALID_SMM can only be set if KVM_CAP_X86_SMM is available. +If KVM_CAP_EXCEPTION_PAYLOAD is enabled, KVM_VCPUEVENT_VALID_PAYLOAD +can be set in the flags field to signal that the +exception_has_payload, exception_payload, and exception.pending fields +contain a valid state and shall be written into the VCPU. + ARM/ARM64: Set the pending SError exception state for this VCPU. It is not possible to @@ -1922,6 +1966,7 @@ registers, find a list below: PPC | KVM_REG_PPC_TIDR | 64 PPC | KVM_REG_PPC_PSSCR | 64 PPC | KVM_REG_PPC_DEC_EXPIRY | 64 + PPC | KVM_REG_PPC_PTCR | 64 PPC | KVM_REG_PPC_TM_GPR0 | 64 ... PPC | KVM_REG_PPC_TM_GPR31 | 64 @@ -2269,6 +2314,10 @@ The supported flags are: The emulated MMU supports 1T segments in addition to the standard 256M ones. + - KVM_PPC_NO_HASH + This flag indicates that HPT guests are not supported by KVM, + thus all guests must use radix MMU mode. + The "slb_size" field indicates how many SLB entries are supported The "sps" array contains 8 entries indicating the supported base @@ -3676,6 +3725,34 @@ Returns: 0 on success, -1 on error This copies the vcpu's kvm_nested_state struct from userspace to the kernel. For the definition of struct kvm_nested_state, see KVM_GET_NESTED_STATE. +4.116 KVM_(UN)REGISTER_COALESCED_MMIO + +Capability: KVM_CAP_COALESCED_MMIO (for coalesced mmio) + KVM_CAP_COALESCED_PIO (for coalesced pio) +Architectures: all +Type: vm ioctl +Parameters: struct kvm_coalesced_mmio_zone +Returns: 0 on success, < 0 on error + +Coalesced I/O is a performance optimization that defers hardware +register write emulation so that userspace exits are avoided. It is +typically used to reduce the overhead of emulating frequently accessed +hardware registers. + +When a hardware register is configured for coalesced I/O, write accesses +do not exit to userspace and their value is recorded in a ring buffer +that is shared between kernel and userspace. + +Coalesced I/O is used if one or more write accesses to a hardware +register can be deferred until a read or a write to another hardware +register on the same device. This last access will cause a vmexit and +userspace will process accesses from the ring buffer before emulating +it. That will avoid exiting to userspace on repeated writes. + +Coalesced pio is based on coalesced mmio. There is little difference +between coalesced mmio and pio except that coalesced pio records accesses +to I/O ports. + 5. The kvm_run structure ------------------------ @@ -4522,7 +4599,7 @@ hpage module parameter is not set to 1, -EINVAL is returned. While it is generally possible to create a huge page backed VM without this capability, the VM will not be able to run. -7.14 KVM_CAP_MSR_PLATFORM_INFO +7.15 KVM_CAP_MSR_PLATFORM_INFO Architectures: x86 Parameters: args[0] whether feature should be enabled or not @@ -4531,6 +4608,45 @@ With this capability, a guest may read the MSR_PLATFORM_INFO MSR. Otherwise, a #GP would be raised when the guest tries to access. Currently, this capability does not enable write permissions of this MSR for the guest. +7.16 KVM_CAP_PPC_NESTED_HV + +Architectures: ppc +Parameters: none +Returns: 0 on success, -EINVAL when the implementation doesn't support + nested-HV virtualization. + +HV-KVM on POWER9 and later systems allows for "nested-HV" +virtualization, which provides a way for a guest VM to run guests that +can run using the CPU's supervisor mode (privileged non-hypervisor +state). Enabling this capability on a VM depends on the CPU having +the necessary functionality and on the facility being enabled with a +kvm-hv module parameter. + +7.17 KVM_CAP_EXCEPTION_PAYLOAD + +Architectures: x86 +Parameters: args[0] whether feature should be enabled or not + +With this capability enabled, CR2 will not be modified prior to the +emulated VM-exit when L1 intercepts a #PF exception that occurs in +L2. Similarly, for kvm-intel only, DR6 will not be modified prior to +the emulated VM-exit when L1 intercepts a #DB exception that occurs in +L2. As a result, when KVM_GET_VCPU_EVENTS reports a pending #PF (or +#DB) exception for L2, exception.has_payload will be set and the +faulting address (or the new DR6 bits*) will be reported in the +exception_payload field. Similarly, when userspace injects a #PF (or +#DB) into L2 using KVM_SET_VCPU_EVENTS, it is expected to set +exception.has_payload and to put the faulting address (or the new DR6 +bits*) in the exception_payload field. + +This capability also enables exception.pending in struct +kvm_vcpu_events, which allows userspace to distinguish between pending +and injected exceptions. + + +* For the new DR6 bits, note that bit 16 is set iff the #DB exception + will clear DR6.RTM. + 8. Other capabilities. ---------------------- @@ -4772,3 +4888,10 @@ CPU when the exception is taken. If this virtual SError is taken to EL1 using AArch64, this value will be reported in the ISS field of ESR_ELx. See KVM_CAP_VCPU_EVENTS for more details. +8.20 KVM_CAP_HYPERV_SEND_IPI + +Architectures: x86 + +This capability indicates that KVM supports paravirtualized Hyper-V IPI send +hypercalls: +HvCallSendSyntheticClusterIpi, HvCallSendSyntheticClusterIpiEx. diff --git a/MAINTAINERS b/MAINTAINERS @@ -12800,6 +12800,18 @@ W: http://www.ibm.com/developerworks/linux/linux390/ S: Supported F: drivers/s390/crypto/ +S390 VFIO AP DRIVER +M: Tony Krowiak <akrowiak@linux.ibm.com> +M: Pierre Morel <pmorel@linux.ibm.com> +M: Halil Pasic <pasic@linux.ibm.com> +L: linux-s390@vger.kernel.org +W: http://www.ibm.com/developerworks/linux/linux390/ +S: Supported +F: drivers/s390/crypto/vfio_ap_drv.c +F: drivers/s390/crypto/vfio_ap_private.h +F: drivers/s390/crypto/vfio_ap_ops.c +F: Documentation/s390/vfio-ap.txt + S390 ZFCP DRIVER M: Steffen Maier <maier@linux.ibm.com> M: Benjamin Block <bblock@linux.ibm.com> diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h @@ -133,8 +133,7 @@ * space. */ #define KVM_PHYS_SHIFT (40) -#define KVM_PHYS_SIZE (_AC(1, ULL) << KVM_PHYS_SHIFT) -#define KVM_PHYS_MASK (KVM_PHYS_SIZE - _AC(1, ULL)) + #define PTRS_PER_S2_PGD (_AC(1, ULL) << (KVM_PHYS_SHIFT - 30)) /* Virtualization Translation Control Register (VTCR) bits */ diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h @@ -273,7 +273,7 @@ static inline void __cpu_init_stage2(void) kvm_call_hyp(__init_stage2_translation); } -static inline int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext) +static inline int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext) { return 0; } @@ -354,4 +354,15 @@ static inline void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu) {} struct kvm *kvm_arch_alloc_vm(void); void kvm_arch_free_vm(struct kvm *kvm); +static inline int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) +{ + /* + * On 32bit ARM, VMs get a static 40bit IPA stage2 setup, + * so any non-zero value used as type is illegal. + */ + if (type) + return -EINVAL; + return 0; +} + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h @@ -35,16 +35,12 @@ addr; \ }) -/* - * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation levels. - */ -#define KVM_MMU_CACHE_MIN_PAGES 2 - #ifndef __ASSEMBLY__ #include <linux/highmem.h> #include <asm/cacheflush.h> #include <asm/cputype.h> +#include <asm/kvm_arm.h> #include <asm/kvm_hyp.h> #include <asm/pgalloc.h> #include <asm/stage2_pgtable.h> @@ -52,6 +48,13 @@ /* Ensure compatibility with arm64 */ #define VA_BITS 32 +#define kvm_phys_shift(kvm) KVM_PHYS_SHIFT +#define kvm_phys_size(kvm) (1ULL << kvm_phys_shift(kvm)) +#define kvm_phys_mask(kvm) (kvm_phys_size(kvm) - 1ULL) +#define kvm_vttbr_baddr_mask(kvm) VTTBR_BADDR_MASK + +#define stage2_pgd_size(kvm) (PTRS_PER_S2_PGD * sizeof(pgd_t)) + int create_hyp_mappings(void *from, void *to, pgprot_t prot); int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, void __iomem **kaddr, @@ -355,6 +358,8 @@ static inline int hyp_map_aux_data(void) #define kvm_phys_to_vttbr(addr) (addr) +static inline void kvm_set_ipa_limit(void) {} + static inline bool kvm_cpu_has_cnp(void) { return false; diff --git a/arch/arm/include/asm/stage2_pgtable.h b/arch/arm/include/asm/stage2_pgtable.h @@ -19,43 +19,53 @@ #ifndef __ARM_S2_PGTABLE_H_ #define __ARM_S2_PGTABLE_H_ -#define stage2_pgd_none(pgd) pgd_none(pgd) -#define stage2_pgd_clear(pgd) pgd_clear(pgd) -#define stage2_pgd_present(pgd) pgd_present(pgd) -#define stage2_pgd_populate(pgd, pud) pgd_populate(NULL, pgd, pud) -#define stage2_pud_offset(pgd, address) pud_offset(pgd, address) -#define stage2_pud_free(pud) pud_free(NULL, pud) - -#define stage2_pud_none(pud) pud_none(pud) -#define stage2_pud_clear(pud) pud_clear(pud) -#define stage2_pud_present(pud) pud_present(pud) -#define stage2_pud_populate(pud, pmd) pud_populate(NULL, pud, pmd) -#define stage2_pmd_offset(pud, address) pmd_offset(pud, address) -#define stage2_pmd_free(pmd) pmd_free(NULL, pmd) - -#define stage2_pud_huge(pud) pud_huge(pud) +/* + * kvm_mmu_cache_min_pages() is the number of pages required + * to install a stage-2 translation. We pre-allocate the entry + * level table at VM creation. Since we have a 3 level page-table, + * we need only two pages to add a new mapping. + */ +#define kvm_mmu_cache_min_pages(kvm) 2 + +#define stage2_pgd_none(kvm, pgd) pgd_none(pgd) +#define stage2_pgd_clear(kvm, pgd) pgd_clear(pgd) +#define stage2_pgd_present(kvm, pgd) pgd_present(pgd) +#define stage2_pgd_populate(kvm, pgd, pud) pgd_populate(NULL, pgd, pud) +#define stage2_pud_offset(kvm, pgd, address) pud_offset(pgd, address) +#define stage2_pud_free(kvm, pud) pud_free(NULL, pud) + +#define stage2_pud_none(kvm, pud) pud_none(pud) +#define stage2_pud_clear(kvm, pud) pud_clear(pud) +#define stage2_pud_present(kvm, pud) pud_present(pud) +#define stage2_pud_populate(kvm, pud, pmd) pud_populate(NULL, pud, pmd) +#define stage2_pmd_offset(kvm, pud, address) pmd_offset(pud, address) +#define stage2_pmd_free(kvm, pmd) pmd_free(NULL, pmd) + +#define stage2_pud_huge(kvm, pud) pud_huge(pud) /* Open coded p*d_addr_end that can deal with 64bit addresses */ -static inline phys_addr_t stage2_pgd_addr_end(phys_addr_t addr, phys_addr_t end) +static inline phys_addr_t +stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) { phys_addr_t boundary = (addr + PGDIR_SIZE) & PGDIR_MASK; return (boundary - 1 < end - 1) ? boundary : end; } -#define stage2_pud_addr_end(addr, end) (end) +#define stage2_pud_addr_end(kvm, addr, end) (end) -static inline phys_addr_t stage2_pmd_addr_end(phys_addr_t addr, phys_addr_t end) +static inline phys_addr_t +stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) { phys_addr_t boundary = (addr + PMD_SIZE) & PMD_MASK; return (boundary - 1 < end - 1) ? boundary : end; } -#define stage2_pgd_index(addr) pgd_index(addr) +#define stage2_pgd_index(kvm, addr) pgd_index(addr) -#define stage2_pte_table_empty(ptep) kvm_page_empty(ptep) -#define stage2_pmd_table_empty(pmdp) kvm_page_empty(pmdp) -#define stage2_pud_table_empty(pudp) false +#define stage2_pte_table_empty(kvm, ptep) kvm_page_empty(ptep) +#define stage2_pmd_table_empty(kvm, pmdp) kvm_page_empty(pmdp) +#define stage2_pud_table_empty(kvm, pudp) false #endif /* __ARM_S2_PGTABLE_H_ */ diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h @@ -537,6 +537,27 @@ static inline void arm64_set_ssbd_mitigation(bool state) {} #endif extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); + +static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange) +{ + switch (parange) { + case 0: return 32; + case 1: return 36; + case 2: return 40; + case 3: return 42; + case 4: return 44; + case 5: return 48; + case 6: return 52; + /* + * A future PE could use a value unknown to the kernel. + * However, by the "D10.1.4 Principles of the ID scheme + * for fields in ID registers", ARM DDI 0487C.a, any new + * value is guaranteed to be higher than what we know already. + * As a safe limit, we return the limit supported by the kernel. + */ + default: return CONFIG_ARM64_PA_BITS; + } +} #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h @@ -107,6 +107,7 @@ #define VTCR_EL2_RES1 (1 << 31) #define VTCR_EL2_HD (1 << 22) #define VTCR_EL2_HA (1 << 21) +#define VTCR_EL2_PS_SHIFT TCR_EL2_PS_SHIFT #define VTCR_EL2_PS_MASK TCR_EL2_PS_MASK #define VTCR_EL2_TG0_MASK TCR_TG0_MASK #define VTCR_EL2_TG0_4K TCR_TG0_4K @@ -120,63 +121,150 @@ #define VTCR_EL2_IRGN0_WBWA TCR_IRGN0_WBWA #define VTCR_EL2_SL0_SHIFT 6 #define VTCR_EL2_SL0_MASK (3 << VTCR_EL2_SL0_SHIFT) -#define VTCR_EL2_SL0_LVL1 (1 << VTCR_EL2_SL0_SHIFT) #define VTCR_EL2_T0SZ_MASK 0x3f -#define VTCR_EL2_T0SZ_40B 24 #define VTCR_EL2_VS_SHIFT 19 #define VTCR_EL2_VS_8BIT (0 << VTCR_EL2_VS_SHIFT) #define VTCR_EL2_VS_16BIT (1 << VTCR_EL2_VS_SHIFT) +#define VTCR_EL2_T0SZ(x) TCR_T0SZ(x) + /* * We configure the Stage-2 page tables to always restrict the IPA space to be * 40 bits wide (T0SZ = 24). Systems with a PARange smaller than 40 bits are * not known to exist and will break with this configuration. * - * VTCR_EL2.PS is extracted from ID_AA64MMFR0_EL1.PARange at boot time - * (see hyp-init.S). + * The VTCR_EL2 is configured per VM and is initialised in kvm_arm_setup_stage2(). * * Note that when using 4K pages, we concatenate two first level page tables * together. With 16K pages, we concatenate 16 first level page tables. * - * The magic numbers used for VTTBR_X in this patch can be found in Tables - * D4-23 and D4-25 in ARM DDI 0487A.b. */ -#define VTCR_EL2_T0SZ_IPA VTCR_EL2_T0SZ_40B #define VTCR_EL2_COMMON_BITS (VTCR_EL2_SH0_INNER | VTCR_EL2_ORGN0_WBWA | \ VTCR_EL2_IRGN0_WBWA | VTCR_EL2_RES1) -#ifdef CONFIG_ARM64_64K_PAGES /* - * Stage2 translation configuration: - * 64kB pages (TG0 = 1) - * 2 level page tables (SL = 1) + * VTCR_EL2:SL0 indicates the entry level for Stage2 translation. + * Interestingly, it depends on the page size. + * See D.10.2.121, VTCR_EL2, in ARM DDI 0487C.a + * + * ----------------------------------------- + * | Entry level | 4K | 16K/64K | + * ------------------------------------------ + * | Level: 0 | 2 | - | + * ------------------------------------------ + * | Level: 1 | 1 | 2 | + * ------------------------------------------ + * | Level: 2 | 0 | 1 | + * ------------------------------------------ + * | Level: 3 | - | 0 | + * ------------------------------------------ + * + * The table roughly translates to : + * + * SL0(PAGE_SIZE, Entry_level) = TGRAN_SL0_BASE - Entry_Level + * + * Where TGRAN_SL0_BASE is a magic number depending on the page size: + * TGRAN_SL0_BASE(4K) = 2 + * TGRAN_SL0_BASE(16K) = 3 + * TGRAN_SL0_BASE(64K) = 3 + * provided we take care of ruling out the unsupported cases and + * Entry_Level = 4 - Number_of_levels. + * */ -#define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_64K | VTCR_EL2_SL0_LVL1) -#define VTTBR_X_TGRAN_MAGIC 38 +#ifdef CONFIG_ARM64_64K_PAGES + +#define VTCR_EL2_TGRAN VTCR_EL2_TG0_64K +#define VTCR_EL2_TGRAN_SL0_BASE 3UL + #elif defined(CONFIG_ARM64_16K_PAGES) -/* - * Stage2 translation configuration: - * 16kB pages (TG0 = 2) - * 2 level page tables (SL = 1) - */ -#define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_16K | VTCR_EL2_SL0_LVL1) -#define VTTBR_X_TGRAN_MAGIC 42 + +#define VTCR_EL2_TGRAN VTCR_EL2_TG0_16K +#define VTCR_EL2_TGRAN_SL0_BASE 3UL + #else /* 4K */ -/* - * Stage2 translation configuration: - * 4kB pages (TG0 = 0) - * 3 level page tables (SL = 1) - */ -#define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_4K | VTCR_EL2_SL0_LVL1) -#define VTTBR_X_TGRAN_MAGIC 37 + +#define VTCR_EL2_TGRAN VTCR_EL2_TG0_4K +#define VTCR_EL2_TGRAN_SL0_BASE 2UL + #endif -#define VTCR_EL2_FLAGS (VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN_FLAGS) -#define VTTBR_X (VTTBR_X_TGRAN_MAGIC - VTCR_EL2_T0SZ_IPA) +#define VTCR_EL2_LVLS_TO_SL0(levels) \ + ((VTCR_EL2_TGRAN_SL0_BASE - (4 - (levels))) << VTCR_EL2_SL0_SHIFT) +#define VTCR_EL2_SL0_TO_LVLS(sl0) \ + ((sl0) + 4 - VTCR_EL2_TGRAN_SL0_BASE) +#define VTCR_EL2_LVLS(vtcr) \ + VTCR_EL2_SL0_TO_LVLS(((vtcr) & VTCR_EL2_SL0_MASK) >> VTCR_EL2_SL0_SHIFT) + +#define VTCR_EL2_FLAGS (VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN) +#define VTCR_EL2_IPA(vtcr) (64 - ((vtcr) & VTCR_EL2_T0SZ_MASK)) + +/* + * ARM VMSAv8-64 defines an algorithm for finding the translation table + * descriptors in section D4.2.8 in ARM DDI 0487C.a. + * + * The algorithm defines the expectations on the translation table + * addresses for each level, based on PAGE_SIZE, entry level + * and the translation table size (T0SZ). The variable "x" in the + * algorithm determines the alignment of a table base address at a given + * level and thus determines the alignment of VTTBR:BADDR for stage2 + * page table entry level. + * Since the number of bits resolved at the entry level could vary + * depending on the T0SZ, the value of "x" is defined based on a + * Magic constant for a given PAGE_SIZE and Entry Level. The + * intermediate levels must be always aligned to the PAGE_SIZE (i.e, + * x = PAGE_SHIFT). + * + * The value of "x" for entry level is calculated as : + * x = Magic_N - T0SZ + * + * where Magic_N is an integer depending on the page size and the entry + * level of the page table as below: + * + * -------------------------------------------- + * | Entry level | 4K 16K 64K | + * -------------------------------------------- + * | Level: 0 (4 levels) | 28 | - | - | + * -------------------------------------------- + * | Level: 1 (3 levels) | 37 | 31 | 25 | + * -------------------------------------------- + * | Level: 2 (2 levels) | 46 | 42 | 38 | + * -------------------------------------------- + * | Level: 3 (1 level) | - | 53 | 51 | + * -------------------------------------------- + * + * We have a magic formula for the Magic_N below: + * + * Magic_N(PAGE_SIZE, Level) = 64 - ((PAGE_SHIFT - 3) * Number_of_levels) + * + * where Number_of_levels = (4 - Level). We are only interested in the + * value for Entry_Level for the stage2 page table. + * + * So, given that T0SZ = (64 - IPA_SHIFT), we can compute 'x' as follows: + * + * x = (64 - ((PAGE_SHIFT - 3) * Number_of_levels)) - (64 - IPA_SHIFT) + * = IPA_SHIFT - ((PAGE_SHIFT - 3) * Number of levels) + * + * Here is one way to explain the Magic Formula: + * + * x = log2(Size_of_Entry_Level_Table) + * + * Since, we can resolve (PAGE_SHIFT - 3) bits at each level, and another + * PAGE_SHIFT bits in the PTE, we have : + * + * Bits_Entry_level = IPA_SHIFT - ((PAGE_SHIFT - 3) * (n - 1) + PAGE_SHIFT) + * = IPA_SHIFT - (PAGE_SHIFT - 3) * n - 3 + * where n = number of levels, and since each pointer is 8bytes, we have: + * + * x = Bits_Entry_Level + 3 + * = IPA_SHIFT - (PAGE_SHIFT - 3) * n + * + * The only constraint here is that, we have to find the number of page table + * levels for a given IPA size (which we do, see stage2_pt_levels()) + */ +#define ARM64_VTTBR_X(ipa, levels) ((ipa) - ((levels) * (PAGE_SHIFT - 3))) #define VTTBR_CNP_BIT (UL(1)) -#define VTTBR_BADDR_MASK (((UL(1) << (PHYS_MASK_SHIFT - VTTBR_X)) - 1) << VTTBR_X) #define VTTBR_VMID_SHIFT (UL(48)) #define VTTBR_VMID_MASK(size) (_AT(u64, (1 << size) - 1) << VTTBR_VMID_SHIFT) @@ -224,6 +312,13 @@ /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */ #define HPFAR_MASK (~UL(0xf)) +/* + * We have + * PAR [PA_Shift - 1 : 12] = PA [PA_Shift - 1 : 12] + * HPFAR [PA_Shift - 9 : 4] = FIPA [PA_Shift - 1 : 12] + */ +#define PAR_TO_HPFAR(par) \ + (((par) & GENMASK_ULL(PHYS_MASK_SHIFT - 1, 12)) >> 8) #define kvm_arm_exception_type \ {0, "IRQ" }, \ diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h @@ -30,6 +30,7 @@ #define ARM_EXCEPTION_IRQ 0 #define ARM_EXCEPTION_EL1_SERROR 1 #define ARM_EXCEPTION_TRAP 2 +#define ARM_EXCEPTION_IL 3 /* The hyp-stub will return this for any kvm_call_hyp() call */ #define ARM_EXCEPTION_HYP_GONE HVC_STUB_ERR @@ -72,8 +73,6 @@ extern void __vgic_v3_init_lrs(void); extern u32 __kvm_get_mdcr_el2(void); -extern u32 __init_stage2_translation(void); - /* Home-grown __this_cpu_{ptr,read} variants that always work at HYP */ #define __hyp_this_cpu_ptr(sym) \ ({ \ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h @@ -53,7 +53,7 @@ DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use); int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); -int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext); +int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext); void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start); struct kvm_arch { @@ -61,11 +61,13 @@ struct kvm_arch { u64 vmid_gen; u32 vmid; - /* 1-level 2nd stage table, protected by kvm->mmu_lock */ + /* stage2 entry level table */ pgd_t *pgd; /* VTTBR value associated with above pgd and vmid */ u64 vttbr; + /* VTCR_EL2 value for this VM */ + u64 vtcr; /* The last vcpu id that ran on each physical CPU */ int __percpu *last_vcpu_ran; @@ -451,13 +453,7 @@ int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); -static inline void __cpu_init_stage2(void) -{ - u32 parange = kvm_call_hyp(__init_stage2_translation); - - WARN_ONCE(parange < 40, - "PARange is %d bits, unsupported configuration!", parange); -} +static inline void __cpu_init_stage2(void) {} /* Guest/host FPSIMD coordination helpers */ int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu); @@ -520,8 +516,12 @@ static inline int kvm_arm_have_ssbd(void) void kvm_vcpu_load_sysregs(struct kvm_vcpu *vcpu); void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu); +void kvm_set_ipa_limit(void); + #define __KVM_HAVE_ARCH_VM_ALLOC struct kvm *kvm_arch_alloc_vm(void); void kvm_arch_free_vm(struct kvm *kvm); +int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type); + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h @@ -155,5 +155,15 @@ void deactivate_traps_vhe_put(void); u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt); void __noreturn __hyp_do_panic(unsigned long, ...); +/* + * Must be called from hyp code running at EL2 with an updated VTTBR + * and interrupts disabled. + */ +static __always_inline void __hyp_text __load_guest_stage2(struct kvm *kvm) +{ + write_sysreg(kvm->arch.vtcr, vtcr_el2); + write_sysreg(kvm->arch.vttbr, vttbr_el2); +} + #endif /* __ARM64_KVM_HYP_H__ */ diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h @@ -141,8 +141,16 @@ static inline unsigned long __kern_hyp_va(unsigned long v) * We currently only support a 40bit IPA. */ #define KVM_PHYS_SHIFT (40) -#define KVM_PHYS_SIZE (1UL << KVM_PHYS_SHIFT) -#define KVM_PHYS_MASK (KVM_PHYS_SIZE - 1UL) + +#define kvm_phys_shift(kvm) VTCR_EL2_IPA(kvm->arch.vtcr) +#define kvm_phys_size(kvm) (_AC(1, ULL) << kvm_phys_shift(kvm)) +#define kvm_phys_mask(kvm) (kvm_phys_size(kvm) - _AC(1, ULL)) + +static inline bool kvm_page_empty(void *ptr) +{ + struct page *ptr_page = virt_to_page(ptr); + return page_count(ptr_page) == 1; +} #include <asm/stage2_pgtable.h> @@ -238,12 +246,6 @@ static inline bool kvm_s2pmd_exec(pmd_t *pmdp) return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN); } -static inline bool kvm_page_empty(void *ptr) -{ - struct page *ptr_page = virt_to_page(ptr); - return page_count(ptr_page) == 1; -} - #define hyp_pte_table_empty(ptep) kvm_page_empty(ptep) #ifdef __PAGETABLE_PMD_FOLDED @@ -517,6 +519,30 @@ static inline int hyp_map_aux_data(void) #define kvm_phys_to_vttbr(addr) phys_to_ttbr(addr) +/* + * Get the magic number 'x' for VTTBR:BADDR of this KVM instance. + * With v8.2 LVA extensions, 'x' should be a minimum of 6 with + * 52bit IPS. + */ +static inline int arm64_vttbr_x(u32 ipa_shift, u32 levels) +{ + int x = ARM64_VTTBR_X(ipa_shift, levels); + + return (IS_ENABLED(CONFIG_ARM64_PA_BITS_52) && x < 6) ? 6 : x; +} + +static inline u64 vttbr_baddr_mask(u32 ipa_shift, u32 levels) +{ + unsigned int x = arm64_vttbr_x(ipa_shift, levels); + + return GENMASK_ULL(PHYS_MASK_SHIFT - 1, x); +} + +static inline u64 kvm_vttbr_baddr_mask(struct kvm *kvm) +{ + return vttbr_baddr_mask(kvm_phys_shift(kvm), kvm_stage2_levels(kvm)); +} + static inline bool kvm_cpu_has_cnp(void) { return system_supports_cnp(); diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h @@ -25,6 +25,9 @@ #define CurrentEL_EL1 (1 << 2) #define CurrentEL_EL2 (2 << 2) +/* Additional SPSR bits not exposed in the UABI */ +#define PSR_IL_BIT (1 << 20) + /* AArch32-specific ptrace requests */ #define COMPAT_PTRACE_GETREGS 12 #define COMPAT_PTRACE_SETREGS 13 diff --git a/arch/arm64/include/asm/stage2_pgtable-nopmd.h b/arch/arm64/include/asm/stage2_pgtable-nopmd.h @@ -1,42 +0,0 @@ -/* - * Copyright (C) 2016 - ARM Ltd - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#ifndef __ARM64_S2_PGTABLE_NOPMD_H_ -#define __ARM64_S2_PGTABLE_NOPMD_H_ - -#include <asm/stage2_pgtable-nopud.h> - -#define __S2_PGTABLE_PMD_FOLDED - -#define S2_PMD_SHIFT S2_PUD_SHIFT -#define S2_PTRS_PER_PMD 1 -#define S2_PMD_SIZE (1UL << S2_PMD_SHIFT) -#define S2_PMD_MASK (~(S2_PMD_SIZE-1)) - -#define stage2_pud_none(pud) (0) -#define stage2_pud_present(pud) (1) -#define stage2_pud_clear(pud) do { } while (0) -#define stage2_pud_populate(pud, pmd) do { } while (0) -#define stage2_pmd_offset(pud, address) ((pmd_t *)(pud)) - -#define stage2_pmd_free(pmd) do { } while (0) - -#define stage2_pmd_addr_end(addr, end) (end) - -#define stage2_pud_huge(pud) (0) -#define stage2_pmd_table_empty(pmdp) (0) - -#endif diff --git a/arch/arm64/include/asm/stage2_pgtable-nopud.h b/arch/arm64/include/asm/stage2_pgtable-nopud.h @@ -1,39 +0,0 @@ -/* - * Copyright (C) 2016 - ARM Ltd - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#ifndef __ARM64_S2_PGTABLE_NOPUD_H_ -#define __ARM64_S2_PGTABLE_NOPUD_H_ - -#define __S2_PGTABLE_PUD_FOLDED - -#define S2_PUD_SHIFT S2_PGDIR_SHIFT -#define S2_PTRS_PER_PUD 1 -#define S2_PUD_SIZE (_AC(1, UL) << S2_PUD_SHIFT) -#define S2_PUD_MASK (~(S2_PUD_SIZE-1)) - -#define stage2_pgd_none(pgd) (0) -#define stage2_pgd_present(pgd) (1) -#define stage2_pgd_clear(pgd) do { } while (0) -#define stage2_pgd_populate(pgd, pud) do { } while (0) - -#define stage2_pud_offset(pgd, address) ((pud_t *)(pgd)) - -#define stage2_pud_free(x) do { } while (0) - -#define stage2_pud_addr_end(addr, end) (end) -#define stage2_pud_table_empty(pmdp) (0) - -#endif diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h @@ -19,9 +19,17 @@ #ifndef __ARM64_S2_PGTABLE_H_ #define __ARM64_S2_PGTABLE_H_ +#include <linux/hugetlb.h> #include <asm/pgtable.h> /* + * PGDIR_SHIFT determines the size a top-level page table entry can map + * and depends on the number of levels in the page table. Compute the + * PGDIR_SHIFT for a given number of levels. + */ +#define pt_levels_pgdir_shift(lvls) ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - (lvls)) + +/* * The hardware supports concatenation of up to 16 tables at stage2 entry level * and we use the feature whenever possible. * @@ -29,112 +37,208 @@ * On arm64, the smallest PAGE_SIZE supported is 4k, which means * (PAGE_SHIFT - 3) > 4 holds for all page sizes. * This implies, the total number of page table levels at stage2 expected - * by the hardware is actually the number of levels required for (KVM_PHYS_SHIFT - 4) + * by the hardware is actually the number of levels required for (IPA_SHIFT - 4) * in normal translations(e.g, stage1), since we cannot have another level in - * the range (KVM_PHYS_SHIFT, KVM_PHYS_SHIFT - 4). + * the range (IPA_SHIFT, IPA_SHIFT - 4). */ -#define STAGE2_PGTABLE_LEVELS ARM64_HW_PGTABLE_LEVELS(KVM_PHYS_SHIFT - 4) +#define stage2_pgtable_levels(ipa) ARM64_HW_PGTABLE_LEVELS((ipa) - 4) +#define kvm_stage2_levels(kvm) VTCR_EL2_LVLS(kvm->arch.vtcr) -/* - * With all the supported VA_BITs and 40bit guest IPA, the following condition - * is always true: - * - * STAGE2_PGTABLE_LEVELS <= CONFIG_PGTABLE_LEVELS - * - * We base our stage-2 page table walker helpers on this assumption and - * fall back to using the host version of the helper wherever possible. - * i.e, if a particular level is not folded (e.g, PUD) at stage2, we fall back - * to using the host version, since it is guaranteed it is not folded at host. - * - * If the condition breaks in the future, we can rearrange the host level - * definitions and reuse them for stage2. Till then... - */ -#if STAGE2_PGTABLE_LEVELS > CONFIG_PGTABLE_LEVELS -#error "Unsupported combination of guest IPA and host VA_BITS." -#endif - -/* S2_PGDIR_SHIFT is the size mapped by top-level stage2 entry */ -#define S2_PGDIR_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - STAGE2_PGTABLE_LEVELS) -#define S2_PGDIR_SIZE (_AC(1, UL) << S2_PGDIR_SHIFT) -#define S2_PGDIR_MASK (~(S2_PGDIR_SIZE - 1)) +/* stage2_pgdir_shift() is the size mapped by top-level stage2 entry for the VM */ +#define stage2_pgdir_shift(kvm) pt_levels_pgdir_shift(kvm_stage2_levels(kvm)) +#define stage2_pgdir_size(kvm) (1ULL << stage2_pgdir_shift(kvm)) +#define stage2_pgdir_mask(kvm) ~(stage2_pgdir_size(kvm) - 1) /* * The number of PTRS across all concatenated stage2 tables given by the * number of bits resolved at the initial level. + * If we force more levels than necessary, we may have (stage2_pgdir_shift > IPA), + * in which case, stage2_pgd_ptrs will have one entry. */ -#define PTRS_PER_S2_PGD (1 << (KVM_PHYS_SHIFT - S2_PGDIR_SHIFT)) +#define pgd_ptrs_shift(ipa, pgdir_shift) \ + ((ipa) > (pgdir_shift) ? ((ipa) - (pgdir_shift)) : 0) +#define __s2_pgd_ptrs(ipa, lvls) \ + (1 << (pgd_ptrs_shift((ipa), pt_levels_pgdir_shift(lvls)))) +#define __s2_pgd_size(ipa, lvls) (__s2_pgd_ptrs((ipa), (lvls)) * sizeof(pgd_t)) + +#define stage2_pgd_ptrs(kvm) __s2_pgd_ptrs(kvm_phys_shift(kvm), kvm_stage2_levels(kvm)) +#define stage2_pgd_size(kvm) __s2_pgd_size(kvm_phys_shift(kvm), kvm_stage2_levels(kvm)) /* - * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation - * levels in addition to the PGD. + * kvm_mmmu_cache_min_pages() is the number of pages required to install + * a stage-2 translation. We pre-allocate the entry level page table at + * the VM creation. */ -#define KVM_MMU_CACHE_MIN_PAGES (STAGE2_PGTABLE_LEVELS - 1) +#define kvm_mmu_cache_min_pages(kvm) (kvm_stage2_levels(kvm) - 1) - -#if STAGE2_PGTABLE_LEVELS > 3 +/* Stage2 PUD definitions when the level is present */ +static inline bool kvm_stage2_has_pud(struct kvm *kvm) +{ + return (CONFIG_PGTABLE_LEVELS > 3) && (kvm_stage2_levels(kvm) > 3); +} #define S2_PUD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(1) -#define S2_PUD_SIZE (_AC(1, UL) << S2_PUD_SHIFT) +#define S2_PUD_SIZE (1UL << S2_PUD_SHIFT) #define S2_PUD_MASK (~(S2_PUD_SIZE - 1)) -#define stage2_pgd_none(pgd) pgd_none(pgd) -#define stage2_pgd_clear(pgd) pgd_clear(pgd) -#define stage2_pgd_present(pgd) pgd_present(pgd) -#define stage2_pgd_populate(pgd, pud) pgd_populate(NULL, pgd, pud) -#define stage2_pud_offset(pgd, address) pud_offset(pgd, address) -#define stage2_pud_free(pud) pud_free(NULL, pud) +static inline bool stage2_pgd_none(struct kvm *kvm, pgd_t pgd) +{ + if (kvm_stage2_has_pud(kvm)) + return pgd_none(pgd); + else + return 0; +} -#define stage2_pud_table_empty(pudp) kvm_page_empty(pudp) +static inline void stage2_pgd_clear(struct kvm *kvm, pgd_t *pgdp) +{ + if (kvm_stage2_has_pud(kvm)) + pgd_clear(pgdp); +} -static inline phys_addr_t stage2_pud_addr_end(phys_addr_t addr, phys_addr_t end) +static inline bool stage2_pgd_present(struct kvm *kvm, pgd_t pgd) { - phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK; + if (kvm_stage2_has_pud(kvm)) + return pgd_present(pgd); + else + return 1; +} - return (boundary - 1 < end - 1) ? boundary : end; +static inline void stage2_pgd_populate(struct kvm *kvm, pgd_t *pgd, pud_t *pud) +{ + if (kvm_stage2_has_pud(kvm)) + pgd_populate(NULL, pgd, pud); +} + +static inline pud_t *stage2_pud_offset(struct kvm *kvm, + pgd_t *pgd, unsigned long address) +{ + if (kvm_stage2_has_pud(kvm)) + return pud_offset(pgd, address); + else + return (pud_t *)pgd; } -#endif /* STAGE2_PGTABLE_LEVELS > 3 */ +static inline void stage2_pud_free(struct kvm *kvm, pud_t *pud) +{ + if (kvm_stage2_has_pud(kvm)) + pud_free(NULL, pud); +} +static inline bool stage2_pud_table_empty(struct kvm *kvm, pud_t *pudp) +{ + if (kvm_stage2_has_pud(kvm)) + return kvm_page_empty(pudp); + else + return false; +} -#if STAGE2_PGTABLE_LEVELS > 2 +static inline phys_addr_t +stage2_pud_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) +{ + if (kvm_stage2_has_pud(kvm)) { + phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK; + + return (boundary - 1 < end - 1) ? boundary : end; + } else { + return end; + } +} + +/* Stage2 PMD definitions when the level is present */ +static inline bool kvm_stage2_has_pmd(struct kvm *kvm) +{ + return (CONFIG_PGTABLE_LEVELS > 2) && (kvm_stage2_levels(kvm) > 2); +} #define S2_PMD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(2) -#define S2_PMD_SIZE (_AC(1, UL) << S2_PMD_SHIFT) +#define S2_PMD_SIZE (1UL << S2_PMD_SHIFT) #define S2_PMD_MASK (~(S2_PMD_SIZE - 1)) -#define stage2_pud_none(pud) pud_none(pud) -#define stage2_pud_clear(pud) pud_clear(pud) -#define stage2_pud_present(pud) pud_present(pud) -#define stage2_pud_populate(pud, pmd) pud_populate(NULL, pud, pmd) -#define stage2_pmd_offset(pud, address) pmd_offset(pud, address) -#define stage2_pmd_free(pmd) pmd_free(NULL, pmd) +static inline bool stage2_pud_none(struct kvm *kvm, pud_t pud) +{ + if (kvm_stage2_has_pmd(kvm)) + return pud_none(pud); + else + return 0; +} + +static inline void stage2_pud_clear(struct kvm *kvm, pud_t *pud) +{ + if (kvm_stage2_has_pmd(kvm)) + pud_clear(pud); +} -#define stage2_pud_huge(pud) pud_huge(pud) -#define stage2_pmd_table_empty(pmdp) kvm_page_empty(pmdp) +static inline bool stage2_pud_present(struct kvm *kvm, pud_t pud) +{ + if (kvm_stage2_has_pmd(kvm)) + return pud_present(pud); + else + return 1; +} -static inline phys_addr_t stage2_pmd_addr_end(phys_addr_t addr, phys_addr_t end) +static inline void stage2_pud_populate(struct kvm *kvm, pud_t *pud, pmd_t *pmd) { - phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK; + if (kvm_stage2_has_pmd(kvm)) + pud_populate(NULL, pud, pmd); +} - return (boundary - 1 < end - 1) ? boundary : end; +static inline pmd_t *stage2_pmd_offset(struct kvm *kvm, + pud_t *pud, unsigned long address) +{ + if (kvm_stage2_has_pmd(kvm)) + return pmd_offset(pud, address); + else + return (pmd_t *)pud; } -#endif /* STAGE2_PGTABLE_LEVELS > 2 */ +static inline void stage2_pmd_free(struct kvm *kvm, pmd_t *pmd) +{ + if (kvm_stage2_has_pmd(kvm)) + pmd_free(NULL, pmd); +} + +static inline bool stage2_pud_huge(struct kvm *kvm, pud_t pud) +{ + if (kvm_stage2_has_pmd(kvm)) + return pud_huge(pud); + else + return 0; +} + +static inline bool stage2_pmd_table_empty(struct kvm *kvm, pmd_t *pmdp) +{ + if (kvm_stage2_has_pmd(kvm)) + return kvm_page_empty(pmdp); + else + return 0; +} -#define stage2_pte_table_empty(ptep) kvm_page_empty(ptep) +static inline phys_addr_t +stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) +{ + if (kvm_stage2_has_pmd(kvm)) { + phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK; -#if STAGE2_PGTABLE_LEVELS == 2 -#include <asm/stage2_pgtable-nopmd.h> -#elif STAGE2_PGTABLE_LEVELS == 3 -#include <asm/stage2_pgtable-nopud.h> -#endif + return (boundary - 1 < end - 1) ? boundary : end; + } else { + return end; + } +} +static inline bool stage2_pte_table_empty(struct kvm *kvm, pte_t *ptep) +{ + return kvm_page_empty(ptep); +} -#define stage2_pgd_index(addr) (((addr) >> S2_PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1)) +static inline unsigned long stage2_pgd_index(struct kvm *kvm, phys_addr_t addr) +{ + return (((addr) >> stage2_pgdir_shift(kvm)) & (stage2_pgd_ptrs(kvm) - 1)); +} -static inline phys_addr_t stage2_pgd_addr_end(phys_addr_t addr, phys_addr_t end) +static inline phys_addr_t +stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) { - phys_addr_t boundary = (addr + S2_PGDIR_SIZE) & S2_PGDIR_MASK; + phys_addr_t boundary = (addr + stage2_pgdir_size(kvm)) & stage2_pgdir_mask(kvm); return (boundary - 1 < end - 1) ? boundary : end; } diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c @@ -391,15 +391,15 @@ int __attribute_const__ kvm_target_cpu(void) return KVM_ARM_TARGET_CORTEX_A53; case ARM_CPU_PART_CORTEX_A57: return KVM_ARM_TARGET_CORTEX_A57; - }; + } break; case ARM_CPU_IMP_APM: switch (part_number) { case APM_CPU_PART_POTENZA: return KVM_ARM_TARGET_XGENE_POTENZA; - }; + } break; - }; + } /* Return a default generic target */ return KVM_ARM_TARGET_GENERIC_V8; diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c @@ -284,6 +284,13 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, */ run->exit_reason = KVM_EXIT_FAIL_ENTRY; return 0; + case ARM_EXCEPTION_IL: + /* + * We attempted an illegal exception return. Guest state must + * have been corrupted somehow. Give up. + */ + run->exit_reason = KVM_EXIT_FAIL_ENTRY; + return -EINVAL; default: kvm_pr_unimpl("Unsupported exception type: %d", exception_index); diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile @@ -19,7 +19,6 @@ obj-$(CONFIG_KVM_ARM_HOST) += switch.o obj-$(CONFIG_KVM_ARM_HOST) += fpsimd.o obj-$(CONFIG_KVM_ARM_HOST) += tlb.o obj-$(CONFIG_KVM_ARM_HOST) += hyp-entry.o -obj-$(CONFIG_KVM_ARM_HOST) += s2-setup.o # KVM code is run at a different exception code with a different map, so # compiler instrumentation that inserts callbacks or checks into the code may diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S @@ -162,6 +162,20 @@ el1_error: mov x0, #ARM_EXCEPTION_EL1_SERROR b __guest_exit +el2_sync: + /* Check for illegal exception return, otherwise panic */ + mrs x0, spsr_el2 + + /* if this was something else, then panic! */ + tst x0, #PSR_IL_BIT + b.eq __hyp_panic + + /* Let's attempt a recovery from the illegal exception return */ + get_vcpu_ptr x1, x0 + mov x0, #ARM_EXCEPTION_IL + b __guest_exit + + el2_error: ldp x0, x1, [sp], #16 @@ -240,7 +254,7 @@ ENTRY(__kvm_hyp_vector) invalid_vect el2t_fiq_invalid // FIQ EL2t invalid_vect el2t_error_invalid // Error EL2t - invalid_vect el2h_sync_invalid // Synchronous EL2h + valid_vect el2_sync // Synchronous EL2h invalid_vect el2h_irq_invalid // IRQ EL2h invalid_vect el2h_fiq_invalid // FIQ EL2h valid_vect el2_error // Error EL2h diff --git a/arch/arm64/kvm/hyp/s2-setup.c b/arch/arm64/kvm/hyp/s2-setup.c @@ -1,90 +0,0 @@ -/* - * Copyright (C) 2016 - ARM Ltd - * Author: Marc Zyngier <marc.zyngier@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <linux/types.h> -#include <asm/kvm_arm.h> -#include <asm/kvm_asm.h> -#include <asm/kvm_hyp.h> - -u32 __hyp_text __init_stage2_translation(void) -{ - u64 val = VTCR_EL2_FLAGS; - u64 parange; - u64 tmp; - - /* - * Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS - * bits in VTCR_EL2. Amusingly, the PARange is 4 bits, while - * PS is only 3. Fortunately, bit 19 is RES0 in VTCR_EL2... - */ - parange = read_sysreg(id_aa64mmfr0_el1) & 7; - if (parange > ID_AA64MMFR0_PARANGE_MAX) - parange = ID_AA64MMFR0_PARANGE_MAX; - val |= parange << 16; - - /* Compute the actual PARange... */ - switch (parange) { - case 0: - parange = 32; - break; - case 1: - parange = 36; - break; - case 2: - parange = 40; - break; - case 3: - parange = 42; - break; - case 4: - parange = 44; - break; - case 5: - default: - parange = 48; - break; - } - - /* - * ... and clamp it to 40 bits, unless we have some braindead - * HW that implements less than that. In all cases, we'll - * return that value for the rest of the kernel to decide what - * to do. - */ - val |= 64 - (parange > 40 ? 40 : parange); - - /* - * Check the availability of Hardware Access Flag / Dirty Bit - * Management in ID_AA64MMFR1_EL1 and enable the feature in VTCR_EL2. - */ - tmp = (read_sysreg(id_aa64mmfr1_el1) >> ID_AA64MMFR1_HADBS_SHIFT) & 0xf; - if (tmp) - val |= VTCR_EL2_HA; - - /* - * Read the VMIDBits bits from ID_AA64MMFR1_EL1 and set the VS - * bit in VTCR_EL2. - */ - tmp = (read_sysreg(id_aa64mmfr1_el1) >> ID_AA64MMFR1_VMIDBITS_SHIFT) & 0xf; - val |= (tmp == ID_AA64MMFR1_VMIDBITS_16) ? - VTCR_EL2_VS_16BIT : - VTCR_EL2_VS_8BIT; - - write_sysreg(val, vtcr_el2); - - return parange; -} diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c @@ -198,7 +198,7 @@ void deactivate_traps_vhe_put(void) static void __hyp_text __activate_vm(struct kvm *kvm) { - write_sysreg(kvm->arch.vttbr, vttbr_el2); + __load_guest_stage2(kvm); } static void __hyp_text __deactivate_vm(struct kvm_vcpu *vcpu) @@ -263,7 +263,7 @@ static bool __hyp_text __translate_far_to_hpfar(u64 far, u64 *hpfar) return false; /* Translation failed, back to guest */ /* Convert PAR to HPFAR format */ - *hpfar = ((tmp >> 12) & ((1UL << 36) - 1)) << 4; + *hpfar = PAR_TO_HPFAR(tmp); return true; } diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c @@ -152,8 +152,25 @@ static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt) static void __hyp_text __sysreg_restore_el2_return_state(struct kvm_cpu_context *ctxt) { + u64 pstate = ctxt->gp_regs.regs.pstate; + u64 mode = pstate & PSR_AA32_MODE_MASK; + + /* + * Safety check to ensure we're setting the CPU up to enter the guest + * in a less privileged mode. + * + * If we are attempting a return to EL2 or higher in AArch64 state, + * program SPSR_EL2 with M=EL2h and the IL bit set which ensures that + * we'll take an illegal exception state exception immediately after + * the ERET to the guest. Attempts to return to AArch32 Hyp will + * result in an illegal exception return because EL2's execution state + * is determined by SCR_EL3.RW. + */ + if (!(mode & PSR_MODE32_BIT) && mode >= PSR_MODE_EL2t) + pstate = PSR_MODE_EL2h | PSR_IL_BIT; + write_sysreg_el2(ctxt->gp_regs.regs.pc, elr); - write_sysreg_el2(ctxt->gp_regs.regs.pstate, spsr); + write_sysreg_el2(pstate, spsr); if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN)) write_sysreg_s(ctxt->sys_regs[DISR_EL1], SYS_VDISR_EL2); diff --git a/arch/arm64/kvm/hyp/tlb.c b/arch/arm64/kvm/hyp/tlb.c @@ -30,7 +30,7 @@ static void __hyp_text __tlb_switch_to_guest_vhe(struct kvm *kvm) * bits. Changing E2H is impossible (goodbye TTBR1_EL2), so * let's flip TGE before executing the TLB operation. */ - write_sysreg(kvm->arch.vttbr, vttbr_el2); + __load_guest_stage2(kvm); val = read_sysreg(hcr_el2); val &= ~HCR_TGE; write_sysreg(val, hcr_el2); @@ -39,7 +39,7 @@ static void __hyp_text __tlb_switch_to_guest_vhe(struct kvm *kvm) static void __hyp_text __tlb_switch_to_guest_nvhe(struct kvm *kvm) { - write_sysreg(kvm->arch.vttbr, vttbr_el2); + __load_guest_stage2(kvm); isb(); } diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c @@ -26,6 +26,7 @@ #include <kvm/arm_arch_timer.h> +#include <asm/cpufeature.h> #include <asm/cputype.h> #include <asm/ptrace.h> #include <asm/kvm_arm.h> @@ -33,6 +34,9 @@ #include <asm/kvm_coproc.h> #include <asm/kvm_mmu.h> +/* Maximum phys_shift supported for any VM on this host */ +static u32 kvm_ipa_limit; + /* * ARMv8 Reset Values */ @@ -55,12 +59,12 @@ static bool cpu_has_32bit_el1(void) } /** - * kvm_arch_dev_ioctl_check_extension + * kvm_arch_vm_ioctl_check_extension * * We currently assume that the number of HW registers is uniform * across all CPUs (see cpuinfo_sanity_check). */ -int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext) +int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext) { int r; @@ -82,9 +86,11 @@ int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_VCPU_ATTRIBUTES: - case KVM_CAP_VCPU_EVENTS: r = 1; break; + case KVM_CAP_ARM_VM_IPA_SIZE: + r = kvm_ipa_limit; + break; default: r = 0; } @@ -133,3 +139,99 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) /* Reset timer */ return kvm_timer_vcpu_reset(vcpu); } + +void kvm_set_ipa_limit(void) +{ + unsigned int ipa_max, pa_max, va_max, parange; + + parange = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1) & 0x7; + pa_max = id_aa64mmfr0_parange_to_phys_shift(parange); + + /* Clamp the IPA limit to the PA size supported by the kernel */ + ipa_max = (pa_max > PHYS_MASK_SHIFT) ? PHYS_MASK_SHIFT : pa_max; + /* + * Since our stage2 table is dependent on the stage1 page table code, + * we must always honor the following condition: + * + * Number of levels in Stage1 >= Number of levels in Stage2. + * + * So clamp the ipa limit further down to limit the number of levels. + * Since we can concatenate upto 16 tables at entry level, we could + * go upto 4bits above the maximum VA addressible with the current + * number of levels. + */ + va_max = PGDIR_SHIFT + PAGE_SHIFT - 3; + va_max += 4; + + if (va_max < ipa_max) + ipa_max = va_max; + + /* + * If the final limit is lower than the real physical address + * limit of the CPUs, report the reason. + */ + if (ipa_max < pa_max) + pr_info("kvm: Limiting the IPA size due to kernel %s Address limit\n", + (va_max < pa_max) ? "Virtual" : "Physical"); + + WARN(ipa_max < KVM_PHYS_SHIFT, + "KVM IPA limit (%d bit) is smaller than default size\n", ipa_max); + kvm_ipa_limit = ipa_max; + kvm_info("IPA Size Limit: %dbits\n", kvm_ipa_limit); +} + +/* + * Configure the VTCR_EL2 for this VM. The VTCR value is common + * across all the physical CPUs on the system. We use system wide + * sanitised values to fill in different fields, except for Hardware + * Management of Access Flags. HA Flag is set unconditionally on + * all CPUs, as it is safe to run with or without the feature and + * the bit is RES0 on CPUs that don't support it. + */ +int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) +{ + u64 vtcr = VTCR_EL2_FLAGS; + u32 parange, phys_shift; + u8 lvls; + + if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK) + return -EINVAL; + + phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); + if (phys_shift) { + if (phys_shift > kvm_ipa_limit || + phys_shift < 32) + return -EINVAL; + } else { + phys_shift = KVM_PHYS_SHIFT; + } + + parange = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1) & 7; + if (parange > ID_AA64MMFR0_PARANGE_MAX) + parange = ID_AA64MMFR0_PARANGE_MAX; + vtcr |= parange << VTCR_EL2_PS_SHIFT; + + vtcr |= VTCR_EL2_T0SZ(phys_shift); + /* + * Use a minimum 2 level page table to prevent splitting + * host PMD huge pages at stage2. + */ + lvls = stage2_pgtable_levels(phys_shift); + if (lvls < 2) + lvls = 2; + vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls); + + /* + * Enable the Hardware Access Flag management, unconditionally + * on all CPUs. The features is RES0 on CPUs without the support + * and must be ignored by the CPUs. + */ + vtcr |= VTCR_EL2_HA; + + /* Set the vmid bits */ + vtcr |= (kvm_get_vmid_bits() == 16) ? + VTCR_EL2_VS_16BIT : + VTCR_EL2_VS_8BIT; + kvm->arch.vtcr = vtcr; + return 0; +} diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h @@ -150,4 +150,25 @@ extern s32 patch__memset_nocache, patch__memcpy_nocache; extern long flush_count_cache; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +void kvmppc_save_tm_hv(struct kvm_vcpu *vcpu, u64 msr, bool preserve_nv); +void kvmppc_restore_tm_hv(struct kvm_vcpu *vcpu, u64 msr, bool preserve_nv); +#else +static inline void kvmppc_save_tm_hv(struct kvm_vcpu *vcpu, u64 msr, + bool preserve_nv) { } +static inline void kvmppc_restore_tm_hv(struct kvm_vcpu *vcpu, u64 msr, + bool preserve_nv) { } +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ + +void kvmhv_save_host_pmu(void); +void kvmhv_load_host_pmu(void); +void kvmhv_save_guest_pmu(struct kvm_vcpu *vcpu, bool pmu_in_use); +void kvmhv_load_guest_pmu(struct kvm_vcpu *vcpu); + +int __kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu); + +long kvmppc_h_set_dabr(struct kvm_vcpu *vcpu, unsigned long dabr); +long kvmppc_h_set_xdabr(struct kvm_vcpu *vcpu, unsigned long dabr, + unsigned long dabrx); + #endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */ diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -203,6 +203,18 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize) BUG(); } +static inline unsigned int ap_to_shift(unsigned long ap) +{ + int psize; + + for (psize = 0; psize < MMU_PAGE_COUNT; psize++) { + if (mmu_psize_defs[psize].ap == ap) + return mmu_psize_defs[psize].shift; + } + + return -1; +} + static inline unsigned long get_sllp_encoding(int psize) { unsigned long sllp; diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h @@ -53,6 +53,7 @@ extern void radix__flush_tlb_lpid_page(unsigned int lpid, unsigned long addr, unsigned long page_size); extern void radix__flush_pwc_lpid(unsigned int lpid); +extern void radix__flush_tlb_lpid(unsigned int lpid); extern void radix__local_flush_tlb_lpid(unsigned int lpid); extern void radix__local_flush_tlb_lpid_guest(unsigned int lpid); diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h @@ -322,6 +322,11 @@ #define H_GET_24X7_DATA 0xF07C #define H_GET_PERF_COUNTER_INFO 0xF080 +/* Platform-specific hcalls used for nested HV KVM */ +#define H_SET_PARTITION_TABLE 0xF800 +#define H_ENTER_NESTED 0xF804 +#define H_TLB_INVALIDATE 0xF808 + /* Values for 2nd argument to H_SET_MODE */ #define H_SET_MODE_RESOURCE_SET_CIABR 1 #define H_SET_MODE_RESOURCE_SET_DAWR 2 @@ -461,6 +466,42 @@ struct h_cpu_char_result { u64 behaviour; }; +/* Register state for entering a nested guest with H_ENTER_NESTED */ +struct hv_guest_state { + u64 version; /* version of this structure layout */ + u32 lpid; + u32 vcpu_token; + /* These registers are hypervisor privileged (at least for writing) */ + u64 lpcr; + u64 pcr; + u64 amor; + u64 dpdes; + u64 hfscr; + s64 tb_offset; + u64 dawr0; + u64 dawrx0; + u64 ciabr; + u64 hdec_expiry; + u64 purr; + u64 spurr; + u64 ic; + u64 vtb; + u64 hdar; + u64 hdsisr; + u64 heir; + u64 asdr; + /* These are OS privileged but need to be set late in guest entry */ + u64 srr0; + u64 srr1; + u64 sprg[4]; + u64 pidr; + u64 cfar; + u64 ppr; +}; + +/* Latest version of hv_guest_state structure */ +#define HV_GUEST_STATE_VERSION 1 + #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_HVCALL_H */ diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h @@ -126,7 +126,7 @@ struct iommu_table { int it_nid; }; -#define IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry) \ +#define IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry) \ ((tbl)->it_ops->useraddrptr((tbl), (entry), false)) #define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \ ((tbl)->it_ops->useraddrptr((tbl), (entry), true)) diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h @@ -84,7 +84,6 @@ #define BOOK3S_INTERRUPT_INST_STORAGE 0x400 #define BOOK3S_INTERRUPT_INST_SEGMENT 0x480 #define BOOK3S_INTERRUPT_EXTERNAL 0x500 -#define BOOK3S_INTERRUPT_EXTERNAL_LEVEL 0x501 #define BOOK3S_INTERRUPT_EXTERNAL_HV 0x502 #define BOOK3S_INTERRUPT_ALIGNMENT 0x600 #define BOOK3S_INTERRUPT_PROGRAM 0x700 @@ -134,8 +133,7 @@ #define BOOK3S_IRQPRIO_EXTERNAL 14 #define BOOK3S_IRQPRIO_DECREMENTER 15 #define BOOK3S_IRQPRIO_PERFORMANCE_MONITOR 16 -#define BOOK3S_IRQPRIO_EXTERNAL_LEVEL 17 -#define BOOK3S_IRQPRIO_MAX 18 +#define BOOK3S_IRQPRIO_MAX 17 #define BOOK3S_HFLAG_DCBZ32 0x1 #define BOOK3S_HFLAG_SLB 0x2 diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h @@ -188,14 +188,37 @@ extern int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hc); extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned long ea, unsigned long dsisr); +extern int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *gpte, u64 root, + u64 *pte_ret_p); +extern int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *gpte, u64 table, + int table_index, u64 *pte_ret_p); extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, struct kvmppc_pte *gpte, bool data, bool iswrite); +extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa, + unsigned int shift, struct kvm_memory_slot *memslot, + unsigned int lpid); +extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, + bool writing, unsigned long gpa, + unsigned int lpid); +extern int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, + unsigned long gpa, + struct kvm_memory_slot *memslot, + bool writing, bool kvm_ro, + pte_t *inserted_pte, unsigned int *levelp); extern int kvmppc_init_vm_radix(struct kvm *kvm); extern void kvmppc_free_radix(struct kvm *kvm); +extern void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, + unsigned int lpid); extern int kvmppc_radix_init(void); extern void kvmppc_radix_exit(void); extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn); +extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, + unsigned long gpa, unsigned int shift, + struct kvm_memory_slot *memslot, + unsigned int lpid); extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn); extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, @@ -271,6 +294,21 @@ static inline void kvmppc_save_tm_sprs(struct kvm_vcpu *vcpu) {} static inline void kvmppc_restore_tm_sprs(struct kvm_vcpu *vcpu) {} #endif +long kvmhv_nested_init(void); +void kvmhv_nested_exit(void); +void kvmhv_vm_nested_init(struct kvm *kvm); +long kvmhv_set_partition_table(struct kvm_vcpu *vcpu); +void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1); +void kvmhv_release_all_nested(struct kvm *kvm); +long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu); +long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu); +int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu, + u64 time_limit, unsigned long lpcr); +void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr); +void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu, + struct hv_guest_state *hr); +long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu); + void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac); extern int kvm_irq_bypass; @@ -301,12 +339,12 @@ static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num) static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val) { - vcpu->arch.cr = val; + vcpu->arch.regs.ccr = val; } static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu) { - return vcpu->arch.cr; + return vcpu->arch.regs.ccr; } static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val) @@ -384,9 +422,6 @@ extern int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu); /* TO = 31 for unconditional trap */ #define INS_TW 0x7fe00008 -/* LPIDs we support with this build -- runtime limit may be lower */ -#define KVMPPC_NR_LPIDS (LPID_RSVD + 1) - #define SPLIT_HACK_MASK 0xff000000 #define SPLIT_HACK_OFFS 0xfb000000 diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -23,6 +23,108 @@ #include <linux/string.h> #include <asm/bitops.h> #include <asm/book3s/64/mmu-hash.h> +#include <asm/cpu_has_feature.h> +#include <asm/ppc-opcode.h> + +#ifdef CONFIG_PPC_PSERIES +static inline bool kvmhv_on_pseries(void) +{ + return !cpu_has_feature(CPU_FTR_HVMODE); +} +#else +static inline bool kvmhv_on_pseries(void) +{ + return false; +} +#endif + +/* + * Structure for a nested guest, that is, for a guest that is managed by + * one of our guests. + */ +struct kvm_nested_guest { + struct kvm *l1_host; /* L1 VM that owns this nested guest */ + int l1_lpid; /* lpid L1 guest thinks this guest is */ + int shadow_lpid; /* real lpid of this nested guest */ + pgd_t *shadow_pgtable; /* our page table for this guest */ + u64 l1_gr_to_hr; /* L1's addr of part'n-scoped table */ + u64 process_table; /* process table entry for this guest */ + long refcnt; /* number of pointers to this struct */ + struct mutex tlb_lock; /* serialize page faults and tlbies */ + struct kvm_nested_guest *next; + cpumask_t need_tlb_flush; + cpumask_t cpu_in_guest; + short prev_cpu[NR_CPUS]; +}; + +/* + * We define a nested rmap entry as a single 64-bit quantity + * 0xFFF0000000000000 12-bit lpid field + * 0x000FFFFFFFFFF000 40-bit guest 4k page frame number + * 0x0000000000000001 1-bit single entry flag + */ +#define RMAP_NESTED_LPID_MASK 0xFFF0000000000000UL +#define RMAP_NESTED_LPID_SHIFT (52) +#define RMAP_NESTED_GPA_MASK 0x000FFFFFFFFFF000UL +#define RMAP_NESTED_IS_SINGLE_ENTRY 0x0000000000000001UL + +/* Structure for a nested guest rmap entry */ +struct rmap_nested { + struct llist_node list; + u64 rmap; +}; + +/* + * for_each_nest_rmap_safe - iterate over the list of nested rmap entries + * safe against removal of the list entry or NULL list + * @pos: a (struct rmap_nested *) to use as a loop cursor + * @node: pointer to the first entry + * NOTE: this can be NULL + * @rmapp: an (unsigned long *) in which to return the rmap entries on each + * iteration + * NOTE: this must point to already allocated memory + * + * The nested_rmap is a llist of (struct rmap_nested) entries pointed to by the + * rmap entry in the memslot. The list is always terminated by a "single entry" + * stored in the list element of the final entry of the llist. If there is ONLY + * a single entry then this is itself in the rmap entry of the memslot, not a + * llist head pointer. + * + * Note that the iterator below assumes that a nested rmap entry is always + * non-zero. This is true for our usage because the LPID field is always + * non-zero (zero is reserved for the host). + * + * This should be used to iterate over the list of rmap_nested entries with + * processing done on the u64 rmap value given by each iteration. This is safe + * against removal of list entries and it is always safe to call free on (pos). + * + * e.g. + * struct rmap_nested *cursor; + * struct llist_node *first; + * unsigned long rmap; + * for_each_nest_rmap_safe(cursor, first, &rmap) { + * do_something(rmap); + * free(cursor); + * } + */ +#define for_each_nest_rmap_safe(pos, node, rmapp) \ + for ((pos) = llist_entry((node), typeof(*(pos)), list); \ + (node) && \ + (*(rmapp) = ((RMAP_NESTED_IS_SINGLE_ENTRY & ((u64) (node))) ? \ + ((u64) (node)) : ((pos)->rmap))) && \ + (((node) = ((RMAP_NESTED_IS_SINGLE_ENTRY & ((u64) (node))) ? \ + ((struct llist_node *) ((pos) = NULL)) : \ + (pos)->list.next)), true); \ + (pos) = llist_entry((node), typeof(*(pos)), list)) + +struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid, + bool create); +void kvmhv_put_nested(struct kvm_nested_guest *gp); +int kvmhv_nested_next_lpid(struct kvm *kvm, int lpid); + +/* Encoding of first parameter for H_TLB_INVALIDATE */ +#define H_TLBIE_P1_ENC(ric, prs, r) (___PPC_RIC(ric) | ___PPC_PRS(prs) | \ + ___PPC_R(r)) /* Power architecture requires HPT is at least 256kiB, at most 64TiB */ #define PPC_MIN_HPT_ORDER 18 @@ -435,6 +537,7 @@ static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm) } extern void kvmppc_mmu_debugfs_init(struct kvm *kvm); +extern void kvmhv_radix_debugfs_init(struct kvm *kvm); extern void kvmhv_rm_send_ipi(int cpu); @@ -482,7 +585,7 @@ static inline u64 sanitize_msr(u64 msr) #ifdef CONFIG_PPC_TRANSACTIONAL_MEM static inline void copy_from_checkpoint(struct kvm_vcpu *vcpu) { - vcpu->arch.cr = vcpu->arch.cr_tm; + vcpu->arch.regs.ccr = vcpu->arch.cr_tm; vcpu->arch.regs.xer = vcpu->arch.xer_tm; vcpu->arch.regs.link = vcpu->arch.lr_tm; vcpu->arch.regs.ctr = vcpu->arch.ctr_tm; @@ -499,7 +602,7 @@ static inline void copy_from_checkpoint(struct kvm_vcpu *vcpu) static inline void copy_to_checkpoint(struct kvm_vcpu *vcpu) { - vcpu->arch.cr_tm = vcpu->arch.cr; + vcpu->arch.cr_tm = vcpu->arch.regs.ccr; vcpu->arch.xer_tm = vcpu->arch.regs.xer; vcpu->arch.lr_tm = vcpu->arch.regs.link; vcpu->arch.ctr_tm = vcpu->arch.regs.ctr; @@ -515,6 +618,17 @@ static inline void copy_to_checkpoint(struct kvm_vcpu *vcpu) } #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ +extern int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte, + unsigned long gpa, unsigned int level, + unsigned long mmu_seq, unsigned int lpid, + unsigned long *rmapp, struct rmap_nested **n_rmap); +extern void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp, + struct rmap_nested **n_rmap); +extern void kvmhv_remove_nest_rmap_range(struct kvm *kvm, + struct kvm_memory_slot *memslot, + unsigned long gpa, unsigned long hpa, + unsigned long nbytes); + #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #endif /* __ASM_KVM_BOOK3S_64_H__ */ diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -25,6 +25,9 @@ #define XICS_MFRR 0xc #define XICS_IPI 2 /* interrupt source # for IPIs */ +/* LPIDs we support with this build -- runtime limit may be lower */ +#define KVMPPC_NR_LPIDS (LPID_RSVD + 1) + /* Maximum number of threads per physical core */ #define MAX_SMT_THREADS 8 diff --git a/arch/powerpc/include/asm/kvm_booke.h b/arch/powerpc/include/asm/kvm_booke.h @@ -46,12 +46,12 @@ static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num) static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val) { - vcpu->arch.cr = val; + vcpu->arch.regs.ccr = val; } static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu) { - return vcpu->arch.cr; + return vcpu->arch.regs.ccr; } static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h @@ -46,6 +46,7 @@ #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE #include <asm/kvm_book3s_asm.h> /* for MAX_SMT_THREADS */ #define KVM_MAX_VCPU_ID (MAX_SMT_THREADS * KVM_MAX_VCORES) +#define KVM_MAX_NESTED_GUESTS KVMPPC_NR_LPIDS #else #define KVM_MAX_VCPU_ID KVM_MAX_VCPUS @@ -94,6 +95,7 @@ struct dtl_entry; struct kvmppc_vcpu_book3s; struct kvmppc_book3s_shadow_vcpu; +struct kvm_nested_guest; struct kvm_vm_stat { ulong remote_tlb_flush; @@ -287,10 +289,12 @@ struct kvm_arch { u8 radix; u8 fwnmi_enabled; bool threads_indep; + bool nested_enable; pgd_t *pgtable; u64 process_table; struct dentry *debugfs_dir; struct dentry *htab_dentry; + struct dentry *radix_dentry; struct kvm_resize_hpt *resize_hpt; /* protected by kvm->lock */ #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE @@ -311,6 +315,9 @@ struct kvm_arch { #endif struct kvmppc_ops *kvm_ops; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + u64 l1_ptcr; + int max_nested_lpid; + struct kvm_nested_guest *nested_guests[KVM_MAX_NESTED_GUESTS]; /* This array can grow quite large, keep it at the end */ struct kvmppc_vcore *vcores[KVM_MAX_VCORES]; #endif @@ -360,7 +367,9 @@ struct kvmppc_pte { bool may_write : 1; bool may_execute : 1; unsigned long wimg; + unsigned long rc; u8 page_size; /* MMU_PAGE_xxx */ + u8 page_shift; }; struct kvmppc_mmu { @@ -537,8 +546,6 @@ struct kvm_vcpu_arch { ulong tar; #endif - u32 cr; - #ifdef CONFIG_PPC_BOOK3S ulong hflags; ulong guest_owned_ext; @@ -707,6 +714,7 @@ struct kvm_vcpu_arch { u8 hcall_needed; u8 epr_flags; /* KVMPPC_EPR_xxx */ u8 epr_needed; + u8 external_oneshot; /* clear external irq after delivery */ u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */ @@ -781,6 +789,10 @@ struct kvm_vcpu_arch { u32 emul_inst; u32 online; + + /* For support of nested guests */ + struct kvm_nested_guest *nested; + u32 nested_vcpu_id; #endif #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h @@ -194,9 +194,7 @@ extern struct kvmppc_spapr_tce_table *kvmppc_find_table( (iommu_tce_check_ioba((stt)->page_shift, (stt)->offset, \ (stt)->size, (ioba), (npages)) ? \ H_PARAMETER : H_SUCCESS) -extern long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *tt, - unsigned long tce); -extern long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, +extern long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce, unsigned long *ua, unsigned long **prmap); extern void kvmppc_tce_put(struct kvmppc_spapr_tce_table *tt, unsigned long idx, unsigned long tce); @@ -327,6 +325,7 @@ struct kvmppc_ops { int (*set_smt_mode)(struct kvm *kvm, unsigned long mode, unsigned long flags); void (*giveup_ext)(struct kvm_vcpu *vcpu, ulong msr); + int (*enable_nested)(struct kvm *kvm); }; extern struct kvmppc_ops *kvmppc_hv_ops; @@ -585,6 +584,7 @@ extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval); extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status); +extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu); #else static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority) { return -1; } @@ -607,6 +607,7 @@ static inline int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) { retur static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status) { return -ENODEV; } +static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { } #endif /* CONFIG_KVM_XIVE */ /* @@ -652,6 +653,7 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, unsigned long mfrr); int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr); int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr); +void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu); /* * Host-side operations we want to set up while running in real diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h @@ -104,6 +104,7 @@ #define OP_31_XOP_LHZUX 311 #define OP_31_XOP_MSGSNDP 142 #define OP_31_XOP_MSGCLRP 174 +#define OP_31_XOP_TLBIE 306 #define OP_31_XOP_MFSPR 339 #define OP_31_XOP_LWAX 341 #define OP_31_XOP_LHAX 343 diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h @@ -415,6 +415,7 @@ #define HFSCR_DSCR __MASK(FSCR_DSCR_LG) #define HFSCR_VECVSX __MASK(FSCR_VECVSX_LG) #define HFSCR_FP __MASK(FSCR_FP_LG) +#define HFSCR_INTR_CAUSE (ASM_CONST(0xFF) << 56) /* interrupt cause */ #define SPRN_TAR 0x32f /* Target Address Register */ #define SPRN_LPCR 0x13E /* LPAR Control Register */ #define LPCR_VPM0 ASM_CONST(0x8000000000000000) @@ -766,6 +767,7 @@ #define SPRN_HSRR0 0x13A /* Save/Restore Register 0 */ #define SPRN_HSRR1 0x13B /* Save/Restore Register 1 */ #define HSRR1_DENORM 0x00100000 /* Denorm exception */ +#define HSRR1_HISI_WRITE 0x00010000 /* HISI bcs couldn't update mem */ #define SPRN_TBCTL 0x35f /* PA6T Timebase control register */ #define TBCTL_FREEZE 0x0000000000000000ull /* Freeze all tbs */ diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h @@ -634,6 +634,7 @@ struct kvm_ppc_cpu_char { #define KVM_REG_PPC_DEC_EXPIRY (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbe) #define KVM_REG_PPC_ONLINE (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xbf) +#define KVM_REG_PPC_PTCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc0) /* Transactional Memory checkpointed state: * This is all GPRs, all VSX regs and a subset of SPRs diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c @@ -438,7 +438,7 @@ int main(void) #ifdef CONFIG_PPC_BOOK3S OFFSET(VCPU_TAR, kvm_vcpu, arch.tar); #endif - OFFSET(VCPU_CR, kvm_vcpu, arch.cr); + OFFSET(VCPU_CR, kvm_vcpu, arch.regs.ccr); OFFSET(VCPU_PC, kvm_vcpu, arch.regs.nip); #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE OFFSET(VCPU_MSR, kvm_vcpu, arch.shregs.msr); @@ -503,6 +503,7 @@ int main(void) OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr); OFFSET(VCPU_VPA_DIRTY, kvm_vcpu, arch.vpa.dirty); OFFSET(VCPU_HEIR, kvm_vcpu, arch.emul_inst); + OFFSET(VCPU_NESTED, kvm_vcpu, arch.nested); OFFSET(VCPU_CPU, kvm_vcpu, cpu); OFFSET(VCPU_THREAD_CPU, kvm_vcpu, arch.thread_cpu); #endif @@ -695,7 +696,7 @@ int main(void) #endif /* CONFIG_PPC_BOOK3S_64 */ #else /* CONFIG_PPC_BOOK3S */ - OFFSET(VCPU_CR, kvm_vcpu, arch.cr); + OFFSET(VCPU_CR, kvm_vcpu, arch.regs.ccr); OFFSET(VCPU_XER, kvm_vcpu, arch.regs.xer); OFFSET(VCPU_LR, kvm_vcpu, arch.regs.link); OFFSET(VCPU_CTR, kvm_vcpu, arch.regs.ctr); diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S @@ -147,8 +147,8 @@ __init_hvmode_206: rldicl. r0,r3,4,63 bnelr ld r5,CPU_SPEC_FEATURES(r4) - LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE) - xor r5,r5,r6 + LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE | CPU_FTR_P9_TM_HV_ASSIST) + andc r5,r5,r6 std r5,CPU_SPEC_FEATURES(r4) blr diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile @@ -75,7 +75,8 @@ kvm-hv-y += \ book3s_hv.o \ book3s_hv_interrupts.o \ book3s_64_mmu_hv.o \ - book3s_64_mmu_radix.o + book3s_64_mmu_radix.o \ + book3s_hv_nested.o kvm-hv-$(CONFIG_PPC_TRANSACTIONAL_MEM) += \ book3s_hv_tm.o diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c @@ -78,8 +78,11 @@ void kvmppc_unfixup_split_real(struct kvm_vcpu *vcpu) { if (vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK) { ulong pc = kvmppc_get_pc(vcpu); + ulong lr = kvmppc_get_lr(vcpu); if ((pc & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS) kvmppc_set_pc(vcpu, pc & ~SPLIT_HACK_MASK); + if ((lr & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS) + kvmppc_set_lr(vcpu, lr & ~SPLIT_HACK_MASK); vcpu->arch.hflags &= ~BOOK3S_HFLAG_SPLIT_HACK; } } @@ -150,7 +153,6 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec) case 0x400: prio = BOOK3S_IRQPRIO_INST_STORAGE; break; case 0x480: prio = BOOK3S_IRQPRIO_INST_SEGMENT; break; case 0x500: prio = BOOK3S_IRQPRIO_EXTERNAL; break; - case 0x501: prio = BOOK3S_IRQPRIO_EXTERNAL_LEVEL; break; case 0x600: prio = BOOK3S_IRQPRIO_ALIGNMENT; break; case 0x700: prio = BOOK3S_IRQPRIO_PROGRAM; break; case 0x800: prio = BOOK3S_IRQPRIO_FP_UNAVAIL; break; @@ -236,18 +238,35 @@ EXPORT_SYMBOL_GPL(kvmppc_core_dequeue_dec); void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { - unsigned int vec = BOOK3S_INTERRUPT_EXTERNAL; - - if (irq->irq == KVM_INTERRUPT_SET_LEVEL) - vec = BOOK3S_INTERRUPT_EXTERNAL_LEVEL; + /* + * This case (KVM_INTERRUPT_SET) should never actually arise for + * a pseries guest (because pseries guests expect their interrupt + * controllers to continue asserting an external interrupt request + * until it is acknowledged at the interrupt controller), but is + * included to avoid ABI breakage and potentially for other + * sorts of guest. + * + * There is a subtlety here: HV KVM does not test the + * external_oneshot flag in the code that synthesizes + * external interrupts for the guest just before entering + * the guest. That is OK even if userspace did do a + * KVM_INTERRUPT_SET on a pseries guest vcpu, because the + * caller (kvm_vcpu_ioctl_interrupt) does a kvm_vcpu_kick() + * which ends up doing a smp_send_reschedule(), which will + * pull the guest all the way out to the host, meaning that + * we will call kvmppc_core_prepare_to_enter() before entering + * the guest again, and that will handle the external_oneshot + * flag correctly. + */ + if (irq->irq == KVM_INTERRUPT_SET) + vcpu->arch.external_oneshot = 1; - kvmppc_book3s_queue_irqprio(vcpu, vec); + kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL); } void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu) { kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL); - kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL_LEVEL); } void kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu, ulong dar, @@ -278,7 +297,6 @@ static int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, vec = BOOK3S_INTERRUPT_DECREMENTER; break; case BOOK3S_IRQPRIO_EXTERNAL: - case BOOK3S_IRQPRIO_EXTERNAL_LEVEL: deliver = (kvmppc_get_msr(vcpu) & MSR_EE) && !crit; vec = BOOK3S_INTERRUPT_EXTERNAL; break; @@ -352,8 +370,16 @@ static bool clear_irqprio(struct kvm_vcpu *vcpu, unsigned int priority) case BOOK3S_IRQPRIO_DECREMENTER: /* DEC interrupts get cleared by mtdec */ return false; - case BOOK3S_IRQPRIO_EXTERNAL_LEVEL: - /* External interrupts get cleared by userspace */ + case BOOK3S_IRQPRIO_EXTERNAL: + /* + * External interrupts get cleared by userspace + * except when set by the KVM_INTERRUPT ioctl with + * KVM_INTERRUPT_SET (not KVM_INTERRUPT_SET_LEVEL). + */ + if (vcpu->arch.external_oneshot) { + vcpu->arch.external_oneshot = 0; + return true; + } return false; } diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -268,14 +268,13 @@ int kvmppc_mmu_hv_init(void) { unsigned long host_lpid, rsvd_lpid; - if (!cpu_has_feature(CPU_FTR_HVMODE)) - return -EINVAL; - if (!mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE)) return -EINVAL; /* POWER7 has 10-bit LPIDs (12-bit in POWER8) */ - host_lpid = mfspr(SPRN_LPID); + host_lpid = 0; + if (cpu_has_feature(CPU_FTR_HVMODE)) + host_lpid = mfspr(SPRN_LPID); rsvd_lpid = LPID_RSVD; kvmppc_init_lpid(rsvd_lpid + 1); diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -10,6 +10,9 @@ #include <linux/string.h> #include <linux/kvm.h> #include <linux/kvm_host.h> +#include <linux/anon_inodes.h> +#include <linux/file.h> +#include <linux/debugfs.h> #include <asm/kvm_ppc.h> #include <asm/kvm_book3s.h> @@ -26,87 +29,74 @@ */ static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 }; -int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, - struct kvmppc_pte *gpte, bool data, bool iswrite) +int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *gpte, u64 root, + u64 *pte_ret_p) { struct kvm *kvm = vcpu->kvm; - u32 pid; int ret, level, ps; - __be64 prte, rpte; - unsigned long ptbl; - unsigned long root, pte, index; - unsigned long rts, bits, offset; - unsigned long gpa; - unsigned long proc_tbl_size; - - /* Work out effective PID */ - switch (eaddr >> 62) { - case 0: - pid = vcpu->arch.pid; - break; - case 3: - pid = 0; - break; - default: - return -EINVAL; - } - proc_tbl_size = 1 << ((kvm->arch.process_table & PRTS_MASK) + 12); - if (pid * 16 >= proc_tbl_size) - return -EINVAL; - - /* Read partition table to find root of tree for effective PID */ - ptbl = (kvm->arch.process_table & PRTB_MASK) + (pid * 16); - ret = kvm_read_guest(kvm, ptbl, &prte, sizeof(prte)); - if (ret) - return ret; + unsigned long rts, bits, offset, index; + u64 pte, base, gpa; + __be64 rpte; - root = be64_to_cpu(prte); rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) | ((root & RTS2_MASK) >> RTS2_SHIFT); bits = root & RPDS_MASK; - root = root & RPDB_MASK; + base = root & RPDB_MASK; offset = rts + 31; - /* current implementations only support 52-bit space */ + /* Current implementations only support 52-bit space */ if (offset != 52) return -EINVAL; + /* Walk each level of the radix tree */ for (level = 3; level >= 0; --level) { + u64 addr; + /* Check a valid size */ if (level && bits != p9_supported_radix_bits[level]) return -EINVAL; if (level == 0 && !(bits == 5 || bits == 9)) return -EINVAL; offset -= bits; index = (eaddr >> offset) & ((1UL << bits) - 1); - /* check that low bits of page table base are zero */ - if (root & ((1UL << (bits + 3)) - 1)) + /* Check that low bits of page table base are zero */ + if (base & ((1UL << (bits + 3)) - 1)) return -EINVAL; - ret = kvm_read_guest(kvm, root + index * 8, - &rpte, sizeof(rpte)); - if (ret) + /* Read the entry from guest memory */ + addr = base + (index * sizeof(rpte)); + ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte)); + if (ret) { + if (pte_ret_p) + *pte_ret_p = addr; return ret; + } pte = __be64_to_cpu(rpte); if (!(pte & _PAGE_PRESENT)) return -ENOENT; + /* Check if a leaf entry */ if (pte & _PAGE_PTE) break; - bits = pte & 0x1f; - root = pte & 0x0fffffffffffff00ul; + /* Get ready to walk the next level */ + base = pte & RPDB_MASK; + bits = pte & RPDS_MASK; } - /* need a leaf at lowest level; 512GB pages not supported */ + + /* Need a leaf at lowest level; 512GB pages not supported */ if (level < 0 || level == 3) return -EINVAL; - /* offset is now log base 2 of the page size */ + /* We found a valid leaf PTE */ + /* Offset is now log base 2 of the page size */ gpa = pte & 0x01fffffffffff000ul; if (gpa & ((1ul << offset) - 1)) return -EINVAL; - gpa += eaddr & ((1ul << offset) - 1); + gpa |= eaddr & ((1ul << offset) - 1); for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps) if (offset == mmu_psize_defs[ps].shift) break; gpte->page_size = ps; + gpte->page_shift = offset; gpte->eaddr = eaddr; gpte->raddr = gpa; @@ -115,6 +105,77 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, gpte->may_read = !!(pte & _PAGE_READ); gpte->may_write = !!(pte & _PAGE_WRITE); gpte->may_execute = !!(pte & _PAGE_EXEC); + + gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY); + + if (pte_ret_p) + *pte_ret_p = pte; + + return 0; +} + +/* + * Used to walk a partition or process table radix tree in guest memory + * Note: We exploit the fact that a partition table and a process + * table have the same layout, a partition-scoped page table and a + * process-scoped page table have the same layout, and the 2nd + * doubleword of a partition table entry has the same layout as + * the PTCR register. + */ +int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *gpte, u64 table, + int table_index, u64 *pte_ret_p) +{ + struct kvm *kvm = vcpu->kvm; + int ret; + unsigned long size, ptbl, root; + struct prtb_entry entry; + + if ((table & PRTS_MASK) > 24) + return -EINVAL; + size = 1ul << ((table & PRTS_MASK) + 12); + + /* Is the table big enough to contain this entry? */ + if ((table_index * sizeof(entry)) >= size) + return -EINVAL; + + /* Read the table to find the root of the radix tree */ + ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry)); + ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry)); + if (ret) + return ret; + + /* Root is stored in the first double word */ + root = be64_to_cpu(entry.prtb0); + + return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p); +} + +int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *gpte, bool data, bool iswrite) +{ + u32 pid; + u64 pte; + int ret; + + /* Work out effective PID */ + switch (eaddr >> 62) { + case 0: + pid = vcpu->arch.pid; + break; + case 3: + pid = 0; + break; + default: + return -EINVAL; + } + + ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte, + vcpu->kvm->arch.process_table, pid, &pte); + if (ret) + return ret; + + /* Check privilege (applies only to process scoped translations) */ if (kvmppc_get_msr(vcpu) & MSR_PR) { if (pte & _PAGE_PRIVILEGED) { gpte->may_read = 0; @@ -137,20 +198,46 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, } static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr, - unsigned int pshift) + unsigned int pshift, unsigned int lpid) { unsigned long psize = PAGE_SIZE; + int psi; + long rc; + unsigned long rb; if (pshift) psize = 1UL << pshift; + else + pshift = PAGE_SHIFT; addr &= ~(psize - 1); - radix__flush_tlb_lpid_page(kvm->arch.lpid, addr, psize); + + if (!kvmhv_on_pseries()) { + radix__flush_tlb_lpid_page(lpid, addr, psize); + return; + } + + psi = shift_to_mmu_psize(pshift); + rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58)); + rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1), + lpid, rb); + if (rc) + pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc); } -static void kvmppc_radix_flush_pwc(struct kvm *kvm) +static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid) { - radix__flush_pwc_lpid(kvm->arch.lpid); + long rc; + + if (!kvmhv_on_pseries()) { + radix__flush_pwc_lpid(lpid); + return; + } + + rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1), + lpid, TLBIEL_INVAL_SET_LPID); + if (rc) + pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc); } static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep, @@ -195,23 +282,38 @@ static void kvmppc_pmd_free(pmd_t *pmdp) kmem_cache_free(kvm_pmd_cache, pmdp); } -static void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, - unsigned long gpa, unsigned int shift) +/* Called with kvm->mmu_lock held */ +void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa, + unsigned int shift, struct kvm_memory_slot *memslot, + unsigned int lpid) { - unsigned long page_size = 1ul << shift; unsigned long old; + unsigned long gfn = gpa >> PAGE_SHIFT; + unsigned long page_size = PAGE_SIZE; + unsigned long hpa; old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift); - kvmppc_radix_tlbie_page(kvm, gpa, shift); - if (old & _PAGE_DIRTY) { - unsigned long gfn = gpa >> PAGE_SHIFT; - struct kvm_memory_slot *memslot; + kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid); + + /* The following only applies to L1 entries */ + if (lpid != kvm->arch.lpid) + return; + if (!memslot) { memslot = gfn_to_memslot(kvm, gfn); - if (memslot && memslot->dirty_bitmap) - kvmppc_update_dirty_map(memslot, gfn, page_size); + if (!memslot) + return; } + if (shift) + page_size = 1ul << shift; + + gpa &= ~(page_size - 1); + hpa = old & PTE_RPN_MASK; + kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size); + + if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap) + kvmppc_update_dirty_map(memslot, gfn, page_size); } /* @@ -224,7 +326,8 @@ static void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, * and emit a warning if encountered, but there may already be data * corruption due to the unexpected mappings. */ -static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full) +static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full, + unsigned int lpid) { if (full) { memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE); @@ -238,14 +341,15 @@ static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full) WARN_ON_ONCE(1); kvmppc_unmap_pte(kvm, p, pte_pfn(*p) << PAGE_SHIFT, - PAGE_SHIFT); + PAGE_SHIFT, NULL, lpid); } } kvmppc_pte_free(pte); } -static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full) +static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full, + unsigned int lpid) { unsigned long im; pmd_t *p = pmd; @@ -260,20 +364,21 @@ static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full) WARN_ON_ONCE(1); kvmppc_unmap_pte(kvm, (pte_t *)p, pte_pfn(*(pte_t *)p) << PAGE_SHIFT, - PMD_SHIFT); + PMD_SHIFT, NULL, lpid); } } else { pte_t *pte; pte = pte_offset_map(p, 0); - kvmppc_unmap_free_pte(kvm, pte, full); + kvmppc_unmap_free_pte(kvm, pte, full, lpid); pmd_clear(p); } } kvmppc_pmd_free(pmd); } -static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud) +static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud, + unsigned int lpid) { unsigned long iu; pud_t *p = pud; @@ -287,36 +392,40 @@ static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud) pmd_t *pmd; pmd = pmd_offset(p, 0); - kvmppc_unmap_free_pmd(kvm, pmd, true); + kvmppc_unmap_free_pmd(kvm, pmd, true, lpid); pud_clear(p); } } pud_free(kvm->mm, pud); } -void kvmppc_free_radix(struct kvm *kvm) +void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid) { unsigned long ig; - pgd_t *pgd; - if (!kvm->arch.pgtable) - return; - pgd = kvm->arch.pgtable; for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) { pud_t *pud; if (!pgd_present(*pgd)) continue; pud = pud_offset(pgd, 0); - kvmppc_unmap_free_pud(kvm, pud); + kvmppc_unmap_free_pud(kvm, pud, lpid); pgd_clear(pgd); } - pgd_free(kvm->mm, kvm->arch.pgtable); - kvm->arch.pgtable = NULL; +} + +void kvmppc_free_radix(struct kvm *kvm) +{ + if (kvm->arch.pgtable) { + kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable, + kvm->arch.lpid); + pgd_free(kvm->mm, kvm->arch.pgtable); + kvm->arch.pgtable = NULL; + } } static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd, - unsigned long gpa) + unsigned long gpa, unsigned int lpid) { pte_t *pte = pte_offset_kernel(pmd, 0); @@ -326,13 +435,13 @@ static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd, * flushing the PWC again. */ pmd_clear(pmd); - kvmppc_radix_flush_pwc(kvm); + kvmppc_radix_flush_pwc(kvm, lpid); - kvmppc_unmap_free_pte(kvm, pte, false); + kvmppc_unmap_free_pte(kvm, pte, false, lpid); } static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud, - unsigned long gpa) + unsigned long gpa, unsigned int lpid) { pmd_t *pmd = pmd_offset(pud, 0); @@ -342,9 +451,9 @@ static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud, * so can be freed without flushing the PWC again. */ pud_clear(pud); - kvmppc_radix_flush_pwc(kvm); + kvmppc_radix_flush_pwc(kvm, lpid); - kvmppc_unmap_free_pmd(kvm, pmd, false); + kvmppc_unmap_free_pmd(kvm, pmd, false, lpid); } /* @@ -356,8 +465,10 @@ static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud, */ #define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED)) -static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, - unsigned int level, unsigned long mmu_seq) +int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte, + unsigned long gpa, unsigned int level, + unsigned long mmu_seq, unsigned int lpid, + unsigned long *rmapp, struct rmap_nested **n_rmap) { pgd_t *pgd; pud_t *pud, *new_pud = NULL; @@ -366,7 +477,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, int ret; /* Traverse the guest's 2nd-level tree, allocate new levels needed */ - pgd = kvm->arch.pgtable + pgd_index(gpa); + pgd = pgtable + pgd_index(gpa); pud = NULL; if (pgd_present(*pgd)) pud = pud_offset(pgd, gpa); @@ -423,7 +534,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, goto out_unlock; } /* Valid 1GB page here already, remove it */ - kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT); + kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL, + lpid); } if (level == 2) { if (!pud_none(*pud)) { @@ -432,9 +544,11 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, * install a large page, so remove and free the page * table page. */ - kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa); + kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid); } kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte); + if (rmapp && n_rmap) + kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap); ret = 0; goto out_unlock; } @@ -458,7 +572,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) & PTE_BITS_MUST_MATCH); kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd), - 0, pte_val(pte), lgpa, PMD_SHIFT); + 0, pte_val(pte), lgpa, PMD_SHIFT); ret = 0; goto out_unlock; } @@ -472,7 +586,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, goto out_unlock; } /* Valid 2MB page here already, remove it */ - kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT); + kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL, + lpid); } if (level == 1) { if (!pmd_none(*pmd)) { @@ -481,9 +596,11 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, * install a large page, so remove and free the page * table page. */ - kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa); + kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid); } kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte); + if (rmapp && n_rmap) + kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap); ret = 0; goto out_unlock; } @@ -508,6 +625,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, goto out_unlock; } kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte); + if (rmapp && n_rmap) + kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap); ret = 0; out_unlock: @@ -521,95 +640,49 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, return ret; } -int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned long ea, unsigned long dsisr) +bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, bool writing, + unsigned long gpa, unsigned int lpid) +{ + unsigned long pgflags; + unsigned int shift; + pte_t *ptep; + + /* + * Need to set an R or C bit in the 2nd-level tables; + * since we are just helping out the hardware here, + * it is sufficient to do what the hardware does. + */ + pgflags = _PAGE_ACCESSED; + if (writing) + pgflags |= _PAGE_DIRTY; + /* + * We are walking the secondary (partition-scoped) page table here. + * We can do this without disabling irq because the Linux MM + * subsystem doesn't do THP splits and collapses on this tree. + */ + ptep = __find_linux_pte(pgtable, gpa, NULL, &shift); + if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) { + kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift); + return true; + } + return false; +} + +int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, + unsigned long gpa, + struct kvm_memory_slot *memslot, + bool writing, bool kvm_ro, + pte_t *inserted_pte, unsigned int *levelp) { struct kvm *kvm = vcpu->kvm; - unsigned long mmu_seq; - unsigned long gpa, gfn, hva; - struct kvm_memory_slot *memslot; struct page *page = NULL; - long ret; - bool writing; + unsigned long mmu_seq; + unsigned long hva, gfn = gpa >> PAGE_SHIFT; bool upgrade_write = false; bool *upgrade_p = &upgrade_write; pte_t pte, *ptep; - unsigned long pgflags; unsigned int shift, level; - - /* Check for unusual errors */ - if (dsisr & DSISR_UNSUPP_MMU) { - pr_err("KVM: Got unsupported MMU fault\n"); - return -EFAULT; - } - if (dsisr & DSISR_BADACCESS) { - /* Reflect to the guest as DSI */ - pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr); - kvmppc_core_queue_data_storage(vcpu, ea, dsisr); - return RESUME_GUEST; - } - - /* Translate the logical address and get the page */ - gpa = vcpu->arch.fault_gpa & ~0xfffUL; - gpa &= ~0xF000000000000000ul; - gfn = gpa >> PAGE_SHIFT; - if (!(dsisr & DSISR_PRTABLE_FAULT)) - gpa |= ea & 0xfff; - memslot = gfn_to_memslot(kvm, gfn); - - /* No memslot means it's an emulated MMIO region */ - if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) { - if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS | - DSISR_SET_RC)) { - /* - * Bad address in guest page table tree, or other - * unusual error - reflect it to the guest as DSI. - */ - kvmppc_core_queue_data_storage(vcpu, ea, dsisr); - return RESUME_GUEST; - } - return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, - dsisr & DSISR_ISSTORE); - } - - writing = (dsisr & DSISR_ISSTORE) != 0; - if (memslot->flags & KVM_MEM_READONLY) { - if (writing) { - /* give the guest a DSI */ - dsisr = DSISR_ISSTORE | DSISR_PROTFAULT; - kvmppc_core_queue_data_storage(vcpu, ea, dsisr); - return RESUME_GUEST; - } - upgrade_p = NULL; - } - - if (dsisr & DSISR_SET_RC) { - /* - * Need to set an R or C bit in the 2nd-level tables; - * since we are just helping out the hardware here, - * it is sufficient to do what the hardware does. - */ - pgflags = _PAGE_ACCESSED; - if (writing) - pgflags |= _PAGE_DIRTY; - /* - * We are walking the secondary page table here. We can do this - * without disabling irq. - */ - spin_lock(&kvm->mmu_lock); - ptep = __find_linux_pte(kvm->arch.pgtable, - gpa, NULL, &shift); - if (ptep && pte_present(*ptep) && - (!writing || pte_write(*ptep))) { - kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, - gpa, shift); - dsisr &= ~DSISR_SET_RC; - } - spin_unlock(&kvm->mmu_lock); - if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE | - DSISR_PROTFAULT | DSISR_SET_RC))) - return RESUME_GUEST; - } + int ret; /* used to check for invalidations in progress */ mmu_seq = kvm->mmu_notifier_seq; @@ -622,7 +695,7 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, * is that the page is writable. */ hva = gfn_to_hva_memslot(memslot, gfn); - if (upgrade_p && __get_user_pages_fast(hva, 1, 1, &page) == 1) { + if (!kvm_ro && __get_user_pages_fast(hva, 1, 1, &page) == 1) { upgrade_write = true; } else { unsigned long pfn; @@ -690,7 +763,12 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, } /* Allocate space in the tree and write the PTE */ - ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq); + ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level, + mmu_seq, kvm->arch.lpid, NULL, NULL); + if (inserted_pte) + *inserted_pte = pte; + if (levelp) + *levelp = level; if (page) { if (!ret && (pte_val(pte) & _PAGE_WRITE)) @@ -698,6 +776,82 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, put_page(page); } + return ret; +} + +int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned long ea, unsigned long dsisr) +{ + struct kvm *kvm = vcpu->kvm; + unsigned long gpa, gfn; + struct kvm_memory_slot *memslot; + long ret; + bool writing = !!(dsisr & DSISR_ISSTORE); + bool kvm_ro = false; + + /* Check for unusual errors */ + if (dsisr & DSISR_UNSUPP_MMU) { + pr_err("KVM: Got unsupported MMU fault\n"); + return -EFAULT; + } + if (dsisr & DSISR_BADACCESS) { + /* Reflect to the guest as DSI */ + pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr); + kvmppc_core_queue_data_storage(vcpu, ea, dsisr); + return RESUME_GUEST; + } + + /* Translate the logical address */ + gpa = vcpu->arch.fault_gpa & ~0xfffUL; + gpa &= ~0xF000000000000000ul; + gfn = gpa >> PAGE_SHIFT; + if (!(dsisr & DSISR_PRTABLE_FAULT)) + gpa |= ea & 0xfff; + + /* Get the corresponding memslot */ + memslot = gfn_to_memslot(kvm, gfn); + + /* No memslot means it's an emulated MMIO region */ + if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) { + if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS | + DSISR_SET_RC)) { + /* + * Bad address in guest page table tree, or other + * unusual error - reflect it to the guest as DSI. + */ + kvmppc_core_queue_data_storage(vcpu, ea, dsisr); + return RESUME_GUEST; + } + return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, writing); + } + + if (memslot->flags & KVM_MEM_READONLY) { + if (writing) { + /* give the guest a DSI */ + kvmppc_core_queue_data_storage(vcpu, ea, DSISR_ISSTORE | + DSISR_PROTFAULT); + return RESUME_GUEST; + } + kvm_ro = true; + } + + /* Failed to set the reference/change bits */ + if (dsisr & DSISR_SET_RC) { + spin_lock(&kvm->mmu_lock); + if (kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable, + writing, gpa, kvm->arch.lpid)) + dsisr &= ~DSISR_SET_RC; + spin_unlock(&kvm->mmu_lock); + + if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE | + DSISR_PROTFAULT | DSISR_SET_RC))) + return RESUME_GUEST; + } + + /* Try to insert a pte */ + ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing, + kvm_ro, NULL, NULL); + if (ret == 0 || ret == -EAGAIN) ret = RESUME_GUEST; return ret; @@ -710,20 +864,11 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, pte_t *ptep; unsigned long gpa = gfn << PAGE_SHIFT; unsigned int shift; - unsigned long old; ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); - if (ptep && pte_present(*ptep)) { - old = kvmppc_radix_update_pte(kvm, ptep, ~0UL, 0, - gpa, shift); - kvmppc_radix_tlbie_page(kvm, gpa, shift); - if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap) { - unsigned long psize = PAGE_SIZE; - if (shift) - psize = 1ul << shift; - kvmppc_update_dirty_map(memslot, gfn, psize); - } - } + if (ptep && pte_present(*ptep)) + kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot, + kvm->arch.lpid); return 0; } @@ -778,7 +923,7 @@ static int kvm_radix_test_clear_dirty(struct kvm *kvm, ret = 1 << (shift - PAGE_SHIFT); kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0, gpa, shift); - kvmppc_radix_tlbie_page(kvm, gpa, shift); + kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid); } return ret; } @@ -863,6 +1008,215 @@ static void pmd_ctor(void *addr) memset(addr, 0, RADIX_PMD_TABLE_SIZE); } +struct debugfs_radix_state { + struct kvm *kvm; + struct mutex mutex; + unsigned long gpa; + int lpid; + int chars_left; + int buf_index; + char buf[128]; + u8 hdr; +}; + +static int debugfs_radix_open(struct inode *inode, struct file *file) +{ + struct kvm *kvm = inode->i_private; + struct debugfs_radix_state *p; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + + kvm_get_kvm(kvm); + p->kvm = kvm; + mutex_init(&p->mutex); + file->private_data = p; + + return nonseekable_open(inode, file); +} + +static int debugfs_radix_release(struct inode *inode, struct file *file) +{ + struct debugfs_radix_state *p = file->private_data; + + kvm_put_kvm(p->kvm); + kfree(p); + return 0; +} + +static ssize_t debugfs_radix_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + struct debugfs_radix_state *p = file->private_data; + ssize_t ret, r; + unsigned long n; + struct kvm *kvm; + unsigned long gpa; + pgd_t *pgt; + struct kvm_nested_guest *nested; + pgd_t pgd, *pgdp; + pud_t pud, *pudp; + pmd_t pmd, *pmdp; + pte_t *ptep; + int shift; + unsigned long pte; + + kvm = p->kvm; + if (!kvm_is_radix(kvm)) + return 0; + + ret = mutex_lock_interruptible(&p->mutex); + if (ret) + return ret; + + if (p->chars_left) { + n = p->chars_left; + if (n > len) + n = len; + r = copy_to_user(buf, p->buf + p->buf_index, n); + n -= r; + p->chars_left -= n; + p->buf_index += n; + buf += n; + len -= n; + ret = n; + if (r) { + if (!n) + ret = -EFAULT; + goto out; + } + } + + gpa = p->gpa; + nested = NULL; + pgt = NULL; + while (len != 0 && p->lpid >= 0) { + if (gpa >= RADIX_PGTABLE_RANGE) { + gpa = 0; + pgt = NULL; + if (nested) { + kvmhv_put_nested(nested); + nested = NULL; + } + p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid); + p->hdr = 0; + if (p->lpid < 0) + break; + } + if (!pgt) { + if (p->lpid == 0) { + pgt = kvm->arch.pgtable; + } else { + nested = kvmhv_get_nested(kvm, p->lpid, false); + if (!nested) { + gpa = RADIX_PGTABLE_RANGE; + continue; + } + pgt = nested->shadow_pgtable; + } + } + n = 0; + if (!p->hdr) { + if (p->lpid > 0) + n = scnprintf(p->buf, sizeof(p->buf), + "\nNested LPID %d: ", p->lpid); + n += scnprintf(p->buf + n, sizeof(p->buf) - n, + "pgdir: %lx\n", (unsigned long)pgt); + p->hdr = 1; + goto copy; + } + + pgdp = pgt + pgd_index(gpa); + pgd = READ_ONCE(*pgdp); + if (!(pgd_val(pgd) & _PAGE_PRESENT)) { + gpa = (gpa & PGDIR_MASK) + PGDIR_SIZE; + continue; + } + + pudp = pud_offset(&pgd, gpa); + pud = READ_ONCE(*pudp); + if (!(pud_val(pud) & _PAGE_PRESENT)) { + gpa = (gpa & PUD_MASK) + PUD_SIZE; + continue; + } + if (pud_val(pud) & _PAGE_PTE) { + pte = pud_val(pud); + shift = PUD_SHIFT; + goto leaf; + } + + pmdp = pmd_offset(&pud, gpa); + pmd = READ_ONCE(*pmdp); + if (!(pmd_val(pmd) & _PAGE_PRESENT)) { + gpa = (gpa & PMD_MASK) + PMD_SIZE; + continue; + } + if (pmd_val(pmd) & _PAGE_PTE) { + pte = pmd_val(pmd); + shift = PMD_SHIFT; + goto leaf; + } + + ptep = pte_offset_kernel(&pmd, gpa); + pte = pte_val(READ_ONCE(*ptep)); + if (!(pte & _PAGE_PRESENT)) { + gpa += PAGE_SIZE; + continue; + } + shift = PAGE_SHIFT; + leaf: + n = scnprintf(p->buf, sizeof(p->buf), + " %lx: %lx %d\n", gpa, pte, shift); + gpa += 1ul << shift; + copy: + p->chars_left = n; + if (n > len) + n = len; + r = copy_to_user(buf, p->buf, n); + n -= r; + p->chars_left -= n; + p->buf_index = n; + buf += n; + len -= n; + ret += n; + if (r) { + if (!ret) + ret = -EFAULT; + break; + } + } + p->gpa = gpa; + if (nested) + kvmhv_put_nested(nested); + + out: + mutex_unlock(&p->mutex); + return ret; +} + +static ssize_t debugfs_radix_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return -EACCES; +} + +static const struct file_operations debugfs_radix_fops = { + .owner = THIS_MODULE, + .open = debugfs_radix_open, + .release = debugfs_radix_release, + .read = debugfs_radix_read, + .write = debugfs_radix_write, + .llseek = generic_file_llseek, +}; + +void kvmhv_radix_debugfs_init(struct kvm *kvm) +{ + kvm->arch.radix_dentry = debugfs_create_file("radix", 0400, + kvm->arch.debugfs_dir, kvm, + &debugfs_radix_fops); +} + int kvmppc_radix_init(void) { unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE; diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c @@ -363,6 +363,40 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, return ret; } +static long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, + unsigned long tce) +{ + unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE); + enum dma_data_direction dir = iommu_tce_direction(tce); + struct kvmppc_spapr_tce_iommu_table *stit; + unsigned long ua = 0; + + /* Allow userspace to poison TCE table */ + if (dir == DMA_NONE) + return H_SUCCESS; + + if (iommu_tce_check_gpa(stt->page_shift, gpa)) + return H_TOO_HARD; + + if (kvmppc_tce_to_ua(stt->kvm, tce, &ua, NULL)) + return H_TOO_HARD; + + list_for_each_entry_rcu(stit, &stt->iommu_tables, next) { + unsigned long hpa = 0; + struct mm_iommu_table_group_mem_t *mem; + long shift = stit->tbl->it_page_shift; + + mem = mm_iommu_lookup(stt->kvm->mm, ua, 1ULL << shift); + if (!mem) + return H_TOO_HARD; + + if (mm_iommu_ua_to_hpa(mem, ua, shift, &hpa)) + return H_TOO_HARD; + } + + return H_SUCCESS; +} + static void kvmppc_clear_tce(struct iommu_table *tbl, unsigned long entry) { unsigned long hpa = 0; @@ -376,11 +410,10 @@ static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm, { struct mm_iommu_table_group_mem_t *mem = NULL; const unsigned long pgsize = 1ULL << tbl->it_page_shift; - __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry); if (!pua) - /* it_userspace allocation might be delayed */ - return H_TOO_HARD; + return H_SUCCESS; mem = mm_iommu_lookup(kvm->mm, be64_to_cpu(*pua), pgsize); if (!mem) @@ -401,7 +434,7 @@ static long kvmppc_tce_iommu_do_unmap(struct kvm *kvm, long ret; if (WARN_ON_ONCE(iommu_tce_xchg(tbl, entry, &hpa, &dir))) - return H_HARDWARE; + return H_TOO_HARD; if (dir == DMA_NONE) return H_SUCCESS; @@ -449,15 +482,15 @@ long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, return H_TOO_HARD; if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, tbl->it_page_shift, &hpa))) - return H_HARDWARE; + return H_TOO_HARD; if (mm_iommu_mapped_inc(mem)) - return H_CLOSED; + return H_TOO_HARD; ret = iommu_tce_xchg(tbl, entry, &hpa, &dir); if (WARN_ON_ONCE(ret)) { mm_iommu_mapped_dec(mem); - return H_HARDWARE; + return H_TOO_HARD; } if (dir != DMA_NONE) @@ -517,8 +550,7 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, idx = srcu_read_lock(&vcpu->kvm->srcu); - if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm, - tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL)) { + if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) { ret = H_PARAMETER; goto unlock_exit; } @@ -533,14 +565,10 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, ret = kvmppc_tce_iommu_map(vcpu->kvm, stt, stit->tbl, entry, ua, dir); - if (ret == H_SUCCESS) - continue; - - if (ret == H_TOO_HARD) + if (ret != H_SUCCESS) { + kvmppc_clear_tce(stit->tbl, entry); goto unlock_exit; - - WARN_ON_ONCE(1); - kvmppc_clear_tce(stit->tbl, entry); + } } kvmppc_tce_put(stt, entry, tce); @@ -583,7 +611,7 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, return ret; idx = srcu_read_lock(&vcpu->kvm->srcu); - if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) { + if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, NULL)) { ret = H_TOO_HARD; goto unlock_exit; } @@ -599,10 +627,26 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, ret = kvmppc_tce_validate(stt, tce); if (ret != H_SUCCESS) goto unlock_exit; + } + + for (i = 0; i < npages; ++i) { + /* + * This looks unsafe, because we validate, then regrab + * the TCE from userspace which could have been changed by + * another thread. + * + * But it actually is safe, because the relevant checks will be + * re-executed in the following code. If userspace tries to + * change this dodgily it will result in a messier failure mode + * but won't threaten the host. + */ + if (get_user(tce, tces + i)) { + ret = H_TOO_HARD; + goto unlock_exit; + } + tce = be64_to_cpu(tce); - if (kvmppc_gpa_to_ua(vcpu->kvm, - tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), - &ua, NULL)) + if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) return H_PARAMETER; list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { @@ -610,14 +654,10 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, stit->tbl, entry + i, ua, iommu_tce_direction(tce)); - if (ret == H_SUCCESS) - continue; - - if (ret == H_TOO_HARD) + if (ret != H_SUCCESS) { + kvmppc_clear_tce(stit->tbl, entry); goto unlock_exit; - - WARN_ON_ONCE(1); - kvmppc_clear_tce(stit->tbl, entry); + } } kvmppc_tce_put(stt, entry + i, tce); diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -87,6 +87,7 @@ struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm *kvm, } EXPORT_SYMBOL_GPL(kvmppc_find_table); +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE /* * Validates TCE address. * At the moment flags and page mask are validated. @@ -94,14 +95,14 @@ EXPORT_SYMBOL_GPL(kvmppc_find_table); * to the table and user space is supposed to process them), we can skip * checking other things (such as TCE is a guest RAM address or the page * was actually allocated). - * - * WARNING: This will be called in real-mode on HV KVM and virtual - * mode on PR KVM */ -long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce) +static long kvmppc_rm_tce_validate(struct kvmppc_spapr_tce_table *stt, + unsigned long tce) { unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE); enum dma_data_direction dir = iommu_tce_direction(tce); + struct kvmppc_spapr_tce_iommu_table *stit; + unsigned long ua = 0; /* Allow userspace to poison TCE table */ if (dir == DMA_NONE) @@ -110,9 +111,25 @@ long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce) if (iommu_tce_check_gpa(stt->page_shift, gpa)) return H_PARAMETER; + if (kvmppc_tce_to_ua(stt->kvm, tce, &ua, NULL)) + return H_TOO_HARD; + + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { + unsigned long hpa = 0; + struct mm_iommu_table_group_mem_t *mem; + long shift = stit->tbl->it_page_shift; + + mem = mm_iommu_lookup_rm(stt->kvm->mm, ua, 1ULL << shift); + if (!mem) + return H_TOO_HARD; + + if (mm_iommu_ua_to_hpa_rm(mem, ua, shift, &hpa)) + return H_TOO_HARD; + } + return H_SUCCESS; } -EXPORT_SYMBOL_GPL(kvmppc_tce_validate); +#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ /* Note on the use of page_address() in real mode, * @@ -164,10 +181,10 @@ void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt, } EXPORT_SYMBOL_GPL(kvmppc_tce_put); -long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, +long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce, unsigned long *ua, unsigned long **prmap) { - unsigned long gfn = gpa >> PAGE_SHIFT; + unsigned long gfn = tce >> PAGE_SHIFT; struct kvm_memory_slot *memslot; memslot = search_memslots(kvm_memslots(kvm), gfn); @@ -175,7 +192,7 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, return -EINVAL; *ua = __gfn_to_hva_memslot(memslot, gfn) | - (gpa & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE)); + (tce & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE)); #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE if (prmap) @@ -184,7 +201,7 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, return 0; } -EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua); +EXPORT_SYMBOL_GPL(kvmppc_tce_to_ua); #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl, @@ -197,7 +214,7 @@ static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl, if (!ret && ((*direction == DMA_FROM_DEVICE) || (*direction == DMA_BIDIRECTIONAL))) { - __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry); + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry); /* * kvmppc_rm_tce_iommu_do_map() updates the UA cache after * calling this so we still get here a valid UA. @@ -223,7 +240,7 @@ static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, { struct mm_iommu_table_group_mem_t *mem = NULL; const unsigned long pgsize = 1ULL << tbl->it_page_shift; - __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry); + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry); if (!pua) /* it_userspace allocation might be delayed */ @@ -287,7 +304,7 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, { long ret; unsigned long hpa = 0; - __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry); + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry); struct mm_iommu_table_group_mem_t *mem; if (!pua) @@ -300,10 +317,10 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, tbl->it_page_shift, &hpa))) - return H_HARDWARE; + return H_TOO_HARD; if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem))) - return H_CLOSED; + return H_TOO_HARD; ret = iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir); if (ret) { @@ -368,13 +385,12 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, if (ret != H_SUCCESS) return ret; - ret = kvmppc_tce_validate(stt, tce); + ret = kvmppc_rm_tce_validate(stt, tce); if (ret != H_SUCCESS) return ret; dir = iommu_tce_direction(tce); - if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm, - tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL)) + if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) return H_PARAMETER; entry = ioba >> stt->page_shift; @@ -387,14 +403,10 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt, stit->tbl, entry, ua, dir); - if (ret == H_SUCCESS) - continue; - - if (ret == H_TOO_HARD) + if (ret != H_SUCCESS) { + kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); return ret; - - WARN_ON_ONCE_RM(1); - kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); + } } kvmppc_tce_put(stt, entry, tce); @@ -480,7 +492,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, */ struct mm_iommu_table_group_mem_t *mem; - if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) + if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, NULL)) return H_TOO_HARD; mem = mm_iommu_lookup_rm(vcpu->kvm->mm, ua, IOMMU_PAGE_SIZE_4K); @@ -496,12 +508,12 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, * We do not require memory to be preregistered in this case * so lock rmap and do __find_linux_pte_or_hugepte(). */ - if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) + if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) return H_TOO_HARD; rmap = (void *) vmalloc_to_phys(rmap); if (WARN_ON_ONCE_RM(!rmap)) - return H_HARDWARE; + return H_TOO_HARD; /* * Synchronize with the MMU notifier callbacks in @@ -521,14 +533,16 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, for (i = 0; i < npages; ++i) { unsigned long tce = be64_to_cpu(((u64 *)tces)[i]); - ret = kvmppc_tce_validate(stt, tce); + ret = kvmppc_rm_tce_validate(stt, tce); if (ret != H_SUCCESS) goto unlock_exit; + } + + for (i = 0; i < npages; ++i) { + unsigned long tce = be64_to_cpu(((u64 *)tces)[i]); ua = 0; - if (kvmppc_gpa_to_ua(vcpu->kvm, - tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), - &ua, NULL)) + if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) return H_PARAMETER; list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { @@ -536,14 +550,11 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, stit->tbl, entry + i, ua, iommu_tce_direction(tce)); - if (ret == H_SUCCESS) - continue; - - if (ret == H_TOO_HARD) + if (ret != H_SUCCESS) { + kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, + entry); goto unlock_exit; - - WARN_ON_ONCE_RM(1); - kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); + } } kvmppc_tce_put(stt, entry + i, tce); diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c @@ -36,7 +36,6 @@ #define OP_31_XOP_MTSR 210 #define OP_31_XOP_MTSRIN 242 #define OP_31_XOP_TLBIEL 274 -#define OP_31_XOP_TLBIE 306 /* Opcode is officially reserved, reuse it as sc 1 when sc 1 doesn't trap */ #define OP_31_XOP_FAKE_SC1 308 #define OP_31_XOP_SLBMTE 402 @@ -110,7 +109,7 @@ static inline void kvmppc_copyto_vcpu_tm(struct kvm_vcpu *vcpu) vcpu->arch.ctr_tm = vcpu->arch.regs.ctr; vcpu->arch.tar_tm = vcpu->arch.tar; vcpu->arch.lr_tm = vcpu->arch.regs.link; - vcpu->arch.cr_tm = vcpu->arch.cr; + vcpu->arch.cr_tm = vcpu->arch.regs.ccr; vcpu->arch.xer_tm = vcpu->arch.regs.xer; vcpu->arch.vrsave_tm = vcpu->arch.vrsave; } @@ -129,7 +128,7 @@ static inline void kvmppc_copyfrom_vcpu_tm(struct kvm_vcpu *vcpu) vcpu->arch.regs.ctr = vcpu->arch.ctr_tm; vcpu->arch.tar = vcpu->arch.tar_tm; vcpu->arch.regs.link = vcpu->arch.lr_tm; - vcpu->arch.cr = vcpu->arch.cr_tm; + vcpu->arch.regs.ccr = vcpu->arch.cr_tm; vcpu->arch.regs.xer = vcpu->arch.xer_tm; vcpu->arch.vrsave = vcpu->arch.vrsave_tm; } @@ -141,7 +140,7 @@ static void kvmppc_emulate_treclaim(struct kvm_vcpu *vcpu, int ra_val) uint64_t texasr; /* CR0 = 0 | MSR[TS] | 0 */ - vcpu->arch.cr = (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)) | + vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT)) | (((guest_msr & MSR_TS_MASK) >> (MSR_TS_S_LG - 1)) << CR0_SHIFT); @@ -220,7 +219,7 @@ void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val) tm_abort(ra_val); /* CR0 = 0 | MSR[TS] | 0 */ - vcpu->arch.cr = (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)) | + vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT)) | (((guest_msr & MSR_TS_MASK) >> (MSR_TS_S_LG - 1)) << CR0_SHIFT); @@ -494,8 +493,8 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, if (!(kvmppc_get_msr(vcpu) & MSR_PR)) { preempt_disable(); - vcpu->arch.cr = (CR0_TBEGIN_FAILURE | - (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT))); + vcpu->arch.regs.ccr = (CR0_TBEGIN_FAILURE | + (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT))); vcpu->arch.texasr = (TEXASR_FS | TEXASR_EXACT | (((u64)(TM_CAUSE_EMULATE | TM_CAUSE_PERSISTENT)) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c @@ -50,6 +50,7 @@ #include <asm/reg.h> #include <asm/ppc-opcode.h> #include <asm/asm-prototypes.h> +#include <asm/archrandom.h> #include <asm/debug.h> #include <asm/disassemble.h> #include <asm/cputable.h> @@ -104,6 +105,10 @@ static bool indep_threads_mode = true; module_param(indep_threads_mode, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(indep_threads_mode, "Independent-threads mode (only on POWER9)"); +static bool one_vm_per_core; +module_param(one_vm_per_core, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(one_vm_per_core, "Only run vCPUs from the same VM on a core (requires indep_threads_mode=N)"); + #ifdef CONFIG_KVM_XICS static struct kernel_param_ops module_param_ops = { .set = param_set_int, @@ -117,6 +122,16 @@ module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, 0644); MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core"); #endif +/* If set, guests are allowed to create and control nested guests */ +static bool nested = true; +module_param(nested, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(nested, "Enable nested virtualization (only on POWER9)"); + +static inline bool nesting_enabled(struct kvm *kvm) +{ + return kvm->arch.nested_enable && kvm_is_radix(kvm); +} + /* If set, the threads on each CPU core have to be in the same MMU mode */ static bool no_mixing_hpt_and_radix; @@ -173,6 +188,10 @@ static bool kvmppc_ipi_thread(int cpu) { unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); + /* If we're a nested hypervisor, fall back to ordinary IPIs for now */ + if (kvmhv_on_pseries()) + return false; + /* On POWER9 we can use msgsnd to IPI any cpu */ if (cpu_has_feature(CPU_FTR_ARCH_300)) { msg |= get_hard_smp_processor_id(cpu); @@ -410,8 +429,8 @@ static void kvmppc_dump_regs(struct kvm_vcpu *vcpu) vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1); pr_err("sprg2 = %.16llx sprg3 = %.16llx\n", vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3); - pr_err("cr = %.8x xer = %.16lx dsisr = %.8x\n", - vcpu->arch.cr, vcpu->arch.regs.xer, vcpu->arch.shregs.dsisr); + pr_err("cr = %.8lx xer = %.16lx dsisr = %.8x\n", + vcpu->arch.regs.ccr, vcpu->arch.regs.xer, vcpu->arch.shregs.dsisr); pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar); pr_err("fault dar = %.16lx dsisr = %.8x\n", vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); @@ -730,8 +749,7 @@ static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu) /* * Ensure that the read of vcore->dpdes comes after the read * of vcpu->doorbell_request. This barrier matches the - * lwsync in book3s_hv_rmhandlers.S just before the - * fast_guest_return label. + * smb_wmb() in kvmppc_guest_entry_inject(). */ smp_rmb(); vc = vcpu->arch.vcore; @@ -912,6 +930,19 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) break; } return RESUME_HOST; + case H_SET_DABR: + ret = kvmppc_h_set_dabr(vcpu, kvmppc_get_gpr(vcpu, 4)); + break; + case H_SET_XDABR: + ret = kvmppc_h_set_xdabr(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5)); + break; + case H_GET_TCE: + ret = kvmppc_h_get_tce(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5)); + if (ret == H_TOO_HARD) + return RESUME_HOST; + break; case H_PUT_TCE: ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4), kvmppc_get_gpr(vcpu, 5), @@ -935,6 +966,32 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) if (ret == H_TOO_HARD) return RESUME_HOST; break; + case H_RANDOM: + if (!powernv_get_random_long(&vcpu->arch.regs.gpr[4])) + ret = H_HARDWARE; + break; + + case H_SET_PARTITION_TABLE: + ret = H_FUNCTION; + if (nesting_enabled(vcpu->kvm)) + ret = kvmhv_set_partition_table(vcpu); + break; + case H_ENTER_NESTED: + ret = H_FUNCTION; + if (!nesting_enabled(vcpu->kvm)) + break; + ret = kvmhv_enter_nested_guest(vcpu); + if (ret == H_INTERRUPT) { + kvmppc_set_gpr(vcpu, 3, 0); + return -EINTR; + } + break; + case H_TLB_INVALIDATE: + ret = H_FUNCTION; + if (nesting_enabled(vcpu->kvm)) + ret = kvmhv_do_nested_tlbie(vcpu); + break; + default: return RESUME_HOST; } @@ -943,6 +1000,24 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) return RESUME_GUEST; } +/* + * Handle H_CEDE in the nested virtualization case where we haven't + * called the real-mode hcall handlers in book3s_hv_rmhandlers.S. + * This has to be done early, not in kvmppc_pseries_do_hcall(), so + * that the cede logic in kvmppc_run_single_vcpu() works properly. + */ +static void kvmppc_nested_cede(struct kvm_vcpu *vcpu) +{ + vcpu->arch.shregs.msr |= MSR_EE; + vcpu->arch.ceded = 1; + smp_mb(); + if (vcpu->arch.prodded) { + vcpu->arch.prodded = 0; + smp_mb(); + vcpu->arch.ceded = 0; + } +} + static int kvmppc_hcall_impl_hv(unsigned long cmd) { switch (cmd) { @@ -1085,7 +1160,6 @@ static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu) return RESUME_GUEST; } -/* Called with vcpu->arch.vcore->lock held */ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, struct task_struct *tsk) { @@ -1190,7 +1264,10 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, break; case BOOK3S_INTERRUPT_H_INST_STORAGE: vcpu->arch.fault_dar = kvmppc_get_pc(vcpu); - vcpu->arch.fault_dsisr = 0; + vcpu->arch.fault_dsisr = vcpu->arch.shregs.msr & + DSISR_SRR1_MATCH_64S; + if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE) + vcpu->arch.fault_dsisr |= DSISR_ISSTORE; r = RESUME_PAGE_FAULT; break; /* @@ -1206,10 +1283,7 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, swab32(vcpu->arch.emul_inst) : vcpu->arch.emul_inst; if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) { - /* Need vcore unlocked to call kvmppc_get_last_inst */ - spin_unlock(&vcpu->arch.vcore->lock); r = kvmppc_emulate_debug_inst(run, vcpu); - spin_lock(&vcpu->arch.vcore->lock); } else { kvmppc_core_queue_program(vcpu, SRR1_PROGILL); r = RESUME_GUEST; @@ -1225,12 +1299,8 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: r = EMULATE_FAIL; if (((vcpu->arch.hfscr >> 56) == FSCR_MSGP_LG) && - cpu_has_feature(CPU_FTR_ARCH_300)) { - /* Need vcore unlocked to call kvmppc_get_last_inst */ - spin_unlock(&vcpu->arch.vcore->lock); + cpu_has_feature(CPU_FTR_ARCH_300)) r = kvmppc_emulate_doorbell_instr(vcpu); - spin_lock(&vcpu->arch.vcore->lock); - } if (r == EMULATE_FAIL) { kvmppc_core_queue_program(vcpu, SRR1_PROGILL); r = RESUME_GUEST; @@ -1265,6 +1335,104 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, return r; } +static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu) +{ + int r; + int srcu_idx; + + vcpu->stat.sum_exits++; + + /* + * This can happen if an interrupt occurs in the last stages + * of guest entry or the first stages of guest exit (i.e. after + * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV + * and before setting it to KVM_GUEST_MODE_HOST_HV). + * That can happen due to a bug, or due to a machine check + * occurring at just the wrong time. + */ + if (vcpu->arch.shregs.msr & MSR_HV) { + pr_emerg("KVM trap in HV mode while nested!\n"); + pr_emerg("trap=0x%x | pc=0x%lx | msr=0x%llx\n", + vcpu->arch.trap, kvmppc_get_pc(vcpu), + vcpu->arch.shregs.msr); + kvmppc_dump_regs(vcpu); + return RESUME_HOST; + } + switch (vcpu->arch.trap) { + /* We're good on these - the host merely wanted to get our attention */ + case BOOK3S_INTERRUPT_HV_DECREMENTER: + vcpu->stat.dec_exits++; + r = RESUME_GUEST; + break; + case BOOK3S_INTERRUPT_EXTERNAL: + vcpu->stat.ext_intr_exits++; + r = RESUME_HOST; + break; + case BOOK3S_INTERRUPT_H_DOORBELL: + case BOOK3S_INTERRUPT_H_VIRT: + vcpu->stat.ext_intr_exits++; + r = RESUME_GUEST; + break; + /* SR/HMI/PMI are HV interrupts that host has handled. Resume guest.*/ + case BOOK3S_INTERRUPT_HMI: + case BOOK3S_INTERRUPT_PERFMON: + case BOOK3S_INTERRUPT_SYSTEM_RESET: + r = RESUME_GUEST; + break; + case BOOK3S_INTERRUPT_MACHINE_CHECK: + /* Pass the machine check to the L1 guest */ + r = RESUME_HOST; + /* Print the MCE event to host console. */ + machine_check_print_event_info(&vcpu->arch.mce_evt, false); + break; + /* + * We get these next two if the guest accesses a page which it thinks + * it has mapped but which is not actually present, either because + * it is for an emulated I/O device or because the corresonding + * host page has been paged out. + */ + case BOOK3S_INTERRUPT_H_DATA_STORAGE: + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + r = kvmhv_nested_page_fault(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); + break; + case BOOK3S_INTERRUPT_H_INST_STORAGE: + vcpu->arch.fault_dar = kvmppc_get_pc(vcpu); + vcpu->arch.fault_dsisr = kvmppc_get_msr(vcpu) & + DSISR_SRR1_MATCH_64S; + if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE) + vcpu->arch.fault_dsisr |= DSISR_ISSTORE; + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + r = kvmhv_nested_page_fault(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); + break; + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + case BOOK3S_INTERRUPT_HV_SOFTPATCH: + /* + * This occurs for various TM-related instructions that + * we need to emulate on POWER9 DD2.2. We have already + * handled the cases where the guest was in real-suspend + * mode and was transitioning to transactional state. + */ + r = kvmhv_p9_tm_emulation(vcpu); + break; +#endif + + case BOOK3S_INTERRUPT_HV_RM_HARD: + vcpu->arch.trap = 0; + r = RESUME_GUEST; + if (!xive_enabled()) + kvmppc_xics_rm_complete(vcpu, 0); + break; + default: + r = RESUME_HOST; + break; + } + + return r; +} + static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { @@ -1555,6 +1723,9 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_ONLINE: *val = get_reg_val(id, vcpu->arch.online); break; + case KVM_REG_PPC_PTCR: + *val = get_reg_val(id, vcpu->kvm->arch.l1_ptcr); + break; default: r = -EINVAL; break; @@ -1786,6 +1957,9 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, atomic_dec(&vcpu->arch.vcore->online_count); vcpu->arch.online = i; break; + case KVM_REG_PPC_PTCR: + vcpu->kvm->arch.l1_ptcr = set_reg_val(id, *val); + break; default: r = -EINVAL; break; @@ -2019,15 +2193,18 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, * Set the default HFSCR for the guest from the host value. * This value is only used on POWER9. * On POWER9, we want to virtualize the doorbell facility, so we - * turn off the HFSCR bit, which causes those instructions to trap. + * don't set the HFSCR_MSGP bit, and that causes those instructions + * to trap and then we emulate them. */ - vcpu->arch.hfscr = mfspr(SPRN_HFSCR); - if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) + vcpu->arch.hfscr = HFSCR_TAR | HFSCR_EBB | HFSCR_PM | HFSCR_BHRB | + HFSCR_DSCR | HFSCR_VECVSX | HFSCR_FP; + if (cpu_has_feature(CPU_FTR_HVMODE)) { + vcpu->arch.hfscr &= mfspr(SPRN_HFSCR); + if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) + vcpu->arch.hfscr |= HFSCR_TM; + } + if (cpu_has_feature(CPU_FTR_TM_COMP)) vcpu->arch.hfscr |= HFSCR_TM; - else if (!cpu_has_feature(CPU_FTR_TM_COMP)) - vcpu->arch.hfscr &= ~HFSCR_TM; - if (cpu_has_feature(CPU_FTR_ARCH_300)) - vcpu->arch.hfscr &= ~HFSCR_MSGP; kvmppc_mmu_book3s_hv_init(vcpu); @@ -2242,10 +2419,18 @@ static void kvmppc_release_hwthread(int cpu) static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu) { + struct kvm_nested_guest *nested = vcpu->arch.nested; + cpumask_t *cpu_in_guest; int i; cpu = cpu_first_thread_sibling(cpu); - cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush); + if (nested) { + cpumask_set_cpu(cpu, &nested->need_tlb_flush); + cpu_in_guest = &nested->cpu_in_guest; + } else { + cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush); + cpu_in_guest = &kvm->arch.cpu_in_guest; + } /* * Make sure setting of bit in need_tlb_flush precedes * testing of cpu_in_guest bits. The matching barrier on @@ -2253,13 +2438,23 @@ static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu) */ smp_mb(); for (i = 0; i < threads_per_core; ++i) - if (cpumask_test_cpu(cpu + i, &kvm->arch.cpu_in_guest)) + if (cpumask_test_cpu(cpu + i, cpu_in_guest)) smp_call_function_single(cpu + i, do_nothing, NULL, 1); } static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu) { + struct kvm_nested_guest *nested = vcpu->arch.nested; struct kvm *kvm = vcpu->kvm; + int prev_cpu; + + if (!cpu_has_feature(CPU_FTR_HVMODE)) + return; + + if (nested) + prev_cpu = nested->prev_cpu[vcpu->arch.nested_vcpu_id]; + else + prev_cpu = vcpu->arch.prev_cpu; /* * With radix, the guest can do TLB invalidations itself, @@ -2273,12 +2468,46 @@ static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu) * ran to flush the TLB. The TLB is shared between threads, * so we use a single bit in .need_tlb_flush for all 4 threads. */ - if (vcpu->arch.prev_cpu != pcpu) { - if (vcpu->arch.prev_cpu >= 0 && - cpu_first_thread_sibling(vcpu->arch.prev_cpu) != + if (prev_cpu != pcpu) { + if (prev_cpu >= 0 && + cpu_first_thread_sibling(prev_cpu) != cpu_first_thread_sibling(pcpu)) - radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu); - vcpu->arch.prev_cpu = pcpu; + radix_flush_cpu(kvm, prev_cpu, vcpu); + if (nested) + nested->prev_cpu[vcpu->arch.nested_vcpu_id] = pcpu; + else + vcpu->arch.prev_cpu = pcpu; + } +} + +static void kvmppc_radix_check_need_tlb_flush(struct kvm *kvm, int pcpu, + struct kvm_nested_guest *nested) +{ + cpumask_t *need_tlb_flush; + int lpid; + + if (!cpu_has_feature(CPU_FTR_HVMODE)) + return; + + if (cpu_has_feature(CPU_FTR_ARCH_300)) + pcpu &= ~0x3UL; + + if (nested) { + lpid = nested->shadow_lpid; + need_tlb_flush = &nested->need_tlb_flush; + } else { + lpid = kvm->arch.lpid; + need_tlb_flush = &kvm->arch.need_tlb_flush; + } + + mtspr(SPRN_LPID, lpid); + isync(); + smp_mb(); + + if (cpumask_test_cpu(pcpu, need_tlb_flush)) { + radix__local_flush_tlb_lpid_guest(lpid); + /* Clear the bit after the TLB flush */ + cpumask_clear_cpu(pcpu, need_tlb_flush); } } @@ -2493,6 +2722,10 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip) if (!cpu_has_feature(CPU_FTR_ARCH_207S)) return false; + /* In one_vm_per_core mode, require all vcores to be from the same vm */ + if (one_vm_per_core && vc->kvm != cip->vc[0]->kvm) + return false; + /* Some POWER9 chips require all threads to be in the same MMU mode */ if (no_mixing_hpt_and_radix && kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm)) @@ -2600,6 +2833,14 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master) spin_lock(&vc->lock); now = get_tb(); for_each_runnable_thread(i, vcpu, vc) { + /* + * It's safe to unlock the vcore in the loop here, because + * for_each_runnable_thread() is safe against removal of + * the vcpu, and the vcore state is VCORE_EXITING here, + * so any vcpus becoming runnable will have their arch.trap + * set to zero and can't actually run in the guest. + */ + spin_unlock(&vc->lock); /* cancel pending dec exception if dec is positive */ if (now < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu)) @@ -2615,6 +2856,7 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master) vcpu->arch.ret = ret; vcpu->arch.trap = 0; + spin_lock(&vc->lock); if (is_kvmppc_resume_guest(vcpu->arch.ret)) { if (vcpu->arch.pending_exceptions) kvmppc_core_prepare_to_enter(vcpu); @@ -2963,8 +3205,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) spin_unlock(&core_info.vc[sub]->lock); if (kvm_is_radix(vc->kvm)) { - int tmp = pcpu; - /* * Do we need to flush the process scoped TLB for the LPAR? * @@ -2975,17 +3215,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) * * Hash must be flushed in realmode in order to use tlbiel. */ - mtspr(SPRN_LPID, vc->kvm->arch.lpid); - isync(); - - if (cpu_has_feature(CPU_FTR_ARCH_300)) - tmp &= ~0x3UL; - - if (cpumask_test_cpu(tmp, &vc->kvm->arch.need_tlb_flush)) { - radix__local_flush_tlb_lpid_guest(vc->kvm->arch.lpid); - /* Clear the bit after the TLB flush */ - cpumask_clear_cpu(tmp, &vc->kvm->arch.need_tlb_flush); - } + kvmppc_radix_check_need_tlb_flush(vc->kvm, pcpu, NULL); } /* @@ -3080,6 +3310,300 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) } /* + * Load up hypervisor-mode registers on P9. + */ +static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, + unsigned long lpcr) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + s64 hdec; + u64 tb, purr, spurr; + int trap; + unsigned long host_hfscr = mfspr(SPRN_HFSCR); + unsigned long host_ciabr = mfspr(SPRN_CIABR); + unsigned long host_dawr = mfspr(SPRN_DAWR); + unsigned long host_dawrx = mfspr(SPRN_DAWRX); + unsigned long host_psscr = mfspr(SPRN_PSSCR); + unsigned long host_pidr = mfspr(SPRN_PID); + + hdec = time_limit - mftb(); + if (hdec < 0) + return BOOK3S_INTERRUPT_HV_DECREMENTER; + mtspr(SPRN_HDEC, hdec); + + if (vc->tb_offset) { + u64 new_tb = mftb() + vc->tb_offset; + mtspr(SPRN_TBU40, new_tb); + tb = mftb(); + if ((tb & 0xffffff) < (new_tb & 0xffffff)) + mtspr(SPRN_TBU40, new_tb + 0x1000000); + vc->tb_offset_applied = vc->tb_offset; + } + + if (vc->pcr) + mtspr(SPRN_PCR, vc->pcr); + mtspr(SPRN_DPDES, vc->dpdes); + mtspr(SPRN_VTB, vc->vtb); + + local_paca->kvm_hstate.host_purr = mfspr(SPRN_PURR); + local_paca->kvm_hstate.host_spurr = mfspr(SPRN_SPURR); + mtspr(SPRN_PURR, vcpu->arch.purr); + mtspr(SPRN_SPURR, vcpu->arch.spurr); + + if (cpu_has_feature(CPU_FTR_DAWR)) { + mtspr(SPRN_DAWR, vcpu->arch.dawr); + mtspr(SPRN_DAWRX, vcpu->arch.dawrx); + } + mtspr(SPRN_CIABR, vcpu->arch.ciabr); + mtspr(SPRN_IC, vcpu->arch.ic); + mtspr(SPRN_PID, vcpu->arch.pid); + + mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC | + (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG)); + + mtspr(SPRN_HFSCR, vcpu->arch.hfscr); + + mtspr(SPRN_SPRG0, vcpu->arch.shregs.sprg0); + mtspr(SPRN_SPRG1, vcpu->arch.shregs.sprg1); + mtspr(SPRN_SPRG2, vcpu->arch.shregs.sprg2); + mtspr(SPRN_SPRG3, vcpu->arch.shregs.sprg3); + + mtspr(SPRN_AMOR, ~0UL); + + mtspr(SPRN_LPCR, lpcr); + isync(); + + kvmppc_xive_push_vcpu(vcpu); + + mtspr(SPRN_SRR0, vcpu->arch.shregs.srr0); + mtspr(SPRN_SRR1, vcpu->arch.shregs.srr1); + + trap = __kvmhv_vcpu_entry_p9(vcpu); + + /* Advance host PURR/SPURR by the amount used by guest */ + purr = mfspr(SPRN_PURR); + spurr = mfspr(SPRN_SPURR); + mtspr(SPRN_PURR, local_paca->kvm_hstate.host_purr + + purr - vcpu->arch.purr); + mtspr(SPRN_SPURR, local_paca->kvm_hstate.host_spurr + + spurr - vcpu->arch.spurr); + vcpu->arch.purr = purr; + vcpu->arch.spurr = spurr; + + vcpu->arch.ic = mfspr(SPRN_IC); + vcpu->arch.pid = mfspr(SPRN_PID); + vcpu->arch.psscr = mfspr(SPRN_PSSCR) & PSSCR_GUEST_VIS; + + vcpu->arch.shregs.sprg0 = mfspr(SPRN_SPRG0); + vcpu->arch.shregs.sprg1 = mfspr(SPRN_SPRG1); + vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2); + vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3); + + mtspr(SPRN_PSSCR, host_psscr); + mtspr(SPRN_HFSCR, host_hfscr); + mtspr(SPRN_CIABR, host_ciabr); + mtspr(SPRN_DAWR, host_dawr); + mtspr(SPRN_DAWRX, host_dawrx); + mtspr(SPRN_PID, host_pidr); + + /* + * Since this is radix, do a eieio; tlbsync; ptesync sequence in + * case we interrupted the guest between a tlbie and a ptesync. + */ + asm volatile("eieio; tlbsync; ptesync"); + + mtspr(SPRN_LPID, vcpu->kvm->arch.host_lpid); /* restore host LPID */ + isync(); + + vc->dpdes = mfspr(SPRN_DPDES); + vc->vtb = mfspr(SPRN_VTB); + mtspr(SPRN_DPDES, 0); + if (vc->pcr) + mtspr(SPRN_PCR, 0); + + if (vc->tb_offset_applied) { + u64 new_tb = mftb() - vc->tb_offset_applied; + mtspr(SPRN_TBU40, new_tb); + tb = mftb(); + if ((tb & 0xffffff) < (new_tb & 0xffffff)) + mtspr(SPRN_TBU40, new_tb + 0x1000000); + vc->tb_offset_applied = 0; + } + + mtspr(SPRN_HDEC, 0x7fffffff); + mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr); + + return trap; +} + +/* + * Virtual-mode guest entry for POWER9 and later when the host and + * guest are both using the radix MMU. The LPIDR has already been set. + */ +int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, + unsigned long lpcr) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + unsigned long host_dscr = mfspr(SPRN_DSCR); + unsigned long host_tidr = mfspr(SPRN_TIDR); + unsigned long host_iamr = mfspr(SPRN_IAMR); + s64 dec; + u64 tb; + int trap, save_pmu; + + dec = mfspr(SPRN_DEC); + tb = mftb(); + if (dec < 512) + return BOOK3S_INTERRUPT_HV_DECREMENTER; + local_paca->kvm_hstate.dec_expires = dec + tb; + if (local_paca->kvm_hstate.dec_expires < time_limit) + time_limit = local_paca->kvm_hstate.dec_expires; + + vcpu->arch.ceded = 0; + + kvmhv_save_host_pmu(); /* saves it to PACA kvm_hstate */ + + kvmppc_subcore_enter_guest(); + + vc->entry_exit_map = 1; + vc->in_guest = 1; + + if (vcpu->arch.vpa.pinned_addr) { + struct lppaca *lp = vcpu->arch.vpa.pinned_addr; + u32 yield_count = be32_to_cpu(lp->yield_count) + 1; + lp->yield_count = cpu_to_be32(yield_count); + vcpu->arch.vpa.dirty = 1; + } + + if (cpu_has_feature(CPU_FTR_TM) || + cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) + kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true); + + kvmhv_load_guest_pmu(vcpu); + + msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX); + load_fp_state(&vcpu->arch.fp); +#ifdef CONFIG_ALTIVEC + load_vr_state(&vcpu->arch.vr); +#endif + + mtspr(SPRN_DSCR, vcpu->arch.dscr); + mtspr(SPRN_IAMR, vcpu->arch.iamr); + mtspr(SPRN_PSPB, vcpu->arch.pspb); + mtspr(SPRN_FSCR, vcpu->arch.fscr); + mtspr(SPRN_TAR, vcpu->arch.tar); + mtspr(SPRN_EBBHR, vcpu->arch.ebbhr); + mtspr(SPRN_EBBRR, vcpu->arch.ebbrr); + mtspr(SPRN_BESCR, vcpu->arch.bescr); + mtspr(SPRN_WORT, vcpu->arch.wort); + mtspr(SPRN_TIDR, vcpu->arch.tid); + mtspr(SPRN_DAR, vcpu->arch.shregs.dar); + mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr); + mtspr(SPRN_AMR, vcpu->arch.amr); + mtspr(SPRN_UAMOR, vcpu->arch.uamor); + + if (!(vcpu->arch.ctrl & 1)) + mtspr(SPRN_CTRLT, mfspr(SPRN_CTRLF) & ~1); + + mtspr(SPRN_DEC, vcpu->arch.dec_expires - mftb()); + + if (kvmhv_on_pseries()) { + /* call our hypervisor to load up HV regs and go */ + struct hv_guest_state hvregs; + + kvmhv_save_hv_regs(vcpu, &hvregs); + hvregs.lpcr = lpcr; + vcpu->arch.regs.msr = vcpu->arch.shregs.msr; + hvregs.version = HV_GUEST_STATE_VERSION; + if (vcpu->arch.nested) { + hvregs.lpid = vcpu->arch.nested->shadow_lpid; + hvregs.vcpu_token = vcpu->arch.nested_vcpu_id; + } else { + hvregs.lpid = vcpu->kvm->arch.lpid; + hvregs.vcpu_token = vcpu->vcpu_id; + } + hvregs.hdec_expiry = time_limit; + trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs), + __pa(&vcpu->arch.regs)); + kvmhv_restore_hv_return_state(vcpu, &hvregs); + vcpu->arch.shregs.msr = vcpu->arch.regs.msr; + vcpu->arch.shregs.dar = mfspr(SPRN_DAR); + vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR); + + /* H_CEDE has to be handled now, not later */ + if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested && + kvmppc_get_gpr(vcpu, 3) == H_CEDE) { + kvmppc_nested_cede(vcpu); + trap = 0; + } + } else { + trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit, lpcr); + } + + vcpu->arch.slb_max = 0; + dec = mfspr(SPRN_DEC); + tb = mftb(); + vcpu->arch.dec_expires = dec + tb; + vcpu->cpu = -1; + vcpu->arch.thread_cpu = -1; + vcpu->arch.ctrl = mfspr(SPRN_CTRLF); + + vcpu->arch.iamr = mfspr(SPRN_IAMR); + vcpu->arch.pspb = mfspr(SPRN_PSPB); + vcpu->arch.fscr = mfspr(SPRN_FSCR); + vcpu->arch.tar = mfspr(SPRN_TAR); + vcpu->arch.ebbhr = mfspr(SPRN_EBBHR); + vcpu->arch.ebbrr = mfspr(SPRN_EBBRR); + vcpu->arch.bescr = mfspr(SPRN_BESCR); + vcpu->arch.wort = mfspr(SPRN_WORT); + vcpu->arch.tid = mfspr(SPRN_TIDR); + vcpu->arch.amr = mfspr(SPRN_AMR); + vcpu->arch.uamor = mfspr(SPRN_UAMOR); + vcpu->arch.dscr = mfspr(SPRN_DSCR); + + mtspr(SPRN_PSPB, 0); + mtspr(SPRN_WORT, 0); + mtspr(SPRN_AMR, 0); + mtspr(SPRN_UAMOR, 0); + mtspr(SPRN_DSCR, host_dscr); + mtspr(SPRN_TIDR, host_tidr); + mtspr(SPRN_IAMR, host_iamr); + mtspr(SPRN_PSPB, 0); + + msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX); + store_fp_state(&vcpu->arch.fp); +#ifdef CONFIG_ALTIVEC + store_vr_state(&vcpu->arch.vr); +#endif + + if (cpu_has_feature(CPU_FTR_TM) || + cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) + kvmppc_save_tm_hv(vcpu, vcpu->arch.shregs.msr, true); + + save_pmu = 1; + if (vcpu->arch.vpa.pinned_addr) { + struct lppaca *lp = vcpu->arch.vpa.pinned_addr; + u32 yield_count = be32_to_cpu(lp->yield_count) + 1; + lp->yield_count = cpu_to_be32(yield_count); + vcpu->arch.vpa.dirty = 1; + save_pmu = lp->pmcregs_in_use; + } + + kvmhv_save_guest_pmu(vcpu, save_pmu); + + vc->entry_exit_map = 0x101; + vc->in_guest = 0; + + mtspr(SPRN_DEC, local_paca->kvm_hstate.dec_expires - mftb()); + + kvmhv_load_host_pmu(); + + kvmppc_subcore_exit_guest(); + + return trap; +} + +/* * Wait for some other vcpu thread to execute us, and * wake us up when we need to handle something in the host. */ @@ -3256,6 +3780,11 @@ out: trace_kvmppc_vcore_wakeup(do_sleep, block_ns); } +/* + * This never fails for a radix guest, as none of the operations it does + * for a radix guest can fail or have a way to report failure. + * kvmhv_run_single_vcpu() relies on this fact. + */ static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu) { int r = 0; @@ -3405,6 +3934,171 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) return vcpu->arch.ret; } +int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, + struct kvm_vcpu *vcpu, u64 time_limit, + unsigned long lpcr) +{ + int trap, r, pcpu; + int srcu_idx; + struct kvmppc_vcore *vc; + struct kvm *kvm = vcpu->kvm; + struct kvm_nested_guest *nested = vcpu->arch.nested; + + trace_kvmppc_run_vcpu_enter(vcpu); + + kvm_run->exit_reason = 0; + vcpu->arch.ret = RESUME_GUEST; + vcpu->arch.trap = 0; + + vc = vcpu->arch.vcore; + vcpu->arch.ceded = 0; + vcpu->arch.run_task = current; + vcpu->arch.kvm_run = kvm_run; + vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb()); + vcpu->arch.state = KVMPPC_VCPU_RUNNABLE; + vcpu->arch.busy_preempt = TB_NIL; + vcpu->arch.last_inst = KVM_INST_FETCH_FAILED; + vc->runnable_threads[0] = vcpu; + vc->n_runnable = 1; + vc->runner = vcpu; + + /* See if the MMU is ready to go */ + if (!kvm->arch.mmu_ready) + kvmhv_setup_mmu(vcpu); + + if (need_resched()) + cond_resched(); + + kvmppc_update_vpas(vcpu); + + init_vcore_to_run(vc); + vc->preempt_tb = TB_NIL; + + preempt_disable(); + pcpu = smp_processor_id(); + vc->pcpu = pcpu; + kvmppc_prepare_radix_vcpu(vcpu, pcpu); + + local_irq_disable(); + hard_irq_disable(); + if (signal_pending(current)) + goto sigpend; + if (lazy_irq_pending() || need_resched() || !kvm->arch.mmu_ready) + goto out; + + if (!nested) { + kvmppc_core_prepare_to_enter(vcpu); + if (vcpu->arch.doorbell_request) { + vc->dpdes = 1; + smp_wmb(); + vcpu->arch.doorbell_request = 0; + } + if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, + &vcpu->arch.pending_exceptions)) + lpcr |= LPCR_MER; + } else if (vcpu->arch.pending_exceptions || + vcpu->arch.doorbell_request || + xive_interrupt_pending(vcpu)) { + vcpu->arch.ret = RESUME_HOST; + goto out; + } + + kvmppc_clear_host_core(pcpu); + + local_paca->kvm_hstate.tid = 0; + local_paca->kvm_hstate.napping = 0; + local_paca->kvm_hstate.kvm_split_mode = NULL; + kvmppc_start_thread(vcpu, vc); + kvmppc_create_dtl_entry(vcpu, vc); + trace_kvm_guest_enter(vcpu); + + vc->vcore_state = VCORE_RUNNING; + trace_kvmppc_run_core(vc, 0); + + if (cpu_has_feature(CPU_FTR_HVMODE)) + kvmppc_radix_check_need_tlb_flush(kvm, pcpu, nested); + + trace_hardirqs_on(); + guest_enter_irqoff(); + + srcu_idx = srcu_read_lock(&kvm->srcu); + + this_cpu_disable_ftrace(); + + trap = kvmhv_p9_guest_entry(vcpu, time_limit, lpcr); + vcpu->arch.trap = trap; + + this_cpu_enable_ftrace(); + + srcu_read_unlock(&kvm->srcu, srcu_idx); + + if (cpu_has_feature(CPU_FTR_HVMODE)) { + mtspr(SPRN_LPID, kvm->arch.host_lpid); + isync(); + } + + trace_hardirqs_off(); + set_irq_happened(trap); + + kvmppc_set_host_core(pcpu); + + local_irq_enable(); + guest_exit(); + + cpumask_clear_cpu(pcpu, &kvm->arch.cpu_in_guest); + + preempt_enable(); + + /* cancel pending decrementer exception if DEC is now positive */ + if (get_tb() < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu)) + kvmppc_core_dequeue_dec(vcpu); + + trace_kvm_guest_exit(vcpu); + r = RESUME_GUEST; + if (trap) { + if (!nested) + r = kvmppc_handle_exit_hv(kvm_run, vcpu, current); + else + r = kvmppc_handle_nested_exit(vcpu); + } + vcpu->arch.ret = r; + + if (is_kvmppc_resume_guest(r) && vcpu->arch.ceded && + !kvmppc_vcpu_woken(vcpu)) { + kvmppc_set_timer(vcpu); + while (vcpu->arch.ceded && !kvmppc_vcpu_woken(vcpu)) { + if (signal_pending(current)) { + vcpu->stat.signal_exits++; + kvm_run->exit_reason = KVM_EXIT_INTR; + vcpu->arch.ret = -EINTR; + break; + } + spin_lock(&vc->lock); + kvmppc_vcore_blocked(vc); + spin_unlock(&vc->lock); + } + } + vcpu->arch.ceded = 0; + + vc->vcore_state = VCORE_INACTIVE; + trace_kvmppc_run_core(vc, 1); + + done: + kvmppc_remove_runnable(vc, vcpu); + trace_kvmppc_run_vcpu_exit(vcpu, kvm_run); + + return vcpu->arch.ret; + + sigpend: + vcpu->stat.signal_exits++; + kvm_run->exit_reason = KVM_EXIT_INTR; + vcpu->arch.ret = -EINTR; + out: + local_irq_enable(); + preempt_enable(); + goto done; +} + static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) { int r; @@ -3480,7 +4174,20 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; do { - r = kvmppc_run_vcpu(run, vcpu); + /* + * The early POWER9 chips that can't mix radix and HPT threads + * on the same core also need the workaround for the problem + * where the TLB would prefetch entries in the guest exit path + * for radix guests using the guest PIDR value and LPID 0. + * The workaround is in the old path (kvmppc_run_vcpu()) + * but not the new path (kvmhv_run_single_vcpu()). + */ + if (kvm->arch.threads_indep && kvm_is_radix(kvm) && + !no_mixing_hpt_and_radix) + r = kvmhv_run_single_vcpu(run, vcpu, ~(u64)0, + vcpu->arch.vcore->lpcr); + else + r = kvmppc_run_vcpu(run, vcpu); if (run->exit_reason == KVM_EXIT_PAPR_HCALL && !(vcpu->arch.shregs.msr & MSR_PR)) { @@ -3559,6 +4266,10 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm, kvmppc_add_seg_page_size(&sps, 16, SLB_VSID_L | SLB_VSID_LP_01); kvmppc_add_seg_page_size(&sps, 24, SLB_VSID_L); + /* If running as a nested hypervisor, we don't support HPT guests */ + if (kvmhv_on_pseries()) + info->flags |= KVM_PPC_NO_HASH; + return 0; } @@ -3723,8 +4434,7 @@ void kvmppc_setup_partition_table(struct kvm *kvm) __pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE; dw1 = PATB_GR | kvm->arch.process_table; } - - mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1); + kvmhv_set_ptbl_entry(kvm->arch.lpid, dw0, dw1); } /* @@ -3820,6 +4530,8 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) /* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm) { + if (nesting_enabled(kvm)) + kvmhv_release_all_nested(kvm); kvmppc_free_radix(kvm); kvmppc_update_lpcr(kvm, LPCR_VPM1, LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR); @@ -3841,6 +4553,7 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm) kvmppc_free_hpt(&kvm->arch.hpt); kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR, LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR); + kvmppc_rmap_reset(kvm); kvm->arch.radix = 1; return 0; } @@ -3940,6 +4653,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) kvmppc_alloc_host_rm_ops(); + kvmhv_vm_nested_init(kvm); + /* * Since we don't flush the TLB when tearing down a VM, * and this lpid might have previously been used, @@ -3958,9 +4673,13 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) kvm->arch.host_sdr1 = mfspr(SPRN_SDR1); /* Init LPCR for virtual RMA mode */ - kvm->arch.host_lpid = mfspr(SPRN_LPID); - kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR); - lpcr &= LPCR_PECE | LPCR_LPES; + if (cpu_has_feature(CPU_FTR_HVMODE)) { + kvm->arch.host_lpid = mfspr(SPRN_LPID); + kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR); + lpcr &= LPCR_PECE | LPCR_LPES; + } else { + lpcr = 0; + } lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE | LPCR_VPM0 | LPCR_VPM1; kvm->arch.vrma_slb_v = SLB_VSID_B_1T | @@ -4027,8 +4746,14 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) * On POWER9, we only need to do this if the "indep_threads_mode" * module parameter has been set to N. */ - if (cpu_has_feature(CPU_FTR_ARCH_300)) - kvm->arch.threads_indep = indep_threads_mode; + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + if (!indep_threads_mode && !cpu_has_feature(CPU_FTR_HVMODE)) { + pr_warn("KVM: Ignoring indep_threads_mode=N in nested hypervisor\n"); + kvm->arch.threads_indep = true; + } else { + kvm->arch.threads_indep = indep_threads_mode; + } + } if (!kvm->arch.threads_indep) kvm_hv_vm_activated(); @@ -4051,6 +4776,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) snprintf(buf, sizeof(buf), "vm%d", current->pid); kvm->arch.debugfs_dir = debugfs_create_dir(buf, kvm_debugfs_dir); kvmppc_mmu_debugfs_init(kvm); + if (radix_enabled()) + kvmhv_radix_debugfs_init(kvm); return 0; } @@ -4073,13 +4800,21 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) kvmppc_free_vcores(kvm); - kvmppc_free_lpid(kvm->arch.lpid); if (kvm_is_radix(kvm)) kvmppc_free_radix(kvm); else kvmppc_free_hpt(&kvm->arch.hpt); + /* Perform global invalidation and return lpid to the pool */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + if (nesting_enabled(kvm)) + kvmhv_release_all_nested(kvm); + kvm->arch.process_table = 0; + kvmhv_set_ptbl_entry(kvm->arch.lpid, 0, 0); + } + kvmppc_free_lpid(kvm->arch.lpid); + kvmppc_free_pimap(kvm); } @@ -4104,11 +4839,15 @@ static int kvmppc_core_emulate_mfspr_hv(struct kvm_vcpu *vcpu, int sprn, static int kvmppc_core_check_processor_compat_hv(void) { - if (!cpu_has_feature(CPU_FTR_HVMODE) || - !cpu_has_feature(CPU_FTR_ARCH_206)) - return -EIO; + if (cpu_has_feature(CPU_FTR_HVMODE) && + cpu_has_feature(CPU_FTR_ARCH_206)) + return 0; - return 0; + /* POWER9 in radix mode is capable of being a nested hypervisor. */ + if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled()) + return 0; + + return -EIO; } #ifdef CONFIG_KVM_XICS @@ -4426,6 +5165,10 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) if (radix && !radix_enabled()) return -EINVAL; + /* If we're a nested hypervisor, we currently only support radix */ + if (kvmhv_on_pseries() && !radix) + return -EINVAL; + mutex_lock(&kvm->lock); if (radix != kvm_is_radix(kvm)) { if (kvm->arch.mmu_ready) { @@ -4458,6 +5201,19 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) return err; } +static int kvmhv_enable_nested(struct kvm *kvm) +{ + if (!nested) + return -EPERM; + if (!cpu_has_feature(CPU_FTR_ARCH_300) || no_mixing_hpt_and_radix) + return -ENODEV; + + /* kvm == NULL means the caller is testing if the capability exists */ + if (kvm) + kvm->arch.nested_enable = true; + return 0; +} + static struct kvmppc_ops kvm_ops_hv = { .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv, .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv, @@ -4497,6 +5253,7 @@ static struct kvmppc_ops kvm_ops_hv = { .configure_mmu = kvmhv_configure_mmu, .get_rmmu_info = kvmhv_get_rmmu_info, .set_smt_mode = kvmhv_set_smt_mode, + .enable_nested = kvmhv_enable_nested, }; static int kvm_init_subcore_bitmap(void) @@ -4547,6 +5304,10 @@ static int kvmppc_book3s_init_hv(void) if (r < 0) return -ENODEV; + r = kvmhv_nested_init(); + if (r) + return r; + r = kvm_init_subcore_bitmap(); if (r) return r; @@ -4557,7 +5318,8 @@ static int kvmppc_book3s_init_hv(void) * indirectly, via OPAL. */ #ifdef CONFIG_SMP - if (!xive_enabled() && !local_paca->kvm_hstate.xics_phys) { + if (!xive_enabled() && !kvmhv_on_pseries() && + !local_paca->kvm_hstate.xics_phys) { struct device_node *np; np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc"); @@ -4605,6 +5367,7 @@ static void kvmppc_book3s_exit_hv(void) if (kvmppc_radix_possible()) kvmppc_radix_exit(); kvmppc_hv_ops = NULL; + kvmhv_nested_exit(); } module_init(kvmppc_book3s_init_hv); diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -231,6 +231,15 @@ void kvmhv_rm_send_ipi(int cpu) void __iomem *xics_phys; unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); + /* For a nested hypervisor, use the XICS via hcall */ + if (kvmhv_on_pseries()) { + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + plpar_hcall_raw(H_IPI, retbuf, get_hard_smp_processor_id(cpu), + IPI_PRIORITY); + return; + } + /* On POWER9 we can use msgsnd for any destination cpu. */ if (cpu_has_feature(CPU_FTR_ARCH_300)) { msg |= get_hard_smp_processor_id(cpu); @@ -460,12 +469,19 @@ static long kvmppc_read_one_intr(bool *again) return 1; /* Now read the interrupt from the ICP */ - xics_phys = local_paca->kvm_hstate.xics_phys; - rc = 0; - if (!xics_phys) - rc = opal_int_get_xirr(&xirr, false); - else - xirr = __raw_rm_readl(xics_phys + XICS_XIRR); + if (kvmhv_on_pseries()) { + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + rc = plpar_hcall_raw(H_XIRR, retbuf, 0xFF); + xirr = cpu_to_be32(retbuf[0]); + } else { + xics_phys = local_paca->kvm_hstate.xics_phys; + rc = 0; + if (!xics_phys) + rc = opal_int_get_xirr(&xirr, false); + else + xirr = __raw_rm_readl(xics_phys + XICS_XIRR); + } if (rc < 0) return 1; @@ -494,7 +510,13 @@ static long kvmppc_read_one_intr(bool *again) */ if (xisr == XICS_IPI) { rc = 0; - if (xics_phys) { + if (kvmhv_on_pseries()) { + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + plpar_hcall_raw(H_IPI, retbuf, + hard_smp_processor_id(), 0xff); + plpar_hcall_raw(H_EOI, retbuf, h_xirr); + } else if (xics_phys) { __raw_rm_writeb(0xff, xics_phys + XICS_MFRR); __raw_rm_writel(xirr, xics_phys + XICS_XIRR); } else { @@ -520,7 +542,13 @@ static long kvmppc_read_one_intr(bool *again) /* We raced with the host, * we need to resend that IPI, bummer */ - if (xics_phys) + if (kvmhv_on_pseries()) { + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + plpar_hcall_raw(H_IPI, retbuf, + hard_smp_processor_id(), + IPI_PRIORITY); + } else if (xics_phys) __raw_rm_writeb(IPI_PRIORITY, xics_phys + XICS_MFRR); else @@ -729,3 +757,51 @@ void kvmhv_p9_restore_lpcr(struct kvm_split_mode *sip) smp_mb(); local_paca->kvm_hstate.kvm_split_mode = NULL; } + +/* + * Is there a PRIV_DOORBELL pending for the guest (on POWER9)? + * Can we inject a Decrementer or a External interrupt? + */ +void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu) +{ + int ext; + unsigned long vec = 0; + unsigned long lpcr; + + /* Insert EXTERNAL bit into LPCR at the MER bit position */ + ext = (vcpu->arch.pending_exceptions >> BOOK3S_IRQPRIO_EXTERNAL) & 1; + lpcr = mfspr(SPRN_LPCR); + lpcr |= ext << LPCR_MER_SH; + mtspr(SPRN_LPCR, lpcr); + isync(); + + if (vcpu->arch.shregs.msr & MSR_EE) { + if (ext) { + vec = BOOK3S_INTERRUPT_EXTERNAL; + } else { + long int dec = mfspr(SPRN_DEC); + if (!(lpcr & LPCR_LD)) + dec = (int) dec; + if (dec < 0) + vec = BOOK3S_INTERRUPT_DECREMENTER; + } + } + if (vec) { + unsigned long msr, old_msr = vcpu->arch.shregs.msr; + + kvmppc_set_srr0(vcpu, kvmppc_get_pc(vcpu)); + kvmppc_set_srr1(vcpu, old_msr); + kvmppc_set_pc(vcpu, vec); + msr = vcpu->arch.intr_msr; + if (MSR_TM_ACTIVE(old_msr)) + msr |= MSR_TS_S; + vcpu->arch.shregs.msr = msr; + } + + if (vcpu->arch.doorbell_request) { + mtspr(SPRN_DPDES, 1); + vcpu->arch.vcore->dpdes = 1; + smp_wmb(); + vcpu->arch.doorbell_request = 0; + } +} diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S @@ -64,52 +64,7 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) /* Save host PMU registers */ -BEGIN_FTR_SECTION - /* Work around P8 PMAE bug */ - li r3, -1 - clrrdi r3, r3, 10 - mfspr r8, SPRN_MMCR2 - mtspr SPRN_MMCR2, r3 /* freeze all counters using MMCR2 */ - isync -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) - li r3, 1 - sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ - mfspr r7, SPRN_MMCR0 /* save MMCR0 */ - mtspr SPRN_MMCR0, r3 /* freeze all counters, disable interrupts */ - mfspr r6, SPRN_MMCRA - /* Clear MMCRA in order to disable SDAR updates */ - li r5, 0 - mtspr SPRN_MMCRA, r5 - isync - lbz r5, PACA_PMCINUSE(r13) /* is the host using the PMU? */ - cmpwi r5, 0 - beq 31f /* skip if not */ - mfspr r5, SPRN_MMCR1 - mfspr r9, SPRN_SIAR - mfspr r10, SPRN_SDAR - std r7, HSTATE_MMCR0(r13) - std r5, HSTATE_MMCR1(r13) - std r6, HSTATE_MMCRA(r13) - std r9, HSTATE_SIAR(r13) - std r10, HSTATE_SDAR(r13) -BEGIN_FTR_SECTION - mfspr r9, SPRN_SIER - std r8, HSTATE_MMCR2(r13) - std r9, HSTATE_SIER(r13) -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) - mfspr r3, SPRN_PMC1 - mfspr r5, SPRN_PMC2 - mfspr r6, SPRN_PMC3 - mfspr r7, SPRN_PMC4 - mfspr r8, SPRN_PMC5 - mfspr r9, SPRN_PMC6 - stw r3, HSTATE_PMC1(r13) - stw r5, HSTATE_PMC2(r13) - stw r6, HSTATE_PMC3(r13) - stw r7, HSTATE_PMC4(r13) - stw r8, HSTATE_PMC5(r13) - stw r9, HSTATE_PMC6(r13) -31: + bl kvmhv_save_host_pmu /* * Put whatever is in the decrementer into the @@ -161,3 +116,51 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) ld r0, PPC_LR_STKOFF(r1) mtlr r0 blr + +_GLOBAL(kvmhv_save_host_pmu) +BEGIN_FTR_SECTION + /* Work around P8 PMAE bug */ + li r3, -1 + clrrdi r3, r3, 10 + mfspr r8, SPRN_MMCR2 + mtspr SPRN_MMCR2, r3 /* freeze all counters using MMCR2 */ + isync +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + li r3, 1 + sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ + mfspr r7, SPRN_MMCR0 /* save MMCR0 */ + mtspr SPRN_MMCR0, r3 /* freeze all counters, disable interrupts */ + mfspr r6, SPRN_MMCRA + /* Clear MMCRA in order to disable SDAR updates */ + li r5, 0 + mtspr SPRN_MMCRA, r5 + isync + lbz r5, PACA_PMCINUSE(r13) /* is the host using the PMU? */ + cmpwi r5, 0 + beq 31f /* skip if not */ + mfspr r5, SPRN_MMCR1 + mfspr r9, SPRN_SIAR + mfspr r10, SPRN_SDAR + std r7, HSTATE_MMCR0(r13) + std r5, HSTATE_MMCR1(r13) + std r6, HSTATE_MMCRA(r13) + std r9, HSTATE_SIAR(r13) + std r10, HSTATE_SDAR(r13) +BEGIN_FTR_SECTION + mfspr r9, SPRN_SIER + std r8, HSTATE_MMCR2(r13) + std r9, HSTATE_SIER(r13) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + mfspr r3, SPRN_PMC1 + mfspr r5, SPRN_PMC2 + mfspr r6, SPRN_PMC3 + mfspr r7, SPRN_PMC4 + mfspr r8, SPRN_PMC5 + mfspr r9, SPRN_PMC6 + stw r3, HSTATE_PMC1(r13) + stw r5, HSTATE_PMC2(r13) + stw r6, HSTATE_PMC3(r13) + stw r7, HSTATE_PMC4(r13) + stw r8, HSTATE_PMC5(r13) + stw r9, HSTATE_PMC6(r13) +31: blr diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c @@ -0,0 +1,1291 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corporation, 2018 + * Authors Suraj Jitindar Singh <sjitindarsingh@gmail.com> + * Paul Mackerras <paulus@ozlabs.org> + * + * Description: KVM functions specific to running nested KVM-HV guests + * on Book3S processors (specifically POWER9 and later). + */ + +#include <linux/kernel.h> +#include <linux/kvm_host.h> +#include <linux/llist.h> + +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> +#include <asm/mmu.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/pte-walk.h> +#include <asm/reg.h> + +static struct patb_entry *pseries_partition_tb; + +static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp); +static void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free); + +void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + + hr->pcr = vc->pcr; + hr->dpdes = vc->dpdes; + hr->hfscr = vcpu->arch.hfscr; + hr->tb_offset = vc->tb_offset; + hr->dawr0 = vcpu->arch.dawr; + hr->dawrx0 = vcpu->arch.dawrx; + hr->ciabr = vcpu->arch.ciabr; + hr->purr = vcpu->arch.purr; + hr->spurr = vcpu->arch.spurr; + hr->ic = vcpu->arch.ic; + hr->vtb = vc->vtb; + hr->srr0 = vcpu->arch.shregs.srr0; + hr->srr1 = vcpu->arch.shregs.srr1; + hr->sprg[0] = vcpu->arch.shregs.sprg0; + hr->sprg[1] = vcpu->arch.shregs.sprg1; + hr->sprg[2] = vcpu->arch.shregs.sprg2; + hr->sprg[3] = vcpu->arch.shregs.sprg3; + hr->pidr = vcpu->arch.pid; + hr->cfar = vcpu->arch.cfar; + hr->ppr = vcpu->arch.ppr; +} + +static void byteswap_pt_regs(struct pt_regs *regs) +{ + unsigned long *addr = (unsigned long *) regs; + + for (; addr < ((unsigned long *) (regs + 1)); addr++) + *addr = swab64(*addr); +} + +static void byteswap_hv_regs(struct hv_guest_state *hr) +{ + hr->version = swab64(hr->version); + hr->lpid = swab32(hr->lpid); + hr->vcpu_token = swab32(hr->vcpu_token); + hr->lpcr = swab64(hr->lpcr); + hr->pcr = swab64(hr->pcr); + hr->amor = swab64(hr->amor); + hr->dpdes = swab64(hr->dpdes); + hr->hfscr = swab64(hr->hfscr); + hr->tb_offset = swab64(hr->tb_offset); + hr->dawr0 = swab64(hr->dawr0); + hr->dawrx0 = swab64(hr->dawrx0); + hr->ciabr = swab64(hr->ciabr); + hr->hdec_expiry = swab64(hr->hdec_expiry); + hr->purr = swab64(hr->purr); + hr->spurr = swab64(hr->spurr); + hr->ic = swab64(hr->ic); + hr->vtb = swab64(hr->vtb); + hr->hdar = swab64(hr->hdar); + hr->hdsisr = swab64(hr->hdsisr); + hr->heir = swab64(hr->heir); + hr->asdr = swab64(hr->asdr); + hr->srr0 = swab64(hr->srr0); + hr->srr1 = swab64(hr->srr1); + hr->sprg[0] = swab64(hr->sprg[0]); + hr->sprg[1] = swab64(hr->sprg[1]); + hr->sprg[2] = swab64(hr->sprg[2]); + hr->sprg[3] = swab64(hr->sprg[3]); + hr->pidr = swab64(hr->pidr); + hr->cfar = swab64(hr->cfar); + hr->ppr = swab64(hr->ppr); +} + +static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap, + struct hv_guest_state *hr) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + + hr->dpdes = vc->dpdes; + hr->hfscr = vcpu->arch.hfscr; + hr->purr = vcpu->arch.purr; + hr->spurr = vcpu->arch.spurr; + hr->ic = vcpu->arch.ic; + hr->vtb = vc->vtb; + hr->srr0 = vcpu->arch.shregs.srr0; + hr->srr1 = vcpu->arch.shregs.srr1; + hr->sprg[0] = vcpu->arch.shregs.sprg0; + hr->sprg[1] = vcpu->arch.shregs.sprg1; + hr->sprg[2] = vcpu->arch.shregs.sprg2; + hr->sprg[3] = vcpu->arch.shregs.sprg3; + hr->pidr = vcpu->arch.pid; + hr->cfar = vcpu->arch.cfar; + hr->ppr = vcpu->arch.ppr; + switch (trap) { + case BOOK3S_INTERRUPT_H_DATA_STORAGE: + hr->hdar = vcpu->arch.fault_dar; + hr->hdsisr = vcpu->arch.fault_dsisr; + hr->asdr = vcpu->arch.fault_gpa; + break; + case BOOK3S_INTERRUPT_H_INST_STORAGE: + hr->asdr = vcpu->arch.fault_gpa; + break; + case BOOK3S_INTERRUPT_H_EMUL_ASSIST: + hr->heir = vcpu->arch.emul_inst; + break; + } +} + +static void sanitise_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) +{ + /* + * Don't let L1 enable features for L2 which we've disabled for L1, + * but preserve the interrupt cause field. + */ + hr->hfscr &= (HFSCR_INTR_CAUSE | vcpu->arch.hfscr); + + /* Don't let data address watchpoint match in hypervisor state */ + hr->dawrx0 &= ~DAWRX_HYP; + + /* Don't let completed instruction address breakpt match in HV state */ + if ((hr->ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER) + hr->ciabr &= ~CIABR_PRIV; +} + +static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + + vc->pcr = hr->pcr; + vc->dpdes = hr->dpdes; + vcpu->arch.hfscr = hr->hfscr; + vcpu->arch.dawr = hr->dawr0; + vcpu->arch.dawrx = hr->dawrx0; + vcpu->arch.ciabr = hr->ciabr; + vcpu->arch.purr = hr->purr; + vcpu->arch.spurr = hr->spurr; + vcpu->arch.ic = hr->ic; + vc->vtb = hr->vtb; + vcpu->arch.shregs.srr0 = hr->srr0; + vcpu->arch.shregs.srr1 = hr->srr1; + vcpu->arch.shregs.sprg0 = hr->sprg[0]; + vcpu->arch.shregs.sprg1 = hr->sprg[1]; + vcpu->arch.shregs.sprg2 = hr->sprg[2]; + vcpu->arch.shregs.sprg3 = hr->sprg[3]; + vcpu->arch.pid = hr->pidr; + vcpu->arch.cfar = hr->cfar; + vcpu->arch.ppr = hr->ppr; +} + +void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu, + struct hv_guest_state *hr) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + + vc->dpdes = hr->dpdes; + vcpu->arch.hfscr = hr->hfscr; + vcpu->arch.purr = hr->purr; + vcpu->arch.spurr = hr->spurr; + vcpu->arch.ic = hr->ic; + vc->vtb = hr->vtb; + vcpu->arch.fault_dar = hr->hdar; + vcpu->arch.fault_dsisr = hr->hdsisr; + vcpu->arch.fault_gpa = hr->asdr; + vcpu->arch.emul_inst = hr->heir; + vcpu->arch.shregs.srr0 = hr->srr0; + vcpu->arch.shregs.srr1 = hr->srr1; + vcpu->arch.shregs.sprg0 = hr->sprg[0]; + vcpu->arch.shregs.sprg1 = hr->sprg[1]; + vcpu->arch.shregs.sprg2 = hr->sprg[2]; + vcpu->arch.shregs.sprg3 = hr->sprg[3]; + vcpu->arch.pid = hr->pidr; + vcpu->arch.cfar = hr->cfar; + vcpu->arch.ppr = hr->ppr; +} + +long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) +{ + long int err, r; + struct kvm_nested_guest *l2; + struct pt_regs l2_regs, saved_l1_regs; + struct hv_guest_state l2_hv, saved_l1_hv; + struct kvmppc_vcore *vc = vcpu->arch.vcore; + u64 hv_ptr, regs_ptr; + u64 hdec_exp; + s64 delta_purr, delta_spurr, delta_ic, delta_vtb; + u64 mask; + unsigned long lpcr; + + if (vcpu->kvm->arch.l1_ptcr == 0) + return H_NOT_AVAILABLE; + + /* copy parameters in */ + hv_ptr = kvmppc_get_gpr(vcpu, 4); + err = kvm_vcpu_read_guest(vcpu, hv_ptr, &l2_hv, + sizeof(struct hv_guest_state)); + if (err) + return H_PARAMETER; + if (kvmppc_need_byteswap(vcpu)) + byteswap_hv_regs(&l2_hv); + if (l2_hv.version != HV_GUEST_STATE_VERSION) + return H_P2; + + regs_ptr = kvmppc_get_gpr(vcpu, 5); + err = kvm_vcpu_read_guest(vcpu, regs_ptr, &l2_regs, + sizeof(struct pt_regs)); + if (err) + return H_PARAMETER; + if (kvmppc_need_byteswap(vcpu)) + byteswap_pt_regs(&l2_regs); + if (l2_hv.vcpu_token >= NR_CPUS) + return H_PARAMETER; + + /* translate lpid */ + l2 = kvmhv_get_nested(vcpu->kvm, l2_hv.lpid, true); + if (!l2) + return H_PARAMETER; + if (!l2->l1_gr_to_hr) { + mutex_lock(&l2->tlb_lock); + kvmhv_update_ptbl_cache(l2); + mutex_unlock(&l2->tlb_lock); + } + + /* save l1 values of things */ + vcpu->arch.regs.msr = vcpu->arch.shregs.msr; + saved_l1_regs = vcpu->arch.regs; + kvmhv_save_hv_regs(vcpu, &saved_l1_hv); + + /* convert TB values/offsets to host (L0) values */ + hdec_exp = l2_hv.hdec_expiry - vc->tb_offset; + vc->tb_offset += l2_hv.tb_offset; + + /* set L1 state to L2 state */ + vcpu->arch.nested = l2; + vcpu->arch.nested_vcpu_id = l2_hv.vcpu_token; + vcpu->arch.regs = l2_regs; + vcpu->arch.shregs.msr = vcpu->arch.regs.msr; + mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD | + LPCR_LPES | LPCR_MER; + lpcr = (vc->lpcr & ~mask) | (l2_hv.lpcr & mask); + sanitise_hv_regs(vcpu, &l2_hv); + restore_hv_regs(vcpu, &l2_hv); + + vcpu->arch.ret = RESUME_GUEST; + vcpu->arch.trap = 0; + do { + if (mftb() >= hdec_exp) { + vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER; + r = RESUME_HOST; + break; + } + r = kvmhv_run_single_vcpu(vcpu->arch.kvm_run, vcpu, hdec_exp, + lpcr); + } while (is_kvmppc_resume_guest(r)); + + /* save L2 state for return */ + l2_regs = vcpu->arch.regs; + l2_regs.msr = vcpu->arch.shregs.msr; + delta_purr = vcpu->arch.purr - l2_hv.purr; + delta_spurr = vcpu->arch.spurr - l2_hv.spurr; + delta_ic = vcpu->arch.ic - l2_hv.ic; + delta_vtb = vc->vtb - l2_hv.vtb; + save_hv_return_state(vcpu, vcpu->arch.trap, &l2_hv); + + /* restore L1 state */ + vcpu->arch.nested = NULL; + vcpu->arch.regs = saved_l1_regs; + vcpu->arch.shregs.msr = saved_l1_regs.msr & ~MSR_TS_MASK; + /* set L1 MSR TS field according to L2 transaction state */ + if (l2_regs.msr & MSR_TS_MASK) + vcpu->arch.shregs.msr |= MSR_TS_S; + vc->tb_offset = saved_l1_hv.tb_offset; + restore_hv_regs(vcpu, &saved_l1_hv); + vcpu->arch.purr += delta_purr; + vcpu->arch.spurr += delta_spurr; + vcpu->arch.ic += delta_ic; + vc->vtb += delta_vtb; + + kvmhv_put_nested(l2); + + /* copy l2_hv_state and regs back to guest */ + if (kvmppc_need_byteswap(vcpu)) { + byteswap_hv_regs(&l2_hv); + byteswap_pt_regs(&l2_regs); + } + err = kvm_vcpu_write_guest(vcpu, hv_ptr, &l2_hv, + sizeof(struct hv_guest_state)); + if (err) + return H_AUTHORITY; + err = kvm_vcpu_write_guest(vcpu, regs_ptr, &l2_regs, + sizeof(struct pt_regs)); + if (err) + return H_AUTHORITY; + + if (r == -EINTR) + return H_INTERRUPT; + + return vcpu->arch.trap; +} + +long kvmhv_nested_init(void) +{ + long int ptb_order; + unsigned long ptcr; + long rc; + + if (!kvmhv_on_pseries()) + return 0; + if (!radix_enabled()) + return -ENODEV; + + /* find log base 2 of KVMPPC_NR_LPIDS, rounding up */ + ptb_order = __ilog2(KVMPPC_NR_LPIDS - 1) + 1; + if (ptb_order < 8) + ptb_order = 8; + pseries_partition_tb = kmalloc(sizeof(struct patb_entry) << ptb_order, + GFP_KERNEL); + if (!pseries_partition_tb) { + pr_err("kvm-hv: failed to allocated nested partition table\n"); + return -ENOMEM; + } + + ptcr = __pa(pseries_partition_tb) | (ptb_order - 8); + rc = plpar_hcall_norets(H_SET_PARTITION_TABLE, ptcr); + if (rc != H_SUCCESS) { + pr_err("kvm-hv: Parent hypervisor does not support nesting (rc=%ld)\n", + rc); + kfree(pseries_partition_tb); + pseries_partition_tb = NULL; + return -ENODEV; + } + + return 0; +} + +void kvmhv_nested_exit(void) +{ + /* + * N.B. the kvmhv_on_pseries() test is there because it enables + * the compiler to remove the call to plpar_hcall_norets() + * when CONFIG_PPC_PSERIES=n. + */ + if (kvmhv_on_pseries() && pseries_partition_tb) { + plpar_hcall_norets(H_SET_PARTITION_TABLE, 0); + kfree(pseries_partition_tb); + pseries_partition_tb = NULL; + } +} + +static void kvmhv_flush_lpid(unsigned int lpid) +{ + long rc; + + if (!kvmhv_on_pseries()) { + radix__flush_tlb_lpid(lpid); + return; + } + + rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(2, 0, 1), + lpid, TLBIEL_INVAL_SET_LPID); + if (rc) + pr_err("KVM: TLB LPID invalidation hcall failed, rc=%ld\n", rc); +} + +void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1) +{ + if (!kvmhv_on_pseries()) { + mmu_partition_table_set_entry(lpid, dw0, dw1); + return; + } + + pseries_partition_tb[lpid].patb0 = cpu_to_be64(dw0); + pseries_partition_tb[lpid].patb1 = cpu_to_be64(dw1); + /* L0 will do the necessary barriers */ + kvmhv_flush_lpid(lpid); +} + +static void kvmhv_set_nested_ptbl(struct kvm_nested_guest *gp) +{ + unsigned long dw0; + + dw0 = PATB_HR | radix__get_tree_size() | + __pa(gp->shadow_pgtable) | RADIX_PGD_INDEX_SIZE; + kvmhv_set_ptbl_entry(gp->shadow_lpid, dw0, gp->process_table); +} + +void kvmhv_vm_nested_init(struct kvm *kvm) +{ + kvm->arch.max_nested_lpid = -1; +} + +/* + * Handle the H_SET_PARTITION_TABLE hcall. + * r4 = guest real address of partition table + log_2(size) - 12 + * (formatted as for the PTCR). + */ +long kvmhv_set_partition_table(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + unsigned long ptcr = kvmppc_get_gpr(vcpu, 4); + int srcu_idx; + long ret = H_SUCCESS; + + srcu_idx = srcu_read_lock(&kvm->srcu); + /* + * Limit the partition table to 4096 entries (because that's what + * hardware supports), and check the base address. + */ + if ((ptcr & PRTS_MASK) > 12 - 8 || + !kvm_is_visible_gfn(vcpu->kvm, (ptcr & PRTB_MASK) >> PAGE_SHIFT)) + ret = H_PARAMETER; + srcu_read_unlock(&kvm->srcu, srcu_idx); + if (ret == H_SUCCESS) + kvm->arch.l1_ptcr = ptcr; + return ret; +} + +/* + * Reload the partition table entry for a guest. + * Caller must hold gp->tlb_lock. + */ +static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp) +{ + int ret; + struct patb_entry ptbl_entry; + unsigned long ptbl_addr; + struct kvm *kvm = gp->l1_host; + + ret = -EFAULT; + ptbl_addr = (kvm->arch.l1_ptcr & PRTB_MASK) + (gp->l1_lpid << 4); + if (gp->l1_lpid < (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 8))) + ret = kvm_read_guest(kvm, ptbl_addr, + &ptbl_entry, sizeof(ptbl_entry)); + if (ret) { + gp->l1_gr_to_hr = 0; + gp->process_table = 0; + } else { + gp->l1_gr_to_hr = be64_to_cpu(ptbl_entry.patb0); + gp->process_table = be64_to_cpu(ptbl_entry.patb1); + } + kvmhv_set_nested_ptbl(gp); +} + +struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid) +{ + struct kvm_nested_guest *gp; + long shadow_lpid; + + gp = kzalloc(sizeof(*gp), GFP_KERNEL); + if (!gp) + return NULL; + gp->l1_host = kvm; + gp->l1_lpid = lpid; + mutex_init(&gp->tlb_lock); + gp->shadow_pgtable = pgd_alloc(kvm->mm); + if (!gp->shadow_pgtable) + goto out_free; + shadow_lpid = kvmppc_alloc_lpid(); + if (shadow_lpid < 0) + goto out_free2; + gp->shadow_lpid = shadow_lpid; + + memset(gp->prev_cpu, -1, sizeof(gp->prev_cpu)); + + return gp; + + out_free2: + pgd_free(kvm->mm, gp->shadow_pgtable); + out_free: + kfree(gp); + return NULL; +} + +/* + * Free up any resources allocated for a nested guest. + */ +static void kvmhv_release_nested(struct kvm_nested_guest *gp) +{ + struct kvm *kvm = gp->l1_host; + + if (gp->shadow_pgtable) { + /* + * No vcpu is using this struct and no call to + * kvmhv_get_nested can find this struct, + * so we don't need to hold kvm->mmu_lock. + */ + kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable, + gp->shadow_lpid); + pgd_free(kvm->mm, gp->shadow_pgtable); + } + kvmhv_set_ptbl_entry(gp->shadow_lpid, 0, 0); + kvmppc_free_lpid(gp->shadow_lpid); + kfree(gp); +} + +static void kvmhv_remove_nested(struct kvm_nested_guest *gp) +{ + struct kvm *kvm = gp->l1_host; + int lpid = gp->l1_lpid; + long ref; + + spin_lock(&kvm->mmu_lock); + if (gp == kvm->arch.nested_guests[lpid]) { + kvm->arch.nested_guests[lpid] = NULL; + if (lpid == kvm->arch.max_nested_lpid) { + while (--lpid >= 0 && !kvm->arch.nested_guests[lpid]) + ; + kvm->arch.max_nested_lpid = lpid; + } + --gp->refcnt; + } + ref = gp->refcnt; + spin_unlock(&kvm->mmu_lock); + if (ref == 0) + kvmhv_release_nested(gp); +} + +/* + * Free up all nested resources allocated for this guest. + * This is called with no vcpus of the guest running, when + * switching the guest to HPT mode or when destroying the + * guest. + */ +void kvmhv_release_all_nested(struct kvm *kvm) +{ + int i; + struct kvm_nested_guest *gp; + struct kvm_nested_guest *freelist = NULL; + struct kvm_memory_slot *memslot; + int srcu_idx; + + spin_lock(&kvm->mmu_lock); + for (i = 0; i <= kvm->arch.max_nested_lpid; i++) { + gp = kvm->arch.nested_guests[i]; + if (!gp) + continue; + kvm->arch.nested_guests[i] = NULL; + if (--gp->refcnt == 0) { + gp->next = freelist; + freelist = gp; + } + } + kvm->arch.max_nested_lpid = -1; + spin_unlock(&kvm->mmu_lock); + while ((gp = freelist) != NULL) { + freelist = gp->next; + kvmhv_release_nested(gp); + } + + srcu_idx = srcu_read_lock(&kvm->srcu); + kvm_for_each_memslot(memslot, kvm_memslots(kvm)) + kvmhv_free_memslot_nest_rmap(memslot); + srcu_read_unlock(&kvm->srcu, srcu_idx); +} + +/* caller must hold gp->tlb_lock */ +static void kvmhv_flush_nested(struct kvm_nested_guest *gp) +{ + struct kvm *kvm = gp->l1_host; + + spin_lock(&kvm->mmu_lock); + kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable, gp->shadow_lpid); + spin_unlock(&kvm->mmu_lock); + kvmhv_flush_lpid(gp->shadow_lpid); + kvmhv_update_ptbl_cache(gp); + if (gp->l1_gr_to_hr == 0) + kvmhv_remove_nested(gp); +} + +struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid, + bool create) +{ + struct kvm_nested_guest *gp, *newgp; + + if (l1_lpid >= KVM_MAX_NESTED_GUESTS || + l1_lpid >= (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 12 - 4))) + return NULL; + + spin_lock(&kvm->mmu_lock); + gp = kvm->arch.nested_guests[l1_lpid]; + if (gp) + ++gp->refcnt; + spin_unlock(&kvm->mmu_lock); + + if (gp || !create) + return gp; + + newgp = kvmhv_alloc_nested(kvm, l1_lpid); + if (!newgp) + return NULL; + spin_lock(&kvm->mmu_lock); + if (kvm->arch.nested_guests[l1_lpid]) { + /* someone else beat us to it */ + gp = kvm->arch.nested_guests[l1_lpid]; + } else { + kvm->arch.nested_guests[l1_lpid] = newgp; + ++newgp->refcnt; + gp = newgp; + newgp = NULL; + if (l1_lpid > kvm->arch.max_nested_lpid) + kvm->arch.max_nested_lpid = l1_lpid; + } + ++gp->refcnt; + spin_unlock(&kvm->mmu_lock); + + if (newgp) + kvmhv_release_nested(newgp); + + return gp; +} + +void kvmhv_put_nested(struct kvm_nested_guest *gp) +{ + struct kvm *kvm = gp->l1_host; + long ref; + + spin_lock(&kvm->mmu_lock); + ref = --gp->refcnt; + spin_unlock(&kvm->mmu_lock); + if (ref == 0) + kvmhv_release_nested(gp); +} + +static struct kvm_nested_guest *kvmhv_find_nested(struct kvm *kvm, int lpid) +{ + if (lpid > kvm->arch.max_nested_lpid) + return NULL; + return kvm->arch.nested_guests[lpid]; +} + +static inline bool kvmhv_n_rmap_is_equal(u64 rmap_1, u64 rmap_2) +{ + return !((rmap_1 ^ rmap_2) & (RMAP_NESTED_LPID_MASK | + RMAP_NESTED_GPA_MASK)); +} + +void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp, + struct rmap_nested **n_rmap) +{ + struct llist_node *entry = ((struct llist_head *) rmapp)->first; + struct rmap_nested *cursor; + u64 rmap, new_rmap = (*n_rmap)->rmap; + + /* Are there any existing entries? */ + if (!(*rmapp)) { + /* No -> use the rmap as a single entry */ + *rmapp = new_rmap | RMAP_NESTED_IS_SINGLE_ENTRY; + return; + } + + /* Do any entries match what we're trying to insert? */ + for_each_nest_rmap_safe(cursor, entry, &rmap) { + if (kvmhv_n_rmap_is_equal(rmap, new_rmap)) + return; + } + + /* Do we need to create a list or just add the new entry? */ + rmap = *rmapp; + if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */ + *rmapp = 0UL; + llist_add(&((*n_rmap)->list), (struct llist_head *) rmapp); + if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */ + (*n_rmap)->list.next = (struct llist_node *) rmap; + + /* Set NULL so not freed by caller */ + *n_rmap = NULL; +} + +static void kvmhv_remove_nest_rmap(struct kvm *kvm, u64 n_rmap, + unsigned long hpa, unsigned long mask) +{ + struct kvm_nested_guest *gp; + unsigned long gpa; + unsigned int shift, lpid; + pte_t *ptep; + + gpa = n_rmap & RMAP_NESTED_GPA_MASK; + lpid = (n_rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT; + gp = kvmhv_find_nested(kvm, lpid); + if (!gp) + return; + + /* Find and invalidate the pte */ + ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift); + /* Don't spuriously invalidate ptes if the pfn has changed */ + if (ptep && pte_present(*ptep) && ((pte_val(*ptep) & mask) == hpa)) + kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid); +} + +static void kvmhv_remove_nest_rmap_list(struct kvm *kvm, unsigned long *rmapp, + unsigned long hpa, unsigned long mask) +{ + struct llist_node *entry = llist_del_all((struct llist_head *) rmapp); + struct rmap_nested *cursor; + unsigned long rmap; + + for_each_nest_rmap_safe(cursor, entry, &rmap) { + kvmhv_remove_nest_rmap(kvm, rmap, hpa, mask); + kfree(cursor); + } +} + +/* called with kvm->mmu_lock held */ +void kvmhv_remove_nest_rmap_range(struct kvm *kvm, + struct kvm_memory_slot *memslot, + unsigned long gpa, unsigned long hpa, + unsigned long nbytes) +{ + unsigned long gfn, end_gfn; + unsigned long addr_mask; + + if (!memslot) + return; + gfn = (gpa >> PAGE_SHIFT) - memslot->base_gfn; + end_gfn = gfn + (nbytes >> PAGE_SHIFT); + + addr_mask = PTE_RPN_MASK & ~(nbytes - 1); + hpa &= addr_mask; + + for (; gfn < end_gfn; gfn++) { + unsigned long *rmap = &memslot->arch.rmap[gfn]; + kvmhv_remove_nest_rmap_list(kvm, rmap, hpa, addr_mask); + } +} + +static void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free) +{ + unsigned long page; + + for (page = 0; page < free->npages; page++) { + unsigned long rmap, *rmapp = &free->arch.rmap[page]; + struct rmap_nested *cursor; + struct llist_node *entry; + + entry = llist_del_all((struct llist_head *) rmapp); + for_each_nest_rmap_safe(cursor, entry, &rmap) + kfree(cursor); + } +} + +static bool kvmhv_invalidate_shadow_pte(struct kvm_vcpu *vcpu, + struct kvm_nested_guest *gp, + long gpa, int *shift_ret) +{ + struct kvm *kvm = vcpu->kvm; + bool ret = false; + pte_t *ptep; + int shift; + + spin_lock(&kvm->mmu_lock); + ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift); + if (!shift) + shift = PAGE_SHIFT; + if (ptep && pte_present(*ptep)) { + kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid); + ret = true; + } + spin_unlock(&kvm->mmu_lock); + + if (shift_ret) + *shift_ret = shift; + return ret; +} + +static inline int get_ric(unsigned int instr) +{ + return (instr >> 18) & 0x3; +} + +static inline int get_prs(unsigned int instr) +{ + return (instr >> 17) & 0x1; +} + +static inline int get_r(unsigned int instr) +{ + return (instr >> 16) & 0x1; +} + +static inline int get_lpid(unsigned long r_val) +{ + return r_val & 0xffffffff; +} + +static inline int get_is(unsigned long r_val) +{ + return (r_val >> 10) & 0x3; +} + +static inline int get_ap(unsigned long r_val) +{ + return (r_val >> 5) & 0x7; +} + +static inline long get_epn(unsigned long r_val) +{ + return r_val >> 12; +} + +static int kvmhv_emulate_tlbie_tlb_addr(struct kvm_vcpu *vcpu, int lpid, + int ap, long epn) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_nested_guest *gp; + long npages; + int shift, shadow_shift; + unsigned long addr; + + shift = ap_to_shift(ap); + addr = epn << 12; + if (shift < 0) + /* Invalid ap encoding */ + return -EINVAL; + + addr &= ~((1UL << shift) - 1); + npages = 1UL << (shift - PAGE_SHIFT); + + gp = kvmhv_get_nested(kvm, lpid, false); + if (!gp) /* No such guest -> nothing to do */ + return 0; + mutex_lock(&gp->tlb_lock); + + /* There may be more than one host page backing this single guest pte */ + do { + kvmhv_invalidate_shadow_pte(vcpu, gp, addr, &shadow_shift); + + npages -= 1UL << (shadow_shift - PAGE_SHIFT); + addr += 1UL << shadow_shift; + } while (npages > 0); + + mutex_unlock(&gp->tlb_lock); + kvmhv_put_nested(gp); + return 0; +} + +static void kvmhv_emulate_tlbie_lpid(struct kvm_vcpu *vcpu, + struct kvm_nested_guest *gp, int ric) +{ + struct kvm *kvm = vcpu->kvm; + + mutex_lock(&gp->tlb_lock); + switch (ric) { + case 0: + /* Invalidate TLB */ + spin_lock(&kvm->mmu_lock); + kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable, + gp->shadow_lpid); + kvmhv_flush_lpid(gp->shadow_lpid); + spin_unlock(&kvm->mmu_lock); + break; + case 1: + /* + * Invalidate PWC + * We don't cache this -> nothing to do + */ + break; + case 2: + /* Invalidate TLB, PWC and caching of partition table entries */ + kvmhv_flush_nested(gp); + break; + default: + break; + } + mutex_unlock(&gp->tlb_lock); +} + +static void kvmhv_emulate_tlbie_all_lpid(struct kvm_vcpu *vcpu, int ric) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_nested_guest *gp; + int i; + + spin_lock(&kvm->mmu_lock); + for (i = 0; i <= kvm->arch.max_nested_lpid; i++) { + gp = kvm->arch.nested_guests[i]; + if (gp) { + spin_unlock(&kvm->mmu_lock); + kvmhv_emulate_tlbie_lpid(vcpu, gp, ric); + spin_lock(&kvm->mmu_lock); + } + } + spin_unlock(&kvm->mmu_lock); +} + +static int kvmhv_emulate_priv_tlbie(struct kvm_vcpu *vcpu, unsigned int instr, + unsigned long rsval, unsigned long rbval) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_nested_guest *gp; + int r, ric, prs, is, ap; + int lpid; + long epn; + int ret = 0; + + ric = get_ric(instr); + prs = get_prs(instr); + r = get_r(instr); + lpid = get_lpid(rsval); + is = get_is(rbval); + + /* + * These cases are invalid and are not handled: + * r != 1 -> Only radix supported + * prs == 1 -> Not HV privileged + * ric == 3 -> No cluster bombs for radix + * is == 1 -> Partition scoped translations not associated with pid + * (!is) && (ric == 1 || ric == 2) -> Not supported by ISA + */ + if ((!r) || (prs) || (ric == 3) || (is == 1) || + ((!is) && (ric == 1 || ric == 2))) + return -EINVAL; + + switch (is) { + case 0: + /* + * We know ric == 0 + * Invalidate TLB for a given target address + */ + epn = get_epn(rbval); + ap = get_ap(rbval); + ret = kvmhv_emulate_tlbie_tlb_addr(vcpu, lpid, ap, epn); + break; + case 2: + /* Invalidate matching LPID */ + gp = kvmhv_get_nested(kvm, lpid, false); + if (gp) { + kvmhv_emulate_tlbie_lpid(vcpu, gp, ric); + kvmhv_put_nested(gp); + } + break; + case 3: + /* Invalidate ALL LPIDs */ + kvmhv_emulate_tlbie_all_lpid(vcpu, ric); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +/* + * This handles the H_TLB_INVALIDATE hcall. + * Parameters are (r4) tlbie instruction code, (r5) rS contents, + * (r6) rB contents. + */ +long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu) +{ + int ret; + + ret = kvmhv_emulate_priv_tlbie(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5), kvmppc_get_gpr(vcpu, 6)); + if (ret) + return H_PARAMETER; + return H_SUCCESS; +} + +/* Used to convert a nested guest real address to a L1 guest real address */ +static int kvmhv_translate_addr_nested(struct kvm_vcpu *vcpu, + struct kvm_nested_guest *gp, + unsigned long n_gpa, unsigned long dsisr, + struct kvmppc_pte *gpte_p) +{ + u64 fault_addr, flags = dsisr & DSISR_ISSTORE; + int ret; + + ret = kvmppc_mmu_walk_radix_tree(vcpu, n_gpa, gpte_p, gp->l1_gr_to_hr, + &fault_addr); + + if (ret) { + /* We didn't find a pte */ + if (ret == -EINVAL) { + /* Unsupported mmu config */ + flags |= DSISR_UNSUPP_MMU; + } else if (ret == -ENOENT) { + /* No translation found */ + flags |= DSISR_NOHPTE; + } else if (ret == -EFAULT) { + /* Couldn't access L1 real address */ + flags |= DSISR_PRTABLE_FAULT; + vcpu->arch.fault_gpa = fault_addr; + } else { + /* Unknown error */ + return ret; + } + goto forward_to_l1; + } else { + /* We found a pte -> check permissions */ + if (dsisr & DSISR_ISSTORE) { + /* Can we write? */ + if (!gpte_p->may_write) { + flags |= DSISR_PROTFAULT; + goto forward_to_l1; + } + } else if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) { + /* Can we execute? */ + if (!gpte_p->may_execute) { + flags |= SRR1_ISI_N_OR_G; + goto forward_to_l1; + } + } else { + /* Can we read? */ + if (!gpte_p->may_read && !gpte_p->may_write) { + flags |= DSISR_PROTFAULT; + goto forward_to_l1; + } + } + } + + return 0; + +forward_to_l1: + vcpu->arch.fault_dsisr = flags; + if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) { + vcpu->arch.shregs.msr &= ~0x783f0000ul; + vcpu->arch.shregs.msr |= flags; + } + return RESUME_HOST; +} + +static long kvmhv_handle_nested_set_rc(struct kvm_vcpu *vcpu, + struct kvm_nested_guest *gp, + unsigned long n_gpa, + struct kvmppc_pte gpte, + unsigned long dsisr) +{ + struct kvm *kvm = vcpu->kvm; + bool writing = !!(dsisr & DSISR_ISSTORE); + u64 pgflags; + bool ret; + + /* Are the rc bits set in the L1 partition scoped pte? */ + pgflags = _PAGE_ACCESSED; + if (writing) + pgflags |= _PAGE_DIRTY; + if (pgflags & ~gpte.rc) + return RESUME_HOST; + + spin_lock(&kvm->mmu_lock); + /* Set the rc bit in the pte of our (L0) pgtable for the L1 guest */ + ret = kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable, writing, + gpte.raddr, kvm->arch.lpid); + spin_unlock(&kvm->mmu_lock); + if (!ret) + return -EINVAL; + + /* Set the rc bit in the pte of the shadow_pgtable for the nest guest */ + ret = kvmppc_hv_handle_set_rc(kvm, gp->shadow_pgtable, writing, n_gpa, + gp->shadow_lpid); + if (!ret) + return -EINVAL; + return 0; +} + +static inline int kvmppc_radix_level_to_shift(int level) +{ + switch (level) { + case 2: + return PUD_SHIFT; + case 1: + return PMD_SHIFT; + default: + return PAGE_SHIFT; + } +} + +static inline int kvmppc_radix_shift_to_level(int shift) +{ + if (shift == PUD_SHIFT) + return 2; + if (shift == PMD_SHIFT) + return 1; + if (shift == PAGE_SHIFT) + return 0; + WARN_ON_ONCE(1); + return 0; +} + +/* called with gp->tlb_lock held */ +static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu, + struct kvm_nested_guest *gp) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_memory_slot *memslot; + struct rmap_nested *n_rmap; + struct kvmppc_pte gpte; + pte_t pte, *pte_p; + unsigned long mmu_seq; + unsigned long dsisr = vcpu->arch.fault_dsisr; + unsigned long ea = vcpu->arch.fault_dar; + unsigned long *rmapp; + unsigned long n_gpa, gpa, gfn, perm = 0UL; + unsigned int shift, l1_shift, level; + bool writing = !!(dsisr & DSISR_ISSTORE); + bool kvm_ro = false; + long int ret; + + if (!gp->l1_gr_to_hr) { + kvmhv_update_ptbl_cache(gp); + if (!gp->l1_gr_to_hr) + return RESUME_HOST; + } + + /* Convert the nested guest real address into a L1 guest real address */ + + n_gpa = vcpu->arch.fault_gpa & ~0xF000000000000FFFULL; + if (!(dsisr & DSISR_PRTABLE_FAULT)) + n_gpa |= ea & 0xFFF; + ret = kvmhv_translate_addr_nested(vcpu, gp, n_gpa, dsisr, &gpte); + + /* + * If the hardware found a translation but we don't now have a usable + * translation in the l1 partition-scoped tree, remove the shadow pte + * and let the guest retry. + */ + if (ret == RESUME_HOST && + (dsisr & (DSISR_PROTFAULT | DSISR_BADACCESS | DSISR_NOEXEC_OR_G | + DSISR_BAD_COPYPASTE))) + goto inval; + if (ret) + return ret; + + /* Failed to set the reference/change bits */ + if (dsisr & DSISR_SET_RC) { + ret = kvmhv_handle_nested_set_rc(vcpu, gp, n_gpa, gpte, dsisr); + if (ret == RESUME_HOST) + return ret; + if (ret) + goto inval; + dsisr &= ~DSISR_SET_RC; + if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE | + DSISR_PROTFAULT))) + return RESUME_GUEST; + } + + /* + * We took an HISI or HDSI while we were running a nested guest which + * means we have no partition scoped translation for that. This means + * we need to insert a pte for the mapping into our shadow_pgtable. + */ + + l1_shift = gpte.page_shift; + if (l1_shift < PAGE_SHIFT) { + /* We don't support l1 using a page size smaller than our own */ + pr_err("KVM: L1 guest page shift (%d) less than our own (%d)\n", + l1_shift, PAGE_SHIFT); + return -EINVAL; + } + gpa = gpte.raddr; + gfn = gpa >> PAGE_SHIFT; + + /* 1. Get the corresponding host memslot */ + + memslot = gfn_to_memslot(kvm, gfn); + if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) { + if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS)) { + /* unusual error -> reflect to the guest as a DSI */ + kvmppc_core_queue_data_storage(vcpu, ea, dsisr); + return RESUME_GUEST; + } + /* passthrough of emulated MMIO case... */ + pr_err("emulated MMIO passthrough?\n"); + return -EINVAL; + } + if (memslot->flags & KVM_MEM_READONLY) { + if (writing) { + /* Give the guest a DSI */ + kvmppc_core_queue_data_storage(vcpu, ea, + DSISR_ISSTORE | DSISR_PROTFAULT); + return RESUME_GUEST; + } + kvm_ro = true; + } + + /* 2. Find the host pte for this L1 guest real address */ + + /* Used to check for invalidations in progress */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); + + /* See if can find translation in our partition scoped tables for L1 */ + pte = __pte(0); + spin_lock(&kvm->mmu_lock); + pte_p = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); + if (!shift) + shift = PAGE_SHIFT; + if (pte_p) + pte = *pte_p; + spin_unlock(&kvm->mmu_lock); + + if (!pte_present(pte) || (writing && !(pte_val(pte) & _PAGE_WRITE))) { + /* No suitable pte found -> try to insert a mapping */ + ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, + writing, kvm_ro, &pte, &level); + if (ret == -EAGAIN) + return RESUME_GUEST; + else if (ret) + return ret; + shift = kvmppc_radix_level_to_shift(level); + } + + /* 3. Compute the pte we need to insert for nest_gpa -> host r_addr */ + + /* The permissions is the combination of the host and l1 guest ptes */ + perm |= gpte.may_read ? 0UL : _PAGE_READ; + perm |= gpte.may_write ? 0UL : _PAGE_WRITE; + perm |= gpte.may_execute ? 0UL : _PAGE_EXEC; + pte = __pte(pte_val(pte) & ~perm); + + /* What size pte can we insert? */ + if (shift > l1_shift) { + u64 mask; + unsigned int actual_shift = PAGE_SHIFT; + if (PMD_SHIFT < l1_shift) + actual_shift = PMD_SHIFT; + mask = (1UL << shift) - (1UL << actual_shift); + pte = __pte(pte_val(pte) | (gpa & mask)); + shift = actual_shift; + } + level = kvmppc_radix_shift_to_level(shift); + n_gpa &= ~((1UL << shift) - 1); + + /* 4. Insert the pte into our shadow_pgtable */ + + n_rmap = kzalloc(sizeof(*n_rmap), GFP_KERNEL); + if (!n_rmap) + return RESUME_GUEST; /* Let the guest try again */ + n_rmap->rmap = (n_gpa & RMAP_NESTED_GPA_MASK) | + (((unsigned long) gp->l1_lpid) << RMAP_NESTED_LPID_SHIFT); + rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; + ret = kvmppc_create_pte(kvm, gp->shadow_pgtable, pte, n_gpa, level, + mmu_seq, gp->shadow_lpid, rmapp, &n_rmap); + if (n_rmap) + kfree(n_rmap); + if (ret == -EAGAIN) + ret = RESUME_GUEST; /* Let the guest try again */ + + return ret; + + inval: + kvmhv_invalidate_shadow_pte(vcpu, gp, n_gpa, NULL); + return RESUME_GUEST; +} + +long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu) +{ + struct kvm_nested_guest *gp = vcpu->arch.nested; + long int ret; + + mutex_lock(&gp->tlb_lock); + ret = __kvmhv_nested_page_fault(vcpu, gp); + mutex_unlock(&gp->tlb_lock); + return ret; +} + +int kvmhv_nested_next_lpid(struct kvm *kvm, int lpid) +{ + int ret = -1; + + spin_lock(&kvm->mmu_lock); + while (++lpid <= kvm->arch.max_nested_lpid) { + if (kvm->arch.nested_guests[lpid]) { + ret = lpid; + break; + } + } + spin_unlock(&kvm->mmu_lock); + return ret; +} diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c @@ -177,6 +177,7 @@ void kvmppc_subcore_enter_guest(void) local_paca->sibling_subcore_state->in_guest[subcore_id] = 1; } +EXPORT_SYMBOL_GPL(kvmppc_subcore_enter_guest); void kvmppc_subcore_exit_guest(void) { @@ -187,6 +188,7 @@ void kvmppc_subcore_exit_guest(void) local_paca->sibling_subcore_state->in_guest[subcore_id] = 0; } +EXPORT_SYMBOL_GPL(kvmppc_subcore_exit_guest); static bool kvmppc_tb_resync_required(void) { @@ -331,5 +333,13 @@ long kvmppc_realmode_hmi_handler(void) } else { wait_for_tb_resync(); } + + /* + * Reset tb_offset_applied so the guest exit code won't try + * to subtract the previous timebase offset from the timebase. + */ + if (local_paca->kvm_hstate.kvm_vcore) + local_paca->kvm_hstate.kvm_vcore->tb_offset_applied = 0; + return 0; } diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -136,7 +136,7 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, /* Mark the target VCPU as having an interrupt pending */ vcpu->stat.queue_intr++; - set_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions); + set_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions); /* Kick self ? Just set MER and return */ if (vcpu == this_vcpu) { @@ -170,8 +170,7 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu) { /* Note: Only called on self ! */ - clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, - &vcpu->arch.pending_exceptions); + clear_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions); mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_MER); } @@ -768,6 +767,14 @@ static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again) void __iomem *xics_phys; int64_t rc; + if (kvmhv_on_pseries()) { + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + iosync(); + plpar_hcall_raw(H_EOI, retbuf, hwirq); + return; + } + rc = pnv_opal_pci_msi_eoi(c, hwirq); if (rc) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -28,6 +28,7 @@ #include <asm/exception-64s.h> #include <asm/kvm_book3s_asm.h> #include <asm/book3s/64/mmu-hash.h> +#include <asm/export.h> #include <asm/tm.h> #include <asm/opal.h> #include <asm/xive-regs.h> @@ -46,8 +47,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) #define NAPPING_NOVCPU 2 /* Stack frame offsets for kvmppc_hv_entry */ -#define SFS 160 +#define SFS 208 #define STACK_SLOT_TRAP (SFS-4) +#define STACK_SLOT_SHORT_PATH (SFS-8) #define STACK_SLOT_TID (SFS-16) #define STACK_SLOT_PSSCR (SFS-24) #define STACK_SLOT_PID (SFS-32) @@ -56,6 +58,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) #define STACK_SLOT_DAWR (SFS-56) #define STACK_SLOT_DAWRX (SFS-64) #define STACK_SLOT_HFSCR (SFS-72) +/* the following is used by the P9 short path */ +#define STACK_SLOT_NVGPRS (SFS-152) /* 18 gprs */ /* * Call kvmppc_hv_entry in real mode. @@ -113,45 +117,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) mtspr SPRN_SPRG_VDSO_WRITE,r3 /* Reload the host's PMU registers */ - lbz r4, PACA_PMCINUSE(r13) /* is the host using the PMU? */ - cmpwi r4, 0 - beq 23f /* skip if not */ -BEGIN_FTR_SECTION - ld r3, HSTATE_MMCR0(r13) - andi. r4, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO - cmpwi r4, MMCR0_PMAO - beql kvmppc_fix_pmao -END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG) - lwz r3, HSTATE_PMC1(r13) - lwz r4, HSTATE_PMC2(r13) - lwz r5, HSTATE_PMC3(r13) - lwz r6, HSTATE_PMC4(r13) - lwz r8, HSTATE_PMC5(r13) - lwz r9, HSTATE_PMC6(r13) - mtspr SPRN_PMC1, r3 - mtspr SPRN_PMC2, r4 - mtspr SPRN_PMC3, r5 - mtspr SPRN_PMC4, r6 - mtspr SPRN_PMC5, r8 - mtspr SPRN_PMC6, r9 - ld r3, HSTATE_MMCR0(r13) - ld r4, HSTATE_MMCR1(r13) - ld r5, HSTATE_MMCRA(r13) - ld r6, HSTATE_SIAR(r13) - ld r7, HSTATE_SDAR(r13) - mtspr SPRN_MMCR1, r4 - mtspr SPRN_MMCRA, r5 - mtspr SPRN_SIAR, r6 - mtspr SPRN_SDAR, r7 -BEGIN_FTR_SECTION - ld r8, HSTATE_MMCR2(r13) - ld r9, HSTATE_SIER(r13) - mtspr SPRN_MMCR2, r8 - mtspr SPRN_SIER, r9 -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) - mtspr SPRN_MMCR0, r3 - isync -23: + bl kvmhv_load_host_pmu /* * Reload DEC. HDEC interrupts were disabled when @@ -796,66 +762,23 @@ BEGIN_FTR_SECTION b 91f END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0) /* - * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR + * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR) */ mr r3, r4 ld r4, VCPU_MSR(r3) + li r5, 0 /* don't preserve non-vol regs */ bl kvmppc_restore_tm_hv + nop ld r4, HSTATE_KVM_VCPU(r13) 91: #endif - /* Load guest PMU registers */ - /* R4 is live here (vcpu pointer) */ - li r3, 1 - sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ - mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */ - isync -BEGIN_FTR_SECTION - ld r3, VCPU_MMCR(r4) - andi. r5, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO - cmpwi r5, MMCR0_PMAO - beql kvmppc_fix_pmao -END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG) - lwz r3, VCPU_PMC(r4) /* always load up guest PMU registers */ - lwz r5, VCPU_PMC + 4(r4) /* to prevent information leak */ - lwz r6, VCPU_PMC + 8(r4) - lwz r7, VCPU_PMC + 12(r4) - lwz r8, VCPU_PMC + 16(r4) - lwz r9, VCPU_PMC + 20(r4) - mtspr SPRN_PMC1, r3 - mtspr SPRN_PMC2, r5 - mtspr SPRN_PMC3, r6 - mtspr SPRN_PMC4, r7 - mtspr SPRN_PMC5, r8 - mtspr SPRN_PMC6, r9 - ld r3, VCPU_MMCR(r4) - ld r5, VCPU_MMCR + 8(r4) - ld r6, VCPU_MMCR + 16(r4) - ld r7, VCPU_SIAR(r4) - ld r8, VCPU_SDAR(r4) - mtspr SPRN_MMCR1, r5 - mtspr SPRN_MMCRA, r6 - mtspr SPRN_SIAR, r7 - mtspr SPRN_SDAR, r8 -BEGIN_FTR_SECTION - ld r5, VCPU_MMCR + 24(r4) - ld r6, VCPU_SIER(r4) - mtspr SPRN_MMCR2, r5 - mtspr SPRN_SIER, r6 -BEGIN_FTR_SECTION_NESTED(96) - lwz r7, VCPU_PMC + 24(r4) - lwz r8, VCPU_PMC + 28(r4) - ld r9, VCPU_MMCR + 32(r4) - mtspr SPRN_SPMC1, r7 - mtspr SPRN_SPMC2, r8 - mtspr SPRN_MMCRS, r9 -END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96) -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) - mtspr SPRN_MMCR0, r3 - isync + /* Load guest PMU registers; r4 = vcpu pointer here */ + mr r3, r4 + bl kvmhv_load_guest_pmu /* Load up FP, VMX and VSX registers */ + ld r4, HSTATE_KVM_VCPU(r13) bl kvmppc_load_fp ld r14, VCPU_GPR(R14)(r4) @@ -1100,73 +1023,40 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) no_xive: #endif /* CONFIG_KVM_XICS */ -deliver_guest_interrupt: - ld r6, VCPU_CTR(r4) - ld r7, VCPU_XER(r4) - - mtctr r6 - mtxer r7 + li r0, 0 + stw r0, STACK_SLOT_SHORT_PATH(r1) -kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */ - ld r10, VCPU_PC(r4) - ld r11, VCPU_MSR(r4) +deliver_guest_interrupt: /* r4 = vcpu, r13 = paca */ + /* Check if we can deliver an external or decrementer interrupt now */ + ld r0, VCPU_PENDING_EXC(r4) +BEGIN_FTR_SECTION + /* On POWER9, also check for emulated doorbell interrupt */ + lbz r3, VCPU_DBELL_REQ(r4) + or r0, r0, r3 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + cmpdi r0, 0 + beq 71f + mr r3, r4 + bl kvmppc_guest_entry_inject_int + ld r4, HSTATE_KVM_VCPU(r13) +71: ld r6, VCPU_SRR0(r4) ld r7, VCPU_SRR1(r4) mtspr SPRN_SRR0, r6 mtspr SPRN_SRR1, r7 +fast_guest_entry_c: + ld r10, VCPU_PC(r4) + ld r11, VCPU_MSR(r4) /* r11 = vcpu->arch.msr & ~MSR_HV */ rldicl r11, r11, 63 - MSR_HV_LG, 1 rotldi r11, r11, 1 + MSR_HV_LG ori r11, r11, MSR_ME - /* Check if we can deliver an external or decrementer interrupt now */ - ld r0, VCPU_PENDING_EXC(r4) - rldicl r0, r0, 64 - BOOK3S_IRQPRIO_EXTERNAL_LEVEL, 63 - cmpdi cr1, r0, 0 - andi. r8, r11, MSR_EE - mfspr r8, SPRN_LPCR - /* Insert EXTERNAL_LEVEL bit into LPCR at the MER bit position */ - rldimi r8, r0, LPCR_MER_SH, 63 - LPCR_MER_SH - mtspr SPRN_LPCR, r8 - isync - beq 5f - li r0, BOOK3S_INTERRUPT_EXTERNAL - bne cr1, 12f - mfspr r0, SPRN_DEC -BEGIN_FTR_SECTION - /* On POWER9 check whether the guest has large decrementer enabled */ - andis. r8, r8, LPCR_LD@h - bne 15f -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) - extsw r0, r0 -15: cmpdi r0, 0 - li r0, BOOK3S_INTERRUPT_DECREMENTER - bge 5f - -12: mtspr SPRN_SRR0, r10 - mr r10,r0 - mtspr SPRN_SRR1, r11 - mr r9, r4 - bl kvmppc_msr_interrupt -5: -BEGIN_FTR_SECTION - b fast_guest_return -END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) - /* On POWER9, check for pending doorbell requests */ - lbz r0, VCPU_DBELL_REQ(r4) - cmpwi r0, 0 - beq fast_guest_return - ld r5, HSTATE_KVM_VCORE(r13) - /* Set DPDES register so the CPU will take a doorbell interrupt */ - li r0, 1 - mtspr SPRN_DPDES, r0 - std r0, VCORE_DPDES(r5) - /* Make sure other cpus see vcore->dpdes set before dbell req clear */ - lwsync - /* Clear the pending doorbell request */ - li r0, 0 - stb r0, VCPU_DBELL_REQ(r4) + ld r6, VCPU_CTR(r4) + ld r7, VCPU_XER(r4) + mtctr r6 + mtxer r7 /* * Required state: @@ -1202,7 +1092,7 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ld r5, VCPU_LR(r4) - lwz r6, VCPU_CR(r4) + ld r6, VCPU_CR(r4) mtlr r5 mtcr r6 @@ -1234,6 +1124,83 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) HRFI_TO_GUEST b . +/* + * Enter the guest on a P9 or later system where we have exactly + * one vcpu per vcore and we don't need to go to real mode + * (which implies that host and guest are both using radix MMU mode). + * r3 = vcpu pointer + * Most SPRs and all the VSRs have been loaded already. + */ +_GLOBAL(__kvmhv_vcpu_entry_p9) +EXPORT_SYMBOL_GPL(__kvmhv_vcpu_entry_p9) + mflr r0 + std r0, PPC_LR_STKOFF(r1) + stdu r1, -SFS(r1) + + li r0, 1 + stw r0, STACK_SLOT_SHORT_PATH(r1) + + std r3, HSTATE_KVM_VCPU(r13) + mfcr r4 + stw r4, SFS+8(r1) + + std r1, HSTATE_HOST_R1(r13) + + reg = 14 + .rept 18 + std reg, STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1) + reg = reg + 1 + .endr + + reg = 14 + .rept 18 + ld reg, __VCPU_GPR(reg)(r3) + reg = reg + 1 + .endr + + mfmsr r10 + std r10, HSTATE_HOST_MSR(r13) + + mr r4, r3 + b fast_guest_entry_c +guest_exit_short_path: + + li r0, KVM_GUEST_MODE_NONE + stb r0, HSTATE_IN_GUEST(r13) + + reg = 14 + .rept 18 + std reg, __VCPU_GPR(reg)(r9) + reg = reg + 1 + .endr + + reg = 14 + .rept 18 + ld reg, STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1) + reg = reg + 1 + .endr + + lwz r4, SFS+8(r1) + mtcr r4 + + mr r3, r12 /* trap number */ + + addi r1, r1, SFS + ld r0, PPC_LR_STKOFF(r1) + mtlr r0 + + /* If we are in real mode, do a rfid to get back to the caller */ + mfmsr r4 + andi. r5, r4, MSR_IR + bnelr + rldicl r5, r4, 64 - MSR_TS_S_LG, 62 /* extract TS field */ + mtspr SPRN_SRR0, r0 + ld r10, HSTATE_HOST_MSR(r13) + rldimi r10, r5, MSR_TS_S_LG, 63 - MSR_TS_T_LG + mtspr SPRN_SRR1, r10 + RFI_TO_KERNEL + b . + secondary_too_late: li r12, 0 stw r12, STACK_SLOT_TRAP(r1) @@ -1313,7 +1280,7 @@ kvmppc_interrupt_hv: std r3, VCPU_GPR(R12)(r9) /* CR is in the high half of r12 */ srdi r4, r12, 32 - stw r4, VCPU_CR(r9) + std r4, VCPU_CR(r9) BEGIN_FTR_SECTION ld r3, HSTATE_CFAR(r13) std r3, VCPU_CFAR(r9) @@ -1387,18 +1354,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) std r3, VCPU_CTR(r9) std r4, VCPU_XER(r9) -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM - /* For softpatch interrupt, go off and do TM instruction emulation */ - cmpwi r12, BOOK3S_INTERRUPT_HV_SOFTPATCH - beq kvmppc_tm_emul -#endif + /* Save more register state */ + mfdar r3 + mfdsisr r4 + std r3, VCPU_DAR(r9) + stw r4, VCPU_DSISR(r9) /* If this is a page table miss then see if it's theirs or ours */ cmpwi r12, BOOK3S_INTERRUPT_H_DATA_STORAGE beq kvmppc_hdsi + std r3, VCPU_FAULT_DAR(r9) + stw r4, VCPU_FAULT_DSISR(r9) cmpwi r12, BOOK3S_INTERRUPT_H_INST_STORAGE beq kvmppc_hisi +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + /* For softpatch interrupt, go off and do TM instruction emulation */ + cmpwi r12, BOOK3S_INTERRUPT_HV_SOFTPATCH + beq kvmppc_tm_emul +#endif + /* See if this is a leftover HDEC interrupt */ cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER bne 2f @@ -1418,10 +1393,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) BEGIN_FTR_SECTION PPC_MSGSYNC lwsync + /* always exit if we're running a nested guest */ + ld r0, VCPU_NESTED(r9) + cmpdi r0, 0 + bne guest_exit_cont END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) lbz r0, HSTATE_HOST_IPI(r13) cmpwi r0, 0 - beq 4f + beq maybe_reenter_guest b guest_exit_cont 3: /* If it's a hypervisor facility unavailable interrupt, save HFSCR */ @@ -1433,82 +1412,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) 14: /* External interrupt ? */ cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL - bne+ guest_exit_cont - - /* External interrupt, first check for host_ipi. If this is - * set, we know the host wants us out so let's do it now - */ - bl kvmppc_read_intr - - /* - * Restore the active volatile registers after returning from - * a C function. - */ - ld r9, HSTATE_KVM_VCPU(r13) - li r12, BOOK3S_INTERRUPT_EXTERNAL - - /* - * kvmppc_read_intr return codes: - * - * Exit to host (r3 > 0) - * 1 An interrupt is pending that needs to be handled by the host - * Exit guest and return to host by branching to guest_exit_cont - * - * 2 Passthrough that needs completion in the host - * Exit guest and return to host by branching to guest_exit_cont - * However, we also set r12 to BOOK3S_INTERRUPT_HV_RM_HARD - * to indicate to the host to complete handling the interrupt - * - * Before returning to guest, we check if any CPU is heading out - * to the host and if so, we head out also. If no CPUs are heading - * check return values <= 0. - * - * Return to guest (r3 <= 0) - * 0 No external interrupt is pending - * -1 A guest wakeup IPI (which has now been cleared) - * In either case, we return to guest to deliver any pending - * guest interrupts. - * - * -2 A PCI passthrough external interrupt was handled - * (interrupt was delivered directly to guest) - * Return to guest to deliver any pending guest interrupts. - */ - - cmpdi r3, 1 - ble 1f - - /* Return code = 2 */ - li r12, BOOK3S_INTERRUPT_HV_RM_HARD - stw r12, VCPU_TRAP(r9) - b guest_exit_cont - -1: /* Return code <= 1 */ - cmpdi r3, 0 - bgt guest_exit_cont - - /* Return code <= 0 */ -4: ld r5, HSTATE_KVM_VCORE(r13) - lwz r0, VCORE_ENTRY_EXIT(r5) - cmpwi r0, 0x100 - mr r4, r9 - blt deliver_guest_interrupt - -guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ - /* Save more register state */ - mfdar r6 - mfdsisr r7 - std r6, VCPU_DAR(r9) - stw r7, VCPU_DSISR(r9) - /* don't overwrite fault_dar/fault_dsisr if HDSI */ - cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE - beq mc_cont - std r6, VCPU_FAULT_DAR(r9) - stw r7, VCPU_FAULT_DSISR(r9) - + beq kvmppc_guest_external /* See if it is a machine check */ cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK beq machine_check_realmode -mc_cont: + /* Or a hypervisor maintenance interrupt */ + cmpwi r12, BOOK3S_INTERRUPT_HMI + beq hmi_realmode + +guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING addi r3, r9, VCPU_TB_RMEXIT mr r4, r9 @@ -1552,6 +1465,11 @@ mc_cont: 1: #endif /* CONFIG_KVM_XICS */ + /* If we came in through the P9 short path, go back out to C now */ + lwz r0, STACK_SLOT_SHORT_PATH(r1) + cmpwi r0, 0 + bne guest_exit_short_path + /* For hash guest, read the guest SLB and save it away */ ld r5, VCPU_KVM(r9) lbz r0, KVM_RADIX(r5) @@ -1780,11 +1698,13 @@ BEGIN_FTR_SECTION b 91f END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0) /* - * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR + * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR) */ mr r3, r9 ld r4, VCPU_MSR(r3) + li r5, 0 /* don't preserve non-vol regs */ bl kvmppc_save_tm_hv + nop ld r9, HSTATE_KVM_VCPU(r13) 91: #endif @@ -1802,90 +1722,19 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0) 25: /* Save PMU registers if requested */ /* r8 and cr0.eq are live here */ + mr r3, r9 + li r4, 1 + beq 21f /* if no VPA, save PMU stuff anyway */ + lbz r4, LPPACA_PMCINUSE(r8) +21: bl kvmhv_save_guest_pmu + ld r9, HSTATE_KVM_VCPU(r13) + + /* Restore host values of some registers */ BEGIN_FTR_SECTION - /* - * POWER8 seems to have a hardware bug where setting - * MMCR0[PMAE] along with MMCR0[PMC1CE] and/or MMCR0[PMCjCE] - * when some counters are already negative doesn't seem - * to cause a performance monitor alert (and hence interrupt). - * The effect of this is that when saving the PMU state, - * if there is no PMU alert pending when we read MMCR0 - * before freezing the counters, but one becomes pending - * before we read the counters, we lose it. - * To work around this, we need a way to freeze the counters - * before reading MMCR0. Normally, freezing the counters - * is done by writing MMCR0 (to set MMCR0[FC]) which - * unavoidably writes MMCR0[PMA0] as well. On POWER8, - * we can also freeze the counters using MMCR2, by writing - * 1s to all the counter freeze condition bits (there are - * 9 bits each for 6 counters). - */ - li r3, -1 /* set all freeze bits */ - clrrdi r3, r3, 10 - mfspr r10, SPRN_MMCR2 - mtspr SPRN_MMCR2, r3 - isync -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) - li r3, 1 - sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ - mfspr r4, SPRN_MMCR0 /* save MMCR0 */ - mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */ - mfspr r6, SPRN_MMCRA - /* Clear MMCRA in order to disable SDAR updates */ - li r7, 0 - mtspr SPRN_MMCRA, r7 - isync - beq 21f /* if no VPA, save PMU stuff anyway */ - lbz r7, LPPACA_PMCINUSE(r8) - cmpwi r7, 0 /* did they ask for PMU stuff to be saved? */ - bne 21f - std r3, VCPU_MMCR(r9) /* if not, set saved MMCR0 to FC */ - b 22f -21: mfspr r5, SPRN_MMCR1 - mfspr r7, SPRN_SIAR - mfspr r8, SPRN_SDAR - std r4, VCPU_MMCR(r9) - std r5, VCPU_MMCR + 8(r9) - std r6, VCPU_MMCR + 16(r9) -BEGIN_FTR_SECTION - std r10, VCPU_MMCR + 24(r9) -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) - std r7, VCPU_SIAR(r9) - std r8, VCPU_SDAR(r9) - mfspr r3, SPRN_PMC1 - mfspr r4, SPRN_PMC2 - mfspr r5, SPRN_PMC3 - mfspr r6, SPRN_PMC4 - mfspr r7, SPRN_PMC5 - mfspr r8, SPRN_PMC6 - stw r3, VCPU_PMC(r9) - stw r4, VCPU_PMC + 4(r9) - stw r5, VCPU_PMC + 8(r9) - stw r6, VCPU_PMC + 12(r9) - stw r7, VCPU_PMC + 16(r9) - stw r8, VCPU_PMC + 20(r9) -BEGIN_FTR_SECTION - mfspr r5, SPRN_SIER - std r5, VCPU_SIER(r9) -BEGIN_FTR_SECTION_NESTED(96) - mfspr r6, SPRN_SPMC1 - mfspr r7, SPRN_SPMC2 - mfspr r8, SPRN_MMCRS - stw r6, VCPU_PMC + 24(r9) - stw r7, VCPU_PMC + 28(r9) - std r8, VCPU_MMCR + 32(r9) - lis r4, 0x8000 - mtspr SPRN_MMCRS, r4 -END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96) -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) -22: - - /* Restore host values of some registers */ -BEGIN_FTR_SECTION - ld r5, STACK_SLOT_CIABR(r1) - ld r6, STACK_SLOT_DAWR(r1) - ld r7, STACK_SLOT_DAWRX(r1) - mtspr SPRN_CIABR, r5 + ld r5, STACK_SLOT_CIABR(r1) + ld r6, STACK_SLOT_DAWR(r1) + ld r7, STACK_SLOT_DAWRX(r1) + mtspr SPRN_CIABR, r5 /* * If the DAWR doesn't work, it's ok to write these here as * this value should always be zero @@ -2010,24 +1859,6 @@ BEGIN_FTR_SECTION mtspr SPRN_DPDES, r8 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) - /* If HMI, call kvmppc_realmode_hmi_handler() */ - lwz r12, STACK_SLOT_TRAP(r1) - cmpwi r12, BOOK3S_INTERRUPT_HMI - bne 27f - bl kvmppc_realmode_hmi_handler - nop - cmpdi r3, 0 - /* - * At this point kvmppc_realmode_hmi_handler may have resync-ed - * the TB, and if it has, we must not subtract the guest timebase - * offset from the timebase. So, skip it. - * - * Also, do not call kvmppc_subcore_exit_guest() because it has - * been invoked as part of kvmppc_realmode_hmi_handler(). - */ - beq 30f - -27: /* Subtract timebase offset from timebase */ ld r8, VCORE_TB_OFFSET_APPL(r5) cmpdi r8,0 @@ -2045,7 +1876,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) addis r8,r8,0x100 /* if so, increment upper 40 bits */ mtspr SPRN_TBU40,r8 -17: bl kvmppc_subcore_exit_guest +17: + /* + * If this is an HMI, we called kvmppc_realmode_hmi_handler + * above, which may or may not have already called + * kvmppc_subcore_exit_guest. Fortunately, all that + * kvmppc_subcore_exit_guest does is clear a flag, so calling + * it again here is benign even if kvmppc_realmode_hmi_handler + * has already called it. + */ + bl kvmppc_subcore_exit_guest nop 30: ld r5,HSTATE_KVM_VCORE(r13) ld r4,VCORE_KVM(r5) /* pointer to struct kvm */ @@ -2099,6 +1939,67 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) mtlr r0 blr +kvmppc_guest_external: + /* External interrupt, first check for host_ipi. If this is + * set, we know the host wants us out so let's do it now + */ + bl kvmppc_read_intr + + /* + * Restore the active volatile registers after returning from + * a C function. + */ + ld r9, HSTATE_KVM_VCPU(r13) + li r12, BOOK3S_INTERRUPT_EXTERNAL + + /* + * kvmppc_read_intr return codes: + * + * Exit to host (r3 > 0) + * 1 An interrupt is pending that needs to be handled by the host + * Exit guest and return to host by branching to guest_exit_cont + * + * 2 Passthrough that needs completion in the host + * Exit guest and return to host by branching to guest_exit_cont + * However, we also set r12 to BOOK3S_INTERRUPT_HV_RM_HARD + * to indicate to the host to complete handling the interrupt + * + * Before returning to guest, we check if any CPU is heading out + * to the host and if so, we head out also. If no CPUs are heading + * check return values <= 0. + * + * Return to guest (r3 <= 0) + * 0 No external interrupt is pending + * -1 A guest wakeup IPI (which has now been cleared) + * In either case, we return to guest to deliver any pending + * guest interrupts. + * + * -2 A PCI passthrough external interrupt was handled + * (interrupt was delivered directly to guest) + * Return to guest to deliver any pending guest interrupts. + */ + + cmpdi r3, 1 + ble 1f + + /* Return code = 2 */ + li r12, BOOK3S_INTERRUPT_HV_RM_HARD + stw r12, VCPU_TRAP(r9) + b guest_exit_cont + +1: /* Return code <= 1 */ + cmpdi r3, 0 + bgt guest_exit_cont + + /* Return code <= 0 */ +maybe_reenter_guest: + ld r5, HSTATE_KVM_VCORE(r13) + lwz r0, VCORE_ENTRY_EXIT(r5) + cmpwi r0, 0x100 + mr r4, r9 + blt deliver_guest_interrupt + b guest_exit_cont + #ifdef CONFIG_PPC_TRANSACTIONAL_MEM /* * Softpatch interrupt for transactional memory emulation cases @@ -2302,6 +2203,10 @@ hcall_try_real_mode: andi. r0,r11,MSR_PR /* sc 1 from userspace - reflect to guest syscall */ bne sc_1_fast_return + /* sc 1 from nested guest - give it to L1 to handle */ + ld r0, VCPU_NESTED(r9) + cmpdi r0, 0 + bne guest_exit_cont clrrdi r3,r3,2 cmpldi r3,hcall_real_table_end - hcall_real_table bge guest_exit_cont @@ -2561,6 +2466,7 @@ hcall_real_table: hcall_real_table_end: _GLOBAL(kvmppc_h_set_xdabr) +EXPORT_SYMBOL_GPL(kvmppc_h_set_xdabr) andi. r0, r5, DABRX_USER | DABRX_KERNEL beq 6f li r0, DABRX_USER | DABRX_KERNEL | DABRX_BTI @@ -2570,6 +2476,7 @@ _GLOBAL(kvmppc_h_set_xdabr) blr _GLOBAL(kvmppc_h_set_dabr) +EXPORT_SYMBOL_GPL(kvmppc_h_set_dabr) li r5, DABRX_USER | DABRX_KERNEL 3: BEGIN_FTR_SECTION @@ -2682,11 +2589,13 @@ BEGIN_FTR_SECTION b 91f END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0) /* - * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR + * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR) */ ld r3, HSTATE_KVM_VCPU(r13) ld r4, VCPU_MSR(r3) + li r5, 0 /* don't preserve non-vol regs */ bl kvmppc_save_tm_hv + nop 91: #endif @@ -2802,11 +2711,13 @@ BEGIN_FTR_SECTION b 91f END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0) /* - * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR + * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR) */ mr r3, r4 ld r4, VCPU_MSR(r3) + li r5, 0 /* don't preserve non-vol regs */ bl kvmppc_restore_tm_hv + nop ld r4, HSTATE_KVM_VCPU(r13) 91: #endif @@ -2874,13 +2785,7 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0) mr r9, r4 cmpdi r3, 0 bgt guest_exit_cont - - /* see if any other thread is already exiting */ - lwz r0,VCORE_ENTRY_EXIT(r5) - cmpwi r0,0x100 - bge guest_exit_cont - - b kvmppc_cede_reentry /* if not go back to guest */ + b maybe_reenter_guest /* cede when already previously prodded case */ kvm_cede_prodded: @@ -2947,12 +2852,12 @@ machine_check_realmode: */ ld r11, VCPU_MSR(r9) rldicl. r0, r11, 64-MSR_HV_LG, 63 /* check if it happened in HV mode */ - bne mc_cont /* if so, exit to host */ + bne guest_exit_cont /* if so, exit to host */ /* Check if guest is capable of handling NMI exit */ ld r10, VCPU_KVM(r9) lbz r10, KVM_FWNMI(r10) cmpdi r10, 1 /* FWNMI capable? */ - beq mc_cont /* if so, exit with KVM_EXIT_NMI. */ + beq guest_exit_cont /* if so, exit with KVM_EXIT_NMI. */ /* if not, fall through for backward compatibility. */ andi. r10, r11, MSR_RI /* check for unrecoverable exception */ @@ -2966,6 +2871,21 @@ machine_check_realmode: 2: b fast_interrupt_c_return /* + * Call C code to handle a HMI in real mode. + * Only the primary thread does the call, secondary threads are handled + * by calling hmi_exception_realmode() after kvmppc_hv_entry returns. + * r9 points to the vcpu on entry + */ +hmi_realmode: + lbz r0, HSTATE_PTID(r13) + cmpwi r0, 0 + bne guest_exit_cont + bl kvmppc_realmode_hmi_handler + ld r9, HSTATE_KVM_VCPU(r13) + li r12, BOOK3S_INTERRUPT_HMI + b guest_exit_cont + +/* * Check the reason we woke from nap, and take appropriate action. * Returns (in r3): * 0 if nothing needs to be done @@ -3130,10 +3050,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) * Save transactional state and TM-related registers. * Called with r3 pointing to the vcpu struct and r4 containing * the guest MSR value. - * This can modify all checkpointed registers, but + * r5 is non-zero iff non-volatile register state needs to be maintained. + * If r5 == 0, this can modify all checkpointed registers, but * restores r1 and r2 before exit. */ -kvmppc_save_tm_hv: +_GLOBAL_TOC(kvmppc_save_tm_hv) +EXPORT_SYMBOL_GPL(kvmppc_save_tm_hv) /* See if we need to handle fake suspend mode */ BEGIN_FTR_SECTION b __kvmppc_save_tm @@ -3161,12 +3083,6 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG) nop - std r1, HSTATE_HOST_R1(r13) - - /* Clear the MSR RI since r1, r13 may be foobar. */ - li r5, 0 - mtmsrd r5, 1 - /* We have to treclaim here because that's the only way to do S->N */ li r3, TM_CAUSE_KVM_RESCHED TRECLAIM(R3) @@ -3175,22 +3091,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG) * We were in fake suspend, so we are not going to save the * register state as the guest checkpointed state (since * we already have it), therefore we can now use any volatile GPR. + * In fact treclaim in fake suspend state doesn't modify + * any registers. */ - /* Reload PACA pointer, stack pointer and TOC. */ - GET_PACA(r13) - ld r1, HSTATE_HOST_R1(r13) - ld r2, PACATOC(r13) - /* Set MSR RI now we have r1 and r13 back. */ - li r5, MSR_RI - mtmsrd r5, 1 - - HMT_MEDIUM - ld r6, HSTATE_DSCR(r13) - mtspr SPRN_DSCR, r6 -BEGIN_FTR_SECTION_NESTED(96) +BEGIN_FTR_SECTION bl pnv_power9_force_smt4_release -END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, CPU_FTR_P9_TM_XER_SO_BUG, 96) +END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG) nop 4: @@ -3216,10 +3123,12 @@ END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, CPU_FTR_P9_TM_XER_SO_BUG, 96) * Restore transactional state and TM-related registers. * Called with r3 pointing to the vcpu struct * and r4 containing the guest MSR value. + * r5 is non-zero iff non-volatile register state needs to be maintained. * This potentially modifies all checkpointed registers. * It restores r1 and r2 from the PACA. */ -kvmppc_restore_tm_hv: +_GLOBAL_TOC(kvmppc_restore_tm_hv) +EXPORT_SYMBOL_GPL(kvmppc_restore_tm_hv) /* * If we are doing TM emulation for the guest on a POWER9 DD2, * then we don't actually do a trechkpt -- we either set up @@ -3424,6 +3333,194 @@ kvmppc_msr_interrupt: blr /* + * Load up guest PMU state. R3 points to the vcpu struct. + */ +_GLOBAL(kvmhv_load_guest_pmu) +EXPORT_SYMBOL_GPL(kvmhv_load_guest_pmu) + mr r4, r3 + mflr r0 + li r3, 1 + sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ + mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */ + isync +BEGIN_FTR_SECTION + ld r3, VCPU_MMCR(r4) + andi. r5, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO + cmpwi r5, MMCR0_PMAO + beql kvmppc_fix_pmao +END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG) + lwz r3, VCPU_PMC(r4) /* always load up guest PMU registers */ + lwz r5, VCPU_PMC + 4(r4) /* to prevent information leak */ + lwz r6, VCPU_PMC + 8(r4) + lwz r7, VCPU_PMC + 12(r4) + lwz r8, VCPU_PMC + 16(r4) + lwz r9, VCPU_PMC + 20(r4) + mtspr SPRN_PMC1, r3 + mtspr SPRN_PMC2, r5 + mtspr SPRN_PMC3, r6 + mtspr SPRN_PMC4, r7 + mtspr SPRN_PMC5, r8 + mtspr SPRN_PMC6, r9 + ld r3, VCPU_MMCR(r4) + ld r5, VCPU_MMCR + 8(r4) + ld r6, VCPU_MMCR + 16(r4) + ld r7, VCPU_SIAR(r4) + ld r8, VCPU_SDAR(r4) + mtspr SPRN_MMCR1, r5 + mtspr SPRN_MMCRA, r6 + mtspr SPRN_SIAR, r7 + mtspr SPRN_SDAR, r8 +BEGIN_FTR_SECTION + ld r5, VCPU_MMCR + 24(r4) + ld r6, VCPU_SIER(r4) + mtspr SPRN_MMCR2, r5 + mtspr SPRN_SIER, r6 +BEGIN_FTR_SECTION_NESTED(96) + lwz r7, VCPU_PMC + 24(r4) + lwz r8, VCPU_PMC + 28(r4) + ld r9, VCPU_MMCR + 32(r4) + mtspr SPRN_SPMC1, r7 + mtspr SPRN_SPMC2, r8 + mtspr SPRN_MMCRS, r9 +END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + mtspr SPRN_MMCR0, r3 + isync + mtlr r0 + blr + +/* + * Reload host PMU state saved in the PACA by kvmhv_save_host_pmu. + */ +_GLOBAL(kvmhv_load_host_pmu) +EXPORT_SYMBOL_GPL(kvmhv_load_host_pmu) + mflr r0 + lbz r4, PACA_PMCINUSE(r13) /* is the host using the PMU? */ + cmpwi r4, 0 + beq 23f /* skip if not */ +BEGIN_FTR_SECTION + ld r3, HSTATE_MMCR0(r13) + andi. r4, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO + cmpwi r4, MMCR0_PMAO + beql kvmppc_fix_pmao +END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG) + lwz r3, HSTATE_PMC1(r13) + lwz r4, HSTATE_PMC2(r13) + lwz r5, HSTATE_PMC3(r13) + lwz r6, HSTATE_PMC4(r13) + lwz r8, HSTATE_PMC5(r13) + lwz r9, HSTATE_PMC6(r13) + mtspr SPRN_PMC1, r3 + mtspr SPRN_PMC2, r4 + mtspr SPRN_PMC3, r5 + mtspr SPRN_PMC4, r6 + mtspr SPRN_PMC5, r8 + mtspr SPRN_PMC6, r9 + ld r3, HSTATE_MMCR0(r13) + ld r4, HSTATE_MMCR1(r13) + ld r5, HSTATE_MMCRA(r13) + ld r6, HSTATE_SIAR(r13) + ld r7, HSTATE_SDAR(r13) + mtspr SPRN_MMCR1, r4 + mtspr SPRN_MMCRA, r5 + mtspr SPRN_SIAR, r6 + mtspr SPRN_SDAR, r7 +BEGIN_FTR_SECTION + ld r8, HSTATE_MMCR2(r13) + ld r9, HSTATE_SIER(r13) + mtspr SPRN_MMCR2, r8 + mtspr SPRN_SIER, r9 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + mtspr SPRN_MMCR0, r3 + isync + mtlr r0 +23: blr + +/* + * Save guest PMU state into the vcpu struct. + * r3 = vcpu, r4 = full save flag (PMU in use flag set in VPA) + */ +_GLOBAL(kvmhv_save_guest_pmu) +EXPORT_SYMBOL_GPL(kvmhv_save_guest_pmu) + mr r9, r3 + mr r8, r4 +BEGIN_FTR_SECTION + /* + * POWER8 seems to have a hardware bug where setting + * MMCR0[PMAE] along with MMCR0[PMC1CE] and/or MMCR0[PMCjCE] + * when some counters are already negative doesn't seem + * to cause a performance monitor alert (and hence interrupt). + * The effect of this is that when saving the PMU state, + * if there is no PMU alert pending when we read MMCR0 + * before freezing the counters, but one becomes pending + * before we read the counters, we lose it. + * To work around this, we need a way to freeze the counters + * before reading MMCR0. Normally, freezing the counters + * is done by writing MMCR0 (to set MMCR0[FC]) which + * unavoidably writes MMCR0[PMA0] as well. On POWER8, + * we can also freeze the counters using MMCR2, by writing + * 1s to all the counter freeze condition bits (there are + * 9 bits each for 6 counters). + */ + li r3, -1 /* set all freeze bits */ + clrrdi r3, r3, 10 + mfspr r10, SPRN_MMCR2 + mtspr SPRN_MMCR2, r3 + isync +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + li r3, 1 + sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ + mfspr r4, SPRN_MMCR0 /* save MMCR0 */ + mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */ + mfspr r6, SPRN_MMCRA + /* Clear MMCRA in order to disable SDAR updates */ + li r7, 0 + mtspr SPRN_MMCRA, r7 + isync + cmpwi r8, 0 /* did they ask for PMU stuff to be saved? */ + bne 21f + std r3, VCPU_MMCR(r9) /* if not, set saved MMCR0 to FC */ + b 22f +21: mfspr r5, SPRN_MMCR1 + mfspr r7, SPRN_SIAR + mfspr r8, SPRN_SDAR + std r4, VCPU_MMCR(r9) + std r5, VCPU_MMCR + 8(r9) + std r6, VCPU_MMCR + 16(r9) +BEGIN_FTR_SECTION + std r10, VCPU_MMCR + 24(r9) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + std r7, VCPU_SIAR(r9) + std r8, VCPU_SDAR(r9) + mfspr r3, SPRN_PMC1 + mfspr r4, SPRN_PMC2 + mfspr r5, SPRN_PMC3 + mfspr r6, SPRN_PMC4 + mfspr r7, SPRN_PMC5 + mfspr r8, SPRN_PMC6 + stw r3, VCPU_PMC(r9) + stw r4, VCPU_PMC + 4(r9) + stw r5, VCPU_PMC + 8(r9) + stw r6, VCPU_PMC + 12(r9) + stw r7, VCPU_PMC + 16(r9) + stw r8, VCPU_PMC + 20(r9) +BEGIN_FTR_SECTION + mfspr r5, SPRN_SIER + std r5, VCPU_SIER(r9) +BEGIN_FTR_SECTION_NESTED(96) + mfspr r6, SPRN_SPMC1 + mfspr r7, SPRN_SPMC2 + mfspr r8, SPRN_MMCRS + stw r6, VCPU_PMC + 24(r9) + stw r7, VCPU_PMC + 28(r9) + std r8, VCPU_MMCR + 32(r9) + lis r4, 0x8000 + mtspr SPRN_MMCRS, r4 +END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) +22: blr + +/* * This works around a hardware bug on POWER8E processors, where * writing a 1 to the MMCR0[PMAO] bit doesn't generate a * performance monitor interrupt. Instead, when we need to have diff --git a/arch/powerpc/kvm/book3s_hv_tm.c b/arch/powerpc/kvm/book3s_hv_tm.c @@ -130,7 +130,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) return RESUME_GUEST; } /* Set CR0 to indicate previous transactional state */ - vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | + vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) | (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28); /* L=1 => tresume, L=0 => tsuspend */ if (instr & (1 << 21)) { @@ -174,7 +174,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) copy_from_checkpoint(vcpu); /* Set CR0 to indicate previous transactional state */ - vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | + vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) | (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28); vcpu->arch.shregs.msr &= ~MSR_TS_MASK; return RESUME_GUEST; @@ -204,7 +204,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) copy_to_checkpoint(vcpu); /* Set CR0 to indicate previous transactional state */ - vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | + vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) | (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28); vcpu->arch.shregs.msr = msr | MSR_TS_S; return RESUME_GUEST; diff --git a/arch/powerpc/kvm/book3s_hv_tm_builtin.c b/arch/powerpc/kvm/book3s_hv_tm_builtin.c @@ -89,7 +89,8 @@ int kvmhv_p9_tm_emulation_early(struct kvm_vcpu *vcpu) if (instr & (1 << 21)) vcpu->arch.shregs.msr = (msr & ~MSR_TS_MASK) | MSR_TS_T; /* Set CR0 to 0b0010 */ - vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | 0x20000000; + vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) | + 0x20000000; return 1; } @@ -105,5 +106,5 @@ void kvmhv_emulate_tm_rollback(struct kvm_vcpu *vcpu) vcpu->arch.shregs.msr &= ~MSR_TS_MASK; /* go to N state */ vcpu->arch.regs.nip = vcpu->arch.tfhar; copy_from_checkpoint(vcpu); - vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | 0xa0000000; + vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) | 0xa0000000; } diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c @@ -167,7 +167,7 @@ void kvmppc_copy_to_svcpu(struct kvm_vcpu *vcpu) svcpu->gpr[11] = vcpu->arch.regs.gpr[11]; svcpu->gpr[12] = vcpu->arch.regs.gpr[12]; svcpu->gpr[13] = vcpu->arch.regs.gpr[13]; - svcpu->cr = vcpu->arch.cr; + svcpu->cr = vcpu->arch.regs.ccr; svcpu->xer = vcpu->arch.regs.xer; svcpu->ctr = vcpu->arch.regs.ctr; svcpu->lr = vcpu->arch.regs.link; @@ -249,7 +249,7 @@ void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu) vcpu->arch.regs.gpr[11] = svcpu->gpr[11]; vcpu->arch.regs.gpr[12] = svcpu->gpr[12]; vcpu->arch.regs.gpr[13] = svcpu->gpr[13]; - vcpu->arch.cr = svcpu->cr; + vcpu->arch.regs.ccr = svcpu->cr; vcpu->arch.regs.xer = svcpu->xer; vcpu->arch.regs.ctr = svcpu->ctr; vcpu->arch.regs.link = svcpu->lr; @@ -1246,7 +1246,6 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, r = RESUME_GUEST; break; case BOOK3S_INTERRUPT_EXTERNAL: - case BOOK3S_INTERRUPT_EXTERNAL_LEVEL: case BOOK3S_INTERRUPT_EXTERNAL_HV: case BOOK3S_INTERRUPT_H_VIRT: vcpu->stat.ext_intr_exits++; diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c @@ -310,7 +310,7 @@ static inline bool icp_try_update(struct kvmppc_icp *icp, */ if (new.out_ee) { kvmppc_book3s_queue_irqprio(icp->vcpu, - BOOK3S_INTERRUPT_EXTERNAL_LEVEL); + BOOK3S_INTERRUPT_EXTERNAL); if (!change_self) kvmppc_fast_vcpu_kick(icp->vcpu); } @@ -593,8 +593,7 @@ static noinline unsigned long kvmppc_h_xirr(struct kvm_vcpu *vcpu) u32 xirr; /* First, remove EE from the processor */ - kvmppc_book3s_dequeue_irqprio(icp->vcpu, - BOOK3S_INTERRUPT_EXTERNAL_LEVEL); + kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL); /* * ICP State: Accept_Interrupt @@ -754,8 +753,7 @@ static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) * We can remove EE from the current processor, the update * transaction will set it again if needed */ - kvmppc_book3s_dequeue_irqprio(icp->vcpu, - BOOK3S_INTERRUPT_EXTERNAL_LEVEL); + kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL); do { old_state = new_state = READ_ONCE(icp->state); @@ -1167,8 +1165,7 @@ int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval) * Deassert the CPU interrupt request. * icp_try_update will reassert it if necessary. */ - kvmppc_book3s_dequeue_irqprio(icp->vcpu, - BOOK3S_INTERRUPT_EXTERNAL_LEVEL); + kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL); /* * Note that if we displace an interrupt from old_state.xisr, @@ -1393,7 +1390,8 @@ static int kvmppc_xics_create(struct kvm_device *dev, u32 type) } #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - if (cpu_has_feature(CPU_FTR_ARCH_206)) { + if (cpu_has_feature(CPU_FTR_ARCH_206) && + cpu_has_feature(CPU_FTR_HVMODE)) { /* Enable real mode support */ xics->real_mode = ENABLE_REALMODE; xics->real_mode_dbg = DEBUG_REALMODE; diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c @@ -62,6 +62,69 @@ #define XIVE_Q_GAP 2 /* + * Push a vcpu's context to the XIVE on guest entry. + * This assumes we are in virtual mode (MMU on) + */ +void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) +{ + void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt; + u64 pq; + + if (!tima) + return; + eieio(); + __raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS); + __raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2); + vcpu->arch.xive_pushed = 1; + eieio(); + + /* + * We clear the irq_pending flag. There is a small chance of a + * race vs. the escalation interrupt happening on another + * processor setting it again, but the only consequence is to + * cause a spurious wakeup on the next H_CEDE, which is not an + * issue. + */ + vcpu->arch.irq_pending = 0; + + /* + * In single escalation mode, if the escalation interrupt is + * on, we mask it. + */ + if (vcpu->arch.xive_esc_on) { + pq = __raw_readq((void __iomem *)(vcpu->arch.xive_esc_vaddr + + XIVE_ESB_SET_PQ_01)); + mb(); + + /* + * We have a possible subtle race here: The escalation + * interrupt might have fired and be on its way to the + * host queue while we mask it, and if we unmask it + * early enough (re-cede right away), there is a + * theorical possibility that it fires again, thus + * landing in the target queue more than once which is + * a big no-no. + * + * Fortunately, solving this is rather easy. If the + * above load setting PQ to 01 returns a previous + * value where P is set, then we know the escalation + * interrupt is somewhere on its way to the host. In + * that case we simply don't clear the xive_esc_on + * flag below. It will be eventually cleared by the + * handler for the escalation interrupt. + * + * Then, when doing a cede, we check that flag again + * before re-enabling the escalation interrupt, and if + * set, we abort the cede. + */ + if (!(pq & XIVE_ESB_VAL_P)) + /* Now P is 0, we can clear the flag */ + vcpu->arch.xive_esc_on = 0; + } +} +EXPORT_SYMBOL_GPL(kvmppc_xive_push_vcpu); + +/* * This is a simple trigger for a generic XIVE IRQ. This must * only be called for interrupts that support a trigger page */ diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/book3s_xive_template.c @@ -280,14 +280,6 @@ X_STATIC unsigned long GLUE(X_PFX,h_xirr)(struct kvm_vcpu *vcpu) /* First collect pending bits from HW */ GLUE(X_PFX,ack_pending)(xc); - /* - * Cleanup the old-style bits if needed (they may have been - * set by pull or an escalation interrupts). - */ - if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions)) - clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, - &vcpu->arch.pending_exceptions); - pr_devel(" new pending=0x%02x hw_cppr=%d cppr=%d\n", xc->pending, xc->hw_cppr, xc->cppr); diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S @@ -182,7 +182,7 @@ */ PPC_LL r4, PACACURRENT(r13) PPC_LL r4, (THREAD + THREAD_KVM_VCPU)(r4) - stw r10, VCPU_CR(r4) + PPC_STL r10, VCPU_CR(r4) PPC_STL r11, VCPU_GPR(R4)(r4) PPC_STL r5, VCPU_GPR(R5)(r4) PPC_STL r6, VCPU_GPR(R6)(r4) @@ -292,7 +292,7 @@ _GLOBAL(kvmppc_handler_\intno\()_\srr1) PPC_STL r4, VCPU_GPR(R4)(r11) PPC_LL r4, THREAD_NORMSAVE(0)(r10) PPC_STL r5, VCPU_GPR(R5)(r11) - stw r13, VCPU_CR(r11) + PPC_STL r13, VCPU_CR(r11) mfspr r5, \srr0 PPC_STL r3, VCPU_GPR(R10)(r11) PPC_LL r3, THREAD_NORMSAVE(2)(r10) @@ -319,7 +319,7 @@ _GLOBAL(kvmppc_handler_\intno\()_\srr1) PPC_STL r4, VCPU_GPR(R4)(r11) PPC_LL r4, GPR9(r8) PPC_STL r5, VCPU_GPR(R5)(r11) - stw r9, VCPU_CR(r11) + PPC_STL r9, VCPU_CR(r11) mfspr r5, \srr0 PPC_STL r3, VCPU_GPR(R8)(r11) PPC_LL r3, GPR10(r8) @@ -643,7 +643,7 @@ lightweight_exit: PPC_LL r3, VCPU_LR(r4) PPC_LL r5, VCPU_XER(r4) PPC_LL r6, VCPU_CTR(r4) - lwz r7, VCPU_CR(r4) + PPC_LL r7, VCPU_CR(r4) PPC_LL r8, VCPU_PC(r4) PPC_LD(r9, VCPU_SHARED_MSR, r11) PPC_LL r0, VCPU_GPR(R0)(r4) diff --git a/arch/powerpc/kvm/emulate_loadstore.c b/arch/powerpc/kvm/emulate_loadstore.c @@ -117,7 +117,6 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) emulated = EMULATE_FAIL; vcpu->arch.regs.msr = vcpu->arch.shared->msr; - vcpu->arch.regs.ccr = vcpu->arch.cr; if (analyse_instr(&op, &vcpu->arch.regs, inst) == 0) { int type = op.type & INSTR_TYPE_MASK; int size = GETSIZE(op.type); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c @@ -594,7 +594,12 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = !!(hv_enabled && radix_enabled()); break; case KVM_CAP_PPC_MMU_HASH_V3: - r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300)); + r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300) && + cpu_has_feature(CPU_FTR_HVMODE)); + break; + case KVM_CAP_PPC_NESTED_HV: + r = !!(hv_enabled && kvmppc_hv_ops->enable_nested && + !kvmppc_hv_ops->enable_nested(NULL)); break; #endif case KVM_CAP_SYNC_MMU: @@ -2114,6 +2119,14 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, r = kvm->arch.kvm_ops->set_smt_mode(kvm, mode, flags); break; } + + case KVM_CAP_PPC_NESTED_HV: + r = -EINVAL; + if (!is_kvmppc_hv_enabled(kvm) || + !kvm->arch.kvm_ops->enable_nested) + break; + r = kvm->arch.kvm_ops->enable_nested(kvm); + break; #endif default: r = -EINVAL; diff --git a/arch/powerpc/kvm/tm.S b/arch/powerpc/kvm/tm.S @@ -28,17 +28,25 @@ * Save transactional state and TM-related registers. * Called with: * - r3 pointing to the vcpu struct - * - r4 points to the MSR with current TS bits: + * - r4 containing the MSR with current TS bits: * (For HV KVM, it is VCPU_MSR ; For PR KVM, it is host MSR). - * This can modify all checkpointed registers, but - * restores r1, r2 before exit. + * - r5 containing a flag indicating that non-volatile registers + * must be preserved. + * If r5 == 0, this can modify all checkpointed registers, but + * restores r1, r2 before exit. If r5 != 0, this restores the + * MSR TM/FP/VEC/VSX bits to their state on entry. */ _GLOBAL(__kvmppc_save_tm) mflr r0 std r0, PPC_LR_STKOFF(r1) + stdu r1, -SWITCH_FRAME_SIZE(r1) + + mr r9, r3 + cmpdi cr7, r5, 0 /* Turn on TM. */ mfmsr r8 + mr r10, r8 li r0, 1 rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG ori r8, r8, MSR_FP @@ -51,6 +59,27 @@ _GLOBAL(__kvmppc_save_tm) std r1, HSTATE_SCRATCH2(r13) std r3, HSTATE_SCRATCH1(r13) + /* Save CR on the stack - even if r5 == 0 we need to get cr7 back. */ + mfcr r6 + SAVE_GPR(6, r1) + + /* Save DSCR so we can restore it to avoid running with user value */ + mfspr r7, SPRN_DSCR + SAVE_GPR(7, r1) + + /* + * We are going to do treclaim., which will modify all checkpointed + * registers. Save the non-volatile registers on the stack if + * preservation of non-volatile state has been requested. + */ + beq cr7, 3f + SAVE_NVGPRS(r1) + + /* MSR[TS] will be 0 (non-transactional) once we do treclaim. */ + li r0, 0 + rldimi r10, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG + SAVE_GPR(10, r1) /* final MSR value */ +3: #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE BEGIN_FTR_SECTION /* Emulation of the treclaim instruction needs TEXASR before treclaim */ @@ -74,22 +103,25 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST) std r9, PACATMSCRATCH(r13) ld r9, HSTATE_SCRATCH1(r13) - /* Get a few more GPRs free. */ - std r29, VCPU_GPRS_TM(29)(r9) - std r30, VCPU_GPRS_TM(30)(r9) - std r31, VCPU_GPRS_TM(31)(r9) - - /* Save away PPR and DSCR soon so don't run with user values. */ - mfspr r31, SPRN_PPR + /* Save away PPR soon so we don't run with user value. */ + std r0, VCPU_GPRS_TM(0)(r9) + mfspr r0, SPRN_PPR HMT_MEDIUM - mfspr r30, SPRN_DSCR -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - ld r29, HSTATE_DSCR(r13) - mtspr SPRN_DSCR, r29 -#endif - /* Save all but r9, r13 & r29-r31 */ - reg = 0 + /* Reload stack pointer. */ + std r1, VCPU_GPRS_TM(1)(r9) + ld r1, HSTATE_SCRATCH2(r13) + + /* Set MSR RI now we have r1 and r13 back. */ + std r2, VCPU_GPRS_TM(2)(r9) + li r2, MSR_RI + mtmsrd r2, 1 + + /* Reload TOC pointer. */ + ld r2, PACATOC(r13) + + /* Save all but r0-r2, r9 & r13 */ + reg = 3 .rept 29 .if (reg != 9) && (reg != 13) std reg, VCPU_GPRS_TM(reg)(r9) @@ -103,33 +135,29 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST) ld r4, PACATMSCRATCH(r13) std r4, VCPU_GPRS_TM(9)(r9) - /* Reload stack pointer and TOC. */ - ld r1, HSTATE_SCRATCH2(r13) - ld r2, PACATOC(r13) - - /* Set MSR RI now we have r1 and r13 back. */ - li r5, MSR_RI - mtmsrd r5, 1 + /* Restore host DSCR and CR values, after saving guest values */ + mfcr r6 + mfspr r7, SPRN_DSCR + stw r6, VCPU_CR_TM(r9) + std r7, VCPU_DSCR_TM(r9) + REST_GPR(6, r1) + REST_GPR(7, r1) + mtcr r6 + mtspr SPRN_DSCR, r7 - /* Save away checkpinted SPRs. */ - std r31, VCPU_PPR_TM(r9) - std r30, VCPU_DSCR_TM(r9) + /* Save away checkpointed SPRs. */ + std r0, VCPU_PPR_TM(r9) mflr r5 - mfcr r6 mfctr r7 mfspr r8, SPRN_AMR mfspr r10, SPRN_TAR mfxer r11 std r5, VCPU_LR_TM(r9) - stw r6, VCPU_CR_TM(r9) std r7, VCPU_CTR_TM(r9) std r8, VCPU_AMR_TM(r9) std r10, VCPU_TAR_TM(r9) std r11, VCPU_XER_TM(r9) - /* Restore r12 as trap number. */ - lwz r12, VCPU_TRAP(r9) - /* Save FP/VSX. */ addi r3, r9, VCPU_FPRS_TM bl store_fp_state @@ -137,6 +165,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST) bl store_vr_state mfspr r6, SPRN_VRSAVE stw r6, VCPU_VRSAVE_TM(r9) + + /* Restore non-volatile registers if requested to */ + beq cr7, 1f + REST_NVGPRS(r1) + REST_GPR(10, r1) 1: /* * We need to save these SPRs after the treclaim so that the software @@ -146,12 +179,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST) */ mfspr r7, SPRN_TEXASR std r7, VCPU_TEXASR(r9) -11: mfspr r5, SPRN_TFHAR mfspr r6, SPRN_TFIAR std r5, VCPU_TFHAR(r9) std r6, VCPU_TFIAR(r9) + /* Restore MSR state if requested */ + beq cr7, 2f + mtmsrd r10, 0 +2: + addi r1, r1, SWITCH_FRAME_SIZE ld r0, PPC_LR_STKOFF(r1) mtlr r0 blr @@ -161,49 +198,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST) * be invoked from C function by PR KVM only. */ _GLOBAL(_kvmppc_save_tm_pr) - mflr r5 - std r5, PPC_LR_STKOFF(r1) - stdu r1, -SWITCH_FRAME_SIZE(r1) - SAVE_NVGPRS(r1) - - /* save MSR since TM/math bits might be impacted - * by __kvmppc_save_tm(). - */ - mfmsr r5 - SAVE_GPR(5, r1) - - /* also save DSCR/CR/TAR so that it can be recovered later */ - mfspr r6, SPRN_DSCR - SAVE_GPR(6, r1) - - mfcr r7 - stw r7, _CCR(r1) + mflr r0 + std r0, PPC_LR_STKOFF(r1) + stdu r1, -PPC_MIN_STKFRM(r1) mfspr r8, SPRN_TAR - SAVE_GPR(8, r1) + std r8, PPC_MIN_STKFRM-8(r1) + li r5, 1 /* preserve non-volatile registers */ bl __kvmppc_save_tm - REST_GPR(8, r1) + ld r8, PPC_MIN_STKFRM-8(r1) mtspr SPRN_TAR, r8 - ld r7, _CCR(r1) - mtcr r7 - - REST_GPR(6, r1) - mtspr SPRN_DSCR, r6 - - /* need preserve current MSR's MSR_TS bits */ - REST_GPR(5, r1) - mfmsr r6 - rldicl r6, r6, 64 - MSR_TS_S_LG, 62 - rldimi r5, r6, MSR_TS_S_LG, 63 - MSR_TS_T_LG - mtmsrd r5 - - REST_NVGPRS(r1) - addi r1, r1, SWITCH_FRAME_SIZE - ld r5, PPC_LR_STKOFF(r1) - mtlr r5 + addi r1, r1, PPC_MIN_STKFRM + ld r0, PPC_LR_STKOFF(r1) + mtlr r0 blr EXPORT_SYMBOL_GPL(_kvmppc_save_tm_pr); @@ -215,15 +225,21 @@ EXPORT_SYMBOL_GPL(_kvmppc_save_tm_pr); * - r4 is the guest MSR with desired TS bits: * For HV KVM, it is VCPU_MSR * For PR KVM, it is provided by caller - * This potentially modifies all checkpointed registers. - * It restores r1, r2 from the PACA. + * - r5 containing a flag indicating that non-volatile registers + * must be preserved. + * If r5 == 0, this potentially modifies all checkpointed registers, but + * restores r1, r2 from the PACA before exit. + * If r5 != 0, this restores the MSR TM/FP/VEC/VSX bits to their state on entry. */ _GLOBAL(__kvmppc_restore_tm) mflr r0 std r0, PPC_LR_STKOFF(r1) + cmpdi cr7, r5, 0 + /* Turn on TM/FP/VSX/VMX so we can restore them. */ mfmsr r5 + mr r10, r5 li r6, MSR_TM >> 32 sldi r6, r6, 32 or r5, r5, r6 @@ -244,8 +260,7 @@ _GLOBAL(__kvmppc_restore_tm) mr r5, r4 rldicl. r5, r5, 64 - MSR_TS_S_LG, 62 - beqlr /* TM not active in guest */ - std r1, HSTATE_SCRATCH2(r13) + beq 9f /* TM not active in guest */ /* Make sure the failure summary is set, otherwise we'll program check * when we trechkpt. It's possible that this might have been not set @@ -256,6 +271,26 @@ _GLOBAL(__kvmppc_restore_tm) mtspr SPRN_TEXASR, r7 /* + * Make a stack frame and save non-volatile registers if requested. + */ + stdu r1, -SWITCH_FRAME_SIZE(r1) + std r1, HSTATE_SCRATCH2(r13) + + mfcr r6 + mfspr r7, SPRN_DSCR + SAVE_GPR(2, r1) + SAVE_GPR(6, r1) + SAVE_GPR(7, r1) + + beq cr7, 4f + SAVE_NVGPRS(r1) + + /* MSR[TS] will be 1 (suspended) once we do trechkpt */ + li r0, 1 + rldimi r10, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG + SAVE_GPR(10, r1) /* final MSR value */ +4: + /* * We need to load up the checkpointed state for the guest. * We need to do this early as it will blow away any GPRs, VSRs and * some SPRs. @@ -291,8 +326,6 @@ _GLOBAL(__kvmppc_restore_tm) ld r29, VCPU_DSCR_TM(r3) ld r30, VCPU_PPR_TM(r3) - std r2, PACATMSCRATCH(r13) /* Save TOC */ - /* Clear the MSR RI since r1, r13 are all going to be foobar. */ li r5, 0 mtmsrd r5, 1 @@ -318,18 +351,31 @@ _GLOBAL(__kvmppc_restore_tm) /* Now let's get back the state we need. */ HMT_MEDIUM GET_PACA(r13) -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - ld r29, HSTATE_DSCR(r13) - mtspr SPRN_DSCR, r29 -#endif ld r1, HSTATE_SCRATCH2(r13) - ld r2, PACATMSCRATCH(r13) + REST_GPR(7, r1) + mtspr SPRN_DSCR, r7 /* Set the MSR RI since we have our registers back. */ li r5, MSR_RI mtmsrd r5, 1 + + /* Restore TOC pointer and CR */ + REST_GPR(2, r1) + REST_GPR(6, r1) + mtcr r6 + + /* Restore non-volatile registers if requested to. */ + beq cr7, 5f + REST_GPR(10, r1) + REST_NVGPRS(r1) + +5: addi r1, r1, SWITCH_FRAME_SIZE ld r0, PPC_LR_STKOFF(r1) mtlr r0 + +9: /* Restore MSR bits if requested */ + beqlr cr7 + mtmsrd r10, 0 blr /* @@ -337,47 +383,23 @@ _GLOBAL(__kvmppc_restore_tm) * can be invoked from C function by PR KVM only. */ _GLOBAL(_kvmppc_restore_tm_pr) - mflr r5 - std r5, PPC_LR_STKOFF(r1) - stdu r1, -SWITCH_FRAME_SIZE(r1) - SAVE_NVGPRS(r1) - - /* save MSR to avoid TM/math bits change */ - mfmsr r5 - SAVE_GPR(5, r1) - - /* also save DSCR/CR/TAR so that it can be recovered later */ - mfspr r6, SPRN_DSCR - SAVE_GPR(6, r1) - - mfcr r7 - stw r7, _CCR(r1) + mflr r0 + std r0, PPC_LR_STKOFF(r1) + stdu r1, -PPC_MIN_STKFRM(r1) + /* save TAR so that it can be recovered later */ mfspr r8, SPRN_TAR - SAVE_GPR(8, r1) + std r8, PPC_MIN_STKFRM-8(r1) + li r5, 1 bl __kvmppc_restore_tm - REST_GPR(8, r1) + ld r8, PPC_MIN_STKFRM-8(r1) mtspr SPRN_TAR, r8 - ld r7, _CCR(r1) - mtcr r7 - - REST_GPR(6, r1) - mtspr SPRN_DSCR, r6 - - /* need preserve current MSR's MSR_TS bits */ - REST_GPR(5, r1) - mfmsr r6 - rldicl r6, r6, 64 - MSR_TS_S_LG, 62 - rldimi r5, r6, MSR_TS_S_LG, 63 - MSR_TS_T_LG - mtmsrd r5 - - REST_NVGPRS(r1) - addi r1, r1, SWITCH_FRAME_SIZE - ld r5, PPC_LR_STKOFF(r1) - mtlr r5 + addi r1, r1, PPC_MIN_STKFRM + ld r0, PPC_LR_STKOFF(r1) + mtlr r0 blr EXPORT_SYMBOL_GPL(_kvmppc_restore_tm_pr); diff --git a/arch/powerpc/kvm/trace_book3s.h b/arch/powerpc/kvm/trace_book3s.h @@ -14,7 +14,6 @@ {0x400, "INST_STORAGE"}, \ {0x480, "INST_SEGMENT"}, \ {0x500, "EXTERNAL"}, \ - {0x501, "EXTERNAL_LEVEL"}, \ {0x502, "EXTERNAL_HV"}, \ {0x600, "ALIGNMENT"}, \ {0x700, "PROGRAM"}, \ diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c @@ -833,6 +833,15 @@ EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid); /* * Flush partition scoped translations from LPID (=LPIDR) */ +void radix__flush_tlb_lpid(unsigned int lpid) +{ + _tlbie_lpid(lpid, RIC_FLUSH_ALL); +} +EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid); + +/* + * Flush partition scoped translations from LPID (=LPIDR) + */ void radix__local_flush_tlb_lpid(unsigned int lpid) { _tlbiel_lpid(lpid, RIC_FLUSH_ALL); diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig @@ -783,6 +783,17 @@ config VFIO_CCW To compile this driver as a module, choose M here: the module will be called vfio_ccw. +config VFIO_AP + def_tristate n + prompt "VFIO support for AP devices" + depends on S390_AP_IOMMU && VFIO_MDEV_DEVICE && KVM + help + This driver grants access to Adjunct Processor (AP) devices + via the VFIO mediated device interface. + + To compile this driver as a module, choose M here: the module + will be called vfio_ap. + endmenu menu "Dump support" diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h @@ -44,6 +44,7 @@ #define KVM_REQ_ICPT_OPEREXC KVM_ARCH_REQ(2) #define KVM_REQ_START_MIGRATION KVM_ARCH_REQ(3) #define KVM_REQ_STOP_MIGRATION KVM_ARCH_REQ(4) +#define KVM_REQ_VSIE_RESTART KVM_ARCH_REQ(5) #define SIGP_CTRL_C 0x80 #define SIGP_CTRL_SCN_MASK 0x3f @@ -186,6 +187,7 @@ struct kvm_s390_sie_block { #define ECA_AIV 0x00200000 #define ECA_VX 0x00020000 #define ECA_PROTEXCI 0x00002000 +#define ECA_APIE 0x00000008 #define ECA_SII 0x00000001 __u32 eca; /* 0x004c */ #define ICPT_INST 0x04 @@ -237,7 +239,11 @@ struct kvm_s390_sie_block { psw_t gpsw; /* 0x0090 */ __u64 gg14; /* 0x00a0 */ __u64 gg15; /* 0x00a8 */ - __u8 reservedb0[20]; /* 0x00b0 */ + __u8 reservedb0[8]; /* 0x00b0 */ +#define HPID_KVM 0x4 +#define HPID_VSIE 0x5 + __u8 hpid; /* 0x00b8 */ + __u8 reservedb9[11]; /* 0x00b9 */ __u16 extcpuaddr; /* 0x00c4 */ __u16 eic; /* 0x00c6 */ __u32 reservedc8; /* 0x00c8 */ @@ -255,6 +261,8 @@ struct kvm_s390_sie_block { __u8 reservede4[4]; /* 0x00e4 */ __u64 tecmc; /* 0x00e8 */ __u8 reservedf0[12]; /* 0x00f0 */ +#define CRYCB_FORMAT_MASK 0x00000003 +#define CRYCB_FORMAT0 0x00000000 #define CRYCB_FORMAT1 0x00000001 #define CRYCB_FORMAT2 0x00000003 __u32 crycbd; /* 0x00fc */ @@ -715,6 +723,7 @@ struct kvm_s390_crypto { __u32 crycbd; __u8 aes_kw; __u8 dea_kw; + __u8 apie; }; #define APCB0_MASK_SIZE 1 @@ -855,6 +864,10 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); +void kvm_arch_crypto_clear_masks(struct kvm *kvm); +void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm, + unsigned long *aqm, unsigned long *adm); + extern int sie64a(struct kvm_s390_sie_block *, u64 *); extern char sie_exit; diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h @@ -160,6 +160,8 @@ struct kvm_s390_vm_cpu_subfunc { #define KVM_S390_VM_CRYPTO_ENABLE_DEA_KW 1 #define KVM_S390_VM_CRYPTO_DISABLE_AES_KW 2 #define KVM_S390_VM_CRYPTO_DISABLE_DEA_KW 3 +#define KVM_S390_VM_CRYPTO_ENABLE_APIE 4 +#define KVM_S390_VM_CRYPTO_DISABLE_APIE 5 /* kvm attributes for migration mode */ #define KVM_S390_VM_MIGRATION_STOP 0 diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c @@ -40,6 +40,7 @@ #include <asm/sclp.h> #include <asm/cpacf.h> #include <asm/timex.h> +#include <asm/ap.h> #include "kvm-s390.h" #include "gaccess.h" @@ -844,20 +845,24 @@ void kvm_s390_vcpu_crypto_reset_all(struct kvm *kvm) kvm_s390_vcpu_block_all(kvm); - kvm_for_each_vcpu(i, vcpu, kvm) + kvm_for_each_vcpu(i, vcpu, kvm) { kvm_s390_vcpu_crypto_setup(vcpu); + /* recreate the shadow crycb by leaving the VSIE handler */ + kvm_s390_sync_request(KVM_REQ_VSIE_RESTART, vcpu); + } kvm_s390_vcpu_unblock_all(kvm); } static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr) { - if (!test_kvm_facility(kvm, 76)) - return -EINVAL; - mutex_lock(&kvm->lock); switch (attr->attr) { case KVM_S390_VM_CRYPTO_ENABLE_AES_KW: + if (!test_kvm_facility(kvm, 76)) { + mutex_unlock(&kvm->lock); + return -EINVAL; + } get_random_bytes( kvm->arch.crypto.crycb->aes_wrapping_key_mask, sizeof(kvm->arch.crypto.crycb->aes_wrapping_key_mask)); @@ -865,6 +870,10 @@ static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr) VM_EVENT(kvm, 3, "%s", "ENABLE: AES keywrapping support"); break; case KVM_S390_VM_CRYPTO_ENABLE_DEA_KW: + if (!test_kvm_facility(kvm, 76)) { + mutex_unlock(&kvm->lock); + return -EINVAL; + } get_random_bytes( kvm->arch.crypto.crycb->dea_wrapping_key_mask, sizeof(kvm->arch.crypto.crycb->dea_wrapping_key_mask)); @@ -872,17 +881,39 @@ static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr) VM_EVENT(kvm, 3, "%s", "ENABLE: DEA keywrapping support"); break; case KVM_S390_VM_CRYPTO_DISABLE_AES_KW: + if (!test_kvm_facility(kvm, 76)) { + mutex_unlock(&kvm->lock); + return -EINVAL; + } kvm->arch.crypto.aes_kw = 0; memset(kvm->arch.crypto.crycb->aes_wrapping_key_mask, 0, sizeof(kvm->arch.crypto.crycb->aes_wrapping_key_mask)); VM_EVENT(kvm, 3, "%s", "DISABLE: AES keywrapping support"); break; case KVM_S390_VM_CRYPTO_DISABLE_DEA_KW: + if (!test_kvm_facility(kvm, 76)) { + mutex_unlock(&kvm->lock); + return -EINVAL; + } kvm->arch.crypto.dea_kw = 0; memset(kvm->arch.crypto.crycb->dea_wrapping_key_mask, 0, sizeof(kvm->arch.crypto.crycb->dea_wrapping_key_mask)); VM_EVENT(kvm, 3, "%s", "DISABLE: DEA keywrapping support"); break; + case KVM_S390_VM_CRYPTO_ENABLE_APIE: + if (!ap_instructions_available()) { + mutex_unlock(&kvm->lock); + return -EOPNOTSUPP; + } + kvm->arch.crypto.apie = 1; + break; + case KVM_S390_VM_CRYPTO_DISABLE_APIE: + if (!ap_instructions_available()) { + mutex_unlock(&kvm->lock); + return -EOPNOTSUPP; + } + kvm->arch.crypto.apie = 0; + break; default: mutex_unlock(&kvm->lock); return -ENXIO; @@ -1491,6 +1522,10 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) case KVM_S390_VM_CRYPTO_DISABLE_DEA_KW: ret = 0; break; + case KVM_S390_VM_CRYPTO_ENABLE_APIE: + case KVM_S390_VM_CRYPTO_DISABLE_APIE: + ret = ap_instructions_available() ? 0 : -ENXIO; + break; default: ret = -ENXIO; break; @@ -1992,55 +2027,101 @@ long kvm_arch_vm_ioctl(struct file *filp, return r; } -static int kvm_s390_query_ap_config(u8 *config) -{ - u32 fcn_code = 0x04000000UL; - u32 cc = 0; - - memset(config, 0, 128); - asm volatile( - "lgr 0,%1\n" - "lgr 2,%2\n" - ".long 0xb2af0000\n" /* PQAP(QCI) */ - "0: ipm %0\n" - "srl %0,28\n" - "1:\n" - EX_TABLE(0b, 1b) - : "+r" (cc) - : "r" (fcn_code), "r" (config) - : "cc", "0", "2", "memory" - ); - - return cc; -} - static int kvm_s390_apxa_installed(void) { - u8 config[128]; - int cc; + struct ap_config_info info; - if (test_facility(12)) { - cc = kvm_s390_query_ap_config(config); - - if (cc) - pr_err("PQAP(QCI) failed with cc=%d", cc); - else - return config[0] & 0x40; + if (ap_instructions_available()) { + if (ap_qci(&info) == 0) + return info.apxa; } return 0; } +/* + * The format of the crypto control block (CRYCB) is specified in the 3 low + * order bits of the CRYCB designation (CRYCBD) field as follows: + * Format 0: Neither the message security assist extension 3 (MSAX3) nor the + * AP extended addressing (APXA) facility are installed. + * Format 1: The APXA facility is not installed but the MSAX3 facility is. + * Format 2: Both the APXA and MSAX3 facilities are installed + */ static void kvm_s390_set_crycb_format(struct kvm *kvm) { kvm->arch.crypto.crycbd = (__u32)(unsigned long) kvm->arch.crypto.crycb; + /* Clear the CRYCB format bits - i.e., set format 0 by default */ + kvm->arch.crypto.crycbd &= ~(CRYCB_FORMAT_MASK); + + /* Check whether MSAX3 is installed */ + if (!test_kvm_facility(kvm, 76)) + return; + if (kvm_s390_apxa_installed()) kvm->arch.crypto.crycbd |= CRYCB_FORMAT2; else kvm->arch.crypto.crycbd |= CRYCB_FORMAT1; } +void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm, + unsigned long *aqm, unsigned long *adm) +{ + struct kvm_s390_crypto_cb *crycb = kvm->arch.crypto.crycb; + + mutex_lock(&kvm->lock); + kvm_s390_vcpu_block_all(kvm); + + switch (kvm->arch.crypto.crycbd & CRYCB_FORMAT_MASK) { + case CRYCB_FORMAT2: /* APCB1 use 256 bits */ + memcpy(crycb->apcb1.apm, apm, 32); + VM_EVENT(kvm, 3, "SET CRYCB: apm %016lx %016lx %016lx %016lx", + apm[0], apm[1], apm[2], apm[3]); + memcpy(crycb->apcb1.aqm, aqm, 32); + VM_EVENT(kvm, 3, "SET CRYCB: aqm %016lx %016lx %016lx %016lx", + aqm[0], aqm[1], aqm[2], aqm[3]); + memcpy(crycb->apcb1.adm, adm, 32); + VM_EVENT(kvm, 3, "SET CRYCB: adm %016lx %016lx %016lx %016lx", + adm[0], adm[1], adm[2], adm[3]); + break; + case CRYCB_FORMAT1: + case CRYCB_FORMAT0: /* Fall through both use APCB0 */ + memcpy(crycb->apcb0.apm, apm, 8); + memcpy(crycb->apcb0.aqm, aqm, 2); + memcpy(crycb->apcb0.adm, adm, 2); + VM_EVENT(kvm, 3, "SET CRYCB: apm %016lx aqm %04x adm %04x", + apm[0], *((unsigned short *)aqm), + *((unsigned short *)adm)); + break; + default: /* Can not happen */ + break; + } + + /* recreate the shadow crycb for each vcpu */ + kvm_s390_sync_request_broadcast(kvm, KVM_REQ_VSIE_RESTART); + kvm_s390_vcpu_unblock_all(kvm); + mutex_unlock(&kvm->lock); +} +EXPORT_SYMBOL_GPL(kvm_arch_crypto_set_masks); + +void kvm_arch_crypto_clear_masks(struct kvm *kvm) +{ + mutex_lock(&kvm->lock); + kvm_s390_vcpu_block_all(kvm); + + memset(&kvm->arch.crypto.crycb->apcb0, 0, + sizeof(kvm->arch.crypto.crycb->apcb0)); + memset(&kvm->arch.crypto.crycb->apcb1, 0, + sizeof(kvm->arch.crypto.crycb->apcb1)); + + VM_EVENT(kvm, 3, "%s", "CLR CRYCB:"); + /* recreate the shadow crycb for each vcpu */ + kvm_s390_sync_request_broadcast(kvm, KVM_REQ_VSIE_RESTART); + kvm_s390_vcpu_unblock_all(kvm); + mutex_unlock(&kvm->lock); +} +EXPORT_SYMBOL_GPL(kvm_arch_crypto_clear_masks); + static u64 kvm_s390_get_initial_cpuid(void) { struct cpuid cpuid; @@ -2052,12 +2133,12 @@ static u64 kvm_s390_get_initial_cpuid(void) static void kvm_s390_crypto_init(struct kvm *kvm) { - if (!test_kvm_facility(kvm, 76)) - return; - kvm->arch.crypto.crycb = &kvm->arch.sie_page2->crycb; kvm_s390_set_crycb_format(kvm); + if (!test_kvm_facility(kvm, 76)) + return; + /* Enable AES/DEA protected key functions by default */ kvm->arch.crypto.aes_kw = 1; kvm->arch.crypto.dea_kw = 1; @@ -2583,17 +2664,25 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu) { - if (!test_kvm_facility(vcpu->kvm, 76)) + /* + * If the AP instructions are not being interpreted and the MSAX3 + * facility is not configured for the guest, there is nothing to set up. + */ + if (!vcpu->kvm->arch.crypto.apie && !test_kvm_facility(vcpu->kvm, 76)) return; + vcpu->arch.sie_block->crycbd = vcpu->kvm->arch.crypto.crycbd; vcpu->arch.sie_block->ecb3 &= ~(ECB3_AES | ECB3_DEA); + vcpu->arch.sie_block->eca &= ~ECA_APIE; + + if (vcpu->kvm->arch.crypto.apie) + vcpu->arch.sie_block->eca |= ECA_APIE; + /* Set up protected key support */ if (vcpu->kvm->arch.crypto.aes_kw) vcpu->arch.sie_block->ecb3 |= ECB3_AES; if (vcpu->kvm->arch.crypto.dea_kw) vcpu->arch.sie_block->ecb3 |= ECB3_DEA; - - vcpu->arch.sie_block->crycbd = vcpu->kvm->arch.crypto.crycbd; } void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu) @@ -2685,6 +2774,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); vcpu->arch.ckc_timer.function = kvm_s390_idle_wakeup; + vcpu->arch.sie_block->hpid = HPID_KVM; + kvm_s390_vcpu_crypto_setup(vcpu); return rc; @@ -2768,18 +2859,25 @@ static void kvm_s390_vcpu_request(struct kvm_vcpu *vcpu) exit_sie(vcpu); } +bool kvm_s390_vcpu_sie_inhibited(struct kvm_vcpu *vcpu) +{ + return atomic_read(&vcpu->arch.sie_block->prog20) & + (PROG_BLOCK_SIE | PROG_REQUEST); +} + static void kvm_s390_vcpu_request_handled(struct kvm_vcpu *vcpu) { atomic_andnot(PROG_REQUEST, &vcpu->arch.sie_block->prog20); } /* - * Kick a guest cpu out of SIE and wait until SIE is not running. + * Kick a guest cpu out of (v)SIE and wait until (v)SIE is not running. * If the CPU is not running (e.g. waiting as idle) the function will * return immediately. */ void exit_sie(struct kvm_vcpu *vcpu) { kvm_s390_set_cpuflags(vcpu, CPUSTAT_STOP_INT); + kvm_s390_vsie_kick(vcpu); while (vcpu->arch.sie_block->prog0c & PROG_IN_SIE) cpu_relax(); } @@ -3196,6 +3294,8 @@ retry: /* nothing to do, just clear the request */ kvm_clear_request(KVM_REQ_UNHALT, vcpu); + /* we left the vsie handler, nothing to do, just clear the request */ + kvm_clear_request(KVM_REQ_VSIE_RESTART, vcpu); return 0; } diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h @@ -290,6 +290,7 @@ void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_unblock(struct kvm_vcpu *vcpu); +bool kvm_s390_vcpu_sie_inhibited(struct kvm_vcpu *vcpu); void exit_sie(struct kvm_vcpu *vcpu); void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu); int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu); diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c @@ -135,14 +135,148 @@ static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) atomic_set(&scb_s->cpuflags, newflags); return 0; } +/* Copy to APCB FORMAT1 from APCB FORMAT0 */ +static int setup_apcb10(struct kvm_vcpu *vcpu, struct kvm_s390_apcb1 *apcb_s, + unsigned long apcb_o, struct kvm_s390_apcb1 *apcb_h) +{ + struct kvm_s390_apcb0 tmp; -/* + if (read_guest_real(vcpu, apcb_o, &tmp, sizeof(struct kvm_s390_apcb0))) + return -EFAULT; + + apcb_s->apm[0] = apcb_h->apm[0] & tmp.apm[0]; + apcb_s->aqm[0] = apcb_h->aqm[0] & tmp.aqm[0] & 0xffff000000000000UL; + apcb_s->adm[0] = apcb_h->adm[0] & tmp.adm[0] & 0xffff000000000000UL; + + return 0; + +} + +/** + * setup_apcb00 - Copy to APCB FORMAT0 from APCB FORMAT0 + * @vcpu: pointer to the virtual CPU + * @apcb_s: pointer to start of apcb in the shadow crycb + * @apcb_o: pointer to start of original apcb in the guest2 + * @apcb_h: pointer to start of apcb in the guest1 + * + * Returns 0 and -EFAULT on error reading guest apcb + */ +static int setup_apcb00(struct kvm_vcpu *vcpu, unsigned long *apcb_s, + unsigned long apcb_o, unsigned long *apcb_h) +{ + if (read_guest_real(vcpu, apcb_o, apcb_s, + sizeof(struct kvm_s390_apcb0))) + return -EFAULT; + + bitmap_and(apcb_s, apcb_s, apcb_h, sizeof(struct kvm_s390_apcb0)); + + return 0; +} + +/** + * setup_apcb11 - Copy the FORMAT1 APCB from the guest to the shadow CRYCB + * @vcpu: pointer to the virtual CPU + * @apcb_s: pointer to start of apcb in the shadow crycb + * @apcb_o: pointer to start of original guest apcb + * @apcb_h: pointer to start of apcb in the host + * + * Returns 0 and -EFAULT on error reading guest apcb + */ +static int setup_apcb11(struct kvm_vcpu *vcpu, unsigned long *apcb_s, + unsigned long apcb_o, + unsigned long *apcb_h) +{ + if (read_guest_real(vcpu, apcb_o, apcb_s, + sizeof(struct kvm_s390_apcb1))) + return -EFAULT; + + bitmap_and(apcb_s, apcb_s, apcb_h, sizeof(struct kvm_s390_apcb1)); + + return 0; +} + +/** + * setup_apcb - Create a shadow copy of the apcb. + * @vcpu: pointer to the virtual CPU + * @crycb_s: pointer to shadow crycb + * @crycb_o: pointer to original guest crycb + * @crycb_h: pointer to the host crycb + * @fmt_o: format of the original guest crycb. + * @fmt_h: format of the host crycb. + * + * Checks the compatibility between the guest and host crycb and calls the + * appropriate copy function. + * + * Return 0 or an error number if the guest and host crycb are incompatible. + */ +static int setup_apcb(struct kvm_vcpu *vcpu, struct kvm_s390_crypto_cb *crycb_s, + const u32 crycb_o, + struct kvm_s390_crypto_cb *crycb_h, + int fmt_o, int fmt_h) +{ + struct kvm_s390_crypto_cb *crycb; + + crycb = (struct kvm_s390_crypto_cb *) (unsigned long)crycb_o; + + switch (fmt_o) { + case CRYCB_FORMAT2: + if ((crycb_o & PAGE_MASK) != ((crycb_o + 256) & PAGE_MASK)) + return -EACCES; + if (fmt_h != CRYCB_FORMAT2) + return -EINVAL; + return setup_apcb11(vcpu, (unsigned long *)&crycb_s->apcb1, + (unsigned long) &crycb->apcb1, + (unsigned long *)&crycb_h->apcb1); + case CRYCB_FORMAT1: + switch (fmt_h) { + case CRYCB_FORMAT2: + return setup_apcb10(vcpu, &crycb_s->apcb1, + (unsigned long) &crycb->apcb0, + &crycb_h->apcb1); + case CRYCB_FORMAT1: + return setup_apcb00(vcpu, + (unsigned long *) &crycb_s->apcb0, + (unsigned long) &crycb->apcb0, + (unsigned long *) &crycb_h->apcb0); + } + break; + case CRYCB_FORMAT0: + if ((crycb_o & PAGE_MASK) != ((crycb_o + 32) & PAGE_MASK)) + return -EACCES; + + switch (fmt_h) { + case CRYCB_FORMAT2: + return setup_apcb10(vcpu, &crycb_s->apcb1, + (unsigned long) &crycb->apcb0, + &crycb_h->apcb1); + case CRYCB_FORMAT1: + case CRYCB_FORMAT0: + return setup_apcb00(vcpu, + (unsigned long *) &crycb_s->apcb0, + (unsigned long) &crycb->apcb0, + (unsigned long *) &crycb_h->apcb0); + } + } + return -EINVAL; +} + +/** + * shadow_crycb - Create a shadow copy of the crycb block + * @vcpu: a pointer to the virtual CPU + * @vsie_page: a pointer to internal date used for the vSIE + * * Create a shadow copy of the crycb block and setup key wrapping, if * requested for guest 3 and enabled for guest 2. * - * We only accept format-1 (no AP in g2), but convert it into format-2 + * We accept format-1 or format-2, but we convert format-1 into format-2 + * in the shadow CRYCB. + * Using format-2 enables the firmware to choose the right format when + * scheduling the SIE. * There is nothing to do for format-0. * + * This function centralize the issuing of set_validity_icpt() for all + * the subfunctions working on the crycb. + * * Returns: - 0 if shadowed or nothing to do * - > 0 if control has to be given to guest 2 */ @@ -154,23 +288,40 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) const u32 crycb_addr = crycbd_o & 0x7ffffff8U; unsigned long *b1, *b2; u8 ecb3_flags; + int apie_h; + int key_msk = test_kvm_facility(vcpu->kvm, 76); + int fmt_o = crycbd_o & CRYCB_FORMAT_MASK; + int fmt_h = vcpu->arch.sie_block->crycbd & CRYCB_FORMAT_MASK; + int ret = 0; scb_s->crycbd = 0; - if (!(crycbd_o & vcpu->arch.sie_block->crycbd & CRYCB_FORMAT1)) - return 0; - /* format-1 is supported with message-security-assist extension 3 */ - if (!test_kvm_facility(vcpu->kvm, 76)) + + apie_h = vcpu->arch.sie_block->eca & ECA_APIE; + if (!apie_h && !key_msk) return 0; + + if (!crycb_addr) + return set_validity_icpt(scb_s, 0x0039U); + + if (fmt_o == CRYCB_FORMAT1) + if ((crycb_addr & PAGE_MASK) != + ((crycb_addr + 128) & PAGE_MASK)) + return set_validity_icpt(scb_s, 0x003CU); + + if (apie_h && (scb_o->eca & ECA_APIE)) { + ret = setup_apcb(vcpu, &vsie_page->crycb, crycb_addr, + vcpu->kvm->arch.crypto.crycb, + fmt_o, fmt_h); + if (ret) + goto end; + scb_s->eca |= scb_o->eca & ECA_APIE; + } + /* we may only allow it if enabled for guest 2 */ ecb3_flags = scb_o->ecb3 & vcpu->arch.sie_block->ecb3 & (ECB3_AES | ECB3_DEA); if (!ecb3_flags) - return 0; - - if ((crycb_addr & PAGE_MASK) != ((crycb_addr + 128) & PAGE_MASK)) - return set_validity_icpt(scb_s, 0x003CU); - else if (!crycb_addr) - return set_validity_icpt(scb_s, 0x0039U); + goto end; /* copy only the wrapping keys */ if (read_guest_real(vcpu, crycb_addr + 72, @@ -178,8 +329,6 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) return set_validity_icpt(scb_s, 0x0035U); scb_s->ecb3 |= ecb3_flags; - scb_s->crycbd = ((__u32)(__u64) &vsie_page->crycb) | CRYCB_FORMAT1 | - CRYCB_FORMAT2; /* xor both blocks in one run */ b1 = (unsigned long *) vsie_page->crycb.dea_wrapping_key_mask; @@ -187,6 +336,16 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) vcpu->kvm->arch.crypto.crycb->dea_wrapping_key_mask; /* as 56%8 == 0, bitmap_xor won't overwrite any data */ bitmap_xor(b1, b1, b2, BITS_PER_BYTE * 56); +end: + switch (ret) { + case -EINVAL: + return set_validity_icpt(scb_s, 0x0020U); + case -EFAULT: + return set_validity_icpt(scb_s, 0x0035U); + case -EACCES: + return set_validity_icpt(scb_s, 0x003CU); + } + scb_s->crycbd = ((__u32)(__u64) &vsie_page->crycb) | CRYCB_FORMAT2; return 0; } @@ -383,6 +542,8 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) if (test_kvm_facility(vcpu->kvm, 156)) scb_s->ecd |= scb_o->ecd & ECD_ETOKENF; + scb_s->hpid = HPID_VSIE; + prepare_ibc(vcpu, vsie_page); rc = shadow_crycb(vcpu, vsie_page); out: @@ -830,7 +991,7 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; int guest_bp_isolation; - int rc; + int rc = 0; handle_last_fault(vcpu, vsie_page); @@ -858,7 +1019,18 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) guest_enter_irqoff(); local_irq_enable(); - rc = sie64a(scb_s, vcpu->run->s.regs.gprs); + /* + * Simulate a SIE entry of the VCPU (see sie64a), so VCPU blocking + * and VCPU requests also hinder the vSIE from running and lead + * to an immediate exit. kvm_s390_vsie_kick() has to be used to + * also kick the vSIE. + */ + vcpu->arch.sie_block->prog0c |= PROG_IN_SIE; + barrier(); + if (!kvm_s390_vcpu_sie_inhibited(vcpu)) + rc = sie64a(scb_s, vcpu->run->s.regs.gprs); + barrier(); + vcpu->arch.sie_block->prog0c &= ~PROG_IN_SIE; local_irq_disable(); guest_exit_irqoff(); @@ -1005,7 +1177,8 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) if (rc == -EAGAIN) rc = 0; if (rc || scb_s->icptcode || signal_pending(current) || - kvm_s390_vcpu_has_irq(vcpu, 0)) + kvm_s390_vcpu_has_irq(vcpu, 0) || + kvm_s390_vcpu_sie_inhibited(vcpu)) break; } @@ -1122,7 +1295,8 @@ int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu) if (unlikely(scb_addr & 0x1ffUL)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0)) + if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0) || + kvm_s390_vcpu_sie_inhibited(vcpu)) return 0; vsie_page = get_vsie_page(vcpu->kvm, scb_addr); diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c @@ -907,10 +907,16 @@ static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr) pmd_t *pmdp; BUG_ON(gmap_is_shadow(gmap)); - spin_lock(&gmap->guest_table_lock); pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1); + if (!pmdp) + return NULL; - if (!pmdp || pmd_none(*pmdp)) { + /* without huge pages, there is no need to take the table lock */ + if (!gmap->mm->context.allow_gmap_hpage_1m) + return pmd_none(*pmdp) ? NULL : pmdp; + + spin_lock(&gmap->guest_table_lock); + if (pmd_none(*pmdp)) { spin_unlock(&gmap->guest_table_lock); return NULL; } diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c @@ -106,6 +106,8 @@ static struct facility_def facility_defs[] = { .name = "FACILITIES_KVM_CPUMODEL", .bits = (int[]){ + 12, /* AP Query Configuration Information */ + 15, /* AP Facilities Test */ 156, /* etoken facility */ -1 /* END */ } diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h @@ -102,7 +102,15 @@ #define UNMAPPED_GVA (~(gpa_t)0) /* KVM Hugepage definitions for x86 */ -#define KVM_NR_PAGE_SIZES 3 +enum { + PT_PAGE_TABLE_LEVEL = 1, + PT_DIRECTORY_LEVEL = 2, + PT_PDPE_LEVEL = 3, + /* set max level to the biggest one */ + PT_MAX_HUGEPAGE_LEVEL = PT_PDPE_LEVEL, +}; +#define KVM_NR_PAGE_SIZES (PT_MAX_HUGEPAGE_LEVEL - \ + PT_PAGE_TABLE_LEVEL + 1) #define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9) #define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x)) #define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) @@ -177,6 +185,7 @@ enum { #define DR6_BD (1 << 13) #define DR6_BS (1 << 14) +#define DR6_BT (1 << 15) #define DR6_RTM (1 << 16) #define DR6_FIXED_1 0xfffe0ff0 #define DR6_INIT 0xffff0ff0 @@ -247,7 +256,7 @@ struct kvm_mmu_memory_cache { * @nxe, @cr0_wp, @smep_andnot_wp and @smap_andnot_wp. */ union kvm_mmu_page_role { - unsigned word; + u32 word; struct { unsigned level:4; unsigned cr4_pae:1; @@ -273,6 +282,34 @@ union kvm_mmu_page_role { }; }; +union kvm_mmu_extended_role { +/* + * This structure complements kvm_mmu_page_role caching everything needed for + * MMU configuration. If nothing in both these structures changed, MMU + * re-configuration can be skipped. @valid bit is set on first usage so we don't + * treat all-zero structure as valid data. + */ + u32 word; + struct { + unsigned int valid:1; + unsigned int execonly:1; + unsigned int cr0_pg:1; + unsigned int cr4_pse:1; + unsigned int cr4_pke:1; + unsigned int cr4_smap:1; + unsigned int cr4_smep:1; + unsigned int cr4_la57:1; + }; +}; + +union kvm_mmu_role { + u64 as_u64; + struct { + union kvm_mmu_page_role base; + union kvm_mmu_extended_role ext; + }; +}; + struct kvm_rmap_head { unsigned long val; }; @@ -280,18 +317,18 @@ struct kvm_rmap_head { struct kvm_mmu_page { struct list_head link; struct hlist_node hash_link; + bool unsync; /* * The following two entries are used to key the shadow page in the * hash table. */ - gfn_t gfn; union kvm_mmu_page_role role; + gfn_t gfn; u64 *spt; /* hold the gfn of each spte inside spt */ gfn_t *gfns; - bool unsync; int root_count; /* Currently serving as active root */ unsigned int unsync_children; struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ @@ -360,7 +397,7 @@ struct kvm_mmu { void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte); hpa_t root_hpa; - union kvm_mmu_page_role base_role; + union kvm_mmu_role mmu_role; u8 root_level; u8 shadow_root_level; u8 ept_ad; @@ -490,7 +527,7 @@ struct kvm_vcpu_hv { struct kvm_hyperv_exit exit; struct kvm_vcpu_hv_stimer stimer[HV_SYNIC_STIMER_COUNT]; DECLARE_BITMAP(stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT); - cpumask_t tlb_lush; + cpumask_t tlb_flush; }; struct kvm_vcpu_arch { @@ -534,7 +571,13 @@ struct kvm_vcpu_arch { * the paging mode of the l1 guest. This context is always used to * handle faults. */ - struct kvm_mmu mmu; + struct kvm_mmu *mmu; + + /* Non-nested MMU for L1 */ + struct kvm_mmu root_mmu; + + /* L1 MMU when running nested */ + struct kvm_mmu guest_mmu; /* * Paging state of an L2 guest (used for nested npt) @@ -585,6 +628,8 @@ struct kvm_vcpu_arch { bool has_error_code; u8 nr; u32 error_code; + unsigned long payload; + bool has_payload; u8 nested_apf; } exception; @@ -781,6 +826,9 @@ struct kvm_hv { u64 hv_reenlightenment_control; u64 hv_tsc_emulation_control; u64 hv_tsc_emulation_status; + + /* How many vCPUs have VP index != vCPU index */ + atomic_t num_mismatched_vp_indexes; }; enum kvm_irqchip_mode { @@ -871,6 +919,7 @@ struct kvm_arch { bool x2apic_broadcast_quirk_disabled; bool guest_can_read_msr_platform_info; + bool exception_payload_enabled; }; struct kvm_vm_stat { @@ -1133,6 +1182,9 @@ struct kvm_x86_ops { int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*get_msr_feature)(struct kvm_msr_entry *entry); + + int (*nested_enable_evmcs)(struct kvm_vcpu *vcpu, + uint16_t *vmcs_version); }; struct kvm_arch_async_pf { @@ -1170,7 +1222,6 @@ void kvm_mmu_module_exit(void); void kvm_mmu_destroy(struct kvm_vcpu *vcpu); int kvm_mmu_create(struct kvm_vcpu *vcpu); -void kvm_mmu_setup(struct kvm_vcpu *vcpu); void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, @@ -1324,7 +1375,8 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); -void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free); +void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + ulong roots_to_free); gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, struct x86_exception *exception); gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h @@ -40,7 +40,7 @@ static inline int cpu_has_vmx(void) */ static inline void cpu_vmxoff(void) { - asm volatile (ASM_VMX_VMXOFF : : : "cc"); + asm volatile ("vmxoff"); cr4_clear_bits(X86_CR4_VMXE); } diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h @@ -503,19 +503,6 @@ enum vmcs_field { #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul - -#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30" -#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2" -#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3" -#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30" -#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0" -#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0" -#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4" -#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4" -#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30" -#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08" -#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08" - struct vmx_msr_entry { u32 index; u32 reserved; diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h @@ -288,6 +288,7 @@ struct kvm_reinject_control { #define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002 #define KVM_VCPUEVENT_VALID_SHADOW 0x00000004 #define KVM_VCPUEVENT_VALID_SMM 0x00000008 +#define KVM_VCPUEVENT_VALID_PAYLOAD 0x00000010 /* Interrupt shadow states */ #define KVM_X86_SHADOW_INT_MOV_SS 0x01 @@ -299,7 +300,7 @@ struct kvm_vcpu_events { __u8 injected; __u8 nr; __u8 has_error_code; - __u8 pad; + __u8 pending; __u32 error_code; } exception; struct { @@ -322,7 +323,9 @@ struct kvm_vcpu_events { __u8 smm_inside_nmi; __u8 latched_init; } smi; - __u32 reserved[9]; + __u8 reserved[27]; + __u8 exception_has_payload; + __u64 exception_payload; }; /* for KVM_GET/SET_DEBUGREGS */ @@ -381,6 +384,7 @@ struct kvm_sync_regs { #define KVM_STATE_NESTED_GUEST_MODE 0x00000001 #define KVM_STATE_NESTED_RUN_PENDING 0x00000002 +#define KVM_STATE_NESTED_EVMCS 0x00000004 #define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001 #define KVM_STATE_NESTED_SMM_VMXON 0x00000002 diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c @@ -36,6 +36,8 @@ #include "trace.h" +#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64) + static inline u64 synic_read_sint(struct kvm_vcpu_hv_synic *synic, int sint) { return atomic64_read(&synic->sint[sint]); @@ -132,8 +134,10 @@ static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx) struct kvm_vcpu *vcpu = NULL; int i; - if (vpidx < KVM_MAX_VCPUS) - vcpu = kvm_get_vcpu(kvm, vpidx); + if (vpidx >= KVM_MAX_VCPUS) + return NULL; + + vcpu = kvm_get_vcpu(kvm, vpidx); if (vcpu && vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx) return vcpu; kvm_for_each_vcpu(i, vcpu, kvm) @@ -689,6 +693,24 @@ void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu) stimer_cleanup(&hv_vcpu->stimer[i]); } +bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu) +{ + if (!(vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) + return false; + return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED; +} +EXPORT_SYMBOL_GPL(kvm_hv_assist_page_enabled); + +bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu, + struct hv_vp_assist_page *assist_page) +{ + if (!kvm_hv_assist_page_enabled(vcpu)) + return false; + return !kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, + assist_page, sizeof(*assist_page)); +} +EXPORT_SYMBOL_GPL(kvm_hv_get_assist_page); + static void stimer_prepare_msg(struct kvm_vcpu_hv_stimer *stimer) { struct hv_message *msg = &stimer->msg; @@ -1040,21 +1062,41 @@ static u64 current_task_runtime_100ns(void) static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { - struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; + struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv; switch (msr) { - case HV_X64_MSR_VP_INDEX: - if (!host) + case HV_X64_MSR_VP_INDEX: { + struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; + int vcpu_idx = kvm_vcpu_get_idx(vcpu); + u32 new_vp_index = (u32)data; + + if (!host || new_vp_index >= KVM_MAX_VCPUS) return 1; - hv->vp_index = (u32)data; + + if (new_vp_index == hv_vcpu->vp_index) + return 0; + + /* + * The VP index is initialized to vcpu_index by + * kvm_hv_vcpu_postcreate so they initially match. Now the + * VP index is changing, adjust num_mismatched_vp_indexes if + * it now matches or no longer matches vcpu_idx. + */ + if (hv_vcpu->vp_index == vcpu_idx) + atomic_inc(&hv->num_mismatched_vp_indexes); + else if (new_vp_index == vcpu_idx) + atomic_dec(&hv->num_mismatched_vp_indexes); + + hv_vcpu->vp_index = new_vp_index; break; + } case HV_X64_MSR_VP_ASSIST_PAGE: { u64 gfn; unsigned long addr; if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) { - hv->hv_vapic = data; - if (kvm_lapic_enable_pv_eoi(vcpu, 0)) + hv_vcpu->hv_vapic = data; + if (kvm_lapic_enable_pv_eoi(vcpu, 0, 0)) return 1; break; } @@ -1062,12 +1104,19 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) addr = kvm_vcpu_gfn_to_hva(vcpu, gfn); if (kvm_is_error_hva(addr)) return 1; - if (__clear_user((void __user *)addr, PAGE_SIZE)) + + /* + * Clear apic_assist portion of f(struct hv_vp_assist_page + * only, there can be valuable data in the rest which needs + * to be preserved e.g. on migration. + */ + if (__clear_user((void __user *)addr, sizeof(u32))) return 1; - hv->hv_vapic = data; + hv_vcpu->hv_vapic = data; kvm_vcpu_mark_page_dirty(vcpu, gfn); if (kvm_lapic_enable_pv_eoi(vcpu, - gfn_to_gpa(gfn) | KVM_MSR_ENABLED)) + gfn_to_gpa(gfn) | KVM_MSR_ENABLED, + sizeof(struct hv_vp_assist_page))) return 1; break; } @@ -1080,7 +1129,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) case HV_X64_MSR_VP_RUNTIME: if (!host) return 1; - hv->runtime_offset = data - current_task_runtime_100ns(); + hv_vcpu->runtime_offset = data - current_task_runtime_100ns(); break; case HV_X64_MSR_SCONTROL: case HV_X64_MSR_SVERSION: @@ -1172,11 +1221,11 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { u64 data = 0; - struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; + struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv; switch (msr) { case HV_X64_MSR_VP_INDEX: - data = hv->vp_index; + data = hv_vcpu->vp_index; break; case HV_X64_MSR_EOI: return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); @@ -1185,10 +1234,10 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, case HV_X64_MSR_TPR: return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); case HV_X64_MSR_VP_ASSIST_PAGE: - data = hv->hv_vapic; + data = hv_vcpu->hv_vapic; break; case HV_X64_MSR_VP_RUNTIME: - data = current_task_runtime_100ns() + hv->runtime_offset; + data = current_task_runtime_100ns() + hv_vcpu->runtime_offset; break; case HV_X64_MSR_SCONTROL: case HV_X64_MSR_SVERSION: @@ -1255,32 +1304,47 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) return kvm_hv_get_msr(vcpu, msr, pdata, host); } -static __always_inline int get_sparse_bank_no(u64 valid_bank_mask, int bank_no) +static __always_inline unsigned long *sparse_set_to_vcpu_mask( + struct kvm *kvm, u64 *sparse_banks, u64 valid_bank_mask, + u64 *vp_bitmap, unsigned long *vcpu_bitmap) { - int i = 0, j; + struct kvm_hv *hv = &kvm->arch.hyperv; + struct kvm_vcpu *vcpu; + int i, bank, sbank = 0; - if (!(valid_bank_mask & BIT_ULL(bank_no))) - return -1; + memset(vp_bitmap, 0, + KVM_HV_MAX_SPARSE_VCPU_SET_BITS * sizeof(*vp_bitmap)); + for_each_set_bit(bank, (unsigned long *)&valid_bank_mask, + KVM_HV_MAX_SPARSE_VCPU_SET_BITS) + vp_bitmap[bank] = sparse_banks[sbank++]; - for (j = 0; j < bank_no; j++) - if (valid_bank_mask & BIT_ULL(j)) - i++; + if (likely(!atomic_read(&hv->num_mismatched_vp_indexes))) { + /* for all vcpus vp_index == vcpu_idx */ + return (unsigned long *)vp_bitmap; + } - return i; + bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS); + kvm_for_each_vcpu(i, vcpu, kvm) { + if (test_bit(vcpu_to_hv_vcpu(vcpu)->vp_index, + (unsigned long *)vp_bitmap)) + __set_bit(i, vcpu_bitmap); + } + return vcpu_bitmap; } static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa, u16 rep_cnt, bool ex) { struct kvm *kvm = current_vcpu->kvm; - struct kvm_vcpu_hv *hv_current = &current_vcpu->arch.hyperv; + struct kvm_vcpu_hv *hv_vcpu = &current_vcpu->arch.hyperv; struct hv_tlb_flush_ex flush_ex; struct hv_tlb_flush flush; - struct kvm_vcpu *vcpu; - unsigned long vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)] = {0}; - unsigned long valid_bank_mask = 0; + u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; + DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS); + unsigned long *vcpu_mask; + u64 valid_bank_mask; u64 sparse_banks[64]; - int sparse_banks_len, i; + int sparse_banks_len; bool all_cpus; if (!ex) { @@ -1290,6 +1354,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa, trace_kvm_hv_flush_tlb(flush.processor_mask, flush.address_space, flush.flags); + valid_bank_mask = BIT_ULL(0); sparse_banks[0] = flush.processor_mask; all_cpus = flush.flags & HV_FLUSH_ALL_PROCESSORS; } else { @@ -1306,7 +1371,8 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa, all_cpus = flush_ex.hv_vp_set.format != HV_GENERIC_SET_SPARSE_4K; - sparse_banks_len = bitmap_weight(&valid_bank_mask, 64) * + sparse_banks_len = + bitmap_weight((unsigned long *)&valid_bank_mask, 64) * sizeof(sparse_banks[0]); if (!sparse_banks_len && !all_cpus) @@ -1321,48 +1387,19 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa, return HV_STATUS_INVALID_HYPERCALL_INPUT; } - cpumask_clear(&hv_current->tlb_lush); - - kvm_for_each_vcpu(i, vcpu, kvm) { - struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; - int bank = hv->vp_index / 64, sbank = 0; - - if (!all_cpus) { - /* Banks >64 can't be represented */ - if (bank >= 64) - continue; - - /* Non-ex hypercalls can only address first 64 vCPUs */ - if (!ex && bank) - continue; - - if (ex) { - /* - * Check is the bank of this vCPU is in sparse - * set and get the sparse bank number. - */ - sbank = get_sparse_bank_no(valid_bank_mask, - bank); - - if (sbank < 0) - continue; - } - - if (!(sparse_banks[sbank] & BIT_ULL(hv->vp_index % 64))) - continue; - } + cpumask_clear(&hv_vcpu->tlb_flush); - /* - * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we - * can't analyze it here, flush TLB regardless of the specified - * address space. - */ - __set_bit(i, vcpu_bitmap); - } + vcpu_mask = all_cpus ? NULL : + sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, + vp_bitmap, vcpu_bitmap); + /* + * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't + * analyze it here, flush TLB regardless of the specified address space. + */ kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH | KVM_REQUEST_NO_WAKEUP, - vcpu_bitmap, &hv_current->tlb_lush); + vcpu_mask, &hv_vcpu->tlb_flush); ret_success: /* We always do full TLB flush, set rep_done = rep_cnt. */ @@ -1370,6 +1407,99 @@ ret_success: ((u64)rep_cnt << HV_HYPERCALL_REP_COMP_OFFSET); } +static void kvm_send_ipi_to_many(struct kvm *kvm, u32 vector, + unsigned long *vcpu_bitmap) +{ + struct kvm_lapic_irq irq = { + .delivery_mode = APIC_DM_FIXED, + .vector = vector + }; + struct kvm_vcpu *vcpu; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (vcpu_bitmap && !test_bit(i, vcpu_bitmap)) + continue; + + /* We fail only when APIC is disabled */ + kvm_apic_set_irq(vcpu, &irq, NULL); + } +} + +static u64 kvm_hv_send_ipi(struct kvm_vcpu *current_vcpu, u64 ingpa, u64 outgpa, + bool ex, bool fast) +{ + struct kvm *kvm = current_vcpu->kvm; + struct hv_send_ipi_ex send_ipi_ex; + struct hv_send_ipi send_ipi; + u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; + DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS); + unsigned long *vcpu_mask; + unsigned long valid_bank_mask; + u64 sparse_banks[64]; + int sparse_banks_len; + u32 vector; + bool all_cpus; + + if (!ex) { + if (!fast) { + if (unlikely(kvm_read_guest(kvm, ingpa, &send_ipi, + sizeof(send_ipi)))) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + sparse_banks[0] = send_ipi.cpu_mask; + vector = send_ipi.vector; + } else { + /* 'reserved' part of hv_send_ipi should be 0 */ + if (unlikely(ingpa >> 32 != 0)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + sparse_banks[0] = outgpa; + vector = (u32)ingpa; + } + all_cpus = false; + valid_bank_mask = BIT_ULL(0); + + trace_kvm_hv_send_ipi(vector, sparse_banks[0]); + } else { + if (unlikely(kvm_read_guest(kvm, ingpa, &send_ipi_ex, + sizeof(send_ipi_ex)))) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + + trace_kvm_hv_send_ipi_ex(send_ipi_ex.vector, + send_ipi_ex.vp_set.format, + send_ipi_ex.vp_set.valid_bank_mask); + + vector = send_ipi_ex.vector; + valid_bank_mask = send_ipi_ex.vp_set.valid_bank_mask; + sparse_banks_len = bitmap_weight(&valid_bank_mask, 64) * + sizeof(sparse_banks[0]); + + all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL; + + if (!sparse_banks_len) + goto ret_success; + + if (!all_cpus && + kvm_read_guest(kvm, + ingpa + offsetof(struct hv_send_ipi_ex, + vp_set.bank_contents), + sparse_banks, + sparse_banks_len)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + } + + if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + + vcpu_mask = all_cpus ? NULL : + sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, + vp_bitmap, vcpu_bitmap); + + kvm_send_ipi_to_many(kvm, vector, vcpu_mask); + +ret_success: + return HV_STATUS_SUCCESS; +} + bool kvm_hv_hypercall_enabled(struct kvm *kvm) { return READ_ONCE(kvm->arch.hyperv.hv_hypercall) & HV_X64_MSR_HYPERCALL_ENABLE; @@ -1539,6 +1669,20 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) } ret = kvm_hv_flush_tlb(vcpu, ingpa, rep_cnt, true); break; + case HVCALL_SEND_IPI: + if (unlikely(rep)) { + ret = HV_STATUS_INVALID_HYPERCALL_INPUT; + break; + } + ret = kvm_hv_send_ipi(vcpu, ingpa, outgpa, false, fast); + break; + case HVCALL_SEND_IPI_EX: + if (unlikely(fast || rep)) { + ret = HV_STATUS_INVALID_HYPERCALL_INPUT; + break; + } + ret = kvm_hv_send_ipi(vcpu, ingpa, outgpa, true, false); + break; default: ret = HV_STATUS_INVALID_HYPERCALL_CODE; break; diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h @@ -62,6 +62,10 @@ void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu); void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu); void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu); +bool kvm_hv_assist_page_enabled(struct kvm_vcpu *vcpu); +bool kvm_hv_get_assist_page(struct kvm_vcpu *vcpu, + struct hv_vp_assist_page *assist_page); + static inline struct kvm_vcpu_hv_stimer *vcpu_to_stimer(struct kvm_vcpu *vcpu, int timer_index) { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c @@ -70,6 +70,11 @@ #define APIC_BROADCAST 0xFF #define X2APIC_BROADCAST 0xFFFFFFFFul +static bool lapic_timer_advance_adjust_done = false; +#define LAPIC_TIMER_ADVANCE_ADJUST_DONE 100 +/* step-by-step approximation to mitigate fluctuation */ +#define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 + static inline int apic_test_vector(int vec, void *bitmap) { return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); @@ -955,14 +960,14 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, map = rcu_dereference(kvm->arch.apic_map); ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dst, &bitmap); - if (ret) + if (ret) { + *r = 0; for_each_set_bit(i, &bitmap, 16) { if (!dst[i]) continue; - if (*r < 0) - *r = 0; *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map); } + } rcu_read_unlock(); return ret; @@ -1472,7 +1477,7 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) void wait_lapic_expire(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - u64 guest_tsc, tsc_deadline; + u64 guest_tsc, tsc_deadline, ns; if (!lapic_in_kernel(vcpu)) return; @@ -1492,6 +1497,24 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) if (guest_tsc < tsc_deadline) __delay(min(tsc_deadline - guest_tsc, nsec_to_cycles(vcpu, lapic_timer_advance_ns))); + + if (!lapic_timer_advance_adjust_done) { + /* too early */ + if (guest_tsc < tsc_deadline) { + ns = (tsc_deadline - guest_tsc) * 1000000ULL; + do_div(ns, vcpu->arch.virtual_tsc_khz); + lapic_timer_advance_ns -= min((unsigned int)ns, + lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); + } else { + /* too late */ + ns = (guest_tsc - tsc_deadline) * 1000000ULL; + do_div(ns, vcpu->arch.virtual_tsc_khz); + lapic_timer_advance_ns += min((unsigned int)ns, + lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); + } + if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE) + lapic_timer_advance_adjust_done = true; + } } static void start_sw_tscdeadline(struct kvm_lapic *apic) @@ -2621,17 +2644,25 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) return 0; } -int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) +int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len) { u64 addr = data & ~KVM_MSR_ENABLED; + struct gfn_to_hva_cache *ghc = &vcpu->arch.pv_eoi.data; + unsigned long new_len; + if (!IS_ALIGNED(addr, 4)) return 1; vcpu->arch.pv_eoi.msr_val = data; if (!pv_eoi_enabled(vcpu)) return 0; - return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, - addr, sizeof(u8)); + + if (addr == ghc->gpa && len <= ghc->len) + new_len = ghc->len; + else + new_len = len; + + return kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, addr, new_len); } void kvm_apic_accept_events(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h @@ -120,7 +120,7 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE; } -int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); +int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len); void kvm_lapic_init(void); void kvm_lapic_exit(void); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c @@ -932,7 +932,7 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, while (cache->nobjs < ARRAY_SIZE(cache->objects)) { obj = kmem_cache_zalloc(base_cache, GFP_KERNEL); if (!obj) - return -ENOMEM; + return cache->nobjs >= min ? 0 : -ENOMEM; cache->objects[cache->nobjs++] = obj; } return 0; @@ -960,7 +960,7 @@ static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, while (cache->nobjs < ARRAY_SIZE(cache->objects)) { page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); if (!page) - return -ENOMEM; + return cache->nobjs >= min ? 0 : -ENOMEM; cache->objects[cache->nobjs++] = page; } return 0; @@ -1265,24 +1265,24 @@ pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, mmu_free_pte_list_desc(desc); } -static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) +static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; struct pte_list_desc *prev_desc; int i; if (!rmap_head->val) { - printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte); + pr_err("%s: %p 0->BUG\n", __func__, spte); BUG(); } else if (!(rmap_head->val & 1)) { - rmap_printk("pte_list_remove: %p 1->0\n", spte); + rmap_printk("%s: %p 1->0\n", __func__, spte); if ((u64 *)rmap_head->val != spte) { - printk(KERN_ERR "pte_list_remove: %p 1->BUG\n", spte); + pr_err("%s: %p 1->BUG\n", __func__, spte); BUG(); } rmap_head->val = 0; } else { - rmap_printk("pte_list_remove: %p many->many\n", spte); + rmap_printk("%s: %p many->many\n", __func__, spte); desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); prev_desc = NULL; while (desc) { @@ -1296,11 +1296,17 @@ static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) prev_desc = desc; desc = desc->more; } - pr_err("pte_list_remove: %p many->many\n", spte); + pr_err("%s: %p many->many\n", __func__, spte); BUG(); } } +static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep) +{ + mmu_spte_clear_track_bits(sptep); + __pte_list_remove(sptep, rmap_head); +} + static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level, struct kvm_memory_slot *slot) { @@ -1349,7 +1355,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) sp = page_header(__pa(spte)); gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); rmap_head = gfn_to_rmap(kvm, gfn, sp); - pte_list_remove(spte, rmap_head); + __pte_list_remove(spte, rmap_head); } /* @@ -1685,7 +1691,7 @@ static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head) while ((sptep = rmap_get_first(rmap_head, &iter))) { rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep); - drop_spte(kvm, sptep); + pte_list_remove(rmap_head, sptep); flush = true; } @@ -1721,7 +1727,7 @@ restart: need_flush = 1; if (pte_write(*ptep)) { - drop_spte(kvm, sptep); + pte_list_remove(rmap_head, sptep); goto restart; } else { new_spte = *sptep & ~PT64_BASE_ADDR_MASK; @@ -1988,7 +1994,7 @@ static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, u64 *parent_pte) { - pte_list_remove(parent_pte, &sp->parent_ptes); + __pte_list_remove(parent_pte, &sp->parent_ptes); } static void drop_parent_pte(struct kvm_mmu_page *sp, @@ -2181,7 +2187,7 @@ static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { if (sp->role.cr4_pae != !!is_pae(vcpu) - || vcpu->arch.mmu.sync_page(vcpu, sp) == 0) { + || vcpu->arch.mmu->sync_page(vcpu, sp) == 0) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return false; } @@ -2375,14 +2381,14 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, int collisions = 0; LIST_HEAD(invalid_list); - role = vcpu->arch.mmu.base_role; + role = vcpu->arch.mmu->mmu_role.base; role.level = level; role.direct = direct; if (role.direct) role.cr4_pae = 0; role.access = access; - if (!vcpu->arch.mmu.direct_map - && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { + if (!vcpu->arch.mmu->direct_map + && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) { quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; @@ -2457,11 +2463,11 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato { iterator->addr = addr; iterator->shadow_addr = root; - iterator->level = vcpu->arch.mmu.shadow_root_level; + iterator->level = vcpu->arch.mmu->shadow_root_level; if (iterator->level == PT64_ROOT_4LEVEL && - vcpu->arch.mmu.root_level < PT64_ROOT_4LEVEL && - !vcpu->arch.mmu.direct_map) + vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL && + !vcpu->arch.mmu->direct_map) --iterator->level; if (iterator->level == PT32E_ROOT_LEVEL) { @@ -2469,10 +2475,10 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato * prev_root is currently only used for 64-bit hosts. So only * the active root_hpa is valid here. */ - BUG_ON(root != vcpu->arch.mmu.root_hpa); + BUG_ON(root != vcpu->arch.mmu->root_hpa); iterator->shadow_addr - = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; + = vcpu->arch.mmu->pae_root[(addr >> 30) & 3]; iterator->shadow_addr &= PT64_BASE_ADDR_MASK; --iterator->level; if (!iterator->shadow_addr) @@ -2483,7 +2489,7 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, struct kvm_vcpu *vcpu, u64 addr) { - shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu.root_hpa, + shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root_hpa, addr); } @@ -3095,7 +3101,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable, int emulate = 0; gfn_t pseudo_gfn; - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) return 0; for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { @@ -3301,7 +3307,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, u64 spte = 0ull; uint retry_count = 0; - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) return false; if (!page_fault_can_be_fast(error_code)) @@ -3471,11 +3477,11 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, } /* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */ -void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free) +void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + ulong roots_to_free) { int i; LIST_HEAD(invalid_list); - struct kvm_mmu *mmu = &vcpu->arch.mmu; bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT; BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG); @@ -3535,20 +3541,20 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) struct kvm_mmu_page *sp; unsigned i; - if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL) { + if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) { spin_lock(&vcpu->kvm->mmu_lock); if(make_mmu_pages_available(vcpu) < 0) { spin_unlock(&vcpu->kvm->mmu_lock); return -ENOSPC; } sp = kvm_mmu_get_page(vcpu, 0, 0, - vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL); + vcpu->arch.mmu->shadow_root_level, 1, ACC_ALL); ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); - vcpu->arch.mmu.root_hpa = __pa(sp->spt); - } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) { + vcpu->arch.mmu->root_hpa = __pa(sp->spt); + } else if (vcpu->arch.mmu->shadow_root_level == PT32E_ROOT_LEVEL) { for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu.pae_root[i]; + hpa_t root = vcpu->arch.mmu->pae_root[i]; MMU_WARN_ON(VALID_PAGE(root)); spin_lock(&vcpu->kvm->mmu_lock); @@ -3561,9 +3567,9 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) root = __pa(sp->spt); ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); - vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; + vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK; } - vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); + vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root); } else BUG(); @@ -3577,7 +3583,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) gfn_t root_gfn; int i; - root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT; + root_gfn = vcpu->arch.mmu->get_cr3(vcpu) >> PAGE_SHIFT; if (mmu_check_root(vcpu, root_gfn)) return 1; @@ -3586,8 +3592,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) * Do we shadow a long mode page table? If so we need to * write-protect the guests page table root. */ - if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) { - hpa_t root = vcpu->arch.mmu.root_hpa; + if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { + hpa_t root = vcpu->arch.mmu->root_hpa; MMU_WARN_ON(VALID_PAGE(root)); @@ -3597,11 +3603,11 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) return -ENOSPC; } sp = kvm_mmu_get_page(vcpu, root_gfn, 0, - vcpu->arch.mmu.shadow_root_level, 0, ACC_ALL); + vcpu->arch.mmu->shadow_root_level, 0, ACC_ALL); root = __pa(sp->spt); ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); - vcpu->arch.mmu.root_hpa = root; + vcpu->arch.mmu->root_hpa = root; return 0; } @@ -3611,17 +3617,17 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) * the shadow page table may be a PAE or a long mode page table. */ pm_mask = PT_PRESENT_MASK; - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) + if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu.pae_root[i]; + hpa_t root = vcpu->arch.mmu->pae_root[i]; MMU_WARN_ON(VALID_PAGE(root)); - if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { - pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i); + if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) { + pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i); if (!(pdptr & PT_PRESENT_MASK)) { - vcpu->arch.mmu.pae_root[i] = 0; + vcpu->arch.mmu->pae_root[i] = 0; continue; } root_gfn = pdptr >> PAGE_SHIFT; @@ -3639,16 +3645,16 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); - vcpu->arch.mmu.pae_root[i] = root | pm_mask; + vcpu->arch.mmu->pae_root[i] = root | pm_mask; } - vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); + vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root); /* * If we shadow a 32 bit page table with a long mode page * table we enter this path. */ - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) { - if (vcpu->arch.mmu.lm_root == NULL) { + if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) { + if (vcpu->arch.mmu->lm_root == NULL) { /* * The additional page necessary for this is only * allocated on demand. @@ -3660,12 +3666,12 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) if (lm_root == NULL) return 1; - lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask; + lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask; - vcpu->arch.mmu.lm_root = lm_root; + vcpu->arch.mmu->lm_root = lm_root; } - vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root); + vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root); } return 0; @@ -3673,7 +3679,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) static int mmu_alloc_roots(struct kvm_vcpu *vcpu) { - if (vcpu->arch.mmu.direct_map) + if (vcpu->arch.mmu->direct_map) return mmu_alloc_direct_roots(vcpu); else return mmu_alloc_shadow_roots(vcpu); @@ -3684,17 +3690,16 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) int i; struct kvm_mmu_page *sp; - if (vcpu->arch.mmu.direct_map) + if (vcpu->arch.mmu->direct_map) return; - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) return; vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); - if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) { - hpa_t root = vcpu->arch.mmu.root_hpa; - + if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { + hpa_t root = vcpu->arch.mmu->root_hpa; sp = page_header(root); /* @@ -3725,7 +3730,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu.pae_root[i]; + hpa_t root = vcpu->arch.mmu->pae_root[i]; if (root && VALID_PAGE(root)) { root &= PT64_BASE_ADDR_MASK; @@ -3799,7 +3804,7 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) int root, leaf; bool reserved = false; - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) goto exit; walk_shadow_page_lockless_begin(vcpu); @@ -3816,7 +3821,7 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) if (!is_shadow_present_pte(spte)) break; - reserved |= is_shadow_zero_bits_set(&vcpu->arch.mmu, spte, + reserved |= is_shadow_zero_bits_set(vcpu->arch.mmu, spte, iterator.level); } @@ -3895,7 +3900,7 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) struct kvm_shadow_walk_iterator iterator; u64 spte; - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) return; walk_shadow_page_lockless_begin(vcpu); @@ -3922,7 +3927,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, if (r) return r; - MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)); return nonpaging_map(vcpu, gva & PAGE_MASK, @@ -3935,8 +3940,8 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; arch.gfn = gfn; - arch.direct_map = vcpu->arch.mmu.direct_map; - arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu); + arch.direct_map = vcpu->arch.mmu->direct_map; + arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu); return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); } @@ -4042,7 +4047,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, int write = error_code & PFERR_WRITE_MASK; bool map_writable; - MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); + MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)); if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; @@ -4118,7 +4123,7 @@ static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3, { uint i; struct kvm_mmu_root_info root; - struct kvm_mmu *mmu = &vcpu->arch.mmu; + struct kvm_mmu *mmu = vcpu->arch.mmu; root.cr3 = mmu->get_cr3(vcpu); root.hpa = mmu->root_hpa; @@ -4141,7 +4146,7 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, union kvm_mmu_page_role new_role, bool skip_tlb_flush) { - struct kvm_mmu *mmu = &vcpu->arch.mmu; + struct kvm_mmu *mmu = vcpu->arch.mmu; /* * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid @@ -4192,7 +4197,8 @@ static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush) { if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush)) - kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_CURRENT); + kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, + KVM_MMU_ROOT_CURRENT); } void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush) @@ -4210,7 +4216,7 @@ static unsigned long get_cr3(struct kvm_vcpu *vcpu) static void inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { - vcpu->arch.mmu.inject_page_fault(vcpu, fault); + vcpu->arch.mmu->inject_page_fault(vcpu, fault); } static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, @@ -4414,7 +4420,8 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { - bool uses_nx = context->nx || context->base_role.smep_andnot_wp; + bool uses_nx = context->nx || + context->mmu_role.base.smep_andnot_wp; struct rsvd_bits_validate *shadow_zero_check; int i; @@ -4553,7 +4560,7 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, * SMAP:kernel-mode data accesses from user-mode * mappings should fault. A fault is considered * as a SMAP violation if all of the following - * conditions are ture: + * conditions are true: * - X86_CR4_SMAP is set in CR4 * - A user page is accessed * - The access is not a fetch @@ -4714,27 +4721,65 @@ static void paging32E_init_context(struct kvm_vcpu *vcpu, paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL); } -static union kvm_mmu_page_role -kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu) +static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu) +{ + union kvm_mmu_extended_role ext = {0}; + + ext.cr0_pg = !!is_paging(vcpu); + ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); + ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); + ext.cr4_pse = !!is_pse(vcpu); + ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE); + ext.cr4_la57 = !!kvm_read_cr4_bits(vcpu, X86_CR4_LA57); + + ext.valid = 1; + + return ext; +} + +static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu, + bool base_only) +{ + union kvm_mmu_role role = {0}; + + role.base.access = ACC_ALL; + role.base.nxe = !!is_nx(vcpu); + role.base.cr4_pae = !!is_pae(vcpu); + role.base.cr0_wp = is_write_protection(vcpu); + role.base.smm = is_smm(vcpu); + role.base.guest_mode = is_guest_mode(vcpu); + + if (base_only) + return role; + + role.ext = kvm_calc_mmu_role_ext(vcpu); + + return role; +} + +static union kvm_mmu_role +kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) { - union kvm_mmu_page_role role = {0}; + union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only); - role.guest_mode = is_guest_mode(vcpu); - role.smm = is_smm(vcpu); - role.ad_disabled = (shadow_accessed_mask == 0); - role.level = kvm_x86_ops->get_tdp_level(vcpu); - role.direct = true; - role.access = ACC_ALL; + role.base.ad_disabled = (shadow_accessed_mask == 0); + role.base.level = kvm_x86_ops->get_tdp_level(vcpu); + role.base.direct = true; return role; } static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) { - struct kvm_mmu *context = &vcpu->arch.mmu; + struct kvm_mmu *context = vcpu->arch.mmu; + union kvm_mmu_role new_role = + kvm_calc_tdp_mmu_root_page_role(vcpu, false); - context->base_role.word = mmu_base_role_mask.word & - kvm_calc_tdp_mmu_root_page_role(vcpu).word; + new_role.base.word &= mmu_base_role_mask.word; + if (new_role.as_u64 == context->mmu_role.as_u64) + return; + + context->mmu_role.as_u64 = new_role.as_u64; context->page_fault = tdp_page_fault; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; @@ -4774,36 +4819,36 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) reset_tdp_shadow_zero_bits_mask(vcpu, context); } -static union kvm_mmu_page_role -kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu) -{ - union kvm_mmu_page_role role = {0}; - bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); - bool smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); - - role.nxe = is_nx(vcpu); - role.cr4_pae = !!is_pae(vcpu); - role.cr0_wp = is_write_protection(vcpu); - role.smep_andnot_wp = smep && !is_write_protection(vcpu); - role.smap_andnot_wp = smap && !is_write_protection(vcpu); - role.guest_mode = is_guest_mode(vcpu); - role.smm = is_smm(vcpu); - role.direct = !is_paging(vcpu); - role.access = ACC_ALL; +static union kvm_mmu_role +kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) +{ + union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only); + + role.base.smep_andnot_wp = role.ext.cr4_smep && + !is_write_protection(vcpu); + role.base.smap_andnot_wp = role.ext.cr4_smap && + !is_write_protection(vcpu); + role.base.direct = !is_paging(vcpu); if (!is_long_mode(vcpu)) - role.level = PT32E_ROOT_LEVEL; + role.base.level = PT32E_ROOT_LEVEL; else if (is_la57_mode(vcpu)) - role.level = PT64_ROOT_5LEVEL; + role.base.level = PT64_ROOT_5LEVEL; else - role.level = PT64_ROOT_4LEVEL; + role.base.level = PT64_ROOT_4LEVEL; return role; } void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) { - struct kvm_mmu *context = &vcpu->arch.mmu; + struct kvm_mmu *context = vcpu->arch.mmu; + union kvm_mmu_role new_role = + kvm_calc_shadow_mmu_root_page_role(vcpu, false); + + new_role.base.word &= mmu_base_role_mask.word; + if (new_role.as_u64 == context->mmu_role.as_u64) + return; if (!is_paging(vcpu)) nonpaging_init_context(vcpu, context); @@ -4814,22 +4859,28 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) else paging32_init_context(vcpu, context); - context->base_role.word = mmu_base_role_mask.word & - kvm_calc_shadow_mmu_root_page_role(vcpu).word; + context->mmu_role.as_u64 = new_role.as_u64; reset_shadow_zero_bits_mask(vcpu, context); } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); -static union kvm_mmu_page_role -kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty) +static union kvm_mmu_role +kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, + bool execonly) { - union kvm_mmu_page_role role = vcpu->arch.mmu.base_role; + union kvm_mmu_role role; + + /* Base role is inherited from root_mmu */ + role.base.word = vcpu->arch.root_mmu.mmu_role.base.word; + role.ext = kvm_calc_mmu_role_ext(vcpu); + + role.base.level = PT64_ROOT_4LEVEL; + role.base.direct = false; + role.base.ad_disabled = !accessed_dirty; + role.base.guest_mode = true; + role.base.access = ACC_ALL; - role.level = PT64_ROOT_4LEVEL; - role.direct = false; - role.ad_disabled = !accessed_dirty; - role.guest_mode = true; - role.access = ACC_ALL; + role.ext.execonly = execonly; return role; } @@ -4837,11 +4888,17 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty) void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, bool accessed_dirty, gpa_t new_eptp) { - struct kvm_mmu *context = &vcpu->arch.mmu; - union kvm_mmu_page_role root_page_role = - kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty); + struct kvm_mmu *context = vcpu->arch.mmu; + union kvm_mmu_role new_role = + kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty, + execonly); + + __kvm_mmu_new_cr3(vcpu, new_eptp, new_role.base, false); + + new_role.base.word &= mmu_base_role_mask.word; + if (new_role.as_u64 == context->mmu_role.as_u64) + return; - __kvm_mmu_new_cr3(vcpu, new_eptp, root_page_role, false); context->shadow_root_level = PT64_ROOT_4LEVEL; context->nx = true; @@ -4853,7 +4910,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->update_pte = ept_update_pte; context->root_level = PT64_ROOT_4LEVEL; context->direct_map = false; - context->base_role.word = root_page_role.word & mmu_base_role_mask.word; + context->mmu_role.as_u64 = new_role.as_u64; + update_permission_bitmask(vcpu, context, true); update_pkru_bitmask(vcpu, context, true); update_last_nonleaf_level(vcpu, context); @@ -4864,7 +4922,7 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); static void init_kvm_softmmu(struct kvm_vcpu *vcpu) { - struct kvm_mmu *context = &vcpu->arch.mmu; + struct kvm_mmu *context = vcpu->arch.mmu; kvm_init_shadow_mmu(vcpu); context->set_cr3 = kvm_x86_ops->set_cr3; @@ -4875,14 +4933,20 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu) static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) { + union kvm_mmu_role new_role = kvm_calc_mmu_role_common(vcpu, false); struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; + new_role.base.word &= mmu_base_role_mask.word; + if (new_role.as_u64 == g_context->mmu_role.as_u64) + return; + + g_context->mmu_role.as_u64 = new_role.as_u64; g_context->get_cr3 = get_cr3; g_context->get_pdptr = kvm_pdptr_read; g_context->inject_page_fault = kvm_inject_page_fault; /* - * Note that arch.mmu.gva_to_gpa translates l2_gpa to l1_gpa using + * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using * L1's nested page tables (e.g. EPT12). The nested translation * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using * L2's page tables as the first level of translation and L1's @@ -4921,10 +4985,10 @@ void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots) if (reset_roots) { uint i; - vcpu->arch.mmu.root_hpa = INVALID_PAGE; + vcpu->arch.mmu->root_hpa = INVALID_PAGE; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; + vcpu->arch.mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; } if (mmu_is_nested(vcpu)) @@ -4939,10 +5003,14 @@ EXPORT_SYMBOL_GPL(kvm_init_mmu); static union kvm_mmu_page_role kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu) { + union kvm_mmu_role role; + if (tdp_enabled) - return kvm_calc_tdp_mmu_root_page_role(vcpu); + role = kvm_calc_tdp_mmu_root_page_role(vcpu, true); else - return kvm_calc_shadow_mmu_root_page_role(vcpu); + role = kvm_calc_shadow_mmu_root_page_role(vcpu, true); + + return role.base; } void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) @@ -4972,8 +5040,10 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load); void kvm_mmu_unload(struct kvm_vcpu *vcpu) { - kvm_mmu_free_roots(vcpu, KVM_MMU_ROOTS_ALL); - WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); + kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL); + WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root_hpa)); + kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); + WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa)); } EXPORT_SYMBOL_GPL(kvm_mmu_unload); @@ -4987,7 +5057,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, } ++vcpu->kvm->stat.mmu_pte_updated; - vcpu->arch.mmu.update_pte(vcpu, sp, spte, new); + vcpu->arch.mmu->update_pte(vcpu, sp, spte, new); } static bool need_remote_flush(u64 old, u64 new) @@ -5164,10 +5234,12 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, local_flush = true; while (npte--) { + u32 base_role = vcpu->arch.mmu->mmu_role.base.word; + entry = *spte; mmu_page_zap_pte(vcpu->kvm, sp, spte); if (gentry && - !((sp->role.word ^ vcpu->arch.mmu.base_role.word) + !((sp->role.word ^ base_role) & mmu_base_role_mask.word) && rmap_can_add(vcpu)) mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); if (need_remote_flush(entry, *spte)) @@ -5185,7 +5257,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) gpa_t gpa; int r; - if (vcpu->arch.mmu.direct_map) + if (vcpu->arch.mmu->direct_map) return 0; gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); @@ -5221,10 +5293,10 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, { int r, emulation_type = 0; enum emulation_result er; - bool direct = vcpu->arch.mmu.direct_map; + bool direct = vcpu->arch.mmu->direct_map; /* With shadow page tables, fault_address contains a GVA or nGPA. */ - if (vcpu->arch.mmu.direct_map) { + if (vcpu->arch.mmu->direct_map) { vcpu->arch.gpa_available = true; vcpu->arch.gpa_val = cr2; } @@ -5237,8 +5309,9 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, } if (r == RET_PF_INVALID) { - r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code), - false); + r = vcpu->arch.mmu->page_fault(vcpu, cr2, + lower_32_bits(error_code), + false); WARN_ON(r == RET_PF_INVALID); } @@ -5254,7 +5327,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, * paging in both guests. If true, we simply unprotect the page * and resume the guest. */ - if (vcpu->arch.mmu.direct_map && + if (vcpu->arch.mmu->direct_map && (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) { kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2)); return 1; @@ -5302,7 +5375,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { - struct kvm_mmu *mmu = &vcpu->arch.mmu; + struct kvm_mmu *mmu = vcpu->arch.mmu; int i; /* INVLPG on a * non-canonical address is a NOP according to the SDM. */ @@ -5333,7 +5406,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) { - struct kvm_mmu *mmu = &vcpu->arch.mmu; + struct kvm_mmu *mmu = vcpu->arch.mmu; bool tlb_flush = false; uint i; @@ -5377,8 +5450,8 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp); static void free_mmu_pages(struct kvm_vcpu *vcpu) { - free_page((unsigned long)vcpu->arch.mmu.pae_root); - free_page((unsigned long)vcpu->arch.mmu.lm_root); + free_page((unsigned long)vcpu->arch.mmu->pae_root); + free_page((unsigned long)vcpu->arch.mmu->lm_root); } static int alloc_mmu_pages(struct kvm_vcpu *vcpu) @@ -5398,9 +5471,9 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) if (!page) return -ENOMEM; - vcpu->arch.mmu.pae_root = page_address(page); + vcpu->arch.mmu->pae_root = page_address(page); for (i = 0; i < 4; ++i) - vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; + vcpu->arch.mmu->pae_root[i] = INVALID_PAGE; return 0; } @@ -5409,27 +5482,21 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) { uint i; - vcpu->arch.walk_mmu = &vcpu->arch.mmu; - vcpu->arch.mmu.root_hpa = INVALID_PAGE; - vcpu->arch.mmu.translate_gpa = translate_gpa; - vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; + vcpu->arch.mmu = &vcpu->arch.root_mmu; + vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; + vcpu->arch.root_mmu.root_hpa = INVALID_PAGE; + vcpu->arch.root_mmu.translate_gpa = translate_gpa; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; - - return alloc_mmu_pages(vcpu); -} + vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; -void kvm_mmu_setup(struct kvm_vcpu *vcpu) -{ - MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); + vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE; + vcpu->arch.guest_mmu.translate_gpa = translate_gpa; + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; - /* - * kvm_mmu_setup() is called only on vCPU initialization. - * Therefore, no need to reset mmu roots as they are not yet - * initialized. - */ - kvm_init_mmu(vcpu, false); + vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; + return alloc_mmu_pages(vcpu); } static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, @@ -5612,7 +5679,7 @@ restart: if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && PageTransCompoundMap(pfn_to_page(pfn))) { - drop_spte(kvm, sptep); + pte_list_remove(rmap_head, sptep); need_tlb_flush = 1; goto restart; } @@ -5869,6 +5936,16 @@ int kvm_mmu_module_init(void) { int ret = -ENOMEM; + /* + * MMU roles use union aliasing which is, generally speaking, an + * undefined behavior. However, we supposedly know how compilers behave + * and the current status quo is unlikely to change. Guardians below are + * supposed to let us know if the assumption becomes false. + */ + BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32)); + BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32)); + BUILD_BUG_ON(sizeof(union kvm_mmu_role) != sizeof(u64)); + kvm_mmu_reset_all_pte_masks(); pte_list_desc_cache = kmem_cache_create("pte_list_desc", @@ -5898,7 +5975,7 @@ out: } /* - * Caculate mmu pages needed for kvm. + * Calculate mmu pages needed for kvm. */ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) { diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h @@ -43,11 +43,6 @@ #define PT32_ROOT_LEVEL 2 #define PT32E_ROOT_LEVEL 3 -#define PT_PDPE_LEVEL 3 -#define PT_DIRECTORY_LEVEL 2 -#define PT_PAGE_TABLE_LEVEL 1 -#define PT_MAX_HUGEPAGE_LEVEL (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES - 1) - static inline u64 rsvd_bits(int s, int e) { if (e < s) @@ -80,7 +75,7 @@ static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) { - if (likely(vcpu->arch.mmu.root_hpa != INVALID_PAGE)) + if (likely(vcpu->arch.mmu->root_hpa != INVALID_PAGE)) return 0; return kvm_mmu_load(vcpu); @@ -102,9 +97,9 @@ static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu) static inline void kvm_mmu_load_cr3(struct kvm_vcpu *vcpu) { - if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) - vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa | - kvm_get_active_pcid(vcpu)); + if (VALID_PAGE(vcpu->arch.mmu->root_hpa)) + vcpu->arch.mmu->set_cr3(vcpu, vcpu->arch.mmu->root_hpa | + kvm_get_active_pcid(vcpu)); } /* diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c @@ -59,19 +59,19 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) int i; struct kvm_mmu_page *sp; - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) return; - if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) { - hpa_t root = vcpu->arch.mmu.root_hpa; + if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { + hpa_t root = vcpu->arch.mmu->root_hpa; sp = page_header(root); - __mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu.root_level); + __mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu->root_level); return; } for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu.pae_root[i]; + hpa_t root = vcpu->arch.mmu->pae_root[i]; if (root && VALID_PAGE(root)) { root &= PT64_BASE_ADDR_MASK; @@ -122,7 +122,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) hpa = pfn << PAGE_SHIFT; if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx " - "ent %llxn", vcpu->arch.mmu.root_level, pfn, + "ent %llxn", vcpu->arch.mmu->root_level, pfn, hpa, *sptep); } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h @@ -158,14 +158,15 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, u64 gpte) { - if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) + if (is_rsvd_bits_set(vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) goto no_present; if (!FNAME(is_present_gpte)(gpte)) goto no_present; /* if accessed bit is not supported prefetch non accessed gpte */ - if (PT_HAVE_ACCESSED_DIRTY(&vcpu->arch.mmu) && !(gpte & PT_GUEST_ACCESSED_MASK)) + if (PT_HAVE_ACCESSED_DIRTY(vcpu->arch.mmu) && + !(gpte & PT_GUEST_ACCESSED_MASK)) goto no_present; return false; @@ -480,7 +481,7 @@ error: static int FNAME(walk_addr)(struct guest_walker *walker, struct kvm_vcpu *vcpu, gva_t addr, u32 access) { - return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr, + return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr, access); } @@ -509,7 +510,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, gfn = gpte_to_gfn(gpte); pte_access = sp->role.access & FNAME(gpte_access)(gpte); - FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte); + FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, no_dirty_log && (pte_access & ACC_WRITE_MASK)); if (is_error_pfn(pfn)) @@ -604,7 +605,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, direct_access = gw->pte_access; - top_level = vcpu->arch.mmu.root_level; + top_level = vcpu->arch.mmu->root_level; if (top_level == PT32E_ROOT_LEVEL) top_level = PT32_ROOT_LEVEL; /* @@ -616,7 +617,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, if (FNAME(gpte_changed)(vcpu, gw, top_level)) goto out_gpte_changed; - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) goto out_gpte_changed; for (shadow_walk_init(&it, vcpu, addr); @@ -1004,7 +1005,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) gfn = gpte_to_gfn(gpte); pte_access = sp->role.access; pte_access &= FNAME(gpte_access)(gpte); - FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte); + FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access, &nr_present)) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c @@ -809,6 +809,8 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu) nested_svm_check_exception(svm, nr, has_error_code, error_code)) return; + kvm_deliver_exception_payload(&svm->vcpu); + if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) { unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu); @@ -2922,18 +2924,18 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) { WARN_ON(mmu_is_nested(vcpu)); kvm_init_shadow_mmu(vcpu); - vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; - vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; - vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr; - vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit; - vcpu->arch.mmu.shadow_root_level = get_npt_level(vcpu); - reset_shadow_zero_bits_mask(vcpu, &vcpu->arch.mmu); + vcpu->arch.mmu->set_cr3 = nested_svm_set_tdp_cr3; + vcpu->arch.mmu->get_cr3 = nested_svm_get_tdp_cr3; + vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr; + vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit; + vcpu->arch.mmu->shadow_root_level = get_npt_level(vcpu); + reset_shadow_zero_bits_mask(vcpu, vcpu->arch.mmu); vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; } static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) { - vcpu->arch.walk_mmu = &vcpu->arch.mmu; + vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; } static int nested_svm_check_permissions(struct vcpu_svm *svm) @@ -2969,16 +2971,13 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, svm->vmcb->control.exit_info_1 = error_code; /* - * FIXME: we should not write CR2 when L1 intercepts an L2 #PF exception. - * The fix is to add the ancillary datum (CR2 or DR6) to structs - * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6 can be - * written only when inject_pending_event runs (DR6 would written here - * too). This should be conditional on a new capability---if the - * capability is disabled, kvm_multiple_exception would write the - * ancillary information to CR2 or DR6, for backwards ABI-compatibility. + * EXITINFO2 is undefined for all exception intercepts other + * than #PF. */ if (svm->vcpu.arch.exception.nested_apf) svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token; + else if (svm->vcpu.arch.exception.has_payload) + svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload; else svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; @@ -5642,26 +5641,24 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) "mov %%r13, %c[r13](%[svm]) \n\t" "mov %%r14, %c[r14](%[svm]) \n\t" "mov %%r15, %c[r15](%[svm]) \n\t" -#endif /* * Clear host registers marked as clobbered to prevent * speculative use. */ - "xor %%" _ASM_BX ", %%" _ASM_BX " \n\t" - "xor %%" _ASM_CX ", %%" _ASM_CX " \n\t" - "xor %%" _ASM_DX ", %%" _ASM_DX " \n\t" - "xor %%" _ASM_SI ", %%" _ASM_SI " \n\t" - "xor %%" _ASM_DI ", %%" _ASM_DI " \n\t" -#ifdef CONFIG_X86_64 - "xor %%r8, %%r8 \n\t" - "xor %%r9, %%r9 \n\t" - "xor %%r10, %%r10 \n\t" - "xor %%r11, %%r11 \n\t" - "xor %%r12, %%r12 \n\t" - "xor %%r13, %%r13 \n\t" - "xor %%r14, %%r14 \n\t" - "xor %%r15, %%r15 \n\t" + "xor %%r8d, %%r8d \n\t" + "xor %%r9d, %%r9d \n\t" + "xor %%r10d, %%r10d \n\t" + "xor %%r11d, %%r11d \n\t" + "xor %%r12d, %%r12d \n\t" + "xor %%r13d, %%r13d \n\t" + "xor %%r14d, %%r14d \n\t" + "xor %%r15d, %%r15d \n\t" #endif + "xor %%ebx, %%ebx \n\t" + "xor %%ecx, %%ecx \n\t" + "xor %%edx, %%edx \n\t" + "xor %%esi, %%esi \n\t" + "xor %%edi, %%edi \n\t" "pop %%" _ASM_BP : : [svm]"a"(svm), @@ -7040,6 +7037,13 @@ failed: return ret; } +static int nested_enable_evmcs(struct kvm_vcpu *vcpu, + uint16_t *vmcs_version) +{ + /* Intel-only feature */ + return -ENODEV; +} + static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -7169,6 +7173,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .mem_enc_op = svm_mem_enc_op, .mem_enc_reg_region = svm_register_enc_region, .mem_enc_unreg_region = svm_unregister_enc_region, + + .nested_enable_evmcs = nested_enable_evmcs, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h @@ -1418,6 +1418,48 @@ TRACE_EVENT(kvm_hv_flush_tlb_ex, __entry->valid_bank_mask, __entry->format, __entry->address_space, __entry->flags) ); + +/* + * Tracepoints for kvm_hv_send_ipi. + */ +TRACE_EVENT(kvm_hv_send_ipi, + TP_PROTO(u32 vector, u64 processor_mask), + TP_ARGS(vector, processor_mask), + + TP_STRUCT__entry( + __field(u32, vector) + __field(u64, processor_mask) + ), + + TP_fast_assign( + __entry->vector = vector; + __entry->processor_mask = processor_mask; + ), + + TP_printk("vector %x processor_mask 0x%llx", + __entry->vector, __entry->processor_mask) +); + +TRACE_EVENT(kvm_hv_send_ipi_ex, + TP_PROTO(u32 vector, u64 format, u64 valid_bank_mask), + TP_ARGS(vector, format, valid_bank_mask), + + TP_STRUCT__entry( + __field(u32, vector) + __field(u64, format) + __field(u64, valid_bank_mask) + ), + + TP_fast_assign( + __entry->vector = vector; + __entry->format = format; + __entry->valid_bank_mask = valid_bank_mask; + ), + + TP_printk("vector %x format %llx valid_bank_mask 0x%llx", + __entry->vector, __entry->format, + __entry->valid_bank_mask) +); #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c @@ -20,6 +20,7 @@ #include "mmu.h" #include "cpuid.h" #include "lapic.h" +#include "hyperv.h" #include <linux/kvm_host.h> #include <linux/module.h> @@ -61,7 +62,7 @@ #define __ex(x) __kvm_handle_fault_on_reboot(x) #define __ex_clear(x, reg) \ - ____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg) + ____kvm_handle_fault_on_reboot(x, "xor " reg ", " reg) MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); @@ -107,9 +108,12 @@ module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); * VMX and be a hypervisor for its own guests. If nested=0, guests may not * use VMX instructions. */ -static bool __read_mostly nested = 0; +static bool __read_mostly nested = 1; module_param(nested, bool, S_IRUGO); +static bool __read_mostly nested_early_check = 0; +module_param(nested_early_check, bool, S_IRUGO); + static u64 __read_mostly host_xss; static bool __read_mostly enable_pml = 1; @@ -131,7 +135,7 @@ static bool __read_mostly enable_preemption_timer = 1; module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); #endif -#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) +#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE #define KVM_VM_CR0_ALWAYS_ON \ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \ @@ -187,6 +191,7 @@ static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; module_param(ple_window_max, uint, 0444); extern const ulong vmx_return; +extern const ulong vmx_early_consistency_check_return; static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); @@ -827,14 +832,28 @@ struct nested_vmx { */ struct vmcs12 *cached_shadow_vmcs12; /* - * Indicates if the shadow vmcs must be updated with the - * data hold by vmcs12 + * Indicates if the shadow vmcs or enlightened vmcs must be updated + * with the data held by struct vmcs12. */ - bool sync_shadow_vmcs; + bool need_vmcs12_sync; bool dirty_vmcs12; + /* + * vmcs02 has been initialized, i.e. state that is constant for + * vmcs02 has been written to the backing VMCS. Initialization + * is delayed until L1 actually attempts to run a nested VM. + */ + bool vmcs02_initialized; + bool change_vmcs01_virtual_apic_mode; + /* + * Enlightened VMCS has been enabled. It does not mean that L1 has to + * use it. However, VMX features available to L1 will be limited based + * on what the enlightened VMCS supports. + */ + bool enlightened_vmcs_enabled; + /* L2 must run next, and mustn't decide to exit to L1. */ bool nested_run_pending; @@ -870,6 +889,10 @@ struct nested_vmx { /* in guest mode on SMM entry? */ bool guest_mode; } smm; + + gpa_t hv_evmcs_vmptr; + struct page *hv_evmcs_page; + struct hv_enlightened_vmcs *hv_evmcs; }; #define POSTED_INTR_ON 0 @@ -1381,6 +1404,49 @@ DEFINE_STATIC_KEY_FALSE(enable_evmcs); #define KVM_EVMCS_VERSION 1 +/* + * Enlightened VMCSv1 doesn't support these: + * + * POSTED_INTR_NV = 0x00000002, + * GUEST_INTR_STATUS = 0x00000810, + * APIC_ACCESS_ADDR = 0x00002014, + * POSTED_INTR_DESC_ADDR = 0x00002016, + * EOI_EXIT_BITMAP0 = 0x0000201c, + * EOI_EXIT_BITMAP1 = 0x0000201e, + * EOI_EXIT_BITMAP2 = 0x00002020, + * EOI_EXIT_BITMAP3 = 0x00002022, + * GUEST_PML_INDEX = 0x00000812, + * PML_ADDRESS = 0x0000200e, + * VM_FUNCTION_CONTROL = 0x00002018, + * EPTP_LIST_ADDRESS = 0x00002024, + * VMREAD_BITMAP = 0x00002026, + * VMWRITE_BITMAP = 0x00002028, + * + * TSC_MULTIPLIER = 0x00002032, + * PLE_GAP = 0x00004020, + * PLE_WINDOW = 0x00004022, + * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, + * GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808, + * HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04, + * + * Currently unsupported in KVM: + * GUEST_IA32_RTIT_CTL = 0x00002814, + */ +#define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \ + PIN_BASED_VMX_PREEMPTION_TIMER) +#define EVMCS1_UNSUPPORTED_2NDEXEC \ + (SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \ + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \ + SECONDARY_EXEC_APIC_REGISTER_VIRT | \ + SECONDARY_EXEC_ENABLE_PML | \ + SECONDARY_EXEC_ENABLE_VMFUNC | \ + SECONDARY_EXEC_SHADOW_VMCS | \ + SECONDARY_EXEC_TSC_SCALING | \ + SECONDARY_EXEC_PAUSE_LOOP_EXITING) +#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) +#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) +#define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING) + #if IS_ENABLED(CONFIG_HYPERV) static bool __read_mostly enlightened_vmcs = true; module_param(enlightened_vmcs, bool, 0444); @@ -1473,69 +1539,12 @@ static void evmcs_load(u64 phys_addr) static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) { - /* - * Enlightened VMCSv1 doesn't support these: - * - * POSTED_INTR_NV = 0x00000002, - * GUEST_INTR_STATUS = 0x00000810, - * APIC_ACCESS_ADDR = 0x00002014, - * POSTED_INTR_DESC_ADDR = 0x00002016, - * EOI_EXIT_BITMAP0 = 0x0000201c, - * EOI_EXIT_BITMAP1 = 0x0000201e, - * EOI_EXIT_BITMAP2 = 0x00002020, - * EOI_EXIT_BITMAP3 = 0x00002022, - */ - vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; - vmcs_conf->cpu_based_2nd_exec_ctrl &= - ~SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; - vmcs_conf->cpu_based_2nd_exec_ctrl &= - ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - vmcs_conf->cpu_based_2nd_exec_ctrl &= - ~SECONDARY_EXEC_APIC_REGISTER_VIRT; - - /* - * GUEST_PML_INDEX = 0x00000812, - * PML_ADDRESS = 0x0000200e, - */ - vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_PML; - - /* VM_FUNCTION_CONTROL = 0x00002018, */ - vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_ENABLE_VMFUNC; - - /* - * EPTP_LIST_ADDRESS = 0x00002024, - * VMREAD_BITMAP = 0x00002026, - * VMWRITE_BITMAP = 0x00002028, - */ - vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_SHADOW_VMCS; - - /* - * TSC_MULTIPLIER = 0x00002032, - */ - vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_TSC_SCALING; - - /* - * PLE_GAP = 0x00004020, - * PLE_WINDOW = 0x00004022, - */ - vmcs_conf->cpu_based_2nd_exec_ctrl &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; - - /* - * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, - */ - vmcs_conf->pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL; + vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC; - /* - * GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808, - * HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04, - */ - vmcs_conf->vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; - vmcs_conf->vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; + vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; - /* - * Currently unsupported in KVM: - * GUEST_IA32_RTIT_CTL = 0x00002814, - */ } /* check_ept_pointer() should be under protection of ept_pointer_lock. */ @@ -1560,26 +1569,27 @@ static void check_ept_pointer_match(struct kvm *kvm) static int vmx_hv_remote_flush_tlb(struct kvm *kvm) { - int ret; + struct kvm_vcpu *vcpu; + int ret = -ENOTSUPP, i; spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK) check_ept_pointer_match(kvm); - if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) { - ret = -ENOTSUPP; - goto out; - } - /* * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs the address of the * base of EPT PML4 table, strip off EPT configuration information. */ - ret = hyperv_flush_guest_mapping( - to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer & PAGE_MASK); + if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) { + kvm_for_each_vcpu(i, vcpu, kvm) + ret |= hyperv_flush_guest_mapping( + to_vmx(kvm_get_vcpu(kvm, i))->ept_pointer & PAGE_MASK); + } else { + ret = hyperv_flush_guest_mapping( + to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer & PAGE_MASK); + } -out: spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); return ret; } @@ -1595,6 +1605,35 @@ static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {} static inline void evmcs_touch_msr_bitmap(void) {} #endif /* IS_ENABLED(CONFIG_HYPERV) */ +static int nested_enable_evmcs(struct kvm_vcpu *vcpu, + uint16_t *vmcs_version) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* We don't support disabling the feature for simplicity. */ + if (vmx->nested.enlightened_vmcs_enabled) + return 0; + + vmx->nested.enlightened_vmcs_enabled = true; + + /* + * vmcs_version represents the range of supported Enlightened VMCS + * versions: lower 8 bits is the minimal version, higher 8 bits is the + * maximum supported version. KVM supports versions from 1 to + * KVM_EVMCS_VERSION. + */ + if (vmcs_version) + *vmcs_version = (KVM_EVMCS_VERSION << 8) | 1; + + vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL; + vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; + vmx->nested.msrs.exit_ctls_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; + vmx->nested.msrs.secondary_ctls_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC; + vmx->nested.msrs.vmfunc_controls &= ~EVMCS1_UNSUPPORTED_VMFUNC; + + return 0; +} + static inline bool is_exception_n(u32 intr_info, u8 vector) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | @@ -1617,11 +1656,6 @@ static inline bool is_page_fault(u32 intr_info) return is_exception_n(intr_info, PF_VECTOR); } -static inline bool is_no_device(u32 intr_info) -{ - return is_exception_n(intr_info, NM_VECTOR); -} - static inline bool is_invalid_opcode(u32 intr_info) { return is_exception_n(intr_info, UD_VECTOR); @@ -1632,12 +1666,6 @@ static inline bool is_gp_fault(u32 intr_info) return is_exception_n(intr_info, GP_VECTOR); } -static inline bool is_external_interrupt(u32 intr_info) -{ - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) - == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); -} - static inline bool is_machine_check(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | @@ -2063,9 +2091,6 @@ static inline bool is_nmi(u32 intr_info) static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, u32 exit_intr_info, unsigned long exit_qualification); -static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12, - u32 reason, unsigned long qualification); static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) { @@ -2077,7 +2102,7 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) return -1; } -static inline void __invvpid(int ext, u16 vpid, gva_t gva) +static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva) { struct { u64 vpid : 16; @@ -2086,22 +2111,20 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva) } operand = { vpid, 0, gva }; bool error; - asm volatile (__ex(ASM_VMX_INVVPID) CC_SET(na) - : CC_OUT(na) (error) : "a"(&operand), "c"(ext) - : "memory"); + asm volatile (__ex("invvpid %2, %1") CC_SET(na) + : CC_OUT(na) (error) : "r"(ext), "m"(operand)); BUG_ON(error); } -static inline void __invept(int ext, u64 eptp, gpa_t gpa) +static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa) { struct { u64 eptp, gpa; } operand = {eptp, gpa}; bool error; - asm volatile (__ex(ASM_VMX_INVEPT) CC_SET(na) - : CC_OUT(na) (error) : "a" (&operand), "c" (ext) - : "memory"); + asm volatile (__ex("invept %2, %1") CC_SET(na) + : CC_OUT(na) (error) : "r"(ext), "m"(operand)); BUG_ON(error); } @@ -2120,9 +2143,8 @@ static void vmcs_clear(struct vmcs *vmcs) u64 phys_addr = __pa(vmcs); bool error; - asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) CC_SET(na) - : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr) - : "memory"); + asm volatile (__ex("vmclear %1") CC_SET(na) + : CC_OUT(na) (error) : "m"(phys_addr)); if (unlikely(error)) printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", vmcs, phys_addr); @@ -2145,9 +2167,8 @@ static void vmcs_load(struct vmcs *vmcs) if (static_branch_unlikely(&enable_evmcs)) return evmcs_load(phys_addr); - asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) CC_SET(na) - : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr) - : "memory"); + asm volatile (__ex("vmptrld %1") CC_SET(na) + : CC_OUT(na) (error) : "m"(phys_addr)); if (unlikely(error)) printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n", vmcs, phys_addr); @@ -2323,8 +2344,8 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field) { unsigned long value; - asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0") - : "=a"(value) : "d"(field) : "cc"); + asm volatile (__ex_clear("vmread %1, %0", "%k0") + : "=r"(value) : "r"(field)); return value; } @@ -2375,8 +2396,8 @@ static __always_inline void __vmcs_writel(unsigned long field, unsigned long val { bool error; - asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) CC_SET(na) - : CC_OUT(na) (error) : "a"(value), "d"(field)); + asm volatile (__ex("vmwrite %2, %1") CC_SET(na) + : CC_OUT(na) (error) : "r"(field), "rm"(value)); if (unlikely(error)) vmwrite_error(field, value); } @@ -2707,7 +2728,8 @@ static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, u64 guest_val, u64 host_val) { vmcs_write64(guest_val_vmcs, guest_val); - vmcs_write64(host_val_vmcs, host_val); + if (host_val_vmcs != HOST_IA32_EFER) + vmcs_write64(host_val_vmcs, host_val); vm_entry_controls_setbit(vmx, entry); vm_exit_controls_setbit(vmx, exit); } @@ -2805,8 +2827,6 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) ignore_bits &= ~(u64)EFER_SCE; #endif - clear_atomic_switch_msr(vmx, MSR_EFER); - /* * On EPT, we can't emulate NX, so we must switch EFER atomically. * On CPUs that support "load IA32_EFER", always switch EFER @@ -2819,8 +2839,12 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) if (guest_efer != host_efer) add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer, false); + else + clear_atomic_switch_msr(vmx, MSR_EFER); return false; } else { + clear_atomic_switch_msr(vmx, MSR_EFER); + guest_efer &= ~ignore_bits; guest_efer |= host_efer & ignore_bits; @@ -3272,34 +3296,30 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); unsigned int nr = vcpu->arch.exception.nr; + bool has_payload = vcpu->arch.exception.has_payload; + unsigned long payload = vcpu->arch.exception.payload; if (nr == PF_VECTOR) { if (vcpu->arch.exception.nested_apf) { *exit_qual = vcpu->arch.apf.nested_apf_token; return 1; } - /* - * FIXME: we must not write CR2 when L1 intercepts an L2 #PF exception. - * The fix is to add the ancillary datum (CR2 or DR6) to structs - * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6 - * can be written only when inject_pending_event runs. This should be - * conditional on a new capability---if the capability is disabled, - * kvm_multiple_exception would write the ancillary information to - * CR2 or DR6, for backwards ABI-compatibility. - */ if (nested_vmx_is_page_fault_vmexit(vmcs12, vcpu->arch.exception.error_code)) { - *exit_qual = vcpu->arch.cr2; - return 1; - } - } else { - if (vmcs12->exception_bitmap & (1u << nr)) { - if (nr == DB_VECTOR) - *exit_qual = vcpu->arch.dr6; - else - *exit_qual = 0; + *exit_qual = has_payload ? payload : vcpu->arch.cr2; return 1; } + } else if (vmcs12->exception_bitmap & (1u << nr)) { + if (nr == DB_VECTOR) { + if (!has_payload) { + payload = vcpu->arch.dr6; + payload &= ~(DR6_FIXED_1 | DR6_BT); + payload ^= DR6_RTM; + } + *exit_qual = payload; + } else + *exit_qual = 0; + return 1; } return 0; @@ -3326,6 +3346,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu) u32 error_code = vcpu->arch.exception.error_code; u32 intr_info = nr | INTR_INFO_VALID_MASK; + kvm_deliver_exception_payload(vcpu); + if (has_error_code) { vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); intr_info |= INTR_INFO_DELIVER_CODE_MASK; @@ -4397,9 +4419,7 @@ static void kvm_cpu_vmxon(u64 addr) cr4_set_bits(X86_CR4_VMXE); intel_pt_handle_vmx(1); - asm volatile (ASM_VMX_VMXON_RAX - : : "a"(&addr), "m"(addr) - : "memory", "cc"); + asm volatile ("vmxon %0" : : "m"(addr)); } static int hardware_enable(void) @@ -4468,7 +4488,7 @@ static void vmclear_local_loaded_vmcss(void) */ static void kvm_cpu_vmxoff(void) { - asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); + asm volatile (__ex("vmxoff")); intel_pt_handle_vmx(0); cr4_clear_bits(X86_CR4_VMXE); @@ -5112,9 +5132,10 @@ static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid, bool invalidate_gpa) { if (enable_ept && (invalidate_gpa || !enable_vpid)) { - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) return; - ept_sync_context(construct_eptp(vcpu, vcpu->arch.mmu.root_hpa)); + ept_sync_context(construct_eptp(vcpu, + vcpu->arch.mmu->root_hpa)); } else { vpid_sync_context(vpid); } @@ -5264,7 +5285,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long hw_cr0; - hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK); + hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); if (enable_unrestricted_guest) hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; else { @@ -6339,6 +6360,9 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) rdmsr(MSR_IA32_CR_PAT, low32, high32); vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); } + + if (cpu_has_load_ia32_efer) + vmcs_write64(HOST_IA32_EFER, host_efer); } static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) @@ -6666,7 +6690,6 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); if (enable_pml) { - ASSERT(vmx->pml_pg); vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); } @@ -8067,35 +8090,39 @@ static int handle_monitor(struct kvm_vcpu *vcpu) /* * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), - * set the success or error code of an emulated VMX instruction, as specified - * by Vol 2B, VMX Instruction Reference, "Conventions". + * set the success or error code of an emulated VMX instruction (as specified + * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated + * instruction. */ -static void nested_vmx_succeed(struct kvm_vcpu *vcpu) +static int nested_vmx_succeed(struct kvm_vcpu *vcpu) { vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); + return kvm_skip_emulated_instruction(vcpu); } -static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu) +static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu) { vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)) | X86_EFLAGS_CF); + return kvm_skip_emulated_instruction(vcpu); } -static void nested_vmx_failValid(struct kvm_vcpu *vcpu, - u32 vm_instruction_error) +static int nested_vmx_failValid(struct kvm_vcpu *vcpu, + u32 vm_instruction_error) { - if (to_vmx(vcpu)->nested.current_vmptr == -1ull) { - /* - * failValid writes the error number to the current VMCS, which - * can't be done there isn't a current VMCS. - */ - nested_vmx_failInvalid(vcpu); - return; - } + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * failValid writes the error number to the current VMCS, which + * can't be done if there isn't a current VMCS. + */ + if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs) + return nested_vmx_failInvalid(vcpu); + vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_SF | X86_EFLAGS_OF)) @@ -8105,6 +8132,7 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu, * We don't need to force a shadow sync because * VM_INSTRUCTION_ERROR is not shadowed */ + return kvm_skip_emulated_instruction(vcpu); } static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) @@ -8292,6 +8320,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) vmx->nested.vpid02 = allocate_vpid(); + vmx->nested.vmcs02_initialized = false; vmx->nested.vmxon = true; return 0; @@ -8345,10 +8374,9 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } - if (vmx->nested.vmxon) { - nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); - return kvm_skip_emulated_instruction(vcpu); - } + if (vmx->nested.vmxon) + return nested_vmx_failValid(vcpu, + VMXERR_VMXON_IN_VMX_ROOT_OPERATION); if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) != VMXON_NEEDED_FEATURES) { @@ -8367,21 +8395,17 @@ static int handle_vmon(struct kvm_vcpu *vcpu) * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; * which replaces physical address width with 32 */ - if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) { - nested_vmx_failInvalid(vcpu); - return kvm_skip_emulated_instruction(vcpu); - } + if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) + return nested_vmx_failInvalid(vcpu); page = kvm_vcpu_gpa_to_page(vcpu, vmptr); - if (is_error_page(page)) { - nested_vmx_failInvalid(vcpu); - return kvm_skip_emulated_instruction(vcpu); - } + if (is_error_page(page)) + return nested_vmx_failInvalid(vcpu); + if (*(u32 *)kmap(page) != VMCS12_REVISION) { kunmap(page); kvm_release_page_clean(page); - nested_vmx_failInvalid(vcpu); - return kvm_skip_emulated_instruction(vcpu); + return nested_vmx_failInvalid(vcpu); } kunmap(page); kvm_release_page_clean(page); @@ -8391,8 +8415,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) if (ret) return ret; - nested_vmx_succeed(vcpu); - return kvm_skip_emulated_instruction(vcpu); + return nested_vmx_succeed(vcpu); } /* @@ -8423,8 +8446,24 @@ static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) vmcs_write64(VMCS_LINK_POINTER, -1ull); } -static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) +static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!vmx->nested.hv_evmcs) + return; + + kunmap(vmx->nested.hv_evmcs_page); + kvm_release_page_dirty(vmx->nested.hv_evmcs_page); + vmx->nested.hv_evmcs_vmptr = -1ull; + vmx->nested.hv_evmcs_page = NULL; + vmx->nested.hv_evmcs = NULL; +} + +static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + if (vmx->nested.current_vmptr == -1ull) return; @@ -8432,16 +8471,18 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) /* copy to memory all shadowed fields in case they were modified */ copy_shadow_to_vmcs12(vmx); - vmx->nested.sync_shadow_vmcs = false; + vmx->nested.need_vmcs12_sync = false; vmx_disable_shadow_vmcs(vmx); } vmx->nested.posted_intr_nv = -1; /* Flush VMCS12 to guest memory */ - kvm_vcpu_write_guest_page(&vmx->vcpu, + kvm_vcpu_write_guest_page(vcpu, vmx->nested.current_vmptr >> PAGE_SHIFT, vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); + kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); + vmx->nested.current_vmptr = -1ull; } @@ -8449,8 +8490,10 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) * Free whatever needs to be freed from vmx->nested when L1 goes down, or * just stops using VMX. */ -static void free_nested(struct vcpu_vmx *vmx) +static void free_nested(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) return; @@ -8483,6 +8526,10 @@ static void free_nested(struct vcpu_vmx *vmx) vmx->nested.pi_desc = NULL; } + kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); + + nested_release_evmcs(vcpu); + free_loaded_vmcs(&vmx->nested.vmcs02); } @@ -8491,9 +8538,8 @@ static int handle_vmoff(struct kvm_vcpu *vcpu) { if (!nested_vmx_check_permission(vcpu)) return 1; - free_nested(to_vmx(vcpu)); - nested_vmx_succeed(vcpu); - return kvm_skip_emulated_instruction(vcpu); + free_nested(vcpu); + return nested_vmx_succeed(vcpu); } /* Emulate the VMCLEAR instruction */ @@ -8509,25 +8555,28 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) if (nested_vmx_get_vmptr(vcpu, &vmptr)) return 1; - if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) { - nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); - return kvm_skip_emulated_instruction(vcpu); - } + if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) + return nested_vmx_failValid(vcpu, + VMXERR_VMCLEAR_INVALID_ADDRESS); - if (vmptr == vmx->nested.vmxon_ptr) { - nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); - return kvm_skip_emulated_instruction(vcpu); - } + if (vmptr == vmx->nested.vmxon_ptr) + return nested_vmx_failValid(vcpu, + VMXERR_VMCLEAR_VMXON_POINTER); - if (vmptr == vmx->nested.current_vmptr) - nested_release_vmcs12(vmx); + if (vmx->nested.hv_evmcs_page) { + if (vmptr == vmx->nested.hv_evmcs_vmptr) + nested_release_evmcs(vcpu); + } else { + if (vmptr == vmx->nested.current_vmptr) + nested_release_vmcs12(vcpu); - kvm_vcpu_write_guest(vcpu, - vmptr + offsetof(struct vmcs12, launch_state), - &zero, sizeof(zero)); + kvm_vcpu_write_guest(vcpu, + vmptr + offsetof(struct vmcs12, + launch_state), + &zero, sizeof(zero)); + } - nested_vmx_succeed(vcpu); - return kvm_skip_emulated_instruction(vcpu); + return nested_vmx_succeed(vcpu); } static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch); @@ -8610,6 +8659,395 @@ static inline int vmcs12_write_any(struct vmcs12 *vmcs12, } +static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) +{ + struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; + struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; + + vmcs12->hdr.revision_id = evmcs->revision_id; + + /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ + vmcs12->tpr_threshold = evmcs->tpr_threshold; + vmcs12->guest_rip = evmcs->guest_rip; + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { + vmcs12->guest_rsp = evmcs->guest_rsp; + vmcs12->guest_rflags = evmcs->guest_rflags; + vmcs12->guest_interruptibility_info = + evmcs->guest_interruptibility_info; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { + vmcs12->cpu_based_vm_exec_control = + evmcs->cpu_based_vm_exec_control; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { + vmcs12->exception_bitmap = evmcs->exception_bitmap; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) { + vmcs12->vm_entry_controls = evmcs->vm_entry_controls; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) { + vmcs12->vm_entry_intr_info_field = + evmcs->vm_entry_intr_info_field; + vmcs12->vm_entry_exception_error_code = + evmcs->vm_entry_exception_error_code; + vmcs12->vm_entry_instruction_len = + evmcs->vm_entry_instruction_len; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { + vmcs12->host_ia32_pat = evmcs->host_ia32_pat; + vmcs12->host_ia32_efer = evmcs->host_ia32_efer; + vmcs12->host_cr0 = evmcs->host_cr0; + vmcs12->host_cr3 = evmcs->host_cr3; + vmcs12->host_cr4 = evmcs->host_cr4; + vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp; + vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip; + vmcs12->host_rip = evmcs->host_rip; + vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs; + vmcs12->host_es_selector = evmcs->host_es_selector; + vmcs12->host_cs_selector = evmcs->host_cs_selector; + vmcs12->host_ss_selector = evmcs->host_ss_selector; + vmcs12->host_ds_selector = evmcs->host_ds_selector; + vmcs12->host_fs_selector = evmcs->host_fs_selector; + vmcs12->host_gs_selector = evmcs->host_gs_selector; + vmcs12->host_tr_selector = evmcs->host_tr_selector; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { + vmcs12->pin_based_vm_exec_control = + evmcs->pin_based_vm_exec_control; + vmcs12->vm_exit_controls = evmcs->vm_exit_controls; + vmcs12->secondary_vm_exec_control = + evmcs->secondary_vm_exec_control; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) { + vmcs12->io_bitmap_a = evmcs->io_bitmap_a; + vmcs12->io_bitmap_b = evmcs->io_bitmap_b; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) { + vmcs12->msr_bitmap = evmcs->msr_bitmap; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) { + vmcs12->guest_es_base = evmcs->guest_es_base; + vmcs12->guest_cs_base = evmcs->guest_cs_base; + vmcs12->guest_ss_base = evmcs->guest_ss_base; + vmcs12->guest_ds_base = evmcs->guest_ds_base; + vmcs12->guest_fs_base = evmcs->guest_fs_base; + vmcs12->guest_gs_base = evmcs->guest_gs_base; + vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base; + vmcs12->guest_tr_base = evmcs->guest_tr_base; + vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base; + vmcs12->guest_idtr_base = evmcs->guest_idtr_base; + vmcs12->guest_es_limit = evmcs->guest_es_limit; + vmcs12->guest_cs_limit = evmcs->guest_cs_limit; + vmcs12->guest_ss_limit = evmcs->guest_ss_limit; + vmcs12->guest_ds_limit = evmcs->guest_ds_limit; + vmcs12->guest_fs_limit = evmcs->guest_fs_limit; + vmcs12->guest_gs_limit = evmcs->guest_gs_limit; + vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit; + vmcs12->guest_tr_limit = evmcs->guest_tr_limit; + vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit; + vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit; + vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes; + vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes; + vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes; + vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes; + vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes; + vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes; + vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes; + vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes; + vmcs12->guest_es_selector = evmcs->guest_es_selector; + vmcs12->guest_cs_selector = evmcs->guest_cs_selector; + vmcs12->guest_ss_selector = evmcs->guest_ss_selector; + vmcs12->guest_ds_selector = evmcs->guest_ds_selector; + vmcs12->guest_fs_selector = evmcs->guest_fs_selector; + vmcs12->guest_gs_selector = evmcs->guest_gs_selector; + vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector; + vmcs12->guest_tr_selector = evmcs->guest_tr_selector; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) { + vmcs12->tsc_offset = evmcs->tsc_offset; + vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr; + vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) { + vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask; + vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask; + vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow; + vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow; + vmcs12->guest_cr0 = evmcs->guest_cr0; + vmcs12->guest_cr3 = evmcs->guest_cr3; + vmcs12->guest_cr4 = evmcs->guest_cr4; + vmcs12->guest_dr7 = evmcs->guest_dr7; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) { + vmcs12->host_fs_base = evmcs->host_fs_base; + vmcs12->host_gs_base = evmcs->host_gs_base; + vmcs12->host_tr_base = evmcs->host_tr_base; + vmcs12->host_gdtr_base = evmcs->host_gdtr_base; + vmcs12->host_idtr_base = evmcs->host_idtr_base; + vmcs12->host_rsp = evmcs->host_rsp; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) { + vmcs12->ept_pointer = evmcs->ept_pointer; + vmcs12->virtual_processor_id = evmcs->virtual_processor_id; + } + + if (unlikely(!(evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) { + vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer; + vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl; + vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat; + vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer; + vmcs12->guest_pdptr0 = evmcs->guest_pdptr0; + vmcs12->guest_pdptr1 = evmcs->guest_pdptr1; + vmcs12->guest_pdptr2 = evmcs->guest_pdptr2; + vmcs12->guest_pdptr3 = evmcs->guest_pdptr3; + vmcs12->guest_pending_dbg_exceptions = + evmcs->guest_pending_dbg_exceptions; + vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp; + vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip; + vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs; + vmcs12->guest_activity_state = evmcs->guest_activity_state; + vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs; + } + + /* + * Not used? + * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; + * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; + * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; + * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0; + * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1; + * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2; + * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3; + * vmcs12->page_fault_error_code_mask = + * evmcs->page_fault_error_code_mask; + * vmcs12->page_fault_error_code_match = + * evmcs->page_fault_error_code_match; + * vmcs12->cr3_target_count = evmcs->cr3_target_count; + * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; + * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; + * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; + */ + + /* + * Read only fields: + * vmcs12->guest_physical_address = evmcs->guest_physical_address; + * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; + * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; + * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; + * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; + * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; + * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; + * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; + * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; + * vmcs12->exit_qualification = evmcs->exit_qualification; + * vmcs12->guest_linear_address = evmcs->guest_linear_address; + * + * Not present in struct vmcs12: + * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx; + * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi; + * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi; + * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; + */ + + return 0; +} + +static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) +{ + struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; + struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; + + /* + * Should not be changed by KVM: + * + * evmcs->host_es_selector = vmcs12->host_es_selector; + * evmcs->host_cs_selector = vmcs12->host_cs_selector; + * evmcs->host_ss_selector = vmcs12->host_ss_selector; + * evmcs->host_ds_selector = vmcs12->host_ds_selector; + * evmcs->host_fs_selector = vmcs12->host_fs_selector; + * evmcs->host_gs_selector = vmcs12->host_gs_selector; + * evmcs->host_tr_selector = vmcs12->host_tr_selector; + * evmcs->host_ia32_pat = vmcs12->host_ia32_pat; + * evmcs->host_ia32_efer = vmcs12->host_ia32_efer; + * evmcs->host_cr0 = vmcs12->host_cr0; + * evmcs->host_cr3 = vmcs12->host_cr3; + * evmcs->host_cr4 = vmcs12->host_cr4; + * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp; + * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip; + * evmcs->host_rip = vmcs12->host_rip; + * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs; + * evmcs->host_fs_base = vmcs12->host_fs_base; + * evmcs->host_gs_base = vmcs12->host_gs_base; + * evmcs->host_tr_base = vmcs12->host_tr_base; + * evmcs->host_gdtr_base = vmcs12->host_gdtr_base; + * evmcs->host_idtr_base = vmcs12->host_idtr_base; + * evmcs->host_rsp = vmcs12->host_rsp; + * sync_vmcs12() doesn't read these: + * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; + * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; + * evmcs->msr_bitmap = vmcs12->msr_bitmap; + * evmcs->ept_pointer = vmcs12->ept_pointer; + * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; + * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; + * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; + * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; + * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0; + * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1; + * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2; + * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3; + * evmcs->tpr_threshold = vmcs12->tpr_threshold; + * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; + * evmcs->exception_bitmap = vmcs12->exception_bitmap; + * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; + * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; + * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; + * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; + * evmcs->page_fault_error_code_mask = + * vmcs12->page_fault_error_code_mask; + * evmcs->page_fault_error_code_match = + * vmcs12->page_fault_error_code_match; + * evmcs->cr3_target_count = vmcs12->cr3_target_count; + * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; + * evmcs->tsc_offset = vmcs12->tsc_offset; + * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; + * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; + * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; + * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; + * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; + * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; + * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; + * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; + * + * Not present in struct vmcs12: + * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; + * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; + * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; + * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; + */ + + evmcs->guest_es_selector = vmcs12->guest_es_selector; + evmcs->guest_cs_selector = vmcs12->guest_cs_selector; + evmcs->guest_ss_selector = vmcs12->guest_ss_selector; + evmcs->guest_ds_selector = vmcs12->guest_ds_selector; + evmcs->guest_fs_selector = vmcs12->guest_fs_selector; + evmcs->guest_gs_selector = vmcs12->guest_gs_selector; + evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; + evmcs->guest_tr_selector = vmcs12->guest_tr_selector; + + evmcs->guest_es_limit = vmcs12->guest_es_limit; + evmcs->guest_cs_limit = vmcs12->guest_cs_limit; + evmcs->guest_ss_limit = vmcs12->guest_ss_limit; + evmcs->guest_ds_limit = vmcs12->guest_ds_limit; + evmcs->guest_fs_limit = vmcs12->guest_fs_limit; + evmcs->guest_gs_limit = vmcs12->guest_gs_limit; + evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; + evmcs->guest_tr_limit = vmcs12->guest_tr_limit; + evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; + evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; + + evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; + evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; + evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; + evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; + evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; + evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; + evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; + evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; + + evmcs->guest_es_base = vmcs12->guest_es_base; + evmcs->guest_cs_base = vmcs12->guest_cs_base; + evmcs->guest_ss_base = vmcs12->guest_ss_base; + evmcs->guest_ds_base = vmcs12->guest_ds_base; + evmcs->guest_fs_base = vmcs12->guest_fs_base; + evmcs->guest_gs_base = vmcs12->guest_gs_base; + evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; + evmcs->guest_tr_base = vmcs12->guest_tr_base; + evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; + evmcs->guest_idtr_base = vmcs12->guest_idtr_base; + + evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; + evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; + + evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; + evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; + evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; + evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; + + evmcs->guest_pending_dbg_exceptions = + vmcs12->guest_pending_dbg_exceptions; + evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; + evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; + + evmcs->guest_activity_state = vmcs12->guest_activity_state; + evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; + + evmcs->guest_cr0 = vmcs12->guest_cr0; + evmcs->guest_cr3 = vmcs12->guest_cr3; + evmcs->guest_cr4 = vmcs12->guest_cr4; + evmcs->guest_dr7 = vmcs12->guest_dr7; + + evmcs->guest_physical_address = vmcs12->guest_physical_address; + + evmcs->vm_instruction_error = vmcs12->vm_instruction_error; + evmcs->vm_exit_reason = vmcs12->vm_exit_reason; + evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; + evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; + evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; + evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; + evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; + evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; + + evmcs->exit_qualification = vmcs12->exit_qualification; + + evmcs->guest_linear_address = vmcs12->guest_linear_address; + evmcs->guest_rsp = vmcs12->guest_rsp; + evmcs->guest_rflags = vmcs12->guest_rflags; + + evmcs->guest_interruptibility_info = + vmcs12->guest_interruptibility_info; + evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; + evmcs->vm_entry_controls = vmcs12->vm_entry_controls; + evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; + evmcs->vm_entry_exception_error_code = + vmcs12->vm_entry_exception_error_code; + evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len; + + evmcs->guest_rip = vmcs12->guest_rip; + + evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; + + return 0; +} + /* * Copy the writable VMCS shadow fields back to the VMCS12, in case * they have been modified by the L1 guest. Note that the "read-only" @@ -8683,20 +9121,6 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) vmcs_load(vmx->loaded_vmcs->vmcs); } -/* - * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was - * used before) all generate the same failure when it is missing. - */ -static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - if (vmx->nested.current_vmptr == -1ull) { - nested_vmx_failInvalid(vcpu); - return 0; - } - return 1; -} - static int handle_vmread(struct kvm_vcpu *vcpu) { unsigned long field; @@ -8709,8 +9133,8 @@ static int handle_vmread(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; - if (!nested_vmx_check_vmcs12(vcpu)) - return kvm_skip_emulated_instruction(vcpu); + if (to_vmx(vcpu)->nested.current_vmptr == -1ull) + return nested_vmx_failInvalid(vcpu); if (!is_guest_mode(vcpu)) vmcs12 = get_vmcs12(vcpu); @@ -8719,20 +9143,18 @@ static int handle_vmread(struct kvm_vcpu *vcpu) * When vmcs->vmcs_link_pointer is -1ull, any VMREAD * to shadowed-field sets the ALU flags for VMfailInvalid. */ - if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) { - nested_vmx_failInvalid(vcpu); - return kvm_skip_emulated_instruction(vcpu); - } + if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) + return nested_vmx_failInvalid(vcpu); vmcs12 = get_shadow_vmcs12(vcpu); } /* Decode instruction info and find the field to read */ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); /* Read the field, zero-extended to a u64 field_value */ - if (vmcs12_read_any(vmcs12, field, &field_value) < 0) { - nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); - return kvm_skip_emulated_instruction(vcpu); - } + if (vmcs12_read_any(vmcs12, field, &field_value) < 0) + return nested_vmx_failValid(vcpu, + VMXERR_UNSUPPORTED_VMCS_COMPONENT); + /* * Now copy part of this value to register or memory, as requested. * Note that the number of bits actually copied is 32 or 64 depending @@ -8750,8 +9172,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) (is_long_mode(vcpu) ? 8 : 4), NULL); } - nested_vmx_succeed(vcpu); - return kvm_skip_emulated_instruction(vcpu); + return nested_vmx_succeed(vcpu); } @@ -8776,8 +9197,8 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; - if (!nested_vmx_check_vmcs12(vcpu)) - return kvm_skip_emulated_instruction(vcpu); + if (vmx->nested.current_vmptr == -1ull) + return nested_vmx_failInvalid(vcpu); if (vmx_instruction_info & (1u << 10)) field_value = kvm_register_readl(vcpu, @@ -8800,11 +9221,9 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) * VMCS," then the "read-only" fields are actually read/write. */ if (vmcs_field_readonly(field) && - !nested_cpu_has_vmwrite_any_field(vcpu)) { - nested_vmx_failValid(vcpu, + !nested_cpu_has_vmwrite_any_field(vcpu)) + return nested_vmx_failValid(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); - return kvm_skip_emulated_instruction(vcpu); - } if (!is_guest_mode(vcpu)) vmcs12 = get_vmcs12(vcpu); @@ -8813,18 +9232,14 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE * to shadowed-field sets the ALU flags for VMfailInvalid. */ - if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) { - nested_vmx_failInvalid(vcpu); - return kvm_skip_emulated_instruction(vcpu); - } + if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) + return nested_vmx_failInvalid(vcpu); vmcs12 = get_shadow_vmcs12(vcpu); - } - if (vmcs12_write_any(vmcs12, field, field_value) < 0) { - nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); - return kvm_skip_emulated_instruction(vcpu); - } + if (vmcs12_write_any(vmcs12, field, field_value) < 0) + return nested_vmx_failValid(vcpu, + VMXERR_UNSUPPORTED_VMCS_COMPONENT); /* * Do not track vmcs12 dirty-state if in guest-mode @@ -8846,8 +9261,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) } } - nested_vmx_succeed(vcpu); - return kvm_skip_emulated_instruction(vcpu); + return nested_vmx_succeed(vcpu); } static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) @@ -8858,7 +9272,7 @@ static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) SECONDARY_EXEC_SHADOW_VMCS); vmcs_write64(VMCS_LINK_POINTER, __pa(vmx->vmcs01.shadow_vmcs)); - vmx->nested.sync_shadow_vmcs = true; + vmx->nested.need_vmcs12_sync = true; } vmx->nested.dirty_vmcs12 = true; } @@ -8875,36 +9289,37 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) if (nested_vmx_get_vmptr(vcpu, &vmptr)) return 1; - if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) { - nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); - return kvm_skip_emulated_instruction(vcpu); - } + if (!PAGE_ALIGNED(vmptr) || (vmptr >> cpuid_maxphyaddr(vcpu))) + return nested_vmx_failValid(vcpu, + VMXERR_VMPTRLD_INVALID_ADDRESS); - if (vmptr == vmx->nested.vmxon_ptr) { - nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); - return kvm_skip_emulated_instruction(vcpu); - } + if (vmptr == vmx->nested.vmxon_ptr) + return nested_vmx_failValid(vcpu, + VMXERR_VMPTRLD_VMXON_POINTER); + + /* Forbid normal VMPTRLD if Enlightened version was used */ + if (vmx->nested.hv_evmcs) + return 1; if (vmx->nested.current_vmptr != vmptr) { struct vmcs12 *new_vmcs12; struct page *page; page = kvm_vcpu_gpa_to_page(vcpu, vmptr); - if (is_error_page(page)) { - nested_vmx_failInvalid(vcpu); - return kvm_skip_emulated_instruction(vcpu); - } + if (is_error_page(page)) + return nested_vmx_failInvalid(vcpu); + new_vmcs12 = kmap(page); if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || (new_vmcs12->hdr.shadow_vmcs && !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { kunmap(page); kvm_release_page_clean(page); - nested_vmx_failValid(vcpu, + return nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); - return kvm_skip_emulated_instruction(vcpu); } - nested_release_vmcs12(vmx); + nested_release_vmcs12(vcpu); + /* * Load VMCS12 from guest memory since it is not already * cached. @@ -8916,8 +9331,71 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) set_current_vmptr(vmx, vmptr); } - nested_vmx_succeed(vcpu); - return kvm_skip_emulated_instruction(vcpu); + return nested_vmx_succeed(vcpu); +} + +/* + * This is an equivalent of the nested hypervisor executing the vmptrld + * instruction. + */ +static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, + bool from_launch) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct hv_vp_assist_page assist_page; + + if (likely(!vmx->nested.enlightened_vmcs_enabled)) + return 1; + + if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page))) + return 1; + + if (unlikely(!assist_page.enlighten_vmentry)) + return 1; + + if (unlikely(assist_page.current_nested_vmcs != + vmx->nested.hv_evmcs_vmptr)) { + + if (!vmx->nested.hv_evmcs) + vmx->nested.current_vmptr = -1ull; + + nested_release_evmcs(vcpu); + + vmx->nested.hv_evmcs_page = kvm_vcpu_gpa_to_page( + vcpu, assist_page.current_nested_vmcs); + + if (unlikely(is_error_page(vmx->nested.hv_evmcs_page))) + return 0; + + vmx->nested.hv_evmcs = kmap(vmx->nested.hv_evmcs_page); + + if (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION) { + nested_release_evmcs(vcpu); + return 0; + } + + vmx->nested.dirty_vmcs12 = true; + /* + * As we keep L2 state for one guest only 'hv_clean_fields' mask + * can't be used when we switch between them. Reset it here for + * simplicity. + */ + vmx->nested.hv_evmcs->hv_clean_fields &= + ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + vmx->nested.hv_evmcs_vmptr = assist_page.current_nested_vmcs; + + /* + * Unlike normal vmcs12, enlightened vmcs12 is not fully + * reloaded from guest's memory (read only fields, fields not + * present in struct hv_enlightened_vmcs, ...). Make sure there + * are no leftovers. + */ + if (from_launch) + memset(vmx->nested.cached_vmcs12, 0, + sizeof(*vmx->nested.cached_vmcs12)); + + } + return 1; } /* Emulate the VMPTRST instruction */ @@ -8932,6 +9410,9 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; + if (unlikely(to_vmx(vcpu)->nested.hv_evmcs)) + return 1; + if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, &gva)) return 1; /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ @@ -8940,8 +9421,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) kvm_inject_page_fault(vcpu, &e); return 1; } - nested_vmx_succeed(vcpu); - return kvm_skip_emulated_instruction(vcpu); + return nested_vmx_succeed(vcpu); } /* Emulate the INVEPT instruction */ @@ -8971,11 +9451,9 @@ static int handle_invept(struct kvm_vcpu *vcpu) types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; - if (type >= 32 || !(types & (1 << type))) { - nested_vmx_failValid(vcpu, + if (type >= 32 || !(types & (1 << type))) + return nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - return kvm_skip_emulated_instruction(vcpu); - } /* According to the Intel VMX instruction reference, the memory * operand is read even if it isn't needed (e.g., for type==global) @@ -8997,14 +9475,20 @@ static int handle_invept(struct kvm_vcpu *vcpu) case VMX_EPT_EXTENT_CONTEXT: kvm_mmu_sync_roots(vcpu); kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - nested_vmx_succeed(vcpu); break; default: BUG_ON(1); break; } - return kvm_skip_emulated_instruction(vcpu); + return nested_vmx_succeed(vcpu); +} + +static u16 nested_get_vpid02(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid; } static int handle_invvpid(struct kvm_vcpu *vcpu) @@ -9018,6 +9502,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) u64 vpid; u64 gla; } operand; + u16 vpid02; if (!(vmx->nested.msrs.secondary_ctls_high & SECONDARY_EXEC_ENABLE_VPID) || @@ -9035,11 +9520,9 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) types = (vmx->nested.msrs.vpid_caps & VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; - if (type >= 32 || !(types & (1 << type))) { - nested_vmx_failValid(vcpu, + if (type >= 32 || !(types & (1 << type))) + return nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - return kvm_skip_emulated_instruction(vcpu); - } /* according to the intel vmx instruction reference, the memory * operand is read even if it isn't needed (e.g., for type==global) @@ -9051,47 +9534,39 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) kvm_inject_page_fault(vcpu, &e); return 1; } - if (operand.vpid >> 16) { - nested_vmx_failValid(vcpu, + if (operand.vpid >> 16) + return nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - return kvm_skip_emulated_instruction(vcpu); - } + vpid02 = nested_get_vpid02(vcpu); switch (type) { case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: if (!operand.vpid || - is_noncanonical_address(operand.gla, vcpu)) { - nested_vmx_failValid(vcpu, + is_noncanonical_address(operand.gla, vcpu)) + return nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - return kvm_skip_emulated_instruction(vcpu); - } - if (cpu_has_vmx_invvpid_individual_addr() && - vmx->nested.vpid02) { + if (cpu_has_vmx_invvpid_individual_addr()) { __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, - vmx->nested.vpid02, operand.gla); + vpid02, operand.gla); } else - __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true); + __vmx_flush_tlb(vcpu, vpid02, false); break; case VMX_VPID_EXTENT_SINGLE_CONTEXT: case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: - if (!operand.vpid) { - nested_vmx_failValid(vcpu, + if (!operand.vpid) + return nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - return kvm_skip_emulated_instruction(vcpu); - } - __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true); + __vmx_flush_tlb(vcpu, vpid02, false); break; case VMX_VPID_EXTENT_ALL_CONTEXT: - __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true); + __vmx_flush_tlb(vcpu, vpid02, false); break; default: WARN_ON_ONCE(1); return kvm_skip_emulated_instruction(vcpu); } - nested_vmx_succeed(vcpu); - - return kvm_skip_emulated_instruction(vcpu); + return nested_vmx_succeed(vcpu); } static int handle_invpcid(struct kvm_vcpu *vcpu) @@ -9162,11 +9637,11 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) } for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - if (kvm_get_pcid(vcpu, vcpu->arch.mmu.prev_roots[i].cr3) + if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3) == operand.pcid) roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); - kvm_mmu_free_roots(vcpu, roots_to_free); + kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free); /* * If neither the current cr3 nor any of the prev_roots use the * given PCID, then nothing needs to be done here because a @@ -9293,7 +9768,7 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, kvm_mmu_unload(vcpu); mmu->ept_ad = accessed_dirty; - mmu->base_role.ad_disabled = !accessed_dirty; + mmu->mmu_role.base.ad_disabled = !accessed_dirty; vmcs12->ept_pointer = address; /* * TODO: Check what's the correct approach in case @@ -9652,9 +10127,6 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) return false; else if (is_page_fault(intr_info)) return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept; - else if (is_no_device(intr_info) && - !(vmcs12->guest_cr0 & X86_CR0_TS)) - return false; else if (is_debug(intr_info) && vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) @@ -10676,9 +11148,25 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmcs_write32(PLE_WINDOW, vmx->ple_window); } - if (vmx->nested.sync_shadow_vmcs) { - copy_vmcs12_to_shadow(vmx); - vmx->nested.sync_shadow_vmcs = false; + if (vmx->nested.need_vmcs12_sync) { + /* + * hv_evmcs may end up being not mapped after migration (when + * L2 was running), map it here to make sure vmcs12 changes are + * properly reflected. + */ + if (vmx->nested.enlightened_vmcs_enabled && + !vmx->nested.hv_evmcs) + nested_vmx_handle_enlightened_vmptrld(vcpu, false); + + if (vmx->nested.hv_evmcs) { + copy_vmcs12_to_enlightened(vmx); + /* All fields are clean */ + vmx->nested.hv_evmcs->hv_clean_fields |= + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + } else { + copy_vmcs12_to_shadow(vmx); + } + vmx->nested.need_vmcs12_sync = false; } if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) @@ -10745,7 +11233,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t" "jmp 1f \n\t" "2: \n\t" - __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" + __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t" "1: \n\t" /* Reload cr2 if changed */ "mov %c[cr2](%0), %%" _ASM_AX " \n\t" @@ -10777,9 +11265,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) /* Enter guest mode */ "jne 1f \n\t" - __ex(ASM_VMX_VMLAUNCH) "\n\t" + __ex("vmlaunch") "\n\t" "jmp 2f \n\t" - "1: " __ex(ASM_VMX_VMRESUME) "\n\t" + "1: " __ex("vmresume") "\n\t" "2: " /* Save guest registers, load host registers, keep flags */ "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t" @@ -10801,6 +11289,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "mov %%r13, %c[r13](%0) \n\t" "mov %%r14, %c[r14](%0) \n\t" "mov %%r15, %c[r15](%0) \n\t" + /* + * Clear host registers marked as clobbered to prevent + * speculative use. + */ "xor %%r8d, %%r8d \n\t" "xor %%r9d, %%r9d \n\t" "xor %%r10d, %%r10d \n\t" @@ -10958,6 +11450,10 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) vmx->loaded_vmcs = vmcs; vmx_vcpu_load(vcpu, cpu); put_cpu(); + + vm_entry_controls_reset_shadow(vmx); + vm_exit_controls_reset_shadow(vmx); + vmx_segment_cache_clear(vmx); } /* @@ -10966,12 +11462,10 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) */ static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - - vcpu_load(vcpu); - vmx_switch_vmcs(vcpu, &vmx->vmcs01); - free_nested(vmx); - vcpu_put(vcpu); + vcpu_load(vcpu); + vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01); + free_nested(vcpu); + vcpu_put(vcpu); } static void vmx_free_vcpu(struct kvm_vcpu *vcpu) @@ -11334,28 +11828,28 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) return get_vmcs12(vcpu)->ept_pointer; } -static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) +static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) { WARN_ON(mmu_is_nested(vcpu)); - if (!valid_ept_address(vcpu, nested_ept_get_cr3(vcpu))) - return 1; + vcpu->arch.mmu = &vcpu->arch.guest_mmu; kvm_init_shadow_ept_mmu(vcpu, to_vmx(vcpu)->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT, nested_ept_ad_enabled(vcpu), nested_ept_get_cr3(vcpu)); - vcpu->arch.mmu.set_cr3 = vmx_set_cr3; - vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; - vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; + vcpu->arch.mmu->set_cr3 = vmx_set_cr3; + vcpu->arch.mmu->get_cr3 = nested_ept_get_cr3; + vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; + vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; - return 0; } static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) { - vcpu->arch.walk_mmu = &vcpu->arch.mmu; + vcpu->arch.mmu = &vcpu->arch.root_mmu; + vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; } static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, @@ -11716,7 +12210,7 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, !nested_exit_intr_ack_set(vcpu) || (vmcs12->posted_intr_nv & 0xff00) || (vmcs12->posted_intr_desc_addr & 0x3f) || - (!page_address_valid(vcpu, vmcs12->posted_intr_desc_addr)))) + (vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu)))) return -EINVAL; /* tpr shadow is needed by all apicv features. */ @@ -11772,15 +12266,12 @@ static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu, static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - u64 address = vmcs12->pml_address; - int maxphyaddr = cpuid_maxphyaddr(vcpu); + if (!nested_cpu_has_pml(vmcs12)) + return 0; - if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_PML)) { - if (!nested_cpu_has_ept(vmcs12) || - !IS_ALIGNED(address, 4096) || - address >> maxphyaddr) - return -EINVAL; - } + if (!nested_cpu_has_ept(vmcs12) || + !page_address_valid(vcpu, vmcs12->pml_address)) + return -EINVAL; return 0; } @@ -11960,112 +12451,87 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne return 0; } -static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); +/* + * Returns if KVM is able to config CPU to tag TLB entries + * populated by L2 differently than TLB entries populated + * by L1. + * + * If L1 uses EPT, then TLB entries are tagged with different EPTP. + * + * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged + * with different VPID (L1 entries are tagged with vmx->vpid + * while L2 entries are tagged with vmx->nested.vpid02). + */ +static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); - vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); - vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); - vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); - vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); - vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); - vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); - vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); - vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); - vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); - vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); - vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); - vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); - vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); - vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); - vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); - vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); - vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); - vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); - vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); - vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); - vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); - vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); - vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); - vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); - vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); - vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); - vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); - vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); - vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); - vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); - - vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); - vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, - vmcs12->guest_pending_dbg_exceptions); - vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); - vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); + return nested_cpu_has_ept(vmcs12) || + (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02); +} - if (nested_cpu_has_xsaves(vmcs12)) - vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); - vmcs_write64(VMCS_LINK_POINTER, -1ull); +static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) +{ + if (vmx->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) + return vmcs12->guest_ia32_efer; + else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) + return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); + else + return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); +} - if (cpu_has_vmx_posted_intr()) - vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); +static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) +{ + /* + * If vmcs02 hasn't been initialized, set the constant vmcs02 state + * according to L0's settings (vmcs12 is irrelevant here). Host + * fields that come from L0 and are not constant, e.g. HOST_CR3, + * will be set as needed prior to VMLAUNCH/VMRESUME. + */ + if (vmx->nested.vmcs02_initialized) + return; + vmx->nested.vmcs02_initialized = true; /* - * Whether page-faults are trapped is determined by a combination of - * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. - * If enable_ept, L0 doesn't care about page faults and we should - * set all of these to L1's desires. However, if !enable_ept, L0 does - * care about (at least some) page faults, and because it is not easy - * (if at all possible?) to merge L0 and L1's desires, we simply ask - * to exit on each and every L2 page fault. This is done by setting - * MASK=MATCH=0 and (see below) EB.PF=1. - * Note that below we don't need special code to set EB.PF beyond the - * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, - * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when - * !enable_ept, EB.PF is 1, so the "or" will always be 1. + * We don't care what the EPTP value is we just need to guarantee + * it's valid so we don't get a false positive when doing early + * consistency checks. */ - vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, - enable_ept ? vmcs12->page_fault_error_code_mask : 0); - vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, - enable_ept ? vmcs12->page_fault_error_code_match : 0); + if (enable_ept && nested_early_check) + vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0)); /* All VMFUNCs are currently emulated through L0 vmexits. */ if (cpu_has_vmx_vmfunc()) vmcs_write64(VM_FUNCTION_CONTROL, 0); - if (cpu_has_vmx_apicv()) { - vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); - vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); - vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); - vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); - } + if (cpu_has_vmx_posted_intr()) + vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); - /* - * Set host-state according to L0's settings (vmcs12 is irrelevant here) - * Some constant fields are set here by vmx_set_constant_host_state(). - * Other fields are different per CPU, and will be set later when - * vmx_vcpu_load() is called, and when vmx_prepare_switch_to_guest() - * is called. - */ - vmx_set_constant_host_state(vmx); + if (cpu_has_vmx_msr_bitmap()) + vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); + + if (enable_pml) + vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); /* - * Set the MSR load/store lists to match L0's settings. + * Set the MSR load/store lists to match L0's settings. Only the + * addresses are constant (for vmcs02), the counts can change based + * on L2's behavior, e.g. switching to/from long mode. */ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); - set_cr4_guest_host_mask(vmx); + vmx_set_constant_host_state(vmx); +} - if (kvm_mpx_supported()) { - if (vmx->nested.nested_run_pending && - (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) - vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); - else - vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); - } +static void prepare_vmcs02_early_full(struct vcpu_vmx *vmx, + struct vmcs12 *vmcs12) +{ + prepare_vmcs02_constant_state(vmx); + + vmcs_write64(VMCS_LINK_POINTER, -1ull); if (enable_vpid) { if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) @@ -12073,78 +12539,30 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) else vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); } - - /* - * L1 may access the L2's PDPTR, so save them to construct vmcs12 - */ - if (enable_ept) { - vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); - vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); - vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); - vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); - } - - if (cpu_has_vmx_msr_bitmap()) - vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); } -/* - * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested - * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it - * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 - * guest in a way that will both be appropriate to L1's requests, and our - * needs. In addition to modifying the active vmcs (which is vmcs02), this - * function also has additional necessary side-effects, like setting various - * vcpu->arch fields. - * Returns 0 on success, 1 on failure. Invalid state exit qualification code - * is assigned to entry_failure_code on failure. - */ -static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, - u32 *entry_failure_code) +static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) { - struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exec_control, vmcs12_exec_ctrl; + u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); - if (vmx->nested.dirty_vmcs12) { - prepare_vmcs02_full(vcpu, vmcs12); - vmx->nested.dirty_vmcs12 = false; - } + if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) + prepare_vmcs02_early_full(vmx, vmcs12); /* - * First, the fields that are shadowed. This must be kept in sync - * with vmx_shadow_fields.h. + * HOST_RSP is normally set correctly in vmx_vcpu_run() just before + * entry, but only if the current (host) sp changed from the value + * we wrote last (vmx->host_rsp). This cache is no longer relevant + * if we switch vmcs, and rather than hold a separate cache per vmcs, + * here we just force the write to happen on entry. host_rsp will + * also be written unconditionally by nested_vmx_check_vmentry_hw() + * if we are doing early consistency checks via hardware. */ + vmx->host_rsp = 0; - vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); - vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); - vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); - vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); - vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); - - if (vmx->nested.nested_run_pending && - (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { - kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); - vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); - } else { - kvm_set_dr(vcpu, 7, vcpu->arch.dr7); - vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); - } - if (vmx->nested.nested_run_pending) { - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - vmcs12->vm_entry_intr_info_field); - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, - vmcs12->vm_entry_exception_error_code); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, - vmcs12->vm_entry_instruction_len); - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, - vmcs12->guest_interruptibility_info); - vmx->loaded_vmcs->nmi_known_unmasked = - !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); - } else { - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); - } - vmx_set_rflags(vcpu, vmcs12->guest_rflags); - + /* + * PIN CONTROLS + */ exec_control = vmcs12->pin_based_vm_exec_control; /* Preemption timer setting is computed directly in vmx_vcpu_run. */ @@ -12159,13 +12577,43 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } else { exec_control &= ~PIN_BASED_POSTED_INTR; } - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); - vmx->nested.preemption_timer_expired = false; - if (nested_cpu_has_preemption_timer(vmcs12)) - vmx_start_preemption_timer(vcpu); + /* + * EXEC CONTROLS + */ + exec_control = vmx_exec_control(vmx); /* L0's desires */ + exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; + exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; + exec_control &= ~CPU_BASED_TPR_SHADOW; + exec_control |= vmcs12->cpu_based_vm_exec_control; + + /* + * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if + * nested_get_vmcs12_pages can't fix it up, the illegal value + * will result in a VM entry failure. + */ + if (exec_control & CPU_BASED_TPR_SHADOW) { + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); + vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); + } else { +#ifdef CONFIG_X86_64 + exec_control |= CPU_BASED_CR8_LOAD_EXITING | + CPU_BASED_CR8_STORE_EXITING; +#endif + } + + /* + * A vmexit (to either L1 hypervisor or L0 userspace) is always needed + * for I/O port accesses. + */ + exec_control &= ~CPU_BASED_USE_IO_BITMAPS; + exec_control |= CPU_BASED_UNCOND_IO_EXITING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); + /* + * SECONDARY EXEC CONTROLS + */ if (cpu_has_secondary_exec_ctrls()) { exec_control = vmx->secondary_exec_control; @@ -12206,43 +12654,214 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } /* - * HOST_RSP is normally set correctly in vmx_vcpu_run() just before - * entry, but only if the current (host) sp changed from the value - * we wrote last (vmx->host_rsp). This cache is no longer relevant - * if we switch vmcs, and rather than hold a separate cache per vmcs, - * here we just force the write to happen on entry. + * ENTRY CONTROLS + * + * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE + * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate + * on the related bits (if supported by the CPU) in the hope that + * we can avoid VMWrites during vmx_set_efer(). + */ + exec_control = (vmcs12->vm_entry_controls | vmcs_config.vmentry_ctrl) & + ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER; + if (cpu_has_load_ia32_efer) { + if (guest_efer & EFER_LMA) + exec_control |= VM_ENTRY_IA32E_MODE; + if (guest_efer != host_efer) + exec_control |= VM_ENTRY_LOAD_IA32_EFER; + } + vm_entry_controls_init(vmx, exec_control); + + /* + * EXIT CONTROLS + * + * L2->L1 exit controls are emulated - the hardware exit is to L0 so + * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER + * bits may be modified by vmx_set_efer() in prepare_vmcs02(). */ - vmx->host_rsp = 0; + exec_control = vmcs_config.vmexit_ctrl; + if (cpu_has_load_ia32_efer && guest_efer != host_efer) + exec_control |= VM_EXIT_LOAD_IA32_EFER; + vm_exit_controls_init(vmx, exec_control); - exec_control = vmx_exec_control(vmx); /* L0's desires */ - exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; - exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; - exec_control &= ~CPU_BASED_TPR_SHADOW; - exec_control |= vmcs12->cpu_based_vm_exec_control; + /* + * Conceptually we want to copy the PML address and index from + * vmcs01 here, and then back to vmcs01 on nested vmexit. But, + * since we always flush the log on each vmexit and never change + * the PML address (once set), this happens to be equivalent to + * simply resetting the index in vmcs02. + */ + if (enable_pml) + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); /* - * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if - * nested_get_vmcs12_pages can't fix it up, the illegal value - * will result in a VM entry failure. + * Interrupt/Exception Fields */ - if (exec_control & CPU_BASED_TPR_SHADOW) { - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); - vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); + if (vmx->nested.nested_run_pending) { + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + vmcs12->vm_entry_intr_info_field); + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, + vmcs12->vm_entry_exception_error_code); + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, + vmcs12->vm_entry_instruction_len); + vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, + vmcs12->guest_interruptibility_info); + vmx->loaded_vmcs->nmi_known_unmasked = + !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); } else { -#ifdef CONFIG_X86_64 - exec_control |= CPU_BASED_CR8_LOAD_EXITING | - CPU_BASED_CR8_STORE_EXITING; -#endif + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); + } +} + +static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) +{ + struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; + + if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { + vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); + vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); + vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); + vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); + vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); + vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); + vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); + vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); + vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); + vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); + vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); + vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); + vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); + vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); + vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); + vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); + vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); + vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); + vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); + vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); + vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); + vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); + vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); + vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); + vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); + vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); + vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); + vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); + vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); + vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); + vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); + vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); + vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); + vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); + } + + if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { + vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); + vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, + vmcs12->guest_pending_dbg_exceptions); + vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); + vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); + + /* + * L1 may access the L2's PDPTR, so save them to construct + * vmcs12 + */ + if (enable_ept) { + vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); + vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); + vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); + vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); + } + } + + if (nested_cpu_has_xsaves(vmcs12)) + vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); + + /* + * Whether page-faults are trapped is determined by a combination of + * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. + * If enable_ept, L0 doesn't care about page faults and we should + * set all of these to L1's desires. However, if !enable_ept, L0 does + * care about (at least some) page faults, and because it is not easy + * (if at all possible?) to merge L0 and L1's desires, we simply ask + * to exit on each and every L2 page fault. This is done by setting + * MASK=MATCH=0 and (see below) EB.PF=1. + * Note that below we don't need special code to set EB.PF beyond the + * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, + * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when + * !enable_ept, EB.PF is 1, so the "or" will always be 1. + */ + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, + enable_ept ? vmcs12->page_fault_error_code_mask : 0); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, + enable_ept ? vmcs12->page_fault_error_code_match : 0); + + if (cpu_has_vmx_apicv()) { + vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); + vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); + vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); + vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); + } + + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); + + set_cr4_guest_host_mask(vmx); + + if (kvm_mpx_supported()) { + if (vmx->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) + vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); + else + vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); + } +} + +/* + * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested + * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it + * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 + * guest in a way that will both be appropriate to L1's requests, and our + * needs. In addition to modifying the active vmcs (which is vmcs02), this + * function also has additional necessary side-effects, like setting various + * vcpu->arch fields. + * Returns 0 on success, 1 on failure. Invalid state exit qualification code + * is assigned to entry_failure_code on failure. + */ +static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + u32 *entry_failure_code) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; + + if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) { + prepare_vmcs02_full(vmx, vmcs12); + vmx->nested.dirty_vmcs12 = false; } /* - * A vmexit (to either L1 hypervisor or L0 userspace) is always needed - * for I/O port accesses. + * First, the fields that are shadowed. This must be kept in sync + * with vmx_shadow_fields.h. */ - exec_control &= ~CPU_BASED_USE_IO_BITMAPS; - exec_control |= CPU_BASED_UNCOND_IO_EXITING; + if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { + vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); + vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); + } - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); + if (vmx->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { + kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); + vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); + } else { + kvm_set_dr(vcpu, 7, vcpu->arch.dr7); + vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); + } + vmx_set_rflags(vcpu, vmcs12->guest_rflags); + + vmx->nested.preemption_timer_expired = false; + if (nested_cpu_has_preemption_timer(vmcs12)) + vmx_start_preemption_timer(vcpu); /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the * bitwise-or of what L1 wants to trap for L2, and what we want to @@ -12252,20 +12871,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); - /* L2->L1 exit controls are emulated - the hardware exit is to L0 so - * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER - * bits are further modified by vmx_set_efer() below. - */ - vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); - - /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are - * emulated by vmx_set_efer(), below. - */ - vm_entry_controls_init(vmx, - (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER & - ~VM_ENTRY_IA32E_MODE) | - (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); - if (vmx->nested.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); @@ -12288,37 +12893,29 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, * influence global bitmap(for vpid01 and vpid02 allocation) * even if spawn a lot of nested vCPUs. */ - if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) { + if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) { if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) { vmx->nested.last_vpid = vmcs12->virtual_processor_id; - __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true); + __vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false); } } else { - vmx_flush_tlb(vcpu, true); + /* + * If L1 use EPT, then L0 needs to execute INVEPT on + * EPTP02 instead of EPTP01. Therefore, delay TLB + * flush until vmcs02->eptp is fully updated by + * KVM_REQ_LOAD_CR3. Note that this assumes + * KVM_REQ_TLB_FLUSH is evaluated after + * KVM_REQ_LOAD_CR3 in vcpu_enter_guest(). + */ + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } } - if (enable_pml) { - /* - * Conceptually we want to copy the PML address and index from - * vmcs01 here, and then back to vmcs01 on nested vmexit. But, - * since we always flush the log on each vmexit, this happens - * to be equivalent to simply resetting the fields in vmcs02. - */ - ASSERT(vmx->pml_pg); - vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); - vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); - } - - if (nested_cpu_has_ept(vmcs12)) { - if (nested_ept_init_mmu_context(vcpu)) { - *entry_failure_code = ENTRY_FAIL_DEFAULT; - return 1; - } - } else if (nested_cpu_has2(vmcs12, - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { + if (nested_cpu_has_ept(vmcs12)) + nested_ept_init_mmu_context(vcpu); + else if (nested_cpu_has2(vmcs12, + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) vmx_flush_tlb(vcpu, true); - } /* * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those @@ -12334,14 +12931,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmx_set_cr4(vcpu, vmcs12->guest_cr4); vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); - if (vmx->nested.nested_run_pending && - (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) - vcpu->arch.efer = vmcs12->guest_ia32_efer; - else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) - vcpu->arch.efer |= (EFER_LMA | EFER_LME); - else - vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); - /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ + vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12); + /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ vmx_set_efer(vcpu, vcpu->arch.efer); /* @@ -12383,6 +12974,7 @@ static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); + bool ia32e; if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) @@ -12457,6 +13049,21 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; /* + * If the load IA32_EFER VM-exit control is 1, bits reserved in the + * IA32_EFER MSR must be 0 in the field for that register. In addition, + * the values of the LMA and LME bits in the field must each be that of + * the host address-space size VM-exit control. + */ + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { + ia32e = (vmcs12->vm_exit_controls & + VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; + if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) || + ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) || + ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) + return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; + } + + /* * From the Intel SDM, volume 3: * Fields relevant to VM-entry event injection must be set properly. * These fields are the VM-entry interruption-information field, the @@ -12512,6 +13119,10 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) } } + if (nested_cpu_has_ept(vmcs12) && + !valid_ept_address(vcpu, vmcs12->ept_pointer)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + return 0; } @@ -12532,94 +13143,192 @@ static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, if (is_error_page(page)) return -EINVAL; - r = 0; - shadow = kmap(page); - if (shadow->hdr.revision_id != VMCS12_REVISION || - shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)) - r = -EINVAL; - kunmap(page); - kvm_release_page_clean(page); - return r; -} + r = 0; + shadow = kmap(page); + if (shadow->hdr.revision_id != VMCS12_REVISION || + shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)) + r = -EINVAL; + kunmap(page); + kvm_release_page_clean(page); + return r; +} + +static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + u32 *exit_qual) +{ + bool ia32e; + + *exit_qual = ENTRY_FAIL_DEFAULT; + + if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) || + !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) + return 1; + + if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { + *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; + return 1; + } + + /* + * If the load IA32_EFER VM-entry control is 1, the following checks + * are performed on the field for the IA32_EFER MSR: + * - Bits reserved in the IA32_EFER MSR must be 0. + * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of + * the IA-32e mode guest VM-exit control. It must also be identical + * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to + * CR0.PG) is 1. + */ + if (to_vmx(vcpu)->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { + ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; + if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) || + ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) || + ((vmcs12->guest_cr0 & X86_CR0_PG) && + ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) + return 1; + } + + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && + (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) || + (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))) + return 1; + + return 0; +} + +static int __noclone nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long cr3, cr4;